Files
MistralMathOctopus-7B/trainer_state.json
ModelHub XC e2e68e87c8 初始化项目,由ModelHub XC社区提供模型
Model: kevinpro/MistralMathOctopus-7B
Source: Original Platform
2026-04-11 23:58:59 +08:00

25651 lines
570 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997951239500102,
"eval_steps": 500,
"global_step": 3660,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 4.540920734405518,
"learning_rate": 1.818181818181818e-08,
"loss": 0.1451,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 6.235923767089844,
"learning_rate": 3.636363636363636e-08,
"loss": 0.1932,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 6.495096683502197,
"learning_rate": 5.454545454545454e-08,
"loss": 0.2138,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 5.446401119232178,
"learning_rate": 7.272727272727273e-08,
"loss": 0.1645,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 6.433992862701416,
"learning_rate": 9.09090909090909e-08,
"loss": 0.1969,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 6.203385353088379,
"learning_rate": 1.0909090909090908e-07,
"loss": 0.2056,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 4.300374984741211,
"learning_rate": 1.2727272727272726e-07,
"loss": 0.138,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 5.246260643005371,
"learning_rate": 1.4545454545454545e-07,
"loss": 0.1673,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 5.447892665863037,
"learning_rate": 1.6363636363636364e-07,
"loss": 0.1739,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 5.377528190612793,
"learning_rate": 1.818181818181818e-07,
"loss": 0.168,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 5.494555950164795,
"learning_rate": 2e-07,
"loss": 0.173,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 5.380703926086426,
"learning_rate": 2.1818181818181815e-07,
"loss": 0.1594,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 5.685830116271973,
"learning_rate": 2.3636363636363634e-07,
"loss": 0.1892,
"step": 13
},
{
"epoch": 0.0,
"grad_norm": 4.251393795013428,
"learning_rate": 2.5454545454545453e-07,
"loss": 0.123,
"step": 14
},
{
"epoch": 0.0,
"grad_norm": 4.709303379058838,
"learning_rate": 2.727272727272727e-07,
"loss": 0.1555,
"step": 15
},
{
"epoch": 0.0,
"grad_norm": 5.874042987823486,
"learning_rate": 2.909090909090909e-07,
"loss": 0.1811,
"step": 16
},
{
"epoch": 0.0,
"grad_norm": 5.070192337036133,
"learning_rate": 3.0909090909090907e-07,
"loss": 0.1657,
"step": 17
},
{
"epoch": 0.0,
"grad_norm": 4.039353370666504,
"learning_rate": 3.272727272727273e-07,
"loss": 0.1373,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 5.17448091506958,
"learning_rate": 3.4545454545454544e-07,
"loss": 0.1782,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 5.469040393829346,
"learning_rate": 3.636363636363636e-07,
"loss": 0.1865,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 4.591649532318115,
"learning_rate": 3.818181818181818e-07,
"loss": 0.1772,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 3.8730628490448,
"learning_rate": 4e-07,
"loss": 0.1458,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 3.1940574645996094,
"learning_rate": 4.1818181818181814e-07,
"loss": 0.117,
"step": 23
},
{
"epoch": 0.01,
"grad_norm": 3.8417506217956543,
"learning_rate": 4.363636363636363e-07,
"loss": 0.1477,
"step": 24
},
{
"epoch": 0.01,
"grad_norm": 3.924102306365967,
"learning_rate": 4.545454545454545e-07,
"loss": 0.1496,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 4.038068771362305,
"learning_rate": 4.727272727272727e-07,
"loss": 0.1745,
"step": 26
},
{
"epoch": 0.01,
"grad_norm": 3.5735890865325928,
"learning_rate": 4.909090909090909e-07,
"loss": 0.1278,
"step": 27
},
{
"epoch": 0.01,
"grad_norm": 4.1404571533203125,
"learning_rate": 5.090909090909091e-07,
"loss": 0.1648,
"step": 28
},
{
"epoch": 0.01,
"grad_norm": 4.566656112670898,
"learning_rate": 5.272727272727272e-07,
"loss": 0.1759,
"step": 29
},
{
"epoch": 0.01,
"grad_norm": 3.562042713165283,
"learning_rate": 5.454545454545454e-07,
"loss": 0.1438,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 3.4260830879211426,
"learning_rate": 5.636363636363635e-07,
"loss": 0.1328,
"step": 31
},
{
"epoch": 0.01,
"grad_norm": 3.2388596534729004,
"learning_rate": 5.818181818181818e-07,
"loss": 0.1379,
"step": 32
},
{
"epoch": 0.01,
"grad_norm": 3.5923690795898438,
"learning_rate": 6e-07,
"loss": 0.143,
"step": 33
},
{
"epoch": 0.01,
"grad_norm": 3.057919979095459,
"learning_rate": 6.181818181818181e-07,
"loss": 0.1169,
"step": 34
},
{
"epoch": 0.01,
"grad_norm": 3.3284711837768555,
"learning_rate": 6.363636363636363e-07,
"loss": 0.125,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 3.4238357543945312,
"learning_rate": 6.545454545454546e-07,
"loss": 0.1247,
"step": 36
},
{
"epoch": 0.01,
"grad_norm": 3.1603212356567383,
"learning_rate": 6.727272727272727e-07,
"loss": 0.1399,
"step": 37
},
{
"epoch": 0.01,
"grad_norm": 3.2443954944610596,
"learning_rate": 6.909090909090909e-07,
"loss": 0.1494,
"step": 38
},
{
"epoch": 0.01,
"grad_norm": 3.3392746448516846,
"learning_rate": 7.09090909090909e-07,
"loss": 0.1325,
"step": 39
},
{
"epoch": 0.01,
"grad_norm": 3.302252769470215,
"learning_rate": 7.272727272727272e-07,
"loss": 0.1234,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 3.4786174297332764,
"learning_rate": 7.454545454545455e-07,
"loss": 0.1372,
"step": 41
},
{
"epoch": 0.01,
"grad_norm": 3.567997694015503,
"learning_rate": 7.636363636363636e-07,
"loss": 0.1225,
"step": 42
},
{
"epoch": 0.01,
"grad_norm": 3.4705660343170166,
"learning_rate": 7.818181818181818e-07,
"loss": 0.123,
"step": 43
},
{
"epoch": 0.01,
"grad_norm": 3.383411169052124,
"learning_rate": 8e-07,
"loss": 0.1274,
"step": 44
},
{
"epoch": 0.01,
"grad_norm": 3.456897258758545,
"learning_rate": 8.181818181818182e-07,
"loss": 0.1047,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 3.369755268096924,
"learning_rate": 8.363636363636363e-07,
"loss": 0.1195,
"step": 46
},
{
"epoch": 0.01,
"grad_norm": 3.9524924755096436,
"learning_rate": 8.545454545454544e-07,
"loss": 0.1294,
"step": 47
},
{
"epoch": 0.01,
"grad_norm": 3.9460361003875732,
"learning_rate": 8.727272727272726e-07,
"loss": 0.1534,
"step": 48
},
{
"epoch": 0.01,
"grad_norm": 3.7565395832061768,
"learning_rate": 8.909090909090909e-07,
"loss": 0.1297,
"step": 49
},
{
"epoch": 0.01,
"grad_norm": 3.749035596847534,
"learning_rate": 9.09090909090909e-07,
"loss": 0.1435,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 3.6407580375671387,
"learning_rate": 9.272727272727272e-07,
"loss": 0.1163,
"step": 51
},
{
"epoch": 0.01,
"grad_norm": 3.7361981868743896,
"learning_rate": 9.454545454545454e-07,
"loss": 0.1302,
"step": 52
},
{
"epoch": 0.01,
"grad_norm": 3.205831527709961,
"learning_rate": 9.636363636363636e-07,
"loss": 0.0986,
"step": 53
},
{
"epoch": 0.01,
"grad_norm": 3.8467040061950684,
"learning_rate": 9.818181818181818e-07,
"loss": 0.1389,
"step": 54
},
{
"epoch": 0.02,
"grad_norm": 3.386436700820923,
"learning_rate": 1e-06,
"loss": 0.1114,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 3.471832752227783,
"learning_rate": 1.0181818181818181e-06,
"loss": 0.1278,
"step": 56
},
{
"epoch": 0.02,
"grad_norm": 3.7745180130004883,
"learning_rate": 1.0363636363636363e-06,
"loss": 0.1184,
"step": 57
},
{
"epoch": 0.02,
"grad_norm": 3.520988702774048,
"learning_rate": 1.0545454545454544e-06,
"loss": 0.1025,
"step": 58
},
{
"epoch": 0.02,
"grad_norm": 3.45341157913208,
"learning_rate": 1.0727272727272726e-06,
"loss": 0.1162,
"step": 59
},
{
"epoch": 0.02,
"grad_norm": 3.98226261138916,
"learning_rate": 1.0909090909090908e-06,
"loss": 0.1496,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 3.621644973754883,
"learning_rate": 1.1090909090909091e-06,
"loss": 0.1232,
"step": 61
},
{
"epoch": 0.02,
"grad_norm": 3.586500406265259,
"learning_rate": 1.127272727272727e-06,
"loss": 0.1345,
"step": 62
},
{
"epoch": 0.02,
"grad_norm": 3.3112382888793945,
"learning_rate": 1.1454545454545455e-06,
"loss": 0.1423,
"step": 63
},
{
"epoch": 0.02,
"grad_norm": 3.1299896240234375,
"learning_rate": 1.1636363636363636e-06,
"loss": 0.1267,
"step": 64
},
{
"epoch": 0.02,
"grad_norm": 3.3250696659088135,
"learning_rate": 1.1818181818181818e-06,
"loss": 0.1108,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 3.5346617698669434,
"learning_rate": 1.2e-06,
"loss": 0.1243,
"step": 66
},
{
"epoch": 0.02,
"grad_norm": 3.2549474239349365,
"learning_rate": 1.2181818181818181e-06,
"loss": 0.1323,
"step": 67
},
{
"epoch": 0.02,
"grad_norm": 3.8255019187927246,
"learning_rate": 1.2363636363636363e-06,
"loss": 0.1358,
"step": 68
},
{
"epoch": 0.02,
"grad_norm": 3.330427646636963,
"learning_rate": 1.2545454545454546e-06,
"loss": 0.1232,
"step": 69
},
{
"epoch": 0.02,
"grad_norm": 3.0509235858917236,
"learning_rate": 1.2727272727272726e-06,
"loss": 0.103,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 3.553762912750244,
"learning_rate": 1.290909090909091e-06,
"loss": 0.1228,
"step": 71
},
{
"epoch": 0.02,
"grad_norm": 3.2095208168029785,
"learning_rate": 1.3090909090909091e-06,
"loss": 0.1181,
"step": 72
},
{
"epoch": 0.02,
"grad_norm": 3.5218029022216797,
"learning_rate": 1.3272727272727273e-06,
"loss": 0.117,
"step": 73
},
{
"epoch": 0.02,
"grad_norm": 3.9620566368103027,
"learning_rate": 1.3454545454545455e-06,
"loss": 0.1481,
"step": 74
},
{
"epoch": 0.02,
"grad_norm": 3.876711368560791,
"learning_rate": 1.3636363636363634e-06,
"loss": 0.1501,
"step": 75
},
{
"epoch": 0.02,
"grad_norm": 4.166055202484131,
"learning_rate": 1.3818181818181818e-06,
"loss": 0.1201,
"step": 76
},
{
"epoch": 0.02,
"grad_norm": 3.5768558979034424,
"learning_rate": 1.4e-06,
"loss": 0.1096,
"step": 77
},
{
"epoch": 0.02,
"grad_norm": 3.830570936203003,
"learning_rate": 1.418181818181818e-06,
"loss": 0.1181,
"step": 78
},
{
"epoch": 0.02,
"grad_norm": 3.5578572750091553,
"learning_rate": 1.4363636363636363e-06,
"loss": 0.1111,
"step": 79
},
{
"epoch": 0.02,
"grad_norm": 3.674180030822754,
"learning_rate": 1.4545454545454544e-06,
"loss": 0.1187,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 3.9807794094085693,
"learning_rate": 1.4727272727272726e-06,
"loss": 0.1532,
"step": 81
},
{
"epoch": 0.02,
"grad_norm": 3.374263048171997,
"learning_rate": 1.490909090909091e-06,
"loss": 0.1135,
"step": 82
},
{
"epoch": 0.02,
"grad_norm": 3.739839553833008,
"learning_rate": 1.509090909090909e-06,
"loss": 0.1328,
"step": 83
},
{
"epoch": 0.02,
"grad_norm": 3.470029354095459,
"learning_rate": 1.5272727272727273e-06,
"loss": 0.1078,
"step": 84
},
{
"epoch": 0.02,
"grad_norm": 3.3445234298706055,
"learning_rate": 1.5454545454545454e-06,
"loss": 0.1257,
"step": 85
},
{
"epoch": 0.02,
"grad_norm": 3.2428488731384277,
"learning_rate": 1.5636363636363636e-06,
"loss": 0.1207,
"step": 86
},
{
"epoch": 0.02,
"grad_norm": 3.345752477645874,
"learning_rate": 1.5818181818181818e-06,
"loss": 0.124,
"step": 87
},
{
"epoch": 0.02,
"grad_norm": 3.6470723152160645,
"learning_rate": 1.6e-06,
"loss": 0.1366,
"step": 88
},
{
"epoch": 0.02,
"grad_norm": 3.7567741870880127,
"learning_rate": 1.618181818181818e-06,
"loss": 0.1384,
"step": 89
},
{
"epoch": 0.02,
"grad_norm": 3.5427284240722656,
"learning_rate": 1.6363636363636365e-06,
"loss": 0.1439,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 3.694549560546875,
"learning_rate": 1.6545454545454544e-06,
"loss": 0.128,
"step": 91
},
{
"epoch": 0.03,
"grad_norm": 3.5761115550994873,
"learning_rate": 1.6727272727272726e-06,
"loss": 0.1247,
"step": 92
},
{
"epoch": 0.03,
"grad_norm": 3.4093759059906006,
"learning_rate": 1.6909090909090907e-06,
"loss": 0.1429,
"step": 93
},
{
"epoch": 0.03,
"grad_norm": 3.4228475093841553,
"learning_rate": 1.709090909090909e-06,
"loss": 0.1295,
"step": 94
},
{
"epoch": 0.03,
"grad_norm": 4.1292195320129395,
"learning_rate": 1.7272727272727273e-06,
"loss": 0.1133,
"step": 95
},
{
"epoch": 0.03,
"grad_norm": 3.841623306274414,
"learning_rate": 1.7454545454545452e-06,
"loss": 0.1356,
"step": 96
},
{
"epoch": 0.03,
"grad_norm": 3.5247111320495605,
"learning_rate": 1.7636363636363636e-06,
"loss": 0.1217,
"step": 97
},
{
"epoch": 0.03,
"grad_norm": 3.483203172683716,
"learning_rate": 1.7818181818181818e-06,
"loss": 0.1324,
"step": 98
},
{
"epoch": 0.03,
"grad_norm": 3.9401931762695312,
"learning_rate": 1.8e-06,
"loss": 0.1559,
"step": 99
},
{
"epoch": 0.03,
"grad_norm": 3.1736230850219727,
"learning_rate": 1.818181818181818e-06,
"loss": 0.1124,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 3.3248181343078613,
"learning_rate": 1.8363636363636362e-06,
"loss": 0.1061,
"step": 101
},
{
"epoch": 0.03,
"grad_norm": 3.954529285430908,
"learning_rate": 1.8545454545454544e-06,
"loss": 0.1396,
"step": 102
},
{
"epoch": 0.03,
"grad_norm": 3.5198869705200195,
"learning_rate": 1.8727272727272728e-06,
"loss": 0.1246,
"step": 103
},
{
"epoch": 0.03,
"grad_norm": 3.517188310623169,
"learning_rate": 1.8909090909090907e-06,
"loss": 0.1191,
"step": 104
},
{
"epoch": 0.03,
"grad_norm": 3.422008991241455,
"learning_rate": 1.909090909090909e-06,
"loss": 0.1402,
"step": 105
},
{
"epoch": 0.03,
"grad_norm": 3.771535634994507,
"learning_rate": 1.9272727272727273e-06,
"loss": 0.1457,
"step": 106
},
{
"epoch": 0.03,
"grad_norm": 4.059760093688965,
"learning_rate": 1.9454545454545454e-06,
"loss": 0.1344,
"step": 107
},
{
"epoch": 0.03,
"grad_norm": 3.780738592147827,
"learning_rate": 1.9636363636363636e-06,
"loss": 0.1361,
"step": 108
},
{
"epoch": 0.03,
"grad_norm": 4.163651943206787,
"learning_rate": 1.9818181818181817e-06,
"loss": 0.1372,
"step": 109
},
{
"epoch": 0.03,
"grad_norm": 3.797593355178833,
"learning_rate": 2e-06,
"loss": 0.137,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 3.6726772785186768,
"learning_rate": 1.99999960842675e-06,
"loss": 0.1179,
"step": 111
},
{
"epoch": 0.03,
"grad_norm": 3.261941909790039,
"learning_rate": 1.9999984337073077e-06,
"loss": 0.1152,
"step": 112
},
{
"epoch": 0.03,
"grad_norm": 3.536269187927246,
"learning_rate": 1.999996475842593e-06,
"loss": 0.1425,
"step": 113
},
{
"epoch": 0.03,
"grad_norm": 3.9828357696533203,
"learning_rate": 1.9999937348341392e-06,
"loss": 0.1322,
"step": 114
},
{
"epoch": 0.03,
"grad_norm": 3.628679037094116,
"learning_rate": 1.999990210684092e-06,
"loss": 0.1106,
"step": 115
},
{
"epoch": 0.03,
"grad_norm": 3.5205116271972656,
"learning_rate": 1.9999859033952126e-06,
"loss": 0.1336,
"step": 116
},
{
"epoch": 0.03,
"grad_norm": 3.6973910331726074,
"learning_rate": 1.999980812970873e-06,
"loss": 0.1272,
"step": 117
},
{
"epoch": 0.03,
"grad_norm": 3.650892734527588,
"learning_rate": 1.9999749394150607e-06,
"loss": 0.1363,
"step": 118
},
{
"epoch": 0.03,
"grad_norm": 3.479020833969116,
"learning_rate": 1.9999682827323754e-06,
"loss": 0.1367,
"step": 119
},
{
"epoch": 0.03,
"grad_norm": 3.836937189102173,
"learning_rate": 1.99996084292803e-06,
"loss": 0.1392,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 3.656867027282715,
"learning_rate": 1.9999526200078507e-06,
"loss": 0.1285,
"step": 121
},
{
"epoch": 0.03,
"grad_norm": 3.851539373397827,
"learning_rate": 1.999943613978278e-06,
"loss": 0.1475,
"step": 122
},
{
"epoch": 0.03,
"grad_norm": 3.374127149581909,
"learning_rate": 1.9999338248463646e-06,
"loss": 0.1198,
"step": 123
},
{
"epoch": 0.03,
"grad_norm": 3.398015260696411,
"learning_rate": 1.9999232526197767e-06,
"loss": 0.1155,
"step": 124
},
{
"epoch": 0.03,
"grad_norm": 3.809936761856079,
"learning_rate": 1.999911897306794e-06,
"loss": 0.1338,
"step": 125
},
{
"epoch": 0.03,
"grad_norm": 3.7471814155578613,
"learning_rate": 1.9998997589163095e-06,
"loss": 0.1581,
"step": 126
},
{
"epoch": 0.03,
"grad_norm": 3.1297061443328857,
"learning_rate": 1.9998868374578286e-06,
"loss": 0.1184,
"step": 127
},
{
"epoch": 0.03,
"grad_norm": 3.4692442417144775,
"learning_rate": 1.999873132941472e-06,
"loss": 0.1152,
"step": 128
},
{
"epoch": 0.04,
"grad_norm": 3.715061902999878,
"learning_rate": 1.999858645377971e-06,
"loss": 0.148,
"step": 129
},
{
"epoch": 0.04,
"grad_norm": 3.395211935043335,
"learning_rate": 1.9998433747786726e-06,
"loss": 0.1261,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 3.7628817558288574,
"learning_rate": 1.9998273211555354e-06,
"loss": 0.1354,
"step": 131
},
{
"epoch": 0.04,
"grad_norm": 3.3477649688720703,
"learning_rate": 1.9998104845211313e-06,
"loss": 0.13,
"step": 132
},
{
"epoch": 0.04,
"grad_norm": 3.556675434112549,
"learning_rate": 1.9997928648886467e-06,
"loss": 0.1297,
"step": 133
},
{
"epoch": 0.04,
"grad_norm": 3.8331122398376465,
"learning_rate": 1.9997744622718796e-06,
"loss": 0.1407,
"step": 134
},
{
"epoch": 0.04,
"grad_norm": 3.4685864448547363,
"learning_rate": 1.999755276685243e-06,
"loss": 0.1145,
"step": 135
},
{
"epoch": 0.04,
"grad_norm": 3.602905750274658,
"learning_rate": 1.999735308143761e-06,
"loss": 0.1325,
"step": 136
},
{
"epoch": 0.04,
"grad_norm": 3.2092838287353516,
"learning_rate": 1.999714556663072e-06,
"loss": 0.1233,
"step": 137
},
{
"epoch": 0.04,
"grad_norm": 3.2409422397613525,
"learning_rate": 1.999693022259428e-06,
"loss": 0.1339,
"step": 138
},
{
"epoch": 0.04,
"grad_norm": 3.6716156005859375,
"learning_rate": 1.999670704949693e-06,
"loss": 0.1235,
"step": 139
},
{
"epoch": 0.04,
"grad_norm": 3.52691388130188,
"learning_rate": 1.999647604751345e-06,
"loss": 0.136,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 3.946507453918457,
"learning_rate": 1.999623721682475e-06,
"loss": 0.1521,
"step": 141
},
{
"epoch": 0.04,
"grad_norm": 3.8056299686431885,
"learning_rate": 1.999599055761787e-06,
"loss": 0.1221,
"step": 142
},
{
"epoch": 0.04,
"grad_norm": 3.361619710922241,
"learning_rate": 1.9995736070085978e-06,
"loss": 0.1337,
"step": 143
},
{
"epoch": 0.04,
"grad_norm": 3.38295578956604,
"learning_rate": 1.999547375442837e-06,
"loss": 0.1297,
"step": 144
},
{
"epoch": 0.04,
"grad_norm": 3.585200786590576,
"learning_rate": 1.999520361085049e-06,
"loss": 0.1276,
"step": 145
},
{
"epoch": 0.04,
"grad_norm": 3.449899673461914,
"learning_rate": 1.9994925639563886e-06,
"loss": 0.1283,
"step": 146
},
{
"epoch": 0.04,
"grad_norm": 3.813476324081421,
"learning_rate": 1.999463984078626e-06,
"loss": 0.1475,
"step": 147
},
{
"epoch": 0.04,
"grad_norm": 3.5906283855438232,
"learning_rate": 1.999434621474143e-06,
"loss": 0.1287,
"step": 148
},
{
"epoch": 0.04,
"grad_norm": 4.171735763549805,
"learning_rate": 1.999404476165935e-06,
"loss": 0.1531,
"step": 149
},
{
"epoch": 0.04,
"grad_norm": 3.4671435356140137,
"learning_rate": 1.99937354817761e-06,
"loss": 0.1396,
"step": 150
},
{
"epoch": 0.04,
"grad_norm": 3.950822114944458,
"learning_rate": 1.99934183753339e-06,
"loss": 0.14,
"step": 151
},
{
"epoch": 0.04,
"grad_norm": 3.534167528152466,
"learning_rate": 1.9993093442581075e-06,
"loss": 0.1262,
"step": 152
},
{
"epoch": 0.04,
"grad_norm": 3.732804298400879,
"learning_rate": 1.999276068377211e-06,
"loss": 0.1449,
"step": 153
},
{
"epoch": 0.04,
"grad_norm": 3.422449827194214,
"learning_rate": 1.999242009916759e-06,
"loss": 0.1539,
"step": 154
},
{
"epoch": 0.04,
"grad_norm": 3.5805094242095947,
"learning_rate": 1.9992071689034255e-06,
"loss": 0.153,
"step": 155
},
{
"epoch": 0.04,
"grad_norm": 3.6584551334381104,
"learning_rate": 1.999171545364496e-06,
"loss": 0.1491,
"step": 156
},
{
"epoch": 0.04,
"grad_norm": 3.4344966411590576,
"learning_rate": 1.999135139327868e-06,
"loss": 0.1421,
"step": 157
},
{
"epoch": 0.04,
"grad_norm": 3.415125846862793,
"learning_rate": 1.9990979508220536e-06,
"loss": 0.1391,
"step": 158
},
{
"epoch": 0.04,
"grad_norm": 3.371690034866333,
"learning_rate": 1.9990599798761766e-06,
"loss": 0.1394,
"step": 159
},
{
"epoch": 0.04,
"grad_norm": 3.242325782775879,
"learning_rate": 1.9990212265199736e-06,
"loss": 0.1355,
"step": 160
},
{
"epoch": 0.04,
"grad_norm": 3.316002368927002,
"learning_rate": 1.9989816907837944e-06,
"loss": 0.1423,
"step": 161
},
{
"epoch": 0.04,
"grad_norm": 3.9251341819763184,
"learning_rate": 1.998941372698601e-06,
"loss": 0.1408,
"step": 162
},
{
"epoch": 0.04,
"grad_norm": 3.750389814376831,
"learning_rate": 1.998900272295969e-06,
"loss": 0.154,
"step": 163
},
{
"epoch": 0.04,
"grad_norm": 3.681218147277832,
"learning_rate": 1.9988583896080856e-06,
"loss": 0.151,
"step": 164
},
{
"epoch": 0.05,
"grad_norm": 3.407083511352539,
"learning_rate": 1.9988157246677513e-06,
"loss": 0.1354,
"step": 165
},
{
"epoch": 0.05,
"grad_norm": 3.2584445476531982,
"learning_rate": 1.9987722775083785e-06,
"loss": 0.1321,
"step": 166
},
{
"epoch": 0.05,
"grad_norm": 3.265322685241699,
"learning_rate": 1.998728048163993e-06,
"loss": 0.1146,
"step": 167
},
{
"epoch": 0.05,
"grad_norm": 3.724404811859131,
"learning_rate": 1.998683036669233e-06,
"loss": 0.1424,
"step": 168
},
{
"epoch": 0.05,
"grad_norm": 4.1619486808776855,
"learning_rate": 1.998637243059349e-06,
"loss": 0.1428,
"step": 169
},
{
"epoch": 0.05,
"grad_norm": 3.1609082221984863,
"learning_rate": 1.998590667370204e-06,
"loss": 0.1218,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 3.0653064250946045,
"learning_rate": 1.9985433096382735e-06,
"loss": 0.1122,
"step": 171
},
{
"epoch": 0.05,
"grad_norm": 3.4336159229278564,
"learning_rate": 1.998495169900646e-06,
"loss": 0.1296,
"step": 172
},
{
"epoch": 0.05,
"grad_norm": 3.300739049911499,
"learning_rate": 1.998446248195021e-06,
"loss": 0.128,
"step": 173
},
{
"epoch": 0.05,
"grad_norm": 3.1947007179260254,
"learning_rate": 1.998396544559713e-06,
"loss": 0.1192,
"step": 174
},
{
"epoch": 0.05,
"grad_norm": 3.543092966079712,
"learning_rate": 1.9983460590336457e-06,
"loss": 0.1405,
"step": 175
},
{
"epoch": 0.05,
"grad_norm": 3.321154832839966,
"learning_rate": 1.998294791656357e-06,
"loss": 0.1222,
"step": 176
},
{
"epoch": 0.05,
"grad_norm": 3.4130699634552,
"learning_rate": 1.9982427424679976e-06,
"loss": 0.1494,
"step": 177
},
{
"epoch": 0.05,
"grad_norm": 3.2330543994903564,
"learning_rate": 1.9981899115093287e-06,
"loss": 0.1447,
"step": 178
},
{
"epoch": 0.05,
"grad_norm": 3.6075711250305176,
"learning_rate": 1.9981362988217246e-06,
"loss": 0.1532,
"step": 179
},
{
"epoch": 0.05,
"grad_norm": 3.4685580730438232,
"learning_rate": 1.998081904447173e-06,
"loss": 0.1178,
"step": 180
},
{
"epoch": 0.05,
"grad_norm": 3.286449909210205,
"learning_rate": 1.9980267284282714e-06,
"loss": 0.1435,
"step": 181
},
{
"epoch": 0.05,
"grad_norm": 3.7313830852508545,
"learning_rate": 1.9979707708082315e-06,
"loss": 0.1509,
"step": 182
},
{
"epoch": 0.05,
"grad_norm": 3.748180627822876,
"learning_rate": 1.9979140316308762e-06,
"loss": 0.1365,
"step": 183
},
{
"epoch": 0.05,
"grad_norm": 3.4357450008392334,
"learning_rate": 1.9978565109406402e-06,
"loss": 0.1301,
"step": 184
},
{
"epoch": 0.05,
"grad_norm": 3.5195868015289307,
"learning_rate": 1.9977982087825712e-06,
"loss": 0.141,
"step": 185
},
{
"epoch": 0.05,
"grad_norm": 3.6003425121307373,
"learning_rate": 1.9977391252023277e-06,
"loss": 0.1346,
"step": 186
},
{
"epoch": 0.05,
"grad_norm": 3.1029670238494873,
"learning_rate": 1.9976792602461813e-06,
"loss": 0.1241,
"step": 187
},
{
"epoch": 0.05,
"grad_norm": 3.2697935104370117,
"learning_rate": 1.9976186139610146e-06,
"loss": 0.132,
"step": 188
},
{
"epoch": 0.05,
"grad_norm": 4.49591064453125,
"learning_rate": 1.997557186394323e-06,
"loss": 0.1508,
"step": 189
},
{
"epoch": 0.05,
"grad_norm": 3.4977004528045654,
"learning_rate": 1.9974949775942133e-06,
"loss": 0.1464,
"step": 190
},
{
"epoch": 0.05,
"grad_norm": 3.6156258583068848,
"learning_rate": 1.997431987609403e-06,
"loss": 0.1564,
"step": 191
},
{
"epoch": 0.05,
"grad_norm": 3.6873908042907715,
"learning_rate": 1.9973682164892242e-06,
"loss": 0.1439,
"step": 192
},
{
"epoch": 0.05,
"grad_norm": 3.5300700664520264,
"learning_rate": 1.997303664283618e-06,
"loss": 0.1521,
"step": 193
},
{
"epoch": 0.05,
"grad_norm": 3.2912561893463135,
"learning_rate": 1.997238331043138e-06,
"loss": 0.1269,
"step": 194
},
{
"epoch": 0.05,
"grad_norm": 3.3216238021850586,
"learning_rate": 1.9971722168189506e-06,
"loss": 0.1286,
"step": 195
},
{
"epoch": 0.05,
"grad_norm": 3.44690203666687,
"learning_rate": 1.997105321662832e-06,
"loss": 0.1423,
"step": 196
},
{
"epoch": 0.05,
"grad_norm": 3.240175247192383,
"learning_rate": 1.9970376456271718e-06,
"loss": 0.122,
"step": 197
},
{
"epoch": 0.05,
"grad_norm": 3.19724440574646,
"learning_rate": 1.9969691887649696e-06,
"loss": 0.1327,
"step": 198
},
{
"epoch": 0.05,
"grad_norm": 3.4572486877441406,
"learning_rate": 1.9968999511298373e-06,
"loss": 0.1373,
"step": 199
},
{
"epoch": 0.05,
"grad_norm": 3.4547650814056396,
"learning_rate": 1.9968299327759985e-06,
"loss": 0.1327,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 3.5186169147491455,
"learning_rate": 1.996759133758287e-06,
"loss": 0.144,
"step": 201
},
{
"epoch": 0.06,
"grad_norm": 3.53615665435791,
"learning_rate": 1.9966875541321497e-06,
"loss": 0.1261,
"step": 202
},
{
"epoch": 0.06,
"grad_norm": 3.302018404006958,
"learning_rate": 1.996615193953643e-06,
"loss": 0.139,
"step": 203
},
{
"epoch": 0.06,
"grad_norm": 3.4731132984161377,
"learning_rate": 1.9965420532794364e-06,
"loss": 0.1453,
"step": 204
},
{
"epoch": 0.06,
"grad_norm": 3.6803643703460693,
"learning_rate": 1.9964681321668095e-06,
"loss": 0.1512,
"step": 205
},
{
"epoch": 0.06,
"grad_norm": 3.640664577484131,
"learning_rate": 1.996393430673653e-06,
"loss": 0.1257,
"step": 206
},
{
"epoch": 0.06,
"grad_norm": 3.283768653869629,
"learning_rate": 1.9963179488584697e-06,
"loss": 0.1347,
"step": 207
},
{
"epoch": 0.06,
"grad_norm": 3.3293819427490234,
"learning_rate": 1.9962416867803726e-06,
"loss": 0.139,
"step": 208
},
{
"epoch": 0.06,
"grad_norm": 3.2843964099884033,
"learning_rate": 1.9961646444990855e-06,
"loss": 0.1399,
"step": 209
},
{
"epoch": 0.06,
"grad_norm": 3.1588640213012695,
"learning_rate": 1.9960868220749447e-06,
"loss": 0.1338,
"step": 210
},
{
"epoch": 0.06,
"grad_norm": 3.0526747703552246,
"learning_rate": 1.9960082195688964e-06,
"loss": 0.1225,
"step": 211
},
{
"epoch": 0.06,
"grad_norm": 3.466364622116089,
"learning_rate": 1.9959288370424975e-06,
"loss": 0.1609,
"step": 212
},
{
"epoch": 0.06,
"grad_norm": 3.3527302742004395,
"learning_rate": 1.9958486745579162e-06,
"loss": 0.1312,
"step": 213
},
{
"epoch": 0.06,
"grad_norm": 3.9912383556365967,
"learning_rate": 1.995767732177932e-06,
"loss": 0.1449,
"step": 214
},
{
"epoch": 0.06,
"grad_norm": 3.5762126445770264,
"learning_rate": 1.995686009965934e-06,
"loss": 0.1323,
"step": 215
},
{
"epoch": 0.06,
"grad_norm": 3.335552215576172,
"learning_rate": 1.995603507985923e-06,
"loss": 0.1179,
"step": 216
},
{
"epoch": 0.06,
"grad_norm": 3.964589834213257,
"learning_rate": 1.9955202263025103e-06,
"loss": 0.1593,
"step": 217
},
{
"epoch": 0.06,
"grad_norm": 3.3723831176757812,
"learning_rate": 1.995436164980917e-06,
"loss": 0.1316,
"step": 218
},
{
"epoch": 0.06,
"grad_norm": 3.1864852905273438,
"learning_rate": 1.9953513240869763e-06,
"loss": 0.1237,
"step": 219
},
{
"epoch": 0.06,
"grad_norm": 3.535829782485962,
"learning_rate": 1.9952657036871305e-06,
"loss": 0.139,
"step": 220
},
{
"epoch": 0.06,
"grad_norm": 3.485399007797241,
"learning_rate": 1.9951793038484326e-06,
"loss": 0.1551,
"step": 221
},
{
"epoch": 0.06,
"grad_norm": 3.34142804145813,
"learning_rate": 1.995092124638547e-06,
"loss": 0.1322,
"step": 222
},
{
"epoch": 0.06,
"grad_norm": 3.582733631134033,
"learning_rate": 1.995004166125748e-06,
"loss": 0.1296,
"step": 223
},
{
"epoch": 0.06,
"grad_norm": 3.348628044128418,
"learning_rate": 1.994915428378919e-06,
"loss": 0.1287,
"step": 224
},
{
"epoch": 0.06,
"grad_norm": 3.2857093811035156,
"learning_rate": 1.994825911467555e-06,
"loss": 0.1184,
"step": 225
},
{
"epoch": 0.06,
"grad_norm": 3.5528059005737305,
"learning_rate": 1.994735615461762e-06,
"loss": 0.1434,
"step": 226
},
{
"epoch": 0.06,
"grad_norm": 3.3763883113861084,
"learning_rate": 1.9946445404322533e-06,
"loss": 0.1324,
"step": 227
},
{
"epoch": 0.06,
"grad_norm": 3.357837438583374,
"learning_rate": 1.9945526864503547e-06,
"loss": 0.1291,
"step": 228
},
{
"epoch": 0.06,
"grad_norm": 3.7160394191741943,
"learning_rate": 1.9944600535880018e-06,
"loss": 0.1464,
"step": 229
},
{
"epoch": 0.06,
"grad_norm": 3.718874931335449,
"learning_rate": 1.994366641917739e-06,
"loss": 0.1195,
"step": 230
},
{
"epoch": 0.06,
"grad_norm": 3.4990196228027344,
"learning_rate": 1.9942724515127216e-06,
"loss": 0.1474,
"step": 231
},
{
"epoch": 0.06,
"grad_norm": 3.5642192363739014,
"learning_rate": 1.9941774824467148e-06,
"loss": 0.1436,
"step": 232
},
{
"epoch": 0.06,
"grad_norm": 3.294753313064575,
"learning_rate": 1.9940817347940927e-06,
"loss": 0.1169,
"step": 233
},
{
"epoch": 0.06,
"grad_norm": 3.635255813598633,
"learning_rate": 1.9939852086298397e-06,
"loss": 0.1528,
"step": 234
},
{
"epoch": 0.06,
"grad_norm": 3.3293750286102295,
"learning_rate": 1.9938879040295507e-06,
"loss": 0.1367,
"step": 235
},
{
"epoch": 0.06,
"grad_norm": 3.4504055976867676,
"learning_rate": 1.993789821069429e-06,
"loss": 0.1424,
"step": 236
},
{
"epoch": 0.06,
"grad_norm": 3.2252650260925293,
"learning_rate": 1.993690959826288e-06,
"loss": 0.1381,
"step": 237
},
{
"epoch": 0.07,
"grad_norm": 3.5592312812805176,
"learning_rate": 1.99359132037755e-06,
"loss": 0.1304,
"step": 238
},
{
"epoch": 0.07,
"grad_norm": 3.446719169616699,
"learning_rate": 1.9934909028012477e-06,
"loss": 0.1378,
"step": 239
},
{
"epoch": 0.07,
"grad_norm": 3.1963205337524414,
"learning_rate": 1.9933897071760235e-06,
"loss": 0.1279,
"step": 240
},
{
"epoch": 0.07,
"grad_norm": 3.2924935817718506,
"learning_rate": 1.993287733581127e-06,
"loss": 0.13,
"step": 241
},
{
"epoch": 0.07,
"grad_norm": 3.186282157897949,
"learning_rate": 1.9931849820964196e-06,
"loss": 0.1148,
"step": 242
},
{
"epoch": 0.07,
"grad_norm": 3.7735495567321777,
"learning_rate": 1.9930814528023703e-06,
"loss": 0.1398,
"step": 243
},
{
"epoch": 0.07,
"grad_norm": 3.284437656402588,
"learning_rate": 1.992977145780058e-06,
"loss": 0.13,
"step": 244
},
{
"epoch": 0.07,
"grad_norm": 3.665106773376465,
"learning_rate": 1.9928720611111695e-06,
"loss": 0.1325,
"step": 245
},
{
"epoch": 0.07,
"grad_norm": 3.378911018371582,
"learning_rate": 1.9927661988780024e-06,
"loss": 0.1286,
"step": 246
},
{
"epoch": 0.07,
"grad_norm": 3.492617130279541,
"learning_rate": 1.9926595591634625e-06,
"loss": 0.1539,
"step": 247
},
{
"epoch": 0.07,
"grad_norm": 3.062282085418701,
"learning_rate": 1.992552142051063e-06,
"loss": 0.1134,
"step": 248
},
{
"epoch": 0.07,
"grad_norm": 3.0996599197387695,
"learning_rate": 1.9924439476249287e-06,
"loss": 0.1115,
"step": 249
},
{
"epoch": 0.07,
"grad_norm": 3.804471969604492,
"learning_rate": 1.992334975969791e-06,
"loss": 0.1418,
"step": 250
},
{
"epoch": 0.07,
"grad_norm": 3.4804577827453613,
"learning_rate": 1.9922252271709913e-06,
"loss": 0.1405,
"step": 251
},
{
"epoch": 0.07,
"grad_norm": 3.598825693130493,
"learning_rate": 1.9921147013144777e-06,
"loss": 0.1347,
"step": 252
},
{
"epoch": 0.07,
"grad_norm": 3.65277361869812,
"learning_rate": 1.9920033984868093e-06,
"loss": 0.1333,
"step": 253
},
{
"epoch": 0.07,
"grad_norm": 3.512441873550415,
"learning_rate": 1.9918913187751516e-06,
"loss": 0.1644,
"step": 254
},
{
"epoch": 0.07,
"grad_norm": 3.4071273803710938,
"learning_rate": 1.9917784622672805e-06,
"loss": 0.1446,
"step": 255
},
{
"epoch": 0.07,
"grad_norm": 3.3086330890655518,
"learning_rate": 1.9916648290515785e-06,
"loss": 0.1265,
"step": 256
},
{
"epoch": 0.07,
"grad_norm": 3.5405070781707764,
"learning_rate": 1.9915504192170373e-06,
"loss": 0.1339,
"step": 257
},
{
"epoch": 0.07,
"grad_norm": 3.329610586166382,
"learning_rate": 1.991435232853256e-06,
"loss": 0.1289,
"step": 258
},
{
"epoch": 0.07,
"grad_norm": 3.483816385269165,
"learning_rate": 1.9913192700504435e-06,
"loss": 0.1314,
"step": 259
},
{
"epoch": 0.07,
"grad_norm": 3.278635025024414,
"learning_rate": 1.9912025308994145e-06,
"loss": 0.1329,
"step": 260
},
{
"epoch": 0.07,
"grad_norm": 3.243156671524048,
"learning_rate": 1.9910850154915936e-06,
"loss": 0.138,
"step": 261
},
{
"epoch": 0.07,
"grad_norm": 3.3385517597198486,
"learning_rate": 1.9909667239190123e-06,
"loss": 0.144,
"step": 262
},
{
"epoch": 0.07,
"grad_norm": 3.1529195308685303,
"learning_rate": 1.99084765627431e-06,
"loss": 0.1347,
"step": 263
},
{
"epoch": 0.07,
"grad_norm": 3.207628011703491,
"learning_rate": 1.9907278126507347e-06,
"loss": 0.1571,
"step": 264
},
{
"epoch": 0.07,
"grad_norm": 3.169715404510498,
"learning_rate": 1.9906071931421412e-06,
"loss": 0.1397,
"step": 265
},
{
"epoch": 0.07,
"grad_norm": 3.044822931289673,
"learning_rate": 1.990485797842992e-06,
"loss": 0.1111,
"step": 266
},
{
"epoch": 0.07,
"grad_norm": 3.2648305892944336,
"learning_rate": 1.9903636268483577e-06,
"loss": 0.1504,
"step": 267
},
{
"epoch": 0.07,
"grad_norm": 3.314117431640625,
"learning_rate": 1.990240680253916e-06,
"loss": 0.1539,
"step": 268
},
{
"epoch": 0.07,
"grad_norm": 3.51577091217041,
"learning_rate": 1.990116958155953e-06,
"loss": 0.1374,
"step": 269
},
{
"epoch": 0.07,
"grad_norm": 3.2009201049804688,
"learning_rate": 1.989992460651359e-06,
"loss": 0.1344,
"step": 270
},
{
"epoch": 0.07,
"grad_norm": 3.7722017765045166,
"learning_rate": 1.9898671878376363e-06,
"loss": 0.162,
"step": 271
},
{
"epoch": 0.07,
"grad_norm": 3.3505313396453857,
"learning_rate": 1.98974113981289e-06,
"loss": 0.1588,
"step": 272
},
{
"epoch": 0.07,
"grad_norm": 3.020047426223755,
"learning_rate": 1.989614316675835e-06,
"loss": 0.1215,
"step": 273
},
{
"epoch": 0.07,
"grad_norm": 3.8189098834991455,
"learning_rate": 1.9894867185257924e-06,
"loss": 0.1685,
"step": 274
},
{
"epoch": 0.08,
"grad_norm": 3.6801252365112305,
"learning_rate": 1.98935834546269e-06,
"loss": 0.139,
"step": 275
},
{
"epoch": 0.08,
"grad_norm": 3.7327663898468018,
"learning_rate": 1.989229197587063e-06,
"loss": 0.154,
"step": 276
},
{
"epoch": 0.08,
"grad_norm": 3.297316789627075,
"learning_rate": 1.9890992750000527e-06,
"loss": 0.1504,
"step": 277
},
{
"epoch": 0.08,
"grad_norm": 3.2879798412323,
"learning_rate": 1.988968577803408e-06,
"loss": 0.1384,
"step": 278
},
{
"epoch": 0.08,
"grad_norm": 3.343733787536621,
"learning_rate": 1.9888371060994836e-06,
"loss": 0.1463,
"step": 279
},
{
"epoch": 0.08,
"grad_norm": 3.16746187210083,
"learning_rate": 1.9887048599912412e-06,
"loss": 0.1187,
"step": 280
},
{
"epoch": 0.08,
"grad_norm": 3.205296516418457,
"learning_rate": 1.9885718395822487e-06,
"loss": 0.1305,
"step": 281
},
{
"epoch": 0.08,
"grad_norm": 3.2150719165802,
"learning_rate": 1.988438044976681e-06,
"loss": 0.1242,
"step": 282
},
{
"epoch": 0.08,
"grad_norm": 3.5434811115264893,
"learning_rate": 1.988303476279319e-06,
"loss": 0.1469,
"step": 283
},
{
"epoch": 0.08,
"grad_norm": 3.290196418762207,
"learning_rate": 1.9881681335955487e-06,
"loss": 0.144,
"step": 284
},
{
"epoch": 0.08,
"grad_norm": 3.230937957763672,
"learning_rate": 1.9880320170313638e-06,
"loss": 0.1345,
"step": 285
},
{
"epoch": 0.08,
"grad_norm": 3.4872334003448486,
"learning_rate": 1.987895126693364e-06,
"loss": 0.1481,
"step": 286
},
{
"epoch": 0.08,
"grad_norm": 3.31950306892395,
"learning_rate": 1.987757462688754e-06,
"loss": 0.1334,
"step": 287
},
{
"epoch": 0.08,
"grad_norm": 3.483234405517578,
"learning_rate": 1.987619025125345e-06,
"loss": 0.148,
"step": 288
},
{
"epoch": 0.08,
"grad_norm": 3.3412797451019287,
"learning_rate": 1.987479814111554e-06,
"loss": 0.1506,
"step": 289
},
{
"epoch": 0.08,
"grad_norm": 3.706984758377075,
"learning_rate": 1.9873398297564034e-06,
"loss": 0.1575,
"step": 290
},
{
"epoch": 0.08,
"grad_norm": 3.5726327896118164,
"learning_rate": 1.987199072169521e-06,
"loss": 0.1498,
"step": 291
},
{
"epoch": 0.08,
"grad_norm": 3.05886173248291,
"learning_rate": 1.987057541461142e-06,
"loss": 0.1301,
"step": 292
},
{
"epoch": 0.08,
"grad_norm": 3.118166446685791,
"learning_rate": 1.9869152377421047e-06,
"loss": 0.1351,
"step": 293
},
{
"epoch": 0.08,
"grad_norm": 3.4826500415802,
"learning_rate": 1.9867721611238535e-06,
"loss": 0.1481,
"step": 294
},
{
"epoch": 0.08,
"grad_norm": 3.4809482097625732,
"learning_rate": 1.986628311718439e-06,
"loss": 0.1407,
"step": 295
},
{
"epoch": 0.08,
"grad_norm": 3.4283668994903564,
"learning_rate": 1.986483689638516e-06,
"loss": 0.1426,
"step": 296
},
{
"epoch": 0.08,
"grad_norm": 3.3706417083740234,
"learning_rate": 1.986338294997345e-06,
"loss": 0.1154,
"step": 297
},
{
"epoch": 0.08,
"grad_norm": 3.6070687770843506,
"learning_rate": 1.986192127908791e-06,
"loss": 0.1531,
"step": 298
},
{
"epoch": 0.08,
"grad_norm": 3.358705759048462,
"learning_rate": 1.9860451884873245e-06,
"loss": 0.1445,
"step": 299
},
{
"epoch": 0.08,
"grad_norm": 3.1420834064483643,
"learning_rate": 1.9858974768480202e-06,
"loss": 0.1328,
"step": 300
},
{
"epoch": 0.08,
"grad_norm": 3.0843026638031006,
"learning_rate": 1.985748993106559e-06,
"loss": 0.121,
"step": 301
},
{
"epoch": 0.08,
"grad_norm": 3.4602112770080566,
"learning_rate": 1.9855997373792237e-06,
"loss": 0.125,
"step": 302
},
{
"epoch": 0.08,
"grad_norm": 3.8304550647735596,
"learning_rate": 1.9854497097829052e-06,
"loss": 0.163,
"step": 303
},
{
"epoch": 0.08,
"grad_norm": 3.5888307094573975,
"learning_rate": 1.985298910435096e-06,
"loss": 0.1431,
"step": 304
},
{
"epoch": 0.08,
"grad_norm": 3.4069719314575195,
"learning_rate": 1.9851473394538946e-06,
"loss": 0.1382,
"step": 305
},
{
"epoch": 0.08,
"grad_norm": 3.813002347946167,
"learning_rate": 1.984994996958003e-06,
"loss": 0.1319,
"step": 306
},
{
"epoch": 0.08,
"grad_norm": 3.3693482875823975,
"learning_rate": 1.9848418830667276e-06,
"loss": 0.1368,
"step": 307
},
{
"epoch": 0.08,
"grad_norm": 3.2808310985565186,
"learning_rate": 1.984687997899979e-06,
"loss": 0.1448,
"step": 308
},
{
"epoch": 0.08,
"grad_norm": 3.611994743347168,
"learning_rate": 1.9845333415782723e-06,
"loss": 0.1405,
"step": 309
},
{
"epoch": 0.08,
"grad_norm": 3.4369349479675293,
"learning_rate": 1.9843779142227253e-06,
"loss": 0.1732,
"step": 310
},
{
"epoch": 0.08,
"grad_norm": 3.140673875808716,
"learning_rate": 1.984221715955061e-06,
"loss": 0.1309,
"step": 311
},
{
"epoch": 0.09,
"grad_norm": 3.4380476474761963,
"learning_rate": 1.984064746897606e-06,
"loss": 0.1307,
"step": 312
},
{
"epoch": 0.09,
"grad_norm": 3.550687789916992,
"learning_rate": 1.983907007173289e-06,
"loss": 0.1362,
"step": 313
},
{
"epoch": 0.09,
"grad_norm": 3.2162210941314697,
"learning_rate": 1.9837484969056433e-06,
"loss": 0.1298,
"step": 314
},
{
"epoch": 0.09,
"grad_norm": 3.265251398086548,
"learning_rate": 1.983589216218806e-06,
"loss": 0.1308,
"step": 315
},
{
"epoch": 0.09,
"grad_norm": 3.379631280899048,
"learning_rate": 1.983429165237518e-06,
"loss": 0.1592,
"step": 316
},
{
"epoch": 0.09,
"grad_norm": 3.2787411212921143,
"learning_rate": 1.9832683440871217e-06,
"loss": 0.1356,
"step": 317
},
{
"epoch": 0.09,
"grad_norm": 3.5051069259643555,
"learning_rate": 1.9831067528935635e-06,
"loss": 0.1701,
"step": 318
},
{
"epoch": 0.09,
"grad_norm": 3.3926029205322266,
"learning_rate": 1.982944391783394e-06,
"loss": 0.1298,
"step": 319
},
{
"epoch": 0.09,
"grad_norm": 3.2148022651672363,
"learning_rate": 1.982781260883765e-06,
"loss": 0.1296,
"step": 320
},
{
"epoch": 0.09,
"grad_norm": 3.4440078735351562,
"learning_rate": 1.9826173603224317e-06,
"loss": 0.1449,
"step": 321
},
{
"epoch": 0.09,
"grad_norm": 3.4502224922180176,
"learning_rate": 1.9824526902277525e-06,
"loss": 0.1521,
"step": 322
},
{
"epoch": 0.09,
"grad_norm": 3.1308059692382812,
"learning_rate": 1.9822872507286887e-06,
"loss": 0.1293,
"step": 323
},
{
"epoch": 0.09,
"grad_norm": 3.0398175716400146,
"learning_rate": 1.982121041954803e-06,
"loss": 0.1165,
"step": 324
},
{
"epoch": 0.09,
"grad_norm": 3.3087451457977295,
"learning_rate": 1.981954064036261e-06,
"loss": 0.1424,
"step": 325
},
{
"epoch": 0.09,
"grad_norm": 3.4104042053222656,
"learning_rate": 1.981786317103832e-06,
"loss": 0.1446,
"step": 326
},
{
"epoch": 0.09,
"grad_norm": 3.2183303833007812,
"learning_rate": 1.981617801288885e-06,
"loss": 0.1324,
"step": 327
},
{
"epoch": 0.09,
"grad_norm": 3.486673593521118,
"learning_rate": 1.981448516723394e-06,
"loss": 0.1461,
"step": 328
},
{
"epoch": 0.09,
"grad_norm": 3.439990997314453,
"learning_rate": 1.9812784635399326e-06,
"loss": 0.1511,
"step": 329
},
{
"epoch": 0.09,
"grad_norm": 3.738765239715576,
"learning_rate": 1.981107641871678e-06,
"loss": 0.1323,
"step": 330
},
{
"epoch": 0.09,
"grad_norm": 3.526998519897461,
"learning_rate": 1.9809360518524078e-06,
"loss": 0.1542,
"step": 331
},
{
"epoch": 0.09,
"grad_norm": 3.3102715015411377,
"learning_rate": 1.980763693616503e-06,
"loss": 0.142,
"step": 332
},
{
"epoch": 0.09,
"grad_norm": 3.672074317932129,
"learning_rate": 1.9805905672989445e-06,
"loss": 0.1476,
"step": 333
},
{
"epoch": 0.09,
"grad_norm": 3.2519290447235107,
"learning_rate": 1.980416673035316e-06,
"loss": 0.1518,
"step": 334
},
{
"epoch": 0.09,
"grad_norm": 3.2809934616088867,
"learning_rate": 1.9802420109618028e-06,
"loss": 0.1261,
"step": 335
},
{
"epoch": 0.09,
"grad_norm": 3.447441577911377,
"learning_rate": 1.98006658121519e-06,
"loss": 0.1478,
"step": 336
},
{
"epoch": 0.09,
"grad_norm": 2.9944467544555664,
"learning_rate": 1.9798903839328647e-06,
"loss": 0.1254,
"step": 337
},
{
"epoch": 0.09,
"grad_norm": 3.4824864864349365,
"learning_rate": 1.979713419252816e-06,
"loss": 0.146,
"step": 338
},
{
"epoch": 0.09,
"grad_norm": 3.4047210216522217,
"learning_rate": 1.9795356873136324e-06,
"loss": 0.144,
"step": 339
},
{
"epoch": 0.09,
"grad_norm": 3.480583906173706,
"learning_rate": 1.9793571882545048e-06,
"loss": 0.1608,
"step": 340
},
{
"epoch": 0.09,
"grad_norm": 3.3297982215881348,
"learning_rate": 1.9791779222152232e-06,
"loss": 0.1301,
"step": 341
},
{
"epoch": 0.09,
"grad_norm": 3.241499662399292,
"learning_rate": 1.97899788933618e-06,
"loss": 0.1421,
"step": 342
},
{
"epoch": 0.09,
"grad_norm": 3.4256677627563477,
"learning_rate": 1.978817089758367e-06,
"loss": 0.1504,
"step": 343
},
{
"epoch": 0.09,
"grad_norm": 3.3894009590148926,
"learning_rate": 1.9786355236233767e-06,
"loss": 0.1473,
"step": 344
},
{
"epoch": 0.09,
"grad_norm": 3.277958631515503,
"learning_rate": 1.978453191073402e-06,
"loss": 0.1345,
"step": 345
},
{
"epoch": 0.09,
"grad_norm": 3.110243558883667,
"learning_rate": 1.9782700922512356e-06,
"loss": 0.1271,
"step": 346
},
{
"epoch": 0.09,
"grad_norm": 3.212660551071167,
"learning_rate": 1.9780862273002718e-06,
"loss": 0.1419,
"step": 347
},
{
"epoch": 0.1,
"grad_norm": 3.4038431644439697,
"learning_rate": 1.977901596364503e-06,
"loss": 0.1476,
"step": 348
},
{
"epoch": 0.1,
"grad_norm": 3.4911129474639893,
"learning_rate": 1.9777161995885216e-06,
"loss": 0.1649,
"step": 349
},
{
"epoch": 0.1,
"grad_norm": 3.139709711074829,
"learning_rate": 1.977530037117522e-06,
"loss": 0.1388,
"step": 350
},
{
"epoch": 0.1,
"grad_norm": 3.3062047958374023,
"learning_rate": 1.977343109097296e-06,
"loss": 0.1431,
"step": 351
},
{
"epoch": 0.1,
"grad_norm": 3.1570358276367188,
"learning_rate": 1.977155415674235e-06,
"loss": 0.1324,
"step": 352
},
{
"epoch": 0.1,
"grad_norm": 3.35162091255188,
"learning_rate": 1.976966956995331e-06,
"loss": 0.14,
"step": 353
},
{
"epoch": 0.1,
"grad_norm": 3.1200473308563232,
"learning_rate": 1.976777733208175e-06,
"loss": 0.1448,
"step": 354
},
{
"epoch": 0.1,
"grad_norm": 3.0821895599365234,
"learning_rate": 1.9765877444609565e-06,
"loss": 0.1233,
"step": 355
},
{
"epoch": 0.1,
"grad_norm": 3.0552830696105957,
"learning_rate": 1.976396990902465e-06,
"loss": 0.1182,
"step": 356
},
{
"epoch": 0.1,
"grad_norm": 3.555692195892334,
"learning_rate": 1.976205472682088e-06,
"loss": 0.1527,
"step": 357
},
{
"epoch": 0.1,
"grad_norm": 3.5902371406555176,
"learning_rate": 1.9760131899498125e-06,
"loss": 0.152,
"step": 358
},
{
"epoch": 0.1,
"grad_norm": 3.108504056930542,
"learning_rate": 1.975820142856224e-06,
"loss": 0.1257,
"step": 359
},
{
"epoch": 0.1,
"grad_norm": 3.2323732376098633,
"learning_rate": 1.975626331552507e-06,
"loss": 0.1446,
"step": 360
},
{
"epoch": 0.1,
"grad_norm": 3.161468267440796,
"learning_rate": 1.9754317561904433e-06,
"loss": 0.1334,
"step": 361
},
{
"epoch": 0.1,
"grad_norm": 3.21755051612854,
"learning_rate": 1.9752364169224148e-06,
"loss": 0.1433,
"step": 362
},
{
"epoch": 0.1,
"grad_norm": 3.455519437789917,
"learning_rate": 1.9750403139014003e-06,
"loss": 0.1634,
"step": 363
},
{
"epoch": 0.1,
"grad_norm": 3.1519174575805664,
"learning_rate": 1.9748434472809776e-06,
"loss": 0.1327,
"step": 364
},
{
"epoch": 0.1,
"grad_norm": 3.293189287185669,
"learning_rate": 1.974645817215322e-06,
"loss": 0.1276,
"step": 365
},
{
"epoch": 0.1,
"grad_norm": 3.327479362487793,
"learning_rate": 1.974447423859206e-06,
"loss": 0.1424,
"step": 366
},
{
"epoch": 0.1,
"grad_norm": 3.365499973297119,
"learning_rate": 1.9742482673680015e-06,
"loss": 0.1553,
"step": 367
},
{
"epoch": 0.1,
"grad_norm": 3.1190264225006104,
"learning_rate": 1.974048347897677e-06,
"loss": 0.1255,
"step": 368
},
{
"epoch": 0.1,
"grad_norm": 3.285921096801758,
"learning_rate": 1.973847665604799e-06,
"loss": 0.131,
"step": 369
},
{
"epoch": 0.1,
"grad_norm": 3.1432747840881348,
"learning_rate": 1.973646220646531e-06,
"loss": 0.1287,
"step": 370
},
{
"epoch": 0.1,
"grad_norm": 3.507528066635132,
"learning_rate": 1.973444013180633e-06,
"loss": 0.1615,
"step": 371
},
{
"epoch": 0.1,
"grad_norm": 3.055715322494507,
"learning_rate": 1.973241043365464e-06,
"loss": 0.1417,
"step": 372
},
{
"epoch": 0.1,
"grad_norm": 3.1101040840148926,
"learning_rate": 1.9730373113599796e-06,
"loss": 0.1285,
"step": 373
},
{
"epoch": 0.1,
"grad_norm": 3.195751905441284,
"learning_rate": 1.972832817323731e-06,
"loss": 0.1537,
"step": 374
},
{
"epoch": 0.1,
"grad_norm": 3.517488956451416,
"learning_rate": 1.9726275614168667e-06,
"loss": 0.1525,
"step": 375
},
{
"epoch": 0.1,
"grad_norm": 3.0122179985046387,
"learning_rate": 1.972421543800133e-06,
"loss": 0.1225,
"step": 376
},
{
"epoch": 0.1,
"grad_norm": 2.995920181274414,
"learning_rate": 1.9722147646348712e-06,
"loss": 0.1355,
"step": 377
},
{
"epoch": 0.1,
"grad_norm": 3.0849361419677734,
"learning_rate": 1.97200722408302e-06,
"loss": 0.1286,
"step": 378
},
{
"epoch": 0.1,
"grad_norm": 2.914358615875244,
"learning_rate": 1.9717989223071143e-06,
"loss": 0.1265,
"step": 379
},
{
"epoch": 0.1,
"grad_norm": 3.264975070953369,
"learning_rate": 1.971589859470284e-06,
"loss": 0.1448,
"step": 380
},
{
"epoch": 0.1,
"grad_norm": 3.29422926902771,
"learning_rate": 1.971380035736257e-06,
"loss": 0.143,
"step": 381
},
{
"epoch": 0.1,
"grad_norm": 3.24412202835083,
"learning_rate": 1.9711694512693557e-06,
"loss": 0.1493,
"step": 382
},
{
"epoch": 0.1,
"grad_norm": 3.6297881603240967,
"learning_rate": 1.970958106234498e-06,
"loss": 0.1577,
"step": 383
},
{
"epoch": 0.1,
"grad_norm": 3.3670804500579834,
"learning_rate": 1.9707460007971986e-06,
"loss": 0.1528,
"step": 384
},
{
"epoch": 0.11,
"grad_norm": 3.3906354904174805,
"learning_rate": 1.9705331351235673e-06,
"loss": 0.166,
"step": 385
},
{
"epoch": 0.11,
"grad_norm": 3.1111812591552734,
"learning_rate": 1.9703195093803084e-06,
"loss": 0.1404,
"step": 386
},
{
"epoch": 0.11,
"grad_norm": 3.115222454071045,
"learning_rate": 1.9701051237347228e-06,
"loss": 0.1351,
"step": 387
},
{
"epoch": 0.11,
"grad_norm": 3.2881312370300293,
"learning_rate": 1.9698899783547055e-06,
"loss": 0.1387,
"step": 388
},
{
"epoch": 0.11,
"grad_norm": 3.433166742324829,
"learning_rate": 1.969674073408747e-06,
"loss": 0.136,
"step": 389
},
{
"epoch": 0.11,
"grad_norm": 3.301527261734009,
"learning_rate": 1.969457409065933e-06,
"loss": 0.1417,
"step": 390
},
{
"epoch": 0.11,
"grad_norm": 3.092268705368042,
"learning_rate": 1.9692399854959423e-06,
"loss": 0.1297,
"step": 391
},
{
"epoch": 0.11,
"grad_norm": 3.169426441192627,
"learning_rate": 1.96902180286905e-06,
"loss": 0.1402,
"step": 392
},
{
"epoch": 0.11,
"grad_norm": 3.187683343887329,
"learning_rate": 1.968802861356125e-06,
"loss": 0.1463,
"step": 393
},
{
"epoch": 0.11,
"grad_norm": 3.421762704849243,
"learning_rate": 1.968583161128631e-06,
"loss": 0.1452,
"step": 394
},
{
"epoch": 0.11,
"grad_norm": 3.173125743865967,
"learning_rate": 1.968362702358625e-06,
"loss": 0.1479,
"step": 395
},
{
"epoch": 0.11,
"grad_norm": 3.2461776733398438,
"learning_rate": 1.9681414852187584e-06,
"loss": 0.1326,
"step": 396
},
{
"epoch": 0.11,
"grad_norm": 3.0317840576171875,
"learning_rate": 1.9679195098822773e-06,
"loss": 0.1202,
"step": 397
},
{
"epoch": 0.11,
"grad_norm": 3.3286936283111572,
"learning_rate": 1.96769677652302e-06,
"loss": 0.1455,
"step": 398
},
{
"epoch": 0.11,
"grad_norm": 3.585869312286377,
"learning_rate": 1.9674732853154204e-06,
"loss": 0.1587,
"step": 399
},
{
"epoch": 0.11,
"grad_norm": 3.1691548824310303,
"learning_rate": 1.9672490364345037e-06,
"loss": 0.1374,
"step": 400
},
{
"epoch": 0.11,
"grad_norm": 3.390744924545288,
"learning_rate": 1.9670240300558903e-06,
"loss": 0.1359,
"step": 401
},
{
"epoch": 0.11,
"grad_norm": 3.236043930053711,
"learning_rate": 1.9667982663557935e-06,
"loss": 0.1424,
"step": 402
},
{
"epoch": 0.11,
"grad_norm": 3.2566139698028564,
"learning_rate": 1.9665717455110186e-06,
"loss": 0.1354,
"step": 403
},
{
"epoch": 0.11,
"grad_norm": 3.3723509311676025,
"learning_rate": 1.966344467698965e-06,
"loss": 0.1277,
"step": 404
},
{
"epoch": 0.11,
"grad_norm": 3.2254810333251953,
"learning_rate": 1.9661164330976243e-06,
"loss": 0.1184,
"step": 405
},
{
"epoch": 0.11,
"grad_norm": 3.084118127822876,
"learning_rate": 1.965887641885581e-06,
"loss": 0.1313,
"step": 406
},
{
"epoch": 0.11,
"grad_norm": 3.266711711883545,
"learning_rate": 1.965658094242013e-06,
"loss": 0.1366,
"step": 407
},
{
"epoch": 0.11,
"grad_norm": 3.0545456409454346,
"learning_rate": 1.965427790346688e-06,
"loss": 0.1258,
"step": 408
},
{
"epoch": 0.11,
"grad_norm": 3.190474510192871,
"learning_rate": 1.965196730379969e-06,
"loss": 0.1327,
"step": 409
},
{
"epoch": 0.11,
"grad_norm": 3.1471309661865234,
"learning_rate": 1.96496491452281e-06,
"loss": 0.1278,
"step": 410
},
{
"epoch": 0.11,
"grad_norm": 3.120482921600342,
"learning_rate": 1.964732342956756e-06,
"loss": 0.1311,
"step": 411
},
{
"epoch": 0.11,
"grad_norm": 3.269605875015259,
"learning_rate": 1.9644990158639447e-06,
"loss": 0.1376,
"step": 412
},
{
"epoch": 0.11,
"grad_norm": 2.940713405609131,
"learning_rate": 1.964264933427106e-06,
"loss": 0.1248,
"step": 413
},
{
"epoch": 0.11,
"grad_norm": 3.0395448207855225,
"learning_rate": 1.9640300958295597e-06,
"loss": 0.1282,
"step": 414
},
{
"epoch": 0.11,
"grad_norm": 3.565747022628784,
"learning_rate": 1.963794503255219e-06,
"loss": 0.1351,
"step": 415
},
{
"epoch": 0.11,
"grad_norm": 4.205630779266357,
"learning_rate": 1.963558155888587e-06,
"loss": 0.1632,
"step": 416
},
{
"epoch": 0.11,
"grad_norm": 3.1847383975982666,
"learning_rate": 1.9633210539147582e-06,
"loss": 0.1279,
"step": 417
},
{
"epoch": 0.11,
"grad_norm": 3.4606165885925293,
"learning_rate": 1.963083197519419e-06,
"loss": 0.162,
"step": 418
},
{
"epoch": 0.11,
"grad_norm": 3.155571222305298,
"learning_rate": 1.9628445868888444e-06,
"loss": 0.1218,
"step": 419
},
{
"epoch": 0.11,
"grad_norm": 3.181898355484009,
"learning_rate": 1.962605222209903e-06,
"loss": 0.1267,
"step": 420
},
{
"epoch": 0.12,
"grad_norm": 3.7715001106262207,
"learning_rate": 1.962365103670051e-06,
"loss": 0.1607,
"step": 421
},
{
"epoch": 0.12,
"grad_norm": 3.2761077880859375,
"learning_rate": 1.9621242314573374e-06,
"loss": 0.1328,
"step": 422
},
{
"epoch": 0.12,
"grad_norm": 3.0447583198547363,
"learning_rate": 1.9618826057604002e-06,
"loss": 0.1402,
"step": 423
},
{
"epoch": 0.12,
"grad_norm": 2.9950973987579346,
"learning_rate": 1.9616402267684673e-06,
"loss": 0.1199,
"step": 424
},
{
"epoch": 0.12,
"grad_norm": 3.516925573348999,
"learning_rate": 1.9613970946713573e-06,
"loss": 0.1546,
"step": 425
},
{
"epoch": 0.12,
"grad_norm": 3.3416996002197266,
"learning_rate": 1.961153209659478e-06,
"loss": 0.1426,
"step": 426
},
{
"epoch": 0.12,
"grad_norm": 3.3621819019317627,
"learning_rate": 1.9609085719238275e-06,
"loss": 0.1522,
"step": 427
},
{
"epoch": 0.12,
"grad_norm": 3.4265851974487305,
"learning_rate": 1.960663181655993e-06,
"loss": 0.15,
"step": 428
},
{
"epoch": 0.12,
"grad_norm": 3.2601821422576904,
"learning_rate": 1.960417039048151e-06,
"loss": 0.1418,
"step": 429
},
{
"epoch": 0.12,
"grad_norm": 3.084263324737549,
"learning_rate": 1.9601701442930666e-06,
"loss": 0.1385,
"step": 430
},
{
"epoch": 0.12,
"grad_norm": 3.3431873321533203,
"learning_rate": 1.9599224975840947e-06,
"loss": 0.1563,
"step": 431
},
{
"epoch": 0.12,
"grad_norm": 2.996354341506958,
"learning_rate": 1.9596740991151798e-06,
"loss": 0.1197,
"step": 432
},
{
"epoch": 0.12,
"grad_norm": 3.18942928314209,
"learning_rate": 1.9594249490808535e-06,
"loss": 0.13,
"step": 433
},
{
"epoch": 0.12,
"grad_norm": 3.276121139526367,
"learning_rate": 1.9591750476762373e-06,
"loss": 0.1306,
"step": 434
},
{
"epoch": 0.12,
"grad_norm": 3.223360538482666,
"learning_rate": 1.95892439509704e-06,
"loss": 0.1518,
"step": 435
},
{
"epoch": 0.12,
"grad_norm": 3.457029342651367,
"learning_rate": 1.9586729915395595e-06,
"loss": 0.1507,
"step": 436
},
{
"epoch": 0.12,
"grad_norm": 2.9880590438842773,
"learning_rate": 1.9584208372006823e-06,
"loss": 0.1303,
"step": 437
},
{
"epoch": 0.12,
"grad_norm": 3.204850435256958,
"learning_rate": 1.9581679322778813e-06,
"loss": 0.1304,
"step": 438
},
{
"epoch": 0.12,
"grad_norm": 3.195963144302368,
"learning_rate": 1.9579142769692183e-06,
"loss": 0.1457,
"step": 439
},
{
"epoch": 0.12,
"grad_norm": 3.0109193325042725,
"learning_rate": 1.957659871473343e-06,
"loss": 0.1233,
"step": 440
},
{
"epoch": 0.12,
"grad_norm": 3.1999616622924805,
"learning_rate": 1.9574047159894915e-06,
"loss": 0.1376,
"step": 441
},
{
"epoch": 0.12,
"grad_norm": 3.2869439125061035,
"learning_rate": 1.9571488107174887e-06,
"loss": 0.1477,
"step": 442
},
{
"epoch": 0.12,
"grad_norm": 3.073430061340332,
"learning_rate": 1.9568921558577452e-06,
"loss": 0.1331,
"step": 443
},
{
"epoch": 0.12,
"grad_norm": 3.045060873031616,
"learning_rate": 1.9566347516112596e-06,
"loss": 0.1276,
"step": 444
},
{
"epoch": 0.12,
"grad_norm": 3.1451849937438965,
"learning_rate": 1.9563765981796176e-06,
"loss": 0.1363,
"step": 445
},
{
"epoch": 0.12,
"grad_norm": 3.5179929733276367,
"learning_rate": 1.9561176957649907e-06,
"loss": 0.1421,
"step": 446
},
{
"epoch": 0.12,
"grad_norm": 3.413295269012451,
"learning_rate": 1.955858044570137e-06,
"loss": 0.1554,
"step": 447
},
{
"epoch": 0.12,
"grad_norm": 3.3223888874053955,
"learning_rate": 1.9555976447984026e-06,
"loss": 0.1416,
"step": 448
},
{
"epoch": 0.12,
"grad_norm": 3.189138174057007,
"learning_rate": 1.9553364966537176e-06,
"loss": 0.1336,
"step": 449
},
{
"epoch": 0.12,
"grad_norm": 3.597012519836426,
"learning_rate": 1.9550746003405995e-06,
"loss": 0.1471,
"step": 450
},
{
"epoch": 0.12,
"grad_norm": 3.439164161682129,
"learning_rate": 1.954811956064152e-06,
"loss": 0.1459,
"step": 451
},
{
"epoch": 0.12,
"grad_norm": 3.2802419662475586,
"learning_rate": 1.954548564030063e-06,
"loss": 0.1391,
"step": 452
},
{
"epoch": 0.12,
"grad_norm": 2.9764251708984375,
"learning_rate": 1.9542844244446083e-06,
"loss": 0.1271,
"step": 453
},
{
"epoch": 0.12,
"grad_norm": 3.5547447204589844,
"learning_rate": 1.9540195375146465e-06,
"loss": 0.1483,
"step": 454
},
{
"epoch": 0.12,
"grad_norm": 3.2675955295562744,
"learning_rate": 1.9537539034476243e-06,
"loss": 0.1227,
"step": 455
},
{
"epoch": 0.12,
"grad_norm": 3.3789680004119873,
"learning_rate": 1.9534875224515718e-06,
"loss": 0.1528,
"step": 456
},
{
"epoch": 0.12,
"grad_norm": 3.386911153793335,
"learning_rate": 1.9532203947351033e-06,
"loss": 0.1198,
"step": 457
},
{
"epoch": 0.13,
"grad_norm": 3.5681533813476562,
"learning_rate": 1.95295252050742e-06,
"loss": 0.1549,
"step": 458
},
{
"epoch": 0.13,
"grad_norm": 3.107218027114868,
"learning_rate": 1.9526838999783062e-06,
"loss": 0.1206,
"step": 459
},
{
"epoch": 0.13,
"grad_norm": 3.2751224040985107,
"learning_rate": 1.9524145333581313e-06,
"loss": 0.1519,
"step": 460
},
{
"epoch": 0.13,
"grad_norm": 3.1982295513153076,
"learning_rate": 1.9521444208578484e-06,
"loss": 0.132,
"step": 461
},
{
"epoch": 0.13,
"grad_norm": 3.3096084594726562,
"learning_rate": 1.951873562688996e-06,
"loss": 0.1345,
"step": 462
},
{
"epoch": 0.13,
"grad_norm": 3.5432839393615723,
"learning_rate": 1.9516019590636953e-06,
"loss": 0.1724,
"step": 463
},
{
"epoch": 0.13,
"grad_norm": 2.7536401748657227,
"learning_rate": 1.9513296101946515e-06,
"loss": 0.1061,
"step": 464
},
{
"epoch": 0.13,
"grad_norm": 3.1745247840881348,
"learning_rate": 1.9510565162951534e-06,
"loss": 0.125,
"step": 465
},
{
"epoch": 0.13,
"grad_norm": 3.4509172439575195,
"learning_rate": 1.9507826775790743e-06,
"loss": 0.1532,
"step": 466
},
{
"epoch": 0.13,
"grad_norm": 3.3914451599121094,
"learning_rate": 1.9505080942608698e-06,
"loss": 0.1488,
"step": 467
},
{
"epoch": 0.13,
"grad_norm": 2.9867324829101562,
"learning_rate": 1.9502327665555787e-06,
"loss": 0.127,
"step": 468
},
{
"epoch": 0.13,
"grad_norm": 3.1437714099884033,
"learning_rate": 1.949956694678823e-06,
"loss": 0.1305,
"step": 469
},
{
"epoch": 0.13,
"grad_norm": 3.198093891143799,
"learning_rate": 1.9496798788468074e-06,
"loss": 0.137,
"step": 470
},
{
"epoch": 0.13,
"grad_norm": 3.217594861984253,
"learning_rate": 1.949402319276319e-06,
"loss": 0.1423,
"step": 471
},
{
"epoch": 0.13,
"grad_norm": 3.119245767593384,
"learning_rate": 1.949124016184728e-06,
"loss": 0.1438,
"step": 472
},
{
"epoch": 0.13,
"grad_norm": 3.3736212253570557,
"learning_rate": 1.948844969789987e-06,
"loss": 0.1316,
"step": 473
},
{
"epoch": 0.13,
"grad_norm": 3.4069814682006836,
"learning_rate": 1.9485651803106283e-06,
"loss": 0.1458,
"step": 474
},
{
"epoch": 0.13,
"grad_norm": 3.2153573036193848,
"learning_rate": 1.9482846479657704e-06,
"loss": 0.1349,
"step": 475
},
{
"epoch": 0.13,
"grad_norm": 3.210415840148926,
"learning_rate": 1.9480033729751096e-06,
"loss": 0.1389,
"step": 476
},
{
"epoch": 0.13,
"grad_norm": 3.301241636276245,
"learning_rate": 1.947721355558926e-06,
"loss": 0.1443,
"step": 477
},
{
"epoch": 0.13,
"grad_norm": 3.1576645374298096,
"learning_rate": 1.9474385959380806e-06,
"loss": 0.1443,
"step": 478
},
{
"epoch": 0.13,
"grad_norm": 2.980865240097046,
"learning_rate": 1.9471550943340157e-06,
"loss": 0.1351,
"step": 479
},
{
"epoch": 0.13,
"grad_norm": 2.9259071350097656,
"learning_rate": 1.9468708509687544e-06,
"loss": 0.131,
"step": 480
},
{
"epoch": 0.13,
"grad_norm": 3.2585418224334717,
"learning_rate": 1.946585866064901e-06,
"loss": 0.1225,
"step": 481
},
{
"epoch": 0.13,
"grad_norm": 3.609048843383789,
"learning_rate": 1.9463001398456397e-06,
"loss": 0.152,
"step": 482
},
{
"epoch": 0.13,
"grad_norm": 3.1587791442871094,
"learning_rate": 1.946013672534737e-06,
"loss": 0.1297,
"step": 483
},
{
"epoch": 0.13,
"grad_norm": 3.1544039249420166,
"learning_rate": 1.9457264643565383e-06,
"loss": 0.1448,
"step": 484
},
{
"epoch": 0.13,
"grad_norm": 3.3397443294525146,
"learning_rate": 1.94543851553597e-06,
"loss": 0.1272,
"step": 485
},
{
"epoch": 0.13,
"grad_norm": 3.064429521560669,
"learning_rate": 1.9451498262985384e-06,
"loss": 0.1337,
"step": 486
},
{
"epoch": 0.13,
"grad_norm": 3.0073888301849365,
"learning_rate": 1.944860396870328e-06,
"loss": 0.1347,
"step": 487
},
{
"epoch": 0.13,
"grad_norm": 3.140315294265747,
"learning_rate": 1.944570227478006e-06,
"loss": 0.1303,
"step": 488
},
{
"epoch": 0.13,
"grad_norm": 3.46236252784729,
"learning_rate": 1.9442793183488174e-06,
"loss": 0.1379,
"step": 489
},
{
"epoch": 0.13,
"grad_norm": 3.236910104751587,
"learning_rate": 1.943987669710586e-06,
"loss": 0.1427,
"step": 490
},
{
"epoch": 0.13,
"grad_norm": 3.339592218399048,
"learning_rate": 1.943695281791716e-06,
"loss": 0.1543,
"step": 491
},
{
"epoch": 0.13,
"grad_norm": 3.673352003097534,
"learning_rate": 1.943402154821189e-06,
"loss": 0.1482,
"step": 492
},
{
"epoch": 0.13,
"grad_norm": 3.3469910621643066,
"learning_rate": 1.943108289028568e-06,
"loss": 0.1361,
"step": 493
},
{
"epoch": 0.13,
"grad_norm": 3.042558431625366,
"learning_rate": 1.9428136846439915e-06,
"loss": 0.1351,
"step": 494
},
{
"epoch": 0.14,
"grad_norm": 3.3642477989196777,
"learning_rate": 1.942518341898178e-06,
"loss": 0.1321,
"step": 495
},
{
"epoch": 0.14,
"grad_norm": 3.2358176708221436,
"learning_rate": 1.942222261022425e-06,
"loss": 0.13,
"step": 496
},
{
"epoch": 0.14,
"grad_norm": 3.4853944778442383,
"learning_rate": 1.941925442248607e-06,
"loss": 0.1424,
"step": 497
},
{
"epoch": 0.14,
"grad_norm": 3.204458236694336,
"learning_rate": 1.9416278858091757e-06,
"loss": 0.1329,
"step": 498
},
{
"epoch": 0.14,
"grad_norm": 3.2569594383239746,
"learning_rate": 1.9413295919371626e-06,
"loss": 0.1409,
"step": 499
},
{
"epoch": 0.14,
"grad_norm": 3.180896759033203,
"learning_rate": 1.9410305608661742e-06,
"loss": 0.1315,
"step": 500
},
{
"epoch": 0.14,
"grad_norm": 2.9764926433563232,
"learning_rate": 1.940730792830397e-06,
"loss": 0.1212,
"step": 501
},
{
"epoch": 0.14,
"grad_norm": 3.056260347366333,
"learning_rate": 1.9404302880645925e-06,
"loss": 0.1228,
"step": 502
},
{
"epoch": 0.14,
"grad_norm": 3.196409225463867,
"learning_rate": 1.9401290468041002e-06,
"loss": 0.128,
"step": 503
},
{
"epoch": 0.14,
"grad_norm": 3.324312925338745,
"learning_rate": 1.939827069284836e-06,
"loss": 0.1387,
"step": 504
},
{
"epoch": 0.14,
"grad_norm": 3.0371274948120117,
"learning_rate": 1.9395243557432923e-06,
"loss": 0.1243,
"step": 505
},
{
"epoch": 0.14,
"grad_norm": 3.1022558212280273,
"learning_rate": 1.939220906416539e-06,
"loss": 0.121,
"step": 506
},
{
"epoch": 0.14,
"grad_norm": 2.9566850662231445,
"learning_rate": 1.9389167215422203e-06,
"loss": 0.1215,
"step": 507
},
{
"epoch": 0.14,
"grad_norm": 3.5804319381713867,
"learning_rate": 1.938611801358558e-06,
"loss": 0.1528,
"step": 508
},
{
"epoch": 0.14,
"grad_norm": 3.33417010307312,
"learning_rate": 1.9383061461043496e-06,
"loss": 0.1439,
"step": 509
},
{
"epoch": 0.14,
"grad_norm": 3.3231208324432373,
"learning_rate": 1.9379997560189675e-06,
"loss": 0.1525,
"step": 510
},
{
"epoch": 0.14,
"grad_norm": 3.470815420150757,
"learning_rate": 1.93769263134236e-06,
"loss": 0.1498,
"step": 511
},
{
"epoch": 0.14,
"grad_norm": 3.2947394847869873,
"learning_rate": 1.937384772315051e-06,
"loss": 0.1342,
"step": 512
},
{
"epoch": 0.14,
"grad_norm": 3.2775120735168457,
"learning_rate": 1.9370761791781392e-06,
"loss": 0.1403,
"step": 513
},
{
"epoch": 0.14,
"grad_norm": 3.1588900089263916,
"learning_rate": 1.936766852173298e-06,
"loss": 0.1399,
"step": 514
},
{
"epoch": 0.14,
"grad_norm": 3.2581050395965576,
"learning_rate": 1.936456791542776e-06,
"loss": 0.1545,
"step": 515
},
{
"epoch": 0.14,
"grad_norm": 3.052593946456909,
"learning_rate": 1.936145997529396e-06,
"loss": 0.1411,
"step": 516
},
{
"epoch": 0.14,
"grad_norm": 3.378387212753296,
"learning_rate": 1.9358344703765553e-06,
"loss": 0.1567,
"step": 517
},
{
"epoch": 0.14,
"grad_norm": 2.960052251815796,
"learning_rate": 1.935522210328225e-06,
"loss": 0.1387,
"step": 518
},
{
"epoch": 0.14,
"grad_norm": 3.3264615535736084,
"learning_rate": 1.9352092176289508e-06,
"loss": 0.1632,
"step": 519
},
{
"epoch": 0.14,
"grad_norm": 3.1621196269989014,
"learning_rate": 1.934895492523852e-06,
"loss": 0.1396,
"step": 520
},
{
"epoch": 0.14,
"grad_norm": 3.1209771633148193,
"learning_rate": 1.9345810352586203e-06,
"loss": 0.1562,
"step": 521
},
{
"epoch": 0.14,
"grad_norm": 3.20269775390625,
"learning_rate": 1.934265846079523e-06,
"loss": 0.1369,
"step": 522
},
{
"epoch": 0.14,
"grad_norm": 3.130948066711426,
"learning_rate": 1.9339499252333995e-06,
"loss": 0.1299,
"step": 523
},
{
"epoch": 0.14,
"grad_norm": 3.228376865386963,
"learning_rate": 1.9336332729676606e-06,
"loss": 0.1579,
"step": 524
},
{
"epoch": 0.14,
"grad_norm": 3.1680591106414795,
"learning_rate": 1.933315889530293e-06,
"loss": 0.1319,
"step": 525
},
{
"epoch": 0.14,
"grad_norm": 3.06646466255188,
"learning_rate": 1.932997775169854e-06,
"loss": 0.1354,
"step": 526
},
{
"epoch": 0.14,
"grad_norm": 3.227132558822632,
"learning_rate": 1.932678930135473e-06,
"loss": 0.1317,
"step": 527
},
{
"epoch": 0.14,
"grad_norm": 3.0347650051116943,
"learning_rate": 1.932359354676853e-06,
"loss": 0.1213,
"step": 528
},
{
"epoch": 0.14,
"grad_norm": 3.594651699066162,
"learning_rate": 1.9320390490442685e-06,
"loss": 0.1418,
"step": 529
},
{
"epoch": 0.14,
"grad_norm": 3.1397430896759033,
"learning_rate": 1.9317180134885657e-06,
"loss": 0.1388,
"step": 530
},
{
"epoch": 0.15,
"grad_norm": 3.069596767425537,
"learning_rate": 1.931396248261162e-06,
"loss": 0.1194,
"step": 531
},
{
"epoch": 0.15,
"grad_norm": 3.0973358154296875,
"learning_rate": 1.9310737536140476e-06,
"loss": 0.1387,
"step": 532
},
{
"epoch": 0.15,
"grad_norm": 3.301105499267578,
"learning_rate": 1.930750529799782e-06,
"loss": 0.1397,
"step": 533
},
{
"epoch": 0.15,
"grad_norm": 3.406813621520996,
"learning_rate": 1.9304265770714976e-06,
"loss": 0.1447,
"step": 534
},
{
"epoch": 0.15,
"grad_norm": 3.189361095428467,
"learning_rate": 1.9301018956828963e-06,
"loss": 0.1361,
"step": 535
},
{
"epoch": 0.15,
"grad_norm": 3.1613755226135254,
"learning_rate": 1.929776485888251e-06,
"loss": 0.1354,
"step": 536
},
{
"epoch": 0.15,
"grad_norm": 3.0347232818603516,
"learning_rate": 1.9294503479424066e-06,
"loss": 0.1175,
"step": 537
},
{
"epoch": 0.15,
"grad_norm": 3.215599775314331,
"learning_rate": 1.9291234821007755e-06,
"loss": 0.1424,
"step": 538
},
{
"epoch": 0.15,
"grad_norm": 3.0372154712677,
"learning_rate": 1.928795888619342e-06,
"loss": 0.138,
"step": 539
},
{
"epoch": 0.15,
"grad_norm": 3.246412754058838,
"learning_rate": 1.9284675677546602e-06,
"loss": 0.1211,
"step": 540
},
{
"epoch": 0.15,
"grad_norm": 3.2461938858032227,
"learning_rate": 1.9281385197638525e-06,
"loss": 0.1422,
"step": 541
},
{
"epoch": 0.15,
"grad_norm": 3.447664260864258,
"learning_rate": 1.9278087449046125e-06,
"loss": 0.1451,
"step": 542
},
{
"epoch": 0.15,
"grad_norm": 3.254608631134033,
"learning_rate": 1.9274782434352014e-06,
"loss": 0.1429,
"step": 543
},
{
"epoch": 0.15,
"grad_norm": 3.195387601852417,
"learning_rate": 1.9271470156144514e-06,
"loss": 0.1412,
"step": 544
},
{
"epoch": 0.15,
"grad_norm": 3.034229278564453,
"learning_rate": 1.926815061701762e-06,
"loss": 0.141,
"step": 545
},
{
"epoch": 0.15,
"grad_norm": 3.351449728012085,
"learning_rate": 1.926482381957101e-06,
"loss": 0.1477,
"step": 546
},
{
"epoch": 0.15,
"grad_norm": 3.1248385906219482,
"learning_rate": 1.926148976641006e-06,
"loss": 0.1162,
"step": 547
},
{
"epoch": 0.15,
"grad_norm": 3.154493808746338,
"learning_rate": 1.9258148460145826e-06,
"loss": 0.1252,
"step": 548
},
{
"epoch": 0.15,
"grad_norm": 3.1947855949401855,
"learning_rate": 1.925479990339503e-06,
"loss": 0.1309,
"step": 549
},
{
"epoch": 0.15,
"grad_norm": 3.2486939430236816,
"learning_rate": 1.925144409878009e-06,
"loss": 0.1375,
"step": 550
},
{
"epoch": 0.15,
"grad_norm": 2.9871749877929688,
"learning_rate": 1.9248081048929095e-06,
"loss": 0.1317,
"step": 551
},
{
"epoch": 0.15,
"grad_norm": 3.172790765762329,
"learning_rate": 1.9244710756475797e-06,
"loss": 0.1391,
"step": 552
},
{
"epoch": 0.15,
"grad_norm": 2.975095272064209,
"learning_rate": 1.9241333224059637e-06,
"loss": 0.1373,
"step": 553
},
{
"epoch": 0.15,
"grad_norm": 3.308891534805298,
"learning_rate": 1.923794845432571e-06,
"loss": 0.157,
"step": 554
},
{
"epoch": 0.15,
"grad_norm": 3.0941014289855957,
"learning_rate": 1.9234556449924794e-06,
"loss": 0.1287,
"step": 555
},
{
"epoch": 0.15,
"grad_norm": 3.365940570831299,
"learning_rate": 1.9231157213513323e-06,
"loss": 0.154,
"step": 556
},
{
"epoch": 0.15,
"grad_norm": 3.150245428085327,
"learning_rate": 1.9227750747753393e-06,
"loss": 0.1316,
"step": 557
},
{
"epoch": 0.15,
"grad_norm": 3.226799488067627,
"learning_rate": 1.922433705531277e-06,
"loss": 0.1437,
"step": 558
},
{
"epoch": 0.15,
"grad_norm": 3.223442316055298,
"learning_rate": 1.9220916138864875e-06,
"loss": 0.1448,
"step": 559
},
{
"epoch": 0.15,
"grad_norm": 3.3040995597839355,
"learning_rate": 1.921748800108878e-06,
"loss": 0.1517,
"step": 560
},
{
"epoch": 0.15,
"grad_norm": 3.551745653152466,
"learning_rate": 1.9214052644669235e-06,
"loss": 0.1675,
"step": 561
},
{
"epoch": 0.15,
"grad_norm": 3.008906364440918,
"learning_rate": 1.921061007229661e-06,
"loss": 0.1415,
"step": 562
},
{
"epoch": 0.15,
"grad_norm": 3.0802981853485107,
"learning_rate": 1.920716028666695e-06,
"loss": 0.1341,
"step": 563
},
{
"epoch": 0.15,
"grad_norm": 2.821687936782837,
"learning_rate": 1.9203703290481946e-06,
"loss": 0.1234,
"step": 564
},
{
"epoch": 0.15,
"grad_norm": 3.080923080444336,
"learning_rate": 1.920023908644893e-06,
"loss": 0.1472,
"step": 565
},
{
"epoch": 0.15,
"grad_norm": 3.197179079055786,
"learning_rate": 1.9196767677280885e-06,
"loss": 0.1292,
"step": 566
},
{
"epoch": 0.15,
"grad_norm": 3.17303729057312,
"learning_rate": 1.919328906569642e-06,
"loss": 0.1379,
"step": 567
},
{
"epoch": 0.16,
"grad_norm": 3.1359729766845703,
"learning_rate": 1.9189803254419812e-06,
"loss": 0.1374,
"step": 568
},
{
"epoch": 0.16,
"grad_norm": 2.890521287918091,
"learning_rate": 1.9186310246180956e-06,
"loss": 0.1247,
"step": 569
},
{
"epoch": 0.16,
"grad_norm": 3.2141013145446777,
"learning_rate": 1.9182810043715388e-06,
"loss": 0.1314,
"step": 570
},
{
"epoch": 0.16,
"grad_norm": 3.1341261863708496,
"learning_rate": 1.9179302649764282e-06,
"loss": 0.1274,
"step": 571
},
{
"epoch": 0.16,
"grad_norm": 3.1453020572662354,
"learning_rate": 1.9175788067074445e-06,
"loss": 0.1334,
"step": 572
},
{
"epoch": 0.16,
"grad_norm": 3.1976587772369385,
"learning_rate": 1.9172266298398297e-06,
"loss": 0.1243,
"step": 573
},
{
"epoch": 0.16,
"grad_norm": 3.1611573696136475,
"learning_rate": 1.9168737346493914e-06,
"loss": 0.1177,
"step": 574
},
{
"epoch": 0.16,
"grad_norm": 3.2642691135406494,
"learning_rate": 1.9165201214124972e-06,
"loss": 0.1462,
"step": 575
},
{
"epoch": 0.16,
"grad_norm": 2.8818516731262207,
"learning_rate": 1.9161657904060784e-06,
"loss": 0.1242,
"step": 576
},
{
"epoch": 0.16,
"grad_norm": 3.033656358718872,
"learning_rate": 1.915810741907628e-06,
"loss": 0.1324,
"step": 577
},
{
"epoch": 0.16,
"grad_norm": 3.5656187534332275,
"learning_rate": 1.915454976195201e-06,
"loss": 0.1505,
"step": 578
},
{
"epoch": 0.16,
"grad_norm": 3.210245132446289,
"learning_rate": 1.9150984935474146e-06,
"loss": 0.1566,
"step": 579
},
{
"epoch": 0.16,
"grad_norm": 3.1418895721435547,
"learning_rate": 1.9147412942434463e-06,
"loss": 0.1382,
"step": 580
},
{
"epoch": 0.16,
"grad_norm": 3.0061264038085938,
"learning_rate": 1.9143833785630354e-06,
"loss": 0.1343,
"step": 581
},
{
"epoch": 0.16,
"grad_norm": 3.0668489933013916,
"learning_rate": 1.914024746786483e-06,
"loss": 0.1503,
"step": 582
},
{
"epoch": 0.16,
"grad_norm": 3.207681894302368,
"learning_rate": 1.91366539919465e-06,
"loss": 0.1406,
"step": 583
},
{
"epoch": 0.16,
"grad_norm": 3.234178066253662,
"learning_rate": 1.9133053360689576e-06,
"loss": 0.1447,
"step": 584
},
{
"epoch": 0.16,
"grad_norm": 3.5382869243621826,
"learning_rate": 1.9129445576913886e-06,
"loss": 0.1483,
"step": 585
},
{
"epoch": 0.16,
"grad_norm": 3.5630078315734863,
"learning_rate": 1.9125830643444854e-06,
"loss": 0.1652,
"step": 586
},
{
"epoch": 0.16,
"grad_norm": 3.188985824584961,
"learning_rate": 1.91222085631135e-06,
"loss": 0.1467,
"step": 587
},
{
"epoch": 0.16,
"grad_norm": 3.0435938835144043,
"learning_rate": 1.9118579338756445e-06,
"loss": 0.1316,
"step": 588
},
{
"epoch": 0.16,
"grad_norm": 2.9583663940429688,
"learning_rate": 1.9114942973215902e-06,
"loss": 0.1249,
"step": 589
},
{
"epoch": 0.16,
"grad_norm": 3.3159563541412354,
"learning_rate": 1.911129946933968e-06,
"loss": 0.152,
"step": 590
},
{
"epoch": 0.16,
"grad_norm": 3.066227674484253,
"learning_rate": 1.9107648829981172e-06,
"loss": 0.1417,
"step": 591
},
{
"epoch": 0.16,
"grad_norm": 3.287661552429199,
"learning_rate": 1.910399105799937e-06,
"loss": 0.1417,
"step": 592
},
{
"epoch": 0.16,
"grad_norm": 3.2641029357910156,
"learning_rate": 1.910032615625884e-06,
"loss": 0.1383,
"step": 593
},
{
"epoch": 0.16,
"grad_norm": 3.0615787506103516,
"learning_rate": 1.909665412762974e-06,
"loss": 0.1294,
"step": 594
},
{
"epoch": 0.16,
"grad_norm": 2.9307010173797607,
"learning_rate": 1.90929749749878e-06,
"loss": 0.1256,
"step": 595
},
{
"epoch": 0.16,
"grad_norm": 3.220402479171753,
"learning_rate": 1.9089288701214344e-06,
"loss": 0.1378,
"step": 596
},
{
"epoch": 0.16,
"grad_norm": 3.3105733394622803,
"learning_rate": 1.908559530919626e-06,
"loss": 0.1542,
"step": 597
},
{
"epoch": 0.16,
"grad_norm": 3.1800825595855713,
"learning_rate": 1.908189480182602e-06,
"loss": 0.1324,
"step": 598
},
{
"epoch": 0.16,
"grad_norm": 3.283498764038086,
"learning_rate": 1.9078187182001654e-06,
"loss": 0.1571,
"step": 599
},
{
"epoch": 0.16,
"grad_norm": 3.2634048461914062,
"learning_rate": 1.9074472452626775e-06,
"loss": 0.1504,
"step": 600
},
{
"epoch": 0.16,
"grad_norm": 3.1880810260772705,
"learning_rate": 1.9070750616610565e-06,
"loss": 0.1261,
"step": 601
},
{
"epoch": 0.16,
"grad_norm": 2.873277187347412,
"learning_rate": 1.9067021676867765e-06,
"loss": 0.1364,
"step": 602
},
{
"epoch": 0.16,
"grad_norm": 3.0402424335479736,
"learning_rate": 1.906328563631868e-06,
"loss": 0.137,
"step": 603
},
{
"epoch": 0.16,
"grad_norm": 2.894537925720215,
"learning_rate": 1.9059542497889176e-06,
"loss": 0.1266,
"step": 604
},
{
"epoch": 0.17,
"grad_norm": 3.748373031616211,
"learning_rate": 1.905579226451068e-06,
"loss": 0.1681,
"step": 605
},
{
"epoch": 0.17,
"grad_norm": 3.571746826171875,
"learning_rate": 1.9052034939120174e-06,
"loss": 0.1575,
"step": 606
},
{
"epoch": 0.17,
"grad_norm": 2.8718080520629883,
"learning_rate": 1.9048270524660196e-06,
"loss": 0.1227,
"step": 607
},
{
"epoch": 0.17,
"grad_norm": 3.5282645225524902,
"learning_rate": 1.904449902407883e-06,
"loss": 0.1603,
"step": 608
},
{
"epoch": 0.17,
"grad_norm": 3.7271251678466797,
"learning_rate": 1.9040720440329715e-06,
"loss": 0.1463,
"step": 609
},
{
"epoch": 0.17,
"grad_norm": 2.921295642852783,
"learning_rate": 1.9036934776372039e-06,
"loss": 0.1235,
"step": 610
},
{
"epoch": 0.17,
"grad_norm": 3.2682065963745117,
"learning_rate": 1.9033142035170526e-06,
"loss": 0.1262,
"step": 611
},
{
"epoch": 0.17,
"grad_norm": 3.3024415969848633,
"learning_rate": 1.9029342219695452e-06,
"loss": 0.1317,
"step": 612
},
{
"epoch": 0.17,
"grad_norm": 3.1507554054260254,
"learning_rate": 1.902553533292263e-06,
"loss": 0.1329,
"step": 613
},
{
"epoch": 0.17,
"grad_norm": 3.013246774673462,
"learning_rate": 1.9021721377833403e-06,
"loss": 0.1265,
"step": 614
},
{
"epoch": 0.17,
"grad_norm": 3.1882128715515137,
"learning_rate": 1.9017900357414667e-06,
"loss": 0.1462,
"step": 615
},
{
"epoch": 0.17,
"grad_norm": 3.45387864112854,
"learning_rate": 1.9014072274658831e-06,
"loss": 0.1331,
"step": 616
},
{
"epoch": 0.17,
"grad_norm": 3.2602956295013428,
"learning_rate": 1.9010237132563853e-06,
"loss": 0.1427,
"step": 617
},
{
"epoch": 0.17,
"grad_norm": 3.3179290294647217,
"learning_rate": 1.9006394934133206e-06,
"loss": 0.1478,
"step": 618
},
{
"epoch": 0.17,
"grad_norm": 3.0537261962890625,
"learning_rate": 1.9002545682375896e-06,
"loss": 0.1348,
"step": 619
},
{
"epoch": 0.17,
"grad_norm": 3.1373305320739746,
"learning_rate": 1.8998689380306448e-06,
"loss": 0.1306,
"step": 620
},
{
"epoch": 0.17,
"grad_norm": 3.3811371326446533,
"learning_rate": 1.8994826030944915e-06,
"loss": 0.1288,
"step": 621
},
{
"epoch": 0.17,
"grad_norm": 3.559420585632324,
"learning_rate": 1.8990955637316862e-06,
"loss": 0.1517,
"step": 622
},
{
"epoch": 0.17,
"grad_norm": 3.1817328929901123,
"learning_rate": 1.898707820245338e-06,
"loss": 0.1287,
"step": 623
},
{
"epoch": 0.17,
"grad_norm": 3.299144983291626,
"learning_rate": 1.8983193729391066e-06,
"loss": 0.1358,
"step": 624
},
{
"epoch": 0.17,
"grad_norm": 3.3346171379089355,
"learning_rate": 1.8979302221172027e-06,
"loss": 0.1554,
"step": 625
},
{
"epoch": 0.17,
"grad_norm": 3.2857518196105957,
"learning_rate": 1.897540368084389e-06,
"loss": 0.152,
"step": 626
},
{
"epoch": 0.17,
"grad_norm": 3.3210465908050537,
"learning_rate": 1.8971498111459778e-06,
"loss": 0.1434,
"step": 627
},
{
"epoch": 0.17,
"grad_norm": 3.127253770828247,
"learning_rate": 1.8967585516078328e-06,
"loss": 0.1214,
"step": 628
},
{
"epoch": 0.17,
"grad_norm": 2.917435884475708,
"learning_rate": 1.8963665897763677e-06,
"loss": 0.1383,
"step": 629
},
{
"epoch": 0.17,
"grad_norm": 2.965036392211914,
"learning_rate": 1.8959739259585454e-06,
"loss": 0.1205,
"step": 630
},
{
"epoch": 0.17,
"grad_norm": 2.9649155139923096,
"learning_rate": 1.8955805604618798e-06,
"loss": 0.1287,
"step": 631
},
{
"epoch": 0.17,
"grad_norm": 3.0632846355438232,
"learning_rate": 1.8951864935944334e-06,
"loss": 0.1295,
"step": 632
},
{
"epoch": 0.17,
"grad_norm": 3.091184616088867,
"learning_rate": 1.8947917256648186e-06,
"loss": 0.1298,
"step": 633
},
{
"epoch": 0.17,
"grad_norm": 2.8987550735473633,
"learning_rate": 1.894396256982196e-06,
"loss": 0.1319,
"step": 634
},
{
"epoch": 0.17,
"grad_norm": 2.84979248046875,
"learning_rate": 1.8940000878562755e-06,
"loss": 0.1337,
"step": 635
},
{
"epoch": 0.17,
"grad_norm": 2.992001533508301,
"learning_rate": 1.8936032185973164e-06,
"loss": 0.137,
"step": 636
},
{
"epoch": 0.17,
"grad_norm": 2.981226921081543,
"learning_rate": 1.8932056495161247e-06,
"loss": 0.1407,
"step": 637
},
{
"epoch": 0.17,
"grad_norm": 3.0171751976013184,
"learning_rate": 1.8928073809240551e-06,
"loss": 0.1271,
"step": 638
},
{
"epoch": 0.17,
"grad_norm": 3.175798177719116,
"learning_rate": 1.892408413133011e-06,
"loss": 0.1352,
"step": 639
},
{
"epoch": 0.17,
"grad_norm": 2.949618339538574,
"learning_rate": 1.8920087464554424e-06,
"loss": 0.1271,
"step": 640
},
{
"epoch": 0.18,
"grad_norm": 3.176654577255249,
"learning_rate": 1.8916083812043463e-06,
"loss": 0.1388,
"step": 641
},
{
"epoch": 0.18,
"grad_norm": 3.2517740726470947,
"learning_rate": 1.891207317693268e-06,
"loss": 0.137,
"step": 642
},
{
"epoch": 0.18,
"grad_norm": 3.248309850692749,
"learning_rate": 1.890805556236299e-06,
"loss": 0.1328,
"step": 643
},
{
"epoch": 0.18,
"grad_norm": 3.2052953243255615,
"learning_rate": 1.8904030971480767e-06,
"loss": 0.1341,
"step": 644
},
{
"epoch": 0.18,
"grad_norm": 3.079794406890869,
"learning_rate": 1.8899999407437859e-06,
"loss": 0.1322,
"step": 645
},
{
"epoch": 0.18,
"grad_norm": 3.074662685394287,
"learning_rate": 1.8895960873391573e-06,
"loss": 0.1317,
"step": 646
},
{
"epoch": 0.18,
"grad_norm": 3.106125593185425,
"learning_rate": 1.889191537250467e-06,
"loss": 0.1501,
"step": 647
},
{
"epoch": 0.18,
"grad_norm": 3.11283802986145,
"learning_rate": 1.8887862907945373e-06,
"loss": 0.1598,
"step": 648
},
{
"epoch": 0.18,
"grad_norm": 3.104076862335205,
"learning_rate": 1.8883803482887352e-06,
"loss": 0.131,
"step": 649
},
{
"epoch": 0.18,
"grad_norm": 2.8443734645843506,
"learning_rate": 1.8879737100509737e-06,
"loss": 0.139,
"step": 650
},
{
"epoch": 0.18,
"grad_norm": 2.937448024749756,
"learning_rate": 1.8875663763997095e-06,
"loss": 0.1407,
"step": 651
},
{
"epoch": 0.18,
"grad_norm": 2.8431577682495117,
"learning_rate": 1.8871583476539445e-06,
"loss": 0.1291,
"step": 652
},
{
"epoch": 0.18,
"grad_norm": 3.3876118659973145,
"learning_rate": 1.8867496241332255e-06,
"loss": 0.1332,
"step": 653
},
{
"epoch": 0.18,
"grad_norm": 3.0150880813598633,
"learning_rate": 1.8863402061576428e-06,
"loss": 0.1321,
"step": 654
},
{
"epoch": 0.18,
"grad_norm": 3.4667561054229736,
"learning_rate": 1.8859300940478302e-06,
"loss": 0.1642,
"step": 655
},
{
"epoch": 0.18,
"grad_norm": 3.204092264175415,
"learning_rate": 1.885519288124966e-06,
"loss": 0.1446,
"step": 656
},
{
"epoch": 0.18,
"grad_norm": 3.096914768218994,
"learning_rate": 1.8851077887107714e-06,
"loss": 0.1375,
"step": 657
},
{
"epoch": 0.18,
"grad_norm": 2.9773669242858887,
"learning_rate": 1.8846955961275103e-06,
"loss": 0.1328,
"step": 658
},
{
"epoch": 0.18,
"grad_norm": 3.0036585330963135,
"learning_rate": 1.8842827106979904e-06,
"loss": 0.154,
"step": 659
},
{
"epoch": 0.18,
"grad_norm": 3.532486915588379,
"learning_rate": 1.8838691327455609e-06,
"loss": 0.1609,
"step": 660
},
{
"epoch": 0.18,
"grad_norm": 3.165400266647339,
"learning_rate": 1.8834548625941146e-06,
"loss": 0.1423,
"step": 661
},
{
"epoch": 0.18,
"grad_norm": 3.1645941734313965,
"learning_rate": 1.8830399005680854e-06,
"loss": 0.1355,
"step": 662
},
{
"epoch": 0.18,
"grad_norm": 3.250861644744873,
"learning_rate": 1.8826242469924493e-06,
"loss": 0.1651,
"step": 663
},
{
"epoch": 0.18,
"grad_norm": 3.0005064010620117,
"learning_rate": 1.8822079021927242e-06,
"loss": 0.1383,
"step": 664
},
{
"epoch": 0.18,
"grad_norm": 2.876596689224243,
"learning_rate": 1.8817908664949686e-06,
"loss": 0.1101,
"step": 665
},
{
"epoch": 0.18,
"grad_norm": 3.1340842247009277,
"learning_rate": 1.8813731402257829e-06,
"loss": 0.1384,
"step": 666
},
{
"epoch": 0.18,
"grad_norm": 3.11590576171875,
"learning_rate": 1.8809547237123077e-06,
"loss": 0.1439,
"step": 667
},
{
"epoch": 0.18,
"grad_norm": 2.904118776321411,
"learning_rate": 1.8805356172822248e-06,
"loss": 0.1284,
"step": 668
},
{
"epoch": 0.18,
"grad_norm": 3.2419793605804443,
"learning_rate": 1.880115821263756e-06,
"loss": 0.1479,
"step": 669
},
{
"epoch": 0.18,
"grad_norm": 3.2196366786956787,
"learning_rate": 1.8796953359856625e-06,
"loss": 0.135,
"step": 670
},
{
"epoch": 0.18,
"grad_norm": 3.2696025371551514,
"learning_rate": 1.8792741617772462e-06,
"loss": 0.1379,
"step": 671
},
{
"epoch": 0.18,
"grad_norm": 2.964501142501831,
"learning_rate": 1.8788522989683485e-06,
"loss": 0.1304,
"step": 672
},
{
"epoch": 0.18,
"grad_norm": 2.754298686981201,
"learning_rate": 1.8784297478893491e-06,
"loss": 0.1319,
"step": 673
},
{
"epoch": 0.18,
"grad_norm": 3.0672099590301514,
"learning_rate": 1.878006508871168e-06,
"loss": 0.1423,
"step": 674
},
{
"epoch": 0.18,
"grad_norm": 3.1538891792297363,
"learning_rate": 1.8775825822452634e-06,
"loss": 0.1479,
"step": 675
},
{
"epoch": 0.18,
"grad_norm": 3.1253247261047363,
"learning_rate": 1.8771579683436313e-06,
"loss": 0.1325,
"step": 676
},
{
"epoch": 0.18,
"grad_norm": 3.3955092430114746,
"learning_rate": 1.8767326674988069e-06,
"loss": 0.1466,
"step": 677
},
{
"epoch": 0.19,
"grad_norm": 2.8525750637054443,
"learning_rate": 1.8763066800438634e-06,
"loss": 0.127,
"step": 678
},
{
"epoch": 0.19,
"grad_norm": 2.9499616622924805,
"learning_rate": 1.8758800063124114e-06,
"loss": 0.1326,
"step": 679
},
{
"epoch": 0.19,
"grad_norm": 3.2043933868408203,
"learning_rate": 1.8754526466385983e-06,
"loss": 0.1335,
"step": 680
},
{
"epoch": 0.19,
"grad_norm": 3.0339996814727783,
"learning_rate": 1.8750246013571098e-06,
"loss": 0.1313,
"step": 681
},
{
"epoch": 0.19,
"grad_norm": 3.287776231765747,
"learning_rate": 1.874595870803168e-06,
"loss": 0.1472,
"step": 682
},
{
"epoch": 0.19,
"grad_norm": 2.9506630897521973,
"learning_rate": 1.8741664553125316e-06,
"loss": 0.136,
"step": 683
},
{
"epoch": 0.19,
"grad_norm": 3.030766010284424,
"learning_rate": 1.8737363552214962e-06,
"loss": 0.131,
"step": 684
},
{
"epoch": 0.19,
"grad_norm": 3.2087364196777344,
"learning_rate": 1.8733055708668925e-06,
"loss": 0.1448,
"step": 685
},
{
"epoch": 0.19,
"grad_norm": 3.3978960514068604,
"learning_rate": 1.8728741025860887e-06,
"loss": 0.1547,
"step": 686
},
{
"epoch": 0.19,
"grad_norm": 2.826280117034912,
"learning_rate": 1.872441950716987e-06,
"loss": 0.1236,
"step": 687
},
{
"epoch": 0.19,
"grad_norm": 3.1583199501037598,
"learning_rate": 1.8720091155980255e-06,
"loss": 0.1566,
"step": 688
},
{
"epoch": 0.19,
"grad_norm": 3.0519423484802246,
"learning_rate": 1.871575597568178e-06,
"loss": 0.1415,
"step": 689
},
{
"epoch": 0.19,
"grad_norm": 3.523808479309082,
"learning_rate": 1.8711413969669525e-06,
"loss": 0.1594,
"step": 690
},
{
"epoch": 0.19,
"grad_norm": 3.3291847705841064,
"learning_rate": 1.8707065141343916e-06,
"loss": 0.1476,
"step": 691
},
{
"epoch": 0.19,
"grad_norm": 3.1632869243621826,
"learning_rate": 1.870270949411072e-06,
"loss": 0.1525,
"step": 692
},
{
"epoch": 0.19,
"grad_norm": 2.94757080078125,
"learning_rate": 1.8698347031381052e-06,
"loss": 0.134,
"step": 693
},
{
"epoch": 0.19,
"grad_norm": 3.0513572692871094,
"learning_rate": 1.8693977756571357e-06,
"loss": 0.1513,
"step": 694
},
{
"epoch": 0.19,
"grad_norm": 3.512118101119995,
"learning_rate": 1.8689601673103417e-06,
"loss": 0.1543,
"step": 695
},
{
"epoch": 0.19,
"grad_norm": 3.3068912029266357,
"learning_rate": 1.8685218784404346e-06,
"loss": 0.1543,
"step": 696
},
{
"epoch": 0.19,
"grad_norm": 3.037355899810791,
"learning_rate": 1.868082909390659e-06,
"loss": 0.1358,
"step": 697
},
{
"epoch": 0.19,
"grad_norm": 3.0297398567199707,
"learning_rate": 1.8676432605047915e-06,
"loss": 0.1392,
"step": 698
},
{
"epoch": 0.19,
"grad_norm": 3.0772452354431152,
"learning_rate": 1.8672029321271423e-06,
"loss": 0.1386,
"step": 699
},
{
"epoch": 0.19,
"grad_norm": 3.000218152999878,
"learning_rate": 1.8667619246025526e-06,
"loss": 0.1231,
"step": 700
},
{
"epoch": 0.19,
"grad_norm": 2.958463430404663,
"learning_rate": 1.866320238276396e-06,
"loss": 0.1304,
"step": 701
},
{
"epoch": 0.19,
"grad_norm": 3.007695198059082,
"learning_rate": 1.8658778734945773e-06,
"loss": 0.1259,
"step": 702
},
{
"epoch": 0.19,
"grad_norm": 3.208649158477783,
"learning_rate": 1.8654348306035335e-06,
"loss": 0.1414,
"step": 703
},
{
"epoch": 0.19,
"grad_norm": 3.0507309436798096,
"learning_rate": 1.8649911099502314e-06,
"loss": 0.1221,
"step": 704
},
{
"epoch": 0.19,
"grad_norm": 3.497598648071289,
"learning_rate": 1.8645467118821698e-06,
"loss": 0.1366,
"step": 705
},
{
"epoch": 0.19,
"grad_norm": 3.0675430297851562,
"learning_rate": 1.8641016367473775e-06,
"loss": 0.1393,
"step": 706
},
{
"epoch": 0.19,
"grad_norm": 3.0553345680236816,
"learning_rate": 1.8636558848944133e-06,
"loss": 0.1463,
"step": 707
},
{
"epoch": 0.19,
"grad_norm": 2.814319610595703,
"learning_rate": 1.863209456672366e-06,
"loss": 0.1266,
"step": 708
},
{
"epoch": 0.19,
"grad_norm": 2.928010940551758,
"learning_rate": 1.862762352430855e-06,
"loss": 0.1433,
"step": 709
},
{
"epoch": 0.19,
"grad_norm": 3.2125368118286133,
"learning_rate": 1.8623145725200277e-06,
"loss": 0.1298,
"step": 710
},
{
"epoch": 0.19,
"grad_norm": 2.9775197505950928,
"learning_rate": 1.8618661172905617e-06,
"loss": 0.112,
"step": 711
},
{
"epoch": 0.19,
"grad_norm": 3.340319871902466,
"learning_rate": 1.8614169870936634e-06,
"loss": 0.1322,
"step": 712
},
{
"epoch": 0.19,
"grad_norm": 3.5040061473846436,
"learning_rate": 1.860967182281067e-06,
"loss": 0.1424,
"step": 713
},
{
"epoch": 0.2,
"grad_norm": 3.348180055618286,
"learning_rate": 1.8605167032050357e-06,
"loss": 0.1503,
"step": 714
},
{
"epoch": 0.2,
"grad_norm": 2.845379590988159,
"learning_rate": 1.8600655502183608e-06,
"loss": 0.1203,
"step": 715
},
{
"epoch": 0.2,
"grad_norm": 3.2701401710510254,
"learning_rate": 1.8596137236743611e-06,
"loss": 0.1561,
"step": 716
},
{
"epoch": 0.2,
"grad_norm": 3.179831027984619,
"learning_rate": 1.8591612239268831e-06,
"loss": 0.1348,
"step": 717
},
{
"epoch": 0.2,
"grad_norm": 3.3915798664093018,
"learning_rate": 1.8587080513303005e-06,
"loss": 0.1387,
"step": 718
},
{
"epoch": 0.2,
"grad_norm": 3.1097445487976074,
"learning_rate": 1.8582542062395131e-06,
"loss": 0.1484,
"step": 719
},
{
"epoch": 0.2,
"grad_norm": 2.8442914485931396,
"learning_rate": 1.8577996890099489e-06,
"loss": 0.1348,
"step": 720
},
{
"epoch": 0.2,
"grad_norm": 2.8544671535491943,
"learning_rate": 1.8573444999975612e-06,
"loss": 0.1327,
"step": 721
},
{
"epoch": 0.2,
"grad_norm": 3.004678249359131,
"learning_rate": 1.8568886395588295e-06,
"loss": 0.1393,
"step": 722
},
{
"epoch": 0.2,
"grad_norm": 2.9679996967315674,
"learning_rate": 1.8564321080507596e-06,
"loss": 0.1397,
"step": 723
},
{
"epoch": 0.2,
"grad_norm": 3.111448287963867,
"learning_rate": 1.8559749058308824e-06,
"loss": 0.1578,
"step": 724
},
{
"epoch": 0.2,
"grad_norm": 3.253422260284424,
"learning_rate": 1.8555170332572542e-06,
"loss": 0.1608,
"step": 725
},
{
"epoch": 0.2,
"grad_norm": 3.186513662338257,
"learning_rate": 1.8550584906884565e-06,
"loss": 0.1529,
"step": 726
},
{
"epoch": 0.2,
"grad_norm": 3.253415584564209,
"learning_rate": 1.8545992784835952e-06,
"loss": 0.1379,
"step": 727
},
{
"epoch": 0.2,
"grad_norm": 3.2245519161224365,
"learning_rate": 1.8541393970023004e-06,
"loss": 0.137,
"step": 728
},
{
"epoch": 0.2,
"grad_norm": 3.0035579204559326,
"learning_rate": 1.8536788466047272e-06,
"loss": 0.1171,
"step": 729
},
{
"epoch": 0.2,
"grad_norm": 2.9544918537139893,
"learning_rate": 1.8532176276515534e-06,
"loss": 0.13,
"step": 730
},
{
"epoch": 0.2,
"grad_norm": 3.0871593952178955,
"learning_rate": 1.8527557405039817e-06,
"loss": 0.1345,
"step": 731
},
{
"epoch": 0.2,
"grad_norm": 3.2445547580718994,
"learning_rate": 1.852293185523737e-06,
"loss": 0.1407,
"step": 732
},
{
"epoch": 0.2,
"grad_norm": 3.1610848903656006,
"learning_rate": 1.8518299630730678e-06,
"loss": 0.1449,
"step": 733
},
{
"epoch": 0.2,
"grad_norm": 3.135960578918457,
"learning_rate": 1.851366073514745e-06,
"loss": 0.1454,
"step": 734
},
{
"epoch": 0.2,
"grad_norm": 3.1653096675872803,
"learning_rate": 1.850901517212062e-06,
"loss": 0.1353,
"step": 735
},
{
"epoch": 0.2,
"grad_norm": 3.5717997550964355,
"learning_rate": 1.8504362945288347e-06,
"loss": 0.1383,
"step": 736
},
{
"epoch": 0.2,
"grad_norm": 2.9212334156036377,
"learning_rate": 1.8499704058294007e-06,
"loss": 0.1348,
"step": 737
},
{
"epoch": 0.2,
"grad_norm": 2.8877100944519043,
"learning_rate": 1.8495038514786184e-06,
"loss": 0.1258,
"step": 738
},
{
"epoch": 0.2,
"grad_norm": 3.737501621246338,
"learning_rate": 1.8490366318418692e-06,
"loss": 0.1574,
"step": 739
},
{
"epoch": 0.2,
"grad_norm": 2.9736099243164062,
"learning_rate": 1.8485687472850537e-06,
"loss": 0.1316,
"step": 740
},
{
"epoch": 0.2,
"grad_norm": 3.177656650543213,
"learning_rate": 1.8481001981745945e-06,
"loss": 0.1243,
"step": 741
},
{
"epoch": 0.2,
"grad_norm": 2.998987913131714,
"learning_rate": 1.8476309848774343e-06,
"loss": 0.1302,
"step": 742
},
{
"epoch": 0.2,
"grad_norm": 3.0486202239990234,
"learning_rate": 1.8471611077610353e-06,
"loss": 0.1395,
"step": 743
},
{
"epoch": 0.2,
"grad_norm": 2.892500638961792,
"learning_rate": 1.8466905671933806e-06,
"loss": 0.1232,
"step": 744
},
{
"epoch": 0.2,
"grad_norm": 3.2417190074920654,
"learning_rate": 1.846219363542972e-06,
"loss": 0.1426,
"step": 745
},
{
"epoch": 0.2,
"grad_norm": 2.9307663440704346,
"learning_rate": 1.8457474971788315e-06,
"loss": 0.1439,
"step": 746
},
{
"epoch": 0.2,
"grad_norm": 3.0509347915649414,
"learning_rate": 1.8452749684704992e-06,
"loss": 0.1312,
"step": 747
},
{
"epoch": 0.2,
"grad_norm": 2.960585355758667,
"learning_rate": 1.8448017777880347e-06,
"loss": 0.1315,
"step": 748
},
{
"epoch": 0.2,
"grad_norm": 3.291780471801758,
"learning_rate": 1.844327925502015e-06,
"loss": 0.1479,
"step": 749
},
{
"epoch": 0.2,
"grad_norm": 2.964261054992676,
"learning_rate": 1.8438534119835362e-06,
"loss": 0.1281,
"step": 750
},
{
"epoch": 0.21,
"grad_norm": 3.006636619567871,
"learning_rate": 1.8433782376042123e-06,
"loss": 0.1418,
"step": 751
},
{
"epoch": 0.21,
"grad_norm": 2.929056167602539,
"learning_rate": 1.8429024027361737e-06,
"loss": 0.1345,
"step": 752
},
{
"epoch": 0.21,
"grad_norm": 3.0413315296173096,
"learning_rate": 1.8424259077520693e-06,
"loss": 0.1422,
"step": 753
},
{
"epoch": 0.21,
"grad_norm": 3.3456640243530273,
"learning_rate": 1.8419487530250644e-06,
"loss": 0.1559,
"step": 754
},
{
"epoch": 0.21,
"grad_norm": 3.007039785385132,
"learning_rate": 1.841470938928841e-06,
"loss": 0.1417,
"step": 755
},
{
"epoch": 0.21,
"grad_norm": 3.312983989715576,
"learning_rate": 1.8409924658375973e-06,
"loss": 0.1475,
"step": 756
},
{
"epoch": 0.21,
"grad_norm": 3.082507848739624,
"learning_rate": 1.8405133341260483e-06,
"loss": 0.1463,
"step": 757
},
{
"epoch": 0.21,
"grad_norm": 3.0856258869171143,
"learning_rate": 1.840033544169424e-06,
"loss": 0.1322,
"step": 758
},
{
"epoch": 0.21,
"grad_norm": 2.7325146198272705,
"learning_rate": 1.8395530963434704e-06,
"loss": 0.1307,
"step": 759
},
{
"epoch": 0.21,
"grad_norm": 3.127000570297241,
"learning_rate": 1.8390719910244486e-06,
"loss": 0.1431,
"step": 760
},
{
"epoch": 0.21,
"grad_norm": 3.2272660732269287,
"learning_rate": 1.838590228589134e-06,
"loss": 0.1284,
"step": 761
},
{
"epoch": 0.21,
"grad_norm": 3.585092306137085,
"learning_rate": 1.8381078094148182e-06,
"loss": 0.1347,
"step": 762
},
{
"epoch": 0.21,
"grad_norm": 2.858229160308838,
"learning_rate": 1.837624733879305e-06,
"loss": 0.1305,
"step": 763
},
{
"epoch": 0.21,
"grad_norm": 2.713216781616211,
"learning_rate": 1.8371410023609138e-06,
"loss": 0.1189,
"step": 764
},
{
"epoch": 0.21,
"grad_norm": 2.635714054107666,
"learning_rate": 1.836656615238477e-06,
"loss": 0.1229,
"step": 765
},
{
"epoch": 0.21,
"grad_norm": 3.3685312271118164,
"learning_rate": 1.8361715728913411e-06,
"loss": 0.1653,
"step": 766
},
{
"epoch": 0.21,
"grad_norm": 3.0395724773406982,
"learning_rate": 1.8356858756993652e-06,
"loss": 0.1332,
"step": 767
},
{
"epoch": 0.21,
"grad_norm": 3.4399709701538086,
"learning_rate": 1.8351995240429213e-06,
"loss": 0.144,
"step": 768
},
{
"epoch": 0.21,
"grad_norm": 3.529384136199951,
"learning_rate": 1.8347125183028938e-06,
"loss": 0.1436,
"step": 769
},
{
"epoch": 0.21,
"grad_norm": 2.861670970916748,
"learning_rate": 1.8342248588606796e-06,
"loss": 0.1263,
"step": 770
},
{
"epoch": 0.21,
"grad_norm": 3.187953472137451,
"learning_rate": 1.833736546098188e-06,
"loss": 0.1372,
"step": 771
},
{
"epoch": 0.21,
"grad_norm": 2.8213915824890137,
"learning_rate": 1.8332475803978388e-06,
"loss": 0.1343,
"step": 772
},
{
"epoch": 0.21,
"grad_norm": 3.0806078910827637,
"learning_rate": 1.8327579621425637e-06,
"loss": 0.1489,
"step": 773
},
{
"epoch": 0.21,
"grad_norm": 2.9299821853637695,
"learning_rate": 1.8322676917158062e-06,
"loss": 0.1397,
"step": 774
},
{
"epoch": 0.21,
"grad_norm": 2.94779109954834,
"learning_rate": 1.8317767695015194e-06,
"loss": 0.14,
"step": 775
},
{
"epoch": 0.21,
"grad_norm": 3.0014383792877197,
"learning_rate": 1.8312851958841672e-06,
"loss": 0.1343,
"step": 776
},
{
"epoch": 0.21,
"grad_norm": 3.195161819458008,
"learning_rate": 1.830792971248724e-06,
"loss": 0.1501,
"step": 777
},
{
"epoch": 0.21,
"grad_norm": 3.0580642223358154,
"learning_rate": 1.8303000959806739e-06,
"loss": 0.1343,
"step": 778
},
{
"epoch": 0.21,
"grad_norm": 3.2072629928588867,
"learning_rate": 1.8298065704660102e-06,
"loss": 0.1449,
"step": 779
},
{
"epoch": 0.21,
"grad_norm": 3.1455795764923096,
"learning_rate": 1.829312395091236e-06,
"loss": 0.1367,
"step": 780
},
{
"epoch": 0.21,
"grad_norm": 3.1127781867980957,
"learning_rate": 1.8288175702433623e-06,
"loss": 0.1296,
"step": 781
},
{
"epoch": 0.21,
"grad_norm": 2.8094706535339355,
"learning_rate": 1.8283220963099101e-06,
"loss": 0.1293,
"step": 782
},
{
"epoch": 0.21,
"grad_norm": 3.0397279262542725,
"learning_rate": 1.8278259736789083e-06,
"loss": 0.1294,
"step": 783
},
{
"epoch": 0.21,
"grad_norm": 2.868248701095581,
"learning_rate": 1.827329202738893e-06,
"loss": 0.1356,
"step": 784
},
{
"epoch": 0.21,
"grad_norm": 3.0783395767211914,
"learning_rate": 1.8268317838789087e-06,
"loss": 0.1595,
"step": 785
},
{
"epoch": 0.21,
"grad_norm": 2.824054002761841,
"learning_rate": 1.8263337174885074e-06,
"loss": 0.1178,
"step": 786
},
{
"epoch": 0.21,
"grad_norm": 2.7530617713928223,
"learning_rate": 1.8258350039577482e-06,
"loss": 0.1348,
"step": 787
},
{
"epoch": 0.22,
"grad_norm": 3.053938388824463,
"learning_rate": 1.8253356436771962e-06,
"loss": 0.125,
"step": 788
},
{
"epoch": 0.22,
"grad_norm": 2.9758999347686768,
"learning_rate": 1.8248356370379247e-06,
"loss": 0.1452,
"step": 789
},
{
"epoch": 0.22,
"grad_norm": 3.1671602725982666,
"learning_rate": 1.8243349844315115e-06,
"loss": 0.1436,
"step": 790
},
{
"epoch": 0.22,
"grad_norm": 3.3275105953216553,
"learning_rate": 1.8238336862500408e-06,
"loss": 0.1345,
"step": 791
},
{
"epoch": 0.22,
"grad_norm": 3.104665517807007,
"learning_rate": 1.823331742886103e-06,
"loss": 0.1352,
"step": 792
},
{
"epoch": 0.22,
"grad_norm": 3.321075916290283,
"learning_rate": 1.8228291547327928e-06,
"loss": 0.1661,
"step": 793
},
{
"epoch": 0.22,
"grad_norm": 3.2859387397766113,
"learning_rate": 1.8223259221837106e-06,
"loss": 0.1478,
"step": 794
},
{
"epoch": 0.22,
"grad_norm": 3.207505464553833,
"learning_rate": 1.8218220456329614e-06,
"loss": 0.1415,
"step": 795
},
{
"epoch": 0.22,
"grad_norm": 3.0411899089813232,
"learning_rate": 1.821317525475154e-06,
"loss": 0.1327,
"step": 796
},
{
"epoch": 0.22,
"grad_norm": 2.902604103088379,
"learning_rate": 1.8208123621054016e-06,
"loss": 0.1452,
"step": 797
},
{
"epoch": 0.22,
"grad_norm": 2.8644320964813232,
"learning_rate": 1.8203065559193212e-06,
"loss": 0.1413,
"step": 798
},
{
"epoch": 0.22,
"grad_norm": 3.0004799365997314,
"learning_rate": 1.8198001073130333e-06,
"loss": 0.136,
"step": 799
},
{
"epoch": 0.22,
"grad_norm": 3.020627737045288,
"learning_rate": 1.8192930166831615e-06,
"loss": 0.145,
"step": 800
},
{
"epoch": 0.22,
"grad_norm": 2.8599250316619873,
"learning_rate": 1.8187852844268318e-06,
"loss": 0.1313,
"step": 801
},
{
"epoch": 0.22,
"grad_norm": 2.9492738246917725,
"learning_rate": 1.8182769109416727e-06,
"loss": 0.1326,
"step": 802
},
{
"epoch": 0.22,
"grad_norm": 2.7574639320373535,
"learning_rate": 1.8177678966258155e-06,
"loss": 0.1369,
"step": 803
},
{
"epoch": 0.22,
"grad_norm": 2.846613645553589,
"learning_rate": 1.817258241877893e-06,
"loss": 0.1317,
"step": 804
},
{
"epoch": 0.22,
"grad_norm": 3.2272794246673584,
"learning_rate": 1.8167479470970391e-06,
"loss": 0.1472,
"step": 805
},
{
"epoch": 0.22,
"grad_norm": 2.8653202056884766,
"learning_rate": 1.81623701268289e-06,
"loss": 0.1297,
"step": 806
},
{
"epoch": 0.22,
"grad_norm": 3.3714654445648193,
"learning_rate": 1.8157254390355812e-06,
"loss": 0.1624,
"step": 807
},
{
"epoch": 0.22,
"grad_norm": 3.1940484046936035,
"learning_rate": 1.815213226555751e-06,
"loss": 0.1449,
"step": 808
},
{
"epoch": 0.22,
"grad_norm": 3.4637551307678223,
"learning_rate": 1.8147003756445361e-06,
"loss": 0.1484,
"step": 809
},
{
"epoch": 0.22,
"grad_norm": 3.2469146251678467,
"learning_rate": 1.8141868867035744e-06,
"loss": 0.1406,
"step": 810
},
{
"epoch": 0.22,
"grad_norm": 3.205935478210449,
"learning_rate": 1.813672760135002e-06,
"loss": 0.149,
"step": 811
},
{
"epoch": 0.22,
"grad_norm": 3.0534427165985107,
"learning_rate": 1.8131579963414563e-06,
"loss": 0.1341,
"step": 812
},
{
"epoch": 0.22,
"grad_norm": 3.206599473953247,
"learning_rate": 1.8126425957260722e-06,
"loss": 0.1311,
"step": 813
},
{
"epoch": 0.22,
"grad_norm": 2.7452621459960938,
"learning_rate": 1.8121265586924846e-06,
"loss": 0.1222,
"step": 814
},
{
"epoch": 0.22,
"grad_norm": 2.8834660053253174,
"learning_rate": 1.8116098856448251e-06,
"loss": 0.1356,
"step": 815
},
{
"epoch": 0.22,
"grad_norm": 2.8721609115600586,
"learning_rate": 1.8110925769877252e-06,
"loss": 0.1254,
"step": 816
},
{
"epoch": 0.22,
"grad_norm": 3.101356267929077,
"learning_rate": 1.810574633126313e-06,
"loss": 0.1375,
"step": 817
},
{
"epoch": 0.22,
"grad_norm": 3.010807514190674,
"learning_rate": 1.8100560544662144e-06,
"loss": 0.1252,
"step": 818
},
{
"epoch": 0.22,
"grad_norm": 2.8136961460113525,
"learning_rate": 1.8095368414135525e-06,
"loss": 0.1231,
"step": 819
},
{
"epoch": 0.22,
"grad_norm": 3.085277557373047,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.1451,
"step": 820
},
{
"epoch": 0.22,
"grad_norm": 3.15325927734375,
"learning_rate": 1.808496513757515e-06,
"loss": 0.1428,
"step": 821
},
{
"epoch": 0.22,
"grad_norm": 3.2210240364074707,
"learning_rate": 1.8079753999688686e-06,
"loss": 0.1531,
"step": 822
},
{
"epoch": 0.22,
"grad_norm": 3.0681214332580566,
"learning_rate": 1.8074536534171158e-06,
"loss": 0.1286,
"step": 823
},
{
"epoch": 0.23,
"grad_norm": 2.953857898712158,
"learning_rate": 1.8069312745108614e-06,
"loss": 0.129,
"step": 824
},
{
"epoch": 0.23,
"grad_norm": 3.2039496898651123,
"learning_rate": 1.806408263659204e-06,
"loss": 0.1484,
"step": 825
},
{
"epoch": 0.23,
"grad_norm": 3.072256326675415,
"learning_rate": 1.8058846212717379e-06,
"loss": 0.1209,
"step": 826
},
{
"epoch": 0.23,
"grad_norm": 2.9288153648376465,
"learning_rate": 1.805360347758552e-06,
"loss": 0.1425,
"step": 827
},
{
"epoch": 0.23,
"grad_norm": 3.1752777099609375,
"learning_rate": 1.8048354435302289e-06,
"loss": 0.138,
"step": 828
},
{
"epoch": 0.23,
"grad_norm": 2.991837739944458,
"learning_rate": 1.8043099089978457e-06,
"loss": 0.1459,
"step": 829
},
{
"epoch": 0.23,
"grad_norm": 2.880028009414673,
"learning_rate": 1.8037837445729732e-06,
"loss": 0.1226,
"step": 830
},
{
"epoch": 0.23,
"grad_norm": 3.154533863067627,
"learning_rate": 1.8032569506676748e-06,
"loss": 0.1419,
"step": 831
},
{
"epoch": 0.23,
"grad_norm": 3.134220600128174,
"learning_rate": 1.8027295276945075e-06,
"loss": 0.1417,
"step": 832
},
{
"epoch": 0.23,
"grad_norm": 2.910787582397461,
"learning_rate": 1.802201476066521e-06,
"loss": 0.1455,
"step": 833
},
{
"epoch": 0.23,
"grad_norm": 2.9492084980010986,
"learning_rate": 1.8016727961972564e-06,
"loss": 0.1274,
"step": 834
},
{
"epoch": 0.23,
"grad_norm": 2.8318257331848145,
"learning_rate": 1.8011434885007479e-06,
"loss": 0.1205,
"step": 835
},
{
"epoch": 0.23,
"grad_norm": 3.070709705352783,
"learning_rate": 1.8006135533915212e-06,
"loss": 0.1402,
"step": 836
},
{
"epoch": 0.23,
"grad_norm": 3.4233484268188477,
"learning_rate": 1.8000829912845929e-06,
"loss": 0.1334,
"step": 837
},
{
"epoch": 0.23,
"grad_norm": 3.1346375942230225,
"learning_rate": 1.7995518025954707e-06,
"loss": 0.1343,
"step": 838
},
{
"epoch": 0.23,
"grad_norm": 3.0293099880218506,
"learning_rate": 1.7990199877401535e-06,
"loss": 0.141,
"step": 839
},
{
"epoch": 0.23,
"grad_norm": 3.2051947116851807,
"learning_rate": 1.79848754713513e-06,
"loss": 0.1401,
"step": 840
},
{
"epoch": 0.23,
"grad_norm": 3.3078722953796387,
"learning_rate": 1.7979544811973791e-06,
"loss": 0.168,
"step": 841
},
{
"epoch": 0.23,
"grad_norm": 3.4778082370758057,
"learning_rate": 1.7974207903443699e-06,
"loss": 0.164,
"step": 842
},
{
"epoch": 0.23,
"grad_norm": 3.250641345977783,
"learning_rate": 1.7968864749940603e-06,
"loss": 0.1409,
"step": 843
},
{
"epoch": 0.23,
"grad_norm": 3.047170639038086,
"learning_rate": 1.7963515355648972e-06,
"loss": 0.1436,
"step": 844
},
{
"epoch": 0.23,
"grad_norm": 2.890998363494873,
"learning_rate": 1.795815972475817e-06,
"loss": 0.121,
"step": 845
},
{
"epoch": 0.23,
"grad_norm": 3.205892562866211,
"learning_rate": 1.7952797861462442e-06,
"loss": 0.1467,
"step": 846
},
{
"epoch": 0.23,
"grad_norm": 2.950531482696533,
"learning_rate": 1.7947429769960904e-06,
"loss": 0.1389,
"step": 847
},
{
"epoch": 0.23,
"grad_norm": 3.001091241836548,
"learning_rate": 1.7942055454457568e-06,
"loss": 0.143,
"step": 848
},
{
"epoch": 0.23,
"grad_norm": 3.553637742996216,
"learning_rate": 1.7936674919161305e-06,
"loss": 0.1711,
"step": 849
},
{
"epoch": 0.23,
"grad_norm": 3.0406932830810547,
"learning_rate": 1.793128816828586e-06,
"loss": 0.1519,
"step": 850
},
{
"epoch": 0.23,
"grad_norm": 2.908801555633545,
"learning_rate": 1.7925895206049858e-06,
"loss": 0.1184,
"step": 851
},
{
"epoch": 0.23,
"grad_norm": 3.0099973678588867,
"learning_rate": 1.7920496036676765e-06,
"loss": 0.1418,
"step": 852
},
{
"epoch": 0.23,
"grad_norm": 3.1775577068328857,
"learning_rate": 1.791509066439493e-06,
"loss": 0.1461,
"step": 853
},
{
"epoch": 0.23,
"grad_norm": 3.443354606628418,
"learning_rate": 1.790967909343755e-06,
"loss": 0.1538,
"step": 854
},
{
"epoch": 0.23,
"grad_norm": 3.434736728668213,
"learning_rate": 1.790426132804268e-06,
"loss": 0.1405,
"step": 855
},
{
"epoch": 0.23,
"grad_norm": 3.2804572582244873,
"learning_rate": 1.7898837372453221e-06,
"loss": 0.148,
"step": 856
},
{
"epoch": 0.23,
"grad_norm": 3.0672659873962402,
"learning_rate": 1.7893407230916924e-06,
"loss": 0.1477,
"step": 857
},
{
"epoch": 0.23,
"grad_norm": 3.0499002933502197,
"learning_rate": 1.788797090768639e-06,
"loss": 0.1387,
"step": 858
},
{
"epoch": 0.23,
"grad_norm": 3.054581642150879,
"learning_rate": 1.7882528407019048e-06,
"loss": 0.1431,
"step": 859
},
{
"epoch": 0.23,
"grad_norm": 3.286684513092041,
"learning_rate": 1.7877079733177183e-06,
"loss": 0.1417,
"step": 860
},
{
"epoch": 0.24,
"grad_norm": 3.0893189907073975,
"learning_rate": 1.7871624890427896e-06,
"loss": 0.135,
"step": 861
},
{
"epoch": 0.24,
"grad_norm": 3.071838855743408,
"learning_rate": 1.7866163883043139e-06,
"loss": 0.1455,
"step": 862
},
{
"epoch": 0.24,
"grad_norm": 3.244340658187866,
"learning_rate": 1.786069671529967e-06,
"loss": 0.1417,
"step": 863
},
{
"epoch": 0.24,
"grad_norm": 3.050936698913574,
"learning_rate": 1.7855223391479086e-06,
"loss": 0.1429,
"step": 864
},
{
"epoch": 0.24,
"grad_norm": 2.821762800216675,
"learning_rate": 1.7849743915867806e-06,
"loss": 0.1278,
"step": 865
},
{
"epoch": 0.24,
"grad_norm": 2.879225969314575,
"learning_rate": 1.7844258292757054e-06,
"loss": 0.1322,
"step": 866
},
{
"epoch": 0.24,
"grad_norm": 2.966362714767456,
"learning_rate": 1.7838766526442886e-06,
"loss": 0.144,
"step": 867
},
{
"epoch": 0.24,
"grad_norm": 2.860746145248413,
"learning_rate": 1.7833268621226148e-06,
"loss": 0.1338,
"step": 868
},
{
"epoch": 0.24,
"grad_norm": 3.343733072280884,
"learning_rate": 1.7827764581412515e-06,
"loss": 0.1579,
"step": 869
},
{
"epoch": 0.24,
"grad_norm": 2.8615481853485107,
"learning_rate": 1.7822254411312451e-06,
"loss": 0.1268,
"step": 870
},
{
"epoch": 0.24,
"grad_norm": 2.838470697402954,
"learning_rate": 1.781673811524123e-06,
"loss": 0.134,
"step": 871
},
{
"epoch": 0.24,
"grad_norm": 2.8155670166015625,
"learning_rate": 1.781121569751892e-06,
"loss": 0.1247,
"step": 872
},
{
"epoch": 0.24,
"grad_norm": 3.1020331382751465,
"learning_rate": 1.7805687162470378e-06,
"loss": 0.1358,
"step": 873
},
{
"epoch": 0.24,
"grad_norm": 2.99312424659729,
"learning_rate": 1.7800152514425265e-06,
"loss": 0.1452,
"step": 874
},
{
"epoch": 0.24,
"grad_norm": 3.434626340866089,
"learning_rate": 1.7794611757718011e-06,
"loss": 0.1574,
"step": 875
},
{
"epoch": 0.24,
"grad_norm": 2.9138333797454834,
"learning_rate": 1.7789064896687848e-06,
"loss": 0.1414,
"step": 876
},
{
"epoch": 0.24,
"grad_norm": 2.970022439956665,
"learning_rate": 1.7783511935678779e-06,
"loss": 0.1371,
"step": 877
},
{
"epoch": 0.24,
"grad_norm": 2.739241361618042,
"learning_rate": 1.7777952879039585e-06,
"loss": 0.1295,
"step": 878
},
{
"epoch": 0.24,
"grad_norm": 2.763500690460205,
"learning_rate": 1.7772387731123825e-06,
"loss": 0.1163,
"step": 879
},
{
"epoch": 0.24,
"grad_norm": 2.9955568313598633,
"learning_rate": 1.776681649628982e-06,
"loss": 0.1274,
"step": 880
},
{
"epoch": 0.24,
"grad_norm": 3.2668027877807617,
"learning_rate": 1.7761239178900667e-06,
"loss": 0.1637,
"step": 881
},
{
"epoch": 0.24,
"grad_norm": 3.040350914001465,
"learning_rate": 1.775565578332422e-06,
"loss": 0.1295,
"step": 882
},
{
"epoch": 0.24,
"grad_norm": 2.8555662631988525,
"learning_rate": 1.7750066313933096e-06,
"loss": 0.129,
"step": 883
},
{
"epoch": 0.24,
"grad_norm": 3.162750720977783,
"learning_rate": 1.774447077510467e-06,
"loss": 0.1485,
"step": 884
},
{
"epoch": 0.24,
"grad_norm": 3.2075698375701904,
"learning_rate": 1.7738869171221067e-06,
"loss": 0.1428,
"step": 885
},
{
"epoch": 0.24,
"grad_norm": 2.953458309173584,
"learning_rate": 1.7733261506669165e-06,
"loss": 0.129,
"step": 886
},
{
"epoch": 0.24,
"grad_norm": 3.3823306560516357,
"learning_rate": 1.7727647785840588e-06,
"loss": 0.1798,
"step": 887
},
{
"epoch": 0.24,
"grad_norm": 3.3550498485565186,
"learning_rate": 1.7722028013131695e-06,
"loss": 0.1642,
"step": 888
},
{
"epoch": 0.24,
"grad_norm": 3.0226235389709473,
"learning_rate": 1.77164021929436e-06,
"loss": 0.1301,
"step": 889
},
{
"epoch": 0.24,
"grad_norm": 3.0606689453125,
"learning_rate": 1.7710770329682143e-06,
"loss": 0.1472,
"step": 890
},
{
"epoch": 0.24,
"grad_norm": 2.988096237182617,
"learning_rate": 1.7705132427757892e-06,
"loss": 0.1399,
"step": 891
},
{
"epoch": 0.24,
"grad_norm": 3.045409679412842,
"learning_rate": 1.7699488491586154e-06,
"loss": 0.1208,
"step": 892
},
{
"epoch": 0.24,
"grad_norm": 2.9872851371765137,
"learning_rate": 1.769383852558696e-06,
"loss": 0.1435,
"step": 893
},
{
"epoch": 0.24,
"grad_norm": 3.2067313194274902,
"learning_rate": 1.7688182534185056e-06,
"loss": 0.1401,
"step": 894
},
{
"epoch": 0.24,
"grad_norm": 3.144598960876465,
"learning_rate": 1.7682520521809917e-06,
"loss": 0.1409,
"step": 895
},
{
"epoch": 0.24,
"grad_norm": 3.270148754119873,
"learning_rate": 1.7676852492895724e-06,
"loss": 0.1564,
"step": 896
},
{
"epoch": 0.25,
"grad_norm": 3.129302978515625,
"learning_rate": 1.7671178451881375e-06,
"loss": 0.1334,
"step": 897
},
{
"epoch": 0.25,
"grad_norm": 2.916828155517578,
"learning_rate": 1.7665498403210476e-06,
"loss": 0.1362,
"step": 898
},
{
"epoch": 0.25,
"grad_norm": 2.9184865951538086,
"learning_rate": 1.7659812351331342e-06,
"loss": 0.1359,
"step": 899
},
{
"epoch": 0.25,
"grad_norm": 3.1969926357269287,
"learning_rate": 1.7654120300696978e-06,
"loss": 0.1496,
"step": 900
},
{
"epoch": 0.25,
"grad_norm": 3.058776378631592,
"learning_rate": 1.7648422255765095e-06,
"loss": 0.1416,
"step": 901
},
{
"epoch": 0.25,
"grad_norm": 3.2968432903289795,
"learning_rate": 1.7642718220998093e-06,
"loss": 0.1299,
"step": 902
},
{
"epoch": 0.25,
"grad_norm": 3.108567953109741,
"learning_rate": 1.7637008200863077e-06,
"loss": 0.1533,
"step": 903
},
{
"epoch": 0.25,
"grad_norm": 2.989795207977295,
"learning_rate": 1.7631292199831824e-06,
"loss": 0.1295,
"step": 904
},
{
"epoch": 0.25,
"grad_norm": 3.2122561931610107,
"learning_rate": 1.7625570222380796e-06,
"loss": 0.1367,
"step": 905
},
{
"epoch": 0.25,
"grad_norm": 3.3966312408447266,
"learning_rate": 1.7619842272991145e-06,
"loss": 0.1526,
"step": 906
},
{
"epoch": 0.25,
"grad_norm": 3.062476634979248,
"learning_rate": 1.7614108356148693e-06,
"loss": 0.1203,
"step": 907
},
{
"epoch": 0.25,
"grad_norm": 3.133892774581909,
"learning_rate": 1.760836847634394e-06,
"loss": 0.1424,
"step": 908
},
{
"epoch": 0.25,
"grad_norm": 3.282561779022217,
"learning_rate": 1.7602622638072047e-06,
"loss": 0.1392,
"step": 909
},
{
"epoch": 0.25,
"grad_norm": 3.0799286365509033,
"learning_rate": 1.7596870845832847e-06,
"loss": 0.1433,
"step": 910
},
{
"epoch": 0.25,
"grad_norm": 3.043998956680298,
"learning_rate": 1.7591113104130844e-06,
"loss": 0.1511,
"step": 911
},
{
"epoch": 0.25,
"grad_norm": 2.924272060394287,
"learning_rate": 1.7585349417475184e-06,
"loss": 0.1295,
"step": 912
},
{
"epoch": 0.25,
"grad_norm": 3.174017906188965,
"learning_rate": 1.7579579790379683e-06,
"loss": 0.143,
"step": 913
},
{
"epoch": 0.25,
"grad_norm": 3.3196375370025635,
"learning_rate": 1.7573804227362805e-06,
"loss": 0.1654,
"step": 914
},
{
"epoch": 0.25,
"grad_norm": 3.114105224609375,
"learning_rate": 1.756802273294766e-06,
"loss": 0.1305,
"step": 915
},
{
"epoch": 0.25,
"grad_norm": 2.9059255123138428,
"learning_rate": 1.7562235311662e-06,
"loss": 0.134,
"step": 916
},
{
"epoch": 0.25,
"grad_norm": 3.0459864139556885,
"learning_rate": 1.7556441968038237e-06,
"loss": 0.1294,
"step": 917
},
{
"epoch": 0.25,
"grad_norm": 2.786449670791626,
"learning_rate": 1.7550642706613395e-06,
"loss": 0.1302,
"step": 918
},
{
"epoch": 0.25,
"grad_norm": 3.0151493549346924,
"learning_rate": 1.754483753192915e-06,
"loss": 0.1356,
"step": 919
},
{
"epoch": 0.25,
"grad_norm": 2.8167083263397217,
"learning_rate": 1.7539026448531806e-06,
"loss": 0.1304,
"step": 920
},
{
"epoch": 0.25,
"grad_norm": 3.0963945388793945,
"learning_rate": 1.7533209460972292e-06,
"loss": 0.1348,
"step": 921
},
{
"epoch": 0.25,
"grad_norm": 3.0987884998321533,
"learning_rate": 1.752738657380616e-06,
"loss": 0.1527,
"step": 922
},
{
"epoch": 0.25,
"grad_norm": 2.9413533210754395,
"learning_rate": 1.7521557791593582e-06,
"loss": 0.1344,
"step": 923
},
{
"epoch": 0.25,
"grad_norm": 3.198122501373291,
"learning_rate": 1.751572311889935e-06,
"loss": 0.1427,
"step": 924
},
{
"epoch": 0.25,
"grad_norm": 2.9854321479797363,
"learning_rate": 1.750988256029287e-06,
"loss": 0.143,
"step": 925
},
{
"epoch": 0.25,
"grad_norm": 3.3399744033813477,
"learning_rate": 1.7504036120348154e-06,
"loss": 0.1478,
"step": 926
},
{
"epoch": 0.25,
"grad_norm": 3.10494327545166,
"learning_rate": 1.7498183803643819e-06,
"loss": 0.1167,
"step": 927
},
{
"epoch": 0.25,
"grad_norm": 2.8649749755859375,
"learning_rate": 1.7492325614763086e-06,
"loss": 0.1218,
"step": 928
},
{
"epoch": 0.25,
"grad_norm": 3.151996374130249,
"learning_rate": 1.7486461558293777e-06,
"loss": 0.1409,
"step": 929
},
{
"epoch": 0.25,
"grad_norm": 2.9325687885284424,
"learning_rate": 1.7480591638828307e-06,
"loss": 0.1317,
"step": 930
},
{
"epoch": 0.25,
"grad_norm": 2.6797404289245605,
"learning_rate": 1.7474715860963683e-06,
"loss": 0.1371,
"step": 931
},
{
"epoch": 0.25,
"grad_norm": 3.1968817710876465,
"learning_rate": 1.74688342293015e-06,
"loss": 0.1521,
"step": 932
},
{
"epoch": 0.25,
"grad_norm": 2.7755022048950195,
"learning_rate": 1.7462946748447935e-06,
"loss": 0.1307,
"step": 933
},
{
"epoch": 0.26,
"grad_norm": 2.925846815109253,
"learning_rate": 1.7457053423013751e-06,
"loss": 0.1253,
"step": 934
},
{
"epoch": 0.26,
"grad_norm": 2.949812173843384,
"learning_rate": 1.7451154257614284e-06,
"loss": 0.1332,
"step": 935
},
{
"epoch": 0.26,
"grad_norm": 3.158405065536499,
"learning_rate": 1.7445249256869444e-06,
"loss": 0.1421,
"step": 936
},
{
"epoch": 0.26,
"grad_norm": 2.9394330978393555,
"learning_rate": 1.7439338425403713e-06,
"loss": 0.1313,
"step": 937
},
{
"epoch": 0.26,
"grad_norm": 2.8409595489501953,
"learning_rate": 1.7433421767846136e-06,
"loss": 0.1312,
"step": 938
},
{
"epoch": 0.26,
"grad_norm": 2.929218292236328,
"learning_rate": 1.7427499288830326e-06,
"loss": 0.138,
"step": 939
},
{
"epoch": 0.26,
"grad_norm": 3.0145485401153564,
"learning_rate": 1.7421570992994447e-06,
"loss": 0.1491,
"step": 940
},
{
"epoch": 0.26,
"grad_norm": 2.813136339187622,
"learning_rate": 1.741563688498123e-06,
"loss": 0.1303,
"step": 941
},
{
"epoch": 0.26,
"grad_norm": 3.102907419204712,
"learning_rate": 1.7409696969437943e-06,
"loss": 0.134,
"step": 942
},
{
"epoch": 0.26,
"grad_norm": 2.875605344772339,
"learning_rate": 1.7403751251016416e-06,
"loss": 0.1387,
"step": 943
},
{
"epoch": 0.26,
"grad_norm": 2.903993844985962,
"learning_rate": 1.7397799734373012e-06,
"loss": 0.1309,
"step": 944
},
{
"epoch": 0.26,
"grad_norm": 3.1668875217437744,
"learning_rate": 1.7391842424168647e-06,
"loss": 0.1359,
"step": 945
},
{
"epoch": 0.26,
"grad_norm": 3.0324251651763916,
"learning_rate": 1.7385879325068764e-06,
"loss": 0.149,
"step": 946
},
{
"epoch": 0.26,
"grad_norm": 3.013434410095215,
"learning_rate": 1.7379910441743345e-06,
"loss": 0.1489,
"step": 947
},
{
"epoch": 0.26,
"grad_norm": 3.1340384483337402,
"learning_rate": 1.7373935778866895e-06,
"loss": 0.1504,
"step": 948
},
{
"epoch": 0.26,
"grad_norm": 3.3014206886291504,
"learning_rate": 1.7367955341118456e-06,
"loss": 0.1362,
"step": 949
},
{
"epoch": 0.26,
"grad_norm": 2.800163507461548,
"learning_rate": 1.7361969133181584e-06,
"loss": 0.1218,
"step": 950
},
{
"epoch": 0.26,
"grad_norm": 3.1261839866638184,
"learning_rate": 1.7355977159744358e-06,
"loss": 0.145,
"step": 951
},
{
"epoch": 0.26,
"grad_norm": 2.8605103492736816,
"learning_rate": 1.734997942549937e-06,
"loss": 0.1259,
"step": 952
},
{
"epoch": 0.26,
"grad_norm": 3.1533775329589844,
"learning_rate": 1.7343975935143727e-06,
"loss": 0.1496,
"step": 953
},
{
"epoch": 0.26,
"grad_norm": 3.145339012145996,
"learning_rate": 1.733796669337904e-06,
"loss": 0.1392,
"step": 954
},
{
"epoch": 0.26,
"grad_norm": 2.741110324859619,
"learning_rate": 1.7331951704911424e-06,
"loss": 0.1363,
"step": 955
},
{
"epoch": 0.26,
"grad_norm": 2.8262789249420166,
"learning_rate": 1.7325930974451497e-06,
"loss": 0.1374,
"step": 956
},
{
"epoch": 0.26,
"grad_norm": 3.010044813156128,
"learning_rate": 1.7319904506714375e-06,
"loss": 0.1433,
"step": 957
},
{
"epoch": 0.26,
"grad_norm": 3.2525150775909424,
"learning_rate": 1.7313872306419662e-06,
"loss": 0.163,
"step": 958
},
{
"epoch": 0.26,
"grad_norm": 2.9591891765594482,
"learning_rate": 1.730783437829146e-06,
"loss": 0.1165,
"step": 959
},
{
"epoch": 0.26,
"grad_norm": 3.2669708728790283,
"learning_rate": 1.7301790727058343e-06,
"loss": 0.1521,
"step": 960
},
{
"epoch": 0.26,
"grad_norm": 2.821305751800537,
"learning_rate": 1.729574135745338e-06,
"loss": 0.1233,
"step": 961
},
{
"epoch": 0.26,
"grad_norm": 3.1753952503204346,
"learning_rate": 1.7289686274214115e-06,
"loss": 0.1391,
"step": 962
},
{
"epoch": 0.26,
"grad_norm": 2.831979274749756,
"learning_rate": 1.7283625482082563e-06,
"loss": 0.1227,
"step": 963
},
{
"epoch": 0.26,
"grad_norm": 3.01729679107666,
"learning_rate": 1.7277558985805211e-06,
"loss": 0.1396,
"step": 964
},
{
"epoch": 0.26,
"grad_norm": 2.9155466556549072,
"learning_rate": 1.727148679013302e-06,
"loss": 0.1327,
"step": 965
},
{
"epoch": 0.26,
"grad_norm": 3.0422449111938477,
"learning_rate": 1.7265408899821403e-06,
"loss": 0.1333,
"step": 966
},
{
"epoch": 0.26,
"grad_norm": 2.878432035446167,
"learning_rate": 1.725932531963024e-06,
"loss": 0.1286,
"step": 967
},
{
"epoch": 0.26,
"grad_norm": 3.600102186203003,
"learning_rate": 1.7253236054323868e-06,
"loss": 0.1424,
"step": 968
},
{
"epoch": 0.26,
"grad_norm": 2.899467706680298,
"learning_rate": 1.724714110867107e-06,
"loss": 0.1304,
"step": 969
},
{
"epoch": 0.26,
"grad_norm": 2.8585548400878906,
"learning_rate": 1.724104048744508e-06,
"loss": 0.1274,
"step": 970
},
{
"epoch": 0.27,
"grad_norm": 3.1076653003692627,
"learning_rate": 1.7234934195423584e-06,
"loss": 0.1335,
"step": 971
},
{
"epoch": 0.27,
"grad_norm": 3.2046873569488525,
"learning_rate": 1.7228822237388703e-06,
"loss": 0.1397,
"step": 972
},
{
"epoch": 0.27,
"grad_norm": 2.9000132083892822,
"learning_rate": 1.722270461812699e-06,
"loss": 0.1213,
"step": 973
},
{
"epoch": 0.27,
"grad_norm": 3.2703990936279297,
"learning_rate": 1.721658134242944e-06,
"loss": 0.1243,
"step": 974
},
{
"epoch": 0.27,
"grad_norm": 2.9981961250305176,
"learning_rate": 1.7210452415091475e-06,
"loss": 0.1451,
"step": 975
},
{
"epoch": 0.27,
"grad_norm": 2.869926929473877,
"learning_rate": 1.7204317840912944e-06,
"loss": 0.121,
"step": 976
},
{
"epoch": 0.27,
"grad_norm": 2.680950880050659,
"learning_rate": 1.7198177624698116e-06,
"loss": 0.1215,
"step": 977
},
{
"epoch": 0.27,
"grad_norm": 2.8917808532714844,
"learning_rate": 1.7192031771255682e-06,
"loss": 0.1189,
"step": 978
},
{
"epoch": 0.27,
"grad_norm": 2.8637447357177734,
"learning_rate": 1.718588028539874e-06,
"loss": 0.1409,
"step": 979
},
{
"epoch": 0.27,
"grad_norm": 3.082494020462036,
"learning_rate": 1.717972317194481e-06,
"loss": 0.1482,
"step": 980
},
{
"epoch": 0.27,
"grad_norm": 3.49234938621521,
"learning_rate": 1.7173560435715814e-06,
"loss": 0.1397,
"step": 981
},
{
"epoch": 0.27,
"grad_norm": 3.003164291381836,
"learning_rate": 1.7167392081538074e-06,
"loss": 0.1362,
"step": 982
},
{
"epoch": 0.27,
"grad_norm": 2.805999279022217,
"learning_rate": 1.7161218114242316e-06,
"loss": 0.1315,
"step": 983
},
{
"epoch": 0.27,
"grad_norm": 2.8828787803649902,
"learning_rate": 1.7155038538663663e-06,
"loss": 0.1282,
"step": 984
},
{
"epoch": 0.27,
"grad_norm": 2.8884527683258057,
"learning_rate": 1.7148853359641625e-06,
"loss": 0.1297,
"step": 985
},
{
"epoch": 0.27,
"grad_norm": 3.0269837379455566,
"learning_rate": 1.7142662582020104e-06,
"loss": 0.1316,
"step": 986
},
{
"epoch": 0.27,
"grad_norm": 3.180825710296631,
"learning_rate": 1.7136466210647387e-06,
"loss": 0.1409,
"step": 987
},
{
"epoch": 0.27,
"grad_norm": 3.00687313079834,
"learning_rate": 1.7130264250376142e-06,
"loss": 0.1441,
"step": 988
},
{
"epoch": 0.27,
"grad_norm": 2.7717976570129395,
"learning_rate": 1.7124056706063408e-06,
"loss": 0.1282,
"step": 989
},
{
"epoch": 0.27,
"grad_norm": 2.762643337249756,
"learning_rate": 1.7117843582570606e-06,
"loss": 0.1209,
"step": 990
},
{
"epoch": 0.27,
"grad_norm": 2.950422763824463,
"learning_rate": 1.7111624884763517e-06,
"loss": 0.1222,
"step": 991
},
{
"epoch": 0.27,
"grad_norm": 3.0254971981048584,
"learning_rate": 1.7105400617512298e-06,
"loss": 0.1289,
"step": 992
},
{
"epoch": 0.27,
"grad_norm": 2.8435542583465576,
"learning_rate": 1.7099170785691456e-06,
"loss": 0.127,
"step": 993
},
{
"epoch": 0.27,
"grad_norm": 2.956089973449707,
"learning_rate": 1.709293539417987e-06,
"loss": 0.1308,
"step": 994
},
{
"epoch": 0.27,
"grad_norm": 2.9792909622192383,
"learning_rate": 1.708669444786076e-06,
"loss": 0.1277,
"step": 995
},
{
"epoch": 0.27,
"grad_norm": 3.3625175952911377,
"learning_rate": 1.70804479516217e-06,
"loss": 0.1641,
"step": 996
},
{
"epoch": 0.27,
"grad_norm": 2.9496147632598877,
"learning_rate": 1.7074195910354616e-06,
"loss": 0.1231,
"step": 997
},
{
"epoch": 0.27,
"grad_norm": 3.3361380100250244,
"learning_rate": 1.7067938328955766e-06,
"loss": 0.1371,
"step": 998
},
{
"epoch": 0.27,
"grad_norm": 3.1837551593780518,
"learning_rate": 1.7061675212325759e-06,
"loss": 0.1359,
"step": 999
},
{
"epoch": 0.27,
"grad_norm": 2.8014943599700928,
"learning_rate": 1.705540656536953e-06,
"loss": 0.1261,
"step": 1000
},
{
"epoch": 0.27,
"grad_norm": 3.034485101699829,
"learning_rate": 1.704913239299635e-06,
"loss": 0.1322,
"step": 1001
},
{
"epoch": 0.27,
"grad_norm": 2.8884332180023193,
"learning_rate": 1.7042852700119811e-06,
"loss": 0.1368,
"step": 1002
},
{
"epoch": 0.27,
"grad_norm": 3.1377642154693604,
"learning_rate": 1.7036567491657836e-06,
"loss": 0.143,
"step": 1003
},
{
"epoch": 0.27,
"grad_norm": 3.1927852630615234,
"learning_rate": 1.7030276772532664e-06,
"loss": 0.1582,
"step": 1004
},
{
"epoch": 0.27,
"grad_norm": 2.8954274654388428,
"learning_rate": 1.7023980547670846e-06,
"loss": 0.1382,
"step": 1005
},
{
"epoch": 0.27,
"grad_norm": 3.169952630996704,
"learning_rate": 1.7017678822003253e-06,
"loss": 0.1336,
"step": 1006
},
{
"epoch": 0.28,
"grad_norm": 2.876800537109375,
"learning_rate": 1.701137160046506e-06,
"loss": 0.1259,
"step": 1007
},
{
"epoch": 0.28,
"grad_norm": 2.769343852996826,
"learning_rate": 1.700505888799574e-06,
"loss": 0.1253,
"step": 1008
},
{
"epoch": 0.28,
"grad_norm": 3.1073548793792725,
"learning_rate": 1.6998740689539075e-06,
"loss": 0.1275,
"step": 1009
},
{
"epoch": 0.28,
"grad_norm": 3.218838930130005,
"learning_rate": 1.699241701004314e-06,
"loss": 0.1474,
"step": 1010
},
{
"epoch": 0.28,
"grad_norm": 2.921640157699585,
"learning_rate": 1.6986087854460305e-06,
"loss": 0.1291,
"step": 1011
},
{
"epoch": 0.28,
"grad_norm": 2.973304271697998,
"learning_rate": 1.697975322774722e-06,
"loss": 0.1244,
"step": 1012
},
{
"epoch": 0.28,
"grad_norm": 3.119814157485962,
"learning_rate": 1.6973413134864827e-06,
"loss": 0.1264,
"step": 1013
},
{
"epoch": 0.28,
"grad_norm": 3.0828561782836914,
"learning_rate": 1.6967067580778353e-06,
"loss": 0.1439,
"step": 1014
},
{
"epoch": 0.28,
"grad_norm": 3.010824680328369,
"learning_rate": 1.6960716570457291e-06,
"loss": 0.1339,
"step": 1015
},
{
"epoch": 0.28,
"grad_norm": 2.9271926879882812,
"learning_rate": 1.6954360108875415e-06,
"loss": 0.1437,
"step": 1016
},
{
"epoch": 0.28,
"grad_norm": 3.0377440452575684,
"learning_rate": 1.6947998201010767e-06,
"loss": 0.1377,
"step": 1017
},
{
"epoch": 0.28,
"grad_norm": 3.0867815017700195,
"learning_rate": 1.694163085184565e-06,
"loss": 0.1362,
"step": 1018
},
{
"epoch": 0.28,
"grad_norm": 2.6888203620910645,
"learning_rate": 1.6935258066366632e-06,
"loss": 0.1228,
"step": 1019
},
{
"epoch": 0.28,
"grad_norm": 2.6803104877471924,
"learning_rate": 1.6928879849564539e-06,
"loss": 0.1151,
"step": 1020
},
{
"epoch": 0.28,
"grad_norm": 2.61885142326355,
"learning_rate": 1.6922496206434444e-06,
"loss": 0.1319,
"step": 1021
},
{
"epoch": 0.28,
"grad_norm": 3.1043663024902344,
"learning_rate": 1.6916107141975685e-06,
"loss": 0.17,
"step": 1022
},
{
"epoch": 0.28,
"grad_norm": 2.94313383102417,
"learning_rate": 1.6909712661191823e-06,
"loss": 0.1372,
"step": 1023
},
{
"epoch": 0.28,
"grad_norm": 3.073957920074463,
"learning_rate": 1.690331276909068e-06,
"loss": 0.1356,
"step": 1024
},
{
"epoch": 0.28,
"grad_norm": 2.8185484409332275,
"learning_rate": 1.6896907470684315e-06,
"loss": 0.141,
"step": 1025
},
{
"epoch": 0.28,
"grad_norm": 3.179748773574829,
"learning_rate": 1.6890496770989001e-06,
"loss": 0.1498,
"step": 1026
},
{
"epoch": 0.28,
"grad_norm": 2.92128849029541,
"learning_rate": 1.6884080675025268e-06,
"loss": 0.1308,
"step": 1027
},
{
"epoch": 0.28,
"grad_norm": 2.9293651580810547,
"learning_rate": 1.687765918781785e-06,
"loss": 0.1294,
"step": 1028
},
{
"epoch": 0.28,
"grad_norm": 3.2544984817504883,
"learning_rate": 1.6871232314395718e-06,
"loss": 0.143,
"step": 1029
},
{
"epoch": 0.28,
"grad_norm": 3.0878231525421143,
"learning_rate": 1.6864800059792055e-06,
"loss": 0.1269,
"step": 1030
},
{
"epoch": 0.28,
"grad_norm": 3.029195547103882,
"learning_rate": 1.6858362429044256e-06,
"loss": 0.1413,
"step": 1031
},
{
"epoch": 0.28,
"grad_norm": 2.8506369590759277,
"learning_rate": 1.6851919427193925e-06,
"loss": 0.1364,
"step": 1032
},
{
"epoch": 0.28,
"grad_norm": 2.8560402393341064,
"learning_rate": 1.6845471059286886e-06,
"loss": 0.1205,
"step": 1033
},
{
"epoch": 0.28,
"grad_norm": 2.9102232456207275,
"learning_rate": 1.6839017330373151e-06,
"loss": 0.1332,
"step": 1034
},
{
"epoch": 0.28,
"grad_norm": 2.859626531600952,
"learning_rate": 1.6832558245506933e-06,
"loss": 0.1265,
"step": 1035
},
{
"epoch": 0.28,
"grad_norm": 3.1594340801239014,
"learning_rate": 1.6826093809746649e-06,
"loss": 0.1344,
"step": 1036
},
{
"epoch": 0.28,
"grad_norm": 2.971975803375244,
"learning_rate": 1.681962402815489e-06,
"loss": 0.1427,
"step": 1037
},
{
"epoch": 0.28,
"grad_norm": 3.0042905807495117,
"learning_rate": 1.6813148905798446e-06,
"loss": 0.1411,
"step": 1038
},
{
"epoch": 0.28,
"grad_norm": 3.0483460426330566,
"learning_rate": 1.6806668447748292e-06,
"loss": 0.1345,
"step": 1039
},
{
"epoch": 0.28,
"grad_norm": 3.240797758102417,
"learning_rate": 1.6800182659079567e-06,
"loss": 0.151,
"step": 1040
},
{
"epoch": 0.28,
"grad_norm": 3.112478256225586,
"learning_rate": 1.6793691544871603e-06,
"loss": 0.1556,
"step": 1041
},
{
"epoch": 0.28,
"grad_norm": 2.8727810382843018,
"learning_rate": 1.6787195110207884e-06,
"loss": 0.1336,
"step": 1042
},
{
"epoch": 0.28,
"grad_norm": 2.958864212036133,
"learning_rate": 1.6780693360176075e-06,
"loss": 0.1366,
"step": 1043
},
{
"epoch": 0.29,
"grad_norm": 2.757554292678833,
"learning_rate": 1.6774186299868e-06,
"loss": 0.1361,
"step": 1044
},
{
"epoch": 0.29,
"grad_norm": 5.423801422119141,
"learning_rate": 1.6767673934379639e-06,
"loss": 0.1544,
"step": 1045
},
{
"epoch": 0.29,
"grad_norm": 2.8387649059295654,
"learning_rate": 1.6761156268811128e-06,
"loss": 0.1287,
"step": 1046
},
{
"epoch": 0.29,
"grad_norm": 3.4863409996032715,
"learning_rate": 1.6754633308266752e-06,
"loss": 0.1576,
"step": 1047
},
{
"epoch": 0.29,
"grad_norm": 2.8142569065093994,
"learning_rate": 1.674810505785495e-06,
"loss": 0.125,
"step": 1048
},
{
"epoch": 0.29,
"grad_norm": 2.872755527496338,
"learning_rate": 1.6741571522688294e-06,
"loss": 0.1368,
"step": 1049
},
{
"epoch": 0.29,
"grad_norm": 2.6436572074890137,
"learning_rate": 1.67350327078835e-06,
"loss": 0.1154,
"step": 1050
},
{
"epoch": 0.29,
"grad_norm": 2.924184560775757,
"learning_rate": 1.6728488618561417e-06,
"loss": 0.1307,
"step": 1051
},
{
"epoch": 0.29,
"grad_norm": 2.9721291065216064,
"learning_rate": 1.672193925984703e-06,
"loss": 0.1295,
"step": 1052
},
{
"epoch": 0.29,
"grad_norm": 3.0370213985443115,
"learning_rate": 1.6715384636869442e-06,
"loss": 0.1244,
"step": 1053
},
{
"epoch": 0.29,
"grad_norm": 3.0612335205078125,
"learning_rate": 1.6708824754761886e-06,
"loss": 0.1366,
"step": 1054
},
{
"epoch": 0.29,
"grad_norm": 2.968006134033203,
"learning_rate": 1.6702259618661708e-06,
"loss": 0.1287,
"step": 1055
},
{
"epoch": 0.29,
"grad_norm": 2.730593681335449,
"learning_rate": 1.669568923371037e-06,
"loss": 0.1293,
"step": 1056
},
{
"epoch": 0.29,
"grad_norm": 3.0833163261413574,
"learning_rate": 1.668911360505345e-06,
"loss": 0.1294,
"step": 1057
},
{
"epoch": 0.29,
"grad_norm": 2.8949716091156006,
"learning_rate": 1.6682532737840628e-06,
"loss": 0.1335,
"step": 1058
},
{
"epoch": 0.29,
"grad_norm": 3.068634033203125,
"learning_rate": 1.6675946637225688e-06,
"loss": 0.1331,
"step": 1059
},
{
"epoch": 0.29,
"grad_norm": 2.908865213394165,
"learning_rate": 1.6669355308366507e-06,
"loss": 0.1341,
"step": 1060
},
{
"epoch": 0.29,
"grad_norm": 2.7697980403900146,
"learning_rate": 1.6662758756425063e-06,
"loss": 0.1248,
"step": 1061
},
{
"epoch": 0.29,
"grad_norm": 2.9143946170806885,
"learning_rate": 1.6656156986567427e-06,
"loss": 0.1277,
"step": 1062
},
{
"epoch": 0.29,
"grad_norm": 3.1084022521972656,
"learning_rate": 1.6649550003963745e-06,
"loss": 0.1373,
"step": 1063
},
{
"epoch": 0.29,
"grad_norm": 3.0052878856658936,
"learning_rate": 1.6642937813788258e-06,
"loss": 0.1433,
"step": 1064
},
{
"epoch": 0.29,
"grad_norm": 3.1521522998809814,
"learning_rate": 1.6636320421219277e-06,
"loss": 0.1681,
"step": 1065
},
{
"epoch": 0.29,
"grad_norm": 2.8705294132232666,
"learning_rate": 1.662969783143919e-06,
"loss": 0.139,
"step": 1066
},
{
"epoch": 0.29,
"grad_norm": 2.733748197555542,
"learning_rate": 1.6623070049634453e-06,
"loss": 0.1144,
"step": 1067
},
{
"epoch": 0.29,
"grad_norm": 2.8265879154205322,
"learning_rate": 1.6616437080995595e-06,
"loss": 0.1339,
"step": 1068
},
{
"epoch": 0.29,
"grad_norm": 3.024449110031128,
"learning_rate": 1.6609798930717198e-06,
"loss": 0.1484,
"step": 1069
},
{
"epoch": 0.29,
"grad_norm": 2.904423475265503,
"learning_rate": 1.6603155603997908e-06,
"loss": 0.1308,
"step": 1070
},
{
"epoch": 0.29,
"grad_norm": 3.1895720958709717,
"learning_rate": 1.6596507106040422e-06,
"loss": 0.1501,
"step": 1071
},
{
"epoch": 0.29,
"grad_norm": 2.9068169593811035,
"learning_rate": 1.658985344205149e-06,
"loss": 0.1423,
"step": 1072
},
{
"epoch": 0.29,
"grad_norm": 2.645341157913208,
"learning_rate": 1.6583194617241906e-06,
"loss": 0.1242,
"step": 1073
},
{
"epoch": 0.29,
"grad_norm": 3.0872840881347656,
"learning_rate": 1.6576530636826498e-06,
"loss": 0.1323,
"step": 1074
},
{
"epoch": 0.29,
"grad_norm": 3.080601453781128,
"learning_rate": 1.6569861506024148e-06,
"loss": 0.1289,
"step": 1075
},
{
"epoch": 0.29,
"grad_norm": 2.7821171283721924,
"learning_rate": 1.6563187230057759e-06,
"loss": 0.1206,
"step": 1076
},
{
"epoch": 0.29,
"grad_norm": 3.501741886138916,
"learning_rate": 1.6556507814154264e-06,
"loss": 0.1353,
"step": 1077
},
{
"epoch": 0.29,
"grad_norm": 2.96410870552063,
"learning_rate": 1.6549823263544628e-06,
"loss": 0.1301,
"step": 1078
},
{
"epoch": 0.29,
"grad_norm": 3.634714126586914,
"learning_rate": 1.6543133583463833e-06,
"loss": 0.1515,
"step": 1079
},
{
"epoch": 0.3,
"grad_norm": 3.070134401321411,
"learning_rate": 1.6536438779150878e-06,
"loss": 0.1389,
"step": 1080
},
{
"epoch": 0.3,
"grad_norm": 3.6893770694732666,
"learning_rate": 1.6529738855848776e-06,
"loss": 0.1598,
"step": 1081
},
{
"epoch": 0.3,
"grad_norm": 3.5830516815185547,
"learning_rate": 1.6523033818804549e-06,
"loss": 0.1607,
"step": 1082
},
{
"epoch": 0.3,
"grad_norm": 2.9699463844299316,
"learning_rate": 1.6516323673269219e-06,
"loss": 0.1406,
"step": 1083
},
{
"epoch": 0.3,
"grad_norm": 2.899401903152466,
"learning_rate": 1.650960842449782e-06,
"loss": 0.1256,
"step": 1084
},
{
"epoch": 0.3,
"grad_norm": 3.0776729583740234,
"learning_rate": 1.650288807774937e-06,
"loss": 0.1515,
"step": 1085
},
{
"epoch": 0.3,
"grad_norm": 3.043003797531128,
"learning_rate": 1.6496162638286886e-06,
"loss": 0.1195,
"step": 1086
},
{
"epoch": 0.3,
"grad_norm": 3.337824583053589,
"learning_rate": 1.6489432111377372e-06,
"loss": 0.1433,
"step": 1087
},
{
"epoch": 0.3,
"grad_norm": 3.0308029651641846,
"learning_rate": 1.6482696502291819e-06,
"loss": 0.1308,
"step": 1088
},
{
"epoch": 0.3,
"grad_norm": 2.727417230606079,
"learning_rate": 1.6475955816305195e-06,
"loss": 0.1352,
"step": 1089
},
{
"epoch": 0.3,
"grad_norm": 2.7782535552978516,
"learning_rate": 1.6469210058696446e-06,
"loss": 0.1307,
"step": 1090
},
{
"epoch": 0.3,
"grad_norm": 2.9431943893432617,
"learning_rate": 1.6462459234748484e-06,
"loss": 0.133,
"step": 1091
},
{
"epoch": 0.3,
"grad_norm": 2.778409957885742,
"learning_rate": 1.6455703349748197e-06,
"loss": 0.1405,
"step": 1092
},
{
"epoch": 0.3,
"grad_norm": 3.0530734062194824,
"learning_rate": 1.644894240898643e-06,
"loss": 0.14,
"step": 1093
},
{
"epoch": 0.3,
"grad_norm": 3.2378525733947754,
"learning_rate": 1.6442176417757992e-06,
"loss": 0.1477,
"step": 1094
},
{
"epoch": 0.3,
"grad_norm": 2.7790303230285645,
"learning_rate": 1.6435405381361643e-06,
"loss": 0.1168,
"step": 1095
},
{
"epoch": 0.3,
"grad_norm": 3.0893919467926025,
"learning_rate": 1.6428629305100102e-06,
"loss": 0.1435,
"step": 1096
},
{
"epoch": 0.3,
"grad_norm": 2.8517370223999023,
"learning_rate": 1.6421848194280024e-06,
"loss": 0.1342,
"step": 1097
},
{
"epoch": 0.3,
"grad_norm": 2.777329921722412,
"learning_rate": 1.6415062054212011e-06,
"loss": 0.1223,
"step": 1098
},
{
"epoch": 0.3,
"grad_norm": 2.9436533451080322,
"learning_rate": 1.6408270890210612e-06,
"loss": 0.1206,
"step": 1099
},
{
"epoch": 0.3,
"grad_norm": 2.989617347717285,
"learning_rate": 1.6401474707594296e-06,
"loss": 0.1218,
"step": 1100
},
{
"epoch": 0.3,
"grad_norm": 3.3940844535827637,
"learning_rate": 1.6394673511685472e-06,
"loss": 0.134,
"step": 1101
},
{
"epoch": 0.3,
"grad_norm": 3.0965664386749268,
"learning_rate": 1.6387867307810476e-06,
"loss": 0.1305,
"step": 1102
},
{
"epoch": 0.3,
"grad_norm": 2.931014060974121,
"learning_rate": 1.638105610129956e-06,
"loss": 0.1352,
"step": 1103
},
{
"epoch": 0.3,
"grad_norm": 3.0383260250091553,
"learning_rate": 1.6374239897486897e-06,
"loss": 0.132,
"step": 1104
},
{
"epoch": 0.3,
"grad_norm": 3.2072699069976807,
"learning_rate": 1.6367418701710572e-06,
"loss": 0.1673,
"step": 1105
},
{
"epoch": 0.3,
"grad_norm": 2.983436346054077,
"learning_rate": 1.6360592519312579e-06,
"loss": 0.1254,
"step": 1106
},
{
"epoch": 0.3,
"grad_norm": 2.875274896621704,
"learning_rate": 1.6353761355638827e-06,
"loss": 0.1351,
"step": 1107
},
{
"epoch": 0.3,
"grad_norm": 3.192133665084839,
"learning_rate": 1.6346925216039106e-06,
"loss": 0.1503,
"step": 1108
},
{
"epoch": 0.3,
"grad_norm": 2.901218891143799,
"learning_rate": 1.6340084105867121e-06,
"loss": 0.1483,
"step": 1109
},
{
"epoch": 0.3,
"grad_norm": 2.900606632232666,
"learning_rate": 1.633323803048047e-06,
"loss": 0.1298,
"step": 1110
},
{
"epoch": 0.3,
"grad_norm": 2.7904560565948486,
"learning_rate": 1.6326386995240622e-06,
"loss": 0.135,
"step": 1111
},
{
"epoch": 0.3,
"grad_norm": 2.844744920730591,
"learning_rate": 1.6319531005512945e-06,
"loss": 0.1274,
"step": 1112
},
{
"epoch": 0.3,
"grad_norm": 2.9561710357666016,
"learning_rate": 1.6312670066666686e-06,
"loss": 0.1205,
"step": 1113
},
{
"epoch": 0.3,
"grad_norm": 3.266465663909912,
"learning_rate": 1.6305804184074963e-06,
"loss": 0.1351,
"step": 1114
},
{
"epoch": 0.3,
"grad_norm": 3.0080487728118896,
"learning_rate": 1.6298933363114767e-06,
"loss": 0.1396,
"step": 1115
},
{
"epoch": 0.3,
"grad_norm": 2.7729556560516357,
"learning_rate": 1.629205760916696e-06,
"loss": 0.1238,
"step": 1116
},
{
"epoch": 0.31,
"grad_norm": 3.0315845012664795,
"learning_rate": 1.6285176927616262e-06,
"loss": 0.1336,
"step": 1117
},
{
"epoch": 0.31,
"grad_norm": 3.1767919063568115,
"learning_rate": 1.6278291323851257e-06,
"loss": 0.147,
"step": 1118
},
{
"epoch": 0.31,
"grad_norm": 3.098306179046631,
"learning_rate": 1.6271400803264378e-06,
"loss": 0.1425,
"step": 1119
},
{
"epoch": 0.31,
"grad_norm": 3.0536861419677734,
"learning_rate": 1.6264505371251915e-06,
"loss": 0.1281,
"step": 1120
},
{
"epoch": 0.31,
"grad_norm": 3.0145273208618164,
"learning_rate": 1.6257605033214005e-06,
"loss": 0.1387,
"step": 1121
},
{
"epoch": 0.31,
"grad_norm": 3.1438162326812744,
"learning_rate": 1.6250699794554614e-06,
"loss": 0.1323,
"step": 1122
},
{
"epoch": 0.31,
"grad_norm": 2.763699531555176,
"learning_rate": 1.6243789660681565e-06,
"loss": 0.1337,
"step": 1123
},
{
"epoch": 0.31,
"grad_norm": 3.288756847381592,
"learning_rate": 1.6236874637006497e-06,
"loss": 0.1484,
"step": 1124
},
{
"epoch": 0.31,
"grad_norm": 2.956301212310791,
"learning_rate": 1.6229954728944895e-06,
"loss": 0.1422,
"step": 1125
},
{
"epoch": 0.31,
"grad_norm": 3.0409741401672363,
"learning_rate": 1.6223029941916056e-06,
"loss": 0.1502,
"step": 1126
},
{
"epoch": 0.31,
"grad_norm": 2.7094151973724365,
"learning_rate": 1.62161002813431e-06,
"loss": 0.1272,
"step": 1127
},
{
"epoch": 0.31,
"grad_norm": 3.1903421878814697,
"learning_rate": 1.6209165752652974e-06,
"loss": 0.1405,
"step": 1128
},
{
"epoch": 0.31,
"grad_norm": 2.828828811645508,
"learning_rate": 1.620222636127642e-06,
"loss": 0.1287,
"step": 1129
},
{
"epoch": 0.31,
"grad_norm": 2.87174916267395,
"learning_rate": 1.6195282112648006e-06,
"loss": 0.1181,
"step": 1130
},
{
"epoch": 0.31,
"grad_norm": 2.855774402618408,
"learning_rate": 1.6188333012206096e-06,
"loss": 0.1347,
"step": 1131
},
{
"epoch": 0.31,
"grad_norm": 2.7991037368774414,
"learning_rate": 1.6181379065392848e-06,
"loss": 0.1213,
"step": 1132
},
{
"epoch": 0.31,
"grad_norm": 3.3876779079437256,
"learning_rate": 1.6174420277654224e-06,
"loss": 0.1382,
"step": 1133
},
{
"epoch": 0.31,
"grad_norm": 2.8736510276794434,
"learning_rate": 1.6167456654439978e-06,
"loss": 0.1243,
"step": 1134
},
{
"epoch": 0.31,
"grad_norm": 2.677625894546509,
"learning_rate": 1.6160488201203642e-06,
"loss": 0.1202,
"step": 1135
},
{
"epoch": 0.31,
"grad_norm": 2.976384162902832,
"learning_rate": 1.6153514923402536e-06,
"loss": 0.1351,
"step": 1136
},
{
"epoch": 0.31,
"grad_norm": 3.1343348026275635,
"learning_rate": 1.614653682649776e-06,
"loss": 0.1427,
"step": 1137
},
{
"epoch": 0.31,
"grad_norm": 2.829636573791504,
"learning_rate": 1.6139553915954186e-06,
"loss": 0.1188,
"step": 1138
},
{
"epoch": 0.31,
"grad_norm": 3.0276970863342285,
"learning_rate": 1.6132566197240456e-06,
"loss": 0.1205,
"step": 1139
},
{
"epoch": 0.31,
"grad_norm": 2.8864333629608154,
"learning_rate": 1.612557367582898e-06,
"loss": 0.1335,
"step": 1140
},
{
"epoch": 0.31,
"grad_norm": 2.779438018798828,
"learning_rate": 1.6118576357195921e-06,
"loss": 0.1298,
"step": 1141
},
{
"epoch": 0.31,
"grad_norm": 3.176299571990967,
"learning_rate": 1.6111574246821208e-06,
"loss": 0.1432,
"step": 1142
},
{
"epoch": 0.31,
"grad_norm": 2.911555528640747,
"learning_rate": 1.6104567350188515e-06,
"loss": 0.1326,
"step": 1143
},
{
"epoch": 0.31,
"grad_norm": 3.7277283668518066,
"learning_rate": 1.6097555672785276e-06,
"loss": 0.1629,
"step": 1144
},
{
"epoch": 0.31,
"grad_norm": 3.019008159637451,
"learning_rate": 1.6090539220102657e-06,
"loss": 0.1422,
"step": 1145
},
{
"epoch": 0.31,
"grad_norm": 2.8250324726104736,
"learning_rate": 1.6083517997635569e-06,
"loss": 0.128,
"step": 1146
},
{
"epoch": 0.31,
"grad_norm": 2.724057197570801,
"learning_rate": 1.6076492010882658e-06,
"loss": 0.134,
"step": 1147
},
{
"epoch": 0.31,
"grad_norm": 2.6791696548461914,
"learning_rate": 1.60694612653463e-06,
"loss": 0.1404,
"step": 1148
},
{
"epoch": 0.31,
"grad_norm": 3.141669511795044,
"learning_rate": 1.6062425766532602e-06,
"loss": 0.1369,
"step": 1149
},
{
"epoch": 0.31,
"grad_norm": 3.1686415672302246,
"learning_rate": 1.6055385519951387e-06,
"loss": 0.1465,
"step": 1150
},
{
"epoch": 0.31,
"grad_norm": 2.7634310722351074,
"learning_rate": 1.60483405311162e-06,
"loss": 0.128,
"step": 1151
},
{
"epoch": 0.31,
"grad_norm": 2.9050796031951904,
"learning_rate": 1.6041290805544301e-06,
"loss": 0.1227,
"step": 1152
},
{
"epoch": 0.31,
"grad_norm": 2.8661465644836426,
"learning_rate": 1.6034236348756651e-06,
"loss": 0.1305,
"step": 1153
},
{
"epoch": 0.32,
"grad_norm": 2.8756277561187744,
"learning_rate": 1.6027177166277937e-06,
"loss": 0.1381,
"step": 1154
},
{
"epoch": 0.32,
"grad_norm": 2.9020004272460938,
"learning_rate": 1.602011326363652e-06,
"loss": 0.1432,
"step": 1155
},
{
"epoch": 0.32,
"grad_norm": 2.8704707622528076,
"learning_rate": 1.6013044646364476e-06,
"loss": 0.1422,
"step": 1156
},
{
"epoch": 0.32,
"grad_norm": 3.0167694091796875,
"learning_rate": 1.6005971319997568e-06,
"loss": 0.1421,
"step": 1157
},
{
"epoch": 0.32,
"grad_norm": 2.8320274353027344,
"learning_rate": 1.5998893290075245e-06,
"loss": 0.1186,
"step": 1158
},
{
"epoch": 0.32,
"grad_norm": 2.787231683731079,
"learning_rate": 1.5991810562140643e-06,
"loss": 0.1309,
"step": 1159
},
{
"epoch": 0.32,
"grad_norm": 2.6868605613708496,
"learning_rate": 1.5984723141740574e-06,
"loss": 0.1243,
"step": 1160
},
{
"epoch": 0.32,
"grad_norm": 2.9979305267333984,
"learning_rate": 1.5977631034425528e-06,
"loss": 0.1373,
"step": 1161
},
{
"epoch": 0.32,
"grad_norm": 2.7995948791503906,
"learning_rate": 1.5970534245749663e-06,
"loss": 0.1372,
"step": 1162
},
{
"epoch": 0.32,
"grad_norm": 2.86142635345459,
"learning_rate": 1.5963432781270805e-06,
"loss": 0.1222,
"step": 1163
},
{
"epoch": 0.32,
"grad_norm": 2.7392685413360596,
"learning_rate": 1.5956326646550442e-06,
"loss": 0.1303,
"step": 1164
},
{
"epoch": 0.32,
"grad_norm": 2.9346110820770264,
"learning_rate": 1.5949215847153715e-06,
"loss": 0.136,
"step": 1165
},
{
"epoch": 0.32,
"grad_norm": 3.010697841644287,
"learning_rate": 1.5942100388649427e-06,
"loss": 0.1435,
"step": 1166
},
{
"epoch": 0.32,
"grad_norm": 2.903467893600464,
"learning_rate": 1.5934980276610019e-06,
"loss": 0.1426,
"step": 1167
},
{
"epoch": 0.32,
"grad_norm": 2.727959632873535,
"learning_rate": 1.5927855516611586e-06,
"loss": 0.1367,
"step": 1168
},
{
"epoch": 0.32,
"grad_norm": 2.812208414077759,
"learning_rate": 1.5920726114233856e-06,
"loss": 0.1162,
"step": 1169
},
{
"epoch": 0.32,
"grad_norm": 3.0076522827148438,
"learning_rate": 1.5913592075060197e-06,
"loss": 0.1387,
"step": 1170
},
{
"epoch": 0.32,
"grad_norm": 3.120340347290039,
"learning_rate": 1.5906453404677606e-06,
"loss": 0.1477,
"step": 1171
},
{
"epoch": 0.32,
"grad_norm": 3.080254554748535,
"learning_rate": 1.5899310108676708e-06,
"loss": 0.141,
"step": 1172
},
{
"epoch": 0.32,
"grad_norm": 3.1656322479248047,
"learning_rate": 1.589216219265175e-06,
"loss": 0.1391,
"step": 1173
},
{
"epoch": 0.32,
"grad_norm": 3.0118024349212646,
"learning_rate": 1.5885009662200596e-06,
"loss": 0.1309,
"step": 1174
},
{
"epoch": 0.32,
"grad_norm": 3.1373941898345947,
"learning_rate": 1.587785252292473e-06,
"loss": 0.1389,
"step": 1175
},
{
"epoch": 0.32,
"grad_norm": 2.8171396255493164,
"learning_rate": 1.5870690780429237e-06,
"loss": 0.1255,
"step": 1176
},
{
"epoch": 0.32,
"grad_norm": 2.952279806137085,
"learning_rate": 1.5863524440322809e-06,
"loss": 0.1387,
"step": 1177
},
{
"epoch": 0.32,
"grad_norm": 2.9205703735351562,
"learning_rate": 1.5856353508217747e-06,
"loss": 0.1454,
"step": 1178
},
{
"epoch": 0.32,
"grad_norm": 2.916257858276367,
"learning_rate": 1.5849177989729931e-06,
"loss": 0.1304,
"step": 1179
},
{
"epoch": 0.32,
"grad_norm": 2.962117910385132,
"learning_rate": 1.584199789047885e-06,
"loss": 0.1311,
"step": 1180
},
{
"epoch": 0.32,
"grad_norm": 2.8570611476898193,
"learning_rate": 1.5834813216087578e-06,
"loss": 0.1321,
"step": 1181
},
{
"epoch": 0.32,
"grad_norm": 2.999396800994873,
"learning_rate": 1.5827623972182753e-06,
"loss": 0.1277,
"step": 1182
},
{
"epoch": 0.32,
"grad_norm": 3.1423332691192627,
"learning_rate": 1.5820430164394621e-06,
"loss": 0.1399,
"step": 1183
},
{
"epoch": 0.32,
"grad_norm": 3.01912522315979,
"learning_rate": 1.581323179835698e-06,
"loss": 0.1346,
"step": 1184
},
{
"epoch": 0.32,
"grad_norm": 2.9051058292388916,
"learning_rate": 1.5806028879707207e-06,
"loss": 0.1247,
"step": 1185
},
{
"epoch": 0.32,
"grad_norm": 2.99544358253479,
"learning_rate": 1.5798821414086244e-06,
"loss": 0.1292,
"step": 1186
},
{
"epoch": 0.32,
"grad_norm": 3.1393465995788574,
"learning_rate": 1.5791609407138587e-06,
"loss": 0.1335,
"step": 1187
},
{
"epoch": 0.32,
"grad_norm": 2.643645763397217,
"learning_rate": 1.5784392864512297e-06,
"loss": 0.1178,
"step": 1188
},
{
"epoch": 0.32,
"grad_norm": 2.9057555198669434,
"learning_rate": 1.5777171791858986e-06,
"loss": 0.1292,
"step": 1189
},
{
"epoch": 0.33,
"grad_norm": 2.9893436431884766,
"learning_rate": 1.5769946194833813e-06,
"loss": 0.1371,
"step": 1190
},
{
"epoch": 0.33,
"grad_norm": 2.859166145324707,
"learning_rate": 1.5762716079095477e-06,
"loss": 0.1278,
"step": 1191
},
{
"epoch": 0.33,
"grad_norm": 2.822263479232788,
"learning_rate": 1.5755481450306216e-06,
"loss": 0.1246,
"step": 1192
},
{
"epoch": 0.33,
"grad_norm": 2.7825675010681152,
"learning_rate": 1.5748242314131806e-06,
"loss": 0.1225,
"step": 1193
},
{
"epoch": 0.33,
"grad_norm": 3.0317020416259766,
"learning_rate": 1.5740998676241548e-06,
"loss": 0.1336,
"step": 1194
},
{
"epoch": 0.33,
"grad_norm": 2.8351964950561523,
"learning_rate": 1.5733750542308277e-06,
"loss": 0.1202,
"step": 1195
},
{
"epoch": 0.33,
"grad_norm": 3.0110056400299072,
"learning_rate": 1.572649791800834e-06,
"loss": 0.135,
"step": 1196
},
{
"epoch": 0.33,
"grad_norm": 2.9327712059020996,
"learning_rate": 1.5719240809021606e-06,
"loss": 0.128,
"step": 1197
},
{
"epoch": 0.33,
"grad_norm": 3.1209583282470703,
"learning_rate": 1.5711979221031455e-06,
"loss": 0.132,
"step": 1198
},
{
"epoch": 0.33,
"grad_norm": 2.9687564373016357,
"learning_rate": 1.5704713159724771e-06,
"loss": 0.129,
"step": 1199
},
{
"epoch": 0.33,
"grad_norm": 2.8928444385528564,
"learning_rate": 1.5697442630791948e-06,
"loss": 0.1357,
"step": 1200
},
{
"epoch": 0.33,
"grad_norm": 2.8500170707702637,
"learning_rate": 1.5690167639926875e-06,
"loss": 0.1326,
"step": 1201
},
{
"epoch": 0.33,
"grad_norm": 3.0176877975463867,
"learning_rate": 1.5682888192826933e-06,
"loss": 0.1498,
"step": 1202
},
{
"epoch": 0.33,
"grad_norm": 3.251095771789551,
"learning_rate": 1.5675604295193e-06,
"loss": 0.1399,
"step": 1203
},
{
"epoch": 0.33,
"grad_norm": 2.977865219116211,
"learning_rate": 1.5668315952729427e-06,
"loss": 0.1282,
"step": 1204
},
{
"epoch": 0.33,
"grad_norm": 3.0575249195098877,
"learning_rate": 1.5661023171144062e-06,
"loss": 0.1307,
"step": 1205
},
{
"epoch": 0.33,
"grad_norm": 3.0762460231781006,
"learning_rate": 1.5653725956148215e-06,
"loss": 0.1465,
"step": 1206
},
{
"epoch": 0.33,
"grad_norm": 2.9267191886901855,
"learning_rate": 1.564642431345668e-06,
"loss": 0.1264,
"step": 1207
},
{
"epoch": 0.33,
"grad_norm": 3.029406785964966,
"learning_rate": 1.5639118248787714e-06,
"loss": 0.1431,
"step": 1208
},
{
"epoch": 0.33,
"grad_norm": 3.220940351486206,
"learning_rate": 1.5631807767863029e-06,
"loss": 0.1373,
"step": 1209
},
{
"epoch": 0.33,
"grad_norm": 3.025521993637085,
"learning_rate": 1.5624492876407807e-06,
"loss": 0.1385,
"step": 1210
},
{
"epoch": 0.33,
"grad_norm": 2.761337995529175,
"learning_rate": 1.5617173580150675e-06,
"loss": 0.1198,
"step": 1211
},
{
"epoch": 0.33,
"grad_norm": 2.8094804286956787,
"learning_rate": 1.5609849884823723e-06,
"loss": 0.1316,
"step": 1212
},
{
"epoch": 0.33,
"grad_norm": 3.0511271953582764,
"learning_rate": 1.560252179616247e-06,
"loss": 0.1406,
"step": 1213
},
{
"epoch": 0.33,
"grad_norm": 2.9625461101531982,
"learning_rate": 1.5595189319905887e-06,
"loss": 0.1428,
"step": 1214
},
{
"epoch": 0.33,
"grad_norm": 2.8088529109954834,
"learning_rate": 1.5587852461796373e-06,
"loss": 0.1335,
"step": 1215
},
{
"epoch": 0.33,
"grad_norm": 3.0297744274139404,
"learning_rate": 1.5580511227579764e-06,
"loss": 0.1427,
"step": 1216
},
{
"epoch": 0.33,
"grad_norm": 2.8155922889709473,
"learning_rate": 1.5573165623005328e-06,
"loss": 0.133,
"step": 1217
},
{
"epoch": 0.33,
"grad_norm": 2.878079891204834,
"learning_rate": 1.556581565382574e-06,
"loss": 0.1207,
"step": 1218
},
{
"epoch": 0.33,
"grad_norm": 2.9472103118896484,
"learning_rate": 1.5558461325797109e-06,
"loss": 0.1327,
"step": 1219
},
{
"epoch": 0.33,
"grad_norm": 3.120007038116455,
"learning_rate": 1.555110264467895e-06,
"loss": 0.1539,
"step": 1220
},
{
"epoch": 0.33,
"grad_norm": 2.8870222568511963,
"learning_rate": 1.5543739616234186e-06,
"loss": 0.1399,
"step": 1221
},
{
"epoch": 0.33,
"grad_norm": 2.865922212600708,
"learning_rate": 1.553637224622915e-06,
"loss": 0.1292,
"step": 1222
},
{
"epoch": 0.33,
"grad_norm": 2.926393747329712,
"learning_rate": 1.5529000540433573e-06,
"loss": 0.1386,
"step": 1223
},
{
"epoch": 0.33,
"grad_norm": 2.885589122772217,
"learning_rate": 1.5521624504620574e-06,
"loss": 0.1231,
"step": 1224
},
{
"epoch": 0.33,
"grad_norm": 2.996002197265625,
"learning_rate": 1.5514244144566676e-06,
"loss": 0.1482,
"step": 1225
},
{
"epoch": 0.33,
"grad_norm": 3.2223353385925293,
"learning_rate": 1.550685946605178e-06,
"loss": 0.1406,
"step": 1226
},
{
"epoch": 0.34,
"grad_norm": 3.146404504776001,
"learning_rate": 1.5499470474859172e-06,
"loss": 0.1316,
"step": 1227
},
{
"epoch": 0.34,
"grad_norm": 3.082017421722412,
"learning_rate": 1.5492077176775513e-06,
"loss": 0.1521,
"step": 1228
},
{
"epoch": 0.34,
"grad_norm": 3.1566617488861084,
"learning_rate": 1.548467957759084e-06,
"loss": 0.1516,
"step": 1229
},
{
"epoch": 0.34,
"grad_norm": 2.732489824295044,
"learning_rate": 1.5477277683098552e-06,
"loss": 0.1292,
"step": 1230
},
{
"epoch": 0.34,
"grad_norm": 2.7503480911254883,
"learning_rate": 1.5469871499095425e-06,
"loss": 0.1376,
"step": 1231
},
{
"epoch": 0.34,
"grad_norm": 2.9067938327789307,
"learning_rate": 1.5462461031381584e-06,
"loss": 0.1331,
"step": 1232
},
{
"epoch": 0.34,
"grad_norm": 2.9652981758117676,
"learning_rate": 1.5455046285760505e-06,
"loss": 0.129,
"step": 1233
},
{
"epoch": 0.34,
"grad_norm": 2.9126827716827393,
"learning_rate": 1.5447627268039028e-06,
"loss": 0.1296,
"step": 1234
},
{
"epoch": 0.34,
"grad_norm": 2.70180344581604,
"learning_rate": 1.5440203984027322e-06,
"loss": 0.1253,
"step": 1235
},
{
"epoch": 0.34,
"grad_norm": 2.670848846435547,
"learning_rate": 1.5432776439538912e-06,
"loss": 0.1295,
"step": 1236
},
{
"epoch": 0.34,
"grad_norm": 3.3088035583496094,
"learning_rate": 1.5425344640390653e-06,
"loss": 0.1369,
"step": 1237
},
{
"epoch": 0.34,
"grad_norm": 2.6741421222686768,
"learning_rate": 1.5417908592402734e-06,
"loss": 0.12,
"step": 1238
},
{
"epoch": 0.34,
"grad_norm": 3.0382497310638428,
"learning_rate": 1.5410468301398663e-06,
"loss": 0.1408,
"step": 1239
},
{
"epoch": 0.34,
"grad_norm": 2.7590854167938232,
"learning_rate": 1.5403023773205284e-06,
"loss": 0.1349,
"step": 1240
},
{
"epoch": 0.34,
"grad_norm": 2.749650001525879,
"learning_rate": 1.5395575013652753e-06,
"loss": 0.1329,
"step": 1241
},
{
"epoch": 0.34,
"grad_norm": 2.8700966835021973,
"learning_rate": 1.5388122028574538e-06,
"loss": 0.1402,
"step": 1242
},
{
"epoch": 0.34,
"grad_norm": 2.932111978530884,
"learning_rate": 1.5380664823807416e-06,
"loss": 0.128,
"step": 1243
},
{
"epoch": 0.34,
"grad_norm": 3.245429515838623,
"learning_rate": 1.5373203405191477e-06,
"loss": 0.1282,
"step": 1244
},
{
"epoch": 0.34,
"grad_norm": 2.9517509937286377,
"learning_rate": 1.53657377785701e-06,
"loss": 0.1332,
"step": 1245
},
{
"epoch": 0.34,
"grad_norm": 2.945868492126465,
"learning_rate": 1.5358267949789964e-06,
"loss": 0.1345,
"step": 1246
},
{
"epoch": 0.34,
"grad_norm": 2.7037761211395264,
"learning_rate": 1.5350793924701045e-06,
"loss": 0.1319,
"step": 1247
},
{
"epoch": 0.34,
"grad_norm": 3.136314630508423,
"learning_rate": 1.5343315709156594e-06,
"loss": 0.1516,
"step": 1248
},
{
"epoch": 0.34,
"grad_norm": 2.9882936477661133,
"learning_rate": 1.533583330901315e-06,
"loss": 0.1215,
"step": 1249
},
{
"epoch": 0.34,
"grad_norm": 3.243441104888916,
"learning_rate": 1.532834673013053e-06,
"loss": 0.1336,
"step": 1250
},
{
"epoch": 0.34,
"grad_norm": 3.034088134765625,
"learning_rate": 1.5320855978371818e-06,
"loss": 0.1412,
"step": 1251
},
{
"epoch": 0.34,
"grad_norm": 2.9239449501037598,
"learning_rate": 1.531336105960338e-06,
"loss": 0.124,
"step": 1252
},
{
"epoch": 0.34,
"grad_norm": 2.957061290740967,
"learning_rate": 1.5305861979694826e-06,
"loss": 0.1381,
"step": 1253
},
{
"epoch": 0.34,
"grad_norm": 2.8607163429260254,
"learning_rate": 1.5298358744519036e-06,
"loss": 0.1175,
"step": 1254
},
{
"epoch": 0.34,
"grad_norm": 2.9602956771850586,
"learning_rate": 1.5290851359952144e-06,
"loss": 0.1445,
"step": 1255
},
{
"epoch": 0.34,
"grad_norm": 3.1619811058044434,
"learning_rate": 1.5283339831873529e-06,
"loss": 0.1551,
"step": 1256
},
{
"epoch": 0.34,
"grad_norm": 2.7596523761749268,
"learning_rate": 1.5275824166165823e-06,
"loss": 0.1187,
"step": 1257
},
{
"epoch": 0.34,
"grad_norm": 2.7872233390808105,
"learning_rate": 1.5268304368714891e-06,
"loss": 0.1342,
"step": 1258
},
{
"epoch": 0.34,
"grad_norm": 3.116015911102295,
"learning_rate": 1.5260780445409833e-06,
"loss": 0.1358,
"step": 1259
},
{
"epoch": 0.34,
"grad_norm": 3.3103036880493164,
"learning_rate": 1.5253252402142986e-06,
"loss": 0.1591,
"step": 1260
},
{
"epoch": 0.34,
"grad_norm": 2.861786127090454,
"learning_rate": 1.5245720244809914e-06,
"loss": 0.1184,
"step": 1261
},
{
"epoch": 0.34,
"grad_norm": 2.9362566471099854,
"learning_rate": 1.5238183979309397e-06,
"loss": 0.1436,
"step": 1262
},
{
"epoch": 0.35,
"grad_norm": 2.962371349334717,
"learning_rate": 1.523064361154343e-06,
"loss": 0.1398,
"step": 1263
},
{
"epoch": 0.35,
"grad_norm": 2.906949996948242,
"learning_rate": 1.5223099147417226e-06,
"loss": 0.1313,
"step": 1264
},
{
"epoch": 0.35,
"grad_norm": 2.570661783218384,
"learning_rate": 1.5215550592839217e-06,
"loss": 0.1268,
"step": 1265
},
{
"epoch": 0.35,
"grad_norm": 3.0509450435638428,
"learning_rate": 1.5207997953721017e-06,
"loss": 0.1342,
"step": 1266
},
{
"epoch": 0.35,
"grad_norm": 2.721755027770996,
"learning_rate": 1.5200441235977454e-06,
"loss": 0.1323,
"step": 1267
},
{
"epoch": 0.35,
"grad_norm": 3.1234641075134277,
"learning_rate": 1.5192880445526537e-06,
"loss": 0.1385,
"step": 1268
},
{
"epoch": 0.35,
"grad_norm": 2.9297051429748535,
"learning_rate": 1.5185315588289478e-06,
"loss": 0.1339,
"step": 1269
},
{
"epoch": 0.35,
"grad_norm": 2.916425943374634,
"learning_rate": 1.5177746670190671e-06,
"loss": 0.1321,
"step": 1270
},
{
"epoch": 0.35,
"grad_norm": 2.894190788269043,
"learning_rate": 1.5170173697157687e-06,
"loss": 0.133,
"step": 1271
},
{
"epoch": 0.35,
"grad_norm": 2.728078842163086,
"learning_rate": 1.516259667512127e-06,
"loss": 0.1322,
"step": 1272
},
{
"epoch": 0.35,
"grad_norm": 2.996042013168335,
"learning_rate": 1.515501561001534e-06,
"loss": 0.1413,
"step": 1273
},
{
"epoch": 0.35,
"grad_norm": 2.8164424896240234,
"learning_rate": 1.5147430507776978e-06,
"loss": 0.1314,
"step": 1274
},
{
"epoch": 0.35,
"grad_norm": 2.785353899002075,
"learning_rate": 1.5139841374346437e-06,
"loss": 0.1167,
"step": 1275
},
{
"epoch": 0.35,
"grad_norm": 3.056356191635132,
"learning_rate": 1.5132248215667115e-06,
"loss": 0.1388,
"step": 1276
},
{
"epoch": 0.35,
"grad_norm": 2.864875316619873,
"learning_rate": 1.512465103768557e-06,
"loss": 0.1264,
"step": 1277
},
{
"epoch": 0.35,
"grad_norm": 2.9353301525115967,
"learning_rate": 1.5117049846351508e-06,
"loss": 0.1321,
"step": 1278
},
{
"epoch": 0.35,
"grad_norm": 2.9219601154327393,
"learning_rate": 1.510944464761777e-06,
"loss": 0.131,
"step": 1279
},
{
"epoch": 0.35,
"grad_norm": 2.9697318077087402,
"learning_rate": 1.5101835447440344e-06,
"loss": 0.119,
"step": 1280
},
{
"epoch": 0.35,
"grad_norm": 3.0618062019348145,
"learning_rate": 1.5094222251778343e-06,
"loss": 0.1382,
"step": 1281
},
{
"epoch": 0.35,
"grad_norm": 3.128476142883301,
"learning_rate": 1.5086605066594024e-06,
"loss": 0.16,
"step": 1282
},
{
"epoch": 0.35,
"grad_norm": 2.7182974815368652,
"learning_rate": 1.5078983897852753e-06,
"loss": 0.1217,
"step": 1283
},
{
"epoch": 0.35,
"grad_norm": 3.076162338256836,
"learning_rate": 1.507135875152302e-06,
"loss": 0.1355,
"step": 1284
},
{
"epoch": 0.35,
"grad_norm": 2.9355263710021973,
"learning_rate": 1.506372963357644e-06,
"loss": 0.1468,
"step": 1285
},
{
"epoch": 0.35,
"grad_norm": 2.685256242752075,
"learning_rate": 1.5056096549987718e-06,
"loss": 0.1282,
"step": 1286
},
{
"epoch": 0.35,
"grad_norm": 2.9893975257873535,
"learning_rate": 1.5048459506734687e-06,
"loss": 0.1212,
"step": 1287
},
{
"epoch": 0.35,
"grad_norm": 2.740032196044922,
"learning_rate": 1.5040818509798263e-06,
"loss": 0.1323,
"step": 1288
},
{
"epoch": 0.35,
"grad_norm": 3.0219709873199463,
"learning_rate": 1.5033173565162472e-06,
"loss": 0.1366,
"step": 1289
},
{
"epoch": 0.35,
"grad_norm": 3.0304818153381348,
"learning_rate": 1.5025524678814425e-06,
"loss": 0.1386,
"step": 1290
},
{
"epoch": 0.35,
"grad_norm": 2.810936212539673,
"learning_rate": 1.5017871856744315e-06,
"loss": 0.1259,
"step": 1291
},
{
"epoch": 0.35,
"grad_norm": 2.741853713989258,
"learning_rate": 1.501021510494543e-06,
"loss": 0.1293,
"step": 1292
},
{
"epoch": 0.35,
"grad_norm": 3.019928455352783,
"learning_rate": 1.5002554429414123e-06,
"loss": 0.1341,
"step": 1293
},
{
"epoch": 0.35,
"grad_norm": 3.0014054775238037,
"learning_rate": 1.4994889836149827e-06,
"loss": 0.142,
"step": 1294
},
{
"epoch": 0.35,
"grad_norm": 3.092749834060669,
"learning_rate": 1.4987221331155042e-06,
"loss": 0.1576,
"step": 1295
},
{
"epoch": 0.35,
"grad_norm": 2.9108452796936035,
"learning_rate": 1.4979548920435332e-06,
"loss": 0.1313,
"step": 1296
},
{
"epoch": 0.35,
"grad_norm": 2.6839687824249268,
"learning_rate": 1.4971872609999315e-06,
"loss": 0.1292,
"step": 1297
},
{
"epoch": 0.35,
"grad_norm": 3.0319478511810303,
"learning_rate": 1.496419240585867e-06,
"loss": 0.148,
"step": 1298
},
{
"epoch": 0.35,
"grad_norm": 2.8133440017700195,
"learning_rate": 1.4956508314028118e-06,
"loss": 0.1273,
"step": 1299
},
{
"epoch": 0.36,
"grad_norm": 2.979665517807007,
"learning_rate": 1.4948820340525437e-06,
"loss": 0.1349,
"step": 1300
},
{
"epoch": 0.36,
"grad_norm": 2.826272487640381,
"learning_rate": 1.4941128491371426e-06,
"loss": 0.1206,
"step": 1301
},
{
"epoch": 0.36,
"grad_norm": 2.9452009201049805,
"learning_rate": 1.4933432772589936e-06,
"loss": 0.1387,
"step": 1302
},
{
"epoch": 0.36,
"grad_norm": 2.8416945934295654,
"learning_rate": 1.4925733190207839e-06,
"loss": 0.1481,
"step": 1303
},
{
"epoch": 0.36,
"grad_norm": 2.7696831226348877,
"learning_rate": 1.4918029750255039e-06,
"loss": 0.1186,
"step": 1304
},
{
"epoch": 0.36,
"grad_norm": 3.0068044662475586,
"learning_rate": 1.491032245876446e-06,
"loss": 0.1231,
"step": 1305
},
{
"epoch": 0.36,
"grad_norm": 3.0028553009033203,
"learning_rate": 1.490261132177203e-06,
"loss": 0.127,
"step": 1306
},
{
"epoch": 0.36,
"grad_norm": 2.9873032569885254,
"learning_rate": 1.4894896345316713e-06,
"loss": 0.136,
"step": 1307
},
{
"epoch": 0.36,
"grad_norm": 2.8812692165374756,
"learning_rate": 1.4887177535440456e-06,
"loss": 0.1322,
"step": 1308
},
{
"epoch": 0.36,
"grad_norm": 3.014873743057251,
"learning_rate": 1.4879454898188222e-06,
"loss": 0.1282,
"step": 1309
},
{
"epoch": 0.36,
"grad_norm": 3.1590218544006348,
"learning_rate": 1.4871728439607964e-06,
"loss": 0.1455,
"step": 1310
},
{
"epoch": 0.36,
"grad_norm": 2.8542122840881348,
"learning_rate": 1.4863998165750636e-06,
"loss": 0.1448,
"step": 1311
},
{
"epoch": 0.36,
"grad_norm": 2.68994140625,
"learning_rate": 1.4856264082670169e-06,
"loss": 0.127,
"step": 1312
},
{
"epoch": 0.36,
"grad_norm": 3.079030752182007,
"learning_rate": 1.484852619642349e-06,
"loss": 0.1415,
"step": 1313
},
{
"epoch": 0.36,
"grad_norm": 2.891287088394165,
"learning_rate": 1.484078451307049e-06,
"loss": 0.1374,
"step": 1314
},
{
"epoch": 0.36,
"grad_norm": 3.1313259601593018,
"learning_rate": 1.4833039038674046e-06,
"loss": 0.1287,
"step": 1315
},
{
"epoch": 0.36,
"grad_norm": 2.799778938293457,
"learning_rate": 1.4825289779299998e-06,
"loss": 0.1307,
"step": 1316
},
{
"epoch": 0.36,
"grad_norm": 2.9091029167175293,
"learning_rate": 1.4817536741017151e-06,
"loss": 0.1322,
"step": 1317
},
{
"epoch": 0.36,
"grad_norm": 2.757341146469116,
"learning_rate": 1.4809779929897272e-06,
"loss": 0.1218,
"step": 1318
},
{
"epoch": 0.36,
"grad_norm": 3.112070083618164,
"learning_rate": 1.4802019352015078e-06,
"loss": 0.1241,
"step": 1319
},
{
"epoch": 0.36,
"grad_norm": 2.776374578475952,
"learning_rate": 1.479425501344824e-06,
"loss": 0.1369,
"step": 1320
},
{
"epoch": 0.36,
"grad_norm": 2.764132499694824,
"learning_rate": 1.478648692027737e-06,
"loss": 0.1197,
"step": 1321
},
{
"epoch": 0.36,
"grad_norm": 2.757923126220703,
"learning_rate": 1.477871507858602e-06,
"loss": 0.1193,
"step": 1322
},
{
"epoch": 0.36,
"grad_norm": 3.072037696838379,
"learning_rate": 1.4770939494460696e-06,
"loss": 0.1236,
"step": 1323
},
{
"epoch": 0.36,
"grad_norm": 2.9252185821533203,
"learning_rate": 1.4763160173990801e-06,
"loss": 0.1221,
"step": 1324
},
{
"epoch": 0.36,
"grad_norm": 3.2856593132019043,
"learning_rate": 1.475537712326869e-06,
"loss": 0.1436,
"step": 1325
},
{
"epoch": 0.36,
"grad_norm": 3.1054296493530273,
"learning_rate": 1.4747590348389638e-06,
"loss": 0.1369,
"step": 1326
},
{
"epoch": 0.36,
"grad_norm": 2.757472276687622,
"learning_rate": 1.4739799855451819e-06,
"loss": 0.1284,
"step": 1327
},
{
"epoch": 0.36,
"grad_norm": 2.970815896987915,
"learning_rate": 1.473200565055634e-06,
"loss": 0.1452,
"step": 1328
},
{
"epoch": 0.36,
"grad_norm": 2.9534873962402344,
"learning_rate": 1.4724207739807199e-06,
"loss": 0.1456,
"step": 1329
},
{
"epoch": 0.36,
"grad_norm": 3.152365207672119,
"learning_rate": 1.4716406129311306e-06,
"loss": 0.1288,
"step": 1330
},
{
"epoch": 0.36,
"grad_norm": 2.8408286571502686,
"learning_rate": 1.4708600825178463e-06,
"loss": 0.1315,
"step": 1331
},
{
"epoch": 0.36,
"grad_norm": 2.619940996170044,
"learning_rate": 1.4700791833521365e-06,
"loss": 0.1284,
"step": 1332
},
{
"epoch": 0.36,
"grad_norm": 2.636654853820801,
"learning_rate": 1.4692979160455603e-06,
"loss": 0.1132,
"step": 1333
},
{
"epoch": 0.36,
"grad_norm": 2.939162015914917,
"learning_rate": 1.4685162812099637e-06,
"loss": 0.1359,
"step": 1334
},
{
"epoch": 0.36,
"grad_norm": 2.7760133743286133,
"learning_rate": 1.4677342794574815e-06,
"loss": 0.1246,
"step": 1335
},
{
"epoch": 0.36,
"grad_norm": 2.7537975311279297,
"learning_rate": 1.4669519114005365e-06,
"loss": 0.132,
"step": 1336
},
{
"epoch": 0.37,
"grad_norm": 2.7049872875213623,
"learning_rate": 1.4661691776518358e-06,
"loss": 0.1351,
"step": 1337
},
{
"epoch": 0.37,
"grad_norm": 2.9597678184509277,
"learning_rate": 1.4653860788243764e-06,
"loss": 0.1461,
"step": 1338
},
{
"epoch": 0.37,
"grad_norm": 3.550565004348755,
"learning_rate": 1.4646026155314382e-06,
"loss": 0.1254,
"step": 1339
},
{
"epoch": 0.37,
"grad_norm": 2.9513895511627197,
"learning_rate": 1.463818788386588e-06,
"loss": 0.1199,
"step": 1340
},
{
"epoch": 0.37,
"grad_norm": 2.597242593765259,
"learning_rate": 1.4630345980036773e-06,
"loss": 0.1265,
"step": 1341
},
{
"epoch": 0.37,
"grad_norm": 2.9622340202331543,
"learning_rate": 1.4622500449968424e-06,
"loss": 0.1487,
"step": 1342
},
{
"epoch": 0.37,
"grad_norm": 2.835066795349121,
"learning_rate": 1.461465129980503e-06,
"loss": 0.1357,
"step": 1343
},
{
"epoch": 0.37,
"grad_norm": 2.7730233669281006,
"learning_rate": 1.4606798535693625e-06,
"loss": 0.1332,
"step": 1344
},
{
"epoch": 0.37,
"grad_norm": 3.090608596801758,
"learning_rate": 1.459894216378407e-06,
"loss": 0.1248,
"step": 1345
},
{
"epoch": 0.37,
"grad_norm": 2.9221718311309814,
"learning_rate": 1.4591082190229065e-06,
"loss": 0.1263,
"step": 1346
},
{
"epoch": 0.37,
"grad_norm": 2.7651219367980957,
"learning_rate": 1.458321862118411e-06,
"loss": 0.1321,
"step": 1347
},
{
"epoch": 0.37,
"grad_norm": 2.8736658096313477,
"learning_rate": 1.4575351462807542e-06,
"loss": 0.1211,
"step": 1348
},
{
"epoch": 0.37,
"grad_norm": 2.7251627445220947,
"learning_rate": 1.4567480721260487e-06,
"loss": 0.1309,
"step": 1349
},
{
"epoch": 0.37,
"grad_norm": 2.879901647567749,
"learning_rate": 1.4559606402706898e-06,
"loss": 0.138,
"step": 1350
},
{
"epoch": 0.37,
"grad_norm": 2.9609620571136475,
"learning_rate": 1.4551728513313514e-06,
"loss": 0.1315,
"step": 1351
},
{
"epoch": 0.37,
"grad_norm": 2.6929612159729004,
"learning_rate": 1.4543847059249882e-06,
"loss": 0.1304,
"step": 1352
},
{
"epoch": 0.37,
"grad_norm": 2.9134647846221924,
"learning_rate": 1.4535962046688332e-06,
"loss": 0.1422,
"step": 1353
},
{
"epoch": 0.37,
"grad_norm": 3.054995059967041,
"learning_rate": 1.4528073481803984e-06,
"loss": 0.1358,
"step": 1354
},
{
"epoch": 0.37,
"grad_norm": 2.8526723384857178,
"learning_rate": 1.452018137077474e-06,
"loss": 0.132,
"step": 1355
},
{
"epoch": 0.37,
"grad_norm": 2.7417056560516357,
"learning_rate": 1.4512285719781278e-06,
"loss": 0.1258,
"step": 1356
},
{
"epoch": 0.37,
"grad_norm": 3.0152394771575928,
"learning_rate": 1.4504386535007054e-06,
"loss": 0.1325,
"step": 1357
},
{
"epoch": 0.37,
"grad_norm": 2.9597837924957275,
"learning_rate": 1.4496483822638283e-06,
"loss": 0.1428,
"step": 1358
},
{
"epoch": 0.37,
"grad_norm": 2.6742889881134033,
"learning_rate": 1.4488577588863947e-06,
"loss": 0.1235,
"step": 1359
},
{
"epoch": 0.37,
"grad_norm": 2.8367764949798584,
"learning_rate": 1.4480667839875784e-06,
"loss": 0.1384,
"step": 1360
},
{
"epoch": 0.37,
"grad_norm": 3.017707586288452,
"learning_rate": 1.447275458186829e-06,
"loss": 0.1345,
"step": 1361
},
{
"epoch": 0.37,
"grad_norm": 2.8236801624298096,
"learning_rate": 1.4464837821038702e-06,
"loss": 0.1328,
"step": 1362
},
{
"epoch": 0.37,
"grad_norm": 2.7663307189941406,
"learning_rate": 1.4456917563587006e-06,
"loss": 0.1258,
"step": 1363
},
{
"epoch": 0.37,
"grad_norm": 2.5681021213531494,
"learning_rate": 1.444899381571592e-06,
"loss": 0.1166,
"step": 1364
},
{
"epoch": 0.37,
"grad_norm": 3.04805588722229,
"learning_rate": 1.4441066583630903e-06,
"loss": 0.1209,
"step": 1365
},
{
"epoch": 0.37,
"grad_norm": 3.0715489387512207,
"learning_rate": 1.4433135873540139e-06,
"loss": 0.1524,
"step": 1366
},
{
"epoch": 0.37,
"grad_norm": 3.092496871948242,
"learning_rate": 1.4425201691654534e-06,
"loss": 0.1462,
"step": 1367
},
{
"epoch": 0.37,
"grad_norm": 2.8307652473449707,
"learning_rate": 1.4417264044187718e-06,
"loss": 0.1315,
"step": 1368
},
{
"epoch": 0.37,
"grad_norm": 2.9191513061523438,
"learning_rate": 1.4409322937356026e-06,
"loss": 0.1332,
"step": 1369
},
{
"epoch": 0.37,
"grad_norm": 2.7125518321990967,
"learning_rate": 1.440137837737851e-06,
"loss": 0.1276,
"step": 1370
},
{
"epoch": 0.37,
"grad_norm": 3.0837535858154297,
"learning_rate": 1.4393430370476931e-06,
"loss": 0.1375,
"step": 1371
},
{
"epoch": 0.37,
"grad_norm": 3.0928070545196533,
"learning_rate": 1.4385478922875734e-06,
"loss": 0.139,
"step": 1372
},
{
"epoch": 0.38,
"grad_norm": 2.916564464569092,
"learning_rate": 1.4377524040802072e-06,
"loss": 0.1268,
"step": 1373
},
{
"epoch": 0.38,
"grad_norm": 3.063411235809326,
"learning_rate": 1.4369565730485785e-06,
"loss": 0.1293,
"step": 1374
},
{
"epoch": 0.38,
"grad_norm": 2.8868045806884766,
"learning_rate": 1.4361603998159387e-06,
"loss": 0.1239,
"step": 1375
},
{
"epoch": 0.38,
"grad_norm": 3.281874179840088,
"learning_rate": 1.4353638850058092e-06,
"loss": 0.1504,
"step": 1376
},
{
"epoch": 0.38,
"grad_norm": 2.732192039489746,
"learning_rate": 1.434567029241977e-06,
"loss": 0.1331,
"step": 1377
},
{
"epoch": 0.38,
"grad_norm": 2.7928121089935303,
"learning_rate": 1.433769833148497e-06,
"loss": 0.1115,
"step": 1378
},
{
"epoch": 0.38,
"grad_norm": 3.060171604156494,
"learning_rate": 1.4329722973496908e-06,
"loss": 0.1312,
"step": 1379
},
{
"epoch": 0.38,
"grad_norm": 3.192661762237549,
"learning_rate": 1.4321744224701458e-06,
"loss": 0.145,
"step": 1380
},
{
"epoch": 0.38,
"grad_norm": 2.8441617488861084,
"learning_rate": 1.4313762091347148e-06,
"loss": 0.1391,
"step": 1381
},
{
"epoch": 0.38,
"grad_norm": 2.771820545196533,
"learning_rate": 1.4305776579685155e-06,
"loss": 0.1377,
"step": 1382
},
{
"epoch": 0.38,
"grad_norm": 2.9881622791290283,
"learning_rate": 1.4297787695969308e-06,
"loss": 0.1382,
"step": 1383
},
{
"epoch": 0.38,
"grad_norm": 2.8311970233917236,
"learning_rate": 1.4289795446456074e-06,
"loss": 0.1364,
"step": 1384
},
{
"epoch": 0.38,
"grad_norm": 2.8374383449554443,
"learning_rate": 1.428179983740455e-06,
"loss": 0.137,
"step": 1385
},
{
"epoch": 0.38,
"grad_norm": 2.8031299114227295,
"learning_rate": 1.4273800875076478e-06,
"loss": 0.1374,
"step": 1386
},
{
"epoch": 0.38,
"grad_norm": 2.781954526901245,
"learning_rate": 1.4265798565736209e-06,
"loss": 0.1407,
"step": 1387
},
{
"epoch": 0.38,
"grad_norm": 2.935701370239258,
"learning_rate": 1.4257792915650725e-06,
"loss": 0.1431,
"step": 1388
},
{
"epoch": 0.38,
"grad_norm": 2.691863775253296,
"learning_rate": 1.424978393108963e-06,
"loss": 0.1233,
"step": 1389
},
{
"epoch": 0.38,
"grad_norm": 3.0290353298187256,
"learning_rate": 1.424177161832512e-06,
"loss": 0.142,
"step": 1390
},
{
"epoch": 0.38,
"grad_norm": 2.838080406188965,
"learning_rate": 1.423375598363202e-06,
"loss": 0.1301,
"step": 1391
},
{
"epoch": 0.38,
"grad_norm": 2.8826422691345215,
"learning_rate": 1.422573703328774e-06,
"loss": 0.1289,
"step": 1392
},
{
"epoch": 0.38,
"grad_norm": 3.0484912395477295,
"learning_rate": 1.42177147735723e-06,
"loss": 0.1292,
"step": 1393
},
{
"epoch": 0.38,
"grad_norm": 3.0856308937072754,
"learning_rate": 1.42096892107683e-06,
"loss": 0.1358,
"step": 1394
},
{
"epoch": 0.38,
"grad_norm": 2.663012742996216,
"learning_rate": 1.4201660351160928e-06,
"loss": 0.1213,
"step": 1395
},
{
"epoch": 0.38,
"grad_norm": 2.953725814819336,
"learning_rate": 1.4193628201037964e-06,
"loss": 0.1262,
"step": 1396
},
{
"epoch": 0.38,
"grad_norm": 3.0396006107330322,
"learning_rate": 1.4185592766689751e-06,
"loss": 0.1444,
"step": 1397
},
{
"epoch": 0.38,
"grad_norm": 3.0202651023864746,
"learning_rate": 1.4177554054409219e-06,
"loss": 0.141,
"step": 1398
},
{
"epoch": 0.38,
"grad_norm": 2.6195216178894043,
"learning_rate": 1.4169512070491852e-06,
"loss": 0.124,
"step": 1399
},
{
"epoch": 0.38,
"grad_norm": 3.101203680038452,
"learning_rate": 1.4161466821235703e-06,
"loss": 0.1425,
"step": 1400
},
{
"epoch": 0.38,
"grad_norm": 2.9621312618255615,
"learning_rate": 1.4153418312941386e-06,
"loss": 0.1407,
"step": 1401
},
{
"epoch": 0.38,
"grad_norm": 3.1151602268218994,
"learning_rate": 1.4145366551912052e-06,
"loss": 0.1453,
"step": 1402
},
{
"epoch": 0.38,
"grad_norm": 3.0556440353393555,
"learning_rate": 1.4137311544453416e-06,
"loss": 0.1287,
"step": 1403
},
{
"epoch": 0.38,
"grad_norm": 2.853315830230713,
"learning_rate": 1.4129253296873727e-06,
"loss": 0.1268,
"step": 1404
},
{
"epoch": 0.38,
"grad_norm": 3.153733968734741,
"learning_rate": 1.4121191815483774e-06,
"loss": 0.1389,
"step": 1405
},
{
"epoch": 0.38,
"grad_norm": 3.0757758617401123,
"learning_rate": 1.411312710659688e-06,
"loss": 0.1498,
"step": 1406
},
{
"epoch": 0.38,
"grad_norm": 3.0497725009918213,
"learning_rate": 1.410505917652889e-06,
"loss": 0.1516,
"step": 1407
},
{
"epoch": 0.38,
"grad_norm": 2.795180082321167,
"learning_rate": 1.4096988031598178e-06,
"loss": 0.1285,
"step": 1408
},
{
"epoch": 0.38,
"grad_norm": 2.8632426261901855,
"learning_rate": 1.4088913678125628e-06,
"loss": 0.1316,
"step": 1409
},
{
"epoch": 0.39,
"grad_norm": 2.789442539215088,
"learning_rate": 1.4080836122434648e-06,
"loss": 0.1299,
"step": 1410
},
{
"epoch": 0.39,
"grad_norm": 2.9751696586608887,
"learning_rate": 1.4072755370851147e-06,
"loss": 0.1414,
"step": 1411
},
{
"epoch": 0.39,
"grad_norm": 2.9262402057647705,
"learning_rate": 1.406467142970353e-06,
"loss": 0.1327,
"step": 1412
},
{
"epoch": 0.39,
"grad_norm": 2.9993863105773926,
"learning_rate": 1.4056584305322714e-06,
"loss": 0.1201,
"step": 1413
},
{
"epoch": 0.39,
"grad_norm": 2.710305690765381,
"learning_rate": 1.4048494004042102e-06,
"loss": 0.1314,
"step": 1414
},
{
"epoch": 0.39,
"grad_norm": 2.9878225326538086,
"learning_rate": 1.404040053219758e-06,
"loss": 0.128,
"step": 1415
},
{
"epoch": 0.39,
"grad_norm": 2.8981685638427734,
"learning_rate": 1.403230389612753e-06,
"loss": 0.1177,
"step": 1416
},
{
"epoch": 0.39,
"grad_norm": 2.8175344467163086,
"learning_rate": 1.4024204102172797e-06,
"loss": 0.1441,
"step": 1417
},
{
"epoch": 0.39,
"grad_norm": 2.7358124256134033,
"learning_rate": 1.401610115667671e-06,
"loss": 0.1288,
"step": 1418
},
{
"epoch": 0.39,
"grad_norm": 2.9867782592773438,
"learning_rate": 1.400799506598506e-06,
"loss": 0.1303,
"step": 1419
},
{
"epoch": 0.39,
"grad_norm": 3.090707778930664,
"learning_rate": 1.3999885836446104e-06,
"loss": 0.1429,
"step": 1420
},
{
"epoch": 0.39,
"grad_norm": 2.998790740966797,
"learning_rate": 1.399177347441056e-06,
"loss": 0.1298,
"step": 1421
},
{
"epoch": 0.39,
"grad_norm": 2.760016679763794,
"learning_rate": 1.3983657986231596e-06,
"loss": 0.1381,
"step": 1422
},
{
"epoch": 0.39,
"grad_norm": 2.9793732166290283,
"learning_rate": 1.3975539378264823e-06,
"loss": 0.1343,
"step": 1423
},
{
"epoch": 0.39,
"grad_norm": 2.8108506202697754,
"learning_rate": 1.3967417656868301e-06,
"loss": 0.1386,
"step": 1424
},
{
"epoch": 0.39,
"grad_norm": 2.9230880737304688,
"learning_rate": 1.395929282840253e-06,
"loss": 0.1454,
"step": 1425
},
{
"epoch": 0.39,
"grad_norm": 2.837275981903076,
"learning_rate": 1.3951164899230446e-06,
"loss": 0.1343,
"step": 1426
},
{
"epoch": 0.39,
"grad_norm": 2.712369680404663,
"learning_rate": 1.3943033875717403e-06,
"loss": 0.1331,
"step": 1427
},
{
"epoch": 0.39,
"grad_norm": 2.855681896209717,
"learning_rate": 1.3934899764231177e-06,
"loss": 0.1184,
"step": 1428
},
{
"epoch": 0.39,
"grad_norm": 2.837350368499756,
"learning_rate": 1.392676257114198e-06,
"loss": 0.1389,
"step": 1429
},
{
"epoch": 0.39,
"grad_norm": 2.9050211906433105,
"learning_rate": 1.3918622302822423e-06,
"loss": 0.132,
"step": 1430
},
{
"epoch": 0.39,
"grad_norm": 2.8807296752929688,
"learning_rate": 1.3910478965647524e-06,
"loss": 0.1399,
"step": 1431
},
{
"epoch": 0.39,
"grad_norm": 2.598497152328491,
"learning_rate": 1.3902332565994719e-06,
"loss": 0.1257,
"step": 1432
},
{
"epoch": 0.39,
"grad_norm": 2.9906957149505615,
"learning_rate": 1.3894183110243819e-06,
"loss": 0.1305,
"step": 1433
},
{
"epoch": 0.39,
"grad_norm": 2.8242135047912598,
"learning_rate": 1.3886030604777052e-06,
"loss": 0.1277,
"step": 1434
},
{
"epoch": 0.39,
"grad_norm": 2.646484851837158,
"learning_rate": 1.387787505597902e-06,
"loss": 0.1137,
"step": 1435
},
{
"epoch": 0.39,
"grad_norm": 2.8431029319763184,
"learning_rate": 1.3869716470236714e-06,
"loss": 0.1386,
"step": 1436
},
{
"epoch": 0.39,
"grad_norm": 3.0383403301239014,
"learning_rate": 1.3861554853939503e-06,
"loss": 0.1364,
"step": 1437
},
{
"epoch": 0.39,
"grad_norm": 3.039416551589966,
"learning_rate": 1.385339021347912e-06,
"loss": 0.1301,
"step": 1438
},
{
"epoch": 0.39,
"grad_norm": 2.651421308517456,
"learning_rate": 1.384522255524969e-06,
"loss": 0.1134,
"step": 1439
},
{
"epoch": 0.39,
"grad_norm": 2.703716278076172,
"learning_rate": 1.383705188564767e-06,
"loss": 0.1272,
"step": 1440
},
{
"epoch": 0.39,
"grad_norm": 2.7087182998657227,
"learning_rate": 1.3828878211071902e-06,
"loss": 0.1262,
"step": 1441
},
{
"epoch": 0.39,
"grad_norm": 3.084522008895874,
"learning_rate": 1.3820701537923567e-06,
"loss": 0.1377,
"step": 1442
},
{
"epoch": 0.39,
"grad_norm": 3.0757529735565186,
"learning_rate": 1.3812521872606192e-06,
"loss": 0.1368,
"step": 1443
},
{
"epoch": 0.39,
"grad_norm": 3.411841869354248,
"learning_rate": 1.3804339221525667e-06,
"loss": 0.1441,
"step": 1444
},
{
"epoch": 0.39,
"grad_norm": 2.864745616912842,
"learning_rate": 1.3796153591090193e-06,
"loss": 0.1391,
"step": 1445
},
{
"epoch": 0.4,
"grad_norm": 3.1198856830596924,
"learning_rate": 1.3787964987710325e-06,
"loss": 0.1379,
"step": 1446
},
{
"epoch": 0.4,
"grad_norm": 2.7072486877441406,
"learning_rate": 1.3779773417798942e-06,
"loss": 0.1187,
"step": 1447
},
{
"epoch": 0.4,
"grad_norm": 2.6911025047302246,
"learning_rate": 1.3771578887771231e-06,
"loss": 0.1217,
"step": 1448
},
{
"epoch": 0.4,
"grad_norm": 2.9282870292663574,
"learning_rate": 1.3763381404044723e-06,
"loss": 0.1371,
"step": 1449
},
{
"epoch": 0.4,
"grad_norm": 2.673795461654663,
"learning_rate": 1.375518097303924e-06,
"loss": 0.1298,
"step": 1450
},
{
"epoch": 0.4,
"grad_norm": 3.007877826690674,
"learning_rate": 1.3746977601176925e-06,
"loss": 0.1257,
"step": 1451
},
{
"epoch": 0.4,
"grad_norm": 2.7329001426696777,
"learning_rate": 1.3738771294882222e-06,
"loss": 0.1255,
"step": 1452
},
{
"epoch": 0.4,
"grad_norm": 3.172227621078491,
"learning_rate": 1.373056206058186e-06,
"loss": 0.1372,
"step": 1453
},
{
"epoch": 0.4,
"grad_norm": 2.843055009841919,
"learning_rate": 1.372234990470489e-06,
"loss": 0.139,
"step": 1454
},
{
"epoch": 0.4,
"grad_norm": 2.7065436840057373,
"learning_rate": 1.3714134833682616e-06,
"loss": 0.1245,
"step": 1455
},
{
"epoch": 0.4,
"grad_norm": 3.0000193119049072,
"learning_rate": 1.3705916853948652e-06,
"loss": 0.1405,
"step": 1456
},
{
"epoch": 0.4,
"grad_norm": 2.6848971843719482,
"learning_rate": 1.3697695971938875e-06,
"loss": 0.1198,
"step": 1457
},
{
"epoch": 0.4,
"grad_norm": 2.851579189300537,
"learning_rate": 1.3689472194091442e-06,
"loss": 0.1305,
"step": 1458
},
{
"epoch": 0.4,
"grad_norm": 3.1303484439849854,
"learning_rate": 1.3681245526846781e-06,
"loss": 0.1533,
"step": 1459
},
{
"epoch": 0.4,
"grad_norm": 2.7993600368499756,
"learning_rate": 1.3673015976647567e-06,
"loss": 0.1332,
"step": 1460
},
{
"epoch": 0.4,
"grad_norm": 3.001685380935669,
"learning_rate": 1.3664783549938752e-06,
"loss": 0.1393,
"step": 1461
},
{
"epoch": 0.4,
"grad_norm": 2.8103721141815186,
"learning_rate": 1.3656548253167529e-06,
"loss": 0.1439,
"step": 1462
},
{
"epoch": 0.4,
"grad_norm": 2.807375907897949,
"learning_rate": 1.3648310092783342e-06,
"loss": 0.1367,
"step": 1463
},
{
"epoch": 0.4,
"grad_norm": 2.8973469734191895,
"learning_rate": 1.364006907523788e-06,
"loss": 0.1412,
"step": 1464
},
{
"epoch": 0.4,
"grad_norm": 3.0541858673095703,
"learning_rate": 1.3631825206985062e-06,
"loss": 0.1372,
"step": 1465
},
{
"epoch": 0.4,
"grad_norm": 3.022650957107544,
"learning_rate": 1.3623578494481045e-06,
"loss": 0.1332,
"step": 1466
},
{
"epoch": 0.4,
"grad_norm": 2.6765482425689697,
"learning_rate": 1.3615328944184219e-06,
"loss": 0.122,
"step": 1467
},
{
"epoch": 0.4,
"grad_norm": 2.8813273906707764,
"learning_rate": 1.3607076562555185e-06,
"loss": 0.1403,
"step": 1468
},
{
"epoch": 0.4,
"grad_norm": 2.8541016578674316,
"learning_rate": 1.3598821356056766e-06,
"loss": 0.1278,
"step": 1469
},
{
"epoch": 0.4,
"grad_norm": 2.8218047618865967,
"learning_rate": 1.3590563331154005e-06,
"loss": 0.1287,
"step": 1470
},
{
"epoch": 0.4,
"grad_norm": 2.771939992904663,
"learning_rate": 1.358230249431414e-06,
"loss": 0.1186,
"step": 1471
},
{
"epoch": 0.4,
"grad_norm": 2.6950502395629883,
"learning_rate": 1.3574038852006618e-06,
"loss": 0.1417,
"step": 1472
},
{
"epoch": 0.4,
"grad_norm": 2.8820621967315674,
"learning_rate": 1.3565772410703077e-06,
"loss": 0.1333,
"step": 1473
},
{
"epoch": 0.4,
"grad_norm": 3.0206656455993652,
"learning_rate": 1.3557503176877356e-06,
"loss": 0.1288,
"step": 1474
},
{
"epoch": 0.4,
"grad_norm": 2.8983829021453857,
"learning_rate": 1.3549231157005482e-06,
"loss": 0.1346,
"step": 1475
},
{
"epoch": 0.4,
"grad_norm": 3.0689306259155273,
"learning_rate": 1.3540956357565648e-06,
"loss": 0.1483,
"step": 1476
},
{
"epoch": 0.4,
"grad_norm": 2.4660563468933105,
"learning_rate": 1.3532678785038236e-06,
"loss": 0.1109,
"step": 1477
},
{
"epoch": 0.4,
"grad_norm": 3.0170247554779053,
"learning_rate": 1.3524398445905802e-06,
"loss": 0.1235,
"step": 1478
},
{
"epoch": 0.4,
"grad_norm": 2.884615421295166,
"learning_rate": 1.3516115346653063e-06,
"loss": 0.1431,
"step": 1479
},
{
"epoch": 0.4,
"grad_norm": 3.03633451461792,
"learning_rate": 1.3507829493766903e-06,
"loss": 0.1381,
"step": 1480
},
{
"epoch": 0.4,
"grad_norm": 3.0857226848602295,
"learning_rate": 1.3499540893736351e-06,
"loss": 0.1444,
"step": 1481
},
{
"epoch": 0.4,
"grad_norm": 3.016254186630249,
"learning_rate": 1.34912495530526e-06,
"loss": 0.1364,
"step": 1482
},
{
"epoch": 0.41,
"grad_norm": 2.948850631713867,
"learning_rate": 1.3482955478208983e-06,
"loss": 0.1385,
"step": 1483
},
{
"epoch": 0.41,
"grad_norm": 2.800083637237549,
"learning_rate": 1.3474658675700976e-06,
"loss": 0.1338,
"step": 1484
},
{
"epoch": 0.41,
"grad_norm": 2.733985424041748,
"learning_rate": 1.3466359152026195e-06,
"loss": 0.127,
"step": 1485
},
{
"epoch": 0.41,
"grad_norm": 2.8768179416656494,
"learning_rate": 1.3458056913684372e-06,
"loss": 0.1219,
"step": 1486
},
{
"epoch": 0.41,
"grad_norm": 2.8934342861175537,
"learning_rate": 1.344975196717739e-06,
"loss": 0.1369,
"step": 1487
},
{
"epoch": 0.41,
"grad_norm": 3.561100721359253,
"learning_rate": 1.3441444319009226e-06,
"loss": 0.122,
"step": 1488
},
{
"epoch": 0.41,
"grad_norm": 2.6823904514312744,
"learning_rate": 1.3433133975685994e-06,
"loss": 0.1253,
"step": 1489
},
{
"epoch": 0.41,
"grad_norm": 2.866231679916382,
"learning_rate": 1.342482094371591e-06,
"loss": 0.135,
"step": 1490
},
{
"epoch": 0.41,
"grad_norm": 3.0388472080230713,
"learning_rate": 1.3416505229609285e-06,
"loss": 0.1488,
"step": 1491
},
{
"epoch": 0.41,
"grad_norm": 2.9190943241119385,
"learning_rate": 1.3408186839878556e-06,
"loss": 0.1332,
"step": 1492
},
{
"epoch": 0.41,
"grad_norm": 2.707432985305786,
"learning_rate": 1.3399865781038233e-06,
"loss": 0.1141,
"step": 1493
},
{
"epoch": 0.41,
"grad_norm": 2.6063971519470215,
"learning_rate": 1.3391542059604926e-06,
"loss": 0.1226,
"step": 1494
},
{
"epoch": 0.41,
"grad_norm": 3.019693374633789,
"learning_rate": 1.3383215682097328e-06,
"loss": 0.1216,
"step": 1495
},
{
"epoch": 0.41,
"grad_norm": 2.6912519931793213,
"learning_rate": 1.337488665503621e-06,
"loss": 0.1328,
"step": 1496
},
{
"epoch": 0.41,
"grad_norm": 2.7698280811309814,
"learning_rate": 1.3366554984944428e-06,
"loss": 0.1277,
"step": 1497
},
{
"epoch": 0.41,
"grad_norm": 2.834601402282715,
"learning_rate": 1.335822067834689e-06,
"loss": 0.1325,
"step": 1498
},
{
"epoch": 0.41,
"grad_norm": 2.886516809463501,
"learning_rate": 1.3349883741770586e-06,
"loss": 0.1219,
"step": 1499
},
{
"epoch": 0.41,
"grad_norm": 2.703620195388794,
"learning_rate": 1.3341544181744557e-06,
"loss": 0.1192,
"step": 1500
},
{
"epoch": 0.41,
"grad_norm": 2.719348430633545,
"learning_rate": 1.3333202004799897e-06,
"loss": 0.1162,
"step": 1501
},
{
"epoch": 0.41,
"grad_norm": 2.807950258255005,
"learning_rate": 1.332485721746976e-06,
"loss": 0.1315,
"step": 1502
},
{
"epoch": 0.41,
"grad_norm": 3.306149959564209,
"learning_rate": 1.3316509826289331e-06,
"loss": 0.1516,
"step": 1503
},
{
"epoch": 0.41,
"grad_norm": 3.049546718597412,
"learning_rate": 1.330815983779584e-06,
"loss": 0.1318,
"step": 1504
},
{
"epoch": 0.41,
"grad_norm": 2.990818500518799,
"learning_rate": 1.3299807258528555e-06,
"loss": 0.1396,
"step": 1505
},
{
"epoch": 0.41,
"grad_norm": 2.500312328338623,
"learning_rate": 1.3291452095028766e-06,
"loss": 0.1095,
"step": 1506
},
{
"epoch": 0.41,
"grad_norm": 3.0452494621276855,
"learning_rate": 1.3283094353839792e-06,
"loss": 0.1336,
"step": 1507
},
{
"epoch": 0.41,
"grad_norm": 2.7827515602111816,
"learning_rate": 1.3274734041506968e-06,
"loss": 0.1277,
"step": 1508
},
{
"epoch": 0.41,
"grad_norm": 3.001279830932617,
"learning_rate": 1.3266371164577642e-06,
"loss": 0.1424,
"step": 1509
},
{
"epoch": 0.41,
"grad_norm": 3.0327725410461426,
"learning_rate": 1.3258005729601176e-06,
"loss": 0.1428,
"step": 1510
},
{
"epoch": 0.41,
"grad_norm": 2.7685489654541016,
"learning_rate": 1.3249637743128926e-06,
"loss": 0.1107,
"step": 1511
},
{
"epoch": 0.41,
"grad_norm": 2.811110019683838,
"learning_rate": 1.3241267211714255e-06,
"loss": 0.1332,
"step": 1512
},
{
"epoch": 0.41,
"grad_norm": 2.7999536991119385,
"learning_rate": 1.3232894141912512e-06,
"loss": 0.1293,
"step": 1513
},
{
"epoch": 0.41,
"grad_norm": 2.9219651222229004,
"learning_rate": 1.322451854028104e-06,
"loss": 0.1312,
"step": 1514
},
{
"epoch": 0.41,
"grad_norm": 2.8820810317993164,
"learning_rate": 1.3216140413379164e-06,
"loss": 0.1518,
"step": 1515
},
{
"epoch": 0.41,
"grad_norm": 2.7859690189361572,
"learning_rate": 1.3207759767768177e-06,
"loss": 0.1397,
"step": 1516
},
{
"epoch": 0.41,
"grad_norm": 2.8043460845947266,
"learning_rate": 1.3199376610011359e-06,
"loss": 0.1325,
"step": 1517
},
{
"epoch": 0.41,
"grad_norm": 3.043303966522217,
"learning_rate": 1.3190990946673951e-06,
"loss": 0.1485,
"step": 1518
},
{
"epoch": 0.41,
"grad_norm": 2.965022325515747,
"learning_rate": 1.3182602784323155e-06,
"loss": 0.1364,
"step": 1519
},
{
"epoch": 0.42,
"grad_norm": 2.5356366634368896,
"learning_rate": 1.317421212952813e-06,
"loss": 0.1113,
"step": 1520
},
{
"epoch": 0.42,
"grad_norm": 2.8506321907043457,
"learning_rate": 1.3165818988859984e-06,
"loss": 0.1414,
"step": 1521
},
{
"epoch": 0.42,
"grad_norm": 2.8930158615112305,
"learning_rate": 1.315742336889178e-06,
"loss": 0.1405,
"step": 1522
},
{
"epoch": 0.42,
"grad_norm": 3.2710516452789307,
"learning_rate": 1.3149025276198522e-06,
"loss": 0.1554,
"step": 1523
},
{
"epoch": 0.42,
"grad_norm": 2.6492574214935303,
"learning_rate": 1.3140624717357141e-06,
"loss": 0.1185,
"step": 1524
},
{
"epoch": 0.42,
"grad_norm": 2.9090943336486816,
"learning_rate": 1.3132221698946506e-06,
"loss": 0.1411,
"step": 1525
},
{
"epoch": 0.42,
"grad_norm": 3.1961607933044434,
"learning_rate": 1.3123816227547413e-06,
"loss": 0.1485,
"step": 1526
},
{
"epoch": 0.42,
"grad_norm": 3.009904384613037,
"learning_rate": 1.3115408309742577e-06,
"loss": 0.1512,
"step": 1527
},
{
"epoch": 0.42,
"grad_norm": 2.8224146366119385,
"learning_rate": 1.310699795211663e-06,
"loss": 0.1305,
"step": 1528
},
{
"epoch": 0.42,
"grad_norm": 2.929018974304199,
"learning_rate": 1.3098585161256112e-06,
"loss": 0.1303,
"step": 1529
},
{
"epoch": 0.42,
"grad_norm": 2.6305510997772217,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.1268,
"step": 1530
},
{
"epoch": 0.42,
"grad_norm": 2.6673898696899414,
"learning_rate": 1.308175230618706e-06,
"loss": 0.121,
"step": 1531
},
{
"epoch": 0.42,
"grad_norm": 2.835024833679199,
"learning_rate": 1.3073332255161119e-06,
"loss": 0.1347,
"step": 1532
},
{
"epoch": 0.42,
"grad_norm": 3.1054320335388184,
"learning_rate": 1.3064909797265782e-06,
"loss": 0.1493,
"step": 1533
},
{
"epoch": 0.42,
"grad_norm": 2.9714009761810303,
"learning_rate": 1.3056484939097063e-06,
"loss": 0.1341,
"step": 1534
},
{
"epoch": 0.42,
"grad_norm": 2.5670108795166016,
"learning_rate": 1.3048057687252865e-06,
"loss": 0.1191,
"step": 1535
},
{
"epoch": 0.42,
"grad_norm": 2.666649580001831,
"learning_rate": 1.303962804833296e-06,
"loss": 0.1176,
"step": 1536
},
{
"epoch": 0.42,
"grad_norm": 2.820812940597534,
"learning_rate": 1.303119602893899e-06,
"loss": 0.1305,
"step": 1537
},
{
"epoch": 0.42,
"grad_norm": 2.6476902961730957,
"learning_rate": 1.3022761635674465e-06,
"loss": 0.1249,
"step": 1538
},
{
"epoch": 0.42,
"grad_norm": 2.881699800491333,
"learning_rate": 1.3014324875144742e-06,
"loss": 0.1276,
"step": 1539
},
{
"epoch": 0.42,
"grad_norm": 3.017333745956421,
"learning_rate": 1.3005885753957046e-06,
"loss": 0.1471,
"step": 1540
},
{
"epoch": 0.42,
"grad_norm": 2.6423754692077637,
"learning_rate": 1.2997444278720445e-06,
"loss": 0.1205,
"step": 1541
},
{
"epoch": 0.42,
"grad_norm": 2.902148723602295,
"learning_rate": 1.298900045604585e-06,
"loss": 0.1288,
"step": 1542
},
{
"epoch": 0.42,
"grad_norm": 2.9265713691711426,
"learning_rate": 1.2980554292546015e-06,
"loss": 0.1328,
"step": 1543
},
{
"epoch": 0.42,
"grad_norm": 3.1541149616241455,
"learning_rate": 1.2972105794835518e-06,
"loss": 0.1408,
"step": 1544
},
{
"epoch": 0.42,
"grad_norm": 2.520120143890381,
"learning_rate": 1.296365496953077e-06,
"loss": 0.1096,
"step": 1545
},
{
"epoch": 0.42,
"grad_norm": 2.5080769062042236,
"learning_rate": 1.295520182325001e-06,
"loss": 0.1125,
"step": 1546
},
{
"epoch": 0.42,
"grad_norm": 2.9557883739471436,
"learning_rate": 1.2946746362613285e-06,
"loss": 0.1298,
"step": 1547
},
{
"epoch": 0.42,
"grad_norm": 2.8763039112091064,
"learning_rate": 1.2938288594242464e-06,
"loss": 0.1346,
"step": 1548
},
{
"epoch": 0.42,
"grad_norm": 3.1307058334350586,
"learning_rate": 1.2929828524761215e-06,
"loss": 0.1293,
"step": 1549
},
{
"epoch": 0.42,
"grad_norm": 2.775430679321289,
"learning_rate": 1.2921366160795016e-06,
"loss": 0.1333,
"step": 1550
},
{
"epoch": 0.42,
"grad_norm": 3.194153070449829,
"learning_rate": 1.2912901508971132e-06,
"loss": 0.1429,
"step": 1551
},
{
"epoch": 0.42,
"grad_norm": 2.6535837650299072,
"learning_rate": 1.290443457591863e-06,
"loss": 0.1203,
"step": 1552
},
{
"epoch": 0.42,
"grad_norm": 2.8033483028411865,
"learning_rate": 1.289596536826836e-06,
"loss": 0.1278,
"step": 1553
},
{
"epoch": 0.42,
"grad_norm": 3.0123229026794434,
"learning_rate": 1.2887493892652945e-06,
"loss": 0.1342,
"step": 1554
},
{
"epoch": 0.42,
"grad_norm": 2.6336660385131836,
"learning_rate": 1.2879020155706802e-06,
"loss": 0.1209,
"step": 1555
},
{
"epoch": 0.43,
"grad_norm": 3.0163450241088867,
"learning_rate": 1.2870544164066099e-06,
"loss": 0.1182,
"step": 1556
},
{
"epoch": 0.43,
"grad_norm": 3.007383346557617,
"learning_rate": 1.286206592436878e-06,
"loss": 0.1435,
"step": 1557
},
{
"epoch": 0.43,
"grad_norm": 2.7327332496643066,
"learning_rate": 1.285358544325456e-06,
"loss": 0.1235,
"step": 1558
},
{
"epoch": 0.43,
"grad_norm": 3.0295119285583496,
"learning_rate": 1.284510272736488e-06,
"loss": 0.1362,
"step": 1559
},
{
"epoch": 0.43,
"grad_norm": 2.937821626663208,
"learning_rate": 1.2836617783342967e-06,
"loss": 0.1343,
"step": 1560
},
{
"epoch": 0.43,
"grad_norm": 2.598306179046631,
"learning_rate": 1.2828130617833766e-06,
"loss": 0.1369,
"step": 1561
},
{
"epoch": 0.43,
"grad_norm": 2.768315553665161,
"learning_rate": 1.281964123748397e-06,
"loss": 0.1195,
"step": 1562
},
{
"epoch": 0.43,
"grad_norm": 3.2559378147125244,
"learning_rate": 1.281114964894201e-06,
"loss": 0.1329,
"step": 1563
},
{
"epoch": 0.43,
"grad_norm": 3.381107807159424,
"learning_rate": 1.2802655858858042e-06,
"loss": 0.1181,
"step": 1564
},
{
"epoch": 0.43,
"grad_norm": 2.7674670219421387,
"learning_rate": 1.279415987388395e-06,
"loss": 0.1355,
"step": 1565
},
{
"epoch": 0.43,
"grad_norm": 2.898472309112549,
"learning_rate": 1.2785661700673338e-06,
"loss": 0.134,
"step": 1566
},
{
"epoch": 0.43,
"grad_norm": 2.7646608352661133,
"learning_rate": 1.2777161345881512e-06,
"loss": 0.1413,
"step": 1567
},
{
"epoch": 0.43,
"grad_norm": 3.0112876892089844,
"learning_rate": 1.2768658816165504e-06,
"loss": 0.1389,
"step": 1568
},
{
"epoch": 0.43,
"grad_norm": 3.059509038925171,
"learning_rate": 1.2760154118184035e-06,
"loss": 0.1329,
"step": 1569
},
{
"epoch": 0.43,
"grad_norm": 2.6462533473968506,
"learning_rate": 1.275164725859753e-06,
"loss": 0.1262,
"step": 1570
},
{
"epoch": 0.43,
"grad_norm": 2.63196063041687,
"learning_rate": 1.274313824406811e-06,
"loss": 0.11,
"step": 1571
},
{
"epoch": 0.43,
"grad_norm": 2.9079229831695557,
"learning_rate": 1.2734627081259574e-06,
"loss": 0.1432,
"step": 1572
},
{
"epoch": 0.43,
"grad_norm": 2.9190385341644287,
"learning_rate": 1.2726113776837415e-06,
"loss": 0.1422,
"step": 1573
},
{
"epoch": 0.43,
"grad_norm": 2.7876791954040527,
"learning_rate": 1.2717598337468793e-06,
"loss": 0.1329,
"step": 1574
},
{
"epoch": 0.43,
"grad_norm": 2.8222997188568115,
"learning_rate": 1.2709080769822546e-06,
"loss": 0.1449,
"step": 1575
},
{
"epoch": 0.43,
"grad_norm": 2.7818732261657715,
"learning_rate": 1.270056108056918e-06,
"loss": 0.1331,
"step": 1576
},
{
"epoch": 0.43,
"grad_norm": 2.854518175125122,
"learning_rate": 1.269203927638086e-06,
"loss": 0.1285,
"step": 1577
},
{
"epoch": 0.43,
"grad_norm": 2.730489730834961,
"learning_rate": 1.2683515363931401e-06,
"loss": 0.1294,
"step": 1578
},
{
"epoch": 0.43,
"grad_norm": 3.053382158279419,
"learning_rate": 1.2674989349896279e-06,
"loss": 0.1213,
"step": 1579
},
{
"epoch": 0.43,
"grad_norm": 3.0139403343200684,
"learning_rate": 1.2666461240952612e-06,
"loss": 0.1413,
"step": 1580
},
{
"epoch": 0.43,
"grad_norm": 3.037388801574707,
"learning_rate": 1.2657931043779162e-06,
"loss": 0.149,
"step": 1581
},
{
"epoch": 0.43,
"grad_norm": 3.1191294193267822,
"learning_rate": 1.2649398765056316e-06,
"loss": 0.1341,
"step": 1582
},
{
"epoch": 0.43,
"grad_norm": 2.7319812774658203,
"learning_rate": 1.2640864411466103e-06,
"loss": 0.1293,
"step": 1583
},
{
"epoch": 0.43,
"grad_norm": 3.1083836555480957,
"learning_rate": 1.2632327989692172e-06,
"loss": 0.1389,
"step": 1584
},
{
"epoch": 0.43,
"grad_norm": 2.9495601654052734,
"learning_rate": 1.262378950641979e-06,
"loss": 0.1371,
"step": 1585
},
{
"epoch": 0.43,
"grad_norm": 2.947674036026001,
"learning_rate": 1.2615248968335844e-06,
"loss": 0.1299,
"step": 1586
},
{
"epoch": 0.43,
"grad_norm": 2.8491618633270264,
"learning_rate": 1.2606706382128823e-06,
"loss": 0.1219,
"step": 1587
},
{
"epoch": 0.43,
"grad_norm": 2.953118324279785,
"learning_rate": 1.259816175448882e-06,
"loss": 0.1383,
"step": 1588
},
{
"epoch": 0.43,
"grad_norm": 2.879915952682495,
"learning_rate": 1.2589615092107538e-06,
"loss": 0.136,
"step": 1589
},
{
"epoch": 0.43,
"grad_norm": 2.6063785552978516,
"learning_rate": 1.258106640167826e-06,
"loss": 0.1122,
"step": 1590
},
{
"epoch": 0.43,
"grad_norm": 2.721730947494507,
"learning_rate": 1.2572515689895868e-06,
"loss": 0.1309,
"step": 1591
},
{
"epoch": 0.43,
"grad_norm": 3.0574612617492676,
"learning_rate": 1.2563962963456818e-06,
"loss": 0.1512,
"step": 1592
},
{
"epoch": 0.44,
"grad_norm": 2.758530855178833,
"learning_rate": 1.2555408229059148e-06,
"loss": 0.1333,
"step": 1593
},
{
"epoch": 0.44,
"grad_norm": 2.7727010250091553,
"learning_rate": 1.254685149340247e-06,
"loss": 0.1284,
"step": 1594
},
{
"epoch": 0.44,
"grad_norm": 2.840688943862915,
"learning_rate": 1.253829276318796e-06,
"loss": 0.1376,
"step": 1595
},
{
"epoch": 0.44,
"grad_norm": 2.5974299907684326,
"learning_rate": 1.2529732045118363e-06,
"loss": 0.1234,
"step": 1596
},
{
"epoch": 0.44,
"grad_norm": 2.957911968231201,
"learning_rate": 1.2521169345897963e-06,
"loss": 0.1265,
"step": 1597
},
{
"epoch": 0.44,
"grad_norm": 2.975114107131958,
"learning_rate": 1.251260467223262e-06,
"loss": 0.1308,
"step": 1598
},
{
"epoch": 0.44,
"grad_norm": 2.8173699378967285,
"learning_rate": 1.2504038030829724e-06,
"loss": 0.1155,
"step": 1599
},
{
"epoch": 0.44,
"grad_norm": 2.790580987930298,
"learning_rate": 1.249546942839821e-06,
"loss": 0.1238,
"step": 1600
},
{
"epoch": 0.44,
"grad_norm": 2.909597873687744,
"learning_rate": 1.2486898871648551e-06,
"loss": 0.1377,
"step": 1601
},
{
"epoch": 0.44,
"grad_norm": 2.7134740352630615,
"learning_rate": 1.2478326367292741e-06,
"loss": 0.1198,
"step": 1602
},
{
"epoch": 0.44,
"grad_norm": 2.6046814918518066,
"learning_rate": 1.2469751922044315e-06,
"loss": 0.1271,
"step": 1603
},
{
"epoch": 0.44,
"grad_norm": 2.4541776180267334,
"learning_rate": 1.2461175542618318e-06,
"loss": 0.1134,
"step": 1604
},
{
"epoch": 0.44,
"grad_norm": 2.6802122592926025,
"learning_rate": 1.245259723573131e-06,
"loss": 0.1184,
"step": 1605
},
{
"epoch": 0.44,
"grad_norm": 2.7969250679016113,
"learning_rate": 1.2444017008101365e-06,
"loss": 0.1345,
"step": 1606
},
{
"epoch": 0.44,
"grad_norm": 2.6568596363067627,
"learning_rate": 1.2435434866448053e-06,
"loss": 0.128,
"step": 1607
},
{
"epoch": 0.44,
"grad_norm": 3.19606614112854,
"learning_rate": 1.2426850817492455e-06,
"loss": 0.1431,
"step": 1608
},
{
"epoch": 0.44,
"grad_norm": 2.808906316757202,
"learning_rate": 1.2418264867957132e-06,
"loss": 0.1297,
"step": 1609
},
{
"epoch": 0.44,
"grad_norm": 2.6693055629730225,
"learning_rate": 1.2409677024566143e-06,
"loss": 0.1281,
"step": 1610
},
{
"epoch": 0.44,
"grad_norm": 2.5801992416381836,
"learning_rate": 1.2401087294045031e-06,
"loss": 0.1179,
"step": 1611
},
{
"epoch": 0.44,
"grad_norm": 2.660011053085327,
"learning_rate": 1.2392495683120806e-06,
"loss": 0.1207,
"step": 1612
},
{
"epoch": 0.44,
"grad_norm": 2.9305505752563477,
"learning_rate": 1.2383902198521963e-06,
"loss": 0.1258,
"step": 1613
},
{
"epoch": 0.44,
"grad_norm": 2.7699851989746094,
"learning_rate": 1.2375306846978462e-06,
"loss": 0.1227,
"step": 1614
},
{
"epoch": 0.44,
"grad_norm": 2.96335768699646,
"learning_rate": 1.2366709635221716e-06,
"loss": 0.1377,
"step": 1615
},
{
"epoch": 0.44,
"grad_norm": 2.8513786792755127,
"learning_rate": 1.2358110569984608e-06,
"loss": 0.1319,
"step": 1616
},
{
"epoch": 0.44,
"grad_norm": 2.6833722591400146,
"learning_rate": 1.2349509658001458e-06,
"loss": 0.1338,
"step": 1617
},
{
"epoch": 0.44,
"grad_norm": 3.0930840969085693,
"learning_rate": 1.2340906906008046e-06,
"loss": 0.1468,
"step": 1618
},
{
"epoch": 0.44,
"grad_norm": 3.274177074432373,
"learning_rate": 1.2332302320741587e-06,
"loss": 0.1296,
"step": 1619
},
{
"epoch": 0.44,
"grad_norm": 2.8681862354278564,
"learning_rate": 1.2323695908940728e-06,
"loss": 0.1322,
"step": 1620
},
{
"epoch": 0.44,
"grad_norm": 2.6954641342163086,
"learning_rate": 1.2315087677345556e-06,
"loss": 0.121,
"step": 1621
},
{
"epoch": 0.44,
"grad_norm": 2.7411937713623047,
"learning_rate": 1.2306477632697568e-06,
"loss": 0.1218,
"step": 1622
},
{
"epoch": 0.44,
"grad_norm": 2.734572649002075,
"learning_rate": 1.2297865781739699e-06,
"loss": 0.1217,
"step": 1623
},
{
"epoch": 0.44,
"grad_norm": 2.7590692043304443,
"learning_rate": 1.228925213121629e-06,
"loss": 0.1248,
"step": 1624
},
{
"epoch": 0.44,
"grad_norm": 2.5387656688690186,
"learning_rate": 1.2280636687873087e-06,
"loss": 0.1206,
"step": 1625
},
{
"epoch": 0.44,
"grad_norm": 2.6590218544006348,
"learning_rate": 1.2272019458457243e-06,
"loss": 0.1141,
"step": 1626
},
{
"epoch": 0.44,
"grad_norm": 2.614525556564331,
"learning_rate": 1.2263400449717317e-06,
"loss": 0.119,
"step": 1627
},
{
"epoch": 0.44,
"grad_norm": 2.675663709640503,
"learning_rate": 1.225477966840325e-06,
"loss": 0.1254,
"step": 1628
},
{
"epoch": 0.44,
"grad_norm": 2.7015058994293213,
"learning_rate": 1.2246157121266383e-06,
"loss": 0.1243,
"step": 1629
},
{
"epoch": 0.45,
"grad_norm": 2.6724138259887695,
"learning_rate": 1.2237532815059426e-06,
"loss": 0.1191,
"step": 1630
},
{
"epoch": 0.45,
"grad_norm": 2.769043207168579,
"learning_rate": 1.2228906756536478e-06,
"loss": 0.1425,
"step": 1631
},
{
"epoch": 0.45,
"grad_norm": 3.219146728515625,
"learning_rate": 1.222027895245301e-06,
"loss": 0.1366,
"step": 1632
},
{
"epoch": 0.45,
"grad_norm": 2.754021644592285,
"learning_rate": 1.221164940956585e-06,
"loss": 0.1353,
"step": 1633
},
{
"epoch": 0.45,
"grad_norm": 3.2277939319610596,
"learning_rate": 1.22030181346332e-06,
"loss": 0.1366,
"step": 1634
},
{
"epoch": 0.45,
"grad_norm": 3.1995861530303955,
"learning_rate": 1.2194385134414606e-06,
"loss": 0.149,
"step": 1635
},
{
"epoch": 0.45,
"grad_norm": 2.7883148193359375,
"learning_rate": 1.2185750415670977e-06,
"loss": 0.1293,
"step": 1636
},
{
"epoch": 0.45,
"grad_norm": 2.66025710105896,
"learning_rate": 1.2177113985164562e-06,
"loss": 0.1173,
"step": 1637
},
{
"epoch": 0.45,
"grad_norm": 2.7542994022369385,
"learning_rate": 1.2168475849658951e-06,
"loss": 0.1167,
"step": 1638
},
{
"epoch": 0.45,
"grad_norm": 3.27205491065979,
"learning_rate": 1.2159836015919075e-06,
"loss": 0.1246,
"step": 1639
},
{
"epoch": 0.45,
"grad_norm": 2.6621532440185547,
"learning_rate": 1.2151194490711177e-06,
"loss": 0.1231,
"step": 1640
},
{
"epoch": 0.45,
"grad_norm": 2.733459234237671,
"learning_rate": 1.2142551280802846e-06,
"loss": 0.1187,
"step": 1641
},
{
"epoch": 0.45,
"grad_norm": 3.0605053901672363,
"learning_rate": 1.213390639296298e-06,
"loss": 0.131,
"step": 1642
},
{
"epoch": 0.45,
"grad_norm": 2.8399031162261963,
"learning_rate": 1.2125259833961795e-06,
"loss": 0.1391,
"step": 1643
},
{
"epoch": 0.45,
"grad_norm": 3.032268762588501,
"learning_rate": 1.211661161057081e-06,
"loss": 0.1441,
"step": 1644
},
{
"epoch": 0.45,
"grad_norm": 2.698371410369873,
"learning_rate": 1.210796172956285e-06,
"loss": 0.1222,
"step": 1645
},
{
"epoch": 0.45,
"grad_norm": 3.248314142227173,
"learning_rate": 1.209931019771204e-06,
"loss": 0.1333,
"step": 1646
},
{
"epoch": 0.45,
"grad_norm": 2.890303373336792,
"learning_rate": 1.20906570217938e-06,
"loss": 0.1224,
"step": 1647
},
{
"epoch": 0.45,
"grad_norm": 2.574625015258789,
"learning_rate": 1.2082002208584832e-06,
"loss": 0.1142,
"step": 1648
},
{
"epoch": 0.45,
"grad_norm": 2.6554789543151855,
"learning_rate": 1.2073345764863125e-06,
"loss": 0.117,
"step": 1649
},
{
"epoch": 0.45,
"grad_norm": 2.891599178314209,
"learning_rate": 1.2064687697407937e-06,
"loss": 0.1257,
"step": 1650
},
{
"epoch": 0.45,
"grad_norm": 2.9254744052886963,
"learning_rate": 1.2056028012999808e-06,
"loss": 0.1402,
"step": 1651
},
{
"epoch": 0.45,
"grad_norm": 2.9599831104278564,
"learning_rate": 1.204736671842054e-06,
"loss": 0.1278,
"step": 1652
},
{
"epoch": 0.45,
"grad_norm": 2.616624593734741,
"learning_rate": 1.2038703820453192e-06,
"loss": 0.1252,
"step": 1653
},
{
"epoch": 0.45,
"grad_norm": 2.9988479614257812,
"learning_rate": 1.2030039325882085e-06,
"loss": 0.1411,
"step": 1654
},
{
"epoch": 0.45,
"grad_norm": 3.0054354667663574,
"learning_rate": 1.2021373241492785e-06,
"loss": 0.1527,
"step": 1655
},
{
"epoch": 0.45,
"grad_norm": 2.780163049697876,
"learning_rate": 1.2012705574072105e-06,
"loss": 0.1247,
"step": 1656
},
{
"epoch": 0.45,
"grad_norm": 2.769150495529175,
"learning_rate": 1.2004036330408104e-06,
"loss": 0.1185,
"step": 1657
},
{
"epoch": 0.45,
"grad_norm": 2.947723388671875,
"learning_rate": 1.1995365517290066e-06,
"loss": 0.1252,
"step": 1658
},
{
"epoch": 0.45,
"grad_norm": 2.8834493160247803,
"learning_rate": 1.198669314150851e-06,
"loss": 0.1293,
"step": 1659
},
{
"epoch": 0.45,
"grad_norm": 3.040342330932617,
"learning_rate": 1.1978019209855173e-06,
"loss": 0.1528,
"step": 1660
},
{
"epoch": 0.45,
"grad_norm": 2.6697583198547363,
"learning_rate": 1.1969343729123014e-06,
"loss": 0.1285,
"step": 1661
},
{
"epoch": 0.45,
"grad_norm": 3.018949031829834,
"learning_rate": 1.1960666706106213e-06,
"loss": 0.1321,
"step": 1662
},
{
"epoch": 0.45,
"grad_norm": 3.0014421939849854,
"learning_rate": 1.195198814760014e-06,
"loss": 0.1287,
"step": 1663
},
{
"epoch": 0.45,
"grad_norm": 2.7734243869781494,
"learning_rate": 1.1943308060401389e-06,
"loss": 0.1279,
"step": 1664
},
{
"epoch": 0.45,
"grad_norm": 2.76412034034729,
"learning_rate": 1.1934626451307726e-06,
"loss": 0.1191,
"step": 1665
},
{
"epoch": 0.46,
"grad_norm": 2.704519748687744,
"learning_rate": 1.1925943327118132e-06,
"loss": 0.1317,
"step": 1666
},
{
"epoch": 0.46,
"grad_norm": 2.8398492336273193,
"learning_rate": 1.1917258694632767e-06,
"loss": 0.1309,
"step": 1667
},
{
"epoch": 0.46,
"grad_norm": 3.1674561500549316,
"learning_rate": 1.1908572560652968e-06,
"loss": 0.1405,
"step": 1668
},
{
"epoch": 0.46,
"grad_norm": 2.771796464920044,
"learning_rate": 1.1899884931981247e-06,
"loss": 0.1224,
"step": 1669
},
{
"epoch": 0.46,
"grad_norm": 2.8627328872680664,
"learning_rate": 1.189119581542129e-06,
"loss": 0.1408,
"step": 1670
},
{
"epoch": 0.46,
"grad_norm": 2.843153238296509,
"learning_rate": 1.1882505217777953e-06,
"loss": 0.1176,
"step": 1671
},
{
"epoch": 0.46,
"grad_norm": 2.753012180328369,
"learning_rate": 1.1873813145857248e-06,
"loss": 0.1276,
"step": 1672
},
{
"epoch": 0.46,
"grad_norm": 2.54606294631958,
"learning_rate": 1.1865119606466332e-06,
"loss": 0.116,
"step": 1673
},
{
"epoch": 0.46,
"grad_norm": 2.690112590789795,
"learning_rate": 1.1856424606413528e-06,
"loss": 0.1194,
"step": 1674
},
{
"epoch": 0.46,
"grad_norm": 2.8299365043640137,
"learning_rate": 1.1847728152508291e-06,
"loss": 0.136,
"step": 1675
},
{
"epoch": 0.46,
"grad_norm": 2.7167913913726807,
"learning_rate": 1.1839030251561222e-06,
"loss": 0.1138,
"step": 1676
},
{
"epoch": 0.46,
"grad_norm": 3.147341251373291,
"learning_rate": 1.183033091038405e-06,
"loss": 0.1526,
"step": 1677
},
{
"epoch": 0.46,
"grad_norm": 2.694190263748169,
"learning_rate": 1.1821630135789634e-06,
"loss": 0.1208,
"step": 1678
},
{
"epoch": 0.46,
"grad_norm": 2.7382729053497314,
"learning_rate": 1.181292793459195e-06,
"loss": 0.1191,
"step": 1679
},
{
"epoch": 0.46,
"grad_norm": 2.824538469314575,
"learning_rate": 1.1804224313606102e-06,
"loss": 0.132,
"step": 1680
},
{
"epoch": 0.46,
"grad_norm": 3.1137707233428955,
"learning_rate": 1.17955192796483e-06,
"loss": 0.1323,
"step": 1681
},
{
"epoch": 0.46,
"grad_norm": 2.78731107711792,
"learning_rate": 1.178681283953586e-06,
"loss": 0.1196,
"step": 1682
},
{
"epoch": 0.46,
"grad_norm": 2.640028953552246,
"learning_rate": 1.1778105000087197e-06,
"loss": 0.1248,
"step": 1683
},
{
"epoch": 0.46,
"grad_norm": 2.5602917671203613,
"learning_rate": 1.176939576812183e-06,
"loss": 0.1143,
"step": 1684
},
{
"epoch": 0.46,
"grad_norm": 2.6920769214630127,
"learning_rate": 1.1760685150460361e-06,
"loss": 0.1245,
"step": 1685
},
{
"epoch": 0.46,
"grad_norm": 3.0413594245910645,
"learning_rate": 1.175197315392448e-06,
"loss": 0.1419,
"step": 1686
},
{
"epoch": 0.46,
"grad_norm": 2.9711055755615234,
"learning_rate": 1.174325978533696e-06,
"loss": 0.14,
"step": 1687
},
{
"epoch": 0.46,
"grad_norm": 2.8140616416931152,
"learning_rate": 1.1734545051521639e-06,
"loss": 0.1252,
"step": 1688
},
{
"epoch": 0.46,
"grad_norm": 2.883749485015869,
"learning_rate": 1.1725828959303432e-06,
"loss": 0.1263,
"step": 1689
},
{
"epoch": 0.46,
"grad_norm": 2.8676676750183105,
"learning_rate": 1.1717111515508317e-06,
"loss": 0.1366,
"step": 1690
},
{
"epoch": 0.46,
"grad_norm": 3.049114942550659,
"learning_rate": 1.170839272696333e-06,
"loss": 0.1348,
"step": 1691
},
{
"epoch": 0.46,
"grad_norm": 2.8665969371795654,
"learning_rate": 1.169967260049656e-06,
"loss": 0.1291,
"step": 1692
},
{
"epoch": 0.46,
"grad_norm": 2.4342026710510254,
"learning_rate": 1.1690951142937146e-06,
"loss": 0.1083,
"step": 1693
},
{
"epoch": 0.46,
"grad_norm": 3.093519687652588,
"learning_rate": 1.168222836111526e-06,
"loss": 0.1535,
"step": 1694
},
{
"epoch": 0.46,
"grad_norm": 2.86672306060791,
"learning_rate": 1.1673504261862123e-06,
"loss": 0.1263,
"step": 1695
},
{
"epoch": 0.46,
"grad_norm": 2.8939149379730225,
"learning_rate": 1.1664778852009983e-06,
"loss": 0.1229,
"step": 1696
},
{
"epoch": 0.46,
"grad_norm": 2.8285973072052,
"learning_rate": 1.1656052138392113e-06,
"loss": 0.143,
"step": 1697
},
{
"epoch": 0.46,
"grad_norm": 3.0051968097686768,
"learning_rate": 1.1647324127842808e-06,
"loss": 0.1485,
"step": 1698
},
{
"epoch": 0.46,
"grad_norm": 2.889620304107666,
"learning_rate": 1.1638594827197378e-06,
"loss": 0.1375,
"step": 1699
},
{
"epoch": 0.46,
"grad_norm": 2.844740867614746,
"learning_rate": 1.1629864243292146e-06,
"loss": 0.1382,
"step": 1700
},
{
"epoch": 0.46,
"grad_norm": 2.7689208984375,
"learning_rate": 1.1621132382964438e-06,
"loss": 0.1291,
"step": 1701
},
{
"epoch": 0.46,
"grad_norm": 2.8812568187713623,
"learning_rate": 1.161239925305258e-06,
"loss": 0.1248,
"step": 1702
},
{
"epoch": 0.47,
"grad_norm": 2.9800965785980225,
"learning_rate": 1.160366486039589e-06,
"loss": 0.1427,
"step": 1703
},
{
"epoch": 0.47,
"grad_norm": 2.602057456970215,
"learning_rate": 1.1594929211834679e-06,
"loss": 0.1124,
"step": 1704
},
{
"epoch": 0.47,
"grad_norm": 2.93133544921875,
"learning_rate": 1.1586192314210239e-06,
"loss": 0.1396,
"step": 1705
},
{
"epoch": 0.47,
"grad_norm": 2.7856240272521973,
"learning_rate": 1.157745417436484e-06,
"loss": 0.134,
"step": 1706
},
{
"epoch": 0.47,
"grad_norm": 2.871933698654175,
"learning_rate": 1.156871479914173e-06,
"loss": 0.1342,
"step": 1707
},
{
"epoch": 0.47,
"grad_norm": 2.958693265914917,
"learning_rate": 1.1559974195385117e-06,
"loss": 0.1339,
"step": 1708
},
{
"epoch": 0.47,
"grad_norm": 2.81894588470459,
"learning_rate": 1.1551232369940166e-06,
"loss": 0.1279,
"step": 1709
},
{
"epoch": 0.47,
"grad_norm": 2.6474156379699707,
"learning_rate": 1.1542489329653022e-06,
"loss": 0.1177,
"step": 1710
},
{
"epoch": 0.47,
"grad_norm": 2.7948195934295654,
"learning_rate": 1.1533745081370759e-06,
"loss": 0.1289,
"step": 1711
},
{
"epoch": 0.47,
"grad_norm": 2.8775722980499268,
"learning_rate": 1.1524999631941405e-06,
"loss": 0.1259,
"step": 1712
},
{
"epoch": 0.47,
"grad_norm": 3.051638603210449,
"learning_rate": 1.1516252988213926e-06,
"loss": 0.1351,
"step": 1713
},
{
"epoch": 0.47,
"grad_norm": 2.770986557006836,
"learning_rate": 1.1507505157038226e-06,
"loss": 0.1236,
"step": 1714
},
{
"epoch": 0.47,
"grad_norm": 2.9146666526794434,
"learning_rate": 1.1498756145265142e-06,
"loss": 0.1296,
"step": 1715
},
{
"epoch": 0.47,
"grad_norm": 3.0440783500671387,
"learning_rate": 1.149000595974643e-06,
"loss": 0.1333,
"step": 1716
},
{
"epoch": 0.47,
"grad_norm": 2.7425525188446045,
"learning_rate": 1.1481254607334766e-06,
"loss": 0.1248,
"step": 1717
},
{
"epoch": 0.47,
"grad_norm": 2.9264187812805176,
"learning_rate": 1.1472502094883743e-06,
"loss": 0.1146,
"step": 1718
},
{
"epoch": 0.47,
"grad_norm": 2.5261569023132324,
"learning_rate": 1.1463748429247852e-06,
"loss": 0.1227,
"step": 1719
},
{
"epoch": 0.47,
"grad_norm": 2.8340601921081543,
"learning_rate": 1.1454993617282512e-06,
"loss": 0.1337,
"step": 1720
},
{
"epoch": 0.47,
"grad_norm": 2.691579818725586,
"learning_rate": 1.144623766584401e-06,
"loss": 0.1253,
"step": 1721
},
{
"epoch": 0.47,
"grad_norm": 2.659074306488037,
"learning_rate": 1.1437480581789546e-06,
"loss": 0.1306,
"step": 1722
},
{
"epoch": 0.47,
"grad_norm": 2.750162363052368,
"learning_rate": 1.1428722371977192e-06,
"loss": 0.1175,
"step": 1723
},
{
"epoch": 0.47,
"grad_norm": 2.9749059677124023,
"learning_rate": 1.1419963043265915e-06,
"loss": 0.1367,
"step": 1724
},
{
"epoch": 0.47,
"grad_norm": 2.6689412593841553,
"learning_rate": 1.1411202602515554e-06,
"loss": 0.1299,
"step": 1725
},
{
"epoch": 0.47,
"grad_norm": 2.8214049339294434,
"learning_rate": 1.1402441056586813e-06,
"loss": 0.1257,
"step": 1726
},
{
"epoch": 0.47,
"grad_norm": 2.8539130687713623,
"learning_rate": 1.139367841234127e-06,
"loss": 0.1413,
"step": 1727
},
{
"epoch": 0.47,
"grad_norm": 3.122183084487915,
"learning_rate": 1.1384914676641355e-06,
"loss": 0.1445,
"step": 1728
},
{
"epoch": 0.47,
"grad_norm": 2.572101593017578,
"learning_rate": 1.137614985635036e-06,
"loss": 0.1177,
"step": 1729
},
{
"epoch": 0.47,
"grad_norm": 3.196446657180786,
"learning_rate": 1.1367383958332425e-06,
"loss": 0.1555,
"step": 1730
},
{
"epoch": 0.47,
"grad_norm": 2.7023680210113525,
"learning_rate": 1.1358616989452527e-06,
"loss": 0.1217,
"step": 1731
},
{
"epoch": 0.47,
"grad_norm": 2.5879292488098145,
"learning_rate": 1.1349848956576492e-06,
"loss": 0.1186,
"step": 1732
},
{
"epoch": 0.47,
"grad_norm": 2.875540256500244,
"learning_rate": 1.134107986657097e-06,
"loss": 0.1266,
"step": 1733
},
{
"epoch": 0.47,
"grad_norm": 2.756075382232666,
"learning_rate": 1.1332309726303447e-06,
"loss": 0.1346,
"step": 1734
},
{
"epoch": 0.47,
"grad_norm": 2.7358367443084717,
"learning_rate": 1.1323538542642227e-06,
"loss": 0.1257,
"step": 1735
},
{
"epoch": 0.47,
"grad_norm": 2.8995721340179443,
"learning_rate": 1.1314766322456425e-06,
"loss": 0.1388,
"step": 1736
},
{
"epoch": 0.47,
"grad_norm": 2.6761934757232666,
"learning_rate": 1.1305993072615984e-06,
"loss": 0.1153,
"step": 1737
},
{
"epoch": 0.47,
"grad_norm": 2.726055383682251,
"learning_rate": 1.1297218799991641e-06,
"loss": 0.1269,
"step": 1738
},
{
"epoch": 0.48,
"grad_norm": 2.8937668800354004,
"learning_rate": 1.1288443511454935e-06,
"loss": 0.1306,
"step": 1739
},
{
"epoch": 0.48,
"grad_norm": 2.7245571613311768,
"learning_rate": 1.1279667213878203e-06,
"loss": 0.1214,
"step": 1740
},
{
"epoch": 0.48,
"grad_norm": 2.902127504348755,
"learning_rate": 1.1270889914134573e-06,
"loss": 0.146,
"step": 1741
},
{
"epoch": 0.48,
"grad_norm": 2.838588237762451,
"learning_rate": 1.1262111619097956e-06,
"loss": 0.1233,
"step": 1742
},
{
"epoch": 0.48,
"grad_norm": 2.698322057723999,
"learning_rate": 1.1253332335643042e-06,
"loss": 0.1128,
"step": 1743
},
{
"epoch": 0.48,
"grad_norm": 2.864337682723999,
"learning_rate": 1.1244552070645298e-06,
"loss": 0.1328,
"step": 1744
},
{
"epoch": 0.48,
"grad_norm": 2.853121519088745,
"learning_rate": 1.1235770830980956e-06,
"loss": 0.1177,
"step": 1745
},
{
"epoch": 0.48,
"grad_norm": 2.7600739002227783,
"learning_rate": 1.1226988623527013e-06,
"loss": 0.1197,
"step": 1746
},
{
"epoch": 0.48,
"grad_norm": 3.1173315048217773,
"learning_rate": 1.1218205455161227e-06,
"loss": 0.1439,
"step": 1747
},
{
"epoch": 0.48,
"grad_norm": 2.690671682357788,
"learning_rate": 1.12094213327621e-06,
"loss": 0.1203,
"step": 1748
},
{
"epoch": 0.48,
"grad_norm": 2.8105766773223877,
"learning_rate": 1.1200636263208894e-06,
"loss": 0.1304,
"step": 1749
},
{
"epoch": 0.48,
"grad_norm": 2.6360340118408203,
"learning_rate": 1.1191850253381601e-06,
"loss": 0.1284,
"step": 1750
},
{
"epoch": 0.48,
"grad_norm": 2.691828727722168,
"learning_rate": 1.1183063310160953e-06,
"loss": 0.1257,
"step": 1751
},
{
"epoch": 0.48,
"grad_norm": 2.6944680213928223,
"learning_rate": 1.1174275440428415e-06,
"loss": 0.1203,
"step": 1752
},
{
"epoch": 0.48,
"grad_norm": 2.6057870388031006,
"learning_rate": 1.1165486651066176e-06,
"loss": 0.1129,
"step": 1753
},
{
"epoch": 0.48,
"grad_norm": 2.9981822967529297,
"learning_rate": 1.1156696948957146e-06,
"loss": 0.129,
"step": 1754
},
{
"epoch": 0.48,
"grad_norm": 2.8687524795532227,
"learning_rate": 1.1147906340984953e-06,
"loss": 0.1346,
"step": 1755
},
{
"epoch": 0.48,
"grad_norm": 2.571953535079956,
"learning_rate": 1.1139114834033928e-06,
"loss": 0.1185,
"step": 1756
},
{
"epoch": 0.48,
"grad_norm": 3.296785593032837,
"learning_rate": 1.1130322434989102e-06,
"loss": 0.1572,
"step": 1757
},
{
"epoch": 0.48,
"grad_norm": 2.9241154193878174,
"learning_rate": 1.1121529150736223e-06,
"loss": 0.1358,
"step": 1758
},
{
"epoch": 0.48,
"grad_norm": 3.0534586906433105,
"learning_rate": 1.1112734988161716e-06,
"loss": 0.1336,
"step": 1759
},
{
"epoch": 0.48,
"grad_norm": 3.169816493988037,
"learning_rate": 1.1103939954152699e-06,
"loss": 0.1397,
"step": 1760
},
{
"epoch": 0.48,
"grad_norm": 2.8303816318511963,
"learning_rate": 1.109514405559697e-06,
"loss": 0.1156,
"step": 1761
},
{
"epoch": 0.48,
"grad_norm": 2.6899285316467285,
"learning_rate": 1.1086347299383003e-06,
"loss": 0.1221,
"step": 1762
},
{
"epoch": 0.48,
"grad_norm": 3.023895025253296,
"learning_rate": 1.1077549692399958e-06,
"loss": 0.127,
"step": 1763
},
{
"epoch": 0.48,
"grad_norm": 2.7877230644226074,
"learning_rate": 1.1068751241537641e-06,
"loss": 0.1355,
"step": 1764
},
{
"epoch": 0.48,
"grad_norm": 2.621915102005005,
"learning_rate": 1.1059951953686534e-06,
"loss": 0.1194,
"step": 1765
},
{
"epoch": 0.48,
"grad_norm": 2.7423007488250732,
"learning_rate": 1.1051151835737762e-06,
"loss": 0.1247,
"step": 1766
},
{
"epoch": 0.48,
"grad_norm": 2.949045181274414,
"learning_rate": 1.1042350894583108e-06,
"loss": 0.1252,
"step": 1767
},
{
"epoch": 0.48,
"grad_norm": 2.995800495147705,
"learning_rate": 1.1033549137115004e-06,
"loss": 0.1432,
"step": 1768
},
{
"epoch": 0.48,
"grad_norm": 2.968546152114868,
"learning_rate": 1.1024746570226508e-06,
"loss": 0.1292,
"step": 1769
},
{
"epoch": 0.48,
"grad_norm": 2.7684168815612793,
"learning_rate": 1.1015943200811323e-06,
"loss": 0.1211,
"step": 1770
},
{
"epoch": 0.48,
"grad_norm": 2.9041082859039307,
"learning_rate": 1.1007139035763782e-06,
"loss": 0.1341,
"step": 1771
},
{
"epoch": 0.48,
"grad_norm": 2.9197094440460205,
"learning_rate": 1.0998334081978825e-06,
"loss": 0.1325,
"step": 1772
},
{
"epoch": 0.48,
"grad_norm": 2.745283365249634,
"learning_rate": 1.098952834635203e-06,
"loss": 0.1174,
"step": 1773
},
{
"epoch": 0.48,
"grad_norm": 2.711155891418457,
"learning_rate": 1.0980721835779572e-06,
"loss": 0.1228,
"step": 1774
},
{
"epoch": 0.48,
"grad_norm": 2.708275556564331,
"learning_rate": 1.0971914557158242e-06,
"loss": 0.1096,
"step": 1775
},
{
"epoch": 0.49,
"grad_norm": 2.7584431171417236,
"learning_rate": 1.0963106517385433e-06,
"loss": 0.1226,
"step": 1776
},
{
"epoch": 0.49,
"grad_norm": 2.963024139404297,
"learning_rate": 1.0954297723359118e-06,
"loss": 0.1328,
"step": 1777
},
{
"epoch": 0.49,
"grad_norm": 2.7407984733581543,
"learning_rate": 1.0945488181977889e-06,
"loss": 0.1238,
"step": 1778
},
{
"epoch": 0.49,
"grad_norm": 2.9212539196014404,
"learning_rate": 1.0936677900140898e-06,
"loss": 0.1301,
"step": 1779
},
{
"epoch": 0.49,
"grad_norm": 2.6921145915985107,
"learning_rate": 1.092786688474789e-06,
"loss": 0.115,
"step": 1780
},
{
"epoch": 0.49,
"grad_norm": 2.883453607559204,
"learning_rate": 1.0919055142699178e-06,
"loss": 0.1363,
"step": 1781
},
{
"epoch": 0.49,
"grad_norm": 2.5044760704040527,
"learning_rate": 1.0910242680895648e-06,
"loss": 0.1039,
"step": 1782
},
{
"epoch": 0.49,
"grad_norm": 2.7206735610961914,
"learning_rate": 1.0901429506238748e-06,
"loss": 0.1314,
"step": 1783
},
{
"epoch": 0.49,
"grad_norm": 2.778576374053955,
"learning_rate": 1.0892615625630488e-06,
"loss": 0.125,
"step": 1784
},
{
"epoch": 0.49,
"grad_norm": 2.572385549545288,
"learning_rate": 1.0883801045973423e-06,
"loss": 0.1111,
"step": 1785
},
{
"epoch": 0.49,
"grad_norm": 2.9828879833221436,
"learning_rate": 1.0874985774170667e-06,
"loss": 0.1285,
"step": 1786
},
{
"epoch": 0.49,
"grad_norm": 2.835440158843994,
"learning_rate": 1.0866169817125861e-06,
"loss": 0.1198,
"step": 1787
},
{
"epoch": 0.49,
"grad_norm": 2.6918647289276123,
"learning_rate": 1.0857353181743198e-06,
"loss": 0.1209,
"step": 1788
},
{
"epoch": 0.49,
"grad_norm": 2.776198387145996,
"learning_rate": 1.084853587492739e-06,
"loss": 0.1235,
"step": 1789
},
{
"epoch": 0.49,
"grad_norm": 2.8532979488372803,
"learning_rate": 1.0839717903583683e-06,
"loss": 0.1351,
"step": 1790
},
{
"epoch": 0.49,
"grad_norm": 2.714279890060425,
"learning_rate": 1.083089927461784e-06,
"loss": 0.1205,
"step": 1791
},
{
"epoch": 0.49,
"grad_norm": 2.987715482711792,
"learning_rate": 1.0822079994936138e-06,
"loss": 0.1314,
"step": 1792
},
{
"epoch": 0.49,
"grad_norm": 2.685398578643799,
"learning_rate": 1.0813260071445368e-06,
"loss": 0.1276,
"step": 1793
},
{
"epoch": 0.49,
"grad_norm": 2.7523744106292725,
"learning_rate": 1.0804439511052817e-06,
"loss": 0.1207,
"step": 1794
},
{
"epoch": 0.49,
"grad_norm": 2.781987428665161,
"learning_rate": 1.079561832066628e-06,
"loss": 0.1248,
"step": 1795
},
{
"epoch": 0.49,
"grad_norm": 2.8596763610839844,
"learning_rate": 1.0786796507194037e-06,
"loss": 0.1373,
"step": 1796
},
{
"epoch": 0.49,
"grad_norm": 2.9784350395202637,
"learning_rate": 1.0777974077544869e-06,
"loss": 0.1283,
"step": 1797
},
{
"epoch": 0.49,
"grad_norm": 2.8105826377868652,
"learning_rate": 1.0769151038628026e-06,
"loss": 0.124,
"step": 1798
},
{
"epoch": 0.49,
"grad_norm": 2.6100962162017822,
"learning_rate": 1.0760327397353237e-06,
"loss": 0.1153,
"step": 1799
},
{
"epoch": 0.49,
"grad_norm": 2.808901309967041,
"learning_rate": 1.0751503160630708e-06,
"loss": 0.1327,
"step": 1800
},
{
"epoch": 0.49,
"grad_norm": 2.954089879989624,
"learning_rate": 1.0742678335371111e-06,
"loss": 0.1347,
"step": 1801
},
{
"epoch": 0.49,
"grad_norm": 2.784660816192627,
"learning_rate": 1.0733852928485574e-06,
"loss": 0.1265,
"step": 1802
},
{
"epoch": 0.49,
"grad_norm": 2.7837114334106445,
"learning_rate": 1.0725026946885689e-06,
"loss": 0.1236,
"step": 1803
},
{
"epoch": 0.49,
"grad_norm": 2.7887163162231445,
"learning_rate": 1.0716200397483483e-06,
"loss": 0.1303,
"step": 1804
},
{
"epoch": 0.49,
"grad_norm": 2.7233433723449707,
"learning_rate": 1.0707373287191448e-06,
"loss": 0.1224,
"step": 1805
},
{
"epoch": 0.49,
"grad_norm": 2.7041988372802734,
"learning_rate": 1.0698545622922497e-06,
"loss": 0.1193,
"step": 1806
},
{
"epoch": 0.49,
"grad_norm": 2.861286163330078,
"learning_rate": 1.0689717411589984e-06,
"loss": 0.1321,
"step": 1807
},
{
"epoch": 0.49,
"grad_norm": 2.8406999111175537,
"learning_rate": 1.06808886601077e-06,
"loss": 0.1309,
"step": 1808
},
{
"epoch": 0.49,
"grad_norm": 2.8524794578552246,
"learning_rate": 1.0672059375389844e-06,
"loss": 0.1334,
"step": 1809
},
{
"epoch": 0.49,
"grad_norm": 2.7974801063537598,
"learning_rate": 1.066322956435104e-06,
"loss": 0.1343,
"step": 1810
},
{
"epoch": 0.49,
"grad_norm": 2.945446252822876,
"learning_rate": 1.0654399233906324e-06,
"loss": 0.149,
"step": 1811
},
{
"epoch": 0.49,
"grad_norm": 2.9166078567504883,
"learning_rate": 1.064556839097114e-06,
"loss": 0.1313,
"step": 1812
},
{
"epoch": 0.5,
"grad_norm": 2.9036660194396973,
"learning_rate": 1.063673704246133e-06,
"loss": 0.1388,
"step": 1813
},
{
"epoch": 0.5,
"grad_norm": 3.048311471939087,
"learning_rate": 1.0627905195293135e-06,
"loss": 0.1263,
"step": 1814
},
{
"epoch": 0.5,
"grad_norm": 2.9274511337280273,
"learning_rate": 1.061907285638318e-06,
"loss": 0.1477,
"step": 1815
},
{
"epoch": 0.5,
"grad_norm": 2.708244800567627,
"learning_rate": 1.0610240032648492e-06,
"loss": 0.1016,
"step": 1816
},
{
"epoch": 0.5,
"grad_norm": 3.3445119857788086,
"learning_rate": 1.0601406731006454e-06,
"loss": 0.1459,
"step": 1817
},
{
"epoch": 0.5,
"grad_norm": 2.64389705657959,
"learning_rate": 1.059257295837484e-06,
"loss": 0.1246,
"step": 1818
},
{
"epoch": 0.5,
"grad_norm": 3.626828193664551,
"learning_rate": 1.058373872167179e-06,
"loss": 0.1302,
"step": 1819
},
{
"epoch": 0.5,
"grad_norm": 2.9049739837646484,
"learning_rate": 1.0574904027815801e-06,
"loss": 0.1321,
"step": 1820
},
{
"epoch": 0.5,
"grad_norm": 2.6238348484039307,
"learning_rate": 1.056606888372574e-06,
"loss": 0.1133,
"step": 1821
},
{
"epoch": 0.5,
"grad_norm": 2.8575427532196045,
"learning_rate": 1.0557233296320811e-06,
"loss": 0.1346,
"step": 1822
},
{
"epoch": 0.5,
"grad_norm": 3.0136499404907227,
"learning_rate": 1.0548397272520578e-06,
"loss": 0.1431,
"step": 1823
},
{
"epoch": 0.5,
"grad_norm": 3.065021276473999,
"learning_rate": 1.053956081924494e-06,
"loss": 0.1479,
"step": 1824
},
{
"epoch": 0.5,
"grad_norm": 2.9963295459747314,
"learning_rate": 1.0530723943414133e-06,
"loss": 0.1461,
"step": 1825
},
{
"epoch": 0.5,
"grad_norm": 2.8013620376586914,
"learning_rate": 1.052188665194873e-06,
"loss": 0.1198,
"step": 1826
},
{
"epoch": 0.5,
"grad_norm": 2.6950151920318604,
"learning_rate": 1.0513048951769624e-06,
"loss": 0.1145,
"step": 1827
},
{
"epoch": 0.5,
"grad_norm": 2.88051700592041,
"learning_rate": 1.0504210849798026e-06,
"loss": 0.1344,
"step": 1828
},
{
"epoch": 0.5,
"grad_norm": 2.5159189701080322,
"learning_rate": 1.0495372352955467e-06,
"loss": 0.1121,
"step": 1829
},
{
"epoch": 0.5,
"grad_norm": 2.8956034183502197,
"learning_rate": 1.0486533468163782e-06,
"loss": 0.1189,
"step": 1830
},
{
"epoch": 0.5,
"grad_norm": 2.808065414428711,
"learning_rate": 1.0477694202345116e-06,
"loss": 0.1341,
"step": 1831
},
{
"epoch": 0.5,
"grad_norm": 2.836740732192993,
"learning_rate": 1.0468854562421905e-06,
"loss": 0.1183,
"step": 1832
},
{
"epoch": 0.5,
"grad_norm": 2.8136489391326904,
"learning_rate": 1.0460014555316886e-06,
"loss": 0.1361,
"step": 1833
},
{
"epoch": 0.5,
"grad_norm": 2.7605788707733154,
"learning_rate": 1.0451174187953083e-06,
"loss": 0.1095,
"step": 1834
},
{
"epoch": 0.5,
"grad_norm": 3.0163135528564453,
"learning_rate": 1.0442333467253788e-06,
"loss": 0.1405,
"step": 1835
},
{
"epoch": 0.5,
"grad_norm": 2.564580202102661,
"learning_rate": 1.0433492400142589e-06,
"loss": 0.1124,
"step": 1836
},
{
"epoch": 0.5,
"grad_norm": 2.578174352645874,
"learning_rate": 1.0424650993543337e-06,
"loss": 0.1146,
"step": 1837
},
{
"epoch": 0.5,
"grad_norm": 3.0214920043945312,
"learning_rate": 1.0415809254380141e-06,
"loss": 0.1362,
"step": 1838
},
{
"epoch": 0.5,
"grad_norm": 3.012301206588745,
"learning_rate": 1.0406967189577387e-06,
"loss": 0.136,
"step": 1839
},
{
"epoch": 0.5,
"grad_norm": 3.025559186935425,
"learning_rate": 1.03981248060597e-06,
"loss": 0.1573,
"step": 1840
},
{
"epoch": 0.5,
"grad_norm": 2.862858295440674,
"learning_rate": 1.038928211075197e-06,
"loss": 0.1335,
"step": 1841
},
{
"epoch": 0.5,
"grad_norm": 2.7830710411071777,
"learning_rate": 1.0380439110579313e-06,
"loss": 0.1228,
"step": 1842
},
{
"epoch": 0.5,
"grad_norm": 2.717508554458618,
"learning_rate": 1.0371595812467098e-06,
"loss": 0.1284,
"step": 1843
},
{
"epoch": 0.5,
"grad_norm": 2.8358590602874756,
"learning_rate": 1.0362752223340925e-06,
"loss": 0.1205,
"step": 1844
},
{
"epoch": 0.5,
"grad_norm": 2.7825779914855957,
"learning_rate": 1.0353908350126618e-06,
"loss": 0.1365,
"step": 1845
},
{
"epoch": 0.5,
"grad_norm": 3.3972370624542236,
"learning_rate": 1.034506419975023e-06,
"loss": 0.1296,
"step": 1846
},
{
"epoch": 0.5,
"grad_norm": 2.787891387939453,
"learning_rate": 1.0336219779138015e-06,
"loss": 0.1295,
"step": 1847
},
{
"epoch": 0.5,
"grad_norm": 2.9575958251953125,
"learning_rate": 1.032737509521646e-06,
"loss": 0.1358,
"step": 1848
},
{
"epoch": 0.51,
"grad_norm": 2.6589770317077637,
"learning_rate": 1.0318530154912244e-06,
"loss": 0.1228,
"step": 1849
},
{
"epoch": 0.51,
"grad_norm": 2.7603044509887695,
"learning_rate": 1.0309684965152252e-06,
"loss": 0.126,
"step": 1850
},
{
"epoch": 0.51,
"grad_norm": 3.1505537033081055,
"learning_rate": 1.0300839532863569e-06,
"loss": 0.1423,
"step": 1851
},
{
"epoch": 0.51,
"grad_norm": 2.751417636871338,
"learning_rate": 1.0291993864973455e-06,
"loss": 0.1275,
"step": 1852
},
{
"epoch": 0.51,
"grad_norm": 2.5051889419555664,
"learning_rate": 1.0283147968409365e-06,
"loss": 0.1169,
"step": 1853
},
{
"epoch": 0.51,
"grad_norm": 2.7912323474884033,
"learning_rate": 1.0274301850098936e-06,
"loss": 0.1272,
"step": 1854
},
{
"epoch": 0.51,
"grad_norm": 2.8508260250091553,
"learning_rate": 1.0265455516969976e-06,
"loss": 0.1191,
"step": 1855
},
{
"epoch": 0.51,
"grad_norm": 2.967703104019165,
"learning_rate": 1.0256608975950458e-06,
"loss": 0.1365,
"step": 1856
},
{
"epoch": 0.51,
"grad_norm": 2.7232398986816406,
"learning_rate": 1.0247762233968516e-06,
"loss": 0.1233,
"step": 1857
},
{
"epoch": 0.51,
"grad_norm": 2.7901456356048584,
"learning_rate": 1.0238915297952449e-06,
"loss": 0.1177,
"step": 1858
},
{
"epoch": 0.51,
"grad_norm": 2.8532421588897705,
"learning_rate": 1.0230068174830701e-06,
"loss": 0.1295,
"step": 1859
},
{
"epoch": 0.51,
"grad_norm": 2.837003231048584,
"learning_rate": 1.0221220871531869e-06,
"loss": 0.1376,
"step": 1860
},
{
"epoch": 0.51,
"grad_norm": 2.890450954437256,
"learning_rate": 1.0212373394984688e-06,
"loss": 0.1461,
"step": 1861
},
{
"epoch": 0.51,
"grad_norm": 2.723402261734009,
"learning_rate": 1.0203525752118023e-06,
"loss": 0.117,
"step": 1862
},
{
"epoch": 0.51,
"grad_norm": 2.9096994400024414,
"learning_rate": 1.0194677949860878e-06,
"loss": 0.1375,
"step": 1863
},
{
"epoch": 0.51,
"grad_norm": 2.727287769317627,
"learning_rate": 1.0185829995142377e-06,
"loss": 0.1172,
"step": 1864
},
{
"epoch": 0.51,
"grad_norm": 2.74131441116333,
"learning_rate": 1.0176981894891767e-06,
"loss": 0.1274,
"step": 1865
},
{
"epoch": 0.51,
"grad_norm": 2.803450345993042,
"learning_rate": 1.0168133656038407e-06,
"loss": 0.1277,
"step": 1866
},
{
"epoch": 0.51,
"grad_norm": 2.807425022125244,
"learning_rate": 1.0159285285511762e-06,
"loss": 0.1303,
"step": 1867
},
{
"epoch": 0.51,
"grad_norm": 2.745000123977661,
"learning_rate": 1.0150436790241404e-06,
"loss": 0.1299,
"step": 1868
},
{
"epoch": 0.51,
"grad_norm": 2.740635395050049,
"learning_rate": 1.0141588177156998e-06,
"loss": 0.1241,
"step": 1869
},
{
"epoch": 0.51,
"grad_norm": 2.799743413925171,
"learning_rate": 1.0132739453188308e-06,
"loss": 0.1213,
"step": 1870
},
{
"epoch": 0.51,
"grad_norm": 2.739351749420166,
"learning_rate": 1.0123890625265182e-06,
"loss": 0.1205,
"step": 1871
},
{
"epoch": 0.51,
"grad_norm": 3.156585454940796,
"learning_rate": 1.0115041700317543e-06,
"loss": 0.1447,
"step": 1872
},
{
"epoch": 0.51,
"grad_norm": 2.9910778999328613,
"learning_rate": 1.01061926852754e-06,
"loss": 0.1357,
"step": 1873
},
{
"epoch": 0.51,
"grad_norm": 2.6979784965515137,
"learning_rate": 1.009734358706883e-06,
"loss": 0.1157,
"step": 1874
},
{
"epoch": 0.51,
"grad_norm": 2.8156447410583496,
"learning_rate": 1.0088494412627967e-06,
"loss": 0.1256,
"step": 1875
},
{
"epoch": 0.51,
"grad_norm": 2.799232006072998,
"learning_rate": 1.0079645168883018e-06,
"loss": 0.1217,
"step": 1876
},
{
"epoch": 0.51,
"grad_norm": 2.747084617614746,
"learning_rate": 1.0070795862764232e-06,
"loss": 0.1189,
"step": 1877
},
{
"epoch": 0.51,
"grad_norm": 2.956552505493164,
"learning_rate": 1.0061946501201913e-06,
"loss": 0.1358,
"step": 1878
},
{
"epoch": 0.51,
"grad_norm": 3.008732557296753,
"learning_rate": 1.005309709112641e-06,
"loss": 0.1396,
"step": 1879
},
{
"epoch": 0.51,
"grad_norm": 2.9576668739318848,
"learning_rate": 1.0044247639468105e-06,
"loss": 0.1288,
"step": 1880
},
{
"epoch": 0.51,
"grad_norm": 2.854033946990967,
"learning_rate": 1.0035398153157416e-06,
"loss": 0.1265,
"step": 1881
},
{
"epoch": 0.51,
"grad_norm": 2.9028520584106445,
"learning_rate": 1.002654863912479e-06,
"loss": 0.1369,
"step": 1882
},
{
"epoch": 0.51,
"grad_norm": 2.773729085922241,
"learning_rate": 1.0017699104300685e-06,
"loss": 0.1256,
"step": 1883
},
{
"epoch": 0.51,
"grad_norm": 2.6618807315826416,
"learning_rate": 1.0008849555615593e-06,
"loss": 0.1246,
"step": 1884
},
{
"epoch": 0.51,
"grad_norm": 2.5700972080230713,
"learning_rate": 1e-06,
"loss": 0.1149,
"step": 1885
},
{
"epoch": 0.52,
"grad_norm": 3.0188512802124023,
"learning_rate": 9.991150444384408e-07,
"loss": 0.1381,
"step": 1886
},
{
"epoch": 0.52,
"grad_norm": 2.6191985607147217,
"learning_rate": 9.982300895699316e-07,
"loss": 0.1209,
"step": 1887
},
{
"epoch": 0.52,
"grad_norm": 2.8856725692749023,
"learning_rate": 9.973451360875212e-07,
"loss": 0.1322,
"step": 1888
},
{
"epoch": 0.52,
"grad_norm": 2.8278348445892334,
"learning_rate": 9.964601846842583e-07,
"loss": 0.1279,
"step": 1889
},
{
"epoch": 0.52,
"grad_norm": 2.9562864303588867,
"learning_rate": 9.955752360531894e-07,
"loss": 0.1282,
"step": 1890
},
{
"epoch": 0.52,
"grad_norm": 2.8889904022216797,
"learning_rate": 9.94690290887359e-07,
"loss": 0.1235,
"step": 1891
},
{
"epoch": 0.52,
"grad_norm": 2.9565634727478027,
"learning_rate": 9.938053498798088e-07,
"loss": 0.1188,
"step": 1892
},
{
"epoch": 0.52,
"grad_norm": 2.7623958587646484,
"learning_rate": 9.929204137235767e-07,
"loss": 0.1233,
"step": 1893
},
{
"epoch": 0.52,
"grad_norm": 2.517465353012085,
"learning_rate": 9.920354831116983e-07,
"loss": 0.1157,
"step": 1894
},
{
"epoch": 0.52,
"grad_norm": 2.7477364540100098,
"learning_rate": 9.911505587372032e-07,
"loss": 0.1247,
"step": 1895
},
{
"epoch": 0.52,
"grad_norm": 2.8950042724609375,
"learning_rate": 9.90265641293117e-07,
"loss": 0.1397,
"step": 1896
},
{
"epoch": 0.52,
"grad_norm": 2.5006961822509766,
"learning_rate": 9.8938073147246e-07,
"loss": 0.1138,
"step": 1897
},
{
"epoch": 0.52,
"grad_norm": 3.071542501449585,
"learning_rate": 9.884958299682456e-07,
"loss": 0.1351,
"step": 1898
},
{
"epoch": 0.52,
"grad_norm": 2.998857259750366,
"learning_rate": 9.87610937473482e-07,
"loss": 0.1301,
"step": 1899
},
{
"epoch": 0.52,
"grad_norm": 2.7444169521331787,
"learning_rate": 9.867260546811692e-07,
"loss": 0.1098,
"step": 1900
},
{
"epoch": 0.52,
"grad_norm": 3.1419379711151123,
"learning_rate": 9.858411822842999e-07,
"loss": 0.1331,
"step": 1901
},
{
"epoch": 0.52,
"grad_norm": 2.681468963623047,
"learning_rate": 9.8495632097586e-07,
"loss": 0.1203,
"step": 1902
},
{
"epoch": 0.52,
"grad_norm": 2.7569844722747803,
"learning_rate": 9.840714714488237e-07,
"loss": 0.1123,
"step": 1903
},
{
"epoch": 0.52,
"grad_norm": 2.6136653423309326,
"learning_rate": 9.831866343961594e-07,
"loss": 0.1208,
"step": 1904
},
{
"epoch": 0.52,
"grad_norm": 2.7803595066070557,
"learning_rate": 9.823018105108232e-07,
"loss": 0.1232,
"step": 1905
},
{
"epoch": 0.52,
"grad_norm": 3.172673463821411,
"learning_rate": 9.81417000485762e-07,
"loss": 0.1377,
"step": 1906
},
{
"epoch": 0.52,
"grad_norm": 2.8042352199554443,
"learning_rate": 9.805322050139125e-07,
"loss": 0.1241,
"step": 1907
},
{
"epoch": 0.52,
"grad_norm": 2.8047232627868652,
"learning_rate": 9.796474247881978e-07,
"loss": 0.1243,
"step": 1908
},
{
"epoch": 0.52,
"grad_norm": 3.208739757537842,
"learning_rate": 9.787626605015315e-07,
"loss": 0.1397,
"step": 1909
},
{
"epoch": 0.52,
"grad_norm": 2.6388320922851562,
"learning_rate": 9.778779128468133e-07,
"loss": 0.1206,
"step": 1910
},
{
"epoch": 0.52,
"grad_norm": 2.615647077560425,
"learning_rate": 9.769931825169296e-07,
"loss": 0.1149,
"step": 1911
},
{
"epoch": 0.52,
"grad_norm": 2.6340813636779785,
"learning_rate": 9.761084702047555e-07,
"loss": 0.115,
"step": 1912
},
{
"epoch": 0.52,
"grad_norm": 2.719238758087158,
"learning_rate": 9.752237766031485e-07,
"loss": 0.116,
"step": 1913
},
{
"epoch": 0.52,
"grad_norm": 2.8169920444488525,
"learning_rate": 9.743391024049545e-07,
"loss": 0.1344,
"step": 1914
},
{
"epoch": 0.52,
"grad_norm": 2.631563425064087,
"learning_rate": 9.734544483030025e-07,
"loss": 0.117,
"step": 1915
},
{
"epoch": 0.52,
"grad_norm": 2.8048999309539795,
"learning_rate": 9.725698149901061e-07,
"loss": 0.1291,
"step": 1916
},
{
"epoch": 0.52,
"grad_norm": 2.9323315620422363,
"learning_rate": 9.716852031590638e-07,
"loss": 0.1283,
"step": 1917
},
{
"epoch": 0.52,
"grad_norm": 2.747880697250366,
"learning_rate": 9.708006135026546e-07,
"loss": 0.1323,
"step": 1918
},
{
"epoch": 0.52,
"grad_norm": 2.6227073669433594,
"learning_rate": 9.699160467136433e-07,
"loss": 0.1271,
"step": 1919
},
{
"epoch": 0.52,
"grad_norm": 2.853242874145508,
"learning_rate": 9.690315034847747e-07,
"loss": 0.1407,
"step": 1920
},
{
"epoch": 0.52,
"grad_norm": 2.7962889671325684,
"learning_rate": 9.681469845087755e-07,
"loss": 0.1094,
"step": 1921
},
{
"epoch": 0.53,
"grad_norm": 2.5797970294952393,
"learning_rate": 9.672624904783542e-07,
"loss": 0.1097,
"step": 1922
},
{
"epoch": 0.53,
"grad_norm": 2.655869722366333,
"learning_rate": 9.663780220861986e-07,
"loss": 0.1193,
"step": 1923
},
{
"epoch": 0.53,
"grad_norm": 2.7732772827148438,
"learning_rate": 9.654935800249772e-07,
"loss": 0.1229,
"step": 1924
},
{
"epoch": 0.53,
"grad_norm": 2.618173837661743,
"learning_rate": 9.646091649873383e-07,
"loss": 0.1222,
"step": 1925
},
{
"epoch": 0.53,
"grad_norm": 2.573154926300049,
"learning_rate": 9.637247776659074e-07,
"loss": 0.1137,
"step": 1926
},
{
"epoch": 0.53,
"grad_norm": 2.6391992568969727,
"learning_rate": 9.628404187532901e-07,
"loss": 0.1226,
"step": 1927
},
{
"epoch": 0.53,
"grad_norm": 2.5460689067840576,
"learning_rate": 9.619560889420688e-07,
"loss": 0.1076,
"step": 1928
},
{
"epoch": 0.53,
"grad_norm": 2.684030532836914,
"learning_rate": 9.610717889248032e-07,
"loss": 0.1167,
"step": 1929
},
{
"epoch": 0.53,
"grad_norm": 2.813068151473999,
"learning_rate": 9.6018751939403e-07,
"loss": 0.1198,
"step": 1930
},
{
"epoch": 0.53,
"grad_norm": 2.90629243850708,
"learning_rate": 9.593032810422612e-07,
"loss": 0.1298,
"step": 1931
},
{
"epoch": 0.53,
"grad_norm": 2.8330368995666504,
"learning_rate": 9.58419074561986e-07,
"loss": 0.1224,
"step": 1932
},
{
"epoch": 0.53,
"grad_norm": 2.701951742172241,
"learning_rate": 9.575349006456664e-07,
"loss": 0.113,
"step": 1933
},
{
"epoch": 0.53,
"grad_norm": 2.9516077041625977,
"learning_rate": 9.56650759985741e-07,
"loss": 0.1378,
"step": 1934
},
{
"epoch": 0.53,
"grad_norm": 2.7260758876800537,
"learning_rate": 9.557666532746213e-07,
"loss": 0.1233,
"step": 1935
},
{
"epoch": 0.53,
"grad_norm": 2.8595633506774902,
"learning_rate": 9.548825812046918e-07,
"loss": 0.1265,
"step": 1936
},
{
"epoch": 0.53,
"grad_norm": 2.5643112659454346,
"learning_rate": 9.539985444683113e-07,
"loss": 0.1107,
"step": 1937
},
{
"epoch": 0.53,
"grad_norm": 3.2145724296569824,
"learning_rate": 9.531145437578094e-07,
"loss": 0.132,
"step": 1938
},
{
"epoch": 0.53,
"grad_norm": 2.6504106521606445,
"learning_rate": 9.522305797654886e-07,
"loss": 0.1364,
"step": 1939
},
{
"epoch": 0.53,
"grad_norm": 2.5980429649353027,
"learning_rate": 9.513466531836221e-07,
"loss": 0.1153,
"step": 1940
},
{
"epoch": 0.53,
"grad_norm": 2.897460460662842,
"learning_rate": 9.504627647044534e-07,
"loss": 0.1324,
"step": 1941
},
{
"epoch": 0.53,
"grad_norm": 2.8686153888702393,
"learning_rate": 9.495789150201977e-07,
"loss": 0.1301,
"step": 1942
},
{
"epoch": 0.53,
"grad_norm": 3.08678936958313,
"learning_rate": 9.486951048230377e-07,
"loss": 0.1349,
"step": 1943
},
{
"epoch": 0.53,
"grad_norm": 2.923462390899658,
"learning_rate": 9.478113348051268e-07,
"loss": 0.1258,
"step": 1944
},
{
"epoch": 0.53,
"grad_norm": 2.8492536544799805,
"learning_rate": 9.469276056585867e-07,
"loss": 0.1335,
"step": 1945
},
{
"epoch": 0.53,
"grad_norm": 2.766394853591919,
"learning_rate": 9.46043918075506e-07,
"loss": 0.1202,
"step": 1946
},
{
"epoch": 0.53,
"grad_norm": 2.8734710216522217,
"learning_rate": 9.451602727479424e-07,
"loss": 0.1261,
"step": 1947
},
{
"epoch": 0.53,
"grad_norm": 3.0753471851348877,
"learning_rate": 9.44276670367919e-07,
"loss": 0.1428,
"step": 1948
},
{
"epoch": 0.53,
"grad_norm": 2.7942495346069336,
"learning_rate": 9.433931116274258e-07,
"loss": 0.1217,
"step": 1949
},
{
"epoch": 0.53,
"grad_norm": 2.8738746643066406,
"learning_rate": 9.425095972184198e-07,
"loss": 0.1352,
"step": 1950
},
{
"epoch": 0.53,
"grad_norm": 2.901026725769043,
"learning_rate": 9.416261278328209e-07,
"loss": 0.1225,
"step": 1951
},
{
"epoch": 0.53,
"grad_norm": 2.6596286296844482,
"learning_rate": 9.40742704162516e-07,
"loss": 0.1211,
"step": 1952
},
{
"epoch": 0.53,
"grad_norm": 2.844707727432251,
"learning_rate": 9.398593268993546e-07,
"loss": 0.1428,
"step": 1953
},
{
"epoch": 0.53,
"grad_norm": 2.7951433658599854,
"learning_rate": 9.389759967351507e-07,
"loss": 0.1151,
"step": 1954
},
{
"epoch": 0.53,
"grad_norm": 2.6156182289123535,
"learning_rate": 9.380927143616819e-07,
"loss": 0.1171,
"step": 1955
},
{
"epoch": 0.53,
"grad_norm": 2.552344799041748,
"learning_rate": 9.372094804706866e-07,
"loss": 0.1173,
"step": 1956
},
{
"epoch": 0.53,
"grad_norm": 2.7790207862854004,
"learning_rate": 9.363262957538671e-07,
"loss": 0.12,
"step": 1957
},
{
"epoch": 0.53,
"grad_norm": 2.931581497192383,
"learning_rate": 9.354431609028861e-07,
"loss": 0.136,
"step": 1958
},
{
"epoch": 0.54,
"grad_norm": 2.9483675956726074,
"learning_rate": 9.345600766093674e-07,
"loss": 0.1281,
"step": 1959
},
{
"epoch": 0.54,
"grad_norm": 2.7541463375091553,
"learning_rate": 9.336770435648963e-07,
"loss": 0.1225,
"step": 1960
},
{
"epoch": 0.54,
"grad_norm": 2.890131950378418,
"learning_rate": 9.327940624610155e-07,
"loss": 0.1273,
"step": 1961
},
{
"epoch": 0.54,
"grad_norm": 2.757310390472412,
"learning_rate": 9.319111339892302e-07,
"loss": 0.1241,
"step": 1962
},
{
"epoch": 0.54,
"grad_norm": 2.8916938304901123,
"learning_rate": 9.310282588410014e-07,
"loss": 0.125,
"step": 1963
},
{
"epoch": 0.54,
"grad_norm": 2.793872356414795,
"learning_rate": 9.301454377077502e-07,
"loss": 0.1245,
"step": 1964
},
{
"epoch": 0.54,
"grad_norm": 2.905181407928467,
"learning_rate": 9.292626712808555e-07,
"loss": 0.1256,
"step": 1965
},
{
"epoch": 0.54,
"grad_norm": 2.534851312637329,
"learning_rate": 9.283799602516516e-07,
"loss": 0.1066,
"step": 1966
},
{
"epoch": 0.54,
"grad_norm": 2.8749680519104004,
"learning_rate": 9.274973053114314e-07,
"loss": 0.1314,
"step": 1967
},
{
"epoch": 0.54,
"grad_norm": 2.922060966491699,
"learning_rate": 9.266147071514426e-07,
"loss": 0.1357,
"step": 1968
},
{
"epoch": 0.54,
"grad_norm": 2.6175835132598877,
"learning_rate": 9.257321664628888e-07,
"loss": 0.1164,
"step": 1969
},
{
"epoch": 0.54,
"grad_norm": 2.780647039413452,
"learning_rate": 9.248496839369292e-07,
"loss": 0.1241,
"step": 1970
},
{
"epoch": 0.54,
"grad_norm": 3.3802411556243896,
"learning_rate": 9.239672602646764e-07,
"loss": 0.1132,
"step": 1971
},
{
"epoch": 0.54,
"grad_norm": 2.62099289894104,
"learning_rate": 9.230848961371978e-07,
"loss": 0.1172,
"step": 1972
},
{
"epoch": 0.54,
"grad_norm": 2.7328224182128906,
"learning_rate": 9.222025922455133e-07,
"loss": 0.1175,
"step": 1973
},
{
"epoch": 0.54,
"grad_norm": 2.7471697330474854,
"learning_rate": 9.213203492805959e-07,
"loss": 0.1111,
"step": 1974
},
{
"epoch": 0.54,
"grad_norm": 2.845108985900879,
"learning_rate": 9.204381679333722e-07,
"loss": 0.1194,
"step": 1975
},
{
"epoch": 0.54,
"grad_norm": 2.360717296600342,
"learning_rate": 9.195560488947184e-07,
"loss": 0.1028,
"step": 1976
},
{
"epoch": 0.54,
"grad_norm": 2.8350260257720947,
"learning_rate": 9.186739928554634e-07,
"loss": 0.1274,
"step": 1977
},
{
"epoch": 0.54,
"grad_norm": 2.673199415206909,
"learning_rate": 9.177920005063864e-07,
"loss": 0.1183,
"step": 1978
},
{
"epoch": 0.54,
"grad_norm": 2.875682830810547,
"learning_rate": 9.169100725382159e-07,
"loss": 0.1223,
"step": 1979
},
{
"epoch": 0.54,
"grad_norm": 2.9900684356689453,
"learning_rate": 9.160282096416316e-07,
"loss": 0.131,
"step": 1980
},
{
"epoch": 0.54,
"grad_norm": 2.7693305015563965,
"learning_rate": 9.15146412507261e-07,
"loss": 0.125,
"step": 1981
},
{
"epoch": 0.54,
"grad_norm": 3.0410921573638916,
"learning_rate": 9.142646818256802e-07,
"loss": 0.1342,
"step": 1982
},
{
"epoch": 0.54,
"grad_norm": 2.9950759410858154,
"learning_rate": 9.13383018287414e-07,
"loss": 0.1199,
"step": 1983
},
{
"epoch": 0.54,
"grad_norm": 2.8740227222442627,
"learning_rate": 9.125014225829333e-07,
"loss": 0.1308,
"step": 1984
},
{
"epoch": 0.54,
"grad_norm": 2.735966444015503,
"learning_rate": 9.116198954026576e-07,
"loss": 0.1261,
"step": 1985
},
{
"epoch": 0.54,
"grad_norm": 2.5520291328430176,
"learning_rate": 9.107384374369513e-07,
"loss": 0.1076,
"step": 1986
},
{
"epoch": 0.54,
"grad_norm": 2.961522340774536,
"learning_rate": 9.098570493761251e-07,
"loss": 0.1396,
"step": 1987
},
{
"epoch": 0.54,
"grad_norm": 2.695842981338501,
"learning_rate": 9.089757319104354e-07,
"loss": 0.1164,
"step": 1988
},
{
"epoch": 0.54,
"grad_norm": 2.570087194442749,
"learning_rate": 9.080944857300822e-07,
"loss": 0.1154,
"step": 1989
},
{
"epoch": 0.54,
"grad_norm": 2.604621171951294,
"learning_rate": 9.072133115252112e-07,
"loss": 0.1189,
"step": 1990
},
{
"epoch": 0.54,
"grad_norm": 2.649380683898926,
"learning_rate": 9.063322099859102e-07,
"loss": 0.1366,
"step": 1991
},
{
"epoch": 0.54,
"grad_norm": 2.7982804775238037,
"learning_rate": 9.05451181802211e-07,
"loss": 0.1205,
"step": 1992
},
{
"epoch": 0.54,
"grad_norm": 2.7476391792297363,
"learning_rate": 9.045702276640882e-07,
"loss": 0.1283,
"step": 1993
},
{
"epoch": 0.54,
"grad_norm": 2.9363062381744385,
"learning_rate": 9.03689348261457e-07,
"loss": 0.1309,
"step": 1994
},
{
"epoch": 0.54,
"grad_norm": 2.87986421585083,
"learning_rate": 9.028085442841759e-07,
"loss": 0.1281,
"step": 1995
},
{
"epoch": 0.55,
"grad_norm": 2.5653114318847656,
"learning_rate": 9.019278164220428e-07,
"loss": 0.1143,
"step": 1996
},
{
"epoch": 0.55,
"grad_norm": 2.7345714569091797,
"learning_rate": 9.01047165364797e-07,
"loss": 0.1243,
"step": 1997
},
{
"epoch": 0.55,
"grad_norm": 3.0675840377807617,
"learning_rate": 9.001665918021178e-07,
"loss": 0.1216,
"step": 1998
},
{
"epoch": 0.55,
"grad_norm": 2.9636502265930176,
"learning_rate": 8.99286096423622e-07,
"loss": 0.126,
"step": 1999
},
{
"epoch": 0.55,
"grad_norm": 2.9458322525024414,
"learning_rate": 8.984056799188676e-07,
"loss": 0.131,
"step": 2000
},
{
"epoch": 0.55,
"grad_norm": 2.8930435180664062,
"learning_rate": 8.975253429773492e-07,
"loss": 0.1255,
"step": 2001
},
{
"epoch": 0.55,
"grad_norm": 2.7570371627807617,
"learning_rate": 8.966450862884994e-07,
"loss": 0.144,
"step": 2002
},
{
"epoch": 0.55,
"grad_norm": 2.710038661956787,
"learning_rate": 8.957649105416893e-07,
"loss": 0.1319,
"step": 2003
},
{
"epoch": 0.55,
"grad_norm": 2.9276740550994873,
"learning_rate": 8.948848164262238e-07,
"loss": 0.1369,
"step": 2004
},
{
"epoch": 0.55,
"grad_norm": 2.948108673095703,
"learning_rate": 8.940048046313469e-07,
"loss": 0.1268,
"step": 2005
},
{
"epoch": 0.55,
"grad_norm": 2.8122799396514893,
"learning_rate": 8.931248758462358e-07,
"loss": 0.1228,
"step": 2006
},
{
"epoch": 0.55,
"grad_norm": 2.95597505569458,
"learning_rate": 8.922450307600039e-07,
"loss": 0.1213,
"step": 2007
},
{
"epoch": 0.55,
"grad_norm": 2.8452467918395996,
"learning_rate": 8.913652700616996e-07,
"loss": 0.1326,
"step": 2008
},
{
"epoch": 0.55,
"grad_norm": 2.68770694732666,
"learning_rate": 8.904855944403031e-07,
"loss": 0.1266,
"step": 2009
},
{
"epoch": 0.55,
"grad_norm": 2.7697670459747314,
"learning_rate": 8.896060045847303e-07,
"loss": 0.1131,
"step": 2010
},
{
"epoch": 0.55,
"grad_norm": 2.6158204078674316,
"learning_rate": 8.887265011838284e-07,
"loss": 0.1165,
"step": 2011
},
{
"epoch": 0.55,
"grad_norm": 3.025125503540039,
"learning_rate": 8.878470849263774e-07,
"loss": 0.1365,
"step": 2012
},
{
"epoch": 0.55,
"grad_norm": 2.772484540939331,
"learning_rate": 8.869677565010898e-07,
"loss": 0.1322,
"step": 2013
},
{
"epoch": 0.55,
"grad_norm": 2.7702977657318115,
"learning_rate": 8.860885165966074e-07,
"loss": 0.1204,
"step": 2014
},
{
"epoch": 0.55,
"grad_norm": 2.765949249267578,
"learning_rate": 8.852093659015049e-07,
"loss": 0.1244,
"step": 2015
},
{
"epoch": 0.55,
"grad_norm": 2.8452184200286865,
"learning_rate": 8.843303051042853e-07,
"loss": 0.1321,
"step": 2016
},
{
"epoch": 0.55,
"grad_norm": 2.527989387512207,
"learning_rate": 8.834513348933822e-07,
"loss": 0.1148,
"step": 2017
},
{
"epoch": 0.55,
"grad_norm": 2.9338791370391846,
"learning_rate": 8.825724559571586e-07,
"loss": 0.1422,
"step": 2018
},
{
"epoch": 0.55,
"grad_norm": 2.7240614891052246,
"learning_rate": 8.816936689839048e-07,
"loss": 0.1262,
"step": 2019
},
{
"epoch": 0.55,
"grad_norm": 2.6370601654052734,
"learning_rate": 8.808149746618402e-07,
"loss": 0.1147,
"step": 2020
},
{
"epoch": 0.55,
"grad_norm": 2.624903440475464,
"learning_rate": 8.799363736791106e-07,
"loss": 0.1099,
"step": 2021
},
{
"epoch": 0.55,
"grad_norm": 2.865074872970581,
"learning_rate": 8.790578667237897e-07,
"loss": 0.1356,
"step": 2022
},
{
"epoch": 0.55,
"grad_norm": 2.718155860900879,
"learning_rate": 8.781794544838774e-07,
"loss": 0.1106,
"step": 2023
},
{
"epoch": 0.55,
"grad_norm": 2.8180630207061768,
"learning_rate": 8.773011376472986e-07,
"loss": 0.1345,
"step": 2024
},
{
"epoch": 0.55,
"grad_norm": 2.8360700607299805,
"learning_rate": 8.764229169019046e-07,
"loss": 0.1225,
"step": 2025
},
{
"epoch": 0.55,
"grad_norm": 2.8616783618927,
"learning_rate": 8.755447929354704e-07,
"loss": 0.126,
"step": 2026
},
{
"epoch": 0.55,
"grad_norm": 2.901785373687744,
"learning_rate": 8.746667664356955e-07,
"loss": 0.1365,
"step": 2027
},
{
"epoch": 0.55,
"grad_norm": 2.877326011657715,
"learning_rate": 8.737888380902044e-07,
"loss": 0.1327,
"step": 2028
},
{
"epoch": 0.55,
"grad_norm": 2.972341537475586,
"learning_rate": 8.729110085865426e-07,
"loss": 0.1308,
"step": 2029
},
{
"epoch": 0.55,
"grad_norm": 2.934622049331665,
"learning_rate": 8.720332786121798e-07,
"loss": 0.1237,
"step": 2030
},
{
"epoch": 0.55,
"grad_norm": 2.8102076053619385,
"learning_rate": 8.711556488545067e-07,
"loss": 0.133,
"step": 2031
},
{
"epoch": 0.56,
"grad_norm": 2.740907907485962,
"learning_rate": 8.702781200008358e-07,
"loss": 0.1228,
"step": 2032
},
{
"epoch": 0.56,
"grad_norm": 2.946277379989624,
"learning_rate": 8.694006927384016e-07,
"loss": 0.1217,
"step": 2033
},
{
"epoch": 0.56,
"grad_norm": 2.9555811882019043,
"learning_rate": 8.685233677543575e-07,
"loss": 0.1225,
"step": 2034
},
{
"epoch": 0.56,
"grad_norm": 2.7638421058654785,
"learning_rate": 8.676461457357776e-07,
"loss": 0.124,
"step": 2035
},
{
"epoch": 0.56,
"grad_norm": 2.890644073486328,
"learning_rate": 8.667690273696555e-07,
"loss": 0.1232,
"step": 2036
},
{
"epoch": 0.56,
"grad_norm": 2.742115020751953,
"learning_rate": 8.658920133429028e-07,
"loss": 0.1109,
"step": 2037
},
{
"epoch": 0.56,
"grad_norm": 2.385840892791748,
"learning_rate": 8.650151043423509e-07,
"loss": 0.1084,
"step": 2038
},
{
"epoch": 0.56,
"grad_norm": 2.701353073120117,
"learning_rate": 8.641383010547473e-07,
"loss": 0.1258,
"step": 2039
},
{
"epoch": 0.56,
"grad_norm": 2.8525030612945557,
"learning_rate": 8.632616041667575e-07,
"loss": 0.1242,
"step": 2040
},
{
"epoch": 0.56,
"grad_norm": 2.592622756958008,
"learning_rate": 8.62385014364964e-07,
"loss": 0.1158,
"step": 2041
},
{
"epoch": 0.56,
"grad_norm": 2.4827704429626465,
"learning_rate": 8.615085323358643e-07,
"loss": 0.1169,
"step": 2042
},
{
"epoch": 0.56,
"grad_norm": 3.138923406600952,
"learning_rate": 8.60632158765873e-07,
"loss": 0.1586,
"step": 2043
},
{
"epoch": 0.56,
"grad_norm": 3.0605967044830322,
"learning_rate": 8.597558943413186e-07,
"loss": 0.1314,
"step": 2044
},
{
"epoch": 0.56,
"grad_norm": 2.625797986984253,
"learning_rate": 8.588797397484444e-07,
"loss": 0.1317,
"step": 2045
},
{
"epoch": 0.56,
"grad_norm": 3.0117976665496826,
"learning_rate": 8.580036956734085e-07,
"loss": 0.1449,
"step": 2046
},
{
"epoch": 0.56,
"grad_norm": 2.7856662273406982,
"learning_rate": 8.571277628022806e-07,
"loss": 0.1233,
"step": 2047
},
{
"epoch": 0.56,
"grad_norm": 2.7608420848846436,
"learning_rate": 8.562519418210457e-07,
"loss": 0.1159,
"step": 2048
},
{
"epoch": 0.56,
"grad_norm": 2.6647586822509766,
"learning_rate": 8.553762334155989e-07,
"loss": 0.1081,
"step": 2049
},
{
"epoch": 0.56,
"grad_norm": 2.610696315765381,
"learning_rate": 8.545006382717485e-07,
"loss": 0.1272,
"step": 2050
},
{
"epoch": 0.56,
"grad_norm": 2.8872148990631104,
"learning_rate": 8.536251570752147e-07,
"loss": 0.1274,
"step": 2051
},
{
"epoch": 0.56,
"grad_norm": 2.914360523223877,
"learning_rate": 8.527497905116259e-07,
"loss": 0.122,
"step": 2052
},
{
"epoch": 0.56,
"grad_norm": 2.5790927410125732,
"learning_rate": 8.518745392665236e-07,
"loss": 0.1245,
"step": 2053
},
{
"epoch": 0.56,
"grad_norm": 2.682628870010376,
"learning_rate": 8.509994040253571e-07,
"loss": 0.1254,
"step": 2054
},
{
"epoch": 0.56,
"grad_norm": 2.5411360263824463,
"learning_rate": 8.501243854734856e-07,
"loss": 0.1102,
"step": 2055
},
{
"epoch": 0.56,
"grad_norm": 2.8040127754211426,
"learning_rate": 8.492494842961775e-07,
"loss": 0.1316,
"step": 2056
},
{
"epoch": 0.56,
"grad_norm": 2.9163131713867188,
"learning_rate": 8.483747011786074e-07,
"loss": 0.1303,
"step": 2057
},
{
"epoch": 0.56,
"grad_norm": 3.061216354370117,
"learning_rate": 8.475000368058598e-07,
"loss": 0.1313,
"step": 2058
},
{
"epoch": 0.56,
"grad_norm": 2.576927900314331,
"learning_rate": 8.466254918629242e-07,
"loss": 0.1247,
"step": 2059
},
{
"epoch": 0.56,
"grad_norm": 2.627513885498047,
"learning_rate": 8.457510670346974e-07,
"loss": 0.1237,
"step": 2060
},
{
"epoch": 0.56,
"grad_norm": 2.779042959213257,
"learning_rate": 8.448767630059833e-07,
"loss": 0.1216,
"step": 2061
},
{
"epoch": 0.56,
"grad_norm": 2.6417055130004883,
"learning_rate": 8.440025804614886e-07,
"loss": 0.1263,
"step": 2062
},
{
"epoch": 0.56,
"grad_norm": 3.055684804916382,
"learning_rate": 8.431285200858271e-07,
"loss": 0.1311,
"step": 2063
},
{
"epoch": 0.56,
"grad_norm": 2.9278390407562256,
"learning_rate": 8.422545825635159e-07,
"loss": 0.1287,
"step": 2064
},
{
"epoch": 0.56,
"grad_norm": 2.593324899673462,
"learning_rate": 8.413807685789759e-07,
"loss": 0.1071,
"step": 2065
},
{
"epoch": 0.56,
"grad_norm": 2.6678123474121094,
"learning_rate": 8.405070788165321e-07,
"loss": 0.1282,
"step": 2066
},
{
"epoch": 0.56,
"grad_norm": 3.019789695739746,
"learning_rate": 8.396335139604111e-07,
"loss": 0.1273,
"step": 2067
},
{
"epoch": 0.56,
"grad_norm": 2.9056313037872314,
"learning_rate": 8.387600746947423e-07,
"loss": 0.1299,
"step": 2068
},
{
"epoch": 0.57,
"grad_norm": 2.581608533859253,
"learning_rate": 8.378867617035564e-07,
"loss": 0.1216,
"step": 2069
},
{
"epoch": 0.57,
"grad_norm": 2.6597416400909424,
"learning_rate": 8.370135756707852e-07,
"loss": 0.1235,
"step": 2070
},
{
"epoch": 0.57,
"grad_norm": 2.624035120010376,
"learning_rate": 8.361405172802623e-07,
"loss": 0.1148,
"step": 2071
},
{
"epoch": 0.57,
"grad_norm": 2.6497409343719482,
"learning_rate": 8.352675872157192e-07,
"loss": 0.1169,
"step": 2072
},
{
"epoch": 0.57,
"grad_norm": 2.865757942199707,
"learning_rate": 8.343947861607888e-07,
"loss": 0.1304,
"step": 2073
},
{
"epoch": 0.57,
"grad_norm": 2.6306159496307373,
"learning_rate": 8.335221147990017e-07,
"loss": 0.1155,
"step": 2074
},
{
"epoch": 0.57,
"grad_norm": 2.8795928955078125,
"learning_rate": 8.326495738137875e-07,
"loss": 0.1188,
"step": 2075
},
{
"epoch": 0.57,
"grad_norm": 2.7957141399383545,
"learning_rate": 8.31777163888474e-07,
"loss": 0.1234,
"step": 2076
},
{
"epoch": 0.57,
"grad_norm": 2.5149428844451904,
"learning_rate": 8.309048857062855e-07,
"loss": 0.1081,
"step": 2077
},
{
"epoch": 0.57,
"grad_norm": 2.843853235244751,
"learning_rate": 8.300327399503439e-07,
"loss": 0.1368,
"step": 2078
},
{
"epoch": 0.57,
"grad_norm": 2.855299234390259,
"learning_rate": 8.291607273036669e-07,
"loss": 0.1424,
"step": 2079
},
{
"epoch": 0.57,
"grad_norm": 2.8537607192993164,
"learning_rate": 8.282888484491681e-07,
"loss": 0.1281,
"step": 2080
},
{
"epoch": 0.57,
"grad_norm": 2.6667559146881104,
"learning_rate": 8.274171040696569e-07,
"loss": 0.1174,
"step": 2081
},
{
"epoch": 0.57,
"grad_norm": 2.697756767272949,
"learning_rate": 8.265454948478363e-07,
"loss": 0.1188,
"step": 2082
},
{
"epoch": 0.57,
"grad_norm": 2.676966667175293,
"learning_rate": 8.256740214663042e-07,
"loss": 0.1333,
"step": 2083
},
{
"epoch": 0.57,
"grad_norm": 2.9605026245117188,
"learning_rate": 8.24802684607552e-07,
"loss": 0.1243,
"step": 2084
},
{
"epoch": 0.57,
"grad_norm": 2.8657665252685547,
"learning_rate": 8.239314849539637e-07,
"loss": 0.1269,
"step": 2085
},
{
"epoch": 0.57,
"grad_norm": 2.6637632846832275,
"learning_rate": 8.23060423187817e-07,
"loss": 0.1254,
"step": 2086
},
{
"epoch": 0.57,
"grad_norm": 2.775421619415283,
"learning_rate": 8.221894999912802e-07,
"loss": 0.121,
"step": 2087
},
{
"epoch": 0.57,
"grad_norm": 2.9669418334960938,
"learning_rate": 8.213187160464143e-07,
"loss": 0.1401,
"step": 2088
},
{
"epoch": 0.57,
"grad_norm": 2.907231569290161,
"learning_rate": 8.204480720351702e-07,
"loss": 0.1314,
"step": 2089
},
{
"epoch": 0.57,
"grad_norm": 2.8540337085723877,
"learning_rate": 8.195775686393896e-07,
"loss": 0.126,
"step": 2090
},
{
"epoch": 0.57,
"grad_norm": 2.768364191055298,
"learning_rate": 8.18707206540805e-07,
"loss": 0.1324,
"step": 2091
},
{
"epoch": 0.57,
"grad_norm": 2.9182331562042236,
"learning_rate": 8.178369864210368e-07,
"loss": 0.1208,
"step": 2092
},
{
"epoch": 0.57,
"grad_norm": 2.9153683185577393,
"learning_rate": 8.169669089615947e-07,
"loss": 0.1245,
"step": 2093
},
{
"epoch": 0.57,
"grad_norm": 2.692770004272461,
"learning_rate": 8.160969748438777e-07,
"loss": 0.1173,
"step": 2094
},
{
"epoch": 0.57,
"grad_norm": 2.8993098735809326,
"learning_rate": 8.152271847491705e-07,
"loss": 0.1298,
"step": 2095
},
{
"epoch": 0.57,
"grad_norm": 2.8526406288146973,
"learning_rate": 8.143575393586471e-07,
"loss": 0.1374,
"step": 2096
},
{
"epoch": 0.57,
"grad_norm": 2.7569406032562256,
"learning_rate": 8.134880393533667e-07,
"loss": 0.1228,
"step": 2097
},
{
"epoch": 0.57,
"grad_norm": 2.7927284240722656,
"learning_rate": 8.126186854142751e-07,
"loss": 0.1245,
"step": 2098
},
{
"epoch": 0.57,
"grad_norm": 2.929137706756592,
"learning_rate": 8.117494782222047e-07,
"loss": 0.1227,
"step": 2099
},
{
"epoch": 0.57,
"grad_norm": 2.835092544555664,
"learning_rate": 8.108804184578708e-07,
"loss": 0.1233,
"step": 2100
},
{
"epoch": 0.57,
"grad_norm": 3.0549395084381104,
"learning_rate": 8.100115068018756e-07,
"loss": 0.1423,
"step": 2101
},
{
"epoch": 0.57,
"grad_norm": 2.9805521965026855,
"learning_rate": 8.091427439347033e-07,
"loss": 0.1338,
"step": 2102
},
{
"epoch": 0.57,
"grad_norm": 2.843991756439209,
"learning_rate": 8.082741305367229e-07,
"loss": 0.1348,
"step": 2103
},
{
"epoch": 0.57,
"grad_norm": 2.965733051300049,
"learning_rate": 8.074056672881867e-07,
"loss": 0.1262,
"step": 2104
},
{
"epoch": 0.58,
"grad_norm": 2.747220754623413,
"learning_rate": 8.065373548692271e-07,
"loss": 0.1179,
"step": 2105
},
{
"epoch": 0.58,
"grad_norm": 2.8165981769561768,
"learning_rate": 8.056691939598615e-07,
"loss": 0.1217,
"step": 2106
},
{
"epoch": 0.58,
"grad_norm": 2.787397623062134,
"learning_rate": 8.048011852399859e-07,
"loss": 0.1298,
"step": 2107
},
{
"epoch": 0.58,
"grad_norm": 2.7715790271759033,
"learning_rate": 8.039333293893785e-07,
"loss": 0.1173,
"step": 2108
},
{
"epoch": 0.58,
"grad_norm": 2.5719151496887207,
"learning_rate": 8.030656270876985e-07,
"loss": 0.114,
"step": 2109
},
{
"epoch": 0.58,
"grad_norm": 2.8761417865753174,
"learning_rate": 8.021980790144826e-07,
"loss": 0.1332,
"step": 2110
},
{
"epoch": 0.58,
"grad_norm": 2.492717742919922,
"learning_rate": 8.013306858491492e-07,
"loss": 0.1121,
"step": 2111
},
{
"epoch": 0.58,
"grad_norm": 2.8049910068511963,
"learning_rate": 8.004634482709933e-07,
"loss": 0.1208,
"step": 2112
},
{
"epoch": 0.58,
"grad_norm": 2.7873449325561523,
"learning_rate": 7.995963669591893e-07,
"loss": 0.125,
"step": 2113
},
{
"epoch": 0.58,
"grad_norm": 2.719320774078369,
"learning_rate": 7.987294425927893e-07,
"loss": 0.1283,
"step": 2114
},
{
"epoch": 0.58,
"grad_norm": 3.1067440509796143,
"learning_rate": 7.978626758507216e-07,
"loss": 0.1252,
"step": 2115
},
{
"epoch": 0.58,
"grad_norm": 2.72916316986084,
"learning_rate": 7.969960674117918e-07,
"loss": 0.114,
"step": 2116
},
{
"epoch": 0.58,
"grad_norm": 2.606856107711792,
"learning_rate": 7.96129617954681e-07,
"loss": 0.114,
"step": 2117
},
{
"epoch": 0.58,
"grad_norm": 2.792314052581787,
"learning_rate": 7.952633281579459e-07,
"loss": 0.134,
"step": 2118
},
{
"epoch": 0.58,
"grad_norm": 2.6764605045318604,
"learning_rate": 7.943971987000191e-07,
"loss": 0.1224,
"step": 2119
},
{
"epoch": 0.58,
"grad_norm": 2.646055221557617,
"learning_rate": 7.935312302592062e-07,
"loss": 0.1042,
"step": 2120
},
{
"epoch": 0.58,
"grad_norm": 2.9272406101226807,
"learning_rate": 7.926654235136878e-07,
"loss": 0.1242,
"step": 2121
},
{
"epoch": 0.58,
"grad_norm": 2.86539363861084,
"learning_rate": 7.917997791415168e-07,
"loss": 0.1298,
"step": 2122
},
{
"epoch": 0.58,
"grad_norm": 2.893162727355957,
"learning_rate": 7.909342978206197e-07,
"loss": 0.1345,
"step": 2123
},
{
"epoch": 0.58,
"grad_norm": 3.0884931087493896,
"learning_rate": 7.900689802287959e-07,
"loss": 0.1189,
"step": 2124
},
{
"epoch": 0.58,
"grad_norm": 2.792923927307129,
"learning_rate": 7.892038270437152e-07,
"loss": 0.1192,
"step": 2125
},
{
"epoch": 0.58,
"grad_norm": 2.8450675010681152,
"learning_rate": 7.883388389429193e-07,
"loss": 0.1304,
"step": 2126
},
{
"epoch": 0.58,
"grad_norm": 2.8471620082855225,
"learning_rate": 7.874740166038207e-07,
"loss": 0.1287,
"step": 2127
},
{
"epoch": 0.58,
"grad_norm": 2.7425572872161865,
"learning_rate": 7.866093607037017e-07,
"loss": 0.1115,
"step": 2128
},
{
"epoch": 0.58,
"grad_norm": 2.7999820709228516,
"learning_rate": 7.857448719197154e-07,
"loss": 0.1102,
"step": 2129
},
{
"epoch": 0.58,
"grad_norm": 2.686962842941284,
"learning_rate": 7.848805509288824e-07,
"loss": 0.1202,
"step": 2130
},
{
"epoch": 0.58,
"grad_norm": 2.8658480644226074,
"learning_rate": 7.84016398408093e-07,
"loss": 0.1233,
"step": 2131
},
{
"epoch": 0.58,
"grad_norm": 2.7625598907470703,
"learning_rate": 7.831524150341049e-07,
"loss": 0.1239,
"step": 2132
},
{
"epoch": 0.58,
"grad_norm": 2.915579080581665,
"learning_rate": 7.822886014835435e-07,
"loss": 0.1378,
"step": 2133
},
{
"epoch": 0.58,
"grad_norm": 2.6005876064300537,
"learning_rate": 7.814249584329022e-07,
"loss": 0.1212,
"step": 2134
},
{
"epoch": 0.58,
"grad_norm": 2.9378654956817627,
"learning_rate": 7.805614865585395e-07,
"loss": 0.1225,
"step": 2135
},
{
"epoch": 0.58,
"grad_norm": 2.862884759902954,
"learning_rate": 7.796981865366804e-07,
"loss": 0.1266,
"step": 2136
},
{
"epoch": 0.58,
"grad_norm": 2.624966859817505,
"learning_rate": 7.788350590434152e-07,
"loss": 0.1136,
"step": 2137
},
{
"epoch": 0.58,
"grad_norm": 2.667940378189087,
"learning_rate": 7.77972104754699e-07,
"loss": 0.1224,
"step": 2138
},
{
"epoch": 0.58,
"grad_norm": 2.760246753692627,
"learning_rate": 7.77109324346352e-07,
"loss": 0.1287,
"step": 2139
},
{
"epoch": 0.58,
"grad_norm": 2.884101152420044,
"learning_rate": 7.762467184940573e-07,
"loss": 0.1172,
"step": 2140
},
{
"epoch": 0.58,
"grad_norm": 2.6695945262908936,
"learning_rate": 7.75384287873362e-07,
"loss": 0.1228,
"step": 2141
},
{
"epoch": 0.59,
"grad_norm": 2.7546286582946777,
"learning_rate": 7.745220331596749e-07,
"loss": 0.1221,
"step": 2142
},
{
"epoch": 0.59,
"grad_norm": 2.5956485271453857,
"learning_rate": 7.73659955028268e-07,
"loss": 0.1228,
"step": 2143
},
{
"epoch": 0.59,
"grad_norm": 2.7694950103759766,
"learning_rate": 7.727980541542757e-07,
"loss": 0.1237,
"step": 2144
},
{
"epoch": 0.59,
"grad_norm": 2.599437952041626,
"learning_rate": 7.719363312126914e-07,
"loss": 0.1144,
"step": 2145
},
{
"epoch": 0.59,
"grad_norm": 2.906019926071167,
"learning_rate": 7.710747868783713e-07,
"loss": 0.138,
"step": 2146
},
{
"epoch": 0.59,
"grad_norm": 2.8566348552703857,
"learning_rate": 7.702134218260301e-07,
"loss": 0.1258,
"step": 2147
},
{
"epoch": 0.59,
"grad_norm": 2.5982682704925537,
"learning_rate": 7.693522367302429e-07,
"loss": 0.113,
"step": 2148
},
{
"epoch": 0.59,
"grad_norm": 2.8539483547210693,
"learning_rate": 7.684912322654448e-07,
"loss": 0.1258,
"step": 2149
},
{
"epoch": 0.59,
"grad_norm": 3.0186097621917725,
"learning_rate": 7.676304091059272e-07,
"loss": 0.1422,
"step": 2150
},
{
"epoch": 0.59,
"grad_norm": 2.9395763874053955,
"learning_rate": 7.667697679258416e-07,
"loss": 0.1194,
"step": 2151
},
{
"epoch": 0.59,
"grad_norm": 2.899362564086914,
"learning_rate": 7.659093093991956e-07,
"loss": 0.1408,
"step": 2152
},
{
"epoch": 0.59,
"grad_norm": 2.491899013519287,
"learning_rate": 7.650490341998541e-07,
"loss": 0.1168,
"step": 2153
},
{
"epoch": 0.59,
"grad_norm": 2.636516809463501,
"learning_rate": 7.641889430015393e-07,
"loss": 0.1135,
"step": 2154
},
{
"epoch": 0.59,
"grad_norm": 2.9226226806640625,
"learning_rate": 7.633290364778283e-07,
"loss": 0.1339,
"step": 2155
},
{
"epoch": 0.59,
"grad_norm": 2.9808900356292725,
"learning_rate": 7.624693153021536e-07,
"loss": 0.1295,
"step": 2156
},
{
"epoch": 0.59,
"grad_norm": 2.777230978012085,
"learning_rate": 7.616097801478036e-07,
"loss": 0.1233,
"step": 2157
},
{
"epoch": 0.59,
"grad_norm": 2.622840404510498,
"learning_rate": 7.607504316879191e-07,
"loss": 0.1271,
"step": 2158
},
{
"epoch": 0.59,
"grad_norm": 2.8252713680267334,
"learning_rate": 7.598912705954972e-07,
"loss": 0.1216,
"step": 2159
},
{
"epoch": 0.59,
"grad_norm": 2.814364433288574,
"learning_rate": 7.590322975433856e-07,
"loss": 0.1247,
"step": 2160
},
{
"epoch": 0.59,
"grad_norm": 2.7726142406463623,
"learning_rate": 7.581735132042866e-07,
"loss": 0.1151,
"step": 2161
},
{
"epoch": 0.59,
"grad_norm": 2.667328119277954,
"learning_rate": 7.573149182507545e-07,
"loss": 0.113,
"step": 2162
},
{
"epoch": 0.59,
"grad_norm": 2.812042474746704,
"learning_rate": 7.564565133551945e-07,
"loss": 0.1367,
"step": 2163
},
{
"epoch": 0.59,
"grad_norm": 2.9198157787323,
"learning_rate": 7.555982991898636e-07,
"loss": 0.1263,
"step": 2164
},
{
"epoch": 0.59,
"grad_norm": 2.490802526473999,
"learning_rate": 7.547402764268689e-07,
"loss": 0.1111,
"step": 2165
},
{
"epoch": 0.59,
"grad_norm": 2.663837194442749,
"learning_rate": 7.538824457381679e-07,
"loss": 0.1175,
"step": 2166
},
{
"epoch": 0.59,
"grad_norm": 3.181468963623047,
"learning_rate": 7.530248077955683e-07,
"loss": 0.1515,
"step": 2167
},
{
"epoch": 0.59,
"grad_norm": 2.888683795928955,
"learning_rate": 7.521673632707259e-07,
"loss": 0.1306,
"step": 2168
},
{
"epoch": 0.59,
"grad_norm": 2.684504508972168,
"learning_rate": 7.513101128351453e-07,
"loss": 0.1224,
"step": 2169
},
{
"epoch": 0.59,
"grad_norm": 2.837757110595703,
"learning_rate": 7.504530571601791e-07,
"loss": 0.1198,
"step": 2170
},
{
"epoch": 0.59,
"grad_norm": 2.9464797973632812,
"learning_rate": 7.495961969170275e-07,
"loss": 0.1255,
"step": 2171
},
{
"epoch": 0.59,
"grad_norm": 2.7112324237823486,
"learning_rate": 7.487395327767381e-07,
"loss": 0.1133,
"step": 2172
},
{
"epoch": 0.59,
"grad_norm": 2.80619215965271,
"learning_rate": 7.478830654102036e-07,
"loss": 0.1108,
"step": 2173
},
{
"epoch": 0.59,
"grad_norm": 2.8480100631713867,
"learning_rate": 7.470267954881642e-07,
"loss": 0.1249,
"step": 2174
},
{
"epoch": 0.59,
"grad_norm": 2.9832444190979004,
"learning_rate": 7.461707236812041e-07,
"loss": 0.1369,
"step": 2175
},
{
"epoch": 0.59,
"grad_norm": 2.8048830032348633,
"learning_rate": 7.453148506597529e-07,
"loss": 0.1201,
"step": 2176
},
{
"epoch": 0.59,
"grad_norm": 2.626363515853882,
"learning_rate": 7.444591770940852e-07,
"loss": 0.1115,
"step": 2177
},
{
"epoch": 0.59,
"grad_norm": 2.7009074687957764,
"learning_rate": 7.436037036543183e-07,
"loss": 0.1205,
"step": 2178
},
{
"epoch": 0.6,
"grad_norm": 2.7532544136047363,
"learning_rate": 7.427484310104135e-07,
"loss": 0.114,
"step": 2179
},
{
"epoch": 0.6,
"grad_norm": 2.7397122383117676,
"learning_rate": 7.41893359832174e-07,
"loss": 0.1201,
"step": 2180
},
{
"epoch": 0.6,
"grad_norm": 2.975896120071411,
"learning_rate": 7.410384907892461e-07,
"loss": 0.1218,
"step": 2181
},
{
"epoch": 0.6,
"grad_norm": 2.816995620727539,
"learning_rate": 7.401838245511181e-07,
"loss": 0.1258,
"step": 2182
},
{
"epoch": 0.6,
"grad_norm": 2.8698947429656982,
"learning_rate": 7.393293617871179e-07,
"loss": 0.1315,
"step": 2183
},
{
"epoch": 0.6,
"grad_norm": 2.6808691024780273,
"learning_rate": 7.384751031664158e-07,
"loss": 0.1219,
"step": 2184
},
{
"epoch": 0.6,
"grad_norm": 2.5810961723327637,
"learning_rate": 7.376210493580211e-07,
"loss": 0.1111,
"step": 2185
},
{
"epoch": 0.6,
"grad_norm": 2.4069201946258545,
"learning_rate": 7.367672010307826e-07,
"loss": 0.11,
"step": 2186
},
{
"epoch": 0.6,
"grad_norm": 3.1182568073272705,
"learning_rate": 7.359135588533896e-07,
"loss": 0.1351,
"step": 2187
},
{
"epoch": 0.6,
"grad_norm": 2.869210958480835,
"learning_rate": 7.350601234943683e-07,
"loss": 0.1249,
"step": 2188
},
{
"epoch": 0.6,
"grad_norm": 3.0283706188201904,
"learning_rate": 7.342068956220842e-07,
"loss": 0.1372,
"step": 2189
},
{
"epoch": 0.6,
"grad_norm": 2.760991334915161,
"learning_rate": 7.333538759047389e-07,
"loss": 0.1298,
"step": 2190
},
{
"epoch": 0.6,
"grad_norm": 2.651310443878174,
"learning_rate": 7.32501065010372e-07,
"loss": 0.1134,
"step": 2191
},
{
"epoch": 0.6,
"grad_norm": 2.4590115547180176,
"learning_rate": 7.316484636068601e-07,
"loss": 0.1139,
"step": 2192
},
{
"epoch": 0.6,
"grad_norm": 2.888315200805664,
"learning_rate": 7.307960723619142e-07,
"loss": 0.1244,
"step": 2193
},
{
"epoch": 0.6,
"grad_norm": 2.830096960067749,
"learning_rate": 7.29943891943082e-07,
"loss": 0.1294,
"step": 2194
},
{
"epoch": 0.6,
"grad_norm": 2.517449140548706,
"learning_rate": 7.290919230177454e-07,
"loss": 0.1124,
"step": 2195
},
{
"epoch": 0.6,
"grad_norm": 2.952615261077881,
"learning_rate": 7.282401662531205e-07,
"loss": 0.1371,
"step": 2196
},
{
"epoch": 0.6,
"grad_norm": 2.8877615928649902,
"learning_rate": 7.273886223162586e-07,
"loss": 0.1271,
"step": 2197
},
{
"epoch": 0.6,
"grad_norm": 3.0385706424713135,
"learning_rate": 7.265372918740425e-07,
"loss": 0.1291,
"step": 2198
},
{
"epoch": 0.6,
"grad_norm": 2.778867483139038,
"learning_rate": 7.256861755931894e-07,
"loss": 0.1178,
"step": 2199
},
{
"epoch": 0.6,
"grad_norm": 2.944976568222046,
"learning_rate": 7.24835274140247e-07,
"loss": 0.1254,
"step": 2200
},
{
"epoch": 0.6,
"grad_norm": 2.80108642578125,
"learning_rate": 7.239845881815964e-07,
"loss": 0.1279,
"step": 2201
},
{
"epoch": 0.6,
"grad_norm": 2.6430721282958984,
"learning_rate": 7.231341183834496e-07,
"loss": 0.1106,
"step": 2202
},
{
"epoch": 0.6,
"grad_norm": 2.6731457710266113,
"learning_rate": 7.222838654118487e-07,
"loss": 0.112,
"step": 2203
},
{
"epoch": 0.6,
"grad_norm": 2.8311150074005127,
"learning_rate": 7.214338299326666e-07,
"loss": 0.1208,
"step": 2204
},
{
"epoch": 0.6,
"grad_norm": 2.6590795516967773,
"learning_rate": 7.20584012611605e-07,
"loss": 0.123,
"step": 2205
},
{
"epoch": 0.6,
"grad_norm": 2.7941184043884277,
"learning_rate": 7.197344141141957e-07,
"loss": 0.1184,
"step": 2206
},
{
"epoch": 0.6,
"grad_norm": 2.796651601791382,
"learning_rate": 7.188850351057992e-07,
"loss": 0.1295,
"step": 2207
},
{
"epoch": 0.6,
"grad_norm": 2.8743748664855957,
"learning_rate": 7.180358762516033e-07,
"loss": 0.136,
"step": 2208
},
{
"epoch": 0.6,
"grad_norm": 3.0480072498321533,
"learning_rate": 7.171869382166237e-07,
"loss": 0.1353,
"step": 2209
},
{
"epoch": 0.6,
"grad_norm": 2.6174540519714355,
"learning_rate": 7.163382216657033e-07,
"loss": 0.1106,
"step": 2210
},
{
"epoch": 0.6,
"grad_norm": 2.704287528991699,
"learning_rate": 7.154897272635116e-07,
"loss": 0.1105,
"step": 2211
},
{
"epoch": 0.6,
"grad_norm": 3.0001027584075928,
"learning_rate": 7.146414556745444e-07,
"loss": 0.1249,
"step": 2212
},
{
"epoch": 0.6,
"grad_norm": 2.7849698066711426,
"learning_rate": 7.137934075631218e-07,
"loss": 0.1157,
"step": 2213
},
{
"epoch": 0.6,
"grad_norm": 2.784977436065674,
"learning_rate": 7.129455835933899e-07,
"loss": 0.1213,
"step": 2214
},
{
"epoch": 0.61,
"grad_norm": 2.60339617729187,
"learning_rate": 7.1209798442932e-07,
"loss": 0.1076,
"step": 2215
},
{
"epoch": 0.61,
"grad_norm": 2.718926191329956,
"learning_rate": 7.112506107347052e-07,
"loss": 0.1218,
"step": 2216
},
{
"epoch": 0.61,
"grad_norm": 3.181000232696533,
"learning_rate": 7.104034631731642e-07,
"loss": 0.133,
"step": 2217
},
{
"epoch": 0.61,
"grad_norm": 2.939786911010742,
"learning_rate": 7.095565424081369e-07,
"loss": 0.114,
"step": 2218
},
{
"epoch": 0.61,
"grad_norm": 3.1787872314453125,
"learning_rate": 7.087098491028865e-07,
"loss": 0.1272,
"step": 2219
},
{
"epoch": 0.61,
"grad_norm": 2.8389174938201904,
"learning_rate": 7.078633839204984e-07,
"loss": 0.1197,
"step": 2220
},
{
"epoch": 0.61,
"grad_norm": 3.22316837310791,
"learning_rate": 7.070171475238785e-07,
"loss": 0.1404,
"step": 2221
},
{
"epoch": 0.61,
"grad_norm": 2.7424261569976807,
"learning_rate": 7.061711405757537e-07,
"loss": 0.1129,
"step": 2222
},
{
"epoch": 0.61,
"grad_norm": 2.948007822036743,
"learning_rate": 7.053253637386715e-07,
"loss": 0.1078,
"step": 2223
},
{
"epoch": 0.61,
"grad_norm": 3.12904953956604,
"learning_rate": 7.04479817674999e-07,
"loss": 0.1465,
"step": 2224
},
{
"epoch": 0.61,
"grad_norm": 2.7254724502563477,
"learning_rate": 7.03634503046923e-07,
"loss": 0.1244,
"step": 2225
},
{
"epoch": 0.61,
"grad_norm": 2.7801690101623535,
"learning_rate": 7.027894205164484e-07,
"loss": 0.1188,
"step": 2226
},
{
"epoch": 0.61,
"grad_norm": 2.992872953414917,
"learning_rate": 7.019445707453988e-07,
"loss": 0.1373,
"step": 2227
},
{
"epoch": 0.61,
"grad_norm": 2.7374370098114014,
"learning_rate": 7.01099954395415e-07,
"loss": 0.1178,
"step": 2228
},
{
"epoch": 0.61,
"grad_norm": 2.887143611907959,
"learning_rate": 7.002555721279553e-07,
"loss": 0.1181,
"step": 2229
},
{
"epoch": 0.61,
"grad_norm": 2.7964494228363037,
"learning_rate": 6.994114246042955e-07,
"loss": 0.1256,
"step": 2230
},
{
"epoch": 0.61,
"grad_norm": 2.726381778717041,
"learning_rate": 6.985675124855259e-07,
"loss": 0.1188,
"step": 2231
},
{
"epoch": 0.61,
"grad_norm": 2.7800586223602295,
"learning_rate": 6.977238364325539e-07,
"loss": 0.1193,
"step": 2232
},
{
"epoch": 0.61,
"grad_norm": 2.6497507095336914,
"learning_rate": 6.96880397106101e-07,
"loss": 0.1149,
"step": 2233
},
{
"epoch": 0.61,
"grad_norm": 2.7353129386901855,
"learning_rate": 6.960371951667036e-07,
"loss": 0.1292,
"step": 2234
},
{
"epoch": 0.61,
"grad_norm": 2.649967670440674,
"learning_rate": 6.951942312747134e-07,
"loss": 0.1086,
"step": 2235
},
{
"epoch": 0.61,
"grad_norm": 2.7503740787506104,
"learning_rate": 6.943515060902935e-07,
"loss": 0.1278,
"step": 2236
},
{
"epoch": 0.61,
"grad_norm": 2.9312124252319336,
"learning_rate": 6.93509020273422e-07,
"loss": 0.1215,
"step": 2237
},
{
"epoch": 0.61,
"grad_norm": 2.8223354816436768,
"learning_rate": 6.926667744838881e-07,
"loss": 0.127,
"step": 2238
},
{
"epoch": 0.61,
"grad_norm": 2.670806646347046,
"learning_rate": 6.918247693812936e-07,
"loss": 0.1347,
"step": 2239
},
{
"epoch": 0.61,
"grad_norm": 2.6457080841064453,
"learning_rate": 6.909830056250526e-07,
"loss": 0.1146,
"step": 2240
},
{
"epoch": 0.61,
"grad_norm": 2.948946714401245,
"learning_rate": 6.901414838743886e-07,
"loss": 0.1344,
"step": 2241
},
{
"epoch": 0.61,
"grad_norm": 2.909689426422119,
"learning_rate": 6.893002047883372e-07,
"loss": 0.1219,
"step": 2242
},
{
"epoch": 0.61,
"grad_norm": 2.824042558670044,
"learning_rate": 6.884591690257425e-07,
"loss": 0.1231,
"step": 2243
},
{
"epoch": 0.61,
"grad_norm": 2.743844985961914,
"learning_rate": 6.876183772452587e-07,
"loss": 0.1161,
"step": 2244
},
{
"epoch": 0.61,
"grad_norm": 2.7554304599761963,
"learning_rate": 6.867778301053495e-07,
"loss": 0.1207,
"step": 2245
},
{
"epoch": 0.61,
"grad_norm": 3.1746580600738525,
"learning_rate": 6.85937528264286e-07,
"loss": 0.1354,
"step": 2246
},
{
"epoch": 0.61,
"grad_norm": 2.7574260234832764,
"learning_rate": 6.850974723801479e-07,
"loss": 0.1264,
"step": 2247
},
{
"epoch": 0.61,
"grad_norm": 2.657491683959961,
"learning_rate": 6.842576631108219e-07,
"loss": 0.1035,
"step": 2248
},
{
"epoch": 0.61,
"grad_norm": 2.733588218688965,
"learning_rate": 6.834181011140014e-07,
"loss": 0.1119,
"step": 2249
},
{
"epoch": 0.61,
"grad_norm": 2.7875351905822754,
"learning_rate": 6.825787870471872e-07,
"loss": 0.1282,
"step": 2250
},
{
"epoch": 0.61,
"grad_norm": 2.7033324241638184,
"learning_rate": 6.817397215676845e-07,
"loss": 0.1197,
"step": 2251
},
{
"epoch": 0.62,
"grad_norm": 3.04990291595459,
"learning_rate": 6.809009053326049e-07,
"loss": 0.1303,
"step": 2252
},
{
"epoch": 0.62,
"grad_norm": 2.886972188949585,
"learning_rate": 6.800623389988641e-07,
"loss": 0.1342,
"step": 2253
},
{
"epoch": 0.62,
"grad_norm": 2.8929286003112793,
"learning_rate": 6.792240232231821e-07,
"loss": 0.1216,
"step": 2254
},
{
"epoch": 0.62,
"grad_norm": 2.6549034118652344,
"learning_rate": 6.783859586620839e-07,
"loss": 0.1084,
"step": 2255
},
{
"epoch": 0.62,
"grad_norm": 2.9380385875701904,
"learning_rate": 6.775481459718959e-07,
"loss": 0.1389,
"step": 2256
},
{
"epoch": 0.62,
"grad_norm": 2.8192174434661865,
"learning_rate": 6.767105858087489e-07,
"loss": 0.1221,
"step": 2257
},
{
"epoch": 0.62,
"grad_norm": 2.571758270263672,
"learning_rate": 6.758732788285746e-07,
"loss": 0.1193,
"step": 2258
},
{
"epoch": 0.62,
"grad_norm": 2.8911731243133545,
"learning_rate": 6.750362256871074e-07,
"loss": 0.1301,
"step": 2259
},
{
"epoch": 0.62,
"grad_norm": 2.6638855934143066,
"learning_rate": 6.741994270398825e-07,
"loss": 0.1195,
"step": 2260
},
{
"epoch": 0.62,
"grad_norm": 2.737165927886963,
"learning_rate": 6.733628835422358e-07,
"loss": 0.1212,
"step": 2261
},
{
"epoch": 0.62,
"grad_norm": 2.893892526626587,
"learning_rate": 6.725265958493034e-07,
"loss": 0.1329,
"step": 2262
},
{
"epoch": 0.62,
"grad_norm": 2.9284043312072754,
"learning_rate": 6.716905646160208e-07,
"loss": 0.1224,
"step": 2263
},
{
"epoch": 0.62,
"grad_norm": 2.7682137489318848,
"learning_rate": 6.708547904971233e-07,
"loss": 0.1164,
"step": 2264
},
{
"epoch": 0.62,
"grad_norm": 2.6199045181274414,
"learning_rate": 6.700192741471446e-07,
"loss": 0.1106,
"step": 2265
},
{
"epoch": 0.62,
"grad_norm": 2.7449543476104736,
"learning_rate": 6.691840162204161e-07,
"loss": 0.1181,
"step": 2266
},
{
"epoch": 0.62,
"grad_norm": 3.066565990447998,
"learning_rate": 6.683490173710673e-07,
"loss": 0.129,
"step": 2267
},
{
"epoch": 0.62,
"grad_norm": 2.739189624786377,
"learning_rate": 6.675142782530241e-07,
"loss": 0.1096,
"step": 2268
},
{
"epoch": 0.62,
"grad_norm": 2.745358467102051,
"learning_rate": 6.6667979952001e-07,
"loss": 0.1259,
"step": 2269
},
{
"epoch": 0.62,
"grad_norm": 2.9944846630096436,
"learning_rate": 6.658455818255444e-07,
"loss": 0.1265,
"step": 2270
},
{
"epoch": 0.62,
"grad_norm": 2.9806787967681885,
"learning_rate": 6.650116258229414e-07,
"loss": 0.1276,
"step": 2271
},
{
"epoch": 0.62,
"grad_norm": 2.8522584438323975,
"learning_rate": 6.641779321653108e-07,
"loss": 0.1233,
"step": 2272
},
{
"epoch": 0.62,
"grad_norm": 2.688368558883667,
"learning_rate": 6.633445015055574e-07,
"loss": 0.1166,
"step": 2273
},
{
"epoch": 0.62,
"grad_norm": 2.6915407180786133,
"learning_rate": 6.625113344963787e-07,
"loss": 0.1152,
"step": 2274
},
{
"epoch": 0.62,
"grad_norm": 2.67826247215271,
"learning_rate": 6.616784317902673e-07,
"loss": 0.125,
"step": 2275
},
{
"epoch": 0.62,
"grad_norm": 2.910579204559326,
"learning_rate": 6.608457940395075e-07,
"loss": 0.1248,
"step": 2276
},
{
"epoch": 0.62,
"grad_norm": 2.5989644527435303,
"learning_rate": 6.600134218961764e-07,
"loss": 0.12,
"step": 2277
},
{
"epoch": 0.62,
"grad_norm": 2.681771993637085,
"learning_rate": 6.591813160121444e-07,
"loss": 0.1236,
"step": 2278
},
{
"epoch": 0.62,
"grad_norm": 2.7182021141052246,
"learning_rate": 6.583494770390713e-07,
"loss": 0.1242,
"step": 2279
},
{
"epoch": 0.62,
"grad_norm": 2.9027819633483887,
"learning_rate": 6.575179056284095e-07,
"loss": 0.1234,
"step": 2280
},
{
"epoch": 0.62,
"grad_norm": 2.901743173599243,
"learning_rate": 6.566866024314007e-07,
"loss": 0.1412,
"step": 2281
},
{
"epoch": 0.62,
"grad_norm": 2.7416176795959473,
"learning_rate": 6.558555680990771e-07,
"loss": 0.1088,
"step": 2282
},
{
"epoch": 0.62,
"grad_norm": 2.6983866691589355,
"learning_rate": 6.550248032822612e-07,
"loss": 0.1332,
"step": 2283
},
{
"epoch": 0.62,
"grad_norm": 2.726213216781616,
"learning_rate": 6.541943086315625e-07,
"loss": 0.121,
"step": 2284
},
{
"epoch": 0.62,
"grad_norm": 2.6889567375183105,
"learning_rate": 6.533640847973808e-07,
"loss": 0.1182,
"step": 2285
},
{
"epoch": 0.62,
"grad_norm": 2.8857784271240234,
"learning_rate": 6.525341324299023e-07,
"loss": 0.118,
"step": 2286
},
{
"epoch": 0.62,
"grad_norm": 2.8600974082946777,
"learning_rate": 6.517044521791015e-07,
"loss": 0.1364,
"step": 2287
},
{
"epoch": 0.63,
"grad_norm": 3.007960557937622,
"learning_rate": 6.5087504469474e-07,
"loss": 0.1312,
"step": 2288
},
{
"epoch": 0.63,
"grad_norm": 2.543405771255493,
"learning_rate": 6.500459106263649e-07,
"loss": 0.1056,
"step": 2289
},
{
"epoch": 0.63,
"grad_norm": 2.672752857208252,
"learning_rate": 6.492170506233099e-07,
"loss": 0.1115,
"step": 2290
},
{
"epoch": 0.63,
"grad_norm": 2.805952310562134,
"learning_rate": 6.483884653346936e-07,
"loss": 0.1235,
"step": 2291
},
{
"epoch": 0.63,
"grad_norm": 2.4604685306549072,
"learning_rate": 6.475601554094196e-07,
"loss": 0.1159,
"step": 2292
},
{
"epoch": 0.63,
"grad_norm": 2.747077226638794,
"learning_rate": 6.467321214961765e-07,
"loss": 0.1313,
"step": 2293
},
{
"epoch": 0.63,
"grad_norm": 2.718703269958496,
"learning_rate": 6.459043642434355e-07,
"loss": 0.1281,
"step": 2294
},
{
"epoch": 0.63,
"grad_norm": 2.6458544731140137,
"learning_rate": 6.450768842994522e-07,
"loss": 0.116,
"step": 2295
},
{
"epoch": 0.63,
"grad_norm": 2.7539422512054443,
"learning_rate": 6.442496823122643e-07,
"loss": 0.1172,
"step": 2296
},
{
"epoch": 0.63,
"grad_norm": 3.0029823780059814,
"learning_rate": 6.434227589296921e-07,
"loss": 0.1219,
"step": 2297
},
{
"epoch": 0.63,
"grad_norm": 2.757765054702759,
"learning_rate": 6.425961147993384e-07,
"loss": 0.1249,
"step": 2298
},
{
"epoch": 0.63,
"grad_norm": 2.759751319885254,
"learning_rate": 6.417697505685859e-07,
"loss": 0.1235,
"step": 2299
},
{
"epoch": 0.63,
"grad_norm": 2.6895663738250732,
"learning_rate": 6.409436668845996e-07,
"loss": 0.1117,
"step": 2300
},
{
"epoch": 0.63,
"grad_norm": 2.910151243209839,
"learning_rate": 6.401178643943233e-07,
"loss": 0.131,
"step": 2301
},
{
"epoch": 0.63,
"grad_norm": 2.927711248397827,
"learning_rate": 6.392923437444815e-07,
"loss": 0.117,
"step": 2302
},
{
"epoch": 0.63,
"grad_norm": 2.8698267936706543,
"learning_rate": 6.384671055815782e-07,
"loss": 0.1251,
"step": 2303
},
{
"epoch": 0.63,
"grad_norm": 3.05301833152771,
"learning_rate": 6.376421505518954e-07,
"loss": 0.1375,
"step": 2304
},
{
"epoch": 0.63,
"grad_norm": 2.545640468597412,
"learning_rate": 6.368174793014943e-07,
"loss": 0.1141,
"step": 2305
},
{
"epoch": 0.63,
"grad_norm": 2.779310703277588,
"learning_rate": 6.359930924762122e-07,
"loss": 0.1162,
"step": 2306
},
{
"epoch": 0.63,
"grad_norm": 2.912362575531006,
"learning_rate": 6.351689907216657e-07,
"loss": 0.1192,
"step": 2307
},
{
"epoch": 0.63,
"grad_norm": 2.7148022651672363,
"learning_rate": 6.343451746832471e-07,
"loss": 0.1133,
"step": 2308
},
{
"epoch": 0.63,
"grad_norm": 2.3892126083374023,
"learning_rate": 6.335216450061247e-07,
"loss": 0.112,
"step": 2309
},
{
"epoch": 0.63,
"grad_norm": 2.9409687519073486,
"learning_rate": 6.326984023352434e-07,
"loss": 0.1256,
"step": 2310
},
{
"epoch": 0.63,
"grad_norm": 2.7799158096313477,
"learning_rate": 6.31875447315322e-07,
"loss": 0.1267,
"step": 2311
},
{
"epoch": 0.63,
"grad_norm": 2.873560905456543,
"learning_rate": 6.310527805908556e-07,
"loss": 0.1264,
"step": 2312
},
{
"epoch": 0.63,
"grad_norm": 2.6879260540008545,
"learning_rate": 6.302304028061125e-07,
"loss": 0.1344,
"step": 2313
},
{
"epoch": 0.63,
"grad_norm": 2.8451976776123047,
"learning_rate": 6.29408314605135e-07,
"loss": 0.1267,
"step": 2314
},
{
"epoch": 0.63,
"grad_norm": 2.7981550693511963,
"learning_rate": 6.285865166317386e-07,
"loss": 0.1287,
"step": 2315
},
{
"epoch": 0.63,
"grad_norm": 2.8838305473327637,
"learning_rate": 6.277650095295112e-07,
"loss": 0.1207,
"step": 2316
},
{
"epoch": 0.63,
"grad_norm": 2.844289779663086,
"learning_rate": 6.269437939418136e-07,
"loss": 0.1218,
"step": 2317
},
{
"epoch": 0.63,
"grad_norm": 2.7332122325897217,
"learning_rate": 6.26122870511778e-07,
"loss": 0.1265,
"step": 2318
},
{
"epoch": 0.63,
"grad_norm": 2.9675590991973877,
"learning_rate": 6.253022398823075e-07,
"loss": 0.1281,
"step": 2319
},
{
"epoch": 0.63,
"grad_norm": 2.5885355472564697,
"learning_rate": 6.244819026960761e-07,
"loss": 0.1178,
"step": 2320
},
{
"epoch": 0.63,
"grad_norm": 2.498624086380005,
"learning_rate": 6.236618595955277e-07,
"loss": 0.1143,
"step": 2321
},
{
"epoch": 0.63,
"grad_norm": 2.7661285400390625,
"learning_rate": 6.228421112228767e-07,
"loss": 0.1229,
"step": 2322
},
{
"epoch": 0.63,
"grad_norm": 2.705780029296875,
"learning_rate": 6.220226582201061e-07,
"loss": 0.1307,
"step": 2323
},
{
"epoch": 0.63,
"grad_norm": 2.7333617210388184,
"learning_rate": 6.212035012289674e-07,
"loss": 0.1181,
"step": 2324
},
{
"epoch": 0.64,
"grad_norm": 2.924672842025757,
"learning_rate": 6.203846408909808e-07,
"loss": 0.1299,
"step": 2325
},
{
"epoch": 0.64,
"grad_norm": 2.5709447860717773,
"learning_rate": 6.195660778474334e-07,
"loss": 0.1165,
"step": 2326
},
{
"epoch": 0.64,
"grad_norm": 2.628356695175171,
"learning_rate": 6.187478127393806e-07,
"loss": 0.1154,
"step": 2327
},
{
"epoch": 0.64,
"grad_norm": 2.796377420425415,
"learning_rate": 6.179298462076437e-07,
"loss": 0.1121,
"step": 2328
},
{
"epoch": 0.64,
"grad_norm": 2.963430166244507,
"learning_rate": 6.1711217889281e-07,
"loss": 0.1289,
"step": 2329
},
{
"epoch": 0.64,
"grad_norm": 2.8228838443756104,
"learning_rate": 6.162948114352328e-07,
"loss": 0.1259,
"step": 2330
},
{
"epoch": 0.64,
"grad_norm": 2.864867687225342,
"learning_rate": 6.154777444750312e-07,
"loss": 0.13,
"step": 2331
},
{
"epoch": 0.64,
"grad_norm": 2.9602138996124268,
"learning_rate": 6.146609786520877e-07,
"loss": 0.1195,
"step": 2332
},
{
"epoch": 0.64,
"grad_norm": 2.576875925064087,
"learning_rate": 6.1384451460605e-07,
"loss": 0.1136,
"step": 2333
},
{
"epoch": 0.64,
"grad_norm": 2.717334032058716,
"learning_rate": 6.130283529763286e-07,
"loss": 0.1239,
"step": 2334
},
{
"epoch": 0.64,
"grad_norm": 2.629843235015869,
"learning_rate": 6.122124944020977e-07,
"loss": 0.1163,
"step": 2335
},
{
"epoch": 0.64,
"grad_norm": 2.5194149017333984,
"learning_rate": 6.113969395222948e-07,
"loss": 0.1007,
"step": 2336
},
{
"epoch": 0.64,
"grad_norm": 2.5218937397003174,
"learning_rate": 6.105816889756179e-07,
"loss": 0.1052,
"step": 2337
},
{
"epoch": 0.64,
"grad_norm": 2.9415934085845947,
"learning_rate": 6.097667434005285e-07,
"loss": 0.1188,
"step": 2338
},
{
"epoch": 0.64,
"grad_norm": 2.800436019897461,
"learning_rate": 6.089521034352474e-07,
"loss": 0.1134,
"step": 2339
},
{
"epoch": 0.64,
"grad_norm": 3.1739115715026855,
"learning_rate": 6.081377697177576e-07,
"loss": 0.1232,
"step": 2340
},
{
"epoch": 0.64,
"grad_norm": 2.7899343967437744,
"learning_rate": 6.073237428858019e-07,
"loss": 0.1195,
"step": 2341
},
{
"epoch": 0.64,
"grad_norm": 2.8649964332580566,
"learning_rate": 6.06510023576882e-07,
"loss": 0.1286,
"step": 2342
},
{
"epoch": 0.64,
"grad_norm": 2.611206531524658,
"learning_rate": 6.0569661242826e-07,
"loss": 0.1059,
"step": 2343
},
{
"epoch": 0.64,
"grad_norm": 2.775590658187866,
"learning_rate": 6.048835100769555e-07,
"loss": 0.1102,
"step": 2344
},
{
"epoch": 0.64,
"grad_norm": 2.5424020290374756,
"learning_rate": 6.040707171597465e-07,
"loss": 0.1076,
"step": 2345
},
{
"epoch": 0.64,
"grad_norm": 2.8652491569519043,
"learning_rate": 6.032582343131698e-07,
"loss": 0.1248,
"step": 2346
},
{
"epoch": 0.64,
"grad_norm": 2.5659899711608887,
"learning_rate": 6.024460621735179e-07,
"loss": 0.1187,
"step": 2347
},
{
"epoch": 0.64,
"grad_norm": 2.6558399200439453,
"learning_rate": 6.016342013768407e-07,
"loss": 0.1201,
"step": 2348
},
{
"epoch": 0.64,
"grad_norm": 2.8213798999786377,
"learning_rate": 6.00822652558944e-07,
"loss": 0.1217,
"step": 2349
},
{
"epoch": 0.64,
"grad_norm": 3.105112075805664,
"learning_rate": 6.000114163553893e-07,
"loss": 0.1288,
"step": 2350
},
{
"epoch": 0.64,
"grad_norm": 2.8976495265960693,
"learning_rate": 5.99200493401494e-07,
"loss": 0.1345,
"step": 2351
},
{
"epoch": 0.64,
"grad_norm": 3.064840316772461,
"learning_rate": 5.983898843323291e-07,
"loss": 0.1432,
"step": 2352
},
{
"epoch": 0.64,
"grad_norm": 2.7999205589294434,
"learning_rate": 5.975795897827205e-07,
"loss": 0.1311,
"step": 2353
},
{
"epoch": 0.64,
"grad_norm": 2.657651424407959,
"learning_rate": 5.967696103872471e-07,
"loss": 0.1074,
"step": 2354
},
{
"epoch": 0.64,
"grad_norm": 3.456746816635132,
"learning_rate": 5.959599467802417e-07,
"loss": 0.131,
"step": 2355
},
{
"epoch": 0.64,
"grad_norm": 2.8039300441741943,
"learning_rate": 5.951505995957899e-07,
"loss": 0.1156,
"step": 2356
},
{
"epoch": 0.64,
"grad_norm": 2.621748447418213,
"learning_rate": 5.943415694677285e-07,
"loss": 0.1154,
"step": 2357
},
{
"epoch": 0.64,
"grad_norm": 3.092664957046509,
"learning_rate": 5.935328570296472e-07,
"loss": 0.1358,
"step": 2358
},
{
"epoch": 0.64,
"grad_norm": 2.7029240131378174,
"learning_rate": 5.927244629148854e-07,
"loss": 0.1096,
"step": 2359
},
{
"epoch": 0.64,
"grad_norm": 2.715873956680298,
"learning_rate": 5.919163877565349e-07,
"loss": 0.115,
"step": 2360
},
{
"epoch": 0.64,
"grad_norm": 2.766658306121826,
"learning_rate": 5.911086321874371e-07,
"loss": 0.1164,
"step": 2361
},
{
"epoch": 0.65,
"grad_norm": 2.7069411277770996,
"learning_rate": 5.903011968401823e-07,
"loss": 0.1233,
"step": 2362
},
{
"epoch": 0.65,
"grad_norm": 2.9014713764190674,
"learning_rate": 5.894940823471112e-07,
"loss": 0.1331,
"step": 2363
},
{
"epoch": 0.65,
"grad_norm": 2.8393664360046387,
"learning_rate": 5.886872893403118e-07,
"loss": 0.115,
"step": 2364
},
{
"epoch": 0.65,
"grad_norm": 2.6533043384552,
"learning_rate": 5.878808184516224e-07,
"loss": 0.113,
"step": 2365
},
{
"epoch": 0.65,
"grad_norm": 2.948765993118286,
"learning_rate": 5.870746703126272e-07,
"loss": 0.1353,
"step": 2366
},
{
"epoch": 0.65,
"grad_norm": 3.019033670425415,
"learning_rate": 5.862688455546585e-07,
"loss": 0.1352,
"step": 2367
},
{
"epoch": 0.65,
"grad_norm": 2.90289044380188,
"learning_rate": 5.854633448087951e-07,
"loss": 0.1364,
"step": 2368
},
{
"epoch": 0.65,
"grad_norm": 2.740635871887207,
"learning_rate": 5.846581687058616e-07,
"loss": 0.1271,
"step": 2369
},
{
"epoch": 0.65,
"grad_norm": 2.637136459350586,
"learning_rate": 5.838533178764294e-07,
"loss": 0.1151,
"step": 2370
},
{
"epoch": 0.65,
"grad_norm": 2.9797353744506836,
"learning_rate": 5.830487929508147e-07,
"loss": 0.1489,
"step": 2371
},
{
"epoch": 0.65,
"grad_norm": 2.595749855041504,
"learning_rate": 5.82244594559078e-07,
"loss": 0.1263,
"step": 2372
},
{
"epoch": 0.65,
"grad_norm": 2.810940742492676,
"learning_rate": 5.814407233310248e-07,
"loss": 0.1234,
"step": 2373
},
{
"epoch": 0.65,
"grad_norm": 2.7239415645599365,
"learning_rate": 5.806371798962039e-07,
"loss": 0.1184,
"step": 2374
},
{
"epoch": 0.65,
"grad_norm": 2.6839990615844727,
"learning_rate": 5.798339648839073e-07,
"loss": 0.1225,
"step": 2375
},
{
"epoch": 0.65,
"grad_norm": 2.758370876312256,
"learning_rate": 5.790310789231703e-07,
"loss": 0.1281,
"step": 2376
},
{
"epoch": 0.65,
"grad_norm": 2.836064577102661,
"learning_rate": 5.782285226427699e-07,
"loss": 0.1255,
"step": 2377
},
{
"epoch": 0.65,
"grad_norm": 2.8172318935394287,
"learning_rate": 5.774262966712258e-07,
"loss": 0.1118,
"step": 2378
},
{
"epoch": 0.65,
"grad_norm": 2.9178755283355713,
"learning_rate": 5.766244016367981e-07,
"loss": 0.1438,
"step": 2379
},
{
"epoch": 0.65,
"grad_norm": 2.657327890396118,
"learning_rate": 5.758228381674878e-07,
"loss": 0.1161,
"step": 2380
},
{
"epoch": 0.65,
"grad_norm": 2.856813669204712,
"learning_rate": 5.750216068910374e-07,
"loss": 0.143,
"step": 2381
},
{
"epoch": 0.65,
"grad_norm": 2.58343243598938,
"learning_rate": 5.742207084349273e-07,
"loss": 0.1159,
"step": 2382
},
{
"epoch": 0.65,
"grad_norm": 2.7350480556488037,
"learning_rate": 5.734201434263792e-07,
"loss": 0.1337,
"step": 2383
},
{
"epoch": 0.65,
"grad_norm": 2.882652997970581,
"learning_rate": 5.726199124923526e-07,
"loss": 0.1339,
"step": 2384
},
{
"epoch": 0.65,
"grad_norm": 2.671844482421875,
"learning_rate": 5.718200162595448e-07,
"loss": 0.1202,
"step": 2385
},
{
"epoch": 0.65,
"grad_norm": 2.901212215423584,
"learning_rate": 5.710204553543927e-07,
"loss": 0.1299,
"step": 2386
},
{
"epoch": 0.65,
"grad_norm": 2.746882200241089,
"learning_rate": 5.702212304030689e-07,
"loss": 0.1198,
"step": 2387
},
{
"epoch": 0.65,
"grad_norm": 2.987044095993042,
"learning_rate": 5.694223420314845e-07,
"loss": 0.1174,
"step": 2388
},
{
"epoch": 0.65,
"grad_norm": 2.5937628746032715,
"learning_rate": 5.686237908652854e-07,
"loss": 0.1078,
"step": 2389
},
{
"epoch": 0.65,
"grad_norm": 2.7164485454559326,
"learning_rate": 5.678255775298542e-07,
"loss": 0.1222,
"step": 2390
},
{
"epoch": 0.65,
"grad_norm": 2.930262804031372,
"learning_rate": 5.670277026503092e-07,
"loss": 0.1263,
"step": 2391
},
{
"epoch": 0.65,
"grad_norm": 3.0716938972473145,
"learning_rate": 5.662301668515029e-07,
"loss": 0.1377,
"step": 2392
},
{
"epoch": 0.65,
"grad_norm": 2.950962543487549,
"learning_rate": 5.654329707580232e-07,
"loss": 0.1325,
"step": 2393
},
{
"epoch": 0.65,
"grad_norm": 2.8141679763793945,
"learning_rate": 5.646361149941911e-07,
"loss": 0.1124,
"step": 2394
},
{
"epoch": 0.65,
"grad_norm": 2.8364176750183105,
"learning_rate": 5.638396001840612e-07,
"loss": 0.1165,
"step": 2395
},
{
"epoch": 0.65,
"grad_norm": 2.9714386463165283,
"learning_rate": 5.630434269514218e-07,
"loss": 0.1164,
"step": 2396
},
{
"epoch": 0.65,
"grad_norm": 2.8221168518066406,
"learning_rate": 5.622475959197925e-07,
"loss": 0.1156,
"step": 2397
},
{
"epoch": 0.66,
"grad_norm": 2.6841065883636475,
"learning_rate": 5.614521077124266e-07,
"loss": 0.1061,
"step": 2398
},
{
"epoch": 0.66,
"grad_norm": 2.5104100704193115,
"learning_rate": 5.606569629523072e-07,
"loss": 0.106,
"step": 2399
},
{
"epoch": 0.66,
"grad_norm": 3.1628897190093994,
"learning_rate": 5.598621622621489e-07,
"loss": 0.1297,
"step": 2400
},
{
"epoch": 0.66,
"grad_norm": 3.006103992462158,
"learning_rate": 5.590677062643976e-07,
"loss": 0.1284,
"step": 2401
},
{
"epoch": 0.66,
"grad_norm": 2.9417686462402344,
"learning_rate": 5.582735955812283e-07,
"loss": 0.1252,
"step": 2402
},
{
"epoch": 0.66,
"grad_norm": 2.9639692306518555,
"learning_rate": 5.574798308345468e-07,
"loss": 0.1342,
"step": 2403
},
{
"epoch": 0.66,
"grad_norm": 2.877833366394043,
"learning_rate": 5.566864126459863e-07,
"loss": 0.1246,
"step": 2404
},
{
"epoch": 0.66,
"grad_norm": 2.974093198776245,
"learning_rate": 5.558933416369097e-07,
"loss": 0.1227,
"step": 2405
},
{
"epoch": 0.66,
"grad_norm": 2.7533552646636963,
"learning_rate": 5.551006184284082e-07,
"loss": 0.125,
"step": 2406
},
{
"epoch": 0.66,
"grad_norm": 2.908942461013794,
"learning_rate": 5.543082436412994e-07,
"loss": 0.1246,
"step": 2407
},
{
"epoch": 0.66,
"grad_norm": 2.829738140106201,
"learning_rate": 5.535162178961299e-07,
"loss": 0.1216,
"step": 2408
},
{
"epoch": 0.66,
"grad_norm": 2.8664655685424805,
"learning_rate": 5.527245418131713e-07,
"loss": 0.1132,
"step": 2409
},
{
"epoch": 0.66,
"grad_norm": 2.7472426891326904,
"learning_rate": 5.519332160124215e-07,
"loss": 0.1195,
"step": 2410
},
{
"epoch": 0.66,
"grad_norm": 2.919637441635132,
"learning_rate": 5.511422411136056e-07,
"loss": 0.126,
"step": 2411
},
{
"epoch": 0.66,
"grad_norm": 2.673835039138794,
"learning_rate": 5.503516177361717e-07,
"loss": 0.1224,
"step": 2412
},
{
"epoch": 0.66,
"grad_norm": 2.6640067100524902,
"learning_rate": 5.495613464992943e-07,
"loss": 0.1164,
"step": 2413
},
{
"epoch": 0.66,
"grad_norm": 2.688185214996338,
"learning_rate": 5.487714280218722e-07,
"loss": 0.1049,
"step": 2414
},
{
"epoch": 0.66,
"grad_norm": 2.598675012588501,
"learning_rate": 5.479818629225259e-07,
"loss": 0.106,
"step": 2415
},
{
"epoch": 0.66,
"grad_norm": 2.9329745769500732,
"learning_rate": 5.471926518196017e-07,
"loss": 0.1236,
"step": 2416
},
{
"epoch": 0.66,
"grad_norm": 3.0776431560516357,
"learning_rate": 5.464037953311667e-07,
"loss": 0.1253,
"step": 2417
},
{
"epoch": 0.66,
"grad_norm": 2.7781357765197754,
"learning_rate": 5.456152940750113e-07,
"loss": 0.1181,
"step": 2418
},
{
"epoch": 0.66,
"grad_norm": 2.645616054534912,
"learning_rate": 5.448271486686486e-07,
"loss": 0.1118,
"step": 2419
},
{
"epoch": 0.66,
"grad_norm": 2.6715989112854004,
"learning_rate": 5.440393597293102e-07,
"loss": 0.1135,
"step": 2420
},
{
"epoch": 0.66,
"grad_norm": 2.7128186225891113,
"learning_rate": 5.432519278739514e-07,
"loss": 0.1079,
"step": 2421
},
{
"epoch": 0.66,
"grad_norm": 2.6940815448760986,
"learning_rate": 5.42464853719246e-07,
"loss": 0.1138,
"step": 2422
},
{
"epoch": 0.66,
"grad_norm": 2.902479410171509,
"learning_rate": 5.416781378815885e-07,
"loss": 0.1217,
"step": 2423
},
{
"epoch": 0.66,
"grad_norm": 2.772676944732666,
"learning_rate": 5.408917809770938e-07,
"loss": 0.118,
"step": 2424
},
{
"epoch": 0.66,
"grad_norm": 2.912195920944214,
"learning_rate": 5.401057836215927e-07,
"loss": 0.1201,
"step": 2425
},
{
"epoch": 0.66,
"grad_norm": 2.821068286895752,
"learning_rate": 5.393201464306378e-07,
"loss": 0.1235,
"step": 2426
},
{
"epoch": 0.66,
"grad_norm": 2.5337672233581543,
"learning_rate": 5.38534870019497e-07,
"loss": 0.1128,
"step": 2427
},
{
"epoch": 0.66,
"grad_norm": 2.879474639892578,
"learning_rate": 5.377499550031572e-07,
"loss": 0.1218,
"step": 2428
},
{
"epoch": 0.66,
"grad_norm": 2.6883537769317627,
"learning_rate": 5.369654019963228e-07,
"loss": 0.109,
"step": 2429
},
{
"epoch": 0.66,
"grad_norm": 2.7536401748657227,
"learning_rate": 5.361812116134121e-07,
"loss": 0.1268,
"step": 2430
},
{
"epoch": 0.66,
"grad_norm": 2.6968419551849365,
"learning_rate": 5.35397384468562e-07,
"loss": 0.1168,
"step": 2431
},
{
"epoch": 0.66,
"grad_norm": 2.8103954792022705,
"learning_rate": 5.346139211756236e-07,
"loss": 0.111,
"step": 2432
},
{
"epoch": 0.66,
"grad_norm": 2.854973793029785,
"learning_rate": 5.338308223481637e-07,
"loss": 0.1319,
"step": 2433
},
{
"epoch": 0.66,
"grad_norm": 2.7689507007598877,
"learning_rate": 5.330480885994639e-07,
"loss": 0.1263,
"step": 2434
},
{
"epoch": 0.67,
"grad_norm": 2.8621950149536133,
"learning_rate": 5.322657205425183e-07,
"loss": 0.1284,
"step": 2435
},
{
"epoch": 0.67,
"grad_norm": 2.8837008476257324,
"learning_rate": 5.314837187900366e-07,
"loss": 0.1369,
"step": 2436
},
{
"epoch": 0.67,
"grad_norm": 2.7992589473724365,
"learning_rate": 5.307020839544398e-07,
"loss": 0.1098,
"step": 2437
},
{
"epoch": 0.67,
"grad_norm": 2.9363605976104736,
"learning_rate": 5.299208166478632e-07,
"loss": 0.1278,
"step": 2438
},
{
"epoch": 0.67,
"grad_norm": 2.9051101207733154,
"learning_rate": 5.291399174821538e-07,
"loss": 0.1304,
"step": 2439
},
{
"epoch": 0.67,
"grad_norm": 2.629927635192871,
"learning_rate": 5.283593870688697e-07,
"loss": 0.1085,
"step": 2440
},
{
"epoch": 0.67,
"grad_norm": 2.664454936981201,
"learning_rate": 5.275792260192804e-07,
"loss": 0.1234,
"step": 2441
},
{
"epoch": 0.67,
"grad_norm": 3.012160301208496,
"learning_rate": 5.267994349443661e-07,
"loss": 0.1287,
"step": 2442
},
{
"epoch": 0.67,
"grad_norm": 3.0345184803009033,
"learning_rate": 5.260200144548177e-07,
"loss": 0.1313,
"step": 2443
},
{
"epoch": 0.67,
"grad_norm": 3.057971715927124,
"learning_rate": 5.252409651610363e-07,
"loss": 0.1307,
"step": 2444
},
{
"epoch": 0.67,
"grad_norm": 2.6942265033721924,
"learning_rate": 5.244622876731308e-07,
"loss": 0.1145,
"step": 2445
},
{
"epoch": 0.67,
"grad_norm": 2.55898380279541,
"learning_rate": 5.236839826009201e-07,
"loss": 0.1121,
"step": 2446
},
{
"epoch": 0.67,
"grad_norm": 3.0462722778320312,
"learning_rate": 5.229060505539307e-07,
"loss": 0.1275,
"step": 2447
},
{
"epoch": 0.67,
"grad_norm": 2.7524399757385254,
"learning_rate": 5.221284921413973e-07,
"loss": 0.1218,
"step": 2448
},
{
"epoch": 0.67,
"grad_norm": 2.9039595127105713,
"learning_rate": 5.21351307972263e-07,
"loss": 0.1348,
"step": 2449
},
{
"epoch": 0.67,
"grad_norm": 2.817802906036377,
"learning_rate": 5.205744986551762e-07,
"loss": 0.1172,
"step": 2450
},
{
"epoch": 0.67,
"grad_norm": 2.7943778038024902,
"learning_rate": 5.197980647984921e-07,
"loss": 0.1326,
"step": 2451
},
{
"epoch": 0.67,
"grad_norm": 2.7645349502563477,
"learning_rate": 5.190220070102727e-07,
"loss": 0.1091,
"step": 2452
},
{
"epoch": 0.67,
"grad_norm": 2.627145290374756,
"learning_rate": 5.182463258982846e-07,
"loss": 0.119,
"step": 2453
},
{
"epoch": 0.67,
"grad_norm": 2.776512384414673,
"learning_rate": 5.1747102207e-07,
"loss": 0.1259,
"step": 2454
},
{
"epoch": 0.67,
"grad_norm": 2.8029136657714844,
"learning_rate": 5.166960961325955e-07,
"loss": 0.116,
"step": 2455
},
{
"epoch": 0.67,
"grad_norm": 3.0958304405212402,
"learning_rate": 5.159215486929509e-07,
"loss": 0.1193,
"step": 2456
},
{
"epoch": 0.67,
"grad_norm": 3.364405870437622,
"learning_rate": 5.151473803576512e-07,
"loss": 0.1107,
"step": 2457
},
{
"epoch": 0.67,
"grad_norm": 3.164470672607422,
"learning_rate": 5.143735917329827e-07,
"loss": 0.1337,
"step": 2458
},
{
"epoch": 0.67,
"grad_norm": 2.607489824295044,
"learning_rate": 5.136001834249364e-07,
"loss": 0.1072,
"step": 2459
},
{
"epoch": 0.67,
"grad_norm": 2.848759889602661,
"learning_rate": 5.128271560392037e-07,
"loss": 0.12,
"step": 2460
},
{
"epoch": 0.67,
"grad_norm": 2.950552225112915,
"learning_rate": 5.120545101811777e-07,
"loss": 0.1168,
"step": 2461
},
{
"epoch": 0.67,
"grad_norm": 2.749840021133423,
"learning_rate": 5.112822464559544e-07,
"loss": 0.1175,
"step": 2462
},
{
"epoch": 0.67,
"grad_norm": 2.6408185958862305,
"learning_rate": 5.105103654683285e-07,
"loss": 0.1178,
"step": 2463
},
{
"epoch": 0.67,
"grad_norm": 2.799758195877075,
"learning_rate": 5.097388678227967e-07,
"loss": 0.1226,
"step": 2464
},
{
"epoch": 0.67,
"grad_norm": 2.613769769668579,
"learning_rate": 5.089677541235543e-07,
"loss": 0.1129,
"step": 2465
},
{
"epoch": 0.67,
"grad_norm": 2.95413875579834,
"learning_rate": 5.081970249744959e-07,
"loss": 0.129,
"step": 2466
},
{
"epoch": 0.67,
"grad_norm": 2.8071627616882324,
"learning_rate": 5.07426680979216e-07,
"loss": 0.1158,
"step": 2467
},
{
"epoch": 0.67,
"grad_norm": 2.932875871658325,
"learning_rate": 5.066567227410063e-07,
"loss": 0.1216,
"step": 2468
},
{
"epoch": 0.67,
"grad_norm": 3.034184217453003,
"learning_rate": 5.058871508628575e-07,
"loss": 0.1504,
"step": 2469
},
{
"epoch": 0.67,
"grad_norm": 2.742445945739746,
"learning_rate": 5.051179659474567e-07,
"loss": 0.1194,
"step": 2470
},
{
"epoch": 0.67,
"grad_norm": 2.574291706085205,
"learning_rate": 5.043491685971879e-07,
"loss": 0.103,
"step": 2471
},
{
"epoch": 0.68,
"grad_norm": 2.6973471641540527,
"learning_rate": 5.035807594141332e-07,
"loss": 0.1068,
"step": 2472
},
{
"epoch": 0.68,
"grad_norm": 2.9927546977996826,
"learning_rate": 5.028127390000683e-07,
"loss": 0.1222,
"step": 2473
},
{
"epoch": 0.68,
"grad_norm": 2.7439558506011963,
"learning_rate": 5.020451079564669e-07,
"loss": 0.1153,
"step": 2474
},
{
"epoch": 0.68,
"grad_norm": 2.746530532836914,
"learning_rate": 5.012778668844959e-07,
"loss": 0.1248,
"step": 2475
},
{
"epoch": 0.68,
"grad_norm": 2.7004928588867188,
"learning_rate": 5.005110163850173e-07,
"loss": 0.1228,
"step": 2476
},
{
"epoch": 0.68,
"grad_norm": 2.6957995891571045,
"learning_rate": 4.997445570585878e-07,
"loss": 0.1207,
"step": 2477
},
{
"epoch": 0.68,
"grad_norm": 2.921882390975952,
"learning_rate": 4.98978489505457e-07,
"loss": 0.1311,
"step": 2478
},
{
"epoch": 0.68,
"grad_norm": 2.918653964996338,
"learning_rate": 4.982128143255684e-07,
"loss": 0.1262,
"step": 2479
},
{
"epoch": 0.68,
"grad_norm": 2.9749364852905273,
"learning_rate": 4.974475321185578e-07,
"loss": 0.1117,
"step": 2480
},
{
"epoch": 0.68,
"grad_norm": 2.6308913230895996,
"learning_rate": 4.966826434837527e-07,
"loss": 0.118,
"step": 2481
},
{
"epoch": 0.68,
"grad_norm": 2.6968283653259277,
"learning_rate": 4.959181490201736e-07,
"loss": 0.1064,
"step": 2482
},
{
"epoch": 0.68,
"grad_norm": 3.173630952835083,
"learning_rate": 4.951540493265313e-07,
"loss": 0.139,
"step": 2483
},
{
"epoch": 0.68,
"grad_norm": 2.7071099281311035,
"learning_rate": 4.943903450012281e-07,
"loss": 0.1234,
"step": 2484
},
{
"epoch": 0.68,
"grad_norm": 2.7777867317199707,
"learning_rate": 4.936270366423563e-07,
"loss": 0.1213,
"step": 2485
},
{
"epoch": 0.68,
"grad_norm": 2.982285261154175,
"learning_rate": 4.928641248476977e-07,
"loss": 0.1232,
"step": 2486
},
{
"epoch": 0.68,
"grad_norm": 2.5845227241516113,
"learning_rate": 4.921016102147247e-07,
"loss": 0.106,
"step": 2487
},
{
"epoch": 0.68,
"grad_norm": 2.687197208404541,
"learning_rate": 4.913394933405974e-07,
"loss": 0.1112,
"step": 2488
},
{
"epoch": 0.68,
"grad_norm": 2.9104907512664795,
"learning_rate": 4.905777748221656e-07,
"loss": 0.1381,
"step": 2489
},
{
"epoch": 0.68,
"grad_norm": 2.659237861633301,
"learning_rate": 4.89816455255966e-07,
"loss": 0.1051,
"step": 2490
},
{
"epoch": 0.68,
"grad_norm": 2.7831077575683594,
"learning_rate": 4.89055535238223e-07,
"loss": 0.1163,
"step": 2491
},
{
"epoch": 0.68,
"grad_norm": 2.8132784366607666,
"learning_rate": 4.882950153648492e-07,
"loss": 0.1323,
"step": 2492
},
{
"epoch": 0.68,
"grad_norm": 2.8530664443969727,
"learning_rate": 4.875348962314426e-07,
"loss": 0.122,
"step": 2493
},
{
"epoch": 0.68,
"grad_norm": 2.664740800857544,
"learning_rate": 4.867751784332884e-07,
"loss": 0.1148,
"step": 2494
},
{
"epoch": 0.68,
"grad_norm": 2.7489805221557617,
"learning_rate": 4.860158625653564e-07,
"loss": 0.1177,
"step": 2495
},
{
"epoch": 0.68,
"grad_norm": 2.747615098953247,
"learning_rate": 4.852569492223021e-07,
"loss": 0.1161,
"step": 2496
},
{
"epoch": 0.68,
"grad_norm": 2.894929885864258,
"learning_rate": 4.844984389984663e-07,
"loss": 0.1238,
"step": 2497
},
{
"epoch": 0.68,
"grad_norm": 3.0197770595550537,
"learning_rate": 4.83740332487873e-07,
"loss": 0.1434,
"step": 2498
},
{
"epoch": 0.68,
"grad_norm": 2.6346349716186523,
"learning_rate": 4.829826302842314e-07,
"loss": 0.1021,
"step": 2499
},
{
"epoch": 0.68,
"grad_norm": 2.76969313621521,
"learning_rate": 4.82225332980933e-07,
"loss": 0.1169,
"step": 2500
},
{
"epoch": 0.68,
"grad_norm": 2.4715230464935303,
"learning_rate": 4.81468441171052e-07,
"loss": 0.1039,
"step": 2501
},
{
"epoch": 0.68,
"grad_norm": 2.683863878250122,
"learning_rate": 4.807119554473465e-07,
"loss": 0.1121,
"step": 2502
},
{
"epoch": 0.68,
"grad_norm": 3.25837779045105,
"learning_rate": 4.799558764022549e-07,
"loss": 0.1323,
"step": 2503
},
{
"epoch": 0.68,
"grad_norm": 3.516805648803711,
"learning_rate": 4.792002046278984e-07,
"loss": 0.1342,
"step": 2504
},
{
"epoch": 0.68,
"grad_norm": 2.6152870655059814,
"learning_rate": 4.784449407160786e-07,
"loss": 0.1062,
"step": 2505
},
{
"epoch": 0.68,
"grad_norm": 2.5211596488952637,
"learning_rate": 4.776900852582771e-07,
"loss": 0.1045,
"step": 2506
},
{
"epoch": 0.68,
"grad_norm": 2.9381637573242188,
"learning_rate": 4.769356388456573e-07,
"loss": 0.1261,
"step": 2507
},
{
"epoch": 0.69,
"grad_norm": 2.643887519836426,
"learning_rate": 4.7618160206906056e-07,
"loss": 0.1156,
"step": 2508
},
{
"epoch": 0.69,
"grad_norm": 2.8069515228271484,
"learning_rate": 4.7542797551900824e-07,
"loss": 0.1125,
"step": 2509
},
{
"epoch": 0.69,
"grad_norm": 2.7435302734375,
"learning_rate": 4.7467475978570136e-07,
"loss": 0.1125,
"step": 2510
},
{
"epoch": 0.69,
"grad_norm": 2.5782227516174316,
"learning_rate": 4.7392195545901657e-07,
"loss": 0.1056,
"step": 2511
},
{
"epoch": 0.69,
"grad_norm": 2.9041056632995605,
"learning_rate": 4.731695631285111e-07,
"loss": 0.1408,
"step": 2512
},
{
"epoch": 0.69,
"grad_norm": 2.7634119987487793,
"learning_rate": 4.7241758338341763e-07,
"loss": 0.1168,
"step": 2513
},
{
"epoch": 0.69,
"grad_norm": 2.9808712005615234,
"learning_rate": 4.7166601681264673e-07,
"loss": 0.1344,
"step": 2514
},
{
"epoch": 0.69,
"grad_norm": 3.2197020053863525,
"learning_rate": 4.70914864004786e-07,
"loss": 0.127,
"step": 2515
},
{
"epoch": 0.69,
"grad_norm": 2.745090961456299,
"learning_rate": 4.701641255480965e-07,
"loss": 0.1106,
"step": 2516
},
{
"epoch": 0.69,
"grad_norm": 2.7435736656188965,
"learning_rate": 4.6941380203051774e-07,
"loss": 0.1186,
"step": 2517
},
{
"epoch": 0.69,
"grad_norm": 2.9876623153686523,
"learning_rate": 4.68663894039662e-07,
"loss": 0.129,
"step": 2518
},
{
"epoch": 0.69,
"grad_norm": 2.932159423828125,
"learning_rate": 4.679144021628176e-07,
"loss": 0.1198,
"step": 2519
},
{
"epoch": 0.69,
"grad_norm": 2.509103298187256,
"learning_rate": 4.6716532698694734e-07,
"loss": 0.1117,
"step": 2520
},
{
"epoch": 0.69,
"grad_norm": 2.417865514755249,
"learning_rate": 4.6641666909868506e-07,
"loss": 0.1042,
"step": 2521
},
{
"epoch": 0.69,
"grad_norm": 2.7826478481292725,
"learning_rate": 4.656684290843409e-07,
"loss": 0.1106,
"step": 2522
},
{
"epoch": 0.69,
"grad_norm": 2.655090808868408,
"learning_rate": 4.649206075298955e-07,
"loss": 0.1068,
"step": 2523
},
{
"epoch": 0.69,
"grad_norm": 2.719024181365967,
"learning_rate": 4.641732050210031e-07,
"loss": 0.1144,
"step": 2524
},
{
"epoch": 0.69,
"grad_norm": 3.037306785583496,
"learning_rate": 4.634262221429902e-07,
"loss": 0.1299,
"step": 2525
},
{
"epoch": 0.69,
"grad_norm": 2.895413875579834,
"learning_rate": 4.626796594808523e-07,
"loss": 0.1322,
"step": 2526
},
{
"epoch": 0.69,
"grad_norm": 2.520425796508789,
"learning_rate": 4.619335176192585e-07,
"loss": 0.1072,
"step": 2527
},
{
"epoch": 0.69,
"grad_norm": 2.7348873615264893,
"learning_rate": 4.611877971425462e-07,
"loss": 0.1101,
"step": 2528
},
{
"epoch": 0.69,
"grad_norm": 2.6879892349243164,
"learning_rate": 4.6044249863472453e-07,
"loss": 0.1187,
"step": 2529
},
{
"epoch": 0.69,
"grad_norm": 2.752936363220215,
"learning_rate": 4.5969762267947175e-07,
"loss": 0.1217,
"step": 2530
},
{
"epoch": 0.69,
"grad_norm": 2.7619338035583496,
"learning_rate": 4.5895316986013366e-07,
"loss": 0.113,
"step": 2531
},
{
"epoch": 0.69,
"grad_norm": 2.8417179584503174,
"learning_rate": 4.5820914075972696e-07,
"loss": 0.1207,
"step": 2532
},
{
"epoch": 0.69,
"grad_norm": 2.787196159362793,
"learning_rate": 4.574655359609345e-07,
"loss": 0.1234,
"step": 2533
},
{
"epoch": 0.69,
"grad_norm": 2.7195425033569336,
"learning_rate": 4.5672235604610845e-07,
"loss": 0.1167,
"step": 2534
},
{
"epoch": 0.69,
"grad_norm": 2.5553388595581055,
"learning_rate": 4.5597960159726767e-07,
"loss": 0.118,
"step": 2535
},
{
"epoch": 0.69,
"grad_norm": 3.0006186962127686,
"learning_rate": 4.552372731960974e-07,
"loss": 0.1386,
"step": 2536
},
{
"epoch": 0.69,
"grad_norm": 2.705505609512329,
"learning_rate": 4.5449537142394956e-07,
"loss": 0.1207,
"step": 2537
},
{
"epoch": 0.69,
"grad_norm": 2.9590044021606445,
"learning_rate": 4.537538968618416e-07,
"loss": 0.1184,
"step": 2538
},
{
"epoch": 0.69,
"grad_norm": 2.792473316192627,
"learning_rate": 4.530128500904571e-07,
"loss": 0.1181,
"step": 2539
},
{
"epoch": 0.69,
"grad_norm": 2.780245304107666,
"learning_rate": 4.522722316901445e-07,
"loss": 0.1181,
"step": 2540
},
{
"epoch": 0.69,
"grad_norm": 2.752324342727661,
"learning_rate": 4.5153204224091614e-07,
"loss": 0.1153,
"step": 2541
},
{
"epoch": 0.69,
"grad_norm": 2.748352527618408,
"learning_rate": 4.507922823224489e-07,
"loss": 0.1225,
"step": 2542
},
{
"epoch": 0.69,
"grad_norm": 2.9857993125915527,
"learning_rate": 4.500529525140828e-07,
"loss": 0.1247,
"step": 2543
},
{
"epoch": 0.69,
"grad_norm": 2.933687925338745,
"learning_rate": 4.493140533948216e-07,
"loss": 0.1158,
"step": 2544
},
{
"epoch": 0.7,
"grad_norm": 2.8095855712890625,
"learning_rate": 4.485755855433322e-07,
"loss": 0.1146,
"step": 2545
},
{
"epoch": 0.7,
"grad_norm": 3.357168674468994,
"learning_rate": 4.478375495379426e-07,
"loss": 0.1366,
"step": 2546
},
{
"epoch": 0.7,
"grad_norm": 2.8076658248901367,
"learning_rate": 4.47099945956643e-07,
"loss": 0.1123,
"step": 2547
},
{
"epoch": 0.7,
"grad_norm": 2.7585599422454834,
"learning_rate": 4.4636277537708487e-07,
"loss": 0.1144,
"step": 2548
},
{
"epoch": 0.7,
"grad_norm": 2.844555616378784,
"learning_rate": 4.45626038376581e-07,
"loss": 0.1139,
"step": 2549
},
{
"epoch": 0.7,
"grad_norm": 2.71215558052063,
"learning_rate": 4.4488973553210483e-07,
"loss": 0.117,
"step": 2550
},
{
"epoch": 0.7,
"grad_norm": 2.676067352294922,
"learning_rate": 4.4415386742028903e-07,
"loss": 0.1176,
"step": 2551
},
{
"epoch": 0.7,
"grad_norm": 2.9423668384552,
"learning_rate": 4.434184346174261e-07,
"loss": 0.1243,
"step": 2552
},
{
"epoch": 0.7,
"grad_norm": 2.7105016708374023,
"learning_rate": 4.426834376994673e-07,
"loss": 0.1182,
"step": 2553
},
{
"epoch": 0.7,
"grad_norm": 2.604095697402954,
"learning_rate": 4.419488772420231e-07,
"loss": 0.1062,
"step": 2554
},
{
"epoch": 0.7,
"grad_norm": 2.8293213844299316,
"learning_rate": 4.4121475382036253e-07,
"loss": 0.1244,
"step": 2555
},
{
"epoch": 0.7,
"grad_norm": 2.634725570678711,
"learning_rate": 4.4048106800941143e-07,
"loss": 0.104,
"step": 2556
},
{
"epoch": 0.7,
"grad_norm": 2.670680284500122,
"learning_rate": 4.3974782038375313e-07,
"loss": 0.1233,
"step": 2557
},
{
"epoch": 0.7,
"grad_norm": 2.6359667778015137,
"learning_rate": 4.3901501151762764e-07,
"loss": 0.1121,
"step": 2558
},
{
"epoch": 0.7,
"grad_norm": 2.7859766483306885,
"learning_rate": 4.3828264198493206e-07,
"loss": 0.1134,
"step": 2559
},
{
"epoch": 0.7,
"grad_norm": 2.64312481880188,
"learning_rate": 4.3755071235921935e-07,
"loss": 0.1115,
"step": 2560
},
{
"epoch": 0.7,
"grad_norm": 2.8879735469818115,
"learning_rate": 4.3681922321369726e-07,
"loss": 0.1286,
"step": 2561
},
{
"epoch": 0.7,
"grad_norm": 2.6948938369750977,
"learning_rate": 4.3608817512122887e-07,
"loss": 0.1062,
"step": 2562
},
{
"epoch": 0.7,
"grad_norm": 3.2535085678100586,
"learning_rate": 4.353575686543318e-07,
"loss": 0.1344,
"step": 2563
},
{
"epoch": 0.7,
"grad_norm": 2.81569242477417,
"learning_rate": 4.346274043851781e-07,
"loss": 0.1186,
"step": 2564
},
{
"epoch": 0.7,
"grad_norm": 2.7789556980133057,
"learning_rate": 4.338976828855938e-07,
"loss": 0.1175,
"step": 2565
},
{
"epoch": 0.7,
"grad_norm": 2.9362411499023438,
"learning_rate": 4.331684047270574e-07,
"loss": 0.12,
"step": 2566
},
{
"epoch": 0.7,
"grad_norm": 2.8243889808654785,
"learning_rate": 4.3243957048070015e-07,
"loss": 0.1224,
"step": 2567
},
{
"epoch": 0.7,
"grad_norm": 2.859623670578003,
"learning_rate": 4.317111807173067e-07,
"loss": 0.1249,
"step": 2568
},
{
"epoch": 0.7,
"grad_norm": 2.561389684677124,
"learning_rate": 4.3098323600731233e-07,
"loss": 0.1084,
"step": 2569
},
{
"epoch": 0.7,
"grad_norm": 2.856942892074585,
"learning_rate": 4.302557369208051e-07,
"loss": 0.1191,
"step": 2570
},
{
"epoch": 0.7,
"grad_norm": 2.8887505531311035,
"learning_rate": 4.2952868402752285e-07,
"loss": 0.1152,
"step": 2571
},
{
"epoch": 0.7,
"grad_norm": 2.62001895904541,
"learning_rate": 4.288020778968544e-07,
"loss": 0.1097,
"step": 2572
},
{
"epoch": 0.7,
"grad_norm": 2.8298003673553467,
"learning_rate": 4.2807591909783937e-07,
"loss": 0.1214,
"step": 2573
},
{
"epoch": 0.7,
"grad_norm": 2.9238924980163574,
"learning_rate": 4.273502081991658e-07,
"loss": 0.1352,
"step": 2574
},
{
"epoch": 0.7,
"grad_norm": 3.063486099243164,
"learning_rate": 4.266249457691723e-07,
"loss": 0.1278,
"step": 2575
},
{
"epoch": 0.7,
"grad_norm": 2.689558506011963,
"learning_rate": 4.259001323758452e-07,
"loss": 0.1274,
"step": 2576
},
{
"epoch": 0.7,
"grad_norm": 2.7789130210876465,
"learning_rate": 4.2517576858681945e-07,
"loss": 0.1235,
"step": 2577
},
{
"epoch": 0.7,
"grad_norm": 2.6102023124694824,
"learning_rate": 4.244518549693785e-07,
"loss": 0.1104,
"step": 2578
},
{
"epoch": 0.7,
"grad_norm": 2.763643980026245,
"learning_rate": 4.237283920904522e-07,
"loss": 0.1106,
"step": 2579
},
{
"epoch": 0.7,
"grad_norm": 2.6615610122680664,
"learning_rate": 4.2300538051661847e-07,
"loss": 0.1098,
"step": 2580
},
{
"epoch": 0.71,
"grad_norm": 2.615488052368164,
"learning_rate": 4.2228282081410126e-07,
"loss": 0.116,
"step": 2581
},
{
"epoch": 0.71,
"grad_norm": 2.768125057220459,
"learning_rate": 4.215607135487701e-07,
"loss": 0.1142,
"step": 2582
},
{
"epoch": 0.71,
"grad_norm": 2.794842004776001,
"learning_rate": 4.2083905928614147e-07,
"loss": 0.1158,
"step": 2583
},
{
"epoch": 0.71,
"grad_norm": 2.6256704330444336,
"learning_rate": 4.2011785859137574e-07,
"loss": 0.1043,
"step": 2584
},
{
"epoch": 0.71,
"grad_norm": 2.975555658340454,
"learning_rate": 4.193971120292793e-07,
"loss": 0.1156,
"step": 2585
},
{
"epoch": 0.71,
"grad_norm": 2.6213598251342773,
"learning_rate": 4.1867682016430215e-07,
"loss": 0.1147,
"step": 2586
},
{
"epoch": 0.71,
"grad_norm": 2.846608877182007,
"learning_rate": 4.179569835605379e-07,
"loss": 0.1154,
"step": 2587
},
{
"epoch": 0.71,
"grad_norm": 2.513469934463501,
"learning_rate": 4.172376027817246e-07,
"loss": 0.1084,
"step": 2588
},
{
"epoch": 0.71,
"grad_norm": 2.876136302947998,
"learning_rate": 4.1651867839124234e-07,
"loss": 0.1304,
"step": 2589
},
{
"epoch": 0.71,
"grad_norm": 3.251068115234375,
"learning_rate": 4.158002109521148e-07,
"loss": 0.1285,
"step": 2590
},
{
"epoch": 0.71,
"grad_norm": 3.049268960952759,
"learning_rate": 4.15082201027007e-07,
"loss": 0.1333,
"step": 2591
},
{
"epoch": 0.71,
"grad_norm": 2.8080382347106934,
"learning_rate": 4.1436464917822546e-07,
"loss": 0.1092,
"step": 2592
},
{
"epoch": 0.71,
"grad_norm": 2.5367066860198975,
"learning_rate": 4.136475559677191e-07,
"loss": 0.1116,
"step": 2593
},
{
"epoch": 0.71,
"grad_norm": 3.02242112159729,
"learning_rate": 4.129309219570761e-07,
"loss": 0.1264,
"step": 2594
},
{
"epoch": 0.71,
"grad_norm": 2.5939440727233887,
"learning_rate": 4.1221474770752696e-07,
"loss": 0.1088,
"step": 2595
},
{
"epoch": 0.71,
"grad_norm": 2.8899600505828857,
"learning_rate": 4.1149903377994035e-07,
"loss": 0.1167,
"step": 2596
},
{
"epoch": 0.71,
"grad_norm": 2.6632378101348877,
"learning_rate": 4.107837807348249e-07,
"loss": 0.1078,
"step": 2597
},
{
"epoch": 0.71,
"grad_norm": 2.7490880489349365,
"learning_rate": 4.1006898913232937e-07,
"loss": 0.1196,
"step": 2598
},
{
"epoch": 0.71,
"grad_norm": 2.7322561740875244,
"learning_rate": 4.0935465953223936e-07,
"loss": 0.1212,
"step": 2599
},
{
"epoch": 0.71,
"grad_norm": 2.687798023223877,
"learning_rate": 4.086407924939803e-07,
"loss": 0.1208,
"step": 2600
},
{
"epoch": 0.71,
"grad_norm": 2.668848991394043,
"learning_rate": 4.079273885766146e-07,
"loss": 0.11,
"step": 2601
},
{
"epoch": 0.71,
"grad_norm": 3.0383217334747314,
"learning_rate": 4.0721444833884134e-07,
"loss": 0.1337,
"step": 2602
},
{
"epoch": 0.71,
"grad_norm": 2.8997931480407715,
"learning_rate": 4.065019723389981e-07,
"loss": 0.1216,
"step": 2603
},
{
"epoch": 0.71,
"grad_norm": 2.6722733974456787,
"learning_rate": 4.0578996113505713e-07,
"loss": 0.1163,
"step": 2604
},
{
"epoch": 0.71,
"grad_norm": 3.0935604572296143,
"learning_rate": 4.0507841528462837e-07,
"loss": 0.1295,
"step": 2605
},
{
"epoch": 0.71,
"grad_norm": 2.7999727725982666,
"learning_rate": 4.0436733534495595e-07,
"loss": 0.1199,
"step": 2606
},
{
"epoch": 0.71,
"grad_norm": 2.807915687561035,
"learning_rate": 4.036567218729193e-07,
"loss": 0.1136,
"step": 2607
},
{
"epoch": 0.71,
"grad_norm": 2.9019205570220947,
"learning_rate": 4.0294657542503373e-07,
"loss": 0.1194,
"step": 2608
},
{
"epoch": 0.71,
"grad_norm": 3.148651599884033,
"learning_rate": 4.022368965574471e-07,
"loss": 0.1307,
"step": 2609
},
{
"epoch": 0.71,
"grad_norm": 2.4384212493896484,
"learning_rate": 4.0152768582594266e-07,
"loss": 0.1085,
"step": 2610
},
{
"epoch": 0.71,
"grad_norm": 2.7910211086273193,
"learning_rate": 4.008189437859361e-07,
"loss": 0.1298,
"step": 2611
},
{
"epoch": 0.71,
"grad_norm": 2.623723030090332,
"learning_rate": 4.0011067099247565e-07,
"loss": 0.1188,
"step": 2612
},
{
"epoch": 0.71,
"grad_norm": 2.447112560272217,
"learning_rate": 3.994028680002435e-07,
"loss": 0.0984,
"step": 2613
},
{
"epoch": 0.71,
"grad_norm": 2.915776491165161,
"learning_rate": 3.9869553536355236e-07,
"loss": 0.1257,
"step": 2614
},
{
"epoch": 0.71,
"grad_norm": 2.71044921875,
"learning_rate": 3.9798867363634815e-07,
"loss": 0.1199,
"step": 2615
},
{
"epoch": 0.71,
"grad_norm": 2.633300304412842,
"learning_rate": 3.972822833722067e-07,
"loss": 0.1203,
"step": 2616
},
{
"epoch": 0.71,
"grad_norm": 2.6894752979278564,
"learning_rate": 3.9657636512433466e-07,
"loss": 0.1224,
"step": 2617
},
{
"epoch": 0.72,
"grad_norm": 2.559011459350586,
"learning_rate": 3.9587091944557015e-07,
"loss": 0.104,
"step": 2618
},
{
"epoch": 0.72,
"grad_norm": 3.1206631660461426,
"learning_rate": 3.951659468883799e-07,
"loss": 0.1313,
"step": 2619
},
{
"epoch": 0.72,
"grad_norm": 2.814870595932007,
"learning_rate": 3.9446144800486135e-07,
"loss": 0.1229,
"step": 2620
},
{
"epoch": 0.72,
"grad_norm": 2.710951566696167,
"learning_rate": 3.9375742334674e-07,
"loss": 0.1126,
"step": 2621
},
{
"epoch": 0.72,
"grad_norm": 3.0050394535064697,
"learning_rate": 3.9305387346536976e-07,
"loss": 0.1282,
"step": 2622
},
{
"epoch": 0.72,
"grad_norm": 2.760265588760376,
"learning_rate": 3.9235079891173427e-07,
"loss": 0.1193,
"step": 2623
},
{
"epoch": 0.72,
"grad_norm": 3.1326093673706055,
"learning_rate": 3.9164820023644297e-07,
"loss": 0.1216,
"step": 2624
},
{
"epoch": 0.72,
"grad_norm": 2.85953950881958,
"learning_rate": 3.909460779897339e-07,
"loss": 0.127,
"step": 2625
},
{
"epoch": 0.72,
"grad_norm": 2.9403843879699707,
"learning_rate": 3.9024443272147256e-07,
"loss": 0.1282,
"step": 2626
},
{
"epoch": 0.72,
"grad_norm": 2.566781997680664,
"learning_rate": 3.895432649811483e-07,
"loss": 0.1111,
"step": 2627
},
{
"epoch": 0.72,
"grad_norm": 3.1966028213500977,
"learning_rate": 3.8884257531787945e-07,
"loss": 0.1208,
"step": 2628
},
{
"epoch": 0.72,
"grad_norm": 3.0294313430786133,
"learning_rate": 3.881423642804079e-07,
"loss": 0.1236,
"step": 2629
},
{
"epoch": 0.72,
"grad_norm": 3.069186210632324,
"learning_rate": 3.8744263241710184e-07,
"loss": 0.1455,
"step": 2630
},
{
"epoch": 0.72,
"grad_norm": 2.585867166519165,
"learning_rate": 3.867433802759541e-07,
"loss": 0.1192,
"step": 2631
},
{
"epoch": 0.72,
"grad_norm": 2.7379586696624756,
"learning_rate": 3.860446084045813e-07,
"loss": 0.1193,
"step": 2632
},
{
"epoch": 0.72,
"grad_norm": 2.710688591003418,
"learning_rate": 3.8534631735022406e-07,
"loss": 0.112,
"step": 2633
},
{
"epoch": 0.72,
"grad_norm": 2.7601280212402344,
"learning_rate": 3.846485076597463e-07,
"loss": 0.1209,
"step": 2634
},
{
"epoch": 0.72,
"grad_norm": 3.2363827228546143,
"learning_rate": 3.8395117987963565e-07,
"loss": 0.148,
"step": 2635
},
{
"epoch": 0.72,
"grad_norm": 3.099475383758545,
"learning_rate": 3.832543345560021e-07,
"loss": 0.1408,
"step": 2636
},
{
"epoch": 0.72,
"grad_norm": 2.758754253387451,
"learning_rate": 3.825579722345774e-07,
"loss": 0.1125,
"step": 2637
},
{
"epoch": 0.72,
"grad_norm": 3.01164174079895,
"learning_rate": 3.818620934607153e-07,
"loss": 0.116,
"step": 2638
},
{
"epoch": 0.72,
"grad_norm": 2.769646406173706,
"learning_rate": 3.8116669877939044e-07,
"loss": 0.1124,
"step": 2639
},
{
"epoch": 0.72,
"grad_norm": 3.0166378021240234,
"learning_rate": 3.80471788735199e-07,
"loss": 0.1279,
"step": 2640
},
{
"epoch": 0.72,
"grad_norm": 2.505793333053589,
"learning_rate": 3.797773638723578e-07,
"loss": 0.1052,
"step": 2641
},
{
"epoch": 0.72,
"grad_norm": 3.0237605571746826,
"learning_rate": 3.790834247347028e-07,
"loss": 0.1154,
"step": 2642
},
{
"epoch": 0.72,
"grad_norm": 2.958221435546875,
"learning_rate": 3.783899718656901e-07,
"loss": 0.1155,
"step": 2643
},
{
"epoch": 0.72,
"grad_norm": 2.838883399963379,
"learning_rate": 3.7769700580839447e-07,
"loss": 0.1272,
"step": 2644
},
{
"epoch": 0.72,
"grad_norm": 2.793994665145874,
"learning_rate": 3.7700452710551025e-07,
"loss": 0.1181,
"step": 2645
},
{
"epoch": 0.72,
"grad_norm": 2.674825429916382,
"learning_rate": 3.7631253629935e-07,
"loss": 0.1159,
"step": 2646
},
{
"epoch": 0.72,
"grad_norm": 2.790365219116211,
"learning_rate": 3.756210339318436e-07,
"loss": 0.1255,
"step": 2647
},
{
"epoch": 0.72,
"grad_norm": 2.7378766536712646,
"learning_rate": 3.749300205445387e-07,
"loss": 0.1216,
"step": 2648
},
{
"epoch": 0.72,
"grad_norm": 2.8433449268341064,
"learning_rate": 3.7423949667859967e-07,
"loss": 0.1286,
"step": 2649
},
{
"epoch": 0.72,
"grad_norm": 2.6227731704711914,
"learning_rate": 3.735494628748082e-07,
"loss": 0.1144,
"step": 2650
},
{
"epoch": 0.72,
"grad_norm": 2.631502389907837,
"learning_rate": 3.72859919673562e-07,
"loss": 0.1143,
"step": 2651
},
{
"epoch": 0.72,
"grad_norm": 2.8619110584259033,
"learning_rate": 3.721708676148745e-07,
"loss": 0.1293,
"step": 2652
},
{
"epoch": 0.72,
"grad_norm": 3.010019540786743,
"learning_rate": 3.71482307238374e-07,
"loss": 0.1067,
"step": 2653
},
{
"epoch": 0.72,
"grad_norm": 2.6308083534240723,
"learning_rate": 3.707942390833041e-07,
"loss": 0.1157,
"step": 2654
},
{
"epoch": 0.73,
"grad_norm": 2.5576210021972656,
"learning_rate": 3.7010666368852305e-07,
"loss": 0.1043,
"step": 2655
},
{
"epoch": 0.73,
"grad_norm": 2.6919572353363037,
"learning_rate": 3.694195815925036e-07,
"loss": 0.1177,
"step": 2656
},
{
"epoch": 0.73,
"grad_norm": 2.72977876663208,
"learning_rate": 3.687329933333315e-07,
"loss": 0.1093,
"step": 2657
},
{
"epoch": 0.73,
"grad_norm": 2.6672637462615967,
"learning_rate": 3.680468994487056e-07,
"loss": 0.1036,
"step": 2658
},
{
"epoch": 0.73,
"grad_norm": 2.9066519737243652,
"learning_rate": 3.6736130047593784e-07,
"loss": 0.114,
"step": 2659
},
{
"epoch": 0.73,
"grad_norm": 2.4339230060577393,
"learning_rate": 3.666761969519528e-07,
"loss": 0.1071,
"step": 2660
},
{
"epoch": 0.73,
"grad_norm": 2.600412368774414,
"learning_rate": 3.6599158941328755e-07,
"loss": 0.1174,
"step": 2661
},
{
"epoch": 0.73,
"grad_norm": 2.992175817489624,
"learning_rate": 3.6530747839608943e-07,
"loss": 0.118,
"step": 2662
},
{
"epoch": 0.73,
"grad_norm": 2.8061258792877197,
"learning_rate": 3.646238644361177e-07,
"loss": 0.1185,
"step": 2663
},
{
"epoch": 0.73,
"grad_norm": 2.548417329788208,
"learning_rate": 3.63940748068742e-07,
"loss": 0.0997,
"step": 2664
},
{
"epoch": 0.73,
"grad_norm": 2.507392644882202,
"learning_rate": 3.632581298289427e-07,
"loss": 0.107,
"step": 2665
},
{
"epoch": 0.73,
"grad_norm": 2.87033748626709,
"learning_rate": 3.625760102513102e-07,
"loss": 0.1306,
"step": 2666
},
{
"epoch": 0.73,
"grad_norm": 2.7267277240753174,
"learning_rate": 3.6189438987004403e-07,
"loss": 0.1062,
"step": 2667
},
{
"epoch": 0.73,
"grad_norm": 2.7781479358673096,
"learning_rate": 3.6121326921895245e-07,
"loss": 0.1218,
"step": 2668
},
{
"epoch": 0.73,
"grad_norm": 2.554163694381714,
"learning_rate": 3.605326488314526e-07,
"loss": 0.1071,
"step": 2669
},
{
"epoch": 0.73,
"grad_norm": 2.9026176929473877,
"learning_rate": 3.5985252924057017e-07,
"loss": 0.1226,
"step": 2670
},
{
"epoch": 0.73,
"grad_norm": 2.7796084880828857,
"learning_rate": 3.591729109789389e-07,
"loss": 0.1152,
"step": 2671
},
{
"epoch": 0.73,
"grad_norm": 2.7688584327697754,
"learning_rate": 3.584937945787989e-07,
"loss": 0.1228,
"step": 2672
},
{
"epoch": 0.73,
"grad_norm": 2.813920497894287,
"learning_rate": 3.57815180571998e-07,
"loss": 0.1103,
"step": 2673
},
{
"epoch": 0.73,
"grad_norm": 2.761597156524658,
"learning_rate": 3.571370694899899e-07,
"loss": 0.1128,
"step": 2674
},
{
"epoch": 0.73,
"grad_norm": 2.8307440280914307,
"learning_rate": 3.5645946186383544e-07,
"loss": 0.1232,
"step": 2675
},
{
"epoch": 0.73,
"grad_norm": 2.8210718631744385,
"learning_rate": 3.557823582242008e-07,
"loss": 0.1217,
"step": 2676
},
{
"epoch": 0.73,
"grad_norm": 2.6191835403442383,
"learning_rate": 3.551057591013572e-07,
"loss": 0.1026,
"step": 2677
},
{
"epoch": 0.73,
"grad_norm": 2.7669758796691895,
"learning_rate": 3.544296650251807e-07,
"loss": 0.1103,
"step": 2678
},
{
"epoch": 0.73,
"grad_norm": 2.8590447902679443,
"learning_rate": 3.5375407652515166e-07,
"loss": 0.1139,
"step": 2679
},
{
"epoch": 0.73,
"grad_norm": 2.8868892192840576,
"learning_rate": 3.5307899413035534e-07,
"loss": 0.1303,
"step": 2680
},
{
"epoch": 0.73,
"grad_norm": 2.5479912757873535,
"learning_rate": 3.524044183694803e-07,
"loss": 0.1164,
"step": 2681
},
{
"epoch": 0.73,
"grad_norm": 2.819713830947876,
"learning_rate": 3.5173034977081807e-07,
"loss": 0.1207,
"step": 2682
},
{
"epoch": 0.73,
"grad_norm": 2.73115873336792,
"learning_rate": 3.51056788862263e-07,
"loss": 0.1191,
"step": 2683
},
{
"epoch": 0.73,
"grad_norm": 2.7490108013153076,
"learning_rate": 3.5038373617131156e-07,
"loss": 0.123,
"step": 2684
},
{
"epoch": 0.73,
"grad_norm": 2.8641395568847656,
"learning_rate": 3.4971119222506296e-07,
"loss": 0.1169,
"step": 2685
},
{
"epoch": 0.73,
"grad_norm": 2.8399951457977295,
"learning_rate": 3.4903915755021806e-07,
"loss": 0.1289,
"step": 2686
},
{
"epoch": 0.73,
"grad_norm": 2.8281517028808594,
"learning_rate": 3.4836763267307814e-07,
"loss": 0.1224,
"step": 2687
},
{
"epoch": 0.73,
"grad_norm": 3.023627758026123,
"learning_rate": 3.476966181195451e-07,
"loss": 0.1337,
"step": 2688
},
{
"epoch": 0.73,
"grad_norm": 2.7076070308685303,
"learning_rate": 3.470261144151224e-07,
"loss": 0.1098,
"step": 2689
},
{
"epoch": 0.73,
"grad_norm": 2.512364387512207,
"learning_rate": 3.4635612208491193e-07,
"loss": 0.1058,
"step": 2690
},
{
"epoch": 0.74,
"grad_norm": 2.8544671535491943,
"learning_rate": 3.456866416536166e-07,
"loss": 0.1208,
"step": 2691
},
{
"epoch": 0.74,
"grad_norm": 2.7098636627197266,
"learning_rate": 3.4501767364553723e-07,
"loss": 0.1177,
"step": 2692
},
{
"epoch": 0.74,
"grad_norm": 2.382507801055908,
"learning_rate": 3.4434921858457355e-07,
"loss": 0.0982,
"step": 2693
},
{
"epoch": 0.74,
"grad_norm": 2.758180856704712,
"learning_rate": 3.4368127699422434e-07,
"loss": 0.1061,
"step": 2694
},
{
"epoch": 0.74,
"grad_norm": 2.895392417907715,
"learning_rate": 3.4301384939758513e-07,
"loss": 0.1188,
"step": 2695
},
{
"epoch": 0.74,
"grad_norm": 2.702798366546631,
"learning_rate": 3.4234693631735026e-07,
"loss": 0.1074,
"step": 2696
},
{
"epoch": 0.74,
"grad_norm": 3.1038873195648193,
"learning_rate": 3.416805382758099e-07,
"loss": 0.1179,
"step": 2697
},
{
"epoch": 0.74,
"grad_norm": 2.643678665161133,
"learning_rate": 3.41014655794851e-07,
"loss": 0.1083,
"step": 2698
},
{
"epoch": 0.74,
"grad_norm": 2.7079343795776367,
"learning_rate": 3.4034928939595785e-07,
"loss": 0.1143,
"step": 2699
},
{
"epoch": 0.74,
"grad_norm": 2.768155813217163,
"learning_rate": 3.3968443960020907e-07,
"loss": 0.1199,
"step": 2700
},
{
"epoch": 0.74,
"grad_norm": 2.9409587383270264,
"learning_rate": 3.390201069282802e-07,
"loss": 0.1215,
"step": 2701
},
{
"epoch": 0.74,
"grad_norm": 2.918840169906616,
"learning_rate": 3.3835629190044066e-07,
"loss": 0.1339,
"step": 2702
},
{
"epoch": 0.74,
"grad_norm": 2.6571919918060303,
"learning_rate": 3.3769299503655457e-07,
"loss": 0.112,
"step": 2703
},
{
"epoch": 0.74,
"grad_norm": 2.999922752380371,
"learning_rate": 3.3703021685608115e-07,
"loss": 0.1276,
"step": 2704
},
{
"epoch": 0.74,
"grad_norm": 2.910522222518921,
"learning_rate": 3.3636795787807225e-07,
"loss": 0.1206,
"step": 2705
},
{
"epoch": 0.74,
"grad_norm": 2.843425750732422,
"learning_rate": 3.3570621862117423e-07,
"loss": 0.1137,
"step": 2706
},
{
"epoch": 0.74,
"grad_norm": 2.7062761783599854,
"learning_rate": 3.350449996036255e-07,
"loss": 0.1029,
"step": 2707
},
{
"epoch": 0.74,
"grad_norm": 2.678119421005249,
"learning_rate": 3.3438430134325734e-07,
"loss": 0.1009,
"step": 2708
},
{
"epoch": 0.74,
"grad_norm": 2.6809935569763184,
"learning_rate": 3.337241243574936e-07,
"loss": 0.1043,
"step": 2709
},
{
"epoch": 0.74,
"grad_norm": 2.797308921813965,
"learning_rate": 3.330644691633492e-07,
"loss": 0.1257,
"step": 2710
},
{
"epoch": 0.74,
"grad_norm": 2.5868420600891113,
"learning_rate": 3.3240533627743126e-07,
"loss": 0.1183,
"step": 2711
},
{
"epoch": 0.74,
"grad_norm": 2.9918274879455566,
"learning_rate": 3.3174672621593726e-07,
"loss": 0.1381,
"step": 2712
},
{
"epoch": 0.74,
"grad_norm": 2.644981622695923,
"learning_rate": 3.310886394946548e-07,
"loss": 0.1087,
"step": 2713
},
{
"epoch": 0.74,
"grad_norm": 2.868210554122925,
"learning_rate": 3.3043107662896295e-07,
"loss": 0.1302,
"step": 2714
},
{
"epoch": 0.74,
"grad_norm": 2.591280221939087,
"learning_rate": 3.297740381338292e-07,
"loss": 0.119,
"step": 2715
},
{
"epoch": 0.74,
"grad_norm": 2.8235762119293213,
"learning_rate": 3.2911752452381146e-07,
"loss": 0.1144,
"step": 2716
},
{
"epoch": 0.74,
"grad_norm": 2.710702657699585,
"learning_rate": 3.2846153631305584e-07,
"loss": 0.1193,
"step": 2717
},
{
"epoch": 0.74,
"grad_norm": 2.6694576740264893,
"learning_rate": 3.278060740152969e-07,
"loss": 0.1206,
"step": 2718
},
{
"epoch": 0.74,
"grad_norm": 2.732609987258911,
"learning_rate": 3.271511381438582e-07,
"loss": 0.1225,
"step": 2719
},
{
"epoch": 0.74,
"grad_norm": 2.437609910964966,
"learning_rate": 3.2649672921164993e-07,
"loss": 0.0987,
"step": 2720
},
{
"epoch": 0.74,
"grad_norm": 2.895250082015991,
"learning_rate": 3.2584284773117066e-07,
"loss": 0.1314,
"step": 2721
},
{
"epoch": 0.74,
"grad_norm": 2.8994832038879395,
"learning_rate": 3.2518949421450525e-07,
"loss": 0.128,
"step": 2722
},
{
"epoch": 0.74,
"grad_norm": 2.6383893489837646,
"learning_rate": 3.2453666917332465e-07,
"loss": 0.1139,
"step": 2723
},
{
"epoch": 0.74,
"grad_norm": 2.823533535003662,
"learning_rate": 3.2388437311888737e-07,
"loss": 0.111,
"step": 2724
},
{
"epoch": 0.74,
"grad_norm": 2.771145820617676,
"learning_rate": 3.232326065620361e-07,
"loss": 0.1279,
"step": 2725
},
{
"epoch": 0.74,
"grad_norm": 2.848806619644165,
"learning_rate": 3.2258137001320007e-07,
"loss": 0.1205,
"step": 2726
},
{
"epoch": 0.74,
"grad_norm": 2.788727283477783,
"learning_rate": 3.219306639823923e-07,
"loss": 0.1162,
"step": 2727
},
{
"epoch": 0.75,
"grad_norm": 2.8441965579986572,
"learning_rate": 3.212804889792117e-07,
"loss": 0.1159,
"step": 2728
},
{
"epoch": 0.75,
"grad_norm": 2.997556209564209,
"learning_rate": 3.2063084551284004e-07,
"loss": 0.1231,
"step": 2729
},
{
"epoch": 0.75,
"grad_norm": 3.102024555206299,
"learning_rate": 3.1998173409204323e-07,
"loss": 0.1174,
"step": 2730
},
{
"epoch": 0.75,
"grad_norm": 2.7217159271240234,
"learning_rate": 3.19333155225171e-07,
"loss": 0.1165,
"step": 2731
},
{
"epoch": 0.75,
"grad_norm": 2.6979618072509766,
"learning_rate": 3.186851094201551e-07,
"loss": 0.1135,
"step": 2732
},
{
"epoch": 0.75,
"grad_norm": 2.8890743255615234,
"learning_rate": 3.1803759718451107e-07,
"loss": 0.1199,
"step": 2733
},
{
"epoch": 0.75,
"grad_norm": 2.689387083053589,
"learning_rate": 3.173906190253355e-07,
"loss": 0.1155,
"step": 2734
},
{
"epoch": 0.75,
"grad_norm": 2.8426594734191895,
"learning_rate": 3.1674417544930653e-07,
"loss": 0.1201,
"step": 2735
},
{
"epoch": 0.75,
"grad_norm": 3.093418836593628,
"learning_rate": 3.1609826696268507e-07,
"loss": 0.12,
"step": 2736
},
{
"epoch": 0.75,
"grad_norm": 2.7327778339385986,
"learning_rate": 3.154528940713113e-07,
"loss": 0.1129,
"step": 2737
},
{
"epoch": 0.75,
"grad_norm": 2.849271774291992,
"learning_rate": 3.1480805728060745e-07,
"loss": 0.1167,
"step": 2738
},
{
"epoch": 0.75,
"grad_norm": 2.769320011138916,
"learning_rate": 3.1416375709557483e-07,
"loss": 0.1068,
"step": 2739
},
{
"epoch": 0.75,
"grad_norm": 2.5702242851257324,
"learning_rate": 3.1351999402079465e-07,
"loss": 0.1012,
"step": 2740
},
{
"epoch": 0.75,
"grad_norm": 2.8913018703460693,
"learning_rate": 3.1287676856042824e-07,
"loss": 0.1223,
"step": 2741
},
{
"epoch": 0.75,
"grad_norm": 2.8712058067321777,
"learning_rate": 3.122340812182148e-07,
"loss": 0.1196,
"step": 2742
},
{
"epoch": 0.75,
"grad_norm": 3.083658456802368,
"learning_rate": 3.1159193249747327e-07,
"loss": 0.1138,
"step": 2743
},
{
"epoch": 0.75,
"grad_norm": 2.3772547245025635,
"learning_rate": 3.109503229010999e-07,
"loss": 0.0929,
"step": 2744
},
{
"epoch": 0.75,
"grad_norm": 2.6538403034210205,
"learning_rate": 3.103092529315686e-07,
"loss": 0.1268,
"step": 2745
},
{
"epoch": 0.75,
"grad_norm": 2.9843966960906982,
"learning_rate": 3.096687230909315e-07,
"loss": 0.1143,
"step": 2746
},
{
"epoch": 0.75,
"grad_norm": 2.629732847213745,
"learning_rate": 3.090287338808175e-07,
"loss": 0.114,
"step": 2747
},
{
"epoch": 0.75,
"grad_norm": 2.844168186187744,
"learning_rate": 3.083892858024317e-07,
"loss": 0.1233,
"step": 2748
},
{
"epoch": 0.75,
"grad_norm": 2.8624022006988525,
"learning_rate": 3.077503793565557e-07,
"loss": 0.1256,
"step": 2749
},
{
"epoch": 0.75,
"grad_norm": 2.8195860385894775,
"learning_rate": 3.0711201504354623e-07,
"loss": 0.1229,
"step": 2750
},
{
"epoch": 0.75,
"grad_norm": 2.623037338256836,
"learning_rate": 3.0647419336333656e-07,
"loss": 0.1032,
"step": 2751
},
{
"epoch": 0.75,
"grad_norm": 2.8414342403411865,
"learning_rate": 3.0583691481543493e-07,
"loss": 0.1271,
"step": 2752
},
{
"epoch": 0.75,
"grad_norm": 2.9264533519744873,
"learning_rate": 3.052001798989233e-07,
"loss": 0.1232,
"step": 2753
},
{
"epoch": 0.75,
"grad_norm": 2.7810049057006836,
"learning_rate": 3.045639891124585e-07,
"loss": 0.1071,
"step": 2754
},
{
"epoch": 0.75,
"grad_norm": 2.9589149951934814,
"learning_rate": 3.039283429542707e-07,
"loss": 0.1192,
"step": 2755
},
{
"epoch": 0.75,
"grad_norm": 2.6391968727111816,
"learning_rate": 3.032932419221644e-07,
"loss": 0.1068,
"step": 2756
},
{
"epoch": 0.75,
"grad_norm": 2.6416263580322266,
"learning_rate": 3.026586865135171e-07,
"loss": 0.1014,
"step": 2757
},
{
"epoch": 0.75,
"grad_norm": 2.9932734966278076,
"learning_rate": 3.0202467722527823e-07,
"loss": 0.1235,
"step": 2758
},
{
"epoch": 0.75,
"grad_norm": 2.700866460800171,
"learning_rate": 3.0139121455396985e-07,
"loss": 0.1189,
"step": 2759
},
{
"epoch": 0.75,
"grad_norm": 2.708264112472534,
"learning_rate": 3.0075829899568593e-07,
"loss": 0.1093,
"step": 2760
},
{
"epoch": 0.75,
"grad_norm": 2.864259958267212,
"learning_rate": 3.001259310460923e-07,
"loss": 0.121,
"step": 2761
},
{
"epoch": 0.75,
"grad_norm": 3.0818142890930176,
"learning_rate": 2.99494111200426e-07,
"loss": 0.1325,
"step": 2762
},
{
"epoch": 0.75,
"grad_norm": 2.729863166809082,
"learning_rate": 2.9886283995349413e-07,
"loss": 0.1107,
"step": 2763
},
{
"epoch": 0.76,
"grad_norm": 2.9157660007476807,
"learning_rate": 2.9823211779967485e-07,
"loss": 0.1162,
"step": 2764
},
{
"epoch": 0.76,
"grad_norm": 2.7944540977478027,
"learning_rate": 2.9760194523291525e-07,
"loss": 0.1138,
"step": 2765
},
{
"epoch": 0.76,
"grad_norm": 2.658810615539551,
"learning_rate": 2.9697232274673355e-07,
"loss": 0.1138,
"step": 2766
},
{
"epoch": 0.76,
"grad_norm": 2.8354992866516113,
"learning_rate": 2.963432508342164e-07,
"loss": 0.1384,
"step": 2767
},
{
"epoch": 0.76,
"grad_norm": 2.7997586727142334,
"learning_rate": 2.9571472998801903e-07,
"loss": 0.12,
"step": 2768
},
{
"epoch": 0.76,
"grad_norm": 2.548947811126709,
"learning_rate": 2.950867607003653e-07,
"loss": 0.1161,
"step": 2769
},
{
"epoch": 0.76,
"grad_norm": 2.7747020721435547,
"learning_rate": 2.9445934346304703e-07,
"loss": 0.1112,
"step": 2770
},
{
"epoch": 0.76,
"grad_norm": 2.5072803497314453,
"learning_rate": 2.938324787674239e-07,
"loss": 0.1101,
"step": 2771
},
{
"epoch": 0.76,
"grad_norm": 2.520566463470459,
"learning_rate": 2.9320616710442326e-07,
"loss": 0.1012,
"step": 2772
},
{
"epoch": 0.76,
"grad_norm": 2.9922077655792236,
"learning_rate": 2.9258040896453864e-07,
"loss": 0.1182,
"step": 2773
},
{
"epoch": 0.76,
"grad_norm": 2.825206756591797,
"learning_rate": 2.919552048378302e-07,
"loss": 0.1155,
"step": 2774
},
{
"epoch": 0.76,
"grad_norm": 2.78865647315979,
"learning_rate": 2.91330555213924e-07,
"loss": 0.123,
"step": 2775
},
{
"epoch": 0.76,
"grad_norm": 2.683983564376831,
"learning_rate": 2.9070646058201276e-07,
"loss": 0.1176,
"step": 2776
},
{
"epoch": 0.76,
"grad_norm": 3.0451362133026123,
"learning_rate": 2.9008292143085413e-07,
"loss": 0.1172,
"step": 2777
},
{
"epoch": 0.76,
"grad_norm": 2.7335100173950195,
"learning_rate": 2.8945993824877033e-07,
"loss": 0.1129,
"step": 2778
},
{
"epoch": 0.76,
"grad_norm": 2.9792160987854004,
"learning_rate": 2.8883751152364843e-07,
"loss": 0.1227,
"step": 2779
},
{
"epoch": 0.76,
"grad_norm": 2.7221388816833496,
"learning_rate": 2.8821564174293957e-07,
"loss": 0.1123,
"step": 2780
},
{
"epoch": 0.76,
"grad_norm": 2.673191785812378,
"learning_rate": 2.875943293936591e-07,
"loss": 0.1078,
"step": 2781
},
{
"epoch": 0.76,
"grad_norm": 2.7910141944885254,
"learning_rate": 2.8697357496238584e-07,
"loss": 0.1108,
"step": 2782
},
{
"epoch": 0.76,
"grad_norm": 2.705504894256592,
"learning_rate": 2.8635337893526137e-07,
"loss": 0.1151,
"step": 2783
},
{
"epoch": 0.76,
"grad_norm": 2.7229225635528564,
"learning_rate": 2.857337417979898e-07,
"loss": 0.1106,
"step": 2784
},
{
"epoch": 0.76,
"grad_norm": 2.706153154373169,
"learning_rate": 2.851146640358376e-07,
"loss": 0.1175,
"step": 2785
},
{
"epoch": 0.76,
"grad_norm": 2.9575228691101074,
"learning_rate": 2.844961461336336e-07,
"loss": 0.126,
"step": 2786
},
{
"epoch": 0.76,
"grad_norm": 2.809384822845459,
"learning_rate": 2.838781885757684e-07,
"loss": 0.1084,
"step": 2787
},
{
"epoch": 0.76,
"grad_norm": 2.77486252784729,
"learning_rate": 2.8326079184619266e-07,
"loss": 0.116,
"step": 2788
},
{
"epoch": 0.76,
"grad_norm": 2.8169262409210205,
"learning_rate": 2.826439564284189e-07,
"loss": 0.1309,
"step": 2789
},
{
"epoch": 0.76,
"grad_norm": 2.7485415935516357,
"learning_rate": 2.820276828055189e-07,
"loss": 0.1179,
"step": 2790
},
{
"epoch": 0.76,
"grad_norm": 3.030266761779785,
"learning_rate": 2.8141197146012575e-07,
"loss": 0.1189,
"step": 2791
},
{
"epoch": 0.76,
"grad_norm": 2.5383927822113037,
"learning_rate": 2.8079682287443186e-07,
"loss": 0.1062,
"step": 2792
},
{
"epoch": 0.76,
"grad_norm": 2.9429070949554443,
"learning_rate": 2.8018223753018844e-07,
"loss": 0.1112,
"step": 2793
},
{
"epoch": 0.76,
"grad_norm": 3.001762866973877,
"learning_rate": 2.795682159087057e-07,
"loss": 0.1282,
"step": 2794
},
{
"epoch": 0.76,
"grad_norm": 2.8901684284210205,
"learning_rate": 2.7895475849085246e-07,
"loss": 0.124,
"step": 2795
},
{
"epoch": 0.76,
"grad_norm": 2.7526793479919434,
"learning_rate": 2.7834186575705585e-07,
"loss": 0.1157,
"step": 2796
},
{
"epoch": 0.76,
"grad_norm": 2.978522777557373,
"learning_rate": 2.7772953818730106e-07,
"loss": 0.123,
"step": 2797
},
{
"epoch": 0.76,
"grad_norm": 2.629232168197632,
"learning_rate": 2.7711777626112984e-07,
"loss": 0.1056,
"step": 2798
},
{
"epoch": 0.76,
"grad_norm": 2.5201046466827393,
"learning_rate": 2.7650658045764175e-07,
"loss": 0.1092,
"step": 2799
},
{
"epoch": 0.76,
"grad_norm": 2.606663465499878,
"learning_rate": 2.7589595125549193e-07,
"loss": 0.1103,
"step": 2800
},
{
"epoch": 0.77,
"grad_norm": 2.587350606918335,
"learning_rate": 2.7528588913289305e-07,
"loss": 0.112,
"step": 2801
},
{
"epoch": 0.77,
"grad_norm": 2.537712335586548,
"learning_rate": 2.7467639456761337e-07,
"loss": 0.114,
"step": 2802
},
{
"epoch": 0.77,
"grad_norm": 2.9924392700195312,
"learning_rate": 2.740674680369761e-07,
"loss": 0.1395,
"step": 2803
},
{
"epoch": 0.77,
"grad_norm": 2.8187615871429443,
"learning_rate": 2.734591100178597e-07,
"loss": 0.1202,
"step": 2804
},
{
"epoch": 0.77,
"grad_norm": 3.0450353622436523,
"learning_rate": 2.728513209866981e-07,
"loss": 0.1207,
"step": 2805
},
{
"epoch": 0.77,
"grad_norm": 2.924205780029297,
"learning_rate": 2.722441014194786e-07,
"loss": 0.1403,
"step": 2806
},
{
"epoch": 0.77,
"grad_norm": 2.9638490676879883,
"learning_rate": 2.716374517917437e-07,
"loss": 0.1245,
"step": 2807
},
{
"epoch": 0.77,
"grad_norm": 2.8933804035186768,
"learning_rate": 2.7103137257858863e-07,
"loss": 0.1244,
"step": 2808
},
{
"epoch": 0.77,
"grad_norm": 2.6209239959716797,
"learning_rate": 2.7042586425466194e-07,
"loss": 0.1152,
"step": 2809
},
{
"epoch": 0.77,
"grad_norm": 2.8527228832244873,
"learning_rate": 2.6982092729416585e-07,
"loss": 0.1264,
"step": 2810
},
{
"epoch": 0.77,
"grad_norm": 2.9052624702453613,
"learning_rate": 2.692165621708541e-07,
"loss": 0.1313,
"step": 2811
},
{
"epoch": 0.77,
"grad_norm": 3.0797393321990967,
"learning_rate": 2.686127693580338e-07,
"loss": 0.1176,
"step": 2812
},
{
"epoch": 0.77,
"grad_norm": 2.8544461727142334,
"learning_rate": 2.680095493285627e-07,
"loss": 0.1256,
"step": 2813
},
{
"epoch": 0.77,
"grad_norm": 2.90590500831604,
"learning_rate": 2.674069025548502e-07,
"loss": 0.1123,
"step": 2814
},
{
"epoch": 0.77,
"grad_norm": 2.509225368499756,
"learning_rate": 2.668048295088577e-07,
"loss": 0.1049,
"step": 2815
},
{
"epoch": 0.77,
"grad_norm": 2.545013666152954,
"learning_rate": 2.66203330662096e-07,
"loss": 0.1069,
"step": 2816
},
{
"epoch": 0.77,
"grad_norm": 2.698673963546753,
"learning_rate": 2.6560240648562727e-07,
"loss": 0.1135,
"step": 2817
},
{
"epoch": 0.77,
"grad_norm": 2.9910836219787598,
"learning_rate": 2.6500205745006296e-07,
"loss": 0.12,
"step": 2818
},
{
"epoch": 0.77,
"grad_norm": 2.9469501972198486,
"learning_rate": 2.644022840255641e-07,
"loss": 0.118,
"step": 2819
},
{
"epoch": 0.77,
"grad_norm": 2.9956037998199463,
"learning_rate": 2.638030866818416e-07,
"loss": 0.1385,
"step": 2820
},
{
"epoch": 0.77,
"grad_norm": 2.940786123275757,
"learning_rate": 2.6320446588815425e-07,
"loss": 0.1197,
"step": 2821
},
{
"epoch": 0.77,
"grad_norm": 2.6238014698028564,
"learning_rate": 2.6260642211331055e-07,
"loss": 0.112,
"step": 2822
},
{
"epoch": 0.77,
"grad_norm": 2.8474788665771484,
"learning_rate": 2.620089558256655e-07,
"loss": 0.1142,
"step": 2823
},
{
"epoch": 0.77,
"grad_norm": 2.917656898498535,
"learning_rate": 2.614120674931235e-07,
"loss": 0.1298,
"step": 2824
},
{
"epoch": 0.77,
"grad_norm": 2.8403565883636475,
"learning_rate": 2.608157575831352e-07,
"loss": 0.1276,
"step": 2825
},
{
"epoch": 0.77,
"grad_norm": 2.9889371395111084,
"learning_rate": 2.6022002656269846e-07,
"loss": 0.1164,
"step": 2826
},
{
"epoch": 0.77,
"grad_norm": 2.703030824661255,
"learning_rate": 2.596248748983585e-07,
"loss": 0.1034,
"step": 2827
},
{
"epoch": 0.77,
"grad_norm": 2.5715551376342773,
"learning_rate": 2.5903030305620545e-07,
"loss": 0.1192,
"step": 2828
},
{
"epoch": 0.77,
"grad_norm": 3.081249952316284,
"learning_rate": 2.5843631150187707e-07,
"loss": 0.1333,
"step": 2829
},
{
"epoch": 0.77,
"grad_norm": 2.708294630050659,
"learning_rate": 2.5784290070055514e-07,
"loss": 0.1148,
"step": 2830
},
{
"epoch": 0.77,
"grad_norm": 2.5890491008758545,
"learning_rate": 2.572500711169673e-07,
"loss": 0.1203,
"step": 2831
},
{
"epoch": 0.77,
"grad_norm": 3.0034663677215576,
"learning_rate": 2.566578232153863e-07,
"loss": 0.1327,
"step": 2832
},
{
"epoch": 0.77,
"grad_norm": 2.52756667137146,
"learning_rate": 2.560661574596284e-07,
"loss": 0.1102,
"step": 2833
},
{
"epoch": 0.77,
"grad_norm": 2.7598891258239746,
"learning_rate": 2.5547507431305547e-07,
"loss": 0.1031,
"step": 2834
},
{
"epoch": 0.77,
"grad_norm": 2.7306227684020996,
"learning_rate": 2.548845742385717e-07,
"loss": 0.117,
"step": 2835
},
{
"epoch": 0.77,
"grad_norm": 2.724799156188965,
"learning_rate": 2.5429465769862477e-07,
"loss": 0.1139,
"step": 2836
},
{
"epoch": 0.77,
"grad_norm": 2.8449912071228027,
"learning_rate": 2.537053251552065e-07,
"loss": 0.119,
"step": 2837
},
{
"epoch": 0.78,
"grad_norm": 2.7894270420074463,
"learning_rate": 2.531165770698499e-07,
"loss": 0.1238,
"step": 2838
},
{
"epoch": 0.78,
"grad_norm": 2.513648509979248,
"learning_rate": 2.5252841390363165e-07,
"loss": 0.1098,
"step": 2839
},
{
"epoch": 0.78,
"grad_norm": 3.1153371334075928,
"learning_rate": 2.519408361171693e-07,
"loss": 0.1096,
"step": 2840
},
{
"epoch": 0.78,
"grad_norm": 2.546874523162842,
"learning_rate": 2.513538441706221e-07,
"loss": 0.0989,
"step": 2841
},
{
"epoch": 0.78,
"grad_norm": 2.72564435005188,
"learning_rate": 2.5076743852369145e-07,
"loss": 0.1205,
"step": 2842
},
{
"epoch": 0.78,
"grad_norm": 2.6190879344940186,
"learning_rate": 2.50181619635618e-07,
"loss": 0.1044,
"step": 2843
},
{
"epoch": 0.78,
"grad_norm": 2.7285542488098145,
"learning_rate": 2.4959638796518455e-07,
"loss": 0.1099,
"step": 2844
},
{
"epoch": 0.78,
"grad_norm": 2.8141350746154785,
"learning_rate": 2.49011743970713e-07,
"loss": 0.1249,
"step": 2845
},
{
"epoch": 0.78,
"grad_norm": 2.9485819339752197,
"learning_rate": 2.4842768811006477e-07,
"loss": 0.1236,
"step": 2846
},
{
"epoch": 0.78,
"grad_norm": 2.8233511447906494,
"learning_rate": 2.478442208406418e-07,
"loss": 0.1155,
"step": 2847
},
{
"epoch": 0.78,
"grad_norm": 2.608671188354492,
"learning_rate": 2.47261342619384e-07,
"loss": 0.1157,
"step": 2848
},
{
"epoch": 0.78,
"grad_norm": 2.568787097930908,
"learning_rate": 2.466790539027708e-07,
"loss": 0.1108,
"step": 2849
},
{
"epoch": 0.78,
"grad_norm": 2.983731746673584,
"learning_rate": 2.460973551468194e-07,
"loss": 0.1264,
"step": 2850
},
{
"epoch": 0.78,
"grad_norm": 2.872974395751953,
"learning_rate": 2.4551624680708484e-07,
"loss": 0.1288,
"step": 2851
},
{
"epoch": 0.78,
"grad_norm": 2.807682514190674,
"learning_rate": 2.449357293386606e-07,
"loss": 0.1282,
"step": 2852
},
{
"epoch": 0.78,
"grad_norm": 2.9407236576080322,
"learning_rate": 2.4435580319617624e-07,
"loss": 0.1158,
"step": 2853
},
{
"epoch": 0.78,
"grad_norm": 2.835533380508423,
"learning_rate": 2.437764688337998e-07,
"loss": 0.1142,
"step": 2854
},
{
"epoch": 0.78,
"grad_norm": 2.828916072845459,
"learning_rate": 2.431977267052343e-07,
"loss": 0.1264,
"step": 2855
},
{
"epoch": 0.78,
"grad_norm": 2.686336040496826,
"learning_rate": 2.426195772637195e-07,
"loss": 0.1134,
"step": 2856
},
{
"epoch": 0.78,
"grad_norm": 2.516364812850952,
"learning_rate": 2.4204202096203163e-07,
"loss": 0.0996,
"step": 2857
},
{
"epoch": 0.78,
"grad_norm": 2.841402769088745,
"learning_rate": 2.4146505825248143e-07,
"loss": 0.1279,
"step": 2858
},
{
"epoch": 0.78,
"grad_norm": 2.809781074523926,
"learning_rate": 2.408886895869157e-07,
"loss": 0.1215,
"step": 2859
},
{
"epoch": 0.78,
"grad_norm": 2.394096612930298,
"learning_rate": 2.403129154167153e-07,
"loss": 0.099,
"step": 2860
},
{
"epoch": 0.78,
"grad_norm": 2.788119316101074,
"learning_rate": 2.3973773619279533e-07,
"loss": 0.1118,
"step": 2861
},
{
"epoch": 0.78,
"grad_norm": 2.585268497467041,
"learning_rate": 2.391631523656058e-07,
"loss": 0.099,
"step": 2862
},
{
"epoch": 0.78,
"grad_norm": 2.89695143699646,
"learning_rate": 2.3858916438513043e-07,
"loss": 0.1131,
"step": 2863
},
{
"epoch": 0.78,
"grad_norm": 2.754573345184326,
"learning_rate": 2.3801577270088535e-07,
"loss": 0.1168,
"step": 2864
},
{
"epoch": 0.78,
"grad_norm": 2.724893093109131,
"learning_rate": 2.3744297776192047e-07,
"loss": 0.1237,
"step": 2865
},
{
"epoch": 0.78,
"grad_norm": 2.6611545085906982,
"learning_rate": 2.368707800168176e-07,
"loss": 0.1104,
"step": 2866
},
{
"epoch": 0.78,
"grad_norm": 2.899855136871338,
"learning_rate": 2.3629917991369198e-07,
"loss": 0.1189,
"step": 2867
},
{
"epoch": 0.78,
"grad_norm": 2.6722350120544434,
"learning_rate": 2.357281779001904e-07,
"loss": 0.1111,
"step": 2868
},
{
"epoch": 0.78,
"grad_norm": 2.7901811599731445,
"learning_rate": 2.351577744234907e-07,
"loss": 0.1186,
"step": 2869
},
{
"epoch": 0.78,
"grad_norm": 2.8303353786468506,
"learning_rate": 2.345879699303025e-07,
"loss": 0.1247,
"step": 2870
},
{
"epoch": 0.78,
"grad_norm": 2.8870644569396973,
"learning_rate": 2.340187648668658e-07,
"loss": 0.1221,
"step": 2871
},
{
"epoch": 0.78,
"grad_norm": 2.6357645988464355,
"learning_rate": 2.3345015967895197e-07,
"loss": 0.1229,
"step": 2872
},
{
"epoch": 0.78,
"grad_norm": 2.5500104427337646,
"learning_rate": 2.3288215481186235e-07,
"loss": 0.1128,
"step": 2873
},
{
"epoch": 0.79,
"grad_norm": 2.651008367538452,
"learning_rate": 2.3231475071042773e-07,
"loss": 0.1109,
"step": 2874
},
{
"epoch": 0.79,
"grad_norm": 2.873354196548462,
"learning_rate": 2.3174794781900853e-07,
"loss": 0.1139,
"step": 2875
},
{
"epoch": 0.79,
"grad_norm": 3.0768046379089355,
"learning_rate": 2.3118174658149436e-07,
"loss": 0.1296,
"step": 2876
},
{
"epoch": 0.79,
"grad_norm": 2.6786396503448486,
"learning_rate": 2.30616147441304e-07,
"loss": 0.1111,
"step": 2877
},
{
"epoch": 0.79,
"grad_norm": 3.07175612449646,
"learning_rate": 2.300511508413845e-07,
"loss": 0.1286,
"step": 2878
},
{
"epoch": 0.79,
"grad_norm": 2.588007926940918,
"learning_rate": 2.2948675722421085e-07,
"loss": 0.1002,
"step": 2879
},
{
"epoch": 0.79,
"grad_norm": 2.700286626815796,
"learning_rate": 2.2892296703178592e-07,
"loss": 0.1204,
"step": 2880
},
{
"epoch": 0.79,
"grad_norm": 2.6056156158447266,
"learning_rate": 2.283597807056399e-07,
"loss": 0.0977,
"step": 2881
},
{
"epoch": 0.79,
"grad_norm": 2.98291277885437,
"learning_rate": 2.2779719868683013e-07,
"loss": 0.1211,
"step": 2882
},
{
"epoch": 0.79,
"grad_norm": 2.6762681007385254,
"learning_rate": 2.272352214159412e-07,
"loss": 0.1167,
"step": 2883
},
{
"epoch": 0.79,
"grad_norm": 2.624441623687744,
"learning_rate": 2.2667384933308352e-07,
"loss": 0.1106,
"step": 2884
},
{
"epoch": 0.79,
"grad_norm": 2.845133066177368,
"learning_rate": 2.2611308287789344e-07,
"loss": 0.1255,
"step": 2885
},
{
"epoch": 0.79,
"grad_norm": 2.8640010356903076,
"learning_rate": 2.2555292248953305e-07,
"loss": 0.1304,
"step": 2886
},
{
"epoch": 0.79,
"grad_norm": 2.504380702972412,
"learning_rate": 2.2499336860669028e-07,
"loss": 0.1037,
"step": 2887
},
{
"epoch": 0.79,
"grad_norm": 2.75575852394104,
"learning_rate": 2.244344216675781e-07,
"loss": 0.1044,
"step": 2888
},
{
"epoch": 0.79,
"grad_norm": 2.4212238788604736,
"learning_rate": 2.2387608210993346e-07,
"loss": 0.0993,
"step": 2889
},
{
"epoch": 0.79,
"grad_norm": 2.851504325866699,
"learning_rate": 2.233183503710182e-07,
"loss": 0.122,
"step": 2890
},
{
"epoch": 0.79,
"grad_norm": 2.895587682723999,
"learning_rate": 2.2276122688761757e-07,
"loss": 0.1126,
"step": 2891
},
{
"epoch": 0.79,
"grad_norm": 2.7983148097991943,
"learning_rate": 2.2220471209604119e-07,
"loss": 0.1244,
"step": 2892
},
{
"epoch": 0.79,
"grad_norm": 2.6924493312835693,
"learning_rate": 2.2164880643212192e-07,
"loss": 0.104,
"step": 2893
},
{
"epoch": 0.79,
"grad_norm": 2.982330322265625,
"learning_rate": 2.2109351033121514e-07,
"loss": 0.1264,
"step": 2894
},
{
"epoch": 0.79,
"grad_norm": 2.5642998218536377,
"learning_rate": 2.2053882422819902e-07,
"loss": 0.1027,
"step": 2895
},
{
"epoch": 0.79,
"grad_norm": 2.423588752746582,
"learning_rate": 2.1998474855747373e-07,
"loss": 0.0971,
"step": 2896
},
{
"epoch": 0.79,
"grad_norm": 2.9514458179473877,
"learning_rate": 2.1943128375296194e-07,
"loss": 0.1204,
"step": 2897
},
{
"epoch": 0.79,
"grad_norm": 2.645742893218994,
"learning_rate": 2.1887843024810803e-07,
"loss": 0.1074,
"step": 2898
},
{
"epoch": 0.79,
"grad_norm": 2.7427890300750732,
"learning_rate": 2.183261884758769e-07,
"loss": 0.1093,
"step": 2899
},
{
"epoch": 0.79,
"grad_norm": 2.7665889263153076,
"learning_rate": 2.1777455886875496e-07,
"loss": 0.1043,
"step": 2900
},
{
"epoch": 0.79,
"grad_norm": 2.7598376274108887,
"learning_rate": 2.1722354185874846e-07,
"loss": 0.1128,
"step": 2901
},
{
"epoch": 0.79,
"grad_norm": 2.80429744720459,
"learning_rate": 2.1667313787738496e-07,
"loss": 0.1105,
"step": 2902
},
{
"epoch": 0.79,
"grad_norm": 2.705533742904663,
"learning_rate": 2.161233473557116e-07,
"loss": 0.111,
"step": 2903
},
{
"epoch": 0.79,
"grad_norm": 2.8968639373779297,
"learning_rate": 2.1557417072429451e-07,
"loss": 0.1219,
"step": 2904
},
{
"epoch": 0.79,
"grad_norm": 2.773904323577881,
"learning_rate": 2.150256084132196e-07,
"loss": 0.1118,
"step": 2905
},
{
"epoch": 0.79,
"grad_norm": 2.6877477169036865,
"learning_rate": 2.144776608520913e-07,
"loss": 0.1079,
"step": 2906
},
{
"epoch": 0.79,
"grad_norm": 2.963486433029175,
"learning_rate": 2.1393032847003289e-07,
"loss": 0.1081,
"step": 2907
},
{
"epoch": 0.79,
"grad_norm": 2.715536117553711,
"learning_rate": 2.133836116956862e-07,
"loss": 0.1106,
"step": 2908
},
{
"epoch": 0.79,
"grad_norm": 2.7190189361572266,
"learning_rate": 2.1283751095721024e-07,
"loss": 0.1108,
"step": 2909
},
{
"epoch": 0.79,
"grad_norm": 2.8466503620147705,
"learning_rate": 2.1229202668228196e-07,
"loss": 0.1122,
"step": 2910
},
{
"epoch": 0.8,
"grad_norm": 3.005613088607788,
"learning_rate": 2.1174715929809516e-07,
"loss": 0.1126,
"step": 2911
},
{
"epoch": 0.8,
"grad_norm": 2.653109312057495,
"learning_rate": 2.1120290923136107e-07,
"loss": 0.1151,
"step": 2912
},
{
"epoch": 0.8,
"grad_norm": 2.745866060256958,
"learning_rate": 2.1065927690830752e-07,
"loss": 0.1112,
"step": 2913
},
{
"epoch": 0.8,
"grad_norm": 3.076171636581421,
"learning_rate": 2.1011626275467808e-07,
"loss": 0.1361,
"step": 2914
},
{
"epoch": 0.8,
"grad_norm": 2.815768003463745,
"learning_rate": 2.0957386719573223e-07,
"loss": 0.1189,
"step": 2915
},
{
"epoch": 0.8,
"grad_norm": 2.729424476623535,
"learning_rate": 2.0903209065624484e-07,
"loss": 0.1127,
"step": 2916
},
{
"epoch": 0.8,
"grad_norm": 2.8785223960876465,
"learning_rate": 2.0849093356050685e-07,
"loss": 0.1361,
"step": 2917
},
{
"epoch": 0.8,
"grad_norm": 2.7613465785980225,
"learning_rate": 2.0795039633232346e-07,
"loss": 0.1212,
"step": 2918
},
{
"epoch": 0.8,
"grad_norm": 2.8793344497680664,
"learning_rate": 2.0741047939501434e-07,
"loss": 0.1197,
"step": 2919
},
{
"epoch": 0.8,
"grad_norm": 2.81070876121521,
"learning_rate": 2.0687118317141406e-07,
"loss": 0.1142,
"step": 2920
},
{
"epoch": 0.8,
"grad_norm": 2.676225423812866,
"learning_rate": 2.063325080838697e-07,
"loss": 0.1138,
"step": 2921
},
{
"epoch": 0.8,
"grad_norm": 2.6334424018859863,
"learning_rate": 2.0579445455424315e-07,
"loss": 0.119,
"step": 2922
},
{
"epoch": 0.8,
"grad_norm": 2.856642484664917,
"learning_rate": 2.0525702300390945e-07,
"loss": 0.123,
"step": 2923
},
{
"epoch": 0.8,
"grad_norm": 2.81278395652771,
"learning_rate": 2.0472021385375572e-07,
"loss": 0.1154,
"step": 2924
},
{
"epoch": 0.8,
"grad_norm": 2.5518946647644043,
"learning_rate": 2.0418402752418283e-07,
"loss": 0.1129,
"step": 2925
},
{
"epoch": 0.8,
"grad_norm": 2.406761646270752,
"learning_rate": 2.0364846443510276e-07,
"loss": 0.1062,
"step": 2926
},
{
"epoch": 0.8,
"grad_norm": 2.778789758682251,
"learning_rate": 2.031135250059397e-07,
"loss": 0.1211,
"step": 2927
},
{
"epoch": 0.8,
"grad_norm": 2.820254325866699,
"learning_rate": 2.0257920965563012e-07,
"loss": 0.1083,
"step": 2928
},
{
"epoch": 0.8,
"grad_norm": 2.8451895713806152,
"learning_rate": 2.0204551880262066e-07,
"loss": 0.1135,
"step": 2929
},
{
"epoch": 0.8,
"grad_norm": 3.0596845149993896,
"learning_rate": 2.0151245286486996e-07,
"loss": 0.1306,
"step": 2930
},
{
"epoch": 0.8,
"grad_norm": 2.6155900955200195,
"learning_rate": 2.009800122598465e-07,
"loss": 0.1066,
"step": 2931
},
{
"epoch": 0.8,
"grad_norm": 2.4980719089508057,
"learning_rate": 2.0044819740452911e-07,
"loss": 0.1001,
"step": 2932
},
{
"epoch": 0.8,
"grad_norm": 2.474144458770752,
"learning_rate": 1.9991700871540708e-07,
"loss": 0.1033,
"step": 2933
},
{
"epoch": 0.8,
"grad_norm": 2.7342374324798584,
"learning_rate": 1.993864466084786e-07,
"loss": 0.1024,
"step": 2934
},
{
"epoch": 0.8,
"grad_norm": 2.7930715084075928,
"learning_rate": 1.9885651149925188e-07,
"loss": 0.1055,
"step": 2935
},
{
"epoch": 0.8,
"grad_norm": 2.949699640274048,
"learning_rate": 1.983272038027437e-07,
"loss": 0.1316,
"step": 2936
},
{
"epoch": 0.8,
"grad_norm": 2.5166850090026855,
"learning_rate": 1.9779852393347907e-07,
"loss": 0.1125,
"step": 2937
},
{
"epoch": 0.8,
"grad_norm": 2.5932557582855225,
"learning_rate": 1.9727047230549242e-07,
"loss": 0.105,
"step": 2938
},
{
"epoch": 0.8,
"grad_norm": 2.8534560203552246,
"learning_rate": 1.9674304933232498e-07,
"loss": 0.1143,
"step": 2939
},
{
"epoch": 0.8,
"grad_norm": 3.1329243183135986,
"learning_rate": 1.962162554270267e-07,
"loss": 0.1247,
"step": 2940
},
{
"epoch": 0.8,
"grad_norm": 3.1219422817230225,
"learning_rate": 1.9569009100215418e-07,
"loss": 0.1282,
"step": 2941
},
{
"epoch": 0.8,
"grad_norm": 2.9756643772125244,
"learning_rate": 1.9516455646977103e-07,
"loss": 0.1147,
"step": 2942
},
{
"epoch": 0.8,
"grad_norm": 2.833127498626709,
"learning_rate": 1.9463965224144807e-07,
"loss": 0.1119,
"step": 2943
},
{
"epoch": 0.8,
"grad_norm": 2.7026820182800293,
"learning_rate": 1.94115378728262e-07,
"loss": 0.1098,
"step": 2944
},
{
"epoch": 0.8,
"grad_norm": 2.903571844100952,
"learning_rate": 1.9359173634079606e-07,
"loss": 0.1277,
"step": 2945
},
{
"epoch": 0.8,
"grad_norm": 2.713010787963867,
"learning_rate": 1.9306872548913876e-07,
"loss": 0.1058,
"step": 2946
},
{
"epoch": 0.81,
"grad_norm": 3.126382827758789,
"learning_rate": 1.9254634658288405e-07,
"loss": 0.1176,
"step": 2947
},
{
"epoch": 0.81,
"grad_norm": 2.9158408641815186,
"learning_rate": 1.920246000311315e-07,
"loss": 0.1177,
"step": 2948
},
{
"epoch": 0.81,
"grad_norm": 2.6403298377990723,
"learning_rate": 1.9150348624248468e-07,
"loss": 0.1078,
"step": 2949
},
{
"epoch": 0.81,
"grad_norm": 2.958865165710449,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.1322,
"step": 2950
},
{
"epoch": 0.81,
"grad_norm": 2.862724542617798,
"learning_rate": 1.9046315858644746e-07,
"loss": 0.1204,
"step": 2951
},
{
"epoch": 0.81,
"grad_norm": 2.818732738494873,
"learning_rate": 1.8994394553378556e-07,
"loss": 0.1227,
"step": 2952
},
{
"epoch": 0.81,
"grad_norm": 2.4232325553894043,
"learning_rate": 1.8942536687368703e-07,
"loss": 0.104,
"step": 2953
},
{
"epoch": 0.81,
"grad_norm": 2.7659695148468018,
"learning_rate": 1.8890742301227468e-07,
"loss": 0.1233,
"step": 2954
},
{
"epoch": 0.81,
"grad_norm": 2.607856273651123,
"learning_rate": 1.883901143551747e-07,
"loss": 0.1081,
"step": 2955
},
{
"epoch": 0.81,
"grad_norm": 2.6588351726531982,
"learning_rate": 1.878734413075156e-07,
"loss": 0.1048,
"step": 2956
},
{
"epoch": 0.81,
"grad_norm": 2.793586015701294,
"learning_rate": 1.8735740427392755e-07,
"loss": 0.1179,
"step": 2957
},
{
"epoch": 0.81,
"grad_norm": 2.70487117767334,
"learning_rate": 1.8684200365854375e-07,
"loss": 0.1067,
"step": 2958
},
{
"epoch": 0.81,
"grad_norm": 2.7607598304748535,
"learning_rate": 1.8632723986499787e-07,
"loss": 0.1112,
"step": 2959
},
{
"epoch": 0.81,
"grad_norm": 2.9593124389648438,
"learning_rate": 1.8581311329642591e-07,
"loss": 0.1169,
"step": 2960
},
{
"epoch": 0.81,
"grad_norm": 2.719787359237671,
"learning_rate": 1.8529962435546398e-07,
"loss": 0.1193,
"step": 2961
},
{
"epoch": 0.81,
"grad_norm": 2.843885898590088,
"learning_rate": 1.8478677344424898e-07,
"loss": 0.1216,
"step": 2962
},
{
"epoch": 0.81,
"grad_norm": 2.6336824893951416,
"learning_rate": 1.8427456096441874e-07,
"loss": 0.109,
"step": 2963
},
{
"epoch": 0.81,
"grad_norm": 2.624443769454956,
"learning_rate": 1.8376298731711016e-07,
"loss": 0.1055,
"step": 2964
},
{
"epoch": 0.81,
"grad_norm": 2.865316152572632,
"learning_rate": 1.8325205290296098e-07,
"loss": 0.1169,
"step": 2965
},
{
"epoch": 0.81,
"grad_norm": 2.6282095909118652,
"learning_rate": 1.8274175812210724e-07,
"loss": 0.1084,
"step": 2966
},
{
"epoch": 0.81,
"grad_norm": 2.7483646869659424,
"learning_rate": 1.822321033741845e-07,
"loss": 0.1177,
"step": 2967
},
{
"epoch": 0.81,
"grad_norm": 2.7712483406066895,
"learning_rate": 1.8172308905832735e-07,
"loss": 0.1124,
"step": 2968
},
{
"epoch": 0.81,
"grad_norm": 2.7491652965545654,
"learning_rate": 1.8121471557316813e-07,
"loss": 0.1081,
"step": 2969
},
{
"epoch": 0.81,
"grad_norm": 2.5816409587860107,
"learning_rate": 1.8070698331683841e-07,
"loss": 0.1048,
"step": 2970
},
{
"epoch": 0.81,
"grad_norm": 2.7578561305999756,
"learning_rate": 1.8019989268696666e-07,
"loss": 0.1077,
"step": 2971
},
{
"epoch": 0.81,
"grad_norm": 2.678293466567993,
"learning_rate": 1.7969344408067866e-07,
"loss": 0.1237,
"step": 2972
},
{
"epoch": 0.81,
"grad_norm": 2.8176612854003906,
"learning_rate": 1.7918763789459857e-07,
"loss": 0.1211,
"step": 2973
},
{
"epoch": 0.81,
"grad_norm": 2.752276659011841,
"learning_rate": 1.7868247452484608e-07,
"loss": 0.1069,
"step": 2974
},
{
"epoch": 0.81,
"grad_norm": 2.606996536254883,
"learning_rate": 1.7817795436703874e-07,
"loss": 0.1107,
"step": 2975
},
{
"epoch": 0.81,
"grad_norm": 2.7089779376983643,
"learning_rate": 1.776740778162895e-07,
"loss": 0.1176,
"step": 2976
},
{
"epoch": 0.81,
"grad_norm": 3.056824207305908,
"learning_rate": 1.7717084526720728e-07,
"loss": 0.1214,
"step": 2977
},
{
"epoch": 0.81,
"grad_norm": 2.5314202308654785,
"learning_rate": 1.7666825711389722e-07,
"loss": 0.0998,
"step": 2978
},
{
"epoch": 0.81,
"grad_norm": 2.7676849365234375,
"learning_rate": 1.7616631374995904e-07,
"loss": 0.117,
"step": 2979
},
{
"epoch": 0.81,
"grad_norm": 2.4773690700531006,
"learning_rate": 1.7566501556848855e-07,
"loss": 0.0979,
"step": 2980
},
{
"epoch": 0.81,
"grad_norm": 2.7643463611602783,
"learning_rate": 1.7516436296207538e-07,
"loss": 0.1172,
"step": 2981
},
{
"epoch": 0.81,
"grad_norm": 2.681955337524414,
"learning_rate": 1.7466435632280352e-07,
"loss": 0.1206,
"step": 2982
},
{
"epoch": 0.81,
"grad_norm": 2.6423914432525635,
"learning_rate": 1.7416499604225176e-07,
"loss": 0.111,
"step": 2983
},
{
"epoch": 0.82,
"grad_norm": 2.6484272480010986,
"learning_rate": 1.7366628251149252e-07,
"loss": 0.1061,
"step": 2984
},
{
"epoch": 0.82,
"grad_norm": 3.124119281768799,
"learning_rate": 1.7316821612109135e-07,
"loss": 0.1196,
"step": 2985
},
{
"epoch": 0.82,
"grad_norm": 2.889885187149048,
"learning_rate": 1.7267079726110723e-07,
"loss": 0.1266,
"step": 2986
},
{
"epoch": 0.82,
"grad_norm": 2.9232211112976074,
"learning_rate": 1.721740263210918e-07,
"loss": 0.1216,
"step": 2987
},
{
"epoch": 0.82,
"grad_norm": 2.7191336154937744,
"learning_rate": 1.716779036900895e-07,
"loss": 0.1033,
"step": 2988
},
{
"epoch": 0.82,
"grad_norm": 2.7927796840667725,
"learning_rate": 1.7118242975663754e-07,
"loss": 0.116,
"step": 2989
},
{
"epoch": 0.82,
"grad_norm": 3.0129342079162598,
"learning_rate": 1.7068760490876422e-07,
"loss": 0.1265,
"step": 2990
},
{
"epoch": 0.82,
"grad_norm": 2.707798480987549,
"learning_rate": 1.7019342953398997e-07,
"loss": 0.1153,
"step": 2991
},
{
"epoch": 0.82,
"grad_norm": 3.0737831592559814,
"learning_rate": 1.696999040193261e-07,
"loss": 0.1102,
"step": 2992
},
{
"epoch": 0.82,
"grad_norm": 2.4257168769836426,
"learning_rate": 1.692070287512758e-07,
"loss": 0.1031,
"step": 2993
},
{
"epoch": 0.82,
"grad_norm": 2.5452399253845215,
"learning_rate": 1.6871480411583283e-07,
"loss": 0.0914,
"step": 2994
},
{
"epoch": 0.82,
"grad_norm": 2.894639253616333,
"learning_rate": 1.6822323049848087e-07,
"loss": 0.133,
"step": 2995
},
{
"epoch": 0.82,
"grad_norm": 2.816392183303833,
"learning_rate": 1.6773230828419405e-07,
"loss": 0.1206,
"step": 2996
},
{
"epoch": 0.82,
"grad_norm": 3.0011281967163086,
"learning_rate": 1.672420378574363e-07,
"loss": 0.1242,
"step": 2997
},
{
"epoch": 0.82,
"grad_norm": 2.957533597946167,
"learning_rate": 1.6675241960216125e-07,
"loss": 0.1177,
"step": 2998
},
{
"epoch": 0.82,
"grad_norm": 2.866485595703125,
"learning_rate": 1.6626345390181206e-07,
"loss": 0.1113,
"step": 2999
},
{
"epoch": 0.82,
"grad_norm": 3.1493520736694336,
"learning_rate": 1.6577514113932035e-07,
"loss": 0.1318,
"step": 3000
},
{
"epoch": 0.82,
"grad_norm": 2.880171298980713,
"learning_rate": 1.6528748169710638e-07,
"loss": 0.1252,
"step": 3001
},
{
"epoch": 0.82,
"grad_norm": 2.9624109268188477,
"learning_rate": 1.648004759570787e-07,
"loss": 0.1167,
"step": 3002
},
{
"epoch": 0.82,
"grad_norm": 2.7028636932373047,
"learning_rate": 1.6431412430063462e-07,
"loss": 0.1129,
"step": 3003
},
{
"epoch": 0.82,
"grad_norm": 2.9838626384735107,
"learning_rate": 1.6382842710865875e-07,
"loss": 0.1261,
"step": 3004
},
{
"epoch": 0.82,
"grad_norm": 2.775270938873291,
"learning_rate": 1.6334338476152288e-07,
"loss": 0.1173,
"step": 3005
},
{
"epoch": 0.82,
"grad_norm": 3.03607177734375,
"learning_rate": 1.628589976390865e-07,
"loss": 0.1228,
"step": 3006
},
{
"epoch": 0.82,
"grad_norm": 2.8795881271362305,
"learning_rate": 1.6237526612069508e-07,
"loss": 0.1097,
"step": 3007
},
{
"epoch": 0.82,
"grad_norm": 3.080247640609741,
"learning_rate": 1.6189219058518177e-07,
"loss": 0.1245,
"step": 3008
},
{
"epoch": 0.82,
"grad_norm": 2.7216033935546875,
"learning_rate": 1.6140977141086575e-07,
"loss": 0.1128,
"step": 3009
},
{
"epoch": 0.82,
"grad_norm": 2.648832082748413,
"learning_rate": 1.6092800897555148e-07,
"loss": 0.1059,
"step": 3010
},
{
"epoch": 0.82,
"grad_norm": 2.8429346084594727,
"learning_rate": 1.6044690365652957e-07,
"loss": 0.1191,
"step": 3011
},
{
"epoch": 0.82,
"grad_norm": 3.0376110076904297,
"learning_rate": 1.599664558305759e-07,
"loss": 0.1244,
"step": 3012
},
{
"epoch": 0.82,
"grad_norm": 2.5975003242492676,
"learning_rate": 1.5948666587395142e-07,
"loss": 0.0968,
"step": 3013
},
{
"epoch": 0.82,
"grad_norm": 2.4642364978790283,
"learning_rate": 1.5900753416240255e-07,
"loss": 0.0967,
"step": 3014
},
{
"epoch": 0.82,
"grad_norm": 3.507802724838257,
"learning_rate": 1.5852906107115893e-07,
"loss": 0.1174,
"step": 3015
},
{
"epoch": 0.82,
"grad_norm": 2.7020273208618164,
"learning_rate": 1.5805124697493578e-07,
"loss": 0.1116,
"step": 3016
},
{
"epoch": 0.82,
"grad_norm": 2.912220001220703,
"learning_rate": 1.5757409224793072e-07,
"loss": 0.1152,
"step": 3017
},
{
"epoch": 0.82,
"grad_norm": 2.606558084487915,
"learning_rate": 1.5709759726382621e-07,
"loss": 0.0978,
"step": 3018
},
{
"epoch": 0.82,
"grad_norm": 3.11519718170166,
"learning_rate": 1.5662176239578773e-07,
"loss": 0.127,
"step": 3019
},
{
"epoch": 0.82,
"grad_norm": 2.914428949356079,
"learning_rate": 1.5614658801646353e-07,
"loss": 0.1095,
"step": 3020
},
{
"epoch": 0.83,
"grad_norm": 2.8984978199005127,
"learning_rate": 1.5567207449798515e-07,
"loss": 0.1234,
"step": 3021
},
{
"epoch": 0.83,
"grad_norm": 2.8738179206848145,
"learning_rate": 1.5519822221196544e-07,
"loss": 0.1194,
"step": 3022
},
{
"epoch": 0.83,
"grad_norm": 2.75415301322937,
"learning_rate": 1.5472503152950056e-07,
"loss": 0.1151,
"step": 3023
},
{
"epoch": 0.83,
"grad_norm": 2.94234561920166,
"learning_rate": 1.5425250282116842e-07,
"loss": 0.1204,
"step": 3024
},
{
"epoch": 0.83,
"grad_norm": 2.6877970695495605,
"learning_rate": 1.5378063645702766e-07,
"loss": 0.1145,
"step": 3025
},
{
"epoch": 0.83,
"grad_norm": 2.7449467182159424,
"learning_rate": 1.5330943280661967e-07,
"loss": 0.1202,
"step": 3026
},
{
"epoch": 0.83,
"grad_norm": 2.5695841312408447,
"learning_rate": 1.5283889223896474e-07,
"loss": 0.107,
"step": 3027
},
{
"epoch": 0.83,
"grad_norm": 2.8050339221954346,
"learning_rate": 1.5236901512256573e-07,
"loss": 0.1172,
"step": 3028
},
{
"epoch": 0.83,
"grad_norm": 2.907428503036499,
"learning_rate": 1.518998018254054e-07,
"loss": 0.1275,
"step": 3029
},
{
"epoch": 0.83,
"grad_norm": 2.703794002532959,
"learning_rate": 1.5143125271494606e-07,
"loss": 0.103,
"step": 3030
},
{
"epoch": 0.83,
"grad_norm": 2.751523733139038,
"learning_rate": 1.5096336815813103e-07,
"loss": 0.1107,
"step": 3031
},
{
"epoch": 0.83,
"grad_norm": 2.788540840148926,
"learning_rate": 1.5049614852138148e-07,
"loss": 0.1229,
"step": 3032
},
{
"epoch": 0.83,
"grad_norm": 2.535400867462158,
"learning_rate": 1.5002959417059935e-07,
"loss": 0.0967,
"step": 3033
},
{
"epoch": 0.83,
"grad_norm": 2.8100125789642334,
"learning_rate": 1.4956370547116527e-07,
"loss": 0.1078,
"step": 3034
},
{
"epoch": 0.83,
"grad_norm": 3.075692892074585,
"learning_rate": 1.490984827879378e-07,
"loss": 0.1261,
"step": 3035
},
{
"epoch": 0.83,
"grad_norm": 2.9490807056427,
"learning_rate": 1.486339264852553e-07,
"loss": 0.1387,
"step": 3036
},
{
"epoch": 0.83,
"grad_norm": 2.610565662384033,
"learning_rate": 1.481700369269323e-07,
"loss": 0.1008,
"step": 3037
},
{
"epoch": 0.83,
"grad_norm": 2.6819276809692383,
"learning_rate": 1.47706814476263e-07,
"loss": 0.115,
"step": 3038
},
{
"epoch": 0.83,
"grad_norm": 2.665633201599121,
"learning_rate": 1.4724425949601837e-07,
"loss": 0.1104,
"step": 3039
},
{
"epoch": 0.83,
"grad_norm": 2.8097269535064697,
"learning_rate": 1.4678237234844648e-07,
"loss": 0.1225,
"step": 3040
},
{
"epoch": 0.83,
"grad_norm": 2.511746644973755,
"learning_rate": 1.4632115339527306e-07,
"loss": 0.1047,
"step": 3041
},
{
"epoch": 0.83,
"grad_norm": 2.7344541549682617,
"learning_rate": 1.4586060299769975e-07,
"loss": 0.115,
"step": 3042
},
{
"epoch": 0.83,
"grad_norm": 2.909001111984253,
"learning_rate": 1.4540072151640493e-07,
"loss": 0.11,
"step": 3043
},
{
"epoch": 0.83,
"grad_norm": 2.7571046352386475,
"learning_rate": 1.4494150931154358e-07,
"loss": 0.1176,
"step": 3044
},
{
"epoch": 0.83,
"grad_norm": 2.781496524810791,
"learning_rate": 1.4448296674274564e-07,
"loss": 0.1262,
"step": 3045
},
{
"epoch": 0.83,
"grad_norm": 3.0702664852142334,
"learning_rate": 1.4402509416911756e-07,
"loss": 0.1353,
"step": 3046
},
{
"epoch": 0.83,
"grad_norm": 2.733211040496826,
"learning_rate": 1.4356789194924045e-07,
"loss": 0.1069,
"step": 3047
},
{
"epoch": 0.83,
"grad_norm": 2.7156100273132324,
"learning_rate": 1.4311136044117033e-07,
"loss": 0.1042,
"step": 3048
},
{
"epoch": 0.83,
"grad_norm": 3.0435407161712646,
"learning_rate": 1.4265550000243886e-07,
"loss": 0.1176,
"step": 3049
},
{
"epoch": 0.83,
"grad_norm": 2.901156425476074,
"learning_rate": 1.4220031099005092e-07,
"loss": 0.1114,
"step": 3050
},
{
"epoch": 0.83,
"grad_norm": 2.631953477859497,
"learning_rate": 1.417457937604868e-07,
"loss": 0.1265,
"step": 3051
},
{
"epoch": 0.83,
"grad_norm": 2.4306070804595947,
"learning_rate": 1.4129194866969973e-07,
"loss": 0.1032,
"step": 3052
},
{
"epoch": 0.83,
"grad_norm": 2.781864881515503,
"learning_rate": 1.4083877607311667e-07,
"loss": 0.1239,
"step": 3053
},
{
"epoch": 0.83,
"grad_norm": 2.5214765071868896,
"learning_rate": 1.4038627632563882e-07,
"loss": 0.1117,
"step": 3054
},
{
"epoch": 0.83,
"grad_norm": 2.526747941970825,
"learning_rate": 1.3993444978163904e-07,
"loss": 0.1056,
"step": 3055
},
{
"epoch": 0.83,
"grad_norm": 2.746837854385376,
"learning_rate": 1.394832967949643e-07,
"loss": 0.1145,
"step": 3056
},
{
"epoch": 0.84,
"grad_norm": 2.6512248516082764,
"learning_rate": 1.3903281771893316e-07,
"loss": 0.1101,
"step": 3057
},
{
"epoch": 0.84,
"grad_norm": 2.8380045890808105,
"learning_rate": 1.3858301290633667e-07,
"loss": 0.1224,
"step": 3058
},
{
"epoch": 0.84,
"grad_norm": 2.926839590072632,
"learning_rate": 1.3813388270943828e-07,
"loss": 0.1253,
"step": 3059
},
{
"epoch": 0.84,
"grad_norm": 2.591266632080078,
"learning_rate": 1.3768542747997214e-07,
"loss": 0.1157,
"step": 3060
},
{
"epoch": 0.84,
"grad_norm": 2.8145010471343994,
"learning_rate": 1.37237647569145e-07,
"loss": 0.1189,
"step": 3061
},
{
"epoch": 0.84,
"grad_norm": 2.6650164127349854,
"learning_rate": 1.3679054332763397e-07,
"loss": 0.103,
"step": 3062
},
{
"epoch": 0.84,
"grad_norm": 2.683551549911499,
"learning_rate": 1.3634411510558675e-07,
"loss": 0.1099,
"step": 3063
},
{
"epoch": 0.84,
"grad_norm": 2.7763309478759766,
"learning_rate": 1.358983632526226e-07,
"loss": 0.1106,
"step": 3064
},
{
"epoch": 0.84,
"grad_norm": 3.0121800899505615,
"learning_rate": 1.3545328811783007e-07,
"loss": 0.1264,
"step": 3065
},
{
"epoch": 0.84,
"grad_norm": 2.6209750175476074,
"learning_rate": 1.3500889004976857e-07,
"loss": 0.1112,
"step": 3066
},
{
"epoch": 0.84,
"grad_norm": 2.937760829925537,
"learning_rate": 1.3456516939646679e-07,
"loss": 0.1195,
"step": 3067
},
{
"epoch": 0.84,
"grad_norm": 3.2091565132141113,
"learning_rate": 1.3412212650542265e-07,
"loss": 0.122,
"step": 3068
},
{
"epoch": 0.84,
"grad_norm": 2.7288503646850586,
"learning_rate": 1.3367976172360418e-07,
"loss": 0.109,
"step": 3069
},
{
"epoch": 0.84,
"grad_norm": 2.8172175884246826,
"learning_rate": 1.3323807539744726e-07,
"loss": 0.1085,
"step": 3070
},
{
"epoch": 0.84,
"grad_norm": 2.8133957386016846,
"learning_rate": 1.327970678728576e-07,
"loss": 0.1076,
"step": 3071
},
{
"epoch": 0.84,
"grad_norm": 2.762995719909668,
"learning_rate": 1.3235673949520842e-07,
"loss": 0.1232,
"step": 3072
},
{
"epoch": 0.84,
"grad_norm": 2.835566997528076,
"learning_rate": 1.3191709060934098e-07,
"loss": 0.1282,
"step": 3073
},
{
"epoch": 0.84,
"grad_norm": 2.9513890743255615,
"learning_rate": 1.314781215595654e-07,
"loss": 0.1148,
"step": 3074
},
{
"epoch": 0.84,
"grad_norm": 2.9118540287017822,
"learning_rate": 1.3103983268965824e-07,
"loss": 0.1184,
"step": 3075
},
{
"epoch": 0.84,
"grad_norm": 2.788700819015503,
"learning_rate": 1.3060222434286429e-07,
"loss": 0.115,
"step": 3076
},
{
"epoch": 0.84,
"grad_norm": 2.9123387336730957,
"learning_rate": 1.3016529686189482e-07,
"loss": 0.1153,
"step": 3077
},
{
"epoch": 0.84,
"grad_norm": 2.6774423122406006,
"learning_rate": 1.297290505889278e-07,
"loss": 0.0998,
"step": 3078
},
{
"epoch": 0.84,
"grad_norm": 2.7714312076568604,
"learning_rate": 1.2929348586560852e-07,
"loss": 0.1215,
"step": 3079
},
{
"epoch": 0.84,
"grad_norm": 2.696655511856079,
"learning_rate": 1.288586030330474e-07,
"loss": 0.1153,
"step": 3080
},
{
"epoch": 0.84,
"grad_norm": 2.9519054889678955,
"learning_rate": 1.2842440243182196e-07,
"loss": 0.1238,
"step": 3081
},
{
"epoch": 0.84,
"grad_norm": 2.8077220916748047,
"learning_rate": 1.2799088440197447e-07,
"loss": 0.1207,
"step": 3082
},
{
"epoch": 0.84,
"grad_norm": 2.6261038780212402,
"learning_rate": 1.2755804928301306e-07,
"loss": 0.1133,
"step": 3083
},
{
"epoch": 0.84,
"grad_norm": 2.76934814453125,
"learning_rate": 1.2712589741391143e-07,
"loss": 0.1221,
"step": 3084
},
{
"epoch": 0.84,
"grad_norm": 2.999305009841919,
"learning_rate": 1.2669442913310723e-07,
"loss": 0.1298,
"step": 3085
},
{
"epoch": 0.84,
"grad_norm": 2.842081308364868,
"learning_rate": 1.2626364477850394e-07,
"loss": 0.1106,
"step": 3086
},
{
"epoch": 0.84,
"grad_norm": 2.7725205421447754,
"learning_rate": 1.2583354468746843e-07,
"loss": 0.099,
"step": 3087
},
{
"epoch": 0.84,
"grad_norm": 2.852376937866211,
"learning_rate": 1.2540412919683208e-07,
"loss": 0.118,
"step": 3088
},
{
"epoch": 0.84,
"grad_norm": 3.0244083404541016,
"learning_rate": 1.249753986428903e-07,
"loss": 0.1244,
"step": 3089
},
{
"epoch": 0.84,
"grad_norm": 2.580749034881592,
"learning_rate": 1.2454735336140166e-07,
"loss": 0.1113,
"step": 3090
},
{
"epoch": 0.84,
"grad_norm": 2.531583547592163,
"learning_rate": 1.2411999368758874e-07,
"loss": 0.1045,
"step": 3091
},
{
"epoch": 0.84,
"grad_norm": 2.663281202316284,
"learning_rate": 1.2369331995613663e-07,
"loss": 0.125,
"step": 3092
},
{
"epoch": 0.84,
"grad_norm": 3.0186655521392822,
"learning_rate": 1.2326733250119292e-07,
"loss": 0.1226,
"step": 3093
},
{
"epoch": 0.85,
"grad_norm": 2.77361798286438,
"learning_rate": 1.2284203165636886e-07,
"loss": 0.1202,
"step": 3094
},
{
"epoch": 0.85,
"grad_norm": 2.643756628036499,
"learning_rate": 1.224174177547368e-07,
"loss": 0.1127,
"step": 3095
},
{
"epoch": 0.85,
"grad_norm": 2.6107122898101807,
"learning_rate": 1.2199349112883194e-07,
"loss": 0.1011,
"step": 3096
},
{
"epoch": 0.85,
"grad_norm": 2.981204032897949,
"learning_rate": 1.2157025211065097e-07,
"loss": 0.1206,
"step": 3097
},
{
"epoch": 0.85,
"grad_norm": 2.899815082550049,
"learning_rate": 1.211477010316516e-07,
"loss": 0.1165,
"step": 3098
},
{
"epoch": 0.85,
"grad_norm": 2.7035837173461914,
"learning_rate": 1.207258382227536e-07,
"loss": 0.1101,
"step": 3099
},
{
"epoch": 0.85,
"grad_norm": 2.7130398750305176,
"learning_rate": 1.2030466401433748e-07,
"loss": 0.1076,
"step": 3100
},
{
"epoch": 0.85,
"grad_norm": 2.477532386779785,
"learning_rate": 1.1988417873624414e-07,
"loss": 0.1037,
"step": 3101
},
{
"epoch": 0.85,
"grad_norm": 2.8978729248046875,
"learning_rate": 1.1946438271777514e-07,
"loss": 0.1227,
"step": 3102
},
{
"epoch": 0.85,
"grad_norm": 2.6164002418518066,
"learning_rate": 1.1904527628769212e-07,
"loss": 0.1118,
"step": 3103
},
{
"epoch": 0.85,
"grad_norm": 2.697582960128784,
"learning_rate": 1.1862685977421704e-07,
"loss": 0.1168,
"step": 3104
},
{
"epoch": 0.85,
"grad_norm": 2.663384199142456,
"learning_rate": 1.1820913350503137e-07,
"loss": 0.1111,
"step": 3105
},
{
"epoch": 0.85,
"grad_norm": 2.7724335193634033,
"learning_rate": 1.1779209780727594e-07,
"loss": 0.1192,
"step": 3106
},
{
"epoch": 0.85,
"grad_norm": 2.4050371646881104,
"learning_rate": 1.1737575300755077e-07,
"loss": 0.0984,
"step": 3107
},
{
"epoch": 0.85,
"grad_norm": 2.6711740493774414,
"learning_rate": 1.1696009943191454e-07,
"loss": 0.118,
"step": 3108
},
{
"epoch": 0.85,
"grad_norm": 2.795835494995117,
"learning_rate": 1.1654513740588523e-07,
"loss": 0.1257,
"step": 3109
},
{
"epoch": 0.85,
"grad_norm": 2.6235530376434326,
"learning_rate": 1.1613086725443888e-07,
"loss": 0.1092,
"step": 3110
},
{
"epoch": 0.85,
"grad_norm": 3.0112788677215576,
"learning_rate": 1.1571728930200952e-07,
"loss": 0.1253,
"step": 3111
},
{
"epoch": 0.85,
"grad_norm": 3.473054885864258,
"learning_rate": 1.1530440387248985e-07,
"loss": 0.1355,
"step": 3112
},
{
"epoch": 0.85,
"grad_norm": 2.6995739936828613,
"learning_rate": 1.1489221128922878e-07,
"loss": 0.0993,
"step": 3113
},
{
"epoch": 0.85,
"grad_norm": 2.7281997203826904,
"learning_rate": 1.1448071187503383e-07,
"loss": 0.116,
"step": 3114
},
{
"epoch": 0.85,
"grad_norm": 2.543882131576538,
"learning_rate": 1.140699059521697e-07,
"loss": 0.1018,
"step": 3115
},
{
"epoch": 0.85,
"grad_norm": 2.9289021492004395,
"learning_rate": 1.1365979384235713e-07,
"loss": 0.1199,
"step": 3116
},
{
"epoch": 0.85,
"grad_norm": 2.6259896755218506,
"learning_rate": 1.1325037586677444e-07,
"loss": 0.1138,
"step": 3117
},
{
"epoch": 0.85,
"grad_norm": 2.5959458351135254,
"learning_rate": 1.1284165234605536e-07,
"loss": 0.1066,
"step": 3118
},
{
"epoch": 0.85,
"grad_norm": 2.7366015911102295,
"learning_rate": 1.124336236002904e-07,
"loss": 0.1035,
"step": 3119
},
{
"epoch": 0.85,
"grad_norm": 2.8336050510406494,
"learning_rate": 1.1202628994902629e-07,
"loss": 0.1038,
"step": 3120
},
{
"epoch": 0.85,
"grad_norm": 2.832141160964966,
"learning_rate": 1.1161965171126441e-07,
"loss": 0.1117,
"step": 3121
},
{
"epoch": 0.85,
"grad_norm": 2.7790372371673584,
"learning_rate": 1.1121370920546269e-07,
"loss": 0.1147,
"step": 3122
},
{
"epoch": 0.85,
"grad_norm": 2.91813063621521,
"learning_rate": 1.1080846274953281e-07,
"loss": 0.1195,
"step": 3123
},
{
"epoch": 0.85,
"grad_norm": 2.7322089672088623,
"learning_rate": 1.104039126608426e-07,
"loss": 0.1129,
"step": 3124
},
{
"epoch": 0.85,
"grad_norm": 2.848047971725464,
"learning_rate": 1.1000005925621403e-07,
"loss": 0.1192,
"step": 3125
},
{
"epoch": 0.85,
"grad_norm": 2.6769890785217285,
"learning_rate": 1.0959690285192324e-07,
"loss": 0.1088,
"step": 3126
},
{
"epoch": 0.85,
"grad_norm": 3.087414503097534,
"learning_rate": 1.0919444376370135e-07,
"loss": 0.1287,
"step": 3127
},
{
"epoch": 0.85,
"grad_norm": 3.322653293609619,
"learning_rate": 1.0879268230673188e-07,
"loss": 0.1203,
"step": 3128
},
{
"epoch": 0.85,
"grad_norm": 2.6233925819396973,
"learning_rate": 1.083916187956534e-07,
"loss": 0.1033,
"step": 3129
},
{
"epoch": 0.86,
"grad_norm": 2.9331064224243164,
"learning_rate": 1.0799125354455752e-07,
"loss": 0.1305,
"step": 3130
},
{
"epoch": 0.86,
"grad_norm": 2.7068936824798584,
"learning_rate": 1.0759158686698865e-07,
"loss": 0.1115,
"step": 3131
},
{
"epoch": 0.86,
"grad_norm": 2.684677839279175,
"learning_rate": 1.071926190759448e-07,
"loss": 0.108,
"step": 3132
},
{
"epoch": 0.86,
"grad_norm": 2.8053512573242188,
"learning_rate": 1.0679435048387542e-07,
"loss": 0.1084,
"step": 3133
},
{
"epoch": 0.86,
"grad_norm": 2.674454689025879,
"learning_rate": 1.063967814026836e-07,
"loss": 0.1179,
"step": 3134
},
{
"epoch": 0.86,
"grad_norm": 2.6979193687438965,
"learning_rate": 1.0599991214372439e-07,
"loss": 0.1132,
"step": 3135
},
{
"epoch": 0.86,
"grad_norm": 2.542658567428589,
"learning_rate": 1.0560374301780405e-07,
"loss": 0.1097,
"step": 3136
},
{
"epoch": 0.86,
"grad_norm": 2.8575947284698486,
"learning_rate": 1.0520827433518154e-07,
"loss": 0.1201,
"step": 3137
},
{
"epoch": 0.86,
"grad_norm": 3.025064706802368,
"learning_rate": 1.0481350640556652e-07,
"loss": 0.1374,
"step": 3138
},
{
"epoch": 0.86,
"grad_norm": 2.638869047164917,
"learning_rate": 1.0441943953812005e-07,
"loss": 0.1114,
"step": 3139
},
{
"epoch": 0.86,
"grad_norm": 3.0275416374206543,
"learning_rate": 1.0402607404145447e-07,
"loss": 0.1191,
"step": 3140
},
{
"epoch": 0.86,
"grad_norm": 2.9517245292663574,
"learning_rate": 1.0363341022363225e-07,
"loss": 0.1214,
"step": 3141
},
{
"epoch": 0.86,
"grad_norm": 2.52838397026062,
"learning_rate": 1.0324144839216698e-07,
"loss": 0.1034,
"step": 3142
},
{
"epoch": 0.86,
"grad_norm": 2.6634321212768555,
"learning_rate": 1.0285018885402219e-07,
"loss": 0.107,
"step": 3143
},
{
"epoch": 0.86,
"grad_norm": 2.892220973968506,
"learning_rate": 1.0245963191561103e-07,
"loss": 0.1302,
"step": 3144
},
{
"epoch": 0.86,
"grad_norm": 3.1188881397247314,
"learning_rate": 1.0206977788279736e-07,
"loss": 0.1304,
"step": 3145
},
{
"epoch": 0.86,
"grad_norm": 2.8132681846618652,
"learning_rate": 1.0168062706089354e-07,
"loss": 0.107,
"step": 3146
},
{
"epoch": 0.86,
"grad_norm": 2.581531286239624,
"learning_rate": 1.0129217975466197e-07,
"loss": 0.1005,
"step": 3147
},
{
"epoch": 0.86,
"grad_norm": 2.7717487812042236,
"learning_rate": 1.0090443626831368e-07,
"loss": 0.1067,
"step": 3148
},
{
"epoch": 0.86,
"grad_norm": 3.0757551193237305,
"learning_rate": 1.0051739690550854e-07,
"loss": 0.1415,
"step": 3149
},
{
"epoch": 0.86,
"grad_norm": 2.7506611347198486,
"learning_rate": 1.0013106196935529e-07,
"loss": 0.1161,
"step": 3150
},
{
"epoch": 0.86,
"grad_norm": 2.8733327388763428,
"learning_rate": 9.974543176241046e-08,
"loss": 0.1065,
"step": 3151
},
{
"epoch": 0.86,
"grad_norm": 2.8457095623016357,
"learning_rate": 9.936050658667938e-08,
"loss": 0.1221,
"step": 3152
},
{
"epoch": 0.86,
"grad_norm": 3.0331854820251465,
"learning_rate": 9.897628674361469e-08,
"loss": 0.1265,
"step": 3153
},
{
"epoch": 0.86,
"grad_norm": 2.7377963066101074,
"learning_rate": 9.859277253411668e-08,
"loss": 0.1275,
"step": 3154
},
{
"epoch": 0.86,
"grad_norm": 2.556013822555542,
"learning_rate": 9.820996425853333e-08,
"loss": 0.111,
"step": 3155
},
{
"epoch": 0.86,
"grad_norm": 2.742039918899536,
"learning_rate": 9.782786221665939e-08,
"loss": 0.1095,
"step": 3156
},
{
"epoch": 0.86,
"grad_norm": 2.950411319732666,
"learning_rate": 9.744646670773716e-08,
"loss": 0.137,
"step": 3157
},
{
"epoch": 0.86,
"grad_norm": 2.9282593727111816,
"learning_rate": 9.70657780304548e-08,
"loss": 0.1261,
"step": 3158
},
{
"epoch": 0.86,
"grad_norm": 2.6660447120666504,
"learning_rate": 9.668579648294728e-08,
"loss": 0.1152,
"step": 3159
},
{
"epoch": 0.86,
"grad_norm": 2.705693483352661,
"learning_rate": 9.630652236279625e-08,
"loss": 0.1192,
"step": 3160
},
{
"epoch": 0.86,
"grad_norm": 2.8832311630249023,
"learning_rate": 9.59279559670284e-08,
"loss": 0.1123,
"step": 3161
},
{
"epoch": 0.86,
"grad_norm": 2.8442935943603516,
"learning_rate": 9.555009759211707e-08,
"loss": 0.1222,
"step": 3162
},
{
"epoch": 0.86,
"grad_norm": 2.8412744998931885,
"learning_rate": 9.517294753398064e-08,
"loss": 0.113,
"step": 3163
},
{
"epoch": 0.86,
"grad_norm": 2.9322669506073,
"learning_rate": 9.479650608798251e-08,
"loss": 0.1135,
"step": 3164
},
{
"epoch": 0.86,
"grad_norm": 2.6838173866271973,
"learning_rate": 9.442077354893196e-08,
"loss": 0.1029,
"step": 3165
},
{
"epoch": 0.86,
"grad_norm": 2.8191077709198,
"learning_rate": 9.404575021108229e-08,
"loss": 0.1205,
"step": 3166
},
{
"epoch": 0.87,
"grad_norm": 2.5669922828674316,
"learning_rate": 9.367143636813202e-08,
"loss": 0.1166,
"step": 3167
},
{
"epoch": 0.87,
"grad_norm": 3.1334023475646973,
"learning_rate": 9.329783231322352e-08,
"loss": 0.129,
"step": 3168
},
{
"epoch": 0.87,
"grad_norm": 2.73909330368042,
"learning_rate": 9.292493833894332e-08,
"loss": 0.1129,
"step": 3169
},
{
"epoch": 0.87,
"grad_norm": 2.9867608547210693,
"learning_rate": 9.255275473732238e-08,
"loss": 0.1213,
"step": 3170
},
{
"epoch": 0.87,
"grad_norm": 2.9223008155822754,
"learning_rate": 9.218128179983476e-08,
"loss": 0.1194,
"step": 3171
},
{
"epoch": 0.87,
"grad_norm": 2.776965379714966,
"learning_rate": 9.18105198173984e-08,
"loss": 0.1065,
"step": 3172
},
{
"epoch": 0.87,
"grad_norm": 2.9588122367858887,
"learning_rate": 9.144046908037407e-08,
"loss": 0.1183,
"step": 3173
},
{
"epoch": 0.87,
"grad_norm": 2.636444091796875,
"learning_rate": 9.107112987856558e-08,
"loss": 0.0999,
"step": 3174
},
{
"epoch": 0.87,
"grad_norm": 2.6867878437042236,
"learning_rate": 9.070250250122003e-08,
"loss": 0.1139,
"step": 3175
},
{
"epoch": 0.87,
"grad_norm": 2.6267762184143066,
"learning_rate": 9.033458723702625e-08,
"loss": 0.0993,
"step": 3176
},
{
"epoch": 0.87,
"grad_norm": 2.670147180557251,
"learning_rate": 8.99673843741161e-08,
"loss": 0.1044,
"step": 3177
},
{
"epoch": 0.87,
"grad_norm": 2.864187717437744,
"learning_rate": 8.960089420006312e-08,
"loss": 0.1235,
"step": 3178
},
{
"epoch": 0.87,
"grad_norm": 2.868659019470215,
"learning_rate": 8.923511700188258e-08,
"loss": 0.1157,
"step": 3179
},
{
"epoch": 0.87,
"grad_norm": 2.7398617267608643,
"learning_rate": 8.887005306603201e-08,
"loss": 0.1162,
"step": 3180
},
{
"epoch": 0.87,
"grad_norm": 2.597961902618408,
"learning_rate": 8.850570267840963e-08,
"loss": 0.1011,
"step": 3181
},
{
"epoch": 0.87,
"grad_norm": 3.0051791667938232,
"learning_rate": 8.814206612435549e-08,
"loss": 0.1243,
"step": 3182
},
{
"epoch": 0.87,
"grad_norm": 2.9067301750183105,
"learning_rate": 8.777914368865003e-08,
"loss": 0.1244,
"step": 3183
},
{
"epoch": 0.87,
"grad_norm": 2.6510720252990723,
"learning_rate": 8.741693565551456e-08,
"loss": 0.1083,
"step": 3184
},
{
"epoch": 0.87,
"grad_norm": 3.2123913764953613,
"learning_rate": 8.70554423086114e-08,
"loss": 0.1268,
"step": 3185
},
{
"epoch": 0.87,
"grad_norm": 2.637896776199341,
"learning_rate": 8.669466393104241e-08,
"loss": 0.1012,
"step": 3186
},
{
"epoch": 0.87,
"grad_norm": 2.6280598640441895,
"learning_rate": 8.633460080535038e-08,
"loss": 0.0991,
"step": 3187
},
{
"epoch": 0.87,
"grad_norm": 2.9076666831970215,
"learning_rate": 8.597525321351717e-08,
"loss": 0.1122,
"step": 3188
},
{
"epoch": 0.87,
"grad_norm": 2.5415005683898926,
"learning_rate": 8.561662143696446e-08,
"loss": 0.1123,
"step": 3189
},
{
"epoch": 0.87,
"grad_norm": 2.803208112716675,
"learning_rate": 8.525870575655392e-08,
"loss": 0.1091,
"step": 3190
},
{
"epoch": 0.87,
"grad_norm": 2.746464252471924,
"learning_rate": 8.490150645258542e-08,
"loss": 0.1099,
"step": 3191
},
{
"epoch": 0.87,
"grad_norm": 2.8301329612731934,
"learning_rate": 8.454502380479889e-08,
"loss": 0.1172,
"step": 3192
},
{
"epoch": 0.87,
"grad_norm": 2.7032482624053955,
"learning_rate": 8.418925809237209e-08,
"loss": 0.108,
"step": 3193
},
{
"epoch": 0.87,
"grad_norm": 2.8203492164611816,
"learning_rate": 8.383420959392174e-08,
"loss": 0.1136,
"step": 3194
},
{
"epoch": 0.87,
"grad_norm": 2.732386350631714,
"learning_rate": 8.347987858750306e-08,
"loss": 0.1028,
"step": 3195
},
{
"epoch": 0.87,
"grad_norm": 2.6165077686309814,
"learning_rate": 8.312626535060874e-08,
"loss": 0.11,
"step": 3196
},
{
"epoch": 0.87,
"grad_norm": 2.588101387023926,
"learning_rate": 8.277337016017016e-08,
"loss": 0.1039,
"step": 3197
},
{
"epoch": 0.87,
"grad_norm": 2.8107681274414062,
"learning_rate": 8.242119329255582e-08,
"loss": 0.1229,
"step": 3198
},
{
"epoch": 0.87,
"grad_norm": 2.847639799118042,
"learning_rate": 8.206973502357151e-08,
"loss": 0.1006,
"step": 3199
},
{
"epoch": 0.87,
"grad_norm": 2.666560411453247,
"learning_rate": 8.171899562846097e-08,
"loss": 0.104,
"step": 3200
},
{
"epoch": 0.87,
"grad_norm": 2.765098810195923,
"learning_rate": 8.136897538190424e-08,
"loss": 0.1061,
"step": 3201
},
{
"epoch": 0.87,
"grad_norm": 2.895667552947998,
"learning_rate": 8.101967455801861e-08,
"loss": 0.1382,
"step": 3202
},
{
"epoch": 0.87,
"grad_norm": 3.130695343017578,
"learning_rate": 8.067109343035783e-08,
"loss": 0.1241,
"step": 3203
},
{
"epoch": 0.88,
"grad_norm": 3.034909963607788,
"learning_rate": 8.032323227191173e-08,
"loss": 0.1121,
"step": 3204
},
{
"epoch": 0.88,
"grad_norm": 2.6597087383270264,
"learning_rate": 7.997609135510685e-08,
"loss": 0.106,
"step": 3205
},
{
"epoch": 0.88,
"grad_norm": 2.7990424633026123,
"learning_rate": 7.962967095180518e-08,
"loss": 0.1166,
"step": 3206
},
{
"epoch": 0.88,
"grad_norm": 2.961679220199585,
"learning_rate": 7.928397133330467e-08,
"loss": 0.1265,
"step": 3207
},
{
"epoch": 0.88,
"grad_norm": 2.8379929065704346,
"learning_rate": 7.89389927703391e-08,
"loss": 0.1171,
"step": 3208
},
{
"epoch": 0.88,
"grad_norm": 2.9817962646484375,
"learning_rate": 7.859473553307672e-08,
"loss": 0.1145,
"step": 3209
},
{
"epoch": 0.88,
"grad_norm": 2.8842551708221436,
"learning_rate": 7.825119989112172e-08,
"loss": 0.121,
"step": 3210
},
{
"epoch": 0.88,
"grad_norm": 2.9692680835723877,
"learning_rate": 7.790838611351258e-08,
"loss": 0.1243,
"step": 3211
},
{
"epoch": 0.88,
"grad_norm": 2.4734480381011963,
"learning_rate": 7.756629446872288e-08,
"loss": 0.1074,
"step": 3212
},
{
"epoch": 0.88,
"grad_norm": 2.689995050430298,
"learning_rate": 7.722492522466073e-08,
"loss": 0.1163,
"step": 3213
},
{
"epoch": 0.88,
"grad_norm": 2.814525604248047,
"learning_rate": 7.688427864866776e-08,
"loss": 0.1113,
"step": 3214
},
{
"epoch": 0.88,
"grad_norm": 2.8867626190185547,
"learning_rate": 7.654435500752055e-08,
"loss": 0.116,
"step": 3215
},
{
"epoch": 0.88,
"grad_norm": 2.705634832382202,
"learning_rate": 7.620515456742871e-08,
"loss": 0.1116,
"step": 3216
},
{
"epoch": 0.88,
"grad_norm": 2.635175943374634,
"learning_rate": 7.586667759403608e-08,
"loss": 0.1133,
"step": 3217
},
{
"epoch": 0.88,
"grad_norm": 3.0241026878356934,
"learning_rate": 7.55289243524202e-08,
"loss": 0.1254,
"step": 3218
},
{
"epoch": 0.88,
"grad_norm": 2.9283294677734375,
"learning_rate": 7.519189510709045e-08,
"loss": 0.129,
"step": 3219
},
{
"epoch": 0.88,
"grad_norm": 2.9080424308776855,
"learning_rate": 7.485559012199061e-08,
"loss": 0.1152,
"step": 3220
},
{
"epoch": 0.88,
"grad_norm": 2.4616005420684814,
"learning_rate": 7.452000966049676e-08,
"loss": 0.0941,
"step": 3221
},
{
"epoch": 0.88,
"grad_norm": 2.6700146198272705,
"learning_rate": 7.418515398541736e-08,
"loss": 0.1021,
"step": 3222
},
{
"epoch": 0.88,
"grad_norm": 3.0130770206451416,
"learning_rate": 7.385102335899396e-08,
"loss": 0.1133,
"step": 3223
},
{
"epoch": 0.88,
"grad_norm": 3.161093235015869,
"learning_rate": 7.351761804289902e-08,
"loss": 0.1035,
"step": 3224
},
{
"epoch": 0.88,
"grad_norm": 2.837735891342163,
"learning_rate": 7.318493829823813e-08,
"loss": 0.1171,
"step": 3225
},
{
"epoch": 0.88,
"grad_norm": 2.5457699298858643,
"learning_rate": 7.285298438554844e-08,
"loss": 0.1152,
"step": 3226
},
{
"epoch": 0.88,
"grad_norm": 2.6670989990234375,
"learning_rate": 7.25217565647982e-08,
"loss": 0.1025,
"step": 3227
},
{
"epoch": 0.88,
"grad_norm": 2.919137954711914,
"learning_rate": 7.219125509538782e-08,
"loss": 0.1215,
"step": 3228
},
{
"epoch": 0.88,
"grad_norm": 2.712557315826416,
"learning_rate": 7.186148023614758e-08,
"loss": 0.1115,
"step": 3229
},
{
"epoch": 0.88,
"grad_norm": 2.7375035285949707,
"learning_rate": 7.153243224534e-08,
"loss": 0.1163,
"step": 3230
},
{
"epoch": 0.88,
"grad_norm": 2.5851237773895264,
"learning_rate": 7.120411138065796e-08,
"loss": 0.1167,
"step": 3231
},
{
"epoch": 0.88,
"grad_norm": 3.0387001037597656,
"learning_rate": 7.087651789922445e-08,
"loss": 0.1095,
"step": 3232
},
{
"epoch": 0.88,
"grad_norm": 2.860739231109619,
"learning_rate": 7.054965205759345e-08,
"loss": 0.1246,
"step": 3233
},
{
"epoch": 0.88,
"grad_norm": 2.760218620300293,
"learning_rate": 7.022351411174865e-08,
"loss": 0.1116,
"step": 3234
},
{
"epoch": 0.88,
"grad_norm": 2.8284144401550293,
"learning_rate": 6.989810431710374e-08,
"loss": 0.1259,
"step": 3235
},
{
"epoch": 0.88,
"grad_norm": 2.6409196853637695,
"learning_rate": 6.957342292850266e-08,
"loss": 0.1059,
"step": 3236
},
{
"epoch": 0.88,
"grad_norm": 2.788323163986206,
"learning_rate": 6.924947020021798e-08,
"loss": 0.11,
"step": 3237
},
{
"epoch": 0.88,
"grad_norm": 2.9088175296783447,
"learning_rate": 6.892624638595257e-08,
"loss": 0.124,
"step": 3238
},
{
"epoch": 0.88,
"grad_norm": 2.5352680683135986,
"learning_rate": 6.860375173883781e-08,
"loss": 0.1007,
"step": 3239
},
{
"epoch": 0.89,
"grad_norm": 2.6265170574188232,
"learning_rate": 6.828198651143424e-08,
"loss": 0.1046,
"step": 3240
},
{
"epoch": 0.89,
"grad_norm": 2.8138463497161865,
"learning_rate": 6.79609509557313e-08,
"loss": 0.1237,
"step": 3241
},
{
"epoch": 0.89,
"grad_norm": 3.054506778717041,
"learning_rate": 6.764064532314672e-08,
"loss": 0.1146,
"step": 3242
},
{
"epoch": 0.89,
"grad_norm": 2.840134859085083,
"learning_rate": 6.73210698645269e-08,
"loss": 0.119,
"step": 3243
},
{
"epoch": 0.89,
"grad_norm": 3.059105157852173,
"learning_rate": 6.700222483014617e-08,
"loss": 0.1336,
"step": 3244
},
{
"epoch": 0.89,
"grad_norm": 2.816920280456543,
"learning_rate": 6.668411046970679e-08,
"loss": 0.1128,
"step": 3245
},
{
"epoch": 0.89,
"grad_norm": 2.9585933685302734,
"learning_rate": 6.636672703233914e-08,
"loss": 0.109,
"step": 3246
},
{
"epoch": 0.89,
"grad_norm": 2.939216375350952,
"learning_rate": 6.605007476660063e-08,
"loss": 0.1314,
"step": 3247
},
{
"epoch": 0.89,
"grad_norm": 3.184189558029175,
"learning_rate": 6.573415392047666e-08,
"loss": 0.1366,
"step": 3248
},
{
"epoch": 0.89,
"grad_norm": 3.0321738719940186,
"learning_rate": 6.541896474137954e-08,
"loss": 0.1171,
"step": 3249
},
{
"epoch": 0.89,
"grad_norm": 2.797530174255371,
"learning_rate": 6.510450747614815e-08,
"loss": 0.1283,
"step": 3250
},
{
"epoch": 0.89,
"grad_norm": 2.5570662021636963,
"learning_rate": 6.479078237104918e-08,
"loss": 0.1077,
"step": 3251
},
{
"epoch": 0.89,
"grad_norm": 2.6202569007873535,
"learning_rate": 6.447778967177497e-08,
"loss": 0.1191,
"step": 3252
},
{
"epoch": 0.89,
"grad_norm": 2.5749614238739014,
"learning_rate": 6.416552962344479e-08,
"loss": 0.1011,
"step": 3253
},
{
"epoch": 0.89,
"grad_norm": 2.7007076740264893,
"learning_rate": 6.385400247060402e-08,
"loss": 0.1059,
"step": 3254
},
{
"epoch": 0.89,
"grad_norm": 2.770843744277954,
"learning_rate": 6.354320845722394e-08,
"loss": 0.123,
"step": 3255
},
{
"epoch": 0.89,
"grad_norm": 2.8237979412078857,
"learning_rate": 6.323314782670197e-08,
"loss": 0.1167,
"step": 3256
},
{
"epoch": 0.89,
"grad_norm": 2.7554519176483154,
"learning_rate": 6.292382082186065e-08,
"loss": 0.1061,
"step": 3257
},
{
"epoch": 0.89,
"grad_norm": 2.568547487258911,
"learning_rate": 6.261522768494886e-08,
"loss": 0.1108,
"step": 3258
},
{
"epoch": 0.89,
"grad_norm": 2.847320318222046,
"learning_rate": 6.230736865763997e-08,
"loss": 0.124,
"step": 3259
},
{
"epoch": 0.89,
"grad_norm": 2.849395513534546,
"learning_rate": 6.200024398103253e-08,
"loss": 0.1153,
"step": 3260
},
{
"epoch": 0.89,
"grad_norm": 2.570420503616333,
"learning_rate": 6.169385389565051e-08,
"loss": 0.1051,
"step": 3261
},
{
"epoch": 0.89,
"grad_norm": 2.8272502422332764,
"learning_rate": 6.138819864144185e-08,
"loss": 0.1056,
"step": 3262
},
{
"epoch": 0.89,
"grad_norm": 2.8246591091156006,
"learning_rate": 6.108327845777972e-08,
"loss": 0.1134,
"step": 3263
},
{
"epoch": 0.89,
"grad_norm": 3.0128746032714844,
"learning_rate": 6.077909358346123e-08,
"loss": 0.1236,
"step": 3264
},
{
"epoch": 0.89,
"grad_norm": 2.628431797027588,
"learning_rate": 6.047564425670748e-08,
"loss": 0.1047,
"step": 3265
},
{
"epoch": 0.89,
"grad_norm": 3.2802343368530273,
"learning_rate": 6.017293071516406e-08,
"loss": 0.1057,
"step": 3266
},
{
"epoch": 0.89,
"grad_norm": 2.8381617069244385,
"learning_rate": 5.987095319589963e-08,
"loss": 0.12,
"step": 3267
},
{
"epoch": 0.89,
"grad_norm": 2.6833271980285645,
"learning_rate": 5.956971193540728e-08,
"loss": 0.1183,
"step": 3268
},
{
"epoch": 0.89,
"grad_norm": 2.6846632957458496,
"learning_rate": 5.926920716960282e-08,
"loss": 0.1145,
"step": 3269
},
{
"epoch": 0.89,
"grad_norm": 2.3703134059906006,
"learning_rate": 5.896943913382546e-08,
"loss": 0.0957,
"step": 3270
},
{
"epoch": 0.89,
"grad_norm": 2.9268476963043213,
"learning_rate": 5.8670408062837516e-08,
"loss": 0.1244,
"step": 3271
},
{
"epoch": 0.89,
"grad_norm": 2.760392904281616,
"learning_rate": 5.837211419082411e-08,
"loss": 0.1191,
"step": 3272
},
{
"epoch": 0.89,
"grad_norm": 2.5630156993865967,
"learning_rate": 5.807455775139325e-08,
"loss": 0.1049,
"step": 3273
},
{
"epoch": 0.89,
"grad_norm": 2.6172001361846924,
"learning_rate": 5.7777738977574984e-08,
"loss": 0.113,
"step": 3274
},
{
"epoch": 0.89,
"grad_norm": 3.231501579284668,
"learning_rate": 5.748165810182182e-08,
"loss": 0.1492,
"step": 3275
},
{
"epoch": 0.89,
"grad_norm": 2.893700122833252,
"learning_rate": 5.718631535600882e-08,
"loss": 0.109,
"step": 3276
},
{
"epoch": 0.9,
"grad_norm": 3.144315719604492,
"learning_rate": 5.6891710971432194e-08,
"loss": 0.1307,
"step": 3277
},
{
"epoch": 0.9,
"grad_norm": 2.6779861450195312,
"learning_rate": 5.659784517881072e-08,
"loss": 0.1073,
"step": 3278
},
{
"epoch": 0.9,
"grad_norm": 3.0189919471740723,
"learning_rate": 5.6304718208284194e-08,
"loss": 0.1221,
"step": 3279
},
{
"epoch": 0.9,
"grad_norm": 2.769556760787964,
"learning_rate": 5.601233028941388e-08,
"loss": 0.118,
"step": 3280
},
{
"epoch": 0.9,
"grad_norm": 2.5217833518981934,
"learning_rate": 5.57206816511826e-08,
"loss": 0.1071,
"step": 3281
},
{
"epoch": 0.9,
"grad_norm": 3.1602933406829834,
"learning_rate": 5.5429772521993544e-08,
"loss": 0.123,
"step": 3282
},
{
"epoch": 0.9,
"grad_norm": 2.82954740524292,
"learning_rate": 5.51396031296717e-08,
"loss": 0.1155,
"step": 3283
},
{
"epoch": 0.9,
"grad_norm": 2.6372883319854736,
"learning_rate": 5.485017370146194e-08,
"loss": 0.1128,
"step": 3284
},
{
"epoch": 0.9,
"grad_norm": 2.437962532043457,
"learning_rate": 5.456148446402975e-08,
"loss": 0.0993,
"step": 3285
},
{
"epoch": 0.9,
"grad_norm": 2.8904833793640137,
"learning_rate": 5.427353564346138e-08,
"loss": 0.1219,
"step": 3286
},
{
"epoch": 0.9,
"grad_norm": 3.2096025943756104,
"learning_rate": 5.398632746526277e-08,
"loss": 0.1355,
"step": 3287
},
{
"epoch": 0.9,
"grad_norm": 2.730506658554077,
"learning_rate": 5.369986015436012e-08,
"loss": 0.1147,
"step": 3288
},
{
"epoch": 0.9,
"grad_norm": 2.7148666381835938,
"learning_rate": 5.3414133935099304e-08,
"loss": 0.1071,
"step": 3289
},
{
"epoch": 0.9,
"grad_norm": 2.7386884689331055,
"learning_rate": 5.312914903124566e-08,
"loss": 0.1117,
"step": 3290
},
{
"epoch": 0.9,
"grad_norm": 2.748896837234497,
"learning_rate": 5.284490566598421e-08,
"loss": 0.1199,
"step": 3291
},
{
"epoch": 0.9,
"grad_norm": 2.766477346420288,
"learning_rate": 5.2561404061919114e-08,
"loss": 0.1066,
"step": 3292
},
{
"epoch": 0.9,
"grad_norm": 2.713313341140747,
"learning_rate": 5.227864444107377e-08,
"loss": 0.1159,
"step": 3293
},
{
"epoch": 0.9,
"grad_norm": 2.8210034370422363,
"learning_rate": 5.1996627024890383e-08,
"loss": 0.1103,
"step": 3294
},
{
"epoch": 0.9,
"grad_norm": 2.5246126651763916,
"learning_rate": 5.171535203422961e-08,
"loss": 0.1037,
"step": 3295
},
{
"epoch": 0.9,
"grad_norm": 2.7380261421203613,
"learning_rate": 5.1434819689371464e-08,
"loss": 0.1046,
"step": 3296
},
{
"epoch": 0.9,
"grad_norm": 2.7986435890197754,
"learning_rate": 5.115503021001333e-08,
"loss": 0.104,
"step": 3297
},
{
"epoch": 0.9,
"grad_norm": 2.699871063232422,
"learning_rate": 5.087598381527181e-08,
"loss": 0.1102,
"step": 3298
},
{
"epoch": 0.9,
"grad_norm": 2.7522006034851074,
"learning_rate": 5.059768072368098e-08,
"loss": 0.1139,
"step": 3299
},
{
"epoch": 0.9,
"grad_norm": 2.90615177154541,
"learning_rate": 5.032012115319273e-08,
"loss": 0.1167,
"step": 3300
},
{
"epoch": 0.9,
"grad_norm": 2.7631006240844727,
"learning_rate": 5.004330532117707e-08,
"loss": 0.1105,
"step": 3301
},
{
"epoch": 0.9,
"grad_norm": 2.9251725673675537,
"learning_rate": 4.976723344442124e-08,
"loss": 0.1312,
"step": 3302
},
{
"epoch": 0.9,
"grad_norm": 2.746147394180298,
"learning_rate": 4.949190573913009e-08,
"loss": 0.1159,
"step": 3303
},
{
"epoch": 0.9,
"grad_norm": 2.6207473278045654,
"learning_rate": 4.921732242092569e-08,
"loss": 0.0947,
"step": 3304
},
{
"epoch": 0.9,
"grad_norm": 2.4841485023498535,
"learning_rate": 4.8943483704846465e-08,
"loss": 0.1086,
"step": 3305
},
{
"epoch": 0.9,
"grad_norm": 2.878854274749756,
"learning_rate": 4.867038980534877e-08,
"loss": 0.1119,
"step": 3306
},
{
"epoch": 0.9,
"grad_norm": 2.6521964073181152,
"learning_rate": 4.839804093630484e-08,
"loss": 0.1153,
"step": 3307
},
{
"epoch": 0.9,
"grad_norm": 2.6814706325531006,
"learning_rate": 4.8126437311003745e-08,
"loss": 0.1061,
"step": 3308
},
{
"epoch": 0.9,
"grad_norm": 2.625558376312256,
"learning_rate": 4.785557914215132e-08,
"loss": 0.1155,
"step": 3309
},
{
"epoch": 0.9,
"grad_norm": 2.7465507984161377,
"learning_rate": 4.7585466641868685e-08,
"loss": 0.1149,
"step": 3310
},
{
"epoch": 0.9,
"grad_norm": 2.826021194458008,
"learning_rate": 4.731610002169384e-08,
"loss": 0.1145,
"step": 3311
},
{
"epoch": 0.9,
"grad_norm": 2.7166543006896973,
"learning_rate": 4.704747949257992e-08,
"loss": 0.1053,
"step": 3312
},
{
"epoch": 0.91,
"grad_norm": 2.670891761779785,
"learning_rate": 4.677960526489644e-08,
"loss": 0.1169,
"step": 3313
},
{
"epoch": 0.91,
"grad_norm": 2.9541141986846924,
"learning_rate": 4.6512477548428465e-08,
"loss": 0.116,
"step": 3314
},
{
"epoch": 0.91,
"grad_norm": 2.7535674571990967,
"learning_rate": 4.624609655237544e-08,
"loss": 0.1112,
"step": 3315
},
{
"epoch": 0.91,
"grad_norm": 2.9081640243530273,
"learning_rate": 4.5980462485353254e-08,
"loss": 0.1203,
"step": 3316
},
{
"epoch": 0.91,
"grad_norm": 2.966815948486328,
"learning_rate": 4.5715575555391964e-08,
"loss": 0.1325,
"step": 3317
},
{
"epoch": 0.91,
"grad_norm": 2.7810537815093994,
"learning_rate": 4.545143596993695e-08,
"loss": 0.1202,
"step": 3318
},
{
"epoch": 0.91,
"grad_norm": 2.504977226257324,
"learning_rate": 4.518804393584852e-08,
"loss": 0.0956,
"step": 3319
},
{
"epoch": 0.91,
"grad_norm": 2.7092530727386475,
"learning_rate": 4.492539965940056e-08,
"loss": 0.1074,
"step": 3320
},
{
"epoch": 0.91,
"grad_norm": 3.001920461654663,
"learning_rate": 4.466350334628266e-08,
"loss": 0.1087,
"step": 3321
},
{
"epoch": 0.91,
"grad_norm": 2.7182984352111816,
"learning_rate": 4.440235520159752e-08,
"loss": 0.105,
"step": 3322
},
{
"epoch": 0.91,
"grad_norm": 2.900406837463379,
"learning_rate": 4.414195542986265e-08,
"loss": 0.1168,
"step": 3323
},
{
"epoch": 0.91,
"grad_norm": 2.4970622062683105,
"learning_rate": 4.3882304235009496e-08,
"loss": 0.1086,
"step": 3324
},
{
"epoch": 0.91,
"grad_norm": 2.6406490802764893,
"learning_rate": 4.362340182038238e-08,
"loss": 0.1003,
"step": 3325
},
{
"epoch": 0.91,
"grad_norm": 2.74210524559021,
"learning_rate": 4.336524838874023e-08,
"loss": 0.1209,
"step": 3326
},
{
"epoch": 0.91,
"grad_norm": 2.576692581176758,
"learning_rate": 4.310784414225466e-08,
"loss": 0.112,
"step": 3327
},
{
"epoch": 0.91,
"grad_norm": 2.711721897125244,
"learning_rate": 4.285118928251119e-08,
"loss": 0.117,
"step": 3328
},
{
"epoch": 0.91,
"grad_norm": 2.781630039215088,
"learning_rate": 4.259528401050827e-08,
"loss": 0.12,
"step": 3329
},
{
"epoch": 0.91,
"grad_norm": 2.912167549133301,
"learning_rate": 4.2340128526657024e-08,
"loss": 0.1087,
"step": 3330
},
{
"epoch": 0.91,
"grad_norm": 2.9487767219543457,
"learning_rate": 4.208572303078162e-08,
"loss": 0.1218,
"step": 3331
},
{
"epoch": 0.91,
"grad_norm": 2.5757272243499756,
"learning_rate": 4.183206772211867e-08,
"loss": 0.1067,
"step": 3332
},
{
"epoch": 0.91,
"grad_norm": 2.787808656692505,
"learning_rate": 4.157916279931761e-08,
"loss": 0.1124,
"step": 3333
},
{
"epoch": 0.91,
"grad_norm": 2.7106688022613525,
"learning_rate": 4.132700846044013e-08,
"loss": 0.1207,
"step": 3334
},
{
"epoch": 0.91,
"grad_norm": 2.7858059406280518,
"learning_rate": 4.1075604902959915e-08,
"loss": 0.1198,
"step": 3335
},
{
"epoch": 0.91,
"grad_norm": 3.217522144317627,
"learning_rate": 4.082495232376271e-08,
"loss": 0.1314,
"step": 3336
},
{
"epoch": 0.91,
"grad_norm": 2.835294485092163,
"learning_rate": 4.0575050919146256e-08,
"loss": 0.1193,
"step": 3337
},
{
"epoch": 0.91,
"grad_norm": 2.501631021499634,
"learning_rate": 4.032590088482002e-08,
"loss": 0.0932,
"step": 3338
},
{
"epoch": 0.91,
"grad_norm": 2.8946824073791504,
"learning_rate": 4.007750241590502e-08,
"loss": 0.1205,
"step": 3339
},
{
"epoch": 0.91,
"grad_norm": 2.566824197769165,
"learning_rate": 3.9829855706933536e-08,
"loss": 0.1044,
"step": 3340
},
{
"epoch": 0.91,
"grad_norm": 2.884089469909668,
"learning_rate": 3.95829609518491e-08,
"loss": 0.1233,
"step": 3341
},
{
"epoch": 0.91,
"grad_norm": 2.869565486907959,
"learning_rate": 3.933681834400682e-08,
"loss": 0.117,
"step": 3342
},
{
"epoch": 0.91,
"grad_norm": 2.831082582473755,
"learning_rate": 3.909142807617205e-08,
"loss": 0.1067,
"step": 3343
},
{
"epoch": 0.91,
"grad_norm": 2.764885902404785,
"learning_rate": 3.884679034052163e-08,
"loss": 0.1119,
"step": 3344
},
{
"epoch": 0.91,
"grad_norm": 3.153815507888794,
"learning_rate": 3.8602905328642634e-08,
"loss": 0.1282,
"step": 3345
},
{
"epoch": 0.91,
"grad_norm": 3.548212766647339,
"learning_rate": 3.835977323153261e-08,
"loss": 0.1286,
"step": 3346
},
{
"epoch": 0.91,
"grad_norm": 2.814499855041504,
"learning_rate": 3.811739423959992e-08,
"loss": 0.1176,
"step": 3347
},
{
"epoch": 0.91,
"grad_norm": 2.7057933807373047,
"learning_rate": 3.787576854266239e-08,
"loss": 0.107,
"step": 3348
},
{
"epoch": 0.91,
"grad_norm": 2.667623519897461,
"learning_rate": 3.763489632994876e-08,
"loss": 0.1013,
"step": 3349
},
{
"epoch": 0.92,
"grad_norm": 3.1889543533325195,
"learning_rate": 3.739477779009703e-08,
"loss": 0.1145,
"step": 3350
},
{
"epoch": 0.92,
"grad_norm": 2.842881917953491,
"learning_rate": 3.715541311115522e-08,
"loss": 0.1128,
"step": 3351
},
{
"epoch": 0.92,
"grad_norm": 2.948249101638794,
"learning_rate": 3.6916802480581046e-08,
"loss": 0.1165,
"step": 3352
},
{
"epoch": 0.92,
"grad_norm": 2.9211549758911133,
"learning_rate": 3.6678946085241356e-08,
"loss": 0.118,
"step": 3353
},
{
"epoch": 0.92,
"grad_norm": 2.9507830142974854,
"learning_rate": 3.6441844111412824e-08,
"loss": 0.118,
"step": 3354
},
{
"epoch": 0.92,
"grad_norm": 2.799957036972046,
"learning_rate": 3.6205496744781014e-08,
"loss": 0.1047,
"step": 3355
},
{
"epoch": 0.92,
"grad_norm": 2.6917316913604736,
"learning_rate": 3.5969904170440214e-08,
"loss": 0.1118,
"step": 3356
},
{
"epoch": 0.92,
"grad_norm": 2.744602918624878,
"learning_rate": 3.573506657289427e-08,
"loss": 0.1169,
"step": 3357
},
{
"epoch": 0.92,
"grad_norm": 2.906572103500366,
"learning_rate": 3.550098413605529e-08,
"loss": 0.125,
"step": 3358
},
{
"epoch": 0.92,
"grad_norm": 2.6737864017486572,
"learning_rate": 3.5267657043244084e-08,
"loss": 0.1044,
"step": 3359
},
{
"epoch": 0.92,
"grad_norm": 2.5877609252929688,
"learning_rate": 3.503508547719014e-08,
"loss": 0.1093,
"step": 3360
},
{
"epoch": 0.92,
"grad_norm": 2.703770875930786,
"learning_rate": 3.480326962003077e-08,
"loss": 0.1046,
"step": 3361
},
{
"epoch": 0.92,
"grad_norm": 2.6350808143615723,
"learning_rate": 3.4572209653311977e-08,
"loss": 0.103,
"step": 3362
},
{
"epoch": 0.92,
"grad_norm": 2.9087467193603516,
"learning_rate": 3.434190575798734e-08,
"loss": 0.1097,
"step": 3363
},
{
"epoch": 0.92,
"grad_norm": 2.8744146823883057,
"learning_rate": 3.4112358114418815e-08,
"loss": 0.1224,
"step": 3364
},
{
"epoch": 0.92,
"grad_norm": 3.058384895324707,
"learning_rate": 3.388356690237582e-08,
"loss": 0.1101,
"step": 3365
},
{
"epoch": 0.92,
"grad_norm": 2.962033987045288,
"learning_rate": 3.3655532301035017e-08,
"loss": 0.1196,
"step": 3366
},
{
"epoch": 0.92,
"grad_norm": 2.707650661468506,
"learning_rate": 3.3428254488981455e-08,
"loss": 0.1104,
"step": 3367
},
{
"epoch": 0.92,
"grad_norm": 2.9100685119628906,
"learning_rate": 3.320173364420642e-08,
"loss": 0.1153,
"step": 3368
},
{
"epoch": 0.92,
"grad_norm": 2.714811325073242,
"learning_rate": 3.297596994410934e-08,
"loss": 0.1062,
"step": 3369
},
{
"epoch": 0.92,
"grad_norm": 2.850252151489258,
"learning_rate": 3.2750963565496114e-08,
"loss": 0.1274,
"step": 3370
},
{
"epoch": 0.92,
"grad_norm": 2.6645967960357666,
"learning_rate": 3.252671468457957e-08,
"loss": 0.1126,
"step": 3371
},
{
"epoch": 0.92,
"grad_norm": 2.775787115097046,
"learning_rate": 3.230322347697967e-08,
"loss": 0.0974,
"step": 3372
},
{
"epoch": 0.92,
"grad_norm": 2.9510257244110107,
"learning_rate": 3.208049011772263e-08,
"loss": 0.1183,
"step": 3373
},
{
"epoch": 0.92,
"grad_norm": 3.2880499362945557,
"learning_rate": 3.1858514781241355e-08,
"loss": 0.126,
"step": 3374
},
{
"epoch": 0.92,
"grad_norm": 2.773585319519043,
"learning_rate": 3.1637297641375015e-08,
"loss": 0.1108,
"step": 3375
},
{
"epoch": 0.92,
"grad_norm": 2.9001801013946533,
"learning_rate": 3.141683887136892e-08,
"loss": 0.1184,
"step": 3376
},
{
"epoch": 0.92,
"grad_norm": 2.721400260925293,
"learning_rate": 3.1197138643874744e-08,
"loss": 0.1012,
"step": 3377
},
{
"epoch": 0.92,
"grad_norm": 2.6031508445739746,
"learning_rate": 3.097819713094996e-08,
"loss": 0.1048,
"step": 3378
},
{
"epoch": 0.92,
"grad_norm": 3.1122517585754395,
"learning_rate": 3.076001450405785e-08,
"loss": 0.1241,
"step": 3379
},
{
"epoch": 0.92,
"grad_norm": 2.5732295513153076,
"learning_rate": 3.05425909340673e-08,
"loss": 0.1012,
"step": 3380
},
{
"epoch": 0.92,
"grad_norm": 2.6614997386932373,
"learning_rate": 3.032592659125277e-08,
"loss": 0.1095,
"step": 3381
},
{
"epoch": 0.92,
"grad_norm": 2.649832248687744,
"learning_rate": 3.0110021645294415e-08,
"loss": 0.0958,
"step": 3382
},
{
"epoch": 0.92,
"grad_norm": 3.089282512664795,
"learning_rate": 2.989487626527709e-08,
"loss": 0.134,
"step": 3383
},
{
"epoch": 0.92,
"grad_norm": 2.8147947788238525,
"learning_rate": 2.9680490619691467e-08,
"loss": 0.1098,
"step": 3384
},
{
"epoch": 0.92,
"grad_norm": 2.984261989593506,
"learning_rate": 2.9466864876432794e-08,
"loss": 0.1176,
"step": 3385
},
{
"epoch": 0.92,
"grad_norm": 2.651834011077881,
"learning_rate": 2.925399920280114e-08,
"loss": 0.108,
"step": 3386
},
{
"epoch": 0.93,
"grad_norm": 2.6562538146972656,
"learning_rate": 2.9041893765501925e-08,
"loss": 0.1043,
"step": 3387
},
{
"epoch": 0.93,
"grad_norm": 2.8498289585113525,
"learning_rate": 2.8830548730644278e-08,
"loss": 0.1117,
"step": 3388
},
{
"epoch": 0.93,
"grad_norm": 2.793269634246826,
"learning_rate": 2.8619964263742802e-08,
"loss": 0.1228,
"step": 3389
},
{
"epoch": 0.93,
"grad_norm": 2.785788059234619,
"learning_rate": 2.84101405297158e-08,
"loss": 0.1186,
"step": 3390
},
{
"epoch": 0.93,
"grad_norm": 2.6449506282806396,
"learning_rate": 2.820107769288571e-08,
"loss": 0.1069,
"step": 3391
},
{
"epoch": 0.93,
"grad_norm": 3.226897716522217,
"learning_rate": 2.7992775916979795e-08,
"loss": 0.1224,
"step": 3392
},
{
"epoch": 0.93,
"grad_norm": 2.5964953899383545,
"learning_rate": 2.778523536512867e-08,
"loss": 0.1094,
"step": 3393
},
{
"epoch": 0.93,
"grad_norm": 2.71575665473938,
"learning_rate": 2.7578456199866983e-08,
"loss": 0.1149,
"step": 3394
},
{
"epoch": 0.93,
"grad_norm": 2.5593042373657227,
"learning_rate": 2.7372438583133208e-08,
"loss": 0.1139,
"step": 3395
},
{
"epoch": 0.93,
"grad_norm": 2.764103412628174,
"learning_rate": 2.716718267626905e-08,
"loss": 0.1139,
"step": 3396
},
{
"epoch": 0.93,
"grad_norm": 2.595871686935425,
"learning_rate": 2.696268864002027e-08,
"loss": 0.1053,
"step": 3397
},
{
"epoch": 0.93,
"grad_norm": 2.902984857559204,
"learning_rate": 2.6758956634535536e-08,
"loss": 0.1249,
"step": 3398
},
{
"epoch": 0.93,
"grad_norm": 2.6692392826080322,
"learning_rate": 2.6555986819366772e-08,
"loss": 0.1171,
"step": 3399
},
{
"epoch": 0.93,
"grad_norm": 2.914112091064453,
"learning_rate": 2.6353779353469385e-08,
"loss": 0.1193,
"step": 3400
},
{
"epoch": 0.93,
"grad_norm": 2.8296852111816406,
"learning_rate": 2.6152334395200925e-08,
"loss": 0.1169,
"step": 3401
},
{
"epoch": 0.93,
"grad_norm": 2.7452199459075928,
"learning_rate": 2.5951652102322862e-08,
"loss": 0.1093,
"step": 3402
},
{
"epoch": 0.93,
"grad_norm": 2.833092451095581,
"learning_rate": 2.575173263199837e-08,
"loss": 0.1177,
"step": 3403
},
{
"epoch": 0.93,
"grad_norm": 2.8856849670410156,
"learning_rate": 2.555257614079387e-08,
"loss": 0.1178,
"step": 3404
},
{
"epoch": 0.93,
"grad_norm": 2.7653207778930664,
"learning_rate": 2.535418278467838e-08,
"loss": 0.1273,
"step": 3405
},
{
"epoch": 0.93,
"grad_norm": 2.9552650451660156,
"learning_rate": 2.5156552719022394e-08,
"loss": 0.1129,
"step": 3406
},
{
"epoch": 0.93,
"grad_norm": 2.7580583095550537,
"learning_rate": 2.4959686098599554e-08,
"loss": 0.1214,
"step": 3407
},
{
"epoch": 0.93,
"grad_norm": 2.765307903289795,
"learning_rate": 2.4763583077585083e-08,
"loss": 0.1117,
"step": 3408
},
{
"epoch": 0.93,
"grad_norm": 2.7255513668060303,
"learning_rate": 2.4568243809556577e-08,
"loss": 0.1127,
"step": 3409
},
{
"epoch": 0.93,
"grad_norm": 2.5587587356567383,
"learning_rate": 2.4373668447493224e-08,
"loss": 0.1042,
"step": 3410
},
{
"epoch": 0.93,
"grad_norm": 2.553455352783203,
"learning_rate": 2.4179857143776017e-08,
"loss": 0.1137,
"step": 3411
},
{
"epoch": 0.93,
"grad_norm": 2.822624444961548,
"learning_rate": 2.3986810050187543e-08,
"loss": 0.1121,
"step": 3412
},
{
"epoch": 0.93,
"grad_norm": 2.892587661743164,
"learning_rate": 2.3794527317911983e-08,
"loss": 0.113,
"step": 3413
},
{
"epoch": 0.93,
"grad_norm": 2.899445056915283,
"learning_rate": 2.3603009097534986e-08,
"loss": 0.1241,
"step": 3414
},
{
"epoch": 0.93,
"grad_norm": 2.861823320388794,
"learning_rate": 2.3412255539043357e-08,
"loss": 0.1123,
"step": 3415
},
{
"epoch": 0.93,
"grad_norm": 2.8747031688690186,
"learning_rate": 2.3222266791824928e-08,
"loss": 0.1169,
"step": 3416
},
{
"epoch": 0.93,
"grad_norm": 2.610382318496704,
"learning_rate": 2.3033043004668907e-08,
"loss": 0.108,
"step": 3417
},
{
"epoch": 0.93,
"grad_norm": 2.993743419647217,
"learning_rate": 2.2844584325765083e-08,
"loss": 0.132,
"step": 3418
},
{
"epoch": 0.93,
"grad_norm": 2.6418840885162354,
"learning_rate": 2.2656890902704175e-08,
"loss": 0.0968,
"step": 3419
},
{
"epoch": 0.93,
"grad_norm": 2.785472869873047,
"learning_rate": 2.2469962882478043e-08,
"loss": 0.1225,
"step": 3420
},
{
"epoch": 0.93,
"grad_norm": 2.6220009326934814,
"learning_rate": 2.228380041147815e-08,
"loss": 0.1124,
"step": 3421
},
{
"epoch": 0.93,
"grad_norm": 2.8879494667053223,
"learning_rate": 2.209840363549742e-08,
"loss": 0.1219,
"step": 3422
},
{
"epoch": 0.94,
"grad_norm": 3.032655715942383,
"learning_rate": 2.1913772699728273e-08,
"loss": 0.1195,
"step": 3423
},
{
"epoch": 0.94,
"grad_norm": 3.276188373565674,
"learning_rate": 2.1729907748764152e-08,
"loss": 0.1382,
"step": 3424
},
{
"epoch": 0.94,
"grad_norm": 2.5606958866119385,
"learning_rate": 2.1546808926598103e-08,
"loss": 0.1086,
"step": 3425
},
{
"epoch": 0.94,
"grad_norm": 3.0688209533691406,
"learning_rate": 2.136447637662342e-08,
"loss": 0.1297,
"step": 3426
},
{
"epoch": 0.94,
"grad_norm": 2.614536762237549,
"learning_rate": 2.118291024163299e-08,
"loss": 0.1041,
"step": 3427
},
{
"epoch": 0.94,
"grad_norm": 2.9773361682891846,
"learning_rate": 2.100211066381985e-08,
"loss": 0.1188,
"step": 3428
},
{
"epoch": 0.94,
"grad_norm": 2.830986261367798,
"learning_rate": 2.0822077784776516e-08,
"loss": 0.1141,
"step": 3429
},
{
"epoch": 0.94,
"grad_norm": 2.704211473464966,
"learning_rate": 2.0642811745495204e-08,
"loss": 0.1225,
"step": 3430
},
{
"epoch": 0.94,
"grad_norm": 2.675964832305908,
"learning_rate": 2.046431268636739e-08,
"loss": 0.1136,
"step": 3431
},
{
"epoch": 0.94,
"grad_norm": 2.658668279647827,
"learning_rate": 2.0286580747184035e-08,
"loss": 0.1036,
"step": 3432
},
{
"epoch": 0.94,
"grad_norm": 2.5797698497772217,
"learning_rate": 2.0109616067135126e-08,
"loss": 0.1096,
"step": 3433
},
{
"epoch": 0.94,
"grad_norm": 3.078524351119995,
"learning_rate": 1.993341878481003e-08,
"loss": 0.1322,
"step": 3434
},
{
"epoch": 0.94,
"grad_norm": 2.666562080383301,
"learning_rate": 1.9757989038197143e-08,
"loss": 0.1189,
"step": 3435
},
{
"epoch": 0.94,
"grad_norm": 2.893813133239746,
"learning_rate": 1.9583326964683678e-08,
"loss": 0.1166,
"step": 3436
},
{
"epoch": 0.94,
"grad_norm": 2.696438789367676,
"learning_rate": 1.940943270105544e-08,
"loss": 0.1163,
"step": 3437
},
{
"epoch": 0.94,
"grad_norm": 2.7810068130493164,
"learning_rate": 1.9236306383497048e-08,
"loss": 0.1085,
"step": 3438
},
{
"epoch": 0.94,
"grad_norm": 2.816138744354248,
"learning_rate": 1.9063948147592045e-08,
"loss": 0.1224,
"step": 3439
},
{
"epoch": 0.94,
"grad_norm": 2.743136167526245,
"learning_rate": 1.8892358128322017e-08,
"loss": 0.1186,
"step": 3440
},
{
"epoch": 0.94,
"grad_norm": 2.7357254028320312,
"learning_rate": 1.8721536460067244e-08,
"loss": 0.1059,
"step": 3441
},
{
"epoch": 0.94,
"grad_norm": 2.933887004852295,
"learning_rate": 1.8551483276605938e-08,
"loss": 0.1349,
"step": 3442
},
{
"epoch": 0.94,
"grad_norm": 2.620206832885742,
"learning_rate": 1.8382198711114572e-08,
"loss": 0.1046,
"step": 3443
},
{
"epoch": 0.94,
"grad_norm": 3.2741897106170654,
"learning_rate": 1.821368289616798e-08,
"loss": 0.1254,
"step": 3444
},
{
"epoch": 0.94,
"grad_norm": 2.9951748847961426,
"learning_rate": 1.8045935963738712e-08,
"loss": 0.128,
"step": 3445
},
{
"epoch": 0.94,
"grad_norm": 3.1498196125030518,
"learning_rate": 1.7878958045197123e-08,
"loss": 0.1163,
"step": 3446
},
{
"epoch": 0.94,
"grad_norm": 2.7662010192871094,
"learning_rate": 1.771274927131139e-08,
"loss": 0.1206,
"step": 3447
},
{
"epoch": 0.94,
"grad_norm": 2.666364908218384,
"learning_rate": 1.7547309772247278e-08,
"loss": 0.1154,
"step": 3448
},
{
"epoch": 0.94,
"grad_norm": 2.7426912784576416,
"learning_rate": 1.7382639677568146e-08,
"loss": 0.1209,
"step": 3449
},
{
"epoch": 0.94,
"grad_norm": 2.5739829540252686,
"learning_rate": 1.721873911623506e-08,
"loss": 0.1134,
"step": 3450
},
{
"epoch": 0.94,
"grad_norm": 2.8193235397338867,
"learning_rate": 1.70556082166059e-08,
"loss": 0.1101,
"step": 3451
},
{
"epoch": 0.94,
"grad_norm": 2.6647956371307373,
"learning_rate": 1.6893247106436136e-08,
"loss": 0.1111,
"step": 3452
},
{
"epoch": 0.94,
"grad_norm": 2.510910987854004,
"learning_rate": 1.6731655912878284e-08,
"loss": 0.1016,
"step": 3453
},
{
"epoch": 0.94,
"grad_norm": 2.9538257122039795,
"learning_rate": 1.657083476248189e-08,
"loss": 0.1216,
"step": 3454
},
{
"epoch": 0.94,
"grad_norm": 2.696749448776245,
"learning_rate": 1.641078378119365e-08,
"loss": 0.1062,
"step": 3455
},
{
"epoch": 0.94,
"grad_norm": 2.8078157901763916,
"learning_rate": 1.6251503094356743e-08,
"loss": 0.103,
"step": 3456
},
{
"epoch": 0.94,
"grad_norm": 2.706535816192627,
"learning_rate": 1.609299282671128e-08,
"loss": 0.1236,
"step": 3457
},
{
"epoch": 0.94,
"grad_norm": 2.724426507949829,
"learning_rate": 1.5935253102394185e-08,
"loss": 0.1068,
"step": 3458
},
{
"epoch": 0.94,
"grad_norm": 2.6226446628570557,
"learning_rate": 1.5778284044938528e-08,
"loss": 0.1058,
"step": 3459
},
{
"epoch": 0.95,
"grad_norm": 2.7852203845977783,
"learning_rate": 1.5622085777274417e-08,
"loss": 0.1154,
"step": 3460
},
{
"epoch": 0.95,
"grad_norm": 2.600698947906494,
"learning_rate": 1.5466658421727675e-08,
"loss": 0.1023,
"step": 3461
},
{
"epoch": 0.95,
"grad_norm": 2.805701494216919,
"learning_rate": 1.5312002100020816e-08,
"loss": 0.1214,
"step": 3462
},
{
"epoch": 0.95,
"grad_norm": 2.6820991039276123,
"learning_rate": 1.5158116933272402e-08,
"loss": 0.1003,
"step": 3463
},
{
"epoch": 0.95,
"grad_norm": 2.6976981163024902,
"learning_rate": 1.500500304199692e-08,
"loss": 0.1091,
"step": 3464
},
{
"epoch": 0.95,
"grad_norm": 2.4099295139312744,
"learning_rate": 1.4852660546105234e-08,
"loss": 0.0961,
"step": 3465
},
{
"epoch": 0.95,
"grad_norm": 2.765096664428711,
"learning_rate": 1.470108956490379e-08,
"loss": 0.1169,
"step": 3466
},
{
"epoch": 0.95,
"grad_norm": 2.628892660140991,
"learning_rate": 1.4550290217094529e-08,
"loss": 0.1113,
"step": 3467
},
{
"epoch": 0.95,
"grad_norm": 3.0857925415039062,
"learning_rate": 1.4400262620775871e-08,
"loss": 0.1271,
"step": 3468
},
{
"epoch": 0.95,
"grad_norm": 3.171969175338745,
"learning_rate": 1.4251006893441164e-08,
"loss": 0.1541,
"step": 3469
},
{
"epoch": 0.95,
"grad_norm": 2.68430233001709,
"learning_rate": 1.4102523151979572e-08,
"loss": 0.1288,
"step": 3470
},
{
"epoch": 0.95,
"grad_norm": 2.7666542530059814,
"learning_rate": 1.3954811512675636e-08,
"loss": 0.103,
"step": 3471
},
{
"epoch": 0.95,
"grad_norm": 2.5653326511383057,
"learning_rate": 1.3807872091209038e-08,
"loss": 0.1054,
"step": 3472
},
{
"epoch": 0.95,
"grad_norm": 2.939504623413086,
"learning_rate": 1.3661705002655177e-08,
"loss": 0.1227,
"step": 3473
},
{
"epoch": 0.95,
"grad_norm": 2.5027456283569336,
"learning_rate": 1.351631036148404e-08,
"loss": 0.1034,
"step": 3474
},
{
"epoch": 0.95,
"grad_norm": 2.8690476417541504,
"learning_rate": 1.3371688281560988e-08,
"loss": 0.1131,
"step": 3475
},
{
"epoch": 0.95,
"grad_norm": 2.7211506366729736,
"learning_rate": 1.3227838876146425e-08,
"loss": 0.122,
"step": 3476
},
{
"epoch": 0.95,
"grad_norm": 3.132707118988037,
"learning_rate": 1.3084762257895344e-08,
"loss": 0.1387,
"step": 3477
},
{
"epoch": 0.95,
"grad_norm": 2.824662208557129,
"learning_rate": 1.2942458538857893e-08,
"loss": 0.1177,
"step": 3478
},
{
"epoch": 0.95,
"grad_norm": 2.608438491821289,
"learning_rate": 1.280092783047848e-08,
"loss": 0.12,
"step": 3479
},
{
"epoch": 0.95,
"grad_norm": 2.574810028076172,
"learning_rate": 1.2660170243596558e-08,
"loss": 0.1164,
"step": 3480
},
{
"epoch": 0.95,
"grad_norm": 2.7951624393463135,
"learning_rate": 1.2520185888445945e-08,
"loss": 0.1081,
"step": 3481
},
{
"epoch": 0.95,
"grad_norm": 2.6384437084198,
"learning_rate": 1.2380974874654837e-08,
"loss": 0.1061,
"step": 3482
},
{
"epoch": 0.95,
"grad_norm": 2.7016963958740234,
"learning_rate": 1.2242537311245804e-08,
"loss": 0.1099,
"step": 3483
},
{
"epoch": 0.95,
"grad_norm": 2.588914155960083,
"learning_rate": 1.2104873306635788e-08,
"loss": 0.0982,
"step": 3484
},
{
"epoch": 0.95,
"grad_norm": 2.8033766746520996,
"learning_rate": 1.1967982968635992e-08,
"loss": 0.1134,
"step": 3485
},
{
"epoch": 0.95,
"grad_norm": 2.7870523929595947,
"learning_rate": 1.1831866404451441e-08,
"loss": 0.0995,
"step": 3486
},
{
"epoch": 0.95,
"grad_norm": 2.943394899368286,
"learning_rate": 1.1696523720681306e-08,
"loss": 0.1316,
"step": 3487
},
{
"epoch": 0.95,
"grad_norm": 2.647881507873535,
"learning_rate": 1.1561955023318915e-08,
"loss": 0.1152,
"step": 3488
},
{
"epoch": 0.95,
"grad_norm": 2.8770816326141357,
"learning_rate": 1.1428160417751186e-08,
"loss": 0.135,
"step": 3489
},
{
"epoch": 0.95,
"grad_norm": 2.982604503631592,
"learning_rate": 1.1295140008758863e-08,
"loss": 0.1231,
"step": 3490
},
{
"epoch": 0.95,
"grad_norm": 2.55593204498291,
"learning_rate": 1.1162893900516501e-08,
"loss": 0.0984,
"step": 3491
},
{
"epoch": 0.95,
"grad_norm": 2.924186944961548,
"learning_rate": 1.1031422196592033e-08,
"loss": 0.1278,
"step": 3492
},
{
"epoch": 0.95,
"grad_norm": 2.758187770843506,
"learning_rate": 1.090072499994732e-08,
"loss": 0.11,
"step": 3493
},
{
"epoch": 0.95,
"grad_norm": 2.764726161956787,
"learning_rate": 1.0770802412937041e-08,
"loss": 0.1144,
"step": 3494
},
{
"epoch": 0.95,
"grad_norm": 2.798654794692993,
"learning_rate": 1.064165453731003e-08,
"loss": 0.1093,
"step": 3495
},
{
"epoch": 0.95,
"grad_norm": 2.662637233734131,
"learning_rate": 1.0513281474207714e-08,
"loss": 0.106,
"step": 3496
},
{
"epoch": 0.96,
"grad_norm": 2.7021846771240234,
"learning_rate": 1.0385683324165007e-08,
"loss": 0.1158,
"step": 3497
},
{
"epoch": 0.96,
"grad_norm": 2.717860698699951,
"learning_rate": 1.0258860187110085e-08,
"loss": 0.1185,
"step": 3498
},
{
"epoch": 0.96,
"grad_norm": 2.8753104209899902,
"learning_rate": 1.0132812162363835e-08,
"loss": 0.1122,
"step": 3499
},
{
"epoch": 0.96,
"grad_norm": 2.7315731048583984,
"learning_rate": 1.0007539348640736e-08,
"loss": 0.1039,
"step": 3500
},
{
"epoch": 0.96,
"grad_norm": 2.7231922149658203,
"learning_rate": 9.883041844047313e-09,
"loss": 0.1079,
"step": 3501
},
{
"epoch": 0.96,
"grad_norm": 2.508077621459961,
"learning_rate": 9.759319746083571e-09,
"loss": 0.1022,
"step": 3502
},
{
"epoch": 0.96,
"grad_norm": 2.6812562942504883,
"learning_rate": 9.636373151642008e-09,
"loss": 0.1047,
"step": 3503
},
{
"epoch": 0.96,
"grad_norm": 2.833235263824463,
"learning_rate": 9.514202157007822e-09,
"loss": 0.1244,
"step": 3504
},
{
"epoch": 0.96,
"grad_norm": 2.7213900089263916,
"learning_rate": 9.392806857858815e-09,
"loss": 0.1147,
"step": 3505
},
{
"epoch": 0.96,
"grad_norm": 2.867309808731079,
"learning_rate": 9.27218734926527e-09,
"loss": 0.1177,
"step": 3506
},
{
"epoch": 0.96,
"grad_norm": 2.6550357341766357,
"learning_rate": 9.152343725689848e-09,
"loss": 0.1129,
"step": 3507
},
{
"epoch": 0.96,
"grad_norm": 2.9081578254699707,
"learning_rate": 9.033276080987805e-09,
"loss": 0.1199,
"step": 3508
},
{
"epoch": 0.96,
"grad_norm": 2.612014055252075,
"learning_rate": 8.914984508406331e-09,
"loss": 0.1026,
"step": 3509
},
{
"epoch": 0.96,
"grad_norm": 2.647467613220215,
"learning_rate": 8.79746910058543e-09,
"loss": 0.1042,
"step": 3510
},
{
"epoch": 0.96,
"grad_norm": 2.7883193492889404,
"learning_rate": 8.680729949556597e-09,
"loss": 0.1047,
"step": 3511
},
{
"epoch": 0.96,
"grad_norm": 2.792728900909424,
"learning_rate": 8.564767146743701e-09,
"loss": 0.1172,
"step": 3512
},
{
"epoch": 0.96,
"grad_norm": 2.797672748565674,
"learning_rate": 8.449580782962763e-09,
"loss": 0.1229,
"step": 3513
},
{
"epoch": 0.96,
"grad_norm": 2.8624258041381836,
"learning_rate": 8.335170948421288e-09,
"loss": 0.1176,
"step": 3514
},
{
"epoch": 0.96,
"grad_norm": 2.7784693241119385,
"learning_rate": 8.221537732719275e-09,
"loss": 0.107,
"step": 3515
},
{
"epoch": 0.96,
"grad_norm": 2.8000707626342773,
"learning_rate": 8.108681224848091e-09,
"loss": 0.1218,
"step": 3516
},
{
"epoch": 0.96,
"grad_norm": 2.9061808586120605,
"learning_rate": 7.996601513190704e-09,
"loss": 0.1111,
"step": 3517
},
{
"epoch": 0.96,
"grad_norm": 2.769812822341919,
"learning_rate": 7.885298685522235e-09,
"loss": 0.1137,
"step": 3518
},
{
"epoch": 0.96,
"grad_norm": 3.251732110977173,
"learning_rate": 7.774772829008847e-09,
"loss": 0.1348,
"step": 3519
},
{
"epoch": 0.96,
"grad_norm": 2.7810757160186768,
"learning_rate": 7.665024030208633e-09,
"loss": 0.1214,
"step": 3520
},
{
"epoch": 0.96,
"grad_norm": 2.72450590133667,
"learning_rate": 7.556052375070954e-09,
"loss": 0.1058,
"step": 3521
},
{
"epoch": 0.96,
"grad_norm": 2.5659492015838623,
"learning_rate": 7.447857948936654e-09,
"loss": 0.1089,
"step": 3522
},
{
"epoch": 0.96,
"grad_norm": 2.7431042194366455,
"learning_rate": 7.340440836537731e-09,
"loss": 0.1078,
"step": 3523
},
{
"epoch": 0.96,
"grad_norm": 2.6699774265289307,
"learning_rate": 7.2338011219973405e-09,
"loss": 0.1102,
"step": 3524
},
{
"epoch": 0.96,
"grad_norm": 2.532212734222412,
"learning_rate": 7.1279388888303425e-09,
"loss": 0.094,
"step": 3525
},
{
"epoch": 0.96,
"grad_norm": 2.7536232471466064,
"learning_rate": 7.022854219942198e-09,
"loss": 0.1141,
"step": 3526
},
{
"epoch": 0.96,
"grad_norm": 2.757707118988037,
"learning_rate": 6.9185471976296314e-09,
"loss": 0.1162,
"step": 3527
},
{
"epoch": 0.96,
"grad_norm": 2.800217628479004,
"learning_rate": 6.8150179035803e-09,
"loss": 0.1091,
"step": 3528
},
{
"epoch": 0.96,
"grad_norm": 2.950460195541382,
"learning_rate": 6.712266418872792e-09,
"loss": 0.125,
"step": 3529
},
{
"epoch": 0.96,
"grad_norm": 2.503347873687744,
"learning_rate": 6.610292823976627e-09,
"loss": 0.0961,
"step": 3530
},
{
"epoch": 0.96,
"grad_norm": 2.7922205924987793,
"learning_rate": 6.509097198752144e-09,
"loss": 0.1141,
"step": 3531
},
{
"epoch": 0.96,
"grad_norm": 2.7933483123779297,
"learning_rate": 6.408679622450064e-09,
"loss": 0.1178,
"step": 3532
},
{
"epoch": 0.97,
"grad_norm": 2.8715362548828125,
"learning_rate": 6.309040173712366e-09,
"loss": 0.1126,
"step": 3533
},
{
"epoch": 0.97,
"grad_norm": 2.7140932083129883,
"learning_rate": 6.210178930571186e-09,
"loss": 0.1133,
"step": 3534
},
{
"epoch": 0.97,
"grad_norm": 2.7340691089630127,
"learning_rate": 6.11209597044926e-09,
"loss": 0.1003,
"step": 3535
},
{
"epoch": 0.97,
"grad_norm": 2.64202618598938,
"learning_rate": 6.0147913701601436e-09,
"loss": 0.1133,
"step": 3536
},
{
"epoch": 0.97,
"grad_norm": 2.721435308456421,
"learning_rate": 5.918265205907547e-09,
"loss": 0.1208,
"step": 3537
},
{
"epoch": 0.97,
"grad_norm": 2.8083791732788086,
"learning_rate": 5.822517553285444e-09,
"loss": 0.1217,
"step": 3538
},
{
"epoch": 0.97,
"grad_norm": 2.845024347305298,
"learning_rate": 5.7275484872783e-09,
"loss": 0.1258,
"step": 3539
},
{
"epoch": 0.97,
"grad_norm": 2.8469386100769043,
"learning_rate": 5.633358082260953e-09,
"loss": 0.1186,
"step": 3540
},
{
"epoch": 0.97,
"grad_norm": 2.926581859588623,
"learning_rate": 5.539946411998286e-09,
"loss": 0.1174,
"step": 3541
},
{
"epoch": 0.97,
"grad_norm": 3.118927478790283,
"learning_rate": 5.447313549645116e-09,
"loss": 0.1265,
"step": 3542
},
{
"epoch": 0.97,
"grad_norm": 2.5963892936706543,
"learning_rate": 5.3554595677467455e-09,
"loss": 0.1034,
"step": 3543
},
{
"epoch": 0.97,
"grad_norm": 2.7718214988708496,
"learning_rate": 5.264384538238187e-09,
"loss": 0.1155,
"step": 3544
},
{
"epoch": 0.97,
"grad_norm": 2.664069414138794,
"learning_rate": 5.174088532444609e-09,
"loss": 0.1048,
"step": 3545
},
{
"epoch": 0.97,
"grad_norm": 2.590613842010498,
"learning_rate": 5.084571621080891e-09,
"loss": 0.1121,
"step": 3546
},
{
"epoch": 0.97,
"grad_norm": 2.689129114151001,
"learning_rate": 4.995833874252064e-09,
"loss": 0.1129,
"step": 3547
},
{
"epoch": 0.97,
"grad_norm": 3.0058634281158447,
"learning_rate": 4.907875361452762e-09,
"loss": 0.1259,
"step": 3548
},
{
"epoch": 0.97,
"grad_norm": 3.1234350204467773,
"learning_rate": 4.820696151567105e-09,
"loss": 0.1205,
"step": 3549
},
{
"epoch": 0.97,
"grad_norm": 2.5136115550994873,
"learning_rate": 4.734296312869479e-09,
"loss": 0.095,
"step": 3550
},
{
"epoch": 0.97,
"grad_norm": 2.9749624729156494,
"learning_rate": 4.648675913023648e-09,
"loss": 0.1194,
"step": 3551
},
{
"epoch": 0.97,
"grad_norm": 2.788620948791504,
"learning_rate": 4.563835019082751e-09,
"loss": 0.1075,
"step": 3552
},
{
"epoch": 0.97,
"grad_norm": 2.709120273590088,
"learning_rate": 4.479773697489642e-09,
"loss": 0.1075,
"step": 3553
},
{
"epoch": 0.97,
"grad_norm": 2.753480911254883,
"learning_rate": 4.396492014076769e-09,
"loss": 0.1168,
"step": 3554
},
{
"epoch": 0.97,
"grad_norm": 2.555407762527466,
"learning_rate": 4.31399003406574e-09,
"loss": 0.105,
"step": 3555
},
{
"epoch": 0.97,
"grad_norm": 2.8909194469451904,
"learning_rate": 4.23226782206787e-09,
"loss": 0.106,
"step": 3556
},
{
"epoch": 0.97,
"grad_norm": 2.591259002685547,
"learning_rate": 4.15132544208352e-09,
"loss": 0.1094,
"step": 3557
},
{
"epoch": 0.97,
"grad_norm": 2.816685914993286,
"learning_rate": 4.071162957502428e-09,
"loss": 0.1203,
"step": 3558
},
{
"epoch": 0.97,
"grad_norm": 2.5624732971191406,
"learning_rate": 3.991780431103597e-09,
"loss": 0.1138,
"step": 3559
},
{
"epoch": 0.97,
"grad_norm": 2.464156150817871,
"learning_rate": 3.913177925055189e-09,
"loss": 0.0939,
"step": 3560
},
{
"epoch": 0.97,
"grad_norm": 2.878142833709717,
"learning_rate": 3.835355500914405e-09,
"loss": 0.1185,
"step": 3561
},
{
"epoch": 0.97,
"grad_norm": 2.9618802070617676,
"learning_rate": 3.758313219627718e-09,
"loss": 0.1303,
"step": 3562
},
{
"epoch": 0.97,
"grad_norm": 2.6873321533203125,
"learning_rate": 3.682051141530418e-09,
"loss": 0.1128,
"step": 3563
},
{
"epoch": 0.97,
"grad_norm": 2.9435203075408936,
"learning_rate": 3.606569326346842e-09,
"loss": 0.1114,
"step": 3564
},
{
"epoch": 0.97,
"grad_norm": 2.599876642227173,
"learning_rate": 3.531867833190483e-09,
"loss": 0.1017,
"step": 3565
},
{
"epoch": 0.97,
"grad_norm": 2.731457471847534,
"learning_rate": 3.4579467205634315e-09,
"loss": 0.1232,
"step": 3566
},
{
"epoch": 0.97,
"grad_norm": 2.5069997310638428,
"learning_rate": 3.384806046356714e-09,
"loss": 0.1017,
"step": 3567
},
{
"epoch": 0.97,
"grad_norm": 2.9809789657592773,
"learning_rate": 3.3124458678503996e-09,
"loss": 0.1224,
"step": 3568
},
{
"epoch": 0.97,
"grad_norm": 2.742586612701416,
"learning_rate": 3.240866241712825e-09,
"loss": 0.1144,
"step": 3569
},
{
"epoch": 0.98,
"grad_norm": 2.6126275062561035,
"learning_rate": 3.1700672240014825e-09,
"loss": 0.1083,
"step": 3570
},
{
"epoch": 0.98,
"grad_norm": 2.58613657951355,
"learning_rate": 3.100048870162353e-09,
"loss": 0.1104,
"step": 3571
},
{
"epoch": 0.98,
"grad_norm": 3.0407299995422363,
"learning_rate": 3.0308112350301284e-09,
"loss": 0.1241,
"step": 3572
},
{
"epoch": 0.98,
"grad_norm": 2.790891647338867,
"learning_rate": 2.9623543728279908e-09,
"loss": 0.113,
"step": 3573
},
{
"epoch": 0.98,
"grad_norm": 2.999558210372925,
"learning_rate": 2.894678337167611e-09,
"loss": 0.1163,
"step": 3574
},
{
"epoch": 0.98,
"grad_norm": 2.754976511001587,
"learning_rate": 2.827783181049259e-09,
"loss": 0.1083,
"step": 3575
},
{
"epoch": 0.98,
"grad_norm": 2.7892959117889404,
"learning_rate": 2.7616689568616957e-09,
"loss": 0.1154,
"step": 3576
},
{
"epoch": 0.98,
"grad_norm": 2.6803035736083984,
"learning_rate": 2.696335716382059e-09,
"loss": 0.1106,
"step": 3577
},
{
"epoch": 0.98,
"grad_norm": 2.6981587409973145,
"learning_rate": 2.6317835107757535e-09,
"loss": 0.111,
"step": 3578
},
{
"epoch": 0.98,
"grad_norm": 2.82981276512146,
"learning_rate": 2.5680123905966745e-09,
"loss": 0.125,
"step": 3579
},
{
"epoch": 0.98,
"grad_norm": 2.6334357261657715,
"learning_rate": 2.5050224057868716e-09,
"loss": 0.1105,
"step": 3580
},
{
"epoch": 0.98,
"grad_norm": 2.6155431270599365,
"learning_rate": 2.4428136056768856e-09,
"loss": 0.1023,
"step": 3581
},
{
"epoch": 0.98,
"grad_norm": 2.829718828201294,
"learning_rate": 2.3813860389853004e-09,
"loss": 0.111,
"step": 3582
},
{
"epoch": 0.98,
"grad_norm": 2.755837917327881,
"learning_rate": 2.320739753818746e-09,
"loss": 0.1192,
"step": 3583
},
{
"epoch": 0.98,
"grad_norm": 2.7120723724365234,
"learning_rate": 2.260874797672341e-09,
"loss": 0.108,
"step": 3584
},
{
"epoch": 0.98,
"grad_norm": 2.784351110458374,
"learning_rate": 2.2017912174289164e-09,
"loss": 0.0985,
"step": 3585
},
{
"epoch": 0.98,
"grad_norm": 2.7912638187408447,
"learning_rate": 2.1434890593596823e-09,
"loss": 0.1152,
"step": 3586
},
{
"epoch": 0.98,
"grad_norm": 2.814800262451172,
"learning_rate": 2.0859683691238916e-09,
"loss": 0.1122,
"step": 3587
},
{
"epoch": 0.98,
"grad_norm": 2.7484853267669678,
"learning_rate": 2.0292291917684e-09,
"loss": 0.1215,
"step": 3588
},
{
"epoch": 0.98,
"grad_norm": 2.751107931137085,
"learning_rate": 1.973271571728441e-09,
"loss": 0.1075,
"step": 3589
},
{
"epoch": 0.98,
"grad_norm": 2.893585681915283,
"learning_rate": 1.9180955528270705e-09,
"loss": 0.1162,
"step": 3590
},
{
"epoch": 0.98,
"grad_norm": 2.7889368534088135,
"learning_rate": 1.8637011782751675e-09,
"loss": 0.1089,
"step": 3591
},
{
"epoch": 0.98,
"grad_norm": 2.986337184906006,
"learning_rate": 1.8100884906714353e-09,
"loss": 0.1218,
"step": 3592
},
{
"epoch": 0.98,
"grad_norm": 2.7190985679626465,
"learning_rate": 1.7572575320023987e-09,
"loss": 0.1069,
"step": 3593
},
{
"epoch": 0.98,
"grad_norm": 2.880392551422119,
"learning_rate": 1.705208343642739e-09,
"loss": 0.1255,
"step": 3594
},
{
"epoch": 0.98,
"grad_norm": 2.922776460647583,
"learning_rate": 1.6539409663542947e-09,
"loss": 0.1243,
"step": 3595
},
{
"epoch": 0.98,
"grad_norm": 2.5914292335510254,
"learning_rate": 1.6034554402870603e-09,
"loss": 0.107,
"step": 3596
},
{
"epoch": 0.98,
"grad_norm": 2.5544075965881348,
"learning_rate": 1.5537518049785204e-09,
"loss": 0.1048,
"step": 3597
},
{
"epoch": 0.98,
"grad_norm": 2.5028624534606934,
"learning_rate": 1.504830099353982e-09,
"loss": 0.1016,
"step": 3598
},
{
"epoch": 0.98,
"grad_norm": 2.731548309326172,
"learning_rate": 1.4566903617263537e-09,
"loss": 0.1201,
"step": 3599
},
{
"epoch": 0.98,
"grad_norm": 2.51127552986145,
"learning_rate": 1.409332629795923e-09,
"loss": 0.108,
"step": 3600
},
{
"epoch": 0.98,
"grad_norm": 2.9954733848571777,
"learning_rate": 1.3627569406509109e-09,
"loss": 0.1191,
"step": 3601
},
{
"epoch": 0.98,
"grad_norm": 2.8282675743103027,
"learning_rate": 1.316963330766807e-09,
"loss": 0.1226,
"step": 3602
},
{
"epoch": 0.98,
"grad_norm": 2.9065797328948975,
"learning_rate": 1.2719518360068127e-09,
"loss": 0.1172,
"step": 3603
},
{
"epoch": 0.98,
"grad_norm": 2.609205484390259,
"learning_rate": 1.227722491621397e-09,
"loss": 0.1067,
"step": 3604
},
{
"epoch": 0.98,
"grad_norm": 2.8958191871643066,
"learning_rate": 1.18427533224863e-09,
"loss": 0.1272,
"step": 3605
},
{
"epoch": 0.99,
"grad_norm": 3.1069350242614746,
"learning_rate": 1.1416103919141828e-09,
"loss": 0.1145,
"step": 3606
},
{
"epoch": 0.99,
"grad_norm": 3.0344345569610596,
"learning_rate": 1.0997277040306619e-09,
"loss": 0.1246,
"step": 3607
},
{
"epoch": 0.99,
"grad_norm": 2.7096669673919678,
"learning_rate": 1.058627301398607e-09,
"loss": 0.106,
"step": 3608
},
{
"epoch": 0.99,
"grad_norm": 2.5916988849639893,
"learning_rate": 1.018309216205493e-09,
"loss": 0.1099,
"step": 3609
},
{
"epoch": 0.99,
"grad_norm": 2.8465921878814697,
"learning_rate": 9.787734800263959e-10,
"loss": 0.1218,
"step": 3610
},
{
"epoch": 0.99,
"grad_norm": 2.4942994117736816,
"learning_rate": 9.400201238235484e-10,
"loss": 0.0934,
"step": 3611
},
{
"epoch": 0.99,
"grad_norm": 2.657019853591919,
"learning_rate": 9.020491779464512e-10,
"loss": 0.1116,
"step": 3612
},
{
"epoch": 0.99,
"grad_norm": 2.7186219692230225,
"learning_rate": 8.64860672131984e-10,
"loss": 0.1143,
"step": 3613
},
{
"epoch": 0.99,
"grad_norm": 2.7532966136932373,
"learning_rate": 8.284546355041833e-10,
"loss": 0.1102,
"step": 3614
},
{
"epoch": 0.99,
"grad_norm": 2.6225197315216064,
"learning_rate": 7.928310965742424e-10,
"loss": 0.0991,
"step": 3615
},
{
"epoch": 0.99,
"grad_norm": 2.7968692779541016,
"learning_rate": 7.579900832407338e-10,
"loss": 0.1177,
"step": 3616
},
{
"epoch": 0.99,
"grad_norm": 2.871469020843506,
"learning_rate": 7.239316227891645e-10,
"loss": 0.1258,
"step": 3617
},
{
"epoch": 0.99,
"grad_norm": 2.570632219314575,
"learning_rate": 6.906557418923098e-10,
"loss": 0.1111,
"step": 3618
},
{
"epoch": 0.99,
"grad_norm": 2.5789928436279297,
"learning_rate": 6.581624666102126e-10,
"loss": 0.1092,
"step": 3619
},
{
"epoch": 0.99,
"grad_norm": 2.8352792263031006,
"learning_rate": 6.264518223896287e-10,
"loss": 0.1321,
"step": 3620
},
{
"epoch": 0.99,
"grad_norm": 2.631747245788574,
"learning_rate": 5.955238340648039e-10,
"loss": 0.1173,
"step": 3621
},
{
"epoch": 0.99,
"grad_norm": 2.6391682624816895,
"learning_rate": 5.653785258568078e-10,
"loss": 0.1041,
"step": 3622
},
{
"epoch": 0.99,
"grad_norm": 2.706432342529297,
"learning_rate": 5.360159213738669e-10,
"loss": 0.1133,
"step": 3623
},
{
"epoch": 0.99,
"grad_norm": 2.7213144302368164,
"learning_rate": 5.074360436112535e-10,
"loss": 0.1156,
"step": 3624
},
{
"epoch": 0.99,
"grad_norm": 2.980194568634033,
"learning_rate": 4.796389149511748e-10,
"loss": 0.1339,
"step": 3625
},
{
"epoch": 0.99,
"grad_norm": 2.680392265319824,
"learning_rate": 4.526245571627729e-10,
"loss": 0.1139,
"step": 3626
},
{
"epoch": 0.99,
"grad_norm": 2.7701408863067627,
"learning_rate": 4.2639299140223574e-10,
"loss": 0.1123,
"step": 3627
},
{
"epoch": 0.99,
"grad_norm": 2.9241676330566406,
"learning_rate": 4.00944238212797e-10,
"loss": 0.1376,
"step": 3628
},
{
"epoch": 0.99,
"grad_norm": 2.728848934173584,
"learning_rate": 3.7627831752462534e-10,
"loss": 0.1079,
"step": 3629
},
{
"epoch": 0.99,
"grad_norm": 2.6433091163635254,
"learning_rate": 3.5239524865460224e-10,
"loss": 0.1052,
"step": 3630
},
{
"epoch": 0.99,
"grad_norm": 2.6879498958587646,
"learning_rate": 3.2929505030676594e-10,
"loss": 0.1137,
"step": 3631
},
{
"epoch": 0.99,
"grad_norm": 2.6512322425842285,
"learning_rate": 3.0697774057197867e-10,
"loss": 0.1084,
"step": 3632
},
{
"epoch": 0.99,
"grad_norm": 2.888759136199951,
"learning_rate": 2.854433369278153e-10,
"loss": 0.1168,
"step": 3633
},
{
"epoch": 0.99,
"grad_norm": 2.929271697998047,
"learning_rate": 2.646918562390077e-10,
"loss": 0.1282,
"step": 3634
},
{
"epoch": 0.99,
"grad_norm": 2.8085947036743164,
"learning_rate": 2.447233147570005e-10,
"loss": 0.1083,
"step": 3635
},
{
"epoch": 0.99,
"grad_norm": 2.3818328380584717,
"learning_rate": 2.255377281199511e-10,
"loss": 0.0916,
"step": 3636
},
{
"epoch": 0.99,
"grad_norm": 2.786105155944824,
"learning_rate": 2.0713511135317386e-10,
"loss": 0.1027,
"step": 3637
},
{
"epoch": 0.99,
"grad_norm": 2.8876585960388184,
"learning_rate": 1.8951547886858488e-10,
"loss": 0.1132,
"step": 3638
},
{
"epoch": 0.99,
"grad_norm": 2.8356857299804688,
"learning_rate": 1.7267884446470205e-10,
"loss": 0.1197,
"step": 3639
},
{
"epoch": 0.99,
"grad_norm": 2.8191254138946533,
"learning_rate": 1.5662522132742218e-10,
"loss": 0.128,
"step": 3640
},
{
"epoch": 0.99,
"grad_norm": 2.6730127334594727,
"learning_rate": 1.4135462202879977e-10,
"loss": 0.1158,
"step": 3641
},
{
"epoch": 0.99,
"grad_norm": 2.982252359390259,
"learning_rate": 1.2686705852804625e-10,
"loss": 0.1205,
"step": 3642
},
{
"epoch": 1.0,
"grad_norm": 2.9249508380889893,
"learning_rate": 1.1316254217119681e-10,
"loss": 0.1337,
"step": 3643
},
{
"epoch": 1.0,
"grad_norm": 3.0043392181396484,
"learning_rate": 1.0024108369066641e-10,
"loss": 0.1161,
"step": 3644
},
{
"epoch": 1.0,
"grad_norm": 2.7565393447875977,
"learning_rate": 8.810269320591591e-11,
"loss": 0.106,
"step": 3645
},
{
"epoch": 1.0,
"grad_norm": 3.222933530807495,
"learning_rate": 7.674738022311888e-11,
"loss": 0.1317,
"step": 3646
},
{
"epoch": 1.0,
"grad_norm": 2.6725003719329834,
"learning_rate": 6.617515363527282e-11,
"loss": 0.107,
"step": 3647
},
{
"epoch": 1.0,
"grad_norm": 2.7961084842681885,
"learning_rate": 5.638602172175488e-11,
"loss": 0.1191,
"step": 3648
},
{
"epoch": 1.0,
"grad_norm": 2.64941668510437,
"learning_rate": 4.737999214898814e-11,
"loss": 0.1056,
"step": 3649
},
{
"epoch": 1.0,
"grad_norm": 2.6278815269470215,
"learning_rate": 3.91570719699974e-11,
"loss": 0.1107,
"step": 3650
},
{
"epoch": 1.0,
"grad_norm": 2.79891037940979,
"learning_rate": 3.1717267624520316e-11,
"loss": 0.122,
"step": 3651
},
{
"epoch": 1.0,
"grad_norm": 2.6240437030792236,
"learning_rate": 2.5060584939118334e-11,
"loss": 0.1045,
"step": 3652
},
{
"epoch": 1.0,
"grad_norm": 2.7435734272003174,
"learning_rate": 1.9187029126843666e-11,
"loss": 0.1113,
"step": 3653
},
{
"epoch": 1.0,
"grad_norm": 2.657651662826538,
"learning_rate": 1.4096604787572353e-11,
"loss": 0.1108,
"step": 3654
},
{
"epoch": 1.0,
"grad_norm": 2.700277328491211,
"learning_rate": 9.789315907893226e-12,
"loss": 0.118,
"step": 3655
},
{
"epoch": 1.0,
"grad_norm": 2.825643301010132,
"learning_rate": 6.2651658608858795e-12,
"loss": 0.1242,
"step": 3656
},
{
"epoch": 1.0,
"grad_norm": 2.6722335815429688,
"learning_rate": 3.5241574067867983e-12,
"loss": 0.111,
"step": 3657
},
{
"epoch": 1.0,
"grad_norm": 2.6393723487854004,
"learning_rate": 1.566292691879134e-12,
"loss": 0.1113,
"step": 3658
},
{
"epoch": 1.0,
"grad_norm": 3.331031560897827,
"learning_rate": 3.9157324960292783e-13,
"loss": 0.1432,
"step": 3659
},
{
"epoch": 1.0,
"grad_norm": 2.9109230041503906,
"learning_rate": 0.0,
"loss": 0.1082,
"step": 3660
},
{
"epoch": 1.0,
"step": 3660,
"total_flos": 1.0073310583974789e+18,
"train_loss": 0.12687737517710265,
"train_runtime": 6334.926,
"train_samples_per_second": 73.964,
"train_steps_per_second": 0.578
}
],
"logging_steps": 1.0,
"max_steps": 3660,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"total_flos": 1.0073310583974789e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}