Files
Aerospace-llama3_1_8B_instruct/trainer_state.json
ModelHub XC 548485b2b9 初始化项目,由ModelHub XC社区提供模型
Model: BAAI/Aerospace-llama3_1_8B_instruct
Source: Original Platform
2026-05-19 12:00:47 +08:00

26249 lines
634 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3737,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002675943270002676,
"grad_norm": 12.188919067382812,
"learning_rate": 1.0695187165775401e-08,
"loss": 1.3815,
"step": 1
},
{
"epoch": 0.0005351886540005352,
"grad_norm": 9.284855842590332,
"learning_rate": 2.1390374331550803e-08,
"loss": 1.4795,
"step": 2
},
{
"epoch": 0.0008027829810008028,
"grad_norm": 10.941431999206543,
"learning_rate": 3.2085561497326206e-08,
"loss": 1.3657,
"step": 3
},
{
"epoch": 0.0010703773080010704,
"grad_norm": 13.973172187805176,
"learning_rate": 4.2780748663101606e-08,
"loss": 1.4645,
"step": 4
},
{
"epoch": 0.001337971635001338,
"grad_norm": 10.671640396118164,
"learning_rate": 5.3475935828877005e-08,
"loss": 1.3105,
"step": 5
},
{
"epoch": 0.0016055659620016055,
"grad_norm": 12.536456108093262,
"learning_rate": 6.417112299465241e-08,
"loss": 1.4371,
"step": 6
},
{
"epoch": 0.0018731602890018732,
"grad_norm": 12.72536849975586,
"learning_rate": 7.48663101604278e-08,
"loss": 1.3752,
"step": 7
},
{
"epoch": 0.0021407546160021407,
"grad_norm": 11.476215362548828,
"learning_rate": 8.556149732620321e-08,
"loss": 1.3656,
"step": 8
},
{
"epoch": 0.002408348943002408,
"grad_norm": 11.222175598144531,
"learning_rate": 9.625668449197862e-08,
"loss": 1.2515,
"step": 9
},
{
"epoch": 0.002675943270002676,
"grad_norm": 10.473676681518555,
"learning_rate": 1.0695187165775401e-07,
"loss": 1.3108,
"step": 10
},
{
"epoch": 0.0029435375970029436,
"grad_norm": 13.289176940917969,
"learning_rate": 1.1764705882352942e-07,
"loss": 1.4861,
"step": 11
},
{
"epoch": 0.003211131924003211,
"grad_norm": 12.152190208435059,
"learning_rate": 1.2834224598930482e-07,
"loss": 1.4196,
"step": 12
},
{
"epoch": 0.0034787262510034785,
"grad_norm": 12.78124713897705,
"learning_rate": 1.3903743315508023e-07,
"loss": 1.3621,
"step": 13
},
{
"epoch": 0.0037463205780037465,
"grad_norm": 13.670173645019531,
"learning_rate": 1.497326203208556e-07,
"loss": 1.3979,
"step": 14
},
{
"epoch": 0.004013914905004014,
"grad_norm": 11.728761672973633,
"learning_rate": 1.6042780748663104e-07,
"loss": 1.4335,
"step": 15
},
{
"epoch": 0.004281509232004281,
"grad_norm": 12.166805267333984,
"learning_rate": 1.7112299465240642e-07,
"loss": 1.2879,
"step": 16
},
{
"epoch": 0.004549103559004549,
"grad_norm": 11.862377166748047,
"learning_rate": 1.8181818181818183e-07,
"loss": 1.5192,
"step": 17
},
{
"epoch": 0.004816697886004816,
"grad_norm": 14.839558601379395,
"learning_rate": 1.9251336898395724e-07,
"loss": 1.4282,
"step": 18
},
{
"epoch": 0.005084292213005084,
"grad_norm": 9.195609092712402,
"learning_rate": 2.0320855614973264e-07,
"loss": 1.3435,
"step": 19
},
{
"epoch": 0.005351886540005352,
"grad_norm": 12.21860408782959,
"learning_rate": 2.1390374331550802e-07,
"loss": 1.438,
"step": 20
},
{
"epoch": 0.00561948086700562,
"grad_norm": 14.291121482849121,
"learning_rate": 2.2459893048128345e-07,
"loss": 1.478,
"step": 21
},
{
"epoch": 0.005887075194005887,
"grad_norm": 11.789957046508789,
"learning_rate": 2.3529411764705883e-07,
"loss": 1.3549,
"step": 22
},
{
"epoch": 0.006154669521006155,
"grad_norm": 8.266191482543945,
"learning_rate": 2.459893048128342e-07,
"loss": 1.3308,
"step": 23
},
{
"epoch": 0.006422263848006422,
"grad_norm": 14.99113941192627,
"learning_rate": 2.5668449197860965e-07,
"loss": 1.4145,
"step": 24
},
{
"epoch": 0.00668985817500669,
"grad_norm": 12.698420524597168,
"learning_rate": 2.6737967914438503e-07,
"loss": 1.3392,
"step": 25
},
{
"epoch": 0.006957452502006957,
"grad_norm": 9.676013946533203,
"learning_rate": 2.7807486631016046e-07,
"loss": 1.2585,
"step": 26
},
{
"epoch": 0.0072250468290072254,
"grad_norm": 10.009092330932617,
"learning_rate": 2.8877005347593584e-07,
"loss": 1.4387,
"step": 27
},
{
"epoch": 0.007492641156007493,
"grad_norm": 10.498503684997559,
"learning_rate": 2.994652406417112e-07,
"loss": 1.2998,
"step": 28
},
{
"epoch": 0.00776023548300776,
"grad_norm": 8.508102416992188,
"learning_rate": 3.1016042780748665e-07,
"loss": 1.4256,
"step": 29
},
{
"epoch": 0.008027829810008028,
"grad_norm": 13.676020622253418,
"learning_rate": 3.208556149732621e-07,
"loss": 1.4097,
"step": 30
},
{
"epoch": 0.008295424137008296,
"grad_norm": 8.145886421203613,
"learning_rate": 3.3155080213903747e-07,
"loss": 1.4282,
"step": 31
},
{
"epoch": 0.008563018464008563,
"grad_norm": 9.673611640930176,
"learning_rate": 3.4224598930481285e-07,
"loss": 1.4322,
"step": 32
},
{
"epoch": 0.008830612791008831,
"grad_norm": 8.00688362121582,
"learning_rate": 3.529411764705883e-07,
"loss": 1.2729,
"step": 33
},
{
"epoch": 0.009098207118009098,
"grad_norm": 8.261327743530273,
"learning_rate": 3.6363636363636366e-07,
"loss": 1.3191,
"step": 34
},
{
"epoch": 0.009365801445009366,
"grad_norm": 7.580234527587891,
"learning_rate": 3.7433155080213904e-07,
"loss": 1.3552,
"step": 35
},
{
"epoch": 0.009633395772009633,
"grad_norm": 6.747824668884277,
"learning_rate": 3.8502673796791447e-07,
"loss": 1.3651,
"step": 36
},
{
"epoch": 0.009900990099009901,
"grad_norm": 7.468786239624023,
"learning_rate": 3.957219251336899e-07,
"loss": 1.4516,
"step": 37
},
{
"epoch": 0.010168584426010168,
"grad_norm": 7.431623935699463,
"learning_rate": 4.064171122994653e-07,
"loss": 1.3,
"step": 38
},
{
"epoch": 0.010436178753010436,
"grad_norm": 7.183818817138672,
"learning_rate": 4.1711229946524066e-07,
"loss": 1.279,
"step": 39
},
{
"epoch": 0.010703773080010704,
"grad_norm": 7.624693393707275,
"learning_rate": 4.2780748663101604e-07,
"loss": 1.2432,
"step": 40
},
{
"epoch": 0.010971367407010971,
"grad_norm": 6.89495325088501,
"learning_rate": 4.3850267379679153e-07,
"loss": 1.3082,
"step": 41
},
{
"epoch": 0.01123896173401124,
"grad_norm": 6.955787181854248,
"learning_rate": 4.491978609625669e-07,
"loss": 1.3646,
"step": 42
},
{
"epoch": 0.011506556061011506,
"grad_norm": 6.60789155960083,
"learning_rate": 4.598930481283423e-07,
"loss": 1.2452,
"step": 43
},
{
"epoch": 0.011774150388011774,
"grad_norm": 5.894134998321533,
"learning_rate": 4.7058823529411767e-07,
"loss": 1.2042,
"step": 44
},
{
"epoch": 0.012041744715012041,
"grad_norm": 6.411407470703125,
"learning_rate": 4.812834224598931e-07,
"loss": 1.3002,
"step": 45
},
{
"epoch": 0.01230933904201231,
"grad_norm": 6.972503185272217,
"learning_rate": 4.919786096256684e-07,
"loss": 1.3314,
"step": 46
},
{
"epoch": 0.012576933369012578,
"grad_norm": 6.686539649963379,
"learning_rate": 5.02673796791444e-07,
"loss": 1.3888,
"step": 47
},
{
"epoch": 0.012844527696012844,
"grad_norm": 5.989624500274658,
"learning_rate": 5.133689839572193e-07,
"loss": 1.2706,
"step": 48
},
{
"epoch": 0.013112122023013113,
"grad_norm": 5.950018405914307,
"learning_rate": 5.240641711229947e-07,
"loss": 1.2127,
"step": 49
},
{
"epoch": 0.01337971635001338,
"grad_norm": 6.040707588195801,
"learning_rate": 5.347593582887701e-07,
"loss": 1.3227,
"step": 50
},
{
"epoch": 0.013647310677013648,
"grad_norm": 5.798983573913574,
"learning_rate": 5.454545454545455e-07,
"loss": 1.274,
"step": 51
},
{
"epoch": 0.013914905004013914,
"grad_norm": 6.321473598480225,
"learning_rate": 5.561497326203209e-07,
"loss": 1.3229,
"step": 52
},
{
"epoch": 0.014182499331014183,
"grad_norm": 6.0828447341918945,
"learning_rate": 5.668449197860964e-07,
"loss": 1.2053,
"step": 53
},
{
"epoch": 0.014450093658014451,
"grad_norm": 6.756501197814941,
"learning_rate": 5.775401069518717e-07,
"loss": 1.2211,
"step": 54
},
{
"epoch": 0.014717687985014717,
"grad_norm": 6.133362770080566,
"learning_rate": 5.882352941176471e-07,
"loss": 1.0539,
"step": 55
},
{
"epoch": 0.014985282312014986,
"grad_norm": 6.778918743133545,
"learning_rate": 5.989304812834224e-07,
"loss": 1.2877,
"step": 56
},
{
"epoch": 0.015252876639015252,
"grad_norm": 6.3190836906433105,
"learning_rate": 6.096256684491979e-07,
"loss": 1.2435,
"step": 57
},
{
"epoch": 0.01552047096601552,
"grad_norm": 6.347466468811035,
"learning_rate": 6.203208556149733e-07,
"loss": 1.2126,
"step": 58
},
{
"epoch": 0.01578806529301579,
"grad_norm": 6.51020622253418,
"learning_rate": 6.310160427807486e-07,
"loss": 1.2392,
"step": 59
},
{
"epoch": 0.016055659620016056,
"grad_norm": 5.295354843139648,
"learning_rate": 6.417112299465242e-07,
"loss": 1.2241,
"step": 60
},
{
"epoch": 0.016323253947016322,
"grad_norm": 5.243033409118652,
"learning_rate": 6.524064171122996e-07,
"loss": 1.1209,
"step": 61
},
{
"epoch": 0.016590848274016592,
"grad_norm": 6.233066558837891,
"learning_rate": 6.631016042780749e-07,
"loss": 1.3712,
"step": 62
},
{
"epoch": 0.01685844260101686,
"grad_norm": 6.268922328948975,
"learning_rate": 6.737967914438504e-07,
"loss": 1.2861,
"step": 63
},
{
"epoch": 0.017126036928017126,
"grad_norm": 6.287389755249023,
"learning_rate": 6.844919786096257e-07,
"loss": 1.3305,
"step": 64
},
{
"epoch": 0.017393631255017392,
"grad_norm": 5.882622718811035,
"learning_rate": 6.951871657754011e-07,
"loss": 1.2595,
"step": 65
},
{
"epoch": 0.017661225582017662,
"grad_norm": 6.735020637512207,
"learning_rate": 7.058823529411766e-07,
"loss": 1.2909,
"step": 66
},
{
"epoch": 0.01792881990901793,
"grad_norm": 6.339001178741455,
"learning_rate": 7.165775401069519e-07,
"loss": 1.4648,
"step": 67
},
{
"epoch": 0.018196414236018196,
"grad_norm": 5.405106544494629,
"learning_rate": 7.272727272727273e-07,
"loss": 1.1235,
"step": 68
},
{
"epoch": 0.018464008563018466,
"grad_norm": 5.867051124572754,
"learning_rate": 7.379679144385026e-07,
"loss": 1.3021,
"step": 69
},
{
"epoch": 0.018731602890018732,
"grad_norm": 6.250916481018066,
"learning_rate": 7.486631016042781e-07,
"loss": 1.2687,
"step": 70
},
{
"epoch": 0.018999197217019,
"grad_norm": 5.591010093688965,
"learning_rate": 7.593582887700536e-07,
"loss": 1.1995,
"step": 71
},
{
"epoch": 0.019266791544019266,
"grad_norm": 5.43657112121582,
"learning_rate": 7.700534759358289e-07,
"loss": 1.2271,
"step": 72
},
{
"epoch": 0.019534385871019536,
"grad_norm": 5.751687526702881,
"learning_rate": 7.807486631016044e-07,
"loss": 1.3095,
"step": 73
},
{
"epoch": 0.019801980198019802,
"grad_norm": 5.247724533081055,
"learning_rate": 7.914438502673798e-07,
"loss": 1.257,
"step": 74
},
{
"epoch": 0.02006957452502007,
"grad_norm": 5.353349208831787,
"learning_rate": 8.021390374331551e-07,
"loss": 1.1566,
"step": 75
},
{
"epoch": 0.020337168852020335,
"grad_norm": 5.401248455047607,
"learning_rate": 8.128342245989306e-07,
"loss": 1.1666,
"step": 76
},
{
"epoch": 0.020604763179020606,
"grad_norm": 5.6218953132629395,
"learning_rate": 8.235294117647059e-07,
"loss": 1.2186,
"step": 77
},
{
"epoch": 0.020872357506020872,
"grad_norm": 4.6730875968933105,
"learning_rate": 8.342245989304813e-07,
"loss": 1.0226,
"step": 78
},
{
"epoch": 0.02113995183302114,
"grad_norm": 5.531125545501709,
"learning_rate": 8.449197860962568e-07,
"loss": 1.1753,
"step": 79
},
{
"epoch": 0.02140754616002141,
"grad_norm": 5.861851215362549,
"learning_rate": 8.556149732620321e-07,
"loss": 1.21,
"step": 80
},
{
"epoch": 0.021675140487021675,
"grad_norm": 5.872004508972168,
"learning_rate": 8.663101604278075e-07,
"loss": 1.1952,
"step": 81
},
{
"epoch": 0.021942734814021942,
"grad_norm": 5.292346000671387,
"learning_rate": 8.770053475935831e-07,
"loss": 1.1094,
"step": 82
},
{
"epoch": 0.02221032914102221,
"grad_norm": 5.308231353759766,
"learning_rate": 8.877005347593584e-07,
"loss": 1.1175,
"step": 83
},
{
"epoch": 0.02247792346802248,
"grad_norm": 5.5853071212768555,
"learning_rate": 8.983957219251338e-07,
"loss": 1.2289,
"step": 84
},
{
"epoch": 0.022745517795022745,
"grad_norm": 5.129279613494873,
"learning_rate": 9.090909090909091e-07,
"loss": 1.1456,
"step": 85
},
{
"epoch": 0.023013112122023012,
"grad_norm": 5.8378753662109375,
"learning_rate": 9.197860962566846e-07,
"loss": 1.3367,
"step": 86
},
{
"epoch": 0.023280706449023282,
"grad_norm": 5.8775715827941895,
"learning_rate": 9.3048128342246e-07,
"loss": 1.2341,
"step": 87
},
{
"epoch": 0.02354830077602355,
"grad_norm": 5.519059658050537,
"learning_rate": 9.411764705882353e-07,
"loss": 1.2504,
"step": 88
},
{
"epoch": 0.023815895103023815,
"grad_norm": 5.483979225158691,
"learning_rate": 9.518716577540108e-07,
"loss": 1.1266,
"step": 89
},
{
"epoch": 0.024083489430024082,
"grad_norm": 6.002791881561279,
"learning_rate": 9.625668449197862e-07,
"loss": 1.279,
"step": 90
},
{
"epoch": 0.024351083757024352,
"grad_norm": 5.5504021644592285,
"learning_rate": 9.732620320855615e-07,
"loss": 1.1381,
"step": 91
},
{
"epoch": 0.02461867808402462,
"grad_norm": 5.171264171600342,
"learning_rate": 9.839572192513369e-07,
"loss": 1.217,
"step": 92
},
{
"epoch": 0.024886272411024885,
"grad_norm": 5.0262370109558105,
"learning_rate": 9.946524064171124e-07,
"loss": 1.2421,
"step": 93
},
{
"epoch": 0.025153866738025155,
"grad_norm": 6.277072429656982,
"learning_rate": 1.005347593582888e-06,
"loss": 1.3543,
"step": 94
},
{
"epoch": 0.025421461065025422,
"grad_norm": 5.420050144195557,
"learning_rate": 1.0160427807486633e-06,
"loss": 1.2239,
"step": 95
},
{
"epoch": 0.02568905539202569,
"grad_norm": 5.405261516571045,
"learning_rate": 1.0267379679144386e-06,
"loss": 1.1811,
"step": 96
},
{
"epoch": 0.025956649719025955,
"grad_norm": 5.828834533691406,
"learning_rate": 1.037433155080214e-06,
"loss": 1.1913,
"step": 97
},
{
"epoch": 0.026224244046026225,
"grad_norm": 5.571322441101074,
"learning_rate": 1.0481283422459895e-06,
"loss": 1.1917,
"step": 98
},
{
"epoch": 0.026491838373026492,
"grad_norm": 5.72471284866333,
"learning_rate": 1.0588235294117648e-06,
"loss": 1.3015,
"step": 99
},
{
"epoch": 0.02675943270002676,
"grad_norm": 5.436988353729248,
"learning_rate": 1.0695187165775401e-06,
"loss": 1.3139,
"step": 100
},
{
"epoch": 0.02702702702702703,
"grad_norm": 6.0810394287109375,
"learning_rate": 1.0802139037433156e-06,
"loss": 1.1722,
"step": 101
},
{
"epoch": 0.027294621354027295,
"grad_norm": 5.316585540771484,
"learning_rate": 1.090909090909091e-06,
"loss": 1.1669,
"step": 102
},
{
"epoch": 0.027562215681027562,
"grad_norm": 4.8456950187683105,
"learning_rate": 1.1016042780748663e-06,
"loss": 1.063,
"step": 103
},
{
"epoch": 0.02782981000802783,
"grad_norm": 5.4848952293396,
"learning_rate": 1.1122994652406418e-06,
"loss": 1.1014,
"step": 104
},
{
"epoch": 0.0280974043350281,
"grad_norm": 5.489200592041016,
"learning_rate": 1.1229946524064172e-06,
"loss": 1.1174,
"step": 105
},
{
"epoch": 0.028364998662028365,
"grad_norm": 5.760312080383301,
"learning_rate": 1.1336898395721927e-06,
"loss": 1.3387,
"step": 106
},
{
"epoch": 0.02863259298902863,
"grad_norm": 5.3693413734436035,
"learning_rate": 1.144385026737968e-06,
"loss": 1.2021,
"step": 107
},
{
"epoch": 0.028900187316028902,
"grad_norm": 5.530979633331299,
"learning_rate": 1.1550802139037434e-06,
"loss": 1.3164,
"step": 108
},
{
"epoch": 0.02916778164302917,
"grad_norm": 6.090900897979736,
"learning_rate": 1.165775401069519e-06,
"loss": 1.3232,
"step": 109
},
{
"epoch": 0.029435375970029435,
"grad_norm": 5.393311023712158,
"learning_rate": 1.1764705882352942e-06,
"loss": 1.2509,
"step": 110
},
{
"epoch": 0.0297029702970297,
"grad_norm": 5.429086685180664,
"learning_rate": 1.1871657754010696e-06,
"loss": 1.115,
"step": 111
},
{
"epoch": 0.02997056462402997,
"grad_norm": 5.629342079162598,
"learning_rate": 1.1978609625668449e-06,
"loss": 1.0911,
"step": 112
},
{
"epoch": 0.03023815895103024,
"grad_norm": 5.1444621086120605,
"learning_rate": 1.2085561497326204e-06,
"loss": 1.2814,
"step": 113
},
{
"epoch": 0.030505753278030505,
"grad_norm": 5.754062652587891,
"learning_rate": 1.2192513368983957e-06,
"loss": 1.2656,
"step": 114
},
{
"epoch": 0.030773347605030775,
"grad_norm": 5.319810390472412,
"learning_rate": 1.2299465240641713e-06,
"loss": 1.1083,
"step": 115
},
{
"epoch": 0.03104094193203104,
"grad_norm": 5.084403991699219,
"learning_rate": 1.2406417112299466e-06,
"loss": 1.0987,
"step": 116
},
{
"epoch": 0.03130853625903131,
"grad_norm": 5.552883625030518,
"learning_rate": 1.251336898395722e-06,
"loss": 1.1867,
"step": 117
},
{
"epoch": 0.03157613058603158,
"grad_norm": 5.5615410804748535,
"learning_rate": 1.2620320855614973e-06,
"loss": 1.3064,
"step": 118
},
{
"epoch": 0.03184372491303184,
"grad_norm": 5.348892688751221,
"learning_rate": 1.2727272727272728e-06,
"loss": 1.2016,
"step": 119
},
{
"epoch": 0.03211131924003211,
"grad_norm": 5.782661437988281,
"learning_rate": 1.2834224598930483e-06,
"loss": 1.1626,
"step": 120
},
{
"epoch": 0.03237891356703238,
"grad_norm": 4.8546977043151855,
"learning_rate": 1.2941176470588237e-06,
"loss": 1.0428,
"step": 121
},
{
"epoch": 0.032646507894032645,
"grad_norm": 5.160636901855469,
"learning_rate": 1.3048128342245992e-06,
"loss": 1.1558,
"step": 122
},
{
"epoch": 0.032914102221032915,
"grad_norm": 5.260807514190674,
"learning_rate": 1.3155080213903745e-06,
"loss": 1.1594,
"step": 123
},
{
"epoch": 0.033181696548033185,
"grad_norm": 5.723674774169922,
"learning_rate": 1.3262032085561499e-06,
"loss": 1.0968,
"step": 124
},
{
"epoch": 0.03344929087503345,
"grad_norm": 5.457815170288086,
"learning_rate": 1.3368983957219254e-06,
"loss": 1.2186,
"step": 125
},
{
"epoch": 0.03371688520203372,
"grad_norm": 5.3614501953125,
"learning_rate": 1.3475935828877007e-06,
"loss": 1.2612,
"step": 126
},
{
"epoch": 0.03398447952903398,
"grad_norm": 5.161847114562988,
"learning_rate": 1.358288770053476e-06,
"loss": 1.1159,
"step": 127
},
{
"epoch": 0.03425207385603425,
"grad_norm": 5.856586456298828,
"learning_rate": 1.3689839572192514e-06,
"loss": 1.2016,
"step": 128
},
{
"epoch": 0.03451966818303452,
"grad_norm": 5.2745490074157715,
"learning_rate": 1.379679144385027e-06,
"loss": 1.241,
"step": 129
},
{
"epoch": 0.034787262510034785,
"grad_norm": 5.403688907623291,
"learning_rate": 1.3903743315508022e-06,
"loss": 1.1044,
"step": 130
},
{
"epoch": 0.035054856837035055,
"grad_norm": 5.162592887878418,
"learning_rate": 1.4010695187165776e-06,
"loss": 1.0964,
"step": 131
},
{
"epoch": 0.035322451164035325,
"grad_norm": 5.914812088012695,
"learning_rate": 1.4117647058823531e-06,
"loss": 1.2008,
"step": 132
},
{
"epoch": 0.03559004549103559,
"grad_norm": 5.173002243041992,
"learning_rate": 1.4224598930481284e-06,
"loss": 1.2276,
"step": 133
},
{
"epoch": 0.03585763981803586,
"grad_norm": 5.35471773147583,
"learning_rate": 1.4331550802139038e-06,
"loss": 1.2126,
"step": 134
},
{
"epoch": 0.03612523414503613,
"grad_norm": 5.432989120483398,
"learning_rate": 1.4438502673796793e-06,
"loss": 1.2237,
"step": 135
},
{
"epoch": 0.03639282847203639,
"grad_norm": 5.986301898956299,
"learning_rate": 1.4545454545454546e-06,
"loss": 1.3529,
"step": 136
},
{
"epoch": 0.03666042279903666,
"grad_norm": 4.566310882568359,
"learning_rate": 1.46524064171123e-06,
"loss": 1.1274,
"step": 137
},
{
"epoch": 0.03692801712603693,
"grad_norm": 4.906930923461914,
"learning_rate": 1.4759358288770053e-06,
"loss": 1.077,
"step": 138
},
{
"epoch": 0.037195611453037195,
"grad_norm": 5.325079917907715,
"learning_rate": 1.4866310160427808e-06,
"loss": 1.1759,
"step": 139
},
{
"epoch": 0.037463205780037465,
"grad_norm": 5.840808868408203,
"learning_rate": 1.4973262032085562e-06,
"loss": 1.2583,
"step": 140
},
{
"epoch": 0.03773080010703773,
"grad_norm": 5.32767915725708,
"learning_rate": 1.5080213903743315e-06,
"loss": 1.1345,
"step": 141
},
{
"epoch": 0.037998394434038,
"grad_norm": 5.714527130126953,
"learning_rate": 1.5187165775401072e-06,
"loss": 1.1113,
"step": 142
},
{
"epoch": 0.03826598876103827,
"grad_norm": 5.20102596282959,
"learning_rate": 1.5294117647058826e-06,
"loss": 1.1802,
"step": 143
},
{
"epoch": 0.03853358308803853,
"grad_norm": 5.638003826141357,
"learning_rate": 1.5401069518716579e-06,
"loss": 1.1773,
"step": 144
},
{
"epoch": 0.0388011774150388,
"grad_norm": 6.044027328491211,
"learning_rate": 1.5508021390374334e-06,
"loss": 1.2413,
"step": 145
},
{
"epoch": 0.03906877174203907,
"grad_norm": 5.571484565734863,
"learning_rate": 1.5614973262032088e-06,
"loss": 1.2316,
"step": 146
},
{
"epoch": 0.039336366069039334,
"grad_norm": 4.9136457443237305,
"learning_rate": 1.572192513368984e-06,
"loss": 1.2048,
"step": 147
},
{
"epoch": 0.039603960396039604,
"grad_norm": 5.684943675994873,
"learning_rate": 1.5828877005347596e-06,
"loss": 1.1958,
"step": 148
},
{
"epoch": 0.039871554723039875,
"grad_norm": 5.061483860015869,
"learning_rate": 1.593582887700535e-06,
"loss": 1.0256,
"step": 149
},
{
"epoch": 0.04013914905004014,
"grad_norm": 5.615631580352783,
"learning_rate": 1.6042780748663103e-06,
"loss": 1.1739,
"step": 150
},
{
"epoch": 0.04040674337704041,
"grad_norm": 5.1106791496276855,
"learning_rate": 1.6149732620320858e-06,
"loss": 1.1252,
"step": 151
},
{
"epoch": 0.04067433770404067,
"grad_norm": 5.248224258422852,
"learning_rate": 1.6256684491978611e-06,
"loss": 1.1636,
"step": 152
},
{
"epoch": 0.04094193203104094,
"grad_norm": 5.454551696777344,
"learning_rate": 1.6363636363636365e-06,
"loss": 1.2073,
"step": 153
},
{
"epoch": 0.04120952635804121,
"grad_norm": 4.683351039886475,
"learning_rate": 1.6470588235294118e-06,
"loss": 1.0651,
"step": 154
},
{
"epoch": 0.041477120685041474,
"grad_norm": 5.561789512634277,
"learning_rate": 1.6577540106951873e-06,
"loss": 1.1654,
"step": 155
},
{
"epoch": 0.041744715012041744,
"grad_norm": 4.899862766265869,
"learning_rate": 1.6684491978609627e-06,
"loss": 1.116,
"step": 156
},
{
"epoch": 0.042012309339042014,
"grad_norm": 5.253740310668945,
"learning_rate": 1.679144385026738e-06,
"loss": 1.1066,
"step": 157
},
{
"epoch": 0.04227990366604228,
"grad_norm": 5.0830183029174805,
"learning_rate": 1.6898395721925135e-06,
"loss": 1.0951,
"step": 158
},
{
"epoch": 0.04254749799304255,
"grad_norm": 5.221453666687012,
"learning_rate": 1.7005347593582888e-06,
"loss": 1.1063,
"step": 159
},
{
"epoch": 0.04281509232004282,
"grad_norm": 5.58682918548584,
"learning_rate": 1.7112299465240642e-06,
"loss": 1.0932,
"step": 160
},
{
"epoch": 0.04308268664704308,
"grad_norm": 5.066821098327637,
"learning_rate": 1.7219251336898395e-06,
"loss": 1.2249,
"step": 161
},
{
"epoch": 0.04335028097404335,
"grad_norm": 4.870170593261719,
"learning_rate": 1.732620320855615e-06,
"loss": 1.0948,
"step": 162
},
{
"epoch": 0.04361787530104362,
"grad_norm": 5.372590065002441,
"learning_rate": 1.7433155080213904e-06,
"loss": 1.1966,
"step": 163
},
{
"epoch": 0.043885469628043884,
"grad_norm": 5.09646463394165,
"learning_rate": 1.7540106951871661e-06,
"loss": 1.0542,
"step": 164
},
{
"epoch": 0.044153063955044154,
"grad_norm": 5.390144348144531,
"learning_rate": 1.7647058823529414e-06,
"loss": 1.197,
"step": 165
},
{
"epoch": 0.04442065828204442,
"grad_norm": 5.653879642486572,
"learning_rate": 1.7754010695187168e-06,
"loss": 1.2669,
"step": 166
},
{
"epoch": 0.04468825260904469,
"grad_norm": 5.457263469696045,
"learning_rate": 1.7860962566844923e-06,
"loss": 1.1263,
"step": 167
},
{
"epoch": 0.04495584693604496,
"grad_norm": 5.214939594268799,
"learning_rate": 1.7967914438502676e-06,
"loss": 1.272,
"step": 168
},
{
"epoch": 0.04522344126304522,
"grad_norm": 5.01685905456543,
"learning_rate": 1.807486631016043e-06,
"loss": 1.1515,
"step": 169
},
{
"epoch": 0.04549103559004549,
"grad_norm": 5.313577175140381,
"learning_rate": 1.8181818181818183e-06,
"loss": 1.0548,
"step": 170
},
{
"epoch": 0.04575862991704576,
"grad_norm": 5.595825672149658,
"learning_rate": 1.8288770053475938e-06,
"loss": 1.2787,
"step": 171
},
{
"epoch": 0.046026224244046024,
"grad_norm": 5.331969738006592,
"learning_rate": 1.8395721925133692e-06,
"loss": 1.2543,
"step": 172
},
{
"epoch": 0.046293818571046294,
"grad_norm": 5.564863204956055,
"learning_rate": 1.8502673796791445e-06,
"loss": 1.1323,
"step": 173
},
{
"epoch": 0.046561412898046564,
"grad_norm": 4.885172367095947,
"learning_rate": 1.86096256684492e-06,
"loss": 1.136,
"step": 174
},
{
"epoch": 0.04682900722504683,
"grad_norm": 5.40251350402832,
"learning_rate": 1.8716577540106954e-06,
"loss": 1.1442,
"step": 175
},
{
"epoch": 0.0470966015520471,
"grad_norm": 5.090615749359131,
"learning_rate": 1.8823529411764707e-06,
"loss": 1.1505,
"step": 176
},
{
"epoch": 0.04736419587904737,
"grad_norm": 5.03092622756958,
"learning_rate": 1.893048128342246e-06,
"loss": 1.2347,
"step": 177
},
{
"epoch": 0.04763179020604763,
"grad_norm": 5.308589935302734,
"learning_rate": 1.9037433155080215e-06,
"loss": 1.1856,
"step": 178
},
{
"epoch": 0.0478993845330479,
"grad_norm": 5.609830379486084,
"learning_rate": 1.914438502673797e-06,
"loss": 1.208,
"step": 179
},
{
"epoch": 0.048166978860048164,
"grad_norm": 4.926831245422363,
"learning_rate": 1.9251336898395724e-06,
"loss": 1.1978,
"step": 180
},
{
"epoch": 0.048434573187048434,
"grad_norm": 5.084292888641357,
"learning_rate": 1.9358288770053475e-06,
"loss": 1.2949,
"step": 181
},
{
"epoch": 0.048702167514048704,
"grad_norm": 4.909692764282227,
"learning_rate": 1.946524064171123e-06,
"loss": 1.1134,
"step": 182
},
{
"epoch": 0.04896976184104897,
"grad_norm": 5.650701522827148,
"learning_rate": 1.9572192513368986e-06,
"loss": 1.2427,
"step": 183
},
{
"epoch": 0.04923735616804924,
"grad_norm": 5.057121753692627,
"learning_rate": 1.9679144385026737e-06,
"loss": 1.1445,
"step": 184
},
{
"epoch": 0.04950495049504951,
"grad_norm": 5.226599216461182,
"learning_rate": 1.9786096256684497e-06,
"loss": 1.1969,
"step": 185
},
{
"epoch": 0.04977254482204977,
"grad_norm": 5.469078540802002,
"learning_rate": 1.989304812834225e-06,
"loss": 1.2631,
"step": 186
},
{
"epoch": 0.05004013914905004,
"grad_norm": 5.3292460441589355,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.2036,
"step": 187
},
{
"epoch": 0.05030773347605031,
"grad_norm": 5.132472991943359,
"learning_rate": 2.010695187165776e-06,
"loss": 1.1629,
"step": 188
},
{
"epoch": 0.050575327803050574,
"grad_norm": 5.4047369956970215,
"learning_rate": 2.021390374331551e-06,
"loss": 1.1455,
"step": 189
},
{
"epoch": 0.050842922130050844,
"grad_norm": 5.656977653503418,
"learning_rate": 2.0320855614973265e-06,
"loss": 1.205,
"step": 190
},
{
"epoch": 0.051110516457051114,
"grad_norm": 5.2581963539123535,
"learning_rate": 2.0427807486631016e-06,
"loss": 1.1071,
"step": 191
},
{
"epoch": 0.05137811078405138,
"grad_norm": 6.479303359985352,
"learning_rate": 2.053475935828877e-06,
"loss": 1.1909,
"step": 192
},
{
"epoch": 0.05164570511105165,
"grad_norm": 5.340463638305664,
"learning_rate": 2.0641711229946527e-06,
"loss": 1.139,
"step": 193
},
{
"epoch": 0.05191329943805191,
"grad_norm": 5.29105281829834,
"learning_rate": 2.074866310160428e-06,
"loss": 1.1934,
"step": 194
},
{
"epoch": 0.05218089376505218,
"grad_norm": 5.378291606903076,
"learning_rate": 2.0855614973262034e-06,
"loss": 1.2956,
"step": 195
},
{
"epoch": 0.05244848809205245,
"grad_norm": 5.540526390075684,
"learning_rate": 2.096256684491979e-06,
"loss": 1.2126,
"step": 196
},
{
"epoch": 0.052716082419052714,
"grad_norm": 4.78275728225708,
"learning_rate": 2.106951871657754e-06,
"loss": 1.126,
"step": 197
},
{
"epoch": 0.052983676746052984,
"grad_norm": 5.544436931610107,
"learning_rate": 2.1176470588235296e-06,
"loss": 1.1113,
"step": 198
},
{
"epoch": 0.053251271073053254,
"grad_norm": 5.038266658782959,
"learning_rate": 2.128342245989305e-06,
"loss": 1.1551,
"step": 199
},
{
"epoch": 0.05351886540005352,
"grad_norm": 5.480011463165283,
"learning_rate": 2.1390374331550802e-06,
"loss": 1.1223,
"step": 200
},
{
"epoch": 0.05378645972705379,
"grad_norm": 5.360974311828613,
"learning_rate": 2.1497326203208558e-06,
"loss": 1.1213,
"step": 201
},
{
"epoch": 0.05405405405405406,
"grad_norm": 4.914999008178711,
"learning_rate": 2.1604278074866313e-06,
"loss": 1.1067,
"step": 202
},
{
"epoch": 0.05432164838105432,
"grad_norm": 5.201199531555176,
"learning_rate": 2.1711229946524064e-06,
"loss": 1.0909,
"step": 203
},
{
"epoch": 0.05458924270805459,
"grad_norm": 5.064680576324463,
"learning_rate": 2.181818181818182e-06,
"loss": 1.1301,
"step": 204
},
{
"epoch": 0.05485683703505485,
"grad_norm": 5.714580535888672,
"learning_rate": 2.1925133689839575e-06,
"loss": 1.3127,
"step": 205
},
{
"epoch": 0.055124431362055124,
"grad_norm": 5.075433731079102,
"learning_rate": 2.2032085561497326e-06,
"loss": 1.2102,
"step": 206
},
{
"epoch": 0.055392025689055394,
"grad_norm": 5.047552108764648,
"learning_rate": 2.213903743315508e-06,
"loss": 1.0519,
"step": 207
},
{
"epoch": 0.05565962001605566,
"grad_norm": 5.115383148193359,
"learning_rate": 2.2245989304812837e-06,
"loss": 1.0859,
"step": 208
},
{
"epoch": 0.05592721434305593,
"grad_norm": 4.877355575561523,
"learning_rate": 2.2352941176470592e-06,
"loss": 0.9786,
"step": 209
},
{
"epoch": 0.0561948086700562,
"grad_norm": 4.9516921043396,
"learning_rate": 2.2459893048128343e-06,
"loss": 1.1192,
"step": 210
},
{
"epoch": 0.05646240299705646,
"grad_norm": 4.995131969451904,
"learning_rate": 2.25668449197861e-06,
"loss": 1.1766,
"step": 211
},
{
"epoch": 0.05672999732405673,
"grad_norm": 5.119645595550537,
"learning_rate": 2.2673796791443854e-06,
"loss": 1.0791,
"step": 212
},
{
"epoch": 0.056997591651057,
"grad_norm": 5.06790828704834,
"learning_rate": 2.2780748663101605e-06,
"loss": 1.2245,
"step": 213
},
{
"epoch": 0.05726518597805726,
"grad_norm": 5.313665390014648,
"learning_rate": 2.288770053475936e-06,
"loss": 1.1242,
"step": 214
},
{
"epoch": 0.05753278030505753,
"grad_norm": 5.126317977905273,
"learning_rate": 2.2994652406417116e-06,
"loss": 1.234,
"step": 215
},
{
"epoch": 0.057800374632057804,
"grad_norm": 5.17064094543457,
"learning_rate": 2.3101604278074867e-06,
"loss": 1.1468,
"step": 216
},
{
"epoch": 0.05806796895905807,
"grad_norm": 5.41604471206665,
"learning_rate": 2.3208556149732623e-06,
"loss": 1.1766,
"step": 217
},
{
"epoch": 0.05833556328605834,
"grad_norm": 5.253145217895508,
"learning_rate": 2.331550802139038e-06,
"loss": 1.2682,
"step": 218
},
{
"epoch": 0.0586031576130586,
"grad_norm": 5.144425868988037,
"learning_rate": 2.342245989304813e-06,
"loss": 1.1138,
"step": 219
},
{
"epoch": 0.05887075194005887,
"grad_norm": 5.531948089599609,
"learning_rate": 2.3529411764705885e-06,
"loss": 1.2583,
"step": 220
},
{
"epoch": 0.05913834626705914,
"grad_norm": 5.511721134185791,
"learning_rate": 2.363636363636364e-06,
"loss": 1.2773,
"step": 221
},
{
"epoch": 0.0594059405940594,
"grad_norm": 5.116267204284668,
"learning_rate": 2.374331550802139e-06,
"loss": 1.2009,
"step": 222
},
{
"epoch": 0.05967353492105967,
"grad_norm": 5.6089630126953125,
"learning_rate": 2.3850267379679146e-06,
"loss": 1.2062,
"step": 223
},
{
"epoch": 0.05994112924805994,
"grad_norm": 4.710421085357666,
"learning_rate": 2.3957219251336898e-06,
"loss": 1.1464,
"step": 224
},
{
"epoch": 0.060208723575060207,
"grad_norm": 5.434922218322754,
"learning_rate": 2.4064171122994653e-06,
"loss": 1.3069,
"step": 225
},
{
"epoch": 0.06047631790206048,
"grad_norm": 4.87479305267334,
"learning_rate": 2.417112299465241e-06,
"loss": 1.1407,
"step": 226
},
{
"epoch": 0.06074391222906075,
"grad_norm": 4.942396640777588,
"learning_rate": 2.427807486631016e-06,
"loss": 1.0771,
"step": 227
},
{
"epoch": 0.06101150655606101,
"grad_norm": 4.659417629241943,
"learning_rate": 2.4385026737967915e-06,
"loss": 1.0804,
"step": 228
},
{
"epoch": 0.06127910088306128,
"grad_norm": 4.819082260131836,
"learning_rate": 2.449197860962567e-06,
"loss": 1.1493,
"step": 229
},
{
"epoch": 0.06154669521006155,
"grad_norm": 5.2966437339782715,
"learning_rate": 2.4598930481283426e-06,
"loss": 1.0462,
"step": 230
},
{
"epoch": 0.06181428953706181,
"grad_norm": 4.857460021972656,
"learning_rate": 2.470588235294118e-06,
"loss": 1.1203,
"step": 231
},
{
"epoch": 0.06208188386406208,
"grad_norm": 5.175891399383545,
"learning_rate": 2.4812834224598932e-06,
"loss": 1.2988,
"step": 232
},
{
"epoch": 0.062349478191062346,
"grad_norm": 5.051662445068359,
"learning_rate": 2.4919786096256688e-06,
"loss": 1.1729,
"step": 233
},
{
"epoch": 0.06261707251806262,
"grad_norm": 5.011804580688477,
"learning_rate": 2.502673796791444e-06,
"loss": 1.1163,
"step": 234
},
{
"epoch": 0.06288466684506289,
"grad_norm": 5.881048202514648,
"learning_rate": 2.5133689839572194e-06,
"loss": 1.1809,
"step": 235
},
{
"epoch": 0.06315226117206316,
"grad_norm": 5.304912567138672,
"learning_rate": 2.5240641711229945e-06,
"loss": 1.1589,
"step": 236
},
{
"epoch": 0.06341985549906343,
"grad_norm": 4.8746256828308105,
"learning_rate": 2.5347593582887705e-06,
"loss": 1.1181,
"step": 237
},
{
"epoch": 0.06368744982606368,
"grad_norm": 5.147508144378662,
"learning_rate": 2.5454545454545456e-06,
"loss": 1.2074,
"step": 238
},
{
"epoch": 0.06395504415306395,
"grad_norm": 5.325894355773926,
"learning_rate": 2.556149732620321e-06,
"loss": 1.0711,
"step": 239
},
{
"epoch": 0.06422263848006422,
"grad_norm": 5.343993186950684,
"learning_rate": 2.5668449197860967e-06,
"loss": 1.1532,
"step": 240
},
{
"epoch": 0.0644902328070645,
"grad_norm": 5.7134270668029785,
"learning_rate": 2.577540106951872e-06,
"loss": 1.2013,
"step": 241
},
{
"epoch": 0.06475782713406476,
"grad_norm": 4.556005001068115,
"learning_rate": 2.5882352941176473e-06,
"loss": 0.9915,
"step": 242
},
{
"epoch": 0.06502542146106502,
"grad_norm": 5.159268856048584,
"learning_rate": 2.5989304812834225e-06,
"loss": 1.1166,
"step": 243
},
{
"epoch": 0.06529301578806529,
"grad_norm": 5.182318210601807,
"learning_rate": 2.6096256684491984e-06,
"loss": 1.1421,
"step": 244
},
{
"epoch": 0.06556061011506556,
"grad_norm": 5.662712574005127,
"learning_rate": 2.6203208556149735e-06,
"loss": 1.2302,
"step": 245
},
{
"epoch": 0.06582820444206583,
"grad_norm": 5.712917804718018,
"learning_rate": 2.631016042780749e-06,
"loss": 1.3106,
"step": 246
},
{
"epoch": 0.0660957987690661,
"grad_norm": 5.176303863525391,
"learning_rate": 2.641711229946524e-06,
"loss": 1.2056,
"step": 247
},
{
"epoch": 0.06636339309606637,
"grad_norm": 5.5327982902526855,
"learning_rate": 2.6524064171122997e-06,
"loss": 1.2833,
"step": 248
},
{
"epoch": 0.06663098742306663,
"grad_norm": 5.480837345123291,
"learning_rate": 2.663101604278075e-06,
"loss": 1.1822,
"step": 249
},
{
"epoch": 0.0668985817500669,
"grad_norm": 5.221635818481445,
"learning_rate": 2.673796791443851e-06,
"loss": 1.1049,
"step": 250
},
{
"epoch": 0.06716617607706717,
"grad_norm": 5.3436689376831055,
"learning_rate": 2.684491978609626e-06,
"loss": 1.1557,
"step": 251
},
{
"epoch": 0.06743377040406744,
"grad_norm": 5.277728080749512,
"learning_rate": 2.6951871657754015e-06,
"loss": 1.1756,
"step": 252
},
{
"epoch": 0.0677013647310677,
"grad_norm": 4.869785308837891,
"learning_rate": 2.7058823529411766e-06,
"loss": 1.0731,
"step": 253
},
{
"epoch": 0.06796895905806796,
"grad_norm": 4.855271339416504,
"learning_rate": 2.716577540106952e-06,
"loss": 1.1427,
"step": 254
},
{
"epoch": 0.06823655338506823,
"grad_norm": 5.062397003173828,
"learning_rate": 2.7272727272727272e-06,
"loss": 1.0324,
"step": 255
},
{
"epoch": 0.0685041477120685,
"grad_norm": 5.3740620613098145,
"learning_rate": 2.7379679144385028e-06,
"loss": 1.2113,
"step": 256
},
{
"epoch": 0.06877174203906877,
"grad_norm": 5.443506717681885,
"learning_rate": 2.748663101604278e-06,
"loss": 1.2146,
"step": 257
},
{
"epoch": 0.06903933636606904,
"grad_norm": 5.629642009735107,
"learning_rate": 2.759358288770054e-06,
"loss": 1.1804,
"step": 258
},
{
"epoch": 0.06930693069306931,
"grad_norm": 4.957646369934082,
"learning_rate": 2.770053475935829e-06,
"loss": 1.0639,
"step": 259
},
{
"epoch": 0.06957452502006957,
"grad_norm": 5.224496841430664,
"learning_rate": 2.7807486631016045e-06,
"loss": 1.0334,
"step": 260
},
{
"epoch": 0.06984211934706984,
"grad_norm": 4.813484191894531,
"learning_rate": 2.79144385026738e-06,
"loss": 1.0519,
"step": 261
},
{
"epoch": 0.07010971367407011,
"grad_norm": 5.483233451843262,
"learning_rate": 2.802139037433155e-06,
"loss": 1.1352,
"step": 262
},
{
"epoch": 0.07037730800107038,
"grad_norm": 5.673671245574951,
"learning_rate": 2.812834224598931e-06,
"loss": 1.1306,
"step": 263
},
{
"epoch": 0.07064490232807065,
"grad_norm": 5.3372955322265625,
"learning_rate": 2.8235294117647062e-06,
"loss": 1.3173,
"step": 264
},
{
"epoch": 0.0709124966550709,
"grad_norm": 5.20352840423584,
"learning_rate": 2.8342245989304818e-06,
"loss": 1.2713,
"step": 265
},
{
"epoch": 0.07118009098207118,
"grad_norm": 5.352963924407959,
"learning_rate": 2.844919786096257e-06,
"loss": 1.1604,
"step": 266
},
{
"epoch": 0.07144768530907145,
"grad_norm": 5.504600524902344,
"learning_rate": 2.8556149732620324e-06,
"loss": 1.1683,
"step": 267
},
{
"epoch": 0.07171527963607172,
"grad_norm": 4.989468097686768,
"learning_rate": 2.8663101604278075e-06,
"loss": 1.1422,
"step": 268
},
{
"epoch": 0.07198287396307199,
"grad_norm": 5.034707069396973,
"learning_rate": 2.8770053475935835e-06,
"loss": 1.1341,
"step": 269
},
{
"epoch": 0.07225046829007226,
"grad_norm": 5.23364782333374,
"learning_rate": 2.8877005347593586e-06,
"loss": 1.2117,
"step": 270
},
{
"epoch": 0.07251806261707251,
"grad_norm": 4.977064609527588,
"learning_rate": 2.898395721925134e-06,
"loss": 1.1179,
"step": 271
},
{
"epoch": 0.07278565694407278,
"grad_norm": 5.462220668792725,
"learning_rate": 2.9090909090909093e-06,
"loss": 1.1499,
"step": 272
},
{
"epoch": 0.07305325127107305,
"grad_norm": 4.852994918823242,
"learning_rate": 2.919786096256685e-06,
"loss": 1.0156,
"step": 273
},
{
"epoch": 0.07332084559807332,
"grad_norm": 5.298532485961914,
"learning_rate": 2.93048128342246e-06,
"loss": 1.2225,
"step": 274
},
{
"epoch": 0.07358843992507359,
"grad_norm": 4.783885955810547,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.1382,
"step": 275
},
{
"epoch": 0.07385603425207386,
"grad_norm": 5.855717182159424,
"learning_rate": 2.9518716577540106e-06,
"loss": 1.2779,
"step": 276
},
{
"epoch": 0.07412362857907412,
"grad_norm": 4.655195713043213,
"learning_rate": 2.9625668449197865e-06,
"loss": 1.0411,
"step": 277
},
{
"epoch": 0.07439122290607439,
"grad_norm": 5.513675689697266,
"learning_rate": 2.9732620320855617e-06,
"loss": 1.2022,
"step": 278
},
{
"epoch": 0.07465881723307466,
"grad_norm": 4.99501371383667,
"learning_rate": 2.983957219251337e-06,
"loss": 1.2175,
"step": 279
},
{
"epoch": 0.07492641156007493,
"grad_norm": 5.31617546081543,
"learning_rate": 2.9946524064171123e-06,
"loss": 1.2313,
"step": 280
},
{
"epoch": 0.0751940058870752,
"grad_norm": 4.424401760101318,
"learning_rate": 3.005347593582888e-06,
"loss": 1.0215,
"step": 281
},
{
"epoch": 0.07546160021407546,
"grad_norm": 6.041825771331787,
"learning_rate": 3.016042780748663e-06,
"loss": 1.1797,
"step": 282
},
{
"epoch": 0.07572919454107573,
"grad_norm": 5.0878424644470215,
"learning_rate": 3.026737967914439e-06,
"loss": 1.117,
"step": 283
},
{
"epoch": 0.075996788868076,
"grad_norm": 5.10042667388916,
"learning_rate": 3.0374331550802145e-06,
"loss": 1.1119,
"step": 284
},
{
"epoch": 0.07626438319507627,
"grad_norm": 4.863668918609619,
"learning_rate": 3.0481283422459896e-06,
"loss": 1.1497,
"step": 285
},
{
"epoch": 0.07653197752207654,
"grad_norm": 5.0864739418029785,
"learning_rate": 3.058823529411765e-06,
"loss": 1.1288,
"step": 286
},
{
"epoch": 0.0767995718490768,
"grad_norm": 5.120104789733887,
"learning_rate": 3.0695187165775402e-06,
"loss": 1.111,
"step": 287
},
{
"epoch": 0.07706716617607706,
"grad_norm": 4.764978885650635,
"learning_rate": 3.0802139037433158e-06,
"loss": 1.1646,
"step": 288
},
{
"epoch": 0.07733476050307733,
"grad_norm": 4.789579391479492,
"learning_rate": 3.090909090909091e-06,
"loss": 1.1771,
"step": 289
},
{
"epoch": 0.0776023548300776,
"grad_norm": 5.163838863372803,
"learning_rate": 3.101604278074867e-06,
"loss": 1.1569,
"step": 290
},
{
"epoch": 0.07786994915707787,
"grad_norm": 5.361174583435059,
"learning_rate": 3.112299465240642e-06,
"loss": 1.0451,
"step": 291
},
{
"epoch": 0.07813754348407814,
"grad_norm": 5.318236827850342,
"learning_rate": 3.1229946524064175e-06,
"loss": 1.1614,
"step": 292
},
{
"epoch": 0.0784051378110784,
"grad_norm": 5.045052528381348,
"learning_rate": 3.1336898395721926e-06,
"loss": 1.2178,
"step": 293
},
{
"epoch": 0.07867273213807867,
"grad_norm": 5.2366414070129395,
"learning_rate": 3.144385026737968e-06,
"loss": 1.1697,
"step": 294
},
{
"epoch": 0.07894032646507894,
"grad_norm": 4.973846912384033,
"learning_rate": 3.1550802139037433e-06,
"loss": 1.1503,
"step": 295
},
{
"epoch": 0.07920792079207921,
"grad_norm": 5.384598731994629,
"learning_rate": 3.1657754010695192e-06,
"loss": 1.0959,
"step": 296
},
{
"epoch": 0.07947551511907948,
"grad_norm": 5.26463508605957,
"learning_rate": 3.1764705882352943e-06,
"loss": 1.122,
"step": 297
},
{
"epoch": 0.07974310944607975,
"grad_norm": 5.0575737953186035,
"learning_rate": 3.18716577540107e-06,
"loss": 1.226,
"step": 298
},
{
"epoch": 0.08001070377308,
"grad_norm": 5.238304615020752,
"learning_rate": 3.197860962566845e-06,
"loss": 1.2016,
"step": 299
},
{
"epoch": 0.08027829810008028,
"grad_norm": 5.158024787902832,
"learning_rate": 3.2085561497326205e-06,
"loss": 1.174,
"step": 300
},
{
"epoch": 0.08054589242708055,
"grad_norm": 5.37693452835083,
"learning_rate": 3.2192513368983957e-06,
"loss": 1.2009,
"step": 301
},
{
"epoch": 0.08081348675408082,
"grad_norm": 5.042771816253662,
"learning_rate": 3.2299465240641716e-06,
"loss": 1.0364,
"step": 302
},
{
"epoch": 0.08108108108108109,
"grad_norm": 5.316686153411865,
"learning_rate": 3.2406417112299467e-06,
"loss": 1.1912,
"step": 303
},
{
"epoch": 0.08134867540808134,
"grad_norm": 4.9131011962890625,
"learning_rate": 3.2513368983957223e-06,
"loss": 1.1555,
"step": 304
},
{
"epoch": 0.08161626973508161,
"grad_norm": 5.539770126342773,
"learning_rate": 3.262032085561498e-06,
"loss": 1.3409,
"step": 305
},
{
"epoch": 0.08188386406208188,
"grad_norm": 4.658355712890625,
"learning_rate": 3.272727272727273e-06,
"loss": 1.2177,
"step": 306
},
{
"epoch": 0.08215145838908215,
"grad_norm": 4.792846202850342,
"learning_rate": 3.2834224598930485e-06,
"loss": 0.9993,
"step": 307
},
{
"epoch": 0.08241905271608242,
"grad_norm": 4.886536598205566,
"learning_rate": 3.2941176470588236e-06,
"loss": 1.0884,
"step": 308
},
{
"epoch": 0.08268664704308269,
"grad_norm": 4.976652145385742,
"learning_rate": 3.3048128342245995e-06,
"loss": 1.1197,
"step": 309
},
{
"epoch": 0.08295424137008295,
"grad_norm": 4.819093704223633,
"learning_rate": 3.3155080213903747e-06,
"loss": 1.1089,
"step": 310
},
{
"epoch": 0.08322183569708322,
"grad_norm": 5.2282843589782715,
"learning_rate": 3.32620320855615e-06,
"loss": 1.1593,
"step": 311
},
{
"epoch": 0.08348943002408349,
"grad_norm": 5.575779438018799,
"learning_rate": 3.3368983957219253e-06,
"loss": 1.1697,
"step": 312
},
{
"epoch": 0.08375702435108376,
"grad_norm": 5.0405354499816895,
"learning_rate": 3.347593582887701e-06,
"loss": 1.2207,
"step": 313
},
{
"epoch": 0.08402461867808403,
"grad_norm": 4.537448406219482,
"learning_rate": 3.358288770053476e-06,
"loss": 0.9928,
"step": 314
},
{
"epoch": 0.0842922130050843,
"grad_norm": 4.672475337982178,
"learning_rate": 3.368983957219252e-06,
"loss": 1.0073,
"step": 315
},
{
"epoch": 0.08455980733208456,
"grad_norm": 5.076086044311523,
"learning_rate": 3.379679144385027e-06,
"loss": 1.1783,
"step": 316
},
{
"epoch": 0.08482740165908483,
"grad_norm": 4.888884544372559,
"learning_rate": 3.3903743315508026e-06,
"loss": 1.1204,
"step": 317
},
{
"epoch": 0.0850949959860851,
"grad_norm": 5.4211554527282715,
"learning_rate": 3.4010695187165777e-06,
"loss": 1.2754,
"step": 318
},
{
"epoch": 0.08536259031308537,
"grad_norm": 5.268496513366699,
"learning_rate": 3.4117647058823532e-06,
"loss": 1.1334,
"step": 319
},
{
"epoch": 0.08563018464008564,
"grad_norm": 4.835329532623291,
"learning_rate": 3.4224598930481284e-06,
"loss": 1.1577,
"step": 320
},
{
"epoch": 0.08589777896708589,
"grad_norm": 4.869121074676514,
"learning_rate": 3.433155080213904e-06,
"loss": 1.173,
"step": 321
},
{
"epoch": 0.08616537329408616,
"grad_norm": 5.070735931396484,
"learning_rate": 3.443850267379679e-06,
"loss": 1.1476,
"step": 322
},
{
"epoch": 0.08643296762108643,
"grad_norm": 5.45928430557251,
"learning_rate": 3.454545454545455e-06,
"loss": 1.1205,
"step": 323
},
{
"epoch": 0.0867005619480867,
"grad_norm": 5.189511299133301,
"learning_rate": 3.46524064171123e-06,
"loss": 1.3158,
"step": 324
},
{
"epoch": 0.08696815627508697,
"grad_norm": 4.831698417663574,
"learning_rate": 3.4759358288770056e-06,
"loss": 1.0329,
"step": 325
},
{
"epoch": 0.08723575060208724,
"grad_norm": 4.864784240722656,
"learning_rate": 3.4866310160427807e-06,
"loss": 1.0805,
"step": 326
},
{
"epoch": 0.0875033449290875,
"grad_norm": 4.9103240966796875,
"learning_rate": 3.4973262032085563e-06,
"loss": 1.1277,
"step": 327
},
{
"epoch": 0.08777093925608777,
"grad_norm": 4.773064136505127,
"learning_rate": 3.5080213903743322e-06,
"loss": 1.0463,
"step": 328
},
{
"epoch": 0.08803853358308804,
"grad_norm": 5.021261692047119,
"learning_rate": 3.5187165775401074e-06,
"loss": 1.0795,
"step": 329
},
{
"epoch": 0.08830612791008831,
"grad_norm": 5.683427810668945,
"learning_rate": 3.529411764705883e-06,
"loss": 1.094,
"step": 330
},
{
"epoch": 0.08857372223708858,
"grad_norm": 4.894428253173828,
"learning_rate": 3.540106951871658e-06,
"loss": 1.2105,
"step": 331
},
{
"epoch": 0.08884131656408883,
"grad_norm": 5.363523006439209,
"learning_rate": 3.5508021390374335e-06,
"loss": 1.3021,
"step": 332
},
{
"epoch": 0.0891089108910891,
"grad_norm": 4.995115756988525,
"learning_rate": 3.5614973262032087e-06,
"loss": 1.2159,
"step": 333
},
{
"epoch": 0.08937650521808937,
"grad_norm": 4.57165002822876,
"learning_rate": 3.5721925133689846e-06,
"loss": 1.1042,
"step": 334
},
{
"epoch": 0.08964409954508964,
"grad_norm": 4.659427165985107,
"learning_rate": 3.5828877005347597e-06,
"loss": 1.0984,
"step": 335
},
{
"epoch": 0.08991169387208992,
"grad_norm": 4.8430986404418945,
"learning_rate": 3.5935828877005353e-06,
"loss": 1.0503,
"step": 336
},
{
"epoch": 0.09017928819909019,
"grad_norm": 5.450077056884766,
"learning_rate": 3.6042780748663104e-06,
"loss": 1.2578,
"step": 337
},
{
"epoch": 0.09044688252609044,
"grad_norm": 5.203562259674072,
"learning_rate": 3.614973262032086e-06,
"loss": 1.1925,
"step": 338
},
{
"epoch": 0.09071447685309071,
"grad_norm": 5.250705718994141,
"learning_rate": 3.625668449197861e-06,
"loss": 1.1434,
"step": 339
},
{
"epoch": 0.09098207118009098,
"grad_norm": 5.062129020690918,
"learning_rate": 3.6363636363636366e-06,
"loss": 1.1685,
"step": 340
},
{
"epoch": 0.09124966550709125,
"grad_norm": 5.255050182342529,
"learning_rate": 3.6470588235294117e-06,
"loss": 1.1763,
"step": 341
},
{
"epoch": 0.09151725983409152,
"grad_norm": 5.397471904754639,
"learning_rate": 3.6577540106951877e-06,
"loss": 1.1878,
"step": 342
},
{
"epoch": 0.09178485416109179,
"grad_norm": 4.998739242553711,
"learning_rate": 3.6684491978609628e-06,
"loss": 1.2435,
"step": 343
},
{
"epoch": 0.09205244848809205,
"grad_norm": 5.0231475830078125,
"learning_rate": 3.6791443850267383e-06,
"loss": 1.201,
"step": 344
},
{
"epoch": 0.09232004281509232,
"grad_norm": 4.59348201751709,
"learning_rate": 3.6898395721925134e-06,
"loss": 1.1119,
"step": 345
},
{
"epoch": 0.09258763714209259,
"grad_norm": 5.16015100479126,
"learning_rate": 3.700534759358289e-06,
"loss": 1.3093,
"step": 346
},
{
"epoch": 0.09285523146909286,
"grad_norm": 4.382453441619873,
"learning_rate": 3.711229946524064e-06,
"loss": 1.0883,
"step": 347
},
{
"epoch": 0.09312282579609313,
"grad_norm": 4.668209075927734,
"learning_rate": 3.72192513368984e-06,
"loss": 1.0241,
"step": 348
},
{
"epoch": 0.09339042012309338,
"grad_norm": 4.655612945556641,
"learning_rate": 3.7326203208556156e-06,
"loss": 1.0818,
"step": 349
},
{
"epoch": 0.09365801445009365,
"grad_norm": 4.565972805023193,
"learning_rate": 3.7433155080213907e-06,
"loss": 1.1419,
"step": 350
},
{
"epoch": 0.09392560877709392,
"grad_norm": 5.171647548675537,
"learning_rate": 3.7540106951871662e-06,
"loss": 1.2268,
"step": 351
},
{
"epoch": 0.0941932031040942,
"grad_norm": 4.986495018005371,
"learning_rate": 3.7647058823529414e-06,
"loss": 1.1435,
"step": 352
},
{
"epoch": 0.09446079743109446,
"grad_norm": 5.132668495178223,
"learning_rate": 3.775401069518717e-06,
"loss": 1.1817,
"step": 353
},
{
"epoch": 0.09472839175809473,
"grad_norm": 4.863659858703613,
"learning_rate": 3.786096256684492e-06,
"loss": 1.1636,
"step": 354
},
{
"epoch": 0.09499598608509499,
"grad_norm": 4.890793323516846,
"learning_rate": 3.796791443850268e-06,
"loss": 1.099,
"step": 355
},
{
"epoch": 0.09526358041209526,
"grad_norm": 4.9269208908081055,
"learning_rate": 3.807486631016043e-06,
"loss": 0.9963,
"step": 356
},
{
"epoch": 0.09553117473909553,
"grad_norm": 5.276472091674805,
"learning_rate": 3.818181818181819e-06,
"loss": 1.1995,
"step": 357
},
{
"epoch": 0.0957987690660958,
"grad_norm": 5.187767505645752,
"learning_rate": 3.828877005347594e-06,
"loss": 0.9875,
"step": 358
},
{
"epoch": 0.09606636339309607,
"grad_norm": 4.884994983673096,
"learning_rate": 3.839572192513369e-06,
"loss": 1.1166,
"step": 359
},
{
"epoch": 0.09633395772009633,
"grad_norm": 4.8466715812683105,
"learning_rate": 3.850267379679145e-06,
"loss": 1.1187,
"step": 360
},
{
"epoch": 0.0966015520470966,
"grad_norm": 4.6448655128479,
"learning_rate": 3.86096256684492e-06,
"loss": 1.2536,
"step": 361
},
{
"epoch": 0.09686914637409687,
"grad_norm": 4.447425365447998,
"learning_rate": 3.871657754010695e-06,
"loss": 1.0654,
"step": 362
},
{
"epoch": 0.09713674070109714,
"grad_norm": 4.957208633422852,
"learning_rate": 3.882352941176471e-06,
"loss": 1.2859,
"step": 363
},
{
"epoch": 0.09740433502809741,
"grad_norm": 4.9123735427856445,
"learning_rate": 3.893048128342246e-06,
"loss": 1.1159,
"step": 364
},
{
"epoch": 0.09767192935509768,
"grad_norm": 5.830307960510254,
"learning_rate": 3.903743315508022e-06,
"loss": 1.05,
"step": 365
},
{
"epoch": 0.09793952368209793,
"grad_norm": 4.788443088531494,
"learning_rate": 3.914438502673797e-06,
"loss": 1.1427,
"step": 366
},
{
"epoch": 0.0982071180090982,
"grad_norm": 4.874475479125977,
"learning_rate": 3.925133689839573e-06,
"loss": 1.1247,
"step": 367
},
{
"epoch": 0.09847471233609847,
"grad_norm": 5.284448623657227,
"learning_rate": 3.9358288770053474e-06,
"loss": 1.1494,
"step": 368
},
{
"epoch": 0.09874230666309874,
"grad_norm": 5.056131839752197,
"learning_rate": 3.946524064171123e-06,
"loss": 1.1843,
"step": 369
},
{
"epoch": 0.09900990099009901,
"grad_norm": 4.933049201965332,
"learning_rate": 3.957219251336899e-06,
"loss": 1.0876,
"step": 370
},
{
"epoch": 0.09927749531709927,
"grad_norm": 5.440591335296631,
"learning_rate": 3.967914438502674e-06,
"loss": 1.0903,
"step": 371
},
{
"epoch": 0.09954508964409954,
"grad_norm": 5.242448806762695,
"learning_rate": 3.97860962566845e-06,
"loss": 1.1539,
"step": 372
},
{
"epoch": 0.09981268397109981,
"grad_norm": 5.424898147583008,
"learning_rate": 3.989304812834225e-06,
"loss": 1.1798,
"step": 373
},
{
"epoch": 0.10008027829810008,
"grad_norm": 5.486216068267822,
"learning_rate": 4.000000000000001e-06,
"loss": 1.195,
"step": 374
},
{
"epoch": 0.10034787262510035,
"grad_norm": 4.9611029624938965,
"learning_rate": 4.010695187165775e-06,
"loss": 1.1332,
"step": 375
},
{
"epoch": 0.10061546695210062,
"grad_norm": 5.002806663513184,
"learning_rate": 4.021390374331552e-06,
"loss": 1.0903,
"step": 376
},
{
"epoch": 0.10088306127910088,
"grad_norm": 5.656718730926514,
"learning_rate": 4.0320855614973264e-06,
"loss": 1.2747,
"step": 377
},
{
"epoch": 0.10115065560610115,
"grad_norm": 5.98917293548584,
"learning_rate": 4.042780748663102e-06,
"loss": 1.1138,
"step": 378
},
{
"epoch": 0.10141824993310142,
"grad_norm": 4.683370590209961,
"learning_rate": 4.0534759358288775e-06,
"loss": 1.1109,
"step": 379
},
{
"epoch": 0.10168584426010169,
"grad_norm": 5.65017557144165,
"learning_rate": 4.064171122994653e-06,
"loss": 1.3569,
"step": 380
},
{
"epoch": 0.10195343858710196,
"grad_norm": 5.555070400238037,
"learning_rate": 4.074866310160428e-06,
"loss": 1.4389,
"step": 381
},
{
"epoch": 0.10222103291410223,
"grad_norm": 4.874694347381592,
"learning_rate": 4.085561497326203e-06,
"loss": 1.0894,
"step": 382
},
{
"epoch": 0.10248862724110248,
"grad_norm": 4.907220840454102,
"learning_rate": 4.096256684491979e-06,
"loss": 1.1923,
"step": 383
},
{
"epoch": 0.10275622156810275,
"grad_norm": 4.953684329986572,
"learning_rate": 4.106951871657754e-06,
"loss": 1.3313,
"step": 384
},
{
"epoch": 0.10302381589510302,
"grad_norm": 4.817149639129639,
"learning_rate": 4.11764705882353e-06,
"loss": 1.139,
"step": 385
},
{
"epoch": 0.1032914102221033,
"grad_norm": 5.111240386962891,
"learning_rate": 4.1283422459893054e-06,
"loss": 1.1787,
"step": 386
},
{
"epoch": 0.10355900454910356,
"grad_norm": 4.8276519775390625,
"learning_rate": 4.13903743315508e-06,
"loss": 1.1326,
"step": 387
},
{
"epoch": 0.10382659887610382,
"grad_norm": 4.992558479309082,
"learning_rate": 4.149732620320856e-06,
"loss": 1.2047,
"step": 388
},
{
"epoch": 0.10409419320310409,
"grad_norm": 4.973186016082764,
"learning_rate": 4.160427807486631e-06,
"loss": 1.2083,
"step": 389
},
{
"epoch": 0.10436178753010436,
"grad_norm": 5.174978733062744,
"learning_rate": 4.171122994652407e-06,
"loss": 1.1953,
"step": 390
},
{
"epoch": 0.10462938185710463,
"grad_norm": 5.181015968322754,
"learning_rate": 4.181818181818182e-06,
"loss": 1.1337,
"step": 391
},
{
"epoch": 0.1048969761841049,
"grad_norm": 5.914229869842529,
"learning_rate": 4.192513368983958e-06,
"loss": 1.3395,
"step": 392
},
{
"epoch": 0.10516457051110517,
"grad_norm": 5.254291534423828,
"learning_rate": 4.203208556149733e-06,
"loss": 1.2202,
"step": 393
},
{
"epoch": 0.10543216483810543,
"grad_norm": 5.055797100067139,
"learning_rate": 4.213903743315508e-06,
"loss": 1.1474,
"step": 394
},
{
"epoch": 0.1056997591651057,
"grad_norm": 4.354243755340576,
"learning_rate": 4.224598930481284e-06,
"loss": 0.958,
"step": 395
},
{
"epoch": 0.10596735349210597,
"grad_norm": 4.838346004486084,
"learning_rate": 4.235294117647059e-06,
"loss": 1.1825,
"step": 396
},
{
"epoch": 0.10623494781910624,
"grad_norm": 4.711790561676025,
"learning_rate": 4.245989304812835e-06,
"loss": 1.196,
"step": 397
},
{
"epoch": 0.10650254214610651,
"grad_norm": 4.71934175491333,
"learning_rate": 4.25668449197861e-06,
"loss": 1.0371,
"step": 398
},
{
"epoch": 0.10677013647310676,
"grad_norm": 4.441000938415527,
"learning_rate": 4.267379679144386e-06,
"loss": 1.1646,
"step": 399
},
{
"epoch": 0.10703773080010703,
"grad_norm": 5.28547477722168,
"learning_rate": 4.2780748663101604e-06,
"loss": 1.2089,
"step": 400
},
{
"epoch": 0.1073053251271073,
"grad_norm": 4.684313774108887,
"learning_rate": 4.288770053475936e-06,
"loss": 1.1529,
"step": 401
},
{
"epoch": 0.10757291945410757,
"grad_norm": 4.92221212387085,
"learning_rate": 4.2994652406417115e-06,
"loss": 1.1109,
"step": 402
},
{
"epoch": 0.10784051378110784,
"grad_norm": 4.630762100219727,
"learning_rate": 4.310160427807487e-06,
"loss": 1.2035,
"step": 403
},
{
"epoch": 0.10810810810810811,
"grad_norm": 5.12864875793457,
"learning_rate": 4.320855614973263e-06,
"loss": 1.2126,
"step": 404
},
{
"epoch": 0.10837570243510837,
"grad_norm": 5.339291095733643,
"learning_rate": 4.331550802139038e-06,
"loss": 1.2983,
"step": 405
},
{
"epoch": 0.10864329676210864,
"grad_norm": 5.000173568725586,
"learning_rate": 4.342245989304813e-06,
"loss": 1.2949,
"step": 406
},
{
"epoch": 0.10891089108910891,
"grad_norm": 5.139687538146973,
"learning_rate": 4.352941176470588e-06,
"loss": 1.1753,
"step": 407
},
{
"epoch": 0.10917848541610918,
"grad_norm": 5.229654788970947,
"learning_rate": 4.363636363636364e-06,
"loss": 1.1499,
"step": 408
},
{
"epoch": 0.10944607974310945,
"grad_norm": 4.853805065155029,
"learning_rate": 4.3743315508021394e-06,
"loss": 1.2261,
"step": 409
},
{
"epoch": 0.1097136740701097,
"grad_norm": 5.141970157623291,
"learning_rate": 4.385026737967915e-06,
"loss": 1.2721,
"step": 410
},
{
"epoch": 0.10998126839710998,
"grad_norm": 5.1554436683654785,
"learning_rate": 4.3957219251336905e-06,
"loss": 1.2238,
"step": 411
},
{
"epoch": 0.11024886272411025,
"grad_norm": 5.058832168579102,
"learning_rate": 4.406417112299465e-06,
"loss": 1.2816,
"step": 412
},
{
"epoch": 0.11051645705111052,
"grad_norm": 4.609223365783691,
"learning_rate": 4.417112299465241e-06,
"loss": 1.1348,
"step": 413
},
{
"epoch": 0.11078405137811079,
"grad_norm": 5.325019359588623,
"learning_rate": 4.427807486631016e-06,
"loss": 1.1162,
"step": 414
},
{
"epoch": 0.11105164570511106,
"grad_norm": 4.600208759307861,
"learning_rate": 4.438502673796792e-06,
"loss": 1.0443,
"step": 415
},
{
"epoch": 0.11131924003211131,
"grad_norm": 5.451298236846924,
"learning_rate": 4.449197860962567e-06,
"loss": 1.197,
"step": 416
},
{
"epoch": 0.11158683435911158,
"grad_norm": 5.0797505378723145,
"learning_rate": 4.459893048128343e-06,
"loss": 1.2068,
"step": 417
},
{
"epoch": 0.11185442868611185,
"grad_norm": 5.17997932434082,
"learning_rate": 4.4705882352941184e-06,
"loss": 1.137,
"step": 418
},
{
"epoch": 0.11212202301311212,
"grad_norm": 5.312300682067871,
"learning_rate": 4.481283422459893e-06,
"loss": 1.176,
"step": 419
},
{
"epoch": 0.1123896173401124,
"grad_norm": 4.956272602081299,
"learning_rate": 4.491978609625669e-06,
"loss": 1.1349,
"step": 420
},
{
"epoch": 0.11265721166711266,
"grad_norm": 4.7235517501831055,
"learning_rate": 4.502673796791444e-06,
"loss": 1.1691,
"step": 421
},
{
"epoch": 0.11292480599411292,
"grad_norm": 4.887537956237793,
"learning_rate": 4.51336898395722e-06,
"loss": 1.1562,
"step": 422
},
{
"epoch": 0.11319240032111319,
"grad_norm": 4.688408851623535,
"learning_rate": 4.524064171122995e-06,
"loss": 1.1013,
"step": 423
},
{
"epoch": 0.11345999464811346,
"grad_norm": 5.215854644775391,
"learning_rate": 4.534759358288771e-06,
"loss": 1.1452,
"step": 424
},
{
"epoch": 0.11372758897511373,
"grad_norm": 5.092518329620361,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.274,
"step": 425
},
{
"epoch": 0.113995183302114,
"grad_norm": 4.888270854949951,
"learning_rate": 4.556149732620321e-06,
"loss": 1.1974,
"step": 426
},
{
"epoch": 0.11426277762911426,
"grad_norm": 5.114696979522705,
"learning_rate": 4.566844919786097e-06,
"loss": 1.1434,
"step": 427
},
{
"epoch": 0.11453037195611453,
"grad_norm": 5.443094730377197,
"learning_rate": 4.577540106951872e-06,
"loss": 1.3022,
"step": 428
},
{
"epoch": 0.1147979662831148,
"grad_norm": 4.617439270019531,
"learning_rate": 4.588235294117647e-06,
"loss": 1.1046,
"step": 429
},
{
"epoch": 0.11506556061011507,
"grad_norm": 5.151831150054932,
"learning_rate": 4.598930481283423e-06,
"loss": 1.1525,
"step": 430
},
{
"epoch": 0.11533315493711534,
"grad_norm": 4.646505355834961,
"learning_rate": 4.609625668449198e-06,
"loss": 1.0613,
"step": 431
},
{
"epoch": 0.11560074926411561,
"grad_norm": 4.780506610870361,
"learning_rate": 4.6203208556149734e-06,
"loss": 1.0874,
"step": 432
},
{
"epoch": 0.11586834359111586,
"grad_norm": 4.499149322509766,
"learning_rate": 4.631016042780749e-06,
"loss": 1.0649,
"step": 433
},
{
"epoch": 0.11613593791811613,
"grad_norm": 5.027551651000977,
"learning_rate": 4.6417112299465245e-06,
"loss": 1.1688,
"step": 434
},
{
"epoch": 0.1164035322451164,
"grad_norm": 4.565614223480225,
"learning_rate": 4.6524064171123e-06,
"loss": 1.1468,
"step": 435
},
{
"epoch": 0.11667112657211667,
"grad_norm": 4.508991241455078,
"learning_rate": 4.663101604278076e-06,
"loss": 1.1169,
"step": 436
},
{
"epoch": 0.11693872089911694,
"grad_norm": 4.733094692230225,
"learning_rate": 4.673796791443851e-06,
"loss": 1.145,
"step": 437
},
{
"epoch": 0.1172063152261172,
"grad_norm": 4.995217323303223,
"learning_rate": 4.684491978609626e-06,
"loss": 1.1946,
"step": 438
},
{
"epoch": 0.11747390955311747,
"grad_norm": 4.931241512298584,
"learning_rate": 4.695187165775401e-06,
"loss": 1.0798,
"step": 439
},
{
"epoch": 0.11774150388011774,
"grad_norm": 4.939948558807373,
"learning_rate": 4.705882352941177e-06,
"loss": 1.3138,
"step": 440
},
{
"epoch": 0.11800909820711801,
"grad_norm": 5.553315162658691,
"learning_rate": 4.7165775401069524e-06,
"loss": 1.2837,
"step": 441
},
{
"epoch": 0.11827669253411828,
"grad_norm": 5.024171829223633,
"learning_rate": 4.727272727272728e-06,
"loss": 1.0769,
"step": 442
},
{
"epoch": 0.11854428686111855,
"grad_norm": 4.998294353485107,
"learning_rate": 4.7379679144385035e-06,
"loss": 1.2191,
"step": 443
},
{
"epoch": 0.1188118811881188,
"grad_norm": 5.217951774597168,
"learning_rate": 4.748663101604278e-06,
"loss": 1.3233,
"step": 444
},
{
"epoch": 0.11907947551511908,
"grad_norm": 4.932075500488281,
"learning_rate": 4.759358288770054e-06,
"loss": 1.2598,
"step": 445
},
{
"epoch": 0.11934706984211935,
"grad_norm": 4.477123260498047,
"learning_rate": 4.770053475935829e-06,
"loss": 1.0329,
"step": 446
},
{
"epoch": 0.11961466416911962,
"grad_norm": 4.998135566711426,
"learning_rate": 4.780748663101605e-06,
"loss": 1.1807,
"step": 447
},
{
"epoch": 0.11988225849611989,
"grad_norm": 5.117345333099365,
"learning_rate": 4.7914438502673795e-06,
"loss": 1.1254,
"step": 448
},
{
"epoch": 0.12014985282312014,
"grad_norm": 4.747807025909424,
"learning_rate": 4.802139037433156e-06,
"loss": 1.0701,
"step": 449
},
{
"epoch": 0.12041744715012041,
"grad_norm": 4.674474716186523,
"learning_rate": 4.812834224598931e-06,
"loss": 1.2146,
"step": 450
},
{
"epoch": 0.12068504147712068,
"grad_norm": 5.200889587402344,
"learning_rate": 4.823529411764706e-06,
"loss": 1.1634,
"step": 451
},
{
"epoch": 0.12095263580412095,
"grad_norm": 4.857826232910156,
"learning_rate": 4.834224598930482e-06,
"loss": 1.13,
"step": 452
},
{
"epoch": 0.12122023013112122,
"grad_norm": 4.851617336273193,
"learning_rate": 4.844919786096257e-06,
"loss": 1.1596,
"step": 453
},
{
"epoch": 0.1214878244581215,
"grad_norm": 4.770223617553711,
"learning_rate": 4.855614973262032e-06,
"loss": 1.1326,
"step": 454
},
{
"epoch": 0.12175541878512175,
"grad_norm": 5.090690612792969,
"learning_rate": 4.866310160427808e-06,
"loss": 1.1485,
"step": 455
},
{
"epoch": 0.12202301311212202,
"grad_norm": 4.741364002227783,
"learning_rate": 4.877005347593583e-06,
"loss": 1.2239,
"step": 456
},
{
"epoch": 0.12229060743912229,
"grad_norm": 4.698870658874512,
"learning_rate": 4.8877005347593585e-06,
"loss": 1.1703,
"step": 457
},
{
"epoch": 0.12255820176612256,
"grad_norm": 5.272980213165283,
"learning_rate": 4.898395721925134e-06,
"loss": 1.1262,
"step": 458
},
{
"epoch": 0.12282579609312283,
"grad_norm": 4.762371063232422,
"learning_rate": 4.90909090909091e-06,
"loss": 1.1365,
"step": 459
},
{
"epoch": 0.1230933904201231,
"grad_norm": 4.594496726989746,
"learning_rate": 4.919786096256685e-06,
"loss": 1.1921,
"step": 460
},
{
"epoch": 0.12336098474712336,
"grad_norm": 4.638429164886475,
"learning_rate": 4.93048128342246e-06,
"loss": 1.0768,
"step": 461
},
{
"epoch": 0.12362857907412363,
"grad_norm": 5.253578186035156,
"learning_rate": 4.941176470588236e-06,
"loss": 1.2366,
"step": 462
},
{
"epoch": 0.1238961734011239,
"grad_norm": 5.03195858001709,
"learning_rate": 4.951871657754011e-06,
"loss": 1.1402,
"step": 463
},
{
"epoch": 0.12416376772812417,
"grad_norm": 4.741814136505127,
"learning_rate": 4.9625668449197864e-06,
"loss": 1.038,
"step": 464
},
{
"epoch": 0.12443136205512444,
"grad_norm": 5.368718147277832,
"learning_rate": 4.973262032085562e-06,
"loss": 1.2952,
"step": 465
},
{
"epoch": 0.12469895638212469,
"grad_norm": 4.668884754180908,
"learning_rate": 4.9839572192513375e-06,
"loss": 1.1318,
"step": 466
},
{
"epoch": 0.12496655070912496,
"grad_norm": 5.185303688049316,
"learning_rate": 4.994652406417112e-06,
"loss": 1.1945,
"step": 467
},
{
"epoch": 0.12523414503612523,
"grad_norm": 4.929427623748779,
"learning_rate": 5.005347593582888e-06,
"loss": 1.2231,
"step": 468
},
{
"epoch": 0.1255017393631255,
"grad_norm": 4.767603397369385,
"learning_rate": 5.016042780748663e-06,
"loss": 1.2551,
"step": 469
},
{
"epoch": 0.12576933369012577,
"grad_norm": 5.308717250823975,
"learning_rate": 5.026737967914439e-06,
"loss": 1.1529,
"step": 470
},
{
"epoch": 0.12603692801712604,
"grad_norm": 4.83845329284668,
"learning_rate": 5.037433155080214e-06,
"loss": 1.2712,
"step": 471
},
{
"epoch": 0.1263045223441263,
"grad_norm": 4.922050476074219,
"learning_rate": 5.048128342245989e-06,
"loss": 1.0939,
"step": 472
},
{
"epoch": 0.12657211667112658,
"grad_norm": 4.888375282287598,
"learning_rate": 5.058823529411765e-06,
"loss": 1.1761,
"step": 473
},
{
"epoch": 0.12683971099812685,
"grad_norm": 4.710062026977539,
"learning_rate": 5.069518716577541e-06,
"loss": 1.236,
"step": 474
},
{
"epoch": 0.1271073053251271,
"grad_norm": 5.260262966156006,
"learning_rate": 5.0802139037433165e-06,
"loss": 1.2279,
"step": 475
},
{
"epoch": 0.12737489965212737,
"grad_norm": 4.409514904022217,
"learning_rate": 5.090909090909091e-06,
"loss": 1.2382,
"step": 476
},
{
"epoch": 0.12764249397912764,
"grad_norm": 4.516629695892334,
"learning_rate": 5.101604278074867e-06,
"loss": 0.9502,
"step": 477
},
{
"epoch": 0.1279100883061279,
"grad_norm": 5.594369888305664,
"learning_rate": 5.112299465240642e-06,
"loss": 1.1869,
"step": 478
},
{
"epoch": 0.12817768263312818,
"grad_norm": 4.493462562561035,
"learning_rate": 5.122994652406418e-06,
"loss": 1.0508,
"step": 479
},
{
"epoch": 0.12844527696012845,
"grad_norm": 4.775510787963867,
"learning_rate": 5.133689839572193e-06,
"loss": 1.2422,
"step": 480
},
{
"epoch": 0.12871287128712872,
"grad_norm": 4.813394546508789,
"learning_rate": 5.144385026737968e-06,
"loss": 1.1887,
"step": 481
},
{
"epoch": 0.128980465614129,
"grad_norm": 4.693298816680908,
"learning_rate": 5.155080213903744e-06,
"loss": 1.0501,
"step": 482
},
{
"epoch": 0.12924805994112926,
"grad_norm": 4.395559787750244,
"learning_rate": 5.165775401069519e-06,
"loss": 1.1917,
"step": 483
},
{
"epoch": 0.12951565426812953,
"grad_norm": 5.032355785369873,
"learning_rate": 5.176470588235295e-06,
"loss": 1.2152,
"step": 484
},
{
"epoch": 0.1297832485951298,
"grad_norm": 4.638949871063232,
"learning_rate": 5.187165775401069e-06,
"loss": 1.0553,
"step": 485
},
{
"epoch": 0.13005084292213004,
"grad_norm": 4.831664562225342,
"learning_rate": 5.197860962566845e-06,
"loss": 1.0504,
"step": 486
},
{
"epoch": 0.1303184372491303,
"grad_norm": 5.181875705718994,
"learning_rate": 5.208556149732621e-06,
"loss": 1.2088,
"step": 487
},
{
"epoch": 0.13058603157613058,
"grad_norm": 5.028466701507568,
"learning_rate": 5.219251336898397e-06,
"loss": 1.0759,
"step": 488
},
{
"epoch": 0.13085362590313085,
"grad_norm": 4.613313674926758,
"learning_rate": 5.2299465240641715e-06,
"loss": 1.2305,
"step": 489
},
{
"epoch": 0.13112122023013112,
"grad_norm": 4.535508155822754,
"learning_rate": 5.240641711229947e-06,
"loss": 1.1403,
"step": 490
},
{
"epoch": 0.1313888145571314,
"grad_norm": 4.558447360992432,
"learning_rate": 5.251336898395723e-06,
"loss": 1.1301,
"step": 491
},
{
"epoch": 0.13165640888413166,
"grad_norm": 4.6473588943481445,
"learning_rate": 5.262032085561498e-06,
"loss": 1.0308,
"step": 492
},
{
"epoch": 0.13192400321113193,
"grad_norm": 4.9026198387146,
"learning_rate": 5.272727272727273e-06,
"loss": 1.3224,
"step": 493
},
{
"epoch": 0.1321915975381322,
"grad_norm": 4.340352535247803,
"learning_rate": 5.283422459893048e-06,
"loss": 1.1216,
"step": 494
},
{
"epoch": 0.13245919186513247,
"grad_norm": 4.947085857391357,
"learning_rate": 5.294117647058824e-06,
"loss": 1.3081,
"step": 495
},
{
"epoch": 0.13272678619213274,
"grad_norm": 5.271705627441406,
"learning_rate": 5.3048128342245995e-06,
"loss": 1.2907,
"step": 496
},
{
"epoch": 0.13299438051913298,
"grad_norm": 4.9826507568359375,
"learning_rate": 5.315508021390374e-06,
"loss": 1.1883,
"step": 497
},
{
"epoch": 0.13326197484613325,
"grad_norm": 4.606426239013672,
"learning_rate": 5.32620320855615e-06,
"loss": 1.129,
"step": 498
},
{
"epoch": 0.13352956917313352,
"grad_norm": 4.6019392013549805,
"learning_rate": 5.336898395721925e-06,
"loss": 1.1887,
"step": 499
},
{
"epoch": 0.1337971635001338,
"grad_norm": 5.553493976593018,
"learning_rate": 5.347593582887702e-06,
"loss": 1.2985,
"step": 500
},
{
"epoch": 0.1337971635001338,
"eval_loss": 1.1709299087524414,
"eval_runtime": 11.4546,
"eval_samples_per_second": 34.92,
"eval_steps_per_second": 4.365,
"step": 500
},
{
"epoch": 0.13406475782713406,
"grad_norm": 4.656076431274414,
"learning_rate": 5.358288770053477e-06,
"loss": 1.1667,
"step": 501
},
{
"epoch": 0.13433235215413433,
"grad_norm": 4.8764543533325195,
"learning_rate": 5.368983957219252e-06,
"loss": 1.1945,
"step": 502
},
{
"epoch": 0.1345999464811346,
"grad_norm": 4.712137699127197,
"learning_rate": 5.379679144385027e-06,
"loss": 1.1168,
"step": 503
},
{
"epoch": 0.13486754080813487,
"grad_norm": 4.951474666595459,
"learning_rate": 5.390374331550803e-06,
"loss": 1.1441,
"step": 504
},
{
"epoch": 0.13513513513513514,
"grad_norm": 5.019460678100586,
"learning_rate": 5.4010695187165785e-06,
"loss": 1.2449,
"step": 505
},
{
"epoch": 0.1354027294621354,
"grad_norm": 4.8025689125061035,
"learning_rate": 5.411764705882353e-06,
"loss": 1.062,
"step": 506
},
{
"epoch": 0.13567032378913568,
"grad_norm": 4.835244655609131,
"learning_rate": 5.422459893048129e-06,
"loss": 1.0729,
"step": 507
},
{
"epoch": 0.13593791811613593,
"grad_norm": 5.318262577056885,
"learning_rate": 5.433155080213904e-06,
"loss": 1.1688,
"step": 508
},
{
"epoch": 0.1362055124431362,
"grad_norm": 4.434688568115234,
"learning_rate": 5.44385026737968e-06,
"loss": 1.0925,
"step": 509
},
{
"epoch": 0.13647310677013647,
"grad_norm": 4.781643867492676,
"learning_rate": 5.4545454545454545e-06,
"loss": 1.192,
"step": 510
},
{
"epoch": 0.13674070109713674,
"grad_norm": 4.806861877441406,
"learning_rate": 5.46524064171123e-06,
"loss": 1.2121,
"step": 511
},
{
"epoch": 0.137008295424137,
"grad_norm": 4.502013206481934,
"learning_rate": 5.4759358288770055e-06,
"loss": 1.075,
"step": 512
},
{
"epoch": 0.13727588975113728,
"grad_norm": 5.604802131652832,
"learning_rate": 5.486631016042782e-06,
"loss": 1.2734,
"step": 513
},
{
"epoch": 0.13754348407813755,
"grad_norm": 5.166036128997803,
"learning_rate": 5.497326203208556e-06,
"loss": 1.1035,
"step": 514
},
{
"epoch": 0.13781107840513782,
"grad_norm": 5.001628875732422,
"learning_rate": 5.508021390374332e-06,
"loss": 1.2467,
"step": 515
},
{
"epoch": 0.13807867273213809,
"grad_norm": 4.5005693435668945,
"learning_rate": 5.518716577540108e-06,
"loss": 1.1308,
"step": 516
},
{
"epoch": 0.13834626705913836,
"grad_norm": 5.138829231262207,
"learning_rate": 5.529411764705883e-06,
"loss": 1.174,
"step": 517
},
{
"epoch": 0.13861386138613863,
"grad_norm": 4.762211322784424,
"learning_rate": 5.540106951871658e-06,
"loss": 1.1242,
"step": 518
},
{
"epoch": 0.13888145571313887,
"grad_norm": 5.18784761428833,
"learning_rate": 5.5508021390374335e-06,
"loss": 1.2918,
"step": 519
},
{
"epoch": 0.13914905004013914,
"grad_norm": 4.61662483215332,
"learning_rate": 5.561497326203209e-06,
"loss": 1.1913,
"step": 520
},
{
"epoch": 0.1394166443671394,
"grad_norm": 4.903599262237549,
"learning_rate": 5.5721925133689845e-06,
"loss": 1.3014,
"step": 521
},
{
"epoch": 0.13968423869413968,
"grad_norm": 4.205623149871826,
"learning_rate": 5.58288770053476e-06,
"loss": 1.0465,
"step": 522
},
{
"epoch": 0.13995183302113995,
"grad_norm": 3.9266059398651123,
"learning_rate": 5.593582887700535e-06,
"loss": 1.0034,
"step": 523
},
{
"epoch": 0.14021942734814022,
"grad_norm": 5.096248626708984,
"learning_rate": 5.60427807486631e-06,
"loss": 1.2393,
"step": 524
},
{
"epoch": 0.1404870216751405,
"grad_norm": 4.701903820037842,
"learning_rate": 5.614973262032086e-06,
"loss": 1.1296,
"step": 525
},
{
"epoch": 0.14075461600214076,
"grad_norm": 4.736352443695068,
"learning_rate": 5.625668449197862e-06,
"loss": 1.1166,
"step": 526
},
{
"epoch": 0.14102221032914103,
"grad_norm": 4.55366325378418,
"learning_rate": 5.636363636363636e-06,
"loss": 1.2381,
"step": 527
},
{
"epoch": 0.1412898046561413,
"grad_norm": 4.388349533081055,
"learning_rate": 5.6470588235294125e-06,
"loss": 1.0277,
"step": 528
},
{
"epoch": 0.14155739898314157,
"grad_norm": 4.596952438354492,
"learning_rate": 5.657754010695188e-06,
"loss": 0.9623,
"step": 529
},
{
"epoch": 0.1418249933101418,
"grad_norm": 4.9525251388549805,
"learning_rate": 5.6684491978609635e-06,
"loss": 1.1406,
"step": 530
},
{
"epoch": 0.14209258763714208,
"grad_norm": 4.623518466949463,
"learning_rate": 5.679144385026738e-06,
"loss": 1.0717,
"step": 531
},
{
"epoch": 0.14236018196414235,
"grad_norm": 4.766755104064941,
"learning_rate": 5.689839572192514e-06,
"loss": 1.2016,
"step": 532
},
{
"epoch": 0.14262777629114262,
"grad_norm": 4.868133068084717,
"learning_rate": 5.700534759358289e-06,
"loss": 1.1728,
"step": 533
},
{
"epoch": 0.1428953706181429,
"grad_norm": 4.722245216369629,
"learning_rate": 5.711229946524065e-06,
"loss": 1.2795,
"step": 534
},
{
"epoch": 0.14316296494514316,
"grad_norm": 4.916394233703613,
"learning_rate": 5.7219251336898395e-06,
"loss": 1.141,
"step": 535
},
{
"epoch": 0.14343055927214343,
"grad_norm": 4.942296028137207,
"learning_rate": 5.732620320855615e-06,
"loss": 1.2017,
"step": 536
},
{
"epoch": 0.1436981535991437,
"grad_norm": 4.585607051849365,
"learning_rate": 5.743315508021391e-06,
"loss": 1.2109,
"step": 537
},
{
"epoch": 0.14396574792614397,
"grad_norm": 4.965005874633789,
"learning_rate": 5.754010695187167e-06,
"loss": 1.1582,
"step": 538
},
{
"epoch": 0.14423334225314424,
"grad_norm": 5.032000541687012,
"learning_rate": 5.764705882352941e-06,
"loss": 1.0772,
"step": 539
},
{
"epoch": 0.1445009365801445,
"grad_norm": 4.349190711975098,
"learning_rate": 5.775401069518717e-06,
"loss": 1.2469,
"step": 540
},
{
"epoch": 0.14476853090714478,
"grad_norm": 4.369176387786865,
"learning_rate": 5.786096256684493e-06,
"loss": 1.131,
"step": 541
},
{
"epoch": 0.14503612523414502,
"grad_norm": 4.241110324859619,
"learning_rate": 5.796791443850268e-06,
"loss": 0.9937,
"step": 542
},
{
"epoch": 0.1453037195611453,
"grad_norm": 4.846850395202637,
"learning_rate": 5.807486631016043e-06,
"loss": 1.2059,
"step": 543
},
{
"epoch": 0.14557131388814556,
"grad_norm": 5.102479457855225,
"learning_rate": 5.8181818181818185e-06,
"loss": 1.1612,
"step": 544
},
{
"epoch": 0.14583890821514583,
"grad_norm": 4.706130027770996,
"learning_rate": 5.828877005347594e-06,
"loss": 1.1608,
"step": 545
},
{
"epoch": 0.1461065025421461,
"grad_norm": 5.125561237335205,
"learning_rate": 5.83957219251337e-06,
"loss": 1.143,
"step": 546
},
{
"epoch": 0.14637409686914638,
"grad_norm": 4.503932952880859,
"learning_rate": 5.850267379679145e-06,
"loss": 1.127,
"step": 547
},
{
"epoch": 0.14664169119614665,
"grad_norm": 4.410585880279541,
"learning_rate": 5.86096256684492e-06,
"loss": 1.0207,
"step": 548
},
{
"epoch": 0.14690928552314692,
"grad_norm": 4.253677845001221,
"learning_rate": 5.871657754010695e-06,
"loss": 1.0741,
"step": 549
},
{
"epoch": 0.14717687985014719,
"grad_norm": 4.8487868309021,
"learning_rate": 5.882352941176471e-06,
"loss": 1.1049,
"step": 550
},
{
"epoch": 0.14744447417714746,
"grad_norm": 5.069744110107422,
"learning_rate": 5.893048128342247e-06,
"loss": 1.2483,
"step": 551
},
{
"epoch": 0.14771206850414773,
"grad_norm": 4.3907470703125,
"learning_rate": 5.903743315508021e-06,
"loss": 1.0139,
"step": 552
},
{
"epoch": 0.14797966283114797,
"grad_norm": 5.0639142990112305,
"learning_rate": 5.9144385026737975e-06,
"loss": 1.2476,
"step": 553
},
{
"epoch": 0.14824725715814824,
"grad_norm": 4.5384016036987305,
"learning_rate": 5.925133689839573e-06,
"loss": 1.1994,
"step": 554
},
{
"epoch": 0.1485148514851485,
"grad_norm": 4.997219085693359,
"learning_rate": 5.935828877005349e-06,
"loss": 1.133,
"step": 555
},
{
"epoch": 0.14878244581214878,
"grad_norm": 5.0610551834106445,
"learning_rate": 5.946524064171123e-06,
"loss": 1.3546,
"step": 556
},
{
"epoch": 0.14905004013914905,
"grad_norm": 4.485021114349365,
"learning_rate": 5.957219251336899e-06,
"loss": 1.1425,
"step": 557
},
{
"epoch": 0.14931763446614932,
"grad_norm": 4.8803229331970215,
"learning_rate": 5.967914438502674e-06,
"loss": 1.2082,
"step": 558
},
{
"epoch": 0.1495852287931496,
"grad_norm": 4.79873514175415,
"learning_rate": 5.97860962566845e-06,
"loss": 1.225,
"step": 559
},
{
"epoch": 0.14985282312014986,
"grad_norm": 4.734536170959473,
"learning_rate": 5.989304812834225e-06,
"loss": 1.1506,
"step": 560
},
{
"epoch": 0.15012041744715013,
"grad_norm": 4.322850227355957,
"learning_rate": 6e-06,
"loss": 1.0387,
"step": 561
},
{
"epoch": 0.1503880117741504,
"grad_norm": 4.721519947052002,
"learning_rate": 6.010695187165776e-06,
"loss": 1.0448,
"step": 562
},
{
"epoch": 0.15065560610115067,
"grad_norm": 4.884403228759766,
"learning_rate": 6.021390374331551e-06,
"loss": 1.1416,
"step": 563
},
{
"epoch": 0.1509232004281509,
"grad_norm": 5.24191427230835,
"learning_rate": 6.032085561497326e-06,
"loss": 1.2222,
"step": 564
},
{
"epoch": 0.15119079475515118,
"grad_norm": 4.954929351806641,
"learning_rate": 6.0427807486631015e-06,
"loss": 1.2712,
"step": 565
},
{
"epoch": 0.15145838908215145,
"grad_norm": 4.613723278045654,
"learning_rate": 6.053475935828878e-06,
"loss": 1.1032,
"step": 566
},
{
"epoch": 0.15172598340915172,
"grad_norm": 4.400996685028076,
"learning_rate": 6.064171122994653e-06,
"loss": 1.091,
"step": 567
},
{
"epoch": 0.151993577736152,
"grad_norm": 4.841631889343262,
"learning_rate": 6.074866310160429e-06,
"loss": 1.2592,
"step": 568
},
{
"epoch": 0.15226117206315226,
"grad_norm": 5.009564399719238,
"learning_rate": 6.085561497326204e-06,
"loss": 1.1364,
"step": 569
},
{
"epoch": 0.15252876639015253,
"grad_norm": 4.932076930999756,
"learning_rate": 6.096256684491979e-06,
"loss": 1.1759,
"step": 570
},
{
"epoch": 0.1527963607171528,
"grad_norm": 5.142986297607422,
"learning_rate": 6.106951871657755e-06,
"loss": 1.2236,
"step": 571
},
{
"epoch": 0.15306395504415307,
"grad_norm": 5.11539363861084,
"learning_rate": 6.11764705882353e-06,
"loss": 1.1623,
"step": 572
},
{
"epoch": 0.15333154937115334,
"grad_norm": 4.659823417663574,
"learning_rate": 6.128342245989305e-06,
"loss": 1.2424,
"step": 573
},
{
"epoch": 0.1535991436981536,
"grad_norm": 5.004172325134277,
"learning_rate": 6.1390374331550805e-06,
"loss": 1.2536,
"step": 574
},
{
"epoch": 0.15386673802515385,
"grad_norm": 4.277651309967041,
"learning_rate": 6.149732620320856e-06,
"loss": 1.1239,
"step": 575
},
{
"epoch": 0.15413433235215412,
"grad_norm": 4.292529582977295,
"learning_rate": 6.1604278074866315e-06,
"loss": 1.1876,
"step": 576
},
{
"epoch": 0.1544019266791544,
"grad_norm": 4.799615859985352,
"learning_rate": 6.171122994652406e-06,
"loss": 1.1205,
"step": 577
},
{
"epoch": 0.15466952100615466,
"grad_norm": 4.678570747375488,
"learning_rate": 6.181818181818182e-06,
"loss": 1.1356,
"step": 578
},
{
"epoch": 0.15493711533315493,
"grad_norm": 4.6860246658325195,
"learning_rate": 6.192513368983958e-06,
"loss": 1.1228,
"step": 579
},
{
"epoch": 0.1552047096601552,
"grad_norm": 4.289163112640381,
"learning_rate": 6.203208556149734e-06,
"loss": 1.0872,
"step": 580
},
{
"epoch": 0.15547230398715547,
"grad_norm": 4.854632377624512,
"learning_rate": 6.213903743315508e-06,
"loss": 1.3285,
"step": 581
},
{
"epoch": 0.15573989831415574,
"grad_norm": 5.2167253494262695,
"learning_rate": 6.224598930481284e-06,
"loss": 1.076,
"step": 582
},
{
"epoch": 0.15600749264115601,
"grad_norm": 4.5353264808654785,
"learning_rate": 6.2352941176470595e-06,
"loss": 1.1824,
"step": 583
},
{
"epoch": 0.15627508696815628,
"grad_norm": 4.770082950592041,
"learning_rate": 6.245989304812835e-06,
"loss": 1.0642,
"step": 584
},
{
"epoch": 0.15654268129515655,
"grad_norm": 5.027703762054443,
"learning_rate": 6.25668449197861e-06,
"loss": 1.2616,
"step": 585
},
{
"epoch": 0.1568102756221568,
"grad_norm": 4.812859058380127,
"learning_rate": 6.267379679144385e-06,
"loss": 1.2037,
"step": 586
},
{
"epoch": 0.15707786994915707,
"grad_norm": 5.672885894775391,
"learning_rate": 6.278074866310161e-06,
"loss": 1.3296,
"step": 587
},
{
"epoch": 0.15734546427615734,
"grad_norm": 4.318905830383301,
"learning_rate": 6.288770053475936e-06,
"loss": 1.0487,
"step": 588
},
{
"epoch": 0.1576130586031576,
"grad_norm": 4.390570163726807,
"learning_rate": 6.299465240641713e-06,
"loss": 1.1174,
"step": 589
},
{
"epoch": 0.15788065293015788,
"grad_norm": 5.302069664001465,
"learning_rate": 6.3101604278074865e-06,
"loss": 1.0765,
"step": 590
},
{
"epoch": 0.15814824725715815,
"grad_norm": 5.114290237426758,
"learning_rate": 6.320855614973262e-06,
"loss": 1.2838,
"step": 591
},
{
"epoch": 0.15841584158415842,
"grad_norm": 4.3737335205078125,
"learning_rate": 6.3315508021390385e-06,
"loss": 1.072,
"step": 592
},
{
"epoch": 0.1586834359111587,
"grad_norm": 4.571005344390869,
"learning_rate": 6.342245989304814e-06,
"loss": 1.1507,
"step": 593
},
{
"epoch": 0.15895103023815896,
"grad_norm": 4.546551704406738,
"learning_rate": 6.352941176470589e-06,
"loss": 1.1058,
"step": 594
},
{
"epoch": 0.15921862456515923,
"grad_norm": 4.901880741119385,
"learning_rate": 6.363636363636364e-06,
"loss": 1.1524,
"step": 595
},
{
"epoch": 0.1594862188921595,
"grad_norm": 4.456069469451904,
"learning_rate": 6.37433155080214e-06,
"loss": 1.1826,
"step": 596
},
{
"epoch": 0.15975381321915974,
"grad_norm": 4.513467788696289,
"learning_rate": 6.385026737967915e-06,
"loss": 1.1069,
"step": 597
},
{
"epoch": 0.16002140754616,
"grad_norm": 4.525417804718018,
"learning_rate": 6.39572192513369e-06,
"loss": 1.1583,
"step": 598
},
{
"epoch": 0.16028900187316028,
"grad_norm": 4.3607177734375,
"learning_rate": 6.4064171122994655e-06,
"loss": 1.1446,
"step": 599
},
{
"epoch": 0.16055659620016055,
"grad_norm": 4.644144058227539,
"learning_rate": 6.417112299465241e-06,
"loss": 1.1466,
"step": 600
},
{
"epoch": 0.16082419052716082,
"grad_norm": 4.352504730224609,
"learning_rate": 6.427807486631017e-06,
"loss": 1.1164,
"step": 601
},
{
"epoch": 0.1610917848541611,
"grad_norm": 5.058422088623047,
"learning_rate": 6.438502673796791e-06,
"loss": 1.1716,
"step": 602
},
{
"epoch": 0.16135937918116136,
"grad_norm": 4.505871772766113,
"learning_rate": 6.449197860962567e-06,
"loss": 1.1306,
"step": 603
},
{
"epoch": 0.16162697350816163,
"grad_norm": 4.627199649810791,
"learning_rate": 6.459893048128343e-06,
"loss": 1.2105,
"step": 604
},
{
"epoch": 0.1618945678351619,
"grad_norm": 5.190435409545898,
"learning_rate": 6.470588235294119e-06,
"loss": 1.2797,
"step": 605
},
{
"epoch": 0.16216216216216217,
"grad_norm": 4.629772186279297,
"learning_rate": 6.4812834224598935e-06,
"loss": 1.0904,
"step": 606
},
{
"epoch": 0.16242975648916244,
"grad_norm": 4.735287189483643,
"learning_rate": 6.491978609625669e-06,
"loss": 1.1999,
"step": 607
},
{
"epoch": 0.16269735081616268,
"grad_norm": 5.2313008308410645,
"learning_rate": 6.5026737967914445e-06,
"loss": 1.3026,
"step": 608
},
{
"epoch": 0.16296494514316295,
"grad_norm": 4.605459213256836,
"learning_rate": 6.51336898395722e-06,
"loss": 1.1164,
"step": 609
},
{
"epoch": 0.16323253947016322,
"grad_norm": 4.5824480056762695,
"learning_rate": 6.524064171122996e-06,
"loss": 1.0526,
"step": 610
},
{
"epoch": 0.1635001337971635,
"grad_norm": 4.864238739013672,
"learning_rate": 6.53475935828877e-06,
"loss": 1.1137,
"step": 611
},
{
"epoch": 0.16376772812416376,
"grad_norm": 4.430417537689209,
"learning_rate": 6.545454545454546e-06,
"loss": 1.1407,
"step": 612
},
{
"epoch": 0.16403532245116403,
"grad_norm": 4.643566131591797,
"learning_rate": 6.556149732620321e-06,
"loss": 1.115,
"step": 613
},
{
"epoch": 0.1643029167781643,
"grad_norm": 5.602782249450684,
"learning_rate": 6.566844919786097e-06,
"loss": 1.4065,
"step": 614
},
{
"epoch": 0.16457051110516457,
"grad_norm": 4.812868118286133,
"learning_rate": 6.577540106951872e-06,
"loss": 1.1505,
"step": 615
},
{
"epoch": 0.16483810543216484,
"grad_norm": 4.6687235832214355,
"learning_rate": 6.588235294117647e-06,
"loss": 1.1733,
"step": 616
},
{
"epoch": 0.16510569975916511,
"grad_norm": 4.8625264167785645,
"learning_rate": 6.5989304812834235e-06,
"loss": 1.14,
"step": 617
},
{
"epoch": 0.16537329408616538,
"grad_norm": 5.044530868530273,
"learning_rate": 6.609625668449199e-06,
"loss": 1.2254,
"step": 618
},
{
"epoch": 0.16564088841316565,
"grad_norm": 4.458752632141113,
"learning_rate": 6.620320855614974e-06,
"loss": 1.1927,
"step": 619
},
{
"epoch": 0.1659084827401659,
"grad_norm": 4.7606377601623535,
"learning_rate": 6.631016042780749e-06,
"loss": 1.1916,
"step": 620
},
{
"epoch": 0.16617607706716617,
"grad_norm": 5.007805824279785,
"learning_rate": 6.641711229946525e-06,
"loss": 1.2655,
"step": 621
},
{
"epoch": 0.16644367139416644,
"grad_norm": 4.409674167633057,
"learning_rate": 6.6524064171123e-06,
"loss": 1.0725,
"step": 622
},
{
"epoch": 0.1667112657211667,
"grad_norm": 4.561901569366455,
"learning_rate": 6.663101604278075e-06,
"loss": 1.1336,
"step": 623
},
{
"epoch": 0.16697886004816698,
"grad_norm": 5.645256996154785,
"learning_rate": 6.673796791443851e-06,
"loss": 1.2585,
"step": 624
},
{
"epoch": 0.16724645437516725,
"grad_norm": 5.0422139167785645,
"learning_rate": 6.684491978609626e-06,
"loss": 1.2117,
"step": 625
},
{
"epoch": 0.16751404870216752,
"grad_norm": 5.541776180267334,
"learning_rate": 6.695187165775402e-06,
"loss": 1.2715,
"step": 626
},
{
"epoch": 0.1677816430291678,
"grad_norm": 4.81757116317749,
"learning_rate": 6.705882352941176e-06,
"loss": 1.1431,
"step": 627
},
{
"epoch": 0.16804923735616806,
"grad_norm": 5.481652736663818,
"learning_rate": 6.716577540106952e-06,
"loss": 1.174,
"step": 628
},
{
"epoch": 0.16831683168316833,
"grad_norm": 4.777329444885254,
"learning_rate": 6.7272727272727275e-06,
"loss": 1.1886,
"step": 629
},
{
"epoch": 0.1685844260101686,
"grad_norm": 4.763789176940918,
"learning_rate": 6.737967914438504e-06,
"loss": 1.1154,
"step": 630
},
{
"epoch": 0.16885202033716884,
"grad_norm": 4.949760437011719,
"learning_rate": 6.748663101604279e-06,
"loss": 1.1888,
"step": 631
},
{
"epoch": 0.1691196146641691,
"grad_norm": 4.344736099243164,
"learning_rate": 6.759358288770054e-06,
"loss": 1.2278,
"step": 632
},
{
"epoch": 0.16938720899116938,
"grad_norm": 4.495877265930176,
"learning_rate": 6.77005347593583e-06,
"loss": 1.1668,
"step": 633
},
{
"epoch": 0.16965480331816965,
"grad_norm": 4.895537853240967,
"learning_rate": 6.780748663101605e-06,
"loss": 1.3816,
"step": 634
},
{
"epoch": 0.16992239764516992,
"grad_norm": 4.664587497711182,
"learning_rate": 6.791443850267381e-06,
"loss": 1.2055,
"step": 635
},
{
"epoch": 0.1701899919721702,
"grad_norm": 4.564089775085449,
"learning_rate": 6.802139037433155e-06,
"loss": 1.1425,
"step": 636
},
{
"epoch": 0.17045758629917046,
"grad_norm": 4.690885066986084,
"learning_rate": 6.812834224598931e-06,
"loss": 1.0908,
"step": 637
},
{
"epoch": 0.17072518062617073,
"grad_norm": 4.54403018951416,
"learning_rate": 6.8235294117647065e-06,
"loss": 1.0681,
"step": 638
},
{
"epoch": 0.170992774953171,
"grad_norm": 4.301973342895508,
"learning_rate": 6.834224598930482e-06,
"loss": 1.1184,
"step": 639
},
{
"epoch": 0.17126036928017127,
"grad_norm": 4.822204113006592,
"learning_rate": 6.844919786096257e-06,
"loss": 1.1718,
"step": 640
},
{
"epoch": 0.17152796360717154,
"grad_norm": 4.2204413414001465,
"learning_rate": 6.855614973262032e-06,
"loss": 1.141,
"step": 641
},
{
"epoch": 0.17179555793417178,
"grad_norm": 4.727780818939209,
"learning_rate": 6.866310160427808e-06,
"loss": 1.2378,
"step": 642
},
{
"epoch": 0.17206315226117205,
"grad_norm": 4.156445503234863,
"learning_rate": 6.877005347593584e-06,
"loss": 1.1066,
"step": 643
},
{
"epoch": 0.17233074658817232,
"grad_norm": 4.479008197784424,
"learning_rate": 6.887700534759358e-06,
"loss": 1.171,
"step": 644
},
{
"epoch": 0.1725983409151726,
"grad_norm": 4.782415866851807,
"learning_rate": 6.898395721925134e-06,
"loss": 1.1557,
"step": 645
},
{
"epoch": 0.17286593524217286,
"grad_norm": 4.781481742858887,
"learning_rate": 6.90909090909091e-06,
"loss": 1.3044,
"step": 646
},
{
"epoch": 0.17313352956917313,
"grad_norm": 4.513900279998779,
"learning_rate": 6.9197860962566855e-06,
"loss": 1.189,
"step": 647
},
{
"epoch": 0.1734011238961734,
"grad_norm": 5.123539924621582,
"learning_rate": 6.93048128342246e-06,
"loss": 1.2388,
"step": 648
},
{
"epoch": 0.17366871822317367,
"grad_norm": 5.24996280670166,
"learning_rate": 6.941176470588236e-06,
"loss": 1.2528,
"step": 649
},
{
"epoch": 0.17393631255017394,
"grad_norm": 4.407766819000244,
"learning_rate": 6.951871657754011e-06,
"loss": 1.0828,
"step": 650
},
{
"epoch": 0.1742039068771742,
"grad_norm": 4.964326858520508,
"learning_rate": 6.962566844919787e-06,
"loss": 1.1248,
"step": 651
},
{
"epoch": 0.17447150120417448,
"grad_norm": 4.530794620513916,
"learning_rate": 6.9732620320855615e-06,
"loss": 1.1584,
"step": 652
},
{
"epoch": 0.17473909553117473,
"grad_norm": 4.297457218170166,
"learning_rate": 6.983957219251337e-06,
"loss": 1.1548,
"step": 653
},
{
"epoch": 0.175006689858175,
"grad_norm": 4.825823783874512,
"learning_rate": 6.9946524064171125e-06,
"loss": 1.2084,
"step": 654
},
{
"epoch": 0.17527428418517527,
"grad_norm": 4.5333709716796875,
"learning_rate": 7.005347593582889e-06,
"loss": 1.1385,
"step": 655
},
{
"epoch": 0.17554187851217554,
"grad_norm": 4.513311386108398,
"learning_rate": 7.0160427807486645e-06,
"loss": 1.1604,
"step": 656
},
{
"epoch": 0.1758094728391758,
"grad_norm": 4.645889759063721,
"learning_rate": 7.026737967914438e-06,
"loss": 1.3132,
"step": 657
},
{
"epoch": 0.17607706716617608,
"grad_norm": 4.844141006469727,
"learning_rate": 7.037433155080215e-06,
"loss": 1.1617,
"step": 658
},
{
"epoch": 0.17634466149317635,
"grad_norm": 4.618659973144531,
"learning_rate": 7.04812834224599e-06,
"loss": 1.1599,
"step": 659
},
{
"epoch": 0.17661225582017662,
"grad_norm": 4.780247688293457,
"learning_rate": 7.058823529411766e-06,
"loss": 1.2249,
"step": 660
},
{
"epoch": 0.1768798501471769,
"grad_norm": 4.695610046386719,
"learning_rate": 7.0695187165775405e-06,
"loss": 1.2523,
"step": 661
},
{
"epoch": 0.17714744447417716,
"grad_norm": 4.643034934997559,
"learning_rate": 7.080213903743316e-06,
"loss": 1.2802,
"step": 662
},
{
"epoch": 0.17741503880117743,
"grad_norm": 4.363466739654541,
"learning_rate": 7.0909090909090916e-06,
"loss": 1.0768,
"step": 663
},
{
"epoch": 0.17768263312817767,
"grad_norm": 4.794258117675781,
"learning_rate": 7.101604278074867e-06,
"loss": 1.2522,
"step": 664
},
{
"epoch": 0.17795022745517794,
"grad_norm": 4.560819149017334,
"learning_rate": 7.112299465240642e-06,
"loss": 1.1901,
"step": 665
},
{
"epoch": 0.1782178217821782,
"grad_norm": 4.56439733505249,
"learning_rate": 7.122994652406417e-06,
"loss": 1.1813,
"step": 666
},
{
"epoch": 0.17848541610917848,
"grad_norm": 4.605260848999023,
"learning_rate": 7.133689839572193e-06,
"loss": 1.1981,
"step": 667
},
{
"epoch": 0.17875301043617875,
"grad_norm": 4.7326483726501465,
"learning_rate": 7.144385026737969e-06,
"loss": 1.1832,
"step": 668
},
{
"epoch": 0.17902060476317902,
"grad_norm": 4.547402858734131,
"learning_rate": 7.155080213903743e-06,
"loss": 1.0722,
"step": 669
},
{
"epoch": 0.1792881990901793,
"grad_norm": 4.594086170196533,
"learning_rate": 7.1657754010695195e-06,
"loss": 1.1557,
"step": 670
},
{
"epoch": 0.17955579341717956,
"grad_norm": 4.440776824951172,
"learning_rate": 7.176470588235295e-06,
"loss": 1.1161,
"step": 671
},
{
"epoch": 0.17982338774417983,
"grad_norm": 5.013535976409912,
"learning_rate": 7.1871657754010706e-06,
"loss": 1.1546,
"step": 672
},
{
"epoch": 0.1800909820711801,
"grad_norm": 5.5731000900268555,
"learning_rate": 7.197860962566845e-06,
"loss": 1.315,
"step": 673
},
{
"epoch": 0.18035857639818037,
"grad_norm": 4.811005592346191,
"learning_rate": 7.208556149732621e-06,
"loss": 1.0571,
"step": 674
},
{
"epoch": 0.1806261707251806,
"grad_norm": 4.496854782104492,
"learning_rate": 7.219251336898396e-06,
"loss": 1.0867,
"step": 675
},
{
"epoch": 0.18089376505218088,
"grad_norm": 4.781049728393555,
"learning_rate": 7.229946524064172e-06,
"loss": 1.0135,
"step": 676
},
{
"epoch": 0.18116135937918115,
"grad_norm": 4.150574684143066,
"learning_rate": 7.240641711229947e-06,
"loss": 1.1745,
"step": 677
},
{
"epoch": 0.18142895370618142,
"grad_norm": 4.843429088592529,
"learning_rate": 7.251336898395722e-06,
"loss": 1.2394,
"step": 678
},
{
"epoch": 0.1816965480331817,
"grad_norm": 4.525768280029297,
"learning_rate": 7.262032085561498e-06,
"loss": 1.2715,
"step": 679
},
{
"epoch": 0.18196414236018196,
"grad_norm": 4.916580677032471,
"learning_rate": 7.272727272727273e-06,
"loss": 1.2347,
"step": 680
},
{
"epoch": 0.18223173668718223,
"grad_norm": 4.803800106048584,
"learning_rate": 7.2834224598930496e-06,
"loss": 1.1586,
"step": 681
},
{
"epoch": 0.1824993310141825,
"grad_norm": 4.679764747619629,
"learning_rate": 7.294117647058823e-06,
"loss": 1.268,
"step": 682
},
{
"epoch": 0.18276692534118277,
"grad_norm": 4.965787410736084,
"learning_rate": 7.3048128342246e-06,
"loss": 1.1855,
"step": 683
},
{
"epoch": 0.18303451966818304,
"grad_norm": 4.892383575439453,
"learning_rate": 7.315508021390375e-06,
"loss": 1.1683,
"step": 684
},
{
"epoch": 0.1833021139951833,
"grad_norm": 4.476233005523682,
"learning_rate": 7.326203208556151e-06,
"loss": 1.1109,
"step": 685
},
{
"epoch": 0.18356970832218358,
"grad_norm": 4.431989669799805,
"learning_rate": 7.3368983957219256e-06,
"loss": 1.1575,
"step": 686
},
{
"epoch": 0.18383730264918383,
"grad_norm": 4.837761878967285,
"learning_rate": 7.347593582887701e-06,
"loss": 1.2535,
"step": 687
},
{
"epoch": 0.1841048969761841,
"grad_norm": 4.285210132598877,
"learning_rate": 7.358288770053477e-06,
"loss": 1.0863,
"step": 688
},
{
"epoch": 0.18437249130318437,
"grad_norm": 4.910134315490723,
"learning_rate": 7.368983957219252e-06,
"loss": 1.1904,
"step": 689
},
{
"epoch": 0.18464008563018464,
"grad_norm": 4.774014472961426,
"learning_rate": 7.379679144385027e-06,
"loss": 1.1826,
"step": 690
},
{
"epoch": 0.1849076799571849,
"grad_norm": 5.281838893890381,
"learning_rate": 7.390374331550802e-06,
"loss": 1.3556,
"step": 691
},
{
"epoch": 0.18517527428418518,
"grad_norm": 4.740875244140625,
"learning_rate": 7.401069518716578e-06,
"loss": 1.3182,
"step": 692
},
{
"epoch": 0.18544286861118545,
"grad_norm": 4.560650825500488,
"learning_rate": 7.4117647058823535e-06,
"loss": 1.2666,
"step": 693
},
{
"epoch": 0.18571046293818572,
"grad_norm": 4.770612716674805,
"learning_rate": 7.422459893048128e-06,
"loss": 1.0551,
"step": 694
},
{
"epoch": 0.185978057265186,
"grad_norm": 4.479051113128662,
"learning_rate": 7.433155080213904e-06,
"loss": 1.1718,
"step": 695
},
{
"epoch": 0.18624565159218626,
"grad_norm": 4.537865161895752,
"learning_rate": 7.44385026737968e-06,
"loss": 1.0874,
"step": 696
},
{
"epoch": 0.18651324591918653,
"grad_norm": 4.282291412353516,
"learning_rate": 7.454545454545456e-06,
"loss": 1.2062,
"step": 697
},
{
"epoch": 0.18678084024618677,
"grad_norm": 4.386539459228516,
"learning_rate": 7.465240641711231e-06,
"loss": 1.313,
"step": 698
},
{
"epoch": 0.18704843457318704,
"grad_norm": 4.664721488952637,
"learning_rate": 7.475935828877006e-06,
"loss": 1.2399,
"step": 699
},
{
"epoch": 0.1873160289001873,
"grad_norm": 5.261703014373779,
"learning_rate": 7.486631016042781e-06,
"loss": 1.3199,
"step": 700
},
{
"epoch": 0.18758362322718758,
"grad_norm": 4.195591449737549,
"learning_rate": 7.497326203208557e-06,
"loss": 1.035,
"step": 701
},
{
"epoch": 0.18785121755418785,
"grad_norm": 4.815860271453857,
"learning_rate": 7.5080213903743325e-06,
"loss": 1.2643,
"step": 702
},
{
"epoch": 0.18811881188118812,
"grad_norm": 5.00251579284668,
"learning_rate": 7.518716577540107e-06,
"loss": 1.1611,
"step": 703
},
{
"epoch": 0.1883864062081884,
"grad_norm": 4.371436595916748,
"learning_rate": 7.529411764705883e-06,
"loss": 1.1734,
"step": 704
},
{
"epoch": 0.18865400053518866,
"grad_norm": 4.646690368652344,
"learning_rate": 7.540106951871658e-06,
"loss": 1.1298,
"step": 705
},
{
"epoch": 0.18892159486218893,
"grad_norm": 4.49533748626709,
"learning_rate": 7.550802139037434e-06,
"loss": 1.0886,
"step": 706
},
{
"epoch": 0.1891891891891892,
"grad_norm": 4.740173816680908,
"learning_rate": 7.5614973262032085e-06,
"loss": 1.1291,
"step": 707
},
{
"epoch": 0.18945678351618947,
"grad_norm": 4.919492721557617,
"learning_rate": 7.572192513368984e-06,
"loss": 1.1714,
"step": 708
},
{
"epoch": 0.1897243778431897,
"grad_norm": 4.677563190460205,
"learning_rate": 7.58288770053476e-06,
"loss": 1.1417,
"step": 709
},
{
"epoch": 0.18999197217018998,
"grad_norm": 4.335318088531494,
"learning_rate": 7.593582887700536e-06,
"loss": 1.0924,
"step": 710
},
{
"epoch": 0.19025956649719025,
"grad_norm": 4.638528347015381,
"learning_rate": 7.604278074866311e-06,
"loss": 1.2186,
"step": 711
},
{
"epoch": 0.19052716082419052,
"grad_norm": 4.537407398223877,
"learning_rate": 7.614973262032086e-06,
"loss": 1.1797,
"step": 712
},
{
"epoch": 0.1907947551511908,
"grad_norm": 4.735195159912109,
"learning_rate": 7.625668449197862e-06,
"loss": 1.2728,
"step": 713
},
{
"epoch": 0.19106234947819106,
"grad_norm": 4.434914588928223,
"learning_rate": 7.636363636363638e-06,
"loss": 1.1357,
"step": 714
},
{
"epoch": 0.19132994380519133,
"grad_norm": 4.431911945343018,
"learning_rate": 7.647058823529411e-06,
"loss": 1.2785,
"step": 715
},
{
"epoch": 0.1915975381321916,
"grad_norm": 4.211305618286133,
"learning_rate": 7.657754010695187e-06,
"loss": 1.1322,
"step": 716
},
{
"epoch": 0.19186513245919187,
"grad_norm": 4.698652267456055,
"learning_rate": 7.668449197860964e-06,
"loss": 1.2302,
"step": 717
},
{
"epoch": 0.19213272678619214,
"grad_norm": 4.491962909698486,
"learning_rate": 7.679144385026739e-06,
"loss": 1.1325,
"step": 718
},
{
"epoch": 0.1924003211131924,
"grad_norm": 4.714018821716309,
"learning_rate": 7.689839572192515e-06,
"loss": 1.152,
"step": 719
},
{
"epoch": 0.19266791544019266,
"grad_norm": 4.598504066467285,
"learning_rate": 7.70053475935829e-06,
"loss": 1.0786,
"step": 720
},
{
"epoch": 0.19293550976719293,
"grad_norm": 4.4915008544921875,
"learning_rate": 7.711229946524064e-06,
"loss": 1.1606,
"step": 721
},
{
"epoch": 0.1932031040941932,
"grad_norm": 4.305722236633301,
"learning_rate": 7.72192513368984e-06,
"loss": 1.1375,
"step": 722
},
{
"epoch": 0.19347069842119347,
"grad_norm": 4.845047473907471,
"learning_rate": 7.732620320855615e-06,
"loss": 1.0849,
"step": 723
},
{
"epoch": 0.19373829274819374,
"grad_norm": 4.809256553649902,
"learning_rate": 7.74331550802139e-06,
"loss": 1.0855,
"step": 724
},
{
"epoch": 0.194005887075194,
"grad_norm": 5.05698823928833,
"learning_rate": 7.754010695187166e-06,
"loss": 1.215,
"step": 725
},
{
"epoch": 0.19427348140219428,
"grad_norm": 4.64973258972168,
"learning_rate": 7.764705882352941e-06,
"loss": 1.1856,
"step": 726
},
{
"epoch": 0.19454107572919455,
"grad_norm": 4.284728050231934,
"learning_rate": 7.775401069518718e-06,
"loss": 1.013,
"step": 727
},
{
"epoch": 0.19480867005619482,
"grad_norm": 4.597956657409668,
"learning_rate": 7.786096256684492e-06,
"loss": 1.1597,
"step": 728
},
{
"epoch": 0.19507626438319509,
"grad_norm": 4.795129299163818,
"learning_rate": 7.796791443850269e-06,
"loss": 1.26,
"step": 729
},
{
"epoch": 0.19534385871019536,
"grad_norm": 4.353721618652344,
"learning_rate": 7.807486631016043e-06,
"loss": 1.1202,
"step": 730
},
{
"epoch": 0.1956114530371956,
"grad_norm": 4.6432108879089355,
"learning_rate": 7.81818181818182e-06,
"loss": 1.1496,
"step": 731
},
{
"epoch": 0.19587904736419587,
"grad_norm": 4.320937156677246,
"learning_rate": 7.828877005347594e-06,
"loss": 1.0743,
"step": 732
},
{
"epoch": 0.19614664169119614,
"grad_norm": 4.268731594085693,
"learning_rate": 7.839572192513369e-06,
"loss": 1.0991,
"step": 733
},
{
"epoch": 0.1964142360181964,
"grad_norm": 4.839014530181885,
"learning_rate": 7.850267379679145e-06,
"loss": 1.142,
"step": 734
},
{
"epoch": 0.19668183034519668,
"grad_norm": 4.309354305267334,
"learning_rate": 7.86096256684492e-06,
"loss": 1.055,
"step": 735
},
{
"epoch": 0.19694942467219695,
"grad_norm": 4.399764060974121,
"learning_rate": 7.871657754010695e-06,
"loss": 1.2,
"step": 736
},
{
"epoch": 0.19721701899919722,
"grad_norm": 4.814887523651123,
"learning_rate": 7.882352941176471e-06,
"loss": 1.1129,
"step": 737
},
{
"epoch": 0.1974846133261975,
"grad_norm": 4.662134647369385,
"learning_rate": 7.893048128342246e-06,
"loss": 1.3224,
"step": 738
},
{
"epoch": 0.19775220765319776,
"grad_norm": 4.743928909301758,
"learning_rate": 7.903743315508022e-06,
"loss": 1.2364,
"step": 739
},
{
"epoch": 0.19801980198019803,
"grad_norm": 4.6992716789245605,
"learning_rate": 7.914438502673799e-06,
"loss": 1.0913,
"step": 740
},
{
"epoch": 0.1982873963071983,
"grad_norm": 4.529000759124756,
"learning_rate": 7.925133689839572e-06,
"loss": 1.1083,
"step": 741
},
{
"epoch": 0.19855499063419854,
"grad_norm": 4.22991418838501,
"learning_rate": 7.935828877005348e-06,
"loss": 1.1632,
"step": 742
},
{
"epoch": 0.1988225849611988,
"grad_norm": 4.685365676879883,
"learning_rate": 7.946524064171124e-06,
"loss": 1.1653,
"step": 743
},
{
"epoch": 0.19909017928819908,
"grad_norm": 5.151124954223633,
"learning_rate": 7.9572192513369e-06,
"loss": 1.1468,
"step": 744
},
{
"epoch": 0.19935777361519935,
"grad_norm": 4.344570636749268,
"learning_rate": 7.967914438502674e-06,
"loss": 1.084,
"step": 745
},
{
"epoch": 0.19962536794219962,
"grad_norm": 4.775820255279541,
"learning_rate": 7.97860962566845e-06,
"loss": 1.0849,
"step": 746
},
{
"epoch": 0.1998929622691999,
"grad_norm": 4.6123433113098145,
"learning_rate": 7.989304812834225e-06,
"loss": 1.1837,
"step": 747
},
{
"epoch": 0.20016055659620016,
"grad_norm": 4.325228691101074,
"learning_rate": 8.000000000000001e-06,
"loss": 1.1404,
"step": 748
},
{
"epoch": 0.20042815092320043,
"grad_norm": 4.531330108642578,
"learning_rate": 8.010695187165776e-06,
"loss": 1.4233,
"step": 749
},
{
"epoch": 0.2006957452502007,
"grad_norm": 4.567444801330566,
"learning_rate": 8.02139037433155e-06,
"loss": 1.1898,
"step": 750
},
{
"epoch": 0.20096333957720097,
"grad_norm": 4.629062175750732,
"learning_rate": 8.032085561497327e-06,
"loss": 1.2206,
"step": 751
},
{
"epoch": 0.20123093390420124,
"grad_norm": 4.17169713973999,
"learning_rate": 8.042780748663103e-06,
"loss": 1.0784,
"step": 752
},
{
"epoch": 0.20149852823120148,
"grad_norm": 4.538808345794678,
"learning_rate": 8.053475935828876e-06,
"loss": 1.2097,
"step": 753
},
{
"epoch": 0.20176612255820175,
"grad_norm": 4.794569492340088,
"learning_rate": 8.064171122994653e-06,
"loss": 1.2594,
"step": 754
},
{
"epoch": 0.20203371688520202,
"grad_norm": 4.9203972816467285,
"learning_rate": 8.07486631016043e-06,
"loss": 1.4261,
"step": 755
},
{
"epoch": 0.2023013112122023,
"grad_norm": 4.924014091491699,
"learning_rate": 8.085561497326204e-06,
"loss": 1.1343,
"step": 756
},
{
"epoch": 0.20256890553920257,
"grad_norm": 4.558595657348633,
"learning_rate": 8.096256684491979e-06,
"loss": 1.1778,
"step": 757
},
{
"epoch": 0.20283649986620284,
"grad_norm": 4.965837478637695,
"learning_rate": 8.106951871657755e-06,
"loss": 1.406,
"step": 758
},
{
"epoch": 0.2031040941932031,
"grad_norm": 4.6557207107543945,
"learning_rate": 8.11764705882353e-06,
"loss": 1.2242,
"step": 759
},
{
"epoch": 0.20337168852020338,
"grad_norm": 4.143162250518799,
"learning_rate": 8.128342245989306e-06,
"loss": 1.0762,
"step": 760
},
{
"epoch": 0.20363928284720365,
"grad_norm": 4.899580001831055,
"learning_rate": 8.13903743315508e-06,
"loss": 1.2824,
"step": 761
},
{
"epoch": 0.20390687717420392,
"grad_norm": 4.938472270965576,
"learning_rate": 8.149732620320855e-06,
"loss": 1.3493,
"step": 762
},
{
"epoch": 0.20417447150120419,
"grad_norm": 4.2447943687438965,
"learning_rate": 8.160427807486632e-06,
"loss": 1.1482,
"step": 763
},
{
"epoch": 0.20444206582820446,
"grad_norm": 4.182919025421143,
"learning_rate": 8.171122994652407e-06,
"loss": 1.0402,
"step": 764
},
{
"epoch": 0.2047096601552047,
"grad_norm": 4.690080165863037,
"learning_rate": 8.181818181818183e-06,
"loss": 1.3051,
"step": 765
},
{
"epoch": 0.20497725448220497,
"grad_norm": 4.1133832931518555,
"learning_rate": 8.192513368983958e-06,
"loss": 1.0852,
"step": 766
},
{
"epoch": 0.20524484880920524,
"grad_norm": 4.17720365524292,
"learning_rate": 8.203208556149734e-06,
"loss": 1.0509,
"step": 767
},
{
"epoch": 0.2055124431362055,
"grad_norm": 4.893587589263916,
"learning_rate": 8.213903743315509e-06,
"loss": 1.2572,
"step": 768
},
{
"epoch": 0.20578003746320578,
"grad_norm": 4.39441442489624,
"learning_rate": 8.224598930481285e-06,
"loss": 1.0804,
"step": 769
},
{
"epoch": 0.20604763179020605,
"grad_norm": 4.479729652404785,
"learning_rate": 8.23529411764706e-06,
"loss": 1.129,
"step": 770
},
{
"epoch": 0.20631522611720632,
"grad_norm": 4.792821407318115,
"learning_rate": 8.245989304812834e-06,
"loss": 1.3325,
"step": 771
},
{
"epoch": 0.2065828204442066,
"grad_norm": 4.284221649169922,
"learning_rate": 8.256684491978611e-06,
"loss": 1.2608,
"step": 772
},
{
"epoch": 0.20685041477120686,
"grad_norm": 4.049210071563721,
"learning_rate": 8.267379679144386e-06,
"loss": 1.1589,
"step": 773
},
{
"epoch": 0.20711800909820713,
"grad_norm": 4.67439079284668,
"learning_rate": 8.27807486631016e-06,
"loss": 1.2496,
"step": 774
},
{
"epoch": 0.2073856034252074,
"grad_norm": 4.5758843421936035,
"learning_rate": 8.288770053475937e-06,
"loss": 1.1285,
"step": 775
},
{
"epoch": 0.20765319775220764,
"grad_norm": 4.632938861846924,
"learning_rate": 8.299465240641711e-06,
"loss": 1.3281,
"step": 776
},
{
"epoch": 0.2079207920792079,
"grad_norm": 4.83327054977417,
"learning_rate": 8.310160427807488e-06,
"loss": 1.2126,
"step": 777
},
{
"epoch": 0.20818838640620818,
"grad_norm": 4.5972137451171875,
"learning_rate": 8.320855614973262e-06,
"loss": 1.1829,
"step": 778
},
{
"epoch": 0.20845598073320845,
"grad_norm": 4.194045543670654,
"learning_rate": 8.331550802139037e-06,
"loss": 1.1207,
"step": 779
},
{
"epoch": 0.20872357506020872,
"grad_norm": 4.589977264404297,
"learning_rate": 8.342245989304813e-06,
"loss": 1.0869,
"step": 780
},
{
"epoch": 0.208991169387209,
"grad_norm": 4.733802795410156,
"learning_rate": 8.35294117647059e-06,
"loss": 1.2378,
"step": 781
},
{
"epoch": 0.20925876371420926,
"grad_norm": 4.47822380065918,
"learning_rate": 8.363636363636365e-06,
"loss": 1.1433,
"step": 782
},
{
"epoch": 0.20952635804120953,
"grad_norm": 3.875276803970337,
"learning_rate": 8.37433155080214e-06,
"loss": 1.0709,
"step": 783
},
{
"epoch": 0.2097939523682098,
"grad_norm": 4.252140522003174,
"learning_rate": 8.385026737967916e-06,
"loss": 1.1106,
"step": 784
},
{
"epoch": 0.21006154669521007,
"grad_norm": 4.29549503326416,
"learning_rate": 8.39572192513369e-06,
"loss": 1.1556,
"step": 785
},
{
"epoch": 0.21032914102221034,
"grad_norm": 4.358144760131836,
"learning_rate": 8.406417112299467e-06,
"loss": 1.053,
"step": 786
},
{
"epoch": 0.21059673534921058,
"grad_norm": 4.602996826171875,
"learning_rate": 8.417112299465241e-06,
"loss": 1.2131,
"step": 787
},
{
"epoch": 0.21086432967621085,
"grad_norm": 4.466192722320557,
"learning_rate": 8.427807486631016e-06,
"loss": 1.3099,
"step": 788
},
{
"epoch": 0.21113192400321112,
"grad_norm": 4.629776954650879,
"learning_rate": 8.438502673796792e-06,
"loss": 1.176,
"step": 789
},
{
"epoch": 0.2113995183302114,
"grad_norm": 4.807766437530518,
"learning_rate": 8.449197860962567e-06,
"loss": 1.2082,
"step": 790
},
{
"epoch": 0.21166711265721166,
"grad_norm": 4.741950035095215,
"learning_rate": 8.459893048128342e-06,
"loss": 1.228,
"step": 791
},
{
"epoch": 0.21193470698421193,
"grad_norm": 4.33003044128418,
"learning_rate": 8.470588235294118e-06,
"loss": 1.1136,
"step": 792
},
{
"epoch": 0.2122023013112122,
"grad_norm": 4.555398941040039,
"learning_rate": 8.481283422459895e-06,
"loss": 1.2896,
"step": 793
},
{
"epoch": 0.21246989563821247,
"grad_norm": 4.31208610534668,
"learning_rate": 8.49197860962567e-06,
"loss": 1.1487,
"step": 794
},
{
"epoch": 0.21273748996521274,
"grad_norm": 4.462785243988037,
"learning_rate": 8.502673796791444e-06,
"loss": 1.1853,
"step": 795
},
{
"epoch": 0.21300508429221301,
"grad_norm": 4.457045078277588,
"learning_rate": 8.51336898395722e-06,
"loss": 1.1769,
"step": 796
},
{
"epoch": 0.21327267861921329,
"grad_norm": 4.705628871917725,
"learning_rate": 8.524064171122995e-06,
"loss": 1.1599,
"step": 797
},
{
"epoch": 0.21354027294621353,
"grad_norm": 4.765135288238525,
"learning_rate": 8.534759358288771e-06,
"loss": 1.3051,
"step": 798
},
{
"epoch": 0.2138078672732138,
"grad_norm": 4.394601345062256,
"learning_rate": 8.545454545454546e-06,
"loss": 1.1349,
"step": 799
},
{
"epoch": 0.21407546160021407,
"grad_norm": 4.035240173339844,
"learning_rate": 8.556149732620321e-06,
"loss": 1.1051,
"step": 800
},
{
"epoch": 0.21434305592721434,
"grad_norm": 4.072005271911621,
"learning_rate": 8.566844919786097e-06,
"loss": 1.1045,
"step": 801
},
{
"epoch": 0.2146106502542146,
"grad_norm": 4.543212413787842,
"learning_rate": 8.577540106951872e-06,
"loss": 1.144,
"step": 802
},
{
"epoch": 0.21487824458121488,
"grad_norm": 4.204556941986084,
"learning_rate": 8.588235294117647e-06,
"loss": 1.2146,
"step": 803
},
{
"epoch": 0.21514583890821515,
"grad_norm": 3.9721314907073975,
"learning_rate": 8.598930481283423e-06,
"loss": 1.1586,
"step": 804
},
{
"epoch": 0.21541343323521542,
"grad_norm": 3.9580788612365723,
"learning_rate": 8.609625668449198e-06,
"loss": 1.0575,
"step": 805
},
{
"epoch": 0.2156810275622157,
"grad_norm": 4.39721155166626,
"learning_rate": 8.620320855614974e-06,
"loss": 1.2462,
"step": 806
},
{
"epoch": 0.21594862188921596,
"grad_norm": 4.285038471221924,
"learning_rate": 8.63101604278075e-06,
"loss": 1.1557,
"step": 807
},
{
"epoch": 0.21621621621621623,
"grad_norm": 4.489853382110596,
"learning_rate": 8.641711229946525e-06,
"loss": 1.3112,
"step": 808
},
{
"epoch": 0.21648381054321647,
"grad_norm": 4.501437664031982,
"learning_rate": 8.6524064171123e-06,
"loss": 1.1479,
"step": 809
},
{
"epoch": 0.21675140487021674,
"grad_norm": 4.656176567077637,
"learning_rate": 8.663101604278076e-06,
"loss": 1.2141,
"step": 810
},
{
"epoch": 0.217018999197217,
"grad_norm": 4.57153844833374,
"learning_rate": 8.673796791443851e-06,
"loss": 1.1898,
"step": 811
},
{
"epoch": 0.21728659352421728,
"grad_norm": 4.217146873474121,
"learning_rate": 8.684491978609626e-06,
"loss": 1.1277,
"step": 812
},
{
"epoch": 0.21755418785121755,
"grad_norm": 5.0501227378845215,
"learning_rate": 8.695187165775402e-06,
"loss": 1.2793,
"step": 813
},
{
"epoch": 0.21782178217821782,
"grad_norm": 4.156916618347168,
"learning_rate": 8.705882352941177e-06,
"loss": 1.0608,
"step": 814
},
{
"epoch": 0.2180893765052181,
"grad_norm": 4.722466468811035,
"learning_rate": 8.716577540106953e-06,
"loss": 1.2908,
"step": 815
},
{
"epoch": 0.21835697083221836,
"grad_norm": 4.382132053375244,
"learning_rate": 8.727272727272728e-06,
"loss": 1.1733,
"step": 816
},
{
"epoch": 0.21862456515921863,
"grad_norm": 4.8200225830078125,
"learning_rate": 8.737967914438502e-06,
"loss": 1.3151,
"step": 817
},
{
"epoch": 0.2188921594862189,
"grad_norm": 4.401098728179932,
"learning_rate": 8.748663101604279e-06,
"loss": 1.1091,
"step": 818
},
{
"epoch": 0.21915975381321917,
"grad_norm": 4.914200305938721,
"learning_rate": 8.759358288770055e-06,
"loss": 1.3693,
"step": 819
},
{
"epoch": 0.2194273481402194,
"grad_norm": 4.085461616516113,
"learning_rate": 8.77005347593583e-06,
"loss": 1.1384,
"step": 820
},
{
"epoch": 0.21969494246721968,
"grad_norm": 3.852440595626831,
"learning_rate": 8.780748663101605e-06,
"loss": 1.1228,
"step": 821
},
{
"epoch": 0.21996253679421995,
"grad_norm": 4.607455253601074,
"learning_rate": 8.791443850267381e-06,
"loss": 1.2419,
"step": 822
},
{
"epoch": 0.22023013112122022,
"grad_norm": 4.384522438049316,
"learning_rate": 8.802139037433156e-06,
"loss": 1.3108,
"step": 823
},
{
"epoch": 0.2204977254482205,
"grad_norm": 4.342321872711182,
"learning_rate": 8.81283422459893e-06,
"loss": 1.1794,
"step": 824
},
{
"epoch": 0.22076531977522076,
"grad_norm": 4.432126045227051,
"learning_rate": 8.823529411764707e-06,
"loss": 1.1947,
"step": 825
},
{
"epoch": 0.22103291410222103,
"grad_norm": 4.4877777099609375,
"learning_rate": 8.834224598930481e-06,
"loss": 1.2243,
"step": 826
},
{
"epoch": 0.2213005084292213,
"grad_norm": 4.3614325523376465,
"learning_rate": 8.844919786096258e-06,
"loss": 1.2309,
"step": 827
},
{
"epoch": 0.22156810275622157,
"grad_norm": 4.3788580894470215,
"learning_rate": 8.855614973262033e-06,
"loss": 1.1391,
"step": 828
},
{
"epoch": 0.22183569708322184,
"grad_norm": 4.09984016418457,
"learning_rate": 8.866310160427807e-06,
"loss": 1.1843,
"step": 829
},
{
"epoch": 0.22210329141022211,
"grad_norm": 4.093768119812012,
"learning_rate": 8.877005347593584e-06,
"loss": 1.1382,
"step": 830
},
{
"epoch": 0.22237088573722238,
"grad_norm": 4.717266082763672,
"learning_rate": 8.88770053475936e-06,
"loss": 1.3751,
"step": 831
},
{
"epoch": 0.22263848006422263,
"grad_norm": 4.382028579711914,
"learning_rate": 8.898395721925135e-06,
"loss": 1.2114,
"step": 832
},
{
"epoch": 0.2229060743912229,
"grad_norm": 4.509121894836426,
"learning_rate": 8.90909090909091e-06,
"loss": 1.2096,
"step": 833
},
{
"epoch": 0.22317366871822317,
"grad_norm": 4.2888078689575195,
"learning_rate": 8.919786096256686e-06,
"loss": 1.2023,
"step": 834
},
{
"epoch": 0.22344126304522344,
"grad_norm": 3.797525405883789,
"learning_rate": 8.93048128342246e-06,
"loss": 1.1453,
"step": 835
},
{
"epoch": 0.2237088573722237,
"grad_norm": 3.918774127960205,
"learning_rate": 8.941176470588237e-06,
"loss": 1.0776,
"step": 836
},
{
"epoch": 0.22397645169922398,
"grad_norm": 4.301737308502197,
"learning_rate": 8.951871657754012e-06,
"loss": 1.1801,
"step": 837
},
{
"epoch": 0.22424404602622425,
"grad_norm": 4.121411323547363,
"learning_rate": 8.962566844919786e-06,
"loss": 1.0812,
"step": 838
},
{
"epoch": 0.22451164035322452,
"grad_norm": 4.318382740020752,
"learning_rate": 8.973262032085563e-06,
"loss": 1.1899,
"step": 839
},
{
"epoch": 0.2247792346802248,
"grad_norm": 4.362233638763428,
"learning_rate": 8.983957219251337e-06,
"loss": 1.137,
"step": 840
},
{
"epoch": 0.22504682900722506,
"grad_norm": 4.285608291625977,
"learning_rate": 8.994652406417112e-06,
"loss": 1.1922,
"step": 841
},
{
"epoch": 0.22531442333422533,
"grad_norm": 4.41885232925415,
"learning_rate": 9.005347593582888e-06,
"loss": 1.2026,
"step": 842
},
{
"epoch": 0.22558201766122557,
"grad_norm": 4.712429046630859,
"learning_rate": 9.016042780748663e-06,
"loss": 1.2112,
"step": 843
},
{
"epoch": 0.22584961198822584,
"grad_norm": 3.9474940299987793,
"learning_rate": 9.02673796791444e-06,
"loss": 1.0856,
"step": 844
},
{
"epoch": 0.2261172063152261,
"grad_norm": 4.865321159362793,
"learning_rate": 9.037433155080214e-06,
"loss": 1.2806,
"step": 845
},
{
"epoch": 0.22638480064222638,
"grad_norm": 4.013378620147705,
"learning_rate": 9.04812834224599e-06,
"loss": 1.112,
"step": 846
},
{
"epoch": 0.22665239496922665,
"grad_norm": 4.2192702293396,
"learning_rate": 9.058823529411765e-06,
"loss": 1.2246,
"step": 847
},
{
"epoch": 0.22691998929622692,
"grad_norm": 4.709174633026123,
"learning_rate": 9.069518716577542e-06,
"loss": 1.2746,
"step": 848
},
{
"epoch": 0.2271875836232272,
"grad_norm": 4.175418376922607,
"learning_rate": 9.080213903743316e-06,
"loss": 1.1651,
"step": 849
},
{
"epoch": 0.22745517795022746,
"grad_norm": 4.398164749145508,
"learning_rate": 9.090909090909091e-06,
"loss": 1.2002,
"step": 850
},
{
"epoch": 0.22772277227722773,
"grad_norm": 4.27931022644043,
"learning_rate": 9.101604278074867e-06,
"loss": 1.1041,
"step": 851
},
{
"epoch": 0.227990366604228,
"grad_norm": 4.752706050872803,
"learning_rate": 9.112299465240642e-06,
"loss": 1.3382,
"step": 852
},
{
"epoch": 0.22825796093122827,
"grad_norm": 4.658750057220459,
"learning_rate": 9.122994652406418e-06,
"loss": 1.2518,
"step": 853
},
{
"epoch": 0.2285255552582285,
"grad_norm": 4.37801456451416,
"learning_rate": 9.133689839572193e-06,
"loss": 1.2284,
"step": 854
},
{
"epoch": 0.22879314958522878,
"grad_norm": 4.360160827636719,
"learning_rate": 9.144385026737968e-06,
"loss": 1.107,
"step": 855
},
{
"epoch": 0.22906074391222905,
"grad_norm": 4.552803993225098,
"learning_rate": 9.155080213903744e-06,
"loss": 1.2773,
"step": 856
},
{
"epoch": 0.22932833823922932,
"grad_norm": 4.319884300231934,
"learning_rate": 9.16577540106952e-06,
"loss": 1.3304,
"step": 857
},
{
"epoch": 0.2295959325662296,
"grad_norm": 4.7817840576171875,
"learning_rate": 9.176470588235294e-06,
"loss": 1.3394,
"step": 858
},
{
"epoch": 0.22986352689322986,
"grad_norm": 3.597621202468872,
"learning_rate": 9.18716577540107e-06,
"loss": 1.0479,
"step": 859
},
{
"epoch": 0.23013112122023013,
"grad_norm": 4.924500465393066,
"learning_rate": 9.197860962566846e-06,
"loss": 1.2405,
"step": 860
},
{
"epoch": 0.2303987155472304,
"grad_norm": 4.659447193145752,
"learning_rate": 9.208556149732621e-06,
"loss": 1.2231,
"step": 861
},
{
"epoch": 0.23066630987423067,
"grad_norm": 4.4317145347595215,
"learning_rate": 9.219251336898396e-06,
"loss": 1.2094,
"step": 862
},
{
"epoch": 0.23093390420123094,
"grad_norm": 3.976191520690918,
"learning_rate": 9.229946524064172e-06,
"loss": 1.0144,
"step": 863
},
{
"epoch": 0.23120149852823121,
"grad_norm": 4.48732852935791,
"learning_rate": 9.240641711229947e-06,
"loss": 1.0518,
"step": 864
},
{
"epoch": 0.23146909285523146,
"grad_norm": 4.009017467498779,
"learning_rate": 9.251336898395723e-06,
"loss": 1.1445,
"step": 865
},
{
"epoch": 0.23173668718223173,
"grad_norm": 4.176751136779785,
"learning_rate": 9.262032085561498e-06,
"loss": 1.1074,
"step": 866
},
{
"epoch": 0.232004281509232,
"grad_norm": 4.7490763664245605,
"learning_rate": 9.272727272727273e-06,
"loss": 1.351,
"step": 867
},
{
"epoch": 0.23227187583623227,
"grad_norm": 4.492088794708252,
"learning_rate": 9.283422459893049e-06,
"loss": 1.2427,
"step": 868
},
{
"epoch": 0.23253947016323254,
"grad_norm": 3.9468204975128174,
"learning_rate": 9.294117647058824e-06,
"loss": 1.0236,
"step": 869
},
{
"epoch": 0.2328070644902328,
"grad_norm": 4.703409194946289,
"learning_rate": 9.3048128342246e-06,
"loss": 1.1154,
"step": 870
},
{
"epoch": 0.23307465881723308,
"grad_norm": 4.1995110511779785,
"learning_rate": 9.315508021390375e-06,
"loss": 1.1273,
"step": 871
},
{
"epoch": 0.23334225314423335,
"grad_norm": 4.209486484527588,
"learning_rate": 9.326203208556151e-06,
"loss": 1.1375,
"step": 872
},
{
"epoch": 0.23360984747123362,
"grad_norm": 3.9918205738067627,
"learning_rate": 9.336898395721926e-06,
"loss": 1.1212,
"step": 873
},
{
"epoch": 0.2338774417982339,
"grad_norm": 4.315709114074707,
"learning_rate": 9.347593582887702e-06,
"loss": 1.1351,
"step": 874
},
{
"epoch": 0.23414503612523416,
"grad_norm": 4.223841190338135,
"learning_rate": 9.358288770053477e-06,
"loss": 1.224,
"step": 875
},
{
"epoch": 0.2344126304522344,
"grad_norm": 4.296685218811035,
"learning_rate": 9.368983957219252e-06,
"loss": 1.1524,
"step": 876
},
{
"epoch": 0.23468022477923467,
"grad_norm": 4.791153430938721,
"learning_rate": 9.379679144385028e-06,
"loss": 1.315,
"step": 877
},
{
"epoch": 0.23494781910623494,
"grad_norm": 4.414406776428223,
"learning_rate": 9.390374331550803e-06,
"loss": 1.2116,
"step": 878
},
{
"epoch": 0.2352154134332352,
"grad_norm": 4.986870288848877,
"learning_rate": 9.401069518716577e-06,
"loss": 1.2292,
"step": 879
},
{
"epoch": 0.23548300776023548,
"grad_norm": 4.407514572143555,
"learning_rate": 9.411764705882354e-06,
"loss": 1.3158,
"step": 880
},
{
"epoch": 0.23575060208723575,
"grad_norm": 4.413543701171875,
"learning_rate": 9.422459893048129e-06,
"loss": 0.968,
"step": 881
},
{
"epoch": 0.23601819641423602,
"grad_norm": 4.498653411865234,
"learning_rate": 9.433155080213905e-06,
"loss": 1.1329,
"step": 882
},
{
"epoch": 0.2362857907412363,
"grad_norm": 4.2039313316345215,
"learning_rate": 9.44385026737968e-06,
"loss": 1.1976,
"step": 883
},
{
"epoch": 0.23655338506823656,
"grad_norm": 4.075275421142578,
"learning_rate": 9.454545454545456e-06,
"loss": 1.1194,
"step": 884
},
{
"epoch": 0.23682097939523683,
"grad_norm": 4.131809234619141,
"learning_rate": 9.46524064171123e-06,
"loss": 1.2058,
"step": 885
},
{
"epoch": 0.2370885737222371,
"grad_norm": 4.1411824226379395,
"learning_rate": 9.475935828877007e-06,
"loss": 1.1203,
"step": 886
},
{
"epoch": 0.23735616804923734,
"grad_norm": 5.270638942718506,
"learning_rate": 9.486631016042782e-06,
"loss": 1.2022,
"step": 887
},
{
"epoch": 0.2376237623762376,
"grad_norm": 4.125979900360107,
"learning_rate": 9.497326203208556e-06,
"loss": 1.0407,
"step": 888
},
{
"epoch": 0.23789135670323788,
"grad_norm": 4.254225730895996,
"learning_rate": 9.508021390374333e-06,
"loss": 1.1919,
"step": 889
},
{
"epoch": 0.23815895103023815,
"grad_norm": 4.1460723876953125,
"learning_rate": 9.518716577540108e-06,
"loss": 1.1937,
"step": 890
},
{
"epoch": 0.23842654535723842,
"grad_norm": 4.267801761627197,
"learning_rate": 9.529411764705882e-06,
"loss": 1.2268,
"step": 891
},
{
"epoch": 0.2386941396842387,
"grad_norm": 4.095164775848389,
"learning_rate": 9.540106951871659e-06,
"loss": 1.2323,
"step": 892
},
{
"epoch": 0.23896173401123896,
"grad_norm": 4.400330066680908,
"learning_rate": 9.550802139037433e-06,
"loss": 1.3224,
"step": 893
},
{
"epoch": 0.23922932833823923,
"grad_norm": 4.906595706939697,
"learning_rate": 9.56149732620321e-06,
"loss": 1.3625,
"step": 894
},
{
"epoch": 0.2394969226652395,
"grad_norm": 4.529881000518799,
"learning_rate": 9.572192513368986e-06,
"loss": 1.1608,
"step": 895
},
{
"epoch": 0.23976451699223977,
"grad_norm": 4.229710102081299,
"learning_rate": 9.582887700534759e-06,
"loss": 1.1961,
"step": 896
},
{
"epoch": 0.24003211131924004,
"grad_norm": 4.66829776763916,
"learning_rate": 9.593582887700535e-06,
"loss": 1.2154,
"step": 897
},
{
"epoch": 0.24029970564624029,
"grad_norm": 4.366943836212158,
"learning_rate": 9.604278074866312e-06,
"loss": 1.1817,
"step": 898
},
{
"epoch": 0.24056729997324056,
"grad_norm": 4.251003265380859,
"learning_rate": 9.614973262032087e-06,
"loss": 1.3212,
"step": 899
},
{
"epoch": 0.24083489430024083,
"grad_norm": 5.345521450042725,
"learning_rate": 9.625668449197861e-06,
"loss": 1.1808,
"step": 900
},
{
"epoch": 0.2411024886272411,
"grad_norm": 4.079299449920654,
"learning_rate": 9.636363636363638e-06,
"loss": 1.1816,
"step": 901
},
{
"epoch": 0.24137008295424137,
"grad_norm": 4.181840896606445,
"learning_rate": 9.647058823529412e-06,
"loss": 1.141,
"step": 902
},
{
"epoch": 0.24163767728124164,
"grad_norm": 4.736073017120361,
"learning_rate": 9.657754010695189e-06,
"loss": 1.0541,
"step": 903
},
{
"epoch": 0.2419052716082419,
"grad_norm": 4.228132724761963,
"learning_rate": 9.668449197860963e-06,
"loss": 1.0897,
"step": 904
},
{
"epoch": 0.24217286593524218,
"grad_norm": 4.429383277893066,
"learning_rate": 9.679144385026738e-06,
"loss": 1.1879,
"step": 905
},
{
"epoch": 0.24244046026224245,
"grad_norm": 4.360840320587158,
"learning_rate": 9.689839572192514e-06,
"loss": 1.1828,
"step": 906
},
{
"epoch": 0.24270805458924272,
"grad_norm": 4.852614879608154,
"learning_rate": 9.700534759358289e-06,
"loss": 1.2815,
"step": 907
},
{
"epoch": 0.242975648916243,
"grad_norm": 4.6722846031188965,
"learning_rate": 9.711229946524064e-06,
"loss": 1.3035,
"step": 908
},
{
"epoch": 0.24324324324324326,
"grad_norm": 4.601790904998779,
"learning_rate": 9.72192513368984e-06,
"loss": 1.2348,
"step": 909
},
{
"epoch": 0.2435108375702435,
"grad_norm": 4.581474781036377,
"learning_rate": 9.732620320855617e-06,
"loss": 1.2717,
"step": 910
},
{
"epoch": 0.24377843189724377,
"grad_norm": 4.073735237121582,
"learning_rate": 9.743315508021391e-06,
"loss": 1.2133,
"step": 911
},
{
"epoch": 0.24404602622424404,
"grad_norm": 4.351081848144531,
"learning_rate": 9.754010695187166e-06,
"loss": 1.1797,
"step": 912
},
{
"epoch": 0.2443136205512443,
"grad_norm": 3.7765159606933594,
"learning_rate": 9.764705882352942e-06,
"loss": 1.163,
"step": 913
},
{
"epoch": 0.24458121487824458,
"grad_norm": 4.727344512939453,
"learning_rate": 9.775401069518717e-06,
"loss": 1.3226,
"step": 914
},
{
"epoch": 0.24484880920524485,
"grad_norm": 4.661051273345947,
"learning_rate": 9.786096256684493e-06,
"loss": 1.1071,
"step": 915
},
{
"epoch": 0.24511640353224512,
"grad_norm": 4.205208778381348,
"learning_rate": 9.796791443850268e-06,
"loss": 1.1642,
"step": 916
},
{
"epoch": 0.2453839978592454,
"grad_norm": 4.339627265930176,
"learning_rate": 9.807486631016043e-06,
"loss": 1.0605,
"step": 917
},
{
"epoch": 0.24565159218624566,
"grad_norm": 4.589977741241455,
"learning_rate": 9.81818181818182e-06,
"loss": 1.2584,
"step": 918
},
{
"epoch": 0.24591918651324593,
"grad_norm": 4.377978801727295,
"learning_rate": 9.828877005347594e-06,
"loss": 1.303,
"step": 919
},
{
"epoch": 0.2461867808402462,
"grad_norm": 4.110877513885498,
"learning_rate": 9.83957219251337e-06,
"loss": 1.1833,
"step": 920
},
{
"epoch": 0.24645437516724644,
"grad_norm": 4.5038743019104,
"learning_rate": 9.850267379679145e-06,
"loss": 1.2471,
"step": 921
},
{
"epoch": 0.2467219694942467,
"grad_norm": 4.485939025878906,
"learning_rate": 9.86096256684492e-06,
"loss": 1.1391,
"step": 922
},
{
"epoch": 0.24698956382124698,
"grad_norm": 4.139279365539551,
"learning_rate": 9.871657754010696e-06,
"loss": 1.1146,
"step": 923
},
{
"epoch": 0.24725715814824725,
"grad_norm": 3.7969651222229004,
"learning_rate": 9.882352941176472e-06,
"loss": 1.0658,
"step": 924
},
{
"epoch": 0.24752475247524752,
"grad_norm": 3.978060483932495,
"learning_rate": 9.893048128342247e-06,
"loss": 1.1051,
"step": 925
},
{
"epoch": 0.2477923468022478,
"grad_norm": 4.101005554199219,
"learning_rate": 9.903743315508022e-06,
"loss": 1.1725,
"step": 926
},
{
"epoch": 0.24805994112924806,
"grad_norm": 3.89359188079834,
"learning_rate": 9.914438502673798e-06,
"loss": 1.1501,
"step": 927
},
{
"epoch": 0.24832753545624833,
"grad_norm": 4.291905879974365,
"learning_rate": 9.925133689839573e-06,
"loss": 1.2188,
"step": 928
},
{
"epoch": 0.2485951297832486,
"grad_norm": 4.537034034729004,
"learning_rate": 9.935828877005348e-06,
"loss": 1.2074,
"step": 929
},
{
"epoch": 0.24886272411024887,
"grad_norm": 4.324453830718994,
"learning_rate": 9.946524064171124e-06,
"loss": 1.1561,
"step": 930
},
{
"epoch": 0.24913031843724914,
"grad_norm": 4.010372638702393,
"learning_rate": 9.957219251336899e-06,
"loss": 1.156,
"step": 931
},
{
"epoch": 0.24939791276424939,
"grad_norm": 4.636694431304932,
"learning_rate": 9.967914438502675e-06,
"loss": 1.3193,
"step": 932
},
{
"epoch": 0.24966550709124966,
"grad_norm": 4.060527324676514,
"learning_rate": 9.97860962566845e-06,
"loss": 1.2258,
"step": 933
},
{
"epoch": 0.24993310141824993,
"grad_norm": 4.463012218475342,
"learning_rate": 9.989304812834224e-06,
"loss": 1.1717,
"step": 934
},
{
"epoch": 0.2502006957452502,
"grad_norm": 3.8592917919158936,
"learning_rate": 1e-05,
"loss": 1.1587,
"step": 935
},
{
"epoch": 0.25046829007225047,
"grad_norm": 4.335379600524902,
"learning_rate": 9.999999921685345e-06,
"loss": 1.3002,
"step": 936
},
{
"epoch": 0.25073588439925076,
"grad_norm": 4.3303680419921875,
"learning_rate": 9.999999686741384e-06,
"loss": 1.1696,
"step": 937
},
{
"epoch": 0.251003478726251,
"grad_norm": 4.4580607414245605,
"learning_rate": 9.999999295168122e-06,
"loss": 1.2942,
"step": 938
},
{
"epoch": 0.25127107305325125,
"grad_norm": 3.974984884262085,
"learning_rate": 9.999998746965573e-06,
"loss": 1.1459,
"step": 939
},
{
"epoch": 0.25153866738025155,
"grad_norm": 4.089166164398193,
"learning_rate": 9.999998042133754e-06,
"loss": 1.2405,
"step": 940
},
{
"epoch": 0.2518062617072518,
"grad_norm": 4.120057582855225,
"learning_rate": 9.999997180672684e-06,
"loss": 1.1968,
"step": 941
},
{
"epoch": 0.2520738560342521,
"grad_norm": 3.840859889984131,
"learning_rate": 9.999996162582396e-06,
"loss": 1.144,
"step": 942
},
{
"epoch": 0.25234145036125233,
"grad_norm": 4.502830505371094,
"learning_rate": 9.999994987862916e-06,
"loss": 1.2564,
"step": 943
},
{
"epoch": 0.2526090446882526,
"grad_norm": 4.325287818908691,
"learning_rate": 9.999993656514284e-06,
"loss": 1.2646,
"step": 944
},
{
"epoch": 0.25287663901525287,
"grad_norm": 4.583348751068115,
"learning_rate": 9.999992168536542e-06,
"loss": 1.1912,
"step": 945
},
{
"epoch": 0.25314423334225317,
"grad_norm": 4.394077301025391,
"learning_rate": 9.999990523929734e-06,
"loss": 1.242,
"step": 946
},
{
"epoch": 0.2534118276692534,
"grad_norm": 4.394894599914551,
"learning_rate": 9.999988722693914e-06,
"loss": 1.1904,
"step": 947
},
{
"epoch": 0.2536794219962537,
"grad_norm": 4.380218982696533,
"learning_rate": 9.999986764829137e-06,
"loss": 1.3293,
"step": 948
},
{
"epoch": 0.25394701632325395,
"grad_norm": 4.508794784545898,
"learning_rate": 9.999984650335468e-06,
"loss": 1.2141,
"step": 949
},
{
"epoch": 0.2542146106502542,
"grad_norm": 4.407951831817627,
"learning_rate": 9.999982379212967e-06,
"loss": 1.1078,
"step": 950
},
{
"epoch": 0.2544822049772545,
"grad_norm": 5.039391040802002,
"learning_rate": 9.99997995146171e-06,
"loss": 1.3326,
"step": 951
},
{
"epoch": 0.25474979930425473,
"grad_norm": 4.499945640563965,
"learning_rate": 9.99997736708177e-06,
"loss": 1.3474,
"step": 952
},
{
"epoch": 0.25501739363125503,
"grad_norm": 4.34948205947876,
"learning_rate": 9.99997462607323e-06,
"loss": 1.1468,
"step": 953
},
{
"epoch": 0.25528498795825527,
"grad_norm": 4.453915596008301,
"learning_rate": 9.999971728436174e-06,
"loss": 1.4107,
"step": 954
},
{
"epoch": 0.25555258228525557,
"grad_norm": 4.159339904785156,
"learning_rate": 9.999968674170697e-06,
"loss": 1.2531,
"step": 955
},
{
"epoch": 0.2558201766122558,
"grad_norm": 4.395799160003662,
"learning_rate": 9.999965463276888e-06,
"loss": 1.372,
"step": 956
},
{
"epoch": 0.2560877709392561,
"grad_norm": 4.233826637268066,
"learning_rate": 9.999962095754854e-06,
"loss": 1.1476,
"step": 957
},
{
"epoch": 0.25635536526625635,
"grad_norm": 4.740029335021973,
"learning_rate": 9.999958571604697e-06,
"loss": 1.2241,
"step": 958
},
{
"epoch": 0.25662295959325665,
"grad_norm": 4.2474846839904785,
"learning_rate": 9.999954890826528e-06,
"loss": 1.1625,
"step": 959
},
{
"epoch": 0.2568905539202569,
"grad_norm": 4.043703556060791,
"learning_rate": 9.99995105342046e-06,
"loss": 1.1664,
"step": 960
},
{
"epoch": 0.25715814824725713,
"grad_norm": 4.318393707275391,
"learning_rate": 9.99994705938662e-06,
"loss": 1.2221,
"step": 961
},
{
"epoch": 0.25742574257425743,
"grad_norm": 4.372133731842041,
"learning_rate": 9.999942908725127e-06,
"loss": 1.1626,
"step": 962
},
{
"epoch": 0.2576933369012577,
"grad_norm": 4.380350589752197,
"learning_rate": 9.999938601436111e-06,
"loss": 1.1594,
"step": 963
},
{
"epoch": 0.257960931228258,
"grad_norm": 4.29257345199585,
"learning_rate": 9.999934137519711e-06,
"loss": 1.1906,
"step": 964
},
{
"epoch": 0.2582285255552582,
"grad_norm": 3.729611873626709,
"learning_rate": 9.999929516976063e-06,
"loss": 0.9909,
"step": 965
},
{
"epoch": 0.2584961198822585,
"grad_norm": 9.831878662109375,
"learning_rate": 9.999924739805313e-06,
"loss": 1.2233,
"step": 966
},
{
"epoch": 0.25876371420925875,
"grad_norm": 4.3758544921875,
"learning_rate": 9.999919806007612e-06,
"loss": 1.3428,
"step": 967
},
{
"epoch": 0.25903130853625905,
"grad_norm": 4.322572708129883,
"learning_rate": 9.999914715583114e-06,
"loss": 1.1024,
"step": 968
},
{
"epoch": 0.2592989028632593,
"grad_norm": 4.245995998382568,
"learning_rate": 9.999909468531977e-06,
"loss": 1.1555,
"step": 969
},
{
"epoch": 0.2595664971902596,
"grad_norm": 3.9678421020507812,
"learning_rate": 9.999904064854367e-06,
"loss": 1.0857,
"step": 970
},
{
"epoch": 0.25983409151725984,
"grad_norm": 4.337567329406738,
"learning_rate": 9.999898504550452e-06,
"loss": 1.2654,
"step": 971
},
{
"epoch": 0.2601016858442601,
"grad_norm": 4.172070503234863,
"learning_rate": 9.999892787620407e-06,
"loss": 1.1528,
"step": 972
},
{
"epoch": 0.2603692801712604,
"grad_norm": 4.25397253036499,
"learning_rate": 9.999886914064411e-06,
"loss": 1.1656,
"step": 973
},
{
"epoch": 0.2606368744982606,
"grad_norm": 4.268383979797363,
"learning_rate": 9.999880883882647e-06,
"loss": 1.1387,
"step": 974
},
{
"epoch": 0.2609044688252609,
"grad_norm": 4.6072235107421875,
"learning_rate": 9.999874697075304e-06,
"loss": 1.1754,
"step": 975
},
{
"epoch": 0.26117206315226116,
"grad_norm": 4.203128814697266,
"learning_rate": 9.999868353642579e-06,
"loss": 1.2258,
"step": 976
},
{
"epoch": 0.26143965747926146,
"grad_norm": 3.7577922344207764,
"learning_rate": 9.999861853584666e-06,
"loss": 1.0342,
"step": 977
},
{
"epoch": 0.2617072518062617,
"grad_norm": 4.237786769866943,
"learning_rate": 9.999855196901773e-06,
"loss": 1.2248,
"step": 978
},
{
"epoch": 0.261974846133262,
"grad_norm": 4.332390785217285,
"learning_rate": 9.999848383594107e-06,
"loss": 1.2544,
"step": 979
},
{
"epoch": 0.26224244046026224,
"grad_norm": 4.468963146209717,
"learning_rate": 9.999841413661878e-06,
"loss": 1.1949,
"step": 980
},
{
"epoch": 0.26251003478726254,
"grad_norm": 4.479465484619141,
"learning_rate": 9.999834287105307e-06,
"loss": 1.1904,
"step": 981
},
{
"epoch": 0.2627776291142628,
"grad_norm": 4.083310127258301,
"learning_rate": 9.99982700392462e-06,
"loss": 1.1588,
"step": 982
},
{
"epoch": 0.263045223441263,
"grad_norm": 4.362917900085449,
"learning_rate": 9.999819564120042e-06,
"loss": 1.1189,
"step": 983
},
{
"epoch": 0.2633128177682633,
"grad_norm": 4.328512191772461,
"learning_rate": 9.999811967691805e-06,
"loss": 1.045,
"step": 984
},
{
"epoch": 0.26358041209526356,
"grad_norm": 4.410714149475098,
"learning_rate": 9.999804214640151e-06,
"loss": 1.157,
"step": 985
},
{
"epoch": 0.26384800642226386,
"grad_norm": 3.88704776763916,
"learning_rate": 9.999796304965318e-06,
"loss": 1.2426,
"step": 986
},
{
"epoch": 0.2641156007492641,
"grad_norm": 4.107239246368408,
"learning_rate": 9.999788238667558e-06,
"loss": 1.1627,
"step": 987
},
{
"epoch": 0.2643831950762644,
"grad_norm": 4.336009979248047,
"learning_rate": 9.999780015747122e-06,
"loss": 1.2247,
"step": 988
},
{
"epoch": 0.26465078940326464,
"grad_norm": 4.0795464515686035,
"learning_rate": 9.999771636204267e-06,
"loss": 1.1928,
"step": 989
},
{
"epoch": 0.26491838373026494,
"grad_norm": 4.309201717376709,
"learning_rate": 9.999763100039256e-06,
"loss": 1.1767,
"step": 990
},
{
"epoch": 0.2651859780572652,
"grad_norm": 4.34153938293457,
"learning_rate": 9.999754407252356e-06,
"loss": 1.3238,
"step": 991
},
{
"epoch": 0.2654535723842655,
"grad_norm": 4.0108842849731445,
"learning_rate": 9.99974555784384e-06,
"loss": 1.0851,
"step": 992
},
{
"epoch": 0.2657211667112657,
"grad_norm": 4.6918768882751465,
"learning_rate": 9.999736551813986e-06,
"loss": 1.24,
"step": 993
},
{
"epoch": 0.26598876103826596,
"grad_norm": 4.068446636199951,
"learning_rate": 9.999727389163074e-06,
"loss": 1.1031,
"step": 994
},
{
"epoch": 0.26625635536526626,
"grad_norm": 4.262712478637695,
"learning_rate": 9.999718069891392e-06,
"loss": 1.274,
"step": 995
},
{
"epoch": 0.2665239496922665,
"grad_norm": 4.3243889808654785,
"learning_rate": 9.999708593999234e-06,
"loss": 1.2472,
"step": 996
},
{
"epoch": 0.2667915440192668,
"grad_norm": 4.188782691955566,
"learning_rate": 9.999698961486892e-06,
"loss": 1.2658,
"step": 997
},
{
"epoch": 0.26705913834626704,
"grad_norm": 3.9021859169006348,
"learning_rate": 9.999689172354672e-06,
"loss": 1.0972,
"step": 998
},
{
"epoch": 0.26732673267326734,
"grad_norm": 4.386773586273193,
"learning_rate": 9.999679226602878e-06,
"loss": 1.1707,
"step": 999
},
{
"epoch": 0.2675943270002676,
"grad_norm": 4.411870956420898,
"learning_rate": 9.999669124231824e-06,
"loss": 1.188,
"step": 1000
},
{
"epoch": 0.2675943270002676,
"eval_loss": 1.216786503791809,
"eval_runtime": 11.6813,
"eval_samples_per_second": 34.243,
"eval_steps_per_second": 4.28,
"step": 1000
},
{
"epoch": 0.2678619213272679,
"grad_norm": 4.481796741485596,
"learning_rate": 9.999658865241827e-06,
"loss": 1.2667,
"step": 1001
},
{
"epoch": 0.2681295156542681,
"grad_norm": 3.979875087738037,
"learning_rate": 9.999648449633204e-06,
"loss": 1.1305,
"step": 1002
},
{
"epoch": 0.2683971099812684,
"grad_norm": 3.8728244304656982,
"learning_rate": 9.999637877406284e-06,
"loss": 1.1231,
"step": 1003
},
{
"epoch": 0.26866470430826866,
"grad_norm": 4.459341526031494,
"learning_rate": 9.999627148561399e-06,
"loss": 1.1543,
"step": 1004
},
{
"epoch": 0.2689322986352689,
"grad_norm": 4.173006534576416,
"learning_rate": 9.999616263098886e-06,
"loss": 1.2025,
"step": 1005
},
{
"epoch": 0.2691998929622692,
"grad_norm": 4.177968978881836,
"learning_rate": 9.999605221019082e-06,
"loss": 1.194,
"step": 1006
},
{
"epoch": 0.26946748728926945,
"grad_norm": 4.970066547393799,
"learning_rate": 9.999594022322334e-06,
"loss": 1.2869,
"step": 1007
},
{
"epoch": 0.26973508161626975,
"grad_norm": 4.600182056427002,
"learning_rate": 9.999582667008995e-06,
"loss": 1.3119,
"step": 1008
},
{
"epoch": 0.27000267594327,
"grad_norm": 4.465086460113525,
"learning_rate": 9.999571155079422e-06,
"loss": 1.1683,
"step": 1009
},
{
"epoch": 0.2702702702702703,
"grad_norm": 4.228415012359619,
"learning_rate": 9.999559486533971e-06,
"loss": 1.1939,
"step": 1010
},
{
"epoch": 0.2705378645972705,
"grad_norm": 4.573855400085449,
"learning_rate": 9.99954766137301e-06,
"loss": 1.325,
"step": 1011
},
{
"epoch": 0.2708054589242708,
"grad_norm": 4.398594379425049,
"learning_rate": 9.99953567959691e-06,
"loss": 1.1402,
"step": 1012
},
{
"epoch": 0.27107305325127107,
"grad_norm": 4.025271892547607,
"learning_rate": 9.999523541206044e-06,
"loss": 1.1139,
"step": 1013
},
{
"epoch": 0.27134064757827137,
"grad_norm": 4.192676067352295,
"learning_rate": 9.999511246200795e-06,
"loss": 1.1699,
"step": 1014
},
{
"epoch": 0.2716082419052716,
"grad_norm": 4.185833930969238,
"learning_rate": 9.999498794581548e-06,
"loss": 1.1549,
"step": 1015
},
{
"epoch": 0.27187583623227185,
"grad_norm": 4.256872653961182,
"learning_rate": 9.99948618634869e-06,
"loss": 1.1454,
"step": 1016
},
{
"epoch": 0.27214343055927215,
"grad_norm": 4.221078395843506,
"learning_rate": 9.99947342150262e-06,
"loss": 1.1097,
"step": 1017
},
{
"epoch": 0.2724110248862724,
"grad_norm": 4.532137870788574,
"learning_rate": 9.999460500043734e-06,
"loss": 1.341,
"step": 1018
},
{
"epoch": 0.2726786192132727,
"grad_norm": 3.931379795074463,
"learning_rate": 9.999447421972439e-06,
"loss": 1.1228,
"step": 1019
},
{
"epoch": 0.27294621354027293,
"grad_norm": 4.363259792327881,
"learning_rate": 9.999434187289145e-06,
"loss": 1.1633,
"step": 1020
},
{
"epoch": 0.27321380786727323,
"grad_norm": 3.8973734378814697,
"learning_rate": 9.999420795994266e-06,
"loss": 1.2312,
"step": 1021
},
{
"epoch": 0.27348140219427347,
"grad_norm": 4.490160942077637,
"learning_rate": 9.99940724808822e-06,
"loss": 1.2081,
"step": 1022
},
{
"epoch": 0.27374899652127377,
"grad_norm": 4.149991035461426,
"learning_rate": 9.999393543571434e-06,
"loss": 1.2413,
"step": 1023
},
{
"epoch": 0.274016590848274,
"grad_norm": 4.234299659729004,
"learning_rate": 9.999379682444338e-06,
"loss": 1.2663,
"step": 1024
},
{
"epoch": 0.2742841851752743,
"grad_norm": 4.101381301879883,
"learning_rate": 9.999365664707361e-06,
"loss": 1.1646,
"step": 1025
},
{
"epoch": 0.27455177950227455,
"grad_norm": 4.511719226837158,
"learning_rate": 9.999351490360947e-06,
"loss": 1.3628,
"step": 1026
},
{
"epoch": 0.2748193738292748,
"grad_norm": 4.125613212585449,
"learning_rate": 9.999337159405538e-06,
"loss": 1.1866,
"step": 1027
},
{
"epoch": 0.2750869681562751,
"grad_norm": 4.274496555328369,
"learning_rate": 9.999322671841583e-06,
"loss": 1.1889,
"step": 1028
},
{
"epoch": 0.27535456248327533,
"grad_norm": 4.035276889801025,
"learning_rate": 9.999308027669537e-06,
"loss": 1.096,
"step": 1029
},
{
"epoch": 0.27562215681027563,
"grad_norm": 4.041557312011719,
"learning_rate": 9.999293226889857e-06,
"loss": 1.1437,
"step": 1030
},
{
"epoch": 0.2758897511372759,
"grad_norm": 4.458560943603516,
"learning_rate": 9.999278269503008e-06,
"loss": 1.1608,
"step": 1031
},
{
"epoch": 0.27615734546427617,
"grad_norm": 3.992985486984253,
"learning_rate": 9.999263155509459e-06,
"loss": 1.0251,
"step": 1032
},
{
"epoch": 0.2764249397912764,
"grad_norm": 3.9736506938934326,
"learning_rate": 9.999247884909682e-06,
"loss": 1.2267,
"step": 1033
},
{
"epoch": 0.2766925341182767,
"grad_norm": 4.004456996917725,
"learning_rate": 9.999232457704155e-06,
"loss": 1.0958,
"step": 1034
},
{
"epoch": 0.27696012844527695,
"grad_norm": 4.022693157196045,
"learning_rate": 9.999216873893364e-06,
"loss": 1.1375,
"step": 1035
},
{
"epoch": 0.27722772277227725,
"grad_norm": 3.8458046913146973,
"learning_rate": 9.999201133477793e-06,
"loss": 1.1408,
"step": 1036
},
{
"epoch": 0.2774953170992775,
"grad_norm": 4.127901554107666,
"learning_rate": 9.999185236457941e-06,
"loss": 1.3119,
"step": 1037
},
{
"epoch": 0.27776291142627774,
"grad_norm": 4.242637634277344,
"learning_rate": 9.9991691828343e-06,
"loss": 1.1941,
"step": 1038
},
{
"epoch": 0.27803050575327803,
"grad_norm": 4.139479160308838,
"learning_rate": 9.999152972607377e-06,
"loss": 1.0765,
"step": 1039
},
{
"epoch": 0.2782981000802783,
"grad_norm": 4.560730457305908,
"learning_rate": 9.999136605777678e-06,
"loss": 1.4193,
"step": 1040
},
{
"epoch": 0.2785656944072786,
"grad_norm": 4.292839050292969,
"learning_rate": 9.999120082345714e-06,
"loss": 1.2548,
"step": 1041
},
{
"epoch": 0.2788332887342788,
"grad_norm": 4.288617134094238,
"learning_rate": 9.999103402312005e-06,
"loss": 1.2433,
"step": 1042
},
{
"epoch": 0.2791008830612791,
"grad_norm": 4.059001445770264,
"learning_rate": 9.999086565677075e-06,
"loss": 1.2556,
"step": 1043
},
{
"epoch": 0.27936847738827936,
"grad_norm": 4.238238334655762,
"learning_rate": 9.999069572441448e-06,
"loss": 1.2258,
"step": 1044
},
{
"epoch": 0.27963607171527965,
"grad_norm": 3.9122133255004883,
"learning_rate": 9.999052422605657e-06,
"loss": 1.1679,
"step": 1045
},
{
"epoch": 0.2799036660422799,
"grad_norm": 4.169795513153076,
"learning_rate": 9.999035116170241e-06,
"loss": 1.19,
"step": 1046
},
{
"epoch": 0.2801712603692802,
"grad_norm": 3.937116861343384,
"learning_rate": 9.999017653135744e-06,
"loss": 1.1576,
"step": 1047
},
{
"epoch": 0.28043885469628044,
"grad_norm": 4.014969348907471,
"learning_rate": 9.999000033502706e-06,
"loss": 1.2584,
"step": 1048
},
{
"epoch": 0.2807064490232807,
"grad_norm": 3.8742942810058594,
"learning_rate": 9.998982257271685e-06,
"loss": 1.0865,
"step": 1049
},
{
"epoch": 0.280974043350281,
"grad_norm": 4.11707878112793,
"learning_rate": 9.998964324443235e-06,
"loss": 1.2188,
"step": 1050
},
{
"epoch": 0.2812416376772812,
"grad_norm": 4.459323406219482,
"learning_rate": 9.998946235017918e-06,
"loss": 1.2243,
"step": 1051
},
{
"epoch": 0.2815092320042815,
"grad_norm": 4.1629815101623535,
"learning_rate": 9.998927988996303e-06,
"loss": 1.3085,
"step": 1052
},
{
"epoch": 0.28177682633128176,
"grad_norm": 4.150962829589844,
"learning_rate": 9.998909586378959e-06,
"loss": 1.1903,
"step": 1053
},
{
"epoch": 0.28204442065828206,
"grad_norm": 3.7340071201324463,
"learning_rate": 9.998891027166466e-06,
"loss": 1.0209,
"step": 1054
},
{
"epoch": 0.2823120149852823,
"grad_norm": 4.1432695388793945,
"learning_rate": 9.9988723113594e-06,
"loss": 1.1885,
"step": 1055
},
{
"epoch": 0.2825796093122826,
"grad_norm": 4.373791217803955,
"learning_rate": 9.998853438958352e-06,
"loss": 1.2612,
"step": 1056
},
{
"epoch": 0.28284720363928284,
"grad_norm": 4.626842021942139,
"learning_rate": 9.99883440996391e-06,
"loss": 1.3311,
"step": 1057
},
{
"epoch": 0.28311479796628314,
"grad_norm": 4.128498077392578,
"learning_rate": 9.998815224376672e-06,
"loss": 1.2753,
"step": 1058
},
{
"epoch": 0.2833823922932834,
"grad_norm": 4.729836463928223,
"learning_rate": 9.998795882197238e-06,
"loss": 1.3457,
"step": 1059
},
{
"epoch": 0.2836499866202836,
"grad_norm": 4.123654365539551,
"learning_rate": 9.998776383426217e-06,
"loss": 1.2035,
"step": 1060
},
{
"epoch": 0.2839175809472839,
"grad_norm": 3.9608731269836426,
"learning_rate": 9.998756728064213e-06,
"loss": 1.1559,
"step": 1061
},
{
"epoch": 0.28418517527428416,
"grad_norm": 3.8772714138031006,
"learning_rate": 9.998736916111848e-06,
"loss": 1.1901,
"step": 1062
},
{
"epoch": 0.28445276960128446,
"grad_norm": 4.203121185302734,
"learning_rate": 9.998716947569741e-06,
"loss": 1.1789,
"step": 1063
},
{
"epoch": 0.2847203639282847,
"grad_norm": 4.262762069702148,
"learning_rate": 9.998696822438516e-06,
"loss": 1.3464,
"step": 1064
},
{
"epoch": 0.284987958255285,
"grad_norm": 4.056782245635986,
"learning_rate": 9.998676540718805e-06,
"loss": 1.1512,
"step": 1065
},
{
"epoch": 0.28525555258228524,
"grad_norm": 4.1949639320373535,
"learning_rate": 9.998656102411245e-06,
"loss": 1.2288,
"step": 1066
},
{
"epoch": 0.28552314690928554,
"grad_norm": 4.408857345581055,
"learning_rate": 9.99863550751647e-06,
"loss": 1.2456,
"step": 1067
},
{
"epoch": 0.2857907412362858,
"grad_norm": 4.202237606048584,
"learning_rate": 9.998614756035132e-06,
"loss": 1.2651,
"step": 1068
},
{
"epoch": 0.2860583355632861,
"grad_norm": 4.139695167541504,
"learning_rate": 9.998593847967877e-06,
"loss": 1.1924,
"step": 1069
},
{
"epoch": 0.2863259298902863,
"grad_norm": 4.123232841491699,
"learning_rate": 9.998572783315361e-06,
"loss": 1.2642,
"step": 1070
},
{
"epoch": 0.28659352421728657,
"grad_norm": 4.614407539367676,
"learning_rate": 9.998551562078245e-06,
"loss": 1.1987,
"step": 1071
},
{
"epoch": 0.28686111854428686,
"grad_norm": 4.054043769836426,
"learning_rate": 9.998530184257194e-06,
"loss": 1.1046,
"step": 1072
},
{
"epoch": 0.2871287128712871,
"grad_norm": 4.0423760414123535,
"learning_rate": 9.998508649852874e-06,
"loss": 1.1435,
"step": 1073
},
{
"epoch": 0.2873963071982874,
"grad_norm": 4.187506198883057,
"learning_rate": 9.998486958865965e-06,
"loss": 1.1171,
"step": 1074
},
{
"epoch": 0.28766390152528765,
"grad_norm": 4.307306289672852,
"learning_rate": 9.998465111297141e-06,
"loss": 1.1656,
"step": 1075
},
{
"epoch": 0.28793149585228794,
"grad_norm": 4.108502388000488,
"learning_rate": 9.99844310714709e-06,
"loss": 1.1522,
"step": 1076
},
{
"epoch": 0.2881990901792882,
"grad_norm": 4.7379069328308105,
"learning_rate": 9.9984209464165e-06,
"loss": 1.3547,
"step": 1077
},
{
"epoch": 0.2884666845062885,
"grad_norm": 4.607676029205322,
"learning_rate": 9.998398629106068e-06,
"loss": 1.2304,
"step": 1078
},
{
"epoch": 0.2887342788332887,
"grad_norm": 4.571547031402588,
"learning_rate": 9.998376155216487e-06,
"loss": 1.2825,
"step": 1079
},
{
"epoch": 0.289001873160289,
"grad_norm": 4.113447189331055,
"learning_rate": 9.998353524748468e-06,
"loss": 1.143,
"step": 1080
},
{
"epoch": 0.28926946748728927,
"grad_norm": 4.24326753616333,
"learning_rate": 9.998330737702714e-06,
"loss": 1.2782,
"step": 1081
},
{
"epoch": 0.28953706181428956,
"grad_norm": 3.839808464050293,
"learning_rate": 9.998307794079942e-06,
"loss": 1.1638,
"step": 1082
},
{
"epoch": 0.2898046561412898,
"grad_norm": 4.0532355308532715,
"learning_rate": 9.998284693880871e-06,
"loss": 1.2157,
"step": 1083
},
{
"epoch": 0.29007225046829005,
"grad_norm": 4.362560272216797,
"learning_rate": 9.998261437106223e-06,
"loss": 1.3047,
"step": 1084
},
{
"epoch": 0.29033984479529035,
"grad_norm": 4.096391201019287,
"learning_rate": 9.998238023756727e-06,
"loss": 1.3003,
"step": 1085
},
{
"epoch": 0.2906074391222906,
"grad_norm": 3.965895891189575,
"learning_rate": 9.998214453833118e-06,
"loss": 1.2066,
"step": 1086
},
{
"epoch": 0.2908750334492909,
"grad_norm": 4.827084541320801,
"learning_rate": 9.998190727336133e-06,
"loss": 1.3066,
"step": 1087
},
{
"epoch": 0.29114262777629113,
"grad_norm": 3.926433563232422,
"learning_rate": 9.998166844266515e-06,
"loss": 1.1789,
"step": 1088
},
{
"epoch": 0.2914102221032914,
"grad_norm": 4.2660651206970215,
"learning_rate": 9.998142804625011e-06,
"loss": 1.259,
"step": 1089
},
{
"epoch": 0.29167781643029167,
"grad_norm": 4.009738922119141,
"learning_rate": 9.998118608412378e-06,
"loss": 1.1624,
"step": 1090
},
{
"epoch": 0.29194541075729197,
"grad_norm": 4.329594612121582,
"learning_rate": 9.99809425562937e-06,
"loss": 1.2244,
"step": 1091
},
{
"epoch": 0.2922130050842922,
"grad_norm": 4.109816551208496,
"learning_rate": 9.998069746276752e-06,
"loss": 1.1706,
"step": 1092
},
{
"epoch": 0.2924805994112925,
"grad_norm": 4.28621244430542,
"learning_rate": 9.998045080355291e-06,
"loss": 1.2071,
"step": 1093
},
{
"epoch": 0.29274819373829275,
"grad_norm": 4.0972747802734375,
"learning_rate": 9.99802025786576e-06,
"loss": 1.1558,
"step": 1094
},
{
"epoch": 0.293015788065293,
"grad_norm": 4.451328277587891,
"learning_rate": 9.997995278808936e-06,
"loss": 1.3491,
"step": 1095
},
{
"epoch": 0.2932833823922933,
"grad_norm": 3.864147663116455,
"learning_rate": 9.997970143185603e-06,
"loss": 1.1395,
"step": 1096
},
{
"epoch": 0.29355097671929353,
"grad_norm": 4.177571773529053,
"learning_rate": 9.997944850996546e-06,
"loss": 1.319,
"step": 1097
},
{
"epoch": 0.29381857104629383,
"grad_norm": 3.75541615486145,
"learning_rate": 9.99791940224256e-06,
"loss": 1.1284,
"step": 1098
},
{
"epoch": 0.2940861653732941,
"grad_norm": 3.947469711303711,
"learning_rate": 9.99789379692444e-06,
"loss": 1.0627,
"step": 1099
},
{
"epoch": 0.29435375970029437,
"grad_norm": 4.161018371582031,
"learning_rate": 9.99786803504299e-06,
"loss": 1.2537,
"step": 1100
},
{
"epoch": 0.2946213540272946,
"grad_norm": 4.357724189758301,
"learning_rate": 9.997842116599014e-06,
"loss": 1.3133,
"step": 1101
},
{
"epoch": 0.2948889483542949,
"grad_norm": 4.223912715911865,
"learning_rate": 9.997816041593327e-06,
"loss": 1.2574,
"step": 1102
},
{
"epoch": 0.29515654268129515,
"grad_norm": 3.6964030265808105,
"learning_rate": 9.997789810026746e-06,
"loss": 1.0187,
"step": 1103
},
{
"epoch": 0.29542413700829545,
"grad_norm": 3.9817559719085693,
"learning_rate": 9.99776342190009e-06,
"loss": 1.1749,
"step": 1104
},
{
"epoch": 0.2956917313352957,
"grad_norm": 4.123600959777832,
"learning_rate": 9.997736877214187e-06,
"loss": 1.192,
"step": 1105
},
{
"epoch": 0.29595932566229594,
"grad_norm": 4.295464515686035,
"learning_rate": 9.99771017596987e-06,
"loss": 1.1886,
"step": 1106
},
{
"epoch": 0.29622691998929623,
"grad_norm": 4.501376628875732,
"learning_rate": 9.997683318167972e-06,
"loss": 1.2161,
"step": 1107
},
{
"epoch": 0.2964945143162965,
"grad_norm": 4.243162631988525,
"learning_rate": 9.997656303809338e-06,
"loss": 1.2048,
"step": 1108
},
{
"epoch": 0.2967621086432968,
"grad_norm": 4.504419326782227,
"learning_rate": 9.997629132894812e-06,
"loss": 1.263,
"step": 1109
},
{
"epoch": 0.297029702970297,
"grad_norm": 4.300513744354248,
"learning_rate": 9.997601805425246e-06,
"loss": 1.0954,
"step": 1110
},
{
"epoch": 0.2972972972972973,
"grad_norm": 4.057127952575684,
"learning_rate": 9.997574321401495e-06,
"loss": 1.1716,
"step": 1111
},
{
"epoch": 0.29756489162429756,
"grad_norm": 3.755995035171509,
"learning_rate": 9.997546680824422e-06,
"loss": 1.0806,
"step": 1112
},
{
"epoch": 0.29783248595129785,
"grad_norm": 4.294164180755615,
"learning_rate": 9.99751888369489e-06,
"loss": 1.358,
"step": 1113
},
{
"epoch": 0.2981000802782981,
"grad_norm": 3.7034502029418945,
"learning_rate": 9.997490930013773e-06,
"loss": 1.1258,
"step": 1114
},
{
"epoch": 0.2983676746052984,
"grad_norm": 4.4115071296691895,
"learning_rate": 9.997462819781944e-06,
"loss": 1.3023,
"step": 1115
},
{
"epoch": 0.29863526893229864,
"grad_norm": 4.0352678298950195,
"learning_rate": 9.997434553000286e-06,
"loss": 1.1215,
"step": 1116
},
{
"epoch": 0.2989028632592989,
"grad_norm": 3.9848623275756836,
"learning_rate": 9.997406129669682e-06,
"loss": 1.1101,
"step": 1117
},
{
"epoch": 0.2991704575862992,
"grad_norm": 4.004817962646484,
"learning_rate": 9.997377549791025e-06,
"loss": 1.2029,
"step": 1118
},
{
"epoch": 0.2994380519132994,
"grad_norm": 4.244535446166992,
"learning_rate": 9.997348813365207e-06,
"loss": 1.1389,
"step": 1119
},
{
"epoch": 0.2997056462402997,
"grad_norm": 4.067032337188721,
"learning_rate": 9.997319920393131e-06,
"loss": 1.0913,
"step": 1120
},
{
"epoch": 0.29997324056729996,
"grad_norm": 3.9365594387054443,
"learning_rate": 9.997290870875703e-06,
"loss": 1.1128,
"step": 1121
},
{
"epoch": 0.30024083489430026,
"grad_norm": 3.7220211029052734,
"learning_rate": 9.997261664813827e-06,
"loss": 1.0801,
"step": 1122
},
{
"epoch": 0.3005084292213005,
"grad_norm": 4.3840131759643555,
"learning_rate": 9.997232302208425e-06,
"loss": 1.2494,
"step": 1123
},
{
"epoch": 0.3007760235483008,
"grad_norm": 3.811455011367798,
"learning_rate": 9.997202783060413e-06,
"loss": 1.13,
"step": 1124
},
{
"epoch": 0.30104361787530104,
"grad_norm": 4.011319637298584,
"learning_rate": 9.997173107370717e-06,
"loss": 1.216,
"step": 1125
},
{
"epoch": 0.30131121220230134,
"grad_norm": 3.8559553623199463,
"learning_rate": 9.997143275140266e-06,
"loss": 1.1322,
"step": 1126
},
{
"epoch": 0.3015788065293016,
"grad_norm": 3.9331884384155273,
"learning_rate": 9.997113286369995e-06,
"loss": 1.132,
"step": 1127
},
{
"epoch": 0.3018464008563018,
"grad_norm": 3.7718307971954346,
"learning_rate": 9.997083141060842e-06,
"loss": 1.2221,
"step": 1128
},
{
"epoch": 0.3021139951833021,
"grad_norm": 4.190203666687012,
"learning_rate": 9.997052839213752e-06,
"loss": 1.2322,
"step": 1129
},
{
"epoch": 0.30238158951030236,
"grad_norm": 3.806379795074463,
"learning_rate": 9.997022380829677e-06,
"loss": 1.1844,
"step": 1130
},
{
"epoch": 0.30264918383730266,
"grad_norm": 3.8173298835754395,
"learning_rate": 9.996991765909568e-06,
"loss": 1.1185,
"step": 1131
},
{
"epoch": 0.3029167781643029,
"grad_norm": 4.177835464477539,
"learning_rate": 9.996960994454383e-06,
"loss": 1.2292,
"step": 1132
},
{
"epoch": 0.3031843724913032,
"grad_norm": 4.42379903793335,
"learning_rate": 9.996930066465091e-06,
"loss": 1.2661,
"step": 1133
},
{
"epoch": 0.30345196681830344,
"grad_norm": 4.7846455574035645,
"learning_rate": 9.996898981942655e-06,
"loss": 1.1785,
"step": 1134
},
{
"epoch": 0.30371956114530374,
"grad_norm": 4.796987533569336,
"learning_rate": 9.996867740888052e-06,
"loss": 1.2928,
"step": 1135
},
{
"epoch": 0.303987155472304,
"grad_norm": 4.014819145202637,
"learning_rate": 9.996836343302261e-06,
"loss": 1.1626,
"step": 1136
},
{
"epoch": 0.3042547497993043,
"grad_norm": 4.26397180557251,
"learning_rate": 9.996804789186263e-06,
"loss": 1.0987,
"step": 1137
},
{
"epoch": 0.3045223441263045,
"grad_norm": 4.477066516876221,
"learning_rate": 9.99677307854105e-06,
"loss": 1.28,
"step": 1138
},
{
"epoch": 0.30478993845330477,
"grad_norm": 4.38161039352417,
"learning_rate": 9.996741211367613e-06,
"loss": 1.2668,
"step": 1139
},
{
"epoch": 0.30505753278030506,
"grad_norm": 4.141867160797119,
"learning_rate": 9.996709187666951e-06,
"loss": 1.2651,
"step": 1140
},
{
"epoch": 0.3053251271073053,
"grad_norm": 3.891883134841919,
"learning_rate": 9.996677007440065e-06,
"loss": 1.2046,
"step": 1141
},
{
"epoch": 0.3055927214343056,
"grad_norm": 4.575502395629883,
"learning_rate": 9.996644670687966e-06,
"loss": 1.3873,
"step": 1142
},
{
"epoch": 0.30586031576130585,
"grad_norm": 3.6624574661254883,
"learning_rate": 9.996612177411667e-06,
"loss": 1.1507,
"step": 1143
},
{
"epoch": 0.30612791008830614,
"grad_norm": 4.047989845275879,
"learning_rate": 9.996579527612182e-06,
"loss": 1.1696,
"step": 1144
},
{
"epoch": 0.3063955044153064,
"grad_norm": 4.16288948059082,
"learning_rate": 9.99654672129054e-06,
"loss": 1.191,
"step": 1145
},
{
"epoch": 0.3066630987423067,
"grad_norm": 4.057130813598633,
"learning_rate": 9.996513758447764e-06,
"loss": 1.2684,
"step": 1146
},
{
"epoch": 0.3069306930693069,
"grad_norm": 4.151482582092285,
"learning_rate": 9.996480639084887e-06,
"loss": 1.3478,
"step": 1147
},
{
"epoch": 0.3071982873963072,
"grad_norm": 3.9425978660583496,
"learning_rate": 9.996447363202947e-06,
"loss": 1.1573,
"step": 1148
},
{
"epoch": 0.30746588172330747,
"grad_norm": 4.016078948974609,
"learning_rate": 9.996413930802988e-06,
"loss": 1.175,
"step": 1149
},
{
"epoch": 0.3077334760503077,
"grad_norm": 4.067404270172119,
"learning_rate": 9.996380341886055e-06,
"loss": 1.2151,
"step": 1150
},
{
"epoch": 0.308001070377308,
"grad_norm": 4.010457992553711,
"learning_rate": 9.996346596453202e-06,
"loss": 1.2175,
"step": 1151
},
{
"epoch": 0.30826866470430825,
"grad_norm": 4.103924751281738,
"learning_rate": 9.996312694505486e-06,
"loss": 1.1351,
"step": 1152
},
{
"epoch": 0.30853625903130855,
"grad_norm": 4.0794243812561035,
"learning_rate": 9.996278636043966e-06,
"loss": 1.1801,
"step": 1153
},
{
"epoch": 0.3088038533583088,
"grad_norm": 3.910602331161499,
"learning_rate": 9.996244421069714e-06,
"loss": 1.2453,
"step": 1154
},
{
"epoch": 0.3090714476853091,
"grad_norm": 4.31195068359375,
"learning_rate": 9.996210049583796e-06,
"loss": 1.1257,
"step": 1155
},
{
"epoch": 0.30933904201230933,
"grad_norm": 4.224134922027588,
"learning_rate": 9.996175521587294e-06,
"loss": 1.2855,
"step": 1156
},
{
"epoch": 0.3096066363393096,
"grad_norm": 4.1098198890686035,
"learning_rate": 9.996140837081288e-06,
"loss": 1.2321,
"step": 1157
},
{
"epoch": 0.30987423066630987,
"grad_norm": 4.49318265914917,
"learning_rate": 9.996105996066862e-06,
"loss": 1.2987,
"step": 1158
},
{
"epoch": 0.31014182499331017,
"grad_norm": 4.257841110229492,
"learning_rate": 9.99607099854511e-06,
"loss": 1.2069,
"step": 1159
},
{
"epoch": 0.3104094193203104,
"grad_norm": 4.5224385261535645,
"learning_rate": 9.996035844517129e-06,
"loss": 1.1976,
"step": 1160
},
{
"epoch": 0.31067701364731065,
"grad_norm": 4.277895927429199,
"learning_rate": 9.996000533984017e-06,
"loss": 1.2005,
"step": 1161
},
{
"epoch": 0.31094460797431095,
"grad_norm": 4.397223472595215,
"learning_rate": 9.995965066946885e-06,
"loss": 1.1852,
"step": 1162
},
{
"epoch": 0.3112122023013112,
"grad_norm": 4.2652764320373535,
"learning_rate": 9.995929443406838e-06,
"loss": 1.2549,
"step": 1163
},
{
"epoch": 0.3114797966283115,
"grad_norm": 4.212392807006836,
"learning_rate": 9.995893663364997e-06,
"loss": 1.1547,
"step": 1164
},
{
"epoch": 0.31174739095531173,
"grad_norm": 3.8345203399658203,
"learning_rate": 9.99585772682248e-06,
"loss": 1.0897,
"step": 1165
},
{
"epoch": 0.31201498528231203,
"grad_norm": 4.444775581359863,
"learning_rate": 9.995821633780413e-06,
"loss": 1.2715,
"step": 1166
},
{
"epoch": 0.31228257960931227,
"grad_norm": 4.151453971862793,
"learning_rate": 9.99578538423993e-06,
"loss": 1.1795,
"step": 1167
},
{
"epoch": 0.31255017393631257,
"grad_norm": 4.223361492156982,
"learning_rate": 9.99574897820216e-06,
"loss": 1.2359,
"step": 1168
},
{
"epoch": 0.3128177682633128,
"grad_norm": 4.0662841796875,
"learning_rate": 9.99571241566825e-06,
"loss": 1.1072,
"step": 1169
},
{
"epoch": 0.3130853625903131,
"grad_norm": 4.007144451141357,
"learning_rate": 9.99567569663934e-06,
"loss": 1.198,
"step": 1170
},
{
"epoch": 0.31335295691731335,
"grad_norm": 3.9480855464935303,
"learning_rate": 9.995638821116585e-06,
"loss": 1.0293,
"step": 1171
},
{
"epoch": 0.3136205512443136,
"grad_norm": 3.7751834392547607,
"learning_rate": 9.995601789101138e-06,
"loss": 1.0231,
"step": 1172
},
{
"epoch": 0.3138881455713139,
"grad_norm": 4.609216690063477,
"learning_rate": 9.995564600594159e-06,
"loss": 1.1539,
"step": 1173
},
{
"epoch": 0.31415573989831413,
"grad_norm": 4.05670166015625,
"learning_rate": 9.995527255596812e-06,
"loss": 1.1977,
"step": 1174
},
{
"epoch": 0.31442333422531443,
"grad_norm": 3.651618242263794,
"learning_rate": 9.995489754110268e-06,
"loss": 1.0947,
"step": 1175
},
{
"epoch": 0.3146909285523147,
"grad_norm": 4.308838844299316,
"learning_rate": 9.995452096135703e-06,
"loss": 1.1942,
"step": 1176
},
{
"epoch": 0.314958522879315,
"grad_norm": 3.8746747970581055,
"learning_rate": 9.995414281674294e-06,
"loss": 1.1572,
"step": 1177
},
{
"epoch": 0.3152261172063152,
"grad_norm": 4.089914321899414,
"learning_rate": 9.995376310727227e-06,
"loss": 1.2842,
"step": 1178
},
{
"epoch": 0.3154937115333155,
"grad_norm": 4.342733860015869,
"learning_rate": 9.995338183295693e-06,
"loss": 1.285,
"step": 1179
},
{
"epoch": 0.31576130586031576,
"grad_norm": 3.697603225708008,
"learning_rate": 9.995299899380884e-06,
"loss": 1.1125,
"step": 1180
},
{
"epoch": 0.31602890018731605,
"grad_norm": 4.422861099243164,
"learning_rate": 9.995261458983999e-06,
"loss": 1.2552,
"step": 1181
},
{
"epoch": 0.3162964945143163,
"grad_norm": 4.288775444030762,
"learning_rate": 9.995222862106245e-06,
"loss": 1.3295,
"step": 1182
},
{
"epoch": 0.31656408884131654,
"grad_norm": 3.8024377822875977,
"learning_rate": 9.995184108748827e-06,
"loss": 1.1542,
"step": 1183
},
{
"epoch": 0.31683168316831684,
"grad_norm": 4.05307674407959,
"learning_rate": 9.995145198912962e-06,
"loss": 1.1841,
"step": 1184
},
{
"epoch": 0.3170992774953171,
"grad_norm": 4.2776265144348145,
"learning_rate": 9.995106132599869e-06,
"loss": 1.1981,
"step": 1185
},
{
"epoch": 0.3173668718223174,
"grad_norm": 3.631357431411743,
"learning_rate": 9.995066909810771e-06,
"loss": 1.1741,
"step": 1186
},
{
"epoch": 0.3176344661493176,
"grad_norm": 3.9295742511749268,
"learning_rate": 9.995027530546895e-06,
"loss": 1.0733,
"step": 1187
},
{
"epoch": 0.3179020604763179,
"grad_norm": 3.839838743209839,
"learning_rate": 9.994987994809478e-06,
"loss": 1.2681,
"step": 1188
},
{
"epoch": 0.31816965480331816,
"grad_norm": 3.7129993438720703,
"learning_rate": 9.994948302599757e-06,
"loss": 1.0686,
"step": 1189
},
{
"epoch": 0.31843724913031846,
"grad_norm": 4.252348899841309,
"learning_rate": 9.994908453918973e-06,
"loss": 1.2652,
"step": 1190
},
{
"epoch": 0.3187048434573187,
"grad_norm": 4.2434916496276855,
"learning_rate": 9.994868448768378e-06,
"loss": 1.227,
"step": 1191
},
{
"epoch": 0.318972437784319,
"grad_norm": 3.613661766052246,
"learning_rate": 9.994828287149224e-06,
"loss": 1.0919,
"step": 1192
},
{
"epoch": 0.31924003211131924,
"grad_norm": 4.0223469734191895,
"learning_rate": 9.994787969062767e-06,
"loss": 1.161,
"step": 1193
},
{
"epoch": 0.3195076264383195,
"grad_norm": 3.802426815032959,
"learning_rate": 9.994747494510274e-06,
"loss": 1.2561,
"step": 1194
},
{
"epoch": 0.3197752207653198,
"grad_norm": 3.8129377365112305,
"learning_rate": 9.994706863493007e-06,
"loss": 1.1638,
"step": 1195
},
{
"epoch": 0.32004281509232,
"grad_norm": 4.0562872886657715,
"learning_rate": 9.994666076012245e-06,
"loss": 1.2713,
"step": 1196
},
{
"epoch": 0.3203104094193203,
"grad_norm": 4.090336322784424,
"learning_rate": 9.994625132069263e-06,
"loss": 1.1567,
"step": 1197
},
{
"epoch": 0.32057800374632056,
"grad_norm": 4.030067443847656,
"learning_rate": 9.994584031665345e-06,
"loss": 1.1686,
"step": 1198
},
{
"epoch": 0.32084559807332086,
"grad_norm": 4.26224422454834,
"learning_rate": 9.994542774801774e-06,
"loss": 1.1967,
"step": 1199
},
{
"epoch": 0.3211131924003211,
"grad_norm": 4.3625102043151855,
"learning_rate": 9.994501361479847e-06,
"loss": 1.2828,
"step": 1200
},
{
"epoch": 0.3213807867273214,
"grad_norm": 4.201301574707031,
"learning_rate": 9.99445979170086e-06,
"loss": 1.2463,
"step": 1201
},
{
"epoch": 0.32164838105432164,
"grad_norm": 3.624171733856201,
"learning_rate": 9.994418065466116e-06,
"loss": 1.1004,
"step": 1202
},
{
"epoch": 0.32191597538132194,
"grad_norm": 3.7936317920684814,
"learning_rate": 9.99437618277692e-06,
"loss": 1.0925,
"step": 1203
},
{
"epoch": 0.3221835697083222,
"grad_norm": 3.740135431289673,
"learning_rate": 9.994334143634587e-06,
"loss": 1.1698,
"step": 1204
},
{
"epoch": 0.3224511640353224,
"grad_norm": 4.0127458572387695,
"learning_rate": 9.994291948040429e-06,
"loss": 1.2089,
"step": 1205
},
{
"epoch": 0.3227187583623227,
"grad_norm": 4.131107807159424,
"learning_rate": 9.994249595995774e-06,
"loss": 1.1964,
"step": 1206
},
{
"epoch": 0.32298635268932296,
"grad_norm": 3.945056200027466,
"learning_rate": 9.994207087501945e-06,
"loss": 1.2649,
"step": 1207
},
{
"epoch": 0.32325394701632326,
"grad_norm": 4.262823581695557,
"learning_rate": 9.994164422560273e-06,
"loss": 1.2617,
"step": 1208
},
{
"epoch": 0.3235215413433235,
"grad_norm": 4.310561180114746,
"learning_rate": 9.994121601172097e-06,
"loss": 1.2077,
"step": 1209
},
{
"epoch": 0.3237891356703238,
"grad_norm": 4.025747299194336,
"learning_rate": 9.994078623338757e-06,
"loss": 1.1637,
"step": 1210
},
{
"epoch": 0.32405672999732404,
"grad_norm": 3.766697883605957,
"learning_rate": 9.9940354890616e-06,
"loss": 1.0568,
"step": 1211
},
{
"epoch": 0.32432432432432434,
"grad_norm": 4.299594402313232,
"learning_rate": 9.993992198341976e-06,
"loss": 1.2301,
"step": 1212
},
{
"epoch": 0.3245919186513246,
"grad_norm": 3.945216417312622,
"learning_rate": 9.993948751181243e-06,
"loss": 1.2631,
"step": 1213
},
{
"epoch": 0.3248595129783249,
"grad_norm": 4.33341121673584,
"learning_rate": 9.99390514758076e-06,
"loss": 1.2464,
"step": 1214
},
{
"epoch": 0.3251271073053251,
"grad_norm": 4.582106590270996,
"learning_rate": 9.993861387541894e-06,
"loss": 1.2877,
"step": 1215
},
{
"epoch": 0.32539470163232537,
"grad_norm": 4.363495349884033,
"learning_rate": 9.993817471066016e-06,
"loss": 1.1593,
"step": 1216
},
{
"epoch": 0.32566229595932566,
"grad_norm": 4.1914873123168945,
"learning_rate": 9.9937733981545e-06,
"loss": 1.2427,
"step": 1217
},
{
"epoch": 0.3259298902863259,
"grad_norm": 4.207976341247559,
"learning_rate": 9.99372916880873e-06,
"loss": 1.2129,
"step": 1218
},
{
"epoch": 0.3261974846133262,
"grad_norm": 3.9144184589385986,
"learning_rate": 9.99368478303009e-06,
"loss": 1.1604,
"step": 1219
},
{
"epoch": 0.32646507894032645,
"grad_norm": 3.9831881523132324,
"learning_rate": 9.993640240819966e-06,
"loss": 1.3163,
"step": 1220
},
{
"epoch": 0.32673267326732675,
"grad_norm": 4.0437331199646,
"learning_rate": 9.993595542179762e-06,
"loss": 1.2957,
"step": 1221
},
{
"epoch": 0.327000267594327,
"grad_norm": 3.9239695072174072,
"learning_rate": 9.99355068711087e-06,
"loss": 1.1359,
"step": 1222
},
{
"epoch": 0.3272678619213273,
"grad_norm": 3.7763185501098633,
"learning_rate": 9.993505675614699e-06,
"loss": 1.1569,
"step": 1223
},
{
"epoch": 0.32753545624832753,
"grad_norm": 3.6293134689331055,
"learning_rate": 9.99346050769266e-06,
"loss": 1.0692,
"step": 1224
},
{
"epoch": 0.3278030505753278,
"grad_norm": 3.8709805011749268,
"learning_rate": 9.993415183346168e-06,
"loss": 1.0574,
"step": 1225
},
{
"epoch": 0.32807064490232807,
"grad_norm": 4.066141605377197,
"learning_rate": 9.993369702576638e-06,
"loss": 1.2466,
"step": 1226
},
{
"epoch": 0.32833823922932837,
"grad_norm": 4.21537446975708,
"learning_rate": 9.993324065385499e-06,
"loss": 1.2357,
"step": 1227
},
{
"epoch": 0.3286058335563286,
"grad_norm": 3.732475757598877,
"learning_rate": 9.99327827177418e-06,
"loss": 1.0866,
"step": 1228
},
{
"epoch": 0.32887342788332885,
"grad_norm": 3.632660150527954,
"learning_rate": 9.993232321744117e-06,
"loss": 1.0751,
"step": 1229
},
{
"epoch": 0.32914102221032915,
"grad_norm": 4.376312732696533,
"learning_rate": 9.993186215296747e-06,
"loss": 1.1535,
"step": 1230
},
{
"epoch": 0.3294086165373294,
"grad_norm": 4.29062557220459,
"learning_rate": 9.993139952433513e-06,
"loss": 1.298,
"step": 1231
},
{
"epoch": 0.3296762108643297,
"grad_norm": 4.182230472564697,
"learning_rate": 9.99309353315587e-06,
"loss": 1.1764,
"step": 1232
},
{
"epoch": 0.32994380519132993,
"grad_norm": 4.025058269500732,
"learning_rate": 9.993046957465264e-06,
"loss": 1.0655,
"step": 1233
},
{
"epoch": 0.33021139951833023,
"grad_norm": 4.065793514251709,
"learning_rate": 9.99300022536316e-06,
"loss": 1.2838,
"step": 1234
},
{
"epoch": 0.33047899384533047,
"grad_norm": 4.3913397789001465,
"learning_rate": 9.99295333685102e-06,
"loss": 1.3101,
"step": 1235
},
{
"epoch": 0.33074658817233077,
"grad_norm": 4.253934860229492,
"learning_rate": 9.992906291930315e-06,
"loss": 1.2625,
"step": 1236
},
{
"epoch": 0.331014182499331,
"grad_norm": 4.285333633422852,
"learning_rate": 9.992859090602515e-06,
"loss": 1.246,
"step": 1237
},
{
"epoch": 0.3312817768263313,
"grad_norm": 4.150413513183594,
"learning_rate": 9.992811732869102e-06,
"loss": 1.1379,
"step": 1238
},
{
"epoch": 0.33154937115333155,
"grad_norm": 4.417994976043701,
"learning_rate": 9.992764218731556e-06,
"loss": 1.2882,
"step": 1239
},
{
"epoch": 0.3318169654803318,
"grad_norm": 4.12443733215332,
"learning_rate": 9.992716548191369e-06,
"loss": 1.1555,
"step": 1240
},
{
"epoch": 0.3320845598073321,
"grad_norm": 3.823629140853882,
"learning_rate": 9.992668721250031e-06,
"loss": 1.145,
"step": 1241
},
{
"epoch": 0.33235215413433233,
"grad_norm": 3.9831488132476807,
"learning_rate": 9.992620737909045e-06,
"loss": 1.1958,
"step": 1242
},
{
"epoch": 0.33261974846133263,
"grad_norm": 4.018994331359863,
"learning_rate": 9.99257259816991e-06,
"loss": 1.1069,
"step": 1243
},
{
"epoch": 0.3328873427883329,
"grad_norm": 3.987264394760132,
"learning_rate": 9.992524302034133e-06,
"loss": 1.0961,
"step": 1244
},
{
"epoch": 0.33315493711533317,
"grad_norm": 4.08268404006958,
"learning_rate": 9.992475849503232e-06,
"loss": 1.2255,
"step": 1245
},
{
"epoch": 0.3334225314423334,
"grad_norm": 4.237321853637695,
"learning_rate": 9.992427240578719e-06,
"loss": 1.2254,
"step": 1246
},
{
"epoch": 0.3336901257693337,
"grad_norm": 4.234129905700684,
"learning_rate": 9.99237847526212e-06,
"loss": 1.31,
"step": 1247
},
{
"epoch": 0.33395772009633395,
"grad_norm": 3.7745895385742188,
"learning_rate": 9.992329553554964e-06,
"loss": 1.2299,
"step": 1248
},
{
"epoch": 0.33422531442333425,
"grad_norm": 3.8982863426208496,
"learning_rate": 9.99228047545878e-06,
"loss": 1.1502,
"step": 1249
},
{
"epoch": 0.3344929087503345,
"grad_norm": 3.888578414916992,
"learning_rate": 9.992231240975107e-06,
"loss": 1.2675,
"step": 1250
},
{
"epoch": 0.33476050307733474,
"grad_norm": 3.7839229106903076,
"learning_rate": 9.992181850105488e-06,
"loss": 1.1895,
"step": 1251
},
{
"epoch": 0.33502809740433503,
"grad_norm": 3.796337366104126,
"learning_rate": 9.992132302851471e-06,
"loss": 1.1802,
"step": 1252
},
{
"epoch": 0.3352956917313353,
"grad_norm": 3.6959662437438965,
"learning_rate": 9.992082599214605e-06,
"loss": 1.0366,
"step": 1253
},
{
"epoch": 0.3355632860583356,
"grad_norm": 4.231655120849609,
"learning_rate": 9.99203273919645e-06,
"loss": 1.3236,
"step": 1254
},
{
"epoch": 0.3358308803853358,
"grad_norm": 3.774073600769043,
"learning_rate": 9.991982722798565e-06,
"loss": 1.2142,
"step": 1255
},
{
"epoch": 0.3360984747123361,
"grad_norm": 3.9392263889312744,
"learning_rate": 9.99193255002252e-06,
"loss": 1.1911,
"step": 1256
},
{
"epoch": 0.33636606903933636,
"grad_norm": 3.8191981315612793,
"learning_rate": 9.991882220869885e-06,
"loss": 1.1639,
"step": 1257
},
{
"epoch": 0.33663366336633666,
"grad_norm": 3.6881232261657715,
"learning_rate": 9.991831735342235e-06,
"loss": 1.179,
"step": 1258
},
{
"epoch": 0.3369012576933369,
"grad_norm": 3.9517464637756348,
"learning_rate": 9.991781093441156e-06,
"loss": 1.3195,
"step": 1259
},
{
"epoch": 0.3371688520203372,
"grad_norm": 4.152409076690674,
"learning_rate": 9.991730295168229e-06,
"loss": 1.3316,
"step": 1260
},
{
"epoch": 0.33743644634733744,
"grad_norm": 4.356308937072754,
"learning_rate": 9.991679340525048e-06,
"loss": 1.3085,
"step": 1261
},
{
"epoch": 0.3377040406743377,
"grad_norm": 3.8445913791656494,
"learning_rate": 9.991628229513212e-06,
"loss": 1.1096,
"step": 1262
},
{
"epoch": 0.337971635001338,
"grad_norm": 4.005192279815674,
"learning_rate": 9.991576962134317e-06,
"loss": 1.0536,
"step": 1263
},
{
"epoch": 0.3382392293283382,
"grad_norm": 4.3194355964660645,
"learning_rate": 9.991525538389971e-06,
"loss": 1.228,
"step": 1264
},
{
"epoch": 0.3385068236553385,
"grad_norm": 4.254610538482666,
"learning_rate": 9.991473958281787e-06,
"loss": 1.2584,
"step": 1265
},
{
"epoch": 0.33877441798233876,
"grad_norm": 4.035154819488525,
"learning_rate": 9.991422221811377e-06,
"loss": 1.2187,
"step": 1266
},
{
"epoch": 0.33904201230933906,
"grad_norm": 4.172974109649658,
"learning_rate": 9.991370328980365e-06,
"loss": 1.229,
"step": 1267
},
{
"epoch": 0.3393096066363393,
"grad_norm": 4.167996883392334,
"learning_rate": 9.991318279790376e-06,
"loss": 1.1717,
"step": 1268
},
{
"epoch": 0.3395772009633396,
"grad_norm": 3.9925200939178467,
"learning_rate": 9.991266074243038e-06,
"loss": 1.0795,
"step": 1269
},
{
"epoch": 0.33984479529033984,
"grad_norm": 4.081603050231934,
"learning_rate": 9.99121371233999e-06,
"loss": 1.2579,
"step": 1270
},
{
"epoch": 0.34011238961734014,
"grad_norm": 3.936547040939331,
"learning_rate": 9.991161194082868e-06,
"loss": 1.1121,
"step": 1271
},
{
"epoch": 0.3403799839443404,
"grad_norm": 3.9780871868133545,
"learning_rate": 9.991108519473321e-06,
"loss": 1.0613,
"step": 1272
},
{
"epoch": 0.3406475782713406,
"grad_norm": 4.46980619430542,
"learning_rate": 9.991055688512996e-06,
"loss": 1.3501,
"step": 1273
},
{
"epoch": 0.3409151725983409,
"grad_norm": 4.383254051208496,
"learning_rate": 9.991002701203552e-06,
"loss": 1.2561,
"step": 1274
},
{
"epoch": 0.34118276692534116,
"grad_norm": 3.765019178390503,
"learning_rate": 9.990949557546644e-06,
"loss": 1.1322,
"step": 1275
},
{
"epoch": 0.34145036125234146,
"grad_norm": 4.063820838928223,
"learning_rate": 9.99089625754394e-06,
"loss": 1.194,
"step": 1276
},
{
"epoch": 0.3417179555793417,
"grad_norm": 3.9652814865112305,
"learning_rate": 9.990842801197109e-06,
"loss": 1.2013,
"step": 1277
},
{
"epoch": 0.341985549906342,
"grad_norm": 3.9601919651031494,
"learning_rate": 9.990789188507827e-06,
"loss": 1.4019,
"step": 1278
},
{
"epoch": 0.34225314423334224,
"grad_norm": 3.9643354415893555,
"learning_rate": 9.990735419477771e-06,
"loss": 1.1947,
"step": 1279
},
{
"epoch": 0.34252073856034254,
"grad_norm": 4.578568458557129,
"learning_rate": 9.990681494108625e-06,
"loss": 1.2858,
"step": 1280
},
{
"epoch": 0.3427883328873428,
"grad_norm": 3.9634618759155273,
"learning_rate": 9.990627412402081e-06,
"loss": 1.0718,
"step": 1281
},
{
"epoch": 0.3430559272143431,
"grad_norm": 4.035841941833496,
"learning_rate": 9.990573174359831e-06,
"loss": 1.0878,
"step": 1282
},
{
"epoch": 0.3433235215413433,
"grad_norm": 3.8855788707733154,
"learning_rate": 9.990518779983575e-06,
"loss": 1.08,
"step": 1283
},
{
"epoch": 0.34359111586834357,
"grad_norm": 4.141454219818115,
"learning_rate": 9.990464229275017e-06,
"loss": 1.2422,
"step": 1284
},
{
"epoch": 0.34385871019534386,
"grad_norm": 4.580347537994385,
"learning_rate": 9.990409522235866e-06,
"loss": 1.2064,
"step": 1285
},
{
"epoch": 0.3441263045223441,
"grad_norm": 4.343654155731201,
"learning_rate": 9.990354658867833e-06,
"loss": 1.2125,
"step": 1286
},
{
"epoch": 0.3443938988493444,
"grad_norm": 4.314458847045898,
"learning_rate": 9.990299639172643e-06,
"loss": 1.2558,
"step": 1287
},
{
"epoch": 0.34466149317634465,
"grad_norm": 3.7052886486053467,
"learning_rate": 9.990244463152012e-06,
"loss": 1.0901,
"step": 1288
},
{
"epoch": 0.34492908750334494,
"grad_norm": 4.012930393218994,
"learning_rate": 9.990189130807672e-06,
"loss": 1.2457,
"step": 1289
},
{
"epoch": 0.3451966818303452,
"grad_norm": 4.098269462585449,
"learning_rate": 9.990133642141359e-06,
"loss": 1.1806,
"step": 1290
},
{
"epoch": 0.3454642761573455,
"grad_norm": 3.833272695541382,
"learning_rate": 9.990077997154807e-06,
"loss": 1.1566,
"step": 1291
},
{
"epoch": 0.3457318704843457,
"grad_norm": 4.549833297729492,
"learning_rate": 9.99002219584976e-06,
"loss": 1.3042,
"step": 1292
},
{
"epoch": 0.345999464811346,
"grad_norm": 3.901430368423462,
"learning_rate": 9.989966238227967e-06,
"loss": 1.1204,
"step": 1293
},
{
"epoch": 0.34626705913834627,
"grad_norm": 4.016772270202637,
"learning_rate": 9.989910124291182e-06,
"loss": 1.0479,
"step": 1294
},
{
"epoch": 0.3465346534653465,
"grad_norm": 3.785675048828125,
"learning_rate": 9.989853854041158e-06,
"loss": 1.0056,
"step": 1295
},
{
"epoch": 0.3468022477923468,
"grad_norm": 3.9900505542755127,
"learning_rate": 9.989797427479663e-06,
"loss": 1.1034,
"step": 1296
},
{
"epoch": 0.34706984211934705,
"grad_norm": 3.6167192459106445,
"learning_rate": 9.989740844608464e-06,
"loss": 1.1602,
"step": 1297
},
{
"epoch": 0.34733743644634735,
"grad_norm": 4.165998458862305,
"learning_rate": 9.989684105429332e-06,
"loss": 1.2538,
"step": 1298
},
{
"epoch": 0.3476050307733476,
"grad_norm": 3.6239192485809326,
"learning_rate": 9.989627209944044e-06,
"loss": 1.1523,
"step": 1299
},
{
"epoch": 0.3478726251003479,
"grad_norm": 3.9420888423919678,
"learning_rate": 9.989570158154383e-06,
"loss": 1.1796,
"step": 1300
},
{
"epoch": 0.34814021942734813,
"grad_norm": 3.861833333969116,
"learning_rate": 9.989512950062135e-06,
"loss": 1.1694,
"step": 1301
},
{
"epoch": 0.3484078137543484,
"grad_norm": 4.555881023406982,
"learning_rate": 9.989455585669093e-06,
"loss": 1.1641,
"step": 1302
},
{
"epoch": 0.34867540808134867,
"grad_norm": 3.9727768898010254,
"learning_rate": 9.989398064977057e-06,
"loss": 1.2632,
"step": 1303
},
{
"epoch": 0.34894300240834897,
"grad_norm": 4.150755882263184,
"learning_rate": 9.989340387987823e-06,
"loss": 1.288,
"step": 1304
},
{
"epoch": 0.3492105967353492,
"grad_norm": 4.13301944732666,
"learning_rate": 9.989282554703202e-06,
"loss": 1.3014,
"step": 1305
},
{
"epoch": 0.34947819106234945,
"grad_norm": 4.18637752532959,
"learning_rate": 9.989224565125003e-06,
"loss": 1.2925,
"step": 1306
},
{
"epoch": 0.34974578538934975,
"grad_norm": 4.216982841491699,
"learning_rate": 9.989166419255047e-06,
"loss": 1.2506,
"step": 1307
},
{
"epoch": 0.35001337971635,
"grad_norm": 4.059083938598633,
"learning_rate": 9.989108117095152e-06,
"loss": 1.3471,
"step": 1308
},
{
"epoch": 0.3502809740433503,
"grad_norm": 4.72033166885376,
"learning_rate": 9.989049658647146e-06,
"loss": 1.273,
"step": 1309
},
{
"epoch": 0.35054856837035053,
"grad_norm": 3.916358232498169,
"learning_rate": 9.988991043912857e-06,
"loss": 1.2104,
"step": 1310
},
{
"epoch": 0.35081616269735083,
"grad_norm": 4.03465461730957,
"learning_rate": 9.988932272894123e-06,
"loss": 1.2057,
"step": 1311
},
{
"epoch": 0.3510837570243511,
"grad_norm": 3.646699905395508,
"learning_rate": 9.988873345592786e-06,
"loss": 1.1352,
"step": 1312
},
{
"epoch": 0.35135135135135137,
"grad_norm": 3.854741334915161,
"learning_rate": 9.988814262010692e-06,
"loss": 1.1613,
"step": 1313
},
{
"epoch": 0.3516189456783516,
"grad_norm": 4.387909889221191,
"learning_rate": 9.988755022149692e-06,
"loss": 1.3018,
"step": 1314
},
{
"epoch": 0.3518865400053519,
"grad_norm": 3.9308204650878906,
"learning_rate": 9.988695626011639e-06,
"loss": 1.045,
"step": 1315
},
{
"epoch": 0.35215413433235215,
"grad_norm": 4.186442852020264,
"learning_rate": 9.988636073598396e-06,
"loss": 1.1992,
"step": 1316
},
{
"epoch": 0.3524217286593524,
"grad_norm": 3.7117176055908203,
"learning_rate": 9.98857636491183e-06,
"loss": 1.1623,
"step": 1317
},
{
"epoch": 0.3526893229863527,
"grad_norm": 3.9418506622314453,
"learning_rate": 9.988516499953807e-06,
"loss": 1.1666,
"step": 1318
},
{
"epoch": 0.35295691731335294,
"grad_norm": 4.194133758544922,
"learning_rate": 9.988456478726207e-06,
"loss": 1.3279,
"step": 1319
},
{
"epoch": 0.35322451164035323,
"grad_norm": 3.8580989837646484,
"learning_rate": 9.988396301230908e-06,
"loss": 1.2154,
"step": 1320
},
{
"epoch": 0.3534921059673535,
"grad_norm": 4.107762813568115,
"learning_rate": 9.988335967469794e-06,
"loss": 1.3117,
"step": 1321
},
{
"epoch": 0.3537597002943538,
"grad_norm": 3.7596476078033447,
"learning_rate": 9.988275477444756e-06,
"loss": 1.1496,
"step": 1322
},
{
"epoch": 0.354027294621354,
"grad_norm": 4.2492289543151855,
"learning_rate": 9.98821483115769e-06,
"loss": 1.1693,
"step": 1323
},
{
"epoch": 0.3542948889483543,
"grad_norm": 3.3397791385650635,
"learning_rate": 9.988154028610496e-06,
"loss": 0.9744,
"step": 1324
},
{
"epoch": 0.35456248327535456,
"grad_norm": 3.7433207035064697,
"learning_rate": 9.988093069805074e-06,
"loss": 1.1848,
"step": 1325
},
{
"epoch": 0.35483007760235485,
"grad_norm": 3.7859861850738525,
"learning_rate": 9.98803195474334e-06,
"loss": 1.1676,
"step": 1326
},
{
"epoch": 0.3550976719293551,
"grad_norm": 4.002213954925537,
"learning_rate": 9.987970683427205e-06,
"loss": 1.1791,
"step": 1327
},
{
"epoch": 0.35536526625635534,
"grad_norm": 4.300050258636475,
"learning_rate": 9.987909255858588e-06,
"loss": 1.1968,
"step": 1328
},
{
"epoch": 0.35563286058335564,
"grad_norm": 3.8620917797088623,
"learning_rate": 9.987847672039416e-06,
"loss": 1.101,
"step": 1329
},
{
"epoch": 0.3559004549103559,
"grad_norm": 4.3346757888793945,
"learning_rate": 9.987785931971616e-06,
"loss": 1.105,
"step": 1330
},
{
"epoch": 0.3561680492373562,
"grad_norm": 3.949228048324585,
"learning_rate": 9.987724035657122e-06,
"loss": 1.225,
"step": 1331
},
{
"epoch": 0.3564356435643564,
"grad_norm": 4.213968276977539,
"learning_rate": 9.987661983097875e-06,
"loss": 1.3405,
"step": 1332
},
{
"epoch": 0.3567032378913567,
"grad_norm": 3.694033622741699,
"learning_rate": 9.987599774295815e-06,
"loss": 1.1712,
"step": 1333
},
{
"epoch": 0.35697083221835696,
"grad_norm": 3.9766345024108887,
"learning_rate": 9.987537409252895e-06,
"loss": 1.1874,
"step": 1334
},
{
"epoch": 0.35723842654535726,
"grad_norm": 3.906832456588745,
"learning_rate": 9.987474887971067e-06,
"loss": 1.1985,
"step": 1335
},
{
"epoch": 0.3575060208723575,
"grad_norm": 3.593148708343506,
"learning_rate": 9.987412210452288e-06,
"loss": 1.0753,
"step": 1336
},
{
"epoch": 0.3577736151993578,
"grad_norm": 4.227734088897705,
"learning_rate": 9.987349376698522e-06,
"loss": 1.2267,
"step": 1337
},
{
"epoch": 0.35804120952635804,
"grad_norm": 3.786079168319702,
"learning_rate": 9.98728638671174e-06,
"loss": 1.113,
"step": 1338
},
{
"epoch": 0.3583088038533583,
"grad_norm": 4.259524345397949,
"learning_rate": 9.987223240493912e-06,
"loss": 1.233,
"step": 1339
},
{
"epoch": 0.3585763981803586,
"grad_norm": 4.269252777099609,
"learning_rate": 9.987159938047018e-06,
"loss": 1.328,
"step": 1340
},
{
"epoch": 0.3588439925073588,
"grad_norm": 3.9735991954803467,
"learning_rate": 9.98709647937304e-06,
"loss": 1.2262,
"step": 1341
},
{
"epoch": 0.3591115868343591,
"grad_norm": 3.9140255451202393,
"learning_rate": 9.987032864473966e-06,
"loss": 1.1495,
"step": 1342
},
{
"epoch": 0.35937918116135936,
"grad_norm": 3.9749045372009277,
"learning_rate": 9.986969093351789e-06,
"loss": 1.071,
"step": 1343
},
{
"epoch": 0.35964677548835966,
"grad_norm": 4.553966522216797,
"learning_rate": 9.986905166008506e-06,
"loss": 1.2779,
"step": 1344
},
{
"epoch": 0.3599143698153599,
"grad_norm": 4.131070613861084,
"learning_rate": 9.98684108244612e-06,
"loss": 1.315,
"step": 1345
},
{
"epoch": 0.3601819641423602,
"grad_norm": 3.922656297683716,
"learning_rate": 9.986776842666641e-06,
"loss": 1.1049,
"step": 1346
},
{
"epoch": 0.36044955846936044,
"grad_norm": 4.215112209320068,
"learning_rate": 9.98671244667208e-06,
"loss": 1.1876,
"step": 1347
},
{
"epoch": 0.36071715279636074,
"grad_norm": 3.91481614112854,
"learning_rate": 9.986647894464452e-06,
"loss": 1.1058,
"step": 1348
},
{
"epoch": 0.360984747123361,
"grad_norm": 4.0664777755737305,
"learning_rate": 9.98658318604578e-06,
"loss": 1.205,
"step": 1349
},
{
"epoch": 0.3612523414503612,
"grad_norm": 4.033042907714844,
"learning_rate": 9.986518321418091e-06,
"loss": 1.2229,
"step": 1350
},
{
"epoch": 0.3615199357773615,
"grad_norm": 4.329224109649658,
"learning_rate": 9.986453300583419e-06,
"loss": 1.1991,
"step": 1351
},
{
"epoch": 0.36178753010436177,
"grad_norm": 4.066847324371338,
"learning_rate": 9.986388123543798e-06,
"loss": 1.0257,
"step": 1352
},
{
"epoch": 0.36205512443136206,
"grad_norm": 4.082132816314697,
"learning_rate": 9.986322790301272e-06,
"loss": 1.3073,
"step": 1353
},
{
"epoch": 0.3623227187583623,
"grad_norm": 4.0690765380859375,
"learning_rate": 9.986257300857885e-06,
"loss": 1.3173,
"step": 1354
},
{
"epoch": 0.3625903130853626,
"grad_norm": 3.81072735786438,
"learning_rate": 9.986191655215692e-06,
"loss": 1.1571,
"step": 1355
},
{
"epoch": 0.36285790741236285,
"grad_norm": 4.073006629943848,
"learning_rate": 9.986125853376747e-06,
"loss": 1.2328,
"step": 1356
},
{
"epoch": 0.36312550173936314,
"grad_norm": 4.015659332275391,
"learning_rate": 9.986059895343113e-06,
"loss": 1.1722,
"step": 1357
},
{
"epoch": 0.3633930960663634,
"grad_norm": 4.220362663269043,
"learning_rate": 9.985993781116853e-06,
"loss": 1.2666,
"step": 1358
},
{
"epoch": 0.3636606903933637,
"grad_norm": 4.876560688018799,
"learning_rate": 9.985927510700043e-06,
"loss": 1.3855,
"step": 1359
},
{
"epoch": 0.3639282847203639,
"grad_norm": 3.7819228172302246,
"learning_rate": 9.985861084094754e-06,
"loss": 1.2191,
"step": 1360
},
{
"epoch": 0.36419587904736417,
"grad_norm": 5.581944465637207,
"learning_rate": 9.98579450130307e-06,
"loss": 1.3441,
"step": 1361
},
{
"epoch": 0.36446347337436447,
"grad_norm": 4.042576789855957,
"learning_rate": 9.985727762327075e-06,
"loss": 1.23,
"step": 1362
},
{
"epoch": 0.3647310677013647,
"grad_norm": 3.75724720954895,
"learning_rate": 9.985660867168862e-06,
"loss": 1.1289,
"step": 1363
},
{
"epoch": 0.364998662028365,
"grad_norm": 3.9243931770324707,
"learning_rate": 9.985593815830524e-06,
"loss": 1.1703,
"step": 1364
},
{
"epoch": 0.36526625635536525,
"grad_norm": 3.6307761669158936,
"learning_rate": 9.985526608314162e-06,
"loss": 1.1219,
"step": 1365
},
{
"epoch": 0.36553385068236555,
"grad_norm": 4.060052394866943,
"learning_rate": 9.985459244621883e-06,
"loss": 1.2764,
"step": 1366
},
{
"epoch": 0.3658014450093658,
"grad_norm": 4.747690200805664,
"learning_rate": 9.985391724755796e-06,
"loss": 1.2811,
"step": 1367
},
{
"epoch": 0.3660690393363661,
"grad_norm": 3.579979658126831,
"learning_rate": 9.985324048718014e-06,
"loss": 1.1203,
"step": 1368
},
{
"epoch": 0.36633663366336633,
"grad_norm": 3.809176206588745,
"learning_rate": 9.985256216510661e-06,
"loss": 1.1502,
"step": 1369
},
{
"epoch": 0.3666042279903666,
"grad_norm": 3.7964789867401123,
"learning_rate": 9.98518822813586e-06,
"loss": 1.0763,
"step": 1370
},
{
"epoch": 0.36687182231736687,
"grad_norm": 3.559234619140625,
"learning_rate": 9.985120083595742e-06,
"loss": 1.2019,
"step": 1371
},
{
"epoch": 0.36713941664436717,
"grad_norm": 4.064184665679932,
"learning_rate": 9.985051782892439e-06,
"loss": 1.3257,
"step": 1372
},
{
"epoch": 0.3674070109713674,
"grad_norm": 3.9166617393493652,
"learning_rate": 9.984983326028093e-06,
"loss": 1.0911,
"step": 1373
},
{
"epoch": 0.36767460529836765,
"grad_norm": 3.9536726474761963,
"learning_rate": 9.984914713004847e-06,
"loss": 1.1804,
"step": 1374
},
{
"epoch": 0.36794219962536795,
"grad_norm": 4.240631103515625,
"learning_rate": 9.98484594382485e-06,
"loss": 1.2855,
"step": 1375
},
{
"epoch": 0.3682097939523682,
"grad_norm": 3.7650909423828125,
"learning_rate": 9.984777018490258e-06,
"loss": 1.1098,
"step": 1376
},
{
"epoch": 0.3684773882793685,
"grad_norm": 3.4372477531433105,
"learning_rate": 9.98470793700323e-06,
"loss": 1.1356,
"step": 1377
},
{
"epoch": 0.36874498260636873,
"grad_norm": 3.8635547161102295,
"learning_rate": 9.984638699365928e-06,
"loss": 1.0865,
"step": 1378
},
{
"epoch": 0.36901257693336903,
"grad_norm": 4.2199554443359375,
"learning_rate": 9.984569305580523e-06,
"loss": 1.1303,
"step": 1379
},
{
"epoch": 0.36928017126036927,
"grad_norm": 4.431107044219971,
"learning_rate": 9.984499755649188e-06,
"loss": 1.3348,
"step": 1380
},
{
"epoch": 0.36954776558736957,
"grad_norm": 3.5743043422698975,
"learning_rate": 9.984430049574103e-06,
"loss": 1.1717,
"step": 1381
},
{
"epoch": 0.3698153599143698,
"grad_norm": 3.696826934814453,
"learning_rate": 9.98436018735745e-06,
"loss": 1.1214,
"step": 1382
},
{
"epoch": 0.3700829542413701,
"grad_norm": 4.062804222106934,
"learning_rate": 9.984290169001418e-06,
"loss": 1.182,
"step": 1383
},
{
"epoch": 0.37035054856837035,
"grad_norm": 3.5979297161102295,
"learning_rate": 9.984219994508199e-06,
"loss": 1.0661,
"step": 1384
},
{
"epoch": 0.3706181428953706,
"grad_norm": 3.755028247833252,
"learning_rate": 9.984149663879994e-06,
"loss": 1.1072,
"step": 1385
},
{
"epoch": 0.3708857372223709,
"grad_norm": 3.7246670722961426,
"learning_rate": 9.984079177119003e-06,
"loss": 1.1284,
"step": 1386
},
{
"epoch": 0.37115333154937113,
"grad_norm": 3.9325149059295654,
"learning_rate": 9.984008534227439e-06,
"loss": 1.2296,
"step": 1387
},
{
"epoch": 0.37142092587637143,
"grad_norm": 4.277532577514648,
"learning_rate": 9.983937735207509e-06,
"loss": 1.2242,
"step": 1388
},
{
"epoch": 0.3716885202033717,
"grad_norm": 3.9127614498138428,
"learning_rate": 9.983866780061435e-06,
"loss": 1.1925,
"step": 1389
},
{
"epoch": 0.371956114530372,
"grad_norm": 3.858065366744995,
"learning_rate": 9.983795668791435e-06,
"loss": 1.1462,
"step": 1390
},
{
"epoch": 0.3722237088573722,
"grad_norm": 3.7166876792907715,
"learning_rate": 9.983724401399745e-06,
"loss": 1.1978,
"step": 1391
},
{
"epoch": 0.3724913031843725,
"grad_norm": 4.299930572509766,
"learning_rate": 9.98365297788859e-06,
"loss": 1.1856,
"step": 1392
},
{
"epoch": 0.37275889751137276,
"grad_norm": 3.803140640258789,
"learning_rate": 9.983581398260211e-06,
"loss": 1.1731,
"step": 1393
},
{
"epoch": 0.37302649183837305,
"grad_norm": 3.9758286476135254,
"learning_rate": 9.983509662516848e-06,
"loss": 1.2402,
"step": 1394
},
{
"epoch": 0.3732940861653733,
"grad_norm": 3.708829402923584,
"learning_rate": 9.98343777066075e-06,
"loss": 1.1367,
"step": 1395
},
{
"epoch": 0.37356168049237354,
"grad_norm": 3.941568613052368,
"learning_rate": 9.983365722694166e-06,
"loss": 1.187,
"step": 1396
},
{
"epoch": 0.37382927481937384,
"grad_norm": 3.5624454021453857,
"learning_rate": 9.983293518619358e-06,
"loss": 1.0969,
"step": 1397
},
{
"epoch": 0.3740968691463741,
"grad_norm": 4.011288642883301,
"learning_rate": 9.983221158438585e-06,
"loss": 1.1643,
"step": 1398
},
{
"epoch": 0.3743644634733744,
"grad_norm": 5.278192520141602,
"learning_rate": 9.983148642154114e-06,
"loss": 1.1881,
"step": 1399
},
{
"epoch": 0.3746320578003746,
"grad_norm": 3.4236741065979004,
"learning_rate": 9.983075969768217e-06,
"loss": 1.1508,
"step": 1400
},
{
"epoch": 0.3748996521273749,
"grad_norm": 3.687683582305908,
"learning_rate": 9.98300314128317e-06,
"loss": 1.1077,
"step": 1401
},
{
"epoch": 0.37516724645437516,
"grad_norm": 3.9771618843078613,
"learning_rate": 9.982930156701254e-06,
"loss": 1.1796,
"step": 1402
},
{
"epoch": 0.37543484078137546,
"grad_norm": 3.828674554824829,
"learning_rate": 9.982857016024757e-06,
"loss": 1.2694,
"step": 1403
},
{
"epoch": 0.3757024351083757,
"grad_norm": 3.8206582069396973,
"learning_rate": 9.982783719255968e-06,
"loss": 1.1139,
"step": 1404
},
{
"epoch": 0.375970029435376,
"grad_norm": 3.9748029708862305,
"learning_rate": 9.982710266397184e-06,
"loss": 1.1027,
"step": 1405
},
{
"epoch": 0.37623762376237624,
"grad_norm": 3.5067262649536133,
"learning_rate": 9.982636657450706e-06,
"loss": 1.1222,
"step": 1406
},
{
"epoch": 0.3765052180893765,
"grad_norm": 4.0315093994140625,
"learning_rate": 9.98256289241884e-06,
"loss": 1.3018,
"step": 1407
},
{
"epoch": 0.3767728124163768,
"grad_norm": 3.5691301822662354,
"learning_rate": 9.982488971303899e-06,
"loss": 1.0404,
"step": 1408
},
{
"epoch": 0.377040406743377,
"grad_norm": 4.119424819946289,
"learning_rate": 9.982414894108194e-06,
"loss": 1.234,
"step": 1409
},
{
"epoch": 0.3773080010703773,
"grad_norm": 3.957841157913208,
"learning_rate": 9.982340660834049e-06,
"loss": 1.4368,
"step": 1410
},
{
"epoch": 0.37757559539737756,
"grad_norm": 3.7822635173797607,
"learning_rate": 9.982266271483787e-06,
"loss": 1.1002,
"step": 1411
},
{
"epoch": 0.37784318972437786,
"grad_norm": 3.610924005508423,
"learning_rate": 9.982191726059742e-06,
"loss": 1.1034,
"step": 1412
},
{
"epoch": 0.3781107840513781,
"grad_norm": 3.8048479557037354,
"learning_rate": 9.982117024564244e-06,
"loss": 1.1641,
"step": 1413
},
{
"epoch": 0.3783783783783784,
"grad_norm": 3.299978256225586,
"learning_rate": 9.982042166999639e-06,
"loss": 1.0626,
"step": 1414
},
{
"epoch": 0.37864597270537864,
"grad_norm": 4.0182061195373535,
"learning_rate": 9.981967153368266e-06,
"loss": 1.2374,
"step": 1415
},
{
"epoch": 0.37891356703237894,
"grad_norm": 3.6433804035186768,
"learning_rate": 9.981891983672481e-06,
"loss": 1.0366,
"step": 1416
},
{
"epoch": 0.3791811613593792,
"grad_norm": 3.748567581176758,
"learning_rate": 9.981816657914633e-06,
"loss": 1.0423,
"step": 1417
},
{
"epoch": 0.3794487556863794,
"grad_norm": 3.7218706607818604,
"learning_rate": 9.981741176097084e-06,
"loss": 1.0621,
"step": 1418
},
{
"epoch": 0.3797163500133797,
"grad_norm": 4.026986122131348,
"learning_rate": 9.981665538222201e-06,
"loss": 1.1626,
"step": 1419
},
{
"epoch": 0.37998394434037996,
"grad_norm": 4.299746513366699,
"learning_rate": 9.98158974429235e-06,
"loss": 1.2972,
"step": 1420
},
{
"epoch": 0.38025153866738026,
"grad_norm": 3.5890026092529297,
"learning_rate": 9.981513794309905e-06,
"loss": 1.1985,
"step": 1421
},
{
"epoch": 0.3805191329943805,
"grad_norm": 3.7244949340820312,
"learning_rate": 9.981437688277248e-06,
"loss": 1.0564,
"step": 1422
},
{
"epoch": 0.3807867273213808,
"grad_norm": 4.018440246582031,
"learning_rate": 9.981361426196763e-06,
"loss": 1.1399,
"step": 1423
},
{
"epoch": 0.38105432164838104,
"grad_norm": 4.051723003387451,
"learning_rate": 9.981285008070836e-06,
"loss": 1.3185,
"step": 1424
},
{
"epoch": 0.38132191597538134,
"grad_norm": 3.7371041774749756,
"learning_rate": 9.981208433901864e-06,
"loss": 1.1645,
"step": 1425
},
{
"epoch": 0.3815895103023816,
"grad_norm": 3.600698709487915,
"learning_rate": 9.981131703692241e-06,
"loss": 1.1944,
"step": 1426
},
{
"epoch": 0.3818571046293819,
"grad_norm": 3.748783826828003,
"learning_rate": 9.981054817444378e-06,
"loss": 1.1702,
"step": 1427
},
{
"epoch": 0.3821246989563821,
"grad_norm": 4.829683303833008,
"learning_rate": 9.980977775160676e-06,
"loss": 1.1126,
"step": 1428
},
{
"epoch": 0.38239229328338237,
"grad_norm": 3.9870803356170654,
"learning_rate": 9.980900576843555e-06,
"loss": 1.1593,
"step": 1429
},
{
"epoch": 0.38265988761038267,
"grad_norm": 4.103924751281738,
"learning_rate": 9.980823222495429e-06,
"loss": 1.3044,
"step": 1430
},
{
"epoch": 0.3829274819373829,
"grad_norm": 3.9906015396118164,
"learning_rate": 9.980745712118722e-06,
"loss": 1.1956,
"step": 1431
},
{
"epoch": 0.3831950762643832,
"grad_norm": 3.7663869857788086,
"learning_rate": 9.980668045715864e-06,
"loss": 1.1876,
"step": 1432
},
{
"epoch": 0.38346267059138345,
"grad_norm": 3.752241373062134,
"learning_rate": 9.980590223289284e-06,
"loss": 1.2941,
"step": 1433
},
{
"epoch": 0.38373026491838375,
"grad_norm": 3.90246844291687,
"learning_rate": 9.980512244841424e-06,
"loss": 1.1741,
"step": 1434
},
{
"epoch": 0.383997859245384,
"grad_norm": 3.9306254386901855,
"learning_rate": 9.980434110374725e-06,
"loss": 1.2318,
"step": 1435
},
{
"epoch": 0.3842654535723843,
"grad_norm": 3.9048655033111572,
"learning_rate": 9.980355819891634e-06,
"loss": 1.1667,
"step": 1436
},
{
"epoch": 0.38453304789938453,
"grad_norm": 4.364803314208984,
"learning_rate": 9.980277373394604e-06,
"loss": 1.3952,
"step": 1437
},
{
"epoch": 0.3848006422263848,
"grad_norm": 4.139352798461914,
"learning_rate": 9.980198770886094e-06,
"loss": 1.1513,
"step": 1438
},
{
"epoch": 0.38506823655338507,
"grad_norm": 4.017728805541992,
"learning_rate": 9.980120012368564e-06,
"loss": 1.3192,
"step": 1439
},
{
"epoch": 0.3853358308803853,
"grad_norm": 4.370884418487549,
"learning_rate": 9.980041097844482e-06,
"loss": 1.2861,
"step": 1440
},
{
"epoch": 0.3856034252073856,
"grad_norm": 3.9100890159606934,
"learning_rate": 9.979962027316322e-06,
"loss": 1.1261,
"step": 1441
},
{
"epoch": 0.38587101953438585,
"grad_norm": 4.041008949279785,
"learning_rate": 9.979882800786556e-06,
"loss": 1.223,
"step": 1442
},
{
"epoch": 0.38613861386138615,
"grad_norm": 3.6667256355285645,
"learning_rate": 9.97980341825767e-06,
"loss": 1.217,
"step": 1443
},
{
"epoch": 0.3864062081883864,
"grad_norm": 3.8240602016448975,
"learning_rate": 9.979723879732151e-06,
"loss": 1.098,
"step": 1444
},
{
"epoch": 0.3866738025153867,
"grad_norm": 3.704866647720337,
"learning_rate": 9.979644185212489e-06,
"loss": 1.0308,
"step": 1445
},
{
"epoch": 0.38694139684238693,
"grad_norm": 3.873335123062134,
"learning_rate": 9.97956433470118e-06,
"loss": 1.1319,
"step": 1446
},
{
"epoch": 0.38720899116938723,
"grad_norm": 3.9278151988983154,
"learning_rate": 9.979484328200726e-06,
"loss": 1.1702,
"step": 1447
},
{
"epoch": 0.38747658549638747,
"grad_norm": 4.0510077476501465,
"learning_rate": 9.979404165713633e-06,
"loss": 1.2193,
"step": 1448
},
{
"epoch": 0.38774417982338777,
"grad_norm": 3.8461694717407227,
"learning_rate": 9.979323847242414e-06,
"loss": 1.2015,
"step": 1449
},
{
"epoch": 0.388011774150388,
"grad_norm": 3.905766010284424,
"learning_rate": 9.979243372789583e-06,
"loss": 1.1764,
"step": 1450
},
{
"epoch": 0.38827936847738825,
"grad_norm": 3.528315305709839,
"learning_rate": 9.979162742357661e-06,
"loss": 1.1254,
"step": 1451
},
{
"epoch": 0.38854696280438855,
"grad_norm": 3.523634433746338,
"learning_rate": 9.979081955949176e-06,
"loss": 1.1075,
"step": 1452
},
{
"epoch": 0.3888145571313888,
"grad_norm": 3.9340994358062744,
"learning_rate": 9.979001013566656e-06,
"loss": 1.1919,
"step": 1453
},
{
"epoch": 0.3890821514583891,
"grad_norm": 3.8037092685699463,
"learning_rate": 9.978919915212637e-06,
"loss": 1.2163,
"step": 1454
},
{
"epoch": 0.38934974578538933,
"grad_norm": 4.22614049911499,
"learning_rate": 9.978838660889662e-06,
"loss": 1.3132,
"step": 1455
},
{
"epoch": 0.38961734011238963,
"grad_norm": 3.9353504180908203,
"learning_rate": 9.978757250600273e-06,
"loss": 1.0995,
"step": 1456
},
{
"epoch": 0.3898849344393899,
"grad_norm": 3.8454160690307617,
"learning_rate": 9.978675684347022e-06,
"loss": 1.1743,
"step": 1457
},
{
"epoch": 0.39015252876639017,
"grad_norm": 3.7511603832244873,
"learning_rate": 9.978593962132464e-06,
"loss": 1.0399,
"step": 1458
},
{
"epoch": 0.3904201230933904,
"grad_norm": 3.736814022064209,
"learning_rate": 9.97851208395916e-06,
"loss": 1.1429,
"step": 1459
},
{
"epoch": 0.3906877174203907,
"grad_norm": 4.143425464630127,
"learning_rate": 9.978430049829672e-06,
"loss": 1.2349,
"step": 1460
},
{
"epoch": 0.39095531174739095,
"grad_norm": 3.73406720161438,
"learning_rate": 9.978347859746572e-06,
"loss": 1.1704,
"step": 1461
},
{
"epoch": 0.3912229060743912,
"grad_norm": 3.936199903488159,
"learning_rate": 9.978265513712435e-06,
"loss": 1.0558,
"step": 1462
},
{
"epoch": 0.3914905004013915,
"grad_norm": 3.9773452281951904,
"learning_rate": 9.97818301172984e-06,
"loss": 1.0348,
"step": 1463
},
{
"epoch": 0.39175809472839174,
"grad_norm": 4.385336875915527,
"learning_rate": 9.97810035380137e-06,
"loss": 1.3109,
"step": 1464
},
{
"epoch": 0.39202568905539203,
"grad_norm": 3.7281599044799805,
"learning_rate": 9.978017539929617e-06,
"loss": 1.0922,
"step": 1465
},
{
"epoch": 0.3922932833823923,
"grad_norm": 3.8303089141845703,
"learning_rate": 9.977934570117173e-06,
"loss": 1.2507,
"step": 1466
},
{
"epoch": 0.3925608777093926,
"grad_norm": 4.046004295349121,
"learning_rate": 9.97785144436664e-06,
"loss": 1.2282,
"step": 1467
},
{
"epoch": 0.3928284720363928,
"grad_norm": 3.8748390674591064,
"learning_rate": 9.977768162680616e-06,
"loss": 1.2089,
"step": 1468
},
{
"epoch": 0.3930960663633931,
"grad_norm": 3.917059898376465,
"learning_rate": 9.977684725061716e-06,
"loss": 1.3255,
"step": 1469
},
{
"epoch": 0.39336366069039336,
"grad_norm": 3.7279913425445557,
"learning_rate": 9.977601131512553e-06,
"loss": 1.326,
"step": 1470
},
{
"epoch": 0.39363125501739366,
"grad_norm": 3.7477800846099854,
"learning_rate": 9.977517382035743e-06,
"loss": 1.066,
"step": 1471
},
{
"epoch": 0.3938988493443939,
"grad_norm": 3.699467420578003,
"learning_rate": 9.97743347663391e-06,
"loss": 1.1433,
"step": 1472
},
{
"epoch": 0.39416644367139414,
"grad_norm": 4.19885778427124,
"learning_rate": 9.977349415309682e-06,
"loss": 1.2562,
"step": 1473
},
{
"epoch": 0.39443403799839444,
"grad_norm": 4.173817157745361,
"learning_rate": 9.977265198065696e-06,
"loss": 1.3787,
"step": 1474
},
{
"epoch": 0.3947016323253947,
"grad_norm": 3.8569087982177734,
"learning_rate": 9.977180824904586e-06,
"loss": 1.1537,
"step": 1475
},
{
"epoch": 0.394969226652395,
"grad_norm": 3.5903382301330566,
"learning_rate": 9.977096295828998e-06,
"loss": 1.082,
"step": 1476
},
{
"epoch": 0.3952368209793952,
"grad_norm": 4.043254375457764,
"learning_rate": 9.977011610841579e-06,
"loss": 1.2256,
"step": 1477
},
{
"epoch": 0.3955044153063955,
"grad_norm": 3.817080020904541,
"learning_rate": 9.97692676994498e-06,
"loss": 1.162,
"step": 1478
},
{
"epoch": 0.39577200963339576,
"grad_norm": 3.870049238204956,
"learning_rate": 9.976841773141862e-06,
"loss": 1.2824,
"step": 1479
},
{
"epoch": 0.39603960396039606,
"grad_norm": 3.870774984359741,
"learning_rate": 9.976756620434882e-06,
"loss": 1.0247,
"step": 1480
},
{
"epoch": 0.3963071982873963,
"grad_norm": 4.375112056732178,
"learning_rate": 9.976671311826714e-06,
"loss": 1.2235,
"step": 1481
},
{
"epoch": 0.3965747926143966,
"grad_norm": 3.724886894226074,
"learning_rate": 9.976585847320028e-06,
"loss": 1.1431,
"step": 1482
},
{
"epoch": 0.39684238694139684,
"grad_norm": 3.800994873046875,
"learning_rate": 9.9765002269175e-06,
"loss": 1.0945,
"step": 1483
},
{
"epoch": 0.3971099812683971,
"grad_norm": 4.027763843536377,
"learning_rate": 9.976414450621812e-06,
"loss": 1.0553,
"step": 1484
},
{
"epoch": 0.3973775755953974,
"grad_norm": 3.982628345489502,
"learning_rate": 9.976328518435654e-06,
"loss": 1.2203,
"step": 1485
},
{
"epoch": 0.3976451699223976,
"grad_norm": 3.9969089031219482,
"learning_rate": 9.976242430361714e-06,
"loss": 1.2567,
"step": 1486
},
{
"epoch": 0.3979127642493979,
"grad_norm": 3.8449573516845703,
"learning_rate": 9.976156186402691e-06,
"loss": 1.2397,
"step": 1487
},
{
"epoch": 0.39818035857639816,
"grad_norm": 4.029581069946289,
"learning_rate": 9.976069786561286e-06,
"loss": 1.2279,
"step": 1488
},
{
"epoch": 0.39844795290339846,
"grad_norm": 3.9746222496032715,
"learning_rate": 9.975983230840208e-06,
"loss": 1.3537,
"step": 1489
},
{
"epoch": 0.3987155472303987,
"grad_norm": 3.8316519260406494,
"learning_rate": 9.975896519242165e-06,
"loss": 1.103,
"step": 1490
},
{
"epoch": 0.398983141557399,
"grad_norm": 3.714109182357788,
"learning_rate": 9.975809651769874e-06,
"loss": 1.0821,
"step": 1491
},
{
"epoch": 0.39925073588439924,
"grad_norm": 3.9708900451660156,
"learning_rate": 9.97572262842606e-06,
"loss": 1.1406,
"step": 1492
},
{
"epoch": 0.39951833021139954,
"grad_norm": 3.9574031829833984,
"learning_rate": 9.975635449213443e-06,
"loss": 1.2042,
"step": 1493
},
{
"epoch": 0.3997859245383998,
"grad_norm": 3.648296594619751,
"learning_rate": 9.975548114134756e-06,
"loss": 1.1087,
"step": 1494
},
{
"epoch": 0.4000535188654,
"grad_norm": 3.9714877605438232,
"learning_rate": 9.975460623192738e-06,
"loss": 1.3016,
"step": 1495
},
{
"epoch": 0.4003211131924003,
"grad_norm": 4.270079612731934,
"learning_rate": 9.975372976390126e-06,
"loss": 1.2158,
"step": 1496
},
{
"epoch": 0.40058870751940057,
"grad_norm": 3.275508165359497,
"learning_rate": 9.975285173729668e-06,
"loss": 1.0631,
"step": 1497
},
{
"epoch": 0.40085630184640086,
"grad_norm": 3.839301824569702,
"learning_rate": 9.975197215214113e-06,
"loss": 1.2388,
"step": 1498
},
{
"epoch": 0.4011238961734011,
"grad_norm": 3.7420620918273926,
"learning_rate": 9.975109100846216e-06,
"loss": 1.1396,
"step": 1499
},
{
"epoch": 0.4013914905004014,
"grad_norm": 3.785066604614258,
"learning_rate": 9.975020830628741e-06,
"loss": 1.2032,
"step": 1500
},
{
"epoch": 0.4013914905004014,
"eval_loss": 1.2037502527236938,
"eval_runtime": 11.6819,
"eval_samples_per_second": 34.241,
"eval_steps_per_second": 4.28,
"step": 1500
},
{
"epoch": 0.40165908482740165,
"grad_norm": 3.9376888275146484,
"learning_rate": 9.974932404564448e-06,
"loss": 1.1378,
"step": 1501
},
{
"epoch": 0.40192667915440194,
"grad_norm": 4.073405742645264,
"learning_rate": 9.97484382265611e-06,
"loss": 1.1687,
"step": 1502
},
{
"epoch": 0.4021942734814022,
"grad_norm": 3.9197230339050293,
"learning_rate": 9.974755084906503e-06,
"loss": 1.164,
"step": 1503
},
{
"epoch": 0.4024618678084025,
"grad_norm": 4.129556655883789,
"learning_rate": 9.974666191318402e-06,
"loss": 1.2084,
"step": 1504
},
{
"epoch": 0.4027294621354027,
"grad_norm": 4.0111799240112305,
"learning_rate": 9.974577141894597e-06,
"loss": 1.2002,
"step": 1505
},
{
"epoch": 0.40299705646240297,
"grad_norm": 4.588496685028076,
"learning_rate": 9.974487936637873e-06,
"loss": 1.115,
"step": 1506
},
{
"epoch": 0.40326465078940327,
"grad_norm": 3.992095708847046,
"learning_rate": 9.974398575551029e-06,
"loss": 1.2977,
"step": 1507
},
{
"epoch": 0.4035322451164035,
"grad_norm": 4.14756965637207,
"learning_rate": 9.97430905863686e-06,
"loss": 1.2135,
"step": 1508
},
{
"epoch": 0.4037998394434038,
"grad_norm": 3.6382899284362793,
"learning_rate": 9.974219385898174e-06,
"loss": 1.1663,
"step": 1509
},
{
"epoch": 0.40406743377040405,
"grad_norm": 3.954108953475952,
"learning_rate": 9.974129557337777e-06,
"loss": 1.2709,
"step": 1510
},
{
"epoch": 0.40433502809740435,
"grad_norm": 3.5056028366088867,
"learning_rate": 9.974039572958486e-06,
"loss": 1.0011,
"step": 1511
},
{
"epoch": 0.4046026224244046,
"grad_norm": 3.9158694744110107,
"learning_rate": 9.973949432763117e-06,
"loss": 1.2319,
"step": 1512
},
{
"epoch": 0.4048702167514049,
"grad_norm": 3.6687309741973877,
"learning_rate": 9.973859136754495e-06,
"loss": 1.1885,
"step": 1513
},
{
"epoch": 0.40513781107840513,
"grad_norm": 4.025513172149658,
"learning_rate": 9.973768684935448e-06,
"loss": 1.1389,
"step": 1514
},
{
"epoch": 0.40540540540540543,
"grad_norm": 3.8120996952056885,
"learning_rate": 9.973678077308811e-06,
"loss": 1.1946,
"step": 1515
},
{
"epoch": 0.40567299973240567,
"grad_norm": 3.88718318939209,
"learning_rate": 9.97358731387742e-06,
"loss": 1.2046,
"step": 1516
},
{
"epoch": 0.40594059405940597,
"grad_norm": 4.027118682861328,
"learning_rate": 9.97349639464412e-06,
"loss": 1.2053,
"step": 1517
},
{
"epoch": 0.4062081883864062,
"grad_norm": 3.675534963607788,
"learning_rate": 9.973405319611757e-06,
"loss": 1.1274,
"step": 1518
},
{
"epoch": 0.40647578271340645,
"grad_norm": 3.914788007736206,
"learning_rate": 9.973314088783188e-06,
"loss": 1.2117,
"step": 1519
},
{
"epoch": 0.40674337704040675,
"grad_norm": 3.8196732997894287,
"learning_rate": 9.973222702161267e-06,
"loss": 1.1037,
"step": 1520
},
{
"epoch": 0.407010971367407,
"grad_norm": 3.492936611175537,
"learning_rate": 9.97313115974886e-06,
"loss": 1.1087,
"step": 1521
},
{
"epoch": 0.4072785656944073,
"grad_norm": 4.102333068847656,
"learning_rate": 9.97303946154883e-06,
"loss": 1.2887,
"step": 1522
},
{
"epoch": 0.40754616002140753,
"grad_norm": 3.951390027999878,
"learning_rate": 9.972947607564056e-06,
"loss": 1.2433,
"step": 1523
},
{
"epoch": 0.40781375434840783,
"grad_norm": 3.7122180461883545,
"learning_rate": 9.972855597797408e-06,
"loss": 1.0165,
"step": 1524
},
{
"epoch": 0.4080813486754081,
"grad_norm": 3.8031108379364014,
"learning_rate": 9.972763432251775e-06,
"loss": 1.1836,
"step": 1525
},
{
"epoch": 0.40834894300240837,
"grad_norm": 3.916783571243286,
"learning_rate": 9.972671110930041e-06,
"loss": 1.2287,
"step": 1526
},
{
"epoch": 0.4086165373294086,
"grad_norm": 4.449172019958496,
"learning_rate": 9.972578633835096e-06,
"loss": 1.212,
"step": 1527
},
{
"epoch": 0.4088841316564089,
"grad_norm": 3.9276909828186035,
"learning_rate": 9.972486000969842e-06,
"loss": 1.2655,
"step": 1528
},
{
"epoch": 0.40915172598340915,
"grad_norm": 4.05131196975708,
"learning_rate": 9.972393212337178e-06,
"loss": 1.2497,
"step": 1529
},
{
"epoch": 0.4094193203104094,
"grad_norm": 3.8336915969848633,
"learning_rate": 9.972300267940009e-06,
"loss": 1.2201,
"step": 1530
},
{
"epoch": 0.4096869146374097,
"grad_norm": 3.7255733013153076,
"learning_rate": 9.97220716778125e-06,
"loss": 1.2857,
"step": 1531
},
{
"epoch": 0.40995450896440994,
"grad_norm": 3.4183714389801025,
"learning_rate": 9.972113911863815e-06,
"loss": 1.0868,
"step": 1532
},
{
"epoch": 0.41022210329141023,
"grad_norm": 3.8064022064208984,
"learning_rate": 9.972020500190626e-06,
"loss": 1.2251,
"step": 1533
},
{
"epoch": 0.4104896976184105,
"grad_norm": 4.1284565925598145,
"learning_rate": 9.971926932764609e-06,
"loss": 1.246,
"step": 1534
},
{
"epoch": 0.4107572919454108,
"grad_norm": 4.064891338348389,
"learning_rate": 9.971833209588696e-06,
"loss": 1.1329,
"step": 1535
},
{
"epoch": 0.411024886272411,
"grad_norm": 3.891404151916504,
"learning_rate": 9.971739330665821e-06,
"loss": 1.2359,
"step": 1536
},
{
"epoch": 0.4112924805994113,
"grad_norm": 3.7822113037109375,
"learning_rate": 9.971645295998929e-06,
"loss": 1.243,
"step": 1537
},
{
"epoch": 0.41156007492641156,
"grad_norm": 3.785557746887207,
"learning_rate": 9.97155110559096e-06,
"loss": 1.2446,
"step": 1538
},
{
"epoch": 0.41182766925341185,
"grad_norm": 3.562366008758545,
"learning_rate": 9.971456759444869e-06,
"loss": 1.1905,
"step": 1539
},
{
"epoch": 0.4120952635804121,
"grad_norm": 3.9148495197296143,
"learning_rate": 9.971362257563609e-06,
"loss": 1.2355,
"step": 1540
},
{
"epoch": 0.41236285790741234,
"grad_norm": 4.079963684082031,
"learning_rate": 9.971267599950142e-06,
"loss": 1.2154,
"step": 1541
},
{
"epoch": 0.41263045223441264,
"grad_norm": 4.090665340423584,
"learning_rate": 9.971172786607433e-06,
"loss": 1.0667,
"step": 1542
},
{
"epoch": 0.4128980465614129,
"grad_norm": 3.9606683254241943,
"learning_rate": 9.97107781753845e-06,
"loss": 1.1426,
"step": 1543
},
{
"epoch": 0.4131656408884132,
"grad_norm": 3.7528765201568604,
"learning_rate": 9.970982692746171e-06,
"loss": 1.2156,
"step": 1544
},
{
"epoch": 0.4134332352154134,
"grad_norm": 3.5512781143188477,
"learning_rate": 9.970887412233574e-06,
"loss": 1.1394,
"step": 1545
},
{
"epoch": 0.4137008295424137,
"grad_norm": 3.8421177864074707,
"learning_rate": 9.970791976003644e-06,
"loss": 1.11,
"step": 1546
},
{
"epoch": 0.41396842386941396,
"grad_norm": 3.389683961868286,
"learning_rate": 9.97069638405937e-06,
"loss": 1.2226,
"step": 1547
},
{
"epoch": 0.41423601819641426,
"grad_norm": 4.224984645843506,
"learning_rate": 9.97060063640375e-06,
"loss": 1.1678,
"step": 1548
},
{
"epoch": 0.4145036125234145,
"grad_norm": 3.8544325828552246,
"learning_rate": 9.970504733039778e-06,
"loss": 1.1398,
"step": 1549
},
{
"epoch": 0.4147712068504148,
"grad_norm": 3.8598458766937256,
"learning_rate": 9.970408673970464e-06,
"loss": 1.1928,
"step": 1550
},
{
"epoch": 0.41503880117741504,
"grad_norm": 3.5871057510375977,
"learning_rate": 9.970312459198812e-06,
"loss": 1.0778,
"step": 1551
},
{
"epoch": 0.4153063955044153,
"grad_norm": 3.948990821838379,
"learning_rate": 9.970216088727838e-06,
"loss": 1.2415,
"step": 1552
},
{
"epoch": 0.4155739898314156,
"grad_norm": 3.909735679626465,
"learning_rate": 9.970119562560562e-06,
"loss": 1.1809,
"step": 1553
},
{
"epoch": 0.4158415841584158,
"grad_norm": 3.529320240020752,
"learning_rate": 9.970022880700006e-06,
"loss": 1.1612,
"step": 1554
},
{
"epoch": 0.4161091784854161,
"grad_norm": 3.7973170280456543,
"learning_rate": 9.9699260431492e-06,
"loss": 1.1705,
"step": 1555
},
{
"epoch": 0.41637677281241636,
"grad_norm": 3.872828483581543,
"learning_rate": 9.969829049911178e-06,
"loss": 1.1882,
"step": 1556
},
{
"epoch": 0.41664436713941666,
"grad_norm": 3.9552464485168457,
"learning_rate": 9.969731900988975e-06,
"loss": 1.1696,
"step": 1557
},
{
"epoch": 0.4169119614664169,
"grad_norm": 3.7764220237731934,
"learning_rate": 9.969634596385637e-06,
"loss": 1.2108,
"step": 1558
},
{
"epoch": 0.4171795557934172,
"grad_norm": 3.64782452583313,
"learning_rate": 9.969537136104213e-06,
"loss": 1.2066,
"step": 1559
},
{
"epoch": 0.41744715012041744,
"grad_norm": 3.8925864696502686,
"learning_rate": 9.969439520147754e-06,
"loss": 1.1694,
"step": 1560
},
{
"epoch": 0.41771474444741774,
"grad_norm": 3.523120880126953,
"learning_rate": 9.969341748519319e-06,
"loss": 1.1243,
"step": 1561
},
{
"epoch": 0.417982338774418,
"grad_norm": 3.78109073638916,
"learning_rate": 9.969243821221972e-06,
"loss": 1.0945,
"step": 1562
},
{
"epoch": 0.4182499331014182,
"grad_norm": 3.5155014991760254,
"learning_rate": 9.969145738258776e-06,
"loss": 1.0598,
"step": 1563
},
{
"epoch": 0.4185175274284185,
"grad_norm": 4.167483806610107,
"learning_rate": 9.969047499632808e-06,
"loss": 1.2385,
"step": 1564
},
{
"epoch": 0.41878512175541877,
"grad_norm": 3.761597156524658,
"learning_rate": 9.968949105347146e-06,
"loss": 1.2115,
"step": 1565
},
{
"epoch": 0.41905271608241906,
"grad_norm": 3.6628715991973877,
"learning_rate": 9.968850555404867e-06,
"loss": 1.084,
"step": 1566
},
{
"epoch": 0.4193203104094193,
"grad_norm": 3.8627333641052246,
"learning_rate": 9.968751849809063e-06,
"loss": 1.2907,
"step": 1567
},
{
"epoch": 0.4195879047364196,
"grad_norm": 4.0308518409729,
"learning_rate": 9.968652988562826e-06,
"loss": 1.2336,
"step": 1568
},
{
"epoch": 0.41985549906341985,
"grad_norm": 3.72426438331604,
"learning_rate": 9.96855397166925e-06,
"loss": 1.2231,
"step": 1569
},
{
"epoch": 0.42012309339042014,
"grad_norm": 3.9212002754211426,
"learning_rate": 9.968454799131439e-06,
"loss": 1.1774,
"step": 1570
},
{
"epoch": 0.4203906877174204,
"grad_norm": 3.7344274520874023,
"learning_rate": 9.968355470952498e-06,
"loss": 1.1205,
"step": 1571
},
{
"epoch": 0.4206582820444207,
"grad_norm": 3.6700868606567383,
"learning_rate": 9.96825598713554e-06,
"loss": 1.201,
"step": 1572
},
{
"epoch": 0.4209258763714209,
"grad_norm": 3.989650249481201,
"learning_rate": 9.968156347683682e-06,
"loss": 1.2025,
"step": 1573
},
{
"epoch": 0.42119347069842117,
"grad_norm": 3.851297616958618,
"learning_rate": 9.968056552600043e-06,
"loss": 1.2169,
"step": 1574
},
{
"epoch": 0.42146106502542147,
"grad_norm": 3.8957512378692627,
"learning_rate": 9.967956601887751e-06,
"loss": 1.3342,
"step": 1575
},
{
"epoch": 0.4217286593524217,
"grad_norm": 3.9603443145751953,
"learning_rate": 9.967856495549935e-06,
"loss": 1.2206,
"step": 1576
},
{
"epoch": 0.421996253679422,
"grad_norm": 3.6075241565704346,
"learning_rate": 9.967756233589734e-06,
"loss": 1.1752,
"step": 1577
},
{
"epoch": 0.42226384800642225,
"grad_norm": 3.690418243408203,
"learning_rate": 9.967655816010287e-06,
"loss": 1.2098,
"step": 1578
},
{
"epoch": 0.42253144233342255,
"grad_norm": 3.748853921890259,
"learning_rate": 9.967555242814738e-06,
"loss": 1.1701,
"step": 1579
},
{
"epoch": 0.4227990366604228,
"grad_norm": 3.668382406234741,
"learning_rate": 9.96745451400624e-06,
"loss": 1.1536,
"step": 1580
},
{
"epoch": 0.4230666309874231,
"grad_norm": 4.043965816497803,
"learning_rate": 9.967353629587948e-06,
"loss": 1.1062,
"step": 1581
},
{
"epoch": 0.42333422531442333,
"grad_norm": 3.860582113265991,
"learning_rate": 9.967252589563023e-06,
"loss": 1.202,
"step": 1582
},
{
"epoch": 0.4236018196414236,
"grad_norm": 3.919570207595825,
"learning_rate": 9.967151393934628e-06,
"loss": 1.0513,
"step": 1583
},
{
"epoch": 0.42386941396842387,
"grad_norm": 4.271496772766113,
"learning_rate": 9.967050042705934e-06,
"loss": 1.198,
"step": 1584
},
{
"epoch": 0.4241370082954241,
"grad_norm": 3.9681901931762695,
"learning_rate": 9.966948535880118e-06,
"loss": 1.0558,
"step": 1585
},
{
"epoch": 0.4244046026224244,
"grad_norm": 3.7213032245635986,
"learning_rate": 9.966846873460357e-06,
"loss": 1.1218,
"step": 1586
},
{
"epoch": 0.42467219694942465,
"grad_norm": 3.8692944049835205,
"learning_rate": 9.966745055449835e-06,
"loss": 1.1898,
"step": 1587
},
{
"epoch": 0.42493979127642495,
"grad_norm": 3.475710868835449,
"learning_rate": 9.966643081851746e-06,
"loss": 1.0925,
"step": 1588
},
{
"epoch": 0.4252073856034252,
"grad_norm": 3.985151767730713,
"learning_rate": 9.966540952669279e-06,
"loss": 1.1674,
"step": 1589
},
{
"epoch": 0.4254749799304255,
"grad_norm": 3.913224935531616,
"learning_rate": 9.966438667905637e-06,
"loss": 1.2583,
"step": 1590
},
{
"epoch": 0.42574257425742573,
"grad_norm": 4.151821613311768,
"learning_rate": 9.966336227564022e-06,
"loss": 1.1612,
"step": 1591
},
{
"epoch": 0.42601016858442603,
"grad_norm": 4.1624603271484375,
"learning_rate": 9.966233631647646e-06,
"loss": 1.2323,
"step": 1592
},
{
"epoch": 0.4262777629114263,
"grad_norm": 3.706627368927002,
"learning_rate": 9.96613088015972e-06,
"loss": 1.1456,
"step": 1593
},
{
"epoch": 0.42654535723842657,
"grad_norm": 3.398106336593628,
"learning_rate": 9.966027973103462e-06,
"loss": 1.0205,
"step": 1594
},
{
"epoch": 0.4268129515654268,
"grad_norm": 3.6161367893218994,
"learning_rate": 9.9659249104821e-06,
"loss": 1.1139,
"step": 1595
},
{
"epoch": 0.42708054589242705,
"grad_norm": 3.886651039123535,
"learning_rate": 9.965821692298858e-06,
"loss": 1.1711,
"step": 1596
},
{
"epoch": 0.42734814021942735,
"grad_norm": 4.018932342529297,
"learning_rate": 9.965718318556971e-06,
"loss": 1.3092,
"step": 1597
},
{
"epoch": 0.4276157345464276,
"grad_norm": 3.6425957679748535,
"learning_rate": 9.96561478925968e-06,
"loss": 1.1201,
"step": 1598
},
{
"epoch": 0.4278833288734279,
"grad_norm": 4.067368030548096,
"learning_rate": 9.965511104410224e-06,
"loss": 1.2909,
"step": 1599
},
{
"epoch": 0.42815092320042814,
"grad_norm": 3.597480297088623,
"learning_rate": 9.965407264011852e-06,
"loss": 1.264,
"step": 1600
},
{
"epoch": 0.42841851752742843,
"grad_norm": 3.685746669769287,
"learning_rate": 9.965303268067819e-06,
"loss": 1.2146,
"step": 1601
},
{
"epoch": 0.4286861118544287,
"grad_norm": 3.8286211490631104,
"learning_rate": 9.965199116581381e-06,
"loss": 1.1627,
"step": 1602
},
{
"epoch": 0.428953706181429,
"grad_norm": 3.950927495956421,
"learning_rate": 9.9650948095558e-06,
"loss": 1.1935,
"step": 1603
},
{
"epoch": 0.4292213005084292,
"grad_norm": 3.7244269847869873,
"learning_rate": 9.964990346994346e-06,
"loss": 1.1994,
"step": 1604
},
{
"epoch": 0.4294888948354295,
"grad_norm": 3.8138673305511475,
"learning_rate": 9.96488572890029e-06,
"loss": 1.0713,
"step": 1605
},
{
"epoch": 0.42975648916242976,
"grad_norm": 3.689394474029541,
"learning_rate": 9.964780955276909e-06,
"loss": 1.1475,
"step": 1606
},
{
"epoch": 0.43002408348943,
"grad_norm": 3.922783851623535,
"learning_rate": 9.964676026127484e-06,
"loss": 1.2439,
"step": 1607
},
{
"epoch": 0.4302916778164303,
"grad_norm": 4.035757541656494,
"learning_rate": 9.964570941455304e-06,
"loss": 1.2235,
"step": 1608
},
{
"epoch": 0.43055927214343054,
"grad_norm": 3.4453186988830566,
"learning_rate": 9.96446570126366e-06,
"loss": 1.0857,
"step": 1609
},
{
"epoch": 0.43082686647043084,
"grad_norm": 3.796252727508545,
"learning_rate": 9.96436030555585e-06,
"loss": 1.3118,
"step": 1610
},
{
"epoch": 0.4310944607974311,
"grad_norm": 3.9162800312042236,
"learning_rate": 9.964254754335172e-06,
"loss": 1.2191,
"step": 1611
},
{
"epoch": 0.4313620551244314,
"grad_norm": 3.400801658630371,
"learning_rate": 9.964149047604936e-06,
"loss": 1.1256,
"step": 1612
},
{
"epoch": 0.4316296494514316,
"grad_norm": 3.6633102893829346,
"learning_rate": 9.964043185368453e-06,
"loss": 1.0817,
"step": 1613
},
{
"epoch": 0.4318972437784319,
"grad_norm": 3.536027193069458,
"learning_rate": 9.963937167629039e-06,
"loss": 1.0987,
"step": 1614
},
{
"epoch": 0.43216483810543216,
"grad_norm": 3.694162368774414,
"learning_rate": 9.963830994390014e-06,
"loss": 1.2215,
"step": 1615
},
{
"epoch": 0.43243243243243246,
"grad_norm": 3.681429862976074,
"learning_rate": 9.963724665654704e-06,
"loss": 1.2003,
"step": 1616
},
{
"epoch": 0.4327000267594327,
"grad_norm": 3.9966611862182617,
"learning_rate": 9.963618181426443e-06,
"loss": 1.1236,
"step": 1617
},
{
"epoch": 0.43296762108643294,
"grad_norm": 3.875614643096924,
"learning_rate": 9.96351154170856e-06,
"loss": 1.1395,
"step": 1618
},
{
"epoch": 0.43323521541343324,
"grad_norm": 3.63798451423645,
"learning_rate": 9.963404746504403e-06,
"loss": 1.1578,
"step": 1619
},
{
"epoch": 0.4335028097404335,
"grad_norm": 3.4750473499298096,
"learning_rate": 9.963297795817312e-06,
"loss": 1.1385,
"step": 1620
},
{
"epoch": 0.4337704040674338,
"grad_norm": 3.4065208435058594,
"learning_rate": 9.963190689650642e-06,
"loss": 0.9694,
"step": 1621
},
{
"epoch": 0.434037998394434,
"grad_norm": 3.5707483291625977,
"learning_rate": 9.963083428007744e-06,
"loss": 1.1541,
"step": 1622
},
{
"epoch": 0.4343055927214343,
"grad_norm": 4.226485252380371,
"learning_rate": 9.96297601089198e-06,
"loss": 1.2521,
"step": 1623
},
{
"epoch": 0.43457318704843456,
"grad_norm": 3.8039398193359375,
"learning_rate": 9.962868438306714e-06,
"loss": 1.1212,
"step": 1624
},
{
"epoch": 0.43484078137543486,
"grad_norm": 3.8403160572052,
"learning_rate": 9.962760710255317e-06,
"loss": 1.1157,
"step": 1625
},
{
"epoch": 0.4351083757024351,
"grad_norm": 3.634899139404297,
"learning_rate": 9.962652826741164e-06,
"loss": 1.1387,
"step": 1626
},
{
"epoch": 0.4353759700294354,
"grad_norm": 3.621347188949585,
"learning_rate": 9.962544787767634e-06,
"loss": 1.1823,
"step": 1627
},
{
"epoch": 0.43564356435643564,
"grad_norm": 4.029088973999023,
"learning_rate": 9.962436593338109e-06,
"loss": 1.0639,
"step": 1628
},
{
"epoch": 0.4359111586834359,
"grad_norm": 3.6505720615386963,
"learning_rate": 9.962328243455983e-06,
"loss": 1.1589,
"step": 1629
},
{
"epoch": 0.4361787530104362,
"grad_norm": 3.7965028285980225,
"learning_rate": 9.962219738124645e-06,
"loss": 1.1716,
"step": 1630
},
{
"epoch": 0.4364463473374364,
"grad_norm": 3.631714105606079,
"learning_rate": 9.962111077347499e-06,
"loss": 1.1798,
"step": 1631
},
{
"epoch": 0.4367139416644367,
"grad_norm": 3.4737563133239746,
"learning_rate": 9.962002261127946e-06,
"loss": 1.224,
"step": 1632
},
{
"epoch": 0.43698153599143696,
"grad_norm": 4.074963092803955,
"learning_rate": 9.961893289469394e-06,
"loss": 1.2144,
"step": 1633
},
{
"epoch": 0.43724913031843726,
"grad_norm": 3.729600429534912,
"learning_rate": 9.961784162375258e-06,
"loss": 1.2326,
"step": 1634
},
{
"epoch": 0.4375167246454375,
"grad_norm": 3.4806137084960938,
"learning_rate": 9.961674879848957e-06,
"loss": 0.9848,
"step": 1635
},
{
"epoch": 0.4377843189724378,
"grad_norm": 3.5938944816589355,
"learning_rate": 9.961565441893914e-06,
"loss": 1.0944,
"step": 1636
},
{
"epoch": 0.43805191329943804,
"grad_norm": 3.7552433013916016,
"learning_rate": 9.961455848513557e-06,
"loss": 1.1817,
"step": 1637
},
{
"epoch": 0.43831950762643834,
"grad_norm": 3.6959292888641357,
"learning_rate": 9.961346099711319e-06,
"loss": 1.1635,
"step": 1638
},
{
"epoch": 0.4385871019534386,
"grad_norm": 4.031107425689697,
"learning_rate": 9.961236195490638e-06,
"loss": 1.3005,
"step": 1639
},
{
"epoch": 0.4388546962804388,
"grad_norm": 3.8287787437438965,
"learning_rate": 9.961126135854957e-06,
"loss": 1.0702,
"step": 1640
},
{
"epoch": 0.4391222906074391,
"grad_norm": 3.573241949081421,
"learning_rate": 9.961015920807722e-06,
"loss": 1.0902,
"step": 1641
},
{
"epoch": 0.43938988493443937,
"grad_norm": 3.861870050430298,
"learning_rate": 9.96090555035239e-06,
"loss": 1.1281,
"step": 1642
},
{
"epoch": 0.43965747926143967,
"grad_norm": 3.52722430229187,
"learning_rate": 9.960795024492413e-06,
"loss": 1.0831,
"step": 1643
},
{
"epoch": 0.4399250735884399,
"grad_norm": 3.898618221282959,
"learning_rate": 9.960684343231258e-06,
"loss": 1.1533,
"step": 1644
},
{
"epoch": 0.4401926679154402,
"grad_norm": 3.61409854888916,
"learning_rate": 9.960573506572391e-06,
"loss": 1.103,
"step": 1645
},
{
"epoch": 0.44046026224244045,
"grad_norm": 3.5603694915771484,
"learning_rate": 9.96046251451928e-06,
"loss": 1.0903,
"step": 1646
},
{
"epoch": 0.44072785656944075,
"grad_norm": 3.9450957775115967,
"learning_rate": 9.960351367075407e-06,
"loss": 1.316,
"step": 1647
},
{
"epoch": 0.440995450896441,
"grad_norm": 3.022848606109619,
"learning_rate": 9.960240064244253e-06,
"loss": 0.9156,
"step": 1648
},
{
"epoch": 0.4412630452234413,
"grad_norm": 3.8123509883880615,
"learning_rate": 9.960128606029302e-06,
"loss": 1.2591,
"step": 1649
},
{
"epoch": 0.44153063955044153,
"grad_norm": 3.739405870437622,
"learning_rate": 9.960016992434047e-06,
"loss": 1.1892,
"step": 1650
},
{
"epoch": 0.44179823387744177,
"grad_norm": 4.091071605682373,
"learning_rate": 9.959905223461985e-06,
"loss": 1.2138,
"step": 1651
},
{
"epoch": 0.44206582820444207,
"grad_norm": 3.7963550090789795,
"learning_rate": 9.959793299116617e-06,
"loss": 1.1637,
"step": 1652
},
{
"epoch": 0.4423334225314423,
"grad_norm": 3.7637977600097656,
"learning_rate": 9.959681219401449e-06,
"loss": 1.1168,
"step": 1653
},
{
"epoch": 0.4426010168584426,
"grad_norm": 3.74827241897583,
"learning_rate": 9.959568984319991e-06,
"loss": 1.1661,
"step": 1654
},
{
"epoch": 0.44286861118544285,
"grad_norm": 3.401951551437378,
"learning_rate": 9.95945659387576e-06,
"loss": 1.1605,
"step": 1655
},
{
"epoch": 0.44313620551244315,
"grad_norm": 3.677436590194702,
"learning_rate": 9.959344048072278e-06,
"loss": 1.2423,
"step": 1656
},
{
"epoch": 0.4434037998394434,
"grad_norm": 4.045171737670898,
"learning_rate": 9.959231346913068e-06,
"loss": 1.2646,
"step": 1657
},
{
"epoch": 0.4436713941664437,
"grad_norm": 4.872179985046387,
"learning_rate": 9.95911849040166e-06,
"loss": 1.2235,
"step": 1658
},
{
"epoch": 0.44393898849344393,
"grad_norm": 3.714308023452759,
"learning_rate": 9.959005478541592e-06,
"loss": 1.074,
"step": 1659
},
{
"epoch": 0.44420658282044423,
"grad_norm": 3.844395160675049,
"learning_rate": 9.958892311336404e-06,
"loss": 1.1599,
"step": 1660
},
{
"epoch": 0.44447417714744447,
"grad_norm": 3.9484751224517822,
"learning_rate": 9.958778988789639e-06,
"loss": 1.088,
"step": 1661
},
{
"epoch": 0.44474177147444477,
"grad_norm": 4.029232501983643,
"learning_rate": 9.958665510904849e-06,
"loss": 1.2989,
"step": 1662
},
{
"epoch": 0.445009365801445,
"grad_norm": 4.975223064422607,
"learning_rate": 9.958551877685586e-06,
"loss": 1.104,
"step": 1663
},
{
"epoch": 0.44527696012844525,
"grad_norm": 3.774137496948242,
"learning_rate": 9.958438089135413e-06,
"loss": 1.0331,
"step": 1664
},
{
"epoch": 0.44554455445544555,
"grad_norm": 3.653740644454956,
"learning_rate": 9.958324145257893e-06,
"loss": 1.1029,
"step": 1665
},
{
"epoch": 0.4458121487824458,
"grad_norm": 4.142993450164795,
"learning_rate": 9.958210046056596e-06,
"loss": 1.2692,
"step": 1666
},
{
"epoch": 0.4460797431094461,
"grad_norm": 4.0442609786987305,
"learning_rate": 9.958095791535095e-06,
"loss": 1.1862,
"step": 1667
},
{
"epoch": 0.44634733743644633,
"grad_norm": 3.8392021656036377,
"learning_rate": 9.957981381696971e-06,
"loss": 1.2256,
"step": 1668
},
{
"epoch": 0.44661493176344663,
"grad_norm": 3.4839348793029785,
"learning_rate": 9.957866816545804e-06,
"loss": 1.1202,
"step": 1669
},
{
"epoch": 0.4468825260904469,
"grad_norm": 3.9099719524383545,
"learning_rate": 9.957752096085187e-06,
"loss": 1.1985,
"step": 1670
},
{
"epoch": 0.44715012041744717,
"grad_norm": 3.7614877223968506,
"learning_rate": 9.957637220318711e-06,
"loss": 1.2736,
"step": 1671
},
{
"epoch": 0.4474177147444474,
"grad_norm": 4.555272579193115,
"learning_rate": 9.957522189249979e-06,
"loss": 1.1661,
"step": 1672
},
{
"epoch": 0.4476853090714477,
"grad_norm": 3.6910009384155273,
"learning_rate": 9.95740700288259e-06,
"loss": 1.3053,
"step": 1673
},
{
"epoch": 0.44795290339844795,
"grad_norm": 3.7056405544281006,
"learning_rate": 9.957291661220154e-06,
"loss": 1.1668,
"step": 1674
},
{
"epoch": 0.4482204977254482,
"grad_norm": 3.738818645477295,
"learning_rate": 9.957176164266283e-06,
"loss": 1.2925,
"step": 1675
},
{
"epoch": 0.4484880920524485,
"grad_norm": 3.5437395572662354,
"learning_rate": 9.957060512024595e-06,
"loss": 1.1237,
"step": 1676
},
{
"epoch": 0.44875568637944874,
"grad_norm": 3.508234977722168,
"learning_rate": 9.956944704498715e-06,
"loss": 1.1684,
"step": 1677
},
{
"epoch": 0.44902328070644904,
"grad_norm": 3.910888433456421,
"learning_rate": 9.95682874169227e-06,
"loss": 1.1898,
"step": 1678
},
{
"epoch": 0.4492908750334493,
"grad_norm": 3.8787474632263184,
"learning_rate": 9.956712623608892e-06,
"loss": 1.1573,
"step": 1679
},
{
"epoch": 0.4495584693604496,
"grad_norm": 3.672773838043213,
"learning_rate": 9.95659635025222e-06,
"loss": 1.042,
"step": 1680
},
{
"epoch": 0.4498260636874498,
"grad_norm": 3.811082601547241,
"learning_rate": 9.956479921625892e-06,
"loss": 1.2272,
"step": 1681
},
{
"epoch": 0.4500936580144501,
"grad_norm": 4.042838096618652,
"learning_rate": 9.95636333773356e-06,
"loss": 1.2527,
"step": 1682
},
{
"epoch": 0.45036125234145036,
"grad_norm": 3.418757915496826,
"learning_rate": 9.956246598578874e-06,
"loss": 1.1833,
"step": 1683
},
{
"epoch": 0.45062884666845066,
"grad_norm": 3.8703603744506836,
"learning_rate": 9.956129704165491e-06,
"loss": 1.2848,
"step": 1684
},
{
"epoch": 0.4508964409954509,
"grad_norm": 3.5439083576202393,
"learning_rate": 9.956012654497073e-06,
"loss": 1.0018,
"step": 1685
},
{
"epoch": 0.45116403532245114,
"grad_norm": 3.858811616897583,
"learning_rate": 9.955895449577289e-06,
"loss": 1.204,
"step": 1686
},
{
"epoch": 0.45143162964945144,
"grad_norm": 3.636284351348877,
"learning_rate": 9.955778089409806e-06,
"loss": 1.1952,
"step": 1687
},
{
"epoch": 0.4516992239764517,
"grad_norm": 3.6916327476501465,
"learning_rate": 9.955660573998305e-06,
"loss": 1.1277,
"step": 1688
},
{
"epoch": 0.451966818303452,
"grad_norm": 3.550443172454834,
"learning_rate": 9.955542903346462e-06,
"loss": 1.1669,
"step": 1689
},
{
"epoch": 0.4522344126304522,
"grad_norm": 3.5514254570007324,
"learning_rate": 9.95542507745797e-06,
"loss": 1.2038,
"step": 1690
},
{
"epoch": 0.4525020069574525,
"grad_norm": 3.6606192588806152,
"learning_rate": 9.955307096336513e-06,
"loss": 1.0902,
"step": 1691
},
{
"epoch": 0.45276960128445276,
"grad_norm": 3.676407814025879,
"learning_rate": 9.955188959985792e-06,
"loss": 1.1543,
"step": 1692
},
{
"epoch": 0.45303719561145306,
"grad_norm": 3.630408763885498,
"learning_rate": 9.955070668409505e-06,
"loss": 1.1552,
"step": 1693
},
{
"epoch": 0.4533047899384533,
"grad_norm": 3.992326259613037,
"learning_rate": 9.954952221611359e-06,
"loss": 1.2438,
"step": 1694
},
{
"epoch": 0.4535723842654536,
"grad_norm": 3.313997507095337,
"learning_rate": 9.954833619595062e-06,
"loss": 1.1001,
"step": 1695
},
{
"epoch": 0.45383997859245384,
"grad_norm": 3.4902310371398926,
"learning_rate": 9.954714862364331e-06,
"loss": 1.0505,
"step": 1696
},
{
"epoch": 0.4541075729194541,
"grad_norm": 3.9076476097106934,
"learning_rate": 9.954595949922889e-06,
"loss": 1.3215,
"step": 1697
},
{
"epoch": 0.4543751672464544,
"grad_norm": 4.445606708526611,
"learning_rate": 9.954476882274458e-06,
"loss": 1.2867,
"step": 1698
},
{
"epoch": 0.4546427615734546,
"grad_norm": 4.114322662353516,
"learning_rate": 9.954357659422766e-06,
"loss": 1.2867,
"step": 1699
},
{
"epoch": 0.4549103559004549,
"grad_norm": 4.046489238739014,
"learning_rate": 9.95423828137155e-06,
"loss": 1.3018,
"step": 1700
},
{
"epoch": 0.45517795022745516,
"grad_norm": 4.090691089630127,
"learning_rate": 9.954118748124552e-06,
"loss": 1.1618,
"step": 1701
},
{
"epoch": 0.45544554455445546,
"grad_norm": 3.791952133178711,
"learning_rate": 9.953999059685513e-06,
"loss": 1.2585,
"step": 1702
},
{
"epoch": 0.4557131388814557,
"grad_norm": 3.9405517578125,
"learning_rate": 9.953879216058185e-06,
"loss": 1.2347,
"step": 1703
},
{
"epoch": 0.455980733208456,
"grad_norm": 3.291191816329956,
"learning_rate": 9.953759217246318e-06,
"loss": 1.1054,
"step": 1704
},
{
"epoch": 0.45624832753545624,
"grad_norm": 3.942545175552368,
"learning_rate": 9.953639063253675e-06,
"loss": 1.1939,
"step": 1705
},
{
"epoch": 0.45651592186245654,
"grad_norm": 3.7756292819976807,
"learning_rate": 9.953518754084019e-06,
"loss": 1.248,
"step": 1706
},
{
"epoch": 0.4567835161894568,
"grad_norm": 3.8124051094055176,
"learning_rate": 9.953398289741116e-06,
"loss": 1.2876,
"step": 1707
},
{
"epoch": 0.457051110516457,
"grad_norm": 3.8491921424865723,
"learning_rate": 9.953277670228745e-06,
"loss": 1.1995,
"step": 1708
},
{
"epoch": 0.4573187048434573,
"grad_norm": 3.814667224884033,
"learning_rate": 9.95315689555068e-06,
"loss": 1.0789,
"step": 1709
},
{
"epoch": 0.45758629917045757,
"grad_norm": 3.2791123390197754,
"learning_rate": 9.953035965710707e-06,
"loss": 1.0521,
"step": 1710
},
{
"epoch": 0.45785389349745786,
"grad_norm": 3.559877395629883,
"learning_rate": 9.952914880712611e-06,
"loss": 1.0571,
"step": 1711
},
{
"epoch": 0.4581214878244581,
"grad_norm": 3.3893074989318848,
"learning_rate": 9.952793640560189e-06,
"loss": 1.0054,
"step": 1712
},
{
"epoch": 0.4583890821514584,
"grad_norm": 3.697608232498169,
"learning_rate": 9.952672245257238e-06,
"loss": 1.1619,
"step": 1713
},
{
"epoch": 0.45865667647845865,
"grad_norm": 3.5724332332611084,
"learning_rate": 9.95255069480756e-06,
"loss": 1.1086,
"step": 1714
},
{
"epoch": 0.45892427080545894,
"grad_norm": 4.316122055053711,
"learning_rate": 9.952428989214962e-06,
"loss": 1.3456,
"step": 1715
},
{
"epoch": 0.4591918651324592,
"grad_norm": 3.5813887119293213,
"learning_rate": 9.952307128483257e-06,
"loss": 1.2169,
"step": 1716
},
{
"epoch": 0.4594594594594595,
"grad_norm": 4.544564723968506,
"learning_rate": 9.952185112616263e-06,
"loss": 1.3719,
"step": 1717
},
{
"epoch": 0.4597270537864597,
"grad_norm": 3.653928518295288,
"learning_rate": 9.952062941617801e-06,
"loss": 1.1425,
"step": 1718
},
{
"epoch": 0.45999464811345997,
"grad_norm": 3.9661028385162354,
"learning_rate": 9.9519406154917e-06,
"loss": 1.2444,
"step": 1719
},
{
"epoch": 0.46026224244046027,
"grad_norm": 3.9497625827789307,
"learning_rate": 9.95181813424179e-06,
"loss": 1.1364,
"step": 1720
},
{
"epoch": 0.4605298367674605,
"grad_norm": 3.682626962661743,
"learning_rate": 9.95169549787191e-06,
"loss": 1.0874,
"step": 1721
},
{
"epoch": 0.4607974310944608,
"grad_norm": 4.0337233543396,
"learning_rate": 9.951572706385901e-06,
"loss": 1.2206,
"step": 1722
},
{
"epoch": 0.46106502542146105,
"grad_norm": 3.902106761932373,
"learning_rate": 9.951449759787608e-06,
"loss": 1.2841,
"step": 1723
},
{
"epoch": 0.46133261974846135,
"grad_norm": 3.6979806423187256,
"learning_rate": 9.951326658080881e-06,
"loss": 1.1272,
"step": 1724
},
{
"epoch": 0.4616002140754616,
"grad_norm": 3.5009329319000244,
"learning_rate": 9.951203401269582e-06,
"loss": 1.0615,
"step": 1725
},
{
"epoch": 0.4618678084024619,
"grad_norm": 3.846033811569214,
"learning_rate": 9.951079989357569e-06,
"loss": 1.267,
"step": 1726
},
{
"epoch": 0.46213540272946213,
"grad_norm": 3.7044687271118164,
"learning_rate": 9.950956422348708e-06,
"loss": 1.228,
"step": 1727
},
{
"epoch": 0.46240299705646243,
"grad_norm": 3.8444931507110596,
"learning_rate": 9.950832700246868e-06,
"loss": 1.1271,
"step": 1728
},
{
"epoch": 0.46267059138346267,
"grad_norm": 3.61540150642395,
"learning_rate": 9.950708823055926e-06,
"loss": 1.1431,
"step": 1729
},
{
"epoch": 0.4629381857104629,
"grad_norm": 3.617910623550415,
"learning_rate": 9.950584790779765e-06,
"loss": 1.1046,
"step": 1730
},
{
"epoch": 0.4632057800374632,
"grad_norm": 4.220783233642578,
"learning_rate": 9.950460603422266e-06,
"loss": 1.2734,
"step": 1731
},
{
"epoch": 0.46347337436446345,
"grad_norm": 3.5768558979034424,
"learning_rate": 9.950336260987323e-06,
"loss": 1.1693,
"step": 1732
},
{
"epoch": 0.46374096869146375,
"grad_norm": 3.8983094692230225,
"learning_rate": 9.950211763478829e-06,
"loss": 1.1813,
"step": 1733
},
{
"epoch": 0.464008563018464,
"grad_norm": 3.659959316253662,
"learning_rate": 9.950087110900686e-06,
"loss": 1.204,
"step": 1734
},
{
"epoch": 0.4642761573454643,
"grad_norm": 3.40910005569458,
"learning_rate": 9.949962303256796e-06,
"loss": 1.1507,
"step": 1735
},
{
"epoch": 0.46454375167246453,
"grad_norm": 3.770167112350464,
"learning_rate": 9.949837340551072e-06,
"loss": 1.121,
"step": 1736
},
{
"epoch": 0.46481134599946483,
"grad_norm": 3.7433769702911377,
"learning_rate": 9.949712222787426e-06,
"loss": 1.1993,
"step": 1737
},
{
"epoch": 0.4650789403264651,
"grad_norm": 3.9215848445892334,
"learning_rate": 9.94958694996978e-06,
"loss": 1.211,
"step": 1738
},
{
"epoch": 0.46534653465346537,
"grad_norm": 3.646552324295044,
"learning_rate": 9.949461522102056e-06,
"loss": 1.1609,
"step": 1739
},
{
"epoch": 0.4656141289804656,
"grad_norm": 3.452594757080078,
"learning_rate": 9.949335939188181e-06,
"loss": 1.0887,
"step": 1740
},
{
"epoch": 0.46588172330746586,
"grad_norm": 3.6262283325195312,
"learning_rate": 9.949210201232094e-06,
"loss": 1.2381,
"step": 1741
},
{
"epoch": 0.46614931763446615,
"grad_norm": 3.7934439182281494,
"learning_rate": 9.949084308237731e-06,
"loss": 1.2578,
"step": 1742
},
{
"epoch": 0.4664169119614664,
"grad_norm": 4.024771690368652,
"learning_rate": 9.948958260209036e-06,
"loss": 1.2287,
"step": 1743
},
{
"epoch": 0.4666845062884667,
"grad_norm": 4.1637115478515625,
"learning_rate": 9.948832057149958e-06,
"loss": 1.4259,
"step": 1744
},
{
"epoch": 0.46695210061546694,
"grad_norm": 3.3593788146972656,
"learning_rate": 9.948705699064452e-06,
"loss": 1.0906,
"step": 1745
},
{
"epoch": 0.46721969494246723,
"grad_norm": 4.606101989746094,
"learning_rate": 9.948579185956472e-06,
"loss": 1.3554,
"step": 1746
},
{
"epoch": 0.4674872892694675,
"grad_norm": 4.054565906524658,
"learning_rate": 9.948452517829984e-06,
"loss": 1.316,
"step": 1747
},
{
"epoch": 0.4677548835964678,
"grad_norm": 3.526566982269287,
"learning_rate": 9.948325694688957e-06,
"loss": 1.1971,
"step": 1748
},
{
"epoch": 0.468022477923468,
"grad_norm": 3.647592544555664,
"learning_rate": 9.948198716537361e-06,
"loss": 1.1751,
"step": 1749
},
{
"epoch": 0.4682900722504683,
"grad_norm": 4.159237384796143,
"learning_rate": 9.948071583379176e-06,
"loss": 1.1648,
"step": 1750
},
{
"epoch": 0.46855766657746856,
"grad_norm": 3.9114818572998047,
"learning_rate": 9.947944295218384e-06,
"loss": 1.2213,
"step": 1751
},
{
"epoch": 0.4688252609044688,
"grad_norm": 3.539741039276123,
"learning_rate": 9.947816852058972e-06,
"loss": 1.1406,
"step": 1752
},
{
"epoch": 0.4690928552314691,
"grad_norm": 3.950688600540161,
"learning_rate": 9.947689253904932e-06,
"loss": 1.1622,
"step": 1753
},
{
"epoch": 0.46936044955846934,
"grad_norm": 3.5240933895111084,
"learning_rate": 9.94756150076026e-06,
"loss": 1.1184,
"step": 1754
},
{
"epoch": 0.46962804388546964,
"grad_norm": 3.455580472946167,
"learning_rate": 9.947433592628964e-06,
"loss": 1.1482,
"step": 1755
},
{
"epoch": 0.4698956382124699,
"grad_norm": 3.8551056385040283,
"learning_rate": 9.947305529515041e-06,
"loss": 1.2818,
"step": 1756
},
{
"epoch": 0.4701632325394702,
"grad_norm": 3.4578497409820557,
"learning_rate": 9.947177311422513e-06,
"loss": 1.1184,
"step": 1757
},
{
"epoch": 0.4704308268664704,
"grad_norm": 3.092772960662842,
"learning_rate": 9.947048938355389e-06,
"loss": 1.1181,
"step": 1758
},
{
"epoch": 0.4706984211934707,
"grad_norm": 3.7363767623901367,
"learning_rate": 9.946920410317694e-06,
"loss": 1.1509,
"step": 1759
},
{
"epoch": 0.47096601552047096,
"grad_norm": 3.5836987495422363,
"learning_rate": 9.946791727313453e-06,
"loss": 1.2198,
"step": 1760
},
{
"epoch": 0.47123360984747126,
"grad_norm": 3.2804808616638184,
"learning_rate": 9.946662889346693e-06,
"loss": 1.2418,
"step": 1761
},
{
"epoch": 0.4715012041744715,
"grad_norm": 3.3668692111968994,
"learning_rate": 9.94653389642146e-06,
"loss": 1.0194,
"step": 1762
},
{
"epoch": 0.47176879850147174,
"grad_norm": 3.753690481185913,
"learning_rate": 9.946404748541787e-06,
"loss": 1.194,
"step": 1763
},
{
"epoch": 0.47203639282847204,
"grad_norm": 3.6186954975128174,
"learning_rate": 9.946275445711722e-06,
"loss": 1.2537,
"step": 1764
},
{
"epoch": 0.4723039871554723,
"grad_norm": 3.8216750621795654,
"learning_rate": 9.946145987935315e-06,
"loss": 1.0967,
"step": 1765
},
{
"epoch": 0.4725715814824726,
"grad_norm": 3.866121530532837,
"learning_rate": 9.946016375216624e-06,
"loss": 1.1481,
"step": 1766
},
{
"epoch": 0.4728391758094728,
"grad_norm": 3.7349894046783447,
"learning_rate": 9.945886607559703e-06,
"loss": 1.1937,
"step": 1767
},
{
"epoch": 0.4731067701364731,
"grad_norm": 3.244333505630493,
"learning_rate": 9.945756684968624e-06,
"loss": 1.062,
"step": 1768
},
{
"epoch": 0.47337436446347336,
"grad_norm": 3.341917037963867,
"learning_rate": 9.945626607447452e-06,
"loss": 1.1912,
"step": 1769
},
{
"epoch": 0.47364195879047366,
"grad_norm": 3.748084545135498,
"learning_rate": 9.945496375000265e-06,
"loss": 1.1499,
"step": 1770
},
{
"epoch": 0.4739095531174739,
"grad_norm": 4.144589900970459,
"learning_rate": 9.94536598763114e-06,
"loss": 1.3394,
"step": 1771
},
{
"epoch": 0.4741771474444742,
"grad_norm": 3.9339520931243896,
"learning_rate": 9.945235445344164e-06,
"loss": 1.319,
"step": 1772
},
{
"epoch": 0.47444474177147444,
"grad_norm": 3.8457014560699463,
"learning_rate": 9.945104748143426e-06,
"loss": 1.1954,
"step": 1773
},
{
"epoch": 0.4747123360984747,
"grad_norm": 3.6537280082702637,
"learning_rate": 9.944973896033017e-06,
"loss": 1.1368,
"step": 1774
},
{
"epoch": 0.474979930425475,
"grad_norm": 3.5252084732055664,
"learning_rate": 9.944842889017042e-06,
"loss": 1.1575,
"step": 1775
},
{
"epoch": 0.4752475247524752,
"grad_norm": 3.692296266555786,
"learning_rate": 9.944711727099597e-06,
"loss": 1.1172,
"step": 1776
},
{
"epoch": 0.4755151190794755,
"grad_norm": 3.7176449298858643,
"learning_rate": 9.944580410284799e-06,
"loss": 1.3006,
"step": 1777
},
{
"epoch": 0.47578271340647577,
"grad_norm": 3.9314935207366943,
"learning_rate": 9.944448938576755e-06,
"loss": 1.1859,
"step": 1778
},
{
"epoch": 0.47605030773347606,
"grad_norm": 3.7619473934173584,
"learning_rate": 9.944317311979587e-06,
"loss": 1.273,
"step": 1779
},
{
"epoch": 0.4763179020604763,
"grad_norm": 3.6345512866973877,
"learning_rate": 9.944185530497419e-06,
"loss": 1.1356,
"step": 1780
},
{
"epoch": 0.4765854963874766,
"grad_norm": 4.064966201782227,
"learning_rate": 9.944053594134374e-06,
"loss": 1.3057,
"step": 1781
},
{
"epoch": 0.47685309071447685,
"grad_norm": 3.3477392196655273,
"learning_rate": 9.943921502894593e-06,
"loss": 1.1066,
"step": 1782
},
{
"epoch": 0.47712068504147714,
"grad_norm": 3.5054497718811035,
"learning_rate": 9.943789256782208e-06,
"loss": 1.1497,
"step": 1783
},
{
"epoch": 0.4773882793684774,
"grad_norm": 3.596972703933716,
"learning_rate": 9.943656855801364e-06,
"loss": 1.195,
"step": 1784
},
{
"epoch": 0.47765587369547763,
"grad_norm": 3.870314598083496,
"learning_rate": 9.943524299956206e-06,
"loss": 1.1513,
"step": 1785
},
{
"epoch": 0.4779234680224779,
"grad_norm": 3.6923234462738037,
"learning_rate": 9.94339158925089e-06,
"loss": 1.2705,
"step": 1786
},
{
"epoch": 0.47819106234947817,
"grad_norm": 3.6829121112823486,
"learning_rate": 9.94325872368957e-06,
"loss": 1.1401,
"step": 1787
},
{
"epoch": 0.47845865667647847,
"grad_norm": 3.6094305515289307,
"learning_rate": 9.943125703276411e-06,
"loss": 1.2009,
"step": 1788
},
{
"epoch": 0.4787262510034787,
"grad_norm": 3.806605339050293,
"learning_rate": 9.94299252801558e-06,
"loss": 1.1255,
"step": 1789
},
{
"epoch": 0.478993845330479,
"grad_norm": 3.4413986206054688,
"learning_rate": 9.942859197911246e-06,
"loss": 1.099,
"step": 1790
},
{
"epoch": 0.47926143965747925,
"grad_norm": 3.7464005947113037,
"learning_rate": 9.942725712967587e-06,
"loss": 1.1829,
"step": 1791
},
{
"epoch": 0.47952903398447955,
"grad_norm": 4.125034809112549,
"learning_rate": 9.942592073188783e-06,
"loss": 1.3325,
"step": 1792
},
{
"epoch": 0.4797966283114798,
"grad_norm": 3.741257429122925,
"learning_rate": 9.942458278579026e-06,
"loss": 1.1842,
"step": 1793
},
{
"epoch": 0.4800642226384801,
"grad_norm": 3.944084405899048,
"learning_rate": 9.9423243291425e-06,
"loss": 1.3479,
"step": 1794
},
{
"epoch": 0.48033181696548033,
"grad_norm": 3.8254520893096924,
"learning_rate": 9.942190224883406e-06,
"loss": 1.204,
"step": 1795
},
{
"epoch": 0.48059941129248057,
"grad_norm": 3.4253695011138916,
"learning_rate": 9.942055965805943e-06,
"loss": 1.0251,
"step": 1796
},
{
"epoch": 0.48086700561948087,
"grad_norm": 3.6683967113494873,
"learning_rate": 9.941921551914318e-06,
"loss": 1.1936,
"step": 1797
},
{
"epoch": 0.4811345999464811,
"grad_norm": 3.3697001934051514,
"learning_rate": 9.94178698321274e-06,
"loss": 1.0839,
"step": 1798
},
{
"epoch": 0.4814021942734814,
"grad_norm": 3.724254846572876,
"learning_rate": 9.941652259705425e-06,
"loss": 1.2582,
"step": 1799
},
{
"epoch": 0.48166978860048165,
"grad_norm": 3.8191325664520264,
"learning_rate": 9.941517381396594e-06,
"loss": 1.1972,
"step": 1800
},
{
"epoch": 0.48193738292748195,
"grad_norm": 3.812429904937744,
"learning_rate": 9.941382348290471e-06,
"loss": 1.1348,
"step": 1801
},
{
"epoch": 0.4822049772544822,
"grad_norm": 3.5466363430023193,
"learning_rate": 9.941247160391288e-06,
"loss": 1.2157,
"step": 1802
},
{
"epoch": 0.4824725715814825,
"grad_norm": 3.9619693756103516,
"learning_rate": 9.94111181770328e-06,
"loss": 1.2637,
"step": 1803
},
{
"epoch": 0.48274016590848273,
"grad_norm": 3.724550485610962,
"learning_rate": 9.940976320230682e-06,
"loss": 1.1937,
"step": 1804
},
{
"epoch": 0.48300776023548303,
"grad_norm": 3.91395902633667,
"learning_rate": 9.940840667977745e-06,
"loss": 1.1857,
"step": 1805
},
{
"epoch": 0.4832753545624833,
"grad_norm": 3.2114241123199463,
"learning_rate": 9.940704860948713e-06,
"loss": 1.0522,
"step": 1806
},
{
"epoch": 0.48354294888948357,
"grad_norm": 3.380030870437622,
"learning_rate": 9.940568899147844e-06,
"loss": 1.1107,
"step": 1807
},
{
"epoch": 0.4838105432164838,
"grad_norm": 3.728666067123413,
"learning_rate": 9.940432782579395e-06,
"loss": 1.2433,
"step": 1808
},
{
"epoch": 0.48407813754348405,
"grad_norm": 3.6429684162139893,
"learning_rate": 9.940296511247631e-06,
"loss": 1.0235,
"step": 1809
},
{
"epoch": 0.48434573187048435,
"grad_norm": 3.6570608615875244,
"learning_rate": 9.94016008515682e-06,
"loss": 1.1839,
"step": 1810
},
{
"epoch": 0.4846133261974846,
"grad_norm": 3.6205060482025146,
"learning_rate": 9.940023504311237e-06,
"loss": 1.0398,
"step": 1811
},
{
"epoch": 0.4848809205244849,
"grad_norm": 3.8173046112060547,
"learning_rate": 9.93988676871516e-06,
"loss": 1.1421,
"step": 1812
},
{
"epoch": 0.48514851485148514,
"grad_norm": 3.792316198348999,
"learning_rate": 9.939749878372873e-06,
"loss": 1.2172,
"step": 1813
},
{
"epoch": 0.48541610917848543,
"grad_norm": 3.5528106689453125,
"learning_rate": 9.939612833288662e-06,
"loss": 1.0964,
"step": 1814
},
{
"epoch": 0.4856837035054857,
"grad_norm": 3.7796194553375244,
"learning_rate": 9.939475633466822e-06,
"loss": 1.1715,
"step": 1815
},
{
"epoch": 0.485951297832486,
"grad_norm": 4.104870319366455,
"learning_rate": 9.93933827891165e-06,
"loss": 1.321,
"step": 1816
},
{
"epoch": 0.4862188921594862,
"grad_norm": 3.6252858638763428,
"learning_rate": 9.93920076962745e-06,
"loss": 1.1834,
"step": 1817
},
{
"epoch": 0.4864864864864865,
"grad_norm": 3.8419394493103027,
"learning_rate": 9.939063105618525e-06,
"loss": 1.0172,
"step": 1818
},
{
"epoch": 0.48675408081348676,
"grad_norm": 3.7632715702056885,
"learning_rate": 9.938925286889194e-06,
"loss": 1.1501,
"step": 1819
},
{
"epoch": 0.487021675140487,
"grad_norm": 3.928379535675049,
"learning_rate": 9.938787313443771e-06,
"loss": 1.2283,
"step": 1820
},
{
"epoch": 0.4872892694674873,
"grad_norm": 3.341074228286743,
"learning_rate": 9.93864918528658e-06,
"loss": 1.038,
"step": 1821
},
{
"epoch": 0.48755686379448754,
"grad_norm": 3.8273613452911377,
"learning_rate": 9.938510902421945e-06,
"loss": 1.2315,
"step": 1822
},
{
"epoch": 0.48782445812148784,
"grad_norm": 3.6578738689422607,
"learning_rate": 9.938372464854198e-06,
"loss": 1.1331,
"step": 1823
},
{
"epoch": 0.4880920524484881,
"grad_norm": 3.7590830326080322,
"learning_rate": 9.93823387258768e-06,
"loss": 1.0829,
"step": 1824
},
{
"epoch": 0.4883596467754884,
"grad_norm": 3.6043503284454346,
"learning_rate": 9.938095125626726e-06,
"loss": 1.0529,
"step": 1825
},
{
"epoch": 0.4886272411024886,
"grad_norm": 3.854071617126465,
"learning_rate": 9.93795622397569e-06,
"loss": 1.2383,
"step": 1826
},
{
"epoch": 0.4888948354294889,
"grad_norm": 3.758488416671753,
"learning_rate": 9.937817167638914e-06,
"loss": 1.0957,
"step": 1827
},
{
"epoch": 0.48916242975648916,
"grad_norm": 3.695533514022827,
"learning_rate": 9.937677956620764e-06,
"loss": 1.3151,
"step": 1828
},
{
"epoch": 0.48943002408348946,
"grad_norm": 3.5443248748779297,
"learning_rate": 9.937538590925593e-06,
"loss": 1.0494,
"step": 1829
},
{
"epoch": 0.4896976184104897,
"grad_norm": 3.6536788940429688,
"learning_rate": 9.937399070557771e-06,
"loss": 1.218,
"step": 1830
},
{
"epoch": 0.48996521273748994,
"grad_norm": 3.929737091064453,
"learning_rate": 9.937259395521667e-06,
"loss": 1.1923,
"step": 1831
},
{
"epoch": 0.49023280706449024,
"grad_norm": 3.3342623710632324,
"learning_rate": 9.937119565821658e-06,
"loss": 1.1186,
"step": 1832
},
{
"epoch": 0.4905004013914905,
"grad_norm": 3.6802546977996826,
"learning_rate": 9.936979581462122e-06,
"loss": 1.2171,
"step": 1833
},
{
"epoch": 0.4907679957184908,
"grad_norm": 3.4526920318603516,
"learning_rate": 9.936839442447446e-06,
"loss": 1.0922,
"step": 1834
},
{
"epoch": 0.491035590045491,
"grad_norm": 3.7504050731658936,
"learning_rate": 9.936699148782018e-06,
"loss": 1.0743,
"step": 1835
},
{
"epoch": 0.4913031843724913,
"grad_norm": 3.873074531555176,
"learning_rate": 9.936558700470234e-06,
"loss": 1.2213,
"step": 1836
},
{
"epoch": 0.49157077869949156,
"grad_norm": 3.6571013927459717,
"learning_rate": 9.936418097516495e-06,
"loss": 1.123,
"step": 1837
},
{
"epoch": 0.49183837302649186,
"grad_norm": 3.5951497554779053,
"learning_rate": 9.936277339925205e-06,
"loss": 1.1968,
"step": 1838
},
{
"epoch": 0.4921059673534921,
"grad_norm": 3.9791698455810547,
"learning_rate": 9.93613642770077e-06,
"loss": 1.1248,
"step": 1839
},
{
"epoch": 0.4923735616804924,
"grad_norm": 4.2198100090026855,
"learning_rate": 9.935995360847608e-06,
"loss": 1.1946,
"step": 1840
},
{
"epoch": 0.49264115600749264,
"grad_norm": 3.915623188018799,
"learning_rate": 9.935854139370139e-06,
"loss": 1.1836,
"step": 1841
},
{
"epoch": 0.4929087503344929,
"grad_norm": 3.8059470653533936,
"learning_rate": 9.93571276327278e-06,
"loss": 1.2146,
"step": 1842
},
{
"epoch": 0.4931763446614932,
"grad_norm": 4.118159770965576,
"learning_rate": 9.93557123255997e-06,
"loss": 1.1451,
"step": 1843
},
{
"epoch": 0.4934439389884934,
"grad_norm": 4.63586950302124,
"learning_rate": 9.935429547236131e-06,
"loss": 1.4108,
"step": 1844
},
{
"epoch": 0.4937115333154937,
"grad_norm": 3.541332244873047,
"learning_rate": 9.935287707305712e-06,
"loss": 1.0874,
"step": 1845
},
{
"epoch": 0.49397912764249396,
"grad_norm": 3.4757399559020996,
"learning_rate": 9.93514571277315e-06,
"loss": 1.1712,
"step": 1846
},
{
"epoch": 0.49424672196949426,
"grad_norm": 3.835604190826416,
"learning_rate": 9.935003563642895e-06,
"loss": 1.1442,
"step": 1847
},
{
"epoch": 0.4945143162964945,
"grad_norm": 3.701040029525757,
"learning_rate": 9.934861259919399e-06,
"loss": 1.0242,
"step": 1848
},
{
"epoch": 0.4947819106234948,
"grad_norm": 3.7247939109802246,
"learning_rate": 9.934718801607122e-06,
"loss": 1.2422,
"step": 1849
},
{
"epoch": 0.49504950495049505,
"grad_norm": 4.011390209197998,
"learning_rate": 9.934576188710524e-06,
"loss": 1.2711,
"step": 1850
},
{
"epoch": 0.49531709927749534,
"grad_norm": 3.3684377670288086,
"learning_rate": 9.934433421234073e-06,
"loss": 1.1051,
"step": 1851
},
{
"epoch": 0.4955846936044956,
"grad_norm": 3.550625801086426,
"learning_rate": 9.934290499182244e-06,
"loss": 1.2236,
"step": 1852
},
{
"epoch": 0.4958522879314958,
"grad_norm": 4.006191253662109,
"learning_rate": 9.93414742255951e-06,
"loss": 1.2605,
"step": 1853
},
{
"epoch": 0.4961198822584961,
"grad_norm": 3.7999353408813477,
"learning_rate": 9.934004191370356e-06,
"loss": 1.2019,
"step": 1854
},
{
"epoch": 0.49638747658549637,
"grad_norm": 3.6491141319274902,
"learning_rate": 9.933860805619269e-06,
"loss": 1.1939,
"step": 1855
},
{
"epoch": 0.49665507091249667,
"grad_norm": 3.60182785987854,
"learning_rate": 9.933717265310739e-06,
"loss": 1.185,
"step": 1856
},
{
"epoch": 0.4969226652394969,
"grad_norm": 3.517396926879883,
"learning_rate": 9.933573570449262e-06,
"loss": 1.0801,
"step": 1857
},
{
"epoch": 0.4971902595664972,
"grad_norm": 3.847062349319458,
"learning_rate": 9.93342972103934e-06,
"loss": 1.1699,
"step": 1858
},
{
"epoch": 0.49745785389349745,
"grad_norm": 3.5466854572296143,
"learning_rate": 9.933285717085482e-06,
"loss": 1.088,
"step": 1859
},
{
"epoch": 0.49772544822049775,
"grad_norm": 4.013504981994629,
"learning_rate": 9.933141558592196e-06,
"loss": 1.2217,
"step": 1860
},
{
"epoch": 0.497993042547498,
"grad_norm": 4.0954155921936035,
"learning_rate": 9.932997245563997e-06,
"loss": 1.231,
"step": 1861
},
{
"epoch": 0.4982606368744983,
"grad_norm": 3.723498821258545,
"learning_rate": 9.93285277800541e-06,
"loss": 1.1645,
"step": 1862
},
{
"epoch": 0.49852823120149853,
"grad_norm": 3.436872720718384,
"learning_rate": 9.932708155920957e-06,
"loss": 1.1673,
"step": 1863
},
{
"epoch": 0.49879582552849877,
"grad_norm": 3.8395087718963623,
"learning_rate": 9.932563379315168e-06,
"loss": 1.2485,
"step": 1864
},
{
"epoch": 0.49906341985549907,
"grad_norm": 3.937257766723633,
"learning_rate": 9.93241844819258e-06,
"loss": 1.2447,
"step": 1865
},
{
"epoch": 0.4993310141824993,
"grad_norm": 3.5979080200195312,
"learning_rate": 9.932273362557734e-06,
"loss": 1.153,
"step": 1866
},
{
"epoch": 0.4995986085094996,
"grad_norm": 3.8511085510253906,
"learning_rate": 9.932128122415173e-06,
"loss": 1.1053,
"step": 1867
},
{
"epoch": 0.49986620283649985,
"grad_norm": 4.010068893432617,
"learning_rate": 9.931982727769448e-06,
"loss": 1.155,
"step": 1868
},
{
"epoch": 0.5001337971635001,
"grad_norm": 3.749917507171631,
"learning_rate": 9.931837178625111e-06,
"loss": 1.1328,
"step": 1869
},
{
"epoch": 0.5004013914905004,
"grad_norm": 3.668951988220215,
"learning_rate": 9.931691474986726e-06,
"loss": 1.0613,
"step": 1870
},
{
"epoch": 0.5006689858175006,
"grad_norm": 3.563898801803589,
"learning_rate": 9.931545616858853e-06,
"loss": 1.1231,
"step": 1871
},
{
"epoch": 0.5009365801445009,
"grad_norm": 3.758409023284912,
"learning_rate": 9.931399604246064e-06,
"loss": 1.2123,
"step": 1872
},
{
"epoch": 0.5012041744715012,
"grad_norm": 3.4294962882995605,
"learning_rate": 9.93125343715293e-06,
"loss": 1.0552,
"step": 1873
},
{
"epoch": 0.5014717687985015,
"grad_norm": 3.464952230453491,
"learning_rate": 9.931107115584034e-06,
"loss": 1.1708,
"step": 1874
},
{
"epoch": 0.5017393631255017,
"grad_norm": 3.9118897914886475,
"learning_rate": 9.930960639543956e-06,
"loss": 1.1202,
"step": 1875
},
{
"epoch": 0.502006957452502,
"grad_norm": 3.2876811027526855,
"learning_rate": 9.930814009037286e-06,
"loss": 1.0269,
"step": 1876
},
{
"epoch": 0.5022745517795023,
"grad_norm": 3.571906566619873,
"learning_rate": 9.930667224068618e-06,
"loss": 1.1515,
"step": 1877
},
{
"epoch": 0.5025421461065025,
"grad_norm": 3.775341510772705,
"learning_rate": 9.930520284642548e-06,
"loss": 1.1708,
"step": 1878
},
{
"epoch": 0.5028097404335028,
"grad_norm": 3.8571135997772217,
"learning_rate": 9.93037319076368e-06,
"loss": 1.1639,
"step": 1879
},
{
"epoch": 0.5030773347605031,
"grad_norm": 3.8149497509002686,
"learning_rate": 9.930225942436623e-06,
"loss": 1.2267,
"step": 1880
},
{
"epoch": 0.5033449290875034,
"grad_norm": 3.51364803314209,
"learning_rate": 9.930078539665988e-06,
"loss": 1.1698,
"step": 1881
},
{
"epoch": 0.5036125234145036,
"grad_norm": 3.4830048084259033,
"learning_rate": 9.929930982456395e-06,
"loss": 1.1729,
"step": 1882
},
{
"epoch": 0.5038801177415039,
"grad_norm": 3.5782647132873535,
"learning_rate": 9.929783270812464e-06,
"loss": 1.1596,
"step": 1883
},
{
"epoch": 0.5041477120685042,
"grad_norm": 3.836897373199463,
"learning_rate": 9.929635404738822e-06,
"loss": 1.2473,
"step": 1884
},
{
"epoch": 0.5044153063955045,
"grad_norm": 3.5359864234924316,
"learning_rate": 9.929487384240103e-06,
"loss": 1.052,
"step": 1885
},
{
"epoch": 0.5046829007225047,
"grad_norm": 3.7053215503692627,
"learning_rate": 9.929339209320944e-06,
"loss": 1.1623,
"step": 1886
},
{
"epoch": 0.504950495049505,
"grad_norm": 3.838304042816162,
"learning_rate": 9.929190879985982e-06,
"loss": 1.2307,
"step": 1887
},
{
"epoch": 0.5052180893765053,
"grad_norm": 3.681903123855591,
"learning_rate": 9.929042396239869e-06,
"loss": 1.3262,
"step": 1888
},
{
"epoch": 0.5054856837035054,
"grad_norm": 3.6203062534332275,
"learning_rate": 9.928893758087254e-06,
"loss": 1.1616,
"step": 1889
},
{
"epoch": 0.5057532780305057,
"grad_norm": 3.843017339706421,
"learning_rate": 9.928744965532795e-06,
"loss": 1.184,
"step": 1890
},
{
"epoch": 0.506020872357506,
"grad_norm": 4.003540992736816,
"learning_rate": 9.928596018581151e-06,
"loss": 1.0719,
"step": 1891
},
{
"epoch": 0.5062884666845063,
"grad_norm": 3.225344657897949,
"learning_rate": 9.928446917236988e-06,
"loss": 0.9902,
"step": 1892
},
{
"epoch": 0.5065560610115065,
"grad_norm": 4.046036720275879,
"learning_rate": 9.928297661504978e-06,
"loss": 1.1583,
"step": 1893
},
{
"epoch": 0.5068236553385068,
"grad_norm": 3.522110939025879,
"learning_rate": 9.928148251389796e-06,
"loss": 1.0941,
"step": 1894
},
{
"epoch": 0.5070912496655071,
"grad_norm": 3.5445072650909424,
"learning_rate": 9.92799868689612e-06,
"loss": 1.1043,
"step": 1895
},
{
"epoch": 0.5073588439925074,
"grad_norm": 3.7460379600524902,
"learning_rate": 9.927848968028642e-06,
"loss": 1.1259,
"step": 1896
},
{
"epoch": 0.5076264383195076,
"grad_norm": 3.518141508102417,
"learning_rate": 9.927699094792045e-06,
"loss": 1.0938,
"step": 1897
},
{
"epoch": 0.5078940326465079,
"grad_norm": 4.169661521911621,
"learning_rate": 9.927549067191026e-06,
"loss": 1.3043,
"step": 1898
},
{
"epoch": 0.5081616269735082,
"grad_norm": 3.7639896869659424,
"learning_rate": 9.927398885230286e-06,
"loss": 1.254,
"step": 1899
},
{
"epoch": 0.5084292213005084,
"grad_norm": 3.5845093727111816,
"learning_rate": 9.927248548914528e-06,
"loss": 1.0115,
"step": 1900
},
{
"epoch": 0.5086968156275087,
"grad_norm": 3.618220806121826,
"learning_rate": 9.927098058248463e-06,
"loss": 1.1713,
"step": 1901
},
{
"epoch": 0.508964409954509,
"grad_norm": 3.6645729541778564,
"learning_rate": 9.926947413236806e-06,
"loss": 1.1468,
"step": 1902
},
{
"epoch": 0.5092320042815093,
"grad_norm": 3.4273576736450195,
"learning_rate": 9.926796613884271e-06,
"loss": 1.0282,
"step": 1903
},
{
"epoch": 0.5094995986085095,
"grad_norm": 4.018494606018066,
"learning_rate": 9.926645660195588e-06,
"loss": 1.2789,
"step": 1904
},
{
"epoch": 0.5097671929355098,
"grad_norm": 3.431507110595703,
"learning_rate": 9.926494552175484e-06,
"loss": 1.1095,
"step": 1905
},
{
"epoch": 0.5100347872625101,
"grad_norm": 3.723026752471924,
"learning_rate": 9.926343289828689e-06,
"loss": 1.1774,
"step": 1906
},
{
"epoch": 0.5103023815895104,
"grad_norm": 4.003593921661377,
"learning_rate": 9.926191873159945e-06,
"loss": 1.2947,
"step": 1907
},
{
"epoch": 0.5105699759165105,
"grad_norm": 3.923344373703003,
"learning_rate": 9.926040302173995e-06,
"loss": 1.3416,
"step": 1908
},
{
"epoch": 0.5108375702435108,
"grad_norm": 4.057835578918457,
"learning_rate": 9.925888576875588e-06,
"loss": 1.1635,
"step": 1909
},
{
"epoch": 0.5111051645705111,
"grad_norm": 3.939828395843506,
"learning_rate": 9.925736697269474e-06,
"loss": 1.3077,
"step": 1910
},
{
"epoch": 0.5113727588975113,
"grad_norm": 4.334293365478516,
"learning_rate": 9.925584663360412e-06,
"loss": 1.2711,
"step": 1911
},
{
"epoch": 0.5116403532245116,
"grad_norm": 3.6700150966644287,
"learning_rate": 9.925432475153166e-06,
"loss": 1.2447,
"step": 1912
},
{
"epoch": 0.5119079475515119,
"grad_norm": 3.7518320083618164,
"learning_rate": 9.925280132652503e-06,
"loss": 1.1256,
"step": 1913
},
{
"epoch": 0.5121755418785122,
"grad_norm": 3.581819534301758,
"learning_rate": 9.925127635863195e-06,
"loss": 1.0175,
"step": 1914
},
{
"epoch": 0.5124431362055124,
"grad_norm": 3.7574949264526367,
"learning_rate": 9.924974984790016e-06,
"loss": 1.2528,
"step": 1915
},
{
"epoch": 0.5127107305325127,
"grad_norm": 3.8194570541381836,
"learning_rate": 9.924822179437752e-06,
"loss": 1.2685,
"step": 1916
},
{
"epoch": 0.512978324859513,
"grad_norm": 3.690627336502075,
"learning_rate": 9.924669219811188e-06,
"loss": 1.2214,
"step": 1917
},
{
"epoch": 0.5132459191865133,
"grad_norm": 3.204648494720459,
"learning_rate": 9.924516105915116e-06,
"loss": 0.9857,
"step": 1918
},
{
"epoch": 0.5135135135135135,
"grad_norm": 3.702674627304077,
"learning_rate": 9.924362837754334e-06,
"loss": 1.0301,
"step": 1919
},
{
"epoch": 0.5137811078405138,
"grad_norm": 3.366229295730591,
"learning_rate": 9.92420941533364e-06,
"loss": 1.0714,
"step": 1920
},
{
"epoch": 0.5140487021675141,
"grad_norm": 3.501063823699951,
"learning_rate": 9.92405583865784e-06,
"loss": 1.1025,
"step": 1921
},
{
"epoch": 0.5143162964945143,
"grad_norm": 3.3142244815826416,
"learning_rate": 9.92390210773175e-06,
"loss": 1.0532,
"step": 1922
},
{
"epoch": 0.5145838908215146,
"grad_norm": 3.998425006866455,
"learning_rate": 9.923748222560181e-06,
"loss": 1.1796,
"step": 1923
},
{
"epoch": 0.5148514851485149,
"grad_norm": 3.6948330402374268,
"learning_rate": 9.923594183147954e-06,
"loss": 1.0869,
"step": 1924
},
{
"epoch": 0.5151190794755152,
"grad_norm": 3.7560575008392334,
"learning_rate": 9.923439989499897e-06,
"loss": 1.1566,
"step": 1925
},
{
"epoch": 0.5153866738025153,
"grad_norm": 3.8775906562805176,
"learning_rate": 9.923285641620838e-06,
"loss": 1.1781,
"step": 1926
},
{
"epoch": 0.5156542681295156,
"grad_norm": 3.8323404788970947,
"learning_rate": 9.923131139515613e-06,
"loss": 1.1228,
"step": 1927
},
{
"epoch": 0.515921862456516,
"grad_norm": 3.4766688346862793,
"learning_rate": 9.922976483189061e-06,
"loss": 1.0528,
"step": 1928
},
{
"epoch": 0.5161894567835162,
"grad_norm": 3.5990777015686035,
"learning_rate": 9.922821672646028e-06,
"loss": 1.0601,
"step": 1929
},
{
"epoch": 0.5164570511105164,
"grad_norm": 3.702481269836426,
"learning_rate": 9.922666707891361e-06,
"loss": 1.1455,
"step": 1930
},
{
"epoch": 0.5167246454375167,
"grad_norm": 3.4668917655944824,
"learning_rate": 9.92251158892992e-06,
"loss": 1.0795,
"step": 1931
},
{
"epoch": 0.516992239764517,
"grad_norm": 3.769757032394409,
"learning_rate": 9.922356315766557e-06,
"loss": 1.1749,
"step": 1932
},
{
"epoch": 0.5172598340915172,
"grad_norm": 3.681917428970337,
"learning_rate": 9.922200888406142e-06,
"loss": 1.1752,
"step": 1933
},
{
"epoch": 0.5175274284185175,
"grad_norm": 3.566633701324463,
"learning_rate": 9.922045306853542e-06,
"loss": 1.0806,
"step": 1934
},
{
"epoch": 0.5177950227455178,
"grad_norm": 3.5221433639526367,
"learning_rate": 9.921889571113629e-06,
"loss": 1.1242,
"step": 1935
},
{
"epoch": 0.5180626170725181,
"grad_norm": 3.574681043624878,
"learning_rate": 9.921733681191283e-06,
"loss": 1.117,
"step": 1936
},
{
"epoch": 0.5183302113995183,
"grad_norm": 4.224633693695068,
"learning_rate": 9.921577637091388e-06,
"loss": 1.2129,
"step": 1937
},
{
"epoch": 0.5185978057265186,
"grad_norm": 3.799368381500244,
"learning_rate": 9.92142143881883e-06,
"loss": 1.0609,
"step": 1938
},
{
"epoch": 0.5188654000535189,
"grad_norm": 3.3646318912506104,
"learning_rate": 9.921265086378504e-06,
"loss": 1.1139,
"step": 1939
},
{
"epoch": 0.5191329943805192,
"grad_norm": 4.179952621459961,
"learning_rate": 9.921108579775307e-06,
"loss": 1.2536,
"step": 1940
},
{
"epoch": 0.5194005887075194,
"grad_norm": 3.8131916522979736,
"learning_rate": 9.920951919014144e-06,
"loss": 1.1239,
"step": 1941
},
{
"epoch": 0.5196681830345197,
"grad_norm": 3.50144624710083,
"learning_rate": 9.920795104099919e-06,
"loss": 1.0744,
"step": 1942
},
{
"epoch": 0.51993577736152,
"grad_norm": 3.8942971229553223,
"learning_rate": 9.920638135037545e-06,
"loss": 1.1104,
"step": 1943
},
{
"epoch": 0.5202033716885202,
"grad_norm": 3.6234724521636963,
"learning_rate": 9.920481011831941e-06,
"loss": 1.1766,
"step": 1944
},
{
"epoch": 0.5204709660155205,
"grad_norm": 4.271646022796631,
"learning_rate": 9.92032373448803e-06,
"loss": 1.3459,
"step": 1945
},
{
"epoch": 0.5207385603425208,
"grad_norm": 3.910745143890381,
"learning_rate": 9.920166303010737e-06,
"loss": 1.1466,
"step": 1946
},
{
"epoch": 0.521006154669521,
"grad_norm": 3.472041606903076,
"learning_rate": 9.92000871740499e-06,
"loss": 1.2786,
"step": 1947
},
{
"epoch": 0.5212737489965212,
"grad_norm": 3.5486903190612793,
"learning_rate": 9.919850977675732e-06,
"loss": 1.1269,
"step": 1948
},
{
"epoch": 0.5215413433235215,
"grad_norm": 3.486093044281006,
"learning_rate": 9.919693083827902e-06,
"loss": 1.0447,
"step": 1949
},
{
"epoch": 0.5218089376505218,
"grad_norm": 3.836215019226074,
"learning_rate": 9.919535035866444e-06,
"loss": 1.179,
"step": 1950
},
{
"epoch": 0.5220765319775221,
"grad_norm": 3.5467727184295654,
"learning_rate": 9.919376833796312e-06,
"loss": 1.0668,
"step": 1951
},
{
"epoch": 0.5223441263045223,
"grad_norm": 3.5442044734954834,
"learning_rate": 9.91921847762246e-06,
"loss": 1.0542,
"step": 1952
},
{
"epoch": 0.5226117206315226,
"grad_norm": 3.7540347576141357,
"learning_rate": 9.919059967349848e-06,
"loss": 1.0402,
"step": 1953
},
{
"epoch": 0.5228793149585229,
"grad_norm": 4.026261329650879,
"learning_rate": 9.918901302983445e-06,
"loss": 1.2437,
"step": 1954
},
{
"epoch": 0.5231469092855231,
"grad_norm": 3.6572134494781494,
"learning_rate": 9.918742484528218e-06,
"loss": 1.1397,
"step": 1955
},
{
"epoch": 0.5234145036125234,
"grad_norm": 3.5838277339935303,
"learning_rate": 9.918583511989142e-06,
"loss": 1.0844,
"step": 1956
},
{
"epoch": 0.5236820979395237,
"grad_norm": 3.8754079341888428,
"learning_rate": 9.918424385371199e-06,
"loss": 1.2264,
"step": 1957
},
{
"epoch": 0.523949692266524,
"grad_norm": 3.196148633956909,
"learning_rate": 9.918265104679371e-06,
"loss": 1.0584,
"step": 1958
},
{
"epoch": 0.5242172865935242,
"grad_norm": 4.228190898895264,
"learning_rate": 9.918105669918652e-06,
"loss": 1.2559,
"step": 1959
},
{
"epoch": 0.5244848809205245,
"grad_norm": 3.834376573562622,
"learning_rate": 9.917946081094033e-06,
"loss": 1.0941,
"step": 1960
},
{
"epoch": 0.5247524752475248,
"grad_norm": 3.5881540775299072,
"learning_rate": 9.917786338210513e-06,
"loss": 1.1777,
"step": 1961
},
{
"epoch": 0.5250200695745251,
"grad_norm": 3.671957492828369,
"learning_rate": 9.917626441273099e-06,
"loss": 1.2193,
"step": 1962
},
{
"epoch": 0.5252876639015253,
"grad_norm": 3.508430242538452,
"learning_rate": 9.917466390286797e-06,
"loss": 1.1494,
"step": 1963
},
{
"epoch": 0.5255552582285256,
"grad_norm": 4.060336112976074,
"learning_rate": 9.917306185256621e-06,
"loss": 1.2024,
"step": 1964
},
{
"epoch": 0.5258228525555259,
"grad_norm": 3.5298852920532227,
"learning_rate": 9.91714582618759e-06,
"loss": 1.1166,
"step": 1965
},
{
"epoch": 0.526090446882526,
"grad_norm": 3.5156521797180176,
"learning_rate": 9.91698531308473e-06,
"loss": 1.1366,
"step": 1966
},
{
"epoch": 0.5263580412095263,
"grad_norm": 3.63799786567688,
"learning_rate": 9.916824645953065e-06,
"loss": 1.2219,
"step": 1967
},
{
"epoch": 0.5266256355365266,
"grad_norm": 3.7056069374084473,
"learning_rate": 9.916663824797633e-06,
"loss": 1.16,
"step": 1968
},
{
"epoch": 0.5268932298635269,
"grad_norm": 3.2435388565063477,
"learning_rate": 9.916502849623467e-06,
"loss": 1.0117,
"step": 1969
},
{
"epoch": 0.5271608241905271,
"grad_norm": 3.5529932975769043,
"learning_rate": 9.916341720435609e-06,
"loss": 1.0804,
"step": 1970
},
{
"epoch": 0.5274284185175274,
"grad_norm": 3.3724541664123535,
"learning_rate": 9.91618043723911e-06,
"loss": 1.0444,
"step": 1971
},
{
"epoch": 0.5276960128445277,
"grad_norm": 3.614671230316162,
"learning_rate": 9.916019000039024e-06,
"loss": 1.0751,
"step": 1972
},
{
"epoch": 0.527963607171528,
"grad_norm": 3.8645894527435303,
"learning_rate": 9.915857408840405e-06,
"loss": 1.25,
"step": 1973
},
{
"epoch": 0.5282312014985282,
"grad_norm": 3.3444855213165283,
"learning_rate": 9.915695663648315e-06,
"loss": 1.0344,
"step": 1974
},
{
"epoch": 0.5284987958255285,
"grad_norm": 3.8077821731567383,
"learning_rate": 9.91553376446782e-06,
"loss": 1.1244,
"step": 1975
},
{
"epoch": 0.5287663901525288,
"grad_norm": 3.517341375350952,
"learning_rate": 9.915371711303994e-06,
"loss": 1.1202,
"step": 1976
},
{
"epoch": 0.529033984479529,
"grad_norm": 4.2117767333984375,
"learning_rate": 9.915209504161914e-06,
"loss": 1.2998,
"step": 1977
},
{
"epoch": 0.5293015788065293,
"grad_norm": 3.684497117996216,
"learning_rate": 9.915047143046656e-06,
"loss": 1.1494,
"step": 1978
},
{
"epoch": 0.5295691731335296,
"grad_norm": 4.2827630043029785,
"learning_rate": 9.914884627963312e-06,
"loss": 1.3014,
"step": 1979
},
{
"epoch": 0.5298367674605299,
"grad_norm": 3.7791380882263184,
"learning_rate": 9.914721958916971e-06,
"loss": 1.2749,
"step": 1980
},
{
"epoch": 0.5301043617875301,
"grad_norm": 3.7178707122802734,
"learning_rate": 9.91455913591273e-06,
"loss": 1.0304,
"step": 1981
},
{
"epoch": 0.5303719561145304,
"grad_norm": 3.6490297317504883,
"learning_rate": 9.914396158955685e-06,
"loss": 1.0867,
"step": 1982
},
{
"epoch": 0.5306395504415307,
"grad_norm": 4.041894912719727,
"learning_rate": 9.914233028050945e-06,
"loss": 1.1857,
"step": 1983
},
{
"epoch": 0.530907144768531,
"grad_norm": 3.4716479778289795,
"learning_rate": 9.91406974320362e-06,
"loss": 1.0954,
"step": 1984
},
{
"epoch": 0.5311747390955311,
"grad_norm": 3.4045979976654053,
"learning_rate": 9.913906304418825e-06,
"loss": 1.2435,
"step": 1985
},
{
"epoch": 0.5314423334225314,
"grad_norm": 3.823096752166748,
"learning_rate": 9.91374271170168e-06,
"loss": 1.1779,
"step": 1986
},
{
"epoch": 0.5317099277495317,
"grad_norm": 3.7356925010681152,
"learning_rate": 9.91357896505731e-06,
"loss": 1.2326,
"step": 1987
},
{
"epoch": 0.5319775220765319,
"grad_norm": 3.4389915466308594,
"learning_rate": 9.91341506449084e-06,
"loss": 1.0635,
"step": 1988
},
{
"epoch": 0.5322451164035322,
"grad_norm": 3.3921926021575928,
"learning_rate": 9.913251010007413e-06,
"loss": 1.0636,
"step": 1989
},
{
"epoch": 0.5325127107305325,
"grad_norm": 3.5323266983032227,
"learning_rate": 9.913086801612159e-06,
"loss": 1.0485,
"step": 1990
},
{
"epoch": 0.5327803050575328,
"grad_norm": 3.2040328979492188,
"learning_rate": 9.91292243931023e-06,
"loss": 1.0664,
"step": 1991
},
{
"epoch": 0.533047899384533,
"grad_norm": 3.604896068572998,
"learning_rate": 9.912757923106769e-06,
"loss": 1.1632,
"step": 1992
},
{
"epoch": 0.5333154937115333,
"grad_norm": 3.466099262237549,
"learning_rate": 9.91259325300693e-06,
"loss": 1.118,
"step": 1993
},
{
"epoch": 0.5335830880385336,
"grad_norm": 3.788372039794922,
"learning_rate": 9.912428429015874e-06,
"loss": 1.2205,
"step": 1994
},
{
"epoch": 0.5338506823655339,
"grad_norm": 3.699796199798584,
"learning_rate": 9.912263451138764e-06,
"loss": 1.0773,
"step": 1995
},
{
"epoch": 0.5341182766925341,
"grad_norm": 3.928880453109741,
"learning_rate": 9.912098319380767e-06,
"loss": 1.24,
"step": 1996
},
{
"epoch": 0.5343858710195344,
"grad_norm": 3.5852925777435303,
"learning_rate": 9.911933033747056e-06,
"loss": 1.0727,
"step": 1997
},
{
"epoch": 0.5346534653465347,
"grad_norm": 4.054876327514648,
"learning_rate": 9.91176759424281e-06,
"loss": 1.1812,
"step": 1998
},
{
"epoch": 0.5349210596735349,
"grad_norm": 3.9897444248199463,
"learning_rate": 9.91160200087321e-06,
"loss": 1.2996,
"step": 1999
},
{
"epoch": 0.5351886540005352,
"grad_norm": 3.893026113510132,
"learning_rate": 9.911436253643445e-06,
"loss": 1.1287,
"step": 2000
},
{
"epoch": 0.5351886540005352,
"eval_loss": 1.1917240619659424,
"eval_runtime": 11.6396,
"eval_samples_per_second": 34.365,
"eval_steps_per_second": 4.296,
"step": 2000
},
{
"epoch": 0.5354562483275355,
"grad_norm": 4.189493656158447,
"learning_rate": 9.911270352558703e-06,
"loss": 1.2612,
"step": 2001
},
{
"epoch": 0.5357238426545358,
"grad_norm": 3.7188894748687744,
"learning_rate": 9.911104297624186e-06,
"loss": 1.1238,
"step": 2002
},
{
"epoch": 0.535991436981536,
"grad_norm": 3.495906352996826,
"learning_rate": 9.910938088845095e-06,
"loss": 1.0895,
"step": 2003
},
{
"epoch": 0.5362590313085362,
"grad_norm": 3.8715004920959473,
"learning_rate": 9.910771726226634e-06,
"loss": 1.1578,
"step": 2004
},
{
"epoch": 0.5365266256355365,
"grad_norm": 5.872176170349121,
"learning_rate": 9.910605209774016e-06,
"loss": 1.2899,
"step": 2005
},
{
"epoch": 0.5367942199625368,
"grad_norm": 3.8072023391723633,
"learning_rate": 9.910438539492457e-06,
"loss": 1.0038,
"step": 2006
},
{
"epoch": 0.537061814289537,
"grad_norm": 3.388889789581299,
"learning_rate": 9.91027171538718e-06,
"loss": 1.0829,
"step": 2007
},
{
"epoch": 0.5373294086165373,
"grad_norm": 3.782205104827881,
"learning_rate": 9.910104737463406e-06,
"loss": 1.1912,
"step": 2008
},
{
"epoch": 0.5375970029435376,
"grad_norm": 3.77671217918396,
"learning_rate": 9.90993760572637e-06,
"loss": 1.2044,
"step": 2009
},
{
"epoch": 0.5378645972705378,
"grad_norm": 3.633802652359009,
"learning_rate": 9.909770320181306e-06,
"loss": 1.3179,
"step": 2010
},
{
"epoch": 0.5381321915975381,
"grad_norm": 3.744126558303833,
"learning_rate": 9.909602880833458e-06,
"loss": 1.1907,
"step": 2011
},
{
"epoch": 0.5383997859245384,
"grad_norm": 3.903366804122925,
"learning_rate": 9.909435287688065e-06,
"loss": 1.1737,
"step": 2012
},
{
"epoch": 0.5386673802515387,
"grad_norm": 3.7042882442474365,
"learning_rate": 9.90926754075038e-06,
"loss": 1.2119,
"step": 2013
},
{
"epoch": 0.5389349745785389,
"grad_norm": 3.662655830383301,
"learning_rate": 9.90909964002566e-06,
"loss": 1.1722,
"step": 2014
},
{
"epoch": 0.5392025689055392,
"grad_norm": 3.9184234142303467,
"learning_rate": 9.90893158551916e-06,
"loss": 1.176,
"step": 2015
},
{
"epoch": 0.5394701632325395,
"grad_norm": 3.6793618202209473,
"learning_rate": 9.90876337723615e-06,
"loss": 1.1942,
"step": 2016
},
{
"epoch": 0.5397377575595398,
"grad_norm": 3.438577175140381,
"learning_rate": 9.908595015181893e-06,
"loss": 1.0737,
"step": 2017
},
{
"epoch": 0.54000535188654,
"grad_norm": 3.8159797191619873,
"learning_rate": 9.908426499361668e-06,
"loss": 1.2024,
"step": 2018
},
{
"epoch": 0.5402729462135403,
"grad_norm": 3.6021339893341064,
"learning_rate": 9.908257829780752e-06,
"loss": 1.0793,
"step": 2019
},
{
"epoch": 0.5405405405405406,
"grad_norm": 3.960874319076538,
"learning_rate": 9.908089006444427e-06,
"loss": 1.2732,
"step": 2020
},
{
"epoch": 0.5408081348675408,
"grad_norm": 3.724120616912842,
"learning_rate": 9.907920029357986e-06,
"loss": 1.1938,
"step": 2021
},
{
"epoch": 0.541075729194541,
"grad_norm": 3.7229902744293213,
"learning_rate": 9.90775089852672e-06,
"loss": 1.124,
"step": 2022
},
{
"epoch": 0.5413433235215414,
"grad_norm": 3.4035604000091553,
"learning_rate": 9.907581613955924e-06,
"loss": 1.0212,
"step": 2023
},
{
"epoch": 0.5416109178485417,
"grad_norm": 3.6770973205566406,
"learning_rate": 9.907412175650905e-06,
"loss": 1.0409,
"step": 2024
},
{
"epoch": 0.5418785121755418,
"grad_norm": 3.5748701095581055,
"learning_rate": 9.907242583616972e-06,
"loss": 1.0902,
"step": 2025
},
{
"epoch": 0.5421461065025421,
"grad_norm": 4.297303199768066,
"learning_rate": 9.907072837859434e-06,
"loss": 1.1205,
"step": 2026
},
{
"epoch": 0.5424137008295424,
"grad_norm": 3.765982151031494,
"learning_rate": 9.90690293838361e-06,
"loss": 1.2402,
"step": 2027
},
{
"epoch": 0.5426812951565427,
"grad_norm": 3.948046922683716,
"learning_rate": 9.906732885194821e-06,
"loss": 1.2607,
"step": 2028
},
{
"epoch": 0.5429488894835429,
"grad_norm": 3.599590539932251,
"learning_rate": 9.906562678298394e-06,
"loss": 1.113,
"step": 2029
},
{
"epoch": 0.5432164838105432,
"grad_norm": 3.43281626701355,
"learning_rate": 9.906392317699665e-06,
"loss": 1.0782,
"step": 2030
},
{
"epoch": 0.5434840781375435,
"grad_norm": 3.7561564445495605,
"learning_rate": 9.906221803403967e-06,
"loss": 1.2796,
"step": 2031
},
{
"epoch": 0.5437516724645437,
"grad_norm": 3.3608622550964355,
"learning_rate": 9.90605113541664e-06,
"loss": 1.1263,
"step": 2032
},
{
"epoch": 0.544019266791544,
"grad_norm": 3.4457077980041504,
"learning_rate": 9.905880313743035e-06,
"loss": 1.1016,
"step": 2033
},
{
"epoch": 0.5442868611185443,
"grad_norm": 3.601628065109253,
"learning_rate": 9.905709338388499e-06,
"loss": 1.1218,
"step": 2034
},
{
"epoch": 0.5445544554455446,
"grad_norm": 3.8327248096466064,
"learning_rate": 9.90553820935839e-06,
"loss": 1.0964,
"step": 2035
},
{
"epoch": 0.5448220497725448,
"grad_norm": 3.6931264400482178,
"learning_rate": 9.905366926658068e-06,
"loss": 1.2855,
"step": 2036
},
{
"epoch": 0.5450896440995451,
"grad_norm": 3.9936089515686035,
"learning_rate": 9.9051954902929e-06,
"loss": 1.2546,
"step": 2037
},
{
"epoch": 0.5453572384265454,
"grad_norm": 4.2173991203308105,
"learning_rate": 9.905023900268255e-06,
"loss": 1.2468,
"step": 2038
},
{
"epoch": 0.5456248327535457,
"grad_norm": 3.5092899799346924,
"learning_rate": 9.904852156589508e-06,
"loss": 1.0156,
"step": 2039
},
{
"epoch": 0.5458924270805459,
"grad_norm": 3.5375232696533203,
"learning_rate": 9.90468025926204e-06,
"loss": 1.1003,
"step": 2040
},
{
"epoch": 0.5461600214075462,
"grad_norm": 3.232635974884033,
"learning_rate": 9.904508208291236e-06,
"loss": 1.1159,
"step": 2041
},
{
"epoch": 0.5464276157345465,
"grad_norm": 3.6317005157470703,
"learning_rate": 9.904336003682484e-06,
"loss": 1.2561,
"step": 2042
},
{
"epoch": 0.5466952100615466,
"grad_norm": 3.4912993907928467,
"learning_rate": 9.90416364544118e-06,
"loss": 1.2616,
"step": 2043
},
{
"epoch": 0.5469628043885469,
"grad_norm": 3.744119882583618,
"learning_rate": 9.903991133572722e-06,
"loss": 1.1231,
"step": 2044
},
{
"epoch": 0.5472303987155472,
"grad_norm": 3.8304286003112793,
"learning_rate": 9.903818468082515e-06,
"loss": 1.1488,
"step": 2045
},
{
"epoch": 0.5474979930425475,
"grad_norm": 4.009277820587158,
"learning_rate": 9.903645648975967e-06,
"loss": 1.1767,
"step": 2046
},
{
"epoch": 0.5477655873695477,
"grad_norm": 3.655991315841675,
"learning_rate": 9.903472676258494e-06,
"loss": 1.1274,
"step": 2047
},
{
"epoch": 0.548033181696548,
"grad_norm": 3.522969961166382,
"learning_rate": 9.903299549935514e-06,
"loss": 1.0944,
"step": 2048
},
{
"epoch": 0.5483007760235483,
"grad_norm": 3.9753992557525635,
"learning_rate": 9.903126270012446e-06,
"loss": 1.1597,
"step": 2049
},
{
"epoch": 0.5485683703505486,
"grad_norm": 3.4897477626800537,
"learning_rate": 9.902952836494724e-06,
"loss": 1.213,
"step": 2050
},
{
"epoch": 0.5488359646775488,
"grad_norm": 3.901291847229004,
"learning_rate": 9.902779249387777e-06,
"loss": 1.1803,
"step": 2051
},
{
"epoch": 0.5491035590045491,
"grad_norm": 3.8959672451019287,
"learning_rate": 9.902605508697045e-06,
"loss": 1.1598,
"step": 2052
},
{
"epoch": 0.5493711533315494,
"grad_norm": 3.5497238636016846,
"learning_rate": 9.90243161442797e-06,
"loss": 1.1193,
"step": 2053
},
{
"epoch": 0.5496387476585496,
"grad_norm": 3.6735222339630127,
"learning_rate": 9.902257566585997e-06,
"loss": 1.2209,
"step": 2054
},
{
"epoch": 0.5499063419855499,
"grad_norm": 3.902233123779297,
"learning_rate": 9.902083365176583e-06,
"loss": 1.2137,
"step": 2055
},
{
"epoch": 0.5501739363125502,
"grad_norm": 3.133127212524414,
"learning_rate": 9.90190901020518e-06,
"loss": 1.0771,
"step": 2056
},
{
"epoch": 0.5504415306395505,
"grad_norm": 3.489025115966797,
"learning_rate": 9.901734501677254e-06,
"loss": 1.0428,
"step": 2057
},
{
"epoch": 0.5507091249665507,
"grad_norm": 3.8350815773010254,
"learning_rate": 9.90155983959827e-06,
"loss": 1.1333,
"step": 2058
},
{
"epoch": 0.550976719293551,
"grad_norm": 3.393089771270752,
"learning_rate": 9.901385023973698e-06,
"loss": 1.1228,
"step": 2059
},
{
"epoch": 0.5512443136205513,
"grad_norm": 3.4112391471862793,
"learning_rate": 9.901210054809015e-06,
"loss": 1.0732,
"step": 2060
},
{
"epoch": 0.5515119079475516,
"grad_norm": 3.4298675060272217,
"learning_rate": 9.901034932109702e-06,
"loss": 1.1072,
"step": 2061
},
{
"epoch": 0.5517795022745517,
"grad_norm": 3.8485376834869385,
"learning_rate": 9.900859655881248e-06,
"loss": 1.2126,
"step": 2062
},
{
"epoch": 0.552047096601552,
"grad_norm": 3.713818073272705,
"learning_rate": 9.90068422612914e-06,
"loss": 1.0999,
"step": 2063
},
{
"epoch": 0.5523146909285523,
"grad_norm": 3.7916266918182373,
"learning_rate": 9.900508642858874e-06,
"loss": 1.2815,
"step": 2064
},
{
"epoch": 0.5525822852555525,
"grad_norm": 3.449904203414917,
"learning_rate": 9.900332906075951e-06,
"loss": 1.056,
"step": 2065
},
{
"epoch": 0.5528498795825528,
"grad_norm": 3.417433500289917,
"learning_rate": 9.900157015785876e-06,
"loss": 0.9831,
"step": 2066
},
{
"epoch": 0.5531174739095531,
"grad_norm": 4.198076248168945,
"learning_rate": 9.899980971994158e-06,
"loss": 1.2668,
"step": 2067
},
{
"epoch": 0.5533850682365534,
"grad_norm": 3.3924946784973145,
"learning_rate": 9.899804774706314e-06,
"loss": 1.1,
"step": 2068
},
{
"epoch": 0.5536526625635536,
"grad_norm": 3.6874961853027344,
"learning_rate": 9.899628423927861e-06,
"loss": 1.2336,
"step": 2069
},
{
"epoch": 0.5539202568905539,
"grad_norm": 3.614410161972046,
"learning_rate": 9.899451919664325e-06,
"loss": 1.1163,
"step": 2070
},
{
"epoch": 0.5541878512175542,
"grad_norm": 3.503385066986084,
"learning_rate": 9.899275261921236e-06,
"loss": 1.1361,
"step": 2071
},
{
"epoch": 0.5544554455445545,
"grad_norm": 3.857766628265381,
"learning_rate": 9.899098450704125e-06,
"loss": 1.2757,
"step": 2072
},
{
"epoch": 0.5547230398715547,
"grad_norm": 3.7878856658935547,
"learning_rate": 9.898921486018532e-06,
"loss": 1.1438,
"step": 2073
},
{
"epoch": 0.554990634198555,
"grad_norm": 3.2437705993652344,
"learning_rate": 9.898744367870001e-06,
"loss": 1.0622,
"step": 2074
},
{
"epoch": 0.5552582285255553,
"grad_norm": 3.193298816680908,
"learning_rate": 9.898567096264082e-06,
"loss": 1.0384,
"step": 2075
},
{
"epoch": 0.5555258228525555,
"grad_norm": 3.3267760276794434,
"learning_rate": 9.898389671206324e-06,
"loss": 1.0635,
"step": 2076
},
{
"epoch": 0.5557934171795558,
"grad_norm": 3.255155086517334,
"learning_rate": 9.898212092702288e-06,
"loss": 1.0574,
"step": 2077
},
{
"epoch": 0.5560610115065561,
"grad_norm": 3.881344795227051,
"learning_rate": 9.898034360757538e-06,
"loss": 1.2048,
"step": 2078
},
{
"epoch": 0.5563286058335564,
"grad_norm": 3.6974213123321533,
"learning_rate": 9.897856475377638e-06,
"loss": 1.2133,
"step": 2079
},
{
"epoch": 0.5565962001605566,
"grad_norm": 3.4741365909576416,
"learning_rate": 9.897678436568164e-06,
"loss": 1.1787,
"step": 2080
},
{
"epoch": 0.5568637944875569,
"grad_norm": 3.6926300525665283,
"learning_rate": 9.89750024433469e-06,
"loss": 1.1299,
"step": 2081
},
{
"epoch": 0.5571313888145571,
"grad_norm": 3.9215118885040283,
"learning_rate": 9.8973218986828e-06,
"loss": 1.2031,
"step": 2082
},
{
"epoch": 0.5573989831415574,
"grad_norm": 3.4052512645721436,
"learning_rate": 9.897143399618081e-06,
"loss": 1.1094,
"step": 2083
},
{
"epoch": 0.5576665774685576,
"grad_norm": 3.8671302795410156,
"learning_rate": 9.896964747146125e-06,
"loss": 1.2339,
"step": 2084
},
{
"epoch": 0.5579341717955579,
"grad_norm": 3.723543643951416,
"learning_rate": 9.896785941272524e-06,
"loss": 1.2115,
"step": 2085
},
{
"epoch": 0.5582017661225582,
"grad_norm": 3.7372453212738037,
"learning_rate": 9.896606982002886e-06,
"loss": 1.1701,
"step": 2086
},
{
"epoch": 0.5584693604495584,
"grad_norm": 3.7154757976531982,
"learning_rate": 9.896427869342812e-06,
"loss": 1.0744,
"step": 2087
},
{
"epoch": 0.5587369547765587,
"grad_norm": 3.733175039291382,
"learning_rate": 9.896248603297915e-06,
"loss": 1.0824,
"step": 2088
},
{
"epoch": 0.559004549103559,
"grad_norm": 3.589911460876465,
"learning_rate": 9.896069183873809e-06,
"loss": 1.1208,
"step": 2089
},
{
"epoch": 0.5592721434305593,
"grad_norm": 3.778308868408203,
"learning_rate": 9.895889611076119e-06,
"loss": 1.2553,
"step": 2090
},
{
"epoch": 0.5595397377575595,
"grad_norm": 3.737415313720703,
"learning_rate": 9.895709884910464e-06,
"loss": 1.1706,
"step": 2091
},
{
"epoch": 0.5598073320845598,
"grad_norm": 3.6272811889648438,
"learning_rate": 9.895530005382478e-06,
"loss": 1.099,
"step": 2092
},
{
"epoch": 0.5600749264115601,
"grad_norm": 3.2790331840515137,
"learning_rate": 9.895349972497796e-06,
"loss": 0.9707,
"step": 2093
},
{
"epoch": 0.5603425207385604,
"grad_norm": 3.618961811065674,
"learning_rate": 9.895169786262055e-06,
"loss": 1.1975,
"step": 2094
},
{
"epoch": 0.5606101150655606,
"grad_norm": 3.8534488677978516,
"learning_rate": 9.894989446680901e-06,
"loss": 1.2889,
"step": 2095
},
{
"epoch": 0.5608777093925609,
"grad_norm": 3.748040199279785,
"learning_rate": 9.894808953759984e-06,
"loss": 1.1573,
"step": 2096
},
{
"epoch": 0.5611453037195612,
"grad_norm": 3.8036909103393555,
"learning_rate": 9.894628307504959e-06,
"loss": 1.0905,
"step": 2097
},
{
"epoch": 0.5614128980465614,
"grad_norm": 3.3763818740844727,
"learning_rate": 9.894447507921482e-06,
"loss": 0.9967,
"step": 2098
},
{
"epoch": 0.5616804923735617,
"grad_norm": 3.4757957458496094,
"learning_rate": 9.894266555015218e-06,
"loss": 1.199,
"step": 2099
},
{
"epoch": 0.561948086700562,
"grad_norm": 3.526400089263916,
"learning_rate": 9.894085448791836e-06,
"loss": 1.3028,
"step": 2100
},
{
"epoch": 0.5622156810275623,
"grad_norm": 3.7849979400634766,
"learning_rate": 9.89390418925701e-06,
"loss": 1.166,
"step": 2101
},
{
"epoch": 0.5624832753545624,
"grad_norm": 3.8639450073242188,
"learning_rate": 9.893722776416415e-06,
"loss": 1.1507,
"step": 2102
},
{
"epoch": 0.5627508696815627,
"grad_norm": 3.6054041385650635,
"learning_rate": 9.893541210275736e-06,
"loss": 1.216,
"step": 2103
},
{
"epoch": 0.563018464008563,
"grad_norm": 3.552934408187866,
"learning_rate": 9.893359490840662e-06,
"loss": 1.2079,
"step": 2104
},
{
"epoch": 0.5632860583355633,
"grad_norm": 3.804652690887451,
"learning_rate": 9.893177618116885e-06,
"loss": 1.2398,
"step": 2105
},
{
"epoch": 0.5635536526625635,
"grad_norm": 3.506537675857544,
"learning_rate": 9.892995592110099e-06,
"loss": 1.1581,
"step": 2106
},
{
"epoch": 0.5638212469895638,
"grad_norm": 3.4469141960144043,
"learning_rate": 9.89281341282601e-06,
"loss": 1.1475,
"step": 2107
},
{
"epoch": 0.5640888413165641,
"grad_norm": 3.478013753890991,
"learning_rate": 9.892631080270325e-06,
"loss": 1.2376,
"step": 2108
},
{
"epoch": 0.5643564356435643,
"grad_norm": 3.774752378463745,
"learning_rate": 9.89244859444875e-06,
"loss": 1.1787,
"step": 2109
},
{
"epoch": 0.5646240299705646,
"grad_norm": 3.786384344100952,
"learning_rate": 9.89226595536701e-06,
"loss": 1.2119,
"step": 2110
},
{
"epoch": 0.5648916242975649,
"grad_norm": 3.7795796394348145,
"learning_rate": 9.892083163030822e-06,
"loss": 1.1884,
"step": 2111
},
{
"epoch": 0.5651592186245652,
"grad_norm": 3.500213146209717,
"learning_rate": 9.89190021744591e-06,
"loss": 1.01,
"step": 2112
},
{
"epoch": 0.5654268129515654,
"grad_norm": 3.490860939025879,
"learning_rate": 9.891717118618008e-06,
"loss": 1.1551,
"step": 2113
},
{
"epoch": 0.5656944072785657,
"grad_norm": 3.658153772354126,
"learning_rate": 9.891533866552852e-06,
"loss": 1.2155,
"step": 2114
},
{
"epoch": 0.565962001605566,
"grad_norm": 3.7145233154296875,
"learning_rate": 9.891350461256179e-06,
"loss": 1.2243,
"step": 2115
},
{
"epoch": 0.5662295959325663,
"grad_norm": 3.5172886848449707,
"learning_rate": 9.89116690273374e-06,
"loss": 1.206,
"step": 2116
},
{
"epoch": 0.5664971902595665,
"grad_norm": 3.58321475982666,
"learning_rate": 9.890983190991278e-06,
"loss": 1.2536,
"step": 2117
},
{
"epoch": 0.5667647845865668,
"grad_norm": 3.534895420074463,
"learning_rate": 9.890799326034556e-06,
"loss": 1.1384,
"step": 2118
},
{
"epoch": 0.5670323789135671,
"grad_norm": 3.564685583114624,
"learning_rate": 9.890615307869326e-06,
"loss": 1.1677,
"step": 2119
},
{
"epoch": 0.5672999732405672,
"grad_norm": 4.110241413116455,
"learning_rate": 9.89043113650136e-06,
"loss": 1.1706,
"step": 2120
},
{
"epoch": 0.5675675675675675,
"grad_norm": 3.5671589374542236,
"learning_rate": 9.890246811936421e-06,
"loss": 1.1117,
"step": 2121
},
{
"epoch": 0.5678351618945678,
"grad_norm": 3.429584264755249,
"learning_rate": 9.890062334180286e-06,
"loss": 1.1273,
"step": 2122
},
{
"epoch": 0.5681027562215681,
"grad_norm": 3.8296971321105957,
"learning_rate": 9.889877703238732e-06,
"loss": 1.2361,
"step": 2123
},
{
"epoch": 0.5683703505485683,
"grad_norm": 3.43332839012146,
"learning_rate": 9.889692919117546e-06,
"loss": 1.0847,
"step": 2124
},
{
"epoch": 0.5686379448755686,
"grad_norm": 3.417013168334961,
"learning_rate": 9.889507981822515e-06,
"loss": 1.1709,
"step": 2125
},
{
"epoch": 0.5689055392025689,
"grad_norm": 3.507187843322754,
"learning_rate": 9.88932289135943e-06,
"loss": 1.1102,
"step": 2126
},
{
"epoch": 0.5691731335295692,
"grad_norm": 3.821469783782959,
"learning_rate": 9.889137647734094e-06,
"loss": 1.1736,
"step": 2127
},
{
"epoch": 0.5694407278565694,
"grad_norm": 3.63112735748291,
"learning_rate": 9.888952250952305e-06,
"loss": 1.1239,
"step": 2128
},
{
"epoch": 0.5697083221835697,
"grad_norm": 4.068948745727539,
"learning_rate": 9.888766701019873e-06,
"loss": 1.2714,
"step": 2129
},
{
"epoch": 0.56997591651057,
"grad_norm": 3.552907943725586,
"learning_rate": 9.88858099794261e-06,
"loss": 1.0754,
"step": 2130
},
{
"epoch": 0.5702435108375702,
"grad_norm": 4.019528388977051,
"learning_rate": 9.888395141726335e-06,
"loss": 1.2183,
"step": 2131
},
{
"epoch": 0.5705111051645705,
"grad_norm": 3.55165696144104,
"learning_rate": 9.888209132376866e-06,
"loss": 1.0137,
"step": 2132
},
{
"epoch": 0.5707786994915708,
"grad_norm": 3.8330440521240234,
"learning_rate": 9.888022969900036e-06,
"loss": 1.2188,
"step": 2133
},
{
"epoch": 0.5710462938185711,
"grad_norm": 3.5315418243408203,
"learning_rate": 9.887836654301671e-06,
"loss": 1.1769,
"step": 2134
},
{
"epoch": 0.5713138881455713,
"grad_norm": 3.613337755203247,
"learning_rate": 9.887650185587612e-06,
"loss": 1.1539,
"step": 2135
},
{
"epoch": 0.5715814824725716,
"grad_norm": 3.3528521060943604,
"learning_rate": 9.887463563763695e-06,
"loss": 1.1673,
"step": 2136
},
{
"epoch": 0.5718490767995719,
"grad_norm": 3.672227382659912,
"learning_rate": 9.887276788835772e-06,
"loss": 1.3125,
"step": 2137
},
{
"epoch": 0.5721166711265722,
"grad_norm": 3.4449851512908936,
"learning_rate": 9.88708986080969e-06,
"loss": 1.1545,
"step": 2138
},
{
"epoch": 0.5723842654535723,
"grad_norm": 3.5263442993164062,
"learning_rate": 9.886902779691306e-06,
"loss": 1.1188,
"step": 2139
},
{
"epoch": 0.5726518597805726,
"grad_norm": 3.499302864074707,
"learning_rate": 9.88671554548648e-06,
"loss": 1.2045,
"step": 2140
},
{
"epoch": 0.572919454107573,
"grad_norm": 3.5615437030792236,
"learning_rate": 9.886528158201076e-06,
"loss": 1.1357,
"step": 2141
},
{
"epoch": 0.5731870484345731,
"grad_norm": 3.0443129539489746,
"learning_rate": 9.886340617840968e-06,
"loss": 0.9957,
"step": 2142
},
{
"epoch": 0.5734546427615734,
"grad_norm": 3.494044542312622,
"learning_rate": 9.886152924412027e-06,
"loss": 1.1044,
"step": 2143
},
{
"epoch": 0.5737222370885737,
"grad_norm": 3.9444684982299805,
"learning_rate": 9.885965077920135e-06,
"loss": 1.2436,
"step": 2144
},
{
"epoch": 0.573989831415574,
"grad_norm": 3.808692455291748,
"learning_rate": 9.885777078371174e-06,
"loss": 1.2591,
"step": 2145
},
{
"epoch": 0.5742574257425742,
"grad_norm": 3.586069107055664,
"learning_rate": 9.885588925771037e-06,
"loss": 1.1695,
"step": 2146
},
{
"epoch": 0.5745250200695745,
"grad_norm": 3.6232335567474365,
"learning_rate": 9.885400620125616e-06,
"loss": 1.2411,
"step": 2147
},
{
"epoch": 0.5747926143965748,
"grad_norm": 4.283682346343994,
"learning_rate": 9.885212161440808e-06,
"loss": 1.2519,
"step": 2148
},
{
"epoch": 0.5750602087235751,
"grad_norm": 3.391270160675049,
"learning_rate": 9.885023549722518e-06,
"loss": 1.1671,
"step": 2149
},
{
"epoch": 0.5753278030505753,
"grad_norm": 3.8860385417938232,
"learning_rate": 9.884834784976658e-06,
"loss": 1.1987,
"step": 2150
},
{
"epoch": 0.5755953973775756,
"grad_norm": 3.611828565597534,
"learning_rate": 9.884645867209133e-06,
"loss": 1.2138,
"step": 2151
},
{
"epoch": 0.5758629917045759,
"grad_norm": 3.7692012786865234,
"learning_rate": 9.884456796425869e-06,
"loss": 1.2613,
"step": 2152
},
{
"epoch": 0.5761305860315761,
"grad_norm": 3.578130006790161,
"learning_rate": 9.884267572632786e-06,
"loss": 1.2619,
"step": 2153
},
{
"epoch": 0.5763981803585764,
"grad_norm": 3.362647771835327,
"learning_rate": 9.884078195835812e-06,
"loss": 1.138,
"step": 2154
},
{
"epoch": 0.5766657746855767,
"grad_norm": 3.4358744621276855,
"learning_rate": 9.883888666040876e-06,
"loss": 1.1468,
"step": 2155
},
{
"epoch": 0.576933369012577,
"grad_norm": 3.8814890384674072,
"learning_rate": 9.88369898325392e-06,
"loss": 1.2645,
"step": 2156
},
{
"epoch": 0.5772009633395772,
"grad_norm": 3.75591778755188,
"learning_rate": 9.883509147480883e-06,
"loss": 1.2342,
"step": 2157
},
{
"epoch": 0.5774685576665775,
"grad_norm": 3.7901089191436768,
"learning_rate": 9.883319158727714e-06,
"loss": 1.2423,
"step": 2158
},
{
"epoch": 0.5777361519935778,
"grad_norm": 3.8552255630493164,
"learning_rate": 9.88312901700036e-06,
"loss": 1.2367,
"step": 2159
},
{
"epoch": 0.578003746320578,
"grad_norm": 3.6209921836853027,
"learning_rate": 9.882938722304785e-06,
"loss": 1.0368,
"step": 2160
},
{
"epoch": 0.5782713406475782,
"grad_norm": 3.403076171875,
"learning_rate": 9.882748274646942e-06,
"loss": 1.122,
"step": 2161
},
{
"epoch": 0.5785389349745785,
"grad_norm": 3.6946861743927,
"learning_rate": 9.882557674032804e-06,
"loss": 1.2632,
"step": 2162
},
{
"epoch": 0.5788065293015788,
"grad_norm": 3.478731393814087,
"learning_rate": 9.882366920468336e-06,
"loss": 1.1385,
"step": 2163
},
{
"epoch": 0.5790741236285791,
"grad_norm": 3.849747896194458,
"learning_rate": 9.882176013959517e-06,
"loss": 1.1953,
"step": 2164
},
{
"epoch": 0.5793417179555793,
"grad_norm": 3.2899606227874756,
"learning_rate": 9.881984954512325e-06,
"loss": 1.1515,
"step": 2165
},
{
"epoch": 0.5796093122825796,
"grad_norm": 3.6500260829925537,
"learning_rate": 9.881793742132748e-06,
"loss": 1.0992,
"step": 2166
},
{
"epoch": 0.5798769066095799,
"grad_norm": 3.4262735843658447,
"learning_rate": 9.881602376826773e-06,
"loss": 1.202,
"step": 2167
},
{
"epoch": 0.5801445009365801,
"grad_norm": 3.7987382411956787,
"learning_rate": 9.881410858600397e-06,
"loss": 1.1983,
"step": 2168
},
{
"epoch": 0.5804120952635804,
"grad_norm": 3.716843605041504,
"learning_rate": 9.88121918745962e-06,
"loss": 1.2688,
"step": 2169
},
{
"epoch": 0.5806796895905807,
"grad_norm": 3.5449235439300537,
"learning_rate": 9.881027363410441e-06,
"loss": 1.1251,
"step": 2170
},
{
"epoch": 0.580947283917581,
"grad_norm": 3.4094340801239014,
"learning_rate": 9.880835386458873e-06,
"loss": 1.1097,
"step": 2171
},
{
"epoch": 0.5812148782445812,
"grad_norm": 3.626004934310913,
"learning_rate": 9.880643256610931e-06,
"loss": 1.2376,
"step": 2172
},
{
"epoch": 0.5814824725715815,
"grad_norm": 3.6833388805389404,
"learning_rate": 9.880450973872632e-06,
"loss": 1.2113,
"step": 2173
},
{
"epoch": 0.5817500668985818,
"grad_norm": 3.619957685470581,
"learning_rate": 9.880258538250001e-06,
"loss": 1.1476,
"step": 2174
},
{
"epoch": 0.5820176612255821,
"grad_norm": 3.6567726135253906,
"learning_rate": 9.880065949749063e-06,
"loss": 1.1462,
"step": 2175
},
{
"epoch": 0.5822852555525823,
"grad_norm": 3.6730329990386963,
"learning_rate": 9.879873208375854e-06,
"loss": 1.1644,
"step": 2176
},
{
"epoch": 0.5825528498795826,
"grad_norm": 3.4919209480285645,
"learning_rate": 9.879680314136409e-06,
"loss": 1.1,
"step": 2177
},
{
"epoch": 0.5828204442065829,
"grad_norm": 3.7565135955810547,
"learning_rate": 9.879487267036774e-06,
"loss": 1.176,
"step": 2178
},
{
"epoch": 0.583088038533583,
"grad_norm": 3.6856677532196045,
"learning_rate": 9.879294067082994e-06,
"loss": 1.0928,
"step": 2179
},
{
"epoch": 0.5833556328605833,
"grad_norm": 3.7307024002075195,
"learning_rate": 9.87910071428112e-06,
"loss": 1.2073,
"step": 2180
},
{
"epoch": 0.5836232271875836,
"grad_norm": 3.657536745071411,
"learning_rate": 9.878907208637214e-06,
"loss": 1.1352,
"step": 2181
},
{
"epoch": 0.5838908215145839,
"grad_norm": 3.54951810836792,
"learning_rate": 9.878713550157331e-06,
"loss": 1.1183,
"step": 2182
},
{
"epoch": 0.5841584158415841,
"grad_norm": 3.6624770164489746,
"learning_rate": 9.878519738847543e-06,
"loss": 1.1002,
"step": 2183
},
{
"epoch": 0.5844260101685844,
"grad_norm": 3.5097527503967285,
"learning_rate": 9.87832577471392e-06,
"loss": 1.2197,
"step": 2184
},
{
"epoch": 0.5846936044955847,
"grad_norm": 3.292865037918091,
"learning_rate": 9.878131657762535e-06,
"loss": 1.0721,
"step": 2185
},
{
"epoch": 0.584961198822585,
"grad_norm": 3.937479019165039,
"learning_rate": 9.877937387999473e-06,
"loss": 1.3144,
"step": 2186
},
{
"epoch": 0.5852287931495852,
"grad_norm": 3.6731297969818115,
"learning_rate": 9.877742965430816e-06,
"loss": 1.0068,
"step": 2187
},
{
"epoch": 0.5854963874765855,
"grad_norm": 3.6115329265594482,
"learning_rate": 9.877548390062656e-06,
"loss": 1.1998,
"step": 2188
},
{
"epoch": 0.5857639818035858,
"grad_norm": 3.6412646770477295,
"learning_rate": 9.87735366190109e-06,
"loss": 1.1148,
"step": 2189
},
{
"epoch": 0.586031576130586,
"grad_norm": 3.576279401779175,
"learning_rate": 9.877158780952218e-06,
"loss": 1.1437,
"step": 2190
},
{
"epoch": 0.5862991704575863,
"grad_norm": 3.5560824871063232,
"learning_rate": 9.876963747222142e-06,
"loss": 1.1313,
"step": 2191
},
{
"epoch": 0.5865667647845866,
"grad_norm": 3.5082075595855713,
"learning_rate": 9.876768560716972e-06,
"loss": 1.1694,
"step": 2192
},
{
"epoch": 0.5868343591115869,
"grad_norm": 3.724195718765259,
"learning_rate": 9.876573221442824e-06,
"loss": 1.212,
"step": 2193
},
{
"epoch": 0.5871019534385871,
"grad_norm": 3.5083227157592773,
"learning_rate": 9.876377729405817e-06,
"loss": 1.1469,
"step": 2194
},
{
"epoch": 0.5873695477655874,
"grad_norm": 3.8225934505462646,
"learning_rate": 9.876182084612076e-06,
"loss": 1.2833,
"step": 2195
},
{
"epoch": 0.5876371420925877,
"grad_norm": 4.439055919647217,
"learning_rate": 9.875986287067726e-06,
"loss": 1.2021,
"step": 2196
},
{
"epoch": 0.587904736419588,
"grad_norm": 3.612614393234253,
"learning_rate": 9.875790336778903e-06,
"loss": 1.1595,
"step": 2197
},
{
"epoch": 0.5881723307465881,
"grad_norm": 3.598160982131958,
"learning_rate": 9.875594233751746e-06,
"loss": 1.1245,
"step": 2198
},
{
"epoch": 0.5884399250735884,
"grad_norm": 3.281412124633789,
"learning_rate": 9.875397977992397e-06,
"loss": 1.075,
"step": 2199
},
{
"epoch": 0.5887075194005887,
"grad_norm": 3.0353622436523438,
"learning_rate": 9.875201569507004e-06,
"loss": 1.0529,
"step": 2200
},
{
"epoch": 0.5889751137275889,
"grad_norm": 3.3993475437164307,
"learning_rate": 9.875005008301719e-06,
"loss": 1.2462,
"step": 2201
},
{
"epoch": 0.5892427080545892,
"grad_norm": 3.3722541332244873,
"learning_rate": 9.8748082943827e-06,
"loss": 1.1508,
"step": 2202
},
{
"epoch": 0.5895103023815895,
"grad_norm": 3.270134687423706,
"learning_rate": 9.874611427756111e-06,
"loss": 1.2163,
"step": 2203
},
{
"epoch": 0.5897778967085898,
"grad_norm": 3.6814143657684326,
"learning_rate": 9.874414408428116e-06,
"loss": 1.2098,
"step": 2204
},
{
"epoch": 0.59004549103559,
"grad_norm": 3.3593337535858154,
"learning_rate": 9.874217236404889e-06,
"loss": 1.1041,
"step": 2205
},
{
"epoch": 0.5903130853625903,
"grad_norm": 3.556748628616333,
"learning_rate": 9.874019911692606e-06,
"loss": 1.1655,
"step": 2206
},
{
"epoch": 0.5905806796895906,
"grad_norm": 3.696110486984253,
"learning_rate": 9.873822434297448e-06,
"loss": 1.0674,
"step": 2207
},
{
"epoch": 0.5908482740165909,
"grad_norm": 3.481388807296753,
"learning_rate": 9.873624804225602e-06,
"loss": 1.1462,
"step": 2208
},
{
"epoch": 0.5911158683435911,
"grad_norm": 3.828707695007324,
"learning_rate": 9.873427021483256e-06,
"loss": 1.1755,
"step": 2209
},
{
"epoch": 0.5913834626705914,
"grad_norm": 3.4342329502105713,
"learning_rate": 9.87322908607661e-06,
"loss": 1.1296,
"step": 2210
},
{
"epoch": 0.5916510569975917,
"grad_norm": 4.072646141052246,
"learning_rate": 9.873030998011861e-06,
"loss": 1.4232,
"step": 2211
},
{
"epoch": 0.5919186513245919,
"grad_norm": 3.8405468463897705,
"learning_rate": 9.872832757295216e-06,
"loss": 1.2178,
"step": 2212
},
{
"epoch": 0.5921862456515922,
"grad_norm": 3.6950206756591797,
"learning_rate": 9.872634363932887e-06,
"loss": 1.1332,
"step": 2213
},
{
"epoch": 0.5924538399785925,
"grad_norm": 4.053956985473633,
"learning_rate": 9.872435817931085e-06,
"loss": 1.3148,
"step": 2214
},
{
"epoch": 0.5927214343055928,
"grad_norm": 3.2921195030212402,
"learning_rate": 9.87223711929603e-06,
"loss": 1.0644,
"step": 2215
},
{
"epoch": 0.592989028632593,
"grad_norm": 3.3384501934051514,
"learning_rate": 9.87203826803395e-06,
"loss": 1.1453,
"step": 2216
},
{
"epoch": 0.5932566229595932,
"grad_norm": 3.292581558227539,
"learning_rate": 9.871839264151071e-06,
"loss": 1.0399,
"step": 2217
},
{
"epoch": 0.5935242172865935,
"grad_norm": 3.5483226776123047,
"learning_rate": 9.871640107653629e-06,
"loss": 1.2065,
"step": 2218
},
{
"epoch": 0.5937918116135938,
"grad_norm": 3.443068742752075,
"learning_rate": 9.87144079854786e-06,
"loss": 1.1305,
"step": 2219
},
{
"epoch": 0.594059405940594,
"grad_norm": 3.495704412460327,
"learning_rate": 9.871241336840009e-06,
"loss": 1.1877,
"step": 2220
},
{
"epoch": 0.5943270002675943,
"grad_norm": 3.619189739227295,
"learning_rate": 9.871041722536326e-06,
"loss": 1.1417,
"step": 2221
},
{
"epoch": 0.5945945945945946,
"grad_norm": 3.7714147567749023,
"learning_rate": 9.87084195564306e-06,
"loss": 1.2656,
"step": 2222
},
{
"epoch": 0.5948621889215948,
"grad_norm": 3.2320003509521484,
"learning_rate": 9.870642036166474e-06,
"loss": 0.9794,
"step": 2223
},
{
"epoch": 0.5951297832485951,
"grad_norm": 3.6784067153930664,
"learning_rate": 9.870441964112826e-06,
"loss": 1.149,
"step": 2224
},
{
"epoch": 0.5953973775755954,
"grad_norm": 3.8272829055786133,
"learning_rate": 9.870241739488387e-06,
"loss": 1.2293,
"step": 2225
},
{
"epoch": 0.5956649719025957,
"grad_norm": 3.3917317390441895,
"learning_rate": 9.870041362299428e-06,
"loss": 1.0405,
"step": 2226
},
{
"epoch": 0.5959325662295959,
"grad_norm": 3.6060194969177246,
"learning_rate": 9.869840832552224e-06,
"loss": 1.2424,
"step": 2227
},
{
"epoch": 0.5962001605565962,
"grad_norm": 3.5458180904388428,
"learning_rate": 9.86964015025306e-06,
"loss": 1.2202,
"step": 2228
},
{
"epoch": 0.5964677548835965,
"grad_norm": 3.5996251106262207,
"learning_rate": 9.86943931540822e-06,
"loss": 1.2383,
"step": 2229
},
{
"epoch": 0.5967353492105968,
"grad_norm": 3.6928818225860596,
"learning_rate": 9.869238328023996e-06,
"loss": 1.0798,
"step": 2230
},
{
"epoch": 0.597002943537597,
"grad_norm": 3.3863589763641357,
"learning_rate": 9.869037188106684e-06,
"loss": 1.0548,
"step": 2231
},
{
"epoch": 0.5972705378645973,
"grad_norm": 3.744899272918701,
"learning_rate": 9.868835895662588e-06,
"loss": 1.1532,
"step": 2232
},
{
"epoch": 0.5975381321915976,
"grad_norm": 4.080715656280518,
"learning_rate": 9.868634450698009e-06,
"loss": 1.2823,
"step": 2233
},
{
"epoch": 0.5978057265185978,
"grad_norm": 4.020185947418213,
"learning_rate": 9.868432853219259e-06,
"loss": 1.3154,
"step": 2234
},
{
"epoch": 0.598073320845598,
"grad_norm": 3.684755325317383,
"learning_rate": 9.868231103232655e-06,
"loss": 1.1825,
"step": 2235
},
{
"epoch": 0.5983409151725984,
"grad_norm": 3.9021434783935547,
"learning_rate": 9.868029200744515e-06,
"loss": 1.3453,
"step": 2236
},
{
"epoch": 0.5986085094995987,
"grad_norm": 3.224306344985962,
"learning_rate": 9.867827145761164e-06,
"loss": 1.1202,
"step": 2237
},
{
"epoch": 0.5988761038265988,
"grad_norm": 3.318912982940674,
"learning_rate": 9.86762493828893e-06,
"loss": 1.0726,
"step": 2238
},
{
"epoch": 0.5991436981535991,
"grad_norm": 3.5440762042999268,
"learning_rate": 9.867422578334154e-06,
"loss": 1.1485,
"step": 2239
},
{
"epoch": 0.5994112924805994,
"grad_norm": 3.5095126628875732,
"learning_rate": 9.867220065903167e-06,
"loss": 1.1142,
"step": 2240
},
{
"epoch": 0.5996788868075997,
"grad_norm": 3.494436502456665,
"learning_rate": 9.867017401002316e-06,
"loss": 1.0809,
"step": 2241
},
{
"epoch": 0.5999464811345999,
"grad_norm": 3.985200881958008,
"learning_rate": 9.86681458363795e-06,
"loss": 1.1823,
"step": 2242
},
{
"epoch": 0.6002140754616002,
"grad_norm": 3.566523313522339,
"learning_rate": 9.866611613816425e-06,
"loss": 1.2669,
"step": 2243
},
{
"epoch": 0.6004816697886005,
"grad_norm": 3.53113055229187,
"learning_rate": 9.866408491544095e-06,
"loss": 1.0821,
"step": 2244
},
{
"epoch": 0.6007492641156007,
"grad_norm": 3.8554863929748535,
"learning_rate": 9.866205216827323e-06,
"loss": 1.3485,
"step": 2245
},
{
"epoch": 0.601016858442601,
"grad_norm": 3.4552130699157715,
"learning_rate": 9.866001789672479e-06,
"loss": 1.1277,
"step": 2246
},
{
"epoch": 0.6012844527696013,
"grad_norm": 3.2801413536071777,
"learning_rate": 9.865798210085935e-06,
"loss": 1.0526,
"step": 2247
},
{
"epoch": 0.6015520470966016,
"grad_norm": 3.6641762256622314,
"learning_rate": 9.865594478074068e-06,
"loss": 1.1389,
"step": 2248
},
{
"epoch": 0.6018196414236018,
"grad_norm": 3.1713666915893555,
"learning_rate": 9.865390593643261e-06,
"loss": 0.9773,
"step": 2249
},
{
"epoch": 0.6020872357506021,
"grad_norm": 3.2642340660095215,
"learning_rate": 9.8651865567999e-06,
"loss": 1.1162,
"step": 2250
},
{
"epoch": 0.6023548300776024,
"grad_norm": 3.8581626415252686,
"learning_rate": 9.864982367550375e-06,
"loss": 1.2288,
"step": 2251
},
{
"epoch": 0.6026224244046027,
"grad_norm": 3.619734525680542,
"learning_rate": 9.864778025901086e-06,
"loss": 1.1009,
"step": 2252
},
{
"epoch": 0.6028900187316029,
"grad_norm": 3.6816861629486084,
"learning_rate": 9.86457353185843e-06,
"loss": 1.2656,
"step": 2253
},
{
"epoch": 0.6031576130586032,
"grad_norm": 3.9430642127990723,
"learning_rate": 9.864368885428816e-06,
"loss": 1.2013,
"step": 2254
},
{
"epoch": 0.6034252073856035,
"grad_norm": 3.3938138484954834,
"learning_rate": 9.864164086618656e-06,
"loss": 1.0831,
"step": 2255
},
{
"epoch": 0.6036928017126036,
"grad_norm": 3.3266994953155518,
"learning_rate": 9.863959135434361e-06,
"loss": 1.1322,
"step": 2256
},
{
"epoch": 0.6039603960396039,
"grad_norm": 3.3137824535369873,
"learning_rate": 9.863754031882355e-06,
"loss": 1.1232,
"step": 2257
},
{
"epoch": 0.6042279903666042,
"grad_norm": 3.363191604614258,
"learning_rate": 9.863548775969061e-06,
"loss": 1.0118,
"step": 2258
},
{
"epoch": 0.6044955846936045,
"grad_norm": 3.245950222015381,
"learning_rate": 9.863343367700909e-06,
"loss": 1.0168,
"step": 2259
},
{
"epoch": 0.6047631790206047,
"grad_norm": 3.398611545562744,
"learning_rate": 9.863137807084336e-06,
"loss": 1.1561,
"step": 2260
},
{
"epoch": 0.605030773347605,
"grad_norm": 3.793672800064087,
"learning_rate": 9.862932094125778e-06,
"loss": 1.0614,
"step": 2261
},
{
"epoch": 0.6052983676746053,
"grad_norm": 3.716275691986084,
"learning_rate": 9.86272622883168e-06,
"loss": 1.2033,
"step": 2262
},
{
"epoch": 0.6055659620016056,
"grad_norm": 3.418994903564453,
"learning_rate": 9.862520211208493e-06,
"loss": 1.1246,
"step": 2263
},
{
"epoch": 0.6058335563286058,
"grad_norm": 3.4987545013427734,
"learning_rate": 9.862314041262668e-06,
"loss": 1.1269,
"step": 2264
},
{
"epoch": 0.6061011506556061,
"grad_norm": 3.595693826675415,
"learning_rate": 9.862107719000667e-06,
"loss": 1.1729,
"step": 2265
},
{
"epoch": 0.6063687449826064,
"grad_norm": 3.5446066856384277,
"learning_rate": 9.861901244428949e-06,
"loss": 1.1141,
"step": 2266
},
{
"epoch": 0.6066363393096066,
"grad_norm": 3.108658790588379,
"learning_rate": 9.861694617553983e-06,
"loss": 1.0365,
"step": 2267
},
{
"epoch": 0.6069039336366069,
"grad_norm": 3.6176912784576416,
"learning_rate": 9.861487838382244e-06,
"loss": 1.182,
"step": 2268
},
{
"epoch": 0.6071715279636072,
"grad_norm": 3.7221384048461914,
"learning_rate": 9.861280906920208e-06,
"loss": 1.0479,
"step": 2269
},
{
"epoch": 0.6074391222906075,
"grad_norm": 3.526144504547119,
"learning_rate": 9.861073823174357e-06,
"loss": 1.0778,
"step": 2270
},
{
"epoch": 0.6077067166176077,
"grad_norm": 3.49381160736084,
"learning_rate": 9.86086658715118e-06,
"loss": 1.0654,
"step": 2271
},
{
"epoch": 0.607974310944608,
"grad_norm": 3.485805034637451,
"learning_rate": 9.860659198857166e-06,
"loss": 1.138,
"step": 2272
},
{
"epoch": 0.6082419052716083,
"grad_norm": 3.3944783210754395,
"learning_rate": 9.860451658298813e-06,
"loss": 1.1153,
"step": 2273
},
{
"epoch": 0.6085094995986086,
"grad_norm": 3.5149385929107666,
"learning_rate": 9.860243965482623e-06,
"loss": 1.1654,
"step": 2274
},
{
"epoch": 0.6087770939256087,
"grad_norm": 3.7925617694854736,
"learning_rate": 9.860036120415102e-06,
"loss": 1.2223,
"step": 2275
},
{
"epoch": 0.609044688252609,
"grad_norm": 3.524855852127075,
"learning_rate": 9.859828123102759e-06,
"loss": 1.0867,
"step": 2276
},
{
"epoch": 0.6093122825796093,
"grad_norm": 3.534085750579834,
"learning_rate": 9.859619973552112e-06,
"loss": 1.0719,
"step": 2277
},
{
"epoch": 0.6095798769066095,
"grad_norm": 3.5767481327056885,
"learning_rate": 9.859411671769682e-06,
"loss": 1.2826,
"step": 2278
},
{
"epoch": 0.6098474712336098,
"grad_norm": 3.26108980178833,
"learning_rate": 9.859203217761993e-06,
"loss": 1.0839,
"step": 2279
},
{
"epoch": 0.6101150655606101,
"grad_norm": 3.6995849609375,
"learning_rate": 9.858994611535572e-06,
"loss": 1.2193,
"step": 2280
},
{
"epoch": 0.6103826598876104,
"grad_norm": 3.7640321254730225,
"learning_rate": 9.858785853096958e-06,
"loss": 1.2932,
"step": 2281
},
{
"epoch": 0.6106502542146106,
"grad_norm": 3.795732021331787,
"learning_rate": 9.85857694245269e-06,
"loss": 1.3135,
"step": 2282
},
{
"epoch": 0.6109178485416109,
"grad_norm": 3.552950620651245,
"learning_rate": 9.858367879609311e-06,
"loss": 1.106,
"step": 2283
},
{
"epoch": 0.6111854428686112,
"grad_norm": 3.506056547164917,
"learning_rate": 9.85815866457337e-06,
"loss": 1.0596,
"step": 2284
},
{
"epoch": 0.6114530371956115,
"grad_norm": 3.822715997695923,
"learning_rate": 9.857949297351423e-06,
"loss": 1.1044,
"step": 2285
},
{
"epoch": 0.6117206315226117,
"grad_norm": 3.263763427734375,
"learning_rate": 9.857739777950026e-06,
"loss": 1.0387,
"step": 2286
},
{
"epoch": 0.611988225849612,
"grad_norm": 3.378865957260132,
"learning_rate": 9.857530106375743e-06,
"loss": 1.0867,
"step": 2287
},
{
"epoch": 0.6122558201766123,
"grad_norm": 3.8504269123077393,
"learning_rate": 9.857320282635143e-06,
"loss": 1.2017,
"step": 2288
},
{
"epoch": 0.6125234145036125,
"grad_norm": 3.375674247741699,
"learning_rate": 9.857110306734798e-06,
"loss": 1.0055,
"step": 2289
},
{
"epoch": 0.6127910088306128,
"grad_norm": 3.5643208026885986,
"learning_rate": 9.856900178681287e-06,
"loss": 1.2542,
"step": 2290
},
{
"epoch": 0.6130586031576131,
"grad_norm": 3.1768534183502197,
"learning_rate": 9.856689898481191e-06,
"loss": 0.9672,
"step": 2291
},
{
"epoch": 0.6133261974846134,
"grad_norm": 3.2512409687042236,
"learning_rate": 9.856479466141098e-06,
"loss": 1.065,
"step": 2292
},
{
"epoch": 0.6135937918116136,
"grad_norm": 3.486975908279419,
"learning_rate": 9.8562688816676e-06,
"loss": 1.2013,
"step": 2293
},
{
"epoch": 0.6138613861386139,
"grad_norm": 3.7750918865203857,
"learning_rate": 9.856058145067293e-06,
"loss": 1.2465,
"step": 2294
},
{
"epoch": 0.6141289804656141,
"grad_norm": 4.267007827758789,
"learning_rate": 9.85584725634678e-06,
"loss": 1.2918,
"step": 2295
},
{
"epoch": 0.6143965747926144,
"grad_norm": 3.3109710216522217,
"learning_rate": 9.855636215512666e-06,
"loss": 1.0852,
"step": 2296
},
{
"epoch": 0.6146641691196146,
"grad_norm": 3.4727590084075928,
"learning_rate": 9.85542502257156e-06,
"loss": 1.1332,
"step": 2297
},
{
"epoch": 0.6149317634466149,
"grad_norm": 3.525007724761963,
"learning_rate": 9.855213677530083e-06,
"loss": 1.1852,
"step": 2298
},
{
"epoch": 0.6151993577736152,
"grad_norm": 4.421526908874512,
"learning_rate": 9.85500218039485e-06,
"loss": 1.3733,
"step": 2299
},
{
"epoch": 0.6154669521006154,
"grad_norm": 3.6387100219726562,
"learning_rate": 9.854790531172491e-06,
"loss": 1.1027,
"step": 2300
},
{
"epoch": 0.6157345464276157,
"grad_norm": 3.6601171493530273,
"learning_rate": 9.854578729869634e-06,
"loss": 1.1533,
"step": 2301
},
{
"epoch": 0.616002140754616,
"grad_norm": 3.661722183227539,
"learning_rate": 9.854366776492915e-06,
"loss": 1.0665,
"step": 2302
},
{
"epoch": 0.6162697350816163,
"grad_norm": 3.5786993503570557,
"learning_rate": 9.85415467104897e-06,
"loss": 1.2227,
"step": 2303
},
{
"epoch": 0.6165373294086165,
"grad_norm": 3.527582883834839,
"learning_rate": 9.853942413544448e-06,
"loss": 1.2771,
"step": 2304
},
{
"epoch": 0.6168049237356168,
"grad_norm": 3.9386675357818604,
"learning_rate": 9.853730003985995e-06,
"loss": 1.2679,
"step": 2305
},
{
"epoch": 0.6170725180626171,
"grad_norm": 3.3264570236206055,
"learning_rate": 9.853517442380266e-06,
"loss": 1.093,
"step": 2306
},
{
"epoch": 0.6173401123896174,
"grad_norm": 3.631671905517578,
"learning_rate": 9.85330472873392e-06,
"loss": 1.1611,
"step": 2307
},
{
"epoch": 0.6176077067166176,
"grad_norm": 3.9412624835968018,
"learning_rate": 9.853091863053621e-06,
"loss": 1.198,
"step": 2308
},
{
"epoch": 0.6178753010436179,
"grad_norm": 3.4055187702178955,
"learning_rate": 9.852878845346035e-06,
"loss": 1.0783,
"step": 2309
},
{
"epoch": 0.6181428953706182,
"grad_norm": 3.639285087585449,
"learning_rate": 9.852665675617837e-06,
"loss": 1.2475,
"step": 2310
},
{
"epoch": 0.6184104896976184,
"grad_norm": 3.5802559852600098,
"learning_rate": 9.852452353875705e-06,
"loss": 1.1369,
"step": 2311
},
{
"epoch": 0.6186780840246187,
"grad_norm": 3.259661912918091,
"learning_rate": 9.852238880126319e-06,
"loss": 1.0025,
"step": 2312
},
{
"epoch": 0.618945678351619,
"grad_norm": 3.9171831607818604,
"learning_rate": 9.852025254376367e-06,
"loss": 1.2405,
"step": 2313
},
{
"epoch": 0.6192132726786193,
"grad_norm": 3.7371790409088135,
"learning_rate": 9.851811476632544e-06,
"loss": 1.2399,
"step": 2314
},
{
"epoch": 0.6194808670056194,
"grad_norm": 3.9764063358306885,
"learning_rate": 9.851597546901543e-06,
"loss": 1.3006,
"step": 2315
},
{
"epoch": 0.6197484613326197,
"grad_norm": 3.6764659881591797,
"learning_rate": 9.851383465190068e-06,
"loss": 1.1916,
"step": 2316
},
{
"epoch": 0.62001605565962,
"grad_norm": 3.4635825157165527,
"learning_rate": 9.851169231504825e-06,
"loss": 1.0243,
"step": 2317
},
{
"epoch": 0.6202836499866203,
"grad_norm": 3.5511868000030518,
"learning_rate": 9.850954845852522e-06,
"loss": 1.1825,
"step": 2318
},
{
"epoch": 0.6205512443136205,
"grad_norm": 3.948732376098633,
"learning_rate": 9.85074030823988e-06,
"loss": 1.3428,
"step": 2319
},
{
"epoch": 0.6208188386406208,
"grad_norm": 3.748976469039917,
"learning_rate": 9.850525618673615e-06,
"loss": 1.2,
"step": 2320
},
{
"epoch": 0.6210864329676211,
"grad_norm": 3.6761586666107178,
"learning_rate": 9.850310777160454e-06,
"loss": 1.2541,
"step": 2321
},
{
"epoch": 0.6213540272946213,
"grad_norm": 3.328855514526367,
"learning_rate": 9.85009578370713e-06,
"loss": 1.0451,
"step": 2322
},
{
"epoch": 0.6216216216216216,
"grad_norm": 3.2399799823760986,
"learning_rate": 9.849880638320372e-06,
"loss": 1.0936,
"step": 2323
},
{
"epoch": 0.6218892159486219,
"grad_norm": 3.481745481491089,
"learning_rate": 9.849665341006924e-06,
"loss": 1.2136,
"step": 2324
},
{
"epoch": 0.6221568102756222,
"grad_norm": 3.219832181930542,
"learning_rate": 9.849449891773529e-06,
"loss": 1.059,
"step": 2325
},
{
"epoch": 0.6224244046026224,
"grad_norm": 3.4119327068328857,
"learning_rate": 9.849234290626937e-06,
"loss": 1.0072,
"step": 2326
},
{
"epoch": 0.6226919989296227,
"grad_norm": 3.2931737899780273,
"learning_rate": 9.8490185375739e-06,
"loss": 1.169,
"step": 2327
},
{
"epoch": 0.622959593256623,
"grad_norm": 4.0458760261535645,
"learning_rate": 9.848802632621177e-06,
"loss": 1.2028,
"step": 2328
},
{
"epoch": 0.6232271875836233,
"grad_norm": 3.2181153297424316,
"learning_rate": 9.848586575775534e-06,
"loss": 0.9779,
"step": 2329
},
{
"epoch": 0.6234947819106235,
"grad_norm": 3.359768867492676,
"learning_rate": 9.848370367043737e-06,
"loss": 1.0074,
"step": 2330
},
{
"epoch": 0.6237623762376238,
"grad_norm": 3.5515081882476807,
"learning_rate": 9.848154006432559e-06,
"loss": 1.0557,
"step": 2331
},
{
"epoch": 0.6240299705646241,
"grad_norm": 4.038802623748779,
"learning_rate": 9.847937493948778e-06,
"loss": 1.1691,
"step": 2332
},
{
"epoch": 0.6242975648916242,
"grad_norm": 3.4252140522003174,
"learning_rate": 9.847720829599177e-06,
"loss": 1.0728,
"step": 2333
},
{
"epoch": 0.6245651592186245,
"grad_norm": 3.5178418159484863,
"learning_rate": 9.847504013390542e-06,
"loss": 1.0433,
"step": 2334
},
{
"epoch": 0.6248327535456248,
"grad_norm": 4.008810043334961,
"learning_rate": 9.847287045329665e-06,
"loss": 1.2534,
"step": 2335
},
{
"epoch": 0.6251003478726251,
"grad_norm": 3.4519779682159424,
"learning_rate": 9.847069925423342e-06,
"loss": 1.2137,
"step": 2336
},
{
"epoch": 0.6253679421996253,
"grad_norm": 3.9247629642486572,
"learning_rate": 9.846852653678377e-06,
"loss": 1.0946,
"step": 2337
},
{
"epoch": 0.6256355365266256,
"grad_norm": 3.3218302726745605,
"learning_rate": 9.846635230101578e-06,
"loss": 0.992,
"step": 2338
},
{
"epoch": 0.6259031308536259,
"grad_norm": 3.259517192840576,
"learning_rate": 9.846417654699748e-06,
"loss": 1.025,
"step": 2339
},
{
"epoch": 0.6261707251806262,
"grad_norm": 3.9205453395843506,
"learning_rate": 9.846199927479711e-06,
"loss": 1.2215,
"step": 2340
},
{
"epoch": 0.6264383195076264,
"grad_norm": 3.4169704914093018,
"learning_rate": 9.845982048448283e-06,
"loss": 1.0521,
"step": 2341
},
{
"epoch": 0.6267059138346267,
"grad_norm": 3.2617716789245605,
"learning_rate": 9.845764017612291e-06,
"loss": 1.0927,
"step": 2342
},
{
"epoch": 0.626973508161627,
"grad_norm": 3.432112455368042,
"learning_rate": 9.845545834978565e-06,
"loss": 1.0838,
"step": 2343
},
{
"epoch": 0.6272411024886272,
"grad_norm": 3.6730408668518066,
"learning_rate": 9.845327500553938e-06,
"loss": 1.1048,
"step": 2344
},
{
"epoch": 0.6275086968156275,
"grad_norm": 3.4062979221343994,
"learning_rate": 9.845109014345251e-06,
"loss": 1.1069,
"step": 2345
},
{
"epoch": 0.6277762911426278,
"grad_norm": 3.237093687057495,
"learning_rate": 9.844890376359348e-06,
"loss": 1.1357,
"step": 2346
},
{
"epoch": 0.6280438854696281,
"grad_norm": 3.722663640975952,
"learning_rate": 9.844671586603079e-06,
"loss": 1.2362,
"step": 2347
},
{
"epoch": 0.6283114797966283,
"grad_norm": 3.7158944606781006,
"learning_rate": 9.844452645083295e-06,
"loss": 1.2066,
"step": 2348
},
{
"epoch": 0.6285790741236286,
"grad_norm": 3.6207492351531982,
"learning_rate": 9.844233551806857e-06,
"loss": 1.1971,
"step": 2349
},
{
"epoch": 0.6288466684506289,
"grad_norm": 3.799163579940796,
"learning_rate": 9.844014306780627e-06,
"loss": 1.1569,
"step": 2350
},
{
"epoch": 0.6291142627776292,
"grad_norm": 3.326672077178955,
"learning_rate": 9.843794910011476e-06,
"loss": 1.0336,
"step": 2351
},
{
"epoch": 0.6293818571046293,
"grad_norm": 3.5804383754730225,
"learning_rate": 9.84357536150627e-06,
"loss": 1.2486,
"step": 2352
},
{
"epoch": 0.6296494514316296,
"grad_norm": 3.147380828857422,
"learning_rate": 9.843355661271895e-06,
"loss": 1.0599,
"step": 2353
},
{
"epoch": 0.62991704575863,
"grad_norm": 3.6518685817718506,
"learning_rate": 9.843135809315227e-06,
"loss": 1.195,
"step": 2354
},
{
"epoch": 0.6301846400856301,
"grad_norm": 3.393224000930786,
"learning_rate": 9.842915805643156e-06,
"loss": 1.1262,
"step": 2355
},
{
"epoch": 0.6304522344126304,
"grad_norm": 3.6997387409210205,
"learning_rate": 9.842695650262573e-06,
"loss": 1.1872,
"step": 2356
},
{
"epoch": 0.6307198287396307,
"grad_norm": 3.419063091278076,
"learning_rate": 9.842475343180375e-06,
"loss": 1.2947,
"step": 2357
},
{
"epoch": 0.630987423066631,
"grad_norm": 3.517101764678955,
"learning_rate": 9.842254884403463e-06,
"loss": 1.2461,
"step": 2358
},
{
"epoch": 0.6312550173936312,
"grad_norm": 3.4831290245056152,
"learning_rate": 9.842034273938744e-06,
"loss": 1.153,
"step": 2359
},
{
"epoch": 0.6315226117206315,
"grad_norm": 3.965106248855591,
"learning_rate": 9.841813511793126e-06,
"loss": 1.2851,
"step": 2360
},
{
"epoch": 0.6317902060476318,
"grad_norm": 2.9913620948791504,
"learning_rate": 9.841592597973528e-06,
"loss": 1.1356,
"step": 2361
},
{
"epoch": 0.6320578003746321,
"grad_norm": 3.26570463180542,
"learning_rate": 9.841371532486867e-06,
"loss": 1.1497,
"step": 2362
},
{
"epoch": 0.6323253947016323,
"grad_norm": 3.5169339179992676,
"learning_rate": 9.841150315340071e-06,
"loss": 1.1598,
"step": 2363
},
{
"epoch": 0.6325929890286326,
"grad_norm": 3.4498212337493896,
"learning_rate": 9.84092894654007e-06,
"loss": 1.1632,
"step": 2364
},
{
"epoch": 0.6328605833556329,
"grad_norm": 4.316896915435791,
"learning_rate": 9.840707426093795e-06,
"loss": 1.2331,
"step": 2365
},
{
"epoch": 0.6331281776826331,
"grad_norm": 3.5566680431365967,
"learning_rate": 9.840485754008188e-06,
"loss": 1.1958,
"step": 2366
},
{
"epoch": 0.6333957720096334,
"grad_norm": 3.55718994140625,
"learning_rate": 9.840263930290192e-06,
"loss": 1.1707,
"step": 2367
},
{
"epoch": 0.6336633663366337,
"grad_norm": 3.3516623973846436,
"learning_rate": 9.840041954946757e-06,
"loss": 1.1279,
"step": 2368
},
{
"epoch": 0.633930960663634,
"grad_norm": 3.7608842849731445,
"learning_rate": 9.839819827984835e-06,
"loss": 1.0901,
"step": 2369
},
{
"epoch": 0.6341985549906342,
"grad_norm": 3.342604637145996,
"learning_rate": 9.839597549411389e-06,
"loss": 1.1313,
"step": 2370
},
{
"epoch": 0.6344661493176345,
"grad_norm": 3.692324161529541,
"learning_rate": 9.839375119233375e-06,
"loss": 1.1783,
"step": 2371
},
{
"epoch": 0.6347337436446348,
"grad_norm": 3.4589786529541016,
"learning_rate": 9.839152537457764e-06,
"loss": 1.0795,
"step": 2372
},
{
"epoch": 0.635001337971635,
"grad_norm": 3.76045560836792,
"learning_rate": 9.83892980409153e-06,
"loss": 1.2872,
"step": 2373
},
{
"epoch": 0.6352689322986352,
"grad_norm": 3.486509323120117,
"learning_rate": 9.838706919141649e-06,
"loss": 0.9929,
"step": 2374
},
{
"epoch": 0.6355365266256355,
"grad_norm": 3.1999824047088623,
"learning_rate": 9.838483882615101e-06,
"loss": 1.1086,
"step": 2375
},
{
"epoch": 0.6358041209526358,
"grad_norm": 3.3866939544677734,
"learning_rate": 9.838260694518877e-06,
"loss": 1.1782,
"step": 2376
},
{
"epoch": 0.636071715279636,
"grad_norm": 3.8350670337677,
"learning_rate": 9.838037354859967e-06,
"loss": 1.2023,
"step": 2377
},
{
"epoch": 0.6363393096066363,
"grad_norm": 3.401334762573242,
"learning_rate": 9.837813863645367e-06,
"loss": 1.2159,
"step": 2378
},
{
"epoch": 0.6366069039336366,
"grad_norm": 3.399458646774292,
"learning_rate": 9.837590220882076e-06,
"loss": 1.03,
"step": 2379
},
{
"epoch": 0.6368744982606369,
"grad_norm": 3.1889894008636475,
"learning_rate": 9.837366426577102e-06,
"loss": 1.0268,
"step": 2380
},
{
"epoch": 0.6371420925876371,
"grad_norm": 3.411510467529297,
"learning_rate": 9.837142480737457e-06,
"loss": 1.1182,
"step": 2381
},
{
"epoch": 0.6374096869146374,
"grad_norm": 3.746042251586914,
"learning_rate": 9.836918383370153e-06,
"loss": 1.1736,
"step": 2382
},
{
"epoch": 0.6376772812416377,
"grad_norm": 3.678807020187378,
"learning_rate": 9.836694134482212e-06,
"loss": 1.1744,
"step": 2383
},
{
"epoch": 0.637944875568638,
"grad_norm": 3.2534291744232178,
"learning_rate": 9.836469734080658e-06,
"loss": 0.9784,
"step": 2384
},
{
"epoch": 0.6382124698956382,
"grad_norm": 3.305079221725464,
"learning_rate": 9.83624518217252e-06,
"loss": 1.1002,
"step": 2385
},
{
"epoch": 0.6384800642226385,
"grad_norm": 3.6762077808380127,
"learning_rate": 9.836020478764835e-06,
"loss": 1.0991,
"step": 2386
},
{
"epoch": 0.6387476585496388,
"grad_norm": 3.3923799991607666,
"learning_rate": 9.83579562386464e-06,
"loss": 1.123,
"step": 2387
},
{
"epoch": 0.639015252876639,
"grad_norm": 3.8256936073303223,
"learning_rate": 9.835570617478976e-06,
"loss": 1.1498,
"step": 2388
},
{
"epoch": 0.6392828472036393,
"grad_norm": 3.488901138305664,
"learning_rate": 9.835345459614897e-06,
"loss": 1.19,
"step": 2389
},
{
"epoch": 0.6395504415306396,
"grad_norm": 3.408535957336426,
"learning_rate": 9.835120150279454e-06,
"loss": 1.1097,
"step": 2390
},
{
"epoch": 0.6398180358576399,
"grad_norm": 3.648115634918213,
"learning_rate": 9.834894689479703e-06,
"loss": 1.0789,
"step": 2391
},
{
"epoch": 0.64008563018464,
"grad_norm": 3.6117544174194336,
"learning_rate": 9.83466907722271e-06,
"loss": 1.256,
"step": 2392
},
{
"epoch": 0.6403532245116403,
"grad_norm": 3.7180707454681396,
"learning_rate": 9.834443313515542e-06,
"loss": 1.1885,
"step": 2393
},
{
"epoch": 0.6406208188386406,
"grad_norm": 5.158202648162842,
"learning_rate": 9.834217398365268e-06,
"loss": 1.1757,
"step": 2394
},
{
"epoch": 0.6408884131656409,
"grad_norm": 3.770582914352417,
"learning_rate": 9.83399133177897e-06,
"loss": 1.2923,
"step": 2395
},
{
"epoch": 0.6411560074926411,
"grad_norm": 3.824382781982422,
"learning_rate": 9.833765113763723e-06,
"loss": 1.2032,
"step": 2396
},
{
"epoch": 0.6414236018196414,
"grad_norm": 3.29740309715271,
"learning_rate": 9.83353874432662e-06,
"loss": 1.2306,
"step": 2397
},
{
"epoch": 0.6416911961466417,
"grad_norm": 3.3349862098693848,
"learning_rate": 9.83331222347475e-06,
"loss": 1.0847,
"step": 2398
},
{
"epoch": 0.6419587904736419,
"grad_norm": 3.7271625995635986,
"learning_rate": 9.833085551215206e-06,
"loss": 1.252,
"step": 2399
},
{
"epoch": 0.6422263848006422,
"grad_norm": 3.7548937797546387,
"learning_rate": 9.832858727555095e-06,
"loss": 1.1225,
"step": 2400
},
{
"epoch": 0.6424939791276425,
"grad_norm": 3.6367075443267822,
"learning_rate": 9.832631752501515e-06,
"loss": 1.1896,
"step": 2401
},
{
"epoch": 0.6427615734546428,
"grad_norm": 3.585908889770508,
"learning_rate": 9.832404626061582e-06,
"loss": 1.24,
"step": 2402
},
{
"epoch": 0.643029167781643,
"grad_norm": 3.4509429931640625,
"learning_rate": 9.832177348242408e-06,
"loss": 1.1011,
"step": 2403
},
{
"epoch": 0.6432967621086433,
"grad_norm": 3.6890709400177,
"learning_rate": 9.831949919051116e-06,
"loss": 1.1894,
"step": 2404
},
{
"epoch": 0.6435643564356436,
"grad_norm": 3.348698139190674,
"learning_rate": 9.831722338494826e-06,
"loss": 1.2294,
"step": 2405
},
{
"epoch": 0.6438319507626439,
"grad_norm": 3.424172878265381,
"learning_rate": 9.831494606580669e-06,
"loss": 1.0647,
"step": 2406
},
{
"epoch": 0.6440995450896441,
"grad_norm": 3.4821624755859375,
"learning_rate": 9.83126672331578e-06,
"loss": 1.124,
"step": 2407
},
{
"epoch": 0.6443671394166444,
"grad_norm": 3.271749973297119,
"learning_rate": 9.831038688707296e-06,
"loss": 1.0989,
"step": 2408
},
{
"epoch": 0.6446347337436447,
"grad_norm": 3.6748054027557373,
"learning_rate": 9.83081050276236e-06,
"loss": 1.1704,
"step": 2409
},
{
"epoch": 0.6449023280706448,
"grad_norm": 3.3000192642211914,
"learning_rate": 9.830582165488123e-06,
"loss": 1.1656,
"step": 2410
},
{
"epoch": 0.6451699223976451,
"grad_norm": 4.096604824066162,
"learning_rate": 9.830353676891736e-06,
"loss": 1.1799,
"step": 2411
},
{
"epoch": 0.6454375167246454,
"grad_norm": 3.337603807449341,
"learning_rate": 9.830125036980353e-06,
"loss": 1.1693,
"step": 2412
},
{
"epoch": 0.6457051110516457,
"grad_norm": 3.6105048656463623,
"learning_rate": 9.829896245761144e-06,
"loss": 1.2169,
"step": 2413
},
{
"epoch": 0.6459727053786459,
"grad_norm": 3.29010272026062,
"learning_rate": 9.829667303241271e-06,
"loss": 1.0089,
"step": 2414
},
{
"epoch": 0.6462402997056462,
"grad_norm": 3.5054385662078857,
"learning_rate": 9.829438209427907e-06,
"loss": 1.105,
"step": 2415
},
{
"epoch": 0.6465078940326465,
"grad_norm": 3.4805397987365723,
"learning_rate": 9.829208964328228e-06,
"loss": 1.0914,
"step": 2416
},
{
"epoch": 0.6467754883596468,
"grad_norm": 3.1424105167388916,
"learning_rate": 9.828979567949416e-06,
"loss": 1.0573,
"step": 2417
},
{
"epoch": 0.647043082686647,
"grad_norm": 4.121860980987549,
"learning_rate": 9.828750020298656e-06,
"loss": 1.1732,
"step": 2418
},
{
"epoch": 0.6473106770136473,
"grad_norm": 3.2964742183685303,
"learning_rate": 9.828520321383142e-06,
"loss": 1.1536,
"step": 2419
},
{
"epoch": 0.6475782713406476,
"grad_norm": 3.4967031478881836,
"learning_rate": 9.828290471210064e-06,
"loss": 1.1049,
"step": 2420
},
{
"epoch": 0.6478458656676478,
"grad_norm": 3.3950541019439697,
"learning_rate": 9.828060469786626e-06,
"loss": 1.151,
"step": 2421
},
{
"epoch": 0.6481134599946481,
"grad_norm": 3.585238218307495,
"learning_rate": 9.827830317120033e-06,
"loss": 1.1172,
"step": 2422
},
{
"epoch": 0.6483810543216484,
"grad_norm": 2.9747002124786377,
"learning_rate": 9.827600013217496e-06,
"loss": 0.9499,
"step": 2423
},
{
"epoch": 0.6486486486486487,
"grad_norm": 3.2427027225494385,
"learning_rate": 9.827369558086225e-06,
"loss": 1.0767,
"step": 2424
},
{
"epoch": 0.6489162429756489,
"grad_norm": 3.419710874557495,
"learning_rate": 9.827138951733441e-06,
"loss": 1.1198,
"step": 2425
},
{
"epoch": 0.6491838373026492,
"grad_norm": 3.5654327869415283,
"learning_rate": 9.82690819416637e-06,
"loss": 1.1684,
"step": 2426
},
{
"epoch": 0.6494514316296495,
"grad_norm": 3.414553642272949,
"learning_rate": 9.826677285392238e-06,
"loss": 1.1018,
"step": 2427
},
{
"epoch": 0.6497190259566498,
"grad_norm": 3.366098642349243,
"learning_rate": 9.826446225418282e-06,
"loss": 1.0191,
"step": 2428
},
{
"epoch": 0.64998662028365,
"grad_norm": 3.913783311843872,
"learning_rate": 9.826215014251738e-06,
"loss": 1.235,
"step": 2429
},
{
"epoch": 0.6502542146106502,
"grad_norm": 4.037808418273926,
"learning_rate": 9.825983651899847e-06,
"loss": 1.2542,
"step": 2430
},
{
"epoch": 0.6505218089376505,
"grad_norm": 2.9834325313568115,
"learning_rate": 9.82575213836986e-06,
"loss": 0.967,
"step": 2431
},
{
"epoch": 0.6507894032646507,
"grad_norm": 3.3896093368530273,
"learning_rate": 9.825520473669026e-06,
"loss": 1.1163,
"step": 2432
},
{
"epoch": 0.651056997591651,
"grad_norm": 3.965498685836792,
"learning_rate": 9.825288657804606e-06,
"loss": 1.2024,
"step": 2433
},
{
"epoch": 0.6513245919186513,
"grad_norm": 3.836982011795044,
"learning_rate": 9.825056690783859e-06,
"loss": 1.2839,
"step": 2434
},
{
"epoch": 0.6515921862456516,
"grad_norm": 4.303612232208252,
"learning_rate": 9.82482457261405e-06,
"loss": 1.1528,
"step": 2435
},
{
"epoch": 0.6518597805726518,
"grad_norm": 3.613075017929077,
"learning_rate": 9.824592303302455e-06,
"loss": 1.1773,
"step": 2436
},
{
"epoch": 0.6521273748996521,
"grad_norm": 3.2512998580932617,
"learning_rate": 9.824359882856347e-06,
"loss": 1.0795,
"step": 2437
},
{
"epoch": 0.6523949692266524,
"grad_norm": 3.6601617336273193,
"learning_rate": 9.824127311283007e-06,
"loss": 1.1032,
"step": 2438
},
{
"epoch": 0.6526625635536527,
"grad_norm": 3.5576727390289307,
"learning_rate": 9.823894588589722e-06,
"loss": 1.1383,
"step": 2439
},
{
"epoch": 0.6529301578806529,
"grad_norm": 3.50748610496521,
"learning_rate": 9.823661714783781e-06,
"loss": 1.2066,
"step": 2440
},
{
"epoch": 0.6531977522076532,
"grad_norm": 3.7736473083496094,
"learning_rate": 9.823428689872479e-06,
"loss": 1.2547,
"step": 2441
},
{
"epoch": 0.6534653465346535,
"grad_norm": 3.476040840148926,
"learning_rate": 9.823195513863114e-06,
"loss": 1.1075,
"step": 2442
},
{
"epoch": 0.6537329408616537,
"grad_norm": 3.444315195083618,
"learning_rate": 9.822962186762994e-06,
"loss": 1.1135,
"step": 2443
},
{
"epoch": 0.654000535188654,
"grad_norm": 3.9835290908813477,
"learning_rate": 9.822728708579425e-06,
"loss": 1.1706,
"step": 2444
},
{
"epoch": 0.6542681295156543,
"grad_norm": 3.669281482696533,
"learning_rate": 9.822495079319725e-06,
"loss": 1.1828,
"step": 2445
},
{
"epoch": 0.6545357238426546,
"grad_norm": 3.555455446243286,
"learning_rate": 9.822261298991208e-06,
"loss": 1.1348,
"step": 2446
},
{
"epoch": 0.6548033181696548,
"grad_norm": 3.5849578380584717,
"learning_rate": 9.822027367601199e-06,
"loss": 1.1241,
"step": 2447
},
{
"epoch": 0.6550709124966551,
"grad_norm": 3.71714186668396,
"learning_rate": 9.821793285157027e-06,
"loss": 1.255,
"step": 2448
},
{
"epoch": 0.6553385068236554,
"grad_norm": 3.6075050830841064,
"learning_rate": 9.821559051666025e-06,
"loss": 1.1514,
"step": 2449
},
{
"epoch": 0.6556061011506557,
"grad_norm": 3.3877387046813965,
"learning_rate": 9.82132466713553e-06,
"loss": 1.1232,
"step": 2450
},
{
"epoch": 0.6558736954776558,
"grad_norm": 3.499657154083252,
"learning_rate": 9.821090131572883e-06,
"loss": 1.1694,
"step": 2451
},
{
"epoch": 0.6561412898046561,
"grad_norm": 3.8426098823547363,
"learning_rate": 9.820855444985433e-06,
"loss": 1.2109,
"step": 2452
},
{
"epoch": 0.6564088841316564,
"grad_norm": 3.5373287200927734,
"learning_rate": 9.82062060738053e-06,
"loss": 1.0852,
"step": 2453
},
{
"epoch": 0.6566764784586567,
"grad_norm": 3.0332095623016357,
"learning_rate": 9.820385618765532e-06,
"loss": 1.0035,
"step": 2454
},
{
"epoch": 0.6569440727856569,
"grad_norm": 3.5709455013275146,
"learning_rate": 9.8201504791478e-06,
"loss": 1.1012,
"step": 2455
},
{
"epoch": 0.6572116671126572,
"grad_norm": 3.4466726779937744,
"learning_rate": 9.819915188534699e-06,
"loss": 1.1192,
"step": 2456
},
{
"epoch": 0.6574792614396575,
"grad_norm": 3.5553793907165527,
"learning_rate": 9.8196797469336e-06,
"loss": 1.2776,
"step": 2457
},
{
"epoch": 0.6577468557666577,
"grad_norm": 3.3105359077453613,
"learning_rate": 9.81944415435188e-06,
"loss": 1.1082,
"step": 2458
},
{
"epoch": 0.658014450093658,
"grad_norm": 3.7504870891571045,
"learning_rate": 9.819208410796916e-06,
"loss": 1.0523,
"step": 2459
},
{
"epoch": 0.6582820444206583,
"grad_norm": 3.4031195640563965,
"learning_rate": 9.818972516276096e-06,
"loss": 1.1827,
"step": 2460
},
{
"epoch": 0.6585496387476586,
"grad_norm": 3.87593412399292,
"learning_rate": 9.818736470796807e-06,
"loss": 1.1583,
"step": 2461
},
{
"epoch": 0.6588172330746588,
"grad_norm": 3.425092935562134,
"learning_rate": 9.818500274366448e-06,
"loss": 1.0955,
"step": 2462
},
{
"epoch": 0.6590848274016591,
"grad_norm": 3.820794105529785,
"learning_rate": 9.818263926992411e-06,
"loss": 1.2023,
"step": 2463
},
{
"epoch": 0.6593524217286594,
"grad_norm": 3.5655276775360107,
"learning_rate": 9.818027428682104e-06,
"loss": 1.1085,
"step": 2464
},
{
"epoch": 0.6596200160556597,
"grad_norm": 3.5070512294769287,
"learning_rate": 9.817790779442937e-06,
"loss": 1.3138,
"step": 2465
},
{
"epoch": 0.6598876103826599,
"grad_norm": 4.04046106338501,
"learning_rate": 9.81755397928232e-06,
"loss": 1.1665,
"step": 2466
},
{
"epoch": 0.6601552047096602,
"grad_norm": 3.2360928058624268,
"learning_rate": 9.81731702820767e-06,
"loss": 1.0778,
"step": 2467
},
{
"epoch": 0.6604227990366605,
"grad_norm": 3.552029848098755,
"learning_rate": 9.817079926226417e-06,
"loss": 1.181,
"step": 2468
},
{
"epoch": 0.6606903933636606,
"grad_norm": 3.4324593544006348,
"learning_rate": 9.816842673345979e-06,
"loss": 1.153,
"step": 2469
},
{
"epoch": 0.6609579876906609,
"grad_norm": 3.6090657711029053,
"learning_rate": 9.816605269573794e-06,
"loss": 1.0663,
"step": 2470
},
{
"epoch": 0.6612255820176612,
"grad_norm": 4.007713794708252,
"learning_rate": 9.816367714917296e-06,
"loss": 1.2343,
"step": 2471
},
{
"epoch": 0.6614931763446615,
"grad_norm": 3.3371682167053223,
"learning_rate": 9.81613000938393e-06,
"loss": 1.1184,
"step": 2472
},
{
"epoch": 0.6617607706716617,
"grad_norm": 3.256664752960205,
"learning_rate": 9.815892152981138e-06,
"loss": 0.9794,
"step": 2473
},
{
"epoch": 0.662028364998662,
"grad_norm": 3.7881510257720947,
"learning_rate": 9.815654145716376e-06,
"loss": 1.1446,
"step": 2474
},
{
"epoch": 0.6622959593256623,
"grad_norm": 3.406993865966797,
"learning_rate": 9.815415987597096e-06,
"loss": 1.2445,
"step": 2475
},
{
"epoch": 0.6625635536526626,
"grad_norm": 3.7865562438964844,
"learning_rate": 9.81517767863076e-06,
"loss": 1.3335,
"step": 2476
},
{
"epoch": 0.6628311479796628,
"grad_norm": 3.755580425262451,
"learning_rate": 9.814939218824831e-06,
"loss": 1.1506,
"step": 2477
},
{
"epoch": 0.6630987423066631,
"grad_norm": 3.753258466720581,
"learning_rate": 9.814700608186783e-06,
"loss": 1.1372,
"step": 2478
},
{
"epoch": 0.6633663366336634,
"grad_norm": 3.5832202434539795,
"learning_rate": 9.814461846724087e-06,
"loss": 1.158,
"step": 2479
},
{
"epoch": 0.6636339309606636,
"grad_norm": 3.4956367015838623,
"learning_rate": 9.814222934444223e-06,
"loss": 1.1532,
"step": 2480
},
{
"epoch": 0.6639015252876639,
"grad_norm": 3.7934727668762207,
"learning_rate": 9.81398387135468e-06,
"loss": 1.1813,
"step": 2481
},
{
"epoch": 0.6641691196146642,
"grad_norm": 3.5121653079986572,
"learning_rate": 9.813744657462941e-06,
"loss": 1.2199,
"step": 2482
},
{
"epoch": 0.6644367139416645,
"grad_norm": 3.3426973819732666,
"learning_rate": 9.8135052927765e-06,
"loss": 1.18,
"step": 2483
},
{
"epoch": 0.6647043082686647,
"grad_norm": 3.5111615657806396,
"learning_rate": 9.813265777302858e-06,
"loss": 1.1257,
"step": 2484
},
{
"epoch": 0.664971902595665,
"grad_norm": 3.1432745456695557,
"learning_rate": 9.813026111049514e-06,
"loss": 1.0037,
"step": 2485
},
{
"epoch": 0.6652394969226653,
"grad_norm": 3.3801767826080322,
"learning_rate": 9.812786294023983e-06,
"loss": 1.1871,
"step": 2486
},
{
"epoch": 0.6655070912496656,
"grad_norm": 3.4595744609832764,
"learning_rate": 9.812546326233771e-06,
"loss": 1.1732,
"step": 2487
},
{
"epoch": 0.6657746855766657,
"grad_norm": 3.835479736328125,
"learning_rate": 9.812306207686398e-06,
"loss": 1.1428,
"step": 2488
},
{
"epoch": 0.666042279903666,
"grad_norm": 3.608619213104248,
"learning_rate": 9.812065938389384e-06,
"loss": 1.2981,
"step": 2489
},
{
"epoch": 0.6663098742306663,
"grad_norm": 3.5188703536987305,
"learning_rate": 9.811825518350257e-06,
"loss": 1.2452,
"step": 2490
},
{
"epoch": 0.6665774685576665,
"grad_norm": 3.481654644012451,
"learning_rate": 9.81158494757655e-06,
"loss": 1.2473,
"step": 2491
},
{
"epoch": 0.6668450628846668,
"grad_norm": 3.2645812034606934,
"learning_rate": 9.811344226075795e-06,
"loss": 1.0821,
"step": 2492
},
{
"epoch": 0.6671126572116671,
"grad_norm": 3.3354525566101074,
"learning_rate": 9.811103353855535e-06,
"loss": 1.08,
"step": 2493
},
{
"epoch": 0.6673802515386674,
"grad_norm": 3.778996467590332,
"learning_rate": 9.810862330923317e-06,
"loss": 1.3063,
"step": 2494
},
{
"epoch": 0.6676478458656676,
"grad_norm": 3.2988641262054443,
"learning_rate": 9.810621157286688e-06,
"loss": 1.0607,
"step": 2495
},
{
"epoch": 0.6679154401926679,
"grad_norm": 3.771205425262451,
"learning_rate": 9.810379832953207e-06,
"loss": 1.1692,
"step": 2496
},
{
"epoch": 0.6681830345196682,
"grad_norm": 3.594296455383301,
"learning_rate": 9.81013835793043e-06,
"loss": 1.2804,
"step": 2497
},
{
"epoch": 0.6684506288466685,
"grad_norm": 3.33087420463562,
"learning_rate": 9.809896732225923e-06,
"loss": 1.0088,
"step": 2498
},
{
"epoch": 0.6687182231736687,
"grad_norm": 3.6876564025878906,
"learning_rate": 9.809654955847256e-06,
"loss": 1.1182,
"step": 2499
},
{
"epoch": 0.668985817500669,
"grad_norm": 3.4345877170562744,
"learning_rate": 9.809413028802002e-06,
"loss": 1.1175,
"step": 2500
},
{
"epoch": 0.668985817500669,
"eval_loss": 1.1746242046356201,
"eval_runtime": 11.5946,
"eval_samples_per_second": 34.499,
"eval_steps_per_second": 4.312,
"step": 2500
},
{
"epoch": 0.6692534118276693,
"grad_norm": 3.555928945541382,
"learning_rate": 9.809170951097739e-06,
"loss": 1.1236,
"step": 2501
},
{
"epoch": 0.6695210061546695,
"grad_norm": 3.356553316116333,
"learning_rate": 9.80892872274205e-06,
"loss": 1.0821,
"step": 2502
},
{
"epoch": 0.6697886004816698,
"grad_norm": 3.5524895191192627,
"learning_rate": 9.808686343742524e-06,
"loss": 1.3042,
"step": 2503
},
{
"epoch": 0.6700561948086701,
"grad_norm": 3.2740256786346436,
"learning_rate": 9.808443814106754e-06,
"loss": 1.091,
"step": 2504
},
{
"epoch": 0.6703237891356704,
"grad_norm": 3.2746195793151855,
"learning_rate": 9.808201133842337e-06,
"loss": 1.0107,
"step": 2505
},
{
"epoch": 0.6705913834626706,
"grad_norm": 3.7611098289489746,
"learning_rate": 9.807958302956875e-06,
"loss": 1.1164,
"step": 2506
},
{
"epoch": 0.6708589777896709,
"grad_norm": 3.5693981647491455,
"learning_rate": 9.807715321457976e-06,
"loss": 1.1661,
"step": 2507
},
{
"epoch": 0.6711265721166711,
"grad_norm": 3.7224698066711426,
"learning_rate": 9.807472189353249e-06,
"loss": 1.3212,
"step": 2508
},
{
"epoch": 0.6713941664436714,
"grad_norm": 3.767155408859253,
"learning_rate": 9.807228906650312e-06,
"loss": 1.294,
"step": 2509
},
{
"epoch": 0.6716617607706716,
"grad_norm": 4.016858100891113,
"learning_rate": 9.806985473356787e-06,
"loss": 1.1964,
"step": 2510
},
{
"epoch": 0.6719293550976719,
"grad_norm": 3.684230089187622,
"learning_rate": 9.806741889480298e-06,
"loss": 1.1301,
"step": 2511
},
{
"epoch": 0.6721969494246722,
"grad_norm": 3.254202365875244,
"learning_rate": 9.806498155028477e-06,
"loss": 1.0444,
"step": 2512
},
{
"epoch": 0.6724645437516724,
"grad_norm": 3.6285407543182373,
"learning_rate": 9.806254270008959e-06,
"loss": 1.1154,
"step": 2513
},
{
"epoch": 0.6727321380786727,
"grad_norm": 3.4263675212860107,
"learning_rate": 9.806010234429382e-06,
"loss": 1.1158,
"step": 2514
},
{
"epoch": 0.672999732405673,
"grad_norm": 3.619586229324341,
"learning_rate": 9.805766048297392e-06,
"loss": 1.0731,
"step": 2515
},
{
"epoch": 0.6732673267326733,
"grad_norm": 3.5182855129241943,
"learning_rate": 9.80552171162064e-06,
"loss": 1.1724,
"step": 2516
},
{
"epoch": 0.6735349210596735,
"grad_norm": 3.505631446838379,
"learning_rate": 9.805277224406776e-06,
"loss": 1.1398,
"step": 2517
},
{
"epoch": 0.6738025153866738,
"grad_norm": 3.540221929550171,
"learning_rate": 9.805032586663462e-06,
"loss": 1.1665,
"step": 2518
},
{
"epoch": 0.6740701097136741,
"grad_norm": 3.7385308742523193,
"learning_rate": 9.804787798398361e-06,
"loss": 1.1794,
"step": 2519
},
{
"epoch": 0.6743377040406744,
"grad_norm": 3.9296083450317383,
"learning_rate": 9.80454285961914e-06,
"loss": 1.2484,
"step": 2520
},
{
"epoch": 0.6746052983676746,
"grad_norm": 3.585625171661377,
"learning_rate": 9.804297770333472e-06,
"loss": 1.2348,
"step": 2521
},
{
"epoch": 0.6748728926946749,
"grad_norm": 3.768056869506836,
"learning_rate": 9.804052530549038e-06,
"loss": 1.0857,
"step": 2522
},
{
"epoch": 0.6751404870216752,
"grad_norm": 3.6394028663635254,
"learning_rate": 9.803807140273516e-06,
"loss": 1.1641,
"step": 2523
},
{
"epoch": 0.6754080813486754,
"grad_norm": 3.505856990814209,
"learning_rate": 9.803561599514594e-06,
"loss": 1.0889,
"step": 2524
},
{
"epoch": 0.6756756756756757,
"grad_norm": 3.4862112998962402,
"learning_rate": 9.803315908279966e-06,
"loss": 1.1436,
"step": 2525
},
{
"epoch": 0.675943270002676,
"grad_norm": 3.91096568107605,
"learning_rate": 9.803070066577327e-06,
"loss": 1.1813,
"step": 2526
},
{
"epoch": 0.6762108643296763,
"grad_norm": 3.642303228378296,
"learning_rate": 9.802824074414378e-06,
"loss": 1.1385,
"step": 2527
},
{
"epoch": 0.6764784586566764,
"grad_norm": 3.8517065048217773,
"learning_rate": 9.802577931798826e-06,
"loss": 1.1738,
"step": 2528
},
{
"epoch": 0.6767460529836767,
"grad_norm": 3.9151949882507324,
"learning_rate": 9.80233163873838e-06,
"loss": 1.3214,
"step": 2529
},
{
"epoch": 0.677013647310677,
"grad_norm": 3.326645612716675,
"learning_rate": 9.802085195240755e-06,
"loss": 1.1715,
"step": 2530
},
{
"epoch": 0.6772812416376773,
"grad_norm": 3.2645928859710693,
"learning_rate": 9.801838601313674e-06,
"loss": 1.0983,
"step": 2531
},
{
"epoch": 0.6775488359646775,
"grad_norm": 3.471367835998535,
"learning_rate": 9.801591856964859e-06,
"loss": 1.0831,
"step": 2532
},
{
"epoch": 0.6778164302916778,
"grad_norm": 3.4886016845703125,
"learning_rate": 9.80134496220204e-06,
"loss": 1.1807,
"step": 2533
},
{
"epoch": 0.6780840246186781,
"grad_norm": 3.3703372478485107,
"learning_rate": 9.801097917032951e-06,
"loss": 1.1011,
"step": 2534
},
{
"epoch": 0.6783516189456783,
"grad_norm": 3.6529722213745117,
"learning_rate": 9.800850721465334e-06,
"loss": 1.2387,
"step": 2535
},
{
"epoch": 0.6786192132726786,
"grad_norm": 3.183479070663452,
"learning_rate": 9.800603375506928e-06,
"loss": 1.0238,
"step": 2536
},
{
"epoch": 0.6788868075996789,
"grad_norm": 3.3142735958099365,
"learning_rate": 9.800355879165485e-06,
"loss": 1.0489,
"step": 2537
},
{
"epoch": 0.6791544019266792,
"grad_norm": 3.7475812435150146,
"learning_rate": 9.800108232448754e-06,
"loss": 1.2292,
"step": 2538
},
{
"epoch": 0.6794219962536794,
"grad_norm": 3.2578468322753906,
"learning_rate": 9.7998604353645e-06,
"loss": 1.1332,
"step": 2539
},
{
"epoch": 0.6796895905806797,
"grad_norm": 3.501826286315918,
"learning_rate": 9.799612487920476e-06,
"loss": 1.1691,
"step": 2540
},
{
"epoch": 0.67995718490768,
"grad_norm": 3.8417768478393555,
"learning_rate": 9.799364390124456e-06,
"loss": 1.1488,
"step": 2541
},
{
"epoch": 0.6802247792346803,
"grad_norm": 4.020801544189453,
"learning_rate": 9.799116141984209e-06,
"loss": 1.2232,
"step": 2542
},
{
"epoch": 0.6804923735616805,
"grad_norm": 3.748538017272949,
"learning_rate": 9.798867743507512e-06,
"loss": 1.22,
"step": 2543
},
{
"epoch": 0.6807599678886808,
"grad_norm": 3.597007989883423,
"learning_rate": 9.798619194702148e-06,
"loss": 1.1873,
"step": 2544
},
{
"epoch": 0.6810275622156811,
"grad_norm": 3.8766472339630127,
"learning_rate": 9.798370495575901e-06,
"loss": 1.3015,
"step": 2545
},
{
"epoch": 0.6812951565426812,
"grad_norm": 3.568079948425293,
"learning_rate": 9.798121646136562e-06,
"loss": 1.1225,
"step": 2546
},
{
"epoch": 0.6815627508696815,
"grad_norm": 3.2755115032196045,
"learning_rate": 9.797872646391926e-06,
"loss": 1.0251,
"step": 2547
},
{
"epoch": 0.6818303451966818,
"grad_norm": 3.6031720638275146,
"learning_rate": 9.797623496349795e-06,
"loss": 1.0804,
"step": 2548
},
{
"epoch": 0.6820979395236821,
"grad_norm": 3.285602331161499,
"learning_rate": 9.797374196017974e-06,
"loss": 1.0666,
"step": 2549
},
{
"epoch": 0.6823655338506823,
"grad_norm": 3.9866554737091064,
"learning_rate": 9.79712474540427e-06,
"loss": 1.2911,
"step": 2550
},
{
"epoch": 0.6826331281776826,
"grad_norm": 3.1174442768096924,
"learning_rate": 9.796875144516498e-06,
"loss": 1.0572,
"step": 2551
},
{
"epoch": 0.6829007225046829,
"grad_norm": 3.3973238468170166,
"learning_rate": 9.796625393362477e-06,
"loss": 1.0371,
"step": 2552
},
{
"epoch": 0.6831683168316832,
"grad_norm": 3.1947076320648193,
"learning_rate": 9.796375491950034e-06,
"loss": 1.1874,
"step": 2553
},
{
"epoch": 0.6834359111586834,
"grad_norm": 3.0682270526885986,
"learning_rate": 9.796125440286992e-06,
"loss": 1.1072,
"step": 2554
},
{
"epoch": 0.6837035054856837,
"grad_norm": 3.7304959297180176,
"learning_rate": 9.795875238381188e-06,
"loss": 1.1563,
"step": 2555
},
{
"epoch": 0.683971099812684,
"grad_norm": 3.118598461151123,
"learning_rate": 9.795624886240458e-06,
"loss": 1.133,
"step": 2556
},
{
"epoch": 0.6842386941396842,
"grad_norm": 4.15332555770874,
"learning_rate": 9.795374383872645e-06,
"loss": 1.2752,
"step": 2557
},
{
"epoch": 0.6845062884666845,
"grad_norm": 3.629516124725342,
"learning_rate": 9.795123731285595e-06,
"loss": 1.2345,
"step": 2558
},
{
"epoch": 0.6847738827936848,
"grad_norm": 3.6850171089172363,
"learning_rate": 9.794872928487163e-06,
"loss": 1.2808,
"step": 2559
},
{
"epoch": 0.6850414771206851,
"grad_norm": 3.699629306793213,
"learning_rate": 9.7946219754852e-06,
"loss": 1.3603,
"step": 2560
},
{
"epoch": 0.6853090714476853,
"grad_norm": 3.9391591548919678,
"learning_rate": 9.794370872287575e-06,
"loss": 1.2984,
"step": 2561
},
{
"epoch": 0.6855766657746856,
"grad_norm": 3.434231758117676,
"learning_rate": 9.79411961890215e-06,
"loss": 1.2203,
"step": 2562
},
{
"epoch": 0.6858442601016859,
"grad_norm": 3.2698097229003906,
"learning_rate": 9.793868215336792e-06,
"loss": 1.1053,
"step": 2563
},
{
"epoch": 0.6861118544286862,
"grad_norm": 3.335155725479126,
"learning_rate": 9.793616661599384e-06,
"loss": 1.2078,
"step": 2564
},
{
"epoch": 0.6863794487556863,
"grad_norm": 3.870070695877075,
"learning_rate": 9.7933649576978e-06,
"loss": 1.2166,
"step": 2565
},
{
"epoch": 0.6866470430826866,
"grad_norm": 3.4541990756988525,
"learning_rate": 9.79311310363993e-06,
"loss": 1.1562,
"step": 2566
},
{
"epoch": 0.686914637409687,
"grad_norm": 3.1093223094940186,
"learning_rate": 9.792861099433657e-06,
"loss": 1.0556,
"step": 2567
},
{
"epoch": 0.6871822317366871,
"grad_norm": 3.6408331394195557,
"learning_rate": 9.79260894508688e-06,
"loss": 1.1649,
"step": 2568
},
{
"epoch": 0.6874498260636874,
"grad_norm": 3.420346260070801,
"learning_rate": 9.792356640607497e-06,
"loss": 1.0884,
"step": 2569
},
{
"epoch": 0.6877174203906877,
"grad_norm": 3.3369221687316895,
"learning_rate": 9.792104186003412e-06,
"loss": 1.1023,
"step": 2570
},
{
"epoch": 0.687985014717688,
"grad_norm": 3.251084089279175,
"learning_rate": 9.791851581282533e-06,
"loss": 1.0486,
"step": 2571
},
{
"epoch": 0.6882526090446882,
"grad_norm": 3.6394076347351074,
"learning_rate": 9.791598826452773e-06,
"loss": 1.0097,
"step": 2572
},
{
"epoch": 0.6885202033716885,
"grad_norm": 3.8325955867767334,
"learning_rate": 9.79134592152205e-06,
"loss": 1.2028,
"step": 2573
},
{
"epoch": 0.6887877976986888,
"grad_norm": 3.359297037124634,
"learning_rate": 9.791092866498286e-06,
"loss": 1.0754,
"step": 2574
},
{
"epoch": 0.6890553920256891,
"grad_norm": 3.287555694580078,
"learning_rate": 9.790839661389408e-06,
"loss": 1.0958,
"step": 2575
},
{
"epoch": 0.6893229863526893,
"grad_norm": 3.197094202041626,
"learning_rate": 9.790586306203348e-06,
"loss": 1.0084,
"step": 2576
},
{
"epoch": 0.6895905806796896,
"grad_norm": 3.1888086795806885,
"learning_rate": 9.790332800948044e-06,
"loss": 1.1168,
"step": 2577
},
{
"epoch": 0.6898581750066899,
"grad_norm": 3.7352941036224365,
"learning_rate": 9.790079145631434e-06,
"loss": 1.1924,
"step": 2578
},
{
"epoch": 0.6901257693336901,
"grad_norm": 3.5531890392303467,
"learning_rate": 9.789825340261467e-06,
"loss": 1.0547,
"step": 2579
},
{
"epoch": 0.6903933636606904,
"grad_norm": 3.3544304370880127,
"learning_rate": 9.789571384846093e-06,
"loss": 1.0319,
"step": 2580
},
{
"epoch": 0.6906609579876907,
"grad_norm": 3.5817840099334717,
"learning_rate": 9.789317279393267e-06,
"loss": 1.2264,
"step": 2581
},
{
"epoch": 0.690928552314691,
"grad_norm": 3.4651858806610107,
"learning_rate": 9.78906302391095e-06,
"loss": 1.0902,
"step": 2582
},
{
"epoch": 0.6911961466416912,
"grad_norm": 3.6912760734558105,
"learning_rate": 9.788808618407103e-06,
"loss": 1.2353,
"step": 2583
},
{
"epoch": 0.6914637409686915,
"grad_norm": 3.6648828983306885,
"learning_rate": 9.788554062889702e-06,
"loss": 1.2044,
"step": 2584
},
{
"epoch": 0.6917313352956918,
"grad_norm": 3.2582671642303467,
"learning_rate": 9.788299357366717e-06,
"loss": 1.0388,
"step": 2585
},
{
"epoch": 0.691998929622692,
"grad_norm": 3.759870767593384,
"learning_rate": 9.788044501846125e-06,
"loss": 1.2292,
"step": 2586
},
{
"epoch": 0.6922665239496922,
"grad_norm": 3.5340397357940674,
"learning_rate": 9.787789496335913e-06,
"loss": 1.2696,
"step": 2587
},
{
"epoch": 0.6925341182766925,
"grad_norm": 3.402407169342041,
"learning_rate": 9.78753434084407e-06,
"loss": 1.2269,
"step": 2588
},
{
"epoch": 0.6928017126036928,
"grad_norm": 3.7191381454467773,
"learning_rate": 9.787279035378585e-06,
"loss": 1.1591,
"step": 2589
},
{
"epoch": 0.693069306930693,
"grad_norm": 3.3745412826538086,
"learning_rate": 9.78702357994746e-06,
"loss": 1.1188,
"step": 2590
},
{
"epoch": 0.6933369012576933,
"grad_norm": 3.5345706939697266,
"learning_rate": 9.786767974558693e-06,
"loss": 1.2377,
"step": 2591
},
{
"epoch": 0.6936044955846936,
"grad_norm": 3.54662823677063,
"learning_rate": 9.786512219220294e-06,
"loss": 1.2069,
"step": 2592
},
{
"epoch": 0.6938720899116939,
"grad_norm": 3.509596109390259,
"learning_rate": 9.786256313940276e-06,
"loss": 1.2492,
"step": 2593
},
{
"epoch": 0.6941396842386941,
"grad_norm": 3.594794273376465,
"learning_rate": 9.786000258726652e-06,
"loss": 1.1751,
"step": 2594
},
{
"epoch": 0.6944072785656944,
"grad_norm": 3.4790191650390625,
"learning_rate": 9.785744053587445e-06,
"loss": 1.2485,
"step": 2595
},
{
"epoch": 0.6946748728926947,
"grad_norm": 3.57783842086792,
"learning_rate": 9.78548769853068e-06,
"loss": 1.1048,
"step": 2596
},
{
"epoch": 0.694942467219695,
"grad_norm": 3.960777521133423,
"learning_rate": 9.785231193564388e-06,
"loss": 1.2018,
"step": 2597
},
{
"epoch": 0.6952100615466952,
"grad_norm": 3.92084002494812,
"learning_rate": 9.784974538696606e-06,
"loss": 1.2637,
"step": 2598
},
{
"epoch": 0.6954776558736955,
"grad_norm": 3.7744603157043457,
"learning_rate": 9.78471773393537e-06,
"loss": 1.2868,
"step": 2599
},
{
"epoch": 0.6957452502006958,
"grad_norm": 3.3519065380096436,
"learning_rate": 9.784460779288727e-06,
"loss": 1.0901,
"step": 2600
},
{
"epoch": 0.696012844527696,
"grad_norm": 3.282240390777588,
"learning_rate": 9.784203674764727e-06,
"loss": 1.1259,
"step": 2601
},
{
"epoch": 0.6962804388546963,
"grad_norm": 3.7202768325805664,
"learning_rate": 9.783946420371424e-06,
"loss": 1.1036,
"step": 2602
},
{
"epoch": 0.6965480331816966,
"grad_norm": 3.3979485034942627,
"learning_rate": 9.783689016116874e-06,
"loss": 1.1188,
"step": 2603
},
{
"epoch": 0.6968156275086969,
"grad_norm": 3.3660459518432617,
"learning_rate": 9.783431462009146e-06,
"loss": 1.0795,
"step": 2604
},
{
"epoch": 0.697083221835697,
"grad_norm": 3.240844964981079,
"learning_rate": 9.7831737580563e-06,
"loss": 1.0436,
"step": 2605
},
{
"epoch": 0.6973508161626973,
"grad_norm": 3.5097098350524902,
"learning_rate": 9.782915904266416e-06,
"loss": 1.1287,
"step": 2606
},
{
"epoch": 0.6976184104896976,
"grad_norm": 3.2211825847625732,
"learning_rate": 9.782657900647567e-06,
"loss": 1.0162,
"step": 2607
},
{
"epoch": 0.6978860048166979,
"grad_norm": 3.5163321495056152,
"learning_rate": 9.782399747207838e-06,
"loss": 1.258,
"step": 2608
},
{
"epoch": 0.6981535991436981,
"grad_norm": 3.4427928924560547,
"learning_rate": 9.782141443955316e-06,
"loss": 1.1632,
"step": 2609
},
{
"epoch": 0.6984211934706984,
"grad_norm": 3.6478707790374756,
"learning_rate": 9.78188299089809e-06,
"loss": 1.0287,
"step": 2610
},
{
"epoch": 0.6986887877976987,
"grad_norm": 3.5365660190582275,
"learning_rate": 9.781624388044257e-06,
"loss": 1.1929,
"step": 2611
},
{
"epoch": 0.6989563821246989,
"grad_norm": 3.965444803237915,
"learning_rate": 9.78136563540192e-06,
"loss": 1.3651,
"step": 2612
},
{
"epoch": 0.6992239764516992,
"grad_norm": 3.7215042114257812,
"learning_rate": 9.781106732979182e-06,
"loss": 1.1677,
"step": 2613
},
{
"epoch": 0.6994915707786995,
"grad_norm": 3.5624494552612305,
"learning_rate": 9.780847680784156e-06,
"loss": 1.0269,
"step": 2614
},
{
"epoch": 0.6997591651056998,
"grad_norm": 3.623762845993042,
"learning_rate": 9.780588478824953e-06,
"loss": 1.1772,
"step": 2615
},
{
"epoch": 0.7000267594327,
"grad_norm": 3.544771194458008,
"learning_rate": 9.780329127109697e-06,
"loss": 1.1919,
"step": 2616
},
{
"epoch": 0.7002943537597003,
"grad_norm": 3.900216817855835,
"learning_rate": 9.780069625646512e-06,
"loss": 1.2399,
"step": 2617
},
{
"epoch": 0.7005619480867006,
"grad_norm": 3.4038405418395996,
"learning_rate": 9.779809974443525e-06,
"loss": 1.143,
"step": 2618
},
{
"epoch": 0.7008295424137009,
"grad_norm": 3.4808125495910645,
"learning_rate": 9.77955017350887e-06,
"loss": 1.1755,
"step": 2619
},
{
"epoch": 0.7010971367407011,
"grad_norm": 3.106503963470459,
"learning_rate": 9.779290222850686e-06,
"loss": 1.0324,
"step": 2620
},
{
"epoch": 0.7013647310677014,
"grad_norm": 3.0059962272644043,
"learning_rate": 9.779030122477118e-06,
"loss": 1.0385,
"step": 2621
},
{
"epoch": 0.7016323253947017,
"grad_norm": 3.8533339500427246,
"learning_rate": 9.778769872396311e-06,
"loss": 1.2545,
"step": 2622
},
{
"epoch": 0.7018999197217018,
"grad_norm": 3.361427068710327,
"learning_rate": 9.77850947261642e-06,
"loss": 1.1001,
"step": 2623
},
{
"epoch": 0.7021675140487021,
"grad_norm": 3.411195755004883,
"learning_rate": 9.778248923145599e-06,
"loss": 0.9854,
"step": 2624
},
{
"epoch": 0.7024351083757024,
"grad_norm": 3.7567944526672363,
"learning_rate": 9.777988223992014e-06,
"loss": 1.2254,
"step": 2625
},
{
"epoch": 0.7027027027027027,
"grad_norm": 3.7029223442077637,
"learning_rate": 9.777727375163828e-06,
"loss": 1.2784,
"step": 2626
},
{
"epoch": 0.7029702970297029,
"grad_norm": 3.0535287857055664,
"learning_rate": 9.777466376669214e-06,
"loss": 0.9761,
"step": 2627
},
{
"epoch": 0.7032378913567032,
"grad_norm": 3.1361937522888184,
"learning_rate": 9.777205228516349e-06,
"loss": 1.0701,
"step": 2628
},
{
"epoch": 0.7035054856837035,
"grad_norm": 3.5227043628692627,
"learning_rate": 9.776943930713411e-06,
"loss": 1.2202,
"step": 2629
},
{
"epoch": 0.7037730800107038,
"grad_norm": 3.2027533054351807,
"learning_rate": 9.776682483268588e-06,
"loss": 1.1063,
"step": 2630
},
{
"epoch": 0.704040674337704,
"grad_norm": 3.622596025466919,
"learning_rate": 9.776420886190069e-06,
"loss": 1.1919,
"step": 2631
},
{
"epoch": 0.7043082686647043,
"grad_norm": 3.527977466583252,
"learning_rate": 9.776159139486048e-06,
"loss": 1.1076,
"step": 2632
},
{
"epoch": 0.7045758629917046,
"grad_norm": 2.9947965145111084,
"learning_rate": 9.775897243164727e-06,
"loss": 1.1083,
"step": 2633
},
{
"epoch": 0.7048434573187048,
"grad_norm": 3.344877004623413,
"learning_rate": 9.775635197234306e-06,
"loss": 1.1652,
"step": 2634
},
{
"epoch": 0.7051110516457051,
"grad_norm": 3.506344795227051,
"learning_rate": 9.775373001702998e-06,
"loss": 1.1122,
"step": 2635
},
{
"epoch": 0.7053786459727054,
"grad_norm": 3.5416011810302734,
"learning_rate": 9.775110656579015e-06,
"loss": 1.0505,
"step": 2636
},
{
"epoch": 0.7056462402997057,
"grad_norm": 3.234518527984619,
"learning_rate": 9.774848161870574e-06,
"loss": 0.999,
"step": 2637
},
{
"epoch": 0.7059138346267059,
"grad_norm": 4.471454620361328,
"learning_rate": 9.774585517585898e-06,
"loss": 1.146,
"step": 2638
},
{
"epoch": 0.7061814289537062,
"grad_norm": 3.475337505340576,
"learning_rate": 9.774322723733216e-06,
"loss": 0.9791,
"step": 2639
},
{
"epoch": 0.7064490232807065,
"grad_norm": 3.684784173965454,
"learning_rate": 9.774059780320759e-06,
"loss": 1.1905,
"step": 2640
},
{
"epoch": 0.7067166176077068,
"grad_norm": 3.747850179672241,
"learning_rate": 9.773796687356764e-06,
"loss": 1.2367,
"step": 2641
},
{
"epoch": 0.706984211934707,
"grad_norm": 3.962178945541382,
"learning_rate": 9.773533444849475e-06,
"loss": 1.1333,
"step": 2642
},
{
"epoch": 0.7072518062617072,
"grad_norm": 3.6332039833068848,
"learning_rate": 9.773270052807135e-06,
"loss": 1.1832,
"step": 2643
},
{
"epoch": 0.7075194005887075,
"grad_norm": 3.565274715423584,
"learning_rate": 9.773006511237997e-06,
"loss": 1.2,
"step": 2644
},
{
"epoch": 0.7077869949157077,
"grad_norm": 3.718888998031616,
"learning_rate": 9.772742820150316e-06,
"loss": 1.18,
"step": 2645
},
{
"epoch": 0.708054589242708,
"grad_norm": 3.387706756591797,
"learning_rate": 9.772478979552353e-06,
"loss": 1.0484,
"step": 2646
},
{
"epoch": 0.7083221835697083,
"grad_norm": 3.5784285068511963,
"learning_rate": 9.772214989452372e-06,
"loss": 1.1859,
"step": 2647
},
{
"epoch": 0.7085897778967086,
"grad_norm": 3.052260160446167,
"learning_rate": 9.771950849858641e-06,
"loss": 1.0571,
"step": 2648
},
{
"epoch": 0.7088573722237088,
"grad_norm": 3.3145363330841064,
"learning_rate": 9.771686560779438e-06,
"loss": 1.1643,
"step": 2649
},
{
"epoch": 0.7091249665507091,
"grad_norm": 3.7414932250976562,
"learning_rate": 9.771422122223042e-06,
"loss": 1.2321,
"step": 2650
},
{
"epoch": 0.7093925608777094,
"grad_norm": 3.2115883827209473,
"learning_rate": 9.771157534197733e-06,
"loss": 1.1001,
"step": 2651
},
{
"epoch": 0.7096601552047097,
"grad_norm": 3.6394455432891846,
"learning_rate": 9.770892796711804e-06,
"loss": 1.2063,
"step": 2652
},
{
"epoch": 0.7099277495317099,
"grad_norm": 3.6082866191864014,
"learning_rate": 9.770627909773545e-06,
"loss": 1.1383,
"step": 2653
},
{
"epoch": 0.7101953438587102,
"grad_norm": 4.1197919845581055,
"learning_rate": 9.770362873391256e-06,
"loss": 1.2361,
"step": 2654
},
{
"epoch": 0.7104629381857105,
"grad_norm": 3.101154327392578,
"learning_rate": 9.770097687573235e-06,
"loss": 1.0858,
"step": 2655
},
{
"epoch": 0.7107305325127107,
"grad_norm": 3.6352450847625732,
"learning_rate": 9.769832352327795e-06,
"loss": 1.0094,
"step": 2656
},
{
"epoch": 0.710998126839711,
"grad_norm": 3.195739507675171,
"learning_rate": 9.769566867663245e-06,
"loss": 1.0132,
"step": 2657
},
{
"epoch": 0.7112657211667113,
"grad_norm": 3.4742939472198486,
"learning_rate": 9.7693012335879e-06,
"loss": 1.235,
"step": 2658
},
{
"epoch": 0.7115333154937116,
"grad_norm": 3.451916217803955,
"learning_rate": 9.769035450110084e-06,
"loss": 1.1433,
"step": 2659
},
{
"epoch": 0.7118009098207118,
"grad_norm": 3.5261240005493164,
"learning_rate": 9.768769517238124e-06,
"loss": 1.1758,
"step": 2660
},
{
"epoch": 0.7120685041477121,
"grad_norm": 3.142664670944214,
"learning_rate": 9.768503434980348e-06,
"loss": 1.0356,
"step": 2661
},
{
"epoch": 0.7123360984747124,
"grad_norm": 3.1803033351898193,
"learning_rate": 9.76823720334509e-06,
"loss": 1.0501,
"step": 2662
},
{
"epoch": 0.7126036928017127,
"grad_norm": 3.5911192893981934,
"learning_rate": 9.767970822340692e-06,
"loss": 1.0931,
"step": 2663
},
{
"epoch": 0.7128712871287128,
"grad_norm": 3.599949598312378,
"learning_rate": 9.7677042919755e-06,
"loss": 1.1297,
"step": 2664
},
{
"epoch": 0.7131388814557131,
"grad_norm": 3.7325220108032227,
"learning_rate": 9.76743761225786e-06,
"loss": 1.1475,
"step": 2665
},
{
"epoch": 0.7134064757827134,
"grad_norm": 3.2687487602233887,
"learning_rate": 9.767170783196128e-06,
"loss": 1.108,
"step": 2666
},
{
"epoch": 0.7136740701097136,
"grad_norm": 3.567669630050659,
"learning_rate": 9.766903804798663e-06,
"loss": 1.1965,
"step": 2667
},
{
"epoch": 0.7139416644367139,
"grad_norm": 2.9746851921081543,
"learning_rate": 9.766636677073825e-06,
"loss": 0.9885,
"step": 2668
},
{
"epoch": 0.7142092587637142,
"grad_norm": 3.6333086490631104,
"learning_rate": 9.766369400029987e-06,
"loss": 1.1906,
"step": 2669
},
{
"epoch": 0.7144768530907145,
"grad_norm": 3.6311559677124023,
"learning_rate": 9.766101973675519e-06,
"loss": 1.1869,
"step": 2670
},
{
"epoch": 0.7147444474177147,
"grad_norm": 3.632929801940918,
"learning_rate": 9.765834398018797e-06,
"loss": 1.1423,
"step": 2671
},
{
"epoch": 0.715012041744715,
"grad_norm": 3.4885165691375732,
"learning_rate": 9.765566673068206e-06,
"loss": 1.1226,
"step": 2672
},
{
"epoch": 0.7152796360717153,
"grad_norm": 3.2653591632843018,
"learning_rate": 9.765298798832132e-06,
"loss": 1.0427,
"step": 2673
},
{
"epoch": 0.7155472303987156,
"grad_norm": 3.1086037158966064,
"learning_rate": 9.765030775318965e-06,
"loss": 1.0032,
"step": 2674
},
{
"epoch": 0.7158148247257158,
"grad_norm": 3.084402322769165,
"learning_rate": 9.764762602537102e-06,
"loss": 1.0019,
"step": 2675
},
{
"epoch": 0.7160824190527161,
"grad_norm": 3.50754714012146,
"learning_rate": 9.764494280494943e-06,
"loss": 1.0982,
"step": 2676
},
{
"epoch": 0.7163500133797164,
"grad_norm": 3.355750799179077,
"learning_rate": 9.764225809200894e-06,
"loss": 1.1512,
"step": 2677
},
{
"epoch": 0.7166176077067166,
"grad_norm": 3.4063217639923096,
"learning_rate": 9.763957188663366e-06,
"loss": 1.2209,
"step": 2678
},
{
"epoch": 0.7168852020337169,
"grad_norm": 3.5086374282836914,
"learning_rate": 9.76368841889077e-06,
"loss": 1.2162,
"step": 2679
},
{
"epoch": 0.7171527963607172,
"grad_norm": 3.286731719970703,
"learning_rate": 9.763419499891533e-06,
"loss": 1.176,
"step": 2680
},
{
"epoch": 0.7174203906877175,
"grad_norm": 3.622854232788086,
"learning_rate": 9.763150431674072e-06,
"loss": 1.1612,
"step": 2681
},
{
"epoch": 0.7176879850147176,
"grad_norm": 3.5535635948181152,
"learning_rate": 9.762881214246817e-06,
"loss": 1.1771,
"step": 2682
},
{
"epoch": 0.7179555793417179,
"grad_norm": 4.360621452331543,
"learning_rate": 9.762611847618203e-06,
"loss": 1.0991,
"step": 2683
},
{
"epoch": 0.7182231736687182,
"grad_norm": 3.4346253871917725,
"learning_rate": 9.762342331796671e-06,
"loss": 1.1323,
"step": 2684
},
{
"epoch": 0.7184907679957185,
"grad_norm": 3.3075368404388428,
"learning_rate": 9.762072666790658e-06,
"loss": 1.0226,
"step": 2685
},
{
"epoch": 0.7187583623227187,
"grad_norm": 3.4032669067382812,
"learning_rate": 9.761802852608614e-06,
"loss": 1.0554,
"step": 2686
},
{
"epoch": 0.719025956649719,
"grad_norm": 3.6175873279571533,
"learning_rate": 9.76153288925899e-06,
"loss": 1.1181,
"step": 2687
},
{
"epoch": 0.7192935509767193,
"grad_norm": 3.678610324859619,
"learning_rate": 9.761262776750248e-06,
"loss": 1.2954,
"step": 2688
},
{
"epoch": 0.7195611453037195,
"grad_norm": 3.5059852600097656,
"learning_rate": 9.760992515090844e-06,
"loss": 1.1544,
"step": 2689
},
{
"epoch": 0.7198287396307198,
"grad_norm": 3.412489414215088,
"learning_rate": 9.760722104289244e-06,
"loss": 1.2178,
"step": 2690
},
{
"epoch": 0.7200963339577201,
"grad_norm": 3.746623992919922,
"learning_rate": 9.760451544353923e-06,
"loss": 1.2707,
"step": 2691
},
{
"epoch": 0.7203639282847204,
"grad_norm": 3.2864015102386475,
"learning_rate": 9.760180835293352e-06,
"loss": 1.0242,
"step": 2692
},
{
"epoch": 0.7206315226117206,
"grad_norm": 3.267595052719116,
"learning_rate": 9.759909977116016e-06,
"loss": 1.0804,
"step": 2693
},
{
"epoch": 0.7208991169387209,
"grad_norm": 3.310580015182495,
"learning_rate": 9.759638969830395e-06,
"loss": 1.079,
"step": 2694
},
{
"epoch": 0.7211667112657212,
"grad_norm": 3.208405017852783,
"learning_rate": 9.759367813444982e-06,
"loss": 1.0354,
"step": 2695
},
{
"epoch": 0.7214343055927215,
"grad_norm": 3.2207465171813965,
"learning_rate": 9.75909650796827e-06,
"loss": 1.0239,
"step": 2696
},
{
"epoch": 0.7217018999197217,
"grad_norm": 3.3544509410858154,
"learning_rate": 9.758825053408755e-06,
"loss": 1.1832,
"step": 2697
},
{
"epoch": 0.721969494246722,
"grad_norm": 3.2339577674865723,
"learning_rate": 9.758553449774947e-06,
"loss": 1.0818,
"step": 2698
},
{
"epoch": 0.7222370885737223,
"grad_norm": 3.8596277236938477,
"learning_rate": 9.75828169707535e-06,
"loss": 1.1333,
"step": 2699
},
{
"epoch": 0.7225046829007225,
"grad_norm": 3.5846714973449707,
"learning_rate": 9.758009795318477e-06,
"loss": 1.0485,
"step": 2700
},
{
"epoch": 0.7227722772277227,
"grad_norm": 3.747907876968384,
"learning_rate": 9.757737744512846e-06,
"loss": 1.0817,
"step": 2701
},
{
"epoch": 0.723039871554723,
"grad_norm": 3.468989849090576,
"learning_rate": 9.75746554466698e-06,
"loss": 1.2511,
"step": 2702
},
{
"epoch": 0.7233074658817233,
"grad_norm": 3.4616754055023193,
"learning_rate": 9.757193195789404e-06,
"loss": 1.1138,
"step": 2703
},
{
"epoch": 0.7235750602087235,
"grad_norm": 3.6678431034088135,
"learning_rate": 9.75692069788865e-06,
"loss": 1.0741,
"step": 2704
},
{
"epoch": 0.7238426545357238,
"grad_norm": 3.6420814990997314,
"learning_rate": 9.756648050973257e-06,
"loss": 1.1498,
"step": 2705
},
{
"epoch": 0.7241102488627241,
"grad_norm": 3.3295350074768066,
"learning_rate": 9.756375255051765e-06,
"loss": 1.2033,
"step": 2706
},
{
"epoch": 0.7243778431897244,
"grad_norm": 3.3087949752807617,
"learning_rate": 9.756102310132716e-06,
"loss": 1.144,
"step": 2707
},
{
"epoch": 0.7246454375167246,
"grad_norm": 3.582380771636963,
"learning_rate": 9.755829216224662e-06,
"loss": 1.0771,
"step": 2708
},
{
"epoch": 0.7249130318437249,
"grad_norm": 3.513324737548828,
"learning_rate": 9.75555597333616e-06,
"loss": 1.1307,
"step": 2709
},
{
"epoch": 0.7251806261707252,
"grad_norm": 3.110485315322876,
"learning_rate": 9.755282581475769e-06,
"loss": 1.0273,
"step": 2710
},
{
"epoch": 0.7254482204977254,
"grad_norm": 3.4464118480682373,
"learning_rate": 9.75500904065205e-06,
"loss": 1.0884,
"step": 2711
},
{
"epoch": 0.7257158148247257,
"grad_norm": 3.5363407135009766,
"learning_rate": 9.754735350873577e-06,
"loss": 1.1758,
"step": 2712
},
{
"epoch": 0.725983409151726,
"grad_norm": 3.9216394424438477,
"learning_rate": 9.75446151214892e-06,
"loss": 1.3667,
"step": 2713
},
{
"epoch": 0.7262510034787263,
"grad_norm": 4.166318893432617,
"learning_rate": 9.754187524486658e-06,
"loss": 1.3686,
"step": 2714
},
{
"epoch": 0.7265185978057265,
"grad_norm": 3.6010658740997314,
"learning_rate": 9.753913387895373e-06,
"loss": 1.2221,
"step": 2715
},
{
"epoch": 0.7267861921327268,
"grad_norm": 3.6263794898986816,
"learning_rate": 9.753639102383653e-06,
"loss": 1.092,
"step": 2716
},
{
"epoch": 0.7270537864597271,
"grad_norm": 3.779825210571289,
"learning_rate": 9.753364667960093e-06,
"loss": 1.1366,
"step": 2717
},
{
"epoch": 0.7273213807867274,
"grad_norm": 3.460033416748047,
"learning_rate": 9.753090084633288e-06,
"loss": 1.2492,
"step": 2718
},
{
"epoch": 0.7275889751137276,
"grad_norm": 3.507516384124756,
"learning_rate": 9.752815352411837e-06,
"loss": 1.2037,
"step": 2719
},
{
"epoch": 0.7278565694407279,
"grad_norm": 3.2808637619018555,
"learning_rate": 9.752540471304351e-06,
"loss": 1.1144,
"step": 2720
},
{
"epoch": 0.7281241637677281,
"grad_norm": 3.5148873329162598,
"learning_rate": 9.752265441319437e-06,
"loss": 1.1453,
"step": 2721
},
{
"epoch": 0.7283917580947283,
"grad_norm": 3.546168327331543,
"learning_rate": 9.751990262465712e-06,
"loss": 1.1504,
"step": 2722
},
{
"epoch": 0.7286593524217286,
"grad_norm": 3.2594752311706543,
"learning_rate": 9.751714934751795e-06,
"loss": 1.1144,
"step": 2723
},
{
"epoch": 0.7289269467487289,
"grad_norm": 3.6359105110168457,
"learning_rate": 9.751439458186314e-06,
"loss": 1.0574,
"step": 2724
},
{
"epoch": 0.7291945410757292,
"grad_norm": 3.5395443439483643,
"learning_rate": 9.751163832777894e-06,
"loss": 1.1311,
"step": 2725
},
{
"epoch": 0.7294621354027294,
"grad_norm": 3.6369519233703613,
"learning_rate": 9.750888058535175e-06,
"loss": 1.228,
"step": 2726
},
{
"epoch": 0.7297297297297297,
"grad_norm": 3.7972755432128906,
"learning_rate": 9.75061213546679e-06,
"loss": 1.2122,
"step": 2727
},
{
"epoch": 0.72999732405673,
"grad_norm": 3.208137273788452,
"learning_rate": 9.750336063581385e-06,
"loss": 1.1357,
"step": 2728
},
{
"epoch": 0.7302649183837303,
"grad_norm": 3.6804027557373047,
"learning_rate": 9.75005984288761e-06,
"loss": 1.0588,
"step": 2729
},
{
"epoch": 0.7305325127107305,
"grad_norm": 3.5338356494903564,
"learning_rate": 9.749783473394115e-06,
"loss": 1.1344,
"step": 2730
},
{
"epoch": 0.7308001070377308,
"grad_norm": 3.542436122894287,
"learning_rate": 9.74950695510956e-06,
"loss": 1.0266,
"step": 2731
},
{
"epoch": 0.7310677013647311,
"grad_norm": 3.2684695720672607,
"learning_rate": 9.749230288042605e-06,
"loss": 1.1069,
"step": 2732
},
{
"epoch": 0.7313352956917314,
"grad_norm": 3.5910449028015137,
"learning_rate": 9.748953472201919e-06,
"loss": 1.1241,
"step": 2733
},
{
"epoch": 0.7316028900187316,
"grad_norm": 3.693363904953003,
"learning_rate": 9.74867650759617e-06,
"loss": 1.029,
"step": 2734
},
{
"epoch": 0.7318704843457319,
"grad_norm": 3.376753091812134,
"learning_rate": 9.748399394234038e-06,
"loss": 1.1953,
"step": 2735
},
{
"epoch": 0.7321380786727322,
"grad_norm": 3.5596413612365723,
"learning_rate": 9.7481221321242e-06,
"loss": 1.1171,
"step": 2736
},
{
"epoch": 0.7324056729997324,
"grad_norm": 3.588493585586548,
"learning_rate": 9.747844721275345e-06,
"loss": 1.3143,
"step": 2737
},
{
"epoch": 0.7326732673267327,
"grad_norm": 3.4020540714263916,
"learning_rate": 9.747567161696163e-06,
"loss": 1.1001,
"step": 2738
},
{
"epoch": 0.732940861653733,
"grad_norm": 3.346292018890381,
"learning_rate": 9.747289453395348e-06,
"loss": 1.0981,
"step": 2739
},
{
"epoch": 0.7332084559807333,
"grad_norm": 3.401524543762207,
"learning_rate": 9.747011596381597e-06,
"loss": 1.0512,
"step": 2740
},
{
"epoch": 0.7334760503077334,
"grad_norm": 3.25940203666687,
"learning_rate": 9.746733590663616e-06,
"loss": 1.1377,
"step": 2741
},
{
"epoch": 0.7337436446347337,
"grad_norm": 3.116464376449585,
"learning_rate": 9.746455436250116e-06,
"loss": 1.0579,
"step": 2742
},
{
"epoch": 0.734011238961734,
"grad_norm": 3.291623592376709,
"learning_rate": 9.746177133149805e-06,
"loss": 1.0135,
"step": 2743
},
{
"epoch": 0.7342788332887343,
"grad_norm": 4.061689376831055,
"learning_rate": 9.745898681371408e-06,
"loss": 1.3905,
"step": 2744
},
{
"epoch": 0.7345464276157345,
"grad_norm": 3.388113021850586,
"learning_rate": 9.74562008092364e-06,
"loss": 1.1734,
"step": 2745
},
{
"epoch": 0.7348140219427348,
"grad_norm": 3.3899624347686768,
"learning_rate": 9.745341331815237e-06,
"loss": 1.1051,
"step": 2746
},
{
"epoch": 0.7350816162697351,
"grad_norm": 3.5263469219207764,
"learning_rate": 9.745062434054924e-06,
"loss": 1.2386,
"step": 2747
},
{
"epoch": 0.7353492105967353,
"grad_norm": 3.7269399166107178,
"learning_rate": 9.744783387651442e-06,
"loss": 1.1823,
"step": 2748
},
{
"epoch": 0.7356168049237356,
"grad_norm": 3.4878084659576416,
"learning_rate": 9.74450419261353e-06,
"loss": 1.0985,
"step": 2749
},
{
"epoch": 0.7358843992507359,
"grad_norm": 3.7728066444396973,
"learning_rate": 9.744224848949935e-06,
"loss": 1.1252,
"step": 2750
},
{
"epoch": 0.7361519935777362,
"grad_norm": 3.4812254905700684,
"learning_rate": 9.743945356669406e-06,
"loss": 1.1439,
"step": 2751
},
{
"epoch": 0.7364195879047364,
"grad_norm": 3.372687816619873,
"learning_rate": 9.743665715780702e-06,
"loss": 1.1005,
"step": 2752
},
{
"epoch": 0.7366871822317367,
"grad_norm": 3.568819522857666,
"learning_rate": 9.743385926292578e-06,
"loss": 1.1,
"step": 2753
},
{
"epoch": 0.736954776558737,
"grad_norm": 3.3924365043640137,
"learning_rate": 9.743105988213802e-06,
"loss": 1.2092,
"step": 2754
},
{
"epoch": 0.7372223708857373,
"grad_norm": 3.7355546951293945,
"learning_rate": 9.742825901553144e-06,
"loss": 1.2169,
"step": 2755
},
{
"epoch": 0.7374899652127375,
"grad_norm": 3.2989847660064697,
"learning_rate": 9.742545666319376e-06,
"loss": 1.1556,
"step": 2756
},
{
"epoch": 0.7377575595397378,
"grad_norm": 3.268017530441284,
"learning_rate": 9.742265282521278e-06,
"loss": 1.0398,
"step": 2757
},
{
"epoch": 0.7380251538667381,
"grad_norm": 3.880585193634033,
"learning_rate": 9.741984750167632e-06,
"loss": 1.3448,
"step": 2758
},
{
"epoch": 0.7382927481937382,
"grad_norm": 3.6967694759368896,
"learning_rate": 9.741704069267227e-06,
"loss": 1.2818,
"step": 2759
},
{
"epoch": 0.7385603425207385,
"grad_norm": 3.3277947902679443,
"learning_rate": 9.741423239828854e-06,
"loss": 1.1014,
"step": 2760
},
{
"epoch": 0.7388279368477388,
"grad_norm": 3.7067902088165283,
"learning_rate": 9.74114226186131e-06,
"loss": 1.1701,
"step": 2761
},
{
"epoch": 0.7390955311747391,
"grad_norm": 3.5703206062316895,
"learning_rate": 9.740861135373399e-06,
"loss": 1.2229,
"step": 2762
},
{
"epoch": 0.7393631255017393,
"grad_norm": 3.534301519393921,
"learning_rate": 9.740579860373928e-06,
"loss": 1.162,
"step": 2763
},
{
"epoch": 0.7396307198287396,
"grad_norm": 3.224804162979126,
"learning_rate": 9.740298436871705e-06,
"loss": 1.0507,
"step": 2764
},
{
"epoch": 0.7398983141557399,
"grad_norm": 3.5627236366271973,
"learning_rate": 9.74001686487555e-06,
"loss": 1.1545,
"step": 2765
},
{
"epoch": 0.7401659084827402,
"grad_norm": 3.293410301208496,
"learning_rate": 9.73973514439428e-06,
"loss": 1.0306,
"step": 2766
},
{
"epoch": 0.7404335028097404,
"grad_norm": 3.6083991527557373,
"learning_rate": 9.73945327543672e-06,
"loss": 1.2526,
"step": 2767
},
{
"epoch": 0.7407010971367407,
"grad_norm": 3.2375547885894775,
"learning_rate": 9.739171258011703e-06,
"loss": 1.0081,
"step": 2768
},
{
"epoch": 0.740968691463741,
"grad_norm": 3.6871652603149414,
"learning_rate": 9.73888909212806e-06,
"loss": 1.1781,
"step": 2769
},
{
"epoch": 0.7412362857907412,
"grad_norm": 3.2498250007629395,
"learning_rate": 9.738606777794633e-06,
"loss": 1.1097,
"step": 2770
},
{
"epoch": 0.7415038801177415,
"grad_norm": 3.439887046813965,
"learning_rate": 9.738324315020263e-06,
"loss": 1.1992,
"step": 2771
},
{
"epoch": 0.7417714744447418,
"grad_norm": 3.3322887420654297,
"learning_rate": 9.7380417038138e-06,
"loss": 1.2245,
"step": 2772
},
{
"epoch": 0.7420390687717421,
"grad_norm": 3.5965330600738525,
"learning_rate": 9.737758944184096e-06,
"loss": 1.1906,
"step": 2773
},
{
"epoch": 0.7423066630987423,
"grad_norm": 3.299678325653076,
"learning_rate": 9.737476036140011e-06,
"loss": 1.2128,
"step": 2774
},
{
"epoch": 0.7425742574257426,
"grad_norm": 3.468172073364258,
"learning_rate": 9.737192979690404e-06,
"loss": 1.1996,
"step": 2775
},
{
"epoch": 0.7428418517527429,
"grad_norm": 3.4514479637145996,
"learning_rate": 9.736909774844145e-06,
"loss": 1.1241,
"step": 2776
},
{
"epoch": 0.7431094460797432,
"grad_norm": 3.221329927444458,
"learning_rate": 9.736626421610104e-06,
"loss": 1.1235,
"step": 2777
},
{
"epoch": 0.7433770404067434,
"grad_norm": 3.6566755771636963,
"learning_rate": 9.73634291999716e-06,
"loss": 1.0872,
"step": 2778
},
{
"epoch": 0.7436446347337436,
"grad_norm": 3.055006980895996,
"learning_rate": 9.73605927001419e-06,
"loss": 1.0536,
"step": 2779
},
{
"epoch": 0.743912229060744,
"grad_norm": 3.0918097496032715,
"learning_rate": 9.735775471670079e-06,
"loss": 1.0914,
"step": 2780
},
{
"epoch": 0.7441798233877441,
"grad_norm": 3.5324559211730957,
"learning_rate": 9.735491524973723e-06,
"loss": 1.041,
"step": 2781
},
{
"epoch": 0.7444474177147444,
"grad_norm": 3.506650447845459,
"learning_rate": 9.73520742993401e-06,
"loss": 1.1942,
"step": 2782
},
{
"epoch": 0.7447150120417447,
"grad_norm": 3.5160765647888184,
"learning_rate": 9.734923186559845e-06,
"loss": 1.1306,
"step": 2783
},
{
"epoch": 0.744982606368745,
"grad_norm": 3.377394676208496,
"learning_rate": 9.73463879486013e-06,
"loss": 1.1159,
"step": 2784
},
{
"epoch": 0.7452502006957452,
"grad_norm": 3.6153159141540527,
"learning_rate": 9.734354254843773e-06,
"loss": 1.0963,
"step": 2785
},
{
"epoch": 0.7455177950227455,
"grad_norm": 3.4530587196350098,
"learning_rate": 9.734069566519688e-06,
"loss": 1.1871,
"step": 2786
},
{
"epoch": 0.7457853893497458,
"grad_norm": 3.537059783935547,
"learning_rate": 9.733784729896794e-06,
"loss": 1.2016,
"step": 2787
},
{
"epoch": 0.7460529836767461,
"grad_norm": 3.379148483276367,
"learning_rate": 9.733499744984013e-06,
"loss": 1.1321,
"step": 2788
},
{
"epoch": 0.7463205780037463,
"grad_norm": 3.4380931854248047,
"learning_rate": 9.733214611790273e-06,
"loss": 1.1662,
"step": 2789
},
{
"epoch": 0.7465881723307466,
"grad_norm": 3.5000431537628174,
"learning_rate": 9.732929330324505e-06,
"loss": 1.0895,
"step": 2790
},
{
"epoch": 0.7468557666577469,
"grad_norm": 3.2787697315216064,
"learning_rate": 9.732643900595646e-06,
"loss": 1.094,
"step": 2791
},
{
"epoch": 0.7471233609847471,
"grad_norm": 3.452360153198242,
"learning_rate": 9.732358322612639e-06,
"loss": 1.198,
"step": 2792
},
{
"epoch": 0.7473909553117474,
"grad_norm": 3.0884242057800293,
"learning_rate": 9.732072596384427e-06,
"loss": 1.1294,
"step": 2793
},
{
"epoch": 0.7476585496387477,
"grad_norm": 3.0468862056732178,
"learning_rate": 9.731786721919963e-06,
"loss": 1.0767,
"step": 2794
},
{
"epoch": 0.747926143965748,
"grad_norm": 3.783818483352661,
"learning_rate": 9.7315006992282e-06,
"loss": 1.1782,
"step": 2795
},
{
"epoch": 0.7481937382927482,
"grad_norm": 3.290731430053711,
"learning_rate": 9.731214528318101e-06,
"loss": 1.1673,
"step": 2796
},
{
"epoch": 0.7484613326197485,
"grad_norm": 3.4749789237976074,
"learning_rate": 9.730928209198629e-06,
"loss": 1.0845,
"step": 2797
},
{
"epoch": 0.7487289269467488,
"grad_norm": 3.39563250541687,
"learning_rate": 9.730641741878752e-06,
"loss": 1.1038,
"step": 2798
},
{
"epoch": 0.748996521273749,
"grad_norm": 3.1787352561950684,
"learning_rate": 9.730355126367446e-06,
"loss": 0.9825,
"step": 2799
},
{
"epoch": 0.7492641156007492,
"grad_norm": 3.4629030227661133,
"learning_rate": 9.730068362673686e-06,
"loss": 1.1339,
"step": 2800
},
{
"epoch": 0.7495317099277495,
"grad_norm": 3.959449052810669,
"learning_rate": 9.72978145080646e-06,
"loss": 1.3313,
"step": 2801
},
{
"epoch": 0.7497993042547498,
"grad_norm": 2.8361458778381348,
"learning_rate": 9.729494390774753e-06,
"loss": 1.0094,
"step": 2802
},
{
"epoch": 0.75006689858175,
"grad_norm": 3.317673921585083,
"learning_rate": 9.729207182587556e-06,
"loss": 1.0883,
"step": 2803
},
{
"epoch": 0.7503344929087503,
"grad_norm": 3.1433663368225098,
"learning_rate": 9.728919826253872e-06,
"loss": 1.0981,
"step": 2804
},
{
"epoch": 0.7506020872357506,
"grad_norm": 3.8277859687805176,
"learning_rate": 9.728632321782693e-06,
"loss": 1.1252,
"step": 2805
},
{
"epoch": 0.7508696815627509,
"grad_norm": 3.1304125785827637,
"learning_rate": 9.728344669183033e-06,
"loss": 1.0509,
"step": 2806
},
{
"epoch": 0.7511372758897511,
"grad_norm": 3.266526937484741,
"learning_rate": 9.728056868463903e-06,
"loss": 1.1155,
"step": 2807
},
{
"epoch": 0.7514048702167514,
"grad_norm": 3.4399197101593018,
"learning_rate": 9.727768919634314e-06,
"loss": 1.2062,
"step": 2808
},
{
"epoch": 0.7516724645437517,
"grad_norm": 3.5575180053710938,
"learning_rate": 9.72748082270329e-06,
"loss": 1.211,
"step": 2809
},
{
"epoch": 0.751940058870752,
"grad_norm": 3.2147161960601807,
"learning_rate": 9.727192577679852e-06,
"loss": 1.0478,
"step": 2810
},
{
"epoch": 0.7522076531977522,
"grad_norm": 3.8405959606170654,
"learning_rate": 9.726904184573034e-06,
"loss": 1.2406,
"step": 2811
},
{
"epoch": 0.7524752475247525,
"grad_norm": 3.6334824562072754,
"learning_rate": 9.726615643391868e-06,
"loss": 1.1493,
"step": 2812
},
{
"epoch": 0.7527428418517528,
"grad_norm": 3.3839588165283203,
"learning_rate": 9.726326954145391e-06,
"loss": 1.2048,
"step": 2813
},
{
"epoch": 0.753010436178753,
"grad_norm": 3.4611270427703857,
"learning_rate": 9.72603811684265e-06,
"loss": 1.1183,
"step": 2814
},
{
"epoch": 0.7532780305057533,
"grad_norm": 3.1408448219299316,
"learning_rate": 9.725749131492691e-06,
"loss": 1.0962,
"step": 2815
},
{
"epoch": 0.7535456248327536,
"grad_norm": 3.3118159770965576,
"learning_rate": 9.725459998104568e-06,
"loss": 1.1289,
"step": 2816
},
{
"epoch": 0.7538132191597539,
"grad_norm": 3.467696189880371,
"learning_rate": 9.725170716687337e-06,
"loss": 1.1242,
"step": 2817
},
{
"epoch": 0.754080813486754,
"grad_norm": 3.346605062484741,
"learning_rate": 9.72488128725006e-06,
"loss": 1.1256,
"step": 2818
},
{
"epoch": 0.7543484078137543,
"grad_norm": 2.9358856678009033,
"learning_rate": 9.724591709801804e-06,
"loss": 1.0252,
"step": 2819
},
{
"epoch": 0.7546160021407546,
"grad_norm": 3.8935790061950684,
"learning_rate": 9.724301984351642e-06,
"loss": 1.2343,
"step": 2820
},
{
"epoch": 0.7548835964677549,
"grad_norm": 3.3613624572753906,
"learning_rate": 9.724012110908647e-06,
"loss": 1.0944,
"step": 2821
},
{
"epoch": 0.7551511907947551,
"grad_norm": 3.857342004776001,
"learning_rate": 9.723722089481902e-06,
"loss": 1.1819,
"step": 2822
},
{
"epoch": 0.7554187851217554,
"grad_norm": 3.4227402210235596,
"learning_rate": 9.72343192008049e-06,
"loss": 1.1457,
"step": 2823
},
{
"epoch": 0.7556863794487557,
"grad_norm": 3.526207447052002,
"learning_rate": 9.723141602713502e-06,
"loss": 1.1525,
"step": 2824
},
{
"epoch": 0.7559539737757559,
"grad_norm": 3.2722322940826416,
"learning_rate": 9.722851137390032e-06,
"loss": 0.999,
"step": 2825
},
{
"epoch": 0.7562215681027562,
"grad_norm": 3.4464046955108643,
"learning_rate": 9.72256052411918e-06,
"loss": 1.1352,
"step": 2826
},
{
"epoch": 0.7564891624297565,
"grad_norm": 3.817711353302002,
"learning_rate": 9.72226976291005e-06,
"loss": 1.2295,
"step": 2827
},
{
"epoch": 0.7567567567567568,
"grad_norm": 3.3932790756225586,
"learning_rate": 9.721978853771747e-06,
"loss": 1.1339,
"step": 2828
},
{
"epoch": 0.757024351083757,
"grad_norm": 3.173757553100586,
"learning_rate": 9.721687796713388e-06,
"loss": 1.0569,
"step": 2829
},
{
"epoch": 0.7572919454107573,
"grad_norm": 3.3787193298339844,
"learning_rate": 9.721396591744089e-06,
"loss": 1.0959,
"step": 2830
},
{
"epoch": 0.7575595397377576,
"grad_norm": 3.255352258682251,
"learning_rate": 9.72110523887297e-06,
"loss": 1.1342,
"step": 2831
},
{
"epoch": 0.7578271340647579,
"grad_norm": 3.6418159008026123,
"learning_rate": 9.720813738109163e-06,
"loss": 1.158,
"step": 2832
},
{
"epoch": 0.7580947283917581,
"grad_norm": 3.731308937072754,
"learning_rate": 9.720522089461795e-06,
"loss": 1.2893,
"step": 2833
},
{
"epoch": 0.7583623227187584,
"grad_norm": 3.3955862522125244,
"learning_rate": 9.720230292940005e-06,
"loss": 1.2315,
"step": 2834
},
{
"epoch": 0.7586299170457587,
"grad_norm": 3.6548402309417725,
"learning_rate": 9.71993834855293e-06,
"loss": 1.3398,
"step": 2835
},
{
"epoch": 0.7588975113727588,
"grad_norm": 3.494920015335083,
"learning_rate": 9.71964625630972e-06,
"loss": 1.0822,
"step": 2836
},
{
"epoch": 0.7591651056997591,
"grad_norm": 3.2807202339172363,
"learning_rate": 9.719354016219524e-06,
"loss": 1.0012,
"step": 2837
},
{
"epoch": 0.7594327000267594,
"grad_norm": 3.419506549835205,
"learning_rate": 9.719061628291495e-06,
"loss": 1.1424,
"step": 2838
},
{
"epoch": 0.7597002943537597,
"grad_norm": 3.452536106109619,
"learning_rate": 9.718769092534791e-06,
"loss": 1.2047,
"step": 2839
},
{
"epoch": 0.7599678886807599,
"grad_norm": 3.1318492889404297,
"learning_rate": 9.71847640895858e-06,
"loss": 1.1482,
"step": 2840
},
{
"epoch": 0.7602354830077602,
"grad_norm": 3.5314605236053467,
"learning_rate": 9.718183577572027e-06,
"loss": 1.1353,
"step": 2841
},
{
"epoch": 0.7605030773347605,
"grad_norm": 3.302334785461426,
"learning_rate": 9.717890598384308e-06,
"loss": 1.0301,
"step": 2842
},
{
"epoch": 0.7607706716617608,
"grad_norm": 3.7543177604675293,
"learning_rate": 9.7175974714046e-06,
"loss": 1.1595,
"step": 2843
},
{
"epoch": 0.761038265988761,
"grad_norm": 2.979762554168701,
"learning_rate": 9.717304196642084e-06,
"loss": 1.0898,
"step": 2844
},
{
"epoch": 0.7613058603157613,
"grad_norm": 3.3831746578216553,
"learning_rate": 9.717010774105948e-06,
"loss": 1.0798,
"step": 2845
},
{
"epoch": 0.7615734546427616,
"grad_norm": 3.5637614727020264,
"learning_rate": 9.716717203805383e-06,
"loss": 1.2266,
"step": 2846
},
{
"epoch": 0.7618410489697618,
"grad_norm": 3.3989365100860596,
"learning_rate": 9.716423485749587e-06,
"loss": 1.0865,
"step": 2847
},
{
"epoch": 0.7621086432967621,
"grad_norm": 3.465242385864258,
"learning_rate": 9.716129619947759e-06,
"loss": 1.1392,
"step": 2848
},
{
"epoch": 0.7623762376237624,
"grad_norm": 3.8061766624450684,
"learning_rate": 9.715835606409107e-06,
"loss": 1.1998,
"step": 2849
},
{
"epoch": 0.7626438319507627,
"grad_norm": 3.2725119590759277,
"learning_rate": 9.71554144514284e-06,
"loss": 1.1301,
"step": 2850
},
{
"epoch": 0.7629114262777629,
"grad_norm": 3.4146728515625,
"learning_rate": 9.715247136158173e-06,
"loss": 1.1111,
"step": 2851
},
{
"epoch": 0.7631790206047632,
"grad_norm": 3.5250372886657715,
"learning_rate": 9.714952679464324e-06,
"loss": 1.2267,
"step": 2852
},
{
"epoch": 0.7634466149317635,
"grad_norm": 3.194732427597046,
"learning_rate": 9.714658075070518e-06,
"loss": 1.0405,
"step": 2853
},
{
"epoch": 0.7637142092587638,
"grad_norm": 3.0676612854003906,
"learning_rate": 9.714363322985984e-06,
"loss": 0.9979,
"step": 2854
},
{
"epoch": 0.763981803585764,
"grad_norm": 3.3650014400482178,
"learning_rate": 9.714068423219958e-06,
"loss": 1.0496,
"step": 2855
},
{
"epoch": 0.7642493979127643,
"grad_norm": 3.422921657562256,
"learning_rate": 9.713773375781672e-06,
"loss": 1.1274,
"step": 2856
},
{
"epoch": 0.7645169922397645,
"grad_norm": 3.542006492614746,
"learning_rate": 9.713478180680375e-06,
"loss": 1.149,
"step": 2857
},
{
"epoch": 0.7647845865667647,
"grad_norm": 3.3169963359832764,
"learning_rate": 9.71318283792531e-06,
"loss": 1.0904,
"step": 2858
},
{
"epoch": 0.765052180893765,
"grad_norm": 3.1313977241516113,
"learning_rate": 9.71288734752573e-06,
"loss": 1.0458,
"step": 2859
},
{
"epoch": 0.7653197752207653,
"grad_norm": 3.309631586074829,
"learning_rate": 9.712591709490891e-06,
"loss": 1.1167,
"step": 2860
},
{
"epoch": 0.7655873695477656,
"grad_norm": 3.1519172191619873,
"learning_rate": 9.712295923830057e-06,
"loss": 1.0628,
"step": 2861
},
{
"epoch": 0.7658549638747658,
"grad_norm": 3.4607601165771484,
"learning_rate": 9.71199999055249e-06,
"loss": 1.157,
"step": 2862
},
{
"epoch": 0.7661225582017661,
"grad_norm": 3.342031478881836,
"learning_rate": 9.711703909667461e-06,
"loss": 1.1209,
"step": 2863
},
{
"epoch": 0.7663901525287664,
"grad_norm": 3.9641542434692383,
"learning_rate": 9.711407681184248e-06,
"loss": 1.1324,
"step": 2864
},
{
"epoch": 0.7666577468557667,
"grad_norm": 3.4209370613098145,
"learning_rate": 9.711111305112126e-06,
"loss": 1.0754,
"step": 2865
},
{
"epoch": 0.7669253411827669,
"grad_norm": 3.292510509490967,
"learning_rate": 9.710814781460383e-06,
"loss": 1.0765,
"step": 2866
},
{
"epoch": 0.7671929355097672,
"grad_norm": 4.118853569030762,
"learning_rate": 9.710518110238308e-06,
"loss": 1.2039,
"step": 2867
},
{
"epoch": 0.7674605298367675,
"grad_norm": 3.280724287033081,
"learning_rate": 9.71022129145519e-06,
"loss": 0.9847,
"step": 2868
},
{
"epoch": 0.7677281241637677,
"grad_norm": 3.1965861320495605,
"learning_rate": 9.709924325120333e-06,
"loss": 0.9819,
"step": 2869
},
{
"epoch": 0.767995718490768,
"grad_norm": 3.7593677043914795,
"learning_rate": 9.709627211243036e-06,
"loss": 1.259,
"step": 2870
},
{
"epoch": 0.7682633128177683,
"grad_norm": 3.556138277053833,
"learning_rate": 9.709329949832606e-06,
"loss": 1.0214,
"step": 2871
},
{
"epoch": 0.7685309071447686,
"grad_norm": 3.3062312602996826,
"learning_rate": 9.709032540898356e-06,
"loss": 1.1449,
"step": 2872
},
{
"epoch": 0.7687985014717688,
"grad_norm": 3.483119249343872,
"learning_rate": 9.708734984449605e-06,
"loss": 1.2922,
"step": 2873
},
{
"epoch": 0.7690660957987691,
"grad_norm": 3.3715760707855225,
"learning_rate": 9.70843728049567e-06,
"loss": 1.0213,
"step": 2874
},
{
"epoch": 0.7693336901257694,
"grad_norm": 3.3638691902160645,
"learning_rate": 9.70813942904588e-06,
"loss": 1.1002,
"step": 2875
},
{
"epoch": 0.7696012844527697,
"grad_norm": 3.502279281616211,
"learning_rate": 9.707841430109564e-06,
"loss": 1.1725,
"step": 2876
},
{
"epoch": 0.7698688787797698,
"grad_norm": 3.4064905643463135,
"learning_rate": 9.707543283696056e-06,
"loss": 1.1057,
"step": 2877
},
{
"epoch": 0.7701364731067701,
"grad_norm": 3.243762969970703,
"learning_rate": 9.707244989814699e-06,
"loss": 1.039,
"step": 2878
},
{
"epoch": 0.7704040674337704,
"grad_norm": 3.5731518268585205,
"learning_rate": 9.706946548474836e-06,
"loss": 1.0588,
"step": 2879
},
{
"epoch": 0.7706716617607706,
"grad_norm": 3.416506052017212,
"learning_rate": 9.706647959685813e-06,
"loss": 1.0165,
"step": 2880
},
{
"epoch": 0.7709392560877709,
"grad_norm": 3.192201852798462,
"learning_rate": 9.706349223456988e-06,
"loss": 1.0624,
"step": 2881
},
{
"epoch": 0.7712068504147712,
"grad_norm": 3.571995735168457,
"learning_rate": 9.706050339797714e-06,
"loss": 1.1391,
"step": 2882
},
{
"epoch": 0.7714744447417715,
"grad_norm": 3.073079824447632,
"learning_rate": 9.70575130871736e-06,
"loss": 1.0965,
"step": 2883
},
{
"epoch": 0.7717420390687717,
"grad_norm": 3.43789005279541,
"learning_rate": 9.705452130225287e-06,
"loss": 1.0569,
"step": 2884
},
{
"epoch": 0.772009633395772,
"grad_norm": 3.334461212158203,
"learning_rate": 9.705152804330872e-06,
"loss": 1.053,
"step": 2885
},
{
"epoch": 0.7722772277227723,
"grad_norm": 3.743177890777588,
"learning_rate": 9.70485333104349e-06,
"loss": 1.2276,
"step": 2886
},
{
"epoch": 0.7725448220497726,
"grad_norm": 3.443610191345215,
"learning_rate": 9.704553710372524e-06,
"loss": 1.0712,
"step": 2887
},
{
"epoch": 0.7728124163767728,
"grad_norm": 3.481642246246338,
"learning_rate": 9.704253942327357e-06,
"loss": 1.1042,
"step": 2888
},
{
"epoch": 0.7730800107037731,
"grad_norm": 3.3586671352386475,
"learning_rate": 9.703954026917379e-06,
"loss": 1.2262,
"step": 2889
},
{
"epoch": 0.7733476050307734,
"grad_norm": 3.1895251274108887,
"learning_rate": 9.703653964151986e-06,
"loss": 0.9845,
"step": 2890
},
{
"epoch": 0.7736151993577736,
"grad_norm": 3.624223232269287,
"learning_rate": 9.70335375404058e-06,
"loss": 1.3042,
"step": 2891
},
{
"epoch": 0.7738827936847739,
"grad_norm": 3.3234941959381104,
"learning_rate": 9.703053396592562e-06,
"loss": 1.0994,
"step": 2892
},
{
"epoch": 0.7741503880117742,
"grad_norm": 3.3439879417419434,
"learning_rate": 9.702752891817346e-06,
"loss": 1.2438,
"step": 2893
},
{
"epoch": 0.7744179823387745,
"grad_norm": 3.2826695442199707,
"learning_rate": 9.70245223972434e-06,
"loss": 1.0742,
"step": 2894
},
{
"epoch": 0.7746855766657746,
"grad_norm": 3.288297414779663,
"learning_rate": 9.702151440322964e-06,
"loss": 1.0301,
"step": 2895
},
{
"epoch": 0.7749531709927749,
"grad_norm": 3.380511999130249,
"learning_rate": 9.701850493622642e-06,
"loss": 1.1317,
"step": 2896
},
{
"epoch": 0.7752207653197752,
"grad_norm": 3.54054856300354,
"learning_rate": 9.7015493996328e-06,
"loss": 1.0911,
"step": 2897
},
{
"epoch": 0.7754883596467755,
"grad_norm": 3.4729793071746826,
"learning_rate": 9.701248158362871e-06,
"loss": 1.1824,
"step": 2898
},
{
"epoch": 0.7757559539737757,
"grad_norm": 3.3883352279663086,
"learning_rate": 9.700946769822292e-06,
"loss": 1.1696,
"step": 2899
},
{
"epoch": 0.776023548300776,
"grad_norm": 3.5626111030578613,
"learning_rate": 9.700645234020502e-06,
"loss": 1.2284,
"step": 2900
},
{
"epoch": 0.7762911426277763,
"grad_norm": 3.4596970081329346,
"learning_rate": 9.70034355096695e-06,
"loss": 1.0903,
"step": 2901
},
{
"epoch": 0.7765587369547765,
"grad_norm": 3.7619760036468506,
"learning_rate": 9.700041720671082e-06,
"loss": 1.2232,
"step": 2902
},
{
"epoch": 0.7768263312817768,
"grad_norm": 3.1871516704559326,
"learning_rate": 9.69973974314236e-06,
"loss": 1.0154,
"step": 2903
},
{
"epoch": 0.7770939256087771,
"grad_norm": 3.5451059341430664,
"learning_rate": 9.699437618390237e-06,
"loss": 1.0359,
"step": 2904
},
{
"epoch": 0.7773615199357774,
"grad_norm": 3.7833518981933594,
"learning_rate": 9.69913534642418e-06,
"loss": 1.1205,
"step": 2905
},
{
"epoch": 0.7776291142627776,
"grad_norm": 3.3426740169525146,
"learning_rate": 9.69883292725366e-06,
"loss": 1.146,
"step": 2906
},
{
"epoch": 0.7778967085897779,
"grad_norm": 3.564518690109253,
"learning_rate": 9.698530360888146e-06,
"loss": 1.1515,
"step": 2907
},
{
"epoch": 0.7781643029167782,
"grad_norm": 3.3578410148620605,
"learning_rate": 9.69822764733712e-06,
"loss": 1.2046,
"step": 2908
},
{
"epoch": 0.7784318972437785,
"grad_norm": 3.161803722381592,
"learning_rate": 9.697924786610063e-06,
"loss": 1.1811,
"step": 2909
},
{
"epoch": 0.7786994915707787,
"grad_norm": 3.1119868755340576,
"learning_rate": 9.697621778716465e-06,
"loss": 1.0896,
"step": 2910
},
{
"epoch": 0.778967085897779,
"grad_norm": 3.2111477851867676,
"learning_rate": 9.697318623665813e-06,
"loss": 1.0613,
"step": 2911
},
{
"epoch": 0.7792346802247793,
"grad_norm": 3.4069631099700928,
"learning_rate": 9.697015321467606e-06,
"loss": 1.0905,
"step": 2912
},
{
"epoch": 0.7795022745517795,
"grad_norm": 3.5640361309051514,
"learning_rate": 9.696711872131347e-06,
"loss": 1.2176,
"step": 2913
},
{
"epoch": 0.7797698688787797,
"grad_norm": 3.4428586959838867,
"learning_rate": 9.69640827566654e-06,
"loss": 1.1433,
"step": 2914
},
{
"epoch": 0.78003746320578,
"grad_norm": 3.6529276371002197,
"learning_rate": 9.696104532082695e-06,
"loss": 1.2443,
"step": 2915
},
{
"epoch": 0.7803050575327803,
"grad_norm": 3.5144267082214355,
"learning_rate": 9.695800641389327e-06,
"loss": 1.1708,
"step": 2916
},
{
"epoch": 0.7805726518597805,
"grad_norm": 3.2962844371795654,
"learning_rate": 9.695496603595959e-06,
"loss": 1.0612,
"step": 2917
},
{
"epoch": 0.7808402461867808,
"grad_norm": 3.4255483150482178,
"learning_rate": 9.695192418712111e-06,
"loss": 1.1376,
"step": 2918
},
{
"epoch": 0.7811078405137811,
"grad_norm": 3.3936362266540527,
"learning_rate": 9.694888086747315e-06,
"loss": 1.0977,
"step": 2919
},
{
"epoch": 0.7813754348407814,
"grad_norm": 3.835702896118164,
"learning_rate": 9.694583607711102e-06,
"loss": 1.2083,
"step": 2920
},
{
"epoch": 0.7816430291677816,
"grad_norm": 3.329684257507324,
"learning_rate": 9.69427898161301e-06,
"loss": 1.1792,
"step": 2921
},
{
"epoch": 0.7819106234947819,
"grad_norm": 3.5898046493530273,
"learning_rate": 9.693974208462585e-06,
"loss": 1.1128,
"step": 2922
},
{
"epoch": 0.7821782178217822,
"grad_norm": 3.2505688667297363,
"learning_rate": 9.693669288269371e-06,
"loss": 1.1022,
"step": 2923
},
{
"epoch": 0.7824458121487824,
"grad_norm": 3.629041910171509,
"learning_rate": 9.693364221042922e-06,
"loss": 1.0837,
"step": 2924
},
{
"epoch": 0.7827134064757827,
"grad_norm": 3.4234085083007812,
"learning_rate": 9.69305900679279e-06,
"loss": 1.2019,
"step": 2925
},
{
"epoch": 0.782981000802783,
"grad_norm": 3.353302001953125,
"learning_rate": 9.692753645528544e-06,
"loss": 1.133,
"step": 2926
},
{
"epoch": 0.7832485951297833,
"grad_norm": 3.490877866744995,
"learning_rate": 9.692448137259743e-06,
"loss": 1.2875,
"step": 2927
},
{
"epoch": 0.7835161894567835,
"grad_norm": 3.5161383152008057,
"learning_rate": 9.692142481995958e-06,
"loss": 1.1722,
"step": 2928
},
{
"epoch": 0.7837837837837838,
"grad_norm": 3.030968189239502,
"learning_rate": 9.691836679746767e-06,
"loss": 1.0362,
"step": 2929
},
{
"epoch": 0.7840513781107841,
"grad_norm": 3.819481611251831,
"learning_rate": 9.691530730521748e-06,
"loss": 1.2682,
"step": 2930
},
{
"epoch": 0.7843189724377844,
"grad_norm": 3.640918731689453,
"learning_rate": 9.691224634330484e-06,
"loss": 1.1868,
"step": 2931
},
{
"epoch": 0.7845865667647846,
"grad_norm": 3.7881932258605957,
"learning_rate": 9.690918391182568e-06,
"loss": 1.2436,
"step": 2932
},
{
"epoch": 0.7848541610917849,
"grad_norm": 3.8479201793670654,
"learning_rate": 9.690612001087586e-06,
"loss": 1.0979,
"step": 2933
},
{
"epoch": 0.7851217554187851,
"grad_norm": 3.2943499088287354,
"learning_rate": 9.690305464055143e-06,
"loss": 1.1036,
"step": 2934
},
{
"epoch": 0.7853893497457853,
"grad_norm": 3.42976713180542,
"learning_rate": 9.689998780094839e-06,
"loss": 1.1348,
"step": 2935
},
{
"epoch": 0.7856569440727856,
"grad_norm": 3.6888561248779297,
"learning_rate": 9.689691949216278e-06,
"loss": 1.1974,
"step": 2936
},
{
"epoch": 0.7859245383997859,
"grad_norm": 3.266007900238037,
"learning_rate": 9.689384971429077e-06,
"loss": 1.1575,
"step": 2937
},
{
"epoch": 0.7861921327267862,
"grad_norm": 3.421496629714966,
"learning_rate": 9.689077846742847e-06,
"loss": 1.1723,
"step": 2938
},
{
"epoch": 0.7864597270537864,
"grad_norm": 3.432095766067505,
"learning_rate": 9.688770575167215e-06,
"loss": 1.0722,
"step": 2939
},
{
"epoch": 0.7867273213807867,
"grad_norm": 3.453275680541992,
"learning_rate": 9.688463156711801e-06,
"loss": 1.145,
"step": 2940
},
{
"epoch": 0.786994915707787,
"grad_norm": 3.332948923110962,
"learning_rate": 9.688155591386239e-06,
"loss": 1.0426,
"step": 2941
},
{
"epoch": 0.7872625100347873,
"grad_norm": 3.5865981578826904,
"learning_rate": 9.687847879200161e-06,
"loss": 1.2441,
"step": 2942
},
{
"epoch": 0.7875301043617875,
"grad_norm": 3.633302688598633,
"learning_rate": 9.687540020163209e-06,
"loss": 1.2145,
"step": 2943
},
{
"epoch": 0.7877976986887878,
"grad_norm": 3.5107271671295166,
"learning_rate": 9.687232014285025e-06,
"loss": 1.1149,
"step": 2944
},
{
"epoch": 0.7880652930157881,
"grad_norm": 3.2951745986938477,
"learning_rate": 9.686923861575258e-06,
"loss": 1.0549,
"step": 2945
},
{
"epoch": 0.7883328873427883,
"grad_norm": 3.1973979473114014,
"learning_rate": 9.68661556204356e-06,
"loss": 1.1486,
"step": 2946
},
{
"epoch": 0.7886004816697886,
"grad_norm": 3.6182639598846436,
"learning_rate": 9.68630711569959e-06,
"loss": 1.1141,
"step": 2947
},
{
"epoch": 0.7888680759967889,
"grad_norm": 3.3912758827209473,
"learning_rate": 9.685998522553012e-06,
"loss": 1.1695,
"step": 2948
},
{
"epoch": 0.7891356703237892,
"grad_norm": 3.658475637435913,
"learning_rate": 9.68568978261349e-06,
"loss": 1.216,
"step": 2949
},
{
"epoch": 0.7894032646507894,
"grad_norm": 3.5192058086395264,
"learning_rate": 9.685380895890698e-06,
"loss": 1.1805,
"step": 2950
},
{
"epoch": 0.7896708589777897,
"grad_norm": 3.3814120292663574,
"learning_rate": 9.68507186239431e-06,
"loss": 1.1627,
"step": 2951
},
{
"epoch": 0.78993845330479,
"grad_norm": 3.405315399169922,
"learning_rate": 9.684762682134008e-06,
"loss": 1.2152,
"step": 2952
},
{
"epoch": 0.7902060476317903,
"grad_norm": 3.6160085201263428,
"learning_rate": 9.684453355119476e-06,
"loss": 1.2863,
"step": 2953
},
{
"epoch": 0.7904736419587904,
"grad_norm": 3.364459991455078,
"learning_rate": 9.684143881360406e-06,
"loss": 0.9876,
"step": 2954
},
{
"epoch": 0.7907412362857907,
"grad_norm": 3.066523551940918,
"learning_rate": 9.683834260866492e-06,
"loss": 0.9858,
"step": 2955
},
{
"epoch": 0.791008830612791,
"grad_norm": 3.4516170024871826,
"learning_rate": 9.68352449364743e-06,
"loss": 1.3306,
"step": 2956
},
{
"epoch": 0.7912764249397912,
"grad_norm": 3.5677990913391113,
"learning_rate": 9.68321457971293e-06,
"loss": 1.1928,
"step": 2957
},
{
"epoch": 0.7915440192667915,
"grad_norm": 3.59769868850708,
"learning_rate": 9.682904519072696e-06,
"loss": 1.2216,
"step": 2958
},
{
"epoch": 0.7918116135937918,
"grad_norm": 3.0195512771606445,
"learning_rate": 9.682594311736439e-06,
"loss": 0.9886,
"step": 2959
},
{
"epoch": 0.7920792079207921,
"grad_norm": 3.224322557449341,
"learning_rate": 9.68228395771388e-06,
"loss": 1.0185,
"step": 2960
},
{
"epoch": 0.7923468022477923,
"grad_norm": 3.414687395095825,
"learning_rate": 9.681973457014742e-06,
"loss": 1.2,
"step": 2961
},
{
"epoch": 0.7926143965747926,
"grad_norm": 3.225135564804077,
"learning_rate": 9.681662809648749e-06,
"loss": 1.1227,
"step": 2962
},
{
"epoch": 0.7928819909017929,
"grad_norm": 3.3739073276519775,
"learning_rate": 9.681352015625634e-06,
"loss": 1.1265,
"step": 2963
},
{
"epoch": 0.7931495852287932,
"grad_norm": 3.418264389038086,
"learning_rate": 9.681041074955131e-06,
"loss": 1.2126,
"step": 2964
},
{
"epoch": 0.7934171795557934,
"grad_norm": 3.712611198425293,
"learning_rate": 9.68072998764698e-06,
"loss": 1.2432,
"step": 2965
},
{
"epoch": 0.7936847738827937,
"grad_norm": 3.4805774688720703,
"learning_rate": 9.68041875371093e-06,
"loss": 1.2368,
"step": 2966
},
{
"epoch": 0.793952368209794,
"grad_norm": 3.31071400642395,
"learning_rate": 9.68010737315673e-06,
"loss": 1.1119,
"step": 2967
},
{
"epoch": 0.7942199625367942,
"grad_norm": 3.2610623836517334,
"learning_rate": 9.679795845994129e-06,
"loss": 1.0559,
"step": 2968
},
{
"epoch": 0.7944875568637945,
"grad_norm": 3.272242546081543,
"learning_rate": 9.67948417223289e-06,
"loss": 1.1692,
"step": 2969
},
{
"epoch": 0.7947551511907948,
"grad_norm": 3.6862285137176514,
"learning_rate": 9.679172351882778e-06,
"loss": 1.2637,
"step": 2970
},
{
"epoch": 0.7950227455177951,
"grad_norm": 3.0989432334899902,
"learning_rate": 9.678860384953558e-06,
"loss": 1.0365,
"step": 2971
},
{
"epoch": 0.7952903398447952,
"grad_norm": 3.5036988258361816,
"learning_rate": 9.678548271455002e-06,
"loss": 1.1943,
"step": 2972
},
{
"epoch": 0.7955579341717955,
"grad_norm": 3.549891233444214,
"learning_rate": 9.67823601139689e-06,
"loss": 1.2642,
"step": 2973
},
{
"epoch": 0.7958255284987958,
"grad_norm": 2.958547353744507,
"learning_rate": 9.677923604789002e-06,
"loss": 1.0623,
"step": 2974
},
{
"epoch": 0.7960931228257961,
"grad_norm": 3.7506515979766846,
"learning_rate": 9.677611051641126e-06,
"loss": 1.327,
"step": 2975
},
{
"epoch": 0.7963607171527963,
"grad_norm": 3.485591173171997,
"learning_rate": 9.677298351963051e-06,
"loss": 1.1078,
"step": 2976
},
{
"epoch": 0.7966283114797966,
"grad_norm": 3.605431079864502,
"learning_rate": 9.676985505764575e-06,
"loss": 1.1839,
"step": 2977
},
{
"epoch": 0.7968959058067969,
"grad_norm": 3.253654956817627,
"learning_rate": 9.676672513055496e-06,
"loss": 1.0312,
"step": 2978
},
{
"epoch": 0.7971635001337971,
"grad_norm": 3.57499361038208,
"learning_rate": 9.67635937384562e-06,
"loss": 1.114,
"step": 2979
},
{
"epoch": 0.7974310944607974,
"grad_norm": 3.2494728565216064,
"learning_rate": 9.676046088144755e-06,
"loss": 1.0676,
"step": 2980
},
{
"epoch": 0.7976986887877977,
"grad_norm": 3.5611202716827393,
"learning_rate": 9.675732655962716e-06,
"loss": 1.1961,
"step": 2981
},
{
"epoch": 0.797966283114798,
"grad_norm": 3.6243984699249268,
"learning_rate": 9.675419077309323e-06,
"loss": 1.1931,
"step": 2982
},
{
"epoch": 0.7982338774417982,
"grad_norm": 3.6850814819335938,
"learning_rate": 9.675105352194396e-06,
"loss": 1.0914,
"step": 2983
},
{
"epoch": 0.7985014717687985,
"grad_norm": 3.424598455429077,
"learning_rate": 9.674791480627763e-06,
"loss": 1.2153,
"step": 2984
},
{
"epoch": 0.7987690660957988,
"grad_norm": 3.0985870361328125,
"learning_rate": 9.67447746261926e-06,
"loss": 0.9851,
"step": 2985
},
{
"epoch": 0.7990366604227991,
"grad_norm": 3.504242181777954,
"learning_rate": 9.67416329817872e-06,
"loss": 1.1605,
"step": 2986
},
{
"epoch": 0.7993042547497993,
"grad_norm": 3.373812198638916,
"learning_rate": 9.673848987315986e-06,
"loss": 1.2192,
"step": 2987
},
{
"epoch": 0.7995718490767996,
"grad_norm": 3.4131312370300293,
"learning_rate": 9.673534530040905e-06,
"loss": 1.0798,
"step": 2988
},
{
"epoch": 0.7998394434037999,
"grad_norm": 3.4545650482177734,
"learning_rate": 9.673219926363325e-06,
"loss": 1.078,
"step": 2989
},
{
"epoch": 0.8001070377308,
"grad_norm": 3.47906231880188,
"learning_rate": 9.672905176293103e-06,
"loss": 1.0452,
"step": 2990
},
{
"epoch": 0.8003746320578004,
"grad_norm": 3.450021505355835,
"learning_rate": 9.6725902798401e-06,
"loss": 1.1115,
"step": 2991
},
{
"epoch": 0.8006422263848006,
"grad_norm": 3.7371201515197754,
"learning_rate": 9.672275237014178e-06,
"loss": 1.2083,
"step": 2992
},
{
"epoch": 0.800909820711801,
"grad_norm": 3.5337791442871094,
"learning_rate": 9.671960047825207e-06,
"loss": 1.1462,
"step": 2993
},
{
"epoch": 0.8011774150388011,
"grad_norm": 3.226942539215088,
"learning_rate": 9.671644712283061e-06,
"loss": 1.1274,
"step": 2994
},
{
"epoch": 0.8014450093658014,
"grad_norm": 3.127251625061035,
"learning_rate": 9.671329230397616e-06,
"loss": 1.0761,
"step": 2995
},
{
"epoch": 0.8017126036928017,
"grad_norm": 3.322313070297241,
"learning_rate": 9.67101360217876e-06,
"loss": 1.0466,
"step": 2996
},
{
"epoch": 0.801980198019802,
"grad_norm": 3.0869202613830566,
"learning_rate": 9.670697827636374e-06,
"loss": 1.0175,
"step": 2997
},
{
"epoch": 0.8022477923468022,
"grad_norm": 3.7030136585235596,
"learning_rate": 9.670381906780354e-06,
"loss": 1.2653,
"step": 2998
},
{
"epoch": 0.8025153866738025,
"grad_norm": 3.163114070892334,
"learning_rate": 9.670065839620594e-06,
"loss": 1.0952,
"step": 2999
},
{
"epoch": 0.8027829810008028,
"grad_norm": 3.4521522521972656,
"learning_rate": 9.669749626166998e-06,
"loss": 1.1834,
"step": 3000
},
{
"epoch": 0.8027829810008028,
"eval_loss": 1.1542552709579468,
"eval_runtime": 11.69,
"eval_samples_per_second": 34.217,
"eval_steps_per_second": 4.277,
"step": 3000
},
{
"epoch": 0.803050575327803,
"grad_norm": 3.3275747299194336,
"learning_rate": 9.669433266429468e-06,
"loss": 1.127,
"step": 3001
},
{
"epoch": 0.8033181696548033,
"grad_norm": 3.7543275356292725,
"learning_rate": 9.669116760417919e-06,
"loss": 1.2466,
"step": 3002
},
{
"epoch": 0.8035857639818036,
"grad_norm": 3.295300006866455,
"learning_rate": 9.66880010814226e-06,
"loss": 1.0984,
"step": 3003
},
{
"epoch": 0.8038533583088039,
"grad_norm": 3.4165024757385254,
"learning_rate": 9.668483309612415e-06,
"loss": 1.0271,
"step": 3004
},
{
"epoch": 0.8041209526358041,
"grad_norm": 3.307145357131958,
"learning_rate": 9.668166364838306e-06,
"loss": 1.1855,
"step": 3005
},
{
"epoch": 0.8043885469628044,
"grad_norm": 3.409726858139038,
"learning_rate": 9.667849273829861e-06,
"loss": 1.0533,
"step": 3006
},
{
"epoch": 0.8046561412898047,
"grad_norm": 3.490656852722168,
"learning_rate": 9.667532036597017e-06,
"loss": 1.14,
"step": 3007
},
{
"epoch": 0.804923735616805,
"grad_norm": 3.395625352859497,
"learning_rate": 9.667214653149706e-06,
"loss": 1.2552,
"step": 3008
},
{
"epoch": 0.8051913299438052,
"grad_norm": 3.3872432708740234,
"learning_rate": 9.666897123497874e-06,
"loss": 1.1062,
"step": 3009
},
{
"epoch": 0.8054589242708055,
"grad_norm": 3.4740712642669678,
"learning_rate": 9.666579447651467e-06,
"loss": 1.1967,
"step": 3010
},
{
"epoch": 0.8057265185978058,
"grad_norm": 3.5477073192596436,
"learning_rate": 9.666261625620437e-06,
"loss": 1.0399,
"step": 3011
},
{
"epoch": 0.8059941129248059,
"grad_norm": 3.2551109790802,
"learning_rate": 9.665943657414738e-06,
"loss": 1.1364,
"step": 3012
},
{
"epoch": 0.8062617072518062,
"grad_norm": 3.1551992893218994,
"learning_rate": 9.665625543044335e-06,
"loss": 1.1235,
"step": 3013
},
{
"epoch": 0.8065293015788065,
"grad_norm": 3.2002670764923096,
"learning_rate": 9.66530728251919e-06,
"loss": 1.06,
"step": 3014
},
{
"epoch": 0.8067968959058068,
"grad_norm": 3.1332433223724365,
"learning_rate": 9.664988875849271e-06,
"loss": 1.0827,
"step": 3015
},
{
"epoch": 0.807064490232807,
"grad_norm": 3.4181861877441406,
"learning_rate": 9.664670323044555e-06,
"loss": 1.108,
"step": 3016
},
{
"epoch": 0.8073320845598073,
"grad_norm": 3.91221284866333,
"learning_rate": 9.66435162411502e-06,
"loss": 1.0166,
"step": 3017
},
{
"epoch": 0.8075996788868076,
"grad_norm": 3.2280433177948,
"learning_rate": 9.664032779070652e-06,
"loss": 1.1096,
"step": 3018
},
{
"epoch": 0.8078672732138079,
"grad_norm": 3.229264259338379,
"learning_rate": 9.663713787921436e-06,
"loss": 1.0637,
"step": 3019
},
{
"epoch": 0.8081348675408081,
"grad_norm": 3.3362436294555664,
"learning_rate": 9.663394650677368e-06,
"loss": 1.0432,
"step": 3020
},
{
"epoch": 0.8084024618678084,
"grad_norm": 3.3346054553985596,
"learning_rate": 9.66307536734844e-06,
"loss": 1.164,
"step": 3021
},
{
"epoch": 0.8086700561948087,
"grad_norm": 3.382387399673462,
"learning_rate": 9.662755937944657e-06,
"loss": 1.0578,
"step": 3022
},
{
"epoch": 0.808937650521809,
"grad_norm": 3.3161141872406006,
"learning_rate": 9.662436362476026e-06,
"loss": 0.9829,
"step": 3023
},
{
"epoch": 0.8092052448488092,
"grad_norm": 3.457970380783081,
"learning_rate": 9.662116640952558e-06,
"loss": 1.2865,
"step": 3024
},
{
"epoch": 0.8094728391758095,
"grad_norm": 3.1441056728363037,
"learning_rate": 9.661796773384266e-06,
"loss": 1.0722,
"step": 3025
},
{
"epoch": 0.8097404335028098,
"grad_norm": 3.2600796222686768,
"learning_rate": 9.661476759781174e-06,
"loss": 1.0949,
"step": 3026
},
{
"epoch": 0.81000802782981,
"grad_norm": 3.8801653385162354,
"learning_rate": 9.661156600153304e-06,
"loss": 1.2197,
"step": 3027
},
{
"epoch": 0.8102756221568103,
"grad_norm": 3.6208014488220215,
"learning_rate": 9.660836294510685e-06,
"loss": 1.2421,
"step": 3028
},
{
"epoch": 0.8105432164838106,
"grad_norm": 3.5174331665039062,
"learning_rate": 9.660515842863352e-06,
"loss": 1.0628,
"step": 3029
},
{
"epoch": 0.8108108108108109,
"grad_norm": 3.285752058029175,
"learning_rate": 9.660195245221345e-06,
"loss": 1.2271,
"step": 3030
},
{
"epoch": 0.811078405137811,
"grad_norm": 3.3572685718536377,
"learning_rate": 9.659874501594705e-06,
"loss": 1.1451,
"step": 3031
},
{
"epoch": 0.8113459994648113,
"grad_norm": 3.5392873287200928,
"learning_rate": 9.659553611993478e-06,
"loss": 1.2389,
"step": 3032
},
{
"epoch": 0.8116135937918116,
"grad_norm": 3.6059014797210693,
"learning_rate": 9.659232576427718e-06,
"loss": 1.2854,
"step": 3033
},
{
"epoch": 0.8118811881188119,
"grad_norm": 3.2821319103240967,
"learning_rate": 9.65891139490748e-06,
"loss": 1.0982,
"step": 3034
},
{
"epoch": 0.8121487824458121,
"grad_norm": 3.2728023529052734,
"learning_rate": 9.65859006744283e-06,
"loss": 1.0598,
"step": 3035
},
{
"epoch": 0.8124163767728124,
"grad_norm": 3.7586371898651123,
"learning_rate": 9.65826859404383e-06,
"loss": 1.2271,
"step": 3036
},
{
"epoch": 0.8126839710998127,
"grad_norm": 3.513029098510742,
"learning_rate": 9.65794697472055e-06,
"loss": 1.0671,
"step": 3037
},
{
"epoch": 0.8129515654268129,
"grad_norm": 3.1939735412597656,
"learning_rate": 9.657625209483066e-06,
"loss": 1.0949,
"step": 3038
},
{
"epoch": 0.8132191597538132,
"grad_norm": 3.557431221008301,
"learning_rate": 9.65730329834146e-06,
"loss": 1.1211,
"step": 3039
},
{
"epoch": 0.8134867540808135,
"grad_norm": 3.6598188877105713,
"learning_rate": 9.656981241305811e-06,
"loss": 1.1189,
"step": 3040
},
{
"epoch": 0.8137543484078138,
"grad_norm": 3.781261444091797,
"learning_rate": 9.656659038386213e-06,
"loss": 1.1389,
"step": 3041
},
{
"epoch": 0.814021942734814,
"grad_norm": 3.638216733932495,
"learning_rate": 9.656336689592756e-06,
"loss": 1.3045,
"step": 3042
},
{
"epoch": 0.8142895370618143,
"grad_norm": 3.7151739597320557,
"learning_rate": 9.65601419493554e-06,
"loss": 1.3391,
"step": 3043
},
{
"epoch": 0.8145571313888146,
"grad_norm": 3.482971429824829,
"learning_rate": 9.655691554424664e-06,
"loss": 1.0059,
"step": 3044
},
{
"epoch": 0.8148247257158149,
"grad_norm": 3.6908507347106934,
"learning_rate": 9.655368768070239e-06,
"loss": 1.33,
"step": 3045
},
{
"epoch": 0.8150923200428151,
"grad_norm": 3.2988486289978027,
"learning_rate": 9.655045835882373e-06,
"loss": 1.0606,
"step": 3046
},
{
"epoch": 0.8153599143698154,
"grad_norm": 3.3644847869873047,
"learning_rate": 9.654722757871184e-06,
"loss": 1.1128,
"step": 3047
},
{
"epoch": 0.8156275086968157,
"grad_norm": 3.3999931812286377,
"learning_rate": 9.654399534046795e-06,
"loss": 1.1453,
"step": 3048
},
{
"epoch": 0.8158951030238158,
"grad_norm": 3.4176931381225586,
"learning_rate": 9.654076164419326e-06,
"loss": 1.1062,
"step": 3049
},
{
"epoch": 0.8161626973508161,
"grad_norm": 3.199340581893921,
"learning_rate": 9.65375264899891e-06,
"loss": 1.1005,
"step": 3050
},
{
"epoch": 0.8164302916778164,
"grad_norm": 3.3121516704559326,
"learning_rate": 9.653428987795684e-06,
"loss": 1.0958,
"step": 3051
},
{
"epoch": 0.8166978860048167,
"grad_norm": 3.3458409309387207,
"learning_rate": 9.65310518081978e-06,
"loss": 1.1123,
"step": 3052
},
{
"epoch": 0.8169654803318169,
"grad_norm": 3.438964366912842,
"learning_rate": 9.652781228081348e-06,
"loss": 1.2157,
"step": 3053
},
{
"epoch": 0.8172330746588172,
"grad_norm": 3.2331788539886475,
"learning_rate": 9.652457129590534e-06,
"loss": 1.0771,
"step": 3054
},
{
"epoch": 0.8175006689858175,
"grad_norm": 3.585362434387207,
"learning_rate": 9.652132885357488e-06,
"loss": 1.2097,
"step": 3055
},
{
"epoch": 0.8177682633128178,
"grad_norm": 3.1754887104034424,
"learning_rate": 9.65180849539237e-06,
"loss": 1.0281,
"step": 3056
},
{
"epoch": 0.818035857639818,
"grad_norm": 3.4600307941436768,
"learning_rate": 9.651483959705344e-06,
"loss": 1.1359,
"step": 3057
},
{
"epoch": 0.8183034519668183,
"grad_norm": 3.3886213302612305,
"learning_rate": 9.65115927830657e-06,
"loss": 1.0735,
"step": 3058
},
{
"epoch": 0.8185710462938186,
"grad_norm": 3.5005595684051514,
"learning_rate": 9.650834451206225e-06,
"loss": 1.0937,
"step": 3059
},
{
"epoch": 0.8188386406208188,
"grad_norm": 3.592665672302246,
"learning_rate": 9.650509478414483e-06,
"loss": 1.1554,
"step": 3060
},
{
"epoch": 0.8191062349478191,
"grad_norm": 3.504587173461914,
"learning_rate": 9.650184359941522e-06,
"loss": 1.0877,
"step": 3061
},
{
"epoch": 0.8193738292748194,
"grad_norm": 3.803943634033203,
"learning_rate": 9.649859095797526e-06,
"loss": 1.2895,
"step": 3062
},
{
"epoch": 0.8196414236018197,
"grad_norm": 3.929657459259033,
"learning_rate": 9.649533685992687e-06,
"loss": 1.2047,
"step": 3063
},
{
"epoch": 0.8199090179288199,
"grad_norm": 3.074686288833618,
"learning_rate": 9.649208130537199e-06,
"loss": 1.0744,
"step": 3064
},
{
"epoch": 0.8201766122558202,
"grad_norm": 3.3522446155548096,
"learning_rate": 9.648882429441258e-06,
"loss": 1.1105,
"step": 3065
},
{
"epoch": 0.8204442065828205,
"grad_norm": 3.2287309169769287,
"learning_rate": 9.648556582715067e-06,
"loss": 1.1542,
"step": 3066
},
{
"epoch": 0.8207118009098208,
"grad_norm": 3.072052478790283,
"learning_rate": 9.648230590368836e-06,
"loss": 1.0983,
"step": 3067
},
{
"epoch": 0.820979395236821,
"grad_norm": 3.3558244705200195,
"learning_rate": 9.647904452412774e-06,
"loss": 1.1362,
"step": 3068
},
{
"epoch": 0.8212469895638213,
"grad_norm": 3.917283296585083,
"learning_rate": 9.647578168857101e-06,
"loss": 1.1743,
"step": 3069
},
{
"epoch": 0.8215145838908215,
"grad_norm": 3.6273481845855713,
"learning_rate": 9.647251739712034e-06,
"loss": 1.2516,
"step": 3070
},
{
"epoch": 0.8217821782178217,
"grad_norm": 3.0678317546844482,
"learning_rate": 9.646925164987802e-06,
"loss": 1.0271,
"step": 3071
},
{
"epoch": 0.822049772544822,
"grad_norm": 2.849170684814453,
"learning_rate": 9.646598444694631e-06,
"loss": 0.9816,
"step": 3072
},
{
"epoch": 0.8223173668718223,
"grad_norm": 3.5290982723236084,
"learning_rate": 9.64627157884276e-06,
"loss": 1.1911,
"step": 3073
},
{
"epoch": 0.8225849611988226,
"grad_norm": 3.403162717819214,
"learning_rate": 9.645944567442429e-06,
"loss": 1.1342,
"step": 3074
},
{
"epoch": 0.8228525555258228,
"grad_norm": 3.4919958114624023,
"learning_rate": 9.645617410503879e-06,
"loss": 1.256,
"step": 3075
},
{
"epoch": 0.8231201498528231,
"grad_norm": 3.1679487228393555,
"learning_rate": 9.645290108037358e-06,
"loss": 1.0346,
"step": 3076
},
{
"epoch": 0.8233877441798234,
"grad_norm": 3.7232227325439453,
"learning_rate": 9.644962660053122e-06,
"loss": 1.2307,
"step": 3077
},
{
"epoch": 0.8236553385068237,
"grad_norm": 3.162550449371338,
"learning_rate": 9.644635066561426e-06,
"loss": 1.0,
"step": 3078
},
{
"epoch": 0.8239229328338239,
"grad_norm": 3.254295825958252,
"learning_rate": 9.644307327572533e-06,
"loss": 0.9921,
"step": 3079
},
{
"epoch": 0.8241905271608242,
"grad_norm": 3.4591927528381348,
"learning_rate": 9.643979443096711e-06,
"loss": 1.149,
"step": 3080
},
{
"epoch": 0.8244581214878245,
"grad_norm": 3.499791383743286,
"learning_rate": 9.64365141314423e-06,
"loss": 1.1439,
"step": 3081
},
{
"epoch": 0.8247257158148247,
"grad_norm": 4.038766860961914,
"learning_rate": 9.643323237725366e-06,
"loss": 1.2011,
"step": 3082
},
{
"epoch": 0.824993310141825,
"grad_norm": 3.279536247253418,
"learning_rate": 9.6429949168504e-06,
"loss": 1.1845,
"step": 3083
},
{
"epoch": 0.8252609044688253,
"grad_norm": 3.441106081008911,
"learning_rate": 9.642666450529613e-06,
"loss": 1.1406,
"step": 3084
},
{
"epoch": 0.8255284987958256,
"grad_norm": 3.512998104095459,
"learning_rate": 9.6423378387733e-06,
"loss": 1.1178,
"step": 3085
},
{
"epoch": 0.8257960931228258,
"grad_norm": 3.23618483543396,
"learning_rate": 9.642009081591753e-06,
"loss": 1.0476,
"step": 3086
},
{
"epoch": 0.8260636874498261,
"grad_norm": 3.6994218826293945,
"learning_rate": 9.641680178995272e-06,
"loss": 1.3332,
"step": 3087
},
{
"epoch": 0.8263312817768264,
"grad_norm": 3.1423802375793457,
"learning_rate": 9.641351130994155e-06,
"loss": 1.0252,
"step": 3088
},
{
"epoch": 0.8265988761038267,
"grad_norm": 3.7969133853912354,
"learning_rate": 9.641021937598715e-06,
"loss": 1.2813,
"step": 3089
},
{
"epoch": 0.8268664704308268,
"grad_norm": 3.5946247577667236,
"learning_rate": 9.640692598819263e-06,
"loss": 1.2722,
"step": 3090
},
{
"epoch": 0.8271340647578271,
"grad_norm": 3.4758689403533936,
"learning_rate": 9.640363114666115e-06,
"loss": 1.0492,
"step": 3091
},
{
"epoch": 0.8274016590848274,
"grad_norm": 3.2242352962493896,
"learning_rate": 9.640033485149594e-06,
"loss": 1.0117,
"step": 3092
},
{
"epoch": 0.8276692534118276,
"grad_norm": 3.510794162750244,
"learning_rate": 9.639703710280022e-06,
"loss": 1.1141,
"step": 3093
},
{
"epoch": 0.8279368477388279,
"grad_norm": 3.322143077850342,
"learning_rate": 9.639373790067734e-06,
"loss": 1.1013,
"step": 3094
},
{
"epoch": 0.8282044420658282,
"grad_norm": 3.6232595443725586,
"learning_rate": 9.639043724523063e-06,
"loss": 1.0961,
"step": 3095
},
{
"epoch": 0.8284720363928285,
"grad_norm": 3.486630439758301,
"learning_rate": 9.638713513656348e-06,
"loss": 1.2134,
"step": 3096
},
{
"epoch": 0.8287396307198287,
"grad_norm": 3.1779515743255615,
"learning_rate": 9.638383157477935e-06,
"loss": 1.0994,
"step": 3097
},
{
"epoch": 0.829007225046829,
"grad_norm": 3.2651169300079346,
"learning_rate": 9.638052655998172e-06,
"loss": 1.0744,
"step": 3098
},
{
"epoch": 0.8292748193738293,
"grad_norm": 4.6499152183532715,
"learning_rate": 9.63772200922741e-06,
"loss": 1.1698,
"step": 3099
},
{
"epoch": 0.8295424137008296,
"grad_norm": 3.5739243030548096,
"learning_rate": 9.63739121717601e-06,
"loss": 1.1934,
"step": 3100
},
{
"epoch": 0.8298100080278298,
"grad_norm": 3.5878965854644775,
"learning_rate": 9.637060279854331e-06,
"loss": 1.2052,
"step": 3101
},
{
"epoch": 0.8300776023548301,
"grad_norm": 3.2450151443481445,
"learning_rate": 9.636729197272745e-06,
"loss": 1.0594,
"step": 3102
},
{
"epoch": 0.8303451966818304,
"grad_norm": 3.224311351776123,
"learning_rate": 9.636397969441617e-06,
"loss": 1.1101,
"step": 3103
},
{
"epoch": 0.8306127910088306,
"grad_norm": 3.1356136798858643,
"learning_rate": 9.63606659637133e-06,
"loss": 1.0276,
"step": 3104
},
{
"epoch": 0.8308803853358309,
"grad_norm": 3.4842357635498047,
"learning_rate": 9.635735078072259e-06,
"loss": 1.2134,
"step": 3105
},
{
"epoch": 0.8311479796628312,
"grad_norm": 3.485252857208252,
"learning_rate": 9.635403414554791e-06,
"loss": 1.0805,
"step": 3106
},
{
"epoch": 0.8314155739898315,
"grad_norm": 3.630953311920166,
"learning_rate": 9.635071605829315e-06,
"loss": 1.1837,
"step": 3107
},
{
"epoch": 0.8316831683168316,
"grad_norm": 3.5035400390625,
"learning_rate": 9.634739651906227e-06,
"loss": 1.142,
"step": 3108
},
{
"epoch": 0.8319507626438319,
"grad_norm": 3.4590489864349365,
"learning_rate": 9.634407552795924e-06,
"loss": 1.1785,
"step": 3109
},
{
"epoch": 0.8322183569708322,
"grad_norm": 3.730466842651367,
"learning_rate": 9.63407530850881e-06,
"loss": 1.1447,
"step": 3110
},
{
"epoch": 0.8324859512978325,
"grad_norm": 3.295057535171509,
"learning_rate": 9.633742919055294e-06,
"loss": 1.0757,
"step": 3111
},
{
"epoch": 0.8327535456248327,
"grad_norm": 3.471201181411743,
"learning_rate": 9.633410384445785e-06,
"loss": 1.1772,
"step": 3112
},
{
"epoch": 0.833021139951833,
"grad_norm": 3.329434633255005,
"learning_rate": 9.633077704690702e-06,
"loss": 1.2623,
"step": 3113
},
{
"epoch": 0.8332887342788333,
"grad_norm": 3.593980073928833,
"learning_rate": 9.632744879800468e-06,
"loss": 1.1858,
"step": 3114
},
{
"epoch": 0.8335563286058335,
"grad_norm": 3.156765937805176,
"learning_rate": 9.632411909785506e-06,
"loss": 1.1067,
"step": 3115
},
{
"epoch": 0.8338239229328338,
"grad_norm": 3.3649260997772217,
"learning_rate": 9.632078794656249e-06,
"loss": 1.1243,
"step": 3116
},
{
"epoch": 0.8340915172598341,
"grad_norm": 3.270552635192871,
"learning_rate": 9.631745534423132e-06,
"loss": 1.1653,
"step": 3117
},
{
"epoch": 0.8343591115868344,
"grad_norm": 2.9742650985717773,
"learning_rate": 9.631412129096591e-06,
"loss": 1.0039,
"step": 3118
},
{
"epoch": 0.8346267059138346,
"grad_norm": 3.710505247116089,
"learning_rate": 9.631078578687077e-06,
"loss": 1.1613,
"step": 3119
},
{
"epoch": 0.8348943002408349,
"grad_norm": 3.2119741439819336,
"learning_rate": 9.630744883205031e-06,
"loss": 1.1568,
"step": 3120
},
{
"epoch": 0.8351618945678352,
"grad_norm": 3.492464065551758,
"learning_rate": 9.630411042660913e-06,
"loss": 1.3087,
"step": 3121
},
{
"epoch": 0.8354294888948355,
"grad_norm": 4.012518405914307,
"learning_rate": 9.630077057065177e-06,
"loss": 1.1665,
"step": 3122
},
{
"epoch": 0.8356970832218357,
"grad_norm": 3.5707449913024902,
"learning_rate": 9.629742926428287e-06,
"loss": 1.1712,
"step": 3123
},
{
"epoch": 0.835964677548836,
"grad_norm": 3.179173469543457,
"learning_rate": 9.629408650760707e-06,
"loss": 1.0721,
"step": 3124
},
{
"epoch": 0.8362322718758363,
"grad_norm": 3.2190163135528564,
"learning_rate": 9.629074230072913e-06,
"loss": 1.1279,
"step": 3125
},
{
"epoch": 0.8364998662028365,
"grad_norm": 3.036876916885376,
"learning_rate": 9.62873966437538e-06,
"loss": 1.0463,
"step": 3126
},
{
"epoch": 0.8367674605298367,
"grad_norm": 3.333547592163086,
"learning_rate": 9.628404953678585e-06,
"loss": 1.1396,
"step": 3127
},
{
"epoch": 0.837035054856837,
"grad_norm": 3.266360282897949,
"learning_rate": 9.628070097993016e-06,
"loss": 1.1264,
"step": 3128
},
{
"epoch": 0.8373026491838373,
"grad_norm": 3.3879363536834717,
"learning_rate": 9.627735097329161e-06,
"loss": 1.0972,
"step": 3129
},
{
"epoch": 0.8375702435108375,
"grad_norm": 3.3518929481506348,
"learning_rate": 9.627399951697516e-06,
"loss": 1.1234,
"step": 3130
},
{
"epoch": 0.8378378378378378,
"grad_norm": 3.3172409534454346,
"learning_rate": 9.627064661108581e-06,
"loss": 1.0768,
"step": 3131
},
{
"epoch": 0.8381054321648381,
"grad_norm": 3.6157588958740234,
"learning_rate": 9.626729225572854e-06,
"loss": 1.2114,
"step": 3132
},
{
"epoch": 0.8383730264918384,
"grad_norm": 3.2437682151794434,
"learning_rate": 9.626393645100849e-06,
"loss": 1.0175,
"step": 3133
},
{
"epoch": 0.8386406208188386,
"grad_norm": 3.443774461746216,
"learning_rate": 9.626057919703073e-06,
"loss": 1.1866,
"step": 3134
},
{
"epoch": 0.8389082151458389,
"grad_norm": 3.1143884658813477,
"learning_rate": 9.625722049390048e-06,
"loss": 0.9715,
"step": 3135
},
{
"epoch": 0.8391758094728392,
"grad_norm": 3.3151462078094482,
"learning_rate": 9.62538603417229e-06,
"loss": 1.0459,
"step": 3136
},
{
"epoch": 0.8394434037998394,
"grad_norm": 3.691002368927002,
"learning_rate": 9.625049874060331e-06,
"loss": 1.1284,
"step": 3137
},
{
"epoch": 0.8397109981268397,
"grad_norm": 3.0173420906066895,
"learning_rate": 9.624713569064695e-06,
"loss": 0.9815,
"step": 3138
},
{
"epoch": 0.83997859245384,
"grad_norm": 3.3124630451202393,
"learning_rate": 9.624377119195922e-06,
"loss": 1.1042,
"step": 3139
},
{
"epoch": 0.8402461867808403,
"grad_norm": 3.262075424194336,
"learning_rate": 9.624040524464548e-06,
"loss": 1.1501,
"step": 3140
},
{
"epoch": 0.8405137811078405,
"grad_norm": 3.391528367996216,
"learning_rate": 9.623703784881121e-06,
"loss": 1.086,
"step": 3141
},
{
"epoch": 0.8407813754348408,
"grad_norm": 3.690544843673706,
"learning_rate": 9.623366900456186e-06,
"loss": 1.1857,
"step": 3142
},
{
"epoch": 0.8410489697618411,
"grad_norm": 3.2583820819854736,
"learning_rate": 9.6230298712003e-06,
"loss": 1.063,
"step": 3143
},
{
"epoch": 0.8413165640888414,
"grad_norm": 3.278346300125122,
"learning_rate": 9.622692697124016e-06,
"loss": 1.1059,
"step": 3144
},
{
"epoch": 0.8415841584158416,
"grad_norm": 3.320652484893799,
"learning_rate": 9.6223553782379e-06,
"loss": 1.165,
"step": 3145
},
{
"epoch": 0.8418517527428419,
"grad_norm": 3.6142923831939697,
"learning_rate": 9.622017914552519e-06,
"loss": 1.1734,
"step": 3146
},
{
"epoch": 0.8421193470698422,
"grad_norm": 3.483147382736206,
"learning_rate": 9.62168030607844e-06,
"loss": 1.0495,
"step": 3147
},
{
"epoch": 0.8423869413968423,
"grad_norm": 3.2388815879821777,
"learning_rate": 9.621342552826245e-06,
"loss": 1.0552,
"step": 3148
},
{
"epoch": 0.8426545357238426,
"grad_norm": 3.1021432876586914,
"learning_rate": 9.62100465480651e-06,
"loss": 0.9876,
"step": 3149
},
{
"epoch": 0.8429221300508429,
"grad_norm": 3.7463855743408203,
"learning_rate": 9.62066661202982e-06,
"loss": 1.0824,
"step": 3150
},
{
"epoch": 0.8431897243778432,
"grad_norm": 3.345280170440674,
"learning_rate": 9.620328424506767e-06,
"loss": 1.0385,
"step": 3151
},
{
"epoch": 0.8434573187048434,
"grad_norm": 3.582469940185547,
"learning_rate": 9.619990092247943e-06,
"loss": 1.219,
"step": 3152
},
{
"epoch": 0.8437249130318437,
"grad_norm": 3.824211835861206,
"learning_rate": 9.619651615263948e-06,
"loss": 1.4056,
"step": 3153
},
{
"epoch": 0.843992507358844,
"grad_norm": 3.295612335205078,
"learning_rate": 9.619312993565382e-06,
"loss": 1.0493,
"step": 3154
},
{
"epoch": 0.8442601016858443,
"grad_norm": 3.390982151031494,
"learning_rate": 9.618974227162857e-06,
"loss": 1.136,
"step": 3155
},
{
"epoch": 0.8445276960128445,
"grad_norm": 3.5141913890838623,
"learning_rate": 9.618635316066984e-06,
"loss": 1.1228,
"step": 3156
},
{
"epoch": 0.8447952903398448,
"grad_norm": 3.6350278854370117,
"learning_rate": 9.618296260288376e-06,
"loss": 1.2088,
"step": 3157
},
{
"epoch": 0.8450628846668451,
"grad_norm": 3.1930181980133057,
"learning_rate": 9.617957059837659e-06,
"loss": 1.1015,
"step": 3158
},
{
"epoch": 0.8453304789938453,
"grad_norm": 3.7268929481506348,
"learning_rate": 9.617617714725456e-06,
"loss": 1.18,
"step": 3159
},
{
"epoch": 0.8455980733208456,
"grad_norm": 3.712311267852783,
"learning_rate": 9.617278224962398e-06,
"loss": 1.1109,
"step": 3160
},
{
"epoch": 0.8458656676478459,
"grad_norm": 3.326599597930908,
"learning_rate": 9.616938590559121e-06,
"loss": 1.0733,
"step": 3161
},
{
"epoch": 0.8461332619748462,
"grad_norm": 3.1389646530151367,
"learning_rate": 9.616598811526263e-06,
"loss": 1.0736,
"step": 3162
},
{
"epoch": 0.8464008563018464,
"grad_norm": 3.1288650035858154,
"learning_rate": 9.616258887874467e-06,
"loss": 1.0572,
"step": 3163
},
{
"epoch": 0.8466684506288467,
"grad_norm": 3.364788055419922,
"learning_rate": 9.615918819614382e-06,
"loss": 1.0658,
"step": 3164
},
{
"epoch": 0.846936044955847,
"grad_norm": 3.18229341506958,
"learning_rate": 9.615578606756663e-06,
"loss": 1.0498,
"step": 3165
},
{
"epoch": 0.8472036392828473,
"grad_norm": 3.276883125305176,
"learning_rate": 9.615238249311964e-06,
"loss": 1.0673,
"step": 3166
},
{
"epoch": 0.8474712336098474,
"grad_norm": 3.2905640602111816,
"learning_rate": 9.61489774729095e-06,
"loss": 1.0059,
"step": 3167
},
{
"epoch": 0.8477388279368477,
"grad_norm": 3.756727933883667,
"learning_rate": 9.614557100704286e-06,
"loss": 1.2645,
"step": 3168
},
{
"epoch": 0.848006422263848,
"grad_norm": 3.3283801078796387,
"learning_rate": 9.614216309562643e-06,
"loss": 1.0832,
"step": 3169
},
{
"epoch": 0.8482740165908482,
"grad_norm": 3.5391414165496826,
"learning_rate": 9.613875373876698e-06,
"loss": 1.0946,
"step": 3170
},
{
"epoch": 0.8485416109178485,
"grad_norm": 3.3885715007781982,
"learning_rate": 9.61353429365713e-06,
"loss": 1.1686,
"step": 3171
},
{
"epoch": 0.8488092052448488,
"grad_norm": 3.24389386177063,
"learning_rate": 9.613193068914623e-06,
"loss": 1.1554,
"step": 3172
},
{
"epoch": 0.8490767995718491,
"grad_norm": 3.2578012943267822,
"learning_rate": 9.612851699659867e-06,
"loss": 1.1124,
"step": 3173
},
{
"epoch": 0.8493443938988493,
"grad_norm": 3.3463966846466064,
"learning_rate": 9.612510185903554e-06,
"loss": 1.0264,
"step": 3174
},
{
"epoch": 0.8496119882258496,
"grad_norm": 3.320957899093628,
"learning_rate": 9.612168527656386e-06,
"loss": 1.1183,
"step": 3175
},
{
"epoch": 0.8498795825528499,
"grad_norm": 3.1176092624664307,
"learning_rate": 9.611826724929063e-06,
"loss": 1.1182,
"step": 3176
},
{
"epoch": 0.8501471768798502,
"grad_norm": 3.338179349899292,
"learning_rate": 9.611484777732292e-06,
"loss": 1.1178,
"step": 3177
},
{
"epoch": 0.8504147712068504,
"grad_norm": 3.7851016521453857,
"learning_rate": 9.611142686076787e-06,
"loss": 1.2339,
"step": 3178
},
{
"epoch": 0.8506823655338507,
"grad_norm": 3.0936129093170166,
"learning_rate": 9.610800449973261e-06,
"loss": 1.1433,
"step": 3179
},
{
"epoch": 0.850949959860851,
"grad_norm": 3.48248291015625,
"learning_rate": 9.610458069432438e-06,
"loss": 1.1971,
"step": 3180
},
{
"epoch": 0.8512175541878512,
"grad_norm": 3.774419069290161,
"learning_rate": 9.610115544465042e-06,
"loss": 1.1778,
"step": 3181
},
{
"epoch": 0.8514851485148515,
"grad_norm": 3.461056709289551,
"learning_rate": 9.609772875081802e-06,
"loss": 1.1425,
"step": 3182
},
{
"epoch": 0.8517527428418518,
"grad_norm": 3.332552671432495,
"learning_rate": 9.609430061293454e-06,
"loss": 1.0041,
"step": 3183
},
{
"epoch": 0.8520203371688521,
"grad_norm": 3.5970587730407715,
"learning_rate": 9.609087103110737e-06,
"loss": 1.2363,
"step": 3184
},
{
"epoch": 0.8522879314958522,
"grad_norm": 3.4365155696868896,
"learning_rate": 9.608744000544392e-06,
"loss": 1.0534,
"step": 3185
},
{
"epoch": 0.8525555258228525,
"grad_norm": 3.2905330657958984,
"learning_rate": 9.60840075360517e-06,
"loss": 1.2355,
"step": 3186
},
{
"epoch": 0.8528231201498528,
"grad_norm": 3.4680607318878174,
"learning_rate": 9.608057362303823e-06,
"loss": 1.0901,
"step": 3187
},
{
"epoch": 0.8530907144768531,
"grad_norm": 3.351891279220581,
"learning_rate": 9.607713826651107e-06,
"loss": 1.1422,
"step": 3188
},
{
"epoch": 0.8533583088038533,
"grad_norm": 3.7744686603546143,
"learning_rate": 9.607370146657782e-06,
"loss": 1.1692,
"step": 3189
},
{
"epoch": 0.8536259031308536,
"grad_norm": 3.2692463397979736,
"learning_rate": 9.607026322334618e-06,
"loss": 1.0488,
"step": 3190
},
{
"epoch": 0.8538934974578539,
"grad_norm": 3.201399564743042,
"learning_rate": 9.606682353692383e-06,
"loss": 0.9253,
"step": 3191
},
{
"epoch": 0.8541610917848541,
"grad_norm": 3.5281589031219482,
"learning_rate": 9.606338240741851e-06,
"loss": 1.1785,
"step": 3192
},
{
"epoch": 0.8544286861118544,
"grad_norm": 3.3514602184295654,
"learning_rate": 9.605993983493804e-06,
"loss": 1.1364,
"step": 3193
},
{
"epoch": 0.8546962804388547,
"grad_norm": 3.6264495849609375,
"learning_rate": 9.605649581959027e-06,
"loss": 1.1206,
"step": 3194
},
{
"epoch": 0.854963874765855,
"grad_norm": 3.1329174041748047,
"learning_rate": 9.605305036148306e-06,
"loss": 1.0666,
"step": 3195
},
{
"epoch": 0.8552314690928552,
"grad_norm": 3.255485773086548,
"learning_rate": 9.604960346072435e-06,
"loss": 0.9613,
"step": 3196
},
{
"epoch": 0.8554990634198555,
"grad_norm": 3.693399429321289,
"learning_rate": 9.604615511742213e-06,
"loss": 1.169,
"step": 3197
},
{
"epoch": 0.8557666577468558,
"grad_norm": 3.0587754249572754,
"learning_rate": 9.604270533168441e-06,
"loss": 1.0926,
"step": 3198
},
{
"epoch": 0.8560342520738561,
"grad_norm": 3.428370952606201,
"learning_rate": 9.603925410361925e-06,
"loss": 1.111,
"step": 3199
},
{
"epoch": 0.8563018464008563,
"grad_norm": 3.2654330730438232,
"learning_rate": 9.603580143333478e-06,
"loss": 1.1336,
"step": 3200
},
{
"epoch": 0.8565694407278566,
"grad_norm": 3.4805808067321777,
"learning_rate": 9.603234732093913e-06,
"loss": 1.1853,
"step": 3201
},
{
"epoch": 0.8568370350548569,
"grad_norm": 3.3785743713378906,
"learning_rate": 9.602889176654055e-06,
"loss": 1.1114,
"step": 3202
},
{
"epoch": 0.857104629381857,
"grad_norm": 3.4433510303497314,
"learning_rate": 9.602543477024725e-06,
"loss": 1.1787,
"step": 3203
},
{
"epoch": 0.8573722237088574,
"grad_norm": 3.078172445297241,
"learning_rate": 9.602197633216754e-06,
"loss": 1.0089,
"step": 3204
},
{
"epoch": 0.8576398180358576,
"grad_norm": 3.009098768234253,
"learning_rate": 9.601851645240974e-06,
"loss": 0.9585,
"step": 3205
},
{
"epoch": 0.857907412362858,
"grad_norm": 3.31787109375,
"learning_rate": 9.601505513108227e-06,
"loss": 1.097,
"step": 3206
},
{
"epoch": 0.8581750066898581,
"grad_norm": 3.802264451980591,
"learning_rate": 9.601159236829353e-06,
"loss": 1.2097,
"step": 3207
},
{
"epoch": 0.8584426010168584,
"grad_norm": 3.393442392349243,
"learning_rate": 9.600812816415199e-06,
"loss": 1.1896,
"step": 3208
},
{
"epoch": 0.8587101953438587,
"grad_norm": 3.611478090286255,
"learning_rate": 9.600466251876618e-06,
"loss": 1.306,
"step": 3209
},
{
"epoch": 0.858977789670859,
"grad_norm": 3.6949093341827393,
"learning_rate": 9.600119543224467e-06,
"loss": 1.0832,
"step": 3210
},
{
"epoch": 0.8592453839978592,
"grad_norm": 3.2041354179382324,
"learning_rate": 9.599772690469606e-06,
"loss": 1.0338,
"step": 3211
},
{
"epoch": 0.8595129783248595,
"grad_norm": 3.4140734672546387,
"learning_rate": 9.599425693622902e-06,
"loss": 1.1597,
"step": 3212
},
{
"epoch": 0.8597805726518598,
"grad_norm": 2.8042356967926025,
"learning_rate": 9.599078552695223e-06,
"loss": 0.9167,
"step": 3213
},
{
"epoch": 0.86004816697886,
"grad_norm": 3.645156145095825,
"learning_rate": 9.598731267697443e-06,
"loss": 1.3054,
"step": 3214
},
{
"epoch": 0.8603157613058603,
"grad_norm": 3.2911882400512695,
"learning_rate": 9.598383838640443e-06,
"loss": 1.0813,
"step": 3215
},
{
"epoch": 0.8605833556328606,
"grad_norm": 3.168053388595581,
"learning_rate": 9.598036265535104e-06,
"loss": 1.0603,
"step": 3216
},
{
"epoch": 0.8608509499598609,
"grad_norm": 3.6250712871551514,
"learning_rate": 9.597688548392319e-06,
"loss": 1.1671,
"step": 3217
},
{
"epoch": 0.8611185442868611,
"grad_norm": 3.570465326309204,
"learning_rate": 9.597340687222975e-06,
"loss": 1.1248,
"step": 3218
},
{
"epoch": 0.8613861386138614,
"grad_norm": 3.188462495803833,
"learning_rate": 9.596992682037973e-06,
"loss": 1.0927,
"step": 3219
},
{
"epoch": 0.8616537329408617,
"grad_norm": 3.166240692138672,
"learning_rate": 9.596644532848211e-06,
"loss": 1.1719,
"step": 3220
},
{
"epoch": 0.861921327267862,
"grad_norm": 3.369922399520874,
"learning_rate": 9.5962962396646e-06,
"loss": 1.0546,
"step": 3221
},
{
"epoch": 0.8621889215948622,
"grad_norm": 3.611721992492676,
"learning_rate": 9.595947802498046e-06,
"loss": 1.1727,
"step": 3222
},
{
"epoch": 0.8624565159218625,
"grad_norm": 3.5370113849639893,
"learning_rate": 9.595599221359464e-06,
"loss": 1.1045,
"step": 3223
},
{
"epoch": 0.8627241102488628,
"grad_norm": 3.5464746952056885,
"learning_rate": 9.595250496259778e-06,
"loss": 1.1146,
"step": 3224
},
{
"epoch": 0.8629917045758629,
"grad_norm": 3.1489906311035156,
"learning_rate": 9.594901627209908e-06,
"loss": 1.0356,
"step": 3225
},
{
"epoch": 0.8632592989028632,
"grad_norm": 3.69189190864563,
"learning_rate": 9.594552614220785e-06,
"loss": 1.2625,
"step": 3226
},
{
"epoch": 0.8635268932298635,
"grad_norm": 3.298753499984741,
"learning_rate": 9.594203457303339e-06,
"loss": 1.0553,
"step": 3227
},
{
"epoch": 0.8637944875568638,
"grad_norm": 3.2291910648345947,
"learning_rate": 9.593854156468512e-06,
"loss": 1.133,
"step": 3228
},
{
"epoch": 0.864062081883864,
"grad_norm": 3.1540310382843018,
"learning_rate": 9.593504711727243e-06,
"loss": 0.9898,
"step": 3229
},
{
"epoch": 0.8643296762108643,
"grad_norm": 3.049051523208618,
"learning_rate": 9.593155123090479e-06,
"loss": 0.9765,
"step": 3230
},
{
"epoch": 0.8645972705378646,
"grad_norm": 3.2508833408355713,
"learning_rate": 9.592805390569173e-06,
"loss": 1.0901,
"step": 3231
},
{
"epoch": 0.8648648648648649,
"grad_norm": 3.9772286415100098,
"learning_rate": 9.59245551417428e-06,
"loss": 1.2102,
"step": 3232
},
{
"epoch": 0.8651324591918651,
"grad_norm": 3.5919082164764404,
"learning_rate": 9.592105493916758e-06,
"loss": 1.1591,
"step": 3233
},
{
"epoch": 0.8654000535188654,
"grad_norm": 3.9090003967285156,
"learning_rate": 9.591755329807574e-06,
"loss": 1.2031,
"step": 3234
},
{
"epoch": 0.8656676478458657,
"grad_norm": 3.496093273162842,
"learning_rate": 9.591405021857697e-06,
"loss": 1.109,
"step": 3235
},
{
"epoch": 0.8659352421728659,
"grad_norm": 3.8281493186950684,
"learning_rate": 9.5910545700781e-06,
"loss": 1.211,
"step": 3236
},
{
"epoch": 0.8662028364998662,
"grad_norm": 3.2927358150482178,
"learning_rate": 9.59070397447976e-06,
"loss": 1.0816,
"step": 3237
},
{
"epoch": 0.8664704308268665,
"grad_norm": 3.4841480255126953,
"learning_rate": 9.590353235073663e-06,
"loss": 1.1187,
"step": 3238
},
{
"epoch": 0.8667380251538668,
"grad_norm": 3.2719860076904297,
"learning_rate": 9.590002351870793e-06,
"loss": 1.0203,
"step": 3239
},
{
"epoch": 0.867005619480867,
"grad_norm": 3.0735063552856445,
"learning_rate": 9.589651324882143e-06,
"loss": 1.1405,
"step": 3240
},
{
"epoch": 0.8672732138078673,
"grad_norm": 3.6215524673461914,
"learning_rate": 9.58930015411871e-06,
"loss": 1.2612,
"step": 3241
},
{
"epoch": 0.8675408081348676,
"grad_norm": 3.0990355014801025,
"learning_rate": 9.588948839591494e-06,
"loss": 1.1183,
"step": 3242
},
{
"epoch": 0.8678084024618679,
"grad_norm": 2.985930919647217,
"learning_rate": 9.5885973813115e-06,
"loss": 1.0287,
"step": 3243
},
{
"epoch": 0.868075996788868,
"grad_norm": 3.4825994968414307,
"learning_rate": 9.588245779289738e-06,
"loss": 1.0794,
"step": 3244
},
{
"epoch": 0.8683435911158683,
"grad_norm": 3.289504289627075,
"learning_rate": 9.587894033537223e-06,
"loss": 1.1203,
"step": 3245
},
{
"epoch": 0.8686111854428686,
"grad_norm": 3.175842761993408,
"learning_rate": 9.587542144064972e-06,
"loss": 1.0932,
"step": 3246
},
{
"epoch": 0.8688787797698688,
"grad_norm": 3.251260995864868,
"learning_rate": 9.587190110884009e-06,
"loss": 1.0971,
"step": 3247
},
{
"epoch": 0.8691463740968691,
"grad_norm": 2.8951752185821533,
"learning_rate": 9.586837934005363e-06,
"loss": 0.9605,
"step": 3248
},
{
"epoch": 0.8694139684238694,
"grad_norm": 3.264331102371216,
"learning_rate": 9.586485613440064e-06,
"loss": 1.1496,
"step": 3249
},
{
"epoch": 0.8696815627508697,
"grad_norm": 3.1973655223846436,
"learning_rate": 9.586133149199151e-06,
"loss": 1.152,
"step": 3250
},
{
"epoch": 0.8699491570778699,
"grad_norm": 2.9676499366760254,
"learning_rate": 9.585780541293663e-06,
"loss": 1.1106,
"step": 3251
},
{
"epoch": 0.8702167514048702,
"grad_norm": 3.147977352142334,
"learning_rate": 9.585427789734647e-06,
"loss": 1.2677,
"step": 3252
},
{
"epoch": 0.8704843457318705,
"grad_norm": 3.3018641471862793,
"learning_rate": 9.585074894533154e-06,
"loss": 1.0454,
"step": 3253
},
{
"epoch": 0.8707519400588708,
"grad_norm": 3.2543413639068604,
"learning_rate": 9.584721855700238e-06,
"loss": 1.0568,
"step": 3254
},
{
"epoch": 0.871019534385871,
"grad_norm": 3.091062545776367,
"learning_rate": 9.584368673246957e-06,
"loss": 1.0498,
"step": 3255
},
{
"epoch": 0.8712871287128713,
"grad_norm": 3.4238486289978027,
"learning_rate": 9.584015347184376e-06,
"loss": 1.0649,
"step": 3256
},
{
"epoch": 0.8715547230398716,
"grad_norm": 3.575374126434326,
"learning_rate": 9.583661877523565e-06,
"loss": 1.1816,
"step": 3257
},
{
"epoch": 0.8718223173668718,
"grad_norm": 3.7600672245025635,
"learning_rate": 9.583308264275593e-06,
"loss": 1.2363,
"step": 3258
},
{
"epoch": 0.8720899116938721,
"grad_norm": 3.4532859325408936,
"learning_rate": 9.58295450745154e-06,
"loss": 1.2357,
"step": 3259
},
{
"epoch": 0.8723575060208724,
"grad_norm": 2.9904305934906006,
"learning_rate": 9.582600607062486e-06,
"loss": 0.9099,
"step": 3260
},
{
"epoch": 0.8726251003478727,
"grad_norm": 3.143056631088257,
"learning_rate": 9.58224656311952e-06,
"loss": 1.1002,
"step": 3261
},
{
"epoch": 0.8728926946748728,
"grad_norm": 3.371873140335083,
"learning_rate": 9.581892375633729e-06,
"loss": 1.1547,
"step": 3262
},
{
"epoch": 0.8731602890018731,
"grad_norm": 3.277872085571289,
"learning_rate": 9.58153804461621e-06,
"loss": 1.0734,
"step": 3263
},
{
"epoch": 0.8734278833288734,
"grad_norm": 3.2630152702331543,
"learning_rate": 9.581183570078064e-06,
"loss": 1.0204,
"step": 3264
},
{
"epoch": 0.8736954776558737,
"grad_norm": 3.5363354682922363,
"learning_rate": 9.580828952030392e-06,
"loss": 1.2343,
"step": 3265
},
{
"epoch": 0.8739630719828739,
"grad_norm": 3.0592634677886963,
"learning_rate": 9.580474190484306e-06,
"loss": 1.0132,
"step": 3266
},
{
"epoch": 0.8742306663098742,
"grad_norm": 3.0633037090301514,
"learning_rate": 9.580119285450917e-06,
"loss": 1.081,
"step": 3267
},
{
"epoch": 0.8744982606368745,
"grad_norm": 3.3034725189208984,
"learning_rate": 9.579764236941345e-06,
"loss": 1.1423,
"step": 3268
},
{
"epoch": 0.8747658549638747,
"grad_norm": 3.3148138523101807,
"learning_rate": 9.57940904496671e-06,
"loss": 1.1116,
"step": 3269
},
{
"epoch": 0.875033449290875,
"grad_norm": 3.3860421180725098,
"learning_rate": 9.57905370953814e-06,
"loss": 1.0291,
"step": 3270
},
{
"epoch": 0.8753010436178753,
"grad_norm": 3.3635714054107666,
"learning_rate": 9.578698230666767e-06,
"loss": 1.117,
"step": 3271
},
{
"epoch": 0.8755686379448756,
"grad_norm": 3.6900482177734375,
"learning_rate": 9.578342608363723e-06,
"loss": 1.1655,
"step": 3272
},
{
"epoch": 0.8758362322718758,
"grad_norm": 3.450373411178589,
"learning_rate": 9.577986842640152e-06,
"loss": 1.1539,
"step": 3273
},
{
"epoch": 0.8761038265988761,
"grad_norm": 3.40311598777771,
"learning_rate": 9.577630933507196e-06,
"loss": 1.1297,
"step": 3274
},
{
"epoch": 0.8763714209258764,
"grad_norm": 3.1564464569091797,
"learning_rate": 9.577274880976007e-06,
"loss": 1.0134,
"step": 3275
},
{
"epoch": 0.8766390152528767,
"grad_norm": 4.080739498138428,
"learning_rate": 9.576918685057736e-06,
"loss": 1.0344,
"step": 3276
},
{
"epoch": 0.8769066095798769,
"grad_norm": 3.6657655239105225,
"learning_rate": 9.576562345763542e-06,
"loss": 1.1657,
"step": 3277
},
{
"epoch": 0.8771742039068772,
"grad_norm": 3.4710497856140137,
"learning_rate": 9.576205863104588e-06,
"loss": 1.2043,
"step": 3278
},
{
"epoch": 0.8774417982338775,
"grad_norm": 3.297534704208374,
"learning_rate": 9.575849237092042e-06,
"loss": 1.0513,
"step": 3279
},
{
"epoch": 0.8777093925608777,
"grad_norm": 3.476609230041504,
"learning_rate": 9.575492467737074e-06,
"loss": 1.1804,
"step": 3280
},
{
"epoch": 0.877976986887878,
"grad_norm": 3.289842367172241,
"learning_rate": 9.575135555050861e-06,
"loss": 1.1133,
"step": 3281
},
{
"epoch": 0.8782445812148783,
"grad_norm": 3.235844135284424,
"learning_rate": 9.574778499044582e-06,
"loss": 1.0974,
"step": 3282
},
{
"epoch": 0.8785121755418785,
"grad_norm": 3.5740182399749756,
"learning_rate": 9.574421299729424e-06,
"loss": 1.2028,
"step": 3283
},
{
"epoch": 0.8787797698688787,
"grad_norm": 3.4164657592773438,
"learning_rate": 9.574063957116575e-06,
"loss": 1.1063,
"step": 3284
},
{
"epoch": 0.879047364195879,
"grad_norm": 3.4936609268188477,
"learning_rate": 9.573706471217232e-06,
"loss": 1.1992,
"step": 3285
},
{
"epoch": 0.8793149585228793,
"grad_norm": 3.449378252029419,
"learning_rate": 9.573348842042592e-06,
"loss": 1.1717,
"step": 3286
},
{
"epoch": 0.8795825528498796,
"grad_norm": 3.8629961013793945,
"learning_rate": 9.572991069603853e-06,
"loss": 1.2591,
"step": 3287
},
{
"epoch": 0.8798501471768798,
"grad_norm": 3.312222957611084,
"learning_rate": 9.572633153912232e-06,
"loss": 1.0761,
"step": 3288
},
{
"epoch": 0.8801177415038801,
"grad_norm": 3.432467460632324,
"learning_rate": 9.572275094978934e-06,
"loss": 1.211,
"step": 3289
},
{
"epoch": 0.8803853358308804,
"grad_norm": 3.3937036991119385,
"learning_rate": 9.571916892815179e-06,
"loss": 1.2013,
"step": 3290
},
{
"epoch": 0.8806529301578806,
"grad_norm": 3.1374263763427734,
"learning_rate": 9.571558547432185e-06,
"loss": 1.1642,
"step": 3291
},
{
"epoch": 0.8809205244848809,
"grad_norm": 3.3817193508148193,
"learning_rate": 9.57120005884118e-06,
"loss": 1.0251,
"step": 3292
},
{
"epoch": 0.8811881188118812,
"grad_norm": 3.5183558464050293,
"learning_rate": 9.570841427053394e-06,
"loss": 1.103,
"step": 3293
},
{
"epoch": 0.8814557131388815,
"grad_norm": 3.377310037612915,
"learning_rate": 9.57048265208006e-06,
"loss": 1.0861,
"step": 3294
},
{
"epoch": 0.8817233074658817,
"grad_norm": 3.5976264476776123,
"learning_rate": 9.570123733932415e-06,
"loss": 1.2058,
"step": 3295
},
{
"epoch": 0.881990901792882,
"grad_norm": 3.367487668991089,
"learning_rate": 9.569764672621707e-06,
"loss": 1.042,
"step": 3296
},
{
"epoch": 0.8822584961198823,
"grad_norm": 3.392971992492676,
"learning_rate": 9.569405468159183e-06,
"loss": 1.1187,
"step": 3297
},
{
"epoch": 0.8825260904468826,
"grad_norm": 3.862159013748169,
"learning_rate": 9.569046120556092e-06,
"loss": 1.2342,
"step": 3298
},
{
"epoch": 0.8827936847738828,
"grad_norm": 2.9790005683898926,
"learning_rate": 9.568686629823693e-06,
"loss": 1.0197,
"step": 3299
},
{
"epoch": 0.8830612791008831,
"grad_norm": 3.4555444717407227,
"learning_rate": 9.56832699597325e-06,
"loss": 1.1651,
"step": 3300
},
{
"epoch": 0.8833288734278834,
"grad_norm": 4.1415557861328125,
"learning_rate": 9.567967219016024e-06,
"loss": 1.2982,
"step": 3301
},
{
"epoch": 0.8835964677548835,
"grad_norm": 3.344966173171997,
"learning_rate": 9.567607298963288e-06,
"loss": 1.1131,
"step": 3302
},
{
"epoch": 0.8838640620818838,
"grad_norm": 3.7621214389801025,
"learning_rate": 9.567247235826316e-06,
"loss": 1.1731,
"step": 3303
},
{
"epoch": 0.8841316564088841,
"grad_norm": 3.282111167907715,
"learning_rate": 9.56688702961639e-06,
"loss": 1.1508,
"step": 3304
},
{
"epoch": 0.8843992507358844,
"grad_norm": 3.501091957092285,
"learning_rate": 9.566526680344788e-06,
"loss": 1.1034,
"step": 3305
},
{
"epoch": 0.8846668450628846,
"grad_norm": 3.3922479152679443,
"learning_rate": 9.566166188022804e-06,
"loss": 1.054,
"step": 3306
},
{
"epoch": 0.8849344393898849,
"grad_norm": 3.4459426403045654,
"learning_rate": 9.565805552661728e-06,
"loss": 1.2095,
"step": 3307
},
{
"epoch": 0.8852020337168852,
"grad_norm": 3.219888210296631,
"learning_rate": 9.565444774272858e-06,
"loss": 1.106,
"step": 3308
},
{
"epoch": 0.8854696280438855,
"grad_norm": 3.378981113433838,
"learning_rate": 9.565083852867494e-06,
"loss": 1.1935,
"step": 3309
},
{
"epoch": 0.8857372223708857,
"grad_norm": 3.991638660430908,
"learning_rate": 9.564722788456943e-06,
"loss": 1.4006,
"step": 3310
},
{
"epoch": 0.886004816697886,
"grad_norm": 3.2675182819366455,
"learning_rate": 9.564361581052519e-06,
"loss": 1.109,
"step": 3311
},
{
"epoch": 0.8862724110248863,
"grad_norm": 3.029271125793457,
"learning_rate": 9.564000230665534e-06,
"loss": 1.0853,
"step": 3312
},
{
"epoch": 0.8865400053518866,
"grad_norm": 3.339115619659424,
"learning_rate": 9.563638737307307e-06,
"loss": 1.1887,
"step": 3313
},
{
"epoch": 0.8868075996788868,
"grad_norm": 3.1957242488861084,
"learning_rate": 9.56327710098916e-06,
"loss": 1.1597,
"step": 3314
},
{
"epoch": 0.8870751940058871,
"grad_norm": 3.588334798812866,
"learning_rate": 9.562915321722428e-06,
"loss": 1.1607,
"step": 3315
},
{
"epoch": 0.8873427883328874,
"grad_norm": 3.3101933002471924,
"learning_rate": 9.56255339951844e-06,
"loss": 1.1174,
"step": 3316
},
{
"epoch": 0.8876103826598876,
"grad_norm": 3.237942934036255,
"learning_rate": 9.562191334388535e-06,
"loss": 1.1496,
"step": 3317
},
{
"epoch": 0.8878779769868879,
"grad_norm": 3.141970634460449,
"learning_rate": 9.561829126344053e-06,
"loss": 1.0682,
"step": 3318
},
{
"epoch": 0.8881455713138882,
"grad_norm": 3.4344382286071777,
"learning_rate": 9.561466775396342e-06,
"loss": 1.0629,
"step": 3319
},
{
"epoch": 0.8884131656408885,
"grad_norm": 3.219492197036743,
"learning_rate": 9.561104281556752e-06,
"loss": 1.167,
"step": 3320
},
{
"epoch": 0.8886807599678886,
"grad_norm": 3.2902910709381104,
"learning_rate": 9.56074164483664e-06,
"loss": 1.0776,
"step": 3321
},
{
"epoch": 0.8889483542948889,
"grad_norm": 3.4856672286987305,
"learning_rate": 9.560378865247363e-06,
"loss": 1.2053,
"step": 3322
},
{
"epoch": 0.8892159486218892,
"grad_norm": 3.6184751987457275,
"learning_rate": 9.560015942800289e-06,
"loss": 1.1847,
"step": 3323
},
{
"epoch": 0.8894835429488895,
"grad_norm": 3.2977712154388428,
"learning_rate": 9.559652877506785e-06,
"loss": 1.1069,
"step": 3324
},
{
"epoch": 0.8897511372758897,
"grad_norm": 3.0380518436431885,
"learning_rate": 9.559289669378224e-06,
"loss": 1.0513,
"step": 3325
},
{
"epoch": 0.89001873160289,
"grad_norm": 3.521289348602295,
"learning_rate": 9.558926318425986e-06,
"loss": 1.0462,
"step": 3326
},
{
"epoch": 0.8902863259298903,
"grad_norm": 2.921584129333496,
"learning_rate": 9.558562824661448e-06,
"loss": 0.9953,
"step": 3327
},
{
"epoch": 0.8905539202568905,
"grad_norm": 3.5694329738616943,
"learning_rate": 9.558199188096004e-06,
"loss": 1.231,
"step": 3328
},
{
"epoch": 0.8908215145838908,
"grad_norm": 3.6749255657196045,
"learning_rate": 9.557835408741039e-06,
"loss": 1.1523,
"step": 3329
},
{
"epoch": 0.8910891089108911,
"grad_norm": 3.4388997554779053,
"learning_rate": 9.557471486607952e-06,
"loss": 1.1669,
"step": 3330
},
{
"epoch": 0.8913567032378914,
"grad_norm": 3.3783130645751953,
"learning_rate": 9.557107421708142e-06,
"loss": 1.0796,
"step": 3331
},
{
"epoch": 0.8916242975648916,
"grad_norm": 3.5907704830169678,
"learning_rate": 9.556743214053017e-06,
"loss": 1.1456,
"step": 3332
},
{
"epoch": 0.8918918918918919,
"grad_norm": 3.543071746826172,
"learning_rate": 9.55637886365398e-06,
"loss": 1.2526,
"step": 3333
},
{
"epoch": 0.8921594862188922,
"grad_norm": 3.632092237472534,
"learning_rate": 9.55601437052245e-06,
"loss": 1.2962,
"step": 3334
},
{
"epoch": 0.8924270805458925,
"grad_norm": 3.31510066986084,
"learning_rate": 9.55564973466984e-06,
"loss": 1.138,
"step": 3335
},
{
"epoch": 0.8926946748728927,
"grad_norm": 3.437994956970215,
"learning_rate": 9.555284956107578e-06,
"loss": 1.0778,
"step": 3336
},
{
"epoch": 0.892962269199893,
"grad_norm": 3.2904369831085205,
"learning_rate": 9.554920034847088e-06,
"loss": 1.1556,
"step": 3337
},
{
"epoch": 0.8932298635268933,
"grad_norm": 3.6388745307922363,
"learning_rate": 9.5545549708998e-06,
"loss": 1.1318,
"step": 3338
},
{
"epoch": 0.8934974578538935,
"grad_norm": 3.85868239402771,
"learning_rate": 9.554189764277155e-06,
"loss": 1.26,
"step": 3339
},
{
"epoch": 0.8937650521808937,
"grad_norm": 3.250420570373535,
"learning_rate": 9.553824414990588e-06,
"loss": 1.0647,
"step": 3340
},
{
"epoch": 0.894032646507894,
"grad_norm": 3.053664445877075,
"learning_rate": 9.553458923051546e-06,
"loss": 1.0382,
"step": 3341
},
{
"epoch": 0.8943002408348943,
"grad_norm": 3.8642590045928955,
"learning_rate": 9.553093288471479e-06,
"loss": 1.1887,
"step": 3342
},
{
"epoch": 0.8945678351618945,
"grad_norm": 3.3516335487365723,
"learning_rate": 9.552727511261841e-06,
"loss": 1.1131,
"step": 3343
},
{
"epoch": 0.8948354294888948,
"grad_norm": 2.911613702774048,
"learning_rate": 9.55236159143409e-06,
"loss": 1.0118,
"step": 3344
},
{
"epoch": 0.8951030238158951,
"grad_norm": 3.028801441192627,
"learning_rate": 9.551995528999686e-06,
"loss": 1.0264,
"step": 3345
},
{
"epoch": 0.8953706181428954,
"grad_norm": 3.528012752532959,
"learning_rate": 9.5516293239701e-06,
"loss": 1.1946,
"step": 3346
},
{
"epoch": 0.8956382124698956,
"grad_norm": 2.96004581451416,
"learning_rate": 9.551262976356801e-06,
"loss": 0.9409,
"step": 3347
},
{
"epoch": 0.8959058067968959,
"grad_norm": 3.013521194458008,
"learning_rate": 9.550896486171268e-06,
"loss": 1.0383,
"step": 3348
},
{
"epoch": 0.8961734011238962,
"grad_norm": 3.353602409362793,
"learning_rate": 9.550529853424979e-06,
"loss": 1.0802,
"step": 3349
},
{
"epoch": 0.8964409954508964,
"grad_norm": 3.60223126411438,
"learning_rate": 9.55016307812942e-06,
"loss": 1.1218,
"step": 3350
},
{
"epoch": 0.8967085897778967,
"grad_norm": 3.689014434814453,
"learning_rate": 9.549796160296081e-06,
"loss": 1.2695,
"step": 3351
},
{
"epoch": 0.896976184104897,
"grad_norm": 2.9822311401367188,
"learning_rate": 9.549429099936455e-06,
"loss": 0.9881,
"step": 3352
},
{
"epoch": 0.8972437784318973,
"grad_norm": 3.03279447555542,
"learning_rate": 9.549061897062043e-06,
"loss": 1.0138,
"step": 3353
},
{
"epoch": 0.8975113727588975,
"grad_norm": 3.1987500190734863,
"learning_rate": 9.548694551684345e-06,
"loss": 1.0596,
"step": 3354
},
{
"epoch": 0.8977789670858978,
"grad_norm": 3.4811275005340576,
"learning_rate": 9.548327063814871e-06,
"loss": 1.1956,
"step": 3355
},
{
"epoch": 0.8980465614128981,
"grad_norm": 3.577713966369629,
"learning_rate": 9.547959433465128e-06,
"loss": 1.1442,
"step": 3356
},
{
"epoch": 0.8983141557398984,
"grad_norm": 3.459491014480591,
"learning_rate": 9.547591660646637e-06,
"loss": 1.3063,
"step": 3357
},
{
"epoch": 0.8985817500668986,
"grad_norm": 3.2407350540161133,
"learning_rate": 9.54722374537092e-06,
"loss": 1.0682,
"step": 3358
},
{
"epoch": 0.8988493443938989,
"grad_norm": 3.129257917404175,
"learning_rate": 9.546855687649497e-06,
"loss": 1.0516,
"step": 3359
},
{
"epoch": 0.8991169387208992,
"grad_norm": 3.4182918071746826,
"learning_rate": 9.5464874874939e-06,
"loss": 1.1032,
"step": 3360
},
{
"epoch": 0.8993845330478993,
"grad_norm": 3.995587110519409,
"learning_rate": 9.546119144915667e-06,
"loss": 1.1301,
"step": 3361
},
{
"epoch": 0.8996521273748996,
"grad_norm": 3.1512610912323,
"learning_rate": 9.545750659926331e-06,
"loss": 1.0217,
"step": 3362
},
{
"epoch": 0.8999197217018999,
"grad_norm": 3.4359290599823,
"learning_rate": 9.545382032537438e-06,
"loss": 1.2411,
"step": 3363
},
{
"epoch": 0.9001873160289002,
"grad_norm": 3.2754461765289307,
"learning_rate": 9.545013262760535e-06,
"loss": 1.0496,
"step": 3364
},
{
"epoch": 0.9004549103559004,
"grad_norm": 3.3657703399658203,
"learning_rate": 9.544644350607173e-06,
"loss": 1.0734,
"step": 3365
},
{
"epoch": 0.9007225046829007,
"grad_norm": 3.1346230506896973,
"learning_rate": 9.54427529608891e-06,
"loss": 1.1506,
"step": 3366
},
{
"epoch": 0.900990099009901,
"grad_norm": 3.4281997680664062,
"learning_rate": 9.543906099217308e-06,
"loss": 1.0849,
"step": 3367
},
{
"epoch": 0.9012576933369013,
"grad_norm": 3.335317850112915,
"learning_rate": 9.543536760003928e-06,
"loss": 1.1822,
"step": 3368
},
{
"epoch": 0.9015252876639015,
"grad_norm": 3.0482382774353027,
"learning_rate": 9.543167278460345e-06,
"loss": 1.1431,
"step": 3369
},
{
"epoch": 0.9017928819909018,
"grad_norm": 3.4252700805664062,
"learning_rate": 9.54279765459813e-06,
"loss": 1.0517,
"step": 3370
},
{
"epoch": 0.9020604763179021,
"grad_norm": 3.5490097999572754,
"learning_rate": 9.542427888428864e-06,
"loss": 1.1277,
"step": 3371
},
{
"epoch": 0.9023280706449023,
"grad_norm": 3.5822768211364746,
"learning_rate": 9.54205797996413e-06,
"loss": 1.0832,
"step": 3372
},
{
"epoch": 0.9025956649719026,
"grad_norm": 3.2864580154418945,
"learning_rate": 9.541687929215512e-06,
"loss": 1.0394,
"step": 3373
},
{
"epoch": 0.9028632592989029,
"grad_norm": 3.281869888305664,
"learning_rate": 9.541317736194608e-06,
"loss": 1.1343,
"step": 3374
},
{
"epoch": 0.9031308536259032,
"grad_norm": 3.3536150455474854,
"learning_rate": 9.54094740091301e-06,
"loss": 1.1296,
"step": 3375
},
{
"epoch": 0.9033984479529034,
"grad_norm": 3.730140447616577,
"learning_rate": 9.54057692338232e-06,
"loss": 1.1234,
"step": 3376
},
{
"epoch": 0.9036660422799037,
"grad_norm": 3.273059129714966,
"learning_rate": 9.540206303614146e-06,
"loss": 1.0658,
"step": 3377
},
{
"epoch": 0.903933636606904,
"grad_norm": 3.179582357406616,
"learning_rate": 9.539835541620096e-06,
"loss": 1.0502,
"step": 3378
},
{
"epoch": 0.9042012309339043,
"grad_norm": 3.5026938915252686,
"learning_rate": 9.539464637411782e-06,
"loss": 1.1883,
"step": 3379
},
{
"epoch": 0.9044688252609044,
"grad_norm": 3.0967392921447754,
"learning_rate": 9.539093591000828e-06,
"loss": 1.169,
"step": 3380
},
{
"epoch": 0.9047364195879047,
"grad_norm": 3.457193613052368,
"learning_rate": 9.538722402398854e-06,
"loss": 1.1136,
"step": 3381
},
{
"epoch": 0.905004013914905,
"grad_norm": 3.4893715381622314,
"learning_rate": 9.538351071617489e-06,
"loss": 1.1458,
"step": 3382
},
{
"epoch": 0.9052716082419052,
"grad_norm": 3.3818037509918213,
"learning_rate": 9.537979598668364e-06,
"loss": 1.1278,
"step": 3383
},
{
"epoch": 0.9055392025689055,
"grad_norm": 3.309565782546997,
"learning_rate": 9.537607983563117e-06,
"loss": 1.1216,
"step": 3384
},
{
"epoch": 0.9058067968959058,
"grad_norm": 3.428215980529785,
"learning_rate": 9.53723622631339e-06,
"loss": 1.1297,
"step": 3385
},
{
"epoch": 0.9060743912229061,
"grad_norm": 3.088609457015991,
"learning_rate": 9.536864326930826e-06,
"loss": 1.0726,
"step": 3386
},
{
"epoch": 0.9063419855499063,
"grad_norm": 3.1660032272338867,
"learning_rate": 9.536492285427077e-06,
"loss": 1.1357,
"step": 3387
},
{
"epoch": 0.9066095798769066,
"grad_norm": 3.1965155601501465,
"learning_rate": 9.536120101813797e-06,
"loss": 1.0836,
"step": 3388
},
{
"epoch": 0.9068771742039069,
"grad_norm": 3.284010410308838,
"learning_rate": 9.535747776102645e-06,
"loss": 1.1047,
"step": 3389
},
{
"epoch": 0.9071447685309072,
"grad_norm": 3.5011777877807617,
"learning_rate": 9.535375308305283e-06,
"loss": 1.2027,
"step": 3390
},
{
"epoch": 0.9074123628579074,
"grad_norm": 3.8040499687194824,
"learning_rate": 9.535002698433383e-06,
"loss": 1.1663,
"step": 3391
},
{
"epoch": 0.9076799571849077,
"grad_norm": 3.3377859592437744,
"learning_rate": 9.534629946498613e-06,
"loss": 1.2642,
"step": 3392
},
{
"epoch": 0.907947551511908,
"grad_norm": 3.4258129596710205,
"learning_rate": 9.534257052512651e-06,
"loss": 1.1166,
"step": 3393
},
{
"epoch": 0.9082151458389082,
"grad_norm": 3.600273847579956,
"learning_rate": 9.533884016487181e-06,
"loss": 1.0948,
"step": 3394
},
{
"epoch": 0.9084827401659085,
"grad_norm": 3.1784534454345703,
"learning_rate": 9.533510838433884e-06,
"loss": 1.0578,
"step": 3395
},
{
"epoch": 0.9087503344929088,
"grad_norm": 3.3856201171875,
"learning_rate": 9.533137518364453e-06,
"loss": 1.1899,
"step": 3396
},
{
"epoch": 0.9090179288199091,
"grad_norm": 3.489384412765503,
"learning_rate": 9.532764056290582e-06,
"loss": 1.0646,
"step": 3397
},
{
"epoch": 0.9092855231469092,
"grad_norm": 3.3582942485809326,
"learning_rate": 9.53239045222397e-06,
"loss": 1.1614,
"step": 3398
},
{
"epoch": 0.9095531174739095,
"grad_norm": 3.891071319580078,
"learning_rate": 9.53201670617632e-06,
"loss": 1.2788,
"step": 3399
},
{
"epoch": 0.9098207118009098,
"grad_norm": 4.0248026847839355,
"learning_rate": 9.531642818159341e-06,
"loss": 1.2048,
"step": 3400
},
{
"epoch": 0.9100883061279101,
"grad_norm": 3.6524062156677246,
"learning_rate": 9.531268788184744e-06,
"loss": 1.3362,
"step": 3401
},
{
"epoch": 0.9103559004549103,
"grad_norm": 3.208773374557495,
"learning_rate": 9.530894616264248e-06,
"loss": 1.0624,
"step": 3402
},
{
"epoch": 0.9106234947819106,
"grad_norm": 3.280280351638794,
"learning_rate": 9.530520302409572e-06,
"loss": 1.138,
"step": 3403
},
{
"epoch": 0.9108910891089109,
"grad_norm": 3.034221887588501,
"learning_rate": 9.530145846632441e-06,
"loss": 1.1124,
"step": 3404
},
{
"epoch": 0.9111586834359111,
"grad_norm": 3.4571194648742676,
"learning_rate": 9.52977124894459e-06,
"loss": 1.1556,
"step": 3405
},
{
"epoch": 0.9114262777629114,
"grad_norm": 3.2355082035064697,
"learning_rate": 9.529396509357748e-06,
"loss": 1.1784,
"step": 3406
},
{
"epoch": 0.9116938720899117,
"grad_norm": 3.521646499633789,
"learning_rate": 9.529021627883657e-06,
"loss": 1.0774,
"step": 3407
},
{
"epoch": 0.911961466416912,
"grad_norm": 3.313163995742798,
"learning_rate": 9.528646604534058e-06,
"loss": 1.035,
"step": 3408
},
{
"epoch": 0.9122290607439122,
"grad_norm": 3.7362465858459473,
"learning_rate": 9.528271439320703e-06,
"loss": 1.2586,
"step": 3409
},
{
"epoch": 0.9124966550709125,
"grad_norm": 3.2787117958068848,
"learning_rate": 9.527896132255341e-06,
"loss": 1.1981,
"step": 3410
},
{
"epoch": 0.9127642493979128,
"grad_norm": 3.5333304405212402,
"learning_rate": 9.52752068334973e-06,
"loss": 1.1986,
"step": 3411
},
{
"epoch": 0.9130318437249131,
"grad_norm": 3.5400190353393555,
"learning_rate": 9.527145092615631e-06,
"loss": 1.1002,
"step": 3412
},
{
"epoch": 0.9132994380519133,
"grad_norm": 3.786625862121582,
"learning_rate": 9.526769360064812e-06,
"loss": 1.1783,
"step": 3413
},
{
"epoch": 0.9135670323789136,
"grad_norm": 3.3080086708068848,
"learning_rate": 9.526393485709038e-06,
"loss": 1.0919,
"step": 3414
},
{
"epoch": 0.9138346267059139,
"grad_norm": 3.204632043838501,
"learning_rate": 9.526017469560088e-06,
"loss": 1.0737,
"step": 3415
},
{
"epoch": 0.914102221032914,
"grad_norm": 3.2012712955474854,
"learning_rate": 9.52564131162974e-06,
"loss": 1.1405,
"step": 3416
},
{
"epoch": 0.9143698153599144,
"grad_norm": 3.119194746017456,
"learning_rate": 9.525265011929776e-06,
"loss": 1.0067,
"step": 3417
},
{
"epoch": 0.9146374096869146,
"grad_norm": 3.2325518131256104,
"learning_rate": 9.524888570471987e-06,
"loss": 1.2162,
"step": 3418
},
{
"epoch": 0.914905004013915,
"grad_norm": 3.490710973739624,
"learning_rate": 9.524511987268161e-06,
"loss": 1.0244,
"step": 3419
},
{
"epoch": 0.9151725983409151,
"grad_norm": 3.0969996452331543,
"learning_rate": 9.524135262330098e-06,
"loss": 1.1009,
"step": 3420
},
{
"epoch": 0.9154401926679154,
"grad_norm": 3.454273223876953,
"learning_rate": 9.523758395669598e-06,
"loss": 1.1822,
"step": 3421
},
{
"epoch": 0.9157077869949157,
"grad_norm": 3.2115368843078613,
"learning_rate": 9.523381387298469e-06,
"loss": 1.0328,
"step": 3422
},
{
"epoch": 0.915975381321916,
"grad_norm": 3.3276708126068115,
"learning_rate": 9.523004237228517e-06,
"loss": 1.1133,
"step": 3423
},
{
"epoch": 0.9162429756489162,
"grad_norm": 3.601778507232666,
"learning_rate": 9.522626945471561e-06,
"loss": 1.2206,
"step": 3424
},
{
"epoch": 0.9165105699759165,
"grad_norm": 3.281283378601074,
"learning_rate": 9.522249512039417e-06,
"loss": 1.09,
"step": 3425
},
{
"epoch": 0.9167781643029168,
"grad_norm": 3.4089417457580566,
"learning_rate": 9.521871936943907e-06,
"loss": 1.2478,
"step": 3426
},
{
"epoch": 0.917045758629917,
"grad_norm": 3.5590267181396484,
"learning_rate": 9.521494220196862e-06,
"loss": 1.1056,
"step": 3427
},
{
"epoch": 0.9173133529569173,
"grad_norm": 3.1895391941070557,
"learning_rate": 9.521116361810115e-06,
"loss": 1.1022,
"step": 3428
},
{
"epoch": 0.9175809472839176,
"grad_norm": 3.6638033390045166,
"learning_rate": 9.5207383617955e-06,
"loss": 1.3269,
"step": 3429
},
{
"epoch": 0.9178485416109179,
"grad_norm": 3.7325849533081055,
"learning_rate": 9.52036022016486e-06,
"loss": 1.2461,
"step": 3430
},
{
"epoch": 0.9181161359379181,
"grad_norm": 3.56632137298584,
"learning_rate": 9.519981936930038e-06,
"loss": 1.2025,
"step": 3431
},
{
"epoch": 0.9183837302649184,
"grad_norm": 3.4402332305908203,
"learning_rate": 9.519603512102887e-06,
"loss": 1.0808,
"step": 3432
},
{
"epoch": 0.9186513245919187,
"grad_norm": 3.298569917678833,
"learning_rate": 9.51922494569526e-06,
"loss": 1.0316,
"step": 3433
},
{
"epoch": 0.918918918918919,
"grad_norm": 3.712266683578491,
"learning_rate": 9.518846237719018e-06,
"loss": 1.2548,
"step": 3434
},
{
"epoch": 0.9191865132459192,
"grad_norm": 3.270195722579956,
"learning_rate": 9.51846738818602e-06,
"loss": 1.1098,
"step": 3435
},
{
"epoch": 0.9194541075729195,
"grad_norm": 3.471754312515259,
"learning_rate": 9.518088397108138e-06,
"loss": 1.0728,
"step": 3436
},
{
"epoch": 0.9197217018999198,
"grad_norm": 3.256743907928467,
"learning_rate": 9.517709264497242e-06,
"loss": 1.1637,
"step": 3437
},
{
"epoch": 0.9199892962269199,
"grad_norm": 3.3077757358551025,
"learning_rate": 9.51732999036521e-06,
"loss": 1.1242,
"step": 3438
},
{
"epoch": 0.9202568905539202,
"grad_norm": 3.492668867111206,
"learning_rate": 9.516950574723922e-06,
"loss": 1.0478,
"step": 3439
},
{
"epoch": 0.9205244848809205,
"grad_norm": 3.167327880859375,
"learning_rate": 9.516571017585265e-06,
"loss": 1.1084,
"step": 3440
},
{
"epoch": 0.9207920792079208,
"grad_norm": 2.9469475746154785,
"learning_rate": 9.516191318961126e-06,
"loss": 1.0578,
"step": 3441
},
{
"epoch": 0.921059673534921,
"grad_norm": 3.6381478309631348,
"learning_rate": 9.515811478863402e-06,
"loss": 1.2693,
"step": 3442
},
{
"epoch": 0.9213272678619213,
"grad_norm": 3.4339804649353027,
"learning_rate": 9.515431497303992e-06,
"loss": 1.1058,
"step": 3443
},
{
"epoch": 0.9215948621889216,
"grad_norm": 3.2788732051849365,
"learning_rate": 9.515051374294797e-06,
"loss": 0.9658,
"step": 3444
},
{
"epoch": 0.9218624565159219,
"grad_norm": 3.393667697906494,
"learning_rate": 9.514671109847727e-06,
"loss": 1.0991,
"step": 3445
},
{
"epoch": 0.9221300508429221,
"grad_norm": 3.7156498432159424,
"learning_rate": 9.514290703974694e-06,
"loss": 1.0418,
"step": 3446
},
{
"epoch": 0.9223976451699224,
"grad_norm": 3.149561882019043,
"learning_rate": 9.513910156687612e-06,
"loss": 1.1174,
"step": 3447
},
{
"epoch": 0.9226652394969227,
"grad_norm": 3.51082181930542,
"learning_rate": 9.513529467998404e-06,
"loss": 1.1957,
"step": 3448
},
{
"epoch": 0.9229328338239229,
"grad_norm": 3.3628318309783936,
"learning_rate": 9.513148637918995e-06,
"loss": 1.0921,
"step": 3449
},
{
"epoch": 0.9232004281509232,
"grad_norm": 3.5448429584503174,
"learning_rate": 9.512767666461316e-06,
"loss": 1.1887,
"step": 3450
},
{
"epoch": 0.9234680224779235,
"grad_norm": 3.335571050643921,
"learning_rate": 9.512386553637298e-06,
"loss": 1.0374,
"step": 3451
},
{
"epoch": 0.9237356168049238,
"grad_norm": 3.2040200233459473,
"learning_rate": 9.512005299458885e-06,
"loss": 1.1254,
"step": 3452
},
{
"epoch": 0.924003211131924,
"grad_norm": 3.4492249488830566,
"learning_rate": 9.511623903938015e-06,
"loss": 1.1536,
"step": 3453
},
{
"epoch": 0.9242708054589243,
"grad_norm": 3.422445774078369,
"learning_rate": 9.511242367086637e-06,
"loss": 1.2253,
"step": 3454
},
{
"epoch": 0.9245383997859246,
"grad_norm": 3.5041046142578125,
"learning_rate": 9.510860688916704e-06,
"loss": 1.1549,
"step": 3455
},
{
"epoch": 0.9248059941129249,
"grad_norm": 3.517303705215454,
"learning_rate": 9.510478869440172e-06,
"loss": 1.1445,
"step": 3456
},
{
"epoch": 0.925073588439925,
"grad_norm": 3.393651247024536,
"learning_rate": 9.510096908669e-06,
"loss": 1.2026,
"step": 3457
},
{
"epoch": 0.9253411827669253,
"grad_norm": 3.499711751937866,
"learning_rate": 9.509714806615157e-06,
"loss": 1.2276,
"step": 3458
},
{
"epoch": 0.9256087770939256,
"grad_norm": 3.527127742767334,
"learning_rate": 9.509332563290611e-06,
"loss": 1.1068,
"step": 3459
},
{
"epoch": 0.9258763714209258,
"grad_norm": 2.9405460357666016,
"learning_rate": 9.508950178707335e-06,
"loss": 0.984,
"step": 3460
},
{
"epoch": 0.9261439657479261,
"grad_norm": 3.161170482635498,
"learning_rate": 9.508567652877307e-06,
"loss": 1.0606,
"step": 3461
},
{
"epoch": 0.9264115600749264,
"grad_norm": 2.99855375289917,
"learning_rate": 9.508184985812514e-06,
"loss": 0.9767,
"step": 3462
},
{
"epoch": 0.9266791544019267,
"grad_norm": 3.1051480770111084,
"learning_rate": 9.507802177524937e-06,
"loss": 1.0899,
"step": 3463
},
{
"epoch": 0.9269467487289269,
"grad_norm": 3.3303263187408447,
"learning_rate": 9.507419228026574e-06,
"loss": 1.2223,
"step": 3464
},
{
"epoch": 0.9272143430559272,
"grad_norm": 3.6255226135253906,
"learning_rate": 9.507036137329417e-06,
"loss": 1.2277,
"step": 3465
},
{
"epoch": 0.9274819373829275,
"grad_norm": 3.4016709327697754,
"learning_rate": 9.50665290544547e-06,
"loss": 1.0828,
"step": 3466
},
{
"epoch": 0.9277495317099278,
"grad_norm": 3.1820127964019775,
"learning_rate": 9.506269532386736e-06,
"loss": 1.0512,
"step": 3467
},
{
"epoch": 0.928017126036928,
"grad_norm": 3.8635528087615967,
"learning_rate": 9.505886018165223e-06,
"loss": 1.137,
"step": 3468
},
{
"epoch": 0.9282847203639283,
"grad_norm": 3.615046977996826,
"learning_rate": 9.505502362792947e-06,
"loss": 1.1372,
"step": 3469
},
{
"epoch": 0.9285523146909286,
"grad_norm": 3.5528488159179688,
"learning_rate": 9.505118566281928e-06,
"loss": 1.145,
"step": 3470
},
{
"epoch": 0.9288199090179288,
"grad_norm": 3.367374897003174,
"learning_rate": 9.504734628644186e-06,
"loss": 1.1556,
"step": 3471
},
{
"epoch": 0.9290875033449291,
"grad_norm": 3.4801955223083496,
"learning_rate": 9.504350549891748e-06,
"loss": 1.1011,
"step": 3472
},
{
"epoch": 0.9293550976719294,
"grad_norm": 3.310755729675293,
"learning_rate": 9.503966330036646e-06,
"loss": 1.239,
"step": 3473
},
{
"epoch": 0.9296226919989297,
"grad_norm": 3.1718642711639404,
"learning_rate": 9.50358196909092e-06,
"loss": 1.1341,
"step": 3474
},
{
"epoch": 0.9298902863259298,
"grad_norm": 3.5848066806793213,
"learning_rate": 9.503197467066604e-06,
"loss": 1.2817,
"step": 3475
},
{
"epoch": 0.9301578806529301,
"grad_norm": 3.2660577297210693,
"learning_rate": 9.502812823975746e-06,
"loss": 1.1082,
"step": 3476
},
{
"epoch": 0.9304254749799304,
"grad_norm": 3.4839932918548584,
"learning_rate": 9.502428039830395e-06,
"loss": 1.0307,
"step": 3477
},
{
"epoch": 0.9306930693069307,
"grad_norm": 3.186553478240967,
"learning_rate": 9.502043114642607e-06,
"loss": 0.9545,
"step": 3478
},
{
"epoch": 0.9309606636339309,
"grad_norm": 3.0523486137390137,
"learning_rate": 9.501658048424437e-06,
"loss": 0.99,
"step": 3479
},
{
"epoch": 0.9312282579609312,
"grad_norm": 3.2745773792266846,
"learning_rate": 9.501272841187949e-06,
"loss": 1.105,
"step": 3480
},
{
"epoch": 0.9314958522879315,
"grad_norm": 3.30385422706604,
"learning_rate": 9.500887492945208e-06,
"loss": 1.03,
"step": 3481
},
{
"epoch": 0.9317634466149317,
"grad_norm": 3.7593932151794434,
"learning_rate": 9.500502003708287e-06,
"loss": 1.3537,
"step": 3482
},
{
"epoch": 0.932031040941932,
"grad_norm": 3.3091962337493896,
"learning_rate": 9.500116373489264e-06,
"loss": 1.1339,
"step": 3483
},
{
"epoch": 0.9322986352689323,
"grad_norm": 3.1026055812835693,
"learning_rate": 9.499730602300213e-06,
"loss": 1.095,
"step": 3484
},
{
"epoch": 0.9325662295959326,
"grad_norm": 3.178584337234497,
"learning_rate": 9.499344690153226e-06,
"loss": 0.9671,
"step": 3485
},
{
"epoch": 0.9328338239229328,
"grad_norm": 3.1470065116882324,
"learning_rate": 9.498958637060385e-06,
"loss": 1.1067,
"step": 3486
},
{
"epoch": 0.9331014182499331,
"grad_norm": 3.4016363620758057,
"learning_rate": 9.498572443033789e-06,
"loss": 1.1828,
"step": 3487
},
{
"epoch": 0.9333690125769334,
"grad_norm": 3.14091157913208,
"learning_rate": 9.498186108085534e-06,
"loss": 1.0975,
"step": 3488
},
{
"epoch": 0.9336366069039337,
"grad_norm": 3.34956431388855,
"learning_rate": 9.497799632227721e-06,
"loss": 1.2063,
"step": 3489
},
{
"epoch": 0.9339042012309339,
"grad_norm": 3.507667064666748,
"learning_rate": 9.497413015472458e-06,
"loss": 1.1349,
"step": 3490
},
{
"epoch": 0.9341717955579342,
"grad_norm": 3.209160566329956,
"learning_rate": 9.497026257831856e-06,
"loss": 1.0282,
"step": 3491
},
{
"epoch": 0.9344393898849345,
"grad_norm": 3.31990647315979,
"learning_rate": 9.49663935931803e-06,
"loss": 1.1131,
"step": 3492
},
{
"epoch": 0.9347069842119347,
"grad_norm": 3.19960618019104,
"learning_rate": 9.4962523199431e-06,
"loss": 1.1472,
"step": 3493
},
{
"epoch": 0.934974578538935,
"grad_norm": 3.4468166828155518,
"learning_rate": 9.495865139719192e-06,
"loss": 1.1146,
"step": 3494
},
{
"epoch": 0.9352421728659353,
"grad_norm": 3.4722113609313965,
"learning_rate": 9.495477818658432e-06,
"loss": 1.0553,
"step": 3495
},
{
"epoch": 0.9355097671929355,
"grad_norm": 3.26762318611145,
"learning_rate": 9.495090356772955e-06,
"loss": 1.0282,
"step": 3496
},
{
"epoch": 0.9357773615199357,
"grad_norm": 3.4020135402679443,
"learning_rate": 9.494702754074898e-06,
"loss": 1.1634,
"step": 3497
},
{
"epoch": 0.936044955846936,
"grad_norm": 3.114060401916504,
"learning_rate": 9.494315010576405e-06,
"loss": 1.0458,
"step": 3498
},
{
"epoch": 0.9363125501739363,
"grad_norm": 2.988351345062256,
"learning_rate": 9.493927126289619e-06,
"loss": 1.0748,
"step": 3499
},
{
"epoch": 0.9365801445009366,
"grad_norm": 3.3398990631103516,
"learning_rate": 9.493539101226692e-06,
"loss": 1.0633,
"step": 3500
},
{
"epoch": 0.9365801445009366,
"eval_loss": 1.1470533609390259,
"eval_runtime": 11.4325,
"eval_samples_per_second": 34.988,
"eval_steps_per_second": 4.373,
"step": 3500
},
{
"epoch": 0.9368477388279368,
"grad_norm": 3.560861587524414,
"learning_rate": 9.493150935399779e-06,
"loss": 0.9958,
"step": 3501
},
{
"epoch": 0.9371153331549371,
"grad_norm": 3.2529478073120117,
"learning_rate": 9.49276262882104e-06,
"loss": 1.1023,
"step": 3502
},
{
"epoch": 0.9373829274819374,
"grad_norm": 3.466764211654663,
"learning_rate": 9.49237418150264e-06,
"loss": 1.1395,
"step": 3503
},
{
"epoch": 0.9376505218089376,
"grad_norm": 3.3550920486450195,
"learning_rate": 9.491985593456747e-06,
"loss": 1.111,
"step": 3504
},
{
"epoch": 0.9379181161359379,
"grad_norm": 3.469780683517456,
"learning_rate": 9.491596864695534e-06,
"loss": 1.1792,
"step": 3505
},
{
"epoch": 0.9381857104629382,
"grad_norm": 3.7144579887390137,
"learning_rate": 9.491207995231176e-06,
"loss": 1.2884,
"step": 3506
},
{
"epoch": 0.9384533047899385,
"grad_norm": 2.8839621543884277,
"learning_rate": 9.490818985075856e-06,
"loss": 1.0105,
"step": 3507
},
{
"epoch": 0.9387208991169387,
"grad_norm": 3.0779335498809814,
"learning_rate": 9.490429834241763e-06,
"loss": 1.052,
"step": 3508
},
{
"epoch": 0.938988493443939,
"grad_norm": 3.260777711868286,
"learning_rate": 9.490040542741084e-06,
"loss": 1.0654,
"step": 3509
},
{
"epoch": 0.9392560877709393,
"grad_norm": 3.4385604858398438,
"learning_rate": 9.489651110586014e-06,
"loss": 1.2247,
"step": 3510
},
{
"epoch": 0.9395236820979396,
"grad_norm": 3.460442066192627,
"learning_rate": 9.489261537788754e-06,
"loss": 1.0923,
"step": 3511
},
{
"epoch": 0.9397912764249398,
"grad_norm": 3.365694284439087,
"learning_rate": 9.488871824361508e-06,
"loss": 1.0738,
"step": 3512
},
{
"epoch": 0.9400588707519401,
"grad_norm": 3.1729164123535156,
"learning_rate": 9.48848197031648e-06,
"loss": 1.1758,
"step": 3513
},
{
"epoch": 0.9403264650789404,
"grad_norm": 3.1198678016662598,
"learning_rate": 9.488091975665887e-06,
"loss": 1.0894,
"step": 3514
},
{
"epoch": 0.9405940594059405,
"grad_norm": 3.1159262657165527,
"learning_rate": 9.487701840421945e-06,
"loss": 1.1764,
"step": 3515
},
{
"epoch": 0.9408616537329408,
"grad_norm": 3.0507001876831055,
"learning_rate": 9.487311564596875e-06,
"loss": 1.0162,
"step": 3516
},
{
"epoch": 0.9411292480599411,
"grad_norm": 3.2293381690979004,
"learning_rate": 9.486921148202901e-06,
"loss": 1.0442,
"step": 3517
},
{
"epoch": 0.9413968423869414,
"grad_norm": 3.246738910675049,
"learning_rate": 9.486530591252257e-06,
"loss": 1.1221,
"step": 3518
},
{
"epoch": 0.9416644367139416,
"grad_norm": 3.022080659866333,
"learning_rate": 9.486139893757172e-06,
"loss": 0.9833,
"step": 3519
},
{
"epoch": 0.9419320310409419,
"grad_norm": 3.232952356338501,
"learning_rate": 9.485749055729891e-06,
"loss": 1.058,
"step": 3520
},
{
"epoch": 0.9421996253679422,
"grad_norm": 3.4002277851104736,
"learning_rate": 9.485358077182653e-06,
"loss": 1.1898,
"step": 3521
},
{
"epoch": 0.9424672196949425,
"grad_norm": 3.4779772758483887,
"learning_rate": 9.484966958127707e-06,
"loss": 1.0439,
"step": 3522
},
{
"epoch": 0.9427348140219427,
"grad_norm": 3.6369545459747314,
"learning_rate": 9.484575698577305e-06,
"loss": 1.1877,
"step": 3523
},
{
"epoch": 0.943002408348943,
"grad_norm": 3.197024345397949,
"learning_rate": 9.484184298543706e-06,
"loss": 1.0569,
"step": 3524
},
{
"epoch": 0.9432700026759433,
"grad_norm": 3.642747640609741,
"learning_rate": 9.483792758039165e-06,
"loss": 1.2194,
"step": 3525
},
{
"epoch": 0.9435375970029435,
"grad_norm": 3.4038000106811523,
"learning_rate": 9.483401077075954e-06,
"loss": 1.1591,
"step": 3526
},
{
"epoch": 0.9438051913299438,
"grad_norm": 2.978867292404175,
"learning_rate": 9.48300925566634e-06,
"loss": 1.0912,
"step": 3527
},
{
"epoch": 0.9440727856569441,
"grad_norm": 3.416811943054199,
"learning_rate": 9.482617293822596e-06,
"loss": 1.124,
"step": 3528
},
{
"epoch": 0.9443403799839444,
"grad_norm": 3.1118931770324707,
"learning_rate": 9.482225191557001e-06,
"loss": 1.0341,
"step": 3529
},
{
"epoch": 0.9446079743109446,
"grad_norm": 3.495542049407959,
"learning_rate": 9.48183294888184e-06,
"loss": 1.3161,
"step": 3530
},
{
"epoch": 0.9448755686379449,
"grad_norm": 3.3605546951293945,
"learning_rate": 9.481440565809398e-06,
"loss": 1.0635,
"step": 3531
},
{
"epoch": 0.9451431629649452,
"grad_norm": 3.383915901184082,
"learning_rate": 9.481048042351967e-06,
"loss": 1.0157,
"step": 3532
},
{
"epoch": 0.9454107572919455,
"grad_norm": 2.984562873840332,
"learning_rate": 9.480655378521845e-06,
"loss": 1.0117,
"step": 3533
},
{
"epoch": 0.9456783516189456,
"grad_norm": 3.205364942550659,
"learning_rate": 9.480262574331331e-06,
"loss": 1.0787,
"step": 3534
},
{
"epoch": 0.9459459459459459,
"grad_norm": 3.5284080505371094,
"learning_rate": 9.479869629792729e-06,
"loss": 1.2231,
"step": 3535
},
{
"epoch": 0.9462135402729462,
"grad_norm": 3.833712577819824,
"learning_rate": 9.47947654491835e-06,
"loss": 1.1585,
"step": 3536
},
{
"epoch": 0.9464811345999464,
"grad_norm": 3.6991257667541504,
"learning_rate": 9.479083319720508e-06,
"loss": 1.1624,
"step": 3537
},
{
"epoch": 0.9467487289269467,
"grad_norm": 3.437206983566284,
"learning_rate": 9.47868995421152e-06,
"loss": 1.1759,
"step": 3538
},
{
"epoch": 0.947016323253947,
"grad_norm": 4.02100944519043,
"learning_rate": 9.478296448403707e-06,
"loss": 1.3648,
"step": 3539
},
{
"epoch": 0.9472839175809473,
"grad_norm": 3.552727699279785,
"learning_rate": 9.4779028023094e-06,
"loss": 1.1806,
"step": 3540
},
{
"epoch": 0.9475515119079475,
"grad_norm": 3.3923287391662598,
"learning_rate": 9.477509015940928e-06,
"loss": 1.1629,
"step": 3541
},
{
"epoch": 0.9478191062349478,
"grad_norm": 3.3122611045837402,
"learning_rate": 9.477115089310626e-06,
"loss": 1.0938,
"step": 3542
},
{
"epoch": 0.9480867005619481,
"grad_norm": 3.2239348888397217,
"learning_rate": 9.476721022430834e-06,
"loss": 1.2055,
"step": 3543
},
{
"epoch": 0.9483542948889484,
"grad_norm": 3.426645278930664,
"learning_rate": 9.4763268153139e-06,
"loss": 1.1852,
"step": 3544
},
{
"epoch": 0.9486218892159486,
"grad_norm": 3.3436081409454346,
"learning_rate": 9.475932467972169e-06,
"loss": 1.0941,
"step": 3545
},
{
"epoch": 0.9488894835429489,
"grad_norm": 3.2754054069519043,
"learning_rate": 9.475537980417994e-06,
"loss": 1.0063,
"step": 3546
},
{
"epoch": 0.9491570778699492,
"grad_norm": 3.0442628860473633,
"learning_rate": 9.475143352663736e-06,
"loss": 1.0222,
"step": 3547
},
{
"epoch": 0.9494246721969494,
"grad_norm": 3.662236213684082,
"learning_rate": 9.474748584721755e-06,
"loss": 1.2637,
"step": 3548
},
{
"epoch": 0.9496922665239497,
"grad_norm": 3.3807575702667236,
"learning_rate": 9.474353676604416e-06,
"loss": 1.1554,
"step": 3549
},
{
"epoch": 0.94995986085095,
"grad_norm": 3.3155808448791504,
"learning_rate": 9.473958628324093e-06,
"loss": 1.1654,
"step": 3550
},
{
"epoch": 0.9502274551779503,
"grad_norm": 3.54040789604187,
"learning_rate": 9.47356343989316e-06,
"loss": 1.3299,
"step": 3551
},
{
"epoch": 0.9504950495049505,
"grad_norm": 3.640690565109253,
"learning_rate": 9.473168111323995e-06,
"loss": 1.2952,
"step": 3552
},
{
"epoch": 0.9507626438319507,
"grad_norm": 3.141310214996338,
"learning_rate": 9.472772642628984e-06,
"loss": 1.0503,
"step": 3553
},
{
"epoch": 0.951030238158951,
"grad_norm": 3.160971164703369,
"learning_rate": 9.472377033820514e-06,
"loss": 1.065,
"step": 3554
},
{
"epoch": 0.9512978324859513,
"grad_norm": 3.1601452827453613,
"learning_rate": 9.47198128491098e-06,
"loss": 1.1415,
"step": 3555
},
{
"epoch": 0.9515654268129515,
"grad_norm": 3.017057180404663,
"learning_rate": 9.471585395912776e-06,
"loss": 1.1711,
"step": 3556
},
{
"epoch": 0.9518330211399518,
"grad_norm": 3.127492666244507,
"learning_rate": 9.471189366838307e-06,
"loss": 1.0834,
"step": 3557
},
{
"epoch": 0.9521006154669521,
"grad_norm": 3.3001465797424316,
"learning_rate": 9.470793197699977e-06,
"loss": 1.1977,
"step": 3558
},
{
"epoch": 0.9523682097939523,
"grad_norm": 3.1050307750701904,
"learning_rate": 9.470396888510196e-06,
"loss": 1.0702,
"step": 3559
},
{
"epoch": 0.9526358041209526,
"grad_norm": 3.070390224456787,
"learning_rate": 9.470000439281379e-06,
"loss": 0.9667,
"step": 3560
},
{
"epoch": 0.9529033984479529,
"grad_norm": 3.1426901817321777,
"learning_rate": 9.469603850025946e-06,
"loss": 0.9483,
"step": 3561
},
{
"epoch": 0.9531709927749532,
"grad_norm": 3.142430543899536,
"learning_rate": 9.46920712075632e-06,
"loss": 0.9968,
"step": 3562
},
{
"epoch": 0.9534385871019534,
"grad_norm": 3.0528955459594727,
"learning_rate": 9.468810251484928e-06,
"loss": 1.0463,
"step": 3563
},
{
"epoch": 0.9537061814289537,
"grad_norm": 3.309941530227661,
"learning_rate": 9.468413242224204e-06,
"loss": 1.2411,
"step": 3564
},
{
"epoch": 0.953973775755954,
"grad_norm": 3.3797719478607178,
"learning_rate": 9.468016092986583e-06,
"loss": 1.1239,
"step": 3565
},
{
"epoch": 0.9542413700829543,
"grad_norm": 3.4834790229797363,
"learning_rate": 9.467618803784507e-06,
"loss": 1.1477,
"step": 3566
},
{
"epoch": 0.9545089644099545,
"grad_norm": 3.8445494174957275,
"learning_rate": 9.467221374630422e-06,
"loss": 1.2043,
"step": 3567
},
{
"epoch": 0.9547765587369548,
"grad_norm": 3.565916061401367,
"learning_rate": 9.466823805536776e-06,
"loss": 1.0226,
"step": 3568
},
{
"epoch": 0.9550441530639551,
"grad_norm": 3.3251450061798096,
"learning_rate": 9.466426096516024e-06,
"loss": 1.1116,
"step": 3569
},
{
"epoch": 0.9553117473909553,
"grad_norm": 3.7028703689575195,
"learning_rate": 9.466028247580624e-06,
"loss": 1.1634,
"step": 3570
},
{
"epoch": 0.9555793417179556,
"grad_norm": 3.316803455352783,
"learning_rate": 9.465630258743041e-06,
"loss": 1.1836,
"step": 3571
},
{
"epoch": 0.9558469360449559,
"grad_norm": 3.284135341644287,
"learning_rate": 9.46523213001574e-06,
"loss": 1.077,
"step": 3572
},
{
"epoch": 0.9561145303719562,
"grad_norm": 3.4031052589416504,
"learning_rate": 9.464833861411196e-06,
"loss": 1.0654,
"step": 3573
},
{
"epoch": 0.9563821246989563,
"grad_norm": 3.1017119884490967,
"learning_rate": 9.464435452941881e-06,
"loss": 1.055,
"step": 3574
},
{
"epoch": 0.9566497190259566,
"grad_norm": 3.4276275634765625,
"learning_rate": 9.464036904620278e-06,
"loss": 1.1501,
"step": 3575
},
{
"epoch": 0.9569173133529569,
"grad_norm": 3.362846851348877,
"learning_rate": 9.46363821645887e-06,
"loss": 1.1495,
"step": 3576
},
{
"epoch": 0.9571849076799572,
"grad_norm": 3.4557695388793945,
"learning_rate": 9.46323938847015e-06,
"loss": 1.0333,
"step": 3577
},
{
"epoch": 0.9574525020069574,
"grad_norm": 3.3175323009490967,
"learning_rate": 9.462840420666607e-06,
"loss": 1.0388,
"step": 3578
},
{
"epoch": 0.9577200963339577,
"grad_norm": 3.987826347351074,
"learning_rate": 9.462441313060741e-06,
"loss": 1.3387,
"step": 3579
},
{
"epoch": 0.957987690660958,
"grad_norm": 3.354966163635254,
"learning_rate": 9.462042065665057e-06,
"loss": 1.0009,
"step": 3580
},
{
"epoch": 0.9582552849879582,
"grad_norm": 3.2124292850494385,
"learning_rate": 9.461642678492059e-06,
"loss": 1.0912,
"step": 3581
},
{
"epoch": 0.9585228793149585,
"grad_norm": 3.4074666500091553,
"learning_rate": 9.461243151554257e-06,
"loss": 1.1189,
"step": 3582
},
{
"epoch": 0.9587904736419588,
"grad_norm": 3.342137336730957,
"learning_rate": 9.460843484864168e-06,
"loss": 1.1774,
"step": 3583
},
{
"epoch": 0.9590580679689591,
"grad_norm": 3.108454704284668,
"learning_rate": 9.460443678434313e-06,
"loss": 0.9848,
"step": 3584
},
{
"epoch": 0.9593256622959593,
"grad_norm": 2.764338731765747,
"learning_rate": 9.460043732277213e-06,
"loss": 0.9759,
"step": 3585
},
{
"epoch": 0.9595932566229596,
"grad_norm": 3.277522087097168,
"learning_rate": 9.4596436464054e-06,
"loss": 1.0655,
"step": 3586
},
{
"epoch": 0.9598608509499599,
"grad_norm": 3.2596328258514404,
"learning_rate": 9.459243420831406e-06,
"loss": 1.0931,
"step": 3587
},
{
"epoch": 0.9601284452769602,
"grad_norm": 3.3938968181610107,
"learning_rate": 9.45884305556777e-06,
"loss": 1.1095,
"step": 3588
},
{
"epoch": 0.9603960396039604,
"grad_norm": 3.694939374923706,
"learning_rate": 9.45844255062703e-06,
"loss": 1.274,
"step": 3589
},
{
"epoch": 0.9606636339309607,
"grad_norm": 3.2319419384002686,
"learning_rate": 9.458041906021733e-06,
"loss": 1.1096,
"step": 3590
},
{
"epoch": 0.960931228257961,
"grad_norm": 3.1638870239257812,
"learning_rate": 9.457641121764433e-06,
"loss": 1.0875,
"step": 3591
},
{
"epoch": 0.9611988225849611,
"grad_norm": 3.1526119709014893,
"learning_rate": 9.457240197867682e-06,
"loss": 0.9327,
"step": 3592
},
{
"epoch": 0.9614664169119614,
"grad_norm": 3.1602957248687744,
"learning_rate": 9.45683913434404e-06,
"loss": 1.0397,
"step": 3593
},
{
"epoch": 0.9617340112389617,
"grad_norm": 3.8300065994262695,
"learning_rate": 9.45643793120607e-06,
"loss": 1.2436,
"step": 3594
},
{
"epoch": 0.962001605565962,
"grad_norm": 3.174600839614868,
"learning_rate": 9.456036588466342e-06,
"loss": 1.1446,
"step": 3595
},
{
"epoch": 0.9622691998929622,
"grad_norm": 3.3465054035186768,
"learning_rate": 9.455635106137427e-06,
"loss": 1.1416,
"step": 3596
},
{
"epoch": 0.9625367942199625,
"grad_norm": 2.8999526500701904,
"learning_rate": 9.455233484231901e-06,
"loss": 0.9453,
"step": 3597
},
{
"epoch": 0.9628043885469628,
"grad_norm": 3.197765350341797,
"learning_rate": 9.454831722762346e-06,
"loss": 1.0374,
"step": 3598
},
{
"epoch": 0.9630719828739631,
"grad_norm": 3.180861711502075,
"learning_rate": 9.454429821741346e-06,
"loss": 1.051,
"step": 3599
},
{
"epoch": 0.9633395772009633,
"grad_norm": 3.3026978969573975,
"learning_rate": 9.454027781181496e-06,
"loss": 1.0753,
"step": 3600
},
{
"epoch": 0.9636071715279636,
"grad_norm": 3.2190282344818115,
"learning_rate": 9.453625601095385e-06,
"loss": 1.0299,
"step": 3601
},
{
"epoch": 0.9638747658549639,
"grad_norm": 3.302915334701538,
"learning_rate": 9.453223281495612e-06,
"loss": 1.1025,
"step": 3602
},
{
"epoch": 0.9641423601819642,
"grad_norm": 3.600322961807251,
"learning_rate": 9.452820822394783e-06,
"loss": 1.1095,
"step": 3603
},
{
"epoch": 0.9644099545089644,
"grad_norm": 3.5180773735046387,
"learning_rate": 9.452418223805502e-06,
"loss": 1.1045,
"step": 3604
},
{
"epoch": 0.9646775488359647,
"grad_norm": 3.113248825073242,
"learning_rate": 9.452015485740384e-06,
"loss": 0.9298,
"step": 3605
},
{
"epoch": 0.964945143162965,
"grad_norm": 3.2621712684631348,
"learning_rate": 9.451612608212044e-06,
"loss": 1.0531,
"step": 3606
},
{
"epoch": 0.9652127374899652,
"grad_norm": 2.929264783859253,
"learning_rate": 9.4512095912331e-06,
"loss": 1.0108,
"step": 3607
},
{
"epoch": 0.9654803318169655,
"grad_norm": 3.159482955932617,
"learning_rate": 9.45080643481618e-06,
"loss": 1.0752,
"step": 3608
},
{
"epoch": 0.9657479261439658,
"grad_norm": 3.332207202911377,
"learning_rate": 9.450403138973914e-06,
"loss": 1.1162,
"step": 3609
},
{
"epoch": 0.9660155204709661,
"grad_norm": 3.570805072784424,
"learning_rate": 9.44999970371893e-06,
"loss": 1.2211,
"step": 3610
},
{
"epoch": 0.9662831147979662,
"grad_norm": 2.926478624343872,
"learning_rate": 9.449596129063873e-06,
"loss": 0.9727,
"step": 3611
},
{
"epoch": 0.9665507091249665,
"grad_norm": 3.6152682304382324,
"learning_rate": 9.44919241502138e-06,
"loss": 1.0963,
"step": 3612
},
{
"epoch": 0.9668183034519668,
"grad_norm": 2.954878807067871,
"learning_rate": 9.4487885616041e-06,
"loss": 1.0543,
"step": 3613
},
{
"epoch": 0.9670858977789671,
"grad_norm": 3.6669461727142334,
"learning_rate": 9.448384568824685e-06,
"loss": 1.2643,
"step": 3614
},
{
"epoch": 0.9673534921059673,
"grad_norm": 3.3067846298217773,
"learning_rate": 9.447980436695787e-06,
"loss": 1.2128,
"step": 3615
},
{
"epoch": 0.9676210864329676,
"grad_norm": 3.863605499267578,
"learning_rate": 9.44757616523007e-06,
"loss": 1.2189,
"step": 3616
},
{
"epoch": 0.9678886807599679,
"grad_norm": 3.104247570037842,
"learning_rate": 9.447171754440195e-06,
"loss": 1.0933,
"step": 3617
},
{
"epoch": 0.9681562750869681,
"grad_norm": 4.070940017700195,
"learning_rate": 9.446767204338832e-06,
"loss": 1.2085,
"step": 3618
},
{
"epoch": 0.9684238694139684,
"grad_norm": 3.5472705364227295,
"learning_rate": 9.446362514938653e-06,
"loss": 1.1472,
"step": 3619
},
{
"epoch": 0.9686914637409687,
"grad_norm": 3.455854654312134,
"learning_rate": 9.445957686252336e-06,
"loss": 1.2072,
"step": 3620
},
{
"epoch": 0.968959058067969,
"grad_norm": 3.4434685707092285,
"learning_rate": 9.445552718292564e-06,
"loss": 1.0107,
"step": 3621
},
{
"epoch": 0.9692266523949692,
"grad_norm": 3.8100736141204834,
"learning_rate": 9.445147611072019e-06,
"loss": 1.4714,
"step": 3622
},
{
"epoch": 0.9694942467219695,
"grad_norm": 3.2713818550109863,
"learning_rate": 9.444742364603394e-06,
"loss": 1.1,
"step": 3623
},
{
"epoch": 0.9697618410489698,
"grad_norm": 3.6512348651885986,
"learning_rate": 9.444336978899384e-06,
"loss": 1.1204,
"step": 3624
},
{
"epoch": 0.9700294353759701,
"grad_norm": 3.12085223197937,
"learning_rate": 9.443931453972688e-06,
"loss": 1.1352,
"step": 3625
},
{
"epoch": 0.9702970297029703,
"grad_norm": 3.4997334480285645,
"learning_rate": 9.443525789836008e-06,
"loss": 1.184,
"step": 3626
},
{
"epoch": 0.9705646240299706,
"grad_norm": 3.1696505546569824,
"learning_rate": 9.443119986502053e-06,
"loss": 1.0827,
"step": 3627
},
{
"epoch": 0.9708322183569709,
"grad_norm": 2.911027431488037,
"learning_rate": 9.442714043983534e-06,
"loss": 0.9887,
"step": 3628
},
{
"epoch": 0.971099812683971,
"grad_norm": 3.3565635681152344,
"learning_rate": 9.44230796229317e-06,
"loss": 1.0637,
"step": 3629
},
{
"epoch": 0.9713674070109714,
"grad_norm": 3.567798614501953,
"learning_rate": 9.441901741443678e-06,
"loss": 1.0772,
"step": 3630
},
{
"epoch": 0.9716350013379716,
"grad_norm": 3.278237819671631,
"learning_rate": 9.441495381447787e-06,
"loss": 1.1284,
"step": 3631
},
{
"epoch": 0.971902595664972,
"grad_norm": 3.231175661087036,
"learning_rate": 9.441088882318223e-06,
"loss": 1.1555,
"step": 3632
},
{
"epoch": 0.9721701899919721,
"grad_norm": 3.619246006011963,
"learning_rate": 9.440682244067724e-06,
"loss": 1.2106,
"step": 3633
},
{
"epoch": 0.9724377843189724,
"grad_norm": 3.553589344024658,
"learning_rate": 9.440275466709025e-06,
"loss": 1.1818,
"step": 3634
},
{
"epoch": 0.9727053786459727,
"grad_norm": 3.7136471271514893,
"learning_rate": 9.43986855025487e-06,
"loss": 1.2618,
"step": 3635
},
{
"epoch": 0.972972972972973,
"grad_norm": 3.361229658126831,
"learning_rate": 9.439461494718006e-06,
"loss": 1.1693,
"step": 3636
},
{
"epoch": 0.9732405672999732,
"grad_norm": 3.6238696575164795,
"learning_rate": 9.439054300111183e-06,
"loss": 1.2334,
"step": 3637
},
{
"epoch": 0.9735081616269735,
"grad_norm": 3.4375593662261963,
"learning_rate": 9.438646966447158e-06,
"loss": 1.1148,
"step": 3638
},
{
"epoch": 0.9737757559539738,
"grad_norm": 3.1718556880950928,
"learning_rate": 9.438239493738692e-06,
"loss": 1.13,
"step": 3639
},
{
"epoch": 0.974043350280974,
"grad_norm": 3.3983285427093506,
"learning_rate": 9.437831881998548e-06,
"loss": 1.2176,
"step": 3640
},
{
"epoch": 0.9743109446079743,
"grad_norm": 3.254225015640259,
"learning_rate": 9.437424131239496e-06,
"loss": 1.2788,
"step": 3641
},
{
"epoch": 0.9745785389349746,
"grad_norm": 3.181648015975952,
"learning_rate": 9.437016241474307e-06,
"loss": 1.0597,
"step": 3642
},
{
"epoch": 0.9748461332619749,
"grad_norm": 3.5744338035583496,
"learning_rate": 9.43660821271576e-06,
"loss": 1.1843,
"step": 3643
},
{
"epoch": 0.9751137275889751,
"grad_norm": 3.0701401233673096,
"learning_rate": 9.436200044976638e-06,
"loss": 0.9703,
"step": 3644
},
{
"epoch": 0.9753813219159754,
"grad_norm": 3.134881019592285,
"learning_rate": 9.435791738269725e-06,
"loss": 1.1275,
"step": 3645
},
{
"epoch": 0.9756489162429757,
"grad_norm": 3.301896810531616,
"learning_rate": 9.43538329260781e-06,
"loss": 1.0886,
"step": 3646
},
{
"epoch": 0.975916510569976,
"grad_norm": 3.211254358291626,
"learning_rate": 9.434974708003694e-06,
"loss": 1.018,
"step": 3647
},
{
"epoch": 0.9761841048969762,
"grad_norm": 3.36275577545166,
"learning_rate": 9.434565984470172e-06,
"loss": 1.1677,
"step": 3648
},
{
"epoch": 0.9764516992239765,
"grad_norm": 3.162914276123047,
"learning_rate": 9.434157122020047e-06,
"loss": 1.1324,
"step": 3649
},
{
"epoch": 0.9767192935509768,
"grad_norm": 3.0449602603912354,
"learning_rate": 9.433748120666129e-06,
"loss": 1.0288,
"step": 3650
},
{
"epoch": 0.9769868878779769,
"grad_norm": 3.1934330463409424,
"learning_rate": 9.43333898042123e-06,
"loss": 1.1246,
"step": 3651
},
{
"epoch": 0.9772544822049772,
"grad_norm": 3.4677317142486572,
"learning_rate": 9.432929701298166e-06,
"loss": 1.0707,
"step": 3652
},
{
"epoch": 0.9775220765319775,
"grad_norm": 3.6359336376190186,
"learning_rate": 9.432520283309756e-06,
"loss": 1.1178,
"step": 3653
},
{
"epoch": 0.9777896708589778,
"grad_norm": 3.6173250675201416,
"learning_rate": 9.43211072646883e-06,
"loss": 1.2067,
"step": 3654
},
{
"epoch": 0.978057265185978,
"grad_norm": 3.4153003692626953,
"learning_rate": 9.431701030788215e-06,
"loss": 1.1684,
"step": 3655
},
{
"epoch": 0.9783248595129783,
"grad_norm": 3.5359673500061035,
"learning_rate": 9.431291196280745e-06,
"loss": 1.2438,
"step": 3656
},
{
"epoch": 0.9785924538399786,
"grad_norm": 3.2912895679473877,
"learning_rate": 9.430881222959258e-06,
"loss": 1.1959,
"step": 3657
},
{
"epoch": 0.9788600481669789,
"grad_norm": 3.617729902267456,
"learning_rate": 9.430471110836599e-06,
"loss": 1.2038,
"step": 3658
},
{
"epoch": 0.9791276424939791,
"grad_norm": 2.9933791160583496,
"learning_rate": 9.430060859925614e-06,
"loss": 1.0116,
"step": 3659
},
{
"epoch": 0.9793952368209794,
"grad_norm": 3.0839054584503174,
"learning_rate": 9.429650470239152e-06,
"loss": 1.0596,
"step": 3660
},
{
"epoch": 0.9796628311479797,
"grad_norm": 2.8952386379241943,
"learning_rate": 9.429239941790072e-06,
"loss": 1.0405,
"step": 3661
},
{
"epoch": 0.9799304254749799,
"grad_norm": 3.154313564300537,
"learning_rate": 9.428829274591234e-06,
"loss": 0.9956,
"step": 3662
},
{
"epoch": 0.9801980198019802,
"grad_norm": 3.2512032985687256,
"learning_rate": 9.4284184686555e-06,
"loss": 1.0996,
"step": 3663
},
{
"epoch": 0.9804656141289805,
"grad_norm": 3.4779818058013916,
"learning_rate": 9.428007523995741e-06,
"loss": 1.1982,
"step": 3664
},
{
"epoch": 0.9807332084559808,
"grad_norm": 3.4594082832336426,
"learning_rate": 9.427596440624832e-06,
"loss": 1.2461,
"step": 3665
},
{
"epoch": 0.981000802782981,
"grad_norm": 3.6881461143493652,
"learning_rate": 9.427185218555645e-06,
"loss": 1.1537,
"step": 3666
},
{
"epoch": 0.9812683971099813,
"grad_norm": 3.308070182800293,
"learning_rate": 9.426773857801067e-06,
"loss": 1.1589,
"step": 3667
},
{
"epoch": 0.9815359914369816,
"grad_norm": 3.4280967712402344,
"learning_rate": 9.426362358373981e-06,
"loss": 1.0666,
"step": 3668
},
{
"epoch": 0.9818035857639819,
"grad_norm": 3.077514886856079,
"learning_rate": 9.42595072028728e-06,
"loss": 1.2246,
"step": 3669
},
{
"epoch": 0.982071180090982,
"grad_norm": 2.923833131790161,
"learning_rate": 9.425538943553858e-06,
"loss": 0.9993,
"step": 3670
},
{
"epoch": 0.9823387744179823,
"grad_norm": 3.5569820404052734,
"learning_rate": 9.425127028186613e-06,
"loss": 1.1705,
"step": 3671
},
{
"epoch": 0.9826063687449826,
"grad_norm": 3.547506093978882,
"learning_rate": 9.42471497419845e-06,
"loss": 1.1549,
"step": 3672
},
{
"epoch": 0.9828739630719828,
"grad_norm": 3.3490121364593506,
"learning_rate": 9.424302781602277e-06,
"loss": 1.1942,
"step": 3673
},
{
"epoch": 0.9831415573989831,
"grad_norm": 3.038180112838745,
"learning_rate": 9.423890450411005e-06,
"loss": 0.9606,
"step": 3674
},
{
"epoch": 0.9834091517259834,
"grad_norm": 3.5518031120300293,
"learning_rate": 9.423477980637552e-06,
"loss": 1.1898,
"step": 3675
},
{
"epoch": 0.9836767460529837,
"grad_norm": 3.295964002609253,
"learning_rate": 9.42306537229484e-06,
"loss": 1.1399,
"step": 3676
},
{
"epoch": 0.9839443403799839,
"grad_norm": 3.0206878185272217,
"learning_rate": 9.422652625395791e-06,
"loss": 1.0414,
"step": 3677
},
{
"epoch": 0.9842119347069842,
"grad_norm": 3.4383180141448975,
"learning_rate": 9.422239739953337e-06,
"loss": 1.1399,
"step": 3678
},
{
"epoch": 0.9844795290339845,
"grad_norm": 3.2433290481567383,
"learning_rate": 9.42182671598041e-06,
"loss": 1.1698,
"step": 3679
},
{
"epoch": 0.9847471233609848,
"grad_norm": 3.151118755340576,
"learning_rate": 9.421413553489952e-06,
"loss": 1.1247,
"step": 3680
},
{
"epoch": 0.985014717687985,
"grad_norm": 3.5655605792999268,
"learning_rate": 9.421000252494902e-06,
"loss": 1.1653,
"step": 3681
},
{
"epoch": 0.9852823120149853,
"grad_norm": 2.9051101207733154,
"learning_rate": 9.42058681300821e-06,
"loss": 0.9898,
"step": 3682
},
{
"epoch": 0.9855499063419856,
"grad_norm": 3.184781312942505,
"learning_rate": 9.420173235042825e-06,
"loss": 1.0923,
"step": 3683
},
{
"epoch": 0.9858175006689858,
"grad_norm": 3.448216199874878,
"learning_rate": 9.419759518611704e-06,
"loss": 1.1623,
"step": 3684
},
{
"epoch": 0.9860850949959861,
"grad_norm": 3.177147626876831,
"learning_rate": 9.419345663727805e-06,
"loss": 1.0076,
"step": 3685
},
{
"epoch": 0.9863526893229864,
"grad_norm": 3.480036497116089,
"learning_rate": 9.418931670404096e-06,
"loss": 1.1148,
"step": 3686
},
{
"epoch": 0.9866202836499867,
"grad_norm": 3.5168023109436035,
"learning_rate": 9.418517538653541e-06,
"loss": 1.1358,
"step": 3687
},
{
"epoch": 0.9868878779769868,
"grad_norm": 3.518444061279297,
"learning_rate": 9.41810326848912e-06,
"loss": 1.1001,
"step": 3688
},
{
"epoch": 0.9871554723039871,
"grad_norm": 3.6086690425872803,
"learning_rate": 9.417688859923803e-06,
"loss": 1.2808,
"step": 3689
},
{
"epoch": 0.9874230666309874,
"grad_norm": 3.5940322875976562,
"learning_rate": 9.417274312970574e-06,
"loss": 1.1559,
"step": 3690
},
{
"epoch": 0.9876906609579877,
"grad_norm": 3.6169145107269287,
"learning_rate": 9.416859627642421e-06,
"loss": 1.1886,
"step": 3691
},
{
"epoch": 0.9879582552849879,
"grad_norm": 3.5202441215515137,
"learning_rate": 9.416444803952334e-06,
"loss": 1.2057,
"step": 3692
},
{
"epoch": 0.9882258496119882,
"grad_norm": 3.2127108573913574,
"learning_rate": 9.416029841913306e-06,
"loss": 0.9903,
"step": 3693
},
{
"epoch": 0.9884934439389885,
"grad_norm": 3.4693965911865234,
"learning_rate": 9.415614741538334e-06,
"loss": 1.1354,
"step": 3694
},
{
"epoch": 0.9887610382659887,
"grad_norm": 3.5482194423675537,
"learning_rate": 9.415199502840428e-06,
"loss": 1.2641,
"step": 3695
},
{
"epoch": 0.989028632592989,
"grad_norm": 2.904871702194214,
"learning_rate": 9.414784125832592e-06,
"loss": 1.0245,
"step": 3696
},
{
"epoch": 0.9892962269199893,
"grad_norm": 3.295982599258423,
"learning_rate": 9.414368610527835e-06,
"loss": 1.1042,
"step": 3697
},
{
"epoch": 0.9895638212469896,
"grad_norm": 3.2028210163116455,
"learning_rate": 9.413952956939179e-06,
"loss": 1.1382,
"step": 3698
},
{
"epoch": 0.9898314155739898,
"grad_norm": 3.677734136581421,
"learning_rate": 9.413537165079642e-06,
"loss": 1.3448,
"step": 3699
},
{
"epoch": 0.9900990099009901,
"grad_norm": 3.27496337890625,
"learning_rate": 9.413121234962248e-06,
"loss": 1.2098,
"step": 3700
},
{
"epoch": 0.9903666042279904,
"grad_norm": 3.1553688049316406,
"learning_rate": 9.412705166600026e-06,
"loss": 1.1474,
"step": 3701
},
{
"epoch": 0.9906341985549907,
"grad_norm": 3.2582755088806152,
"learning_rate": 9.412288960006016e-06,
"loss": 1.0499,
"step": 3702
},
{
"epoch": 0.9909017928819909,
"grad_norm": 3.174191474914551,
"learning_rate": 9.411872615193247e-06,
"loss": 1.1505,
"step": 3703
},
{
"epoch": 0.9911693872089912,
"grad_norm": 3.137990951538086,
"learning_rate": 9.411456132174768e-06,
"loss": 1.0766,
"step": 3704
},
{
"epoch": 0.9914369815359915,
"grad_norm": 3.366492748260498,
"learning_rate": 9.411039510963622e-06,
"loss": 1.265,
"step": 3705
},
{
"epoch": 0.9917045758629917,
"grad_norm": 3.27744460105896,
"learning_rate": 9.41062275157286e-06,
"loss": 1.0141,
"step": 3706
},
{
"epoch": 0.991972170189992,
"grad_norm": 3.3313217163085938,
"learning_rate": 9.410205854015542e-06,
"loss": 1.1903,
"step": 3707
},
{
"epoch": 0.9922397645169923,
"grad_norm": 3.281294584274292,
"learning_rate": 9.409788818304722e-06,
"loss": 1.164,
"step": 3708
},
{
"epoch": 0.9925073588439925,
"grad_norm": 3.4108831882476807,
"learning_rate": 9.409371644453467e-06,
"loss": 1.0759,
"step": 3709
},
{
"epoch": 0.9927749531709927,
"grad_norm": 2.919935703277588,
"learning_rate": 9.408954332474845e-06,
"loss": 1.0233,
"step": 3710
},
{
"epoch": 0.993042547497993,
"grad_norm": 3.332352876663208,
"learning_rate": 9.408536882381928e-06,
"loss": 1.0486,
"step": 3711
},
{
"epoch": 0.9933101418249933,
"grad_norm": 3.033339023590088,
"learning_rate": 9.408119294187791e-06,
"loss": 0.9737,
"step": 3712
},
{
"epoch": 0.9935777361519936,
"grad_norm": 3.5248947143554688,
"learning_rate": 9.40770156790552e-06,
"loss": 1.2238,
"step": 3713
},
{
"epoch": 0.9938453304789938,
"grad_norm": 3.687695026397705,
"learning_rate": 9.407283703548198e-06,
"loss": 1.1228,
"step": 3714
},
{
"epoch": 0.9941129248059941,
"grad_norm": 3.2966973781585693,
"learning_rate": 9.406865701128916e-06,
"loss": 1.0991,
"step": 3715
},
{
"epoch": 0.9943805191329944,
"grad_norm": 3.3037948608398438,
"learning_rate": 9.406447560660764e-06,
"loss": 1.1622,
"step": 3716
},
{
"epoch": 0.9946481134599946,
"grad_norm": 3.2481751441955566,
"learning_rate": 9.40602928215685e-06,
"loss": 1.0621,
"step": 3717
},
{
"epoch": 0.9949157077869949,
"grad_norm": 3.3406388759613037,
"learning_rate": 9.405610865630266e-06,
"loss": 1.1471,
"step": 3718
},
{
"epoch": 0.9951833021139952,
"grad_norm": 3.2971763610839844,
"learning_rate": 9.405192311094126e-06,
"loss": 1.1473,
"step": 3719
},
{
"epoch": 0.9954508964409955,
"grad_norm": 3.1722514629364014,
"learning_rate": 9.40477361856154e-06,
"loss": 1.0188,
"step": 3720
},
{
"epoch": 0.9957184907679957,
"grad_norm": 3.5597033500671387,
"learning_rate": 9.404354788045624e-06,
"loss": 1.1735,
"step": 3721
},
{
"epoch": 0.995986085094996,
"grad_norm": 3.2064480781555176,
"learning_rate": 9.403935819559497e-06,
"loss": 1.1026,
"step": 3722
},
{
"epoch": 0.9962536794219963,
"grad_norm": 3.40864896774292,
"learning_rate": 9.403516713116286e-06,
"loss": 1.2236,
"step": 3723
},
{
"epoch": 0.9965212737489966,
"grad_norm": 3.1237740516662598,
"learning_rate": 9.403097468729117e-06,
"loss": 1.0975,
"step": 3724
},
{
"epoch": 0.9967888680759968,
"grad_norm": 3.1500935554504395,
"learning_rate": 9.402678086411125e-06,
"loss": 1.0771,
"step": 3725
},
{
"epoch": 0.9970564624029971,
"grad_norm": 3.1867852210998535,
"learning_rate": 9.402258566175448e-06,
"loss": 0.9686,
"step": 3726
},
{
"epoch": 0.9973240567299974,
"grad_norm": 3.5945894718170166,
"learning_rate": 9.401838908035226e-06,
"loss": 1.2021,
"step": 3727
},
{
"epoch": 0.9975916510569975,
"grad_norm": 3.2258009910583496,
"learning_rate": 9.401419112003607e-06,
"loss": 1.1979,
"step": 3728
},
{
"epoch": 0.9978592453839978,
"grad_norm": 3.2605550289154053,
"learning_rate": 9.40099917809374e-06,
"loss": 1.1081,
"step": 3729
},
{
"epoch": 0.9981268397109981,
"grad_norm": 3.4050374031066895,
"learning_rate": 9.400579106318781e-06,
"loss": 1.1763,
"step": 3730
},
{
"epoch": 0.9983944340379984,
"grad_norm": 3.2462353706359863,
"learning_rate": 9.400158896691887e-06,
"loss": 1.0444,
"step": 3731
},
{
"epoch": 0.9986620283649986,
"grad_norm": 3.2882609367370605,
"learning_rate": 9.399738549226226e-06,
"loss": 1.0605,
"step": 3732
},
{
"epoch": 0.9989296226919989,
"grad_norm": 3.3432724475860596,
"learning_rate": 9.399318063934959e-06,
"loss": 1.109,
"step": 3733
},
{
"epoch": 0.9991972170189992,
"grad_norm": 3.5138442516326904,
"learning_rate": 9.398897440831263e-06,
"loss": 1.2697,
"step": 3734
},
{
"epoch": 0.9994648113459995,
"grad_norm": 3.237004280090332,
"learning_rate": 9.398476679928313e-06,
"loss": 1.0746,
"step": 3735
},
{
"epoch": 0.9997324056729997,
"grad_norm": 3.4802517890930176,
"learning_rate": 9.39805578123929e-06,
"loss": 1.1824,
"step": 3736
},
{
"epoch": 1.0,
"grad_norm": 3.3513879776000977,
"learning_rate": 9.39763474477738e-06,
"loss": 1.1062,
"step": 3737
}
],
"logging_steps": 1.0,
"max_steps": 18685,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2477154653844275e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}