Files
M-Prometheus-7B/trainer_state.json
ModelHub XC 862b82c654 初始化项目,由ModelHub XC社区提供模型
Model: Unbabel/M-Prometheus-7B
Source: Original Platform
2026-05-09 19:33:27 +08:00

20834 lines
504 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999663084127893,
"eval_steps": 500,
"global_step": 14840,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.7383174421347e-05,
"grad_norm": 44.709284858028134,
"learning_rate": 6.738544474393531e-10,
"loss": 3.4965,
"step": 1
},
{
"epoch": 0.00033691587210673494,
"grad_norm": 44.640535530285916,
"learning_rate": 3.3692722371967655e-09,
"loss": 3.356,
"step": 5
},
{
"epoch": 0.0006738317442134699,
"grad_norm": 51.56048341252242,
"learning_rate": 6.738544474393531e-09,
"loss": 3.3383,
"step": 10
},
{
"epoch": 0.001010747616320205,
"grad_norm": 53.90723002649116,
"learning_rate": 1.0107816711590296e-08,
"loss": 3.3735,
"step": 15
},
{
"epoch": 0.0013476634884269398,
"grad_norm": 46.1101675459699,
"learning_rate": 1.3477088948787062e-08,
"loss": 3.3197,
"step": 20
},
{
"epoch": 0.0016845793605336748,
"grad_norm": 43.15160877782767,
"learning_rate": 1.6846361185983825e-08,
"loss": 3.2933,
"step": 25
},
{
"epoch": 0.00202149523264041,
"grad_norm": 38.262340082474964,
"learning_rate": 2.021563342318059e-08,
"loss": 3.2625,
"step": 30
},
{
"epoch": 0.0023584111047471445,
"grad_norm": 43.90995030755105,
"learning_rate": 2.3584905660377358e-08,
"loss": 3.3362,
"step": 35
},
{
"epoch": 0.0026953269768538795,
"grad_norm": 49.80340871131932,
"learning_rate": 2.6954177897574124e-08,
"loss": 3.3537,
"step": 40
},
{
"epoch": 0.0030322428489606146,
"grad_norm": 44.81802216376179,
"learning_rate": 3.032345013477089e-08,
"loss": 3.379,
"step": 45
},
{
"epoch": 0.0033691587210673496,
"grad_norm": 50.77404389674636,
"learning_rate": 3.369272237196765e-08,
"loss": 3.4307,
"step": 50
},
{
"epoch": 0.0037060745931740842,
"grad_norm": 46.42613238025317,
"learning_rate": 3.706199460916442e-08,
"loss": 3.3911,
"step": 55
},
{
"epoch": 0.00404299046528082,
"grad_norm": 46.258120740760205,
"learning_rate": 4.043126684636118e-08,
"loss": 3.2756,
"step": 60
},
{
"epoch": 0.004379906337387554,
"grad_norm": 43.14076176958262,
"learning_rate": 4.380053908355795e-08,
"loss": 3.355,
"step": 65
},
{
"epoch": 0.004716822209494289,
"grad_norm": 46.81420499567808,
"learning_rate": 4.7169811320754715e-08,
"loss": 3.3903,
"step": 70
},
{
"epoch": 0.005053738081601024,
"grad_norm": 44.52936544164706,
"learning_rate": 5.053908355795148e-08,
"loss": 3.2939,
"step": 75
},
{
"epoch": 0.005390653953707759,
"grad_norm": 34.86781526806953,
"learning_rate": 5.390835579514825e-08,
"loss": 3.198,
"step": 80
},
{
"epoch": 0.005727569825814494,
"grad_norm": 29.870304957206507,
"learning_rate": 5.727762803234501e-08,
"loss": 3.0484,
"step": 85
},
{
"epoch": 0.006064485697921229,
"grad_norm": 35.58464490708436,
"learning_rate": 6.064690026954177e-08,
"loss": 3.201,
"step": 90
},
{
"epoch": 0.006401401570027964,
"grad_norm": 34.646361724277995,
"learning_rate": 6.401617250673854e-08,
"loss": 3.2132,
"step": 95
},
{
"epoch": 0.006738317442134699,
"grad_norm": 32.994364624399495,
"learning_rate": 6.73854447439353e-08,
"loss": 3.225,
"step": 100
},
{
"epoch": 0.007075233314241434,
"grad_norm": 33.94119605988894,
"learning_rate": 7.075471698113207e-08,
"loss": 3.2271,
"step": 105
},
{
"epoch": 0.0074121491863481685,
"grad_norm": 22.362130330257422,
"learning_rate": 7.412398921832884e-08,
"loss": 3.0439,
"step": 110
},
{
"epoch": 0.0077490650584549035,
"grad_norm": 20.209448650967722,
"learning_rate": 7.749326145552561e-08,
"loss": 3.015,
"step": 115
},
{
"epoch": 0.00808598093056164,
"grad_norm": 19.57320336750467,
"learning_rate": 8.086253369272237e-08,
"loss": 2.9764,
"step": 120
},
{
"epoch": 0.008422896802668374,
"grad_norm": 18.543985000117726,
"learning_rate": 8.423180592991913e-08,
"loss": 2.9175,
"step": 125
},
{
"epoch": 0.008759812674775108,
"grad_norm": 18.684249606371242,
"learning_rate": 8.76010781671159e-08,
"loss": 3.0318,
"step": 130
},
{
"epoch": 0.009096728546881844,
"grad_norm": 16.654807296468853,
"learning_rate": 9.097035040431267e-08,
"loss": 2.9674,
"step": 135
},
{
"epoch": 0.009433644418988578,
"grad_norm": 15.119507215593755,
"learning_rate": 9.433962264150943e-08,
"loss": 2.8187,
"step": 140
},
{
"epoch": 0.009770560291095314,
"grad_norm": 13.383270486575368,
"learning_rate": 9.770889487870619e-08,
"loss": 2.8611,
"step": 145
},
{
"epoch": 0.010107476163202048,
"grad_norm": 11.350525217429713,
"learning_rate": 1.0107816711590296e-07,
"loss": 2.7627,
"step": 150
},
{
"epoch": 0.010444392035308784,
"grad_norm": 10.567978119152889,
"learning_rate": 1.0444743935309973e-07,
"loss": 2.7982,
"step": 155
},
{
"epoch": 0.010781307907415518,
"grad_norm": 11.623938979272912,
"learning_rate": 1.078167115902965e-07,
"loss": 2.7693,
"step": 160
},
{
"epoch": 0.011118223779522254,
"grad_norm": 10.879180679917159,
"learning_rate": 1.1118598382749325e-07,
"loss": 2.7249,
"step": 165
},
{
"epoch": 0.011455139651628988,
"grad_norm": 10.08425226855161,
"learning_rate": 1.1455525606469002e-07,
"loss": 2.7114,
"step": 170
},
{
"epoch": 0.011792055523735722,
"grad_norm": 10.568577238407425,
"learning_rate": 1.1792452830188679e-07,
"loss": 2.7127,
"step": 175
},
{
"epoch": 0.012128971395842458,
"grad_norm": 8.851324037587174,
"learning_rate": 1.2129380053908355e-07,
"loss": 2.7347,
"step": 180
},
{
"epoch": 0.012465887267949192,
"grad_norm": 9.055687625401953,
"learning_rate": 1.2466307277628032e-07,
"loss": 2.7112,
"step": 185
},
{
"epoch": 0.012802803140055928,
"grad_norm": 9.473681984573032,
"learning_rate": 1.280323450134771e-07,
"loss": 2.6401,
"step": 190
},
{
"epoch": 0.013139719012162663,
"grad_norm": 8.258411293131203,
"learning_rate": 1.3140161725067383e-07,
"loss": 2.6933,
"step": 195
},
{
"epoch": 0.013476634884269399,
"grad_norm": 8.251773198341327,
"learning_rate": 1.347708894878706e-07,
"loss": 2.6369,
"step": 200
},
{
"epoch": 0.013813550756376133,
"grad_norm": 8.076392324410342,
"learning_rate": 1.3814016172506737e-07,
"loss": 2.6968,
"step": 205
},
{
"epoch": 0.014150466628482869,
"grad_norm": 8.399380306646764,
"learning_rate": 1.4150943396226414e-07,
"loss": 2.6753,
"step": 210
},
{
"epoch": 0.014487382500589603,
"grad_norm": 8.114213020798976,
"learning_rate": 1.448787061994609e-07,
"loss": 2.5847,
"step": 215
},
{
"epoch": 0.014824298372696337,
"grad_norm": 9.006493080130776,
"learning_rate": 1.4824797843665768e-07,
"loss": 2.6046,
"step": 220
},
{
"epoch": 0.015161214244803073,
"grad_norm": 9.572718178175688,
"learning_rate": 1.5161725067385445e-07,
"loss": 2.577,
"step": 225
},
{
"epoch": 0.015498130116909807,
"grad_norm": 8.010214290776423,
"learning_rate": 1.5498652291105122e-07,
"loss": 2.5808,
"step": 230
},
{
"epoch": 0.01583504598901654,
"grad_norm": 8.65551262467679,
"learning_rate": 1.58355795148248e-07,
"loss": 2.5189,
"step": 235
},
{
"epoch": 0.01617196186112328,
"grad_norm": 7.23054831726289,
"learning_rate": 1.6172506738544473e-07,
"loss": 2.5421,
"step": 240
},
{
"epoch": 0.016508877733230013,
"grad_norm": 8.850682737865846,
"learning_rate": 1.650943396226415e-07,
"loss": 2.5049,
"step": 245
},
{
"epoch": 0.016845793605336747,
"grad_norm": 8.26897640609425,
"learning_rate": 1.6846361185983827e-07,
"loss": 2.497,
"step": 250
},
{
"epoch": 0.01718270947744348,
"grad_norm": 7.774269288496068,
"learning_rate": 1.7183288409703504e-07,
"loss": 2.5256,
"step": 255
},
{
"epoch": 0.017519625349550216,
"grad_norm": 7.363973310169826,
"learning_rate": 1.752021563342318e-07,
"loss": 2.4144,
"step": 260
},
{
"epoch": 0.017856541221656953,
"grad_norm": 8.517367385880418,
"learning_rate": 1.7857142857142858e-07,
"loss": 2.5122,
"step": 265
},
{
"epoch": 0.018193457093763687,
"grad_norm": 7.193035308993747,
"learning_rate": 1.8194070080862535e-07,
"loss": 2.4599,
"step": 270
},
{
"epoch": 0.01853037296587042,
"grad_norm": 7.101515399204041,
"learning_rate": 1.853099730458221e-07,
"loss": 2.4951,
"step": 275
},
{
"epoch": 0.018867288837977156,
"grad_norm": 6.722538155052685,
"learning_rate": 1.8867924528301886e-07,
"loss": 2.4476,
"step": 280
},
{
"epoch": 0.019204204710083893,
"grad_norm": 6.943285735322818,
"learning_rate": 1.920485175202156e-07,
"loss": 2.413,
"step": 285
},
{
"epoch": 0.019541120582190628,
"grad_norm": 7.561710369578133,
"learning_rate": 1.9541778975741237e-07,
"loss": 2.4349,
"step": 290
},
{
"epoch": 0.019878036454297362,
"grad_norm": 8.469766417330744,
"learning_rate": 1.9878706199460914e-07,
"loss": 2.4515,
"step": 295
},
{
"epoch": 0.020214952326404096,
"grad_norm": 7.543991515105139,
"learning_rate": 2.021563342318059e-07,
"loss": 2.4827,
"step": 300
},
{
"epoch": 0.02055186819851083,
"grad_norm": 6.931353522398812,
"learning_rate": 2.0552560646900268e-07,
"loss": 2.3415,
"step": 305
},
{
"epoch": 0.020888784070617568,
"grad_norm": 8.001618096058628,
"learning_rate": 2.0889487870619945e-07,
"loss": 2.4162,
"step": 310
},
{
"epoch": 0.021225699942724302,
"grad_norm": 7.264662723671408,
"learning_rate": 2.1226415094339622e-07,
"loss": 2.4595,
"step": 315
},
{
"epoch": 0.021562615814831036,
"grad_norm": 7.210766841546526,
"learning_rate": 2.15633423180593e-07,
"loss": 2.4141,
"step": 320
},
{
"epoch": 0.02189953168693777,
"grad_norm": 8.07327269473754,
"learning_rate": 2.1900269541778973e-07,
"loss": 2.4338,
"step": 325
},
{
"epoch": 0.022236447559044508,
"grad_norm": 7.20614839275969,
"learning_rate": 2.223719676549865e-07,
"loss": 2.4118,
"step": 330
},
{
"epoch": 0.022573363431151242,
"grad_norm": 6.439804675900733,
"learning_rate": 2.2574123989218327e-07,
"loss": 2.3203,
"step": 335
},
{
"epoch": 0.022910279303257976,
"grad_norm": 7.145016131156796,
"learning_rate": 2.2911051212938004e-07,
"loss": 2.3943,
"step": 340
},
{
"epoch": 0.02324719517536471,
"grad_norm": 6.778656371306502,
"learning_rate": 2.324797843665768e-07,
"loss": 2.4236,
"step": 345
},
{
"epoch": 0.023584111047471445,
"grad_norm": 6.722819499030103,
"learning_rate": 2.3584905660377358e-07,
"loss": 2.364,
"step": 350
},
{
"epoch": 0.023921026919578182,
"grad_norm": 6.927729717159921,
"learning_rate": 2.392183288409703e-07,
"loss": 2.4121,
"step": 355
},
{
"epoch": 0.024257942791684917,
"grad_norm": 7.406480382151978,
"learning_rate": 2.425876010781671e-07,
"loss": 2.379,
"step": 360
},
{
"epoch": 0.02459485866379165,
"grad_norm": 7.47620372710672,
"learning_rate": 2.4595687331536387e-07,
"loss": 2.3708,
"step": 365
},
{
"epoch": 0.024931774535898385,
"grad_norm": 8.008131038928196,
"learning_rate": 2.4932614555256063e-07,
"loss": 2.3382,
"step": 370
},
{
"epoch": 0.025268690408005123,
"grad_norm": 7.200531642388836,
"learning_rate": 2.526954177897574e-07,
"loss": 2.304,
"step": 375
},
{
"epoch": 0.025605606280111857,
"grad_norm": 7.44236759253605,
"learning_rate": 2.560646900269542e-07,
"loss": 2.2896,
"step": 380
},
{
"epoch": 0.02594252215221859,
"grad_norm": 7.7421899686207984,
"learning_rate": 2.5943396226415094e-07,
"loss": 2.2727,
"step": 385
},
{
"epoch": 0.026279438024325325,
"grad_norm": 7.444999671536141,
"learning_rate": 2.6280323450134766e-07,
"loss": 2.3202,
"step": 390
},
{
"epoch": 0.02661635389643206,
"grad_norm": 7.695216978308538,
"learning_rate": 2.661725067385445e-07,
"loss": 2.3857,
"step": 395
},
{
"epoch": 0.026953269768538797,
"grad_norm": 7.069350012569017,
"learning_rate": 2.695417789757412e-07,
"loss": 2.2707,
"step": 400
},
{
"epoch": 0.02729018564064553,
"grad_norm": 7.068751554462215,
"learning_rate": 2.72911051212938e-07,
"loss": 2.2706,
"step": 405
},
{
"epoch": 0.027627101512752265,
"grad_norm": 6.929312319633547,
"learning_rate": 2.7628032345013474e-07,
"loss": 2.2942,
"step": 410
},
{
"epoch": 0.027964017384859,
"grad_norm": 7.258446906044968,
"learning_rate": 2.7964959568733156e-07,
"loss": 2.2749,
"step": 415
},
{
"epoch": 0.028300933256965737,
"grad_norm": 7.212702331192978,
"learning_rate": 2.830188679245283e-07,
"loss": 2.3326,
"step": 420
},
{
"epoch": 0.02863784912907247,
"grad_norm": 6.708270480284244,
"learning_rate": 2.863881401617251e-07,
"loss": 2.3198,
"step": 425
},
{
"epoch": 0.028974765001179206,
"grad_norm": 6.502809724253282,
"learning_rate": 2.897574123989218e-07,
"loss": 2.2732,
"step": 430
},
{
"epoch": 0.02931168087328594,
"grad_norm": 6.628997528648987,
"learning_rate": 2.9312668463611853e-07,
"loss": 2.2332,
"step": 435
},
{
"epoch": 0.029648596745392674,
"grad_norm": 7.325695254112328,
"learning_rate": 2.9649595687331536e-07,
"loss": 2.3131,
"step": 440
},
{
"epoch": 0.02998551261749941,
"grad_norm": 8.473803247223215,
"learning_rate": 2.9986522911051207e-07,
"loss": 2.2512,
"step": 445
},
{
"epoch": 0.030322428489606146,
"grad_norm": 6.921554435986179,
"learning_rate": 3.032345013477089e-07,
"loss": 2.2517,
"step": 450
},
{
"epoch": 0.03065934436171288,
"grad_norm": 7.30639146819376,
"learning_rate": 3.066037735849056e-07,
"loss": 2.2854,
"step": 455
},
{
"epoch": 0.030996260233819614,
"grad_norm": 7.431388997634885,
"learning_rate": 3.0997304582210244e-07,
"loss": 2.189,
"step": 460
},
{
"epoch": 0.03133317610592635,
"grad_norm": 6.792538164315586,
"learning_rate": 3.1334231805929915e-07,
"loss": 2.2807,
"step": 465
},
{
"epoch": 0.03167009197803308,
"grad_norm": 7.172802025644924,
"learning_rate": 3.16711590296496e-07,
"loss": 2.2403,
"step": 470
},
{
"epoch": 0.03200700785013982,
"grad_norm": 7.396610718898199,
"learning_rate": 3.200808625336927e-07,
"loss": 2.2779,
"step": 475
},
{
"epoch": 0.03234392372224656,
"grad_norm": 6.645540214064375,
"learning_rate": 3.2345013477088946e-07,
"loss": 2.275,
"step": 480
},
{
"epoch": 0.03268083959435329,
"grad_norm": 6.9885548745948896,
"learning_rate": 3.2681940700808623e-07,
"loss": 2.263,
"step": 485
},
{
"epoch": 0.033017755466460026,
"grad_norm": 6.83925180479448,
"learning_rate": 3.30188679245283e-07,
"loss": 2.2113,
"step": 490
},
{
"epoch": 0.03335467133856676,
"grad_norm": 6.814370294667984,
"learning_rate": 3.3355795148247977e-07,
"loss": 2.298,
"step": 495
},
{
"epoch": 0.033691587210673495,
"grad_norm": 6.750421685926166,
"learning_rate": 3.3692722371967654e-07,
"loss": 2.2178,
"step": 500
},
{
"epoch": 0.03402850308278023,
"grad_norm": 7.5770117147083225,
"learning_rate": 3.402964959568733e-07,
"loss": 2.1977,
"step": 505
},
{
"epoch": 0.03436541895488696,
"grad_norm": 7.294251237933485,
"learning_rate": 3.436657681940701e-07,
"loss": 2.2226,
"step": 510
},
{
"epoch": 0.0347023348269937,
"grad_norm": 6.998637054729564,
"learning_rate": 3.4703504043126685e-07,
"loss": 2.2606,
"step": 515
},
{
"epoch": 0.03503925069910043,
"grad_norm": 6.680594853350787,
"learning_rate": 3.504043126684636e-07,
"loss": 2.2916,
"step": 520
},
{
"epoch": 0.03537616657120717,
"grad_norm": 7.092023051416907,
"learning_rate": 3.5377358490566033e-07,
"loss": 2.2562,
"step": 525
},
{
"epoch": 0.03571308244331391,
"grad_norm": 6.4188706045554085,
"learning_rate": 3.5714285714285716e-07,
"loss": 2.1833,
"step": 530
},
{
"epoch": 0.03604999831542064,
"grad_norm": 7.093402002469701,
"learning_rate": 3.605121293800539e-07,
"loss": 2.2543,
"step": 535
},
{
"epoch": 0.036386914187527375,
"grad_norm": 7.249732464353174,
"learning_rate": 3.638814016172507e-07,
"loss": 2.2095,
"step": 540
},
{
"epoch": 0.03672383005963411,
"grad_norm": 7.0654911164895084,
"learning_rate": 3.672506738544474e-07,
"loss": 2.2404,
"step": 545
},
{
"epoch": 0.03706074593174084,
"grad_norm": 6.603314898627233,
"learning_rate": 3.706199460916442e-07,
"loss": 2.2267,
"step": 550
},
{
"epoch": 0.03739766180384758,
"grad_norm": 6.781857607869617,
"learning_rate": 3.7398921832884095e-07,
"loss": 2.2001,
"step": 555
},
{
"epoch": 0.03773457767595431,
"grad_norm": 6.976038643674048,
"learning_rate": 3.773584905660377e-07,
"loss": 2.2232,
"step": 560
},
{
"epoch": 0.038071493548061046,
"grad_norm": 6.403705006233041,
"learning_rate": 3.807277628032345e-07,
"loss": 2.1761,
"step": 565
},
{
"epoch": 0.03840840942016779,
"grad_norm": 6.661006329480535,
"learning_rate": 3.840970350404312e-07,
"loss": 2.2408,
"step": 570
},
{
"epoch": 0.03874532529227452,
"grad_norm": 6.543237506923167,
"learning_rate": 3.8746630727762803e-07,
"loss": 2.177,
"step": 575
},
{
"epoch": 0.039082241164381255,
"grad_norm": 7.382219270622229,
"learning_rate": 3.9083557951482475e-07,
"loss": 2.1816,
"step": 580
},
{
"epoch": 0.03941915703648799,
"grad_norm": 6.406997261594138,
"learning_rate": 3.9420485175202157e-07,
"loss": 2.2046,
"step": 585
},
{
"epoch": 0.039756072908594724,
"grad_norm": 7.4326501687473465,
"learning_rate": 3.975741239892183e-07,
"loss": 2.2355,
"step": 590
},
{
"epoch": 0.04009298878070146,
"grad_norm": 6.902366223177666,
"learning_rate": 4.009433962264151e-07,
"loss": 2.2618,
"step": 595
},
{
"epoch": 0.04042990465280819,
"grad_norm": 6.688891128794694,
"learning_rate": 4.043126684636118e-07,
"loss": 2.1724,
"step": 600
},
{
"epoch": 0.040766820524914926,
"grad_norm": 6.772799117362383,
"learning_rate": 4.076819407008086e-07,
"loss": 2.2251,
"step": 605
},
{
"epoch": 0.04110373639702166,
"grad_norm": 7.2951116139212395,
"learning_rate": 4.1105121293800537e-07,
"loss": 2.2227,
"step": 610
},
{
"epoch": 0.0414406522691284,
"grad_norm": 7.314163667705275,
"learning_rate": 4.1442048517520213e-07,
"loss": 2.1938,
"step": 615
},
{
"epoch": 0.041777568141235136,
"grad_norm": 7.136287025015161,
"learning_rate": 4.177897574123989e-07,
"loss": 2.2072,
"step": 620
},
{
"epoch": 0.04211448401334187,
"grad_norm": 6.6315151337205664,
"learning_rate": 4.211590296495957e-07,
"loss": 2.1836,
"step": 625
},
{
"epoch": 0.042451399885448604,
"grad_norm": 6.599360278893471,
"learning_rate": 4.2452830188679244e-07,
"loss": 2.1652,
"step": 630
},
{
"epoch": 0.04278831575755534,
"grad_norm": 6.9184973647199595,
"learning_rate": 4.278975741239892e-07,
"loss": 2.2163,
"step": 635
},
{
"epoch": 0.04312523162966207,
"grad_norm": 7.832951280915868,
"learning_rate": 4.31266846361186e-07,
"loss": 2.1826,
"step": 640
},
{
"epoch": 0.04346214750176881,
"grad_norm": 6.941910380094433,
"learning_rate": 4.3463611859838275e-07,
"loss": 2.1137,
"step": 645
},
{
"epoch": 0.04379906337387554,
"grad_norm": 6.563567550854655,
"learning_rate": 4.3800539083557947e-07,
"loss": 2.1699,
"step": 650
},
{
"epoch": 0.044135979245982275,
"grad_norm": 6.309193421506049,
"learning_rate": 4.413746630727763e-07,
"loss": 2.2038,
"step": 655
},
{
"epoch": 0.044472895118089016,
"grad_norm": 7.1931350816677115,
"learning_rate": 4.44743935309973e-07,
"loss": 2.1559,
"step": 660
},
{
"epoch": 0.04480981099019575,
"grad_norm": 7.352209899108727,
"learning_rate": 4.481132075471698e-07,
"loss": 2.2001,
"step": 665
},
{
"epoch": 0.045146726862302484,
"grad_norm": 6.731481980763938,
"learning_rate": 4.5148247978436655e-07,
"loss": 2.1885,
"step": 670
},
{
"epoch": 0.04548364273440922,
"grad_norm": 6.4333683639934724,
"learning_rate": 4.548517520215633e-07,
"loss": 2.2209,
"step": 675
},
{
"epoch": 0.04582055860651595,
"grad_norm": 6.555299933379583,
"learning_rate": 4.582210242587601e-07,
"loss": 2.1986,
"step": 680
},
{
"epoch": 0.04615747447862269,
"grad_norm": 6.555677397835128,
"learning_rate": 4.6159029649595686e-07,
"loss": 2.2115,
"step": 685
},
{
"epoch": 0.04649439035072942,
"grad_norm": 6.8354360632248135,
"learning_rate": 4.649595687331536e-07,
"loss": 2.1043,
"step": 690
},
{
"epoch": 0.046831306222836155,
"grad_norm": 6.728423130936894,
"learning_rate": 4.6832884097035034e-07,
"loss": 2.1579,
"step": 695
},
{
"epoch": 0.04716822209494289,
"grad_norm": 6.904472227996776,
"learning_rate": 4.7169811320754717e-07,
"loss": 2.1363,
"step": 700
},
{
"epoch": 0.04750513796704963,
"grad_norm": 6.802051471379565,
"learning_rate": 4.750673854447439e-07,
"loss": 2.1569,
"step": 705
},
{
"epoch": 0.047842053839156365,
"grad_norm": 6.58219453779912,
"learning_rate": 4.784366576819407e-07,
"loss": 2.2192,
"step": 710
},
{
"epoch": 0.0481789697112631,
"grad_norm": 7.071304617996877,
"learning_rate": 4.818059299191375e-07,
"loss": 2.1741,
"step": 715
},
{
"epoch": 0.04851588558336983,
"grad_norm": 6.383783135641038,
"learning_rate": 4.851752021563342e-07,
"loss": 2.1671,
"step": 720
},
{
"epoch": 0.04885280145547657,
"grad_norm": 7.060000529687919,
"learning_rate": 4.88544474393531e-07,
"loss": 2.1408,
"step": 725
},
{
"epoch": 0.0491897173275833,
"grad_norm": 6.9599946143852405,
"learning_rate": 4.919137466307277e-07,
"loss": 2.1848,
"step": 730
},
{
"epoch": 0.049526633199690036,
"grad_norm": 7.182506932662675,
"learning_rate": 4.952830188679246e-07,
"loss": 2.155,
"step": 735
},
{
"epoch": 0.04986354907179677,
"grad_norm": 7.176366825569252,
"learning_rate": 4.986522911051213e-07,
"loss": 2.0861,
"step": 740
},
{
"epoch": 0.050200464943903504,
"grad_norm": 7.2983414546411565,
"learning_rate": 5.020215633423181e-07,
"loss": 2.1813,
"step": 745
},
{
"epoch": 0.050537380816010245,
"grad_norm": 6.829539638304298,
"learning_rate": 5.053908355795148e-07,
"loss": 2.1295,
"step": 750
},
{
"epoch": 0.05087429668811698,
"grad_norm": 6.222282062474875,
"learning_rate": 5.087601078167115e-07,
"loss": 2.1948,
"step": 755
},
{
"epoch": 0.051211212560223714,
"grad_norm": 6.863565766148611,
"learning_rate": 5.121293800539083e-07,
"loss": 2.1797,
"step": 760
},
{
"epoch": 0.05154812843233045,
"grad_norm": 6.415782001462684,
"learning_rate": 5.154986522911052e-07,
"loss": 2.1882,
"step": 765
},
{
"epoch": 0.05188504430443718,
"grad_norm": 6.77615019110364,
"learning_rate": 5.188679245283019e-07,
"loss": 2.1906,
"step": 770
},
{
"epoch": 0.052221960176543916,
"grad_norm": 6.460890212003313,
"learning_rate": 5.222371967654986e-07,
"loss": 2.1472,
"step": 775
},
{
"epoch": 0.05255887604865065,
"grad_norm": 6.347677993132583,
"learning_rate": 5.256064690026953e-07,
"loss": 2.1341,
"step": 780
},
{
"epoch": 0.052895791920757385,
"grad_norm": 6.751500536981585,
"learning_rate": 5.289757412398921e-07,
"loss": 2.1972,
"step": 785
},
{
"epoch": 0.05323270779286412,
"grad_norm": 7.361128453658074,
"learning_rate": 5.32345013477089e-07,
"loss": 2.1438,
"step": 790
},
{
"epoch": 0.05356962366497086,
"grad_norm": 6.662796897278697,
"learning_rate": 5.357142857142857e-07,
"loss": 2.1632,
"step": 795
},
{
"epoch": 0.053906539537077594,
"grad_norm": 6.094649059417771,
"learning_rate": 5.390835579514824e-07,
"loss": 2.0973,
"step": 800
},
{
"epoch": 0.05424345540918433,
"grad_norm": 7.282836195110237,
"learning_rate": 5.424528301886792e-07,
"loss": 2.133,
"step": 805
},
{
"epoch": 0.05458037128129106,
"grad_norm": 6.23571863669226,
"learning_rate": 5.45822102425876e-07,
"loss": 2.1828,
"step": 810
},
{
"epoch": 0.0549172871533978,
"grad_norm": 7.386022383658537,
"learning_rate": 5.491913746630728e-07,
"loss": 2.0529,
"step": 815
},
{
"epoch": 0.05525420302550453,
"grad_norm": 9.35422370123258,
"learning_rate": 5.525606469002695e-07,
"loss": 2.1733,
"step": 820
},
{
"epoch": 0.055591118897611265,
"grad_norm": 7.192922966723721,
"learning_rate": 5.559299191374662e-07,
"loss": 2.1616,
"step": 825
},
{
"epoch": 0.055928034769718,
"grad_norm": 6.225514128765482,
"learning_rate": 5.592991913746631e-07,
"loss": 2.1473,
"step": 830
},
{
"epoch": 0.05626495064182473,
"grad_norm": 6.788375025181168,
"learning_rate": 5.626684636118598e-07,
"loss": 2.1498,
"step": 835
},
{
"epoch": 0.056601866513931474,
"grad_norm": 6.722245770758269,
"learning_rate": 5.660377358490566e-07,
"loss": 2.1252,
"step": 840
},
{
"epoch": 0.05693878238603821,
"grad_norm": 6.609890705250101,
"learning_rate": 5.694070080862533e-07,
"loss": 2.1435,
"step": 845
},
{
"epoch": 0.05727569825814494,
"grad_norm": 6.603484908307562,
"learning_rate": 5.727762803234502e-07,
"loss": 2.1841,
"step": 850
},
{
"epoch": 0.05761261413025168,
"grad_norm": 6.378869444678416,
"learning_rate": 5.761455525606469e-07,
"loss": 2.1831,
"step": 855
},
{
"epoch": 0.05794953000235841,
"grad_norm": 6.184926864820992,
"learning_rate": 5.795148247978436e-07,
"loss": 2.2116,
"step": 860
},
{
"epoch": 0.058286445874465145,
"grad_norm": 6.900192026852389,
"learning_rate": 5.828840970350404e-07,
"loss": 2.0611,
"step": 865
},
{
"epoch": 0.05862336174657188,
"grad_norm": 6.848190090768429,
"learning_rate": 5.862533692722371e-07,
"loss": 2.1072,
"step": 870
},
{
"epoch": 0.058960277618678614,
"grad_norm": 7.775201016865638,
"learning_rate": 5.89622641509434e-07,
"loss": 2.1282,
"step": 875
},
{
"epoch": 0.05929719349078535,
"grad_norm": 7.029087955968144,
"learning_rate": 5.929919137466307e-07,
"loss": 2.1134,
"step": 880
},
{
"epoch": 0.05963410936289209,
"grad_norm": 7.451133466059003,
"learning_rate": 5.963611859838274e-07,
"loss": 2.1239,
"step": 885
},
{
"epoch": 0.05997102523499882,
"grad_norm": 6.8121792270326065,
"learning_rate": 5.997304582210241e-07,
"loss": 2.1138,
"step": 890
},
{
"epoch": 0.06030794110710556,
"grad_norm": 6.705718982799264,
"learning_rate": 6.030997304582211e-07,
"loss": 2.0839,
"step": 895
},
{
"epoch": 0.06064485697921229,
"grad_norm": 6.870521558441548,
"learning_rate": 6.064690026954178e-07,
"loss": 2.0964,
"step": 900
},
{
"epoch": 0.060981772851319026,
"grad_norm": 7.000218922277794,
"learning_rate": 6.098382749326145e-07,
"loss": 2.0962,
"step": 905
},
{
"epoch": 0.06131868872342576,
"grad_norm": 6.953179430165108,
"learning_rate": 6.132075471698112e-07,
"loss": 2.1055,
"step": 910
},
{
"epoch": 0.061655604595532494,
"grad_norm": 6.928934208954968,
"learning_rate": 6.16576819407008e-07,
"loss": 2.1092,
"step": 915
},
{
"epoch": 0.06199252046763923,
"grad_norm": 6.701423119406395,
"learning_rate": 6.199460916442049e-07,
"loss": 2.0719,
"step": 920
},
{
"epoch": 0.06232943633974596,
"grad_norm": 6.0010161747456,
"learning_rate": 6.233153638814016e-07,
"loss": 2.0888,
"step": 925
},
{
"epoch": 0.0626663522118527,
"grad_norm": 7.097553267015575,
"learning_rate": 6.266846361185983e-07,
"loss": 2.0802,
"step": 930
},
{
"epoch": 0.06300326808395944,
"grad_norm": 6.2986464398316775,
"learning_rate": 6.300539083557951e-07,
"loss": 2.1257,
"step": 935
},
{
"epoch": 0.06334018395606617,
"grad_norm": 6.773220981525513,
"learning_rate": 6.33423180592992e-07,
"loss": 2.0675,
"step": 940
},
{
"epoch": 0.0636770998281729,
"grad_norm": 6.64075808444217,
"learning_rate": 6.367924528301887e-07,
"loss": 2.0588,
"step": 945
},
{
"epoch": 0.06401401570027963,
"grad_norm": 6.704739584304528,
"learning_rate": 6.401617250673854e-07,
"loss": 2.0639,
"step": 950
},
{
"epoch": 0.06435093157238637,
"grad_norm": 6.450367347861596,
"learning_rate": 6.435309973045822e-07,
"loss": 2.1685,
"step": 955
},
{
"epoch": 0.06468784744449312,
"grad_norm": 6.803757171220706,
"learning_rate": 6.469002695417789e-07,
"loss": 2.1149,
"step": 960
},
{
"epoch": 0.06502476331659984,
"grad_norm": 7.0080744698550355,
"learning_rate": 6.502695417789757e-07,
"loss": 2.1161,
"step": 965
},
{
"epoch": 0.06536167918870658,
"grad_norm": 7.172981850795997,
"learning_rate": 6.536388140161725e-07,
"loss": 2.0494,
"step": 970
},
{
"epoch": 0.06569859506081331,
"grad_norm": 6.87372774725762,
"learning_rate": 6.570080862533693e-07,
"loss": 2.0477,
"step": 975
},
{
"epoch": 0.06603551093292005,
"grad_norm": 6.134165071155244,
"learning_rate": 6.60377358490566e-07,
"loss": 2.0151,
"step": 980
},
{
"epoch": 0.06637242680502678,
"grad_norm": 6.658460133134156,
"learning_rate": 6.637466307277628e-07,
"loss": 2.0934,
"step": 985
},
{
"epoch": 0.06670934267713352,
"grad_norm": 6.29191690274154,
"learning_rate": 6.671159029649595e-07,
"loss": 2.0307,
"step": 990
},
{
"epoch": 0.06704625854924025,
"grad_norm": 6.519934752973173,
"learning_rate": 6.704851752021563e-07,
"loss": 2.0922,
"step": 995
},
{
"epoch": 0.06738317442134699,
"grad_norm": 6.84965030274981,
"learning_rate": 6.738544474393531e-07,
"loss": 2.1385,
"step": 1000
},
{
"epoch": 0.06772009029345373,
"grad_norm": 6.503717367498557,
"learning_rate": 6.772237196765498e-07,
"loss": 2.082,
"step": 1005
},
{
"epoch": 0.06805700616556046,
"grad_norm": 7.219950548525877,
"learning_rate": 6.805929919137466e-07,
"loss": 2.0827,
"step": 1010
},
{
"epoch": 0.0683939220376672,
"grad_norm": 6.714972141308813,
"learning_rate": 6.839622641509433e-07,
"loss": 2.1236,
"step": 1015
},
{
"epoch": 0.06873083790977393,
"grad_norm": 6.1863794506801035,
"learning_rate": 6.873315363881402e-07,
"loss": 2.0694,
"step": 1020
},
{
"epoch": 0.06906775378188067,
"grad_norm": 6.519857271262621,
"learning_rate": 6.907008086253369e-07,
"loss": 2.066,
"step": 1025
},
{
"epoch": 0.0694046696539874,
"grad_norm": 6.061987355990859,
"learning_rate": 6.940700808625337e-07,
"loss": 2.0226,
"step": 1030
},
{
"epoch": 0.06974158552609414,
"grad_norm": 6.906351823694033,
"learning_rate": 6.974393530997304e-07,
"loss": 2.1324,
"step": 1035
},
{
"epoch": 0.07007850139820086,
"grad_norm": 5.925534436650087,
"learning_rate": 7.008086253369272e-07,
"loss": 2.1207,
"step": 1040
},
{
"epoch": 0.0704154172703076,
"grad_norm": 7.300437774124698,
"learning_rate": 7.04177897574124e-07,
"loss": 2.0932,
"step": 1045
},
{
"epoch": 0.07075233314241434,
"grad_norm": 7.208321480205573,
"learning_rate": 7.075471698113207e-07,
"loss": 2.1145,
"step": 1050
},
{
"epoch": 0.07108924901452107,
"grad_norm": 6.523539997207994,
"learning_rate": 7.109164420485175e-07,
"loss": 2.1084,
"step": 1055
},
{
"epoch": 0.07142616488662781,
"grad_norm": 6.6532730740194905,
"learning_rate": 7.142857142857143e-07,
"loss": 2.0747,
"step": 1060
},
{
"epoch": 0.07176308075873454,
"grad_norm": 5.910525317424687,
"learning_rate": 7.17654986522911e-07,
"loss": 2.0412,
"step": 1065
},
{
"epoch": 0.07209999663084128,
"grad_norm": 6.671481682607736,
"learning_rate": 7.210242587601077e-07,
"loss": 2.0943,
"step": 1070
},
{
"epoch": 0.07243691250294801,
"grad_norm": 7.410527719144032,
"learning_rate": 7.243935309973046e-07,
"loss": 2.089,
"step": 1075
},
{
"epoch": 0.07277382837505475,
"grad_norm": 6.998731809026702,
"learning_rate": 7.277628032345014e-07,
"loss": 2.0711,
"step": 1080
},
{
"epoch": 0.07311074424716148,
"grad_norm": 6.168070160591785,
"learning_rate": 7.311320754716981e-07,
"loss": 2.1038,
"step": 1085
},
{
"epoch": 0.07344766011926822,
"grad_norm": 6.445599092013598,
"learning_rate": 7.345013477088948e-07,
"loss": 2.1231,
"step": 1090
},
{
"epoch": 0.07378457599137496,
"grad_norm": 6.847043979859011,
"learning_rate": 7.378706199460915e-07,
"loss": 2.0792,
"step": 1095
},
{
"epoch": 0.07412149186348169,
"grad_norm": 6.118594411094878,
"learning_rate": 7.412398921832884e-07,
"loss": 2.0615,
"step": 1100
},
{
"epoch": 0.07445840773558843,
"grad_norm": 6.4444397980063535,
"learning_rate": 7.446091644204852e-07,
"loss": 2.0941,
"step": 1105
},
{
"epoch": 0.07479532360769515,
"grad_norm": 6.900009739269609,
"learning_rate": 7.479784366576819e-07,
"loss": 2.0874,
"step": 1110
},
{
"epoch": 0.0751322394798019,
"grad_norm": 6.715563896490011,
"learning_rate": 7.513477088948786e-07,
"loss": 2.0702,
"step": 1115
},
{
"epoch": 0.07546915535190862,
"grad_norm": 6.683671984170299,
"learning_rate": 7.547169811320754e-07,
"loss": 2.0257,
"step": 1120
},
{
"epoch": 0.07580607122401536,
"grad_norm": 6.50723958019259,
"learning_rate": 7.580862533692723e-07,
"loss": 2.0737,
"step": 1125
},
{
"epoch": 0.07614298709612209,
"grad_norm": 6.061015680440867,
"learning_rate": 7.61455525606469e-07,
"loss": 2.0992,
"step": 1130
},
{
"epoch": 0.07647990296822883,
"grad_norm": 6.587328595415548,
"learning_rate": 7.648247978436657e-07,
"loss": 2.0405,
"step": 1135
},
{
"epoch": 0.07681681884033557,
"grad_norm": 6.387267569159134,
"learning_rate": 7.681940700808624e-07,
"loss": 2.0936,
"step": 1140
},
{
"epoch": 0.0771537347124423,
"grad_norm": 6.5689891488973045,
"learning_rate": 7.715633423180593e-07,
"loss": 2.0483,
"step": 1145
},
{
"epoch": 0.07749065058454904,
"grad_norm": 6.98294967939874,
"learning_rate": 7.749326145552561e-07,
"loss": 2.0804,
"step": 1150
},
{
"epoch": 0.07782756645665577,
"grad_norm": 6.8502328744808665,
"learning_rate": 7.783018867924528e-07,
"loss": 2.0386,
"step": 1155
},
{
"epoch": 0.07816448232876251,
"grad_norm": 6.330263851851983,
"learning_rate": 7.816711590296495e-07,
"loss": 2.0429,
"step": 1160
},
{
"epoch": 0.07850139820086924,
"grad_norm": 6.558472938499403,
"learning_rate": 7.850404312668463e-07,
"loss": 2.1344,
"step": 1165
},
{
"epoch": 0.07883831407297598,
"grad_norm": 6.793056886526596,
"learning_rate": 7.884097035040431e-07,
"loss": 2.053,
"step": 1170
},
{
"epoch": 0.0791752299450827,
"grad_norm": 6.334091419449102,
"learning_rate": 7.917789757412399e-07,
"loss": 2.0741,
"step": 1175
},
{
"epoch": 0.07951214581718945,
"grad_norm": 6.995659281890644,
"learning_rate": 7.951482479784366e-07,
"loss": 2.0299,
"step": 1180
},
{
"epoch": 0.07984906168929619,
"grad_norm": 6.057490760704781,
"learning_rate": 7.985175202156334e-07,
"loss": 2.0367,
"step": 1185
},
{
"epoch": 0.08018597756140292,
"grad_norm": 6.812009588247003,
"learning_rate": 8.018867924528302e-07,
"loss": 2.1046,
"step": 1190
},
{
"epoch": 0.08052289343350966,
"grad_norm": 6.880635288270827,
"learning_rate": 8.052560646900269e-07,
"loss": 2.0334,
"step": 1195
},
{
"epoch": 0.08085980930561638,
"grad_norm": 6.440406473747084,
"learning_rate": 8.086253369272237e-07,
"loss": 2.0336,
"step": 1200
},
{
"epoch": 0.08119672517772313,
"grad_norm": 6.2159531204319896,
"learning_rate": 8.119946091644204e-07,
"loss": 2.0666,
"step": 1205
},
{
"epoch": 0.08153364104982985,
"grad_norm": 6.596526872386366,
"learning_rate": 8.153638814016172e-07,
"loss": 2.1116,
"step": 1210
},
{
"epoch": 0.0818705569219366,
"grad_norm": 6.215209904994124,
"learning_rate": 8.18733153638814e-07,
"loss": 2.0579,
"step": 1215
},
{
"epoch": 0.08220747279404332,
"grad_norm": 6.583210332500679,
"learning_rate": 8.221024258760107e-07,
"loss": 2.014,
"step": 1220
},
{
"epoch": 0.08254438866615006,
"grad_norm": 6.212012798535623,
"learning_rate": 8.254716981132074e-07,
"loss": 2.0458,
"step": 1225
},
{
"epoch": 0.0828813045382568,
"grad_norm": 6.470292156093085,
"learning_rate": 8.288409703504043e-07,
"loss": 2.0949,
"step": 1230
},
{
"epoch": 0.08321822041036353,
"grad_norm": 6.0866198998833685,
"learning_rate": 8.322102425876011e-07,
"loss": 2.0176,
"step": 1235
},
{
"epoch": 0.08355513628247027,
"grad_norm": 6.635224587579871,
"learning_rate": 8.355795148247978e-07,
"loss": 2.0963,
"step": 1240
},
{
"epoch": 0.083892052154577,
"grad_norm": 5.712838593846866,
"learning_rate": 8.389487870619945e-07,
"loss": 2.0503,
"step": 1245
},
{
"epoch": 0.08422896802668374,
"grad_norm": 5.933358327100002,
"learning_rate": 8.423180592991913e-07,
"loss": 2.0016,
"step": 1250
},
{
"epoch": 0.08456588389879047,
"grad_norm": 7.143451933361089,
"learning_rate": 8.456873315363881e-07,
"loss": 2.0677,
"step": 1255
},
{
"epoch": 0.08490279977089721,
"grad_norm": 6.703706638919056,
"learning_rate": 8.490566037735849e-07,
"loss": 2.01,
"step": 1260
},
{
"epoch": 0.08523971564300394,
"grad_norm": 6.0965320200893345,
"learning_rate": 8.524258760107816e-07,
"loss": 1.988,
"step": 1265
},
{
"epoch": 0.08557663151511068,
"grad_norm": 7.193765211165279,
"learning_rate": 8.557951482479784e-07,
"loss": 2.1143,
"step": 1270
},
{
"epoch": 0.08591354738721742,
"grad_norm": 6.301021380513151,
"learning_rate": 8.591644204851751e-07,
"loss": 2.049,
"step": 1275
},
{
"epoch": 0.08625046325932414,
"grad_norm": 6.904570371972653,
"learning_rate": 8.62533692722372e-07,
"loss": 2.0629,
"step": 1280
},
{
"epoch": 0.08658737913143089,
"grad_norm": 6.214009445437056,
"learning_rate": 8.659029649595687e-07,
"loss": 2.0645,
"step": 1285
},
{
"epoch": 0.08692429500353761,
"grad_norm": 6.6425747142232785,
"learning_rate": 8.692722371967655e-07,
"loss": 2.1192,
"step": 1290
},
{
"epoch": 0.08726121087564435,
"grad_norm": 6.715501139786349,
"learning_rate": 8.726415094339622e-07,
"loss": 2.0797,
"step": 1295
},
{
"epoch": 0.08759812674775108,
"grad_norm": 7.117189534146049,
"learning_rate": 8.760107816711589e-07,
"loss": 2.1073,
"step": 1300
},
{
"epoch": 0.08793504261985782,
"grad_norm": 6.677619622948294,
"learning_rate": 8.793800539083558e-07,
"loss": 2.0255,
"step": 1305
},
{
"epoch": 0.08827195849196455,
"grad_norm": 6.784702166999388,
"learning_rate": 8.827493261455526e-07,
"loss": 2.0463,
"step": 1310
},
{
"epoch": 0.08860887436407129,
"grad_norm": 6.6534308801544615,
"learning_rate": 8.861185983827493e-07,
"loss": 2.0355,
"step": 1315
},
{
"epoch": 0.08894579023617803,
"grad_norm": 6.344151360801522,
"learning_rate": 8.89487870619946e-07,
"loss": 2.0036,
"step": 1320
},
{
"epoch": 0.08928270610828476,
"grad_norm": 6.580775236275343,
"learning_rate": 8.928571428571428e-07,
"loss": 2.1212,
"step": 1325
},
{
"epoch": 0.0896196219803915,
"grad_norm": 6.968641119999037,
"learning_rate": 8.962264150943396e-07,
"loss": 2.0261,
"step": 1330
},
{
"epoch": 0.08995653785249823,
"grad_norm": 7.747892608432803,
"learning_rate": 8.995956873315364e-07,
"loss": 2.0266,
"step": 1335
},
{
"epoch": 0.09029345372460497,
"grad_norm": 7.06577595704722,
"learning_rate": 9.029649595687331e-07,
"loss": 2.0086,
"step": 1340
},
{
"epoch": 0.0906303695967117,
"grad_norm": 6.2867926464117065,
"learning_rate": 9.063342318059298e-07,
"loss": 2.0108,
"step": 1345
},
{
"epoch": 0.09096728546881844,
"grad_norm": 6.481546616940274,
"learning_rate": 9.097035040431266e-07,
"loss": 2.0175,
"step": 1350
},
{
"epoch": 0.09130420134092516,
"grad_norm": 6.801814398606904,
"learning_rate": 9.130727762803235e-07,
"loss": 2.0203,
"step": 1355
},
{
"epoch": 0.0916411172130319,
"grad_norm": 5.801249089738751,
"learning_rate": 9.164420485175202e-07,
"loss": 2.0812,
"step": 1360
},
{
"epoch": 0.09197803308513865,
"grad_norm": 5.945627363410938,
"learning_rate": 9.198113207547169e-07,
"loss": 2.0239,
"step": 1365
},
{
"epoch": 0.09231494895724537,
"grad_norm": 5.930218401587245,
"learning_rate": 9.231805929919137e-07,
"loss": 2.0187,
"step": 1370
},
{
"epoch": 0.09265186482935212,
"grad_norm": 6.886170229635843,
"learning_rate": 9.265498652291105e-07,
"loss": 2.0579,
"step": 1375
},
{
"epoch": 0.09298878070145884,
"grad_norm": 6.505667387657664,
"learning_rate": 9.299191374663073e-07,
"loss": 2.0544,
"step": 1380
},
{
"epoch": 0.09332569657356558,
"grad_norm": 6.402314934607021,
"learning_rate": 9.33288409703504e-07,
"loss": 2.0511,
"step": 1385
},
{
"epoch": 0.09366261244567231,
"grad_norm": 6.242297912959462,
"learning_rate": 9.366576819407007e-07,
"loss": 2.0305,
"step": 1390
},
{
"epoch": 0.09399952831777905,
"grad_norm": 5.557824750137879,
"learning_rate": 9.400269541778976e-07,
"loss": 1.975,
"step": 1395
},
{
"epoch": 0.09433644418988578,
"grad_norm": 6.438317330999812,
"learning_rate": 9.433962264150943e-07,
"loss": 2.0444,
"step": 1400
},
{
"epoch": 0.09467336006199252,
"grad_norm": 6.829458975236009,
"learning_rate": 9.46765498652291e-07,
"loss": 2.0997,
"step": 1405
},
{
"epoch": 0.09501027593409926,
"grad_norm": 6.052680993841202,
"learning_rate": 9.501347708894878e-07,
"loss": 1.9701,
"step": 1410
},
{
"epoch": 0.09534719180620599,
"grad_norm": 6.145341438647098,
"learning_rate": 9.535040431266847e-07,
"loss": 2.0437,
"step": 1415
},
{
"epoch": 0.09568410767831273,
"grad_norm": 6.386909941250297,
"learning_rate": 9.568733153638813e-07,
"loss": 1.9988,
"step": 1420
},
{
"epoch": 0.09602102355041946,
"grad_norm": 7.2117096771751585,
"learning_rate": 9.60242587601078e-07,
"loss": 1.9924,
"step": 1425
},
{
"epoch": 0.0963579394225262,
"grad_norm": 7.090905319078084,
"learning_rate": 9.63611859838275e-07,
"loss": 2.0736,
"step": 1430
},
{
"epoch": 0.09669485529463293,
"grad_norm": 5.765675979737204,
"learning_rate": 9.669811320754717e-07,
"loss": 2.0255,
"step": 1435
},
{
"epoch": 0.09703177116673967,
"grad_norm": 6.302835725851336,
"learning_rate": 9.703504043126684e-07,
"loss": 1.9923,
"step": 1440
},
{
"epoch": 0.0973686870388464,
"grad_norm": 6.105386769901037,
"learning_rate": 9.73719676549865e-07,
"loss": 2.0062,
"step": 1445
},
{
"epoch": 0.09770560291095313,
"grad_norm": 5.779479200910098,
"learning_rate": 9.77088948787062e-07,
"loss": 1.9981,
"step": 1450
},
{
"epoch": 0.09804251878305988,
"grad_norm": 6.478566809649276,
"learning_rate": 9.804582210242587e-07,
"loss": 2.014,
"step": 1455
},
{
"epoch": 0.0983794346551666,
"grad_norm": 6.462510705631694,
"learning_rate": 9.838274932614555e-07,
"loss": 2.0597,
"step": 1460
},
{
"epoch": 0.09871635052727334,
"grad_norm": 7.517939119617698,
"learning_rate": 9.871967654986522e-07,
"loss": 2.0555,
"step": 1465
},
{
"epoch": 0.09905326639938007,
"grad_norm": 6.944192458999394,
"learning_rate": 9.90566037735849e-07,
"loss": 1.9715,
"step": 1470
},
{
"epoch": 0.09939018227148681,
"grad_norm": 6.3353086122850275,
"learning_rate": 9.939353099730458e-07,
"loss": 1.9726,
"step": 1475
},
{
"epoch": 0.09972709814359354,
"grad_norm": 6.701570960194593,
"learning_rate": 9.973045822102425e-07,
"loss": 2.0269,
"step": 1480
},
{
"epoch": 0.10006401401570028,
"grad_norm": 6.056963246639003,
"learning_rate": 9.999999861679377e-07,
"loss": 1.9261,
"step": 1485
},
{
"epoch": 0.10040092988780701,
"grad_norm": 6.84014658632624,
"learning_rate": 9.999995020458434e-07,
"loss": 1.9683,
"step": 1490
},
{
"epoch": 0.10073784575991375,
"grad_norm": 6.615520534333261,
"learning_rate": 9.99998326321407e-07,
"loss": 2.0027,
"step": 1495
},
{
"epoch": 0.10107476163202049,
"grad_norm": 6.417598276308663,
"learning_rate": 9.999964589962556e-07,
"loss": 1.9595,
"step": 1500
},
{
"epoch": 0.10141167750412722,
"grad_norm": 6.838049024969315,
"learning_rate": 9.999939000729715e-07,
"loss": 2.0318,
"step": 1505
},
{
"epoch": 0.10174859337623396,
"grad_norm": 5.9861108568338315,
"learning_rate": 9.999906495550946e-07,
"loss": 2.0101,
"step": 1510
},
{
"epoch": 0.10208550924834069,
"grad_norm": 6.672821704365139,
"learning_rate": 9.999867074471207e-07,
"loss": 2.0185,
"step": 1515
},
{
"epoch": 0.10242242512044743,
"grad_norm": 5.864345861096738,
"learning_rate": 9.99982073754503e-07,
"loss": 1.9601,
"step": 1520
},
{
"epoch": 0.10275934099255415,
"grad_norm": 5.6384613813717435,
"learning_rate": 9.999767484836502e-07,
"loss": 1.9809,
"step": 1525
},
{
"epoch": 0.1030962568646609,
"grad_norm": 6.607737085201073,
"learning_rate": 9.999707316419288e-07,
"loss": 1.9371,
"step": 1530
},
{
"epoch": 0.10343317273676762,
"grad_norm": 5.92107920065298,
"learning_rate": 9.99964023237661e-07,
"loss": 1.9583,
"step": 1535
},
{
"epoch": 0.10377008860887436,
"grad_norm": 6.797382899990634,
"learning_rate": 9.999566232801261e-07,
"loss": 1.9857,
"step": 1540
},
{
"epoch": 0.1041070044809811,
"grad_norm": 6.4291929016349,
"learning_rate": 9.9994853177956e-07,
"loss": 1.9806,
"step": 1545
},
{
"epoch": 0.10444392035308783,
"grad_norm": 6.354612439425369,
"learning_rate": 9.999397487471543e-07,
"loss": 2.0837,
"step": 1550
},
{
"epoch": 0.10478083622519457,
"grad_norm": 6.203296467397325,
"learning_rate": 9.999302741950582e-07,
"loss": 2.0008,
"step": 1555
},
{
"epoch": 0.1051177520973013,
"grad_norm": 6.765947240996871,
"learning_rate": 9.999201081363768e-07,
"loss": 2.018,
"step": 1560
},
{
"epoch": 0.10545466796940804,
"grad_norm": 6.364878195328014,
"learning_rate": 9.99909250585172e-07,
"loss": 2.0204,
"step": 1565
},
{
"epoch": 0.10579158384151477,
"grad_norm": 6.249427081573669,
"learning_rate": 9.998977015564617e-07,
"loss": 2.0793,
"step": 1570
},
{
"epoch": 0.10612849971362151,
"grad_norm": 6.090825980208063,
"learning_rate": 9.998854610662209e-07,
"loss": 2.0201,
"step": 1575
},
{
"epoch": 0.10646541558572824,
"grad_norm": 6.387247022298909,
"learning_rate": 9.998725291313805e-07,
"loss": 2.0238,
"step": 1580
},
{
"epoch": 0.10680233145783498,
"grad_norm": 10.962328465461098,
"learning_rate": 9.998589057698283e-07,
"loss": 2.0252,
"step": 1585
},
{
"epoch": 0.10713924732994172,
"grad_norm": 6.758082756394557,
"learning_rate": 9.99844591000408e-07,
"loss": 1.9437,
"step": 1590
},
{
"epoch": 0.10747616320204845,
"grad_norm": 6.409721660335127,
"learning_rate": 9.9982958484292e-07,
"loss": 2.0102,
"step": 1595
},
{
"epoch": 0.10781307907415519,
"grad_norm": 6.521098857160827,
"learning_rate": 9.99813887318121e-07,
"loss": 1.9555,
"step": 1600
},
{
"epoch": 0.10814999494626192,
"grad_norm": 6.719644060906998,
"learning_rate": 9.997974984477236e-07,
"loss": 2.0169,
"step": 1605
},
{
"epoch": 0.10848691081836866,
"grad_norm": 6.2454936784635,
"learning_rate": 9.99780418254397e-07,
"loss": 1.9815,
"step": 1610
},
{
"epoch": 0.10882382669047538,
"grad_norm": 6.301452184455107,
"learning_rate": 9.99762646761767e-07,
"loss": 2.0398,
"step": 1615
},
{
"epoch": 0.10916074256258212,
"grad_norm": 6.6193329719459495,
"learning_rate": 9.99744183994415e-07,
"loss": 1.9679,
"step": 1620
},
{
"epoch": 0.10949765843468885,
"grad_norm": 6.451959168059164,
"learning_rate": 9.997250299778788e-07,
"loss": 2.014,
"step": 1625
},
{
"epoch": 0.1098345743067956,
"grad_norm": 6.906957587107302,
"learning_rate": 9.997051847386524e-07,
"loss": 2.024,
"step": 1630
},
{
"epoch": 0.11017149017890233,
"grad_norm": 6.132267437341465,
"learning_rate": 9.996846483041858e-07,
"loss": 2.0683,
"step": 1635
},
{
"epoch": 0.11050840605100906,
"grad_norm": 6.58193618216666,
"learning_rate": 9.99663420702885e-07,
"loss": 2.0585,
"step": 1640
},
{
"epoch": 0.1108453219231158,
"grad_norm": 6.375002932094628,
"learning_rate": 9.996415019641124e-07,
"loss": 1.9758,
"step": 1645
},
{
"epoch": 0.11118223779522253,
"grad_norm": 6.196537733272767,
"learning_rate": 9.996188921181861e-07,
"loss": 1.9839,
"step": 1650
},
{
"epoch": 0.11151915366732927,
"grad_norm": 6.578190053853053,
"learning_rate": 9.9959559119638e-07,
"loss": 1.966,
"step": 1655
},
{
"epoch": 0.111856069539436,
"grad_norm": 6.219530006448483,
"learning_rate": 9.995715992309244e-07,
"loss": 2.0064,
"step": 1660
},
{
"epoch": 0.11219298541154274,
"grad_norm": 6.08905725259499,
"learning_rate": 9.995469162550048e-07,
"loss": 1.9459,
"step": 1665
},
{
"epoch": 0.11252990128364947,
"grad_norm": 6.718246145997958,
"learning_rate": 9.99521542302763e-07,
"loss": 1.9835,
"step": 1670
},
{
"epoch": 0.11286681715575621,
"grad_norm": 6.254668566153578,
"learning_rate": 9.994954774092962e-07,
"loss": 1.9425,
"step": 1675
},
{
"epoch": 0.11320373302786295,
"grad_norm": 6.53959891506213,
"learning_rate": 9.994687216106579e-07,
"loss": 1.9768,
"step": 1680
},
{
"epoch": 0.11354064889996968,
"grad_norm": 6.0849150556619565,
"learning_rate": 9.994412749438564e-07,
"loss": 1.9348,
"step": 1685
},
{
"epoch": 0.11387756477207642,
"grad_norm": 6.256083226566406,
"learning_rate": 9.994131374468565e-07,
"loss": 1.949,
"step": 1690
},
{
"epoch": 0.11421448064418314,
"grad_norm": 5.7376740963750885,
"learning_rate": 9.993843091585782e-07,
"loss": 1.9952,
"step": 1695
},
{
"epoch": 0.11455139651628989,
"grad_norm": 6.115524205739567,
"learning_rate": 9.993547901188966e-07,
"loss": 1.9722,
"step": 1700
},
{
"epoch": 0.11488831238839661,
"grad_norm": 6.245389899651034,
"learning_rate": 9.993245803686426e-07,
"loss": 1.925,
"step": 1705
},
{
"epoch": 0.11522522826050335,
"grad_norm": 6.247598728078289,
"learning_rate": 9.992936799496029e-07,
"loss": 1.9901,
"step": 1710
},
{
"epoch": 0.11556214413261008,
"grad_norm": 6.325351967024773,
"learning_rate": 9.99262088904519e-07,
"loss": 1.9469,
"step": 1715
},
{
"epoch": 0.11589906000471682,
"grad_norm": 6.393709375328121,
"learning_rate": 9.992298072770877e-07,
"loss": 2.0358,
"step": 1720
},
{
"epoch": 0.11623597587682356,
"grad_norm": 6.288602341315024,
"learning_rate": 9.991968351119612e-07,
"loss": 1.9845,
"step": 1725
},
{
"epoch": 0.11657289174893029,
"grad_norm": 6.754113952908161,
"learning_rate": 9.991631724547467e-07,
"loss": 1.9789,
"step": 1730
},
{
"epoch": 0.11690980762103703,
"grad_norm": 6.39680247647427,
"learning_rate": 9.99128819352007e-07,
"loss": 2.0299,
"step": 1735
},
{
"epoch": 0.11724672349314376,
"grad_norm": 6.269165973260075,
"learning_rate": 9.99093775851259e-07,
"loss": 2.0239,
"step": 1740
},
{
"epoch": 0.1175836393652505,
"grad_norm": 5.989228704848224,
"learning_rate": 9.990580420009755e-07,
"loss": 2.0186,
"step": 1745
},
{
"epoch": 0.11792055523735723,
"grad_norm": 6.264711564747278,
"learning_rate": 9.990216178505835e-07,
"loss": 1.9824,
"step": 1750
},
{
"epoch": 0.11825747110946397,
"grad_norm": 6.220647194759505,
"learning_rate": 9.989845034504651e-07,
"loss": 1.9392,
"step": 1755
},
{
"epoch": 0.1185943869815707,
"grad_norm": 5.876106626219256,
"learning_rate": 9.989466988519572e-07,
"loss": 1.9485,
"step": 1760
},
{
"epoch": 0.11893130285367744,
"grad_norm": 6.628947048760207,
"learning_rate": 9.989082041073517e-07,
"loss": 1.9315,
"step": 1765
},
{
"epoch": 0.11926821872578418,
"grad_norm": 6.623615345814385,
"learning_rate": 9.988690192698944e-07,
"loss": 1.9931,
"step": 1770
},
{
"epoch": 0.1196051345978909,
"grad_norm": 6.281816857100239,
"learning_rate": 9.988291443937857e-07,
"loss": 2.001,
"step": 1775
},
{
"epoch": 0.11994205046999765,
"grad_norm": 5.808970810910876,
"learning_rate": 9.987885795341816e-07,
"loss": 1.9608,
"step": 1780
},
{
"epoch": 0.12027896634210437,
"grad_norm": 6.0557446750595165,
"learning_rate": 9.987473247471908e-07,
"loss": 1.9572,
"step": 1785
},
{
"epoch": 0.12061588221421111,
"grad_norm": 7.277415642252567,
"learning_rate": 9.98705380089878e-07,
"loss": 1.9735,
"step": 1790
},
{
"epoch": 0.12095279808631784,
"grad_norm": 6.413165468216076,
"learning_rate": 9.986627456202608e-07,
"loss": 2.0268,
"step": 1795
},
{
"epoch": 0.12128971395842458,
"grad_norm": 6.842142941489323,
"learning_rate": 9.986194213973113e-07,
"loss": 1.9498,
"step": 1800
},
{
"epoch": 0.12162662983053131,
"grad_norm": 6.284595686241228,
"learning_rate": 9.985754074809562e-07,
"loss": 1.9252,
"step": 1805
},
{
"epoch": 0.12196354570263805,
"grad_norm": 5.898293427626579,
"learning_rate": 9.985307039320756e-07,
"loss": 1.952,
"step": 1810
},
{
"epoch": 0.12230046157474479,
"grad_norm": 6.039978658719599,
"learning_rate": 9.98485310812504e-07,
"loss": 1.9808,
"step": 1815
},
{
"epoch": 0.12263737744685152,
"grad_norm": 5.694946193557863,
"learning_rate": 9.98439228185029e-07,
"loss": 1.9814,
"step": 1820
},
{
"epoch": 0.12297429331895826,
"grad_norm": 6.545566967262388,
"learning_rate": 9.983924561133927e-07,
"loss": 1.8983,
"step": 1825
},
{
"epoch": 0.12331120919106499,
"grad_norm": 6.728834164161067,
"learning_rate": 9.983449946622906e-07,
"loss": 1.9285,
"step": 1830
},
{
"epoch": 0.12364812506317173,
"grad_norm": 5.91994081131544,
"learning_rate": 9.982968438973714e-07,
"loss": 1.9673,
"step": 1835
},
{
"epoch": 0.12398504093527846,
"grad_norm": 6.386494266635038,
"learning_rate": 9.982480038852375e-07,
"loss": 2.0266,
"step": 1840
},
{
"epoch": 0.1243219568073852,
"grad_norm": 6.441402002868234,
"learning_rate": 9.98198474693445e-07,
"loss": 1.968,
"step": 1845
},
{
"epoch": 0.12465887267949192,
"grad_norm": 6.740829824949441,
"learning_rate": 9.981482563905025e-07,
"loss": 2.0041,
"step": 1850
},
{
"epoch": 0.12499578855159867,
"grad_norm": 5.710071008321578,
"learning_rate": 9.980973490458728e-07,
"loss": 1.9811,
"step": 1855
},
{
"epoch": 0.1253327044237054,
"grad_norm": 6.18536895105298,
"learning_rate": 9.980457527299708e-07,
"loss": 1.9666,
"step": 1860
},
{
"epoch": 0.12566962029581213,
"grad_norm": 6.304203854455156,
"learning_rate": 9.979934675141652e-07,
"loss": 1.9504,
"step": 1865
},
{
"epoch": 0.12600653616791888,
"grad_norm": 6.359105927612574,
"learning_rate": 9.979404934707771e-07,
"loss": 1.934,
"step": 1870
},
{
"epoch": 0.12634345204002562,
"grad_norm": 7.021272358637824,
"learning_rate": 9.978868306730804e-07,
"loss": 1.9559,
"step": 1875
},
{
"epoch": 0.12668036791213233,
"grad_norm": 8.721181121099121,
"learning_rate": 9.978324791953018e-07,
"loss": 1.9553,
"step": 1880
},
{
"epoch": 0.12701728378423907,
"grad_norm": 5.928281083657215,
"learning_rate": 9.97777439112621e-07,
"loss": 1.9623,
"step": 1885
},
{
"epoch": 0.1273541996563458,
"grad_norm": 7.636936649110356,
"learning_rate": 9.977217105011693e-07,
"loss": 1.9405,
"step": 1890
},
{
"epoch": 0.12769111552845255,
"grad_norm": 6.378639048271833,
"learning_rate": 9.97665293438031e-07,
"loss": 1.9831,
"step": 1895
},
{
"epoch": 0.12802803140055927,
"grad_norm": 6.416618144613928,
"learning_rate": 9.976081880012426e-07,
"loss": 1.919,
"step": 1900
},
{
"epoch": 0.128364947272666,
"grad_norm": 6.5865892724690625,
"learning_rate": 9.975503942697925e-07,
"loss": 1.9804,
"step": 1905
},
{
"epoch": 0.12870186314477275,
"grad_norm": 6.38034752020973,
"learning_rate": 9.974919123236217e-07,
"loss": 1.9639,
"step": 1910
},
{
"epoch": 0.1290387790168795,
"grad_norm": 6.453173099164715,
"learning_rate": 9.974327422436223e-07,
"loss": 1.9276,
"step": 1915
},
{
"epoch": 0.12937569488898623,
"grad_norm": 5.566683381219873,
"learning_rate": 9.97372884111639e-07,
"loss": 1.9437,
"step": 1920
},
{
"epoch": 0.12971261076109294,
"grad_norm": 6.424615749315436,
"learning_rate": 9.97312338010468e-07,
"loss": 1.996,
"step": 1925
},
{
"epoch": 0.13004952663319969,
"grad_norm": 6.189066134613252,
"learning_rate": 9.97251104023857e-07,
"loss": 2.0165,
"step": 1930
},
{
"epoch": 0.13038644250530643,
"grad_norm": 6.153302464363939,
"learning_rate": 9.971891822365048e-07,
"loss": 1.963,
"step": 1935
},
{
"epoch": 0.13072335837741317,
"grad_norm": 5.898027522950926,
"learning_rate": 9.971265727340627e-07,
"loss": 1.9446,
"step": 1940
},
{
"epoch": 0.13106027424951988,
"grad_norm": 6.621778499253866,
"learning_rate": 9.970632756031322e-07,
"loss": 2.0041,
"step": 1945
},
{
"epoch": 0.13139719012162662,
"grad_norm": 6.622837129275348,
"learning_rate": 9.969992909312658e-07,
"loss": 1.9858,
"step": 1950
},
{
"epoch": 0.13173410599373336,
"grad_norm": 6.052858967214188,
"learning_rate": 9.969346188069684e-07,
"loss": 1.9916,
"step": 1955
},
{
"epoch": 0.1320710218658401,
"grad_norm": 6.179560277489928,
"learning_rate": 9.968692593196943e-07,
"loss": 1.8877,
"step": 1960
},
{
"epoch": 0.13240793773794685,
"grad_norm": 6.162522901829469,
"learning_rate": 9.968032125598493e-07,
"loss": 2.0524,
"step": 1965
},
{
"epoch": 0.13274485361005356,
"grad_norm": 5.902608589754359,
"learning_rate": 9.967364786187894e-07,
"loss": 1.9449,
"step": 1970
},
{
"epoch": 0.1330817694821603,
"grad_norm": 6.323335173726953,
"learning_rate": 9.96669057588822e-07,
"loss": 1.9734,
"step": 1975
},
{
"epoch": 0.13341868535426704,
"grad_norm": 5.8379906828434285,
"learning_rate": 9.966009495632037e-07,
"loss": 1.9404,
"step": 1980
},
{
"epoch": 0.13375560122637378,
"grad_norm": 5.991919976178825,
"learning_rate": 9.965321546361421e-07,
"loss": 1.956,
"step": 1985
},
{
"epoch": 0.1340925170984805,
"grad_norm": 5.967331390409548,
"learning_rate": 9.964626729027948e-07,
"loss": 1.953,
"step": 1990
},
{
"epoch": 0.13442943297058724,
"grad_norm": 6.281340926030527,
"learning_rate": 9.963925044592695e-07,
"loss": 1.9961,
"step": 1995
},
{
"epoch": 0.13476634884269398,
"grad_norm": 6.741635709263175,
"learning_rate": 9.963216494026235e-07,
"loss": 1.912,
"step": 2000
},
{
"epoch": 0.13510326471480072,
"grad_norm": 6.08250513108592,
"learning_rate": 9.962501078308636e-07,
"loss": 1.9913,
"step": 2005
},
{
"epoch": 0.13544018058690746,
"grad_norm": 6.60765627452018,
"learning_rate": 9.96177879842947e-07,
"loss": 1.9449,
"step": 2010
},
{
"epoch": 0.13577709645901417,
"grad_norm": 6.566107141142278,
"learning_rate": 9.961049655387799e-07,
"loss": 1.9879,
"step": 2015
},
{
"epoch": 0.13611401233112091,
"grad_norm": 6.335526898300328,
"learning_rate": 9.960313650192175e-07,
"loss": 1.9439,
"step": 2020
},
{
"epoch": 0.13645092820322766,
"grad_norm": 6.277578595751896,
"learning_rate": 9.959570783860647e-07,
"loss": 1.9433,
"step": 2025
},
{
"epoch": 0.1367878440753344,
"grad_norm": 5.9244821304571085,
"learning_rate": 9.958821057420752e-07,
"loss": 1.9744,
"step": 2030
},
{
"epoch": 0.1371247599474411,
"grad_norm": 6.466226232433509,
"learning_rate": 9.958064471909513e-07,
"loss": 1.8964,
"step": 2035
},
{
"epoch": 0.13746167581954785,
"grad_norm": 6.203526305769233,
"learning_rate": 9.95730102837345e-07,
"loss": 1.9388,
"step": 2040
},
{
"epoch": 0.1377985916916546,
"grad_norm": 5.760768572732212,
"learning_rate": 9.956530727868558e-07,
"loss": 2.0363,
"step": 2045
},
{
"epoch": 0.13813550756376133,
"grad_norm": 5.932958764540261,
"learning_rate": 9.955753571460322e-07,
"loss": 1.974,
"step": 2050
},
{
"epoch": 0.13847242343586808,
"grad_norm": 6.4230773291754355,
"learning_rate": 9.95496956022371e-07,
"loss": 1.9678,
"step": 2055
},
{
"epoch": 0.1388093393079748,
"grad_norm": 6.824106716998237,
"learning_rate": 9.95417869524317e-07,
"loss": 1.9545,
"step": 2060
},
{
"epoch": 0.13914625518008153,
"grad_norm": 5.823178409847294,
"learning_rate": 9.953380977612633e-07,
"loss": 1.9823,
"step": 2065
},
{
"epoch": 0.13948317105218827,
"grad_norm": 6.257993657815616,
"learning_rate": 9.952576408435505e-07,
"loss": 2.0147,
"step": 2070
},
{
"epoch": 0.139820086924295,
"grad_norm": 6.2285691514819055,
"learning_rate": 9.951764988824674e-07,
"loss": 1.9516,
"step": 2075
},
{
"epoch": 0.14015700279640173,
"grad_norm": 6.261536455663687,
"learning_rate": 9.950946719902498e-07,
"loss": 1.9726,
"step": 2080
},
{
"epoch": 0.14049391866850847,
"grad_norm": 5.744470229374497,
"learning_rate": 9.950121602800813e-07,
"loss": 1.915,
"step": 2085
},
{
"epoch": 0.1408308345406152,
"grad_norm": 6.253873692382304,
"learning_rate": 9.949289638660922e-07,
"loss": 1.9089,
"step": 2090
},
{
"epoch": 0.14116775041272195,
"grad_norm": 6.137871997869609,
"learning_rate": 9.948450828633608e-07,
"loss": 1.9406,
"step": 2095
},
{
"epoch": 0.1415046662848287,
"grad_norm": 6.105512836268031,
"learning_rate": 9.947605173879115e-07,
"loss": 1.9098,
"step": 2100
},
{
"epoch": 0.1418415821569354,
"grad_norm": 6.35463151695237,
"learning_rate": 9.94675267556716e-07,
"loss": 1.9155,
"step": 2105
},
{
"epoch": 0.14217849802904214,
"grad_norm": 6.31848845137408,
"learning_rate": 9.94589333487692e-07,
"loss": 1.9398,
"step": 2110
},
{
"epoch": 0.14251541390114889,
"grad_norm": 6.328687308623832,
"learning_rate": 9.945027152997046e-07,
"loss": 2.0107,
"step": 2115
},
{
"epoch": 0.14285232977325563,
"grad_norm": 6.426232569229285,
"learning_rate": 9.944154131125642e-07,
"loss": 1.9887,
"step": 2120
},
{
"epoch": 0.14318924564536234,
"grad_norm": 6.532309209899036,
"learning_rate": 9.94327427047028e-07,
"loss": 1.9792,
"step": 2125
},
{
"epoch": 0.14352616151746908,
"grad_norm": 7.123050490115521,
"learning_rate": 9.942387572247983e-07,
"loss": 1.9612,
"step": 2130
},
{
"epoch": 0.14386307738957582,
"grad_norm": 6.195588758010388,
"learning_rate": 9.941494037685243e-07,
"loss": 1.973,
"step": 2135
},
{
"epoch": 0.14419999326168256,
"grad_norm": 5.832825874342457,
"learning_rate": 9.940593668017998e-07,
"loss": 1.947,
"step": 2140
},
{
"epoch": 0.1445369091337893,
"grad_norm": 6.986313603080345,
"learning_rate": 9.93968646449165e-07,
"loss": 1.9623,
"step": 2145
},
{
"epoch": 0.14487382500589602,
"grad_norm": 6.546972459648574,
"learning_rate": 9.938772428361045e-07,
"loss": 1.9806,
"step": 2150
},
{
"epoch": 0.14521074087800276,
"grad_norm": 6.958183204055856,
"learning_rate": 9.937851560890484e-07,
"loss": 1.9149,
"step": 2155
},
{
"epoch": 0.1455476567501095,
"grad_norm": 6.132052769974667,
"learning_rate": 9.936923863353717e-07,
"loss": 1.9865,
"step": 2160
},
{
"epoch": 0.14588457262221624,
"grad_norm": 6.241571590304282,
"learning_rate": 9.935989337033939e-07,
"loss": 1.9518,
"step": 2165
},
{
"epoch": 0.14622148849432295,
"grad_norm": 5.9203250462384505,
"learning_rate": 9.935047983223794e-07,
"loss": 1.8987,
"step": 2170
},
{
"epoch": 0.1465584043664297,
"grad_norm": 5.985556544125865,
"learning_rate": 9.934099803225367e-07,
"loss": 1.9217,
"step": 2175
},
{
"epoch": 0.14689532023853644,
"grad_norm": 5.730478097660369,
"learning_rate": 9.933144798350188e-07,
"loss": 1.9666,
"step": 2180
},
{
"epoch": 0.14723223611064318,
"grad_norm": 6.707764179086582,
"learning_rate": 9.932182969919228e-07,
"loss": 2.0318,
"step": 2185
},
{
"epoch": 0.14756915198274992,
"grad_norm": 5.5659888513546685,
"learning_rate": 9.931214319262885e-07,
"loss": 1.9082,
"step": 2190
},
{
"epoch": 0.14790606785485663,
"grad_norm": 6.374951323196418,
"learning_rate": 9.930238847721013e-07,
"loss": 1.9317,
"step": 2195
},
{
"epoch": 0.14824298372696337,
"grad_norm": 6.434039061973071,
"learning_rate": 9.929256556642884e-07,
"loss": 1.864,
"step": 2200
},
{
"epoch": 0.14857989959907011,
"grad_norm": 6.367451884410294,
"learning_rate": 9.92826744738721e-07,
"loss": 1.9982,
"step": 2205
},
{
"epoch": 0.14891681547117686,
"grad_norm": 6.6815782297960675,
"learning_rate": 9.927271521322134e-07,
"loss": 1.9709,
"step": 2210
},
{
"epoch": 0.14925373134328357,
"grad_norm": 6.516744855920352,
"learning_rate": 9.926268779825224e-07,
"loss": 1.9566,
"step": 2215
},
{
"epoch": 0.1495906472153903,
"grad_norm": 5.730312993883115,
"learning_rate": 9.925259224283484e-07,
"loss": 1.9446,
"step": 2220
},
{
"epoch": 0.14992756308749705,
"grad_norm": 6.467174318204597,
"learning_rate": 9.924242856093332e-07,
"loss": 1.9286,
"step": 2225
},
{
"epoch": 0.1502644789596038,
"grad_norm": 5.428856070091508,
"learning_rate": 9.923219676660614e-07,
"loss": 1.9131,
"step": 2230
},
{
"epoch": 0.15060139483171053,
"grad_norm": 5.820529761802369,
"learning_rate": 9.922189687400603e-07,
"loss": 1.9089,
"step": 2235
},
{
"epoch": 0.15093831070381725,
"grad_norm": 6.529777214957106,
"learning_rate": 9.921152889737984e-07,
"loss": 1.9469,
"step": 2240
},
{
"epoch": 0.151275226575924,
"grad_norm": 6.345390439569736,
"learning_rate": 9.92010928510686e-07,
"loss": 1.9395,
"step": 2245
},
{
"epoch": 0.15161214244803073,
"grad_norm": 6.62897291068532,
"learning_rate": 9.919058874950754e-07,
"loss": 1.9595,
"step": 2250
},
{
"epoch": 0.15194905832013747,
"grad_norm": 5.504413660555053,
"learning_rate": 9.9180016607226e-07,
"loss": 1.911,
"step": 2255
},
{
"epoch": 0.15228597419224418,
"grad_norm": 6.519560511343574,
"learning_rate": 9.916937643884737e-07,
"loss": 1.9846,
"step": 2260
},
{
"epoch": 0.15262289006435092,
"grad_norm": 6.975549195640513,
"learning_rate": 9.915866825908927e-07,
"loss": 1.977,
"step": 2265
},
{
"epoch": 0.15295980593645767,
"grad_norm": 5.767514386057251,
"learning_rate": 9.914789208276329e-07,
"loss": 2.0107,
"step": 2270
},
{
"epoch": 0.1532967218085644,
"grad_norm": 6.74373390316998,
"learning_rate": 9.913704792477511e-07,
"loss": 1.9681,
"step": 2275
},
{
"epoch": 0.15363363768067115,
"grad_norm": 5.4136605604834305,
"learning_rate": 9.91261358001244e-07,
"loss": 1.8636,
"step": 2280
},
{
"epoch": 0.15397055355277786,
"grad_norm": 6.113357161218679,
"learning_rate": 9.911515572390495e-07,
"loss": 1.9542,
"step": 2285
},
{
"epoch": 0.1543074694248846,
"grad_norm": 6.212873246787139,
"learning_rate": 9.91041077113044e-07,
"loss": 1.9415,
"step": 2290
},
{
"epoch": 0.15464438529699134,
"grad_norm": 6.310708548707665,
"learning_rate": 9.909299177760445e-07,
"loss": 1.9649,
"step": 2295
},
{
"epoch": 0.15498130116909808,
"grad_norm": 6.076947896759005,
"learning_rate": 9.90818079381807e-07,
"loss": 2.0335,
"step": 2300
},
{
"epoch": 0.1553182170412048,
"grad_norm": 6.587950508829805,
"learning_rate": 9.907055620850277e-07,
"loss": 1.918,
"step": 2305
},
{
"epoch": 0.15565513291331154,
"grad_norm": 6.945749330710282,
"learning_rate": 9.905923660413409e-07,
"loss": 1.8805,
"step": 2310
},
{
"epoch": 0.15599204878541828,
"grad_norm": 6.642951255259172,
"learning_rate": 9.904784914073196e-07,
"loss": 1.9538,
"step": 2315
},
{
"epoch": 0.15632896465752502,
"grad_norm": 6.302001230717917,
"learning_rate": 9.903639383404765e-07,
"loss": 1.9267,
"step": 2320
},
{
"epoch": 0.15666588052963176,
"grad_norm": 6.509732828699096,
"learning_rate": 9.902487069992618e-07,
"loss": 1.8941,
"step": 2325
},
{
"epoch": 0.15700279640173848,
"grad_norm": 6.472613626661652,
"learning_rate": 9.901327975430645e-07,
"loss": 1.9944,
"step": 2330
},
{
"epoch": 0.15733971227384522,
"grad_norm": 6.249063324051331,
"learning_rate": 9.900162101322106e-07,
"loss": 1.923,
"step": 2335
},
{
"epoch": 0.15767662814595196,
"grad_norm": 6.534842584367619,
"learning_rate": 9.898989449279653e-07,
"loss": 1.8986,
"step": 2340
},
{
"epoch": 0.1580135440180587,
"grad_norm": 6.129324353524715,
"learning_rate": 9.8978100209253e-07,
"loss": 1.9573,
"step": 2345
},
{
"epoch": 0.1583504598901654,
"grad_norm": 5.824833199759791,
"learning_rate": 9.89662381789044e-07,
"loss": 1.9809,
"step": 2350
},
{
"epoch": 0.15868737576227215,
"grad_norm": 6.143494469198798,
"learning_rate": 9.89543084181584e-07,
"loss": 1.8954,
"step": 2355
},
{
"epoch": 0.1590242916343789,
"grad_norm": 6.651525954696755,
"learning_rate": 9.894231094351628e-07,
"loss": 1.9549,
"step": 2360
},
{
"epoch": 0.15936120750648564,
"grad_norm": 5.827719542621931,
"learning_rate": 9.893024577157303e-07,
"loss": 1.9758,
"step": 2365
},
{
"epoch": 0.15969812337859238,
"grad_norm": 6.101811273783142,
"learning_rate": 9.891811291901727e-07,
"loss": 1.9159,
"step": 2370
},
{
"epoch": 0.1600350392506991,
"grad_norm": 6.166144715136228,
"learning_rate": 9.890591240263124e-07,
"loss": 1.9213,
"step": 2375
},
{
"epoch": 0.16037195512280583,
"grad_norm": 6.391645333892786,
"learning_rate": 9.889364423929075e-07,
"loss": 1.9033,
"step": 2380
},
{
"epoch": 0.16070887099491257,
"grad_norm": 6.144735650701305,
"learning_rate": 9.888130844596524e-07,
"loss": 1.9093,
"step": 2385
},
{
"epoch": 0.1610457868670193,
"grad_norm": 6.010045391891963,
"learning_rate": 9.88689050397176e-07,
"loss": 1.9673,
"step": 2390
},
{
"epoch": 0.16138270273912603,
"grad_norm": 6.10557730478434,
"learning_rate": 9.885643403770431e-07,
"loss": 1.8925,
"step": 2395
},
{
"epoch": 0.16171961861123277,
"grad_norm": 6.608433974354653,
"learning_rate": 9.884389545717538e-07,
"loss": 1.932,
"step": 2400
},
{
"epoch": 0.1620565344833395,
"grad_norm": 6.7409320976993,
"learning_rate": 9.88312893154742e-07,
"loss": 1.8663,
"step": 2405
},
{
"epoch": 0.16239345035544625,
"grad_norm": 5.941750402794903,
"learning_rate": 9.881861563003766e-07,
"loss": 2.0028,
"step": 2410
},
{
"epoch": 0.162730366227553,
"grad_norm": 6.014502158961904,
"learning_rate": 9.880587441839613e-07,
"loss": 1.9156,
"step": 2415
},
{
"epoch": 0.1630672820996597,
"grad_norm": 6.571932040836407,
"learning_rate": 9.87930656981733e-07,
"loss": 1.9331,
"step": 2420
},
{
"epoch": 0.16340419797176645,
"grad_norm": 6.066820980535922,
"learning_rate": 9.878018948708625e-07,
"loss": 1.8293,
"step": 2425
},
{
"epoch": 0.1637411138438732,
"grad_norm": 6.200211415486074,
"learning_rate": 9.876724580294546e-07,
"loss": 1.9523,
"step": 2430
},
{
"epoch": 0.16407802971597993,
"grad_norm": 5.933206529903557,
"learning_rate": 9.875423466365471e-07,
"loss": 1.9296,
"step": 2435
},
{
"epoch": 0.16441494558808664,
"grad_norm": 5.820888744313868,
"learning_rate": 9.874115608721107e-07,
"loss": 1.8975,
"step": 2440
},
{
"epoch": 0.16475186146019338,
"grad_norm": 6.413197920174298,
"learning_rate": 9.872801009170492e-07,
"loss": 1.9998,
"step": 2445
},
{
"epoch": 0.16508877733230012,
"grad_norm": 5.594241105149758,
"learning_rate": 9.871479669531988e-07,
"loss": 1.8642,
"step": 2450
},
{
"epoch": 0.16542569320440687,
"grad_norm": 6.90776423556707,
"learning_rate": 9.87015159163328e-07,
"loss": 1.9158,
"step": 2455
},
{
"epoch": 0.1657626090765136,
"grad_norm": 6.742200847923345,
"learning_rate": 9.868816777311372e-07,
"loss": 1.9738,
"step": 2460
},
{
"epoch": 0.16609952494862032,
"grad_norm": 6.242575863453409,
"learning_rate": 9.867475228412592e-07,
"loss": 1.9085,
"step": 2465
},
{
"epoch": 0.16643644082072706,
"grad_norm": 7.077187802421562,
"learning_rate": 9.866126946792572e-07,
"loss": 1.8906,
"step": 2470
},
{
"epoch": 0.1667733566928338,
"grad_norm": 6.158021286459406,
"learning_rate": 9.864771934316268e-07,
"loss": 1.913,
"step": 2475
},
{
"epoch": 0.16711027256494054,
"grad_norm": 6.122768934226615,
"learning_rate": 9.863410192857938e-07,
"loss": 1.9653,
"step": 2480
},
{
"epoch": 0.16744718843704726,
"grad_norm": 5.9040493114933925,
"learning_rate": 9.862041724301154e-07,
"loss": 1.9317,
"step": 2485
},
{
"epoch": 0.167784104309154,
"grad_norm": 6.091505622820139,
"learning_rate": 9.860666530538787e-07,
"loss": 1.9795,
"step": 2490
},
{
"epoch": 0.16812102018126074,
"grad_norm": 5.55852182181646,
"learning_rate": 9.859284613473017e-07,
"loss": 1.931,
"step": 2495
},
{
"epoch": 0.16845793605336748,
"grad_norm": 5.727558368634606,
"learning_rate": 9.857895975015318e-07,
"loss": 1.952,
"step": 2500
},
{
"epoch": 0.16879485192547422,
"grad_norm": 6.048697203389933,
"learning_rate": 9.856500617086463e-07,
"loss": 1.9123,
"step": 2505
},
{
"epoch": 0.16913176779758093,
"grad_norm": 6.200625030191421,
"learning_rate": 9.85509854161652e-07,
"loss": 1.8941,
"step": 2510
},
{
"epoch": 0.16946868366968768,
"grad_norm": 5.954977123661459,
"learning_rate": 9.853689750544849e-07,
"loss": 1.9823,
"step": 2515
},
{
"epoch": 0.16980559954179442,
"grad_norm": 6.080978939587595,
"learning_rate": 9.852274245820095e-07,
"loss": 1.928,
"step": 2520
},
{
"epoch": 0.17014251541390116,
"grad_norm": 7.661482448972233,
"learning_rate": 9.850852029400198e-07,
"loss": 1.9531,
"step": 2525
},
{
"epoch": 0.17047943128600787,
"grad_norm": 6.286433787084899,
"learning_rate": 9.849423103252374e-07,
"loss": 1.9028,
"step": 2530
},
{
"epoch": 0.1708163471581146,
"grad_norm": 6.3328085180536835,
"learning_rate": 9.84798746935312e-07,
"loss": 1.9073,
"step": 2535
},
{
"epoch": 0.17115326303022135,
"grad_norm": 6.181759007221361,
"learning_rate": 9.846545129688217e-07,
"loss": 1.8744,
"step": 2540
},
{
"epoch": 0.1714901789023281,
"grad_norm": 6.234298316262344,
"learning_rate": 9.845096086252716e-07,
"loss": 1.8956,
"step": 2545
},
{
"epoch": 0.17182709477443484,
"grad_norm": 5.998336546341908,
"learning_rate": 9.843640341050944e-07,
"loss": 1.9759,
"step": 2550
},
{
"epoch": 0.17216401064654155,
"grad_norm": 6.128708725791349,
"learning_rate": 9.842177896096493e-07,
"loss": 1.9879,
"step": 2555
},
{
"epoch": 0.1725009265186483,
"grad_norm": 6.013293725561392,
"learning_rate": 9.84070875341223e-07,
"loss": 1.9182,
"step": 2560
},
{
"epoch": 0.17283784239075503,
"grad_norm": 5.897020597604511,
"learning_rate": 9.83923291503028e-07,
"loss": 1.9216,
"step": 2565
},
{
"epoch": 0.17317475826286177,
"grad_norm": 6.147461118394779,
"learning_rate": 9.837750382992033e-07,
"loss": 1.9187,
"step": 2570
},
{
"epoch": 0.17351167413496849,
"grad_norm": 6.668024086437924,
"learning_rate": 9.836261159348135e-07,
"loss": 1.9328,
"step": 2575
},
{
"epoch": 0.17384859000707523,
"grad_norm": 6.175494197320494,
"learning_rate": 9.834765246158488e-07,
"loss": 1.9537,
"step": 2580
},
{
"epoch": 0.17418550587918197,
"grad_norm": 6.3176648197191065,
"learning_rate": 9.83326264549225e-07,
"loss": 1.9553,
"step": 2585
},
{
"epoch": 0.1745224217512887,
"grad_norm": 5.770121391032835,
"learning_rate": 9.83175335942783e-07,
"loss": 1.8756,
"step": 2590
},
{
"epoch": 0.17485933762339545,
"grad_norm": 6.12386478367424,
"learning_rate": 9.830237390052876e-07,
"loss": 1.8831,
"step": 2595
},
{
"epoch": 0.17519625349550216,
"grad_norm": 6.6115302969443,
"learning_rate": 9.82871473946429e-07,
"loss": 1.9609,
"step": 2600
},
{
"epoch": 0.1755331693676089,
"grad_norm": 6.51170744218166,
"learning_rate": 9.82718540976821e-07,
"loss": 1.9285,
"step": 2605
},
{
"epoch": 0.17587008523971565,
"grad_norm": 6.505287511470605,
"learning_rate": 9.825649403080015e-07,
"loss": 1.9212,
"step": 2610
},
{
"epoch": 0.1762070011118224,
"grad_norm": 6.1451891960370295,
"learning_rate": 9.824106721524317e-07,
"loss": 1.8681,
"step": 2615
},
{
"epoch": 0.1765439169839291,
"grad_norm": 6.153611709519882,
"learning_rate": 9.822557367234962e-07,
"loss": 1.9253,
"step": 2620
},
{
"epoch": 0.17688083285603584,
"grad_norm": 6.464659492133994,
"learning_rate": 9.82100134235503e-07,
"loss": 1.9525,
"step": 2625
},
{
"epoch": 0.17721774872814258,
"grad_norm": 7.049383941068097,
"learning_rate": 9.819438649036823e-07,
"loss": 1.9579,
"step": 2630
},
{
"epoch": 0.17755466460024932,
"grad_norm": 6.424061501155142,
"learning_rate": 9.817869289441864e-07,
"loss": 1.8578,
"step": 2635
},
{
"epoch": 0.17789158047235606,
"grad_norm": 6.040140871168396,
"learning_rate": 9.816293265740907e-07,
"loss": 1.9384,
"step": 2640
},
{
"epoch": 0.17822849634446278,
"grad_norm": 5.889576761087303,
"learning_rate": 9.81471058011391e-07,
"loss": 1.9539,
"step": 2645
},
{
"epoch": 0.17856541221656952,
"grad_norm": 6.08103957840839,
"learning_rate": 9.81312123475006e-07,
"loss": 1.933,
"step": 2650
},
{
"epoch": 0.17890232808867626,
"grad_norm": 6.525506562314697,
"learning_rate": 9.811525231847746e-07,
"loss": 1.9735,
"step": 2655
},
{
"epoch": 0.179239243960783,
"grad_norm": 6.399928080344757,
"learning_rate": 9.809922573614569e-07,
"loss": 1.902,
"step": 2660
},
{
"epoch": 0.17957615983288971,
"grad_norm": 6.7137591420055776,
"learning_rate": 9.808313262267337e-07,
"loss": 1.9049,
"step": 2665
},
{
"epoch": 0.17991307570499646,
"grad_norm": 6.283847903141634,
"learning_rate": 9.806697300032057e-07,
"loss": 1.9117,
"step": 2670
},
{
"epoch": 0.1802499915771032,
"grad_norm": 6.852639330917095,
"learning_rate": 9.805074689143938e-07,
"loss": 1.9271,
"step": 2675
},
{
"epoch": 0.18058690744920994,
"grad_norm": 6.646951296296031,
"learning_rate": 9.803445431847388e-07,
"loss": 1.8809,
"step": 2680
},
{
"epoch": 0.18092382332131668,
"grad_norm": 5.935782369576825,
"learning_rate": 9.801809530396003e-07,
"loss": 2.0025,
"step": 2685
},
{
"epoch": 0.1812607391934234,
"grad_norm": 5.9066632972863475,
"learning_rate": 9.800166987052572e-07,
"loss": 1.8773,
"step": 2690
},
{
"epoch": 0.18159765506553013,
"grad_norm": 5.9865063710361905,
"learning_rate": 9.798517804089072e-07,
"loss": 2.0383,
"step": 2695
},
{
"epoch": 0.18193457093763687,
"grad_norm": 5.631020483070982,
"learning_rate": 9.796861983786661e-07,
"loss": 1.862,
"step": 2700
},
{
"epoch": 0.18227148680974362,
"grad_norm": 5.856519798162219,
"learning_rate": 9.795199528435682e-07,
"loss": 1.9101,
"step": 2705
},
{
"epoch": 0.18260840268185033,
"grad_norm": 6.222085140585286,
"learning_rate": 9.793530440335654e-07,
"loss": 1.8835,
"step": 2710
},
{
"epoch": 0.18294531855395707,
"grad_norm": 6.359042832937223,
"learning_rate": 9.791854721795264e-07,
"loss": 1.8879,
"step": 2715
},
{
"epoch": 0.1832822344260638,
"grad_norm": 6.19407136291253,
"learning_rate": 9.790172375132385e-07,
"loss": 1.9003,
"step": 2720
},
{
"epoch": 0.18361915029817055,
"grad_norm": 5.6877990141134305,
"learning_rate": 9.788483402674041e-07,
"loss": 1.9884,
"step": 2725
},
{
"epoch": 0.1839560661702773,
"grad_norm": 6.151314409969232,
"learning_rate": 9.786787806756434e-07,
"loss": 1.9299,
"step": 2730
},
{
"epoch": 0.184292982042384,
"grad_norm": 6.575318390036048,
"learning_rate": 9.78508558972492e-07,
"loss": 1.934,
"step": 2735
},
{
"epoch": 0.18462989791449075,
"grad_norm": 5.919173357209151,
"learning_rate": 9.783376753934015e-07,
"loss": 1.9586,
"step": 2740
},
{
"epoch": 0.1849668137865975,
"grad_norm": 6.286929037909757,
"learning_rate": 9.781661301747393e-07,
"loss": 1.8594,
"step": 2745
},
{
"epoch": 0.18530372965870423,
"grad_norm": 6.31734888329098,
"learning_rate": 9.779939235537879e-07,
"loss": 1.9917,
"step": 2750
},
{
"epoch": 0.18564064553081094,
"grad_norm": 6.564405236880653,
"learning_rate": 9.778210557687443e-07,
"loss": 1.9372,
"step": 2755
},
{
"epoch": 0.18597756140291768,
"grad_norm": 6.256750421006695,
"learning_rate": 9.776475270587205e-07,
"loss": 1.9249,
"step": 2760
},
{
"epoch": 0.18631447727502443,
"grad_norm": 5.989838119364058,
"learning_rate": 9.774733376637421e-07,
"loss": 1.9155,
"step": 2765
},
{
"epoch": 0.18665139314713117,
"grad_norm": 6.446979854135191,
"learning_rate": 9.772984878247493e-07,
"loss": 1.9294,
"step": 2770
},
{
"epoch": 0.1869883090192379,
"grad_norm": 6.762973678300427,
"learning_rate": 9.771229777835952e-07,
"loss": 1.8937,
"step": 2775
},
{
"epoch": 0.18732522489134462,
"grad_norm": 5.787808739396207,
"learning_rate": 9.769468077830466e-07,
"loss": 1.89,
"step": 2780
},
{
"epoch": 0.18766214076345136,
"grad_norm": 6.000820283110658,
"learning_rate": 9.767699780667827e-07,
"loss": 1.9394,
"step": 2785
},
{
"epoch": 0.1879990566355581,
"grad_norm": 6.393820981303887,
"learning_rate": 9.765924888793955e-07,
"loss": 1.86,
"step": 2790
},
{
"epoch": 0.18833597250766484,
"grad_norm": 5.877371777184333,
"learning_rate": 9.76414340466389e-07,
"loss": 1.9419,
"step": 2795
},
{
"epoch": 0.18867288837977156,
"grad_norm": 6.908257437131703,
"learning_rate": 9.762355330741794e-07,
"loss": 1.9607,
"step": 2800
},
{
"epoch": 0.1890098042518783,
"grad_norm": 5.869255461740951,
"learning_rate": 9.760560669500941e-07,
"loss": 1.925,
"step": 2805
},
{
"epoch": 0.18934672012398504,
"grad_norm": 5.975145370261582,
"learning_rate": 9.758759423423716e-07,
"loss": 1.9067,
"step": 2810
},
{
"epoch": 0.18968363599609178,
"grad_norm": 6.276438455622569,
"learning_rate": 9.756951595001617e-07,
"loss": 1.8599,
"step": 2815
},
{
"epoch": 0.19002055186819852,
"grad_norm": 6.386035436547608,
"learning_rate": 9.755137186735238e-07,
"loss": 1.9101,
"step": 2820
},
{
"epoch": 0.19035746774030524,
"grad_norm": 6.498761340697311,
"learning_rate": 9.753316201134282e-07,
"loss": 1.9375,
"step": 2825
},
{
"epoch": 0.19069438361241198,
"grad_norm": 5.868215854791331,
"learning_rate": 9.75148864071755e-07,
"loss": 1.8861,
"step": 2830
},
{
"epoch": 0.19103129948451872,
"grad_norm": 6.259936017759626,
"learning_rate": 9.74965450801293e-07,
"loss": 1.9049,
"step": 2835
},
{
"epoch": 0.19136821535662546,
"grad_norm": 6.343131958265802,
"learning_rate": 9.747813805557408e-07,
"loss": 1.898,
"step": 2840
},
{
"epoch": 0.19170513122873217,
"grad_norm": 6.048544900470273,
"learning_rate": 9.745966535897054e-07,
"loss": 1.8809,
"step": 2845
},
{
"epoch": 0.19204204710083891,
"grad_norm": 5.9281878870137,
"learning_rate": 9.744112701587024e-07,
"loss": 1.9059,
"step": 2850
},
{
"epoch": 0.19237896297294566,
"grad_norm": 6.21419556733591,
"learning_rate": 9.742252305191551e-07,
"loss": 1.8776,
"step": 2855
},
{
"epoch": 0.1927158788450524,
"grad_norm": 6.4341300465391775,
"learning_rate": 9.740385349283946e-07,
"loss": 1.9311,
"step": 2860
},
{
"epoch": 0.19305279471715914,
"grad_norm": 6.42152192546941,
"learning_rate": 9.738511836446596e-07,
"loss": 1.9562,
"step": 2865
},
{
"epoch": 0.19338971058926585,
"grad_norm": 5.824133775486518,
"learning_rate": 9.736631769270957e-07,
"loss": 1.9235,
"step": 2870
},
{
"epoch": 0.1937266264613726,
"grad_norm": 6.247305043191556,
"learning_rate": 9.734745150357544e-07,
"loss": 1.9226,
"step": 2875
},
{
"epoch": 0.19406354233347933,
"grad_norm": 6.080141784771128,
"learning_rate": 9.732851982315944e-07,
"loss": 1.9777,
"step": 2880
},
{
"epoch": 0.19440045820558607,
"grad_norm": 6.322132356228769,
"learning_rate": 9.730952267764796e-07,
"loss": 1.896,
"step": 2885
},
{
"epoch": 0.1947373740776928,
"grad_norm": 6.433030057968718,
"learning_rate": 9.729046009331798e-07,
"loss": 1.877,
"step": 2890
},
{
"epoch": 0.19507428994979953,
"grad_norm": 6.10661292779823,
"learning_rate": 9.727133209653696e-07,
"loss": 1.9049,
"step": 2895
},
{
"epoch": 0.19541120582190627,
"grad_norm": 5.892069850886527,
"learning_rate": 9.72521387137629e-07,
"loss": 1.9512,
"step": 2900
},
{
"epoch": 0.195748121694013,
"grad_norm": 6.26071736541495,
"learning_rate": 9.723287997154419e-07,
"loss": 1.9702,
"step": 2905
},
{
"epoch": 0.19608503756611975,
"grad_norm": 5.794806052244091,
"learning_rate": 9.72135558965196e-07,
"loss": 1.921,
"step": 2910
},
{
"epoch": 0.19642195343822647,
"grad_norm": 5.809504947809818,
"learning_rate": 9.719416651541837e-07,
"loss": 1.8608,
"step": 2915
},
{
"epoch": 0.1967588693103332,
"grad_norm": 5.814187161790213,
"learning_rate": 9.717471185505996e-07,
"loss": 1.8828,
"step": 2920
},
{
"epoch": 0.19709578518243995,
"grad_norm": 6.093734929582276,
"learning_rate": 9.715519194235422e-07,
"loss": 1.8619,
"step": 2925
},
{
"epoch": 0.1974327010545467,
"grad_norm": 6.3046806166987714,
"learning_rate": 9.713560680430117e-07,
"loss": 1.9335,
"step": 2930
},
{
"epoch": 0.1977696169266534,
"grad_norm": 6.427520474069085,
"learning_rate": 9.71159564679911e-07,
"loss": 1.909,
"step": 2935
},
{
"epoch": 0.19810653279876014,
"grad_norm": 6.232853782135515,
"learning_rate": 9.709624096060449e-07,
"loss": 1.8993,
"step": 2940
},
{
"epoch": 0.19844344867086688,
"grad_norm": 5.853044309136511,
"learning_rate": 9.707646030941192e-07,
"loss": 1.8984,
"step": 2945
},
{
"epoch": 0.19878036454297363,
"grad_norm": 6.136548097509919,
"learning_rate": 9.705661454177416e-07,
"loss": 1.9118,
"step": 2950
},
{
"epoch": 0.19911728041508037,
"grad_norm": 6.534000467489757,
"learning_rate": 9.703670368514192e-07,
"loss": 1.9262,
"step": 2955
},
{
"epoch": 0.19945419628718708,
"grad_norm": 6.290539857874704,
"learning_rate": 9.701672776705609e-07,
"loss": 1.9088,
"step": 2960
},
{
"epoch": 0.19979111215929382,
"grad_norm": 6.302727180291852,
"learning_rate": 9.699668681514746e-07,
"loss": 1.9414,
"step": 2965
},
{
"epoch": 0.20012802803140056,
"grad_norm": 5.664375469611846,
"learning_rate": 9.697658085713676e-07,
"loss": 1.9472,
"step": 2970
},
{
"epoch": 0.2004649439035073,
"grad_norm": 6.866737640209192,
"learning_rate": 9.695640992083471e-07,
"loss": 1.8569,
"step": 2975
},
{
"epoch": 0.20080185977561402,
"grad_norm": 6.152666956064643,
"learning_rate": 9.693617403414188e-07,
"loss": 1.9055,
"step": 2980
},
{
"epoch": 0.20113877564772076,
"grad_norm": 6.514327490736937,
"learning_rate": 9.691587322504865e-07,
"loss": 1.8868,
"step": 2985
},
{
"epoch": 0.2014756915198275,
"grad_norm": 6.682738501012025,
"learning_rate": 9.68955075216352e-07,
"loss": 1.9672,
"step": 2990
},
{
"epoch": 0.20181260739193424,
"grad_norm": 6.522921716857152,
"learning_rate": 9.687507695207154e-07,
"loss": 1.9148,
"step": 2995
},
{
"epoch": 0.20214952326404098,
"grad_norm": 6.318875468016384,
"learning_rate": 9.685458154461731e-07,
"loss": 1.878,
"step": 3000
},
{
"epoch": 0.2024864391361477,
"grad_norm": 5.609414606888322,
"learning_rate": 9.683402132762193e-07,
"loss": 1.937,
"step": 3005
},
{
"epoch": 0.20282335500825444,
"grad_norm": 6.2216093413120985,
"learning_rate": 9.68133963295244e-07,
"loss": 1.9736,
"step": 3010
},
{
"epoch": 0.20316027088036118,
"grad_norm": 6.189142393589986,
"learning_rate": 9.679270657885334e-07,
"loss": 1.8393,
"step": 3015
},
{
"epoch": 0.20349718675246792,
"grad_norm": 6.203512003745445,
"learning_rate": 9.677195210422693e-07,
"loss": 1.9523,
"step": 3020
},
{
"epoch": 0.20383410262457463,
"grad_norm": 6.144164019790604,
"learning_rate": 9.675113293435288e-07,
"loss": 1.9024,
"step": 3025
},
{
"epoch": 0.20417101849668137,
"grad_norm": 5.712284742254756,
"learning_rate": 9.673024909802841e-07,
"loss": 1.8785,
"step": 3030
},
{
"epoch": 0.2045079343687881,
"grad_norm": 5.670735815579872,
"learning_rate": 9.670930062414017e-07,
"loss": 1.8413,
"step": 3035
},
{
"epoch": 0.20484485024089485,
"grad_norm": 6.638834689424272,
"learning_rate": 9.66882875416642e-07,
"loss": 1.9462,
"step": 3040
},
{
"epoch": 0.2051817661130016,
"grad_norm": 6.2519172937020535,
"learning_rate": 9.666720987966595e-07,
"loss": 1.903,
"step": 3045
},
{
"epoch": 0.2055186819851083,
"grad_norm": 6.05215038719685,
"learning_rate": 9.664606766730012e-07,
"loss": 1.9139,
"step": 3050
},
{
"epoch": 0.20585559785721505,
"grad_norm": 6.0549888495357465,
"learning_rate": 9.662486093381082e-07,
"loss": 1.905,
"step": 3055
},
{
"epoch": 0.2061925137293218,
"grad_norm": 5.955088538325352,
"learning_rate": 9.660358970853126e-07,
"loss": 1.8374,
"step": 3060
},
{
"epoch": 0.20652942960142853,
"grad_norm": 5.905908530023023,
"learning_rate": 9.658225402088395e-07,
"loss": 1.8623,
"step": 3065
},
{
"epoch": 0.20686634547353525,
"grad_norm": 5.872087794008919,
"learning_rate": 9.656085390038058e-07,
"loss": 1.9301,
"step": 3070
},
{
"epoch": 0.207203261345642,
"grad_norm": 6.220493564964145,
"learning_rate": 9.653938937662187e-07,
"loss": 1.9098,
"step": 3075
},
{
"epoch": 0.20754017721774873,
"grad_norm": 5.761561040327359,
"learning_rate": 9.651786047929772e-07,
"loss": 1.8888,
"step": 3080
},
{
"epoch": 0.20787709308985547,
"grad_norm": 6.119415940843171,
"learning_rate": 9.649626723818702e-07,
"loss": 1.9355,
"step": 3085
},
{
"epoch": 0.2082140089619622,
"grad_norm": 6.212140870550058,
"learning_rate": 9.647460968315767e-07,
"loss": 1.9884,
"step": 3090
},
{
"epoch": 0.20855092483406892,
"grad_norm": 6.31960790183109,
"learning_rate": 9.645288784416652e-07,
"loss": 1.8941,
"step": 3095
},
{
"epoch": 0.20888784070617566,
"grad_norm": 6.198869449733293,
"learning_rate": 9.643110175125935e-07,
"loss": 1.9418,
"step": 3100
},
{
"epoch": 0.2092247565782824,
"grad_norm": 5.994224284054857,
"learning_rate": 9.640925143457084e-07,
"loss": 1.8749,
"step": 3105
},
{
"epoch": 0.20956167245038915,
"grad_norm": 6.534637990302735,
"learning_rate": 9.638733692432448e-07,
"loss": 1.9156,
"step": 3110
},
{
"epoch": 0.20989858832249586,
"grad_norm": 6.350636283397908,
"learning_rate": 9.636535825083252e-07,
"loss": 1.9408,
"step": 3115
},
{
"epoch": 0.2102355041946026,
"grad_norm": 5.747429709584084,
"learning_rate": 9.634331544449601e-07,
"loss": 1.9303,
"step": 3120
},
{
"epoch": 0.21057242006670934,
"grad_norm": 6.050823517306343,
"learning_rate": 9.632120853580472e-07,
"loss": 1.9323,
"step": 3125
},
{
"epoch": 0.21090933593881608,
"grad_norm": 5.676142963813065,
"learning_rate": 9.6299037555337e-07,
"loss": 1.865,
"step": 3130
},
{
"epoch": 0.21124625181092282,
"grad_norm": 6.296718830562775,
"learning_rate": 9.627680253375997e-07,
"loss": 1.8542,
"step": 3135
},
{
"epoch": 0.21158316768302954,
"grad_norm": 6.182241743324326,
"learning_rate": 9.625450350182918e-07,
"loss": 1.9004,
"step": 3140
},
{
"epoch": 0.21192008355513628,
"grad_norm": 5.703077632023605,
"learning_rate": 9.62321404903888e-07,
"loss": 1.8512,
"step": 3145
},
{
"epoch": 0.21225699942724302,
"grad_norm": 5.649919083346945,
"learning_rate": 9.620971353037148e-07,
"loss": 1.8742,
"step": 3150
},
{
"epoch": 0.21259391529934976,
"grad_norm": 6.162255987071309,
"learning_rate": 9.618722265279835e-07,
"loss": 1.9086,
"step": 3155
},
{
"epoch": 0.21293083117145647,
"grad_norm": 6.877459967522398,
"learning_rate": 9.61646678887789e-07,
"loss": 1.9141,
"step": 3160
},
{
"epoch": 0.21326774704356322,
"grad_norm": 7.064015072257185,
"learning_rate": 9.614204926951102e-07,
"loss": 1.9449,
"step": 3165
},
{
"epoch": 0.21360466291566996,
"grad_norm": 5.8916080827898965,
"learning_rate": 9.611936682628095e-07,
"loss": 1.9092,
"step": 3170
},
{
"epoch": 0.2139415787877767,
"grad_norm": 5.964533043114863,
"learning_rate": 9.609662059046315e-07,
"loss": 1.8819,
"step": 3175
},
{
"epoch": 0.21427849465988344,
"grad_norm": 5.916309521260202,
"learning_rate": 9.607381059352038e-07,
"loss": 1.8941,
"step": 3180
},
{
"epoch": 0.21461541053199015,
"grad_norm": 6.145903770013445,
"learning_rate": 9.605093686700353e-07,
"loss": 1.9525,
"step": 3185
},
{
"epoch": 0.2149523264040969,
"grad_norm": 6.26210111100357,
"learning_rate": 9.602799944255172e-07,
"loss": 1.8631,
"step": 3190
},
{
"epoch": 0.21528924227620364,
"grad_norm": 6.033360078420039,
"learning_rate": 9.60049983518921e-07,
"loss": 1.9637,
"step": 3195
},
{
"epoch": 0.21562615814831038,
"grad_norm": 5.862437650692546,
"learning_rate": 9.598193362683995e-07,
"loss": 1.8619,
"step": 3200
},
{
"epoch": 0.2159630740204171,
"grad_norm": 6.846493212362844,
"learning_rate": 9.59588052992985e-07,
"loss": 1.8711,
"step": 3205
},
{
"epoch": 0.21629998989252383,
"grad_norm": 6.054162969621891,
"learning_rate": 9.5935613401259e-07,
"loss": 1.9317,
"step": 3210
},
{
"epoch": 0.21663690576463057,
"grad_norm": 5.853489681208746,
"learning_rate": 9.591235796480064e-07,
"loss": 1.9221,
"step": 3215
},
{
"epoch": 0.2169738216367373,
"grad_norm": 6.026177124643552,
"learning_rate": 9.588903902209048e-07,
"loss": 1.8886,
"step": 3220
},
{
"epoch": 0.21731073750884405,
"grad_norm": 6.026706526511821,
"learning_rate": 9.586565660538343e-07,
"loss": 1.875,
"step": 3225
},
{
"epoch": 0.21764765338095077,
"grad_norm": 5.931027220154246,
"learning_rate": 9.584221074702217e-07,
"loss": 1.9771,
"step": 3230
},
{
"epoch": 0.2179845692530575,
"grad_norm": 6.178018868545221,
"learning_rate": 9.581870147943715e-07,
"loss": 1.893,
"step": 3235
},
{
"epoch": 0.21832148512516425,
"grad_norm": 5.983546923887888,
"learning_rate": 9.579512883514656e-07,
"loss": 1.9382,
"step": 3240
},
{
"epoch": 0.218658400997271,
"grad_norm": 6.313877194235666,
"learning_rate": 9.577149284675619e-07,
"loss": 1.9233,
"step": 3245
},
{
"epoch": 0.2189953168693777,
"grad_norm": 5.647925594777457,
"learning_rate": 9.574779354695951e-07,
"loss": 1.9285,
"step": 3250
},
{
"epoch": 0.21933223274148445,
"grad_norm": 6.052584204662514,
"learning_rate": 9.572403096853754e-07,
"loss": 1.8937,
"step": 3255
},
{
"epoch": 0.2196691486135912,
"grad_norm": 5.78982516243247,
"learning_rate": 9.570020514435878e-07,
"loss": 1.9158,
"step": 3260
},
{
"epoch": 0.22000606448569793,
"grad_norm": 6.5555394076193165,
"learning_rate": 9.567631610737929e-07,
"loss": 1.8403,
"step": 3265
},
{
"epoch": 0.22034298035780467,
"grad_norm": 6.244304553460857,
"learning_rate": 9.565236389064255e-07,
"loss": 1.9387,
"step": 3270
},
{
"epoch": 0.22067989622991138,
"grad_norm": 6.019749558522282,
"learning_rate": 9.562834852727935e-07,
"loss": 1.9252,
"step": 3275
},
{
"epoch": 0.22101681210201812,
"grad_norm": 6.030689411078262,
"learning_rate": 9.560427005050793e-07,
"loss": 1.9028,
"step": 3280
},
{
"epoch": 0.22135372797412486,
"grad_norm": 5.829221689829186,
"learning_rate": 9.55801284936338e-07,
"loss": 1.88,
"step": 3285
},
{
"epoch": 0.2216906438462316,
"grad_norm": 6.352115907311686,
"learning_rate": 9.555592389004966e-07,
"loss": 1.8563,
"step": 3290
},
{
"epoch": 0.22202755971833832,
"grad_norm": 6.346082954813021,
"learning_rate": 9.553165627323548e-07,
"loss": 1.9118,
"step": 3295
},
{
"epoch": 0.22236447559044506,
"grad_norm": 6.139627054619066,
"learning_rate": 9.55073256767584e-07,
"loss": 1.8908,
"step": 3300
},
{
"epoch": 0.2227013914625518,
"grad_norm": 6.214924215026803,
"learning_rate": 9.548293213427262e-07,
"loss": 1.8842,
"step": 3305
},
{
"epoch": 0.22303830733465854,
"grad_norm": 5.962888824637297,
"learning_rate": 9.545847567951944e-07,
"loss": 1.9077,
"step": 3310
},
{
"epoch": 0.22337522320676528,
"grad_norm": 6.353914244158636,
"learning_rate": 9.543395634632721e-07,
"loss": 1.9208,
"step": 3315
},
{
"epoch": 0.223712139078872,
"grad_norm": 5.778557004198218,
"learning_rate": 9.540937416861117e-07,
"loss": 1.8881,
"step": 3320
},
{
"epoch": 0.22404905495097874,
"grad_norm": 7.2073119475486545,
"learning_rate": 9.538472918037356e-07,
"loss": 1.9496,
"step": 3325
},
{
"epoch": 0.22438597082308548,
"grad_norm": 5.727299882070468,
"learning_rate": 9.536002141570348e-07,
"loss": 1.946,
"step": 3330
},
{
"epoch": 0.22472288669519222,
"grad_norm": 5.994342052634025,
"learning_rate": 9.533525090877688e-07,
"loss": 1.9398,
"step": 3335
},
{
"epoch": 0.22505980256729893,
"grad_norm": 6.327096679874867,
"learning_rate": 9.531041769385641e-07,
"loss": 1.8929,
"step": 3340
},
{
"epoch": 0.22539671843940567,
"grad_norm": 6.51167490064826,
"learning_rate": 9.528552180529161e-07,
"loss": 2.0104,
"step": 3345
},
{
"epoch": 0.22573363431151242,
"grad_norm": 6.516651466812798,
"learning_rate": 9.526056327751856e-07,
"loss": 1.9016,
"step": 3350
},
{
"epoch": 0.22607055018361916,
"grad_norm": 6.6127345474256956,
"learning_rate": 9.523554214506006e-07,
"loss": 1.9314,
"step": 3355
},
{
"epoch": 0.2264074660557259,
"grad_norm": 6.839303175267801,
"learning_rate": 9.521045844252551e-07,
"loss": 1.9305,
"step": 3360
},
{
"epoch": 0.2267443819278326,
"grad_norm": 6.186255355465,
"learning_rate": 9.518531220461084e-07,
"loss": 1.8877,
"step": 3365
},
{
"epoch": 0.22708129779993935,
"grad_norm": 5.369845972157766,
"learning_rate": 9.516010346609845e-07,
"loss": 1.8766,
"step": 3370
},
{
"epoch": 0.2274182136720461,
"grad_norm": 6.378683661332048,
"learning_rate": 9.513483226185723e-07,
"loss": 1.9803,
"step": 3375
},
{
"epoch": 0.22775512954415283,
"grad_norm": 6.484312873315589,
"learning_rate": 9.510949862684248e-07,
"loss": 1.8739,
"step": 3380
},
{
"epoch": 0.22809204541625955,
"grad_norm": 6.0136471572220165,
"learning_rate": 9.508410259609583e-07,
"loss": 1.917,
"step": 3385
},
{
"epoch": 0.2284289612883663,
"grad_norm": 6.250184322801729,
"learning_rate": 9.505864420474522e-07,
"loss": 1.9138,
"step": 3390
},
{
"epoch": 0.22876587716047303,
"grad_norm": 6.3250547018073195,
"learning_rate": 9.503312348800485e-07,
"loss": 1.8632,
"step": 3395
},
{
"epoch": 0.22910279303257977,
"grad_norm": 5.740473890542984,
"learning_rate": 9.500754048117514e-07,
"loss": 1.8823,
"step": 3400
},
{
"epoch": 0.2294397089046865,
"grad_norm": 5.777022768497205,
"learning_rate": 9.498189521964263e-07,
"loss": 1.8323,
"step": 3405
},
{
"epoch": 0.22977662477679323,
"grad_norm": 6.20964502865978,
"learning_rate": 9.495618773888006e-07,
"loss": 1.8181,
"step": 3410
},
{
"epoch": 0.23011354064889997,
"grad_norm": 6.134239103081429,
"learning_rate": 9.49304180744461e-07,
"loss": 1.9124,
"step": 3415
},
{
"epoch": 0.2304504565210067,
"grad_norm": 5.766924227242812,
"learning_rate": 9.490458626198556e-07,
"loss": 1.8891,
"step": 3420
},
{
"epoch": 0.23078737239311345,
"grad_norm": 6.476492599044412,
"learning_rate": 9.487869233722915e-07,
"loss": 1.879,
"step": 3425
},
{
"epoch": 0.23112428826522016,
"grad_norm": 6.1492500389967875,
"learning_rate": 9.485273633599348e-07,
"loss": 1.8936,
"step": 3430
},
{
"epoch": 0.2314612041373269,
"grad_norm": 6.107479826667597,
"learning_rate": 9.482671829418107e-07,
"loss": 1.8926,
"step": 3435
},
{
"epoch": 0.23179812000943364,
"grad_norm": 5.833724685300883,
"learning_rate": 9.480063824778024e-07,
"loss": 1.8725,
"step": 3440
},
{
"epoch": 0.23213503588154039,
"grad_norm": 6.169800273715307,
"learning_rate": 9.477449623286505e-07,
"loss": 1.8361,
"step": 3445
},
{
"epoch": 0.23247195175364713,
"grad_norm": 5.717025967015645,
"learning_rate": 9.474829228559529e-07,
"loss": 1.9336,
"step": 3450
},
{
"epoch": 0.23280886762575384,
"grad_norm": 6.571826219281641,
"learning_rate": 9.472202644221643e-07,
"loss": 1.9702,
"step": 3455
},
{
"epoch": 0.23314578349786058,
"grad_norm": 6.267842479579244,
"learning_rate": 9.469569873905955e-07,
"loss": 1.8596,
"step": 3460
},
{
"epoch": 0.23348269936996732,
"grad_norm": 5.971915133277161,
"learning_rate": 9.466930921254128e-07,
"loss": 1.8525,
"step": 3465
},
{
"epoch": 0.23381961524207406,
"grad_norm": 6.562137662722172,
"learning_rate": 9.464285789916376e-07,
"loss": 1.8986,
"step": 3470
},
{
"epoch": 0.23415653111418078,
"grad_norm": 6.385638926464063,
"learning_rate": 9.461634483551464e-07,
"loss": 1.843,
"step": 3475
},
{
"epoch": 0.23449344698628752,
"grad_norm": 5.845903723462544,
"learning_rate": 9.458977005826691e-07,
"loss": 1.8584,
"step": 3480
},
{
"epoch": 0.23483036285839426,
"grad_norm": 6.2980063600222085,
"learning_rate": 9.456313360417899e-07,
"loss": 1.9289,
"step": 3485
},
{
"epoch": 0.235167278730501,
"grad_norm": 5.683558465001443,
"learning_rate": 9.453643551009459e-07,
"loss": 1.8656,
"step": 3490
},
{
"epoch": 0.23550419460260774,
"grad_norm": 6.508247982606044,
"learning_rate": 9.450967581294265e-07,
"loss": 1.9393,
"step": 3495
},
{
"epoch": 0.23584111047471445,
"grad_norm": 5.700511178061109,
"learning_rate": 9.448285454973737e-07,
"loss": 1.8814,
"step": 3500
},
{
"epoch": 0.2361780263468212,
"grad_norm": 6.903117920839508,
"learning_rate": 9.445597175757806e-07,
"loss": 1.8339,
"step": 3505
},
{
"epoch": 0.23651494221892794,
"grad_norm": 5.983876339449441,
"learning_rate": 9.442902747364918e-07,
"loss": 1.8379,
"step": 3510
},
{
"epoch": 0.23685185809103468,
"grad_norm": 5.918219144963591,
"learning_rate": 9.440202173522022e-07,
"loss": 1.9062,
"step": 3515
},
{
"epoch": 0.2371887739631414,
"grad_norm": 6.053975291994282,
"learning_rate": 9.437495457964568e-07,
"loss": 1.8807,
"step": 3520
},
{
"epoch": 0.23752568983524813,
"grad_norm": 5.791738879936693,
"learning_rate": 9.434782604436502e-07,
"loss": 1.8712,
"step": 3525
},
{
"epoch": 0.23786260570735487,
"grad_norm": 6.018683919389132,
"learning_rate": 9.432063616690258e-07,
"loss": 1.8687,
"step": 3530
},
{
"epoch": 0.23819952157946161,
"grad_norm": 6.984256425631296,
"learning_rate": 9.429338498486758e-07,
"loss": 1.842,
"step": 3535
},
{
"epoch": 0.23853643745156836,
"grad_norm": 6.014792567520862,
"learning_rate": 9.426607253595402e-07,
"loss": 1.915,
"step": 3540
},
{
"epoch": 0.23887335332367507,
"grad_norm": 5.52165584469138,
"learning_rate": 9.423869885794063e-07,
"loss": 1.896,
"step": 3545
},
{
"epoch": 0.2392102691957818,
"grad_norm": 6.341307272489993,
"learning_rate": 9.421126398869086e-07,
"loss": 1.9376,
"step": 3550
},
{
"epoch": 0.23954718506788855,
"grad_norm": 6.3188324705440015,
"learning_rate": 9.418376796615279e-07,
"loss": 1.8711,
"step": 3555
},
{
"epoch": 0.2398841009399953,
"grad_norm": 6.035398818050593,
"learning_rate": 9.415621082835908e-07,
"loss": 1.8746,
"step": 3560
},
{
"epoch": 0.240221016812102,
"grad_norm": 5.711076837553516,
"learning_rate": 9.412859261342691e-07,
"loss": 1.8408,
"step": 3565
},
{
"epoch": 0.24055793268420875,
"grad_norm": 5.993468075806861,
"learning_rate": 9.410091335955798e-07,
"loss": 1.8672,
"step": 3570
},
{
"epoch": 0.2408948485563155,
"grad_norm": 6.360069219854257,
"learning_rate": 9.407317310503841e-07,
"loss": 1.8968,
"step": 3575
},
{
"epoch": 0.24123176442842223,
"grad_norm": 6.56717329121729,
"learning_rate": 9.404537188823869e-07,
"loss": 1.9586,
"step": 3580
},
{
"epoch": 0.24156868030052897,
"grad_norm": 5.796197742968857,
"learning_rate": 9.40175097476136e-07,
"loss": 1.8776,
"step": 3585
},
{
"epoch": 0.24190559617263568,
"grad_norm": 6.672312881743043,
"learning_rate": 9.398958672170225e-07,
"loss": 1.8598,
"step": 3590
},
{
"epoch": 0.24224251204474243,
"grad_norm": 6.144503989213299,
"learning_rate": 9.396160284912795e-07,
"loss": 1.9091,
"step": 3595
},
{
"epoch": 0.24257942791684917,
"grad_norm": 6.0819379510788245,
"learning_rate": 9.393355816859813e-07,
"loss": 1.8915,
"step": 3600
},
{
"epoch": 0.2429163437889559,
"grad_norm": 6.323366435634516,
"learning_rate": 9.390545271890437e-07,
"loss": 1.9043,
"step": 3605
},
{
"epoch": 0.24325325966106262,
"grad_norm": 6.4339698201446875,
"learning_rate": 9.387728653892233e-07,
"loss": 1.881,
"step": 3610
},
{
"epoch": 0.24359017553316936,
"grad_norm": 6.001463813574768,
"learning_rate": 9.384905966761159e-07,
"loss": 1.9053,
"step": 3615
},
{
"epoch": 0.2439270914052761,
"grad_norm": 6.399446616197819,
"learning_rate": 9.382077214401576e-07,
"loss": 1.8755,
"step": 3620
},
{
"epoch": 0.24426400727738284,
"grad_norm": 6.66748750013314,
"learning_rate": 9.379242400726232e-07,
"loss": 1.9006,
"step": 3625
},
{
"epoch": 0.24460092314948959,
"grad_norm": 6.826259347914525,
"learning_rate": 9.376401529656257e-07,
"loss": 1.9043,
"step": 3630
},
{
"epoch": 0.2449378390215963,
"grad_norm": 5.891290217702254,
"learning_rate": 9.373554605121161e-07,
"loss": 1.8244,
"step": 3635
},
{
"epoch": 0.24527475489370304,
"grad_norm": 6.181375359799177,
"learning_rate": 9.370701631058828e-07,
"loss": 1.8704,
"step": 3640
},
{
"epoch": 0.24561167076580978,
"grad_norm": 6.187612179223154,
"learning_rate": 9.367842611415508e-07,
"loss": 1.9051,
"step": 3645
},
{
"epoch": 0.24594858663791652,
"grad_norm": 5.964662147997069,
"learning_rate": 9.364977550145816e-07,
"loss": 1.8619,
"step": 3650
},
{
"epoch": 0.24628550251002324,
"grad_norm": 5.965890038499228,
"learning_rate": 9.362106451212721e-07,
"loss": 1.9405,
"step": 3655
},
{
"epoch": 0.24662241838212998,
"grad_norm": 6.664341876347663,
"learning_rate": 9.359229318587545e-07,
"loss": 1.9019,
"step": 3660
},
{
"epoch": 0.24695933425423672,
"grad_norm": 6.259088317611612,
"learning_rate": 9.356346156249954e-07,
"loss": 1.903,
"step": 3665
},
{
"epoch": 0.24729625012634346,
"grad_norm": 6.153712987251788,
"learning_rate": 9.353456968187958e-07,
"loss": 1.9271,
"step": 3670
},
{
"epoch": 0.2476331659984502,
"grad_norm": 5.8187434752693115,
"learning_rate": 9.350561758397897e-07,
"loss": 1.899,
"step": 3675
},
{
"epoch": 0.2479700818705569,
"grad_norm": 6.9370464485783545,
"learning_rate": 9.347660530884442e-07,
"loss": 1.9089,
"step": 3680
},
{
"epoch": 0.24830699774266365,
"grad_norm": 5.993727430508131,
"learning_rate": 9.344753289660592e-07,
"loss": 1.8896,
"step": 3685
},
{
"epoch": 0.2486439136147704,
"grad_norm": 6.832030622937063,
"learning_rate": 9.34184003874766e-07,
"loss": 1.9308,
"step": 3690
},
{
"epoch": 0.24898082948687714,
"grad_norm": 6.42103736138985,
"learning_rate": 9.338920782175269e-07,
"loss": 1.9103,
"step": 3695
},
{
"epoch": 0.24931774535898385,
"grad_norm": 6.567901996041012,
"learning_rate": 9.335995523981355e-07,
"loss": 1.9014,
"step": 3700
},
{
"epoch": 0.2496546612310906,
"grad_norm": 6.527187605310147,
"learning_rate": 9.333064268212153e-07,
"loss": 1.8274,
"step": 3705
},
{
"epoch": 0.24999157710319733,
"grad_norm": 6.482190165711818,
"learning_rate": 9.330127018922193e-07,
"loss": 1.902,
"step": 3710
},
{
"epoch": 0.2503284929753041,
"grad_norm": 5.668326143123714,
"learning_rate": 9.327183780174296e-07,
"loss": 1.8553,
"step": 3715
},
{
"epoch": 0.2506654088474108,
"grad_norm": 6.277307710196522,
"learning_rate": 9.324234556039567e-07,
"loss": 1.869,
"step": 3720
},
{
"epoch": 0.25100232471951756,
"grad_norm": 6.070880841042985,
"learning_rate": 9.321279350597393e-07,
"loss": 1.8632,
"step": 3725
},
{
"epoch": 0.25133924059162427,
"grad_norm": 5.564591682369898,
"learning_rate": 9.31831816793543e-07,
"loss": 1.8442,
"step": 3730
},
{
"epoch": 0.251676156463731,
"grad_norm": 5.911710322350655,
"learning_rate": 9.315351012149605e-07,
"loss": 1.8729,
"step": 3735
},
{
"epoch": 0.25201307233583775,
"grad_norm": 5.69969301095379,
"learning_rate": 9.312377887344105e-07,
"loss": 1.8273,
"step": 3740
},
{
"epoch": 0.25234998820794446,
"grad_norm": 5.758190049455089,
"learning_rate": 9.309398797631374e-07,
"loss": 1.8973,
"step": 3745
},
{
"epoch": 0.25268690408005123,
"grad_norm": 5.829441665364131,
"learning_rate": 9.306413747132108e-07,
"loss": 1.836,
"step": 3750
},
{
"epoch": 0.25302381995215795,
"grad_norm": 7.494080400403322,
"learning_rate": 9.303422739975246e-07,
"loss": 1.9453,
"step": 3755
},
{
"epoch": 0.25336073582426466,
"grad_norm": 6.367300908264392,
"learning_rate": 9.300425780297968e-07,
"loss": 1.8818,
"step": 3760
},
{
"epoch": 0.25369765169637143,
"grad_norm": 6.432621170142442,
"learning_rate": 9.297422872245686e-07,
"loss": 1.8613,
"step": 3765
},
{
"epoch": 0.25403456756847814,
"grad_norm": 6.108541333471499,
"learning_rate": 9.294414019972043e-07,
"loss": 1.8835,
"step": 3770
},
{
"epoch": 0.2543714834405849,
"grad_norm": 6.7536525309650335,
"learning_rate": 9.291399227638898e-07,
"loss": 1.8982,
"step": 3775
},
{
"epoch": 0.2547083993126916,
"grad_norm": 5.872721539921024,
"learning_rate": 9.288378499416332e-07,
"loss": 1.8544,
"step": 3780
},
{
"epoch": 0.25504531518479834,
"grad_norm": 6.476884234266525,
"learning_rate": 9.285351839482634e-07,
"loss": 1.8855,
"step": 3785
},
{
"epoch": 0.2553822310569051,
"grad_norm": 6.189916648502145,
"learning_rate": 9.2823192520243e-07,
"loss": 1.8873,
"step": 3790
},
{
"epoch": 0.2557191469290118,
"grad_norm": 5.983897060069876,
"learning_rate": 9.27928074123602e-07,
"loss": 1.8622,
"step": 3795
},
{
"epoch": 0.25605606280111853,
"grad_norm": 6.0808815520588615,
"learning_rate": 9.276236311320684e-07,
"loss": 1.8932,
"step": 3800
},
{
"epoch": 0.2563929786732253,
"grad_norm": 5.946657227019734,
"learning_rate": 9.273185966489365e-07,
"loss": 1.9228,
"step": 3805
},
{
"epoch": 0.256729894545332,
"grad_norm": 5.973295448688016,
"learning_rate": 9.270129710961318e-07,
"loss": 1.8135,
"step": 3810
},
{
"epoch": 0.2570668104174388,
"grad_norm": 5.556850084434461,
"learning_rate": 9.267067548963974e-07,
"loss": 1.8453,
"step": 3815
},
{
"epoch": 0.2574037262895455,
"grad_norm": 6.380485974028437,
"learning_rate": 9.263999484732934e-07,
"loss": 1.9156,
"step": 3820
},
{
"epoch": 0.2577406421616522,
"grad_norm": 6.097594854335576,
"learning_rate": 9.260925522511962e-07,
"loss": 1.8943,
"step": 3825
},
{
"epoch": 0.258077558033759,
"grad_norm": 5.871255691492415,
"learning_rate": 9.257845666552984e-07,
"loss": 1.9059,
"step": 3830
},
{
"epoch": 0.2584144739058657,
"grad_norm": 6.011724125372125,
"learning_rate": 9.254759921116073e-07,
"loss": 1.8983,
"step": 3835
},
{
"epoch": 0.25875138977797246,
"grad_norm": 6.4163291191100615,
"learning_rate": 9.251668290469452e-07,
"loss": 1.8849,
"step": 3840
},
{
"epoch": 0.2590883056500792,
"grad_norm": 5.812907082905663,
"learning_rate": 9.248570778889484e-07,
"loss": 1.8675,
"step": 3845
},
{
"epoch": 0.2594252215221859,
"grad_norm": 6.324746139412556,
"learning_rate": 9.245467390660664e-07,
"loss": 1.8303,
"step": 3850
},
{
"epoch": 0.25976213739429266,
"grad_norm": 6.0773552505761606,
"learning_rate": 9.242358130075618e-07,
"loss": 1.9011,
"step": 3855
},
{
"epoch": 0.26009905326639937,
"grad_norm": 5.809493160293788,
"learning_rate": 9.239243001435093e-07,
"loss": 1.8332,
"step": 3860
},
{
"epoch": 0.26043596913850614,
"grad_norm": 5.92861603094248,
"learning_rate": 9.236122009047957e-07,
"loss": 1.8137,
"step": 3865
},
{
"epoch": 0.26077288501061285,
"grad_norm": 6.5347777348808735,
"learning_rate": 9.232995157231182e-07,
"loss": 1.8711,
"step": 3870
},
{
"epoch": 0.26110980088271957,
"grad_norm": 5.818275626030584,
"learning_rate": 9.229862450309851e-07,
"loss": 1.8525,
"step": 3875
},
{
"epoch": 0.26144671675482634,
"grad_norm": 6.042244920418748,
"learning_rate": 9.226723892617141e-07,
"loss": 1.8957,
"step": 3880
},
{
"epoch": 0.26178363262693305,
"grad_norm": 5.006543607090774,
"learning_rate": 9.223579488494327e-07,
"loss": 1.8471,
"step": 3885
},
{
"epoch": 0.26212054849903976,
"grad_norm": 5.959886320776497,
"learning_rate": 9.220429242290763e-07,
"loss": 1.8556,
"step": 3890
},
{
"epoch": 0.26245746437114653,
"grad_norm": 6.192797328450058,
"learning_rate": 9.217273158363894e-07,
"loss": 1.8184,
"step": 3895
},
{
"epoch": 0.26279438024325324,
"grad_norm": 5.392472106463434,
"learning_rate": 9.214111241079232e-07,
"loss": 1.9066,
"step": 3900
},
{
"epoch": 0.26313129611536,
"grad_norm": 5.644651513241768,
"learning_rate": 9.21094349481036e-07,
"loss": 1.9127,
"step": 3905
},
{
"epoch": 0.2634682119874667,
"grad_norm": 6.226679672454185,
"learning_rate": 9.207769923938924e-07,
"loss": 1.8451,
"step": 3910
},
{
"epoch": 0.26380512785957344,
"grad_norm": 5.695454556727201,
"learning_rate": 9.204590532854627e-07,
"loss": 1.7827,
"step": 3915
},
{
"epoch": 0.2641420437316802,
"grad_norm": 5.98442814459207,
"learning_rate": 9.20140532595522e-07,
"loss": 1.9433,
"step": 3920
},
{
"epoch": 0.2644789596037869,
"grad_norm": 5.8671791162801155,
"learning_rate": 9.198214307646504e-07,
"loss": 1.8417,
"step": 3925
},
{
"epoch": 0.2648158754758937,
"grad_norm": 5.870417252327208,
"learning_rate": 9.195017482342313e-07,
"loss": 1.9082,
"step": 3930
},
{
"epoch": 0.2651527913480004,
"grad_norm": 6.4741759723264956,
"learning_rate": 9.191814854464514e-07,
"loss": 1.8705,
"step": 3935
},
{
"epoch": 0.2654897072201071,
"grad_norm": 5.862959275378989,
"learning_rate": 9.188606428443002e-07,
"loss": 1.8633,
"step": 3940
},
{
"epoch": 0.2658266230922139,
"grad_norm": 5.7751274179245,
"learning_rate": 9.185392208715692e-07,
"loss": 1.7832,
"step": 3945
},
{
"epoch": 0.2661635389643206,
"grad_norm": 6.172667926664809,
"learning_rate": 9.182172199728513e-07,
"loss": 1.8827,
"step": 3950
},
{
"epoch": 0.26650045483642737,
"grad_norm": 6.481705605622134,
"learning_rate": 9.178946405935398e-07,
"loss": 1.8974,
"step": 3955
},
{
"epoch": 0.2668373707085341,
"grad_norm": 6.208374646399624,
"learning_rate": 9.175714831798287e-07,
"loss": 1.8953,
"step": 3960
},
{
"epoch": 0.2671742865806408,
"grad_norm": 6.391722651555033,
"learning_rate": 9.172477481787113e-07,
"loss": 1.8014,
"step": 3965
},
{
"epoch": 0.26751120245274757,
"grad_norm": 6.037855150211001,
"learning_rate": 9.169234360379796e-07,
"loss": 1.8232,
"step": 3970
},
{
"epoch": 0.2678481183248543,
"grad_norm": 6.695825556952006,
"learning_rate": 9.165985472062244e-07,
"loss": 1.8885,
"step": 3975
},
{
"epoch": 0.268185034196961,
"grad_norm": 5.964301600300989,
"learning_rate": 9.162730821328337e-07,
"loss": 1.8774,
"step": 3980
},
{
"epoch": 0.26852195006906776,
"grad_norm": 6.396623330843612,
"learning_rate": 9.159470412679928e-07,
"loss": 1.9039,
"step": 3985
},
{
"epoch": 0.2688588659411745,
"grad_norm": 6.719419412679612,
"learning_rate": 9.156204250626836e-07,
"loss": 1.8773,
"step": 3990
},
{
"epoch": 0.26919578181328124,
"grad_norm": 6.879794042484075,
"learning_rate": 9.152932339686833e-07,
"loss": 1.9109,
"step": 3995
},
{
"epoch": 0.26953269768538796,
"grad_norm": 5.975921987217448,
"learning_rate": 9.149654684385647e-07,
"loss": 1.8618,
"step": 4000
},
{
"epoch": 0.26986961355749467,
"grad_norm": 6.629166390382459,
"learning_rate": 9.146371289256952e-07,
"loss": 1.798,
"step": 4005
},
{
"epoch": 0.27020652942960144,
"grad_norm": 5.949398882914333,
"learning_rate": 9.143082158842359e-07,
"loss": 1.9286,
"step": 4010
},
{
"epoch": 0.27054344530170815,
"grad_norm": 5.810972748863941,
"learning_rate": 9.139787297691413e-07,
"loss": 1.906,
"step": 4015
},
{
"epoch": 0.2708803611738149,
"grad_norm": 6.2120367421089115,
"learning_rate": 9.136486710361586e-07,
"loss": 1.9277,
"step": 4020
},
{
"epoch": 0.27121727704592163,
"grad_norm": 5.823041275387848,
"learning_rate": 9.13318040141827e-07,
"loss": 1.8543,
"step": 4025
},
{
"epoch": 0.27155419291802835,
"grad_norm": 6.214201315153956,
"learning_rate": 9.129868375434774e-07,
"loss": 1.8462,
"step": 4030
},
{
"epoch": 0.2718911087901351,
"grad_norm": 6.719918600313508,
"learning_rate": 9.12655063699231e-07,
"loss": 1.8796,
"step": 4035
},
{
"epoch": 0.27222802466224183,
"grad_norm": 6.3796230972963865,
"learning_rate": 9.123227190679994e-07,
"loss": 1.803,
"step": 4040
},
{
"epoch": 0.2725649405343486,
"grad_norm": 6.325649258604688,
"learning_rate": 9.119898041094838e-07,
"loss": 1.9,
"step": 4045
},
{
"epoch": 0.2729018564064553,
"grad_norm": 6.165527100110748,
"learning_rate": 9.116563192841741e-07,
"loss": 1.8718,
"step": 4050
},
{
"epoch": 0.273238772278562,
"grad_norm": 6.505461302130346,
"learning_rate": 9.113222650533486e-07,
"loss": 1.9228,
"step": 4055
},
{
"epoch": 0.2735756881506688,
"grad_norm": 6.263801274705381,
"learning_rate": 9.109876418790731e-07,
"loss": 1.8632,
"step": 4060
},
{
"epoch": 0.2739126040227755,
"grad_norm": 6.661484279149622,
"learning_rate": 9.106524502242004e-07,
"loss": 1.8669,
"step": 4065
},
{
"epoch": 0.2742495198948822,
"grad_norm": 5.738641723072569,
"learning_rate": 9.103166905523699e-07,
"loss": 1.8212,
"step": 4070
},
{
"epoch": 0.274586435766989,
"grad_norm": 6.551069538879794,
"learning_rate": 9.099803633280059e-07,
"loss": 1.9684,
"step": 4075
},
{
"epoch": 0.2749233516390957,
"grad_norm": 6.532119039589357,
"learning_rate": 9.096434690163184e-07,
"loss": 1.8796,
"step": 4080
},
{
"epoch": 0.27526026751120247,
"grad_norm": 5.881963986981689,
"learning_rate": 9.093060080833019e-07,
"loss": 1.8195,
"step": 4085
},
{
"epoch": 0.2755971833833092,
"grad_norm": 6.412172720197176,
"learning_rate": 9.089679809957343e-07,
"loss": 1.8372,
"step": 4090
},
{
"epoch": 0.2759340992554159,
"grad_norm": 7.471271072846122,
"learning_rate": 9.086293882211768e-07,
"loss": 1.9177,
"step": 4095
},
{
"epoch": 0.27627101512752267,
"grad_norm": 6.198988270277827,
"learning_rate": 9.082902302279726e-07,
"loss": 1.8632,
"step": 4100
},
{
"epoch": 0.2766079309996294,
"grad_norm": 5.767940298563196,
"learning_rate": 9.079505074852476e-07,
"loss": 1.8877,
"step": 4105
},
{
"epoch": 0.27694484687173615,
"grad_norm": 6.230279674826198,
"learning_rate": 9.076102204629082e-07,
"loss": 1.8431,
"step": 4110
},
{
"epoch": 0.27728176274384286,
"grad_norm": 5.949567529985799,
"learning_rate": 9.072693696316411e-07,
"loss": 1.8633,
"step": 4115
},
{
"epoch": 0.2776186786159496,
"grad_norm": 5.928871922518486,
"learning_rate": 9.069279554629137e-07,
"loss": 1.8668,
"step": 4120
},
{
"epoch": 0.27795559448805635,
"grad_norm": 6.081834973234626,
"learning_rate": 9.06585978428972e-07,
"loss": 1.8543,
"step": 4125
},
{
"epoch": 0.27829251036016306,
"grad_norm": 6.142536394883528,
"learning_rate": 9.062434390028407e-07,
"loss": 1.8877,
"step": 4130
},
{
"epoch": 0.27862942623226983,
"grad_norm": 5.741508220751473,
"learning_rate": 9.059003376583223e-07,
"loss": 1.877,
"step": 4135
},
{
"epoch": 0.27896634210437654,
"grad_norm": 5.705602714030965,
"learning_rate": 9.055566748699968e-07,
"loss": 1.9019,
"step": 4140
},
{
"epoch": 0.27930325797648325,
"grad_norm": 5.940713375585646,
"learning_rate": 9.052124511132204e-07,
"loss": 1.8499,
"step": 4145
},
{
"epoch": 0.27964017384859,
"grad_norm": 5.985593133684585,
"learning_rate": 9.04867666864126e-07,
"loss": 1.8314,
"step": 4150
},
{
"epoch": 0.27997708972069674,
"grad_norm": 7.084259902044295,
"learning_rate": 9.045223225996207e-07,
"loss": 1.8601,
"step": 4155
},
{
"epoch": 0.28031400559280345,
"grad_norm": 6.259469749054458,
"learning_rate": 9.041764187973871e-07,
"loss": 1.8467,
"step": 4160
},
{
"epoch": 0.2806509214649102,
"grad_norm": 6.211124902310236,
"learning_rate": 9.038299559358815e-07,
"loss": 1.8573,
"step": 4165
},
{
"epoch": 0.28098783733701693,
"grad_norm": 6.0058531329183875,
"learning_rate": 9.034829344943331e-07,
"loss": 1.8623,
"step": 4170
},
{
"epoch": 0.2813247532091237,
"grad_norm": 6.136862011053358,
"learning_rate": 9.031353549527444e-07,
"loss": 1.8878,
"step": 4175
},
{
"epoch": 0.2816616690812304,
"grad_norm": 6.167062756427111,
"learning_rate": 9.027872177918894e-07,
"loss": 1.8131,
"step": 4180
},
{
"epoch": 0.28199858495333713,
"grad_norm": 5.533797632844438,
"learning_rate": 9.024385234933134e-07,
"loss": 1.9209,
"step": 4185
},
{
"epoch": 0.2823355008254439,
"grad_norm": 5.695979000250038,
"learning_rate": 9.020892725393326e-07,
"loss": 1.807,
"step": 4190
},
{
"epoch": 0.2826724166975506,
"grad_norm": 6.377546908781681,
"learning_rate": 9.017394654130332e-07,
"loss": 1.8577,
"step": 4195
},
{
"epoch": 0.2830093325696574,
"grad_norm": 5.756307597482796,
"learning_rate": 9.013891025982703e-07,
"loss": 1.8913,
"step": 4200
},
{
"epoch": 0.2833462484417641,
"grad_norm": 6.166064092582664,
"learning_rate": 9.010381845796677e-07,
"loss": 1.823,
"step": 4205
},
{
"epoch": 0.2836831643138708,
"grad_norm": 6.488806989342812,
"learning_rate": 9.006867118426178e-07,
"loss": 1.8323,
"step": 4210
},
{
"epoch": 0.2840200801859776,
"grad_norm": 6.27973058379338,
"learning_rate": 9.003346848732793e-07,
"loss": 1.8536,
"step": 4215
},
{
"epoch": 0.2843569960580843,
"grad_norm": 5.344380427344846,
"learning_rate": 8.999821041585787e-07,
"loss": 1.8111,
"step": 4220
},
{
"epoch": 0.28469391193019106,
"grad_norm": 6.075842058898948,
"learning_rate": 8.996289701862072e-07,
"loss": 1.9303,
"step": 4225
},
{
"epoch": 0.28503082780229777,
"grad_norm": 6.265878132390271,
"learning_rate": 8.99275283444622e-07,
"loss": 1.8807,
"step": 4230
},
{
"epoch": 0.2853677436744045,
"grad_norm": 5.7219422084040055,
"learning_rate": 8.989210444230449e-07,
"loss": 1.8753,
"step": 4235
},
{
"epoch": 0.28570465954651125,
"grad_norm": 6.215050142707896,
"learning_rate": 8.985662536114612e-07,
"loss": 1.884,
"step": 4240
},
{
"epoch": 0.28604157541861797,
"grad_norm": 6.110998708733845,
"learning_rate": 8.9821091150062e-07,
"loss": 1.8409,
"step": 4245
},
{
"epoch": 0.2863784912907247,
"grad_norm": 6.257750293856616,
"learning_rate": 8.978550185820323e-07,
"loss": 1.8368,
"step": 4250
},
{
"epoch": 0.28671540716283145,
"grad_norm": 6.054417802124323,
"learning_rate": 8.974985753479718e-07,
"loss": 1.8733,
"step": 4255
},
{
"epoch": 0.28705232303493816,
"grad_norm": 6.043179751280904,
"learning_rate": 8.971415822914726e-07,
"loss": 1.8588,
"step": 4260
},
{
"epoch": 0.28738923890704493,
"grad_norm": 6.14176472404905,
"learning_rate": 8.967840399063298e-07,
"loss": 1.8271,
"step": 4265
},
{
"epoch": 0.28772615477915164,
"grad_norm": 6.174678032249467,
"learning_rate": 8.964259486870982e-07,
"loss": 1.8321,
"step": 4270
},
{
"epoch": 0.28806307065125836,
"grad_norm": 5.8832790823420975,
"learning_rate": 8.960673091290916e-07,
"loss": 1.8336,
"step": 4275
},
{
"epoch": 0.2883999865233651,
"grad_norm": 6.488845467073408,
"learning_rate": 8.957081217283825e-07,
"loss": 1.8753,
"step": 4280
},
{
"epoch": 0.28873690239547184,
"grad_norm": 5.749007187923349,
"learning_rate": 8.953483869818013e-07,
"loss": 1.8418,
"step": 4285
},
{
"epoch": 0.2890738182675786,
"grad_norm": 6.049504947740589,
"learning_rate": 8.949881053869348e-07,
"loss": 1.8525,
"step": 4290
},
{
"epoch": 0.2894107341396853,
"grad_norm": 6.396690173802817,
"learning_rate": 8.946272774421271e-07,
"loss": 1.8286,
"step": 4295
},
{
"epoch": 0.28974765001179204,
"grad_norm": 6.108405259192714,
"learning_rate": 8.942659036464775e-07,
"loss": 1.859,
"step": 4300
},
{
"epoch": 0.2900845658838988,
"grad_norm": 5.98212203777026,
"learning_rate": 8.939039844998403e-07,
"loss": 1.869,
"step": 4305
},
{
"epoch": 0.2904214817560055,
"grad_norm": 5.941860278201532,
"learning_rate": 8.935415205028243e-07,
"loss": 1.8408,
"step": 4310
},
{
"epoch": 0.2907583976281123,
"grad_norm": 5.72615071195908,
"learning_rate": 8.931785121567921e-07,
"loss": 1.8055,
"step": 4315
},
{
"epoch": 0.291095313500219,
"grad_norm": 6.117559850305973,
"learning_rate": 8.928149599638588e-07,
"loss": 1.8843,
"step": 4320
},
{
"epoch": 0.2914322293723257,
"grad_norm": 6.450369657828759,
"learning_rate": 8.924508644268921e-07,
"loss": 1.9653,
"step": 4325
},
{
"epoch": 0.2917691452444325,
"grad_norm": 5.856381777788301,
"learning_rate": 8.920862260495111e-07,
"loss": 1.8706,
"step": 4330
},
{
"epoch": 0.2921060611165392,
"grad_norm": 6.354337305203465,
"learning_rate": 8.917210453360859e-07,
"loss": 1.8893,
"step": 4335
},
{
"epoch": 0.2924429769886459,
"grad_norm": 6.906110311477241,
"learning_rate": 8.913553227917365e-07,
"loss": 1.806,
"step": 4340
},
{
"epoch": 0.2927798928607527,
"grad_norm": 6.179040612378133,
"learning_rate": 8.909890589223329e-07,
"loss": 1.86,
"step": 4345
},
{
"epoch": 0.2931168087328594,
"grad_norm": 5.978516176145858,
"learning_rate": 8.906222542344932e-07,
"loss": 1.8849,
"step": 4350
},
{
"epoch": 0.29345372460496616,
"grad_norm": 6.002208675843779,
"learning_rate": 8.902549092355839e-07,
"loss": 1.8798,
"step": 4355
},
{
"epoch": 0.2937906404770729,
"grad_norm": 6.2186044385766195,
"learning_rate": 8.898870244337189e-07,
"loss": 1.9232,
"step": 4360
},
{
"epoch": 0.2941275563491796,
"grad_norm": 6.153726869453081,
"learning_rate": 8.895186003377586e-07,
"loss": 1.8648,
"step": 4365
},
{
"epoch": 0.29446447222128636,
"grad_norm": 6.919968948046176,
"learning_rate": 8.891496374573095e-07,
"loss": 1.9309,
"step": 4370
},
{
"epoch": 0.29480138809339307,
"grad_norm": 6.058343737353778,
"learning_rate": 8.887801363027233e-07,
"loss": 1.8841,
"step": 4375
},
{
"epoch": 0.29513830396549984,
"grad_norm": 5.937731974461376,
"learning_rate": 8.884100973850962e-07,
"loss": 1.9063,
"step": 4380
},
{
"epoch": 0.29547521983760655,
"grad_norm": 5.801901159807343,
"learning_rate": 8.880395212162684e-07,
"loss": 1.8964,
"step": 4385
},
{
"epoch": 0.29581213570971326,
"grad_norm": 6.297331913420712,
"learning_rate": 8.87668408308823e-07,
"loss": 1.7967,
"step": 4390
},
{
"epoch": 0.29614905158182003,
"grad_norm": 5.897350111858126,
"learning_rate": 8.872967591760856e-07,
"loss": 1.8394,
"step": 4395
},
{
"epoch": 0.29648596745392675,
"grad_norm": 5.817511472756013,
"learning_rate": 8.869245743321234e-07,
"loss": 1.8911,
"step": 4400
},
{
"epoch": 0.2968228833260335,
"grad_norm": 6.1323690508053215,
"learning_rate": 8.865518542917452e-07,
"loss": 1.782,
"step": 4405
},
{
"epoch": 0.29715979919814023,
"grad_norm": 5.936037133542599,
"learning_rate": 8.861785995704991e-07,
"loss": 1.8871,
"step": 4410
},
{
"epoch": 0.29749671507024694,
"grad_norm": 5.782678112860589,
"learning_rate": 8.858048106846735e-07,
"loss": 1.8751,
"step": 4415
},
{
"epoch": 0.2978336309423537,
"grad_norm": 6.143794616391879,
"learning_rate": 8.854304881512955e-07,
"loss": 1.8205,
"step": 4420
},
{
"epoch": 0.2981705468144604,
"grad_norm": 6.267922935330008,
"learning_rate": 8.850556324881302e-07,
"loss": 1.9108,
"step": 4425
},
{
"epoch": 0.29850746268656714,
"grad_norm": 5.700982678029616,
"learning_rate": 8.846802442136804e-07,
"loss": 1.8323,
"step": 4430
},
{
"epoch": 0.2988443785586739,
"grad_norm": 5.875620333234486,
"learning_rate": 8.843043238471853e-07,
"loss": 1.8779,
"step": 4435
},
{
"epoch": 0.2991812944307806,
"grad_norm": 6.514010796547643,
"learning_rate": 8.839278719086201e-07,
"loss": 1.8766,
"step": 4440
},
{
"epoch": 0.2995182103028874,
"grad_norm": 6.078146151760929,
"learning_rate": 8.835508889186956e-07,
"loss": 1.8954,
"step": 4445
},
{
"epoch": 0.2998551261749941,
"grad_norm": 6.811365534277628,
"learning_rate": 8.83173375398857e-07,
"loss": 1.8825,
"step": 4450
},
{
"epoch": 0.3001920420471008,
"grad_norm": 5.954896691271629,
"learning_rate": 8.827953318712831e-07,
"loss": 1.8576,
"step": 4455
},
{
"epoch": 0.3005289579192076,
"grad_norm": 5.68557129820593,
"learning_rate": 8.824167588588861e-07,
"loss": 1.8302,
"step": 4460
},
{
"epoch": 0.3008658737913143,
"grad_norm": 6.256891159580008,
"learning_rate": 8.820376568853105e-07,
"loss": 1.8263,
"step": 4465
},
{
"epoch": 0.30120278966342107,
"grad_norm": 5.930780965618538,
"learning_rate": 8.816580264749325e-07,
"loss": 1.8577,
"step": 4470
},
{
"epoch": 0.3015397055355278,
"grad_norm": 5.597980182580963,
"learning_rate": 8.81277868152859e-07,
"loss": 1.874,
"step": 4475
},
{
"epoch": 0.3018766214076345,
"grad_norm": 6.170497154326832,
"learning_rate": 8.808971824449274e-07,
"loss": 1.8562,
"step": 4480
},
{
"epoch": 0.30221353727974126,
"grad_norm": 5.543906347986211,
"learning_rate": 8.805159698777045e-07,
"loss": 1.8477,
"step": 4485
},
{
"epoch": 0.302550453151848,
"grad_norm": 5.950757055714713,
"learning_rate": 8.801342309784858e-07,
"loss": 1.8363,
"step": 4490
},
{
"epoch": 0.30288736902395474,
"grad_norm": 5.555554802786965,
"learning_rate": 8.79751966275295e-07,
"loss": 1.7955,
"step": 4495
},
{
"epoch": 0.30322428489606146,
"grad_norm": 6.0842683476071695,
"learning_rate": 8.793691762968827e-07,
"loss": 1.838,
"step": 4500
},
{
"epoch": 0.30356120076816817,
"grad_norm": 6.3854828636373755,
"learning_rate": 8.789858615727264e-07,
"loss": 1.8526,
"step": 4505
},
{
"epoch": 0.30389811664027494,
"grad_norm": 5.849310768859681,
"learning_rate": 8.786020226330295e-07,
"loss": 1.8327,
"step": 4510
},
{
"epoch": 0.30423503251238165,
"grad_norm": 5.90106872765922,
"learning_rate": 8.782176600087203e-07,
"loss": 1.7905,
"step": 4515
},
{
"epoch": 0.30457194838448837,
"grad_norm": 5.901668679720049,
"learning_rate": 8.778327742314513e-07,
"loss": 1.8567,
"step": 4520
},
{
"epoch": 0.30490886425659514,
"grad_norm": 5.8797100239213655,
"learning_rate": 8.77447365833599e-07,
"loss": 1.8321,
"step": 4525
},
{
"epoch": 0.30524578012870185,
"grad_norm": 6.2990354182925135,
"learning_rate": 8.770614353482628e-07,
"loss": 1.8714,
"step": 4530
},
{
"epoch": 0.3055826960008086,
"grad_norm": 5.873265394004258,
"learning_rate": 8.766749833092638e-07,
"loss": 1.9256,
"step": 4535
},
{
"epoch": 0.30591961187291533,
"grad_norm": 5.880033630724263,
"learning_rate": 8.76288010251145e-07,
"loss": 1.8538,
"step": 4540
},
{
"epoch": 0.30625652774502204,
"grad_norm": 6.21041901452895,
"learning_rate": 8.759005167091697e-07,
"loss": 1.9027,
"step": 4545
},
{
"epoch": 0.3065934436171288,
"grad_norm": 6.363251755485691,
"learning_rate": 8.755125032193214e-07,
"loss": 1.788,
"step": 4550
},
{
"epoch": 0.3069303594892355,
"grad_norm": 6.23031482525298,
"learning_rate": 8.751239703183026e-07,
"loss": 1.8438,
"step": 4555
},
{
"epoch": 0.3072672753613423,
"grad_norm": 6.133022983823034,
"learning_rate": 8.747349185435348e-07,
"loss": 1.8369,
"step": 4560
},
{
"epoch": 0.307604191233449,
"grad_norm": 6.285822907751822,
"learning_rate": 8.743453484331562e-07,
"loss": 1.9024,
"step": 4565
},
{
"epoch": 0.3079411071055557,
"grad_norm": 5.835690715657944,
"learning_rate": 8.73955260526023e-07,
"loss": 1.8256,
"step": 4570
},
{
"epoch": 0.3082780229776625,
"grad_norm": 5.847371813030593,
"learning_rate": 8.735646553617069e-07,
"loss": 1.9136,
"step": 4575
},
{
"epoch": 0.3086149388497692,
"grad_norm": 6.043595537844784,
"learning_rate": 8.731735334804953e-07,
"loss": 1.8674,
"step": 4580
},
{
"epoch": 0.308951854721876,
"grad_norm": 5.956889810584438,
"learning_rate": 8.727818954233904e-07,
"loss": 1.8581,
"step": 4585
},
{
"epoch": 0.3092887705939827,
"grad_norm": 6.324696749321057,
"learning_rate": 8.723897417321084e-07,
"loss": 1.8447,
"step": 4590
},
{
"epoch": 0.3096256864660894,
"grad_norm": 6.276116542740643,
"learning_rate": 8.719970729490788e-07,
"loss": 1.873,
"step": 4595
},
{
"epoch": 0.30996260233819617,
"grad_norm": 6.411851214725662,
"learning_rate": 8.716038896174432e-07,
"loss": 1.8802,
"step": 4600
},
{
"epoch": 0.3102995182103029,
"grad_norm": 6.138820411276758,
"learning_rate": 8.712101922810551e-07,
"loss": 1.8773,
"step": 4605
},
{
"epoch": 0.3106364340824096,
"grad_norm": 5.872851484478736,
"learning_rate": 8.708159814844793e-07,
"loss": 1.8485,
"step": 4610
},
{
"epoch": 0.31097334995451636,
"grad_norm": 6.031467898907255,
"learning_rate": 8.704212577729905e-07,
"loss": 1.897,
"step": 4615
},
{
"epoch": 0.3113102658266231,
"grad_norm": 5.9956951123885345,
"learning_rate": 8.700260216925728e-07,
"loss": 1.8205,
"step": 4620
},
{
"epoch": 0.31164718169872985,
"grad_norm": 6.056845802633199,
"learning_rate": 8.696302737899192e-07,
"loss": 1.8795,
"step": 4625
},
{
"epoch": 0.31198409757083656,
"grad_norm": 6.221681582297336,
"learning_rate": 8.692340146124308e-07,
"loss": 1.8269,
"step": 4630
},
{
"epoch": 0.3123210134429433,
"grad_norm": 6.553618267443545,
"learning_rate": 8.688372447082153e-07,
"loss": 1.8764,
"step": 4635
},
{
"epoch": 0.31265792931505004,
"grad_norm": 6.213539651433188,
"learning_rate": 8.684399646260876e-07,
"loss": 1.8413,
"step": 4640
},
{
"epoch": 0.31299484518715676,
"grad_norm": 6.257873473467152,
"learning_rate": 8.680421749155677e-07,
"loss": 1.8354,
"step": 4645
},
{
"epoch": 0.3133317610592635,
"grad_norm": 6.354806527386589,
"learning_rate": 8.676438761268808e-07,
"loss": 1.9039,
"step": 4650
},
{
"epoch": 0.31366867693137024,
"grad_norm": 6.240255748070445,
"learning_rate": 8.672450688109563e-07,
"loss": 1.9127,
"step": 4655
},
{
"epoch": 0.31400559280347695,
"grad_norm": 6.274554512254039,
"learning_rate": 8.668457535194267e-07,
"loss": 1.9452,
"step": 4660
},
{
"epoch": 0.3143425086755837,
"grad_norm": 5.680213033367091,
"learning_rate": 8.664459308046274e-07,
"loss": 1.8823,
"step": 4665
},
{
"epoch": 0.31467942454769043,
"grad_norm": 6.026744567200448,
"learning_rate": 8.660456012195957e-07,
"loss": 1.8442,
"step": 4670
},
{
"epoch": 0.3150163404197972,
"grad_norm": 6.031466170099904,
"learning_rate": 8.656447653180699e-07,
"loss": 1.8205,
"step": 4675
},
{
"epoch": 0.3153532562919039,
"grad_norm": 6.34630037939122,
"learning_rate": 8.652434236544886e-07,
"loss": 1.8885,
"step": 4680
},
{
"epoch": 0.31569017216401063,
"grad_norm": 6.0108468232784,
"learning_rate": 8.648415767839899e-07,
"loss": 1.8477,
"step": 4685
},
{
"epoch": 0.3160270880361174,
"grad_norm": 6.056101862232991,
"learning_rate": 8.644392252624108e-07,
"loss": 1.8009,
"step": 4690
},
{
"epoch": 0.3163640039082241,
"grad_norm": 5.549149326299132,
"learning_rate": 8.640363696462869e-07,
"loss": 1.8893,
"step": 4695
},
{
"epoch": 0.3167009197803308,
"grad_norm": 5.748085710916815,
"learning_rate": 8.636330104928499e-07,
"loss": 1.8659,
"step": 4700
},
{
"epoch": 0.3170378356524376,
"grad_norm": 6.615576133533302,
"learning_rate": 8.632291483600289e-07,
"loss": 1.8309,
"step": 4705
},
{
"epoch": 0.3173747515245443,
"grad_norm": 6.1506018149830055,
"learning_rate": 8.628247838064485e-07,
"loss": 1.8303,
"step": 4710
},
{
"epoch": 0.3177116673966511,
"grad_norm": 6.332160685216612,
"learning_rate": 8.624199173914279e-07,
"loss": 1.9227,
"step": 4715
},
{
"epoch": 0.3180485832687578,
"grad_norm": 6.259268021975856,
"learning_rate": 8.620145496749811e-07,
"loss": 1.8573,
"step": 4720
},
{
"epoch": 0.3183854991408645,
"grad_norm": 6.438165923545349,
"learning_rate": 8.616086812178151e-07,
"loss": 1.8553,
"step": 4725
},
{
"epoch": 0.31872241501297127,
"grad_norm": 5.938254916301874,
"learning_rate": 8.612023125813296e-07,
"loss": 1.8766,
"step": 4730
},
{
"epoch": 0.319059330885078,
"grad_norm": 5.6026652144423,
"learning_rate": 8.607954443276162e-07,
"loss": 1.9063,
"step": 4735
},
{
"epoch": 0.31939624675718475,
"grad_norm": 5.860335890704336,
"learning_rate": 8.603880770194574e-07,
"loss": 1.8725,
"step": 4740
},
{
"epoch": 0.31973316262929147,
"grad_norm": 5.920443790572611,
"learning_rate": 8.59980211220326e-07,
"loss": 1.8371,
"step": 4745
},
{
"epoch": 0.3200700785013982,
"grad_norm": 6.175976373732354,
"learning_rate": 8.595718474943849e-07,
"loss": 1.8342,
"step": 4750
},
{
"epoch": 0.32040699437350495,
"grad_norm": 6.303028162479854,
"learning_rate": 8.591629864064851e-07,
"loss": 1.8751,
"step": 4755
},
{
"epoch": 0.32074391024561166,
"grad_norm": 6.430617243948466,
"learning_rate": 8.587536285221655e-07,
"loss": 1.9291,
"step": 4760
},
{
"epoch": 0.32108082611771843,
"grad_norm": 6.1534051447562055,
"learning_rate": 8.583437744076527e-07,
"loss": 1.8666,
"step": 4765
},
{
"epoch": 0.32141774198982515,
"grad_norm": 6.535863631168725,
"learning_rate": 8.579334246298592e-07,
"loss": 1.8779,
"step": 4770
},
{
"epoch": 0.32175465786193186,
"grad_norm": 5.71107753763816,
"learning_rate": 8.575225797563834e-07,
"loss": 1.911,
"step": 4775
},
{
"epoch": 0.3220915737340386,
"grad_norm": 5.886190655326756,
"learning_rate": 8.571112403555083e-07,
"loss": 1.851,
"step": 4780
},
{
"epoch": 0.32242848960614534,
"grad_norm": 6.095741702151825,
"learning_rate": 8.566994069962012e-07,
"loss": 1.8704,
"step": 4785
},
{
"epoch": 0.32276540547825205,
"grad_norm": 6.02265285150199,
"learning_rate": 8.562870802481126e-07,
"loss": 1.8763,
"step": 4790
},
{
"epoch": 0.3231023213503588,
"grad_norm": 6.5312090704657795,
"learning_rate": 8.55874260681575e-07,
"loss": 1.8484,
"step": 4795
},
{
"epoch": 0.32343923722246554,
"grad_norm": 6.285132315448252,
"learning_rate": 8.554609488676032e-07,
"loss": 1.8557,
"step": 4800
},
{
"epoch": 0.3237761530945723,
"grad_norm": 6.0685785815626705,
"learning_rate": 8.550471453778925e-07,
"loss": 1.8416,
"step": 4805
},
{
"epoch": 0.324113068966679,
"grad_norm": 6.044458918895983,
"learning_rate": 8.546328507848184e-07,
"loss": 1.8412,
"step": 4810
},
{
"epoch": 0.32444998483878573,
"grad_norm": 5.8373873569240855,
"learning_rate": 8.542180656614358e-07,
"loss": 1.7964,
"step": 4815
},
{
"epoch": 0.3247869007108925,
"grad_norm": 6.1204039997103035,
"learning_rate": 8.538027905814778e-07,
"loss": 1.8239,
"step": 4820
},
{
"epoch": 0.3251238165829992,
"grad_norm": 5.906274807482711,
"learning_rate": 8.533870261193556e-07,
"loss": 1.7694,
"step": 4825
},
{
"epoch": 0.325460732455106,
"grad_norm": 6.21208348038065,
"learning_rate": 8.529707728501571e-07,
"loss": 1.8148,
"step": 4830
},
{
"epoch": 0.3257976483272127,
"grad_norm": 6.014309137015161,
"learning_rate": 8.525540313496462e-07,
"loss": 1.836,
"step": 4835
},
{
"epoch": 0.3261345641993194,
"grad_norm": 6.0431816763200805,
"learning_rate": 8.521368021942623e-07,
"loss": 1.9126,
"step": 4840
},
{
"epoch": 0.3264714800714262,
"grad_norm": 6.376773643711095,
"learning_rate": 8.517190859611195e-07,
"loss": 1.806,
"step": 4845
},
{
"epoch": 0.3268083959435329,
"grad_norm": 6.863001260214309,
"learning_rate": 8.513008832280053e-07,
"loss": 1.9519,
"step": 4850
},
{
"epoch": 0.32714531181563966,
"grad_norm": 6.75341977852346,
"learning_rate": 8.508821945733802e-07,
"loss": 1.8704,
"step": 4855
},
{
"epoch": 0.3274822276877464,
"grad_norm": 6.397518156192358,
"learning_rate": 8.504630205763768e-07,
"loss": 1.8695,
"step": 4860
},
{
"epoch": 0.3278191435598531,
"grad_norm": 6.182422350969933,
"learning_rate": 8.500433618167992e-07,
"loss": 1.8985,
"step": 4865
},
{
"epoch": 0.32815605943195986,
"grad_norm": 6.1420508752244265,
"learning_rate": 8.496232188751222e-07,
"loss": 1.8443,
"step": 4870
},
{
"epoch": 0.32849297530406657,
"grad_norm": 5.590831241686126,
"learning_rate": 8.492025923324897e-07,
"loss": 1.8332,
"step": 4875
},
{
"epoch": 0.3288298911761733,
"grad_norm": 6.260067475048003,
"learning_rate": 8.487814827707152e-07,
"loss": 1.8677,
"step": 4880
},
{
"epoch": 0.32916680704828005,
"grad_norm": 6.460645221149728,
"learning_rate": 8.483598907722795e-07,
"loss": 1.8868,
"step": 4885
},
{
"epoch": 0.32950372292038677,
"grad_norm": 5.771598644199668,
"learning_rate": 8.479378169203317e-07,
"loss": 1.7935,
"step": 4890
},
{
"epoch": 0.32984063879249353,
"grad_norm": 6.205757641829904,
"learning_rate": 8.475152617986869e-07,
"loss": 1.8203,
"step": 4895
},
{
"epoch": 0.33017755466460025,
"grad_norm": 6.0547319795697,
"learning_rate": 8.470922259918254e-07,
"loss": 1.815,
"step": 4900
},
{
"epoch": 0.33051447053670696,
"grad_norm": 5.81444773035736,
"learning_rate": 8.466687100848935e-07,
"loss": 1.8813,
"step": 4905
},
{
"epoch": 0.33085138640881373,
"grad_norm": 6.352411344793242,
"learning_rate": 8.462447146637006e-07,
"loss": 1.8406,
"step": 4910
},
{
"epoch": 0.33118830228092044,
"grad_norm": 5.769658940167598,
"learning_rate": 8.458202403147199e-07,
"loss": 1.8585,
"step": 4915
},
{
"epoch": 0.3315252181530272,
"grad_norm": 6.334129261483883,
"learning_rate": 8.453952876250867e-07,
"loss": 1.8119,
"step": 4920
},
{
"epoch": 0.3318621340251339,
"grad_norm": 6.3371647076546145,
"learning_rate": 8.449698571825984e-07,
"loss": 1.8941,
"step": 4925
},
{
"epoch": 0.33219904989724064,
"grad_norm": 6.332880954598852,
"learning_rate": 8.445439495757127e-07,
"loss": 1.8444,
"step": 4930
},
{
"epoch": 0.3325359657693474,
"grad_norm": 6.026022147996478,
"learning_rate": 8.44117565393548e-07,
"loss": 1.8098,
"step": 4935
},
{
"epoch": 0.3328728816414541,
"grad_norm": 6.068938398940136,
"learning_rate": 8.436907052258808e-07,
"loss": 1.896,
"step": 4940
},
{
"epoch": 0.3332097975135609,
"grad_norm": 6.273354421417618,
"learning_rate": 8.432633696631473e-07,
"loss": 1.9326,
"step": 4945
},
{
"epoch": 0.3335467133856676,
"grad_norm": 5.900730766734711,
"learning_rate": 8.428355592964405e-07,
"loss": 1.8764,
"step": 4950
},
{
"epoch": 0.3338836292577743,
"grad_norm": 5.742195611900634,
"learning_rate": 8.424072747175102e-07,
"loss": 1.8853,
"step": 4955
},
{
"epoch": 0.3342205451298811,
"grad_norm": 5.65151284794071,
"learning_rate": 8.419785165187621e-07,
"loss": 1.762,
"step": 4960
},
{
"epoch": 0.3345574610019878,
"grad_norm": 6.1205570132491145,
"learning_rate": 8.415492852932573e-07,
"loss": 1.8033,
"step": 4965
},
{
"epoch": 0.3348943768740945,
"grad_norm": 6.316447919486459,
"learning_rate": 8.41119581634711e-07,
"loss": 1.9436,
"step": 4970
},
{
"epoch": 0.3352312927462013,
"grad_norm": 5.923608163568519,
"learning_rate": 8.406894061374918e-07,
"loss": 1.9223,
"step": 4975
},
{
"epoch": 0.335568208618308,
"grad_norm": 6.211087505657584,
"learning_rate": 8.402587593966213e-07,
"loss": 1.8231,
"step": 4980
},
{
"epoch": 0.33590512449041476,
"grad_norm": 5.960407764689357,
"learning_rate": 8.398276420077726e-07,
"loss": 1.8805,
"step": 4985
},
{
"epoch": 0.3362420403625215,
"grad_norm": 6.353999152530212,
"learning_rate": 8.393960545672698e-07,
"loss": 1.8367,
"step": 4990
},
{
"epoch": 0.3365789562346282,
"grad_norm": 6.102007260398823,
"learning_rate": 8.389639976720873e-07,
"loss": 1.8398,
"step": 4995
},
{
"epoch": 0.33691587210673496,
"grad_norm": 6.530971846468027,
"learning_rate": 8.385314719198487e-07,
"loss": 1.8589,
"step": 5000
},
{
"epoch": 0.3372527879788417,
"grad_norm": 5.855794124939447,
"learning_rate": 8.380984779088264e-07,
"loss": 1.8611,
"step": 5005
},
{
"epoch": 0.33758970385094844,
"grad_norm": 6.670146157816599,
"learning_rate": 8.376650162379404e-07,
"loss": 1.8852,
"step": 5010
},
{
"epoch": 0.33792661972305515,
"grad_norm": 5.686028080563687,
"learning_rate": 8.372310875067572e-07,
"loss": 1.8913,
"step": 5015
},
{
"epoch": 0.33826353559516187,
"grad_norm": 6.097937051428109,
"learning_rate": 8.367966923154899e-07,
"loss": 1.7633,
"step": 5020
},
{
"epoch": 0.33860045146726864,
"grad_norm": 5.9742644322023075,
"learning_rate": 8.363618312649967e-07,
"loss": 1.8452,
"step": 5025
},
{
"epoch": 0.33893736733937535,
"grad_norm": 6.181641201859419,
"learning_rate": 8.359265049567796e-07,
"loss": 1.8412,
"step": 5030
},
{
"epoch": 0.3392742832114821,
"grad_norm": 5.2367742865731985,
"learning_rate": 8.35490713992985e-07,
"loss": 1.861,
"step": 5035
},
{
"epoch": 0.33961119908358883,
"grad_norm": 6.292690765399438,
"learning_rate": 8.350544589764015e-07,
"loss": 1.8772,
"step": 5040
},
{
"epoch": 0.33994811495569555,
"grad_norm": 6.051654185566775,
"learning_rate": 8.346177405104595e-07,
"loss": 1.8692,
"step": 5045
},
{
"epoch": 0.3402850308278023,
"grad_norm": 6.310395089126805,
"learning_rate": 8.341805591992308e-07,
"loss": 1.8857,
"step": 5050
},
{
"epoch": 0.34062194669990903,
"grad_norm": 5.8038677244137515,
"learning_rate": 8.337429156474272e-07,
"loss": 1.7971,
"step": 5055
},
{
"epoch": 0.34095886257201574,
"grad_norm": 5.620673928318591,
"learning_rate": 8.333048104603999e-07,
"loss": 1.7906,
"step": 5060
},
{
"epoch": 0.3412957784441225,
"grad_norm": 5.925283173151023,
"learning_rate": 8.328662442441388e-07,
"loss": 1.7609,
"step": 5065
},
{
"epoch": 0.3416326943162292,
"grad_norm": 5.5148787672747135,
"learning_rate": 8.32427217605271e-07,
"loss": 1.8625,
"step": 5070
},
{
"epoch": 0.341969610188336,
"grad_norm": 5.889799319547815,
"learning_rate": 8.319877311510612e-07,
"loss": 1.8872,
"step": 5075
},
{
"epoch": 0.3423065260604427,
"grad_norm": 5.9730991305417644,
"learning_rate": 8.315477854894095e-07,
"loss": 1.8302,
"step": 5080
},
{
"epoch": 0.3426434419325494,
"grad_norm": 6.112512298326469,
"learning_rate": 8.311073812288513e-07,
"loss": 1.8695,
"step": 5085
},
{
"epoch": 0.3429803578046562,
"grad_norm": 5.960158481758416,
"learning_rate": 8.306665189785567e-07,
"loss": 1.8504,
"step": 5090
},
{
"epoch": 0.3433172736767629,
"grad_norm": 5.762403166494906,
"learning_rate": 8.302251993483289e-07,
"loss": 1.8598,
"step": 5095
},
{
"epoch": 0.34365418954886967,
"grad_norm": 6.407712181083279,
"learning_rate": 8.297834229486039e-07,
"loss": 1.8336,
"step": 5100
},
{
"epoch": 0.3439911054209764,
"grad_norm": 5.83772252154374,
"learning_rate": 8.293411903904496e-07,
"loss": 1.8033,
"step": 5105
},
{
"epoch": 0.3443280212930831,
"grad_norm": 5.833573633058513,
"learning_rate": 8.288985022855645e-07,
"loss": 1.9005,
"step": 5110
},
{
"epoch": 0.34466493716518987,
"grad_norm": 6.066836504891038,
"learning_rate": 8.284553592462778e-07,
"loss": 1.8898,
"step": 5115
},
{
"epoch": 0.3450018530372966,
"grad_norm": 5.819693757597866,
"learning_rate": 8.280117618855475e-07,
"loss": 1.892,
"step": 5120
},
{
"epoch": 0.34533876890940335,
"grad_norm": 6.175735031767354,
"learning_rate": 8.2756771081696e-07,
"loss": 1.8891,
"step": 5125
},
{
"epoch": 0.34567568478151006,
"grad_norm": 5.861251644799363,
"learning_rate": 8.271232066547296e-07,
"loss": 1.837,
"step": 5130
},
{
"epoch": 0.3460126006536168,
"grad_norm": 5.4275825974852685,
"learning_rate": 8.266782500136971e-07,
"loss": 1.8856,
"step": 5135
},
{
"epoch": 0.34634951652572354,
"grad_norm": 6.545104428227057,
"learning_rate": 8.262328415093293e-07,
"loss": 1.8737,
"step": 5140
},
{
"epoch": 0.34668643239783026,
"grad_norm": 5.818410117388077,
"learning_rate": 8.257869817577179e-07,
"loss": 1.8721,
"step": 5145
},
{
"epoch": 0.34702334826993697,
"grad_norm": 5.848986584909808,
"learning_rate": 8.253406713755786e-07,
"loss": 1.8334,
"step": 5150
},
{
"epoch": 0.34736026414204374,
"grad_norm": 6.225066432843631,
"learning_rate": 8.24893910980251e-07,
"loss": 1.7952,
"step": 5155
},
{
"epoch": 0.34769718001415045,
"grad_norm": 6.2022839915786525,
"learning_rate": 8.244467011896965e-07,
"loss": 1.7771,
"step": 5160
},
{
"epoch": 0.3480340958862572,
"grad_norm": 5.666990913330156,
"learning_rate": 8.239990426224986e-07,
"loss": 1.8682,
"step": 5165
},
{
"epoch": 0.34837101175836394,
"grad_norm": 5.725148875884774,
"learning_rate": 8.235509358978611e-07,
"loss": 1.8808,
"step": 5170
},
{
"epoch": 0.34870792763047065,
"grad_norm": 6.191906465540604,
"learning_rate": 8.231023816356081e-07,
"loss": 1.808,
"step": 5175
},
{
"epoch": 0.3490448435025774,
"grad_norm": 6.300765434889074,
"learning_rate": 8.226533804561826e-07,
"loss": 1.826,
"step": 5180
},
{
"epoch": 0.34938175937468413,
"grad_norm": 6.120294191430685,
"learning_rate": 8.222039329806456e-07,
"loss": 1.8224,
"step": 5185
},
{
"epoch": 0.3497186752467909,
"grad_norm": 6.035289816758244,
"learning_rate": 8.217540398306757e-07,
"loss": 1.7948,
"step": 5190
},
{
"epoch": 0.3500555911188976,
"grad_norm": 6.165541268890447,
"learning_rate": 8.213037016285679e-07,
"loss": 1.8672,
"step": 5195
},
{
"epoch": 0.3503925069910043,
"grad_norm": 6.244795566784756,
"learning_rate": 8.208529189972325e-07,
"loss": 1.8504,
"step": 5200
},
{
"epoch": 0.3507294228631111,
"grad_norm": 6.429283407205037,
"learning_rate": 8.204016925601951e-07,
"loss": 1.8786,
"step": 5205
},
{
"epoch": 0.3510663387352178,
"grad_norm": 6.505410428263308,
"learning_rate": 8.199500229415945e-07,
"loss": 1.8322,
"step": 5210
},
{
"epoch": 0.3514032546073246,
"grad_norm": 5.931535804172131,
"learning_rate": 8.19497910766183e-07,
"loss": 1.8532,
"step": 5215
},
{
"epoch": 0.3517401704794313,
"grad_norm": 5.8669374305422135,
"learning_rate": 8.19045356659325e-07,
"loss": 1.858,
"step": 5220
},
{
"epoch": 0.352077086351538,
"grad_norm": 6.587711874884489,
"learning_rate": 8.185923612469958e-07,
"loss": 1.814,
"step": 5225
},
{
"epoch": 0.3524140022236448,
"grad_norm": 6.025476451555223,
"learning_rate": 8.181389251557817e-07,
"loss": 1.834,
"step": 5230
},
{
"epoch": 0.3527509180957515,
"grad_norm": 6.144844744356217,
"learning_rate": 8.176850490128782e-07,
"loss": 1.8182,
"step": 5235
},
{
"epoch": 0.3530878339678582,
"grad_norm": 5.977324102576289,
"learning_rate": 8.172307334460892e-07,
"loss": 1.9074,
"step": 5240
},
{
"epoch": 0.35342474983996497,
"grad_norm": 6.500223474064341,
"learning_rate": 8.167759790838273e-07,
"loss": 1.8393,
"step": 5245
},
{
"epoch": 0.3537616657120717,
"grad_norm": 6.276845349036307,
"learning_rate": 8.163207865551111e-07,
"loss": 1.7774,
"step": 5250
},
{
"epoch": 0.35409858158417845,
"grad_norm": 6.341988614555882,
"learning_rate": 8.158651564895657e-07,
"loss": 1.791,
"step": 5255
},
{
"epoch": 0.35443549745628516,
"grad_norm": 6.143402965283996,
"learning_rate": 8.154090895174215e-07,
"loss": 1.8424,
"step": 5260
},
{
"epoch": 0.3547724133283919,
"grad_norm": 6.025218077365625,
"learning_rate": 8.149525862695131e-07,
"loss": 1.8894,
"step": 5265
},
{
"epoch": 0.35510932920049865,
"grad_norm": 5.665571111499496,
"learning_rate": 8.144956473772784e-07,
"loss": 1.8342,
"step": 5270
},
{
"epoch": 0.35544624507260536,
"grad_norm": 6.059993906864897,
"learning_rate": 8.140382734727581e-07,
"loss": 1.8624,
"step": 5275
},
{
"epoch": 0.35578316094471213,
"grad_norm": 5.834225595488122,
"learning_rate": 8.135804651885946e-07,
"loss": 1.8717,
"step": 5280
},
{
"epoch": 0.35612007681681884,
"grad_norm": 6.162555810291783,
"learning_rate": 8.131222231580313e-07,
"loss": 1.814,
"step": 5285
},
{
"epoch": 0.35645699268892556,
"grad_norm": 6.0975790501152485,
"learning_rate": 8.126635480149107e-07,
"loss": 1.8628,
"step": 5290
},
{
"epoch": 0.3567939085610323,
"grad_norm": 6.105019766737801,
"learning_rate": 8.122044403936759e-07,
"loss": 1.8526,
"step": 5295
},
{
"epoch": 0.35713082443313904,
"grad_norm": 6.306625818941117,
"learning_rate": 8.117449009293668e-07,
"loss": 1.8393,
"step": 5300
},
{
"epoch": 0.3574677403052458,
"grad_norm": 6.135991639401845,
"learning_rate": 8.112849302576212e-07,
"loss": 1.845,
"step": 5305
},
{
"epoch": 0.3578046561773525,
"grad_norm": 5.463394231566084,
"learning_rate": 8.108245290146735e-07,
"loss": 1.7506,
"step": 5310
},
{
"epoch": 0.35814157204945923,
"grad_norm": 6.291456002634215,
"learning_rate": 8.103636978373534e-07,
"loss": 1.7982,
"step": 5315
},
{
"epoch": 0.358478487921566,
"grad_norm": 6.004992246013968,
"learning_rate": 8.099024373630854e-07,
"loss": 1.7842,
"step": 5320
},
{
"epoch": 0.3588154037936727,
"grad_norm": 6.023428148718635,
"learning_rate": 8.094407482298877e-07,
"loss": 1.8286,
"step": 5325
},
{
"epoch": 0.35915231966577943,
"grad_norm": 6.107016316566931,
"learning_rate": 8.089786310763716e-07,
"loss": 1.8459,
"step": 5330
},
{
"epoch": 0.3594892355378862,
"grad_norm": 5.944550432690106,
"learning_rate": 8.085160865417403e-07,
"loss": 1.7915,
"step": 5335
},
{
"epoch": 0.3598261514099929,
"grad_norm": 5.8836749674593944,
"learning_rate": 8.080531152657884e-07,
"loss": 1.8045,
"step": 5340
},
{
"epoch": 0.3601630672820997,
"grad_norm": 6.529445698249722,
"learning_rate": 8.075897178889002e-07,
"loss": 1.83,
"step": 5345
},
{
"epoch": 0.3604999831542064,
"grad_norm": 6.2484120930888,
"learning_rate": 8.071258950520501e-07,
"loss": 1.8925,
"step": 5350
},
{
"epoch": 0.3608368990263131,
"grad_norm": 5.739450767655285,
"learning_rate": 8.066616473968005e-07,
"loss": 1.887,
"step": 5355
},
{
"epoch": 0.3611738148984199,
"grad_norm": 6.136883926564676,
"learning_rate": 8.061969755653013e-07,
"loss": 1.8956,
"step": 5360
},
{
"epoch": 0.3615107307705266,
"grad_norm": 6.099150976787944,
"learning_rate": 8.0573188020029e-07,
"loss": 1.8737,
"step": 5365
},
{
"epoch": 0.36184764664263336,
"grad_norm": 5.92471013455527,
"learning_rate": 8.052663619450889e-07,
"loss": 1.7661,
"step": 5370
},
{
"epoch": 0.36218456251474007,
"grad_norm": 6.235057387424368,
"learning_rate": 8.048004214436058e-07,
"loss": 1.8556,
"step": 5375
},
{
"epoch": 0.3625214783868468,
"grad_norm": 6.17438780027338,
"learning_rate": 8.043340593403325e-07,
"loss": 1.9215,
"step": 5380
},
{
"epoch": 0.36285839425895355,
"grad_norm": 6.424536207341481,
"learning_rate": 8.038672762803437e-07,
"loss": 1.8529,
"step": 5385
},
{
"epoch": 0.36319531013106027,
"grad_norm": 5.795068157540271,
"learning_rate": 8.034000729092967e-07,
"loss": 1.8689,
"step": 5390
},
{
"epoch": 0.36353222600316704,
"grad_norm": 6.7129075662939055,
"learning_rate": 8.029324498734299e-07,
"loss": 1.8011,
"step": 5395
},
{
"epoch": 0.36386914187527375,
"grad_norm": 6.016389228550978,
"learning_rate": 8.024644078195625e-07,
"loss": 1.8532,
"step": 5400
},
{
"epoch": 0.36420605774738046,
"grad_norm": 6.239532053660758,
"learning_rate": 8.01995947395093e-07,
"loss": 1.8392,
"step": 5405
},
{
"epoch": 0.36454297361948723,
"grad_norm": 5.916493777487104,
"learning_rate": 8.015270692479988e-07,
"loss": 1.9247,
"step": 5410
},
{
"epoch": 0.36487988949159395,
"grad_norm": 6.612015986424919,
"learning_rate": 8.010577740268347e-07,
"loss": 1.8547,
"step": 5415
},
{
"epoch": 0.36521680536370066,
"grad_norm": 6.181898400080323,
"learning_rate": 8.005880623807331e-07,
"loss": 1.764,
"step": 5420
},
{
"epoch": 0.3655537212358074,
"grad_norm": 6.493674123294548,
"learning_rate": 8.001179349594016e-07,
"loss": 1.8997,
"step": 5425
},
{
"epoch": 0.36589063710791414,
"grad_norm": 5.998232222470107,
"learning_rate": 7.996473924131236e-07,
"loss": 1.909,
"step": 5430
},
{
"epoch": 0.3662275529800209,
"grad_norm": 6.2309580428499185,
"learning_rate": 7.991764353927562e-07,
"loss": 1.8379,
"step": 5435
},
{
"epoch": 0.3665644688521276,
"grad_norm": 6.1097747554443975,
"learning_rate": 7.987050645497302e-07,
"loss": 1.8493,
"step": 5440
},
{
"epoch": 0.36690138472423434,
"grad_norm": 7.217536394961145,
"learning_rate": 7.982332805360486e-07,
"loss": 1.8435,
"step": 5445
},
{
"epoch": 0.3672383005963411,
"grad_norm": 6.426087127620301,
"learning_rate": 7.977610840042856e-07,
"loss": 1.8376,
"step": 5450
},
{
"epoch": 0.3675752164684478,
"grad_norm": 5.852501492146653,
"learning_rate": 7.972884756075867e-07,
"loss": 1.9335,
"step": 5455
},
{
"epoch": 0.3679121323405546,
"grad_norm": 6.68642611327528,
"learning_rate": 7.968154559996665e-07,
"loss": 1.8761,
"step": 5460
},
{
"epoch": 0.3682490482126613,
"grad_norm": 5.9075032622037575,
"learning_rate": 7.963420258348086e-07,
"loss": 1.8118,
"step": 5465
},
{
"epoch": 0.368585964084768,
"grad_norm": 5.775018051587756,
"learning_rate": 7.958681857678645e-07,
"loss": 1.841,
"step": 5470
},
{
"epoch": 0.3689228799568748,
"grad_norm": 5.519542365682442,
"learning_rate": 7.953939364542523e-07,
"loss": 1.7795,
"step": 5475
},
{
"epoch": 0.3692597958289815,
"grad_norm": 6.051018669323594,
"learning_rate": 7.949192785499573e-07,
"loss": 1.8035,
"step": 5480
},
{
"epoch": 0.36959671170108827,
"grad_norm": 6.173790364293265,
"learning_rate": 7.944442127115285e-07,
"loss": 1.8174,
"step": 5485
},
{
"epoch": 0.369933627573195,
"grad_norm": 6.27057349475925,
"learning_rate": 7.939687395960802e-07,
"loss": 1.8228,
"step": 5490
},
{
"epoch": 0.3702705434453017,
"grad_norm": 6.403064812356834,
"learning_rate": 7.934928598612895e-07,
"loss": 1.7758,
"step": 5495
},
{
"epoch": 0.37060745931740846,
"grad_norm": 5.78875486509689,
"learning_rate": 7.930165741653964e-07,
"loss": 1.8362,
"step": 5500
},
{
"epoch": 0.3709443751895152,
"grad_norm": 6.209627219262116,
"learning_rate": 7.925398831672018e-07,
"loss": 1.8188,
"step": 5505
},
{
"epoch": 0.3712812910616219,
"grad_norm": 5.910263475062196,
"learning_rate": 7.920627875260679e-07,
"loss": 1.9461,
"step": 5510
},
{
"epoch": 0.37161820693372866,
"grad_norm": 5.907600463890629,
"learning_rate": 7.91585287901916e-07,
"loss": 1.8488,
"step": 5515
},
{
"epoch": 0.37195512280583537,
"grad_norm": 6.022008103501391,
"learning_rate": 7.911073849552267e-07,
"loss": 1.8528,
"step": 5520
},
{
"epoch": 0.37229203867794214,
"grad_norm": 6.297851834336272,
"learning_rate": 7.906290793470382e-07,
"loss": 1.8826,
"step": 5525
},
{
"epoch": 0.37262895455004885,
"grad_norm": 6.821824970703982,
"learning_rate": 7.901503717389458e-07,
"loss": 1.8499,
"step": 5530
},
{
"epoch": 0.37296587042215557,
"grad_norm": 6.543670341910588,
"learning_rate": 7.896712627931004e-07,
"loss": 1.8973,
"step": 5535
},
{
"epoch": 0.37330278629426233,
"grad_norm": 5.839271026858895,
"learning_rate": 7.891917531722087e-07,
"loss": 1.8527,
"step": 5540
},
{
"epoch": 0.37363970216636905,
"grad_norm": 5.805332246145514,
"learning_rate": 7.887118435395314e-07,
"loss": 1.8391,
"step": 5545
},
{
"epoch": 0.3739766180384758,
"grad_norm": 6.093441102979253,
"learning_rate": 7.882315345588823e-07,
"loss": 1.8382,
"step": 5550
},
{
"epoch": 0.37431353391058253,
"grad_norm": 5.9164472483146255,
"learning_rate": 7.877508268946275e-07,
"loss": 1.8068,
"step": 5555
},
{
"epoch": 0.37465044978268924,
"grad_norm": 5.783501174118739,
"learning_rate": 7.87269721211685e-07,
"loss": 1.8326,
"step": 5560
},
{
"epoch": 0.374987365654796,
"grad_norm": 6.099236664954797,
"learning_rate": 7.86788218175523e-07,
"loss": 1.8907,
"step": 5565
},
{
"epoch": 0.3753242815269027,
"grad_norm": 6.015599331347923,
"learning_rate": 7.863063184521595e-07,
"loss": 1.7902,
"step": 5570
},
{
"epoch": 0.3756611973990095,
"grad_norm": 6.030584133610423,
"learning_rate": 7.858240227081611e-07,
"loss": 1.7967,
"step": 5575
},
{
"epoch": 0.3759981132711162,
"grad_norm": 6.1561594264866875,
"learning_rate": 7.85341331610642e-07,
"loss": 1.8738,
"step": 5580
},
{
"epoch": 0.3763350291432229,
"grad_norm": 5.7668037858810735,
"learning_rate": 7.848582458272637e-07,
"loss": 1.8994,
"step": 5585
},
{
"epoch": 0.3766719450153297,
"grad_norm": 6.820449431402006,
"learning_rate": 7.843747660262333e-07,
"loss": 1.7353,
"step": 5590
},
{
"epoch": 0.3770088608874364,
"grad_norm": 5.731863695815324,
"learning_rate": 7.838908928763028e-07,
"loss": 1.842,
"step": 5595
},
{
"epoch": 0.3773457767595431,
"grad_norm": 6.0491543709457805,
"learning_rate": 7.834066270467689e-07,
"loss": 1.8412,
"step": 5600
},
{
"epoch": 0.3776826926316499,
"grad_norm": 6.312515900909037,
"learning_rate": 7.829219692074707e-07,
"loss": 1.9166,
"step": 5605
},
{
"epoch": 0.3780196085037566,
"grad_norm": 5.629903210882783,
"learning_rate": 7.824369200287899e-07,
"loss": 1.8475,
"step": 5610
},
{
"epoch": 0.37835652437586337,
"grad_norm": 5.798215148864628,
"learning_rate": 7.819514801816496e-07,
"loss": 1.8445,
"step": 5615
},
{
"epoch": 0.3786934402479701,
"grad_norm": 5.746602784624546,
"learning_rate": 7.814656503375128e-07,
"loss": 1.8142,
"step": 5620
},
{
"epoch": 0.3790303561200768,
"grad_norm": 6.212384108148857,
"learning_rate": 7.809794311683828e-07,
"loss": 1.8433,
"step": 5625
},
{
"epoch": 0.37936727199218356,
"grad_norm": 5.5219568352553186,
"learning_rate": 7.804928233468006e-07,
"loss": 1.8577,
"step": 5630
},
{
"epoch": 0.3797041878642903,
"grad_norm": 5.923816755049023,
"learning_rate": 7.80005827545845e-07,
"loss": 1.8968,
"step": 5635
},
{
"epoch": 0.38004110373639705,
"grad_norm": 6.447298966369827,
"learning_rate": 7.795184444391318e-07,
"loss": 1.8263,
"step": 5640
},
{
"epoch": 0.38037801960850376,
"grad_norm": 5.700024915629011,
"learning_rate": 7.790306747008119e-07,
"loss": 1.801,
"step": 5645
},
{
"epoch": 0.3807149354806105,
"grad_norm": 5.693902051404934,
"learning_rate": 7.785425190055719e-07,
"loss": 1.8346,
"step": 5650
},
{
"epoch": 0.38105185135271724,
"grad_norm": 6.623591217723914,
"learning_rate": 7.780539780286312e-07,
"loss": 1.7844,
"step": 5655
},
{
"epoch": 0.38138876722482395,
"grad_norm": 5.924554882226798,
"learning_rate": 7.77565052445743e-07,
"loss": 1.8806,
"step": 5660
},
{
"epoch": 0.3817256830969307,
"grad_norm": 5.804924090163828,
"learning_rate": 7.770757429331919e-07,
"loss": 1.8837,
"step": 5665
},
{
"epoch": 0.38206259896903744,
"grad_norm": 6.307078378352449,
"learning_rate": 7.765860501677939e-07,
"loss": 1.8476,
"step": 5670
},
{
"epoch": 0.38239951484114415,
"grad_norm": 5.64288963732036,
"learning_rate": 7.760959748268949e-07,
"loss": 1.7921,
"step": 5675
},
{
"epoch": 0.3827364307132509,
"grad_norm": 6.0330255144031755,
"learning_rate": 7.756055175883701e-07,
"loss": 1.8648,
"step": 5680
},
{
"epoch": 0.38307334658535763,
"grad_norm": 6.226955158037713,
"learning_rate": 7.751146791306231e-07,
"loss": 1.8733,
"step": 5685
},
{
"epoch": 0.38341026245746435,
"grad_norm": 5.831296049474745,
"learning_rate": 7.746234601325843e-07,
"loss": 1.8237,
"step": 5690
},
{
"epoch": 0.3837471783295711,
"grad_norm": 6.138631169330866,
"learning_rate": 7.741318612737111e-07,
"loss": 1.8234,
"step": 5695
},
{
"epoch": 0.38408409420167783,
"grad_norm": 5.9623003368564955,
"learning_rate": 7.73639883233986e-07,
"loss": 1.7892,
"step": 5700
},
{
"epoch": 0.3844210100737846,
"grad_norm": 5.900675764770894,
"learning_rate": 7.731475266939158e-07,
"loss": 1.7886,
"step": 5705
},
{
"epoch": 0.3847579259458913,
"grad_norm": 6.598946949032724,
"learning_rate": 7.726547923345313e-07,
"loss": 1.832,
"step": 5710
},
{
"epoch": 0.385094841817998,
"grad_norm": 6.274691790815466,
"learning_rate": 7.721616808373855e-07,
"loss": 1.8692,
"step": 5715
},
{
"epoch": 0.3854317576901048,
"grad_norm": 6.068967193945496,
"learning_rate": 7.716681928845532e-07,
"loss": 1.8135,
"step": 5720
},
{
"epoch": 0.3857686735622115,
"grad_norm": 5.71438663141986,
"learning_rate": 7.711743291586298e-07,
"loss": 1.8432,
"step": 5725
},
{
"epoch": 0.3861055894343183,
"grad_norm": 5.908196677526078,
"learning_rate": 7.706800903427309e-07,
"loss": 1.76,
"step": 5730
},
{
"epoch": 0.386442505306425,
"grad_norm": 5.980404418034513,
"learning_rate": 7.701854771204905e-07,
"loss": 1.8551,
"step": 5735
},
{
"epoch": 0.3867794211785317,
"grad_norm": 5.825977075219841,
"learning_rate": 7.696904901760606e-07,
"loss": 1.8143,
"step": 5740
},
{
"epoch": 0.38711633705063847,
"grad_norm": 6.147112371868563,
"learning_rate": 7.691951301941102e-07,
"loss": 1.8429,
"step": 5745
},
{
"epoch": 0.3874532529227452,
"grad_norm": 6.679128666452009,
"learning_rate": 7.68699397859824e-07,
"loss": 1.9134,
"step": 5750
},
{
"epoch": 0.38779016879485195,
"grad_norm": 5.97814777855936,
"learning_rate": 7.682032938589023e-07,
"loss": 1.8395,
"step": 5755
},
{
"epoch": 0.38812708466695867,
"grad_norm": 5.950552045235804,
"learning_rate": 7.677068188775589e-07,
"loss": 1.7672,
"step": 5760
},
{
"epoch": 0.3884640005390654,
"grad_norm": 5.814114869159672,
"learning_rate": 7.67209973602521e-07,
"loss": 1.8458,
"step": 5765
},
{
"epoch": 0.38880091641117215,
"grad_norm": 6.0582073054991055,
"learning_rate": 7.667127587210282e-07,
"loss": 1.8802,
"step": 5770
},
{
"epoch": 0.38913783228327886,
"grad_norm": 6.334265700850607,
"learning_rate": 7.66215174920831e-07,
"loss": 1.8564,
"step": 5775
},
{
"epoch": 0.3894747481553856,
"grad_norm": 6.002444518331614,
"learning_rate": 7.657172228901905e-07,
"loss": 1.8211,
"step": 5780
},
{
"epoch": 0.38981166402749234,
"grad_norm": 6.1022759973086815,
"learning_rate": 7.652189033178766e-07,
"loss": 1.8229,
"step": 5785
},
{
"epoch": 0.39014857989959906,
"grad_norm": 6.413997576789767,
"learning_rate": 7.647202168931683e-07,
"loss": 1.7785,
"step": 5790
},
{
"epoch": 0.3904854957717058,
"grad_norm": 6.069802887796607,
"learning_rate": 7.642211643058516e-07,
"loss": 1.8022,
"step": 5795
},
{
"epoch": 0.39082241164381254,
"grad_norm": 5.7749180044113,
"learning_rate": 7.637217462462189e-07,
"loss": 1.8342,
"step": 5800
},
{
"epoch": 0.39115932751591925,
"grad_norm": 5.841685430277685,
"learning_rate": 7.632219634050685e-07,
"loss": 1.8166,
"step": 5805
},
{
"epoch": 0.391496243388026,
"grad_norm": 6.113028925689592,
"learning_rate": 7.62721816473703e-07,
"loss": 1.8851,
"step": 5810
},
{
"epoch": 0.39183315926013274,
"grad_norm": 6.574877319300081,
"learning_rate": 7.622213061439287e-07,
"loss": 1.7852,
"step": 5815
},
{
"epoch": 0.3921700751322395,
"grad_norm": 6.591103453708863,
"learning_rate": 7.617204331080544e-07,
"loss": 1.8207,
"step": 5820
},
{
"epoch": 0.3925069910043462,
"grad_norm": 6.161561709386749,
"learning_rate": 7.612191980588907e-07,
"loss": 1.8254,
"step": 5825
},
{
"epoch": 0.39284390687645293,
"grad_norm": 5.913954308903591,
"learning_rate": 7.60717601689749e-07,
"loss": 1.8521,
"step": 5830
},
{
"epoch": 0.3931808227485597,
"grad_norm": 6.061050670183792,
"learning_rate": 7.602156446944405e-07,
"loss": 1.8292,
"step": 5835
},
{
"epoch": 0.3935177386206664,
"grad_norm": 6.056114290565816,
"learning_rate": 7.597133277672751e-07,
"loss": 1.8017,
"step": 5840
},
{
"epoch": 0.3938546544927732,
"grad_norm": 6.659560363196615,
"learning_rate": 7.592106516030607e-07,
"loss": 1.7902,
"step": 5845
},
{
"epoch": 0.3941915703648799,
"grad_norm": 6.349531573185438,
"learning_rate": 7.587076168971022e-07,
"loss": 1.7835,
"step": 5850
},
{
"epoch": 0.3945284862369866,
"grad_norm": 5.973870975420067,
"learning_rate": 7.582042243451998e-07,
"loss": 1.839,
"step": 5855
},
{
"epoch": 0.3948654021090934,
"grad_norm": 6.399608469649776,
"learning_rate": 7.577004746436494e-07,
"loss": 1.8484,
"step": 5860
},
{
"epoch": 0.3952023179812001,
"grad_norm": 6.391523749018398,
"learning_rate": 7.571963684892404e-07,
"loss": 1.8203,
"step": 5865
},
{
"epoch": 0.3955392338533068,
"grad_norm": 5.55251848998224,
"learning_rate": 7.566919065792558e-07,
"loss": 1.8143,
"step": 5870
},
{
"epoch": 0.3958761497254136,
"grad_norm": 6.461778389781404,
"learning_rate": 7.561870896114704e-07,
"loss": 1.7778,
"step": 5875
},
{
"epoch": 0.3962130655975203,
"grad_norm": 5.7346585594091115,
"learning_rate": 7.556819182841498e-07,
"loss": 1.7838,
"step": 5880
},
{
"epoch": 0.39654998146962706,
"grad_norm": 5.843198641258833,
"learning_rate": 7.551763932960502e-07,
"loss": 1.8738,
"step": 5885
},
{
"epoch": 0.39688689734173377,
"grad_norm": 5.993845274934363,
"learning_rate": 7.546705153464168e-07,
"loss": 1.8547,
"step": 5890
},
{
"epoch": 0.3972238132138405,
"grad_norm": 5.7984474701034925,
"learning_rate": 7.54164285134983e-07,
"loss": 1.8138,
"step": 5895
},
{
"epoch": 0.39756072908594725,
"grad_norm": 6.040293907002973,
"learning_rate": 7.536577033619696e-07,
"loss": 1.8491,
"step": 5900
},
{
"epoch": 0.39789764495805396,
"grad_norm": 5.9285219940169664,
"learning_rate": 7.531507707280836e-07,
"loss": 1.816,
"step": 5905
},
{
"epoch": 0.39823456083016073,
"grad_norm": 6.001261820811947,
"learning_rate": 7.526434879345171e-07,
"loss": 1.8569,
"step": 5910
},
{
"epoch": 0.39857147670226745,
"grad_norm": 6.279600146634121,
"learning_rate": 7.521358556829469e-07,
"loss": 1.8775,
"step": 5915
},
{
"epoch": 0.39890839257437416,
"grad_norm": 5.919665080614914,
"learning_rate": 7.51627874675533e-07,
"loss": 1.8932,
"step": 5920
},
{
"epoch": 0.39924530844648093,
"grad_norm": 6.237664010789953,
"learning_rate": 7.511195456149177e-07,
"loss": 1.8613,
"step": 5925
},
{
"epoch": 0.39958222431858764,
"grad_norm": 5.453939969270052,
"learning_rate": 7.50610869204225e-07,
"loss": 1.8167,
"step": 5930
},
{
"epoch": 0.3999191401906944,
"grad_norm": 6.414570793475843,
"learning_rate": 7.50101846147059e-07,
"loss": 1.9238,
"step": 5935
},
{
"epoch": 0.4002560560628011,
"grad_norm": 6.214005751571211,
"learning_rate": 7.495924771475037e-07,
"loss": 1.8575,
"step": 5940
},
{
"epoch": 0.40059297193490784,
"grad_norm": 6.264117402993965,
"learning_rate": 7.490827629101211e-07,
"loss": 1.8432,
"step": 5945
},
{
"epoch": 0.4009298878070146,
"grad_norm": 5.876493353273485,
"learning_rate": 7.485727041399513e-07,
"loss": 1.7964,
"step": 5950
},
{
"epoch": 0.4012668036791213,
"grad_norm": 5.563447588486703,
"learning_rate": 7.480623015425105e-07,
"loss": 1.7979,
"step": 5955
},
{
"epoch": 0.40160371955122803,
"grad_norm": 5.576251643765876,
"learning_rate": 7.475515558237909e-07,
"loss": 1.8397,
"step": 5960
},
{
"epoch": 0.4019406354233348,
"grad_norm": 5.895209504068423,
"learning_rate": 7.470404676902587e-07,
"loss": 1.8415,
"step": 5965
},
{
"epoch": 0.4022775512954415,
"grad_norm": 5.977193260631528,
"learning_rate": 7.465290378488544e-07,
"loss": 1.9116,
"step": 5970
},
{
"epoch": 0.4026144671675483,
"grad_norm": 6.187315083034187,
"learning_rate": 7.460172670069909e-07,
"loss": 1.8695,
"step": 5975
},
{
"epoch": 0.402951383039655,
"grad_norm": 5.699459860019443,
"learning_rate": 7.455051558725524e-07,
"loss": 1.8654,
"step": 5980
},
{
"epoch": 0.4032882989117617,
"grad_norm": 5.988742641572804,
"learning_rate": 7.449927051538944e-07,
"loss": 1.7901,
"step": 5985
},
{
"epoch": 0.4036252147838685,
"grad_norm": 6.166914520856474,
"learning_rate": 7.444799155598419e-07,
"loss": 1.8478,
"step": 5990
},
{
"epoch": 0.4039621306559752,
"grad_norm": 6.290632854081438,
"learning_rate": 7.439667877996884e-07,
"loss": 1.7836,
"step": 5995
},
{
"epoch": 0.40429904652808196,
"grad_norm": 5.578242942924098,
"learning_rate": 7.434533225831951e-07,
"loss": 1.8646,
"step": 6000
},
{
"epoch": 0.4046359624001887,
"grad_norm": 5.922649460674123,
"learning_rate": 7.429395206205908e-07,
"loss": 1.8159,
"step": 6005
},
{
"epoch": 0.4049728782722954,
"grad_norm": 5.973950129607427,
"learning_rate": 7.424253826225689e-07,
"loss": 1.8179,
"step": 6010
},
{
"epoch": 0.40530979414440216,
"grad_norm": 6.1535636579164565,
"learning_rate": 7.419109093002887e-07,
"loss": 1.785,
"step": 6015
},
{
"epoch": 0.40564671001650887,
"grad_norm": 5.99390268127196,
"learning_rate": 7.413961013653725e-07,
"loss": 1.8474,
"step": 6020
},
{
"epoch": 0.40598362588861564,
"grad_norm": 6.095296832831377,
"learning_rate": 7.408809595299057e-07,
"loss": 1.8098,
"step": 6025
},
{
"epoch": 0.40632054176072235,
"grad_norm": 5.854910765823886,
"learning_rate": 7.403654845064358e-07,
"loss": 1.8641,
"step": 6030
},
{
"epoch": 0.40665745763282907,
"grad_norm": 6.488094157305255,
"learning_rate": 7.398496770079709e-07,
"loss": 1.845,
"step": 6035
},
{
"epoch": 0.40699437350493584,
"grad_norm": 5.888696212338066,
"learning_rate": 7.393335377479792e-07,
"loss": 1.8355,
"step": 6040
},
{
"epoch": 0.40733128937704255,
"grad_norm": 5.6364358032730415,
"learning_rate": 7.388170674403872e-07,
"loss": 1.7867,
"step": 6045
},
{
"epoch": 0.40766820524914926,
"grad_norm": 6.120652995181019,
"learning_rate": 7.383002667995804e-07,
"loss": 1.8407,
"step": 6050
},
{
"epoch": 0.40800512112125603,
"grad_norm": 5.9456629334484115,
"learning_rate": 7.377831365404001e-07,
"loss": 1.7944,
"step": 6055
},
{
"epoch": 0.40834203699336274,
"grad_norm": 5.926271311098251,
"learning_rate": 7.372656773781442e-07,
"loss": 1.8741,
"step": 6060
},
{
"epoch": 0.4086789528654695,
"grad_norm": 5.874696662625113,
"learning_rate": 7.367478900285654e-07,
"loss": 1.8321,
"step": 6065
},
{
"epoch": 0.4090158687375762,
"grad_norm": 6.207699178257415,
"learning_rate": 7.362297752078702e-07,
"loss": 1.7815,
"step": 6070
},
{
"epoch": 0.40935278460968294,
"grad_norm": 6.0143311305393565,
"learning_rate": 7.357113336327181e-07,
"loss": 1.854,
"step": 6075
},
{
"epoch": 0.4096897004817897,
"grad_norm": 6.137764433429077,
"learning_rate": 7.351925660202207e-07,
"loss": 1.8032,
"step": 6080
},
{
"epoch": 0.4100266163538964,
"grad_norm": 5.952972611372365,
"learning_rate": 7.346734730879407e-07,
"loss": 1.8577,
"step": 6085
},
{
"epoch": 0.4103635322260032,
"grad_norm": 5.93496116315055,
"learning_rate": 7.341540555538902e-07,
"loss": 1.8655,
"step": 6090
},
{
"epoch": 0.4107004480981099,
"grad_norm": 6.035523911748048,
"learning_rate": 7.33634314136531e-07,
"loss": 1.8782,
"step": 6095
},
{
"epoch": 0.4110373639702166,
"grad_norm": 6.50895901457185,
"learning_rate": 7.331142495547724e-07,
"loss": 1.8557,
"step": 6100
},
{
"epoch": 0.4113742798423234,
"grad_norm": 6.3296523573714465,
"learning_rate": 7.325938625279709e-07,
"loss": 1.771,
"step": 6105
},
{
"epoch": 0.4117111957144301,
"grad_norm": 6.06063180713123,
"learning_rate": 7.320731537759293e-07,
"loss": 1.9209,
"step": 6110
},
{
"epoch": 0.41204811158653687,
"grad_norm": 6.291587004914758,
"learning_rate": 7.315521240188944e-07,
"loss": 1.8921,
"step": 6115
},
{
"epoch": 0.4123850274586436,
"grad_norm": 7.144681065930621,
"learning_rate": 7.310307739775585e-07,
"loss": 1.8353,
"step": 6120
},
{
"epoch": 0.4127219433307503,
"grad_norm": 6.537598454236774,
"learning_rate": 7.305091043730557e-07,
"loss": 1.8657,
"step": 6125
},
{
"epoch": 0.41305885920285706,
"grad_norm": 6.139638408298624,
"learning_rate": 7.299871159269626e-07,
"loss": 1.8291,
"step": 6130
},
{
"epoch": 0.4133957750749638,
"grad_norm": 6.052600414436344,
"learning_rate": 7.294648093612968e-07,
"loss": 1.8582,
"step": 6135
},
{
"epoch": 0.4137326909470705,
"grad_norm": 5.737200220672097,
"learning_rate": 7.28942185398516e-07,
"loss": 1.8016,
"step": 6140
},
{
"epoch": 0.41406960681917726,
"grad_norm": 5.788875630264053,
"learning_rate": 7.284192447615168e-07,
"loss": 1.8144,
"step": 6145
},
{
"epoch": 0.414406522691284,
"grad_norm": 5.809695140298498,
"learning_rate": 7.278959881736338e-07,
"loss": 1.8528,
"step": 6150
},
{
"epoch": 0.41474343856339074,
"grad_norm": 6.291846673442152,
"learning_rate": 7.273724163586387e-07,
"loss": 1.8146,
"step": 6155
},
{
"epoch": 0.41508035443549746,
"grad_norm": 6.592279506836028,
"learning_rate": 7.268485300407394e-07,
"loss": 1.8373,
"step": 6160
},
{
"epoch": 0.41541727030760417,
"grad_norm": 5.2404367912538135,
"learning_rate": 7.263243299445783e-07,
"loss": 1.8687,
"step": 6165
},
{
"epoch": 0.41575418617971094,
"grad_norm": 6.237826143972818,
"learning_rate": 7.257998167952322e-07,
"loss": 1.7559,
"step": 6170
},
{
"epoch": 0.41609110205181765,
"grad_norm": 5.644649333598512,
"learning_rate": 7.25274991318211e-07,
"loss": 1.7745,
"step": 6175
},
{
"epoch": 0.4164280179239244,
"grad_norm": 6.221269732535096,
"learning_rate": 7.247498542394566e-07,
"loss": 1.888,
"step": 6180
},
{
"epoch": 0.41676493379603113,
"grad_norm": 5.846110634880196,
"learning_rate": 7.242244062853416e-07,
"loss": 1.8297,
"step": 6185
},
{
"epoch": 0.41710184966813785,
"grad_norm": 6.0081486154977295,
"learning_rate": 7.236986481826688e-07,
"loss": 1.8962,
"step": 6190
},
{
"epoch": 0.4174387655402446,
"grad_norm": 5.714180142339893,
"learning_rate": 7.231725806586699e-07,
"loss": 1.798,
"step": 6195
},
{
"epoch": 0.41777568141235133,
"grad_norm": 5.924083302663139,
"learning_rate": 7.22646204441005e-07,
"loss": 1.8671,
"step": 6200
},
{
"epoch": 0.4181125972844581,
"grad_norm": 5.884534396453543,
"learning_rate": 7.221195202577606e-07,
"loss": 1.8473,
"step": 6205
},
{
"epoch": 0.4184495131565648,
"grad_norm": 6.505335975345712,
"learning_rate": 7.215925288374496e-07,
"loss": 1.845,
"step": 6210
},
{
"epoch": 0.4187864290286715,
"grad_norm": 6.361143535314032,
"learning_rate": 7.210652309090098e-07,
"loss": 1.8562,
"step": 6215
},
{
"epoch": 0.4191233449007783,
"grad_norm": 6.269558040309333,
"learning_rate": 7.205376272018025e-07,
"loss": 1.8842,
"step": 6220
},
{
"epoch": 0.419460260772885,
"grad_norm": 6.033684433696401,
"learning_rate": 7.200097184456128e-07,
"loss": 1.8744,
"step": 6225
},
{
"epoch": 0.4197971766449917,
"grad_norm": 5.893842528747161,
"learning_rate": 7.19481505370647e-07,
"loss": 1.9029,
"step": 6230
},
{
"epoch": 0.4201340925170985,
"grad_norm": 6.067247445280829,
"learning_rate": 7.189529887075327e-07,
"loss": 1.8535,
"step": 6235
},
{
"epoch": 0.4204710083892052,
"grad_norm": 6.019248123337608,
"learning_rate": 7.184241691873174e-07,
"loss": 1.8598,
"step": 6240
},
{
"epoch": 0.42080792426131197,
"grad_norm": 5.885826420361127,
"learning_rate": 7.178950475414675e-07,
"loss": 1.8843,
"step": 6245
},
{
"epoch": 0.4211448401334187,
"grad_norm": 5.994346100649246,
"learning_rate": 7.173656245018671e-07,
"loss": 1.8454,
"step": 6250
},
{
"epoch": 0.4214817560055254,
"grad_norm": 5.683347136227909,
"learning_rate": 7.168359008008177e-07,
"loss": 1.8528,
"step": 6255
},
{
"epoch": 0.42181867187763217,
"grad_norm": 6.303528633412533,
"learning_rate": 7.163058771710358e-07,
"loss": 1.8118,
"step": 6260
},
{
"epoch": 0.4221555877497389,
"grad_norm": 5.838897189398607,
"learning_rate": 7.157755543456539e-07,
"loss": 1.8554,
"step": 6265
},
{
"epoch": 0.42249250362184565,
"grad_norm": 6.102573455498519,
"learning_rate": 7.152449330582173e-07,
"loss": 1.8515,
"step": 6270
},
{
"epoch": 0.42282941949395236,
"grad_norm": 6.170469429143662,
"learning_rate": 7.147140140426848e-07,
"loss": 1.8718,
"step": 6275
},
{
"epoch": 0.4231663353660591,
"grad_norm": 6.603648144277644,
"learning_rate": 7.141827980334265e-07,
"loss": 1.7583,
"step": 6280
},
{
"epoch": 0.42350325123816585,
"grad_norm": 5.710276452721297,
"learning_rate": 7.136512857652239e-07,
"loss": 1.8225,
"step": 6285
},
{
"epoch": 0.42384016711027256,
"grad_norm": 6.305744171898564,
"learning_rate": 7.131194779732681e-07,
"loss": 1.8758,
"step": 6290
},
{
"epoch": 0.4241770829823793,
"grad_norm": 5.935812543100673,
"learning_rate": 7.125873753931586e-07,
"loss": 1.7446,
"step": 6295
},
{
"epoch": 0.42451399885448604,
"grad_norm": 6.4033336349656205,
"learning_rate": 7.120549787609029e-07,
"loss": 1.8007,
"step": 6300
},
{
"epoch": 0.42485091472659275,
"grad_norm": 5.988053466649247,
"learning_rate": 7.115222888129156e-07,
"loss": 1.8339,
"step": 6305
},
{
"epoch": 0.4251878305986995,
"grad_norm": 5.839719119201011,
"learning_rate": 7.109893062860161e-07,
"loss": 1.7753,
"step": 6310
},
{
"epoch": 0.42552474647080624,
"grad_norm": 6.1546266562601355,
"learning_rate": 7.104560319174296e-07,
"loss": 1.9032,
"step": 6315
},
{
"epoch": 0.42586166234291295,
"grad_norm": 5.868935206755803,
"learning_rate": 7.099224664447841e-07,
"loss": 1.8336,
"step": 6320
},
{
"epoch": 0.4261985782150197,
"grad_norm": 6.897803862049745,
"learning_rate": 7.093886106061106e-07,
"loss": 1.8933,
"step": 6325
},
{
"epoch": 0.42653549408712643,
"grad_norm": 6.298122444093038,
"learning_rate": 7.088544651398421e-07,
"loss": 1.8468,
"step": 6330
},
{
"epoch": 0.4268724099592332,
"grad_norm": 5.946687882042922,
"learning_rate": 7.083200307848115e-07,
"loss": 1.8509,
"step": 6335
},
{
"epoch": 0.4272093258313399,
"grad_norm": 5.9703726574219695,
"learning_rate": 7.077853082802516e-07,
"loss": 1.8723,
"step": 6340
},
{
"epoch": 0.42754624170344663,
"grad_norm": 5.940092483411608,
"learning_rate": 7.072502983657939e-07,
"loss": 1.7817,
"step": 6345
},
{
"epoch": 0.4278831575755534,
"grad_norm": 5.99693275214663,
"learning_rate": 7.067150017814676e-07,
"loss": 1.8194,
"step": 6350
},
{
"epoch": 0.4282200734476601,
"grad_norm": 6.365386492382532,
"learning_rate": 7.061794192676979e-07,
"loss": 1.8713,
"step": 6355
},
{
"epoch": 0.4285569893197669,
"grad_norm": 6.137677012183777,
"learning_rate": 7.056435515653058e-07,
"loss": 1.889,
"step": 6360
},
{
"epoch": 0.4288939051918736,
"grad_norm": 6.197193970097729,
"learning_rate": 7.051073994155068e-07,
"loss": 1.8624,
"step": 6365
},
{
"epoch": 0.4292308210639803,
"grad_norm": 6.262005951949787,
"learning_rate": 7.045709635599098e-07,
"loss": 1.8439,
"step": 6370
},
{
"epoch": 0.4295677369360871,
"grad_norm": 6.5893693825632,
"learning_rate": 7.040342447405161e-07,
"loss": 1.7952,
"step": 6375
},
{
"epoch": 0.4299046528081938,
"grad_norm": 6.342831308535354,
"learning_rate": 7.034972436997184e-07,
"loss": 1.7968,
"step": 6380
},
{
"epoch": 0.4302415686803005,
"grad_norm": 5.704318035103627,
"learning_rate": 7.029599611803e-07,
"loss": 1.8215,
"step": 6385
},
{
"epoch": 0.43057848455240727,
"grad_norm": 6.116744457100223,
"learning_rate": 7.024223979254331e-07,
"loss": 1.8791,
"step": 6390
},
{
"epoch": 0.430915400424514,
"grad_norm": 6.176754693260745,
"learning_rate": 7.018845546786787e-07,
"loss": 1.8132,
"step": 6395
},
{
"epoch": 0.43125231629662075,
"grad_norm": 5.751107796307478,
"learning_rate": 7.013464321839845e-07,
"loss": 1.7256,
"step": 6400
},
{
"epoch": 0.43158923216872747,
"grad_norm": 5.652758571110035,
"learning_rate": 7.00808031185685e-07,
"loss": 1.7953,
"step": 6405
},
{
"epoch": 0.4319261480408342,
"grad_norm": 6.005003107499712,
"learning_rate": 7.002693524284997e-07,
"loss": 1.8096,
"step": 6410
},
{
"epoch": 0.43226306391294095,
"grad_norm": 6.071057162652461,
"learning_rate": 6.997303966575322e-07,
"loss": 1.8587,
"step": 6415
},
{
"epoch": 0.43259997978504766,
"grad_norm": 6.363069927240701,
"learning_rate": 6.991911646182696e-07,
"loss": 1.8238,
"step": 6420
},
{
"epoch": 0.43293689565715443,
"grad_norm": 7.006943760155347,
"learning_rate": 6.986516570565809e-07,
"loss": 1.943,
"step": 6425
},
{
"epoch": 0.43327381152926114,
"grad_norm": 5.627017420401183,
"learning_rate": 6.981118747187163e-07,
"loss": 1.7897,
"step": 6430
},
{
"epoch": 0.43361072740136786,
"grad_norm": 5.4444971068326895,
"learning_rate": 6.975718183513056e-07,
"loss": 1.7919,
"step": 6435
},
{
"epoch": 0.4339476432734746,
"grad_norm": 6.062332184148201,
"learning_rate": 6.970314887013585e-07,
"loss": 1.7791,
"step": 6440
},
{
"epoch": 0.43428455914558134,
"grad_norm": 5.89355423464885,
"learning_rate": 6.964908865162617e-07,
"loss": 1.8423,
"step": 6445
},
{
"epoch": 0.4346214750176881,
"grad_norm": 6.6258832394007765,
"learning_rate": 6.959500125437801e-07,
"loss": 1.888,
"step": 6450
},
{
"epoch": 0.4349583908897948,
"grad_norm": 6.365315795081244,
"learning_rate": 6.954088675320534e-07,
"loss": 1.7969,
"step": 6455
},
{
"epoch": 0.43529530676190153,
"grad_norm": 5.981109456662887,
"learning_rate": 6.948674522295969e-07,
"loss": 1.8629,
"step": 6460
},
{
"epoch": 0.4356322226340083,
"grad_norm": 6.009688578755199,
"learning_rate": 6.943257673852993e-07,
"loss": 1.7985,
"step": 6465
},
{
"epoch": 0.435969138506115,
"grad_norm": 6.299881918823882,
"learning_rate": 6.937838137484225e-07,
"loss": 1.905,
"step": 6470
},
{
"epoch": 0.43630605437822173,
"grad_norm": 6.220203094141343,
"learning_rate": 6.932415920686001e-07,
"loss": 1.7251,
"step": 6475
},
{
"epoch": 0.4366429702503285,
"grad_norm": 5.711822479774969,
"learning_rate": 6.926991030958362e-07,
"loss": 1.8137,
"step": 6480
},
{
"epoch": 0.4369798861224352,
"grad_norm": 6.410027467003412,
"learning_rate": 6.921563475805051e-07,
"loss": 1.7679,
"step": 6485
},
{
"epoch": 0.437316801994542,
"grad_norm": 5.907460533318916,
"learning_rate": 6.916133262733493e-07,
"loss": 1.7704,
"step": 6490
},
{
"epoch": 0.4376537178666487,
"grad_norm": 6.55648674383762,
"learning_rate": 6.910700399254793e-07,
"loss": 1.836,
"step": 6495
},
{
"epoch": 0.4379906337387554,
"grad_norm": 5.720812174587427,
"learning_rate": 6.905264892883721e-07,
"loss": 1.7987,
"step": 6500
},
{
"epoch": 0.4383275496108622,
"grad_norm": 5.349790019313271,
"learning_rate": 6.899826751138701e-07,
"loss": 1.8159,
"step": 6505
},
{
"epoch": 0.4386644654829689,
"grad_norm": 6.281596880445255,
"learning_rate": 6.894385981541804e-07,
"loss": 1.8379,
"step": 6510
},
{
"epoch": 0.43900138135507566,
"grad_norm": 6.043708122899696,
"learning_rate": 6.888942591618736e-07,
"loss": 1.8159,
"step": 6515
},
{
"epoch": 0.4393382972271824,
"grad_norm": 6.024522211212892,
"learning_rate": 6.883496588898827e-07,
"loss": 1.7956,
"step": 6520
},
{
"epoch": 0.4396752130992891,
"grad_norm": 6.26133167110833,
"learning_rate": 6.87804798091502e-07,
"loss": 1.8311,
"step": 6525
},
{
"epoch": 0.44001212897139586,
"grad_norm": 6.350548816132551,
"learning_rate": 6.872596775203864e-07,
"loss": 1.8546,
"step": 6530
},
{
"epoch": 0.44034904484350257,
"grad_norm": 5.911948063541635,
"learning_rate": 6.867142979305498e-07,
"loss": 1.7843,
"step": 6535
},
{
"epoch": 0.44068596071560934,
"grad_norm": 5.978493138464683,
"learning_rate": 6.861686600763648e-07,
"loss": 1.8331,
"step": 6540
},
{
"epoch": 0.44102287658771605,
"grad_norm": 6.2087819303121154,
"learning_rate": 6.856227647125607e-07,
"loss": 1.8502,
"step": 6545
},
{
"epoch": 0.44135979245982276,
"grad_norm": 6.320344130956893,
"learning_rate": 6.850766125942235e-07,
"loss": 1.8667,
"step": 6550
},
{
"epoch": 0.44169670833192953,
"grad_norm": 6.142752431697697,
"learning_rate": 6.84530204476794e-07,
"loss": 1.8206,
"step": 6555
},
{
"epoch": 0.44203362420403625,
"grad_norm": 6.055474371152951,
"learning_rate": 6.839835411160673e-07,
"loss": 1.8678,
"step": 6560
},
{
"epoch": 0.44237054007614296,
"grad_norm": 5.817969085642228,
"learning_rate": 6.834366232681915e-07,
"loss": 1.8149,
"step": 6565
},
{
"epoch": 0.44270745594824973,
"grad_norm": 5.76853321731449,
"learning_rate": 6.828894516896664e-07,
"loss": 1.7588,
"step": 6570
},
{
"epoch": 0.44304437182035644,
"grad_norm": 6.149317072156233,
"learning_rate": 6.823420271373433e-07,
"loss": 1.8405,
"step": 6575
},
{
"epoch": 0.4433812876924632,
"grad_norm": 6.408757055709368,
"learning_rate": 6.817943503684232e-07,
"loss": 1.8757,
"step": 6580
},
{
"epoch": 0.4437182035645699,
"grad_norm": 5.7234737809931095,
"learning_rate": 6.812464221404558e-07,
"loss": 1.806,
"step": 6585
},
{
"epoch": 0.44405511943667664,
"grad_norm": 5.956521950229173,
"learning_rate": 6.806982432113388e-07,
"loss": 1.835,
"step": 6590
},
{
"epoch": 0.4443920353087834,
"grad_norm": 5.847432210874048,
"learning_rate": 6.801498143393168e-07,
"loss": 1.8316,
"step": 6595
},
{
"epoch": 0.4447289511808901,
"grad_norm": 6.25690887107104,
"learning_rate": 6.796011362829794e-07,
"loss": 1.8865,
"step": 6600
},
{
"epoch": 0.4450658670529969,
"grad_norm": 6.128754946117051,
"learning_rate": 6.790522098012621e-07,
"loss": 1.8238,
"step": 6605
},
{
"epoch": 0.4454027829251036,
"grad_norm": 6.087198236864975,
"learning_rate": 6.785030356534428e-07,
"loss": 1.8193,
"step": 6610
},
{
"epoch": 0.4457396987972103,
"grad_norm": 6.429468035527822,
"learning_rate": 6.779536145991427e-07,
"loss": 1.8451,
"step": 6615
},
{
"epoch": 0.4460766146693171,
"grad_norm": 6.34746638921528,
"learning_rate": 6.774039473983243e-07,
"loss": 1.7525,
"step": 6620
},
{
"epoch": 0.4464135305414238,
"grad_norm": 6.139901961547965,
"learning_rate": 6.768540348112906e-07,
"loss": 1.8237,
"step": 6625
},
{
"epoch": 0.44675044641353057,
"grad_norm": 5.66015536542511,
"learning_rate": 6.763038775986842e-07,
"loss": 1.8506,
"step": 6630
},
{
"epoch": 0.4470873622856373,
"grad_norm": 5.9272363359583204,
"learning_rate": 6.757534765214858e-07,
"loss": 1.8595,
"step": 6635
},
{
"epoch": 0.447424278157744,
"grad_norm": 6.161894844697172,
"learning_rate": 6.752028323410134e-07,
"loss": 1.7805,
"step": 6640
},
{
"epoch": 0.44776119402985076,
"grad_norm": 5.84449768884819,
"learning_rate": 6.746519458189214e-07,
"loss": 1.8152,
"step": 6645
},
{
"epoch": 0.4480981099019575,
"grad_norm": 5.657979812571092,
"learning_rate": 6.741008177171993e-07,
"loss": 1.8209,
"step": 6650
},
{
"epoch": 0.4484350257740642,
"grad_norm": 5.944430010944899,
"learning_rate": 6.735494487981711e-07,
"loss": 1.7561,
"step": 6655
},
{
"epoch": 0.44877194164617096,
"grad_norm": 5.520891888250897,
"learning_rate": 6.729978398244935e-07,
"loss": 1.7838,
"step": 6660
},
{
"epoch": 0.44910885751827767,
"grad_norm": 5.996486165077548,
"learning_rate": 6.724459915591551e-07,
"loss": 1.8489,
"step": 6665
},
{
"epoch": 0.44944577339038444,
"grad_norm": 5.733752546274423,
"learning_rate": 6.718939047654763e-07,
"loss": 1.8305,
"step": 6670
},
{
"epoch": 0.44978268926249115,
"grad_norm": 5.820810666342823,
"learning_rate": 6.713415802071064e-07,
"loss": 1.857,
"step": 6675
},
{
"epoch": 0.45011960513459787,
"grad_norm": 5.650158926744362,
"learning_rate": 6.707890186480244e-07,
"loss": 1.8326,
"step": 6680
},
{
"epoch": 0.45045652100670464,
"grad_norm": 5.977907228957739,
"learning_rate": 6.702362208525366e-07,
"loss": 1.7733,
"step": 6685
},
{
"epoch": 0.45079343687881135,
"grad_norm": 5.644462707892311,
"learning_rate": 6.696831875852763e-07,
"loss": 1.7398,
"step": 6690
},
{
"epoch": 0.4511303527509181,
"grad_norm": 5.5200713986530685,
"learning_rate": 6.691299196112025e-07,
"loss": 1.8783,
"step": 6695
},
{
"epoch": 0.45146726862302483,
"grad_norm": 5.829144396241241,
"learning_rate": 6.685764176955991e-07,
"loss": 1.77,
"step": 6700
},
{
"epoch": 0.45180418449513154,
"grad_norm": 5.815077874644452,
"learning_rate": 6.680226826040727e-07,
"loss": 1.8806,
"step": 6705
},
{
"epoch": 0.4521411003672383,
"grad_norm": 6.016668642678767,
"learning_rate": 6.674687151025535e-07,
"loss": 1.7405,
"step": 6710
},
{
"epoch": 0.452478016239345,
"grad_norm": 5.5436173978059315,
"learning_rate": 6.669145159572924e-07,
"loss": 1.8083,
"step": 6715
},
{
"epoch": 0.4528149321114518,
"grad_norm": 5.498353466010969,
"learning_rate": 6.663600859348615e-07,
"loss": 1.8109,
"step": 6720
},
{
"epoch": 0.4531518479835585,
"grad_norm": 5.857461806397999,
"learning_rate": 6.658054258021513e-07,
"loss": 1.8539,
"step": 6725
},
{
"epoch": 0.4534887638556652,
"grad_norm": 5.4613036126443335,
"learning_rate": 6.652505363263712e-07,
"loss": 1.8247,
"step": 6730
},
{
"epoch": 0.453825679727772,
"grad_norm": 6.035072147805387,
"learning_rate": 6.646954182750478e-07,
"loss": 1.7817,
"step": 6735
},
{
"epoch": 0.4541625955998787,
"grad_norm": 5.595146968290168,
"learning_rate": 6.641400724160234e-07,
"loss": 1.7759,
"step": 6740
},
{
"epoch": 0.4544995114719854,
"grad_norm": 5.565775929671658,
"learning_rate": 6.635844995174561e-07,
"loss": 1.8401,
"step": 6745
},
{
"epoch": 0.4548364273440922,
"grad_norm": 6.211211135256073,
"learning_rate": 6.630287003478176e-07,
"loss": 1.852,
"step": 6750
},
{
"epoch": 0.4551733432161989,
"grad_norm": 5.8516613231945,
"learning_rate": 6.624726756758927e-07,
"loss": 1.7437,
"step": 6755
},
{
"epoch": 0.45551025908830567,
"grad_norm": 6.200671363752341,
"learning_rate": 6.619164262707782e-07,
"loss": 1.8437,
"step": 6760
},
{
"epoch": 0.4558471749604124,
"grad_norm": 5.932673768489387,
"learning_rate": 6.613599529018815e-07,
"loss": 1.7869,
"step": 6765
},
{
"epoch": 0.4561840908325191,
"grad_norm": 6.133480058883383,
"learning_rate": 6.608032563389198e-07,
"loss": 1.8157,
"step": 6770
},
{
"epoch": 0.45652100670462586,
"grad_norm": 5.913849561325004,
"learning_rate": 6.602463373519196e-07,
"loss": 1.8171,
"step": 6775
},
{
"epoch": 0.4568579225767326,
"grad_norm": 5.5977482506304215,
"learning_rate": 6.596891967112143e-07,
"loss": 1.7908,
"step": 6780
},
{
"epoch": 0.45719483844883935,
"grad_norm": 5.493955784529846,
"learning_rate": 6.59131835187444e-07,
"loss": 1.7221,
"step": 6785
},
{
"epoch": 0.45753175432094606,
"grad_norm": 6.311459816981047,
"learning_rate": 6.58574253551555e-07,
"loss": 1.8193,
"step": 6790
},
{
"epoch": 0.4578686701930528,
"grad_norm": 6.415344094889377,
"learning_rate": 6.580164525747973e-07,
"loss": 1.7941,
"step": 6795
},
{
"epoch": 0.45820558606515954,
"grad_norm": 6.105511233378877,
"learning_rate": 6.574584330287247e-07,
"loss": 1.8424,
"step": 6800
},
{
"epoch": 0.45854250193726626,
"grad_norm": 5.7943538228072065,
"learning_rate": 6.569001956851932e-07,
"loss": 1.8572,
"step": 6805
},
{
"epoch": 0.458879417809373,
"grad_norm": 5.421853273516655,
"learning_rate": 6.563417413163601e-07,
"loss": 1.8344,
"step": 6810
},
{
"epoch": 0.45921633368147974,
"grad_norm": 5.637838882491133,
"learning_rate": 6.55783070694683e-07,
"loss": 1.8268,
"step": 6815
},
{
"epoch": 0.45955324955358645,
"grad_norm": 5.643321632821095,
"learning_rate": 6.55224184592918e-07,
"loss": 1.8258,
"step": 6820
},
{
"epoch": 0.4598901654256932,
"grad_norm": 5.998748203165365,
"learning_rate": 6.546650837841203e-07,
"loss": 1.8257,
"step": 6825
},
{
"epoch": 0.46022708129779993,
"grad_norm": 5.965057345432399,
"learning_rate": 6.541057690416414e-07,
"loss": 1.7399,
"step": 6830
},
{
"epoch": 0.46056399716990665,
"grad_norm": 6.1720570763462055,
"learning_rate": 6.535462411391284e-07,
"loss": 1.7994,
"step": 6835
},
{
"epoch": 0.4609009130420134,
"grad_norm": 5.86341018032716,
"learning_rate": 6.529865008505244e-07,
"loss": 1.799,
"step": 6840
},
{
"epoch": 0.46123782891412013,
"grad_norm": 6.746892080799635,
"learning_rate": 6.524265489500651e-07,
"loss": 1.8779,
"step": 6845
},
{
"epoch": 0.4615747447862269,
"grad_norm": 5.9098714454502295,
"learning_rate": 6.518663862122794e-07,
"loss": 1.734,
"step": 6850
},
{
"epoch": 0.4619116606583336,
"grad_norm": 5.735456527744707,
"learning_rate": 6.513060134119878e-07,
"loss": 1.7316,
"step": 6855
},
{
"epoch": 0.4622485765304403,
"grad_norm": 5.890116939620567,
"learning_rate": 6.507454313243015e-07,
"loss": 1.7896,
"step": 6860
},
{
"epoch": 0.4625854924025471,
"grad_norm": 5.983532007280158,
"learning_rate": 6.50184640724621e-07,
"loss": 1.8345,
"step": 6865
},
{
"epoch": 0.4629224082746538,
"grad_norm": 6.199632944093907,
"learning_rate": 6.496236423886351e-07,
"loss": 1.8626,
"step": 6870
},
{
"epoch": 0.4632593241467606,
"grad_norm": 6.237427141315285,
"learning_rate": 6.490624370923201e-07,
"loss": 1.8034,
"step": 6875
},
{
"epoch": 0.4635962400188673,
"grad_norm": 5.84748187391112,
"learning_rate": 6.485010256119388e-07,
"loss": 1.7816,
"step": 6880
},
{
"epoch": 0.463933155890974,
"grad_norm": 5.64468712531096,
"learning_rate": 6.479394087240389e-07,
"loss": 1.8375,
"step": 6885
},
{
"epoch": 0.46427007176308077,
"grad_norm": 6.152420602370605,
"learning_rate": 6.473775872054521e-07,
"loss": 1.8895,
"step": 6890
},
{
"epoch": 0.4646069876351875,
"grad_norm": 6.681683712578151,
"learning_rate": 6.468155618332936e-07,
"loss": 1.7815,
"step": 6895
},
{
"epoch": 0.46494390350729425,
"grad_norm": 6.095659229972454,
"learning_rate": 6.462533333849599e-07,
"loss": 1.8292,
"step": 6900
},
{
"epoch": 0.46528081937940097,
"grad_norm": 5.448666114017835,
"learning_rate": 6.456909026381292e-07,
"loss": 1.7838,
"step": 6905
},
{
"epoch": 0.4656177352515077,
"grad_norm": 6.188048616642648,
"learning_rate": 6.451282703707591e-07,
"loss": 1.7777,
"step": 6910
},
{
"epoch": 0.46595465112361445,
"grad_norm": 6.326139726045801,
"learning_rate": 6.445654373610854e-07,
"loss": 1.8323,
"step": 6915
},
{
"epoch": 0.46629156699572116,
"grad_norm": 6.10356320446364,
"learning_rate": 6.440024043876229e-07,
"loss": 1.8562,
"step": 6920
},
{
"epoch": 0.4666284828678279,
"grad_norm": 5.708886402068291,
"learning_rate": 6.434391722291618e-07,
"loss": 1.8621,
"step": 6925
},
{
"epoch": 0.46696539873993465,
"grad_norm": 6.11508671033799,
"learning_rate": 6.428757416647683e-07,
"loss": 1.7965,
"step": 6930
},
{
"epoch": 0.46730231461204136,
"grad_norm": 6.32816603703681,
"learning_rate": 6.42312113473783e-07,
"loss": 1.8077,
"step": 6935
},
{
"epoch": 0.4676392304841481,
"grad_norm": 5.96196567374692,
"learning_rate": 6.417482884358196e-07,
"loss": 1.8217,
"step": 6940
},
{
"epoch": 0.46797614635625484,
"grad_norm": 6.021121675554487,
"learning_rate": 6.411842673307648e-07,
"loss": 1.8155,
"step": 6945
},
{
"epoch": 0.46831306222836155,
"grad_norm": 6.098892855175968,
"learning_rate": 6.406200509387756e-07,
"loss": 1.7478,
"step": 6950
},
{
"epoch": 0.4686499781004683,
"grad_norm": 6.282127245845644,
"learning_rate": 6.400556400402796e-07,
"loss": 1.8861,
"step": 6955
},
{
"epoch": 0.46898689397257504,
"grad_norm": 6.018028715278134,
"learning_rate": 6.394910354159736e-07,
"loss": 1.7715,
"step": 6960
},
{
"epoch": 0.4693238098446818,
"grad_norm": 5.798697336927142,
"learning_rate": 6.389262378468219e-07,
"loss": 1.8519,
"step": 6965
},
{
"epoch": 0.4696607257167885,
"grad_norm": 5.5797942228856705,
"learning_rate": 6.38361248114056e-07,
"loss": 1.7976,
"step": 6970
},
{
"epoch": 0.46999764158889523,
"grad_norm": 5.6996680970531015,
"learning_rate": 6.377960669991733e-07,
"loss": 1.9102,
"step": 6975
},
{
"epoch": 0.470334557461002,
"grad_norm": 6.235143725814519,
"learning_rate": 6.372306952839353e-07,
"loss": 1.8289,
"step": 6980
},
{
"epoch": 0.4706714733331087,
"grad_norm": 5.893001299421735,
"learning_rate": 6.36665133750368e-07,
"loss": 1.8837,
"step": 6985
},
{
"epoch": 0.4710083892052155,
"grad_norm": 5.879461530673223,
"learning_rate": 6.360993831807593e-07,
"loss": 1.7474,
"step": 6990
},
{
"epoch": 0.4713453050773222,
"grad_norm": 6.002930353556234,
"learning_rate": 6.355334443576589e-07,
"loss": 1.8266,
"step": 6995
},
{
"epoch": 0.4716822209494289,
"grad_norm": 5.58752845843444,
"learning_rate": 6.349673180638769e-07,
"loss": 1.7796,
"step": 7000
},
{
"epoch": 0.4720191368215357,
"grad_norm": 6.352723281095035,
"learning_rate": 6.344010050824824e-07,
"loss": 1.8354,
"step": 7005
},
{
"epoch": 0.4723560526936424,
"grad_norm": 5.846163318879495,
"learning_rate": 6.338345061968032e-07,
"loss": 1.8195,
"step": 7010
},
{
"epoch": 0.4726929685657491,
"grad_norm": 5.884896796215211,
"learning_rate": 6.33267822190424e-07,
"loss": 1.7535,
"step": 7015
},
{
"epoch": 0.4730298844378559,
"grad_norm": 5.774564823273729,
"learning_rate": 6.327009538471853e-07,
"loss": 1.8374,
"step": 7020
},
{
"epoch": 0.4733668003099626,
"grad_norm": 6.0627901365864565,
"learning_rate": 6.321339019511828e-07,
"loss": 1.8027,
"step": 7025
},
{
"epoch": 0.47370371618206936,
"grad_norm": 5.788578834075243,
"learning_rate": 6.315666672867664e-07,
"loss": 1.8547,
"step": 7030
},
{
"epoch": 0.47404063205417607,
"grad_norm": 6.177124253642273,
"learning_rate": 6.309992506385385e-07,
"loss": 1.7444,
"step": 7035
},
{
"epoch": 0.4743775479262828,
"grad_norm": 5.982361591747417,
"learning_rate": 6.304316527913531e-07,
"loss": 1.7825,
"step": 7040
},
{
"epoch": 0.47471446379838955,
"grad_norm": 6.061226999093581,
"learning_rate": 6.29863874530315e-07,
"loss": 1.7663,
"step": 7045
},
{
"epoch": 0.47505137967049627,
"grad_norm": 6.222315463271125,
"learning_rate": 6.292959166407785e-07,
"loss": 1.8639,
"step": 7050
},
{
"epoch": 0.47538829554260303,
"grad_norm": 6.34970457479949,
"learning_rate": 6.287277799083466e-07,
"loss": 1.8037,
"step": 7055
},
{
"epoch": 0.47572521141470975,
"grad_norm": 6.03519967937727,
"learning_rate": 6.281594651188693e-07,
"loss": 1.7412,
"step": 7060
},
{
"epoch": 0.47606212728681646,
"grad_norm": 5.751601424455161,
"learning_rate": 6.275909730584431e-07,
"loss": 1.7887,
"step": 7065
},
{
"epoch": 0.47639904315892323,
"grad_norm": 6.115573647645315,
"learning_rate": 6.270223045134095e-07,
"loss": 1.8296,
"step": 7070
},
{
"epoch": 0.47673595903102994,
"grad_norm": 6.074604130360429,
"learning_rate": 6.264534602703546e-07,
"loss": 1.7821,
"step": 7075
},
{
"epoch": 0.4770728749031367,
"grad_norm": 6.424918913746082,
"learning_rate": 6.25884441116107e-07,
"loss": 1.865,
"step": 7080
},
{
"epoch": 0.4774097907752434,
"grad_norm": 5.845730128232722,
"learning_rate": 6.253152478377375e-07,
"loss": 1.797,
"step": 7085
},
{
"epoch": 0.47774670664735014,
"grad_norm": 5.957759182701193,
"learning_rate": 6.247458812225576e-07,
"loss": 1.8609,
"step": 7090
},
{
"epoch": 0.4780836225194569,
"grad_norm": 5.921689289922398,
"learning_rate": 6.241763420581188e-07,
"loss": 1.8602,
"step": 7095
},
{
"epoch": 0.4784205383915636,
"grad_norm": 5.610549553444218,
"learning_rate": 6.23606631132211e-07,
"loss": 1.8074,
"step": 7100
},
{
"epoch": 0.47875745426367033,
"grad_norm": 5.877721261294871,
"learning_rate": 6.23036749232862e-07,
"loss": 1.8379,
"step": 7105
},
{
"epoch": 0.4790943701357771,
"grad_norm": 6.284830816460788,
"learning_rate": 6.224666971483355e-07,
"loss": 1.7489,
"step": 7110
},
{
"epoch": 0.4794312860078838,
"grad_norm": 6.054222103634587,
"learning_rate": 6.218964756671315e-07,
"loss": 1.783,
"step": 7115
},
{
"epoch": 0.4797682018799906,
"grad_norm": 6.127182510154835,
"learning_rate": 6.213260855779834e-07,
"loss": 1.8093,
"step": 7120
},
{
"epoch": 0.4801051177520973,
"grad_norm": 5.902080284141082,
"learning_rate": 6.207555276698584e-07,
"loss": 1.8552,
"step": 7125
},
{
"epoch": 0.480442033624204,
"grad_norm": 5.945336485857451,
"learning_rate": 6.201848027319556e-07,
"loss": 1.8258,
"step": 7130
},
{
"epoch": 0.4807789494963108,
"grad_norm": 5.888205608422822,
"learning_rate": 6.196139115537054e-07,
"loss": 1.7935,
"step": 7135
},
{
"epoch": 0.4811158653684175,
"grad_norm": 6.233570018542256,
"learning_rate": 6.190428549247677e-07,
"loss": 1.8432,
"step": 7140
},
{
"epoch": 0.48145278124052426,
"grad_norm": 5.755763880495322,
"learning_rate": 6.184716336350316e-07,
"loss": 1.8871,
"step": 7145
},
{
"epoch": 0.481789697112631,
"grad_norm": 6.524975298684033,
"learning_rate": 6.179002484746137e-07,
"loss": 1.838,
"step": 7150
},
{
"epoch": 0.4821266129847377,
"grad_norm": 5.819047158294165,
"learning_rate": 6.173287002338577e-07,
"loss": 1.8088,
"step": 7155
},
{
"epoch": 0.48246352885684446,
"grad_norm": 5.975474876362582,
"learning_rate": 6.167569897033322e-07,
"loss": 1.8079,
"step": 7160
},
{
"epoch": 0.4828004447289512,
"grad_norm": 5.71735739384533,
"learning_rate": 6.16185117673831e-07,
"loss": 1.8762,
"step": 7165
},
{
"epoch": 0.48313736060105794,
"grad_norm": 6.123157625142532,
"learning_rate": 6.15613084936371e-07,
"loss": 1.8784,
"step": 7170
},
{
"epoch": 0.48347427647316465,
"grad_norm": 5.767255419265751,
"learning_rate": 6.150408922821911e-07,
"loss": 1.8505,
"step": 7175
},
{
"epoch": 0.48381119234527137,
"grad_norm": 6.395606763447789,
"learning_rate": 6.144685405027518e-07,
"loss": 1.8165,
"step": 7180
},
{
"epoch": 0.48414810821737814,
"grad_norm": 5.917857732608529,
"learning_rate": 6.138960303897335e-07,
"loss": 1.8577,
"step": 7185
},
{
"epoch": 0.48448502408948485,
"grad_norm": 6.1936529585107465,
"learning_rate": 6.133233627350355e-07,
"loss": 1.7746,
"step": 7190
},
{
"epoch": 0.48482193996159156,
"grad_norm": 5.564280808926514,
"learning_rate": 6.127505383307754e-07,
"loss": 1.7958,
"step": 7195
},
{
"epoch": 0.48515885583369833,
"grad_norm": 5.747234003023109,
"learning_rate": 6.121775579692873e-07,
"loss": 1.8808,
"step": 7200
},
{
"epoch": 0.48549577170580505,
"grad_norm": 6.451438702965074,
"learning_rate": 6.116044224431212e-07,
"loss": 1.8364,
"step": 7205
},
{
"epoch": 0.4858326875779118,
"grad_norm": 6.027079769431342,
"learning_rate": 6.110311325450416e-07,
"loss": 1.74,
"step": 7210
},
{
"epoch": 0.48616960345001853,
"grad_norm": 6.240032794806677,
"learning_rate": 6.104576890680263e-07,
"loss": 1.8064,
"step": 7215
},
{
"epoch": 0.48650651932212524,
"grad_norm": 6.510974143301528,
"learning_rate": 6.098840928052663e-07,
"loss": 1.8691,
"step": 7220
},
{
"epoch": 0.486843435194232,
"grad_norm": 6.165787234194551,
"learning_rate": 6.093103445501629e-07,
"loss": 1.7635,
"step": 7225
},
{
"epoch": 0.4871803510663387,
"grad_norm": 6.3112427035306595,
"learning_rate": 6.087364450963286e-07,
"loss": 1.8033,
"step": 7230
},
{
"epoch": 0.4875172669384455,
"grad_norm": 6.162132319624708,
"learning_rate": 6.081623952375843e-07,
"loss": 1.8591,
"step": 7235
},
{
"epoch": 0.4878541828105522,
"grad_norm": 6.168167615327447,
"learning_rate": 6.075881957679593e-07,
"loss": 1.8161,
"step": 7240
},
{
"epoch": 0.4881910986826589,
"grad_norm": 6.532602770115309,
"learning_rate": 6.0701384748169e-07,
"loss": 1.8449,
"step": 7245
},
{
"epoch": 0.4885280145547657,
"grad_norm": 6.211021003395661,
"learning_rate": 6.064393511732181e-07,
"loss": 1.8488,
"step": 7250
},
{
"epoch": 0.4888649304268724,
"grad_norm": 6.206539533335064,
"learning_rate": 6.058647076371906e-07,
"loss": 1.8284,
"step": 7255
},
{
"epoch": 0.48920184629897917,
"grad_norm": 6.036174190363687,
"learning_rate": 6.052899176684579e-07,
"loss": 1.7717,
"step": 7260
},
{
"epoch": 0.4895387621710859,
"grad_norm": 6.047303397207279,
"learning_rate": 6.047149820620729e-07,
"loss": 1.7952,
"step": 7265
},
{
"epoch": 0.4898756780431926,
"grad_norm": 6.07327877713165,
"learning_rate": 6.0413990161329e-07,
"loss": 1.8041,
"step": 7270
},
{
"epoch": 0.49021259391529937,
"grad_norm": 5.821598916540012,
"learning_rate": 6.035646771175642e-07,
"loss": 1.8248,
"step": 7275
},
{
"epoch": 0.4905495097874061,
"grad_norm": 6.079050090689548,
"learning_rate": 6.029893093705491e-07,
"loss": 1.8119,
"step": 7280
},
{
"epoch": 0.4908864256595128,
"grad_norm": 5.628298752995402,
"learning_rate": 6.024137991680973e-07,
"loss": 1.7406,
"step": 7285
},
{
"epoch": 0.49122334153161956,
"grad_norm": 5.732384094969437,
"learning_rate": 6.018381473062575e-07,
"loss": 1.8597,
"step": 7290
},
{
"epoch": 0.4915602574037263,
"grad_norm": 6.352928731604198,
"learning_rate": 6.012623545812754e-07,
"loss": 1.7484,
"step": 7295
},
{
"epoch": 0.49189717327583304,
"grad_norm": 6.042895881529252,
"learning_rate": 6.006864217895906e-07,
"loss": 1.8524,
"step": 7300
},
{
"epoch": 0.49223408914793976,
"grad_norm": 5.7435529216099654,
"learning_rate": 6.001103497278369e-07,
"loss": 1.8325,
"step": 7305
},
{
"epoch": 0.49257100502004647,
"grad_norm": 6.724766787405004,
"learning_rate": 5.995341391928408e-07,
"loss": 1.8231,
"step": 7310
},
{
"epoch": 0.49290792089215324,
"grad_norm": 5.889020383079155,
"learning_rate": 5.9895779098162e-07,
"loss": 1.8216,
"step": 7315
},
{
"epoch": 0.49324483676425995,
"grad_norm": 5.854780725466525,
"learning_rate": 5.983813058913829e-07,
"loss": 1.7734,
"step": 7320
},
{
"epoch": 0.4935817526363667,
"grad_norm": 5.453517656480012,
"learning_rate": 5.978046847195272e-07,
"loss": 1.8492,
"step": 7325
},
{
"epoch": 0.49391866850847344,
"grad_norm": 6.459742174870257,
"learning_rate": 5.97227928263639e-07,
"loss": 1.8713,
"step": 7330
},
{
"epoch": 0.49425558438058015,
"grad_norm": 5.857832814151433,
"learning_rate": 5.96651037321491e-07,
"loss": 1.8556,
"step": 7335
},
{
"epoch": 0.4945925002526869,
"grad_norm": 5.596533367239473,
"learning_rate": 5.960740126910425e-07,
"loss": 1.7902,
"step": 7340
},
{
"epoch": 0.49492941612479363,
"grad_norm": 6.226649318452835,
"learning_rate": 5.954968551704373e-07,
"loss": 1.8362,
"step": 7345
},
{
"epoch": 0.4952663319969004,
"grad_norm": 6.110264934214935,
"learning_rate": 5.949195655580032e-07,
"loss": 1.8022,
"step": 7350
},
{
"epoch": 0.4956032478690071,
"grad_norm": 6.618903452310688,
"learning_rate": 5.943421446522509e-07,
"loss": 1.8546,
"step": 7355
},
{
"epoch": 0.4959401637411138,
"grad_norm": 6.149369987083455,
"learning_rate": 5.93764593251872e-07,
"loss": 1.7979,
"step": 7360
},
{
"epoch": 0.4962770796132206,
"grad_norm": 6.8812512541470685,
"learning_rate": 5.931869121557397e-07,
"loss": 1.8307,
"step": 7365
},
{
"epoch": 0.4966139954853273,
"grad_norm": 6.040974955829878,
"learning_rate": 5.926091021629055e-07,
"loss": 1.7826,
"step": 7370
},
{
"epoch": 0.496950911357434,
"grad_norm": 5.937707008894662,
"learning_rate": 5.920311640726e-07,
"loss": 1.7873,
"step": 7375
},
{
"epoch": 0.4972878272295408,
"grad_norm": 6.470436151813095,
"learning_rate": 5.914530986842307e-07,
"loss": 1.8437,
"step": 7380
},
{
"epoch": 0.4976247431016475,
"grad_norm": 6.3141277294968186,
"learning_rate": 5.908749067973809e-07,
"loss": 1.8271,
"step": 7385
},
{
"epoch": 0.4979616589737543,
"grad_norm": 5.903283938829289,
"learning_rate": 5.902965892118093e-07,
"loss": 1.8597,
"step": 7390
},
{
"epoch": 0.498298574845861,
"grad_norm": 5.773979245288634,
"learning_rate": 5.89718146727448e-07,
"loss": 1.8186,
"step": 7395
},
{
"epoch": 0.4986354907179677,
"grad_norm": 5.905823787046488,
"learning_rate": 5.891395801444026e-07,
"loss": 1.8116,
"step": 7400
},
{
"epoch": 0.49897240659007447,
"grad_norm": 6.486666074459213,
"learning_rate": 5.885608902629496e-07,
"loss": 1.7862,
"step": 7405
},
{
"epoch": 0.4993093224621812,
"grad_norm": 6.126712963890818,
"learning_rate": 5.879820778835364e-07,
"loss": 1.8475,
"step": 7410
},
{
"epoch": 0.49964623833428795,
"grad_norm": 6.769072318754673,
"learning_rate": 5.874031438067799e-07,
"loss": 1.8015,
"step": 7415
},
{
"epoch": 0.49998315420639466,
"grad_norm": 6.260585391210161,
"learning_rate": 5.868240888334652e-07,
"loss": 1.8085,
"step": 7420
},
{
"epoch": 0.5003200700785014,
"grad_norm": 6.346729595046292,
"learning_rate": 5.862449137645444e-07,
"loss": 1.8223,
"step": 7425
},
{
"epoch": 0.5006569859506081,
"grad_norm": 5.907134243732254,
"learning_rate": 5.856656194011365e-07,
"loss": 1.7711,
"step": 7430
},
{
"epoch": 0.5009939018227149,
"grad_norm": 6.063402006959808,
"learning_rate": 5.850862065445243e-07,
"loss": 1.7685,
"step": 7435
},
{
"epoch": 0.5013308176948216,
"grad_norm": 5.890743753183375,
"learning_rate": 5.845066759961557e-07,
"loss": 1.8192,
"step": 7440
},
{
"epoch": 0.5016677335669283,
"grad_norm": 5.660746063214769,
"learning_rate": 5.839270285576407e-07,
"loss": 1.8722,
"step": 7445
},
{
"epoch": 0.5020046494390351,
"grad_norm": 5.733382666328982,
"learning_rate": 5.833472650307509e-07,
"loss": 1.7618,
"step": 7450
},
{
"epoch": 0.5023415653111418,
"grad_norm": 5.958776596261261,
"learning_rate": 5.827673862174192e-07,
"loss": 1.873,
"step": 7455
},
{
"epoch": 0.5026784811832485,
"grad_norm": 5.773391012038019,
"learning_rate": 5.821873929197371e-07,
"loss": 1.8226,
"step": 7460
},
{
"epoch": 0.5030153970553553,
"grad_norm": 6.876209726907723,
"learning_rate": 5.81607285939955e-07,
"loss": 1.7926,
"step": 7465
},
{
"epoch": 0.503352312927462,
"grad_norm": 6.83351949530394,
"learning_rate": 5.810270660804805e-07,
"loss": 1.8214,
"step": 7470
},
{
"epoch": 0.5036892287995688,
"grad_norm": 5.695338110099444,
"learning_rate": 5.80446734143877e-07,
"loss": 1.7878,
"step": 7475
},
{
"epoch": 0.5040261446716755,
"grad_norm": 6.321842315715411,
"learning_rate": 5.798662909328633e-07,
"loss": 1.7732,
"step": 7480
},
{
"epoch": 0.5043630605437822,
"grad_norm": 5.8003219342595695,
"learning_rate": 5.792857372503119e-07,
"loss": 1.8207,
"step": 7485
},
{
"epoch": 0.5046999764158889,
"grad_norm": 6.206954096469576,
"learning_rate": 5.787050738992481e-07,
"loss": 1.8191,
"step": 7490
},
{
"epoch": 0.5050368922879956,
"grad_norm": 6.3174541848095345,
"learning_rate": 5.781243016828492e-07,
"loss": 1.8286,
"step": 7495
},
{
"epoch": 0.5053738081601025,
"grad_norm": 5.943268852990081,
"learning_rate": 5.775434214044427e-07,
"loss": 1.8347,
"step": 7500
},
{
"epoch": 0.5057107240322092,
"grad_norm": 6.366552386945912,
"learning_rate": 5.769624338675057e-07,
"loss": 1.8694,
"step": 7505
},
{
"epoch": 0.5060476399043159,
"grad_norm": 6.09106660535801,
"learning_rate": 5.763813398756637e-07,
"loss": 1.8571,
"step": 7510
},
{
"epoch": 0.5063845557764226,
"grad_norm": 5.927983640648523,
"learning_rate": 5.758001402326895e-07,
"loss": 1.8495,
"step": 7515
},
{
"epoch": 0.5067214716485293,
"grad_norm": 6.347023539008799,
"learning_rate": 5.752188357425019e-07,
"loss": 1.8598,
"step": 7520
},
{
"epoch": 0.5070583875206361,
"grad_norm": 5.7220954371710535,
"learning_rate": 5.746374272091648e-07,
"loss": 1.9033,
"step": 7525
},
{
"epoch": 0.5073953033927429,
"grad_norm": 5.716357729963673,
"learning_rate": 5.74055915436886e-07,
"loss": 1.8422,
"step": 7530
},
{
"epoch": 0.5077322192648496,
"grad_norm": 5.888138683912625,
"learning_rate": 5.734743012300162e-07,
"loss": 1.7706,
"step": 7535
},
{
"epoch": 0.5080691351369563,
"grad_norm": 6.074956239497393,
"learning_rate": 5.728925853930475e-07,
"loss": 1.7751,
"step": 7540
},
{
"epoch": 0.508406051009063,
"grad_norm": 6.30224985754695,
"learning_rate": 5.72310768730613e-07,
"loss": 1.8185,
"step": 7545
},
{
"epoch": 0.5087429668811698,
"grad_norm": 6.273905491547863,
"learning_rate": 5.717288520474849e-07,
"loss": 1.8798,
"step": 7550
},
{
"epoch": 0.5090798827532765,
"grad_norm": 5.877838326642534,
"learning_rate": 5.711468361485739e-07,
"loss": 1.798,
"step": 7555
},
{
"epoch": 0.5094167986253832,
"grad_norm": 5.551724826009454,
"learning_rate": 5.70564721838928e-07,
"loss": 1.8133,
"step": 7560
},
{
"epoch": 0.50975371449749,
"grad_norm": 6.386798899011169,
"learning_rate": 5.69982509923731e-07,
"loss": 1.8332,
"step": 7565
},
{
"epoch": 0.5100906303695967,
"grad_norm": 6.417824627395978,
"learning_rate": 5.694002012083022e-07,
"loss": 1.8929,
"step": 7570
},
{
"epoch": 0.5104275462417035,
"grad_norm": 5.731757336532826,
"learning_rate": 5.688177964980946e-07,
"loss": 1.7638,
"step": 7575
},
{
"epoch": 0.5107644621138102,
"grad_norm": 5.918225192432386,
"learning_rate": 5.682352965986935e-07,
"loss": 1.7946,
"step": 7580
},
{
"epoch": 0.5111013779859169,
"grad_norm": 5.849000801521007,
"learning_rate": 5.676527023158169e-07,
"loss": 1.8941,
"step": 7585
},
{
"epoch": 0.5114382938580236,
"grad_norm": 6.564241414658141,
"learning_rate": 5.670700144553122e-07,
"loss": 1.8329,
"step": 7590
},
{
"epoch": 0.5117752097301304,
"grad_norm": 6.08295990900783,
"learning_rate": 5.664872338231571e-07,
"loss": 1.8346,
"step": 7595
},
{
"epoch": 0.5121121256022371,
"grad_norm": 5.805622916909698,
"learning_rate": 5.659043612254573e-07,
"loss": 1.8188,
"step": 7600
},
{
"epoch": 0.5124490414743439,
"grad_norm": 6.197266946989277,
"learning_rate": 5.653213974684455e-07,
"loss": 1.7496,
"step": 7605
},
{
"epoch": 0.5127859573464506,
"grad_norm": 5.924833947325567,
"learning_rate": 5.647383433584807e-07,
"loss": 1.7737,
"step": 7610
},
{
"epoch": 0.5131228732185573,
"grad_norm": 6.03776739730078,
"learning_rate": 5.641551997020472e-07,
"loss": 1.8332,
"step": 7615
},
{
"epoch": 0.513459789090664,
"grad_norm": 6.1153821883699955,
"learning_rate": 5.635719673057524e-07,
"loss": 1.8323,
"step": 7620
},
{
"epoch": 0.5137967049627707,
"grad_norm": 5.832380420575743,
"learning_rate": 5.629886469763273e-07,
"loss": 1.7999,
"step": 7625
},
{
"epoch": 0.5141336208348776,
"grad_norm": 5.859611583687684,
"learning_rate": 5.624052395206239e-07,
"loss": 1.8414,
"step": 7630
},
{
"epoch": 0.5144705367069843,
"grad_norm": 6.172500864509078,
"learning_rate": 5.618217457456151e-07,
"loss": 1.7948,
"step": 7635
},
{
"epoch": 0.514807452579091,
"grad_norm": 5.9844119060841825,
"learning_rate": 5.612381664583928e-07,
"loss": 1.8405,
"step": 7640
},
{
"epoch": 0.5151443684511977,
"grad_norm": 5.884734313121189,
"learning_rate": 5.606545024661674e-07,
"loss": 1.8402,
"step": 7645
},
{
"epoch": 0.5154812843233044,
"grad_norm": 5.697399014618891,
"learning_rate": 5.600707545762667e-07,
"loss": 1.8065,
"step": 7650
},
{
"epoch": 0.5158182001954112,
"grad_norm": 5.964664311558702,
"learning_rate": 5.594869235961342e-07,
"loss": 1.849,
"step": 7655
},
{
"epoch": 0.516155116067518,
"grad_norm": 5.833081179358123,
"learning_rate": 5.589030103333282e-07,
"loss": 1.8678,
"step": 7660
},
{
"epoch": 0.5164920319396247,
"grad_norm": 5.943839246531296,
"learning_rate": 5.583190155955215e-07,
"loss": 1.7551,
"step": 7665
},
{
"epoch": 0.5168289478117314,
"grad_norm": 6.184069754194008,
"learning_rate": 5.57734940190499e-07,
"loss": 1.779,
"step": 7670
},
{
"epoch": 0.5171658636838381,
"grad_norm": 5.988286442003116,
"learning_rate": 5.571507849261572e-07,
"loss": 1.7786,
"step": 7675
},
{
"epoch": 0.5175027795559449,
"grad_norm": 6.104875157714636,
"learning_rate": 5.565665506105035e-07,
"loss": 1.7767,
"step": 7680
},
{
"epoch": 0.5178396954280516,
"grad_norm": 5.992583843253686,
"learning_rate": 5.559822380516539e-07,
"loss": 1.8409,
"step": 7685
},
{
"epoch": 0.5181766113001584,
"grad_norm": 6.788235374453657,
"learning_rate": 5.553978480578335e-07,
"loss": 1.797,
"step": 7690
},
{
"epoch": 0.5185135271722651,
"grad_norm": 5.480013162835297,
"learning_rate": 5.548133814373738e-07,
"loss": 1.8014,
"step": 7695
},
{
"epoch": 0.5188504430443718,
"grad_norm": 6.2047091655648074,
"learning_rate": 5.542288389987128e-07,
"loss": 1.8545,
"step": 7700
},
{
"epoch": 0.5191873589164786,
"grad_norm": 5.749872893831946,
"learning_rate": 5.536442215503929e-07,
"loss": 1.8309,
"step": 7705
},
{
"epoch": 0.5195242747885853,
"grad_norm": 6.013603014933779,
"learning_rate": 5.530595299010606e-07,
"loss": 1.7978,
"step": 7710
},
{
"epoch": 0.519861190660692,
"grad_norm": 6.665749598193633,
"learning_rate": 5.524747648594651e-07,
"loss": 1.8907,
"step": 7715
},
{
"epoch": 0.5201981065327987,
"grad_norm": 5.806706969722841,
"learning_rate": 5.518899272344568e-07,
"loss": 1.8127,
"step": 7720
},
{
"epoch": 0.5205350224049055,
"grad_norm": 5.8100736277669816,
"learning_rate": 5.513050178349866e-07,
"loss": 1.8411,
"step": 7725
},
{
"epoch": 0.5208719382770123,
"grad_norm": 6.2881327818412815,
"learning_rate": 5.507200374701048e-07,
"loss": 1.8624,
"step": 7730
},
{
"epoch": 0.521208854149119,
"grad_norm": 5.809423166099506,
"learning_rate": 5.501349869489596e-07,
"loss": 1.897,
"step": 7735
},
{
"epoch": 0.5215457700212257,
"grad_norm": 6.093243605312457,
"learning_rate": 5.495498670807967e-07,
"loss": 1.812,
"step": 7740
},
{
"epoch": 0.5218826858933324,
"grad_norm": 6.223494857280929,
"learning_rate": 5.489646786749574e-07,
"loss": 1.8101,
"step": 7745
},
{
"epoch": 0.5222196017654391,
"grad_norm": 6.071573078294997,
"learning_rate": 5.483794225408777e-07,
"loss": 1.8178,
"step": 7750
},
{
"epoch": 0.522556517637546,
"grad_norm": 5.693615324631069,
"learning_rate": 5.477940994880877e-07,
"loss": 1.6893,
"step": 7755
},
{
"epoch": 0.5228934335096527,
"grad_norm": 5.909887730062095,
"learning_rate": 5.472087103262094e-07,
"loss": 1.8432,
"step": 7760
},
{
"epoch": 0.5232303493817594,
"grad_norm": 5.870756025959482,
"learning_rate": 5.46623255864957e-07,
"loss": 1.819,
"step": 7765
},
{
"epoch": 0.5235672652538661,
"grad_norm": 5.649860781638532,
"learning_rate": 5.460377369141345e-07,
"loss": 1.8674,
"step": 7770
},
{
"epoch": 0.5239041811259728,
"grad_norm": 6.094869124377087,
"learning_rate": 5.454521542836351e-07,
"loss": 1.7859,
"step": 7775
},
{
"epoch": 0.5242410969980795,
"grad_norm": 6.4473743410813285,
"learning_rate": 5.448665087834405e-07,
"loss": 1.7703,
"step": 7780
},
{
"epoch": 0.5245780128701863,
"grad_norm": 6.560011859707494,
"learning_rate": 5.442808012236192e-07,
"loss": 1.7882,
"step": 7785
},
{
"epoch": 0.5249149287422931,
"grad_norm": 5.970137427490027,
"learning_rate": 5.436950324143251e-07,
"loss": 1.8465,
"step": 7790
},
{
"epoch": 0.5252518446143998,
"grad_norm": 5.925088761392246,
"learning_rate": 5.431092031657973e-07,
"loss": 1.7958,
"step": 7795
},
{
"epoch": 0.5255887604865065,
"grad_norm": 5.802514268517773,
"learning_rate": 5.425233142883585e-07,
"loss": 1.8614,
"step": 7800
},
{
"epoch": 0.5259256763586132,
"grad_norm": 5.855082730998635,
"learning_rate": 5.419373665924136e-07,
"loss": 1.7962,
"step": 7805
},
{
"epoch": 0.52626259223072,
"grad_norm": 5.429074232700685,
"learning_rate": 5.413513608884491e-07,
"loss": 1.8338,
"step": 7810
},
{
"epoch": 0.5265995081028267,
"grad_norm": 5.6742030630773055,
"learning_rate": 5.407652979870315e-07,
"loss": 1.8029,
"step": 7815
},
{
"epoch": 0.5269364239749335,
"grad_norm": 5.908502877639631,
"learning_rate": 5.401791786988068e-07,
"loss": 1.8193,
"step": 7820
},
{
"epoch": 0.5272733398470402,
"grad_norm": 5.524188749937658,
"learning_rate": 5.395930038344986e-07,
"loss": 1.8287,
"step": 7825
},
{
"epoch": 0.5276102557191469,
"grad_norm": 5.966495919240401,
"learning_rate": 5.390067742049073e-07,
"loss": 1.7747,
"step": 7830
},
{
"epoch": 0.5279471715912537,
"grad_norm": 6.028530102625983,
"learning_rate": 5.384204906209097e-07,
"loss": 1.7576,
"step": 7835
},
{
"epoch": 0.5282840874633604,
"grad_norm": 5.612125576553678,
"learning_rate": 5.378341538934566e-07,
"loss": 1.7362,
"step": 7840
},
{
"epoch": 0.5286210033354671,
"grad_norm": 6.444931841113274,
"learning_rate": 5.372477648335725e-07,
"loss": 1.8254,
"step": 7845
},
{
"epoch": 0.5289579192075738,
"grad_norm": 6.079946107054406,
"learning_rate": 5.366613242523544e-07,
"loss": 1.799,
"step": 7850
},
{
"epoch": 0.5292948350796806,
"grad_norm": 5.9372575273559365,
"learning_rate": 5.360748329609702e-07,
"loss": 1.8219,
"step": 7855
},
{
"epoch": 0.5296317509517874,
"grad_norm": 5.73892160169138,
"learning_rate": 5.354882917706586e-07,
"loss": 1.8598,
"step": 7860
},
{
"epoch": 0.5299686668238941,
"grad_norm": 6.080503146116818,
"learning_rate": 5.349017014927267e-07,
"loss": 1.8695,
"step": 7865
},
{
"epoch": 0.5303055826960008,
"grad_norm": 5.624633270336337,
"learning_rate": 5.343150629385496e-07,
"loss": 1.8171,
"step": 7870
},
{
"epoch": 0.5306424985681075,
"grad_norm": 6.59293279441881,
"learning_rate": 5.337283769195696e-07,
"loss": 1.7082,
"step": 7875
},
{
"epoch": 0.5309794144402142,
"grad_norm": 6.030473422245009,
"learning_rate": 5.331416442472941e-07,
"loss": 1.7242,
"step": 7880
},
{
"epoch": 0.5313163303123211,
"grad_norm": 6.394202607617881,
"learning_rate": 5.325548657332956e-07,
"loss": 1.7612,
"step": 7885
},
{
"epoch": 0.5316532461844278,
"grad_norm": 5.85821715132492,
"learning_rate": 5.319680421892095e-07,
"loss": 1.7571,
"step": 7890
},
{
"epoch": 0.5319901620565345,
"grad_norm": 6.318447507037375,
"learning_rate": 5.313811744267336e-07,
"loss": 1.8143,
"step": 7895
},
{
"epoch": 0.5323270779286412,
"grad_norm": 6.31265305059435,
"learning_rate": 5.30794263257627e-07,
"loss": 1.8149,
"step": 7900
},
{
"epoch": 0.5326639938007479,
"grad_norm": 5.780708946571393,
"learning_rate": 5.302073094937089e-07,
"loss": 1.7723,
"step": 7905
},
{
"epoch": 0.5330009096728547,
"grad_norm": 5.855290550087316,
"learning_rate": 5.296203139468571e-07,
"loss": 1.8075,
"step": 7910
},
{
"epoch": 0.5333378255449615,
"grad_norm": 5.943977721142977,
"learning_rate": 5.290332774290077e-07,
"loss": 1.7952,
"step": 7915
},
{
"epoch": 0.5336747414170682,
"grad_norm": 6.036595542010709,
"learning_rate": 5.284462007521528e-07,
"loss": 1.8388,
"step": 7920
},
{
"epoch": 0.5340116572891749,
"grad_norm": 5.683505340786709,
"learning_rate": 5.278590847283407e-07,
"loss": 1.7796,
"step": 7925
},
{
"epoch": 0.5343485731612816,
"grad_norm": 5.803826687134672,
"learning_rate": 5.27271930169674e-07,
"loss": 1.7972,
"step": 7930
},
{
"epoch": 0.5346854890333884,
"grad_norm": 6.286942676524797,
"learning_rate": 5.266847378883079e-07,
"loss": 1.8071,
"step": 7935
},
{
"epoch": 0.5350224049054951,
"grad_norm": 6.13011871952327,
"learning_rate": 5.260975086964507e-07,
"loss": 1.828,
"step": 7940
},
{
"epoch": 0.5353593207776018,
"grad_norm": 6.465750035776844,
"learning_rate": 5.255102434063612e-07,
"loss": 1.8118,
"step": 7945
},
{
"epoch": 0.5356962366497086,
"grad_norm": 6.173065557365732,
"learning_rate": 5.249229428303486e-07,
"loss": 1.8468,
"step": 7950
},
{
"epoch": 0.5360331525218153,
"grad_norm": 6.419517709100149,
"learning_rate": 5.243356077807704e-07,
"loss": 1.8322,
"step": 7955
},
{
"epoch": 0.536370068393922,
"grad_norm": 5.558598585066808,
"learning_rate": 5.237482390700319e-07,
"loss": 1.8718,
"step": 7960
},
{
"epoch": 0.5367069842660288,
"grad_norm": 5.719795797419367,
"learning_rate": 5.231608375105852e-07,
"loss": 1.7335,
"step": 7965
},
{
"epoch": 0.5370439001381355,
"grad_norm": 5.539968021502532,
"learning_rate": 5.225734039149277e-07,
"loss": 1.8108,
"step": 7970
},
{
"epoch": 0.5373808160102422,
"grad_norm": 5.609804958042693,
"learning_rate": 5.219859390956012e-07,
"loss": 1.7208,
"step": 7975
},
{
"epoch": 0.537717731882349,
"grad_norm": 6.023459731644414,
"learning_rate": 5.213984438651904e-07,
"loss": 1.7977,
"step": 7980
},
{
"epoch": 0.5380546477544557,
"grad_norm": 6.35720441432873,
"learning_rate": 5.208109190363222e-07,
"loss": 1.8483,
"step": 7985
},
{
"epoch": 0.5383915636265625,
"grad_norm": 6.258031001258222,
"learning_rate": 5.202233654216649e-07,
"loss": 1.8099,
"step": 7990
},
{
"epoch": 0.5387284794986692,
"grad_norm": 6.416922084645679,
"learning_rate": 5.196357838339259e-07,
"loss": 1.7425,
"step": 7995
},
{
"epoch": 0.5390653953707759,
"grad_norm": 5.6567156518400346,
"learning_rate": 5.190481750858516e-07,
"loss": 1.8003,
"step": 8000
},
{
"epoch": 0.5394023112428826,
"grad_norm": 6.3126246059511,
"learning_rate": 5.184605399902262e-07,
"loss": 1.8061,
"step": 8005
},
{
"epoch": 0.5397392271149893,
"grad_norm": 5.56295908767564,
"learning_rate": 5.178728793598699e-07,
"loss": 1.8358,
"step": 8010
},
{
"epoch": 0.5400761429870962,
"grad_norm": 6.186318540703496,
"learning_rate": 5.172851940076387e-07,
"loss": 1.8142,
"step": 8015
},
{
"epoch": 0.5404130588592029,
"grad_norm": 6.2506892198636255,
"learning_rate": 5.166974847464223e-07,
"loss": 1.8905,
"step": 8020
},
{
"epoch": 0.5407499747313096,
"grad_norm": 6.313235579462093,
"learning_rate": 5.161097523891437e-07,
"loss": 1.8203,
"step": 8025
},
{
"epoch": 0.5410868906034163,
"grad_norm": 6.171835646132762,
"learning_rate": 5.15521997748758e-07,
"loss": 1.8299,
"step": 8030
},
{
"epoch": 0.541423806475523,
"grad_norm": 5.409317737701678,
"learning_rate": 5.149342216382511e-07,
"loss": 1.7366,
"step": 8035
},
{
"epoch": 0.5417607223476298,
"grad_norm": 5.9172269055877615,
"learning_rate": 5.143464248706381e-07,
"loss": 1.7989,
"step": 8040
},
{
"epoch": 0.5420976382197366,
"grad_norm": 6.3600919594870335,
"learning_rate": 5.137586082589633e-07,
"loss": 1.8075,
"step": 8045
},
{
"epoch": 0.5424345540918433,
"grad_norm": 7.187853592405848,
"learning_rate": 5.131707726162983e-07,
"loss": 1.8324,
"step": 8050
},
{
"epoch": 0.54277146996395,
"grad_norm": 6.1090388109161635,
"learning_rate": 5.125829187557406e-07,
"loss": 1.7967,
"step": 8055
},
{
"epoch": 0.5431083858360567,
"grad_norm": 5.638278745047237,
"learning_rate": 5.119950474904137e-07,
"loss": 1.8695,
"step": 8060
},
{
"epoch": 0.5434453017081635,
"grad_norm": 5.663124594313191,
"learning_rate": 5.114071596334642e-07,
"loss": 1.7512,
"step": 8065
},
{
"epoch": 0.5437822175802702,
"grad_norm": 6.2665370581477795,
"learning_rate": 5.108192559980623e-07,
"loss": 1.844,
"step": 8070
},
{
"epoch": 0.544119133452377,
"grad_norm": 5.931434130921906,
"learning_rate": 5.102313373974e-07,
"loss": 1.876,
"step": 8075
},
{
"epoch": 0.5444560493244837,
"grad_norm": 5.980178228411234,
"learning_rate": 5.096434046446898e-07,
"loss": 1.7887,
"step": 8080
},
{
"epoch": 0.5447929651965904,
"grad_norm": 5.3738710489491295,
"learning_rate": 5.090554585531639e-07,
"loss": 1.7753,
"step": 8085
},
{
"epoch": 0.5451298810686972,
"grad_norm": 6.216952883756544,
"learning_rate": 5.084674999360729e-07,
"loss": 1.7961,
"step": 8090
},
{
"epoch": 0.5454667969408039,
"grad_norm": 6.152275028612343,
"learning_rate": 5.078795296066846e-07,
"loss": 1.8922,
"step": 8095
},
{
"epoch": 0.5458037128129106,
"grad_norm": 5.72793787759331,
"learning_rate": 5.072915483782833e-07,
"loss": 1.7049,
"step": 8100
},
{
"epoch": 0.5461406286850173,
"grad_norm": 5.9020152342288945,
"learning_rate": 5.067035570641678e-07,
"loss": 1.8247,
"step": 8105
},
{
"epoch": 0.546477544557124,
"grad_norm": 6.035798593915844,
"learning_rate": 5.061155564776517e-07,
"loss": 1.807,
"step": 8110
},
{
"epoch": 0.5468144604292309,
"grad_norm": 5.672397446189978,
"learning_rate": 5.055275474320609e-07,
"loss": 1.8352,
"step": 8115
},
{
"epoch": 0.5471513763013376,
"grad_norm": 6.009849527428413,
"learning_rate": 5.049395307407328e-07,
"loss": 1.8354,
"step": 8120
},
{
"epoch": 0.5474882921734443,
"grad_norm": 5.699405940003927,
"learning_rate": 5.04351507217016e-07,
"loss": 1.7494,
"step": 8125
},
{
"epoch": 0.547825208045551,
"grad_norm": 6.144976474040944,
"learning_rate": 5.03763477674268e-07,
"loss": 1.7602,
"step": 8130
},
{
"epoch": 0.5481621239176577,
"grad_norm": 6.340348103559182,
"learning_rate": 5.031754429258549e-07,
"loss": 1.8694,
"step": 8135
},
{
"epoch": 0.5484990397897644,
"grad_norm": 6.0788344731363555,
"learning_rate": 5.025874037851499e-07,
"loss": 1.8078,
"step": 8140
},
{
"epoch": 0.5488359556618713,
"grad_norm": 5.593092481508395,
"learning_rate": 5.019993610655322e-07,
"loss": 1.8018,
"step": 8145
},
{
"epoch": 0.549172871533978,
"grad_norm": 6.096728640322998,
"learning_rate": 5.014113155803863e-07,
"loss": 1.8155,
"step": 8150
},
{
"epoch": 0.5495097874060847,
"grad_norm": 5.870453527758819,
"learning_rate": 5.008232681430999e-07,
"loss": 1.8291,
"step": 8155
},
{
"epoch": 0.5498467032781914,
"grad_norm": 5.891573814310423,
"learning_rate": 5.002352195670643e-07,
"loss": 1.6946,
"step": 8160
},
{
"epoch": 0.5501836191502981,
"grad_norm": 6.217148650257244,
"learning_rate": 4.996471706656715e-07,
"loss": 1.7996,
"step": 8165
},
{
"epoch": 0.5505205350224049,
"grad_norm": 6.003361257252515,
"learning_rate": 4.990591222523142e-07,
"loss": 1.8232,
"step": 8170
},
{
"epoch": 0.5508574508945117,
"grad_norm": 5.517791176689642,
"learning_rate": 4.984710751403849e-07,
"loss": 1.7899,
"step": 8175
},
{
"epoch": 0.5511943667666184,
"grad_norm": 5.751512833055983,
"learning_rate": 4.978830301432738e-07,
"loss": 1.8628,
"step": 8180
},
{
"epoch": 0.5515312826387251,
"grad_norm": 6.65622502830475,
"learning_rate": 4.97294988074368e-07,
"loss": 1.8543,
"step": 8185
},
{
"epoch": 0.5518681985108318,
"grad_norm": 5.9681636803859766,
"learning_rate": 4.96706949747051e-07,
"loss": 1.755,
"step": 8190
},
{
"epoch": 0.5522051143829386,
"grad_norm": 5.96613622311191,
"learning_rate": 4.961189159747015e-07,
"loss": 1.8139,
"step": 8195
},
{
"epoch": 0.5525420302550453,
"grad_norm": 5.939980712328125,
"learning_rate": 4.955308875706905e-07,
"loss": 1.8386,
"step": 8200
},
{
"epoch": 0.552878946127152,
"grad_norm": 6.453800027528329,
"learning_rate": 4.94942865348383e-07,
"loss": 1.8015,
"step": 8205
},
{
"epoch": 0.5532158619992588,
"grad_norm": 5.962843605569349,
"learning_rate": 4.943548501211351e-07,
"loss": 1.8115,
"step": 8210
},
{
"epoch": 0.5535527778713655,
"grad_norm": 5.894927809262481,
"learning_rate": 4.937668427022924e-07,
"loss": 1.7926,
"step": 8215
},
{
"epoch": 0.5538896937434723,
"grad_norm": 5.923480477608677,
"learning_rate": 4.931788439051909e-07,
"loss": 1.7601,
"step": 8220
},
{
"epoch": 0.554226609615579,
"grad_norm": 6.050904501779012,
"learning_rate": 4.925908545431537e-07,
"loss": 1.7591,
"step": 8225
},
{
"epoch": 0.5545635254876857,
"grad_norm": 6.3323162814561815,
"learning_rate": 4.920028754294915e-07,
"loss": 1.7958,
"step": 8230
},
{
"epoch": 0.5549004413597924,
"grad_norm": 6.318956138145999,
"learning_rate": 4.914149073775003e-07,
"loss": 1.8323,
"step": 8235
},
{
"epoch": 0.5552373572318992,
"grad_norm": 5.959228809440483,
"learning_rate": 4.908269512004613e-07,
"loss": 1.7767,
"step": 8240
},
{
"epoch": 0.555574273104006,
"grad_norm": 6.102020489729409,
"learning_rate": 4.902390077116392e-07,
"loss": 1.7939,
"step": 8245
},
{
"epoch": 0.5559111889761127,
"grad_norm": 6.357928982728381,
"learning_rate": 4.896510777242805e-07,
"loss": 1.753,
"step": 8250
},
{
"epoch": 0.5562481048482194,
"grad_norm": 6.131919994254941,
"learning_rate": 4.890631620516141e-07,
"loss": 1.7993,
"step": 8255
},
{
"epoch": 0.5565850207203261,
"grad_norm": 6.255342386981415,
"learning_rate": 4.88475261506848e-07,
"loss": 1.771,
"step": 8260
},
{
"epoch": 0.5569219365924328,
"grad_norm": 6.11740983610477,
"learning_rate": 4.878873769031702e-07,
"loss": 1.8473,
"step": 8265
},
{
"epoch": 0.5572588524645397,
"grad_norm": 5.82647949361438,
"learning_rate": 4.872995090537459e-07,
"loss": 1.7722,
"step": 8270
},
{
"epoch": 0.5575957683366464,
"grad_norm": 6.220989210945246,
"learning_rate": 4.867116587717179e-07,
"loss": 1.8331,
"step": 8275
},
{
"epoch": 0.5579326842087531,
"grad_norm": 5.494470821652724,
"learning_rate": 4.861238268702039e-07,
"loss": 1.798,
"step": 8280
},
{
"epoch": 0.5582696000808598,
"grad_norm": 6.406158728808393,
"learning_rate": 4.855360141622965e-07,
"loss": 1.8354,
"step": 8285
},
{
"epoch": 0.5586065159529665,
"grad_norm": 5.482861394950503,
"learning_rate": 4.849482214610623e-07,
"loss": 1.8003,
"step": 8290
},
{
"epoch": 0.5589434318250733,
"grad_norm": 5.867393207721527,
"learning_rate": 4.843604495795392e-07,
"loss": 1.8327,
"step": 8295
},
{
"epoch": 0.55928034769718,
"grad_norm": 6.322610690410823,
"learning_rate": 4.83772699330737e-07,
"loss": 1.7719,
"step": 8300
},
{
"epoch": 0.5596172635692868,
"grad_norm": 6.042827407314324,
"learning_rate": 4.831849715276355e-07,
"loss": 1.7846,
"step": 8305
},
{
"epoch": 0.5599541794413935,
"grad_norm": 6.449090524228536,
"learning_rate": 4.825972669831834e-07,
"loss": 1.7506,
"step": 8310
},
{
"epoch": 0.5602910953135002,
"grad_norm": 6.010172412411375,
"learning_rate": 4.82009586510297e-07,
"loss": 1.7813,
"step": 8315
},
{
"epoch": 0.5606280111856069,
"grad_norm": 5.862016493948776,
"learning_rate": 4.814219309218594e-07,
"loss": 1.7944,
"step": 8320
},
{
"epoch": 0.5609649270577137,
"grad_norm": 5.560071531221416,
"learning_rate": 4.808343010307199e-07,
"loss": 1.8738,
"step": 8325
},
{
"epoch": 0.5613018429298204,
"grad_norm": 6.017795379555469,
"learning_rate": 4.802466976496911e-07,
"loss": 1.8071,
"step": 8330
},
{
"epoch": 0.5616387588019272,
"grad_norm": 6.220898694052647,
"learning_rate": 4.796591215915498e-07,
"loss": 1.8234,
"step": 8335
},
{
"epoch": 0.5619756746740339,
"grad_norm": 6.061974235031019,
"learning_rate": 4.79071573669035e-07,
"loss": 1.8432,
"step": 8340
},
{
"epoch": 0.5623125905461406,
"grad_norm": 5.588859747562851,
"learning_rate": 4.784840546948463e-07,
"loss": 1.7834,
"step": 8345
},
{
"epoch": 0.5626495064182474,
"grad_norm": 5.981374345991413,
"learning_rate": 4.778965654816435e-07,
"loss": 1.7511,
"step": 8350
},
{
"epoch": 0.5629864222903541,
"grad_norm": 6.024335588504932,
"learning_rate": 4.773091068420455e-07,
"loss": 1.8607,
"step": 8355
},
{
"epoch": 0.5633233381624608,
"grad_norm": 5.760163267537706,
"learning_rate": 4.767216795886281e-07,
"loss": 1.822,
"step": 8360
},
{
"epoch": 0.5636602540345675,
"grad_norm": 6.0879150017865475,
"learning_rate": 4.761342845339246e-07,
"loss": 1.8162,
"step": 8365
},
{
"epoch": 0.5639971699066743,
"grad_norm": 6.335467066206414,
"learning_rate": 4.7554692249042345e-07,
"loss": 1.8653,
"step": 8370
},
{
"epoch": 0.5643340857787811,
"grad_norm": 6.132419754520845,
"learning_rate": 4.7495959427056754e-07,
"loss": 1.8353,
"step": 8375
},
{
"epoch": 0.5646710016508878,
"grad_norm": 5.63466915885368,
"learning_rate": 4.743723006867523e-07,
"loss": 1.7984,
"step": 8380
},
{
"epoch": 0.5650079175229945,
"grad_norm": 5.864284044006815,
"learning_rate": 4.737850425513263e-07,
"loss": 1.8202,
"step": 8385
},
{
"epoch": 0.5653448333951012,
"grad_norm": 5.707122660502422,
"learning_rate": 4.731978206765884e-07,
"loss": 1.8131,
"step": 8390
},
{
"epoch": 0.5656817492672079,
"grad_norm": 6.145138605313598,
"learning_rate": 4.726106358747871e-07,
"loss": 1.8188,
"step": 8395
},
{
"epoch": 0.5660186651393148,
"grad_norm": 5.810736149763557,
"learning_rate": 4.720234889581203e-07,
"loss": 1.854,
"step": 8400
},
{
"epoch": 0.5663555810114215,
"grad_norm": 6.178151042174748,
"learning_rate": 4.714363807387333e-07,
"loss": 1.8249,
"step": 8405
},
{
"epoch": 0.5666924968835282,
"grad_norm": 6.048303318074244,
"learning_rate": 4.708493120287175e-07,
"loss": 1.8319,
"step": 8410
},
{
"epoch": 0.5670294127556349,
"grad_norm": 5.506193309921177,
"learning_rate": 4.7026228364010984e-07,
"loss": 1.7959,
"step": 8415
},
{
"epoch": 0.5673663286277416,
"grad_norm": 6.236662710850324,
"learning_rate": 4.69675296384892e-07,
"loss": 1.8518,
"step": 8420
},
{
"epoch": 0.5677032444998484,
"grad_norm": 5.832229589759707,
"learning_rate": 4.6908835107498775e-07,
"loss": 1.8272,
"step": 8425
},
{
"epoch": 0.5680401603719551,
"grad_norm": 5.7248194948890445,
"learning_rate": 4.685014485222637e-07,
"loss": 1.8037,
"step": 8430
},
{
"epoch": 0.5683770762440619,
"grad_norm": 5.864436881700974,
"learning_rate": 4.679145895385269e-07,
"loss": 1.8137,
"step": 8435
},
{
"epoch": 0.5687139921161686,
"grad_norm": 6.05240631758938,
"learning_rate": 4.673277749355245e-07,
"loss": 1.7116,
"step": 8440
},
{
"epoch": 0.5690509079882753,
"grad_norm": 6.322483796907963,
"learning_rate": 4.667410055249417e-07,
"loss": 1.7331,
"step": 8445
},
{
"epoch": 0.5693878238603821,
"grad_norm": 6.218327051131996,
"learning_rate": 4.6615428211840154e-07,
"loss": 1.7826,
"step": 8450
},
{
"epoch": 0.5697247397324888,
"grad_norm": 5.964493626003666,
"learning_rate": 4.655676055274637e-07,
"loss": 1.8168,
"step": 8455
},
{
"epoch": 0.5700616556045955,
"grad_norm": 6.231161584629056,
"learning_rate": 4.6498097656362247e-07,
"loss": 1.8235,
"step": 8460
},
{
"epoch": 0.5703985714767023,
"grad_norm": 6.198185695145207,
"learning_rate": 4.643943960383067e-07,
"loss": 1.7135,
"step": 8465
},
{
"epoch": 0.570735487348809,
"grad_norm": 6.042628919234112,
"learning_rate": 4.638078647628782e-07,
"loss": 1.9163,
"step": 8470
},
{
"epoch": 0.5710724032209158,
"grad_norm": 6.169541691639398,
"learning_rate": 4.632213835486305e-07,
"loss": 1.8465,
"step": 8475
},
{
"epoch": 0.5714093190930225,
"grad_norm": 6.080905752465426,
"learning_rate": 4.626349532067879e-07,
"loss": 1.8297,
"step": 8480
},
{
"epoch": 0.5717462349651292,
"grad_norm": 5.721896292040939,
"learning_rate": 4.620485745485046e-07,
"loss": 1.7479,
"step": 8485
},
{
"epoch": 0.5720831508372359,
"grad_norm": 5.945565469765499,
"learning_rate": 4.6146224838486287e-07,
"loss": 1.8715,
"step": 8490
},
{
"epoch": 0.5724200667093426,
"grad_norm": 5.659262127072452,
"learning_rate": 4.6087597552687275e-07,
"loss": 1.8375,
"step": 8495
},
{
"epoch": 0.5727569825814494,
"grad_norm": 5.794077318723356,
"learning_rate": 4.602897567854705e-07,
"loss": 1.7981,
"step": 8500
},
{
"epoch": 0.5730938984535562,
"grad_norm": 6.099467621941091,
"learning_rate": 4.5970359297151733e-07,
"loss": 1.7755,
"step": 8505
},
{
"epoch": 0.5734308143256629,
"grad_norm": 5.984648189742544,
"learning_rate": 4.591174848957986e-07,
"loss": 1.8419,
"step": 8510
},
{
"epoch": 0.5737677301977696,
"grad_norm": 6.048661924764337,
"learning_rate": 4.585314333690224e-07,
"loss": 1.816,
"step": 8515
},
{
"epoch": 0.5741046460698763,
"grad_norm": 5.522305365781618,
"learning_rate": 4.579454392018192e-07,
"loss": 1.7642,
"step": 8520
},
{
"epoch": 0.574441561941983,
"grad_norm": 5.53435047065202,
"learning_rate": 4.5735950320473915e-07,
"loss": 1.7835,
"step": 8525
},
{
"epoch": 0.5747784778140899,
"grad_norm": 5.945621879771232,
"learning_rate": 4.5677362618825265e-07,
"loss": 1.7643,
"step": 8530
},
{
"epoch": 0.5751153936861966,
"grad_norm": 6.032046377900596,
"learning_rate": 4.5618780896274866e-07,
"loss": 1.7984,
"step": 8535
},
{
"epoch": 0.5754523095583033,
"grad_norm": 6.444772493536185,
"learning_rate": 4.556020523385326e-07,
"loss": 1.7324,
"step": 8540
},
{
"epoch": 0.57578922543041,
"grad_norm": 5.792224697201741,
"learning_rate": 4.55016357125827e-07,
"loss": 1.7728,
"step": 8545
},
{
"epoch": 0.5761261413025167,
"grad_norm": 5.753574407901725,
"learning_rate": 4.5443072413476877e-07,
"loss": 1.7846,
"step": 8550
},
{
"epoch": 0.5764630571746235,
"grad_norm": 6.350023820699443,
"learning_rate": 4.5384515417540914e-07,
"loss": 1.7999,
"step": 8555
},
{
"epoch": 0.5767999730467303,
"grad_norm": 5.6366643382723876,
"learning_rate": 4.5325964805771187e-07,
"loss": 1.8145,
"step": 8560
},
{
"epoch": 0.577136888918837,
"grad_norm": 5.750861217365839,
"learning_rate": 4.526742065915528e-07,
"loss": 1.7306,
"step": 8565
},
{
"epoch": 0.5774738047909437,
"grad_norm": 5.4864565976564466,
"learning_rate": 4.520888305867181e-07,
"loss": 1.773,
"step": 8570
},
{
"epoch": 0.5778107206630504,
"grad_norm": 6.158976718779292,
"learning_rate": 4.5150352085290315e-07,
"loss": 1.8288,
"step": 8575
},
{
"epoch": 0.5781476365351572,
"grad_norm": 5.909685923044137,
"learning_rate": 4.5091827819971207e-07,
"loss": 1.8347,
"step": 8580
},
{
"epoch": 0.5784845524072639,
"grad_norm": 6.080997476219573,
"learning_rate": 4.503331034366563e-07,
"loss": 1.8089,
"step": 8585
},
{
"epoch": 0.5788214682793706,
"grad_norm": 5.724875421513874,
"learning_rate": 4.4974799737315274e-07,
"loss": 1.7667,
"step": 8590
},
{
"epoch": 0.5791583841514774,
"grad_norm": 5.610253640773116,
"learning_rate": 4.491629608185237e-07,
"loss": 1.7924,
"step": 8595
},
{
"epoch": 0.5794953000235841,
"grad_norm": 5.610898049083143,
"learning_rate": 4.485779945819956e-07,
"loss": 1.7901,
"step": 8600
},
{
"epoch": 0.5798322158956909,
"grad_norm": 5.81753396746651,
"learning_rate": 4.479930994726968e-07,
"loss": 1.7487,
"step": 8605
},
{
"epoch": 0.5801691317677976,
"grad_norm": 6.1155229721373745,
"learning_rate": 4.474082762996581e-07,
"loss": 1.8492,
"step": 8610
},
{
"epoch": 0.5805060476399043,
"grad_norm": 6.165815910325223,
"learning_rate": 4.468235258718105e-07,
"loss": 1.8476,
"step": 8615
},
{
"epoch": 0.580842963512011,
"grad_norm": 5.803183552453552,
"learning_rate": 4.4623884899798397e-07,
"loss": 1.8046,
"step": 8620
},
{
"epoch": 0.5811798793841177,
"grad_norm": 6.357851078191232,
"learning_rate": 4.4565424648690743e-07,
"loss": 1.7539,
"step": 8625
},
{
"epoch": 0.5815167952562246,
"grad_norm": 6.508465623286144,
"learning_rate": 4.450697191472067e-07,
"loss": 1.7605,
"step": 8630
},
{
"epoch": 0.5818537111283313,
"grad_norm": 5.792009077737253,
"learning_rate": 4.4448526778740327e-07,
"loss": 1.7607,
"step": 8635
},
{
"epoch": 0.582190627000438,
"grad_norm": 6.377605622379964,
"learning_rate": 4.439008932159138e-07,
"loss": 1.8341,
"step": 8640
},
{
"epoch": 0.5825275428725447,
"grad_norm": 5.989708484254575,
"learning_rate": 4.4331659624104876e-07,
"loss": 1.8957,
"step": 8645
},
{
"epoch": 0.5828644587446514,
"grad_norm": 6.135183719688721,
"learning_rate": 4.427323776710117e-07,
"loss": 1.8505,
"step": 8650
},
{
"epoch": 0.5832013746167583,
"grad_norm": 6.263770305683316,
"learning_rate": 4.4214823831389663e-07,
"loss": 1.8154,
"step": 8655
},
{
"epoch": 0.583538290488865,
"grad_norm": 5.800608275245644,
"learning_rate": 4.41564178977689e-07,
"loss": 1.7261,
"step": 8660
},
{
"epoch": 0.5838752063609717,
"grad_norm": 5.809205967021153,
"learning_rate": 4.4098020047026343e-07,
"loss": 1.7584,
"step": 8665
},
{
"epoch": 0.5842121222330784,
"grad_norm": 5.963419654218539,
"learning_rate": 4.4039630359938194e-07,
"loss": 1.7529,
"step": 8670
},
{
"epoch": 0.5845490381051851,
"grad_norm": 5.8426129848767845,
"learning_rate": 4.3981248917269477e-07,
"loss": 1.824,
"step": 8675
},
{
"epoch": 0.5848859539772918,
"grad_norm": 5.944106275409081,
"learning_rate": 4.3922875799773735e-07,
"loss": 1.7739,
"step": 8680
},
{
"epoch": 0.5852228698493986,
"grad_norm": 5.773197551120905,
"learning_rate": 4.386451108819302e-07,
"loss": 1.7682,
"step": 8685
},
{
"epoch": 0.5855597857215054,
"grad_norm": 6.552569491362873,
"learning_rate": 4.380615486325774e-07,
"loss": 1.8046,
"step": 8690
},
{
"epoch": 0.5858967015936121,
"grad_norm": 5.456979786224337,
"learning_rate": 4.3747807205686616e-07,
"loss": 1.7698,
"step": 8695
},
{
"epoch": 0.5862336174657188,
"grad_norm": 5.11890654536534,
"learning_rate": 4.3689468196186433e-07,
"loss": 1.7561,
"step": 8700
},
{
"epoch": 0.5865705333378255,
"grad_norm": 5.716069902489952,
"learning_rate": 4.36311379154521e-07,
"loss": 1.8601,
"step": 8705
},
{
"epoch": 0.5869074492099323,
"grad_norm": 6.22330264411751,
"learning_rate": 4.3572816444166406e-07,
"loss": 1.8129,
"step": 8710
},
{
"epoch": 0.587244365082039,
"grad_norm": 6.086909858486789,
"learning_rate": 4.351450386299996e-07,
"loss": 1.721,
"step": 8715
},
{
"epoch": 0.5875812809541457,
"grad_norm": 6.297426986410737,
"learning_rate": 4.3456200252611075e-07,
"loss": 1.8179,
"step": 8720
},
{
"epoch": 0.5879181968262525,
"grad_norm": 5.840418602695064,
"learning_rate": 4.3397905693645653e-07,
"loss": 1.8302,
"step": 8725
},
{
"epoch": 0.5882551126983592,
"grad_norm": 6.3296952466437695,
"learning_rate": 4.3339620266737116e-07,
"loss": 1.7801,
"step": 8730
},
{
"epoch": 0.588592028570466,
"grad_norm": 5.703161646952239,
"learning_rate": 4.328134405250617e-07,
"loss": 1.7261,
"step": 8735
},
{
"epoch": 0.5889289444425727,
"grad_norm": 6.296431483288242,
"learning_rate": 4.322307713156085e-07,
"loss": 1.7399,
"step": 8740
},
{
"epoch": 0.5892658603146794,
"grad_norm": 6.319426786873787,
"learning_rate": 4.316481958449634e-07,
"loss": 1.7458,
"step": 8745
},
{
"epoch": 0.5896027761867861,
"grad_norm": 5.616103643263215,
"learning_rate": 4.310657149189478e-07,
"loss": 1.8584,
"step": 8750
},
{
"epoch": 0.5899396920588929,
"grad_norm": 6.108036015594825,
"learning_rate": 4.3048332934325325e-07,
"loss": 1.8297,
"step": 8755
},
{
"epoch": 0.5902766079309997,
"grad_norm": 6.515815846454065,
"learning_rate": 4.2990103992343893e-07,
"loss": 1.771,
"step": 8760
},
{
"epoch": 0.5906135238031064,
"grad_norm": 5.921183552077239,
"learning_rate": 4.2931884746493107e-07,
"loss": 1.8703,
"step": 8765
},
{
"epoch": 0.5909504396752131,
"grad_norm": 6.096104391048515,
"learning_rate": 4.287367527730216e-07,
"loss": 1.7736,
"step": 8770
},
{
"epoch": 0.5912873555473198,
"grad_norm": 6.05992060448168,
"learning_rate": 4.2815475665286766e-07,
"loss": 1.8258,
"step": 8775
},
{
"epoch": 0.5916242714194265,
"grad_norm": 6.535494644184692,
"learning_rate": 4.2757285990948993e-07,
"loss": 1.8059,
"step": 8780
},
{
"epoch": 0.5919611872915334,
"grad_norm": 6.1860767579422795,
"learning_rate": 4.269910633477711e-07,
"loss": 1.7594,
"step": 8785
},
{
"epoch": 0.5922981031636401,
"grad_norm": 5.972961366882138,
"learning_rate": 4.264093677724561e-07,
"loss": 1.8025,
"step": 8790
},
{
"epoch": 0.5926350190357468,
"grad_norm": 5.556665434802162,
"learning_rate": 4.2582777398814966e-07,
"loss": 1.7414,
"step": 8795
},
{
"epoch": 0.5929719349078535,
"grad_norm": 5.92972532751255,
"learning_rate": 4.252462827993158e-07,
"loss": 1.7877,
"step": 8800
},
{
"epoch": 0.5933088507799602,
"grad_norm": 6.056102795509996,
"learning_rate": 4.246648950102765e-07,
"loss": 1.8636,
"step": 8805
},
{
"epoch": 0.593645766652067,
"grad_norm": 5.980939951020172,
"learning_rate": 4.240836114252112e-07,
"loss": 1.8036,
"step": 8810
},
{
"epoch": 0.5939826825241737,
"grad_norm": 6.003334901011222,
"learning_rate": 4.2350243284815445e-07,
"loss": 1.7733,
"step": 8815
},
{
"epoch": 0.5943195983962805,
"grad_norm": 5.955573412552551,
"learning_rate": 4.229213600829963e-07,
"loss": 1.8063,
"step": 8820
},
{
"epoch": 0.5946565142683872,
"grad_norm": 5.656293844274432,
"learning_rate": 4.223403939334802e-07,
"loss": 1.8226,
"step": 8825
},
{
"epoch": 0.5949934301404939,
"grad_norm": 6.749213331616512,
"learning_rate": 4.217595352032017e-07,
"loss": 1.819,
"step": 8830
},
{
"epoch": 0.5953303460126007,
"grad_norm": 6.117182599544254,
"learning_rate": 4.2117878469560834e-07,
"loss": 1.7613,
"step": 8835
},
{
"epoch": 0.5956672618847074,
"grad_norm": 5.520177923235137,
"learning_rate": 4.205981432139978e-07,
"loss": 1.768,
"step": 8840
},
{
"epoch": 0.5960041777568141,
"grad_norm": 5.711090146795336,
"learning_rate": 4.200176115615169e-07,
"loss": 1.8229,
"step": 8845
},
{
"epoch": 0.5963410936289208,
"grad_norm": 6.036353569783169,
"learning_rate": 4.1943719054116027e-07,
"loss": 1.7739,
"step": 8850
},
{
"epoch": 0.5966780095010276,
"grad_norm": 6.3115691145794965,
"learning_rate": 4.1885688095577e-07,
"loss": 1.7397,
"step": 8855
},
{
"epoch": 0.5970149253731343,
"grad_norm": 5.7168631691824245,
"learning_rate": 4.182766836080339e-07,
"loss": 1.753,
"step": 8860
},
{
"epoch": 0.5973518412452411,
"grad_norm": 6.186256876910364,
"learning_rate": 4.176965993004842e-07,
"loss": 1.8123,
"step": 8865
},
{
"epoch": 0.5976887571173478,
"grad_norm": 6.505307800637823,
"learning_rate": 4.171166288354971e-07,
"loss": 1.8067,
"step": 8870
},
{
"epoch": 0.5980256729894545,
"grad_norm": 5.760914031493992,
"learning_rate": 4.165367730152917e-07,
"loss": 1.8102,
"step": 8875
},
{
"epoch": 0.5983625888615612,
"grad_norm": 6.056216167167739,
"learning_rate": 4.1595703264192737e-07,
"loss": 1.8183,
"step": 8880
},
{
"epoch": 0.598699504733668,
"grad_norm": 5.852866203850851,
"learning_rate": 4.15377408517305e-07,
"loss": 1.7875,
"step": 8885
},
{
"epoch": 0.5990364206057748,
"grad_norm": 6.083621089321414,
"learning_rate": 4.147979014431642e-07,
"loss": 1.7886,
"step": 8890
},
{
"epoch": 0.5993733364778815,
"grad_norm": 6.6265834284264695,
"learning_rate": 4.142185122210823e-07,
"loss": 1.7928,
"step": 8895
},
{
"epoch": 0.5997102523499882,
"grad_norm": 5.789010083005185,
"learning_rate": 4.136392416524742e-07,
"loss": 1.8099,
"step": 8900
},
{
"epoch": 0.6000471682220949,
"grad_norm": 6.131350960422217,
"learning_rate": 4.1306009053859043e-07,
"loss": 1.7875,
"step": 8905
},
{
"epoch": 0.6003840840942016,
"grad_norm": 6.264733608982983,
"learning_rate": 4.124810596805166e-07,
"loss": 1.7994,
"step": 8910
},
{
"epoch": 0.6007209999663085,
"grad_norm": 5.829929322028521,
"learning_rate": 4.119021498791712e-07,
"loss": 1.8682,
"step": 8915
},
{
"epoch": 0.6010579158384152,
"grad_norm": 5.515885939117128,
"learning_rate": 4.113233619353062e-07,
"loss": 1.78,
"step": 8920
},
{
"epoch": 0.6013948317105219,
"grad_norm": 6.529160464329195,
"learning_rate": 4.107446966495044e-07,
"loss": 1.8267,
"step": 8925
},
{
"epoch": 0.6017317475826286,
"grad_norm": 5.767404449030353,
"learning_rate": 4.101661548221792e-07,
"loss": 1.8162,
"step": 8930
},
{
"epoch": 0.6020686634547353,
"grad_norm": 6.2522730607410795,
"learning_rate": 4.0958773725357297e-07,
"loss": 1.7927,
"step": 8935
},
{
"epoch": 0.6024055793268421,
"grad_norm": 5.920238162873758,
"learning_rate": 4.0900944474375674e-07,
"loss": 1.8076,
"step": 8940
},
{
"epoch": 0.6027424951989488,
"grad_norm": 6.070363136220485,
"learning_rate": 4.084312780926279e-07,
"loss": 1.76,
"step": 8945
},
{
"epoch": 0.6030794110710556,
"grad_norm": 6.080971532825581,
"learning_rate": 4.0785323809991006e-07,
"loss": 1.8373,
"step": 8950
},
{
"epoch": 0.6034163269431623,
"grad_norm": 6.008686213516527,
"learning_rate": 4.072753255651521e-07,
"loss": 1.7578,
"step": 8955
},
{
"epoch": 0.603753242815269,
"grad_norm": 6.373636076172385,
"learning_rate": 4.066975412877255e-07,
"loss": 1.7397,
"step": 8960
},
{
"epoch": 0.6040901586873758,
"grad_norm": 7.110282397737251,
"learning_rate": 4.0611988606682544e-07,
"loss": 1.7594,
"step": 8965
},
{
"epoch": 0.6044270745594825,
"grad_norm": 6.569745758459806,
"learning_rate": 4.0554236070146785e-07,
"loss": 1.8107,
"step": 8970
},
{
"epoch": 0.6047639904315892,
"grad_norm": 6.442153100708266,
"learning_rate": 4.0496496599048963e-07,
"loss": 1.7975,
"step": 8975
},
{
"epoch": 0.605100906303696,
"grad_norm": 6.337402589376834,
"learning_rate": 4.0438770273254624e-07,
"loss": 1.7996,
"step": 8980
},
{
"epoch": 0.6054378221758027,
"grad_norm": 6.379223050210244,
"learning_rate": 4.038105717261119e-07,
"loss": 1.757,
"step": 8985
},
{
"epoch": 0.6057747380479095,
"grad_norm": 5.746548813457159,
"learning_rate": 4.03233573769478e-07,
"loss": 1.8244,
"step": 8990
},
{
"epoch": 0.6061116539200162,
"grad_norm": 6.244048991447363,
"learning_rate": 4.026567096607511e-07,
"loss": 1.7605,
"step": 8995
},
{
"epoch": 0.6064485697921229,
"grad_norm": 5.893004419934275,
"learning_rate": 4.020799801978535e-07,
"loss": 1.8646,
"step": 9000
},
{
"epoch": 0.6067854856642296,
"grad_norm": 6.3581750988564565,
"learning_rate": 4.015033861785208e-07,
"loss": 1.8016,
"step": 9005
},
{
"epoch": 0.6071224015363363,
"grad_norm": 6.024642368549191,
"learning_rate": 4.0092692840030126e-07,
"loss": 1.8321,
"step": 9010
},
{
"epoch": 0.6074593174084432,
"grad_norm": 5.692437973796029,
"learning_rate": 4.003506076605547e-07,
"loss": 1.7497,
"step": 9015
},
{
"epoch": 0.6077962332805499,
"grad_norm": 5.904037102274744,
"learning_rate": 3.997744247564519e-07,
"loss": 1.7012,
"step": 9020
},
{
"epoch": 0.6081331491526566,
"grad_norm": 5.770842989006306,
"learning_rate": 3.9919838048497197e-07,
"loss": 1.8748,
"step": 9025
},
{
"epoch": 0.6084700650247633,
"grad_norm": 5.717267913626994,
"learning_rate": 3.98622475642903e-07,
"loss": 1.7992,
"step": 9030
},
{
"epoch": 0.60880698089687,
"grad_norm": 6.54894021053903,
"learning_rate": 3.980467110268405e-07,
"loss": 1.7435,
"step": 9035
},
{
"epoch": 0.6091438967689767,
"grad_norm": 6.259824241149931,
"learning_rate": 3.9747108743318493e-07,
"loss": 1.8146,
"step": 9040
},
{
"epoch": 0.6094808126410836,
"grad_norm": 5.981839546695795,
"learning_rate": 3.968956056581428e-07,
"loss": 1.7738,
"step": 9045
},
{
"epoch": 0.6098177285131903,
"grad_norm": 6.055280113756847,
"learning_rate": 3.9632026649772366e-07,
"loss": 1.7324,
"step": 9050
},
{
"epoch": 0.610154644385297,
"grad_norm": 6.289372477937633,
"learning_rate": 3.9574507074774056e-07,
"loss": 1.7778,
"step": 9055
},
{
"epoch": 0.6104915602574037,
"grad_norm": 5.875270432715915,
"learning_rate": 3.951700192038072e-07,
"loss": 1.7313,
"step": 9060
},
{
"epoch": 0.6108284761295104,
"grad_norm": 5.598591559503298,
"learning_rate": 3.945951126613387e-07,
"loss": 1.8703,
"step": 9065
},
{
"epoch": 0.6111653920016172,
"grad_norm": 6.69707404500823,
"learning_rate": 3.9402035191554937e-07,
"loss": 1.7974,
"step": 9070
},
{
"epoch": 0.611502307873724,
"grad_norm": 6.008857949179827,
"learning_rate": 3.934457377614514e-07,
"loss": 1.7459,
"step": 9075
},
{
"epoch": 0.6118392237458307,
"grad_norm": 5.617095082901805,
"learning_rate": 3.9287127099385483e-07,
"loss": 1.8318,
"step": 9080
},
{
"epoch": 0.6121761396179374,
"grad_norm": 6.535128437503064,
"learning_rate": 3.9229695240736567e-07,
"loss": 1.7986,
"step": 9085
},
{
"epoch": 0.6125130554900441,
"grad_norm": 6.152619512707627,
"learning_rate": 3.917227827963846e-07,
"loss": 1.7792,
"step": 9090
},
{
"epoch": 0.6128499713621509,
"grad_norm": 6.3313341167472155,
"learning_rate": 3.9114876295510653e-07,
"loss": 1.7732,
"step": 9095
},
{
"epoch": 0.6131868872342576,
"grad_norm": 5.895957841166245,
"learning_rate": 3.9057489367751947e-07,
"loss": 1.7714,
"step": 9100
},
{
"epoch": 0.6135238031063643,
"grad_norm": 5.798092152940961,
"learning_rate": 3.900011757574024e-07,
"loss": 1.862,
"step": 9105
},
{
"epoch": 0.613860718978471,
"grad_norm": 6.545396318492506,
"learning_rate": 3.894276099883258e-07,
"loss": 1.7627,
"step": 9110
},
{
"epoch": 0.6141976348505778,
"grad_norm": 6.161928532867873,
"learning_rate": 3.888541971636492e-07,
"loss": 1.7453,
"step": 9115
},
{
"epoch": 0.6145345507226846,
"grad_norm": 6.748188653900746,
"learning_rate": 3.8828093807652095e-07,
"loss": 1.8917,
"step": 9120
},
{
"epoch": 0.6148714665947913,
"grad_norm": 6.159848242287639,
"learning_rate": 3.8770783351987605e-07,
"loss": 1.7577,
"step": 9125
},
{
"epoch": 0.615208382466898,
"grad_norm": 6.534801005504792,
"learning_rate": 3.8713488428643656e-07,
"loss": 1.7931,
"step": 9130
},
{
"epoch": 0.6155452983390047,
"grad_norm": 5.739673674024533,
"learning_rate": 3.8656209116870906e-07,
"loss": 1.7721,
"step": 9135
},
{
"epoch": 0.6158822142111114,
"grad_norm": 5.7311702430866145,
"learning_rate": 3.859894549589847e-07,
"loss": 1.8119,
"step": 9140
},
{
"epoch": 0.6162191300832183,
"grad_norm": 6.415780867852155,
"learning_rate": 3.854169764493371e-07,
"loss": 1.8124,
"step": 9145
},
{
"epoch": 0.616556045955325,
"grad_norm": 5.955671102064489,
"learning_rate": 3.848446564316223e-07,
"loss": 1.7789,
"step": 9150
},
{
"epoch": 0.6168929618274317,
"grad_norm": 6.461847161048739,
"learning_rate": 3.8427249569747656e-07,
"loss": 1.791,
"step": 9155
},
{
"epoch": 0.6172298776995384,
"grad_norm": 6.0663090140076745,
"learning_rate": 3.8370049503831614e-07,
"loss": 1.7913,
"step": 9160
},
{
"epoch": 0.6175667935716451,
"grad_norm": 6.03812934458201,
"learning_rate": 3.8312865524533606e-07,
"loss": 1.773,
"step": 9165
},
{
"epoch": 0.617903709443752,
"grad_norm": 5.945622065488643,
"learning_rate": 3.825569771095082e-07,
"loss": 1.8358,
"step": 9170
},
{
"epoch": 0.6182406253158587,
"grad_norm": 6.041236208673518,
"learning_rate": 3.819854614215814e-07,
"loss": 1.8262,
"step": 9175
},
{
"epoch": 0.6185775411879654,
"grad_norm": 6.344915324522393,
"learning_rate": 3.814141089720796e-07,
"loss": 1.832,
"step": 9180
},
{
"epoch": 0.6189144570600721,
"grad_norm": 6.343605951445983,
"learning_rate": 3.8084292055130126e-07,
"loss": 1.8455,
"step": 9185
},
{
"epoch": 0.6192513729321788,
"grad_norm": 6.0890477862561,
"learning_rate": 3.8027189694931715e-07,
"loss": 1.7903,
"step": 9190
},
{
"epoch": 0.6195882888042856,
"grad_norm": 5.747051553245304,
"learning_rate": 3.797010389559708e-07,
"loss": 1.8008,
"step": 9195
},
{
"epoch": 0.6199252046763923,
"grad_norm": 6.455198765907777,
"learning_rate": 3.7913034736087677e-07,
"loss": 1.8564,
"step": 9200
},
{
"epoch": 0.620262120548499,
"grad_norm": 6.107645595637579,
"learning_rate": 3.785598229534186e-07,
"loss": 1.8038,
"step": 9205
},
{
"epoch": 0.6205990364206058,
"grad_norm": 5.6893786974616205,
"learning_rate": 3.7798946652274943e-07,
"loss": 1.7916,
"step": 9210
},
{
"epoch": 0.6209359522927125,
"grad_norm": 5.386804515229501,
"learning_rate": 3.7741927885778966e-07,
"loss": 1.7064,
"step": 9215
},
{
"epoch": 0.6212728681648192,
"grad_norm": 5.5903465122048805,
"learning_rate": 3.768492607472263e-07,
"loss": 1.8083,
"step": 9220
},
{
"epoch": 0.621609784036926,
"grad_norm": 5.597070370599875,
"learning_rate": 3.7627941297951183e-07,
"loss": 1.8165,
"step": 9225
},
{
"epoch": 0.6219466999090327,
"grad_norm": 5.70072119730533,
"learning_rate": 3.7570973634286334e-07,
"loss": 1.7952,
"step": 9230
},
{
"epoch": 0.6222836157811394,
"grad_norm": 5.808800928790782,
"learning_rate": 3.7514023162526066e-07,
"loss": 1.7582,
"step": 9235
},
{
"epoch": 0.6226205316532462,
"grad_norm": 5.88125644092123,
"learning_rate": 3.745708996144463e-07,
"loss": 1.832,
"step": 9240
},
{
"epoch": 0.6229574475253529,
"grad_norm": 6.384851274473344,
"learning_rate": 3.740017410979239e-07,
"loss": 1.7643,
"step": 9245
},
{
"epoch": 0.6232943633974597,
"grad_norm": 6.047883416599075,
"learning_rate": 3.734327568629569e-07,
"loss": 1.8043,
"step": 9250
},
{
"epoch": 0.6236312792695664,
"grad_norm": 5.338272475290313,
"learning_rate": 3.728639476965678e-07,
"loss": 1.7449,
"step": 9255
},
{
"epoch": 0.6239681951416731,
"grad_norm": 6.428658869577709,
"learning_rate": 3.7229531438553664e-07,
"loss": 1.7997,
"step": 9260
},
{
"epoch": 0.6243051110137798,
"grad_norm": 6.026885048415576,
"learning_rate": 3.7172685771640076e-07,
"loss": 1.8192,
"step": 9265
},
{
"epoch": 0.6246420268858865,
"grad_norm": 6.122237637003847,
"learning_rate": 3.7115857847545264e-07,
"loss": 1.7574,
"step": 9270
},
{
"epoch": 0.6249789427579934,
"grad_norm": 5.558903228452016,
"learning_rate": 3.7059047744873955e-07,
"loss": 1.8024,
"step": 9275
},
{
"epoch": 0.6253158586301001,
"grad_norm": 5.430476107786122,
"learning_rate": 3.700225554220626e-07,
"loss": 1.8056,
"step": 9280
},
{
"epoch": 0.6256527745022068,
"grad_norm": 5.82118125541433,
"learning_rate": 3.694548131809747e-07,
"loss": 1.7898,
"step": 9285
},
{
"epoch": 0.6259896903743135,
"grad_norm": 5.714807325686535,
"learning_rate": 3.6888725151078024e-07,
"loss": 1.8007,
"step": 9290
},
{
"epoch": 0.6263266062464202,
"grad_norm": 5.669878903492906,
"learning_rate": 3.683198711965345e-07,
"loss": 1.746,
"step": 9295
},
{
"epoch": 0.626663522118527,
"grad_norm": 5.814699385958482,
"learning_rate": 3.677526730230408e-07,
"loss": 1.7676,
"step": 9300
},
{
"epoch": 0.6270004379906338,
"grad_norm": 6.813971359453809,
"learning_rate": 3.671856577748512e-07,
"loss": 1.7549,
"step": 9305
},
{
"epoch": 0.6273373538627405,
"grad_norm": 6.810563784485118,
"learning_rate": 3.666188262362648e-07,
"loss": 1.8067,
"step": 9310
},
{
"epoch": 0.6276742697348472,
"grad_norm": 6.310703666355027,
"learning_rate": 3.660521791913265e-07,
"loss": 1.8092,
"step": 9315
},
{
"epoch": 0.6280111856069539,
"grad_norm": 5.895296847285542,
"learning_rate": 3.654857174238256e-07,
"loss": 1.7903,
"step": 9320
},
{
"epoch": 0.6283481014790607,
"grad_norm": 5.588323528807347,
"learning_rate": 3.649194417172957e-07,
"loss": 1.8755,
"step": 9325
},
{
"epoch": 0.6286850173511674,
"grad_norm": 5.7839080812815435,
"learning_rate": 3.6435335285501283e-07,
"loss": 1.7056,
"step": 9330
},
{
"epoch": 0.6290219332232742,
"grad_norm": 5.5844659216765065,
"learning_rate": 3.6378745161999426e-07,
"loss": 1.8071,
"step": 9335
},
{
"epoch": 0.6293588490953809,
"grad_norm": 5.824070138199869,
"learning_rate": 3.632217387949983e-07,
"loss": 1.8042,
"step": 9340
},
{
"epoch": 0.6296957649674876,
"grad_norm": 5.711934278426134,
"learning_rate": 3.626562151625223e-07,
"loss": 1.7275,
"step": 9345
},
{
"epoch": 0.6300326808395944,
"grad_norm": 5.778338968005331,
"learning_rate": 3.6209088150480173e-07,
"loss": 1.7343,
"step": 9350
},
{
"epoch": 0.6303695967117011,
"grad_norm": 6.311857290704742,
"learning_rate": 3.6152573860380964e-07,
"loss": 1.7612,
"step": 9355
},
{
"epoch": 0.6307065125838078,
"grad_norm": 5.999783421284286,
"learning_rate": 3.6096078724125544e-07,
"loss": 1.8523,
"step": 9360
},
{
"epoch": 0.6310434284559145,
"grad_norm": 6.3198427864797315,
"learning_rate": 3.603960281985828e-07,
"loss": 1.8345,
"step": 9365
},
{
"epoch": 0.6313803443280213,
"grad_norm": 6.104616999431709,
"learning_rate": 3.5983146225697007e-07,
"loss": 1.7699,
"step": 9370
},
{
"epoch": 0.6317172602001281,
"grad_norm": 6.1785780292512085,
"learning_rate": 3.5926709019732855e-07,
"loss": 1.8611,
"step": 9375
},
{
"epoch": 0.6320541760722348,
"grad_norm": 5.938144134115797,
"learning_rate": 3.587029128003006e-07,
"loss": 1.7861,
"step": 9380
},
{
"epoch": 0.6323910919443415,
"grad_norm": 6.224929650940109,
"learning_rate": 3.581389308462601e-07,
"loss": 1.7648,
"step": 9385
},
{
"epoch": 0.6327280078164482,
"grad_norm": 6.061589086959369,
"learning_rate": 3.5757514511531016e-07,
"loss": 1.819,
"step": 9390
},
{
"epoch": 0.6330649236885549,
"grad_norm": 5.951184996035118,
"learning_rate": 3.5701155638728297e-07,
"loss": 1.8019,
"step": 9395
},
{
"epoch": 0.6334018395606617,
"grad_norm": 5.682637936259567,
"learning_rate": 3.564481654417374e-07,
"loss": 1.7767,
"step": 9400
},
{
"epoch": 0.6337387554327685,
"grad_norm": 5.676942423843257,
"learning_rate": 3.558849730579594e-07,
"loss": 1.8106,
"step": 9405
},
{
"epoch": 0.6340756713048752,
"grad_norm": 6.070636918840517,
"learning_rate": 3.553219800149603e-07,
"loss": 1.7765,
"step": 9410
},
{
"epoch": 0.6344125871769819,
"grad_norm": 5.891991459452166,
"learning_rate": 3.547591870914752e-07,
"loss": 1.7422,
"step": 9415
},
{
"epoch": 0.6347495030490886,
"grad_norm": 6.0285641037028315,
"learning_rate": 3.5419659506596287e-07,
"loss": 1.7854,
"step": 9420
},
{
"epoch": 0.6350864189211953,
"grad_norm": 5.939362084214073,
"learning_rate": 3.536342047166039e-07,
"loss": 1.7696,
"step": 9425
},
{
"epoch": 0.6354233347933022,
"grad_norm": 5.869634005185154,
"learning_rate": 3.530720168213001e-07,
"loss": 1.7639,
"step": 9430
},
{
"epoch": 0.6357602506654089,
"grad_norm": 5.6278059236397855,
"learning_rate": 3.5251003215767305e-07,
"loss": 1.7762,
"step": 9435
},
{
"epoch": 0.6360971665375156,
"grad_norm": 6.050098422180009,
"learning_rate": 3.519482515030636e-07,
"loss": 1.7914,
"step": 9440
},
{
"epoch": 0.6364340824096223,
"grad_norm": 5.726733009746023,
"learning_rate": 3.5138667563452983e-07,
"loss": 1.8208,
"step": 9445
},
{
"epoch": 0.636770998281729,
"grad_norm": 7.0847167131878335,
"learning_rate": 3.5082530532884703e-07,
"loss": 1.7328,
"step": 9450
},
{
"epoch": 0.6371079141538358,
"grad_norm": 5.768645926917814,
"learning_rate": 3.5026414136250607e-07,
"loss": 1.7745,
"step": 9455
},
{
"epoch": 0.6374448300259425,
"grad_norm": 5.255887609055351,
"learning_rate": 3.497031845117124e-07,
"loss": 1.8214,
"step": 9460
},
{
"epoch": 0.6377817458980493,
"grad_norm": 6.164645047794775,
"learning_rate": 3.4914243555238476e-07,
"loss": 1.8279,
"step": 9465
},
{
"epoch": 0.638118661770156,
"grad_norm": 5.8015039903705645,
"learning_rate": 3.4858189526015453e-07,
"loss": 1.7587,
"step": 9470
},
{
"epoch": 0.6384555776422627,
"grad_norm": 6.166820116860601,
"learning_rate": 3.4802156441036467e-07,
"loss": 1.7864,
"step": 9475
},
{
"epoch": 0.6387924935143695,
"grad_norm": 5.652602800993565,
"learning_rate": 3.4746144377806785e-07,
"loss": 1.787,
"step": 9480
},
{
"epoch": 0.6391294093864762,
"grad_norm": 6.215195219229495,
"learning_rate": 3.4690153413802653e-07,
"loss": 1.8208,
"step": 9485
},
{
"epoch": 0.6394663252585829,
"grad_norm": 6.017628489290521,
"learning_rate": 3.4634183626471125e-07,
"loss": 1.8729,
"step": 9490
},
{
"epoch": 0.6398032411306896,
"grad_norm": 6.027983940121836,
"learning_rate": 3.457823509322992e-07,
"loss": 1.842,
"step": 9495
},
{
"epoch": 0.6401401570027964,
"grad_norm": 6.276935654397374,
"learning_rate": 3.452230789146741e-07,
"loss": 1.8351,
"step": 9500
},
{
"epoch": 0.6404770728749032,
"grad_norm": 6.351214097699412,
"learning_rate": 3.4466402098542435e-07,
"loss": 1.7684,
"step": 9505
},
{
"epoch": 0.6408139887470099,
"grad_norm": 6.052932413421503,
"learning_rate": 3.441051779178422e-07,
"loss": 1.7954,
"step": 9510
},
{
"epoch": 0.6411509046191166,
"grad_norm": 5.871050354755707,
"learning_rate": 3.4354655048492277e-07,
"loss": 1.7081,
"step": 9515
},
{
"epoch": 0.6414878204912233,
"grad_norm": 5.43280461728629,
"learning_rate": 3.429881394593629e-07,
"loss": 1.7498,
"step": 9520
},
{
"epoch": 0.64182473636333,
"grad_norm": 6.277219662282607,
"learning_rate": 3.4242994561356043e-07,
"loss": 1.7507,
"step": 9525
},
{
"epoch": 0.6421616522354369,
"grad_norm": 5.964958889609506,
"learning_rate": 3.4187196971961185e-07,
"loss": 1.8509,
"step": 9530
},
{
"epoch": 0.6424985681075436,
"grad_norm": 6.233012701053734,
"learning_rate": 3.4131421254931326e-07,
"loss": 1.816,
"step": 9535
},
{
"epoch": 0.6428354839796503,
"grad_norm": 5.718317294931223,
"learning_rate": 3.4075667487415785e-07,
"loss": 1.8602,
"step": 9540
},
{
"epoch": 0.643172399851757,
"grad_norm": 6.076698722005338,
"learning_rate": 3.4019935746533474e-07,
"loss": 1.8048,
"step": 9545
},
{
"epoch": 0.6435093157238637,
"grad_norm": 6.344118541757631,
"learning_rate": 3.3964226109372884e-07,
"loss": 1.8794,
"step": 9550
},
{
"epoch": 0.6438462315959704,
"grad_norm": 5.824838401908134,
"learning_rate": 3.390853865299195e-07,
"loss": 1.7912,
"step": 9555
},
{
"epoch": 0.6441831474680773,
"grad_norm": 6.209879017865747,
"learning_rate": 3.385287345441786e-07,
"loss": 1.8699,
"step": 9560
},
{
"epoch": 0.644520063340184,
"grad_norm": 6.082107554907744,
"learning_rate": 3.3797230590647073e-07,
"loss": 1.782,
"step": 9565
},
{
"epoch": 0.6448569792122907,
"grad_norm": 6.251059910397102,
"learning_rate": 3.374161013864515e-07,
"loss": 1.7841,
"step": 9570
},
{
"epoch": 0.6451938950843974,
"grad_norm": 6.216762679041962,
"learning_rate": 3.368601217534661e-07,
"loss": 1.8344,
"step": 9575
},
{
"epoch": 0.6455308109565041,
"grad_norm": 6.1689755886139235,
"learning_rate": 3.3630436777654903e-07,
"loss": 1.8488,
"step": 9580
},
{
"epoch": 0.6458677268286109,
"grad_norm": 6.150257163492163,
"learning_rate": 3.357488402244227e-07,
"loss": 1.8074,
"step": 9585
},
{
"epoch": 0.6462046427007176,
"grad_norm": 6.405818650009187,
"learning_rate": 3.3519353986549604e-07,
"loss": 1.7121,
"step": 9590
},
{
"epoch": 0.6465415585728244,
"grad_norm": 6.4015349458926645,
"learning_rate": 3.346384674678639e-07,
"loss": 1.8637,
"step": 9595
},
{
"epoch": 0.6468784744449311,
"grad_norm": 6.022453268076306,
"learning_rate": 3.3408362379930576e-07,
"loss": 1.7349,
"step": 9600
},
{
"epoch": 0.6472153903170378,
"grad_norm": 5.707068311915508,
"learning_rate": 3.335290096272849e-07,
"loss": 1.7547,
"step": 9605
},
{
"epoch": 0.6475523061891446,
"grad_norm": 6.157901580868346,
"learning_rate": 3.3297462571894673e-07,
"loss": 1.7596,
"step": 9610
},
{
"epoch": 0.6478892220612513,
"grad_norm": 5.651431847485237,
"learning_rate": 3.3242047284111857e-07,
"loss": 1.7731,
"step": 9615
},
{
"epoch": 0.648226137933358,
"grad_norm": 5.84959347365055,
"learning_rate": 3.3186655176030826e-07,
"loss": 1.8499,
"step": 9620
},
{
"epoch": 0.6485630538054648,
"grad_norm": 5.442395829120497,
"learning_rate": 3.3131286324270234e-07,
"loss": 1.6875,
"step": 9625
},
{
"epoch": 0.6488999696775715,
"grad_norm": 5.576850818923356,
"learning_rate": 3.3075940805416654e-07,
"loss": 1.7739,
"step": 9630
},
{
"epoch": 0.6492368855496783,
"grad_norm": 5.776880524135601,
"learning_rate": 3.3020618696024316e-07,
"loss": 1.8221,
"step": 9635
},
{
"epoch": 0.649573801421785,
"grad_norm": 6.1563117132326,
"learning_rate": 3.2965320072615113e-07,
"loss": 1.83,
"step": 9640
},
{
"epoch": 0.6499107172938917,
"grad_norm": 6.116669362115531,
"learning_rate": 3.2910045011678424e-07,
"loss": 1.7656,
"step": 9645
},
{
"epoch": 0.6502476331659984,
"grad_norm": 5.766920463158223,
"learning_rate": 3.2854793589671046e-07,
"loss": 1.8503,
"step": 9650
},
{
"epoch": 0.6505845490381051,
"grad_norm": 5.7263462025342875,
"learning_rate": 3.279956588301712e-07,
"loss": 1.7948,
"step": 9655
},
{
"epoch": 0.650921464910212,
"grad_norm": 5.62578071524695,
"learning_rate": 3.274436196810789e-07,
"loss": 1.7502,
"step": 9660
},
{
"epoch": 0.6512583807823187,
"grad_norm": 6.186976960735761,
"learning_rate": 3.268918192130178e-07,
"loss": 1.7973,
"step": 9665
},
{
"epoch": 0.6515952966544254,
"grad_norm": 6.454846474675306,
"learning_rate": 3.263402581892415e-07,
"loss": 1.7346,
"step": 9670
},
{
"epoch": 0.6519322125265321,
"grad_norm": 6.0382204126825885,
"learning_rate": 3.257889373726726e-07,
"loss": 1.843,
"step": 9675
},
{
"epoch": 0.6522691283986388,
"grad_norm": 5.732458930505232,
"learning_rate": 3.252378575259013e-07,
"loss": 1.7876,
"step": 9680
},
{
"epoch": 0.6526060442707456,
"grad_norm": 5.8367309166096915,
"learning_rate": 3.246870194111849e-07,
"loss": 1.7943,
"step": 9685
},
{
"epoch": 0.6529429601428524,
"grad_norm": 6.303598679052027,
"learning_rate": 3.2413642379044557e-07,
"loss": 1.7715,
"step": 9690
},
{
"epoch": 0.6532798760149591,
"grad_norm": 6.023790656021271,
"learning_rate": 3.235860714252708e-07,
"loss": 1.8188,
"step": 9695
},
{
"epoch": 0.6536167918870658,
"grad_norm": 5.831181231694035,
"learning_rate": 3.2303596307691137e-07,
"loss": 1.7846,
"step": 9700
},
{
"epoch": 0.6539537077591725,
"grad_norm": 5.747323594425591,
"learning_rate": 3.2248609950628023e-07,
"loss": 1.8437,
"step": 9705
},
{
"epoch": 0.6542906236312793,
"grad_norm": 6.344734553809549,
"learning_rate": 3.219364814739522e-07,
"loss": 1.7766,
"step": 9710
},
{
"epoch": 0.654627539503386,
"grad_norm": 6.531726147190786,
"learning_rate": 3.2138710974016226e-07,
"loss": 1.7615,
"step": 9715
},
{
"epoch": 0.6549644553754927,
"grad_norm": 5.850481024867147,
"learning_rate": 3.208379850648046e-07,
"loss": 1.8543,
"step": 9720
},
{
"epoch": 0.6553013712475995,
"grad_norm": 5.845927633738255,
"learning_rate": 3.202891082074318e-07,
"loss": 1.8064,
"step": 9725
},
{
"epoch": 0.6556382871197062,
"grad_norm": 5.677222851765273,
"learning_rate": 3.197404799272537e-07,
"loss": 1.7991,
"step": 9730
},
{
"epoch": 0.6559752029918129,
"grad_norm": 6.57853406738191,
"learning_rate": 3.191921009831365e-07,
"loss": 1.7932,
"step": 9735
},
{
"epoch": 0.6563121188639197,
"grad_norm": 6.208108133755243,
"learning_rate": 3.1864397213360093e-07,
"loss": 1.8054,
"step": 9740
},
{
"epoch": 0.6566490347360264,
"grad_norm": 6.360434354173842,
"learning_rate": 3.180960941368223e-07,
"loss": 1.8019,
"step": 9745
},
{
"epoch": 0.6569859506081331,
"grad_norm": 5.962283203943862,
"learning_rate": 3.175484677506288e-07,
"loss": 1.7591,
"step": 9750
},
{
"epoch": 0.6573228664802399,
"grad_norm": 6.134085347065576,
"learning_rate": 3.1700109373250063e-07,
"loss": 1.7648,
"step": 9755
},
{
"epoch": 0.6576597823523466,
"grad_norm": 6.481907425268963,
"learning_rate": 3.1645397283956843e-07,
"loss": 1.7872,
"step": 9760
},
{
"epoch": 0.6579966982244534,
"grad_norm": 5.6435553605771664,
"learning_rate": 3.159071058286138e-07,
"loss": 1.7808,
"step": 9765
},
{
"epoch": 0.6583336140965601,
"grad_norm": 5.912519778324625,
"learning_rate": 3.1536049345606586e-07,
"loss": 1.8875,
"step": 9770
},
{
"epoch": 0.6586705299686668,
"grad_norm": 5.678335891008834,
"learning_rate": 3.1481413647800247e-07,
"loss": 1.663,
"step": 9775
},
{
"epoch": 0.6590074458407735,
"grad_norm": 5.9482407468813,
"learning_rate": 3.14268035650148e-07,
"loss": 1.8166,
"step": 9780
},
{
"epoch": 0.6593443617128802,
"grad_norm": 5.722651494191131,
"learning_rate": 3.137221917278723e-07,
"loss": 1.8757,
"step": 9785
},
{
"epoch": 0.6596812775849871,
"grad_norm": 6.054054603796881,
"learning_rate": 3.1317660546618986e-07,
"loss": 1.7174,
"step": 9790
},
{
"epoch": 0.6600181934570938,
"grad_norm": 6.112296477029174,
"learning_rate": 3.1263127761975917e-07,
"loss": 1.804,
"step": 9795
},
{
"epoch": 0.6603551093292005,
"grad_norm": 6.24833577705031,
"learning_rate": 3.12086208942881e-07,
"loss": 1.83,
"step": 9800
},
{
"epoch": 0.6606920252013072,
"grad_norm": 5.796380373627383,
"learning_rate": 3.1154140018949736e-07,
"loss": 1.7987,
"step": 9805
},
{
"epoch": 0.6610289410734139,
"grad_norm": 6.091806447131728,
"learning_rate": 3.1099685211319116e-07,
"loss": 1.7743,
"step": 9810
},
{
"epoch": 0.6613658569455207,
"grad_norm": 6.099517758922633,
"learning_rate": 3.104525654671849e-07,
"loss": 1.7681,
"step": 9815
},
{
"epoch": 0.6617027728176275,
"grad_norm": 6.068348509741613,
"learning_rate": 3.099085410043386e-07,
"loss": 1.7287,
"step": 9820
},
{
"epoch": 0.6620396886897342,
"grad_norm": 5.435497802729357,
"learning_rate": 3.0936477947715064e-07,
"loss": 1.7398,
"step": 9825
},
{
"epoch": 0.6623766045618409,
"grad_norm": 5.978768977959412,
"learning_rate": 3.088212816377552e-07,
"loss": 1.7762,
"step": 9830
},
{
"epoch": 0.6627135204339476,
"grad_norm": 5.529402155256499,
"learning_rate": 3.0827804823792157e-07,
"loss": 1.7906,
"step": 9835
},
{
"epoch": 0.6630504363060544,
"grad_norm": 5.745634282596181,
"learning_rate": 3.077350800290537e-07,
"loss": 1.8153,
"step": 9840
},
{
"epoch": 0.6633873521781611,
"grad_norm": 5.678976914931346,
"learning_rate": 3.071923777621885e-07,
"loss": 1.7453,
"step": 9845
},
{
"epoch": 0.6637242680502679,
"grad_norm": 6.0133152567955275,
"learning_rate": 3.066499421879948e-07,
"loss": 1.887,
"step": 9850
},
{
"epoch": 0.6640611839223746,
"grad_norm": 6.145888236549586,
"learning_rate": 3.0610777405677286e-07,
"loss": 1.7914,
"step": 9855
},
{
"epoch": 0.6643980997944813,
"grad_norm": 5.481352078753557,
"learning_rate": 3.05565874118453e-07,
"loss": 1.7645,
"step": 9860
},
{
"epoch": 0.6647350156665881,
"grad_norm": 6.155766828058924,
"learning_rate": 3.050242431225948e-07,
"loss": 1.7972,
"step": 9865
},
{
"epoch": 0.6650719315386948,
"grad_norm": 6.096995270749912,
"learning_rate": 3.0448288181838487e-07,
"loss": 1.7869,
"step": 9870
},
{
"epoch": 0.6654088474108015,
"grad_norm": 5.923689399349553,
"learning_rate": 3.0394179095463804e-07,
"loss": 1.8212,
"step": 9875
},
{
"epoch": 0.6657457632829082,
"grad_norm": 6.183785970604984,
"learning_rate": 3.0340097127979426e-07,
"loss": 1.8298,
"step": 9880
},
{
"epoch": 0.666082679155015,
"grad_norm": 6.130929170868858,
"learning_rate": 3.0286042354191844e-07,
"loss": 1.8104,
"step": 9885
},
{
"epoch": 0.6664195950271218,
"grad_norm": 5.884804572018233,
"learning_rate": 3.0232014848869955e-07,
"loss": 1.8011,
"step": 9890
},
{
"epoch": 0.6667565108992285,
"grad_norm": 6.118852994598744,
"learning_rate": 3.0178014686744966e-07,
"loss": 1.761,
"step": 9895
},
{
"epoch": 0.6670934267713352,
"grad_norm": 5.391353549438332,
"learning_rate": 3.0124041942510175e-07,
"loss": 1.8032,
"step": 9900
},
{
"epoch": 0.6674303426434419,
"grad_norm": 6.996706242225225,
"learning_rate": 3.007009669082103e-07,
"loss": 1.7749,
"step": 9905
},
{
"epoch": 0.6677672585155486,
"grad_norm": 5.769021683253544,
"learning_rate": 3.001617900629496e-07,
"loss": 1.8067,
"step": 9910
},
{
"epoch": 0.6681041743876553,
"grad_norm": 6.106192789372139,
"learning_rate": 2.996228896351119e-07,
"loss": 1.8022,
"step": 9915
},
{
"epoch": 0.6684410902597622,
"grad_norm": 5.8423030937909335,
"learning_rate": 2.9908426637010773e-07,
"loss": 1.8518,
"step": 9920
},
{
"epoch": 0.6687780061318689,
"grad_norm": 5.975794455383276,
"learning_rate": 2.98545921012964e-07,
"loss": 1.7861,
"step": 9925
},
{
"epoch": 0.6691149220039756,
"grad_norm": 6.185465462038598,
"learning_rate": 2.9800785430832354e-07,
"loss": 1.751,
"step": 9930
},
{
"epoch": 0.6694518378760823,
"grad_norm": 5.959682622502289,
"learning_rate": 2.9747006700044295e-07,
"loss": 1.7959,
"step": 9935
},
{
"epoch": 0.669788753748189,
"grad_norm": 5.644052495462014,
"learning_rate": 2.969325598331932e-07,
"loss": 1.7526,
"step": 9940
},
{
"epoch": 0.6701256696202958,
"grad_norm": 5.560965520647446,
"learning_rate": 2.9639533355005773e-07,
"loss": 1.7729,
"step": 9945
},
{
"epoch": 0.6704625854924026,
"grad_norm": 5.6497755675616865,
"learning_rate": 2.958583888941306e-07,
"loss": 1.7338,
"step": 9950
},
{
"epoch": 0.6707995013645093,
"grad_norm": 6.305541997526218,
"learning_rate": 2.9532172660811745e-07,
"loss": 1.8171,
"step": 9955
},
{
"epoch": 0.671136417236616,
"grad_norm": 5.908429580273309,
"learning_rate": 2.9478534743433247e-07,
"loss": 1.7819,
"step": 9960
},
{
"epoch": 0.6714733331087227,
"grad_norm": 5.222620415906802,
"learning_rate": 2.9424925211469876e-07,
"loss": 1.7729,
"step": 9965
},
{
"epoch": 0.6718102489808295,
"grad_norm": 6.572254603044038,
"learning_rate": 2.9371344139074645e-07,
"loss": 1.7642,
"step": 9970
},
{
"epoch": 0.6721471648529362,
"grad_norm": 6.327822112736418,
"learning_rate": 2.9317791600361243e-07,
"loss": 1.6583,
"step": 9975
},
{
"epoch": 0.672484080725043,
"grad_norm": 5.574113505265418,
"learning_rate": 2.9264267669403833e-07,
"loss": 1.8007,
"step": 9980
},
{
"epoch": 0.6728209965971497,
"grad_norm": 5.98463445361315,
"learning_rate": 2.921077242023706e-07,
"loss": 1.7971,
"step": 9985
},
{
"epoch": 0.6731579124692564,
"grad_norm": 6.304951408982249,
"learning_rate": 2.9157305926855893e-07,
"loss": 1.7754,
"step": 9990
},
{
"epoch": 0.6734948283413632,
"grad_norm": 5.969025928551944,
"learning_rate": 2.910386826321549e-07,
"loss": 1.8124,
"step": 9995
},
{
"epoch": 0.6738317442134699,
"grad_norm": 6.2610465949388034,
"learning_rate": 2.905045950323114e-07,
"loss": 1.7963,
"step": 10000
},
{
"epoch": 0.6741686600855766,
"grad_norm": 6.168419758973521,
"learning_rate": 2.899707972077817e-07,
"loss": 1.7718,
"step": 10005
},
{
"epoch": 0.6745055759576833,
"grad_norm": 6.8413981566739155,
"learning_rate": 2.8943728989691857e-07,
"loss": 1.8168,
"step": 10010
},
{
"epoch": 0.6748424918297901,
"grad_norm": 6.184366090083486,
"learning_rate": 2.88904073837672e-07,
"loss": 1.7857,
"step": 10015
},
{
"epoch": 0.6751794077018969,
"grad_norm": 5.907308449749499,
"learning_rate": 2.883711497675899e-07,
"loss": 1.7391,
"step": 10020
},
{
"epoch": 0.6755163235740036,
"grad_norm": 5.657863323168557,
"learning_rate": 2.878385184238163e-07,
"loss": 1.7462,
"step": 10025
},
{
"epoch": 0.6758532394461103,
"grad_norm": 6.0183947518407885,
"learning_rate": 2.8730618054308964e-07,
"loss": 1.7306,
"step": 10030
},
{
"epoch": 0.676190155318217,
"grad_norm": 5.743693775609563,
"learning_rate": 2.8677413686174325e-07,
"loss": 1.8358,
"step": 10035
},
{
"epoch": 0.6765270711903237,
"grad_norm": 6.764854467811837,
"learning_rate": 2.8624238811570325e-07,
"loss": 1.7915,
"step": 10040
},
{
"epoch": 0.6768639870624306,
"grad_norm": 5.519785427555744,
"learning_rate": 2.8571093504048737e-07,
"loss": 1.7907,
"step": 10045
},
{
"epoch": 0.6772009029345373,
"grad_norm": 5.738090769801983,
"learning_rate": 2.851797783712049e-07,
"loss": 1.709,
"step": 10050
},
{
"epoch": 0.677537818806644,
"grad_norm": 6.1121353044767055,
"learning_rate": 2.8464891884255515e-07,
"loss": 1.8104,
"step": 10055
},
{
"epoch": 0.6778747346787507,
"grad_norm": 6.082231792635815,
"learning_rate": 2.8411835718882593e-07,
"loss": 1.8035,
"step": 10060
},
{
"epoch": 0.6782116505508574,
"grad_norm": 5.757889132681298,
"learning_rate": 2.835880941438936e-07,
"loss": 1.7574,
"step": 10065
},
{
"epoch": 0.6785485664229642,
"grad_norm": 5.8861819403022295,
"learning_rate": 2.8305813044122093e-07,
"loss": 1.7928,
"step": 10070
},
{
"epoch": 0.678885482295071,
"grad_norm": 6.509539103297996,
"learning_rate": 2.8252846681385734e-07,
"loss": 1.8665,
"step": 10075
},
{
"epoch": 0.6792223981671777,
"grad_norm": 6.3546347574074655,
"learning_rate": 2.8199910399443625e-07,
"loss": 1.8221,
"step": 10080
},
{
"epoch": 0.6795593140392844,
"grad_norm": 5.934429157563714,
"learning_rate": 2.8147004271517584e-07,
"loss": 1.7238,
"step": 10085
},
{
"epoch": 0.6798962299113911,
"grad_norm": 5.866984996633733,
"learning_rate": 2.8094128370787694e-07,
"loss": 1.8562,
"step": 10090
},
{
"epoch": 0.6802331457834978,
"grad_norm": 6.064723412686676,
"learning_rate": 2.8041282770392196e-07,
"loss": 1.7556,
"step": 10095
},
{
"epoch": 0.6805700616556046,
"grad_norm": 6.549753460763149,
"learning_rate": 2.7988467543427454e-07,
"loss": 1.722,
"step": 10100
},
{
"epoch": 0.6809069775277113,
"grad_norm": 6.655895352398308,
"learning_rate": 2.7935682762947837e-07,
"loss": 1.8285,
"step": 10105
},
{
"epoch": 0.6812438933998181,
"grad_norm": 5.802651770334309,
"learning_rate": 2.788292850196553e-07,
"loss": 1.7974,
"step": 10110
},
{
"epoch": 0.6815808092719248,
"grad_norm": 5.861414227938647,
"learning_rate": 2.783020483345057e-07,
"loss": 1.737,
"step": 10115
},
{
"epoch": 0.6819177251440315,
"grad_norm": 5.698821956793516,
"learning_rate": 2.777751183033067e-07,
"loss": 1.8651,
"step": 10120
},
{
"epoch": 0.6822546410161383,
"grad_norm": 5.688820682473993,
"learning_rate": 2.772484956549107e-07,
"loss": 1.7865,
"step": 10125
},
{
"epoch": 0.682591556888245,
"grad_norm": 6.358195618306636,
"learning_rate": 2.7672218111774566e-07,
"loss": 1.8499,
"step": 10130
},
{
"epoch": 0.6829284727603517,
"grad_norm": 5.6611682532718985,
"learning_rate": 2.7619617541981287e-07,
"loss": 1.7677,
"step": 10135
},
{
"epoch": 0.6832653886324584,
"grad_norm": 6.2197481978105325,
"learning_rate": 2.756704792886869e-07,
"loss": 1.7721,
"step": 10140
},
{
"epoch": 0.6836023045045652,
"grad_norm": 5.844100559661615,
"learning_rate": 2.7514509345151347e-07,
"loss": 1.7516,
"step": 10145
},
{
"epoch": 0.683939220376672,
"grad_norm": 6.384694537823895,
"learning_rate": 2.746200186350097e-07,
"loss": 1.8386,
"step": 10150
},
{
"epoch": 0.6842761362487787,
"grad_norm": 5.510665281210932,
"learning_rate": 2.740952555654622e-07,
"loss": 1.8099,
"step": 10155
},
{
"epoch": 0.6846130521208854,
"grad_norm": 6.1634268813462985,
"learning_rate": 2.735708049687262e-07,
"loss": 1.7247,
"step": 10160
},
{
"epoch": 0.6849499679929921,
"grad_norm": 5.964714807031205,
"learning_rate": 2.730466675702251e-07,
"loss": 1.8141,
"step": 10165
},
{
"epoch": 0.6852868838650988,
"grad_norm": 6.170392267124102,
"learning_rate": 2.7252284409494906e-07,
"loss": 1.7619,
"step": 10170
},
{
"epoch": 0.6856237997372057,
"grad_norm": 5.509149793923222,
"learning_rate": 2.7199933526745364e-07,
"loss": 1.8118,
"step": 10175
},
{
"epoch": 0.6859607156093124,
"grad_norm": 6.54628143404462,
"learning_rate": 2.714761418118596e-07,
"loss": 1.7321,
"step": 10180
},
{
"epoch": 0.6862976314814191,
"grad_norm": 7.2477390544693945,
"learning_rate": 2.7095326445185143e-07,
"loss": 1.8118,
"step": 10185
},
{
"epoch": 0.6866345473535258,
"grad_norm": 5.822832431612233,
"learning_rate": 2.704307039106759e-07,
"loss": 1.7449,
"step": 10190
},
{
"epoch": 0.6869714632256325,
"grad_norm": 5.624150215875334,
"learning_rate": 2.6990846091114205e-07,
"loss": 1.8633,
"step": 10195
},
{
"epoch": 0.6873083790977393,
"grad_norm": 5.557146257249171,
"learning_rate": 2.6938653617561967e-07,
"loss": 1.8189,
"step": 10200
},
{
"epoch": 0.687645294969846,
"grad_norm": 5.6563267734125935,
"learning_rate": 2.688649304260383e-07,
"loss": 1.7889,
"step": 10205
},
{
"epoch": 0.6879822108419528,
"grad_norm": 6.075955336314197,
"learning_rate": 2.683436443838859e-07,
"loss": 1.81,
"step": 10210
},
{
"epoch": 0.6883191267140595,
"grad_norm": 6.259216573221612,
"learning_rate": 2.678226787702086e-07,
"loss": 1.8303,
"step": 10215
},
{
"epoch": 0.6886560425861662,
"grad_norm": 5.884748810679717,
"learning_rate": 2.673020343056094e-07,
"loss": 1.8741,
"step": 10220
},
{
"epoch": 0.688992958458273,
"grad_norm": 6.144015918665497,
"learning_rate": 2.6678171171024657e-07,
"loss": 1.8529,
"step": 10225
},
{
"epoch": 0.6893298743303797,
"grad_norm": 5.50532456214314,
"learning_rate": 2.6626171170383373e-07,
"loss": 1.7156,
"step": 10230
},
{
"epoch": 0.6896667902024864,
"grad_norm": 5.9031282172064605,
"learning_rate": 2.6574203500563776e-07,
"loss": 1.8002,
"step": 10235
},
{
"epoch": 0.6900037060745932,
"grad_norm": 5.789618897247442,
"learning_rate": 2.6522268233447894e-07,
"loss": 1.8153,
"step": 10240
},
{
"epoch": 0.6903406219466999,
"grad_norm": 5.9841417775446475,
"learning_rate": 2.6470365440872866e-07,
"loss": 1.7516,
"step": 10245
},
{
"epoch": 0.6906775378188067,
"grad_norm": 6.137156396832881,
"learning_rate": 2.641849519463099e-07,
"loss": 1.8213,
"step": 10250
},
{
"epoch": 0.6910144536909134,
"grad_norm": 5.63256141938642,
"learning_rate": 2.6366657566469465e-07,
"loss": 1.763,
"step": 10255
},
{
"epoch": 0.6913513695630201,
"grad_norm": 5.7892144401063605,
"learning_rate": 2.631485262809043e-07,
"loss": 1.7543,
"step": 10260
},
{
"epoch": 0.6916882854351268,
"grad_norm": 5.635590710985062,
"learning_rate": 2.6263080451150797e-07,
"loss": 1.7936,
"step": 10265
},
{
"epoch": 0.6920252013072336,
"grad_norm": 6.226656648440483,
"learning_rate": 2.621134110726217e-07,
"loss": 1.8707,
"step": 10270
},
{
"epoch": 0.6923621171793403,
"grad_norm": 6.159797742322993,
"learning_rate": 2.6159634667990683e-07,
"loss": 1.8383,
"step": 10275
},
{
"epoch": 0.6926990330514471,
"grad_norm": 6.186357246890332,
"learning_rate": 2.610796120485701e-07,
"loss": 1.7995,
"step": 10280
},
{
"epoch": 0.6930359489235538,
"grad_norm": 5.607970114608954,
"learning_rate": 2.605632078933623e-07,
"loss": 1.7274,
"step": 10285
},
{
"epoch": 0.6933728647956605,
"grad_norm": 6.434772493415341,
"learning_rate": 2.600471349285763e-07,
"loss": 1.8332,
"step": 10290
},
{
"epoch": 0.6937097806677672,
"grad_norm": 6.214320065920484,
"learning_rate": 2.5953139386804764e-07,
"loss": 1.79,
"step": 10295
},
{
"epoch": 0.6940466965398739,
"grad_norm": 6.399925193314172,
"learning_rate": 2.5901598542515256e-07,
"loss": 1.7514,
"step": 10300
},
{
"epoch": 0.6943836124119808,
"grad_norm": 5.732194115105609,
"learning_rate": 2.5850091031280684e-07,
"loss": 1.8028,
"step": 10305
},
{
"epoch": 0.6947205282840875,
"grad_norm": 5.820562272225368,
"learning_rate": 2.579861692434658e-07,
"loss": 1.7914,
"step": 10310
},
{
"epoch": 0.6950574441561942,
"grad_norm": 5.970701069273288,
"learning_rate": 2.574717629291222e-07,
"loss": 1.7709,
"step": 10315
},
{
"epoch": 0.6953943600283009,
"grad_norm": 5.609714862141825,
"learning_rate": 2.5695769208130615e-07,
"loss": 1.7738,
"step": 10320
},
{
"epoch": 0.6957312759004076,
"grad_norm": 5.730048947871614,
"learning_rate": 2.564439574110833e-07,
"loss": 1.7921,
"step": 10325
},
{
"epoch": 0.6960681917725144,
"grad_norm": 5.885438524238012,
"learning_rate": 2.559305596290547e-07,
"loss": 1.814,
"step": 10330
},
{
"epoch": 0.6964051076446212,
"grad_norm": 5.999810851894432,
"learning_rate": 2.554174994453555e-07,
"loss": 1.7399,
"step": 10335
},
{
"epoch": 0.6967420235167279,
"grad_norm": 5.971456247977796,
"learning_rate": 2.549047775696532e-07,
"loss": 1.7531,
"step": 10340
},
{
"epoch": 0.6970789393888346,
"grad_norm": 6.045304895294886,
"learning_rate": 2.543923947111481e-07,
"loss": 1.7802,
"step": 10345
},
{
"epoch": 0.6974158552609413,
"grad_norm": 5.898273349939425,
"learning_rate": 2.538803515785714e-07,
"loss": 1.8081,
"step": 10350
},
{
"epoch": 0.6977527711330481,
"grad_norm": 5.716447889359928,
"learning_rate": 2.5336864888018393e-07,
"loss": 1.7536,
"step": 10355
},
{
"epoch": 0.6980896870051548,
"grad_norm": 6.1167888708064,
"learning_rate": 2.528572873237761e-07,
"loss": 1.7771,
"step": 10360
},
{
"epoch": 0.6984266028772615,
"grad_norm": 5.814373279694408,
"learning_rate": 2.5234626761666647e-07,
"loss": 1.8857,
"step": 10365
},
{
"epoch": 0.6987635187493683,
"grad_norm": 5.595119203770916,
"learning_rate": 2.5183559046570036e-07,
"loss": 1.8684,
"step": 10370
},
{
"epoch": 0.699100434621475,
"grad_norm": 5.746260139787474,
"learning_rate": 2.513252565772496e-07,
"loss": 1.8153,
"step": 10375
},
{
"epoch": 0.6994373504935818,
"grad_norm": 6.070923517455952,
"learning_rate": 2.5081526665721133e-07,
"loss": 1.779,
"step": 10380
},
{
"epoch": 0.6997742663656885,
"grad_norm": 5.654721545450662,
"learning_rate": 2.503056214110062e-07,
"loss": 1.7561,
"step": 10385
},
{
"epoch": 0.7001111822377952,
"grad_norm": 6.076854962239542,
"learning_rate": 2.497963215435789e-07,
"loss": 1.7356,
"step": 10390
},
{
"epoch": 0.7004480981099019,
"grad_norm": 6.267075920486195,
"learning_rate": 2.492873677593964e-07,
"loss": 1.72,
"step": 10395
},
{
"epoch": 0.7007850139820087,
"grad_norm": 6.495036916039556,
"learning_rate": 2.4877876076244626e-07,
"loss": 1.8802,
"step": 10400
},
{
"epoch": 0.7011219298541155,
"grad_norm": 6.268581169228354,
"learning_rate": 2.482705012562367e-07,
"loss": 1.7963,
"step": 10405
},
{
"epoch": 0.7014588457262222,
"grad_norm": 5.710070783985173,
"learning_rate": 2.4776258994379546e-07,
"loss": 1.8051,
"step": 10410
},
{
"epoch": 0.7017957615983289,
"grad_norm": 6.111357846006603,
"learning_rate": 2.4725502752766883e-07,
"loss": 1.795,
"step": 10415
},
{
"epoch": 0.7021326774704356,
"grad_norm": 5.907713749893674,
"learning_rate": 2.4674781470991967e-07,
"loss": 1.7229,
"step": 10420
},
{
"epoch": 0.7024695933425423,
"grad_norm": 6.204994213224908,
"learning_rate": 2.462409521921282e-07,
"loss": 1.761,
"step": 10425
},
{
"epoch": 0.7028065092146492,
"grad_norm": 5.823631147973025,
"learning_rate": 2.4573444067538985e-07,
"loss": 1.8227,
"step": 10430
},
{
"epoch": 0.7031434250867559,
"grad_norm": 6.33472545976378,
"learning_rate": 2.4522828086031404e-07,
"loss": 1.8701,
"step": 10435
},
{
"epoch": 0.7034803409588626,
"grad_norm": 6.345376423431866,
"learning_rate": 2.4472247344702424e-07,
"loss": 1.7783,
"step": 10440
},
{
"epoch": 0.7038172568309693,
"grad_norm": 6.2754842195154685,
"learning_rate": 2.442170191351566e-07,
"loss": 1.8856,
"step": 10445
},
{
"epoch": 0.704154172703076,
"grad_norm": 6.106369137710223,
"learning_rate": 2.4371191862385816e-07,
"loss": 1.7274,
"step": 10450
},
{
"epoch": 0.7044910885751827,
"grad_norm": 6.1777861768742675,
"learning_rate": 2.4320717261178715e-07,
"loss": 1.808,
"step": 10455
},
{
"epoch": 0.7048280044472895,
"grad_norm": 6.247165515267102,
"learning_rate": 2.4270278179711163e-07,
"loss": 1.7741,
"step": 10460
},
{
"epoch": 0.7051649203193963,
"grad_norm": 6.397283503643184,
"learning_rate": 2.4219874687750754e-07,
"loss": 1.8014,
"step": 10465
},
{
"epoch": 0.705501836191503,
"grad_norm": 6.283686491438517,
"learning_rate": 2.4169506855015923e-07,
"loss": 1.8509,
"step": 10470
},
{
"epoch": 0.7058387520636097,
"grad_norm": 6.181410447908662,
"learning_rate": 2.4119174751175787e-07,
"loss": 1.7941,
"step": 10475
},
{
"epoch": 0.7061756679357164,
"grad_norm": 6.104807693363187,
"learning_rate": 2.406887844584998e-07,
"loss": 1.7714,
"step": 10480
},
{
"epoch": 0.7065125838078232,
"grad_norm": 5.877543591957879,
"learning_rate": 2.401861800860868e-07,
"loss": 1.7639,
"step": 10485
},
{
"epoch": 0.7068494996799299,
"grad_norm": 5.546451747516014,
"learning_rate": 2.396839350897241e-07,
"loss": 1.8287,
"step": 10490
},
{
"epoch": 0.7071864155520367,
"grad_norm": 6.258500520120998,
"learning_rate": 2.391820501641203e-07,
"loss": 1.8497,
"step": 10495
},
{
"epoch": 0.7075233314241434,
"grad_norm": 6.476113202277884,
"learning_rate": 2.3868052600348524e-07,
"loss": 1.8027,
"step": 10500
},
{
"epoch": 0.7078602472962501,
"grad_norm": 5.551605271516278,
"learning_rate": 2.381793633015305e-07,
"loss": 1.7701,
"step": 10505
},
{
"epoch": 0.7081971631683569,
"grad_norm": 6.2164882176078065,
"learning_rate": 2.3767856275146748e-07,
"loss": 1.7781,
"step": 10510
},
{
"epoch": 0.7085340790404636,
"grad_norm": 5.782676813508456,
"learning_rate": 2.3717812504600616e-07,
"loss": 1.7576,
"step": 10515
},
{
"epoch": 0.7088709949125703,
"grad_norm": 5.835660530958148,
"learning_rate": 2.3667805087735516e-07,
"loss": 1.8002,
"step": 10520
},
{
"epoch": 0.709207910784677,
"grad_norm": 5.988522457949844,
"learning_rate": 2.3617834093722033e-07,
"loss": 1.7542,
"step": 10525
},
{
"epoch": 0.7095448266567838,
"grad_norm": 6.439152375610268,
"learning_rate": 2.3567899591680317e-07,
"loss": 1.7935,
"step": 10530
},
{
"epoch": 0.7098817425288906,
"grad_norm": 5.567500010220148,
"learning_rate": 2.351800165068008e-07,
"loss": 1.7577,
"step": 10535
},
{
"epoch": 0.7102186584009973,
"grad_norm": 5.603156857304425,
"learning_rate": 2.346814033974047e-07,
"loss": 1.8004,
"step": 10540
},
{
"epoch": 0.710555574273104,
"grad_norm": 6.1572449811841565,
"learning_rate": 2.3418315727829962e-07,
"loss": 1.8318,
"step": 10545
},
{
"epoch": 0.7108924901452107,
"grad_norm": 6.078503199343157,
"learning_rate": 2.336852788386623e-07,
"loss": 1.8324,
"step": 10550
},
{
"epoch": 0.7112294060173174,
"grad_norm": 6.059107998684295,
"learning_rate": 2.331877687671614e-07,
"loss": 1.7983,
"step": 10555
},
{
"epoch": 0.7115663218894243,
"grad_norm": 5.977209253927193,
"learning_rate": 2.3269062775195596e-07,
"loss": 1.7753,
"step": 10560
},
{
"epoch": 0.711903237761531,
"grad_norm": 5.965166803624046,
"learning_rate": 2.321938564806944e-07,
"loss": 1.839,
"step": 10565
},
{
"epoch": 0.7122401536336377,
"grad_norm": 6.191642482071603,
"learning_rate": 2.3169745564051353e-07,
"loss": 1.7777,
"step": 10570
},
{
"epoch": 0.7125770695057444,
"grad_norm": 5.93892798040206,
"learning_rate": 2.3120142591803825e-07,
"loss": 1.8269,
"step": 10575
},
{
"epoch": 0.7129139853778511,
"grad_norm": 6.285189080508176,
"learning_rate": 2.307057679993797e-07,
"loss": 1.739,
"step": 10580
},
{
"epoch": 0.7132509012499579,
"grad_norm": 5.737737960252139,
"learning_rate": 2.30210482570135e-07,
"loss": 1.8098,
"step": 10585
},
{
"epoch": 0.7135878171220646,
"grad_norm": 5.784964645632823,
"learning_rate": 2.2971557031538607e-07,
"loss": 1.7862,
"step": 10590
},
{
"epoch": 0.7139247329941714,
"grad_norm": 5.754820146278622,
"learning_rate": 2.2922103191969828e-07,
"loss": 1.8497,
"step": 10595
},
{
"epoch": 0.7142616488662781,
"grad_norm": 6.062341451589527,
"learning_rate": 2.2872686806712032e-07,
"loss": 1.758,
"step": 10600
},
{
"epoch": 0.7145985647383848,
"grad_norm": 6.037386034436624,
"learning_rate": 2.2823307944118254e-07,
"loss": 1.7276,
"step": 10605
},
{
"epoch": 0.7149354806104916,
"grad_norm": 6.117163210581693,
"learning_rate": 2.2773966672489665e-07,
"loss": 1.8482,
"step": 10610
},
{
"epoch": 0.7152723964825983,
"grad_norm": 5.934994056880478,
"learning_rate": 2.2724663060075368e-07,
"loss": 1.7635,
"step": 10615
},
{
"epoch": 0.715609312354705,
"grad_norm": 5.888424249364107,
"learning_rate": 2.2675397175072437e-07,
"loss": 1.7819,
"step": 10620
},
{
"epoch": 0.7159462282268118,
"grad_norm": 5.24051766818146,
"learning_rate": 2.2626169085625762e-07,
"loss": 1.7494,
"step": 10625
},
{
"epoch": 0.7162831440989185,
"grad_norm": 5.624907639128319,
"learning_rate": 2.25769788598279e-07,
"loss": 1.7867,
"step": 10630
},
{
"epoch": 0.7166200599710252,
"grad_norm": 6.004337111763599,
"learning_rate": 2.2527826565719084e-07,
"loss": 1.7229,
"step": 10635
},
{
"epoch": 0.716956975843132,
"grad_norm": 5.898393504503456,
"learning_rate": 2.2478712271287087e-07,
"loss": 1.8171,
"step": 10640
},
{
"epoch": 0.7172938917152387,
"grad_norm": 5.941991199667987,
"learning_rate": 2.2429636044467059e-07,
"loss": 1.783,
"step": 10645
},
{
"epoch": 0.7176308075873454,
"grad_norm": 5.456828579892596,
"learning_rate": 2.2380597953141573e-07,
"loss": 1.8494,
"step": 10650
},
{
"epoch": 0.7179677234594521,
"grad_norm": 5.815674818794205,
"learning_rate": 2.2331598065140396e-07,
"loss": 1.7785,
"step": 10655
},
{
"epoch": 0.7183046393315589,
"grad_norm": 5.847892274384262,
"learning_rate": 2.228263644824045e-07,
"loss": 1.8753,
"step": 10660
},
{
"epoch": 0.7186415552036657,
"grad_norm": 5.486645013061549,
"learning_rate": 2.2233713170165757e-07,
"loss": 1.7817,
"step": 10665
},
{
"epoch": 0.7189784710757724,
"grad_norm": 6.219147030446047,
"learning_rate": 2.2184828298587298e-07,
"loss": 1.8092,
"step": 10670
},
{
"epoch": 0.7193153869478791,
"grad_norm": 5.4866490812706195,
"learning_rate": 2.213598190112294e-07,
"loss": 1.7201,
"step": 10675
},
{
"epoch": 0.7196523028199858,
"grad_norm": 6.119492342078513,
"learning_rate": 2.2087174045337275e-07,
"loss": 1.7692,
"step": 10680
},
{
"epoch": 0.7199892186920925,
"grad_norm": 5.758138381844213,
"learning_rate": 2.2038404798741644e-07,
"loss": 1.7833,
"step": 10685
},
{
"epoch": 0.7203261345641994,
"grad_norm": 5.701487797413592,
"learning_rate": 2.1989674228793987e-07,
"loss": 1.7566,
"step": 10690
},
{
"epoch": 0.7206630504363061,
"grad_norm": 6.482581377556036,
"learning_rate": 2.1940982402898684e-07,
"loss": 1.8066,
"step": 10695
},
{
"epoch": 0.7209999663084128,
"grad_norm": 5.6849295801038355,
"learning_rate": 2.1892329388406582e-07,
"loss": 1.7557,
"step": 10700
},
{
"epoch": 0.7213368821805195,
"grad_norm": 5.952259208162528,
"learning_rate": 2.1843715252614847e-07,
"loss": 1.7513,
"step": 10705
},
{
"epoch": 0.7216737980526262,
"grad_norm": 6.19002719248764,
"learning_rate": 2.179514006276681e-07,
"loss": 1.801,
"step": 10710
},
{
"epoch": 0.722010713924733,
"grad_norm": 5.404105686010048,
"learning_rate": 2.1746603886051978e-07,
"loss": 1.8273,
"step": 10715
},
{
"epoch": 0.7223476297968398,
"grad_norm": 5.88089250460163,
"learning_rate": 2.169810678960591e-07,
"loss": 1.768,
"step": 10720
},
{
"epoch": 0.7226845456689465,
"grad_norm": 5.449763025065613,
"learning_rate": 2.1649648840510047e-07,
"loss": 1.7353,
"step": 10725
},
{
"epoch": 0.7230214615410532,
"grad_norm": 6.096695445268082,
"learning_rate": 2.1601230105791751e-07,
"loss": 1.8199,
"step": 10730
},
{
"epoch": 0.7233583774131599,
"grad_norm": 6.397301804989678,
"learning_rate": 2.1552850652424077e-07,
"loss": 1.8495,
"step": 10735
},
{
"epoch": 0.7236952932852667,
"grad_norm": 6.029146969394674,
"learning_rate": 2.150451054732581e-07,
"loss": 1.7442,
"step": 10740
},
{
"epoch": 0.7240322091573734,
"grad_norm": 6.426485862249062,
"learning_rate": 2.1456209857361246e-07,
"loss": 1.769,
"step": 10745
},
{
"epoch": 0.7243691250294801,
"grad_norm": 5.915318242136389,
"learning_rate": 2.1407948649340208e-07,
"loss": 1.7982,
"step": 10750
},
{
"epoch": 0.7247060409015869,
"grad_norm": 6.419855312273165,
"learning_rate": 2.1359726990017908e-07,
"loss": 1.7202,
"step": 10755
},
{
"epoch": 0.7250429567736936,
"grad_norm": 6.343992364049308,
"learning_rate": 2.13115449460948e-07,
"loss": 1.8065,
"step": 10760
},
{
"epoch": 0.7253798726458004,
"grad_norm": 6.240947236635965,
"learning_rate": 2.12634025842166e-07,
"loss": 1.7197,
"step": 10765
},
{
"epoch": 0.7257167885179071,
"grad_norm": 6.320174097494002,
"learning_rate": 2.1215299970974132e-07,
"loss": 1.8149,
"step": 10770
},
{
"epoch": 0.7260537043900138,
"grad_norm": 5.766045008893195,
"learning_rate": 2.116723717290318e-07,
"loss": 1.8156,
"step": 10775
},
{
"epoch": 0.7263906202621205,
"grad_norm": 6.130324245414857,
"learning_rate": 2.111921425648453e-07,
"loss": 1.8454,
"step": 10780
},
{
"epoch": 0.7267275361342272,
"grad_norm": 6.192434248424656,
"learning_rate": 2.1071231288143777e-07,
"loss": 1.8161,
"step": 10785
},
{
"epoch": 0.7270644520063341,
"grad_norm": 5.846908813819069,
"learning_rate": 2.1023288334251222e-07,
"loss": 1.7336,
"step": 10790
},
{
"epoch": 0.7274013678784408,
"grad_norm": 5.245586756603191,
"learning_rate": 2.0975385461121864e-07,
"loss": 1.8321,
"step": 10795
},
{
"epoch": 0.7277382837505475,
"grad_norm": 5.920551725229809,
"learning_rate": 2.0927522735015268e-07,
"loss": 1.6794,
"step": 10800
},
{
"epoch": 0.7280751996226542,
"grad_norm": 5.618144476884494,
"learning_rate": 2.0879700222135416e-07,
"loss": 1.792,
"step": 10805
},
{
"epoch": 0.7284121154947609,
"grad_norm": 6.032158334331574,
"learning_rate": 2.083191798863072e-07,
"loss": 1.783,
"step": 10810
},
{
"epoch": 0.7287490313668676,
"grad_norm": 6.065083852382277,
"learning_rate": 2.0784176100593836e-07,
"loss": 1.7731,
"step": 10815
},
{
"epoch": 0.7290859472389745,
"grad_norm": 6.262727055878644,
"learning_rate": 2.0736474624061655e-07,
"loss": 1.8491,
"step": 10820
},
{
"epoch": 0.7294228631110812,
"grad_norm": 5.725691879050536,
"learning_rate": 2.0688813625015123e-07,
"loss": 1.7585,
"step": 10825
},
{
"epoch": 0.7297597789831879,
"grad_norm": 6.127578280833137,
"learning_rate": 2.064119316937923e-07,
"loss": 1.8265,
"step": 10830
},
{
"epoch": 0.7300966948552946,
"grad_norm": 5.911039793824732,
"learning_rate": 2.0593613323022907e-07,
"loss": 1.75,
"step": 10835
},
{
"epoch": 0.7304336107274013,
"grad_norm": 6.138482118358594,
"learning_rate": 2.054607415175884e-07,
"loss": 1.802,
"step": 10840
},
{
"epoch": 0.7307705265995081,
"grad_norm": 6.265596175254885,
"learning_rate": 2.0498575721343525e-07,
"loss": 1.8145,
"step": 10845
},
{
"epoch": 0.7311074424716149,
"grad_norm": 6.2040452798334895,
"learning_rate": 2.0451118097477093e-07,
"loss": 1.7859,
"step": 10850
},
{
"epoch": 0.7314443583437216,
"grad_norm": 6.550091577089053,
"learning_rate": 2.0403701345803186e-07,
"loss": 1.7903,
"step": 10855
},
{
"epoch": 0.7317812742158283,
"grad_norm": 6.022289220773169,
"learning_rate": 2.0356325531908952e-07,
"loss": 1.787,
"step": 10860
},
{
"epoch": 0.732118190087935,
"grad_norm": 5.63615689024889,
"learning_rate": 2.0308990721324926e-07,
"loss": 1.7921,
"step": 10865
},
{
"epoch": 0.7324551059600418,
"grad_norm": 5.554562145897487,
"learning_rate": 2.0261696979524873e-07,
"loss": 1.7947,
"step": 10870
},
{
"epoch": 0.7327920218321485,
"grad_norm": 5.90754768571259,
"learning_rate": 2.0214444371925793e-07,
"loss": 1.6978,
"step": 10875
},
{
"epoch": 0.7331289377042552,
"grad_norm": 5.737704638109933,
"learning_rate": 2.0167232963887787e-07,
"loss": 1.7567,
"step": 10880
},
{
"epoch": 0.733465853576362,
"grad_norm": 6.505256107780983,
"learning_rate": 2.0120062820713974e-07,
"loss": 1.7953,
"step": 10885
},
{
"epoch": 0.7338027694484687,
"grad_norm": 6.0759411860109855,
"learning_rate": 2.0072934007650345e-07,
"loss": 1.7824,
"step": 10890
},
{
"epoch": 0.7341396853205755,
"grad_norm": 5.67638008354974,
"learning_rate": 2.0025846589885798e-07,
"loss": 1.8058,
"step": 10895
},
{
"epoch": 0.7344766011926822,
"grad_norm": 5.722828442355223,
"learning_rate": 1.99788006325519e-07,
"loss": 1.7025,
"step": 10900
},
{
"epoch": 0.7348135170647889,
"grad_norm": 5.832985351284273,
"learning_rate": 1.9931796200722943e-07,
"loss": 1.7731,
"step": 10905
},
{
"epoch": 0.7351504329368956,
"grad_norm": 5.781715493966306,
"learning_rate": 1.9884833359415698e-07,
"loss": 1.7884,
"step": 10910
},
{
"epoch": 0.7354873488090024,
"grad_norm": 5.648892528599704,
"learning_rate": 1.9837912173589494e-07,
"loss": 1.8245,
"step": 10915
},
{
"epoch": 0.7358242646811092,
"grad_norm": 6.213430881285172,
"learning_rate": 1.979103270814596e-07,
"loss": 1.8618,
"step": 10920
},
{
"epoch": 0.7361611805532159,
"grad_norm": 5.833744394817774,
"learning_rate": 1.9744195027929072e-07,
"loss": 1.7965,
"step": 10925
},
{
"epoch": 0.7364980964253226,
"grad_norm": 6.295540209087126,
"learning_rate": 1.9697399197725023e-07,
"loss": 1.8203,
"step": 10930
},
{
"epoch": 0.7368350122974293,
"grad_norm": 5.9347340029367786,
"learning_rate": 1.965064528226204e-07,
"loss": 1.8165,
"step": 10935
},
{
"epoch": 0.737171928169536,
"grad_norm": 6.24051717756245,
"learning_rate": 1.9603933346210445e-07,
"loss": 1.8334,
"step": 10940
},
{
"epoch": 0.7375088440416429,
"grad_norm": 6.077355157061982,
"learning_rate": 1.9557263454182476e-07,
"loss": 1.7887,
"step": 10945
},
{
"epoch": 0.7378457599137496,
"grad_norm": 5.672952748842877,
"learning_rate": 1.9510635670732216e-07,
"loss": 1.7845,
"step": 10950
},
{
"epoch": 0.7381826757858563,
"grad_norm": 6.078037272901075,
"learning_rate": 1.946405006035548e-07,
"loss": 1.7856,
"step": 10955
},
{
"epoch": 0.738519591657963,
"grad_norm": 5.934637815732029,
"learning_rate": 1.9417506687489772e-07,
"loss": 1.8035,
"step": 10960
},
{
"epoch": 0.7388565075300697,
"grad_norm": 5.667637574057072,
"learning_rate": 1.937100561651418e-07,
"loss": 1.7679,
"step": 10965
},
{
"epoch": 0.7391934234021765,
"grad_norm": 6.121851679797381,
"learning_rate": 1.9324546911749246e-07,
"loss": 1.7986,
"step": 10970
},
{
"epoch": 0.7395303392742832,
"grad_norm": 5.452560236133865,
"learning_rate": 1.9278130637456957e-07,
"loss": 1.7125,
"step": 10975
},
{
"epoch": 0.73986725514639,
"grad_norm": 6.117974741639468,
"learning_rate": 1.923175685784056e-07,
"loss": 1.791,
"step": 10980
},
{
"epoch": 0.7402041710184967,
"grad_norm": 6.418307670126074,
"learning_rate": 1.9185425637044567e-07,
"loss": 1.7894,
"step": 10985
},
{
"epoch": 0.7405410868906034,
"grad_norm": 5.9432415931639415,
"learning_rate": 1.9139137039154584e-07,
"loss": 1.8261,
"step": 10990
},
{
"epoch": 0.7408780027627101,
"grad_norm": 5.899504950225264,
"learning_rate": 1.9092891128197308e-07,
"loss": 1.7587,
"step": 10995
},
{
"epoch": 0.7412149186348169,
"grad_norm": 6.154385724870445,
"learning_rate": 1.904668796814033e-07,
"loss": 1.8855,
"step": 11000
},
{
"epoch": 0.7415518345069236,
"grad_norm": 6.4245473778375795,
"learning_rate": 1.9000527622892154e-07,
"loss": 1.8116,
"step": 11005
},
{
"epoch": 0.7418887503790303,
"grad_norm": 6.0428091022921535,
"learning_rate": 1.895441015630206e-07,
"loss": 1.7959,
"step": 11010
},
{
"epoch": 0.7422256662511371,
"grad_norm": 5.998009175042043,
"learning_rate": 1.8908335632160011e-07,
"loss": 1.8721,
"step": 11015
},
{
"epoch": 0.7425625821232438,
"grad_norm": 5.824437955606014,
"learning_rate": 1.8862304114196542e-07,
"loss": 1.7818,
"step": 11020
},
{
"epoch": 0.7428994979953506,
"grad_norm": 6.578699865645307,
"learning_rate": 1.8816315666082744e-07,
"loss": 1.7565,
"step": 11025
},
{
"epoch": 0.7432364138674573,
"grad_norm": 6.113247957569191,
"learning_rate": 1.877037035143013e-07,
"loss": 1.8026,
"step": 11030
},
{
"epoch": 0.743573329739564,
"grad_norm": 5.882144085105441,
"learning_rate": 1.8724468233790512e-07,
"loss": 1.7853,
"step": 11035
},
{
"epoch": 0.7439102456116707,
"grad_norm": 6.557823258170281,
"learning_rate": 1.867860937665599e-07,
"loss": 1.8762,
"step": 11040
},
{
"epoch": 0.7442471614837775,
"grad_norm": 5.474147682422214,
"learning_rate": 1.8632793843458827e-07,
"loss": 1.823,
"step": 11045
},
{
"epoch": 0.7445840773558843,
"grad_norm": 6.581490827678887,
"learning_rate": 1.8587021697571313e-07,
"loss": 1.7813,
"step": 11050
},
{
"epoch": 0.744920993227991,
"grad_norm": 6.447231328360916,
"learning_rate": 1.854129300230578e-07,
"loss": 1.8047,
"step": 11055
},
{
"epoch": 0.7452579091000977,
"grad_norm": 6.041430889705571,
"learning_rate": 1.849560782091445e-07,
"loss": 1.7589,
"step": 11060
},
{
"epoch": 0.7455948249722044,
"grad_norm": 6.245656556925254,
"learning_rate": 1.8449966216589319e-07,
"loss": 1.8047,
"step": 11065
},
{
"epoch": 0.7459317408443111,
"grad_norm": 6.341804719685981,
"learning_rate": 1.8404368252462128e-07,
"loss": 1.753,
"step": 11070
},
{
"epoch": 0.746268656716418,
"grad_norm": 6.114522479869793,
"learning_rate": 1.8358813991604262e-07,
"loss": 1.8459,
"step": 11075
},
{
"epoch": 0.7466055725885247,
"grad_norm": 5.82791681785764,
"learning_rate": 1.8313303497026673e-07,
"loss": 1.7887,
"step": 11080
},
{
"epoch": 0.7469424884606314,
"grad_norm": 6.059091931920949,
"learning_rate": 1.8267836831679718e-07,
"loss": 1.8508,
"step": 11085
},
{
"epoch": 0.7472794043327381,
"grad_norm": 6.184374099826963,
"learning_rate": 1.8222414058453183e-07,
"loss": 1.8226,
"step": 11090
},
{
"epoch": 0.7476163202048448,
"grad_norm": 5.768389403876578,
"learning_rate": 1.8177035240176136e-07,
"loss": 1.8282,
"step": 11095
},
{
"epoch": 0.7479532360769516,
"grad_norm": 6.626367557762699,
"learning_rate": 1.8131700439616803e-07,
"loss": 1.8547,
"step": 11100
},
{
"epoch": 0.7482901519490583,
"grad_norm": 5.830974589056405,
"learning_rate": 1.8086409719482576e-07,
"loss": 1.729,
"step": 11105
},
{
"epoch": 0.7486270678211651,
"grad_norm": 5.856454265986918,
"learning_rate": 1.8041163142419857e-07,
"loss": 1.8279,
"step": 11110
},
{
"epoch": 0.7489639836932718,
"grad_norm": 6.178371847517803,
"learning_rate": 1.7995960771013962e-07,
"loss": 1.7677,
"step": 11115
},
{
"epoch": 0.7493008995653785,
"grad_norm": 6.657651397465904,
"learning_rate": 1.7950802667789107e-07,
"loss": 1.8415,
"step": 11120
},
{
"epoch": 0.7496378154374853,
"grad_norm": 5.550990585298974,
"learning_rate": 1.7905688895208259e-07,
"loss": 1.7762,
"step": 11125
},
{
"epoch": 0.749974731309592,
"grad_norm": 6.099324547733622,
"learning_rate": 1.7860619515673032e-07,
"loss": 1.8136,
"step": 11130
},
{
"epoch": 0.7503116471816987,
"grad_norm": 6.650418350028828,
"learning_rate": 1.7815594591523687e-07,
"loss": 1.8521,
"step": 11135
},
{
"epoch": 0.7506485630538055,
"grad_norm": 5.902004000771492,
"learning_rate": 1.777061418503898e-07,
"loss": 1.8051,
"step": 11140
},
{
"epoch": 0.7509854789259122,
"grad_norm": 5.952130300679766,
"learning_rate": 1.7725678358436053e-07,
"loss": 1.784,
"step": 11145
},
{
"epoch": 0.751322394798019,
"grad_norm": 5.775716403203176,
"learning_rate": 1.7680787173870454e-07,
"loss": 1.7143,
"step": 11150
},
{
"epoch": 0.7516593106701257,
"grad_norm": 5.483388083311626,
"learning_rate": 1.763594069343589e-07,
"loss": 1.7443,
"step": 11155
},
{
"epoch": 0.7519962265422324,
"grad_norm": 6.1697764587867265,
"learning_rate": 1.7591138979164337e-07,
"loss": 1.7364,
"step": 11160
},
{
"epoch": 0.7523331424143391,
"grad_norm": 5.761947711806687,
"learning_rate": 1.7546382093025758e-07,
"loss": 1.8046,
"step": 11165
},
{
"epoch": 0.7526700582864458,
"grad_norm": 6.194710744116213,
"learning_rate": 1.7501670096928162e-07,
"loss": 1.8163,
"step": 11170
},
{
"epoch": 0.7530069741585526,
"grad_norm": 5.488662638622846,
"learning_rate": 1.7457003052717473e-07,
"loss": 1.859,
"step": 11175
},
{
"epoch": 0.7533438900306594,
"grad_norm": 6.690003543021317,
"learning_rate": 1.741238102217738e-07,
"loss": 1.7725,
"step": 11180
},
{
"epoch": 0.7536808059027661,
"grad_norm": 5.803437120207633,
"learning_rate": 1.736780406702937e-07,
"loss": 1.778,
"step": 11185
},
{
"epoch": 0.7540177217748728,
"grad_norm": 6.069859658324239,
"learning_rate": 1.7323272248932564e-07,
"loss": 1.7409,
"step": 11190
},
{
"epoch": 0.7543546376469795,
"grad_norm": 5.500334561391225,
"learning_rate": 1.727878562948362e-07,
"loss": 1.769,
"step": 11195
},
{
"epoch": 0.7546915535190862,
"grad_norm": 6.708781480901355,
"learning_rate": 1.723434427021671e-07,
"loss": 1.7128,
"step": 11200
},
{
"epoch": 0.7550284693911931,
"grad_norm": 5.826615006643447,
"learning_rate": 1.7189948232603412e-07,
"loss": 1.815,
"step": 11205
},
{
"epoch": 0.7553653852632998,
"grad_norm": 5.4713789742043275,
"learning_rate": 1.7145597578052557e-07,
"loss": 1.8527,
"step": 11210
},
{
"epoch": 0.7557023011354065,
"grad_norm": 5.9722168111861,
"learning_rate": 1.7101292367910259e-07,
"loss": 1.8327,
"step": 11215
},
{
"epoch": 0.7560392170075132,
"grad_norm": 5.927637118304886,
"learning_rate": 1.7057032663459768e-07,
"loss": 1.8339,
"step": 11220
},
{
"epoch": 0.7563761328796199,
"grad_norm": 6.7802666022955815,
"learning_rate": 1.701281852592134e-07,
"loss": 1.7988,
"step": 11225
},
{
"epoch": 0.7567130487517267,
"grad_norm": 5.797191803923179,
"learning_rate": 1.696865001645228e-07,
"loss": 1.7488,
"step": 11230
},
{
"epoch": 0.7570499646238334,
"grad_norm": 5.441139282117675,
"learning_rate": 1.6924527196146692e-07,
"loss": 1.8553,
"step": 11235
},
{
"epoch": 0.7573868804959402,
"grad_norm": 5.812956263017639,
"learning_rate": 1.6880450126035572e-07,
"loss": 1.7806,
"step": 11240
},
{
"epoch": 0.7577237963680469,
"grad_norm": 6.166337691279962,
"learning_rate": 1.683641886708655e-07,
"loss": 1.7556,
"step": 11245
},
{
"epoch": 0.7580607122401536,
"grad_norm": 5.87741529416599,
"learning_rate": 1.6792433480203955e-07,
"loss": 1.7676,
"step": 11250
},
{
"epoch": 0.7583976281122604,
"grad_norm": 6.040766300149007,
"learning_rate": 1.674849402622865e-07,
"loss": 1.8061,
"step": 11255
},
{
"epoch": 0.7587345439843671,
"grad_norm": 5.897769345655216,
"learning_rate": 1.6704600565937927e-07,
"loss": 1.8181,
"step": 11260
},
{
"epoch": 0.7590714598564738,
"grad_norm": 6.171163777497132,
"learning_rate": 1.6660753160045498e-07,
"loss": 1.738,
"step": 11265
},
{
"epoch": 0.7594083757285806,
"grad_norm": 5.981331595498035,
"learning_rate": 1.6616951869201378e-07,
"loss": 1.7648,
"step": 11270
},
{
"epoch": 0.7597452916006873,
"grad_norm": 5.305640332140947,
"learning_rate": 1.6573196753991747e-07,
"loss": 1.8503,
"step": 11275
},
{
"epoch": 0.7600822074727941,
"grad_norm": 6.0520841045058456,
"learning_rate": 1.652948787493896e-07,
"loss": 1.797,
"step": 11280
},
{
"epoch": 0.7604191233449008,
"grad_norm": 6.148814790534166,
"learning_rate": 1.64858252925014e-07,
"loss": 1.8173,
"step": 11285
},
{
"epoch": 0.7607560392170075,
"grad_norm": 5.979828710800014,
"learning_rate": 1.6442209067073442e-07,
"loss": 1.7896,
"step": 11290
},
{
"epoch": 0.7610929550891142,
"grad_norm": 6.217740605002487,
"learning_rate": 1.639863925898527e-07,
"loss": 1.812,
"step": 11295
},
{
"epoch": 0.761429870961221,
"grad_norm": 6.185363065792439,
"learning_rate": 1.6355115928502934e-07,
"loss": 1.8223,
"step": 11300
},
{
"epoch": 0.7617667868333278,
"grad_norm": 6.068122134218195,
"learning_rate": 1.6311639135828176e-07,
"loss": 1.8278,
"step": 11305
},
{
"epoch": 0.7621037027054345,
"grad_norm": 6.41089152387682,
"learning_rate": 1.6268208941098344e-07,
"loss": 1.7763,
"step": 11310
},
{
"epoch": 0.7624406185775412,
"grad_norm": 5.670824470630969,
"learning_rate": 1.6224825404386326e-07,
"loss": 1.7438,
"step": 11315
},
{
"epoch": 0.7627775344496479,
"grad_norm": 5.613534488092596,
"learning_rate": 1.6181488585700541e-07,
"loss": 1.6914,
"step": 11320
},
{
"epoch": 0.7631144503217546,
"grad_norm": 6.215093315606554,
"learning_rate": 1.6138198544984692e-07,
"loss": 1.8232,
"step": 11325
},
{
"epoch": 0.7634513661938614,
"grad_norm": 5.922779615098384,
"learning_rate": 1.609495534211785e-07,
"loss": 1.8005,
"step": 11330
},
{
"epoch": 0.7637882820659682,
"grad_norm": 5.777262125949082,
"learning_rate": 1.6051759036914286e-07,
"loss": 1.7512,
"step": 11335
},
{
"epoch": 0.7641251979380749,
"grad_norm": 6.045493587778366,
"learning_rate": 1.6008609689123364e-07,
"loss": 1.8303,
"step": 11340
},
{
"epoch": 0.7644621138101816,
"grad_norm": 6.160481197853635,
"learning_rate": 1.596550735842953e-07,
"loss": 1.8254,
"step": 11345
},
{
"epoch": 0.7647990296822883,
"grad_norm": 5.995522803991697,
"learning_rate": 1.5922452104452204e-07,
"loss": 1.795,
"step": 11350
},
{
"epoch": 0.765135945554395,
"grad_norm": 6.407506795307969,
"learning_rate": 1.5879443986745678e-07,
"loss": 1.8201,
"step": 11355
},
{
"epoch": 0.7654728614265018,
"grad_norm": 5.868369233442601,
"learning_rate": 1.583648306479901e-07,
"loss": 1.7845,
"step": 11360
},
{
"epoch": 0.7658097772986086,
"grad_norm": 6.300762474878201,
"learning_rate": 1.5793569398036032e-07,
"loss": 1.8119,
"step": 11365
},
{
"epoch": 0.7661466931707153,
"grad_norm": 5.732440809710967,
"learning_rate": 1.57507030458152e-07,
"loss": 1.7298,
"step": 11370
},
{
"epoch": 0.766483609042822,
"grad_norm": 5.691483180721791,
"learning_rate": 1.5707884067429471e-07,
"loss": 1.8783,
"step": 11375
},
{
"epoch": 0.7668205249149287,
"grad_norm": 5.888135695526149,
"learning_rate": 1.566511252210635e-07,
"loss": 1.7552,
"step": 11380
},
{
"epoch": 0.7671574407870355,
"grad_norm": 5.475719381961438,
"learning_rate": 1.5622388469007696e-07,
"loss": 1.7779,
"step": 11385
},
{
"epoch": 0.7674943566591422,
"grad_norm": 5.736297394223776,
"learning_rate": 1.5579711967229652e-07,
"loss": 1.7512,
"step": 11390
},
{
"epoch": 0.7678312725312489,
"grad_norm": 6.108820433367013,
"learning_rate": 1.5537083075802647e-07,
"loss": 1.8151,
"step": 11395
},
{
"epoch": 0.7681681884033557,
"grad_norm": 5.645549075230665,
"learning_rate": 1.5494501853691195e-07,
"loss": 1.8157,
"step": 11400
},
{
"epoch": 0.7685051042754624,
"grad_norm": 6.416767723799877,
"learning_rate": 1.5451968359793927e-07,
"loss": 1.7385,
"step": 11405
},
{
"epoch": 0.7688420201475692,
"grad_norm": 5.795195428364132,
"learning_rate": 1.5409482652943396e-07,
"loss": 1.753,
"step": 11410
},
{
"epoch": 0.7691789360196759,
"grad_norm": 6.287596532670337,
"learning_rate": 1.536704479190611e-07,
"loss": 1.7284,
"step": 11415
},
{
"epoch": 0.7695158518917826,
"grad_norm": 5.7722465474507425,
"learning_rate": 1.5324654835382384e-07,
"loss": 1.8132,
"step": 11420
},
{
"epoch": 0.7698527677638893,
"grad_norm": 5.810495984958396,
"learning_rate": 1.5282312842006238e-07,
"loss": 1.7963,
"step": 11425
},
{
"epoch": 0.770189683635996,
"grad_norm": 5.7818877383121015,
"learning_rate": 1.5240018870345388e-07,
"loss": 1.7625,
"step": 11430
},
{
"epoch": 0.7705265995081029,
"grad_norm": 6.4616000691306015,
"learning_rate": 1.519777297890113e-07,
"loss": 1.7305,
"step": 11435
},
{
"epoch": 0.7708635153802096,
"grad_norm": 6.9500507066394235,
"learning_rate": 1.5155575226108198e-07,
"loss": 1.7646,
"step": 11440
},
{
"epoch": 0.7712004312523163,
"grad_norm": 5.90587314198153,
"learning_rate": 1.51134256703348e-07,
"loss": 1.8436,
"step": 11445
},
{
"epoch": 0.771537347124423,
"grad_norm": 5.866892874363843,
"learning_rate": 1.5071324369882478e-07,
"loss": 1.7458,
"step": 11450
},
{
"epoch": 0.7718742629965297,
"grad_norm": 6.291434417162354,
"learning_rate": 1.5029271382985964e-07,
"loss": 1.7889,
"step": 11455
},
{
"epoch": 0.7722111788686365,
"grad_norm": 6.101683917084692,
"learning_rate": 1.498726676781323e-07,
"loss": 1.8393,
"step": 11460
},
{
"epoch": 0.7725480947407433,
"grad_norm": 6.448679173286852,
"learning_rate": 1.4945310582465327e-07,
"loss": 1.823,
"step": 11465
},
{
"epoch": 0.77288501061285,
"grad_norm": 6.03477234470622,
"learning_rate": 1.4903402884976262e-07,
"loss": 1.8207,
"step": 11470
},
{
"epoch": 0.7732219264849567,
"grad_norm": 6.7380284585238135,
"learning_rate": 1.4861543733313065e-07,
"loss": 1.8021,
"step": 11475
},
{
"epoch": 0.7735588423570634,
"grad_norm": 6.077995753476859,
"learning_rate": 1.4819733185375531e-07,
"loss": 1.8847,
"step": 11480
},
{
"epoch": 0.7738957582291702,
"grad_norm": 5.664292238827305,
"learning_rate": 1.4777971298996288e-07,
"loss": 1.7623,
"step": 11485
},
{
"epoch": 0.7742326741012769,
"grad_norm": 5.799868960607819,
"learning_rate": 1.4736258131940605e-07,
"loss": 1.7456,
"step": 11490
},
{
"epoch": 0.7745695899733837,
"grad_norm": 6.433673442490591,
"learning_rate": 1.4694593741906403e-07,
"loss": 1.8046,
"step": 11495
},
{
"epoch": 0.7749065058454904,
"grad_norm": 6.079136779626077,
"learning_rate": 1.4652978186524135e-07,
"loss": 1.8136,
"step": 11500
},
{
"epoch": 0.7752434217175971,
"grad_norm": 5.575230853873149,
"learning_rate": 1.4611411523356653e-07,
"loss": 1.8148,
"step": 11505
},
{
"epoch": 0.7755803375897039,
"grad_norm": 5.9996748971291,
"learning_rate": 1.4569893809899242e-07,
"loss": 1.8543,
"step": 11510
},
{
"epoch": 0.7759172534618106,
"grad_norm": 5.931034717600519,
"learning_rate": 1.452842510357946e-07,
"loss": 1.756,
"step": 11515
},
{
"epoch": 0.7762541693339173,
"grad_norm": 5.768234880310048,
"learning_rate": 1.4487005461757051e-07,
"loss": 1.7537,
"step": 11520
},
{
"epoch": 0.776591085206024,
"grad_norm": 5.848819600001102,
"learning_rate": 1.4445634941723927e-07,
"loss": 1.7961,
"step": 11525
},
{
"epoch": 0.7769280010781308,
"grad_norm": 5.865945761273097,
"learning_rate": 1.4404313600704054e-07,
"loss": 1.8757,
"step": 11530
},
{
"epoch": 0.7772649169502375,
"grad_norm": 6.156370938984119,
"learning_rate": 1.4363041495853334e-07,
"loss": 1.7886,
"step": 11535
},
{
"epoch": 0.7776018328223443,
"grad_norm": 6.286006570843523,
"learning_rate": 1.4321818684259607e-07,
"loss": 1.8275,
"step": 11540
},
{
"epoch": 0.777938748694451,
"grad_norm": 5.702969599119352,
"learning_rate": 1.4280645222942535e-07,
"loss": 1.786,
"step": 11545
},
{
"epoch": 0.7782756645665577,
"grad_norm": 5.670263286040782,
"learning_rate": 1.4239521168853458e-07,
"loss": 1.7547,
"step": 11550
},
{
"epoch": 0.7786125804386644,
"grad_norm": 5.877323637618684,
"learning_rate": 1.4198446578875444e-07,
"loss": 1.7754,
"step": 11555
},
{
"epoch": 0.7789494963107712,
"grad_norm": 5.950199480950997,
"learning_rate": 1.4157421509823119e-07,
"loss": 1.7511,
"step": 11560
},
{
"epoch": 0.779286412182878,
"grad_norm": 5.888421854077837,
"learning_rate": 1.4116446018442608e-07,
"loss": 1.7078,
"step": 11565
},
{
"epoch": 0.7796233280549847,
"grad_norm": 6.034680017416559,
"learning_rate": 1.4075520161411425e-07,
"loss": 1.7349,
"step": 11570
},
{
"epoch": 0.7799602439270914,
"grad_norm": 6.307386970077885,
"learning_rate": 1.403464399533849e-07,
"loss": 1.7923,
"step": 11575
},
{
"epoch": 0.7802971597991981,
"grad_norm": 5.967516151887473,
"learning_rate": 1.3993817576763983e-07,
"loss": 1.8707,
"step": 11580
},
{
"epoch": 0.7806340756713048,
"grad_norm": 6.351426294821415,
"learning_rate": 1.3953040962159207e-07,
"loss": 1.7889,
"step": 11585
},
{
"epoch": 0.7809709915434117,
"grad_norm": 6.192822266741291,
"learning_rate": 1.3912314207926657e-07,
"loss": 1.77,
"step": 11590
},
{
"epoch": 0.7813079074155184,
"grad_norm": 5.8783177680109855,
"learning_rate": 1.3871637370399824e-07,
"loss": 1.7897,
"step": 11595
},
{
"epoch": 0.7816448232876251,
"grad_norm": 6.222300669334982,
"learning_rate": 1.3831010505843139e-07,
"loss": 1.7571,
"step": 11600
},
{
"epoch": 0.7819817391597318,
"grad_norm": 5.5964193211778825,
"learning_rate": 1.3790433670451927e-07,
"loss": 1.729,
"step": 11605
},
{
"epoch": 0.7823186550318385,
"grad_norm": 5.724469476942663,
"learning_rate": 1.374990692035235e-07,
"loss": 1.7733,
"step": 11610
},
{
"epoch": 0.7826555709039453,
"grad_norm": 6.081763550531529,
"learning_rate": 1.3709430311601205e-07,
"loss": 1.7551,
"step": 11615
},
{
"epoch": 0.782992486776052,
"grad_norm": 5.656334636067317,
"learning_rate": 1.366900390018601e-07,
"loss": 1.689,
"step": 11620
},
{
"epoch": 0.7833294026481588,
"grad_norm": 5.899100005447112,
"learning_rate": 1.3628627742024812e-07,
"loss": 1.8493,
"step": 11625
},
{
"epoch": 0.7836663185202655,
"grad_norm": 6.416010977631392,
"learning_rate": 1.3588301892966182e-07,
"loss": 1.6631,
"step": 11630
},
{
"epoch": 0.7840032343923722,
"grad_norm": 6.2420816157066845,
"learning_rate": 1.3548026408789044e-07,
"loss": 1.8145,
"step": 11635
},
{
"epoch": 0.784340150264479,
"grad_norm": 6.394093878050669,
"learning_rate": 1.350780134520272e-07,
"loss": 1.8385,
"step": 11640
},
{
"epoch": 0.7846770661365857,
"grad_norm": 5.898670325238131,
"learning_rate": 1.3467626757846733e-07,
"loss": 1.765,
"step": 11645
},
{
"epoch": 0.7850139820086924,
"grad_norm": 5.755709771745298,
"learning_rate": 1.342750270229085e-07,
"loss": 1.7411,
"step": 11650
},
{
"epoch": 0.7853508978807991,
"grad_norm": 6.361755561484261,
"learning_rate": 1.338742923403487e-07,
"loss": 1.7003,
"step": 11655
},
{
"epoch": 0.7856878137529059,
"grad_norm": 6.475887456311801,
"learning_rate": 1.3347406408508694e-07,
"loss": 1.7245,
"step": 11660
},
{
"epoch": 0.7860247296250127,
"grad_norm": 5.87715144272717,
"learning_rate": 1.3307434281072106e-07,
"loss": 1.827,
"step": 11665
},
{
"epoch": 0.7863616454971194,
"grad_norm": 6.139319666913612,
"learning_rate": 1.326751290701481e-07,
"loss": 1.7936,
"step": 11670
},
{
"epoch": 0.7866985613692261,
"grad_norm": 5.760142588635934,
"learning_rate": 1.3227642341556306e-07,
"loss": 1.8124,
"step": 11675
},
{
"epoch": 0.7870354772413328,
"grad_norm": 5.731411372422863,
"learning_rate": 1.318782263984577e-07,
"loss": 1.813,
"step": 11680
},
{
"epoch": 0.7873723931134395,
"grad_norm": 6.028570522927185,
"learning_rate": 1.314805385696207e-07,
"loss": 1.8055,
"step": 11685
},
{
"epoch": 0.7877093089855464,
"grad_norm": 6.4147651771131144,
"learning_rate": 1.3108336047913633e-07,
"loss": 1.8518,
"step": 11690
},
{
"epoch": 0.7880462248576531,
"grad_norm": 5.745385502619762,
"learning_rate": 1.3068669267638377e-07,
"loss": 1.766,
"step": 11695
},
{
"epoch": 0.7883831407297598,
"grad_norm": 5.753592382263267,
"learning_rate": 1.3029053571003619e-07,
"loss": 1.8171,
"step": 11700
},
{
"epoch": 0.7887200566018665,
"grad_norm": 6.094153318282813,
"learning_rate": 1.2989489012806033e-07,
"loss": 1.8266,
"step": 11705
},
{
"epoch": 0.7890569724739732,
"grad_norm": 6.626686965131564,
"learning_rate": 1.294997564777157e-07,
"loss": 1.7849,
"step": 11710
},
{
"epoch": 0.7893938883460799,
"grad_norm": 6.089503582912961,
"learning_rate": 1.291051353055534e-07,
"loss": 1.8466,
"step": 11715
},
{
"epoch": 0.7897308042181868,
"grad_norm": 5.949166550268341,
"learning_rate": 1.28711027157416e-07,
"loss": 1.7636,
"step": 11720
},
{
"epoch": 0.7900677200902935,
"grad_norm": 5.777412800846406,
"learning_rate": 1.2831743257843597e-07,
"loss": 1.7805,
"step": 11725
},
{
"epoch": 0.7904046359624002,
"grad_norm": 6.030785841040514,
"learning_rate": 1.279243521130361e-07,
"loss": 1.7691,
"step": 11730
},
{
"epoch": 0.7907415518345069,
"grad_norm": 6.159706148659863,
"learning_rate": 1.2753178630492733e-07,
"loss": 1.874,
"step": 11735
},
{
"epoch": 0.7910784677066136,
"grad_norm": 5.984302184404192,
"learning_rate": 1.271397356971094e-07,
"loss": 1.7516,
"step": 11740
},
{
"epoch": 0.7914153835787204,
"grad_norm": 6.29530906949322,
"learning_rate": 1.267482008318687e-07,
"loss": 1.7803,
"step": 11745
},
{
"epoch": 0.7917522994508271,
"grad_norm": 5.999071523327007,
"learning_rate": 1.2635718225077884e-07,
"loss": 1.9044,
"step": 11750
},
{
"epoch": 0.7920892153229339,
"grad_norm": 6.009971558825039,
"learning_rate": 1.259666804946991e-07,
"loss": 1.7475,
"step": 11755
},
{
"epoch": 0.7924261311950406,
"grad_norm": 6.1762343420480725,
"learning_rate": 1.2557669610377397e-07,
"loss": 1.7928,
"step": 11760
},
{
"epoch": 0.7927630470671473,
"grad_norm": 6.274370196479858,
"learning_rate": 1.25187229617432e-07,
"loss": 1.8475,
"step": 11765
},
{
"epoch": 0.7930999629392541,
"grad_norm": 5.619732178273275,
"learning_rate": 1.247982815743857e-07,
"loss": 1.7979,
"step": 11770
},
{
"epoch": 0.7934368788113608,
"grad_norm": 6.37694088286426,
"learning_rate": 1.2440985251263054e-07,
"loss": 1.7106,
"step": 11775
},
{
"epoch": 0.7937737946834675,
"grad_norm": 5.846264824270768,
"learning_rate": 1.2402194296944363e-07,
"loss": 1.7965,
"step": 11780
},
{
"epoch": 0.7941107105555743,
"grad_norm": 6.051659571401686,
"learning_rate": 1.236345534813839e-07,
"loss": 1.7276,
"step": 11785
},
{
"epoch": 0.794447626427681,
"grad_norm": 6.040361548706306,
"learning_rate": 1.2324768458429107e-07,
"loss": 1.7819,
"step": 11790
},
{
"epoch": 0.7947845422997878,
"grad_norm": 6.5893551244691,
"learning_rate": 1.228613368132842e-07,
"loss": 1.8357,
"step": 11795
},
{
"epoch": 0.7951214581718945,
"grad_norm": 5.556745304140574,
"learning_rate": 1.2247551070276207e-07,
"loss": 1.6579,
"step": 11800
},
{
"epoch": 0.7954583740440012,
"grad_norm": 5.90446121886169,
"learning_rate": 1.2209020678640176e-07,
"loss": 1.8439,
"step": 11805
},
{
"epoch": 0.7957952899161079,
"grad_norm": 6.108546411118622,
"learning_rate": 1.2170542559715775e-07,
"loss": 1.7384,
"step": 11810
},
{
"epoch": 0.7961322057882146,
"grad_norm": 6.183593777221998,
"learning_rate": 1.2132116766726196e-07,
"loss": 1.8085,
"step": 11815
},
{
"epoch": 0.7964691216603215,
"grad_norm": 5.981494846943721,
"learning_rate": 1.2093743352822206e-07,
"loss": 1.8076,
"step": 11820
},
{
"epoch": 0.7968060375324282,
"grad_norm": 7.444395690512402,
"learning_rate": 1.2055422371082168e-07,
"loss": 1.7539,
"step": 11825
},
{
"epoch": 0.7971429534045349,
"grad_norm": 5.876372023884193,
"learning_rate": 1.2017153874511865e-07,
"loss": 1.7757,
"step": 11830
},
{
"epoch": 0.7974798692766416,
"grad_norm": 6.370625401977648,
"learning_rate": 1.1978937916044534e-07,
"loss": 1.8109,
"step": 11835
},
{
"epoch": 0.7978167851487483,
"grad_norm": 5.883242118263617,
"learning_rate": 1.1940774548540733e-07,
"loss": 1.7298,
"step": 11840
},
{
"epoch": 0.7981537010208551,
"grad_norm": 5.767584882456747,
"learning_rate": 1.1902663824788233e-07,
"loss": 1.7702,
"step": 11845
},
{
"epoch": 0.7984906168929619,
"grad_norm": 5.756847394591811,
"learning_rate": 1.1864605797502031e-07,
"loss": 1.7993,
"step": 11850
},
{
"epoch": 0.7988275327650686,
"grad_norm": 5.412926796228684,
"learning_rate": 1.1826600519324237e-07,
"loss": 1.8253,
"step": 11855
},
{
"epoch": 0.7991644486371753,
"grad_norm": 6.040919321374024,
"learning_rate": 1.1788648042823956e-07,
"loss": 1.7589,
"step": 11860
},
{
"epoch": 0.799501364509282,
"grad_norm": 6.478672926194837,
"learning_rate": 1.1750748420497298e-07,
"loss": 1.7519,
"step": 11865
},
{
"epoch": 0.7998382803813888,
"grad_norm": 6.375820517208503,
"learning_rate": 1.1712901704767253e-07,
"loss": 1.7681,
"step": 11870
},
{
"epoch": 0.8001751962534955,
"grad_norm": 5.709068291536156,
"learning_rate": 1.1675107947983615e-07,
"loss": 1.8144,
"step": 11875
},
{
"epoch": 0.8005121121256022,
"grad_norm": 6.054350290149295,
"learning_rate": 1.1637367202422943e-07,
"loss": 1.8536,
"step": 11880
},
{
"epoch": 0.800849027997709,
"grad_norm": 5.698352041079042,
"learning_rate": 1.159967952028848e-07,
"loss": 1.8335,
"step": 11885
},
{
"epoch": 0.8011859438698157,
"grad_norm": 6.031855898334881,
"learning_rate": 1.1562044953710032e-07,
"loss": 1.7183,
"step": 11890
},
{
"epoch": 0.8015228597419224,
"grad_norm": 5.625308544157251,
"learning_rate": 1.152446355474398e-07,
"loss": 1.7847,
"step": 11895
},
{
"epoch": 0.8018597756140292,
"grad_norm": 6.683973873807723,
"learning_rate": 1.1486935375373124e-07,
"loss": 1.8403,
"step": 11900
},
{
"epoch": 0.8021966914861359,
"grad_norm": 6.641861847519315,
"learning_rate": 1.1449460467506689e-07,
"loss": 1.7658,
"step": 11905
},
{
"epoch": 0.8025336073582426,
"grad_norm": 5.610412974235367,
"learning_rate": 1.1412038882980174e-07,
"loss": 1.7931,
"step": 11910
},
{
"epoch": 0.8028705232303494,
"grad_norm": 5.685183233588973,
"learning_rate": 1.1374670673555348e-07,
"loss": 1.7837,
"step": 11915
},
{
"epoch": 0.8032074391024561,
"grad_norm": 5.611660132961658,
"learning_rate": 1.1337355890920169e-07,
"loss": 1.8168,
"step": 11920
},
{
"epoch": 0.8035443549745629,
"grad_norm": 5.403506032815681,
"learning_rate": 1.130009458668863e-07,
"loss": 1.7992,
"step": 11925
},
{
"epoch": 0.8038812708466696,
"grad_norm": 5.79197296037922,
"learning_rate": 1.1262886812400813e-07,
"loss": 1.7595,
"step": 11930
},
{
"epoch": 0.8042181867187763,
"grad_norm": 5.8569410896518574,
"learning_rate": 1.1225732619522754e-07,
"loss": 1.7474,
"step": 11935
},
{
"epoch": 0.804555102590883,
"grad_norm": 6.025795706365198,
"learning_rate": 1.118863205944633e-07,
"loss": 1.7532,
"step": 11940
},
{
"epoch": 0.8048920184629897,
"grad_norm": 6.168101917389683,
"learning_rate": 1.1151585183489266e-07,
"loss": 1.8026,
"step": 11945
},
{
"epoch": 0.8052289343350966,
"grad_norm": 5.827955134646608,
"learning_rate": 1.1114592042895044e-07,
"loss": 1.775,
"step": 11950
},
{
"epoch": 0.8055658502072033,
"grad_norm": 6.076578864373635,
"learning_rate": 1.1077652688832772e-07,
"loss": 1.7875,
"step": 11955
},
{
"epoch": 0.80590276607931,
"grad_norm": 5.7168850287245245,
"learning_rate": 1.1040767172397209e-07,
"loss": 1.7375,
"step": 11960
},
{
"epoch": 0.8062396819514167,
"grad_norm": 6.282872282911842,
"learning_rate": 1.1003935544608612e-07,
"loss": 1.777,
"step": 11965
},
{
"epoch": 0.8065765978235234,
"grad_norm": 5.887798609993405,
"learning_rate": 1.0967157856412739e-07,
"loss": 1.7622,
"step": 11970
},
{
"epoch": 0.8069135136956302,
"grad_norm": 6.199457852026214,
"learning_rate": 1.093043415868069e-07,
"loss": 1.8875,
"step": 11975
},
{
"epoch": 0.807250429567737,
"grad_norm": 5.745866008462109,
"learning_rate": 1.0893764502208891e-07,
"loss": 1.8143,
"step": 11980
},
{
"epoch": 0.8075873454398437,
"grad_norm": 6.110266261481142,
"learning_rate": 1.0857148937719063e-07,
"loss": 1.7584,
"step": 11985
},
{
"epoch": 0.8079242613119504,
"grad_norm": 5.969686237894243,
"learning_rate": 1.0820587515858054e-07,
"loss": 1.8105,
"step": 11990
},
{
"epoch": 0.8082611771840571,
"grad_norm": 5.759859891411606,
"learning_rate": 1.078408028719785e-07,
"loss": 1.7588,
"step": 11995
},
{
"epoch": 0.8085980930561639,
"grad_norm": 5.579575443228889,
"learning_rate": 1.0747627302235491e-07,
"loss": 1.8207,
"step": 12000
},
{
"epoch": 0.8089350089282706,
"grad_norm": 6.213088918697938,
"learning_rate": 1.0711228611392936e-07,
"loss": 1.8158,
"step": 12005
},
{
"epoch": 0.8092719248003774,
"grad_norm": 6.19662993284419,
"learning_rate": 1.0674884265017086e-07,
"loss": 1.7965,
"step": 12010
},
{
"epoch": 0.8096088406724841,
"grad_norm": 5.992976800633545,
"learning_rate": 1.0638594313379678e-07,
"loss": 1.7008,
"step": 12015
},
{
"epoch": 0.8099457565445908,
"grad_norm": 6.142320900455945,
"learning_rate": 1.060235880667717e-07,
"loss": 1.839,
"step": 12020
},
{
"epoch": 0.8102826724166976,
"grad_norm": 5.560359240436788,
"learning_rate": 1.056617779503074e-07,
"loss": 1.7725,
"step": 12025
},
{
"epoch": 0.8106195882888043,
"grad_norm": 6.437317541471631,
"learning_rate": 1.053005132848619e-07,
"loss": 1.7889,
"step": 12030
},
{
"epoch": 0.810956504160911,
"grad_norm": 5.943008760322463,
"learning_rate": 1.0493979457013874e-07,
"loss": 1.8495,
"step": 12035
},
{
"epoch": 0.8112934200330177,
"grad_norm": 5.612578556134183,
"learning_rate": 1.0457962230508599e-07,
"loss": 1.768,
"step": 12040
},
{
"epoch": 0.8116303359051245,
"grad_norm": 6.331371365186378,
"learning_rate": 1.042199969878963e-07,
"loss": 1.7489,
"step": 12045
},
{
"epoch": 0.8119672517772313,
"grad_norm": 5.594775080040045,
"learning_rate": 1.0386091911600564e-07,
"loss": 1.7299,
"step": 12050
},
{
"epoch": 0.812304167649338,
"grad_norm": 5.993230773092134,
"learning_rate": 1.0350238918609244e-07,
"loss": 1.7439,
"step": 12055
},
{
"epoch": 0.8126410835214447,
"grad_norm": 5.79719115106528,
"learning_rate": 1.0314440769407784e-07,
"loss": 1.7425,
"step": 12060
},
{
"epoch": 0.8129779993935514,
"grad_norm": 6.163239487153185,
"learning_rate": 1.0278697513512375e-07,
"loss": 1.6992,
"step": 12065
},
{
"epoch": 0.8133149152656581,
"grad_norm": 6.019555217322019,
"learning_rate": 1.0243009200363295e-07,
"loss": 1.7035,
"step": 12070
},
{
"epoch": 0.8136518311377648,
"grad_norm": 6.411126780956076,
"learning_rate": 1.0207375879324853e-07,
"loss": 1.7633,
"step": 12075
},
{
"epoch": 0.8139887470098717,
"grad_norm": 6.560863671091432,
"learning_rate": 1.0171797599685284e-07,
"loss": 1.8065,
"step": 12080
},
{
"epoch": 0.8143256628819784,
"grad_norm": 5.61227341277097,
"learning_rate": 1.0136274410656653e-07,
"loss": 1.8068,
"step": 12085
},
{
"epoch": 0.8146625787540851,
"grad_norm": 5.998206753083885,
"learning_rate": 1.010080636137487e-07,
"loss": 1.773,
"step": 12090
},
{
"epoch": 0.8149994946261918,
"grad_norm": 6.074166764653066,
"learning_rate": 1.0065393500899549e-07,
"loss": 1.695,
"step": 12095
},
{
"epoch": 0.8153364104982985,
"grad_norm": 5.886255551150027,
"learning_rate": 1.0030035878213988e-07,
"loss": 1.7873,
"step": 12100
},
{
"epoch": 0.8156733263704053,
"grad_norm": 5.998981616215432,
"learning_rate": 9.994733542225037e-08,
"loss": 1.7402,
"step": 12105
},
{
"epoch": 0.8160102422425121,
"grad_norm": 5.68317572627976,
"learning_rate": 9.959486541763118e-08,
"loss": 1.7603,
"step": 12110
},
{
"epoch": 0.8163471581146188,
"grad_norm": 5.900621893113269,
"learning_rate": 9.924294925582105e-08,
"loss": 1.8141,
"step": 12115
},
{
"epoch": 0.8166840739867255,
"grad_norm": 6.333895946186688,
"learning_rate": 9.889158742359233e-08,
"loss": 1.7808,
"step": 12120
},
{
"epoch": 0.8170209898588322,
"grad_norm": 5.947606154639251,
"learning_rate": 9.854078040695096e-08,
"loss": 1.8038,
"step": 12125
},
{
"epoch": 0.817357905730939,
"grad_norm": 5.642752987160135,
"learning_rate": 9.819052869113543e-08,
"loss": 1.7817,
"step": 12130
},
{
"epoch": 0.8176948216030457,
"grad_norm": 6.10066295676423,
"learning_rate": 9.784083276061578e-08,
"loss": 1.773,
"step": 12135
},
{
"epoch": 0.8180317374751525,
"grad_norm": 6.390361061803448,
"learning_rate": 9.749169309909382e-08,
"loss": 1.7252,
"step": 12140
},
{
"epoch": 0.8183686533472592,
"grad_norm": 5.605942954144895,
"learning_rate": 9.714311018950139e-08,
"loss": 1.7809,
"step": 12145
},
{
"epoch": 0.8187055692193659,
"grad_norm": 5.633651526457674,
"learning_rate": 9.67950845140007e-08,
"loss": 1.8077,
"step": 12150
},
{
"epoch": 0.8190424850914727,
"grad_norm": 5.803438387797127,
"learning_rate": 9.644761655398276e-08,
"loss": 1.7152,
"step": 12155
},
{
"epoch": 0.8193794009635794,
"grad_norm": 6.908491498949583,
"learning_rate": 9.61007067900675e-08,
"loss": 1.835,
"step": 12160
},
{
"epoch": 0.8197163168356861,
"grad_norm": 5.867281972222165,
"learning_rate": 9.575435570210266e-08,
"loss": 1.8139,
"step": 12165
},
{
"epoch": 0.8200532327077928,
"grad_norm": 5.7296379548628105,
"learning_rate": 9.5408563769163e-08,
"loss": 1.7873,
"step": 12170
},
{
"epoch": 0.8203901485798996,
"grad_norm": 5.1827663356538896,
"learning_rate": 9.506333146955009e-08,
"loss": 1.7037,
"step": 12175
},
{
"epoch": 0.8207270644520064,
"grad_norm": 5.3980116057512335,
"learning_rate": 9.471865928079148e-08,
"loss": 1.7945,
"step": 12180
},
{
"epoch": 0.8210639803241131,
"grad_norm": 5.4658793637910925,
"learning_rate": 9.437454767963954e-08,
"loss": 1.7733,
"step": 12185
},
{
"epoch": 0.8214008961962198,
"grad_norm": 6.1213965153921075,
"learning_rate": 9.403099714207174e-08,
"loss": 1.7969,
"step": 12190
},
{
"epoch": 0.8217378120683265,
"grad_norm": 5.838123846027778,
"learning_rate": 9.368800814328931e-08,
"loss": 1.8021,
"step": 12195
},
{
"epoch": 0.8220747279404332,
"grad_norm": 6.339463633874931,
"learning_rate": 9.334558115771646e-08,
"loss": 1.789,
"step": 12200
},
{
"epoch": 0.8224116438125401,
"grad_norm": 5.5623582051309155,
"learning_rate": 9.300371665900048e-08,
"loss": 1.7557,
"step": 12205
},
{
"epoch": 0.8227485596846468,
"grad_norm": 6.361797379331405,
"learning_rate": 9.266241512001044e-08,
"loss": 1.7047,
"step": 12210
},
{
"epoch": 0.8230854755567535,
"grad_norm": 5.952969064534977,
"learning_rate": 9.23216770128365e-08,
"loss": 1.7918,
"step": 12215
},
{
"epoch": 0.8234223914288602,
"grad_norm": 6.3011377196826155,
"learning_rate": 9.19815028087898e-08,
"loss": 1.7846,
"step": 12220
},
{
"epoch": 0.8237593073009669,
"grad_norm": 5.5847611704130316,
"learning_rate": 9.164189297840147e-08,
"loss": 1.8166,
"step": 12225
},
{
"epoch": 0.8240962231730737,
"grad_norm": 6.510883218408153,
"learning_rate": 9.130284799142179e-08,
"loss": 1.7582,
"step": 12230
},
{
"epoch": 0.8244331390451805,
"grad_norm": 6.09019683484805,
"learning_rate": 9.09643683168197e-08,
"loss": 1.775,
"step": 12235
},
{
"epoch": 0.8247700549172872,
"grad_norm": 5.852527126802162,
"learning_rate": 9.062645442278244e-08,
"loss": 1.8143,
"step": 12240
},
{
"epoch": 0.8251069707893939,
"grad_norm": 5.8185776473103505,
"learning_rate": 9.028910677671469e-08,
"loss": 1.86,
"step": 12245
},
{
"epoch": 0.8254438866615006,
"grad_norm": 6.0410974739836405,
"learning_rate": 8.995232584523754e-08,
"loss": 1.7393,
"step": 12250
},
{
"epoch": 0.8257808025336073,
"grad_norm": 6.076512425747793,
"learning_rate": 8.961611209418851e-08,
"loss": 1.783,
"step": 12255
},
{
"epoch": 0.8261177184057141,
"grad_norm": 5.948019238377317,
"learning_rate": 8.928046598862065e-08,
"loss": 1.7837,
"step": 12260
},
{
"epoch": 0.8264546342778208,
"grad_norm": 6.210822208377314,
"learning_rate": 8.894538799280138e-08,
"loss": 1.8413,
"step": 12265
},
{
"epoch": 0.8267915501499276,
"grad_norm": 5.430421973136374,
"learning_rate": 8.861087857021282e-08,
"loss": 1.7717,
"step": 12270
},
{
"epoch": 0.8271284660220343,
"grad_norm": 6.478405775826868,
"learning_rate": 8.827693818355048e-08,
"loss": 1.6905,
"step": 12275
},
{
"epoch": 0.827465381894141,
"grad_norm": 6.100376083406547,
"learning_rate": 8.794356729472252e-08,
"loss": 1.7629,
"step": 12280
},
{
"epoch": 0.8278022977662478,
"grad_norm": 5.678122613542731,
"learning_rate": 8.76107663648497e-08,
"loss": 1.7382,
"step": 12285
},
{
"epoch": 0.8281392136383545,
"grad_norm": 5.648135451717112,
"learning_rate": 8.727853585426436e-08,
"loss": 1.8082,
"step": 12290
},
{
"epoch": 0.8284761295104612,
"grad_norm": 5.782391731975333,
"learning_rate": 8.694687622250963e-08,
"loss": 1.7921,
"step": 12295
},
{
"epoch": 0.828813045382568,
"grad_norm": 5.854072035771789,
"learning_rate": 8.661578792833907e-08,
"loss": 1.8038,
"step": 12300
},
{
"epoch": 0.8291499612546747,
"grad_norm": 5.731142272858889,
"learning_rate": 8.628527142971632e-08,
"loss": 1.6797,
"step": 12305
},
{
"epoch": 0.8294868771267815,
"grad_norm": 5.81642283052816,
"learning_rate": 8.595532718381338e-08,
"loss": 1.7891,
"step": 12310
},
{
"epoch": 0.8298237929988882,
"grad_norm": 6.010154250940546,
"learning_rate": 8.562595564701153e-08,
"loss": 1.8121,
"step": 12315
},
{
"epoch": 0.8301607088709949,
"grad_norm": 6.179374881780338,
"learning_rate": 8.529715727489912e-08,
"loss": 1.8339,
"step": 12320
},
{
"epoch": 0.8304976247431016,
"grad_norm": 5.734377900742119,
"learning_rate": 8.496893252227238e-08,
"loss": 1.7897,
"step": 12325
},
{
"epoch": 0.8308345406152083,
"grad_norm": 5.466608782130143,
"learning_rate": 8.464128184313346e-08,
"loss": 1.7676,
"step": 12330
},
{
"epoch": 0.8311714564873152,
"grad_norm": 6.156550917567134,
"learning_rate": 8.431420569069093e-08,
"loss": 1.7143,
"step": 12335
},
{
"epoch": 0.8315083723594219,
"grad_norm": 6.212082720502393,
"learning_rate": 8.398770451735865e-08,
"loss": 1.7742,
"step": 12340
},
{
"epoch": 0.8318452882315286,
"grad_norm": 6.203682522334206,
"learning_rate": 8.366177877475473e-08,
"loss": 1.7803,
"step": 12345
},
{
"epoch": 0.8321822041036353,
"grad_norm": 5.773188612460619,
"learning_rate": 8.333642891370174e-08,
"loss": 1.8809,
"step": 12350
},
{
"epoch": 0.832519119975742,
"grad_norm": 5.572837441862478,
"learning_rate": 8.301165538422577e-08,
"loss": 1.8031,
"step": 12355
},
{
"epoch": 0.8328560358478488,
"grad_norm": 6.1136780454024136,
"learning_rate": 8.268745863555521e-08,
"loss": 1.7058,
"step": 12360
},
{
"epoch": 0.8331929517199556,
"grad_norm": 6.072274649537594,
"learning_rate": 8.236383911612116e-08,
"loss": 1.7625,
"step": 12365
},
{
"epoch": 0.8335298675920623,
"grad_norm": 5.642550548273279,
"learning_rate": 8.204079727355611e-08,
"loss": 1.7976,
"step": 12370
},
{
"epoch": 0.833866783464169,
"grad_norm": 6.656470966965112,
"learning_rate": 8.171833355469354e-08,
"loss": 1.8501,
"step": 12375
},
{
"epoch": 0.8342036993362757,
"grad_norm": 6.377445803343679,
"learning_rate": 8.139644840556703e-08,
"loss": 1.7944,
"step": 12380
},
{
"epoch": 0.8345406152083825,
"grad_norm": 6.003788291285669,
"learning_rate": 8.107514227141032e-08,
"loss": 1.7504,
"step": 12385
},
{
"epoch": 0.8348775310804892,
"grad_norm": 5.763130376288233,
"learning_rate": 8.075441559665569e-08,
"loss": 1.8019,
"step": 12390
},
{
"epoch": 0.835214446952596,
"grad_norm": 5.365364742414105,
"learning_rate": 8.04342688249346e-08,
"loss": 1.8251,
"step": 12395
},
{
"epoch": 0.8355513628247027,
"grad_norm": 5.943722593508419,
"learning_rate": 8.011470239907558e-08,
"loss": 1.7841,
"step": 12400
},
{
"epoch": 0.8358882786968094,
"grad_norm": 6.280412446558371,
"learning_rate": 7.979571676110525e-08,
"loss": 1.7724,
"step": 12405
},
{
"epoch": 0.8362251945689162,
"grad_norm": 6.41052769353216,
"learning_rate": 7.947731235224614e-08,
"loss": 1.8516,
"step": 12410
},
{
"epoch": 0.8365621104410229,
"grad_norm": 5.8087270438920156,
"learning_rate": 7.915948961291729e-08,
"loss": 1.7758,
"step": 12415
},
{
"epoch": 0.8368990263131296,
"grad_norm": 5.948851731569554,
"learning_rate": 7.884224898273322e-08,
"loss": 1.7559,
"step": 12420
},
{
"epoch": 0.8372359421852363,
"grad_norm": 6.26399600567096,
"learning_rate": 7.852559090050276e-08,
"loss": 1.7629,
"step": 12425
},
{
"epoch": 0.837572858057343,
"grad_norm": 6.214317608558152,
"learning_rate": 7.820951580422952e-08,
"loss": 1.7334,
"step": 12430
},
{
"epoch": 0.8379097739294498,
"grad_norm": 5.705699682881419,
"learning_rate": 7.789402413111041e-08,
"loss": 1.7162,
"step": 12435
},
{
"epoch": 0.8382466898015566,
"grad_norm": 5.854199053151604,
"learning_rate": 7.757911631753556e-08,
"loss": 1.7659,
"step": 12440
},
{
"epoch": 0.8385836056736633,
"grad_norm": 5.782064879077919,
"learning_rate": 7.72647927990871e-08,
"loss": 1.8207,
"step": 12445
},
{
"epoch": 0.83892052154577,
"grad_norm": 5.730477635589501,
"learning_rate": 7.695105401053942e-08,
"loss": 1.8184,
"step": 12450
},
{
"epoch": 0.8392574374178767,
"grad_norm": 6.009518784132229,
"learning_rate": 7.663790038585794e-08,
"loss": 1.8255,
"step": 12455
},
{
"epoch": 0.8395943532899834,
"grad_norm": 5.80924413423034,
"learning_rate": 7.63253323581985e-08,
"loss": 1.774,
"step": 12460
},
{
"epoch": 0.8399312691620903,
"grad_norm": 6.029972989497406,
"learning_rate": 7.601335035990714e-08,
"loss": 1.8408,
"step": 12465
},
{
"epoch": 0.840268185034197,
"grad_norm": 6.012125031968545,
"learning_rate": 7.57019548225194e-08,
"loss": 1.7452,
"step": 12470
},
{
"epoch": 0.8406051009063037,
"grad_norm": 6.3070354195548255,
"learning_rate": 7.539114617675941e-08,
"loss": 1.8147,
"step": 12475
},
{
"epoch": 0.8409420167784104,
"grad_norm": 5.467419693731685,
"learning_rate": 7.508092485253936e-08,
"loss": 1.7499,
"step": 12480
},
{
"epoch": 0.8412789326505171,
"grad_norm": 6.055406576016901,
"learning_rate": 7.477129127895954e-08,
"loss": 1.8029,
"step": 12485
},
{
"epoch": 0.8416158485226239,
"grad_norm": 5.736119441083093,
"learning_rate": 7.446224588430678e-08,
"loss": 1.7793,
"step": 12490
},
{
"epoch": 0.8419527643947307,
"grad_norm": 5.502943142670245,
"learning_rate": 7.415378909605457e-08,
"loss": 1.8578,
"step": 12495
},
{
"epoch": 0.8422896802668374,
"grad_norm": 6.003707840545835,
"learning_rate": 7.384592134086231e-08,
"loss": 1.7574,
"step": 12500
},
{
"epoch": 0.8426265961389441,
"grad_norm": 6.0955461129701005,
"learning_rate": 7.353864304457463e-08,
"loss": 1.8106,
"step": 12505
},
{
"epoch": 0.8429635120110508,
"grad_norm": 6.613263301440216,
"learning_rate": 7.323195463222054e-08,
"loss": 1.7904,
"step": 12510
},
{
"epoch": 0.8433004278831576,
"grad_norm": 5.851838960250193,
"learning_rate": 7.292585652801331e-08,
"loss": 1.7135,
"step": 12515
},
{
"epoch": 0.8436373437552643,
"grad_norm": 5.870921208338694,
"learning_rate": 7.262034915534993e-08,
"loss": 1.7985,
"step": 12520
},
{
"epoch": 0.843974259627371,
"grad_norm": 6.341944056999602,
"learning_rate": 7.231543293680969e-08,
"loss": 1.7518,
"step": 12525
},
{
"epoch": 0.8443111754994778,
"grad_norm": 6.7081791221760065,
"learning_rate": 7.20111082941548e-08,
"loss": 1.8246,
"step": 12530
},
{
"epoch": 0.8446480913715845,
"grad_norm": 5.923306053836086,
"learning_rate": 7.170737564832902e-08,
"loss": 1.8312,
"step": 12535
},
{
"epoch": 0.8449850072436913,
"grad_norm": 6.095365797264817,
"learning_rate": 7.14042354194569e-08,
"loss": 1.8126,
"step": 12540
},
{
"epoch": 0.845321923115798,
"grad_norm": 5.684604484852161,
"learning_rate": 7.110168802684408e-08,
"loss": 1.7082,
"step": 12545
},
{
"epoch": 0.8456588389879047,
"grad_norm": 5.725222684944491,
"learning_rate": 7.079973388897592e-08,
"loss": 1.8111,
"step": 12550
},
{
"epoch": 0.8459957548600114,
"grad_norm": 5.693320014755603,
"learning_rate": 7.049837342351706e-08,
"loss": 1.7548,
"step": 12555
},
{
"epoch": 0.8463326707321182,
"grad_norm": 6.398363624106793,
"learning_rate": 7.019760704731131e-08,
"loss": 1.8213,
"step": 12560
},
{
"epoch": 0.846669586604225,
"grad_norm": 5.860647280733894,
"learning_rate": 6.989743517638053e-08,
"loss": 1.7709,
"step": 12565
},
{
"epoch": 0.8470065024763317,
"grad_norm": 6.148625041417426,
"learning_rate": 6.959785822592402e-08,
"loss": 1.826,
"step": 12570
},
{
"epoch": 0.8473434183484384,
"grad_norm": 6.417824137742353,
"learning_rate": 6.929887661031864e-08,
"loss": 1.7246,
"step": 12575
},
{
"epoch": 0.8476803342205451,
"grad_norm": 5.64895725010932,
"learning_rate": 6.900049074311753e-08,
"loss": 1.7656,
"step": 12580
},
{
"epoch": 0.8480172500926518,
"grad_norm": 5.972290040076362,
"learning_rate": 6.870270103705e-08,
"loss": 1.7489,
"step": 12585
},
{
"epoch": 0.8483541659647587,
"grad_norm": 5.887133857519855,
"learning_rate": 6.840550790402027e-08,
"loss": 1.7962,
"step": 12590
},
{
"epoch": 0.8486910818368654,
"grad_norm": 5.782128457023704,
"learning_rate": 6.810891175510792e-08,
"loss": 1.8129,
"step": 12595
},
{
"epoch": 0.8490279977089721,
"grad_norm": 5.874892138790784,
"learning_rate": 6.781291300056647e-08,
"loss": 1.7947,
"step": 12600
},
{
"epoch": 0.8493649135810788,
"grad_norm": 5.812368931391869,
"learning_rate": 6.751751204982309e-08,
"loss": 1.6731,
"step": 12605
},
{
"epoch": 0.8497018294531855,
"grad_norm": 6.020082431564708,
"learning_rate": 6.722270931147827e-08,
"loss": 1.8078,
"step": 12610
},
{
"epoch": 0.8500387453252922,
"grad_norm": 5.960115495572332,
"learning_rate": 6.692850519330506e-08,
"loss": 1.7458,
"step": 12615
},
{
"epoch": 0.850375661197399,
"grad_norm": 6.307539795503096,
"learning_rate": 6.66349001022481e-08,
"loss": 1.7037,
"step": 12620
},
{
"epoch": 0.8507125770695058,
"grad_norm": 5.545091703305936,
"learning_rate": 6.634189444442389e-08,
"loss": 1.8488,
"step": 12625
},
{
"epoch": 0.8510494929416125,
"grad_norm": 6.254283824668138,
"learning_rate": 6.604948862511977e-08,
"loss": 1.8558,
"step": 12630
},
{
"epoch": 0.8513864088137192,
"grad_norm": 6.109361798563485,
"learning_rate": 6.575768304879292e-08,
"loss": 1.8072,
"step": 12635
},
{
"epoch": 0.8517233246858259,
"grad_norm": 5.779243983837955,
"learning_rate": 6.546647811907091e-08,
"loss": 1.6969,
"step": 12640
},
{
"epoch": 0.8520602405579327,
"grad_norm": 5.6901474533716385,
"learning_rate": 6.517587423874988e-08,
"loss": 1.8094,
"step": 12645
},
{
"epoch": 0.8523971564300394,
"grad_norm": 5.452805325410532,
"learning_rate": 6.48858718097951e-08,
"loss": 1.695,
"step": 12650
},
{
"epoch": 0.8527340723021462,
"grad_norm": 5.545529646061517,
"learning_rate": 6.459647123333956e-08,
"loss": 1.6861,
"step": 12655
},
{
"epoch": 0.8530709881742529,
"grad_norm": 6.1029815867327,
"learning_rate": 6.430767290968387e-08,
"loss": 1.7565,
"step": 12660
},
{
"epoch": 0.8534079040463596,
"grad_norm": 5.876283483843472,
"learning_rate": 6.401947723829576e-08,
"loss": 1.7649,
"step": 12665
},
{
"epoch": 0.8537448199184664,
"grad_norm": 5.860811960055388,
"learning_rate": 6.373188461780904e-08,
"loss": 1.7763,
"step": 12670
},
{
"epoch": 0.8540817357905731,
"grad_norm": 5.71660515150186,
"learning_rate": 6.344489544602371e-08,
"loss": 1.8172,
"step": 12675
},
{
"epoch": 0.8544186516626798,
"grad_norm": 6.166026903880695,
"learning_rate": 6.315851011990498e-08,
"loss": 1.8399,
"step": 12680
},
{
"epoch": 0.8547555675347865,
"grad_norm": 5.601035725888883,
"learning_rate": 6.28727290355826e-08,
"loss": 1.7438,
"step": 12685
},
{
"epoch": 0.8550924834068933,
"grad_norm": 6.425970981523913,
"learning_rate": 6.258755258835075e-08,
"loss": 1.7938,
"step": 12690
},
{
"epoch": 0.8554293992790001,
"grad_norm": 5.882378548711108,
"learning_rate": 6.230298117266736e-08,
"loss": 1.7494,
"step": 12695
},
{
"epoch": 0.8557663151511068,
"grad_norm": 6.343902811827818,
"learning_rate": 6.201901518215313e-08,
"loss": 1.764,
"step": 12700
},
{
"epoch": 0.8561032310232135,
"grad_norm": 6.253978212706272,
"learning_rate": 6.173565500959165e-08,
"loss": 1.8211,
"step": 12705
},
{
"epoch": 0.8564401468953202,
"grad_norm": 6.608042387592941,
"learning_rate": 6.14529010469284e-08,
"loss": 1.819,
"step": 12710
},
{
"epoch": 0.8567770627674269,
"grad_norm": 5.752172995272666,
"learning_rate": 6.117075368527053e-08,
"loss": 1.6747,
"step": 12715
},
{
"epoch": 0.8571139786395338,
"grad_norm": 5.840794832771322,
"learning_rate": 6.088921331488566e-08,
"loss": 1.7578,
"step": 12720
},
{
"epoch": 0.8574508945116405,
"grad_norm": 5.849034173026425,
"learning_rate": 6.060828032520249e-08,
"loss": 1.7759,
"step": 12725
},
{
"epoch": 0.8577878103837472,
"grad_norm": 6.213661331265872,
"learning_rate": 6.032795510480904e-08,
"loss": 1.7317,
"step": 12730
},
{
"epoch": 0.8581247262558539,
"grad_norm": 5.6898733401141515,
"learning_rate": 6.004823804145276e-08,
"loss": 1.8348,
"step": 12735
},
{
"epoch": 0.8584616421279606,
"grad_norm": 5.859861873519631,
"learning_rate": 5.976912952204016e-08,
"loss": 1.797,
"step": 12740
},
{
"epoch": 0.8587985580000674,
"grad_norm": 5.76133259829159,
"learning_rate": 5.9490629932635815e-08,
"loss": 1.8532,
"step": 12745
},
{
"epoch": 0.8591354738721741,
"grad_norm": 6.129986293259829,
"learning_rate": 5.921273965846191e-08,
"loss": 1.7627,
"step": 12750
},
{
"epoch": 0.8594723897442809,
"grad_norm": 5.945438078193672,
"learning_rate": 5.893545908389807e-08,
"loss": 1.6993,
"step": 12755
},
{
"epoch": 0.8598093056163876,
"grad_norm": 5.893702356532198,
"learning_rate": 5.865878859248058e-08,
"loss": 1.7023,
"step": 12760
},
{
"epoch": 0.8601462214884943,
"grad_norm": 6.134347621385907,
"learning_rate": 5.838272856690146e-08,
"loss": 1.7135,
"step": 12765
},
{
"epoch": 0.860483137360601,
"grad_norm": 5.8408265930376775,
"learning_rate": 5.810727938900878e-08,
"loss": 1.7885,
"step": 12770
},
{
"epoch": 0.8608200532327078,
"grad_norm": 5.967188418245232,
"learning_rate": 5.7832441439805536e-08,
"loss": 1.8186,
"step": 12775
},
{
"epoch": 0.8611569691048145,
"grad_norm": 5.883049229755388,
"learning_rate": 5.755821509944925e-08,
"loss": 1.7955,
"step": 12780
},
{
"epoch": 0.8614938849769213,
"grad_norm": 6.2544522511675895,
"learning_rate": 5.728460074725133e-08,
"loss": 1.7627,
"step": 12785
},
{
"epoch": 0.861830800849028,
"grad_norm": 5.786365914538278,
"learning_rate": 5.701159876167688e-08,
"loss": 1.7497,
"step": 12790
},
{
"epoch": 0.8621677167211347,
"grad_norm": 6.071667040551235,
"learning_rate": 5.673920952034406e-08,
"loss": 1.7676,
"step": 12795
},
{
"epoch": 0.8625046325932415,
"grad_norm": 5.83884850067031,
"learning_rate": 5.646743340002302e-08,
"loss": 1.7544,
"step": 12800
},
{
"epoch": 0.8628415484653482,
"grad_norm": 6.399413975233235,
"learning_rate": 5.619627077663636e-08,
"loss": 1.8448,
"step": 12805
},
{
"epoch": 0.8631784643374549,
"grad_norm": 6.006783852696938,
"learning_rate": 5.5925722025257746e-08,
"loss": 1.8516,
"step": 12810
},
{
"epoch": 0.8635153802095616,
"grad_norm": 6.048382439323067,
"learning_rate": 5.5655787520111966e-08,
"loss": 1.8342,
"step": 12815
},
{
"epoch": 0.8638522960816684,
"grad_norm": 5.9506691951333135,
"learning_rate": 5.538646763457389e-08,
"loss": 1.7532,
"step": 12820
},
{
"epoch": 0.8641892119537752,
"grad_norm": 5.7101654823015116,
"learning_rate": 5.511776274116864e-08,
"loss": 1.7317,
"step": 12825
},
{
"epoch": 0.8645261278258819,
"grad_norm": 6.3845287996072475,
"learning_rate": 5.484967321157019e-08,
"loss": 1.7862,
"step": 12830
},
{
"epoch": 0.8648630436979886,
"grad_norm": 6.234434470893632,
"learning_rate": 5.4582199416601746e-08,
"loss": 1.7275,
"step": 12835
},
{
"epoch": 0.8651999595700953,
"grad_norm": 6.22322611604707,
"learning_rate": 5.43153417262347e-08,
"loss": 1.8236,
"step": 12840
},
{
"epoch": 0.865536875442202,
"grad_norm": 6.468448015402212,
"learning_rate": 5.404910050958833e-08,
"loss": 1.7972,
"step": 12845
},
{
"epoch": 0.8658737913143089,
"grad_norm": 5.814429402895073,
"learning_rate": 5.378347613492884e-08,
"loss": 1.7756,
"step": 12850
},
{
"epoch": 0.8662107071864156,
"grad_norm": 5.7511147012293,
"learning_rate": 5.351846896966966e-08,
"loss": 1.8017,
"step": 12855
},
{
"epoch": 0.8665476230585223,
"grad_norm": 6.609525748717229,
"learning_rate": 5.32540793803703e-08,
"loss": 1.7982,
"step": 12860
},
{
"epoch": 0.866884538930629,
"grad_norm": 5.593059673213316,
"learning_rate": 5.299030773273594e-08,
"loss": 1.7657,
"step": 12865
},
{
"epoch": 0.8672214548027357,
"grad_norm": 6.277557730074275,
"learning_rate": 5.272715439161718e-08,
"loss": 1.7661,
"step": 12870
},
{
"epoch": 0.8675583706748425,
"grad_norm": 6.0356327137348575,
"learning_rate": 5.246461972100941e-08,
"loss": 1.7348,
"step": 12875
},
{
"epoch": 0.8678952865469493,
"grad_norm": 5.819516781146134,
"learning_rate": 5.220270408405197e-08,
"loss": 1.7576,
"step": 12880
},
{
"epoch": 0.868232202419056,
"grad_norm": 5.901222351752382,
"learning_rate": 5.194140784302836e-08,
"loss": 1.7543,
"step": 12885
},
{
"epoch": 0.8685691182911627,
"grad_norm": 5.102802335028869,
"learning_rate": 5.168073135936496e-08,
"loss": 1.7689,
"step": 12890
},
{
"epoch": 0.8689060341632694,
"grad_norm": 6.347703972438555,
"learning_rate": 5.1420674993631285e-08,
"loss": 1.7232,
"step": 12895
},
{
"epoch": 0.8692429500353762,
"grad_norm": 5.80051625292983,
"learning_rate": 5.116123910553854e-08,
"loss": 1.8073,
"step": 12900
},
{
"epoch": 0.8695798659074829,
"grad_norm": 5.540327273203947,
"learning_rate": 5.0902424053940406e-08,
"loss": 1.8433,
"step": 12905
},
{
"epoch": 0.8699167817795896,
"grad_norm": 5.9566276471635335,
"learning_rate": 5.064423019683106e-08,
"loss": 1.7564,
"step": 12910
},
{
"epoch": 0.8702536976516964,
"grad_norm": 6.138415912931555,
"learning_rate": 5.0386657891346e-08,
"loss": 1.7506,
"step": 12915
},
{
"epoch": 0.8705906135238031,
"grad_norm": 5.255444258029073,
"learning_rate": 5.012970749376083e-08,
"loss": 1.7524,
"step": 12920
},
{
"epoch": 0.8709275293959099,
"grad_norm": 6.279316610951135,
"learning_rate": 4.987337935949087e-08,
"loss": 1.8029,
"step": 12925
},
{
"epoch": 0.8712644452680166,
"grad_norm": 5.837133118530122,
"learning_rate": 4.961767384309068e-08,
"loss": 1.8114,
"step": 12930
},
{
"epoch": 0.8716013611401233,
"grad_norm": 5.4440077222971555,
"learning_rate": 4.936259129825376e-08,
"loss": 1.7704,
"step": 12935
},
{
"epoch": 0.87193827701223,
"grad_norm": 5.8098683047516495,
"learning_rate": 4.9108132077811836e-08,
"loss": 1.7495,
"step": 12940
},
{
"epoch": 0.8722751928843367,
"grad_norm": 6.079084932181276,
"learning_rate": 4.885429653373435e-08,
"loss": 1.7877,
"step": 12945
},
{
"epoch": 0.8726121087564435,
"grad_norm": 6.484226794140705,
"learning_rate": 4.860108501712823e-08,
"loss": 1.796,
"step": 12950
},
{
"epoch": 0.8729490246285503,
"grad_norm": 5.944286752397985,
"learning_rate": 4.834849787823725e-08,
"loss": 1.7539,
"step": 12955
},
{
"epoch": 0.873285940500657,
"grad_norm": 6.719801982192994,
"learning_rate": 4.809653546644132e-08,
"loss": 1.8585,
"step": 12960
},
{
"epoch": 0.8736228563727637,
"grad_norm": 6.021944012578693,
"learning_rate": 4.7845198130256395e-08,
"loss": 1.8295,
"step": 12965
},
{
"epoch": 0.8739597722448704,
"grad_norm": 5.682973842187247,
"learning_rate": 4.759448621733403e-08,
"loss": 1.753,
"step": 12970
},
{
"epoch": 0.8742966881169771,
"grad_norm": 6.027055507715319,
"learning_rate": 4.7344400074460276e-08,
"loss": 1.7675,
"step": 12975
},
{
"epoch": 0.874633603989084,
"grad_norm": 6.019246359137967,
"learning_rate": 4.709494004755571e-08,
"loss": 1.7944,
"step": 12980
},
{
"epoch": 0.8749705198611907,
"grad_norm": 6.205925692533681,
"learning_rate": 4.684610648167503e-08,
"loss": 1.7383,
"step": 12985
},
{
"epoch": 0.8753074357332974,
"grad_norm": 6.541486904892061,
"learning_rate": 4.659789972100647e-08,
"loss": 1.7923,
"step": 12990
},
{
"epoch": 0.8756443516054041,
"grad_norm": 5.7486065087911085,
"learning_rate": 4.635032010887097e-08,
"loss": 1.8291,
"step": 12995
},
{
"epoch": 0.8759812674775108,
"grad_norm": 5.637034264598444,
"learning_rate": 4.610336798772213e-08,
"loss": 1.7808,
"step": 13000
},
{
"epoch": 0.8763181833496176,
"grad_norm": 5.740217106810944,
"learning_rate": 4.5857043699145834e-08,
"loss": 1.8048,
"step": 13005
},
{
"epoch": 0.8766550992217244,
"grad_norm": 6.194258312455204,
"learning_rate": 4.5611347583859095e-08,
"loss": 1.7798,
"step": 13010
},
{
"epoch": 0.8769920150938311,
"grad_norm": 5.900531799707303,
"learning_rate": 4.536627998171033e-08,
"loss": 1.8157,
"step": 13015
},
{
"epoch": 0.8773289309659378,
"grad_norm": 6.470340001417238,
"learning_rate": 4.512184123167867e-08,
"loss": 1.74,
"step": 13020
},
{
"epoch": 0.8776658468380445,
"grad_norm": 5.817971283498176,
"learning_rate": 4.487803167187304e-08,
"loss": 1.7794,
"step": 13025
},
{
"epoch": 0.8780027627101513,
"grad_norm": 5.790877494976903,
"learning_rate": 4.463485163953246e-08,
"loss": 1.8452,
"step": 13030
},
{
"epoch": 0.878339678582258,
"grad_norm": 5.960756473041569,
"learning_rate": 4.4392301471025074e-08,
"loss": 1.8384,
"step": 13035
},
{
"epoch": 0.8786765944543647,
"grad_norm": 6.030953838356603,
"learning_rate": 4.415038150184758e-08,
"loss": 1.7726,
"step": 13040
},
{
"epoch": 0.8790135103264715,
"grad_norm": 6.256975255497656,
"learning_rate": 4.3909092066625245e-08,
"loss": 1.7452,
"step": 13045
},
{
"epoch": 0.8793504261985782,
"grad_norm": 5.843379556994611,
"learning_rate": 4.366843349911109e-08,
"loss": 1.7354,
"step": 13050
},
{
"epoch": 0.879687342070685,
"grad_norm": 6.33305600966883,
"learning_rate": 4.342840613218546e-08,
"loss": 1.7203,
"step": 13055
},
{
"epoch": 0.8800242579427917,
"grad_norm": 6.100154791697077,
"learning_rate": 4.318901029785571e-08,
"loss": 1.8078,
"step": 13060
},
{
"epoch": 0.8803611738148984,
"grad_norm": 5.77833918849904,
"learning_rate": 4.2950246327255523e-08,
"loss": 1.7375,
"step": 13065
},
{
"epoch": 0.8806980896870051,
"grad_norm": 5.645289655518625,
"learning_rate": 4.271211455064483e-08,
"loss": 1.8231,
"step": 13070
},
{
"epoch": 0.8810350055591119,
"grad_norm": 6.012684369012068,
"learning_rate": 4.2474615297408754e-08,
"loss": 1.7604,
"step": 13075
},
{
"epoch": 0.8813719214312187,
"grad_norm": 5.441699463373052,
"learning_rate": 4.223774889605775e-08,
"loss": 1.7773,
"step": 13080
},
{
"epoch": 0.8817088373033254,
"grad_norm": 6.0339587220276325,
"learning_rate": 4.200151567422699e-08,
"loss": 1.8166,
"step": 13085
},
{
"epoch": 0.8820457531754321,
"grad_norm": 6.067671922831784,
"learning_rate": 4.176591595867557e-08,
"loss": 1.8074,
"step": 13090
},
{
"epoch": 0.8823826690475388,
"grad_norm": 5.630193597022125,
"learning_rate": 4.153095007528645e-08,
"loss": 1.7895,
"step": 13095
},
{
"epoch": 0.8827195849196455,
"grad_norm": 5.897156624279815,
"learning_rate": 4.1296618349066e-08,
"loss": 1.723,
"step": 13100
},
{
"epoch": 0.8830565007917524,
"grad_norm": 5.59628907202942,
"learning_rate": 4.106292110414311e-08,
"loss": 1.8166,
"step": 13105
},
{
"epoch": 0.8833934166638591,
"grad_norm": 5.629179966186753,
"learning_rate": 4.082985866376926e-08,
"loss": 1.8097,
"step": 13110
},
{
"epoch": 0.8837303325359658,
"grad_norm": 6.399112941808455,
"learning_rate": 4.05974313503179e-08,
"loss": 1.8253,
"step": 13115
},
{
"epoch": 0.8840672484080725,
"grad_norm": 5.787318489151215,
"learning_rate": 4.036563948528393e-08,
"loss": 1.7877,
"step": 13120
},
{
"epoch": 0.8844041642801792,
"grad_norm": 5.519712124099288,
"learning_rate": 4.01344833892831e-08,
"loss": 1.8243,
"step": 13125
},
{
"epoch": 0.8847410801522859,
"grad_norm": 6.393037581588508,
"learning_rate": 3.990396338205204e-08,
"loss": 1.7897,
"step": 13130
},
{
"epoch": 0.8850779960243927,
"grad_norm": 6.216475201686147,
"learning_rate": 3.967407978244747e-08,
"loss": 1.6919,
"step": 13135
},
{
"epoch": 0.8854149118964995,
"grad_norm": 6.0911141234125,
"learning_rate": 3.944483290844575e-08,
"loss": 1.7612,
"step": 13140
},
{
"epoch": 0.8857518277686062,
"grad_norm": 6.14238918360982,
"learning_rate": 3.9216223077142394e-08,
"loss": 1.8431,
"step": 13145
},
{
"epoch": 0.8860887436407129,
"grad_norm": 5.873505883296761,
"learning_rate": 3.8988250604752135e-08,
"loss": 1.7477,
"step": 13150
},
{
"epoch": 0.8864256595128196,
"grad_norm": 6.636804154561093,
"learning_rate": 3.876091580660762e-08,
"loss": 1.8773,
"step": 13155
},
{
"epoch": 0.8867625753849264,
"grad_norm": 5.990492412539213,
"learning_rate": 3.853421899715992e-08,
"loss": 1.7506,
"step": 13160
},
{
"epoch": 0.8870994912570331,
"grad_norm": 5.796005866130032,
"learning_rate": 3.8308160489977424e-08,
"loss": 1.8137,
"step": 13165
},
{
"epoch": 0.8874364071291398,
"grad_norm": 6.212100150861587,
"learning_rate": 3.808274059774552e-08,
"loss": 1.8317,
"step": 13170
},
{
"epoch": 0.8877733230012466,
"grad_norm": 6.143632521898199,
"learning_rate": 3.785795963226646e-08,
"loss": 1.7555,
"step": 13175
},
{
"epoch": 0.8881102388733533,
"grad_norm": 5.9097879315428,
"learning_rate": 3.7633817904458574e-08,
"loss": 1.8154,
"step": 13180
},
{
"epoch": 0.8884471547454601,
"grad_norm": 6.222570248163253,
"learning_rate": 3.741031572435615e-08,
"loss": 1.7622,
"step": 13185
},
{
"epoch": 0.8887840706175668,
"grad_norm": 5.99694342465949,
"learning_rate": 3.718745340110868e-08,
"loss": 1.7687,
"step": 13190
},
{
"epoch": 0.8891209864896735,
"grad_norm": 6.024623734589238,
"learning_rate": 3.6965231242980624e-08,
"loss": 1.7684,
"step": 13195
},
{
"epoch": 0.8894579023617802,
"grad_norm": 6.0660869612201544,
"learning_rate": 3.6743649557351265e-08,
"loss": 1.8178,
"step": 13200
},
{
"epoch": 0.889794818233887,
"grad_norm": 5.584094991029224,
"learning_rate": 3.652270865071344e-08,
"loss": 1.7802,
"step": 13205
},
{
"epoch": 0.8901317341059938,
"grad_norm": 5.19943663551813,
"learning_rate": 3.630240882867408e-08,
"loss": 1.714,
"step": 13210
},
{
"epoch": 0.8904686499781005,
"grad_norm": 5.960605902380365,
"learning_rate": 3.608275039595332e-08,
"loss": 1.8067,
"step": 13215
},
{
"epoch": 0.8908055658502072,
"grad_norm": 6.364002982000837,
"learning_rate": 3.5863733656383844e-08,
"loss": 1.7506,
"step": 13220
},
{
"epoch": 0.8911424817223139,
"grad_norm": 5.831937518355378,
"learning_rate": 3.564535891291115e-08,
"loss": 1.7529,
"step": 13225
},
{
"epoch": 0.8914793975944206,
"grad_norm": 5.90539818137521,
"learning_rate": 3.542762646759234e-08,
"loss": 1.7931,
"step": 13230
},
{
"epoch": 0.8918163134665275,
"grad_norm": 6.244337889693646,
"learning_rate": 3.521053662159629e-08,
"loss": 1.7838,
"step": 13235
},
{
"epoch": 0.8921532293386342,
"grad_norm": 5.636325790259478,
"learning_rate": 3.499408967520295e-08,
"loss": 1.7723,
"step": 13240
},
{
"epoch": 0.8924901452107409,
"grad_norm": 5.909428853004949,
"learning_rate": 3.477828592780319e-08,
"loss": 1.7905,
"step": 13245
},
{
"epoch": 0.8928270610828476,
"grad_norm": 5.743071012947205,
"learning_rate": 3.456312567789793e-08,
"loss": 1.8337,
"step": 13250
},
{
"epoch": 0.8931639769549543,
"grad_norm": 5.470554904588339,
"learning_rate": 3.4348609223098125e-08,
"loss": 1.7911,
"step": 13255
},
{
"epoch": 0.8935008928270611,
"grad_norm": 5.665375115697222,
"learning_rate": 3.41347368601243e-08,
"loss": 1.7307,
"step": 13260
},
{
"epoch": 0.8938378086991678,
"grad_norm": 6.481341411904355,
"learning_rate": 3.39215088848061e-08,
"loss": 1.8396,
"step": 13265
},
{
"epoch": 0.8941747245712746,
"grad_norm": 6.225439326082594,
"learning_rate": 3.370892559208155e-08,
"loss": 1.7568,
"step": 13270
},
{
"epoch": 0.8945116404433813,
"grad_norm": 5.906481797833057,
"learning_rate": 3.34969872759972e-08,
"loss": 1.8076,
"step": 13275
},
{
"epoch": 0.894848556315488,
"grad_norm": 5.856494710175204,
"learning_rate": 3.328569422970762e-08,
"loss": 1.8648,
"step": 13280
},
{
"epoch": 0.8951854721875948,
"grad_norm": 5.753105226825638,
"learning_rate": 3.307504674547429e-08,
"loss": 1.7815,
"step": 13285
},
{
"epoch": 0.8955223880597015,
"grad_norm": 6.095595815839566,
"learning_rate": 3.286504511466631e-08,
"loss": 1.8328,
"step": 13290
},
{
"epoch": 0.8958593039318082,
"grad_norm": 6.288610606607138,
"learning_rate": 3.265568962775927e-08,
"loss": 1.7913,
"step": 13295
},
{
"epoch": 0.896196219803915,
"grad_norm": 6.5022942580461125,
"learning_rate": 3.2446980574334706e-08,
"loss": 1.7869,
"step": 13300
},
{
"epoch": 0.8965331356760217,
"grad_norm": 6.6487512915161275,
"learning_rate": 3.2238918243080505e-08,
"loss": 1.7442,
"step": 13305
},
{
"epoch": 0.8968700515481284,
"grad_norm": 6.732808022858596,
"learning_rate": 3.203150292178952e-08,
"loss": 1.8084,
"step": 13310
},
{
"epoch": 0.8972069674202352,
"grad_norm": 5.897197143896169,
"learning_rate": 3.182473489736004e-08,
"loss": 1.7462,
"step": 13315
},
{
"epoch": 0.8975438832923419,
"grad_norm": 6.842935750718153,
"learning_rate": 3.161861445579478e-08,
"loss": 1.8202,
"step": 13320
},
{
"epoch": 0.8978807991644486,
"grad_norm": 6.089128345755971,
"learning_rate": 3.1413141882200736e-08,
"loss": 1.7682,
"step": 13325
},
{
"epoch": 0.8982177150365553,
"grad_norm": 5.780557396715805,
"learning_rate": 3.120831746078895e-08,
"loss": 1.7929,
"step": 13330
},
{
"epoch": 0.898554630908662,
"grad_norm": 5.928980468404957,
"learning_rate": 3.100414147487368e-08,
"loss": 1.8481,
"step": 13335
},
{
"epoch": 0.8988915467807689,
"grad_norm": 6.0861515824792844,
"learning_rate": 3.0800614206872413e-08,
"loss": 1.7645,
"step": 13340
},
{
"epoch": 0.8992284626528756,
"grad_norm": 6.467715458691118,
"learning_rate": 3.059773593830539e-08,
"loss": 1.7795,
"step": 13345
},
{
"epoch": 0.8995653785249823,
"grad_norm": 5.804373053320775,
"learning_rate": 3.039550694979492e-08,
"loss": 1.7391,
"step": 13350
},
{
"epoch": 0.899902294397089,
"grad_norm": 5.747917639408956,
"learning_rate": 3.019392752106548e-08,
"loss": 1.7619,
"step": 13355
},
{
"epoch": 0.9002392102691957,
"grad_norm": 5.824464537511226,
"learning_rate": 2.9992997930942954e-08,
"loss": 1.7164,
"step": 13360
},
{
"epoch": 0.9005761261413026,
"grad_norm": 6.003811614386364,
"learning_rate": 2.979271845735426e-08,
"loss": 1.6998,
"step": 13365
},
{
"epoch": 0.9009130420134093,
"grad_norm": 5.8609792524761914,
"learning_rate": 2.9593089377327242e-08,
"loss": 1.7337,
"step": 13370
},
{
"epoch": 0.901249957885516,
"grad_norm": 5.948332344827435,
"learning_rate": 2.9394110966990184e-08,
"loss": 1.7973,
"step": 13375
},
{
"epoch": 0.9015868737576227,
"grad_norm": 5.945282741761599,
"learning_rate": 2.9195783501570982e-08,
"loss": 1.8228,
"step": 13380
},
{
"epoch": 0.9019237896297294,
"grad_norm": 6.262748243809702,
"learning_rate": 2.8998107255397643e-08,
"loss": 1.7952,
"step": 13385
},
{
"epoch": 0.9022607055018362,
"grad_norm": 5.640388167355125,
"learning_rate": 2.880108250189689e-08,
"loss": 1.7446,
"step": 13390
},
{
"epoch": 0.902597621373943,
"grad_norm": 5.800344956552423,
"learning_rate": 2.860470951359478e-08,
"loss": 1.7547,
"step": 13395
},
{
"epoch": 0.9029345372460497,
"grad_norm": 6.291107803853359,
"learning_rate": 2.8408988562115488e-08,
"loss": 1.7873,
"step": 13400
},
{
"epoch": 0.9032714531181564,
"grad_norm": 6.022821073549721,
"learning_rate": 2.8213919918181393e-08,
"loss": 1.8376,
"step": 13405
},
{
"epoch": 0.9036083689902631,
"grad_norm": 5.900130211549773,
"learning_rate": 2.8019503851612837e-08,
"loss": 1.7953,
"step": 13410
},
{
"epoch": 0.9039452848623699,
"grad_norm": 5.901590236995989,
"learning_rate": 2.782574063132703e-08,
"loss": 1.8029,
"step": 13415
},
{
"epoch": 0.9042822007344766,
"grad_norm": 5.913123808511502,
"learning_rate": 2.7632630525338597e-08,
"loss": 1.7911,
"step": 13420
},
{
"epoch": 0.9046191166065833,
"grad_norm": 6.486077531057897,
"learning_rate": 2.7440173800758583e-08,
"loss": 1.8126,
"step": 13425
},
{
"epoch": 0.90495603247869,
"grad_norm": 5.924773230014183,
"learning_rate": 2.7248370723794268e-08,
"loss": 1.7774,
"step": 13430
},
{
"epoch": 0.9052929483507968,
"grad_norm": 6.107738959168259,
"learning_rate": 2.7057221559748822e-08,
"loss": 1.7895,
"step": 13435
},
{
"epoch": 0.9056298642229036,
"grad_norm": 5.945232419314283,
"learning_rate": 2.6866726573021025e-08,
"loss": 1.7537,
"step": 13440
},
{
"epoch": 0.9059667800950103,
"grad_norm": 6.242713673098733,
"learning_rate": 2.667688602710455e-08,
"loss": 1.709,
"step": 13445
},
{
"epoch": 0.906303695967117,
"grad_norm": 5.810035940437741,
"learning_rate": 2.648770018458807e-08,
"loss": 1.7855,
"step": 13450
},
{
"epoch": 0.9066406118392237,
"grad_norm": 5.876560332140629,
"learning_rate": 2.6299169307154535e-08,
"loss": 1.7288,
"step": 13455
},
{
"epoch": 0.9069775277113304,
"grad_norm": 6.026710876222,
"learning_rate": 2.611129365558118e-08,
"loss": 1.7491,
"step": 13460
},
{
"epoch": 0.9073144435834373,
"grad_norm": 5.90332142357701,
"learning_rate": 2.592407348973852e-08,
"loss": 1.7657,
"step": 13465
},
{
"epoch": 0.907651359455544,
"grad_norm": 5.567054827768613,
"learning_rate": 2.573750906859079e-08,
"loss": 1.7703,
"step": 13470
},
{
"epoch": 0.9079882753276507,
"grad_norm": 6.275015331234111,
"learning_rate": 2.5551600650194906e-08,
"loss": 1.7642,
"step": 13475
},
{
"epoch": 0.9083251911997574,
"grad_norm": 5.80583328955002,
"learning_rate": 2.536634849170055e-08,
"loss": 1.7379,
"step": 13480
},
{
"epoch": 0.9086621070718641,
"grad_norm": 6.1557870962000045,
"learning_rate": 2.5181752849349593e-08,
"loss": 1.8367,
"step": 13485
},
{
"epoch": 0.9089990229439708,
"grad_norm": 6.037242378415563,
"learning_rate": 2.4997813978476e-08,
"loss": 1.7544,
"step": 13490
},
{
"epoch": 0.9093359388160777,
"grad_norm": 5.672781989201412,
"learning_rate": 2.481453213350493e-08,
"loss": 1.7149,
"step": 13495
},
{
"epoch": 0.9096728546881844,
"grad_norm": 5.80097338376941,
"learning_rate": 2.463190756795308e-08,
"loss": 1.8256,
"step": 13500
},
{
"epoch": 0.9100097705602911,
"grad_norm": 5.341807531960438,
"learning_rate": 2.4449940534427836e-08,
"loss": 1.7936,
"step": 13505
},
{
"epoch": 0.9103466864323978,
"grad_norm": 5.820785045693387,
"learning_rate": 2.4268631284627027e-08,
"loss": 1.7686,
"step": 13510
},
{
"epoch": 0.9106836023045045,
"grad_norm": 5.816002660909503,
"learning_rate": 2.408798006933882e-08,
"loss": 1.7303,
"step": 13515
},
{
"epoch": 0.9110205181766113,
"grad_norm": 6.275964406450363,
"learning_rate": 2.3907987138440945e-08,
"loss": 1.8489,
"step": 13520
},
{
"epoch": 0.911357434048718,
"grad_norm": 6.08682021587138,
"learning_rate": 2.3728652740900856e-08,
"loss": 1.8246,
"step": 13525
},
{
"epoch": 0.9116943499208248,
"grad_norm": 6.302297868580049,
"learning_rate": 2.3549977124774857e-08,
"loss": 1.8028,
"step": 13530
},
{
"epoch": 0.9120312657929315,
"grad_norm": 6.206494648513327,
"learning_rate": 2.337196053720819e-08,
"loss": 1.8341,
"step": 13535
},
{
"epoch": 0.9123681816650382,
"grad_norm": 5.733867734946892,
"learning_rate": 2.319460322443456e-08,
"loss": 1.8267,
"step": 13540
},
{
"epoch": 0.912705097537145,
"grad_norm": 6.233397928155692,
"learning_rate": 2.301790543177551e-08,
"loss": 1.8284,
"step": 13545
},
{
"epoch": 0.9130420134092517,
"grad_norm": 5.902961201033665,
"learning_rate": 2.284186740364069e-08,
"loss": 1.8105,
"step": 13550
},
{
"epoch": 0.9133789292813584,
"grad_norm": 5.83122816008699,
"learning_rate": 2.266648938352672e-08,
"loss": 1.765,
"step": 13555
},
{
"epoch": 0.9137158451534652,
"grad_norm": 5.692387788783081,
"learning_rate": 2.249177161401783e-08,
"loss": 1.7427,
"step": 13560
},
{
"epoch": 0.9140527610255719,
"grad_norm": 6.083052684924437,
"learning_rate": 2.2317714336784422e-08,
"loss": 1.78,
"step": 13565
},
{
"epoch": 0.9143896768976787,
"grad_norm": 6.01194897181711,
"learning_rate": 2.21443177925838e-08,
"loss": 1.7904,
"step": 13570
},
{
"epoch": 0.9147265927697854,
"grad_norm": 6.2538897195592735,
"learning_rate": 2.1971582221258944e-08,
"loss": 1.6636,
"step": 13575
},
{
"epoch": 0.9150635086418921,
"grad_norm": 5.338562271328569,
"learning_rate": 2.1799507861738788e-08,
"loss": 1.7815,
"step": 13580
},
{
"epoch": 0.9154004245139988,
"grad_norm": 6.02697364861854,
"learning_rate": 2.1628094952037713e-08,
"loss": 1.7755,
"step": 13585
},
{
"epoch": 0.9157373403861055,
"grad_norm": 5.6174748611517025,
"learning_rate": 2.1457343729255062e-08,
"loss": 1.7606,
"step": 13590
},
{
"epoch": 0.9160742562582124,
"grad_norm": 5.953965768567391,
"learning_rate": 2.128725442957491e-08,
"loss": 1.7886,
"step": 13595
},
{
"epoch": 0.9164111721303191,
"grad_norm": 5.733695202042228,
"learning_rate": 2.111782728826583e-08,
"loss": 1.7728,
"step": 13600
},
{
"epoch": 0.9167480880024258,
"grad_norm": 6.073387061708105,
"learning_rate": 2.0949062539680486e-08,
"loss": 1.8331,
"step": 13605
},
{
"epoch": 0.9170850038745325,
"grad_norm": 5.96599535385796,
"learning_rate": 2.07809604172553e-08,
"loss": 1.7551,
"step": 13610
},
{
"epoch": 0.9174219197466392,
"grad_norm": 6.058916705027718,
"learning_rate": 2.0613521153510115e-08,
"loss": 1.7635,
"step": 13615
},
{
"epoch": 0.917758835618746,
"grad_norm": 6.206503726449759,
"learning_rate": 2.0446744980048002e-08,
"loss": 1.777,
"step": 13620
},
{
"epoch": 0.9180957514908528,
"grad_norm": 6.121730585820273,
"learning_rate": 2.0280632127554708e-08,
"loss": 1.8304,
"step": 13625
},
{
"epoch": 0.9184326673629595,
"grad_norm": 5.909262400028377,
"learning_rate": 2.011518282579855e-08,
"loss": 1.7962,
"step": 13630
},
{
"epoch": 0.9187695832350662,
"grad_norm": 5.951133427267503,
"learning_rate": 1.9950397303630075e-08,
"loss": 1.8755,
"step": 13635
},
{
"epoch": 0.9191064991071729,
"grad_norm": 5.749682108699682,
"learning_rate": 1.9786275788981565e-08,
"loss": 1.7573,
"step": 13640
},
{
"epoch": 0.9194434149792797,
"grad_norm": 5.932217476759413,
"learning_rate": 1.9622818508866823e-08,
"loss": 1.7832,
"step": 13645
},
{
"epoch": 0.9197803308513864,
"grad_norm": 6.105403050825645,
"learning_rate": 1.9460025689381043e-08,
"loss": 1.8365,
"step": 13650
},
{
"epoch": 0.9201172467234932,
"grad_norm": 5.352964036275184,
"learning_rate": 1.9297897555700216e-08,
"loss": 1.7852,
"step": 13655
},
{
"epoch": 0.9204541625955999,
"grad_norm": 5.644291930764029,
"learning_rate": 1.9136434332080898e-08,
"loss": 1.7718,
"step": 13660
},
{
"epoch": 0.9207910784677066,
"grad_norm": 5.807485679937865,
"learning_rate": 1.8975636241860048e-08,
"loss": 1.7835,
"step": 13665
},
{
"epoch": 0.9211279943398133,
"grad_norm": 5.9129764742967765,
"learning_rate": 1.8815503507454644e-08,
"loss": 1.7171,
"step": 13670
},
{
"epoch": 0.9214649102119201,
"grad_norm": 5.975444115877748,
"learning_rate": 1.8656036350361117e-08,
"loss": 1.751,
"step": 13675
},
{
"epoch": 0.9218018260840268,
"grad_norm": 6.144902579922595,
"learning_rate": 1.8497234991155463e-08,
"loss": 1.7783,
"step": 13680
},
{
"epoch": 0.9221387419561335,
"grad_norm": 5.666472776654721,
"learning_rate": 1.8339099649492762e-08,
"loss": 1.7597,
"step": 13685
},
{
"epoch": 0.9224756578282403,
"grad_norm": 5.9809396720752,
"learning_rate": 1.8181630544106653e-08,
"loss": 1.7526,
"step": 13690
},
{
"epoch": 0.922812573700347,
"grad_norm": 6.83292618583234,
"learning_rate": 1.8024827892809346e-08,
"loss": 1.8244,
"step": 13695
},
{
"epoch": 0.9231494895724538,
"grad_norm": 5.86478171310939,
"learning_rate": 1.7868691912491352e-08,
"loss": 1.7762,
"step": 13700
},
{
"epoch": 0.9234864054445605,
"grad_norm": 6.790313884006856,
"learning_rate": 1.77132228191208e-08,
"loss": 1.8513,
"step": 13705
},
{
"epoch": 0.9238233213166672,
"grad_norm": 5.9937155458488345,
"learning_rate": 1.7558420827743505e-08,
"loss": 1.7219,
"step": 13710
},
{
"epoch": 0.9241602371887739,
"grad_norm": 5.797828927156594,
"learning_rate": 1.7404286152482573e-08,
"loss": 1.8354,
"step": 13715
},
{
"epoch": 0.9244971530608806,
"grad_norm": 6.102906661047269,
"learning_rate": 1.725081900653791e-08,
"loss": 1.7955,
"step": 13720
},
{
"epoch": 0.9248340689329875,
"grad_norm": 6.010033747492002,
"learning_rate": 1.7098019602186376e-08,
"loss": 1.7936,
"step": 13725
},
{
"epoch": 0.9251709848050942,
"grad_norm": 5.893664038194936,
"learning_rate": 1.6945888150780797e-08,
"loss": 1.7985,
"step": 13730
},
{
"epoch": 0.9255079006772009,
"grad_norm": 5.822012915972412,
"learning_rate": 1.6794424862750568e-08,
"loss": 1.7945,
"step": 13735
},
{
"epoch": 0.9258448165493076,
"grad_norm": 6.3394192953564,
"learning_rate": 1.664362994760038e-08,
"loss": 1.7824,
"step": 13740
},
{
"epoch": 0.9261817324214143,
"grad_norm": 6.035176725514034,
"learning_rate": 1.649350361391083e-08,
"loss": 1.7793,
"step": 13745
},
{
"epoch": 0.9265186482935212,
"grad_norm": 6.654659837949139,
"learning_rate": 1.6344046069337646e-08,
"loss": 1.7988,
"step": 13750
},
{
"epoch": 0.9268555641656279,
"grad_norm": 5.585511061286529,
"learning_rate": 1.6195257520611182e-08,
"loss": 1.8061,
"step": 13755
},
{
"epoch": 0.9271924800377346,
"grad_norm": 5.778820047963369,
"learning_rate": 1.604713817353681e-08,
"loss": 1.7819,
"step": 13760
},
{
"epoch": 0.9275293959098413,
"grad_norm": 6.078945827605927,
"learning_rate": 1.5899688232994147e-08,
"loss": 1.7825,
"step": 13765
},
{
"epoch": 0.927866311781948,
"grad_norm": 5.951451870166644,
"learning_rate": 1.5752907902936707e-08,
"loss": 1.839,
"step": 13770
},
{
"epoch": 0.9282032276540548,
"grad_norm": 5.916252930427277,
"learning_rate": 1.560679738639198e-08,
"loss": 1.7983,
"step": 13775
},
{
"epoch": 0.9285401435261615,
"grad_norm": 5.771537516382258,
"learning_rate": 1.5461356885461075e-08,
"loss": 1.8014,
"step": 13780
},
{
"epoch": 0.9288770593982683,
"grad_norm": 6.107394671320099,
"learning_rate": 1.5316586601317905e-08,
"loss": 1.7635,
"step": 13785
},
{
"epoch": 0.929213975270375,
"grad_norm": 5.785638235806199,
"learning_rate": 1.5172486734209788e-08,
"loss": 1.7735,
"step": 13790
},
{
"epoch": 0.9295508911424817,
"grad_norm": 5.730850538387305,
"learning_rate": 1.502905748345651e-08,
"loss": 1.8355,
"step": 13795
},
{
"epoch": 0.9298878070145885,
"grad_norm": 5.875893406145162,
"learning_rate": 1.4886299047450257e-08,
"loss": 1.8188,
"step": 13800
},
{
"epoch": 0.9302247228866952,
"grad_norm": 5.813097498313215,
"learning_rate": 1.4744211623655356e-08,
"loss": 1.7418,
"step": 13805
},
{
"epoch": 0.9305616387588019,
"grad_norm": 5.935994794147698,
"learning_rate": 1.4602795408607982e-08,
"loss": 1.852,
"step": 13810
},
{
"epoch": 0.9308985546309086,
"grad_norm": 5.807523266406351,
"learning_rate": 1.4462050597915942e-08,
"loss": 1.7984,
"step": 13815
},
{
"epoch": 0.9312354705030154,
"grad_norm": 5.730729459958924,
"learning_rate": 1.4321977386258289e-08,
"loss": 1.7641,
"step": 13820
},
{
"epoch": 0.9315723863751222,
"grad_norm": 5.986047538097552,
"learning_rate": 1.4182575967385092e-08,
"loss": 1.791,
"step": 13825
},
{
"epoch": 0.9319093022472289,
"grad_norm": 6.129565707883838,
"learning_rate": 1.4043846534117331e-08,
"loss": 1.8389,
"step": 13830
},
{
"epoch": 0.9322462181193356,
"grad_norm": 5.760689255089029,
"learning_rate": 1.3905789278346347e-08,
"loss": 1.8328,
"step": 13835
},
{
"epoch": 0.9325831339914423,
"grad_norm": 6.0668336057579255,
"learning_rate": 1.3768404391033717e-08,
"loss": 1.735,
"step": 13840
},
{
"epoch": 0.932920049863549,
"grad_norm": 5.885732912533342,
"learning_rate": 1.3631692062211209e-08,
"loss": 1.7884,
"step": 13845
},
{
"epoch": 0.9332569657356558,
"grad_norm": 5.388443134954862,
"learning_rate": 1.3495652480979947e-08,
"loss": 1.7078,
"step": 13850
},
{
"epoch": 0.9335938816077626,
"grad_norm": 6.045883802907099,
"learning_rate": 1.3360285835510854e-08,
"loss": 1.7818,
"step": 13855
},
{
"epoch": 0.9339307974798693,
"grad_norm": 5.517887293622092,
"learning_rate": 1.322559231304382e-08,
"loss": 1.7703,
"step": 13860
},
{
"epoch": 0.934267713351976,
"grad_norm": 5.541469046961143,
"learning_rate": 1.3091572099887816e-08,
"loss": 1.7328,
"step": 13865
},
{
"epoch": 0.9346046292240827,
"grad_norm": 6.56871395739785,
"learning_rate": 1.2958225381420329e-08,
"loss": 1.8307,
"step": 13870
},
{
"epoch": 0.9349415450961894,
"grad_norm": 5.667419893283266,
"learning_rate": 1.282555234208732e-08,
"loss": 1.7326,
"step": 13875
},
{
"epoch": 0.9352784609682963,
"grad_norm": 5.838203900467372,
"learning_rate": 1.2693553165403104e-08,
"loss": 1.8029,
"step": 13880
},
{
"epoch": 0.935615376840403,
"grad_norm": 5.711248775025883,
"learning_rate": 1.2562228033949628e-08,
"loss": 1.7949,
"step": 13885
},
{
"epoch": 0.9359522927125097,
"grad_norm": 5.676609731870054,
"learning_rate": 1.243157712937659e-08,
"loss": 1.7805,
"step": 13890
},
{
"epoch": 0.9362892085846164,
"grad_norm": 5.8302983836456725,
"learning_rate": 1.230160063240121e-08,
"loss": 1.7759,
"step": 13895
},
{
"epoch": 0.9366261244567231,
"grad_norm": 6.030113265479034,
"learning_rate": 1.2172298722807617e-08,
"loss": 1.7496,
"step": 13900
},
{
"epoch": 0.9369630403288299,
"grad_norm": 6.57072889368483,
"learning_rate": 1.204367157944708e-08,
"loss": 1.7831,
"step": 13905
},
{
"epoch": 0.9372999562009366,
"grad_norm": 6.038296639335411,
"learning_rate": 1.19157193802375e-08,
"loss": 1.747,
"step": 13910
},
{
"epoch": 0.9376368720730434,
"grad_norm": 5.814924414623662,
"learning_rate": 1.1788442302163026e-08,
"loss": 1.7092,
"step": 13915
},
{
"epoch": 0.9379737879451501,
"grad_norm": 5.262932683835106,
"learning_rate": 1.1661840521274168e-08,
"loss": 1.7298,
"step": 13920
},
{
"epoch": 0.9383107038172568,
"grad_norm": 5.546576533364832,
"learning_rate": 1.1535914212687237e-08,
"loss": 1.7256,
"step": 13925
},
{
"epoch": 0.9386476196893636,
"grad_norm": 6.17537281360325,
"learning_rate": 1.1410663550584287e-08,
"loss": 1.8097,
"step": 13930
},
{
"epoch": 0.9389845355614703,
"grad_norm": 5.773326142297128,
"learning_rate": 1.1286088708212793e-08,
"loss": 1.7578,
"step": 13935
},
{
"epoch": 0.939321451433577,
"grad_norm": 5.893824855852281,
"learning_rate": 1.1162189857885362e-08,
"loss": 1.6737,
"step": 13940
},
{
"epoch": 0.9396583673056838,
"grad_norm": 6.293068011289075,
"learning_rate": 1.1038967170979741e-08,
"loss": 1.8323,
"step": 13945
},
{
"epoch": 0.9399952831777905,
"grad_norm": 6.3178382632779675,
"learning_rate": 1.0916420817938254e-08,
"loss": 1.7639,
"step": 13950
},
{
"epoch": 0.9403321990498973,
"grad_norm": 6.366970292404799,
"learning_rate": 1.0794550968267701e-08,
"loss": 1.7629,
"step": 13955
},
{
"epoch": 0.940669114922004,
"grad_norm": 5.228237120022797,
"learning_rate": 1.0673357790539294e-08,
"loss": 1.7724,
"step": 13960
},
{
"epoch": 0.9410060307941107,
"grad_norm": 5.925892786800767,
"learning_rate": 1.0552841452388105e-08,
"loss": 1.7607,
"step": 13965
},
{
"epoch": 0.9413429466662174,
"grad_norm": 6.2915894761092614,
"learning_rate": 1.0433002120513123e-08,
"loss": 1.8291,
"step": 13970
},
{
"epoch": 0.9416798625383241,
"grad_norm": 5.612014170575601,
"learning_rate": 1.0313839960676751e-08,
"loss": 1.7242,
"step": 13975
},
{
"epoch": 0.942016778410431,
"grad_norm": 5.38062147344516,
"learning_rate": 1.019535513770492e-08,
"loss": 1.7061,
"step": 13980
},
{
"epoch": 0.9423536942825377,
"grad_norm": 6.023856824550423,
"learning_rate": 1.0077547815486476e-08,
"loss": 1.8322,
"step": 13985
},
{
"epoch": 0.9426906101546444,
"grad_norm": 5.912398539147373,
"learning_rate": 9.960418156973238e-09,
"loss": 1.8146,
"step": 13990
},
{
"epoch": 0.9430275260267511,
"grad_norm": 6.1113631057581586,
"learning_rate": 9.843966324179609e-09,
"loss": 1.7683,
"step": 13995
},
{
"epoch": 0.9433644418988578,
"grad_norm": 5.615958243889113,
"learning_rate": 9.728192478182573e-09,
"loss": 1.7788,
"step": 14000
},
{
"epoch": 0.9437013577709646,
"grad_norm": 5.854145953223302,
"learning_rate": 9.613096779121089e-09,
"loss": 1.7846,
"step": 14005
},
{
"epoch": 0.9440382736430714,
"grad_norm": 6.076063145121326,
"learning_rate": 9.498679386196417e-09,
"loss": 1.6587,
"step": 14010
},
{
"epoch": 0.9443751895151781,
"grad_norm": 6.399734349425361,
"learning_rate": 9.384940457671186e-09,
"loss": 1.7681,
"step": 14015
},
{
"epoch": 0.9447121053872848,
"grad_norm": 6.338424938973493,
"learning_rate": 9.271880150869882e-09,
"loss": 1.829,
"step": 14020
},
{
"epoch": 0.9450490212593915,
"grad_norm": 5.985103027928022,
"learning_rate": 9.15949862217824e-09,
"loss": 1.7963,
"step": 14025
},
{
"epoch": 0.9453859371314982,
"grad_norm": 6.383395928238325,
"learning_rate": 9.04779602704292e-09,
"loss": 1.8399,
"step": 14030
},
{
"epoch": 0.945722853003605,
"grad_norm": 5.7326489327199655,
"learning_rate": 8.936772519971769e-09,
"loss": 1.7752,
"step": 14035
},
{
"epoch": 0.9460597688757117,
"grad_norm": 5.816190646797327,
"learning_rate": 8.826428254533169e-09,
"loss": 1.8245,
"step": 14040
},
{
"epoch": 0.9463966847478185,
"grad_norm": 5.440371053867433,
"learning_rate": 8.716763383355862e-09,
"loss": 1.7861,
"step": 14045
},
{
"epoch": 0.9467336006199252,
"grad_norm": 6.137934997478103,
"learning_rate": 8.607778058129122e-09,
"loss": 1.8045,
"step": 14050
},
{
"epoch": 0.9470705164920319,
"grad_norm": 5.8353013118261945,
"learning_rate": 8.499472429601972e-09,
"loss": 1.7325,
"step": 14055
},
{
"epoch": 0.9474074323641387,
"grad_norm": 5.964783791471046,
"learning_rate": 8.391846647583468e-09,
"loss": 1.8082,
"step": 14060
},
{
"epoch": 0.9477443482362454,
"grad_norm": 6.710260912678535,
"learning_rate": 8.284900860942246e-09,
"loss": 1.8287,
"step": 14065
},
{
"epoch": 0.9480812641083521,
"grad_norm": 6.257280332373422,
"learning_rate": 8.178635217606367e-09,
"loss": 1.7582,
"step": 14070
},
{
"epoch": 0.9484181799804589,
"grad_norm": 5.687185998282569,
"learning_rate": 8.073049864563142e-09,
"loss": 1.7209,
"step": 14075
},
{
"epoch": 0.9487550958525656,
"grad_norm": 5.285333365404428,
"learning_rate": 7.968144947858801e-09,
"loss": 1.7414,
"step": 14080
},
{
"epoch": 0.9490920117246724,
"grad_norm": 6.183760283797594,
"learning_rate": 7.863920612598496e-09,
"loss": 1.789,
"step": 14085
},
{
"epoch": 0.9494289275967791,
"grad_norm": 5.718203089285701,
"learning_rate": 7.760377002945961e-09,
"loss": 1.7936,
"step": 14090
},
{
"epoch": 0.9497658434688858,
"grad_norm": 5.577317551701951,
"learning_rate": 7.657514262123354e-09,
"loss": 1.7411,
"step": 14095
},
{
"epoch": 0.9501027593409925,
"grad_norm": 7.000453905423672,
"learning_rate": 7.55533253241103e-09,
"loss": 1.7849,
"step": 14100
},
{
"epoch": 0.9504396752130992,
"grad_norm": 5.744188185276807,
"learning_rate": 7.453831955147428e-09,
"loss": 1.7766,
"step": 14105
},
{
"epoch": 0.9507765910852061,
"grad_norm": 5.311037237991293,
"learning_rate": 7.353012670728631e-09,
"loss": 1.7438,
"step": 14110
},
{
"epoch": 0.9511135069573128,
"grad_norm": 5.705160326224774,
"learning_rate": 7.252874818608645e-09,
"loss": 1.7423,
"step": 14115
},
{
"epoch": 0.9514504228294195,
"grad_norm": 5.941266511069281,
"learning_rate": 7.153418537298617e-09,
"loss": 1.8092,
"step": 14120
},
{
"epoch": 0.9517873387015262,
"grad_norm": 6.028018288023328,
"learning_rate": 7.0546439643671685e-09,
"loss": 1.7414,
"step": 14125
},
{
"epoch": 0.9521242545736329,
"grad_norm": 6.0984894459924615,
"learning_rate": 6.9565512364398445e-09,
"loss": 1.799,
"step": 14130
},
{
"epoch": 0.9524611704457397,
"grad_norm": 5.810943364613546,
"learning_rate": 6.859140489199167e-09,
"loss": 1.7548,
"step": 14135
},
{
"epoch": 0.9527980863178465,
"grad_norm": 6.05019358569247,
"learning_rate": 6.762411857384187e-09,
"loss": 1.8126,
"step": 14140
},
{
"epoch": 0.9531350021899532,
"grad_norm": 6.012558005711744,
"learning_rate": 6.666365474790492e-09,
"loss": 1.7978,
"step": 14145
},
{
"epoch": 0.9534719180620599,
"grad_norm": 6.316428272394365,
"learning_rate": 6.571001474270144e-09,
"loss": 1.7144,
"step": 14150
},
{
"epoch": 0.9538088339341666,
"grad_norm": 5.825366175142781,
"learning_rate": 6.4763199877311825e-09,
"loss": 1.7862,
"step": 14155
},
{
"epoch": 0.9541457498062734,
"grad_norm": 5.678194546571653,
"learning_rate": 6.382321146137571e-09,
"loss": 1.848,
"step": 14160
},
{
"epoch": 0.9544826656783801,
"grad_norm": 6.0270794934572285,
"learning_rate": 6.28900507950908e-09,
"loss": 1.8092,
"step": 14165
},
{
"epoch": 0.9548195815504869,
"grad_norm": 6.029126967150501,
"learning_rate": 6.196371916921073e-09,
"loss": 1.7877,
"step": 14170
},
{
"epoch": 0.9551564974225936,
"grad_norm": 6.231081190815106,
"learning_rate": 6.1044217865043325e-09,
"loss": 1.7842,
"step": 14175
},
{
"epoch": 0.9554934132947003,
"grad_norm": 5.951369955420382,
"learning_rate": 6.013154815444732e-09,
"loss": 1.8343,
"step": 14180
},
{
"epoch": 0.9558303291668071,
"grad_norm": 5.698513479571637,
"learning_rate": 5.922571129983456e-09,
"loss": 1.8377,
"step": 14185
},
{
"epoch": 0.9561672450389138,
"grad_norm": 6.336560120824533,
"learning_rate": 5.832670855416277e-09,
"loss": 1.8043,
"step": 14190
},
{
"epoch": 0.9565041609110205,
"grad_norm": 5.567693621557861,
"learning_rate": 5.7434541160938375e-09,
"loss": 1.7857,
"step": 14195
},
{
"epoch": 0.9568410767831272,
"grad_norm": 5.602719462601326,
"learning_rate": 5.6549210354212565e-09,
"loss": 1.8139,
"step": 14200
},
{
"epoch": 0.957177992655234,
"grad_norm": 6.252668788603584,
"learning_rate": 5.567071735858131e-09,
"loss": 1.7325,
"step": 14205
},
{
"epoch": 0.9575149085273407,
"grad_norm": 6.186365782591999,
"learning_rate": 5.4799063389179834e-09,
"loss": 1.7848,
"step": 14210
},
{
"epoch": 0.9578518243994475,
"grad_norm": 6.0170910772312345,
"learning_rate": 5.393424965168702e-09,
"loss": 1.8098,
"step": 14215
},
{
"epoch": 0.9581887402715542,
"grad_norm": 6.416217500285821,
"learning_rate": 5.307627734231657e-09,
"loss": 1.8432,
"step": 14220
},
{
"epoch": 0.9585256561436609,
"grad_norm": 6.079095377891874,
"learning_rate": 5.222514764782193e-09,
"loss": 1.8535,
"step": 14225
},
{
"epoch": 0.9588625720157676,
"grad_norm": 5.691306381084669,
"learning_rate": 5.138086174549083e-09,
"loss": 1.8069,
"step": 14230
},
{
"epoch": 0.9591994878878743,
"grad_norm": 6.315757537930107,
"learning_rate": 5.054342080314522e-09,
"loss": 1.7752,
"step": 14235
},
{
"epoch": 0.9595364037599812,
"grad_norm": 5.6811246579307255,
"learning_rate": 4.97128259791374e-09,
"loss": 1.7077,
"step": 14240
},
{
"epoch": 0.9598733196320879,
"grad_norm": 5.875555272618595,
"learning_rate": 4.888907842235113e-09,
"loss": 1.7176,
"step": 14245
},
{
"epoch": 0.9602102355041946,
"grad_norm": 5.940205015447577,
"learning_rate": 4.807217927220053e-09,
"loss": 1.7786,
"step": 14250
},
{
"epoch": 0.9605471513763013,
"grad_norm": 6.595986946123306,
"learning_rate": 4.726212965862342e-09,
"loss": 1.7806,
"step": 14255
},
{
"epoch": 0.960884067248408,
"grad_norm": 6.100938914389518,
"learning_rate": 4.645893070208684e-09,
"loss": 1.7441,
"step": 14260
},
{
"epoch": 0.9612209831205148,
"grad_norm": 6.225212166717535,
"learning_rate": 4.566258351357988e-09,
"loss": 1.7242,
"step": 14265
},
{
"epoch": 0.9615578989926216,
"grad_norm": 6.038611752498317,
"learning_rate": 4.48730891946153e-09,
"loss": 1.7141,
"step": 14270
},
{
"epoch": 0.9618948148647283,
"grad_norm": 5.687997722577268,
"learning_rate": 4.409044883722568e-09,
"loss": 1.7426,
"step": 14275
},
{
"epoch": 0.962231730736835,
"grad_norm": 5.65260149981692,
"learning_rate": 4.331466352396396e-09,
"loss": 1.8174,
"step": 14280
},
{
"epoch": 0.9625686466089417,
"grad_norm": 5.542702072962062,
"learning_rate": 4.2545734327902315e-09,
"loss": 1.7413,
"step": 14285
},
{
"epoch": 0.9629055624810485,
"grad_norm": 6.429780119232116,
"learning_rate": 4.178366231262665e-09,
"loss": 1.7694,
"step": 14290
},
{
"epoch": 0.9632424783531552,
"grad_norm": 5.910097048238468,
"learning_rate": 4.102844853224041e-09,
"loss": 1.7907,
"step": 14295
},
{
"epoch": 0.963579394225262,
"grad_norm": 6.045190253764579,
"learning_rate": 4.028009403135968e-09,
"loss": 1.774,
"step": 14300
},
{
"epoch": 0.9639163100973687,
"grad_norm": 6.098928200355735,
"learning_rate": 3.95385998451131e-09,
"loss": 1.8191,
"step": 14305
},
{
"epoch": 0.9642532259694754,
"grad_norm": 5.455149987735236,
"learning_rate": 3.880396699913968e-09,
"loss": 1.7874,
"step": 14310
},
{
"epoch": 0.9645901418415822,
"grad_norm": 6.359343013532195,
"learning_rate": 3.807619650958827e-09,
"loss": 1.7665,
"step": 14315
},
{
"epoch": 0.9649270577136889,
"grad_norm": 6.159889529402087,
"learning_rate": 3.7355289383115276e-09,
"loss": 1.8139,
"step": 14320
},
{
"epoch": 0.9652639735857956,
"grad_norm": 5.9449316060723785,
"learning_rate": 3.664124661688417e-09,
"loss": 1.8311,
"step": 14325
},
{
"epoch": 0.9656008894579023,
"grad_norm": 6.470751557779755,
"learning_rate": 3.5934069198562677e-09,
"loss": 1.7104,
"step": 14330
},
{
"epoch": 0.9659378053300091,
"grad_norm": 5.800731201565026,
"learning_rate": 3.5233758106322787e-09,
"loss": 1.7851,
"step": 14335
},
{
"epoch": 0.9662747212021159,
"grad_norm": 6.006742828646833,
"learning_rate": 3.4540314308839635e-09,
"loss": 1.7698,
"step": 14340
},
{
"epoch": 0.9666116370742226,
"grad_norm": 5.905075320021472,
"learning_rate": 3.385373876528874e-09,
"loss": 1.8125,
"step": 14345
},
{
"epoch": 0.9669485529463293,
"grad_norm": 6.577938888492687,
"learning_rate": 3.3174032425345444e-09,
"loss": 1.7513,
"step": 14350
},
{
"epoch": 0.967285468818436,
"grad_norm": 5.903574805231385,
"learning_rate": 3.250119622918379e-09,
"loss": 1.7226,
"step": 14355
},
{
"epoch": 0.9676223846905427,
"grad_norm": 6.078005184292544,
"learning_rate": 3.1835231107474323e-09,
"loss": 1.7905,
"step": 14360
},
{
"epoch": 0.9679593005626496,
"grad_norm": 6.042836567494886,
"learning_rate": 3.1176137981385185e-09,
"loss": 1.7973,
"step": 14365
},
{
"epoch": 0.9682962164347563,
"grad_norm": 5.65602428557149,
"learning_rate": 3.0523917762576568e-09,
"loss": 1.7457,
"step": 14370
},
{
"epoch": 0.968633132306863,
"grad_norm": 5.773086987424218,
"learning_rate": 2.9878571353204595e-09,
"loss": 1.8666,
"step": 14375
},
{
"epoch": 0.9689700481789697,
"grad_norm": 6.0373431050475315,
"learning_rate": 2.924009964591578e-09,
"loss": 1.7397,
"step": 14380
},
{
"epoch": 0.9693069640510764,
"grad_norm": 6.094144908034533,
"learning_rate": 2.8608503523848136e-09,
"loss": 1.8231,
"step": 14385
},
{
"epoch": 0.9696438799231831,
"grad_norm": 6.33990389312829,
"learning_rate": 2.7983783860629496e-09,
"loss": 1.699,
"step": 14390
},
{
"epoch": 0.96998079579529,
"grad_norm": 5.7053703145794605,
"learning_rate": 2.7365941520375303e-09,
"loss": 1.7927,
"step": 14395
},
{
"epoch": 0.9703177116673967,
"grad_norm": 5.636532734952386,
"learning_rate": 2.6754977357689724e-09,
"loss": 1.7631,
"step": 14400
},
{
"epoch": 0.9706546275395034,
"grad_norm": 5.892415294536311,
"learning_rate": 2.6150892217660647e-09,
"loss": 1.7085,
"step": 14405
},
{
"epoch": 0.9709915434116101,
"grad_norm": 6.6153894309409615,
"learning_rate": 2.5553686935864126e-09,
"loss": 1.8487,
"step": 14410
},
{
"epoch": 0.9713284592837168,
"grad_norm": 6.00592621079739,
"learning_rate": 2.496336233835661e-09,
"loss": 1.7502,
"step": 14415
},
{
"epoch": 0.9716653751558236,
"grad_norm": 5.883380002615908,
"learning_rate": 2.437991924167937e-09,
"loss": 1.834,
"step": 14420
},
{
"epoch": 0.9720022910279303,
"grad_norm": 5.457301773132191,
"learning_rate": 2.380335845285464e-09,
"loss": 1.7772,
"step": 14425
},
{
"epoch": 0.9723392069000371,
"grad_norm": 6.009693677479863,
"learning_rate": 2.323368076938448e-09,
"loss": 1.7901,
"step": 14430
},
{
"epoch": 0.9726761227721438,
"grad_norm": 5.926199674258243,
"learning_rate": 2.2670886979250235e-09,
"loss": 1.7362,
"step": 14435
},
{
"epoch": 0.9730130386442505,
"grad_norm": 6.6228262552651955,
"learning_rate": 2.211497786091143e-09,
"loss": 1.7798,
"step": 14440
},
{
"epoch": 0.9733499545163573,
"grad_norm": 5.525416957164009,
"learning_rate": 2.1565954183306313e-09,
"loss": 1.7603,
"step": 14445
},
{
"epoch": 0.973686870388464,
"grad_norm": 6.128785233954746,
"learning_rate": 2.1023816705846853e-09,
"loss": 1.7899,
"step": 14450
},
{
"epoch": 0.9740237862605707,
"grad_norm": 5.597950039158548,
"learning_rate": 2.048856617842043e-09,
"loss": 1.6975,
"step": 14455
},
{
"epoch": 0.9743607021326774,
"grad_norm": 6.07135782851224,
"learning_rate": 1.9960203341389813e-09,
"loss": 1.7247,
"step": 14460
},
{
"epoch": 0.9746976180047842,
"grad_norm": 5.877722240882944,
"learning_rate": 1.943872892558929e-09,
"loss": 1.8057,
"step": 14465
},
{
"epoch": 0.975034533876891,
"grad_norm": 5.852579274221657,
"learning_rate": 1.8924143652325196e-09,
"loss": 1.7745,
"step": 14470
},
{
"epoch": 0.9753714497489977,
"grad_norm": 6.176901784449129,
"learning_rate": 1.8416448233374848e-09,
"loss": 1.7142,
"step": 14475
},
{
"epoch": 0.9757083656211044,
"grad_norm": 6.208387127082091,
"learning_rate": 1.79156433709865e-09,
"loss": 1.8073,
"step": 14480
},
{
"epoch": 0.9760452814932111,
"grad_norm": 5.5727389870896715,
"learning_rate": 1.742172975787548e-09,
"loss": 1.748,
"step": 14485
},
{
"epoch": 0.9763821973653178,
"grad_norm": 5.9251930429951445,
"learning_rate": 1.6934708077226411e-09,
"loss": 1.7577,
"step": 14490
},
{
"epoch": 0.9767191132374247,
"grad_norm": 5.954705180523381,
"learning_rate": 1.6454579002690982e-09,
"loss": 1.7851,
"step": 14495
},
{
"epoch": 0.9770560291095314,
"grad_norm": 6.19174715813602,
"learning_rate": 1.5981343198386288e-09,
"loss": 1.7829,
"step": 14500
},
{
"epoch": 0.9773929449816381,
"grad_norm": 5.893212756172858,
"learning_rate": 1.5515001318895382e-09,
"loss": 1.7271,
"step": 14505
},
{
"epoch": 0.9777298608537448,
"grad_norm": 5.568240553890549,
"learning_rate": 1.5055554009264505e-09,
"loss": 1.8086,
"step": 14510
},
{
"epoch": 0.9780667767258515,
"grad_norm": 5.900260642183186,
"learning_rate": 1.4603001905004187e-09,
"loss": 1.7413,
"step": 14515
},
{
"epoch": 0.9784036925979583,
"grad_norm": 5.940174718298427,
"learning_rate": 1.4157345632087592e-09,
"loss": 1.6663,
"step": 14520
},
{
"epoch": 0.978740608470065,
"grad_norm": 5.696923193549704,
"learning_rate": 1.37185858069494e-09,
"loss": 1.7419,
"step": 14525
},
{
"epoch": 0.9790775243421718,
"grad_norm": 5.732780541053396,
"learning_rate": 1.328672303648415e-09,
"loss": 1.7824,
"step": 14530
},
{
"epoch": 0.9794144402142785,
"grad_norm": 6.147023994098516,
"learning_rate": 1.2861757918046778e-09,
"loss": 1.7452,
"step": 14535
},
{
"epoch": 0.9797513560863852,
"grad_norm": 6.103750948785661,
"learning_rate": 1.2443691039452642e-09,
"loss": 1.7967,
"step": 14540
},
{
"epoch": 0.980088271958492,
"grad_norm": 6.290929925062352,
"learning_rate": 1.203252297897417e-09,
"loss": 1.7872,
"step": 14545
},
{
"epoch": 0.9804251878305987,
"grad_norm": 5.631296340413579,
"learning_rate": 1.1628254305340869e-09,
"loss": 1.7577,
"step": 14550
},
{
"epoch": 0.9807621037027054,
"grad_norm": 6.151781649826895,
"learning_rate": 1.123088557773988e-09,
"loss": 1.7795,
"step": 14555
},
{
"epoch": 0.9810990195748122,
"grad_norm": 5.878601681301532,
"learning_rate": 1.0840417345814312e-09,
"loss": 1.7934,
"step": 14560
},
{
"epoch": 0.9814359354469189,
"grad_norm": 5.661532017486243,
"learning_rate": 1.0456850149662134e-09,
"loss": 1.7786,
"step": 14565
},
{
"epoch": 0.9817728513190256,
"grad_norm": 5.498959828209034,
"learning_rate": 1.0080184519835056e-09,
"loss": 1.7411,
"step": 14570
},
{
"epoch": 0.9821097671911324,
"grad_norm": 5.9308419780727695,
"learning_rate": 9.71042097734076e-10,
"loss": 1.6922,
"step": 14575
},
{
"epoch": 0.9824466830632391,
"grad_norm": 6.224672314052866,
"learning_rate": 9.347560033637347e-10,
"loss": 1.7872,
"step": 14580
},
{
"epoch": 0.9827835989353458,
"grad_norm": 6.456012846690479,
"learning_rate": 8.991602190636105e-10,
"loss": 1.7946,
"step": 14585
},
{
"epoch": 0.9831205148074526,
"grad_norm": 5.885385615000342,
"learning_rate": 8.642547940700961e-10,
"loss": 1.7394,
"step": 14590
},
{
"epoch": 0.9834574306795593,
"grad_norm": 6.523936838219438,
"learning_rate": 8.300397766644595e-10,
"loss": 1.7704,
"step": 14595
},
{
"epoch": 0.9837943465516661,
"grad_norm": 5.787459818959744,
"learning_rate": 7.965152141732878e-10,
"loss": 1.7182,
"step": 14600
},
{
"epoch": 0.9841312624237728,
"grad_norm": 6.66665537166297,
"learning_rate": 7.636811529678211e-10,
"loss": 1.9044,
"step": 14605
},
{
"epoch": 0.9844681782958795,
"grad_norm": 6.253212410487395,
"learning_rate": 7.315376384643968e-10,
"loss": 1.7786,
"step": 14610
},
{
"epoch": 0.9848050941679862,
"grad_norm": 6.212902508637188,
"learning_rate": 7.000847151240608e-10,
"loss": 1.8143,
"step": 14615
},
{
"epoch": 0.9851420100400929,
"grad_norm": 5.743679508987371,
"learning_rate": 6.693224264527897e-10,
"loss": 1.7831,
"step": 14620
},
{
"epoch": 0.9854789259121998,
"grad_norm": 5.626780106553643,
"learning_rate": 6.392508150011023e-10,
"loss": 1.7868,
"step": 14625
},
{
"epoch": 0.9858158417843065,
"grad_norm": 5.914322606080147,
"learning_rate": 6.098699223641701e-10,
"loss": 1.7559,
"step": 14630
},
{
"epoch": 0.9861527576564132,
"grad_norm": 5.280356384405962,
"learning_rate": 5.811797891819847e-10,
"loss": 1.7467,
"step": 14635
},
{
"epoch": 0.9864896735285199,
"grad_norm": 6.098016592220372,
"learning_rate": 5.531804551387464e-10,
"loss": 1.7997,
"step": 14640
},
{
"epoch": 0.9868265894006266,
"grad_norm": 5.913509913806897,
"learning_rate": 5.258719589634198e-10,
"loss": 1.6823,
"step": 14645
},
{
"epoch": 0.9871635052727334,
"grad_norm": 6.600818579713024,
"learning_rate": 4.992543384291781e-10,
"loss": 1.8086,
"step": 14650
},
{
"epoch": 0.9875004211448402,
"grad_norm": 6.463420633584194,
"learning_rate": 4.733276303537925e-10,
"loss": 1.8205,
"step": 14655
},
{
"epoch": 0.9878373370169469,
"grad_norm": 5.856138268133559,
"learning_rate": 4.480918705991321e-10,
"loss": 1.7879,
"step": 14660
},
{
"epoch": 0.9881742528890536,
"grad_norm": 6.322129837266916,
"learning_rate": 4.235470940715524e-10,
"loss": 1.8006,
"step": 14665
},
{
"epoch": 0.9885111687611603,
"grad_norm": 6.141656233222583,
"learning_rate": 3.99693334721507e-10,
"loss": 1.7482,
"step": 14670
},
{
"epoch": 0.9888480846332671,
"grad_norm": 6.526943362952002,
"learning_rate": 3.765306255436029e-10,
"loss": 1.8075,
"step": 14675
},
{
"epoch": 0.9891850005053738,
"grad_norm": 5.793751726786318,
"learning_rate": 3.540589985766562e-10,
"loss": 1.7946,
"step": 14680
},
{
"epoch": 0.9895219163774805,
"grad_norm": 5.973903896846691,
"learning_rate": 3.322784849036364e-10,
"loss": 1.7779,
"step": 14685
},
{
"epoch": 0.9898588322495873,
"grad_norm": 6.014872978513893,
"learning_rate": 3.11189114651389e-10,
"loss": 1.6545,
"step": 14690
},
{
"epoch": 0.990195748121694,
"grad_norm": 6.368229704231421,
"learning_rate": 2.9079091699091287e-10,
"loss": 1.7579,
"step": 14695
},
{
"epoch": 0.9905326639938008,
"grad_norm": 6.193988622544747,
"learning_rate": 2.710839201370829e-10,
"loss": 1.8368,
"step": 14700
},
{
"epoch": 0.9908695798659075,
"grad_norm": 5.758847930865848,
"learning_rate": 2.5206815134881655e-10,
"loss": 1.7821,
"step": 14705
},
{
"epoch": 0.9912064957380142,
"grad_norm": 5.69684814939484,
"learning_rate": 2.337436369287404e-10,
"loss": 1.7896,
"step": 14710
},
{
"epoch": 0.9915434116101209,
"grad_norm": 5.638836518174001,
"learning_rate": 2.1611040222346833e-10,
"loss": 1.7734,
"step": 14715
},
{
"epoch": 0.9918803274822277,
"grad_norm": 5.899552088739335,
"learning_rate": 1.9916847162343432e-10,
"loss": 1.7912,
"step": 14720
},
{
"epoch": 0.9922172433543345,
"grad_norm": 5.829892034986946,
"learning_rate": 1.829178685627264e-10,
"loss": 1.7899,
"step": 14725
},
{
"epoch": 0.9925541592264412,
"grad_norm": 5.795901536554887,
"learning_rate": 1.6735861551936402e-10,
"loss": 1.728,
"step": 14730
},
{
"epoch": 0.9928910750985479,
"grad_norm": 5.766864718819964,
"learning_rate": 1.5249073401502055e-10,
"loss": 1.7406,
"step": 14735
},
{
"epoch": 0.9932279909706546,
"grad_norm": 6.08955650931636,
"learning_rate": 1.3831424461496766e-10,
"loss": 1.7832,
"step": 14740
},
{
"epoch": 0.9935649068427613,
"grad_norm": 6.069229823009566,
"learning_rate": 1.2482916692824197e-10,
"loss": 1.7347,
"step": 14745
},
{
"epoch": 0.993901822714868,
"grad_norm": 5.865594094745515,
"learning_rate": 1.1203551960742297e-10,
"loss": 1.7604,
"step": 14750
},
{
"epoch": 0.9942387385869749,
"grad_norm": 6.41217202434423,
"learning_rate": 9.993332034891056e-11,
"loss": 1.7317,
"step": 14755
},
{
"epoch": 0.9945756544590816,
"grad_norm": 5.818435701385782,
"learning_rate": 8.852258589236994e-11,
"loss": 1.7556,
"step": 14760
},
{
"epoch": 0.9949125703311883,
"grad_norm": 6.258726795030299,
"learning_rate": 7.780333202134226e-11,
"loss": 1.7599,
"step": 14765
},
{
"epoch": 0.995249486203295,
"grad_norm": 5.7871168282439545,
"learning_rate": 6.777557356263397e-11,
"loss": 1.7978,
"step": 14770
},
{
"epoch": 0.9955864020754017,
"grad_norm": 5.995386180636011,
"learning_rate": 5.843932438681643e-11,
"loss": 1.8203,
"step": 14775
},
{
"epoch": 0.9959233179475085,
"grad_norm": 5.821396562583734,
"learning_rate": 4.97945974077818e-11,
"loss": 1.8173,
"step": 14780
},
{
"epoch": 0.9962602338196153,
"grad_norm": 6.117647963442301,
"learning_rate": 4.1841404582965143e-11,
"loss": 1.7567,
"step": 14785
},
{
"epoch": 0.996597149691722,
"grad_norm": 5.655528276296895,
"learning_rate": 3.457975691334436e-11,
"loss": 1.681,
"step": 14790
},
{
"epoch": 0.9969340655638287,
"grad_norm": 5.646705806265958,
"learning_rate": 2.800966444316266e-11,
"loss": 1.7546,
"step": 14795
},
{
"epoch": 0.9972709814359354,
"grad_norm": 5.905296090122313,
"learning_rate": 2.213113626026164e-11,
"loss": 1.8007,
"step": 14800
},
{
"epoch": 0.9976078973080422,
"grad_norm": 5.922158957749158,
"learning_rate": 1.6944180495914728e-11,
"loss": 1.704,
"step": 14805
},
{
"epoch": 0.9979448131801489,
"grad_norm": 6.204962127147208,
"learning_rate": 1.2448804324660667e-11,
"loss": 1.8127,
"step": 14810
},
{
"epoch": 0.9982817290522557,
"grad_norm": 6.224102373975581,
"learning_rate": 8.645013964581061e-12,
"loss": 1.8241,
"step": 14815
},
{
"epoch": 0.9986186449243624,
"grad_norm": 5.952709226062345,
"learning_rate": 5.532814677133846e-12,
"loss": 1.7962,
"step": 14820
},
{
"epoch": 0.9989555607964691,
"grad_norm": 6.246353867410941,
"learning_rate": 3.112210767042267e-12,
"loss": 1.804,
"step": 14825
},
{
"epoch": 0.9992924766685759,
"grad_norm": 6.163743778465804,
"learning_rate": 1.383205582516922e-12,
"loss": 1.6971,
"step": 14830
},
{
"epoch": 0.9996293925406826,
"grad_norm": 5.689291557459708,
"learning_rate": 3.4580151520025024e-13,
"loss": 1.8645,
"step": 14835
},
{
"epoch": 0.9999663084127893,
"grad_norm": 6.244802182387895,
"learning_rate": 0.0,
"loss": 1.8053,
"step": 14840
},
{
"epoch": 0.9999663084127893,
"eval_loss": null,
"eval_runtime": 77.4202,
"eval_samples_per_second": 61.973,
"eval_steps_per_second": 7.75,
"step": 14840
},
{
"epoch": 0.9999663084127893,
"step": 14840,
"total_flos": 1.2604318649008783e+18,
"train_loss": 1.874135019435394,
"train_runtime": 22896.7966,
"train_samples_per_second": 20.741,
"train_steps_per_second": 0.648
}
],
"logging_steps": 5,
"max_steps": 14840,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2604318649008783e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}