Files
unsup-Llama-3.2-1B-Instruct…/trainer_state.json
ModelHub XC 4360715cbd 初始化项目,由ModelHub XC社区提供模型
Model: ferrazzipietro/unsup-Llama-3.2-1B-Instruct-datav2-3ep
Source: Original Platform
2026-06-04 10:56:16 +08:00

33567 lines
820 KiB
JSON

{
"best_global_step": 12000,
"best_metric": 0.3009350597858429,
"best_model_checkpoint": null,
"epoch": 2.99971659791542,
"eval_steps": 1000,
"global_step": 23817,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00012595648203545676,
"grad_norm": 107.34553527832031,
"learning_rate": 0.0,
"loss": 4.9992,
"step": 1
},
{
"epoch": 0.0006297824101772837,
"grad_norm": 132.82723999023438,
"learning_rate": 5.037783375314861e-07,
"loss": 5.2462,
"step": 5
},
{
"epoch": 0.0012595648203545674,
"grad_norm": 61.73713684082031,
"learning_rate": 1.1335012594458437e-06,
"loss": 4.9216,
"step": 10
},
{
"epoch": 0.0018893472305318512,
"grad_norm": 29.1369571685791,
"learning_rate": 1.7632241813602012e-06,
"loss": 4.3103,
"step": 15
},
{
"epoch": 0.0025191296407091348,
"grad_norm": 13.688713073730469,
"learning_rate": 2.392947103274559e-06,
"loss": 3.7045,
"step": 20
},
{
"epoch": 0.0031489120508864186,
"grad_norm": 7.063476085662842,
"learning_rate": 3.0226700251889166e-06,
"loss": 3.255,
"step": 25
},
{
"epoch": 0.0037786944610637024,
"grad_norm": 4.568771839141846,
"learning_rate": 3.6523929471032744e-06,
"loss": 2.9188,
"step": 30
},
{
"epoch": 0.004408476871240986,
"grad_norm": 4.0639753341674805,
"learning_rate": 4.282115869017632e-06,
"loss": 2.6194,
"step": 35
},
{
"epoch": 0.0050382592814182696,
"grad_norm": 3.363410234451294,
"learning_rate": 4.911838790931989e-06,
"loss": 2.3438,
"step": 40
},
{
"epoch": 0.005668041691595554,
"grad_norm": 3.086527109146118,
"learning_rate": 5.541561712846347e-06,
"loss": 2.21,
"step": 45
},
{
"epoch": 0.006297824101772837,
"grad_norm": 2.987258195877075,
"learning_rate": 6.171284634760705e-06,
"loss": 2.063,
"step": 50
},
{
"epoch": 0.006927606511950121,
"grad_norm": 2.956822156906128,
"learning_rate": 6.801007556675062e-06,
"loss": 1.9592,
"step": 55
},
{
"epoch": 0.007557388922127405,
"grad_norm": 2.7860183715820312,
"learning_rate": 7.43073047858942e-06,
"loss": 1.8058,
"step": 60
},
{
"epoch": 0.008187171332304689,
"grad_norm": 2.930774211883545,
"learning_rate": 8.060453400503778e-06,
"loss": 1.7843,
"step": 65
},
{
"epoch": 0.008816953742481972,
"grad_norm": 2.797520637512207,
"learning_rate": 8.690176322418136e-06,
"loss": 1.6922,
"step": 70
},
{
"epoch": 0.009446736152659256,
"grad_norm": 2.945425271987915,
"learning_rate": 9.319899244332492e-06,
"loss": 1.6863,
"step": 75
},
{
"epoch": 0.010076518562836539,
"grad_norm": 3.775211811065674,
"learning_rate": 9.94962216624685e-06,
"loss": 1.6918,
"step": 80
},
{
"epoch": 0.010706300973013824,
"grad_norm": 4.795872688293457,
"learning_rate": 1.0579345088161209e-05,
"loss": 1.5709,
"step": 85
},
{
"epoch": 0.011336083383191108,
"grad_norm": 6.182225227355957,
"learning_rate": 1.1209068010075565e-05,
"loss": 1.582,
"step": 90
},
{
"epoch": 0.011965865793368391,
"grad_norm": 2.548555612564087,
"learning_rate": 1.1838790931989923e-05,
"loss": 1.4157,
"step": 95
},
{
"epoch": 0.012595648203545674,
"grad_norm": 2.5125293731689453,
"learning_rate": 1.246851385390428e-05,
"loss": 1.4848,
"step": 100
},
{
"epoch": 0.01322543061372296,
"grad_norm": 2.3581464290618896,
"learning_rate": 1.309823677581864e-05,
"loss": 1.4169,
"step": 105
},
{
"epoch": 0.013855213023900243,
"grad_norm": 2.4205543994903564,
"learning_rate": 1.3727959697732996e-05,
"loss": 1.3803,
"step": 110
},
{
"epoch": 0.014484995434077526,
"grad_norm": 2.251526355743408,
"learning_rate": 1.4357682619647355e-05,
"loss": 1.3928,
"step": 115
},
{
"epoch": 0.01511477784425481,
"grad_norm": 2.3966078758239746,
"learning_rate": 1.4987405541561711e-05,
"loss": 1.3994,
"step": 120
},
{
"epoch": 0.015744560254432095,
"grad_norm": 2.2904422283172607,
"learning_rate": 1.561712846347607e-05,
"loss": 1.3884,
"step": 125
},
{
"epoch": 0.016374342664609378,
"grad_norm": 2.238481283187866,
"learning_rate": 1.6246851385390428e-05,
"loss": 1.3419,
"step": 130
},
{
"epoch": 0.01700412507478666,
"grad_norm": 2.3208041191101074,
"learning_rate": 1.6876574307304786e-05,
"loss": 1.3282,
"step": 135
},
{
"epoch": 0.017633907484963945,
"grad_norm": 2.34198260307312,
"learning_rate": 1.750629722921914e-05,
"loss": 1.3398,
"step": 140
},
{
"epoch": 0.018263689895141228,
"grad_norm": 2.316502332687378,
"learning_rate": 1.81360201511335e-05,
"loss": 1.3171,
"step": 145
},
{
"epoch": 0.01889347230531851,
"grad_norm": 2.1474759578704834,
"learning_rate": 1.8765743073047857e-05,
"loss": 1.2896,
"step": 150
},
{
"epoch": 0.019523254715495795,
"grad_norm": 2.2437920570373535,
"learning_rate": 1.9395465994962215e-05,
"loss": 1.3504,
"step": 155
},
{
"epoch": 0.020153037125673078,
"grad_norm": 2.3530609607696533,
"learning_rate": 2.002518891687657e-05,
"loss": 1.2937,
"step": 160
},
{
"epoch": 0.020782819535850365,
"grad_norm": 3.199941635131836,
"learning_rate": 2.065491183879093e-05,
"loss": 1.2841,
"step": 165
},
{
"epoch": 0.02141260194602765,
"grad_norm": 2.3354921340942383,
"learning_rate": 2.128463476070529e-05,
"loss": 1.2573,
"step": 170
},
{
"epoch": 0.022042384356204932,
"grad_norm": 1.9546146392822266,
"learning_rate": 2.1914357682619645e-05,
"loss": 1.2179,
"step": 175
},
{
"epoch": 0.022672166766382215,
"grad_norm": 2.329948902130127,
"learning_rate": 2.2544080604534003e-05,
"loss": 1.2417,
"step": 180
},
{
"epoch": 0.0233019491765595,
"grad_norm": 2.110067367553711,
"learning_rate": 2.3173803526448358e-05,
"loss": 1.289,
"step": 185
},
{
"epoch": 0.023931731586736782,
"grad_norm": 1.8875372409820557,
"learning_rate": 2.380352644836272e-05,
"loss": 1.2183,
"step": 190
},
{
"epoch": 0.024561513996914065,
"grad_norm": 2.3789267539978027,
"learning_rate": 2.4433249370277077e-05,
"loss": 1.3136,
"step": 195
},
{
"epoch": 0.02519129640709135,
"grad_norm": 2.4072821140289307,
"learning_rate": 2.5062972292191432e-05,
"loss": 1.2241,
"step": 200
},
{
"epoch": 0.025821078817268632,
"grad_norm": 1.936230182647705,
"learning_rate": 2.569269521410579e-05,
"loss": 1.2715,
"step": 205
},
{
"epoch": 0.02645086122744592,
"grad_norm": 1.8356448411941528,
"learning_rate": 2.632241813602015e-05,
"loss": 1.1728,
"step": 210
},
{
"epoch": 0.027080643637623202,
"grad_norm": 1.7642450332641602,
"learning_rate": 2.6952141057934507e-05,
"loss": 1.2022,
"step": 215
},
{
"epoch": 0.027710426047800486,
"grad_norm": 1.8820688724517822,
"learning_rate": 2.7581863979848865e-05,
"loss": 1.1563,
"step": 220
},
{
"epoch": 0.02834020845797777,
"grad_norm": 1.8875645399093628,
"learning_rate": 2.821158690176322e-05,
"loss": 1.1473,
"step": 225
},
{
"epoch": 0.028969990868155052,
"grad_norm": 1.950432538986206,
"learning_rate": 2.884130982367758e-05,
"loss": 1.1478,
"step": 230
},
{
"epoch": 0.029599773278332336,
"grad_norm": 1.8610332012176514,
"learning_rate": 2.9471032745591936e-05,
"loss": 1.2026,
"step": 235
},
{
"epoch": 0.03022955568850962,
"grad_norm": 1.865122675895691,
"learning_rate": 3.0100755667506295e-05,
"loss": 1.1748,
"step": 240
},
{
"epoch": 0.030859338098686902,
"grad_norm": 1.7126635313034058,
"learning_rate": 3.0730478589420656e-05,
"loss": 1.0982,
"step": 245
},
{
"epoch": 0.03148912050886419,
"grad_norm": 1.809869408607483,
"learning_rate": 3.136020151133501e-05,
"loss": 1.1032,
"step": 250
},
{
"epoch": 0.03211890291904147,
"grad_norm": 1.9480305910110474,
"learning_rate": 3.1989924433249366e-05,
"loss": 1.1195,
"step": 255
},
{
"epoch": 0.032748685329218756,
"grad_norm": 1.762191653251648,
"learning_rate": 3.2619647355163724e-05,
"loss": 1.1413,
"step": 260
},
{
"epoch": 0.03337846773939604,
"grad_norm": 1.5951004028320312,
"learning_rate": 3.324937027707808e-05,
"loss": 1.1514,
"step": 265
},
{
"epoch": 0.03400825014957332,
"grad_norm": 1.9166375398635864,
"learning_rate": 3.387909319899244e-05,
"loss": 1.1129,
"step": 270
},
{
"epoch": 0.034638032559750606,
"grad_norm": 1.8051133155822754,
"learning_rate": 3.45088161209068e-05,
"loss": 1.1827,
"step": 275
},
{
"epoch": 0.03526781496992789,
"grad_norm": 1.8178284168243408,
"learning_rate": 3.513853904282116e-05,
"loss": 1.1567,
"step": 280
},
{
"epoch": 0.03589759738010517,
"grad_norm": 1.5286771059036255,
"learning_rate": 3.5768261964735515e-05,
"loss": 1.011,
"step": 285
},
{
"epoch": 0.036527379790282456,
"grad_norm": 1.6900701522827148,
"learning_rate": 3.639798488664987e-05,
"loss": 1.059,
"step": 290
},
{
"epoch": 0.03715716220045974,
"grad_norm": 1.5855501890182495,
"learning_rate": 3.702770780856423e-05,
"loss": 1.0943,
"step": 295
},
{
"epoch": 0.03778694461063702,
"grad_norm": 1.650087833404541,
"learning_rate": 3.765743073047858e-05,
"loss": 1.0843,
"step": 300
},
{
"epoch": 0.038416727020814306,
"grad_norm": 1.772127389907837,
"learning_rate": 3.828715365239294e-05,
"loss": 1.1161,
"step": 305
},
{
"epoch": 0.03904650943099159,
"grad_norm": 1.8052891492843628,
"learning_rate": 3.8916876574307306e-05,
"loss": 1.1493,
"step": 310
},
{
"epoch": 0.03967629184116887,
"grad_norm": 1.816603183746338,
"learning_rate": 3.954659949622166e-05,
"loss": 1.1113,
"step": 315
},
{
"epoch": 0.040306074251346156,
"grad_norm": 1.7114448547363281,
"learning_rate": 4.0176322418136016e-05,
"loss": 1.1124,
"step": 320
},
{
"epoch": 0.04093585666152345,
"grad_norm": 1.693428874015808,
"learning_rate": 4.080604534005038e-05,
"loss": 1.0994,
"step": 325
},
{
"epoch": 0.04156563907170073,
"grad_norm": 1.6820402145385742,
"learning_rate": 4.143576826196473e-05,
"loss": 1.0989,
"step": 330
},
{
"epoch": 0.042195421481878014,
"grad_norm": 1.5841305255889893,
"learning_rate": 4.206549118387909e-05,
"loss": 1.123,
"step": 335
},
{
"epoch": 0.0428252038920553,
"grad_norm": 1.714936375617981,
"learning_rate": 4.269521410579344e-05,
"loss": 1.0054,
"step": 340
},
{
"epoch": 0.04345498630223258,
"grad_norm": 1.6568523645401,
"learning_rate": 4.332493702770781e-05,
"loss": 1.0397,
"step": 345
},
{
"epoch": 0.044084768712409864,
"grad_norm": 1.9612834453582764,
"learning_rate": 4.3954659949622165e-05,
"loss": 1.0481,
"step": 350
},
{
"epoch": 0.04471455112258715,
"grad_norm": 1.5859555006027222,
"learning_rate": 4.4584382871536516e-05,
"loss": 1.0754,
"step": 355
},
{
"epoch": 0.04534433353276443,
"grad_norm": 1.6280044317245483,
"learning_rate": 4.521410579345088e-05,
"loss": 1.0696,
"step": 360
},
{
"epoch": 0.045974115942941714,
"grad_norm": 1.5000571012496948,
"learning_rate": 4.584382871536523e-05,
"loss": 1.0508,
"step": 365
},
{
"epoch": 0.046603898353119,
"grad_norm": 1.4447442293167114,
"learning_rate": 4.647355163727959e-05,
"loss": 1.0924,
"step": 370
},
{
"epoch": 0.04723368076329628,
"grad_norm": 1.6207387447357178,
"learning_rate": 4.7103274559193956e-05,
"loss": 1.0477,
"step": 375
},
{
"epoch": 0.047863463173473564,
"grad_norm": 1.5458993911743164,
"learning_rate": 4.773299748110831e-05,
"loss": 1.0905,
"step": 380
},
{
"epoch": 0.04849324558365085,
"grad_norm": 1.5125916004180908,
"learning_rate": 4.8362720403022666e-05,
"loss": 1.0858,
"step": 385
},
{
"epoch": 0.04912302799382813,
"grad_norm": 1.5438019037246704,
"learning_rate": 4.899244332493702e-05,
"loss": 1.1133,
"step": 390
},
{
"epoch": 0.049752810404005414,
"grad_norm": 1.5938135385513306,
"learning_rate": 4.962216624685138e-05,
"loss": 1.0814,
"step": 395
},
{
"epoch": 0.0503825928141827,
"grad_norm": 1.4425631761550903,
"learning_rate": 5.025188916876574e-05,
"loss": 1.0668,
"step": 400
},
{
"epoch": 0.05101237522435998,
"grad_norm": 1.505650520324707,
"learning_rate": 5.088161209068009e-05,
"loss": 1.1201,
"step": 405
},
{
"epoch": 0.051642157634537264,
"grad_norm": 1.421877145767212,
"learning_rate": 5.151133501259446e-05,
"loss": 1.0199,
"step": 410
},
{
"epoch": 0.052271940044714554,
"grad_norm": 1.6243743896484375,
"learning_rate": 5.2141057934508815e-05,
"loss": 1.0791,
"step": 415
},
{
"epoch": 0.05290172245489184,
"grad_norm": 1.5055480003356934,
"learning_rate": 5.2770780856423166e-05,
"loss": 1.0553,
"step": 420
},
{
"epoch": 0.05353150486506912,
"grad_norm": 1.5624281167984009,
"learning_rate": 5.340050377833753e-05,
"loss": 1.0778,
"step": 425
},
{
"epoch": 0.054161287275246404,
"grad_norm": 1.4252930879592896,
"learning_rate": 5.403022670025188e-05,
"loss": 1.0354,
"step": 430
},
{
"epoch": 0.05479106968542369,
"grad_norm": 1.4582849740982056,
"learning_rate": 5.465994962216624e-05,
"loss": 0.9825,
"step": 435
},
{
"epoch": 0.05542085209560097,
"grad_norm": 1.4336227178573608,
"learning_rate": 5.52896725440806e-05,
"loss": 1.0753,
"step": 440
},
{
"epoch": 0.056050634505778255,
"grad_norm": 1.4182472229003906,
"learning_rate": 5.591939546599496e-05,
"loss": 1.0005,
"step": 445
},
{
"epoch": 0.05668041691595554,
"grad_norm": 1.4060423374176025,
"learning_rate": 5.6549118387909316e-05,
"loss": 1.0311,
"step": 450
},
{
"epoch": 0.05731019932613282,
"grad_norm": 1.5005807876586914,
"learning_rate": 5.717884130982367e-05,
"loss": 1.0815,
"step": 455
},
{
"epoch": 0.057939981736310105,
"grad_norm": 1.3563963174819946,
"learning_rate": 5.780856423173803e-05,
"loss": 1.0099,
"step": 460
},
{
"epoch": 0.05856976414648739,
"grad_norm": 1.4664981365203857,
"learning_rate": 5.843828715365239e-05,
"loss": 1.0221,
"step": 465
},
{
"epoch": 0.05919954655666467,
"grad_norm": 1.3635119199752808,
"learning_rate": 5.906801007556674e-05,
"loss": 1.0864,
"step": 470
},
{
"epoch": 0.059829328966841955,
"grad_norm": 1.3736735582351685,
"learning_rate": 5.9697732997481107e-05,
"loss": 1.0949,
"step": 475
},
{
"epoch": 0.06045911137701924,
"grad_norm": 1.3973110914230347,
"learning_rate": 6.0327455919395465e-05,
"loss": 1.0936,
"step": 480
},
{
"epoch": 0.06108889378719652,
"grad_norm": 1.4158260822296143,
"learning_rate": 6.0957178841309816e-05,
"loss": 1.0389,
"step": 485
},
{
"epoch": 0.061718676197373805,
"grad_norm": 1.3810491561889648,
"learning_rate": 6.158690176322417e-05,
"loss": 1.0027,
"step": 490
},
{
"epoch": 0.06234845860755109,
"grad_norm": 1.3191838264465332,
"learning_rate": 6.221662468513854e-05,
"loss": 1.0626,
"step": 495
},
{
"epoch": 0.06297824101772838,
"grad_norm": 1.2959917783737183,
"learning_rate": 6.28463476070529e-05,
"loss": 1.064,
"step": 500
},
{
"epoch": 0.06360802342790566,
"grad_norm": 1.321399450302124,
"learning_rate": 6.347607052896724e-05,
"loss": 1.0407,
"step": 505
},
{
"epoch": 0.06423780583808295,
"grad_norm": 1.464593768119812,
"learning_rate": 6.410579345088161e-05,
"loss": 1.0961,
"step": 510
},
{
"epoch": 0.06486758824826022,
"grad_norm": 1.1570991277694702,
"learning_rate": 6.473551637279596e-05,
"loss": 1.0373,
"step": 515
},
{
"epoch": 0.06549737065843751,
"grad_norm": 1.2346997261047363,
"learning_rate": 6.536523929471032e-05,
"loss": 1.0016,
"step": 520
},
{
"epoch": 0.06612715306861479,
"grad_norm": 1.2645131349563599,
"learning_rate": 6.599496221662469e-05,
"loss": 1.0175,
"step": 525
},
{
"epoch": 0.06675693547879208,
"grad_norm": 1.7016206979751587,
"learning_rate": 6.662468513853903e-05,
"loss": 1.0517,
"step": 530
},
{
"epoch": 0.06738671788896936,
"grad_norm": 1.2666237354278564,
"learning_rate": 6.725440806045339e-05,
"loss": 0.9876,
"step": 535
},
{
"epoch": 0.06801650029914665,
"grad_norm": 1.365417242050171,
"learning_rate": 6.788413098236775e-05,
"loss": 1.0262,
"step": 540
},
{
"epoch": 0.06864628270932392,
"grad_norm": 1.1083537340164185,
"learning_rate": 6.851385390428211e-05,
"loss": 0.979,
"step": 545
},
{
"epoch": 0.06927606511950121,
"grad_norm": 1.2279447317123413,
"learning_rate": 6.914357682619647e-05,
"loss": 1.0566,
"step": 550
},
{
"epoch": 0.0699058475296785,
"grad_norm": 1.2323870658874512,
"learning_rate": 6.977329974811082e-05,
"loss": 1.0047,
"step": 555
},
{
"epoch": 0.07053562993985578,
"grad_norm": 1.2166273593902588,
"learning_rate": 7.040302267002518e-05,
"loss": 0.989,
"step": 560
},
{
"epoch": 0.07116541235003307,
"grad_norm": 1.3858731985092163,
"learning_rate": 7.103274559193954e-05,
"loss": 1.0389,
"step": 565
},
{
"epoch": 0.07179519476021035,
"grad_norm": 1.1922657489776611,
"learning_rate": 7.16624685138539e-05,
"loss": 1.0214,
"step": 570
},
{
"epoch": 0.07242497717038764,
"grad_norm": 1.2922885417938232,
"learning_rate": 7.229219143576826e-05,
"loss": 1.0647,
"step": 575
},
{
"epoch": 0.07305475958056491,
"grad_norm": 1.174737811088562,
"learning_rate": 7.292191435768262e-05,
"loss": 1.039,
"step": 580
},
{
"epoch": 0.0736845419907422,
"grad_norm": 1.4469586610794067,
"learning_rate": 7.355163727959697e-05,
"loss": 0.9553,
"step": 585
},
{
"epoch": 0.07431432440091948,
"grad_norm": 1.2497671842575073,
"learning_rate": 7.418136020151133e-05,
"loss": 1.0676,
"step": 590
},
{
"epoch": 0.07494410681109677,
"grad_norm": 1.0431822538375854,
"learning_rate": 7.481108312342569e-05,
"loss": 0.9664,
"step": 595
},
{
"epoch": 0.07557388922127405,
"grad_norm": 1.1643774509429932,
"learning_rate": 7.544080604534005e-05,
"loss": 1.0354,
"step": 600
},
{
"epoch": 0.07620367163145134,
"grad_norm": 1.277813196182251,
"learning_rate": 7.60705289672544e-05,
"loss": 1.0078,
"step": 605
},
{
"epoch": 0.07683345404162861,
"grad_norm": 1.17416250705719,
"learning_rate": 7.670025188916876e-05,
"loss": 0.9802,
"step": 610
},
{
"epoch": 0.0774632364518059,
"grad_norm": 1.3622347116470337,
"learning_rate": 7.732997481108312e-05,
"loss": 0.9323,
"step": 615
},
{
"epoch": 0.07809301886198318,
"grad_norm": 1.1814721822738647,
"learning_rate": 7.795969773299747e-05,
"loss": 0.989,
"step": 620
},
{
"epoch": 0.07872280127216047,
"grad_norm": 1.1661953926086426,
"learning_rate": 7.858942065491183e-05,
"loss": 0.9873,
"step": 625
},
{
"epoch": 0.07935258368233775,
"grad_norm": 1.1950370073318481,
"learning_rate": 7.921914357682618e-05,
"loss": 0.9997,
"step": 630
},
{
"epoch": 0.07998236609251504,
"grad_norm": 1.2635341882705688,
"learning_rate": 7.984886649874056e-05,
"loss": 0.9939,
"step": 635
},
{
"epoch": 0.08061214850269231,
"grad_norm": 1.094709038734436,
"learning_rate": 8.047858942065491e-05,
"loss": 1.035,
"step": 640
},
{
"epoch": 0.0812419309128696,
"grad_norm": 1.0304499864578247,
"learning_rate": 8.110831234256926e-05,
"loss": 1.0385,
"step": 645
},
{
"epoch": 0.0818717133230469,
"grad_norm": 1.203204870223999,
"learning_rate": 8.173803526448362e-05,
"loss": 1.0159,
"step": 650
},
{
"epoch": 0.08250149573322417,
"grad_norm": 1.317717432975769,
"learning_rate": 8.236775818639797e-05,
"loss": 0.9972,
"step": 655
},
{
"epoch": 0.08313127814340146,
"grad_norm": 1.1352335214614868,
"learning_rate": 8.299748110831233e-05,
"loss": 1.0027,
"step": 660
},
{
"epoch": 0.08376106055357874,
"grad_norm": 1.1126285791397095,
"learning_rate": 8.362720403022669e-05,
"loss": 0.9824,
"step": 665
},
{
"epoch": 0.08439084296375603,
"grad_norm": 1.1858975887298584,
"learning_rate": 8.425692695214106e-05,
"loss": 1.0084,
"step": 670
},
{
"epoch": 0.0850206253739333,
"grad_norm": 1.1154075860977173,
"learning_rate": 8.488664987405541e-05,
"loss": 0.968,
"step": 675
},
{
"epoch": 0.0856504077841106,
"grad_norm": 1.1235395669937134,
"learning_rate": 8.551637279596977e-05,
"loss": 0.9828,
"step": 680
},
{
"epoch": 0.08628019019428787,
"grad_norm": 1.251595139503479,
"learning_rate": 8.614609571788412e-05,
"loss": 0.9945,
"step": 685
},
{
"epoch": 0.08690997260446516,
"grad_norm": 1.2071354389190674,
"learning_rate": 8.677581863979848e-05,
"loss": 1.0263,
"step": 690
},
{
"epoch": 0.08753975501464244,
"grad_norm": 1.1954138278961182,
"learning_rate": 8.740554156171283e-05,
"loss": 0.9987,
"step": 695
},
{
"epoch": 0.08816953742481973,
"grad_norm": 1.088759422302246,
"learning_rate": 8.803526448362719e-05,
"loss": 1.0323,
"step": 700
},
{
"epoch": 0.088799319834997,
"grad_norm": 1.0866519212722778,
"learning_rate": 8.866498740554156e-05,
"loss": 1.0009,
"step": 705
},
{
"epoch": 0.0894291022451743,
"grad_norm": 1.1450897455215454,
"learning_rate": 8.929471032745592e-05,
"loss": 1.0493,
"step": 710
},
{
"epoch": 0.09005888465535157,
"grad_norm": 1.0619937181472778,
"learning_rate": 8.992443324937027e-05,
"loss": 1.0007,
"step": 715
},
{
"epoch": 0.09068866706552886,
"grad_norm": 1.1548367738723755,
"learning_rate": 9.055415617128463e-05,
"loss": 1.006,
"step": 720
},
{
"epoch": 0.09131844947570614,
"grad_norm": 1.0847381353378296,
"learning_rate": 9.118387909319898e-05,
"loss": 0.9703,
"step": 725
},
{
"epoch": 0.09194823188588343,
"grad_norm": 1.1227048635482788,
"learning_rate": 9.181360201511333e-05,
"loss": 0.957,
"step": 730
},
{
"epoch": 0.09257801429606072,
"grad_norm": 1.0912779569625854,
"learning_rate": 9.24433249370277e-05,
"loss": 0.9866,
"step": 735
},
{
"epoch": 0.093207796706238,
"grad_norm": 1.0490583181381226,
"learning_rate": 9.307304785894206e-05,
"loss": 1.0605,
"step": 740
},
{
"epoch": 0.09383757911641528,
"grad_norm": 1.0930982828140259,
"learning_rate": 9.370277078085642e-05,
"loss": 1.0061,
"step": 745
},
{
"epoch": 0.09446736152659256,
"grad_norm": 1.1814302206039429,
"learning_rate": 9.433249370277077e-05,
"loss": 1.0187,
"step": 750
},
{
"epoch": 0.09509714393676985,
"grad_norm": 1.113162636756897,
"learning_rate": 9.496221662468513e-05,
"loss": 0.9929,
"step": 755
},
{
"epoch": 0.09572692634694713,
"grad_norm": 1.0789388418197632,
"learning_rate": 9.559193954659948e-05,
"loss": 0.9699,
"step": 760
},
{
"epoch": 0.09635670875712442,
"grad_norm": 1.0445059537887573,
"learning_rate": 9.622166246851384e-05,
"loss": 0.929,
"step": 765
},
{
"epoch": 0.0969864911673017,
"grad_norm": 1.0267964601516724,
"learning_rate": 9.685138539042821e-05,
"loss": 0.9581,
"step": 770
},
{
"epoch": 0.09761627357747898,
"grad_norm": 1.0283193588256836,
"learning_rate": 9.748110831234256e-05,
"loss": 1.0331,
"step": 775
},
{
"epoch": 0.09824605598765626,
"grad_norm": 1.0210477113723755,
"learning_rate": 9.811083123425692e-05,
"loss": 0.96,
"step": 780
},
{
"epoch": 0.09887583839783355,
"grad_norm": 1.0163402557373047,
"learning_rate": 9.874055415617127e-05,
"loss": 0.9923,
"step": 785
},
{
"epoch": 0.09950562080801083,
"grad_norm": 1.0012452602386475,
"learning_rate": 9.937027707808563e-05,
"loss": 0.9722,
"step": 790
},
{
"epoch": 0.10013540321818812,
"grad_norm": 0.9810453653335571,
"learning_rate": 9.999999999999999e-05,
"loss": 0.991,
"step": 795
},
{
"epoch": 0.1007651856283654,
"grad_norm": 1.1151692867279053,
"learning_rate": 0.00010062972292191434,
"loss": 0.9783,
"step": 800
},
{
"epoch": 0.10139496803854268,
"grad_norm": 1.1338117122650146,
"learning_rate": 0.00010125944584382871,
"loss": 0.9979,
"step": 805
},
{
"epoch": 0.10202475044871996,
"grad_norm": 0.9878106117248535,
"learning_rate": 0.00010188916876574307,
"loss": 0.9068,
"step": 810
},
{
"epoch": 0.10265453285889725,
"grad_norm": 1.0334627628326416,
"learning_rate": 0.00010251889168765742,
"loss": 0.9389,
"step": 815
},
{
"epoch": 0.10328431526907453,
"grad_norm": 0.9542704224586487,
"learning_rate": 0.00010314861460957178,
"loss": 0.9699,
"step": 820
},
{
"epoch": 0.10391409767925182,
"grad_norm": 1.003753423690796,
"learning_rate": 0.00010377833753148613,
"loss": 0.9309,
"step": 825
},
{
"epoch": 0.10454388008942911,
"grad_norm": 0.9803423285484314,
"learning_rate": 0.00010440806045340049,
"loss": 0.9711,
"step": 830
},
{
"epoch": 0.10517366249960639,
"grad_norm": 0.9765311479568481,
"learning_rate": 0.00010503778337531486,
"loss": 1.0237,
"step": 835
},
{
"epoch": 0.10580344490978368,
"grad_norm": 1.035510540008545,
"learning_rate": 0.00010566750629722922,
"loss": 0.9737,
"step": 840
},
{
"epoch": 0.10643322731996095,
"grad_norm": 3.4597954750061035,
"learning_rate": 0.00010629722921914357,
"loss": 1.038,
"step": 845
},
{
"epoch": 0.10706300973013824,
"grad_norm": 1.0254745483398438,
"learning_rate": 0.00010692695214105793,
"loss": 1.0044,
"step": 850
},
{
"epoch": 0.10769279214031552,
"grad_norm": 4.8941521644592285,
"learning_rate": 0.00010755667506297228,
"loss": 1.1038,
"step": 855
},
{
"epoch": 0.10832257455049281,
"grad_norm": 1.6676890850067139,
"learning_rate": 0.00010818639798488663,
"loss": 1.0043,
"step": 860
},
{
"epoch": 0.10895235696067009,
"grad_norm": 5.738070964813232,
"learning_rate": 0.00010881612090680099,
"loss": 1.0502,
"step": 865
},
{
"epoch": 0.10958213937084738,
"grad_norm": 1.1913108825683594,
"learning_rate": 0.00010944584382871536,
"loss": 1.0071,
"step": 870
},
{
"epoch": 0.11021192178102465,
"grad_norm": 1.0302019119262695,
"learning_rate": 0.00011007556675062972,
"loss": 0.9732,
"step": 875
},
{
"epoch": 0.11084170419120194,
"grad_norm": 0.92161625623703,
"learning_rate": 0.00011070528967254407,
"loss": 0.9414,
"step": 880
},
{
"epoch": 0.11147148660137922,
"grad_norm": 0.954598605632782,
"learning_rate": 0.00011133501259445843,
"loss": 0.9772,
"step": 885
},
{
"epoch": 0.11210126901155651,
"grad_norm": 0.9241647720336914,
"learning_rate": 0.00011196473551637278,
"loss": 0.9498,
"step": 890
},
{
"epoch": 0.11273105142173379,
"grad_norm": 0.9744060039520264,
"learning_rate": 0.00011259445843828714,
"loss": 0.9501,
"step": 895
},
{
"epoch": 0.11336083383191108,
"grad_norm": 1.0800458192825317,
"learning_rate": 0.0001132241813602015,
"loss": 0.9823,
"step": 900
},
{
"epoch": 0.11399061624208835,
"grad_norm": 1.0275344848632812,
"learning_rate": 0.00011385390428211587,
"loss": 1.0426,
"step": 905
},
{
"epoch": 0.11462039865226564,
"grad_norm": 1.0069867372512817,
"learning_rate": 0.00011448362720403022,
"loss": 0.9933,
"step": 910
},
{
"epoch": 0.11525018106244292,
"grad_norm": 1.0309741497039795,
"learning_rate": 0.00011511335012594457,
"loss": 0.9792,
"step": 915
},
{
"epoch": 0.11587996347262021,
"grad_norm": 0.9738866090774536,
"learning_rate": 0.00011574307304785893,
"loss": 1.0193,
"step": 920
},
{
"epoch": 0.1165097458827975,
"grad_norm": 0.9231003522872925,
"learning_rate": 0.00011637279596977329,
"loss": 0.9741,
"step": 925
},
{
"epoch": 0.11713952829297478,
"grad_norm": 1.1318124532699585,
"learning_rate": 0.00011700251889168764,
"loss": 0.9644,
"step": 930
},
{
"epoch": 0.11776931070315207,
"grad_norm": 1.033288598060608,
"learning_rate": 0.00011763224181360201,
"loss": 0.9216,
"step": 935
},
{
"epoch": 0.11839909311332934,
"grad_norm": 1.003190517425537,
"learning_rate": 0.00011826196473551637,
"loss": 0.9521,
"step": 940
},
{
"epoch": 0.11902887552350663,
"grad_norm": 1.0145738124847412,
"learning_rate": 0.00011889168765743072,
"loss": 0.9676,
"step": 945
},
{
"epoch": 0.11965865793368391,
"grad_norm": 1.1370879411697388,
"learning_rate": 0.00011952141057934508,
"loss": 0.9987,
"step": 950
},
{
"epoch": 0.1202884403438612,
"grad_norm": 0.9657129645347595,
"learning_rate": 0.00012015113350125943,
"loss": 0.9622,
"step": 955
},
{
"epoch": 0.12091822275403848,
"grad_norm": 0.9489335417747498,
"learning_rate": 0.00012078085642317378,
"loss": 0.9402,
"step": 960
},
{
"epoch": 0.12154800516421577,
"grad_norm": 1.0598636865615845,
"learning_rate": 0.00012141057934508814,
"loss": 1.0047,
"step": 965
},
{
"epoch": 0.12217778757439304,
"grad_norm": 0.9747732281684875,
"learning_rate": 0.00012204030226700251,
"loss": 1.009,
"step": 970
},
{
"epoch": 0.12280756998457033,
"grad_norm": 0.9424954056739807,
"learning_rate": 0.00012267002518891686,
"loss": 0.9603,
"step": 975
},
{
"epoch": 0.12343735239474761,
"grad_norm": 1.0061867237091064,
"learning_rate": 0.00012329974811083123,
"loss": 0.9494,
"step": 980
},
{
"epoch": 0.1240671348049249,
"grad_norm": 0.924182116985321,
"learning_rate": 0.00012392947103274558,
"loss": 0.9804,
"step": 985
},
{
"epoch": 0.12469691721510218,
"grad_norm": 0.983267605304718,
"learning_rate": 0.00012455919395465995,
"loss": 0.9814,
"step": 990
},
{
"epoch": 0.12532669962527945,
"grad_norm": 0.896524965763092,
"learning_rate": 0.0001251889168765743,
"loss": 0.9533,
"step": 995
},
{
"epoch": 0.12595648203545676,
"grad_norm": 0.8669747710227966,
"learning_rate": 0.00012581863979848864,
"loss": 0.9544,
"step": 1000
},
{
"epoch": 0.12595648203545676,
"eval_loss": 0.383962482213974,
"eval_runtime": 6.2938,
"eval_samples_per_second": 158.887,
"eval_steps_per_second": 10.01,
"step": 1000
},
{
"epoch": 0.12658626444563403,
"grad_norm": 0.9055171012878418,
"learning_rate": 0.000126448362720403,
"loss": 0.9353,
"step": 1005
},
{
"epoch": 0.1272160468558113,
"grad_norm": 0.9889428019523621,
"learning_rate": 0.00012707808564231738,
"loss": 0.9508,
"step": 1010
},
{
"epoch": 0.12784582926598861,
"grad_norm": 0.8966602683067322,
"learning_rate": 0.00012770780856423173,
"loss": 0.995,
"step": 1015
},
{
"epoch": 0.1284756116761659,
"grad_norm": 0.9995138645172119,
"learning_rate": 0.0001283375314861461,
"loss": 0.9624,
"step": 1020
},
{
"epoch": 0.12910539408634317,
"grad_norm": 0.8536145687103271,
"learning_rate": 0.00012896725440806044,
"loss": 0.9549,
"step": 1025
},
{
"epoch": 0.12973517649652044,
"grad_norm": 0.8860256671905518,
"learning_rate": 0.00012959697732997479,
"loss": 1.0021,
"step": 1030
},
{
"epoch": 0.13036495890669775,
"grad_norm": 0.8574298620223999,
"learning_rate": 0.00013022670025188916,
"loss": 0.9798,
"step": 1035
},
{
"epoch": 0.13099474131687502,
"grad_norm": 1.1180200576782227,
"learning_rate": 0.00013085642317380353,
"loss": 0.9225,
"step": 1040
},
{
"epoch": 0.1316245237270523,
"grad_norm": 0.9391751289367676,
"learning_rate": 0.00013148614609571787,
"loss": 0.9467,
"step": 1045
},
{
"epoch": 0.13225430613722958,
"grad_norm": 0.8861620426177979,
"learning_rate": 0.00013211586901763222,
"loss": 0.9413,
"step": 1050
},
{
"epoch": 0.13288408854740688,
"grad_norm": 0.8499036431312561,
"learning_rate": 0.0001327455919395466,
"loss": 0.9644,
"step": 1055
},
{
"epoch": 0.13351387095758416,
"grad_norm": 0.9816482067108154,
"learning_rate": 0.00013337531486146094,
"loss": 0.9552,
"step": 1060
},
{
"epoch": 0.13414365336776143,
"grad_norm": 0.9725036025047302,
"learning_rate": 0.0001340050377833753,
"loss": 0.9461,
"step": 1065
},
{
"epoch": 0.1347734357779387,
"grad_norm": 0.9366094470024109,
"learning_rate": 0.00013463476070528968,
"loss": 0.9305,
"step": 1070
},
{
"epoch": 0.13540321818811601,
"grad_norm": 0.9212390780448914,
"learning_rate": 0.00013526448362720402,
"loss": 0.9551,
"step": 1075
},
{
"epoch": 0.1360330005982933,
"grad_norm": 0.8980582356452942,
"learning_rate": 0.00013589420654911837,
"loss": 0.9491,
"step": 1080
},
{
"epoch": 0.13666278300847057,
"grad_norm": 0.9107893109321594,
"learning_rate": 0.00013652392947103274,
"loss": 0.9366,
"step": 1085
},
{
"epoch": 0.13729256541864784,
"grad_norm": 0.8583124876022339,
"learning_rate": 0.00013715365239294708,
"loss": 0.9628,
"step": 1090
},
{
"epoch": 0.13792234782882515,
"grad_norm": 0.877052903175354,
"learning_rate": 0.00013778337531486146,
"loss": 0.9675,
"step": 1095
},
{
"epoch": 0.13855213023900242,
"grad_norm": 0.8020456433296204,
"learning_rate": 0.0001384130982367758,
"loss": 0.9015,
"step": 1100
},
{
"epoch": 0.1391819126491797,
"grad_norm": 0.8703967928886414,
"learning_rate": 0.00013904282115869017,
"loss": 0.9658,
"step": 1105
},
{
"epoch": 0.139811695059357,
"grad_norm": 0.7955961227416992,
"learning_rate": 0.00013967254408060452,
"loss": 0.9084,
"step": 1110
},
{
"epoch": 0.14044147746953428,
"grad_norm": 0.893059492111206,
"learning_rate": 0.0001403022670025189,
"loss": 0.9591,
"step": 1115
},
{
"epoch": 0.14107125987971156,
"grad_norm": 0.8481057286262512,
"learning_rate": 0.00014093198992443323,
"loss": 0.9588,
"step": 1120
},
{
"epoch": 0.14170104228988883,
"grad_norm": 0.8342163562774658,
"learning_rate": 0.00014156171284634758,
"loss": 0.947,
"step": 1125
},
{
"epoch": 0.14233082470006614,
"grad_norm": 0.790868878364563,
"learning_rate": 0.00014219143576826195,
"loss": 0.9366,
"step": 1130
},
{
"epoch": 0.14296060711024342,
"grad_norm": 0.8430061340332031,
"learning_rate": 0.0001428211586901763,
"loss": 0.9014,
"step": 1135
},
{
"epoch": 0.1435903895204207,
"grad_norm": 0.9150258302688599,
"learning_rate": 0.00014345088161209067,
"loss": 0.9546,
"step": 1140
},
{
"epoch": 0.14422017193059797,
"grad_norm": 0.8204888105392456,
"learning_rate": 0.00014408060453400504,
"loss": 0.9159,
"step": 1145
},
{
"epoch": 0.14484995434077527,
"grad_norm": 0.7595349550247192,
"learning_rate": 0.00014471032745591938,
"loss": 0.9179,
"step": 1150
},
{
"epoch": 0.14547973675095255,
"grad_norm": 0.8642888069152832,
"learning_rate": 0.00014534005037783373,
"loss": 1.0338,
"step": 1155
},
{
"epoch": 0.14610951916112982,
"grad_norm": 0.9633650183677673,
"learning_rate": 0.0001459697732997481,
"loss": 0.9638,
"step": 1160
},
{
"epoch": 0.1467393015713071,
"grad_norm": 0.8363626599311829,
"learning_rate": 0.00014659949622166244,
"loss": 0.8828,
"step": 1165
},
{
"epoch": 0.1473690839814844,
"grad_norm": 0.8199290633201599,
"learning_rate": 0.00014722921914357682,
"loss": 0.9577,
"step": 1170
},
{
"epoch": 0.14799886639166168,
"grad_norm": 0.7671203017234802,
"learning_rate": 0.0001478589420654912,
"loss": 0.9381,
"step": 1175
},
{
"epoch": 0.14862864880183896,
"grad_norm": 0.8354636430740356,
"learning_rate": 0.00014848866498740553,
"loss": 1.0019,
"step": 1180
},
{
"epoch": 0.14925843121201623,
"grad_norm": 0.911165714263916,
"learning_rate": 0.00014911838790931988,
"loss": 0.8985,
"step": 1185
},
{
"epoch": 0.14988821362219354,
"grad_norm": 0.8125472664833069,
"learning_rate": 0.00014974811083123425,
"loss": 0.9628,
"step": 1190
},
{
"epoch": 0.15051799603237082,
"grad_norm": 0.8937430381774902,
"learning_rate": 0.00015037783375314862,
"loss": 0.9843,
"step": 1195
},
{
"epoch": 0.1511477784425481,
"grad_norm": 0.9609346985816956,
"learning_rate": 0.00015100755667506297,
"loss": 0.9552,
"step": 1200
},
{
"epoch": 0.1517775608527254,
"grad_norm": 0.7975132465362549,
"learning_rate": 0.0001516372795969773,
"loss": 0.9799,
"step": 1205
},
{
"epoch": 0.15240734326290267,
"grad_norm": 0.8690225481987,
"learning_rate": 0.00015226700251889168,
"loss": 0.9604,
"step": 1210
},
{
"epoch": 0.15303712567307995,
"grad_norm": 0.7486653923988342,
"learning_rate": 0.00015289672544080603,
"loss": 0.9022,
"step": 1215
},
{
"epoch": 0.15366690808325723,
"grad_norm": 0.8420302271842957,
"learning_rate": 0.0001535264483627204,
"loss": 0.8791,
"step": 1220
},
{
"epoch": 0.15429669049343453,
"grad_norm": 0.8187466263771057,
"learning_rate": 0.00015415617128463474,
"loss": 0.9332,
"step": 1225
},
{
"epoch": 0.1549264729036118,
"grad_norm": 0.8711130619049072,
"learning_rate": 0.0001547858942065491,
"loss": 0.8924,
"step": 1230
},
{
"epoch": 0.15555625531378908,
"grad_norm": 0.8086002469062805,
"learning_rate": 0.00015541561712846346,
"loss": 0.9491,
"step": 1235
},
{
"epoch": 0.15618603772396636,
"grad_norm": 0.8274957537651062,
"learning_rate": 0.0001560453400503778,
"loss": 0.9392,
"step": 1240
},
{
"epoch": 0.15681582013414366,
"grad_norm": 0.831676721572876,
"learning_rate": 0.00015667506297229218,
"loss": 1.0327,
"step": 1245
},
{
"epoch": 0.15744560254432094,
"grad_norm": 0.8806201219558716,
"learning_rate": 0.00015730478589420652,
"loss": 0.8607,
"step": 1250
},
{
"epoch": 0.15807538495449822,
"grad_norm": 0.905436635017395,
"learning_rate": 0.00015793450881612092,
"loss": 0.9301,
"step": 1255
},
{
"epoch": 0.1587051673646755,
"grad_norm": 0.8631262183189392,
"learning_rate": 0.00015856423173803526,
"loss": 0.9443,
"step": 1260
},
{
"epoch": 0.1593349497748528,
"grad_norm": 0.7483521699905396,
"learning_rate": 0.0001591939546599496,
"loss": 0.901,
"step": 1265
},
{
"epoch": 0.15996473218503007,
"grad_norm": 0.8273198008537292,
"learning_rate": 0.00015982367758186398,
"loss": 0.9608,
"step": 1270
},
{
"epoch": 0.16059451459520735,
"grad_norm": 0.7562909722328186,
"learning_rate": 0.00016045340050377832,
"loss": 0.9219,
"step": 1275
},
{
"epoch": 0.16122429700538463,
"grad_norm": 0.8585835099220276,
"learning_rate": 0.0001610831234256927,
"loss": 0.942,
"step": 1280
},
{
"epoch": 0.16185407941556193,
"grad_norm": 0.8192921876907349,
"learning_rate": 0.00016171284634760704,
"loss": 0.9531,
"step": 1285
},
{
"epoch": 0.1624838618257392,
"grad_norm": 0.8301946520805359,
"learning_rate": 0.00016234256926952139,
"loss": 0.8972,
"step": 1290
},
{
"epoch": 0.16311364423591648,
"grad_norm": 0.8291681408882141,
"learning_rate": 0.00016297229219143576,
"loss": 0.9653,
"step": 1295
},
{
"epoch": 0.1637434266460938,
"grad_norm": 0.8672564625740051,
"learning_rate": 0.0001636020151133501,
"loss": 0.9498,
"step": 1300
},
{
"epoch": 0.16437320905627106,
"grad_norm": 0.7432397603988647,
"learning_rate": 0.00016423173803526445,
"loss": 0.8782,
"step": 1305
},
{
"epoch": 0.16500299146644834,
"grad_norm": 0.7710584402084351,
"learning_rate": 0.00016486146095717882,
"loss": 0.8872,
"step": 1310
},
{
"epoch": 0.16563277387662562,
"grad_norm": 0.7810630798339844,
"learning_rate": 0.00016549118387909316,
"loss": 0.9357,
"step": 1315
},
{
"epoch": 0.16626255628680292,
"grad_norm": 0.7368482947349548,
"learning_rate": 0.00016612090680100756,
"loss": 0.8935,
"step": 1320
},
{
"epoch": 0.1668923386969802,
"grad_norm": 0.7725487947463989,
"learning_rate": 0.0001667506297229219,
"loss": 0.9241,
"step": 1325
},
{
"epoch": 0.16752212110715747,
"grad_norm": 0.7551338076591492,
"learning_rate": 0.00016738035264483628,
"loss": 0.8878,
"step": 1330
},
{
"epoch": 0.16815190351733475,
"grad_norm": 0.8027164340019226,
"learning_rate": 0.00016801007556675062,
"loss": 0.9149,
"step": 1335
},
{
"epoch": 0.16878168592751205,
"grad_norm": 0.7476945519447327,
"learning_rate": 0.00016863979848866497,
"loss": 0.9241,
"step": 1340
},
{
"epoch": 0.16941146833768933,
"grad_norm": 0.7967312335968018,
"learning_rate": 0.00016926952141057934,
"loss": 0.932,
"step": 1345
},
{
"epoch": 0.1700412507478666,
"grad_norm": 0.809727668762207,
"learning_rate": 0.00016989924433249368,
"loss": 0.922,
"step": 1350
},
{
"epoch": 0.17067103315804388,
"grad_norm": 0.7631811499595642,
"learning_rate": 0.00017052896725440806,
"loss": 0.94,
"step": 1355
},
{
"epoch": 0.1713008155682212,
"grad_norm": 0.6545524001121521,
"learning_rate": 0.0001711586901763224,
"loss": 0.8898,
"step": 1360
},
{
"epoch": 0.17193059797839846,
"grad_norm": 0.8232229351997375,
"learning_rate": 0.00017178841309823675,
"loss": 0.9235,
"step": 1365
},
{
"epoch": 0.17256038038857574,
"grad_norm": 0.8617391586303711,
"learning_rate": 0.00017241813602015112,
"loss": 0.9491,
"step": 1370
},
{
"epoch": 0.17319016279875304,
"grad_norm": 0.7971004247665405,
"learning_rate": 0.00017304785894206546,
"loss": 0.8749,
"step": 1375
},
{
"epoch": 0.17381994520893032,
"grad_norm": 0.7876558899879456,
"learning_rate": 0.0001736775818639798,
"loss": 0.9954,
"step": 1380
},
{
"epoch": 0.1744497276191076,
"grad_norm": 0.8051108121871948,
"learning_rate": 0.00017430730478589418,
"loss": 0.897,
"step": 1385
},
{
"epoch": 0.17507951002928487,
"grad_norm": 0.8449770212173462,
"learning_rate": 0.00017493702770780855,
"loss": 0.8881,
"step": 1390
},
{
"epoch": 0.17570929243946218,
"grad_norm": 0.8217072486877441,
"learning_rate": 0.00017556675062972292,
"loss": 0.9027,
"step": 1395
},
{
"epoch": 0.17633907484963945,
"grad_norm": 0.806914210319519,
"learning_rate": 0.00017619647355163727,
"loss": 0.9222,
"step": 1400
},
{
"epoch": 0.17696885725981673,
"grad_norm": 0.8344951868057251,
"learning_rate": 0.00017682619647355164,
"loss": 0.9462,
"step": 1405
},
{
"epoch": 0.177598639669994,
"grad_norm": 0.7249205112457275,
"learning_rate": 0.00017745591939546598,
"loss": 0.917,
"step": 1410
},
{
"epoch": 0.1782284220801713,
"grad_norm": 0.8052341341972351,
"learning_rate": 0.00017808564231738033,
"loss": 0.9168,
"step": 1415
},
{
"epoch": 0.1788582044903486,
"grad_norm": 0.7675748467445374,
"learning_rate": 0.0001787153652392947,
"loss": 0.9186,
"step": 1420
},
{
"epoch": 0.17948798690052586,
"grad_norm": 0.7672801613807678,
"learning_rate": 0.00017934508816120904,
"loss": 0.8637,
"step": 1425
},
{
"epoch": 0.18011776931070314,
"grad_norm": 0.7517289519309998,
"learning_rate": 0.00017997481108312342,
"loss": 0.9053,
"step": 1430
},
{
"epoch": 0.18074755172088044,
"grad_norm": 0.7253280878067017,
"learning_rate": 0.00018060453400503776,
"loss": 0.9047,
"step": 1435
},
{
"epoch": 0.18137733413105772,
"grad_norm": 0.7113356590270996,
"learning_rate": 0.0001812342569269521,
"loss": 0.9288,
"step": 1440
},
{
"epoch": 0.182007116541235,
"grad_norm": 0.6626010537147522,
"learning_rate": 0.00018186397984886648,
"loss": 0.8942,
"step": 1445
},
{
"epoch": 0.18263689895141227,
"grad_norm": 0.7033849358558655,
"learning_rate": 0.00018249370277078082,
"loss": 0.9086,
"step": 1450
},
{
"epoch": 0.18326668136158958,
"grad_norm": 0.701263427734375,
"learning_rate": 0.00018312342569269522,
"loss": 0.9429,
"step": 1455
},
{
"epoch": 0.18389646377176685,
"grad_norm": 0.7362795472145081,
"learning_rate": 0.00018375314861460957,
"loss": 0.8955,
"step": 1460
},
{
"epoch": 0.18452624618194413,
"grad_norm": 0.7902641296386719,
"learning_rate": 0.00018438287153652394,
"loss": 0.8535,
"step": 1465
},
{
"epoch": 0.18515602859212144,
"grad_norm": 0.6855788230895996,
"learning_rate": 0.00018501259445843828,
"loss": 0.9166,
"step": 1470
},
{
"epoch": 0.1857858110022987,
"grad_norm": 0.6782147884368896,
"learning_rate": 0.00018564231738035263,
"loss": 0.8755,
"step": 1475
},
{
"epoch": 0.186415593412476,
"grad_norm": 0.6875694990158081,
"learning_rate": 0.000186272040302267,
"loss": 0.8712,
"step": 1480
},
{
"epoch": 0.18704537582265326,
"grad_norm": 0.7253673672676086,
"learning_rate": 0.00018690176322418134,
"loss": 0.8933,
"step": 1485
},
{
"epoch": 0.18767515823283057,
"grad_norm": 0.8096954822540283,
"learning_rate": 0.0001875314861460957,
"loss": 0.9359,
"step": 1490
},
{
"epoch": 0.18830494064300785,
"grad_norm": 0.7597787380218506,
"learning_rate": 0.00018816120906801006,
"loss": 0.9341,
"step": 1495
},
{
"epoch": 0.18893472305318512,
"grad_norm": 0.7736676931381226,
"learning_rate": 0.0001887909319899244,
"loss": 0.9281,
"step": 1500
},
{
"epoch": 0.1895645054633624,
"grad_norm": 0.6343753337860107,
"learning_rate": 0.00018942065491183878,
"loss": 0.9177,
"step": 1505
},
{
"epoch": 0.1901942878735397,
"grad_norm": 0.728712260723114,
"learning_rate": 0.00019005037783375312,
"loss": 0.9371,
"step": 1510
},
{
"epoch": 0.19082407028371698,
"grad_norm": 0.7092194557189941,
"learning_rate": 0.00019068010075566746,
"loss": 0.8902,
"step": 1515
},
{
"epoch": 0.19145385269389426,
"grad_norm": 0.7485836744308472,
"learning_rate": 0.00019130982367758186,
"loss": 0.8931,
"step": 1520
},
{
"epoch": 0.19208363510407153,
"grad_norm": 0.7485086917877197,
"learning_rate": 0.0001919395465994962,
"loss": 0.9368,
"step": 1525
},
{
"epoch": 0.19271341751424884,
"grad_norm": 0.7100546360015869,
"learning_rate": 0.00019256926952141058,
"loss": 0.8803,
"step": 1530
},
{
"epoch": 0.1933431999244261,
"grad_norm": 0.7371817827224731,
"learning_rate": 0.00019319899244332492,
"loss": 0.8963,
"step": 1535
},
{
"epoch": 0.1939729823346034,
"grad_norm": 0.6849647164344788,
"learning_rate": 0.0001938287153652393,
"loss": 0.9137,
"step": 1540
},
{
"epoch": 0.19460276474478067,
"grad_norm": 0.7108625173568726,
"learning_rate": 0.00019445843828715364,
"loss": 0.9078,
"step": 1545
},
{
"epoch": 0.19523254715495797,
"grad_norm": 0.7581806182861328,
"learning_rate": 0.00019508816120906799,
"loss": 0.9002,
"step": 1550
},
{
"epoch": 0.19586232956513525,
"grad_norm": 0.7299503087997437,
"learning_rate": 0.00019571788413098236,
"loss": 0.8897,
"step": 1555
},
{
"epoch": 0.19649211197531252,
"grad_norm": 0.7815247774124146,
"learning_rate": 0.0001963476070528967,
"loss": 0.8454,
"step": 1560
},
{
"epoch": 0.19712189438548983,
"grad_norm": 0.7475869655609131,
"learning_rate": 0.00019697732997481105,
"loss": 0.9482,
"step": 1565
},
{
"epoch": 0.1977516767956671,
"grad_norm": 0.7469599843025208,
"learning_rate": 0.00019760705289672542,
"loss": 0.9048,
"step": 1570
},
{
"epoch": 0.19838145920584438,
"grad_norm": 0.6186767220497131,
"learning_rate": 0.00019823677581863976,
"loss": 0.8698,
"step": 1575
},
{
"epoch": 0.19901124161602166,
"grad_norm": 0.843999445438385,
"learning_rate": 0.00019886649874055413,
"loss": 0.9567,
"step": 1580
},
{
"epoch": 0.19964102402619896,
"grad_norm": 0.749344527721405,
"learning_rate": 0.00019949622166246848,
"loss": 0.9234,
"step": 1585
},
{
"epoch": 0.20027080643637624,
"grad_norm": 0.6822441220283508,
"learning_rate": 0.00020012594458438288,
"loss": 0.8915,
"step": 1590
},
{
"epoch": 0.2009005888465535,
"grad_norm": 0.7193272113800049,
"learning_rate": 0.00020075566750629722,
"loss": 0.8922,
"step": 1595
},
{
"epoch": 0.2015303712567308,
"grad_norm": 0.7202250361442566,
"learning_rate": 0.00020138539042821157,
"loss": 0.9026,
"step": 1600
},
{
"epoch": 0.2021601536669081,
"grad_norm": 0.6946163773536682,
"learning_rate": 0.00020201511335012594,
"loss": 0.9181,
"step": 1605
},
{
"epoch": 0.20278993607708537,
"grad_norm": 0.7185525894165039,
"learning_rate": 0.00020264483627204028,
"loss": 0.8809,
"step": 1610
},
{
"epoch": 0.20341971848726265,
"grad_norm": 0.6290002465248108,
"learning_rate": 0.00020327455919395466,
"loss": 0.9033,
"step": 1615
},
{
"epoch": 0.20404950089743992,
"grad_norm": 0.6773431897163391,
"learning_rate": 0.000203904282115869,
"loss": 0.838,
"step": 1620
},
{
"epoch": 0.20467928330761723,
"grad_norm": 0.7076095342636108,
"learning_rate": 0.00020453400503778335,
"loss": 0.9158,
"step": 1625
},
{
"epoch": 0.2053090657177945,
"grad_norm": 0.7354462146759033,
"learning_rate": 0.00020516372795969772,
"loss": 0.8336,
"step": 1630
},
{
"epoch": 0.20593884812797178,
"grad_norm": 0.6885705590248108,
"learning_rate": 0.00020579345088161206,
"loss": 0.8971,
"step": 1635
},
{
"epoch": 0.20656863053814906,
"grad_norm": 0.697887659072876,
"learning_rate": 0.00020642317380352643,
"loss": 0.851,
"step": 1640
},
{
"epoch": 0.20719841294832636,
"grad_norm": 0.7369652986526489,
"learning_rate": 0.00020705289672544078,
"loss": 0.8567,
"step": 1645
},
{
"epoch": 0.20782819535850364,
"grad_norm": 0.7226613759994507,
"learning_rate": 0.00020768261964735512,
"loss": 0.9038,
"step": 1650
},
{
"epoch": 0.2084579777686809,
"grad_norm": 0.6973157525062561,
"learning_rate": 0.00020831234256926952,
"loss": 0.8443,
"step": 1655
},
{
"epoch": 0.20908776017885822,
"grad_norm": 0.7276191115379333,
"learning_rate": 0.00020894206549118387,
"loss": 0.8985,
"step": 1660
},
{
"epoch": 0.2097175425890355,
"grad_norm": 0.694542646408081,
"learning_rate": 0.00020957178841309824,
"loss": 0.8914,
"step": 1665
},
{
"epoch": 0.21034732499921277,
"grad_norm": 0.8255221843719482,
"learning_rate": 0.00021020151133501258,
"loss": 0.9072,
"step": 1670
},
{
"epoch": 0.21097710740939005,
"grad_norm": 0.637487530708313,
"learning_rate": 0.00021083123425692693,
"loss": 0.8637,
"step": 1675
},
{
"epoch": 0.21160688981956735,
"grad_norm": 0.6839597821235657,
"learning_rate": 0.0002114609571788413,
"loss": 0.8736,
"step": 1680
},
{
"epoch": 0.21223667222974463,
"grad_norm": 0.6435440182685852,
"learning_rate": 0.00021209068010075564,
"loss": 0.8725,
"step": 1685
},
{
"epoch": 0.2128664546399219,
"grad_norm": 0.7100492715835571,
"learning_rate": 0.00021272040302267002,
"loss": 0.9169,
"step": 1690
},
{
"epoch": 0.21349623705009918,
"grad_norm": 0.6926056742668152,
"learning_rate": 0.00021335012594458436,
"loss": 0.8549,
"step": 1695
},
{
"epoch": 0.21412601946027648,
"grad_norm": 0.8507684469223022,
"learning_rate": 0.0002139798488664987,
"loss": 0.9011,
"step": 1700
},
{
"epoch": 0.21475580187045376,
"grad_norm": 0.7276325821876526,
"learning_rate": 0.00021460957178841308,
"loss": 0.8607,
"step": 1705
},
{
"epoch": 0.21538558428063104,
"grad_norm": 0.6535823941230774,
"learning_rate": 0.00021523929471032742,
"loss": 0.8558,
"step": 1710
},
{
"epoch": 0.2160153666908083,
"grad_norm": 0.6517070531845093,
"learning_rate": 0.0002158690176322418,
"loss": 0.8703,
"step": 1715
},
{
"epoch": 0.21664514910098562,
"grad_norm": 0.7442309260368347,
"learning_rate": 0.00021649874055415614,
"loss": 0.8961,
"step": 1720
},
{
"epoch": 0.2172749315111629,
"grad_norm": 0.7261196374893188,
"learning_rate": 0.00021712846347607054,
"loss": 0.8902,
"step": 1725
},
{
"epoch": 0.21790471392134017,
"grad_norm": 0.7019686698913574,
"learning_rate": 0.00021775818639798488,
"loss": 0.8929,
"step": 1730
},
{
"epoch": 0.21853449633151745,
"grad_norm": 0.7852956056594849,
"learning_rate": 0.00021838790931989923,
"loss": 0.8766,
"step": 1735
},
{
"epoch": 0.21916427874169475,
"grad_norm": 0.7370544672012329,
"learning_rate": 0.0002190176322418136,
"loss": 0.855,
"step": 1740
},
{
"epoch": 0.21979406115187203,
"grad_norm": 0.6246267557144165,
"learning_rate": 0.00021964735516372794,
"loss": 0.9127,
"step": 1745
},
{
"epoch": 0.2204238435620493,
"grad_norm": 0.6939797401428223,
"learning_rate": 0.0002202770780856423,
"loss": 0.8878,
"step": 1750
},
{
"epoch": 0.2210536259722266,
"grad_norm": 0.6594600081443787,
"learning_rate": 0.00022090680100755666,
"loss": 0.9105,
"step": 1755
},
{
"epoch": 0.22168340838240388,
"grad_norm": 0.6578107476234436,
"learning_rate": 0.000221536523929471,
"loss": 0.9016,
"step": 1760
},
{
"epoch": 0.22231319079258116,
"grad_norm": 0.6889748573303223,
"learning_rate": 0.00022216624685138538,
"loss": 0.9091,
"step": 1765
},
{
"epoch": 0.22294297320275844,
"grad_norm": 0.6207224130630493,
"learning_rate": 0.00022279596977329972,
"loss": 0.9058,
"step": 1770
},
{
"epoch": 0.22357275561293574,
"grad_norm": 0.6724773645401001,
"learning_rate": 0.00022342569269521406,
"loss": 0.9144,
"step": 1775
},
{
"epoch": 0.22420253802311302,
"grad_norm": 0.702472448348999,
"learning_rate": 0.00022405541561712844,
"loss": 0.9,
"step": 1780
},
{
"epoch": 0.2248323204332903,
"grad_norm": 0.6482950448989868,
"learning_rate": 0.00022468513853904278,
"loss": 0.88,
"step": 1785
},
{
"epoch": 0.22546210284346757,
"grad_norm": 0.7253268957138062,
"learning_rate": 0.00022531486146095718,
"loss": 0.9147,
"step": 1790
},
{
"epoch": 0.22609188525364488,
"grad_norm": 0.7196680307388306,
"learning_rate": 0.00022594458438287152,
"loss": 0.8687,
"step": 1795
},
{
"epoch": 0.22672166766382215,
"grad_norm": 0.6720924973487854,
"learning_rate": 0.0002265743073047859,
"loss": 0.9173,
"step": 1800
},
{
"epoch": 0.22735145007399943,
"grad_norm": 0.6656882762908936,
"learning_rate": 0.00022720403022670024,
"loss": 0.8237,
"step": 1805
},
{
"epoch": 0.2279812324841767,
"grad_norm": 0.6303510665893555,
"learning_rate": 0.00022783375314861459,
"loss": 0.891,
"step": 1810
},
{
"epoch": 0.228611014894354,
"grad_norm": 0.6595205068588257,
"learning_rate": 0.00022846347607052896,
"loss": 0.8745,
"step": 1815
},
{
"epoch": 0.22924079730453129,
"grad_norm": 0.6373685002326965,
"learning_rate": 0.0002290931989924433,
"loss": 0.895,
"step": 1820
},
{
"epoch": 0.22987057971470856,
"grad_norm": 0.6187670230865479,
"learning_rate": 0.00022972292191435767,
"loss": 0.8954,
"step": 1825
},
{
"epoch": 0.23050036212488584,
"grad_norm": 0.6348496079444885,
"learning_rate": 0.00023035264483627202,
"loss": 0.8462,
"step": 1830
},
{
"epoch": 0.23113014453506314,
"grad_norm": 0.6880120038986206,
"learning_rate": 0.00023098236775818636,
"loss": 0.883,
"step": 1835
},
{
"epoch": 0.23175992694524042,
"grad_norm": 0.7668615579605103,
"learning_rate": 0.00023161209068010073,
"loss": 0.9134,
"step": 1840
},
{
"epoch": 0.2323897093554177,
"grad_norm": 0.6664952635765076,
"learning_rate": 0.00023224181360201508,
"loss": 0.9276,
"step": 1845
},
{
"epoch": 0.233019491765595,
"grad_norm": 0.754509449005127,
"learning_rate": 0.00023287153652392942,
"loss": 0.858,
"step": 1850
},
{
"epoch": 0.23364927417577228,
"grad_norm": 0.6345789432525635,
"learning_rate": 0.00023350125944584382,
"loss": 0.9048,
"step": 1855
},
{
"epoch": 0.23427905658594955,
"grad_norm": 0.6877152323722839,
"learning_rate": 0.00023413098236775817,
"loss": 0.9023,
"step": 1860
},
{
"epoch": 0.23490883899612683,
"grad_norm": 0.6173678636550903,
"learning_rate": 0.00023476070528967254,
"loss": 0.8951,
"step": 1865
},
{
"epoch": 0.23553862140630413,
"grad_norm": 0.6912857294082642,
"learning_rate": 0.00023539042821158688,
"loss": 0.8412,
"step": 1870
},
{
"epoch": 0.2361684038164814,
"grad_norm": 0.6385686993598938,
"learning_rate": 0.00023602015113350126,
"loss": 0.8954,
"step": 1875
},
{
"epoch": 0.23679818622665869,
"grad_norm": 0.6755088567733765,
"learning_rate": 0.0002366498740554156,
"loss": 0.8964,
"step": 1880
},
{
"epoch": 0.23742796863683596,
"grad_norm": 0.6391545534133911,
"learning_rate": 0.00023727959697732995,
"loss": 0.9294,
"step": 1885
},
{
"epoch": 0.23805775104701327,
"grad_norm": 0.7155817747116089,
"learning_rate": 0.00023790931989924432,
"loss": 0.8967,
"step": 1890
},
{
"epoch": 0.23868753345719054,
"grad_norm": 0.681224524974823,
"learning_rate": 0.00023853904282115866,
"loss": 0.8997,
"step": 1895
},
{
"epoch": 0.23931731586736782,
"grad_norm": 0.6473144888877869,
"learning_rate": 0.00023916876574307303,
"loss": 0.9172,
"step": 1900
},
{
"epoch": 0.2399470982775451,
"grad_norm": 0.6562004685401917,
"learning_rate": 0.00023979848866498738,
"loss": 0.8488,
"step": 1905
},
{
"epoch": 0.2405768806877224,
"grad_norm": 0.6842007637023926,
"learning_rate": 0.00024042821158690172,
"loss": 0.9183,
"step": 1910
},
{
"epoch": 0.24120666309789968,
"grad_norm": 0.5957079529762268,
"learning_rate": 0.0002410579345088161,
"loss": 0.8293,
"step": 1915
},
{
"epoch": 0.24183644550807695,
"grad_norm": 0.6745590567588806,
"learning_rate": 0.00024168765743073044,
"loss": 0.8219,
"step": 1920
},
{
"epoch": 0.24246622791825426,
"grad_norm": 0.6895525455474854,
"learning_rate": 0.00024231738035264484,
"loss": 0.9034,
"step": 1925
},
{
"epoch": 0.24309601032843153,
"grad_norm": 0.7394620776176453,
"learning_rate": 0.00024294710327455918,
"loss": 0.8702,
"step": 1930
},
{
"epoch": 0.2437257927386088,
"grad_norm": 0.7846884727478027,
"learning_rate": 0.00024357682619647353,
"loss": 0.9143,
"step": 1935
},
{
"epoch": 0.24435557514878609,
"grad_norm": 0.594127893447876,
"learning_rate": 0.00024420654911838787,
"loss": 0.8838,
"step": 1940
},
{
"epoch": 0.2449853575589634,
"grad_norm": 0.6737518906593323,
"learning_rate": 0.00024483627204030224,
"loss": 0.8657,
"step": 1945
},
{
"epoch": 0.24561513996914067,
"grad_norm": 0.6851866245269775,
"learning_rate": 0.0002454659949622166,
"loss": 0.9133,
"step": 1950
},
{
"epoch": 0.24624492237931794,
"grad_norm": 0.6238758563995361,
"learning_rate": 0.000246095717884131,
"loss": 0.8816,
"step": 1955
},
{
"epoch": 0.24687470478949522,
"grad_norm": 0.6002854704856873,
"learning_rate": 0.0002467254408060453,
"loss": 0.8888,
"step": 1960
},
{
"epoch": 0.24750448719967252,
"grad_norm": 0.6201847791671753,
"learning_rate": 0.0002473551637279597,
"loss": 0.8299,
"step": 1965
},
{
"epoch": 0.2481342696098498,
"grad_norm": 0.6619172692298889,
"learning_rate": 0.00024798488664987405,
"loss": 0.9297,
"step": 1970
},
{
"epoch": 0.24876405202002708,
"grad_norm": 0.6359203457832336,
"learning_rate": 0.00024861460957178837,
"loss": 0.8811,
"step": 1975
},
{
"epoch": 0.24939383443020435,
"grad_norm": 0.6441104412078857,
"learning_rate": 0.00024924433249370274,
"loss": 0.8704,
"step": 1980
},
{
"epoch": 0.25002361684038166,
"grad_norm": 0.7083386778831482,
"learning_rate": 0.0002498740554156171,
"loss": 0.8877,
"step": 1985
},
{
"epoch": 0.2506533992505589,
"grad_norm": 0.642206072807312,
"learning_rate": 0.0002505037783375315,
"loss": 0.8661,
"step": 1990
},
{
"epoch": 0.2512831816607362,
"grad_norm": 0.6782190203666687,
"learning_rate": 0.00025113350125944585,
"loss": 0.901,
"step": 1995
},
{
"epoch": 0.2519129640709135,
"grad_norm": 0.6277428269386292,
"learning_rate": 0.00025176322418136017,
"loss": 0.8212,
"step": 2000
},
{
"epoch": 0.2519129640709135,
"eval_loss": 0.35735848546028137,
"eval_runtime": 6.2326,
"eval_samples_per_second": 160.447,
"eval_steps_per_second": 10.108,
"step": 2000
},
{
"epoch": 0.25254274648109076,
"grad_norm": 0.5980456471443176,
"learning_rate": 0.00025239294710327454,
"loss": 0.8808,
"step": 2005
},
{
"epoch": 0.25317252889126807,
"grad_norm": 0.6398759484291077,
"learning_rate": 0.0002530226700251889,
"loss": 0.8817,
"step": 2010
},
{
"epoch": 0.25380231130144537,
"grad_norm": 0.5681187510490417,
"learning_rate": 0.00025365239294710323,
"loss": 0.8672,
"step": 2015
},
{
"epoch": 0.2544320937116226,
"grad_norm": 0.6202912926673889,
"learning_rate": 0.0002542821158690176,
"loss": 0.8627,
"step": 2020
},
{
"epoch": 0.2550618761217999,
"grad_norm": 0.5921783447265625,
"learning_rate": 0.000254911838790932,
"loss": 0.8214,
"step": 2025
},
{
"epoch": 0.25569165853197723,
"grad_norm": 0.629782497882843,
"learning_rate": 0.00025554156171284635,
"loss": 0.8995,
"step": 2030
},
{
"epoch": 0.2563214409421545,
"grad_norm": 0.6545585989952087,
"learning_rate": 0.00025617128463476066,
"loss": 0.8422,
"step": 2035
},
{
"epoch": 0.2569512233523318,
"grad_norm": 0.6024030447006226,
"learning_rate": 0.00025680100755667504,
"loss": 0.8341,
"step": 2040
},
{
"epoch": 0.25758100576250903,
"grad_norm": 0.6795976161956787,
"learning_rate": 0.0002574307304785894,
"loss": 0.852,
"step": 2045
},
{
"epoch": 0.25821078817268633,
"grad_norm": 0.6465495228767395,
"learning_rate": 0.0002580604534005037,
"loss": 0.8514,
"step": 2050
},
{
"epoch": 0.25884057058286364,
"grad_norm": 0.6498434543609619,
"learning_rate": 0.0002586901763224181,
"loss": 0.8906,
"step": 2055
},
{
"epoch": 0.2594703529930409,
"grad_norm": 0.7072421908378601,
"learning_rate": 0.00025931989924433247,
"loss": 0.9061,
"step": 2060
},
{
"epoch": 0.2601001354032182,
"grad_norm": 0.5902896523475647,
"learning_rate": 0.00025994962216624684,
"loss": 0.8327,
"step": 2065
},
{
"epoch": 0.2607299178133955,
"grad_norm": 0.6410335302352905,
"learning_rate": 0.0002605793450881612,
"loss": 0.9002,
"step": 2070
},
{
"epoch": 0.26135970022357274,
"grad_norm": 1.628951072692871,
"learning_rate": 0.00026120906801007553,
"loss": 0.8944,
"step": 2075
},
{
"epoch": 0.26198948263375005,
"grad_norm": 0.6544843316078186,
"learning_rate": 0.0002618387909319899,
"loss": 0.8656,
"step": 2080
},
{
"epoch": 0.2626192650439273,
"grad_norm": 15.444189071655273,
"learning_rate": 0.0002624685138539043,
"loss": 0.9639,
"step": 2085
},
{
"epoch": 0.2632490474541046,
"grad_norm": 8.399425506591797,
"learning_rate": 0.0002630982367758186,
"loss": 1.1367,
"step": 2090
},
{
"epoch": 0.2638788298642819,
"grad_norm": 24.009044647216797,
"learning_rate": 0.00026372795969773296,
"loss": 1.1429,
"step": 2095
},
{
"epoch": 0.26450861227445915,
"grad_norm": 4.226770877838135,
"learning_rate": 0.00026435768261964733,
"loss": 0.974,
"step": 2100
},
{
"epoch": 0.26513839468463646,
"grad_norm": 1.0910799503326416,
"learning_rate": 0.0002649874055415617,
"loss": 1.0182,
"step": 2105
},
{
"epoch": 0.26576817709481376,
"grad_norm": 2.8835411071777344,
"learning_rate": 0.000265617128463476,
"loss": 1.0283,
"step": 2110
},
{
"epoch": 0.266397959504991,
"grad_norm": 2.8626575469970703,
"learning_rate": 0.0002662468513853904,
"loss": 0.9273,
"step": 2115
},
{
"epoch": 0.2670277419151683,
"grad_norm": 1.4587650299072266,
"learning_rate": 0.00026687657430730477,
"loss": 0.9578,
"step": 2120
},
{
"epoch": 0.2676575243253456,
"grad_norm": 0.7692992091178894,
"learning_rate": 0.00026750629722921914,
"loss": 0.8701,
"step": 2125
},
{
"epoch": 0.26828730673552287,
"grad_norm": 0.8609071373939514,
"learning_rate": 0.0002681360201511335,
"loss": 0.8718,
"step": 2130
},
{
"epoch": 0.26891708914570017,
"grad_norm": 0.7419576048851013,
"learning_rate": 0.00026876574307304783,
"loss": 0.8732,
"step": 2135
},
{
"epoch": 0.2695468715558774,
"grad_norm": 0.8134281635284424,
"learning_rate": 0.0002693954659949622,
"loss": 0.9112,
"step": 2140
},
{
"epoch": 0.2701766539660547,
"grad_norm": 0.7559547424316406,
"learning_rate": 0.00027002518891687657,
"loss": 0.8804,
"step": 2145
},
{
"epoch": 0.27080643637623203,
"grad_norm": 0.7497460842132568,
"learning_rate": 0.0002706549118387909,
"loss": 0.8439,
"step": 2150
},
{
"epoch": 0.2714362187864093,
"grad_norm": 0.775444746017456,
"learning_rate": 0.00027128463476070526,
"loss": 0.888,
"step": 2155
},
{
"epoch": 0.2720660011965866,
"grad_norm": 0.7074035406112671,
"learning_rate": 0.00027191435768261963,
"loss": 0.8628,
"step": 2160
},
{
"epoch": 0.2726957836067639,
"grad_norm": 0.730311393737793,
"learning_rate": 0.00027254408060453395,
"loss": 0.8908,
"step": 2165
},
{
"epoch": 0.27332556601694113,
"grad_norm": 0.7610625624656677,
"learning_rate": 0.0002731738035264483,
"loss": 0.8954,
"step": 2170
},
{
"epoch": 0.27395534842711844,
"grad_norm": 0.6473423838615417,
"learning_rate": 0.0002738035264483627,
"loss": 0.8488,
"step": 2175
},
{
"epoch": 0.2745851308372957,
"grad_norm": 0.7084975838661194,
"learning_rate": 0.00027443324937027707,
"loss": 0.8631,
"step": 2180
},
{
"epoch": 0.275214913247473,
"grad_norm": 0.6844817996025085,
"learning_rate": 0.0002750629722921914,
"loss": 0.9021,
"step": 2185
},
{
"epoch": 0.2758446956576503,
"grad_norm": 0.641327440738678,
"learning_rate": 0.0002756926952141058,
"loss": 0.9002,
"step": 2190
},
{
"epoch": 0.27647447806782754,
"grad_norm": 0.7175489664077759,
"learning_rate": 0.0002763224181360201,
"loss": 0.8794,
"step": 2195
},
{
"epoch": 0.27710426047800485,
"grad_norm": 0.6306767463684082,
"learning_rate": 0.0002769521410579345,
"loss": 0.8732,
"step": 2200
},
{
"epoch": 0.27773404288818215,
"grad_norm": 0.6501113176345825,
"learning_rate": 0.00027758186397984887,
"loss": 0.8725,
"step": 2205
},
{
"epoch": 0.2783638252983594,
"grad_norm": 0.5996410250663757,
"learning_rate": 0.0002782115869017632,
"loss": 0.8828,
"step": 2210
},
{
"epoch": 0.2789936077085367,
"grad_norm": 0.6551349759101868,
"learning_rate": 0.00027884130982367756,
"loss": 0.8725,
"step": 2215
},
{
"epoch": 0.279623390118714,
"grad_norm": 0.6475560069084167,
"learning_rate": 0.00027947103274559193,
"loss": 0.9333,
"step": 2220
},
{
"epoch": 0.28025317252889126,
"grad_norm": 0.6957899928092957,
"learning_rate": 0.00028010075566750625,
"loss": 0.8933,
"step": 2225
},
{
"epoch": 0.28088295493906856,
"grad_norm": 0.6194736361503601,
"learning_rate": 0.0002807304785894206,
"loss": 0.9268,
"step": 2230
},
{
"epoch": 0.2815127373492458,
"grad_norm": 0.6293075084686279,
"learning_rate": 0.000281360201511335,
"loss": 0.8985,
"step": 2235
},
{
"epoch": 0.2821425197594231,
"grad_norm": 0.6805360317230225,
"learning_rate": 0.0002819899244332493,
"loss": 0.854,
"step": 2240
},
{
"epoch": 0.2827723021696004,
"grad_norm": 0.6671084761619568,
"learning_rate": 0.0002826196473551637,
"loss": 0.8774,
"step": 2245
},
{
"epoch": 0.28340208457977767,
"grad_norm": 0.5680047273635864,
"learning_rate": 0.00028324937027707805,
"loss": 0.8273,
"step": 2250
},
{
"epoch": 0.284031866989955,
"grad_norm": 0.5691477060317993,
"learning_rate": 0.0002838790931989924,
"loss": 0.8633,
"step": 2255
},
{
"epoch": 0.2846616494001323,
"grad_norm": 0.6509323120117188,
"learning_rate": 0.0002845088161209068,
"loss": 0.8991,
"step": 2260
},
{
"epoch": 0.2852914318103095,
"grad_norm": 0.714750349521637,
"learning_rate": 0.00028513853904282117,
"loss": 0.8863,
"step": 2265
},
{
"epoch": 0.28592121422048683,
"grad_norm": 0.6934742331504822,
"learning_rate": 0.0002857682619647355,
"loss": 0.8699,
"step": 2270
},
{
"epoch": 0.2865509966306641,
"grad_norm": 0.6048073172569275,
"learning_rate": 0.00028639798488664986,
"loss": 0.8983,
"step": 2275
},
{
"epoch": 0.2871807790408414,
"grad_norm": 0.6630669236183167,
"learning_rate": 0.00028702770780856423,
"loss": 0.9142,
"step": 2280
},
{
"epoch": 0.2878105614510187,
"grad_norm": 0.6518734693527222,
"learning_rate": 0.00028765743073047855,
"loss": 0.8734,
"step": 2285
},
{
"epoch": 0.28844034386119594,
"grad_norm": 0.5939868688583374,
"learning_rate": 0.0002882871536523929,
"loss": 0.8873,
"step": 2290
},
{
"epoch": 0.28907012627137324,
"grad_norm": 0.6081305742263794,
"learning_rate": 0.0002889168765743073,
"loss": 0.8735,
"step": 2295
},
{
"epoch": 0.28969990868155054,
"grad_norm": 0.5869495272636414,
"learning_rate": 0.0002895465994962216,
"loss": 0.8694,
"step": 2300
},
{
"epoch": 0.2903296910917278,
"grad_norm": 0.6381964683532715,
"learning_rate": 0.000290176322418136,
"loss": 0.8638,
"step": 2305
},
{
"epoch": 0.2909594735019051,
"grad_norm": 0.5546308755874634,
"learning_rate": 0.00029080604534005035,
"loss": 0.8748,
"step": 2310
},
{
"epoch": 0.2915892559120824,
"grad_norm": 0.7318828701972961,
"learning_rate": 0.0002914357682619647,
"loss": 0.8594,
"step": 2315
},
{
"epoch": 0.29221903832225965,
"grad_norm": 0.5685531497001648,
"learning_rate": 0.00029206549118387904,
"loss": 0.8815,
"step": 2320
},
{
"epoch": 0.29284882073243695,
"grad_norm": 0.6351069808006287,
"learning_rate": 0.00029269521410579347,
"loss": 0.8351,
"step": 2325
},
{
"epoch": 0.2934786031426142,
"grad_norm": 0.5828582048416138,
"learning_rate": 0.0002933249370277078,
"loss": 0.8678,
"step": 2330
},
{
"epoch": 0.2941083855527915,
"grad_norm": 0.5991604924201965,
"learning_rate": 0.00029395465994962216,
"loss": 0.8939,
"step": 2335
},
{
"epoch": 0.2947381679629688,
"grad_norm": 0.5732405781745911,
"learning_rate": 0.00029458438287153653,
"loss": 0.8594,
"step": 2340
},
{
"epoch": 0.29536795037314606,
"grad_norm": 0.5813714265823364,
"learning_rate": 0.00029521410579345085,
"loss": 0.8412,
"step": 2345
},
{
"epoch": 0.29599773278332336,
"grad_norm": 0.5281296968460083,
"learning_rate": 0.0002958438287153652,
"loss": 0.9049,
"step": 2350
},
{
"epoch": 0.29662751519350067,
"grad_norm": 0.6491068005561829,
"learning_rate": 0.0002964735516372796,
"loss": 0.8955,
"step": 2355
},
{
"epoch": 0.2972572976036779,
"grad_norm": 0.6236696839332581,
"learning_rate": 0.0002971032745591939,
"loss": 0.8792,
"step": 2360
},
{
"epoch": 0.2978870800138552,
"grad_norm": 0.605625331401825,
"learning_rate": 0.0002977329974811083,
"loss": 0.8448,
"step": 2365
},
{
"epoch": 0.29851686242403247,
"grad_norm": 0.6011054515838623,
"learning_rate": 0.00029836272040302265,
"loss": 0.911,
"step": 2370
},
{
"epoch": 0.2991466448342098,
"grad_norm": 0.5662422180175781,
"learning_rate": 0.00029899244332493697,
"loss": 0.872,
"step": 2375
},
{
"epoch": 0.2997764272443871,
"grad_norm": 0.8375005125999451,
"learning_rate": 0.00029962216624685134,
"loss": 0.7924,
"step": 2380
},
{
"epoch": 0.3004062096545643,
"grad_norm": 0.5916186571121216,
"learning_rate": 0.0002999999935557256,
"loss": 0.9007,
"step": 2385
},
{
"epoch": 0.30103599206474163,
"grad_norm": 0.6436251997947693,
"learning_rate": 0.00029999992105764553,
"loss": 0.8247,
"step": 2390
},
{
"epoch": 0.30166577447491894,
"grad_norm": 0.6368377208709717,
"learning_rate": 0.0002999997680061815,
"loss": 0.891,
"step": 2395
},
{
"epoch": 0.3022955568850962,
"grad_norm": 0.5848705172538757,
"learning_rate": 0.0002999995344014156,
"loss": 0.8335,
"step": 2400
},
{
"epoch": 0.3029253392952735,
"grad_norm": 0.5829634070396423,
"learning_rate": 0.0002999992202434735,
"loss": 0.8705,
"step": 2405
},
{
"epoch": 0.3035551217054508,
"grad_norm": 0.6242154240608215,
"learning_rate": 0.0002999988255325237,
"loss": 0.8819,
"step": 2410
},
{
"epoch": 0.30418490411562804,
"grad_norm": 0.5757481455802917,
"learning_rate": 0.0002999983502687783,
"loss": 0.8748,
"step": 2415
},
{
"epoch": 0.30481468652580535,
"grad_norm": 0.5024969577789307,
"learning_rate": 0.00029999779445249243,
"loss": 0.8534,
"step": 2420
},
{
"epoch": 0.3054444689359826,
"grad_norm": 0.5515364408493042,
"learning_rate": 0.00029999715808396463,
"loss": 0.8535,
"step": 2425
},
{
"epoch": 0.3060742513461599,
"grad_norm": 0.5151112079620361,
"learning_rate": 0.00029999644116353666,
"loss": 0.8686,
"step": 2430
},
{
"epoch": 0.3067040337563372,
"grad_norm": 0.5231375098228455,
"learning_rate": 0.0002999956436915935,
"loss": 0.8465,
"step": 2435
},
{
"epoch": 0.30733381616651445,
"grad_norm": 0.5415048003196716,
"learning_rate": 0.0002999947656685634,
"loss": 0.853,
"step": 2440
},
{
"epoch": 0.30796359857669175,
"grad_norm": 0.5642004609107971,
"learning_rate": 0.00029999380709491794,
"loss": 0.8827,
"step": 2445
},
{
"epoch": 0.30859338098686906,
"grad_norm": 0.6197057366371155,
"learning_rate": 0.0002999927679711718,
"loss": 0.9072,
"step": 2450
},
{
"epoch": 0.3092231633970463,
"grad_norm": 0.5865146517753601,
"learning_rate": 0.0002999916482978831,
"loss": 0.837,
"step": 2455
},
{
"epoch": 0.3098529458072236,
"grad_norm": 0.5961802005767822,
"learning_rate": 0.0002999904480756531,
"loss": 0.8657,
"step": 2460
},
{
"epoch": 0.31048272821740086,
"grad_norm": 0.5736685395240784,
"learning_rate": 0.0002999891673051263,
"loss": 0.872,
"step": 2465
},
{
"epoch": 0.31111251062757816,
"grad_norm": 0.5412915945053101,
"learning_rate": 0.0002999878059869905,
"loss": 0.8327,
"step": 2470
},
{
"epoch": 0.31174229303775547,
"grad_norm": 0.5011366605758667,
"learning_rate": 0.0002999863641219769,
"loss": 0.8418,
"step": 2475
},
{
"epoch": 0.3123720754479327,
"grad_norm": 0.5566514134407043,
"learning_rate": 0.0002999848417108597,
"loss": 0.8768,
"step": 2480
},
{
"epoch": 0.31300185785811,
"grad_norm": 0.5639830231666565,
"learning_rate": 0.0002999832387544564,
"loss": 0.8178,
"step": 2485
},
{
"epoch": 0.3136316402682873,
"grad_norm": 0.5784679055213928,
"learning_rate": 0.000299981555253628,
"loss": 0.8698,
"step": 2490
},
{
"epoch": 0.3142614226784646,
"grad_norm": 0.5428637266159058,
"learning_rate": 0.00029997979120927846,
"loss": 0.8671,
"step": 2495
},
{
"epoch": 0.3148912050886419,
"grad_norm": 0.5629287362098694,
"learning_rate": 0.00029997794662235515,
"loss": 0.873,
"step": 2500
},
{
"epoch": 0.3155209874988192,
"grad_norm": 0.5561172366142273,
"learning_rate": 0.00029997602149384856,
"loss": 0.8664,
"step": 2505
},
{
"epoch": 0.31615076990899643,
"grad_norm": 0.5451831221580505,
"learning_rate": 0.0002999740158247927,
"loss": 0.8349,
"step": 2510
},
{
"epoch": 0.31678055231917374,
"grad_norm": 0.5645403861999512,
"learning_rate": 0.00029997192961626456,
"loss": 0.8924,
"step": 2515
},
{
"epoch": 0.317410334729351,
"grad_norm": 0.5120379328727722,
"learning_rate": 0.00029996976286938444,
"loss": 0.8606,
"step": 2520
},
{
"epoch": 0.3180401171395283,
"grad_norm": 0.45988166332244873,
"learning_rate": 0.0002999675155853161,
"loss": 0.8285,
"step": 2525
},
{
"epoch": 0.3186698995497056,
"grad_norm": 0.5446504950523376,
"learning_rate": 0.00029996518776526614,
"loss": 0.8913,
"step": 2530
},
{
"epoch": 0.31929968195988284,
"grad_norm": 0.648369550704956,
"learning_rate": 0.00029996277941048485,
"loss": 0.8753,
"step": 2535
},
{
"epoch": 0.31992946437006015,
"grad_norm": 0.6404165029525757,
"learning_rate": 0.0002999602905222655,
"loss": 0.8747,
"step": 2540
},
{
"epoch": 0.32055924678023745,
"grad_norm": 0.46791502833366394,
"learning_rate": 0.0002999577211019447,
"loss": 0.8132,
"step": 2545
},
{
"epoch": 0.3211890291904147,
"grad_norm": 0.5365081429481506,
"learning_rate": 0.00029995507115090225,
"loss": 0.8363,
"step": 2550
},
{
"epoch": 0.321818811600592,
"grad_norm": 0.5029319524765015,
"learning_rate": 0.00029995234067056124,
"loss": 0.8297,
"step": 2555
},
{
"epoch": 0.32244859401076925,
"grad_norm": 0.509843647480011,
"learning_rate": 0.00029994952966238804,
"loss": 0.828,
"step": 2560
},
{
"epoch": 0.32307837642094656,
"grad_norm": 0.479045569896698,
"learning_rate": 0.0002999466381278922,
"loss": 0.8689,
"step": 2565
},
{
"epoch": 0.32370815883112386,
"grad_norm": 0.5639600157737732,
"learning_rate": 0.0002999436660686265,
"loss": 0.8521,
"step": 2570
},
{
"epoch": 0.3243379412413011,
"grad_norm": 0.5077898502349854,
"learning_rate": 0.00029994061348618715,
"loss": 0.835,
"step": 2575
},
{
"epoch": 0.3249677236514784,
"grad_norm": 0.45198580622673035,
"learning_rate": 0.00029993748038221324,
"loss": 0.8394,
"step": 2580
},
{
"epoch": 0.3255975060616557,
"grad_norm": 0.5617688894271851,
"learning_rate": 0.0002999342667583875,
"loss": 0.8285,
"step": 2585
},
{
"epoch": 0.32622728847183297,
"grad_norm": 0.5159285664558411,
"learning_rate": 0.0002999309726164356,
"loss": 0.7654,
"step": 2590
},
{
"epoch": 0.32685707088201027,
"grad_norm": 0.526965320110321,
"learning_rate": 0.00029992759795812666,
"loss": 0.8392,
"step": 2595
},
{
"epoch": 0.3274868532921876,
"grad_norm": 0.4861494302749634,
"learning_rate": 0.0002999241427852729,
"loss": 0.8177,
"step": 2600
},
{
"epoch": 0.3281166357023648,
"grad_norm": 0.5498744249343872,
"learning_rate": 0.0002999206070997298,
"loss": 0.8006,
"step": 2605
},
{
"epoch": 0.3287464181125421,
"grad_norm": 0.526978075504303,
"learning_rate": 0.0002999169909033962,
"loss": 0.8261,
"step": 2610
},
{
"epoch": 0.3293762005227194,
"grad_norm": 0.5078813433647156,
"learning_rate": 0.0002999132941982139,
"loss": 0.8396,
"step": 2615
},
{
"epoch": 0.3300059829328967,
"grad_norm": 0.5390729308128357,
"learning_rate": 0.00029990951698616834,
"loss": 0.8695,
"step": 2620
},
{
"epoch": 0.330635765343074,
"grad_norm": 0.520889401435852,
"learning_rate": 0.00029990565926928787,
"loss": 0.8489,
"step": 2625
},
{
"epoch": 0.33126554775325123,
"grad_norm": 0.6547030210494995,
"learning_rate": 0.00029990172104964413,
"loss": 0.8821,
"step": 2630
},
{
"epoch": 0.33189533016342854,
"grad_norm": 0.5034601092338562,
"learning_rate": 0.00029989770232935204,
"loss": 0.8202,
"step": 2635
},
{
"epoch": 0.33252511257360584,
"grad_norm": 0.5204071402549744,
"learning_rate": 0.0002998936031105698,
"loss": 0.852,
"step": 2640
},
{
"epoch": 0.3331548949837831,
"grad_norm": 0.499221533536911,
"learning_rate": 0.0002998894233954988,
"loss": 0.8338,
"step": 2645
},
{
"epoch": 0.3337846773939604,
"grad_norm": 0.5096358060836792,
"learning_rate": 0.0002998851631863835,
"loss": 0.8149,
"step": 2650
},
{
"epoch": 0.3344144598041377,
"grad_norm": 0.4654362201690674,
"learning_rate": 0.0002998808224855119,
"loss": 0.8461,
"step": 2655
},
{
"epoch": 0.33504424221431495,
"grad_norm": 0.7029035091400146,
"learning_rate": 0.00029987640129521497,
"loss": 0.8137,
"step": 2660
},
{
"epoch": 0.33567402462449225,
"grad_norm": 0.5634217262268066,
"learning_rate": 0.000299871899617867,
"loss": 0.8434,
"step": 2665
},
{
"epoch": 0.3363038070346695,
"grad_norm": 0.5168646574020386,
"learning_rate": 0.0002998673174558855,
"loss": 0.8554,
"step": 2670
},
{
"epoch": 0.3369335894448468,
"grad_norm": 0.4693644344806671,
"learning_rate": 0.00029986265481173123,
"loss": 0.8246,
"step": 2675
},
{
"epoch": 0.3375633718550241,
"grad_norm": 0.44928330183029175,
"learning_rate": 0.00029985791168790805,
"loss": 0.8554,
"step": 2680
},
{
"epoch": 0.33819315426520136,
"grad_norm": 0.5288846492767334,
"learning_rate": 0.0002998530880869632,
"loss": 0.8319,
"step": 2685
},
{
"epoch": 0.33882293667537866,
"grad_norm": 0.4755760431289673,
"learning_rate": 0.00029984818401148706,
"loss": 0.874,
"step": 2690
},
{
"epoch": 0.33945271908555597,
"grad_norm": 0.541684091091156,
"learning_rate": 0.0002998431994641132,
"loss": 0.8526,
"step": 2695
},
{
"epoch": 0.3400825014957332,
"grad_norm": 0.5160995125770569,
"learning_rate": 0.0002998381344475184,
"loss": 0.8749,
"step": 2700
},
{
"epoch": 0.3407122839059105,
"grad_norm": 0.5409444570541382,
"learning_rate": 0.00029983298896442276,
"loss": 0.8118,
"step": 2705
},
{
"epoch": 0.34134206631608777,
"grad_norm": 0.5148081183433533,
"learning_rate": 0.00029982776301758956,
"loss": 0.8685,
"step": 2710
},
{
"epoch": 0.34197184872626507,
"grad_norm": 0.5689860582351685,
"learning_rate": 0.0002998224566098251,
"loss": 0.8476,
"step": 2715
},
{
"epoch": 0.3426016311364424,
"grad_norm": 0.520268440246582,
"learning_rate": 0.00029981706974397917,
"loss": 0.8128,
"step": 2720
},
{
"epoch": 0.3432314135466196,
"grad_norm": 0.49906817078590393,
"learning_rate": 0.00029981160242294457,
"loss": 0.836,
"step": 2725
},
{
"epoch": 0.34386119595679693,
"grad_norm": 0.47317516803741455,
"learning_rate": 0.0002998060546496575,
"loss": 0.8251,
"step": 2730
},
{
"epoch": 0.34449097836697423,
"grad_norm": 0.49573519825935364,
"learning_rate": 0.0002998004264270971,
"loss": 0.832,
"step": 2735
},
{
"epoch": 0.3451207607771515,
"grad_norm": 0.43803608417510986,
"learning_rate": 0.0002997947177582859,
"loss": 0.7875,
"step": 2740
},
{
"epoch": 0.3457505431873288,
"grad_norm": 0.5324883460998535,
"learning_rate": 0.0002997889286462896,
"loss": 0.824,
"step": 2745
},
{
"epoch": 0.3463803255975061,
"grad_norm": 0.5902321934700012,
"learning_rate": 0.00029978305909421707,
"loss": 0.8265,
"step": 2750
},
{
"epoch": 0.34701010800768334,
"grad_norm": 0.5052042007446289,
"learning_rate": 0.0002997771091052204,
"loss": 0.7715,
"step": 2755
},
{
"epoch": 0.34763989041786064,
"grad_norm": 0.4439961314201355,
"learning_rate": 0.0002997710786824949,
"loss": 0.8387,
"step": 2760
},
{
"epoch": 0.3482696728280379,
"grad_norm": 0.5099385976791382,
"learning_rate": 0.0002997649678292789,
"loss": 0.8424,
"step": 2765
},
{
"epoch": 0.3488994552382152,
"grad_norm": 0.4415825605392456,
"learning_rate": 0.00029975877654885426,
"loss": 0.8066,
"step": 2770
},
{
"epoch": 0.3495292376483925,
"grad_norm": 0.5088052153587341,
"learning_rate": 0.0002997525048445458,
"loss": 0.8172,
"step": 2775
},
{
"epoch": 0.35015902005856975,
"grad_norm": 0.5503986477851868,
"learning_rate": 0.00029974615271972146,
"loss": 0.873,
"step": 2780
},
{
"epoch": 0.35078880246874705,
"grad_norm": 0.4609704613685608,
"learning_rate": 0.0002997397201777926,
"loss": 0.8041,
"step": 2785
},
{
"epoch": 0.35141858487892436,
"grad_norm": 0.49494004249572754,
"learning_rate": 0.00029973320722221356,
"loss": 0.916,
"step": 2790
},
{
"epoch": 0.3520483672891016,
"grad_norm": 0.4820273518562317,
"learning_rate": 0.00029972661385648197,
"loss": 0.8597,
"step": 2795
},
{
"epoch": 0.3526781496992789,
"grad_norm": 0.467641681432724,
"learning_rate": 0.0002997199400841386,
"loss": 0.7944,
"step": 2800
},
{
"epoch": 0.35330793210945616,
"grad_norm": 0.49666082859039307,
"learning_rate": 0.00029971318590876745,
"loss": 0.8204,
"step": 2805
},
{
"epoch": 0.35393771451963346,
"grad_norm": 0.4578961133956909,
"learning_rate": 0.00029970635133399565,
"loss": 0.8426,
"step": 2810
},
{
"epoch": 0.35456749692981077,
"grad_norm": 0.48815369606018066,
"learning_rate": 0.00029969943636349363,
"loss": 0.8277,
"step": 2815
},
{
"epoch": 0.355197279339988,
"grad_norm": 0.4686887562274933,
"learning_rate": 0.0002996924410009747,
"loss": 0.7936,
"step": 2820
},
{
"epoch": 0.3558270617501653,
"grad_norm": 0.4245474636554718,
"learning_rate": 0.0002996853652501956,
"loss": 0.8085,
"step": 2825
},
{
"epoch": 0.3564568441603426,
"grad_norm": 0.5085129737854004,
"learning_rate": 0.0002996782091149562,
"loss": 0.8584,
"step": 2830
},
{
"epoch": 0.35708662657051987,
"grad_norm": 0.4415908455848694,
"learning_rate": 0.0002996709725990995,
"loss": 0.8234,
"step": 2835
},
{
"epoch": 0.3577164089806972,
"grad_norm": 0.44018128514289856,
"learning_rate": 0.00029966365570651164,
"loss": 0.8566,
"step": 2840
},
{
"epoch": 0.3583461913908745,
"grad_norm": 0.4675704836845398,
"learning_rate": 0.000299656258441122,
"loss": 0.8164,
"step": 2845
},
{
"epoch": 0.35897597380105173,
"grad_norm": 0.47553756833076477,
"learning_rate": 0.0002996487808069031,
"loss": 0.8177,
"step": 2850
},
{
"epoch": 0.35960575621122903,
"grad_norm": 0.5298905372619629,
"learning_rate": 0.00029964122280787053,
"loss": 0.8537,
"step": 2855
},
{
"epoch": 0.3602355386214063,
"grad_norm": 0.484838604927063,
"learning_rate": 0.0002996335844480832,
"loss": 0.8495,
"step": 2860
},
{
"epoch": 0.3608653210315836,
"grad_norm": 0.4366026818752289,
"learning_rate": 0.000299625865731643,
"loss": 0.8073,
"step": 2865
},
{
"epoch": 0.3614951034417609,
"grad_norm": 0.4988342225551605,
"learning_rate": 0.00029961806666269503,
"loss": 0.8127,
"step": 2870
},
{
"epoch": 0.36212488585193814,
"grad_norm": 0.5028805732727051,
"learning_rate": 0.00029961018724542767,
"loss": 0.8711,
"step": 2875
},
{
"epoch": 0.36275466826211544,
"grad_norm": 0.5009424686431885,
"learning_rate": 0.00029960222748407226,
"loss": 0.8015,
"step": 2880
},
{
"epoch": 0.36338445067229275,
"grad_norm": 0.4522798955440521,
"learning_rate": 0.00029959418738290344,
"loss": 0.8261,
"step": 2885
},
{
"epoch": 0.36401423308247,
"grad_norm": 0.49349725246429443,
"learning_rate": 0.00029958606694623893,
"loss": 0.8006,
"step": 2890
},
{
"epoch": 0.3646440154926473,
"grad_norm": 0.46870625019073486,
"learning_rate": 0.00029957786617843956,
"loss": 0.8285,
"step": 2895
},
{
"epoch": 0.36527379790282455,
"grad_norm": 0.5234463810920715,
"learning_rate": 0.0002995695850839093,
"loss": 0.8497,
"step": 2900
},
{
"epoch": 0.36590358031300185,
"grad_norm": 0.487884521484375,
"learning_rate": 0.0002995612236670953,
"loss": 0.8033,
"step": 2905
},
{
"epoch": 0.36653336272317916,
"grad_norm": 0.4760074317455292,
"learning_rate": 0.0002995527819324879,
"loss": 0.8412,
"step": 2910
},
{
"epoch": 0.3671631451333564,
"grad_norm": 0.4630395472049713,
"learning_rate": 0.0002995442598846205,
"loss": 0.8244,
"step": 2915
},
{
"epoch": 0.3677929275435337,
"grad_norm": 0.4981043040752411,
"learning_rate": 0.0002995356575280695,
"loss": 0.798,
"step": 2920
},
{
"epoch": 0.368422709953711,
"grad_norm": 0.4630597233772278,
"learning_rate": 0.00029952697486745466,
"loss": 0.8032,
"step": 2925
},
{
"epoch": 0.36905249236388826,
"grad_norm": 0.4962010979652405,
"learning_rate": 0.00029951821190743884,
"loss": 0.8183,
"step": 2930
},
{
"epoch": 0.36968227477406557,
"grad_norm": 0.47193852066993713,
"learning_rate": 0.00029950936865272775,
"loss": 0.841,
"step": 2935
},
{
"epoch": 0.37031205718424287,
"grad_norm": 0.4802277982234955,
"learning_rate": 0.0002995004451080706,
"loss": 0.7433,
"step": 2940
},
{
"epoch": 0.3709418395944201,
"grad_norm": 0.43486830592155457,
"learning_rate": 0.00029949144127825947,
"loss": 0.8051,
"step": 2945
},
{
"epoch": 0.3715716220045974,
"grad_norm": 0.5078021287918091,
"learning_rate": 0.0002994823571681296,
"loss": 0.8662,
"step": 2950
},
{
"epoch": 0.3722014044147747,
"grad_norm": 0.44738146662712097,
"learning_rate": 0.0002994731927825594,
"loss": 0.7997,
"step": 2955
},
{
"epoch": 0.372831186824952,
"grad_norm": 0.44323739409446716,
"learning_rate": 0.0002994639481264704,
"loss": 0.8481,
"step": 2960
},
{
"epoch": 0.3734609692351293,
"grad_norm": 0.525050938129425,
"learning_rate": 0.0002994546232048271,
"loss": 0.8375,
"step": 2965
},
{
"epoch": 0.37409075164530653,
"grad_norm": 0.4817000925540924,
"learning_rate": 0.00029944521802263723,
"loss": 0.8,
"step": 2970
},
{
"epoch": 0.37472053405548383,
"grad_norm": 0.5709639191627502,
"learning_rate": 0.00029943573258495165,
"loss": 0.8104,
"step": 2975
},
{
"epoch": 0.37535031646566114,
"grad_norm": 0.518618643283844,
"learning_rate": 0.00029942616689686416,
"loss": 0.7948,
"step": 2980
},
{
"epoch": 0.3759800988758384,
"grad_norm": 0.4226647615432739,
"learning_rate": 0.00029941652096351174,
"loss": 0.7599,
"step": 2985
},
{
"epoch": 0.3766098812860157,
"grad_norm": 0.4751405119895935,
"learning_rate": 0.0002994067947900746,
"loss": 0.8015,
"step": 2990
},
{
"epoch": 0.37723966369619294,
"grad_norm": 0.4653600752353668,
"learning_rate": 0.0002993969883817758,
"loss": 0.7758,
"step": 2995
},
{
"epoch": 0.37786944610637024,
"grad_norm": 0.512941837310791,
"learning_rate": 0.00029938710174388163,
"loss": 0.8188,
"step": 3000
},
{
"epoch": 0.37786944610637024,
"eval_loss": 0.3438470661640167,
"eval_runtime": 6.225,
"eval_samples_per_second": 160.642,
"eval_steps_per_second": 10.12,
"step": 3000
},
{
"epoch": 0.37849922851654755,
"grad_norm": 0.43524855375289917,
"learning_rate": 0.0002993771348817015,
"loss": 0.803,
"step": 3005
},
{
"epoch": 0.3791290109267248,
"grad_norm": 0.4569668173789978,
"learning_rate": 0.0002993670878005878,
"loss": 0.8777,
"step": 3010
},
{
"epoch": 0.3797587933369021,
"grad_norm": 0.4643417000770569,
"learning_rate": 0.00029935696050593604,
"loss": 0.7621,
"step": 3015
},
{
"epoch": 0.3803885757470794,
"grad_norm": 0.4604712128639221,
"learning_rate": 0.00029934675300318485,
"loss": 0.8216,
"step": 3020
},
{
"epoch": 0.38101835815725665,
"grad_norm": 0.4307630956172943,
"learning_rate": 0.0002993364652978158,
"loss": 0.8163,
"step": 3025
},
{
"epoch": 0.38164814056743396,
"grad_norm": 0.44455698132514954,
"learning_rate": 0.00029932609739535365,
"loss": 0.818,
"step": 3030
},
{
"epoch": 0.38227792297761126,
"grad_norm": 0.43203669786453247,
"learning_rate": 0.0002993156493013663,
"loss": 0.8168,
"step": 3035
},
{
"epoch": 0.3829077053877885,
"grad_norm": 0.42328670620918274,
"learning_rate": 0.00029930512102146453,
"loss": 0.8025,
"step": 3040
},
{
"epoch": 0.3835374877979658,
"grad_norm": 0.43900108337402344,
"learning_rate": 0.0002992945125613023,
"loss": 0.7595,
"step": 3045
},
{
"epoch": 0.38416727020814306,
"grad_norm": 0.46638986468315125,
"learning_rate": 0.00029928382392657656,
"loss": 0.8208,
"step": 3050
},
{
"epoch": 0.38479705261832037,
"grad_norm": 0.4279174208641052,
"learning_rate": 0.00029927305512302736,
"loss": 0.8151,
"step": 3055
},
{
"epoch": 0.38542683502849767,
"grad_norm": 0.4648323357105255,
"learning_rate": 0.0002992622061564378,
"loss": 0.7666,
"step": 3060
},
{
"epoch": 0.3860566174386749,
"grad_norm": 0.45052894949913025,
"learning_rate": 0.000299251277032634,
"loss": 0.7995,
"step": 3065
},
{
"epoch": 0.3866863998488522,
"grad_norm": 0.46262305974960327,
"learning_rate": 0.0002992402677574852,
"loss": 0.8175,
"step": 3070
},
{
"epoch": 0.38731618225902953,
"grad_norm": 0.4934038519859314,
"learning_rate": 0.00029922917833690365,
"loss": 0.821,
"step": 3075
},
{
"epoch": 0.3879459646692068,
"grad_norm": 0.46494096517562866,
"learning_rate": 0.0002992180087768445,
"loss": 0.8081,
"step": 3080
},
{
"epoch": 0.3885757470793841,
"grad_norm": 0.9760459661483765,
"learning_rate": 0.0002992067590833062,
"loss": 0.7673,
"step": 3085
},
{
"epoch": 0.38920552948956133,
"grad_norm": 0.7070348262786865,
"learning_rate": 0.00029919542926233,
"loss": 0.8017,
"step": 3090
},
{
"epoch": 0.38983531189973863,
"grad_norm": 0.6773821711540222,
"learning_rate": 0.00029918401932000027,
"loss": 0.7946,
"step": 3095
},
{
"epoch": 0.39046509430991594,
"grad_norm": 0.4955935478210449,
"learning_rate": 0.0002991725292624445,
"loss": 0.8431,
"step": 3100
},
{
"epoch": 0.3910948767200932,
"grad_norm": 0.7728490829467773,
"learning_rate": 0.000299160959095833,
"loss": 0.8024,
"step": 3105
},
{
"epoch": 0.3917246591302705,
"grad_norm": 0.6880044341087341,
"learning_rate": 0.00029914930882637926,
"loss": 0.788,
"step": 3110
},
{
"epoch": 0.3923544415404478,
"grad_norm": 0.4916500747203827,
"learning_rate": 0.0002991375784603398,
"loss": 0.7878,
"step": 3115
},
{
"epoch": 0.39298422395062504,
"grad_norm": 0.5188093781471252,
"learning_rate": 0.00029912576800401403,
"loss": 0.8404,
"step": 3120
},
{
"epoch": 0.39361400636080235,
"grad_norm": 0.5084378123283386,
"learning_rate": 0.0002991138774637444,
"loss": 0.8277,
"step": 3125
},
{
"epoch": 0.39424378877097965,
"grad_norm": 0.4139776825904846,
"learning_rate": 0.0002991019068459165,
"loss": 0.7672,
"step": 3130
},
{
"epoch": 0.3948735711811569,
"grad_norm": 0.4756976366043091,
"learning_rate": 0.0002990898561569588,
"loss": 0.7936,
"step": 3135
},
{
"epoch": 0.3955033535913342,
"grad_norm": 0.46053630113601685,
"learning_rate": 0.0002990777254033427,
"loss": 0.8102,
"step": 3140
},
{
"epoch": 0.39613313600151145,
"grad_norm": 0.48546189069747925,
"learning_rate": 0.00029906551459158283,
"loss": 0.8184,
"step": 3145
},
{
"epoch": 0.39676291841168876,
"grad_norm": 0.477192223072052,
"learning_rate": 0.0002990532237282366,
"loss": 0.828,
"step": 3150
},
{
"epoch": 0.39739270082186606,
"grad_norm": 0.48900333046913147,
"learning_rate": 0.00029904085281990447,
"loss": 0.8183,
"step": 3155
},
{
"epoch": 0.3980224832320433,
"grad_norm": 0.5019087791442871,
"learning_rate": 0.0002990284018732299,
"loss": 0.8002,
"step": 3160
},
{
"epoch": 0.3986522656422206,
"grad_norm": 0.5127068758010864,
"learning_rate": 0.0002990158708948994,
"loss": 0.8088,
"step": 3165
},
{
"epoch": 0.3992820480523979,
"grad_norm": 0.44172775745391846,
"learning_rate": 0.00029900325989164233,
"loss": 0.8013,
"step": 3170
},
{
"epoch": 0.39991183046257517,
"grad_norm": 0.5318475961685181,
"learning_rate": 0.0002989905688702311,
"loss": 0.8239,
"step": 3175
},
{
"epoch": 0.4005416128727525,
"grad_norm": 0.4257467985153198,
"learning_rate": 0.0002989777978374811,
"loss": 0.7714,
"step": 3180
},
{
"epoch": 0.4011713952829297,
"grad_norm": 0.42196664214134216,
"learning_rate": 0.0002989649468002506,
"loss": 0.7987,
"step": 3185
},
{
"epoch": 0.401801177693107,
"grad_norm": 0.47977736592292786,
"learning_rate": 0.000298952015765441,
"loss": 0.7699,
"step": 3190
},
{
"epoch": 0.40243096010328433,
"grad_norm": 0.4841693639755249,
"learning_rate": 0.0002989390047399965,
"loss": 0.7916,
"step": 3195
},
{
"epoch": 0.4030607425134616,
"grad_norm": 0.5104061961174011,
"learning_rate": 0.0002989259137309043,
"loss": 0.8244,
"step": 3200
},
{
"epoch": 0.4036905249236389,
"grad_norm": 0.46594473719596863,
"learning_rate": 0.00029891274274519464,
"loss": 0.786,
"step": 3205
},
{
"epoch": 0.4043203073338162,
"grad_norm": 0.4309998154640198,
"learning_rate": 0.0002988994917899405,
"loss": 0.8266,
"step": 3210
},
{
"epoch": 0.40495008974399344,
"grad_norm": 0.4976588785648346,
"learning_rate": 0.00029888616087225817,
"loss": 0.7911,
"step": 3215
},
{
"epoch": 0.40557987215417074,
"grad_norm": 0.47657066583633423,
"learning_rate": 0.00029887274999930647,
"loss": 0.7926,
"step": 3220
},
{
"epoch": 0.40620965456434804,
"grad_norm": 0.42497026920318604,
"learning_rate": 0.0002988592591782874,
"loss": 0.7838,
"step": 3225
},
{
"epoch": 0.4068394369745253,
"grad_norm": 0.4974801540374756,
"learning_rate": 0.00029884568841644587,
"loss": 0.7854,
"step": 3230
},
{
"epoch": 0.4074692193847026,
"grad_norm": 0.43505486845970154,
"learning_rate": 0.00029883203772106966,
"loss": 0.8689,
"step": 3235
},
{
"epoch": 0.40809900179487985,
"grad_norm": 0.5216085314750671,
"learning_rate": 0.0002988183070994895,
"loss": 0.8445,
"step": 3240
},
{
"epoch": 0.40872878420505715,
"grad_norm": 0.5993830561637878,
"learning_rate": 0.0002988044965590791,
"loss": 0.7944,
"step": 3245
},
{
"epoch": 0.40935856661523445,
"grad_norm": 0.7245651483535767,
"learning_rate": 0.00029879060610725494,
"loss": 0.8175,
"step": 3250
},
{
"epoch": 0.4099883490254117,
"grad_norm": 0.4758714735507965,
"learning_rate": 0.00029877663575147653,
"loss": 0.7862,
"step": 3255
},
{
"epoch": 0.410618131435589,
"grad_norm": 0.5264742970466614,
"learning_rate": 0.0002987625854992464,
"loss": 0.7625,
"step": 3260
},
{
"epoch": 0.4112479138457663,
"grad_norm": 0.46857550740242004,
"learning_rate": 0.0002987484553581097,
"loss": 0.7878,
"step": 3265
},
{
"epoch": 0.41187769625594356,
"grad_norm": 0.4588899314403534,
"learning_rate": 0.0002987342453356547,
"loss": 0.8435,
"step": 3270
},
{
"epoch": 0.41250747866612086,
"grad_norm": 0.47005462646484375,
"learning_rate": 0.0002987199554395125,
"loss": 0.8343,
"step": 3275
},
{
"epoch": 0.4131372610762981,
"grad_norm": 0.4855548143386841,
"learning_rate": 0.00029870558567735716,
"loss": 0.7944,
"step": 3280
},
{
"epoch": 0.4137670434864754,
"grad_norm": 0.4832567572593689,
"learning_rate": 0.00029869113605690545,
"loss": 0.7999,
"step": 3285
},
{
"epoch": 0.4143968258966527,
"grad_norm": 0.4483296573162079,
"learning_rate": 0.00029867660658591724,
"loss": 0.8074,
"step": 3290
},
{
"epoch": 0.41502660830682997,
"grad_norm": 0.5084306001663208,
"learning_rate": 0.00029866199727219514,
"loss": 0.8173,
"step": 3295
},
{
"epoch": 0.4156563907170073,
"grad_norm": 0.43247321248054504,
"learning_rate": 0.00029864730812358473,
"loss": 0.7904,
"step": 3300
},
{
"epoch": 0.4162861731271846,
"grad_norm": 0.4278540313243866,
"learning_rate": 0.0002986325391479744,
"loss": 0.79,
"step": 3305
},
{
"epoch": 0.4169159555373618,
"grad_norm": 0.4396720230579376,
"learning_rate": 0.00029861769035329546,
"loss": 0.7737,
"step": 3310
},
{
"epoch": 0.41754573794753913,
"grad_norm": 0.4305702745914459,
"learning_rate": 0.0002986027617475219,
"loss": 0.8133,
"step": 3315
},
{
"epoch": 0.41817552035771643,
"grad_norm": 0.4455117881298065,
"learning_rate": 0.0002985877533386709,
"loss": 0.7932,
"step": 3320
},
{
"epoch": 0.4188053027678937,
"grad_norm": 0.45051881670951843,
"learning_rate": 0.00029857266513480226,
"loss": 0.8162,
"step": 3325
},
{
"epoch": 0.419435085178071,
"grad_norm": 0.47537773847579956,
"learning_rate": 0.0002985574971440187,
"loss": 0.7931,
"step": 3330
},
{
"epoch": 0.42006486758824824,
"grad_norm": 0.46828627586364746,
"learning_rate": 0.0002985422493744657,
"loss": 0.8399,
"step": 3335
},
{
"epoch": 0.42069464999842554,
"grad_norm": 0.4528372585773468,
"learning_rate": 0.00029852692183433176,
"loss": 0.7821,
"step": 3340
},
{
"epoch": 0.42132443240860284,
"grad_norm": 0.4476306736469269,
"learning_rate": 0.00029851151453184807,
"loss": 0.7986,
"step": 3345
},
{
"epoch": 0.4219542148187801,
"grad_norm": 0.4092450439929962,
"learning_rate": 0.00029849602747528874,
"loss": 0.7827,
"step": 3350
},
{
"epoch": 0.4225839972289574,
"grad_norm": 0.4776279330253601,
"learning_rate": 0.00029848046067297064,
"loss": 0.8269,
"step": 3355
},
{
"epoch": 0.4232137796391347,
"grad_norm": 0.45867466926574707,
"learning_rate": 0.00029846481413325346,
"loss": 0.8094,
"step": 3360
},
{
"epoch": 0.42384356204931195,
"grad_norm": 0.4665123522281647,
"learning_rate": 0.00029844908786453986,
"loss": 0.7288,
"step": 3365
},
{
"epoch": 0.42447334445948925,
"grad_norm": 0.4820057451725006,
"learning_rate": 0.0002984332818752751,
"loss": 0.8012,
"step": 3370
},
{
"epoch": 0.4251031268696665,
"grad_norm": 0.46213796734809875,
"learning_rate": 0.00029841739617394737,
"loss": 0.7693,
"step": 3375
},
{
"epoch": 0.4257329092798438,
"grad_norm": 0.4329429864883423,
"learning_rate": 0.0002984014307690878,
"loss": 0.7215,
"step": 3380
},
{
"epoch": 0.4263626916900211,
"grad_norm": 0.437621146440506,
"learning_rate": 0.00029838538566926993,
"loss": 0.7839,
"step": 3385
},
{
"epoch": 0.42699247410019836,
"grad_norm": 0.4661789536476135,
"learning_rate": 0.0002983692608831105,
"loss": 0.7827,
"step": 3390
},
{
"epoch": 0.42762225651037566,
"grad_norm": 0.4203425645828247,
"learning_rate": 0.0002983530564192689,
"loss": 0.8096,
"step": 3395
},
{
"epoch": 0.42825203892055297,
"grad_norm": 0.4614803194999695,
"learning_rate": 0.00029833677228644726,
"loss": 0.8189,
"step": 3400
},
{
"epoch": 0.4288818213307302,
"grad_norm": 0.4247860908508301,
"learning_rate": 0.0002983204084933905,
"loss": 0.8123,
"step": 3405
},
{
"epoch": 0.4295116037409075,
"grad_norm": 0.4418291449546814,
"learning_rate": 0.0002983039650488864,
"loss": 0.8036,
"step": 3410
},
{
"epoch": 0.4301413861510848,
"grad_norm": 0.46780282258987427,
"learning_rate": 0.00029828744196176547,
"loss": 0.8122,
"step": 3415
},
{
"epoch": 0.4307711685612621,
"grad_norm": 0.44973024725914,
"learning_rate": 0.0002982708392409009,
"loss": 0.7813,
"step": 3420
},
{
"epoch": 0.4314009509714394,
"grad_norm": 0.3922254741191864,
"learning_rate": 0.00029825415689520887,
"loss": 0.7809,
"step": 3425
},
{
"epoch": 0.4320307333816166,
"grad_norm": 0.391084223985672,
"learning_rate": 0.00029823739493364804,
"loss": 0.7757,
"step": 3430
},
{
"epoch": 0.43266051579179393,
"grad_norm": 0.4502919316291809,
"learning_rate": 0.00029822055336522005,
"loss": 0.7688,
"step": 3435
},
{
"epoch": 0.43329029820197124,
"grad_norm": 0.475436270236969,
"learning_rate": 0.0002982036321989692,
"loss": 0.7667,
"step": 3440
},
{
"epoch": 0.4339200806121485,
"grad_norm": 0.42362409830093384,
"learning_rate": 0.00029818663144398253,
"loss": 0.8098,
"step": 3445
},
{
"epoch": 0.4345498630223258,
"grad_norm": 0.45329517126083374,
"learning_rate": 0.0002981695511093898,
"loss": 0.7706,
"step": 3450
},
{
"epoch": 0.4351796454325031,
"grad_norm": 0.4297536313533783,
"learning_rate": 0.00029815239120436365,
"loss": 0.808,
"step": 3455
},
{
"epoch": 0.43580942784268034,
"grad_norm": 0.4800092577934265,
"learning_rate": 0.0002981351517381192,
"loss": 0.7973,
"step": 3460
},
{
"epoch": 0.43643921025285765,
"grad_norm": 0.5014777779579163,
"learning_rate": 0.00029811783271991454,
"loss": 0.8098,
"step": 3465
},
{
"epoch": 0.4370689926630349,
"grad_norm": 0.4412321448326111,
"learning_rate": 0.00029810043415905027,
"loss": 0.7669,
"step": 3470
},
{
"epoch": 0.4376987750732122,
"grad_norm": 0.4491146206855774,
"learning_rate": 0.00029808295606486993,
"loss": 0.7599,
"step": 3475
},
{
"epoch": 0.4383285574833895,
"grad_norm": 0.42482897639274597,
"learning_rate": 0.0002980653984467596,
"loss": 0.7501,
"step": 3480
},
{
"epoch": 0.43895833989356675,
"grad_norm": 0.4166581332683563,
"learning_rate": 0.0002980477613141482,
"loss": 0.7807,
"step": 3485
},
{
"epoch": 0.43958812230374406,
"grad_norm": 0.48076120018959045,
"learning_rate": 0.0002980300446765071,
"loss": 0.821,
"step": 3490
},
{
"epoch": 0.44021790471392136,
"grad_norm": 0.4148639142513275,
"learning_rate": 0.00029801224854335073,
"loss": 0.781,
"step": 3495
},
{
"epoch": 0.4408476871240986,
"grad_norm": 0.41731131076812744,
"learning_rate": 0.00029799437292423586,
"loss": 0.7784,
"step": 3500
},
{
"epoch": 0.4414774695342759,
"grad_norm": 0.4514264762401581,
"learning_rate": 0.00029797641782876224,
"loss": 0.8066,
"step": 3505
},
{
"epoch": 0.4421072519444532,
"grad_norm": 0.44717252254486084,
"learning_rate": 0.00029795838326657204,
"loss": 0.7761,
"step": 3510
},
{
"epoch": 0.44273703435463047,
"grad_norm": 0.42850586771965027,
"learning_rate": 0.00029794026924735034,
"loss": 0.783,
"step": 3515
},
{
"epoch": 0.44336681676480777,
"grad_norm": 0.7937319278717041,
"learning_rate": 0.00029792207578082476,
"loss": 0.7894,
"step": 3520
},
{
"epoch": 0.443996599174985,
"grad_norm": 0.4401470124721527,
"learning_rate": 0.0002979038028767656,
"loss": 0.8046,
"step": 3525
},
{
"epoch": 0.4446263815851623,
"grad_norm": 0.45515474677085876,
"learning_rate": 0.00029788545054498577,
"loss": 0.8095,
"step": 3530
},
{
"epoch": 0.4452561639953396,
"grad_norm": 0.4676735997200012,
"learning_rate": 0.00029786701879534093,
"loss": 0.7969,
"step": 3535
},
{
"epoch": 0.4458859464055169,
"grad_norm": 0.42322975397109985,
"learning_rate": 0.0002978485076377294,
"loss": 0.8336,
"step": 3540
},
{
"epoch": 0.4465157288156942,
"grad_norm": 0.4256497025489807,
"learning_rate": 0.000297829917082092,
"loss": 0.7773,
"step": 3545
},
{
"epoch": 0.4471455112258715,
"grad_norm": 0.40527772903442383,
"learning_rate": 0.00029781124713841237,
"loss": 0.8058,
"step": 3550
},
{
"epoch": 0.44777529363604873,
"grad_norm": 0.4047418534755707,
"learning_rate": 0.0002977924978167166,
"loss": 0.7769,
"step": 3555
},
{
"epoch": 0.44840507604622604,
"grad_norm": 0.4016299545764923,
"learning_rate": 0.00029777366912707366,
"loss": 0.7531,
"step": 3560
},
{
"epoch": 0.4490348584564033,
"grad_norm": 0.4176371395587921,
"learning_rate": 0.00029775476107959486,
"loss": 0.7865,
"step": 3565
},
{
"epoch": 0.4496646408665806,
"grad_norm": 0.44107553362846375,
"learning_rate": 0.00029773577368443426,
"loss": 0.7735,
"step": 3570
},
{
"epoch": 0.4502944232767579,
"grad_norm": 0.43897122144699097,
"learning_rate": 0.00029771670695178857,
"loss": 0.7715,
"step": 3575
},
{
"epoch": 0.45092420568693514,
"grad_norm": 0.4024025499820709,
"learning_rate": 0.000297697560891897,
"loss": 0.7442,
"step": 3580
},
{
"epoch": 0.45155398809711245,
"grad_norm": 0.4896734356880188,
"learning_rate": 0.0002976783355150415,
"loss": 0.829,
"step": 3585
},
{
"epoch": 0.45218377050728975,
"grad_norm": 0.47621142864227295,
"learning_rate": 0.0002976590308315465,
"loss": 0.7915,
"step": 3590
},
{
"epoch": 0.452813552917467,
"grad_norm": 0.3869519829750061,
"learning_rate": 0.00029763964685177905,
"loss": 0.7696,
"step": 3595
},
{
"epoch": 0.4534433353276443,
"grad_norm": 0.4245865046977997,
"learning_rate": 0.0002976201835861488,
"loss": 0.7281,
"step": 3600
},
{
"epoch": 0.4540731177378216,
"grad_norm": 0.40193241834640503,
"learning_rate": 0.0002976006410451079,
"loss": 0.7435,
"step": 3605
},
{
"epoch": 0.45470290014799886,
"grad_norm": 0.4142551124095917,
"learning_rate": 0.00029758101923915123,
"loss": 0.7627,
"step": 3610
},
{
"epoch": 0.45533268255817616,
"grad_norm": 0.4214671552181244,
"learning_rate": 0.0002975613181788162,
"loss": 0.8084,
"step": 3615
},
{
"epoch": 0.4559624649683534,
"grad_norm": 0.46768540143966675,
"learning_rate": 0.0002975415378746826,
"loss": 0.7502,
"step": 3620
},
{
"epoch": 0.4565922473785307,
"grad_norm": 0.43812718987464905,
"learning_rate": 0.00029752167833737295,
"loss": 0.7555,
"step": 3625
},
{
"epoch": 0.457222029788708,
"grad_norm": 0.44065701961517334,
"learning_rate": 0.00029750173957755223,
"loss": 0.7824,
"step": 3630
},
{
"epoch": 0.45785181219888527,
"grad_norm": 0.46632614731788635,
"learning_rate": 0.00029748172160592816,
"loss": 0.7787,
"step": 3635
},
{
"epoch": 0.45848159460906257,
"grad_norm": 0.3616549074649811,
"learning_rate": 0.00029746162443325066,
"loss": 0.766,
"step": 3640
},
{
"epoch": 0.4591113770192399,
"grad_norm": 0.39852583408355713,
"learning_rate": 0.00029744144807031253,
"loss": 0.7318,
"step": 3645
},
{
"epoch": 0.4597411594294171,
"grad_norm": 0.4082608222961426,
"learning_rate": 0.0002974211925279488,
"loss": 0.7726,
"step": 3650
},
{
"epoch": 0.4603709418395944,
"grad_norm": 0.4750503599643707,
"learning_rate": 0.00029740085781703726,
"loss": 0.7953,
"step": 3655
},
{
"epoch": 0.4610007242497717,
"grad_norm": 0.439531534910202,
"learning_rate": 0.0002973804439484981,
"loss": 0.788,
"step": 3660
},
{
"epoch": 0.461630506659949,
"grad_norm": 0.41563692688941956,
"learning_rate": 0.000297359950933294,
"loss": 0.7935,
"step": 3665
},
{
"epoch": 0.4622602890701263,
"grad_norm": 0.4645535945892334,
"learning_rate": 0.00029733937878243015,
"loss": 0.7716,
"step": 3670
},
{
"epoch": 0.46289007148030353,
"grad_norm": 0.4334595501422882,
"learning_rate": 0.0002973187275069544,
"loss": 0.7455,
"step": 3675
},
{
"epoch": 0.46351985389048084,
"grad_norm": 0.4452027678489685,
"learning_rate": 0.0002972979971179568,
"loss": 0.7533,
"step": 3680
},
{
"epoch": 0.46414963630065814,
"grad_norm": 0.4289001226425171,
"learning_rate": 0.0002972771876265701,
"loss": 0.8066,
"step": 3685
},
{
"epoch": 0.4647794187108354,
"grad_norm": 0.44446882605552673,
"learning_rate": 0.0002972562990439694,
"loss": 0.8017,
"step": 3690
},
{
"epoch": 0.4654092011210127,
"grad_norm": 0.4466266930103302,
"learning_rate": 0.00029723533138137256,
"loss": 0.7686,
"step": 3695
},
{
"epoch": 0.46603898353119,
"grad_norm": 0.44262874126434326,
"learning_rate": 0.0002972142846500395,
"loss": 0.7835,
"step": 3700
},
{
"epoch": 0.46666876594136725,
"grad_norm": 0.40932217240333557,
"learning_rate": 0.0002971931588612729,
"loss": 0.7844,
"step": 3705
},
{
"epoch": 0.46729854835154455,
"grad_norm": 0.38220685720443726,
"learning_rate": 0.0002971719540264177,
"loss": 0.7682,
"step": 3710
},
{
"epoch": 0.4679283307617218,
"grad_norm": 0.4890231788158417,
"learning_rate": 0.0002971506701568614,
"loss": 0.7883,
"step": 3715
},
{
"epoch": 0.4685581131718991,
"grad_norm": 0.44211700558662415,
"learning_rate": 0.00029712930726403397,
"loss": 0.7287,
"step": 3720
},
{
"epoch": 0.4691878955820764,
"grad_norm": 0.4585074782371521,
"learning_rate": 0.0002971078653594078,
"loss": 0.7452,
"step": 3725
},
{
"epoch": 0.46981767799225366,
"grad_norm": 0.3818283975124359,
"learning_rate": 0.00029708634445449754,
"loss": 0.751,
"step": 3730
},
{
"epoch": 0.47044746040243096,
"grad_norm": 0.4351007640361786,
"learning_rate": 0.00029706474456086054,
"loss": 0.7665,
"step": 3735
},
{
"epoch": 0.47107724281260827,
"grad_norm": 0.4167363941669464,
"learning_rate": 0.0002970430656900964,
"loss": 0.7421,
"step": 3740
},
{
"epoch": 0.4717070252227855,
"grad_norm": 0.40461474657058716,
"learning_rate": 0.0002970213078538472,
"loss": 0.7496,
"step": 3745
},
{
"epoch": 0.4723368076329628,
"grad_norm": 0.38994473218917847,
"learning_rate": 0.00029699947106379734,
"loss": 0.773,
"step": 3750
},
{
"epoch": 0.4729665900431401,
"grad_norm": 0.42335331439971924,
"learning_rate": 0.0002969775553316737,
"loss": 0.7496,
"step": 3755
},
{
"epoch": 0.47359637245331737,
"grad_norm": 0.39755743741989136,
"learning_rate": 0.0002969555606692455,
"loss": 0.7794,
"step": 3760
},
{
"epoch": 0.4742261548634947,
"grad_norm": 0.4671246409416199,
"learning_rate": 0.0002969334870883244,
"loss": 0.8289,
"step": 3765
},
{
"epoch": 0.4748559372736719,
"grad_norm": 0.4498395621776581,
"learning_rate": 0.00029691133460076443,
"loss": 0.7856,
"step": 3770
},
{
"epoch": 0.47548571968384923,
"grad_norm": 0.4068240225315094,
"learning_rate": 0.00029688910321846193,
"loss": 0.7572,
"step": 3775
},
{
"epoch": 0.47611550209402653,
"grad_norm": 0.4673171043395996,
"learning_rate": 0.0002968667929533557,
"loss": 0.7972,
"step": 3780
},
{
"epoch": 0.4767452845042038,
"grad_norm": 0.4210684597492218,
"learning_rate": 0.00029684440381742697,
"loss": 0.7566,
"step": 3785
},
{
"epoch": 0.4773750669143811,
"grad_norm": 0.4167214632034302,
"learning_rate": 0.000296821935822699,
"loss": 0.7383,
"step": 3790
},
{
"epoch": 0.4780048493245584,
"grad_norm": 0.3826481103897095,
"learning_rate": 0.0002967993889812378,
"loss": 0.7749,
"step": 3795
},
{
"epoch": 0.47863463173473564,
"grad_norm": 0.416202187538147,
"learning_rate": 0.0002967767633051514,
"loss": 0.7755,
"step": 3800
},
{
"epoch": 0.47926441414491294,
"grad_norm": 0.38089507818222046,
"learning_rate": 0.0002967540588065904,
"loss": 0.7813,
"step": 3805
},
{
"epoch": 0.4798941965550902,
"grad_norm": 0.5257447957992554,
"learning_rate": 0.0002967312754977476,
"loss": 0.7408,
"step": 3810
},
{
"epoch": 0.4805239789652675,
"grad_norm": 0.4472402334213257,
"learning_rate": 0.00029670841339085813,
"loss": 0.7946,
"step": 3815
},
{
"epoch": 0.4811537613754448,
"grad_norm": 0.39956340193748474,
"learning_rate": 0.00029668547249819957,
"loss": 0.7469,
"step": 3820
},
{
"epoch": 0.48178354378562205,
"grad_norm": 0.41508588194847107,
"learning_rate": 0.00029666245283209154,
"loss": 0.7328,
"step": 3825
},
{
"epoch": 0.48241332619579935,
"grad_norm": 0.3888874053955078,
"learning_rate": 0.00029663935440489624,
"loss": 0.7529,
"step": 3830
},
{
"epoch": 0.48304310860597666,
"grad_norm": 0.39619430899620056,
"learning_rate": 0.00029661617722901806,
"loss": 0.7406,
"step": 3835
},
{
"epoch": 0.4836728910161539,
"grad_norm": 0.38242536783218384,
"learning_rate": 0.0002965929213169036,
"loss": 0.7355,
"step": 3840
},
{
"epoch": 0.4843026734263312,
"grad_norm": 0.37065988779067993,
"learning_rate": 0.0002965695866810419,
"loss": 0.7087,
"step": 3845
},
{
"epoch": 0.4849324558365085,
"grad_norm": 0.4015233814716339,
"learning_rate": 0.0002965461733339641,
"loss": 0.7531,
"step": 3850
},
{
"epoch": 0.48556223824668576,
"grad_norm": 0.4394242465496063,
"learning_rate": 0.0002965226812882438,
"loss": 0.7619,
"step": 3855
},
{
"epoch": 0.48619202065686307,
"grad_norm": 0.39809074997901917,
"learning_rate": 0.00029649911055649666,
"loss": 0.7702,
"step": 3860
},
{
"epoch": 0.4868218030670403,
"grad_norm": 0.5184118747711182,
"learning_rate": 0.0002964754611513808,
"loss": 0.7931,
"step": 3865
},
{
"epoch": 0.4874515854772176,
"grad_norm": 0.5002544522285461,
"learning_rate": 0.00029645173308559644,
"loss": 0.7989,
"step": 3870
},
{
"epoch": 0.4880813678873949,
"grad_norm": 0.551458477973938,
"learning_rate": 0.0002964279263718861,
"loss": 0.7345,
"step": 3875
},
{
"epoch": 0.48871115029757217,
"grad_norm": 0.499612420797348,
"learning_rate": 0.0002964040410230345,
"loss": 0.7885,
"step": 3880
},
{
"epoch": 0.4893409327077495,
"grad_norm": 0.5279458165168762,
"learning_rate": 0.0002963800770518687,
"loss": 0.795,
"step": 3885
},
{
"epoch": 0.4899707151179268,
"grad_norm": 0.47077476978302,
"learning_rate": 0.0002963560344712578,
"loss": 0.7716,
"step": 3890
},
{
"epoch": 0.49060049752810403,
"grad_norm": 0.8377729058265686,
"learning_rate": 0.0002963319132941133,
"loss": 0.7625,
"step": 3895
},
{
"epoch": 0.49123027993828133,
"grad_norm": 0.43871793150901794,
"learning_rate": 0.0002963077135333888,
"loss": 0.734,
"step": 3900
},
{
"epoch": 0.4918600623484586,
"grad_norm": 0.44589656591415405,
"learning_rate": 0.00029628343520208004,
"loss": 0.7735,
"step": 3905
},
{
"epoch": 0.4924898447586359,
"grad_norm": 0.7113938927650452,
"learning_rate": 0.00029625907831322515,
"loss": 0.7611,
"step": 3910
},
{
"epoch": 0.4931196271688132,
"grad_norm": 0.3830680847167969,
"learning_rate": 0.0002962346428799043,
"loss": 0.7399,
"step": 3915
},
{
"epoch": 0.49374940957899044,
"grad_norm": 0.4787169396877289,
"learning_rate": 0.00029621012891523985,
"loss": 0.7572,
"step": 3920
},
{
"epoch": 0.49437919198916774,
"grad_norm": 0.428469717502594,
"learning_rate": 0.0002961855364323964,
"loss": 0.7548,
"step": 3925
},
{
"epoch": 0.49500897439934505,
"grad_norm": 0.3982272148132324,
"learning_rate": 0.00029616086544458065,
"loss": 0.7846,
"step": 3930
},
{
"epoch": 0.4956387568095223,
"grad_norm": 0.45961281657218933,
"learning_rate": 0.00029613611596504146,
"loss": 0.8041,
"step": 3935
},
{
"epoch": 0.4962685392196996,
"grad_norm": 0.400146484375,
"learning_rate": 0.00029611128800706996,
"loss": 0.7395,
"step": 3940
},
{
"epoch": 0.4968983216298769,
"grad_norm": 0.3984740674495697,
"learning_rate": 0.00029608638158399925,
"loss": 0.7569,
"step": 3945
},
{
"epoch": 0.49752810404005415,
"grad_norm": 0.4343632161617279,
"learning_rate": 0.0002960613967092046,
"loss": 0.7958,
"step": 3950
},
{
"epoch": 0.49815788645023146,
"grad_norm": 0.4569413363933563,
"learning_rate": 0.0002960363333961036,
"loss": 0.7673,
"step": 3955
},
{
"epoch": 0.4987876688604087,
"grad_norm": 0.4818612039089203,
"learning_rate": 0.0002960111916581557,
"loss": 0.7621,
"step": 3960
},
{
"epoch": 0.499417451270586,
"grad_norm": 0.4524887800216675,
"learning_rate": 0.0002959859715088626,
"loss": 0.7793,
"step": 3965
},
{
"epoch": 0.5000472336807633,
"grad_norm": 0.4414517283439636,
"learning_rate": 0.0002959606729617682,
"loss": 0.7557,
"step": 3970
},
{
"epoch": 0.5006770160909406,
"grad_norm": 0.3883852958679199,
"learning_rate": 0.0002959352960304583,
"loss": 0.7146,
"step": 3975
},
{
"epoch": 0.5013067985011178,
"grad_norm": 0.34819212555885315,
"learning_rate": 0.00029590984072856084,
"loss": 0.7271,
"step": 3980
},
{
"epoch": 0.5019365809112951,
"grad_norm": 0.3943585753440857,
"learning_rate": 0.0002958843070697461,
"loss": 0.817,
"step": 3985
},
{
"epoch": 0.5025663633214724,
"grad_norm": 0.3881372809410095,
"learning_rate": 0.000295858695067726,
"loss": 0.6997,
"step": 3990
},
{
"epoch": 0.5031961457316497,
"grad_norm": 0.4077765941619873,
"learning_rate": 0.00029583300473625497,
"loss": 0.789,
"step": 3995
},
{
"epoch": 0.503825928141827,
"grad_norm": 0.4519467353820801,
"learning_rate": 0.0002958072360891292,
"loss": 0.7081,
"step": 4000
},
{
"epoch": 0.503825928141827,
"eval_loss": 0.3237769305706024,
"eval_runtime": 6.2328,
"eval_samples_per_second": 160.442,
"eval_steps_per_second": 10.108,
"step": 4000
},
{
"epoch": 0.5044557105520043,
"grad_norm": 0.4161011278629303,
"learning_rate": 0.00029578138914018704,
"loss": 0.7426,
"step": 4005
},
{
"epoch": 0.5050854929621815,
"grad_norm": 0.4170863926410675,
"learning_rate": 0.0002957554639033089,
"loss": 0.7614,
"step": 4010
},
{
"epoch": 0.5057152753723588,
"grad_norm": 0.41827666759490967,
"learning_rate": 0.0002957294603924172,
"loss": 0.7339,
"step": 4015
},
{
"epoch": 0.5063450577825361,
"grad_norm": 0.4575699269771576,
"learning_rate": 0.0002957033786214766,
"loss": 0.7506,
"step": 4020
},
{
"epoch": 0.5069748401927134,
"grad_norm": 0.39499175548553467,
"learning_rate": 0.00029567721860449333,
"loss": 0.7227,
"step": 4025
},
{
"epoch": 0.5076046226028907,
"grad_norm": 0.4190625548362732,
"learning_rate": 0.00029565098035551606,
"loss": 0.7375,
"step": 4030
},
{
"epoch": 0.5082344050130679,
"grad_norm": 0.41470181941986084,
"learning_rate": 0.00029562466388863534,
"loss": 0.7953,
"step": 4035
},
{
"epoch": 0.5088641874232452,
"grad_norm": 0.5376180410385132,
"learning_rate": 0.00029559826921798373,
"loss": 0.7927,
"step": 4040
},
{
"epoch": 0.5094939698334225,
"grad_norm": 0.41073331236839294,
"learning_rate": 0.0002955717963577357,
"loss": 0.7175,
"step": 4045
},
{
"epoch": 0.5101237522435998,
"grad_norm": 0.3894195854663849,
"learning_rate": 0.0002955452453221078,
"loss": 0.743,
"step": 4050
},
{
"epoch": 0.5107535346537772,
"grad_norm": 0.37404918670654297,
"learning_rate": 0.00029551861612535856,
"loss": 0.6833,
"step": 4055
},
{
"epoch": 0.5113833170639545,
"grad_norm": 0.4107655882835388,
"learning_rate": 0.0002954919087817885,
"loss": 0.7588,
"step": 4060
},
{
"epoch": 0.5120130994741316,
"grad_norm": 0.4086790382862091,
"learning_rate": 0.00029546512330574004,
"loss": 0.7328,
"step": 4065
},
{
"epoch": 0.512642881884309,
"grad_norm": 0.4246830344200134,
"learning_rate": 0.0002954382597115976,
"loss": 0.7171,
"step": 4070
},
{
"epoch": 0.5132726642944863,
"grad_norm": 0.4021676182746887,
"learning_rate": 0.00029541131801378743,
"loss": 0.8009,
"step": 4075
},
{
"epoch": 0.5139024467046636,
"grad_norm": 0.42611315846443176,
"learning_rate": 0.00029538429822677806,
"loss": 0.7338,
"step": 4080
},
{
"epoch": 0.5145322291148409,
"grad_norm": 0.40971845388412476,
"learning_rate": 0.0002953572003650795,
"loss": 0.7883,
"step": 4085
},
{
"epoch": 0.5151620115250181,
"grad_norm": 0.4226076304912567,
"learning_rate": 0.0002953300244432441,
"loss": 0.7696,
"step": 4090
},
{
"epoch": 0.5157917939351954,
"grad_norm": 0.4504645764827728,
"learning_rate": 0.0002953027704758659,
"loss": 0.7123,
"step": 4095
},
{
"epoch": 0.5164215763453727,
"grad_norm": 0.4032662510871887,
"learning_rate": 0.00029527543847758086,
"loss": 0.6786,
"step": 4100
},
{
"epoch": 0.51705135875555,
"grad_norm": 0.4030795097351074,
"learning_rate": 0.00029524802846306694,
"loss": 0.7335,
"step": 4105
},
{
"epoch": 0.5176811411657273,
"grad_norm": 0.4887290596961975,
"learning_rate": 0.0002952205404470439,
"loss": 0.7238,
"step": 4110
},
{
"epoch": 0.5183109235759045,
"grad_norm": 0.4061615467071533,
"learning_rate": 0.00029519297444427343,
"loss": 0.7733,
"step": 4115
},
{
"epoch": 0.5189407059860818,
"grad_norm": 0.4060840308666229,
"learning_rate": 0.00029516533046955917,
"loss": 0.7268,
"step": 4120
},
{
"epoch": 0.5195704883962591,
"grad_norm": 0.4293743371963501,
"learning_rate": 0.0002951376085377465,
"loss": 0.7234,
"step": 4125
},
{
"epoch": 0.5202002708064364,
"grad_norm": 0.410264790058136,
"learning_rate": 0.00029510980866372273,
"loss": 0.774,
"step": 4130
},
{
"epoch": 0.5208300532166137,
"grad_norm": 0.3944232761859894,
"learning_rate": 0.0002950819308624171,
"loss": 0.7517,
"step": 4135
},
{
"epoch": 0.521459835626791,
"grad_norm": 0.4245687425136566,
"learning_rate": 0.0002950539751488005,
"loss": 0.7612,
"step": 4140
},
{
"epoch": 0.5220896180369682,
"grad_norm": 0.3795585632324219,
"learning_rate": 0.00029502594153788593,
"loss": 0.7778,
"step": 4145
},
{
"epoch": 0.5227194004471455,
"grad_norm": 0.42911338806152344,
"learning_rate": 0.000294997830044728,
"loss": 0.8033,
"step": 4150
},
{
"epoch": 0.5233491828573228,
"grad_norm": 0.4150259494781494,
"learning_rate": 0.0002949696406844232,
"loss": 0.7326,
"step": 4155
},
{
"epoch": 0.5239789652675001,
"grad_norm": 0.3846616744995117,
"learning_rate": 0.0002949413734721099,
"loss": 0.7085,
"step": 4160
},
{
"epoch": 0.5246087476776774,
"grad_norm": 0.3434165418148041,
"learning_rate": 0.00029491302842296824,
"loss": 0.711,
"step": 4165
},
{
"epoch": 0.5252385300878546,
"grad_norm": 0.33985382318496704,
"learning_rate": 0.0002948846055522202,
"loss": 0.7493,
"step": 4170
},
{
"epoch": 0.5258683124980319,
"grad_norm": 0.3809979259967804,
"learning_rate": 0.0002948561048751294,
"loss": 0.7224,
"step": 4175
},
{
"epoch": 0.5264980949082092,
"grad_norm": 0.45042338967323303,
"learning_rate": 0.00029482752640700143,
"loss": 0.7554,
"step": 4180
},
{
"epoch": 0.5271278773183865,
"grad_norm": 0.4068913757801056,
"learning_rate": 0.00029479887016318357,
"loss": 0.7267,
"step": 4185
},
{
"epoch": 0.5277576597285638,
"grad_norm": 0.41964098811149597,
"learning_rate": 0.0002947701361590649,
"loss": 0.7255,
"step": 4190
},
{
"epoch": 0.5283874421387411,
"grad_norm": 0.3956906795501709,
"learning_rate": 0.0002947413244100762,
"loss": 0.7272,
"step": 4195
},
{
"epoch": 0.5290172245489183,
"grad_norm": 0.41254857182502747,
"learning_rate": 0.0002947124349316901,
"loss": 0.7155,
"step": 4200
},
{
"epoch": 0.5296470069590956,
"grad_norm": 0.4162386655807495,
"learning_rate": 0.0002946834677394208,
"loss": 0.7729,
"step": 4205
},
{
"epoch": 0.5302767893692729,
"grad_norm": 0.4521070420742035,
"learning_rate": 0.00029465442284882436,
"loss": 0.7328,
"step": 4210
},
{
"epoch": 0.5309065717794502,
"grad_norm": 0.3702057898044586,
"learning_rate": 0.00029462530027549866,
"loss": 0.7592,
"step": 4215
},
{
"epoch": 0.5315363541896275,
"grad_norm": 0.4132764935493469,
"learning_rate": 0.00029459610003508313,
"loss": 0.7238,
"step": 4220
},
{
"epoch": 0.5321661365998047,
"grad_norm": 0.3817763328552246,
"learning_rate": 0.0002945668221432589,
"loss": 0.7524,
"step": 4225
},
{
"epoch": 0.532795919009982,
"grad_norm": 0.41137659549713135,
"learning_rate": 0.000294537466615749,
"loss": 0.7405,
"step": 4230
},
{
"epoch": 0.5334257014201593,
"grad_norm": 0.446150541305542,
"learning_rate": 0.00029450803346831787,
"loss": 0.7481,
"step": 4235
},
{
"epoch": 0.5340554838303366,
"grad_norm": 0.37535202503204346,
"learning_rate": 0.0002944785227167719,
"loss": 0.7505,
"step": 4240
},
{
"epoch": 0.5346852662405139,
"grad_norm": 0.4109747111797333,
"learning_rate": 0.000294448934376959,
"loss": 0.7406,
"step": 4245
},
{
"epoch": 0.5353150486506912,
"grad_norm": 0.4233269989490509,
"learning_rate": 0.00029441926846476873,
"loss": 0.7823,
"step": 4250
},
{
"epoch": 0.5359448310608684,
"grad_norm": 0.40127456188201904,
"learning_rate": 0.00029438952499613244,
"loss": 0.7486,
"step": 4255
},
{
"epoch": 0.5365746134710457,
"grad_norm": 0.40279653668403625,
"learning_rate": 0.000294359703987023,
"loss": 0.7157,
"step": 4260
},
{
"epoch": 0.537204395881223,
"grad_norm": 0.34208250045776367,
"learning_rate": 0.000294329805453455,
"loss": 0.7158,
"step": 4265
},
{
"epoch": 0.5378341782914003,
"grad_norm": 0.41574689745903015,
"learning_rate": 0.0002942998294114846,
"loss": 0.7668,
"step": 4270
},
{
"epoch": 0.5384639607015776,
"grad_norm": 0.401426762342453,
"learning_rate": 0.0002942697758772097,
"loss": 0.734,
"step": 4275
},
{
"epoch": 0.5390937431117548,
"grad_norm": 0.4085477292537689,
"learning_rate": 0.00029423964486676964,
"loss": 0.7448,
"step": 4280
},
{
"epoch": 0.5397235255219321,
"grad_norm": 0.43037959933280945,
"learning_rate": 0.0002942094363963456,
"loss": 0.7618,
"step": 4285
},
{
"epoch": 0.5403533079321095,
"grad_norm": 0.34685570001602173,
"learning_rate": 0.00029417915048216003,
"loss": 0.7314,
"step": 4290
},
{
"epoch": 0.5409830903422868,
"grad_norm": 0.3967381417751312,
"learning_rate": 0.00029414878714047725,
"loss": 0.7465,
"step": 4295
},
{
"epoch": 0.5416128727524641,
"grad_norm": 0.36378154158592224,
"learning_rate": 0.0002941183463876031,
"loss": 0.7372,
"step": 4300
},
{
"epoch": 0.5422426551626414,
"grad_norm": 0.3804253339767456,
"learning_rate": 0.00029408782823988494,
"loss": 0.7488,
"step": 4305
},
{
"epoch": 0.5428724375728186,
"grad_norm": 0.3679543137550354,
"learning_rate": 0.00029405723271371166,
"loss": 0.7253,
"step": 4310
},
{
"epoch": 0.5435022199829959,
"grad_norm": 0.35688257217407227,
"learning_rate": 0.0002940265598255138,
"loss": 0.7523,
"step": 4315
},
{
"epoch": 0.5441320023931732,
"grad_norm": 0.40890881419181824,
"learning_rate": 0.00029399580959176344,
"loss": 0.756,
"step": 4320
},
{
"epoch": 0.5447617848033505,
"grad_norm": 0.478547602891922,
"learning_rate": 0.00029396498202897406,
"loss": 0.7249,
"step": 4325
},
{
"epoch": 0.5453915672135278,
"grad_norm": 0.40117356181144714,
"learning_rate": 0.0002939340771537009,
"loss": 0.7466,
"step": 4330
},
{
"epoch": 0.546021349623705,
"grad_norm": 0.42868953943252563,
"learning_rate": 0.0002939030949825404,
"loss": 0.7894,
"step": 4335
},
{
"epoch": 0.5466511320338823,
"grad_norm": 0.41796940565109253,
"learning_rate": 0.0002938720355321309,
"loss": 0.7446,
"step": 4340
},
{
"epoch": 0.5472809144440596,
"grad_norm": 0.427336186170578,
"learning_rate": 0.0002938408988191519,
"loss": 0.7824,
"step": 4345
},
{
"epoch": 0.5479106968542369,
"grad_norm": 0.38179048895835876,
"learning_rate": 0.00029380968486032456,
"loss": 0.7427,
"step": 4350
},
{
"epoch": 0.5485404792644142,
"grad_norm": 0.39974477887153625,
"learning_rate": 0.0002937783936724115,
"loss": 0.7347,
"step": 4355
},
{
"epoch": 0.5491702616745914,
"grad_norm": 0.3805896043777466,
"learning_rate": 0.00029374702527221674,
"loss": 0.7547,
"step": 4360
},
{
"epoch": 0.5498000440847687,
"grad_norm": 0.43362486362457275,
"learning_rate": 0.0002937155796765859,
"loss": 0.7651,
"step": 4365
},
{
"epoch": 0.550429826494946,
"grad_norm": 0.38877996802330017,
"learning_rate": 0.000293684056902406,
"loss": 0.7054,
"step": 4370
},
{
"epoch": 0.5510596089051233,
"grad_norm": 0.393184095621109,
"learning_rate": 0.00029365245696660544,
"loss": 0.7453,
"step": 4375
},
{
"epoch": 0.5516893913153006,
"grad_norm": 0.3892836570739746,
"learning_rate": 0.0002936207798861541,
"loss": 0.7036,
"step": 4380
},
{
"epoch": 0.5523191737254779,
"grad_norm": 0.3737259805202484,
"learning_rate": 0.0002935890256780633,
"loss": 0.7403,
"step": 4385
},
{
"epoch": 0.5529489561356551,
"grad_norm": 0.36731937527656555,
"learning_rate": 0.00029355719435938585,
"loss": 0.7098,
"step": 4390
},
{
"epoch": 0.5535787385458324,
"grad_norm": 0.40238016843795776,
"learning_rate": 0.00029352528594721577,
"loss": 0.7625,
"step": 4395
},
{
"epoch": 0.5542085209560097,
"grad_norm": 0.3878697454929352,
"learning_rate": 0.0002934933004586887,
"loss": 0.7486,
"step": 4400
},
{
"epoch": 0.554838303366187,
"grad_norm": 0.36463412642478943,
"learning_rate": 0.00029346123791098157,
"loss": 0.7489,
"step": 4405
},
{
"epoch": 0.5554680857763643,
"grad_norm": 0.3860667049884796,
"learning_rate": 0.0002934290983213126,
"loss": 0.7503,
"step": 4410
},
{
"epoch": 0.5560978681865415,
"grad_norm": 0.40702390670776367,
"learning_rate": 0.0002933968817069417,
"loss": 0.6892,
"step": 4415
},
{
"epoch": 0.5567276505967188,
"grad_norm": 0.4769366979598999,
"learning_rate": 0.0002933645880851697,
"loss": 0.7285,
"step": 4420
},
{
"epoch": 0.5573574330068961,
"grad_norm": 0.37400034070014954,
"learning_rate": 0.00029333221747333913,
"loss": 0.7055,
"step": 4425
},
{
"epoch": 0.5579872154170734,
"grad_norm": 0.4280668795108795,
"learning_rate": 0.00029329976988883374,
"loss": 0.7629,
"step": 4430
},
{
"epoch": 0.5586169978272507,
"grad_norm": 0.3710954189300537,
"learning_rate": 0.00029326724534907856,
"loss": 0.696,
"step": 4435
},
{
"epoch": 0.559246780237428,
"grad_norm": 0.4311872720718384,
"learning_rate": 0.0002932346438715401,
"loss": 0.726,
"step": 4440
},
{
"epoch": 0.5598765626476052,
"grad_norm": 0.3708207309246063,
"learning_rate": 0.000293201965473726,
"loss": 0.7308,
"step": 4445
},
{
"epoch": 0.5605063450577825,
"grad_norm": 0.36177051067352295,
"learning_rate": 0.00029316921017318536,
"loss": 0.7403,
"step": 4450
},
{
"epoch": 0.5611361274679598,
"grad_norm": 0.4313011169433594,
"learning_rate": 0.0002931363779875086,
"loss": 0.7053,
"step": 4455
},
{
"epoch": 0.5617659098781371,
"grad_norm": 0.36055561900138855,
"learning_rate": 0.0002931034689343272,
"loss": 0.7544,
"step": 4460
},
{
"epoch": 0.5623956922883144,
"grad_norm": 0.37126588821411133,
"learning_rate": 0.0002930704830313142,
"loss": 0.7444,
"step": 4465
},
{
"epoch": 0.5630254746984916,
"grad_norm": 0.35056549310684204,
"learning_rate": 0.00029303742029618377,
"loss": 0.7251,
"step": 4470
},
{
"epoch": 0.5636552571086689,
"grad_norm": 0.3944834768772125,
"learning_rate": 0.0002930042807466913,
"loss": 0.771,
"step": 4475
},
{
"epoch": 0.5642850395188462,
"grad_norm": 0.39250391721725464,
"learning_rate": 0.0002929710644006334,
"loss": 0.7177,
"step": 4480
},
{
"epoch": 0.5649148219290235,
"grad_norm": 0.41848230361938477,
"learning_rate": 0.00029293777127584826,
"loss": 0.7362,
"step": 4485
},
{
"epoch": 0.5655446043392008,
"grad_norm": 0.3222586214542389,
"learning_rate": 0.00029290440139021477,
"loss": 0.6746,
"step": 4490
},
{
"epoch": 0.5661743867493781,
"grad_norm": 0.4275425672531128,
"learning_rate": 0.00029287095476165356,
"loss": 0.7641,
"step": 4495
},
{
"epoch": 0.5668041691595553,
"grad_norm": 0.37914222478866577,
"learning_rate": 0.0002928374314081261,
"loss": 0.7367,
"step": 4500
},
{
"epoch": 0.5674339515697326,
"grad_norm": 0.36903491616249084,
"learning_rate": 0.00029280383134763516,
"loss": 0.726,
"step": 4505
},
{
"epoch": 0.56806373397991,
"grad_norm": 0.45876601338386536,
"learning_rate": 0.0002927701545982249,
"loss": 0.7285,
"step": 4510
},
{
"epoch": 0.5686935163900873,
"grad_norm": 0.3885752856731415,
"learning_rate": 0.0002927364011779803,
"loss": 0.7111,
"step": 4515
},
{
"epoch": 0.5693232988002646,
"grad_norm": 0.3738529086112976,
"learning_rate": 0.00029270257110502784,
"loss": 0.7381,
"step": 4520
},
{
"epoch": 0.5699530812104417,
"grad_norm": 0.38678133487701416,
"learning_rate": 0.0002926686643975351,
"loss": 0.7069,
"step": 4525
},
{
"epoch": 0.570582863620619,
"grad_norm": 0.38699817657470703,
"learning_rate": 0.0002926346810737106,
"loss": 0.7456,
"step": 4530
},
{
"epoch": 0.5712126460307964,
"grad_norm": 0.39948272705078125,
"learning_rate": 0.0002926006211518043,
"loss": 0.7018,
"step": 4535
},
{
"epoch": 0.5718424284409737,
"grad_norm": 0.36441704630851746,
"learning_rate": 0.00029256648465010706,
"loss": 0.7155,
"step": 4540
},
{
"epoch": 0.572472210851151,
"grad_norm": 0.38412773609161377,
"learning_rate": 0.00029253227158695103,
"loss": 0.7131,
"step": 4545
},
{
"epoch": 0.5731019932613282,
"grad_norm": 0.3713320791721344,
"learning_rate": 0.0002924979819807094,
"loss": 0.7109,
"step": 4550
},
{
"epoch": 0.5737317756715055,
"grad_norm": 0.41460588574409485,
"learning_rate": 0.00029246361584979637,
"loss": 0.7218,
"step": 4555
},
{
"epoch": 0.5743615580816828,
"grad_norm": 0.37706735730171204,
"learning_rate": 0.0002924291732126675,
"loss": 0.7364,
"step": 4560
},
{
"epoch": 0.5749913404918601,
"grad_norm": 0.3931211829185486,
"learning_rate": 0.00029239465408781914,
"loss": 0.793,
"step": 4565
},
{
"epoch": 0.5756211229020374,
"grad_norm": 0.4280949831008911,
"learning_rate": 0.0002923600584937889,
"loss": 0.7577,
"step": 4570
},
{
"epoch": 0.5762509053122147,
"grad_norm": 0.408357173204422,
"learning_rate": 0.0002923253864491554,
"loss": 0.6866,
"step": 4575
},
{
"epoch": 0.5768806877223919,
"grad_norm": 0.3654685318470001,
"learning_rate": 0.0002922906379725383,
"loss": 0.7409,
"step": 4580
},
{
"epoch": 0.5775104701325692,
"grad_norm": 0.3723433017730713,
"learning_rate": 0.0002922558130825984,
"loss": 0.7106,
"step": 4585
},
{
"epoch": 0.5781402525427465,
"grad_norm": 0.40489017963409424,
"learning_rate": 0.00029222091179803735,
"loss": 0.7311,
"step": 4590
},
{
"epoch": 0.5787700349529238,
"grad_norm": 0.40270909667015076,
"learning_rate": 0.000292185934137598,
"loss": 0.7393,
"step": 4595
},
{
"epoch": 0.5793998173631011,
"grad_norm": 0.4228857159614563,
"learning_rate": 0.0002921508801200642,
"loss": 0.7253,
"step": 4600
},
{
"epoch": 0.5800295997732783,
"grad_norm": 0.39830881357192993,
"learning_rate": 0.0002921157497642607,
"loss": 0.7413,
"step": 4605
},
{
"epoch": 0.5806593821834556,
"grad_norm": 0.40520498156547546,
"learning_rate": 0.00029208054308905323,
"loss": 0.6902,
"step": 4610
},
{
"epoch": 0.5812891645936329,
"grad_norm": 0.3546881377696991,
"learning_rate": 0.0002920452601133487,
"loss": 0.7104,
"step": 4615
},
{
"epoch": 0.5819189470038102,
"grad_norm": 0.40294864773750305,
"learning_rate": 0.0002920099008560949,
"loss": 0.7258,
"step": 4620
},
{
"epoch": 0.5825487294139875,
"grad_norm": 0.36979302763938904,
"learning_rate": 0.0002919744653362804,
"loss": 0.708,
"step": 4625
},
{
"epoch": 0.5831785118241648,
"grad_norm": 0.42616382241249084,
"learning_rate": 0.000291938953572935,
"loss": 0.7044,
"step": 4630
},
{
"epoch": 0.583808294234342,
"grad_norm": 0.3644506335258484,
"learning_rate": 0.0002919033655851293,
"loss": 0.7277,
"step": 4635
},
{
"epoch": 0.5844380766445193,
"grad_norm": 0.34578534960746765,
"learning_rate": 0.0002918677013919749,
"loss": 0.7233,
"step": 4640
},
{
"epoch": 0.5850678590546966,
"grad_norm": 0.3914281725883484,
"learning_rate": 0.00029183196101262423,
"loss": 0.6829,
"step": 4645
},
{
"epoch": 0.5856976414648739,
"grad_norm": 0.35399550199508667,
"learning_rate": 0.0002917961444662707,
"loss": 0.7371,
"step": 4650
},
{
"epoch": 0.5863274238750512,
"grad_norm": 0.3999468684196472,
"learning_rate": 0.0002917602517721486,
"loss": 0.7228,
"step": 4655
},
{
"epoch": 0.5869572062852284,
"grad_norm": 0.4196580946445465,
"learning_rate": 0.0002917242829495332,
"loss": 0.7013,
"step": 4660
},
{
"epoch": 0.5875869886954057,
"grad_norm": 0.38301941752433777,
"learning_rate": 0.0002916882380177405,
"loss": 0.7409,
"step": 4665
},
{
"epoch": 0.588216771105583,
"grad_norm": 0.3997241258621216,
"learning_rate": 0.0002916521169961275,
"loss": 0.7216,
"step": 4670
},
{
"epoch": 0.5888465535157603,
"grad_norm": 0.3389524817466736,
"learning_rate": 0.00029161591990409203,
"loss": 0.7109,
"step": 4675
},
{
"epoch": 0.5894763359259376,
"grad_norm": 0.38282495737075806,
"learning_rate": 0.0002915796467610727,
"loss": 0.7608,
"step": 4680
},
{
"epoch": 0.5901061183361149,
"grad_norm": 0.40406349301338196,
"learning_rate": 0.000291543297586549,
"loss": 0.7062,
"step": 4685
},
{
"epoch": 0.5907359007462921,
"grad_norm": 0.37658101320266724,
"learning_rate": 0.0002915068724000413,
"loss": 0.7305,
"step": 4690
},
{
"epoch": 0.5913656831564694,
"grad_norm": 0.397401362657547,
"learning_rate": 0.0002914703712211108,
"loss": 0.7276,
"step": 4695
},
{
"epoch": 0.5919954655666467,
"grad_norm": 0.4348791539669037,
"learning_rate": 0.0002914337940693594,
"loss": 0.7572,
"step": 4700
},
{
"epoch": 0.592625247976824,
"grad_norm": 0.372659295797348,
"learning_rate": 0.0002913971409644299,
"loss": 0.7436,
"step": 4705
},
{
"epoch": 0.5932550303870013,
"grad_norm": 0.3933033049106598,
"learning_rate": 0.0002913604119260059,
"loss": 0.7229,
"step": 4710
},
{
"epoch": 0.5938848127971785,
"grad_norm": 0.35579994320869446,
"learning_rate": 0.0002913236069738116,
"loss": 0.7055,
"step": 4715
},
{
"epoch": 0.5945145952073558,
"grad_norm": 0.40102267265319824,
"learning_rate": 0.0002912867261276122,
"loss": 0.7167,
"step": 4720
},
{
"epoch": 0.5951443776175331,
"grad_norm": 0.3881862163543701,
"learning_rate": 0.0002912497694072136,
"loss": 0.7395,
"step": 4725
},
{
"epoch": 0.5957741600277104,
"grad_norm": 0.43878352642059326,
"learning_rate": 0.00029121273683246234,
"loss": 0.7251,
"step": 4730
},
{
"epoch": 0.5964039424378877,
"grad_norm": 0.3500851094722748,
"learning_rate": 0.0002911756284232457,
"loss": 0.6989,
"step": 4735
},
{
"epoch": 0.5970337248480649,
"grad_norm": 0.3887772262096405,
"learning_rate": 0.00029113844419949184,
"loss": 0.7324,
"step": 4740
},
{
"epoch": 0.5976635072582422,
"grad_norm": 0.376321405172348,
"learning_rate": 0.0002911011841811695,
"loss": 0.7239,
"step": 4745
},
{
"epoch": 0.5982932896684195,
"grad_norm": 0.3770900070667267,
"learning_rate": 0.00029106384838828816,
"loss": 0.6973,
"step": 4750
},
{
"epoch": 0.5989230720785969,
"grad_norm": 0.366205096244812,
"learning_rate": 0.000291026436840898,
"loss": 0.749,
"step": 4755
},
{
"epoch": 0.5995528544887742,
"grad_norm": 0.40675458312034607,
"learning_rate": 0.00029098894955908983,
"loss": 0.7155,
"step": 4760
},
{
"epoch": 0.6001826368989515,
"grad_norm": 0.4125664532184601,
"learning_rate": 0.0002909513865629953,
"loss": 0.7455,
"step": 4765
},
{
"epoch": 0.6008124193091287,
"grad_norm": 0.39872869849205017,
"learning_rate": 0.0002909137478727864,
"loss": 0.7194,
"step": 4770
},
{
"epoch": 0.601442201719306,
"grad_norm": 0.3824906051158905,
"learning_rate": 0.00029087603350867616,
"loss": 0.742,
"step": 4775
},
{
"epoch": 0.6020719841294833,
"grad_norm": 0.37914398312568665,
"learning_rate": 0.00029083824349091794,
"loss": 0.692,
"step": 4780
},
{
"epoch": 0.6027017665396606,
"grad_norm": 0.35589495301246643,
"learning_rate": 0.0002908003778398059,
"loss": 0.6706,
"step": 4785
},
{
"epoch": 0.6033315489498379,
"grad_norm": 0.34000104665756226,
"learning_rate": 0.0002907624365756748,
"loss": 0.7506,
"step": 4790
},
{
"epoch": 0.6039613313600151,
"grad_norm": 0.34795689582824707,
"learning_rate": 0.0002907244197188998,
"loss": 0.7097,
"step": 4795
},
{
"epoch": 0.6045911137701924,
"grad_norm": 0.38767385482788086,
"learning_rate": 0.00029068632728989697,
"loss": 0.6986,
"step": 4800
},
{
"epoch": 0.6052208961803697,
"grad_norm": 0.40651988983154297,
"learning_rate": 0.00029064815930912276,
"loss": 0.7159,
"step": 4805
},
{
"epoch": 0.605850678590547,
"grad_norm": 0.37715932726860046,
"learning_rate": 0.00029060991579707424,
"loss": 0.7189,
"step": 4810
},
{
"epoch": 0.6064804610007243,
"grad_norm": 0.3925745487213135,
"learning_rate": 0.0002905715967742891,
"loss": 0.6956,
"step": 4815
},
{
"epoch": 0.6071102434109016,
"grad_norm": 0.33669450879096985,
"learning_rate": 0.0002905332022613455,
"loss": 0.6806,
"step": 4820
},
{
"epoch": 0.6077400258210788,
"grad_norm": 0.38812607526779175,
"learning_rate": 0.00029049473227886214,
"loss": 0.6997,
"step": 4825
},
{
"epoch": 0.6083698082312561,
"grad_norm": 0.3890033960342407,
"learning_rate": 0.00029045618684749833,
"loss": 0.7306,
"step": 4830
},
{
"epoch": 0.6089995906414334,
"grad_norm": 0.4020345211029053,
"learning_rate": 0.00029041756598795383,
"loss": 0.7357,
"step": 4835
},
{
"epoch": 0.6096293730516107,
"grad_norm": 0.39244237542152405,
"learning_rate": 0.0002903788697209689,
"loss": 0.6956,
"step": 4840
},
{
"epoch": 0.610259155461788,
"grad_norm": 0.35866880416870117,
"learning_rate": 0.0002903400980673243,
"loss": 0.7219,
"step": 4845
},
{
"epoch": 0.6108889378719652,
"grad_norm": 0.3912501931190491,
"learning_rate": 0.0002903012510478414,
"loss": 0.7194,
"step": 4850
},
{
"epoch": 0.6115187202821425,
"grad_norm": 0.3933585584163666,
"learning_rate": 0.00029026232868338184,
"loss": 0.7136,
"step": 4855
},
{
"epoch": 0.6121485026923198,
"grad_norm": 0.37603482604026794,
"learning_rate": 0.0002902233309948479,
"loss": 0.7208,
"step": 4860
},
{
"epoch": 0.6127782851024971,
"grad_norm": 0.36919742822647095,
"learning_rate": 0.00029018425800318205,
"loss": 0.7499,
"step": 4865
},
{
"epoch": 0.6134080675126744,
"grad_norm": 0.38067975640296936,
"learning_rate": 0.0002901451097293676,
"loss": 0.7468,
"step": 4870
},
{
"epoch": 0.6140378499228517,
"grad_norm": 0.33954986929893494,
"learning_rate": 0.00029010588619442793,
"loss": 0.6894,
"step": 4875
},
{
"epoch": 0.6146676323330289,
"grad_norm": 0.38103437423706055,
"learning_rate": 0.000290066587419427,
"loss": 0.6661,
"step": 4880
},
{
"epoch": 0.6152974147432062,
"grad_norm": 0.3855966031551361,
"learning_rate": 0.00029002721342546924,
"loss": 0.7138,
"step": 4885
},
{
"epoch": 0.6159271971533835,
"grad_norm": 0.5084123611450195,
"learning_rate": 0.00028998776423369923,
"loss": 0.7005,
"step": 4890
},
{
"epoch": 0.6165569795635608,
"grad_norm": 0.36192139983177185,
"learning_rate": 0.0002899482398653022,
"loss": 0.7386,
"step": 4895
},
{
"epoch": 0.6171867619737381,
"grad_norm": 0.37423619627952576,
"learning_rate": 0.0002899086403415037,
"loss": 0.7172,
"step": 4900
},
{
"epoch": 0.6178165443839153,
"grad_norm": 0.3741579055786133,
"learning_rate": 0.00028986896568356933,
"loss": 0.7519,
"step": 4905
},
{
"epoch": 0.6184463267940926,
"grad_norm": 0.4323353171348572,
"learning_rate": 0.0002898292159128055,
"loss": 0.7325,
"step": 4910
},
{
"epoch": 0.6190761092042699,
"grad_norm": 0.3273026645183563,
"learning_rate": 0.00028978939105055873,
"loss": 0.7211,
"step": 4915
},
{
"epoch": 0.6197058916144472,
"grad_norm": 0.38831016421318054,
"learning_rate": 0.0002897494911182158,
"loss": 0.6435,
"step": 4920
},
{
"epoch": 0.6203356740246245,
"grad_norm": 0.36923748254776,
"learning_rate": 0.00028970951613720397,
"loss": 0.7184,
"step": 4925
},
{
"epoch": 0.6209654564348017,
"grad_norm": 0.3658188283443451,
"learning_rate": 0.0002896694661289906,
"loss": 0.7171,
"step": 4930
},
{
"epoch": 0.621595238844979,
"grad_norm": 0.3589092493057251,
"learning_rate": 0.00028962934111508357,
"loss": 0.7173,
"step": 4935
},
{
"epoch": 0.6222250212551563,
"grad_norm": 0.41886886954307556,
"learning_rate": 0.00028958914111703086,
"loss": 0.7412,
"step": 4940
},
{
"epoch": 0.6228548036653336,
"grad_norm": 0.34496763348579407,
"learning_rate": 0.0002895488661564208,
"loss": 0.6608,
"step": 4945
},
{
"epoch": 0.6234845860755109,
"grad_norm": 0.3527592122554779,
"learning_rate": 0.000289508516254882,
"loss": 0.7179,
"step": 4950
},
{
"epoch": 0.6241143684856882,
"grad_norm": 0.3406129479408264,
"learning_rate": 0.0002894680914340833,
"loss": 0.6862,
"step": 4955
},
{
"epoch": 0.6247441508958654,
"grad_norm": 0.33078086376190186,
"learning_rate": 0.00028942759171573374,
"loss": 0.6804,
"step": 4960
},
{
"epoch": 0.6253739333060427,
"grad_norm": 0.3582599461078644,
"learning_rate": 0.00028938701712158247,
"loss": 0.6681,
"step": 4965
},
{
"epoch": 0.62600371571622,
"grad_norm": 0.3656567633152008,
"learning_rate": 0.0002893463676734191,
"loss": 0.6714,
"step": 4970
},
{
"epoch": 0.6266334981263973,
"grad_norm": 0.35537272691726685,
"learning_rate": 0.00028930564339307337,
"loss": 0.6917,
"step": 4975
},
{
"epoch": 0.6272632805365747,
"grad_norm": 0.35100945830345154,
"learning_rate": 0.0002892648443024149,
"loss": 0.7217,
"step": 4980
},
{
"epoch": 0.6278930629467518,
"grad_norm": 0.34070494771003723,
"learning_rate": 0.000289223970423354,
"loss": 0.7237,
"step": 4985
},
{
"epoch": 0.6285228453569291,
"grad_norm": 0.3810268044471741,
"learning_rate": 0.00028918302177784075,
"loss": 0.7513,
"step": 4990
},
{
"epoch": 0.6291526277671065,
"grad_norm": 0.3511486053466797,
"learning_rate": 0.0002891419983878655,
"loss": 0.7112,
"step": 4995
},
{
"epoch": 0.6297824101772838,
"grad_norm": 0.30101874470710754,
"learning_rate": 0.0002891009002754588,
"loss": 0.6666,
"step": 5000
},
{
"epoch": 0.6297824101772838,
"eval_loss": 0.31327521800994873,
"eval_runtime": 6.2403,
"eval_samples_per_second": 160.248,
"eval_steps_per_second": 10.096,
"step": 5000
},
{
"epoch": 0.6304121925874611,
"grad_norm": 0.3446876108646393,
"learning_rate": 0.00028905972746269125,
"loss": 0.6651,
"step": 5005
},
{
"epoch": 0.6310419749976384,
"grad_norm": 0.3606228232383728,
"learning_rate": 0.0002890184799716736,
"loss": 0.7387,
"step": 5010
},
{
"epoch": 0.6316717574078156,
"grad_norm": 0.37057119607925415,
"learning_rate": 0.0002889771578245567,
"loss": 0.7044,
"step": 5015
},
{
"epoch": 0.6323015398179929,
"grad_norm": 0.36304429173469543,
"learning_rate": 0.0002889357610435314,
"loss": 0.7391,
"step": 5020
},
{
"epoch": 0.6329313222281702,
"grad_norm": 0.38329148292541504,
"learning_rate": 0.00028889428965082886,
"loss": 0.7045,
"step": 5025
},
{
"epoch": 0.6335611046383475,
"grad_norm": 0.3362608850002289,
"learning_rate": 0.00028885274366872006,
"loss": 0.6865,
"step": 5030
},
{
"epoch": 0.6341908870485248,
"grad_norm": 0.4079527258872986,
"learning_rate": 0.00028881112311951625,
"loss": 0.6892,
"step": 5035
},
{
"epoch": 0.634820669458702,
"grad_norm": 0.35261860489845276,
"learning_rate": 0.00028876942802556847,
"loss": 0.7189,
"step": 5040
},
{
"epoch": 0.6354504518688793,
"grad_norm": 0.40486040711402893,
"learning_rate": 0.00028872765840926804,
"loss": 0.7385,
"step": 5045
},
{
"epoch": 0.6360802342790566,
"grad_norm": 0.32852765917778015,
"learning_rate": 0.0002886858142930462,
"loss": 0.6267,
"step": 5050
},
{
"epoch": 0.6367100166892339,
"grad_norm": 0.31455445289611816,
"learning_rate": 0.0002886438956993741,
"loss": 0.6813,
"step": 5055
},
{
"epoch": 0.6373397990994112,
"grad_norm": 0.3047012686729431,
"learning_rate": 0.00028860190265076304,
"loss": 0.6862,
"step": 5060
},
{
"epoch": 0.6379695815095885,
"grad_norm": 0.34203359484672546,
"learning_rate": 0.0002885598351697643,
"loss": 0.6996,
"step": 5065
},
{
"epoch": 0.6385993639197657,
"grad_norm": 0.4077922999858856,
"learning_rate": 0.0002885176932789691,
"loss": 0.7018,
"step": 5070
},
{
"epoch": 0.639229146329943,
"grad_norm": 0.3590135872364044,
"learning_rate": 0.00028847547700100836,
"loss": 0.6741,
"step": 5075
},
{
"epoch": 0.6398589287401203,
"grad_norm": 0.33030763268470764,
"learning_rate": 0.0002884331863585535,
"loss": 0.6775,
"step": 5080
},
{
"epoch": 0.6404887111502976,
"grad_norm": 0.3921838104724884,
"learning_rate": 0.0002883908213743153,
"loss": 0.7359,
"step": 5085
},
{
"epoch": 0.6411184935604749,
"grad_norm": 0.35765379667282104,
"learning_rate": 0.0002883483820710449,
"loss": 0.6953,
"step": 5090
},
{
"epoch": 0.6417482759706521,
"grad_norm": 0.3486902415752411,
"learning_rate": 0.0002883058684715331,
"loss": 0.6848,
"step": 5095
},
{
"epoch": 0.6423780583808294,
"grad_norm": 0.35446256399154663,
"learning_rate": 0.0002882632805986108,
"loss": 0.7031,
"step": 5100
},
{
"epoch": 0.6430078407910067,
"grad_norm": 0.3666916489601135,
"learning_rate": 0.00028822061847514843,
"loss": 0.7135,
"step": 5105
},
{
"epoch": 0.643637623201184,
"grad_norm": 0.38766369223594666,
"learning_rate": 0.00028817788212405666,
"loss": 0.6623,
"step": 5110
},
{
"epoch": 0.6442674056113613,
"grad_norm": 0.3532891273498535,
"learning_rate": 0.0002881350715682859,
"loss": 0.699,
"step": 5115
},
{
"epoch": 0.6448971880215385,
"grad_norm": 0.36512479186058044,
"learning_rate": 0.0002880921868308263,
"loss": 0.6859,
"step": 5120
},
{
"epoch": 0.6455269704317158,
"grad_norm": 0.34285515546798706,
"learning_rate": 0.0002880492279347081,
"loss": 0.7254,
"step": 5125
},
{
"epoch": 0.6461567528418931,
"grad_norm": 0.3731713891029358,
"learning_rate": 0.00028800619490300107,
"loss": 0.6995,
"step": 5130
},
{
"epoch": 0.6467865352520704,
"grad_norm": 0.37182632088661194,
"learning_rate": 0.000287963087758815,
"loss": 0.7262,
"step": 5135
},
{
"epoch": 0.6474163176622477,
"grad_norm": 0.371231347322464,
"learning_rate": 0.0002879199065252994,
"loss": 0.7051,
"step": 5140
},
{
"epoch": 0.648046100072425,
"grad_norm": 0.35507723689079285,
"learning_rate": 0.00028787665122564357,
"loss": 0.6799,
"step": 5145
},
{
"epoch": 0.6486758824826022,
"grad_norm": 0.4001401662826538,
"learning_rate": 0.0002878333218830766,
"loss": 0.7718,
"step": 5150
},
{
"epoch": 0.6493056648927795,
"grad_norm": 0.36585733294487,
"learning_rate": 0.0002877899185208673,
"loss": 0.6652,
"step": 5155
},
{
"epoch": 0.6499354473029568,
"grad_norm": 0.3719576895236969,
"learning_rate": 0.00028774644116232436,
"loss": 0.7232,
"step": 5160
},
{
"epoch": 0.6505652297131341,
"grad_norm": 0.40236014127731323,
"learning_rate": 0.000287702889830796,
"loss": 0.6697,
"step": 5165
},
{
"epoch": 0.6511950121233114,
"grad_norm": 0.4343264400959015,
"learning_rate": 0.00028765926454967037,
"loss": 0.6877,
"step": 5170
},
{
"epoch": 0.6518247945334886,
"grad_norm": 0.3576568067073822,
"learning_rate": 0.00028761556534237514,
"loss": 0.7239,
"step": 5175
},
{
"epoch": 0.6524545769436659,
"grad_norm": 0.33383145928382874,
"learning_rate": 0.00028757179223237793,
"loss": 0.6822,
"step": 5180
},
{
"epoch": 0.6530843593538432,
"grad_norm": 0.353253573179245,
"learning_rate": 0.0002875279452431858,
"loss": 0.6925,
"step": 5185
},
{
"epoch": 0.6537141417640205,
"grad_norm": 0.3755667209625244,
"learning_rate": 0.0002874840243983455,
"loss": 0.6872,
"step": 5190
},
{
"epoch": 0.6543439241741978,
"grad_norm": 0.3973848521709442,
"learning_rate": 0.00028744002972144376,
"loss": 0.7251,
"step": 5195
},
{
"epoch": 0.6549737065843751,
"grad_norm": 0.3476422131061554,
"learning_rate": 0.0002873959612361066,
"loss": 0.6964,
"step": 5200
},
{
"epoch": 0.6556034889945523,
"grad_norm": 0.42737796902656555,
"learning_rate": 0.0002873518189659997,
"loss": 0.7106,
"step": 5205
},
{
"epoch": 0.6562332714047296,
"grad_norm": 0.3009507358074188,
"learning_rate": 0.00028730760293482863,
"loss": 0.6614,
"step": 5210
},
{
"epoch": 0.656863053814907,
"grad_norm": 0.38053247332572937,
"learning_rate": 0.00028726331316633835,
"loss": 0.6963,
"step": 5215
},
{
"epoch": 0.6574928362250843,
"grad_norm": 0.4153291583061218,
"learning_rate": 0.00028721894968431345,
"loss": 0.7471,
"step": 5220
},
{
"epoch": 0.6581226186352616,
"grad_norm": 0.36470016837120056,
"learning_rate": 0.0002871745125125782,
"loss": 0.6558,
"step": 5225
},
{
"epoch": 0.6587524010454388,
"grad_norm": 0.3935704827308655,
"learning_rate": 0.00028713000167499627,
"loss": 0.7025,
"step": 5230
},
{
"epoch": 0.659382183455616,
"grad_norm": 0.36777618527412415,
"learning_rate": 0.0002870854171954711,
"loss": 0.7386,
"step": 5235
},
{
"epoch": 0.6600119658657934,
"grad_norm": 0.36549127101898193,
"learning_rate": 0.0002870407590979455,
"loss": 0.703,
"step": 5240
},
{
"epoch": 0.6606417482759707,
"grad_norm": 0.37523144483566284,
"learning_rate": 0.00028699602740640194,
"loss": 0.6708,
"step": 5245
},
{
"epoch": 0.661271530686148,
"grad_norm": 0.3451475203037262,
"learning_rate": 0.00028695122214486237,
"loss": 0.6776,
"step": 5250
},
{
"epoch": 0.6619013130963253,
"grad_norm": 0.35215169191360474,
"learning_rate": 0.00028690634333738816,
"loss": 0.6983,
"step": 5255
},
{
"epoch": 0.6625310955065025,
"grad_norm": 0.37627631425857544,
"learning_rate": 0.00028686139100808037,
"loss": 0.6844,
"step": 5260
},
{
"epoch": 0.6631608779166798,
"grad_norm": 0.34171178936958313,
"learning_rate": 0.0002868163651810793,
"loss": 0.7068,
"step": 5265
},
{
"epoch": 0.6637906603268571,
"grad_norm": 0.3566179573535919,
"learning_rate": 0.0002867712658805649,
"loss": 0.6618,
"step": 5270
},
{
"epoch": 0.6644204427370344,
"grad_norm": 0.3453030586242676,
"learning_rate": 0.00028672609313075664,
"loss": 0.7046,
"step": 5275
},
{
"epoch": 0.6650502251472117,
"grad_norm": 0.40633949637413025,
"learning_rate": 0.00028668084695591316,
"loss": 0.6931,
"step": 5280
},
{
"epoch": 0.6656800075573889,
"grad_norm": 0.3927484154701233,
"learning_rate": 0.00028663552738033275,
"loss": 0.7051,
"step": 5285
},
{
"epoch": 0.6663097899675662,
"grad_norm": 0.35829389095306396,
"learning_rate": 0.000286590134428353,
"loss": 0.7051,
"step": 5290
},
{
"epoch": 0.6669395723777435,
"grad_norm": 0.4202066957950592,
"learning_rate": 0.00028654466812435105,
"loss": 0.7179,
"step": 5295
},
{
"epoch": 0.6675693547879208,
"grad_norm": 0.37852293252944946,
"learning_rate": 0.0002864991284927433,
"loss": 0.7107,
"step": 5300
},
{
"epoch": 0.6681991371980981,
"grad_norm": 0.3831678330898285,
"learning_rate": 0.0002864535155579856,
"loss": 0.659,
"step": 5305
},
{
"epoch": 0.6688289196082754,
"grad_norm": 0.3563750684261322,
"learning_rate": 0.0002864078293445731,
"loss": 0.7111,
"step": 5310
},
{
"epoch": 0.6694587020184526,
"grad_norm": 0.3460354804992676,
"learning_rate": 0.0002863620698770403,
"loss": 0.6822,
"step": 5315
},
{
"epoch": 0.6700884844286299,
"grad_norm": 0.36469632387161255,
"learning_rate": 0.0002863162371799612,
"loss": 0.6298,
"step": 5320
},
{
"epoch": 0.6707182668388072,
"grad_norm": 0.3730217218399048,
"learning_rate": 0.00028627033127794896,
"loss": 0.7137,
"step": 5325
},
{
"epoch": 0.6713480492489845,
"grad_norm": 0.347002774477005,
"learning_rate": 0.00028622435219565606,
"loss": 0.6873,
"step": 5330
},
{
"epoch": 0.6719778316591618,
"grad_norm": 0.35723358392715454,
"learning_rate": 0.00028617829995777433,
"loss": 0.7055,
"step": 5335
},
{
"epoch": 0.672607614069339,
"grad_norm": 0.3175225257873535,
"learning_rate": 0.0002861321745890349,
"loss": 0.6702,
"step": 5340
},
{
"epoch": 0.6732373964795163,
"grad_norm": 0.3599521517753601,
"learning_rate": 0.00028608597611420807,
"loss": 0.6646,
"step": 5345
},
{
"epoch": 0.6738671788896936,
"grad_norm": 0.4381812810897827,
"learning_rate": 0.00028603970455810357,
"loss": 0.7122,
"step": 5350
},
{
"epoch": 0.6744969612998709,
"grad_norm": 0.3400894105434418,
"learning_rate": 0.00028599335994557027,
"loss": 0.705,
"step": 5355
},
{
"epoch": 0.6751267437100482,
"grad_norm": 0.3332962989807129,
"learning_rate": 0.00028594694230149625,
"loss": 0.6497,
"step": 5360
},
{
"epoch": 0.6757565261202254,
"grad_norm": 0.386343389749527,
"learning_rate": 0.00028590045165080883,
"loss": 0.6344,
"step": 5365
},
{
"epoch": 0.6763863085304027,
"grad_norm": 0.4404468834400177,
"learning_rate": 0.0002858538880184746,
"loss": 0.7115,
"step": 5370
},
{
"epoch": 0.67701609094058,
"grad_norm": 0.35227730870246887,
"learning_rate": 0.00028580725142949925,
"loss": 0.702,
"step": 5375
},
{
"epoch": 0.6776458733507573,
"grad_norm": 0.38216719031333923,
"learning_rate": 0.00028576054190892775,
"loss": 0.6845,
"step": 5380
},
{
"epoch": 0.6782756557609346,
"grad_norm": 0.3602873682975769,
"learning_rate": 0.0002857137594818441,
"loss": 0.7156,
"step": 5385
},
{
"epoch": 0.6789054381711119,
"grad_norm": 0.38896870613098145,
"learning_rate": 0.00028566690417337166,
"loss": 0.7029,
"step": 5390
},
{
"epoch": 0.6795352205812891,
"grad_norm": 0.3434313237667084,
"learning_rate": 0.0002856199760086726,
"loss": 0.687,
"step": 5395
},
{
"epoch": 0.6801650029914664,
"grad_norm": 0.381331205368042,
"learning_rate": 0.0002855729750129487,
"loss": 0.6597,
"step": 5400
},
{
"epoch": 0.6807947854016437,
"grad_norm": 0.35004013776779175,
"learning_rate": 0.0002855259012114403,
"loss": 0.6604,
"step": 5405
},
{
"epoch": 0.681424567811821,
"grad_norm": 0.3601452112197876,
"learning_rate": 0.0002854787546294272,
"loss": 0.6949,
"step": 5410
},
{
"epoch": 0.6820543502219983,
"grad_norm": 0.3827126920223236,
"learning_rate": 0.0002854315352922282,
"loss": 0.7121,
"step": 5415
},
{
"epoch": 0.6826841326321755,
"grad_norm": 0.35859569907188416,
"learning_rate": 0.0002853842432252012,
"loss": 0.6662,
"step": 5420
},
{
"epoch": 0.6833139150423528,
"grad_norm": 0.36607855558395386,
"learning_rate": 0.00028533687845374304,
"loss": 0.6716,
"step": 5425
},
{
"epoch": 0.6839436974525301,
"grad_norm": 0.3658086061477661,
"learning_rate": 0.00028528944100328975,
"loss": 0.6718,
"step": 5430
},
{
"epoch": 0.6845734798627074,
"grad_norm": 0.3442821800708771,
"learning_rate": 0.00028524193089931633,
"loss": 0.6474,
"step": 5435
},
{
"epoch": 0.6852032622728847,
"grad_norm": 0.38460132479667664,
"learning_rate": 0.0002851943481673367,
"loss": 0.6973,
"step": 5440
},
{
"epoch": 0.685833044683062,
"grad_norm": 0.3717944622039795,
"learning_rate": 0.000285146692832904,
"loss": 0.6962,
"step": 5445
},
{
"epoch": 0.6864628270932392,
"grad_norm": 0.42136862874031067,
"learning_rate": 0.00028509896492161013,
"loss": 0.6783,
"step": 5450
},
{
"epoch": 0.6870926095034166,
"grad_norm": 0.37208443880081177,
"learning_rate": 0.0002850511644590862,
"loss": 0.6915,
"step": 5455
},
{
"epoch": 0.6877223919135939,
"grad_norm": 0.3807058036327362,
"learning_rate": 0.000285003291471002,
"loss": 0.7269,
"step": 5460
},
{
"epoch": 0.6883521743237712,
"grad_norm": 0.38431763648986816,
"learning_rate": 0.00028495534598306645,
"loss": 0.6589,
"step": 5465
},
{
"epoch": 0.6889819567339485,
"grad_norm": 0.372773140668869,
"learning_rate": 0.0002849073280210274,
"loss": 0.6922,
"step": 5470
},
{
"epoch": 0.6896117391441257,
"grad_norm": 0.3280029892921448,
"learning_rate": 0.00028485923761067164,
"loss": 0.6887,
"step": 5475
},
{
"epoch": 0.690241521554303,
"grad_norm": 0.3463418483734131,
"learning_rate": 0.0002848110747778247,
"loss": 0.6565,
"step": 5480
},
{
"epoch": 0.6908713039644803,
"grad_norm": 0.3423214256763458,
"learning_rate": 0.00028476283954835123,
"loss": 0.6412,
"step": 5485
},
{
"epoch": 0.6915010863746576,
"grad_norm": 0.3461606204509735,
"learning_rate": 0.0002847145319481546,
"loss": 0.6803,
"step": 5490
},
{
"epoch": 0.6921308687848349,
"grad_norm": 0.38746729493141174,
"learning_rate": 0.0002846661520031772,
"loss": 0.6424,
"step": 5495
},
{
"epoch": 0.6927606511950122,
"grad_norm": 0.32353097200393677,
"learning_rate": 0.00028461769973939997,
"loss": 0.6761,
"step": 5500
},
{
"epoch": 0.6933904336051894,
"grad_norm": 0.3790241777896881,
"learning_rate": 0.00028456917518284304,
"loss": 0.6683,
"step": 5505
},
{
"epoch": 0.6940202160153667,
"grad_norm": 0.3713475465774536,
"learning_rate": 0.0002845205783595651,
"loss": 0.6663,
"step": 5510
},
{
"epoch": 0.694649998425544,
"grad_norm": 0.3859196901321411,
"learning_rate": 0.00028447190929566384,
"loss": 0.6717,
"step": 5515
},
{
"epoch": 0.6952797808357213,
"grad_norm": 0.34451383352279663,
"learning_rate": 0.0002844231680172756,
"loss": 0.6368,
"step": 5520
},
{
"epoch": 0.6959095632458986,
"grad_norm": 0.3519328534603119,
"learning_rate": 0.00028437435455057564,
"loss": 0.6882,
"step": 5525
},
{
"epoch": 0.6965393456560758,
"grad_norm": 0.382755309343338,
"learning_rate": 0.0002843254689217778,
"loss": 0.6415,
"step": 5530
},
{
"epoch": 0.6971691280662531,
"grad_norm": 0.35310298204421997,
"learning_rate": 0.0002842765111571349,
"loss": 0.6744,
"step": 5535
},
{
"epoch": 0.6977989104764304,
"grad_norm": 0.3392702341079712,
"learning_rate": 0.0002842274812829382,
"loss": 0.6705,
"step": 5540
},
{
"epoch": 0.6984286928866077,
"grad_norm": 0.36502036452293396,
"learning_rate": 0.00028417837932551805,
"loss": 0.6777,
"step": 5545
},
{
"epoch": 0.699058475296785,
"grad_norm": 0.36270782351493835,
"learning_rate": 0.0002841292053112432,
"loss": 0.6988,
"step": 5550
},
{
"epoch": 0.6996882577069622,
"grad_norm": 0.3752531111240387,
"learning_rate": 0.0002840799592665213,
"loss": 0.6745,
"step": 5555
},
{
"epoch": 0.7003180401171395,
"grad_norm": 0.32373905181884766,
"learning_rate": 0.00028403064121779853,
"loss": 0.664,
"step": 5560
},
{
"epoch": 0.7009478225273168,
"grad_norm": 0.4017639756202698,
"learning_rate": 0.0002839812511915599,
"loss": 0.6793,
"step": 5565
},
{
"epoch": 0.7015776049374941,
"grad_norm": 0.33867186307907104,
"learning_rate": 0.00028393178921432883,
"loss": 0.6811,
"step": 5570
},
{
"epoch": 0.7022073873476714,
"grad_norm": 0.3769174814224243,
"learning_rate": 0.0002838822553126677,
"loss": 0.7118,
"step": 5575
},
{
"epoch": 0.7028371697578487,
"grad_norm": 0.36820533871650696,
"learning_rate": 0.00028383264951317727,
"loss": 0.6581,
"step": 5580
},
{
"epoch": 0.7034669521680259,
"grad_norm": 0.37128061056137085,
"learning_rate": 0.00028378297184249694,
"loss": 0.6722,
"step": 5585
},
{
"epoch": 0.7040967345782032,
"grad_norm": 0.39225873351097107,
"learning_rate": 0.00028373322232730483,
"loss": 0.6846,
"step": 5590
},
{
"epoch": 0.7047265169883805,
"grad_norm": 0.3394504189491272,
"learning_rate": 0.0002836834009943175,
"loss": 0.6815,
"step": 5595
},
{
"epoch": 0.7053562993985578,
"grad_norm": 0.37265124917030334,
"learning_rate": 0.0002836335078702903,
"loss": 0.6614,
"step": 5600
},
{
"epoch": 0.7059860818087351,
"grad_norm": 0.33066150546073914,
"learning_rate": 0.00028358354298201673,
"loss": 0.6701,
"step": 5605
},
{
"epoch": 0.7066158642189123,
"grad_norm": 0.35536128282546997,
"learning_rate": 0.0002835335063563293,
"loss": 0.6149,
"step": 5610
},
{
"epoch": 0.7072456466290896,
"grad_norm": 0.35491225123405457,
"learning_rate": 0.0002834833980200987,
"loss": 0.6773,
"step": 5615
},
{
"epoch": 0.7078754290392669,
"grad_norm": 0.37837696075439453,
"learning_rate": 0.0002834332180002343,
"loss": 0.6899,
"step": 5620
},
{
"epoch": 0.7085052114494442,
"grad_norm": 0.3391937017440796,
"learning_rate": 0.0002833829663236838,
"loss": 0.7041,
"step": 5625
},
{
"epoch": 0.7091349938596215,
"grad_norm": 0.3482423424720764,
"learning_rate": 0.00028333264301743375,
"loss": 0.6597,
"step": 5630
},
{
"epoch": 0.7097647762697988,
"grad_norm": 0.4188586175441742,
"learning_rate": 0.00028328224810850866,
"loss": 0.6916,
"step": 5635
},
{
"epoch": 0.710394558679976,
"grad_norm": 0.32832324504852295,
"learning_rate": 0.0002832317816239718,
"loss": 0.6791,
"step": 5640
},
{
"epoch": 0.7110243410901533,
"grad_norm": 0.343058705329895,
"learning_rate": 0.00028318124359092496,
"loss": 0.6423,
"step": 5645
},
{
"epoch": 0.7116541235003306,
"grad_norm": 0.37011584639549255,
"learning_rate": 0.0002831306340365081,
"loss": 0.6783,
"step": 5650
},
{
"epoch": 0.7122839059105079,
"grad_norm": 0.38297170400619507,
"learning_rate": 0.00028307995298789974,
"loss": 0.6751,
"step": 5655
},
{
"epoch": 0.7129136883206852,
"grad_norm": 0.38705122470855713,
"learning_rate": 0.00028302920047231677,
"loss": 0.6844,
"step": 5660
},
{
"epoch": 0.7135434707308624,
"grad_norm": 0.3647492527961731,
"learning_rate": 0.0002829783765170144,
"loss": 0.6811,
"step": 5665
},
{
"epoch": 0.7141732531410397,
"grad_norm": 0.3796983063220978,
"learning_rate": 0.0002829274811492863,
"loss": 0.6766,
"step": 5670
},
{
"epoch": 0.714803035551217,
"grad_norm": 0.36972787976264954,
"learning_rate": 0.00028287651439646444,
"loss": 0.6701,
"step": 5675
},
{
"epoch": 0.7154328179613944,
"grad_norm": 0.37298983335494995,
"learning_rate": 0.0002828254762859192,
"loss": 0.6439,
"step": 5680
},
{
"epoch": 0.7160626003715717,
"grad_norm": 0.3464621603488922,
"learning_rate": 0.0002827743668450591,
"loss": 0.6626,
"step": 5685
},
{
"epoch": 0.716692382781749,
"grad_norm": 0.34213629364967346,
"learning_rate": 0.00028272318610133104,
"loss": 0.6987,
"step": 5690
},
{
"epoch": 0.7173221651919262,
"grad_norm": 0.38596463203430176,
"learning_rate": 0.0002826719340822204,
"loss": 0.6846,
"step": 5695
},
{
"epoch": 0.7179519476021035,
"grad_norm": 0.3410765826702118,
"learning_rate": 0.0002826206108152506,
"loss": 0.6769,
"step": 5700
},
{
"epoch": 0.7185817300122808,
"grad_norm": 0.3370499610900879,
"learning_rate": 0.0002825692163279834,
"loss": 0.6563,
"step": 5705
},
{
"epoch": 0.7192115124224581,
"grad_norm": 0.3973693549633026,
"learning_rate": 0.0002825177506480189,
"loss": 0.6587,
"step": 5710
},
{
"epoch": 0.7198412948326354,
"grad_norm": 0.3341182470321655,
"learning_rate": 0.0002824662138029952,
"loss": 0.6489,
"step": 5715
},
{
"epoch": 0.7204710772428126,
"grad_norm": 0.3598056733608246,
"learning_rate": 0.00028241460582058883,
"loss": 0.6623,
"step": 5720
},
{
"epoch": 0.7211008596529899,
"grad_norm": 0.34275728464126587,
"learning_rate": 0.00028236292672851443,
"loss": 0.6987,
"step": 5725
},
{
"epoch": 0.7217306420631672,
"grad_norm": 0.3606712222099304,
"learning_rate": 0.000282311176554525,
"loss": 0.6947,
"step": 5730
},
{
"epoch": 0.7223604244733445,
"grad_norm": 0.32409214973449707,
"learning_rate": 0.0002822593553264114,
"loss": 0.6468,
"step": 5735
},
{
"epoch": 0.7229902068835218,
"grad_norm": 0.3465891182422638,
"learning_rate": 0.00028220746307200287,
"loss": 0.647,
"step": 5740
},
{
"epoch": 0.723619989293699,
"grad_norm": 0.3540678918361664,
"learning_rate": 0.0002821554998191667,
"loss": 0.6964,
"step": 5745
},
{
"epoch": 0.7242497717038763,
"grad_norm": 0.35845157504081726,
"learning_rate": 0.0002821034655958084,
"loss": 0.6599,
"step": 5750
},
{
"epoch": 0.7248795541140536,
"grad_norm": 0.3469247817993164,
"learning_rate": 0.00028205136042987156,
"loss": 0.6518,
"step": 5755
},
{
"epoch": 0.7255093365242309,
"grad_norm": 0.3693814277648926,
"learning_rate": 0.0002819991843493377,
"loss": 0.6339,
"step": 5760
},
{
"epoch": 0.7261391189344082,
"grad_norm": 0.35166436433792114,
"learning_rate": 0.0002819469373822268,
"loss": 0.6593,
"step": 5765
},
{
"epoch": 0.7267689013445855,
"grad_norm": 0.376717746257782,
"learning_rate": 0.00028189461955659644,
"loss": 0.6583,
"step": 5770
},
{
"epoch": 0.7273986837547627,
"grad_norm": 0.36365002393722534,
"learning_rate": 0.0002818422309005426,
"loss": 0.707,
"step": 5775
},
{
"epoch": 0.72802846616494,
"grad_norm": 0.3356451392173767,
"learning_rate": 0.00028178977144219914,
"loss": 0.6439,
"step": 5780
},
{
"epoch": 0.7286582485751173,
"grad_norm": 0.33520832657814026,
"learning_rate": 0.00028173724120973806,
"loss": 0.6276,
"step": 5785
},
{
"epoch": 0.7292880309852946,
"grad_norm": 0.3459213376045227,
"learning_rate": 0.00028168464023136926,
"loss": 0.648,
"step": 5790
},
{
"epoch": 0.7299178133954719,
"grad_norm": 0.3563973903656006,
"learning_rate": 0.0002816319685353406,
"loss": 0.6579,
"step": 5795
},
{
"epoch": 0.7305475958056491,
"grad_norm": 0.3637474775314331,
"learning_rate": 0.0002815792261499381,
"loss": 0.6828,
"step": 5800
},
{
"epoch": 0.7311773782158264,
"grad_norm": 0.38304394483566284,
"learning_rate": 0.00028152641310348554,
"loss": 0.6348,
"step": 5805
},
{
"epoch": 0.7318071606260037,
"grad_norm": 0.33336034417152405,
"learning_rate": 0.0002814735294243448,
"loss": 0.6337,
"step": 5810
},
{
"epoch": 0.732436943036181,
"grad_norm": 0.34154805541038513,
"learning_rate": 0.0002814205751409156,
"loss": 0.6885,
"step": 5815
},
{
"epoch": 0.7330667254463583,
"grad_norm": 0.3780697286128998,
"learning_rate": 0.00028136755028163556,
"loss": 0.6558,
"step": 5820
},
{
"epoch": 0.7336965078565356,
"grad_norm": 0.3496229946613312,
"learning_rate": 0.0002813144548749802,
"loss": 0.7058,
"step": 5825
},
{
"epoch": 0.7343262902667128,
"grad_norm": 0.36560389399528503,
"learning_rate": 0.0002812612889494631,
"loss": 0.6991,
"step": 5830
},
{
"epoch": 0.7349560726768901,
"grad_norm": 0.3215349316596985,
"learning_rate": 0.00028120805253363545,
"loss": 0.612,
"step": 5835
},
{
"epoch": 0.7355858550870674,
"grad_norm": 0.36016130447387695,
"learning_rate": 0.00028115474565608656,
"loss": 0.6905,
"step": 5840
},
{
"epoch": 0.7362156374972447,
"grad_norm": 0.3493592441082001,
"learning_rate": 0.00028110136834544336,
"loss": 0.6922,
"step": 5845
},
{
"epoch": 0.736845419907422,
"grad_norm": 0.34350746870040894,
"learning_rate": 0.00028104792063037064,
"loss": 0.6238,
"step": 5850
},
{
"epoch": 0.7374752023175992,
"grad_norm": 0.3633589446544647,
"learning_rate": 0.0002809944025395711,
"loss": 0.6775,
"step": 5855
},
{
"epoch": 0.7381049847277765,
"grad_norm": 0.3892457187175751,
"learning_rate": 0.00028094081410178515,
"loss": 0.6756,
"step": 5860
},
{
"epoch": 0.7387347671379538,
"grad_norm": 0.33569657802581787,
"learning_rate": 0.00028088715534579104,
"loss": 0.63,
"step": 5865
},
{
"epoch": 0.7393645495481311,
"grad_norm": 0.36327067017555237,
"learning_rate": 0.0002808334263004047,
"loss": 0.6653,
"step": 5870
},
{
"epoch": 0.7399943319583084,
"grad_norm": 0.32698652148246765,
"learning_rate": 0.00028077962699448,
"loss": 0.655,
"step": 5875
},
{
"epoch": 0.7406241143684857,
"grad_norm": 0.35473042726516724,
"learning_rate": 0.0002807257574569082,
"loss": 0.6341,
"step": 5880
},
{
"epoch": 0.7412538967786629,
"grad_norm": 0.33008939027786255,
"learning_rate": 0.0002806718177166185,
"loss": 0.6614,
"step": 5885
},
{
"epoch": 0.7418836791888402,
"grad_norm": 0.3434574007987976,
"learning_rate": 0.0002806178078025779,
"loss": 0.6313,
"step": 5890
},
{
"epoch": 0.7425134615990175,
"grad_norm": 0.30766573548316956,
"learning_rate": 0.00028056372774379085,
"loss": 0.6296,
"step": 5895
},
{
"epoch": 0.7431432440091948,
"grad_norm": 0.3676775097846985,
"learning_rate": 0.00028050957756929965,
"loss": 0.628,
"step": 5900
},
{
"epoch": 0.7437730264193722,
"grad_norm": 0.3424786925315857,
"learning_rate": 0.0002804553573081841,
"loss": 0.6141,
"step": 5905
},
{
"epoch": 0.7444028088295493,
"grad_norm": 0.391250878572464,
"learning_rate": 0.0002804010669895618,
"loss": 0.6615,
"step": 5910
},
{
"epoch": 0.7450325912397266,
"grad_norm": 0.34186193346977234,
"learning_rate": 0.0002803467066425878,
"loss": 0.6389,
"step": 5915
},
{
"epoch": 0.745662373649904,
"grad_norm": 0.37509649991989136,
"learning_rate": 0.0002802922762964549,
"loss": 0.6397,
"step": 5920
},
{
"epoch": 0.7462921560600813,
"grad_norm": 0.3327299654483795,
"learning_rate": 0.00028023777598039346,
"loss": 0.6241,
"step": 5925
},
{
"epoch": 0.7469219384702586,
"grad_norm": 0.37098389863967896,
"learning_rate": 0.0002801832057236714,
"loss": 0.7004,
"step": 5930
},
{
"epoch": 0.7475517208804358,
"grad_norm": 0.36630627512931824,
"learning_rate": 0.00028012856555559415,
"loss": 0.6201,
"step": 5935
},
{
"epoch": 0.7481815032906131,
"grad_norm": 0.3580261170864105,
"learning_rate": 0.00028007385550550475,
"loss": 0.6969,
"step": 5940
},
{
"epoch": 0.7488112857007904,
"grad_norm": 0.3491668105125427,
"learning_rate": 0.0002800190756027837,
"loss": 0.6457,
"step": 5945
},
{
"epoch": 0.7494410681109677,
"grad_norm": 0.2999480664730072,
"learning_rate": 0.0002799642258768491,
"loss": 0.6398,
"step": 5950
},
{
"epoch": 0.750070850521145,
"grad_norm": 0.33795973658561707,
"learning_rate": 0.00027990930635715655,
"loss": 0.6672,
"step": 5955
},
{
"epoch": 0.7507006329313223,
"grad_norm": 0.39881202578544617,
"learning_rate": 0.00027985431707319903,
"loss": 0.6796,
"step": 5960
},
{
"epoch": 0.7513304153414995,
"grad_norm": 0.4092641770839691,
"learning_rate": 0.0002797992580545071,
"loss": 0.6488,
"step": 5965
},
{
"epoch": 0.7519601977516768,
"grad_norm": 0.33037346601486206,
"learning_rate": 0.0002797441293306486,
"loss": 0.667,
"step": 5970
},
{
"epoch": 0.7525899801618541,
"grad_norm": 0.35514095425605774,
"learning_rate": 0.00027968893093122896,
"loss": 0.6984,
"step": 5975
},
{
"epoch": 0.7532197625720314,
"grad_norm": 0.4268254339694977,
"learning_rate": 0.0002796336628858911,
"loss": 0.6762,
"step": 5980
},
{
"epoch": 0.7538495449822087,
"grad_norm": 0.33386656641960144,
"learning_rate": 0.00027957832522431503,
"loss": 0.6438,
"step": 5985
},
{
"epoch": 0.7544793273923859,
"grad_norm": 0.374845415353775,
"learning_rate": 0.00027952291797621846,
"loss": 0.6422,
"step": 5990
},
{
"epoch": 0.7551091098025632,
"grad_norm": 0.32742077112197876,
"learning_rate": 0.0002794674411713563,
"loss": 0.6685,
"step": 5995
},
{
"epoch": 0.7557388922127405,
"grad_norm": 0.3118845820426941,
"learning_rate": 0.0002794118948395209,
"loss": 0.6273,
"step": 6000
},
{
"epoch": 0.7557388922127405,
"eval_loss": 0.3097546696662903,
"eval_runtime": 6.2567,
"eval_samples_per_second": 159.828,
"eval_steps_per_second": 10.069,
"step": 6000
},
{
"epoch": 0.7563686746229178,
"grad_norm": 0.3407754898071289,
"learning_rate": 0.00027935627901054197,
"loss": 0.6712,
"step": 6005
},
{
"epoch": 0.7569984570330951,
"grad_norm": 0.34817007184028625,
"learning_rate": 0.0002793005937142863,
"loss": 0.6492,
"step": 6010
},
{
"epoch": 0.7576282394432724,
"grad_norm": 0.36492645740509033,
"learning_rate": 0.00027924483898065833,
"loss": 0.6467,
"step": 6015
},
{
"epoch": 0.7582580218534496,
"grad_norm": 0.33556580543518066,
"learning_rate": 0.0002791890148395995,
"loss": 0.6486,
"step": 6020
},
{
"epoch": 0.7588878042636269,
"grad_norm": 0.36699965596199036,
"learning_rate": 0.00027913312132108874,
"loss": 0.6909,
"step": 6025
},
{
"epoch": 0.7595175866738042,
"grad_norm": 0.32526010274887085,
"learning_rate": 0.0002790771584551421,
"loss": 0.6234,
"step": 6030
},
{
"epoch": 0.7601473690839815,
"grad_norm": 0.38366591930389404,
"learning_rate": 0.00027902112627181295,
"loss": 0.6195,
"step": 6035
},
{
"epoch": 0.7607771514941588,
"grad_norm": 0.33587443828582764,
"learning_rate": 0.0002789650248011918,
"loss": 0.6546,
"step": 6040
},
{
"epoch": 0.761406933904336,
"grad_norm": 0.36170026659965515,
"learning_rate": 0.00027890885407340653,
"loss": 0.6294,
"step": 6045
},
{
"epoch": 0.7620367163145133,
"grad_norm": 0.34692490100860596,
"learning_rate": 0.000278852614118622,
"loss": 0.6468,
"step": 6050
},
{
"epoch": 0.7626664987246906,
"grad_norm": 0.346608966588974,
"learning_rate": 0.0002787963049670404,
"loss": 0.6714,
"step": 6055
},
{
"epoch": 0.7632962811348679,
"grad_norm": 0.3632940948009491,
"learning_rate": 0.00027873992664890097,
"loss": 0.6772,
"step": 6060
},
{
"epoch": 0.7639260635450452,
"grad_norm": 0.38135001063346863,
"learning_rate": 0.00027868347919448027,
"loss": 0.658,
"step": 6065
},
{
"epoch": 0.7645558459552225,
"grad_norm": 0.3518752455711365,
"learning_rate": 0.00027862696263409177,
"loss": 0.6445,
"step": 6070
},
{
"epoch": 0.7651856283653997,
"grad_norm": 0.33004361391067505,
"learning_rate": 0.00027857037699808613,
"loss": 0.6553,
"step": 6075
},
{
"epoch": 0.765815410775577,
"grad_norm": 0.36370858550071716,
"learning_rate": 0.0002785137223168512,
"loss": 0.6632,
"step": 6080
},
{
"epoch": 0.7664451931857543,
"grad_norm": 0.3472859561443329,
"learning_rate": 0.0002784569986208119,
"loss": 0.626,
"step": 6085
},
{
"epoch": 0.7670749755959316,
"grad_norm": 0.3560635447502136,
"learning_rate": 0.00027840020594043,
"loss": 0.6628,
"step": 6090
},
{
"epoch": 0.7677047580061089,
"grad_norm": 0.3515082895755768,
"learning_rate": 0.00027834334430620455,
"loss": 0.7061,
"step": 6095
},
{
"epoch": 0.7683345404162861,
"grad_norm": 0.3222733736038208,
"learning_rate": 0.00027828641374867154,
"loss": 0.617,
"step": 6100
},
{
"epoch": 0.7689643228264634,
"grad_norm": 0.3362828493118286,
"learning_rate": 0.00027822941429840397,
"loss": 0.6825,
"step": 6105
},
{
"epoch": 0.7695941052366407,
"grad_norm": 0.34228187799453735,
"learning_rate": 0.0002781723459860119,
"loss": 0.6306,
"step": 6110
},
{
"epoch": 0.770223887646818,
"grad_norm": 0.3672444820404053,
"learning_rate": 0.0002781152088421422,
"loss": 0.6601,
"step": 6115
},
{
"epoch": 0.7708536700569953,
"grad_norm": 0.3703080415725708,
"learning_rate": 0.00027805800289747894,
"loss": 0.6385,
"step": 6120
},
{
"epoch": 0.7714834524671725,
"grad_norm": 0.34456151723861694,
"learning_rate": 0.0002780007281827429,
"loss": 0.6635,
"step": 6125
},
{
"epoch": 0.7721132348773498,
"grad_norm": 0.3449029326438904,
"learning_rate": 0.00027794338472869205,
"loss": 0.6258,
"step": 6130
},
{
"epoch": 0.7727430172875271,
"grad_norm": 0.3441922068595886,
"learning_rate": 0.0002778859725661211,
"loss": 0.627,
"step": 6135
},
{
"epoch": 0.7733727996977044,
"grad_norm": 0.3855600357055664,
"learning_rate": 0.00027782849172586156,
"loss": 0.6205,
"step": 6140
},
{
"epoch": 0.7740025821078818,
"grad_norm": 0.3838488757610321,
"learning_rate": 0.0002777709422387821,
"loss": 0.6463,
"step": 6145
},
{
"epoch": 0.7746323645180591,
"grad_norm": 0.3128564953804016,
"learning_rate": 0.00027771332413578805,
"loss": 0.6639,
"step": 6150
},
{
"epoch": 0.7752621469282363,
"grad_norm": 0.32142025232315063,
"learning_rate": 0.00027765563744782166,
"loss": 0.6187,
"step": 6155
},
{
"epoch": 0.7758919293384136,
"grad_norm": 0.34378373622894287,
"learning_rate": 0.000277597882205862,
"loss": 0.659,
"step": 6160
},
{
"epoch": 0.7765217117485909,
"grad_norm": 0.35872867703437805,
"learning_rate": 0.0002775400584409249,
"loss": 0.6245,
"step": 6165
},
{
"epoch": 0.7771514941587682,
"grad_norm": 0.32217180728912354,
"learning_rate": 0.00027748216618406316,
"loss": 0.6216,
"step": 6170
},
{
"epoch": 0.7777812765689455,
"grad_norm": 0.3139524757862091,
"learning_rate": 0.00027742420546636616,
"loss": 0.6831,
"step": 6175
},
{
"epoch": 0.7784110589791227,
"grad_norm": 0.3159128427505493,
"learning_rate": 0.00027736617631896017,
"loss": 0.6417,
"step": 6180
},
{
"epoch": 0.7790408413893,
"grad_norm": 0.36738142371177673,
"learning_rate": 0.0002773080787730081,
"loss": 0.6592,
"step": 6185
},
{
"epoch": 0.7796706237994773,
"grad_norm": 0.31971079111099243,
"learning_rate": 0.0002772499128597097,
"loss": 0.6296,
"step": 6190
},
{
"epoch": 0.7803004062096546,
"grad_norm": 0.3699764609336853,
"learning_rate": 0.00027719167861030145,
"loss": 0.6161,
"step": 6195
},
{
"epoch": 0.7809301886198319,
"grad_norm": 0.3316752016544342,
"learning_rate": 0.0002771333760560564,
"loss": 0.6698,
"step": 6200
},
{
"epoch": 0.7815599710300092,
"grad_norm": 0.34318891167640686,
"learning_rate": 0.00027707500522828433,
"loss": 0.6312,
"step": 6205
},
{
"epoch": 0.7821897534401864,
"grad_norm": 0.3325194716453552,
"learning_rate": 0.00027701656615833185,
"loss": 0.6515,
"step": 6210
},
{
"epoch": 0.7828195358503637,
"grad_norm": 0.3374411463737488,
"learning_rate": 0.0002769580588775819,
"loss": 0.6811,
"step": 6215
},
{
"epoch": 0.783449318260541,
"grad_norm": 0.3507198989391327,
"learning_rate": 0.00027689948341745433,
"loss": 0.6177,
"step": 6220
},
{
"epoch": 0.7840791006707183,
"grad_norm": 0.3619876205921173,
"learning_rate": 0.00027684083980940543,
"loss": 0.6812,
"step": 6225
},
{
"epoch": 0.7847088830808956,
"grad_norm": 0.3660729229450226,
"learning_rate": 0.00027678212808492824,
"loss": 0.6888,
"step": 6230
},
{
"epoch": 0.7853386654910728,
"grad_norm": 0.37557917833328247,
"learning_rate": 0.00027672334827555226,
"loss": 0.6516,
"step": 6235
},
{
"epoch": 0.7859684479012501,
"grad_norm": 0.37117084860801697,
"learning_rate": 0.00027666450041284363,
"loss": 0.6503,
"step": 6240
},
{
"epoch": 0.7865982303114274,
"grad_norm": 0.3434617519378662,
"learning_rate": 0.00027660558452840487,
"loss": 0.6582,
"step": 6245
},
{
"epoch": 0.7872280127216047,
"grad_norm": 0.3878399431705475,
"learning_rate": 0.0002765466006538753,
"loss": 0.6309,
"step": 6250
},
{
"epoch": 0.787857795131782,
"grad_norm": 0.3379189968109131,
"learning_rate": 0.0002764875488209305,
"loss": 0.6802,
"step": 6255
},
{
"epoch": 0.7884875775419593,
"grad_norm": 0.3534158170223236,
"learning_rate": 0.0002764284290612827,
"loss": 0.6248,
"step": 6260
},
{
"epoch": 0.7891173599521365,
"grad_norm": 0.3273150324821472,
"learning_rate": 0.0002763692414066806,
"loss": 0.617,
"step": 6265
},
{
"epoch": 0.7897471423623138,
"grad_norm": 0.4256115257740021,
"learning_rate": 0.0002763099858889093,
"loss": 0.6452,
"step": 6270
},
{
"epoch": 0.7903769247724911,
"grad_norm": 0.34881314635276794,
"learning_rate": 0.0002762506625397903,
"loss": 0.6545,
"step": 6275
},
{
"epoch": 0.7910067071826684,
"grad_norm": 0.3283347487449646,
"learning_rate": 0.0002761912713911817,
"loss": 0.6819,
"step": 6280
},
{
"epoch": 0.7916364895928457,
"grad_norm": 0.33939605951309204,
"learning_rate": 0.0002761318124749778,
"loss": 0.6188,
"step": 6285
},
{
"epoch": 0.7922662720030229,
"grad_norm": 0.3786788582801819,
"learning_rate": 0.00027607228582310947,
"loss": 0.6583,
"step": 6290
},
{
"epoch": 0.7928960544132002,
"grad_norm": 0.34528714418411255,
"learning_rate": 0.0002760126914675439,
"loss": 0.6594,
"step": 6295
},
{
"epoch": 0.7935258368233775,
"grad_norm": 0.3494967818260193,
"learning_rate": 0.00027595302944028447,
"loss": 0.6241,
"step": 6300
},
{
"epoch": 0.7941556192335548,
"grad_norm": 0.350005179643631,
"learning_rate": 0.00027589329977337126,
"loss": 0.6724,
"step": 6305
},
{
"epoch": 0.7947854016437321,
"grad_norm": 0.3381168246269226,
"learning_rate": 0.0002758335024988803,
"loss": 0.6062,
"step": 6310
},
{
"epoch": 0.7954151840539094,
"grad_norm": 0.32583653926849365,
"learning_rate": 0.0002757736376489242,
"loss": 0.6602,
"step": 6315
},
{
"epoch": 0.7960449664640866,
"grad_norm": 0.33687326312065125,
"learning_rate": 0.0002757137052556517,
"loss": 0.6391,
"step": 6320
},
{
"epoch": 0.7966747488742639,
"grad_norm": 0.35395026206970215,
"learning_rate": 0.00027565370535124784,
"loss": 0.6445,
"step": 6325
},
{
"epoch": 0.7973045312844412,
"grad_norm": 0.3484829068183899,
"learning_rate": 0.000275593637967934,
"loss": 0.6242,
"step": 6330
},
{
"epoch": 0.7979343136946185,
"grad_norm": 0.32783517241477966,
"learning_rate": 0.0002755335031379677,
"loss": 0.6481,
"step": 6335
},
{
"epoch": 0.7985640961047958,
"grad_norm": 0.3683319389820099,
"learning_rate": 0.0002754733008936427,
"loss": 0.6506,
"step": 6340
},
{
"epoch": 0.799193878514973,
"grad_norm": 0.360219269990921,
"learning_rate": 0.00027541303126728907,
"loss": 0.6377,
"step": 6345
},
{
"epoch": 0.7998236609251503,
"grad_norm": 0.3323548436164856,
"learning_rate": 0.00027535269429127283,
"loss": 0.6278,
"step": 6350
},
{
"epoch": 0.8004534433353276,
"grad_norm": 0.33823835849761963,
"learning_rate": 0.0002752922899979965,
"loss": 0.5999,
"step": 6355
},
{
"epoch": 0.801083225745505,
"grad_norm": 0.35394924879074097,
"learning_rate": 0.0002752318184198984,
"loss": 0.6873,
"step": 6360
},
{
"epoch": 0.8017130081556822,
"grad_norm": 0.35529881715774536,
"learning_rate": 0.00027517127958945315,
"loss": 0.6183,
"step": 6365
},
{
"epoch": 0.8023427905658594,
"grad_norm": 0.35854044556617737,
"learning_rate": 0.00027511067353917166,
"loss": 0.6394,
"step": 6370
},
{
"epoch": 0.8029725729760367,
"grad_norm": 0.32757097482681274,
"learning_rate": 0.0002750500003016006,
"loss": 0.6383,
"step": 6375
},
{
"epoch": 0.803602355386214,
"grad_norm": 0.3267909586429596,
"learning_rate": 0.0002749892599093229,
"loss": 0.5951,
"step": 6380
},
{
"epoch": 0.8042321377963914,
"grad_norm": 0.31262004375457764,
"learning_rate": 0.0002749284523949576,
"loss": 0.6497,
"step": 6385
},
{
"epoch": 0.8048619202065687,
"grad_norm": 0.34036824107170105,
"learning_rate": 0.00027486757779115973,
"loss": 0.6295,
"step": 6390
},
{
"epoch": 0.805491702616746,
"grad_norm": 0.3461470901966095,
"learning_rate": 0.0002748066361306203,
"loss": 0.6537,
"step": 6395
},
{
"epoch": 0.8061214850269232,
"grad_norm": 0.35146886110305786,
"learning_rate": 0.00027474562744606636,
"loss": 0.6217,
"step": 6400
},
{
"epoch": 0.8067512674371005,
"grad_norm": 0.37654054164886475,
"learning_rate": 0.000274684551770261,
"loss": 0.6417,
"step": 6405
},
{
"epoch": 0.8073810498472778,
"grad_norm": 0.36115625500679016,
"learning_rate": 0.0002746234091360032,
"loss": 0.6638,
"step": 6410
},
{
"epoch": 0.8080108322574551,
"grad_norm": 0.3503740727901459,
"learning_rate": 0.00027456219957612804,
"loss": 0.6652,
"step": 6415
},
{
"epoch": 0.8086406146676324,
"grad_norm": 0.3303118646144867,
"learning_rate": 0.0002745009231235064,
"loss": 0.614,
"step": 6420
},
{
"epoch": 0.8092703970778096,
"grad_norm": 0.35880813002586365,
"learning_rate": 0.00027443957981104517,
"loss": 0.6449,
"step": 6425
},
{
"epoch": 0.8099001794879869,
"grad_norm": 0.3664454221725464,
"learning_rate": 0.000274378169671687,
"loss": 0.6448,
"step": 6430
},
{
"epoch": 0.8105299618981642,
"grad_norm": 0.38473254442214966,
"learning_rate": 0.00027431669273841067,
"loss": 0.6576,
"step": 6435
},
{
"epoch": 0.8111597443083415,
"grad_norm": 0.3694675862789154,
"learning_rate": 0.0002742551490442307,
"loss": 0.6365,
"step": 6440
},
{
"epoch": 0.8117895267185188,
"grad_norm": 0.32066047191619873,
"learning_rate": 0.0002741935386221973,
"loss": 0.6563,
"step": 6445
},
{
"epoch": 0.8124193091286961,
"grad_norm": 0.3764455020427704,
"learning_rate": 0.0002741318615053968,
"loss": 0.61,
"step": 6450
},
{
"epoch": 0.8130490915388733,
"grad_norm": 0.3913812041282654,
"learning_rate": 0.00027407011772695124,
"loss": 0.6606,
"step": 6455
},
{
"epoch": 0.8136788739490506,
"grad_norm": 0.2876626253128052,
"learning_rate": 0.0002740083073200184,
"loss": 0.6123,
"step": 6460
},
{
"epoch": 0.8143086563592279,
"grad_norm": 0.37668120861053467,
"learning_rate": 0.0002739464303177919,
"loss": 0.6323,
"step": 6465
},
{
"epoch": 0.8149384387694052,
"grad_norm": 0.3343159854412079,
"learning_rate": 0.000273884486753501,
"loss": 0.6051,
"step": 6470
},
{
"epoch": 0.8155682211795825,
"grad_norm": 0.3852281868457794,
"learning_rate": 0.00027382247666041097,
"loss": 0.6614,
"step": 6475
},
{
"epoch": 0.8161980035897597,
"grad_norm": 0.36491283774375916,
"learning_rate": 0.0002737604000718225,
"loss": 0.6383,
"step": 6480
},
{
"epoch": 0.816827785999937,
"grad_norm": 0.32019633054733276,
"learning_rate": 0.00027369825702107224,
"loss": 0.623,
"step": 6485
},
{
"epoch": 0.8174575684101143,
"grad_norm": 0.3173837661743164,
"learning_rate": 0.0002736360475415324,
"loss": 0.599,
"step": 6490
},
{
"epoch": 0.8180873508202916,
"grad_norm": 0.31505605578422546,
"learning_rate": 0.00027357377166661086,
"loss": 0.6341,
"step": 6495
},
{
"epoch": 0.8187171332304689,
"grad_norm": 0.3370759189128876,
"learning_rate": 0.00027351142942975124,
"loss": 0.6296,
"step": 6500
},
{
"epoch": 0.8193469156406462,
"grad_norm": 0.3554564416408539,
"learning_rate": 0.0002734490208644327,
"loss": 0.6587,
"step": 6505
},
{
"epoch": 0.8199766980508234,
"grad_norm": 0.3487757444381714,
"learning_rate": 0.0002733865460041701,
"loss": 0.6292,
"step": 6510
},
{
"epoch": 0.8206064804610007,
"grad_norm": 0.3280607759952545,
"learning_rate": 0.0002733240048825138,
"loss": 0.5964,
"step": 6515
},
{
"epoch": 0.821236262871178,
"grad_norm": 0.35416868329048157,
"learning_rate": 0.0002732613975330499,
"loss": 0.6089,
"step": 6520
},
{
"epoch": 0.8218660452813553,
"grad_norm": 0.3558996915817261,
"learning_rate": 0.00027319872398939995,
"loss": 0.5791,
"step": 6525
},
{
"epoch": 0.8224958276915326,
"grad_norm": 0.35394206643104553,
"learning_rate": 0.000273135984285221,
"loss": 0.6183,
"step": 6530
},
{
"epoch": 0.8231256101017098,
"grad_norm": 0.33172932267189026,
"learning_rate": 0.0002730731784542058,
"loss": 0.605,
"step": 6535
},
{
"epoch": 0.8237553925118871,
"grad_norm": 0.3498142957687378,
"learning_rate": 0.00027301030653008253,
"loss": 0.6199,
"step": 6540
},
{
"epoch": 0.8243851749220644,
"grad_norm": 0.3364173471927643,
"learning_rate": 0.0002729473685466148,
"loss": 0.6352,
"step": 6545
},
{
"epoch": 0.8250149573322417,
"grad_norm": 0.38148370385169983,
"learning_rate": 0.00027288436453760164,
"loss": 0.6216,
"step": 6550
},
{
"epoch": 0.825644739742419,
"grad_norm": 0.33975306153297424,
"learning_rate": 0.0002728212945368778,
"loss": 0.6155,
"step": 6555
},
{
"epoch": 0.8262745221525962,
"grad_norm": 0.3361944854259491,
"learning_rate": 0.0002727581585783133,
"loss": 0.6084,
"step": 6560
},
{
"epoch": 0.8269043045627735,
"grad_norm": 0.3503773808479309,
"learning_rate": 0.00027269495669581353,
"loss": 0.6355,
"step": 6565
},
{
"epoch": 0.8275340869729508,
"grad_norm": 0.35406753420829773,
"learning_rate": 0.00027263168892331934,
"loss": 0.624,
"step": 6570
},
{
"epoch": 0.8281638693831281,
"grad_norm": 0.3337428569793701,
"learning_rate": 0.00027256835529480697,
"loss": 0.6451,
"step": 6575
},
{
"epoch": 0.8287936517933054,
"grad_norm": 0.3431616425514221,
"learning_rate": 0.00027250495584428807,
"loss": 0.5969,
"step": 6580
},
{
"epoch": 0.8294234342034827,
"grad_norm": 0.4032285511493683,
"learning_rate": 0.0002724414906058096,
"loss": 0.5954,
"step": 6585
},
{
"epoch": 0.8300532166136599,
"grad_norm": 0.3352124094963074,
"learning_rate": 0.00027237795961345383,
"loss": 0.6077,
"step": 6590
},
{
"epoch": 0.8306829990238372,
"grad_norm": 0.3181077837944031,
"learning_rate": 0.0002723143629013383,
"loss": 0.6107,
"step": 6595
},
{
"epoch": 0.8313127814340145,
"grad_norm": 0.32390958070755005,
"learning_rate": 0.000272250700503616,
"loss": 0.6345,
"step": 6600
},
{
"epoch": 0.8319425638441919,
"grad_norm": 0.3199234902858734,
"learning_rate": 0.0002721869724544749,
"loss": 0.6268,
"step": 6605
},
{
"epoch": 0.8325723462543692,
"grad_norm": 0.38811957836151123,
"learning_rate": 0.00027212317878813863,
"loss": 0.643,
"step": 6610
},
{
"epoch": 0.8332021286645463,
"grad_norm": 0.38010820746421814,
"learning_rate": 0.00027205931953886575,
"loss": 0.6055,
"step": 6615
},
{
"epoch": 0.8338319110747237,
"grad_norm": 0.3288145065307617,
"learning_rate": 0.00027199539474095013,
"loss": 0.6311,
"step": 6620
},
{
"epoch": 0.834461693484901,
"grad_norm": 0.33361807465553284,
"learning_rate": 0.0002719314044287209,
"loss": 0.6083,
"step": 6625
},
{
"epoch": 0.8350914758950783,
"grad_norm": 0.350864440202713,
"learning_rate": 0.0002718673486365423,
"loss": 0.5969,
"step": 6630
},
{
"epoch": 0.8357212583052556,
"grad_norm": 0.35795754194259644,
"learning_rate": 0.0002718032273988137,
"loss": 0.6623,
"step": 6635
},
{
"epoch": 0.8363510407154329,
"grad_norm": 0.3748815357685089,
"learning_rate": 0.0002717390407499697,
"loss": 0.6301,
"step": 6640
},
{
"epoch": 0.8369808231256101,
"grad_norm": 0.3146851360797882,
"learning_rate": 0.00027167478872448,
"loss": 0.62,
"step": 6645
},
{
"epoch": 0.8376106055357874,
"grad_norm": 0.3758367598056793,
"learning_rate": 0.0002716104713568495,
"loss": 0.6202,
"step": 6650
},
{
"epoch": 0.8382403879459647,
"grad_norm": 0.4035817086696625,
"learning_rate": 0.0002715460886816179,
"loss": 0.606,
"step": 6655
},
{
"epoch": 0.838870170356142,
"grad_norm": 0.3586306869983673,
"learning_rate": 0.00027148164073336026,
"loss": 0.6523,
"step": 6660
},
{
"epoch": 0.8394999527663193,
"grad_norm": 0.33375057578086853,
"learning_rate": 0.0002714171275466866,
"loss": 0.6193,
"step": 6665
},
{
"epoch": 0.8401297351764965,
"grad_norm": 0.30258163809776306,
"learning_rate": 0.0002713525491562421,
"loss": 0.6225,
"step": 6670
},
{
"epoch": 0.8407595175866738,
"grad_norm": 0.33032524585723877,
"learning_rate": 0.00027128790559670667,
"loss": 0.628,
"step": 6675
},
{
"epoch": 0.8413892999968511,
"grad_norm": 0.36689457297325134,
"learning_rate": 0.00027122319690279535,
"loss": 0.6341,
"step": 6680
},
{
"epoch": 0.8420190824070284,
"grad_norm": 0.35744035243988037,
"learning_rate": 0.00027115842310925837,
"loss": 0.5945,
"step": 6685
},
{
"epoch": 0.8426488648172057,
"grad_norm": 0.3377218246459961,
"learning_rate": 0.0002710935842508806,
"loss": 0.6216,
"step": 6690
},
{
"epoch": 0.843278647227383,
"grad_norm": 0.3244309723377228,
"learning_rate": 0.000271028680362482,
"loss": 0.6045,
"step": 6695
},
{
"epoch": 0.8439084296375602,
"grad_norm": 0.34593185782432556,
"learning_rate": 0.00027096371147891744,
"loss": 0.6277,
"step": 6700
},
{
"epoch": 0.8445382120477375,
"grad_norm": 0.3151993751525879,
"learning_rate": 0.0002708986776350767,
"loss": 0.5929,
"step": 6705
},
{
"epoch": 0.8451679944579148,
"grad_norm": 0.38307860493659973,
"learning_rate": 0.0002708335788658845,
"loss": 0.5934,
"step": 6710
},
{
"epoch": 0.8457977768680921,
"grad_norm": 0.3155449330806732,
"learning_rate": 0.0002707684152063003,
"loss": 0.5838,
"step": 6715
},
{
"epoch": 0.8464275592782694,
"grad_norm": 0.3827744424343109,
"learning_rate": 0.00027070318669131845,
"loss": 0.5976,
"step": 6720
},
{
"epoch": 0.8470573416884466,
"grad_norm": 0.35382625460624695,
"learning_rate": 0.00027063789335596825,
"loss": 0.5997,
"step": 6725
},
{
"epoch": 0.8476871240986239,
"grad_norm": 0.36884164810180664,
"learning_rate": 0.00027057253523531365,
"loss": 0.6373,
"step": 6730
},
{
"epoch": 0.8483169065088012,
"grad_norm": 0.35557276010513306,
"learning_rate": 0.0002705071123644534,
"loss": 0.6717,
"step": 6735
},
{
"epoch": 0.8489466889189785,
"grad_norm": 0.3088480234146118,
"learning_rate": 0.00027044162477852124,
"loss": 0.6011,
"step": 6740
},
{
"epoch": 0.8495764713291558,
"grad_norm": 0.33665964007377625,
"learning_rate": 0.0002703760725126853,
"loss": 0.6039,
"step": 6745
},
{
"epoch": 0.850206253739333,
"grad_norm": 0.3297666311264038,
"learning_rate": 0.0002703104556021488,
"loss": 0.6226,
"step": 6750
},
{
"epoch": 0.8508360361495103,
"grad_norm": 0.33897268772125244,
"learning_rate": 0.00027024477408214945,
"loss": 0.5564,
"step": 6755
},
{
"epoch": 0.8514658185596876,
"grad_norm": 0.3549680709838867,
"learning_rate": 0.0002701790279879597,
"loss": 0.5989,
"step": 6760
},
{
"epoch": 0.8520956009698649,
"grad_norm": 0.31162139773368835,
"learning_rate": 0.0002701132173548868,
"loss": 0.6363,
"step": 6765
},
{
"epoch": 0.8527253833800422,
"grad_norm": 0.35543885827064514,
"learning_rate": 0.0002700473422182724,
"loss": 0.6228,
"step": 6770
},
{
"epoch": 0.8533551657902195,
"grad_norm": 0.3361263871192932,
"learning_rate": 0.0002699814026134932,
"loss": 0.5957,
"step": 6775
},
{
"epoch": 0.8539849482003967,
"grad_norm": 0.2764013707637787,
"learning_rate": 0.00026991539857596,
"loss": 0.5982,
"step": 6780
},
{
"epoch": 0.854614730610574,
"grad_norm": 0.3229328691959381,
"learning_rate": 0.0002698493301411187,
"loss": 0.6562,
"step": 6785
},
{
"epoch": 0.8552445130207513,
"grad_norm": 0.3163946270942688,
"learning_rate": 0.00026978319734444943,
"loss": 0.6125,
"step": 6790
},
{
"epoch": 0.8558742954309286,
"grad_norm": 0.38809090852737427,
"learning_rate": 0.0002697170002214671,
"loss": 0.6308,
"step": 6795
},
{
"epoch": 0.8565040778411059,
"grad_norm": 0.3416973650455475,
"learning_rate": 0.0002696507388077209,
"loss": 0.6565,
"step": 6800
},
{
"epoch": 0.8571338602512831,
"grad_norm": 0.3220008909702301,
"learning_rate": 0.00026958441313879494,
"loss": 0.6211,
"step": 6805
},
{
"epoch": 0.8577636426614604,
"grad_norm": 0.34507647156715393,
"learning_rate": 0.00026951802325030755,
"loss": 0.6384,
"step": 6810
},
{
"epoch": 0.8583934250716377,
"grad_norm": 0.3345770239830017,
"learning_rate": 0.00026945156917791154,
"loss": 0.6566,
"step": 6815
},
{
"epoch": 0.859023207481815,
"grad_norm": 0.32488980889320374,
"learning_rate": 0.0002693850509572943,
"loss": 0.626,
"step": 6820
},
{
"epoch": 0.8596529898919923,
"grad_norm": 0.3537434935569763,
"learning_rate": 0.00026931846862417766,
"loss": 0.6539,
"step": 6825
},
{
"epoch": 0.8602827723021697,
"grad_norm": 0.3165736794471741,
"learning_rate": 0.0002692518222143179,
"loss": 0.6468,
"step": 6830
},
{
"epoch": 0.8609125547123468,
"grad_norm": 0.34746891260147095,
"learning_rate": 0.0002691851117635056,
"loss": 0.6498,
"step": 6835
},
{
"epoch": 0.8615423371225241,
"grad_norm": 0.3370078206062317,
"learning_rate": 0.00026911833730756577,
"loss": 0.5951,
"step": 6840
},
{
"epoch": 0.8621721195327015,
"grad_norm": 0.3180099427700043,
"learning_rate": 0.00026905149888235787,
"loss": 0.609,
"step": 6845
},
{
"epoch": 0.8628019019428788,
"grad_norm": 0.34123897552490234,
"learning_rate": 0.0002689845965237757,
"loss": 0.6228,
"step": 6850
},
{
"epoch": 0.8634316843530561,
"grad_norm": 0.3529733717441559,
"learning_rate": 0.00026891763026774725,
"loss": 0.6101,
"step": 6855
},
{
"epoch": 0.8640614667632333,
"grad_norm": 0.3116464614868164,
"learning_rate": 0.00026885060015023496,
"loss": 0.5734,
"step": 6860
},
{
"epoch": 0.8646912491734106,
"grad_norm": 0.3331621587276459,
"learning_rate": 0.00026878350620723556,
"loss": 0.6004,
"step": 6865
},
{
"epoch": 0.8653210315835879,
"grad_norm": 0.3215835690498352,
"learning_rate": 0.00026871634847478007,
"loss": 0.6105,
"step": 6870
},
{
"epoch": 0.8659508139937652,
"grad_norm": 0.3454177677631378,
"learning_rate": 0.0002686491269889336,
"loss": 0.6203,
"step": 6875
},
{
"epoch": 0.8665805964039425,
"grad_norm": 0.3336181640625,
"learning_rate": 0.0002685818417857958,
"loss": 0.6179,
"step": 6880
},
{
"epoch": 0.8672103788141198,
"grad_norm": 0.3452587127685547,
"learning_rate": 0.00026851449290150024,
"loss": 0.5918,
"step": 6885
},
{
"epoch": 0.867840161224297,
"grad_norm": 0.37552833557128906,
"learning_rate": 0.0002684470803722148,
"loss": 0.6284,
"step": 6890
},
{
"epoch": 0.8684699436344743,
"grad_norm": 0.33525559306144714,
"learning_rate": 0.0002683796042341416,
"loss": 0.6465,
"step": 6895
},
{
"epoch": 0.8690997260446516,
"grad_norm": 0.3272569477558136,
"learning_rate": 0.00026831206452351683,
"loss": 0.636,
"step": 6900
},
{
"epoch": 0.8697295084548289,
"grad_norm": 0.35215091705322266,
"learning_rate": 0.0002682444612766109,
"loss": 0.6415,
"step": 6905
},
{
"epoch": 0.8703592908650062,
"grad_norm": 0.33025211095809937,
"learning_rate": 0.0002681767945297282,
"loss": 0.6677,
"step": 6910
},
{
"epoch": 0.8709890732751834,
"grad_norm": 0.34073176980018616,
"learning_rate": 0.0002681090643192075,
"loss": 0.6386,
"step": 6915
},
{
"epoch": 0.8716188556853607,
"grad_norm": 0.4070134162902832,
"learning_rate": 0.0002680412706814213,
"loss": 0.6365,
"step": 6920
},
{
"epoch": 0.872248638095538,
"grad_norm": 0.33693283796310425,
"learning_rate": 0.00026797341365277644,
"loss": 0.6465,
"step": 6925
},
{
"epoch": 0.8728784205057153,
"grad_norm": 0.3678983747959137,
"learning_rate": 0.0002679054932697136,
"loss": 0.594,
"step": 6930
},
{
"epoch": 0.8735082029158926,
"grad_norm": 0.31632333993911743,
"learning_rate": 0.00026783750956870764,
"loss": 0.6128,
"step": 6935
},
{
"epoch": 0.8741379853260698,
"grad_norm": 0.3184865713119507,
"learning_rate": 0.0002677694625862674,
"loss": 0.5955,
"step": 6940
},
{
"epoch": 0.8747677677362471,
"grad_norm": 0.33729860186576843,
"learning_rate": 0.00026770135235893556,
"loss": 0.609,
"step": 6945
},
{
"epoch": 0.8753975501464244,
"grad_norm": 0.3195466995239258,
"learning_rate": 0.0002676331789232889,
"loss": 0.6399,
"step": 6950
},
{
"epoch": 0.8760273325566017,
"grad_norm": 0.35504212975502014,
"learning_rate": 0.0002675649423159382,
"loss": 0.6162,
"step": 6955
},
{
"epoch": 0.876657114966779,
"grad_norm": 0.3598940372467041,
"learning_rate": 0.000267496642573528,
"loss": 0.6117,
"step": 6960
},
{
"epoch": 0.8772868973769563,
"grad_norm": 0.32016637921333313,
"learning_rate": 0.0002674282797327368,
"loss": 0.6129,
"step": 6965
},
{
"epoch": 0.8779166797871335,
"grad_norm": 0.36968451738357544,
"learning_rate": 0.00026735985383027704,
"loss": 0.619,
"step": 6970
},
{
"epoch": 0.8785464621973108,
"grad_norm": 0.3299955427646637,
"learning_rate": 0.000267291364902895,
"loss": 0.5894,
"step": 6975
},
{
"epoch": 0.8791762446074881,
"grad_norm": 0.34892305731773376,
"learning_rate": 0.0002672228129873708,
"loss": 0.6152,
"step": 6980
},
{
"epoch": 0.8798060270176654,
"grad_norm": 0.379016637802124,
"learning_rate": 0.00026715419812051833,
"loss": 0.6633,
"step": 6985
},
{
"epoch": 0.8804358094278427,
"grad_norm": 0.3378797173500061,
"learning_rate": 0.00026708552033918544,
"loss": 0.5911,
"step": 6990
},
{
"epoch": 0.8810655918380199,
"grad_norm": 0.3348138928413391,
"learning_rate": 0.0002670167796802536,
"loss": 0.5841,
"step": 6995
},
{
"epoch": 0.8816953742481972,
"grad_norm": 0.36374861001968384,
"learning_rate": 0.0002669479761806381,
"loss": 0.5973,
"step": 7000
},
{
"epoch": 0.8816953742481972,
"eval_loss": 0.3066178560256958,
"eval_runtime": 6.2494,
"eval_samples_per_second": 160.014,
"eval_steps_per_second": 10.081,
"step": 7000
},
{
"epoch": 0.8823251566583745,
"grad_norm": 0.31616318225860596,
"learning_rate": 0.000266879109877288,
"loss": 0.6302,
"step": 7005
},
{
"epoch": 0.8829549390685518,
"grad_norm": 0.37413114309310913,
"learning_rate": 0.00026681018080718615,
"loss": 0.6141,
"step": 7010
},
{
"epoch": 0.8835847214787291,
"grad_norm": 0.3616124987602234,
"learning_rate": 0.0002667411890073489,
"loss": 0.6081,
"step": 7015
},
{
"epoch": 0.8842145038889064,
"grad_norm": 0.3536156713962555,
"learning_rate": 0.00026667213451482655,
"loss": 0.6101,
"step": 7020
},
{
"epoch": 0.8848442862990836,
"grad_norm": 0.2826579809188843,
"learning_rate": 0.00026660301736670293,
"loss": 0.5803,
"step": 7025
},
{
"epoch": 0.8854740687092609,
"grad_norm": 0.3352709710597992,
"learning_rate": 0.00026653383760009546,
"loss": 0.5994,
"step": 7030
},
{
"epoch": 0.8861038511194382,
"grad_norm": 0.320122092962265,
"learning_rate": 0.00026646459525215524,
"loss": 0.6159,
"step": 7035
},
{
"epoch": 0.8867336335296155,
"grad_norm": 0.3512963652610779,
"learning_rate": 0.0002663952903600671,
"loss": 0.6034,
"step": 7040
},
{
"epoch": 0.8873634159397928,
"grad_norm": 0.358071506023407,
"learning_rate": 0.00026632592296104926,
"loss": 0.6155,
"step": 7045
},
{
"epoch": 0.88799319834997,
"grad_norm": 0.342318058013916,
"learning_rate": 0.0002662564930923536,
"loss": 0.5997,
"step": 7050
},
{
"epoch": 0.8886229807601473,
"grad_norm": 0.291960746049881,
"learning_rate": 0.0002661870007912656,
"loss": 0.5721,
"step": 7055
},
{
"epoch": 0.8892527631703246,
"grad_norm": 0.3608805239200592,
"learning_rate": 0.0002661174460951042,
"loss": 0.6248,
"step": 7060
},
{
"epoch": 0.889882545580502,
"grad_norm": 0.329289972782135,
"learning_rate": 0.0002660478290412218,
"loss": 0.6163,
"step": 7065
},
{
"epoch": 0.8905123279906793,
"grad_norm": 0.352383553981781,
"learning_rate": 0.0002659781496670044,
"loss": 0.6252,
"step": 7070
},
{
"epoch": 0.8911421104008566,
"grad_norm": 0.3424574136734009,
"learning_rate": 0.0002659084080098714,
"loss": 0.5562,
"step": 7075
},
{
"epoch": 0.8917718928110338,
"grad_norm": 0.32095563411712646,
"learning_rate": 0.0002658386041072757,
"loss": 0.6232,
"step": 7080
},
{
"epoch": 0.892401675221211,
"grad_norm": 0.3307218849658966,
"learning_rate": 0.00026576873799670356,
"loss": 0.5958,
"step": 7085
},
{
"epoch": 0.8930314576313884,
"grad_norm": 0.31858259439468384,
"learning_rate": 0.00026569880971567464,
"loss": 0.6128,
"step": 7090
},
{
"epoch": 0.8936612400415657,
"grad_norm": 0.3014832139015198,
"learning_rate": 0.00026562881930174213,
"loss": 0.5886,
"step": 7095
},
{
"epoch": 0.894291022451743,
"grad_norm": 0.35925576090812683,
"learning_rate": 0.00026555876679249234,
"loss": 0.6032,
"step": 7100
},
{
"epoch": 0.8949208048619202,
"grad_norm": 0.337100625038147,
"learning_rate": 0.0002654886522255452,
"loss": 0.6217,
"step": 7105
},
{
"epoch": 0.8955505872720975,
"grad_norm": 0.34906861186027527,
"learning_rate": 0.00026541847563855373,
"loss": 0.5999,
"step": 7110
},
{
"epoch": 0.8961803696822748,
"grad_norm": 0.2829444110393524,
"learning_rate": 0.00026534823706920443,
"loss": 0.5747,
"step": 7115
},
{
"epoch": 0.8968101520924521,
"grad_norm": 0.3298097550868988,
"learning_rate": 0.00026527793655521697,
"loss": 0.5959,
"step": 7120
},
{
"epoch": 0.8974399345026294,
"grad_norm": 0.3762158453464508,
"learning_rate": 0.0002652075741343444,
"loss": 0.6325,
"step": 7125
},
{
"epoch": 0.8980697169128066,
"grad_norm": 0.3318065106868744,
"learning_rate": 0.00026513714984437284,
"loss": 0.6015,
"step": 7130
},
{
"epoch": 0.8986994993229839,
"grad_norm": 0.3132246434688568,
"learning_rate": 0.0002650666637231218,
"loss": 0.6317,
"step": 7135
},
{
"epoch": 0.8993292817331612,
"grad_norm": 0.3308473527431488,
"learning_rate": 0.00026499611580844403,
"loss": 0.6364,
"step": 7140
},
{
"epoch": 0.8999590641433385,
"grad_norm": 0.31450155377388,
"learning_rate": 0.0002649255061382252,
"loss": 0.6186,
"step": 7145
},
{
"epoch": 0.9005888465535158,
"grad_norm": 0.3408615291118622,
"learning_rate": 0.00026485483475038445,
"loss": 0.5954,
"step": 7150
},
{
"epoch": 0.9012186289636931,
"grad_norm": 0.34355321526527405,
"learning_rate": 0.0002647841016828738,
"loss": 0.6143,
"step": 7155
},
{
"epoch": 0.9018484113738703,
"grad_norm": 0.35341107845306396,
"learning_rate": 0.00026471330697367865,
"loss": 0.5887,
"step": 7160
},
{
"epoch": 0.9024781937840476,
"grad_norm": 0.3439336121082306,
"learning_rate": 0.0002646424506608173,
"loss": 0.6152,
"step": 7165
},
{
"epoch": 0.9031079761942249,
"grad_norm": 0.32301509380340576,
"learning_rate": 0.00026457153278234126,
"loss": 0.6191,
"step": 7170
},
{
"epoch": 0.9037377586044022,
"grad_norm": 0.3085480034351349,
"learning_rate": 0.000264500553376335,
"loss": 0.5993,
"step": 7175
},
{
"epoch": 0.9043675410145795,
"grad_norm": 0.3285475969314575,
"learning_rate": 0.0002644295124809161,
"loss": 0.5832,
"step": 7180
},
{
"epoch": 0.9049973234247567,
"grad_norm": 0.3160327076911926,
"learning_rate": 0.0002643584101342352,
"loss": 0.6258,
"step": 7185
},
{
"epoch": 0.905627105834934,
"grad_norm": 0.30449238419532776,
"learning_rate": 0.0002642872463744759,
"loss": 0.62,
"step": 7190
},
{
"epoch": 0.9062568882451113,
"grad_norm": 0.31154754757881165,
"learning_rate": 0.00026421602123985455,
"loss": 0.5888,
"step": 7195
},
{
"epoch": 0.9068866706552886,
"grad_norm": 0.32224607467651367,
"learning_rate": 0.0002641447347686209,
"loss": 0.5971,
"step": 7200
},
{
"epoch": 0.9075164530654659,
"grad_norm": 0.33809399604797363,
"learning_rate": 0.0002640733869990573,
"loss": 0.5942,
"step": 7205
},
{
"epoch": 0.9081462354756432,
"grad_norm": 0.337990403175354,
"learning_rate": 0.0002640019779694792,
"loss": 0.5996,
"step": 7210
},
{
"epoch": 0.9087760178858204,
"grad_norm": 0.33843520283699036,
"learning_rate": 0.0002639305077182348,
"loss": 0.6009,
"step": 7215
},
{
"epoch": 0.9094058002959977,
"grad_norm": 0.31854307651519775,
"learning_rate": 0.00026385897628370536,
"loss": 0.5929,
"step": 7220
},
{
"epoch": 0.910035582706175,
"grad_norm": 0.31263160705566406,
"learning_rate": 0.0002637873837043049,
"loss": 0.5861,
"step": 7225
},
{
"epoch": 0.9106653651163523,
"grad_norm": 0.3141006827354431,
"learning_rate": 0.00026371573001848005,
"loss": 0.6204,
"step": 7230
},
{
"epoch": 0.9112951475265296,
"grad_norm": 0.3565130829811096,
"learning_rate": 0.00026364401526471077,
"loss": 0.6051,
"step": 7235
},
{
"epoch": 0.9119249299367068,
"grad_norm": 0.3886755108833313,
"learning_rate": 0.0002635722394815094,
"loss": 0.6162,
"step": 7240
},
{
"epoch": 0.9125547123468841,
"grad_norm": 0.32173478603363037,
"learning_rate": 0.0002635004027074211,
"loss": 0.5908,
"step": 7245
},
{
"epoch": 0.9131844947570614,
"grad_norm": 0.3483346998691559,
"learning_rate": 0.0002634285049810239,
"loss": 0.5934,
"step": 7250
},
{
"epoch": 0.9138142771672387,
"grad_norm": 0.31829094886779785,
"learning_rate": 0.00026335654634092857,
"loss": 0.6205,
"step": 7255
},
{
"epoch": 0.914444059577416,
"grad_norm": 0.2864934206008911,
"learning_rate": 0.0002632845268257785,
"loss": 0.5486,
"step": 7260
},
{
"epoch": 0.9150738419875933,
"grad_norm": 0.34583529829978943,
"learning_rate": 0.0002632124464742499,
"loss": 0.5994,
"step": 7265
},
{
"epoch": 0.9157036243977705,
"grad_norm": 0.3405662775039673,
"learning_rate": 0.00026314030532505146,
"loss": 0.5941,
"step": 7270
},
{
"epoch": 0.9163334068079478,
"grad_norm": 0.319985568523407,
"learning_rate": 0.00026306810341692464,
"loss": 0.5949,
"step": 7275
},
{
"epoch": 0.9169631892181251,
"grad_norm": 0.3206420838832855,
"learning_rate": 0.00026299584078864354,
"loss": 0.5895,
"step": 7280
},
{
"epoch": 0.9175929716283024,
"grad_norm": 0.33022215962409973,
"learning_rate": 0.00026292351747901486,
"loss": 0.6018,
"step": 7285
},
{
"epoch": 0.9182227540384797,
"grad_norm": 0.3440692722797394,
"learning_rate": 0.00026285113352687785,
"loss": 0.5818,
"step": 7290
},
{
"epoch": 0.9188525364486569,
"grad_norm": 0.3580811619758606,
"learning_rate": 0.0002627786889711043,
"loss": 0.6024,
"step": 7295
},
{
"epoch": 0.9194823188588342,
"grad_norm": 0.3101358413696289,
"learning_rate": 0.0002627061838505987,
"loss": 0.6241,
"step": 7300
},
{
"epoch": 0.9201121012690116,
"grad_norm": 0.3681425452232361,
"learning_rate": 0.00026263361820429783,
"loss": 0.5759,
"step": 7305
},
{
"epoch": 0.9207418836791889,
"grad_norm": 0.3331769108772278,
"learning_rate": 0.0002625609920711712,
"loss": 0.5696,
"step": 7310
},
{
"epoch": 0.9213716660893662,
"grad_norm": 0.34252071380615234,
"learning_rate": 0.00026248830549022064,
"loss": 0.6171,
"step": 7315
},
{
"epoch": 0.9220014484995434,
"grad_norm": 0.31009170413017273,
"learning_rate": 0.00026241555850048056,
"loss": 0.5758,
"step": 7320
},
{
"epoch": 0.9226312309097207,
"grad_norm": 0.33126717805862427,
"learning_rate": 0.00026234275114101765,
"loss": 0.557,
"step": 7325
},
{
"epoch": 0.923261013319898,
"grad_norm": 0.35423141717910767,
"learning_rate": 0.00026226988345093126,
"loss": 0.6239,
"step": 7330
},
{
"epoch": 0.9238907957300753,
"grad_norm": 0.31321558356285095,
"learning_rate": 0.0002621969554693529,
"loss": 0.5796,
"step": 7335
},
{
"epoch": 0.9245205781402526,
"grad_norm": 0.38709312677383423,
"learning_rate": 0.00026212396723544664,
"loss": 0.5831,
"step": 7340
},
{
"epoch": 0.9251503605504299,
"grad_norm": 0.3205506205558777,
"learning_rate": 0.0002620509187884088,
"loss": 0.5577,
"step": 7345
},
{
"epoch": 0.9257801429606071,
"grad_norm": 0.3263196647167206,
"learning_rate": 0.00026197781016746804,
"loss": 0.5729,
"step": 7350
},
{
"epoch": 0.9264099253707844,
"grad_norm": 0.3553536534309387,
"learning_rate": 0.0002619046414118854,
"loss": 0.5968,
"step": 7355
},
{
"epoch": 0.9270397077809617,
"grad_norm": 0.4170524477958679,
"learning_rate": 0.0002618314125609541,
"loss": 0.5731,
"step": 7360
},
{
"epoch": 0.927669490191139,
"grad_norm": 0.3739701807498932,
"learning_rate": 0.00026175812365399976,
"loss": 0.5785,
"step": 7365
},
{
"epoch": 0.9282992726013163,
"grad_norm": 0.32139813899993896,
"learning_rate": 0.0002616847747303802,
"loss": 0.5909,
"step": 7370
},
{
"epoch": 0.9289290550114935,
"grad_norm": 0.3099890947341919,
"learning_rate": 0.00026161136582948544,
"loss": 0.5579,
"step": 7375
},
{
"epoch": 0.9295588374216708,
"grad_norm": 0.349729984998703,
"learning_rate": 0.0002615378969907378,
"loss": 0.5762,
"step": 7380
},
{
"epoch": 0.9301886198318481,
"grad_norm": 0.3257734775543213,
"learning_rate": 0.00026146436825359167,
"loss": 0.6216,
"step": 7385
},
{
"epoch": 0.9308184022420254,
"grad_norm": 0.3399578332901001,
"learning_rate": 0.0002613907796575337,
"loss": 0.5694,
"step": 7390
},
{
"epoch": 0.9314481846522027,
"grad_norm": 0.3863985240459442,
"learning_rate": 0.0002613171312420826,
"loss": 0.6416,
"step": 7395
},
{
"epoch": 0.93207796706238,
"grad_norm": 0.3288150429725647,
"learning_rate": 0.0002612434230467892,
"loss": 0.5839,
"step": 7400
},
{
"epoch": 0.9327077494725572,
"grad_norm": 0.37783902883529663,
"learning_rate": 0.00026116965511123664,
"loss": 0.5919,
"step": 7405
},
{
"epoch": 0.9333375318827345,
"grad_norm": 0.36346110701560974,
"learning_rate": 0.00026109582747503986,
"loss": 0.5796,
"step": 7410
},
{
"epoch": 0.9339673142929118,
"grad_norm": 0.3194875419139862,
"learning_rate": 0.00026102194017784606,
"loss": 0.5808,
"step": 7415
},
{
"epoch": 0.9345970967030891,
"grad_norm": 0.286823570728302,
"learning_rate": 0.00026094799325933435,
"loss": 0.5605,
"step": 7420
},
{
"epoch": 0.9352268791132664,
"grad_norm": 0.3147251307964325,
"learning_rate": 0.0002608739867592159,
"loss": 0.572,
"step": 7425
},
{
"epoch": 0.9358566615234436,
"grad_norm": 0.34172821044921875,
"learning_rate": 0.000260799920717234,
"loss": 0.5763,
"step": 7430
},
{
"epoch": 0.9364864439336209,
"grad_norm": 0.32804232835769653,
"learning_rate": 0.0002607257951731637,
"loss": 0.5925,
"step": 7435
},
{
"epoch": 0.9371162263437982,
"grad_norm": 0.2969893515110016,
"learning_rate": 0.0002606516101668122,
"loss": 0.5754,
"step": 7440
},
{
"epoch": 0.9377460087539755,
"grad_norm": 0.3364142179489136,
"learning_rate": 0.00026057736573801844,
"loss": 0.6248,
"step": 7445
},
{
"epoch": 0.9383757911641528,
"grad_norm": 0.3493711054325104,
"learning_rate": 0.0002605030619266534,
"loss": 0.5828,
"step": 7450
},
{
"epoch": 0.9390055735743301,
"grad_norm": 0.3338306248188019,
"learning_rate": 0.00026042869877262,
"loss": 0.5947,
"step": 7455
},
{
"epoch": 0.9396353559845073,
"grad_norm": 0.30441364645957947,
"learning_rate": 0.0002603542763158529,
"loss": 0.5743,
"step": 7460
},
{
"epoch": 0.9402651383946846,
"grad_norm": 0.31838342547416687,
"learning_rate": 0.0002602797945963186,
"loss": 0.5493,
"step": 7465
},
{
"epoch": 0.9408949208048619,
"grad_norm": 0.3308780789375305,
"learning_rate": 0.0002602052536540156,
"loss": 0.5984,
"step": 7470
},
{
"epoch": 0.9415247032150392,
"grad_norm": 0.30487555265426636,
"learning_rate": 0.00026013065352897407,
"loss": 0.5687,
"step": 7475
},
{
"epoch": 0.9421544856252165,
"grad_norm": 0.33297523856163025,
"learning_rate": 0.0002600559942612559,
"loss": 0.5728,
"step": 7480
},
{
"epoch": 0.9427842680353937,
"grad_norm": 0.3194848299026489,
"learning_rate": 0.00025998127589095483,
"loss": 0.5939,
"step": 7485
},
{
"epoch": 0.943414050445571,
"grad_norm": 0.3401489555835724,
"learning_rate": 0.0002599064984581964,
"loss": 0.5282,
"step": 7490
},
{
"epoch": 0.9440438328557483,
"grad_norm": 0.3722991943359375,
"learning_rate": 0.0002598316620031378,
"loss": 0.6044,
"step": 7495
},
{
"epoch": 0.9446736152659256,
"grad_norm": 0.3582395613193512,
"learning_rate": 0.0002597567665659678,
"loss": 0.574,
"step": 7500
},
{
"epoch": 0.9453033976761029,
"grad_norm": 0.30922654271125793,
"learning_rate": 0.0002596818121869071,
"loss": 0.6086,
"step": 7505
},
{
"epoch": 0.9459331800862802,
"grad_norm": 0.34381213784217834,
"learning_rate": 0.00025960679890620785,
"loss": 0.6032,
"step": 7510
},
{
"epoch": 0.9465629624964574,
"grad_norm": 0.3153468072414398,
"learning_rate": 0.0002595317267641539,
"loss": 0.5758,
"step": 7515
},
{
"epoch": 0.9471927449066347,
"grad_norm": 0.30763527750968933,
"learning_rate": 0.0002594565958010607,
"loss": 0.6036,
"step": 7520
},
{
"epoch": 0.947822527316812,
"grad_norm": 0.33897343277931213,
"learning_rate": 0.00025938140605727536,
"loss": 0.5879,
"step": 7525
},
{
"epoch": 0.9484523097269894,
"grad_norm": 0.2996034324169159,
"learning_rate": 0.00025930615757317635,
"loss": 0.6095,
"step": 7530
},
{
"epoch": 0.9490820921371667,
"grad_norm": 0.37265533208847046,
"learning_rate": 0.00025923085038917395,
"loss": 0.5718,
"step": 7535
},
{
"epoch": 0.9497118745473438,
"grad_norm": 0.32904815673828125,
"learning_rate": 0.00025915548454570977,
"loss": 0.5689,
"step": 7540
},
{
"epoch": 0.9503416569575212,
"grad_norm": 0.3493824005126953,
"learning_rate": 0.000259080060083257,
"loss": 0.594,
"step": 7545
},
{
"epoch": 0.9509714393676985,
"grad_norm": 0.33561789989471436,
"learning_rate": 0.0002590045770423204,
"loss": 0.5604,
"step": 7550
},
{
"epoch": 0.9516012217778758,
"grad_norm": 0.3272433876991272,
"learning_rate": 0.00025892903546343587,
"loss": 0.5819,
"step": 7555
},
{
"epoch": 0.9522310041880531,
"grad_norm": 0.34539222717285156,
"learning_rate": 0.00025885343538717116,
"loss": 0.591,
"step": 7560
},
{
"epoch": 0.9528607865982303,
"grad_norm": 0.3331897258758545,
"learning_rate": 0.0002587777768541252,
"loss": 0.5885,
"step": 7565
},
{
"epoch": 0.9534905690084076,
"grad_norm": 0.3285147547721863,
"learning_rate": 0.00025870205990492827,
"loss": 0.5561,
"step": 7570
},
{
"epoch": 0.9541203514185849,
"grad_norm": 0.3221907317638397,
"learning_rate": 0.0002586262845802422,
"loss": 0.5837,
"step": 7575
},
{
"epoch": 0.9547501338287622,
"grad_norm": 0.4986007511615753,
"learning_rate": 0.00025855045092076,
"loss": 0.5645,
"step": 7580
},
{
"epoch": 0.9553799162389395,
"grad_norm": 0.33891043066978455,
"learning_rate": 0.00025847455896720615,
"loss": 0.5801,
"step": 7585
},
{
"epoch": 0.9560096986491168,
"grad_norm": 0.345480740070343,
"learning_rate": 0.00025839860876033626,
"loss": 0.5876,
"step": 7590
},
{
"epoch": 0.956639481059294,
"grad_norm": 0.39212220907211304,
"learning_rate": 0.0002583226003409374,
"loss": 0.5949,
"step": 7595
},
{
"epoch": 0.9572692634694713,
"grad_norm": 0.3195202648639679,
"learning_rate": 0.00025824653374982776,
"loss": 0.592,
"step": 7600
},
{
"epoch": 0.9578990458796486,
"grad_norm": 0.31688785552978516,
"learning_rate": 0.00025817040902785694,
"loss": 0.5432,
"step": 7605
},
{
"epoch": 0.9585288282898259,
"grad_norm": 0.3165288269519806,
"learning_rate": 0.00025809422621590554,
"loss": 0.552,
"step": 7610
},
{
"epoch": 0.9591586107000032,
"grad_norm": 0.33528926968574524,
"learning_rate": 0.0002580179853548856,
"loss": 0.5745,
"step": 7615
},
{
"epoch": 0.9597883931101804,
"grad_norm": 0.34123846888542175,
"learning_rate": 0.0002579416864857401,
"loss": 0.6019,
"step": 7620
},
{
"epoch": 0.9604181755203577,
"grad_norm": 0.3223724663257599,
"learning_rate": 0.0002578653296494433,
"loss": 0.5725,
"step": 7625
},
{
"epoch": 0.961047957930535,
"grad_norm": 0.349751740694046,
"learning_rate": 0.0002577889148870006,
"loss": 0.5739,
"step": 7630
},
{
"epoch": 0.9616777403407123,
"grad_norm": 0.3111324608325958,
"learning_rate": 0.0002577124422394484,
"loss": 0.5555,
"step": 7635
},
{
"epoch": 0.9623075227508896,
"grad_norm": 0.364615797996521,
"learning_rate": 0.00025763591174785433,
"loss": 0.5789,
"step": 7640
},
{
"epoch": 0.9629373051610669,
"grad_norm": 0.31817707419395447,
"learning_rate": 0.000257559323453317,
"loss": 0.5799,
"step": 7645
},
{
"epoch": 0.9635670875712441,
"grad_norm": 0.33710840344429016,
"learning_rate": 0.000257482677396966,
"loss": 0.6,
"step": 7650
},
{
"epoch": 0.9641968699814214,
"grad_norm": 0.3512105345726013,
"learning_rate": 0.00025740597361996215,
"loss": 0.5772,
"step": 7655
},
{
"epoch": 0.9648266523915987,
"grad_norm": 0.32505640387535095,
"learning_rate": 0.00025732921216349705,
"loss": 0.5872,
"step": 7660
},
{
"epoch": 0.965456434801776,
"grad_norm": 0.32156363129615784,
"learning_rate": 0.0002572523930687933,
"loss": 0.5842,
"step": 7665
},
{
"epoch": 0.9660862172119533,
"grad_norm": 0.313147634267807,
"learning_rate": 0.0002571755163771046,
"loss": 0.5697,
"step": 7670
},
{
"epoch": 0.9667159996221305,
"grad_norm": 0.3494894504547119,
"learning_rate": 0.00025709858212971545,
"loss": 0.5651,
"step": 7675
},
{
"epoch": 0.9673457820323078,
"grad_norm": 0.317107230424881,
"learning_rate": 0.00025702159036794135,
"loss": 0.5563,
"step": 7680
},
{
"epoch": 0.9679755644424851,
"grad_norm": 0.3228907585144043,
"learning_rate": 0.00025694454113312854,
"loss": 0.5642,
"step": 7685
},
{
"epoch": 0.9686053468526624,
"grad_norm": 0.33400991559028625,
"learning_rate": 0.00025686743446665426,
"loss": 0.5738,
"step": 7690
},
{
"epoch": 0.9692351292628397,
"grad_norm": 0.35151737928390503,
"learning_rate": 0.0002567902704099266,
"loss": 0.562,
"step": 7695
},
{
"epoch": 0.969864911673017,
"grad_norm": 0.33582988381385803,
"learning_rate": 0.00025671304900438437,
"loss": 0.5724,
"step": 7700
},
{
"epoch": 0.9704946940831942,
"grad_norm": 0.4050043523311615,
"learning_rate": 0.00025663577029149727,
"loss": 0.6038,
"step": 7705
},
{
"epoch": 0.9711244764933715,
"grad_norm": 0.3320407271385193,
"learning_rate": 0.00025655843431276565,
"loss": 0.5725,
"step": 7710
},
{
"epoch": 0.9717542589035488,
"grad_norm": 0.33253729343414307,
"learning_rate": 0.00025648104110972074,
"loss": 0.559,
"step": 7715
},
{
"epoch": 0.9723840413137261,
"grad_norm": 0.3316608667373657,
"learning_rate": 0.0002564035907239245,
"loss": 0.5813,
"step": 7720
},
{
"epoch": 0.9730138237239034,
"grad_norm": 0.35272932052612305,
"learning_rate": 0.0002563260831969695,
"loss": 0.5544,
"step": 7725
},
{
"epoch": 0.9736436061340806,
"grad_norm": 0.2942962348461151,
"learning_rate": 0.00025624851857047914,
"loss": 0.5741,
"step": 7730
},
{
"epoch": 0.9742733885442579,
"grad_norm": 0.30799049139022827,
"learning_rate": 0.0002561708968861073,
"loss": 0.5604,
"step": 7735
},
{
"epoch": 0.9749031709544352,
"grad_norm": 0.2929095923900604,
"learning_rate": 0.00025609321818553864,
"loss": 0.5399,
"step": 7740
},
{
"epoch": 0.9755329533646125,
"grad_norm": 0.3074556291103363,
"learning_rate": 0.00025601548251048833,
"loss": 0.5714,
"step": 7745
},
{
"epoch": 0.9761627357747898,
"grad_norm": 0.3233494162559509,
"learning_rate": 0.0002559376899027024,
"loss": 0.5559,
"step": 7750
},
{
"epoch": 0.976792518184967,
"grad_norm": 0.3106531500816345,
"learning_rate": 0.000255859840403957,
"loss": 0.5462,
"step": 7755
},
{
"epoch": 0.9774223005951443,
"grad_norm": 0.35069772601127625,
"learning_rate": 0.00025578193405605923,
"loss": 0.5635,
"step": 7760
},
{
"epoch": 0.9780520830053216,
"grad_norm": 0.310811311006546,
"learning_rate": 0.00025570397090084656,
"loss": 0.5658,
"step": 7765
},
{
"epoch": 0.978681865415499,
"grad_norm": 0.36216944456100464,
"learning_rate": 0.000255625950980187,
"loss": 0.5785,
"step": 7770
},
{
"epoch": 0.9793116478256763,
"grad_norm": 0.30353617668151855,
"learning_rate": 0.000255547874335979,
"loss": 0.5347,
"step": 7775
},
{
"epoch": 0.9799414302358536,
"grad_norm": 0.3112618029117584,
"learning_rate": 0.00025546974101015154,
"loss": 0.5559,
"step": 7780
},
{
"epoch": 0.9805712126460308,
"grad_norm": 0.3782903552055359,
"learning_rate": 0.00025539155104466394,
"loss": 0.5717,
"step": 7785
},
{
"epoch": 0.9812009950562081,
"grad_norm": 0.3308548331260681,
"learning_rate": 0.000255313304481506,
"loss": 0.5511,
"step": 7790
},
{
"epoch": 0.9818307774663854,
"grad_norm": 0.2971625328063965,
"learning_rate": 0.000255235001362698,
"loss": 0.5411,
"step": 7795
},
{
"epoch": 0.9824605598765627,
"grad_norm": 0.3594948351383209,
"learning_rate": 0.0002551566417302904,
"loss": 0.5817,
"step": 7800
},
{
"epoch": 0.98309034228674,
"grad_norm": 0.3537582755088806,
"learning_rate": 0.0002550782256263642,
"loss": 0.5631,
"step": 7805
},
{
"epoch": 0.9837201246969172,
"grad_norm": 0.3132795989513397,
"learning_rate": 0.0002549997530930306,
"loss": 0.5725,
"step": 7810
},
{
"epoch": 0.9843499071070945,
"grad_norm": 0.3250652551651001,
"learning_rate": 0.00025492122417243113,
"loss": 0.5786,
"step": 7815
},
{
"epoch": 0.9849796895172718,
"grad_norm": 0.3318973183631897,
"learning_rate": 0.0002548426389067376,
"loss": 0.5399,
"step": 7820
},
{
"epoch": 0.9856094719274491,
"grad_norm": 0.3335192799568176,
"learning_rate": 0.00025476399733815214,
"loss": 0.5693,
"step": 7825
},
{
"epoch": 0.9862392543376264,
"grad_norm": 0.31399449706077576,
"learning_rate": 0.00025468529950890703,
"loss": 0.5821,
"step": 7830
},
{
"epoch": 0.9868690367478037,
"grad_norm": 0.33886855840682983,
"learning_rate": 0.00025460654546126485,
"loss": 0.556,
"step": 7835
},
{
"epoch": 0.9874988191579809,
"grad_norm": 0.3620472848415375,
"learning_rate": 0.0002545277352375183,
"loss": 0.6104,
"step": 7840
},
{
"epoch": 0.9881286015681582,
"grad_norm": 0.31123921275138855,
"learning_rate": 0.0002544488688799902,
"loss": 0.5802,
"step": 7845
},
{
"epoch": 0.9887583839783355,
"grad_norm": 0.33104339241981506,
"learning_rate": 0.0002543699464310337,
"loss": 0.5882,
"step": 7850
},
{
"epoch": 0.9893881663885128,
"grad_norm": 0.3223660886287689,
"learning_rate": 0.00025429096793303186,
"loss": 0.5649,
"step": 7855
},
{
"epoch": 0.9900179487986901,
"grad_norm": 0.3436056077480316,
"learning_rate": 0.000254211933428398,
"loss": 0.5546,
"step": 7860
},
{
"epoch": 0.9906477312088673,
"grad_norm": 0.29697200655937195,
"learning_rate": 0.00025413284295957547,
"loss": 0.5434,
"step": 7865
},
{
"epoch": 0.9912775136190446,
"grad_norm": 0.32985180616378784,
"learning_rate": 0.0002540536965690376,
"loss": 0.5737,
"step": 7870
},
{
"epoch": 0.9919072960292219,
"grad_norm": 0.31599709391593933,
"learning_rate": 0.0002539744942992878,
"loss": 0.5452,
"step": 7875
},
{
"epoch": 0.9925370784393992,
"grad_norm": 0.30331170558929443,
"learning_rate": 0.00025389523619285956,
"loss": 0.5593,
"step": 7880
},
{
"epoch": 0.9931668608495765,
"grad_norm": 0.3150465786457062,
"learning_rate": 0.0002538159222923163,
"loss": 0.5518,
"step": 7885
},
{
"epoch": 0.9937966432597538,
"grad_norm": 0.3179359436035156,
"learning_rate": 0.00025373655264025134,
"loss": 0.5546,
"step": 7890
},
{
"epoch": 0.994426425669931,
"grad_norm": 0.3226470947265625,
"learning_rate": 0.000253657127279288,
"loss": 0.58,
"step": 7895
},
{
"epoch": 0.9950562080801083,
"grad_norm": 0.3453287184238434,
"learning_rate": 0.0002535776462520795,
"loss": 0.5681,
"step": 7900
},
{
"epoch": 0.9956859904902856,
"grad_norm": 0.3329002261161804,
"learning_rate": 0.0002534981096013091,
"loss": 0.548,
"step": 7905
},
{
"epoch": 0.9963157729004629,
"grad_norm": 0.32592061161994934,
"learning_rate": 0.00025341851736968956,
"loss": 0.5244,
"step": 7910
},
{
"epoch": 0.9969455553106402,
"grad_norm": 0.32833319902420044,
"learning_rate": 0.00025333886959996396,
"loss": 0.5558,
"step": 7915
},
{
"epoch": 0.9975753377208174,
"grad_norm": 0.3146878182888031,
"learning_rate": 0.00025325916633490487,
"loss": 0.595,
"step": 7920
},
{
"epoch": 0.9982051201309947,
"grad_norm": 0.3828830122947693,
"learning_rate": 0.00025317940761731476,
"loss": 0.5675,
"step": 7925
},
{
"epoch": 0.998834902541172,
"grad_norm": 0.3208398222923279,
"learning_rate": 0.0002530995934900259,
"loss": 0.5439,
"step": 7930
},
{
"epoch": 0.9994646849513493,
"grad_norm": 0.3446502983570099,
"learning_rate": 0.00025301972399590023,
"loss": 0.5276,
"step": 7935
},
{
"epoch": 1.0,
"grad_norm": 0.31275373697280884,
"learning_rate": 0.0002529397991778297,
"loss": 0.543,
"step": 7940
},
{
"epoch": 1.0006297824101773,
"grad_norm": 0.3219754099845886,
"learning_rate": 0.0002528598190787355,
"loss": 0.4901,
"step": 7945
},
{
"epoch": 1.0012595648203546,
"grad_norm": 0.33292412757873535,
"learning_rate": 0.0002527797837415689,
"loss": 0.4794,
"step": 7950
},
{
"epoch": 1.001889347230532,
"grad_norm": 0.36561062932014465,
"learning_rate": 0.00025269969320931065,
"loss": 0.4948,
"step": 7955
},
{
"epoch": 1.0025191296407092,
"grad_norm": 0.2977091372013092,
"learning_rate": 0.0002526195475249713,
"loss": 0.5172,
"step": 7960
},
{
"epoch": 1.0031489120508865,
"grad_norm": 0.3075500428676605,
"learning_rate": 0.00025253934673159084,
"loss": 0.4755,
"step": 7965
},
{
"epoch": 1.0037786944610636,
"grad_norm": 0.30956047773361206,
"learning_rate": 0.00025245909087223895,
"loss": 0.4783,
"step": 7970
},
{
"epoch": 1.004408476871241,
"grad_norm": 0.34965232014656067,
"learning_rate": 0.00025237877999001484,
"loss": 0.4876,
"step": 7975
},
{
"epoch": 1.0050382592814182,
"grad_norm": 0.3290039896965027,
"learning_rate": 0.00025229841412804726,
"loss": 0.501,
"step": 7980
},
{
"epoch": 1.0056680416915955,
"grad_norm": 0.3144761323928833,
"learning_rate": 0.00025221799332949456,
"loss": 0.4923,
"step": 7985
},
{
"epoch": 1.0062978241017728,
"grad_norm": 0.3586188554763794,
"learning_rate": 0.0002521375176375446,
"loss": 0.487,
"step": 7990
},
{
"epoch": 1.0069276065119501,
"grad_norm": 0.3210572302341461,
"learning_rate": 0.0002520569870954146,
"loss": 0.4916,
"step": 7995
},
{
"epoch": 1.0075573889221274,
"grad_norm": 0.3171830177307129,
"learning_rate": 0.0002519764017463512,
"loss": 0.4834,
"step": 8000
},
{
"epoch": 1.0075573889221274,
"eval_loss": 0.30723655223846436,
"eval_runtime": 6.2539,
"eval_samples_per_second": 159.899,
"eval_steps_per_second": 10.074,
"step": 8000
},
{
"epoch": 1.0081871713323047,
"grad_norm": 0.3511858880519867,
"learning_rate": 0.00025189576163363076,
"loss": 0.4937,
"step": 8005
},
{
"epoch": 1.008816953742482,
"grad_norm": 0.3305964171886444,
"learning_rate": 0.00025181506680055875,
"loss": 0.4665,
"step": 8010
},
{
"epoch": 1.0094467361526593,
"grad_norm": 0.3735099732875824,
"learning_rate": 0.00025173431729047014,
"loss": 0.5116,
"step": 8015
},
{
"epoch": 1.0100765185628366,
"grad_norm": 0.34169599413871765,
"learning_rate": 0.0002516535131467293,
"loss": 0.475,
"step": 8020
},
{
"epoch": 1.0107063009730137,
"grad_norm": 0.3473950922489166,
"learning_rate": 0.00025157265441272993,
"loss": 0.4812,
"step": 8025
},
{
"epoch": 1.011336083383191,
"grad_norm": 0.31877681612968445,
"learning_rate": 0.00025149174113189496,
"loss": 0.4906,
"step": 8030
},
{
"epoch": 1.0119658657933683,
"grad_norm": 0.364511638879776,
"learning_rate": 0.0002514107733476766,
"loss": 0.4926,
"step": 8035
},
{
"epoch": 1.0125956482035456,
"grad_norm": 0.3073696792125702,
"learning_rate": 0.00025132975110355664,
"loss": 0.4994,
"step": 8040
},
{
"epoch": 1.013225430613723,
"grad_norm": 0.3270637094974518,
"learning_rate": 0.0002512486744430456,
"loss": 0.468,
"step": 8045
},
{
"epoch": 1.0138552130239002,
"grad_norm": 0.3626968264579773,
"learning_rate": 0.0002511675434096837,
"loss": 0.5139,
"step": 8050
},
{
"epoch": 1.0144849954340776,
"grad_norm": 0.30527931451797485,
"learning_rate": 0.00025108635804704,
"loss": 0.4922,
"step": 8055
},
{
"epoch": 1.0151147778442549,
"grad_norm": 0.3518252968788147,
"learning_rate": 0.000251005118398713,
"loss": 0.5297,
"step": 8060
},
{
"epoch": 1.0157445602544322,
"grad_norm": 0.3298850655555725,
"learning_rate": 0.0002509238245083302,
"loss": 0.5292,
"step": 8065
},
{
"epoch": 1.0163743426646095,
"grad_norm": 0.3175168037414551,
"learning_rate": 0.0002508424764195484,
"loss": 0.4907,
"step": 8070
},
{
"epoch": 1.0170041250747868,
"grad_norm": 0.33489352464675903,
"learning_rate": 0.0002507610741760531,
"loss": 0.4869,
"step": 8075
},
{
"epoch": 1.0176339074849639,
"grad_norm": 0.2922315299510956,
"learning_rate": 0.0002506796178215595,
"loss": 0.474,
"step": 8080
},
{
"epoch": 1.0182636898951412,
"grad_norm": 0.32073619961738586,
"learning_rate": 0.00025059810739981125,
"loss": 0.4951,
"step": 8085
},
{
"epoch": 1.0188934723053185,
"grad_norm": 0.2875652611255646,
"learning_rate": 0.0002505165429545815,
"loss": 0.5104,
"step": 8090
},
{
"epoch": 1.0195232547154958,
"grad_norm": 0.33247148990631104,
"learning_rate": 0.0002504349245296721,
"loss": 0.489,
"step": 8095
},
{
"epoch": 1.020153037125673,
"grad_norm": 0.29777953028678894,
"learning_rate": 0.0002503532521689141,
"loss": 0.5172,
"step": 8100
},
{
"epoch": 1.0207828195358504,
"grad_norm": 0.3418375253677368,
"learning_rate": 0.0002502715259161673,
"loss": 0.4464,
"step": 8105
},
{
"epoch": 1.0214126019460277,
"grad_norm": 0.39162155985832214,
"learning_rate": 0.0002501897458153207,
"loss": 0.4953,
"step": 8110
},
{
"epoch": 1.022042384356205,
"grad_norm": 0.32206737995147705,
"learning_rate": 0.000250107911910292,
"loss": 0.4732,
"step": 8115
},
{
"epoch": 1.0226721667663823,
"grad_norm": 0.37178757786750793,
"learning_rate": 0.0002500260242450279,
"loss": 0.504,
"step": 8120
},
{
"epoch": 1.0233019491765596,
"grad_norm": 0.33448055386543274,
"learning_rate": 0.0002499440828635039,
"loss": 0.4774,
"step": 8125
},
{
"epoch": 1.0239317315867367,
"grad_norm": 0.344594806432724,
"learning_rate": 0.00024986208780972455,
"loss": 0.4948,
"step": 8130
},
{
"epoch": 1.024561513996914,
"grad_norm": 0.3440978527069092,
"learning_rate": 0.00024978003912772283,
"loss": 0.4979,
"step": 8135
},
{
"epoch": 1.0251912964070913,
"grad_norm": 0.2915257513523102,
"learning_rate": 0.000249697936861561,
"loss": 0.4875,
"step": 8140
},
{
"epoch": 1.0258210788172686,
"grad_norm": 0.271371990442276,
"learning_rate": 0.0002496157810553296,
"loss": 0.4929,
"step": 8145
},
{
"epoch": 1.0264508612274459,
"grad_norm": 0.3228522539138794,
"learning_rate": 0.0002495335717531484,
"loss": 0.4706,
"step": 8150
},
{
"epoch": 1.0270806436376232,
"grad_norm": 0.3222556412220001,
"learning_rate": 0.00024945130899916554,
"loss": 0.487,
"step": 8155
},
{
"epoch": 1.0277104260478005,
"grad_norm": 0.32311001420021057,
"learning_rate": 0.00024936899283755807,
"loss": 0.5144,
"step": 8160
},
{
"epoch": 1.0283402084579778,
"grad_norm": 0.2946212589740753,
"learning_rate": 0.0002492866233125316,
"loss": 0.4867,
"step": 8165
},
{
"epoch": 1.028969990868155,
"grad_norm": 0.32464465498924255,
"learning_rate": 0.0002492042004683205,
"loss": 0.4729,
"step": 8170
},
{
"epoch": 1.0295997732783324,
"grad_norm": 0.3378526270389557,
"learning_rate": 0.0002491217243491876,
"loss": 0.4843,
"step": 8175
},
{
"epoch": 1.0302295556885097,
"grad_norm": 0.35685908794403076,
"learning_rate": 0.0002490391949994246,
"loss": 0.4941,
"step": 8180
},
{
"epoch": 1.0308593380986868,
"grad_norm": 0.30618053674697876,
"learning_rate": 0.0002489566124633516,
"loss": 0.4985,
"step": 8185
},
{
"epoch": 1.031489120508864,
"grad_norm": 0.34786808490753174,
"learning_rate": 0.0002488739767853173,
"loss": 0.4914,
"step": 8190
},
{
"epoch": 1.0321189029190414,
"grad_norm": 0.35167476534843445,
"learning_rate": 0.00024879128800969893,
"loss": 0.493,
"step": 8195
},
{
"epoch": 1.0327486853292187,
"grad_norm": 0.3278263509273529,
"learning_rate": 0.00024870854618090225,
"loss": 0.4676,
"step": 8200
},
{
"epoch": 1.033378467739396,
"grad_norm": 0.36896881461143494,
"learning_rate": 0.00024862575134336154,
"loss": 0.4995,
"step": 8205
},
{
"epoch": 1.0340082501495733,
"grad_norm": 0.3700760304927826,
"learning_rate": 0.00024854290354153953,
"loss": 0.5189,
"step": 8210
},
{
"epoch": 1.0346380325597506,
"grad_norm": 0.3370974063873291,
"learning_rate": 0.00024846000281992733,
"loss": 0.5044,
"step": 8215
},
{
"epoch": 1.035267814969928,
"grad_norm": 0.3200768232345581,
"learning_rate": 0.00024837704922304457,
"loss": 0.4779,
"step": 8220
},
{
"epoch": 1.0358975973801052,
"grad_norm": 0.2786978781223297,
"learning_rate": 0.0002482940427954392,
"loss": 0.4677,
"step": 8225
},
{
"epoch": 1.0365273797902825,
"grad_norm": 0.3220120668411255,
"learning_rate": 0.00024821098358168757,
"loss": 0.4503,
"step": 8230
},
{
"epoch": 1.0371571622004598,
"grad_norm": 0.3315715491771698,
"learning_rate": 0.00024812787162639444,
"loss": 0.4715,
"step": 8235
},
{
"epoch": 1.037786944610637,
"grad_norm": 0.3595867455005646,
"learning_rate": 0.00024804470697419273,
"loss": 0.4712,
"step": 8240
},
{
"epoch": 1.0384167270208142,
"grad_norm": 0.29993361234664917,
"learning_rate": 0.00024796148966974376,
"loss": 0.47,
"step": 8245
},
{
"epoch": 1.0390465094309915,
"grad_norm": 0.39950379729270935,
"learning_rate": 0.00024787821975773717,
"loss": 0.5233,
"step": 8250
},
{
"epoch": 1.0396762918411688,
"grad_norm": 0.312003493309021,
"learning_rate": 0.0002477948972828908,
"loss": 0.4836,
"step": 8255
},
{
"epoch": 1.0403060742513461,
"grad_norm": 0.29678481817245483,
"learning_rate": 0.0002477115222899507,
"loss": 0.4928,
"step": 8260
},
{
"epoch": 1.0409358566615234,
"grad_norm": 0.35694456100463867,
"learning_rate": 0.0002476280948236912,
"loss": 0.4925,
"step": 8265
},
{
"epoch": 1.0415656390717007,
"grad_norm": 0.3164297640323639,
"learning_rate": 0.00024754461492891474,
"loss": 0.4828,
"step": 8270
},
{
"epoch": 1.042195421481878,
"grad_norm": 0.37906938791275024,
"learning_rate": 0.00024746108265045184,
"loss": 0.4989,
"step": 8275
},
{
"epoch": 1.0428252038920554,
"grad_norm": 0.3458475172519684,
"learning_rate": 0.0002473774980331614,
"loss": 0.5072,
"step": 8280
},
{
"epoch": 1.0434549863022327,
"grad_norm": 0.36052700877189636,
"learning_rate": 0.0002472938611219301,
"loss": 0.4872,
"step": 8285
},
{
"epoch": 1.04408476871241,
"grad_norm": 0.4497036337852478,
"learning_rate": 0.00024721017196167297,
"loss": 0.4921,
"step": 8290
},
{
"epoch": 1.044714551122587,
"grad_norm": 0.357461154460907,
"learning_rate": 0.000247126430597333,
"loss": 0.5035,
"step": 8295
},
{
"epoch": 1.0453443335327643,
"grad_norm": 0.3499346375465393,
"learning_rate": 0.00024704263707388117,
"loss": 0.5242,
"step": 8300
},
{
"epoch": 1.0459741159429417,
"grad_norm": 0.2994784414768219,
"learning_rate": 0.0002469587914363166,
"loss": 0.4575,
"step": 8305
},
{
"epoch": 1.046603898353119,
"grad_norm": 0.3699876666069031,
"learning_rate": 0.0002468748937296662,
"loss": 0.4804,
"step": 8310
},
{
"epoch": 1.0472336807632963,
"grad_norm": 0.37695133686065674,
"learning_rate": 0.000246790943998985,
"loss": 0.4914,
"step": 8315
},
{
"epoch": 1.0478634631734736,
"grad_norm": 0.30732589960098267,
"learning_rate": 0.0002467069422893559,
"loss": 0.458,
"step": 8320
},
{
"epoch": 1.0484932455836509,
"grad_norm": 0.3094361424446106,
"learning_rate": 0.0002466228886458899,
"loss": 0.4584,
"step": 8325
},
{
"epoch": 1.0491230279938282,
"grad_norm": 0.3499257564544678,
"learning_rate": 0.0002465387831137255,
"loss": 0.4717,
"step": 8330
},
{
"epoch": 1.0497528104040055,
"grad_norm": 0.32755059003829956,
"learning_rate": 0.0002464546257380294,
"loss": 0.49,
"step": 8335
},
{
"epoch": 1.0503825928141828,
"grad_norm": 0.3201046884059906,
"learning_rate": 0.000246370416563996,
"loss": 0.4833,
"step": 8340
},
{
"epoch": 1.05101237522436,
"grad_norm": 0.2581581771373749,
"learning_rate": 0.0002462861556368476,
"loss": 0.465,
"step": 8345
},
{
"epoch": 1.0516421576345372,
"grad_norm": 0.3480297923088074,
"learning_rate": 0.00024620184300183423,
"loss": 0.4756,
"step": 8350
},
{
"epoch": 1.0522719400447145,
"grad_norm": 0.36630478501319885,
"learning_rate": 0.00024611747870423366,
"loss": 0.5051,
"step": 8355
},
{
"epoch": 1.0529017224548918,
"grad_norm": 0.3450157940387726,
"learning_rate": 0.0002460330627893515,
"loss": 0.4996,
"step": 8360
},
{
"epoch": 1.053531504865069,
"grad_norm": 0.30790945887565613,
"learning_rate": 0.000245948595302521,
"loss": 0.4826,
"step": 8365
},
{
"epoch": 1.0541612872752464,
"grad_norm": 0.39590683579444885,
"learning_rate": 0.00024586407628910306,
"loss": 0.4963,
"step": 8370
},
{
"epoch": 1.0547910696854237,
"grad_norm": 0.3294634521007538,
"learning_rate": 0.00024577950579448643,
"loss": 0.4868,
"step": 8375
},
{
"epoch": 1.055420852095601,
"grad_norm": 0.33493947982788086,
"learning_rate": 0.00024569488386408736,
"loss": 0.4773,
"step": 8380
},
{
"epoch": 1.0560506345057783,
"grad_norm": 0.32626229524612427,
"learning_rate": 0.00024561021054334974,
"loss": 0.4898,
"step": 8385
},
{
"epoch": 1.0566804169159556,
"grad_norm": 0.3181340992450714,
"learning_rate": 0.00024552548587774507,
"loss": 0.4757,
"step": 8390
},
{
"epoch": 1.057310199326133,
"grad_norm": 0.3592873215675354,
"learning_rate": 0.0002454407099127725,
"loss": 0.5034,
"step": 8395
},
{
"epoch": 1.0579399817363102,
"grad_norm": 0.3184007406234741,
"learning_rate": 0.00024535588269395856,
"loss": 0.4929,
"step": 8400
},
{
"epoch": 1.0585697641464873,
"grad_norm": 0.3555738627910614,
"learning_rate": 0.00024527100426685746,
"loss": 0.503,
"step": 8405
},
{
"epoch": 1.0591995465566646,
"grad_norm": 0.29637908935546875,
"learning_rate": 0.0002451860746770509,
"loss": 0.4716,
"step": 8410
},
{
"epoch": 1.059829328966842,
"grad_norm": 0.3031441569328308,
"learning_rate": 0.0002451010939701479,
"loss": 0.4757,
"step": 8415
},
{
"epoch": 1.0604591113770192,
"grad_norm": 0.28256094455718994,
"learning_rate": 0.0002450160621917851,
"loss": 0.4558,
"step": 8420
},
{
"epoch": 1.0610888937871965,
"grad_norm": 0.3192931115627289,
"learning_rate": 0.0002449309793876266,
"loss": 0.499,
"step": 8425
},
{
"epoch": 1.0617186761973738,
"grad_norm": 0.2788430154323578,
"learning_rate": 0.00024484584560336363,
"loss": 0.4616,
"step": 8430
},
{
"epoch": 1.0623484586075511,
"grad_norm": 0.35733649134635925,
"learning_rate": 0.00024476066088471507,
"loss": 0.4926,
"step": 8435
},
{
"epoch": 1.0629782410177284,
"grad_norm": 0.3398718535900116,
"learning_rate": 0.00024467542527742707,
"loss": 0.4944,
"step": 8440
},
{
"epoch": 1.0636080234279057,
"grad_norm": 0.3338175117969513,
"learning_rate": 0.000244590138827273,
"loss": 0.5181,
"step": 8445
},
{
"epoch": 1.064237805838083,
"grad_norm": 0.33433952927589417,
"learning_rate": 0.00024450480158005384,
"loss": 0.4837,
"step": 8450
},
{
"epoch": 1.0648675882482603,
"grad_norm": 0.3656097948551178,
"learning_rate": 0.0002444194135815974,
"loss": 0.4639,
"step": 8455
},
{
"epoch": 1.0654973706584374,
"grad_norm": 0.31470635533332825,
"learning_rate": 0.0002443339748777592,
"loss": 0.4718,
"step": 8460
},
{
"epoch": 1.0661271530686147,
"grad_norm": 0.29020166397094727,
"learning_rate": 0.00024424848551442166,
"loss": 0.4712,
"step": 8465
},
{
"epoch": 1.066756935478792,
"grad_norm": 0.34259042143821716,
"learning_rate": 0.00024416294553749446,
"loss": 0.5252,
"step": 8470
},
{
"epoch": 1.0673867178889693,
"grad_norm": 0.33828607201576233,
"learning_rate": 0.0002440773549929146,
"loss": 0.4663,
"step": 8475
},
{
"epoch": 1.0680165002991466,
"grad_norm": 0.35722973942756653,
"learning_rate": 0.00024399171392664622,
"loss": 0.4868,
"step": 8480
},
{
"epoch": 1.068646282709324,
"grad_norm": 0.3226557970046997,
"learning_rate": 0.00024390602238468043,
"loss": 0.4785,
"step": 8485
},
{
"epoch": 1.0692760651195012,
"grad_norm": 0.3097434639930725,
"learning_rate": 0.0002438202804130356,
"loss": 0.4677,
"step": 8490
},
{
"epoch": 1.0699058475296785,
"grad_norm": 0.3146856725215912,
"learning_rate": 0.00024373448805775709,
"loss": 0.4802,
"step": 8495
},
{
"epoch": 1.0705356299398558,
"grad_norm": 0.3576582372188568,
"learning_rate": 0.00024364864536491739,
"loss": 0.5113,
"step": 8500
},
{
"epoch": 1.0711654123500332,
"grad_norm": 0.33004313707351685,
"learning_rate": 0.0002435627523806159,
"loss": 0.4625,
"step": 8505
},
{
"epoch": 1.0717951947602105,
"grad_norm": 0.3689037263393402,
"learning_rate": 0.00024347680915097928,
"loss": 0.4923,
"step": 8510
},
{
"epoch": 1.0724249771703875,
"grad_norm": 0.28334125876426697,
"learning_rate": 0.00024339081572216084,
"loss": 0.4818,
"step": 8515
},
{
"epoch": 1.0730547595805648,
"grad_norm": 0.3461993336677551,
"learning_rate": 0.00024330477214034113,
"loss": 0.4807,
"step": 8520
},
{
"epoch": 1.0736845419907421,
"grad_norm": 0.32148951292037964,
"learning_rate": 0.00024321867845172743,
"loss": 0.4829,
"step": 8525
},
{
"epoch": 1.0743143244009195,
"grad_norm": 0.31461793184280396,
"learning_rate": 0.0002431325347025541,
"loss": 0.5045,
"step": 8530
},
{
"epoch": 1.0749441068110968,
"grad_norm": 0.30194273591041565,
"learning_rate": 0.00024304634093908224,
"loss": 0.4747,
"step": 8535
},
{
"epoch": 1.075573889221274,
"grad_norm": 0.27379968762397766,
"learning_rate": 0.0002429600972075999,
"loss": 0.4382,
"step": 8540
},
{
"epoch": 1.0762036716314514,
"grad_norm": 0.3732368052005768,
"learning_rate": 0.0002428738035544219,
"loss": 0.4704,
"step": 8545
},
{
"epoch": 1.0768334540416287,
"grad_norm": 0.3252260088920593,
"learning_rate": 0.00024278746002588997,
"loss": 0.4929,
"step": 8550
},
{
"epoch": 1.077463236451806,
"grad_norm": 0.31606802344322205,
"learning_rate": 0.00024270106666837246,
"loss": 0.4698,
"step": 8555
},
{
"epoch": 1.0780930188619833,
"grad_norm": 0.320529967546463,
"learning_rate": 0.00024261462352826468,
"loss": 0.4531,
"step": 8560
},
{
"epoch": 1.0787228012721606,
"grad_norm": 0.36827871203422546,
"learning_rate": 0.00024252813065198852,
"loss": 0.4948,
"step": 8565
},
{
"epoch": 1.0793525836823377,
"grad_norm": 0.3132867217063904,
"learning_rate": 0.00024244158808599264,
"loss": 0.4836,
"step": 8570
},
{
"epoch": 1.079982366092515,
"grad_norm": 0.32383888959884644,
"learning_rate": 0.00024235499587675236,
"loss": 0.4749,
"step": 8575
},
{
"epoch": 1.0806121485026923,
"grad_norm": 0.32294297218322754,
"learning_rate": 0.0002422683540707697,
"loss": 0.4616,
"step": 8580
},
{
"epoch": 1.0812419309128696,
"grad_norm": 0.3049245774745941,
"learning_rate": 0.00024218166271457322,
"loss": 0.4871,
"step": 8585
},
{
"epoch": 1.0818717133230469,
"grad_norm": 0.3330252170562744,
"learning_rate": 0.00024209492185471826,
"loss": 0.4908,
"step": 8590
},
{
"epoch": 1.0825014957332242,
"grad_norm": 0.35933157801628113,
"learning_rate": 0.00024200813153778654,
"loss": 0.4936,
"step": 8595
},
{
"epoch": 1.0831312781434015,
"grad_norm": 0.345434308052063,
"learning_rate": 0.00024192129181038654,
"loss": 0.4637,
"step": 8600
},
{
"epoch": 1.0837610605535788,
"grad_norm": 0.3012515604496002,
"learning_rate": 0.0002418344027191531,
"loss": 0.4719,
"step": 8605
},
{
"epoch": 1.084390842963756,
"grad_norm": 0.3081362247467041,
"learning_rate": 0.0002417474643107477,
"loss": 0.4852,
"step": 8610
},
{
"epoch": 1.0850206253739334,
"grad_norm": 0.367389053106308,
"learning_rate": 0.00024166047663185826,
"loss": 0.5046,
"step": 8615
},
{
"epoch": 1.0856504077841107,
"grad_norm": 0.3392958641052246,
"learning_rate": 0.0002415734397291991,
"loss": 0.5087,
"step": 8620
},
{
"epoch": 1.0862801901942878,
"grad_norm": 0.2843685746192932,
"learning_rate": 0.000241486353649511,
"loss": 0.4722,
"step": 8625
},
{
"epoch": 1.086909972604465,
"grad_norm": 0.29619672894477844,
"learning_rate": 0.00024139921843956128,
"loss": 0.4645,
"step": 8630
},
{
"epoch": 1.0875397550146424,
"grad_norm": 0.30029621720314026,
"learning_rate": 0.00024131203414614347,
"loss": 0.4434,
"step": 8635
},
{
"epoch": 1.0881695374248197,
"grad_norm": 0.3630850911140442,
"learning_rate": 0.00024122480081607755,
"loss": 0.4772,
"step": 8640
},
{
"epoch": 1.088799319834997,
"grad_norm": 0.32482001185417175,
"learning_rate": 0.00024113751849620974,
"loss": 0.4441,
"step": 8645
},
{
"epoch": 1.0894291022451743,
"grad_norm": 0.3149590492248535,
"learning_rate": 0.00024105018723341275,
"loss": 0.468,
"step": 8650
},
{
"epoch": 1.0900588846553516,
"grad_norm": 0.34652113914489746,
"learning_rate": 0.0002409628070745854,
"loss": 0.4706,
"step": 8655
},
{
"epoch": 1.090688667065529,
"grad_norm": 0.31633374094963074,
"learning_rate": 0.00024087537806665279,
"loss": 0.4693,
"step": 8660
},
{
"epoch": 1.0913184494757062,
"grad_norm": 0.31668806076049805,
"learning_rate": 0.00024078790025656638,
"loss": 0.4619,
"step": 8665
},
{
"epoch": 1.0919482318858835,
"grad_norm": 0.3093356490135193,
"learning_rate": 0.00024070037369130375,
"loss": 0.485,
"step": 8670
},
{
"epoch": 1.0925780142960608,
"grad_norm": 0.31765609979629517,
"learning_rate": 0.0002406127984178686,
"loss": 0.4696,
"step": 8675
},
{
"epoch": 1.093207796706238,
"grad_norm": 0.35910454392433167,
"learning_rate": 0.00024052517448329086,
"loss": 0.4781,
"step": 8680
},
{
"epoch": 1.0938375791164152,
"grad_norm": 0.37290528416633606,
"learning_rate": 0.00024043750193462665,
"loss": 0.4824,
"step": 8685
},
{
"epoch": 1.0944673615265925,
"grad_norm": 0.3106020390987396,
"learning_rate": 0.00024034978081895807,
"loss": 0.4607,
"step": 8690
},
{
"epoch": 1.0950971439367698,
"grad_norm": 0.3306252062320709,
"learning_rate": 0.0002402620111833934,
"loss": 0.4725,
"step": 8695
},
{
"epoch": 1.0957269263469471,
"grad_norm": 0.2956124544143677,
"learning_rate": 0.00024017419307506687,
"loss": 0.4784,
"step": 8700
},
{
"epoch": 1.0963567087571244,
"grad_norm": 0.3285719156265259,
"learning_rate": 0.00024008632654113894,
"loss": 0.4856,
"step": 8705
},
{
"epoch": 1.0969864911673017,
"grad_norm": 0.3430241644382477,
"learning_rate": 0.00023999841162879583,
"loss": 0.5017,
"step": 8710
},
{
"epoch": 1.097616273577479,
"grad_norm": 0.33543142676353455,
"learning_rate": 0.00023991044838524985,
"loss": 0.516,
"step": 8715
},
{
"epoch": 1.0982460559876563,
"grad_norm": 0.28755661845207214,
"learning_rate": 0.0002398224368577394,
"loss": 0.4645,
"step": 8720
},
{
"epoch": 1.0988758383978336,
"grad_norm": 0.34112608432769775,
"learning_rate": 0.00023973437709352851,
"loss": 0.5134,
"step": 8725
},
{
"epoch": 1.0995056208080107,
"grad_norm": 0.3198321759700775,
"learning_rate": 0.00023964626913990743,
"loss": 0.4939,
"step": 8730
},
{
"epoch": 1.100135403218188,
"grad_norm": 0.2985571026802063,
"learning_rate": 0.00023955811304419205,
"loss": 0.4817,
"step": 8735
},
{
"epoch": 1.1007651856283653,
"grad_norm": 0.32038047909736633,
"learning_rate": 0.0002394699088537243,
"loss": 0.524,
"step": 8740
},
{
"epoch": 1.1013949680385426,
"grad_norm": 0.3562256395816803,
"learning_rate": 0.00023938165661587175,
"loss": 0.4779,
"step": 8745
},
{
"epoch": 1.10202475044872,
"grad_norm": 0.3481481969356537,
"learning_rate": 0.00023929335637802788,
"loss": 0.4861,
"step": 8750
},
{
"epoch": 1.1026545328588973,
"grad_norm": 0.3087615966796875,
"learning_rate": 0.00023920500818761198,
"loss": 0.473,
"step": 8755
},
{
"epoch": 1.1032843152690746,
"grad_norm": 0.34575629234313965,
"learning_rate": 0.00023911661209206903,
"loss": 0.4709,
"step": 8760
},
{
"epoch": 1.1039140976792519,
"grad_norm": 0.3505946099758148,
"learning_rate": 0.0002390281681388697,
"loss": 0.4766,
"step": 8765
},
{
"epoch": 1.1045438800894292,
"grad_norm": 0.40102317929267883,
"learning_rate": 0.0002389396763755105,
"loss": 0.5048,
"step": 8770
},
{
"epoch": 1.1051736624996065,
"grad_norm": 0.3319726884365082,
"learning_rate": 0.0002388511368495135,
"loss": 0.4768,
"step": 8775
},
{
"epoch": 1.1058034449097838,
"grad_norm": 0.3191297948360443,
"learning_rate": 0.00023876254960842645,
"loss": 0.5009,
"step": 8780
},
{
"epoch": 1.1064332273199609,
"grad_norm": 0.3122735619544983,
"learning_rate": 0.00023867391469982268,
"loss": 0.4777,
"step": 8785
},
{
"epoch": 1.1070630097301382,
"grad_norm": 0.33340710401535034,
"learning_rate": 0.0002385852321713012,
"loss": 0.459,
"step": 8790
},
{
"epoch": 1.1076927921403155,
"grad_norm": 0.32803764939308167,
"learning_rate": 0.00023849650207048655,
"loss": 0.4784,
"step": 8795
},
{
"epoch": 1.1083225745504928,
"grad_norm": 0.35463786125183105,
"learning_rate": 0.00023840772444502878,
"loss": 0.4739,
"step": 8800
},
{
"epoch": 1.10895235696067,
"grad_norm": 0.3237099349498749,
"learning_rate": 0.00023831889934260357,
"loss": 0.4652,
"step": 8805
},
{
"epoch": 1.1095821393708474,
"grad_norm": 0.34681713581085205,
"learning_rate": 0.000238230026810912,
"loss": 0.4872,
"step": 8810
},
{
"epoch": 1.1102119217810247,
"grad_norm": 0.3360891342163086,
"learning_rate": 0.00023814110689768066,
"loss": 0.496,
"step": 8815
},
{
"epoch": 1.110841704191202,
"grad_norm": 0.32971322536468506,
"learning_rate": 0.0002380521396506615,
"loss": 0.4468,
"step": 8820
},
{
"epoch": 1.1114714866013793,
"grad_norm": 0.3112764060497284,
"learning_rate": 0.00023796312511763205,
"loss": 0.4985,
"step": 8825
},
{
"epoch": 1.1121012690115566,
"grad_norm": 0.30539095401763916,
"learning_rate": 0.0002378740633463951,
"loss": 0.4835,
"step": 8830
},
{
"epoch": 1.1127310514217337,
"grad_norm": 0.274139940738678,
"learning_rate": 0.00023778495438477894,
"loss": 0.5014,
"step": 8835
},
{
"epoch": 1.113360833831911,
"grad_norm": 0.2877870500087738,
"learning_rate": 0.000237695798280637,
"loss": 0.4842,
"step": 8840
},
{
"epoch": 1.1139906162420883,
"grad_norm": 0.262893944978714,
"learning_rate": 0.00023760659508184823,
"loss": 0.4754,
"step": 8845
},
{
"epoch": 1.1146203986522656,
"grad_norm": 0.3255792260169983,
"learning_rate": 0.00023751734483631672,
"loss": 0.489,
"step": 8850
},
{
"epoch": 1.115250181062443,
"grad_norm": 0.3453415632247925,
"learning_rate": 0.00023742804759197195,
"loss": 0.4624,
"step": 8855
},
{
"epoch": 1.1158799634726202,
"grad_norm": 0.3276025354862213,
"learning_rate": 0.00023733870339676856,
"loss": 0.4629,
"step": 8860
},
{
"epoch": 1.1165097458827975,
"grad_norm": 0.32096150517463684,
"learning_rate": 0.0002372493122986864,
"loss": 0.4482,
"step": 8865
},
{
"epoch": 1.1171395282929748,
"grad_norm": 0.33016180992126465,
"learning_rate": 0.00023715987434573055,
"loss": 0.493,
"step": 8870
},
{
"epoch": 1.117769310703152,
"grad_norm": 0.2946653366088867,
"learning_rate": 0.00023707038958593126,
"loss": 0.4365,
"step": 8875
},
{
"epoch": 1.1183990931133294,
"grad_norm": 0.37148308753967285,
"learning_rate": 0.00023698085806734385,
"loss": 0.4974,
"step": 8880
},
{
"epoch": 1.1190288755235067,
"grad_norm": 0.3068748116493225,
"learning_rate": 0.00023689127983804882,
"loss": 0.4886,
"step": 8885
},
{
"epoch": 1.1196586579336838,
"grad_norm": 0.3096564710140228,
"learning_rate": 0.00023680165494615167,
"loss": 0.4592,
"step": 8890
},
{
"epoch": 1.120288440343861,
"grad_norm": 0.3341507613658905,
"learning_rate": 0.00023671198343978308,
"loss": 0.4258,
"step": 8895
},
{
"epoch": 1.1209182227540384,
"grad_norm": 0.30653128027915955,
"learning_rate": 0.00023662226536709868,
"loss": 0.486,
"step": 8900
},
{
"epoch": 1.1215480051642157,
"grad_norm": 0.30991849303245544,
"learning_rate": 0.00023653250077627908,
"loss": 0.4879,
"step": 8905
},
{
"epoch": 1.122177787574393,
"grad_norm": 0.3082162141799927,
"learning_rate": 0.00023644268971552998,
"loss": 0.4538,
"step": 8910
},
{
"epoch": 1.1228075699845703,
"grad_norm": 0.30248114466667175,
"learning_rate": 0.00023635283223308193,
"loss": 0.4501,
"step": 8915
},
{
"epoch": 1.1234373523947476,
"grad_norm": 0.34090158343315125,
"learning_rate": 0.00023626292837719047,
"loss": 0.4825,
"step": 8920
},
{
"epoch": 1.124067134804925,
"grad_norm": 0.28670960664749146,
"learning_rate": 0.00023617297819613598,
"loss": 0.4422,
"step": 8925
},
{
"epoch": 1.1246969172151022,
"grad_norm": 0.37079116702079773,
"learning_rate": 0.0002360829817382239,
"loss": 0.4725,
"step": 8930
},
{
"epoch": 1.1253266996252795,
"grad_norm": 0.35876086354255676,
"learning_rate": 0.00023599293905178417,
"loss": 0.4672,
"step": 8935
},
{
"epoch": 1.1259564820354568,
"grad_norm": 0.28581666946411133,
"learning_rate": 0.00023590285018517196,
"loss": 0.4597,
"step": 8940
},
{
"epoch": 1.126586264445634,
"grad_norm": 0.34076693654060364,
"learning_rate": 0.00023581271518676694,
"loss": 0.4894,
"step": 8945
},
{
"epoch": 1.1272160468558112,
"grad_norm": 0.29919254779815674,
"learning_rate": 0.0002357225341049737,
"loss": 0.4538,
"step": 8950
},
{
"epoch": 1.1278458292659885,
"grad_norm": 0.2799806594848633,
"learning_rate": 0.00023563230698822154,
"loss": 0.4814,
"step": 8955
},
{
"epoch": 1.1284756116761658,
"grad_norm": 0.3249780833721161,
"learning_rate": 0.00023554203388496446,
"loss": 0.4825,
"step": 8960
},
{
"epoch": 1.1291053940863431,
"grad_norm": 0.3509981036186218,
"learning_rate": 0.0002354517148436812,
"loss": 0.4468,
"step": 8965
},
{
"epoch": 1.1297351764965204,
"grad_norm": 0.33016157150268555,
"learning_rate": 0.0002353613499128752,
"loss": 0.449,
"step": 8970
},
{
"epoch": 1.1303649589066977,
"grad_norm": 0.2889571487903595,
"learning_rate": 0.00023527093914107436,
"loss": 0.4584,
"step": 8975
},
{
"epoch": 1.130994741316875,
"grad_norm": 0.31957536935806274,
"learning_rate": 0.00023518048257683145,
"loss": 0.4807,
"step": 8980
},
{
"epoch": 1.1316245237270524,
"grad_norm": 0.31418105959892273,
"learning_rate": 0.00023508998026872365,
"loss": 0.4755,
"step": 8985
},
{
"epoch": 1.1322543061372297,
"grad_norm": 0.3458874523639679,
"learning_rate": 0.00023499943226535278,
"loss": 0.4906,
"step": 8990
},
{
"epoch": 1.132884088547407,
"grad_norm": 0.3091862201690674,
"learning_rate": 0.0002349088386153452,
"loss": 0.4786,
"step": 8995
},
{
"epoch": 1.133513870957584,
"grad_norm": 0.2758231461048126,
"learning_rate": 0.00023481819936735178,
"loss": 0.4189,
"step": 9000
},
{
"epoch": 1.133513870957584,
"eval_loss": 0.3038506805896759,
"eval_runtime": 6.258,
"eval_samples_per_second": 159.795,
"eval_steps_per_second": 10.067,
"step": 9000
},
{
"epoch": 1.1341436533677614,
"grad_norm": 0.3153883218765259,
"learning_rate": 0.00023472751457004782,
"loss": 0.4802,
"step": 9005
},
{
"epoch": 1.1347734357779387,
"grad_norm": 0.3110881745815277,
"learning_rate": 0.00023463678427213317,
"loss": 0.4488,
"step": 9010
},
{
"epoch": 1.135403218188116,
"grad_norm": 0.30957111716270447,
"learning_rate": 0.00023454600852233206,
"loss": 0.476,
"step": 9015
},
{
"epoch": 1.1360330005982933,
"grad_norm": 0.3130200207233429,
"learning_rate": 0.00023445518736939312,
"loss": 0.4396,
"step": 9020
},
{
"epoch": 1.1366627830084706,
"grad_norm": 0.31500178575515747,
"learning_rate": 0.0002343643208620894,
"loss": 0.4644,
"step": 9025
},
{
"epoch": 1.1372925654186479,
"grad_norm": 0.3096972703933716,
"learning_rate": 0.00023427340904921834,
"loss": 0.4775,
"step": 9030
},
{
"epoch": 1.1379223478288252,
"grad_norm": 0.3503490388393402,
"learning_rate": 0.00023418245197960155,
"loss": 0.4617,
"step": 9035
},
{
"epoch": 1.1385521302390025,
"grad_norm": 0.31281721591949463,
"learning_rate": 0.00023409144970208516,
"loss": 0.4703,
"step": 9040
},
{
"epoch": 1.1391819126491798,
"grad_norm": 0.3011356592178345,
"learning_rate": 0.0002340004022655394,
"loss": 0.4472,
"step": 9045
},
{
"epoch": 1.139811695059357,
"grad_norm": 0.3240005075931549,
"learning_rate": 0.00023390930971885888,
"loss": 0.4726,
"step": 9050
},
{
"epoch": 1.1404414774695342,
"grad_norm": 0.35690784454345703,
"learning_rate": 0.0002338181721109623,
"loss": 0.4601,
"step": 9055
},
{
"epoch": 1.1410712598797115,
"grad_norm": 0.30888888239860535,
"learning_rate": 0.0002337269894907927,
"loss": 0.45,
"step": 9060
},
{
"epoch": 1.1417010422898888,
"grad_norm": 0.3118223249912262,
"learning_rate": 0.00023363576190731726,
"loss": 0.4456,
"step": 9065
},
{
"epoch": 1.142330824700066,
"grad_norm": 0.3156544864177704,
"learning_rate": 0.0002335444894095272,
"loss": 0.4744,
"step": 9070
},
{
"epoch": 1.1429606071102434,
"grad_norm": 0.33679795265197754,
"learning_rate": 0.00023345317204643797,
"loss": 0.4662,
"step": 9075
},
{
"epoch": 1.1435903895204207,
"grad_norm": 0.32647955417633057,
"learning_rate": 0.00023336180986708904,
"loss": 0.4573,
"step": 9080
},
{
"epoch": 1.144220171930598,
"grad_norm": 0.3759111762046814,
"learning_rate": 0.00023327040292054412,
"loss": 0.4439,
"step": 9085
},
{
"epoch": 1.1448499543407753,
"grad_norm": 0.31271886825561523,
"learning_rate": 0.00023317895125589066,
"loss": 0.4778,
"step": 9090
},
{
"epoch": 1.1454797367509526,
"grad_norm": 0.2915593385696411,
"learning_rate": 0.0002330874549222404,
"loss": 0.4646,
"step": 9095
},
{
"epoch": 1.14610951916113,
"grad_norm": 0.3337639570236206,
"learning_rate": 0.00023299591396872893,
"loss": 0.4597,
"step": 9100
},
{
"epoch": 1.1467393015713072,
"grad_norm": 0.3345816433429718,
"learning_rate": 0.0002329043284445158,
"loss": 0.519,
"step": 9105
},
{
"epoch": 1.1473690839814843,
"grad_norm": 0.31568819284439087,
"learning_rate": 0.0002328126983987846,
"loss": 0.455,
"step": 9110
},
{
"epoch": 1.1479988663916616,
"grad_norm": 0.3630363643169403,
"learning_rate": 0.00023272102388074265,
"loss": 0.4544,
"step": 9115
},
{
"epoch": 1.148628648801839,
"grad_norm": 0.30382248759269714,
"learning_rate": 0.00023262930493962142,
"loss": 0.485,
"step": 9120
},
{
"epoch": 1.1492584312120162,
"grad_norm": 0.30339518189430237,
"learning_rate": 0.0002325375416246759,
"loss": 0.474,
"step": 9125
},
{
"epoch": 1.1498882136221935,
"grad_norm": 0.33041009306907654,
"learning_rate": 0.00023244573398518523,
"loss": 0.447,
"step": 9130
},
{
"epoch": 1.1505179960323708,
"grad_norm": 0.35708925127983093,
"learning_rate": 0.00023235388207045214,
"loss": 0.4801,
"step": 9135
},
{
"epoch": 1.1511477784425481,
"grad_norm": 0.3497597575187683,
"learning_rate": 0.00023226198592980318,
"loss": 0.4753,
"step": 9140
},
{
"epoch": 1.1517775608527254,
"grad_norm": 0.31747546792030334,
"learning_rate": 0.00023217004561258876,
"loss": 0.4642,
"step": 9145
},
{
"epoch": 1.1524073432629027,
"grad_norm": 0.31225451827049255,
"learning_rate": 0.00023207806116818283,
"loss": 0.501,
"step": 9150
},
{
"epoch": 1.15303712567308,
"grad_norm": 0.31150931119918823,
"learning_rate": 0.00023198603264598327,
"loss": 0.447,
"step": 9155
},
{
"epoch": 1.1536669080832573,
"grad_norm": 0.29207199811935425,
"learning_rate": 0.00023189396009541135,
"loss": 0.448,
"step": 9160
},
{
"epoch": 1.1542966904934344,
"grad_norm": 0.33640962839126587,
"learning_rate": 0.00023180184356591223,
"loss": 0.4725,
"step": 9165
},
{
"epoch": 1.1549264729036117,
"grad_norm": 0.292582631111145,
"learning_rate": 0.00023170968310695457,
"loss": 0.4603,
"step": 9170
},
{
"epoch": 1.155556255313789,
"grad_norm": 0.3217863142490387,
"learning_rate": 0.00023161747876803066,
"loss": 0.4386,
"step": 9175
},
{
"epoch": 1.1561860377239663,
"grad_norm": 0.32607826590538025,
"learning_rate": 0.00023152523059865622,
"loss": 0.4747,
"step": 9180
},
{
"epoch": 1.1568158201341436,
"grad_norm": 0.35956209897994995,
"learning_rate": 0.00023143293864837078,
"loss": 0.4563,
"step": 9185
},
{
"epoch": 1.157445602544321,
"grad_norm": 0.3542852997779846,
"learning_rate": 0.00023134060296673716,
"loss": 0.4907,
"step": 9190
},
{
"epoch": 1.1580753849544982,
"grad_norm": 0.3324996829032898,
"learning_rate": 0.0002312482236033417,
"loss": 0.4539,
"step": 9195
},
{
"epoch": 1.1587051673646755,
"grad_norm": 0.3436378836631775,
"learning_rate": 0.00023115580060779429,
"loss": 0.5107,
"step": 9200
},
{
"epoch": 1.1593349497748529,
"grad_norm": 0.2886941730976105,
"learning_rate": 0.00023106333402972813,
"loss": 0.4547,
"step": 9205
},
{
"epoch": 1.1599647321850302,
"grad_norm": 0.30411913990974426,
"learning_rate": 0.00023097082391879993,
"loss": 0.4517,
"step": 9210
},
{
"epoch": 1.1605945145952075,
"grad_norm": 0.3265014886856079,
"learning_rate": 0.00023087827032468975,
"loss": 0.4589,
"step": 9215
},
{
"epoch": 1.1612242970053845,
"grad_norm": 0.2876526713371277,
"learning_rate": 0.00023078567329710091,
"loss": 0.466,
"step": 9220
},
{
"epoch": 1.1618540794155618,
"grad_norm": 0.29947248101234436,
"learning_rate": 0.0002306930328857602,
"loss": 0.4459,
"step": 9225
},
{
"epoch": 1.1624838618257392,
"grad_norm": 0.33246028423309326,
"learning_rate": 0.00023060034914041753,
"loss": 0.4826,
"step": 9230
},
{
"epoch": 1.1631136442359165,
"grad_norm": 0.29653674364089966,
"learning_rate": 0.0002305076221108463,
"loss": 0.4394,
"step": 9235
},
{
"epoch": 1.1637434266460938,
"grad_norm": 0.30506858229637146,
"learning_rate": 0.00023041485184684308,
"loss": 0.4645,
"step": 9240
},
{
"epoch": 1.164373209056271,
"grad_norm": 0.2603437304496765,
"learning_rate": 0.00023032203839822748,
"loss": 0.4536,
"step": 9245
},
{
"epoch": 1.1650029914664484,
"grad_norm": 0.3310236632823944,
"learning_rate": 0.00023022918181484254,
"loss": 0.4653,
"step": 9250
},
{
"epoch": 1.1656327738766257,
"grad_norm": 0.3645521104335785,
"learning_rate": 0.0002301362821465543,
"loss": 0.4404,
"step": 9255
},
{
"epoch": 1.166262556286803,
"grad_norm": 0.33431464433670044,
"learning_rate": 0.00023004333944325208,
"loss": 0.4389,
"step": 9260
},
{
"epoch": 1.1668923386969803,
"grad_norm": 0.31086647510528564,
"learning_rate": 0.00022995035375484817,
"loss": 0.493,
"step": 9265
},
{
"epoch": 1.1675221211071576,
"grad_norm": 0.34322085976600647,
"learning_rate": 0.00022985732513127805,
"loss": 0.4839,
"step": 9270
},
{
"epoch": 1.1681519035173347,
"grad_norm": 0.3111884593963623,
"learning_rate": 0.0002297642536225002,
"loss": 0.4473,
"step": 9275
},
{
"epoch": 1.168781685927512,
"grad_norm": 0.3494400084018707,
"learning_rate": 0.00022967113927849613,
"loss": 0.469,
"step": 9280
},
{
"epoch": 1.1694114683376893,
"grad_norm": 0.27351829409599304,
"learning_rate": 0.00022957798214927037,
"loss": 0.4617,
"step": 9285
},
{
"epoch": 1.1700412507478666,
"grad_norm": 0.3605945408344269,
"learning_rate": 0.00022948478228485046,
"loss": 0.444,
"step": 9290
},
{
"epoch": 1.1706710331580439,
"grad_norm": 0.31383225321769714,
"learning_rate": 0.0002293915397352869,
"loss": 0.4716,
"step": 9295
},
{
"epoch": 1.1713008155682212,
"grad_norm": 0.3261600732803345,
"learning_rate": 0.00022929825455065292,
"loss": 0.4646,
"step": 9300
},
{
"epoch": 1.1719305979783985,
"grad_norm": 0.29624396562576294,
"learning_rate": 0.00022920492678104492,
"loss": 0.4636,
"step": 9305
},
{
"epoch": 1.1725603803885758,
"grad_norm": 0.39078545570373535,
"learning_rate": 0.00022911155647658201,
"loss": 0.4933,
"step": 9310
},
{
"epoch": 1.173190162798753,
"grad_norm": 0.2990373373031616,
"learning_rate": 0.00022901814368740615,
"loss": 0.4726,
"step": 9315
},
{
"epoch": 1.1738199452089304,
"grad_norm": 0.28325891494750977,
"learning_rate": 0.00022892468846368217,
"loss": 0.4428,
"step": 9320
},
{
"epoch": 1.1744497276191077,
"grad_norm": 0.3357643187046051,
"learning_rate": 0.0002288311908555977,
"loss": 0.4618,
"step": 9325
},
{
"epoch": 1.1750795100292848,
"grad_norm": 0.31550613045692444,
"learning_rate": 0.00022873765091336302,
"loss": 0.4607,
"step": 9330
},
{
"epoch": 1.175709292439462,
"grad_norm": 0.30639806389808655,
"learning_rate": 0.00022864406868721118,
"loss": 0.458,
"step": 9335
},
{
"epoch": 1.1763390748496394,
"grad_norm": 0.3836449086666107,
"learning_rate": 0.0002285504442273981,
"loss": 0.4788,
"step": 9340
},
{
"epoch": 1.1769688572598167,
"grad_norm": 0.2955804467201233,
"learning_rate": 0.00022845677758420217,
"loss": 0.4636,
"step": 9345
},
{
"epoch": 1.177598639669994,
"grad_norm": 0.3264003098011017,
"learning_rate": 0.0002283630688079245,
"loss": 0.4769,
"step": 9350
},
{
"epoch": 1.1782284220801713,
"grad_norm": 0.34578555822372437,
"learning_rate": 0.00022826931794888894,
"loss": 0.4784,
"step": 9355
},
{
"epoch": 1.1788582044903486,
"grad_norm": 0.37039560079574585,
"learning_rate": 0.00022817552505744178,
"loss": 0.5042,
"step": 9360
},
{
"epoch": 1.179487986900526,
"grad_norm": 0.319118857383728,
"learning_rate": 0.00022808169018395192,
"loss": 0.4607,
"step": 9365
},
{
"epoch": 1.1801177693107032,
"grad_norm": 0.32380104064941406,
"learning_rate": 0.00022798781337881086,
"loss": 0.4606,
"step": 9370
},
{
"epoch": 1.1807475517208805,
"grad_norm": 0.3038274943828583,
"learning_rate": 0.00022789389469243256,
"loss": 0.448,
"step": 9375
},
{
"epoch": 1.1813773341310578,
"grad_norm": 0.3078247308731079,
"learning_rate": 0.00022779993417525356,
"loss": 0.4683,
"step": 9380
},
{
"epoch": 1.182007116541235,
"grad_norm": 0.2909676432609558,
"learning_rate": 0.00022770593187773275,
"loss": 0.4778,
"step": 9385
},
{
"epoch": 1.1826368989514122,
"grad_norm": 0.3095955550670624,
"learning_rate": 0.00022761188785035155,
"loss": 0.4523,
"step": 9390
},
{
"epoch": 1.1832666813615895,
"grad_norm": 0.2969966530799866,
"learning_rate": 0.0002275178021436137,
"loss": 0.4735,
"step": 9395
},
{
"epoch": 1.1838964637717668,
"grad_norm": 0.2896679937839508,
"learning_rate": 0.00022742367480804544,
"loss": 0.45,
"step": 9400
},
{
"epoch": 1.1845262461819441,
"grad_norm": 0.31511151790618896,
"learning_rate": 0.0002273295058941952,
"loss": 0.4614,
"step": 9405
},
{
"epoch": 1.1851560285921214,
"grad_norm": 0.3440285623073578,
"learning_rate": 0.00022723529545263399,
"loss": 0.4593,
"step": 9410
},
{
"epoch": 1.1857858110022987,
"grad_norm": 0.29399538040161133,
"learning_rate": 0.00022714104353395483,
"loss": 0.4519,
"step": 9415
},
{
"epoch": 1.186415593412476,
"grad_norm": 0.3958999812602997,
"learning_rate": 0.00022704675018877322,
"loss": 0.4838,
"step": 9420
},
{
"epoch": 1.1870453758226533,
"grad_norm": 0.2960554361343384,
"learning_rate": 0.0002269524154677268,
"loss": 0.459,
"step": 9425
},
{
"epoch": 1.1876751582328307,
"grad_norm": 0.32369253039360046,
"learning_rate": 0.00022685803942147555,
"loss": 0.4542,
"step": 9430
},
{
"epoch": 1.188304940643008,
"grad_norm": 0.320547491312027,
"learning_rate": 0.00022676362210070144,
"loss": 0.4853,
"step": 9435
},
{
"epoch": 1.188934723053185,
"grad_norm": 0.2984744906425476,
"learning_rate": 0.00022666916355610885,
"loss": 0.4201,
"step": 9440
},
{
"epoch": 1.1895645054633623,
"grad_norm": 0.34194597601890564,
"learning_rate": 0.00022657466383842407,
"loss": 0.4705,
"step": 9445
},
{
"epoch": 1.1901942878735396,
"grad_norm": 0.29718858003616333,
"learning_rate": 0.0002264801229983957,
"loss": 0.4403,
"step": 9450
},
{
"epoch": 1.190824070283717,
"grad_norm": 0.29723846912384033,
"learning_rate": 0.0002263855410867943,
"loss": 0.4841,
"step": 9455
},
{
"epoch": 1.1914538526938943,
"grad_norm": 0.31662440299987793,
"learning_rate": 0.00022629091815441245,
"loss": 0.456,
"step": 9460
},
{
"epoch": 1.1920836351040716,
"grad_norm": 0.3458605408668518,
"learning_rate": 0.0002261962542520649,
"loss": 0.4504,
"step": 9465
},
{
"epoch": 1.1927134175142489,
"grad_norm": 0.31829431653022766,
"learning_rate": 0.00022610154943058833,
"loss": 0.4821,
"step": 9470
},
{
"epoch": 1.1933431999244262,
"grad_norm": 0.3380287289619446,
"learning_rate": 0.00022600680374084138,
"loss": 0.4963,
"step": 9475
},
{
"epoch": 1.1939729823346035,
"grad_norm": 0.3048580288887024,
"learning_rate": 0.00022591201723370458,
"loss": 0.4443,
"step": 9480
},
{
"epoch": 1.1946027647447806,
"grad_norm": 0.34586548805236816,
"learning_rate": 0.0002258171899600806,
"loss": 0.473,
"step": 9485
},
{
"epoch": 1.195232547154958,
"grad_norm": 0.2828037440776825,
"learning_rate": 0.0002257223219708937,
"loss": 0.4539,
"step": 9490
},
{
"epoch": 1.1958623295651352,
"grad_norm": 0.31300345063209534,
"learning_rate": 0.00022562741331709024,
"loss": 0.4353,
"step": 9495
},
{
"epoch": 1.1964921119753125,
"grad_norm": 0.311260849237442,
"learning_rate": 0.0002255324640496383,
"loss": 0.4553,
"step": 9500
},
{
"epoch": 1.1971218943854898,
"grad_norm": 0.2941080331802368,
"learning_rate": 0.0002254374742195279,
"loss": 0.4464,
"step": 9505
},
{
"epoch": 1.197751676795667,
"grad_norm": 0.26669132709503174,
"learning_rate": 0.00022534244387777057,
"loss": 0.4368,
"step": 9510
},
{
"epoch": 1.1983814592058444,
"grad_norm": 0.2933709919452667,
"learning_rate": 0.00022524737307539995,
"loss": 0.4526,
"step": 9515
},
{
"epoch": 1.1990112416160217,
"grad_norm": 0.338360458612442,
"learning_rate": 0.0002251522618634711,
"loss": 0.4625,
"step": 9520
},
{
"epoch": 1.199641024026199,
"grad_norm": 0.31670835614204407,
"learning_rate": 0.00022505711029306098,
"loss": 0.4553,
"step": 9525
},
{
"epoch": 1.2002708064363763,
"grad_norm": 0.3221518099308014,
"learning_rate": 0.00022496191841526813,
"loss": 0.475,
"step": 9530
},
{
"epoch": 1.2009005888465536,
"grad_norm": 0.32984668016433716,
"learning_rate": 0.00022486668628121282,
"loss": 0.4526,
"step": 9535
},
{
"epoch": 1.2015303712567307,
"grad_norm": 0.2793140113353729,
"learning_rate": 0.00022477141394203678,
"loss": 0.4374,
"step": 9540
},
{
"epoch": 1.2021601536669082,
"grad_norm": 0.3125605881214142,
"learning_rate": 0.00022467610144890357,
"loss": 0.4569,
"step": 9545
},
{
"epoch": 1.2027899360770853,
"grad_norm": 0.2892754375934601,
"learning_rate": 0.00022458074885299808,
"loss": 0.4747,
"step": 9550
},
{
"epoch": 1.2034197184872626,
"grad_norm": 0.3224146068096161,
"learning_rate": 0.00022448535620552684,
"loss": 0.4372,
"step": 9555
},
{
"epoch": 1.20404950089744,
"grad_norm": 0.33973759412765503,
"learning_rate": 0.00022438992355771787,
"loss": 0.4368,
"step": 9560
},
{
"epoch": 1.2046792833076172,
"grad_norm": 0.37665504217147827,
"learning_rate": 0.00022429445096082073,
"loss": 0.4747,
"step": 9565
},
{
"epoch": 1.2053090657177945,
"grad_norm": 0.2834467589855194,
"learning_rate": 0.00022419893846610634,
"loss": 0.4841,
"step": 9570
},
{
"epoch": 1.2059388481279718,
"grad_norm": 0.3729229271411896,
"learning_rate": 0.00022410338612486715,
"loss": 0.475,
"step": 9575
},
{
"epoch": 1.2065686305381491,
"grad_norm": 0.30668923258781433,
"learning_rate": 0.00022400779398841684,
"loss": 0.4271,
"step": 9580
},
{
"epoch": 1.2071984129483264,
"grad_norm": 0.33016908168792725,
"learning_rate": 0.00022391216210809072,
"loss": 0.4553,
"step": 9585
},
{
"epoch": 1.2078281953585037,
"grad_norm": 0.30926114320755005,
"learning_rate": 0.00022381649053524518,
"loss": 0.4512,
"step": 9590
},
{
"epoch": 1.2084579777686808,
"grad_norm": 0.3481772840023041,
"learning_rate": 0.00022372077932125809,
"loss": 0.4707,
"step": 9595
},
{
"epoch": 1.2090877601788583,
"grad_norm": 0.2549537420272827,
"learning_rate": 0.0002236250285175285,
"loss": 0.4686,
"step": 9600
},
{
"epoch": 1.2097175425890354,
"grad_norm": 0.3111298978328705,
"learning_rate": 0.00022352923817547688,
"loss": 0.4535,
"step": 9605
},
{
"epoch": 1.2103473249992127,
"grad_norm": 0.29062095284461975,
"learning_rate": 0.00022343340834654472,
"loss": 0.4612,
"step": 9610
},
{
"epoch": 1.21097710740939,
"grad_norm": 0.3373335897922516,
"learning_rate": 0.0002233375390821949,
"loss": 0.4233,
"step": 9615
},
{
"epoch": 1.2116068898195673,
"grad_norm": 0.308648943901062,
"learning_rate": 0.0002232416304339114,
"loss": 0.4535,
"step": 9620
},
{
"epoch": 1.2122366722297446,
"grad_norm": 0.32941722869873047,
"learning_rate": 0.00022314568245319935,
"loss": 0.4564,
"step": 9625
},
{
"epoch": 1.212866454639922,
"grad_norm": 0.33229124546051025,
"learning_rate": 0.00022304969519158495,
"loss": 0.458,
"step": 9630
},
{
"epoch": 1.2134962370500992,
"grad_norm": 0.29093366861343384,
"learning_rate": 0.00022295366870061565,
"loss": 0.4315,
"step": 9635
},
{
"epoch": 1.2141260194602765,
"grad_norm": 0.3482106328010559,
"learning_rate": 0.00022285760303185982,
"loss": 0.4311,
"step": 9640
},
{
"epoch": 1.2147558018704538,
"grad_norm": 0.29717814922332764,
"learning_rate": 0.0002227614982369069,
"loss": 0.4261,
"step": 9645
},
{
"epoch": 1.215385584280631,
"grad_norm": 0.3359118700027466,
"learning_rate": 0.00022266535436736738,
"loss": 0.4698,
"step": 9650
},
{
"epoch": 1.2160153666908082,
"grad_norm": 0.3095514476299286,
"learning_rate": 0.0002225691714748727,
"loss": 0.4463,
"step": 9655
},
{
"epoch": 1.2166451491009855,
"grad_norm": 0.29095733165740967,
"learning_rate": 0.0002224729496110753,
"loss": 0.4662,
"step": 9660
},
{
"epoch": 1.2172749315111628,
"grad_norm": 0.34425532817840576,
"learning_rate": 0.00022237668882764847,
"loss": 0.4579,
"step": 9665
},
{
"epoch": 1.2179047139213401,
"grad_norm": 0.32856446504592896,
"learning_rate": 0.0002222803891762865,
"loss": 0.4648,
"step": 9670
},
{
"epoch": 1.2185344963315174,
"grad_norm": 0.35708895325660706,
"learning_rate": 0.00022218405070870451,
"loss": 0.4579,
"step": 9675
},
{
"epoch": 1.2191642787416948,
"grad_norm": 0.26759231090545654,
"learning_rate": 0.0002220876734766384,
"loss": 0.4321,
"step": 9680
},
{
"epoch": 1.219794061151872,
"grad_norm": 0.27995094656944275,
"learning_rate": 0.00022199125753184497,
"loss": 0.4552,
"step": 9685
},
{
"epoch": 1.2204238435620494,
"grad_norm": 0.3591984510421753,
"learning_rate": 0.00022189480292610187,
"loss": 0.4685,
"step": 9690
},
{
"epoch": 1.2210536259722267,
"grad_norm": 0.2892036736011505,
"learning_rate": 0.00022179830971120722,
"loss": 0.4609,
"step": 9695
},
{
"epoch": 1.221683408382404,
"grad_norm": 0.3287111520767212,
"learning_rate": 0.00022170177793898028,
"loss": 0.479,
"step": 9700
},
{
"epoch": 1.222313190792581,
"grad_norm": 0.3088148832321167,
"learning_rate": 0.00022160520766126074,
"loss": 0.4597,
"step": 9705
},
{
"epoch": 1.2229429732027584,
"grad_norm": 0.3263307511806488,
"learning_rate": 0.0002215085989299091,
"loss": 0.4801,
"step": 9710
},
{
"epoch": 1.2235727556129357,
"grad_norm": 0.283078134059906,
"learning_rate": 0.0002214119517968063,
"loss": 0.4476,
"step": 9715
},
{
"epoch": 1.224202538023113,
"grad_norm": 0.3226225674152374,
"learning_rate": 0.00022131526631385422,
"loss": 0.4644,
"step": 9720
},
{
"epoch": 1.2248323204332903,
"grad_norm": 0.32242435216903687,
"learning_rate": 0.00022121854253297514,
"loss": 0.4477,
"step": 9725
},
{
"epoch": 1.2254621028434676,
"grad_norm": 0.3373146057128906,
"learning_rate": 0.0002211217805061119,
"loss": 0.4541,
"step": 9730
},
{
"epoch": 1.2260918852536449,
"grad_norm": 0.28866246342658997,
"learning_rate": 0.00022102498028522786,
"loss": 0.4388,
"step": 9735
},
{
"epoch": 1.2267216676638222,
"grad_norm": 0.308704674243927,
"learning_rate": 0.00022092814192230711,
"loss": 0.425,
"step": 9740
},
{
"epoch": 1.2273514500739995,
"grad_norm": 0.3144040107727051,
"learning_rate": 0.00022083126546935394,
"loss": 0.4532,
"step": 9745
},
{
"epoch": 1.2279812324841768,
"grad_norm": 0.29848021268844604,
"learning_rate": 0.00022073435097839329,
"loss": 0.457,
"step": 9750
},
{
"epoch": 1.228611014894354,
"grad_norm": 0.35102754831314087,
"learning_rate": 0.00022063739850147036,
"loss": 0.4258,
"step": 9755
},
{
"epoch": 1.2292407973045312,
"grad_norm": 0.32105547189712524,
"learning_rate": 0.000220540408090651,
"loss": 0.4226,
"step": 9760
},
{
"epoch": 1.2298705797147085,
"grad_norm": 0.3647817075252533,
"learning_rate": 0.0002204433797980211,
"loss": 0.4556,
"step": 9765
},
{
"epoch": 1.2305003621248858,
"grad_norm": 0.3260333836078644,
"learning_rate": 0.00022034631367568718,
"loss": 0.4834,
"step": 9770
},
{
"epoch": 1.231130144535063,
"grad_norm": 0.30218422412872314,
"learning_rate": 0.00022024920977577596,
"loss": 0.4327,
"step": 9775
},
{
"epoch": 1.2317599269452404,
"grad_norm": 0.3666177988052368,
"learning_rate": 0.0002201520681504344,
"loss": 0.4361,
"step": 9780
},
{
"epoch": 1.2323897093554177,
"grad_norm": 0.3113807737827301,
"learning_rate": 0.00022005488885182975,
"loss": 0.4554,
"step": 9785
},
{
"epoch": 1.233019491765595,
"grad_norm": 0.31085875630378723,
"learning_rate": 0.00021995767193214963,
"loss": 0.4391,
"step": 9790
},
{
"epoch": 1.2336492741757723,
"grad_norm": 0.304509699344635,
"learning_rate": 0.0002198604174436017,
"loss": 0.4754,
"step": 9795
},
{
"epoch": 1.2342790565859496,
"grad_norm": 0.2930733263492584,
"learning_rate": 0.0002197631254384138,
"loss": 0.4194,
"step": 9800
},
{
"epoch": 1.234908838996127,
"grad_norm": 0.30277615785598755,
"learning_rate": 0.00021966579596883394,
"loss": 0.4506,
"step": 9805
},
{
"epoch": 1.2355386214063042,
"grad_norm": 0.2824211120605469,
"learning_rate": 0.00021956842908713037,
"loss": 0.4398,
"step": 9810
},
{
"epoch": 1.2361684038164813,
"grad_norm": 0.31834569573402405,
"learning_rate": 0.00021947102484559121,
"loss": 0.4756,
"step": 9815
},
{
"epoch": 1.2367981862266586,
"grad_norm": 0.355283260345459,
"learning_rate": 0.00021937358329652488,
"loss": 0.456,
"step": 9820
},
{
"epoch": 1.237427968636836,
"grad_norm": 0.2955317497253418,
"learning_rate": 0.00021927610449225962,
"loss": 0.4462,
"step": 9825
},
{
"epoch": 1.2380577510470132,
"grad_norm": 0.2653120756149292,
"learning_rate": 0.00021917858848514383,
"loss": 0.4197,
"step": 9830
},
{
"epoch": 1.2386875334571905,
"grad_norm": 0.3773416578769684,
"learning_rate": 0.0002190810353275458,
"loss": 0.4263,
"step": 9835
},
{
"epoch": 1.2393173158673678,
"grad_norm": 0.28635114431381226,
"learning_rate": 0.00021898344507185384,
"loss": 0.4705,
"step": 9840
},
{
"epoch": 1.2399470982775451,
"grad_norm": 0.3044835031032562,
"learning_rate": 0.00021888581777047608,
"loss": 0.4671,
"step": 9845
},
{
"epoch": 1.2405768806877224,
"grad_norm": 0.293748676776886,
"learning_rate": 0.0002187881534758407,
"loss": 0.436,
"step": 9850
},
{
"epoch": 1.2412066630978997,
"grad_norm": 0.3891184628009796,
"learning_rate": 0.00021869045224039564,
"loss": 0.456,
"step": 9855
},
{
"epoch": 1.241836445508077,
"grad_norm": 0.3140691816806793,
"learning_rate": 0.0002185927141166086,
"loss": 0.4402,
"step": 9860
},
{
"epoch": 1.2424662279182543,
"grad_norm": 0.33889827132225037,
"learning_rate": 0.00021849493915696738,
"loss": 0.4363,
"step": 9865
},
{
"epoch": 1.2430960103284314,
"grad_norm": 0.3084375858306885,
"learning_rate": 0.0002183971274139791,
"loss": 0.4295,
"step": 9870
},
{
"epoch": 1.2437257927386087,
"grad_norm": 0.3091178834438324,
"learning_rate": 0.00021829927894017115,
"loss": 0.4263,
"step": 9875
},
{
"epoch": 1.244355575148786,
"grad_norm": 0.3208729922771454,
"learning_rate": 0.00021820139378809025,
"loss": 0.4233,
"step": 9880
},
{
"epoch": 1.2449853575589633,
"grad_norm": 0.30196666717529297,
"learning_rate": 0.000218103472010303,
"loss": 0.4265,
"step": 9885
},
{
"epoch": 1.2456151399691406,
"grad_norm": 0.3044353127479553,
"learning_rate": 0.0002180055136593956,
"loss": 0.48,
"step": 9890
},
{
"epoch": 1.246244922379318,
"grad_norm": 0.31633850932121277,
"learning_rate": 0.000217907518787974,
"loss": 0.4708,
"step": 9895
},
{
"epoch": 1.2468747047894952,
"grad_norm": 0.29174062609672546,
"learning_rate": 0.0002178094874486636,
"loss": 0.4135,
"step": 9900
},
{
"epoch": 1.2475044871996726,
"grad_norm": 0.33092647790908813,
"learning_rate": 0.00021771141969410956,
"loss": 0.4541,
"step": 9905
},
{
"epoch": 1.2481342696098499,
"grad_norm": 0.30151379108428955,
"learning_rate": 0.00021761331557697635,
"loss": 0.4397,
"step": 9910
},
{
"epoch": 1.2487640520200272,
"grad_norm": 0.31203630566596985,
"learning_rate": 0.00021751517514994836,
"loss": 0.454,
"step": 9915
},
{
"epoch": 1.2493938344302045,
"grad_norm": 0.30847153067588806,
"learning_rate": 0.00021741699846572902,
"loss": 0.4309,
"step": 9920
},
{
"epoch": 1.2500236168403815,
"grad_norm": 0.2937026619911194,
"learning_rate": 0.00021731878557704158,
"loss": 0.4206,
"step": 9925
},
{
"epoch": 1.2506533992505589,
"grad_norm": 0.2875721752643585,
"learning_rate": 0.0002172205365366285,
"loss": 0.4385,
"step": 9930
},
{
"epoch": 1.2512831816607362,
"grad_norm": 0.2834903299808502,
"learning_rate": 0.00021712225139725188,
"loss": 0.423,
"step": 9935
},
{
"epoch": 1.2519129640709135,
"grad_norm": 0.3069617748260498,
"learning_rate": 0.000217023930211693,
"loss": 0.4536,
"step": 9940
},
{
"epoch": 1.2525427464810908,
"grad_norm": 0.32263246178627014,
"learning_rate": 0.0002169255730327526,
"loss": 0.4281,
"step": 9945
},
{
"epoch": 1.253172528891268,
"grad_norm": 0.2980237603187561,
"learning_rate": 0.00021682717991325075,
"loss": 0.4163,
"step": 9950
},
{
"epoch": 1.2538023113014454,
"grad_norm": 0.3552669584751129,
"learning_rate": 0.0002167287509060268,
"loss": 0.4378,
"step": 9955
},
{
"epoch": 1.2544320937116227,
"grad_norm": 0.3207598924636841,
"learning_rate": 0.00021663028606393932,
"loss": 0.4411,
"step": 9960
},
{
"epoch": 1.2550618761218,
"grad_norm": 0.3187711238861084,
"learning_rate": 0.0002165317854398663,
"loss": 0.4384,
"step": 9965
},
{
"epoch": 1.2556916585319773,
"grad_norm": 0.3156946897506714,
"learning_rate": 0.00021643324908670472,
"loss": 0.4227,
"step": 9970
},
{
"epoch": 1.2563214409421546,
"grad_norm": 0.3305997848510742,
"learning_rate": 0.00021633467705737085,
"loss": 0.4521,
"step": 9975
},
{
"epoch": 1.2569512233523317,
"grad_norm": 0.2964983880519867,
"learning_rate": 0.00021623606940480015,
"loss": 0.4373,
"step": 9980
},
{
"epoch": 1.257581005762509,
"grad_norm": 0.29807519912719727,
"learning_rate": 0.00021613742618194727,
"loss": 0.4591,
"step": 9985
},
{
"epoch": 1.2582107881726863,
"grad_norm": 0.29127413034439087,
"learning_rate": 0.00021603874744178576,
"loss": 0.43,
"step": 9990
},
{
"epoch": 1.2588405705828636,
"grad_norm": 0.339418888092041,
"learning_rate": 0.00021594003323730836,
"loss": 0.4407,
"step": 9995
},
{
"epoch": 1.2594703529930409,
"grad_norm": 0.3419913053512573,
"learning_rate": 0.0002158412836215269,
"loss": 0.4678,
"step": 10000
},
{
"epoch": 1.2594703529930409,
"eval_loss": 0.30844178795814514,
"eval_runtime": 6.157,
"eval_samples_per_second": 162.416,
"eval_steps_per_second": 10.232,
"step": 10000
},
{
"epoch": 1.2601001354032182,
"grad_norm": 0.3139461576938629,
"learning_rate": 0.00021574249864747216,
"loss": 0.4491,
"step": 10005
},
{
"epoch": 1.2607299178133955,
"grad_norm": 0.319892555475235,
"learning_rate": 0.00021564367836819393,
"loss": 0.4648,
"step": 10010
},
{
"epoch": 1.2613597002235728,
"grad_norm": 0.30732426047325134,
"learning_rate": 0.00021554482283676093,
"loss": 0.4113,
"step": 10015
},
{
"epoch": 1.26198948263375,
"grad_norm": 0.3234427571296692,
"learning_rate": 0.00021544593210626092,
"loss": 0.4461,
"step": 10020
},
{
"epoch": 1.2626192650439272,
"grad_norm": 0.3298225998878479,
"learning_rate": 0.00021534700622980038,
"loss": 0.4487,
"step": 10025
},
{
"epoch": 1.2632490474541047,
"grad_norm": 0.3394641578197479,
"learning_rate": 0.0002152480452605048,
"loss": 0.4653,
"step": 10030
},
{
"epoch": 1.2638788298642818,
"grad_norm": 0.29091107845306396,
"learning_rate": 0.00021514904925151854,
"loss": 0.4639,
"step": 10035
},
{
"epoch": 1.264508612274459,
"grad_norm": 0.27975961565971375,
"learning_rate": 0.00021505001825600461,
"loss": 0.4094,
"step": 10040
},
{
"epoch": 1.2651383946846364,
"grad_norm": 0.2882293164730072,
"learning_rate": 0.00021495095232714503,
"loss": 0.4212,
"step": 10045
},
{
"epoch": 1.2657681770948137,
"grad_norm": 0.31701260805130005,
"learning_rate": 0.0002148518515181404,
"loss": 0.4427,
"step": 10050
},
{
"epoch": 1.266397959504991,
"grad_norm": 0.33051052689552307,
"learning_rate": 0.00021475271588221014,
"loss": 0.4331,
"step": 10055
},
{
"epoch": 1.2670277419151683,
"grad_norm": 0.32075920701026917,
"learning_rate": 0.00021465354547259234,
"loss": 0.4486,
"step": 10060
},
{
"epoch": 1.2676575243253456,
"grad_norm": 0.3044838309288025,
"learning_rate": 0.00021455434034254375,
"loss": 0.4141,
"step": 10065
},
{
"epoch": 1.268287306735523,
"grad_norm": 0.31618407368659973,
"learning_rate": 0.00021445510054533983,
"loss": 0.446,
"step": 10070
},
{
"epoch": 1.2689170891457002,
"grad_norm": 0.3025960624217987,
"learning_rate": 0.0002143558261342746,
"loss": 0.4233,
"step": 10075
},
{
"epoch": 1.2695468715558773,
"grad_norm": 0.2974034249782562,
"learning_rate": 0.0002142565171626607,
"loss": 0.4078,
"step": 10080
},
{
"epoch": 1.2701766539660548,
"grad_norm": 0.34097397327423096,
"learning_rate": 0.0002141571736838293,
"loss": 0.4555,
"step": 10085
},
{
"epoch": 1.270806436376232,
"grad_norm": 0.30995890498161316,
"learning_rate": 0.0002140577957511302,
"loss": 0.4388,
"step": 10090
},
{
"epoch": 1.2714362187864092,
"grad_norm": 0.24191588163375854,
"learning_rate": 0.00021395838341793145,
"loss": 0.4114,
"step": 10095
},
{
"epoch": 1.2720660011965865,
"grad_norm": 0.31779953837394714,
"learning_rate": 0.00021385893673761986,
"loss": 0.4169,
"step": 10100
},
{
"epoch": 1.2726957836067638,
"grad_norm": 0.31599584221839905,
"learning_rate": 0.0002137594557636006,
"loss": 0.4081,
"step": 10105
},
{
"epoch": 1.2733255660169411,
"grad_norm": 0.31904011964797974,
"learning_rate": 0.00021365994054929713,
"loss": 0.4406,
"step": 10110
},
{
"epoch": 1.2739553484271184,
"grad_norm": 0.2923012375831604,
"learning_rate": 0.00021356039114815145,
"loss": 0.4335,
"step": 10115
},
{
"epoch": 1.2745851308372957,
"grad_norm": 0.27983418107032776,
"learning_rate": 0.00021346080761362385,
"loss": 0.4039,
"step": 10120
},
{
"epoch": 1.275214913247473,
"grad_norm": 0.29870182275772095,
"learning_rate": 0.000213361189999193,
"loss": 0.4311,
"step": 10125
},
{
"epoch": 1.2758446956576504,
"grad_norm": 0.3060225546360016,
"learning_rate": 0.00021326153835835574,
"loss": 0.4722,
"step": 10130
},
{
"epoch": 1.2764744780678274,
"grad_norm": 0.38860756158828735,
"learning_rate": 0.00021316185274462734,
"loss": 0.4276,
"step": 10135
},
{
"epoch": 1.277104260478005,
"grad_norm": 0.32171720266342163,
"learning_rate": 0.0002130621332115413,
"loss": 0.4334,
"step": 10140
},
{
"epoch": 1.277734042888182,
"grad_norm": 0.2947072684764862,
"learning_rate": 0.00021296237981264916,
"loss": 0.411,
"step": 10145
},
{
"epoch": 1.2783638252983593,
"grad_norm": 0.2904439866542816,
"learning_rate": 0.00021286259260152088,
"loss": 0.4222,
"step": 10150
},
{
"epoch": 1.2789936077085367,
"grad_norm": 0.2517947554588318,
"learning_rate": 0.00021276277163174444,
"loss": 0.4336,
"step": 10155
},
{
"epoch": 1.279623390118714,
"grad_norm": 0.295692503452301,
"learning_rate": 0.00021266291695692602,
"loss": 0.4617,
"step": 10160
},
{
"epoch": 1.2802531725288913,
"grad_norm": 0.3214627802371979,
"learning_rate": 0.00021256302863068976,
"loss": 0.4327,
"step": 10165
},
{
"epoch": 1.2808829549390686,
"grad_norm": 0.3030719459056854,
"learning_rate": 0.00021246310670667808,
"loss": 0.4289,
"step": 10170
},
{
"epoch": 1.2815127373492459,
"grad_norm": 0.32924139499664307,
"learning_rate": 0.00021236315123855128,
"loss": 0.4391,
"step": 10175
},
{
"epoch": 1.2821425197594232,
"grad_norm": 0.2978973984718323,
"learning_rate": 0.00021226316227998773,
"loss": 0.4356,
"step": 10180
},
{
"epoch": 1.2827723021696005,
"grad_norm": 0.289858341217041,
"learning_rate": 0.00021216313988468375,
"loss": 0.4302,
"step": 10185
},
{
"epoch": 1.2834020845797776,
"grad_norm": 0.28235578536987305,
"learning_rate": 0.00021206308410635376,
"loss": 0.4581,
"step": 10190
},
{
"epoch": 1.284031866989955,
"grad_norm": 0.28610706329345703,
"learning_rate": 0.0002119629949987299,
"loss": 0.4233,
"step": 10195
},
{
"epoch": 1.2846616494001322,
"grad_norm": 0.347464382648468,
"learning_rate": 0.00021186287261556238,
"loss": 0.4191,
"step": 10200
},
{
"epoch": 1.2852914318103095,
"grad_norm": 0.3228091299533844,
"learning_rate": 0.00021176271701061914,
"loss": 0.4162,
"step": 10205
},
{
"epoch": 1.2859212142204868,
"grad_norm": 0.34487780928611755,
"learning_rate": 0.00021166252823768606,
"loss": 0.4383,
"step": 10210
},
{
"epoch": 1.286550996630664,
"grad_norm": 0.34411466121673584,
"learning_rate": 0.00021156230635056676,
"loss": 0.4532,
"step": 10215
},
{
"epoch": 1.2871807790408414,
"grad_norm": 0.38219863176345825,
"learning_rate": 0.00021146205140308273,
"loss": 0.4656,
"step": 10220
},
{
"epoch": 1.2878105614510187,
"grad_norm": 0.3240879774093628,
"learning_rate": 0.00021136176344907322,
"loss": 0.4174,
"step": 10225
},
{
"epoch": 1.288440343861196,
"grad_norm": 0.34157487750053406,
"learning_rate": 0.00021126144254239503,
"loss": 0.4297,
"step": 10230
},
{
"epoch": 1.2890701262713733,
"grad_norm": 0.2788861095905304,
"learning_rate": 0.00021116108873692286,
"loss": 0.429,
"step": 10235
},
{
"epoch": 1.2896999086815506,
"grad_norm": 0.28119325637817383,
"learning_rate": 0.00021106070208654895,
"loss": 0.4145,
"step": 10240
},
{
"epoch": 1.2903296910917277,
"grad_norm": 0.32004043459892273,
"learning_rate": 0.00021096028264518325,
"loss": 0.4361,
"step": 10245
},
{
"epoch": 1.2909594735019052,
"grad_norm": 0.3054758310317993,
"learning_rate": 0.0002108598304667533,
"loss": 0.4331,
"step": 10250
},
{
"epoch": 1.2915892559120823,
"grad_norm": 0.3827783167362213,
"learning_rate": 0.0002107593456052042,
"loss": 0.4246,
"step": 10255
},
{
"epoch": 1.2922190383222596,
"grad_norm": 0.3008691370487213,
"learning_rate": 0.00021065882811449862,
"loss": 0.4448,
"step": 10260
},
{
"epoch": 1.292848820732437,
"grad_norm": 0.3227977752685547,
"learning_rate": 0.00021055827804861675,
"loss": 0.4308,
"step": 10265
},
{
"epoch": 1.2934786031426142,
"grad_norm": 0.32592520117759705,
"learning_rate": 0.00021045769546155623,
"loss": 0.4472,
"step": 10270
},
{
"epoch": 1.2941083855527915,
"grad_norm": 0.30866268277168274,
"learning_rate": 0.00021035708040733231,
"loss": 0.4193,
"step": 10275
},
{
"epoch": 1.2947381679629688,
"grad_norm": 0.36590054631233215,
"learning_rate": 0.0002102564329399775,
"loss": 0.4554,
"step": 10280
},
{
"epoch": 1.2953679503731461,
"grad_norm": 0.34002235531806946,
"learning_rate": 0.00021015575311354175,
"loss": 0.465,
"step": 10285
},
{
"epoch": 1.2959977327833234,
"grad_norm": 0.26847660541534424,
"learning_rate": 0.00021005504098209248,
"loss": 0.4226,
"step": 10290
},
{
"epoch": 1.2966275151935007,
"grad_norm": 0.2904103398323059,
"learning_rate": 0.00020995429659971445,
"loss": 0.4135,
"step": 10295
},
{
"epoch": 1.2972572976036778,
"grad_norm": 0.2799352705478668,
"learning_rate": 0.00020985352002050962,
"loss": 0.4241,
"step": 10300
},
{
"epoch": 1.2978870800138553,
"grad_norm": 0.3527425229549408,
"learning_rate": 0.00020975271129859734,
"loss": 0.4397,
"step": 10305
},
{
"epoch": 1.2985168624240324,
"grad_norm": 0.30795904994010925,
"learning_rate": 0.00020965187048811417,
"loss": 0.4248,
"step": 10310
},
{
"epoch": 1.2991466448342097,
"grad_norm": 0.31814008951187134,
"learning_rate": 0.00020955099764321402,
"loss": 0.4501,
"step": 10315
},
{
"epoch": 1.299776427244387,
"grad_norm": 0.29917100071907043,
"learning_rate": 0.0002094500928180678,
"loss": 0.4511,
"step": 10320
},
{
"epoch": 1.3004062096545643,
"grad_norm": 0.32853367924690247,
"learning_rate": 0.00020934915606686373,
"loss": 0.4055,
"step": 10325
},
{
"epoch": 1.3010359920647416,
"grad_norm": 0.420550137758255,
"learning_rate": 0.00020924818744380723,
"loss": 0.4417,
"step": 10330
},
{
"epoch": 1.301665774474919,
"grad_norm": 0.3183051347732544,
"learning_rate": 0.0002091471870031207,
"loss": 0.4256,
"step": 10335
},
{
"epoch": 1.3022955568850962,
"grad_norm": 0.30520761013031006,
"learning_rate": 0.00020904615479904362,
"loss": 0.4213,
"step": 10340
},
{
"epoch": 1.3029253392952735,
"grad_norm": 0.3484478294849396,
"learning_rate": 0.0002089450908858327,
"loss": 0.4202,
"step": 10345
},
{
"epoch": 1.3035551217054508,
"grad_norm": 0.3063777983188629,
"learning_rate": 0.00020884399531776154,
"loss": 0.4121,
"step": 10350
},
{
"epoch": 1.304184904115628,
"grad_norm": 0.35436901450157166,
"learning_rate": 0.00020874286814912072,
"loss": 0.4351,
"step": 10355
},
{
"epoch": 1.3048146865258055,
"grad_norm": 0.3233969211578369,
"learning_rate": 0.00020864170943421786,
"loss": 0.4326,
"step": 10360
},
{
"epoch": 1.3054444689359825,
"grad_norm": 0.34073448181152344,
"learning_rate": 0.0002085405192273776,
"loss": 0.4454,
"step": 10365
},
{
"epoch": 1.3060742513461598,
"grad_norm": 0.28455135226249695,
"learning_rate": 0.00020843929758294121,
"loss": 0.4511,
"step": 10370
},
{
"epoch": 1.3067040337563371,
"grad_norm": 0.31585589051246643,
"learning_rate": 0.0002083380445552672,
"loss": 0.4258,
"step": 10375
},
{
"epoch": 1.3073338161665145,
"grad_norm": 0.31528952717781067,
"learning_rate": 0.00020823676019873064,
"loss": 0.424,
"step": 10380
},
{
"epoch": 1.3079635985766918,
"grad_norm": 0.3014485836029053,
"learning_rate": 0.00020813544456772362,
"loss": 0.4429,
"step": 10385
},
{
"epoch": 1.308593380986869,
"grad_norm": 0.2870473861694336,
"learning_rate": 0.00020803409771665484,
"loss": 0.439,
"step": 10390
},
{
"epoch": 1.3092231633970464,
"grad_norm": 0.2971458435058594,
"learning_rate": 0.00020793271969994997,
"loss": 0.4233,
"step": 10395
},
{
"epoch": 1.3098529458072237,
"grad_norm": 0.2853131890296936,
"learning_rate": 0.00020783131057205135,
"loss": 0.4164,
"step": 10400
},
{
"epoch": 1.310482728217401,
"grad_norm": 0.29392004013061523,
"learning_rate": 0.00020772987038741793,
"loss": 0.4234,
"step": 10405
},
{
"epoch": 1.311112510627578,
"grad_norm": 0.2874060273170471,
"learning_rate": 0.00020762839920052543,
"loss": 0.4413,
"step": 10410
},
{
"epoch": 1.3117422930377556,
"grad_norm": 0.2806376516819,
"learning_rate": 0.00020752689706586615,
"loss": 0.4223,
"step": 10415
},
{
"epoch": 1.3123720754479327,
"grad_norm": 0.28510767221450806,
"learning_rate": 0.00020742536403794908,
"loss": 0.4183,
"step": 10420
},
{
"epoch": 1.31300185785811,
"grad_norm": 0.3087919056415558,
"learning_rate": 0.00020732380017129983,
"loss": 0.4241,
"step": 10425
},
{
"epoch": 1.3136316402682873,
"grad_norm": 0.2965323328971863,
"learning_rate": 0.00020722220552046048,
"loss": 0.4225,
"step": 10430
},
{
"epoch": 1.3142614226784646,
"grad_norm": 0.2907772660255432,
"learning_rate": 0.00020712058013998963,
"loss": 0.4176,
"step": 10435
},
{
"epoch": 1.3148912050886419,
"grad_norm": 0.3242434859275818,
"learning_rate": 0.0002070189240844625,
"loss": 0.4377,
"step": 10440
},
{
"epoch": 1.3155209874988192,
"grad_norm": 0.28129857778549194,
"learning_rate": 0.00020691723740847066,
"loss": 0.425,
"step": 10445
},
{
"epoch": 1.3161507699089965,
"grad_norm": 0.3053089380264282,
"learning_rate": 0.00020681552016662224,
"loss": 0.4066,
"step": 10450
},
{
"epoch": 1.3167805523191738,
"grad_norm": 0.27167361974716187,
"learning_rate": 0.00020671377241354168,
"loss": 0.4458,
"step": 10455
},
{
"epoch": 1.317410334729351,
"grad_norm": 0.29331174492836,
"learning_rate": 0.00020661199420386986,
"loss": 0.427,
"step": 10460
},
{
"epoch": 1.3180401171395282,
"grad_norm": 0.329908162355423,
"learning_rate": 0.00020651018559226394,
"loss": 0.4292,
"step": 10465
},
{
"epoch": 1.3186698995497057,
"grad_norm": 0.32669904828071594,
"learning_rate": 0.0002064083466333976,
"loss": 0.4118,
"step": 10470
},
{
"epoch": 1.3192996819598828,
"grad_norm": 0.35706159472465515,
"learning_rate": 0.00020630647738196058,
"loss": 0.4433,
"step": 10475
},
{
"epoch": 1.31992946437006,
"grad_norm": 0.3119877278804779,
"learning_rate": 0.00020620457789265905,
"loss": 0.4206,
"step": 10480
},
{
"epoch": 1.3205592467802374,
"grad_norm": 0.34798958897590637,
"learning_rate": 0.00020610264822021532,
"loss": 0.39,
"step": 10485
},
{
"epoch": 1.3211890291904147,
"grad_norm": 0.36972302198410034,
"learning_rate": 0.000206000688419368,
"loss": 0.4402,
"step": 10490
},
{
"epoch": 1.321818811600592,
"grad_norm": 0.27949050068855286,
"learning_rate": 0.00020589869854487175,
"loss": 0.4221,
"step": 10495
},
{
"epoch": 1.3224485940107693,
"grad_norm": 0.30757853388786316,
"learning_rate": 0.00020579667865149758,
"loss": 0.4402,
"step": 10500
},
{
"epoch": 1.3230783764209466,
"grad_norm": 0.3018808364868164,
"learning_rate": 0.0002056946287940324,
"loss": 0.4088,
"step": 10505
},
{
"epoch": 1.323708158831124,
"grad_norm": 0.2630440592765808,
"learning_rate": 0.00020559254902727942,
"loss": 0.4062,
"step": 10510
},
{
"epoch": 1.3243379412413012,
"grad_norm": 0.3145885169506073,
"learning_rate": 0.00020549043940605767,
"loss": 0.4301,
"step": 10515
},
{
"epoch": 1.3249677236514783,
"grad_norm": 0.3040730655193329,
"learning_rate": 0.0002053882999852025,
"loss": 0.4267,
"step": 10520
},
{
"epoch": 1.3255975060616558,
"grad_norm": 0.2861897945404053,
"learning_rate": 0.00020528613081956498,
"loss": 0.4115,
"step": 10525
},
{
"epoch": 1.326227288471833,
"grad_norm": 0.2938830256462097,
"learning_rate": 0.00020518393196401234,
"loss": 0.4315,
"step": 10530
},
{
"epoch": 1.3268570708820102,
"grad_norm": 0.24550281465053558,
"learning_rate": 0.0002050817034734277,
"loss": 0.4181,
"step": 10535
},
{
"epoch": 1.3274868532921875,
"grad_norm": 0.30074000358581543,
"learning_rate": 0.00020497944540271017,
"loss": 0.4016,
"step": 10540
},
{
"epoch": 1.3281166357023648,
"grad_norm": 0.34675145149230957,
"learning_rate": 0.0002048771578067745,
"loss": 0.4157,
"step": 10545
},
{
"epoch": 1.3287464181125421,
"grad_norm": 0.3144848644733429,
"learning_rate": 0.00020477484074055157,
"loss": 0.4024,
"step": 10550
},
{
"epoch": 1.3293762005227194,
"grad_norm": 0.32153722643852234,
"learning_rate": 0.00020467249425898805,
"loss": 0.4114,
"step": 10555
},
{
"epoch": 1.3300059829328967,
"grad_norm": 0.301707923412323,
"learning_rate": 0.0002045701184170462,
"loss": 0.423,
"step": 10560
},
{
"epoch": 1.330635765343074,
"grad_norm": 0.25224459171295166,
"learning_rate": 0.00020446771326970424,
"loss": 0.4037,
"step": 10565
},
{
"epoch": 1.3312655477532513,
"grad_norm": 0.3072243928909302,
"learning_rate": 0.00020436527887195607,
"loss": 0.4279,
"step": 10570
},
{
"epoch": 1.3318953301634284,
"grad_norm": 0.36949509382247925,
"learning_rate": 0.00020426281527881137,
"loss": 0.4259,
"step": 10575
},
{
"epoch": 1.332525112573606,
"grad_norm": 0.30465519428253174,
"learning_rate": 0.00020416032254529535,
"loss": 0.457,
"step": 10580
},
{
"epoch": 1.333154894983783,
"grad_norm": 0.2719140350818634,
"learning_rate": 0.00020405780072644896,
"loss": 0.3927,
"step": 10585
},
{
"epoch": 1.3337846773939603,
"grad_norm": 0.33556681871414185,
"learning_rate": 0.00020395524987732876,
"loss": 0.4341,
"step": 10590
},
{
"epoch": 1.3344144598041376,
"grad_norm": 0.3145639896392822,
"learning_rate": 0.0002038526700530069,
"loss": 0.4176,
"step": 10595
},
{
"epoch": 1.335044242214315,
"grad_norm": 0.31328147649765015,
"learning_rate": 0.00020375006130857111,
"loss": 0.4332,
"step": 10600
},
{
"epoch": 1.3356740246244923,
"grad_norm": 0.3016543388366699,
"learning_rate": 0.00020364742369912464,
"loss": 0.4173,
"step": 10605
},
{
"epoch": 1.3363038070346696,
"grad_norm": 0.31259703636169434,
"learning_rate": 0.0002035447572797862,
"loss": 0.4091,
"step": 10610
},
{
"epoch": 1.3369335894448469,
"grad_norm": 0.34624606370925903,
"learning_rate": 0.00020344206210569,
"loss": 0.4408,
"step": 10615
},
{
"epoch": 1.3375633718550242,
"grad_norm": 0.3144773542881012,
"learning_rate": 0.00020333933823198566,
"loss": 0.3863,
"step": 10620
},
{
"epoch": 1.3381931542652015,
"grad_norm": 0.3231208026409149,
"learning_rate": 0.00020323658571383833,
"loss": 0.4151,
"step": 10625
},
{
"epoch": 1.3388229366753785,
"grad_norm": 0.3022227883338928,
"learning_rate": 0.00020313380460642842,
"loss": 0.4108,
"step": 10630
},
{
"epoch": 1.339452719085556,
"grad_norm": 0.2899850606918335,
"learning_rate": 0.00020303099496495172,
"loss": 0.412,
"step": 10635
},
{
"epoch": 1.3400825014957332,
"grad_norm": 0.31005537509918213,
"learning_rate": 0.00020292815684461936,
"loss": 0.4114,
"step": 10640
},
{
"epoch": 1.3407122839059105,
"grad_norm": 0.29457420110702515,
"learning_rate": 0.00020282529030065784,
"loss": 0.4292,
"step": 10645
},
{
"epoch": 1.3413420663160878,
"grad_norm": 0.31712374091148376,
"learning_rate": 0.00020272239538830867,
"loss": 0.4029,
"step": 10650
},
{
"epoch": 1.341971848726265,
"grad_norm": 0.3228032886981964,
"learning_rate": 0.00020261947216282896,
"loss": 0.414,
"step": 10655
},
{
"epoch": 1.3426016311364424,
"grad_norm": 0.305351197719574,
"learning_rate": 0.00020251652067949068,
"loss": 0.4233,
"step": 10660
},
{
"epoch": 1.3432314135466197,
"grad_norm": 0.30317017436027527,
"learning_rate": 0.00020241354099358123,
"loss": 0.3816,
"step": 10665
},
{
"epoch": 1.343861195956797,
"grad_norm": 0.3036525845527649,
"learning_rate": 0.00020231053316040293,
"loss": 0.4115,
"step": 10670
},
{
"epoch": 1.3444909783669743,
"grad_norm": 0.33367687463760376,
"learning_rate": 0.00020220749723527353,
"loss": 0.449,
"step": 10675
},
{
"epoch": 1.3451207607771516,
"grad_norm": 0.28938767313957214,
"learning_rate": 0.00020210443327352553,
"loss": 0.3919,
"step": 10680
},
{
"epoch": 1.3457505431873287,
"grad_norm": 0.2946431338787079,
"learning_rate": 0.00020200134133050666,
"loss": 0.4043,
"step": 10685
},
{
"epoch": 1.3463803255975062,
"grad_norm": 0.31588709354400635,
"learning_rate": 0.00020189822146157962,
"loss": 0.4136,
"step": 10690
},
{
"epoch": 1.3470101080076833,
"grad_norm": 0.2830824851989746,
"learning_rate": 0.00020179507372212224,
"loss": 0.4164,
"step": 10695
},
{
"epoch": 1.3476398904178606,
"grad_norm": 0.31364426016807556,
"learning_rate": 0.0002016918981675271,
"loss": 0.4197,
"step": 10700
},
{
"epoch": 1.348269672828038,
"grad_norm": 0.32086437940597534,
"learning_rate": 0.00020158869485320194,
"loss": 0.4346,
"step": 10705
},
{
"epoch": 1.3488994552382152,
"grad_norm": 0.30549678206443787,
"learning_rate": 0.0002014854638345692,
"loss": 0.4134,
"step": 10710
},
{
"epoch": 1.3495292376483925,
"grad_norm": 0.2996455132961273,
"learning_rate": 0.00020138220516706634,
"loss": 0.3846,
"step": 10715
},
{
"epoch": 1.3501590200585698,
"grad_norm": 0.3013511002063751,
"learning_rate": 0.00020127891890614556,
"loss": 0.3994,
"step": 10720
},
{
"epoch": 1.350788802468747,
"grad_norm": 0.28055283427238464,
"learning_rate": 0.00020117560510727402,
"loss": 0.4163,
"step": 10725
},
{
"epoch": 1.3514185848789244,
"grad_norm": 0.3024522364139557,
"learning_rate": 0.00020107226382593357,
"loss": 0.4042,
"step": 10730
},
{
"epoch": 1.3520483672891017,
"grad_norm": 0.28080272674560547,
"learning_rate": 0.00020096889511762083,
"loss": 0.4176,
"step": 10735
},
{
"epoch": 1.3526781496992788,
"grad_norm": 0.3069353997707367,
"learning_rate": 0.00020086549903784715,
"loss": 0.4189,
"step": 10740
},
{
"epoch": 1.353307932109456,
"grad_norm": 0.2898117005825043,
"learning_rate": 0.00020076207564213866,
"loss": 0.4342,
"step": 10745
},
{
"epoch": 1.3539377145196334,
"grad_norm": 0.3365933299064636,
"learning_rate": 0.00020065862498603592,
"loss": 0.3944,
"step": 10750
},
{
"epoch": 1.3545674969298107,
"grad_norm": 0.29901427030563354,
"learning_rate": 0.00020055514712509446,
"loss": 0.4059,
"step": 10755
},
{
"epoch": 1.355197279339988,
"grad_norm": 0.2927230894565582,
"learning_rate": 0.00020045164211488417,
"loss": 0.4137,
"step": 10760
},
{
"epoch": 1.3558270617501653,
"grad_norm": 0.35867777466773987,
"learning_rate": 0.00020034811001098964,
"loss": 0.4108,
"step": 10765
},
{
"epoch": 1.3564568441603426,
"grad_norm": 0.2955409586429596,
"learning_rate": 0.00020024455086900994,
"loss": 0.4328,
"step": 10770
},
{
"epoch": 1.35708662657052,
"grad_norm": 0.29247814416885376,
"learning_rate": 0.00020014096474455873,
"loss": 0.4014,
"step": 10775
},
{
"epoch": 1.3577164089806972,
"grad_norm": 0.30858153104782104,
"learning_rate": 0.00020003735169326413,
"loss": 0.4112,
"step": 10780
},
{
"epoch": 1.3583461913908745,
"grad_norm": 0.4134693145751953,
"learning_rate": 0.0001999337117707687,
"loss": 0.4062,
"step": 10785
},
{
"epoch": 1.3589759738010518,
"grad_norm": 0.3120553195476532,
"learning_rate": 0.0001998300450327294,
"loss": 0.4049,
"step": 10790
},
{
"epoch": 1.359605756211229,
"grad_norm": 0.3146657645702362,
"learning_rate": 0.00019972635153481767,
"loss": 0.4029,
"step": 10795
},
{
"epoch": 1.3602355386214062,
"grad_norm": 0.2997225821018219,
"learning_rate": 0.00019962263133271933,
"loss": 0.3792,
"step": 10800
},
{
"epoch": 1.3608653210315835,
"grad_norm": 0.32136911153793335,
"learning_rate": 0.0001995188844821345,
"loss": 0.3987,
"step": 10805
},
{
"epoch": 1.3614951034417608,
"grad_norm": 0.30875489115715027,
"learning_rate": 0.0001994151110387775,
"loss": 0.4211,
"step": 10810
},
{
"epoch": 1.3621248858519381,
"grad_norm": 0.30939677357673645,
"learning_rate": 0.00019931131105837714,
"loss": 0.451,
"step": 10815
},
{
"epoch": 1.3627546682621154,
"grad_norm": 0.27874892950057983,
"learning_rate": 0.0001992074845966764,
"loss": 0.4102,
"step": 10820
},
{
"epoch": 1.3633844506722927,
"grad_norm": 0.28371527791023254,
"learning_rate": 0.00019910363170943233,
"loss": 0.4153,
"step": 10825
},
{
"epoch": 1.36401423308247,
"grad_norm": 0.2852970063686371,
"learning_rate": 0.00019899975245241643,
"loss": 0.409,
"step": 10830
},
{
"epoch": 1.3646440154926474,
"grad_norm": 0.300521582365036,
"learning_rate": 0.00019889584688141418,
"loss": 0.4032,
"step": 10835
},
{
"epoch": 1.3652737979028244,
"grad_norm": 0.30631181597709656,
"learning_rate": 0.00019879191505222526,
"loss": 0.4299,
"step": 10840
},
{
"epoch": 1.365903580313002,
"grad_norm": 0.3514620363712311,
"learning_rate": 0.00019868795702066342,
"loss": 0.4051,
"step": 10845
},
{
"epoch": 1.366533362723179,
"grad_norm": 0.27533403038978577,
"learning_rate": 0.00019858397284255657,
"loss": 0.4108,
"step": 10850
},
{
"epoch": 1.3671631451333564,
"grad_norm": 0.3143390119075775,
"learning_rate": 0.00019847996257374645,
"loss": 0.426,
"step": 10855
},
{
"epoch": 1.3677929275435337,
"grad_norm": 0.3388061821460724,
"learning_rate": 0.00019837592627008904,
"loss": 0.4163,
"step": 10860
},
{
"epoch": 1.368422709953711,
"grad_norm": 0.34078383445739746,
"learning_rate": 0.00019827186398745417,
"loss": 0.4015,
"step": 10865
},
{
"epoch": 1.3690524923638883,
"grad_norm": 0.33532068133354187,
"learning_rate": 0.00019816777578172582,
"loss": 0.4436,
"step": 10870
},
{
"epoch": 1.3696822747740656,
"grad_norm": 0.3230116069316864,
"learning_rate": 0.0001980636617088015,
"loss": 0.4239,
"step": 10875
},
{
"epoch": 1.3703120571842429,
"grad_norm": 0.31974872946739197,
"learning_rate": 0.00019795952182459297,
"loss": 0.4313,
"step": 10880
},
{
"epoch": 1.3709418395944202,
"grad_norm": 0.2825758159160614,
"learning_rate": 0.0001978553561850257,
"loss": 0.4045,
"step": 10885
},
{
"epoch": 1.3715716220045975,
"grad_norm": 0.2678980529308319,
"learning_rate": 0.00019775116484603908,
"loss": 0.3899,
"step": 10890
},
{
"epoch": 1.3722014044147746,
"grad_norm": 0.3492506146430969,
"learning_rate": 0.00019764694786358612,
"loss": 0.3807,
"step": 10895
},
{
"epoch": 1.372831186824952,
"grad_norm": 0.30808547139167786,
"learning_rate": 0.00019754270529363384,
"loss": 0.4163,
"step": 10900
},
{
"epoch": 1.3734609692351292,
"grad_norm": 0.30980342626571655,
"learning_rate": 0.0001974384371921628,
"loss": 0.3843,
"step": 10905
},
{
"epoch": 1.3740907516453065,
"grad_norm": 0.2915787100791931,
"learning_rate": 0.00019733414361516736,
"loss": 0.4208,
"step": 10910
},
{
"epoch": 1.3747205340554838,
"grad_norm": 0.30979228019714355,
"learning_rate": 0.00019722982461865555,
"loss": 0.4188,
"step": 10915
},
{
"epoch": 1.375350316465661,
"grad_norm": 0.28953999280929565,
"learning_rate": 0.00019712548025864918,
"loss": 0.3934,
"step": 10920
},
{
"epoch": 1.3759800988758384,
"grad_norm": 0.31495416164398193,
"learning_rate": 0.00019702111059118334,
"loss": 0.4117,
"step": 10925
},
{
"epoch": 1.3766098812860157,
"grad_norm": 0.38459569215774536,
"learning_rate": 0.00019691671567230714,
"loss": 0.4229,
"step": 10930
},
{
"epoch": 1.377239663696193,
"grad_norm": 0.31138870120048523,
"learning_rate": 0.00019681229555808285,
"loss": 0.4284,
"step": 10935
},
{
"epoch": 1.3778694461063703,
"grad_norm": 0.2761414051055908,
"learning_rate": 0.0001967078503045866,
"loss": 0.3838,
"step": 10940
},
{
"epoch": 1.3784992285165476,
"grad_norm": 0.31627506017684937,
"learning_rate": 0.00019660337996790772,
"loss": 0.4008,
"step": 10945
},
{
"epoch": 1.3791290109267247,
"grad_norm": 0.29025107622146606,
"learning_rate": 0.00019649888460414937,
"loss": 0.409,
"step": 10950
},
{
"epoch": 1.3797587933369022,
"grad_norm": 0.3379102349281311,
"learning_rate": 0.0001963943642694278,
"loss": 0.4213,
"step": 10955
},
{
"epoch": 1.3803885757470793,
"grad_norm": 0.3209204375743866,
"learning_rate": 0.00019628981901987285,
"loss": 0.3834,
"step": 10960
},
{
"epoch": 1.3810183581572566,
"grad_norm": 0.31717419624328613,
"learning_rate": 0.0001961852489116277,
"loss": 0.4499,
"step": 10965
},
{
"epoch": 1.381648140567434,
"grad_norm": 0.27936458587646484,
"learning_rate": 0.00019608065400084898,
"loss": 0.3987,
"step": 10970
},
{
"epoch": 1.3822779229776112,
"grad_norm": 0.28877684473991394,
"learning_rate": 0.00019597603434370637,
"loss": 0.4252,
"step": 10975
},
{
"epoch": 1.3829077053877885,
"grad_norm": 0.3423072397708893,
"learning_rate": 0.00019587138999638316,
"loss": 0.421,
"step": 10980
},
{
"epoch": 1.3835374877979658,
"grad_norm": 0.26486262679100037,
"learning_rate": 0.00019576672101507568,
"loss": 0.4104,
"step": 10985
},
{
"epoch": 1.3841672702081431,
"grad_norm": 0.2929472029209137,
"learning_rate": 0.00019566202745599365,
"loss": 0.4127,
"step": 10990
},
{
"epoch": 1.3847970526183204,
"grad_norm": 0.2696884870529175,
"learning_rate": 0.00019555730937535976,
"loss": 0.4067,
"step": 10995
},
{
"epoch": 1.3854268350284977,
"grad_norm": 0.32420167326927185,
"learning_rate": 0.0001954525668294102,
"loss": 0.4136,
"step": 11000
},
{
"epoch": 1.3854268350284977,
"eval_loss": 0.3039778470993042,
"eval_runtime": 6.1549,
"eval_samples_per_second": 162.472,
"eval_steps_per_second": 10.236,
"step": 11000
},
{
"epoch": 1.3860566174386748,
"grad_norm": 0.3149106502532959,
"learning_rate": 0.00019534779987439395,
"loss": 0.3954,
"step": 11005
},
{
"epoch": 1.3866863998488523,
"grad_norm": 0.332868367433548,
"learning_rate": 0.0001952430085665733,
"loss": 0.4178,
"step": 11010
},
{
"epoch": 1.3873161822590294,
"grad_norm": 0.285671591758728,
"learning_rate": 0.00019513819296222362,
"loss": 0.3788,
"step": 11015
},
{
"epoch": 1.3879459646692067,
"grad_norm": 0.3317325711250305,
"learning_rate": 0.0001950333531176332,
"loss": 0.4091,
"step": 11020
},
{
"epoch": 1.388575747079384,
"grad_norm": 0.27808326482772827,
"learning_rate": 0.00019492848908910356,
"loss": 0.4104,
"step": 11025
},
{
"epoch": 1.3892055294895613,
"grad_norm": 0.29725268483161926,
"learning_rate": 0.00019482360093294897,
"loss": 0.3981,
"step": 11030
},
{
"epoch": 1.3898353118997386,
"grad_norm": 0.24770186841487885,
"learning_rate": 0.0001947186887054968,
"loss": 0.4052,
"step": 11035
},
{
"epoch": 1.390465094309916,
"grad_norm": 0.31627580523490906,
"learning_rate": 0.00019461375246308734,
"loss": 0.4051,
"step": 11040
},
{
"epoch": 1.3910948767200932,
"grad_norm": 0.2721163332462311,
"learning_rate": 0.00019450879226207368,
"loss": 0.3962,
"step": 11045
},
{
"epoch": 1.3917246591302705,
"grad_norm": 0.31926798820495605,
"learning_rate": 0.00019440380815882187,
"loss": 0.3964,
"step": 11050
},
{
"epoch": 1.3923544415404479,
"grad_norm": 0.3047574460506439,
"learning_rate": 0.0001942988002097108,
"loss": 0.3818,
"step": 11055
},
{
"epoch": 1.392984223950625,
"grad_norm": 0.35394978523254395,
"learning_rate": 0.00019419376847113216,
"loss": 0.4398,
"step": 11060
},
{
"epoch": 1.3936140063608025,
"grad_norm": 0.2855307459831238,
"learning_rate": 0.00019408871299949037,
"loss": 0.4089,
"step": 11065
},
{
"epoch": 1.3942437887709795,
"grad_norm": 0.3066868484020233,
"learning_rate": 0.00019398363385120254,
"loss": 0.3987,
"step": 11070
},
{
"epoch": 1.3948735711811568,
"grad_norm": 0.312775194644928,
"learning_rate": 0.0001938785310826987,
"loss": 0.3794,
"step": 11075
},
{
"epoch": 1.3955033535913342,
"grad_norm": 0.3235652446746826,
"learning_rate": 0.00019377340475042136,
"loss": 0.3852,
"step": 11080
},
{
"epoch": 1.3961331360015115,
"grad_norm": 0.33732032775878906,
"learning_rate": 0.00019366825491082574,
"loss": 0.4003,
"step": 11085
},
{
"epoch": 1.3967629184116888,
"grad_norm": 0.33549800515174866,
"learning_rate": 0.00019356308162037976,
"loss": 0.3699,
"step": 11090
},
{
"epoch": 1.397392700821866,
"grad_norm": 0.3360839784145355,
"learning_rate": 0.00019345788493556394,
"loss": 0.394,
"step": 11095
},
{
"epoch": 1.3980224832320434,
"grad_norm": 0.3089699447154999,
"learning_rate": 0.00019335266491287112,
"loss": 0.4016,
"step": 11100
},
{
"epoch": 1.3986522656422207,
"grad_norm": 0.30863386392593384,
"learning_rate": 0.00019324742160880702,
"loss": 0.3973,
"step": 11105
},
{
"epoch": 1.399282048052398,
"grad_norm": 0.30803561210632324,
"learning_rate": 0.00019314215507988965,
"loss": 0.4119,
"step": 11110
},
{
"epoch": 1.399911830462575,
"grad_norm": 0.2869633138179779,
"learning_rate": 0.0001930368653826495,
"loss": 0.4098,
"step": 11115
},
{
"epoch": 1.4005416128727526,
"grad_norm": 0.25851666927337646,
"learning_rate": 0.00019293155257362957,
"loss": 0.4034,
"step": 11120
},
{
"epoch": 1.4011713952829297,
"grad_norm": 0.32763540744781494,
"learning_rate": 0.00019282621670938527,
"loss": 0.4121,
"step": 11125
},
{
"epoch": 1.401801177693107,
"grad_norm": 0.3531438410282135,
"learning_rate": 0.00019272085784648432,
"loss": 0.4021,
"step": 11130
},
{
"epoch": 1.4024309601032843,
"grad_norm": 0.27890294790267944,
"learning_rate": 0.00019261547604150687,
"loss": 0.3872,
"step": 11135
},
{
"epoch": 1.4030607425134616,
"grad_norm": 0.26616647839546204,
"learning_rate": 0.00019251007135104534,
"loss": 0.4293,
"step": 11140
},
{
"epoch": 1.4036905249236389,
"grad_norm": 0.3214140236377716,
"learning_rate": 0.0001924046438317045,
"loss": 0.3974,
"step": 11145
},
{
"epoch": 1.4043203073338162,
"grad_norm": 0.31075042486190796,
"learning_rate": 0.00019229919354010126,
"loss": 0.3978,
"step": 11150
},
{
"epoch": 1.4049500897439935,
"grad_norm": 0.31546491384506226,
"learning_rate": 0.00019219372053286485,
"loss": 0.3937,
"step": 11155
},
{
"epoch": 1.4055798721541708,
"grad_norm": 0.33116820454597473,
"learning_rate": 0.00019208822486663677,
"loss": 0.3779,
"step": 11160
},
{
"epoch": 1.406209654564348,
"grad_norm": 0.30159297585487366,
"learning_rate": 0.0001919827065980705,
"loss": 0.3822,
"step": 11165
},
{
"epoch": 1.4068394369745252,
"grad_norm": 0.29656147956848145,
"learning_rate": 0.00019187716578383178,
"loss": 0.4047,
"step": 11170
},
{
"epoch": 1.4074692193847027,
"grad_norm": 0.3193992078304291,
"learning_rate": 0.0001917716024805985,
"loss": 0.4088,
"step": 11175
},
{
"epoch": 1.4080990017948798,
"grad_norm": 0.29688236117362976,
"learning_rate": 0.0001916660167450605,
"loss": 0.3693,
"step": 11180
},
{
"epoch": 1.408728784205057,
"grad_norm": 0.33146485686302185,
"learning_rate": 0.00019156040863391977,
"loss": 0.3865,
"step": 11185
},
{
"epoch": 1.4093585666152344,
"grad_norm": 0.3015727698802948,
"learning_rate": 0.00019145477820389027,
"loss": 0.3857,
"step": 11190
},
{
"epoch": 1.4099883490254117,
"grad_norm": 0.27797931432724,
"learning_rate": 0.00019134912551169796,
"loss": 0.4148,
"step": 11195
},
{
"epoch": 1.410618131435589,
"grad_norm": 0.30010297894477844,
"learning_rate": 0.00019124345061408067,
"loss": 0.4076,
"step": 11200
},
{
"epoch": 1.4112479138457663,
"grad_norm": 0.29101455211639404,
"learning_rate": 0.00019113775356778833,
"loss": 0.3802,
"step": 11205
},
{
"epoch": 1.4118776962559436,
"grad_norm": 0.29706794023513794,
"learning_rate": 0.00019103203442958266,
"loss": 0.3867,
"step": 11210
},
{
"epoch": 1.412507478666121,
"grad_norm": 0.2546458840370178,
"learning_rate": 0.00019092629325623723,
"loss": 0.3964,
"step": 11215
},
{
"epoch": 1.4131372610762982,
"grad_norm": 0.3409089148044586,
"learning_rate": 0.0001908205301045375,
"loss": 0.4171,
"step": 11220
},
{
"epoch": 1.4137670434864753,
"grad_norm": 0.27688878774642944,
"learning_rate": 0.00019071474503128057,
"loss": 0.405,
"step": 11225
},
{
"epoch": 1.4143968258966528,
"grad_norm": 0.30704399943351746,
"learning_rate": 0.00019060893809327563,
"loss": 0.4024,
"step": 11230
},
{
"epoch": 1.41502660830683,
"grad_norm": 0.2823016941547394,
"learning_rate": 0.00019050310934734326,
"loss": 0.3908,
"step": 11235
},
{
"epoch": 1.4156563907170072,
"grad_norm": 0.3309246897697449,
"learning_rate": 0.000190397258850316,
"loss": 0.4049,
"step": 11240
},
{
"epoch": 1.4162861731271845,
"grad_norm": 0.2959790527820587,
"learning_rate": 0.00019029138665903794,
"loss": 0.4031,
"step": 11245
},
{
"epoch": 1.4169159555373618,
"grad_norm": 0.29836803674697876,
"learning_rate": 0.00019018549283036497,
"loss": 0.4103,
"step": 11250
},
{
"epoch": 1.4175457379475391,
"grad_norm": 0.3187415301799774,
"learning_rate": 0.00019007957742116433,
"loss": 0.4055,
"step": 11255
},
{
"epoch": 1.4181755203577164,
"grad_norm": 0.3521386981010437,
"learning_rate": 0.00018997364048831515,
"loss": 0.3839,
"step": 11260
},
{
"epoch": 1.4188053027678937,
"grad_norm": 0.3985449969768524,
"learning_rate": 0.00018986768208870792,
"loss": 0.4058,
"step": 11265
},
{
"epoch": 1.419435085178071,
"grad_norm": 0.30885374546051025,
"learning_rate": 0.00018976170227924473,
"loss": 0.394,
"step": 11270
},
{
"epoch": 1.4200648675882483,
"grad_norm": 0.2981209456920624,
"learning_rate": 0.00018965570111683917,
"loss": 0.3917,
"step": 11275
},
{
"epoch": 1.4206946499984254,
"grad_norm": 0.2993827164173126,
"learning_rate": 0.00018954967865841629,
"loss": 0.4016,
"step": 11280
},
{
"epoch": 1.421324432408603,
"grad_norm": 0.283632755279541,
"learning_rate": 0.00018944363496091254,
"loss": 0.3873,
"step": 11285
},
{
"epoch": 1.42195421481878,
"grad_norm": 0.2871907353401184,
"learning_rate": 0.0001893375700812758,
"loss": 0.4136,
"step": 11290
},
{
"epoch": 1.4225839972289573,
"grad_norm": 0.3341853618621826,
"learning_rate": 0.00018923148407646537,
"loss": 0.409,
"step": 11295
},
{
"epoch": 1.4232137796391346,
"grad_norm": 0.32463696599006653,
"learning_rate": 0.00018912537700345192,
"loss": 0.3912,
"step": 11300
},
{
"epoch": 1.423843562049312,
"grad_norm": 0.33242395520210266,
"learning_rate": 0.00018901924891921726,
"loss": 0.4158,
"step": 11305
},
{
"epoch": 1.4244733444594893,
"grad_norm": 0.301289439201355,
"learning_rate": 0.00018891309988075463,
"loss": 0.4012,
"step": 11310
},
{
"epoch": 1.4251031268696666,
"grad_norm": 0.28636494278907776,
"learning_rate": 0.00018880692994506845,
"loss": 0.3817,
"step": 11315
},
{
"epoch": 1.4257329092798439,
"grad_norm": 0.2837861478328705,
"learning_rate": 0.00018870073916917455,
"loss": 0.4116,
"step": 11320
},
{
"epoch": 1.4263626916900212,
"grad_norm": 0.31169527769088745,
"learning_rate": 0.0001885945276100996,
"loss": 0.3967,
"step": 11325
},
{
"epoch": 1.4269924741001985,
"grad_norm": 0.31035301089286804,
"learning_rate": 0.00018848829532488177,
"loss": 0.407,
"step": 11330
},
{
"epoch": 1.4276222565103756,
"grad_norm": 0.3047008812427521,
"learning_rate": 0.00018838204237057023,
"loss": 0.3939,
"step": 11335
},
{
"epoch": 1.428252038920553,
"grad_norm": 0.2646077871322632,
"learning_rate": 0.00018827576880422515,
"loss": 0.3881,
"step": 11340
},
{
"epoch": 1.4288818213307302,
"grad_norm": 0.31041520833969116,
"learning_rate": 0.00018816947468291788,
"loss": 0.3822,
"step": 11345
},
{
"epoch": 1.4295116037409075,
"grad_norm": 0.2699204385280609,
"learning_rate": 0.00018806316006373086,
"loss": 0.3895,
"step": 11350
},
{
"epoch": 1.4301413861510848,
"grad_norm": 0.285363107919693,
"learning_rate": 0.00018795682500375742,
"loss": 0.4027,
"step": 11355
},
{
"epoch": 1.430771168561262,
"grad_norm": 0.27154308557510376,
"learning_rate": 0.00018785046956010194,
"loss": 0.3815,
"step": 11360
},
{
"epoch": 1.4314009509714394,
"grad_norm": 0.29652640223503113,
"learning_rate": 0.00018774409378987972,
"loss": 0.4003,
"step": 11365
},
{
"epoch": 1.4320307333816167,
"grad_norm": 0.2921524941921234,
"learning_rate": 0.00018763769775021695,
"loss": 0.3828,
"step": 11370
},
{
"epoch": 1.432660515791794,
"grad_norm": 0.26934945583343506,
"learning_rate": 0.00018753128149825074,
"loss": 0.3999,
"step": 11375
},
{
"epoch": 1.4332902982019713,
"grad_norm": 0.29320502281188965,
"learning_rate": 0.00018742484509112907,
"loss": 0.4034,
"step": 11380
},
{
"epoch": 1.4339200806121486,
"grad_norm": 0.2842418849468231,
"learning_rate": 0.00018731838858601074,
"loss": 0.3877,
"step": 11385
},
{
"epoch": 1.4345498630223257,
"grad_norm": 0.31208139657974243,
"learning_rate": 0.00018721191204006525,
"loss": 0.3731,
"step": 11390
},
{
"epoch": 1.4351796454325032,
"grad_norm": 0.2809062600135803,
"learning_rate": 0.00018710541551047303,
"loss": 0.3939,
"step": 11395
},
{
"epoch": 1.4358094278426803,
"grad_norm": 0.308969646692276,
"learning_rate": 0.00018699889905442508,
"loss": 0.3874,
"step": 11400
},
{
"epoch": 1.4364392102528576,
"grad_norm": 0.3051275610923767,
"learning_rate": 0.00018689236272912316,
"loss": 0.3676,
"step": 11405
},
{
"epoch": 1.437068992663035,
"grad_norm": 0.31084486842155457,
"learning_rate": 0.0001867858065917798,
"loss": 0.3954,
"step": 11410
},
{
"epoch": 1.4376987750732122,
"grad_norm": 0.28356167674064636,
"learning_rate": 0.000186679230699618,
"loss": 0.3701,
"step": 11415
},
{
"epoch": 1.4383285574833895,
"grad_norm": 0.3026244044303894,
"learning_rate": 0.0001865726351098715,
"loss": 0.3797,
"step": 11420
},
{
"epoch": 1.4389583398935668,
"grad_norm": 0.2909928560256958,
"learning_rate": 0.00018646601987978452,
"loss": 0.4022,
"step": 11425
},
{
"epoch": 1.439588122303744,
"grad_norm": 0.3085511326789856,
"learning_rate": 0.00018635938506661183,
"loss": 0.4099,
"step": 11430
},
{
"epoch": 1.4402179047139214,
"grad_norm": 0.28047701716423035,
"learning_rate": 0.0001862527307276189,
"loss": 0.3789,
"step": 11435
},
{
"epoch": 1.4408476871240987,
"grad_norm": 0.2697209119796753,
"learning_rate": 0.00018614605692008146,
"loss": 0.3864,
"step": 11440
},
{
"epoch": 1.4414774695342758,
"grad_norm": 0.40744665265083313,
"learning_rate": 0.0001860393637012858,
"loss": 0.4085,
"step": 11445
},
{
"epoch": 1.4421072519444533,
"grad_norm": 0.25875118374824524,
"learning_rate": 0.00018593265112852854,
"loss": 0.4033,
"step": 11450
},
{
"epoch": 1.4427370343546304,
"grad_norm": 0.2960642874240875,
"learning_rate": 0.00018582591925911694,
"loss": 0.4214,
"step": 11455
},
{
"epoch": 1.4433668167648077,
"grad_norm": 0.2711925506591797,
"learning_rate": 0.00018571916815036824,
"loss": 0.3537,
"step": 11460
},
{
"epoch": 1.443996599174985,
"grad_norm": 0.28002485632896423,
"learning_rate": 0.0001856123978596104,
"loss": 0.3787,
"step": 11465
},
{
"epoch": 1.4446263815851623,
"grad_norm": 0.3143458366394043,
"learning_rate": 0.00018550560844418138,
"loss": 0.3553,
"step": 11470
},
{
"epoch": 1.4452561639953396,
"grad_norm": 0.3184334337711334,
"learning_rate": 0.00018539879996142962,
"loss": 0.385,
"step": 11475
},
{
"epoch": 1.445885946405517,
"grad_norm": 0.3327188789844513,
"learning_rate": 0.00018529197246871368,
"loss": 0.4074,
"step": 11480
},
{
"epoch": 1.4465157288156942,
"grad_norm": 0.317942351102829,
"learning_rate": 0.0001851851260234024,
"loss": 0.3995,
"step": 11485
},
{
"epoch": 1.4471455112258715,
"grad_norm": 0.2567351758480072,
"learning_rate": 0.00018507826068287473,
"loss": 0.3661,
"step": 11490
},
{
"epoch": 1.4477752936360488,
"grad_norm": 0.29439592361450195,
"learning_rate": 0.0001849713765045198,
"loss": 0.3759,
"step": 11495
},
{
"epoch": 1.448405076046226,
"grad_norm": 0.3125048279762268,
"learning_rate": 0.0001848644735457368,
"loss": 0.4107,
"step": 11500
},
{
"epoch": 1.4490348584564032,
"grad_norm": 0.2855313718318939,
"learning_rate": 0.00018475755186393516,
"loss": 0.4061,
"step": 11505
},
{
"epoch": 1.4496646408665805,
"grad_norm": 0.3040854036808014,
"learning_rate": 0.00018465061151653423,
"loss": 0.3902,
"step": 11510
},
{
"epoch": 1.4502944232767578,
"grad_norm": 0.28425633907318115,
"learning_rate": 0.0001845436525609634,
"loss": 0.3861,
"step": 11515
},
{
"epoch": 1.4509242056869351,
"grad_norm": 0.31335607171058655,
"learning_rate": 0.00018443667505466205,
"loss": 0.3949,
"step": 11520
},
{
"epoch": 1.4515539880971124,
"grad_norm": 0.2725260555744171,
"learning_rate": 0.00018432967905507967,
"loss": 0.3979,
"step": 11525
},
{
"epoch": 1.4521837705072898,
"grad_norm": 0.2674049437046051,
"learning_rate": 0.00018422266461967537,
"loss": 0.3747,
"step": 11530
},
{
"epoch": 1.452813552917467,
"grad_norm": 0.3076520562171936,
"learning_rate": 0.0001841156318059185,
"loss": 0.385,
"step": 11535
},
{
"epoch": 1.4534433353276444,
"grad_norm": 0.23340527713298798,
"learning_rate": 0.00018400858067128806,
"loss": 0.3736,
"step": 11540
},
{
"epoch": 1.4540731177378217,
"grad_norm": 0.29402169585227966,
"learning_rate": 0.00018390151127327295,
"loss": 0.3994,
"step": 11545
},
{
"epoch": 1.454702900147999,
"grad_norm": 0.32409217953681946,
"learning_rate": 0.00018379442366937187,
"loss": 0.3979,
"step": 11550
},
{
"epoch": 1.455332682558176,
"grad_norm": 0.28875911235809326,
"learning_rate": 0.00018368731791709337,
"loss": 0.365,
"step": 11555
},
{
"epoch": 1.4559624649683534,
"grad_norm": 0.26838234066963196,
"learning_rate": 0.0001835801940739556,
"loss": 0.3912,
"step": 11560
},
{
"epoch": 1.4565922473785307,
"grad_norm": 0.31797516345977783,
"learning_rate": 0.00018347305219748665,
"loss": 0.3622,
"step": 11565
},
{
"epoch": 1.457222029788708,
"grad_norm": 0.31115812063217163,
"learning_rate": 0.00018336589234522398,
"loss": 0.4283,
"step": 11570
},
{
"epoch": 1.4578518121988853,
"grad_norm": 0.2730168402194977,
"learning_rate": 0.00018325871457471496,
"loss": 0.3864,
"step": 11575
},
{
"epoch": 1.4584815946090626,
"grad_norm": 0.28333088755607605,
"learning_rate": 0.00018315151894351657,
"loss": 0.3451,
"step": 11580
},
{
"epoch": 1.4591113770192399,
"grad_norm": 0.3169468343257904,
"learning_rate": 0.00018304430550919522,
"loss": 0.3719,
"step": 11585
},
{
"epoch": 1.4597411594294172,
"grad_norm": 0.3411467969417572,
"learning_rate": 0.000182937074329327,
"loss": 0.4073,
"step": 11590
},
{
"epoch": 1.4603709418395945,
"grad_norm": 0.3131183385848999,
"learning_rate": 0.0001828298254614975,
"loss": 0.4117,
"step": 11595
},
{
"epoch": 1.4610007242497716,
"grad_norm": 0.25929832458496094,
"learning_rate": 0.0001827225589633018,
"loss": 0.3834,
"step": 11600
},
{
"epoch": 1.461630506659949,
"grad_norm": 0.32609832286834717,
"learning_rate": 0.00018261527489234444,
"loss": 0.3972,
"step": 11605
},
{
"epoch": 1.4622602890701262,
"grad_norm": 0.3089287579059601,
"learning_rate": 0.00018250797330623953,
"loss": 0.3727,
"step": 11610
},
{
"epoch": 1.4628900714803035,
"grad_norm": 0.2891997992992401,
"learning_rate": 0.00018240065426261033,
"loss": 0.3891,
"step": 11615
},
{
"epoch": 1.4635198538904808,
"grad_norm": 0.3119528293609619,
"learning_rate": 0.00018229331781908971,
"loss": 0.388,
"step": 11620
},
{
"epoch": 1.464149636300658,
"grad_norm": 0.3314844071865082,
"learning_rate": 0.00018218596403331977,
"loss": 0.3803,
"step": 11625
},
{
"epoch": 1.4647794187108354,
"grad_norm": 0.27267536520957947,
"learning_rate": 0.00018207859296295197,
"loss": 0.3665,
"step": 11630
},
{
"epoch": 1.4654092011210127,
"grad_norm": 0.30490440130233765,
"learning_rate": 0.00018197120466564693,
"loss": 0.4051,
"step": 11635
},
{
"epoch": 1.46603898353119,
"grad_norm": 0.3182273209095001,
"learning_rate": 0.00018186379919907472,
"loss": 0.38,
"step": 11640
},
{
"epoch": 1.4666687659413673,
"grad_norm": 0.3026832044124603,
"learning_rate": 0.00018175637662091448,
"loss": 0.3371,
"step": 11645
},
{
"epoch": 1.4672985483515446,
"grad_norm": 0.3287534713745117,
"learning_rate": 0.0001816489369888546,
"loss": 0.4234,
"step": 11650
},
{
"epoch": 1.4679283307617217,
"grad_norm": 0.28076720237731934,
"learning_rate": 0.00018154148036059263,
"loss": 0.3825,
"step": 11655
},
{
"epoch": 1.4685581131718992,
"grad_norm": 0.304766446352005,
"learning_rate": 0.0001814340067938352,
"loss": 0.3905,
"step": 11660
},
{
"epoch": 1.4691878955820763,
"grad_norm": 0.30473533272743225,
"learning_rate": 0.00018132651634629812,
"loss": 0.409,
"step": 11665
},
{
"epoch": 1.4698176779922536,
"grad_norm": 0.32186418771743774,
"learning_rate": 0.00018121900907570618,
"loss": 0.3741,
"step": 11670
},
{
"epoch": 1.470447460402431,
"grad_norm": 0.33314061164855957,
"learning_rate": 0.00018111148503979326,
"loss": 0.3981,
"step": 11675
},
{
"epoch": 1.4710772428126082,
"grad_norm": 0.3202495872974396,
"learning_rate": 0.00018100394429630223,
"loss": 0.4014,
"step": 11680
},
{
"epoch": 1.4717070252227855,
"grad_norm": 0.2801063656806946,
"learning_rate": 0.00018089638690298488,
"loss": 0.3827,
"step": 11685
},
{
"epoch": 1.4723368076329628,
"grad_norm": 0.3252180516719818,
"learning_rate": 0.000180788812917602,
"loss": 0.4207,
"step": 11690
},
{
"epoch": 1.4729665900431401,
"grad_norm": 0.279823899269104,
"learning_rate": 0.0001806812223979233,
"loss": 0.4092,
"step": 11695
},
{
"epoch": 1.4735963724533174,
"grad_norm": 0.29136526584625244,
"learning_rate": 0.00018057361540172733,
"loss": 0.3939,
"step": 11700
},
{
"epoch": 1.4742261548634947,
"grad_norm": 0.2708832621574402,
"learning_rate": 0.00018046599198680153,
"loss": 0.3645,
"step": 11705
},
{
"epoch": 1.4748559372736718,
"grad_norm": 0.34708496928215027,
"learning_rate": 0.00018035835221094214,
"loss": 0.3814,
"step": 11710
},
{
"epoch": 1.4754857196838493,
"grad_norm": 0.3081948161125183,
"learning_rate": 0.00018025069613195413,
"loss": 0.3738,
"step": 11715
},
{
"epoch": 1.4761155020940264,
"grad_norm": 0.26891911029815674,
"learning_rate": 0.0001801430238076513,
"loss": 0.3724,
"step": 11720
},
{
"epoch": 1.4767452845042037,
"grad_norm": 0.3266797363758087,
"learning_rate": 0.00018003533529585612,
"loss": 0.3749,
"step": 11725
},
{
"epoch": 1.477375066914381,
"grad_norm": 0.25788089632987976,
"learning_rate": 0.00017992763065439982,
"loss": 0.3661,
"step": 11730
},
{
"epoch": 1.4780048493245583,
"grad_norm": 0.301270067691803,
"learning_rate": 0.00017981990994112227,
"loss": 0.3832,
"step": 11735
},
{
"epoch": 1.4786346317347356,
"grad_norm": 0.2785583734512329,
"learning_rate": 0.0001797121732138719,
"loss": 0.357,
"step": 11740
},
{
"epoch": 1.479264414144913,
"grad_norm": 0.3153518736362457,
"learning_rate": 0.00017960442053050583,
"loss": 0.3964,
"step": 11745
},
{
"epoch": 1.4798941965550902,
"grad_norm": 0.2862750291824341,
"learning_rate": 0.00017949665194888972,
"loss": 0.3781,
"step": 11750
},
{
"epoch": 1.4805239789652676,
"grad_norm": 0.31263992190361023,
"learning_rate": 0.00017938886752689765,
"loss": 0.3822,
"step": 11755
},
{
"epoch": 1.4811537613754449,
"grad_norm": 0.31964340806007385,
"learning_rate": 0.00017928106732241248,
"loss": 0.3757,
"step": 11760
},
{
"epoch": 1.481783543785622,
"grad_norm": 0.29111340641975403,
"learning_rate": 0.0001791732513933253,
"loss": 0.362,
"step": 11765
},
{
"epoch": 1.4824133261957995,
"grad_norm": 0.32248637080192566,
"learning_rate": 0.00017906541979753572,
"loss": 0.3978,
"step": 11770
},
{
"epoch": 1.4830431086059765,
"grad_norm": 0.2964222729206085,
"learning_rate": 0.0001789575725929518,
"loss": 0.3853,
"step": 11775
},
{
"epoch": 1.4836728910161538,
"grad_norm": 0.32823482155799866,
"learning_rate": 0.0001788497098374899,
"loss": 0.3828,
"step": 11780
},
{
"epoch": 1.4843026734263312,
"grad_norm": 0.30054226517677307,
"learning_rate": 0.0001787418315890748,
"loss": 0.38,
"step": 11785
},
{
"epoch": 1.4849324558365085,
"grad_norm": 0.30829596519470215,
"learning_rate": 0.0001786339379056397,
"loss": 0.3645,
"step": 11790
},
{
"epoch": 1.4855622382466858,
"grad_norm": 0.3095497786998749,
"learning_rate": 0.00017852602884512584,
"loss": 0.3727,
"step": 11795
},
{
"epoch": 1.486192020656863,
"grad_norm": 0.29647621512413025,
"learning_rate": 0.00017841810446548283,
"loss": 0.3764,
"step": 11800
},
{
"epoch": 1.4868218030670404,
"grad_norm": 0.3227784037590027,
"learning_rate": 0.00017831016482466864,
"loss": 0.3797,
"step": 11805
},
{
"epoch": 1.4874515854772177,
"grad_norm": 0.32365646958351135,
"learning_rate": 0.00017820220998064927,
"loss": 0.3766,
"step": 11810
},
{
"epoch": 1.488081367887395,
"grad_norm": 0.36090198159217834,
"learning_rate": 0.0001780942399913989,
"loss": 0.4015,
"step": 11815
},
{
"epoch": 1.488711150297572,
"grad_norm": 0.28814610838890076,
"learning_rate": 0.00017798625491489994,
"loss": 0.3616,
"step": 11820
},
{
"epoch": 1.4893409327077496,
"grad_norm": 0.2654825747013092,
"learning_rate": 0.00017787825480914283,
"loss": 0.3462,
"step": 11825
},
{
"epoch": 1.4899707151179267,
"grad_norm": 0.2913071811199188,
"learning_rate": 0.000177770239732126,
"loss": 0.3707,
"step": 11830
},
{
"epoch": 1.490600497528104,
"grad_norm": 0.33099865913391113,
"learning_rate": 0.0001776622097418562,
"loss": 0.3644,
"step": 11835
},
{
"epoch": 1.4912302799382813,
"grad_norm": 0.2980974018573761,
"learning_rate": 0.0001775541648963478,
"loss": 0.3839,
"step": 11840
},
{
"epoch": 1.4918600623484586,
"grad_norm": 0.2673074007034302,
"learning_rate": 0.00017744610525362352,
"loss": 0.3736,
"step": 11845
},
{
"epoch": 1.4924898447586359,
"grad_norm": 0.26277023553848267,
"learning_rate": 0.00017733803087171372,
"loss": 0.3463,
"step": 11850
},
{
"epoch": 1.4931196271688132,
"grad_norm": 0.27924680709838867,
"learning_rate": 0.00017722994180865696,
"loss": 0.4095,
"step": 11855
},
{
"epoch": 1.4937494095789905,
"grad_norm": 0.2761695086956024,
"learning_rate": 0.00017712183812249938,
"loss": 0.3748,
"step": 11860
},
{
"epoch": 1.4943791919891678,
"grad_norm": 0.312854528427124,
"learning_rate": 0.00017701371987129523,
"loss": 0.3748,
"step": 11865
},
{
"epoch": 1.495008974399345,
"grad_norm": 0.3033592998981476,
"learning_rate": 0.00017690558711310644,
"loss": 0.3728,
"step": 11870
},
{
"epoch": 1.4956387568095222,
"grad_norm": 0.2711508572101593,
"learning_rate": 0.00017679743990600281,
"loss": 0.3748,
"step": 11875
},
{
"epoch": 1.4962685392196997,
"grad_norm": 0.28003159165382385,
"learning_rate": 0.00017668927830806177,
"loss": 0.3658,
"step": 11880
},
{
"epoch": 1.4968983216298768,
"grad_norm": 0.2750314772129059,
"learning_rate": 0.0001765811023773687,
"loss": 0.3705,
"step": 11885
},
{
"epoch": 1.497528104040054,
"grad_norm": 0.31037452816963196,
"learning_rate": 0.00017647291217201644,
"loss": 0.3718,
"step": 11890
},
{
"epoch": 1.4981578864502314,
"grad_norm": 0.33681520819664,
"learning_rate": 0.00017636470775010563,
"loss": 0.37,
"step": 11895
},
{
"epoch": 1.4987876688604087,
"grad_norm": 0.2735719084739685,
"learning_rate": 0.00017625648916974452,
"loss": 0.3898,
"step": 11900
},
{
"epoch": 1.499417451270586,
"grad_norm": 0.2873845398426056,
"learning_rate": 0.00017614825648904902,
"loss": 0.387,
"step": 11905
},
{
"epoch": 1.5000472336807633,
"grad_norm": 0.2826070189476013,
"learning_rate": 0.00017604000976614243,
"loss": 0.3656,
"step": 11910
},
{
"epoch": 1.5006770160909406,
"grad_norm": 0.2709527015686035,
"learning_rate": 0.00017593174905915581,
"loss": 0.3583,
"step": 11915
},
{
"epoch": 1.5013067985011177,
"grad_norm": 0.3088144063949585,
"learning_rate": 0.00017582347442622755,
"loss": 0.3715,
"step": 11920
},
{
"epoch": 1.5019365809112952,
"grad_norm": 0.27996301651000977,
"learning_rate": 0.0001757151859255038,
"loss": 0.3636,
"step": 11925
},
{
"epoch": 1.5025663633214723,
"grad_norm": 0.3117114007472992,
"learning_rate": 0.00017560688361513766,
"loss": 0.351,
"step": 11930
},
{
"epoch": 1.5031961457316498,
"grad_norm": 0.32614433765411377,
"learning_rate": 0.00017549856755329012,
"loss": 0.3711,
"step": 11935
},
{
"epoch": 1.503825928141827,
"grad_norm": 0.23831017315387726,
"learning_rate": 0.0001753902377981294,
"loss": 0.3645,
"step": 11940
},
{
"epoch": 1.5044557105520044,
"grad_norm": 0.27338019013404846,
"learning_rate": 0.000175281894407831,
"loss": 0.3606,
"step": 11945
},
{
"epoch": 1.5050854929621815,
"grad_norm": 0.2813990116119385,
"learning_rate": 0.0001751735374405778,
"loss": 0.3637,
"step": 11950
},
{
"epoch": 1.5057152753723588,
"grad_norm": 0.2607782781124115,
"learning_rate": 0.00017506516695455992,
"loss": 0.3493,
"step": 11955
},
{
"epoch": 1.5063450577825361,
"grad_norm": 0.2825680077075958,
"learning_rate": 0.0001749567830079749,
"loss": 0.3474,
"step": 11960
},
{
"epoch": 1.5069748401927134,
"grad_norm": 0.2957023084163666,
"learning_rate": 0.00017484838565902735,
"loss": 0.3852,
"step": 11965
},
{
"epoch": 1.5076046226028907,
"grad_norm": 0.31363338232040405,
"learning_rate": 0.00017473997496592904,
"loss": 0.3944,
"step": 11970
},
{
"epoch": 1.5082344050130678,
"grad_norm": 0.271010160446167,
"learning_rate": 0.00017463155098689908,
"loss": 0.3667,
"step": 11975
},
{
"epoch": 1.5088641874232454,
"grad_norm": 0.28360188007354736,
"learning_rate": 0.00017452311378016362,
"loss": 0.3564,
"step": 11980
},
{
"epoch": 1.5094939698334224,
"grad_norm": 0.28345590829849243,
"learning_rate": 0.00017441466340395583,
"loss": 0.358,
"step": 11985
},
{
"epoch": 1.5101237522436,
"grad_norm": 0.23574601113796234,
"learning_rate": 0.00017430619991651614,
"loss": 0.3588,
"step": 11990
},
{
"epoch": 1.510753534653777,
"grad_norm": 0.32633906602859497,
"learning_rate": 0.0001741977233760919,
"loss": 0.3786,
"step": 11995
},
{
"epoch": 1.5113833170639546,
"grad_norm": 0.31216609477996826,
"learning_rate": 0.00017408923384093746,
"loss": 0.3949,
"step": 12000
},
{
"epoch": 1.5113833170639546,
"eval_loss": 0.3009350597858429,
"eval_runtime": 6.1573,
"eval_samples_per_second": 162.409,
"eval_steps_per_second": 10.232,
"step": 12000
},
{
"epoch": 1.5120130994741316,
"grad_norm": 0.2735341191291809,
"learning_rate": 0.00017398073136931416,
"loss": 0.3667,
"step": 12005
},
{
"epoch": 1.512642881884309,
"grad_norm": 0.3168368637561798,
"learning_rate": 0.0001738722160194904,
"loss": 0.3693,
"step": 12010
},
{
"epoch": 1.5132726642944863,
"grad_norm": 0.27563655376434326,
"learning_rate": 0.0001737636878497413,
"loss": 0.3721,
"step": 12015
},
{
"epoch": 1.5139024467046636,
"grad_norm": 0.27887552976608276,
"learning_rate": 0.00017365514691834898,
"loss": 0.402,
"step": 12020
},
{
"epoch": 1.5145322291148409,
"grad_norm": 0.30676189064979553,
"learning_rate": 0.0001735465932836024,
"loss": 0.3875,
"step": 12025
},
{
"epoch": 1.515162011525018,
"grad_norm": 0.30623871088027954,
"learning_rate": 0.00017343802700379746,
"loss": 0.3644,
"step": 12030
},
{
"epoch": 1.5157917939351955,
"grad_norm": 0.2534305453300476,
"learning_rate": 0.00017332944813723658,
"loss": 0.3753,
"step": 12035
},
{
"epoch": 1.5164215763453726,
"grad_norm": 0.29374125599861145,
"learning_rate": 0.00017322085674222916,
"loss": 0.3964,
"step": 12040
},
{
"epoch": 1.51705135875555,
"grad_norm": 0.2833009362220764,
"learning_rate": 0.00017311225287709126,
"loss": 0.3778,
"step": 12045
},
{
"epoch": 1.5176811411657272,
"grad_norm": 0.273299902677536,
"learning_rate": 0.0001730036366001456,
"loss": 0.3661,
"step": 12050
},
{
"epoch": 1.5183109235759045,
"grad_norm": 0.32840752601623535,
"learning_rate": 0.00017289500796972165,
"loss": 0.3564,
"step": 12055
},
{
"epoch": 1.5189407059860818,
"grad_norm": 0.289202481508255,
"learning_rate": 0.00017278636704415545,
"loss": 0.3885,
"step": 12060
},
{
"epoch": 1.519570488396259,
"grad_norm": 0.28327277302742004,
"learning_rate": 0.0001726777138817896,
"loss": 0.376,
"step": 12065
},
{
"epoch": 1.5202002708064364,
"grad_norm": 0.2617267370223999,
"learning_rate": 0.00017256904854097343,
"loss": 0.353,
"step": 12070
},
{
"epoch": 1.5208300532166137,
"grad_norm": 0.2693130671977997,
"learning_rate": 0.00017246037108006266,
"loss": 0.386,
"step": 12075
},
{
"epoch": 1.521459835626791,
"grad_norm": 0.260217547416687,
"learning_rate": 0.00017235168155741956,
"loss": 0.3773,
"step": 12080
},
{
"epoch": 1.522089618036968,
"grad_norm": 0.2806963622570038,
"learning_rate": 0.0001722429800314129,
"loss": 0.3703,
"step": 12085
},
{
"epoch": 1.5227194004471456,
"grad_norm": 0.2797011435031891,
"learning_rate": 0.00017213426656041787,
"loss": 0.3523,
"step": 12090
},
{
"epoch": 1.5233491828573227,
"grad_norm": 0.3413710296154022,
"learning_rate": 0.00017202554120281612,
"loss": 0.3825,
"step": 12095
},
{
"epoch": 1.5239789652675002,
"grad_norm": 0.2759542167186737,
"learning_rate": 0.0001719168040169956,
"loss": 0.346,
"step": 12100
},
{
"epoch": 1.5246087476776773,
"grad_norm": 0.28816723823547363,
"learning_rate": 0.00017180805506135068,
"loss": 0.3772,
"step": 12105
},
{
"epoch": 1.5252385300878546,
"grad_norm": 0.2563376724720001,
"learning_rate": 0.00017169929439428207,
"loss": 0.3661,
"step": 12110
},
{
"epoch": 1.525868312498032,
"grad_norm": 0.29572755098342896,
"learning_rate": 0.0001715905220741967,
"loss": 0.3428,
"step": 12115
},
{
"epoch": 1.5264980949082092,
"grad_norm": 0.28491732478141785,
"learning_rate": 0.0001714817381595078,
"loss": 0.3778,
"step": 12120
},
{
"epoch": 1.5271278773183865,
"grad_norm": 0.28429144620895386,
"learning_rate": 0.0001713729427086348,
"loss": 0.351,
"step": 12125
},
{
"epoch": 1.5277576597285638,
"grad_norm": 0.3044835925102234,
"learning_rate": 0.00017126413578000342,
"loss": 0.3651,
"step": 12130
},
{
"epoch": 1.5283874421387411,
"grad_norm": 0.30945730209350586,
"learning_rate": 0.0001711553174320453,
"loss": 0.3731,
"step": 12135
},
{
"epoch": 1.5290172245489182,
"grad_norm": 0.26389655470848083,
"learning_rate": 0.00017104648772319853,
"loss": 0.3527,
"step": 12140
},
{
"epoch": 1.5296470069590957,
"grad_norm": 0.3144720196723938,
"learning_rate": 0.0001709376467119071,
"loss": 0.3776,
"step": 12145
},
{
"epoch": 1.5302767893692728,
"grad_norm": 0.2860710918903351,
"learning_rate": 0.00017082879445662113,
"loss": 0.3575,
"step": 12150
},
{
"epoch": 1.5309065717794503,
"grad_norm": 0.2869095802307129,
"learning_rate": 0.00017071993101579674,
"loss": 0.3322,
"step": 12155
},
{
"epoch": 1.5315363541896274,
"grad_norm": 0.2524400055408478,
"learning_rate": 0.00017061105644789612,
"loss": 0.3743,
"step": 12160
},
{
"epoch": 1.5321661365998047,
"grad_norm": 0.2670304477214813,
"learning_rate": 0.00017050217081138736,
"loss": 0.3735,
"step": 12165
},
{
"epoch": 1.532795919009982,
"grad_norm": 0.2701478898525238,
"learning_rate": 0.00017039327416474456,
"loss": 0.3467,
"step": 12170
},
{
"epoch": 1.5334257014201593,
"grad_norm": 0.2941682040691376,
"learning_rate": 0.0001702843665664477,
"loss": 0.3895,
"step": 12175
},
{
"epoch": 1.5340554838303366,
"grad_norm": 0.28004932403564453,
"learning_rate": 0.00017017544807498264,
"loss": 0.3666,
"step": 12180
},
{
"epoch": 1.534685266240514,
"grad_norm": 0.29110807180404663,
"learning_rate": 0.00017006651874884116,
"loss": 0.3628,
"step": 12185
},
{
"epoch": 1.5353150486506912,
"grad_norm": 0.2467578798532486,
"learning_rate": 0.00016995757864652066,
"loss": 0.35,
"step": 12190
},
{
"epoch": 1.5359448310608683,
"grad_norm": 0.3148331046104431,
"learning_rate": 0.00016984862782652463,
"loss": 0.3535,
"step": 12195
},
{
"epoch": 1.5365746134710458,
"grad_norm": 0.28578343987464905,
"learning_rate": 0.00016973966634736202,
"loss": 0.3477,
"step": 12200
},
{
"epoch": 1.537204395881223,
"grad_norm": 0.24588525295257568,
"learning_rate": 0.0001696306942675477,
"loss": 0.3459,
"step": 12205
},
{
"epoch": 1.5378341782914005,
"grad_norm": 0.2754054665565491,
"learning_rate": 0.00016952171164560213,
"loss": 0.3555,
"step": 12210
},
{
"epoch": 1.5384639607015775,
"grad_norm": 0.28237447142601013,
"learning_rate": 0.00016941271854005148,
"loss": 0.3446,
"step": 12215
},
{
"epoch": 1.5390937431117548,
"grad_norm": 0.27689647674560547,
"learning_rate": 0.00016930371500942755,
"loss": 0.3651,
"step": 12220
},
{
"epoch": 1.5397235255219321,
"grad_norm": 0.29644525051116943,
"learning_rate": 0.0001691947011122677,
"loss": 0.3482,
"step": 12225
},
{
"epoch": 1.5403533079321095,
"grad_norm": 0.3168468475341797,
"learning_rate": 0.0001690856769071149,
"loss": 0.3859,
"step": 12230
},
{
"epoch": 1.5409830903422868,
"grad_norm": 0.282879501581192,
"learning_rate": 0.0001689766424525177,
"loss": 0.3742,
"step": 12235
},
{
"epoch": 1.541612872752464,
"grad_norm": 0.2539578676223755,
"learning_rate": 0.00016886759780702996,
"loss": 0.3467,
"step": 12240
},
{
"epoch": 1.5422426551626414,
"grad_norm": 0.3353635370731354,
"learning_rate": 0.00016875854302921122,
"loss": 0.3831,
"step": 12245
},
{
"epoch": 1.5428724375728184,
"grad_norm": 0.2890516519546509,
"learning_rate": 0.0001686494781776264,
"loss": 0.3672,
"step": 12250
},
{
"epoch": 1.543502219982996,
"grad_norm": 0.3136516213417053,
"learning_rate": 0.00016854040331084583,
"loss": 0.37,
"step": 12255
},
{
"epoch": 1.544132002393173,
"grad_norm": 0.29757821559906006,
"learning_rate": 0.0001684313184874451,
"loss": 0.3681,
"step": 12260
},
{
"epoch": 1.5447617848033506,
"grad_norm": 0.3504684269428253,
"learning_rate": 0.0001683222237660054,
"loss": 0.3868,
"step": 12265
},
{
"epoch": 1.5453915672135277,
"grad_norm": 0.25241127610206604,
"learning_rate": 0.00016821311920511297,
"loss": 0.3425,
"step": 12270
},
{
"epoch": 1.546021349623705,
"grad_norm": 0.27286654710769653,
"learning_rate": 0.00016810400486335953,
"loss": 0.3604,
"step": 12275
},
{
"epoch": 1.5466511320338823,
"grad_norm": 0.3104652166366577,
"learning_rate": 0.0001679948807993419,
"loss": 0.3487,
"step": 12280
},
{
"epoch": 1.5472809144440596,
"grad_norm": 0.2972196042537689,
"learning_rate": 0.00016788574707166226,
"loss": 0.3555,
"step": 12285
},
{
"epoch": 1.5479106968542369,
"grad_norm": 0.29232388734817505,
"learning_rate": 0.00016777660373892787,
"loss": 0.3654,
"step": 12290
},
{
"epoch": 1.5485404792644142,
"grad_norm": 0.29798245429992676,
"learning_rate": 0.00016766745085975126,
"loss": 0.3575,
"step": 12295
},
{
"epoch": 1.5491702616745915,
"grad_norm": 0.2721775472164154,
"learning_rate": 0.0001675582884927499,
"loss": 0.3409,
"step": 12300
},
{
"epoch": 1.5498000440847686,
"grad_norm": 0.3131150007247925,
"learning_rate": 0.00016744911669654662,
"loss": 0.3695,
"step": 12305
},
{
"epoch": 1.550429826494946,
"grad_norm": 0.29543280601501465,
"learning_rate": 0.00016733993552976901,
"loss": 0.3572,
"step": 12310
},
{
"epoch": 1.5510596089051232,
"grad_norm": 0.3287052512168884,
"learning_rate": 0.00016723074505105,
"loss": 0.3681,
"step": 12315
},
{
"epoch": 1.5516893913153007,
"grad_norm": 0.2833183705806732,
"learning_rate": 0.0001671215453190273,
"loss": 0.3709,
"step": 12320
},
{
"epoch": 1.5523191737254778,
"grad_norm": 0.2558510899543762,
"learning_rate": 0.00016701233639234363,
"loss": 0.3404,
"step": 12325
},
{
"epoch": 1.552948956135655,
"grad_norm": 0.2524779438972473,
"learning_rate": 0.0001669031183296467,
"loss": 0.3492,
"step": 12330
},
{
"epoch": 1.5535787385458324,
"grad_norm": 0.2844880521297455,
"learning_rate": 0.00016679389118958918,
"loss": 0.3538,
"step": 12335
},
{
"epoch": 1.5542085209560097,
"grad_norm": 0.28060173988342285,
"learning_rate": 0.0001666846550308285,
"loss": 0.3615,
"step": 12340
},
{
"epoch": 1.554838303366187,
"grad_norm": 0.2490835040807724,
"learning_rate": 0.00016657540991202687,
"loss": 0.3655,
"step": 12345
},
{
"epoch": 1.5554680857763643,
"grad_norm": 0.27524054050445557,
"learning_rate": 0.00016646615589185153,
"loss": 0.3412,
"step": 12350
},
{
"epoch": 1.5560978681865416,
"grad_norm": 0.31142935156822205,
"learning_rate": 0.00016635689302897435,
"loss": 0.347,
"step": 12355
},
{
"epoch": 1.5567276505967187,
"grad_norm": 0.28053995966911316,
"learning_rate": 0.00016624762138207197,
"loss": 0.3838,
"step": 12360
},
{
"epoch": 1.5573574330068962,
"grad_norm": 0.2476169764995575,
"learning_rate": 0.0001661383410098258,
"loss": 0.3636,
"step": 12365
},
{
"epoch": 1.5579872154170733,
"grad_norm": 0.4054109752178192,
"learning_rate": 0.00016602905197092183,
"loss": 0.3657,
"step": 12370
},
{
"epoch": 1.5586169978272508,
"grad_norm": 0.2735072672367096,
"learning_rate": 0.00016591975432405084,
"loss": 0.3593,
"step": 12375
},
{
"epoch": 1.559246780237428,
"grad_norm": 0.2994532883167267,
"learning_rate": 0.00016581044812790817,
"loss": 0.3641,
"step": 12380
},
{
"epoch": 1.5598765626476052,
"grad_norm": 0.263090044260025,
"learning_rate": 0.0001657011334411936,
"loss": 0.3711,
"step": 12385
},
{
"epoch": 1.5605063450577825,
"grad_norm": 0.25073063373565674,
"learning_rate": 0.0001655918103226118,
"loss": 0.3554,
"step": 12390
},
{
"epoch": 1.5611361274679598,
"grad_norm": 0.2575080096721649,
"learning_rate": 0.00016548247883087168,
"loss": 0.3744,
"step": 12395
},
{
"epoch": 1.5617659098781371,
"grad_norm": 0.2630578577518463,
"learning_rate": 0.00016537313902468677,
"loss": 0.3501,
"step": 12400
},
{
"epoch": 1.5623956922883144,
"grad_norm": 0.3097805678844452,
"learning_rate": 0.00016526379096277503,
"loss": 0.3586,
"step": 12405
},
{
"epoch": 1.5630254746984917,
"grad_norm": 0.3104281723499298,
"learning_rate": 0.0001651544347038589,
"loss": 0.3643,
"step": 12410
},
{
"epoch": 1.5636552571086688,
"grad_norm": 0.3604758381843567,
"learning_rate": 0.0001650450703066652,
"loss": 0.3645,
"step": 12415
},
{
"epoch": 1.5642850395188463,
"grad_norm": 0.30638590455055237,
"learning_rate": 0.000164935697829925,
"loss": 0.3572,
"step": 12420
},
{
"epoch": 1.5649148219290234,
"grad_norm": 0.27940669655799866,
"learning_rate": 0.00016482631733237397,
"loss": 0.3636,
"step": 12425
},
{
"epoch": 1.565544604339201,
"grad_norm": 0.28857216238975525,
"learning_rate": 0.00016471692887275185,
"loss": 0.3601,
"step": 12430
},
{
"epoch": 1.566174386749378,
"grad_norm": 0.2992657721042633,
"learning_rate": 0.0001646075325098027,
"loss": 0.3621,
"step": 12435
},
{
"epoch": 1.5668041691595553,
"grad_norm": 0.28050917387008667,
"learning_rate": 0.00016449812830227498,
"loss": 0.3623,
"step": 12440
},
{
"epoch": 1.5674339515697326,
"grad_norm": 0.269634485244751,
"learning_rate": 0.0001643887163089212,
"loss": 0.3375,
"step": 12445
},
{
"epoch": 1.56806373397991,
"grad_norm": 0.2825991213321686,
"learning_rate": 0.00016427929658849807,
"loss": 0.3523,
"step": 12450
},
{
"epoch": 1.5686935163900873,
"grad_norm": 0.3219839334487915,
"learning_rate": 0.00016416986919976645,
"loss": 0.3588,
"step": 12455
},
{
"epoch": 1.5693232988002646,
"grad_norm": 0.2681691646575928,
"learning_rate": 0.00016406043420149146,
"loss": 0.3466,
"step": 12460
},
{
"epoch": 1.5699530812104419,
"grad_norm": 0.2719057500362396,
"learning_rate": 0.0001639509916524421,
"loss": 0.3599,
"step": 12465
},
{
"epoch": 1.570582863620619,
"grad_norm": 0.24405649304389954,
"learning_rate": 0.00016384154161139158,
"loss": 0.3402,
"step": 12470
},
{
"epoch": 1.5712126460307965,
"grad_norm": 0.306537002325058,
"learning_rate": 0.00016373208413711696,
"loss": 0.3283,
"step": 12475
},
{
"epoch": 1.5718424284409735,
"grad_norm": 0.28490665555000305,
"learning_rate": 0.0001636226192883996,
"loss": 0.3529,
"step": 12480
},
{
"epoch": 1.572472210851151,
"grad_norm": 0.2510652542114258,
"learning_rate": 0.00016351314712402442,
"loss": 0.3228,
"step": 12485
},
{
"epoch": 1.5731019932613282,
"grad_norm": 0.2670060694217682,
"learning_rate": 0.0001634036677027806,
"loss": 0.3592,
"step": 12490
},
{
"epoch": 1.5737317756715055,
"grad_norm": 0.29240545630455017,
"learning_rate": 0.00016329418108346105,
"loss": 0.3717,
"step": 12495
},
{
"epoch": 1.5743615580816828,
"grad_norm": 0.29088887572288513,
"learning_rate": 0.00016318468732486255,
"loss": 0.3679,
"step": 12500
},
{
"epoch": 1.57499134049186,
"grad_norm": 0.25105100870132446,
"learning_rate": 0.0001630751864857858,
"loss": 0.3464,
"step": 12505
},
{
"epoch": 1.5756211229020374,
"grad_norm": 0.26624953746795654,
"learning_rate": 0.00016296567862503526,
"loss": 0.3552,
"step": 12510
},
{
"epoch": 1.5762509053122147,
"grad_norm": 0.28500837087631226,
"learning_rate": 0.00016285616380141914,
"loss": 0.3591,
"step": 12515
},
{
"epoch": 1.576880687722392,
"grad_norm": 0.2937677502632141,
"learning_rate": 0.00016274664207374936,
"loss": 0.3664,
"step": 12520
},
{
"epoch": 1.577510470132569,
"grad_norm": 0.28588148951530457,
"learning_rate": 0.00016263711350084165,
"loss": 0.3767,
"step": 12525
},
{
"epoch": 1.5781402525427466,
"grad_norm": 0.31547772884368896,
"learning_rate": 0.0001625275781415153,
"loss": 0.3521,
"step": 12530
},
{
"epoch": 1.5787700349529237,
"grad_norm": 0.29322996735572815,
"learning_rate": 0.00016241803605459334,
"loss": 0.3777,
"step": 12535
},
{
"epoch": 1.5793998173631012,
"grad_norm": 0.29141756892204285,
"learning_rate": 0.00016230848729890238,
"loss": 0.3367,
"step": 12540
},
{
"epoch": 1.5800295997732783,
"grad_norm": 0.2583523094654083,
"learning_rate": 0.00016219893193327258,
"loss": 0.3473,
"step": 12545
},
{
"epoch": 1.5806593821834556,
"grad_norm": 0.26929906010627747,
"learning_rate": 0.00016208937001653765,
"loss": 0.3622,
"step": 12550
},
{
"epoch": 1.581289164593633,
"grad_norm": 0.2727062702178955,
"learning_rate": 0.0001619798016075349,
"loss": 0.3607,
"step": 12555
},
{
"epoch": 1.5819189470038102,
"grad_norm": 0.35366252064704895,
"learning_rate": 0.000161870226765105,
"loss": 0.3453,
"step": 12560
},
{
"epoch": 1.5825487294139875,
"grad_norm": 0.30689889192581177,
"learning_rate": 0.00016176064554809225,
"loss": 0.3672,
"step": 12565
},
{
"epoch": 1.5831785118241648,
"grad_norm": 0.2855357825756073,
"learning_rate": 0.00016165105801534414,
"loss": 0.3715,
"step": 12570
},
{
"epoch": 1.583808294234342,
"grad_norm": 0.33706697821617126,
"learning_rate": 0.00016154146422571176,
"loss": 0.3645,
"step": 12575
},
{
"epoch": 1.5844380766445192,
"grad_norm": 0.24410569667816162,
"learning_rate": 0.00016143186423804944,
"loss": 0.3576,
"step": 12580
},
{
"epoch": 1.5850678590546967,
"grad_norm": 0.33356451988220215,
"learning_rate": 0.00016132225811121492,
"loss": 0.3774,
"step": 12585
},
{
"epoch": 1.5856976414648738,
"grad_norm": 0.2804293632507324,
"learning_rate": 0.00016121264590406912,
"loss": 0.3656,
"step": 12590
},
{
"epoch": 1.5863274238750513,
"grad_norm": 0.29394668340682983,
"learning_rate": 0.0001611030276754764,
"loss": 0.3468,
"step": 12595
},
{
"epoch": 1.5869572062852284,
"grad_norm": 0.2657965421676636,
"learning_rate": 0.0001609934034843042,
"loss": 0.3518,
"step": 12600
},
{
"epoch": 1.5875869886954057,
"grad_norm": 0.25583842396736145,
"learning_rate": 0.00016088377338942318,
"loss": 0.361,
"step": 12605
},
{
"epoch": 1.588216771105583,
"grad_norm": 0.27986687421798706,
"learning_rate": 0.00016077413744970722,
"loss": 0.3771,
"step": 12610
},
{
"epoch": 1.5888465535157603,
"grad_norm": 0.3200220763683319,
"learning_rate": 0.0001606644957240334,
"loss": 0.3666,
"step": 12615
},
{
"epoch": 1.5894763359259376,
"grad_norm": 0.29622554779052734,
"learning_rate": 0.00016055484827128173,
"loss": 0.3469,
"step": 12620
},
{
"epoch": 1.590106118336115,
"grad_norm": 0.3073137700557709,
"learning_rate": 0.00016044519515033545,
"loss": 0.3382,
"step": 12625
},
{
"epoch": 1.5907359007462922,
"grad_norm": 0.31342241168022156,
"learning_rate": 0.00016033553642008077,
"loss": 0.357,
"step": 12630
},
{
"epoch": 1.5913656831564693,
"grad_norm": 0.2913351058959961,
"learning_rate": 0.00016022587213940698,
"loss": 0.3487,
"step": 12635
},
{
"epoch": 1.5919954655666468,
"grad_norm": 0.2823300361633301,
"learning_rate": 0.00016011620236720621,
"loss": 0.3367,
"step": 12640
},
{
"epoch": 1.592625247976824,
"grad_norm": 0.3134678304195404,
"learning_rate": 0.00016000652716237373,
"loss": 0.3393,
"step": 12645
},
{
"epoch": 1.5932550303870014,
"grad_norm": 0.3235761821269989,
"learning_rate": 0.0001598968465838076,
"loss": 0.3752,
"step": 12650
},
{
"epoch": 1.5938848127971785,
"grad_norm": 0.26606664061546326,
"learning_rate": 0.00015978716069040875,
"loss": 0.3413,
"step": 12655
},
{
"epoch": 1.5945145952073558,
"grad_norm": 0.30575528740882874,
"learning_rate": 0.0001596774695410811,
"loss": 0.3715,
"step": 12660
},
{
"epoch": 1.5951443776175331,
"grad_norm": 0.3017826974391937,
"learning_rate": 0.0001595677731947312,
"loss": 0.3586,
"step": 12665
},
{
"epoch": 1.5957741600277104,
"grad_norm": 0.3203217089176178,
"learning_rate": 0.00015945807171026855,
"loss": 0.3753,
"step": 12670
},
{
"epoch": 1.5964039424378877,
"grad_norm": 0.2615835666656494,
"learning_rate": 0.00015934836514660536,
"loss": 0.3641,
"step": 12675
},
{
"epoch": 1.5970337248480648,
"grad_norm": 0.2814265191555023,
"learning_rate": 0.00015923865356265652,
"loss": 0.3467,
"step": 12680
},
{
"epoch": 1.5976635072582424,
"grad_norm": 0.28240394592285156,
"learning_rate": 0.00015912893701733975,
"loss": 0.3405,
"step": 12685
},
{
"epoch": 1.5982932896684194,
"grad_norm": 0.2675967514514923,
"learning_rate": 0.0001590192155695752,
"loss": 0.3341,
"step": 12690
},
{
"epoch": 1.598923072078597,
"grad_norm": 0.33408063650131226,
"learning_rate": 0.00015890948927828593,
"loss": 0.3431,
"step": 12695
},
{
"epoch": 1.599552854488774,
"grad_norm": 0.2793383300304413,
"learning_rate": 0.00015879975820239737,
"loss": 0.3334,
"step": 12700
},
{
"epoch": 1.6001826368989516,
"grad_norm": 0.29299893975257874,
"learning_rate": 0.00015869002240083765,
"loss": 0.3479,
"step": 12705
},
{
"epoch": 1.6008124193091287,
"grad_norm": 0.2782432436943054,
"learning_rate": 0.0001585802819325374,
"loss": 0.3491,
"step": 12710
},
{
"epoch": 1.601442201719306,
"grad_norm": 0.2812289297580719,
"learning_rate": 0.00015847053685642977,
"loss": 0.3406,
"step": 12715
},
{
"epoch": 1.6020719841294833,
"grad_norm": 0.2464970499277115,
"learning_rate": 0.00015836078723145032,
"loss": 0.3539,
"step": 12720
},
{
"epoch": 1.6027017665396606,
"grad_norm": 0.3087675869464874,
"learning_rate": 0.0001582510331165372,
"loss": 0.356,
"step": 12725
},
{
"epoch": 1.6033315489498379,
"grad_norm": 0.2726532816886902,
"learning_rate": 0.0001581412745706308,
"loss": 0.3443,
"step": 12730
},
{
"epoch": 1.603961331360015,
"grad_norm": 0.28401410579681396,
"learning_rate": 0.00015803151165267397,
"loss": 0.3359,
"step": 12735
},
{
"epoch": 1.6045911137701925,
"grad_norm": 0.2700473666191101,
"learning_rate": 0.00015792174442161194,
"loss": 0.3523,
"step": 12740
},
{
"epoch": 1.6052208961803696,
"grad_norm": 0.32183146476745605,
"learning_rate": 0.00015781197293639223,
"loss": 0.3765,
"step": 12745
},
{
"epoch": 1.605850678590547,
"grad_norm": 0.2805304229259491,
"learning_rate": 0.0001577021972559646,
"loss": 0.3546,
"step": 12750
},
{
"epoch": 1.6064804610007242,
"grad_norm": 0.30137768387794495,
"learning_rate": 0.00015759241743928108,
"loss": 0.3721,
"step": 12755
},
{
"epoch": 1.6071102434109017,
"grad_norm": 0.25476494431495667,
"learning_rate": 0.00015748263354529597,
"loss": 0.3281,
"step": 12760
},
{
"epoch": 1.6077400258210788,
"grad_norm": 0.3311167061328888,
"learning_rate": 0.0001573728456329657,
"loss": 0.3875,
"step": 12765
},
{
"epoch": 1.608369808231256,
"grad_norm": 0.27148258686065674,
"learning_rate": 0.00015726305376124897,
"loss": 0.3547,
"step": 12770
},
{
"epoch": 1.6089995906414334,
"grad_norm": 0.25366437435150146,
"learning_rate": 0.00015715325798910644,
"loss": 0.3423,
"step": 12775
},
{
"epoch": 1.6096293730516107,
"grad_norm": 0.2699160873889923,
"learning_rate": 0.000157043458375501,
"loss": 0.3347,
"step": 12780
},
{
"epoch": 1.610259155461788,
"grad_norm": 0.2672334611415863,
"learning_rate": 0.00015693365497939743,
"loss": 0.3354,
"step": 12785
},
{
"epoch": 1.610888937871965,
"grad_norm": 0.3269018828868866,
"learning_rate": 0.00015682384785976284,
"loss": 0.3427,
"step": 12790
},
{
"epoch": 1.6115187202821426,
"grad_norm": 0.2637929320335388,
"learning_rate": 0.00015671403707556605,
"loss": 0.3501,
"step": 12795
},
{
"epoch": 1.6121485026923197,
"grad_norm": 0.2606005072593689,
"learning_rate": 0.00015660422268577801,
"loss": 0.3387,
"step": 12800
},
{
"epoch": 1.6127782851024972,
"grad_norm": 0.30220791697502136,
"learning_rate": 0.00015649440474937152,
"loss": 0.3489,
"step": 12805
},
{
"epoch": 1.6134080675126743,
"grad_norm": 0.29726284742355347,
"learning_rate": 0.0001563845833253213,
"loss": 0.3358,
"step": 12810
},
{
"epoch": 1.6140378499228518,
"grad_norm": 0.2928326427936554,
"learning_rate": 0.000156274758472604,
"loss": 0.3236,
"step": 12815
},
{
"epoch": 1.614667632333029,
"grad_norm": 0.31645599007606506,
"learning_rate": 0.0001561649302501981,
"loss": 0.3571,
"step": 12820
},
{
"epoch": 1.6152974147432062,
"grad_norm": 0.26705339550971985,
"learning_rate": 0.00015605509871708382,
"loss": 0.3671,
"step": 12825
},
{
"epoch": 1.6159271971533835,
"grad_norm": 0.2691219449043274,
"learning_rate": 0.00015594526393224322,
"loss": 0.3452,
"step": 12830
},
{
"epoch": 1.6165569795635608,
"grad_norm": 0.2822478413581848,
"learning_rate": 0.00015583542595466005,
"loss": 0.3273,
"step": 12835
},
{
"epoch": 1.6171867619737381,
"grad_norm": 0.2974461019039154,
"learning_rate": 0.00015572558484331994,
"loss": 0.3652,
"step": 12840
},
{
"epoch": 1.6178165443839152,
"grad_norm": 0.2611928880214691,
"learning_rate": 0.00015561574065720986,
"loss": 0.3445,
"step": 12845
},
{
"epoch": 1.6184463267940927,
"grad_norm": 0.2836850583553314,
"learning_rate": 0.00015550589345531885,
"loss": 0.3326,
"step": 12850
},
{
"epoch": 1.6190761092042698,
"grad_norm": 0.2735482156276703,
"learning_rate": 0.00015539604329663725,
"loss": 0.3532,
"step": 12855
},
{
"epoch": 1.6197058916144473,
"grad_norm": 0.34394770860671997,
"learning_rate": 0.0001552861902401572,
"loss": 0.3532,
"step": 12860
},
{
"epoch": 1.6203356740246244,
"grad_norm": 0.2786300778388977,
"learning_rate": 0.0001551763343448722,
"loss": 0.3591,
"step": 12865
},
{
"epoch": 1.6209654564348017,
"grad_norm": 0.26574140787124634,
"learning_rate": 0.00015506647566977737,
"loss": 0.3527,
"step": 12870
},
{
"epoch": 1.621595238844979,
"grad_norm": 0.23986276984214783,
"learning_rate": 0.00015495661427386944,
"loss": 0.3437,
"step": 12875
},
{
"epoch": 1.6222250212551563,
"grad_norm": 0.29332754015922546,
"learning_rate": 0.0001548467502161464,
"loss": 0.3323,
"step": 12880
},
{
"epoch": 1.6228548036653336,
"grad_norm": 0.34338971972465515,
"learning_rate": 0.0001547368835556078,
"loss": 0.3367,
"step": 12885
},
{
"epoch": 1.623484586075511,
"grad_norm": 0.3112575113773346,
"learning_rate": 0.00015462701435125451,
"loss": 0.3392,
"step": 12890
},
{
"epoch": 1.6241143684856882,
"grad_norm": 0.26299479603767395,
"learning_rate": 0.0001545171426620888,
"loss": 0.3194,
"step": 12895
},
{
"epoch": 1.6247441508958653,
"grad_norm": 0.27403828501701355,
"learning_rate": 0.00015440726854711436,
"loss": 0.3344,
"step": 12900
},
{
"epoch": 1.6253739333060429,
"grad_norm": 0.2603330910205841,
"learning_rate": 0.000154297392065336,
"loss": 0.3564,
"step": 12905
},
{
"epoch": 1.62600371571622,
"grad_norm": 0.2812626361846924,
"learning_rate": 0.00015418751327575994,
"loss": 0.3583,
"step": 12910
},
{
"epoch": 1.6266334981263975,
"grad_norm": 0.27280038595199585,
"learning_rate": 0.0001540776322373936,
"loss": 0.3568,
"step": 12915
},
{
"epoch": 1.6272632805365745,
"grad_norm": 0.3073441982269287,
"learning_rate": 0.0001539677490092456,
"loss": 0.3336,
"step": 12920
},
{
"epoch": 1.6278930629467518,
"grad_norm": 0.2868177890777588,
"learning_rate": 0.00015385786365032576,
"loss": 0.3455,
"step": 12925
},
{
"epoch": 1.6285228453569291,
"grad_norm": 0.2661624550819397,
"learning_rate": 0.000153747976219645,
"loss": 0.3377,
"step": 12930
},
{
"epoch": 1.6291526277671065,
"grad_norm": 0.3179323673248291,
"learning_rate": 0.0001536380867762154,
"loss": 0.3706,
"step": 12935
},
{
"epoch": 1.6297824101772838,
"grad_norm": 0.30941662192344666,
"learning_rate": 0.0001535281953790501,
"loss": 0.3514,
"step": 12940
},
{
"epoch": 1.630412192587461,
"grad_norm": 0.3018413782119751,
"learning_rate": 0.0001534183020871633,
"loss": 0.3516,
"step": 12945
},
{
"epoch": 1.6310419749976384,
"grad_norm": 0.34621462225914,
"learning_rate": 0.00015330840695957019,
"loss": 0.3522,
"step": 12950
},
{
"epoch": 1.6316717574078154,
"grad_norm": 0.2858521342277527,
"learning_rate": 0.000153198510055287,
"loss": 0.3378,
"step": 12955
},
{
"epoch": 1.632301539817993,
"grad_norm": 0.2880783975124359,
"learning_rate": 0.00015308861143333076,
"loss": 0.3615,
"step": 12960
},
{
"epoch": 1.63293132222817,
"grad_norm": 0.24324443936347961,
"learning_rate": 0.00015297871115271976,
"loss": 0.3346,
"step": 12965
},
{
"epoch": 1.6335611046383476,
"grad_norm": 0.26982635259628296,
"learning_rate": 0.00015286880927247273,
"loss": 0.3423,
"step": 12970
},
{
"epoch": 1.6341908870485247,
"grad_norm": 0.27813199162483215,
"learning_rate": 0.00015275890585160961,
"loss": 0.3545,
"step": 12975
},
{
"epoch": 1.634820669458702,
"grad_norm": 0.27575090527534485,
"learning_rate": 0.00015264900094915106,
"loss": 0.3357,
"step": 12980
},
{
"epoch": 1.6354504518688793,
"grad_norm": 0.25838521122932434,
"learning_rate": 0.00015253909462411847,
"loss": 0.3244,
"step": 12985
},
{
"epoch": 1.6360802342790566,
"grad_norm": 0.2889041602611542,
"learning_rate": 0.00015242918693553404,
"loss": 0.3297,
"step": 12990
},
{
"epoch": 1.6367100166892339,
"grad_norm": 0.3074316680431366,
"learning_rate": 0.0001523192779424208,
"loss": 0.3525,
"step": 12995
},
{
"epoch": 1.6373397990994112,
"grad_norm": 0.26425209641456604,
"learning_rate": 0.00015220936770380227,
"loss": 0.3493,
"step": 13000
},
{
"epoch": 1.6373397990994112,
"eval_loss": 0.30248695611953735,
"eval_runtime": 6.1682,
"eval_samples_per_second": 162.123,
"eval_steps_per_second": 10.214,
"step": 13000
},
{
"epoch": 1.6379695815095885,
"grad_norm": 0.2873767018318176,
"learning_rate": 0.00015209945627870283,
"loss": 0.3838,
"step": 13005
},
{
"epoch": 1.6385993639197656,
"grad_norm": 0.2895953059196472,
"learning_rate": 0.0001519895437261474,
"loss": 0.3509,
"step": 13010
},
{
"epoch": 1.639229146329943,
"grad_norm": 0.2910915017127991,
"learning_rate": 0.0001518796301051616,
"loss": 0.326,
"step": 13015
},
{
"epoch": 1.6398589287401202,
"grad_norm": 0.2735256552696228,
"learning_rate": 0.00015176971547477142,
"loss": 0.366,
"step": 13020
},
{
"epoch": 1.6404887111502977,
"grad_norm": 0.3099430501461029,
"learning_rate": 0.00015165979989400366,
"loss": 0.3226,
"step": 13025
},
{
"epoch": 1.6411184935604748,
"grad_norm": 0.2963193655014038,
"learning_rate": 0.00015154988342188543,
"loss": 0.3301,
"step": 13030
},
{
"epoch": 1.641748275970652,
"grad_norm": 0.28377631306648254,
"learning_rate": 0.0001514399661174444,
"loss": 0.3143,
"step": 13035
},
{
"epoch": 1.6423780583808294,
"grad_norm": 0.25847306847572327,
"learning_rate": 0.00015133004803970866,
"loss": 0.325,
"step": 13040
},
{
"epoch": 1.6430078407910067,
"grad_norm": 0.2919864058494568,
"learning_rate": 0.00015122012924770675,
"loss": 0.3543,
"step": 13045
},
{
"epoch": 1.643637623201184,
"grad_norm": 0.31185677647590637,
"learning_rate": 0.00015111020980046756,
"loss": 0.3546,
"step": 13050
},
{
"epoch": 1.6442674056113613,
"grad_norm": 0.27933362126350403,
"learning_rate": 0.00015100028975702036,
"loss": 0.3344,
"step": 13055
},
{
"epoch": 1.6448971880215386,
"grad_norm": 0.2898799777030945,
"learning_rate": 0.00015089036917639468,
"loss": 0.3473,
"step": 13060
},
{
"epoch": 1.6455269704317157,
"grad_norm": 0.31464672088623047,
"learning_rate": 0.00015078044811762047,
"loss": 0.3418,
"step": 13065
},
{
"epoch": 1.6461567528418932,
"grad_norm": 0.2676648199558258,
"learning_rate": 0.00015067052663972775,
"loss": 0.3331,
"step": 13070
},
{
"epoch": 1.6467865352520703,
"grad_norm": 0.30420759320259094,
"learning_rate": 0.0001505606048017469,
"loss": 0.3544,
"step": 13075
},
{
"epoch": 1.6474163176622478,
"grad_norm": 0.3160271942615509,
"learning_rate": 0.00015045068266270848,
"loss": 0.3526,
"step": 13080
},
{
"epoch": 1.648046100072425,
"grad_norm": 0.31276562809944153,
"learning_rate": 0.0001503407602816432,
"loss": 0.3213,
"step": 13085
},
{
"epoch": 1.6486758824826022,
"grad_norm": 0.316756933927536,
"learning_rate": 0.00015023083771758183,
"loss": 0.3446,
"step": 13090
},
{
"epoch": 1.6493056648927795,
"grad_norm": 0.23935994505882263,
"learning_rate": 0.00015012091502955533,
"loss": 0.3416,
"step": 13095
},
{
"epoch": 1.6499354473029568,
"grad_norm": 0.2719472348690033,
"learning_rate": 0.00015001099227659475,
"loss": 0.3567,
"step": 13100
},
{
"epoch": 1.6505652297131341,
"grad_norm": 0.3108009696006775,
"learning_rate": 0.00014990106951773098,
"loss": 0.3524,
"step": 13105
},
{
"epoch": 1.6511950121233114,
"grad_norm": 0.3002628982067108,
"learning_rate": 0.00014979114681199524,
"loss": 0.3314,
"step": 13110
},
{
"epoch": 1.6518247945334887,
"grad_norm": 0.32287389039993286,
"learning_rate": 0.0001496812242184184,
"loss": 0.3376,
"step": 13115
},
{
"epoch": 1.6524545769436658,
"grad_norm": 0.27193522453308105,
"learning_rate": 0.0001495713017960314,
"loss": 0.3443,
"step": 13120
},
{
"epoch": 1.6530843593538433,
"grad_norm": 0.30429700016975403,
"learning_rate": 0.00014946137960386512,
"loss": 0.3345,
"step": 13125
},
{
"epoch": 1.6537141417640204,
"grad_norm": 0.2757263481616974,
"learning_rate": 0.00014935145770095034,
"loss": 0.3405,
"step": 13130
},
{
"epoch": 1.654343924174198,
"grad_norm": 0.274728000164032,
"learning_rate": 0.00014924153614631754,
"loss": 0.3199,
"step": 13135
},
{
"epoch": 1.654973706584375,
"grad_norm": 0.2992052137851715,
"learning_rate": 0.0001491316149989972,
"loss": 0.3641,
"step": 13140
},
{
"epoch": 1.6556034889945523,
"grad_norm": 0.28687140345573425,
"learning_rate": 0.00014902169431801947,
"loss": 0.3586,
"step": 13145
},
{
"epoch": 1.6562332714047296,
"grad_norm": 0.31748563051223755,
"learning_rate": 0.00014891177416241416,
"loss": 0.3318,
"step": 13150
},
{
"epoch": 1.656863053814907,
"grad_norm": 0.2876995801925659,
"learning_rate": 0.00014880185459121103,
"loss": 0.3446,
"step": 13155
},
{
"epoch": 1.6574928362250843,
"grad_norm": 0.2874261736869812,
"learning_rate": 0.00014869193566343934,
"loss": 0.3058,
"step": 13160
},
{
"epoch": 1.6581226186352616,
"grad_norm": 0.2720824182033539,
"learning_rate": 0.00014858201743812806,
"loss": 0.3332,
"step": 13165
},
{
"epoch": 1.6587524010454389,
"grad_norm": 0.27765411138534546,
"learning_rate": 0.00014847209997430582,
"loss": 0.3428,
"step": 13170
},
{
"epoch": 1.659382183455616,
"grad_norm": 0.28871631622314453,
"learning_rate": 0.0001483621833310008,
"loss": 0.3325,
"step": 13175
},
{
"epoch": 1.6600119658657935,
"grad_norm": 0.2875865697860718,
"learning_rate": 0.00014825226756724077,
"loss": 0.3527,
"step": 13180
},
{
"epoch": 1.6606417482759706,
"grad_norm": 0.2774711549282074,
"learning_rate": 0.00014814235274205297,
"loss": 0.335,
"step": 13185
},
{
"epoch": 1.661271530686148,
"grad_norm": 0.2727283537387848,
"learning_rate": 0.00014803243891446416,
"loss": 0.3393,
"step": 13190
},
{
"epoch": 1.6619013130963252,
"grad_norm": 0.27977532148361206,
"learning_rate": 0.00014792252614350055,
"loss": 0.3566,
"step": 13195
},
{
"epoch": 1.6625310955065025,
"grad_norm": 0.29823413491249084,
"learning_rate": 0.0001478126144881879,
"loss": 0.3287,
"step": 13200
},
{
"epoch": 1.6631608779166798,
"grad_norm": 0.2849923372268677,
"learning_rate": 0.00014770270400755125,
"loss": 0.3166,
"step": 13205
},
{
"epoch": 1.663790660326857,
"grad_norm": 0.259219229221344,
"learning_rate": 0.00014759279476061503,
"loss": 0.336,
"step": 13210
},
{
"epoch": 1.6644204427370344,
"grad_norm": 0.2877882719039917,
"learning_rate": 0.00014748288680640302,
"loss": 0.3506,
"step": 13215
},
{
"epoch": 1.6650502251472117,
"grad_norm": 0.2952651381492615,
"learning_rate": 0.00014737298020393828,
"loss": 0.3562,
"step": 13220
},
{
"epoch": 1.665680007557389,
"grad_norm": 0.25878390669822693,
"learning_rate": 0.00014726307501224312,
"loss": 0.3289,
"step": 13225
},
{
"epoch": 1.666309789967566,
"grad_norm": 0.29914605617523193,
"learning_rate": 0.00014715317129033924,
"loss": 0.3321,
"step": 13230
},
{
"epoch": 1.6669395723777436,
"grad_norm": 0.27533242106437683,
"learning_rate": 0.00014704326909724738,
"loss": 0.3234,
"step": 13235
},
{
"epoch": 1.6675693547879207,
"grad_norm": 0.2584016025066376,
"learning_rate": 0.0001469333684919876,
"loss": 0.3181,
"step": 13240
},
{
"epoch": 1.6681991371980982,
"grad_norm": 0.262953519821167,
"learning_rate": 0.00014682346953357898,
"loss": 0.3127,
"step": 13245
},
{
"epoch": 1.6688289196082753,
"grad_norm": 0.3399054706096649,
"learning_rate": 0.00014671357228103978,
"loss": 0.3529,
"step": 13250
},
{
"epoch": 1.6694587020184526,
"grad_norm": 0.26437637209892273,
"learning_rate": 0.00014660367679338732,
"loss": 0.318,
"step": 13255
},
{
"epoch": 1.67008848442863,
"grad_norm": 0.28796815872192383,
"learning_rate": 0.000146493783129638,
"loss": 0.3226,
"step": 13260
},
{
"epoch": 1.6707182668388072,
"grad_norm": 0.3208424150943756,
"learning_rate": 0.00014638389134880722,
"loss": 0.3661,
"step": 13265
},
{
"epoch": 1.6713480492489845,
"grad_norm": 0.2934640347957611,
"learning_rate": 0.00014627400150990941,
"loss": 0.3414,
"step": 13270
},
{
"epoch": 1.6719778316591618,
"grad_norm": 0.28860223293304443,
"learning_rate": 0.0001461641136719579,
"loss": 0.3386,
"step": 13275
},
{
"epoch": 1.672607614069339,
"grad_norm": 0.2960747182369232,
"learning_rate": 0.00014605422789396494,
"loss": 0.3466,
"step": 13280
},
{
"epoch": 1.6732373964795162,
"grad_norm": 0.25040510296821594,
"learning_rate": 0.00014594434423494178,
"loss": 0.3366,
"step": 13285
},
{
"epoch": 1.6738671788896937,
"grad_norm": 0.2958894371986389,
"learning_rate": 0.0001458344627538984,
"loss": 0.3614,
"step": 13290
},
{
"epoch": 1.6744969612998708,
"grad_norm": 0.26937004923820496,
"learning_rate": 0.00014572458350984362,
"loss": 0.3499,
"step": 13295
},
{
"epoch": 1.6751267437100483,
"grad_norm": 0.267607182264328,
"learning_rate": 0.00014561470656178517,
"loss": 0.3268,
"step": 13300
},
{
"epoch": 1.6757565261202254,
"grad_norm": 0.30197760462760925,
"learning_rate": 0.0001455048319687295,
"loss": 0.3212,
"step": 13305
},
{
"epoch": 1.6763863085304027,
"grad_norm": 0.29999008774757385,
"learning_rate": 0.0001453949597896817,
"loss": 0.3492,
"step": 13310
},
{
"epoch": 1.67701609094058,
"grad_norm": 0.30626264214515686,
"learning_rate": 0.00014528509008364572,
"loss": 0.3541,
"step": 13315
},
{
"epoch": 1.6776458733507573,
"grad_norm": 0.2915571630001068,
"learning_rate": 0.0001451752229096241,
"loss": 0.3231,
"step": 13320
},
{
"epoch": 1.6782756557609346,
"grad_norm": 0.2660951018333435,
"learning_rate": 0.0001450653583266179,
"loss": 0.321,
"step": 13325
},
{
"epoch": 1.678905438171112,
"grad_norm": 0.2831597924232483,
"learning_rate": 0.00014495549639362707,
"loss": 0.3243,
"step": 13330
},
{
"epoch": 1.6795352205812892,
"grad_norm": 0.2856467664241791,
"learning_rate": 0.0001448456371696499,
"loss": 0.3134,
"step": 13335
},
{
"epoch": 1.6801650029914663,
"grad_norm": 0.31137335300445557,
"learning_rate": 0.00014473578071368324,
"loss": 0.3266,
"step": 13340
},
{
"epoch": 1.6807947854016438,
"grad_norm": 0.3102738857269287,
"learning_rate": 0.0001446259270847226,
"loss": 0.3368,
"step": 13345
},
{
"epoch": 1.681424567811821,
"grad_norm": 0.2788311839103699,
"learning_rate": 0.00014451607634176196,
"loss": 0.345,
"step": 13350
},
{
"epoch": 1.6820543502219985,
"grad_norm": 0.26762083172798157,
"learning_rate": 0.0001444062285437935,
"loss": 0.3112,
"step": 13355
},
{
"epoch": 1.6826841326321755,
"grad_norm": 0.30155837535858154,
"learning_rate": 0.00014429638374980814,
"loss": 0.3353,
"step": 13360
},
{
"epoch": 1.6833139150423528,
"grad_norm": 0.3196204602718353,
"learning_rate": 0.00014418654201879498,
"loss": 0.3738,
"step": 13365
},
{
"epoch": 1.6839436974525301,
"grad_norm": 0.29560673236846924,
"learning_rate": 0.0001440767034097415,
"loss": 0.3458,
"step": 13370
},
{
"epoch": 1.6845734798627074,
"grad_norm": 0.30189448595046997,
"learning_rate": 0.00014396686798163365,
"loss": 0.3577,
"step": 13375
},
{
"epoch": 1.6852032622728847,
"grad_norm": 0.29545098543167114,
"learning_rate": 0.00014385703579345544,
"loss": 0.3299,
"step": 13380
},
{
"epoch": 1.685833044683062,
"grad_norm": 0.3403629660606384,
"learning_rate": 0.00014374720690418942,
"loss": 0.3349,
"step": 13385
},
{
"epoch": 1.6864628270932394,
"grad_norm": 0.2561693489551544,
"learning_rate": 0.0001436373813728161,
"loss": 0.321,
"step": 13390
},
{
"epoch": 1.6870926095034164,
"grad_norm": 0.2968713641166687,
"learning_rate": 0.00014352755925831428,
"loss": 0.3314,
"step": 13395
},
{
"epoch": 1.687722391913594,
"grad_norm": 0.25213027000427246,
"learning_rate": 0.00014341774061966096,
"loss": 0.3245,
"step": 13400
},
{
"epoch": 1.688352174323771,
"grad_norm": 0.26504096388816833,
"learning_rate": 0.00014330792551583133,
"loss": 0.324,
"step": 13405
},
{
"epoch": 1.6889819567339486,
"grad_norm": 0.31459683179855347,
"learning_rate": 0.00014319811400579854,
"loss": 0.33,
"step": 13410
},
{
"epoch": 1.6896117391441257,
"grad_norm": 0.31566324830055237,
"learning_rate": 0.00014308830614853392,
"loss": 0.3097,
"step": 13415
},
{
"epoch": 1.690241521554303,
"grad_norm": 0.3083827793598175,
"learning_rate": 0.00014297850200300683,
"loss": 0.3345,
"step": 13420
},
{
"epoch": 1.6908713039644803,
"grad_norm": 0.29203763604164124,
"learning_rate": 0.0001428687016281845,
"loss": 0.3459,
"step": 13425
},
{
"epoch": 1.6915010863746576,
"grad_norm": 0.28596800565719604,
"learning_rate": 0.00014275890508303225,
"loss": 0.3188,
"step": 13430
},
{
"epoch": 1.6921308687848349,
"grad_norm": 0.3753102421760559,
"learning_rate": 0.00014264911242651342,
"loss": 0.3457,
"step": 13435
},
{
"epoch": 1.6927606511950122,
"grad_norm": 0.28502312302589417,
"learning_rate": 0.0001425393237175891,
"loss": 0.3295,
"step": 13440
},
{
"epoch": 1.6933904336051895,
"grad_norm": 0.3175462782382965,
"learning_rate": 0.00014242953901521838,
"loss": 0.3094,
"step": 13445
},
{
"epoch": 1.6940202160153666,
"grad_norm": 0.25370490550994873,
"learning_rate": 0.00014231975837835815,
"loss": 0.3446,
"step": 13450
},
{
"epoch": 1.694649998425544,
"grad_norm": 0.2589857876300812,
"learning_rate": 0.00014220998186596315,
"loss": 0.3258,
"step": 13455
},
{
"epoch": 1.6952797808357212,
"grad_norm": 0.31022030115127563,
"learning_rate": 0.00014210020953698573,
"loss": 0.344,
"step": 13460
},
{
"epoch": 1.6959095632458987,
"grad_norm": 0.3099876046180725,
"learning_rate": 0.0001419904414503763,
"loss": 0.3425,
"step": 13465
},
{
"epoch": 1.6965393456560758,
"grad_norm": 0.27715328335762024,
"learning_rate": 0.00014188067766508273,
"loss": 0.3309,
"step": 13470
},
{
"epoch": 1.697169128066253,
"grad_norm": 0.2700579762458801,
"learning_rate": 0.00014177091824005075,
"loss": 0.3191,
"step": 13475
},
{
"epoch": 1.6977989104764304,
"grad_norm": 0.2773703336715698,
"learning_rate": 0.00014166116323422365,
"loss": 0.3321,
"step": 13480
},
{
"epoch": 1.6984286928866077,
"grad_norm": 0.2699192464351654,
"learning_rate": 0.00014155141270654232,
"loss": 0.3318,
"step": 13485
},
{
"epoch": 1.699058475296785,
"grad_norm": 0.26127228140830994,
"learning_rate": 0.00014144166671594544,
"loss": 0.2982,
"step": 13490
},
{
"epoch": 1.699688257706962,
"grad_norm": 0.37218350172042847,
"learning_rate": 0.000141331925321369,
"loss": 0.3335,
"step": 13495
},
{
"epoch": 1.7003180401171396,
"grad_norm": 0.26352524757385254,
"learning_rate": 0.0001412221885817466,
"loss": 0.3246,
"step": 13500
},
{
"epoch": 1.7009478225273167,
"grad_norm": 0.27649009227752686,
"learning_rate": 0.00014111245655600948,
"loss": 0.3117,
"step": 13505
},
{
"epoch": 1.7015776049374942,
"grad_norm": 0.26316478848457336,
"learning_rate": 0.00014100272930308623,
"loss": 0.3268,
"step": 13510
},
{
"epoch": 1.7022073873476713,
"grad_norm": 0.26319512724876404,
"learning_rate": 0.0001408930068819028,
"loss": 0.3083,
"step": 13515
},
{
"epoch": 1.7028371697578488,
"grad_norm": 0.26792389154434204,
"learning_rate": 0.00014078328935138276,
"loss": 0.3317,
"step": 13520
},
{
"epoch": 1.703466952168026,
"grad_norm": 0.2627207338809967,
"learning_rate": 0.0001406735767704469,
"loss": 0.3225,
"step": 13525
},
{
"epoch": 1.7040967345782032,
"grad_norm": 0.30815207958221436,
"learning_rate": 0.00014056386919801325,
"loss": 0.3201,
"step": 13530
},
{
"epoch": 1.7047265169883805,
"grad_norm": 0.296520471572876,
"learning_rate": 0.00014045416669299747,
"loss": 0.3189,
"step": 13535
},
{
"epoch": 1.7053562993985578,
"grad_norm": 0.2739796042442322,
"learning_rate": 0.0001403444693143122,
"loss": 0.3023,
"step": 13540
},
{
"epoch": 1.7059860818087351,
"grad_norm": 0.311927855014801,
"learning_rate": 0.00014023477712086743,
"loss": 0.3311,
"step": 13545
},
{
"epoch": 1.7066158642189122,
"grad_norm": 0.2842674255371094,
"learning_rate": 0.0001401250901715704,
"loss": 0.3376,
"step": 13550
},
{
"epoch": 1.7072456466290897,
"grad_norm": 0.30459704995155334,
"learning_rate": 0.00014001540852532553,
"loss": 0.3276,
"step": 13555
},
{
"epoch": 1.7078754290392668,
"grad_norm": 0.26651817560195923,
"learning_rate": 0.00013990573224103442,
"loss": 0.3309,
"step": 13560
},
{
"epoch": 1.7085052114494443,
"grad_norm": 0.32419687509536743,
"learning_rate": 0.00013979606137759563,
"loss": 0.314,
"step": 13565
},
{
"epoch": 1.7091349938596214,
"grad_norm": 0.2715966999530792,
"learning_rate": 0.000139686395993905,
"loss": 0.3293,
"step": 13570
},
{
"epoch": 1.709764776269799,
"grad_norm": 0.29049497842788696,
"learning_rate": 0.0001395767361488552,
"loss": 0.3159,
"step": 13575
},
{
"epoch": 1.710394558679976,
"grad_norm": 0.3235701024532318,
"learning_rate": 0.00013946708190133627,
"loss": 0.3422,
"step": 13580
},
{
"epoch": 1.7110243410901533,
"grad_norm": 0.2732395529747009,
"learning_rate": 0.00013935743331023492,
"loss": 0.317,
"step": 13585
},
{
"epoch": 1.7116541235003306,
"grad_norm": 0.2833672761917114,
"learning_rate": 0.000139247790434435,
"loss": 0.3619,
"step": 13590
},
{
"epoch": 1.712283905910508,
"grad_norm": 0.2510261535644531,
"learning_rate": 0.00013913815333281728,
"loss": 0.3215,
"step": 13595
},
{
"epoch": 1.7129136883206852,
"grad_norm": 0.29638463258743286,
"learning_rate": 0.00013902852206425925,
"loss": 0.3341,
"step": 13600
},
{
"epoch": 1.7135434707308623,
"grad_norm": 0.26883918046951294,
"learning_rate": 0.0001389188966876355,
"loss": 0.3198,
"step": 13605
},
{
"epoch": 1.7141732531410399,
"grad_norm": 0.280301958322525,
"learning_rate": 0.00013880927726181737,
"loss": 0.3232,
"step": 13610
},
{
"epoch": 1.714803035551217,
"grad_norm": 0.25223594903945923,
"learning_rate": 0.00013869966384567293,
"loss": 0.3362,
"step": 13615
},
{
"epoch": 1.7154328179613945,
"grad_norm": 0.29902294278144836,
"learning_rate": 0.00013859005649806717,
"loss": 0.3169,
"step": 13620
},
{
"epoch": 1.7160626003715715,
"grad_norm": 0.3142664134502411,
"learning_rate": 0.00013848045527786168,
"loss": 0.3149,
"step": 13625
},
{
"epoch": 1.716692382781749,
"grad_norm": 0.312800794839859,
"learning_rate": 0.0001383708602439149,
"loss": 0.3327,
"step": 13630
},
{
"epoch": 1.7173221651919262,
"grad_norm": 0.3177478015422821,
"learning_rate": 0.00013826127145508176,
"loss": 0.3215,
"step": 13635
},
{
"epoch": 1.7179519476021035,
"grad_norm": 0.2900395691394806,
"learning_rate": 0.00013815168897021398,
"loss": 0.3169,
"step": 13640
},
{
"epoch": 1.7185817300122808,
"grad_norm": 0.2877413332462311,
"learning_rate": 0.00013804211284815986,
"loss": 0.3247,
"step": 13645
},
{
"epoch": 1.719211512422458,
"grad_norm": 0.25947847962379456,
"learning_rate": 0.00013793254314776432,
"loss": 0.3091,
"step": 13650
},
{
"epoch": 1.7198412948326354,
"grad_norm": 0.270942747592926,
"learning_rate": 0.00013782297992786873,
"loss": 0.3318,
"step": 13655
},
{
"epoch": 1.7204710772428125,
"grad_norm": 0.2605541944503784,
"learning_rate": 0.00013771342324731106,
"loss": 0.3247,
"step": 13660
},
{
"epoch": 1.72110085965299,
"grad_norm": 0.25236964225769043,
"learning_rate": 0.00013760387316492584,
"loss": 0.3111,
"step": 13665
},
{
"epoch": 1.721730642063167,
"grad_norm": 0.2639407217502594,
"learning_rate": 0.00013749432973954385,
"loss": 0.305,
"step": 13670
},
{
"epoch": 1.7223604244733446,
"grad_norm": 0.3111459016799927,
"learning_rate": 0.0001373847930299924,
"loss": 0.3367,
"step": 13675
},
{
"epoch": 1.7229902068835217,
"grad_norm": 0.31038767099380493,
"learning_rate": 0.00013727526309509531,
"loss": 0.3223,
"step": 13680
},
{
"epoch": 1.723619989293699,
"grad_norm": 0.2571181058883667,
"learning_rate": 0.00013716573999367259,
"loss": 0.3057,
"step": 13685
},
{
"epoch": 1.7242497717038763,
"grad_norm": 0.24940542876720428,
"learning_rate": 0.0001370562237845406,
"loss": 0.319,
"step": 13690
},
{
"epoch": 1.7248795541140536,
"grad_norm": 0.2301412671804428,
"learning_rate": 0.00013694671452651216,
"loss": 0.3099,
"step": 13695
},
{
"epoch": 1.7255093365242309,
"grad_norm": 0.27043718099594116,
"learning_rate": 0.00013683721227839623,
"loss": 0.3345,
"step": 13700
},
{
"epoch": 1.7261391189344082,
"grad_norm": 0.26595422625541687,
"learning_rate": 0.00013672771709899792,
"loss": 0.3162,
"step": 13705
},
{
"epoch": 1.7267689013445855,
"grad_norm": 0.26224714517593384,
"learning_rate": 0.0001366182290471187,
"loss": 0.322,
"step": 13710
},
{
"epoch": 1.7273986837547626,
"grad_norm": 0.26390886306762695,
"learning_rate": 0.00013650874818155618,
"loss": 0.2964,
"step": 13715
},
{
"epoch": 1.72802846616494,
"grad_norm": 0.3042176365852356,
"learning_rate": 0.00013639927456110402,
"loss": 0.3128,
"step": 13720
},
{
"epoch": 1.7286582485751172,
"grad_norm": 0.269771009683609,
"learning_rate": 0.00013628980824455212,
"loss": 0.2963,
"step": 13725
},
{
"epoch": 1.7292880309852947,
"grad_norm": 0.3462948203086853,
"learning_rate": 0.00013618034929068634,
"loss": 0.3445,
"step": 13730
},
{
"epoch": 1.7299178133954718,
"grad_norm": 0.270379900932312,
"learning_rate": 0.0001360708977582887,
"loss": 0.3174,
"step": 13735
},
{
"epoch": 1.730547595805649,
"grad_norm": 0.23746255040168762,
"learning_rate": 0.00013596145370613715,
"loss": 0.3006,
"step": 13740
},
{
"epoch": 1.7311773782158264,
"grad_norm": 0.30519574880599976,
"learning_rate": 0.00013585201719300562,
"loss": 0.3272,
"step": 13745
},
{
"epoch": 1.7318071606260037,
"grad_norm": 0.3508155941963196,
"learning_rate": 0.000135742588277664,
"loss": 0.3385,
"step": 13750
},
{
"epoch": 1.732436943036181,
"grad_norm": 0.2649688720703125,
"learning_rate": 0.00013563316701887816,
"loss": 0.3191,
"step": 13755
},
{
"epoch": 1.7330667254463583,
"grad_norm": 0.25044509768486023,
"learning_rate": 0.0001355237534754098,
"loss": 0.3114,
"step": 13760
},
{
"epoch": 1.7336965078565356,
"grad_norm": 0.27739325165748596,
"learning_rate": 0.00013541434770601653,
"loss": 0.3555,
"step": 13765
},
{
"epoch": 1.7343262902667127,
"grad_norm": 0.27952834963798523,
"learning_rate": 0.00013530494976945172,
"loss": 0.3287,
"step": 13770
},
{
"epoch": 1.7349560726768902,
"grad_norm": 0.29794949293136597,
"learning_rate": 0.00013519555972446454,
"loss": 0.3248,
"step": 13775
},
{
"epoch": 1.7355858550870673,
"grad_norm": 0.3177776634693146,
"learning_rate": 0.00013508617762979992,
"loss": 0.3311,
"step": 13780
},
{
"epoch": 1.7362156374972448,
"grad_norm": 0.29036352038383484,
"learning_rate": 0.0001349768035441986,
"loss": 0.3021,
"step": 13785
},
{
"epoch": 1.736845419907422,
"grad_norm": 0.2803820073604584,
"learning_rate": 0.00013486743752639694,
"loss": 0.3021,
"step": 13790
},
{
"epoch": 1.7374752023175992,
"grad_norm": 0.25854361057281494,
"learning_rate": 0.000134758079635127,
"loss": 0.3215,
"step": 13795
},
{
"epoch": 1.7381049847277765,
"grad_norm": 0.2606901228427887,
"learning_rate": 0.0001346487299291165,
"loss": 0.3093,
"step": 13800
},
{
"epoch": 1.7387347671379538,
"grad_norm": 0.25198522210121155,
"learning_rate": 0.00013453938846708864,
"loss": 0.2954,
"step": 13805
},
{
"epoch": 1.7393645495481311,
"grad_norm": 0.27399036288261414,
"learning_rate": 0.00013443005530776233,
"loss": 0.3212,
"step": 13810
},
{
"epoch": 1.7399943319583084,
"grad_norm": 0.2777753174304962,
"learning_rate": 0.000134320730509852,
"loss": 0.32,
"step": 13815
},
{
"epoch": 1.7406241143684857,
"grad_norm": 0.28130999207496643,
"learning_rate": 0.0001342114141320675,
"loss": 0.305,
"step": 13820
},
{
"epoch": 1.7412538967786628,
"grad_norm": 0.28102371096611023,
"learning_rate": 0.00013410210623311428,
"loss": 0.3066,
"step": 13825
},
{
"epoch": 1.7418836791888404,
"grad_norm": 0.21866032481193542,
"learning_rate": 0.00013399280687169312,
"loss": 0.3181,
"step": 13830
},
{
"epoch": 1.7425134615990174,
"grad_norm": 0.27159667015075684,
"learning_rate": 0.00013388351610650045,
"loss": 0.2983,
"step": 13835
},
{
"epoch": 1.743143244009195,
"grad_norm": 0.26473724842071533,
"learning_rate": 0.00013377423399622764,
"loss": 0.3041,
"step": 13840
},
{
"epoch": 1.743773026419372,
"grad_norm": 0.30044063925743103,
"learning_rate": 0.00013366496059956184,
"loss": 0.3391,
"step": 13845
},
{
"epoch": 1.7444028088295493,
"grad_norm": 0.3015748858451843,
"learning_rate": 0.00013355569597518532,
"loss": 0.3033,
"step": 13850
},
{
"epoch": 1.7450325912397266,
"grad_norm": 0.27009138464927673,
"learning_rate": 0.00013344644018177572,
"loss": 0.2973,
"step": 13855
},
{
"epoch": 1.745662373649904,
"grad_norm": 0.28925400972366333,
"learning_rate": 0.00013333719327800585,
"loss": 0.3137,
"step": 13860
},
{
"epoch": 1.7462921560600813,
"grad_norm": 0.27679139375686646,
"learning_rate": 0.00013322795532254379,
"loss": 0.3119,
"step": 13865
},
{
"epoch": 1.7469219384702586,
"grad_norm": 0.283965140581131,
"learning_rate": 0.0001331187263740529,
"loss": 0.3151,
"step": 13870
},
{
"epoch": 1.7475517208804359,
"grad_norm": 0.24927465617656708,
"learning_rate": 0.0001330095064911915,
"loss": 0.2968,
"step": 13875
},
{
"epoch": 1.748181503290613,
"grad_norm": 0.2976732850074768,
"learning_rate": 0.0001329002957326132,
"loss": 0.3257,
"step": 13880
},
{
"epoch": 1.7488112857007905,
"grad_norm": 0.27860409021377563,
"learning_rate": 0.00013279109415696672,
"loss": 0.2988,
"step": 13885
},
{
"epoch": 1.7494410681109676,
"grad_norm": 0.28782716393470764,
"learning_rate": 0.0001326819018228958,
"loss": 0.3098,
"step": 13890
},
{
"epoch": 1.750070850521145,
"grad_norm": 0.24729984998703003,
"learning_rate": 0.0001325727187890391,
"loss": 0.3123,
"step": 13895
},
{
"epoch": 1.7507006329313222,
"grad_norm": 0.23218853771686554,
"learning_rate": 0.00013246354511403058,
"loss": 0.3025,
"step": 13900
},
{
"epoch": 1.7513304153414995,
"grad_norm": 0.2634672522544861,
"learning_rate": 0.00013235438085649893,
"loss": 0.3123,
"step": 13905
},
{
"epoch": 1.7519601977516768,
"grad_norm": 0.3087509572505951,
"learning_rate": 0.00013224522607506776,
"loss": 0.3515,
"step": 13910
},
{
"epoch": 1.752589980161854,
"grad_norm": 0.28160160779953003,
"learning_rate": 0.00013213608082835576,
"loss": 0.3141,
"step": 13915
},
{
"epoch": 1.7532197625720314,
"grad_norm": 0.2643168866634369,
"learning_rate": 0.0001320269451749764,
"loss": 0.297,
"step": 13920
},
{
"epoch": 1.7538495449822087,
"grad_norm": 0.34547582268714905,
"learning_rate": 0.00013191781917353803,
"loss": 0.3194,
"step": 13925
},
{
"epoch": 1.754479327392386,
"grad_norm": 0.29079994559288025,
"learning_rate": 0.00013180870288264385,
"loss": 0.3334,
"step": 13930
},
{
"epoch": 1.755109109802563,
"grad_norm": 0.2323244959115982,
"learning_rate": 0.00013169959636089167,
"loss": 0.3106,
"step": 13935
},
{
"epoch": 1.7557388922127406,
"grad_norm": 0.29080161452293396,
"learning_rate": 0.00013159049966687437,
"loss": 0.2978,
"step": 13940
},
{
"epoch": 1.7563686746229177,
"grad_norm": 0.2688988149166107,
"learning_rate": 0.00013148141285917924,
"loss": 0.3184,
"step": 13945
},
{
"epoch": 1.7569984570330952,
"grad_norm": 0.25353583693504333,
"learning_rate": 0.0001313723359963884,
"loss": 0.2956,
"step": 13950
},
{
"epoch": 1.7576282394432723,
"grad_norm": 0.32606688141822815,
"learning_rate": 0.0001312632691370786,
"loss": 0.3136,
"step": 13955
},
{
"epoch": 1.7582580218534496,
"grad_norm": 0.24126961827278137,
"learning_rate": 0.0001311542123398213,
"loss": 0.304,
"step": 13960
},
{
"epoch": 1.758887804263627,
"grad_norm": 0.2840232253074646,
"learning_rate": 0.0001310451656631824,
"loss": 0.3126,
"step": 13965
},
{
"epoch": 1.7595175866738042,
"grad_norm": 0.30879929661750793,
"learning_rate": 0.0001309361291657226,
"loss": 0.3115,
"step": 13970
},
{
"epoch": 1.7601473690839815,
"grad_norm": 0.29478558897972107,
"learning_rate": 0.0001308271029059969,
"loss": 0.3035,
"step": 13975
},
{
"epoch": 1.7607771514941588,
"grad_norm": 0.29496970772743225,
"learning_rate": 0.00013071808694255484,
"loss": 0.3417,
"step": 13980
},
{
"epoch": 1.7614069339043361,
"grad_norm": 0.27189967036247253,
"learning_rate": 0.00013060908133394054,
"loss": 0.3146,
"step": 13985
},
{
"epoch": 1.7620367163145132,
"grad_norm": 0.2737963795661926,
"learning_rate": 0.00013050008613869256,
"loss": 0.3223,
"step": 13990
},
{
"epoch": 1.7626664987246907,
"grad_norm": 0.2881993055343628,
"learning_rate": 0.00013039110141534367,
"loss": 0.3039,
"step": 13995
},
{
"epoch": 1.7632962811348678,
"grad_norm": 0.29045918583869934,
"learning_rate": 0.00013028212722242127,
"loss": 0.3193,
"step": 14000
},
{
"epoch": 1.7632962811348678,
"eval_loss": 0.3040441870689392,
"eval_runtime": 6.1585,
"eval_samples_per_second": 162.378,
"eval_steps_per_second": 10.23,
"step": 14000
},
{
"epoch": 1.7639260635450453,
"grad_norm": 0.24037687480449677,
"learning_rate": 0.00013017316361844692,
"loss": 0.2918,
"step": 14005
},
{
"epoch": 1.7645558459552224,
"grad_norm": 0.25562503933906555,
"learning_rate": 0.0001300642106619367,
"loss": 0.2967,
"step": 14010
},
{
"epoch": 1.7651856283653997,
"grad_norm": 0.3410753905773163,
"learning_rate": 0.00012995526841140068,
"loss": 0.3158,
"step": 14015
},
{
"epoch": 1.765815410775577,
"grad_norm": 0.2569274306297302,
"learning_rate": 0.00012984633692534337,
"loss": 0.306,
"step": 14020
},
{
"epoch": 1.7664451931857543,
"grad_norm": 0.26620200276374817,
"learning_rate": 0.00012973741626226348,
"loss": 0.3122,
"step": 14025
},
{
"epoch": 1.7670749755959316,
"grad_norm": 0.2842133045196533,
"learning_rate": 0.00012962850648065393,
"loss": 0.3253,
"step": 14030
},
{
"epoch": 1.767704758006109,
"grad_norm": 0.27718397974967957,
"learning_rate": 0.00012951960763900173,
"loss": 0.3187,
"step": 14035
},
{
"epoch": 1.7683345404162862,
"grad_norm": 0.27699559926986694,
"learning_rate": 0.00012941071979578805,
"loss": 0.33,
"step": 14040
},
{
"epoch": 1.7689643228264633,
"grad_norm": 0.21499434113502502,
"learning_rate": 0.00012930184300948819,
"loss": 0.2765,
"step": 14045
},
{
"epoch": 1.7695941052366408,
"grad_norm": 0.29474014043807983,
"learning_rate": 0.00012919297733857138,
"loss": 0.32,
"step": 14050
},
{
"epoch": 1.770223887646818,
"grad_norm": 0.3570992052555084,
"learning_rate": 0.00012908412284150104,
"loss": 0.3088,
"step": 14055
},
{
"epoch": 1.7708536700569955,
"grad_norm": 0.2408706545829773,
"learning_rate": 0.00012897527957673446,
"loss": 0.2991,
"step": 14060
},
{
"epoch": 1.7714834524671725,
"grad_norm": 0.23086212575435638,
"learning_rate": 0.00012886644760272306,
"loss": 0.2959,
"step": 14065
},
{
"epoch": 1.7721132348773498,
"grad_norm": 0.25117409229278564,
"learning_rate": 0.00012875762697791199,
"loss": 0.2933,
"step": 14070
},
{
"epoch": 1.7727430172875271,
"grad_norm": 0.28731420636177063,
"learning_rate": 0.0001286488177607405,
"loss": 0.3234,
"step": 14075
},
{
"epoch": 1.7733727996977044,
"grad_norm": 0.23875364661216736,
"learning_rate": 0.0001285400200096416,
"loss": 0.2952,
"step": 14080
},
{
"epoch": 1.7740025821078818,
"grad_norm": 0.2722354829311371,
"learning_rate": 0.0001284312337830421,
"loss": 0.2997,
"step": 14085
},
{
"epoch": 1.774632364518059,
"grad_norm": 0.27776023745536804,
"learning_rate": 0.00012832245913936278,
"loss": 0.3256,
"step": 14090
},
{
"epoch": 1.7752621469282364,
"grad_norm": 0.26422828435897827,
"learning_rate": 0.00012821369613701808,
"loss": 0.2983,
"step": 14095
},
{
"epoch": 1.7758919293384134,
"grad_norm": 0.23418962955474854,
"learning_rate": 0.00012810494483441614,
"loss": 0.3024,
"step": 14100
},
{
"epoch": 1.776521711748591,
"grad_norm": 0.300912082195282,
"learning_rate": 0.000127996205289959,
"loss": 0.3001,
"step": 14105
},
{
"epoch": 1.777151494158768,
"grad_norm": 0.2872162461280823,
"learning_rate": 0.00012788747756204222,
"loss": 0.3074,
"step": 14110
},
{
"epoch": 1.7777812765689456,
"grad_norm": 0.2784421145915985,
"learning_rate": 0.00012777876170905515,
"loss": 0.2978,
"step": 14115
},
{
"epoch": 1.7784110589791227,
"grad_norm": 0.28062257170677185,
"learning_rate": 0.00012767005778938062,
"loss": 0.2993,
"step": 14120
},
{
"epoch": 1.7790408413893,
"grad_norm": 0.3496231734752655,
"learning_rate": 0.0001275613658613951,
"loss": 0.3147,
"step": 14125
},
{
"epoch": 1.7796706237994773,
"grad_norm": 0.2595261037349701,
"learning_rate": 0.00012745268598346864,
"loss": 0.2943,
"step": 14130
},
{
"epoch": 1.7803004062096546,
"grad_norm": 0.2795499563217163,
"learning_rate": 0.00012734401821396486,
"loss": 0.3123,
"step": 14135
},
{
"epoch": 1.7809301886198319,
"grad_norm": 0.2615763247013092,
"learning_rate": 0.0001272353626112408,
"loss": 0.3059,
"step": 14140
},
{
"epoch": 1.7815599710300092,
"grad_norm": 0.2783886790275574,
"learning_rate": 0.00012712671923364706,
"loss": 0.3134,
"step": 14145
},
{
"epoch": 1.7821897534401865,
"grad_norm": 0.2884584367275238,
"learning_rate": 0.0001270180881395276,
"loss": 0.3151,
"step": 14150
},
{
"epoch": 1.7828195358503636,
"grad_norm": 0.2677745521068573,
"learning_rate": 0.0001269094693872197,
"loss": 0.3146,
"step": 14155
},
{
"epoch": 1.783449318260541,
"grad_norm": 0.25956082344055176,
"learning_rate": 0.0001268008630350542,
"loss": 0.3118,
"step": 14160
},
{
"epoch": 1.7840791006707182,
"grad_norm": 0.2646723985671997,
"learning_rate": 0.0001266922691413552,
"loss": 0.2861,
"step": 14165
},
{
"epoch": 1.7847088830808957,
"grad_norm": 0.29946067929267883,
"learning_rate": 0.00012658368776444004,
"loss": 0.3349,
"step": 14170
},
{
"epoch": 1.7853386654910728,
"grad_norm": 0.24171167612075806,
"learning_rate": 0.00012647511896261943,
"loss": 0.2805,
"step": 14175
},
{
"epoch": 1.78596844790125,
"grad_norm": 0.26428696513175964,
"learning_rate": 0.0001263665627941973,
"loss": 0.3231,
"step": 14180
},
{
"epoch": 1.7865982303114274,
"grad_norm": 0.2787708044052124,
"learning_rate": 0.0001262580193174709,
"loss": 0.2961,
"step": 14185
},
{
"epoch": 1.7872280127216047,
"grad_norm": 0.2826111614704132,
"learning_rate": 0.00012614948859073036,
"loss": 0.3343,
"step": 14190
},
{
"epoch": 1.787857795131782,
"grad_norm": 0.278361052274704,
"learning_rate": 0.00012604097067225927,
"loss": 0.2919,
"step": 14195
},
{
"epoch": 1.7884875775419593,
"grad_norm": 0.24778404831886292,
"learning_rate": 0.00012593246562033419,
"loss": 0.316,
"step": 14200
},
{
"epoch": 1.7891173599521366,
"grad_norm": 0.28171002864837646,
"learning_rate": 0.00012582397349322484,
"loss": 0.3076,
"step": 14205
},
{
"epoch": 1.7897471423623137,
"grad_norm": 0.26361143589019775,
"learning_rate": 0.00012571549434919392,
"loss": 0.2953,
"step": 14210
},
{
"epoch": 1.7903769247724912,
"grad_norm": 0.27602389454841614,
"learning_rate": 0.0001256070282464973,
"loss": 0.3266,
"step": 14215
},
{
"epoch": 1.7910067071826683,
"grad_norm": 0.2887786328792572,
"learning_rate": 0.00012549857524338378,
"loss": 0.3166,
"step": 14220
},
{
"epoch": 1.7916364895928458,
"grad_norm": 0.272359162569046,
"learning_rate": 0.00012539013539809493,
"loss": 0.3053,
"step": 14225
},
{
"epoch": 1.792266272003023,
"grad_norm": 0.2615000903606415,
"learning_rate": 0.00012528170876886555,
"loss": 0.2974,
"step": 14230
},
{
"epoch": 1.7928960544132002,
"grad_norm": 0.2882770597934723,
"learning_rate": 0.00012517329541392316,
"loss": 0.301,
"step": 14235
},
{
"epoch": 1.7935258368233775,
"grad_norm": 0.29980406165122986,
"learning_rate": 0.00012506489539148823,
"loss": 0.3009,
"step": 14240
},
{
"epoch": 1.7941556192335548,
"grad_norm": 0.2714889943599701,
"learning_rate": 0.0001249565087597741,
"loss": 0.2897,
"step": 14245
},
{
"epoch": 1.7947854016437321,
"grad_norm": 0.3578423261642456,
"learning_rate": 0.00012484813557698678,
"loss": 0.3021,
"step": 14250
},
{
"epoch": 1.7954151840539094,
"grad_norm": 0.29889971017837524,
"learning_rate": 0.00012473977590132524,
"loss": 0.3039,
"step": 14255
},
{
"epoch": 1.7960449664640867,
"grad_norm": 0.27244943380355835,
"learning_rate": 0.000124631429790981,
"loss": 0.3068,
"step": 14260
},
{
"epoch": 1.7966747488742638,
"grad_norm": 0.2793833613395691,
"learning_rate": 0.00012452309730413843,
"loss": 0.3081,
"step": 14265
},
{
"epoch": 1.7973045312844413,
"grad_norm": 0.27198326587677,
"learning_rate": 0.00012441477849897461,
"loss": 0.2957,
"step": 14270
},
{
"epoch": 1.7979343136946184,
"grad_norm": 0.24795940518379211,
"learning_rate": 0.0001243064734336591,
"loss": 0.3094,
"step": 14275
},
{
"epoch": 1.798564096104796,
"grad_norm": 0.29008451104164124,
"learning_rate": 0.0001241981821663543,
"loss": 0.3306,
"step": 14280
},
{
"epoch": 1.799193878514973,
"grad_norm": 0.24478363990783691,
"learning_rate": 0.00012408990475521508,
"loss": 0.291,
"step": 14285
},
{
"epoch": 1.7998236609251503,
"grad_norm": 0.2566664218902588,
"learning_rate": 0.00012398164125838881,
"loss": 0.3087,
"step": 14290
},
{
"epoch": 1.8004534433353276,
"grad_norm": 0.24992555379867554,
"learning_rate": 0.00012387339173401552,
"loss": 0.318,
"step": 14295
},
{
"epoch": 1.801083225745505,
"grad_norm": 0.244164377450943,
"learning_rate": 0.00012376515624022767,
"loss": 0.3096,
"step": 14300
},
{
"epoch": 1.8017130081556822,
"grad_norm": 0.2495235651731491,
"learning_rate": 0.00012365693483515016,
"loss": 0.283,
"step": 14305
},
{
"epoch": 1.8023427905658593,
"grad_norm": 0.2685554027557373,
"learning_rate": 0.00012354872757690038,
"loss": 0.3359,
"step": 14310
},
{
"epoch": 1.8029725729760369,
"grad_norm": 0.23964886367321014,
"learning_rate": 0.0001234405345235881,
"loss": 0.3074,
"step": 14315
},
{
"epoch": 1.803602355386214,
"grad_norm": 0.24736544489860535,
"learning_rate": 0.00012333235573331556,
"loss": 0.2891,
"step": 14320
},
{
"epoch": 1.8042321377963915,
"grad_norm": 0.2994007170200348,
"learning_rate": 0.00012322419126417706,
"loss": 0.3109,
"step": 14325
},
{
"epoch": 1.8048619202065685,
"grad_norm": 0.26516586542129517,
"learning_rate": 0.0001231160411742595,
"loss": 0.2974,
"step": 14330
},
{
"epoch": 1.805491702616746,
"grad_norm": 0.27139636874198914,
"learning_rate": 0.0001230079055216419,
"loss": 0.3023,
"step": 14335
},
{
"epoch": 1.8061214850269232,
"grad_norm": 0.26109209656715393,
"learning_rate": 0.00012289978436439558,
"loss": 0.3059,
"step": 14340
},
{
"epoch": 1.8067512674371005,
"grad_norm": 0.29744458198547363,
"learning_rate": 0.0001227916777605841,
"loss": 0.3088,
"step": 14345
},
{
"epoch": 1.8073810498472778,
"grad_norm": 0.27332085371017456,
"learning_rate": 0.0001226835857682632,
"loss": 0.2888,
"step": 14350
},
{
"epoch": 1.808010832257455,
"grad_norm": 0.2586978077888489,
"learning_rate": 0.00012257550844548074,
"loss": 0.328,
"step": 14355
},
{
"epoch": 1.8086406146676324,
"grad_norm": 0.29042935371398926,
"learning_rate": 0.00012246744585027667,
"loss": 0.3113,
"step": 14360
},
{
"epoch": 1.8092703970778095,
"grad_norm": 0.271710067987442,
"learning_rate": 0.000122359398040683,
"loss": 0.2888,
"step": 14365
},
{
"epoch": 1.809900179487987,
"grad_norm": 0.2969205379486084,
"learning_rate": 0.00012225136507472406,
"loss": 0.312,
"step": 14370
},
{
"epoch": 1.810529961898164,
"grad_norm": 0.301145076751709,
"learning_rate": 0.00012214334701041586,
"loss": 0.2952,
"step": 14375
},
{
"epoch": 1.8111597443083416,
"grad_norm": 0.250630259513855,
"learning_rate": 0.00012203534390576666,
"loss": 0.3073,
"step": 14380
},
{
"epoch": 1.8117895267185187,
"grad_norm": 0.24282781779766083,
"learning_rate": 0.00012192735581877654,
"loss": 0.2863,
"step": 14385
},
{
"epoch": 1.8124193091286962,
"grad_norm": 0.2824462652206421,
"learning_rate": 0.00012181938280743769,
"loss": 0.2999,
"step": 14390
},
{
"epoch": 1.8130490915388733,
"grad_norm": 0.2740934491157532,
"learning_rate": 0.00012171142492973388,
"loss": 0.3131,
"step": 14395
},
{
"epoch": 1.8136788739490506,
"grad_norm": 0.23533669114112854,
"learning_rate": 0.00012160348224364109,
"loss": 0.2846,
"step": 14400
},
{
"epoch": 1.814308656359228,
"grad_norm": 0.26320409774780273,
"learning_rate": 0.00012149555480712697,
"loss": 0.2954,
"step": 14405
},
{
"epoch": 1.8149384387694052,
"grad_norm": 0.2816338837146759,
"learning_rate": 0.00012138764267815105,
"loss": 0.2811,
"step": 14410
},
{
"epoch": 1.8155682211795825,
"grad_norm": 0.23801551759243011,
"learning_rate": 0.00012127974591466455,
"loss": 0.2846,
"step": 14415
},
{
"epoch": 1.8161980035897596,
"grad_norm": 0.3131721317768097,
"learning_rate": 0.00012117186457461056,
"loss": 0.2969,
"step": 14420
},
{
"epoch": 1.816827785999937,
"grad_norm": 0.2892078757286072,
"learning_rate": 0.00012106399871592385,
"loss": 0.3,
"step": 14425
},
{
"epoch": 1.8174575684101142,
"grad_norm": 0.253273606300354,
"learning_rate": 0.00012095614839653074,
"loss": 0.3005,
"step": 14430
},
{
"epoch": 1.8180873508202917,
"grad_norm": 0.2675528824329376,
"learning_rate": 0.00012084831367434937,
"loss": 0.2947,
"step": 14435
},
{
"epoch": 1.8187171332304688,
"grad_norm": 0.2665347158908844,
"learning_rate": 0.00012074049460728945,
"loss": 0.3012,
"step": 14440
},
{
"epoch": 1.8193469156406463,
"grad_norm": 0.2987824082374573,
"learning_rate": 0.00012063269125325228,
"loss": 0.2986,
"step": 14445
},
{
"epoch": 1.8199766980508234,
"grad_norm": 0.2429313212633133,
"learning_rate": 0.00012052490367013076,
"loss": 0.3035,
"step": 14450
},
{
"epoch": 1.8206064804610007,
"grad_norm": 0.28424081206321716,
"learning_rate": 0.00012041713191580925,
"loss": 0.2948,
"step": 14455
},
{
"epoch": 1.821236262871178,
"grad_norm": 0.25087571144104004,
"learning_rate": 0.00012030937604816365,
"loss": 0.2949,
"step": 14460
},
{
"epoch": 1.8218660452813553,
"grad_norm": 0.23633217811584473,
"learning_rate": 0.00012020163612506127,
"loss": 0.2669,
"step": 14465
},
{
"epoch": 1.8224958276915326,
"grad_norm": 0.26396888494491577,
"learning_rate": 0.000120093912204361,
"loss": 0.2912,
"step": 14470
},
{
"epoch": 1.8231256101017097,
"grad_norm": 0.2898525297641754,
"learning_rate": 0.00011998620434391299,
"loss": 0.319,
"step": 14475
},
{
"epoch": 1.8237553925118872,
"grad_norm": 0.25507113337516785,
"learning_rate": 0.00011987851260155881,
"loss": 0.3028,
"step": 14480
},
{
"epoch": 1.8243851749220643,
"grad_norm": 0.2405284345149994,
"learning_rate": 0.00011977083703513145,
"loss": 0.2879,
"step": 14485
},
{
"epoch": 1.8250149573322418,
"grad_norm": 0.27114009857177734,
"learning_rate": 0.00011966317770245507,
"loss": 0.3094,
"step": 14490
},
{
"epoch": 1.825644739742419,
"grad_norm": 0.2708043158054352,
"learning_rate": 0.0001195555346613453,
"loss": 0.3062,
"step": 14495
},
{
"epoch": 1.8262745221525962,
"grad_norm": 0.2507513463497162,
"learning_rate": 0.00011944790796960878,
"loss": 0.2832,
"step": 14500
},
{
"epoch": 1.8269043045627735,
"grad_norm": 0.2864154577255249,
"learning_rate": 0.0001193402976850436,
"loss": 0.3067,
"step": 14505
},
{
"epoch": 1.8275340869729508,
"grad_norm": 0.26530271768569946,
"learning_rate": 0.00011923270386543886,
"loss": 0.2816,
"step": 14510
},
{
"epoch": 1.8281638693831281,
"grad_norm": 0.24444885551929474,
"learning_rate": 0.00011912512656857498,
"loss": 0.2993,
"step": 14515
},
{
"epoch": 1.8287936517933054,
"grad_norm": 0.2591851055622101,
"learning_rate": 0.00011901756585222334,
"loss": 0.2926,
"step": 14520
},
{
"epoch": 1.8294234342034827,
"grad_norm": 0.2942061424255371,
"learning_rate": 0.0001189100217741466,
"loss": 0.3032,
"step": 14525
},
{
"epoch": 1.8300532166136598,
"grad_norm": 0.28199318051338196,
"learning_rate": 0.00011880249439209836,
"loss": 0.291,
"step": 14530
},
{
"epoch": 1.8306829990238374,
"grad_norm": 0.2743484377861023,
"learning_rate": 0.00011869498376382324,
"loss": 0.3101,
"step": 14535
},
{
"epoch": 1.8313127814340144,
"grad_norm": 0.24012960493564606,
"learning_rate": 0.00011858748994705689,
"loss": 0.3,
"step": 14540
},
{
"epoch": 1.831942563844192,
"grad_norm": 0.2856425344944,
"learning_rate": 0.00011848001299952598,
"loss": 0.3042,
"step": 14545
},
{
"epoch": 1.832572346254369,
"grad_norm": 0.2720118463039398,
"learning_rate": 0.00011837255297894808,
"loss": 0.298,
"step": 14550
},
{
"epoch": 1.8332021286645463,
"grad_norm": 0.26973578333854675,
"learning_rate": 0.0001182651099430317,
"loss": 0.2734,
"step": 14555
},
{
"epoch": 1.8338319110747237,
"grad_norm": 0.35720425844192505,
"learning_rate": 0.00011815768394947616,
"loss": 0.3174,
"step": 14560
},
{
"epoch": 1.834461693484901,
"grad_norm": 0.2649666666984558,
"learning_rate": 0.00011805027505597178,
"loss": 0.3009,
"step": 14565
},
{
"epoch": 1.8350914758950783,
"grad_norm": 0.2809504270553589,
"learning_rate": 0.00011794288332019939,
"loss": 0.3075,
"step": 14570
},
{
"epoch": 1.8357212583052556,
"grad_norm": 0.247705340385437,
"learning_rate": 0.00011783550879983097,
"loss": 0.2929,
"step": 14575
},
{
"epoch": 1.8363510407154329,
"grad_norm": 0.3010486364364624,
"learning_rate": 0.00011772815155252901,
"loss": 0.2923,
"step": 14580
},
{
"epoch": 1.83698082312561,
"grad_norm": 0.29634296894073486,
"learning_rate": 0.00011762081163594686,
"loss": 0.2956,
"step": 14585
},
{
"epoch": 1.8376106055357875,
"grad_norm": 0.3235035538673401,
"learning_rate": 0.00011751348910772844,
"loss": 0.299,
"step": 14590
},
{
"epoch": 1.8382403879459646,
"grad_norm": 0.27069565653800964,
"learning_rate": 0.00011740618402550849,
"loss": 0.2885,
"step": 14595
},
{
"epoch": 1.838870170356142,
"grad_norm": 0.26986175775527954,
"learning_rate": 0.00011729889644691227,
"loss": 0.2974,
"step": 14600
},
{
"epoch": 1.8394999527663192,
"grad_norm": 0.24633704125881195,
"learning_rate": 0.00011719162642955559,
"loss": 0.3011,
"step": 14605
},
{
"epoch": 1.8401297351764965,
"grad_norm": 0.2659735381603241,
"learning_rate": 0.00011708437403104491,
"loss": 0.2802,
"step": 14610
},
{
"epoch": 1.8407595175866738,
"grad_norm": 0.2634638547897339,
"learning_rate": 0.00011697713930897728,
"loss": 0.2834,
"step": 14615
},
{
"epoch": 1.841389299996851,
"grad_norm": 0.2891436517238617,
"learning_rate": 0.00011686992232094012,
"loss": 0.2892,
"step": 14620
},
{
"epoch": 1.8420190824070284,
"grad_norm": 0.26533305644989014,
"learning_rate": 0.0001167627231245115,
"loss": 0.2954,
"step": 14625
},
{
"epoch": 1.8426488648172057,
"grad_norm": 0.26114416122436523,
"learning_rate": 0.00011665554177725977,
"loss": 0.2936,
"step": 14630
},
{
"epoch": 1.843278647227383,
"grad_norm": 0.24782754480838776,
"learning_rate": 0.00011654837833674379,
"loss": 0.283,
"step": 14635
},
{
"epoch": 1.84390842963756,
"grad_norm": 0.2653804123401642,
"learning_rate": 0.00011644123286051274,
"loss": 0.2911,
"step": 14640
},
{
"epoch": 1.8445382120477376,
"grad_norm": 0.2524818181991577,
"learning_rate": 0.00011633410540610621,
"loss": 0.29,
"step": 14645
},
{
"epoch": 1.8451679944579147,
"grad_norm": 0.2844378352165222,
"learning_rate": 0.00011622699603105404,
"loss": 0.298,
"step": 14650
},
{
"epoch": 1.8457977768680922,
"grad_norm": 0.2608543038368225,
"learning_rate": 0.0001161199047928765,
"loss": 0.2807,
"step": 14655
},
{
"epoch": 1.8464275592782693,
"grad_norm": 0.2596459984779358,
"learning_rate": 0.000116012831749084,
"loss": 0.29,
"step": 14660
},
{
"epoch": 1.8470573416884466,
"grad_norm": 0.2654721140861511,
"learning_rate": 0.00011590577695717717,
"loss": 0.2878,
"step": 14665
},
{
"epoch": 1.847687124098624,
"grad_norm": 0.283388614654541,
"learning_rate": 0.00011579874047464696,
"loss": 0.2751,
"step": 14670
},
{
"epoch": 1.8483169065088012,
"grad_norm": 0.24917341768741608,
"learning_rate": 0.00011569172235897433,
"loss": 0.3,
"step": 14675
},
{
"epoch": 1.8489466889189785,
"grad_norm": 0.2464076280593872,
"learning_rate": 0.00011558472266763049,
"loss": 0.2848,
"step": 14680
},
{
"epoch": 1.8495764713291558,
"grad_norm": 0.2884039282798767,
"learning_rate": 0.00011547774145807665,
"loss": 0.2698,
"step": 14685
},
{
"epoch": 1.8502062537393331,
"grad_norm": 0.2762083411216736,
"learning_rate": 0.00011537077878776425,
"loss": 0.3151,
"step": 14690
},
{
"epoch": 1.8508360361495102,
"grad_norm": 0.22906774282455444,
"learning_rate": 0.00011526383471413463,
"loss": 0.2669,
"step": 14695
},
{
"epoch": 1.8514658185596877,
"grad_norm": 0.28603047132492065,
"learning_rate": 0.00011515690929461928,
"loss": 0.2922,
"step": 14700
},
{
"epoch": 1.8520956009698648,
"grad_norm": 0.26245948672294617,
"learning_rate": 0.00011505000258663954,
"loss": 0.3095,
"step": 14705
},
{
"epoch": 1.8527253833800423,
"grad_norm": 0.2754320800304413,
"learning_rate": 0.00011494311464760673,
"loss": 0.2843,
"step": 14710
},
{
"epoch": 1.8533551657902194,
"grad_norm": 0.24283255636692047,
"learning_rate": 0.00011483624553492212,
"loss": 0.3039,
"step": 14715
},
{
"epoch": 1.8539849482003967,
"grad_norm": 0.299950510263443,
"learning_rate": 0.00011472939530597691,
"loss": 0.3108,
"step": 14720
},
{
"epoch": 1.854614730610574,
"grad_norm": 0.23872928321361542,
"learning_rate": 0.00011462256401815205,
"loss": 0.3221,
"step": 14725
},
{
"epoch": 1.8552445130207513,
"grad_norm": 0.32674193382263184,
"learning_rate": 0.00011451575172881845,
"loss": 0.3066,
"step": 14730
},
{
"epoch": 1.8558742954309286,
"grad_norm": 0.2620803415775299,
"learning_rate": 0.00011440895849533675,
"loss": 0.2855,
"step": 14735
},
{
"epoch": 1.856504077841106,
"grad_norm": 0.2653051018714905,
"learning_rate": 0.0001143021843750573,
"loss": 0.2827,
"step": 14740
},
{
"epoch": 1.8571338602512832,
"grad_norm": 0.29697105288505554,
"learning_rate": 0.00011419542942532023,
"loss": 0.2848,
"step": 14745
},
{
"epoch": 1.8577636426614603,
"grad_norm": 0.26711151003837585,
"learning_rate": 0.00011408869370345545,
"loss": 0.301,
"step": 14750
},
{
"epoch": 1.8583934250716379,
"grad_norm": 0.26371288299560547,
"learning_rate": 0.0001139819772667824,
"loss": 0.2994,
"step": 14755
},
{
"epoch": 1.859023207481815,
"grad_norm": 0.34920862317085266,
"learning_rate": 0.00011387528017261035,
"loss": 0.2968,
"step": 14760
},
{
"epoch": 1.8596529898919925,
"grad_norm": 0.2951182723045349,
"learning_rate": 0.000113768602478238,
"loss": 0.2935,
"step": 14765
},
{
"epoch": 1.8602827723021695,
"grad_norm": 0.22442401945590973,
"learning_rate": 0.00011366194424095381,
"loss": 0.2952,
"step": 14770
},
{
"epoch": 1.8609125547123468,
"grad_norm": 0.26102137565612793,
"learning_rate": 0.00011355530551803553,
"loss": 0.2823,
"step": 14775
},
{
"epoch": 1.8615423371225241,
"grad_norm": 0.27118000388145447,
"learning_rate": 0.0001134486863667507,
"loss": 0.286,
"step": 14780
},
{
"epoch": 1.8621721195327015,
"grad_norm": 0.2869999408721924,
"learning_rate": 0.00011334208684435617,
"loss": 0.2734,
"step": 14785
},
{
"epoch": 1.8628019019428788,
"grad_norm": 0.32944396138191223,
"learning_rate": 0.0001132355070080984,
"loss": 0.3038,
"step": 14790
},
{
"epoch": 1.863431684353056,
"grad_norm": 0.28535759449005127,
"learning_rate": 0.00011312894691521312,
"loss": 0.3213,
"step": 14795
},
{
"epoch": 1.8640614667632334,
"grad_norm": 0.23751592636108398,
"learning_rate": 0.00011302240662292561,
"loss": 0.2972,
"step": 14800
},
{
"epoch": 1.8646912491734104,
"grad_norm": 0.2352185994386673,
"learning_rate": 0.00011291588618845043,
"loss": 0.2772,
"step": 14805
},
{
"epoch": 1.865321031583588,
"grad_norm": 0.24066108465194702,
"learning_rate": 0.00011280938566899142,
"loss": 0.3053,
"step": 14810
},
{
"epoch": 1.865950813993765,
"grad_norm": 0.33842501044273376,
"learning_rate": 0.0001127029051217418,
"loss": 0.2992,
"step": 14815
},
{
"epoch": 1.8665805964039426,
"grad_norm": 0.24873322248458862,
"learning_rate": 0.00011259644460388412,
"loss": 0.2887,
"step": 14820
},
{
"epoch": 1.8672103788141197,
"grad_norm": 0.27127575874328613,
"learning_rate": 0.00011249000417259005,
"loss": 0.2619,
"step": 14825
},
{
"epoch": 1.867840161224297,
"grad_norm": 0.28289374709129333,
"learning_rate": 0.00011238358388502059,
"loss": 0.2815,
"step": 14830
},
{
"epoch": 1.8684699436344743,
"grad_norm": 0.29764994978904724,
"learning_rate": 0.00011227718379832583,
"loss": 0.3006,
"step": 14835
},
{
"epoch": 1.8690997260446516,
"grad_norm": 0.2869538366794586,
"learning_rate": 0.00011217080396964507,
"loss": 0.2707,
"step": 14840
},
{
"epoch": 1.8697295084548289,
"grad_norm": 0.2732262909412384,
"learning_rate": 0.00011206444445610663,
"loss": 0.2768,
"step": 14845
},
{
"epoch": 1.8703592908650062,
"grad_norm": 0.3032742738723755,
"learning_rate": 0.0001119581053148281,
"loss": 0.2715,
"step": 14850
},
{
"epoch": 1.8709890732751835,
"grad_norm": 0.26171359419822693,
"learning_rate": 0.00011185178660291594,
"loss": 0.2701,
"step": 14855
},
{
"epoch": 1.8716188556853606,
"grad_norm": 0.30940353870391846,
"learning_rate": 0.00011174548837746581,
"loss": 0.2843,
"step": 14860
},
{
"epoch": 1.872248638095538,
"grad_norm": 0.2774769067764282,
"learning_rate": 0.00011163921069556224,
"loss": 0.2951,
"step": 14865
},
{
"epoch": 1.8728784205057152,
"grad_norm": 0.29232633113861084,
"learning_rate": 0.00011153295361427876,
"loss": 0.2938,
"step": 14870
},
{
"epoch": 1.8735082029158927,
"grad_norm": 0.28283149003982544,
"learning_rate": 0.00011142671719067793,
"loss": 0.2875,
"step": 14875
},
{
"epoch": 1.8741379853260698,
"grad_norm": 0.24245183169841766,
"learning_rate": 0.00011132050148181103,
"loss": 0.2829,
"step": 14880
},
{
"epoch": 1.874767767736247,
"grad_norm": 0.27495938539505005,
"learning_rate": 0.00011121430654471837,
"loss": 0.2923,
"step": 14885
},
{
"epoch": 1.8753975501464244,
"grad_norm": 0.3106895089149475,
"learning_rate": 0.00011110813243642906,
"loss": 0.2855,
"step": 14890
},
{
"epoch": 1.8760273325566017,
"grad_norm": 0.263810396194458,
"learning_rate": 0.00011100197921396102,
"loss": 0.271,
"step": 14895
},
{
"epoch": 1.876657114966779,
"grad_norm": 0.23044048249721527,
"learning_rate": 0.00011089584693432091,
"loss": 0.2608,
"step": 14900
},
{
"epoch": 1.8772868973769563,
"grad_norm": 0.29268765449523926,
"learning_rate": 0.00011078973565450427,
"loss": 0.2835,
"step": 14905
},
{
"epoch": 1.8779166797871336,
"grad_norm": 0.2691350281238556,
"learning_rate": 0.00011068364543149527,
"loss": 0.291,
"step": 14910
},
{
"epoch": 1.8785464621973107,
"grad_norm": 0.26748213171958923,
"learning_rate": 0.00011057757632226672,
"loss": 0.2994,
"step": 14915
},
{
"epoch": 1.8791762446074882,
"grad_norm": 0.2624029815196991,
"learning_rate": 0.00011047152838378018,
"loss": 0.2832,
"step": 14920
},
{
"epoch": 1.8798060270176653,
"grad_norm": 0.2670036554336548,
"learning_rate": 0.00011036550167298583,
"loss": 0.284,
"step": 14925
},
{
"epoch": 1.8804358094278428,
"grad_norm": 0.2848396599292755,
"learning_rate": 0.0001102594962468224,
"loss": 0.2831,
"step": 14930
},
{
"epoch": 1.88106559183802,
"grad_norm": 0.2502748668193817,
"learning_rate": 0.0001101535121622173,
"loss": 0.3038,
"step": 14935
},
{
"epoch": 1.8816953742481972,
"grad_norm": 0.2998834252357483,
"learning_rate": 0.0001100475494760863,
"loss": 0.2847,
"step": 14940
},
{
"epoch": 1.8823251566583745,
"grad_norm": 0.229685977101326,
"learning_rate": 0.00010994160824533398,
"loss": 0.261,
"step": 14945
},
{
"epoch": 1.8829549390685518,
"grad_norm": 0.26833808422088623,
"learning_rate": 0.00010983568852685294,
"loss": 0.2923,
"step": 14950
},
{
"epoch": 1.8835847214787291,
"grad_norm": 0.2380465716123581,
"learning_rate": 0.00010972979037752465,
"loss": 0.2664,
"step": 14955
},
{
"epoch": 1.8842145038889064,
"grad_norm": 0.2505188286304474,
"learning_rate": 0.00010962391385421876,
"loss": 0.2914,
"step": 14960
},
{
"epoch": 1.8848442862990837,
"grad_norm": 0.33335885405540466,
"learning_rate": 0.00010951805901379346,
"loss": 0.3092,
"step": 14965
},
{
"epoch": 1.8854740687092608,
"grad_norm": 0.22425580024719238,
"learning_rate": 0.0001094122259130951,
"loss": 0.2583,
"step": 14970
},
{
"epoch": 1.8861038511194383,
"grad_norm": 0.25008514523506165,
"learning_rate": 0.00010930641460895863,
"loss": 0.2936,
"step": 14975
},
{
"epoch": 1.8867336335296154,
"grad_norm": 0.2543163299560547,
"learning_rate": 0.00010920062515820707,
"loss": 0.2855,
"step": 14980
},
{
"epoch": 1.887363415939793,
"grad_norm": 0.25144490599632263,
"learning_rate": 0.00010909485761765172,
"loss": 0.2788,
"step": 14985
},
{
"epoch": 1.88799319834997,
"grad_norm": 0.23470145463943481,
"learning_rate": 0.00010898911204409218,
"loss": 0.2709,
"step": 14990
},
{
"epoch": 1.8886229807601473,
"grad_norm": 0.27916932106018066,
"learning_rate": 0.00010888338849431629,
"loss": 0.279,
"step": 14995
},
{
"epoch": 1.8892527631703246,
"grad_norm": 0.24980424344539642,
"learning_rate": 0.00010877768702509996,
"loss": 0.2982,
"step": 15000
},
{
"epoch": 1.8892527631703246,
"eval_loss": 0.3032541871070862,
"eval_runtime": 6.1659,
"eval_samples_per_second": 162.182,
"eval_steps_per_second": 10.217,
"step": 15000
},
{
"epoch": 1.889882545580502,
"grad_norm": 0.24535268545150757,
"learning_rate": 0.00010867200769320732,
"loss": 0.2667,
"step": 15005
},
{
"epoch": 1.8905123279906793,
"grad_norm": 0.2690826654434204,
"learning_rate": 0.0001085663505553906,
"loss": 0.2703,
"step": 15010
},
{
"epoch": 1.8911421104008566,
"grad_norm": 0.2511346936225891,
"learning_rate": 0.00010846071566839008,
"loss": 0.3011,
"step": 15015
},
{
"epoch": 1.8917718928110339,
"grad_norm": 0.28077587485313416,
"learning_rate": 0.00010835510308893407,
"loss": 0.285,
"step": 15020
},
{
"epoch": 1.892401675221211,
"grad_norm": 0.309238463640213,
"learning_rate": 0.000108249512873739,
"loss": 0.2894,
"step": 15025
},
{
"epoch": 1.8930314576313885,
"grad_norm": 0.26940178871154785,
"learning_rate": 0.00010814394507950917,
"loss": 0.2864,
"step": 15030
},
{
"epoch": 1.8936612400415656,
"grad_norm": 0.27850431203842163,
"learning_rate": 0.00010803839976293694,
"loss": 0.2716,
"step": 15035
},
{
"epoch": 1.894291022451743,
"grad_norm": 0.24114792048931122,
"learning_rate": 0.00010793287698070256,
"loss": 0.2695,
"step": 15040
},
{
"epoch": 1.8949208048619202,
"grad_norm": 0.3137163817882538,
"learning_rate": 0.0001078273767894741,
"loss": 0.3063,
"step": 15045
},
{
"epoch": 1.8955505872720975,
"grad_norm": 0.27090078592300415,
"learning_rate": 0.00010772189924590773,
"loss": 0.2643,
"step": 15050
},
{
"epoch": 1.8961803696822748,
"grad_norm": 0.27956193685531616,
"learning_rate": 0.00010761644440664714,
"loss": 0.271,
"step": 15055
},
{
"epoch": 1.896810152092452,
"grad_norm": 0.24823328852653503,
"learning_rate": 0.00010751101232832401,
"loss": 0.2849,
"step": 15060
},
{
"epoch": 1.8974399345026294,
"grad_norm": 0.2675158977508545,
"learning_rate": 0.00010740560306755787,
"loss": 0.2744,
"step": 15065
},
{
"epoch": 1.8980697169128065,
"grad_norm": 0.2589218318462372,
"learning_rate": 0.0001073002166809558,
"loss": 0.2834,
"step": 15070
},
{
"epoch": 1.898699499322984,
"grad_norm": 0.277705579996109,
"learning_rate": 0.00010719485322511273,
"loss": 0.2826,
"step": 15075
},
{
"epoch": 1.899329281733161,
"grad_norm": 0.23539955914020538,
"learning_rate": 0.0001070895127566113,
"loss": 0.2589,
"step": 15080
},
{
"epoch": 1.8999590641433386,
"grad_norm": 0.3010064661502838,
"learning_rate": 0.00010698419533202172,
"loss": 0.2804,
"step": 15085
},
{
"epoch": 1.9005888465535157,
"grad_norm": 0.25453826785087585,
"learning_rate": 0.00010687890100790175,
"loss": 0.2863,
"step": 15090
},
{
"epoch": 1.9012186289636932,
"grad_norm": 0.2774878144264221,
"learning_rate": 0.00010677362984079699,
"loss": 0.2933,
"step": 15095
},
{
"epoch": 1.9018484113738703,
"grad_norm": 0.26002323627471924,
"learning_rate": 0.00010666838188724038,
"loss": 0.2891,
"step": 15100
},
{
"epoch": 1.9024781937840476,
"grad_norm": 0.25788870453834534,
"learning_rate": 0.00010656315720375246,
"loss": 0.2934,
"step": 15105
},
{
"epoch": 1.903107976194225,
"grad_norm": 0.24301236867904663,
"learning_rate": 0.00010645795584684138,
"loss": 0.2848,
"step": 15110
},
{
"epoch": 1.9037377586044022,
"grad_norm": 0.309514582157135,
"learning_rate": 0.00010635277787300256,
"loss": 0.2846,
"step": 15115
},
{
"epoch": 1.9043675410145795,
"grad_norm": 0.274870902299881,
"learning_rate": 0.00010624762333871913,
"loss": 0.2956,
"step": 15120
},
{
"epoch": 1.9049973234247566,
"grad_norm": 0.24861137568950653,
"learning_rate": 0.00010614249230046129,
"loss": 0.2777,
"step": 15125
},
{
"epoch": 1.905627105834934,
"grad_norm": 0.26125532388687134,
"learning_rate": 0.00010603738481468693,
"loss": 0.2794,
"step": 15130
},
{
"epoch": 1.9062568882451112,
"grad_norm": 0.24094760417938232,
"learning_rate": 0.0001059323009378411,
"loss": 0.2633,
"step": 15135
},
{
"epoch": 1.9068866706552887,
"grad_norm": 0.3418034315109253,
"learning_rate": 0.0001058272407263563,
"loss": 0.3045,
"step": 15140
},
{
"epoch": 1.9075164530654658,
"grad_norm": 0.2657215893268585,
"learning_rate": 0.00010572220423665222,
"loss": 0.3085,
"step": 15145
},
{
"epoch": 1.9081462354756433,
"grad_norm": 0.23728597164154053,
"learning_rate": 0.00010561719152513591,
"loss": 0.2788,
"step": 15150
},
{
"epoch": 1.9087760178858204,
"grad_norm": 0.2741139829158783,
"learning_rate": 0.0001055122026482016,
"loss": 0.2855,
"step": 15155
},
{
"epoch": 1.9094058002959977,
"grad_norm": 0.2415517419576645,
"learning_rate": 0.00010540723766223064,
"loss": 0.2799,
"step": 15160
},
{
"epoch": 1.910035582706175,
"grad_norm": 0.2724277675151825,
"learning_rate": 0.00010530229662359162,
"loss": 0.2821,
"step": 15165
},
{
"epoch": 1.9106653651163523,
"grad_norm": 0.28418639302253723,
"learning_rate": 0.00010519737958864036,
"loss": 0.2899,
"step": 15170
},
{
"epoch": 1.9112951475265296,
"grad_norm": 0.26423749327659607,
"learning_rate": 0.00010509248661371962,
"loss": 0.3033,
"step": 15175
},
{
"epoch": 1.9119249299367067,
"grad_norm": 0.2523916959762573,
"learning_rate": 0.00010498761775515941,
"loss": 0.2763,
"step": 15180
},
{
"epoch": 1.9125547123468842,
"grad_norm": 0.29665645956993103,
"learning_rate": 0.00010488277306927663,
"loss": 0.2918,
"step": 15185
},
{
"epoch": 1.9131844947570613,
"grad_norm": 0.2941978871822357,
"learning_rate": 0.00010477795261237537,
"loss": 0.2753,
"step": 15190
},
{
"epoch": 1.9138142771672388,
"grad_norm": 0.2701078951358795,
"learning_rate": 0.00010467315644074646,
"loss": 0.2925,
"step": 15195
},
{
"epoch": 1.914444059577416,
"grad_norm": 0.2497081160545349,
"learning_rate": 0.00010456838461066793,
"loss": 0.2669,
"step": 15200
},
{
"epoch": 1.9150738419875935,
"grad_norm": 0.2448865920305252,
"learning_rate": 0.00010446363717840462,
"loss": 0.2766,
"step": 15205
},
{
"epoch": 1.9157036243977705,
"grad_norm": 0.26188936829566956,
"learning_rate": 0.00010435891420020833,
"loss": 0.2935,
"step": 15210
},
{
"epoch": 1.9163334068079478,
"grad_norm": 0.3044489622116089,
"learning_rate": 0.00010425421573231767,
"loss": 0.2791,
"step": 15215
},
{
"epoch": 1.9169631892181251,
"grad_norm": 0.30361208319664,
"learning_rate": 0.00010414954183095813,
"loss": 0.277,
"step": 15220
},
{
"epoch": 1.9175929716283024,
"grad_norm": 0.31100359559059143,
"learning_rate": 0.00010404489255234191,
"loss": 0.2687,
"step": 15225
},
{
"epoch": 1.9182227540384797,
"grad_norm": 0.26500749588012695,
"learning_rate": 0.00010394026795266814,
"loss": 0.2804,
"step": 15230
},
{
"epoch": 1.9188525364486568,
"grad_norm": 0.33220374584198,
"learning_rate": 0.00010383566808812257,
"loss": 0.284,
"step": 15235
},
{
"epoch": 1.9194823188588344,
"grad_norm": 0.23146981000900269,
"learning_rate": 0.00010373109301487777,
"loss": 0.2949,
"step": 15240
},
{
"epoch": 1.9201121012690114,
"grad_norm": 0.24833330512046814,
"learning_rate": 0.00010362654278909292,
"loss": 0.2685,
"step": 15245
},
{
"epoch": 1.920741883679189,
"grad_norm": 0.22905099391937256,
"learning_rate": 0.00010352201746691381,
"loss": 0.248,
"step": 15250
},
{
"epoch": 1.921371666089366,
"grad_norm": 0.2544589936733246,
"learning_rate": 0.00010341751710447308,
"loss": 0.2763,
"step": 15255
},
{
"epoch": 1.9220014484995434,
"grad_norm": 0.24207763373851776,
"learning_rate": 0.0001033130417578897,
"loss": 0.2691,
"step": 15260
},
{
"epoch": 1.9226312309097207,
"grad_norm": 0.3025490939617157,
"learning_rate": 0.0001032085914832693,
"loss": 0.2902,
"step": 15265
},
{
"epoch": 1.923261013319898,
"grad_norm": 0.2563372552394867,
"learning_rate": 0.00010310416633670413,
"loss": 0.2937,
"step": 15270
},
{
"epoch": 1.9238907957300753,
"grad_norm": 0.22143816947937012,
"learning_rate": 0.00010299976637427285,
"loss": 0.2615,
"step": 15275
},
{
"epoch": 1.9245205781402526,
"grad_norm": 0.26383697986602783,
"learning_rate": 0.00010289539165204058,
"loss": 0.2834,
"step": 15280
},
{
"epoch": 1.9251503605504299,
"grad_norm": 0.2607567310333252,
"learning_rate": 0.00010279104222605903,
"loss": 0.2875,
"step": 15285
},
{
"epoch": 1.925780142960607,
"grad_norm": 0.23255427181720734,
"learning_rate": 0.0001026867181523662,
"loss": 0.2645,
"step": 15290
},
{
"epoch": 1.9264099253707845,
"grad_norm": 0.2203371226787567,
"learning_rate": 0.00010258241948698641,
"loss": 0.276,
"step": 15295
},
{
"epoch": 1.9270397077809616,
"grad_norm": 0.2557859718799591,
"learning_rate": 0.00010247814628593052,
"loss": 0.2877,
"step": 15300
},
{
"epoch": 1.927669490191139,
"grad_norm": 0.2551586925983429,
"learning_rate": 0.00010237389860519557,
"loss": 0.2678,
"step": 15305
},
{
"epoch": 1.9282992726013162,
"grad_norm": 0.2592737376689911,
"learning_rate": 0.00010226967650076495,
"loss": 0.2645,
"step": 15310
},
{
"epoch": 1.9289290550114935,
"grad_norm": 0.25076064467430115,
"learning_rate": 0.00010216548002860836,
"loss": 0.2595,
"step": 15315
},
{
"epoch": 1.9295588374216708,
"grad_norm": 0.28892189264297485,
"learning_rate": 0.0001020613092446816,
"loss": 0.2658,
"step": 15320
},
{
"epoch": 1.930188619831848,
"grad_norm": 0.28119730949401855,
"learning_rate": 0.00010195716420492692,
"loss": 0.2783,
"step": 15325
},
{
"epoch": 1.9308184022420254,
"grad_norm": 0.23143291473388672,
"learning_rate": 0.00010185304496527239,
"loss": 0.2745,
"step": 15330
},
{
"epoch": 1.9314481846522027,
"grad_norm": 0.23947221040725708,
"learning_rate": 0.00010174895158163252,
"loss": 0.2642,
"step": 15335
},
{
"epoch": 1.93207796706238,
"grad_norm": 0.27924421429634094,
"learning_rate": 0.00010164488410990779,
"loss": 0.2895,
"step": 15340
},
{
"epoch": 1.932707749472557,
"grad_norm": 0.2736763656139374,
"learning_rate": 0.00010154084260598488,
"loss": 0.2798,
"step": 15345
},
{
"epoch": 1.9333375318827346,
"grad_norm": 0.26288047432899475,
"learning_rate": 0.00010143682712573639,
"loss": 0.2799,
"step": 15350
},
{
"epoch": 1.9339673142929117,
"grad_norm": 0.2662082016468048,
"learning_rate": 0.00010133283772502105,
"loss": 0.2708,
"step": 15355
},
{
"epoch": 1.9345970967030892,
"grad_norm": 0.2595316767692566,
"learning_rate": 0.00010122887445968358,
"loss": 0.2631,
"step": 15360
},
{
"epoch": 1.9352268791132663,
"grad_norm": 0.22839054465293884,
"learning_rate": 0.00010112493738555453,
"loss": 0.2533,
"step": 15365
},
{
"epoch": 1.9358566615234436,
"grad_norm": 0.25195086002349854,
"learning_rate": 0.0001010210265584505,
"loss": 0.26,
"step": 15370
},
{
"epoch": 1.936486443933621,
"grad_norm": 0.2431613951921463,
"learning_rate": 0.00010091714203417404,
"loss": 0.2802,
"step": 15375
},
{
"epoch": 1.9371162263437982,
"grad_norm": 0.24503393471240997,
"learning_rate": 0.00010081328386851342,
"loss": 0.2968,
"step": 15380
},
{
"epoch": 1.9377460087539755,
"grad_norm": 0.26283174753189087,
"learning_rate": 0.00010070945211724298,
"loss": 0.2831,
"step": 15385
},
{
"epoch": 1.9383757911641528,
"grad_norm": 0.23644685745239258,
"learning_rate": 0.00010060564683612264,
"loss": 0.2843,
"step": 15390
},
{
"epoch": 1.9390055735743301,
"grad_norm": 0.271457314491272,
"learning_rate": 0.00010050186808089828,
"loss": 0.2736,
"step": 15395
},
{
"epoch": 1.9396353559845072,
"grad_norm": 0.2437523454427719,
"learning_rate": 0.00010039811590730137,
"loss": 0.2839,
"step": 15400
},
{
"epoch": 1.9402651383946847,
"grad_norm": 0.25611042976379395,
"learning_rate": 0.00010029439037104925,
"loss": 0.2671,
"step": 15405
},
{
"epoch": 1.9408949208048618,
"grad_norm": 0.2646775245666504,
"learning_rate": 0.00010019069152784486,
"loss": 0.3072,
"step": 15410
},
{
"epoch": 1.9415247032150393,
"grad_norm": 0.26959145069122314,
"learning_rate": 0.00010008701943337695,
"loss": 0.2655,
"step": 15415
},
{
"epoch": 1.9421544856252164,
"grad_norm": 0.28409838676452637,
"learning_rate": 9.998337414331971e-05,
"loss": 0.2643,
"step": 15420
},
{
"epoch": 1.9427842680353937,
"grad_norm": 0.288766086101532,
"learning_rate": 9.987975571333303e-05,
"loss": 0.2849,
"step": 15425
},
{
"epoch": 1.943414050445571,
"grad_norm": 0.28650057315826416,
"learning_rate": 9.977616419906247e-05,
"loss": 0.2672,
"step": 15430
},
{
"epoch": 1.9440438328557483,
"grad_norm": 0.28229546546936035,
"learning_rate": 9.967259965613893e-05,
"loss": 0.2649,
"step": 15435
},
{
"epoch": 1.9446736152659256,
"grad_norm": 0.21892526745796204,
"learning_rate": 9.956906214017894e-05,
"loss": 0.2668,
"step": 15440
},
{
"epoch": 1.945303397676103,
"grad_norm": 0.27021822333335876,
"learning_rate": 9.946555170678458e-05,
"loss": 0.2725,
"step": 15445
},
{
"epoch": 1.9459331800862802,
"grad_norm": 0.2574271857738495,
"learning_rate": 9.936206841154328e-05,
"loss": 0.2643,
"step": 15450
},
{
"epoch": 1.9465629624964573,
"grad_norm": 0.2907993495464325,
"learning_rate": 9.925861231002792e-05,
"loss": 0.3103,
"step": 15455
},
{
"epoch": 1.9471927449066349,
"grad_norm": 0.225221186876297,
"learning_rate": 9.915518345779681e-05,
"loss": 0.2804,
"step": 15460
},
{
"epoch": 1.947822527316812,
"grad_norm": 0.2557651400566101,
"learning_rate": 9.905178191039365e-05,
"loss": 0.2735,
"step": 15465
},
{
"epoch": 1.9484523097269895,
"grad_norm": 0.26498880982398987,
"learning_rate": 9.894840772334733e-05,
"loss": 0.2664,
"step": 15470
},
{
"epoch": 1.9490820921371665,
"grad_norm": 0.2424790859222412,
"learning_rate": 9.884506095217222e-05,
"loss": 0.2693,
"step": 15475
},
{
"epoch": 1.9497118745473438,
"grad_norm": 0.25802767276763916,
"learning_rate": 9.87417416523679e-05,
"loss": 0.2831,
"step": 15480
},
{
"epoch": 1.9503416569575212,
"grad_norm": 0.2601839005947113,
"learning_rate": 9.863844987941912e-05,
"loss": 0.2629,
"step": 15485
},
{
"epoch": 1.9509714393676985,
"grad_norm": 0.26015961170196533,
"learning_rate": 9.853518568879602e-05,
"loss": 0.2634,
"step": 15490
},
{
"epoch": 1.9516012217778758,
"grad_norm": 0.2370160073041916,
"learning_rate": 9.843194913595374e-05,
"loss": 0.2557,
"step": 15495
},
{
"epoch": 1.952231004188053,
"grad_norm": 0.2519363462924957,
"learning_rate": 9.832874027633281e-05,
"loss": 0.255,
"step": 15500
},
{
"epoch": 1.9528607865982304,
"grad_norm": 0.3419806659221649,
"learning_rate": 9.822555916535858e-05,
"loss": 0.2744,
"step": 15505
},
{
"epoch": 1.9534905690084075,
"grad_norm": 0.24397574365139008,
"learning_rate": 9.812240585844176e-05,
"loss": 0.2619,
"step": 15510
},
{
"epoch": 1.954120351418585,
"grad_norm": 0.2432924211025238,
"learning_rate": 9.801928041097795e-05,
"loss": 0.2581,
"step": 15515
},
{
"epoch": 1.954750133828762,
"grad_norm": 0.27478650212287903,
"learning_rate": 9.791618287834797e-05,
"loss": 0.2606,
"step": 15520
},
{
"epoch": 1.9553799162389396,
"grad_norm": 0.29080766439437866,
"learning_rate": 9.781311331591747e-05,
"loss": 0.2656,
"step": 15525
},
{
"epoch": 1.9560096986491167,
"grad_norm": 0.24801793694496155,
"learning_rate": 9.771007177903723e-05,
"loss": 0.2651,
"step": 15530
},
{
"epoch": 1.956639481059294,
"grad_norm": 0.22467739880084991,
"learning_rate": 9.76070583230429e-05,
"loss": 0.2663,
"step": 15535
},
{
"epoch": 1.9572692634694713,
"grad_norm": 0.24151213467121124,
"learning_rate": 9.750407300325502e-05,
"loss": 0.2612,
"step": 15540
},
{
"epoch": 1.9578990458796486,
"grad_norm": 0.262352854013443,
"learning_rate": 9.7401115874979e-05,
"loss": 0.2508,
"step": 15545
},
{
"epoch": 1.9585288282898259,
"grad_norm": 0.2491580843925476,
"learning_rate": 9.72981869935053e-05,
"loss": 0.2735,
"step": 15550
},
{
"epoch": 1.9591586107000032,
"grad_norm": 0.27000558376312256,
"learning_rate": 9.719528641410898e-05,
"loss": 0.2794,
"step": 15555
},
{
"epoch": 1.9597883931101805,
"grad_norm": 0.2562926113605499,
"learning_rate": 9.709241419205008e-05,
"loss": 0.2829,
"step": 15560
},
{
"epoch": 1.9604181755203576,
"grad_norm": 0.2559642493724823,
"learning_rate": 9.69895703825733e-05,
"loss": 0.2768,
"step": 15565
},
{
"epoch": 1.961047957930535,
"grad_norm": 0.23282787203788757,
"learning_rate": 9.688675504090811e-05,
"loss": 0.2648,
"step": 15570
},
{
"epoch": 1.9616777403407122,
"grad_norm": 0.2280416637659073,
"learning_rate": 9.678396822226868e-05,
"loss": 0.2474,
"step": 15575
},
{
"epoch": 1.9623075227508897,
"grad_norm": 0.2516798973083496,
"learning_rate": 9.668120998185392e-05,
"loss": 0.2855,
"step": 15580
},
{
"epoch": 1.9629373051610668,
"grad_norm": 0.24892964959144592,
"learning_rate": 9.657848037484726e-05,
"loss": 0.2731,
"step": 15585
},
{
"epoch": 1.963567087571244,
"grad_norm": 0.2524420917034149,
"learning_rate": 9.647577945641699e-05,
"loss": 0.275,
"step": 15590
},
{
"epoch": 1.9641968699814214,
"grad_norm": 0.2617582380771637,
"learning_rate": 9.637310728171577e-05,
"loss": 0.293,
"step": 15595
},
{
"epoch": 1.9648266523915987,
"grad_norm": 0.2635948061943054,
"learning_rate": 9.627046390588086e-05,
"loss": 0.2642,
"step": 15600
},
{
"epoch": 1.965456434801776,
"grad_norm": 0.22701425850391388,
"learning_rate": 9.61678493840342e-05,
"loss": 0.2647,
"step": 15605
},
{
"epoch": 1.9660862172119533,
"grad_norm": 0.2594752609729767,
"learning_rate": 9.606526377128207e-05,
"loss": 0.2846,
"step": 15610
},
{
"epoch": 1.9667159996221306,
"grad_norm": 0.25541216135025024,
"learning_rate": 9.596270712271524e-05,
"loss": 0.2712,
"step": 15615
},
{
"epoch": 1.9673457820323077,
"grad_norm": 0.26473337411880493,
"learning_rate": 9.586017949340909e-05,
"loss": 0.2515,
"step": 15620
},
{
"epoch": 1.9679755644424852,
"grad_norm": 0.2370501607656479,
"learning_rate": 9.575768093842321e-05,
"loss": 0.2569,
"step": 15625
},
{
"epoch": 1.9686053468526623,
"grad_norm": 0.25999268889427185,
"learning_rate": 9.565521151280168e-05,
"loss": 0.2846,
"step": 15630
},
{
"epoch": 1.9692351292628398,
"grad_norm": 0.2597227394580841,
"learning_rate": 9.555277127157294e-05,
"loss": 0.2814,
"step": 15635
},
{
"epoch": 1.969864911673017,
"grad_norm": 0.22267143428325653,
"learning_rate": 9.545036026974979e-05,
"loss": 0.2703,
"step": 15640
},
{
"epoch": 1.9704946940831942,
"grad_norm": 0.2599702477455139,
"learning_rate": 9.534797856232913e-05,
"loss": 0.2741,
"step": 15645
},
{
"epoch": 1.9711244764933715,
"grad_norm": 0.23703083395957947,
"learning_rate": 9.524562620429243e-05,
"loss": 0.2657,
"step": 15650
},
{
"epoch": 1.9717542589035488,
"grad_norm": 0.24194732308387756,
"learning_rate": 9.514330325060515e-05,
"loss": 0.2613,
"step": 15655
},
{
"epoch": 1.9723840413137261,
"grad_norm": 0.2648374140262604,
"learning_rate": 9.504100975621709e-05,
"loss": 0.2808,
"step": 15660
},
{
"epoch": 1.9730138237239034,
"grad_norm": 0.2491552084684372,
"learning_rate": 9.493874577606218e-05,
"loss": 0.2622,
"step": 15665
},
{
"epoch": 1.9736436061340807,
"grad_norm": 0.25322696566581726,
"learning_rate": 9.483651136505857e-05,
"loss": 0.2716,
"step": 15670
},
{
"epoch": 1.9742733885442578,
"grad_norm": 0.22565199434757233,
"learning_rate": 9.473430657810838e-05,
"loss": 0.2947,
"step": 15675
},
{
"epoch": 1.9749031709544353,
"grad_norm": 0.2594245672225952,
"learning_rate": 9.463213147009795e-05,
"loss": 0.2793,
"step": 15680
},
{
"epoch": 1.9755329533646124,
"grad_norm": 0.2432025521993637,
"learning_rate": 9.452998609589769e-05,
"loss": 0.2559,
"step": 15685
},
{
"epoch": 1.97616273577479,
"grad_norm": 0.2537454068660736,
"learning_rate": 9.442787051036192e-05,
"loss": 0.2842,
"step": 15690
},
{
"epoch": 1.976792518184967,
"grad_norm": 0.2597581446170807,
"learning_rate": 9.432578476832911e-05,
"loss": 0.2755,
"step": 15695
},
{
"epoch": 1.9774223005951443,
"grad_norm": 0.25382810831069946,
"learning_rate": 9.42237289246216e-05,
"loss": 0.2653,
"step": 15700
},
{
"epoch": 1.9780520830053216,
"grad_norm": 0.23822832107543945,
"learning_rate": 9.412170303404579e-05,
"loss": 0.2624,
"step": 15705
},
{
"epoch": 1.978681865415499,
"grad_norm": 0.2722800076007843,
"learning_rate": 9.40197071513918e-05,
"loss": 0.2712,
"step": 15710
},
{
"epoch": 1.9793116478256763,
"grad_norm": 0.2273283749818802,
"learning_rate": 9.39177413314338e-05,
"loss": 0.2545,
"step": 15715
},
{
"epoch": 1.9799414302358536,
"grad_norm": 0.24674946069717407,
"learning_rate": 9.381580562892972e-05,
"loss": 0.2606,
"step": 15720
},
{
"epoch": 1.9805712126460309,
"grad_norm": 0.23100855946540833,
"learning_rate": 9.371390009862145e-05,
"loss": 0.2632,
"step": 15725
},
{
"epoch": 1.981200995056208,
"grad_norm": 0.23489323258399963,
"learning_rate": 9.361202479523448e-05,
"loss": 0.2833,
"step": 15730
},
{
"epoch": 1.9818307774663855,
"grad_norm": 0.26526087522506714,
"learning_rate": 9.35101797734783e-05,
"loss": 0.2717,
"step": 15735
},
{
"epoch": 1.9824605598765626,
"grad_norm": 0.27042046189308167,
"learning_rate": 9.340836508804595e-05,
"loss": 0.2664,
"step": 15740
},
{
"epoch": 1.98309034228674,
"grad_norm": 0.28461650013923645,
"learning_rate": 9.330658079361422e-05,
"loss": 0.26,
"step": 15745
},
{
"epoch": 1.9837201246969172,
"grad_norm": 0.26529213786125183,
"learning_rate": 9.320482694484356e-05,
"loss": 0.2808,
"step": 15750
},
{
"epoch": 1.9843499071070945,
"grad_norm": 0.32026639580726624,
"learning_rate": 9.310310359637823e-05,
"loss": 0.2631,
"step": 15755
},
{
"epoch": 1.9849796895172718,
"grad_norm": 0.2596029043197632,
"learning_rate": 9.300141080284588e-05,
"loss": 0.2771,
"step": 15760
},
{
"epoch": 1.985609471927449,
"grad_norm": 0.25400853157043457,
"learning_rate": 9.289974861885796e-05,
"loss": 0.2532,
"step": 15765
},
{
"epoch": 1.9862392543376264,
"grad_norm": 0.29176244139671326,
"learning_rate": 9.279811709900934e-05,
"loss": 0.2719,
"step": 15770
},
{
"epoch": 1.9868690367478037,
"grad_norm": 0.2530720829963684,
"learning_rate": 9.26965162978785e-05,
"loss": 0.2676,
"step": 15775
},
{
"epoch": 1.987498819157981,
"grad_norm": 0.23311518132686615,
"learning_rate": 9.259494627002728e-05,
"loss": 0.2632,
"step": 15780
},
{
"epoch": 1.988128601568158,
"grad_norm": 0.2402007132768631,
"learning_rate": 9.249340707000123e-05,
"loss": 0.2419,
"step": 15785
},
{
"epoch": 1.9887583839783356,
"grad_norm": 0.24241755902767181,
"learning_rate": 9.239189875232914e-05,
"loss": 0.2567,
"step": 15790
},
{
"epoch": 1.9893881663885127,
"grad_norm": 0.31180015206336975,
"learning_rate": 9.229042137152337e-05,
"loss": 0.2864,
"step": 15795
},
{
"epoch": 1.9900179487986902,
"grad_norm": 0.29330986738204956,
"learning_rate": 9.218897498207952e-05,
"loss": 0.2626,
"step": 15800
},
{
"epoch": 1.9906477312088673,
"grad_norm": 0.24687117338180542,
"learning_rate": 9.208755963847663e-05,
"loss": 0.2721,
"step": 15805
},
{
"epoch": 1.9912775136190446,
"grad_norm": 0.22886700928211212,
"learning_rate": 9.198617539517714e-05,
"loss": 0.2626,
"step": 15810
},
{
"epoch": 1.991907296029222,
"grad_norm": 0.2331649363040924,
"learning_rate": 9.188482230662662e-05,
"loss": 0.2484,
"step": 15815
},
{
"epoch": 1.9925370784393992,
"grad_norm": 0.23371124267578125,
"learning_rate": 9.178350042725397e-05,
"loss": 0.291,
"step": 15820
},
{
"epoch": 1.9931668608495765,
"grad_norm": 0.2478175163269043,
"learning_rate": 9.168220981147143e-05,
"loss": 0.2748,
"step": 15825
},
{
"epoch": 1.9937966432597538,
"grad_norm": 0.25952714681625366,
"learning_rate": 9.158095051367433e-05,
"loss": 0.2568,
"step": 15830
},
{
"epoch": 1.9944264256699311,
"grad_norm": 0.2522846758365631,
"learning_rate": 9.14797225882412e-05,
"loss": 0.2414,
"step": 15835
},
{
"epoch": 1.9950562080801082,
"grad_norm": 0.24577966332435608,
"learning_rate": 9.137852608953384e-05,
"loss": 0.2573,
"step": 15840
},
{
"epoch": 1.9956859904902857,
"grad_norm": 0.2714809775352478,
"learning_rate": 9.127736107189705e-05,
"loss": 0.2703,
"step": 15845
},
{
"epoch": 1.9963157729004628,
"grad_norm": 0.25562793016433716,
"learning_rate": 9.117622758965866e-05,
"loss": 0.2601,
"step": 15850
},
{
"epoch": 1.9969455553106403,
"grad_norm": 0.23811009526252747,
"learning_rate": 9.107512569712975e-05,
"loss": 0.2474,
"step": 15855
},
{
"epoch": 1.9975753377208174,
"grad_norm": 0.24953188002109528,
"learning_rate": 9.097405544860437e-05,
"loss": 0.2582,
"step": 15860
},
{
"epoch": 1.9982051201309947,
"grad_norm": 0.23611120879650116,
"learning_rate": 9.087301689835944e-05,
"loss": 0.253,
"step": 15865
},
{
"epoch": 1.998834902541172,
"grad_norm": 0.24487170577049255,
"learning_rate": 9.077201010065509e-05,
"loss": 0.2508,
"step": 15870
},
{
"epoch": 1.9994646849513493,
"grad_norm": 0.2839270830154419,
"learning_rate": 9.06710351097342e-05,
"loss": 0.2748,
"step": 15875
},
{
"epoch": 2.0,
"grad_norm": 0.2304636538028717,
"learning_rate": 9.057009197982272e-05,
"loss": 0.2653,
"step": 15880
},
{
"epoch": 2.000629782410177,
"grad_norm": 0.19537301361560822,
"learning_rate": 9.046918076512935e-05,
"loss": 0.1844,
"step": 15885
},
{
"epoch": 2.0012595648203546,
"grad_norm": 0.22054894268512726,
"learning_rate": 9.036830151984571e-05,
"loss": 0.196,
"step": 15890
},
{
"epoch": 2.0018893472305317,
"grad_norm": 0.23987694084644318,
"learning_rate": 9.02674542981463e-05,
"loss": 0.1962,
"step": 15895
},
{
"epoch": 2.002519129640709,
"grad_norm": 0.24562768638134003,
"learning_rate": 9.016663915418835e-05,
"loss": 0.1826,
"step": 15900
},
{
"epoch": 2.0031489120508863,
"grad_norm": 0.27111175656318665,
"learning_rate": 9.00658561421119e-05,
"loss": 0.2025,
"step": 15905
},
{
"epoch": 2.003778694461064,
"grad_norm": 0.21321839094161987,
"learning_rate": 8.99651053160398e-05,
"loss": 0.1743,
"step": 15910
},
{
"epoch": 2.004408476871241,
"grad_norm": 0.2295263558626175,
"learning_rate": 8.986438673007749e-05,
"loss": 0.1856,
"step": 15915
},
{
"epoch": 2.0050382592814184,
"grad_norm": 0.22658327221870422,
"learning_rate": 8.976370043831313e-05,
"loss": 0.1896,
"step": 15920
},
{
"epoch": 2.0056680416915955,
"grad_norm": 0.21595464646816254,
"learning_rate": 8.966304649481753e-05,
"loss": 0.1865,
"step": 15925
},
{
"epoch": 2.006297824101773,
"grad_norm": 0.23339222371578217,
"learning_rate": 8.956242495364426e-05,
"loss": 0.1866,
"step": 15930
},
{
"epoch": 2.00692760651195,
"grad_norm": 0.20041927695274353,
"learning_rate": 8.946183586882929e-05,
"loss": 0.1745,
"step": 15935
},
{
"epoch": 2.007557388922127,
"grad_norm": 0.19914592802524567,
"learning_rate": 8.936127929439131e-05,
"loss": 0.1885,
"step": 15940
},
{
"epoch": 2.0081871713323047,
"grad_norm": 0.20688550174236298,
"learning_rate": 8.926075528433149e-05,
"loss": 0.1932,
"step": 15945
},
{
"epoch": 2.008816953742482,
"grad_norm": 0.23507048189640045,
"learning_rate": 8.916026389263358e-05,
"loss": 0.1865,
"step": 15950
},
{
"epoch": 2.0094467361526593,
"grad_norm": 0.2366725355386734,
"learning_rate": 8.905980517326358e-05,
"loss": 0.1867,
"step": 15955
},
{
"epoch": 2.0100765185628364,
"grad_norm": 0.20678187906742096,
"learning_rate": 8.895937918017028e-05,
"loss": 0.1785,
"step": 15960
},
{
"epoch": 2.010706300973014,
"grad_norm": 0.2642296850681305,
"learning_rate": 8.885898596728463e-05,
"loss": 0.1812,
"step": 15965
},
{
"epoch": 2.011336083383191,
"grad_norm": 0.20598894357681274,
"learning_rate": 8.875862558852016e-05,
"loss": 0.1861,
"step": 15970
},
{
"epoch": 2.0119658657933686,
"grad_norm": 0.23556114733219147,
"learning_rate": 8.865829809777265e-05,
"loss": 0.1873,
"step": 15975
},
{
"epoch": 2.0125956482035456,
"grad_norm": 0.25772175192832947,
"learning_rate": 8.855800354892022e-05,
"loss": 0.1858,
"step": 15980
},
{
"epoch": 2.013225430613723,
"grad_norm": 0.21538549661636353,
"learning_rate": 8.845774199582344e-05,
"loss": 0.1738,
"step": 15985
},
{
"epoch": 2.0138552130239002,
"grad_norm": 0.22819840908050537,
"learning_rate": 8.835751349232496e-05,
"loss": 0.1843,
"step": 15990
},
{
"epoch": 2.0144849954340773,
"grad_norm": 0.23319579660892487,
"learning_rate": 8.825731809224976e-05,
"loss": 0.1878,
"step": 15995
},
{
"epoch": 2.015114777844255,
"grad_norm": 0.24107947945594788,
"learning_rate": 8.815715584940511e-05,
"loss": 0.1867,
"step": 16000
},
{
"epoch": 2.015114777844255,
"eval_loss": 0.3415575921535492,
"eval_runtime": 6.166,
"eval_samples_per_second": 162.181,
"eval_steps_per_second": 10.217,
"step": 16000
},
{
"epoch": 2.015744560254432,
"grad_norm": 0.2272019386291504,
"learning_rate": 8.805702681758042e-05,
"loss": 0.1718,
"step": 16005
},
{
"epoch": 2.0163743426646095,
"grad_norm": 0.22147491574287415,
"learning_rate": 8.795693105054723e-05,
"loss": 0.175,
"step": 16010
},
{
"epoch": 2.0170041250747865,
"grad_norm": 0.21899926662445068,
"learning_rate": 8.785686860205929e-05,
"loss": 0.1749,
"step": 16015
},
{
"epoch": 2.017633907484964,
"grad_norm": 0.24299047887325287,
"learning_rate": 8.775683952585246e-05,
"loss": 0.1902,
"step": 16020
},
{
"epoch": 2.018263689895141,
"grad_norm": 0.24278461933135986,
"learning_rate": 8.765684387564454e-05,
"loss": 0.1872,
"step": 16025
},
{
"epoch": 2.0188934723053187,
"grad_norm": 0.24929705262184143,
"learning_rate": 8.75568817051355e-05,
"loss": 0.1838,
"step": 16030
},
{
"epoch": 2.0195232547154958,
"grad_norm": 0.20675018429756165,
"learning_rate": 8.745695306800738e-05,
"loss": 0.1734,
"step": 16035
},
{
"epoch": 2.0201530371256733,
"grad_norm": 0.25064778327941895,
"learning_rate": 8.73570580179241e-05,
"loss": 0.1821,
"step": 16040
},
{
"epoch": 2.0207828195358504,
"grad_norm": 0.23618988692760468,
"learning_rate": 8.725719660853157e-05,
"loss": 0.1935,
"step": 16045
},
{
"epoch": 2.0214126019460275,
"grad_norm": 0.2201015204191208,
"learning_rate": 8.715736889345766e-05,
"loss": 0.1806,
"step": 16050
},
{
"epoch": 2.022042384356205,
"grad_norm": 0.23748455941677094,
"learning_rate": 8.705757492631214e-05,
"loss": 0.1807,
"step": 16055
},
{
"epoch": 2.022672166766382,
"grad_norm": 0.2563530504703522,
"learning_rate": 8.695781476068664e-05,
"loss": 0.1825,
"step": 16060
},
{
"epoch": 2.0233019491765596,
"grad_norm": 0.27659016847610474,
"learning_rate": 8.685808845015464e-05,
"loss": 0.1861,
"step": 16065
},
{
"epoch": 2.0239317315867367,
"grad_norm": 0.19301186501979828,
"learning_rate": 8.675839604827146e-05,
"loss": 0.1804,
"step": 16070
},
{
"epoch": 2.024561513996914,
"grad_norm": 0.245374858379364,
"learning_rate": 8.665873760857415e-05,
"loss": 0.1785,
"step": 16075
},
{
"epoch": 2.0251912964070913,
"grad_norm": 0.21472232043743134,
"learning_rate": 8.655911318458166e-05,
"loss": 0.1785,
"step": 16080
},
{
"epoch": 2.025821078817269,
"grad_norm": 0.22257132828235626,
"learning_rate": 8.645952282979453e-05,
"loss": 0.1812,
"step": 16085
},
{
"epoch": 2.026450861227446,
"grad_norm": 0.25223472714424133,
"learning_rate": 8.635996659769512e-05,
"loss": 0.1934,
"step": 16090
},
{
"epoch": 2.0270806436376234,
"grad_norm": 0.22251825034618378,
"learning_rate": 8.626044454174724e-05,
"loss": 0.1895,
"step": 16095
},
{
"epoch": 2.0277104260478005,
"grad_norm": 0.2073337882757187,
"learning_rate": 8.616095671539663e-05,
"loss": 0.1851,
"step": 16100
},
{
"epoch": 2.0283402084579776,
"grad_norm": 0.21960042417049408,
"learning_rate": 8.606150317207053e-05,
"loss": 0.1809,
"step": 16105
},
{
"epoch": 2.028969990868155,
"grad_norm": 0.23633064329624176,
"learning_rate": 8.596208396517771e-05,
"loss": 0.1839,
"step": 16110
},
{
"epoch": 2.029599773278332,
"grad_norm": 0.21128375828266144,
"learning_rate": 8.586269914810855e-05,
"loss": 0.1828,
"step": 16115
},
{
"epoch": 2.0302295556885097,
"grad_norm": 0.24467304348945618,
"learning_rate": 8.576334877423505e-05,
"loss": 0.1784,
"step": 16120
},
{
"epoch": 2.030859338098687,
"grad_norm": 0.24976873397827148,
"learning_rate": 8.566403289691062e-05,
"loss": 0.1924,
"step": 16125
},
{
"epoch": 2.0314891205088643,
"grad_norm": 0.258323609828949,
"learning_rate": 8.556475156947008e-05,
"loss": 0.1889,
"step": 16130
},
{
"epoch": 2.0321189029190414,
"grad_norm": 0.24420535564422607,
"learning_rate": 8.546550484522973e-05,
"loss": 0.197,
"step": 16135
},
{
"epoch": 2.032748685329219,
"grad_norm": 0.2438700944185257,
"learning_rate": 8.536629277748746e-05,
"loss": 0.1958,
"step": 16140
},
{
"epoch": 2.033378467739396,
"grad_norm": 0.25343936681747437,
"learning_rate": 8.526711541952236e-05,
"loss": 0.1877,
"step": 16145
},
{
"epoch": 2.0340082501495735,
"grad_norm": 0.24403081834316254,
"learning_rate": 8.516797282459493e-05,
"loss": 0.1774,
"step": 16150
},
{
"epoch": 2.0346380325597506,
"grad_norm": 0.24733777344226837,
"learning_rate": 8.506886504594704e-05,
"loss": 0.1792,
"step": 16155
},
{
"epoch": 2.0352678149699277,
"grad_norm": 0.22619028389453888,
"learning_rate": 8.496979213680177e-05,
"loss": 0.1807,
"step": 16160
},
{
"epoch": 2.0358975973801052,
"grad_norm": 0.23040007054805756,
"learning_rate": 8.48707541503636e-05,
"loss": 0.1804,
"step": 16165
},
{
"epoch": 2.0365273797902823,
"grad_norm": 0.21034270524978638,
"learning_rate": 8.477175113981813e-05,
"loss": 0.1787,
"step": 16170
},
{
"epoch": 2.03715716220046,
"grad_norm": 0.21682168543338776,
"learning_rate": 8.467278315833224e-05,
"loss": 0.1817,
"step": 16175
},
{
"epoch": 2.037786944610637,
"grad_norm": 0.2700116038322449,
"learning_rate": 8.457385025905407e-05,
"loss": 0.1896,
"step": 16180
},
{
"epoch": 2.0384167270208144,
"grad_norm": 0.214239239692688,
"learning_rate": 8.44749524951128e-05,
"loss": 0.1827,
"step": 16185
},
{
"epoch": 2.0390465094309915,
"grad_norm": 0.2243194878101349,
"learning_rate": 8.437608991961885e-05,
"loss": 0.1833,
"step": 16190
},
{
"epoch": 2.039676291841169,
"grad_norm": 0.28487569093704224,
"learning_rate": 8.427726258566353e-05,
"loss": 0.1901,
"step": 16195
},
{
"epoch": 2.040306074251346,
"grad_norm": 0.24857446551322937,
"learning_rate": 8.41784705463195e-05,
"loss": 0.192,
"step": 16200
},
{
"epoch": 2.0409358566615237,
"grad_norm": 0.22208547592163086,
"learning_rate": 8.407971385464032e-05,
"loss": 0.1907,
"step": 16205
},
{
"epoch": 2.0415656390717007,
"grad_norm": 0.22752498090267181,
"learning_rate": 8.398099256366057e-05,
"loss": 0.1827,
"step": 16210
},
{
"epoch": 2.042195421481878,
"grad_norm": 0.25674304366111755,
"learning_rate": 8.388230672639584e-05,
"loss": 0.1889,
"step": 16215
},
{
"epoch": 2.0428252038920554,
"grad_norm": 0.22372281551361084,
"learning_rate": 8.378365639584264e-05,
"loss": 0.1816,
"step": 16220
},
{
"epoch": 2.0434549863022324,
"grad_norm": 0.25298216938972473,
"learning_rate": 8.368504162497859e-05,
"loss": 0.1813,
"step": 16225
},
{
"epoch": 2.04408476871241,
"grad_norm": 0.21058551967144012,
"learning_rate": 8.358646246676197e-05,
"loss": 0.1855,
"step": 16230
},
{
"epoch": 2.044714551122587,
"grad_norm": 0.2757975459098816,
"learning_rate": 8.348791897413196e-05,
"loss": 0.1749,
"step": 16235
},
{
"epoch": 2.0453443335327646,
"grad_norm": 0.22646676003932953,
"learning_rate": 8.338941120000884e-05,
"loss": 0.1852,
"step": 16240
},
{
"epoch": 2.0459741159429417,
"grad_norm": 0.23769816756248474,
"learning_rate": 8.329093919729342e-05,
"loss": 0.1869,
"step": 16245
},
{
"epoch": 2.046603898353119,
"grad_norm": 0.22907455265522003,
"learning_rate": 8.319250301886746e-05,
"loss": 0.1876,
"step": 16250
},
{
"epoch": 2.0472336807632963,
"grad_norm": 0.22925196588039398,
"learning_rate": 8.309410271759342e-05,
"loss": 0.1885,
"step": 16255
},
{
"epoch": 2.0478634631734733,
"grad_norm": 0.22043700516223907,
"learning_rate": 8.299573834631454e-05,
"loss": 0.181,
"step": 16260
},
{
"epoch": 2.048493245583651,
"grad_norm": 0.23858542740345,
"learning_rate": 8.289740995785468e-05,
"loss": 0.1898,
"step": 16265
},
{
"epoch": 2.049123027993828,
"grad_norm": 0.23982049524784088,
"learning_rate": 8.279911760501846e-05,
"loss": 0.1838,
"step": 16270
},
{
"epoch": 2.0497528104040055,
"grad_norm": 0.21694807708263397,
"learning_rate": 8.270086134059113e-05,
"loss": 0.1795,
"step": 16275
},
{
"epoch": 2.0503825928141826,
"grad_norm": 0.20050913095474243,
"learning_rate": 8.260264121733846e-05,
"loss": 0.175,
"step": 16280
},
{
"epoch": 2.05101237522436,
"grad_norm": 0.2118636816740036,
"learning_rate": 8.250445728800706e-05,
"loss": 0.1778,
"step": 16285
},
{
"epoch": 2.051642157634537,
"grad_norm": 0.2250407338142395,
"learning_rate": 8.240630960532382e-05,
"loss": 0.1885,
"step": 16290
},
{
"epoch": 2.0522719400447147,
"grad_norm": 0.2565051019191742,
"learning_rate": 8.230819822199642e-05,
"loss": 0.1901,
"step": 16295
},
{
"epoch": 2.0529017224548918,
"grad_norm": 0.24367564916610718,
"learning_rate": 8.221012319071268e-05,
"loss": 0.1798,
"step": 16300
},
{
"epoch": 2.0535315048650693,
"grad_norm": 0.24313905835151672,
"learning_rate": 8.211208456414135e-05,
"loss": 0.1908,
"step": 16305
},
{
"epoch": 2.0541612872752464,
"grad_norm": 0.23950958251953125,
"learning_rate": 8.201408239493131e-05,
"loss": 0.1815,
"step": 16310
},
{
"epoch": 2.0547910696854235,
"grad_norm": 0.24551273882389069,
"learning_rate": 8.1916116735712e-05,
"loss": 0.1941,
"step": 16315
},
{
"epoch": 2.055420852095601,
"grad_norm": 0.21070988476276398,
"learning_rate": 8.181818763909314e-05,
"loss": 0.1868,
"step": 16320
},
{
"epoch": 2.056050634505778,
"grad_norm": 0.21926933526992798,
"learning_rate": 8.172029515766502e-05,
"loss": 0.1848,
"step": 16325
},
{
"epoch": 2.0566804169159556,
"grad_norm": 0.22517934441566467,
"learning_rate": 8.162243934399812e-05,
"loss": 0.1912,
"step": 16330
},
{
"epoch": 2.0573101993261327,
"grad_norm": 0.2571990489959717,
"learning_rate": 8.152462025064315e-05,
"loss": 0.1834,
"step": 16335
},
{
"epoch": 2.05793998173631,
"grad_norm": 0.22555163502693176,
"learning_rate": 8.14268379301312e-05,
"loss": 0.19,
"step": 16340
},
{
"epoch": 2.0585697641464873,
"grad_norm": 0.2326682209968567,
"learning_rate": 8.13290924349737e-05,
"loss": 0.1786,
"step": 16345
},
{
"epoch": 2.059199546556665,
"grad_norm": 0.22472088038921356,
"learning_rate": 8.123138381766218e-05,
"loss": 0.1843,
"step": 16350
},
{
"epoch": 2.059829328966842,
"grad_norm": 0.2206810563802719,
"learning_rate": 8.113371213066838e-05,
"loss": 0.1781,
"step": 16355
},
{
"epoch": 2.0604591113770194,
"grad_norm": 0.2740577757358551,
"learning_rate": 8.103607742644426e-05,
"loss": 0.1875,
"step": 16360
},
{
"epoch": 2.0610888937871965,
"grad_norm": 0.22217485308647156,
"learning_rate": 8.093847975742185e-05,
"loss": 0.1748,
"step": 16365
},
{
"epoch": 2.0617186761973736,
"grad_norm": 0.2460946887731552,
"learning_rate": 8.084091917601336e-05,
"loss": 0.1839,
"step": 16370
},
{
"epoch": 2.062348458607551,
"grad_norm": 0.2489384114742279,
"learning_rate": 8.074339573461101e-05,
"loss": 0.1818,
"step": 16375
},
{
"epoch": 2.062978241017728,
"grad_norm": 0.22755055129528046,
"learning_rate": 8.06459094855871e-05,
"loss": 0.1885,
"step": 16380
},
{
"epoch": 2.0636080234279057,
"grad_norm": 0.22558000683784485,
"learning_rate": 8.054846048129406e-05,
"loss": 0.1805,
"step": 16385
},
{
"epoch": 2.064237805838083,
"grad_norm": 0.2083364725112915,
"learning_rate": 8.045104877406418e-05,
"loss": 0.1809,
"step": 16390
},
{
"epoch": 2.0648675882482603,
"grad_norm": 0.23679542541503906,
"learning_rate": 8.035367441620976e-05,
"loss": 0.181,
"step": 16395
},
{
"epoch": 2.0654973706584374,
"grad_norm": 0.2173621654510498,
"learning_rate": 8.025633746002311e-05,
"loss": 0.1857,
"step": 16400
},
{
"epoch": 2.066127153068615,
"grad_norm": 0.22376009821891785,
"learning_rate": 8.015903795777634e-05,
"loss": 0.1832,
"step": 16405
},
{
"epoch": 2.066756935478792,
"grad_norm": 0.24444858729839325,
"learning_rate": 8.00617759617215e-05,
"loss": 0.1959,
"step": 16410
},
{
"epoch": 2.0673867178889695,
"grad_norm": 0.21472635865211487,
"learning_rate": 7.996455152409055e-05,
"loss": 0.17,
"step": 16415
},
{
"epoch": 2.0680165002991466,
"grad_norm": 0.22463464736938477,
"learning_rate": 7.986736469709521e-05,
"loss": 0.1847,
"step": 16420
},
{
"epoch": 2.0686462827093237,
"grad_norm": 0.2251402884721756,
"learning_rate": 7.977021553292696e-05,
"loss": 0.1822,
"step": 16425
},
{
"epoch": 2.0692760651195012,
"grad_norm": 0.21793001890182495,
"learning_rate": 7.967310408375725e-05,
"loss": 0.1862,
"step": 16430
},
{
"epoch": 2.0699058475296783,
"grad_norm": 0.2344975620508194,
"learning_rate": 7.957603040173714e-05,
"loss": 0.1791,
"step": 16435
},
{
"epoch": 2.070535629939856,
"grad_norm": 0.23466047644615173,
"learning_rate": 7.947899453899725e-05,
"loss": 0.1867,
"step": 16440
},
{
"epoch": 2.071165412350033,
"grad_norm": 0.2190965861082077,
"learning_rate": 7.93819965476482e-05,
"loss": 0.1831,
"step": 16445
},
{
"epoch": 2.0717951947602105,
"grad_norm": 0.22384218871593475,
"learning_rate": 7.928503647978012e-05,
"loss": 0.1745,
"step": 16450
},
{
"epoch": 2.0724249771703875,
"grad_norm": 0.23837679624557495,
"learning_rate": 7.918811438746272e-05,
"loss": 0.1875,
"step": 16455
},
{
"epoch": 2.073054759580565,
"grad_norm": 0.2510152757167816,
"learning_rate": 7.909123032274542e-05,
"loss": 0.1849,
"step": 16460
},
{
"epoch": 2.073684541990742,
"grad_norm": 0.2514597475528717,
"learning_rate": 7.899438433765711e-05,
"loss": 0.1882,
"step": 16465
},
{
"epoch": 2.0743143244009197,
"grad_norm": 0.20441210269927979,
"learning_rate": 7.889757648420648e-05,
"loss": 0.1754,
"step": 16470
},
{
"epoch": 2.0749441068110968,
"grad_norm": 0.25783875584602356,
"learning_rate": 7.880080681438134e-05,
"loss": 0.1859,
"step": 16475
},
{
"epoch": 2.075573889221274,
"grad_norm": 0.2234499454498291,
"learning_rate": 7.870407538014933e-05,
"loss": 0.1842,
"step": 16480
},
{
"epoch": 2.0762036716314514,
"grad_norm": 0.24572981894016266,
"learning_rate": 7.860738223345734e-05,
"loss": 0.1728,
"step": 16485
},
{
"epoch": 2.0768334540416284,
"grad_norm": 0.23702028393745422,
"learning_rate": 7.851072742623194e-05,
"loss": 0.1748,
"step": 16490
},
{
"epoch": 2.077463236451806,
"grad_norm": 0.23450568318367004,
"learning_rate": 7.84141110103789e-05,
"loss": 0.1826,
"step": 16495
},
{
"epoch": 2.078093018861983,
"grad_norm": 0.23022450506687164,
"learning_rate": 7.831753303778342e-05,
"loss": 0.1684,
"step": 16500
},
{
"epoch": 2.0787228012721606,
"grad_norm": 0.22727181017398834,
"learning_rate": 7.822099356031014e-05,
"loss": 0.1751,
"step": 16505
},
{
"epoch": 2.0793525836823377,
"grad_norm": 0.20935000479221344,
"learning_rate": 7.812449262980289e-05,
"loss": 0.1748,
"step": 16510
},
{
"epoch": 2.079982366092515,
"grad_norm": 0.2445985972881317,
"learning_rate": 7.802803029808492e-05,
"loss": 0.1869,
"step": 16515
},
{
"epoch": 2.0806121485026923,
"grad_norm": 0.21021974086761475,
"learning_rate": 7.793160661695867e-05,
"loss": 0.1778,
"step": 16520
},
{
"epoch": 2.08124193091287,
"grad_norm": 0.20149335265159607,
"learning_rate": 7.783522163820587e-05,
"loss": 0.1685,
"step": 16525
},
{
"epoch": 2.081871713323047,
"grad_norm": 0.2342994064092636,
"learning_rate": 7.773887541358749e-05,
"loss": 0.1714,
"step": 16530
},
{
"epoch": 2.082501495733224,
"grad_norm": 0.2518448829650879,
"learning_rate": 7.764256799484364e-05,
"loss": 0.1899,
"step": 16535
},
{
"epoch": 2.0831312781434015,
"grad_norm": 0.22891752421855927,
"learning_rate": 7.754629943369365e-05,
"loss": 0.1724,
"step": 16540
},
{
"epoch": 2.0837610605535786,
"grad_norm": 0.2348988950252533,
"learning_rate": 7.74500697818358e-05,
"loss": 0.1772,
"step": 16545
},
{
"epoch": 2.084390842963756,
"grad_norm": 0.21126072108745575,
"learning_rate": 7.735387909094772e-05,
"loss": 0.182,
"step": 16550
},
{
"epoch": 2.085020625373933,
"grad_norm": 0.2134072482585907,
"learning_rate": 7.725772741268598e-05,
"loss": 0.1861,
"step": 16555
},
{
"epoch": 2.0856504077841107,
"grad_norm": 0.22559498250484467,
"learning_rate": 7.716161479868623e-05,
"loss": 0.1745,
"step": 16560
},
{
"epoch": 2.086280190194288,
"grad_norm": 0.2076030671596527,
"learning_rate": 7.706554130056315e-05,
"loss": 0.1811,
"step": 16565
},
{
"epoch": 2.0869099726044653,
"grad_norm": 0.24279461801052094,
"learning_rate": 7.696950696991032e-05,
"loss": 0.1829,
"step": 16570
},
{
"epoch": 2.0875397550146424,
"grad_norm": 0.21790249645709991,
"learning_rate": 7.687351185830058e-05,
"loss": 0.1835,
"step": 16575
},
{
"epoch": 2.08816953742482,
"grad_norm": 0.2210235744714737,
"learning_rate": 7.677755601728527e-05,
"loss": 0.1678,
"step": 16580
},
{
"epoch": 2.088799319834997,
"grad_norm": 0.21354030072689056,
"learning_rate": 7.668163949839492e-05,
"loss": 0.1863,
"step": 16585
},
{
"epoch": 2.089429102245174,
"grad_norm": 0.264240026473999,
"learning_rate": 7.658576235313896e-05,
"loss": 0.1879,
"step": 16590
},
{
"epoch": 2.0900588846553516,
"grad_norm": 0.2348974198102951,
"learning_rate": 7.648992463300561e-05,
"loss": 0.1796,
"step": 16595
},
{
"epoch": 2.0906886670655287,
"grad_norm": 0.23128418624401093,
"learning_rate": 7.639412638946186e-05,
"loss": 0.1793,
"step": 16600
},
{
"epoch": 2.091318449475706,
"grad_norm": 0.2405007928609848,
"learning_rate": 7.629836767395359e-05,
"loss": 0.1856,
"step": 16605
},
{
"epoch": 2.0919482318858833,
"grad_norm": 0.23123788833618164,
"learning_rate": 7.620264853790539e-05,
"loss": 0.1752,
"step": 16610
},
{
"epoch": 2.092578014296061,
"grad_norm": 0.22082751989364624,
"learning_rate": 7.610696903272062e-05,
"loss": 0.1731,
"step": 16615
},
{
"epoch": 2.093207796706238,
"grad_norm": 0.23356421291828156,
"learning_rate": 7.601132920978139e-05,
"loss": 0.1839,
"step": 16620
},
{
"epoch": 2.0938375791164154,
"grad_norm": 0.2418486326932907,
"learning_rate": 7.591572912044846e-05,
"loss": 0.1883,
"step": 16625
},
{
"epoch": 2.0944673615265925,
"grad_norm": 0.2357870191335678,
"learning_rate": 7.58201688160612e-05,
"loss": 0.176,
"step": 16630
},
{
"epoch": 2.09509714393677,
"grad_norm": 0.27169832587242126,
"learning_rate": 7.572464834793778e-05,
"loss": 0.1824,
"step": 16635
},
{
"epoch": 2.095726926346947,
"grad_norm": 0.23245801031589508,
"learning_rate": 7.562916776737488e-05,
"loss": 0.1937,
"step": 16640
},
{
"epoch": 2.096356708757124,
"grad_norm": 0.2312193661928177,
"learning_rate": 7.55337271256476e-05,
"loss": 0.1873,
"step": 16645
},
{
"epoch": 2.0969864911673017,
"grad_norm": 0.2394751012325287,
"learning_rate": 7.543832647400989e-05,
"loss": 0.1748,
"step": 16650
},
{
"epoch": 2.097616273577479,
"grad_norm": 0.2679862976074219,
"learning_rate": 7.534296586369402e-05,
"loss": 0.1868,
"step": 16655
},
{
"epoch": 2.0982460559876563,
"grad_norm": 0.2397966831922531,
"learning_rate": 7.524764534591086e-05,
"loss": 0.1768,
"step": 16660
},
{
"epoch": 2.0988758383978334,
"grad_norm": 0.22550681233406067,
"learning_rate": 7.515236497184965e-05,
"loss": 0.1764,
"step": 16665
},
{
"epoch": 2.099505620808011,
"grad_norm": 0.23124639689922333,
"learning_rate": 7.505712479267809e-05,
"loss": 0.1828,
"step": 16670
},
{
"epoch": 2.100135403218188,
"grad_norm": 0.2034096121788025,
"learning_rate": 7.496192485954254e-05,
"loss": 0.179,
"step": 16675
},
{
"epoch": 2.1007651856283656,
"grad_norm": 0.2237498164176941,
"learning_rate": 7.486676522356732e-05,
"loss": 0.1867,
"step": 16680
},
{
"epoch": 2.1013949680385426,
"grad_norm": 0.22583693265914917,
"learning_rate": 7.477164593585537e-05,
"loss": 0.1882,
"step": 16685
},
{
"epoch": 2.10202475044872,
"grad_norm": 0.20145735144615173,
"learning_rate": 7.467656704748792e-05,
"loss": 0.1749,
"step": 16690
},
{
"epoch": 2.1026545328588973,
"grad_norm": 0.204311341047287,
"learning_rate": 7.458152860952458e-05,
"loss": 0.1803,
"step": 16695
},
{
"epoch": 2.1032843152690743,
"grad_norm": 0.23768644034862518,
"learning_rate": 7.448653067300313e-05,
"loss": 0.1915,
"step": 16700
},
{
"epoch": 2.103914097679252,
"grad_norm": 0.21348991990089417,
"learning_rate": 7.439157328893961e-05,
"loss": 0.1778,
"step": 16705
},
{
"epoch": 2.104543880089429,
"grad_norm": 0.22427400946617126,
"learning_rate": 7.429665650832831e-05,
"loss": 0.1712,
"step": 16710
},
{
"epoch": 2.1051736624996065,
"grad_norm": 0.22512148320674896,
"learning_rate": 7.420178038214172e-05,
"loss": 0.1889,
"step": 16715
},
{
"epoch": 2.1058034449097836,
"grad_norm": 0.22715777158737183,
"learning_rate": 7.410694496133048e-05,
"loss": 0.1737,
"step": 16720
},
{
"epoch": 2.106433227319961,
"grad_norm": 0.2505483627319336,
"learning_rate": 7.401215029682339e-05,
"loss": 0.1809,
"step": 16725
},
{
"epoch": 2.107063009730138,
"grad_norm": 0.2218826860189438,
"learning_rate": 7.391739643952725e-05,
"loss": 0.1766,
"step": 16730
},
{
"epoch": 2.1076927921403157,
"grad_norm": 0.2085668295621872,
"learning_rate": 7.38226834403272e-05,
"loss": 0.1739,
"step": 16735
},
{
"epoch": 2.1083225745504928,
"grad_norm": 0.21690475940704346,
"learning_rate": 7.372801135008622e-05,
"loss": 0.1738,
"step": 16740
},
{
"epoch": 2.1089523569606703,
"grad_norm": 0.263988733291626,
"learning_rate": 7.363338021964545e-05,
"loss": 0.1951,
"step": 16745
},
{
"epoch": 2.1095821393708474,
"grad_norm": 0.24228844046592712,
"learning_rate": 7.353879009982377e-05,
"loss": 0.1775,
"step": 16750
},
{
"epoch": 2.1102119217810245,
"grad_norm": 0.2030615508556366,
"learning_rate": 7.344424104141843e-05,
"loss": 0.1754,
"step": 16755
},
{
"epoch": 2.110841704191202,
"grad_norm": 0.22505883872509003,
"learning_rate": 7.334973309520438e-05,
"loss": 0.1814,
"step": 16760
},
{
"epoch": 2.111471486601379,
"grad_norm": 0.28446871042251587,
"learning_rate": 7.32552663119345e-05,
"loss": 0.2009,
"step": 16765
},
{
"epoch": 2.1121012690115566,
"grad_norm": 0.2320084124803543,
"learning_rate": 7.316084074233968e-05,
"loss": 0.1866,
"step": 16770
},
{
"epoch": 2.1127310514217337,
"grad_norm": 0.23432306945323944,
"learning_rate": 7.306645643712851e-05,
"loss": 0.1838,
"step": 16775
},
{
"epoch": 2.113360833831911,
"grad_norm": 0.20252206921577454,
"learning_rate": 7.297211344698769e-05,
"loss": 0.1753,
"step": 16780
},
{
"epoch": 2.1139906162420883,
"grad_norm": 0.25251004099845886,
"learning_rate": 7.28778118225814e-05,
"loss": 0.1836,
"step": 16785
},
{
"epoch": 2.114620398652266,
"grad_norm": 0.2514311373233795,
"learning_rate": 7.278355161455176e-05,
"loss": 0.1838,
"step": 16790
},
{
"epoch": 2.115250181062443,
"grad_norm": 0.21513232588768005,
"learning_rate": 7.268933287351876e-05,
"loss": 0.1745,
"step": 16795
},
{
"epoch": 2.1158799634726204,
"grad_norm": 0.2200087606906891,
"learning_rate": 7.259515565007999e-05,
"loss": 0.1839,
"step": 16800
},
{
"epoch": 2.1165097458827975,
"grad_norm": 0.22383321821689606,
"learning_rate": 7.250101999481073e-05,
"loss": 0.1865,
"step": 16805
},
{
"epoch": 2.1171395282929746,
"grad_norm": 0.2382001131772995,
"learning_rate": 7.2406925958264e-05,
"loss": 0.1862,
"step": 16810
},
{
"epoch": 2.117769310703152,
"grad_norm": 0.2178415209054947,
"learning_rate": 7.231287359097045e-05,
"loss": 0.1799,
"step": 16815
},
{
"epoch": 2.118399093113329,
"grad_norm": 0.22616611421108246,
"learning_rate": 7.221886294343834e-05,
"loss": 0.1819,
"step": 16820
},
{
"epoch": 2.1190288755235067,
"grad_norm": 0.24810658395290375,
"learning_rate": 7.212489406615355e-05,
"loss": 0.181,
"step": 16825
},
{
"epoch": 2.119658657933684,
"grad_norm": 0.2408507764339447,
"learning_rate": 7.20309670095795e-05,
"loss": 0.1867,
"step": 16830
},
{
"epoch": 2.1202884403438613,
"grad_norm": 0.20721390843391418,
"learning_rate": 7.19370818241571e-05,
"loss": 0.175,
"step": 16835
},
{
"epoch": 2.1209182227540384,
"grad_norm": 0.22691728174686432,
"learning_rate": 7.184323856030497e-05,
"loss": 0.1753,
"step": 16840
},
{
"epoch": 2.121548005164216,
"grad_norm": 0.22788456082344055,
"learning_rate": 7.174943726841902e-05,
"loss": 0.1829,
"step": 16845
},
{
"epoch": 2.122177787574393,
"grad_norm": 0.21744227409362793,
"learning_rate": 7.165567799887268e-05,
"loss": 0.1797,
"step": 16850
},
{
"epoch": 2.1228075699845705,
"grad_norm": 0.211074560880661,
"learning_rate": 7.156196080201685e-05,
"loss": 0.1875,
"step": 16855
},
{
"epoch": 2.1234373523947476,
"grad_norm": 0.27859583497047424,
"learning_rate": 7.146828572817975e-05,
"loss": 0.1791,
"step": 16860
},
{
"epoch": 2.1240671348049247,
"grad_norm": 0.202862948179245,
"learning_rate": 7.13746528276671e-05,
"loss": 0.1752,
"step": 16865
},
{
"epoch": 2.1246969172151022,
"grad_norm": 0.2529730498790741,
"learning_rate": 7.128106215076187e-05,
"loss": 0.1734,
"step": 16870
},
{
"epoch": 2.1253266996252793,
"grad_norm": 0.22796177864074707,
"learning_rate": 7.118751374772433e-05,
"loss": 0.1807,
"step": 16875
},
{
"epoch": 2.125956482035457,
"grad_norm": 0.20112904906272888,
"learning_rate": 7.109400766879223e-05,
"loss": 0.1711,
"step": 16880
},
{
"epoch": 2.126586264445634,
"grad_norm": 0.22492708265781403,
"learning_rate": 7.100054396418048e-05,
"loss": 0.1784,
"step": 16885
},
{
"epoch": 2.1272160468558114,
"grad_norm": 0.25224363803863525,
"learning_rate": 7.09071226840811e-05,
"loss": 0.185,
"step": 16890
},
{
"epoch": 2.1278458292659885,
"grad_norm": 0.24734210968017578,
"learning_rate": 7.081374387866346e-05,
"loss": 0.1739,
"step": 16895
},
{
"epoch": 2.128475611676166,
"grad_norm": 0.21726474165916443,
"learning_rate": 7.07204075980742e-05,
"loss": 0.1695,
"step": 16900
},
{
"epoch": 2.129105394086343,
"grad_norm": 0.2073916345834732,
"learning_rate": 7.062711389243703e-05,
"loss": 0.1782,
"step": 16905
},
{
"epoch": 2.1297351764965207,
"grad_norm": 0.2361113578081131,
"learning_rate": 7.053386281185274e-05,
"loss": 0.1787,
"step": 16910
},
{
"epoch": 2.1303649589066977,
"grad_norm": 0.22586499154567719,
"learning_rate": 7.044065440639933e-05,
"loss": 0.1738,
"step": 16915
},
{
"epoch": 2.130994741316875,
"grad_norm": 0.23469188809394836,
"learning_rate": 7.034748872613184e-05,
"loss": 0.1805,
"step": 16920
},
{
"epoch": 2.1316245237270524,
"grad_norm": 0.1897682100534439,
"learning_rate": 7.025436582108234e-05,
"loss": 0.171,
"step": 16925
},
{
"epoch": 2.1322543061372294,
"grad_norm": 0.22100795805454254,
"learning_rate": 7.016128574126e-05,
"loss": 0.1736,
"step": 16930
},
{
"epoch": 2.132884088547407,
"grad_norm": 0.2332223504781723,
"learning_rate": 7.006824853665085e-05,
"loss": 0.1729,
"step": 16935
},
{
"epoch": 2.133513870957584,
"grad_norm": 0.23929065465927124,
"learning_rate": 6.997525425721814e-05,
"loss": 0.1736,
"step": 16940
},
{
"epoch": 2.1341436533677616,
"grad_norm": 0.26240813732147217,
"learning_rate": 6.988230295290185e-05,
"loss": 0.1798,
"step": 16945
},
{
"epoch": 2.1347734357779387,
"grad_norm": 0.22387517988681793,
"learning_rate": 6.978939467361895e-05,
"loss": 0.1734,
"step": 16950
},
{
"epoch": 2.135403218188116,
"grad_norm": 0.246952623128891,
"learning_rate": 6.969652946926332e-05,
"loss": 0.1834,
"step": 16955
},
{
"epoch": 2.1360330005982933,
"grad_norm": 0.25226834416389465,
"learning_rate": 6.960370738970568e-05,
"loss": 0.1798,
"step": 16960
},
{
"epoch": 2.136662783008471,
"grad_norm": 0.22118602693080902,
"learning_rate": 6.951092848479364e-05,
"loss": 0.1863,
"step": 16965
},
{
"epoch": 2.137292565418648,
"grad_norm": 0.2567583918571472,
"learning_rate": 6.941819280435155e-05,
"loss": 0.1828,
"step": 16970
},
{
"epoch": 2.137922347828825,
"grad_norm": 0.28791603446006775,
"learning_rate": 6.93255003981806e-05,
"loss": 0.1817,
"step": 16975
},
{
"epoch": 2.1385521302390025,
"grad_norm": 0.2655430734157562,
"learning_rate": 6.923285131605871e-05,
"loss": 0.1789,
"step": 16980
},
{
"epoch": 2.1391819126491796,
"grad_norm": 0.24513307213783264,
"learning_rate": 6.914024560774061e-05,
"loss": 0.1885,
"step": 16985
},
{
"epoch": 2.139811695059357,
"grad_norm": 0.211643248796463,
"learning_rate": 6.904768332295772e-05,
"loss": 0.188,
"step": 16990
},
{
"epoch": 2.140441477469534,
"grad_norm": 0.2373894900083542,
"learning_rate": 6.895516451141791e-05,
"loss": 0.1819,
"step": 16995
},
{
"epoch": 2.1410712598797117,
"grad_norm": 0.22991600632667542,
"learning_rate": 6.88626892228061e-05,
"loss": 0.189,
"step": 17000
},
{
"epoch": 2.1410712598797117,
"eval_loss": 0.3501429557800293,
"eval_runtime": 6.1606,
"eval_samples_per_second": 162.322,
"eval_steps_per_second": 10.226,
"step": 17000
},
{
"epoch": 2.141701042289889,
"grad_norm": 0.23578788340091705,
"learning_rate": 6.877025750678352e-05,
"loss": 0.1804,
"step": 17005
},
{
"epoch": 2.1423308247000663,
"grad_norm": 0.20814631879329681,
"learning_rate": 6.867786941298816e-05,
"loss": 0.1776,
"step": 17010
},
{
"epoch": 2.1429606071102434,
"grad_norm": 0.24113385379314423,
"learning_rate": 6.858552499103451e-05,
"loss": 0.171,
"step": 17015
},
{
"epoch": 2.143590389520421,
"grad_norm": 0.2317270189523697,
"learning_rate": 6.84932242905136e-05,
"loss": 0.1881,
"step": 17020
},
{
"epoch": 2.144220171930598,
"grad_norm": 0.26681753993034363,
"learning_rate": 6.840096736099314e-05,
"loss": 0.1792,
"step": 17025
},
{
"epoch": 2.144849954340775,
"grad_norm": 0.2119479924440384,
"learning_rate": 6.83087542520171e-05,
"loss": 0.178,
"step": 17030
},
{
"epoch": 2.1454797367509526,
"grad_norm": 0.20759105682373047,
"learning_rate": 6.821658501310604e-05,
"loss": 0.1754,
"step": 17035
},
{
"epoch": 2.1461095191611297,
"grad_norm": 0.23515643179416656,
"learning_rate": 6.812445969375691e-05,
"loss": 0.1854,
"step": 17040
},
{
"epoch": 2.146739301571307,
"grad_norm": 0.20694191753864288,
"learning_rate": 6.803237834344322e-05,
"loss": 0.1801,
"step": 17045
},
{
"epoch": 2.1473690839814843,
"grad_norm": 0.21541932225227356,
"learning_rate": 6.794034101161469e-05,
"loss": 0.1752,
"step": 17050
},
{
"epoch": 2.147998866391662,
"grad_norm": 0.20586980879306793,
"learning_rate": 6.784834774769748e-05,
"loss": 0.1803,
"step": 17055
},
{
"epoch": 2.148628648801839,
"grad_norm": 0.23750190436840057,
"learning_rate": 6.775639860109406e-05,
"loss": 0.1842,
"step": 17060
},
{
"epoch": 2.1492584312120164,
"grad_norm": 0.2041424959897995,
"learning_rate": 6.766449362118324e-05,
"loss": 0.1729,
"step": 17065
},
{
"epoch": 2.1498882136221935,
"grad_norm": 0.24630430340766907,
"learning_rate": 6.757263285732009e-05,
"loss": 0.1821,
"step": 17070
},
{
"epoch": 2.150517996032371,
"grad_norm": 0.23113587498664856,
"learning_rate": 6.748081635883594e-05,
"loss": 0.1821,
"step": 17075
},
{
"epoch": 2.151147778442548,
"grad_norm": 0.203240305185318,
"learning_rate": 6.738904417503829e-05,
"loss": 0.1767,
"step": 17080
},
{
"epoch": 2.151777560852725,
"grad_norm": 0.2500320374965668,
"learning_rate": 6.7297316355211e-05,
"loss": 0.1852,
"step": 17085
},
{
"epoch": 2.1524073432629027,
"grad_norm": 0.2349621206521988,
"learning_rate": 6.720563294861403e-05,
"loss": 0.1764,
"step": 17090
},
{
"epoch": 2.15303712567308,
"grad_norm": 0.2351408451795578,
"learning_rate": 6.71139940044833e-05,
"loss": 0.1835,
"step": 17095
},
{
"epoch": 2.1536669080832573,
"grad_norm": 0.2078278511762619,
"learning_rate": 6.702239957203108e-05,
"loss": 0.1783,
"step": 17100
},
{
"epoch": 2.1542966904934344,
"grad_norm": 0.23805204033851624,
"learning_rate": 6.693084970044574e-05,
"loss": 0.1858,
"step": 17105
},
{
"epoch": 2.154926472903612,
"grad_norm": 0.22789132595062256,
"learning_rate": 6.683934443889161e-05,
"loss": 0.1839,
"step": 17110
},
{
"epoch": 2.155556255313789,
"grad_norm": 0.27035263180732727,
"learning_rate": 6.674788383650911e-05,
"loss": 0.1878,
"step": 17115
},
{
"epoch": 2.1561860377239666,
"grad_norm": 0.21787506341934204,
"learning_rate": 6.665646794241468e-05,
"loss": 0.1854,
"step": 17120
},
{
"epoch": 2.1568158201341436,
"grad_norm": 0.2302270233631134,
"learning_rate": 6.656509680570073e-05,
"loss": 0.1822,
"step": 17125
},
{
"epoch": 2.157445602544321,
"grad_norm": 0.21228045225143433,
"learning_rate": 6.647377047543563e-05,
"loss": 0.1855,
"step": 17130
},
{
"epoch": 2.1580753849544982,
"grad_norm": 0.22131386399269104,
"learning_rate": 6.638248900066375e-05,
"loss": 0.1763,
"step": 17135
},
{
"epoch": 2.1587051673646753,
"grad_norm": 0.2691584527492523,
"learning_rate": 6.629125243040524e-05,
"loss": 0.1815,
"step": 17140
},
{
"epoch": 2.159334949774853,
"grad_norm": 0.22926035523414612,
"learning_rate": 6.620006081365634e-05,
"loss": 0.1833,
"step": 17145
},
{
"epoch": 2.15996473218503,
"grad_norm": 0.20654956996440887,
"learning_rate": 6.610891419938899e-05,
"loss": 0.1755,
"step": 17150
},
{
"epoch": 2.1605945145952075,
"grad_norm": 0.22390377521514893,
"learning_rate": 6.601781263655096e-05,
"loss": 0.1839,
"step": 17155
},
{
"epoch": 2.1612242970053845,
"grad_norm": 0.23877164721488953,
"learning_rate": 6.592675617406593e-05,
"loss": 0.1739,
"step": 17160
},
{
"epoch": 2.161854079415562,
"grad_norm": 0.24347762763500214,
"learning_rate": 6.583574486083325e-05,
"loss": 0.1863,
"step": 17165
},
{
"epoch": 2.162483861825739,
"grad_norm": 0.23407521843910217,
"learning_rate": 6.574477874572811e-05,
"loss": 0.1741,
"step": 17170
},
{
"epoch": 2.1631136442359167,
"grad_norm": 0.23338505625724792,
"learning_rate": 6.565385787760137e-05,
"loss": 0.1754,
"step": 17175
},
{
"epoch": 2.1637434266460938,
"grad_norm": 0.2206541895866394,
"learning_rate": 6.556298230527962e-05,
"loss": 0.1706,
"step": 17180
},
{
"epoch": 2.1643732090562713,
"grad_norm": 0.20819810032844543,
"learning_rate": 6.547215207756504e-05,
"loss": 0.1735,
"step": 17185
},
{
"epoch": 2.1650029914664484,
"grad_norm": 0.22891941666603088,
"learning_rate": 6.53813672432357e-05,
"loss": 0.187,
"step": 17190
},
{
"epoch": 2.1656327738766254,
"grad_norm": 0.2094859778881073,
"learning_rate": 6.52906278510451e-05,
"loss": 0.1795,
"step": 17195
},
{
"epoch": 2.166262556286803,
"grad_norm": 0.20969723165035248,
"learning_rate": 6.519993394972219e-05,
"loss": 0.1679,
"step": 17200
},
{
"epoch": 2.16689233869698,
"grad_norm": 0.25252285599708557,
"learning_rate": 6.510928558797185e-05,
"loss": 0.183,
"step": 17205
},
{
"epoch": 2.1675221211071576,
"grad_norm": 0.22556447982788086,
"learning_rate": 6.501868281447424e-05,
"loss": 0.1694,
"step": 17210
},
{
"epoch": 2.1681519035173347,
"grad_norm": 0.2429586797952652,
"learning_rate": 6.492812567788516e-05,
"loss": 0.18,
"step": 17215
},
{
"epoch": 2.168781685927512,
"grad_norm": 0.2400483787059784,
"learning_rate": 6.483761422683582e-05,
"loss": 0.1818,
"step": 17220
},
{
"epoch": 2.1694114683376893,
"grad_norm": 0.228154718875885,
"learning_rate": 6.47471485099329e-05,
"loss": 0.1744,
"step": 17225
},
{
"epoch": 2.170041250747867,
"grad_norm": 0.21748559176921844,
"learning_rate": 6.465672857575875e-05,
"loss": 0.1765,
"step": 17230
},
{
"epoch": 2.170671033158044,
"grad_norm": 0.2296319603919983,
"learning_rate": 6.456635447287073e-05,
"loss": 0.1881,
"step": 17235
},
{
"epoch": 2.1713008155682214,
"grad_norm": 0.2402602881193161,
"learning_rate": 6.447602624980186e-05,
"loss": 0.1769,
"step": 17240
},
{
"epoch": 2.1719305979783985,
"grad_norm": 0.2783866226673126,
"learning_rate": 6.438574395506043e-05,
"loss": 0.1836,
"step": 17245
},
{
"epoch": 2.1725603803885756,
"grad_norm": 0.20301677286624908,
"learning_rate": 6.429550763713017e-05,
"loss": 0.1655,
"step": 17250
},
{
"epoch": 2.173190162798753,
"grad_norm": 0.21163971722126007,
"learning_rate": 6.420531734447e-05,
"loss": 0.1764,
"step": 17255
},
{
"epoch": 2.17381994520893,
"grad_norm": 0.24942253530025482,
"learning_rate": 6.41151731255142e-05,
"loss": 0.1833,
"step": 17260
},
{
"epoch": 2.1744497276191077,
"grad_norm": 0.22958967089653015,
"learning_rate": 6.402507502867222e-05,
"loss": 0.1703,
"step": 17265
},
{
"epoch": 2.175079510029285,
"grad_norm": 0.21424312889575958,
"learning_rate": 6.393502310232886e-05,
"loss": 0.1757,
"step": 17270
},
{
"epoch": 2.1757092924394623,
"grad_norm": 0.20825864374637604,
"learning_rate": 6.384501739484401e-05,
"loss": 0.1715,
"step": 17275
},
{
"epoch": 2.1763390748496394,
"grad_norm": 0.21387939155101776,
"learning_rate": 6.375505795455281e-05,
"loss": 0.1697,
"step": 17280
},
{
"epoch": 2.176968857259817,
"grad_norm": 0.2073564976453781,
"learning_rate": 6.366514482976546e-05,
"loss": 0.1846,
"step": 17285
},
{
"epoch": 2.177598639669994,
"grad_norm": 0.21405762434005737,
"learning_rate": 6.35752780687675e-05,
"loss": 0.1777,
"step": 17290
},
{
"epoch": 2.1782284220801715,
"grad_norm": 0.22343981266021729,
"learning_rate": 6.348545771981938e-05,
"loss": 0.1801,
"step": 17295
},
{
"epoch": 2.1788582044903486,
"grad_norm": 0.22697073221206665,
"learning_rate": 6.339568383115668e-05,
"loss": 0.1829,
"step": 17300
},
{
"epoch": 2.1794879869005257,
"grad_norm": 0.2561056613922119,
"learning_rate": 6.330595645098996e-05,
"loss": 0.185,
"step": 17305
},
{
"epoch": 2.1801177693107032,
"grad_norm": 0.2563771903514862,
"learning_rate": 6.321627562750495e-05,
"loss": 0.1752,
"step": 17310
},
{
"epoch": 2.1807475517208803,
"grad_norm": 0.21171104907989502,
"learning_rate": 6.312664140886228e-05,
"loss": 0.166,
"step": 17315
},
{
"epoch": 2.181377334131058,
"grad_norm": 0.23899543285369873,
"learning_rate": 6.303705384319757e-05,
"loss": 0.1828,
"step": 17320
},
{
"epoch": 2.182007116541235,
"grad_norm": 0.26108884811401367,
"learning_rate": 6.29475129786214e-05,
"loss": 0.1829,
"step": 17325
},
{
"epoch": 2.1826368989514124,
"grad_norm": 0.2397276908159256,
"learning_rate": 6.285801886321919e-05,
"loss": 0.1733,
"step": 17330
},
{
"epoch": 2.1832666813615895,
"grad_norm": 0.22638286650180817,
"learning_rate": 6.27685715450515e-05,
"loss": 0.1719,
"step": 17335
},
{
"epoch": 2.183896463771767,
"grad_norm": 0.2424623966217041,
"learning_rate": 6.26791710721534e-05,
"loss": 0.1749,
"step": 17340
},
{
"epoch": 2.184526246181944,
"grad_norm": 0.23895704746246338,
"learning_rate": 6.2589817492535e-05,
"loss": 0.178,
"step": 17345
},
{
"epoch": 2.1851560285921217,
"grad_norm": 0.2223139852285385,
"learning_rate": 6.250051085418133e-05,
"loss": 0.1872,
"step": 17350
},
{
"epoch": 2.1857858110022987,
"grad_norm": 0.22255347669124603,
"learning_rate": 6.241125120505204e-05,
"loss": 0.1791,
"step": 17355
},
{
"epoch": 2.186415593412476,
"grad_norm": 0.23792186379432678,
"learning_rate": 6.232203859308157e-05,
"loss": 0.1738,
"step": 17360
},
{
"epoch": 2.1870453758226533,
"grad_norm": 0.24884961545467377,
"learning_rate": 6.223287306617915e-05,
"loss": 0.1778,
"step": 17365
},
{
"epoch": 2.1876751582328304,
"grad_norm": 0.2130117118358612,
"learning_rate": 6.214375467222873e-05,
"loss": 0.1666,
"step": 17370
},
{
"epoch": 2.188304940643008,
"grad_norm": 0.20538979768753052,
"learning_rate": 6.205468345908888e-05,
"loss": 0.1716,
"step": 17375
},
{
"epoch": 2.188934723053185,
"grad_norm": 0.2519354224205017,
"learning_rate": 6.196565947459292e-05,
"loss": 0.1885,
"step": 17380
},
{
"epoch": 2.1895645054633626,
"grad_norm": 0.2644721567630768,
"learning_rate": 6.187668276654872e-05,
"loss": 0.1923,
"step": 17385
},
{
"epoch": 2.1901942878735396,
"grad_norm": 0.22676245868206024,
"learning_rate": 6.178775338273876e-05,
"loss": 0.1745,
"step": 17390
},
{
"epoch": 2.190824070283717,
"grad_norm": 0.21329110860824585,
"learning_rate": 6.169887137092029e-05,
"loss": 0.1782,
"step": 17395
},
{
"epoch": 2.1914538526938943,
"grad_norm": 0.2096760869026184,
"learning_rate": 6.161003677882489e-05,
"loss": 0.1705,
"step": 17400
},
{
"epoch": 2.1920836351040713,
"grad_norm": 0.20192061364650726,
"learning_rate": 6.15212496541588e-05,
"loss": 0.1662,
"step": 17405
},
{
"epoch": 2.192713417514249,
"grad_norm": 0.2351575493812561,
"learning_rate": 6.14325100446027e-05,
"loss": 0.1716,
"step": 17410
},
{
"epoch": 2.193343199924426,
"grad_norm": 0.23202987015247345,
"learning_rate": 6.13438179978118e-05,
"loss": 0.1848,
"step": 17415
},
{
"epoch": 2.1939729823346035,
"grad_norm": 0.22229251265525818,
"learning_rate": 6.125517356141576e-05,
"loss": 0.1757,
"step": 17420
},
{
"epoch": 2.1946027647447806,
"grad_norm": 0.20741891860961914,
"learning_rate": 6.116657678301868e-05,
"loss": 0.1804,
"step": 17425
},
{
"epoch": 2.195232547154958,
"grad_norm": 0.2023356705904007,
"learning_rate": 6.107802771019895e-05,
"loss": 0.168,
"step": 17430
},
{
"epoch": 2.195862329565135,
"grad_norm": 0.30032244324684143,
"learning_rate": 6.098952639050961e-05,
"loss": 0.176,
"step": 17435
},
{
"epoch": 2.1964921119753127,
"grad_norm": 0.2093886286020279,
"learning_rate": 6.090107287147786e-05,
"loss": 0.171,
"step": 17440
},
{
"epoch": 2.1971218943854898,
"grad_norm": 0.20918086171150208,
"learning_rate": 6.081266720060517e-05,
"loss": 0.1705,
"step": 17445
},
{
"epoch": 2.1977516767956673,
"grad_norm": 0.2089412659406662,
"learning_rate": 6.072430942536737e-05,
"loss": 0.1797,
"step": 17450
},
{
"epoch": 2.1983814592058444,
"grad_norm": 0.2460128515958786,
"learning_rate": 6.0635999593214765e-05,
"loss": 0.1752,
"step": 17455
},
{
"epoch": 2.1990112416160215,
"grad_norm": 0.25952646136283875,
"learning_rate": 6.0547737751571654e-05,
"loss": 0.1784,
"step": 17460
},
{
"epoch": 2.199641024026199,
"grad_norm": 0.2011132687330246,
"learning_rate": 6.0459523947836674e-05,
"loss": 0.1714,
"step": 17465
},
{
"epoch": 2.200270806436376,
"grad_norm": 0.19077162444591522,
"learning_rate": 6.03713582293826e-05,
"loss": 0.174,
"step": 17470
},
{
"epoch": 2.2009005888465536,
"grad_norm": 0.22354647517204285,
"learning_rate": 6.02832406435566e-05,
"loss": 0.1754,
"step": 17475
},
{
"epoch": 2.2015303712567307,
"grad_norm": 0.22434799373149872,
"learning_rate": 6.019517123767968e-05,
"loss": 0.1747,
"step": 17480
},
{
"epoch": 2.202160153666908,
"grad_norm": 0.22911998629570007,
"learning_rate": 6.010715005904716e-05,
"loss": 0.1812,
"step": 17485
},
{
"epoch": 2.2027899360770853,
"grad_norm": 0.23919759690761566,
"learning_rate": 6.0019177154928364e-05,
"loss": 0.1771,
"step": 17490
},
{
"epoch": 2.203419718487263,
"grad_norm": 0.21539629995822906,
"learning_rate": 5.993125257256687e-05,
"loss": 0.1799,
"step": 17495
},
{
"epoch": 2.20404950089744,
"grad_norm": 0.22069337964057922,
"learning_rate": 5.984337635918014e-05,
"loss": 0.177,
"step": 17500
},
{
"epoch": 2.2046792833076174,
"grad_norm": 0.20763671398162842,
"learning_rate": 5.97555485619597e-05,
"loss": 0.1664,
"step": 17505
},
{
"epoch": 2.2053090657177945,
"grad_norm": 0.1950199007987976,
"learning_rate": 5.966776922807109e-05,
"loss": 0.1648,
"step": 17510
},
{
"epoch": 2.2059388481279716,
"grad_norm": 0.25142478942871094,
"learning_rate": 5.95800384046538e-05,
"loss": 0.1754,
"step": 17515
},
{
"epoch": 2.206568630538149,
"grad_norm": 0.2232702225446701,
"learning_rate": 5.94923561388213e-05,
"loss": 0.1716,
"step": 17520
},
{
"epoch": 2.207198412948326,
"grad_norm": 0.27657321095466614,
"learning_rate": 5.940472247766097e-05,
"loss": 0.1878,
"step": 17525
},
{
"epoch": 2.2078281953585037,
"grad_norm": 0.21436183154582977,
"learning_rate": 5.9317137468234083e-05,
"loss": 0.1727,
"step": 17530
},
{
"epoch": 2.208457977768681,
"grad_norm": 0.19741742312908173,
"learning_rate": 5.9229601157575744e-05,
"loss": 0.1694,
"step": 17535
},
{
"epoch": 2.2090877601788583,
"grad_norm": 0.2042321413755417,
"learning_rate": 5.914211359269509e-05,
"loss": 0.17,
"step": 17540
},
{
"epoch": 2.2097175425890354,
"grad_norm": 0.21126088500022888,
"learning_rate": 5.9054674820574814e-05,
"loss": 0.1703,
"step": 17545
},
{
"epoch": 2.210347324999213,
"grad_norm": 0.20463821291923523,
"learning_rate": 5.896728488817151e-05,
"loss": 0.172,
"step": 17550
},
{
"epoch": 2.21097710740939,
"grad_norm": 0.204604834318161,
"learning_rate": 5.887994384241569e-05,
"loss": 0.1723,
"step": 17555
},
{
"epoch": 2.2116068898195675,
"grad_norm": 0.18806815147399902,
"learning_rate": 5.879265173021141e-05,
"loss": 0.161,
"step": 17560
},
{
"epoch": 2.2122366722297446,
"grad_norm": 0.22745926678180695,
"learning_rate": 5.870540859843656e-05,
"loss": 0.1653,
"step": 17565
},
{
"epoch": 2.2128664546399217,
"grad_norm": 0.1888933777809143,
"learning_rate": 5.8618214493942675e-05,
"loss": 0.1685,
"step": 17570
},
{
"epoch": 2.2134962370500992,
"grad_norm": 0.19480280578136444,
"learning_rate": 5.853106946355501e-05,
"loss": 0.1676,
"step": 17575
},
{
"epoch": 2.2141260194602763,
"grad_norm": 0.2703428864479065,
"learning_rate": 5.8443973554072383e-05,
"loss": 0.1788,
"step": 17580
},
{
"epoch": 2.214755801870454,
"grad_norm": 0.21035927534103394,
"learning_rate": 5.8356926812267335e-05,
"loss": 0.1806,
"step": 17585
},
{
"epoch": 2.215385584280631,
"grad_norm": 0.21794281899929047,
"learning_rate": 5.826992928488594e-05,
"loss": 0.1641,
"step": 17590
},
{
"epoch": 2.2160153666908085,
"grad_norm": 0.2512260675430298,
"learning_rate": 5.818298101864779e-05,
"loss": 0.1697,
"step": 17595
},
{
"epoch": 2.2166451491009855,
"grad_norm": 0.2089598923921585,
"learning_rate": 5.8096082060246226e-05,
"loss": 0.1656,
"step": 17600
},
{
"epoch": 2.217274931511163,
"grad_norm": 0.2160467952489853,
"learning_rate": 5.80092324563479e-05,
"loss": 0.185,
"step": 17605
},
{
"epoch": 2.21790471392134,
"grad_norm": 0.20858334004878998,
"learning_rate": 5.7922432253593025e-05,
"loss": 0.1721,
"step": 17610
},
{
"epoch": 2.2185344963315172,
"grad_norm": 0.2090991735458374,
"learning_rate": 5.7835681498595327e-05,
"loss": 0.1706,
"step": 17615
},
{
"epoch": 2.2191642787416948,
"grad_norm": 0.21040284633636475,
"learning_rate": 5.77489802379419e-05,
"loss": 0.1789,
"step": 17620
},
{
"epoch": 2.219794061151872,
"grad_norm": 0.22497640550136566,
"learning_rate": 5.766232851819332e-05,
"loss": 0.1779,
"step": 17625
},
{
"epoch": 2.2204238435620494,
"grad_norm": 0.2845938801765442,
"learning_rate": 5.757572638588356e-05,
"loss": 0.1771,
"step": 17630
},
{
"epoch": 2.2210536259722264,
"grad_norm": 0.21166571974754333,
"learning_rate": 5.748917388751985e-05,
"loss": 0.1741,
"step": 17635
},
{
"epoch": 2.221683408382404,
"grad_norm": 0.26706454157829285,
"learning_rate": 5.7402671069583004e-05,
"loss": 0.1715,
"step": 17640
},
{
"epoch": 2.222313190792581,
"grad_norm": 0.2745297849178314,
"learning_rate": 5.731621797852698e-05,
"loss": 0.1843,
"step": 17645
},
{
"epoch": 2.2229429732027586,
"grad_norm": 0.2507629990577698,
"learning_rate": 5.7229814660778985e-05,
"loss": 0.186,
"step": 17650
},
{
"epoch": 2.2235727556129357,
"grad_norm": 0.21768365800380707,
"learning_rate": 5.7143461162739545e-05,
"loss": 0.1731,
"step": 17655
},
{
"epoch": 2.224202538023113,
"grad_norm": 0.22099876403808594,
"learning_rate": 5.705715753078259e-05,
"loss": 0.1802,
"step": 17660
},
{
"epoch": 2.2248323204332903,
"grad_norm": 0.20643608272075653,
"learning_rate": 5.697090381125507e-05,
"loss": 0.1769,
"step": 17665
},
{
"epoch": 2.2254621028434673,
"grad_norm": 0.2723044455051422,
"learning_rate": 5.688470005047722e-05,
"loss": 0.1882,
"step": 17670
},
{
"epoch": 2.226091885253645,
"grad_norm": 0.23548351228237152,
"learning_rate": 5.679854629474238e-05,
"loss": 0.1702,
"step": 17675
},
{
"epoch": 2.226721667663822,
"grad_norm": 0.24578404426574707,
"learning_rate": 5.671244259031722e-05,
"loss": 0.1736,
"step": 17680
},
{
"epoch": 2.2273514500739995,
"grad_norm": 0.21030524373054504,
"learning_rate": 5.662638898344125e-05,
"loss": 0.1711,
"step": 17685
},
{
"epoch": 2.2279812324841766,
"grad_norm": 0.24249999225139618,
"learning_rate": 5.6540385520327275e-05,
"loss": 0.1742,
"step": 17690
},
{
"epoch": 2.228611014894354,
"grad_norm": 0.23971515893936157,
"learning_rate": 5.645443224716106e-05,
"loss": 0.1655,
"step": 17695
},
{
"epoch": 2.229240797304531,
"grad_norm": 0.2133120596408844,
"learning_rate": 5.636852921010161e-05,
"loss": 0.1786,
"step": 17700
},
{
"epoch": 2.2298705797147087,
"grad_norm": 0.23475615680217743,
"learning_rate": 5.628267645528073e-05,
"loss": 0.1753,
"step": 17705
},
{
"epoch": 2.230500362124886,
"grad_norm": 0.22111907601356506,
"learning_rate": 5.619687402880332e-05,
"loss": 0.1617,
"step": 17710
},
{
"epoch": 2.2311301445350633,
"grad_norm": 0.2323450744152069,
"learning_rate": 5.611112197674725e-05,
"loss": 0.167,
"step": 17715
},
{
"epoch": 2.2317599269452404,
"grad_norm": 0.18698996305465698,
"learning_rate": 5.602542034516333e-05,
"loss": 0.1632,
"step": 17720
},
{
"epoch": 2.2323897093554175,
"grad_norm": 0.2252064198255539,
"learning_rate": 5.5939769180075286e-05,
"loss": 0.1709,
"step": 17725
},
{
"epoch": 2.233019491765595,
"grad_norm": 0.2561705410480499,
"learning_rate": 5.5854168527479756e-05,
"loss": 0.1826,
"step": 17730
},
{
"epoch": 2.233649274175772,
"grad_norm": 0.2448531985282898,
"learning_rate": 5.576861843334625e-05,
"loss": 0.1819,
"step": 17735
},
{
"epoch": 2.2342790565859496,
"grad_norm": 0.238671213388443,
"learning_rate": 5.568311894361707e-05,
"loss": 0.1839,
"step": 17740
},
{
"epoch": 2.2349088389961267,
"grad_norm": 0.22651298344135284,
"learning_rate": 5.5597670104207485e-05,
"loss": 0.172,
"step": 17745
},
{
"epoch": 2.235538621406304,
"grad_norm": 0.23881249129772186,
"learning_rate": 5.551227196100549e-05,
"loss": 0.1698,
"step": 17750
},
{
"epoch": 2.2361684038164813,
"grad_norm": 0.23065873980522156,
"learning_rate": 5.542692455987167e-05,
"loss": 0.1727,
"step": 17755
},
{
"epoch": 2.236798186226659,
"grad_norm": 0.19607169926166534,
"learning_rate": 5.534162794663969e-05,
"loss": 0.1719,
"step": 17760
},
{
"epoch": 2.237427968636836,
"grad_norm": 0.2033766806125641,
"learning_rate": 5.525638216711573e-05,
"loss": 0.171,
"step": 17765
},
{
"epoch": 2.2380577510470134,
"grad_norm": 0.20412589609622955,
"learning_rate": 5.5171187267078733e-05,
"loss": 0.1633,
"step": 17770
},
{
"epoch": 2.2386875334571905,
"grad_norm": 0.21895913779735565,
"learning_rate": 5.508604329228028e-05,
"loss": 0.1801,
"step": 17775
},
{
"epoch": 2.2393173158673676,
"grad_norm": 0.19198501110076904,
"learning_rate": 5.50009502884446e-05,
"loss": 0.1764,
"step": 17780
},
{
"epoch": 2.239947098277545,
"grad_norm": 0.21897682547569275,
"learning_rate": 5.4915908301268724e-05,
"loss": 0.1719,
"step": 17785
},
{
"epoch": 2.240576880687722,
"grad_norm": 0.22070536017417908,
"learning_rate": 5.483091737642198e-05,
"loss": 0.1678,
"step": 17790
},
{
"epoch": 2.2412066630978997,
"grad_norm": 0.2158748209476471,
"learning_rate": 5.474597755954651e-05,
"loss": 0.1703,
"step": 17795
},
{
"epoch": 2.241836445508077,
"grad_norm": 0.21174906194210052,
"learning_rate": 5.466108889625687e-05,
"loss": 0.1698,
"step": 17800
},
{
"epoch": 2.2424662279182543,
"grad_norm": 0.23331063985824585,
"learning_rate": 5.457625143214029e-05,
"loss": 0.1855,
"step": 17805
},
{
"epoch": 2.2430960103284314,
"grad_norm": 0.2186896651983261,
"learning_rate": 5.449146521275643e-05,
"loss": 0.1629,
"step": 17810
},
{
"epoch": 2.243725792738609,
"grad_norm": 0.22406966984272003,
"learning_rate": 5.440673028363738e-05,
"loss": 0.1731,
"step": 17815
},
{
"epoch": 2.244355575148786,
"grad_norm": 0.21894322335720062,
"learning_rate": 5.432204669028777e-05,
"loss": 0.1671,
"step": 17820
},
{
"epoch": 2.2449853575589636,
"grad_norm": 0.19151312112808228,
"learning_rate": 5.4237414478184585e-05,
"loss": 0.1634,
"step": 17825
},
{
"epoch": 2.2456151399691406,
"grad_norm": 0.20597226917743683,
"learning_rate": 5.415283369277729e-05,
"loss": 0.1594,
"step": 17830
},
{
"epoch": 2.2462449223793177,
"grad_norm": 0.23415236175060272,
"learning_rate": 5.406830437948767e-05,
"loss": 0.1667,
"step": 17835
},
{
"epoch": 2.2468747047894952,
"grad_norm": 0.21160747110843658,
"learning_rate": 5.398382658370986e-05,
"loss": 0.1694,
"step": 17840
},
{
"epoch": 2.2475044871996723,
"grad_norm": 0.2644958198070526,
"learning_rate": 5.3899400350810466e-05,
"loss": 0.1767,
"step": 17845
},
{
"epoch": 2.24813426960985,
"grad_norm": 0.23654960095882416,
"learning_rate": 5.381502572612826e-05,
"loss": 0.1684,
"step": 17850
},
{
"epoch": 2.248764052020027,
"grad_norm": 0.22581151127815247,
"learning_rate": 5.373070275497439e-05,
"loss": 0.1805,
"step": 17855
},
{
"epoch": 2.2493938344302045,
"grad_norm": 0.21524479985237122,
"learning_rate": 5.364643148263205e-05,
"loss": 0.1753,
"step": 17860
},
{
"epoch": 2.2500236168403815,
"grad_norm": 0.22853802144527435,
"learning_rate": 5.3562211954357006e-05,
"loss": 0.1752,
"step": 17865
},
{
"epoch": 2.250653399250559,
"grad_norm": 0.19708101451396942,
"learning_rate": 5.347804421537701e-05,
"loss": 0.1701,
"step": 17870
},
{
"epoch": 2.251283181660736,
"grad_norm": 0.22857971489429474,
"learning_rate": 5.339392831089209e-05,
"loss": 0.1662,
"step": 17875
},
{
"epoch": 2.2519129640709137,
"grad_norm": 0.2373005598783493,
"learning_rate": 5.33098642860743e-05,
"loss": 0.1878,
"step": 17880
},
{
"epoch": 2.2525427464810908,
"grad_norm": 0.22458739578723907,
"learning_rate": 5.322585218606811e-05,
"loss": 0.1711,
"step": 17885
},
{
"epoch": 2.253172528891268,
"grad_norm": 0.24684731662273407,
"learning_rate": 5.314189205598987e-05,
"loss": 0.1833,
"step": 17890
},
{
"epoch": 2.2538023113014454,
"grad_norm": 0.22604569792747498,
"learning_rate": 5.3057983940928046e-05,
"loss": 0.1683,
"step": 17895
},
{
"epoch": 2.2544320937116225,
"grad_norm": 0.23015649616718292,
"learning_rate": 5.2974127885943166e-05,
"loss": 0.1793,
"step": 17900
},
{
"epoch": 2.2550618761218,
"grad_norm": 0.2156984657049179,
"learning_rate": 5.289032393606797e-05,
"loss": 0.1816,
"step": 17905
},
{
"epoch": 2.255691658531977,
"grad_norm": 0.2468300610780716,
"learning_rate": 5.280657213630704e-05,
"loss": 0.1795,
"step": 17910
},
{
"epoch": 2.2563214409421546,
"grad_norm": 0.19326730072498322,
"learning_rate": 5.2722872531637024e-05,
"loss": 0.1726,
"step": 17915
},
{
"epoch": 2.2569512233523317,
"grad_norm": 0.19111455976963043,
"learning_rate": 5.2639225167006475e-05,
"loss": 0.1709,
"step": 17920
},
{
"epoch": 2.257581005762509,
"grad_norm": 0.24302569031715393,
"learning_rate": 5.255563008733599e-05,
"loss": 0.1752,
"step": 17925
},
{
"epoch": 2.2582107881726863,
"grad_norm": 0.20797547698020935,
"learning_rate": 5.247208733751801e-05,
"loss": 0.1792,
"step": 17930
},
{
"epoch": 2.258840570582864,
"grad_norm": 0.21642006933689117,
"learning_rate": 5.238859696241689e-05,
"loss": 0.1673,
"step": 17935
},
{
"epoch": 2.259470352993041,
"grad_norm": 0.22728614509105682,
"learning_rate": 5.2305159006868885e-05,
"loss": 0.1793,
"step": 17940
},
{
"epoch": 2.260100135403218,
"grad_norm": 0.24052174389362335,
"learning_rate": 5.2221773515682035e-05,
"loss": 0.1791,
"step": 17945
},
{
"epoch": 2.2607299178133955,
"grad_norm": 0.21312139928340912,
"learning_rate": 5.213844053363635e-05,
"loss": 0.177,
"step": 17950
},
{
"epoch": 2.2613597002235726,
"grad_norm": 0.22087723016738892,
"learning_rate": 5.205516010548349e-05,
"loss": 0.1764,
"step": 17955
},
{
"epoch": 2.26198948263375,
"grad_norm": 0.24077439308166504,
"learning_rate": 5.1971932275946967e-05,
"loss": 0.1884,
"step": 17960
},
{
"epoch": 2.262619265043927,
"grad_norm": 0.2120356261730194,
"learning_rate": 5.188875708972198e-05,
"loss": 0.173,
"step": 17965
},
{
"epoch": 2.2632490474541047,
"grad_norm": 0.24573729932308197,
"learning_rate": 5.1805634591475555e-05,
"loss": 0.1824,
"step": 17970
},
{
"epoch": 2.263878829864282,
"grad_norm": 0.20354896783828735,
"learning_rate": 5.1722564825846336e-05,
"loss": 0.1738,
"step": 17975
},
{
"epoch": 2.2645086122744593,
"grad_norm": 0.2105248123407364,
"learning_rate": 5.1639547837444725e-05,
"loss": 0.1694,
"step": 17980
},
{
"epoch": 2.2651383946846364,
"grad_norm": 0.21009747684001923,
"learning_rate": 5.1556583670852636e-05,
"loss": 0.1773,
"step": 17985
},
{
"epoch": 2.265768177094814,
"grad_norm": 0.21542850136756897,
"learning_rate": 5.147367237062387e-05,
"loss": 0.1682,
"step": 17990
},
{
"epoch": 2.266397959504991,
"grad_norm": 0.20584627985954285,
"learning_rate": 5.1390813981283676e-05,
"loss": 0.1734,
"step": 17995
},
{
"epoch": 2.267027741915168,
"grad_norm": 0.2486305981874466,
"learning_rate": 5.130800854732877e-05,
"loss": 0.1825,
"step": 18000
},
{
"epoch": 2.267027741915168,
"eval_loss": 0.35427358746528625,
"eval_runtime": 6.1591,
"eval_samples_per_second": 162.361,
"eval_steps_per_second": 10.229,
"step": 18000
},
{
"epoch": 2.2676575243253456,
"grad_norm": 0.19808907806873322,
"learning_rate": 5.122525611322761e-05,
"loss": 0.1625,
"step": 18005
},
{
"epoch": 2.2682873067355227,
"grad_norm": 0.24098962545394897,
"learning_rate": 5.114255672342022e-05,
"loss": 0.1687,
"step": 18010
},
{
"epoch": 2.2689170891457002,
"grad_norm": 0.22834831476211548,
"learning_rate": 5.105991042231799e-05,
"loss": 0.1695,
"step": 18015
},
{
"epoch": 2.2695468715558773,
"grad_norm": 0.19950784742832184,
"learning_rate": 5.097731725430392e-05,
"loss": 0.1692,
"step": 18020
},
{
"epoch": 2.270176653966055,
"grad_norm": 0.23613286018371582,
"learning_rate": 5.0894777263732405e-05,
"loss": 0.176,
"step": 18025
},
{
"epoch": 2.270806436376232,
"grad_norm": 0.2248247116804123,
"learning_rate": 5.081229049492929e-05,
"loss": 0.1638,
"step": 18030
},
{
"epoch": 2.2714362187864094,
"grad_norm": 0.21063442528247833,
"learning_rate": 5.072985699219186e-05,
"loss": 0.1696,
"step": 18035
},
{
"epoch": 2.2720660011965865,
"grad_norm": 0.26251456141471863,
"learning_rate": 5.064747679978881e-05,
"loss": 0.1784,
"step": 18040
},
{
"epoch": 2.272695783606764,
"grad_norm": 0.20396436750888824,
"learning_rate": 5.056514996196011e-05,
"loss": 0.1733,
"step": 18045
},
{
"epoch": 2.273325566016941,
"grad_norm": 0.21515126526355743,
"learning_rate": 5.048287652291728e-05,
"loss": 0.1625,
"step": 18050
},
{
"epoch": 2.273955348427118,
"grad_norm": 0.24371370673179626,
"learning_rate": 5.0400656526842946e-05,
"loss": 0.1739,
"step": 18055
},
{
"epoch": 2.2745851308372957,
"grad_norm": 0.22852087020874023,
"learning_rate": 5.03184900178912e-05,
"loss": 0.171,
"step": 18060
},
{
"epoch": 2.275214913247473,
"grad_norm": 0.22659562528133392,
"learning_rate": 5.023637704018719e-05,
"loss": 0.1769,
"step": 18065
},
{
"epoch": 2.2758446956576504,
"grad_norm": 0.2462269514799118,
"learning_rate": 5.01543176378276e-05,
"loss": 0.1731,
"step": 18070
},
{
"epoch": 2.2764744780678274,
"grad_norm": 0.21395175158977509,
"learning_rate": 5.007231185488016e-05,
"loss": 0.1705,
"step": 18075
},
{
"epoch": 2.277104260478005,
"grad_norm": 0.2166956514120102,
"learning_rate": 4.9990359735383837e-05,
"loss": 0.1671,
"step": 18080
},
{
"epoch": 2.277734042888182,
"grad_norm": 0.23139755427837372,
"learning_rate": 4.9908461323348754e-05,
"loss": 0.1785,
"step": 18085
},
{
"epoch": 2.2783638252983596,
"grad_norm": 0.23193643987178802,
"learning_rate": 4.982661666275632e-05,
"loss": 0.1746,
"step": 18090
},
{
"epoch": 2.2789936077085367,
"grad_norm": 0.21008536219596863,
"learning_rate": 4.974482579755899e-05,
"loss": 0.1784,
"step": 18095
},
{
"epoch": 2.279623390118714,
"grad_norm": 0.23688139021396637,
"learning_rate": 4.9663088771680235e-05,
"loss": 0.1812,
"step": 18100
},
{
"epoch": 2.2802531725288913,
"grad_norm": 0.20811019837856293,
"learning_rate": 4.958140562901468e-05,
"loss": 0.1721,
"step": 18105
},
{
"epoch": 2.2808829549390683,
"grad_norm": 0.2096734642982483,
"learning_rate": 4.9499776413428167e-05,
"loss": 0.1697,
"step": 18110
},
{
"epoch": 2.281512737349246,
"grad_norm": 0.22839121520519257,
"learning_rate": 4.9418201168757386e-05,
"loss": 0.1729,
"step": 18115
},
{
"epoch": 2.282142519759423,
"grad_norm": 0.21908484399318695,
"learning_rate": 4.9336679938810106e-05,
"loss": 0.1659,
"step": 18120
},
{
"epoch": 2.2827723021696005,
"grad_norm": 0.20620904862880707,
"learning_rate": 4.925521276736511e-05,
"loss": 0.1636,
"step": 18125
},
{
"epoch": 2.2834020845797776,
"grad_norm": 0.28344854712486267,
"learning_rate": 4.9173799698172095e-05,
"loss": 0.1753,
"step": 18130
},
{
"epoch": 2.284031866989955,
"grad_norm": 0.2172774374485016,
"learning_rate": 4.909244077495175e-05,
"loss": 0.1702,
"step": 18135
},
{
"epoch": 2.284661649400132,
"grad_norm": 0.19668060541152954,
"learning_rate": 4.90111360413957e-05,
"loss": 0.1715,
"step": 18140
},
{
"epoch": 2.2852914318103097,
"grad_norm": 0.19766007363796234,
"learning_rate": 4.892988554116642e-05,
"loss": 0.1608,
"step": 18145
},
{
"epoch": 2.2859212142204868,
"grad_norm": 0.2108301967382431,
"learning_rate": 4.884868931789724e-05,
"loss": 0.1633,
"step": 18150
},
{
"epoch": 2.2865509966306643,
"grad_norm": 0.25781720876693726,
"learning_rate": 4.8767547415192476e-05,
"loss": 0.1634,
"step": 18155
},
{
"epoch": 2.2871807790408414,
"grad_norm": 0.21515868604183197,
"learning_rate": 4.8686459876627164e-05,
"loss": 0.1687,
"step": 18160
},
{
"epoch": 2.2878105614510185,
"grad_norm": 0.23936854302883148,
"learning_rate": 4.860542674574713e-05,
"loss": 0.1786,
"step": 18165
},
{
"epoch": 2.288440343861196,
"grad_norm": 0.2083710879087448,
"learning_rate": 4.852444806606904e-05,
"loss": 0.1727,
"step": 18170
},
{
"epoch": 2.289070126271373,
"grad_norm": 0.24087072908878326,
"learning_rate": 4.844352388108028e-05,
"loss": 0.1646,
"step": 18175
},
{
"epoch": 2.2896999086815506,
"grad_norm": 0.22956833243370056,
"learning_rate": 4.836265423423898e-05,
"loss": 0.1667,
"step": 18180
},
{
"epoch": 2.2903296910917277,
"grad_norm": 0.2500525414943695,
"learning_rate": 4.828183916897402e-05,
"loss": 0.1788,
"step": 18185
},
{
"epoch": 2.290959473501905,
"grad_norm": 0.23779354989528656,
"learning_rate": 4.820107872868486e-05,
"loss": 0.1687,
"step": 18190
},
{
"epoch": 2.2915892559120823,
"grad_norm": 0.21519017219543457,
"learning_rate": 4.81203729567418e-05,
"loss": 0.173,
"step": 18195
},
{
"epoch": 2.29221903832226,
"grad_norm": 0.2123459428548813,
"learning_rate": 4.803972189648568e-05,
"loss": 0.1648,
"step": 18200
},
{
"epoch": 2.292848820732437,
"grad_norm": 0.2364078015089035,
"learning_rate": 4.795912559122789e-05,
"loss": 0.1743,
"step": 18205
},
{
"epoch": 2.2934786031426144,
"grad_norm": 0.23717305064201355,
"learning_rate": 4.787858408425045e-05,
"loss": 0.1827,
"step": 18210
},
{
"epoch": 2.2941083855527915,
"grad_norm": 0.197091206908226,
"learning_rate": 4.7798097418806134e-05,
"loss": 0.1713,
"step": 18215
},
{
"epoch": 2.2947381679629686,
"grad_norm": 0.19760344922542572,
"learning_rate": 4.771766563811803e-05,
"loss": 0.1612,
"step": 18220
},
{
"epoch": 2.295367950373146,
"grad_norm": 0.22046242654323578,
"learning_rate": 4.763728878537984e-05,
"loss": 0.1691,
"step": 18225
},
{
"epoch": 2.295997732783323,
"grad_norm": 0.22356641292572021,
"learning_rate": 4.755696690375574e-05,
"loss": 0.1684,
"step": 18230
},
{
"epoch": 2.2966275151935007,
"grad_norm": 0.20664890110492706,
"learning_rate": 4.7476700036380565e-05,
"loss": 0.1656,
"step": 18235
},
{
"epoch": 2.297257297603678,
"grad_norm": 0.2873956859111786,
"learning_rate": 4.73964882263593e-05,
"loss": 0.1811,
"step": 18240
},
{
"epoch": 2.2978870800138553,
"grad_norm": 0.23324726521968842,
"learning_rate": 4.7316331516767575e-05,
"loss": 0.17,
"step": 18245
},
{
"epoch": 2.2985168624240324,
"grad_norm": 0.22407886385917664,
"learning_rate": 4.7236229950651314e-05,
"loss": 0.1589,
"step": 18250
},
{
"epoch": 2.29914664483421,
"grad_norm": 0.2202986776828766,
"learning_rate": 4.7156183571026985e-05,
"loss": 0.1806,
"step": 18255
},
{
"epoch": 2.299776427244387,
"grad_norm": 0.1998445987701416,
"learning_rate": 4.707619242088129e-05,
"loss": 0.1571,
"step": 18260
},
{
"epoch": 2.3004062096545645,
"grad_norm": 0.24477636814117432,
"learning_rate": 4.69962565431713e-05,
"loss": 0.1788,
"step": 18265
},
{
"epoch": 2.3010359920647416,
"grad_norm": 0.2186649590730667,
"learning_rate": 4.691637598082439e-05,
"loss": 0.1837,
"step": 18270
},
{
"epoch": 2.3016657744749187,
"grad_norm": 0.19296254217624664,
"learning_rate": 4.683655077673826e-05,
"loss": 0.1609,
"step": 18275
},
{
"epoch": 2.3022955568850962,
"grad_norm": 0.234447181224823,
"learning_rate": 4.675678097378086e-05,
"loss": 0.1711,
"step": 18280
},
{
"epoch": 2.3029253392952733,
"grad_norm": 0.19974513351917267,
"learning_rate": 4.667706661479041e-05,
"loss": 0.1666,
"step": 18285
},
{
"epoch": 2.303555121705451,
"grad_norm": 0.23064357042312622,
"learning_rate": 4.659740774257527e-05,
"loss": 0.1684,
"step": 18290
},
{
"epoch": 2.304184904115628,
"grad_norm": 0.19428302347660065,
"learning_rate": 4.6517804399914214e-05,
"loss": 0.166,
"step": 18295
},
{
"epoch": 2.3048146865258055,
"grad_norm": 0.23040397465229034,
"learning_rate": 4.6438256629555956e-05,
"loss": 0.1687,
"step": 18300
},
{
"epoch": 2.3054444689359825,
"grad_norm": 0.22161847352981567,
"learning_rate": 4.635876447421955e-05,
"loss": 0.1784,
"step": 18305
},
{
"epoch": 2.30607425134616,
"grad_norm": 0.22831936180591583,
"learning_rate": 4.6279327976593924e-05,
"loss": 0.1731,
"step": 18310
},
{
"epoch": 2.306704033756337,
"grad_norm": 0.25957801938056946,
"learning_rate": 4.619994717933848e-05,
"loss": 0.1823,
"step": 18315
},
{
"epoch": 2.3073338161665147,
"grad_norm": 0.23449194431304932,
"learning_rate": 4.6120622125082426e-05,
"loss": 0.1725,
"step": 18320
},
{
"epoch": 2.3079635985766918,
"grad_norm": 0.24584275484085083,
"learning_rate": 4.604135285642514e-05,
"loss": 0.1857,
"step": 18325
},
{
"epoch": 2.308593380986869,
"grad_norm": 0.21245352923870087,
"learning_rate": 4.5962139415936056e-05,
"loss": 0.164,
"step": 18330
},
{
"epoch": 2.3092231633970464,
"grad_norm": 0.2068212777376175,
"learning_rate": 4.588298184615453e-05,
"loss": 0.1661,
"step": 18335
},
{
"epoch": 2.3098529458072234,
"grad_norm": 0.21349553763866425,
"learning_rate": 4.580388018959013e-05,
"loss": 0.1707,
"step": 18340
},
{
"epoch": 2.310482728217401,
"grad_norm": 0.2073366641998291,
"learning_rate": 4.5724834488722106e-05,
"loss": 0.1608,
"step": 18345
},
{
"epoch": 2.311112510627578,
"grad_norm": 0.2493850737810135,
"learning_rate": 4.564584478599982e-05,
"loss": 0.176,
"step": 18350
},
{
"epoch": 2.3117422930377556,
"grad_norm": 0.25253990292549133,
"learning_rate": 4.556691112384262e-05,
"loss": 0.1744,
"step": 18355
},
{
"epoch": 2.3123720754479327,
"grad_norm": 0.24499280750751495,
"learning_rate": 4.548803354463967e-05,
"loss": 0.1755,
"step": 18360
},
{
"epoch": 2.31300185785811,
"grad_norm": 0.21188803017139435,
"learning_rate": 4.540921209075e-05,
"loss": 0.1675,
"step": 18365
},
{
"epoch": 2.3136316402682873,
"grad_norm": 0.2255249321460724,
"learning_rate": 4.5330446804502543e-05,
"loss": 0.1668,
"step": 18370
},
{
"epoch": 2.314261422678465,
"grad_norm": 0.2088666409254074,
"learning_rate": 4.525173772819606e-05,
"loss": 0.173,
"step": 18375
},
{
"epoch": 2.314891205088642,
"grad_norm": 0.24474313855171204,
"learning_rate": 4.517308490409912e-05,
"loss": 0.1672,
"step": 18380
},
{
"epoch": 2.315520987498819,
"grad_norm": 0.2033611238002777,
"learning_rate": 4.5094488374450085e-05,
"loss": 0.1677,
"step": 18385
},
{
"epoch": 2.3161507699089965,
"grad_norm": 0.22693341970443726,
"learning_rate": 4.50159481814571e-05,
"loss": 0.1653,
"step": 18390
},
{
"epoch": 2.3167805523191736,
"grad_norm": 0.24162709712982178,
"learning_rate": 4.493746436729797e-05,
"loss": 0.1668,
"step": 18395
},
{
"epoch": 2.317410334729351,
"grad_norm": 0.21281133592128754,
"learning_rate": 4.485903697412041e-05,
"loss": 0.167,
"step": 18400
},
{
"epoch": 2.318040117139528,
"grad_norm": 0.2348182648420334,
"learning_rate": 4.478066604404168e-05,
"loss": 0.1683,
"step": 18405
},
{
"epoch": 2.3186698995497057,
"grad_norm": 0.2391456663608551,
"learning_rate": 4.470235161914878e-05,
"loss": 0.1708,
"step": 18410
},
{
"epoch": 2.319299681959883,
"grad_norm": 0.2014867216348648,
"learning_rate": 4.462409374149822e-05,
"loss": 0.1679,
"step": 18415
},
{
"epoch": 2.3199294643700603,
"grad_norm": 0.19464534521102905,
"learning_rate": 4.4545892453116414e-05,
"loss": 0.167,
"step": 18420
},
{
"epoch": 2.3205592467802374,
"grad_norm": 0.18525034189224243,
"learning_rate": 4.446774779599918e-05,
"loss": 0.16,
"step": 18425
},
{
"epoch": 2.321189029190415,
"grad_norm": 0.220379039645195,
"learning_rate": 4.438965981211201e-05,
"loss": 0.1728,
"step": 18430
},
{
"epoch": 2.321818811600592,
"grad_norm": 0.22186563909053802,
"learning_rate": 4.431162854338985e-05,
"loss": 0.1651,
"step": 18435
},
{
"epoch": 2.322448594010769,
"grad_norm": 0.22272159159183502,
"learning_rate": 4.423365403173739e-05,
"loss": 0.171,
"step": 18440
},
{
"epoch": 2.3230783764209466,
"grad_norm": 0.220636785030365,
"learning_rate": 4.4155736319028725e-05,
"loss": 0.1691,
"step": 18445
},
{
"epoch": 2.3237081588311237,
"grad_norm": 0.22500810027122498,
"learning_rate": 4.4077875447107356e-05,
"loss": 0.1648,
"step": 18450
},
{
"epoch": 2.324337941241301,
"grad_norm": 0.2163766771554947,
"learning_rate": 4.4000071457786335e-05,
"loss": 0.1655,
"step": 18455
},
{
"epoch": 2.3249677236514783,
"grad_norm": 0.2258923053741455,
"learning_rate": 4.392232439284829e-05,
"loss": 0.1704,
"step": 18460
},
{
"epoch": 2.325597506061656,
"grad_norm": 0.23461341857910156,
"learning_rate": 4.384463429404511e-05,
"loss": 0.1686,
"step": 18465
},
{
"epoch": 2.326227288471833,
"grad_norm": 0.22406549751758575,
"learning_rate": 4.376700120309816e-05,
"loss": 0.1655,
"step": 18470
},
{
"epoch": 2.3268570708820104,
"grad_norm": 0.21646642684936523,
"learning_rate": 4.368942516169819e-05,
"loss": 0.1682,
"step": 18475
},
{
"epoch": 2.3274868532921875,
"grad_norm": 0.23925819993019104,
"learning_rate": 4.3611906211505284e-05,
"loss": 0.1746,
"step": 18480
},
{
"epoch": 2.328116635702365,
"grad_norm": 0.19920630753040314,
"learning_rate": 4.35344443941489e-05,
"loss": 0.158,
"step": 18485
},
{
"epoch": 2.328746418112542,
"grad_norm": 0.2575379014015198,
"learning_rate": 4.345703975122783e-05,
"loss": 0.1708,
"step": 18490
},
{
"epoch": 2.329376200522719,
"grad_norm": 0.19556741416454315,
"learning_rate": 4.3379692324310056e-05,
"loss": 0.1677,
"step": 18495
},
{
"epoch": 2.3300059829328967,
"grad_norm": 0.2595387101173401,
"learning_rate": 4.3302402154933005e-05,
"loss": 0.1705,
"step": 18500
},
{
"epoch": 2.330635765343074,
"grad_norm": 0.21318422257900238,
"learning_rate": 4.322516928460325e-05,
"loss": 0.1676,
"step": 18505
},
{
"epoch": 2.3312655477532513,
"grad_norm": 0.2212359607219696,
"learning_rate": 4.3147993754796624e-05,
"loss": 0.1661,
"step": 18510
},
{
"epoch": 2.3318953301634284,
"grad_norm": 0.1886136680841446,
"learning_rate": 4.3070875606958006e-05,
"loss": 0.1613,
"step": 18515
},
{
"epoch": 2.332525112573606,
"grad_norm": 0.23505628108978271,
"learning_rate": 4.2993814882501754e-05,
"loss": 0.1687,
"step": 18520
},
{
"epoch": 2.333154894983783,
"grad_norm": 0.18686296045780182,
"learning_rate": 4.2916811622811195e-05,
"loss": 0.1613,
"step": 18525
},
{
"epoch": 2.3337846773939606,
"grad_norm": 0.21165959537029266,
"learning_rate": 4.2839865869238845e-05,
"loss": 0.1604,
"step": 18530
},
{
"epoch": 2.3344144598041376,
"grad_norm": 0.29806169867515564,
"learning_rate": 4.27629776631063e-05,
"loss": 0.1682,
"step": 18535
},
{
"epoch": 2.335044242214315,
"grad_norm": 0.2488899528980255,
"learning_rate": 4.268614704570426e-05,
"loss": 0.1758,
"step": 18540
},
{
"epoch": 2.3356740246244923,
"grad_norm": 0.21834008395671844,
"learning_rate": 4.2609374058292666e-05,
"loss": 0.1587,
"step": 18545
},
{
"epoch": 2.3363038070346693,
"grad_norm": 0.22900566458702087,
"learning_rate": 4.253265874210022e-05,
"loss": 0.1798,
"step": 18550
},
{
"epoch": 2.336933589444847,
"grad_norm": 0.22346030175685883,
"learning_rate": 4.2456001138324794e-05,
"loss": 0.1656,
"step": 18555
},
{
"epoch": 2.337563371855024,
"grad_norm": 0.22244654595851898,
"learning_rate": 4.237940128813336e-05,
"loss": 0.1734,
"step": 18560
},
{
"epoch": 2.3381931542652015,
"grad_norm": 0.19254350662231445,
"learning_rate": 4.230285923266175e-05,
"loss": 0.1619,
"step": 18565
},
{
"epoch": 2.3388229366753785,
"grad_norm": 0.22871673107147217,
"learning_rate": 4.222637501301481e-05,
"loss": 0.166,
"step": 18570
},
{
"epoch": 2.339452719085556,
"grad_norm": 0.20270411670207977,
"learning_rate": 4.2149948670266284e-05,
"loss": 0.1637,
"step": 18575
},
{
"epoch": 2.340082501495733,
"grad_norm": 0.23636558651924133,
"learning_rate": 4.2073580245458874e-05,
"loss": 0.1839,
"step": 18580
},
{
"epoch": 2.3407122839059107,
"grad_norm": 0.24934862554073334,
"learning_rate": 4.1997269779604185e-05,
"loss": 0.1661,
"step": 18585
},
{
"epoch": 2.3413420663160878,
"grad_norm": 0.2234071046113968,
"learning_rate": 4.192101731368267e-05,
"loss": 0.1699,
"step": 18590
},
{
"epoch": 2.3419718487262653,
"grad_norm": 0.20725548267364502,
"learning_rate": 4.1844822888643634e-05,
"loss": 0.1663,
"step": 18595
},
{
"epoch": 2.3426016311364424,
"grad_norm": 0.22668230533599854,
"learning_rate": 4.1768686545405186e-05,
"loss": 0.1647,
"step": 18600
},
{
"epoch": 2.3432314135466195,
"grad_norm": 0.23123641312122345,
"learning_rate": 4.1692608324854384e-05,
"loss": 0.171,
"step": 18605
},
{
"epoch": 2.343861195956797,
"grad_norm": 0.21715596318244934,
"learning_rate": 4.161658826784692e-05,
"loss": 0.1631,
"step": 18610
},
{
"epoch": 2.344490978366974,
"grad_norm": 0.24206319451332092,
"learning_rate": 4.154062641520732e-05,
"loss": 0.1724,
"step": 18615
},
{
"epoch": 2.3451207607771516,
"grad_norm": 0.21535861492156982,
"learning_rate": 4.1464722807728724e-05,
"loss": 0.1673,
"step": 18620
},
{
"epoch": 2.3457505431873287,
"grad_norm": 0.24345341324806213,
"learning_rate": 4.1388877486173245e-05,
"loss": 0.1648,
"step": 18625
},
{
"epoch": 2.346380325597506,
"grad_norm": 0.2361554056406021,
"learning_rate": 4.131309049127149e-05,
"loss": 0.1624,
"step": 18630
},
{
"epoch": 2.3470101080076833,
"grad_norm": 0.20666177570819855,
"learning_rate": 4.1237361863722816e-05,
"loss": 0.1662,
"step": 18635
},
{
"epoch": 2.347639890417861,
"grad_norm": 0.22876566648483276,
"learning_rate": 4.1161691644195165e-05,
"loss": 0.1767,
"step": 18640
},
{
"epoch": 2.348269672828038,
"grad_norm": 0.19370432198047638,
"learning_rate": 4.108607987332529e-05,
"loss": 0.1604,
"step": 18645
},
{
"epoch": 2.3488994552382154,
"grad_norm": 0.22485142946243286,
"learning_rate": 4.101052659171842e-05,
"loss": 0.1667,
"step": 18650
},
{
"epoch": 2.3495292376483925,
"grad_norm": 0.2446049600839615,
"learning_rate": 4.0935031839948315e-05,
"loss": 0.1719,
"step": 18655
},
{
"epoch": 2.3501590200585696,
"grad_norm": 0.22652800381183624,
"learning_rate": 4.0859595658557367e-05,
"loss": 0.1666,
"step": 18660
},
{
"epoch": 2.350788802468747,
"grad_norm": 0.1760840266942978,
"learning_rate": 4.078421808805663e-05,
"loss": 0.1516,
"step": 18665
},
{
"epoch": 2.351418584878924,
"grad_norm": 0.20791617035865784,
"learning_rate": 4.070889916892553e-05,
"loss": 0.164,
"step": 18670
},
{
"epoch": 2.3520483672891017,
"grad_norm": 0.2205626517534256,
"learning_rate": 4.063363894161206e-05,
"loss": 0.1669,
"step": 18675
},
{
"epoch": 2.352678149699279,
"grad_norm": 0.23379269242286682,
"learning_rate": 4.055843744653266e-05,
"loss": 0.1593,
"step": 18680
},
{
"epoch": 2.3533079321094563,
"grad_norm": 0.23451068997383118,
"learning_rate": 4.0483294724072254e-05,
"loss": 0.1633,
"step": 18685
},
{
"epoch": 2.3539377145196334,
"grad_norm": 0.28889602422714233,
"learning_rate": 4.040821081458422e-05,
"loss": 0.1752,
"step": 18690
},
{
"epoch": 2.354567496929811,
"grad_norm": 0.2054235339164734,
"learning_rate": 4.0333185758390307e-05,
"loss": 0.1666,
"step": 18695
},
{
"epoch": 2.355197279339988,
"grad_norm": 0.19770711660385132,
"learning_rate": 4.025821959578067e-05,
"loss": 0.1701,
"step": 18700
},
{
"epoch": 2.3558270617501655,
"grad_norm": 0.25233033299446106,
"learning_rate": 4.0183312367013906e-05,
"loss": 0.1722,
"step": 18705
},
{
"epoch": 2.3564568441603426,
"grad_norm": 0.20867769420146942,
"learning_rate": 4.010846411231689e-05,
"loss": 0.1601,
"step": 18710
},
{
"epoch": 2.3570866265705197,
"grad_norm": 0.21671661734580994,
"learning_rate": 4.003367487188483e-05,
"loss": 0.1658,
"step": 18715
},
{
"epoch": 2.3577164089806972,
"grad_norm": 0.17957130074501038,
"learning_rate": 3.9958944685881265e-05,
"loss": 0.1619,
"step": 18720
},
{
"epoch": 2.3583461913908743,
"grad_norm": 0.21048414707183838,
"learning_rate": 3.988427359443802e-05,
"loss": 0.1668,
"step": 18725
},
{
"epoch": 2.358975973801052,
"grad_norm": 0.21969716250896454,
"learning_rate": 3.980966163765513e-05,
"loss": 0.1619,
"step": 18730
},
{
"epoch": 2.359605756211229,
"grad_norm": 0.22368858754634857,
"learning_rate": 3.9735108855600984e-05,
"loss": 0.168,
"step": 18735
},
{
"epoch": 2.3602355386214064,
"grad_norm": 0.2626504600048065,
"learning_rate": 3.966061528831209e-05,
"loss": 0.1651,
"step": 18740
},
{
"epoch": 2.3608653210315835,
"grad_norm": 0.21985310316085815,
"learning_rate": 3.958618097579316e-05,
"loss": 0.1671,
"step": 18745
},
{
"epoch": 2.361495103441761,
"grad_norm": 0.22451792657375336,
"learning_rate": 3.9511805958017205e-05,
"loss": 0.1609,
"step": 18750
},
{
"epoch": 2.362124885851938,
"grad_norm": 0.2123977243900299,
"learning_rate": 3.943749027492532e-05,
"loss": 0.1719,
"step": 18755
},
{
"epoch": 2.3627546682621157,
"grad_norm": 0.2234313040971756,
"learning_rate": 3.936323396642658e-05,
"loss": 0.1556,
"step": 18760
},
{
"epoch": 2.3633844506722927,
"grad_norm": 0.19645099341869354,
"learning_rate": 3.928903707239846e-05,
"loss": 0.1673,
"step": 18765
},
{
"epoch": 2.36401423308247,
"grad_norm": 0.22249870002269745,
"learning_rate": 3.9214899632686334e-05,
"loss": 0.1589,
"step": 18770
},
{
"epoch": 2.3646440154926474,
"grad_norm": 0.2180803418159485,
"learning_rate": 3.914082168710369e-05,
"loss": 0.1685,
"step": 18775
},
{
"epoch": 2.3652737979028244,
"grad_norm": 0.2156234085559845,
"learning_rate": 3.906680327543212e-05,
"loss": 0.1613,
"step": 18780
},
{
"epoch": 2.365903580313002,
"grad_norm": 0.2180781066417694,
"learning_rate": 3.899284443742112e-05,
"loss": 0.1654,
"step": 18785
},
{
"epoch": 2.366533362723179,
"grad_norm": 0.2102290391921997,
"learning_rate": 3.89189452127884e-05,
"loss": 0.1635,
"step": 18790
},
{
"epoch": 2.3671631451333566,
"grad_norm": 0.26211512088775635,
"learning_rate": 3.884510564121944e-05,
"loss": 0.174,
"step": 18795
},
{
"epoch": 2.3677929275435337,
"grad_norm": 0.1999218463897705,
"learning_rate": 3.877132576236778e-05,
"loss": 0.1619,
"step": 18800
},
{
"epoch": 2.368422709953711,
"grad_norm": 0.21774223446846008,
"learning_rate": 3.8697605615854875e-05,
"loss": 0.1616,
"step": 18805
},
{
"epoch": 2.3690524923638883,
"grad_norm": 0.2304651439189911,
"learning_rate": 3.862394524127023e-05,
"loss": 0.1705,
"step": 18810
},
{
"epoch": 2.369682274774066,
"grad_norm": 0.24826854467391968,
"learning_rate": 3.8550344678171084e-05,
"loss": 0.1734,
"step": 18815
},
{
"epoch": 2.370312057184243,
"grad_norm": 0.21676623821258545,
"learning_rate": 3.847680396608262e-05,
"loss": 0.1669,
"step": 18820
},
{
"epoch": 2.37094183959442,
"grad_norm": 0.21203717589378357,
"learning_rate": 3.840332314449788e-05,
"loss": 0.1633,
"step": 18825
},
{
"epoch": 2.3715716220045975,
"grad_norm": 0.2409755140542984,
"learning_rate": 3.832990225287776e-05,
"loss": 0.1687,
"step": 18830
},
{
"epoch": 2.3722014044147746,
"grad_norm": 0.19998323917388916,
"learning_rate": 3.825654133065094e-05,
"loss": 0.1578,
"step": 18835
},
{
"epoch": 2.372831186824952,
"grad_norm": 0.22083517909049988,
"learning_rate": 3.818324041721391e-05,
"loss": 0.1721,
"step": 18840
},
{
"epoch": 2.373460969235129,
"grad_norm": 0.19865678250789642,
"learning_rate": 3.8109999551930914e-05,
"loss": 0.1613,
"step": 18845
},
{
"epoch": 2.3740907516453067,
"grad_norm": 0.2167719304561615,
"learning_rate": 3.8036818774134037e-05,
"loss": 0.1569,
"step": 18850
},
{
"epoch": 2.374720534055484,
"grad_norm": 0.2173914611339569,
"learning_rate": 3.796369812312298e-05,
"loss": 0.1676,
"step": 18855
},
{
"epoch": 2.3753503164656613,
"grad_norm": 0.22559495270252228,
"learning_rate": 3.7890637638165255e-05,
"loss": 0.169,
"step": 18860
},
{
"epoch": 2.3759800988758384,
"grad_norm": 0.2124035507440567,
"learning_rate": 3.781763735849589e-05,
"loss": 0.1715,
"step": 18865
},
{
"epoch": 2.376609881286016,
"grad_norm": 0.23133991658687592,
"learning_rate": 3.774469732331782e-05,
"loss": 0.162,
"step": 18870
},
{
"epoch": 2.377239663696193,
"grad_norm": 0.1754513680934906,
"learning_rate": 3.7671817571801464e-05,
"loss": 0.1602,
"step": 18875
},
{
"epoch": 2.37786944610637,
"grad_norm": 0.2158019244670868,
"learning_rate": 3.7598998143084924e-05,
"loss": 0.1571,
"step": 18880
},
{
"epoch": 2.3784992285165476,
"grad_norm": 0.20694270730018616,
"learning_rate": 3.752623907627388e-05,
"loss": 0.162,
"step": 18885
},
{
"epoch": 2.3791290109267247,
"grad_norm": 0.250929057598114,
"learning_rate": 3.7453540410441604e-05,
"loss": 0.1744,
"step": 18890
},
{
"epoch": 2.379758793336902,
"grad_norm": 0.23653000593185425,
"learning_rate": 3.738090218462903e-05,
"loss": 0.1789,
"step": 18895
},
{
"epoch": 2.3803885757470793,
"grad_norm": 0.1936427801847458,
"learning_rate": 3.730832443784443e-05,
"loss": 0.1532,
"step": 18900
},
{
"epoch": 2.381018358157257,
"grad_norm": 0.188064306974411,
"learning_rate": 3.7235807209063716e-05,
"loss": 0.1629,
"step": 18905
},
{
"epoch": 2.381648140567434,
"grad_norm": 0.2069697082042694,
"learning_rate": 3.71633505372304e-05,
"loss": 0.1568,
"step": 18910
},
{
"epoch": 2.3822779229776114,
"grad_norm": 0.21809029579162598,
"learning_rate": 3.709095446125529e-05,
"loss": 0.1717,
"step": 18915
},
{
"epoch": 2.3829077053877885,
"grad_norm": 0.23560817539691925,
"learning_rate": 3.701861902001675e-05,
"loss": 0.1662,
"step": 18920
},
{
"epoch": 2.383537487797966,
"grad_norm": 0.19693933427333832,
"learning_rate": 3.694634425236057e-05,
"loss": 0.1558,
"step": 18925
},
{
"epoch": 2.384167270208143,
"grad_norm": 0.19060872495174408,
"learning_rate": 3.687413019709994e-05,
"loss": 0.1621,
"step": 18930
},
{
"epoch": 2.38479705261832,
"grad_norm": 0.2021481990814209,
"learning_rate": 3.680197689301548e-05,
"loss": 0.1551,
"step": 18935
},
{
"epoch": 2.3854268350284977,
"grad_norm": 0.22511224448680878,
"learning_rate": 3.672988437885512e-05,
"loss": 0.1587,
"step": 18940
},
{
"epoch": 2.386056617438675,
"grad_norm": 0.2018289864063263,
"learning_rate": 3.665785269333423e-05,
"loss": 0.1654,
"step": 18945
},
{
"epoch": 2.3866863998488523,
"grad_norm": 0.21350149810314178,
"learning_rate": 3.65858818751354e-05,
"loss": 0.1673,
"step": 18950
},
{
"epoch": 2.3873161822590294,
"grad_norm": 0.21213771402835846,
"learning_rate": 3.65139719629087e-05,
"loss": 0.1791,
"step": 18955
},
{
"epoch": 2.387945964669207,
"grad_norm": 0.24175149202346802,
"learning_rate": 3.644212299527139e-05,
"loss": 0.1714,
"step": 18960
},
{
"epoch": 2.388575747079384,
"grad_norm": 0.2541513741016388,
"learning_rate": 3.63703350108079e-05,
"loss": 0.1685,
"step": 18965
},
{
"epoch": 2.389205529489561,
"grad_norm": 0.24447733163833618,
"learning_rate": 3.629860804807011e-05,
"loss": 0.1728,
"step": 18970
},
{
"epoch": 2.3898353118997386,
"grad_norm": 0.1830032914876938,
"learning_rate": 3.622694214557702e-05,
"loss": 0.1698,
"step": 18975
},
{
"epoch": 2.390465094309916,
"grad_norm": 0.23851166665554047,
"learning_rate": 3.6155337341814844e-05,
"loss": 0.1754,
"step": 18980
},
{
"epoch": 2.3910948767200932,
"grad_norm": 0.1973876655101776,
"learning_rate": 3.608379367523702e-05,
"loss": 0.1703,
"step": 18985
},
{
"epoch": 2.3917246591302703,
"grad_norm": 0.2209198772907257,
"learning_rate": 3.6012311184264046e-05,
"loss": 0.1674,
"step": 18990
},
{
"epoch": 2.392354441540448,
"grad_norm": 0.216825932264328,
"learning_rate": 3.5940889907283834e-05,
"loss": 0.1677,
"step": 18995
},
{
"epoch": 2.392984223950625,
"grad_norm": 0.1855764538049698,
"learning_rate": 3.586952988265106e-05,
"loss": 0.1592,
"step": 19000
},
{
"epoch": 2.392984223950625,
"eval_loss": 0.35235053300857544,
"eval_runtime": 6.1677,
"eval_samples_per_second": 162.135,
"eval_steps_per_second": 10.215,
"step": 19000
},
{
"epoch": 2.3936140063608025,
"grad_norm": 0.25512510538101196,
"learning_rate": 3.579823114868778e-05,
"loss": 0.1649,
"step": 19005
},
{
"epoch": 2.3942437887709795,
"grad_norm": 0.20220012962818146,
"learning_rate": 3.572699374368296e-05,
"loss": 0.1638,
"step": 19010
},
{
"epoch": 2.394873571181157,
"grad_norm": 0.21141274273395538,
"learning_rate": 3.5655817705892814e-05,
"loss": 0.1697,
"step": 19015
},
{
"epoch": 2.395503353591334,
"grad_norm": 0.23368066549301147,
"learning_rate": 3.558470307354046e-05,
"loss": 0.1653,
"step": 19020
},
{
"epoch": 2.3961331360015112,
"grad_norm": 0.20436784625053406,
"learning_rate": 3.5513649884816064e-05,
"loss": 0.1561,
"step": 19025
},
{
"epoch": 2.3967629184116888,
"grad_norm": 0.223700612783432,
"learning_rate": 3.5442658177876835e-05,
"loss": 0.1693,
"step": 19030
},
{
"epoch": 2.3973927008218663,
"grad_norm": 0.26057204604148865,
"learning_rate": 3.5371727990846944e-05,
"loss": 0.1767,
"step": 19035
},
{
"epoch": 2.3980224832320434,
"grad_norm": 0.21637168526649475,
"learning_rate": 3.53008593618175e-05,
"loss": 0.1671,
"step": 19040
},
{
"epoch": 2.3986522656422204,
"grad_norm": 0.23995353281497955,
"learning_rate": 3.5230052328846585e-05,
"loss": 0.1788,
"step": 19045
},
{
"epoch": 2.399282048052398,
"grad_norm": 0.2069759964942932,
"learning_rate": 3.5159306929959144e-05,
"loss": 0.1655,
"step": 19050
},
{
"epoch": 2.399911830462575,
"grad_norm": 0.20498618483543396,
"learning_rate": 3.508862320314717e-05,
"loss": 0.1589,
"step": 19055
},
{
"epoch": 2.4005416128727526,
"grad_norm": 0.20835870504379272,
"learning_rate": 3.501800118636939e-05,
"loss": 0.1556,
"step": 19060
},
{
"epoch": 2.4011713952829297,
"grad_norm": 0.22261159121990204,
"learning_rate": 3.4947440917551475e-05,
"loss": 0.1645,
"step": 19065
},
{
"epoch": 2.401801177693107,
"grad_norm": 0.20514576137065887,
"learning_rate": 3.487694243458578e-05,
"loss": 0.1558,
"step": 19070
},
{
"epoch": 2.4024309601032843,
"grad_norm": 0.18798956274986267,
"learning_rate": 3.480650577533175e-05,
"loss": 0.1635,
"step": 19075
},
{
"epoch": 2.4030607425134614,
"grad_norm": 0.1777620017528534,
"learning_rate": 3.47361309776154e-05,
"loss": 0.1613,
"step": 19080
},
{
"epoch": 2.403690524923639,
"grad_norm": 0.22258161008358002,
"learning_rate": 3.466581807922962e-05,
"loss": 0.1657,
"step": 19085
},
{
"epoch": 2.4043203073338164,
"grad_norm": 0.20584604144096375,
"learning_rate": 3.4595567117934045e-05,
"loss": 0.1609,
"step": 19090
},
{
"epoch": 2.4049500897439935,
"grad_norm": 0.26850444078445435,
"learning_rate": 3.452537813145501e-05,
"loss": 0.165,
"step": 19095
},
{
"epoch": 2.4055798721541706,
"grad_norm": 0.1789688616991043,
"learning_rate": 3.4455251157485706e-05,
"loss": 0.1597,
"step": 19100
},
{
"epoch": 2.406209654564348,
"grad_norm": 0.24336829781532288,
"learning_rate": 3.438518623368581e-05,
"loss": 0.1582,
"step": 19105
},
{
"epoch": 2.406839436974525,
"grad_norm": 0.20159520208835602,
"learning_rate": 3.4315183397681806e-05,
"loss": 0.1572,
"step": 19110
},
{
"epoch": 2.4074692193847027,
"grad_norm": 0.22423477470874786,
"learning_rate": 3.424524268706686e-05,
"loss": 0.1611,
"step": 19115
},
{
"epoch": 2.40809900179488,
"grad_norm": 0.22861574590206146,
"learning_rate": 3.417536413940073e-05,
"loss": 0.1708,
"step": 19120
},
{
"epoch": 2.4087287842050573,
"grad_norm": 0.22517502307891846,
"learning_rate": 3.4105547792209766e-05,
"loss": 0.1498,
"step": 19125
},
{
"epoch": 2.4093585666152344,
"grad_norm": 0.22406402230262756,
"learning_rate": 3.403579368298694e-05,
"loss": 0.1722,
"step": 19130
},
{
"epoch": 2.4099883490254115,
"grad_norm": 0.21624189615249634,
"learning_rate": 3.3966101849191807e-05,
"loss": 0.165,
"step": 19135
},
{
"epoch": 2.410618131435589,
"grad_norm": 0.2186998724937439,
"learning_rate": 3.389647232825048e-05,
"loss": 0.1545,
"step": 19140
},
{
"epoch": 2.4112479138457665,
"grad_norm": 0.20615451037883759,
"learning_rate": 3.38269051575556e-05,
"loss": 0.1653,
"step": 19145
},
{
"epoch": 2.4118776962559436,
"grad_norm": 0.21351304650306702,
"learning_rate": 3.3757400374466323e-05,
"loss": 0.1667,
"step": 19150
},
{
"epoch": 2.4125074786661207,
"grad_norm": 0.2263455092906952,
"learning_rate": 3.368795801630826e-05,
"loss": 0.1635,
"step": 19155
},
{
"epoch": 2.4131372610762982,
"grad_norm": 0.20655429363250732,
"learning_rate": 3.361857812037365e-05,
"loss": 0.1657,
"step": 19160
},
{
"epoch": 2.4137670434864753,
"grad_norm": 0.1987982541322708,
"learning_rate": 3.354926072392101e-05,
"loss": 0.1554,
"step": 19165
},
{
"epoch": 2.414396825896653,
"grad_norm": 0.20431582629680634,
"learning_rate": 3.348000586417539e-05,
"loss": 0.1552,
"step": 19170
},
{
"epoch": 2.41502660830683,
"grad_norm": 0.241183340549469,
"learning_rate": 3.34108135783282e-05,
"loss": 0.1758,
"step": 19175
},
{
"epoch": 2.4156563907170074,
"grad_norm": 0.1910007894039154,
"learning_rate": 3.3341683903537295e-05,
"loss": 0.1609,
"step": 19180
},
{
"epoch": 2.4162861731271845,
"grad_norm": 0.2089349776506424,
"learning_rate": 3.3272616876926916e-05,
"loss": 0.1608,
"step": 19185
},
{
"epoch": 2.4169159555373616,
"grad_norm": 0.20799914002418518,
"learning_rate": 3.3203612535587594e-05,
"loss": 0.1636,
"step": 19190
},
{
"epoch": 2.417545737947539,
"grad_norm": 0.2071768194437027,
"learning_rate": 3.313467091657622e-05,
"loss": 0.1643,
"step": 19195
},
{
"epoch": 2.4181755203577167,
"grad_norm": 0.22981540858745575,
"learning_rate": 3.3065792056916077e-05,
"loss": 0.1749,
"step": 19200
},
{
"epoch": 2.4188053027678937,
"grad_norm": 0.22096100449562073,
"learning_rate": 3.2996975993596706e-05,
"loss": 0.1671,
"step": 19205
},
{
"epoch": 2.419435085178071,
"grad_norm": 0.19851092994213104,
"learning_rate": 3.292822276357382e-05,
"loss": 0.1605,
"step": 19210
},
{
"epoch": 2.4200648675882483,
"grad_norm": 0.21692755818367004,
"learning_rate": 3.285953240376947e-05,
"loss": 0.1629,
"step": 19215
},
{
"epoch": 2.4206946499984254,
"grad_norm": 0.1912955939769745,
"learning_rate": 3.279090495107204e-05,
"loss": 0.1626,
"step": 19220
},
{
"epoch": 2.421324432408603,
"grad_norm": 0.17941045761108398,
"learning_rate": 3.2722340442335993e-05,
"loss": 0.1528,
"step": 19225
},
{
"epoch": 2.42195421481878,
"grad_norm": 0.24879097938537598,
"learning_rate": 3.265383891438203e-05,
"loss": 0.1622,
"step": 19230
},
{
"epoch": 2.4225839972289576,
"grad_norm": 0.27064043283462524,
"learning_rate": 3.258540040399703e-05,
"loss": 0.1677,
"step": 19235
},
{
"epoch": 2.4232137796391346,
"grad_norm": 0.20788533985614777,
"learning_rate": 3.2517024947934046e-05,
"loss": 0.1742,
"step": 19240
},
{
"epoch": 2.4238435620493117,
"grad_norm": 0.20137952268123627,
"learning_rate": 3.2448712582912265e-05,
"loss": 0.1656,
"step": 19245
},
{
"epoch": 2.4244733444594893,
"grad_norm": 0.22439540922641754,
"learning_rate": 3.2380463345616986e-05,
"loss": 0.1704,
"step": 19250
},
{
"epoch": 2.4251031268696663,
"grad_norm": 0.22877377271652222,
"learning_rate": 3.231227727269956e-05,
"loss": 0.1655,
"step": 19255
},
{
"epoch": 2.425732909279844,
"grad_norm": 0.19454975426197052,
"learning_rate": 3.224415440077757e-05,
"loss": 0.1711,
"step": 19260
},
{
"epoch": 2.426362691690021,
"grad_norm": 0.21811099350452423,
"learning_rate": 3.217609476643447e-05,
"loss": 0.1602,
"step": 19265
},
{
"epoch": 2.4269924741001985,
"grad_norm": 0.21663612127304077,
"learning_rate": 3.2108098406219884e-05,
"loss": 0.1626,
"step": 19270
},
{
"epoch": 2.4276222565103756,
"grad_norm": 0.21124348044395447,
"learning_rate": 3.204016535664937e-05,
"loss": 0.1621,
"step": 19275
},
{
"epoch": 2.428252038920553,
"grad_norm": 0.20466844737529755,
"learning_rate": 3.1972295654204554e-05,
"loss": 0.1608,
"step": 19280
},
{
"epoch": 2.42888182133073,
"grad_norm": 0.20153960585594177,
"learning_rate": 3.1904489335333014e-05,
"loss": 0.1699,
"step": 19285
},
{
"epoch": 2.4295116037409077,
"grad_norm": 0.2766586244106293,
"learning_rate": 3.1836746436448294e-05,
"loss": 0.1716,
"step": 19290
},
{
"epoch": 2.4301413861510848,
"grad_norm": 0.1904149353504181,
"learning_rate": 3.176906699392986e-05,
"loss": 0.1756,
"step": 19295
},
{
"epoch": 2.430771168561262,
"grad_norm": 0.19312819838523865,
"learning_rate": 3.170145104412309e-05,
"loss": 0.1666,
"step": 19300
},
{
"epoch": 2.4314009509714394,
"grad_norm": 0.19623906910419464,
"learning_rate": 3.163389862333939e-05,
"loss": 0.1541,
"step": 19305
},
{
"epoch": 2.4320307333816165,
"grad_norm": 0.1920466423034668,
"learning_rate": 3.156640976785592e-05,
"loss": 0.1575,
"step": 19310
},
{
"epoch": 2.432660515791794,
"grad_norm": 0.2178039401769638,
"learning_rate": 3.149898451391565e-05,
"loss": 0.1533,
"step": 19315
},
{
"epoch": 2.433290298201971,
"grad_norm": 0.21117891371250153,
"learning_rate": 3.143162289772757e-05,
"loss": 0.1529,
"step": 19320
},
{
"epoch": 2.4339200806121486,
"grad_norm": 0.21997326612472534,
"learning_rate": 3.1364324955466405e-05,
"loss": 0.167,
"step": 19325
},
{
"epoch": 2.4345498630223257,
"grad_norm": 0.2015310823917389,
"learning_rate": 3.129709072327264e-05,
"loss": 0.1608,
"step": 19330
},
{
"epoch": 2.435179645432503,
"grad_norm": 0.21516267955303192,
"learning_rate": 3.122992023725263e-05,
"loss": 0.159,
"step": 19335
},
{
"epoch": 2.4358094278426803,
"grad_norm": 0.22670945525169373,
"learning_rate": 3.116281353347841e-05,
"loss": 0.1703,
"step": 19340
},
{
"epoch": 2.436439210252858,
"grad_norm": 0.19361603260040283,
"learning_rate": 3.109577064798793e-05,
"loss": 0.1647,
"step": 19345
},
{
"epoch": 2.437068992663035,
"grad_norm": 0.18795832991600037,
"learning_rate": 3.1028791616784624e-05,
"loss": 0.1532,
"step": 19350
},
{
"epoch": 2.437698775073212,
"grad_norm": 0.24311493337154388,
"learning_rate": 3.0961876475837814e-05,
"loss": 0.1599,
"step": 19355
},
{
"epoch": 2.4383285574833895,
"grad_norm": 0.20328237116336823,
"learning_rate": 3.089502526108242e-05,
"loss": 0.1604,
"step": 19360
},
{
"epoch": 2.4389583398935666,
"grad_norm": 0.2067318707704544,
"learning_rate": 3.082823800841914e-05,
"loss": 0.161,
"step": 19365
},
{
"epoch": 2.439588122303744,
"grad_norm": 0.22590148448944092,
"learning_rate": 3.0761514753714235e-05,
"loss": 0.1711,
"step": 19370
},
{
"epoch": 2.440217904713921,
"grad_norm": 0.2264234572649002,
"learning_rate": 3.069485553279958e-05,
"loss": 0.1625,
"step": 19375
},
{
"epoch": 2.4408476871240987,
"grad_norm": 0.21357667446136475,
"learning_rate": 3.062826038147274e-05,
"loss": 0.162,
"step": 19380
},
{
"epoch": 2.441477469534276,
"grad_norm": 0.19787681102752686,
"learning_rate": 3.0561729335496816e-05,
"loss": 0.1566,
"step": 19385
},
{
"epoch": 2.4421072519444533,
"grad_norm": 0.20055502653121948,
"learning_rate": 3.0495262430600487e-05,
"loss": 0.1612,
"step": 19390
},
{
"epoch": 2.4427370343546304,
"grad_norm": 0.2178819328546524,
"learning_rate": 3.0428859702478003e-05,
"loss": 0.1701,
"step": 19395
},
{
"epoch": 2.443366816764808,
"grad_norm": 0.2206692099571228,
"learning_rate": 3.0362521186789125e-05,
"loss": 0.1668,
"step": 19400
},
{
"epoch": 2.443996599174985,
"grad_norm": 0.22452767193317413,
"learning_rate": 3.0296246919159218e-05,
"loss": 0.1713,
"step": 19405
},
{
"epoch": 2.444626381585162,
"grad_norm": 0.23711274564266205,
"learning_rate": 3.023003693517908e-05,
"loss": 0.1637,
"step": 19410
},
{
"epoch": 2.4452561639953396,
"grad_norm": 0.2252456545829773,
"learning_rate": 3.0163891270404904e-05,
"loss": 0.1685,
"step": 19415
},
{
"epoch": 2.4458859464055167,
"grad_norm": 0.2477557361125946,
"learning_rate": 3.0097809960358427e-05,
"loss": 0.1669,
"step": 19420
},
{
"epoch": 2.4465157288156942,
"grad_norm": 0.21543872356414795,
"learning_rate": 3.003179304052689e-05,
"loss": 0.1624,
"step": 19425
},
{
"epoch": 2.4471455112258713,
"grad_norm": 0.18810850381851196,
"learning_rate": 2.9965840546362858e-05,
"loss": 0.1531,
"step": 19430
},
{
"epoch": 2.447775293636049,
"grad_norm": 0.2468540370464325,
"learning_rate": 2.9899952513284307e-05,
"loss": 0.1644,
"step": 19435
},
{
"epoch": 2.448405076046226,
"grad_norm": 0.2639712393283844,
"learning_rate": 2.9834128976674643e-05,
"loss": 0.166,
"step": 19440
},
{
"epoch": 2.4490348584564035,
"grad_norm": 0.18254607915878296,
"learning_rate": 2.9768369971882598e-05,
"loss": 0.1478,
"step": 19445
},
{
"epoch": 2.4496646408665805,
"grad_norm": 0.2060953974723816,
"learning_rate": 2.9702675534222265e-05,
"loss": 0.161,
"step": 19450
},
{
"epoch": 2.450294423276758,
"grad_norm": 0.20919503271579742,
"learning_rate": 2.963704569897305e-05,
"loss": 0.1635,
"step": 19455
},
{
"epoch": 2.450924205686935,
"grad_norm": 0.20381583273410797,
"learning_rate": 2.957148050137963e-05,
"loss": 0.1677,
"step": 19460
},
{
"epoch": 2.4515539880971122,
"grad_norm": 0.23379412293434143,
"learning_rate": 2.9505979976652106e-05,
"loss": 0.1669,
"step": 19465
},
{
"epoch": 2.4521837705072898,
"grad_norm": 0.21713408827781677,
"learning_rate": 2.9440544159965707e-05,
"loss": 0.1639,
"step": 19470
},
{
"epoch": 2.452813552917467,
"grad_norm": 0.2360960692167282,
"learning_rate": 2.9375173086460975e-05,
"loss": 0.1682,
"step": 19475
},
{
"epoch": 2.4534433353276444,
"grad_norm": 0.21496212482452393,
"learning_rate": 2.9309866791243643e-05,
"loss": 0.1508,
"step": 19480
},
{
"epoch": 2.4540731177378214,
"grad_norm": 0.19526614248752594,
"learning_rate": 2.9244625309384706e-05,
"loss": 0.1607,
"step": 19485
},
{
"epoch": 2.454702900147999,
"grad_norm": 0.2625288665294647,
"learning_rate": 2.917944867592031e-05,
"loss": 0.1708,
"step": 19490
},
{
"epoch": 2.455332682558176,
"grad_norm": 0.19196555018424988,
"learning_rate": 2.9114336925851818e-05,
"loss": 0.1715,
"step": 19495
},
{
"epoch": 2.4559624649683536,
"grad_norm": 0.21597431600093842,
"learning_rate": 2.9049290094145726e-05,
"loss": 0.1508,
"step": 19500
},
{
"epoch": 2.4565922473785307,
"grad_norm": 0.2433023750782013,
"learning_rate": 2.8984308215733615e-05,
"loss": 0.1568,
"step": 19505
},
{
"epoch": 2.457222029788708,
"grad_norm": 0.231834277510643,
"learning_rate": 2.8919391325512314e-05,
"loss": 0.1552,
"step": 19510
},
{
"epoch": 2.4578518121988853,
"grad_norm": 0.21281488239765167,
"learning_rate": 2.885453945834369e-05,
"loss": 0.161,
"step": 19515
},
{
"epoch": 2.4584815946090623,
"grad_norm": 0.21355679631233215,
"learning_rate": 2.878975264905455e-05,
"loss": 0.1515,
"step": 19520
},
{
"epoch": 2.45911137701924,
"grad_norm": 0.20718532800674438,
"learning_rate": 2.8725030932437025e-05,
"loss": 0.1622,
"step": 19525
},
{
"epoch": 2.459741159429417,
"grad_norm": 0.21609242260456085,
"learning_rate": 2.8660374343248087e-05,
"loss": 0.1531,
"step": 19530
},
{
"epoch": 2.4603709418395945,
"grad_norm": 0.2453998625278473,
"learning_rate": 2.8595782916209825e-05,
"loss": 0.1605,
"step": 19535
},
{
"epoch": 2.4610007242497716,
"grad_norm": 0.27632614970207214,
"learning_rate": 2.8531256686009306e-05,
"loss": 0.1598,
"step": 19540
},
{
"epoch": 2.461630506659949,
"grad_norm": 0.19357621669769287,
"learning_rate": 2.846679568729855e-05,
"loss": 0.1527,
"step": 19545
},
{
"epoch": 2.462260289070126,
"grad_norm": 0.19920161366462708,
"learning_rate": 2.8402399954694692e-05,
"loss": 0.1561,
"step": 19550
},
{
"epoch": 2.4628900714803037,
"grad_norm": 0.19081860780715942,
"learning_rate": 2.8338069522779595e-05,
"loss": 0.1524,
"step": 19555
},
{
"epoch": 2.463519853890481,
"grad_norm": 0.22451332211494446,
"learning_rate": 2.8273804426100234e-05,
"loss": 0.1628,
"step": 19560
},
{
"epoch": 2.4641496363006583,
"grad_norm": 0.19204290211200714,
"learning_rate": 2.820960469916837e-05,
"loss": 0.1499,
"step": 19565
},
{
"epoch": 2.4647794187108354,
"grad_norm": 0.20258976519107819,
"learning_rate": 2.814547037646081e-05,
"loss": 0.1514,
"step": 19570
},
{
"epoch": 2.4654092011210125,
"grad_norm": 0.21591047942638397,
"learning_rate": 2.8081401492419102e-05,
"loss": 0.1555,
"step": 19575
},
{
"epoch": 2.46603898353119,
"grad_norm": 0.20639857649803162,
"learning_rate": 2.8017398081449728e-05,
"loss": 0.1597,
"step": 19580
},
{
"epoch": 2.466668765941367,
"grad_norm": 0.18190859258174896,
"learning_rate": 2.7953460177923953e-05,
"loss": 0.1676,
"step": 19585
},
{
"epoch": 2.4672985483515446,
"grad_norm": 0.23196272552013397,
"learning_rate": 2.7889587816177884e-05,
"loss": 0.1644,
"step": 19590
},
{
"epoch": 2.4679283307617217,
"grad_norm": 0.23499402403831482,
"learning_rate": 2.782578103051248e-05,
"loss": 0.1596,
"step": 19595
},
{
"epoch": 2.468558113171899,
"grad_norm": 0.19516189396381378,
"learning_rate": 2.7762039855193398e-05,
"loss": 0.1592,
"step": 19600
},
{
"epoch": 2.4691878955820763,
"grad_norm": 0.25550252199172974,
"learning_rate": 2.769836432445109e-05,
"loss": 0.1652,
"step": 19605
},
{
"epoch": 2.469817677992254,
"grad_norm": 0.20900960266590118,
"learning_rate": 2.7634754472480852e-05,
"loss": 0.1576,
"step": 19610
},
{
"epoch": 2.470447460402431,
"grad_norm": 0.19483284652233124,
"learning_rate": 2.757121033344258e-05,
"loss": 0.1671,
"step": 19615
},
{
"epoch": 2.4710772428126084,
"grad_norm": 0.21054719388484955,
"learning_rate": 2.7507731941460952e-05,
"loss": 0.1572,
"step": 19620
},
{
"epoch": 2.4717070252227855,
"grad_norm": 0.23577210307121277,
"learning_rate": 2.7444319330625243e-05,
"loss": 0.1657,
"step": 19625
},
{
"epoch": 2.4723368076329626,
"grad_norm": 0.21099181473255157,
"learning_rate": 2.7380972534989538e-05,
"loss": 0.1696,
"step": 19630
},
{
"epoch": 2.47296659004314,
"grad_norm": 0.20165832340717316,
"learning_rate": 2.7317691588572495e-05,
"loss": 0.1529,
"step": 19635
},
{
"epoch": 2.473596372453317,
"grad_norm": 0.19725088775157928,
"learning_rate": 2.7254476525357443e-05,
"loss": 0.1503,
"step": 19640
},
{
"epoch": 2.4742261548634947,
"grad_norm": 0.23867055773735046,
"learning_rate": 2.7191327379292283e-05,
"loss": 0.1766,
"step": 19645
},
{
"epoch": 2.474855937273672,
"grad_norm": 0.21567271649837494,
"learning_rate": 2.712824418428955e-05,
"loss": 0.1562,
"step": 19650
},
{
"epoch": 2.4754857196838493,
"grad_norm": 0.2127571702003479,
"learning_rate": 2.7065226974226444e-05,
"loss": 0.1588,
"step": 19655
},
{
"epoch": 2.4761155020940264,
"grad_norm": 0.192424476146698,
"learning_rate": 2.700227578294455e-05,
"loss": 0.1632,
"step": 19660
},
{
"epoch": 2.476745284504204,
"grad_norm": 0.19549550116062164,
"learning_rate": 2.693939064425007e-05,
"loss": 0.1666,
"step": 19665
},
{
"epoch": 2.477375066914381,
"grad_norm": 0.21715867519378662,
"learning_rate": 2.6876571591913874e-05,
"loss": 0.1637,
"step": 19670
},
{
"epoch": 2.4780048493245586,
"grad_norm": 0.246476948261261,
"learning_rate": 2.6813818659671167e-05,
"loss": 0.1691,
"step": 19675
},
{
"epoch": 2.4786346317347356,
"grad_norm": 0.19329246878623962,
"learning_rate": 2.6751131881221698e-05,
"loss": 0.1576,
"step": 19680
},
{
"epoch": 2.4792644141449127,
"grad_norm": 0.1897173672914505,
"learning_rate": 2.6688511290229714e-05,
"loss": 0.1566,
"step": 19685
},
{
"epoch": 2.4798941965550902,
"grad_norm": 0.19795387983322144,
"learning_rate": 2.662595692032391e-05,
"loss": 0.159,
"step": 19690
},
{
"epoch": 2.4805239789652673,
"grad_norm": 0.19520628452301025,
"learning_rate": 2.65634688050974e-05,
"loss": 0.1577,
"step": 19695
},
{
"epoch": 2.481153761375445,
"grad_norm": 0.21223746240139008,
"learning_rate": 2.650104697810772e-05,
"loss": 0.1674,
"step": 19700
},
{
"epoch": 2.481783543785622,
"grad_norm": 0.19204822182655334,
"learning_rate": 2.6438691472876828e-05,
"loss": 0.1492,
"step": 19705
},
{
"epoch": 2.4824133261957995,
"grad_norm": 0.2568466067314148,
"learning_rate": 2.6376402322891032e-05,
"loss": 0.1557,
"step": 19710
},
{
"epoch": 2.4830431086059765,
"grad_norm": 0.21695761382579803,
"learning_rate": 2.6314179561601078e-05,
"loss": 0.1715,
"step": 19715
},
{
"epoch": 2.483672891016154,
"grad_norm": 0.21485815942287445,
"learning_rate": 2.625202322242197e-05,
"loss": 0.1599,
"step": 19720
},
{
"epoch": 2.484302673426331,
"grad_norm": 0.18373069167137146,
"learning_rate": 2.6189933338733122e-05,
"loss": 0.1636,
"step": 19725
},
{
"epoch": 2.4849324558365087,
"grad_norm": 0.2190975546836853,
"learning_rate": 2.6127909943878177e-05,
"loss": 0.1613,
"step": 19730
},
{
"epoch": 2.4855622382466858,
"grad_norm": 0.22146424651145935,
"learning_rate": 2.606595307116513e-05,
"loss": 0.1554,
"step": 19735
},
{
"epoch": 2.486192020656863,
"grad_norm": 0.22576889395713806,
"learning_rate": 2.6004062753866228e-05,
"loss": 0.1723,
"step": 19740
},
{
"epoch": 2.4868218030670404,
"grad_norm": 0.22661438584327698,
"learning_rate": 2.5942239025218004e-05,
"loss": 0.1616,
"step": 19745
},
{
"epoch": 2.4874515854772175,
"grad_norm": 0.20992781221866608,
"learning_rate": 2.588048191842118e-05,
"loss": 0.1666,
"step": 19750
},
{
"epoch": 2.488081367887395,
"grad_norm": 0.18685118854045868,
"learning_rate": 2.581879146664078e-05,
"loss": 0.163,
"step": 19755
},
{
"epoch": 2.488711150297572,
"grad_norm": 0.2547582983970642,
"learning_rate": 2.5757167703005987e-05,
"loss": 0.1683,
"step": 19760
},
{
"epoch": 2.4893409327077496,
"grad_norm": 0.18066510558128357,
"learning_rate": 2.569561066061013e-05,
"loss": 0.1581,
"step": 19765
},
{
"epoch": 2.4899707151179267,
"grad_norm": 0.22709952294826508,
"learning_rate": 2.5634120372510708e-05,
"loss": 0.1655,
"step": 19770
},
{
"epoch": 2.490600497528104,
"grad_norm": 0.18300481140613556,
"learning_rate": 2.5572696871729496e-05,
"loss": 0.1634,
"step": 19775
},
{
"epoch": 2.4912302799382813,
"grad_norm": 0.23889437317848206,
"learning_rate": 2.5511340191252294e-05,
"loss": 0.1653,
"step": 19780
},
{
"epoch": 2.491860062348459,
"grad_norm": 0.18972428143024445,
"learning_rate": 2.545005036402904e-05,
"loss": 0.1522,
"step": 19785
},
{
"epoch": 2.492489844758636,
"grad_norm": 0.1869877278804779,
"learning_rate": 2.5388827422973722e-05,
"loss": 0.1587,
"step": 19790
},
{
"epoch": 2.493119627168813,
"grad_norm": 0.200529083609581,
"learning_rate": 2.5327671400964562e-05,
"loss": 0.1621,
"step": 19795
},
{
"epoch": 2.4937494095789905,
"grad_norm": 0.20414294302463531,
"learning_rate": 2.526658233084365e-05,
"loss": 0.1619,
"step": 19800
},
{
"epoch": 2.4943791919891676,
"grad_norm": 0.2506503760814667,
"learning_rate": 2.5205560245417227e-05,
"loss": 0.1711,
"step": 19805
},
{
"epoch": 2.495008974399345,
"grad_norm": 0.2258518785238266,
"learning_rate": 2.5144605177455534e-05,
"loss": 0.1718,
"step": 19810
},
{
"epoch": 2.495638756809522,
"grad_norm": 0.22719348967075348,
"learning_rate": 2.5083717159692902e-05,
"loss": 0.1611,
"step": 19815
},
{
"epoch": 2.4962685392196997,
"grad_norm": 0.18670164048671722,
"learning_rate": 2.502289622482752e-05,
"loss": 0.155,
"step": 19820
},
{
"epoch": 2.496898321629877,
"grad_norm": 0.19051162898540497,
"learning_rate": 2.4962142405521666e-05,
"loss": 0.1528,
"step": 19825
},
{
"epoch": 2.4975281040400543,
"grad_norm": 0.2364228218793869,
"learning_rate": 2.4901455734401508e-05,
"loss": 0.1642,
"step": 19830
},
{
"epoch": 2.4981578864502314,
"grad_norm": 0.1748083382844925,
"learning_rate": 2.484083624405716e-05,
"loss": 0.1536,
"step": 19835
},
{
"epoch": 2.498787668860409,
"grad_norm": 0.21124523878097534,
"learning_rate": 2.4780283967042697e-05,
"loss": 0.1641,
"step": 19840
},
{
"epoch": 2.499417451270586,
"grad_norm": 0.21559958159923553,
"learning_rate": 2.4719798935876073e-05,
"loss": 0.1522,
"step": 19845
},
{
"epoch": 2.500047233680763,
"grad_norm": 0.20545977354049683,
"learning_rate": 2.4659381183039105e-05,
"loss": 0.1492,
"step": 19850
},
{
"epoch": 2.5006770160909406,
"grad_norm": 0.21759046614170074,
"learning_rate": 2.459903074097749e-05,
"loss": 0.1637,
"step": 19855
},
{
"epoch": 2.5013067985011177,
"grad_norm": 0.2807125151157379,
"learning_rate": 2.4538747642100927e-05,
"loss": 0.1701,
"step": 19860
},
{
"epoch": 2.5019365809112952,
"grad_norm": 0.1915740966796875,
"learning_rate": 2.4478531918782656e-05,
"loss": 0.1551,
"step": 19865
},
{
"epoch": 2.5025663633214723,
"grad_norm": 0.1929636150598526,
"learning_rate": 2.441838360335992e-05,
"loss": 0.1561,
"step": 19870
},
{
"epoch": 2.50319614573165,
"grad_norm": 0.23392513394355774,
"learning_rate": 2.4358302728133827e-05,
"loss": 0.1606,
"step": 19875
},
{
"epoch": 2.503825928141827,
"grad_norm": 0.21680179238319397,
"learning_rate": 2.4298289325369137e-05,
"loss": 0.166,
"step": 19880
},
{
"epoch": 2.5044557105520044,
"grad_norm": 0.20863774418830872,
"learning_rate": 2.42383434272944e-05,
"loss": 0.153,
"step": 19885
},
{
"epoch": 2.5050854929621815,
"grad_norm": 0.2562030851840973,
"learning_rate": 2.4178465066101933e-05,
"loss": 0.1591,
"step": 19890
},
{
"epoch": 2.505715275372359,
"grad_norm": 0.22802165150642395,
"learning_rate": 2.4118654273947796e-05,
"loss": 0.1664,
"step": 19895
},
{
"epoch": 2.506345057782536,
"grad_norm": 0.23240098357200623,
"learning_rate": 2.4058911082951764e-05,
"loss": 0.1585,
"step": 19900
},
{
"epoch": 2.506974840192713,
"grad_norm": 0.21342670917510986,
"learning_rate": 2.3999235525197275e-05,
"loss": 0.1471,
"step": 19905
},
{
"epoch": 2.5076046226028907,
"grad_norm": 0.20659485459327698,
"learning_rate": 2.3939627632731458e-05,
"loss": 0.1593,
"step": 19910
},
{
"epoch": 2.508234405013068,
"grad_norm": 0.21293510496616364,
"learning_rate": 2.3880087437565104e-05,
"loss": 0.1575,
"step": 19915
},
{
"epoch": 2.5088641874232454,
"grad_norm": 0.2346251904964447,
"learning_rate": 2.382061497167271e-05,
"loss": 0.1639,
"step": 19920
},
{
"epoch": 2.5094939698334224,
"grad_norm": 0.22029395401477814,
"learning_rate": 2.376121026699232e-05,
"loss": 0.1537,
"step": 19925
},
{
"epoch": 2.5101237522436,
"grad_norm": 0.1979423314332962,
"learning_rate": 2.3701873355425606e-05,
"loss": 0.154,
"step": 19930
},
{
"epoch": 2.510753534653777,
"grad_norm": 0.1969837099313736,
"learning_rate": 2.3642604268837873e-05,
"loss": 0.1623,
"step": 19935
},
{
"epoch": 2.5113833170639546,
"grad_norm": 0.23190250992774963,
"learning_rate": 2.3583403039057946e-05,
"loss": 0.1673,
"step": 19940
},
{
"epoch": 2.5120130994741316,
"grad_norm": 0.20579595863819122,
"learning_rate": 2.3524269697878244e-05,
"loss": 0.1638,
"step": 19945
},
{
"epoch": 2.512642881884309,
"grad_norm": 0.2181597501039505,
"learning_rate": 2.3465204277054734e-05,
"loss": 0.1535,
"step": 19950
},
{
"epoch": 2.5132726642944863,
"grad_norm": 0.27504584193229675,
"learning_rate": 2.3406206808306854e-05,
"loss": 0.1687,
"step": 19955
},
{
"epoch": 2.5139024467046633,
"grad_norm": 0.21288301050662994,
"learning_rate": 2.334727732331765e-05,
"loss": 0.1611,
"step": 19960
},
{
"epoch": 2.514532229114841,
"grad_norm": 0.20768193900585175,
"learning_rate": 2.3288415853733615e-05,
"loss": 0.1595,
"step": 19965
},
{
"epoch": 2.515162011525018,
"grad_norm": 0.1934243142604828,
"learning_rate": 2.322962243116464e-05,
"loss": 0.1573,
"step": 19970
},
{
"epoch": 2.5157917939351955,
"grad_norm": 0.21198545396327972,
"learning_rate": 2.3170897087184133e-05,
"loss": 0.1549,
"step": 19975
},
{
"epoch": 2.5164215763453726,
"grad_norm": 0.25898632407188416,
"learning_rate": 2.3112239853328996e-05,
"loss": 0.171,
"step": 19980
},
{
"epoch": 2.51705135875555,
"grad_norm": 0.2114986777305603,
"learning_rate": 2.3053650761099485e-05,
"loss": 0.1544,
"step": 19985
},
{
"epoch": 2.517681141165727,
"grad_norm": 0.21560825407505035,
"learning_rate": 2.2995129841959266e-05,
"loss": 0.1736,
"step": 19990
},
{
"epoch": 2.5183109235759042,
"grad_norm": 0.18198496103286743,
"learning_rate": 2.2936677127335395e-05,
"loss": 0.154,
"step": 19995
},
{
"epoch": 2.5189407059860818,
"grad_norm": 0.20246680080890656,
"learning_rate": 2.287829264861842e-05,
"loss": 0.1598,
"step": 20000
},
{
"epoch": 2.5189407059860818,
"eval_loss": 0.3499235212802887,
"eval_runtime": 6.1623,
"eval_samples_per_second": 162.278,
"eval_steps_per_second": 10.224,
"step": 20000
},
{
"epoch": 2.5195704883962593,
"grad_norm": 0.2162911742925644,
"learning_rate": 2.2819976437162e-05,
"loss": 0.1623,
"step": 20005
},
{
"epoch": 2.5202002708064364,
"grad_norm": 0.16897226870059967,
"learning_rate": 2.2761728524283344e-05,
"loss": 0.1511,
"step": 20010
},
{
"epoch": 2.5208300532166135,
"grad_norm": 0.19399495422840118,
"learning_rate": 2.2703548941262877e-05,
"loss": 0.1615,
"step": 20015
},
{
"epoch": 2.521459835626791,
"grad_norm": 0.24832330644130707,
"learning_rate": 2.2645437719344424e-05,
"loss": 0.1596,
"step": 20020
},
{
"epoch": 2.522089618036968,
"grad_norm": 0.1990746706724167,
"learning_rate": 2.2587394889734982e-05,
"loss": 0.1517,
"step": 20025
},
{
"epoch": 2.5227194004471456,
"grad_norm": 0.2356463521718979,
"learning_rate": 2.252942048360491e-05,
"loss": 0.1666,
"step": 20030
},
{
"epoch": 2.5233491828573227,
"grad_norm": 0.2032928168773651,
"learning_rate": 2.2471514532087766e-05,
"loss": 0.1463,
"step": 20035
},
{
"epoch": 2.5239789652675,
"grad_norm": 0.24515411257743835,
"learning_rate": 2.2413677066280388e-05,
"loss": 0.1629,
"step": 20040
},
{
"epoch": 2.5246087476776773,
"grad_norm": 0.24597153067588806,
"learning_rate": 2.2355908117242803e-05,
"loss": 0.1507,
"step": 20045
},
{
"epoch": 2.5252385300878544,
"grad_norm": 0.1958838254213333,
"learning_rate": 2.2298207715998246e-05,
"loss": 0.167,
"step": 20050
},
{
"epoch": 2.525868312498032,
"grad_norm": 0.18343359231948853,
"learning_rate": 2.2240575893533176e-05,
"loss": 0.1582,
"step": 20055
},
{
"epoch": 2.5264980949082094,
"grad_norm": 0.2554282248020172,
"learning_rate": 2.218301268079715e-05,
"loss": 0.1701,
"step": 20060
},
{
"epoch": 2.5271278773183865,
"grad_norm": 0.2655259072780609,
"learning_rate": 2.2125518108703e-05,
"loss": 0.1666,
"step": 20065
},
{
"epoch": 2.5277576597285636,
"grad_norm": 0.23147699236869812,
"learning_rate": 2.206809220812662e-05,
"loss": 0.1646,
"step": 20070
},
{
"epoch": 2.528387442138741,
"grad_norm": 0.19453732669353485,
"learning_rate": 2.2010735009906926e-05,
"loss": 0.1595,
"step": 20075
},
{
"epoch": 2.529017224548918,
"grad_norm": 0.21716727316379547,
"learning_rate": 2.195344654484615e-05,
"loss": 0.1656,
"step": 20080
},
{
"epoch": 2.5296470069590957,
"grad_norm": 0.19851936399936676,
"learning_rate": 2.1896226843709475e-05,
"loss": 0.1545,
"step": 20085
},
{
"epoch": 2.530276789369273,
"grad_norm": 0.20362606644630432,
"learning_rate": 2.1839075937225192e-05,
"loss": 0.1534,
"step": 20090
},
{
"epoch": 2.5309065717794503,
"grad_norm": 0.23197387158870697,
"learning_rate": 2.1781993856084633e-05,
"loss": 0.1624,
"step": 20095
},
{
"epoch": 2.5315363541896274,
"grad_norm": 0.2547961473464966,
"learning_rate": 2.1724980630942145e-05,
"loss": 0.1539,
"step": 20100
},
{
"epoch": 2.5321661365998045,
"grad_norm": 0.1867532879114151,
"learning_rate": 2.1668036292415237e-05,
"loss": 0.1518,
"step": 20105
},
{
"epoch": 2.532795919009982,
"grad_norm": 0.19402964413166046,
"learning_rate": 2.161116087108421e-05,
"loss": 0.1522,
"step": 20110
},
{
"epoch": 2.5334257014201595,
"grad_norm": 0.20450226962566376,
"learning_rate": 2.1554354397492517e-05,
"loss": 0.155,
"step": 20115
},
{
"epoch": 2.5340554838303366,
"grad_norm": 0.22179925441741943,
"learning_rate": 2.149761690214649e-05,
"loss": 0.1557,
"step": 20120
},
{
"epoch": 2.5346852662405137,
"grad_norm": 0.2105506807565689,
"learning_rate": 2.1440948415515524e-05,
"loss": 0.1668,
"step": 20125
},
{
"epoch": 2.5353150486506912,
"grad_norm": 0.24963414669036865,
"learning_rate": 2.1384348968031857e-05,
"loss": 0.1597,
"step": 20130
},
{
"epoch": 2.5359448310608683,
"grad_norm": 0.23433445394039154,
"learning_rate": 2.132781859009069e-05,
"loss": 0.1579,
"step": 20135
},
{
"epoch": 2.536574613471046,
"grad_norm": 0.19620360434055328,
"learning_rate": 2.1271357312050126e-05,
"loss": 0.1492,
"step": 20140
},
{
"epoch": 2.537204395881223,
"grad_norm": 0.23040203750133514,
"learning_rate": 2.1214965164231157e-05,
"loss": 0.1585,
"step": 20145
},
{
"epoch": 2.5378341782914005,
"grad_norm": 0.23273873329162598,
"learning_rate": 2.1158642176917647e-05,
"loss": 0.1589,
"step": 20150
},
{
"epoch": 2.5384639607015775,
"grad_norm": 0.2472730576992035,
"learning_rate": 2.1102388380356344e-05,
"loss": 0.1677,
"step": 20155
},
{
"epoch": 2.5390937431117546,
"grad_norm": 0.19982990622520447,
"learning_rate": 2.104620380475679e-05,
"loss": 0.1515,
"step": 20160
},
{
"epoch": 2.539723525521932,
"grad_norm": 0.21257297694683075,
"learning_rate": 2.099008848029143e-05,
"loss": 0.165,
"step": 20165
},
{
"epoch": 2.5403533079321097,
"grad_norm": 0.20112313330173492,
"learning_rate": 2.0934042437095457e-05,
"loss": 0.1497,
"step": 20170
},
{
"epoch": 2.5409830903422868,
"grad_norm": 0.24434730410575867,
"learning_rate": 2.087806570526691e-05,
"loss": 0.1583,
"step": 20175
},
{
"epoch": 2.541612872752464,
"grad_norm": 0.20866596698760986,
"learning_rate": 2.0822158314866467e-05,
"loss": 0.1584,
"step": 20180
},
{
"epoch": 2.5422426551626414,
"grad_norm": 0.1903751641511917,
"learning_rate": 2.076632029591777e-05,
"loss": 0.1447,
"step": 20185
},
{
"epoch": 2.5428724375728184,
"grad_norm": 0.24377766251564026,
"learning_rate": 2.071055167840709e-05,
"loss": 0.1636,
"step": 20190
},
{
"epoch": 2.543502219982996,
"grad_norm": 0.25960245728492737,
"learning_rate": 2.0654852492283446e-05,
"loss": 0.164,
"step": 20195
},
{
"epoch": 2.544132002393173,
"grad_norm": 0.23870185017585754,
"learning_rate": 2.0599222767458533e-05,
"loss": 0.1579,
"step": 20200
},
{
"epoch": 2.5447617848033506,
"grad_norm": 0.2245192676782608,
"learning_rate": 2.0543662533806855e-05,
"loss": 0.1655,
"step": 20205
},
{
"epoch": 2.5453915672135277,
"grad_norm": 0.23136839270591736,
"learning_rate": 2.048817182116554e-05,
"loss": 0.1591,
"step": 20210
},
{
"epoch": 2.5460213496237047,
"grad_norm": 0.21092520654201508,
"learning_rate": 2.043275065933427e-05,
"loss": 0.1536,
"step": 20215
},
{
"epoch": 2.5466511320338823,
"grad_norm": 0.18601630628108978,
"learning_rate": 2.0377399078075485e-05,
"loss": 0.1523,
"step": 20220
},
{
"epoch": 2.54728091444406,
"grad_norm": 0.21489211916923523,
"learning_rate": 2.0322117107114343e-05,
"loss": 0.1554,
"step": 20225
},
{
"epoch": 2.547910696854237,
"grad_norm": 0.2098049521446228,
"learning_rate": 2.026690477613845e-05,
"loss": 0.1522,
"step": 20230
},
{
"epoch": 2.548540479264414,
"grad_norm": 0.2187887281179428,
"learning_rate": 2.021176211479813e-05,
"loss": 0.15,
"step": 20235
},
{
"epoch": 2.5491702616745915,
"grad_norm": 0.2641262710094452,
"learning_rate": 2.0156689152706216e-05,
"loss": 0.1725,
"step": 20240
},
{
"epoch": 2.5498000440847686,
"grad_norm": 0.22713615000247955,
"learning_rate": 2.010168591943817e-05,
"loss": 0.1528,
"step": 20245
},
{
"epoch": 2.550429826494946,
"grad_norm": 0.20724020898342133,
"learning_rate": 2.0046752444531976e-05,
"loss": 0.1646,
"step": 20250
},
{
"epoch": 2.551059608905123,
"grad_norm": 0.19516219198703766,
"learning_rate": 1.9991888757488156e-05,
"loss": 0.1574,
"step": 20255
},
{
"epoch": 2.5516893913153007,
"grad_norm": 0.22299246490001678,
"learning_rate": 1.993709488776979e-05,
"loss": 0.1656,
"step": 20260
},
{
"epoch": 2.552319173725478,
"grad_norm": 0.1897648572921753,
"learning_rate": 1.9882370864802373e-05,
"loss": 0.1639,
"step": 20265
},
{
"epoch": 2.552948956135655,
"grad_norm": 0.23607775568962097,
"learning_rate": 1.9827716717974048e-05,
"loss": 0.1618,
"step": 20270
},
{
"epoch": 2.5535787385458324,
"grad_norm": 0.250823438167572,
"learning_rate": 1.9773132476635285e-05,
"loss": 0.1628,
"step": 20275
},
{
"epoch": 2.55420852095601,
"grad_norm": 0.2012414038181305,
"learning_rate": 1.9718618170099087e-05,
"loss": 0.1536,
"step": 20280
},
{
"epoch": 2.554838303366187,
"grad_norm": 0.17350980639457703,
"learning_rate": 1.9664173827640873e-05,
"loss": 0.1524,
"step": 20285
},
{
"epoch": 2.555468085776364,
"grad_norm": 0.18761439621448517,
"learning_rate": 1.96097994784985e-05,
"loss": 0.1452,
"step": 20290
},
{
"epoch": 2.5560978681865416,
"grad_norm": 0.2061910331249237,
"learning_rate": 1.955549515187223e-05,
"loss": 0.1507,
"step": 20295
},
{
"epoch": 2.5567276505967187,
"grad_norm": 0.20667202770709991,
"learning_rate": 1.9501260876924736e-05,
"loss": 0.1484,
"step": 20300
},
{
"epoch": 2.557357433006896,
"grad_norm": 0.19904933869838715,
"learning_rate": 1.9447096682781015e-05,
"loss": 0.1562,
"step": 20305
},
{
"epoch": 2.5579872154170733,
"grad_norm": 0.20500166714191437,
"learning_rate": 1.9393002598528555e-05,
"loss": 0.1505,
"step": 20310
},
{
"epoch": 2.558616997827251,
"grad_norm": 0.21382258832454681,
"learning_rate": 1.933897865321712e-05,
"loss": 0.1606,
"step": 20315
},
{
"epoch": 2.559246780237428,
"grad_norm": 0.22117263078689575,
"learning_rate": 1.928502487585873e-05,
"loss": 0.1626,
"step": 20320
},
{
"epoch": 2.559876562647605,
"grad_norm": 0.2301877737045288,
"learning_rate": 1.9231141295427794e-05,
"loss": 0.1566,
"step": 20325
},
{
"epoch": 2.5605063450577825,
"grad_norm": 0.23893754184246063,
"learning_rate": 1.917732794086108e-05,
"loss": 0.1571,
"step": 20330
},
{
"epoch": 2.56113612746796,
"grad_norm": 0.2627946734428406,
"learning_rate": 1.9123584841057578e-05,
"loss": 0.1658,
"step": 20335
},
{
"epoch": 2.561765909878137,
"grad_norm": 0.19281533360481262,
"learning_rate": 1.906991202487854e-05,
"loss": 0.1525,
"step": 20340
},
{
"epoch": 2.562395692288314,
"grad_norm": 0.2772383689880371,
"learning_rate": 1.901630952114752e-05,
"loss": 0.1661,
"step": 20345
},
{
"epoch": 2.5630254746984917,
"grad_norm": 0.216465026140213,
"learning_rate": 1.896277735865027e-05,
"loss": 0.1538,
"step": 20350
},
{
"epoch": 2.563655257108669,
"grad_norm": 0.23878604173660278,
"learning_rate": 1.8909315566134782e-05,
"loss": 0.1601,
"step": 20355
},
{
"epoch": 2.5642850395188463,
"grad_norm": 0.2141411155462265,
"learning_rate": 1.8855924172311248e-05,
"loss": 0.1631,
"step": 20360
},
{
"epoch": 2.5649148219290234,
"grad_norm": 0.2064596712589264,
"learning_rate": 1.8802603205852073e-05,
"loss": 0.1578,
"step": 20365
},
{
"epoch": 2.565544604339201,
"grad_norm": 0.19963258504867554,
"learning_rate": 1.8749352695391867e-05,
"loss": 0.1609,
"step": 20370
},
{
"epoch": 2.566174386749378,
"grad_norm": 0.1846475601196289,
"learning_rate": 1.8696172669527336e-05,
"loss": 0.1544,
"step": 20375
},
{
"epoch": 2.566804169159555,
"grad_norm": 0.1911933869123459,
"learning_rate": 1.8643063156817423e-05,
"loss": 0.1565,
"step": 20380
},
{
"epoch": 2.5674339515697326,
"grad_norm": 0.2263742834329605,
"learning_rate": 1.8590024185783042e-05,
"loss": 0.1673,
"step": 20385
},
{
"epoch": 2.56806373397991,
"grad_norm": 0.21621178090572357,
"learning_rate": 1.8537055784907413e-05,
"loss": 0.1587,
"step": 20390
},
{
"epoch": 2.5686935163900873,
"grad_norm": 0.20875446498394012,
"learning_rate": 1.848415798263576e-05,
"loss": 0.1564,
"step": 20395
},
{
"epoch": 2.5693232988002643,
"grad_norm": 0.20144003629684448,
"learning_rate": 1.8431330807375417e-05,
"loss": 0.156,
"step": 20400
},
{
"epoch": 2.569953081210442,
"grad_norm": 0.24883227050304413,
"learning_rate": 1.837857428749575e-05,
"loss": 0.1568,
"step": 20405
},
{
"epoch": 2.570582863620619,
"grad_norm": 0.18426820635795593,
"learning_rate": 1.832588845132827e-05,
"loss": 0.1549,
"step": 20410
},
{
"epoch": 2.5712126460307965,
"grad_norm": 0.2462303191423416,
"learning_rate": 1.827327332716649e-05,
"loss": 0.1625,
"step": 20415
},
{
"epoch": 2.5718424284409735,
"grad_norm": 0.21659249067306519,
"learning_rate": 1.8220728943265837e-05,
"loss": 0.1521,
"step": 20420
},
{
"epoch": 2.572472210851151,
"grad_norm": 0.17811377346515656,
"learning_rate": 1.8168255327843882e-05,
"loss": 0.1586,
"step": 20425
},
{
"epoch": 2.573101993261328,
"grad_norm": 0.19524861872196198,
"learning_rate": 1.8115852509080197e-05,
"loss": 0.1474,
"step": 20430
},
{
"epoch": 2.5737317756715052,
"grad_norm": 0.17528071999549866,
"learning_rate": 1.806352051511627e-05,
"loss": 0.1599,
"step": 20435
},
{
"epoch": 2.5743615580816828,
"grad_norm": 0.19344571232795715,
"learning_rate": 1.801125937405557e-05,
"loss": 0.1579,
"step": 20440
},
{
"epoch": 2.5749913404918603,
"grad_norm": 0.20909984409809113,
"learning_rate": 1.795906911396353e-05,
"loss": 0.1584,
"step": 20445
},
{
"epoch": 2.5756211229020374,
"grad_norm": 0.17548586428165436,
"learning_rate": 1.790694976286752e-05,
"loss": 0.1535,
"step": 20450
},
{
"epoch": 2.5762509053122145,
"grad_norm": 0.25540080666542053,
"learning_rate": 1.7854901348756807e-05,
"loss": 0.1637,
"step": 20455
},
{
"epoch": 2.576880687722392,
"grad_norm": 0.1803160160779953,
"learning_rate": 1.780292389958257e-05,
"loss": 0.1526,
"step": 20460
},
{
"epoch": 2.577510470132569,
"grad_norm": 0.260122686624527,
"learning_rate": 1.775101744325792e-05,
"loss": 0.1704,
"step": 20465
},
{
"epoch": 2.5781402525427466,
"grad_norm": 0.19697842001914978,
"learning_rate": 1.7699182007657736e-05,
"loss": 0.1568,
"step": 20470
},
{
"epoch": 2.5787700349529237,
"grad_norm": 0.2179180532693863,
"learning_rate": 1.7647417620618936e-05,
"loss": 0.1612,
"step": 20475
},
{
"epoch": 2.579399817363101,
"grad_norm": 0.2509031593799591,
"learning_rate": 1.7595724309940117e-05,
"loss": 0.1531,
"step": 20480
},
{
"epoch": 2.5800295997732783,
"grad_norm": 0.19090527296066284,
"learning_rate": 1.754410210338179e-05,
"loss": 0.1477,
"step": 20485
},
{
"epoch": 2.5806593821834554,
"grad_norm": 0.20662526786327362,
"learning_rate": 1.749255102866623e-05,
"loss": 0.1584,
"step": 20490
},
{
"epoch": 2.581289164593633,
"grad_norm": 0.2258034199476242,
"learning_rate": 1.7441071113477572e-05,
"loss": 0.1597,
"step": 20495
},
{
"epoch": 2.5819189470038104,
"grad_norm": 0.22661426663398743,
"learning_rate": 1.738966238546169e-05,
"loss": 0.1582,
"step": 20500
},
{
"epoch": 2.5825487294139875,
"grad_norm": 0.24459710717201233,
"learning_rate": 1.7338324872226227e-05,
"loss": 0.1523,
"step": 20505
},
{
"epoch": 2.5831785118241646,
"grad_norm": 0.18816480040550232,
"learning_rate": 1.728705860134062e-05,
"loss": 0.1554,
"step": 20510
},
{
"epoch": 2.583808294234342,
"grad_norm": 0.18354368209838867,
"learning_rate": 1.7235863600336042e-05,
"loss": 0.1413,
"step": 20515
},
{
"epoch": 2.584438076644519,
"grad_norm": 0.1984662562608719,
"learning_rate": 1.71847398967054e-05,
"loss": 0.1566,
"step": 20520
},
{
"epoch": 2.5850678590546967,
"grad_norm": 0.2628153860569,
"learning_rate": 1.713368751790322e-05,
"loss": 0.1592,
"step": 20525
},
{
"epoch": 2.585697641464874,
"grad_norm": 0.18952016532421112,
"learning_rate": 1.7082706491345806e-05,
"loss": 0.1531,
"step": 20530
},
{
"epoch": 2.5863274238750513,
"grad_norm": 0.16905049979686737,
"learning_rate": 1.7031796844411198e-05,
"loss": 0.1556,
"step": 20535
},
{
"epoch": 2.5869572062852284,
"grad_norm": 0.20969530940055847,
"learning_rate": 1.6980958604438988e-05,
"loss": 0.1585,
"step": 20540
},
{
"epoch": 2.5875869886954055,
"grad_norm": 0.2143043577671051,
"learning_rate": 1.693019179873048e-05,
"loss": 0.1636,
"step": 20545
},
{
"epoch": 2.588216771105583,
"grad_norm": 0.24208824336528778,
"learning_rate": 1.6879496454548585e-05,
"loss": 0.1572,
"step": 20550
},
{
"epoch": 2.5888465535157605,
"grad_norm": 0.22409161925315857,
"learning_rate": 1.6828872599117958e-05,
"loss": 0.162,
"step": 20555
},
{
"epoch": 2.5894763359259376,
"grad_norm": 0.20685546100139618,
"learning_rate": 1.6778320259624654e-05,
"loss": 0.1587,
"step": 20560
},
{
"epoch": 2.5901061183361147,
"grad_norm": 0.19393740594387054,
"learning_rate": 1.672783946321649e-05,
"loss": 0.1491,
"step": 20565
},
{
"epoch": 2.5907359007462922,
"grad_norm": 0.1944616734981537,
"learning_rate": 1.667743023700275e-05,
"loss": 0.1565,
"step": 20570
},
{
"epoch": 2.5913656831564693,
"grad_norm": 0.21134309470653534,
"learning_rate": 1.662709260805442e-05,
"loss": 0.1575,
"step": 20575
},
{
"epoch": 2.591995465566647,
"grad_norm": 0.20300306379795074,
"learning_rate": 1.657682660340392e-05,
"loss": 0.1509,
"step": 20580
},
{
"epoch": 2.592625247976824,
"grad_norm": 0.209407240152359,
"learning_rate": 1.6526632250045237e-05,
"loss": 0.1568,
"step": 20585
},
{
"epoch": 2.5932550303870014,
"grad_norm": 0.18960040807724,
"learning_rate": 1.6476509574933888e-05,
"loss": 0.1561,
"step": 20590
},
{
"epoch": 2.5938848127971785,
"grad_norm": 0.2009792923927307,
"learning_rate": 1.6426458604986897e-05,
"loss": 0.1584,
"step": 20595
},
{
"epoch": 2.5945145952073556,
"grad_norm": 0.2359851896762848,
"learning_rate": 1.6376479367082796e-05,
"loss": 0.1573,
"step": 20600
},
{
"epoch": 2.595144377617533,
"grad_norm": 0.2108912616968155,
"learning_rate": 1.632657188806153e-05,
"loss": 0.1537,
"step": 20605
},
{
"epoch": 2.5957741600277107,
"grad_norm": 0.22792066633701324,
"learning_rate": 1.6276736194724575e-05,
"loss": 0.1611,
"step": 20610
},
{
"epoch": 2.5964039424378877,
"grad_norm": 0.1896820068359375,
"learning_rate": 1.622697231383488e-05,
"loss": 0.1623,
"step": 20615
},
{
"epoch": 2.597033724848065,
"grad_norm": 0.19234326481819153,
"learning_rate": 1.6177280272116728e-05,
"loss": 0.1448,
"step": 20620
},
{
"epoch": 2.5976635072582424,
"grad_norm": 0.17547307908535004,
"learning_rate": 1.6127660096255955e-05,
"loss": 0.1479,
"step": 20625
},
{
"epoch": 2.5982932896684194,
"grad_norm": 0.20076265931129456,
"learning_rate": 1.6078111812899618e-05,
"loss": 0.1504,
"step": 20630
},
{
"epoch": 2.598923072078597,
"grad_norm": 0.1888744831085205,
"learning_rate": 1.6028635448656364e-05,
"loss": 0.1587,
"step": 20635
},
{
"epoch": 2.599552854488774,
"grad_norm": 0.2309001237154007,
"learning_rate": 1.59792310300961e-05,
"loss": 0.1662,
"step": 20640
},
{
"epoch": 2.6001826368989516,
"grad_norm": 0.29581940174102783,
"learning_rate": 1.592989858375013e-05,
"loss": 0.1708,
"step": 20645
},
{
"epoch": 2.6008124193091287,
"grad_norm": 0.19039921462535858,
"learning_rate": 1.588063813611112e-05,
"loss": 0.1548,
"step": 20650
},
{
"epoch": 2.6014422017193057,
"grad_norm": 0.18266427516937256,
"learning_rate": 1.5831449713632993e-05,
"loss": 0.1523,
"step": 20655
},
{
"epoch": 2.6020719841294833,
"grad_norm": 0.1932811439037323,
"learning_rate": 1.5782333342731174e-05,
"loss": 0.1516,
"step": 20660
},
{
"epoch": 2.602701766539661,
"grad_norm": 0.17890222370624542,
"learning_rate": 1.5733289049782177e-05,
"loss": 0.1511,
"step": 20665
},
{
"epoch": 2.603331548949838,
"grad_norm": 0.20573283731937408,
"learning_rate": 1.5684316861123935e-05,
"loss": 0.1525,
"step": 20670
},
{
"epoch": 2.603961331360015,
"grad_norm": 0.21194593608379364,
"learning_rate": 1.5635416803055596e-05,
"loss": 0.1599,
"step": 20675
},
{
"epoch": 2.6045911137701925,
"grad_norm": 0.17930278182029724,
"learning_rate": 1.558658890183768e-05,
"loss": 0.1571,
"step": 20680
},
{
"epoch": 2.6052208961803696,
"grad_norm": 0.1965799480676651,
"learning_rate": 1.5537833183691857e-05,
"loss": 0.1552,
"step": 20685
},
{
"epoch": 2.605850678590547,
"grad_norm": 0.20715682208538055,
"learning_rate": 1.5489149674801054e-05,
"loss": 0.1588,
"step": 20690
},
{
"epoch": 2.606480461000724,
"grad_norm": 0.1894584596157074,
"learning_rate": 1.544053840130943e-05,
"loss": 0.1547,
"step": 20695
},
{
"epoch": 2.6071102434109017,
"grad_norm": 0.20791690051555634,
"learning_rate": 1.539199938932234e-05,
"loss": 0.1479,
"step": 20700
},
{
"epoch": 2.6077400258210788,
"grad_norm": 0.20393605530261993,
"learning_rate": 1.534353266490636e-05,
"loss": 0.1491,
"step": 20705
},
{
"epoch": 2.608369808231256,
"grad_norm": 0.20201466977596283,
"learning_rate": 1.5295138254089206e-05,
"loss": 0.1664,
"step": 20710
},
{
"epoch": 2.6089995906414334,
"grad_norm": 0.220575213432312,
"learning_rate": 1.5246816182859773e-05,
"loss": 0.16,
"step": 20715
},
{
"epoch": 2.609629373051611,
"grad_norm": 0.1888882964849472,
"learning_rate": 1.5198566477168166e-05,
"loss": 0.1592,
"step": 20720
},
{
"epoch": 2.610259155461788,
"grad_norm": 0.2035285383462906,
"learning_rate": 1.5150389162925564e-05,
"loss": 0.149,
"step": 20725
},
{
"epoch": 2.610888937871965,
"grad_norm": 0.21430674195289612,
"learning_rate": 1.5102284266004282e-05,
"loss": 0.1568,
"step": 20730
},
{
"epoch": 2.6115187202821426,
"grad_norm": 0.2220098227262497,
"learning_rate": 1.5054251812237695e-05,
"loss": 0.1601,
"step": 20735
},
{
"epoch": 2.6121485026923197,
"grad_norm": 0.18914029002189636,
"learning_rate": 1.5006291827420397e-05,
"loss": 0.1524,
"step": 20740
},
{
"epoch": 2.612778285102497,
"grad_norm": 0.19741562008857727,
"learning_rate": 1.4958404337307972e-05,
"loss": 0.1418,
"step": 20745
},
{
"epoch": 2.6134080675126743,
"grad_norm": 0.22962430119514465,
"learning_rate": 1.49105893676171e-05,
"loss": 0.1523,
"step": 20750
},
{
"epoch": 2.614037849922852,
"grad_norm": 0.17770111560821533,
"learning_rate": 1.4862846944025469e-05,
"loss": 0.1585,
"step": 20755
},
{
"epoch": 2.614667632333029,
"grad_norm": 0.22975338995456696,
"learning_rate": 1.481517709217191e-05,
"loss": 0.1608,
"step": 20760
},
{
"epoch": 2.615297414743206,
"grad_norm": 0.21070002019405365,
"learning_rate": 1.476757983765624e-05,
"loss": 0.1442,
"step": 20765
},
{
"epoch": 2.6159271971533835,
"grad_norm": 0.19414427876472473,
"learning_rate": 1.47200552060392e-05,
"loss": 0.1445,
"step": 20770
},
{
"epoch": 2.616556979563561,
"grad_norm": 0.18657416105270386,
"learning_rate": 1.4672603222842605e-05,
"loss": 0.1534,
"step": 20775
},
{
"epoch": 2.617186761973738,
"grad_norm": 0.2389591485261917,
"learning_rate": 1.4625223913549323e-05,
"loss": 0.158,
"step": 20780
},
{
"epoch": 2.617816544383915,
"grad_norm": 0.19741186499595642,
"learning_rate": 1.4577917303603081e-05,
"loss": 0.1585,
"step": 20785
},
{
"epoch": 2.6184463267940927,
"grad_norm": 0.18730677664279938,
"learning_rate": 1.4530683418408612e-05,
"loss": 0.1487,
"step": 20790
},
{
"epoch": 2.61907610920427,
"grad_norm": 0.2060120701789856,
"learning_rate": 1.4483522283331606e-05,
"loss": 0.1499,
"step": 20795
},
{
"epoch": 2.6197058916144473,
"grad_norm": 0.2186814844608307,
"learning_rate": 1.4436433923698638e-05,
"loss": 0.1562,
"step": 20800
},
{
"epoch": 2.6203356740246244,
"grad_norm": 0.21503032743930817,
"learning_rate": 1.4389418364797279e-05,
"loss": 0.1456,
"step": 20805
},
{
"epoch": 2.620965456434802,
"grad_norm": 0.17447194457054138,
"learning_rate": 1.4342475631875916e-05,
"loss": 0.1454,
"step": 20810
},
{
"epoch": 2.621595238844979,
"grad_norm": 0.18272021412849426,
"learning_rate": 1.4295605750143851e-05,
"loss": 0.149,
"step": 20815
},
{
"epoch": 2.622225021255156,
"grad_norm": 0.2014734447002411,
"learning_rate": 1.424880874477135e-05,
"loss": 0.1582,
"step": 20820
},
{
"epoch": 2.6228548036653336,
"grad_norm": 0.21231862902641296,
"learning_rate": 1.4202084640889443e-05,
"loss": 0.152,
"step": 20825
},
{
"epoch": 2.623484586075511,
"grad_norm": 0.19817417860031128,
"learning_rate": 1.415543346359006e-05,
"loss": 0.1492,
"step": 20830
},
{
"epoch": 2.6241143684856882,
"grad_norm": 0.20216423273086548,
"learning_rate": 1.410885523792586e-05,
"loss": 0.1452,
"step": 20835
},
{
"epoch": 2.6247441508958653,
"grad_norm": 0.20939548313617706,
"learning_rate": 1.4062349988910515e-05,
"loss": 0.1512,
"step": 20840
},
{
"epoch": 2.625373933306043,
"grad_norm": 0.19018815457820892,
"learning_rate": 1.4015917741518384e-05,
"loss": 0.1579,
"step": 20845
},
{
"epoch": 2.62600371571622,
"grad_norm": 0.20512887835502625,
"learning_rate": 1.396955852068462e-05,
"loss": 0.1624,
"step": 20850
},
{
"epoch": 2.6266334981263975,
"grad_norm": 0.24390068650245667,
"learning_rate": 1.3923272351305193e-05,
"loss": 0.1663,
"step": 20855
},
{
"epoch": 2.6272632805365745,
"grad_norm": 0.21338611841201782,
"learning_rate": 1.38770592582368e-05,
"loss": 0.1695,
"step": 20860
},
{
"epoch": 2.627893062946752,
"grad_norm": 0.21631261706352234,
"learning_rate": 1.3830919266297025e-05,
"loss": 0.1598,
"step": 20865
},
{
"epoch": 2.628522845356929,
"grad_norm": 0.21297498047351837,
"learning_rate": 1.3784852400264013e-05,
"loss": 0.1554,
"step": 20870
},
{
"epoch": 2.6291526277671062,
"grad_norm": 0.22182469069957733,
"learning_rate": 1.3738858684876724e-05,
"loss": 0.1618,
"step": 20875
},
{
"epoch": 2.6297824101772838,
"grad_norm": 0.2602301239967346,
"learning_rate": 1.369293814483487e-05,
"loss": 0.1616,
"step": 20880
},
{
"epoch": 2.6304121925874613,
"grad_norm": 0.20623180270195007,
"learning_rate": 1.3647090804798822e-05,
"loss": 0.1574,
"step": 20885
},
{
"epoch": 2.6310419749976384,
"grad_norm": 0.22888506948947906,
"learning_rate": 1.3601316689389635e-05,
"loss": 0.1476,
"step": 20890
},
{
"epoch": 2.6316717574078154,
"grad_norm": 0.1814454197883606,
"learning_rate": 1.3555615823189065e-05,
"loss": 0.1505,
"step": 20895
},
{
"epoch": 2.632301539817993,
"grad_norm": 0.19201195240020752,
"learning_rate": 1.350998823073951e-05,
"loss": 0.1559,
"step": 20900
},
{
"epoch": 2.63293132222817,
"grad_norm": 0.21456103026866913,
"learning_rate": 1.3464433936544055e-05,
"loss": 0.1519,
"step": 20905
},
{
"epoch": 2.6335611046383476,
"grad_norm": 0.21142180263996124,
"learning_rate": 1.3418952965066365e-05,
"loss": 0.153,
"step": 20910
},
{
"epoch": 2.6341908870485247,
"grad_norm": 0.2231752574443817,
"learning_rate": 1.3373545340730785e-05,
"loss": 0.1641,
"step": 20915
},
{
"epoch": 2.634820669458702,
"grad_norm": 0.19116418063640594,
"learning_rate": 1.3328211087922192e-05,
"loss": 0.1503,
"step": 20920
},
{
"epoch": 2.6354504518688793,
"grad_norm": 0.18010330200195312,
"learning_rate": 1.3282950230986194e-05,
"loss": 0.1434,
"step": 20925
},
{
"epoch": 2.6360802342790564,
"grad_norm": 0.179785817861557,
"learning_rate": 1.3237762794228884e-05,
"loss": 0.1502,
"step": 20930
},
{
"epoch": 2.636710016689234,
"grad_norm": 0.1842677742242813,
"learning_rate": 1.319264880191695e-05,
"loss": 0.1421,
"step": 20935
},
{
"epoch": 2.6373397990994114,
"grad_norm": 0.22725196182727814,
"learning_rate": 1.314760827827756e-05,
"loss": 0.1464,
"step": 20940
},
{
"epoch": 2.6379695815095885,
"grad_norm": 0.21761812269687653,
"learning_rate": 1.3102641247498585e-05,
"loss": 0.1492,
"step": 20945
},
{
"epoch": 2.6385993639197656,
"grad_norm": 0.2054702490568161,
"learning_rate": 1.305774773372834e-05,
"loss": 0.1626,
"step": 20950
},
{
"epoch": 2.639229146329943,
"grad_norm": 0.20189544558525085,
"learning_rate": 1.3012927761075658e-05,
"loss": 0.1672,
"step": 20955
},
{
"epoch": 2.63985892874012,
"grad_norm": 0.2214374542236328,
"learning_rate": 1.2968181353609852e-05,
"loss": 0.159,
"step": 20960
},
{
"epoch": 2.6404887111502977,
"grad_norm": 0.20227175951004028,
"learning_rate": 1.2923508535360833e-05,
"loss": 0.1668,
"step": 20965
},
{
"epoch": 2.641118493560475,
"grad_norm": 0.2125934362411499,
"learning_rate": 1.2878909330318893e-05,
"loss": 0.1587,
"step": 20970
},
{
"epoch": 2.6417482759706523,
"grad_norm": 0.20071247220039368,
"learning_rate": 1.2834383762434807e-05,
"loss": 0.1515,
"step": 20975
},
{
"epoch": 2.6423780583808294,
"grad_norm": 0.18576478958129883,
"learning_rate": 1.2789931855619817e-05,
"loss": 0.152,
"step": 20980
},
{
"epoch": 2.6430078407910065,
"grad_norm": 0.2210751622915268,
"learning_rate": 1.2745553633745642e-05,
"loss": 0.1542,
"step": 20985
},
{
"epoch": 2.643637623201184,
"grad_norm": 0.20466601848602295,
"learning_rate": 1.2701249120644402e-05,
"loss": 0.1599,
"step": 20990
},
{
"epoch": 2.6442674056113615,
"grad_norm": 0.18101972341537476,
"learning_rate": 1.2657018340108616e-05,
"loss": 0.1434,
"step": 20995
},
{
"epoch": 2.6448971880215386,
"grad_norm": 0.19254201650619507,
"learning_rate": 1.2612861315891215e-05,
"loss": 0.1492,
"step": 21000
},
{
"epoch": 2.6448971880215386,
"eval_loss": 0.35662394762039185,
"eval_runtime": 6.1657,
"eval_samples_per_second": 162.189,
"eval_steps_per_second": 10.218,
"step": 21000
},
{
"epoch": 2.6455269704317157,
"grad_norm": 0.21622531116008759,
"learning_rate": 1.2568778071705564e-05,
"loss": 0.1508,
"step": 21005
},
{
"epoch": 2.6461567528418932,
"grad_norm": 0.20254169404506683,
"learning_rate": 1.2524768631225329e-05,
"loss": 0.1541,
"step": 21010
},
{
"epoch": 2.6467865352520703,
"grad_norm": 0.18706361949443817,
"learning_rate": 1.2480833018084619e-05,
"loss": 0.1554,
"step": 21015
},
{
"epoch": 2.647416317662248,
"grad_norm": 0.19682380557060242,
"learning_rate": 1.2436971255877825e-05,
"loss": 0.1527,
"step": 21020
},
{
"epoch": 2.648046100072425,
"grad_norm": 0.193098783493042,
"learning_rate": 1.2393183368159759e-05,
"loss": 0.1505,
"step": 21025
},
{
"epoch": 2.6486758824826024,
"grad_norm": 0.1954520344734192,
"learning_rate": 1.2349469378445493e-05,
"loss": 0.1463,
"step": 21030
},
{
"epoch": 2.6493056648927795,
"grad_norm": 0.2523531913757324,
"learning_rate": 1.2305829310210446e-05,
"loss": 0.1655,
"step": 21035
},
{
"epoch": 2.6499354473029566,
"grad_norm": 0.20331156253814697,
"learning_rate": 1.2262263186890325e-05,
"loss": 0.1514,
"step": 21040
},
{
"epoch": 2.650565229713134,
"grad_norm": 0.2400408834218979,
"learning_rate": 1.221877103188113e-05,
"loss": 0.1673,
"step": 21045
},
{
"epoch": 2.6511950121233117,
"grad_norm": 0.18541163206100464,
"learning_rate": 1.2175352868539162e-05,
"loss": 0.1582,
"step": 21050
},
{
"epoch": 2.6518247945334887,
"grad_norm": 0.24442121386528015,
"learning_rate": 1.2132008720180953e-05,
"loss": 0.1525,
"step": 21055
},
{
"epoch": 2.652454576943666,
"grad_norm": 0.23133227229118347,
"learning_rate": 1.2088738610083282e-05,
"loss": 0.1547,
"step": 21060
},
{
"epoch": 2.6530843593538433,
"grad_norm": 0.21159769594669342,
"learning_rate": 1.2045542561483196e-05,
"loss": 0.1451,
"step": 21065
},
{
"epoch": 2.6537141417640204,
"grad_norm": 0.19382759928703308,
"learning_rate": 1.2002420597577972e-05,
"loss": 0.1532,
"step": 21070
},
{
"epoch": 2.654343924174198,
"grad_norm": 0.18077696859836578,
"learning_rate": 1.1959372741525135e-05,
"loss": 0.1493,
"step": 21075
},
{
"epoch": 2.654973706584375,
"grad_norm": 0.24647746980190277,
"learning_rate": 1.1916399016442264e-05,
"loss": 0.1533,
"step": 21080
},
{
"epoch": 2.6556034889945526,
"grad_norm": 0.19929052889347076,
"learning_rate": 1.1873499445407291e-05,
"loss": 0.1418,
"step": 21085
},
{
"epoch": 2.6562332714047296,
"grad_norm": 0.2208561897277832,
"learning_rate": 1.1830674051458277e-05,
"loss": 0.1628,
"step": 21090
},
{
"epoch": 2.6568630538149067,
"grad_norm": 0.18301671743392944,
"learning_rate": 1.1787922857593406e-05,
"loss": 0.1495,
"step": 21095
},
{
"epoch": 2.6574928362250843,
"grad_norm": 0.1876724660396576,
"learning_rate": 1.1745245886771065e-05,
"loss": 0.1526,
"step": 21100
},
{
"epoch": 2.658122618635262,
"grad_norm": 0.18740510940551758,
"learning_rate": 1.1702643161909736e-05,
"loss": 0.1469,
"step": 21105
},
{
"epoch": 2.658752401045439,
"grad_norm": 0.2159937024116516,
"learning_rate": 1.1660114705888119e-05,
"loss": 0.1534,
"step": 21110
},
{
"epoch": 2.659382183455616,
"grad_norm": 0.19949863851070404,
"learning_rate": 1.1617660541544893e-05,
"loss": 0.1446,
"step": 21115
},
{
"epoch": 2.6600119658657935,
"grad_norm": 0.18760685622692108,
"learning_rate": 1.1575280691678956e-05,
"loss": 0.1495,
"step": 21120
},
{
"epoch": 2.6606417482759706,
"grad_norm": 0.1935281902551651,
"learning_rate": 1.153297517904922e-05,
"loss": 0.1563,
"step": 21125
},
{
"epoch": 2.661271530686148,
"grad_norm": 0.20625917613506317,
"learning_rate": 1.1490744026374743e-05,
"loss": 0.1527,
"step": 21130
},
{
"epoch": 2.661901313096325,
"grad_norm": 0.22293558716773987,
"learning_rate": 1.1448587256334618e-05,
"loss": 0.1573,
"step": 21135
},
{
"epoch": 2.6625310955065027,
"grad_norm": 0.22753605246543884,
"learning_rate": 1.1406504891567986e-05,
"loss": 0.1563,
"step": 21140
},
{
"epoch": 2.6631608779166798,
"grad_norm": 0.1857980489730835,
"learning_rate": 1.1364496954674035e-05,
"loss": 0.1542,
"step": 21145
},
{
"epoch": 2.663790660326857,
"grad_norm": 0.20376616716384888,
"learning_rate": 1.1322563468212003e-05,
"loss": 0.1533,
"step": 21150
},
{
"epoch": 2.6644204427370344,
"grad_norm": 0.18928895890712738,
"learning_rate": 1.1280704454701111e-05,
"loss": 0.151,
"step": 21155
},
{
"epoch": 2.665050225147212,
"grad_norm": 0.2215338945388794,
"learning_rate": 1.1238919936620593e-05,
"loss": 0.1484,
"step": 21160
},
{
"epoch": 2.665680007557389,
"grad_norm": 0.26164811849594116,
"learning_rate": 1.1197209936409702e-05,
"loss": 0.1604,
"step": 21165
},
{
"epoch": 2.666309789967566,
"grad_norm": 0.1930347979068756,
"learning_rate": 1.1155574476467682e-05,
"loss": 0.1578,
"step": 21170
},
{
"epoch": 2.6669395723777436,
"grad_norm": 0.18400873243808746,
"learning_rate": 1.1114013579153719e-05,
"loss": 0.1559,
"step": 21175
},
{
"epoch": 2.6675693547879207,
"grad_norm": 0.19113576412200928,
"learning_rate": 1.1072527266786974e-05,
"loss": 0.1583,
"step": 21180
},
{
"epoch": 2.668199137198098,
"grad_norm": 0.1980462521314621,
"learning_rate": 1.1031115561646476e-05,
"loss": 0.1516,
"step": 21185
},
{
"epoch": 2.6688289196082753,
"grad_norm": 0.24164487421512604,
"learning_rate": 1.0989778485971334e-05,
"loss": 0.1578,
"step": 21190
},
{
"epoch": 2.669458702018453,
"grad_norm": 0.18318641185760498,
"learning_rate": 1.0948516061960478e-05,
"loss": 0.1517,
"step": 21195
},
{
"epoch": 2.67008848442863,
"grad_norm": 0.20504848659038544,
"learning_rate": 1.0907328311772778e-05,
"loss": 0.1619,
"step": 21200
},
{
"epoch": 2.670718266838807,
"grad_norm": 0.214483380317688,
"learning_rate": 1.0866215257526978e-05,
"loss": 0.1445,
"step": 21205
},
{
"epoch": 2.6713480492489845,
"grad_norm": 0.24230434000492096,
"learning_rate": 1.0825176921301698e-05,
"loss": 0.1521,
"step": 21210
},
{
"epoch": 2.671977831659162,
"grad_norm": 0.22668616473674774,
"learning_rate": 1.0784213325135577e-05,
"loss": 0.1539,
"step": 21215
},
{
"epoch": 2.672607614069339,
"grad_norm": 0.18815076351165771,
"learning_rate": 1.0743324491026883e-05,
"loss": 0.1496,
"step": 21220
},
{
"epoch": 2.673237396479516,
"grad_norm": 0.24532602727413177,
"learning_rate": 1.070251044093387e-05,
"loss": 0.1572,
"step": 21225
},
{
"epoch": 2.6738671788896937,
"grad_norm": 0.2050776183605194,
"learning_rate": 1.066177119677467e-05,
"loss": 0.1585,
"step": 21230
},
{
"epoch": 2.674496961299871,
"grad_norm": 0.1992231011390686,
"learning_rate": 1.062110678042717e-05,
"loss": 0.1493,
"step": 21235
},
{
"epoch": 2.6751267437100483,
"grad_norm": 0.2188093513250351,
"learning_rate": 1.0580517213729062e-05,
"loss": 0.1526,
"step": 21240
},
{
"epoch": 2.6757565261202254,
"grad_norm": 0.17839093506336212,
"learning_rate": 1.0540002518477898e-05,
"loss": 0.146,
"step": 21245
},
{
"epoch": 2.676386308530403,
"grad_norm": 0.20759402215480804,
"learning_rate": 1.0499562716430987e-05,
"loss": 0.1527,
"step": 21250
},
{
"epoch": 2.67701609094058,
"grad_norm": 0.20209045708179474,
"learning_rate": 1.0459197829305427e-05,
"loss": 0.1507,
"step": 21255
},
{
"epoch": 2.677645873350757,
"grad_norm": 0.24553018808364868,
"learning_rate": 1.0418907878778077e-05,
"loss": 0.1568,
"step": 21260
},
{
"epoch": 2.6782756557609346,
"grad_norm": 0.24322043359279633,
"learning_rate": 1.0378692886485563e-05,
"loss": 0.1527,
"step": 21265
},
{
"epoch": 2.678905438171112,
"grad_norm": 0.20755696296691895,
"learning_rate": 1.0338552874024242e-05,
"loss": 0.1497,
"step": 21270
},
{
"epoch": 2.6795352205812892,
"grad_norm": 0.19075340032577515,
"learning_rate": 1.0298487862950256e-05,
"loss": 0.1514,
"step": 21275
},
{
"epoch": 2.6801650029914663,
"grad_norm": 0.20733466744422913,
"learning_rate": 1.0258497874779426e-05,
"loss": 0.1531,
"step": 21280
},
{
"epoch": 2.680794785401644,
"grad_norm": 0.20700550079345703,
"learning_rate": 1.0218582930987224e-05,
"loss": 0.1547,
"step": 21285
},
{
"epoch": 2.681424567811821,
"grad_norm": 0.1864207237958908,
"learning_rate": 1.0178743053008969e-05,
"loss": 0.1507,
"step": 21290
},
{
"epoch": 2.6820543502219985,
"grad_norm": 0.21107596158981323,
"learning_rate": 1.0138978262239532e-05,
"loss": 0.1511,
"step": 21295
},
{
"epoch": 2.6826841326321755,
"grad_norm": 0.25058072805404663,
"learning_rate": 1.0099288580033548e-05,
"loss": 0.1573,
"step": 21300
},
{
"epoch": 2.683313915042353,
"grad_norm": 0.18913084268569946,
"learning_rate": 1.005967402770525e-05,
"loss": 0.1415,
"step": 21305
},
{
"epoch": 2.68394369745253,
"grad_norm": 0.18435829877853394,
"learning_rate": 1.002013462652857e-05,
"loss": 0.1432,
"step": 21310
},
{
"epoch": 2.6845734798627072,
"grad_norm": 0.19929499924182892,
"learning_rate": 9.980670397737106e-06,
"loss": 0.1562,
"step": 21315
},
{
"epoch": 2.6852032622728847,
"grad_norm": 0.2412646859884262,
"learning_rate": 9.941281362524007e-06,
"loss": 0.1544,
"step": 21320
},
{
"epoch": 2.6858330446830623,
"grad_norm": 0.23384952545166016,
"learning_rate": 9.9019675420421e-06,
"loss": 0.1598,
"step": 21325
},
{
"epoch": 2.6864628270932394,
"grad_norm": 0.1778135895729065,
"learning_rate": 9.862728957403766e-06,
"loss": 0.1515,
"step": 21330
},
{
"epoch": 2.6870926095034164,
"grad_norm": 0.20782922208309174,
"learning_rate": 9.823565629681079e-06,
"loss": 0.1504,
"step": 21335
},
{
"epoch": 2.687722391913594,
"grad_norm": 0.18523196876049042,
"learning_rate": 9.78447757990562e-06,
"loss": 0.1425,
"step": 21340
},
{
"epoch": 2.688352174323771,
"grad_norm": 0.18965183198451996,
"learning_rate": 9.745464829068561e-06,
"loss": 0.1541,
"step": 21345
},
{
"epoch": 2.6889819567339486,
"grad_norm": 0.18834419548511505,
"learning_rate": 9.706527398120645e-06,
"loss": 0.1536,
"step": 21350
},
{
"epoch": 2.6896117391441257,
"grad_norm": 0.18705077469348907,
"learning_rate": 9.66766530797216e-06,
"loss": 0.1451,
"step": 21355
},
{
"epoch": 2.690241521554303,
"grad_norm": 0.1886008232831955,
"learning_rate": 9.628878579492932e-06,
"loss": 0.1484,
"step": 21360
},
{
"epoch": 2.6908713039644803,
"grad_norm": 0.19375784695148468,
"learning_rate": 9.590167233512314e-06,
"loss": 0.1554,
"step": 21365
},
{
"epoch": 2.6915010863746573,
"grad_norm": 0.18573135137557983,
"learning_rate": 9.551531290819192e-06,
"loss": 0.1608,
"step": 21370
},
{
"epoch": 2.692130868784835,
"grad_norm": 0.18215128779411316,
"learning_rate": 9.512970772161955e-06,
"loss": 0.1564,
"step": 21375
},
{
"epoch": 2.6927606511950124,
"grad_norm": 0.1941639482975006,
"learning_rate": 9.474485698248469e-06,
"loss": 0.1551,
"step": 21380
},
{
"epoch": 2.6933904336051895,
"grad_norm": 0.19289493560791016,
"learning_rate": 9.436076089746153e-06,
"loss": 0.1537,
"step": 21385
},
{
"epoch": 2.6940202160153666,
"grad_norm": 0.19275395572185516,
"learning_rate": 9.397741967281724e-06,
"loss": 0.1441,
"step": 21390
},
{
"epoch": 2.694649998425544,
"grad_norm": 0.18316887319087982,
"learning_rate": 9.359483351441599e-06,
"loss": 0.1496,
"step": 21395
},
{
"epoch": 2.695279780835721,
"grad_norm": 0.16594599187374115,
"learning_rate": 9.321300262771475e-06,
"loss": 0.1408,
"step": 21400
},
{
"epoch": 2.6959095632458987,
"grad_norm": 0.2479625940322876,
"learning_rate": 9.28319272177655e-06,
"loss": 0.1565,
"step": 21405
},
{
"epoch": 2.696539345656076,
"grad_norm": 0.18492808938026428,
"learning_rate": 9.245160748921454e-06,
"loss": 0.143,
"step": 21410
},
{
"epoch": 2.6971691280662533,
"grad_norm": 0.22853007912635803,
"learning_rate": 9.207204364630182e-06,
"loss": 0.1668,
"step": 21415
},
{
"epoch": 2.6977989104764304,
"grad_norm": 0.1997872143983841,
"learning_rate": 9.169323589286264e-06,
"loss": 0.1563,
"step": 21420
},
{
"epoch": 2.6984286928866075,
"grad_norm": 0.23863272368907928,
"learning_rate": 9.131518443232476e-06,
"loss": 0.1554,
"step": 21425
},
{
"epoch": 2.699058475296785,
"grad_norm": 0.17353664338588715,
"learning_rate": 9.09378894677103e-06,
"loss": 0.147,
"step": 21430
},
{
"epoch": 2.699688257706962,
"grad_norm": 0.2168291211128235,
"learning_rate": 9.056135120163582e-06,
"loss": 0.1553,
"step": 21435
},
{
"epoch": 2.7003180401171396,
"grad_norm": 0.23211082816123962,
"learning_rate": 9.018556983631076e-06,
"loss": 0.1493,
"step": 21440
},
{
"epoch": 2.7009478225273167,
"grad_norm": 0.22088773548603058,
"learning_rate": 8.981054557353834e-06,
"loss": 0.1567,
"step": 21445
},
{
"epoch": 2.701577604937494,
"grad_norm": 0.20668818056583405,
"learning_rate": 8.943627861471497e-06,
"loss": 0.1559,
"step": 21450
},
{
"epoch": 2.7022073873476713,
"grad_norm": 0.22993560135364532,
"learning_rate": 8.906276916083072e-06,
"loss": 0.1628,
"step": 21455
},
{
"epoch": 2.702837169757849,
"grad_norm": 0.214871346950531,
"learning_rate": 8.869001741246862e-06,
"loss": 0.1567,
"step": 21460
},
{
"epoch": 2.703466952168026,
"grad_norm": 0.20056143403053284,
"learning_rate": 8.831802356980505e-06,
"loss": 0.1494,
"step": 21465
},
{
"epoch": 2.7040967345782034,
"grad_norm": 0.18365876376628876,
"learning_rate": 8.79467878326089e-06,
"loss": 0.1547,
"step": 21470
},
{
"epoch": 2.7047265169883805,
"grad_norm": 0.1938326060771942,
"learning_rate": 8.757631040024215e-06,
"loss": 0.1591,
"step": 21475
},
{
"epoch": 2.7053562993985576,
"grad_norm": 0.207264244556427,
"learning_rate": 8.72065914716602e-06,
"loss": 0.1588,
"step": 21480
},
{
"epoch": 2.705986081808735,
"grad_norm": 0.23815831542015076,
"learning_rate": 8.683763124541021e-06,
"loss": 0.1551,
"step": 21485
},
{
"epoch": 2.706615864218912,
"grad_norm": 0.20644132792949677,
"learning_rate": 8.646942991963236e-06,
"loss": 0.1496,
"step": 21490
},
{
"epoch": 2.7072456466290897,
"grad_norm": 0.19380377233028412,
"learning_rate": 8.610198769205895e-06,
"loss": 0.1499,
"step": 21495
},
{
"epoch": 2.707875429039267,
"grad_norm": 0.1877509504556656,
"learning_rate": 8.5735304760015e-06,
"loss": 0.1582,
"step": 21500
},
{
"epoch": 2.7085052114494443,
"grad_norm": 0.20092125236988068,
"learning_rate": 8.536938132041781e-06,
"loss": 0.1541,
"step": 21505
},
{
"epoch": 2.7091349938596214,
"grad_norm": 0.20917046070098877,
"learning_rate": 8.500421756977637e-06,
"loss": 0.1555,
"step": 21510
},
{
"epoch": 2.709764776269799,
"grad_norm": 0.18814347684383392,
"learning_rate": 8.463981370419165e-06,
"loss": 0.1511,
"step": 21515
},
{
"epoch": 2.710394558679976,
"grad_norm": 0.2021394819021225,
"learning_rate": 8.427616991935759e-06,
"loss": 0.1539,
"step": 21520
},
{
"epoch": 2.7110243410901536,
"grad_norm": 0.19899116456508636,
"learning_rate": 8.3913286410559e-06,
"loss": 0.1553,
"step": 21525
},
{
"epoch": 2.7116541235003306,
"grad_norm": 0.2093294858932495,
"learning_rate": 8.355116337267231e-06,
"loss": 0.1581,
"step": 21530
},
{
"epoch": 2.7122839059105077,
"grad_norm": 0.215724378824234,
"learning_rate": 8.318980100016564e-06,
"loss": 0.1516,
"step": 21535
},
{
"epoch": 2.7129136883206852,
"grad_norm": 0.21019119024276733,
"learning_rate": 8.28291994870996e-06,
"loss": 0.1521,
"step": 21540
},
{
"epoch": 2.7135434707308623,
"grad_norm": 0.20992571115493774,
"learning_rate": 8.246935902712493e-06,
"loss": 0.1401,
"step": 21545
},
{
"epoch": 2.71417325314104,
"grad_norm": 0.1939440220594406,
"learning_rate": 8.21102798134844e-06,
"loss": 0.1572,
"step": 21550
},
{
"epoch": 2.714803035551217,
"grad_norm": 0.2128129005432129,
"learning_rate": 8.175196203901157e-06,
"loss": 0.1624,
"step": 21555
},
{
"epoch": 2.7154328179613945,
"grad_norm": 0.22001588344573975,
"learning_rate": 8.139440589613122e-06,
"loss": 0.1498,
"step": 21560
},
{
"epoch": 2.7160626003715715,
"grad_norm": 0.24958358705043793,
"learning_rate": 8.103761157685939e-06,
"loss": 0.1614,
"step": 21565
},
{
"epoch": 2.716692382781749,
"grad_norm": 0.21756353974342346,
"learning_rate": 8.068157927280284e-06,
"loss": 0.1515,
"step": 21570
},
{
"epoch": 2.717322165191926,
"grad_norm": 0.19753116369247437,
"learning_rate": 8.032630917515842e-06,
"loss": 0.1504,
"step": 21575
},
{
"epoch": 2.7179519476021037,
"grad_norm": 0.2083761841058731,
"learning_rate": 7.997180147471505e-06,
"loss": 0.1488,
"step": 21580
},
{
"epoch": 2.7185817300122808,
"grad_norm": 0.2009708434343338,
"learning_rate": 7.961805636185126e-06,
"loss": 0.1475,
"step": 21585
},
{
"epoch": 2.719211512422458,
"grad_norm": 0.23513175547122955,
"learning_rate": 7.926507402653609e-06,
"loss": 0.1479,
"step": 21590
},
{
"epoch": 2.7198412948326354,
"grad_norm": 0.1990012526512146,
"learning_rate": 7.891285465832909e-06,
"loss": 0.1498,
"step": 21595
},
{
"epoch": 2.7204710772428125,
"grad_norm": 0.2000730186700821,
"learning_rate": 7.856139844638044e-06,
"loss": 0.1553,
"step": 21600
},
{
"epoch": 2.72110085965299,
"grad_norm": 0.17009419202804565,
"learning_rate": 7.821070557942966e-06,
"loss": 0.138,
"step": 21605
},
{
"epoch": 2.721730642063167,
"grad_norm": 0.19666020572185516,
"learning_rate": 7.786077624580728e-06,
"loss": 0.1505,
"step": 21610
},
{
"epoch": 2.7223604244733446,
"grad_norm": 0.20230218768119812,
"learning_rate": 7.751161063343314e-06,
"loss": 0.1459,
"step": 21615
},
{
"epoch": 2.7229902068835217,
"grad_norm": 0.20249028503894806,
"learning_rate": 7.716320892981692e-06,
"loss": 0.1481,
"step": 21620
},
{
"epoch": 2.7236199892936988,
"grad_norm": 0.183380126953125,
"learning_rate": 7.681557132205861e-06,
"loss": 0.1513,
"step": 21625
},
{
"epoch": 2.7242497717038763,
"grad_norm": 0.22188283503055573,
"learning_rate": 7.646869799684791e-06,
"loss": 0.1534,
"step": 21630
},
{
"epoch": 2.724879554114054,
"grad_norm": 0.19538500905036926,
"learning_rate": 7.6122589140462766e-06,
"loss": 0.1524,
"step": 21635
},
{
"epoch": 2.725509336524231,
"grad_norm": 0.1824834644794464,
"learning_rate": 7.577724493877219e-06,
"loss": 0.1564,
"step": 21640
},
{
"epoch": 2.726139118934408,
"grad_norm": 0.18397974967956543,
"learning_rate": 7.543266557723398e-06,
"loss": 0.1467,
"step": 21645
},
{
"epoch": 2.7267689013445855,
"grad_norm": 0.22993116080760956,
"learning_rate": 7.508885124089481e-06,
"loss": 0.1546,
"step": 21650
},
{
"epoch": 2.7273986837547626,
"grad_norm": 0.18351049721240997,
"learning_rate": 7.47458021143908e-06,
"loss": 0.1616,
"step": 21655
},
{
"epoch": 2.72802846616494,
"grad_norm": 0.20072756707668304,
"learning_rate": 7.440351838194724e-06,
"loss": 0.1451,
"step": 21660
},
{
"epoch": 2.728658248575117,
"grad_norm": 0.19199103116989136,
"learning_rate": 7.406200022737879e-06,
"loss": 0.1518,
"step": 21665
},
{
"epoch": 2.7292880309852947,
"grad_norm": 0.21039634943008423,
"learning_rate": 7.372124783408789e-06,
"loss": 0.154,
"step": 21670
},
{
"epoch": 2.729917813395472,
"grad_norm": 0.2162015289068222,
"learning_rate": 7.33812613850665e-06,
"loss": 0.1459,
"step": 21675
},
{
"epoch": 2.730547595805649,
"grad_norm": 0.192021444439888,
"learning_rate": 7.304204106289507e-06,
"loss": 0.1547,
"step": 21680
},
{
"epoch": 2.7311773782158264,
"grad_norm": 0.20860375463962555,
"learning_rate": 7.270358704974289e-06,
"loss": 0.1501,
"step": 21685
},
{
"epoch": 2.731807160626004,
"grad_norm": 0.1841016709804535,
"learning_rate": 7.236589952736738e-06,
"loss": 0.1538,
"step": 21690
},
{
"epoch": 2.732436943036181,
"grad_norm": 0.23411309719085693,
"learning_rate": 7.202897867711449e-06,
"loss": 0.153,
"step": 21695
},
{
"epoch": 2.733066725446358,
"grad_norm": 0.2005651742219925,
"learning_rate": 7.1692824679918325e-06,
"loss": 0.1505,
"step": 21700
},
{
"epoch": 2.7336965078565356,
"grad_norm": 0.18157663941383362,
"learning_rate": 7.135743771630131e-06,
"loss": 0.1424,
"step": 21705
},
{
"epoch": 2.7343262902667127,
"grad_norm": 0.20939917862415314,
"learning_rate": 7.102281796637388e-06,
"loss": 0.1585,
"step": 21710
},
{
"epoch": 2.7349560726768902,
"grad_norm": 0.17006689310073853,
"learning_rate": 7.068896560983445e-06,
"loss": 0.1529,
"step": 21715
},
{
"epoch": 2.7355858550870673,
"grad_norm": 0.23061016201972961,
"learning_rate": 7.035588082596927e-06,
"loss": 0.1556,
"step": 21720
},
{
"epoch": 2.736215637497245,
"grad_norm": 0.20175643265247345,
"learning_rate": 7.002356379365276e-06,
"loss": 0.1559,
"step": 21725
},
{
"epoch": 2.736845419907422,
"grad_norm": 0.19943305850028992,
"learning_rate": 6.969201469134683e-06,
"loss": 0.147,
"step": 21730
},
{
"epoch": 2.737475202317599,
"grad_norm": 0.22196878492832184,
"learning_rate": 6.936123369710056e-06,
"loss": 0.1517,
"step": 21735
},
{
"epoch": 2.7381049847277765,
"grad_norm": 0.19505414366722107,
"learning_rate": 6.903122098855085e-06,
"loss": 0.1464,
"step": 21740
},
{
"epoch": 2.738734767137954,
"grad_norm": 0.19797982275485992,
"learning_rate": 6.870197674292227e-06,
"loss": 0.1407,
"step": 21745
},
{
"epoch": 2.739364549548131,
"grad_norm": 0.2223568856716156,
"learning_rate": 6.837350113702672e-06,
"loss": 0.1524,
"step": 21750
},
{
"epoch": 2.739994331958308,
"grad_norm": 0.22087423503398895,
"learning_rate": 6.804579434726276e-06,
"loss": 0.1578,
"step": 21755
},
{
"epoch": 2.7406241143684857,
"grad_norm": 0.19389192759990692,
"learning_rate": 6.771885654961662e-06,
"loss": 0.1445,
"step": 21760
},
{
"epoch": 2.741253896778663,
"grad_norm": 0.20979470014572144,
"learning_rate": 6.739268791966118e-06,
"loss": 0.1548,
"step": 21765
},
{
"epoch": 2.7418836791888404,
"grad_norm": 0.22365309298038483,
"learning_rate": 6.7067288632556505e-06,
"loss": 0.1471,
"step": 21770
},
{
"epoch": 2.7425134615990174,
"grad_norm": 0.20007841289043427,
"learning_rate": 6.674265886304964e-06,
"loss": 0.1548,
"step": 21775
},
{
"epoch": 2.743143244009195,
"grad_norm": 0.1756853312253952,
"learning_rate": 6.641879878547379e-06,
"loss": 0.1443,
"step": 21780
},
{
"epoch": 2.743773026419372,
"grad_norm": 0.21500404179096222,
"learning_rate": 6.609570857374952e-06,
"loss": 0.1584,
"step": 21785
},
{
"epoch": 2.744402808829549,
"grad_norm": 0.1938805729150772,
"learning_rate": 6.577338840138369e-06,
"loss": 0.155,
"step": 21790
},
{
"epoch": 2.7450325912397266,
"grad_norm": 0.20673929154872894,
"learning_rate": 6.545183844146951e-06,
"loss": 0.1526,
"step": 21795
},
{
"epoch": 2.745662373649904,
"grad_norm": 0.19749803841114044,
"learning_rate": 6.513105886668668e-06,
"loss": 0.1533,
"step": 21800
},
{
"epoch": 2.7462921560600813,
"grad_norm": 0.212607279419899,
"learning_rate": 6.481104984930107e-06,
"loss": 0.1565,
"step": 21805
},
{
"epoch": 2.7469219384702583,
"grad_norm": 0.1796950250864029,
"learning_rate": 6.449181156116473e-06,
"loss": 0.1464,
"step": 21810
},
{
"epoch": 2.747551720880436,
"grad_norm": 0.18281513452529907,
"learning_rate": 6.417334417371616e-06,
"loss": 0.1482,
"step": 21815
},
{
"epoch": 2.748181503290613,
"grad_norm": 0.23321060836315155,
"learning_rate": 6.385564785797958e-06,
"loss": 0.1489,
"step": 21820
},
{
"epoch": 2.7488112857007905,
"grad_norm": 0.2202220857143402,
"learning_rate": 6.353872278456501e-06,
"loss": 0.1477,
"step": 21825
},
{
"epoch": 2.7494410681109676,
"grad_norm": 0.283456951379776,
"learning_rate": 6.3222569123668635e-06,
"loss": 0.1582,
"step": 21830
},
{
"epoch": 2.750070850521145,
"grad_norm": 0.18883143365383148,
"learning_rate": 6.29071870450723e-06,
"loss": 0.1469,
"step": 21835
},
{
"epoch": 2.750700632931322,
"grad_norm": 0.20364224910736084,
"learning_rate": 6.259257671814272e-06,
"loss": 0.1567,
"step": 21840
},
{
"epoch": 2.7513304153414992,
"grad_norm": 0.19058570265769958,
"learning_rate": 6.227873831183355e-06,
"loss": 0.1449,
"step": 21845
},
{
"epoch": 2.7519601977516768,
"grad_norm": 0.20439192652702332,
"learning_rate": 6.196567199468299e-06,
"loss": 0.1486,
"step": 21850
},
{
"epoch": 2.7525899801618543,
"grad_norm": 0.1962665468454361,
"learning_rate": 6.165337793481473e-06,
"loss": 0.1499,
"step": 21855
},
{
"epoch": 2.7532197625720314,
"grad_norm": 0.22097113728523254,
"learning_rate": 6.134185629993793e-06,
"loss": 0.153,
"step": 21860
},
{
"epoch": 2.7538495449822085,
"grad_norm": 0.20070448517799377,
"learning_rate": 6.103110725734644e-06,
"loss": 0.1463,
"step": 21865
},
{
"epoch": 2.754479327392386,
"grad_norm": 0.20577707886695862,
"learning_rate": 6.072113097392028e-06,
"loss": 0.1549,
"step": 21870
},
{
"epoch": 2.755109109802563,
"grad_norm": 0.1798795461654663,
"learning_rate": 6.041192761612313e-06,
"loss": 0.1454,
"step": 21875
},
{
"epoch": 2.7557388922127406,
"grad_norm": 0.20694920420646667,
"learning_rate": 6.010349735000464e-06,
"loss": 0.1524,
"step": 21880
},
{
"epoch": 2.7563686746229177,
"grad_norm": 0.19873858988285065,
"learning_rate": 5.979584034119867e-06,
"loss": 0.1523,
"step": 21885
},
{
"epoch": 2.756998457033095,
"grad_norm": 0.2215358465909958,
"learning_rate": 5.948895675492421e-06,
"loss": 0.1508,
"step": 21890
},
{
"epoch": 2.7576282394432723,
"grad_norm": 0.21731533110141754,
"learning_rate": 5.918284675598478e-06,
"loss": 0.149,
"step": 21895
},
{
"epoch": 2.7582580218534494,
"grad_norm": 0.21298860013484955,
"learning_rate": 5.887751050876837e-06,
"loss": 0.156,
"step": 21900
},
{
"epoch": 2.758887804263627,
"grad_norm": 0.20131991803646088,
"learning_rate": 5.85729481772475e-06,
"loss": 0.1403,
"step": 21905
},
{
"epoch": 2.7595175866738044,
"grad_norm": 0.17870669066905975,
"learning_rate": 5.826915992497932e-06,
"loss": 0.1483,
"step": 21910
},
{
"epoch": 2.7601473690839815,
"grad_norm": 0.2430955022573471,
"learning_rate": 5.796614591510468e-06,
"loss": 0.1484,
"step": 21915
},
{
"epoch": 2.7607771514941586,
"grad_norm": 0.1986503154039383,
"learning_rate": 5.766390631034939e-06,
"loss": 0.1524,
"step": 21920
},
{
"epoch": 2.761406933904336,
"grad_norm": 0.1926422268152237,
"learning_rate": 5.7362441273022645e-06,
"loss": 0.1484,
"step": 21925
},
{
"epoch": 2.762036716314513,
"grad_norm": 0.23347438871860504,
"learning_rate": 5.706175096501825e-06,
"loss": 0.1512,
"step": 21930
},
{
"epoch": 2.7626664987246907,
"grad_norm": 0.20513305068016052,
"learning_rate": 5.676183554781405e-06,
"loss": 0.1518,
"step": 21935
},
{
"epoch": 2.763296281134868,
"grad_norm": 0.18283484876155853,
"learning_rate": 5.64626951824712e-06,
"loss": 0.1381,
"step": 21940
},
{
"epoch": 2.7639260635450453,
"grad_norm": 0.17075172066688538,
"learning_rate": 5.616433002963472e-06,
"loss": 0.1501,
"step": 21945
},
{
"epoch": 2.7645558459552224,
"grad_norm": 0.2107374221086502,
"learning_rate": 5.5866740249533746e-06,
"loss": 0.1581,
"step": 21950
},
{
"epoch": 2.7651856283653995,
"grad_norm": 0.23205851018428802,
"learning_rate": 5.556992600198079e-06,
"loss": 0.1467,
"step": 21955
},
{
"epoch": 2.765815410775577,
"grad_norm": 0.1973281055688858,
"learning_rate": 5.527388744637201e-06,
"loss": 0.1434,
"step": 21960
},
{
"epoch": 2.7664451931857545,
"grad_norm": 0.20235906541347504,
"learning_rate": 5.497862474168657e-06,
"loss": 0.1454,
"step": 21965
},
{
"epoch": 2.7670749755959316,
"grad_norm": 0.21266506612300873,
"learning_rate": 5.4684138046487134e-06,
"loss": 0.1454,
"step": 21970
},
{
"epoch": 2.7677047580061087,
"grad_norm": 0.1890571415424347,
"learning_rate": 5.43904275189207e-06,
"loss": 0.1414,
"step": 21975
},
{
"epoch": 2.7683345404162862,
"grad_norm": 0.1897963136434555,
"learning_rate": 5.409749331671559e-06,
"loss": 0.1493,
"step": 21980
},
{
"epoch": 2.7689643228264633,
"grad_norm": 0.18935035169124603,
"learning_rate": 5.380533559718414e-06,
"loss": 0.1543,
"step": 21985
},
{
"epoch": 2.769594105236641,
"grad_norm": 0.20879988372325897,
"learning_rate": 5.351395451722251e-06,
"loss": 0.151,
"step": 21990
},
{
"epoch": 2.770223887646818,
"grad_norm": 0.20008423924446106,
"learning_rate": 5.322335023330837e-06,
"loss": 0.1515,
"step": 21995
},
{
"epoch": 2.7708536700569955,
"grad_norm": 0.18473681807518005,
"learning_rate": 5.293352290150321e-06,
"loss": 0.1464,
"step": 22000
},
{
"epoch": 2.7708536700569955,
"eval_loss": 0.3584047257900238,
"eval_runtime": 6.1661,
"eval_samples_per_second": 162.176,
"eval_steps_per_second": 10.217,
"step": 22000
},
{
"epoch": 2.7714834524671725,
"grad_norm": 0.19308076798915863,
"learning_rate": 5.264447267745053e-06,
"loss": 0.1582,
"step": 22005
},
{
"epoch": 2.7721132348773496,
"grad_norm": 0.23008759319782257,
"learning_rate": 5.235619971637734e-06,
"loss": 0.1546,
"step": 22010
},
{
"epoch": 2.772743017287527,
"grad_norm": 0.21323955059051514,
"learning_rate": 5.206870417309245e-06,
"loss": 0.1536,
"step": 22015
},
{
"epoch": 2.7733727996977047,
"grad_norm": 0.23257404565811157,
"learning_rate": 5.17819862019877e-06,
"loss": 0.1516,
"step": 22020
},
{
"epoch": 2.7740025821078818,
"grad_norm": 0.22094878554344177,
"learning_rate": 5.14960459570371e-06,
"loss": 0.1546,
"step": 22025
},
{
"epoch": 2.774632364518059,
"grad_norm": 0.21868959069252014,
"learning_rate": 5.121088359179698e-06,
"loss": 0.1567,
"step": 22030
},
{
"epoch": 2.7752621469282364,
"grad_norm": 0.19147329032421112,
"learning_rate": 5.09264992594065e-06,
"loss": 0.1502,
"step": 22035
},
{
"epoch": 2.7758919293384134,
"grad_norm": 0.17076317965984344,
"learning_rate": 5.064289311258618e-06,
"loss": 0.1511,
"step": 22040
},
{
"epoch": 2.776521711748591,
"grad_norm": 0.23041397333145142,
"learning_rate": 5.036006530363917e-06,
"loss": 0.1611,
"step": 22045
},
{
"epoch": 2.777151494158768,
"grad_norm": 0.21972966194152832,
"learning_rate": 5.007801598445033e-06,
"loss": 0.1493,
"step": 22050
},
{
"epoch": 2.7777812765689456,
"grad_norm": 0.17348721623420715,
"learning_rate": 4.979674530648664e-06,
"loss": 0.1481,
"step": 22055
},
{
"epoch": 2.7784110589791227,
"grad_norm": 0.19225727021694183,
"learning_rate": 4.9516253420796795e-06,
"loss": 0.1493,
"step": 22060
},
{
"epoch": 2.7790408413892997,
"grad_norm": 0.19729195535182953,
"learning_rate": 4.9236540478011625e-06,
"loss": 0.1442,
"step": 22065
},
{
"epoch": 2.7796706237994773,
"grad_norm": 0.17798985540866852,
"learning_rate": 4.8957606628342805e-06,
"loss": 0.1507,
"step": 22070
},
{
"epoch": 2.780300406209655,
"grad_norm": 0.19311825931072235,
"learning_rate": 4.867945202158469e-06,
"loss": 0.149,
"step": 22075
},
{
"epoch": 2.780930188619832,
"grad_norm": 0.18525920808315277,
"learning_rate": 4.840207680711278e-06,
"loss": 0.1635,
"step": 22080
},
{
"epoch": 2.781559971030009,
"grad_norm": 0.18988420069217682,
"learning_rate": 4.812548113388342e-06,
"loss": 0.153,
"step": 22085
},
{
"epoch": 2.7821897534401865,
"grad_norm": 0.18699151277542114,
"learning_rate": 4.784966515043498e-06,
"loss": 0.147,
"step": 22090
},
{
"epoch": 2.7828195358503636,
"grad_norm": 0.23182329535484314,
"learning_rate": 4.757462900488695e-06,
"loss": 0.1496,
"step": 22095
},
{
"epoch": 2.783449318260541,
"grad_norm": 0.20079541206359863,
"learning_rate": 4.730037284494021e-06,
"loss": 0.1583,
"step": 22100
},
{
"epoch": 2.784079100670718,
"grad_norm": 0.21548844873905182,
"learning_rate": 4.702689681787625e-06,
"loss": 0.1481,
"step": 22105
},
{
"epoch": 2.7847088830808957,
"grad_norm": 0.1968826800584793,
"learning_rate": 4.6754201070558105e-06,
"loss": 0.1452,
"step": 22110
},
{
"epoch": 2.785338665491073,
"grad_norm": 0.20061470568180084,
"learning_rate": 4.648228574942997e-06,
"loss": 0.1472,
"step": 22115
},
{
"epoch": 2.78596844790125,
"grad_norm": 0.19061359763145447,
"learning_rate": 4.621115100051604e-06,
"loss": 0.1478,
"step": 22120
},
{
"epoch": 2.7865982303114274,
"grad_norm": 0.23252861201763153,
"learning_rate": 4.594079696942199e-06,
"loss": 0.1527,
"step": 22125
},
{
"epoch": 2.787228012721605,
"grad_norm": 0.1698002964258194,
"learning_rate": 4.56712238013342e-06,
"loss": 0.1379,
"step": 22130
},
{
"epoch": 2.787857795131782,
"grad_norm": 0.19811010360717773,
"learning_rate": 4.540243164101954e-06,
"loss": 0.1417,
"step": 22135
},
{
"epoch": 2.788487577541959,
"grad_norm": 0.2089819759130478,
"learning_rate": 4.513442063282585e-06,
"loss": 0.1517,
"step": 22140
},
{
"epoch": 2.7891173599521366,
"grad_norm": 0.21028514206409454,
"learning_rate": 4.486719092068086e-06,
"loss": 0.1536,
"step": 22145
},
{
"epoch": 2.7897471423623137,
"grad_norm": 0.20895244181156158,
"learning_rate": 4.46007426480931e-06,
"loss": 0.1421,
"step": 22150
},
{
"epoch": 2.790376924772491,
"grad_norm": 0.1925353854894638,
"learning_rate": 4.4335075958151275e-06,
"loss": 0.1506,
"step": 22155
},
{
"epoch": 2.7910067071826683,
"grad_norm": 0.21809720993041992,
"learning_rate": 4.407019099352477e-06,
"loss": 0.1537,
"step": 22160
},
{
"epoch": 2.791636489592846,
"grad_norm": 0.23316286504268646,
"learning_rate": 4.380608789646245e-06,
"loss": 0.1593,
"step": 22165
},
{
"epoch": 2.792266272003023,
"grad_norm": 0.20298117399215698,
"learning_rate": 4.354276680879404e-06,
"loss": 0.1469,
"step": 22170
},
{
"epoch": 2.7928960544132,
"grad_norm": 0.18828284740447998,
"learning_rate": 4.328022787192875e-06,
"loss": 0.1478,
"step": 22175
},
{
"epoch": 2.7935258368233775,
"grad_norm": 0.19351090490818024,
"learning_rate": 4.301847122685614e-06,
"loss": 0.139,
"step": 22180
},
{
"epoch": 2.794155619233555,
"grad_norm": 0.19426658749580383,
"learning_rate": 4.27574970141456e-06,
"loss": 0.148,
"step": 22185
},
{
"epoch": 2.794785401643732,
"grad_norm": 0.18554694950580597,
"learning_rate": 4.2497305373945855e-06,
"loss": 0.1484,
"step": 22190
},
{
"epoch": 2.795415184053909,
"grad_norm": 0.21555371582508087,
"learning_rate": 4.223789644598613e-06,
"loss": 0.1537,
"step": 22195
},
{
"epoch": 2.7960449664640867,
"grad_norm": 0.20736396312713623,
"learning_rate": 4.197927036957499e-06,
"loss": 0.1533,
"step": 22200
},
{
"epoch": 2.796674748874264,
"grad_norm": 0.2143113762140274,
"learning_rate": 4.172142728360017e-06,
"loss": 0.1509,
"step": 22205
},
{
"epoch": 2.7973045312844413,
"grad_norm": 0.1888829916715622,
"learning_rate": 4.146436732652958e-06,
"loss": 0.1507,
"step": 22210
},
{
"epoch": 2.7979343136946184,
"grad_norm": 0.19072696566581726,
"learning_rate": 4.1208090636410286e-06,
"loss": 0.153,
"step": 22215
},
{
"epoch": 2.798564096104796,
"grad_norm": 0.23674504458904266,
"learning_rate": 4.09525973508687e-06,
"loss": 0.1584,
"step": 22220
},
{
"epoch": 2.799193878514973,
"grad_norm": 0.23174551129341125,
"learning_rate": 4.06978876071104e-06,
"loss": 0.1475,
"step": 22225
},
{
"epoch": 2.79982366092515,
"grad_norm": 0.2185906022787094,
"learning_rate": 4.044396154192031e-06,
"loss": 0.1494,
"step": 22230
},
{
"epoch": 2.8004534433353276,
"grad_norm": 0.1940082162618637,
"learning_rate": 4.019081929166268e-06,
"loss": 0.1497,
"step": 22235
},
{
"epoch": 2.801083225745505,
"grad_norm": 0.1945921629667282,
"learning_rate": 3.993846099228093e-06,
"loss": 0.1524,
"step": 22240
},
{
"epoch": 2.8017130081556822,
"grad_norm": 0.21760894358158112,
"learning_rate": 3.968688677929682e-06,
"loss": 0.1459,
"step": 22245
},
{
"epoch": 2.8023427905658593,
"grad_norm": 0.19670112431049347,
"learning_rate": 3.943609678781162e-06,
"loss": 0.151,
"step": 22250
},
{
"epoch": 2.802972572976037,
"grad_norm": 0.2076457440853119,
"learning_rate": 3.918609115250509e-06,
"loss": 0.1515,
"step": 22255
},
{
"epoch": 2.803602355386214,
"grad_norm": 0.20138059556484222,
"learning_rate": 3.893687000763635e-06,
"loss": 0.1492,
"step": 22260
},
{
"epoch": 2.8042321377963915,
"grad_norm": 0.20619480311870575,
"learning_rate": 3.868843348704265e-06,
"loss": 0.1516,
"step": 22265
},
{
"epoch": 2.8048619202065685,
"grad_norm": 0.17885464429855347,
"learning_rate": 3.844078172413994e-06,
"loss": 0.1413,
"step": 22270
},
{
"epoch": 2.805491702616746,
"grad_norm": 0.17029553651809692,
"learning_rate": 3.8193914851922855e-06,
"loss": 0.143,
"step": 22275
},
{
"epoch": 2.806121485026923,
"grad_norm": 0.18624289333820343,
"learning_rate": 3.794783300296483e-06,
"loss": 0.1448,
"step": 22280
},
{
"epoch": 2.8067512674371002,
"grad_norm": 0.20082144439220428,
"learning_rate": 3.7702536309417497e-06,
"loss": 0.1498,
"step": 22285
},
{
"epoch": 2.8073810498472778,
"grad_norm": 0.213558167219162,
"learning_rate": 3.745802490301031e-06,
"loss": 0.165,
"step": 22290
},
{
"epoch": 2.8080108322574553,
"grad_norm": 0.23692555725574493,
"learning_rate": 3.721429891505173e-06,
"loss": 0.1568,
"step": 22295
},
{
"epoch": 2.8086406146676324,
"grad_norm": 0.18088509142398834,
"learning_rate": 3.6971358476428237e-06,
"loss": 0.1508,
"step": 22300
},
{
"epoch": 2.8092703970778095,
"grad_norm": 0.20369915664196014,
"learning_rate": 3.672920371760446e-06,
"loss": 0.1469,
"step": 22305
},
{
"epoch": 2.809900179487987,
"grad_norm": 0.18801896274089813,
"learning_rate": 3.6487834768622883e-06,
"loss": 0.1417,
"step": 22310
},
{
"epoch": 2.810529961898164,
"grad_norm": 0.2028091549873352,
"learning_rate": 3.6247251759104145e-06,
"loss": 0.157,
"step": 22315
},
{
"epoch": 2.8111597443083416,
"grad_norm": 0.16689006984233856,
"learning_rate": 3.600745481824707e-06,
"loss": 0.1393,
"step": 22320
},
{
"epoch": 2.8117895267185187,
"grad_norm": 0.20889881253242493,
"learning_rate": 3.576844407482765e-06,
"loss": 0.1586,
"step": 22325
},
{
"epoch": 2.812419309128696,
"grad_norm": 0.21049942076206207,
"learning_rate": 3.5530219657200543e-06,
"loss": 0.155,
"step": 22330
},
{
"epoch": 2.8130490915388733,
"grad_norm": 0.23332563042640686,
"learning_rate": 3.5292781693297247e-06,
"loss": 0.1557,
"step": 22335
},
{
"epoch": 2.8136788739490504,
"grad_norm": 0.1706390082836151,
"learning_rate": 3.505613031062776e-06,
"loss": 0.1421,
"step": 22340
},
{
"epoch": 2.814308656359228,
"grad_norm": 0.17925478518009186,
"learning_rate": 3.4820265636279265e-06,
"loss": 0.1433,
"step": 22345
},
{
"epoch": 2.8149384387694054,
"grad_norm": 0.15641047060489655,
"learning_rate": 3.458518779691627e-06,
"loss": 0.1423,
"step": 22350
},
{
"epoch": 2.8155682211795825,
"grad_norm": 0.18733102083206177,
"learning_rate": 3.435089691878112e-06,
"loss": 0.1533,
"step": 22355
},
{
"epoch": 2.8161980035897596,
"grad_norm": 0.22065778076648712,
"learning_rate": 3.4117393127693183e-06,
"loss": 0.157,
"step": 22360
},
{
"epoch": 2.816827785999937,
"grad_norm": 0.20604351162910461,
"learning_rate": 3.388467654904947e-06,
"loss": 0.1438,
"step": 22365
},
{
"epoch": 2.817457568410114,
"grad_norm": 0.17883001267910004,
"learning_rate": 3.365274730782419e-06,
"loss": 0.1465,
"step": 22370
},
{
"epoch": 2.8180873508202917,
"grad_norm": 0.18118852376937866,
"learning_rate": 3.3421605528568374e-06,
"loss": 0.1501,
"step": 22375
},
{
"epoch": 2.818717133230469,
"grad_norm": 0.2178465574979782,
"learning_rate": 3.3191251335410564e-06,
"loss": 0.1467,
"step": 22380
},
{
"epoch": 2.8193469156406463,
"grad_norm": 0.18433082103729248,
"learning_rate": 3.29616848520563e-06,
"loss": 0.1478,
"step": 22385
},
{
"epoch": 2.8199766980508234,
"grad_norm": 0.19671761989593506,
"learning_rate": 3.273290620178831e-06,
"loss": 0.144,
"step": 22390
},
{
"epoch": 2.8206064804610005,
"grad_norm": 0.2000323235988617,
"learning_rate": 3.2504915507465144e-06,
"loss": 0.1443,
"step": 22395
},
{
"epoch": 2.821236262871178,
"grad_norm": 0.19443731009960175,
"learning_rate": 3.22777128915237e-06,
"loss": 0.1537,
"step": 22400
},
{
"epoch": 2.8218660452813555,
"grad_norm": 0.19904273748397827,
"learning_rate": 3.2051298475976707e-06,
"loss": 0.1581,
"step": 22405
},
{
"epoch": 2.8224958276915326,
"grad_norm": 0.1972033828496933,
"learning_rate": 3.18256723824139e-06,
"loss": 0.1383,
"step": 22410
},
{
"epoch": 2.8231256101017097,
"grad_norm": 0.21138480305671692,
"learning_rate": 3.16008347320017e-06,
"loss": 0.1442,
"step": 22415
},
{
"epoch": 2.8237553925118872,
"grad_norm": 0.22747448086738586,
"learning_rate": 3.1376785645483016e-06,
"loss": 0.1485,
"step": 22420
},
{
"epoch": 2.8243851749220643,
"grad_norm": 0.23757314682006836,
"learning_rate": 3.11535252431776e-06,
"loss": 0.1568,
"step": 22425
},
{
"epoch": 2.825014957332242,
"grad_norm": 0.2193070352077484,
"learning_rate": 3.0931053644980885e-06,
"loss": 0.1605,
"step": 22430
},
{
"epoch": 2.825644739742419,
"grad_norm": 0.2223901003599167,
"learning_rate": 3.0709370970365464e-06,
"loss": 0.1453,
"step": 22435
},
{
"epoch": 2.8262745221525964,
"grad_norm": 0.23655427992343903,
"learning_rate": 3.0488477338379944e-06,
"loss": 0.1484,
"step": 22440
},
{
"epoch": 2.8269043045627735,
"grad_norm": 0.20859979093074799,
"learning_rate": 3.026837286764944e-06,
"loss": 0.154,
"step": 22445
},
{
"epoch": 2.8275340869729506,
"grad_norm": 0.1994808316230774,
"learning_rate": 3.004905767637472e-06,
"loss": 0.1634,
"step": 22450
},
{
"epoch": 2.828163869383128,
"grad_norm": 0.19530266523361206,
"learning_rate": 2.983053188233342e-06,
"loss": 0.1458,
"step": 22455
},
{
"epoch": 2.8287936517933057,
"grad_norm": 0.19528019428253174,
"learning_rate": 2.9612795602878827e-06,
"loss": 0.1472,
"step": 22460
},
{
"epoch": 2.8294234342034827,
"grad_norm": 0.20543955266475677,
"learning_rate": 2.939584895494007e-06,
"loss": 0.1544,
"step": 22465
},
{
"epoch": 2.83005321661366,
"grad_norm": 0.18907050788402557,
"learning_rate": 2.917969205502263e-06,
"loss": 0.1469,
"step": 22470
},
{
"epoch": 2.8306829990238374,
"grad_norm": 0.2009141594171524,
"learning_rate": 2.896432501920748e-06,
"loss": 0.1463,
"step": 22475
},
{
"epoch": 2.8313127814340144,
"grad_norm": 0.1845710128545761,
"learning_rate": 2.8749747963151937e-06,
"loss": 0.1523,
"step": 22480
},
{
"epoch": 2.831942563844192,
"grad_norm": 0.22671662271022797,
"learning_rate": 2.853596100208866e-06,
"loss": 0.1553,
"step": 22485
},
{
"epoch": 2.832572346254369,
"grad_norm": 0.1716582477092743,
"learning_rate": 2.832296425082614e-06,
"loss": 0.1423,
"step": 22490
},
{
"epoch": 2.8332021286645466,
"grad_norm": 0.17477920651435852,
"learning_rate": 2.8110757823748554e-06,
"loss": 0.142,
"step": 22495
},
{
"epoch": 2.8338319110747237,
"grad_norm": 0.22391197085380554,
"learning_rate": 2.7899341834815236e-06,
"loss": 0.1576,
"step": 22500
},
{
"epoch": 2.8344616934849007,
"grad_norm": 0.19235247373580933,
"learning_rate": 2.7688716397561874e-06,
"loss": 0.1432,
"step": 22505
},
{
"epoch": 2.8350914758950783,
"grad_norm": 0.21828468143939972,
"learning_rate": 2.747888162509898e-06,
"loss": 0.1461,
"step": 22510
},
{
"epoch": 2.835721258305256,
"grad_norm": 0.19712364673614502,
"learning_rate": 2.726983763011259e-06,
"loss": 0.1461,
"step": 22515
},
{
"epoch": 2.836351040715433,
"grad_norm": 0.22868654131889343,
"learning_rate": 2.7061584524864066e-06,
"loss": 0.1546,
"step": 22520
},
{
"epoch": 2.83698082312561,
"grad_norm": 0.18876421451568604,
"learning_rate": 2.685412242119012e-06,
"loss": 0.1481,
"step": 22525
},
{
"epoch": 2.8376106055357875,
"grad_norm": 0.21973784267902374,
"learning_rate": 2.664745143050295e-06,
"loss": 0.1568,
"step": 22530
},
{
"epoch": 2.8382403879459646,
"grad_norm": 0.24478502571582794,
"learning_rate": 2.6441571663788963e-06,
"loss": 0.1558,
"step": 22535
},
{
"epoch": 2.838870170356142,
"grad_norm": 0.18952693045139313,
"learning_rate": 2.6236483231610707e-06,
"loss": 0.1461,
"step": 22540
},
{
"epoch": 2.839499952766319,
"grad_norm": 0.20026876032352448,
"learning_rate": 2.603218624410525e-06,
"loss": 0.1466,
"step": 22545
},
{
"epoch": 2.8401297351764967,
"grad_norm": 0.21935871243476868,
"learning_rate": 2.5828680810984824e-06,
"loss": 0.1563,
"step": 22550
},
{
"epoch": 2.8407595175866738,
"grad_norm": 0.22446821630001068,
"learning_rate": 2.5625967041536354e-06,
"loss": 0.1462,
"step": 22555
},
{
"epoch": 2.841389299996851,
"grad_norm": 0.2072252631187439,
"learning_rate": 2.5424045044621922e-06,
"loss": 0.1505,
"step": 22560
},
{
"epoch": 2.8420190824070284,
"grad_norm": 0.19828562438488007,
"learning_rate": 2.5222914928678285e-06,
"loss": 0.1462,
"step": 22565
},
{
"epoch": 2.842648864817206,
"grad_norm": 0.18411174416542053,
"learning_rate": 2.502257680171671e-06,
"loss": 0.1415,
"step": 22570
},
{
"epoch": 2.843278647227383,
"grad_norm": 0.20017574727535248,
"learning_rate": 2.482303077132347e-06,
"loss": 0.1556,
"step": 22575
},
{
"epoch": 2.84390842963756,
"grad_norm": 0.1881314069032669,
"learning_rate": 2.462427694465935e-06,
"loss": 0.1464,
"step": 22580
},
{
"epoch": 2.8445382120477376,
"grad_norm": 0.2211647629737854,
"learning_rate": 2.4426315428459466e-06,
"loss": 0.1471,
"step": 22585
},
{
"epoch": 2.8451679944579147,
"grad_norm": 0.20288364589214325,
"learning_rate": 2.4229146329033944e-06,
"loss": 0.146,
"step": 22590
},
{
"epoch": 2.845797776868092,
"grad_norm": 0.22115926444530487,
"learning_rate": 2.4032769752267087e-06,
"loss": 0.1422,
"step": 22595
},
{
"epoch": 2.8464275592782693,
"grad_norm": 0.196670264005661,
"learning_rate": 2.3837185803617544e-06,
"loss": 0.153,
"step": 22600
},
{
"epoch": 2.847057341688447,
"grad_norm": 0.23514890670776367,
"learning_rate": 2.3642394588118285e-06,
"loss": 0.1573,
"step": 22605
},
{
"epoch": 2.847687124098624,
"grad_norm": 0.1987423151731491,
"learning_rate": 2.3448396210376807e-06,
"loss": 0.1457,
"step": 22610
},
{
"epoch": 2.848316906508801,
"grad_norm": 0.18859946727752686,
"learning_rate": 2.3255190774574605e-06,
"loss": 0.1533,
"step": 22615
},
{
"epoch": 2.8489466889189785,
"grad_norm": 0.21700045466423035,
"learning_rate": 2.306277838446735e-06,
"loss": 0.1416,
"step": 22620
},
{
"epoch": 2.849576471329156,
"grad_norm": 0.17610225081443787,
"learning_rate": 2.2871159143384723e-06,
"loss": 0.1498,
"step": 22625
},
{
"epoch": 2.850206253739333,
"grad_norm": 0.2066749781370163,
"learning_rate": 2.26803331542309e-06,
"loss": 0.1587,
"step": 22630
},
{
"epoch": 2.85083603614951,
"grad_norm": 0.19877871870994568,
"learning_rate": 2.2490300519484082e-06,
"loss": 0.1526,
"step": 22635
},
{
"epoch": 2.8514658185596877,
"grad_norm": 0.19332483410835266,
"learning_rate": 2.230106134119547e-06,
"loss": 0.1562,
"step": 22640
},
{
"epoch": 2.852095600969865,
"grad_norm": 0.21806974709033966,
"learning_rate": 2.21126157209911e-06,
"loss": 0.1508,
"step": 22645
},
{
"epoch": 2.8527253833800423,
"grad_norm": 0.20896165072917938,
"learning_rate": 2.192496376007069e-06,
"loss": 0.1612,
"step": 22650
},
{
"epoch": 2.8533551657902194,
"grad_norm": 0.2381521761417389,
"learning_rate": 2.1738105559207465e-06,
"loss": 0.1545,
"step": 22655
},
{
"epoch": 2.853984948200397,
"grad_norm": 0.24022352695465088,
"learning_rate": 2.155204121874882e-06,
"loss": 0.1548,
"step": 22660
},
{
"epoch": 2.854614730610574,
"grad_norm": 0.20042377710342407,
"learning_rate": 2.1366770838615322e-06,
"loss": 0.1423,
"step": 22665
},
{
"epoch": 2.855244513020751,
"grad_norm": 0.1943242996931076,
"learning_rate": 2.118229451830139e-06,
"loss": 0.1453,
"step": 22670
},
{
"epoch": 2.8558742954309286,
"grad_norm": 0.20173771679401398,
"learning_rate": 2.0998612356874944e-06,
"loss": 0.1406,
"step": 22675
},
{
"epoch": 2.856504077841106,
"grad_norm": 0.21339194476604462,
"learning_rate": 2.081572445297791e-06,
"loss": 0.1447,
"step": 22680
},
{
"epoch": 2.8571338602512832,
"grad_norm": 0.18814577162265778,
"learning_rate": 2.0633630904824727e-06,
"loss": 0.144,
"step": 22685
},
{
"epoch": 2.8577636426614603,
"grad_norm": 0.1956281065940857,
"learning_rate": 2.045233181020417e-06,
"loss": 0.1503,
"step": 22690
},
{
"epoch": 2.858393425071638,
"grad_norm": 0.22954149544239044,
"learning_rate": 2.027182726647786e-06,
"loss": 0.1491,
"step": 22695
},
{
"epoch": 2.859023207481815,
"grad_norm": 0.18004447221755981,
"learning_rate": 2.009211737058092e-06,
"loss": 0.1492,
"step": 22700
},
{
"epoch": 2.8596529898919925,
"grad_norm": 0.226220041513443,
"learning_rate": 1.991320221902165e-06,
"loss": 0.159,
"step": 22705
},
{
"epoch": 2.8602827723021695,
"grad_norm": 0.1808856725692749,
"learning_rate": 1.9735081907881367e-06,
"loss": 0.1473,
"step": 22710
},
{
"epoch": 2.860912554712347,
"grad_norm": 0.2538818418979645,
"learning_rate": 1.9557756532815216e-06,
"loss": 0.1523,
"step": 22715
},
{
"epoch": 2.861542337122524,
"grad_norm": 0.18744130432605743,
"learning_rate": 1.9381226189050524e-06,
"loss": 0.1501,
"step": 22720
},
{
"epoch": 2.8621721195327012,
"grad_norm": 0.2162604182958603,
"learning_rate": 1.920549097138813e-06,
"loss": 0.1505,
"step": 22725
},
{
"epoch": 2.8628019019428788,
"grad_norm": 0.2076927125453949,
"learning_rate": 1.9030550974202197e-06,
"loss": 0.1493,
"step": 22730
},
{
"epoch": 2.8634316843530563,
"grad_norm": 0.2153797596693039,
"learning_rate": 1.885640629143942e-06,
"loss": 0.1409,
"step": 22735
},
{
"epoch": 2.8640614667632334,
"grad_norm": 0.19790925085544586,
"learning_rate": 1.868305701661932e-06,
"loss": 0.1596,
"step": 22740
},
{
"epoch": 2.8646912491734104,
"grad_norm": 0.18141327798366547,
"learning_rate": 1.8510503242834263e-06,
"loss": 0.1459,
"step": 22745
},
{
"epoch": 2.865321031583588,
"grad_norm": 0.20295578241348267,
"learning_rate": 1.833874506274996e-06,
"loss": 0.1485,
"step": 22750
},
{
"epoch": 2.865950813993765,
"grad_norm": 0.20226307213306427,
"learning_rate": 1.8167782568604127e-06,
"loss": 0.1507,
"step": 22755
},
{
"epoch": 2.8665805964039426,
"grad_norm": 0.19584356248378754,
"learning_rate": 1.7997615852207825e-06,
"loss": 0.1526,
"step": 22760
},
{
"epoch": 2.8672103788141197,
"grad_norm": 0.17093075811862946,
"learning_rate": 1.7828245004944286e-06,
"loss": 0.1481,
"step": 22765
},
{
"epoch": 2.867840161224297,
"grad_norm": 0.17845821380615234,
"learning_rate": 1.7659670117769587e-06,
"loss": 0.1459,
"step": 22770
},
{
"epoch": 2.8684699436344743,
"grad_norm": 0.1874646097421646,
"learning_rate": 1.749189128121231e-06,
"loss": 0.1493,
"step": 22775
},
{
"epoch": 2.8690997260446514,
"grad_norm": 0.18263909220695496,
"learning_rate": 1.7324908585373387e-06,
"loss": 0.1438,
"step": 22780
},
{
"epoch": 2.869729508454829,
"grad_norm": 0.1789528727531433,
"learning_rate": 1.7158722119926583e-06,
"loss": 0.1476,
"step": 22785
},
{
"epoch": 2.8703592908650064,
"grad_norm": 0.21683086454868317,
"learning_rate": 1.6993331974117508e-06,
"loss": 0.1499,
"step": 22790
},
{
"epoch": 2.8709890732751835,
"grad_norm": 0.24182718992233276,
"learning_rate": 1.6828738236764617e-06,
"loss": 0.1541,
"step": 22795
},
{
"epoch": 2.8716188556853606,
"grad_norm": 0.21868962049484253,
"learning_rate": 1.6664940996258702e-06,
"loss": 0.1471,
"step": 22800
},
{
"epoch": 2.872248638095538,
"grad_norm": 0.1993272453546524,
"learning_rate": 1.6501940340562236e-06,
"loss": 0.1526,
"step": 22805
},
{
"epoch": 2.872878420505715,
"grad_norm": 0.201304093003273,
"learning_rate": 1.6339736357210697e-06,
"loss": 0.1516,
"step": 22810
},
{
"epoch": 2.8735082029158927,
"grad_norm": 0.25056761503219604,
"learning_rate": 1.6178329133310908e-06,
"loss": 0.151,
"step": 22815
},
{
"epoch": 2.87413798532607,
"grad_norm": 0.19561152160167694,
"learning_rate": 1.6017718755542696e-06,
"loss": 0.143,
"step": 22820
},
{
"epoch": 2.8747677677362473,
"grad_norm": 0.22097674012184143,
"learning_rate": 1.5857905310157071e-06,
"loss": 0.1512,
"step": 22825
},
{
"epoch": 2.8753975501464244,
"grad_norm": 0.21212686598300934,
"learning_rate": 1.5698888882977712e-06,
"loss": 0.1541,
"step": 22830
},
{
"epoch": 2.8760273325566015,
"grad_norm": 0.20324502885341644,
"learning_rate": 1.5540669559399977e-06,
"loss": 0.1533,
"step": 22835
},
{
"epoch": 2.876657114966779,
"grad_norm": 0.169882133603096,
"learning_rate": 1.5383247424391564e-06,
"loss": 0.1406,
"step": 22840
},
{
"epoch": 2.8772868973769565,
"grad_norm": 0.23402316868305206,
"learning_rate": 1.5226622562491352e-06,
"loss": 0.1569,
"step": 22845
},
{
"epoch": 2.8779166797871336,
"grad_norm": 0.17247354984283447,
"learning_rate": 1.5070795057810559e-06,
"loss": 0.1432,
"step": 22850
},
{
"epoch": 2.8785464621973107,
"grad_norm": 0.21733173727989197,
"learning_rate": 1.4915764994032409e-06,
"loss": 0.1523,
"step": 22855
},
{
"epoch": 2.879176244607488,
"grad_norm": 0.19319911301136017,
"learning_rate": 1.4761532454411306e-06,
"loss": 0.1391,
"step": 22860
},
{
"epoch": 2.8798060270176653,
"grad_norm": 0.19645391404628754,
"learning_rate": 1.4608097521773664e-06,
"loss": 0.1499,
"step": 22865
},
{
"epoch": 2.880435809427843,
"grad_norm": 0.18772046267986298,
"learning_rate": 1.4455460278517572e-06,
"loss": 0.1483,
"step": 22870
},
{
"epoch": 2.88106559183802,
"grad_norm": 0.22282320261001587,
"learning_rate": 1.4303620806612792e-06,
"loss": 0.1468,
"step": 22875
},
{
"epoch": 2.8816953742481974,
"grad_norm": 0.19287440180778503,
"learning_rate": 1.4152579187600599e-06,
"loss": 0.1495,
"step": 22880
},
{
"epoch": 2.8823251566583745,
"grad_norm": 0.1981481909751892,
"learning_rate": 1.400233550259361e-06,
"loss": 0.1506,
"step": 22885
},
{
"epoch": 2.8829549390685516,
"grad_norm": 0.21331623196601868,
"learning_rate": 1.385288983227628e-06,
"loss": 0.1483,
"step": 22890
},
{
"epoch": 2.883584721478729,
"grad_norm": 0.20138582587242126,
"learning_rate": 1.3704242256904252e-06,
"loss": 0.1504,
"step": 22895
},
{
"epoch": 2.8842145038889067,
"grad_norm": 0.18493309617042542,
"learning_rate": 1.3556392856304831e-06,
"loss": 0.1539,
"step": 22900
},
{
"epoch": 2.8848442862990837,
"grad_norm": 0.22465452551841736,
"learning_rate": 1.3409341709876343e-06,
"loss": 0.1511,
"step": 22905
},
{
"epoch": 2.885474068709261,
"grad_norm": 0.19788892567157745,
"learning_rate": 1.326308889658878e-06,
"loss": 0.1504,
"step": 22910
},
{
"epoch": 2.8861038511194383,
"grad_norm": 0.20821528136730194,
"learning_rate": 1.3117634494982986e-06,
"loss": 0.1511,
"step": 22915
},
{
"epoch": 2.8867336335296154,
"grad_norm": 0.24520978331565857,
"learning_rate": 1.2972978583171644e-06,
"loss": 0.1503,
"step": 22920
},
{
"epoch": 2.887363415939793,
"grad_norm": 0.18331633508205414,
"learning_rate": 1.2829121238837947e-06,
"loss": 0.1485,
"step": 22925
},
{
"epoch": 2.88799319834997,
"grad_norm": 0.23501911759376526,
"learning_rate": 1.2686062539236762e-06,
"loss": 0.1452,
"step": 22930
},
{
"epoch": 2.8886229807601476,
"grad_norm": 0.19778122007846832,
"learning_rate": 1.2543802561193806e-06,
"loss": 0.1523,
"step": 22935
},
{
"epoch": 2.8892527631703246,
"grad_norm": 0.21170009672641754,
"learning_rate": 1.2402341381105962e-06,
"loss": 0.1627,
"step": 22940
},
{
"epoch": 2.8898825455805017,
"grad_norm": 0.2205863893032074,
"learning_rate": 1.22616790749413e-06,
"loss": 0.1507,
"step": 22945
},
{
"epoch": 2.8905123279906793,
"grad_norm": 0.1905989944934845,
"learning_rate": 1.2121815718238393e-06,
"loss": 0.15,
"step": 22950
},
{
"epoch": 2.891142110400857,
"grad_norm": 0.19767914712429047,
"learning_rate": 1.1982751386107159e-06,
"loss": 0.1499,
"step": 22955
},
{
"epoch": 2.891771892811034,
"grad_norm": 0.23298701643943787,
"learning_rate": 1.1844486153228361e-06,
"loss": 0.1507,
"step": 22960
},
{
"epoch": 2.892401675221211,
"grad_norm": 0.17696991562843323,
"learning_rate": 1.1707020093853602e-06,
"loss": 0.1458,
"step": 22965
},
{
"epoch": 2.8930314576313885,
"grad_norm": 0.22472181916236877,
"learning_rate": 1.1570353281805334e-06,
"loss": 0.154,
"step": 22970
},
{
"epoch": 2.8936612400415656,
"grad_norm": 0.18597114086151123,
"learning_rate": 1.1434485790476512e-06,
"loss": 0.1531,
"step": 22975
},
{
"epoch": 2.894291022451743,
"grad_norm": 0.20187944173812866,
"learning_rate": 1.1299417692831436e-06,
"loss": 0.1558,
"step": 22980
},
{
"epoch": 2.89492080486192,
"grad_norm": 0.19914616644382477,
"learning_rate": 1.1165149061404422e-06,
"loss": 0.1418,
"step": 22985
},
{
"epoch": 2.8955505872720977,
"grad_norm": 0.2554416060447693,
"learning_rate": 1.1031679968301122e-06,
"loss": 0.1556,
"step": 22990
},
{
"epoch": 2.8961803696822748,
"grad_norm": 0.2221318930387497,
"learning_rate": 1.08990104851972e-06,
"loss": 0.1523,
"step": 22995
},
{
"epoch": 2.896810152092452,
"grad_norm": 0.2058124542236328,
"learning_rate": 1.0767140683339336e-06,
"loss": 0.1403,
"step": 23000
},
{
"epoch": 2.896810152092452,
"eval_loss": 0.36229029297828674,
"eval_runtime": 6.1614,
"eval_samples_per_second": 162.302,
"eval_steps_per_second": 10.225,
"step": 23000
},
{
"epoch": 2.8974399345026294,
"grad_norm": 0.2248660773038864,
"learning_rate": 1.0636070633544547e-06,
"loss": 0.1514,
"step": 23005
},
{
"epoch": 2.8980697169128065,
"grad_norm": 0.1835104525089264,
"learning_rate": 1.0505800406200526e-06,
"loss": 0.1405,
"step": 23010
},
{
"epoch": 2.898699499322984,
"grad_norm": 0.22200733423233032,
"learning_rate": 1.0376330071265482e-06,
"loss": 0.1503,
"step": 23015
},
{
"epoch": 2.899329281733161,
"grad_norm": 0.18144001066684723,
"learning_rate": 1.024765969826763e-06,
"loss": 0.1516,
"step": 23020
},
{
"epoch": 2.8999590641433386,
"grad_norm": 0.17609558999538422,
"learning_rate": 1.0119789356306196e-06,
"loss": 0.1517,
"step": 23025
},
{
"epoch": 2.9005888465535157,
"grad_norm": 0.2412068098783493,
"learning_rate": 9.99271911405025e-07,
"loss": 0.157,
"step": 23030
},
{
"epoch": 2.901218628963693,
"grad_norm": 0.19257797300815582,
"learning_rate": 9.866449039739544e-07,
"loss": 0.1486,
"step": 23035
},
{
"epoch": 2.9018484113738703,
"grad_norm": 0.22018341720104218,
"learning_rate": 9.74097920118383e-07,
"loss": 0.1517,
"step": 23040
},
{
"epoch": 2.902478193784048,
"grad_norm": 0.19569897651672363,
"learning_rate": 9.616309665763544e-07,
"loss": 0.144,
"step": 23045
},
{
"epoch": 2.903107976194225,
"grad_norm": 0.20541365444660187,
"learning_rate": 9.492440500428966e-07,
"loss": 0.1535,
"step": 23050
},
{
"epoch": 2.903737758604402,
"grad_norm": 0.1934703141450882,
"learning_rate": 9.369371771700552e-07,
"loss": 0.1369,
"step": 23055
},
{
"epoch": 2.9043675410145795,
"grad_norm": 0.16967949271202087,
"learning_rate": 9.247103545669266e-07,
"loss": 0.1447,
"step": 23060
},
{
"epoch": 2.9049973234247566,
"grad_norm": 0.15972602367401123,
"learning_rate": 9.125635887995586e-07,
"loss": 0.1484,
"step": 23065
},
{
"epoch": 2.905627105834934,
"grad_norm": 0.21369343996047974,
"learning_rate": 9.004968863910667e-07,
"loss": 0.153,
"step": 23070
},
{
"epoch": 2.906256888245111,
"grad_norm": 0.18001650273799896,
"learning_rate": 8.885102538215338e-07,
"loss": 0.1499,
"step": 23075
},
{
"epoch": 2.9068866706552887,
"grad_norm": 0.20539362728595734,
"learning_rate": 8.766036975280777e-07,
"loss": 0.1495,
"step": 23080
},
{
"epoch": 2.907516453065466,
"grad_norm": 0.2048049122095108,
"learning_rate": 8.647772239047667e-07,
"loss": 0.1477,
"step": 23085
},
{
"epoch": 2.9081462354756433,
"grad_norm": 0.2018105536699295,
"learning_rate": 8.530308393027041e-07,
"loss": 0.1457,
"step": 23090
},
{
"epoch": 2.9087760178858204,
"grad_norm": 0.21647921204566956,
"learning_rate": 8.413645500299437e-07,
"loss": 0.1511,
"step": 23095
},
{
"epoch": 2.909405800295998,
"grad_norm": 0.19362643361091614,
"learning_rate": 8.297783623515741e-07,
"loss": 0.1491,
"step": 23100
},
{
"epoch": 2.910035582706175,
"grad_norm": 0.18819986283779144,
"learning_rate": 8.182722824896182e-07,
"loss": 0.1482,
"step": 23105
},
{
"epoch": 2.910665365116352,
"grad_norm": 0.18341930210590363,
"learning_rate": 8.068463166231332e-07,
"loss": 0.1468,
"step": 23110
},
{
"epoch": 2.9112951475265296,
"grad_norm": 0.17555510997772217,
"learning_rate": 7.955004708881107e-07,
"loss": 0.1538,
"step": 23115
},
{
"epoch": 2.9119249299367067,
"grad_norm": 0.1862919181585312,
"learning_rate": 7.842347513775271e-07,
"loss": 0.1485,
"step": 23120
},
{
"epoch": 2.9125547123468842,
"grad_norm": 0.23576001822948456,
"learning_rate": 7.730491641413262e-07,
"loss": 0.15,
"step": 23125
},
{
"epoch": 2.9131844947570613,
"grad_norm": 0.20176522433757782,
"learning_rate": 7.619437151864194e-07,
"loss": 0.1528,
"step": 23130
},
{
"epoch": 2.913814277167239,
"grad_norm": 0.1988651603460312,
"learning_rate": 7.50918410476703e-07,
"loss": 0.1513,
"step": 23135
},
{
"epoch": 2.914444059577416,
"grad_norm": 0.19462084770202637,
"learning_rate": 7.399732559330074e-07,
"loss": 0.1471,
"step": 23140
},
{
"epoch": 2.9150738419875935,
"grad_norm": 0.2095441371202469,
"learning_rate": 7.291082574331309e-07,
"loss": 0.1488,
"step": 23145
},
{
"epoch": 2.9157036243977705,
"grad_norm": 0.19712376594543457,
"learning_rate": 7.18323420811856e-07,
"loss": 0.146,
"step": 23150
},
{
"epoch": 2.916333406807948,
"grad_norm": 0.20228298008441925,
"learning_rate": 7.076187518608168e-07,
"loss": 0.1408,
"step": 23155
},
{
"epoch": 2.916963189218125,
"grad_norm": 0.21097783744335175,
"learning_rate": 6.969942563287311e-07,
"loss": 0.1463,
"step": 23160
},
{
"epoch": 2.9175929716283022,
"grad_norm": 0.1911788433790207,
"learning_rate": 6.864499399211687e-07,
"loss": 0.1359,
"step": 23165
},
{
"epoch": 2.9182227540384797,
"grad_norm": 0.19333137571811676,
"learning_rate": 6.759858083006831e-07,
"loss": 0.1521,
"step": 23170
},
{
"epoch": 2.918852536448657,
"grad_norm": 0.20187996327877045,
"learning_rate": 6.656018670867125e-07,
"loss": 0.1391,
"step": 23175
},
{
"epoch": 2.9194823188588344,
"grad_norm": 0.26705697178840637,
"learning_rate": 6.55298121855713e-07,
"loss": 0.1531,
"step": 23180
},
{
"epoch": 2.9201121012690114,
"grad_norm": 0.2374356985092163,
"learning_rate": 6.450745781410249e-07,
"loss": 0.1604,
"step": 23185
},
{
"epoch": 2.920741883679189,
"grad_norm": 0.18227587640285492,
"learning_rate": 6.349312414329067e-07,
"loss": 0.1486,
"step": 23190
},
{
"epoch": 2.921371666089366,
"grad_norm": 0.22778551280498505,
"learning_rate": 6.248681171785675e-07,
"loss": 0.1475,
"step": 23195
},
{
"epoch": 2.922001448499543,
"grad_norm": 0.2099718153476715,
"learning_rate": 6.148852107821511e-07,
"loss": 0.1442,
"step": 23200
},
{
"epoch": 2.9226312309097207,
"grad_norm": 0.19987498223781586,
"learning_rate": 6.04982527604686e-07,
"loss": 0.1504,
"step": 23205
},
{
"epoch": 2.923261013319898,
"grad_norm": 0.1993655115365982,
"learning_rate": 5.951600729641515e-07,
"loss": 0.154,
"step": 23210
},
{
"epoch": 2.9238907957300753,
"grad_norm": 0.19234336912631989,
"learning_rate": 5.854178521354113e-07,
"loss": 0.1531,
"step": 23215
},
{
"epoch": 2.9245205781402523,
"grad_norm": 0.20532859861850739,
"learning_rate": 5.757558703502973e-07,
"loss": 0.1522,
"step": 23220
},
{
"epoch": 2.92515036055043,
"grad_norm": 0.23982007801532745,
"learning_rate": 5.661741327974755e-07,
"loss": 0.15,
"step": 23225
},
{
"epoch": 2.925780142960607,
"grad_norm": 0.18457266688346863,
"learning_rate": 5.5667264462258e-07,
"loss": 0.1457,
"step": 23230
},
{
"epoch": 2.9264099253707845,
"grad_norm": 0.1986248642206192,
"learning_rate": 5.472514109281123e-07,
"loss": 0.145,
"step": 23235
},
{
"epoch": 2.9270397077809616,
"grad_norm": 0.2265506535768509,
"learning_rate": 5.379104367735087e-07,
"loss": 0.147,
"step": 23240
},
{
"epoch": 2.927669490191139,
"grad_norm": 0.21421676874160767,
"learning_rate": 5.286497271750733e-07,
"loss": 0.1492,
"step": 23245
},
{
"epoch": 2.928299272601316,
"grad_norm": 0.1608610451221466,
"learning_rate": 5.19469287106028e-07,
"loss": 0.1339,
"step": 23250
},
{
"epoch": 2.9289290550114933,
"grad_norm": 0.18140849471092224,
"learning_rate": 5.103691214964789e-07,
"loss": 0.1469,
"step": 23255
},
{
"epoch": 2.929558837421671,
"grad_norm": 0.21651338040828705,
"learning_rate": 5.013492352334003e-07,
"loss": 0.1596,
"step": 23260
},
{
"epoch": 2.9301886198318483,
"grad_norm": 0.18771570920944214,
"learning_rate": 4.924096331607008e-07,
"loss": 0.1482,
"step": 23265
},
{
"epoch": 2.9308184022420254,
"grad_norm": 0.2401566356420517,
"learning_rate": 4.835503200791402e-07,
"loss": 0.1557,
"step": 23270
},
{
"epoch": 2.9314481846522025,
"grad_norm": 0.22064423561096191,
"learning_rate": 4.747713007463627e-07,
"loss": 0.1621,
"step": 23275
},
{
"epoch": 2.93207796706238,
"grad_norm": 0.18799829483032227,
"learning_rate": 4.660725798769305e-07,
"loss": 0.1517,
"step": 23280
},
{
"epoch": 2.932707749472557,
"grad_norm": 0.24584966897964478,
"learning_rate": 4.574541621422401e-07,
"loss": 0.1563,
"step": 23285
},
{
"epoch": 2.9333375318827346,
"grad_norm": 0.18013089895248413,
"learning_rate": 4.489160521705726e-07,
"loss": 0.1388,
"step": 23290
},
{
"epoch": 2.9339673142929117,
"grad_norm": 0.20351989567279816,
"learning_rate": 4.404582545470936e-07,
"loss": 0.1451,
"step": 23295
},
{
"epoch": 2.934597096703089,
"grad_norm": 0.17512726783752441,
"learning_rate": 4.3208077381383655e-07,
"loss": 0.1413,
"step": 23300
},
{
"epoch": 2.9352268791132663,
"grad_norm": 0.2058653086423874,
"learning_rate": 4.2378361446970267e-07,
"loss": 0.1578,
"step": 23305
},
{
"epoch": 2.9358566615234434,
"grad_norm": 0.23548051714897156,
"learning_rate": 4.155667809704444e-07,
"loss": 0.154,
"step": 23310
},
{
"epoch": 2.936486443933621,
"grad_norm": 0.19606271386146545,
"learning_rate": 4.074302777286986e-07,
"loss": 0.1523,
"step": 23315
},
{
"epoch": 2.9371162263437984,
"grad_norm": 0.19439321756362915,
"learning_rate": 3.993741091139369e-07,
"loss": 0.1447,
"step": 23320
},
{
"epoch": 2.9377460087539755,
"grad_norm": 0.20347769558429718,
"learning_rate": 3.9139827945253167e-07,
"loss": 0.1466,
"step": 23325
},
{
"epoch": 2.9383757911641526,
"grad_norm": 0.23724155128002167,
"learning_rate": 3.835027930276735e-07,
"loss": 0.1597,
"step": 23330
},
{
"epoch": 2.93900557357433,
"grad_norm": 0.1813487559556961,
"learning_rate": 3.7568765407940406e-07,
"loss": 0.1358,
"step": 23335
},
{
"epoch": 2.939635355984507,
"grad_norm": 0.21012306213378906,
"learning_rate": 3.679528668046494e-07,
"loss": 0.1508,
"step": 23340
},
{
"epoch": 2.9402651383946847,
"grad_norm": 0.21550029516220093,
"learning_rate": 3.602984353571703e-07,
"loss": 0.1449,
"step": 23345
},
{
"epoch": 2.940894920804862,
"grad_norm": 0.203638955950737,
"learning_rate": 3.5272436384756186e-07,
"loss": 0.1491,
"step": 23350
},
{
"epoch": 2.9415247032150393,
"grad_norm": 0.21173645555973053,
"learning_rate": 3.452306563432872e-07,
"loss": 0.1553,
"step": 23355
},
{
"epoch": 2.9421544856252164,
"grad_norm": 0.22810731828212738,
"learning_rate": 3.3781731686861047e-07,
"loss": 0.1606,
"step": 23360
},
{
"epoch": 2.9427842680353935,
"grad_norm": 0.19044247269630432,
"learning_rate": 3.3048434940469713e-07,
"loss": 0.1421,
"step": 23365
},
{
"epoch": 2.943414050445571,
"grad_norm": 0.21632073819637299,
"learning_rate": 3.232317578894805e-07,
"loss": 0.1473,
"step": 23370
},
{
"epoch": 2.9440438328557486,
"grad_norm": 0.17703349888324738,
"learning_rate": 3.160595462178117e-07,
"loss": 0.1503,
"step": 23375
},
{
"epoch": 2.9446736152659256,
"grad_norm": 0.19983936846256256,
"learning_rate": 3.089677182412931e-07,
"loss": 0.1427,
"step": 23380
},
{
"epoch": 2.9453033976761027,
"grad_norm": 0.18913906812667847,
"learning_rate": 3.019562777684115e-07,
"loss": 0.1443,
"step": 23385
},
{
"epoch": 2.9459331800862802,
"grad_norm": 0.2024787813425064,
"learning_rate": 2.950252285644883e-07,
"loss": 0.1501,
"step": 23390
},
{
"epoch": 2.9465629624964573,
"grad_norm": 0.22307011485099792,
"learning_rate": 2.8817457435164614e-07,
"loss": 0.1526,
"step": 23395
},
{
"epoch": 2.947192744906635,
"grad_norm": 0.23350244760513306,
"learning_rate": 2.814043188088255e-07,
"loss": 0.1583,
"step": 23400
},
{
"epoch": 2.947822527316812,
"grad_norm": 0.18705366551876068,
"learning_rate": 2.7471446557181807e-07,
"loss": 0.1515,
"step": 23405
},
{
"epoch": 2.9484523097269895,
"grad_norm": 0.18902996182441711,
"learning_rate": 2.681050182332334e-07,
"loss": 0.1489,
"step": 23410
},
{
"epoch": 2.9490820921371665,
"grad_norm": 0.18764075636863708,
"learning_rate": 2.6157598034249885e-07,
"loss": 0.1519,
"step": 23415
},
{
"epoch": 2.9497118745473436,
"grad_norm": 0.20529431104660034,
"learning_rate": 2.5512735540584305e-07,
"loss": 0.1504,
"step": 23420
},
{
"epoch": 2.950341656957521,
"grad_norm": 0.21828597784042358,
"learning_rate": 2.487591468863293e-07,
"loss": 0.147,
"step": 23425
},
{
"epoch": 2.9509714393676987,
"grad_norm": 0.2224801480770111,
"learning_rate": 2.424713582038551e-07,
"loss": 0.1542,
"step": 23430
},
{
"epoch": 2.9516012217778758,
"grad_norm": 0.1950775682926178,
"learning_rate": 2.3626399273506957e-07,
"loss": 0.1462,
"step": 23435
},
{
"epoch": 2.952231004188053,
"grad_norm": 0.17124883830547333,
"learning_rate": 2.3013705381348946e-07,
"loss": 0.1374,
"step": 23440
},
{
"epoch": 2.9528607865982304,
"grad_norm": 0.20156201720237732,
"learning_rate": 2.2409054472941613e-07,
"loss": 0.1471,
"step": 23445
},
{
"epoch": 2.9534905690084075,
"grad_norm": 0.19449764490127563,
"learning_rate": 2.1812446872995214e-07,
"loss": 0.158,
"step": 23450
},
{
"epoch": 2.954120351418585,
"grad_norm": 0.18049843609333038,
"learning_rate": 2.1223882901905132e-07,
"loss": 0.1518,
"step": 23455
},
{
"epoch": 2.954750133828762,
"grad_norm": 0.19583791494369507,
"learning_rate": 2.06433628757402e-07,
"loss": 0.1556,
"step": 23460
},
{
"epoch": 2.9553799162389396,
"grad_norm": 0.1842242181301117,
"learning_rate": 2.0070887106254373e-07,
"loss": 0.1494,
"step": 23465
},
{
"epoch": 2.9560096986491167,
"grad_norm": 0.22290416061878204,
"learning_rate": 1.950645590088007e-07,
"loss": 0.1454,
"step": 23470
},
{
"epoch": 2.9566394810592938,
"grad_norm": 0.18742886185646057,
"learning_rate": 1.895006956272982e-07,
"loss": 0.1432,
"step": 23475
},
{
"epoch": 2.9572692634694713,
"grad_norm": 0.181674525141716,
"learning_rate": 1.8401728390594617e-07,
"loss": 0.1534,
"step": 23480
},
{
"epoch": 2.957899045879649,
"grad_norm": 0.2029273808002472,
"learning_rate": 1.786143267894724e-07,
"loss": 0.1515,
"step": 23485
},
{
"epoch": 2.958528828289826,
"grad_norm": 0.23291803896427155,
"learning_rate": 1.7329182717940594e-07,
"loss": 0.144,
"step": 23490
},
{
"epoch": 2.959158610700003,
"grad_norm": 0.24005140364170074,
"learning_rate": 1.6804978793401036e-07,
"loss": 0.1508,
"step": 23495
},
{
"epoch": 2.9597883931101805,
"grad_norm": 0.18763796985149384,
"learning_rate": 1.6288821186841716e-07,
"loss": 0.1477,
"step": 23500
},
{
"epoch": 2.9604181755203576,
"grad_norm": 0.18949641287326813,
"learning_rate": 1.578071017544924e-07,
"loss": 0.1501,
"step": 23505
},
{
"epoch": 2.961047957930535,
"grad_norm": 0.21106213331222534,
"learning_rate": 1.5280646032092003e-07,
"loss": 0.152,
"step": 23510
},
{
"epoch": 2.961677740340712,
"grad_norm": 0.231742724776268,
"learning_rate": 1.4788629025313526e-07,
"loss": 0.1546,
"step": 23515
},
{
"epoch": 2.9623075227508897,
"grad_norm": 0.21462422609329224,
"learning_rate": 1.430465941934078e-07,
"loss": 0.149,
"step": 23520
},
{
"epoch": 2.962937305161067,
"grad_norm": 0.18480440974235535,
"learning_rate": 1.382873747407587e-07,
"loss": 0.1486,
"step": 23525
},
{
"epoch": 2.963567087571244,
"grad_norm": 0.18864907324314117,
"learning_rate": 1.3360863445097682e-07,
"loss": 0.1556,
"step": 23530
},
{
"epoch": 2.9641968699814214,
"grad_norm": 0.17919088900089264,
"learning_rate": 1.2901037583668562e-07,
"loss": 0.1435,
"step": 23535
},
{
"epoch": 2.964826652391599,
"grad_norm": 0.22338031232357025,
"learning_rate": 1.2449260136722649e-07,
"loss": 0.1538,
"step": 23540
},
{
"epoch": 2.965456434801776,
"grad_norm": 0.23020599782466888,
"learning_rate": 1.200553134687754e-07,
"loss": 0.1559,
"step": 23545
},
{
"epoch": 2.966086217211953,
"grad_norm": 0.20723643898963928,
"learning_rate": 1.1569851452422618e-07,
"loss": 0.1415,
"step": 23550
},
{
"epoch": 2.9667159996221306,
"grad_norm": 0.2118474692106247,
"learning_rate": 1.1142220687330727e-07,
"loss": 0.1567,
"step": 23555
},
{
"epoch": 2.9673457820323077,
"grad_norm": 0.20279090106487274,
"learning_rate": 1.0722639281246503e-07,
"loss": 0.145,
"step": 23560
},
{
"epoch": 2.9679755644424852,
"grad_norm": 0.21842657029628754,
"learning_rate": 1.0311107459498035e-07,
"loss": 0.1557,
"step": 23565
},
{
"epoch": 2.9686053468526623,
"grad_norm": 0.2776351571083069,
"learning_rate": 9.90762544308521e-08,
"loss": 0.1638,
"step": 23570
},
{
"epoch": 2.96923512926284,
"grad_norm": 0.19698885083198547,
"learning_rate": 9.512193448686367e-08,
"loss": 0.1457,
"step": 23575
},
{
"epoch": 2.969864911673017,
"grad_norm": 0.1835564374923706,
"learning_rate": 9.124811688659972e-08,
"loss": 0.1569,
"step": 23580
},
{
"epoch": 2.970494694083194,
"grad_norm": 0.20081757009029388,
"learning_rate": 8.745480371036284e-08,
"loss": 0.1451,
"step": 23585
},
{
"epoch": 2.9711244764933715,
"grad_norm": 0.2095508873462677,
"learning_rate": 8.37419969952735e-08,
"loss": 0.1461,
"step": 23590
},
{
"epoch": 2.971754258903549,
"grad_norm": 0.24606873095035553,
"learning_rate": 8.010969873517015e-08,
"loss": 0.1555,
"step": 23595
},
{
"epoch": 2.972384041313726,
"grad_norm": 0.2295389175415039,
"learning_rate": 7.65579108806924e-08,
"loss": 0.1534,
"step": 23600
},
{
"epoch": 2.973013823723903,
"grad_norm": 0.217758446931839,
"learning_rate": 7.308663533924786e-08,
"loss": 0.1647,
"step": 23605
},
{
"epoch": 2.9736436061340807,
"grad_norm": 0.21592317521572113,
"learning_rate": 6.969587397496201e-08,
"loss": 0.1482,
"step": 23610
},
{
"epoch": 2.974273388544258,
"grad_norm": 0.2219853401184082,
"learning_rate": 6.638562860876162e-08,
"loss": 0.151,
"step": 23615
},
{
"epoch": 2.9749031709544353,
"grad_norm": 0.21385011076927185,
"learning_rate": 6.315590101832468e-08,
"loss": 0.1477,
"step": 23620
},
{
"epoch": 2.9755329533646124,
"grad_norm": 0.18287068605422974,
"learning_rate": 6.000669293808048e-08,
"loss": 0.1478,
"step": 23625
},
{
"epoch": 2.97616273577479,
"grad_norm": 0.23458221554756165,
"learning_rate": 5.693800605924281e-08,
"loss": 0.1521,
"step": 23630
},
{
"epoch": 2.976792518184967,
"grad_norm": 0.2393937110900879,
"learning_rate": 5.394984202976016e-08,
"loss": 0.1443,
"step": 23635
},
{
"epoch": 2.977422300595144,
"grad_norm": 0.2523866891860962,
"learning_rate": 5.104220245434887e-08,
"loss": 0.16,
"step": 23640
},
{
"epoch": 2.9780520830053216,
"grad_norm": 0.20749832689762115,
"learning_rate": 4.821508889445991e-08,
"loss": 0.1493,
"step": 23645
},
{
"epoch": 2.978681865415499,
"grad_norm": 0.20586150884628296,
"learning_rate": 4.546850286834547e-08,
"loss": 0.1425,
"step": 23650
},
{
"epoch": 2.9793116478256763,
"grad_norm": 0.20742247998714447,
"learning_rate": 4.2802445850959046e-08,
"loss": 0.1556,
"step": 23655
},
{
"epoch": 2.9799414302358533,
"grad_norm": 0.178171768784523,
"learning_rate": 4.0216919274038696e-08,
"loss": 0.1532,
"step": 23660
},
{
"epoch": 2.980571212646031,
"grad_norm": 0.24389080703258514,
"learning_rate": 3.771192452607374e-08,
"loss": 0.1551,
"step": 23665
},
{
"epoch": 2.981200995056208,
"grad_norm": 0.18905188143253326,
"learning_rate": 3.528746295232143e-08,
"loss": 0.144,
"step": 23670
},
{
"epoch": 2.9818307774663855,
"grad_norm": 0.258633017539978,
"learning_rate": 3.2943535854756956e-08,
"loss": 0.1624,
"step": 23675
},
{
"epoch": 2.9824605598765626,
"grad_norm": 0.19491221010684967,
"learning_rate": 3.0680144492123416e-08,
"loss": 0.1429,
"step": 23680
},
{
"epoch": 2.98309034228674,
"grad_norm": 0.19454774260520935,
"learning_rate": 2.8497290079898537e-08,
"loss": 0.1439,
"step": 23685
},
{
"epoch": 2.983720124696917,
"grad_norm": 0.21658724546432495,
"learning_rate": 2.6394973790361262e-08,
"loss": 0.1397,
"step": 23690
},
{
"epoch": 2.9843499071070942,
"grad_norm": 0.18729209899902344,
"learning_rate": 2.4373196752475177e-08,
"loss": 0.1494,
"step": 23695
},
{
"epoch": 2.9849796895172718,
"grad_norm": 0.19640378654003143,
"learning_rate": 2.243196005198844e-08,
"loss": 0.1497,
"step": 23700
},
{
"epoch": 2.9856094719274493,
"grad_norm": 0.20427659153938293,
"learning_rate": 2.0571264731383817e-08,
"loss": 0.1533,
"step": 23705
},
{
"epoch": 2.9862392543376264,
"grad_norm": 0.217566579580307,
"learning_rate": 1.8791111789911995e-08,
"loss": 0.1539,
"step": 23710
},
{
"epoch": 2.9868690367478035,
"grad_norm": 0.2069326490163803,
"learning_rate": 1.7091502183541606e-08,
"loss": 0.1481,
"step": 23715
},
{
"epoch": 2.987498819157981,
"grad_norm": 0.17276856303215027,
"learning_rate": 1.5472436825009205e-08,
"loss": 0.1457,
"step": 23720
},
{
"epoch": 2.988128601568158,
"grad_norm": 0.21325160562992096,
"learning_rate": 1.3933916583785954e-08,
"loss": 0.1453,
"step": 23725
},
{
"epoch": 2.9887583839783356,
"grad_norm": 0.2071818858385086,
"learning_rate": 1.2475942286094275e-08,
"loss": 0.1522,
"step": 23730
},
{
"epoch": 2.9893881663885127,
"grad_norm": 0.19544194638729095,
"learning_rate": 1.1098514714891205e-08,
"loss": 0.1427,
"step": 23735
},
{
"epoch": 2.99001794879869,
"grad_norm": 0.2268587052822113,
"learning_rate": 9.801634609901688e-09,
"loss": 0.1574,
"step": 23740
},
{
"epoch": 2.9906477312088673,
"grad_norm": 0.1800483763217926,
"learning_rate": 8.585302667585281e-09,
"loss": 0.1491,
"step": 23745
},
{
"epoch": 2.9912775136190444,
"grad_norm": 0.1946074366569519,
"learning_rate": 7.449519541119498e-09,
"loss": 0.1504,
"step": 23750
},
{
"epoch": 2.991907296029222,
"grad_norm": 0.19125299155712128,
"learning_rate": 6.394285840449764e-09,
"loss": 0.148,
"step": 23755
},
{
"epoch": 2.9925370784393994,
"grad_norm": 0.18596796691417694,
"learning_rate": 5.419602132272771e-09,
"loss": 0.1418,
"step": 23760
},
{
"epoch": 2.9931668608495765,
"grad_norm": 0.20286522805690765,
"learning_rate": 4.525468940003163e-09,
"loss": 0.1425,
"step": 23765
},
{
"epoch": 2.9937966432597536,
"grad_norm": 0.1934228241443634,
"learning_rate": 3.7118867438068465e-09,
"loss": 0.1456,
"step": 23770
},
{
"epoch": 2.994426425669931,
"grad_norm": 0.19864603877067566,
"learning_rate": 2.9788559806176447e-09,
"loss": 0.141,
"step": 23775
},
{
"epoch": 2.995056208080108,
"grad_norm": 0.1812632828950882,
"learning_rate": 2.326377044070682e-09,
"loss": 0.1414,
"step": 23780
},
{
"epoch": 2.9956859904902857,
"grad_norm": 0.18837498128414154,
"learning_rate": 1.7544502845856512e-09,
"loss": 0.1466,
"step": 23785
},
{
"epoch": 2.996315772900463,
"grad_norm": 0.19479897618293762,
"learning_rate": 1.2630760092668946e-09,
"loss": 0.1434,
"step": 23790
},
{
"epoch": 2.9969455553106403,
"grad_norm": 0.2550322413444519,
"learning_rate": 8.522544820199761e-10,
"loss": 0.1527,
"step": 23795
},
{
"epoch": 2.9975753377208174,
"grad_norm": 0.22663426399230957,
"learning_rate": 5.21985923451762e-10,
"loss": 0.154,
"step": 23800
},
{
"epoch": 2.9982051201309945,
"grad_norm": 0.19043132662773132,
"learning_rate": 2.7227051092038043e-10,
"loss": 0.1413,
"step": 23805
},
{
"epoch": 2.998834902541172,
"grad_norm": 0.17332880198955536,
"learning_rate": 1.0310837855187492e-10,
"loss": 0.137,
"step": 23810
},
{
"epoch": 2.9994646849513495,
"grad_norm": 0.2147851288318634,
"learning_rate": 1.4499617156937815e-11,
"loss": 0.1501,
"step": 23815
}
],
"logging_steps": 5,
"max_steps": 23817,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0258751160588435e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}