Files
eng_100mb_baseline_seed3407/checkpoint-22260/trainer_state.json
ModelHub XC d1cda2f931 初始化项目,由ModelHub XC社区提供模型
Model: fpadovani/eng_100mb_baseline_seed3407
Source: Original Platform
2026-05-30 03:10:20 +08:00

31374 lines
712 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 1000,
"global_step": 22260,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0022461814914645105,
"grad_norm": 54.0,
"learning_rate": 7.499999999999999e-07,
"loss": 10.989,
"step": 5
},
{
"epoch": 0.004492362982929021,
"grad_norm": 52.75,
"learning_rate": 1.4999999999999998e-06,
"loss": 10.984,
"step": 10
},
{
"epoch": 0.006738544474393531,
"grad_norm": 52.5,
"learning_rate": 2.2499999999999996e-06,
"loss": 10.9491,
"step": 15
},
{
"epoch": 0.008984725965858042,
"grad_norm": 50.25,
"learning_rate": 2.9999999999999997e-06,
"loss": 10.8608,
"step": 20
},
{
"epoch": 0.011230907457322551,
"grad_norm": 44.75,
"learning_rate": 3.7499999999999997e-06,
"loss": 10.7375,
"step": 25
},
{
"epoch": 0.013477088948787063,
"grad_norm": 38.0,
"learning_rate": 4.499999999999999e-06,
"loss": 10.5621,
"step": 30
},
{
"epoch": 0.015723270440251572,
"grad_norm": 25.5,
"learning_rate": 5.25e-06,
"loss": 10.3304,
"step": 35
},
{
"epoch": 0.017969451931716084,
"grad_norm": 19.25,
"learning_rate": 5.999999999999999e-06,
"loss": 10.1403,
"step": 40
},
{
"epoch": 0.02021563342318059,
"grad_norm": 13.8125,
"learning_rate": 6.749999999999999e-06,
"loss": 9.9521,
"step": 45
},
{
"epoch": 0.022461814914645103,
"grad_norm": 11.1875,
"learning_rate": 7.499999999999999e-06,
"loss": 9.843,
"step": 50
},
{
"epoch": 0.024707996406109614,
"grad_norm": 10.5,
"learning_rate": 8.249999999999999e-06,
"loss": 9.7584,
"step": 55
},
{
"epoch": 0.026954177897574125,
"grad_norm": 10.0625,
"learning_rate": 8.999999999999999e-06,
"loss": 9.7293,
"step": 60
},
{
"epoch": 0.029200359389038633,
"grad_norm": 9.1875,
"learning_rate": 9.75e-06,
"loss": 9.719,
"step": 65
},
{
"epoch": 0.031446540880503145,
"grad_norm": 8.9375,
"learning_rate": 1.05e-05,
"loss": 9.6908,
"step": 70
},
{
"epoch": 0.03369272237196765,
"grad_norm": 9.125,
"learning_rate": 1.1249999999999999e-05,
"loss": 9.6617,
"step": 75
},
{
"epoch": 0.03593890386343217,
"grad_norm": 9.0625,
"learning_rate": 1.1999999999999999e-05,
"loss": 9.6228,
"step": 80
},
{
"epoch": 0.038185085354896675,
"grad_norm": 9.125,
"learning_rate": 1.275e-05,
"loss": 9.6069,
"step": 85
},
{
"epoch": 0.04043126684636118,
"grad_norm": 9.3125,
"learning_rate": 1.3499999999999998e-05,
"loss": 9.5342,
"step": 90
},
{
"epoch": 0.0426774483378257,
"grad_norm": 8.9375,
"learning_rate": 1.4249999999999999e-05,
"loss": 9.5187,
"step": 95
},
{
"epoch": 0.044923629829290206,
"grad_norm": 9.125,
"learning_rate": 1.4999999999999999e-05,
"loss": 9.4719,
"step": 100
},
{
"epoch": 0.04716981132075472,
"grad_norm": 9.0,
"learning_rate": 1.5749999999999997e-05,
"loss": 9.4167,
"step": 105
},
{
"epoch": 0.04941599281221923,
"grad_norm": 8.9375,
"learning_rate": 1.6499999999999998e-05,
"loss": 9.3825,
"step": 110
},
{
"epoch": 0.051662174303683736,
"grad_norm": 8.75,
"learning_rate": 1.725e-05,
"loss": 9.3577,
"step": 115
},
{
"epoch": 0.05390835579514825,
"grad_norm": 8.625,
"learning_rate": 1.7999999999999997e-05,
"loss": 9.3387,
"step": 120
},
{
"epoch": 0.05615453728661276,
"grad_norm": 9.375,
"learning_rate": 1.875e-05,
"loss": 9.2947,
"step": 125
},
{
"epoch": 0.05840071877807727,
"grad_norm": 8.75,
"learning_rate": 1.95e-05,
"loss": 9.2177,
"step": 130
},
{
"epoch": 0.06064690026954178,
"grad_norm": 8.8125,
"learning_rate": 2.025e-05,
"loss": 9.1683,
"step": 135
},
{
"epoch": 0.06289308176100629,
"grad_norm": 9.875,
"learning_rate": 2.1e-05,
"loss": 9.1444,
"step": 140
},
{
"epoch": 0.0651392632524708,
"grad_norm": 9.625,
"learning_rate": 2.1749999999999997e-05,
"loss": 9.0632,
"step": 145
},
{
"epoch": 0.0673854447439353,
"grad_norm": 8.8125,
"learning_rate": 2.2499999999999998e-05,
"loss": 9.0828,
"step": 150
},
{
"epoch": 0.06963162623539983,
"grad_norm": 9.5625,
"learning_rate": 2.325e-05,
"loss": 9.0005,
"step": 155
},
{
"epoch": 0.07187780772686433,
"grad_norm": 11.1875,
"learning_rate": 2.3999999999999997e-05,
"loss": 8.9463,
"step": 160
},
{
"epoch": 0.07412398921832884,
"grad_norm": 9.3125,
"learning_rate": 2.475e-05,
"loss": 8.9145,
"step": 165
},
{
"epoch": 0.07637017070979335,
"grad_norm": 8.1875,
"learning_rate": 2.55e-05,
"loss": 8.8803,
"step": 170
},
{
"epoch": 0.07861635220125786,
"grad_norm": 7.65625,
"learning_rate": 2.6249999999999998e-05,
"loss": 8.8266,
"step": 175
},
{
"epoch": 0.08086253369272237,
"grad_norm": 7.78125,
"learning_rate": 2.6999999999999996e-05,
"loss": 8.7826,
"step": 180
},
{
"epoch": 0.08310871518418689,
"grad_norm": 8.875,
"learning_rate": 2.7749999999999997e-05,
"loss": 8.7463,
"step": 185
},
{
"epoch": 0.0853548966756514,
"grad_norm": 8.375,
"learning_rate": 2.8499999999999998e-05,
"loss": 8.6836,
"step": 190
},
{
"epoch": 0.0876010781671159,
"grad_norm": 8.5,
"learning_rate": 2.925e-05,
"loss": 8.6827,
"step": 195
},
{
"epoch": 0.08984725965858041,
"grad_norm": 8.25,
"learning_rate": 2.9999999999999997e-05,
"loss": 8.588,
"step": 200
},
{
"epoch": 0.09209344115004492,
"grad_norm": 8.3125,
"learning_rate": 3.0749999999999995e-05,
"loss": 8.5417,
"step": 205
},
{
"epoch": 0.09433962264150944,
"grad_norm": 9.4375,
"learning_rate": 3.149999999999999e-05,
"loss": 8.5287,
"step": 210
},
{
"epoch": 0.09658580413297395,
"grad_norm": 8.125,
"learning_rate": 3.225e-05,
"loss": 8.49,
"step": 215
},
{
"epoch": 0.09883198562443846,
"grad_norm": 7.59375,
"learning_rate": 3.2999999999999996e-05,
"loss": 8.4025,
"step": 220
},
{
"epoch": 0.10107816711590296,
"grad_norm": 8.75,
"learning_rate": 3.375e-05,
"loss": 8.3121,
"step": 225
},
{
"epoch": 0.10332434860736747,
"grad_norm": 7.8125,
"learning_rate": 3.45e-05,
"loss": 8.2635,
"step": 230
},
{
"epoch": 0.10557053009883198,
"grad_norm": 8.3125,
"learning_rate": 3.5249999999999996e-05,
"loss": 8.2691,
"step": 235
},
{
"epoch": 0.1078167115902965,
"grad_norm": 9.4375,
"learning_rate": 3.5999999999999994e-05,
"loss": 8.1828,
"step": 240
},
{
"epoch": 0.11006289308176101,
"grad_norm": 7.0625,
"learning_rate": 3.675e-05,
"loss": 8.0901,
"step": 245
},
{
"epoch": 0.11230907457322552,
"grad_norm": 8.125,
"learning_rate": 3.75e-05,
"loss": 8.0418,
"step": 250
},
{
"epoch": 0.11455525606469003,
"grad_norm": 7.0625,
"learning_rate": 3.8249999999999995e-05,
"loss": 8.0148,
"step": 255
},
{
"epoch": 0.11680143755615453,
"grad_norm": 7.5625,
"learning_rate": 3.9e-05,
"loss": 7.9943,
"step": 260
},
{
"epoch": 0.11904761904761904,
"grad_norm": 7.0625,
"learning_rate": 3.975e-05,
"loss": 7.852,
"step": 265
},
{
"epoch": 0.12129380053908356,
"grad_norm": 6.6875,
"learning_rate": 4.05e-05,
"loss": 7.8506,
"step": 270
},
{
"epoch": 0.12353998203054807,
"grad_norm": 7.46875,
"learning_rate": 4.125e-05,
"loss": 7.7912,
"step": 275
},
{
"epoch": 0.12578616352201258,
"grad_norm": 6.0,
"learning_rate": 4.2e-05,
"loss": 7.7331,
"step": 280
},
{
"epoch": 0.1280323450134771,
"grad_norm": 6.75,
"learning_rate": 4.2749999999999996e-05,
"loss": 7.6362,
"step": 285
},
{
"epoch": 0.1302785265049416,
"grad_norm": 5.9375,
"learning_rate": 4.3499999999999993e-05,
"loss": 7.5867,
"step": 290
},
{
"epoch": 0.13252470799640612,
"grad_norm": 6.40625,
"learning_rate": 4.424999999999999e-05,
"loss": 7.5268,
"step": 295
},
{
"epoch": 0.1347708894878706,
"grad_norm": 5.71875,
"learning_rate": 4.4999999999999996e-05,
"loss": 7.5554,
"step": 300
},
{
"epoch": 0.13701707097933513,
"grad_norm": 5.5,
"learning_rate": 4.5749999999999994e-05,
"loss": 7.4486,
"step": 305
},
{
"epoch": 0.13926325247079965,
"grad_norm": 5.15625,
"learning_rate": 4.65e-05,
"loss": 7.4554,
"step": 310
},
{
"epoch": 0.14150943396226415,
"grad_norm": 4.84375,
"learning_rate": 4.7249999999999997e-05,
"loss": 7.3681,
"step": 315
},
{
"epoch": 0.14375561545372867,
"grad_norm": 7.625,
"learning_rate": 4.7999999999999994e-05,
"loss": 7.2977,
"step": 320
},
{
"epoch": 0.14600179694519316,
"grad_norm": 5.25,
"learning_rate": 4.875e-05,
"loss": 7.2572,
"step": 325
},
{
"epoch": 0.14824797843665768,
"grad_norm": 5.125,
"learning_rate": 4.95e-05,
"loss": 7.322,
"step": 330
},
{
"epoch": 0.15049415992812218,
"grad_norm": 4.96875,
"learning_rate": 5.025e-05,
"loss": 7.2646,
"step": 335
},
{
"epoch": 0.1527403414195867,
"grad_norm": 4.96875,
"learning_rate": 5.1e-05,
"loss": 7.32,
"step": 340
},
{
"epoch": 0.15498652291105122,
"grad_norm": 5.3125,
"learning_rate": 5.174999999999999e-05,
"loss": 7.209,
"step": 345
},
{
"epoch": 0.15723270440251572,
"grad_norm": 5.40625,
"learning_rate": 5.2499999999999995e-05,
"loss": 7.1961,
"step": 350
},
{
"epoch": 0.15947888589398024,
"grad_norm": 4.15625,
"learning_rate": 5.324999999999999e-05,
"loss": 7.2062,
"step": 355
},
{
"epoch": 0.16172506738544473,
"grad_norm": 4.65625,
"learning_rate": 5.399999999999999e-05,
"loss": 7.1401,
"step": 360
},
{
"epoch": 0.16397124887690925,
"grad_norm": 5.71875,
"learning_rate": 5.4749999999999996e-05,
"loss": 7.1402,
"step": 365
},
{
"epoch": 0.16621743036837378,
"grad_norm": 5.34375,
"learning_rate": 5.5499999999999994e-05,
"loss": 7.073,
"step": 370
},
{
"epoch": 0.16846361185983827,
"grad_norm": 5.96875,
"learning_rate": 5.625e-05,
"loss": 7.115,
"step": 375
},
{
"epoch": 0.1707097933513028,
"grad_norm": 4.625,
"learning_rate": 5.6999999999999996e-05,
"loss": 7.1363,
"step": 380
},
{
"epoch": 0.17295597484276728,
"grad_norm": 5.34375,
"learning_rate": 5.7749999999999994e-05,
"loss": 7.1075,
"step": 385
},
{
"epoch": 0.1752021563342318,
"grad_norm": 4.46875,
"learning_rate": 5.85e-05,
"loss": 7.0746,
"step": 390
},
{
"epoch": 0.17744833782569633,
"grad_norm": 4.53125,
"learning_rate": 5.925e-05,
"loss": 7.0877,
"step": 395
},
{
"epoch": 0.17969451931716082,
"grad_norm": 4.6875,
"learning_rate": 5.9999999999999995e-05,
"loss": 7.033,
"step": 400
},
{
"epoch": 0.18194070080862534,
"grad_norm": 4.9375,
"learning_rate": 6.075e-05,
"loss": 7.0603,
"step": 405
},
{
"epoch": 0.18418688230008984,
"grad_norm": 4.8125,
"learning_rate": 6.149999999999999e-05,
"loss": 7.0149,
"step": 410
},
{
"epoch": 0.18643306379155436,
"grad_norm": 4.6875,
"learning_rate": 6.225e-05,
"loss": 6.9823,
"step": 415
},
{
"epoch": 0.18867924528301888,
"grad_norm": 5.65625,
"learning_rate": 6.299999999999999e-05,
"loss": 7.0107,
"step": 420
},
{
"epoch": 0.19092542677448338,
"grad_norm": 4.5625,
"learning_rate": 6.374999999999999e-05,
"loss": 7.0235,
"step": 425
},
{
"epoch": 0.1931716082659479,
"grad_norm": 4.71875,
"learning_rate": 6.45e-05,
"loss": 6.9444,
"step": 430
},
{
"epoch": 0.1954177897574124,
"grad_norm": 4.6875,
"learning_rate": 6.525e-05,
"loss": 6.9067,
"step": 435
},
{
"epoch": 0.1976639712488769,
"grad_norm": 4.375,
"learning_rate": 6.599999999999999e-05,
"loss": 6.9952,
"step": 440
},
{
"epoch": 0.1999101527403414,
"grad_norm": 4.46875,
"learning_rate": 6.675e-05,
"loss": 6.8992,
"step": 445
},
{
"epoch": 0.20215633423180593,
"grad_norm": 4.875,
"learning_rate": 6.75e-05,
"loss": 6.931,
"step": 450
},
{
"epoch": 0.20440251572327045,
"grad_norm": 4.6875,
"learning_rate": 6.824999999999999e-05,
"loss": 6.9036,
"step": 455
},
{
"epoch": 0.20664869721473494,
"grad_norm": 4.75,
"learning_rate": 6.9e-05,
"loss": 6.9332,
"step": 460
},
{
"epoch": 0.20889487870619947,
"grad_norm": 4.25,
"learning_rate": 6.975e-05,
"loss": 7.0612,
"step": 465
},
{
"epoch": 0.21114106019766396,
"grad_norm": 4.59375,
"learning_rate": 7.049999999999999e-05,
"loss": 6.8777,
"step": 470
},
{
"epoch": 0.21338724168912848,
"grad_norm": 4.59375,
"learning_rate": 7.125e-05,
"loss": 6.8593,
"step": 475
},
{
"epoch": 0.215633423180593,
"grad_norm": 5.1875,
"learning_rate": 7.199999999999999e-05,
"loss": 6.9541,
"step": 480
},
{
"epoch": 0.2178796046720575,
"grad_norm": 4.65625,
"learning_rate": 7.274999999999999e-05,
"loss": 6.878,
"step": 485
},
{
"epoch": 0.22012578616352202,
"grad_norm": 5.1875,
"learning_rate": 7.35e-05,
"loss": 6.8284,
"step": 490
},
{
"epoch": 0.2223719676549865,
"grad_norm": 3.9375,
"learning_rate": 7.424999999999999e-05,
"loss": 6.8567,
"step": 495
},
{
"epoch": 0.22461814914645103,
"grad_norm": 5.1875,
"learning_rate": 7.5e-05,
"loss": 6.8235,
"step": 500
},
{
"epoch": 0.22686433063791556,
"grad_norm": 4.65625,
"learning_rate": 7.575e-05,
"loss": 6.8903,
"step": 505
},
{
"epoch": 0.22911051212938005,
"grad_norm": 5.875,
"learning_rate": 7.649999999999999e-05,
"loss": 6.8404,
"step": 510
},
{
"epoch": 0.23135669362084457,
"grad_norm": 5.0625,
"learning_rate": 7.725e-05,
"loss": 6.8318,
"step": 515
},
{
"epoch": 0.23360287511230907,
"grad_norm": 4.5625,
"learning_rate": 7.8e-05,
"loss": 6.8522,
"step": 520
},
{
"epoch": 0.2358490566037736,
"grad_norm": 5.03125,
"learning_rate": 7.874999999999999e-05,
"loss": 6.859,
"step": 525
},
{
"epoch": 0.23809523809523808,
"grad_norm": 4.71875,
"learning_rate": 7.95e-05,
"loss": 6.8336,
"step": 530
},
{
"epoch": 0.2403414195867026,
"grad_norm": 4.875,
"learning_rate": 8.025e-05,
"loss": 6.7897,
"step": 535
},
{
"epoch": 0.24258760107816713,
"grad_norm": 4.375,
"learning_rate": 8.1e-05,
"loss": 6.7873,
"step": 540
},
{
"epoch": 0.24483378256963162,
"grad_norm": 4.34375,
"learning_rate": 8.175e-05,
"loss": 6.7691,
"step": 545
},
{
"epoch": 0.24707996406109614,
"grad_norm": 4.40625,
"learning_rate": 8.25e-05,
"loss": 6.8252,
"step": 550
},
{
"epoch": 0.24932614555256064,
"grad_norm": 4.6875,
"learning_rate": 8.325e-05,
"loss": 6.8071,
"step": 555
},
{
"epoch": 0.25157232704402516,
"grad_norm": 4.65625,
"learning_rate": 8.4e-05,
"loss": 6.7156,
"step": 560
},
{
"epoch": 0.25381850853548965,
"grad_norm": 4.875,
"learning_rate": 8.474999999999999e-05,
"loss": 6.8189,
"step": 565
},
{
"epoch": 0.2560646900269542,
"grad_norm": 4.53125,
"learning_rate": 8.549999999999999e-05,
"loss": 6.8159,
"step": 570
},
{
"epoch": 0.2583108715184187,
"grad_norm": 3.75,
"learning_rate": 8.624999999999998e-05,
"loss": 6.847,
"step": 575
},
{
"epoch": 0.2605570530098832,
"grad_norm": 4.71875,
"learning_rate": 8.699999999999999e-05,
"loss": 6.7576,
"step": 580
},
{
"epoch": 0.2628032345013477,
"grad_norm": 5.375,
"learning_rate": 8.774999999999999e-05,
"loss": 6.7211,
"step": 585
},
{
"epoch": 0.26504941599281223,
"grad_norm": 4.875,
"learning_rate": 8.849999999999998e-05,
"loss": 6.7255,
"step": 590
},
{
"epoch": 0.2672955974842767,
"grad_norm": 4.71875,
"learning_rate": 8.924999999999999e-05,
"loss": 6.6598,
"step": 595
},
{
"epoch": 0.2695417789757412,
"grad_norm": 4.25,
"learning_rate": 8.999999999999999e-05,
"loss": 6.7735,
"step": 600
},
{
"epoch": 0.27178796046720577,
"grad_norm": 4.3125,
"learning_rate": 9.074999999999998e-05,
"loss": 6.7253,
"step": 605
},
{
"epoch": 0.27403414195867026,
"grad_norm": 4.8125,
"learning_rate": 9.149999999999999e-05,
"loss": 6.6825,
"step": 610
},
{
"epoch": 0.27628032345013476,
"grad_norm": 4.40625,
"learning_rate": 9.224999999999999e-05,
"loss": 6.7523,
"step": 615
},
{
"epoch": 0.2785265049415993,
"grad_norm": 4.46875,
"learning_rate": 9.3e-05,
"loss": 6.7212,
"step": 620
},
{
"epoch": 0.2807726864330638,
"grad_norm": 4.875,
"learning_rate": 9.374999999999999e-05,
"loss": 6.7052,
"step": 625
},
{
"epoch": 0.2830188679245283,
"grad_norm": 4.6875,
"learning_rate": 9.449999999999999e-05,
"loss": 6.7031,
"step": 630
},
{
"epoch": 0.2852650494159928,
"grad_norm": 4.1875,
"learning_rate": 9.525e-05,
"loss": 6.7163,
"step": 635
},
{
"epoch": 0.28751123090745734,
"grad_norm": 4.15625,
"learning_rate": 9.599999999999999e-05,
"loss": 6.7148,
"step": 640
},
{
"epoch": 0.28975741239892183,
"grad_norm": 3.78125,
"learning_rate": 9.675e-05,
"loss": 6.7027,
"step": 645
},
{
"epoch": 0.2920035938903863,
"grad_norm": 4.375,
"learning_rate": 9.75e-05,
"loss": 6.6511,
"step": 650
},
{
"epoch": 0.2942497753818509,
"grad_norm": 4.0625,
"learning_rate": 9.824999999999999e-05,
"loss": 6.704,
"step": 655
},
{
"epoch": 0.29649595687331537,
"grad_norm": 4.09375,
"learning_rate": 9.9e-05,
"loss": 6.689,
"step": 660
},
{
"epoch": 0.29874213836477986,
"grad_norm": 3.875,
"learning_rate": 9.975e-05,
"loss": 6.6784,
"step": 665
},
{
"epoch": 0.30098831985624436,
"grad_norm": 4.5,
"learning_rate": 0.0001005,
"loss": 6.597,
"step": 670
},
{
"epoch": 0.3032345013477089,
"grad_norm": 4.3125,
"learning_rate": 0.00010125,
"loss": 6.6198,
"step": 675
},
{
"epoch": 0.3054806828391734,
"grad_norm": 4.03125,
"learning_rate": 0.000102,
"loss": 6.6226,
"step": 680
},
{
"epoch": 0.3077268643306379,
"grad_norm": 4.03125,
"learning_rate": 0.00010275,
"loss": 6.627,
"step": 685
},
{
"epoch": 0.30997304582210244,
"grad_norm": 5.1875,
"learning_rate": 0.00010349999999999998,
"loss": 6.6003,
"step": 690
},
{
"epoch": 0.31221922731356694,
"grad_norm": 3.640625,
"learning_rate": 0.00010424999999999999,
"loss": 6.5845,
"step": 695
},
{
"epoch": 0.31446540880503143,
"grad_norm": 4.4375,
"learning_rate": 0.00010499999999999999,
"loss": 6.6143,
"step": 700
},
{
"epoch": 0.316711590296496,
"grad_norm": 4.90625,
"learning_rate": 0.00010574999999999998,
"loss": 6.6305,
"step": 705
},
{
"epoch": 0.3189577717879605,
"grad_norm": 4.3125,
"learning_rate": 0.00010649999999999999,
"loss": 6.5312,
"step": 710
},
{
"epoch": 0.32120395327942497,
"grad_norm": 4.15625,
"learning_rate": 0.00010724999999999999,
"loss": 6.63,
"step": 715
},
{
"epoch": 0.32345013477088946,
"grad_norm": 4.53125,
"learning_rate": 0.00010799999999999998,
"loss": 6.564,
"step": 720
},
{
"epoch": 0.325696316262354,
"grad_norm": 4.03125,
"learning_rate": 0.00010874999999999999,
"loss": 6.6572,
"step": 725
},
{
"epoch": 0.3279424977538185,
"grad_norm": 4.40625,
"learning_rate": 0.00010949999999999999,
"loss": 6.5728,
"step": 730
},
{
"epoch": 0.330188679245283,
"grad_norm": 4.34375,
"learning_rate": 0.00011024999999999998,
"loss": 6.5245,
"step": 735
},
{
"epoch": 0.33243486073674755,
"grad_norm": 5.5,
"learning_rate": 0.00011099999999999999,
"loss": 6.5883,
"step": 740
},
{
"epoch": 0.33468104222821204,
"grad_norm": 5.53125,
"learning_rate": 0.00011174999999999999,
"loss": 6.5549,
"step": 745
},
{
"epoch": 0.33692722371967654,
"grad_norm": 4.40625,
"learning_rate": 0.0001125,
"loss": 6.5269,
"step": 750
},
{
"epoch": 0.33917340521114103,
"grad_norm": 4.65625,
"learning_rate": 0.00011324999999999999,
"loss": 6.5262,
"step": 755
},
{
"epoch": 0.3414195867026056,
"grad_norm": 4.25,
"learning_rate": 0.00011399999999999999,
"loss": 6.4958,
"step": 760
},
{
"epoch": 0.3436657681940701,
"grad_norm": 4.34375,
"learning_rate": 0.00011475,
"loss": 6.4719,
"step": 765
},
{
"epoch": 0.34591194968553457,
"grad_norm": 3.828125,
"learning_rate": 0.00011549999999999999,
"loss": 6.4948,
"step": 770
},
{
"epoch": 0.3481581311769991,
"grad_norm": 3.890625,
"learning_rate": 0.00011624999999999999,
"loss": 6.5652,
"step": 775
},
{
"epoch": 0.3504043126684636,
"grad_norm": 3.828125,
"learning_rate": 0.000117,
"loss": 6.633,
"step": 780
},
{
"epoch": 0.3526504941599281,
"grad_norm": 3.78125,
"learning_rate": 0.00011774999999999999,
"loss": 6.4617,
"step": 785
},
{
"epoch": 0.35489667565139266,
"grad_norm": 3.9375,
"learning_rate": 0.0001185,
"loss": 6.524,
"step": 790
},
{
"epoch": 0.35714285714285715,
"grad_norm": 7.25,
"learning_rate": 0.00011925,
"loss": 6.4985,
"step": 795
},
{
"epoch": 0.35938903863432164,
"grad_norm": 3.828125,
"learning_rate": 0.00011999999999999999,
"loss": 6.4988,
"step": 800
},
{
"epoch": 0.36163522012578614,
"grad_norm": 5.125,
"learning_rate": 0.00012075,
"loss": 6.5393,
"step": 805
},
{
"epoch": 0.3638814016172507,
"grad_norm": 4.90625,
"learning_rate": 0.0001215,
"loss": 6.4869,
"step": 810
},
{
"epoch": 0.3661275831087152,
"grad_norm": 4.1875,
"learning_rate": 0.00012225,
"loss": 6.4419,
"step": 815
},
{
"epoch": 0.3683737646001797,
"grad_norm": 3.765625,
"learning_rate": 0.00012299999999999998,
"loss": 6.574,
"step": 820
},
{
"epoch": 0.3706199460916442,
"grad_norm": 3.796875,
"learning_rate": 0.00012374999999999997,
"loss": 6.5063,
"step": 825
},
{
"epoch": 0.3728661275831087,
"grad_norm": 3.734375,
"learning_rate": 0.0001245,
"loss": 6.5404,
"step": 830
},
{
"epoch": 0.3751123090745732,
"grad_norm": 3.65625,
"learning_rate": 0.00012524999999999998,
"loss": 6.4726,
"step": 835
},
{
"epoch": 0.37735849056603776,
"grad_norm": 4.0,
"learning_rate": 0.00012599999999999997,
"loss": 6.4099,
"step": 840
},
{
"epoch": 0.37960467205750226,
"grad_norm": 4.25,
"learning_rate": 0.00012675,
"loss": 6.3966,
"step": 845
},
{
"epoch": 0.38185085354896675,
"grad_norm": 3.828125,
"learning_rate": 0.00012749999999999998,
"loss": 6.4607,
"step": 850
},
{
"epoch": 0.38409703504043125,
"grad_norm": 4.28125,
"learning_rate": 0.00012824999999999997,
"loss": 6.4718,
"step": 855
},
{
"epoch": 0.3863432165318958,
"grad_norm": 4.71875,
"learning_rate": 0.000129,
"loss": 6.4569,
"step": 860
},
{
"epoch": 0.3885893980233603,
"grad_norm": 4.375,
"learning_rate": 0.00012974999999999998,
"loss": 6.3576,
"step": 865
},
{
"epoch": 0.3908355795148248,
"grad_norm": 4.65625,
"learning_rate": 0.0001305,
"loss": 6.4259,
"step": 870
},
{
"epoch": 0.39308176100628933,
"grad_norm": 4.96875,
"learning_rate": 0.00013125,
"loss": 6.3831,
"step": 875
},
{
"epoch": 0.3953279424977538,
"grad_norm": 3.90625,
"learning_rate": 0.00013199999999999998,
"loss": 6.4086,
"step": 880
},
{
"epoch": 0.3975741239892183,
"grad_norm": 3.75,
"learning_rate": 0.00013275,
"loss": 6.3207,
"step": 885
},
{
"epoch": 0.3998203054806828,
"grad_norm": 4.28125,
"learning_rate": 0.0001335,
"loss": 6.4129,
"step": 890
},
{
"epoch": 0.40206648697214736,
"grad_norm": 3.8125,
"learning_rate": 0.00013424999999999998,
"loss": 6.4397,
"step": 895
},
{
"epoch": 0.40431266846361186,
"grad_norm": 3.921875,
"learning_rate": 0.000135,
"loss": 6.4104,
"step": 900
},
{
"epoch": 0.40655884995507635,
"grad_norm": 3.984375,
"learning_rate": 0.00013575,
"loss": 6.3327,
"step": 905
},
{
"epoch": 0.4088050314465409,
"grad_norm": 3.859375,
"learning_rate": 0.00013649999999999998,
"loss": 6.3965,
"step": 910
},
{
"epoch": 0.4110512129380054,
"grad_norm": 4.03125,
"learning_rate": 0.00013725,
"loss": 6.3614,
"step": 915
},
{
"epoch": 0.4132973944294699,
"grad_norm": 3.734375,
"learning_rate": 0.000138,
"loss": 6.3743,
"step": 920
},
{
"epoch": 0.41554357592093444,
"grad_norm": 3.984375,
"learning_rate": 0.00013874999999999998,
"loss": 6.4228,
"step": 925
},
{
"epoch": 0.41778975741239893,
"grad_norm": 4.03125,
"learning_rate": 0.0001395,
"loss": 6.4047,
"step": 930
},
{
"epoch": 0.4200359389038634,
"grad_norm": 3.984375,
"learning_rate": 0.00014025,
"loss": 6.3634,
"step": 935
},
{
"epoch": 0.4222821203953279,
"grad_norm": 4.0,
"learning_rate": 0.00014099999999999998,
"loss": 6.3866,
"step": 940
},
{
"epoch": 0.42452830188679247,
"grad_norm": 3.796875,
"learning_rate": 0.00014174999999999998,
"loss": 6.3599,
"step": 945
},
{
"epoch": 0.42677448337825696,
"grad_norm": 4.03125,
"learning_rate": 0.0001425,
"loss": 6.3422,
"step": 950
},
{
"epoch": 0.42902066486972146,
"grad_norm": 4.15625,
"learning_rate": 0.00014324999999999999,
"loss": 6.2791,
"step": 955
},
{
"epoch": 0.431266846361186,
"grad_norm": 3.96875,
"learning_rate": 0.00014399999999999998,
"loss": 6.3505,
"step": 960
},
{
"epoch": 0.4335130278526505,
"grad_norm": 4.5,
"learning_rate": 0.00014475,
"loss": 6.3671,
"step": 965
},
{
"epoch": 0.435759209344115,
"grad_norm": 3.65625,
"learning_rate": 0.00014549999999999999,
"loss": 6.318,
"step": 970
},
{
"epoch": 0.4380053908355795,
"grad_norm": 4.28125,
"learning_rate": 0.00014624999999999998,
"loss": 6.3299,
"step": 975
},
{
"epoch": 0.44025157232704404,
"grad_norm": 3.578125,
"learning_rate": 0.000147,
"loss": 6.4073,
"step": 980
},
{
"epoch": 0.44249775381850853,
"grad_norm": 3.734375,
"learning_rate": 0.00014774999999999999,
"loss": 6.4377,
"step": 985
},
{
"epoch": 0.444743935309973,
"grad_norm": 3.765625,
"learning_rate": 0.00014849999999999998,
"loss": 6.2784,
"step": 990
},
{
"epoch": 0.4469901168014376,
"grad_norm": 3.953125,
"learning_rate": 0.00014925,
"loss": 6.2901,
"step": 995
},
{
"epoch": 0.44923629829290207,
"grad_norm": 4.375,
"learning_rate": 0.00015,
"loss": 6.2973,
"step": 1000
},
{
"epoch": 0.44923629829290207,
"eval_loss": 6.229096412658691,
"eval_runtime": 16.2469,
"eval_samples_per_second": 1908.854,
"eval_steps_per_second": 238.63,
"step": 1000
},
{
"epoch": 0.45148247978436656,
"grad_norm": 3.78125,
"learning_rate": 0.00015074999999999998,
"loss": 6.3253,
"step": 1005
},
{
"epoch": 0.4537286612758311,
"grad_norm": 3.953125,
"learning_rate": 0.0001515,
"loss": 6.2906,
"step": 1010
},
{
"epoch": 0.4559748427672956,
"grad_norm": 3.90625,
"learning_rate": 0.00015224999999999996,
"loss": 6.3351,
"step": 1015
},
{
"epoch": 0.4582210242587601,
"grad_norm": 3.6875,
"learning_rate": 0.00015299999999999998,
"loss": 6.368,
"step": 1020
},
{
"epoch": 0.4604672057502246,
"grad_norm": 3.796875,
"learning_rate": 0.00015374999999999997,
"loss": 6.3008,
"step": 1025
},
{
"epoch": 0.46271338724168914,
"grad_norm": 3.703125,
"learning_rate": 0.0001545,
"loss": 6.283,
"step": 1030
},
{
"epoch": 0.46495956873315364,
"grad_norm": 3.734375,
"learning_rate": 0.00015524999999999998,
"loss": 6.3212,
"step": 1035
},
{
"epoch": 0.46720575022461813,
"grad_norm": 4.15625,
"learning_rate": 0.000156,
"loss": 6.2874,
"step": 1040
},
{
"epoch": 0.4694519317160827,
"grad_norm": 3.484375,
"learning_rate": 0.00015675,
"loss": 6.2944,
"step": 1045
},
{
"epoch": 0.4716981132075472,
"grad_norm": 4.3125,
"learning_rate": 0.00015749999999999998,
"loss": 6.3099,
"step": 1050
},
{
"epoch": 0.47394429469901167,
"grad_norm": 3.734375,
"learning_rate": 0.00015824999999999997,
"loss": 6.2531,
"step": 1055
},
{
"epoch": 0.47619047619047616,
"grad_norm": 3.609375,
"learning_rate": 0.000159,
"loss": 6.2326,
"step": 1060
},
{
"epoch": 0.4784366576819407,
"grad_norm": 3.8125,
"learning_rate": 0.00015974999999999998,
"loss": 6.2059,
"step": 1065
},
{
"epoch": 0.4806828391734052,
"grad_norm": 3.625,
"learning_rate": 0.0001605,
"loss": 6.2798,
"step": 1070
},
{
"epoch": 0.4829290206648697,
"grad_norm": 3.890625,
"learning_rate": 0.00016125,
"loss": 6.2814,
"step": 1075
},
{
"epoch": 0.48517520215633425,
"grad_norm": 3.84375,
"learning_rate": 0.000162,
"loss": 6.1955,
"step": 1080
},
{
"epoch": 0.48742138364779874,
"grad_norm": 4.0,
"learning_rate": 0.00016274999999999997,
"loss": 6.3142,
"step": 1085
},
{
"epoch": 0.48966756513926324,
"grad_norm": 3.71875,
"learning_rate": 0.0001635,
"loss": 6.193,
"step": 1090
},
{
"epoch": 0.4919137466307278,
"grad_norm": 4.0,
"learning_rate": 0.00016424999999999998,
"loss": 6.26,
"step": 1095
},
{
"epoch": 0.4941599281221923,
"grad_norm": 4.0625,
"learning_rate": 0.000165,
"loss": 6.2443,
"step": 1100
},
{
"epoch": 0.4964061096136568,
"grad_norm": 3.671875,
"learning_rate": 0.00016575,
"loss": 6.2278,
"step": 1105
},
{
"epoch": 0.49865229110512127,
"grad_norm": 3.6875,
"learning_rate": 0.0001665,
"loss": 6.2254,
"step": 1110
},
{
"epoch": 0.5008984725965858,
"grad_norm": 3.921875,
"learning_rate": 0.00016724999999999997,
"loss": 6.3325,
"step": 1115
},
{
"epoch": 0.5031446540880503,
"grad_norm": 3.921875,
"learning_rate": 0.000168,
"loss": 6.186,
"step": 1120
},
{
"epoch": 0.5053908355795148,
"grad_norm": 3.859375,
"learning_rate": 0.00016874999999999998,
"loss": 6.2389,
"step": 1125
},
{
"epoch": 0.5076370170709793,
"grad_norm": 4.71875,
"learning_rate": 0.00016949999999999997,
"loss": 6.1268,
"step": 1130
},
{
"epoch": 0.5098831985624438,
"grad_norm": 3.90625,
"learning_rate": 0.00017025,
"loss": 6.1445,
"step": 1135
},
{
"epoch": 0.5121293800539084,
"grad_norm": 3.484375,
"learning_rate": 0.00017099999999999998,
"loss": 6.1658,
"step": 1140
},
{
"epoch": 0.5143755615453729,
"grad_norm": 3.78125,
"learning_rate": 0.00017175,
"loss": 6.1832,
"step": 1145
},
{
"epoch": 0.5166217430368374,
"grad_norm": 3.96875,
"learning_rate": 0.00017249999999999996,
"loss": 6.1621,
"step": 1150
},
{
"epoch": 0.5188679245283019,
"grad_norm": 3.765625,
"learning_rate": 0.00017324999999999998,
"loss": 6.22,
"step": 1155
},
{
"epoch": 0.5211141060197664,
"grad_norm": 3.890625,
"learning_rate": 0.00017399999999999997,
"loss": 6.1432,
"step": 1160
},
{
"epoch": 0.5233602875112309,
"grad_norm": 3.59375,
"learning_rate": 0.00017475,
"loss": 6.1223,
"step": 1165
},
{
"epoch": 0.5256064690026954,
"grad_norm": 3.28125,
"learning_rate": 0.00017549999999999998,
"loss": 6.1839,
"step": 1170
},
{
"epoch": 0.52785265049416,
"grad_norm": 3.9375,
"learning_rate": 0.00017625,
"loss": 6.2021,
"step": 1175
},
{
"epoch": 0.5300988319856245,
"grad_norm": 4.03125,
"learning_rate": 0.00017699999999999997,
"loss": 6.1947,
"step": 1180
},
{
"epoch": 0.532345013477089,
"grad_norm": 4.5,
"learning_rate": 0.00017774999999999998,
"loss": 6.1474,
"step": 1185
},
{
"epoch": 0.5345911949685535,
"grad_norm": 3.671875,
"learning_rate": 0.00017849999999999997,
"loss": 6.1488,
"step": 1190
},
{
"epoch": 0.536837376460018,
"grad_norm": 3.734375,
"learning_rate": 0.00017925,
"loss": 6.1943,
"step": 1195
},
{
"epoch": 0.5390835579514824,
"grad_norm": 3.8125,
"learning_rate": 0.00017999999999999998,
"loss": 6.13,
"step": 1200
},
{
"epoch": 0.541329739442947,
"grad_norm": 3.828125,
"learning_rate": 0.00018075,
"loss": 6.0818,
"step": 1205
},
{
"epoch": 0.5435759209344115,
"grad_norm": 3.546875,
"learning_rate": 0.00018149999999999997,
"loss": 6.1505,
"step": 1210
},
{
"epoch": 0.545822102425876,
"grad_norm": 4.03125,
"learning_rate": 0.00018224999999999998,
"loss": 6.1578,
"step": 1215
},
{
"epoch": 0.5480682839173405,
"grad_norm": 3.921875,
"learning_rate": 0.00018299999999999998,
"loss": 6.0904,
"step": 1220
},
{
"epoch": 0.550314465408805,
"grad_norm": 4.1875,
"learning_rate": 0.00018375,
"loss": 6.0851,
"step": 1225
},
{
"epoch": 0.5525606469002695,
"grad_norm": 4.21875,
"learning_rate": 0.00018449999999999999,
"loss": 6.1133,
"step": 1230
},
{
"epoch": 0.554806828391734,
"grad_norm": 3.765625,
"learning_rate": 0.00018525,
"loss": 6.1453,
"step": 1235
},
{
"epoch": 0.5570530098831986,
"grad_norm": 3.671875,
"learning_rate": 0.000186,
"loss": 6.1572,
"step": 1240
},
{
"epoch": 0.5592991913746631,
"grad_norm": 3.8125,
"learning_rate": 0.00018675,
"loss": 6.2205,
"step": 1245
},
{
"epoch": 0.5615453728661276,
"grad_norm": 4.4375,
"learning_rate": 0.00018749999999999998,
"loss": 6.1114,
"step": 1250
},
{
"epoch": 0.5637915543575921,
"grad_norm": 4.03125,
"learning_rate": 0.00018824999999999997,
"loss": 6.1407,
"step": 1255
},
{
"epoch": 0.5660377358490566,
"grad_norm": 4.1875,
"learning_rate": 0.00018899999999999999,
"loss": 6.1272,
"step": 1260
},
{
"epoch": 0.5682839173405211,
"grad_norm": 4.03125,
"learning_rate": 0.00018974999999999998,
"loss": 6.1264,
"step": 1265
},
{
"epoch": 0.5705300988319856,
"grad_norm": 4.09375,
"learning_rate": 0.0001905,
"loss": 6.0308,
"step": 1270
},
{
"epoch": 0.5727762803234502,
"grad_norm": 3.421875,
"learning_rate": 0.00019124999999999996,
"loss": 6.1028,
"step": 1275
},
{
"epoch": 0.5750224618149147,
"grad_norm": 3.953125,
"learning_rate": 0.00019199999999999998,
"loss": 6.1002,
"step": 1280
},
{
"epoch": 0.5772686433063792,
"grad_norm": 4.1875,
"learning_rate": 0.00019274999999999997,
"loss": 6.1451,
"step": 1285
},
{
"epoch": 0.5795148247978437,
"grad_norm": 4.0625,
"learning_rate": 0.0001935,
"loss": 6.0798,
"step": 1290
},
{
"epoch": 0.5817610062893082,
"grad_norm": 3.609375,
"learning_rate": 0.00019424999999999998,
"loss": 6.0831,
"step": 1295
},
{
"epoch": 0.5840071877807727,
"grad_norm": 3.671875,
"learning_rate": 0.000195,
"loss": 6.1054,
"step": 1300
},
{
"epoch": 0.5862533692722371,
"grad_norm": 3.625,
"learning_rate": 0.00019574999999999996,
"loss": 6.0122,
"step": 1305
},
{
"epoch": 0.5884995507637018,
"grad_norm": 4.0625,
"learning_rate": 0.00019649999999999998,
"loss": 6.0397,
"step": 1310
},
{
"epoch": 0.5907457322551662,
"grad_norm": 3.59375,
"learning_rate": 0.00019724999999999997,
"loss": 5.9765,
"step": 1315
},
{
"epoch": 0.5929919137466307,
"grad_norm": 3.296875,
"learning_rate": 0.000198,
"loss": 6.0359,
"step": 1320
},
{
"epoch": 0.5952380952380952,
"grad_norm": 3.828125,
"learning_rate": 0.00019874999999999998,
"loss": 6.0552,
"step": 1325
},
{
"epoch": 0.5974842767295597,
"grad_norm": 3.5625,
"learning_rate": 0.0001995,
"loss": 6.0254,
"step": 1330
},
{
"epoch": 0.5997304582210242,
"grad_norm": 3.703125,
"learning_rate": 0.00020025,
"loss": 6.0575,
"step": 1335
},
{
"epoch": 0.6019766397124887,
"grad_norm": 3.59375,
"learning_rate": 0.000201,
"loss": 6.004,
"step": 1340
},
{
"epoch": 0.6042228212039533,
"grad_norm": 3.65625,
"learning_rate": 0.00020174999999999997,
"loss": 6.0784,
"step": 1345
},
{
"epoch": 0.6064690026954178,
"grad_norm": 3.78125,
"learning_rate": 0.0002025,
"loss": 6.1157,
"step": 1350
},
{
"epoch": 0.6087151841868823,
"grad_norm": 3.65625,
"learning_rate": 0.00020324999999999998,
"loss": 6.0583,
"step": 1355
},
{
"epoch": 0.6109613656783468,
"grad_norm": 3.4375,
"learning_rate": 0.000204,
"loss": 6.0366,
"step": 1360
},
{
"epoch": 0.6132075471698113,
"grad_norm": 3.4375,
"learning_rate": 0.00020475,
"loss": 6.1213,
"step": 1365
},
{
"epoch": 0.6154537286612758,
"grad_norm": 3.8125,
"learning_rate": 0.0002055,
"loss": 6.1744,
"step": 1370
},
{
"epoch": 0.6176999101527404,
"grad_norm": 3.8125,
"learning_rate": 0.00020624999999999997,
"loss": 6.0912,
"step": 1375
},
{
"epoch": 0.6199460916442049,
"grad_norm": 3.421875,
"learning_rate": 0.00020699999999999996,
"loss": 5.9619,
"step": 1380
},
{
"epoch": 0.6221922731356694,
"grad_norm": 3.78125,
"learning_rate": 0.00020774999999999998,
"loss": 5.9658,
"step": 1385
},
{
"epoch": 0.6244384546271339,
"grad_norm": 3.484375,
"learning_rate": 0.00020849999999999997,
"loss": 6.0913,
"step": 1390
},
{
"epoch": 0.6266846361185984,
"grad_norm": 3.484375,
"learning_rate": 0.00020925,
"loss": 6.0363,
"step": 1395
},
{
"epoch": 0.6289308176100629,
"grad_norm": 3.890625,
"learning_rate": 0.00020999999999999998,
"loss": 5.9513,
"step": 1400
},
{
"epoch": 0.6311769991015274,
"grad_norm": 4.0625,
"learning_rate": 0.00021074999999999997,
"loss": 5.9931,
"step": 1405
},
{
"epoch": 0.633423180592992,
"grad_norm": 4.0,
"learning_rate": 0.00021149999999999996,
"loss": 5.9732,
"step": 1410
},
{
"epoch": 0.6356693620844565,
"grad_norm": 3.671875,
"learning_rate": 0.00021224999999999998,
"loss": 6.0028,
"step": 1415
},
{
"epoch": 0.637915543575921,
"grad_norm": 3.5,
"learning_rate": 0.00021299999999999997,
"loss": 6.0171,
"step": 1420
},
{
"epoch": 0.6401617250673854,
"grad_norm": 3.421875,
"learning_rate": 0.00021375,
"loss": 5.9886,
"step": 1425
},
{
"epoch": 0.6424079065588499,
"grad_norm": 3.875,
"learning_rate": 0.00021449999999999998,
"loss": 5.9436,
"step": 1430
},
{
"epoch": 0.6446540880503144,
"grad_norm": 3.3125,
"learning_rate": 0.00021525,
"loss": 6.0565,
"step": 1435
},
{
"epoch": 0.6469002695417789,
"grad_norm": 3.640625,
"learning_rate": 0.00021599999999999996,
"loss": 6.1117,
"step": 1440
},
{
"epoch": 0.6491464510332435,
"grad_norm": 3.625,
"learning_rate": 0.00021674999999999998,
"loss": 5.9778,
"step": 1445
},
{
"epoch": 0.651392632524708,
"grad_norm": 4.0625,
"learning_rate": 0.00021749999999999997,
"loss": 5.9706,
"step": 1450
},
{
"epoch": 0.6536388140161725,
"grad_norm": 4.15625,
"learning_rate": 0.00021825,
"loss": 5.9358,
"step": 1455
},
{
"epoch": 0.655884995507637,
"grad_norm": 3.5,
"learning_rate": 0.00021899999999999998,
"loss": 6.0584,
"step": 1460
},
{
"epoch": 0.6581311769991015,
"grad_norm": 3.734375,
"learning_rate": 0.00021975,
"loss": 6.0055,
"step": 1465
},
{
"epoch": 0.660377358490566,
"grad_norm": 3.78125,
"learning_rate": 0.00022049999999999997,
"loss": 5.9678,
"step": 1470
},
{
"epoch": 0.6626235399820305,
"grad_norm": 3.703125,
"learning_rate": 0.00022124999999999998,
"loss": 5.9747,
"step": 1475
},
{
"epoch": 0.6648697214734951,
"grad_norm": 3.46875,
"learning_rate": 0.00022199999999999998,
"loss": 5.9542,
"step": 1480
},
{
"epoch": 0.6671159029649596,
"grad_norm": 3.34375,
"learning_rate": 0.00022275,
"loss": 5.9001,
"step": 1485
},
{
"epoch": 0.6693620844564241,
"grad_norm": 3.65625,
"learning_rate": 0.00022349999999999998,
"loss": 5.9689,
"step": 1490
},
{
"epoch": 0.6716082659478886,
"grad_norm": 3.953125,
"learning_rate": 0.00022425,
"loss": 5.9823,
"step": 1495
},
{
"epoch": 0.6738544474393531,
"grad_norm": 3.53125,
"learning_rate": 0.000225,
"loss": 5.9758,
"step": 1500
},
{
"epoch": 0.6761006289308176,
"grad_norm": 3.484375,
"learning_rate": 0.00022574999999999996,
"loss": 5.9994,
"step": 1505
},
{
"epoch": 0.6783468104222821,
"grad_norm": 3.6875,
"learning_rate": 0.00022649999999999998,
"loss": 5.8979,
"step": 1510
},
{
"epoch": 0.6805929919137467,
"grad_norm": 3.328125,
"learning_rate": 0.00022724999999999997,
"loss": 6.0046,
"step": 1515
},
{
"epoch": 0.6828391734052112,
"grad_norm": 3.75,
"learning_rate": 0.00022799999999999999,
"loss": 5.9637,
"step": 1520
},
{
"epoch": 0.6850853548966757,
"grad_norm": 3.296875,
"learning_rate": 0.00022874999999999998,
"loss": 5.939,
"step": 1525
},
{
"epoch": 0.6873315363881402,
"grad_norm": 3.484375,
"learning_rate": 0.0002295,
"loss": 6.0089,
"step": 1530
},
{
"epoch": 0.6895777178796046,
"grad_norm": 3.46875,
"learning_rate": 0.00023024999999999996,
"loss": 5.9247,
"step": 1535
},
{
"epoch": 0.6918238993710691,
"grad_norm": 3.3125,
"learning_rate": 0.00023099999999999998,
"loss": 5.8969,
"step": 1540
},
{
"epoch": 0.6940700808625337,
"grad_norm": 3.734375,
"learning_rate": 0.00023174999999999997,
"loss": 5.8485,
"step": 1545
},
{
"epoch": 0.6963162623539982,
"grad_norm": 3.375,
"learning_rate": 0.00023249999999999999,
"loss": 5.9481,
"step": 1550
},
{
"epoch": 0.6985624438454627,
"grad_norm": 3.5625,
"learning_rate": 0.00023324999999999998,
"loss": 5.9145,
"step": 1555
},
{
"epoch": 0.7008086253369272,
"grad_norm": 3.5,
"learning_rate": 0.000234,
"loss": 5.8711,
"step": 1560
},
{
"epoch": 0.7030548068283917,
"grad_norm": 3.703125,
"learning_rate": 0.00023474999999999996,
"loss": 5.9697,
"step": 1565
},
{
"epoch": 0.7053009883198562,
"grad_norm": 3.75,
"learning_rate": 0.00023549999999999998,
"loss": 5.8905,
"step": 1570
},
{
"epoch": 0.7075471698113207,
"grad_norm": 3.59375,
"learning_rate": 0.00023624999999999997,
"loss": 5.9357,
"step": 1575
},
{
"epoch": 0.7097933513027853,
"grad_norm": 3.453125,
"learning_rate": 0.000237,
"loss": 5.8548,
"step": 1580
},
{
"epoch": 0.7120395327942498,
"grad_norm": 3.484375,
"learning_rate": 0.00023774999999999998,
"loss": 5.9498,
"step": 1585
},
{
"epoch": 0.7142857142857143,
"grad_norm": 3.78125,
"learning_rate": 0.0002385,
"loss": 5.8457,
"step": 1590
},
{
"epoch": 0.7165318957771788,
"grad_norm": 3.5625,
"learning_rate": 0.00023925,
"loss": 5.8717,
"step": 1595
},
{
"epoch": 0.7187780772686433,
"grad_norm": 3.328125,
"learning_rate": 0.00023999999999999998,
"loss": 5.8193,
"step": 1600
},
{
"epoch": 0.7210242587601078,
"grad_norm": 3.296875,
"learning_rate": 0.00024074999999999997,
"loss": 5.8618,
"step": 1605
},
{
"epoch": 0.7232704402515723,
"grad_norm": 3.625,
"learning_rate": 0.0002415,
"loss": 5.8882,
"step": 1610
},
{
"epoch": 0.7255166217430369,
"grad_norm": 3.28125,
"learning_rate": 0.00024224999999999998,
"loss": 5.9087,
"step": 1615
},
{
"epoch": 0.7277628032345014,
"grad_norm": 3.53125,
"learning_rate": 0.000243,
"loss": 5.8994,
"step": 1620
},
{
"epoch": 0.7300089847259659,
"grad_norm": 3.34375,
"learning_rate": 0.00024375,
"loss": 5.9156,
"step": 1625
},
{
"epoch": 0.7322551662174304,
"grad_norm": 3.78125,
"learning_rate": 0.0002445,
"loss": 5.889,
"step": 1630
},
{
"epoch": 0.7345013477088949,
"grad_norm": 3.5,
"learning_rate": 0.00024524999999999997,
"loss": 5.8538,
"step": 1635
},
{
"epoch": 0.7367475292003594,
"grad_norm": 3.53125,
"learning_rate": 0.00024599999999999996,
"loss": 5.914,
"step": 1640
},
{
"epoch": 0.7389937106918238,
"grad_norm": 3.25,
"learning_rate": 0.00024675,
"loss": 5.8628,
"step": 1645
},
{
"epoch": 0.7412398921832885,
"grad_norm": 3.5,
"learning_rate": 0.00024749999999999994,
"loss": 5.8555,
"step": 1650
},
{
"epoch": 0.743486073674753,
"grad_norm": 3.4375,
"learning_rate": 0.00024825,
"loss": 5.8846,
"step": 1655
},
{
"epoch": 0.7457322551662174,
"grad_norm": 3.703125,
"learning_rate": 0.000249,
"loss": 5.8957,
"step": 1660
},
{
"epoch": 0.7479784366576819,
"grad_norm": 3.25,
"learning_rate": 0.00024974999999999997,
"loss": 5.8036,
"step": 1665
},
{
"epoch": 0.7502246181491464,
"grad_norm": 3.375,
"learning_rate": 0.00025049999999999996,
"loss": 5.845,
"step": 1670
},
{
"epoch": 0.7524707996406109,
"grad_norm": 3.1875,
"learning_rate": 0.00025125,
"loss": 5.8801,
"step": 1675
},
{
"epoch": 0.7547169811320755,
"grad_norm": 3.53125,
"learning_rate": 0.00025199999999999995,
"loss": 5.8356,
"step": 1680
},
{
"epoch": 0.75696316262354,
"grad_norm": 3.375,
"learning_rate": 0.00025275,
"loss": 5.851,
"step": 1685
},
{
"epoch": 0.7592093441150045,
"grad_norm": 3.546875,
"learning_rate": 0.0002535,
"loss": 5.8647,
"step": 1690
},
{
"epoch": 0.761455525606469,
"grad_norm": 3.4375,
"learning_rate": 0.00025425,
"loss": 5.8168,
"step": 1695
},
{
"epoch": 0.7637017070979335,
"grad_norm": 3.609375,
"learning_rate": 0.00025499999999999996,
"loss": 5.8514,
"step": 1700
},
{
"epoch": 0.765947888589398,
"grad_norm": 3.3125,
"learning_rate": 0.00025575,
"loss": 5.7495,
"step": 1705
},
{
"epoch": 0.7681940700808625,
"grad_norm": 3.515625,
"learning_rate": 0.00025649999999999995,
"loss": 5.8702,
"step": 1710
},
{
"epoch": 0.7704402515723271,
"grad_norm": 3.640625,
"learning_rate": 0.00025725,
"loss": 5.9178,
"step": 1715
},
{
"epoch": 0.7726864330637916,
"grad_norm": 3.1875,
"learning_rate": 0.000258,
"loss": 5.82,
"step": 1720
},
{
"epoch": 0.7749326145552561,
"grad_norm": 3.765625,
"learning_rate": 0.00025875,
"loss": 5.823,
"step": 1725
},
{
"epoch": 0.7771787960467206,
"grad_norm": 3.4375,
"learning_rate": 0.00025949999999999997,
"loss": 5.8712,
"step": 1730
},
{
"epoch": 0.7794249775381851,
"grad_norm": 3.140625,
"learning_rate": 0.00026025,
"loss": 5.8173,
"step": 1735
},
{
"epoch": 0.7816711590296496,
"grad_norm": 3.28125,
"learning_rate": 0.000261,
"loss": 5.8169,
"step": 1740
},
{
"epoch": 0.7839173405211141,
"grad_norm": 3.4375,
"learning_rate": 0.00026175,
"loss": 5.8047,
"step": 1745
},
{
"epoch": 0.7861635220125787,
"grad_norm": 3.21875,
"learning_rate": 0.0002625,
"loss": 5.8384,
"step": 1750
},
{
"epoch": 0.7884097035040432,
"grad_norm": 3.40625,
"learning_rate": 0.00026325,
"loss": 5.7996,
"step": 1755
},
{
"epoch": 0.7906558849955077,
"grad_norm": 3.4375,
"learning_rate": 0.00026399999999999997,
"loss": 5.7611,
"step": 1760
},
{
"epoch": 0.7929020664869721,
"grad_norm": 3.390625,
"learning_rate": 0.00026474999999999996,
"loss": 5.7925,
"step": 1765
},
{
"epoch": 0.7951482479784366,
"grad_norm": 3.375,
"learning_rate": 0.0002655,
"loss": 5.8187,
"step": 1770
},
{
"epoch": 0.7973944294699011,
"grad_norm": 3.53125,
"learning_rate": 0.00026624999999999994,
"loss": 5.7791,
"step": 1775
},
{
"epoch": 0.7996406109613656,
"grad_norm": 3.8125,
"learning_rate": 0.000267,
"loss": 5.8063,
"step": 1780
},
{
"epoch": 0.8018867924528302,
"grad_norm": 3.25,
"learning_rate": 0.00026775,
"loss": 5.8167,
"step": 1785
},
{
"epoch": 0.8041329739442947,
"grad_norm": 3.46875,
"learning_rate": 0.00026849999999999997,
"loss": 5.7916,
"step": 1790
},
{
"epoch": 0.8063791554357592,
"grad_norm": 3.28125,
"learning_rate": 0.00026924999999999996,
"loss": 5.8446,
"step": 1795
},
{
"epoch": 0.8086253369272237,
"grad_norm": 3.65625,
"learning_rate": 0.00027,
"loss": 5.8757,
"step": 1800
},
{
"epoch": 0.8108715184186882,
"grad_norm": 3.734375,
"learning_rate": 0.00027074999999999994,
"loss": 5.7271,
"step": 1805
},
{
"epoch": 0.8131176999101527,
"grad_norm": 3.765625,
"learning_rate": 0.0002715,
"loss": 5.8397,
"step": 1810
},
{
"epoch": 0.8153638814016172,
"grad_norm": 3.34375,
"learning_rate": 0.00027225,
"loss": 5.7838,
"step": 1815
},
{
"epoch": 0.8176100628930818,
"grad_norm": 3.59375,
"learning_rate": 0.00027299999999999997,
"loss": 5.7907,
"step": 1820
},
{
"epoch": 0.8198562443845463,
"grad_norm": 3.921875,
"learning_rate": 0.00027374999999999996,
"loss": 5.8579,
"step": 1825
},
{
"epoch": 0.8221024258760108,
"grad_norm": 3.46875,
"learning_rate": 0.0002745,
"loss": 5.8342,
"step": 1830
},
{
"epoch": 0.8243486073674753,
"grad_norm": 3.75,
"learning_rate": 0.00027525,
"loss": 5.7949,
"step": 1835
},
{
"epoch": 0.8265947888589398,
"grad_norm": 3.4375,
"learning_rate": 0.000276,
"loss": 5.7715,
"step": 1840
},
{
"epoch": 0.8288409703504043,
"grad_norm": 3.703125,
"learning_rate": 0.00027675,
"loss": 5.7804,
"step": 1845
},
{
"epoch": 0.8310871518418689,
"grad_norm": 3.4375,
"learning_rate": 0.00027749999999999997,
"loss": 5.7288,
"step": 1850
},
{
"epoch": 0.8333333333333334,
"grad_norm": 3.109375,
"learning_rate": 0.00027824999999999996,
"loss": 5.7319,
"step": 1855
},
{
"epoch": 0.8355795148247979,
"grad_norm": 3.21875,
"learning_rate": 0.000279,
"loss": 5.7636,
"step": 1860
},
{
"epoch": 0.8378256963162624,
"grad_norm": 3.234375,
"learning_rate": 0.00027975,
"loss": 5.7395,
"step": 1865
},
{
"epoch": 0.8400718778077269,
"grad_norm": 3.6875,
"learning_rate": 0.0002805,
"loss": 5.7519,
"step": 1870
},
{
"epoch": 0.8423180592991913,
"grad_norm": 3.265625,
"learning_rate": 0.00028125,
"loss": 5.706,
"step": 1875
},
{
"epoch": 0.8445642407906558,
"grad_norm": 3.390625,
"learning_rate": 0.00028199999999999997,
"loss": 5.799,
"step": 1880
},
{
"epoch": 0.8468104222821204,
"grad_norm": 3.265625,
"learning_rate": 0.00028274999999999996,
"loss": 5.7856,
"step": 1885
},
{
"epoch": 0.8490566037735849,
"grad_norm": 3.421875,
"learning_rate": 0.00028349999999999995,
"loss": 5.8625,
"step": 1890
},
{
"epoch": 0.8513027852650494,
"grad_norm": 3.203125,
"learning_rate": 0.00028425,
"loss": 5.7212,
"step": 1895
},
{
"epoch": 0.8535489667565139,
"grad_norm": 3.296875,
"learning_rate": 0.000285,
"loss": 5.7326,
"step": 1900
},
{
"epoch": 0.8557951482479784,
"grad_norm": 3.5,
"learning_rate": 0.00028575,
"loss": 5.7664,
"step": 1905
},
{
"epoch": 0.8580413297394429,
"grad_norm": 3.34375,
"learning_rate": 0.00028649999999999997,
"loss": 5.7231,
"step": 1910
},
{
"epoch": 0.8602875112309074,
"grad_norm": 3.40625,
"learning_rate": 0.00028724999999999996,
"loss": 5.7759,
"step": 1915
},
{
"epoch": 0.862533692722372,
"grad_norm": 3.125,
"learning_rate": 0.00028799999999999995,
"loss": 5.7442,
"step": 1920
},
{
"epoch": 0.8647798742138365,
"grad_norm": 3.15625,
"learning_rate": 0.00028875,
"loss": 5.7252,
"step": 1925
},
{
"epoch": 0.867026055705301,
"grad_norm": 3.265625,
"learning_rate": 0.0002895,
"loss": 5.7196,
"step": 1930
},
{
"epoch": 0.8692722371967655,
"grad_norm": 3.328125,
"learning_rate": 0.00029025,
"loss": 5.7376,
"step": 1935
},
{
"epoch": 0.87151841868823,
"grad_norm": 3.1875,
"learning_rate": 0.00029099999999999997,
"loss": 5.8077,
"step": 1940
},
{
"epoch": 0.8737646001796945,
"grad_norm": 3.625,
"learning_rate": 0.00029174999999999996,
"loss": 5.7826,
"step": 1945
},
{
"epoch": 0.876010781671159,
"grad_norm": 3.609375,
"learning_rate": 0.00029249999999999995,
"loss": 5.736,
"step": 1950
},
{
"epoch": 0.8782569631626236,
"grad_norm": 3.421875,
"learning_rate": 0.00029325,
"loss": 5.7531,
"step": 1955
},
{
"epoch": 0.8805031446540881,
"grad_norm": 3.4375,
"learning_rate": 0.000294,
"loss": 5.7246,
"step": 1960
},
{
"epoch": 0.8827493261455526,
"grad_norm": 3.375,
"learning_rate": 0.00029475,
"loss": 5.7786,
"step": 1965
},
{
"epoch": 0.8849955076370171,
"grad_norm": 3.296875,
"learning_rate": 0.00029549999999999997,
"loss": 5.7237,
"step": 1970
},
{
"epoch": 0.8872416891284816,
"grad_norm": 2.96875,
"learning_rate": 0.00029624999999999996,
"loss": 5.8053,
"step": 1975
},
{
"epoch": 0.889487870619946,
"grad_norm": 3.328125,
"learning_rate": 0.00029699999999999996,
"loss": 5.6918,
"step": 1980
},
{
"epoch": 0.8917340521114105,
"grad_norm": 3.015625,
"learning_rate": 0.00029775,
"loss": 5.8251,
"step": 1985
},
{
"epoch": 0.8939802336028752,
"grad_norm": 3.78125,
"learning_rate": 0.0002985,
"loss": 5.7529,
"step": 1990
},
{
"epoch": 0.8962264150943396,
"grad_norm": 3.640625,
"learning_rate": 0.00029925,
"loss": 5.7181,
"step": 1995
},
{
"epoch": 0.8984725965858041,
"grad_norm": 3.234375,
"learning_rate": 0.0003,
"loss": 5.7413,
"step": 2000
},
{
"epoch": 0.8984725965858041,
"eval_loss": 5.639461517333984,
"eval_runtime": 16.0491,
"eval_samples_per_second": 1932.383,
"eval_steps_per_second": 241.571,
"step": 2000
},
{
"epoch": 0.9007187780772686,
"grad_norm": 3.140625,
"learning_rate": 0.00029999995942443054,
"loss": 5.6436,
"step": 2005
},
{
"epoch": 0.9029649595687331,
"grad_norm": 3.328125,
"learning_rate": 0.00029999983769774674,
"loss": 5.7627,
"step": 2010
},
{
"epoch": 0.9052111410601976,
"grad_norm": 3.171875,
"learning_rate": 0.0002999996348200217,
"loss": 5.7181,
"step": 2015
},
{
"epoch": 0.9074573225516622,
"grad_norm": 3.34375,
"learning_rate": 0.0002999993507913773,
"loss": 5.7097,
"step": 2020
},
{
"epoch": 0.9097035040431267,
"grad_norm": 3.1875,
"learning_rate": 0.0002999989856119844,
"loss": 5.6407,
"step": 2025
},
{
"epoch": 0.9119496855345912,
"grad_norm": 3.453125,
"learning_rate": 0.0002999985392820624,
"loss": 5.6532,
"step": 2030
},
{
"epoch": 0.9141958670260557,
"grad_norm": 3.140625,
"learning_rate": 0.0002999980118018797,
"loss": 5.6993,
"step": 2035
},
{
"epoch": 0.9164420485175202,
"grad_norm": 3.546875,
"learning_rate": 0.0002999974031717533,
"loss": 5.6507,
"step": 2040
},
{
"epoch": 0.9186882300089847,
"grad_norm": 3.546875,
"learning_rate": 0.0002999967133920491,
"loss": 5.6629,
"step": 2045
},
{
"epoch": 0.9209344115004492,
"grad_norm": 3.203125,
"learning_rate": 0.0002999959424631818,
"loss": 5.7172,
"step": 2050
},
{
"epoch": 0.9231805929919138,
"grad_norm": 3.140625,
"learning_rate": 0.0002999950903856147,
"loss": 5.5766,
"step": 2055
},
{
"epoch": 0.9254267744833783,
"grad_norm": 3.234375,
"learning_rate": 0.00029999415715986,
"loss": 5.6546,
"step": 2060
},
{
"epoch": 0.9276729559748428,
"grad_norm": 3.34375,
"learning_rate": 0.0002999931427864788,
"loss": 5.6317,
"step": 2065
},
{
"epoch": 0.9299191374663073,
"grad_norm": 3.1875,
"learning_rate": 0.00029999204726608076,
"loss": 5.6605,
"step": 2070
},
{
"epoch": 0.9321653189577718,
"grad_norm": 3.40625,
"learning_rate": 0.0002999908705993245,
"loss": 5.6958,
"step": 2075
},
{
"epoch": 0.9344115004492363,
"grad_norm": 3.046875,
"learning_rate": 0.00029998961278691725,
"loss": 5.6498,
"step": 2080
},
{
"epoch": 0.9366576819407008,
"grad_norm": 3.203125,
"learning_rate": 0.0002999882738296152,
"loss": 5.6887,
"step": 2085
},
{
"epoch": 0.9389038634321654,
"grad_norm": 3.453125,
"learning_rate": 0.0002999868537282231,
"loss": 5.617,
"step": 2090
},
{
"epoch": 0.9411500449236299,
"grad_norm": 3.25,
"learning_rate": 0.0002999853524835947,
"loss": 5.7708,
"step": 2095
},
{
"epoch": 0.9433962264150944,
"grad_norm": 3.421875,
"learning_rate": 0.0002999837700966324,
"loss": 5.6733,
"step": 2100
},
{
"epoch": 0.9456424079065588,
"grad_norm": 3.359375,
"learning_rate": 0.00029998210656828736,
"loss": 5.7,
"step": 2105
},
{
"epoch": 0.9478885893980233,
"grad_norm": 3.296875,
"learning_rate": 0.0002999803618995596,
"loss": 5.6652,
"step": 2110
},
{
"epoch": 0.9501347708894878,
"grad_norm": 3.71875,
"learning_rate": 0.00029997853609149797,
"loss": 5.7413,
"step": 2115
},
{
"epoch": 0.9523809523809523,
"grad_norm": 3.3125,
"learning_rate": 0.00029997662914519983,
"loss": 5.7038,
"step": 2120
},
{
"epoch": 0.9546271338724169,
"grad_norm": 3.546875,
"learning_rate": 0.0002999746410618116,
"loss": 5.6402,
"step": 2125
},
{
"epoch": 0.9568733153638814,
"grad_norm": 3.09375,
"learning_rate": 0.00029997257184252827,
"loss": 5.5762,
"step": 2130
},
{
"epoch": 0.9591194968553459,
"grad_norm": 3.421875,
"learning_rate": 0.00029997042148859374,
"loss": 5.7327,
"step": 2135
},
{
"epoch": 0.9613656783468104,
"grad_norm": 3.296875,
"learning_rate": 0.0002999681900013006,
"loss": 5.6974,
"step": 2140
},
{
"epoch": 0.9636118598382749,
"grad_norm": 3.140625,
"learning_rate": 0.0002999658773819903,
"loss": 5.7185,
"step": 2145
},
{
"epoch": 0.9658580413297394,
"grad_norm": 3.34375,
"learning_rate": 0.00029996348363205296,
"loss": 5.7269,
"step": 2150
},
{
"epoch": 0.968104222821204,
"grad_norm": 3.0,
"learning_rate": 0.0002999610087529275,
"loss": 5.6719,
"step": 2155
},
{
"epoch": 0.9703504043126685,
"grad_norm": 3.375,
"learning_rate": 0.00029995845274610164,
"loss": 5.6067,
"step": 2160
},
{
"epoch": 0.972596585804133,
"grad_norm": 3.25,
"learning_rate": 0.00029995581561311185,
"loss": 5.612,
"step": 2165
},
{
"epoch": 0.9748427672955975,
"grad_norm": 3.390625,
"learning_rate": 0.00029995309735554327,
"loss": 5.6163,
"step": 2170
},
{
"epoch": 0.977088948787062,
"grad_norm": 3.265625,
"learning_rate": 0.00029995029797503007,
"loss": 5.6468,
"step": 2175
},
{
"epoch": 0.9793351302785265,
"grad_norm": 3.03125,
"learning_rate": 0.00029994741747325487,
"loss": 5.6653,
"step": 2180
},
{
"epoch": 0.981581311769991,
"grad_norm": 3.1875,
"learning_rate": 0.00029994445585194925,
"loss": 5.6416,
"step": 2185
},
{
"epoch": 0.9838274932614556,
"grad_norm": 3.09375,
"learning_rate": 0.00029994141311289347,
"loss": 5.5982,
"step": 2190
},
{
"epoch": 0.9860736747529201,
"grad_norm": 3.328125,
"learning_rate": 0.00029993828925791664,
"loss": 5.6288,
"step": 2195
},
{
"epoch": 0.9883198562443846,
"grad_norm": 3.203125,
"learning_rate": 0.0002999350842888965,
"loss": 5.6725,
"step": 2200
},
{
"epoch": 0.9905660377358491,
"grad_norm": 3.40625,
"learning_rate": 0.0002999317982077596,
"loss": 5.6444,
"step": 2205
},
{
"epoch": 0.9928122192273136,
"grad_norm": 2.921875,
"learning_rate": 0.00029992843101648144,
"loss": 5.6642,
"step": 2210
},
{
"epoch": 0.995058400718778,
"grad_norm": 3.015625,
"learning_rate": 0.00029992498271708595,
"loss": 5.6011,
"step": 2215
},
{
"epoch": 0.9973045822102425,
"grad_norm": 2.90625,
"learning_rate": 0.00029992145331164596,
"loss": 5.6432,
"step": 2220
},
{
"epoch": 0.9995507637017071,
"grad_norm": 3.140625,
"learning_rate": 0.0002999178428022831,
"loss": 5.6428,
"step": 2225
},
{
"epoch": 1.0017969451931716,
"grad_norm": 3.265625,
"learning_rate": 0.0002999141511911678,
"loss": 5.5542,
"step": 2230
},
{
"epoch": 1.0040431266846361,
"grad_norm": 3.296875,
"learning_rate": 0.000299910378480519,
"loss": 5.6403,
"step": 2235
},
{
"epoch": 1.0062893081761006,
"grad_norm": 3.21875,
"learning_rate": 0.0002999065246726047,
"loss": 5.5451,
"step": 2240
},
{
"epoch": 1.0085354896675651,
"grad_norm": 3.0625,
"learning_rate": 0.0002999025897697414,
"loss": 5.6575,
"step": 2245
},
{
"epoch": 1.0107816711590296,
"grad_norm": 3.140625,
"learning_rate": 0.0002998985737742945,
"loss": 5.5892,
"step": 2250
},
{
"epoch": 1.013027852650494,
"grad_norm": 3.203125,
"learning_rate": 0.0002998944766886781,
"loss": 5.6127,
"step": 2255
},
{
"epoch": 1.0152740341419586,
"grad_norm": 3.078125,
"learning_rate": 0.000299890298515355,
"loss": 5.5885,
"step": 2260
},
{
"epoch": 1.017520215633423,
"grad_norm": 3.265625,
"learning_rate": 0.0002998860392568368,
"loss": 5.5215,
"step": 2265
},
{
"epoch": 1.0197663971248876,
"grad_norm": 3.171875,
"learning_rate": 0.00029988169891568373,
"loss": 5.6074,
"step": 2270
},
{
"epoch": 1.0220125786163523,
"grad_norm": 3.171875,
"learning_rate": 0.00029987727749450506,
"loss": 5.6192,
"step": 2275
},
{
"epoch": 1.0242587601078168,
"grad_norm": 3.328125,
"learning_rate": 0.00029987277499595843,
"loss": 5.5663,
"step": 2280
},
{
"epoch": 1.0265049415992813,
"grad_norm": 3.265625,
"learning_rate": 0.0002998681914227504,
"loss": 5.5862,
"step": 2285
},
{
"epoch": 1.0287511230907458,
"grad_norm": 3.0,
"learning_rate": 0.0002998635267776363,
"loss": 5.5536,
"step": 2290
},
{
"epoch": 1.0309973045822103,
"grad_norm": 3.3125,
"learning_rate": 0.0002998587810634201,
"loss": 5.5818,
"step": 2295
},
{
"epoch": 1.0332434860736748,
"grad_norm": 3.234375,
"learning_rate": 0.0002998539542829546,
"loss": 5.6147,
"step": 2300
},
{
"epoch": 1.0354896675651393,
"grad_norm": 3.09375,
"learning_rate": 0.00029984904643914114,
"loss": 5.6629,
"step": 2305
},
{
"epoch": 1.0377358490566038,
"grad_norm": 3.15625,
"learning_rate": 0.00029984405753493006,
"loss": 5.5412,
"step": 2310
},
{
"epoch": 1.0399820305480683,
"grad_norm": 2.984375,
"learning_rate": 0.00029983898757332024,
"loss": 5.5598,
"step": 2315
},
{
"epoch": 1.0422282120395328,
"grad_norm": 2.96875,
"learning_rate": 0.0002998338365573593,
"loss": 5.6111,
"step": 2320
},
{
"epoch": 1.0444743935309972,
"grad_norm": 3.234375,
"learning_rate": 0.0002998286044901436,
"loss": 5.4899,
"step": 2325
},
{
"epoch": 1.0467205750224617,
"grad_norm": 3.453125,
"learning_rate": 0.0002998232913748184,
"loss": 5.5567,
"step": 2330
},
{
"epoch": 1.0489667565139262,
"grad_norm": 3.40625,
"learning_rate": 0.0002998178972145773,
"loss": 5.4968,
"step": 2335
},
{
"epoch": 1.0512129380053907,
"grad_norm": 3.03125,
"learning_rate": 0.000299812422012663,
"loss": 5.6119,
"step": 2340
},
{
"epoch": 1.0534591194968554,
"grad_norm": 3.15625,
"learning_rate": 0.0002998068657723666,
"loss": 5.5563,
"step": 2345
},
{
"epoch": 1.05570530098832,
"grad_norm": 3.203125,
"learning_rate": 0.0002998012284970282,
"loss": 5.5985,
"step": 2350
},
{
"epoch": 1.0579514824797844,
"grad_norm": 3.46875,
"learning_rate": 0.00029979551019003643,
"loss": 5.5002,
"step": 2355
},
{
"epoch": 1.060197663971249,
"grad_norm": 3.046875,
"learning_rate": 0.0002997897108548286,
"loss": 5.6114,
"step": 2360
},
{
"epoch": 1.0624438454627134,
"grad_norm": 3.140625,
"learning_rate": 0.00029978383049489093,
"loss": 5.5056,
"step": 2365
},
{
"epoch": 1.064690026954178,
"grad_norm": 3.109375,
"learning_rate": 0.0002997778691137582,
"loss": 5.515,
"step": 2370
},
{
"epoch": 1.0669362084456424,
"grad_norm": 3.15625,
"learning_rate": 0.00029977182671501383,
"loss": 5.5303,
"step": 2375
},
{
"epoch": 1.069182389937107,
"grad_norm": 3.140625,
"learning_rate": 0.00029976570330229006,
"loss": 5.5147,
"step": 2380
},
{
"epoch": 1.0714285714285714,
"grad_norm": 3.109375,
"learning_rate": 0.00029975949887926784,
"loss": 5.5098,
"step": 2385
},
{
"epoch": 1.073674752920036,
"grad_norm": 3.046875,
"learning_rate": 0.00029975321344967676,
"loss": 5.5533,
"step": 2390
},
{
"epoch": 1.0759209344115004,
"grad_norm": 3.28125,
"learning_rate": 0.000299746847017295,
"loss": 5.5429,
"step": 2395
},
{
"epoch": 1.0781671159029649,
"grad_norm": 3.265625,
"learning_rate": 0.00029974039958594967,
"loss": 5.508,
"step": 2400
},
{
"epoch": 1.0804132973944294,
"grad_norm": 3.1875,
"learning_rate": 0.0002997338711595165,
"loss": 5.5494,
"step": 2405
},
{
"epoch": 1.082659478885894,
"grad_norm": 3.203125,
"learning_rate": 0.00029972726174191965,
"loss": 5.4273,
"step": 2410
},
{
"epoch": 1.0849056603773586,
"grad_norm": 3.0625,
"learning_rate": 0.00029972057133713235,
"loss": 5.5474,
"step": 2415
},
{
"epoch": 1.087151841868823,
"grad_norm": 2.84375,
"learning_rate": 0.00029971379994917624,
"loss": 5.5008,
"step": 2420
},
{
"epoch": 1.0893980233602876,
"grad_norm": 3.359375,
"learning_rate": 0.00029970694758212177,
"loss": 5.4682,
"step": 2425
},
{
"epoch": 1.091644204851752,
"grad_norm": 3.0,
"learning_rate": 0.000299700014240088,
"loss": 5.4666,
"step": 2430
},
{
"epoch": 1.0938903863432166,
"grad_norm": 3.3125,
"learning_rate": 0.00029969299992724273,
"loss": 5.5844,
"step": 2435
},
{
"epoch": 1.096136567834681,
"grad_norm": 3.3125,
"learning_rate": 0.00029968590464780247,
"loss": 5.5141,
"step": 2440
},
{
"epoch": 1.0983827493261455,
"grad_norm": 3.046875,
"learning_rate": 0.0002996787284060322,
"loss": 5.4897,
"step": 2445
},
{
"epoch": 1.10062893081761,
"grad_norm": 3.125,
"learning_rate": 0.00029967147120624573,
"loss": 5.4318,
"step": 2450
},
{
"epoch": 1.1028751123090745,
"grad_norm": 3.4375,
"learning_rate": 0.00029966413305280553,
"loss": 5.506,
"step": 2455
},
{
"epoch": 1.105121293800539,
"grad_norm": 3.390625,
"learning_rate": 0.00029965671395012274,
"loss": 5.4363,
"step": 2460
},
{
"epoch": 1.1073674752920035,
"grad_norm": 3.265625,
"learning_rate": 0.0002996492139026571,
"loss": 5.4077,
"step": 2465
},
{
"epoch": 1.109613656783468,
"grad_norm": 3.265625,
"learning_rate": 0.000299641632914917,
"loss": 5.4435,
"step": 2470
},
{
"epoch": 1.1118598382749325,
"grad_norm": 3.078125,
"learning_rate": 0.0002996339709914596,
"loss": 5.4641,
"step": 2475
},
{
"epoch": 1.1141060197663972,
"grad_norm": 3.015625,
"learning_rate": 0.0002996262281368905,
"loss": 5.5053,
"step": 2480
},
{
"epoch": 1.1163522012578617,
"grad_norm": 3.34375,
"learning_rate": 0.0002996184043558642,
"loss": 5.3987,
"step": 2485
},
{
"epoch": 1.1185983827493262,
"grad_norm": 3.03125,
"learning_rate": 0.0002996104996530837,
"loss": 5.6063,
"step": 2490
},
{
"epoch": 1.1208445642407907,
"grad_norm": 3.328125,
"learning_rate": 0.0002996025140333006,
"loss": 5.4782,
"step": 2495
},
{
"epoch": 1.1230907457322552,
"grad_norm": 3.25,
"learning_rate": 0.00029959444750131533,
"loss": 5.4836,
"step": 2500
},
{
"epoch": 1.1253369272237197,
"grad_norm": 3.140625,
"learning_rate": 0.0002995863000619768,
"loss": 5.5181,
"step": 2505
},
{
"epoch": 1.1275831087151842,
"grad_norm": 3.1875,
"learning_rate": 0.0002995780717201825,
"loss": 5.4469,
"step": 2510
},
{
"epoch": 1.1298292902066487,
"grad_norm": 3.03125,
"learning_rate": 0.0002995697624808788,
"loss": 5.4445,
"step": 2515
},
{
"epoch": 1.1320754716981132,
"grad_norm": 3.125,
"learning_rate": 0.00029956137234906044,
"loss": 5.4844,
"step": 2520
},
{
"epoch": 1.1343216531895777,
"grad_norm": 2.953125,
"learning_rate": 0.00029955290132977093,
"loss": 5.5633,
"step": 2525
},
{
"epoch": 1.1365678346810422,
"grad_norm": 3.109375,
"learning_rate": 0.0002995443494281024,
"loss": 5.4724,
"step": 2530
},
{
"epoch": 1.1388140161725067,
"grad_norm": 3.34375,
"learning_rate": 0.00029953571664919547,
"loss": 5.4786,
"step": 2535
},
{
"epoch": 1.1410601976639712,
"grad_norm": 3.328125,
"learning_rate": 0.0002995270029982396,
"loss": 5.5004,
"step": 2540
},
{
"epoch": 1.1433063791554359,
"grad_norm": 3.0625,
"learning_rate": 0.00029951820848047255,
"loss": 5.4758,
"step": 2545
},
{
"epoch": 1.1455525606469004,
"grad_norm": 3.0,
"learning_rate": 0.0002995093331011811,
"loss": 5.4789,
"step": 2550
},
{
"epoch": 1.1477987421383649,
"grad_norm": 3.03125,
"learning_rate": 0.00029950037686570023,
"loss": 5.3991,
"step": 2555
},
{
"epoch": 1.1500449236298294,
"grad_norm": 3.3125,
"learning_rate": 0.0002994913397794138,
"loss": 5.5046,
"step": 2560
},
{
"epoch": 1.1522911051212938,
"grad_norm": 3.46875,
"learning_rate": 0.00029948222184775415,
"loss": 5.5293,
"step": 2565
},
{
"epoch": 1.1545372866127583,
"grad_norm": 3.125,
"learning_rate": 0.00029947302307620227,
"loss": 5.4079,
"step": 2570
},
{
"epoch": 1.1567834681042228,
"grad_norm": 3.203125,
"learning_rate": 0.0002994637434702877,
"loss": 5.425,
"step": 2575
},
{
"epoch": 1.1590296495956873,
"grad_norm": 3.296875,
"learning_rate": 0.0002994543830355886,
"loss": 5.4591,
"step": 2580
},
{
"epoch": 1.1612758310871518,
"grad_norm": 3.296875,
"learning_rate": 0.0002994449417777317,
"loss": 5.5263,
"step": 2585
},
{
"epoch": 1.1635220125786163,
"grad_norm": 3.140625,
"learning_rate": 0.00029943541970239233,
"loss": 5.4458,
"step": 2590
},
{
"epoch": 1.1657681940700808,
"grad_norm": 3.1875,
"learning_rate": 0.00029942581681529447,
"loss": 5.4449,
"step": 2595
},
{
"epoch": 1.1680143755615453,
"grad_norm": 3.34375,
"learning_rate": 0.00029941613312221046,
"loss": 5.5558,
"step": 2600
},
{
"epoch": 1.1702605570530098,
"grad_norm": 3.0,
"learning_rate": 0.00029940636862896145,
"loss": 5.5165,
"step": 2605
},
{
"epoch": 1.1725067385444743,
"grad_norm": 3.3125,
"learning_rate": 0.0002993965233414171,
"loss": 5.4624,
"step": 2610
},
{
"epoch": 1.1747529200359388,
"grad_norm": 3.203125,
"learning_rate": 0.0002993865972654955,
"loss": 5.4336,
"step": 2615
},
{
"epoch": 1.1769991015274035,
"grad_norm": 3.5,
"learning_rate": 0.0002993765904071635,
"loss": 5.5293,
"step": 2620
},
{
"epoch": 1.179245283018868,
"grad_norm": 3.15625,
"learning_rate": 0.00029936650277243633,
"loss": 5.5603,
"step": 2625
},
{
"epoch": 1.1814914645103325,
"grad_norm": 3.140625,
"learning_rate": 0.0002993563343673779,
"loss": 5.4785,
"step": 2630
},
{
"epoch": 1.183737646001797,
"grad_norm": 3.09375,
"learning_rate": 0.0002993460851981007,
"loss": 5.4188,
"step": 2635
},
{
"epoch": 1.1859838274932615,
"grad_norm": 3.078125,
"learning_rate": 0.00029933575527076565,
"loss": 5.5139,
"step": 2640
},
{
"epoch": 1.188230008984726,
"grad_norm": 3.015625,
"learning_rate": 0.0002993253445915823,
"loss": 5.3998,
"step": 2645
},
{
"epoch": 1.1904761904761905,
"grad_norm": 3.328125,
"learning_rate": 0.0002993148531668087,
"loss": 5.5066,
"step": 2650
},
{
"epoch": 1.192722371967655,
"grad_norm": 3.125,
"learning_rate": 0.0002993042810027514,
"loss": 5.416,
"step": 2655
},
{
"epoch": 1.1949685534591195,
"grad_norm": 3.171875,
"learning_rate": 0.0002992936281057656,
"loss": 5.4367,
"step": 2660
},
{
"epoch": 1.197214734950584,
"grad_norm": 3.125,
"learning_rate": 0.000299282894482255,
"loss": 5.3912,
"step": 2665
},
{
"epoch": 1.1994609164420484,
"grad_norm": 2.9375,
"learning_rate": 0.00029927208013867164,
"loss": 5.4456,
"step": 2670
},
{
"epoch": 1.201707097933513,
"grad_norm": 3.296875,
"learning_rate": 0.0002992611850815163,
"loss": 5.5036,
"step": 2675
},
{
"epoch": 1.2039532794249777,
"grad_norm": 3.234375,
"learning_rate": 0.0002992502093173383,
"loss": 5.4467,
"step": 2680
},
{
"epoch": 1.2061994609164421,
"grad_norm": 3.375,
"learning_rate": 0.0002992391528527353,
"loss": 5.3611,
"step": 2685
},
{
"epoch": 1.2084456424079066,
"grad_norm": 3.359375,
"learning_rate": 0.00029922801569435366,
"loss": 5.4635,
"step": 2690
},
{
"epoch": 1.2106918238993711,
"grad_norm": 3.671875,
"learning_rate": 0.00029921679784888797,
"loss": 5.4823,
"step": 2695
},
{
"epoch": 1.2129380053908356,
"grad_norm": 2.875,
"learning_rate": 0.0002992054993230816,
"loss": 5.378,
"step": 2700
},
{
"epoch": 1.2151841868823001,
"grad_norm": 2.765625,
"learning_rate": 0.0002991941201237263,
"loss": 5.4737,
"step": 2705
},
{
"epoch": 1.2174303683737646,
"grad_norm": 3.0625,
"learning_rate": 0.0002991826602576624,
"loss": 5.4399,
"step": 2710
},
{
"epoch": 1.219676549865229,
"grad_norm": 3.046875,
"learning_rate": 0.00029917111973177857,
"loss": 5.4663,
"step": 2715
},
{
"epoch": 1.2219227313566936,
"grad_norm": 3.484375,
"learning_rate": 0.00029915949855301204,
"loss": 5.3946,
"step": 2720
},
{
"epoch": 1.224168912848158,
"grad_norm": 2.953125,
"learning_rate": 0.0002991477967283485,
"loss": 5.4415,
"step": 2725
},
{
"epoch": 1.2264150943396226,
"grad_norm": 3.125,
"learning_rate": 0.00029913601426482226,
"loss": 5.3648,
"step": 2730
},
{
"epoch": 1.228661275831087,
"grad_norm": 2.953125,
"learning_rate": 0.00029912415116951593,
"loss": 5.4543,
"step": 2735
},
{
"epoch": 1.2309074573225516,
"grad_norm": 2.921875,
"learning_rate": 0.0002991122074495606,
"loss": 5.381,
"step": 2740
},
{
"epoch": 1.233153638814016,
"grad_norm": 3.015625,
"learning_rate": 0.0002991001831121359,
"loss": 5.4367,
"step": 2745
},
{
"epoch": 1.2353998203054806,
"grad_norm": 3.796875,
"learning_rate": 0.00029908807816446994,
"loss": 5.5144,
"step": 2750
},
{
"epoch": 1.2376460017969453,
"grad_norm": 3.140625,
"learning_rate": 0.0002990758926138392,
"loss": 5.4193,
"step": 2755
},
{
"epoch": 1.2398921832884098,
"grad_norm": 3.078125,
"learning_rate": 0.0002990636264675687,
"loss": 5.4758,
"step": 2760
},
{
"epoch": 1.2421383647798743,
"grad_norm": 3.265625,
"learning_rate": 0.00029905127973303176,
"loss": 5.4093,
"step": 2765
},
{
"epoch": 1.2443845462713388,
"grad_norm": 3.015625,
"learning_rate": 0.00029903885241765036,
"loss": 5.4189,
"step": 2770
},
{
"epoch": 1.2466307277628033,
"grad_norm": 2.90625,
"learning_rate": 0.0002990263445288947,
"loss": 5.4447,
"step": 2775
},
{
"epoch": 1.2488769092542678,
"grad_norm": 3.03125,
"learning_rate": 0.0002990137560742836,
"loss": 5.3926,
"step": 2780
},
{
"epoch": 1.2511230907457322,
"grad_norm": 3.203125,
"learning_rate": 0.00029900108706138416,
"loss": 5.3857,
"step": 2785
},
{
"epoch": 1.2533692722371967,
"grad_norm": 2.890625,
"learning_rate": 0.000298988337497812,
"loss": 5.4141,
"step": 2790
},
{
"epoch": 1.2556154537286612,
"grad_norm": 3.0625,
"learning_rate": 0.0002989755073912311,
"loss": 5.422,
"step": 2795
},
{
"epoch": 1.2578616352201257,
"grad_norm": 3.1875,
"learning_rate": 0.0002989625967493541,
"loss": 5.3838,
"step": 2800
},
{
"epoch": 1.2601078167115902,
"grad_norm": 3.046875,
"learning_rate": 0.00029894960557994146,
"loss": 5.5335,
"step": 2805
},
{
"epoch": 1.262353998203055,
"grad_norm": 2.9375,
"learning_rate": 0.00029893653389080274,
"loss": 5.3528,
"step": 2810
},
{
"epoch": 1.2646001796945194,
"grad_norm": 3.15625,
"learning_rate": 0.0002989233816897954,
"loss": 5.3309,
"step": 2815
},
{
"epoch": 1.266846361185984,
"grad_norm": 3.09375,
"learning_rate": 0.0002989101489848256,
"loss": 5.4407,
"step": 2820
},
{
"epoch": 1.2690925426774484,
"grad_norm": 3.421875,
"learning_rate": 0.0002988968357838477,
"loss": 5.3808,
"step": 2825
},
{
"epoch": 1.271338724168913,
"grad_norm": 2.9375,
"learning_rate": 0.0002988834420948647,
"loss": 5.4058,
"step": 2830
},
{
"epoch": 1.2735849056603774,
"grad_norm": 2.953125,
"learning_rate": 0.0002988699679259275,
"loss": 5.4674,
"step": 2835
},
{
"epoch": 1.275831087151842,
"grad_norm": 3.0,
"learning_rate": 0.00029885641328513594,
"loss": 5.4242,
"step": 2840
},
{
"epoch": 1.2780772686433064,
"grad_norm": 3.109375,
"learning_rate": 0.0002988427781806379,
"loss": 5.4332,
"step": 2845
},
{
"epoch": 1.280323450134771,
"grad_norm": 2.953125,
"learning_rate": 0.0002988290626206297,
"loss": 5.3583,
"step": 2850
},
{
"epoch": 1.2825696316262354,
"grad_norm": 3.328125,
"learning_rate": 0.000298815266613356,
"loss": 5.3448,
"step": 2855
},
{
"epoch": 1.2848158131176999,
"grad_norm": 3.03125,
"learning_rate": 0.0002988013901671099,
"loss": 5.4957,
"step": 2860
},
{
"epoch": 1.2870619946091644,
"grad_norm": 3.078125,
"learning_rate": 0.0002987874332902328,
"loss": 5.4692,
"step": 2865
},
{
"epoch": 1.2893081761006289,
"grad_norm": 3.09375,
"learning_rate": 0.0002987733959911144,
"loss": 5.3743,
"step": 2870
},
{
"epoch": 1.2915543575920934,
"grad_norm": 2.890625,
"learning_rate": 0.00029875927827819286,
"loss": 5.368,
"step": 2875
},
{
"epoch": 1.2938005390835579,
"grad_norm": 3.046875,
"learning_rate": 0.00029874508015995463,
"loss": 5.3748,
"step": 2880
},
{
"epoch": 1.2960467205750223,
"grad_norm": 3.140625,
"learning_rate": 0.0002987308016449344,
"loss": 5.3995,
"step": 2885
},
{
"epoch": 1.2982929020664868,
"grad_norm": 3.1875,
"learning_rate": 0.00029871644274171534,
"loss": 5.3753,
"step": 2890
},
{
"epoch": 1.3005390835579516,
"grad_norm": 3.234375,
"learning_rate": 0.00029870200345892876,
"loss": 5.4296,
"step": 2895
},
{
"epoch": 1.302785265049416,
"grad_norm": 3.09375,
"learning_rate": 0.00029868748380525444,
"loss": 5.315,
"step": 2900
},
{
"epoch": 1.3050314465408805,
"grad_norm": 3.125,
"learning_rate": 0.0002986728837894205,
"loss": 5.4592,
"step": 2905
},
{
"epoch": 1.307277628032345,
"grad_norm": 3.203125,
"learning_rate": 0.00029865820342020325,
"loss": 5.4735,
"step": 2910
},
{
"epoch": 1.3095238095238095,
"grad_norm": 3.109375,
"learning_rate": 0.0002986434427064273,
"loss": 5.3768,
"step": 2915
},
{
"epoch": 1.311769991015274,
"grad_norm": 2.890625,
"learning_rate": 0.0002986286016569657,
"loss": 5.381,
"step": 2920
},
{
"epoch": 1.3140161725067385,
"grad_norm": 2.890625,
"learning_rate": 0.0002986136802807396,
"loss": 5.4079,
"step": 2925
},
{
"epoch": 1.316262353998203,
"grad_norm": 3.21875,
"learning_rate": 0.00029859867858671857,
"loss": 5.435,
"step": 2930
},
{
"epoch": 1.3185085354896675,
"grad_norm": 3.171875,
"learning_rate": 0.00029858359658392045,
"loss": 5.4919,
"step": 2935
},
{
"epoch": 1.320754716981132,
"grad_norm": 2.859375,
"learning_rate": 0.00029856843428141127,
"loss": 5.3849,
"step": 2940
},
{
"epoch": 1.3230008984725967,
"grad_norm": 3.703125,
"learning_rate": 0.00029855319168830543,
"loss": 5.4001,
"step": 2945
},
{
"epoch": 1.3252470799640612,
"grad_norm": 3.375,
"learning_rate": 0.0002985378688137656,
"loss": 5.5048,
"step": 2950
},
{
"epoch": 1.3274932614555257,
"grad_norm": 3.03125,
"learning_rate": 0.00029852246566700253,
"loss": 5.367,
"step": 2955
},
{
"epoch": 1.3297394429469902,
"grad_norm": 2.921875,
"learning_rate": 0.0002985069822572754,
"loss": 5.3137,
"step": 2960
},
{
"epoch": 1.3319856244384547,
"grad_norm": 3.15625,
"learning_rate": 0.0002984914185938916,
"loss": 5.3961,
"step": 2965
},
{
"epoch": 1.3342318059299192,
"grad_norm": 3.1875,
"learning_rate": 0.0002984757746862068,
"loss": 5.4488,
"step": 2970
},
{
"epoch": 1.3364779874213837,
"grad_norm": 3.171875,
"learning_rate": 0.00029846005054362474,
"loss": 5.4318,
"step": 2975
},
{
"epoch": 1.3387241689128482,
"grad_norm": 2.96875,
"learning_rate": 0.0002984442461755977,
"loss": 5.3834,
"step": 2980
},
{
"epoch": 1.3409703504043127,
"grad_norm": 3.0625,
"learning_rate": 0.00029842836159162583,
"loss": 5.4205,
"step": 2985
},
{
"epoch": 1.3432165318957772,
"grad_norm": 2.90625,
"learning_rate": 0.0002984123968012577,
"loss": 5.4352,
"step": 2990
},
{
"epoch": 1.3454627133872417,
"grad_norm": 3.03125,
"learning_rate": 0.0002983963518140901,
"loss": 5.4451,
"step": 2995
},
{
"epoch": 1.3477088948787062,
"grad_norm": 3.0625,
"learning_rate": 0.00029838022663976793,
"loss": 5.3171,
"step": 3000
},
{
"epoch": 1.3477088948787062,
"eval_loss": 5.344548225402832,
"eval_runtime": 16.0596,
"eval_samples_per_second": 1931.124,
"eval_steps_per_second": 241.414,
"step": 3000
},
{
"epoch": 1.3499550763701706,
"grad_norm": 2.984375,
"learning_rate": 0.0002983640212879844,
"loss": 5.4371,
"step": 3005
},
{
"epoch": 1.3522012578616351,
"grad_norm": 3.265625,
"learning_rate": 0.0002983477357684809,
"loss": 5.3769,
"step": 3010
},
{
"epoch": 1.3544474393530996,
"grad_norm": 3.421875,
"learning_rate": 0.0002983313700910468,
"loss": 5.4952,
"step": 3015
},
{
"epoch": 1.3566936208445641,
"grad_norm": 2.96875,
"learning_rate": 0.00029831492426552,
"loss": 5.3494,
"step": 3020
},
{
"epoch": 1.3589398023360286,
"grad_norm": 3.0625,
"learning_rate": 0.00029829839830178636,
"loss": 5.4431,
"step": 3025
},
{
"epoch": 1.3611859838274933,
"grad_norm": 2.953125,
"learning_rate": 0.00029828179220977994,
"loss": 5.3644,
"step": 3030
},
{
"epoch": 1.3634321653189578,
"grad_norm": 3.1875,
"learning_rate": 0.000298265105999483,
"loss": 5.3982,
"step": 3035
},
{
"epoch": 1.3656783468104223,
"grad_norm": 3.03125,
"learning_rate": 0.00029824833968092595,
"loss": 5.3913,
"step": 3040
},
{
"epoch": 1.3679245283018868,
"grad_norm": 2.96875,
"learning_rate": 0.00029823149326418735,
"loss": 5.3851,
"step": 3045
},
{
"epoch": 1.3701707097933513,
"grad_norm": 3.0,
"learning_rate": 0.0002982145667593939,
"loss": 5.3206,
"step": 3050
},
{
"epoch": 1.3724168912848158,
"grad_norm": 3.203125,
"learning_rate": 0.00029819756017672043,
"loss": 5.3429,
"step": 3055
},
{
"epoch": 1.3746630727762803,
"grad_norm": 3.25,
"learning_rate": 0.00029818047352639,
"loss": 5.4596,
"step": 3060
},
{
"epoch": 1.3769092542677448,
"grad_norm": 3.078125,
"learning_rate": 0.00029816330681867366,
"loss": 5.3423,
"step": 3065
},
{
"epoch": 1.3791554357592093,
"grad_norm": 2.875,
"learning_rate": 0.0002981460600638907,
"loss": 5.3283,
"step": 3070
},
{
"epoch": 1.3814016172506738,
"grad_norm": 2.921875,
"learning_rate": 0.00029812873327240844,
"loss": 5.3159,
"step": 3075
},
{
"epoch": 1.3836477987421385,
"grad_norm": 2.890625,
"learning_rate": 0.0002981113264546424,
"loss": 5.3529,
"step": 3080
},
{
"epoch": 1.385893980233603,
"grad_norm": 3.125,
"learning_rate": 0.0002980938396210561,
"loss": 5.46,
"step": 3085
},
{
"epoch": 1.3881401617250675,
"grad_norm": 2.890625,
"learning_rate": 0.00029807627278216126,
"loss": 5.4219,
"step": 3090
},
{
"epoch": 1.390386343216532,
"grad_norm": 3.03125,
"learning_rate": 0.0002980586259485177,
"loss": 5.4519,
"step": 3095
},
{
"epoch": 1.3926325247079965,
"grad_norm": 3.15625,
"learning_rate": 0.00029804089913073315,
"loss": 5.4067,
"step": 3100
},
{
"epoch": 1.394878706199461,
"grad_norm": 3.046875,
"learning_rate": 0.0002980230923394637,
"loss": 5.348,
"step": 3105
},
{
"epoch": 1.3971248876909255,
"grad_norm": 3.109375,
"learning_rate": 0.00029800520558541317,
"loss": 5.3693,
"step": 3110
},
{
"epoch": 1.39937106918239,
"grad_norm": 2.96875,
"learning_rate": 0.0002979872388793338,
"loss": 5.3537,
"step": 3115
},
{
"epoch": 1.4016172506738545,
"grad_norm": 2.75,
"learning_rate": 0.00029796919223202563,
"loss": 5.3571,
"step": 3120
},
{
"epoch": 1.403863432165319,
"grad_norm": 3.0,
"learning_rate": 0.0002979510656543369,
"loss": 5.3759,
"step": 3125
},
{
"epoch": 1.4061096136567834,
"grad_norm": 3.109375,
"learning_rate": 0.0002979328591571639,
"loss": 5.3222,
"step": 3130
},
{
"epoch": 1.408355795148248,
"grad_norm": 3.0,
"learning_rate": 0.00029791457275145085,
"loss": 5.2987,
"step": 3135
},
{
"epoch": 1.4106019766397124,
"grad_norm": 2.984375,
"learning_rate": 0.00029789620644819005,
"loss": 5.3843,
"step": 3140
},
{
"epoch": 1.412848158131177,
"grad_norm": 3.03125,
"learning_rate": 0.00029787776025842186,
"loss": 5.3461,
"step": 3145
},
{
"epoch": 1.4150943396226414,
"grad_norm": 3.015625,
"learning_rate": 0.00029785923419323467,
"loss": 5.3381,
"step": 3150
},
{
"epoch": 1.417340521114106,
"grad_norm": 2.890625,
"learning_rate": 0.0002978406282637648,
"loss": 5.3985,
"step": 3155
},
{
"epoch": 1.4195867026055704,
"grad_norm": 2.953125,
"learning_rate": 0.0002978219424811967,
"loss": 5.3383,
"step": 3160
},
{
"epoch": 1.4218328840970351,
"grad_norm": 3.125,
"learning_rate": 0.00029780317685676276,
"loss": 5.4033,
"step": 3165
},
{
"epoch": 1.4240790655884996,
"grad_norm": 3.03125,
"learning_rate": 0.0002977843314017433,
"loss": 5.4135,
"step": 3170
},
{
"epoch": 1.426325247079964,
"grad_norm": 3.0,
"learning_rate": 0.0002977654061274668,
"loss": 5.3461,
"step": 3175
},
{
"epoch": 1.4285714285714286,
"grad_norm": 3.0625,
"learning_rate": 0.0002977464010453095,
"loss": 5.281,
"step": 3180
},
{
"epoch": 1.430817610062893,
"grad_norm": 3.359375,
"learning_rate": 0.0002977273161666957,
"loss": 5.4328,
"step": 3185
},
{
"epoch": 1.4330637915543576,
"grad_norm": 3.0,
"learning_rate": 0.00029770815150309787,
"loss": 5.3081,
"step": 3190
},
{
"epoch": 1.435309973045822,
"grad_norm": 2.984375,
"learning_rate": 0.0002976889070660361,
"loss": 5.4198,
"step": 3195
},
{
"epoch": 1.4375561545372866,
"grad_norm": 2.90625,
"learning_rate": 0.0002976695828670787,
"loss": 5.3054,
"step": 3200
},
{
"epoch": 1.439802336028751,
"grad_norm": 2.984375,
"learning_rate": 0.00029765017891784175,
"loss": 5.4182,
"step": 3205
},
{
"epoch": 1.4420485175202156,
"grad_norm": 2.765625,
"learning_rate": 0.00029763069522998936,
"loss": 5.3818,
"step": 3210
},
{
"epoch": 1.44429469901168,
"grad_norm": 2.78125,
"learning_rate": 0.0002976111318152336,
"loss": 5.34,
"step": 3215
},
{
"epoch": 1.4465408805031448,
"grad_norm": 2.96875,
"learning_rate": 0.0002975914886853344,
"loss": 5.4218,
"step": 3220
},
{
"epoch": 1.4487870619946093,
"grad_norm": 3.078125,
"learning_rate": 0.00029757176585209957,
"loss": 5.3399,
"step": 3225
},
{
"epoch": 1.4510332434860738,
"grad_norm": 3.265625,
"learning_rate": 0.000297551963327385,
"loss": 5.2921,
"step": 3230
},
{
"epoch": 1.4532794249775383,
"grad_norm": 2.890625,
"learning_rate": 0.00029753208112309423,
"loss": 5.3799,
"step": 3235
},
{
"epoch": 1.4555256064690028,
"grad_norm": 2.890625,
"learning_rate": 0.00029751211925117897,
"loss": 5.2984,
"step": 3240
},
{
"epoch": 1.4577717879604672,
"grad_norm": 3.265625,
"learning_rate": 0.00029749207772363867,
"loss": 5.379,
"step": 3245
},
{
"epoch": 1.4600179694519317,
"grad_norm": 3.0,
"learning_rate": 0.0002974719565525207,
"loss": 5.3465,
"step": 3250
},
{
"epoch": 1.4622641509433962,
"grad_norm": 2.90625,
"learning_rate": 0.0002974517557499201,
"loss": 5.413,
"step": 3255
},
{
"epoch": 1.4645103324348607,
"grad_norm": 3.40625,
"learning_rate": 0.00029743147532798023,
"loss": 5.2814,
"step": 3260
},
{
"epoch": 1.4667565139263252,
"grad_norm": 2.96875,
"learning_rate": 0.00029741111529889194,
"loss": 5.3454,
"step": 3265
},
{
"epoch": 1.4690026954177897,
"grad_norm": 3.078125,
"learning_rate": 0.000297390675674894,
"loss": 5.3013,
"step": 3270
},
{
"epoch": 1.4712488769092542,
"grad_norm": 3.09375,
"learning_rate": 0.0002973701564682731,
"loss": 5.2762,
"step": 3275
},
{
"epoch": 1.4734950584007187,
"grad_norm": 3.015625,
"learning_rate": 0.00029734955769136377,
"loss": 5.3686,
"step": 3280
},
{
"epoch": 1.4757412398921832,
"grad_norm": 3.140625,
"learning_rate": 0.00029732887935654827,
"loss": 5.3697,
"step": 3285
},
{
"epoch": 1.4779874213836477,
"grad_norm": 2.953125,
"learning_rate": 0.0002973081214762568,
"loss": 5.2504,
"step": 3290
},
{
"epoch": 1.4802336028751122,
"grad_norm": 2.9375,
"learning_rate": 0.00029728728406296735,
"loss": 5.3318,
"step": 3295
},
{
"epoch": 1.482479784366577,
"grad_norm": 3.078125,
"learning_rate": 0.00029726636712920564,
"loss": 5.3078,
"step": 3300
},
{
"epoch": 1.4847259658580414,
"grad_norm": 3.046875,
"learning_rate": 0.0002972453706875453,
"loss": 5.3814,
"step": 3305
},
{
"epoch": 1.486972147349506,
"grad_norm": 2.875,
"learning_rate": 0.0002972242947506076,
"loss": 5.2753,
"step": 3310
},
{
"epoch": 1.4892183288409704,
"grad_norm": 3.046875,
"learning_rate": 0.0002972031393310619,
"loss": 5.3256,
"step": 3315
},
{
"epoch": 1.4914645103324349,
"grad_norm": 3.0625,
"learning_rate": 0.0002971819044416249,
"loss": 5.3758,
"step": 3320
},
{
"epoch": 1.4937106918238994,
"grad_norm": 2.75,
"learning_rate": 0.00029716059009506145,
"loss": 5.3209,
"step": 3325
},
{
"epoch": 1.4959568733153639,
"grad_norm": 3.109375,
"learning_rate": 0.000297139196304184,
"loss": 5.3075,
"step": 3330
},
{
"epoch": 1.4982030548068284,
"grad_norm": 2.859375,
"learning_rate": 0.0002971177230818527,
"loss": 5.3805,
"step": 3335
},
{
"epoch": 1.5004492362982929,
"grad_norm": 3.0625,
"learning_rate": 0.0002970961704409756,
"loss": 5.3156,
"step": 3340
},
{
"epoch": 1.5026954177897576,
"grad_norm": 2.90625,
"learning_rate": 0.0002970745383945084,
"loss": 5.3465,
"step": 3345
},
{
"epoch": 1.504941599281222,
"grad_norm": 3.015625,
"learning_rate": 0.00029705282695545454,
"loss": 5.3717,
"step": 3350
},
{
"epoch": 1.5071877807726866,
"grad_norm": 3.09375,
"learning_rate": 0.00029703103613686527,
"loss": 5.2288,
"step": 3355
},
{
"epoch": 1.509433962264151,
"grad_norm": 3.1875,
"learning_rate": 0.0002970091659518393,
"loss": 5.2978,
"step": 3360
},
{
"epoch": 1.5116801437556155,
"grad_norm": 2.9375,
"learning_rate": 0.0002969872164135234,
"loss": 5.2993,
"step": 3365
},
{
"epoch": 1.51392632524708,
"grad_norm": 3.0625,
"learning_rate": 0.00029696518753511173,
"loss": 5.3231,
"step": 3370
},
{
"epoch": 1.5161725067385445,
"grad_norm": 2.796875,
"learning_rate": 0.0002969430793298464,
"loss": 5.334,
"step": 3375
},
{
"epoch": 1.518418688230009,
"grad_norm": 3.03125,
"learning_rate": 0.00029692089181101696,
"loss": 5.2514,
"step": 3380
},
{
"epoch": 1.5206648697214735,
"grad_norm": 2.890625,
"learning_rate": 0.0002968986249919609,
"loss": 5.3403,
"step": 3385
},
{
"epoch": 1.522911051212938,
"grad_norm": 3.0625,
"learning_rate": 0.0002968762788860631,
"loss": 5.3209,
"step": 3390
},
{
"epoch": 1.5251572327044025,
"grad_norm": 3.125,
"learning_rate": 0.0002968538535067564,
"loss": 5.3657,
"step": 3395
},
{
"epoch": 1.527403414195867,
"grad_norm": 2.96875,
"learning_rate": 0.000296831348867521,
"loss": 5.3167,
"step": 3400
},
{
"epoch": 1.5296495956873315,
"grad_norm": 2.953125,
"learning_rate": 0.0002968087649818848,
"loss": 5.2753,
"step": 3405
},
{
"epoch": 1.531895777178796,
"grad_norm": 2.984375,
"learning_rate": 0.0002967861018634237,
"loss": 5.3678,
"step": 3410
},
{
"epoch": 1.5341419586702605,
"grad_norm": 3.265625,
"learning_rate": 0.00029676335952576074,
"loss": 5.3243,
"step": 3415
},
{
"epoch": 1.536388140161725,
"grad_norm": 3.109375,
"learning_rate": 0.0002967405379825668,
"loss": 5.2466,
"step": 3420
},
{
"epoch": 1.5386343216531895,
"grad_norm": 2.953125,
"learning_rate": 0.0002967176372475604,
"loss": 5.2428,
"step": 3425
},
{
"epoch": 1.540880503144654,
"grad_norm": 2.890625,
"learning_rate": 0.0002966946573345076,
"loss": 5.2614,
"step": 3430
},
{
"epoch": 1.5431266846361185,
"grad_norm": 2.921875,
"learning_rate": 0.00029667159825722206,
"loss": 5.3399,
"step": 3435
},
{
"epoch": 1.545372866127583,
"grad_norm": 3.0625,
"learning_rate": 0.00029664846002956506,
"loss": 5.2338,
"step": 3440
},
{
"epoch": 1.5476190476190477,
"grad_norm": 3.09375,
"learning_rate": 0.0002966252426654454,
"loss": 5.3445,
"step": 3445
},
{
"epoch": 1.5498652291105122,
"grad_norm": 2.875,
"learning_rate": 0.0002966019461788196,
"loss": 5.2916,
"step": 3450
},
{
"epoch": 1.5521114106019767,
"grad_norm": 2.9375,
"learning_rate": 0.0002965785705836915,
"loss": 5.3159,
"step": 3455
},
{
"epoch": 1.5543575920934412,
"grad_norm": 3.1875,
"learning_rate": 0.0002965551158941127,
"loss": 5.3027,
"step": 3460
},
{
"epoch": 1.5566037735849056,
"grad_norm": 2.96875,
"learning_rate": 0.0002965315821241823,
"loss": 5.2319,
"step": 3465
},
{
"epoch": 1.5588499550763701,
"grad_norm": 3.875,
"learning_rate": 0.00029650796928804685,
"loss": 5.3169,
"step": 3470
},
{
"epoch": 1.5610961365678346,
"grad_norm": 3.03125,
"learning_rate": 0.0002964842773999005,
"loss": 5.2524,
"step": 3475
},
{
"epoch": 1.5633423180592994,
"grad_norm": 2.921875,
"learning_rate": 0.0002964605064739849,
"loss": 5.3455,
"step": 3480
},
{
"epoch": 1.5655884995507638,
"grad_norm": 3.015625,
"learning_rate": 0.0002964366565245892,
"loss": 5.3241,
"step": 3485
},
{
"epoch": 1.5678346810422283,
"grad_norm": 3.015625,
"learning_rate": 0.00029641272756605023,
"loss": 5.301,
"step": 3490
},
{
"epoch": 1.5700808625336928,
"grad_norm": 3.0,
"learning_rate": 0.0002963887196127519,
"loss": 5.2987,
"step": 3495
},
{
"epoch": 1.5723270440251573,
"grad_norm": 2.96875,
"learning_rate": 0.00029636463267912607,
"loss": 5.2262,
"step": 3500
},
{
"epoch": 1.5745732255166218,
"grad_norm": 2.90625,
"learning_rate": 0.00029634046677965174,
"loss": 5.2556,
"step": 3505
},
{
"epoch": 1.5768194070080863,
"grad_norm": 2.90625,
"learning_rate": 0.00029631622192885553,
"loss": 5.3328,
"step": 3510
},
{
"epoch": 1.5790655884995508,
"grad_norm": 3.078125,
"learning_rate": 0.00029629189814131155,
"loss": 5.3252,
"step": 3515
},
{
"epoch": 1.5813117699910153,
"grad_norm": 3.03125,
"learning_rate": 0.0002962674954316413,
"loss": 5.2871,
"step": 3520
},
{
"epoch": 1.5835579514824798,
"grad_norm": 2.890625,
"learning_rate": 0.0002962430138145137,
"loss": 5.2723,
"step": 3525
},
{
"epoch": 1.5858041329739443,
"grad_norm": 2.765625,
"learning_rate": 0.000296218453304645,
"loss": 5.2836,
"step": 3530
},
{
"epoch": 1.5880503144654088,
"grad_norm": 3.015625,
"learning_rate": 0.00029619381391679923,
"loss": 5.3014,
"step": 3535
},
{
"epoch": 1.5902964959568733,
"grad_norm": 2.890625,
"learning_rate": 0.00029616909566578746,
"loss": 5.2194,
"step": 3540
},
{
"epoch": 1.5925426774483378,
"grad_norm": 2.875,
"learning_rate": 0.0002961442985664684,
"loss": 5.3363,
"step": 3545
},
{
"epoch": 1.5947888589398023,
"grad_norm": 2.875,
"learning_rate": 0.000296119422633748,
"loss": 5.2192,
"step": 3550
},
{
"epoch": 1.5970350404312668,
"grad_norm": 3.109375,
"learning_rate": 0.0002960944678825797,
"loss": 5.2585,
"step": 3555
},
{
"epoch": 1.5992812219227313,
"grad_norm": 3.40625,
"learning_rate": 0.0002960694343279643,
"loss": 5.4105,
"step": 3560
},
{
"epoch": 1.6015274034141957,
"grad_norm": 2.953125,
"learning_rate": 0.0002960443219849499,
"loss": 5.2834,
"step": 3565
},
{
"epoch": 1.6037735849056602,
"grad_norm": 2.953125,
"learning_rate": 0.0002960191308686321,
"loss": 5.2917,
"step": 3570
},
{
"epoch": 1.6060197663971247,
"grad_norm": 2.953125,
"learning_rate": 0.0002959938609941537,
"loss": 5.3014,
"step": 3575
},
{
"epoch": 1.6082659478885895,
"grad_norm": 3.0625,
"learning_rate": 0.00029596851237670494,
"loss": 5.2469,
"step": 3580
},
{
"epoch": 1.610512129380054,
"grad_norm": 3.046875,
"learning_rate": 0.00029594308503152344,
"loss": 5.2651,
"step": 3585
},
{
"epoch": 1.6127583108715184,
"grad_norm": 2.9375,
"learning_rate": 0.00029591757897389403,
"loss": 5.2144,
"step": 3590
},
{
"epoch": 1.615004492362983,
"grad_norm": 3.015625,
"learning_rate": 0.00029589199421914885,
"loss": 5.2536,
"step": 3595
},
{
"epoch": 1.6172506738544474,
"grad_norm": 2.90625,
"learning_rate": 0.0002958663307826674,
"loss": 5.2291,
"step": 3600
},
{
"epoch": 1.619496855345912,
"grad_norm": 2.875,
"learning_rate": 0.00029584058867987656,
"loss": 5.2936,
"step": 3605
},
{
"epoch": 1.6217430368373764,
"grad_norm": 3.171875,
"learning_rate": 0.00029581476792625035,
"loss": 5.3135,
"step": 3610
},
{
"epoch": 1.6239892183288411,
"grad_norm": 3.078125,
"learning_rate": 0.0002957888685373101,
"loss": 5.2395,
"step": 3615
},
{
"epoch": 1.6262353998203056,
"grad_norm": 3.015625,
"learning_rate": 0.0002957628905286245,
"loss": 5.2269,
"step": 3620
},
{
"epoch": 1.6284815813117701,
"grad_norm": 2.953125,
"learning_rate": 0.00029573683391580946,
"loss": 5.2192,
"step": 3625
},
{
"epoch": 1.6307277628032346,
"grad_norm": 3.109375,
"learning_rate": 0.000295710698714528,
"loss": 5.2539,
"step": 3630
},
{
"epoch": 1.632973944294699,
"grad_norm": 3.03125,
"learning_rate": 0.0002956844849404906,
"loss": 5.2506,
"step": 3635
},
{
"epoch": 1.6352201257861636,
"grad_norm": 2.78125,
"learning_rate": 0.00029565819260945483,
"loss": 5.2739,
"step": 3640
},
{
"epoch": 1.637466307277628,
"grad_norm": 3.03125,
"learning_rate": 0.00029563182173722555,
"loss": 5.232,
"step": 3645
},
{
"epoch": 1.6397124887690926,
"grad_norm": 2.890625,
"learning_rate": 0.0002956053723396548,
"loss": 5.3054,
"step": 3650
},
{
"epoch": 1.641958670260557,
"grad_norm": 2.9375,
"learning_rate": 0.0002955788444326418,
"loss": 5.2955,
"step": 3655
},
{
"epoch": 1.6442048517520216,
"grad_norm": 3.0,
"learning_rate": 0.00029555223803213305,
"loss": 5.2577,
"step": 3660
},
{
"epoch": 1.646451033243486,
"grad_norm": 2.96875,
"learning_rate": 0.00029552555315412216,
"loss": 5.2796,
"step": 3665
},
{
"epoch": 1.6486972147349506,
"grad_norm": 3.75,
"learning_rate": 0.0002954987898146499,
"loss": 5.3159,
"step": 3670
},
{
"epoch": 1.650943396226415,
"grad_norm": 2.890625,
"learning_rate": 0.0002954719480298043,
"loss": 5.2639,
"step": 3675
},
{
"epoch": 1.6531895777178796,
"grad_norm": 2.875,
"learning_rate": 0.00029544502781572035,
"loss": 5.2906,
"step": 3680
},
{
"epoch": 1.655435759209344,
"grad_norm": 4.75,
"learning_rate": 0.0002954180291885804,
"loss": 5.299,
"step": 3685
},
{
"epoch": 1.6576819407008085,
"grad_norm": 3.046875,
"learning_rate": 0.00029539095216461395,
"loss": 5.2026,
"step": 3690
},
{
"epoch": 1.659928122192273,
"grad_norm": 2.859375,
"learning_rate": 0.0002953637967600974,
"loss": 5.2159,
"step": 3695
},
{
"epoch": 1.6621743036837375,
"grad_norm": 2.953125,
"learning_rate": 0.0002953365629913544,
"loss": 5.22,
"step": 3700
},
{
"epoch": 1.664420485175202,
"grad_norm": 3.015625,
"learning_rate": 0.0002953092508747557,
"loss": 5.1528,
"step": 3705
},
{
"epoch": 1.6666666666666665,
"grad_norm": 3.09375,
"learning_rate": 0.0002952818604267193,
"loss": 5.234,
"step": 3710
},
{
"epoch": 1.668912848158131,
"grad_norm": 3.421875,
"learning_rate": 0.0002952543916637099,
"loss": 5.263,
"step": 3715
},
{
"epoch": 1.6711590296495957,
"grad_norm": 2.984375,
"learning_rate": 0.00029522684460223965,
"loss": 5.2879,
"step": 3720
},
{
"epoch": 1.6734052111410602,
"grad_norm": 3.046875,
"learning_rate": 0.0002951992192588676,
"loss": 5.2081,
"step": 3725
},
{
"epoch": 1.6756513926325247,
"grad_norm": 2.953125,
"learning_rate": 0.0002951715156501999,
"loss": 5.2688,
"step": 3730
},
{
"epoch": 1.6778975741239892,
"grad_norm": 3.0,
"learning_rate": 0.00029514373379288967,
"loss": 5.2266,
"step": 3735
},
{
"epoch": 1.6801437556154537,
"grad_norm": 2.859375,
"learning_rate": 0.0002951158737036372,
"loss": 5.2542,
"step": 3740
},
{
"epoch": 1.6823899371069182,
"grad_norm": 2.984375,
"learning_rate": 0.0002950879353991897,
"loss": 5.2341,
"step": 3745
},
{
"epoch": 1.684636118598383,
"grad_norm": 3.171875,
"learning_rate": 0.0002950599188963414,
"loss": 5.2238,
"step": 3750
},
{
"epoch": 1.6868823000898474,
"grad_norm": 3.09375,
"learning_rate": 0.0002950318242119337,
"loss": 5.3397,
"step": 3755
},
{
"epoch": 1.689128481581312,
"grad_norm": 3.015625,
"learning_rate": 0.0002950036513628547,
"loss": 5.2441,
"step": 3760
},
{
"epoch": 1.6913746630727764,
"grad_norm": 2.859375,
"learning_rate": 0.0002949754003660397,
"loss": 5.3238,
"step": 3765
},
{
"epoch": 1.693620844564241,
"grad_norm": 3.390625,
"learning_rate": 0.00029494707123847095,
"loss": 5.3302,
"step": 3770
},
{
"epoch": 1.6958670260557054,
"grad_norm": 3.28125,
"learning_rate": 0.0002949186639971777,
"loss": 5.2831,
"step": 3775
},
{
"epoch": 1.6981132075471699,
"grad_norm": 3.078125,
"learning_rate": 0.00029489017865923597,
"loss": 5.2566,
"step": 3780
},
{
"epoch": 1.7003593890386344,
"grad_norm": 2.9375,
"learning_rate": 0.00029486161524176893,
"loss": 5.2631,
"step": 3785
},
{
"epoch": 1.7026055705300989,
"grad_norm": 3.046875,
"learning_rate": 0.0002948329737619466,
"loss": 5.2597,
"step": 3790
},
{
"epoch": 1.7048517520215634,
"grad_norm": 3.265625,
"learning_rate": 0.0002948042542369859,
"loss": 5.2838,
"step": 3795
},
{
"epoch": 1.7070979335130279,
"grad_norm": 2.9375,
"learning_rate": 0.0002947754566841508,
"loss": 5.2681,
"step": 3800
},
{
"epoch": 1.7093441150044923,
"grad_norm": 3.046875,
"learning_rate": 0.00029474658112075197,
"loss": 5.3089,
"step": 3805
},
{
"epoch": 1.7115902964959568,
"grad_norm": 3.03125,
"learning_rate": 0.00029471762756414703,
"loss": 5.2663,
"step": 3810
},
{
"epoch": 1.7138364779874213,
"grad_norm": 2.953125,
"learning_rate": 0.00029468859603174065,
"loss": 5.2597,
"step": 3815
},
{
"epoch": 1.7160826594788858,
"grad_norm": 3.046875,
"learning_rate": 0.00029465948654098427,
"loss": 5.2646,
"step": 3820
},
{
"epoch": 1.7183288409703503,
"grad_norm": 2.890625,
"learning_rate": 0.0002946302991093761,
"loss": 5.2662,
"step": 3825
},
{
"epoch": 1.7205750224618148,
"grad_norm": 2.890625,
"learning_rate": 0.00029460103375446116,
"loss": 5.2176,
"step": 3830
},
{
"epoch": 1.7228212039532793,
"grad_norm": 2.84375,
"learning_rate": 0.00029457169049383164,
"loss": 5.225,
"step": 3835
},
{
"epoch": 1.7250673854447438,
"grad_norm": 3.09375,
"learning_rate": 0.00029454226934512624,
"loss": 5.2631,
"step": 3840
},
{
"epoch": 1.7273135669362083,
"grad_norm": 2.8125,
"learning_rate": 0.00029451277032603064,
"loss": 5.2029,
"step": 3845
},
{
"epoch": 1.7295597484276728,
"grad_norm": 2.921875,
"learning_rate": 0.0002944831934542772,
"loss": 5.2321,
"step": 3850
},
{
"epoch": 1.7318059299191375,
"grad_norm": 3.03125,
"learning_rate": 0.00029445353874764526,
"loss": 5.2173,
"step": 3855
},
{
"epoch": 1.734052111410602,
"grad_norm": 2.90625,
"learning_rate": 0.00029442380622396073,
"loss": 5.2293,
"step": 3860
},
{
"epoch": 1.7362982929020665,
"grad_norm": 2.984375,
"learning_rate": 0.00029439399590109645,
"loss": 5.1509,
"step": 3865
},
{
"epoch": 1.738544474393531,
"grad_norm": 2.890625,
"learning_rate": 0.00029436410779697206,
"loss": 5.2911,
"step": 3870
},
{
"epoch": 1.7407906558849955,
"grad_norm": 3.03125,
"learning_rate": 0.00029433414192955377,
"loss": 5.1782,
"step": 3875
},
{
"epoch": 1.74303683737646,
"grad_norm": 3.0,
"learning_rate": 0.0002943040983168547,
"loss": 5.2294,
"step": 3880
},
{
"epoch": 1.7452830188679245,
"grad_norm": 3.171875,
"learning_rate": 0.0002942739769769347,
"loss": 5.2567,
"step": 3885
},
{
"epoch": 1.7475292003593892,
"grad_norm": 3.546875,
"learning_rate": 0.00029424377792790023,
"loss": 5.2894,
"step": 3890
},
{
"epoch": 1.7497753818508537,
"grad_norm": 2.953125,
"learning_rate": 0.0002942135011879046,
"loss": 5.3933,
"step": 3895
},
{
"epoch": 1.7520215633423182,
"grad_norm": 3.1875,
"learning_rate": 0.00029418314677514764,
"loss": 5.295,
"step": 3900
},
{
"epoch": 1.7542677448337827,
"grad_norm": 3.15625,
"learning_rate": 0.0002941527147078761,
"loss": 5.1949,
"step": 3905
},
{
"epoch": 1.7565139263252472,
"grad_norm": 2.96875,
"learning_rate": 0.00029412220500438317,
"loss": 5.1329,
"step": 3910
},
{
"epoch": 1.7587601078167117,
"grad_norm": 3.109375,
"learning_rate": 0.0002940916176830089,
"loss": 5.3141,
"step": 3915
},
{
"epoch": 1.7610062893081762,
"grad_norm": 3.109375,
"learning_rate": 0.0002940609527621399,
"loss": 5.2578,
"step": 3920
},
{
"epoch": 1.7632524707996406,
"grad_norm": 3.0,
"learning_rate": 0.00029403021026020955,
"loss": 5.2614,
"step": 3925
},
{
"epoch": 1.7654986522911051,
"grad_norm": 3.109375,
"learning_rate": 0.00029399939019569767,
"loss": 5.2955,
"step": 3930
},
{
"epoch": 1.7677448337825696,
"grad_norm": 2.9375,
"learning_rate": 0.00029396849258713084,
"loss": 5.2972,
"step": 3935
},
{
"epoch": 1.7699910152740341,
"grad_norm": 3.09375,
"learning_rate": 0.00029393751745308215,
"loss": 5.2714,
"step": 3940
},
{
"epoch": 1.7722371967654986,
"grad_norm": 3.234375,
"learning_rate": 0.0002939064648121714,
"loss": 5.2846,
"step": 3945
},
{
"epoch": 1.7744833782569631,
"grad_norm": 2.90625,
"learning_rate": 0.00029387533468306504,
"loss": 5.263,
"step": 3950
},
{
"epoch": 1.7767295597484276,
"grad_norm": 3.09375,
"learning_rate": 0.0002938441270844758,
"loss": 5.1442,
"step": 3955
},
{
"epoch": 1.778975741239892,
"grad_norm": 2.859375,
"learning_rate": 0.00029381284203516334,
"loss": 5.209,
"step": 3960
},
{
"epoch": 1.7812219227313566,
"grad_norm": 3.078125,
"learning_rate": 0.00029378147955393363,
"loss": 5.2285,
"step": 3965
},
{
"epoch": 1.783468104222821,
"grad_norm": 3.171875,
"learning_rate": 0.00029375003965963935,
"loss": 5.2605,
"step": 3970
},
{
"epoch": 1.7857142857142856,
"grad_norm": 2.921875,
"learning_rate": 0.00029371852237117957,
"loss": 5.2557,
"step": 3975
},
{
"epoch": 1.78796046720575,
"grad_norm": 2.96875,
"learning_rate": 0.00029368692770749994,
"loss": 5.1953,
"step": 3980
},
{
"epoch": 1.7902066486972146,
"grad_norm": 3.0,
"learning_rate": 0.00029365525568759266,
"loss": 5.2138,
"step": 3985
},
{
"epoch": 1.7924528301886793,
"grad_norm": 3.03125,
"learning_rate": 0.0002936235063304964,
"loss": 5.2362,
"step": 3990
},
{
"epoch": 1.7946990116801438,
"grad_norm": 3.703125,
"learning_rate": 0.0002935916796552963,
"loss": 5.238,
"step": 3995
},
{
"epoch": 1.7969451931716083,
"grad_norm": 3.03125,
"learning_rate": 0.00029355977568112403,
"loss": 5.2092,
"step": 4000
},
{
"epoch": 1.7969451931716083,
"eval_loss": 5.183039665222168,
"eval_runtime": 16.1808,
"eval_samples_per_second": 1916.649,
"eval_steps_per_second": 239.604,
"step": 4000
},
{
"epoch": 1.7991913746630728,
"grad_norm": 2.875,
"learning_rate": 0.00029352779442715765,
"loss": 5.2075,
"step": 4005
},
{
"epoch": 1.8014375561545373,
"grad_norm": 3.0,
"learning_rate": 0.0002934957359126218,
"loss": 5.1898,
"step": 4010
},
{
"epoch": 1.8036837376460018,
"grad_norm": 3.25,
"learning_rate": 0.0002934636001567873,
"loss": 5.2844,
"step": 4015
},
{
"epoch": 1.8059299191374663,
"grad_norm": 3.109375,
"learning_rate": 0.0002934313871789718,
"loss": 5.2941,
"step": 4020
},
{
"epoch": 1.808176100628931,
"grad_norm": 3.140625,
"learning_rate": 0.00029339909699853904,
"loss": 5.3192,
"step": 4025
},
{
"epoch": 1.8104222821203955,
"grad_norm": 3.015625,
"learning_rate": 0.00029336672963489925,
"loss": 5.1957,
"step": 4030
},
{
"epoch": 1.81266846361186,
"grad_norm": 2.890625,
"learning_rate": 0.0002933342851075092,
"loss": 5.2322,
"step": 4035
},
{
"epoch": 1.8149146451033245,
"grad_norm": 2.921875,
"learning_rate": 0.00029330176343587175,
"loss": 5.124,
"step": 4040
},
{
"epoch": 1.817160826594789,
"grad_norm": 2.921875,
"learning_rate": 0.00029326916463953646,
"loss": 5.195,
"step": 4045
},
{
"epoch": 1.8194070080862534,
"grad_norm": 3.03125,
"learning_rate": 0.0002932364887380991,
"loss": 5.2398,
"step": 4050
},
{
"epoch": 1.821653189577718,
"grad_norm": 3.03125,
"learning_rate": 0.00029320373575120174,
"loss": 5.1243,
"step": 4055
},
{
"epoch": 1.8238993710691824,
"grad_norm": 2.921875,
"learning_rate": 0.0002931709056985328,
"loss": 5.1875,
"step": 4060
},
{
"epoch": 1.826145552560647,
"grad_norm": 3.140625,
"learning_rate": 0.0002931379985998272,
"loss": 5.2679,
"step": 4065
},
{
"epoch": 1.8283917340521114,
"grad_norm": 3.109375,
"learning_rate": 0.0002931050144748659,
"loss": 5.1371,
"step": 4070
},
{
"epoch": 1.830637915543576,
"grad_norm": 2.921875,
"learning_rate": 0.0002930719533434764,
"loss": 5.2114,
"step": 4075
},
{
"epoch": 1.8328840970350404,
"grad_norm": 2.984375,
"learning_rate": 0.0002930388152255323,
"loss": 5.2132,
"step": 4080
},
{
"epoch": 1.835130278526505,
"grad_norm": 2.96875,
"learning_rate": 0.0002930056001409537,
"loss": 5.211,
"step": 4085
},
{
"epoch": 1.8373764600179694,
"grad_norm": 2.921875,
"learning_rate": 0.0002929723081097067,
"loss": 5.1184,
"step": 4090
},
{
"epoch": 1.8396226415094339,
"grad_norm": 2.796875,
"learning_rate": 0.00029293893915180387,
"loss": 5.1128,
"step": 4095
},
{
"epoch": 1.8418688230008984,
"grad_norm": 3.078125,
"learning_rate": 0.00029290549328730395,
"loss": 5.2356,
"step": 4100
},
{
"epoch": 1.8441150044923629,
"grad_norm": 3.140625,
"learning_rate": 0.0002928719705363118,
"loss": 5.1903,
"step": 4105
},
{
"epoch": 1.8463611859838274,
"grad_norm": 2.9375,
"learning_rate": 0.00029283837091897876,
"loss": 5.1552,
"step": 4110
},
{
"epoch": 1.8486073674752919,
"grad_norm": 3.015625,
"learning_rate": 0.00029280469445550213,
"loss": 5.1519,
"step": 4115
},
{
"epoch": 1.8508535489667564,
"grad_norm": 3.09375,
"learning_rate": 0.0002927709411661255,
"loss": 5.181,
"step": 4120
},
{
"epoch": 1.853099730458221,
"grad_norm": 3.15625,
"learning_rate": 0.00029273711107113856,
"loss": 5.1855,
"step": 4125
},
{
"epoch": 1.8553459119496856,
"grad_norm": 3.015625,
"learning_rate": 0.00029270320419087743,
"loss": 5.2248,
"step": 4130
},
{
"epoch": 1.85759209344115,
"grad_norm": 3.015625,
"learning_rate": 0.00029266922054572395,
"loss": 5.1783,
"step": 4135
},
{
"epoch": 1.8598382749326146,
"grad_norm": 2.890625,
"learning_rate": 0.00029263516015610655,
"loss": 5.2069,
"step": 4140
},
{
"epoch": 1.862084456424079,
"grad_norm": 3.078125,
"learning_rate": 0.0002926010230424995,
"loss": 5.1962,
"step": 4145
},
{
"epoch": 1.8643306379155435,
"grad_norm": 3.0625,
"learning_rate": 0.00029256680922542334,
"loss": 5.1803,
"step": 4150
},
{
"epoch": 1.866576819407008,
"grad_norm": 2.875,
"learning_rate": 0.0002925325187254446,
"loss": 5.2128,
"step": 4155
},
{
"epoch": 1.8688230008984728,
"grad_norm": 2.78125,
"learning_rate": 0.00029249815156317605,
"loss": 5.184,
"step": 4160
},
{
"epoch": 1.8710691823899372,
"grad_norm": 3.109375,
"learning_rate": 0.0002924637077592764,
"loss": 5.2263,
"step": 4165
},
{
"epoch": 1.8733153638814017,
"grad_norm": 3.15625,
"learning_rate": 0.0002924291873344505,
"loss": 5.1901,
"step": 4170
},
{
"epoch": 1.8755615453728662,
"grad_norm": 2.921875,
"learning_rate": 0.00029239459030944935,
"loss": 5.2521,
"step": 4175
},
{
"epoch": 1.8778077268643307,
"grad_norm": 2.9375,
"learning_rate": 0.0002923599167050697,
"loss": 5.167,
"step": 4180
},
{
"epoch": 1.8800539083557952,
"grad_norm": 2.9375,
"learning_rate": 0.0002923251665421547,
"loss": 5.1813,
"step": 4185
},
{
"epoch": 1.8823000898472597,
"grad_norm": 2.8125,
"learning_rate": 0.0002922903398415933,
"loss": 5.2392,
"step": 4190
},
{
"epoch": 1.8845462713387242,
"grad_norm": 3.0,
"learning_rate": 0.0002922554366243205,
"loss": 5.2032,
"step": 4195
},
{
"epoch": 1.8867924528301887,
"grad_norm": 3.421875,
"learning_rate": 0.00029222045691131737,
"loss": 5.1849,
"step": 4200
},
{
"epoch": 1.8890386343216532,
"grad_norm": 2.90625,
"learning_rate": 0.00029218540072361074,
"loss": 5.1958,
"step": 4205
},
{
"epoch": 1.8912848158131177,
"grad_norm": 2.921875,
"learning_rate": 0.0002921502680822738,
"loss": 5.174,
"step": 4210
},
{
"epoch": 1.8935309973045822,
"grad_norm": 3.25,
"learning_rate": 0.0002921150590084252,
"loss": 5.2986,
"step": 4215
},
{
"epoch": 1.8957771787960467,
"grad_norm": 3.125,
"learning_rate": 0.00029207977352323005,
"loss": 5.1103,
"step": 4220
},
{
"epoch": 1.8980233602875112,
"grad_norm": 2.796875,
"learning_rate": 0.000292044411647899,
"loss": 5.2693,
"step": 4225
},
{
"epoch": 1.9002695417789757,
"grad_norm": 3.046875,
"learning_rate": 0.00029200897340368883,
"loss": 5.219,
"step": 4230
},
{
"epoch": 1.9025157232704402,
"grad_norm": 2.921875,
"learning_rate": 0.0002919734588119021,
"loss": 5.1556,
"step": 4235
},
{
"epoch": 1.9047619047619047,
"grad_norm": 3.15625,
"learning_rate": 0.0002919378678938874,
"loss": 5.202,
"step": 4240
},
{
"epoch": 1.9070080862533692,
"grad_norm": 2.921875,
"learning_rate": 0.000291902200671039,
"loss": 5.1384,
"step": 4245
},
{
"epoch": 1.9092542677448336,
"grad_norm": 3.140625,
"learning_rate": 0.00029186645716479734,
"loss": 5.1446,
"step": 4250
},
{
"epoch": 1.9115004492362981,
"grad_norm": 3.3125,
"learning_rate": 0.0002918306373966484,
"loss": 5.3229,
"step": 4255
},
{
"epoch": 1.9137466307277629,
"grad_norm": 2.96875,
"learning_rate": 0.00029179474138812424,
"loss": 5.1863,
"step": 4260
},
{
"epoch": 1.9159928122192273,
"grad_norm": 3.046875,
"learning_rate": 0.0002917587691608026,
"loss": 5.1948,
"step": 4265
},
{
"epoch": 1.9182389937106918,
"grad_norm": 3.25,
"learning_rate": 0.00029172272073630707,
"loss": 5.1398,
"step": 4270
},
{
"epoch": 1.9204851752021563,
"grad_norm": 2.90625,
"learning_rate": 0.000291686596136307,
"loss": 5.2248,
"step": 4275
},
{
"epoch": 1.9227313566936208,
"grad_norm": 2.921875,
"learning_rate": 0.00029165039538251786,
"loss": 5.2137,
"step": 4280
},
{
"epoch": 1.9249775381850853,
"grad_norm": 3.046875,
"learning_rate": 0.00029161411849670034,
"loss": 5.2118,
"step": 4285
},
{
"epoch": 1.9272237196765498,
"grad_norm": 3.09375,
"learning_rate": 0.00029157776550066134,
"loss": 5.1821,
"step": 4290
},
{
"epoch": 1.9294699011680145,
"grad_norm": 2.890625,
"learning_rate": 0.0002915413364162533,
"loss": 5.1385,
"step": 4295
},
{
"epoch": 1.931716082659479,
"grad_norm": 2.9375,
"learning_rate": 0.00029150483126537445,
"loss": 5.1265,
"step": 4300
},
{
"epoch": 1.9339622641509435,
"grad_norm": 2.921875,
"learning_rate": 0.0002914682500699688,
"loss": 5.173,
"step": 4305
},
{
"epoch": 1.936208445642408,
"grad_norm": 3.25,
"learning_rate": 0.00029143159285202597,
"loss": 5.175,
"step": 4310
},
{
"epoch": 1.9384546271338725,
"grad_norm": 2.921875,
"learning_rate": 0.0002913948596335814,
"loss": 5.1925,
"step": 4315
},
{
"epoch": 1.940700808625337,
"grad_norm": 3.21875,
"learning_rate": 0.00029135805043671597,
"loss": 5.1982,
"step": 4320
},
{
"epoch": 1.9429469901168015,
"grad_norm": 3.015625,
"learning_rate": 0.0002913211652835567,
"loss": 5.1497,
"step": 4325
},
{
"epoch": 1.945193171608266,
"grad_norm": 2.890625,
"learning_rate": 0.00029128420419627566,
"loss": 5.151,
"step": 4330
},
{
"epoch": 1.9474393530997305,
"grad_norm": 3.015625,
"learning_rate": 0.00029124716719709114,
"loss": 5.1051,
"step": 4335
},
{
"epoch": 1.949685534591195,
"grad_norm": 2.9375,
"learning_rate": 0.0002912100543082666,
"loss": 5.1568,
"step": 4340
},
{
"epoch": 1.9519317160826595,
"grad_norm": 2.9375,
"learning_rate": 0.0002911728655521115,
"loss": 5.1824,
"step": 4345
},
{
"epoch": 1.954177897574124,
"grad_norm": 3.03125,
"learning_rate": 0.00029113560095098064,
"loss": 5.1908,
"step": 4350
},
{
"epoch": 1.9564240790655885,
"grad_norm": 3.046875,
"learning_rate": 0.0002910982605272745,
"loss": 5.1337,
"step": 4355
},
{
"epoch": 1.958670260557053,
"grad_norm": 2.953125,
"learning_rate": 0.0002910608443034391,
"loss": 5.2017,
"step": 4360
},
{
"epoch": 1.9609164420485174,
"grad_norm": 2.84375,
"learning_rate": 0.00029102335230196615,
"loss": 5.131,
"step": 4365
},
{
"epoch": 1.963162623539982,
"grad_norm": 2.875,
"learning_rate": 0.00029098578454539274,
"loss": 5.1247,
"step": 4370
},
{
"epoch": 1.9654088050314464,
"grad_norm": 3.046875,
"learning_rate": 0.0002909481410563017,
"loss": 5.1947,
"step": 4375
},
{
"epoch": 1.967654986522911,
"grad_norm": 2.9375,
"learning_rate": 0.0002909104218573211,
"loss": 5.162,
"step": 4380
},
{
"epoch": 1.9699011680143754,
"grad_norm": 2.9375,
"learning_rate": 0.00029087262697112494,
"loss": 5.1051,
"step": 4385
},
{
"epoch": 1.97214734950584,
"grad_norm": 2.9375,
"learning_rate": 0.00029083475642043216,
"loss": 5.1855,
"step": 4390
},
{
"epoch": 1.9743935309973046,
"grad_norm": 3.046875,
"learning_rate": 0.0002907968102280077,
"loss": 5.1933,
"step": 4395
},
{
"epoch": 1.9766397124887691,
"grad_norm": 3.234375,
"learning_rate": 0.0002907587884166616,
"loss": 5.1138,
"step": 4400
},
{
"epoch": 1.9788858939802336,
"grad_norm": 3.1875,
"learning_rate": 0.0002907206910092498,
"loss": 5.1579,
"step": 4405
},
{
"epoch": 1.9811320754716981,
"grad_norm": 2.96875,
"learning_rate": 0.000290682518028673,
"loss": 5.1163,
"step": 4410
},
{
"epoch": 1.9833782569631626,
"grad_norm": 2.921875,
"learning_rate": 0.00029064426949787807,
"loss": 5.1887,
"step": 4415
},
{
"epoch": 1.985624438454627,
"grad_norm": 2.984375,
"learning_rate": 0.0002906059454398567,
"loss": 5.2164,
"step": 4420
},
{
"epoch": 1.9878706199460916,
"grad_norm": 3.125,
"learning_rate": 0.0002905675458776464,
"loss": 5.0996,
"step": 4425
},
{
"epoch": 1.9901168014375563,
"grad_norm": 3.03125,
"learning_rate": 0.0002905290708343298,
"loss": 5.1728,
"step": 4430
},
{
"epoch": 1.9923629829290208,
"grad_norm": 3.0,
"learning_rate": 0.00029049052033303514,
"loss": 5.1126,
"step": 4435
},
{
"epoch": 1.9946091644204853,
"grad_norm": 3.03125,
"learning_rate": 0.00029045189439693564,
"loss": 5.1486,
"step": 4440
},
{
"epoch": 1.9968553459119498,
"grad_norm": 2.890625,
"learning_rate": 0.00029041319304925036,
"loss": 5.098,
"step": 4445
},
{
"epoch": 1.9991015274034143,
"grad_norm": 3.0,
"learning_rate": 0.0002903744163132432,
"loss": 5.1236,
"step": 4450
},
{
"epoch": 2.001347708894879,
"grad_norm": 3.171875,
"learning_rate": 0.00029033556421222383,
"loss": 5.1441,
"step": 4455
},
{
"epoch": 2.0035938903863433,
"grad_norm": 2.984375,
"learning_rate": 0.0002902966367695468,
"loss": 5.0451,
"step": 4460
},
{
"epoch": 2.0058400718778078,
"grad_norm": 2.859375,
"learning_rate": 0.00029025763400861236,
"loss": 5.104,
"step": 4465
},
{
"epoch": 2.0080862533692723,
"grad_norm": 2.9375,
"learning_rate": 0.00029021855595286574,
"loss": 5.0897,
"step": 4470
},
{
"epoch": 2.0103324348607368,
"grad_norm": 2.90625,
"learning_rate": 0.0002901794026257975,
"loss": 4.9517,
"step": 4475
},
{
"epoch": 2.0125786163522013,
"grad_norm": 2.859375,
"learning_rate": 0.0002901401740509435,
"loss": 4.9774,
"step": 4480
},
{
"epoch": 2.0148247978436657,
"grad_norm": 2.96875,
"learning_rate": 0.0002901008702518848,
"loss": 4.986,
"step": 4485
},
{
"epoch": 2.0170709793351302,
"grad_norm": 3.015625,
"learning_rate": 0.0002900614912522476,
"loss": 5.0134,
"step": 4490
},
{
"epoch": 2.0193171608265947,
"grad_norm": 3.3125,
"learning_rate": 0.0002900220370757035,
"loss": 5.0922,
"step": 4495
},
{
"epoch": 2.0215633423180592,
"grad_norm": 2.8125,
"learning_rate": 0.0002899825077459692,
"loss": 5.0198,
"step": 4500
},
{
"epoch": 2.0238095238095237,
"grad_norm": 3.203125,
"learning_rate": 0.0002899429032868064,
"loss": 5.1019,
"step": 4505
},
{
"epoch": 2.026055705300988,
"grad_norm": 3.078125,
"learning_rate": 0.0002899032237220223,
"loss": 5.0552,
"step": 4510
},
{
"epoch": 2.0283018867924527,
"grad_norm": 2.859375,
"learning_rate": 0.0002898634690754689,
"loss": 5.0344,
"step": 4515
},
{
"epoch": 2.030548068283917,
"grad_norm": 2.9375,
"learning_rate": 0.0002898236393710436,
"loss": 5.04,
"step": 4520
},
{
"epoch": 2.0327942497753817,
"grad_norm": 3.109375,
"learning_rate": 0.00028978373463268883,
"loss": 5.0868,
"step": 4525
},
{
"epoch": 2.035040431266846,
"grad_norm": 3.234375,
"learning_rate": 0.00028974375488439194,
"loss": 5.0977,
"step": 4530
},
{
"epoch": 2.0372866127583107,
"grad_norm": 2.96875,
"learning_rate": 0.0002897037001501857,
"loss": 5.0351,
"step": 4535
},
{
"epoch": 2.039532794249775,
"grad_norm": 3.5,
"learning_rate": 0.00028966357045414774,
"loss": 5.115,
"step": 4540
},
{
"epoch": 2.0417789757412397,
"grad_norm": 2.875,
"learning_rate": 0.00028962336582040086,
"loss": 5.137,
"step": 4545
},
{
"epoch": 2.0440251572327046,
"grad_norm": 3.15625,
"learning_rate": 0.0002895830862731127,
"loss": 5.0389,
"step": 4550
},
{
"epoch": 2.046271338724169,
"grad_norm": 2.921875,
"learning_rate": 0.0002895427318364963,
"loss": 5.045,
"step": 4555
},
{
"epoch": 2.0485175202156336,
"grad_norm": 3.0,
"learning_rate": 0.00028950230253480935,
"loss": 5.0665,
"step": 4560
},
{
"epoch": 2.050763701707098,
"grad_norm": 3.0,
"learning_rate": 0.00028946179839235475,
"loss": 4.9852,
"step": 4565
},
{
"epoch": 2.0530098831985626,
"grad_norm": 2.890625,
"learning_rate": 0.0002894212194334803,
"loss": 5.1119,
"step": 4570
},
{
"epoch": 2.055256064690027,
"grad_norm": 2.921875,
"learning_rate": 0.00028938056568257874,
"loss": 5.0799,
"step": 4575
},
{
"epoch": 2.0575022461814916,
"grad_norm": 3.125,
"learning_rate": 0.000289339837164088,
"loss": 5.0597,
"step": 4580
},
{
"epoch": 2.059748427672956,
"grad_norm": 2.859375,
"learning_rate": 0.0002892990339024907,
"loss": 5.0044,
"step": 4585
},
{
"epoch": 2.0619946091644206,
"grad_norm": 3.09375,
"learning_rate": 0.0002892581559223144,
"loss": 5.0103,
"step": 4590
},
{
"epoch": 2.064240790655885,
"grad_norm": 3.140625,
"learning_rate": 0.00028921720324813185,
"loss": 5.0157,
"step": 4595
},
{
"epoch": 2.0664869721473496,
"grad_norm": 2.875,
"learning_rate": 0.0002891761759045603,
"loss": 5.0655,
"step": 4600
},
{
"epoch": 2.068733153638814,
"grad_norm": 3.0,
"learning_rate": 0.0002891350739162622,
"loss": 5.1106,
"step": 4605
},
{
"epoch": 2.0709793351302785,
"grad_norm": 3.125,
"learning_rate": 0.0002890938973079447,
"loss": 5.129,
"step": 4610
},
{
"epoch": 2.073225516621743,
"grad_norm": 3.125,
"learning_rate": 0.00028905264610436,
"loss": 5.031,
"step": 4615
},
{
"epoch": 2.0754716981132075,
"grad_norm": 2.859375,
"learning_rate": 0.00028901132033030475,
"loss": 5.0716,
"step": 4620
},
{
"epoch": 2.077717879604672,
"grad_norm": 2.984375,
"learning_rate": 0.000288969920010621,
"loss": 5.0758,
"step": 4625
},
{
"epoch": 2.0799640610961365,
"grad_norm": 3.0625,
"learning_rate": 0.000288928445170195,
"loss": 5.0436,
"step": 4630
},
{
"epoch": 2.082210242587601,
"grad_norm": 2.84375,
"learning_rate": 0.00028888689583395826,
"loss": 5.0841,
"step": 4635
},
{
"epoch": 2.0844564240790655,
"grad_norm": 3.140625,
"learning_rate": 0.00028884527202688683,
"loss": 5.0446,
"step": 4640
},
{
"epoch": 2.08670260557053,
"grad_norm": 3.015625,
"learning_rate": 0.0002888035737740016,
"loss": 4.9765,
"step": 4645
},
{
"epoch": 2.0889487870619945,
"grad_norm": 2.96875,
"learning_rate": 0.00028876180110036823,
"loss": 5.1058,
"step": 4650
},
{
"epoch": 2.091194968553459,
"grad_norm": 2.953125,
"learning_rate": 0.0002887199540310971,
"loss": 5.0546,
"step": 4655
},
{
"epoch": 2.0934411500449235,
"grad_norm": 2.828125,
"learning_rate": 0.00028867803259134326,
"loss": 4.9612,
"step": 4660
},
{
"epoch": 2.095687331536388,
"grad_norm": 3.046875,
"learning_rate": 0.00028863603680630653,
"loss": 5.0064,
"step": 4665
},
{
"epoch": 2.0979335130278525,
"grad_norm": 2.984375,
"learning_rate": 0.00028859396670123135,
"loss": 5.0299,
"step": 4670
},
{
"epoch": 2.100179694519317,
"grad_norm": 3.171875,
"learning_rate": 0.000288551822301407,
"loss": 5.0889,
"step": 4675
},
{
"epoch": 2.1024258760107815,
"grad_norm": 3.0,
"learning_rate": 0.00028850960363216714,
"loss": 5.0944,
"step": 4680
},
{
"epoch": 2.1046720575022464,
"grad_norm": 2.953125,
"learning_rate": 0.0002884673107188904,
"loss": 4.9692,
"step": 4685
},
{
"epoch": 2.106918238993711,
"grad_norm": 3.140625,
"learning_rate": 0.00028842494358699973,
"loss": 4.9994,
"step": 4690
},
{
"epoch": 2.1091644204851754,
"grad_norm": 3.03125,
"learning_rate": 0.000288382502261963,
"loss": 5.0891,
"step": 4695
},
{
"epoch": 2.11141060197664,
"grad_norm": 3.125,
"learning_rate": 0.0002883399867692924,
"loss": 5.0812,
"step": 4700
},
{
"epoch": 2.1136567834681044,
"grad_norm": 3.1875,
"learning_rate": 0.00028829739713454483,
"loss": 5.0365,
"step": 4705
},
{
"epoch": 2.115902964959569,
"grad_norm": 3.0625,
"learning_rate": 0.0002882547333833218,
"loss": 5.0654,
"step": 4710
},
{
"epoch": 2.1181491464510334,
"grad_norm": 2.859375,
"learning_rate": 0.00028821199554126934,
"loss": 4.9854,
"step": 4715
},
{
"epoch": 2.120395327942498,
"grad_norm": 3.046875,
"learning_rate": 0.0002881691836340779,
"loss": 5.0865,
"step": 4720
},
{
"epoch": 2.1226415094339623,
"grad_norm": 2.9375,
"learning_rate": 0.00028812629768748267,
"loss": 5.045,
"step": 4725
},
{
"epoch": 2.124887690925427,
"grad_norm": 3.390625,
"learning_rate": 0.00028808333772726316,
"loss": 5.0897,
"step": 4730
},
{
"epoch": 2.1271338724168913,
"grad_norm": 3.265625,
"learning_rate": 0.00028804030377924345,
"loss": 5.0187,
"step": 4735
},
{
"epoch": 2.129380053908356,
"grad_norm": 2.9375,
"learning_rate": 0.0002879971958692921,
"loss": 5.0898,
"step": 4740
},
{
"epoch": 2.1316262353998203,
"grad_norm": 2.953125,
"learning_rate": 0.00028795401402332215,
"loss": 5.0058,
"step": 4745
},
{
"epoch": 2.133872416891285,
"grad_norm": 2.984375,
"learning_rate": 0.00028791075826729097,
"loss": 5.0468,
"step": 4750
},
{
"epoch": 2.1361185983827493,
"grad_norm": 3.109375,
"learning_rate": 0.00028786742862720055,
"loss": 5.0241,
"step": 4755
},
{
"epoch": 2.138364779874214,
"grad_norm": 3.0625,
"learning_rate": 0.0002878240251290971,
"loss": 5.1405,
"step": 4760
},
{
"epoch": 2.1406109613656783,
"grad_norm": 2.828125,
"learning_rate": 0.0002877805477990713,
"loss": 5.0095,
"step": 4765
},
{
"epoch": 2.142857142857143,
"grad_norm": 3.203125,
"learning_rate": 0.00028773699666325835,
"loss": 5.0425,
"step": 4770
},
{
"epoch": 2.1451033243486073,
"grad_norm": 3.078125,
"learning_rate": 0.00028769337174783754,
"loss": 5.0217,
"step": 4775
},
{
"epoch": 2.147349505840072,
"grad_norm": 3.078125,
"learning_rate": 0.0002876496730790327,
"loss": 5.0803,
"step": 4780
},
{
"epoch": 2.1495956873315363,
"grad_norm": 3.484375,
"learning_rate": 0.00028760590068311194,
"loss": 5.0487,
"step": 4785
},
{
"epoch": 2.1518418688230008,
"grad_norm": 3.0625,
"learning_rate": 0.00028756205458638776,
"loss": 5.0174,
"step": 4790
},
{
"epoch": 2.1540880503144653,
"grad_norm": 3.421875,
"learning_rate": 0.00028751813481521694,
"loss": 5.0855,
"step": 4795
},
{
"epoch": 2.1563342318059298,
"grad_norm": 3.140625,
"learning_rate": 0.00028747414139600034,
"loss": 5.0706,
"step": 4800
},
{
"epoch": 2.1585804132973943,
"grad_norm": 2.984375,
"learning_rate": 0.0002874300743551835,
"loss": 5.1177,
"step": 4805
},
{
"epoch": 2.1608265947888587,
"grad_norm": 3.328125,
"learning_rate": 0.0002873859337192558,
"loss": 5.0589,
"step": 4810
},
{
"epoch": 2.1630727762803232,
"grad_norm": 3.140625,
"learning_rate": 0.00028734171951475104,
"loss": 5.0959,
"step": 4815
},
{
"epoch": 2.165318957771788,
"grad_norm": 3.078125,
"learning_rate": 0.00028729743176824735,
"loss": 5.0754,
"step": 4820
},
{
"epoch": 2.1675651392632527,
"grad_norm": 3.03125,
"learning_rate": 0.0002872530705063669,
"loss": 5.0442,
"step": 4825
},
{
"epoch": 2.169811320754717,
"grad_norm": 3.421875,
"learning_rate": 0.00028720863575577615,
"loss": 4.9739,
"step": 4830
},
{
"epoch": 2.1720575022461817,
"grad_norm": 3.09375,
"learning_rate": 0.0002871641275431856,
"loss": 5.0175,
"step": 4835
},
{
"epoch": 2.174303683737646,
"grad_norm": 3.15625,
"learning_rate": 0.0002871195458953501,
"loss": 5.0096,
"step": 4840
},
{
"epoch": 2.1765498652291106,
"grad_norm": 3.125,
"learning_rate": 0.0002870748908390686,
"loss": 5.0525,
"step": 4845
},
{
"epoch": 2.178796046720575,
"grad_norm": 2.890625,
"learning_rate": 0.0002870301624011839,
"loss": 5.0469,
"step": 4850
},
{
"epoch": 2.1810422282120396,
"grad_norm": 3.0,
"learning_rate": 0.0002869853606085834,
"loss": 5.0679,
"step": 4855
},
{
"epoch": 2.183288409703504,
"grad_norm": 2.9375,
"learning_rate": 0.00028694048548819816,
"loss": 5.0369,
"step": 4860
},
{
"epoch": 2.1855345911949686,
"grad_norm": 3.078125,
"learning_rate": 0.00028689553706700356,
"loss": 5.0443,
"step": 4865
},
{
"epoch": 2.187780772686433,
"grad_norm": 3.0625,
"learning_rate": 0.000286850515372019,
"loss": 4.9984,
"step": 4870
},
{
"epoch": 2.1900269541778976,
"grad_norm": 2.9375,
"learning_rate": 0.00028680542043030787,
"loss": 4.9734,
"step": 4875
},
{
"epoch": 2.192273135669362,
"grad_norm": 2.953125,
"learning_rate": 0.0002867602522689776,
"loss": 5.0096,
"step": 4880
},
{
"epoch": 2.1945193171608266,
"grad_norm": 3.265625,
"learning_rate": 0.00028671501091517967,
"loss": 4.9606,
"step": 4885
},
{
"epoch": 2.196765498652291,
"grad_norm": 3.171875,
"learning_rate": 0.0002866696963961096,
"loss": 5.072,
"step": 4890
},
{
"epoch": 2.1990116801437556,
"grad_norm": 2.96875,
"learning_rate": 0.0002866243087390067,
"loss": 5.0319,
"step": 4895
},
{
"epoch": 2.20125786163522,
"grad_norm": 3.125,
"learning_rate": 0.0002865788479711545,
"loss": 5.0198,
"step": 4900
},
{
"epoch": 2.2035040431266846,
"grad_norm": 3.015625,
"learning_rate": 0.00028653331411988034,
"loss": 5.001,
"step": 4905
},
{
"epoch": 2.205750224618149,
"grad_norm": 2.890625,
"learning_rate": 0.00028648770721255543,
"loss": 5.0652,
"step": 4910
},
{
"epoch": 2.2079964061096136,
"grad_norm": 3.0625,
"learning_rate": 0.000286442027276595,
"loss": 4.9551,
"step": 4915
},
{
"epoch": 2.210242587601078,
"grad_norm": 3.09375,
"learning_rate": 0.0002863962743394583,
"loss": 5.0335,
"step": 4920
},
{
"epoch": 2.2124887690925426,
"grad_norm": 3.265625,
"learning_rate": 0.00028635044842864805,
"loss": 5.0267,
"step": 4925
},
{
"epoch": 2.214734950584007,
"grad_norm": 3.34375,
"learning_rate": 0.0002863045495717113,
"loss": 5.0602,
"step": 4930
},
{
"epoch": 2.2169811320754715,
"grad_norm": 3.09375,
"learning_rate": 0.0002862585777962387,
"loss": 5.0753,
"step": 4935
},
{
"epoch": 2.219227313566936,
"grad_norm": 3.09375,
"learning_rate": 0.0002862125331298648,
"loss": 5.0716,
"step": 4940
},
{
"epoch": 2.2214734950584005,
"grad_norm": 3.328125,
"learning_rate": 0.0002861664156002679,
"loss": 5.0408,
"step": 4945
},
{
"epoch": 2.223719676549865,
"grad_norm": 3.25,
"learning_rate": 0.00028612022523517015,
"loss": 5.0705,
"step": 4950
},
{
"epoch": 2.22596585804133,
"grad_norm": 2.921875,
"learning_rate": 0.0002860739620623375,
"loss": 5.06,
"step": 4955
},
{
"epoch": 2.2282120395327945,
"grad_norm": 2.953125,
"learning_rate": 0.00028602762610957966,
"loss": 5.0575,
"step": 4960
},
{
"epoch": 2.230458221024259,
"grad_norm": 2.90625,
"learning_rate": 0.0002859812174047501,
"loss": 5.0911,
"step": 4965
},
{
"epoch": 2.2327044025157234,
"grad_norm": 3.0625,
"learning_rate": 0.00028593473597574595,
"loss": 5.0714,
"step": 4970
},
{
"epoch": 2.234950584007188,
"grad_norm": 3.15625,
"learning_rate": 0.00028588818185050816,
"loss": 4.9425,
"step": 4975
},
{
"epoch": 2.2371967654986524,
"grad_norm": 3.359375,
"learning_rate": 0.00028584155505702124,
"loss": 5.0257,
"step": 4980
},
{
"epoch": 2.239442946990117,
"grad_norm": 3.015625,
"learning_rate": 0.00028579485562331354,
"loss": 4.9997,
"step": 4985
},
{
"epoch": 2.2416891284815814,
"grad_norm": 3.0,
"learning_rate": 0.00028574808357745697,
"loss": 5.136,
"step": 4990
},
{
"epoch": 2.243935309973046,
"grad_norm": 3.046875,
"learning_rate": 0.0002857012389475671,
"loss": 4.9934,
"step": 4995
},
{
"epoch": 2.2461814914645104,
"grad_norm": 3.203125,
"learning_rate": 0.0002856543217618033,
"loss": 5.0804,
"step": 5000
},
{
"epoch": 2.2461814914645104,
"eval_loss": 5.077792644500732,
"eval_runtime": 16.1311,
"eval_samples_per_second": 1922.556,
"eval_steps_per_second": 240.343,
"step": 5000
},
{
"epoch": 2.248427672955975,
"grad_norm": 3.21875,
"learning_rate": 0.00028560733204836814,
"loss": 5.0199,
"step": 5005
},
{
"epoch": 2.2506738544474394,
"grad_norm": 3.03125,
"learning_rate": 0.0002855602698355083,
"loss": 4.9319,
"step": 5010
},
{
"epoch": 2.252920035938904,
"grad_norm": 3.28125,
"learning_rate": 0.0002855131351515136,
"loss": 5.014,
"step": 5015
},
{
"epoch": 2.2551662174303684,
"grad_norm": 2.96875,
"learning_rate": 0.00028546592802471783,
"loss": 4.9945,
"step": 5020
},
{
"epoch": 2.257412398921833,
"grad_norm": 3.03125,
"learning_rate": 0.0002854186484834979,
"loss": 5.0318,
"step": 5025
},
{
"epoch": 2.2596585804132974,
"grad_norm": 3.015625,
"learning_rate": 0.0002853712965562747,
"loss": 4.9928,
"step": 5030
},
{
"epoch": 2.261904761904762,
"grad_norm": 3.25,
"learning_rate": 0.0002853238722715122,
"loss": 5.0388,
"step": 5035
},
{
"epoch": 2.2641509433962264,
"grad_norm": 3.1875,
"learning_rate": 0.0002852763756577181,
"loss": 5.0577,
"step": 5040
},
{
"epoch": 2.266397124887691,
"grad_norm": 2.984375,
"learning_rate": 0.0002852288067434437,
"loss": 5.0389,
"step": 5045
},
{
"epoch": 2.2686433063791553,
"grad_norm": 3.265625,
"learning_rate": 0.00028518116555728334,
"loss": 5.0881,
"step": 5050
},
{
"epoch": 2.27088948787062,
"grad_norm": 3.140625,
"learning_rate": 0.0002851334521278753,
"loss": 5.0352,
"step": 5055
},
{
"epoch": 2.2731356693620843,
"grad_norm": 2.921875,
"learning_rate": 0.0002850856664839009,
"loss": 5.0594,
"step": 5060
},
{
"epoch": 2.275381850853549,
"grad_norm": 3.078125,
"learning_rate": 0.0002850378086540852,
"loss": 5.0079,
"step": 5065
},
{
"epoch": 2.2776280323450133,
"grad_norm": 2.984375,
"learning_rate": 0.00028498987866719627,
"loss": 4.9529,
"step": 5070
},
{
"epoch": 2.279874213836478,
"grad_norm": 3.09375,
"learning_rate": 0.00028494187655204594,
"loss": 5.0234,
"step": 5075
},
{
"epoch": 2.2821203953279423,
"grad_norm": 3.640625,
"learning_rate": 0.00028489380233748913,
"loss": 4.9815,
"step": 5080
},
{
"epoch": 2.284366576819407,
"grad_norm": 3.03125,
"learning_rate": 0.0002848456560524242,
"loss": 5.0092,
"step": 5085
},
{
"epoch": 2.2866127583108717,
"grad_norm": 2.78125,
"learning_rate": 0.0002847974377257927,
"loss": 5.024,
"step": 5090
},
{
"epoch": 2.288858939802336,
"grad_norm": 3.078125,
"learning_rate": 0.0002847491473865799,
"loss": 4.9528,
"step": 5095
},
{
"epoch": 2.2911051212938007,
"grad_norm": 3.15625,
"learning_rate": 0.0002847007850638138,
"loss": 5.0185,
"step": 5100
},
{
"epoch": 2.2933513027852652,
"grad_norm": 3.078125,
"learning_rate": 0.00028465235078656607,
"loss": 5.066,
"step": 5105
},
{
"epoch": 2.2955974842767297,
"grad_norm": 3.109375,
"learning_rate": 0.00028460384458395147,
"loss": 5.0169,
"step": 5110
},
{
"epoch": 2.297843665768194,
"grad_norm": 3.015625,
"learning_rate": 0.000284555266485128,
"loss": 4.9659,
"step": 5115
},
{
"epoch": 2.3000898472596587,
"grad_norm": 3.3125,
"learning_rate": 0.00028450661651929695,
"loss": 4.9802,
"step": 5120
},
{
"epoch": 2.302336028751123,
"grad_norm": 3.09375,
"learning_rate": 0.00028445789471570273,
"loss": 5.035,
"step": 5125
},
{
"epoch": 2.3045822102425877,
"grad_norm": 2.9375,
"learning_rate": 0.00028440910110363296,
"loss": 5.0922,
"step": 5130
},
{
"epoch": 2.306828391734052,
"grad_norm": 3.046875,
"learning_rate": 0.00028436023571241855,
"loss": 5.0105,
"step": 5135
},
{
"epoch": 2.3090745732255167,
"grad_norm": 3.3125,
"learning_rate": 0.0002843112985714333,
"loss": 4.9729,
"step": 5140
},
{
"epoch": 2.311320754716981,
"grad_norm": 2.953125,
"learning_rate": 0.00028426228971009426,
"loss": 5.0841,
"step": 5145
},
{
"epoch": 2.3135669362084457,
"grad_norm": 3.265625,
"learning_rate": 0.0002842132091578618,
"loss": 5.0193,
"step": 5150
},
{
"epoch": 2.31581311769991,
"grad_norm": 3.03125,
"learning_rate": 0.000284164056944239,
"loss": 5.0195,
"step": 5155
},
{
"epoch": 2.3180592991913747,
"grad_norm": 2.90625,
"learning_rate": 0.00028411483309877234,
"loss": 4.9968,
"step": 5160
},
{
"epoch": 2.320305480682839,
"grad_norm": 3.015625,
"learning_rate": 0.0002840655376510512,
"loss": 5.0199,
"step": 5165
},
{
"epoch": 2.3225516621743036,
"grad_norm": 2.96875,
"learning_rate": 0.000284016170630708,
"loss": 5.0053,
"step": 5170
},
{
"epoch": 2.324797843665768,
"grad_norm": 3.078125,
"learning_rate": 0.00028396673206741827,
"loss": 5.0486,
"step": 5175
},
{
"epoch": 2.3270440251572326,
"grad_norm": 3.15625,
"learning_rate": 0.0002839172219909005,
"loss": 5.0429,
"step": 5180
},
{
"epoch": 2.329290206648697,
"grad_norm": 3.15625,
"learning_rate": 0.0002838676404309162,
"loss": 4.9361,
"step": 5185
},
{
"epoch": 2.3315363881401616,
"grad_norm": 2.921875,
"learning_rate": 0.00028381798741726965,
"loss": 4.9766,
"step": 5190
},
{
"epoch": 2.333782569631626,
"grad_norm": 3.21875,
"learning_rate": 0.0002837682629798084,
"loss": 4.9904,
"step": 5195
},
{
"epoch": 2.3360287511230906,
"grad_norm": 3.078125,
"learning_rate": 0.0002837184671484227,
"loss": 5.0596,
"step": 5200
},
{
"epoch": 2.338274932614555,
"grad_norm": 3.046875,
"learning_rate": 0.0002836685999530459,
"loss": 4.986,
"step": 5205
},
{
"epoch": 2.3405211141060196,
"grad_norm": 2.96875,
"learning_rate": 0.0002836186614236541,
"loss": 4.9947,
"step": 5210
},
{
"epoch": 2.342767295597484,
"grad_norm": 2.96875,
"learning_rate": 0.0002835686515902663,
"loss": 4.9349,
"step": 5215
},
{
"epoch": 2.3450134770889486,
"grad_norm": 3.109375,
"learning_rate": 0.0002835185704829443,
"loss": 4.9609,
"step": 5220
},
{
"epoch": 2.3472596585804135,
"grad_norm": 3.59375,
"learning_rate": 0.000283468418131793,
"loss": 5.0416,
"step": 5225
},
{
"epoch": 2.3495058400718776,
"grad_norm": 3.1875,
"learning_rate": 0.0002834181945669599,
"loss": 5.0325,
"step": 5230
},
{
"epoch": 2.3517520215633425,
"grad_norm": 3.0625,
"learning_rate": 0.0002833678998186354,
"loss": 5.0163,
"step": 5235
},
{
"epoch": 2.353998203054807,
"grad_norm": 3.0625,
"learning_rate": 0.0002833175339170525,
"loss": 4.949,
"step": 5240
},
{
"epoch": 2.3562443845462715,
"grad_norm": 3.140625,
"learning_rate": 0.0002832670968924873,
"loss": 4.976,
"step": 5245
},
{
"epoch": 2.358490566037736,
"grad_norm": 3.109375,
"learning_rate": 0.0002832165887752584,
"loss": 5.017,
"step": 5250
},
{
"epoch": 2.3607367475292005,
"grad_norm": 3.0625,
"learning_rate": 0.00028316600959572727,
"loss": 5.0021,
"step": 5255
},
{
"epoch": 2.362982929020665,
"grad_norm": 3.03125,
"learning_rate": 0.000283115359384298,
"loss": 5.0264,
"step": 5260
},
{
"epoch": 2.3652291105121295,
"grad_norm": 3.21875,
"learning_rate": 0.00028306463817141743,
"loss": 5.0308,
"step": 5265
},
{
"epoch": 2.367475292003594,
"grad_norm": 3.28125,
"learning_rate": 0.00028301384598757506,
"loss": 5.0511,
"step": 5270
},
{
"epoch": 2.3697214734950585,
"grad_norm": 2.8125,
"learning_rate": 0.00028296298286330305,
"loss": 5.0266,
"step": 5275
},
{
"epoch": 2.371967654986523,
"grad_norm": 3.0625,
"learning_rate": 0.0002829120488291763,
"loss": 4.9507,
"step": 5280
},
{
"epoch": 2.3742138364779874,
"grad_norm": 3.0625,
"learning_rate": 0.0002828610439158122,
"loss": 5.0018,
"step": 5285
},
{
"epoch": 2.376460017969452,
"grad_norm": 2.984375,
"learning_rate": 0.0002828099681538708,
"loss": 5.0241,
"step": 5290
},
{
"epoch": 2.3787061994609164,
"grad_norm": 3.109375,
"learning_rate": 0.0002827588215740547,
"loss": 5.0567,
"step": 5295
},
{
"epoch": 2.380952380952381,
"grad_norm": 3.0,
"learning_rate": 0.0002827076042071092,
"loss": 4.9711,
"step": 5300
},
{
"epoch": 2.3831985624438454,
"grad_norm": 3.078125,
"learning_rate": 0.000282656316083822,
"loss": 5.0117,
"step": 5305
},
{
"epoch": 2.38544474393531,
"grad_norm": 3.03125,
"learning_rate": 0.0002826049572350234,
"loss": 4.9413,
"step": 5310
},
{
"epoch": 2.3876909254267744,
"grad_norm": 3.234375,
"learning_rate": 0.00028255352769158623,
"loss": 5.0217,
"step": 5315
},
{
"epoch": 2.389937106918239,
"grad_norm": 3.203125,
"learning_rate": 0.0002825020274844257,
"loss": 4.9966,
"step": 5320
},
{
"epoch": 2.3921832884097034,
"grad_norm": 3.0625,
"learning_rate": 0.00028245045664449973,
"loss": 5.0136,
"step": 5325
},
{
"epoch": 2.394429469901168,
"grad_norm": 3.109375,
"learning_rate": 0.00028239881520280847,
"loss": 5.0281,
"step": 5330
},
{
"epoch": 2.3966756513926324,
"grad_norm": 3.609375,
"learning_rate": 0.00028234710319039466,
"loss": 5.0617,
"step": 5335
},
{
"epoch": 2.398921832884097,
"grad_norm": 3.109375,
"learning_rate": 0.00028229532063834336,
"loss": 4.941,
"step": 5340
},
{
"epoch": 2.4011680143755614,
"grad_norm": 3.171875,
"learning_rate": 0.00028224346757778205,
"loss": 5.0323,
"step": 5345
},
{
"epoch": 2.403414195867026,
"grad_norm": 3.0,
"learning_rate": 0.00028219154403988063,
"loss": 4.9451,
"step": 5350
},
{
"epoch": 2.4056603773584904,
"grad_norm": 2.859375,
"learning_rate": 0.0002821395500558515,
"loss": 5.037,
"step": 5355
},
{
"epoch": 2.4079065588499553,
"grad_norm": 3.046875,
"learning_rate": 0.000282087485656949,
"loss": 4.9873,
"step": 5360
},
{
"epoch": 2.4101527403414194,
"grad_norm": 2.921875,
"learning_rate": 0.00028203535087447025,
"loss": 5.0177,
"step": 5365
},
{
"epoch": 2.4123989218328843,
"grad_norm": 3.171875,
"learning_rate": 0.00028198314573975444,
"loss": 5.1103,
"step": 5370
},
{
"epoch": 2.414645103324349,
"grad_norm": 3.015625,
"learning_rate": 0.00028193087028418305,
"loss": 5.0471,
"step": 5375
},
{
"epoch": 2.4168912848158133,
"grad_norm": 3.3125,
"learning_rate": 0.00028187852453917994,
"loss": 5.021,
"step": 5380
},
{
"epoch": 2.4191374663072778,
"grad_norm": 2.9375,
"learning_rate": 0.0002818261085362111,
"loss": 4.9735,
"step": 5385
},
{
"epoch": 2.4213836477987423,
"grad_norm": 3.296875,
"learning_rate": 0.00028177362230678485,
"loss": 4.9797,
"step": 5390
},
{
"epoch": 2.4236298292902068,
"grad_norm": 3.0,
"learning_rate": 0.00028172106588245175,
"loss": 4.9648,
"step": 5395
},
{
"epoch": 2.4258760107816713,
"grad_norm": 3.0,
"learning_rate": 0.00028166843929480436,
"loss": 5.1115,
"step": 5400
},
{
"epoch": 2.4281221922731357,
"grad_norm": 5.5,
"learning_rate": 0.00028161574257547765,
"loss": 4.9531,
"step": 5405
},
{
"epoch": 2.4303683737646002,
"grad_norm": 2.984375,
"learning_rate": 0.00028156297575614864,
"loss": 4.9762,
"step": 5410
},
{
"epoch": 2.4326145552560647,
"grad_norm": 3.03125,
"learning_rate": 0.00028151013886853647,
"loss": 5.0298,
"step": 5415
},
{
"epoch": 2.4348607367475292,
"grad_norm": 2.921875,
"learning_rate": 0.0002814572319444024,
"loss": 5.0413,
"step": 5420
},
{
"epoch": 2.4371069182389937,
"grad_norm": 3.109375,
"learning_rate": 0.0002814042550155499,
"loss": 5.0045,
"step": 5425
},
{
"epoch": 2.439353099730458,
"grad_norm": 2.921875,
"learning_rate": 0.00028135120811382435,
"loss": 5.0008,
"step": 5430
},
{
"epoch": 2.4415992812219227,
"grad_norm": 3.15625,
"learning_rate": 0.0002812980912711133,
"loss": 5.0294,
"step": 5435
},
{
"epoch": 2.443845462713387,
"grad_norm": 3.359375,
"learning_rate": 0.00028124490451934635,
"loss": 5.0769,
"step": 5440
},
{
"epoch": 2.4460916442048517,
"grad_norm": 3.421875,
"learning_rate": 0.000281191647890495,
"loss": 5.0096,
"step": 5445
},
{
"epoch": 2.448337825696316,
"grad_norm": 3.03125,
"learning_rate": 0.0002811383214165731,
"loss": 4.9805,
"step": 5450
},
{
"epoch": 2.4505840071877807,
"grad_norm": 3.046875,
"learning_rate": 0.000281084925129636,
"loss": 4.949,
"step": 5455
},
{
"epoch": 2.452830188679245,
"grad_norm": 2.9375,
"learning_rate": 0.0002810314590617813,
"loss": 4.9011,
"step": 5460
},
{
"epoch": 2.4550763701707097,
"grad_norm": 2.953125,
"learning_rate": 0.00028097792324514853,
"loss": 4.9974,
"step": 5465
},
{
"epoch": 2.457322551662174,
"grad_norm": 3.234375,
"learning_rate": 0.0002809243177119191,
"loss": 4.9817,
"step": 5470
},
{
"epoch": 2.4595687331536387,
"grad_norm": 2.984375,
"learning_rate": 0.0002808706424943164,
"loss": 5.03,
"step": 5475
},
{
"epoch": 2.461814914645103,
"grad_norm": 2.890625,
"learning_rate": 0.00028081689762460553,
"loss": 5.0556,
"step": 5480
},
{
"epoch": 2.4640610961365677,
"grad_norm": 3.015625,
"learning_rate": 0.00028076308313509365,
"loss": 5.0748,
"step": 5485
},
{
"epoch": 2.466307277628032,
"grad_norm": 2.984375,
"learning_rate": 0.00028070919905812976,
"loss": 4.9664,
"step": 5490
},
{
"epoch": 2.468553459119497,
"grad_norm": 3.109375,
"learning_rate": 0.00028065524542610456,
"loss": 5.0278,
"step": 5495
},
{
"epoch": 2.470799640610961,
"grad_norm": 2.984375,
"learning_rate": 0.00028060122227145065,
"loss": 5.0414,
"step": 5500
},
{
"epoch": 2.473045822102426,
"grad_norm": 3.4375,
"learning_rate": 0.0002805471296266424,
"loss": 4.9741,
"step": 5505
},
{
"epoch": 2.4752920035938906,
"grad_norm": 2.96875,
"learning_rate": 0.00028049296752419593,
"loss": 4.9957,
"step": 5510
},
{
"epoch": 2.477538185085355,
"grad_norm": 2.90625,
"learning_rate": 0.00028043873599666925,
"loss": 5.0025,
"step": 5515
},
{
"epoch": 2.4797843665768196,
"grad_norm": 2.921875,
"learning_rate": 0.0002803844350766618,
"loss": 4.9335,
"step": 5520
},
{
"epoch": 2.482030548068284,
"grad_norm": 3.1875,
"learning_rate": 0.0002803300647968152,
"loss": 4.9839,
"step": 5525
},
{
"epoch": 2.4842767295597485,
"grad_norm": 3.25,
"learning_rate": 0.00028027562518981216,
"loss": 4.9595,
"step": 5530
},
{
"epoch": 2.486522911051213,
"grad_norm": 2.96875,
"learning_rate": 0.0002802211162883776,
"loss": 4.9742,
"step": 5535
},
{
"epoch": 2.4887690925426775,
"grad_norm": 3.390625,
"learning_rate": 0.0002801665381252779,
"loss": 5.0239,
"step": 5540
},
{
"epoch": 2.491015274034142,
"grad_norm": 2.890625,
"learning_rate": 0.0002801118907333209,
"loss": 4.9802,
"step": 5545
},
{
"epoch": 2.4932614555256065,
"grad_norm": 3.015625,
"learning_rate": 0.0002800571741453564,
"loss": 4.9646,
"step": 5550
},
{
"epoch": 2.495507637017071,
"grad_norm": 3.03125,
"learning_rate": 0.0002800023883942755,
"loss": 4.9662,
"step": 5555
},
{
"epoch": 2.4977538185085355,
"grad_norm": 2.890625,
"learning_rate": 0.000279947533513011,
"loss": 5.0099,
"step": 5560
},
{
"epoch": 2.5,
"grad_norm": 3.03125,
"learning_rate": 0.0002798926095345373,
"loss": 4.9691,
"step": 5565
},
{
"epoch": 2.5022461814914645,
"grad_norm": 2.9375,
"learning_rate": 0.00027983761649187015,
"loss": 5.0081,
"step": 5570
},
{
"epoch": 2.504492362982929,
"grad_norm": 3.0,
"learning_rate": 0.00027978255441806713,
"loss": 4.9875,
"step": 5575
},
{
"epoch": 2.5067385444743935,
"grad_norm": 2.890625,
"learning_rate": 0.00027972742334622696,
"loss": 5.0318,
"step": 5580
},
{
"epoch": 2.508984725965858,
"grad_norm": 2.984375,
"learning_rate": 0.00027967222330949006,
"loss": 4.9564,
"step": 5585
},
{
"epoch": 2.5112309074573225,
"grad_norm": 2.96875,
"learning_rate": 0.00027961695434103827,
"loss": 4.9256,
"step": 5590
},
{
"epoch": 2.513477088948787,
"grad_norm": 3.015625,
"learning_rate": 0.00027956161647409486,
"loss": 5.0247,
"step": 5595
},
{
"epoch": 2.5157232704402515,
"grad_norm": 3.234375,
"learning_rate": 0.00027950620974192446,
"loss": 5.0235,
"step": 5600
},
{
"epoch": 2.517969451931716,
"grad_norm": 3.125,
"learning_rate": 0.00027945073417783315,
"loss": 4.956,
"step": 5605
},
{
"epoch": 2.5202156334231804,
"grad_norm": 3.046875,
"learning_rate": 0.0002793951898151684,
"loss": 4.94,
"step": 5610
},
{
"epoch": 2.522461814914645,
"grad_norm": 3.125,
"learning_rate": 0.00027933957668731897,
"loss": 5.0343,
"step": 5615
},
{
"epoch": 2.52470799640611,
"grad_norm": 3.1875,
"learning_rate": 0.0002792838948277151,
"loss": 4.958,
"step": 5620
},
{
"epoch": 2.526954177897574,
"grad_norm": 3.078125,
"learning_rate": 0.0002792281442698281,
"loss": 4.9437,
"step": 5625
},
{
"epoch": 2.529200359389039,
"grad_norm": 3.25,
"learning_rate": 0.0002791723250471708,
"loss": 4.9659,
"step": 5630
},
{
"epoch": 2.531446540880503,
"grad_norm": 3.265625,
"learning_rate": 0.00027911643719329723,
"loss": 4.9834,
"step": 5635
},
{
"epoch": 2.533692722371968,
"grad_norm": 3.171875,
"learning_rate": 0.0002790604807418027,
"loss": 4.9261,
"step": 5640
},
{
"epoch": 2.535938903863432,
"grad_norm": 3.109375,
"learning_rate": 0.0002790044557263236,
"loss": 5.0249,
"step": 5645
},
{
"epoch": 2.538185085354897,
"grad_norm": 3.015625,
"learning_rate": 0.00027894836218053784,
"loss": 4.9651,
"step": 5650
},
{
"epoch": 2.5404312668463613,
"grad_norm": 3.1875,
"learning_rate": 0.00027889220013816416,
"loss": 5.0094,
"step": 5655
},
{
"epoch": 2.542677448337826,
"grad_norm": 3.421875,
"learning_rate": 0.0002788359696329628,
"loss": 4.9669,
"step": 5660
},
{
"epoch": 2.5449236298292903,
"grad_norm": 3.0625,
"learning_rate": 0.00027877967069873494,
"loss": 4.9674,
"step": 5665
},
{
"epoch": 2.547169811320755,
"grad_norm": 3.25,
"learning_rate": 0.0002787233033693231,
"loss": 4.9891,
"step": 5670
},
{
"epoch": 2.5494159928122193,
"grad_norm": 3.0625,
"learning_rate": 0.0002786668676786106,
"loss": 4.913,
"step": 5675
},
{
"epoch": 2.551662174303684,
"grad_norm": 2.84375,
"learning_rate": 0.00027861036366052215,
"loss": 4.9285,
"step": 5680
},
{
"epoch": 2.5539083557951483,
"grad_norm": 3.0,
"learning_rate": 0.0002785537913490233,
"loss": 4.9889,
"step": 5685
},
{
"epoch": 2.556154537286613,
"grad_norm": 3.265625,
"learning_rate": 0.000278497150778121,
"loss": 4.9526,
"step": 5690
},
{
"epoch": 2.5584007187780773,
"grad_norm": 3.109375,
"learning_rate": 0.00027844044198186275,
"loss": 5.052,
"step": 5695
},
{
"epoch": 2.560646900269542,
"grad_norm": 3.375,
"learning_rate": 0.00027838366499433753,
"loss": 4.9283,
"step": 5700
},
{
"epoch": 2.5628930817610063,
"grad_norm": 3.171875,
"learning_rate": 0.00027832681984967493,
"loss": 5.0854,
"step": 5705
},
{
"epoch": 2.5651392632524708,
"grad_norm": 3.3125,
"learning_rate": 0.00027826990658204575,
"loss": 4.8997,
"step": 5710
},
{
"epoch": 2.5673854447439353,
"grad_norm": 3.015625,
"learning_rate": 0.0002782129252256617,
"loss": 5.0075,
"step": 5715
},
{
"epoch": 2.5696316262353998,
"grad_norm": 2.90625,
"learning_rate": 0.0002781558758147754,
"loss": 4.9988,
"step": 5720
},
{
"epoch": 2.5718778077268643,
"grad_norm": 2.90625,
"learning_rate": 0.0002780987583836802,
"loss": 5.0213,
"step": 5725
},
{
"epoch": 2.5741239892183287,
"grad_norm": 3.046875,
"learning_rate": 0.0002780415729667107,
"loss": 4.9287,
"step": 5730
},
{
"epoch": 2.5763701707097932,
"grad_norm": 2.9375,
"learning_rate": 0.0002779843195982421,
"loss": 4.9767,
"step": 5735
},
{
"epoch": 2.5786163522012577,
"grad_norm": 3.140625,
"learning_rate": 0.00027792699831269044,
"loss": 4.9909,
"step": 5740
},
{
"epoch": 2.5808625336927222,
"grad_norm": 3.390625,
"learning_rate": 0.00027786960914451286,
"loss": 5.0782,
"step": 5745
},
{
"epoch": 2.5831087151841867,
"grad_norm": 3.3125,
"learning_rate": 0.00027781215212820684,
"loss": 4.9401,
"step": 5750
},
{
"epoch": 2.5853548966756517,
"grad_norm": 3.03125,
"learning_rate": 0.0002777546272983112,
"loss": 4.973,
"step": 5755
},
{
"epoch": 2.5876010781671157,
"grad_norm": 2.875,
"learning_rate": 0.000277697034689405,
"loss": 4.9916,
"step": 5760
},
{
"epoch": 2.5898472596585806,
"grad_norm": 3.265625,
"learning_rate": 0.00027763937433610843,
"loss": 4.9979,
"step": 5765
},
{
"epoch": 2.5920934411500447,
"grad_norm": 2.984375,
"learning_rate": 0.00027758164627308225,
"loss": 4.9277,
"step": 5770
},
{
"epoch": 2.5943396226415096,
"grad_norm": 3.453125,
"learning_rate": 0.00027752385053502783,
"loss": 5.0059,
"step": 5775
},
{
"epoch": 2.5965858041329737,
"grad_norm": 3.15625,
"learning_rate": 0.0002774659871566874,
"loss": 4.9416,
"step": 5780
},
{
"epoch": 2.5988319856244386,
"grad_norm": 3.046875,
"learning_rate": 0.00027740805617284376,
"loss": 4.9635,
"step": 5785
},
{
"epoch": 2.601078167115903,
"grad_norm": 3.1875,
"learning_rate": 0.0002773500576183203,
"loss": 5.0296,
"step": 5790
},
{
"epoch": 2.6033243486073676,
"grad_norm": 2.9375,
"learning_rate": 0.0002772919915279812,
"loss": 4.9442,
"step": 5795
},
{
"epoch": 2.605570530098832,
"grad_norm": 3.125,
"learning_rate": 0.000277233857936731,
"loss": 4.9411,
"step": 5800
},
{
"epoch": 2.6078167115902966,
"grad_norm": 3.328125,
"learning_rate": 0.000277175656879515,
"loss": 5.0361,
"step": 5805
},
{
"epoch": 2.610062893081761,
"grad_norm": 3.0,
"learning_rate": 0.00027711738839131895,
"loss": 4.9689,
"step": 5810
},
{
"epoch": 2.6123090745732256,
"grad_norm": 3.09375,
"learning_rate": 0.00027705905250716926,
"loss": 4.9799,
"step": 5815
},
{
"epoch": 2.61455525606469,
"grad_norm": 3.015625,
"learning_rate": 0.0002770006492621327,
"loss": 5.0151,
"step": 5820
},
{
"epoch": 2.6168014375561546,
"grad_norm": 2.984375,
"learning_rate": 0.0002769421786913166,
"loss": 4.9397,
"step": 5825
},
{
"epoch": 2.619047619047619,
"grad_norm": 3.171875,
"learning_rate": 0.0002768836408298688,
"loss": 5.0281,
"step": 5830
},
{
"epoch": 2.6212938005390836,
"grad_norm": 3.078125,
"learning_rate": 0.0002768250357129775,
"loss": 4.9577,
"step": 5835
},
{
"epoch": 2.623539982030548,
"grad_norm": 2.890625,
"learning_rate": 0.00027676636337587145,
"loss": 4.9567,
"step": 5840
},
{
"epoch": 2.6257861635220126,
"grad_norm": 3.140625,
"learning_rate": 0.00027670762385381974,
"loss": 4.9665,
"step": 5845
},
{
"epoch": 2.628032345013477,
"grad_norm": 3.25,
"learning_rate": 0.00027664881718213175,
"loss": 4.9683,
"step": 5850
},
{
"epoch": 2.6302785265049415,
"grad_norm": 3.0,
"learning_rate": 0.0002765899433961574,
"loss": 4.9659,
"step": 5855
},
{
"epoch": 2.632524707996406,
"grad_norm": 3.265625,
"learning_rate": 0.00027653100253128687,
"loss": 4.9306,
"step": 5860
},
{
"epoch": 2.6347708894878705,
"grad_norm": 3.53125,
"learning_rate": 0.00027647199462295065,
"loss": 4.9186,
"step": 5865
},
{
"epoch": 2.637017070979335,
"grad_norm": 3.28125,
"learning_rate": 0.00027641291970661953,
"loss": 4.9603,
"step": 5870
},
{
"epoch": 2.6392632524707995,
"grad_norm": 2.953125,
"learning_rate": 0.00027635377781780465,
"loss": 4.9935,
"step": 5875
},
{
"epoch": 2.641509433962264,
"grad_norm": 3.015625,
"learning_rate": 0.00027629456899205725,
"loss": 5.0406,
"step": 5880
},
{
"epoch": 2.6437556154537285,
"grad_norm": 3.203125,
"learning_rate": 0.00027623529326496906,
"loss": 4.9832,
"step": 5885
},
{
"epoch": 2.6460017969451934,
"grad_norm": 3.609375,
"learning_rate": 0.0002761759506721717,
"loss": 4.9561,
"step": 5890
},
{
"epoch": 2.6482479784366575,
"grad_norm": 2.921875,
"learning_rate": 0.0002761165412493373,
"loss": 4.9534,
"step": 5895
},
{
"epoch": 2.6504941599281224,
"grad_norm": 3.015625,
"learning_rate": 0.00027605706503217806,
"loss": 4.9841,
"step": 5900
},
{
"epoch": 2.6527403414195865,
"grad_norm": 3.234375,
"learning_rate": 0.0002759975220564462,
"loss": 5.1202,
"step": 5905
},
{
"epoch": 2.6549865229110514,
"grad_norm": 3.171875,
"learning_rate": 0.0002759379123579341,
"loss": 4.9232,
"step": 5910
},
{
"epoch": 2.6572327044025155,
"grad_norm": 3.140625,
"learning_rate": 0.0002758782359724745,
"loss": 4.923,
"step": 5915
},
{
"epoch": 2.6594788858939804,
"grad_norm": 3.375,
"learning_rate": 0.00027581849293593994,
"loss": 4.9812,
"step": 5920
},
{
"epoch": 2.661725067385445,
"grad_norm": 3.046875,
"learning_rate": 0.00027575868328424307,
"loss": 5.078,
"step": 5925
},
{
"epoch": 2.6639712488769094,
"grad_norm": 3.28125,
"learning_rate": 0.00027569880705333676,
"loss": 4.9466,
"step": 5930
},
{
"epoch": 2.666217430368374,
"grad_norm": 2.984375,
"learning_rate": 0.00027563886427921377,
"loss": 4.9025,
"step": 5935
},
{
"epoch": 2.6684636118598384,
"grad_norm": 3.015625,
"learning_rate": 0.00027557885499790674,
"loss": 4.9652,
"step": 5940
},
{
"epoch": 2.670709793351303,
"grad_norm": 3.0,
"learning_rate": 0.00027551877924548854,
"loss": 5.0297,
"step": 5945
},
{
"epoch": 2.6729559748427674,
"grad_norm": 3.171875,
"learning_rate": 0.0002754586370580719,
"loss": 4.9552,
"step": 5950
},
{
"epoch": 2.675202156334232,
"grad_norm": 3.328125,
"learning_rate": 0.00027539842847180935,
"loss": 5.0043,
"step": 5955
},
{
"epoch": 2.6774483378256964,
"grad_norm": 2.90625,
"learning_rate": 0.00027533815352289353,
"loss": 4.9195,
"step": 5960
},
{
"epoch": 2.679694519317161,
"grad_norm": 3.015625,
"learning_rate": 0.00027527781224755696,
"loss": 4.9867,
"step": 5965
},
{
"epoch": 2.6819407008086253,
"grad_norm": 3.015625,
"learning_rate": 0.0002752174046820718,
"loss": 4.9602,
"step": 5970
},
{
"epoch": 2.68418688230009,
"grad_norm": 3.03125,
"learning_rate": 0.00027515693086275025,
"loss": 4.9846,
"step": 5975
},
{
"epoch": 2.6864330637915543,
"grad_norm": 3.0,
"learning_rate": 0.0002750963908259445,
"loss": 4.9414,
"step": 5980
},
{
"epoch": 2.688679245283019,
"grad_norm": 3.0625,
"learning_rate": 0.00027503578460804604,
"loss": 4.9944,
"step": 5985
},
{
"epoch": 2.6909254267744833,
"grad_norm": 2.96875,
"learning_rate": 0.00027497511224548667,
"loss": 5.0515,
"step": 5990
},
{
"epoch": 2.693171608265948,
"grad_norm": 3.484375,
"learning_rate": 0.0002749143737747377,
"loss": 4.9846,
"step": 5995
},
{
"epoch": 2.6954177897574123,
"grad_norm": 3.984375,
"learning_rate": 0.00027485356923231014,
"loss": 4.9318,
"step": 6000
},
{
"epoch": 2.6954177897574123,
"eval_loss": 5.004312992095947,
"eval_runtime": 16.1624,
"eval_samples_per_second": 1918.841,
"eval_steps_per_second": 239.878,
"step": 6000
},
{
"epoch": 2.697663971248877,
"grad_norm": 3.078125,
"learning_rate": 0.00027479269865475487,
"loss": 4.9818,
"step": 6005
},
{
"epoch": 2.6999101527403413,
"grad_norm": 3.0,
"learning_rate": 0.0002747317620786623,
"loss": 4.9563,
"step": 6010
},
{
"epoch": 2.702156334231806,
"grad_norm": 3.53125,
"learning_rate": 0.0002746707595406627,
"loss": 4.9565,
"step": 6015
},
{
"epoch": 2.7044025157232703,
"grad_norm": 2.890625,
"learning_rate": 0.0002746096910774258,
"loss": 4.955,
"step": 6020
},
{
"epoch": 2.706648697214735,
"grad_norm": 2.890625,
"learning_rate": 0.00027454855672566107,
"loss": 4.948,
"step": 6025
},
{
"epoch": 2.7088948787061993,
"grad_norm": 3.4375,
"learning_rate": 0.0002744873565221176,
"loss": 4.9657,
"step": 6030
},
{
"epoch": 2.711141060197664,
"grad_norm": 2.953125,
"learning_rate": 0.000274426090503584,
"loss": 4.9173,
"step": 6035
},
{
"epoch": 2.7133872416891283,
"grad_norm": 3.265625,
"learning_rate": 0.00027436475870688847,
"loss": 4.9704,
"step": 6040
},
{
"epoch": 2.715633423180593,
"grad_norm": 2.96875,
"learning_rate": 0.00027430336116889876,
"loss": 4.9755,
"step": 6045
},
{
"epoch": 2.7178796046720572,
"grad_norm": 3.1875,
"learning_rate": 0.00027424189792652214,
"loss": 4.9371,
"step": 6050
},
{
"epoch": 2.720125786163522,
"grad_norm": 2.90625,
"learning_rate": 0.00027418036901670533,
"loss": 4.9885,
"step": 6055
},
{
"epoch": 2.7223719676549867,
"grad_norm": 3.0625,
"learning_rate": 0.00027411877447643454,
"loss": 4.8649,
"step": 6060
},
{
"epoch": 2.724618149146451,
"grad_norm": 3.203125,
"learning_rate": 0.0002740571143427356,
"loss": 4.9875,
"step": 6065
},
{
"epoch": 2.7268643306379157,
"grad_norm": 3.078125,
"learning_rate": 0.00027399538865267343,
"loss": 4.9599,
"step": 6070
},
{
"epoch": 2.72911051212938,
"grad_norm": 3.03125,
"learning_rate": 0.0002739335974433527,
"loss": 4.9258,
"step": 6075
},
{
"epoch": 2.7313566936208447,
"grad_norm": 3.1875,
"learning_rate": 0.0002738717407519172,
"loss": 4.9696,
"step": 6080
},
{
"epoch": 2.733602875112309,
"grad_norm": 3.6875,
"learning_rate": 0.00027380981861555026,
"loss": 4.9673,
"step": 6085
},
{
"epoch": 2.7358490566037736,
"grad_norm": 3.09375,
"learning_rate": 0.00027374783107147446,
"loss": 5.0333,
"step": 6090
},
{
"epoch": 2.738095238095238,
"grad_norm": 3.03125,
"learning_rate": 0.00027368577815695176,
"loss": 4.9061,
"step": 6095
},
{
"epoch": 2.7403414195867026,
"grad_norm": 3.171875,
"learning_rate": 0.0002736236599092833,
"loss": 4.8905,
"step": 6100
},
{
"epoch": 2.742587601078167,
"grad_norm": 3.09375,
"learning_rate": 0.0002735614763658097,
"loss": 4.9118,
"step": 6105
},
{
"epoch": 2.7448337825696316,
"grad_norm": 3.4375,
"learning_rate": 0.0002734992275639106,
"loss": 5.0166,
"step": 6110
},
{
"epoch": 2.747079964061096,
"grad_norm": 3.03125,
"learning_rate": 0.000273436913541005,
"loss": 4.9696,
"step": 6115
},
{
"epoch": 2.7493261455525606,
"grad_norm": 3.109375,
"learning_rate": 0.0002733745343345511,
"loss": 4.9343,
"step": 6120
},
{
"epoch": 2.751572327044025,
"grad_norm": 3.046875,
"learning_rate": 0.00027331208998204623,
"loss": 4.9587,
"step": 6125
},
{
"epoch": 2.7538185085354896,
"grad_norm": 3.015625,
"learning_rate": 0.00027324958052102696,
"loss": 4.8604,
"step": 6130
},
{
"epoch": 2.756064690026954,
"grad_norm": 3.140625,
"learning_rate": 0.00027318700598906887,
"loss": 4.8835,
"step": 6135
},
{
"epoch": 2.7583108715184186,
"grad_norm": 3.0,
"learning_rate": 0.0002731243664237868,
"loss": 4.9459,
"step": 6140
},
{
"epoch": 2.760557053009883,
"grad_norm": 3.09375,
"learning_rate": 0.00027306166186283457,
"loss": 4.912,
"step": 6145
},
{
"epoch": 2.7628032345013476,
"grad_norm": 3.109375,
"learning_rate": 0.00027299889234390514,
"loss": 4.962,
"step": 6150
},
{
"epoch": 2.765049415992812,
"grad_norm": 3.296875,
"learning_rate": 0.0002729360579047305,
"loss": 4.8934,
"step": 6155
},
{
"epoch": 2.767295597484277,
"grad_norm": 3.15625,
"learning_rate": 0.00027287315858308164,
"loss": 4.9474,
"step": 6160
},
{
"epoch": 2.769541778975741,
"grad_norm": 3.09375,
"learning_rate": 0.00027281019441676856,
"loss": 4.9378,
"step": 6165
},
{
"epoch": 2.771787960467206,
"grad_norm": 2.984375,
"learning_rate": 0.00027274716544364034,
"loss": 4.9604,
"step": 6170
},
{
"epoch": 2.77403414195867,
"grad_norm": 3.09375,
"learning_rate": 0.00027268407170158486,
"loss": 4.9024,
"step": 6175
},
{
"epoch": 2.776280323450135,
"grad_norm": 2.859375,
"learning_rate": 0.00027262091322852893,
"loss": 4.9575,
"step": 6180
},
{
"epoch": 2.778526504941599,
"grad_norm": 2.984375,
"learning_rate": 0.00027255769006243855,
"loss": 4.9526,
"step": 6185
},
{
"epoch": 2.780772686433064,
"grad_norm": 2.875,
"learning_rate": 0.00027249440224131813,
"loss": 4.9385,
"step": 6190
},
{
"epoch": 2.7830188679245285,
"grad_norm": 3.09375,
"learning_rate": 0.0002724310498032115,
"loss": 4.9669,
"step": 6195
},
{
"epoch": 2.785265049415993,
"grad_norm": 3.0,
"learning_rate": 0.0002723676327862008,
"loss": 4.9392,
"step": 6200
},
{
"epoch": 2.7875112309074574,
"grad_norm": 3.125,
"learning_rate": 0.00027230415122840736,
"loss": 4.873,
"step": 6205
},
{
"epoch": 2.789757412398922,
"grad_norm": 3.390625,
"learning_rate": 0.0002722406051679912,
"loss": 4.9445,
"step": 6210
},
{
"epoch": 2.7920035938903864,
"grad_norm": 3.25,
"learning_rate": 0.00027217699464315105,
"loss": 4.9907,
"step": 6215
},
{
"epoch": 2.794249775381851,
"grad_norm": 3.171875,
"learning_rate": 0.00027211331969212443,
"loss": 5.0226,
"step": 6220
},
{
"epoch": 2.7964959568733154,
"grad_norm": 3.125,
"learning_rate": 0.00027204958035318766,
"loss": 4.9335,
"step": 6225
},
{
"epoch": 2.79874213836478,
"grad_norm": 3.203125,
"learning_rate": 0.00027198577666465574,
"loss": 4.9036,
"step": 6230
},
{
"epoch": 2.8009883198562444,
"grad_norm": 3.125,
"learning_rate": 0.0002719219086648821,
"loss": 4.9858,
"step": 6235
},
{
"epoch": 2.803234501347709,
"grad_norm": 3.484375,
"learning_rate": 0.0002718579763922593,
"loss": 4.9541,
"step": 6240
},
{
"epoch": 2.8054806828391734,
"grad_norm": 3.703125,
"learning_rate": 0.0002717939798852181,
"loss": 4.918,
"step": 6245
},
{
"epoch": 2.807726864330638,
"grad_norm": 3.09375,
"learning_rate": 0.0002717299191822281,
"loss": 4.9336,
"step": 6250
},
{
"epoch": 2.8099730458221024,
"grad_norm": 3.109375,
"learning_rate": 0.0002716657943217975,
"loss": 4.9163,
"step": 6255
},
{
"epoch": 2.812219227313567,
"grad_norm": 3.4375,
"learning_rate": 0.000271601605342473,
"loss": 4.8734,
"step": 6260
},
{
"epoch": 2.8144654088050314,
"grad_norm": 3.265625,
"learning_rate": 0.00027153735228283975,
"loss": 4.9175,
"step": 6265
},
{
"epoch": 2.816711590296496,
"grad_norm": 3.21875,
"learning_rate": 0.0002714730351815216,
"loss": 5.0152,
"step": 6270
},
{
"epoch": 2.8189577717879604,
"grad_norm": 3.03125,
"learning_rate": 0.0002714086540771808,
"loss": 4.9391,
"step": 6275
},
{
"epoch": 2.821203953279425,
"grad_norm": 3.03125,
"learning_rate": 0.0002713442090085181,
"loss": 4.9086,
"step": 6280
},
{
"epoch": 2.8234501347708894,
"grad_norm": 3.109375,
"learning_rate": 0.0002712797000142727,
"loss": 4.9808,
"step": 6285
},
{
"epoch": 2.825696316262354,
"grad_norm": 3.453125,
"learning_rate": 0.0002712151271332222,
"loss": 4.9397,
"step": 6290
},
{
"epoch": 2.827942497753819,
"grad_norm": 3.203125,
"learning_rate": 0.00027115049040418254,
"loss": 4.8799,
"step": 6295
},
{
"epoch": 2.830188679245283,
"grad_norm": 2.9375,
"learning_rate": 0.0002710857898660082,
"loss": 4.9463,
"step": 6300
},
{
"epoch": 2.8324348607367478,
"grad_norm": 3.09375,
"learning_rate": 0.00027102102555759205,
"loss": 4.9161,
"step": 6305
},
{
"epoch": 2.834681042228212,
"grad_norm": 2.828125,
"learning_rate": 0.000270956197517865,
"loss": 4.9685,
"step": 6310
},
{
"epoch": 2.8369272237196768,
"grad_norm": 2.96875,
"learning_rate": 0.0002708913057857965,
"loss": 4.9146,
"step": 6315
},
{
"epoch": 2.839173405211141,
"grad_norm": 2.921875,
"learning_rate": 0.00027082635040039435,
"loss": 4.9201,
"step": 6320
},
{
"epoch": 2.8414195867026057,
"grad_norm": 3.0,
"learning_rate": 0.0002707613314007044,
"loss": 4.9169,
"step": 6325
},
{
"epoch": 2.8436657681940702,
"grad_norm": 3.109375,
"learning_rate": 0.00027069624882581077,
"loss": 4.9624,
"step": 6330
},
{
"epoch": 2.8459119496855347,
"grad_norm": 3.078125,
"learning_rate": 0.000270631102714836,
"loss": 4.9702,
"step": 6335
},
{
"epoch": 2.8481581311769992,
"grad_norm": 3.015625,
"learning_rate": 0.0002705658931069406,
"loss": 4.9867,
"step": 6340
},
{
"epoch": 2.8504043126684637,
"grad_norm": 2.90625,
"learning_rate": 0.0002705006200413235,
"loss": 4.9874,
"step": 6345
},
{
"epoch": 2.852650494159928,
"grad_norm": 2.921875,
"learning_rate": 0.00027043528355722135,
"loss": 4.935,
"step": 6350
},
{
"epoch": 2.8548966756513927,
"grad_norm": 3.03125,
"learning_rate": 0.00027036988369390946,
"loss": 4.8973,
"step": 6355
},
{
"epoch": 2.857142857142857,
"grad_norm": 2.9375,
"learning_rate": 0.00027030442049070076,
"loss": 4.8817,
"step": 6360
},
{
"epoch": 2.8593890386343217,
"grad_norm": 3.140625,
"learning_rate": 0.0002702388939869466,
"loss": 4.9394,
"step": 6365
},
{
"epoch": 2.861635220125786,
"grad_norm": 3.125,
"learning_rate": 0.00027017330422203614,
"loss": 4.844,
"step": 6370
},
{
"epoch": 2.8638814016172507,
"grad_norm": 3.015625,
"learning_rate": 0.0002701076512353968,
"loss": 4.8938,
"step": 6375
},
{
"epoch": 2.866127583108715,
"grad_norm": 2.84375,
"learning_rate": 0.00027004193506649374,
"loss": 4.9167,
"step": 6380
},
{
"epoch": 2.8683737646001797,
"grad_norm": 3.03125,
"learning_rate": 0.00026997615575483026,
"loss": 4.9192,
"step": 6385
},
{
"epoch": 2.870619946091644,
"grad_norm": 3.109375,
"learning_rate": 0.0002699103133399476,
"loss": 4.9162,
"step": 6390
},
{
"epoch": 2.8728661275831087,
"grad_norm": 3.015625,
"learning_rate": 0.00026984440786142496,
"loss": 4.9291,
"step": 6395
},
{
"epoch": 2.875112309074573,
"grad_norm": 3.28125,
"learning_rate": 0.0002697784393588794,
"loss": 5.0111,
"step": 6400
},
{
"epoch": 2.8773584905660377,
"grad_norm": 3.15625,
"learning_rate": 0.0002697124078719659,
"loss": 5.0144,
"step": 6405
},
{
"epoch": 2.879604672057502,
"grad_norm": 2.921875,
"learning_rate": 0.00026964631344037713,
"loss": 4.9051,
"step": 6410
},
{
"epoch": 2.8818508535489666,
"grad_norm": 3.25,
"learning_rate": 0.00026958015610384394,
"loss": 4.9883,
"step": 6415
},
{
"epoch": 2.884097035040431,
"grad_norm": 3.078125,
"learning_rate": 0.00026951393590213474,
"loss": 5.031,
"step": 6420
},
{
"epoch": 2.8863432165318956,
"grad_norm": 3.75,
"learning_rate": 0.0002694476528750557,
"loss": 4.9772,
"step": 6425
},
{
"epoch": 2.88858939802336,
"grad_norm": 3.1875,
"learning_rate": 0.0002693813070624509,
"loss": 4.989,
"step": 6430
},
{
"epoch": 2.8908355795148246,
"grad_norm": 3.109375,
"learning_rate": 0.00026931489850420213,
"loss": 4.9268,
"step": 6435
},
{
"epoch": 2.8930817610062896,
"grad_norm": 3.03125,
"learning_rate": 0.0002692484272402288,
"loss": 4.9483,
"step": 6440
},
{
"epoch": 2.8953279424977536,
"grad_norm": 3.15625,
"learning_rate": 0.00026918189331048825,
"loss": 4.9406,
"step": 6445
},
{
"epoch": 2.8975741239892185,
"grad_norm": 2.9375,
"learning_rate": 0.00026911529675497514,
"loss": 4.9445,
"step": 6450
},
{
"epoch": 2.8998203054806826,
"grad_norm": 3.296875,
"learning_rate": 0.00026904863761372205,
"loss": 4.8945,
"step": 6455
},
{
"epoch": 2.9020664869721475,
"grad_norm": 3.171875,
"learning_rate": 0.0002689819159267991,
"loss": 4.944,
"step": 6460
},
{
"epoch": 2.904312668463612,
"grad_norm": 3.0625,
"learning_rate": 0.00026891513173431394,
"loss": 4.8718,
"step": 6465
},
{
"epoch": 2.9065588499550765,
"grad_norm": 3.328125,
"learning_rate": 0.0002688482850764119,
"loss": 4.9186,
"step": 6470
},
{
"epoch": 2.908805031446541,
"grad_norm": 3.140625,
"learning_rate": 0.0002687813759932758,
"loss": 4.9278,
"step": 6475
},
{
"epoch": 2.9110512129380055,
"grad_norm": 3.734375,
"learning_rate": 0.000268714404525126,
"loss": 4.9514,
"step": 6480
},
{
"epoch": 2.91329739442947,
"grad_norm": 3.015625,
"learning_rate": 0.0002686473707122204,
"loss": 4.8925,
"step": 6485
},
{
"epoch": 2.9155435759209345,
"grad_norm": 3.0,
"learning_rate": 0.00026858027459485427,
"loss": 4.864,
"step": 6490
},
{
"epoch": 2.917789757412399,
"grad_norm": 3.125,
"learning_rate": 0.0002685131162133604,
"loss": 4.9311,
"step": 6495
},
{
"epoch": 2.9200359389038635,
"grad_norm": 3.09375,
"learning_rate": 0.0002684458956081091,
"loss": 4.9183,
"step": 6500
},
{
"epoch": 2.922282120395328,
"grad_norm": 3.296875,
"learning_rate": 0.00026837861281950786,
"loss": 5.0522,
"step": 6505
},
{
"epoch": 2.9245283018867925,
"grad_norm": 3.1875,
"learning_rate": 0.00026831126788800174,
"loss": 4.8712,
"step": 6510
},
{
"epoch": 2.926774483378257,
"grad_norm": 3.125,
"learning_rate": 0.00026824386085407307,
"loss": 4.9477,
"step": 6515
},
{
"epoch": 2.9290206648697215,
"grad_norm": 3.03125,
"learning_rate": 0.0002681763917582416,
"loss": 4.8396,
"step": 6520
},
{
"epoch": 2.931266846361186,
"grad_norm": 3.53125,
"learning_rate": 0.00026810886064106425,
"loss": 5.0069,
"step": 6525
},
{
"epoch": 2.9335130278526504,
"grad_norm": 3.328125,
"learning_rate": 0.00026804126754313533,
"loss": 4.9952,
"step": 6530
},
{
"epoch": 2.935759209344115,
"grad_norm": 3.078125,
"learning_rate": 0.00026797361250508644,
"loss": 4.8484,
"step": 6535
},
{
"epoch": 2.9380053908355794,
"grad_norm": 3.890625,
"learning_rate": 0.0002679058955675862,
"loss": 5.0126,
"step": 6540
},
{
"epoch": 2.940251572327044,
"grad_norm": 3.015625,
"learning_rate": 0.00026783811677134065,
"loss": 4.9527,
"step": 6545
},
{
"epoch": 2.9424977538185084,
"grad_norm": 2.890625,
"learning_rate": 0.00026777027615709304,
"loss": 4.9737,
"step": 6550
},
{
"epoch": 2.944743935309973,
"grad_norm": 3.796875,
"learning_rate": 0.0002677023737656235,
"loss": 4.9099,
"step": 6555
},
{
"epoch": 2.9469901168014374,
"grad_norm": 2.984375,
"learning_rate": 0.00026763440963774966,
"loss": 4.8881,
"step": 6560
},
{
"epoch": 2.949236298292902,
"grad_norm": 3.25,
"learning_rate": 0.00026756638381432603,
"loss": 4.9634,
"step": 6565
},
{
"epoch": 2.9514824797843664,
"grad_norm": 3.015625,
"learning_rate": 0.0002674982963362442,
"loss": 4.8719,
"step": 6570
},
{
"epoch": 2.9537286612758313,
"grad_norm": 3.125,
"learning_rate": 0.00026743014724443293,
"loss": 4.969,
"step": 6575
},
{
"epoch": 2.9559748427672954,
"grad_norm": 3.03125,
"learning_rate": 0.000267361936579858,
"loss": 4.917,
"step": 6580
},
{
"epoch": 2.9582210242587603,
"grad_norm": 3.109375,
"learning_rate": 0.00026729366438352215,
"loss": 4.9,
"step": 6585
},
{
"epoch": 2.9604672057502244,
"grad_norm": 3.015625,
"learning_rate": 0.0002672253306964651,
"loss": 4.9521,
"step": 6590
},
{
"epoch": 2.9627133872416893,
"grad_norm": 3.328125,
"learning_rate": 0.0002671569355597637,
"loss": 4.9743,
"step": 6595
},
{
"epoch": 2.964959568733154,
"grad_norm": 3.09375,
"learning_rate": 0.0002670884790145314,
"loss": 4.9766,
"step": 6600
},
{
"epoch": 2.9672057502246183,
"grad_norm": 3.109375,
"learning_rate": 0.0002670199611019189,
"loss": 4.96,
"step": 6605
},
{
"epoch": 2.969451931716083,
"grad_norm": 3.046875,
"learning_rate": 0.00026695138186311364,
"loss": 4.8694,
"step": 6610
},
{
"epoch": 2.9716981132075473,
"grad_norm": 3.15625,
"learning_rate": 0.0002668827413393399,
"loss": 4.8839,
"step": 6615
},
{
"epoch": 2.973944294699012,
"grad_norm": 3.09375,
"learning_rate": 0.0002668140395718588,
"loss": 4.8949,
"step": 6620
},
{
"epoch": 2.9761904761904763,
"grad_norm": 3.0625,
"learning_rate": 0.0002667452766019685,
"loss": 4.8834,
"step": 6625
},
{
"epoch": 2.9784366576819408,
"grad_norm": 3.0625,
"learning_rate": 0.00026667645247100357,
"loss": 4.9984,
"step": 6630
},
{
"epoch": 2.9806828391734053,
"grad_norm": 3.015625,
"learning_rate": 0.0002666075672203356,
"loss": 4.8534,
"step": 6635
},
{
"epoch": 2.9829290206648698,
"grad_norm": 3.171875,
"learning_rate": 0.00026653862089137296,
"loss": 4.9614,
"step": 6640
},
{
"epoch": 2.9851752021563343,
"grad_norm": 3.09375,
"learning_rate": 0.0002664696135255605,
"loss": 4.8943,
"step": 6645
},
{
"epoch": 2.9874213836477987,
"grad_norm": 2.859375,
"learning_rate": 0.00026640054516437997,
"loss": 4.8697,
"step": 6650
},
{
"epoch": 2.9896675651392632,
"grad_norm": 3.46875,
"learning_rate": 0.0002663314158493496,
"loss": 4.9618,
"step": 6655
},
{
"epoch": 2.9919137466307277,
"grad_norm": 3.09375,
"learning_rate": 0.00026626222562202456,
"loss": 4.8952,
"step": 6660
},
{
"epoch": 2.9941599281221922,
"grad_norm": 3.140625,
"learning_rate": 0.00026619297452399633,
"loss": 4.9478,
"step": 6665
},
{
"epoch": 2.9964061096136567,
"grad_norm": 3.03125,
"learning_rate": 0.0002661236625968931,
"loss": 4.9124,
"step": 6670
},
{
"epoch": 2.998652291105121,
"grad_norm": 3.328125,
"learning_rate": 0.00026605428988237965,
"loss": 4.9819,
"step": 6675
},
{
"epoch": 3.0008984725965857,
"grad_norm": 3.375,
"learning_rate": 0.0002659848564221573,
"loss": 4.8854,
"step": 6680
},
{
"epoch": 3.00314465408805,
"grad_norm": 3.296875,
"learning_rate": 0.0002659153622579638,
"loss": 4.8016,
"step": 6685
},
{
"epoch": 3.0053908355795147,
"grad_norm": 3.859375,
"learning_rate": 0.0002658458074315735,
"loss": 4.8727,
"step": 6690
},
{
"epoch": 3.007637017070979,
"grad_norm": 3.046875,
"learning_rate": 0.0002657761919847971,
"loss": 4.824,
"step": 6695
},
{
"epoch": 3.0098831985624437,
"grad_norm": 3.015625,
"learning_rate": 0.0002657065159594819,
"loss": 4.8335,
"step": 6700
},
{
"epoch": 3.012129380053908,
"grad_norm": 3.0,
"learning_rate": 0.00026563677939751146,
"loss": 4.8298,
"step": 6705
},
{
"epoch": 3.0143755615453727,
"grad_norm": 3.125,
"learning_rate": 0.00026556698234080577,
"loss": 4.8667,
"step": 6710
},
{
"epoch": 3.016621743036837,
"grad_norm": 3.328125,
"learning_rate": 0.0002654971248313213,
"loss": 4.8908,
"step": 6715
},
{
"epoch": 3.018867924528302,
"grad_norm": 3.453125,
"learning_rate": 0.0002654272069110507,
"loss": 4.7931,
"step": 6720
},
{
"epoch": 3.0211141060197666,
"grad_norm": 3.296875,
"learning_rate": 0.0002653572286220229,
"loss": 4.7856,
"step": 6725
},
{
"epoch": 3.023360287511231,
"grad_norm": 3.078125,
"learning_rate": 0.0002652871900063034,
"loss": 4.7742,
"step": 6730
},
{
"epoch": 3.0256064690026956,
"grad_norm": 3.203125,
"learning_rate": 0.0002652170911059937,
"loss": 4.8128,
"step": 6735
},
{
"epoch": 3.02785265049416,
"grad_norm": 3.1875,
"learning_rate": 0.0002651469319632316,
"loss": 4.8296,
"step": 6740
},
{
"epoch": 3.0300988319856246,
"grad_norm": 3.46875,
"learning_rate": 0.00026507671262019115,
"loss": 4.8438,
"step": 6745
},
{
"epoch": 3.032345013477089,
"grad_norm": 3.078125,
"learning_rate": 0.00026500643311908257,
"loss": 4.7799,
"step": 6750
},
{
"epoch": 3.0345911949685536,
"grad_norm": 3.171875,
"learning_rate": 0.0002649360935021522,
"loss": 4.845,
"step": 6755
},
{
"epoch": 3.036837376460018,
"grad_norm": 3.28125,
"learning_rate": 0.00026486569381168267,
"loss": 4.8419,
"step": 6760
},
{
"epoch": 3.0390835579514826,
"grad_norm": 3.265625,
"learning_rate": 0.0002647952340899925,
"loss": 4.8683,
"step": 6765
},
{
"epoch": 3.041329739442947,
"grad_norm": 3.5,
"learning_rate": 0.0002647247143794365,
"loss": 4.8123,
"step": 6770
},
{
"epoch": 3.0435759209344115,
"grad_norm": 3.015625,
"learning_rate": 0.00026465413472240534,
"loss": 4.8202,
"step": 6775
},
{
"epoch": 3.045822102425876,
"grad_norm": 3.0625,
"learning_rate": 0.0002645834951613259,
"loss": 4.8717,
"step": 6780
},
{
"epoch": 3.0480682839173405,
"grad_norm": 3.0625,
"learning_rate": 0.00026451279573866095,
"loss": 4.7503,
"step": 6785
},
{
"epoch": 3.050314465408805,
"grad_norm": 3.140625,
"learning_rate": 0.0002644420364969094,
"loss": 4.8023,
"step": 6790
},
{
"epoch": 3.0525606469002695,
"grad_norm": 3.25,
"learning_rate": 0.0002643712174786059,
"loss": 4.8349,
"step": 6795
},
{
"epoch": 3.054806828391734,
"grad_norm": 3.109375,
"learning_rate": 0.00026430033872632116,
"loss": 4.7862,
"step": 6800
},
{
"epoch": 3.0570530098831985,
"grad_norm": 4.0,
"learning_rate": 0.00026422940028266183,
"loss": 4.8704,
"step": 6805
},
{
"epoch": 3.059299191374663,
"grad_norm": 3.40625,
"learning_rate": 0.0002641584021902704,
"loss": 4.8366,
"step": 6810
},
{
"epoch": 3.0615453728661275,
"grad_norm": 3.296875,
"learning_rate": 0.0002640873444918252,
"loss": 4.8354,
"step": 6815
},
{
"epoch": 3.063791554357592,
"grad_norm": 3.234375,
"learning_rate": 0.00026401622723004034,
"loss": 4.8597,
"step": 6820
},
{
"epoch": 3.0660377358490565,
"grad_norm": 3.03125,
"learning_rate": 0.00026394505044766587,
"loss": 4.8656,
"step": 6825
},
{
"epoch": 3.068283917340521,
"grad_norm": 3.203125,
"learning_rate": 0.0002638738141874876,
"loss": 4.8846,
"step": 6830
},
{
"epoch": 3.0705300988319855,
"grad_norm": 3.140625,
"learning_rate": 0.00026380251849232687,
"loss": 4.8764,
"step": 6835
},
{
"epoch": 3.07277628032345,
"grad_norm": 3.046875,
"learning_rate": 0.00026373116340504103,
"loss": 4.7835,
"step": 6840
},
{
"epoch": 3.0750224618149145,
"grad_norm": 3.15625,
"learning_rate": 0.00026365974896852296,
"loss": 4.8515,
"step": 6845
},
{
"epoch": 3.077268643306379,
"grad_norm": 3.234375,
"learning_rate": 0.0002635882752257013,
"loss": 4.8444,
"step": 6850
},
{
"epoch": 3.079514824797844,
"grad_norm": 3.046875,
"learning_rate": 0.00026351674221954043,
"loss": 4.7379,
"step": 6855
},
{
"epoch": 3.0817610062893084,
"grad_norm": 3.359375,
"learning_rate": 0.0002634451499930401,
"loss": 4.8479,
"step": 6860
},
{
"epoch": 3.084007187780773,
"grad_norm": 3.203125,
"learning_rate": 0.0002633734985892358,
"loss": 4.7955,
"step": 6865
},
{
"epoch": 3.0862533692722374,
"grad_norm": 2.953125,
"learning_rate": 0.00026330178805119853,
"loss": 4.8404,
"step": 6870
},
{
"epoch": 3.088499550763702,
"grad_norm": 3.34375,
"learning_rate": 0.00026323001842203504,
"loss": 4.8335,
"step": 6875
},
{
"epoch": 3.0907457322551664,
"grad_norm": 3.1875,
"learning_rate": 0.00026315818974488744,
"loss": 4.8428,
"step": 6880
},
{
"epoch": 3.092991913746631,
"grad_norm": 3.21875,
"learning_rate": 0.00026308630206293325,
"loss": 4.7151,
"step": 6885
},
{
"epoch": 3.0952380952380953,
"grad_norm": 3.046875,
"learning_rate": 0.0002630143554193857,
"loss": 4.8407,
"step": 6890
},
{
"epoch": 3.09748427672956,
"grad_norm": 3.078125,
"learning_rate": 0.00026294234985749313,
"loss": 4.7312,
"step": 6895
},
{
"epoch": 3.0997304582210243,
"grad_norm": 3.25,
"learning_rate": 0.00026287028542053975,
"loss": 4.7925,
"step": 6900
},
{
"epoch": 3.101976639712489,
"grad_norm": 3.203125,
"learning_rate": 0.0002627981621518447,
"loss": 4.8265,
"step": 6905
},
{
"epoch": 3.1042228212039533,
"grad_norm": 3.03125,
"learning_rate": 0.0002627259800947627,
"loss": 4.8757,
"step": 6910
},
{
"epoch": 3.106469002695418,
"grad_norm": 3.015625,
"learning_rate": 0.00026265373929268383,
"loss": 4.8218,
"step": 6915
},
{
"epoch": 3.1087151841868823,
"grad_norm": 3.046875,
"learning_rate": 0.00026258143978903354,
"loss": 4.8604,
"step": 6920
},
{
"epoch": 3.110961365678347,
"grad_norm": 3.546875,
"learning_rate": 0.00026250908162727234,
"loss": 4.8454,
"step": 6925
},
{
"epoch": 3.1132075471698113,
"grad_norm": 3.125,
"learning_rate": 0.0002624366648508962,
"loss": 4.8521,
"step": 6930
},
{
"epoch": 3.115453728661276,
"grad_norm": 3.1875,
"learning_rate": 0.00026236418950343623,
"loss": 4.7329,
"step": 6935
},
{
"epoch": 3.1176999101527403,
"grad_norm": 3.15625,
"learning_rate": 0.0002622916556284588,
"loss": 4.7403,
"step": 6940
},
{
"epoch": 3.1199460916442048,
"grad_norm": 3.15625,
"learning_rate": 0.0002622190632695655,
"loss": 4.7682,
"step": 6945
},
{
"epoch": 3.1221922731356693,
"grad_norm": 3.21875,
"learning_rate": 0.0002621464124703929,
"loss": 4.8178,
"step": 6950
},
{
"epoch": 3.1244384546271338,
"grad_norm": 3.125,
"learning_rate": 0.00026207370327461284,
"loss": 4.8774,
"step": 6955
},
{
"epoch": 3.1266846361185983,
"grad_norm": 3.578125,
"learning_rate": 0.0002620009357259323,
"loss": 4.8435,
"step": 6960
},
{
"epoch": 3.1289308176100628,
"grad_norm": 3.109375,
"learning_rate": 0.0002619281098680932,
"loss": 4.8077,
"step": 6965
},
{
"epoch": 3.1311769991015272,
"grad_norm": 3.296875,
"learning_rate": 0.0002618552257448727,
"loss": 4.7614,
"step": 6970
},
{
"epoch": 3.1334231805929917,
"grad_norm": 3.21875,
"learning_rate": 0.00026178228340008276,
"loss": 4.8246,
"step": 6975
},
{
"epoch": 3.1356693620844562,
"grad_norm": 3.546875,
"learning_rate": 0.0002617092828775705,
"loss": 4.8138,
"step": 6980
},
{
"epoch": 3.1379155435759207,
"grad_norm": 3.046875,
"learning_rate": 0.0002616362242212179,
"loss": 4.8909,
"step": 6985
},
{
"epoch": 3.1401617250673857,
"grad_norm": 3.390625,
"learning_rate": 0.00026156310747494206,
"loss": 4.826,
"step": 6990
},
{
"epoch": 3.14240790655885,
"grad_norm": 3.140625,
"learning_rate": 0.0002614899326826948,
"loss": 4.7744,
"step": 6995
},
{
"epoch": 3.1446540880503147,
"grad_norm": 3.296875,
"learning_rate": 0.00026141669988846293,
"loss": 4.9305,
"step": 7000
},
{
"epoch": 3.1446540880503147,
"eval_loss": 4.942409038543701,
"eval_runtime": 16.1167,
"eval_samples_per_second": 1924.273,
"eval_steps_per_second": 240.557,
"step": 7000
},
{
"epoch": 3.146900269541779,
"grad_norm": 3.03125,
"learning_rate": 0.00026134340913626814,
"loss": 4.8665,
"step": 7005
},
{
"epoch": 3.1491464510332436,
"grad_norm": 3.328125,
"learning_rate": 0.00026127006047016693,
"loss": 4.8994,
"step": 7010
},
{
"epoch": 3.151392632524708,
"grad_norm": 3.25,
"learning_rate": 0.0002611966539342506,
"loss": 4.8005,
"step": 7015
},
{
"epoch": 3.1536388140161726,
"grad_norm": 3.203125,
"learning_rate": 0.0002611231895726453,
"loss": 4.9289,
"step": 7020
},
{
"epoch": 3.155884995507637,
"grad_norm": 3.359375,
"learning_rate": 0.0002610496674295118,
"loss": 4.7866,
"step": 7025
},
{
"epoch": 3.1581311769991016,
"grad_norm": 3.015625,
"learning_rate": 0.0002609760875490457,
"loss": 4.791,
"step": 7030
},
{
"epoch": 3.160377358490566,
"grad_norm": 3.0625,
"learning_rate": 0.00026090244997547743,
"loss": 4.7871,
"step": 7035
},
{
"epoch": 3.1626235399820306,
"grad_norm": 3.015625,
"learning_rate": 0.00026082875475307184,
"loss": 4.7937,
"step": 7040
},
{
"epoch": 3.164869721473495,
"grad_norm": 2.953125,
"learning_rate": 0.0002607550019261287,
"loss": 4.794,
"step": 7045
},
{
"epoch": 3.1671159029649596,
"grad_norm": 3.3125,
"learning_rate": 0.0002606811915389822,
"loss": 4.8237,
"step": 7050
},
{
"epoch": 3.169362084456424,
"grad_norm": 2.921875,
"learning_rate": 0.00026060732363600113,
"loss": 4.8981,
"step": 7055
},
{
"epoch": 3.1716082659478886,
"grad_norm": 3.296875,
"learning_rate": 0.00026053339826158904,
"loss": 4.8026,
"step": 7060
},
{
"epoch": 3.173854447439353,
"grad_norm": 3.296875,
"learning_rate": 0.0002604594154601839,
"loss": 4.8203,
"step": 7065
},
{
"epoch": 3.1761006289308176,
"grad_norm": 3.3125,
"learning_rate": 0.00026038537527625817,
"loss": 4.8191,
"step": 7070
},
{
"epoch": 3.178346810422282,
"grad_norm": 3.1875,
"learning_rate": 0.00026031127775431894,
"loss": 4.8258,
"step": 7075
},
{
"epoch": 3.1805929919137466,
"grad_norm": 3.109375,
"learning_rate": 0.0002602371229389076,
"loss": 4.8744,
"step": 7080
},
{
"epoch": 3.182839173405211,
"grad_norm": 3.296875,
"learning_rate": 0.0002601629108746001,
"loss": 4.8279,
"step": 7085
},
{
"epoch": 3.1850853548966755,
"grad_norm": 3.390625,
"learning_rate": 0.0002600886416060068,
"loss": 4.7797,
"step": 7090
},
{
"epoch": 3.18733153638814,
"grad_norm": 4.0625,
"learning_rate": 0.00026001431517777226,
"loss": 4.8642,
"step": 7095
},
{
"epoch": 3.1895777178796045,
"grad_norm": 3.265625,
"learning_rate": 0.0002599399316345757,
"loss": 4.7916,
"step": 7100
},
{
"epoch": 3.191823899371069,
"grad_norm": 3.171875,
"learning_rate": 0.0002598654910211304,
"loss": 4.779,
"step": 7105
},
{
"epoch": 3.1940700808625335,
"grad_norm": 3.21875,
"learning_rate": 0.0002597909933821842,
"loss": 4.8629,
"step": 7110
},
{
"epoch": 3.196316262353998,
"grad_norm": 3.21875,
"learning_rate": 0.000259716438762519,
"loss": 4.7861,
"step": 7115
},
{
"epoch": 3.1985624438454625,
"grad_norm": 3.25,
"learning_rate": 0.0002596418272069511,
"loss": 4.8307,
"step": 7120
},
{
"epoch": 3.2008086253369274,
"grad_norm": 3.15625,
"learning_rate": 0.0002595671587603309,
"loss": 4.7688,
"step": 7125
},
{
"epoch": 3.2030548068283915,
"grad_norm": 3.125,
"learning_rate": 0.00025949243346754306,
"loss": 4.8044,
"step": 7130
},
{
"epoch": 3.2053009883198564,
"grad_norm": 3.21875,
"learning_rate": 0.00025941765137350647,
"loss": 4.8947,
"step": 7135
},
{
"epoch": 3.207547169811321,
"grad_norm": 3.28125,
"learning_rate": 0.0002593428125231741,
"loss": 4.8465,
"step": 7140
},
{
"epoch": 3.2097933513027854,
"grad_norm": 3.171875,
"learning_rate": 0.000259267916961533,
"loss": 4.7989,
"step": 7145
},
{
"epoch": 3.21203953279425,
"grad_norm": 3.203125,
"learning_rate": 0.0002591929647336044,
"loss": 4.8914,
"step": 7150
},
{
"epoch": 3.2142857142857144,
"grad_norm": 3.125,
"learning_rate": 0.00025911795588444354,
"loss": 4.861,
"step": 7155
},
{
"epoch": 3.216531895777179,
"grad_norm": 3.28125,
"learning_rate": 0.00025904289045913966,
"loss": 4.8461,
"step": 7160
},
{
"epoch": 3.2187780772686434,
"grad_norm": 3.21875,
"learning_rate": 0.0002589677685028161,
"loss": 4.8025,
"step": 7165
},
{
"epoch": 3.221024258760108,
"grad_norm": 3.265625,
"learning_rate": 0.0002588925900606301,
"loss": 4.7946,
"step": 7170
},
{
"epoch": 3.2232704402515724,
"grad_norm": 3.296875,
"learning_rate": 0.000258817355177773,
"loss": 4.8217,
"step": 7175
},
{
"epoch": 3.225516621743037,
"grad_norm": 3.3125,
"learning_rate": 0.0002587420638994698,
"loss": 4.7985,
"step": 7180
},
{
"epoch": 3.2277628032345014,
"grad_norm": 3.328125,
"learning_rate": 0.0002586667162709797,
"loss": 4.7482,
"step": 7185
},
{
"epoch": 3.230008984725966,
"grad_norm": 3.03125,
"learning_rate": 0.0002585913123375956,
"loss": 4.7843,
"step": 7190
},
{
"epoch": 3.2322551662174304,
"grad_norm": 3.203125,
"learning_rate": 0.00025851585214464414,
"loss": 4.8181,
"step": 7195
},
{
"epoch": 3.234501347708895,
"grad_norm": 3.234375,
"learning_rate": 0.0002584403357374861,
"loss": 4.8006,
"step": 7200
},
{
"epoch": 3.2367475292003594,
"grad_norm": 3.3125,
"learning_rate": 0.0002583647631615158,
"loss": 4.9063,
"step": 7205
},
{
"epoch": 3.238993710691824,
"grad_norm": 3.125,
"learning_rate": 0.00025828913446216133,
"loss": 4.8141,
"step": 7210
},
{
"epoch": 3.2412398921832883,
"grad_norm": 3.140625,
"learning_rate": 0.0002582134496848847,
"loss": 4.8757,
"step": 7215
},
{
"epoch": 3.243486073674753,
"grad_norm": 3.09375,
"learning_rate": 0.0002581377088751814,
"loss": 4.802,
"step": 7220
},
{
"epoch": 3.2457322551662173,
"grad_norm": 3.125,
"learning_rate": 0.00025806191207858076,
"loss": 4.8018,
"step": 7225
},
{
"epoch": 3.247978436657682,
"grad_norm": 3.390625,
"learning_rate": 0.0002579860593406457,
"loss": 4.8861,
"step": 7230
},
{
"epoch": 3.2502246181491463,
"grad_norm": 3.1875,
"learning_rate": 0.0002579101507069728,
"loss": 4.8369,
"step": 7235
},
{
"epoch": 3.252470799640611,
"grad_norm": 3.0625,
"learning_rate": 0.0002578341862231922,
"loss": 4.823,
"step": 7240
},
{
"epoch": 3.2547169811320753,
"grad_norm": 3.203125,
"learning_rate": 0.0002577581659349677,
"loss": 4.8676,
"step": 7245
},
{
"epoch": 3.25696316262354,
"grad_norm": 3.3125,
"learning_rate": 0.0002576820898879965,
"loss": 4.8252,
"step": 7250
},
{
"epoch": 3.2592093441150043,
"grad_norm": 3.046875,
"learning_rate": 0.0002576059581280095,
"loss": 4.7799,
"step": 7255
},
{
"epoch": 3.2614555256064692,
"grad_norm": 3.125,
"learning_rate": 0.0002575297707007709,
"loss": 4.8234,
"step": 7260
},
{
"epoch": 3.2637017070979333,
"grad_norm": 3.1875,
"learning_rate": 0.00025745352765207843,
"loss": 4.7695,
"step": 7265
},
{
"epoch": 3.265947888589398,
"grad_norm": 3.359375,
"learning_rate": 0.0002573772290277633,
"loss": 4.8183,
"step": 7270
},
{
"epoch": 3.2681940700808627,
"grad_norm": 3.15625,
"learning_rate": 0.0002573008748736902,
"loss": 4.8566,
"step": 7275
},
{
"epoch": 3.270440251572327,
"grad_norm": 3.125,
"learning_rate": 0.00025722446523575705,
"loss": 4.7541,
"step": 7280
},
{
"epoch": 3.2726864330637917,
"grad_norm": 3.046875,
"learning_rate": 0.00025714800015989506,
"loss": 4.7738,
"step": 7285
},
{
"epoch": 3.274932614555256,
"grad_norm": 3.359375,
"learning_rate": 0.00025707147969206904,
"loss": 4.8893,
"step": 7290
},
{
"epoch": 3.2771787960467207,
"grad_norm": 3.421875,
"learning_rate": 0.0002569949038782769,
"loss": 4.7991,
"step": 7295
},
{
"epoch": 3.279424977538185,
"grad_norm": 3.265625,
"learning_rate": 0.0002569182727645498,
"loss": 4.7758,
"step": 7300
},
{
"epoch": 3.2816711590296497,
"grad_norm": 3.171875,
"learning_rate": 0.0002568415863969522,
"loss": 4.7917,
"step": 7305
},
{
"epoch": 3.283917340521114,
"grad_norm": 3.390625,
"learning_rate": 0.00025676484482158187,
"loss": 4.7623,
"step": 7310
},
{
"epoch": 3.2861635220125787,
"grad_norm": 3.265625,
"learning_rate": 0.0002566880480845696,
"loss": 4.7615,
"step": 7315
},
{
"epoch": 3.288409703504043,
"grad_norm": 3.4375,
"learning_rate": 0.00025661119623207943,
"loss": 4.7924,
"step": 7320
},
{
"epoch": 3.2906558849955077,
"grad_norm": 3.125,
"learning_rate": 0.00025653428931030856,
"loss": 4.8251,
"step": 7325
},
{
"epoch": 3.292902066486972,
"grad_norm": 3.25,
"learning_rate": 0.00025645732736548707,
"loss": 4.7862,
"step": 7330
},
{
"epoch": 3.2951482479784366,
"grad_norm": 3.296875,
"learning_rate": 0.0002563803104438785,
"loss": 4.8408,
"step": 7335
},
{
"epoch": 3.297394429469901,
"grad_norm": 3.453125,
"learning_rate": 0.0002563032385917791,
"loss": 4.7965,
"step": 7340
},
{
"epoch": 3.2996406109613656,
"grad_norm": 3.1875,
"learning_rate": 0.00025622611185551825,
"loss": 4.8009,
"step": 7345
},
{
"epoch": 3.30188679245283,
"grad_norm": 3.03125,
"learning_rate": 0.0002561489302814585,
"loss": 4.7463,
"step": 7350
},
{
"epoch": 3.3041329739442946,
"grad_norm": 3.296875,
"learning_rate": 0.000256071693915995,
"loss": 4.8728,
"step": 7355
},
{
"epoch": 3.306379155435759,
"grad_norm": 3.640625,
"learning_rate": 0.00025599440280555616,
"loss": 4.8311,
"step": 7360
},
{
"epoch": 3.3086253369272236,
"grad_norm": 3.171875,
"learning_rate": 0.00025591705699660317,
"loss": 4.8349,
"step": 7365
},
{
"epoch": 3.310871518418688,
"grad_norm": 3.03125,
"learning_rate": 0.00025583965653563006,
"loss": 4.8007,
"step": 7370
},
{
"epoch": 3.3131176999101526,
"grad_norm": 3.125,
"learning_rate": 0.00025576220146916376,
"loss": 4.8239,
"step": 7375
},
{
"epoch": 3.315363881401617,
"grad_norm": 3.125,
"learning_rate": 0.0002556846918437641,
"loss": 4.7837,
"step": 7380
},
{
"epoch": 3.3176100628930816,
"grad_norm": 3.1875,
"learning_rate": 0.0002556071277060236,
"loss": 4.8485,
"step": 7385
},
{
"epoch": 3.319856244384546,
"grad_norm": 3.1875,
"learning_rate": 0.0002555295091025675,
"loss": 4.8199,
"step": 7390
},
{
"epoch": 3.322102425876011,
"grad_norm": 3.203125,
"learning_rate": 0.00025545183608005395,
"loss": 4.7623,
"step": 7395
},
{
"epoch": 3.324348607367475,
"grad_norm": 3.265625,
"learning_rate": 0.0002553741086851737,
"loss": 4.8978,
"step": 7400
},
{
"epoch": 3.32659478885894,
"grad_norm": 3.25,
"learning_rate": 0.0002552963269646502,
"loss": 4.7921,
"step": 7405
},
{
"epoch": 3.3288409703504045,
"grad_norm": 3.21875,
"learning_rate": 0.0002552184909652396,
"loss": 4.7919,
"step": 7410
},
{
"epoch": 3.331087151841869,
"grad_norm": 3.203125,
"learning_rate": 0.0002551406007337305,
"loss": 4.8001,
"step": 7415
},
{
"epoch": 3.3333333333333335,
"grad_norm": 3.296875,
"learning_rate": 0.0002550626563169444,
"loss": 4.7865,
"step": 7420
},
{
"epoch": 3.335579514824798,
"grad_norm": 3.015625,
"learning_rate": 0.0002549846577617352,
"loss": 4.7541,
"step": 7425
},
{
"epoch": 3.3378256963162625,
"grad_norm": 3.234375,
"learning_rate": 0.00025490660511498926,
"loss": 4.8102,
"step": 7430
},
{
"epoch": 3.340071877807727,
"grad_norm": 3.03125,
"learning_rate": 0.0002548284984236256,
"loss": 4.7369,
"step": 7435
},
{
"epoch": 3.3423180592991915,
"grad_norm": 3.234375,
"learning_rate": 0.0002547503377345957,
"loss": 4.8052,
"step": 7440
},
{
"epoch": 3.344564240790656,
"grad_norm": 3.078125,
"learning_rate": 0.00025467212309488347,
"loss": 4.8815,
"step": 7445
},
{
"epoch": 3.3468104222821204,
"grad_norm": 3.3125,
"learning_rate": 0.0002545938545515052,
"loss": 4.8297,
"step": 7450
},
{
"epoch": 3.349056603773585,
"grad_norm": 3.375,
"learning_rate": 0.00025451553215150973,
"loss": 4.8577,
"step": 7455
},
{
"epoch": 3.3513027852650494,
"grad_norm": 3.109375,
"learning_rate": 0.0002544371559419781,
"loss": 4.8335,
"step": 7460
},
{
"epoch": 3.353548966756514,
"grad_norm": 3.109375,
"learning_rate": 0.0002543587259700239,
"loss": 4.8204,
"step": 7465
},
{
"epoch": 3.3557951482479784,
"grad_norm": 3.15625,
"learning_rate": 0.0002542802422827928,
"loss": 4.8149,
"step": 7470
},
{
"epoch": 3.358041329739443,
"grad_norm": 3.234375,
"learning_rate": 0.00025420170492746293,
"loss": 4.7721,
"step": 7475
},
{
"epoch": 3.3602875112309074,
"grad_norm": 3.3125,
"learning_rate": 0.0002541231139512447,
"loss": 4.7729,
"step": 7480
},
{
"epoch": 3.362533692722372,
"grad_norm": 3.125,
"learning_rate": 0.0002540444694013805,
"loss": 4.8291,
"step": 7485
},
{
"epoch": 3.3647798742138364,
"grad_norm": 3.515625,
"learning_rate": 0.00025396577132514523,
"loss": 4.8063,
"step": 7490
},
{
"epoch": 3.367026055705301,
"grad_norm": 3.234375,
"learning_rate": 0.00025388701976984587,
"loss": 4.7306,
"step": 7495
},
{
"epoch": 3.3692722371967654,
"grad_norm": 3.125,
"learning_rate": 0.0002538082147828214,
"loss": 4.7863,
"step": 7500
},
{
"epoch": 3.37151841868823,
"grad_norm": 2.96875,
"learning_rate": 0.0002537293564114432,
"loss": 4.8825,
"step": 7505
},
{
"epoch": 3.3737646001796944,
"grad_norm": 3.0625,
"learning_rate": 0.00025365044470311446,
"loss": 4.8224,
"step": 7510
},
{
"epoch": 3.376010781671159,
"grad_norm": 3.25,
"learning_rate": 0.0002535714797052706,
"loss": 4.8422,
"step": 7515
},
{
"epoch": 3.3782569631626234,
"grad_norm": 3.265625,
"learning_rate": 0.000253492461465379,
"loss": 4.7746,
"step": 7520
},
{
"epoch": 3.380503144654088,
"grad_norm": 3.640625,
"learning_rate": 0.00025341339003093905,
"loss": 4.8838,
"step": 7525
},
{
"epoch": 3.382749326145553,
"grad_norm": 3.9375,
"learning_rate": 0.00025333426544948214,
"loss": 4.7781,
"step": 7530
},
{
"epoch": 3.384995507637017,
"grad_norm": 3.3125,
"learning_rate": 0.0002532550877685716,
"loss": 4.808,
"step": 7535
},
{
"epoch": 3.387241689128482,
"grad_norm": 3.09375,
"learning_rate": 0.0002531758570358028,
"loss": 4.8098,
"step": 7540
},
{
"epoch": 3.3894878706199463,
"grad_norm": 3.1875,
"learning_rate": 0.0002530965732988027,
"loss": 4.7755,
"step": 7545
},
{
"epoch": 3.3917340521114108,
"grad_norm": 3.15625,
"learning_rate": 0.00025301723660523044,
"loss": 4.7746,
"step": 7550
},
{
"epoch": 3.3939802336028753,
"grad_norm": 3.0,
"learning_rate": 0.00025293784700277673,
"loss": 4.7915,
"step": 7555
},
{
"epoch": 3.3962264150943398,
"grad_norm": 3.09375,
"learning_rate": 0.0002528584045391644,
"loss": 4.7881,
"step": 7560
},
{
"epoch": 3.3984725965858043,
"grad_norm": 3.265625,
"learning_rate": 0.00025277890926214767,
"loss": 4.8263,
"step": 7565
},
{
"epoch": 3.4007187780772687,
"grad_norm": 3.203125,
"learning_rate": 0.0002526993612195128,
"loss": 4.7642,
"step": 7570
},
{
"epoch": 3.4029649595687332,
"grad_norm": 3.03125,
"learning_rate": 0.0002526197604590777,
"loss": 4.8737,
"step": 7575
},
{
"epoch": 3.4052111410601977,
"grad_norm": 3.203125,
"learning_rate": 0.00025254010702869194,
"loss": 4.7828,
"step": 7580
},
{
"epoch": 3.4074573225516622,
"grad_norm": 3.03125,
"learning_rate": 0.0002524604009762366,
"loss": 4.8156,
"step": 7585
},
{
"epoch": 3.4097035040431267,
"grad_norm": 3.21875,
"learning_rate": 0.00025238064234962474,
"loss": 4.8054,
"step": 7590
},
{
"epoch": 3.411949685534591,
"grad_norm": 3.09375,
"learning_rate": 0.00025230083119680074,
"loss": 4.8162,
"step": 7595
},
{
"epoch": 3.4141958670260557,
"grad_norm": 3.0,
"learning_rate": 0.0002522209675657407,
"loss": 4.8375,
"step": 7600
},
{
"epoch": 3.41644204851752,
"grad_norm": 3.359375,
"learning_rate": 0.0002521410515044522,
"loss": 4.7701,
"step": 7605
},
{
"epoch": 3.4186882300089847,
"grad_norm": 3.328125,
"learning_rate": 0.0002520610830609742,
"loss": 4.7296,
"step": 7610
},
{
"epoch": 3.420934411500449,
"grad_norm": 3.078125,
"learning_rate": 0.0002519810622833775,
"loss": 4.858,
"step": 7615
},
{
"epoch": 3.4231805929919137,
"grad_norm": 3.28125,
"learning_rate": 0.00025190098921976404,
"loss": 4.852,
"step": 7620
},
{
"epoch": 3.425426774483378,
"grad_norm": 3.234375,
"learning_rate": 0.00025182086391826726,
"loss": 4.7896,
"step": 7625
},
{
"epoch": 3.4276729559748427,
"grad_norm": 3.125,
"learning_rate": 0.0002517406864270522,
"loss": 4.8046,
"step": 7630
},
{
"epoch": 3.429919137466307,
"grad_norm": 3.109375,
"learning_rate": 0.000251660456794315,
"loss": 4.8123,
"step": 7635
},
{
"epoch": 3.4321653189577717,
"grad_norm": 3.265625,
"learning_rate": 0.0002515801750682833,
"loss": 4.7836,
"step": 7640
},
{
"epoch": 3.434411500449236,
"grad_norm": 3.265625,
"learning_rate": 0.000251499841297216,
"loss": 4.8299,
"step": 7645
},
{
"epoch": 3.4366576819407006,
"grad_norm": 3.15625,
"learning_rate": 0.0002514194555294033,
"loss": 4.7713,
"step": 7650
},
{
"epoch": 3.438903863432165,
"grad_norm": 3.109375,
"learning_rate": 0.00025133901781316663,
"loss": 4.7489,
"step": 7655
},
{
"epoch": 3.4411500449236296,
"grad_norm": 3.046875,
"learning_rate": 0.0002512585281968588,
"loss": 4.8084,
"step": 7660
},
{
"epoch": 3.4433962264150946,
"grad_norm": 3.234375,
"learning_rate": 0.00025117798672886354,
"loss": 4.7632,
"step": 7665
},
{
"epoch": 3.4456424079065586,
"grad_norm": 3.640625,
"learning_rate": 0.0002510973934575959,
"loss": 4.7962,
"step": 7670
},
{
"epoch": 3.4478885893980236,
"grad_norm": 3.203125,
"learning_rate": 0.0002510167484315022,
"loss": 4.7781,
"step": 7675
},
{
"epoch": 3.450134770889488,
"grad_norm": 3.1875,
"learning_rate": 0.0002509360516990597,
"loss": 4.7245,
"step": 7680
},
{
"epoch": 3.4523809523809526,
"grad_norm": 3.28125,
"learning_rate": 0.00025085530330877666,
"loss": 4.8373,
"step": 7685
},
{
"epoch": 3.454627133872417,
"grad_norm": 3.28125,
"learning_rate": 0.0002507745033091927,
"loss": 4.7834,
"step": 7690
},
{
"epoch": 3.4568733153638815,
"grad_norm": 3.140625,
"learning_rate": 0.00025069365174887814,
"loss": 4.8153,
"step": 7695
},
{
"epoch": 3.459119496855346,
"grad_norm": 3.265625,
"learning_rate": 0.0002506127486764345,
"loss": 4.7421,
"step": 7700
},
{
"epoch": 3.4613656783468105,
"grad_norm": 3.1875,
"learning_rate": 0.00025053179414049416,
"loss": 4.8132,
"step": 7705
},
{
"epoch": 3.463611859838275,
"grad_norm": 3.046875,
"learning_rate": 0.00025045078818972046,
"loss": 4.7787,
"step": 7710
},
{
"epoch": 3.4658580413297395,
"grad_norm": 3.5,
"learning_rate": 0.0002503697308728077,
"loss": 4.8161,
"step": 7715
},
{
"epoch": 3.468104222821204,
"grad_norm": 3.34375,
"learning_rate": 0.0002502886222384811,
"loss": 4.7623,
"step": 7720
},
{
"epoch": 3.4703504043126685,
"grad_norm": 3.15625,
"learning_rate": 0.0002502074623354965,
"loss": 4.9089,
"step": 7725
},
{
"epoch": 3.472596585804133,
"grad_norm": 3.0625,
"learning_rate": 0.0002501262512126408,
"loss": 4.8818,
"step": 7730
},
{
"epoch": 3.4748427672955975,
"grad_norm": 3.375,
"learning_rate": 0.00025004498891873146,
"loss": 4.8681,
"step": 7735
},
{
"epoch": 3.477088948787062,
"grad_norm": 3.046875,
"learning_rate": 0.000249963675502617,
"loss": 4.7931,
"step": 7740
},
{
"epoch": 3.4793351302785265,
"grad_norm": 3.109375,
"learning_rate": 0.00024988231101317647,
"loss": 4.7794,
"step": 7745
},
{
"epoch": 3.481581311769991,
"grad_norm": 3.140625,
"learning_rate": 0.00024980089549931955,
"loss": 4.7498,
"step": 7750
},
{
"epoch": 3.4838274932614555,
"grad_norm": 3.046875,
"learning_rate": 0.0002497194290099868,
"loss": 4.8458,
"step": 7755
},
{
"epoch": 3.48607367475292,
"grad_norm": 3.34375,
"learning_rate": 0.00024963791159414927,
"loss": 4.8934,
"step": 7760
},
{
"epoch": 3.4883198562443845,
"grad_norm": 3.375,
"learning_rate": 0.00024955634330080863,
"loss": 4.7986,
"step": 7765
},
{
"epoch": 3.490566037735849,
"grad_norm": 3.0625,
"learning_rate": 0.00024947472417899733,
"loss": 4.7661,
"step": 7770
},
{
"epoch": 3.4928122192273134,
"grad_norm": 3.296875,
"learning_rate": 0.000249393054277778,
"loss": 4.8551,
"step": 7775
},
{
"epoch": 3.495058400718778,
"grad_norm": 3.078125,
"learning_rate": 0.0002493113336462442,
"loss": 4.8479,
"step": 7780
},
{
"epoch": 3.4973045822102424,
"grad_norm": 3.25,
"learning_rate": 0.00024922956233351976,
"loss": 4.8853,
"step": 7785
},
{
"epoch": 3.499550763701707,
"grad_norm": 3.171875,
"learning_rate": 0.00024914774038875895,
"loss": 4.7687,
"step": 7790
},
{
"epoch": 3.5017969451931714,
"grad_norm": 3.296875,
"learning_rate": 0.0002490658678611466,
"loss": 4.7617,
"step": 7795
},
{
"epoch": 3.5040431266846364,
"grad_norm": 3.078125,
"learning_rate": 0.00024898394479989786,
"loss": 4.7722,
"step": 7800
},
{
"epoch": 3.5062893081761004,
"grad_norm": 3.109375,
"learning_rate": 0.0002489019712542583,
"loss": 4.8146,
"step": 7805
},
{
"epoch": 3.5085354896675653,
"grad_norm": 3.15625,
"learning_rate": 0.00024881994727350373,
"loss": 4.8087,
"step": 7810
},
{
"epoch": 3.5107816711590294,
"grad_norm": 3.125,
"learning_rate": 0.0002487378729069405,
"loss": 4.8572,
"step": 7815
},
{
"epoch": 3.5130278526504943,
"grad_norm": 3.171875,
"learning_rate": 0.0002486557482039051,
"loss": 4.7661,
"step": 7820
},
{
"epoch": 3.515274034141959,
"grad_norm": 3.09375,
"learning_rate": 0.0002485735732137642,
"loss": 4.8137,
"step": 7825
},
{
"epoch": 3.5175202156334233,
"grad_norm": 3.125,
"learning_rate": 0.00024849134798591487,
"loss": 4.8203,
"step": 7830
},
{
"epoch": 3.519766397124888,
"grad_norm": 3.140625,
"learning_rate": 0.00024840907256978433,
"loss": 4.7541,
"step": 7835
},
{
"epoch": 3.5220125786163523,
"grad_norm": 3.03125,
"learning_rate": 0.0002483267470148298,
"loss": 4.7931,
"step": 7840
},
{
"epoch": 3.524258760107817,
"grad_norm": 3.59375,
"learning_rate": 0.0002482443713705389,
"loss": 4.8011,
"step": 7845
},
{
"epoch": 3.5265049415992813,
"grad_norm": 3.296875,
"learning_rate": 0.0002481619456864293,
"loss": 4.8371,
"step": 7850
},
{
"epoch": 3.528751123090746,
"grad_norm": 3.265625,
"learning_rate": 0.0002480794700120485,
"loss": 4.7765,
"step": 7855
},
{
"epoch": 3.5309973045822103,
"grad_norm": 3.4375,
"learning_rate": 0.00024799694439697436,
"loss": 4.7728,
"step": 7860
},
{
"epoch": 3.5332434860736748,
"grad_norm": 3.125,
"learning_rate": 0.00024791436889081466,
"loss": 4.82,
"step": 7865
},
{
"epoch": 3.5354896675651393,
"grad_norm": 3.28125,
"learning_rate": 0.0002478317435432071,
"loss": 4.8269,
"step": 7870
},
{
"epoch": 3.5377358490566038,
"grad_norm": 3.1875,
"learning_rate": 0.00024774906840381935,
"loss": 4.8323,
"step": 7875
},
{
"epoch": 3.5399820305480683,
"grad_norm": 3.1875,
"learning_rate": 0.0002476663435223492,
"loss": 4.769,
"step": 7880
},
{
"epoch": 3.5422282120395328,
"grad_norm": 3.296875,
"learning_rate": 0.00024758356894852404,
"loss": 4.7889,
"step": 7885
},
{
"epoch": 3.5444743935309972,
"grad_norm": 3.21875,
"learning_rate": 0.00024750074473210134,
"loss": 4.7562,
"step": 7890
},
{
"epoch": 3.5467205750224617,
"grad_norm": 3.203125,
"learning_rate": 0.0002474178709228684,
"loss": 4.8241,
"step": 7895
},
{
"epoch": 3.5489667565139262,
"grad_norm": 3.0,
"learning_rate": 0.0002473349475706422,
"loss": 4.7784,
"step": 7900
},
{
"epoch": 3.5512129380053907,
"grad_norm": 2.984375,
"learning_rate": 0.0002472519747252697,
"loss": 4.7175,
"step": 7905
},
{
"epoch": 3.5534591194968552,
"grad_norm": 3.15625,
"learning_rate": 0.00024716895243662737,
"loss": 4.7571,
"step": 7910
},
{
"epoch": 3.5557053009883197,
"grad_norm": 3.234375,
"learning_rate": 0.00024708588075462166,
"loss": 4.7624,
"step": 7915
},
{
"epoch": 3.557951482479784,
"grad_norm": 3.3125,
"learning_rate": 0.0002470027597291885,
"loss": 4.7763,
"step": 7920
},
{
"epoch": 3.560197663971249,
"grad_norm": 3.140625,
"learning_rate": 0.0002469195894102935,
"loss": 4.8054,
"step": 7925
},
{
"epoch": 3.562443845462713,
"grad_norm": 3.15625,
"learning_rate": 0.0002468363698479321,
"loss": 4.7957,
"step": 7930
},
{
"epoch": 3.564690026954178,
"grad_norm": 3.390625,
"learning_rate": 0.0002467531010921292,
"loss": 4.7889,
"step": 7935
},
{
"epoch": 3.566936208445642,
"grad_norm": 3.15625,
"learning_rate": 0.00024666978319293914,
"loss": 4.7675,
"step": 7940
},
{
"epoch": 3.569182389937107,
"grad_norm": 3.21875,
"learning_rate": 0.00024658641620044604,
"loss": 4.843,
"step": 7945
},
{
"epoch": 3.571428571428571,
"grad_norm": 3.1875,
"learning_rate": 0.0002465030001647634,
"loss": 4.7993,
"step": 7950
},
{
"epoch": 3.573674752920036,
"grad_norm": 3.3125,
"learning_rate": 0.0002464195351360343,
"loss": 4.8091,
"step": 7955
},
{
"epoch": 3.5759209344115006,
"grad_norm": 3.21875,
"learning_rate": 0.0002463360211644311,
"loss": 4.7122,
"step": 7960
},
{
"epoch": 3.578167115902965,
"grad_norm": 3.046875,
"learning_rate": 0.0002462524583001557,
"loss": 4.8386,
"step": 7965
},
{
"epoch": 3.5804132973944296,
"grad_norm": 3.203125,
"learning_rate": 0.0002461688465934395,
"loss": 4.752,
"step": 7970
},
{
"epoch": 3.582659478885894,
"grad_norm": 3.125,
"learning_rate": 0.00024608518609454293,
"loss": 4.7621,
"step": 7975
},
{
"epoch": 3.5849056603773586,
"grad_norm": 3.390625,
"learning_rate": 0.0002460014768537561,
"loss": 4.7538,
"step": 7980
},
{
"epoch": 3.587151841868823,
"grad_norm": 3.625,
"learning_rate": 0.00024591771892139817,
"loss": 4.8875,
"step": 7985
},
{
"epoch": 3.5893980233602876,
"grad_norm": 3.125,
"learning_rate": 0.0002458339123478178,
"loss": 4.8174,
"step": 7990
},
{
"epoch": 3.591644204851752,
"grad_norm": 3.046875,
"learning_rate": 0.00024575005718339255,
"loss": 4.7648,
"step": 7995
},
{
"epoch": 3.5938903863432166,
"grad_norm": 3.15625,
"learning_rate": 0.00024566615347852965,
"loss": 4.8448,
"step": 8000
},
{
"epoch": 3.5938903863432166,
"eval_loss": 4.903536796569824,
"eval_runtime": 16.0594,
"eval_samples_per_second": 1931.145,
"eval_steps_per_second": 241.416,
"step": 8000
},
{
"epoch": 3.596136567834681,
"grad_norm": 3.265625,
"learning_rate": 0.0002455822012836651,
"loss": 4.8178,
"step": 8005
},
{
"epoch": 3.5983827493261455,
"grad_norm": 3.015625,
"learning_rate": 0.0002454982006492642,
"loss": 4.7372,
"step": 8010
},
{
"epoch": 3.60062893081761,
"grad_norm": 3.25,
"learning_rate": 0.00024541415162582144,
"loss": 4.8245,
"step": 8015
},
{
"epoch": 3.6028751123090745,
"grad_norm": 3.421875,
"learning_rate": 0.00024533005426386026,
"loss": 4.8042,
"step": 8020
},
{
"epoch": 3.605121293800539,
"grad_norm": 3.453125,
"learning_rate": 0.0002452459086139333,
"loss": 4.7305,
"step": 8025
},
{
"epoch": 3.6073674752920035,
"grad_norm": 3.28125,
"learning_rate": 0.0002451617147266221,
"loss": 4.7429,
"step": 8030
},
{
"epoch": 3.609613656783468,
"grad_norm": 2.96875,
"learning_rate": 0.00024507747265253735,
"loss": 4.8285,
"step": 8035
},
{
"epoch": 3.6118598382749325,
"grad_norm": 3.234375,
"learning_rate": 0.00024499318244231846,
"loss": 4.7211,
"step": 8040
},
{
"epoch": 3.614106019766397,
"grad_norm": 3.140625,
"learning_rate": 0.00024490884414663406,
"loss": 4.8138,
"step": 8045
},
{
"epoch": 3.6163522012578615,
"grad_norm": 3.25,
"learning_rate": 0.00024482445781618144,
"loss": 4.782,
"step": 8050
},
{
"epoch": 3.618598382749326,
"grad_norm": 3.46875,
"learning_rate": 0.0002447400235016869,
"loss": 4.823,
"step": 8055
},
{
"epoch": 3.620844564240791,
"grad_norm": 3.5,
"learning_rate": 0.00024465554125390566,
"loss": 4.7987,
"step": 8060
},
{
"epoch": 3.623090745732255,
"grad_norm": 3.09375,
"learning_rate": 0.00024457101112362146,
"loss": 4.8458,
"step": 8065
},
{
"epoch": 3.62533692722372,
"grad_norm": 3.171875,
"learning_rate": 0.00024448643316164715,
"loss": 4.8168,
"step": 8070
},
{
"epoch": 3.627583108715184,
"grad_norm": 3.265625,
"learning_rate": 0.0002444018074188242,
"loss": 4.819,
"step": 8075
},
{
"epoch": 3.629829290206649,
"grad_norm": 3.203125,
"learning_rate": 0.00024431713394602276,
"loss": 4.8063,
"step": 8080
},
{
"epoch": 3.632075471698113,
"grad_norm": 3.34375,
"learning_rate": 0.0002442324127941417,
"loss": 4.7975,
"step": 8085
},
{
"epoch": 3.634321653189578,
"grad_norm": 3.25,
"learning_rate": 0.00024414764401410854,
"loss": 4.7946,
"step": 8090
},
{
"epoch": 3.6365678346810424,
"grad_norm": 3.234375,
"learning_rate": 0.00024406282765687952,
"loss": 4.8146,
"step": 8095
},
{
"epoch": 3.638814016172507,
"grad_norm": 3.3125,
"learning_rate": 0.00024397796377343938,
"loss": 4.8036,
"step": 8100
},
{
"epoch": 3.6410601976639714,
"grad_norm": 3.25,
"learning_rate": 0.00024389305241480144,
"loss": 4.8005,
"step": 8105
},
{
"epoch": 3.643306379155436,
"grad_norm": 3.28125,
"learning_rate": 0.00024380809363200756,
"loss": 4.83,
"step": 8110
},
{
"epoch": 3.6455525606469004,
"grad_norm": 3.1875,
"learning_rate": 0.0002437230874761282,
"loss": 4.7698,
"step": 8115
},
{
"epoch": 3.647798742138365,
"grad_norm": 3.328125,
"learning_rate": 0.00024363803399826217,
"loss": 4.7716,
"step": 8120
},
{
"epoch": 3.6500449236298294,
"grad_norm": 3.171875,
"learning_rate": 0.0002435529332495368,
"loss": 4.7791,
"step": 8125
},
{
"epoch": 3.652291105121294,
"grad_norm": 3.234375,
"learning_rate": 0.0002434677852811078,
"loss": 4.8309,
"step": 8130
},
{
"epoch": 3.6545372866127583,
"grad_norm": 3.4375,
"learning_rate": 0.00024338259014415923,
"loss": 4.8274,
"step": 8135
},
{
"epoch": 3.656783468104223,
"grad_norm": 3.265625,
"learning_rate": 0.00024329734788990366,
"loss": 4.7547,
"step": 8140
},
{
"epoch": 3.6590296495956873,
"grad_norm": 3.265625,
"learning_rate": 0.00024321205856958178,
"loss": 4.7754,
"step": 8145
},
{
"epoch": 3.661275831087152,
"grad_norm": 3.171875,
"learning_rate": 0.00024312672223446272,
"loss": 4.765,
"step": 8150
},
{
"epoch": 3.6635220125786163,
"grad_norm": 3.1875,
"learning_rate": 0.0002430413389358438,
"loss": 4.7664,
"step": 8155
},
{
"epoch": 3.665768194070081,
"grad_norm": 3.203125,
"learning_rate": 0.00024295590872505055,
"loss": 4.756,
"step": 8160
},
{
"epoch": 3.6680143755615453,
"grad_norm": 3.328125,
"learning_rate": 0.0002428704316534368,
"loss": 4.7967,
"step": 8165
},
{
"epoch": 3.67026055705301,
"grad_norm": 3.421875,
"learning_rate": 0.00024278490777238448,
"loss": 4.7841,
"step": 8170
},
{
"epoch": 3.6725067385444743,
"grad_norm": 3.140625,
"learning_rate": 0.0002426993371333037,
"loss": 4.7319,
"step": 8175
},
{
"epoch": 3.674752920035939,
"grad_norm": 3.296875,
"learning_rate": 0.0002426137197876325,
"loss": 4.8211,
"step": 8180
},
{
"epoch": 3.6769991015274033,
"grad_norm": 3.09375,
"learning_rate": 0.00024252805578683733,
"loss": 4.7469,
"step": 8185
},
{
"epoch": 3.6792452830188678,
"grad_norm": 3.28125,
"learning_rate": 0.00024244234518241235,
"loss": 4.7779,
"step": 8190
},
{
"epoch": 3.6814914645103327,
"grad_norm": 3.296875,
"learning_rate": 0.00024235658802587996,
"loss": 4.7694,
"step": 8195
},
{
"epoch": 3.6837376460017968,
"grad_norm": 3.171875,
"learning_rate": 0.00024227078436879043,
"loss": 4.8198,
"step": 8200
},
{
"epoch": 3.6859838274932617,
"grad_norm": 3.875,
"learning_rate": 0.00024218493426272203,
"loss": 4.8039,
"step": 8205
},
{
"epoch": 3.6882300089847257,
"grad_norm": 3.375,
"learning_rate": 0.00024209903775928093,
"loss": 4.758,
"step": 8210
},
{
"epoch": 3.6904761904761907,
"grad_norm": 3.03125,
"learning_rate": 0.0002420130949101012,
"loss": 4.7523,
"step": 8215
},
{
"epoch": 3.6927223719676547,
"grad_norm": 3.34375,
"learning_rate": 0.00024192710576684476,
"loss": 4.7552,
"step": 8220
},
{
"epoch": 3.6949685534591197,
"grad_norm": 3.125,
"learning_rate": 0.00024184107038120137,
"loss": 4.8155,
"step": 8225
},
{
"epoch": 3.697214734950584,
"grad_norm": 3.046875,
"learning_rate": 0.00024175498880488856,
"loss": 4.7569,
"step": 8230
},
{
"epoch": 3.6994609164420487,
"grad_norm": 2.9375,
"learning_rate": 0.00024166886108965168,
"loss": 4.7675,
"step": 8235
},
{
"epoch": 3.701707097933513,
"grad_norm": 3.8125,
"learning_rate": 0.00024158268728726375,
"loss": 4.7861,
"step": 8240
},
{
"epoch": 3.7039532794249777,
"grad_norm": 3.3125,
"learning_rate": 0.0002414964674495256,
"loss": 4.8087,
"step": 8245
},
{
"epoch": 3.706199460916442,
"grad_norm": 4.71875,
"learning_rate": 0.00024141020162826558,
"loss": 4.8768,
"step": 8250
},
{
"epoch": 3.7084456424079066,
"grad_norm": 3.359375,
"learning_rate": 0.0002413238898753398,
"loss": 4.812,
"step": 8255
},
{
"epoch": 3.710691823899371,
"grad_norm": 3.46875,
"learning_rate": 0.00024123753224263193,
"loss": 4.8085,
"step": 8260
},
{
"epoch": 3.7129380053908356,
"grad_norm": 3.0625,
"learning_rate": 0.00024115112878205321,
"loss": 4.7944,
"step": 8265
},
{
"epoch": 3.7151841868823,
"grad_norm": 3.21875,
"learning_rate": 0.00024106467954554254,
"loss": 4.7821,
"step": 8270
},
{
"epoch": 3.7174303683737646,
"grad_norm": 3.140625,
"learning_rate": 0.0002409781845850661,
"loss": 4.7904,
"step": 8275
},
{
"epoch": 3.719676549865229,
"grad_norm": 3.015625,
"learning_rate": 0.00024089164395261784,
"loss": 4.7672,
"step": 8280
},
{
"epoch": 3.7219227313566936,
"grad_norm": 3.078125,
"learning_rate": 0.0002408050577002189,
"loss": 4.7801,
"step": 8285
},
{
"epoch": 3.724168912848158,
"grad_norm": 3.125,
"learning_rate": 0.00024071842587991806,
"loss": 4.8116,
"step": 8290
},
{
"epoch": 3.7264150943396226,
"grad_norm": 3.125,
"learning_rate": 0.00024063174854379145,
"loss": 4.8041,
"step": 8295
},
{
"epoch": 3.728661275831087,
"grad_norm": 3.34375,
"learning_rate": 0.00024054502574394235,
"loss": 4.8149,
"step": 8300
},
{
"epoch": 3.7309074573225516,
"grad_norm": 3.15625,
"learning_rate": 0.0002404582575325016,
"loss": 4.7656,
"step": 8305
},
{
"epoch": 3.733153638814016,
"grad_norm": 3.046875,
"learning_rate": 0.00024037144396162733,
"loss": 4.8151,
"step": 8310
},
{
"epoch": 3.7353998203054806,
"grad_norm": 3.390625,
"learning_rate": 0.00024028458508350484,
"loss": 4.836,
"step": 8315
},
{
"epoch": 3.737646001796945,
"grad_norm": 3.28125,
"learning_rate": 0.00024019768095034664,
"loss": 4.7813,
"step": 8320
},
{
"epoch": 3.7398921832884096,
"grad_norm": 3.234375,
"learning_rate": 0.00024011073161439255,
"loss": 4.8202,
"step": 8325
},
{
"epoch": 3.742138364779874,
"grad_norm": 3.375,
"learning_rate": 0.00024002373712790956,
"loss": 4.7586,
"step": 8330
},
{
"epoch": 3.7443845462713385,
"grad_norm": 3.296875,
"learning_rate": 0.0002399366975431917,
"loss": 4.7871,
"step": 8335
},
{
"epoch": 3.7466307277628035,
"grad_norm": 3.28125,
"learning_rate": 0.00023984961291256018,
"loss": 4.7348,
"step": 8340
},
{
"epoch": 3.7488769092542675,
"grad_norm": 3.171875,
"learning_rate": 0.00023976248328836327,
"loss": 4.7715,
"step": 8345
},
{
"epoch": 3.7511230907457325,
"grad_norm": 3.421875,
"learning_rate": 0.00023967530872297623,
"loss": 4.76,
"step": 8350
},
{
"epoch": 3.7533692722371965,
"grad_norm": 3.15625,
"learning_rate": 0.0002395880892688015,
"loss": 4.7845,
"step": 8355
},
{
"epoch": 3.7556154537286615,
"grad_norm": 3.203125,
"learning_rate": 0.00023950082497826842,
"loss": 4.8077,
"step": 8360
},
{
"epoch": 3.757861635220126,
"grad_norm": 3.265625,
"learning_rate": 0.00023941351590383314,
"loss": 4.7349,
"step": 8365
},
{
"epoch": 3.7601078167115904,
"grad_norm": 3.859375,
"learning_rate": 0.0002393261620979789,
"loss": 4.7692,
"step": 8370
},
{
"epoch": 3.762353998203055,
"grad_norm": 3.171875,
"learning_rate": 0.00023923876361321583,
"loss": 4.7812,
"step": 8375
},
{
"epoch": 3.7646001796945194,
"grad_norm": 3.1875,
"learning_rate": 0.0002391513205020808,
"loss": 4.786,
"step": 8380
},
{
"epoch": 3.766846361185984,
"grad_norm": 3.671875,
"learning_rate": 0.00023906383281713757,
"loss": 4.8124,
"step": 8385
},
{
"epoch": 3.7690925426774484,
"grad_norm": 3.171875,
"learning_rate": 0.00023897630061097677,
"loss": 4.8414,
"step": 8390
},
{
"epoch": 3.771338724168913,
"grad_norm": 3.1875,
"learning_rate": 0.00023888872393621564,
"loss": 4.7914,
"step": 8395
},
{
"epoch": 3.7735849056603774,
"grad_norm": 3.40625,
"learning_rate": 0.00023880110284549828,
"loss": 4.8036,
"step": 8400
},
{
"epoch": 3.775831087151842,
"grad_norm": 3.171875,
"learning_rate": 0.0002387134373914954,
"loss": 4.7247,
"step": 8405
},
{
"epoch": 3.7780772686433064,
"grad_norm": 3.28125,
"learning_rate": 0.00023862572762690452,
"loss": 4.8149,
"step": 8410
},
{
"epoch": 3.780323450134771,
"grad_norm": 3.28125,
"learning_rate": 0.0002385379736044496,
"loss": 4.7492,
"step": 8415
},
{
"epoch": 3.7825696316262354,
"grad_norm": 3.46875,
"learning_rate": 0.00023845017537688125,
"loss": 4.763,
"step": 8420
},
{
"epoch": 3.7848158131177,
"grad_norm": 3.15625,
"learning_rate": 0.00023836233299697685,
"loss": 4.7334,
"step": 8425
},
{
"epoch": 3.7870619946091644,
"grad_norm": 3.484375,
"learning_rate": 0.00023827444651754005,
"loss": 4.7934,
"step": 8430
},
{
"epoch": 3.789308176100629,
"grad_norm": 3.34375,
"learning_rate": 0.00023818651599140115,
"loss": 4.8119,
"step": 8435
},
{
"epoch": 3.7915543575920934,
"grad_norm": 3.453125,
"learning_rate": 0.00023809854147141695,
"loss": 4.7902,
"step": 8440
},
{
"epoch": 3.793800539083558,
"grad_norm": 3.65625,
"learning_rate": 0.00023801052301047063,
"loss": 4.8311,
"step": 8445
},
{
"epoch": 3.7960467205750223,
"grad_norm": 3.390625,
"learning_rate": 0.00023792246066147186,
"loss": 4.7616,
"step": 8450
},
{
"epoch": 3.798292902066487,
"grad_norm": 3.03125,
"learning_rate": 0.00023783435447735657,
"loss": 4.7668,
"step": 8455
},
{
"epoch": 3.8005390835579513,
"grad_norm": 3.21875,
"learning_rate": 0.00023774620451108707,
"loss": 4.7531,
"step": 8460
},
{
"epoch": 3.802785265049416,
"grad_norm": 3.203125,
"learning_rate": 0.00023765801081565213,
"loss": 4.8713,
"step": 8465
},
{
"epoch": 3.8050314465408803,
"grad_norm": 3.140625,
"learning_rate": 0.00023756977344406663,
"loss": 4.878,
"step": 8470
},
{
"epoch": 3.8072776280323453,
"grad_norm": 3.40625,
"learning_rate": 0.00023748149244937186,
"loss": 4.7309,
"step": 8475
},
{
"epoch": 3.8095238095238093,
"grad_norm": 3.109375,
"learning_rate": 0.00023739316788463517,
"loss": 4.7774,
"step": 8480
},
{
"epoch": 3.8117699910152743,
"grad_norm": 3.109375,
"learning_rate": 0.00023730479980295022,
"loss": 4.7595,
"step": 8485
},
{
"epoch": 3.8140161725067383,
"grad_norm": 3.296875,
"learning_rate": 0.0002372163882574368,
"loss": 4.7536,
"step": 8490
},
{
"epoch": 3.8162623539982032,
"grad_norm": 3.3125,
"learning_rate": 0.00023712793330124077,
"loss": 4.7914,
"step": 8495
},
{
"epoch": 3.8185085354896673,
"grad_norm": 3.3125,
"learning_rate": 0.00023703943498753417,
"loss": 4.7724,
"step": 8500
},
{
"epoch": 3.8207547169811322,
"grad_norm": 3.03125,
"learning_rate": 0.00023695089336951507,
"loss": 4.7454,
"step": 8505
},
{
"epoch": 3.8230008984725967,
"grad_norm": 3.21875,
"learning_rate": 0.00023686230850040758,
"loss": 4.7626,
"step": 8510
},
{
"epoch": 3.825247079964061,
"grad_norm": 3.296875,
"learning_rate": 0.00023677368043346174,
"loss": 4.773,
"step": 8515
},
{
"epoch": 3.8274932614555257,
"grad_norm": 3.1875,
"learning_rate": 0.0002366850092219537,
"loss": 4.7775,
"step": 8520
},
{
"epoch": 3.82973944294699,
"grad_norm": 3.09375,
"learning_rate": 0.00023659629491918534,
"loss": 4.7524,
"step": 8525
},
{
"epoch": 3.8319856244384547,
"grad_norm": 3.15625,
"learning_rate": 0.0002365075375784847,
"loss": 4.783,
"step": 8530
},
{
"epoch": 3.834231805929919,
"grad_norm": 3.28125,
"learning_rate": 0.00023641873725320544,
"loss": 4.7749,
"step": 8535
},
{
"epoch": 3.8364779874213837,
"grad_norm": 3.21875,
"learning_rate": 0.0002363298939967272,
"loss": 4.7647,
"step": 8540
},
{
"epoch": 3.838724168912848,
"grad_norm": 3.40625,
"learning_rate": 0.00023624100786245547,
"loss": 4.7626,
"step": 8545
},
{
"epoch": 3.8409703504043127,
"grad_norm": 3.140625,
"learning_rate": 0.0002361520789038213,
"loss": 4.7869,
"step": 8550
},
{
"epoch": 3.843216531895777,
"grad_norm": 3.34375,
"learning_rate": 0.00023606310717428177,
"loss": 4.8144,
"step": 8555
},
{
"epoch": 3.8454627133872417,
"grad_norm": 3.015625,
"learning_rate": 0.00023597409272731946,
"loss": 4.791,
"step": 8560
},
{
"epoch": 3.847708894878706,
"grad_norm": 3.0625,
"learning_rate": 0.00023588503561644268,
"loss": 4.86,
"step": 8565
},
{
"epoch": 3.8499550763701706,
"grad_norm": 3.046875,
"learning_rate": 0.0002357959358951854,
"loss": 4.7741,
"step": 8570
},
{
"epoch": 3.852201257861635,
"grad_norm": 3.234375,
"learning_rate": 0.00023570679361710728,
"loss": 4.7514,
"step": 8575
},
{
"epoch": 3.8544474393530996,
"grad_norm": 3.296875,
"learning_rate": 0.0002356176088357934,
"loss": 4.7896,
"step": 8580
},
{
"epoch": 3.856693620844564,
"grad_norm": 3.171875,
"learning_rate": 0.00023552838160485453,
"loss": 4.7116,
"step": 8585
},
{
"epoch": 3.8589398023360286,
"grad_norm": 3.6875,
"learning_rate": 0.00023543911197792682,
"loss": 4.7938,
"step": 8590
},
{
"epoch": 3.861185983827493,
"grad_norm": 3.296875,
"learning_rate": 0.0002353498000086721,
"loss": 4.7324,
"step": 8595
},
{
"epoch": 3.8634321653189576,
"grad_norm": 3.234375,
"learning_rate": 0.00023526044575077743,
"loss": 4.7512,
"step": 8600
},
{
"epoch": 3.865678346810422,
"grad_norm": 3.484375,
"learning_rate": 0.0002351710492579555,
"loss": 4.7148,
"step": 8605
},
{
"epoch": 3.867924528301887,
"grad_norm": 3.09375,
"learning_rate": 0.00023508161058394424,
"loss": 4.7609,
"step": 8610
},
{
"epoch": 3.870170709793351,
"grad_norm": 3.40625,
"learning_rate": 0.00023499212978250696,
"loss": 4.8106,
"step": 8615
},
{
"epoch": 3.872416891284816,
"grad_norm": 3.234375,
"learning_rate": 0.00023490260690743235,
"loss": 4.8064,
"step": 8620
},
{
"epoch": 3.87466307277628,
"grad_norm": 3.34375,
"learning_rate": 0.00023481304201253438,
"loss": 4.8099,
"step": 8625
},
{
"epoch": 3.876909254267745,
"grad_norm": 3.46875,
"learning_rate": 0.00023472343515165223,
"loss": 4.8328,
"step": 8630
},
{
"epoch": 3.879155435759209,
"grad_norm": 3.109375,
"learning_rate": 0.00023463378637865036,
"loss": 4.8231,
"step": 8635
},
{
"epoch": 3.881401617250674,
"grad_norm": 3.125,
"learning_rate": 0.00023454409574741843,
"loss": 4.7911,
"step": 8640
},
{
"epoch": 3.8836477987421385,
"grad_norm": 3.078125,
"learning_rate": 0.00023445436331187108,
"loss": 4.7646,
"step": 8645
},
{
"epoch": 3.885893980233603,
"grad_norm": 3.21875,
"learning_rate": 0.0002343645891259484,
"loss": 4.8198,
"step": 8650
},
{
"epoch": 3.8881401617250675,
"grad_norm": 3.375,
"learning_rate": 0.00023427477324361532,
"loss": 4.7684,
"step": 8655
},
{
"epoch": 3.890386343216532,
"grad_norm": 3.21875,
"learning_rate": 0.00023418491571886198,
"loss": 4.7267,
"step": 8660
},
{
"epoch": 3.8926325247079965,
"grad_norm": 3.203125,
"learning_rate": 0.0002340950166057034,
"loss": 4.8222,
"step": 8665
},
{
"epoch": 3.894878706199461,
"grad_norm": 3.359375,
"learning_rate": 0.0002340050759581798,
"loss": 4.8394,
"step": 8670
},
{
"epoch": 3.8971248876909255,
"grad_norm": 2.9375,
"learning_rate": 0.00023391509383035618,
"loss": 4.7634,
"step": 8675
},
{
"epoch": 3.89937106918239,
"grad_norm": 3.25,
"learning_rate": 0.00023382507027632264,
"loss": 4.6996,
"step": 8680
},
{
"epoch": 3.9016172506738545,
"grad_norm": 3.0,
"learning_rate": 0.00023373500535019403,
"loss": 4.8458,
"step": 8685
},
{
"epoch": 3.903863432165319,
"grad_norm": 3.25,
"learning_rate": 0.00023364489910611018,
"loss": 4.7772,
"step": 8690
},
{
"epoch": 3.9061096136567834,
"grad_norm": 3.15625,
"learning_rate": 0.00023355475159823568,
"loss": 4.8331,
"step": 8695
},
{
"epoch": 3.908355795148248,
"grad_norm": 3.1875,
"learning_rate": 0.00023346456288075995,
"loss": 4.8063,
"step": 8700
},
{
"epoch": 3.9106019766397124,
"grad_norm": 3.59375,
"learning_rate": 0.00023337433300789725,
"loss": 4.7402,
"step": 8705
},
{
"epoch": 3.912848158131177,
"grad_norm": 3.046875,
"learning_rate": 0.00023328406203388646,
"loss": 4.7966,
"step": 8710
},
{
"epoch": 3.9150943396226414,
"grad_norm": 3.5,
"learning_rate": 0.00023319375001299125,
"loss": 4.7583,
"step": 8715
},
{
"epoch": 3.917340521114106,
"grad_norm": 3.15625,
"learning_rate": 0.00023310339699949995,
"loss": 4.8278,
"step": 8720
},
{
"epoch": 3.9195867026055704,
"grad_norm": 3.359375,
"learning_rate": 0.0002330130030477255,
"loss": 4.7303,
"step": 8725
},
{
"epoch": 3.921832884097035,
"grad_norm": 3.109375,
"learning_rate": 0.00023292256821200546,
"loss": 4.8432,
"step": 8730
},
{
"epoch": 3.9240790655884994,
"grad_norm": 3.1875,
"learning_rate": 0.00023283209254670203,
"loss": 4.7292,
"step": 8735
},
{
"epoch": 3.926325247079964,
"grad_norm": 3.265625,
"learning_rate": 0.00023274157610620187,
"loss": 4.7701,
"step": 8740
},
{
"epoch": 3.928571428571429,
"grad_norm": 3.28125,
"learning_rate": 0.00023265101894491623,
"loss": 4.7612,
"step": 8745
},
{
"epoch": 3.930817610062893,
"grad_norm": 4.0,
"learning_rate": 0.0002325604211172807,
"loss": 4.8229,
"step": 8750
},
{
"epoch": 3.933063791554358,
"grad_norm": 3.21875,
"learning_rate": 0.00023246978267775546,
"loss": 4.7351,
"step": 8755
},
{
"epoch": 3.935309973045822,
"grad_norm": 3.203125,
"learning_rate": 0.00023237910368082503,
"loss": 4.8045,
"step": 8760
},
{
"epoch": 3.937556154537287,
"grad_norm": 3.40625,
"learning_rate": 0.0002322883841809983,
"loss": 4.8242,
"step": 8765
},
{
"epoch": 3.939802336028751,
"grad_norm": 3.234375,
"learning_rate": 0.00023219762423280863,
"loss": 4.7605,
"step": 8770
},
{
"epoch": 3.942048517520216,
"grad_norm": 3.15625,
"learning_rate": 0.00023210682389081355,
"loss": 4.7575,
"step": 8775
},
{
"epoch": 3.9442946990116803,
"grad_norm": 3.15625,
"learning_rate": 0.00023201598320959487,
"loss": 4.7075,
"step": 8780
},
{
"epoch": 3.9465408805031448,
"grad_norm": 3.46875,
"learning_rate": 0.00023192510224375875,
"loss": 4.7283,
"step": 8785
},
{
"epoch": 3.9487870619946093,
"grad_norm": 3.203125,
"learning_rate": 0.00023183418104793548,
"loss": 4.7323,
"step": 8790
},
{
"epoch": 3.9510332434860738,
"grad_norm": 3.21875,
"learning_rate": 0.00023174321967677958,
"loss": 4.7524,
"step": 8795
},
{
"epoch": 3.9532794249775383,
"grad_norm": 3.265625,
"learning_rate": 0.00023165221818496976,
"loss": 4.7401,
"step": 8800
},
{
"epoch": 3.9555256064690028,
"grad_norm": 3.421875,
"learning_rate": 0.00023156117662720876,
"loss": 4.8213,
"step": 8805
},
{
"epoch": 3.9577717879604672,
"grad_norm": 3.375,
"learning_rate": 0.0002314700950582234,
"loss": 4.778,
"step": 8810
},
{
"epoch": 3.9600179694519317,
"grad_norm": 3.3125,
"learning_rate": 0.00023137897353276468,
"loss": 4.7286,
"step": 8815
},
{
"epoch": 3.9622641509433962,
"grad_norm": 3.296875,
"learning_rate": 0.0002312878121056074,
"loss": 4.7604,
"step": 8820
},
{
"epoch": 3.9645103324348607,
"grad_norm": 3.578125,
"learning_rate": 0.00023119661083155057,
"loss": 4.7712,
"step": 8825
},
{
"epoch": 3.9667565139263252,
"grad_norm": 3.546875,
"learning_rate": 0.0002311053697654171,
"loss": 4.7624,
"step": 8830
},
{
"epoch": 3.9690026954177897,
"grad_norm": 3.109375,
"learning_rate": 0.00023101408896205366,
"loss": 4.7509,
"step": 8835
},
{
"epoch": 3.971248876909254,
"grad_norm": 3.015625,
"learning_rate": 0.00023092276847633101,
"loss": 4.8025,
"step": 8840
},
{
"epoch": 3.9734950584007187,
"grad_norm": 3.234375,
"learning_rate": 0.00023083140836314367,
"loss": 4.8212,
"step": 8845
},
{
"epoch": 3.975741239892183,
"grad_norm": 3.203125,
"learning_rate": 0.00023074000867740995,
"loss": 4.6859,
"step": 8850
},
{
"epoch": 3.9779874213836477,
"grad_norm": 3.390625,
"learning_rate": 0.000230648569474072,
"loss": 4.8425,
"step": 8855
},
{
"epoch": 3.980233602875112,
"grad_norm": 3.375,
"learning_rate": 0.0002305570908080957,
"loss": 4.7836,
"step": 8860
},
{
"epoch": 3.9824797843665767,
"grad_norm": 3.453125,
"learning_rate": 0.00023046557273447075,
"loss": 4.8095,
"step": 8865
},
{
"epoch": 3.984725965858041,
"grad_norm": 3.296875,
"learning_rate": 0.00023037401530821042,
"loss": 4.772,
"step": 8870
},
{
"epoch": 3.9869721473495057,
"grad_norm": 3.484375,
"learning_rate": 0.00023028241858435154,
"loss": 4.7742,
"step": 8875
},
{
"epoch": 3.9892183288409706,
"grad_norm": 3.375,
"learning_rate": 0.0002301907826179548,
"loss": 4.7964,
"step": 8880
},
{
"epoch": 3.9914645103324347,
"grad_norm": 3.25,
"learning_rate": 0.00023009910746410442,
"loss": 4.7904,
"step": 8885
},
{
"epoch": 3.9937106918238996,
"grad_norm": 3.015625,
"learning_rate": 0.00023000739317790805,
"loss": 4.8029,
"step": 8890
},
{
"epoch": 3.9959568733153636,
"grad_norm": 3.421875,
"learning_rate": 0.00022991563981449693,
"loss": 4.721,
"step": 8895
},
{
"epoch": 3.9982030548068286,
"grad_norm": 3.109375,
"learning_rate": 0.00022982384742902586,
"loss": 4.7919,
"step": 8900
},
{
"epoch": 4.000449236298293,
"grad_norm": 3.296875,
"learning_rate": 0.00022973201607667297,
"loss": 4.8092,
"step": 8905
},
{
"epoch": 4.002695417789758,
"grad_norm": 3.28125,
"learning_rate": 0.00022964014581263993,
"loss": 4.6842,
"step": 8910
},
{
"epoch": 4.004941599281222,
"grad_norm": 3.28125,
"learning_rate": 0.0002295482366921517,
"loss": 4.7022,
"step": 8915
},
{
"epoch": 4.007187780772687,
"grad_norm": 3.203125,
"learning_rate": 0.00022945628877045675,
"loss": 4.6657,
"step": 8920
},
{
"epoch": 4.009433962264151,
"grad_norm": 3.21875,
"learning_rate": 0.00022936430210282674,
"loss": 4.6306,
"step": 8925
},
{
"epoch": 4.0116801437556155,
"grad_norm": 3.3125,
"learning_rate": 0.00022927227674455653,
"loss": 4.6586,
"step": 8930
},
{
"epoch": 4.01392632524708,
"grad_norm": 3.265625,
"learning_rate": 0.0002291802127509645,
"loss": 4.7236,
"step": 8935
},
{
"epoch": 4.0161725067385445,
"grad_norm": 3.296875,
"learning_rate": 0.0002290881101773921,
"loss": 4.7126,
"step": 8940
},
{
"epoch": 4.018418688230009,
"grad_norm": 3.25,
"learning_rate": 0.00022899596907920389,
"loss": 4.7159,
"step": 8945
},
{
"epoch": 4.0206648697214735,
"grad_norm": 3.46875,
"learning_rate": 0.0002289037895117878,
"loss": 4.7184,
"step": 8950
},
{
"epoch": 4.022911051212938,
"grad_norm": 3.109375,
"learning_rate": 0.0002288115715305547,
"loss": 4.6531,
"step": 8955
},
{
"epoch": 4.0251572327044025,
"grad_norm": 3.34375,
"learning_rate": 0.00022871931519093867,
"loss": 4.6687,
"step": 8960
},
{
"epoch": 4.0274034141958674,
"grad_norm": 3.21875,
"learning_rate": 0.00022862702054839674,
"loss": 4.6754,
"step": 8965
},
{
"epoch": 4.0296495956873315,
"grad_norm": 3.265625,
"learning_rate": 0.00022853468765840907,
"loss": 4.7094,
"step": 8970
},
{
"epoch": 4.031895777178796,
"grad_norm": 3.25,
"learning_rate": 0.00022844231657647874,
"loss": 4.6216,
"step": 8975
},
{
"epoch": 4.0341419586702605,
"grad_norm": 3.109375,
"learning_rate": 0.00022834990735813186,
"loss": 4.677,
"step": 8980
},
{
"epoch": 4.036388140161725,
"grad_norm": 3.40625,
"learning_rate": 0.0002282574600589174,
"loss": 4.6951,
"step": 8985
},
{
"epoch": 4.0386343216531895,
"grad_norm": 3.15625,
"learning_rate": 0.00022816497473440717,
"loss": 4.7113,
"step": 8990
},
{
"epoch": 4.040880503144654,
"grad_norm": 3.4375,
"learning_rate": 0.00022807245144019594,
"loss": 4.6925,
"step": 8995
},
{
"epoch": 4.0431266846361185,
"grad_norm": 3.328125,
"learning_rate": 0.00022797989023190133,
"loss": 4.6086,
"step": 9000
},
{
"epoch": 4.0431266846361185,
"eval_loss": 4.870049476623535,
"eval_runtime": 16.2198,
"eval_samples_per_second": 1912.043,
"eval_steps_per_second": 239.028,
"step": 9000
},
{
"epoch": 4.045372866127583,
"grad_norm": 3.71875,
"learning_rate": 0.00022788729116516364,
"loss": 4.732,
"step": 9005
},
{
"epoch": 4.0476190476190474,
"grad_norm": 3.4375,
"learning_rate": 0.000227794654295646,
"loss": 4.6506,
"step": 9010
},
{
"epoch": 4.049865229110512,
"grad_norm": 3.171875,
"learning_rate": 0.0002277019796790342,
"loss": 4.6555,
"step": 9015
},
{
"epoch": 4.052111410601976,
"grad_norm": 3.46875,
"learning_rate": 0.00022760926737103683,
"loss": 4.6926,
"step": 9020
},
{
"epoch": 4.054357592093441,
"grad_norm": 3.28125,
"learning_rate": 0.00022751651742738502,
"loss": 4.7167,
"step": 9025
},
{
"epoch": 4.056603773584905,
"grad_norm": 3.421875,
"learning_rate": 0.00022742372990383261,
"loss": 4.6789,
"step": 9030
},
{
"epoch": 4.05884995507637,
"grad_norm": 3.3125,
"learning_rate": 0.00022733090485615594,
"loss": 4.693,
"step": 9035
},
{
"epoch": 4.061096136567834,
"grad_norm": 3.140625,
"learning_rate": 0.00022723804234015403,
"loss": 4.6121,
"step": 9040
},
{
"epoch": 4.063342318059299,
"grad_norm": 3.15625,
"learning_rate": 0.00022714514241164825,
"loss": 4.6792,
"step": 9045
},
{
"epoch": 4.065588499550763,
"grad_norm": 3.3125,
"learning_rate": 0.00022705220512648266,
"loss": 4.7157,
"step": 9050
},
{
"epoch": 4.067834681042228,
"grad_norm": 3.203125,
"learning_rate": 0.0002269592305405237,
"loss": 4.6604,
"step": 9055
},
{
"epoch": 4.070080862533692,
"grad_norm": 3.46875,
"learning_rate": 0.00022686621870966013,
"loss": 4.7952,
"step": 9060
},
{
"epoch": 4.072327044025157,
"grad_norm": 3.25,
"learning_rate": 0.0002267731696898032,
"loss": 4.6736,
"step": 9065
},
{
"epoch": 4.074573225516621,
"grad_norm": 3.09375,
"learning_rate": 0.0002266800835368865,
"loss": 4.6698,
"step": 9070
},
{
"epoch": 4.076819407008086,
"grad_norm": 3.515625,
"learning_rate": 0.00022658696030686598,
"loss": 4.7083,
"step": 9075
},
{
"epoch": 4.07906558849955,
"grad_norm": 3.3125,
"learning_rate": 0.00022649380005571975,
"loss": 4.6677,
"step": 9080
},
{
"epoch": 4.081311769991015,
"grad_norm": 3.375,
"learning_rate": 0.0002264006028394483,
"loss": 4.6811,
"step": 9085
},
{
"epoch": 4.083557951482479,
"grad_norm": 3.1875,
"learning_rate": 0.00022630736871407436,
"loss": 4.7013,
"step": 9090
},
{
"epoch": 4.085804132973944,
"grad_norm": 3.265625,
"learning_rate": 0.00022621409773564269,
"loss": 4.6296,
"step": 9095
},
{
"epoch": 4.088050314465409,
"grad_norm": 3.453125,
"learning_rate": 0.00022612078996022032,
"loss": 4.7159,
"step": 9100
},
{
"epoch": 4.090296495956873,
"grad_norm": 3.1875,
"learning_rate": 0.0002260274454438964,
"loss": 4.6755,
"step": 9105
},
{
"epoch": 4.092542677448338,
"grad_norm": 3.328125,
"learning_rate": 0.00022593406424278214,
"loss": 4.6946,
"step": 9110
},
{
"epoch": 4.094788858939802,
"grad_norm": 3.5625,
"learning_rate": 0.0002258406464130108,
"loss": 4.7235,
"step": 9115
},
{
"epoch": 4.097035040431267,
"grad_norm": 3.1875,
"learning_rate": 0.00022574719201073765,
"loss": 4.6733,
"step": 9120
},
{
"epoch": 4.099281221922731,
"grad_norm": 3.34375,
"learning_rate": 0.00022565370109214,
"loss": 4.6833,
"step": 9125
},
{
"epoch": 4.101527403414196,
"grad_norm": 3.578125,
"learning_rate": 0.00022556017371341703,
"loss": 4.6202,
"step": 9130
},
{
"epoch": 4.10377358490566,
"grad_norm": 3.265625,
"learning_rate": 0.0002254666099307899,
"loss": 4.7795,
"step": 9135
},
{
"epoch": 4.106019766397125,
"grad_norm": 3.3125,
"learning_rate": 0.00022537300980050157,
"loss": 4.6459,
"step": 9140
},
{
"epoch": 4.108265947888589,
"grad_norm": 3.265625,
"learning_rate": 0.00022527937337881698,
"loss": 4.7103,
"step": 9145
},
{
"epoch": 4.110512129380054,
"grad_norm": 3.25,
"learning_rate": 0.0002251857007220228,
"loss": 4.6682,
"step": 9150
},
{
"epoch": 4.112758310871518,
"grad_norm": 3.34375,
"learning_rate": 0.00022509199188642747,
"loss": 4.7415,
"step": 9155
},
{
"epoch": 4.115004492362983,
"grad_norm": 3.140625,
"learning_rate": 0.00022499824692836124,
"loss": 4.688,
"step": 9160
},
{
"epoch": 4.117250673854447,
"grad_norm": 3.28125,
"learning_rate": 0.00022490446590417594,
"loss": 4.654,
"step": 9165
},
{
"epoch": 4.119496855345912,
"grad_norm": 3.265625,
"learning_rate": 0.0002248106488702453,
"loss": 4.7146,
"step": 9170
},
{
"epoch": 4.121743036837376,
"grad_norm": 3.296875,
"learning_rate": 0.00022471679588296456,
"loss": 4.6695,
"step": 9175
},
{
"epoch": 4.123989218328841,
"grad_norm": 3.234375,
"learning_rate": 0.00022462290699875044,
"loss": 4.712,
"step": 9180
},
{
"epoch": 4.126235399820305,
"grad_norm": 3.140625,
"learning_rate": 0.00022452898227404158,
"loss": 4.6607,
"step": 9185
},
{
"epoch": 4.12848158131177,
"grad_norm": 3.46875,
"learning_rate": 0.00022443502176529783,
"loss": 4.5974,
"step": 9190
},
{
"epoch": 4.130727762803234,
"grad_norm": 3.3125,
"learning_rate": 0.00022434102552900073,
"loss": 4.6807,
"step": 9195
},
{
"epoch": 4.132973944294699,
"grad_norm": 3.421875,
"learning_rate": 0.0002242469936216533,
"loss": 4.63,
"step": 9200
},
{
"epoch": 4.135220125786163,
"grad_norm": 3.3125,
"learning_rate": 0.00022415292609977988,
"loss": 4.6643,
"step": 9205
},
{
"epoch": 4.137466307277628,
"grad_norm": 3.375,
"learning_rate": 0.00022405882301992637,
"loss": 4.6353,
"step": 9210
},
{
"epoch": 4.139712488769092,
"grad_norm": 3.265625,
"learning_rate": 0.00022396468443865994,
"loss": 4.6658,
"step": 9215
},
{
"epoch": 4.141958670260557,
"grad_norm": 3.28125,
"learning_rate": 0.00022387051041256907,
"loss": 4.6706,
"step": 9220
},
{
"epoch": 4.144204851752021,
"grad_norm": 3.125,
"learning_rate": 0.00022377630099826366,
"loss": 4.7247,
"step": 9225
},
{
"epoch": 4.146451033243486,
"grad_norm": 3.1875,
"learning_rate": 0.0002236820562523749,
"loss": 4.6981,
"step": 9230
},
{
"epoch": 4.148697214734951,
"grad_norm": 3.3125,
"learning_rate": 0.00022358777623155505,
"loss": 4.6588,
"step": 9235
},
{
"epoch": 4.150943396226415,
"grad_norm": 3.15625,
"learning_rate": 0.00022349346099247768,
"loss": 4.7104,
"step": 9240
},
{
"epoch": 4.15318957771788,
"grad_norm": 3.34375,
"learning_rate": 0.00022339911059183763,
"loss": 4.6564,
"step": 9245
},
{
"epoch": 4.155435759209344,
"grad_norm": 3.34375,
"learning_rate": 0.00022330472508635062,
"loss": 4.6547,
"step": 9250
},
{
"epoch": 4.157681940700809,
"grad_norm": 3.359375,
"learning_rate": 0.0002232103045327537,
"loss": 4.7364,
"step": 9255
},
{
"epoch": 4.159928122192273,
"grad_norm": 3.25,
"learning_rate": 0.00022311584898780494,
"loss": 4.6378,
"step": 9260
},
{
"epoch": 4.162174303683738,
"grad_norm": 3.25,
"learning_rate": 0.00022302135850828337,
"loss": 4.6988,
"step": 9265
},
{
"epoch": 4.164420485175202,
"grad_norm": 3.25,
"learning_rate": 0.00022292683315098904,
"loss": 4.6526,
"step": 9270
},
{
"epoch": 4.166666666666667,
"grad_norm": 3.484375,
"learning_rate": 0.00022283227297274305,
"loss": 4.6805,
"step": 9275
},
{
"epoch": 4.168912848158131,
"grad_norm": 3.703125,
"learning_rate": 0.00022273767803038727,
"loss": 4.6883,
"step": 9280
},
{
"epoch": 4.171159029649596,
"grad_norm": 3.390625,
"learning_rate": 0.00022264304838078475,
"loss": 4.6406,
"step": 9285
},
{
"epoch": 4.17340521114106,
"grad_norm": 3.28125,
"learning_rate": 0.00022254838408081908,
"loss": 4.7056,
"step": 9290
},
{
"epoch": 4.175651392632525,
"grad_norm": 3.515625,
"learning_rate": 0.0002224536851873948,
"loss": 4.7173,
"step": 9295
},
{
"epoch": 4.177897574123989,
"grad_norm": 3.234375,
"learning_rate": 0.00022235895175743743,
"loss": 4.6716,
"step": 9300
},
{
"epoch": 4.180143755615454,
"grad_norm": 3.53125,
"learning_rate": 0.00022226418384789284,
"loss": 4.6478,
"step": 9305
},
{
"epoch": 4.182389937106918,
"grad_norm": 3.34375,
"learning_rate": 0.00022216938151572814,
"loss": 4.6902,
"step": 9310
},
{
"epoch": 4.184636118598383,
"grad_norm": 3.46875,
"learning_rate": 0.00022207454481793063,
"loss": 4.694,
"step": 9315
},
{
"epoch": 4.186882300089847,
"grad_norm": 3.21875,
"learning_rate": 0.00022197967381150867,
"loss": 4.6173,
"step": 9320
},
{
"epoch": 4.189128481581312,
"grad_norm": 3.28125,
"learning_rate": 0.00022188476855349102,
"loss": 4.6479,
"step": 9325
},
{
"epoch": 4.191374663072776,
"grad_norm": 3.53125,
"learning_rate": 0.0002217898291009271,
"loss": 4.7526,
"step": 9330
},
{
"epoch": 4.193620844564241,
"grad_norm": 3.40625,
"learning_rate": 0.00022169485551088678,
"loss": 4.6498,
"step": 9335
},
{
"epoch": 4.195867026055705,
"grad_norm": 3.640625,
"learning_rate": 0.00022159984784046063,
"loss": 4.6994,
"step": 9340
},
{
"epoch": 4.19811320754717,
"grad_norm": 3.46875,
"learning_rate": 0.00022150480614675962,
"loss": 4.674,
"step": 9345
},
{
"epoch": 4.200359389038634,
"grad_norm": 3.375,
"learning_rate": 0.00022140973048691512,
"loss": 4.6204,
"step": 9350
},
{
"epoch": 4.202605570530099,
"grad_norm": 3.203125,
"learning_rate": 0.00022131462091807904,
"loss": 4.6888,
"step": 9355
},
{
"epoch": 4.204851752021563,
"grad_norm": 3.328125,
"learning_rate": 0.00022121947749742353,
"loss": 4.7396,
"step": 9360
},
{
"epoch": 4.207097933513028,
"grad_norm": 3.25,
"learning_rate": 0.0002211243002821412,
"loss": 4.7202,
"step": 9365
},
{
"epoch": 4.209344115004493,
"grad_norm": 3.421875,
"learning_rate": 0.00022102908932944488,
"loss": 4.6249,
"step": 9370
},
{
"epoch": 4.211590296495957,
"grad_norm": 3.140625,
"learning_rate": 0.00022093384469656785,
"loss": 4.6602,
"step": 9375
},
{
"epoch": 4.213836477987422,
"grad_norm": 3.234375,
"learning_rate": 0.00022083856644076338,
"loss": 4.6307,
"step": 9380
},
{
"epoch": 4.216082659478886,
"grad_norm": 3.40625,
"learning_rate": 0.00022074325461930524,
"loss": 4.678,
"step": 9385
},
{
"epoch": 4.218328840970351,
"grad_norm": 3.0625,
"learning_rate": 0.00022064790928948708,
"loss": 4.6617,
"step": 9390
},
{
"epoch": 4.220575022461815,
"grad_norm": 3.0625,
"learning_rate": 0.00022055253050862295,
"loss": 4.6534,
"step": 9395
},
{
"epoch": 4.22282120395328,
"grad_norm": 3.265625,
"learning_rate": 0.00022045711833404682,
"loss": 4.6576,
"step": 9400
},
{
"epoch": 4.225067385444744,
"grad_norm": 3.328125,
"learning_rate": 0.0002203616728231129,
"loss": 4.683,
"step": 9405
},
{
"epoch": 4.227313566936209,
"grad_norm": 3.25,
"learning_rate": 0.0002202661940331953,
"loss": 4.7134,
"step": 9410
},
{
"epoch": 4.229559748427673,
"grad_norm": 3.5,
"learning_rate": 0.00022017068202168818,
"loss": 4.6617,
"step": 9415
},
{
"epoch": 4.231805929919138,
"grad_norm": 3.34375,
"learning_rate": 0.0002200751368460057,
"loss": 4.6808,
"step": 9420
},
{
"epoch": 4.234052111410602,
"grad_norm": 3.265625,
"learning_rate": 0.00021997955856358184,
"loss": 4.6932,
"step": 9425
},
{
"epoch": 4.236298292902067,
"grad_norm": 3.25,
"learning_rate": 0.00021988394723187075,
"loss": 4.6907,
"step": 9430
},
{
"epoch": 4.238544474393531,
"grad_norm": 3.359375,
"learning_rate": 0.00021978830290834614,
"loss": 4.7321,
"step": 9435
},
{
"epoch": 4.240790655884996,
"grad_norm": 3.453125,
"learning_rate": 0.0002196926256505017,
"loss": 4.714,
"step": 9440
},
{
"epoch": 4.24303683737646,
"grad_norm": 3.28125,
"learning_rate": 0.00021959691551585097,
"loss": 4.7027,
"step": 9445
},
{
"epoch": 4.245283018867925,
"grad_norm": 3.578125,
"learning_rate": 0.0002195011725619271,
"loss": 4.5976,
"step": 9450
},
{
"epoch": 4.247529200359389,
"grad_norm": 3.234375,
"learning_rate": 0.00021940539684628307,
"loss": 4.629,
"step": 9455
},
{
"epoch": 4.249775381850854,
"grad_norm": 3.546875,
"learning_rate": 0.00021930958842649156,
"loss": 4.6977,
"step": 9460
},
{
"epoch": 4.252021563342318,
"grad_norm": 3.25,
"learning_rate": 0.00021921374736014488,
"loss": 4.6177,
"step": 9465
},
{
"epoch": 4.254267744833783,
"grad_norm": 3.359375,
"learning_rate": 0.00021911787370485497,
"loss": 4.7463,
"step": 9470
},
{
"epoch": 4.256513926325247,
"grad_norm": 4.34375,
"learning_rate": 0.00021902196751825333,
"loss": 4.6008,
"step": 9475
},
{
"epoch": 4.258760107816712,
"grad_norm": 3.46875,
"learning_rate": 0.0002189260288579911,
"loss": 4.6715,
"step": 9480
},
{
"epoch": 4.261006289308176,
"grad_norm": 3.109375,
"learning_rate": 0.00021883005778173878,
"loss": 4.692,
"step": 9485
},
{
"epoch": 4.263252470799641,
"grad_norm": 3.265625,
"learning_rate": 0.00021873405434718655,
"loss": 4.7109,
"step": 9490
},
{
"epoch": 4.265498652291106,
"grad_norm": 3.234375,
"learning_rate": 0.00021863801861204393,
"loss": 4.7958,
"step": 9495
},
{
"epoch": 4.26774483378257,
"grad_norm": 3.484375,
"learning_rate": 0.00021854195063403988,
"loss": 4.682,
"step": 9500
},
{
"epoch": 4.269991015274034,
"grad_norm": 3.3125,
"learning_rate": 0.00021844585047092274,
"loss": 4.6555,
"step": 9505
},
{
"epoch": 4.272237196765499,
"grad_norm": 3.515625,
"learning_rate": 0.00021834971818046018,
"loss": 4.6723,
"step": 9510
},
{
"epoch": 4.274483378256964,
"grad_norm": 3.53125,
"learning_rate": 0.00021825355382043917,
"loss": 4.688,
"step": 9515
},
{
"epoch": 4.276729559748428,
"grad_norm": 3.25,
"learning_rate": 0.0002181573574486661,
"loss": 4.7326,
"step": 9520
},
{
"epoch": 4.2789757412398925,
"grad_norm": 3.25,
"learning_rate": 0.00021806112912296633,
"loss": 4.6849,
"step": 9525
},
{
"epoch": 4.281221922731357,
"grad_norm": 3.421875,
"learning_rate": 0.00021796486890118474,
"loss": 4.6588,
"step": 9530
},
{
"epoch": 4.2834681042228215,
"grad_norm": 3.40625,
"learning_rate": 0.00021786857684118514,
"loss": 4.7288,
"step": 9535
},
{
"epoch": 4.285714285714286,
"grad_norm": 3.21875,
"learning_rate": 0.00021777225300085055,
"loss": 4.749,
"step": 9540
},
{
"epoch": 4.2879604672057505,
"grad_norm": 3.359375,
"learning_rate": 0.0002176758974380832,
"loss": 4.7648,
"step": 9545
},
{
"epoch": 4.290206648697215,
"grad_norm": 3.59375,
"learning_rate": 0.00021757951021080424,
"loss": 4.7049,
"step": 9550
},
{
"epoch": 4.2924528301886795,
"grad_norm": 3.25,
"learning_rate": 0.00021748309137695394,
"loss": 4.6978,
"step": 9555
},
{
"epoch": 4.294699011680144,
"grad_norm": 3.4375,
"learning_rate": 0.00021738664099449158,
"loss": 4.7332,
"step": 9560
},
{
"epoch": 4.2969451931716085,
"grad_norm": 3.25,
"learning_rate": 0.0002172901591213953,
"loss": 4.6581,
"step": 9565
},
{
"epoch": 4.2991913746630726,
"grad_norm": 3.46875,
"learning_rate": 0.00021719364581566225,
"loss": 4.5986,
"step": 9570
},
{
"epoch": 4.3014375561545375,
"grad_norm": 3.328125,
"learning_rate": 0.00021709710113530851,
"loss": 4.6496,
"step": 9575
},
{
"epoch": 4.3036837376460015,
"grad_norm": 3.296875,
"learning_rate": 0.00021700052513836892,
"loss": 4.599,
"step": 9580
},
{
"epoch": 4.3059299191374665,
"grad_norm": 3.59375,
"learning_rate": 0.00021690391788289725,
"loss": 4.6833,
"step": 9585
},
{
"epoch": 4.3081761006289305,
"grad_norm": 3.4375,
"learning_rate": 0.00021680727942696595,
"loss": 4.6969,
"step": 9590
},
{
"epoch": 4.3104222821203955,
"grad_norm": 3.515625,
"learning_rate": 0.00021671060982866638,
"loss": 4.6736,
"step": 9595
},
{
"epoch": 4.3126684636118595,
"grad_norm": 3.140625,
"learning_rate": 0.00021661390914610846,
"loss": 4.7603,
"step": 9600
},
{
"epoch": 4.3149146451033245,
"grad_norm": 3.28125,
"learning_rate": 0.00021651717743742082,
"loss": 4.6373,
"step": 9605
},
{
"epoch": 4.3171608265947885,
"grad_norm": 3.40625,
"learning_rate": 0.00021642041476075088,
"loss": 4.6946,
"step": 9610
},
{
"epoch": 4.319407008086253,
"grad_norm": 3.546875,
"learning_rate": 0.0002163236211742645,
"loss": 4.6837,
"step": 9615
},
{
"epoch": 4.3216531895777175,
"grad_norm": 3.25,
"learning_rate": 0.00021622679673614621,
"loss": 4.7555,
"step": 9620
},
{
"epoch": 4.323899371069182,
"grad_norm": 3.3125,
"learning_rate": 0.0002161299415045991,
"loss": 4.7089,
"step": 9625
},
{
"epoch": 4.3261455525606465,
"grad_norm": 3.40625,
"learning_rate": 0.00021603305553784472,
"loss": 4.7033,
"step": 9630
},
{
"epoch": 4.328391734052111,
"grad_norm": 3.171875,
"learning_rate": 0.00021593613889412313,
"loss": 4.6401,
"step": 9635
},
{
"epoch": 4.330637915543576,
"grad_norm": 3.4375,
"learning_rate": 0.00021583919163169286,
"loss": 4.6932,
"step": 9640
},
{
"epoch": 4.33288409703504,
"grad_norm": 3.25,
"learning_rate": 0.00021574221380883072,
"loss": 4.6643,
"step": 9645
},
{
"epoch": 4.335130278526505,
"grad_norm": 3.484375,
"learning_rate": 0.000215645205483832,
"loss": 4.685,
"step": 9650
},
{
"epoch": 4.337376460017969,
"grad_norm": 3.4375,
"learning_rate": 0.00021554816671501034,
"loss": 4.6756,
"step": 9655
},
{
"epoch": 4.339622641509434,
"grad_norm": 3.5,
"learning_rate": 0.0002154510975606976,
"loss": 4.6344,
"step": 9660
},
{
"epoch": 4.341868823000898,
"grad_norm": 3.296875,
"learning_rate": 0.00021535399807924398,
"loss": 4.6881,
"step": 9665
},
{
"epoch": 4.344115004492363,
"grad_norm": 3.390625,
"learning_rate": 0.0002152568683290178,
"loss": 4.6898,
"step": 9670
},
{
"epoch": 4.346361185983827,
"grad_norm": 3.109375,
"learning_rate": 0.0002151597083684058,
"loss": 4.7361,
"step": 9675
},
{
"epoch": 4.348607367475292,
"grad_norm": 3.359375,
"learning_rate": 0.00021506251825581255,
"loss": 4.6747,
"step": 9680
},
{
"epoch": 4.350853548966756,
"grad_norm": 3.34375,
"learning_rate": 0.00021496529804966103,
"loss": 4.7717,
"step": 9685
},
{
"epoch": 4.353099730458221,
"grad_norm": 3.203125,
"learning_rate": 0.00021486804780839226,
"loss": 4.6896,
"step": 9690
},
{
"epoch": 4.355345911949685,
"grad_norm": 3.78125,
"learning_rate": 0.00021477076759046513,
"loss": 4.621,
"step": 9695
},
{
"epoch": 4.35759209344115,
"grad_norm": 3.5,
"learning_rate": 0.00021467345745435678,
"loss": 4.7051,
"step": 9700
},
{
"epoch": 4.359838274932614,
"grad_norm": 3.203125,
"learning_rate": 0.0002145761174585622,
"loss": 4.6858,
"step": 9705
},
{
"epoch": 4.362084456424079,
"grad_norm": 3.515625,
"learning_rate": 0.00021447874766159433,
"loss": 4.7533,
"step": 9710
},
{
"epoch": 4.364330637915543,
"grad_norm": 3.265625,
"learning_rate": 0.00021438134812198415,
"loss": 4.7265,
"step": 9715
},
{
"epoch": 4.366576819407008,
"grad_norm": 3.265625,
"learning_rate": 0.00021428391889828034,
"loss": 4.6811,
"step": 9720
},
{
"epoch": 4.368823000898472,
"grad_norm": 3.453125,
"learning_rate": 0.00021418646004904953,
"loss": 4.6191,
"step": 9725
},
{
"epoch": 4.371069182389937,
"grad_norm": 3.640625,
"learning_rate": 0.00021408897163287615,
"loss": 4.7698,
"step": 9730
},
{
"epoch": 4.373315363881401,
"grad_norm": 3.296875,
"learning_rate": 0.00021399145370836238,
"loss": 4.7045,
"step": 9735
},
{
"epoch": 4.375561545372866,
"grad_norm": 3.203125,
"learning_rate": 0.0002138939063341282,
"loss": 4.6745,
"step": 9740
},
{
"epoch": 4.37780772686433,
"grad_norm": 3.5,
"learning_rate": 0.00021379632956881116,
"loss": 4.6972,
"step": 9745
},
{
"epoch": 4.380053908355795,
"grad_norm": 3.296875,
"learning_rate": 0.00021369872347106662,
"loss": 4.6778,
"step": 9750
},
{
"epoch": 4.382300089847259,
"grad_norm": 3.578125,
"learning_rate": 0.00021360108809956752,
"loss": 4.7863,
"step": 9755
},
{
"epoch": 4.384546271338724,
"grad_norm": 3.171875,
"learning_rate": 0.00021350342351300438,
"loss": 4.6412,
"step": 9760
},
{
"epoch": 4.386792452830189,
"grad_norm": 3.296875,
"learning_rate": 0.00021340572977008524,
"loss": 4.6175,
"step": 9765
},
{
"epoch": 4.389038634321653,
"grad_norm": 3.09375,
"learning_rate": 0.0002133080069295358,
"loss": 4.7325,
"step": 9770
},
{
"epoch": 4.391284815813117,
"grad_norm": 3.296875,
"learning_rate": 0.0002132102550500991,
"loss": 4.6518,
"step": 9775
},
{
"epoch": 4.393530997304582,
"grad_norm": 3.328125,
"learning_rate": 0.00021311247419053574,
"loss": 4.701,
"step": 9780
},
{
"epoch": 4.395777178796047,
"grad_norm": 3.21875,
"learning_rate": 0.0002130146644096237,
"loss": 4.6722,
"step": 9785
},
{
"epoch": 4.398023360287511,
"grad_norm": 3.40625,
"learning_rate": 0.00021291682576615837,
"loss": 4.6529,
"step": 9790
},
{
"epoch": 4.400269541778976,
"grad_norm": 3.25,
"learning_rate": 0.00021281895831895247,
"loss": 4.6846,
"step": 9795
},
{
"epoch": 4.40251572327044,
"grad_norm": 3.34375,
"learning_rate": 0.00021272106212683598,
"loss": 4.674,
"step": 9800
},
{
"epoch": 4.404761904761905,
"grad_norm": 3.34375,
"learning_rate": 0.00021262313724865626,
"loss": 4.6378,
"step": 9805
},
{
"epoch": 4.407008086253369,
"grad_norm": 3.34375,
"learning_rate": 0.0002125251837432779,
"loss": 4.7018,
"step": 9810
},
{
"epoch": 4.409254267744834,
"grad_norm": 3.265625,
"learning_rate": 0.00021242720166958257,
"loss": 4.68,
"step": 9815
},
{
"epoch": 4.411500449236298,
"grad_norm": 3.515625,
"learning_rate": 0.00021232919108646933,
"loss": 4.6918,
"step": 9820
},
{
"epoch": 4.413746630727763,
"grad_norm": 3.75,
"learning_rate": 0.00021223115205285418,
"loss": 4.6849,
"step": 9825
},
{
"epoch": 4.415992812219227,
"grad_norm": 3.25,
"learning_rate": 0.00021213308462767025,
"loss": 4.6931,
"step": 9830
},
{
"epoch": 4.418238993710692,
"grad_norm": 3.453125,
"learning_rate": 0.00021203498886986793,
"loss": 4.6482,
"step": 9835
},
{
"epoch": 4.420485175202156,
"grad_norm": 3.34375,
"learning_rate": 0.00021193686483841437,
"loss": 4.6818,
"step": 9840
},
{
"epoch": 4.422731356693621,
"grad_norm": 3.1875,
"learning_rate": 0.00021183871259229393,
"loss": 4.6446,
"step": 9845
},
{
"epoch": 4.424977538185085,
"grad_norm": 3.453125,
"learning_rate": 0.00021174053219050778,
"loss": 4.6766,
"step": 9850
},
{
"epoch": 4.42722371967655,
"grad_norm": 3.3125,
"learning_rate": 0.0002116423236920741,
"loss": 4.6678,
"step": 9855
},
{
"epoch": 4.429469901168014,
"grad_norm": 3.390625,
"learning_rate": 0.00021154408715602795,
"loss": 4.7295,
"step": 9860
},
{
"epoch": 4.431716082659479,
"grad_norm": 3.171875,
"learning_rate": 0.00021144582264142123,
"loss": 4.7037,
"step": 9865
},
{
"epoch": 4.433962264150943,
"grad_norm": 3.4375,
"learning_rate": 0.00021134753020732265,
"loss": 4.6139,
"step": 9870
},
{
"epoch": 4.436208445642408,
"grad_norm": 3.5625,
"learning_rate": 0.00021124920991281778,
"loss": 4.6233,
"step": 9875
},
{
"epoch": 4.438454627133872,
"grad_norm": 3.359375,
"learning_rate": 0.00021115086181700877,
"loss": 4.6462,
"step": 9880
},
{
"epoch": 4.440700808625337,
"grad_norm": 3.234375,
"learning_rate": 0.00021105248597901456,
"loss": 4.692,
"step": 9885
},
{
"epoch": 4.442946990116801,
"grad_norm": 4.03125,
"learning_rate": 0.00021095408245797094,
"loss": 4.7333,
"step": 9890
},
{
"epoch": 4.445193171608266,
"grad_norm": 3.390625,
"learning_rate": 0.00021085565131303004,
"loss": 4.7001,
"step": 9895
},
{
"epoch": 4.44743935309973,
"grad_norm": 3.515625,
"learning_rate": 0.00021075719260336086,
"loss": 4.6733,
"step": 9900
},
{
"epoch": 4.449685534591195,
"grad_norm": 3.375,
"learning_rate": 0.00021065870638814875,
"loss": 4.6761,
"step": 9905
},
{
"epoch": 4.45193171608266,
"grad_norm": 3.65625,
"learning_rate": 0.0002105601927265958,
"loss": 4.7403,
"step": 9910
},
{
"epoch": 4.454177897574124,
"grad_norm": 3.390625,
"learning_rate": 0.0002104616516779204,
"loss": 4.6371,
"step": 9915
},
{
"epoch": 4.456424079065589,
"grad_norm": 3.296875,
"learning_rate": 0.00021036308330135752,
"loss": 4.6903,
"step": 9920
},
{
"epoch": 4.458670260557053,
"grad_norm": 3.078125,
"learning_rate": 0.00021026448765615866,
"loss": 4.6965,
"step": 9925
},
{
"epoch": 4.460916442048518,
"grad_norm": 3.375,
"learning_rate": 0.00021016586480159145,
"loss": 4.7419,
"step": 9930
},
{
"epoch": 4.463162623539982,
"grad_norm": 3.46875,
"learning_rate": 0.0002100672147969401,
"loss": 4.7129,
"step": 9935
},
{
"epoch": 4.465408805031447,
"grad_norm": 3.3125,
"learning_rate": 0.00020996853770150495,
"loss": 4.6758,
"step": 9940
},
{
"epoch": 4.467654986522911,
"grad_norm": 3.171875,
"learning_rate": 0.00020986983357460282,
"loss": 4.6629,
"step": 9945
},
{
"epoch": 4.469901168014376,
"grad_norm": 3.3125,
"learning_rate": 0.00020977110247556667,
"loss": 4.7698,
"step": 9950
},
{
"epoch": 4.47214734950584,
"grad_norm": 3.21875,
"learning_rate": 0.00020967234446374572,
"loss": 4.7114,
"step": 9955
},
{
"epoch": 4.474393530997305,
"grad_norm": 3.453125,
"learning_rate": 0.0002095735595985053,
"loss": 4.7192,
"step": 9960
},
{
"epoch": 4.476639712488769,
"grad_norm": 3.296875,
"learning_rate": 0.00020947474793922699,
"loss": 4.7341,
"step": 9965
},
{
"epoch": 4.478885893980234,
"grad_norm": 3.453125,
"learning_rate": 0.00020937590954530827,
"loss": 4.6937,
"step": 9970
},
{
"epoch": 4.481132075471698,
"grad_norm": 3.515625,
"learning_rate": 0.00020927704447616291,
"loss": 4.6864,
"step": 9975
},
{
"epoch": 4.483378256963163,
"grad_norm": 3.6875,
"learning_rate": 0.0002091781527912207,
"loss": 4.7076,
"step": 9980
},
{
"epoch": 4.485624438454627,
"grad_norm": 3.375,
"learning_rate": 0.00020907923454992729,
"loss": 4.6991,
"step": 9985
},
{
"epoch": 4.487870619946092,
"grad_norm": 3.1875,
"learning_rate": 0.0002089802898117444,
"loss": 4.64,
"step": 9990
},
{
"epoch": 4.490116801437556,
"grad_norm": 3.265625,
"learning_rate": 0.0002088813186361496,
"loss": 4.656,
"step": 9995
},
{
"epoch": 4.492362982929021,
"grad_norm": 3.578125,
"learning_rate": 0.00020878232108263647,
"loss": 4.7397,
"step": 10000
},
{
"epoch": 4.492362982929021,
"eval_loss": 4.845594882965088,
"eval_runtime": 16.0517,
"eval_samples_per_second": 1932.072,
"eval_steps_per_second": 241.532,
"step": 10000
},
{
"epoch": 4.494609164420485,
"grad_norm": 3.453125,
"learning_rate": 0.00020868329721071427,
"loss": 4.6774,
"step": 10005
},
{
"epoch": 4.49685534591195,
"grad_norm": 3.40625,
"learning_rate": 0.00020858424707990828,
"loss": 4.6391,
"step": 10010
},
{
"epoch": 4.499101527403414,
"grad_norm": 3.40625,
"learning_rate": 0.00020848517074975947,
"loss": 4.7102,
"step": 10015
},
{
"epoch": 4.501347708894879,
"grad_norm": 3.328125,
"learning_rate": 0.00020838606827982452,
"loss": 4.6926,
"step": 10020
},
{
"epoch": 4.503593890386343,
"grad_norm": 3.40625,
"learning_rate": 0.00020828693972967587,
"loss": 4.7024,
"step": 10025
},
{
"epoch": 4.505840071877808,
"grad_norm": 3.34375,
"learning_rate": 0.0002081877851589016,
"loss": 4.7202,
"step": 10030
},
{
"epoch": 4.508086253369273,
"grad_norm": 3.5,
"learning_rate": 0.00020808860462710556,
"loss": 4.7569,
"step": 10035
},
{
"epoch": 4.510332434860737,
"grad_norm": 3.53125,
"learning_rate": 0.00020798939819390697,
"loss": 4.6679,
"step": 10040
},
{
"epoch": 4.512578616352201,
"grad_norm": 3.109375,
"learning_rate": 0.00020789016591894085,
"loss": 4.6852,
"step": 10045
},
{
"epoch": 4.514824797843666,
"grad_norm": 3.234375,
"learning_rate": 0.0002077909078618576,
"loss": 4.6635,
"step": 10050
},
{
"epoch": 4.517070979335131,
"grad_norm": 3.34375,
"learning_rate": 0.00020769162408232326,
"loss": 4.6376,
"step": 10055
},
{
"epoch": 4.519317160826595,
"grad_norm": 3.453125,
"learning_rate": 0.00020759231464001916,
"loss": 4.7037,
"step": 10060
},
{
"epoch": 4.52156334231806,
"grad_norm": 3.1875,
"learning_rate": 0.0002074929795946422,
"loss": 4.7064,
"step": 10065
},
{
"epoch": 4.523809523809524,
"grad_norm": 3.25,
"learning_rate": 0.0002073936190059046,
"loss": 4.6143,
"step": 10070
},
{
"epoch": 4.526055705300989,
"grad_norm": 3.46875,
"learning_rate": 0.000207294232933534,
"loss": 4.6545,
"step": 10075
},
{
"epoch": 4.528301886792453,
"grad_norm": 3.359375,
"learning_rate": 0.00020719482143727325,
"loss": 4.6446,
"step": 10080
},
{
"epoch": 4.530548068283918,
"grad_norm": 3.640625,
"learning_rate": 0.00020709538457688054,
"loss": 4.6623,
"step": 10085
},
{
"epoch": 4.532794249775382,
"grad_norm": 3.453125,
"learning_rate": 0.00020699592241212934,
"loss": 4.7065,
"step": 10090
},
{
"epoch": 4.535040431266847,
"grad_norm": 3.234375,
"learning_rate": 0.0002068964350028083,
"loss": 4.6512,
"step": 10095
},
{
"epoch": 4.537286612758311,
"grad_norm": 3.25,
"learning_rate": 0.00020679692240872124,
"loss": 4.6402,
"step": 10100
},
{
"epoch": 4.539532794249776,
"grad_norm": 3.40625,
"learning_rate": 0.0002066973846896871,
"loss": 4.687,
"step": 10105
},
{
"epoch": 4.54177897574124,
"grad_norm": 3.546875,
"learning_rate": 0.00020659782190554,
"loss": 4.7121,
"step": 10110
},
{
"epoch": 4.544025157232705,
"grad_norm": 3.375,
"learning_rate": 0.0002064982341161291,
"loss": 4.6398,
"step": 10115
},
{
"epoch": 4.546271338724169,
"grad_norm": 3.421875,
"learning_rate": 0.00020639862138131841,
"loss": 4.664,
"step": 10120
},
{
"epoch": 4.548517520215634,
"grad_norm": 3.40625,
"learning_rate": 0.0002062989837609873,
"loss": 4.6828,
"step": 10125
},
{
"epoch": 4.550763701707098,
"grad_norm": 3.234375,
"learning_rate": 0.00020619932131502974,
"loss": 4.7815,
"step": 10130
},
{
"epoch": 4.553009883198563,
"grad_norm": 3.5,
"learning_rate": 0.00020609963410335485,
"loss": 4.6678,
"step": 10135
},
{
"epoch": 4.555256064690027,
"grad_norm": 3.390625,
"learning_rate": 0.00020599992218588652,
"loss": 4.6596,
"step": 10140
},
{
"epoch": 4.557502246181492,
"grad_norm": 3.375,
"learning_rate": 0.00020590018562256353,
"loss": 4.6422,
"step": 10145
},
{
"epoch": 4.559748427672956,
"grad_norm": 3.296875,
"learning_rate": 0.00020580042447333952,
"loss": 4.6899,
"step": 10150
},
{
"epoch": 4.561994609164421,
"grad_norm": 3.65625,
"learning_rate": 0.0002057006387981828,
"loss": 4.6943,
"step": 10155
},
{
"epoch": 4.564240790655885,
"grad_norm": 3.34375,
"learning_rate": 0.0002056008286570766,
"loss": 4.649,
"step": 10160
},
{
"epoch": 4.5664869721473496,
"grad_norm": 3.359375,
"learning_rate": 0.00020550099411001862,
"loss": 4.6801,
"step": 10165
},
{
"epoch": 4.568733153638814,
"grad_norm": 3.625,
"learning_rate": 0.00020540113521702147,
"loss": 4.6718,
"step": 10170
},
{
"epoch": 4.5709793351302785,
"grad_norm": 3.359375,
"learning_rate": 0.00020530125203811221,
"loss": 4.6745,
"step": 10175
},
{
"epoch": 4.5732255166217435,
"grad_norm": 3.515625,
"learning_rate": 0.00020520134463333258,
"loss": 4.6223,
"step": 10180
},
{
"epoch": 4.5754716981132075,
"grad_norm": 3.40625,
"learning_rate": 0.0002051014130627389,
"loss": 4.6363,
"step": 10185
},
{
"epoch": 4.577717879604672,
"grad_norm": 3.453125,
"learning_rate": 0.00020500145738640198,
"loss": 4.6598,
"step": 10190
},
{
"epoch": 4.5799640610961365,
"grad_norm": 3.265625,
"learning_rate": 0.00020490147766440714,
"loss": 4.6446,
"step": 10195
},
{
"epoch": 4.5822102425876015,
"grad_norm": 3.59375,
"learning_rate": 0.0002048014739568541,
"loss": 4.7159,
"step": 10200
},
{
"epoch": 4.5844564240790655,
"grad_norm": 3.5,
"learning_rate": 0.00020470144632385705,
"loss": 4.6996,
"step": 10205
},
{
"epoch": 4.5867026055705304,
"grad_norm": 3.640625,
"learning_rate": 0.00020460139482554463,
"loss": 4.7254,
"step": 10210
},
{
"epoch": 4.5889487870619945,
"grad_norm": 3.40625,
"learning_rate": 0.0002045013195220597,
"loss": 4.6386,
"step": 10215
},
{
"epoch": 4.591194968553459,
"grad_norm": 3.28125,
"learning_rate": 0.00020440122047355946,
"loss": 4.6847,
"step": 10220
},
{
"epoch": 4.5934411500449235,
"grad_norm": 3.265625,
"learning_rate": 0.00020430109774021547,
"loss": 4.7036,
"step": 10225
},
{
"epoch": 4.595687331536388,
"grad_norm": 3.421875,
"learning_rate": 0.00020420095138221336,
"loss": 4.6806,
"step": 10230
},
{
"epoch": 4.5979335130278525,
"grad_norm": 3.3125,
"learning_rate": 0.00020410078145975314,
"loss": 4.673,
"step": 10235
},
{
"epoch": 4.600179694519317,
"grad_norm": 3.3125,
"learning_rate": 0.00020400058803304887,
"loss": 4.6314,
"step": 10240
},
{
"epoch": 4.6024258760107815,
"grad_norm": 3.203125,
"learning_rate": 0.00020390037116232884,
"loss": 4.6995,
"step": 10245
},
{
"epoch": 4.604672057502246,
"grad_norm": 3.40625,
"learning_rate": 0.00020380013090783532,
"loss": 4.7094,
"step": 10250
},
{
"epoch": 4.6069182389937104,
"grad_norm": 3.28125,
"learning_rate": 0.00020369986732982472,
"loss": 4.6444,
"step": 10255
},
{
"epoch": 4.609164420485175,
"grad_norm": 3.359375,
"learning_rate": 0.00020359958048856737,
"loss": 4.6719,
"step": 10260
},
{
"epoch": 4.611410601976639,
"grad_norm": 3.3125,
"learning_rate": 0.00020349927044434774,
"loss": 4.7002,
"step": 10265
},
{
"epoch": 4.613656783468104,
"grad_norm": 3.234375,
"learning_rate": 0.00020339893725746403,
"loss": 4.7256,
"step": 10270
},
{
"epoch": 4.615902964959568,
"grad_norm": 3.4375,
"learning_rate": 0.00020329858098822861,
"loss": 4.6411,
"step": 10275
},
{
"epoch": 4.618149146451033,
"grad_norm": 3.265625,
"learning_rate": 0.00020319820169696756,
"loss": 4.7058,
"step": 10280
},
{
"epoch": 4.620395327942497,
"grad_norm": 3.234375,
"learning_rate": 0.00020309779944402079,
"loss": 4.6442,
"step": 10285
},
{
"epoch": 4.622641509433962,
"grad_norm": 3.390625,
"learning_rate": 0.0002029973742897421,
"loss": 4.7026,
"step": 10290
},
{
"epoch": 4.624887690925426,
"grad_norm": 3.359375,
"learning_rate": 0.000202896926294499,
"loss": 4.6634,
"step": 10295
},
{
"epoch": 4.627133872416891,
"grad_norm": 3.328125,
"learning_rate": 0.00020279645551867276,
"loss": 4.6369,
"step": 10300
},
{
"epoch": 4.629380053908356,
"grad_norm": 3.3125,
"learning_rate": 0.00020269596202265828,
"loss": 4.6473,
"step": 10305
},
{
"epoch": 4.63162623539982,
"grad_norm": 3.34375,
"learning_rate": 0.0002025954458668642,
"loss": 4.706,
"step": 10310
},
{
"epoch": 4.633872416891284,
"grad_norm": 3.171875,
"learning_rate": 0.00020249490711171276,
"loss": 4.7147,
"step": 10315
},
{
"epoch": 4.636118598382749,
"grad_norm": 3.78125,
"learning_rate": 0.00020239434581763972,
"loss": 4.6597,
"step": 10320
},
{
"epoch": 4.638364779874214,
"grad_norm": 3.328125,
"learning_rate": 0.0002022937620450945,
"loss": 4.6389,
"step": 10325
},
{
"epoch": 4.640610961365678,
"grad_norm": 3.3125,
"learning_rate": 0.00020219315585453992,
"loss": 4.7479,
"step": 10330
},
{
"epoch": 4.642857142857143,
"grad_norm": 3.390625,
"learning_rate": 0.00020209252730645234,
"loss": 4.7021,
"step": 10335
},
{
"epoch": 4.645103324348607,
"grad_norm": 3.140625,
"learning_rate": 0.00020199187646132162,
"loss": 4.5968,
"step": 10340
},
{
"epoch": 4.647349505840072,
"grad_norm": 3.15625,
"learning_rate": 0.00020189120337965082,
"loss": 4.6683,
"step": 10345
},
{
"epoch": 4.649595687331536,
"grad_norm": 3.421875,
"learning_rate": 0.00020179050812195662,
"loss": 4.7173,
"step": 10350
},
{
"epoch": 4.651841868823001,
"grad_norm": 3.109375,
"learning_rate": 0.0002016897907487688,
"loss": 4.6803,
"step": 10355
},
{
"epoch": 4.654088050314465,
"grad_norm": 3.390625,
"learning_rate": 0.00020158905132063064,
"loss": 4.7211,
"step": 10360
},
{
"epoch": 4.65633423180593,
"grad_norm": 3.546875,
"learning_rate": 0.0002014882898980985,
"loss": 4.6781,
"step": 10365
},
{
"epoch": 4.658580413297394,
"grad_norm": 3.453125,
"learning_rate": 0.00020138750654174212,
"loss": 4.6721,
"step": 10370
},
{
"epoch": 4.660826594788859,
"grad_norm": 3.5625,
"learning_rate": 0.00020128670131214427,
"loss": 4.701,
"step": 10375
},
{
"epoch": 4.663072776280323,
"grad_norm": 3.4375,
"learning_rate": 0.0002011858742699009,
"loss": 4.6892,
"step": 10380
},
{
"epoch": 4.665318957771788,
"grad_norm": 3.46875,
"learning_rate": 0.0002010850254756213,
"loss": 4.6924,
"step": 10385
},
{
"epoch": 4.667565139263252,
"grad_norm": 3.203125,
"learning_rate": 0.00020098415498992752,
"loss": 4.6235,
"step": 10390
},
{
"epoch": 4.669811320754717,
"grad_norm": 3.28125,
"learning_rate": 0.00020088326287345476,
"loss": 4.6235,
"step": 10395
},
{
"epoch": 4.672057502246181,
"grad_norm": 3.421875,
"learning_rate": 0.00020078234918685133,
"loss": 4.6504,
"step": 10400
},
{
"epoch": 4.674303683737646,
"grad_norm": 3.171875,
"learning_rate": 0.00020068141399077837,
"loss": 4.641,
"step": 10405
},
{
"epoch": 4.67654986522911,
"grad_norm": 3.375,
"learning_rate": 0.00020058045734590998,
"loss": 4.6779,
"step": 10410
},
{
"epoch": 4.678796046720575,
"grad_norm": 3.59375,
"learning_rate": 0.0002004794793129332,
"loss": 4.7042,
"step": 10415
},
{
"epoch": 4.681042228212039,
"grad_norm": 3.46875,
"learning_rate": 0.0002003784799525479,
"loss": 4.6635,
"step": 10420
},
{
"epoch": 4.683288409703504,
"grad_norm": 3.375,
"learning_rate": 0.00020027745932546677,
"loss": 4.703,
"step": 10425
},
{
"epoch": 4.685534591194968,
"grad_norm": 3.21875,
"learning_rate": 0.00020017641749241533,
"loss": 4.6605,
"step": 10430
},
{
"epoch": 4.687780772686433,
"grad_norm": 3.578125,
"learning_rate": 0.00020007535451413167,
"loss": 4.6924,
"step": 10435
},
{
"epoch": 4.690026954177897,
"grad_norm": 3.40625,
"learning_rate": 0.00019997427045136687,
"loss": 4.6854,
"step": 10440
},
{
"epoch": 4.692273135669362,
"grad_norm": 3.359375,
"learning_rate": 0.00019987316536488443,
"loss": 4.7355,
"step": 10445
},
{
"epoch": 4.694519317160827,
"grad_norm": 3.234375,
"learning_rate": 0.00019977203931546063,
"loss": 4.631,
"step": 10450
},
{
"epoch": 4.696765498652291,
"grad_norm": 3.40625,
"learning_rate": 0.00019967089236388433,
"loss": 4.7017,
"step": 10455
},
{
"epoch": 4.699011680143755,
"grad_norm": 3.21875,
"learning_rate": 0.00019956972457095692,
"loss": 4.6951,
"step": 10460
},
{
"epoch": 4.70125786163522,
"grad_norm": 3.265625,
"learning_rate": 0.00019946853599749233,
"loss": 4.6927,
"step": 10465
},
{
"epoch": 4.703504043126685,
"grad_norm": 3.375,
"learning_rate": 0.00019936732670431702,
"loss": 4.6368,
"step": 10470
},
{
"epoch": 4.705750224618149,
"grad_norm": 3.40625,
"learning_rate": 0.00019926609675226985,
"loss": 4.6823,
"step": 10475
},
{
"epoch": 4.707996406109614,
"grad_norm": 3.59375,
"learning_rate": 0.00019916484620220213,
"loss": 4.6877,
"step": 10480
},
{
"epoch": 4.710242587601078,
"grad_norm": 3.46875,
"learning_rate": 0.00019906357511497756,
"loss": 4.6462,
"step": 10485
},
{
"epoch": 4.712488769092543,
"grad_norm": 3.5,
"learning_rate": 0.00019896228355147216,
"loss": 4.7508,
"step": 10490
},
{
"epoch": 4.714734950584007,
"grad_norm": 3.3125,
"learning_rate": 0.00019886097157257427,
"loss": 4.7164,
"step": 10495
},
{
"epoch": 4.716981132075472,
"grad_norm": 3.34375,
"learning_rate": 0.00019875963923918447,
"loss": 4.5771,
"step": 10500
},
{
"epoch": 4.719227313566936,
"grad_norm": 3.390625,
"learning_rate": 0.00019865828661221564,
"loss": 4.658,
"step": 10505
},
{
"epoch": 4.721473495058401,
"grad_norm": 3.6875,
"learning_rate": 0.00019855691375259284,
"loss": 4.6967,
"step": 10510
},
{
"epoch": 4.723719676549865,
"grad_norm": 3.34375,
"learning_rate": 0.00019845552072125325,
"loss": 4.6446,
"step": 10515
},
{
"epoch": 4.72596585804133,
"grad_norm": 3.453125,
"learning_rate": 0.00019835410757914617,
"loss": 4.6289,
"step": 10520
},
{
"epoch": 4.728212039532794,
"grad_norm": 3.28125,
"learning_rate": 0.0001982526743872331,
"loss": 4.6711,
"step": 10525
},
{
"epoch": 4.730458221024259,
"grad_norm": 3.40625,
"learning_rate": 0.00019815122120648743,
"loss": 4.6641,
"step": 10530
},
{
"epoch": 4.732704402515723,
"grad_norm": 3.5625,
"learning_rate": 0.00019804974809789472,
"loss": 4.6712,
"step": 10535
},
{
"epoch": 4.734950584007188,
"grad_norm": 3.421875,
"learning_rate": 0.00019794825512245244,
"loss": 4.6884,
"step": 10540
},
{
"epoch": 4.737196765498652,
"grad_norm": 3.609375,
"learning_rate": 0.00019784674234116996,
"loss": 4.6803,
"step": 10545
},
{
"epoch": 4.739442946990117,
"grad_norm": 3.390625,
"learning_rate": 0.00019774520981506857,
"loss": 4.6987,
"step": 10550
},
{
"epoch": 4.741689128481581,
"grad_norm": 3.25,
"learning_rate": 0.00019764365760518152,
"loss": 4.6352,
"step": 10555
},
{
"epoch": 4.743935309973046,
"grad_norm": 3.25,
"learning_rate": 0.00019754208577255384,
"loss": 4.7026,
"step": 10560
},
{
"epoch": 4.74618149146451,
"grad_norm": 3.46875,
"learning_rate": 0.0001974404943782423,
"loss": 4.6432,
"step": 10565
},
{
"epoch": 4.748427672955975,
"grad_norm": 3.453125,
"learning_rate": 0.00019733888348331545,
"loss": 4.6899,
"step": 10570
},
{
"epoch": 4.75067385444744,
"grad_norm": 3.421875,
"learning_rate": 0.00019723725314885364,
"loss": 4.6817,
"step": 10575
},
{
"epoch": 4.752920035938904,
"grad_norm": 3.515625,
"learning_rate": 0.00019713560343594884,
"loss": 4.6453,
"step": 10580
},
{
"epoch": 4.755166217430368,
"grad_norm": 3.296875,
"learning_rate": 0.00019703393440570464,
"loss": 4.6756,
"step": 10585
},
{
"epoch": 4.757412398921833,
"grad_norm": 3.296875,
"learning_rate": 0.00019693224611923632,
"loss": 4.6776,
"step": 10590
},
{
"epoch": 4.759658580413298,
"grad_norm": 3.359375,
"learning_rate": 0.00019683053863767068,
"loss": 4.6709,
"step": 10595
},
{
"epoch": 4.761904761904762,
"grad_norm": 3.1875,
"learning_rate": 0.00019672881202214616,
"loss": 4.6625,
"step": 10600
},
{
"epoch": 4.764150943396227,
"grad_norm": 3.375,
"learning_rate": 0.00019662706633381244,
"loss": 4.6974,
"step": 10605
},
{
"epoch": 4.766397124887691,
"grad_norm": 3.3125,
"learning_rate": 0.00019652530163383094,
"loss": 4.6754,
"step": 10610
},
{
"epoch": 4.768643306379156,
"grad_norm": 3.640625,
"learning_rate": 0.00019642351798337444,
"loss": 4.7521,
"step": 10615
},
{
"epoch": 4.77088948787062,
"grad_norm": 3.5625,
"learning_rate": 0.00019632171544362706,
"loss": 4.7013,
"step": 10620
},
{
"epoch": 4.773135669362085,
"grad_norm": 3.734375,
"learning_rate": 0.00019621989407578425,
"loss": 4.6938,
"step": 10625
},
{
"epoch": 4.775381850853549,
"grad_norm": 3.40625,
"learning_rate": 0.00019611805394105294,
"loss": 4.6451,
"step": 10630
},
{
"epoch": 4.777628032345014,
"grad_norm": 3.390625,
"learning_rate": 0.00019601619510065108,
"loss": 4.6984,
"step": 10635
},
{
"epoch": 4.779874213836478,
"grad_norm": 3.421875,
"learning_rate": 0.00019591431761580813,
"loss": 4.7172,
"step": 10640
},
{
"epoch": 4.782120395327943,
"grad_norm": 3.453125,
"learning_rate": 0.00019581242154776454,
"loss": 4.6923,
"step": 10645
},
{
"epoch": 4.784366576819407,
"grad_norm": 3.671875,
"learning_rate": 0.00019571050695777208,
"loss": 4.6544,
"step": 10650
},
{
"epoch": 4.786612758310872,
"grad_norm": 3.359375,
"learning_rate": 0.00019560857390709362,
"loss": 4.6593,
"step": 10655
},
{
"epoch": 4.788858939802336,
"grad_norm": 3.375,
"learning_rate": 0.0001955066224570031,
"loss": 4.5734,
"step": 10660
},
{
"epoch": 4.791105121293801,
"grad_norm": 3.671875,
"learning_rate": 0.0001954046526687855,
"loss": 4.6368,
"step": 10665
},
{
"epoch": 4.793351302785265,
"grad_norm": 3.53125,
"learning_rate": 0.00019530266460373685,
"loss": 4.6785,
"step": 10670
},
{
"epoch": 4.79559748427673,
"grad_norm": 3.546875,
"learning_rate": 0.00019520065832316419,
"loss": 4.6739,
"step": 10675
},
{
"epoch": 4.797843665768194,
"grad_norm": 3.234375,
"learning_rate": 0.00019509863388838552,
"loss": 4.6251,
"step": 10680
},
{
"epoch": 4.800089847259659,
"grad_norm": 3.390625,
"learning_rate": 0.00019499659136072966,
"loss": 4.7282,
"step": 10685
},
{
"epoch": 4.802336028751123,
"grad_norm": 3.34375,
"learning_rate": 0.0001948945308015364,
"loss": 4.7018,
"step": 10690
},
{
"epoch": 4.804582210242588,
"grad_norm": 3.265625,
"learning_rate": 0.00019479245227215639,
"loss": 4.6882,
"step": 10695
},
{
"epoch": 4.806828391734052,
"grad_norm": 3.265625,
"learning_rate": 0.00019469035583395087,
"loss": 4.6144,
"step": 10700
},
{
"epoch": 4.809074573225517,
"grad_norm": 3.921875,
"learning_rate": 0.00019458824154829215,
"loss": 4.6726,
"step": 10705
},
{
"epoch": 4.811320754716981,
"grad_norm": 3.5,
"learning_rate": 0.00019448610947656313,
"loss": 4.7331,
"step": 10710
},
{
"epoch": 4.813566936208446,
"grad_norm": 3.453125,
"learning_rate": 0.0001943839596801573,
"loss": 4.6982,
"step": 10715
},
{
"epoch": 4.815813117699911,
"grad_norm": 3.359375,
"learning_rate": 0.00019428179222047892,
"loss": 4.6745,
"step": 10720
},
{
"epoch": 4.818059299191375,
"grad_norm": 3.40625,
"learning_rate": 0.00019417960715894294,
"loss": 4.6347,
"step": 10725
},
{
"epoch": 4.820305480682839,
"grad_norm": 3.359375,
"learning_rate": 0.00019407740455697466,
"loss": 4.6806,
"step": 10730
},
{
"epoch": 4.822551662174304,
"grad_norm": 3.515625,
"learning_rate": 0.0001939751844760102,
"loss": 4.6906,
"step": 10735
},
{
"epoch": 4.824797843665769,
"grad_norm": 3.375,
"learning_rate": 0.00019387294697749592,
"loss": 4.6839,
"step": 10740
},
{
"epoch": 4.827044025157233,
"grad_norm": 3.234375,
"learning_rate": 0.0001937706921228889,
"loss": 4.6198,
"step": 10745
},
{
"epoch": 4.829290206648698,
"grad_norm": 3.421875,
"learning_rate": 0.00019366841997365647,
"loss": 4.6804,
"step": 10750
},
{
"epoch": 4.831536388140162,
"grad_norm": 3.578125,
"learning_rate": 0.00019356613059127634,
"loss": 4.6341,
"step": 10755
},
{
"epoch": 4.833782569631627,
"grad_norm": 3.28125,
"learning_rate": 0.00019346382403723683,
"loss": 4.6665,
"step": 10760
},
{
"epoch": 4.836028751123091,
"grad_norm": 3.359375,
"learning_rate": 0.00019336150037303624,
"loss": 4.6748,
"step": 10765
},
{
"epoch": 4.8382749326145555,
"grad_norm": 3.21875,
"learning_rate": 0.00019325915966018344,
"loss": 4.6332,
"step": 10770
},
{
"epoch": 4.84052111410602,
"grad_norm": 3.59375,
"learning_rate": 0.0001931568019601974,
"loss": 4.7308,
"step": 10775
},
{
"epoch": 4.8427672955974845,
"grad_norm": 3.28125,
"learning_rate": 0.00019305442733460733,
"loss": 4.6666,
"step": 10780
},
{
"epoch": 4.845013477088949,
"grad_norm": 3.359375,
"learning_rate": 0.00019295203584495258,
"loss": 4.7263,
"step": 10785
},
{
"epoch": 4.8472596585804135,
"grad_norm": 3.328125,
"learning_rate": 0.00019284962755278273,
"loss": 4.745,
"step": 10790
},
{
"epoch": 4.849505840071878,
"grad_norm": 3.625,
"learning_rate": 0.0001927472025196574,
"loss": 4.6752,
"step": 10795
},
{
"epoch": 4.8517520215633425,
"grad_norm": 3.140625,
"learning_rate": 0.00019264476080714627,
"loss": 4.6648,
"step": 10800
},
{
"epoch": 4.853998203054807,
"grad_norm": 3.25,
"learning_rate": 0.000192542302476829,
"loss": 4.6762,
"step": 10805
},
{
"epoch": 4.8562443845462715,
"grad_norm": 3.40625,
"learning_rate": 0.00019243982759029543,
"loss": 4.7266,
"step": 10810
},
{
"epoch": 4.8584905660377355,
"grad_norm": 3.484375,
"learning_rate": 0.00019233733620914508,
"loss": 4.6201,
"step": 10815
},
{
"epoch": 4.8607367475292005,
"grad_norm": 3.3125,
"learning_rate": 0.0001922348283949876,
"loss": 4.6364,
"step": 10820
},
{
"epoch": 4.8629829290206645,
"grad_norm": 3.546875,
"learning_rate": 0.0001921323042094424,
"loss": 4.6883,
"step": 10825
},
{
"epoch": 4.8652291105121295,
"grad_norm": 3.546875,
"learning_rate": 0.00019202976371413883,
"loss": 4.666,
"step": 10830
},
{
"epoch": 4.8674752920035935,
"grad_norm": 3.1875,
"learning_rate": 0.00019192720697071595,
"loss": 4.7158,
"step": 10835
},
{
"epoch": 4.8697214734950585,
"grad_norm": 3.90625,
"learning_rate": 0.0001918246340408226,
"loss": 4.7058,
"step": 10840
},
{
"epoch": 4.871967654986523,
"grad_norm": 3.484375,
"learning_rate": 0.00019172204498611733,
"loss": 4.6477,
"step": 10845
},
{
"epoch": 4.8742138364779874,
"grad_norm": 3.359375,
"learning_rate": 0.0001916194398682686,
"loss": 4.6853,
"step": 10850
},
{
"epoch": 4.8764600179694515,
"grad_norm": 3.265625,
"learning_rate": 0.0001915168187489542,
"loss": 4.6215,
"step": 10855
},
{
"epoch": 4.878706199460916,
"grad_norm": 3.640625,
"learning_rate": 0.0001914141816898617,
"loss": 4.6465,
"step": 10860
},
{
"epoch": 4.880952380952381,
"grad_norm": 3.234375,
"learning_rate": 0.00019131152875268828,
"loss": 4.6278,
"step": 10865
},
{
"epoch": 4.883198562443845,
"grad_norm": 3.953125,
"learning_rate": 0.00019120885999914067,
"loss": 4.6461,
"step": 10870
},
{
"epoch": 4.8854447439353095,
"grad_norm": 3.421875,
"learning_rate": 0.00019110617549093493,
"loss": 4.6566,
"step": 10875
},
{
"epoch": 4.887690925426774,
"grad_norm": 3.359375,
"learning_rate": 0.00019100347528979691,
"loss": 4.6575,
"step": 10880
},
{
"epoch": 4.889937106918239,
"grad_norm": 3.46875,
"learning_rate": 0.00019090075945746152,
"loss": 4.6042,
"step": 10885
},
{
"epoch": 4.892183288409703,
"grad_norm": 3.4375,
"learning_rate": 0.00019079802805567342,
"loss": 4.6088,
"step": 10890
},
{
"epoch": 4.894429469901168,
"grad_norm": 3.390625,
"learning_rate": 0.00019069528114618636,
"loss": 4.6826,
"step": 10895
},
{
"epoch": 4.896675651392632,
"grad_norm": 3.84375,
"learning_rate": 0.00019059251879076358,
"loss": 4.6487,
"step": 10900
},
{
"epoch": 4.898921832884097,
"grad_norm": 3.328125,
"learning_rate": 0.00019048974105117744,
"loss": 4.6943,
"step": 10905
},
{
"epoch": 4.901168014375561,
"grad_norm": 3.796875,
"learning_rate": 0.00019038694798920975,
"loss": 4.7191,
"step": 10910
},
{
"epoch": 4.903414195867026,
"grad_norm": 3.25,
"learning_rate": 0.0001902841396666514,
"loss": 4.6203,
"step": 10915
},
{
"epoch": 4.90566037735849,
"grad_norm": 3.6875,
"learning_rate": 0.00019018131614530244,
"loss": 4.5897,
"step": 10920
},
{
"epoch": 4.907906558849955,
"grad_norm": 3.3125,
"learning_rate": 0.00019007847748697215,
"loss": 4.6385,
"step": 10925
},
{
"epoch": 4.910152740341419,
"grad_norm": 3.453125,
"learning_rate": 0.00018997562375347882,
"loss": 4.6484,
"step": 10930
},
{
"epoch": 4.912398921832884,
"grad_norm": 3.3125,
"learning_rate": 0.00018987275500664987,
"loss": 4.6698,
"step": 10935
},
{
"epoch": 4.914645103324348,
"grad_norm": 3.34375,
"learning_rate": 0.00018976987130832172,
"loss": 4.6663,
"step": 10940
},
{
"epoch": 4.916891284815813,
"grad_norm": 3.4375,
"learning_rate": 0.00018966697272033975,
"loss": 4.6668,
"step": 10945
},
{
"epoch": 4.919137466307277,
"grad_norm": 3.28125,
"learning_rate": 0.0001895640593045583,
"loss": 4.695,
"step": 10950
},
{
"epoch": 4.921383647798742,
"grad_norm": 3.171875,
"learning_rate": 0.00018946113112284073,
"loss": 4.575,
"step": 10955
},
{
"epoch": 4.923629829290206,
"grad_norm": 3.40625,
"learning_rate": 0.000189358188237059,
"loss": 4.5769,
"step": 10960
},
{
"epoch": 4.925876010781671,
"grad_norm": 3.5,
"learning_rate": 0.00018925523070909426,
"loss": 4.7168,
"step": 10965
},
{
"epoch": 4.928122192273135,
"grad_norm": 3.421875,
"learning_rate": 0.0001891522586008362,
"loss": 4.6358,
"step": 10970
},
{
"epoch": 4.9303683737646,
"grad_norm": 3.25,
"learning_rate": 0.0001890492719741834,
"loss": 4.6406,
"step": 10975
},
{
"epoch": 4.932614555256064,
"grad_norm": 3.484375,
"learning_rate": 0.00018894627089104316,
"loss": 4.6625,
"step": 10980
},
{
"epoch": 4.934860736747529,
"grad_norm": 3.28125,
"learning_rate": 0.00018884325541333142,
"loss": 4.6533,
"step": 10985
},
{
"epoch": 4.937106918238994,
"grad_norm": 3.296875,
"learning_rate": 0.00018874022560297276,
"loss": 4.6776,
"step": 10990
},
{
"epoch": 4.939353099730458,
"grad_norm": 3.515625,
"learning_rate": 0.00018863718152190045,
"loss": 4.7341,
"step": 10995
},
{
"epoch": 4.941599281221922,
"grad_norm": 3.40625,
"learning_rate": 0.00018853412323205634,
"loss": 4.6744,
"step": 11000
},
{
"epoch": 4.941599281221922,
"eval_loss": 4.825163841247559,
"eval_runtime": 16.1797,
"eval_samples_per_second": 1916.78,
"eval_steps_per_second": 239.621,
"step": 11000
},
{
"epoch": 4.943845462713387,
"grad_norm": 3.328125,
"learning_rate": 0.00018843105079539068,
"loss": 4.6552,
"step": 11005
},
{
"epoch": 4.946091644204852,
"grad_norm": 3.546875,
"learning_rate": 0.0001883279642738624,
"loss": 4.6436,
"step": 11010
},
{
"epoch": 4.948337825696316,
"grad_norm": 3.21875,
"learning_rate": 0.00018822486372943885,
"loss": 4.6329,
"step": 11015
},
{
"epoch": 4.950584007187781,
"grad_norm": 3.203125,
"learning_rate": 0.00018812174922409566,
"loss": 4.6874,
"step": 11020
},
{
"epoch": 4.952830188679245,
"grad_norm": 3.34375,
"learning_rate": 0.00018801862081981713,
"loss": 4.6695,
"step": 11025
},
{
"epoch": 4.95507637017071,
"grad_norm": 3.375,
"learning_rate": 0.00018791547857859565,
"loss": 4.703,
"step": 11030
},
{
"epoch": 4.957322551662174,
"grad_norm": 3.6875,
"learning_rate": 0.00018781232256243212,
"loss": 4.6931,
"step": 11035
},
{
"epoch": 4.959568733153639,
"grad_norm": 3.359375,
"learning_rate": 0.00018770915283333555,
"loss": 4.6586,
"step": 11040
},
{
"epoch": 4.961814914645103,
"grad_norm": 3.359375,
"learning_rate": 0.0001876059694533233,
"loss": 4.7529,
"step": 11045
},
{
"epoch": 4.964061096136568,
"grad_norm": 3.0,
"learning_rate": 0.00018750277248442095,
"loss": 4.7353,
"step": 11050
},
{
"epoch": 4.966307277628032,
"grad_norm": 3.375,
"learning_rate": 0.00018739956198866222,
"loss": 4.6483,
"step": 11055
},
{
"epoch": 4.968553459119497,
"grad_norm": 3.328125,
"learning_rate": 0.00018729633802808894,
"loss": 4.6358,
"step": 11060
},
{
"epoch": 4.970799640610961,
"grad_norm": 3.453125,
"learning_rate": 0.0001871931006647511,
"loss": 4.6772,
"step": 11065
},
{
"epoch": 4.973045822102426,
"grad_norm": 3.421875,
"learning_rate": 0.00018708984996070662,
"loss": 4.6191,
"step": 11070
},
{
"epoch": 4.97529200359389,
"grad_norm": 3.375,
"learning_rate": 0.0001869865859780215,
"loss": 4.6018,
"step": 11075
},
{
"epoch": 4.977538185085355,
"grad_norm": 3.21875,
"learning_rate": 0.0001868833087787698,
"loss": 4.6706,
"step": 11080
},
{
"epoch": 4.979784366576819,
"grad_norm": 3.515625,
"learning_rate": 0.00018678001842503347,
"loss": 4.6274,
"step": 11085
},
{
"epoch": 4.982030548068284,
"grad_norm": 3.484375,
"learning_rate": 0.0001866767149789023,
"loss": 4.7188,
"step": 11090
},
{
"epoch": 4.984276729559748,
"grad_norm": 3.453125,
"learning_rate": 0.00018657339850247407,
"loss": 4.6799,
"step": 11095
},
{
"epoch": 4.986522911051213,
"grad_norm": 4.9375,
"learning_rate": 0.0001864700690578543,
"loss": 4.6977,
"step": 11100
},
{
"epoch": 4.988769092542677,
"grad_norm": 3.34375,
"learning_rate": 0.00018636672670715632,
"loss": 4.7117,
"step": 11105
},
{
"epoch": 4.991015274034142,
"grad_norm": 3.25,
"learning_rate": 0.0001862633715125013,
"loss": 4.6384,
"step": 11110
},
{
"epoch": 4.993261455525607,
"grad_norm": 3.25,
"learning_rate": 0.00018616000353601804,
"loss": 4.6931,
"step": 11115
},
{
"epoch": 4.995507637017071,
"grad_norm": 3.421875,
"learning_rate": 0.00018605662283984305,
"loss": 4.702,
"step": 11120
},
{
"epoch": 4.997753818508535,
"grad_norm": 3.46875,
"learning_rate": 0.00018595322948612047,
"loss": 4.7222,
"step": 11125
},
{
"epoch": 5.0,
"grad_norm": 8.5,
"learning_rate": 0.00018584982353700208,
"loss": 4.6256,
"step": 11130
},
{
"epoch": 5.002246181491465,
"grad_norm": 3.5,
"learning_rate": 0.00018574640505464722,
"loss": 4.5416,
"step": 11135
},
{
"epoch": 5.004492362982929,
"grad_norm": 3.328125,
"learning_rate": 0.00018564297410122272,
"loss": 4.5626,
"step": 11140
},
{
"epoch": 5.006738544474394,
"grad_norm": 3.6875,
"learning_rate": 0.00018553953073890305,
"loss": 4.6248,
"step": 11145
},
{
"epoch": 5.008984725965858,
"grad_norm": 3.375,
"learning_rate": 0.00018543607502986996,
"loss": 4.6286,
"step": 11150
},
{
"epoch": 5.011230907457323,
"grad_norm": 3.84375,
"learning_rate": 0.00018533260703631265,
"loss": 4.5629,
"step": 11155
},
{
"epoch": 5.013477088948787,
"grad_norm": 3.375,
"learning_rate": 0.00018522912682042786,
"loss": 4.5586,
"step": 11160
},
{
"epoch": 5.015723270440252,
"grad_norm": 3.765625,
"learning_rate": 0.0001851256344444195,
"loss": 4.6115,
"step": 11165
},
{
"epoch": 5.017969451931716,
"grad_norm": 3.484375,
"learning_rate": 0.00018502212997049893,
"loss": 4.5836,
"step": 11170
},
{
"epoch": 5.020215633423181,
"grad_norm": 3.484375,
"learning_rate": 0.00018491861346088464,
"loss": 4.538,
"step": 11175
},
{
"epoch": 5.022461814914645,
"grad_norm": 3.53125,
"learning_rate": 0.00018481508497780245,
"loss": 4.5572,
"step": 11180
},
{
"epoch": 5.02470799640611,
"grad_norm": 3.34375,
"learning_rate": 0.00018471154458348538,
"loss": 4.5747,
"step": 11185
},
{
"epoch": 5.026954177897574,
"grad_norm": 3.546875,
"learning_rate": 0.00018460799234017354,
"loss": 4.5392,
"step": 11190
},
{
"epoch": 5.029200359389039,
"grad_norm": 3.53125,
"learning_rate": 0.0001845044283101142,
"loss": 4.6448,
"step": 11195
},
{
"epoch": 5.031446540880503,
"grad_norm": 3.578125,
"learning_rate": 0.00018440085255556183,
"loss": 4.5819,
"step": 11200
},
{
"epoch": 5.033692722371968,
"grad_norm": 3.34375,
"learning_rate": 0.00018429726513877773,
"loss": 4.5667,
"step": 11205
},
{
"epoch": 5.035938903863432,
"grad_norm": 3.46875,
"learning_rate": 0.00018419366612203037,
"loss": 4.5275,
"step": 11210
},
{
"epoch": 5.038185085354897,
"grad_norm": 3.546875,
"learning_rate": 0.00018409005556759513,
"loss": 4.6251,
"step": 11215
},
{
"epoch": 5.040431266846361,
"grad_norm": 3.4375,
"learning_rate": 0.0001839864335377543,
"loss": 4.5541,
"step": 11220
},
{
"epoch": 5.042677448337826,
"grad_norm": 3.3125,
"learning_rate": 0.00018388280009479718,
"loss": 4.4648,
"step": 11225
},
{
"epoch": 5.04492362982929,
"grad_norm": 3.703125,
"learning_rate": 0.00018377915530101984,
"loss": 4.5523,
"step": 11230
},
{
"epoch": 5.047169811320755,
"grad_norm": 3.265625,
"learning_rate": 0.00018367549921872512,
"loss": 4.5651,
"step": 11235
},
{
"epoch": 5.049415992812219,
"grad_norm": 3.3125,
"learning_rate": 0.00018357183191022283,
"loss": 4.6121,
"step": 11240
},
{
"epoch": 5.051662174303684,
"grad_norm": 3.421875,
"learning_rate": 0.00018346815343782936,
"loss": 4.5563,
"step": 11245
},
{
"epoch": 5.053908355795148,
"grad_norm": 3.46875,
"learning_rate": 0.00018336446386386782,
"loss": 4.6189,
"step": 11250
},
{
"epoch": 5.056154537286613,
"grad_norm": 3.4375,
"learning_rate": 0.00018326076325066808,
"loss": 4.5833,
"step": 11255
},
{
"epoch": 5.058400718778077,
"grad_norm": 3.453125,
"learning_rate": 0.00018315705166056667,
"loss": 4.5648,
"step": 11260
},
{
"epoch": 5.060646900269542,
"grad_norm": 3.328125,
"learning_rate": 0.0001830533291559066,
"loss": 4.5622,
"step": 11265
},
{
"epoch": 5.062893081761007,
"grad_norm": 3.359375,
"learning_rate": 0.00018294959579903742,
"loss": 4.5399,
"step": 11270
},
{
"epoch": 5.065139263252471,
"grad_norm": 3.578125,
"learning_rate": 0.0001828458516523154,
"loss": 4.6647,
"step": 11275
},
{
"epoch": 5.067385444743936,
"grad_norm": 3.703125,
"learning_rate": 0.0001827420967781031,
"loss": 4.5279,
"step": 11280
},
{
"epoch": 5.0696316262354,
"grad_norm": 3.625,
"learning_rate": 0.00018263833123876962,
"loss": 4.6492,
"step": 11285
},
{
"epoch": 5.071877807726865,
"grad_norm": 3.15625,
"learning_rate": 0.00018253455509669047,
"loss": 4.632,
"step": 11290
},
{
"epoch": 5.074123989218329,
"grad_norm": 3.609375,
"learning_rate": 0.00018243076841424754,
"loss": 4.5971,
"step": 11295
},
{
"epoch": 5.076370170709794,
"grad_norm": 3.515625,
"learning_rate": 0.00018232697125382903,
"loss": 4.5777,
"step": 11300
},
{
"epoch": 5.078616352201258,
"grad_norm": 3.28125,
"learning_rate": 0.0001822231636778293,
"loss": 4.5621,
"step": 11305
},
{
"epoch": 5.080862533692723,
"grad_norm": 3.71875,
"learning_rate": 0.0001821193457486493,
"loss": 4.584,
"step": 11310
},
{
"epoch": 5.083108715184187,
"grad_norm": 3.4375,
"learning_rate": 0.00018201551752869595,
"loss": 4.5465,
"step": 11315
},
{
"epoch": 5.085354896675652,
"grad_norm": 3.4375,
"learning_rate": 0.0001819116790803824,
"loss": 4.6135,
"step": 11320
},
{
"epoch": 5.087601078167116,
"grad_norm": 3.375,
"learning_rate": 0.00018180783046612797,
"loss": 4.5585,
"step": 11325
},
{
"epoch": 5.089847259658581,
"grad_norm": 3.421875,
"learning_rate": 0.00018170397174835812,
"loss": 4.6003,
"step": 11330
},
{
"epoch": 5.092093441150045,
"grad_norm": 3.421875,
"learning_rate": 0.00018160010298950432,
"loss": 4.5887,
"step": 11335
},
{
"epoch": 5.09433962264151,
"grad_norm": 3.71875,
"learning_rate": 0.00018149622425200419,
"loss": 4.6328,
"step": 11340
},
{
"epoch": 5.096585804132974,
"grad_norm": 3.421875,
"learning_rate": 0.00018139233559830118,
"loss": 4.554,
"step": 11345
},
{
"epoch": 5.098831985624439,
"grad_norm": 3.40625,
"learning_rate": 0.00018128843709084484,
"loss": 4.5458,
"step": 11350
},
{
"epoch": 5.101078167115903,
"grad_norm": 3.375,
"learning_rate": 0.00018118452879209055,
"loss": 4.5481,
"step": 11355
},
{
"epoch": 5.103324348607368,
"grad_norm": 3.609375,
"learning_rate": 0.0001810806107644997,
"loss": 4.5575,
"step": 11360
},
{
"epoch": 5.105570530098832,
"grad_norm": 3.5,
"learning_rate": 0.00018097668307053935,
"loss": 4.6343,
"step": 11365
},
{
"epoch": 5.107816711590297,
"grad_norm": 3.515625,
"learning_rate": 0.00018087274577268246,
"loss": 4.5504,
"step": 11370
},
{
"epoch": 5.110062893081761,
"grad_norm": 3.359375,
"learning_rate": 0.00018076879893340794,
"loss": 4.6276,
"step": 11375
},
{
"epoch": 5.112309074573226,
"grad_norm": 3.515625,
"learning_rate": 0.0001806648426152001,
"loss": 4.5617,
"step": 11380
},
{
"epoch": 5.11455525606469,
"grad_norm": 3.46875,
"learning_rate": 0.00018056087688054918,
"loss": 4.617,
"step": 11385
},
{
"epoch": 5.116801437556155,
"grad_norm": 3.609375,
"learning_rate": 0.000180456901791951,
"loss": 4.5855,
"step": 11390
},
{
"epoch": 5.119047619047619,
"grad_norm": 3.453125,
"learning_rate": 0.000180352917411907,
"loss": 4.6229,
"step": 11395
},
{
"epoch": 5.121293800539084,
"grad_norm": 3.53125,
"learning_rate": 0.00018024892380292425,
"loss": 4.576,
"step": 11400
},
{
"epoch": 5.1235399820305485,
"grad_norm": 3.21875,
"learning_rate": 0.00018014492102751535,
"loss": 4.5866,
"step": 11405
},
{
"epoch": 5.1257861635220126,
"grad_norm": 3.484375,
"learning_rate": 0.00018004090914819837,
"loss": 4.6268,
"step": 11410
},
{
"epoch": 5.1280323450134775,
"grad_norm": 3.375,
"learning_rate": 0.00017993688822749696,
"loss": 4.5861,
"step": 11415
},
{
"epoch": 5.1302785265049415,
"grad_norm": 3.453125,
"learning_rate": 0.00017983285832794,
"loss": 4.6557,
"step": 11420
},
{
"epoch": 5.1325247079964065,
"grad_norm": 3.40625,
"learning_rate": 0.00017972881951206193,
"loss": 4.5783,
"step": 11425
},
{
"epoch": 5.1347708894878705,
"grad_norm": 3.6875,
"learning_rate": 0.00017962477184240263,
"loss": 4.5894,
"step": 11430
},
{
"epoch": 5.1370170709793355,
"grad_norm": 3.515625,
"learning_rate": 0.0001795207153815071,
"loss": 4.5467,
"step": 11435
},
{
"epoch": 5.1392632524707995,
"grad_norm": 3.609375,
"learning_rate": 0.0001794166501919257,
"loss": 4.6068,
"step": 11440
},
{
"epoch": 5.1415094339622645,
"grad_norm": 3.453125,
"learning_rate": 0.00017931257633621404,
"loss": 4.5561,
"step": 11445
},
{
"epoch": 5.1437556154537285,
"grad_norm": 3.6875,
"learning_rate": 0.00017920849387693307,
"loss": 4.507,
"step": 11450
},
{
"epoch": 5.146001796945193,
"grad_norm": 3.609375,
"learning_rate": 0.0001791044028766486,
"loss": 4.6283,
"step": 11455
},
{
"epoch": 5.1482479784366575,
"grad_norm": 3.59375,
"learning_rate": 0.00017900030339793193,
"loss": 4.5579,
"step": 11460
},
{
"epoch": 5.150494159928122,
"grad_norm": 3.578125,
"learning_rate": 0.00017889619550335925,
"loss": 4.6677,
"step": 11465
},
{
"epoch": 5.1527403414195865,
"grad_norm": 3.546875,
"learning_rate": 0.00017879207925551179,
"loss": 4.6207,
"step": 11470
},
{
"epoch": 5.154986522911051,
"grad_norm": 3.578125,
"learning_rate": 0.00017868795471697588,
"loss": 4.5903,
"step": 11475
},
{
"epoch": 5.1572327044025155,
"grad_norm": 3.65625,
"learning_rate": 0.00017858382195034284,
"loss": 4.5039,
"step": 11480
},
{
"epoch": 5.15947888589398,
"grad_norm": 3.5,
"learning_rate": 0.0001784796810182089,
"loss": 4.61,
"step": 11485
},
{
"epoch": 5.1617250673854445,
"grad_norm": 3.90625,
"learning_rate": 0.00017837553198317524,
"loss": 4.6192,
"step": 11490
},
{
"epoch": 5.163971248876909,
"grad_norm": 3.609375,
"learning_rate": 0.00017827137490784788,
"loss": 4.6387,
"step": 11495
},
{
"epoch": 5.166217430368373,
"grad_norm": 3.328125,
"learning_rate": 0.0001781672098548376,
"loss": 4.5613,
"step": 11500
},
{
"epoch": 5.168463611859838,
"grad_norm": 3.34375,
"learning_rate": 0.00017806303688676012,
"loss": 4.6324,
"step": 11505
},
{
"epoch": 5.170709793351302,
"grad_norm": 3.53125,
"learning_rate": 0.0001779588560662358,
"loss": 4.6147,
"step": 11510
},
{
"epoch": 5.172955974842767,
"grad_norm": 3.546875,
"learning_rate": 0.00017785466745588984,
"loss": 4.6576,
"step": 11515
},
{
"epoch": 5.175202156334231,
"grad_norm": 3.625,
"learning_rate": 0.000177750471118352,
"loss": 4.5248,
"step": 11520
},
{
"epoch": 5.177448337825696,
"grad_norm": 3.375,
"learning_rate": 0.00017764626711625668,
"loss": 4.5522,
"step": 11525
},
{
"epoch": 5.17969451931716,
"grad_norm": 3.453125,
"learning_rate": 0.0001775420555122431,
"loss": 4.5719,
"step": 11530
},
{
"epoch": 5.181940700808625,
"grad_norm": 3.40625,
"learning_rate": 0.00017743783636895474,
"loss": 4.593,
"step": 11535
},
{
"epoch": 5.184186882300089,
"grad_norm": 3.5,
"learning_rate": 0.00017733360974903984,
"loss": 4.5872,
"step": 11540
},
{
"epoch": 5.186433063791554,
"grad_norm": 3.40625,
"learning_rate": 0.000177229375715151,
"loss": 4.5696,
"step": 11545
},
{
"epoch": 5.188679245283019,
"grad_norm": 3.65625,
"learning_rate": 0.00017712513432994542,
"loss": 4.5896,
"step": 11550
},
{
"epoch": 5.190925426774483,
"grad_norm": 3.359375,
"learning_rate": 0.00017702088565608459,
"loss": 4.6056,
"step": 11555
},
{
"epoch": 5.193171608265948,
"grad_norm": 3.546875,
"learning_rate": 0.00017691662975623435,
"loss": 4.5833,
"step": 11560
},
{
"epoch": 5.195417789757412,
"grad_norm": 3.65625,
"learning_rate": 0.0001768123666930651,
"loss": 4.5589,
"step": 11565
},
{
"epoch": 5.197663971248877,
"grad_norm": 3.5,
"learning_rate": 0.00017670809652925128,
"loss": 4.5404,
"step": 11570
},
{
"epoch": 5.199910152740341,
"grad_norm": 3.453125,
"learning_rate": 0.0001766038193274718,
"loss": 4.5204,
"step": 11575
},
{
"epoch": 5.202156334231806,
"grad_norm": 3.515625,
"learning_rate": 0.00017649953515040976,
"loss": 4.6046,
"step": 11580
},
{
"epoch": 5.20440251572327,
"grad_norm": 3.53125,
"learning_rate": 0.00017639524406075233,
"loss": 4.6078,
"step": 11585
},
{
"epoch": 5.206648697214735,
"grad_norm": 3.515625,
"learning_rate": 0.00017629094612119098,
"loss": 4.5427,
"step": 11590
},
{
"epoch": 5.208894878706199,
"grad_norm": 3.578125,
"learning_rate": 0.00017618664139442116,
"loss": 4.6234,
"step": 11595
},
{
"epoch": 5.211141060197664,
"grad_norm": 3.328125,
"learning_rate": 0.00017608232994314254,
"loss": 4.6195,
"step": 11600
},
{
"epoch": 5.213387241689128,
"grad_norm": 3.625,
"learning_rate": 0.0001759780118300588,
"loss": 4.5626,
"step": 11605
},
{
"epoch": 5.215633423180593,
"grad_norm": 3.390625,
"learning_rate": 0.00017587368711787754,
"loss": 4.5641,
"step": 11610
},
{
"epoch": 5.217879604672057,
"grad_norm": 3.59375,
"learning_rate": 0.00017576935586931046,
"loss": 4.6135,
"step": 11615
},
{
"epoch": 5.220125786163522,
"grad_norm": 3.609375,
"learning_rate": 0.00017566501814707304,
"loss": 4.6373,
"step": 11620
},
{
"epoch": 5.222371967654986,
"grad_norm": 3.453125,
"learning_rate": 0.00017556067401388467,
"loss": 4.6505,
"step": 11625
},
{
"epoch": 5.224618149146451,
"grad_norm": 3.625,
"learning_rate": 0.00017545632353246882,
"loss": 4.4975,
"step": 11630
},
{
"epoch": 5.226864330637915,
"grad_norm": 3.625,
"learning_rate": 0.00017535196676555248,
"loss": 4.6227,
"step": 11635
},
{
"epoch": 5.22911051212938,
"grad_norm": 3.40625,
"learning_rate": 0.00017524760377586655,
"loss": 4.545,
"step": 11640
},
{
"epoch": 5.231356693620844,
"grad_norm": 3.609375,
"learning_rate": 0.0001751432346261457,
"loss": 4.6033,
"step": 11645
},
{
"epoch": 5.233602875112309,
"grad_norm": 3.515625,
"learning_rate": 0.00017503885937912824,
"loss": 4.5615,
"step": 11650
},
{
"epoch": 5.235849056603773,
"grad_norm": 3.5625,
"learning_rate": 0.00017493447809755614,
"loss": 4.5551,
"step": 11655
},
{
"epoch": 5.238095238095238,
"grad_norm": 3.625,
"learning_rate": 0.0001748300908441751,
"loss": 4.5172,
"step": 11660
},
{
"epoch": 5.240341419586702,
"grad_norm": 3.484375,
"learning_rate": 0.00017472569768173436,
"loss": 4.5991,
"step": 11665
},
{
"epoch": 5.242587601078167,
"grad_norm": 3.84375,
"learning_rate": 0.00017462129867298656,
"loss": 4.6592,
"step": 11670
},
{
"epoch": 5.244833782569632,
"grad_norm": 4.03125,
"learning_rate": 0.00017451689388068813,
"loss": 4.596,
"step": 11675
},
{
"epoch": 5.247079964061096,
"grad_norm": 3.921875,
"learning_rate": 0.00017441248336759872,
"loss": 4.5997,
"step": 11680
},
{
"epoch": 5.249326145552561,
"grad_norm": 3.625,
"learning_rate": 0.00017430806719648153,
"loss": 4.6422,
"step": 11685
},
{
"epoch": 5.251572327044025,
"grad_norm": 3.6875,
"learning_rate": 0.00017420364543010327,
"loss": 4.6005,
"step": 11690
},
{
"epoch": 5.25381850853549,
"grad_norm": 3.53125,
"learning_rate": 0.0001740992181312339,
"loss": 4.5657,
"step": 11695
},
{
"epoch": 5.256064690026954,
"grad_norm": 4.0625,
"learning_rate": 0.0001739947853626466,
"loss": 4.5688,
"step": 11700
},
{
"epoch": 5.258310871518419,
"grad_norm": 3.4375,
"learning_rate": 0.00017389034718711795,
"loss": 4.5535,
"step": 11705
},
{
"epoch": 5.260557053009883,
"grad_norm": 3.84375,
"learning_rate": 0.00017378590366742784,
"loss": 4.6458,
"step": 11710
},
{
"epoch": 5.262803234501348,
"grad_norm": 3.671875,
"learning_rate": 0.00017368145486635933,
"loss": 4.617,
"step": 11715
},
{
"epoch": 5.265049415992812,
"grad_norm": 3.890625,
"learning_rate": 0.00017357700084669862,
"loss": 4.5719,
"step": 11720
},
{
"epoch": 5.267295597484277,
"grad_norm": 3.515625,
"learning_rate": 0.000173472541671235,
"loss": 4.5821,
"step": 11725
},
{
"epoch": 5.269541778975741,
"grad_norm": 3.359375,
"learning_rate": 0.00017336807740276098,
"loss": 4.6096,
"step": 11730
},
{
"epoch": 5.271787960467206,
"grad_norm": 3.734375,
"learning_rate": 0.00017326360810407214,
"loss": 4.5613,
"step": 11735
},
{
"epoch": 5.27403414195867,
"grad_norm": 3.625,
"learning_rate": 0.00017315913383796685,
"loss": 4.6638,
"step": 11740
},
{
"epoch": 5.276280323450135,
"grad_norm": 3.5625,
"learning_rate": 0.00017305465466724672,
"loss": 4.6461,
"step": 11745
},
{
"epoch": 5.278526504941599,
"grad_norm": 3.5,
"learning_rate": 0.00017295017065471627,
"loss": 4.6047,
"step": 11750
},
{
"epoch": 5.280772686433064,
"grad_norm": 3.359375,
"learning_rate": 0.00017284568186318286,
"loss": 4.5866,
"step": 11755
},
{
"epoch": 5.283018867924528,
"grad_norm": 3.546875,
"learning_rate": 0.00017274118835545668,
"loss": 4.5601,
"step": 11760
},
{
"epoch": 5.285265049415993,
"grad_norm": 3.734375,
"learning_rate": 0.0001726366901943509,
"loss": 4.5449,
"step": 11765
},
{
"epoch": 5.287511230907457,
"grad_norm": 3.59375,
"learning_rate": 0.00017253218744268137,
"loss": 4.6337,
"step": 11770
},
{
"epoch": 5.289757412398922,
"grad_norm": 3.671875,
"learning_rate": 0.0001724276801632667,
"loss": 4.6278,
"step": 11775
},
{
"epoch": 5.292003593890386,
"grad_norm": 3.546875,
"learning_rate": 0.00017232316841892832,
"loss": 4.626,
"step": 11780
},
{
"epoch": 5.294249775381851,
"grad_norm": 3.484375,
"learning_rate": 0.00017221865227249028,
"loss": 4.6026,
"step": 11785
},
{
"epoch": 5.296495956873315,
"grad_norm": 3.359375,
"learning_rate": 0.00017211413178677923,
"loss": 4.5839,
"step": 11790
},
{
"epoch": 5.29874213836478,
"grad_norm": 3.609375,
"learning_rate": 0.0001720096070246245,
"loss": 4.5683,
"step": 11795
},
{
"epoch": 5.300988319856244,
"grad_norm": 3.375,
"learning_rate": 0.000171905078048858,
"loss": 4.5679,
"step": 11800
},
{
"epoch": 5.303234501347709,
"grad_norm": 3.59375,
"learning_rate": 0.0001718005449223141,
"loss": 4.633,
"step": 11805
},
{
"epoch": 5.305480682839173,
"grad_norm": 3.65625,
"learning_rate": 0.0001716960077078297,
"loss": 4.6708,
"step": 11810
},
{
"epoch": 5.307726864330638,
"grad_norm": 3.59375,
"learning_rate": 0.0001715914664682442,
"loss": 4.613,
"step": 11815
},
{
"epoch": 5.309973045822103,
"grad_norm": 3.5,
"learning_rate": 0.00017148692126639937,
"loss": 4.6032,
"step": 11820
},
{
"epoch": 5.312219227313567,
"grad_norm": 3.59375,
"learning_rate": 0.00017138237216513937,
"loss": 4.5737,
"step": 11825
},
{
"epoch": 5.314465408805032,
"grad_norm": 3.515625,
"learning_rate": 0.00017127781922731067,
"loss": 4.5867,
"step": 11830
},
{
"epoch": 5.316711590296496,
"grad_norm": 3.5625,
"learning_rate": 0.00017117326251576216,
"loss": 4.571,
"step": 11835
},
{
"epoch": 5.318957771787961,
"grad_norm": 3.625,
"learning_rate": 0.00017106870209334488,
"loss": 4.5513,
"step": 11840
},
{
"epoch": 5.321203953279425,
"grad_norm": 3.453125,
"learning_rate": 0.00017096413802291212,
"loss": 4.5808,
"step": 11845
},
{
"epoch": 5.32345013477089,
"grad_norm": 3.578125,
"learning_rate": 0.00017085957036731947,
"loss": 4.5539,
"step": 11850
},
{
"epoch": 5.325696316262354,
"grad_norm": 3.546875,
"learning_rate": 0.0001707549991894245,
"loss": 4.6171,
"step": 11855
},
{
"epoch": 5.327942497753819,
"grad_norm": 3.609375,
"learning_rate": 0.00017065042455208704,
"loss": 4.5978,
"step": 11860
},
{
"epoch": 5.330188679245283,
"grad_norm": 3.359375,
"learning_rate": 0.0001705458465181689,
"loss": 4.6106,
"step": 11865
},
{
"epoch": 5.332434860736748,
"grad_norm": 3.625,
"learning_rate": 0.00017044126515053403,
"loss": 4.5804,
"step": 11870
},
{
"epoch": 5.334681042228212,
"grad_norm": 3.53125,
"learning_rate": 0.00017033668051204837,
"loss": 4.5118,
"step": 11875
},
{
"epoch": 5.336927223719677,
"grad_norm": 3.625,
"learning_rate": 0.00017023209266557967,
"loss": 4.6332,
"step": 11880
},
{
"epoch": 5.339173405211141,
"grad_norm": 3.6875,
"learning_rate": 0.00017012750167399781,
"loss": 4.6062,
"step": 11885
},
{
"epoch": 5.341419586702606,
"grad_norm": 3.53125,
"learning_rate": 0.00017002290760017447,
"loss": 4.603,
"step": 11890
},
{
"epoch": 5.34366576819407,
"grad_norm": 3.515625,
"learning_rate": 0.00016991831050698324,
"loss": 4.5982,
"step": 11895
},
{
"epoch": 5.345911949685535,
"grad_norm": 3.625,
"learning_rate": 0.00016981371045729938,
"loss": 4.5631,
"step": 11900
},
{
"epoch": 5.348158131176999,
"grad_norm": 3.609375,
"learning_rate": 0.00016970910751400007,
"loss": 4.5913,
"step": 11905
},
{
"epoch": 5.350404312668464,
"grad_norm": 3.515625,
"learning_rate": 0.0001696045017399642,
"loss": 4.622,
"step": 11910
},
{
"epoch": 5.352650494159928,
"grad_norm": 3.53125,
"learning_rate": 0.0001694998931980723,
"loss": 4.549,
"step": 11915
},
{
"epoch": 5.354896675651393,
"grad_norm": 3.484375,
"learning_rate": 0.00016939528195120669,
"loss": 4.552,
"step": 11920
},
{
"epoch": 5.357142857142857,
"grad_norm": 3.609375,
"learning_rate": 0.0001692906680622512,
"loss": 4.6126,
"step": 11925
},
{
"epoch": 5.359389038634322,
"grad_norm": 3.40625,
"learning_rate": 0.0001691860515940912,
"loss": 4.574,
"step": 11930
},
{
"epoch": 5.361635220125786,
"grad_norm": 3.578125,
"learning_rate": 0.00016908143260961387,
"loss": 4.5608,
"step": 11935
},
{
"epoch": 5.363881401617251,
"grad_norm": 3.78125,
"learning_rate": 0.00016897681117170748,
"loss": 4.5848,
"step": 11940
},
{
"epoch": 5.366127583108716,
"grad_norm": 4.0,
"learning_rate": 0.00016887218734326222,
"loss": 4.6342,
"step": 11945
},
{
"epoch": 5.36837376460018,
"grad_norm": 3.703125,
"learning_rate": 0.0001687675611871695,
"loss": 4.5628,
"step": 11950
},
{
"epoch": 5.370619946091645,
"grad_norm": 3.515625,
"learning_rate": 0.00016866293276632206,
"loss": 4.7054,
"step": 11955
},
{
"epoch": 5.372866127583109,
"grad_norm": 4.34375,
"learning_rate": 0.00016855830214361416,
"loss": 4.5294,
"step": 11960
},
{
"epoch": 5.375112309074574,
"grad_norm": 3.796875,
"learning_rate": 0.00016845366938194128,
"loss": 4.6392,
"step": 11965
},
{
"epoch": 5.377358490566038,
"grad_norm": 3.578125,
"learning_rate": 0.00016834903454420022,
"loss": 4.6763,
"step": 11970
},
{
"epoch": 5.379604672057503,
"grad_norm": 3.421875,
"learning_rate": 0.000168244397693289,
"loss": 4.6158,
"step": 11975
},
{
"epoch": 5.381850853548967,
"grad_norm": 3.578125,
"learning_rate": 0.00016813975889210696,
"loss": 4.6223,
"step": 11980
},
{
"epoch": 5.384097035040432,
"grad_norm": 3.5,
"learning_rate": 0.00016803511820355447,
"loss": 4.6112,
"step": 11985
},
{
"epoch": 5.386343216531896,
"grad_norm": 3.40625,
"learning_rate": 0.0001679304756905331,
"loss": 4.545,
"step": 11990
},
{
"epoch": 5.388589398023361,
"grad_norm": 3.5,
"learning_rate": 0.0001678258314159455,
"loss": 4.5991,
"step": 11995
},
{
"epoch": 5.390835579514825,
"grad_norm": 3.640625,
"learning_rate": 0.0001677211854426954,
"loss": 4.6018,
"step": 12000
},
{
"epoch": 5.390835579514825,
"eval_loss": 4.814459800720215,
"eval_runtime": 16.0452,
"eval_samples_per_second": 1932.846,
"eval_steps_per_second": 241.629,
"step": 12000
},
{
"epoch": 5.3930817610062896,
"grad_norm": 3.578125,
"learning_rate": 0.00016761653783368754,
"loss": 4.6236,
"step": 12005
},
{
"epoch": 5.395327942497754,
"grad_norm": 3.515625,
"learning_rate": 0.00016751188865182765,
"loss": 4.5989,
"step": 12010
},
{
"epoch": 5.3975741239892185,
"grad_norm": 3.59375,
"learning_rate": 0.0001674072379600224,
"loss": 4.5864,
"step": 12015
},
{
"epoch": 5.399820305480683,
"grad_norm": 3.59375,
"learning_rate": 0.00016730258582117936,
"loss": 4.6238,
"step": 12020
},
{
"epoch": 5.4020664869721475,
"grad_norm": 3.5625,
"learning_rate": 0.000167197932298207,
"loss": 4.5502,
"step": 12025
},
{
"epoch": 5.404312668463612,
"grad_norm": 3.515625,
"learning_rate": 0.00016709327745401448,
"loss": 4.6318,
"step": 12030
},
{
"epoch": 5.4065588499550765,
"grad_norm": 3.515625,
"learning_rate": 0.00016698862135151204,
"loss": 4.6023,
"step": 12035
},
{
"epoch": 5.408805031446541,
"grad_norm": 3.65625,
"learning_rate": 0.00016688396405361043,
"loss": 4.6592,
"step": 12040
},
{
"epoch": 5.4110512129380055,
"grad_norm": 3.515625,
"learning_rate": 0.00016677930562322119,
"loss": 4.5976,
"step": 12045
},
{
"epoch": 5.4132973944294696,
"grad_norm": 3.8125,
"learning_rate": 0.00016667464612325658,
"loss": 4.5946,
"step": 12050
},
{
"epoch": 5.4155435759209345,
"grad_norm": 4.21875,
"learning_rate": 0.0001665699856166294,
"loss": 4.6208,
"step": 12055
},
{
"epoch": 5.4177897574123985,
"grad_norm": 3.6875,
"learning_rate": 0.00016646532416625322,
"loss": 4.535,
"step": 12060
},
{
"epoch": 5.4200359389038635,
"grad_norm": 3.359375,
"learning_rate": 0.000166360661835042,
"loss": 4.5717,
"step": 12065
},
{
"epoch": 5.4222821203953275,
"grad_norm": 3.53125,
"learning_rate": 0.0001662559986859104,
"loss": 4.5988,
"step": 12070
},
{
"epoch": 5.4245283018867925,
"grad_norm": 3.671875,
"learning_rate": 0.00016615133478177342,
"loss": 4.5987,
"step": 12075
},
{
"epoch": 5.4267744833782565,
"grad_norm": 3.609375,
"learning_rate": 0.00016604667018554661,
"loss": 4.5958,
"step": 12080
},
{
"epoch": 5.4290206648697215,
"grad_norm": 3.78125,
"learning_rate": 0.0001659420049601459,
"loss": 4.6424,
"step": 12085
},
{
"epoch": 5.431266846361186,
"grad_norm": 3.421875,
"learning_rate": 0.00016583733916848754,
"loss": 4.5967,
"step": 12090
},
{
"epoch": 5.4335130278526504,
"grad_norm": 3.640625,
"learning_rate": 0.0001657326728734883,
"loss": 4.584,
"step": 12095
},
{
"epoch": 5.435759209344115,
"grad_norm": 3.453125,
"learning_rate": 0.00016562800613806507,
"loss": 4.5748,
"step": 12100
},
{
"epoch": 5.438005390835579,
"grad_norm": 3.578125,
"learning_rate": 0.00016552333902513505,
"loss": 4.6481,
"step": 12105
},
{
"epoch": 5.440251572327044,
"grad_norm": 3.328125,
"learning_rate": 0.00016541867159761573,
"loss": 4.5908,
"step": 12110
},
{
"epoch": 5.442497753818508,
"grad_norm": 3.40625,
"learning_rate": 0.0001653140039184247,
"loss": 4.6422,
"step": 12115
},
{
"epoch": 5.444743935309973,
"grad_norm": 3.59375,
"learning_rate": 0.00016520933605047977,
"loss": 4.576,
"step": 12120
},
{
"epoch": 5.446990116801437,
"grad_norm": 3.765625,
"learning_rate": 0.00016510466805669892,
"loss": 4.619,
"step": 12125
},
{
"epoch": 5.449236298292902,
"grad_norm": 3.328125,
"learning_rate": 0.000165,
"loss": 4.5754,
"step": 12130
},
{
"epoch": 5.451482479784366,
"grad_norm": 3.484375,
"learning_rate": 0.00016489533194330108,
"loss": 4.5766,
"step": 12135
},
{
"epoch": 5.453728661275831,
"grad_norm": 3.5625,
"learning_rate": 0.00016479066394952017,
"loss": 4.5739,
"step": 12140
},
{
"epoch": 5.455974842767295,
"grad_norm": 3.640625,
"learning_rate": 0.0001646859960815753,
"loss": 4.631,
"step": 12145
},
{
"epoch": 5.45822102425876,
"grad_norm": 3.6875,
"learning_rate": 0.00016458132840238427,
"loss": 4.5661,
"step": 12150
},
{
"epoch": 5.460467205750224,
"grad_norm": 3.625,
"learning_rate": 0.00016447666097486494,
"loss": 4.6136,
"step": 12155
},
{
"epoch": 5.462713387241689,
"grad_norm": 3.609375,
"learning_rate": 0.00016437199386193493,
"loss": 4.5473,
"step": 12160
},
{
"epoch": 5.464959568733153,
"grad_norm": 3.484375,
"learning_rate": 0.00016426732712651167,
"loss": 4.5471,
"step": 12165
},
{
"epoch": 5.467205750224618,
"grad_norm": 3.390625,
"learning_rate": 0.00016416266083151243,
"loss": 4.5728,
"step": 12170
},
{
"epoch": 5.469451931716082,
"grad_norm": 3.5625,
"learning_rate": 0.0001640579950398541,
"loss": 4.5791,
"step": 12175
},
{
"epoch": 5.471698113207547,
"grad_norm": 3.453125,
"learning_rate": 0.00016395332981445336,
"loss": 4.6452,
"step": 12180
},
{
"epoch": 5.473944294699011,
"grad_norm": 3.484375,
"learning_rate": 0.00016384866521822655,
"loss": 4.5756,
"step": 12185
},
{
"epoch": 5.476190476190476,
"grad_norm": 3.484375,
"learning_rate": 0.00016374400131408958,
"loss": 4.6024,
"step": 12190
},
{
"epoch": 5.47843665768194,
"grad_norm": 3.5625,
"learning_rate": 0.000163639338164958,
"loss": 4.5795,
"step": 12195
},
{
"epoch": 5.480682839173405,
"grad_norm": 3.5,
"learning_rate": 0.00016353467583374675,
"loss": 4.5962,
"step": 12200
},
{
"epoch": 5.482929020664869,
"grad_norm": 3.671875,
"learning_rate": 0.0001634300143833706,
"loss": 4.5926,
"step": 12205
},
{
"epoch": 5.485175202156334,
"grad_norm": 3.609375,
"learning_rate": 0.0001633253538767435,
"loss": 4.5774,
"step": 12210
},
{
"epoch": 5.487421383647799,
"grad_norm": 3.4375,
"learning_rate": 0.00016322069437677884,
"loss": 4.5415,
"step": 12215
},
{
"epoch": 5.489667565139263,
"grad_norm": 3.5625,
"learning_rate": 0.00016311603594638962,
"loss": 4.5944,
"step": 12220
},
{
"epoch": 5.491913746630728,
"grad_norm": 3.546875,
"learning_rate": 0.00016301137864848799,
"loss": 4.6549,
"step": 12225
},
{
"epoch": 5.494159928122192,
"grad_norm": 3.828125,
"learning_rate": 0.00016290672254598552,
"loss": 4.5839,
"step": 12230
},
{
"epoch": 5.496406109613657,
"grad_norm": 3.484375,
"learning_rate": 0.00016280206770179307,
"loss": 4.5881,
"step": 12235
},
{
"epoch": 5.498652291105121,
"grad_norm": 3.3125,
"learning_rate": 0.00016269741417882064,
"loss": 4.6375,
"step": 12240
},
{
"epoch": 5.500898472596586,
"grad_norm": 3.4375,
"learning_rate": 0.0001625927620399776,
"loss": 4.5649,
"step": 12245
},
{
"epoch": 5.50314465408805,
"grad_norm": 3.5625,
"learning_rate": 0.00016248811134817235,
"loss": 4.6003,
"step": 12250
},
{
"epoch": 5.505390835579515,
"grad_norm": 3.5,
"learning_rate": 0.00016238346216631246,
"loss": 4.6594,
"step": 12255
},
{
"epoch": 5.507637017070979,
"grad_norm": 3.59375,
"learning_rate": 0.0001622788145573046,
"loss": 4.638,
"step": 12260
},
{
"epoch": 5.509883198562444,
"grad_norm": 3.890625,
"learning_rate": 0.0001621741685840545,
"loss": 4.6509,
"step": 12265
},
{
"epoch": 5.512129380053908,
"grad_norm": 3.6875,
"learning_rate": 0.0001620695243094669,
"loss": 4.6186,
"step": 12270
},
{
"epoch": 5.514375561545373,
"grad_norm": 3.421875,
"learning_rate": 0.00016196488179644552,
"loss": 4.7181,
"step": 12275
},
{
"epoch": 5.516621743036837,
"grad_norm": 3.359375,
"learning_rate": 0.00016186024110789304,
"loss": 4.5435,
"step": 12280
},
{
"epoch": 5.518867924528302,
"grad_norm": 3.546875,
"learning_rate": 0.000161755602306711,
"loss": 4.591,
"step": 12285
},
{
"epoch": 5.521114106019766,
"grad_norm": 3.828125,
"learning_rate": 0.00016165096545579978,
"loss": 4.6252,
"step": 12290
},
{
"epoch": 5.523360287511231,
"grad_norm": 3.484375,
"learning_rate": 0.00016154633061805872,
"loss": 4.5458,
"step": 12295
},
{
"epoch": 5.525606469002695,
"grad_norm": 3.265625,
"learning_rate": 0.00016144169785638584,
"loss": 4.5708,
"step": 12300
},
{
"epoch": 5.52785265049416,
"grad_norm": 3.453125,
"learning_rate": 0.00016133706723367794,
"loss": 4.6349,
"step": 12305
},
{
"epoch": 5.530098831985624,
"grad_norm": 3.71875,
"learning_rate": 0.0001612324388128305,
"loss": 4.6165,
"step": 12310
},
{
"epoch": 5.532345013477089,
"grad_norm": 3.5625,
"learning_rate": 0.00016112781265673778,
"loss": 4.612,
"step": 12315
},
{
"epoch": 5.534591194968553,
"grad_norm": 3.75,
"learning_rate": 0.0001610231888282925,
"loss": 4.579,
"step": 12320
},
{
"epoch": 5.536837376460018,
"grad_norm": 3.640625,
"learning_rate": 0.0001609185673903862,
"loss": 4.6182,
"step": 12325
},
{
"epoch": 5.539083557951482,
"grad_norm": 3.40625,
"learning_rate": 0.00016081394840590876,
"loss": 4.5729,
"step": 12330
},
{
"epoch": 5.541329739442947,
"grad_norm": 3.390625,
"learning_rate": 0.0001607093319377488,
"loss": 4.6167,
"step": 12335
},
{
"epoch": 5.543575920934412,
"grad_norm": 3.59375,
"learning_rate": 0.00016060471804879326,
"loss": 4.618,
"step": 12340
},
{
"epoch": 5.545822102425876,
"grad_norm": 3.46875,
"learning_rate": 0.00016050010680192765,
"loss": 4.68,
"step": 12345
},
{
"epoch": 5.54806828391734,
"grad_norm": 3.546875,
"learning_rate": 0.00016039549826003577,
"loss": 4.5647,
"step": 12350
},
{
"epoch": 5.550314465408805,
"grad_norm": 3.65625,
"learning_rate": 0.0001602908924859999,
"loss": 4.5273,
"step": 12355
},
{
"epoch": 5.55256064690027,
"grad_norm": 4.15625,
"learning_rate": 0.0001601862895427006,
"loss": 4.5393,
"step": 12360
},
{
"epoch": 5.554806828391734,
"grad_norm": 3.484375,
"learning_rate": 0.00016008168949301676,
"loss": 4.5627,
"step": 12365
},
{
"epoch": 5.557053009883199,
"grad_norm": 3.34375,
"learning_rate": 0.00015997709239982553,
"loss": 4.6177,
"step": 12370
},
{
"epoch": 5.559299191374663,
"grad_norm": 3.75,
"learning_rate": 0.00015987249832600218,
"loss": 4.6861,
"step": 12375
},
{
"epoch": 5.561545372866128,
"grad_norm": 3.515625,
"learning_rate": 0.00015976790733442036,
"loss": 4.6145,
"step": 12380
},
{
"epoch": 5.563791554357592,
"grad_norm": 3.46875,
"learning_rate": 0.00015966331948795166,
"loss": 4.5544,
"step": 12385
},
{
"epoch": 5.566037735849057,
"grad_norm": 3.46875,
"learning_rate": 0.00015955873484946597,
"loss": 4.6375,
"step": 12390
},
{
"epoch": 5.568283917340521,
"grad_norm": 3.578125,
"learning_rate": 0.00015945415348183112,
"loss": 4.5971,
"step": 12395
},
{
"epoch": 5.570530098831986,
"grad_norm": 3.625,
"learning_rate": 0.00015934957544791302,
"loss": 4.575,
"step": 12400
},
{
"epoch": 5.57277628032345,
"grad_norm": 3.578125,
"learning_rate": 0.0001592450008105755,
"loss": 4.6174,
"step": 12405
},
{
"epoch": 5.575022461814915,
"grad_norm": 3.515625,
"learning_rate": 0.00015914042963268053,
"loss": 4.5955,
"step": 12410
},
{
"epoch": 5.577268643306379,
"grad_norm": 3.75,
"learning_rate": 0.00015903586197708788,
"loss": 4.5669,
"step": 12415
},
{
"epoch": 5.579514824797844,
"grad_norm": 3.671875,
"learning_rate": 0.00015893129790665511,
"loss": 4.5646,
"step": 12420
},
{
"epoch": 5.581761006289308,
"grad_norm": 3.5625,
"learning_rate": 0.00015882673748423784,
"loss": 4.6166,
"step": 12425
},
{
"epoch": 5.584007187780773,
"grad_norm": 3.734375,
"learning_rate": 0.00015872218077268933,
"loss": 4.5632,
"step": 12430
},
{
"epoch": 5.586253369272237,
"grad_norm": 3.53125,
"learning_rate": 0.00015861762783486063,
"loss": 4.568,
"step": 12435
},
{
"epoch": 5.588499550763702,
"grad_norm": 3.421875,
"learning_rate": 0.0001585130787336006,
"loss": 4.5876,
"step": 12440
},
{
"epoch": 5.590745732255166,
"grad_norm": 3.625,
"learning_rate": 0.0001584085335317558,
"loss": 4.5641,
"step": 12445
},
{
"epoch": 5.592991913746631,
"grad_norm": 3.609375,
"learning_rate": 0.0001583039922921703,
"loss": 4.5242,
"step": 12450
},
{
"epoch": 5.595238095238095,
"grad_norm": 3.609375,
"learning_rate": 0.0001581994550776859,
"loss": 4.632,
"step": 12455
},
{
"epoch": 5.59748427672956,
"grad_norm": 3.546875,
"learning_rate": 0.000158094921951142,
"loss": 4.5217,
"step": 12460
},
{
"epoch": 5.599730458221024,
"grad_norm": 3.84375,
"learning_rate": 0.00015799039297537544,
"loss": 4.5639,
"step": 12465
},
{
"epoch": 5.601976639712489,
"grad_norm": 3.609375,
"learning_rate": 0.00015788586821322074,
"loss": 4.5401,
"step": 12470
},
{
"epoch": 5.604222821203953,
"grad_norm": 3.5625,
"learning_rate": 0.00015778134772750972,
"loss": 4.5897,
"step": 12475
},
{
"epoch": 5.606469002695418,
"grad_norm": 3.421875,
"learning_rate": 0.00015767683158107165,
"loss": 4.5933,
"step": 12480
},
{
"epoch": 5.608715184186883,
"grad_norm": 3.53125,
"learning_rate": 0.00015757231983673327,
"loss": 4.5536,
"step": 12485
},
{
"epoch": 5.610961365678347,
"grad_norm": 3.4375,
"learning_rate": 0.00015746781255731863,
"loss": 4.6235,
"step": 12490
},
{
"epoch": 5.613207547169811,
"grad_norm": 3.8125,
"learning_rate": 0.0001573633098056491,
"loss": 4.6643,
"step": 12495
},
{
"epoch": 5.615453728661276,
"grad_norm": 3.5,
"learning_rate": 0.0001572588116445433,
"loss": 4.5779,
"step": 12500
},
{
"epoch": 5.617699910152741,
"grad_norm": 3.515625,
"learning_rate": 0.0001571543181368171,
"loss": 4.5442,
"step": 12505
},
{
"epoch": 5.619946091644205,
"grad_norm": 3.703125,
"learning_rate": 0.00015704982934528367,
"loss": 4.5921,
"step": 12510
},
{
"epoch": 5.62219227313567,
"grad_norm": 3.421875,
"learning_rate": 0.00015694534533275325,
"loss": 4.6018,
"step": 12515
},
{
"epoch": 5.624438454627134,
"grad_norm": 3.515625,
"learning_rate": 0.00015684086616203313,
"loss": 4.6134,
"step": 12520
},
{
"epoch": 5.626684636118599,
"grad_norm": 3.71875,
"learning_rate": 0.00015673639189592788,
"loss": 4.571,
"step": 12525
},
{
"epoch": 5.628930817610063,
"grad_norm": 3.75,
"learning_rate": 0.00015663192259723904,
"loss": 4.5947,
"step": 12530
},
{
"epoch": 5.631176999101528,
"grad_norm": 3.71875,
"learning_rate": 0.00015652745832876502,
"loss": 4.6523,
"step": 12535
},
{
"epoch": 5.633423180592992,
"grad_norm": 3.75,
"learning_rate": 0.0001564229991533014,
"loss": 4.6599,
"step": 12540
},
{
"epoch": 5.635669362084457,
"grad_norm": 3.546875,
"learning_rate": 0.00015631854513364066,
"loss": 4.5874,
"step": 12545
},
{
"epoch": 5.637915543575921,
"grad_norm": 3.609375,
"learning_rate": 0.00015621409633257216,
"loss": 4.563,
"step": 12550
},
{
"epoch": 5.640161725067386,
"grad_norm": 3.40625,
"learning_rate": 0.00015610965281288205,
"loss": 4.5769,
"step": 12555
},
{
"epoch": 5.64240790655885,
"grad_norm": 3.703125,
"learning_rate": 0.00015600521463735346,
"loss": 4.5632,
"step": 12560
},
{
"epoch": 5.644654088050315,
"grad_norm": 3.421875,
"learning_rate": 0.00015590078186876612,
"loss": 4.5776,
"step": 12565
},
{
"epoch": 5.646900269541779,
"grad_norm": 3.421875,
"learning_rate": 0.0001557963545698967,
"loss": 4.5644,
"step": 12570
},
{
"epoch": 5.649146451033244,
"grad_norm": 3.546875,
"learning_rate": 0.00015569193280351844,
"loss": 4.6529,
"step": 12575
},
{
"epoch": 5.651392632524708,
"grad_norm": 3.75,
"learning_rate": 0.00015558751663240127,
"loss": 4.6086,
"step": 12580
},
{
"epoch": 5.653638814016173,
"grad_norm": 3.34375,
"learning_rate": 0.0001554831061193119,
"loss": 4.5621,
"step": 12585
},
{
"epoch": 5.655884995507637,
"grad_norm": 3.640625,
"learning_rate": 0.0001553787013270134,
"loss": 4.5809,
"step": 12590
},
{
"epoch": 5.658131176999102,
"grad_norm": 3.5,
"learning_rate": 0.00015527430231826564,
"loss": 4.5426,
"step": 12595
},
{
"epoch": 5.660377358490566,
"grad_norm": 3.4375,
"learning_rate": 0.00015516990915582487,
"loss": 4.5945,
"step": 12600
},
{
"epoch": 5.662623539982031,
"grad_norm": 3.71875,
"learning_rate": 0.00015506552190244386,
"loss": 4.6063,
"step": 12605
},
{
"epoch": 5.6648697214734955,
"grad_norm": 3.453125,
"learning_rate": 0.00015496114062087175,
"loss": 4.5035,
"step": 12610
},
{
"epoch": 5.66711590296496,
"grad_norm": 3.546875,
"learning_rate": 0.0001548567653738543,
"loss": 4.5661,
"step": 12615
},
{
"epoch": 5.669362084456424,
"grad_norm": 3.71875,
"learning_rate": 0.00015475239622413344,
"loss": 4.6367,
"step": 12620
},
{
"epoch": 5.671608265947889,
"grad_norm": 3.5,
"learning_rate": 0.00015464803323444754,
"loss": 4.5808,
"step": 12625
},
{
"epoch": 5.6738544474393535,
"grad_norm": 3.546875,
"learning_rate": 0.0001545436764675312,
"loss": 4.6723,
"step": 12630
},
{
"epoch": 5.676100628930818,
"grad_norm": 3.5625,
"learning_rate": 0.0001544393259861153,
"loss": 4.5973,
"step": 12635
},
{
"epoch": 5.678346810422282,
"grad_norm": 3.796875,
"learning_rate": 0.00015433498185292695,
"loss": 4.4989,
"step": 12640
},
{
"epoch": 5.680592991913747,
"grad_norm": 3.609375,
"learning_rate": 0.00015423064413068953,
"loss": 4.5821,
"step": 12645
},
{
"epoch": 5.6828391734052115,
"grad_norm": 3.640625,
"learning_rate": 0.00015412631288212243,
"loss": 4.5362,
"step": 12650
},
{
"epoch": 5.6850853548966755,
"grad_norm": 3.671875,
"learning_rate": 0.0001540219881699412,
"loss": 4.5634,
"step": 12655
},
{
"epoch": 5.6873315363881405,
"grad_norm": 3.515625,
"learning_rate": 0.00015391767005685744,
"loss": 4.5416,
"step": 12660
},
{
"epoch": 5.6895777178796045,
"grad_norm": 3.71875,
"learning_rate": 0.0001538133586055788,
"loss": 4.4939,
"step": 12665
},
{
"epoch": 5.6918238993710695,
"grad_norm": 3.796875,
"learning_rate": 0.00015370905387880905,
"loss": 4.6136,
"step": 12670
},
{
"epoch": 5.6940700808625335,
"grad_norm": 3.53125,
"learning_rate": 0.00015360475593924764,
"loss": 4.6813,
"step": 12675
},
{
"epoch": 5.6963162623539985,
"grad_norm": 3.484375,
"learning_rate": 0.00015350046484959023,
"loss": 4.6002,
"step": 12680
},
{
"epoch": 5.6985624438454625,
"grad_norm": 3.78125,
"learning_rate": 0.0001533961806725282,
"loss": 4.5868,
"step": 12685
},
{
"epoch": 5.7008086253369274,
"grad_norm": 3.703125,
"learning_rate": 0.00015329190347074871,
"loss": 4.561,
"step": 12690
},
{
"epoch": 5.7030548068283915,
"grad_norm": 3.453125,
"learning_rate": 0.0001531876333069349,
"loss": 4.5557,
"step": 12695
},
{
"epoch": 5.705300988319856,
"grad_norm": 3.53125,
"learning_rate": 0.00015308337024376564,
"loss": 4.5689,
"step": 12700
},
{
"epoch": 5.7075471698113205,
"grad_norm": 3.53125,
"learning_rate": 0.0001529791143439155,
"loss": 4.5652,
"step": 12705
},
{
"epoch": 5.709793351302785,
"grad_norm": 3.59375,
"learning_rate": 0.0001528748656700546,
"loss": 4.6094,
"step": 12710
},
{
"epoch": 5.7120395327942495,
"grad_norm": 3.65625,
"learning_rate": 0.00015277062428484898,
"loss": 4.6139,
"step": 12715
},
{
"epoch": 5.714285714285714,
"grad_norm": 3.53125,
"learning_rate": 0.0001526663902509602,
"loss": 4.5639,
"step": 12720
},
{
"epoch": 5.7165318957771785,
"grad_norm": 3.609375,
"learning_rate": 0.00015256216363104526,
"loss": 4.5862,
"step": 12725
},
{
"epoch": 5.718778077268643,
"grad_norm": 3.46875,
"learning_rate": 0.0001524579444877569,
"loss": 4.5733,
"step": 12730
},
{
"epoch": 5.7210242587601075,
"grad_norm": 3.9375,
"learning_rate": 0.00015235373288374329,
"loss": 4.5285,
"step": 12735
},
{
"epoch": 5.723270440251572,
"grad_norm": 3.59375,
"learning_rate": 0.000152249528881648,
"loss": 4.6101,
"step": 12740
},
{
"epoch": 5.725516621743036,
"grad_norm": 3.53125,
"learning_rate": 0.00015214533254411016,
"loss": 4.531,
"step": 12745
},
{
"epoch": 5.727762803234501,
"grad_norm": 3.796875,
"learning_rate": 0.0001520411439337642,
"loss": 4.63,
"step": 12750
},
{
"epoch": 5.730008984725966,
"grad_norm": 3.609375,
"learning_rate": 0.00015193696311323988,
"loss": 4.5898,
"step": 12755
},
{
"epoch": 5.73225516621743,
"grad_norm": 3.53125,
"learning_rate": 0.0001518327901451624,
"loss": 4.6105,
"step": 12760
},
{
"epoch": 5.734501347708894,
"grad_norm": 3.578125,
"learning_rate": 0.00015172862509215215,
"loss": 4.5676,
"step": 12765
},
{
"epoch": 5.736747529200359,
"grad_norm": 3.734375,
"learning_rate": 0.00015162446801682476,
"loss": 4.5878,
"step": 12770
},
{
"epoch": 5.738993710691824,
"grad_norm": 3.359375,
"learning_rate": 0.0001515203189817911,
"loss": 4.6522,
"step": 12775
},
{
"epoch": 5.741239892183288,
"grad_norm": 3.484375,
"learning_rate": 0.00015141617804965716,
"loss": 4.5824,
"step": 12780
},
{
"epoch": 5.743486073674753,
"grad_norm": 3.453125,
"learning_rate": 0.00015131204528302412,
"loss": 4.6432,
"step": 12785
},
{
"epoch": 5.745732255166217,
"grad_norm": 3.75,
"learning_rate": 0.0001512079207444882,
"loss": 4.5879,
"step": 12790
},
{
"epoch": 5.747978436657682,
"grad_norm": 3.390625,
"learning_rate": 0.00015110380449664075,
"loss": 4.574,
"step": 12795
},
{
"epoch": 5.750224618149146,
"grad_norm": 4.34375,
"learning_rate": 0.00015099969660206804,
"loss": 4.6326,
"step": 12800
},
{
"epoch": 5.752470799640611,
"grad_norm": 3.671875,
"learning_rate": 0.00015089559712335135,
"loss": 4.5741,
"step": 12805
},
{
"epoch": 5.754716981132075,
"grad_norm": 3.546875,
"learning_rate": 0.00015079150612306693,
"loss": 4.6053,
"step": 12810
},
{
"epoch": 5.75696316262354,
"grad_norm": 3.609375,
"learning_rate": 0.00015068742366378587,
"loss": 4.5251,
"step": 12815
},
{
"epoch": 5.759209344115004,
"grad_norm": 3.8125,
"learning_rate": 0.00015058334980807425,
"loss": 4.6518,
"step": 12820
},
{
"epoch": 5.761455525606469,
"grad_norm": 3.609375,
"learning_rate": 0.00015047928461849286,
"loss": 4.6316,
"step": 12825
},
{
"epoch": 5.763701707097933,
"grad_norm": 3.546875,
"learning_rate": 0.00015037522815759732,
"loss": 4.5181,
"step": 12830
},
{
"epoch": 5.765947888589398,
"grad_norm": 3.40625,
"learning_rate": 0.000150271180487938,
"loss": 4.5048,
"step": 12835
},
{
"epoch": 5.768194070080862,
"grad_norm": 3.8125,
"learning_rate": 0.00015016714167206,
"loss": 4.6388,
"step": 12840
},
{
"epoch": 5.770440251572327,
"grad_norm": 3.59375,
"learning_rate": 0.0001500631117725031,
"loss": 4.594,
"step": 12845
},
{
"epoch": 5.772686433063791,
"grad_norm": 3.40625,
"learning_rate": 0.00014995909085180163,
"loss": 4.5535,
"step": 12850
},
{
"epoch": 5.774932614555256,
"grad_norm": 3.546875,
"learning_rate": 0.00014985507897248465,
"loss": 4.566,
"step": 12855
},
{
"epoch": 5.77717879604672,
"grad_norm": 3.6875,
"learning_rate": 0.00014975107619707577,
"loss": 4.5553,
"step": 12860
},
{
"epoch": 5.779424977538185,
"grad_norm": 3.765625,
"learning_rate": 0.000149647082588093,
"loss": 4.5905,
"step": 12865
},
{
"epoch": 5.781671159029649,
"grad_norm": 3.40625,
"learning_rate": 0.000149543098208049,
"loss": 4.5787,
"step": 12870
},
{
"epoch": 5.783917340521114,
"grad_norm": 3.546875,
"learning_rate": 0.00014943912311945085,
"loss": 4.6191,
"step": 12875
},
{
"epoch": 5.786163522012579,
"grad_norm": 3.578125,
"learning_rate": 0.0001493351573847999,
"loss": 4.6162,
"step": 12880
},
{
"epoch": 5.788409703504043,
"grad_norm": 3.484375,
"learning_rate": 0.00014923120106659205,
"loss": 4.5889,
"step": 12885
},
{
"epoch": 5.790655884995507,
"grad_norm": 3.40625,
"learning_rate": 0.00014912725422731749,
"loss": 4.6641,
"step": 12890
},
{
"epoch": 5.792902066486972,
"grad_norm": 3.78125,
"learning_rate": 0.00014902331692946065,
"loss": 4.544,
"step": 12895
},
{
"epoch": 5.795148247978437,
"grad_norm": 3.75,
"learning_rate": 0.00014891938923550032,
"loss": 4.4786,
"step": 12900
},
{
"epoch": 5.797394429469901,
"grad_norm": 3.59375,
"learning_rate": 0.00014881547120790945,
"loss": 4.5937,
"step": 12905
},
{
"epoch": 5.799640610961365,
"grad_norm": 3.9375,
"learning_rate": 0.00014871156290915515,
"loss": 4.6043,
"step": 12910
},
{
"epoch": 5.80188679245283,
"grad_norm": 3.515625,
"learning_rate": 0.00014860766440169881,
"loss": 4.5986,
"step": 12915
},
{
"epoch": 5.804132973944295,
"grad_norm": 3.6875,
"learning_rate": 0.0001485037757479958,
"loss": 4.5605,
"step": 12920
},
{
"epoch": 5.806379155435759,
"grad_norm": 3.46875,
"learning_rate": 0.00014839989701049563,
"loss": 4.6339,
"step": 12925
},
{
"epoch": 5.808625336927224,
"grad_norm": 3.421875,
"learning_rate": 0.00014829602825164188,
"loss": 4.5654,
"step": 12930
},
{
"epoch": 5.810871518418688,
"grad_norm": 3.578125,
"learning_rate": 0.000148192169533872,
"loss": 4.5828,
"step": 12935
},
{
"epoch": 5.813117699910153,
"grad_norm": 3.8125,
"learning_rate": 0.0001480883209196176,
"loss": 4.5773,
"step": 12940
},
{
"epoch": 5.815363881401617,
"grad_norm": 3.53125,
"learning_rate": 0.00014798448247130405,
"loss": 4.6143,
"step": 12945
},
{
"epoch": 5.817610062893082,
"grad_norm": 3.65625,
"learning_rate": 0.0001478806542513507,
"loss": 4.5957,
"step": 12950
},
{
"epoch": 5.819856244384546,
"grad_norm": 3.640625,
"learning_rate": 0.00014777683632217069,
"loss": 4.6125,
"step": 12955
},
{
"epoch": 5.822102425876011,
"grad_norm": 3.390625,
"learning_rate": 0.00014767302874617096,
"loss": 4.5948,
"step": 12960
},
{
"epoch": 5.824348607367475,
"grad_norm": 3.546875,
"learning_rate": 0.00014756923158575243,
"loss": 4.5827,
"step": 12965
},
{
"epoch": 5.82659478885894,
"grad_norm": 3.5625,
"learning_rate": 0.00014746544490330945,
"loss": 4.5534,
"step": 12970
},
{
"epoch": 5.828840970350404,
"grad_norm": 3.453125,
"learning_rate": 0.0001473616687612303,
"loss": 4.5907,
"step": 12975
},
{
"epoch": 5.831087151841869,
"grad_norm": 3.46875,
"learning_rate": 0.00014725790322189688,
"loss": 4.6352,
"step": 12980
},
{
"epoch": 5.833333333333333,
"grad_norm": 3.59375,
"learning_rate": 0.0001471541483476846,
"loss": 4.5848,
"step": 12985
},
{
"epoch": 5.835579514824798,
"grad_norm": 3.53125,
"learning_rate": 0.00014705040420096252,
"loss": 4.5885,
"step": 12990
},
{
"epoch": 5.837825696316262,
"grad_norm": 3.5625,
"learning_rate": 0.0001469466708440934,
"loss": 4.6434,
"step": 12995
},
{
"epoch": 5.840071877807727,
"grad_norm": 3.734375,
"learning_rate": 0.0001468429483394333,
"loss": 4.5689,
"step": 13000
},
{
"epoch": 5.840071877807727,
"eval_loss": 4.79840087890625,
"eval_runtime": 16.0175,
"eval_samples_per_second": 1936.199,
"eval_steps_per_second": 242.048,
"step": 13000
},
{
"epoch": 5.842318059299191,
"grad_norm": 3.453125,
"learning_rate": 0.00014673923674933192,
"loss": 4.6216,
"step": 13005
},
{
"epoch": 5.844564240790656,
"grad_norm": 3.578125,
"learning_rate": 0.00014663553613613217,
"loss": 4.5931,
"step": 13010
},
{
"epoch": 5.84681042228212,
"grad_norm": 3.5625,
"learning_rate": 0.00014653184656217066,
"loss": 4.6131,
"step": 13015
},
{
"epoch": 5.849056603773585,
"grad_norm": 3.90625,
"learning_rate": 0.0001464281680897772,
"loss": 4.605,
"step": 13020
},
{
"epoch": 5.85130278526505,
"grad_norm": 3.53125,
"learning_rate": 0.0001463245007812749,
"loss": 4.5923,
"step": 13025
},
{
"epoch": 5.853548966756514,
"grad_norm": 3.53125,
"learning_rate": 0.0001462208446989802,
"loss": 4.6215,
"step": 13030
},
{
"epoch": 5.855795148247978,
"grad_norm": 3.546875,
"learning_rate": 0.00014611719990520285,
"loss": 4.6047,
"step": 13035
},
{
"epoch": 5.858041329739443,
"grad_norm": 3.578125,
"learning_rate": 0.0001460135664622457,
"loss": 4.6222,
"step": 13040
},
{
"epoch": 5.860287511230908,
"grad_norm": 3.4375,
"learning_rate": 0.00014590994443240487,
"loss": 4.5746,
"step": 13045
},
{
"epoch": 5.862533692722372,
"grad_norm": 3.421875,
"learning_rate": 0.00014580633387796966,
"loss": 4.5708,
"step": 13050
},
{
"epoch": 5.864779874213837,
"grad_norm": 3.515625,
"learning_rate": 0.00014570273486122227,
"loss": 4.5993,
"step": 13055
},
{
"epoch": 5.867026055705301,
"grad_norm": 3.609375,
"learning_rate": 0.0001455991474444382,
"loss": 4.5421,
"step": 13060
},
{
"epoch": 5.869272237196766,
"grad_norm": 3.5625,
"learning_rate": 0.0001454955716898858,
"loss": 4.5749,
"step": 13065
},
{
"epoch": 5.87151841868823,
"grad_norm": 3.390625,
"learning_rate": 0.00014539200765982646,
"loss": 4.6254,
"step": 13070
},
{
"epoch": 5.873764600179695,
"grad_norm": 3.421875,
"learning_rate": 0.00014528845541651462,
"loss": 4.5774,
"step": 13075
},
{
"epoch": 5.876010781671159,
"grad_norm": 3.5625,
"learning_rate": 0.00014518491502219752,
"loss": 4.58,
"step": 13080
},
{
"epoch": 5.878256963162624,
"grad_norm": 3.75,
"learning_rate": 0.00014508138653911536,
"loss": 4.5937,
"step": 13085
},
{
"epoch": 5.880503144654088,
"grad_norm": 3.5,
"learning_rate": 0.00014497787002950107,
"loss": 4.6329,
"step": 13090
},
{
"epoch": 5.882749326145553,
"grad_norm": 3.46875,
"learning_rate": 0.00014487436555558046,
"loss": 4.6449,
"step": 13095
},
{
"epoch": 5.884995507637017,
"grad_norm": 3.46875,
"learning_rate": 0.00014477087317957212,
"loss": 4.6336,
"step": 13100
},
{
"epoch": 5.887241689128482,
"grad_norm": 3.3125,
"learning_rate": 0.00014466739296368732,
"loss": 4.5848,
"step": 13105
},
{
"epoch": 5.889487870619946,
"grad_norm": 3.40625,
"learning_rate": 0.00014456392497013006,
"loss": 4.5894,
"step": 13110
},
{
"epoch": 5.891734052111411,
"grad_norm": 3.71875,
"learning_rate": 0.00014446046926109695,
"loss": 4.547,
"step": 13115
},
{
"epoch": 5.893980233602875,
"grad_norm": 3.4375,
"learning_rate": 0.00014435702589877725,
"loss": 4.5287,
"step": 13120
},
{
"epoch": 5.89622641509434,
"grad_norm": 3.609375,
"learning_rate": 0.00014425359494535275,
"loss": 4.5491,
"step": 13125
},
{
"epoch": 5.898472596585804,
"grad_norm": 3.578125,
"learning_rate": 0.00014415017646299792,
"loss": 4.6074,
"step": 13130
},
{
"epoch": 5.900718778077269,
"grad_norm": 3.59375,
"learning_rate": 0.00014404677051387948,
"loss": 4.5964,
"step": 13135
},
{
"epoch": 5.902964959568733,
"grad_norm": 3.453125,
"learning_rate": 0.00014394337716015692,
"loss": 4.5608,
"step": 13140
},
{
"epoch": 5.905211141060198,
"grad_norm": 3.46875,
"learning_rate": 0.00014383999646398193,
"loss": 4.5739,
"step": 13145
},
{
"epoch": 5.907457322551663,
"grad_norm": 3.75,
"learning_rate": 0.00014373662848749866,
"loss": 4.6323,
"step": 13150
},
{
"epoch": 5.909703504043127,
"grad_norm": 3.703125,
"learning_rate": 0.00014363327329284362,
"loss": 4.5731,
"step": 13155
},
{
"epoch": 5.911949685534591,
"grad_norm": 3.390625,
"learning_rate": 0.00014352993094214573,
"loss": 4.5454,
"step": 13160
},
{
"epoch": 5.914195867026056,
"grad_norm": 3.578125,
"learning_rate": 0.00014342660149752596,
"loss": 4.5988,
"step": 13165
},
{
"epoch": 5.916442048517521,
"grad_norm": 3.703125,
"learning_rate": 0.00014332328502109773,
"loss": 4.5969,
"step": 13170
},
{
"epoch": 5.918688230008985,
"grad_norm": 3.75,
"learning_rate": 0.00014321998157496656,
"loss": 4.5779,
"step": 13175
},
{
"epoch": 5.920934411500449,
"grad_norm": 3.546875,
"learning_rate": 0.00014311669122123023,
"loss": 4.6475,
"step": 13180
},
{
"epoch": 5.923180592991914,
"grad_norm": 3.59375,
"learning_rate": 0.0001430134140219785,
"loss": 4.5966,
"step": 13185
},
{
"epoch": 5.925426774483379,
"grad_norm": 3.609375,
"learning_rate": 0.00014291015003929343,
"loss": 4.5979,
"step": 13190
},
{
"epoch": 5.927672955974843,
"grad_norm": 3.53125,
"learning_rate": 0.00014280689933524892,
"loss": 4.5359,
"step": 13195
},
{
"epoch": 5.929919137466308,
"grad_norm": 3.578125,
"learning_rate": 0.00014270366197191104,
"loss": 4.573,
"step": 13200
},
{
"epoch": 5.932165318957772,
"grad_norm": 3.609375,
"learning_rate": 0.00014260043801133773,
"loss": 4.5386,
"step": 13205
},
{
"epoch": 5.934411500449237,
"grad_norm": 3.578125,
"learning_rate": 0.00014249722751557905,
"loss": 4.5472,
"step": 13210
},
{
"epoch": 5.936657681940701,
"grad_norm": 3.578125,
"learning_rate": 0.00014239403054667668,
"loss": 4.6398,
"step": 13215
},
{
"epoch": 5.938903863432166,
"grad_norm": 3.4375,
"learning_rate": 0.00014229084716666445,
"loss": 4.5737,
"step": 13220
},
{
"epoch": 5.94115004492363,
"grad_norm": 3.8125,
"learning_rate": 0.0001421876774375679,
"loss": 4.6144,
"step": 13225
},
{
"epoch": 5.943396226415095,
"grad_norm": 3.5,
"learning_rate": 0.00014208452142140435,
"loss": 4.576,
"step": 13230
},
{
"epoch": 5.945642407906559,
"grad_norm": 3.734375,
"learning_rate": 0.00014198137918018287,
"loss": 4.5736,
"step": 13235
},
{
"epoch": 5.947888589398024,
"grad_norm": 3.640625,
"learning_rate": 0.00014187825077590431,
"loss": 4.5622,
"step": 13240
},
{
"epoch": 5.950134770889488,
"grad_norm": 3.546875,
"learning_rate": 0.00014177513627056115,
"loss": 4.5501,
"step": 13245
},
{
"epoch": 5.9523809523809526,
"grad_norm": 3.625,
"learning_rate": 0.00014167203572613756,
"loss": 4.5601,
"step": 13250
},
{
"epoch": 5.954627133872417,
"grad_norm": 3.59375,
"learning_rate": 0.00014156894920460932,
"loss": 4.5828,
"step": 13255
},
{
"epoch": 5.9568733153638815,
"grad_norm": 3.78125,
"learning_rate": 0.00014146587676794366,
"loss": 4.5764,
"step": 13260
},
{
"epoch": 5.959119496855346,
"grad_norm": 3.5625,
"learning_rate": 0.00014136281847809952,
"loss": 4.5667,
"step": 13265
},
{
"epoch": 5.9613656783468105,
"grad_norm": 3.390625,
"learning_rate": 0.00014125977439702724,
"loss": 4.5588,
"step": 13270
},
{
"epoch": 5.963611859838275,
"grad_norm": 3.65625,
"learning_rate": 0.00014115674458666858,
"loss": 4.5707,
"step": 13275
},
{
"epoch": 5.9658580413297395,
"grad_norm": 3.625,
"learning_rate": 0.0001410537291089568,
"loss": 4.6072,
"step": 13280
},
{
"epoch": 5.968104222821204,
"grad_norm": 3.515625,
"learning_rate": 0.00014095072802581656,
"loss": 4.5488,
"step": 13285
},
{
"epoch": 5.9703504043126685,
"grad_norm": 3.65625,
"learning_rate": 0.00014084774139916378,
"loss": 4.5971,
"step": 13290
},
{
"epoch": 5.972596585804133,
"grad_norm": 3.5,
"learning_rate": 0.0001407447692909057,
"loss": 4.6285,
"step": 13295
},
{
"epoch": 5.9748427672955975,
"grad_norm": 3.671875,
"learning_rate": 0.00014064181176294096,
"loss": 4.5695,
"step": 13300
},
{
"epoch": 5.9770889487870615,
"grad_norm": 3.625,
"learning_rate": 0.0001405388688771593,
"loss": 4.6901,
"step": 13305
},
{
"epoch": 5.9793351302785265,
"grad_norm": 3.703125,
"learning_rate": 0.0001404359406954416,
"loss": 4.5687,
"step": 13310
},
{
"epoch": 5.981581311769991,
"grad_norm": 3.9375,
"learning_rate": 0.0001403330272796602,
"loss": 4.6383,
"step": 13315
},
{
"epoch": 5.9838274932614555,
"grad_norm": 3.4375,
"learning_rate": 0.00014023012869167828,
"loss": 4.5998,
"step": 13320
},
{
"epoch": 5.98607367475292,
"grad_norm": 3.5,
"learning_rate": 0.00014012724499335013,
"loss": 4.6224,
"step": 13325
},
{
"epoch": 5.9883198562443845,
"grad_norm": 3.703125,
"learning_rate": 0.00014002437624652118,
"loss": 4.589,
"step": 13330
},
{
"epoch": 5.990566037735849,
"grad_norm": 3.671875,
"learning_rate": 0.00013992152251302784,
"loss": 4.5993,
"step": 13335
},
{
"epoch": 5.992812219227313,
"grad_norm": 3.53125,
"learning_rate": 0.00013981868385469756,
"loss": 4.6194,
"step": 13340
},
{
"epoch": 5.995058400718778,
"grad_norm": 3.8125,
"learning_rate": 0.00013971586033334864,
"loss": 4.6192,
"step": 13345
},
{
"epoch": 5.997304582210242,
"grad_norm": 3.96875,
"learning_rate": 0.00013961305201079025,
"loss": 4.6668,
"step": 13350
},
{
"epoch": 5.999550763701707,
"grad_norm": 3.375,
"learning_rate": 0.00013951025894882256,
"loss": 4.5731,
"step": 13355
},
{
"epoch": 6.001796945193171,
"grad_norm": 3.796875,
"learning_rate": 0.00013940748120923641,
"loss": 4.5212,
"step": 13360
},
{
"epoch": 6.004043126684636,
"grad_norm": 3.703125,
"learning_rate": 0.0001393047188538136,
"loss": 4.521,
"step": 13365
},
{
"epoch": 6.0062893081761,
"grad_norm": 3.640625,
"learning_rate": 0.00013920197194432657,
"loss": 4.5062,
"step": 13370
},
{
"epoch": 6.008535489667565,
"grad_norm": 3.625,
"learning_rate": 0.00013909924054253845,
"loss": 4.4692,
"step": 13375
},
{
"epoch": 6.010781671159029,
"grad_norm": 3.765625,
"learning_rate": 0.00013899652471020308,
"loss": 4.5304,
"step": 13380
},
{
"epoch": 6.013027852650494,
"grad_norm": 3.734375,
"learning_rate": 0.00013889382450906507,
"loss": 4.5064,
"step": 13385
},
{
"epoch": 6.015274034141958,
"grad_norm": 3.578125,
"learning_rate": 0.00013879114000085933,
"loss": 4.5232,
"step": 13390
},
{
"epoch": 6.017520215633423,
"grad_norm": 3.65625,
"learning_rate": 0.0001386884712473117,
"loss": 4.5745,
"step": 13395
},
{
"epoch": 6.019766397124887,
"grad_norm": 3.890625,
"learning_rate": 0.0001385858183101383,
"loss": 4.5067,
"step": 13400
},
{
"epoch": 6.022012578616352,
"grad_norm": 3.546875,
"learning_rate": 0.0001384831812510458,
"loss": 4.5416,
"step": 13405
},
{
"epoch": 6.024258760107816,
"grad_norm": 3.796875,
"learning_rate": 0.00013838056013173143,
"loss": 4.5799,
"step": 13410
},
{
"epoch": 6.026504941599281,
"grad_norm": 3.46875,
"learning_rate": 0.00013827795501388264,
"loss": 4.5485,
"step": 13415
},
{
"epoch": 6.028751123090745,
"grad_norm": 3.90625,
"learning_rate": 0.00013817536595917742,
"loss": 4.5435,
"step": 13420
},
{
"epoch": 6.03099730458221,
"grad_norm": 3.8125,
"learning_rate": 0.00013807279302928405,
"loss": 4.5413,
"step": 13425
},
{
"epoch": 6.033243486073674,
"grad_norm": 3.734375,
"learning_rate": 0.0001379702362858611,
"loss": 4.5275,
"step": 13430
},
{
"epoch": 6.035489667565139,
"grad_norm": 3.734375,
"learning_rate": 0.00013786769579055753,
"loss": 4.5684,
"step": 13435
},
{
"epoch": 6.037735849056604,
"grad_norm": 3.71875,
"learning_rate": 0.00013776517160501238,
"loss": 4.5337,
"step": 13440
},
{
"epoch": 6.039982030548068,
"grad_norm": 3.4375,
"learning_rate": 0.00013766266379085492,
"loss": 4.5921,
"step": 13445
},
{
"epoch": 6.042228212039533,
"grad_norm": 3.6875,
"learning_rate": 0.00013756017240970457,
"loss": 4.5239,
"step": 13450
},
{
"epoch": 6.044474393530997,
"grad_norm": 3.453125,
"learning_rate": 0.00013745769752317093,
"loss": 4.5291,
"step": 13455
},
{
"epoch": 6.046720575022462,
"grad_norm": 3.6875,
"learning_rate": 0.0001373552391928537,
"loss": 4.5423,
"step": 13460
},
{
"epoch": 6.048966756513926,
"grad_norm": 3.84375,
"learning_rate": 0.00013725279748034257,
"loss": 4.5376,
"step": 13465
},
{
"epoch": 6.051212938005391,
"grad_norm": 3.59375,
"learning_rate": 0.00013715037244721725,
"loss": 4.5239,
"step": 13470
},
{
"epoch": 6.053459119496855,
"grad_norm": 3.578125,
"learning_rate": 0.0001370479641550474,
"loss": 4.557,
"step": 13475
},
{
"epoch": 6.05570530098832,
"grad_norm": 3.75,
"learning_rate": 0.0001369455726653927,
"loss": 4.5622,
"step": 13480
},
{
"epoch": 6.057951482479784,
"grad_norm": 3.515625,
"learning_rate": 0.00013684319803980262,
"loss": 4.5015,
"step": 13485
},
{
"epoch": 6.060197663971249,
"grad_norm": 3.59375,
"learning_rate": 0.00013674084033981655,
"loss": 4.4932,
"step": 13490
},
{
"epoch": 6.062443845462713,
"grad_norm": 3.578125,
"learning_rate": 0.00013663849962696379,
"loss": 4.551,
"step": 13495
},
{
"epoch": 6.064690026954178,
"grad_norm": 4.53125,
"learning_rate": 0.0001365361759627632,
"loss": 4.5254,
"step": 13500
},
{
"epoch": 6.066936208445642,
"grad_norm": 3.6875,
"learning_rate": 0.00013643386940872363,
"loss": 4.5218,
"step": 13505
},
{
"epoch": 6.069182389937107,
"grad_norm": 3.796875,
"learning_rate": 0.00013633158002634356,
"loss": 4.5726,
"step": 13510
},
{
"epoch": 6.071428571428571,
"grad_norm": 3.59375,
"learning_rate": 0.0001362293078771111,
"loss": 4.4735,
"step": 13515
},
{
"epoch": 6.073674752920036,
"grad_norm": 3.703125,
"learning_rate": 0.00013612705302250405,
"loss": 4.5454,
"step": 13520
},
{
"epoch": 6.0759209344115,
"grad_norm": 3.765625,
"learning_rate": 0.00013602481552398983,
"loss": 4.5265,
"step": 13525
},
{
"epoch": 6.078167115902965,
"grad_norm": 3.6875,
"learning_rate": 0.0001359225954430253,
"loss": 4.5326,
"step": 13530
},
{
"epoch": 6.080413297394429,
"grad_norm": 3.78125,
"learning_rate": 0.00013582039284105706,
"loss": 4.5025,
"step": 13535
},
{
"epoch": 6.082659478885894,
"grad_norm": 3.71875,
"learning_rate": 0.00013571820777952105,
"loss": 4.5118,
"step": 13540
},
{
"epoch": 6.084905660377358,
"grad_norm": 3.703125,
"learning_rate": 0.00013561604031984268,
"loss": 4.4428,
"step": 13545
},
{
"epoch": 6.087151841868823,
"grad_norm": 3.640625,
"learning_rate": 0.0001355138905234369,
"loss": 4.5108,
"step": 13550
},
{
"epoch": 6.089398023360287,
"grad_norm": 3.6875,
"learning_rate": 0.00013541175845170785,
"loss": 4.4825,
"step": 13555
},
{
"epoch": 6.091644204851752,
"grad_norm": 3.328125,
"learning_rate": 0.00013530964416604913,
"loss": 4.5154,
"step": 13560
},
{
"epoch": 6.093890386343216,
"grad_norm": 3.65625,
"learning_rate": 0.0001352075477278436,
"loss": 4.5195,
"step": 13565
},
{
"epoch": 6.096136567834681,
"grad_norm": 3.484375,
"learning_rate": 0.00013510546919846358,
"loss": 4.5567,
"step": 13570
},
{
"epoch": 6.098382749326145,
"grad_norm": 3.34375,
"learning_rate": 0.0001350034086392703,
"loss": 4.5383,
"step": 13575
},
{
"epoch": 6.10062893081761,
"grad_norm": 3.78125,
"learning_rate": 0.00013490136611161448,
"loss": 4.5397,
"step": 13580
},
{
"epoch": 6.102875112309075,
"grad_norm": 3.65625,
"learning_rate": 0.00013479934167683579,
"loss": 4.4954,
"step": 13585
},
{
"epoch": 6.105121293800539,
"grad_norm": 3.53125,
"learning_rate": 0.00013469733539626315,
"loss": 4.4926,
"step": 13590
},
{
"epoch": 6.107367475292004,
"grad_norm": 3.796875,
"learning_rate": 0.00013459534733121448,
"loss": 4.5163,
"step": 13595
},
{
"epoch": 6.109613656783468,
"grad_norm": 3.703125,
"learning_rate": 0.00013449337754299688,
"loss": 4.5654,
"step": 13600
},
{
"epoch": 6.111859838274933,
"grad_norm": 3.65625,
"learning_rate": 0.00013439142609290633,
"loss": 4.5667,
"step": 13605
},
{
"epoch": 6.114106019766397,
"grad_norm": 3.84375,
"learning_rate": 0.00013428949304222787,
"loss": 4.5255,
"step": 13610
},
{
"epoch": 6.116352201257862,
"grad_norm": 3.78125,
"learning_rate": 0.00013418757845223546,
"loss": 4.5519,
"step": 13615
},
{
"epoch": 6.118598382749326,
"grad_norm": 3.515625,
"learning_rate": 0.00013408568238419186,
"loss": 4.5788,
"step": 13620
},
{
"epoch": 6.120844564240791,
"grad_norm": 3.4375,
"learning_rate": 0.00013398380489934892,
"loss": 4.5207,
"step": 13625
},
{
"epoch": 6.123090745732255,
"grad_norm": 3.75,
"learning_rate": 0.00013388194605894703,
"loss": 4.4663,
"step": 13630
},
{
"epoch": 6.12533692722372,
"grad_norm": 3.8125,
"learning_rate": 0.00013378010592421575,
"loss": 4.5232,
"step": 13635
},
{
"epoch": 6.127583108715184,
"grad_norm": 3.875,
"learning_rate": 0.00013367828455637296,
"loss": 4.48,
"step": 13640
},
{
"epoch": 6.129829290206649,
"grad_norm": 3.796875,
"learning_rate": 0.00013357648201662556,
"loss": 4.5684,
"step": 13645
},
{
"epoch": 6.132075471698113,
"grad_norm": 3.609375,
"learning_rate": 0.00013347469836616906,
"loss": 4.5237,
"step": 13650
},
{
"epoch": 6.134321653189578,
"grad_norm": 3.53125,
"learning_rate": 0.00013337293366618759,
"loss": 4.5358,
"step": 13655
},
{
"epoch": 6.136567834681042,
"grad_norm": 3.609375,
"learning_rate": 0.00013327118797785392,
"loss": 4.4897,
"step": 13660
},
{
"epoch": 6.138814016172507,
"grad_norm": 3.65625,
"learning_rate": 0.00013316946136232932,
"loss": 4.4809,
"step": 13665
},
{
"epoch": 6.141060197663971,
"grad_norm": 3.953125,
"learning_rate": 0.00013306775388076367,
"loss": 4.4886,
"step": 13670
},
{
"epoch": 6.143306379155436,
"grad_norm": 3.78125,
"learning_rate": 0.00013296606559429536,
"loss": 4.4976,
"step": 13675
},
{
"epoch": 6.1455525606469,
"grad_norm": 3.65625,
"learning_rate": 0.00013286439656405116,
"loss": 4.4976,
"step": 13680
},
{
"epoch": 6.147798742138365,
"grad_norm": 3.484375,
"learning_rate": 0.00013276274685114636,
"loss": 4.5344,
"step": 13685
},
{
"epoch": 6.150044923629829,
"grad_norm": 3.78125,
"learning_rate": 0.00013266111651668455,
"loss": 4.5032,
"step": 13690
},
{
"epoch": 6.152291105121294,
"grad_norm": 3.734375,
"learning_rate": 0.00013255950562175774,
"loss": 4.579,
"step": 13695
},
{
"epoch": 6.154537286612758,
"grad_norm": 3.640625,
"learning_rate": 0.00013245791422744616,
"loss": 4.5537,
"step": 13700
},
{
"epoch": 6.156783468104223,
"grad_norm": 3.859375,
"learning_rate": 0.00013235634239481848,
"loss": 4.493,
"step": 13705
},
{
"epoch": 6.159029649595688,
"grad_norm": 3.671875,
"learning_rate": 0.0001322547901849314,
"loss": 4.5021,
"step": 13710
},
{
"epoch": 6.161275831087152,
"grad_norm": 3.5625,
"learning_rate": 0.00013215325765883004,
"loss": 4.5263,
"step": 13715
},
{
"epoch": 6.163522012578617,
"grad_norm": 3.828125,
"learning_rate": 0.00013205174487754756,
"loss": 4.5108,
"step": 13720
},
{
"epoch": 6.165768194070081,
"grad_norm": 3.640625,
"learning_rate": 0.00013195025190210525,
"loss": 4.5393,
"step": 13725
},
{
"epoch": 6.168014375561546,
"grad_norm": 3.78125,
"learning_rate": 0.00013184877879351256,
"loss": 4.4821,
"step": 13730
},
{
"epoch": 6.17026055705301,
"grad_norm": 3.671875,
"learning_rate": 0.0001317473256127669,
"loss": 4.5444,
"step": 13735
},
{
"epoch": 6.172506738544475,
"grad_norm": 3.8125,
"learning_rate": 0.0001316458924208538,
"loss": 4.507,
"step": 13740
},
{
"epoch": 6.174752920035939,
"grad_norm": 3.5625,
"learning_rate": 0.00013154447927874675,
"loss": 4.5028,
"step": 13745
},
{
"epoch": 6.176999101527404,
"grad_norm": 3.65625,
"learning_rate": 0.00013144308624740713,
"loss": 4.4272,
"step": 13750
},
{
"epoch": 6.179245283018868,
"grad_norm": 3.484375,
"learning_rate": 0.00013134171338778433,
"loss": 4.5698,
"step": 13755
},
{
"epoch": 6.181491464510333,
"grad_norm": 3.859375,
"learning_rate": 0.0001312403607608155,
"loss": 4.4958,
"step": 13760
},
{
"epoch": 6.183737646001797,
"grad_norm": 3.625,
"learning_rate": 0.0001311390284274257,
"loss": 4.5115,
"step": 13765
},
{
"epoch": 6.185983827493262,
"grad_norm": 3.546875,
"learning_rate": 0.0001310377164485278,
"loss": 4.5206,
"step": 13770
},
{
"epoch": 6.188230008984726,
"grad_norm": 3.75,
"learning_rate": 0.00013093642488502238,
"loss": 4.5121,
"step": 13775
},
{
"epoch": 6.190476190476191,
"grad_norm": 3.8125,
"learning_rate": 0.00013083515379779784,
"loss": 4.5207,
"step": 13780
},
{
"epoch": 6.192722371967655,
"grad_norm": 3.6875,
"learning_rate": 0.00013073390324773012,
"loss": 4.5614,
"step": 13785
},
{
"epoch": 6.19496855345912,
"grad_norm": 3.65625,
"learning_rate": 0.00013063267329568295,
"loss": 4.4603,
"step": 13790
},
{
"epoch": 6.197214734950584,
"grad_norm": 3.65625,
"learning_rate": 0.0001305314640025077,
"loss": 4.5404,
"step": 13795
},
{
"epoch": 6.199460916442049,
"grad_norm": 3.65625,
"learning_rate": 0.00013043027542904308,
"loss": 4.5545,
"step": 13800
},
{
"epoch": 6.201707097933513,
"grad_norm": 3.765625,
"learning_rate": 0.0001303291076361157,
"loss": 4.6362,
"step": 13805
},
{
"epoch": 6.203953279424978,
"grad_norm": 3.796875,
"learning_rate": 0.0001302279606845394,
"loss": 4.4728,
"step": 13810
},
{
"epoch": 6.206199460916442,
"grad_norm": 3.734375,
"learning_rate": 0.0001301268346351156,
"loss": 4.5516,
"step": 13815
},
{
"epoch": 6.208445642407907,
"grad_norm": 3.765625,
"learning_rate": 0.00013002572954863315,
"loss": 4.5923,
"step": 13820
},
{
"epoch": 6.210691823899371,
"grad_norm": 3.875,
"learning_rate": 0.00012992464548586833,
"loss": 4.5001,
"step": 13825
},
{
"epoch": 6.212938005390836,
"grad_norm": 3.828125,
"learning_rate": 0.0001298235825075847,
"loss": 4.4834,
"step": 13830
},
{
"epoch": 6.2151841868823,
"grad_norm": 3.796875,
"learning_rate": 0.00012972254067453322,
"loss": 4.4975,
"step": 13835
},
{
"epoch": 6.217430368373765,
"grad_norm": 4.0,
"learning_rate": 0.00012962152004745208,
"loss": 4.5328,
"step": 13840
},
{
"epoch": 6.219676549865229,
"grad_norm": 3.90625,
"learning_rate": 0.00012952052068706678,
"loss": 4.4904,
"step": 13845
},
{
"epoch": 6.221922731356694,
"grad_norm": 3.84375,
"learning_rate": 0.00012941954265409004,
"loss": 4.6397,
"step": 13850
},
{
"epoch": 6.2241689128481585,
"grad_norm": 3.71875,
"learning_rate": 0.0001293185860092216,
"loss": 4.5542,
"step": 13855
},
{
"epoch": 6.226415094339623,
"grad_norm": 3.703125,
"learning_rate": 0.00012921765081314865,
"loss": 4.5427,
"step": 13860
},
{
"epoch": 6.2286612758310875,
"grad_norm": 3.9375,
"learning_rate": 0.0001291167371265452,
"loss": 4.5675,
"step": 13865
},
{
"epoch": 6.230907457322552,
"grad_norm": 3.6875,
"learning_rate": 0.00012901584501007248,
"loss": 4.556,
"step": 13870
},
{
"epoch": 6.2331536388140165,
"grad_norm": 4.0,
"learning_rate": 0.0001289149745243787,
"loss": 4.5741,
"step": 13875
},
{
"epoch": 6.235399820305481,
"grad_norm": 3.484375,
"learning_rate": 0.00012881412573009904,
"loss": 4.5604,
"step": 13880
},
{
"epoch": 6.2376460017969455,
"grad_norm": 3.4375,
"learning_rate": 0.00012871329868785572,
"loss": 4.5488,
"step": 13885
},
{
"epoch": 6.2398921832884096,
"grad_norm": 3.8125,
"learning_rate": 0.00012861249345825788,
"loss": 4.5015,
"step": 13890
},
{
"epoch": 6.2421383647798745,
"grad_norm": 3.71875,
"learning_rate": 0.00012851171010190148,
"loss": 4.4744,
"step": 13895
},
{
"epoch": 6.2443845462713385,
"grad_norm": 3.71875,
"learning_rate": 0.00012841094867936935,
"loss": 4.4872,
"step": 13900
},
{
"epoch": 6.2466307277628035,
"grad_norm": 3.6875,
"learning_rate": 0.00012831020925123117,
"loss": 4.5236,
"step": 13905
},
{
"epoch": 6.2488769092542675,
"grad_norm": 3.65625,
"learning_rate": 0.00012820949187804337,
"loss": 4.5709,
"step": 13910
},
{
"epoch": 6.2511230907457325,
"grad_norm": 3.59375,
"learning_rate": 0.00012810879662034915,
"loss": 4.5999,
"step": 13915
},
{
"epoch": 6.2533692722371965,
"grad_norm": 3.65625,
"learning_rate": 0.00012800812353867835,
"loss": 4.4844,
"step": 13920
},
{
"epoch": 6.2556154537286615,
"grad_norm": 3.96875,
"learning_rate": 0.0001279074726935476,
"loss": 4.5329,
"step": 13925
},
{
"epoch": 6.2578616352201255,
"grad_norm": 3.859375,
"learning_rate": 0.00012780684414546005,
"loss": 4.5309,
"step": 13930
},
{
"epoch": 6.2601078167115904,
"grad_norm": 3.859375,
"learning_rate": 0.0001277062379549055,
"loss": 4.5303,
"step": 13935
},
{
"epoch": 6.2623539982030545,
"grad_norm": 3.6875,
"learning_rate": 0.00012760565418236023,
"loss": 4.5207,
"step": 13940
},
{
"epoch": 6.264600179694519,
"grad_norm": 3.515625,
"learning_rate": 0.00012750509288828718,
"loss": 4.5325,
"step": 13945
},
{
"epoch": 6.2668463611859835,
"grad_norm": 3.515625,
"learning_rate": 0.00012740455413313574,
"loss": 4.5184,
"step": 13950
},
{
"epoch": 6.269092542677448,
"grad_norm": 3.84375,
"learning_rate": 0.00012730403797734172,
"loss": 4.5426,
"step": 13955
},
{
"epoch": 6.2713387241689125,
"grad_norm": 3.5625,
"learning_rate": 0.0001272035444813273,
"loss": 4.498,
"step": 13960
},
{
"epoch": 6.273584905660377,
"grad_norm": 3.953125,
"learning_rate": 0.000127103073705501,
"loss": 4.5535,
"step": 13965
},
{
"epoch": 6.2758310871518415,
"grad_norm": 3.671875,
"learning_rate": 0.00012700262571025789,
"loss": 4.4931,
"step": 13970
},
{
"epoch": 6.278077268643306,
"grad_norm": 3.78125,
"learning_rate": 0.0001269022005559792,
"loss": 4.531,
"step": 13975
},
{
"epoch": 6.280323450134771,
"grad_norm": 3.859375,
"learning_rate": 0.00012680179830303244,
"loss": 4.4948,
"step": 13980
},
{
"epoch": 6.282569631626235,
"grad_norm": 3.78125,
"learning_rate": 0.00012670141901177138,
"loss": 4.5464,
"step": 13985
},
{
"epoch": 6.2848158131177,
"grad_norm": 3.671875,
"learning_rate": 0.00012660106274253597,
"loss": 4.5584,
"step": 13990
},
{
"epoch": 6.287061994609164,
"grad_norm": 3.734375,
"learning_rate": 0.00012650072955565226,
"loss": 4.5407,
"step": 13995
},
{
"epoch": 6.289308176100629,
"grad_norm": 3.859375,
"learning_rate": 0.00012640041951143263,
"loss": 4.5669,
"step": 14000
},
{
"epoch": 6.289308176100629,
"eval_loss": 4.793929100036621,
"eval_runtime": 16.1025,
"eval_samples_per_second": 1925.975,
"eval_steps_per_second": 240.77,
"step": 14000
},
{
"epoch": 6.291554357592093,
"grad_norm": 3.921875,
"learning_rate": 0.00012630013267017528,
"loss": 4.5684,
"step": 14005
},
{
"epoch": 6.293800539083558,
"grad_norm": 3.6875,
"learning_rate": 0.00012619986909216465,
"loss": 4.5145,
"step": 14010
},
{
"epoch": 6.296046720575022,
"grad_norm": 3.609375,
"learning_rate": 0.00012609962883767113,
"loss": 4.5817,
"step": 14015
},
{
"epoch": 6.298292902066487,
"grad_norm": 3.796875,
"learning_rate": 0.00012599941196695107,
"loss": 4.5459,
"step": 14020
},
{
"epoch": 6.300539083557951,
"grad_norm": 3.625,
"learning_rate": 0.00012589921854024686,
"loss": 4.5078,
"step": 14025
},
{
"epoch": 6.302785265049416,
"grad_norm": 3.6875,
"learning_rate": 0.00012579904861778661,
"loss": 4.5486,
"step": 14030
},
{
"epoch": 6.30503144654088,
"grad_norm": 3.65625,
"learning_rate": 0.00012569890225978456,
"loss": 4.4713,
"step": 14035
},
{
"epoch": 6.307277628032345,
"grad_norm": 3.546875,
"learning_rate": 0.00012559877952644053,
"loss": 4.5605,
"step": 14040
},
{
"epoch": 6.309523809523809,
"grad_norm": 3.703125,
"learning_rate": 0.0001254986804779403,
"loss": 4.5235,
"step": 14045
},
{
"epoch": 6.311769991015274,
"grad_norm": 3.53125,
"learning_rate": 0.00012539860517445537,
"loss": 4.4912,
"step": 14050
},
{
"epoch": 6.314016172506738,
"grad_norm": 3.75,
"learning_rate": 0.00012529855367614294,
"loss": 4.5545,
"step": 14055
},
{
"epoch": 6.316262353998203,
"grad_norm": 3.484375,
"learning_rate": 0.0001251985260431459,
"loss": 4.5054,
"step": 14060
},
{
"epoch": 6.318508535489667,
"grad_norm": 3.703125,
"learning_rate": 0.00012509852233559286,
"loss": 4.45,
"step": 14065
},
{
"epoch": 6.320754716981132,
"grad_norm": 3.546875,
"learning_rate": 0.00012499854261359799,
"loss": 4.6074,
"step": 14070
},
{
"epoch": 6.323000898472596,
"grad_norm": 3.75,
"learning_rate": 0.00012489858693726108,
"loss": 4.5045,
"step": 14075
},
{
"epoch": 6.325247079964061,
"grad_norm": 3.703125,
"learning_rate": 0.0001247986553666674,
"loss": 4.5179,
"step": 14080
},
{
"epoch": 6.327493261455525,
"grad_norm": 3.90625,
"learning_rate": 0.00012469874796188778,
"loss": 4.5399,
"step": 14085
},
{
"epoch": 6.32973944294699,
"grad_norm": 3.484375,
"learning_rate": 0.0001245988647829785,
"loss": 4.5538,
"step": 14090
},
{
"epoch": 6.331985624438454,
"grad_norm": 3.71875,
"learning_rate": 0.00012449900588998132,
"loss": 4.5488,
"step": 14095
},
{
"epoch": 6.334231805929919,
"grad_norm": 3.96875,
"learning_rate": 0.00012439917134292336,
"loss": 4.5329,
"step": 14100
},
{
"epoch": 6.336477987421383,
"grad_norm": 4.0,
"learning_rate": 0.00012429936120181715,
"loss": 4.5304,
"step": 14105
},
{
"epoch": 6.338724168912848,
"grad_norm": 3.53125,
"learning_rate": 0.00012419957552666048,
"loss": 4.519,
"step": 14110
},
{
"epoch": 6.340970350404312,
"grad_norm": 3.65625,
"learning_rate": 0.0001240998143774365,
"loss": 4.5323,
"step": 14115
},
{
"epoch": 6.343216531895777,
"grad_norm": 3.9375,
"learning_rate": 0.0001240000778141135,
"loss": 4.5171,
"step": 14120
},
{
"epoch": 6.345462713387242,
"grad_norm": 4.125,
"learning_rate": 0.00012390036589664518,
"loss": 4.5038,
"step": 14125
},
{
"epoch": 6.347708894878706,
"grad_norm": 3.6875,
"learning_rate": 0.0001238006786849703,
"loss": 4.5938,
"step": 14130
},
{
"epoch": 6.349955076370171,
"grad_norm": 3.8125,
"learning_rate": 0.00012370101623901273,
"loss": 4.4651,
"step": 14135
},
{
"epoch": 6.352201257861635,
"grad_norm": 3.453125,
"learning_rate": 0.00012360137861868156,
"loss": 4.5762,
"step": 14140
},
{
"epoch": 6.3544474393531,
"grad_norm": 3.53125,
"learning_rate": 0.00012350176588387093,
"loss": 4.4847,
"step": 14145
},
{
"epoch": 6.356693620844564,
"grad_norm": 3.65625,
"learning_rate": 0.00012340217809446,
"loss": 4.5574,
"step": 14150
},
{
"epoch": 6.358939802336029,
"grad_norm": 3.59375,
"learning_rate": 0.00012330261531031287,
"loss": 4.5025,
"step": 14155
},
{
"epoch": 6.361185983827493,
"grad_norm": 3.890625,
"learning_rate": 0.00012320307759127876,
"loss": 4.5731,
"step": 14160
},
{
"epoch": 6.363432165318958,
"grad_norm": 3.84375,
"learning_rate": 0.0001231035649971917,
"loss": 4.5366,
"step": 14165
},
{
"epoch": 6.365678346810422,
"grad_norm": 3.78125,
"learning_rate": 0.00012300407758787066,
"loss": 4.4876,
"step": 14170
},
{
"epoch": 6.367924528301887,
"grad_norm": 3.703125,
"learning_rate": 0.00012290461542311946,
"loss": 4.5827,
"step": 14175
},
{
"epoch": 6.370170709793351,
"grad_norm": 3.59375,
"learning_rate": 0.00012280517856272675,
"loss": 4.4945,
"step": 14180
},
{
"epoch": 6.372416891284816,
"grad_norm": 3.90625,
"learning_rate": 0.000122705767066466,
"loss": 4.5056,
"step": 14185
},
{
"epoch": 6.37466307277628,
"grad_norm": 3.703125,
"learning_rate": 0.00012260638099409536,
"loss": 4.5234,
"step": 14190
},
{
"epoch": 6.376909254267745,
"grad_norm": 3.59375,
"learning_rate": 0.0001225070204053578,
"loss": 4.466,
"step": 14195
},
{
"epoch": 6.379155435759209,
"grad_norm": 3.640625,
"learning_rate": 0.00012240768535998084,
"loss": 4.4772,
"step": 14200
},
{
"epoch": 6.381401617250674,
"grad_norm": 3.90625,
"learning_rate": 0.00012230837591767672,
"loss": 4.5287,
"step": 14205
},
{
"epoch": 6.383647798742138,
"grad_norm": 3.8125,
"learning_rate": 0.00012220909213814235,
"loss": 4.4945,
"step": 14210
},
{
"epoch": 6.385893980233603,
"grad_norm": 3.640625,
"learning_rate": 0.00012210983408105915,
"loss": 4.4599,
"step": 14215
},
{
"epoch": 6.388140161725067,
"grad_norm": 3.578125,
"learning_rate": 0.000122010601806093,
"loss": 4.5246,
"step": 14220
},
{
"epoch": 6.390386343216532,
"grad_norm": 3.953125,
"learning_rate": 0.00012191139537289445,
"loss": 4.4865,
"step": 14225
},
{
"epoch": 6.392632524707996,
"grad_norm": 3.78125,
"learning_rate": 0.00012181221484109835,
"loss": 4.5183,
"step": 14230
},
{
"epoch": 6.394878706199461,
"grad_norm": 3.515625,
"learning_rate": 0.0001217130602703241,
"loss": 4.5526,
"step": 14235
},
{
"epoch": 6.397124887690925,
"grad_norm": 3.890625,
"learning_rate": 0.00012161393172017542,
"loss": 4.5366,
"step": 14240
},
{
"epoch": 6.39937106918239,
"grad_norm": 3.84375,
"learning_rate": 0.0001215148292502405,
"loss": 4.5064,
"step": 14245
},
{
"epoch": 6.401617250673855,
"grad_norm": 3.90625,
"learning_rate": 0.00012141575292009165,
"loss": 4.529,
"step": 14250
},
{
"epoch": 6.403863432165319,
"grad_norm": 3.609375,
"learning_rate": 0.00012131670278928569,
"loss": 4.5554,
"step": 14255
},
{
"epoch": 6.406109613656783,
"grad_norm": 3.796875,
"learning_rate": 0.00012121767891736353,
"loss": 4.5227,
"step": 14260
},
{
"epoch": 6.408355795148248,
"grad_norm": 3.5625,
"learning_rate": 0.00012111868136385037,
"loss": 4.5264,
"step": 14265
},
{
"epoch": 6.410601976639713,
"grad_norm": 3.8125,
"learning_rate": 0.00012101971018825564,
"loss": 4.5253,
"step": 14270
},
{
"epoch": 6.412848158131177,
"grad_norm": 3.703125,
"learning_rate": 0.00012092076545007273,
"loss": 4.5086,
"step": 14275
},
{
"epoch": 6.415094339622642,
"grad_norm": 3.609375,
"learning_rate": 0.00012082184720877934,
"loss": 4.5902,
"step": 14280
},
{
"epoch": 6.417340521114106,
"grad_norm": 3.640625,
"learning_rate": 0.00012072295552383708,
"loss": 4.5578,
"step": 14285
},
{
"epoch": 6.419586702605571,
"grad_norm": 3.859375,
"learning_rate": 0.00012062409045469175,
"loss": 4.5546,
"step": 14290
},
{
"epoch": 6.421832884097035,
"grad_norm": 3.65625,
"learning_rate": 0.00012052525206077305,
"loss": 4.5276,
"step": 14295
},
{
"epoch": 6.4240790655885,
"grad_norm": 3.515625,
"learning_rate": 0.0001204264404014947,
"loss": 4.5656,
"step": 14300
},
{
"epoch": 6.426325247079964,
"grad_norm": 3.609375,
"learning_rate": 0.00012032765553625428,
"loss": 4.5248,
"step": 14305
},
{
"epoch": 6.428571428571429,
"grad_norm": 3.765625,
"learning_rate": 0.0001202288975244333,
"loss": 4.5503,
"step": 14310
},
{
"epoch": 6.430817610062893,
"grad_norm": 3.703125,
"learning_rate": 0.00012013016642539715,
"loss": 4.5641,
"step": 14315
},
{
"epoch": 6.433063791554358,
"grad_norm": 3.734375,
"learning_rate": 0.00012003146229849505,
"loss": 4.4994,
"step": 14320
},
{
"epoch": 6.435309973045822,
"grad_norm": 3.671875,
"learning_rate": 0.00011993278520305992,
"loss": 4.5326,
"step": 14325
},
{
"epoch": 6.437556154537287,
"grad_norm": 3.890625,
"learning_rate": 0.00011983413519840854,
"loss": 4.4685,
"step": 14330
},
{
"epoch": 6.439802336028751,
"grad_norm": 3.859375,
"learning_rate": 0.00011973551234384135,
"loss": 4.5315,
"step": 14335
},
{
"epoch": 6.442048517520216,
"grad_norm": 3.640625,
"learning_rate": 0.00011963691669864244,
"loss": 4.4935,
"step": 14340
},
{
"epoch": 6.44429469901168,
"grad_norm": 3.796875,
"learning_rate": 0.00011953834832207957,
"loss": 4.5003,
"step": 14345
},
{
"epoch": 6.446540880503145,
"grad_norm": 3.953125,
"learning_rate": 0.0001194398072734042,
"loss": 4.5085,
"step": 14350
},
{
"epoch": 6.448787061994609,
"grad_norm": 3.609375,
"learning_rate": 0.00011934129361185124,
"loss": 4.4885,
"step": 14355
},
{
"epoch": 6.451033243486074,
"grad_norm": 3.765625,
"learning_rate": 0.00011924280739663914,
"loss": 4.4973,
"step": 14360
},
{
"epoch": 6.453279424977538,
"grad_norm": 3.734375,
"learning_rate": 0.00011914434868696995,
"loss": 4.4664,
"step": 14365
},
{
"epoch": 6.455525606469003,
"grad_norm": 4.03125,
"learning_rate": 0.00011904591754202906,
"loss": 4.5713,
"step": 14370
},
{
"epoch": 6.457771787960467,
"grad_norm": 3.609375,
"learning_rate": 0.0001189475140209854,
"loss": 4.5177,
"step": 14375
},
{
"epoch": 6.460017969451932,
"grad_norm": 3.375,
"learning_rate": 0.00011884913818299123,
"loss": 4.4828,
"step": 14380
},
{
"epoch": 6.462264150943396,
"grad_norm": 3.75,
"learning_rate": 0.00011875079008718222,
"loss": 4.61,
"step": 14385
},
{
"epoch": 6.464510332434861,
"grad_norm": 3.640625,
"learning_rate": 0.00011865246979267728,
"loss": 4.5747,
"step": 14390
},
{
"epoch": 6.466756513926326,
"grad_norm": 3.9375,
"learning_rate": 0.0001185541773585787,
"loss": 4.5184,
"step": 14395
},
{
"epoch": 6.46900269541779,
"grad_norm": 3.5625,
"learning_rate": 0.000118455912843972,
"loss": 4.5603,
"step": 14400
},
{
"epoch": 6.471248876909255,
"grad_norm": 4.09375,
"learning_rate": 0.00011835767630792586,
"loss": 4.5171,
"step": 14405
},
{
"epoch": 6.473495058400719,
"grad_norm": 3.875,
"learning_rate": 0.00011825946780949216,
"loss": 4.5148,
"step": 14410
},
{
"epoch": 6.475741239892184,
"grad_norm": 3.59375,
"learning_rate": 0.00011816128740770604,
"loss": 4.5804,
"step": 14415
},
{
"epoch": 6.477987421383648,
"grad_norm": 3.90625,
"learning_rate": 0.00011806313516158559,
"loss": 4.5033,
"step": 14420
},
{
"epoch": 6.480233602875113,
"grad_norm": 3.859375,
"learning_rate": 0.00011796501113013204,
"loss": 4.5177,
"step": 14425
},
{
"epoch": 6.482479784366577,
"grad_norm": 3.890625,
"learning_rate": 0.00011786691537232975,
"loss": 4.5909,
"step": 14430
},
{
"epoch": 6.484725965858042,
"grad_norm": 3.625,
"learning_rate": 0.00011776884794714586,
"loss": 4.5206,
"step": 14435
},
{
"epoch": 6.486972147349506,
"grad_norm": 3.6875,
"learning_rate": 0.00011767080891353069,
"loss": 4.5884,
"step": 14440
},
{
"epoch": 6.489218328840971,
"grad_norm": 3.71875,
"learning_rate": 0.00011757279833041742,
"loss": 4.4961,
"step": 14445
},
{
"epoch": 6.491464510332435,
"grad_norm": 3.703125,
"learning_rate": 0.00011747481625672212,
"loss": 4.5385,
"step": 14450
},
{
"epoch": 6.4937106918239,
"grad_norm": 3.859375,
"learning_rate": 0.00011737686275134372,
"loss": 4.494,
"step": 14455
},
{
"epoch": 6.495956873315364,
"grad_norm": 3.5,
"learning_rate": 0.00011727893787316402,
"loss": 4.551,
"step": 14460
},
{
"epoch": 6.498203054806829,
"grad_norm": 3.65625,
"learning_rate": 0.00011718104168104756,
"loss": 4.4896,
"step": 14465
},
{
"epoch": 6.500449236298293,
"grad_norm": 3.671875,
"learning_rate": 0.00011708317423384163,
"loss": 4.578,
"step": 14470
},
{
"epoch": 6.502695417789758,
"grad_norm": 3.625,
"learning_rate": 0.00011698533559037628,
"loss": 4.4863,
"step": 14475
},
{
"epoch": 6.504941599281222,
"grad_norm": 3.609375,
"learning_rate": 0.00011688752580946425,
"loss": 4.5179,
"step": 14480
},
{
"epoch": 6.507187780772687,
"grad_norm": 3.6875,
"learning_rate": 0.00011678974494990092,
"loss": 4.5397,
"step": 14485
},
{
"epoch": 6.509433962264151,
"grad_norm": 3.609375,
"learning_rate": 0.0001166919930704642,
"loss": 4.5018,
"step": 14490
},
{
"epoch": 6.5116801437556155,
"grad_norm": 3.625,
"learning_rate": 0.00011659427022991474,
"loss": 4.447,
"step": 14495
},
{
"epoch": 6.51392632524708,
"grad_norm": 3.90625,
"learning_rate": 0.00011649657648699564,
"loss": 4.5113,
"step": 14500
},
{
"epoch": 6.5161725067385445,
"grad_norm": 3.578125,
"learning_rate": 0.00011639891190043248,
"loss": 4.4485,
"step": 14505
},
{
"epoch": 6.518418688230009,
"grad_norm": 3.65625,
"learning_rate": 0.00011630127652893336,
"loss": 4.5122,
"step": 14510
},
{
"epoch": 6.5206648697214735,
"grad_norm": 3.859375,
"learning_rate": 0.00011620367043118884,
"loss": 4.538,
"step": 14515
},
{
"epoch": 6.5229110512129385,
"grad_norm": 4.0,
"learning_rate": 0.00011610609366587179,
"loss": 4.5288,
"step": 14520
},
{
"epoch": 6.5251572327044025,
"grad_norm": 3.84375,
"learning_rate": 0.00011600854629163758,
"loss": 4.4971,
"step": 14525
},
{
"epoch": 6.527403414195867,
"grad_norm": 3.796875,
"learning_rate": 0.00011591102836712383,
"loss": 4.5613,
"step": 14530
},
{
"epoch": 6.5296495956873315,
"grad_norm": 3.703125,
"learning_rate": 0.00011581353995095046,
"loss": 4.5092,
"step": 14535
},
{
"epoch": 6.531895777178796,
"grad_norm": 3.84375,
"learning_rate": 0.00011571608110171965,
"loss": 4.5608,
"step": 14540
},
{
"epoch": 6.5341419586702605,
"grad_norm": 3.65625,
"learning_rate": 0.00011561865187801587,
"loss": 4.5326,
"step": 14545
},
{
"epoch": 6.536388140161725,
"grad_norm": 3.75,
"learning_rate": 0.00011552125233840563,
"loss": 4.5099,
"step": 14550
},
{
"epoch": 6.5386343216531895,
"grad_norm": 3.515625,
"learning_rate": 0.00011542388254143775,
"loss": 4.518,
"step": 14555
},
{
"epoch": 6.540880503144654,
"grad_norm": 3.9375,
"learning_rate": 0.00011532654254564316,
"loss": 4.602,
"step": 14560
},
{
"epoch": 6.5431266846361185,
"grad_norm": 3.84375,
"learning_rate": 0.0001152292324095348,
"loss": 4.5577,
"step": 14565
},
{
"epoch": 6.545372866127583,
"grad_norm": 3.703125,
"learning_rate": 0.0001151319521916077,
"loss": 4.5812,
"step": 14570
},
{
"epoch": 6.5476190476190474,
"grad_norm": 3.703125,
"learning_rate": 0.00011503470195033893,
"loss": 4.4876,
"step": 14575
},
{
"epoch": 6.549865229110512,
"grad_norm": 3.859375,
"learning_rate": 0.00011493748174418742,
"loss": 4.4887,
"step": 14580
},
{
"epoch": 6.552111410601976,
"grad_norm": 3.453125,
"learning_rate": 0.00011484029163159424,
"loss": 4.5311,
"step": 14585
},
{
"epoch": 6.554357592093441,
"grad_norm": 3.78125,
"learning_rate": 0.00011474313167098222,
"loss": 4.5097,
"step": 14590
},
{
"epoch": 6.556603773584905,
"grad_norm": 3.859375,
"learning_rate": 0.00011464600192075608,
"loss": 4.5612,
"step": 14595
},
{
"epoch": 6.55884995507637,
"grad_norm": 3.609375,
"learning_rate": 0.0001145489024393024,
"loss": 4.556,
"step": 14600
},
{
"epoch": 6.561096136567834,
"grad_norm": 3.5625,
"learning_rate": 0.00011445183328498965,
"loss": 4.5377,
"step": 14605
},
{
"epoch": 6.563342318059299,
"grad_norm": 3.828125,
"learning_rate": 0.00011435479451616801,
"loss": 4.498,
"step": 14610
},
{
"epoch": 6.565588499550763,
"grad_norm": 3.640625,
"learning_rate": 0.00011425778619116928,
"loss": 4.5996,
"step": 14615
},
{
"epoch": 6.567834681042228,
"grad_norm": 3.59375,
"learning_rate": 0.00011416080836830717,
"loss": 4.5207,
"step": 14620
},
{
"epoch": 6.570080862533692,
"grad_norm": 3.703125,
"learning_rate": 0.00011406386110587684,
"loss": 4.5385,
"step": 14625
},
{
"epoch": 6.572327044025157,
"grad_norm": 3.65625,
"learning_rate": 0.00011396694446215525,
"loss": 4.5183,
"step": 14630
},
{
"epoch": 6.574573225516621,
"grad_norm": 3.546875,
"learning_rate": 0.00011387005849540086,
"loss": 4.6191,
"step": 14635
},
{
"epoch": 6.576819407008086,
"grad_norm": 3.65625,
"learning_rate": 0.00011377320326385376,
"loss": 4.5576,
"step": 14640
},
{
"epoch": 6.579065588499551,
"grad_norm": 3.71875,
"learning_rate": 0.00011367637882573548,
"loss": 4.5206,
"step": 14645
},
{
"epoch": 6.581311769991015,
"grad_norm": 3.65625,
"learning_rate": 0.00011357958523924913,
"loss": 4.5109,
"step": 14650
},
{
"epoch": 6.583557951482479,
"grad_norm": 3.640625,
"learning_rate": 0.00011348282256257918,
"loss": 4.5546,
"step": 14655
},
{
"epoch": 6.585804132973944,
"grad_norm": 3.5625,
"learning_rate": 0.00011338609085389158,
"loss": 4.4889,
"step": 14660
},
{
"epoch": 6.588050314465409,
"grad_norm": 3.734375,
"learning_rate": 0.00011328939017133358,
"loss": 4.5036,
"step": 14665
},
{
"epoch": 6.590296495956873,
"grad_norm": 3.609375,
"learning_rate": 0.000113192720573034,
"loss": 4.5495,
"step": 14670
},
{
"epoch": 6.592542677448337,
"grad_norm": 3.859375,
"learning_rate": 0.00011309608211710271,
"loss": 4.5405,
"step": 14675
},
{
"epoch": 6.594788858939802,
"grad_norm": 3.65625,
"learning_rate": 0.00011299947486163105,
"loss": 4.4951,
"step": 14680
},
{
"epoch": 6.597035040431267,
"grad_norm": 3.4375,
"learning_rate": 0.00011290289886469147,
"loss": 4.5056,
"step": 14685
},
{
"epoch": 6.599281221922731,
"grad_norm": 3.375,
"learning_rate": 0.00011280635418433776,
"loss": 4.5174,
"step": 14690
},
{
"epoch": 6.601527403414196,
"grad_norm": 3.578125,
"learning_rate": 0.00011270984087860467,
"loss": 4.5037,
"step": 14695
},
{
"epoch": 6.60377358490566,
"grad_norm": 3.71875,
"learning_rate": 0.00011261335900550839,
"loss": 4.5719,
"step": 14700
},
{
"epoch": 6.606019766397125,
"grad_norm": 3.78125,
"learning_rate": 0.000112516908623046,
"loss": 4.5371,
"step": 14705
},
{
"epoch": 6.608265947888589,
"grad_norm": 3.734375,
"learning_rate": 0.0001124204897891957,
"loss": 4.5445,
"step": 14710
},
{
"epoch": 6.610512129380054,
"grad_norm": 3.578125,
"learning_rate": 0.00011232410256191677,
"loss": 4.5235,
"step": 14715
},
{
"epoch": 6.612758310871518,
"grad_norm": 3.9375,
"learning_rate": 0.00011222774699914941,
"loss": 4.5053,
"step": 14720
},
{
"epoch": 6.615004492362983,
"grad_norm": 3.71875,
"learning_rate": 0.00011213142315881486,
"loss": 4.6242,
"step": 14725
},
{
"epoch": 6.617250673854447,
"grad_norm": 3.859375,
"learning_rate": 0.00011203513109881524,
"loss": 4.5777,
"step": 14730
},
{
"epoch": 6.619496855345912,
"grad_norm": 3.9375,
"learning_rate": 0.00011193887087703363,
"loss": 4.4892,
"step": 14735
},
{
"epoch": 6.621743036837376,
"grad_norm": 3.828125,
"learning_rate": 0.00011184264255133388,
"loss": 4.5864,
"step": 14740
},
{
"epoch": 6.623989218328841,
"grad_norm": 3.5625,
"learning_rate": 0.00011174644617956081,
"loss": 4.537,
"step": 14745
},
{
"epoch": 6.626235399820305,
"grad_norm": 3.65625,
"learning_rate": 0.00011165028181953985,
"loss": 4.5378,
"step": 14750
},
{
"epoch": 6.62848158131177,
"grad_norm": 3.578125,
"learning_rate": 0.00011155414952907728,
"loss": 4.5097,
"step": 14755
},
{
"epoch": 6.630727762803234,
"grad_norm": 3.953125,
"learning_rate": 0.00011145804936596011,
"loss": 4.5378,
"step": 14760
},
{
"epoch": 6.632973944294699,
"grad_norm": 4.0,
"learning_rate": 0.00011136198138795606,
"loss": 4.6784,
"step": 14765
},
{
"epoch": 6.635220125786163,
"grad_norm": 3.78125,
"learning_rate": 0.00011126594565281345,
"loss": 4.595,
"step": 14770
},
{
"epoch": 6.637466307277628,
"grad_norm": 4.0,
"learning_rate": 0.00011116994221826121,
"loss": 4.5158,
"step": 14775
},
{
"epoch": 6.639712488769092,
"grad_norm": 3.921875,
"learning_rate": 0.00011107397114200892,
"loss": 4.5673,
"step": 14780
},
{
"epoch": 6.641958670260557,
"grad_norm": 3.765625,
"learning_rate": 0.00011097803248174664,
"loss": 4.5942,
"step": 14785
},
{
"epoch": 6.644204851752022,
"grad_norm": 3.625,
"learning_rate": 0.00011088212629514502,
"loss": 4.5581,
"step": 14790
},
{
"epoch": 6.646451033243486,
"grad_norm": 3.828125,
"learning_rate": 0.00011078625263985509,
"loss": 4.6212,
"step": 14795
},
{
"epoch": 6.64869721473495,
"grad_norm": 4.0625,
"learning_rate": 0.0001106904115735084,
"loss": 4.4868,
"step": 14800
},
{
"epoch": 6.650943396226415,
"grad_norm": 3.890625,
"learning_rate": 0.00011059460315371693,
"loss": 4.4921,
"step": 14805
},
{
"epoch": 6.65318957771788,
"grad_norm": 3.625,
"learning_rate": 0.00011049882743807289,
"loss": 4.5305,
"step": 14810
},
{
"epoch": 6.655435759209344,
"grad_norm": 3.703125,
"learning_rate": 0.00011040308448414901,
"loss": 4.4816,
"step": 14815
},
{
"epoch": 6.657681940700809,
"grad_norm": 3.734375,
"learning_rate": 0.00011030737434949829,
"loss": 4.5401,
"step": 14820
},
{
"epoch": 6.659928122192273,
"grad_norm": 3.59375,
"learning_rate": 0.00011021169709165386,
"loss": 4.5575,
"step": 14825
},
{
"epoch": 6.662174303683738,
"grad_norm": 3.65625,
"learning_rate": 0.00011011605276812926,
"loss": 4.5486,
"step": 14830
},
{
"epoch": 6.664420485175202,
"grad_norm": 3.8125,
"learning_rate": 0.00011002044143641815,
"loss": 4.5209,
"step": 14835
},
{
"epoch": 6.666666666666667,
"grad_norm": 3.75,
"learning_rate": 0.00010992486315399431,
"loss": 4.5942,
"step": 14840
},
{
"epoch": 6.668912848158131,
"grad_norm": 3.828125,
"learning_rate": 0.00010982931797831182,
"loss": 4.4669,
"step": 14845
},
{
"epoch": 6.671159029649596,
"grad_norm": 3.890625,
"learning_rate": 0.00010973380596680472,
"loss": 4.5182,
"step": 14850
},
{
"epoch": 6.67340521114106,
"grad_norm": 3.765625,
"learning_rate": 0.00010963832717688711,
"loss": 4.4873,
"step": 14855
},
{
"epoch": 6.675651392632525,
"grad_norm": 3.828125,
"learning_rate": 0.00010954288166595314,
"loss": 4.528,
"step": 14860
},
{
"epoch": 6.677897574123989,
"grad_norm": 3.734375,
"learning_rate": 0.00010944746949137705,
"loss": 4.5011,
"step": 14865
},
{
"epoch": 6.680143755615454,
"grad_norm": 3.921875,
"learning_rate": 0.00010935209071051289,
"loss": 4.4619,
"step": 14870
},
{
"epoch": 6.682389937106918,
"grad_norm": 3.75,
"learning_rate": 0.00010925674538069476,
"loss": 4.6037,
"step": 14875
},
{
"epoch": 6.684636118598383,
"grad_norm": 3.625,
"learning_rate": 0.00010916143355923657,
"loss": 4.5853,
"step": 14880
},
{
"epoch": 6.686882300089847,
"grad_norm": 3.5,
"learning_rate": 0.00010906615530343216,
"loss": 4.5759,
"step": 14885
},
{
"epoch": 6.689128481581312,
"grad_norm": 3.65625,
"learning_rate": 0.00010897091067055507,
"loss": 4.4407,
"step": 14890
},
{
"epoch": 6.691374663072776,
"grad_norm": 3.640625,
"learning_rate": 0.00010887569971785877,
"loss": 4.4905,
"step": 14895
},
{
"epoch": 6.693620844564241,
"grad_norm": 3.8125,
"learning_rate": 0.00010878052250257651,
"loss": 4.5457,
"step": 14900
},
{
"epoch": 6.695867026055705,
"grad_norm": 3.8125,
"learning_rate": 0.00010868537908192096,
"loss": 4.4999,
"step": 14905
},
{
"epoch": 6.69811320754717,
"grad_norm": 3.6875,
"learning_rate": 0.0001085902695130849,
"loss": 4.5304,
"step": 14910
},
{
"epoch": 6.700359389038635,
"grad_norm": 3.484375,
"learning_rate": 0.0001084951938532404,
"loss": 4.5785,
"step": 14915
},
{
"epoch": 6.702605570530099,
"grad_norm": 3.78125,
"learning_rate": 0.00010840015215953941,
"loss": 4.5099,
"step": 14920
},
{
"epoch": 6.704851752021563,
"grad_norm": 3.515625,
"learning_rate": 0.00010830514448911326,
"loss": 4.5237,
"step": 14925
},
{
"epoch": 6.707097933513028,
"grad_norm": 3.6875,
"learning_rate": 0.00010821017089907299,
"loss": 4.5074,
"step": 14930
},
{
"epoch": 6.709344115004493,
"grad_norm": 3.828125,
"learning_rate": 0.000108115231446509,
"loss": 4.5812,
"step": 14935
},
{
"epoch": 6.711590296495957,
"grad_norm": 3.90625,
"learning_rate": 0.0001080203261884913,
"loss": 4.5318,
"step": 14940
},
{
"epoch": 6.713836477987421,
"grad_norm": 3.703125,
"learning_rate": 0.00010792545518206936,
"loss": 4.5566,
"step": 14945
},
{
"epoch": 6.716082659478886,
"grad_norm": 3.5625,
"learning_rate": 0.00010783061848427187,
"loss": 4.4695,
"step": 14950
},
{
"epoch": 6.718328840970351,
"grad_norm": 3.671875,
"learning_rate": 0.00010773581615210714,
"loss": 4.5218,
"step": 14955
},
{
"epoch": 6.720575022461815,
"grad_norm": 3.671875,
"learning_rate": 0.00010764104824256261,
"loss": 4.4921,
"step": 14960
},
{
"epoch": 6.72282120395328,
"grad_norm": 3.65625,
"learning_rate": 0.0001075463148126052,
"loss": 4.4961,
"step": 14965
},
{
"epoch": 6.725067385444744,
"grad_norm": 3.984375,
"learning_rate": 0.00010745161591918092,
"loss": 4.4951,
"step": 14970
},
{
"epoch": 6.727313566936209,
"grad_norm": 3.8125,
"learning_rate": 0.00010735695161921522,
"loss": 4.5504,
"step": 14975
},
{
"epoch": 6.729559748427673,
"grad_norm": 3.875,
"learning_rate": 0.00010726232196961269,
"loss": 4.5075,
"step": 14980
},
{
"epoch": 6.731805929919138,
"grad_norm": 3.828125,
"learning_rate": 0.00010716772702725692,
"loss": 4.5834,
"step": 14985
},
{
"epoch": 6.734052111410602,
"grad_norm": 3.484375,
"learning_rate": 0.00010707316684901095,
"loss": 4.4663,
"step": 14990
},
{
"epoch": 6.736298292902067,
"grad_norm": 4.03125,
"learning_rate": 0.00010697864149171663,
"loss": 4.4827,
"step": 14995
},
{
"epoch": 6.738544474393531,
"grad_norm": 3.75,
"learning_rate": 0.00010688415101219502,
"loss": 4.5397,
"step": 15000
},
{
"epoch": 6.738544474393531,
"eval_loss": 4.787689685821533,
"eval_runtime": 16.0548,
"eval_samples_per_second": 1931.7,
"eval_steps_per_second": 241.486,
"step": 15000
},
{
"epoch": 6.740790655884996,
"grad_norm": 3.8125,
"learning_rate": 0.00010678969546724628,
"loss": 4.5253,
"step": 15005
},
{
"epoch": 6.74303683737646,
"grad_norm": 3.765625,
"learning_rate": 0.00010669527491364935,
"loss": 4.5435,
"step": 15010
},
{
"epoch": 6.745283018867925,
"grad_norm": 3.65625,
"learning_rate": 0.00010660088940816236,
"loss": 4.4933,
"step": 15015
},
{
"epoch": 6.747529200359389,
"grad_norm": 3.734375,
"learning_rate": 0.00010650653900752224,
"loss": 4.541,
"step": 15020
},
{
"epoch": 6.749775381850854,
"grad_norm": 3.5625,
"learning_rate": 0.00010641222376844495,
"loss": 4.5388,
"step": 15025
},
{
"epoch": 6.752021563342318,
"grad_norm": 3.703125,
"learning_rate": 0.00010631794374762507,
"loss": 4.552,
"step": 15030
},
{
"epoch": 6.754267744833783,
"grad_norm": 3.625,
"learning_rate": 0.00010622369900173626,
"loss": 4.518,
"step": 15035
},
{
"epoch": 6.756513926325247,
"grad_norm": 3.671875,
"learning_rate": 0.00010612948958743091,
"loss": 4.5602,
"step": 15040
},
{
"epoch": 6.758760107816712,
"grad_norm": 3.84375,
"learning_rate": 0.00010603531556134006,
"loss": 4.5835,
"step": 15045
},
{
"epoch": 6.761006289308176,
"grad_norm": 4.0625,
"learning_rate": 0.00010594117698007362,
"loss": 4.5293,
"step": 15050
},
{
"epoch": 6.763252470799641,
"grad_norm": 3.734375,
"learning_rate": 0.00010584707390022008,
"loss": 4.4825,
"step": 15055
},
{
"epoch": 6.765498652291106,
"grad_norm": 3.8125,
"learning_rate": 0.0001057530063783467,
"loss": 4.5615,
"step": 15060
},
{
"epoch": 6.76774483378257,
"grad_norm": 3.71875,
"learning_rate": 0.00010565897447099929,
"loss": 4.5061,
"step": 15065
},
{
"epoch": 6.769991015274034,
"grad_norm": 3.75,
"learning_rate": 0.00010556497823470215,
"loss": 4.4721,
"step": 15070
},
{
"epoch": 6.772237196765499,
"grad_norm": 3.9375,
"learning_rate": 0.00010547101772595847,
"loss": 4.5282,
"step": 15075
},
{
"epoch": 6.774483378256964,
"grad_norm": 3.703125,
"learning_rate": 0.00010537709300124956,
"loss": 4.5139,
"step": 15080
},
{
"epoch": 6.776729559748428,
"grad_norm": 3.859375,
"learning_rate": 0.00010528320411703548,
"loss": 4.5012,
"step": 15085
},
{
"epoch": 6.7789757412398925,
"grad_norm": 3.859375,
"learning_rate": 0.00010518935112975469,
"loss": 4.5018,
"step": 15090
},
{
"epoch": 6.781221922731357,
"grad_norm": 3.734375,
"learning_rate": 0.00010509553409582404,
"loss": 4.5257,
"step": 15095
},
{
"epoch": 6.7834681042228215,
"grad_norm": 3.75,
"learning_rate": 0.0001050017530716388,
"loss": 4.5579,
"step": 15100
},
{
"epoch": 6.785714285714286,
"grad_norm": 3.8125,
"learning_rate": 0.00010490800811357252,
"loss": 4.4895,
"step": 15105
},
{
"epoch": 6.7879604672057505,
"grad_norm": 4.03125,
"learning_rate": 0.00010481429927797716,
"loss": 4.4847,
"step": 15110
},
{
"epoch": 6.790206648697215,
"grad_norm": 3.5625,
"learning_rate": 0.00010472062662118303,
"loss": 4.4448,
"step": 15115
},
{
"epoch": 6.7924528301886795,
"grad_norm": 3.796875,
"learning_rate": 0.00010462699019949839,
"loss": 4.5179,
"step": 15120
},
{
"epoch": 6.794699011680144,
"grad_norm": 3.71875,
"learning_rate": 0.00010453339006921012,
"loss": 4.5188,
"step": 15125
},
{
"epoch": 6.7969451931716085,
"grad_norm": 3.796875,
"learning_rate": 0.00010443982628658295,
"loss": 4.5691,
"step": 15130
},
{
"epoch": 6.7991913746630726,
"grad_norm": 3.78125,
"learning_rate": 0.00010434629890786,
"loss": 4.4779,
"step": 15135
},
{
"epoch": 6.8014375561545375,
"grad_norm": 3.625,
"learning_rate": 0.00010425280798926233,
"loss": 4.5947,
"step": 15140
},
{
"epoch": 6.8036837376460015,
"grad_norm": 3.875,
"learning_rate": 0.00010415935358698916,
"loss": 4.5423,
"step": 15145
},
{
"epoch": 6.8059299191374665,
"grad_norm": 4.125,
"learning_rate": 0.00010406593575721785,
"loss": 4.5479,
"step": 15150
},
{
"epoch": 6.8081761006289305,
"grad_norm": 3.640625,
"learning_rate": 0.00010397255455610357,
"loss": 4.5018,
"step": 15155
},
{
"epoch": 6.8104222821203955,
"grad_norm": 3.859375,
"learning_rate": 0.00010387921003977968,
"loss": 4.5712,
"step": 15160
},
{
"epoch": 6.8126684636118595,
"grad_norm": 3.65625,
"learning_rate": 0.00010378590226435731,
"loss": 4.5021,
"step": 15165
},
{
"epoch": 6.8149146451033245,
"grad_norm": 3.796875,
"learning_rate": 0.00010369263128592566,
"loss": 4.5527,
"step": 15170
},
{
"epoch": 6.8171608265947885,
"grad_norm": 3.640625,
"learning_rate": 0.00010359939716055165,
"loss": 4.4868,
"step": 15175
},
{
"epoch": 6.819407008086253,
"grad_norm": 3.734375,
"learning_rate": 0.00010350619994428019,
"loss": 4.5061,
"step": 15180
},
{
"epoch": 6.821653189577718,
"grad_norm": 3.859375,
"learning_rate": 0.00010341303969313401,
"loss": 4.5157,
"step": 15185
},
{
"epoch": 6.823899371069182,
"grad_norm": 3.796875,
"learning_rate": 0.00010331991646311347,
"loss": 4.5326,
"step": 15190
},
{
"epoch": 6.8261455525606465,
"grad_norm": 3.484375,
"learning_rate": 0.00010322683031019678,
"loss": 4.578,
"step": 15195
},
{
"epoch": 6.828391734052111,
"grad_norm": 3.703125,
"learning_rate": 0.00010313378129033985,
"loss": 4.5518,
"step": 15200
},
{
"epoch": 6.830637915543576,
"grad_norm": 4.09375,
"learning_rate": 0.00010304076945947624,
"loss": 4.5308,
"step": 15205
},
{
"epoch": 6.83288409703504,
"grad_norm": 3.609375,
"learning_rate": 0.00010294779487351727,
"loss": 4.5058,
"step": 15210
},
{
"epoch": 6.8351302785265045,
"grad_norm": 3.5625,
"learning_rate": 0.00010285485758835168,
"loss": 4.5244,
"step": 15215
},
{
"epoch": 6.837376460017969,
"grad_norm": 3.71875,
"learning_rate": 0.00010276195765984605,
"loss": 4.5473,
"step": 15220
},
{
"epoch": 6.839622641509434,
"grad_norm": 3.734375,
"learning_rate": 0.00010266909514384407,
"loss": 4.5875,
"step": 15225
},
{
"epoch": 6.841868823000898,
"grad_norm": 3.84375,
"learning_rate": 0.00010257627009616741,
"loss": 4.5284,
"step": 15230
},
{
"epoch": 6.844115004492363,
"grad_norm": 3.640625,
"learning_rate": 0.000102483482572615,
"loss": 4.4934,
"step": 15235
},
{
"epoch": 6.846361185983827,
"grad_norm": 3.71875,
"learning_rate": 0.00010239073262896317,
"loss": 4.5229,
"step": 15240
},
{
"epoch": 6.848607367475292,
"grad_norm": 3.578125,
"learning_rate": 0.00010229802032096582,
"loss": 4.5418,
"step": 15245
},
{
"epoch": 6.850853548966756,
"grad_norm": 3.53125,
"learning_rate": 0.000102205345704354,
"loss": 4.5674,
"step": 15250
},
{
"epoch": 6.853099730458221,
"grad_norm": 3.421875,
"learning_rate": 0.00010211270883483634,
"loss": 4.5549,
"step": 15255
},
{
"epoch": 6.855345911949685,
"grad_norm": 3.4375,
"learning_rate": 0.00010202010976809868,
"loss": 4.5274,
"step": 15260
},
{
"epoch": 6.85759209344115,
"grad_norm": 3.6875,
"learning_rate": 0.00010192754855980403,
"loss": 4.5748,
"step": 15265
},
{
"epoch": 6.859838274932614,
"grad_norm": 3.703125,
"learning_rate": 0.00010183502526559287,
"loss": 4.4829,
"step": 15270
},
{
"epoch": 6.862084456424079,
"grad_norm": 3.6875,
"learning_rate": 0.00010174253994108262,
"loss": 4.6253,
"step": 15275
},
{
"epoch": 6.864330637915543,
"grad_norm": 3.9375,
"learning_rate": 0.00010165009264186815,
"loss": 4.5681,
"step": 15280
},
{
"epoch": 6.866576819407008,
"grad_norm": 3.671875,
"learning_rate": 0.00010155768342352122,
"loss": 4.4874,
"step": 15285
},
{
"epoch": 6.868823000898472,
"grad_norm": 3.671875,
"learning_rate": 0.0001014653123415909,
"loss": 4.5273,
"step": 15290
},
{
"epoch": 6.871069182389937,
"grad_norm": 3.75,
"learning_rate": 0.00010137297945160326,
"loss": 4.5341,
"step": 15295
},
{
"epoch": 6.873315363881401,
"grad_norm": 3.921875,
"learning_rate": 0.00010128068480906132,
"loss": 4.4617,
"step": 15300
},
{
"epoch": 6.875561545372866,
"grad_norm": 3.578125,
"learning_rate": 0.00010118842846944532,
"loss": 4.5259,
"step": 15305
},
{
"epoch": 6.87780772686433,
"grad_norm": 3.6875,
"learning_rate": 0.00010109621048821218,
"loss": 4.5097,
"step": 15310
},
{
"epoch": 6.880053908355795,
"grad_norm": 3.8125,
"learning_rate": 0.00010100403092079611,
"loss": 4.4702,
"step": 15315
},
{
"epoch": 6.882300089847259,
"grad_norm": 3.828125,
"learning_rate": 0.00010091188982260793,
"loss": 4.4858,
"step": 15320
},
{
"epoch": 6.884546271338724,
"grad_norm": 3.890625,
"learning_rate": 0.00010081978724903546,
"loss": 4.5182,
"step": 15325
},
{
"epoch": 6.886792452830189,
"grad_norm": 3.6875,
"learning_rate": 0.00010072772325544344,
"loss": 4.4208,
"step": 15330
},
{
"epoch": 6.889038634321653,
"grad_norm": 3.75,
"learning_rate": 0.00010063569789717327,
"loss": 4.5253,
"step": 15335
},
{
"epoch": 6.891284815813117,
"grad_norm": 3.90625,
"learning_rate": 0.00010054371122954323,
"loss": 4.5032,
"step": 15340
},
{
"epoch": 6.893530997304582,
"grad_norm": 3.640625,
"learning_rate": 0.00010045176330784823,
"loss": 4.4829,
"step": 15345
},
{
"epoch": 6.895777178796047,
"grad_norm": 3.859375,
"learning_rate": 0.00010035985418736004,
"loss": 4.5584,
"step": 15350
},
{
"epoch": 6.898023360287511,
"grad_norm": 3.953125,
"learning_rate": 0.00010026798392332702,
"loss": 4.4988,
"step": 15355
},
{
"epoch": 6.900269541778976,
"grad_norm": 3.75,
"learning_rate": 0.00010017615257097412,
"loss": 4.5714,
"step": 15360
},
{
"epoch": 6.90251572327044,
"grad_norm": 3.625,
"learning_rate": 0.00010008436018550307,
"loss": 4.549,
"step": 15365
},
{
"epoch": 6.904761904761905,
"grad_norm": 3.8125,
"learning_rate": 9.999260682209193e-05,
"loss": 4.4937,
"step": 15370
},
{
"epoch": 6.907008086253369,
"grad_norm": 3.765625,
"learning_rate": 9.990089253589559e-05,
"loss": 4.5615,
"step": 15375
},
{
"epoch": 6.909254267744834,
"grad_norm": 3.765625,
"learning_rate": 9.980921738204522e-05,
"loss": 4.4389,
"step": 15380
},
{
"epoch": 6.911500449236298,
"grad_norm": 3.71875,
"learning_rate": 9.971758141564848e-05,
"loss": 4.5287,
"step": 15385
},
{
"epoch": 6.913746630727763,
"grad_norm": 3.75,
"learning_rate": 9.962598469178966e-05,
"loss": 4.4771,
"step": 15390
},
{
"epoch": 6.915992812219227,
"grad_norm": 3.734375,
"learning_rate": 9.953442726552923e-05,
"loss": 4.5699,
"step": 15395
},
{
"epoch": 6.918238993710692,
"grad_norm": 3.953125,
"learning_rate": 9.944290919190425e-05,
"loss": 4.467,
"step": 15400
},
{
"epoch": 6.920485175202156,
"grad_norm": 3.71875,
"learning_rate": 9.935143052592802e-05,
"loss": 4.5499,
"step": 15405
},
{
"epoch": 6.922731356693621,
"grad_norm": 3.765625,
"learning_rate": 9.925999132259006e-05,
"loss": 4.4919,
"step": 15410
},
{
"epoch": 6.924977538185085,
"grad_norm": 3.890625,
"learning_rate": 9.916859163685636e-05,
"loss": 4.5208,
"step": 15415
},
{
"epoch": 6.92722371967655,
"grad_norm": 3.828125,
"learning_rate": 9.907723152366898e-05,
"loss": 4.5282,
"step": 15420
},
{
"epoch": 6.929469901168014,
"grad_norm": 3.890625,
"learning_rate": 9.898591103794635e-05,
"loss": 4.5708,
"step": 15425
},
{
"epoch": 6.931716082659479,
"grad_norm": 4.28125,
"learning_rate": 9.889463023458291e-05,
"loss": 4.4944,
"step": 15430
},
{
"epoch": 6.933962264150943,
"grad_norm": 3.6875,
"learning_rate": 9.880338916844935e-05,
"loss": 4.5565,
"step": 15435
},
{
"epoch": 6.936208445642408,
"grad_norm": 3.578125,
"learning_rate": 9.87121878943926e-05,
"loss": 4.5056,
"step": 15440
},
{
"epoch": 6.938454627133872,
"grad_norm": 3.75,
"learning_rate": 9.862102646723533e-05,
"loss": 4.5851,
"step": 15445
},
{
"epoch": 6.940700808625337,
"grad_norm": 3.609375,
"learning_rate": 9.85299049417766e-05,
"loss": 4.4913,
"step": 15450
},
{
"epoch": 6.942946990116801,
"grad_norm": 3.75,
"learning_rate": 9.843882337279125e-05,
"loss": 4.5163,
"step": 15455
},
{
"epoch": 6.945193171608266,
"grad_norm": 3.859375,
"learning_rate": 9.834778181503018e-05,
"loss": 4.5936,
"step": 15460
},
{
"epoch": 6.94743935309973,
"grad_norm": 3.8125,
"learning_rate": 9.825678032322038e-05,
"loss": 4.5088,
"step": 15465
},
{
"epoch": 6.949685534591195,
"grad_norm": 3.671875,
"learning_rate": 9.81658189520645e-05,
"loss": 4.5256,
"step": 15470
},
{
"epoch": 6.95193171608266,
"grad_norm": 3.65625,
"learning_rate": 9.807489775624128e-05,
"loss": 4.487,
"step": 15475
},
{
"epoch": 6.954177897574124,
"grad_norm": 3.734375,
"learning_rate": 9.798401679040511e-05,
"loss": 4.5617,
"step": 15480
},
{
"epoch": 6.956424079065588,
"grad_norm": 3.6875,
"learning_rate": 9.789317610918647e-05,
"loss": 4.4997,
"step": 15485
},
{
"epoch": 6.958670260557053,
"grad_norm": 3.953125,
"learning_rate": 9.780237576719134e-05,
"loss": 4.5441,
"step": 15490
},
{
"epoch": 6.960916442048518,
"grad_norm": 3.9375,
"learning_rate": 9.771161581900161e-05,
"loss": 4.5346,
"step": 15495
},
{
"epoch": 6.963162623539982,
"grad_norm": 3.734375,
"learning_rate": 9.762089631917495e-05,
"loss": 4.5024,
"step": 15500
},
{
"epoch": 6.965408805031447,
"grad_norm": 3.859375,
"learning_rate": 9.75302173222445e-05,
"loss": 4.5533,
"step": 15505
},
{
"epoch": 6.967654986522911,
"grad_norm": 3.8125,
"learning_rate": 9.743957888271931e-05,
"loss": 4.5326,
"step": 15510
},
{
"epoch": 6.969901168014376,
"grad_norm": 3.765625,
"learning_rate": 9.734898105508373e-05,
"loss": 4.5071,
"step": 15515
},
{
"epoch": 6.97214734950584,
"grad_norm": 3.734375,
"learning_rate": 9.725842389379808e-05,
"loss": 4.5342,
"step": 15520
},
{
"epoch": 6.974393530997305,
"grad_norm": 3.65625,
"learning_rate": 9.716790745329793e-05,
"loss": 4.5754,
"step": 15525
},
{
"epoch": 6.976639712488769,
"grad_norm": 3.734375,
"learning_rate": 9.707743178799446e-05,
"loss": 4.5492,
"step": 15530
},
{
"epoch": 6.978885893980234,
"grad_norm": 3.734375,
"learning_rate": 9.698699695227454e-05,
"loss": 4.494,
"step": 15535
},
{
"epoch": 6.981132075471698,
"grad_norm": 3.53125,
"learning_rate": 9.689660300050007e-05,
"loss": 4.5579,
"step": 15540
},
{
"epoch": 6.983378256963163,
"grad_norm": 3.96875,
"learning_rate": 9.680624998700875e-05,
"loss": 4.6125,
"step": 15545
},
{
"epoch": 6.985624438454627,
"grad_norm": 3.90625,
"learning_rate": 9.671593796611356e-05,
"loss": 4.5417,
"step": 15550
},
{
"epoch": 6.987870619946092,
"grad_norm": 3.859375,
"learning_rate": 9.662566699210276e-05,
"loss": 4.5106,
"step": 15555
},
{
"epoch": 6.990116801437556,
"grad_norm": 3.640625,
"learning_rate": 9.653543711924005e-05,
"loss": 4.5051,
"step": 15560
},
{
"epoch": 6.992362982929021,
"grad_norm": 3.828125,
"learning_rate": 9.644524840176432e-05,
"loss": 4.5207,
"step": 15565
},
{
"epoch": 6.994609164420485,
"grad_norm": 3.796875,
"learning_rate": 9.635510089388985e-05,
"loss": 4.5393,
"step": 15570
},
{
"epoch": 6.99685534591195,
"grad_norm": 3.65625,
"learning_rate": 9.626499464980596e-05,
"loss": 4.5064,
"step": 15575
},
{
"epoch": 6.999101527403414,
"grad_norm": 3.765625,
"learning_rate": 9.617492972367731e-05,
"loss": 4.4867,
"step": 15580
},
{
"epoch": 7.001347708894879,
"grad_norm": 3.859375,
"learning_rate": 9.608490616964378e-05,
"loss": 4.5065,
"step": 15585
},
{
"epoch": 7.003593890386343,
"grad_norm": 3.65625,
"learning_rate": 9.599492404182018e-05,
"loss": 4.5735,
"step": 15590
},
{
"epoch": 7.005840071877808,
"grad_norm": 3.875,
"learning_rate": 9.590498339429659e-05,
"loss": 4.4286,
"step": 15595
},
{
"epoch": 7.008086253369272,
"grad_norm": 4.0625,
"learning_rate": 9.581508428113803e-05,
"loss": 4.4728,
"step": 15600
},
{
"epoch": 7.010332434860737,
"grad_norm": 3.875,
"learning_rate": 9.572522675638465e-05,
"loss": 4.4923,
"step": 15605
},
{
"epoch": 7.012578616352202,
"grad_norm": 3.875,
"learning_rate": 9.56354108740516e-05,
"loss": 4.509,
"step": 15610
},
{
"epoch": 7.014824797843666,
"grad_norm": 3.75,
"learning_rate": 9.554563668812888e-05,
"loss": 4.5017,
"step": 15615
},
{
"epoch": 7.017070979335131,
"grad_norm": 3.96875,
"learning_rate": 9.545590425258161e-05,
"loss": 4.4987,
"step": 15620
},
{
"epoch": 7.019317160826595,
"grad_norm": 3.875,
"learning_rate": 9.536621362134961e-05,
"loss": 4.481,
"step": 15625
},
{
"epoch": 7.02156334231806,
"grad_norm": 3.90625,
"learning_rate": 9.527656484834776e-05,
"loss": 4.5342,
"step": 15630
},
{
"epoch": 7.023809523809524,
"grad_norm": 3.75,
"learning_rate": 9.51869579874656e-05,
"loss": 4.5017,
"step": 15635
},
{
"epoch": 7.026055705300989,
"grad_norm": 3.875,
"learning_rate": 9.50973930925676e-05,
"loss": 4.4833,
"step": 15640
},
{
"epoch": 7.028301886792453,
"grad_norm": 3.90625,
"learning_rate": 9.500787021749303e-05,
"loss": 4.5442,
"step": 15645
},
{
"epoch": 7.030548068283918,
"grad_norm": 3.671875,
"learning_rate": 9.491838941605575e-05,
"loss": 4.4307,
"step": 15650
},
{
"epoch": 7.032794249775382,
"grad_norm": 4.0625,
"learning_rate": 9.482895074204451e-05,
"loss": 4.4837,
"step": 15655
},
{
"epoch": 7.035040431266847,
"grad_norm": 3.578125,
"learning_rate": 9.473955424922253e-05,
"loss": 4.5033,
"step": 15660
},
{
"epoch": 7.037286612758311,
"grad_norm": 3.71875,
"learning_rate": 9.465019999132792e-05,
"loss": 4.5645,
"step": 15665
},
{
"epoch": 7.039532794249776,
"grad_norm": 3.828125,
"learning_rate": 9.456088802207314e-05,
"loss": 4.476,
"step": 15670
},
{
"epoch": 7.04177897574124,
"grad_norm": 3.984375,
"learning_rate": 9.447161839514545e-05,
"loss": 4.4451,
"step": 15675
},
{
"epoch": 7.044025157232705,
"grad_norm": 3.640625,
"learning_rate": 9.43823911642066e-05,
"loss": 4.4924,
"step": 15680
},
{
"epoch": 7.046271338724169,
"grad_norm": 3.78125,
"learning_rate": 9.42932063828927e-05,
"loss": 4.5004,
"step": 15685
},
{
"epoch": 7.048517520215634,
"grad_norm": 4.0,
"learning_rate": 9.420406410481456e-05,
"loss": 4.522,
"step": 15690
},
{
"epoch": 7.050763701707098,
"grad_norm": 3.578125,
"learning_rate": 9.411496438355735e-05,
"loss": 4.5235,
"step": 15695
},
{
"epoch": 7.053009883198563,
"grad_norm": 4.65625,
"learning_rate": 9.402590727268055e-05,
"loss": 4.4144,
"step": 15700
},
{
"epoch": 7.055256064690027,
"grad_norm": 4.0625,
"learning_rate": 9.393689282571825e-05,
"loss": 4.4762,
"step": 15705
},
{
"epoch": 7.057502246181492,
"grad_norm": 3.9375,
"learning_rate": 9.384792109617868e-05,
"loss": 4.4985,
"step": 15710
},
{
"epoch": 7.059748427672956,
"grad_norm": 4.125,
"learning_rate": 9.375899213754453e-05,
"loss": 4.4447,
"step": 15715
},
{
"epoch": 7.061994609164421,
"grad_norm": 3.53125,
"learning_rate": 9.36701060032728e-05,
"loss": 4.4487,
"step": 15720
},
{
"epoch": 7.064240790655885,
"grad_norm": 4.1875,
"learning_rate": 9.358126274679453e-05,
"loss": 4.4904,
"step": 15725
},
{
"epoch": 7.0664869721473496,
"grad_norm": 3.765625,
"learning_rate": 9.349246242151532e-05,
"loss": 4.5504,
"step": 15730
},
{
"epoch": 7.068733153638814,
"grad_norm": 3.875,
"learning_rate": 9.340370508081463e-05,
"loss": 4.5029,
"step": 15735
},
{
"epoch": 7.0709793351302785,
"grad_norm": 3.84375,
"learning_rate": 9.331499077804634e-05,
"loss": 4.459,
"step": 15740
},
{
"epoch": 7.0732255166217435,
"grad_norm": 3.796875,
"learning_rate": 9.322631956653825e-05,
"loss": 4.4855,
"step": 15745
},
{
"epoch": 7.0754716981132075,
"grad_norm": 3.703125,
"learning_rate": 9.31376914995924e-05,
"loss": 4.4964,
"step": 15750
},
{
"epoch": 7.0777178796046725,
"grad_norm": 3.390625,
"learning_rate": 9.304910663048491e-05,
"loss": 4.49,
"step": 15755
},
{
"epoch": 7.0799640610961365,
"grad_norm": 4.0,
"learning_rate": 9.296056501246579e-05,
"loss": 4.5372,
"step": 15760
},
{
"epoch": 7.0822102425876015,
"grad_norm": 3.796875,
"learning_rate": 9.287206669875926e-05,
"loss": 4.4935,
"step": 15765
},
{
"epoch": 7.0844564240790655,
"grad_norm": 3.90625,
"learning_rate": 9.27836117425632e-05,
"loss": 4.455,
"step": 15770
},
{
"epoch": 7.0867026055705304,
"grad_norm": 3.84375,
"learning_rate": 9.26952001970498e-05,
"loss": 4.4762,
"step": 15775
},
{
"epoch": 7.0889487870619945,
"grad_norm": 3.765625,
"learning_rate": 9.260683211536484e-05,
"loss": 4.4771,
"step": 15780
},
{
"epoch": 7.091194968553459,
"grad_norm": 3.828125,
"learning_rate": 9.251850755062811e-05,
"loss": 4.4781,
"step": 15785
},
{
"epoch": 7.0934411500449235,
"grad_norm": 3.71875,
"learning_rate": 9.243022655593334e-05,
"loss": 4.4911,
"step": 15790
},
{
"epoch": 7.095687331536388,
"grad_norm": 3.65625,
"learning_rate": 9.234198918434785e-05,
"loss": 4.4318,
"step": 15795
},
{
"epoch": 7.0979335130278525,
"grad_norm": 3.703125,
"learning_rate": 9.225379548891291e-05,
"loss": 4.4665,
"step": 15800
},
{
"epoch": 7.100179694519317,
"grad_norm": 3.921875,
"learning_rate": 9.216564552264343e-05,
"loss": 4.4929,
"step": 15805
},
{
"epoch": 7.1024258760107815,
"grad_norm": 4.09375,
"learning_rate": 9.207753933852811e-05,
"loss": 4.4802,
"step": 15810
},
{
"epoch": 7.104672057502246,
"grad_norm": 3.984375,
"learning_rate": 9.198947698952933e-05,
"loss": 4.4331,
"step": 15815
},
{
"epoch": 7.1069182389937104,
"grad_norm": 3.546875,
"learning_rate": 9.190145852858297e-05,
"loss": 4.4975,
"step": 15820
},
{
"epoch": 7.109164420485175,
"grad_norm": 4.0,
"learning_rate": 9.181348400859882e-05,
"loss": 4.4677,
"step": 15825
},
{
"epoch": 7.111410601976639,
"grad_norm": 4.03125,
"learning_rate": 9.172555348245992e-05,
"loss": 4.4449,
"step": 15830
},
{
"epoch": 7.113656783468104,
"grad_norm": 3.765625,
"learning_rate": 9.163766700302316e-05,
"loss": 4.4773,
"step": 15835
},
{
"epoch": 7.115902964959568,
"grad_norm": 3.6875,
"learning_rate": 9.15498246231187e-05,
"loss": 4.4561,
"step": 15840
},
{
"epoch": 7.118149146451033,
"grad_norm": 3.96875,
"learning_rate": 9.146202639555036e-05,
"loss": 4.4665,
"step": 15845
},
{
"epoch": 7.120395327942497,
"grad_norm": 4.0625,
"learning_rate": 9.137427237309552e-05,
"loss": 4.5028,
"step": 15850
},
{
"epoch": 7.122641509433962,
"grad_norm": 3.796875,
"learning_rate": 9.128656260850459e-05,
"loss": 4.568,
"step": 15855
},
{
"epoch": 7.124887690925426,
"grad_norm": 3.921875,
"learning_rate": 9.119889715450172e-05,
"loss": 4.4987,
"step": 15860
},
{
"epoch": 7.127133872416891,
"grad_norm": 4.03125,
"learning_rate": 9.111127606378437e-05,
"loss": 4.4623,
"step": 15865
},
{
"epoch": 7.129380053908355,
"grad_norm": 3.703125,
"learning_rate": 9.102369938902324e-05,
"loss": 4.4772,
"step": 15870
},
{
"epoch": 7.13162623539982,
"grad_norm": 3.984375,
"learning_rate": 9.093616718286244e-05,
"loss": 4.446,
"step": 15875
},
{
"epoch": 7.133872416891284,
"grad_norm": 3.75,
"learning_rate": 9.084867949791923e-05,
"loss": 4.4265,
"step": 15880
},
{
"epoch": 7.136118598382749,
"grad_norm": 3.5625,
"learning_rate": 9.07612363867842e-05,
"loss": 4.4706,
"step": 15885
},
{
"epoch": 7.138364779874214,
"grad_norm": 3.640625,
"learning_rate": 9.067383790202109e-05,
"loss": 4.5364,
"step": 15890
},
{
"epoch": 7.140610961365678,
"grad_norm": 3.84375,
"learning_rate": 9.058648409616683e-05,
"loss": 4.5555,
"step": 15895
},
{
"epoch": 7.142857142857143,
"grad_norm": 3.765625,
"learning_rate": 9.049917502173158e-05,
"loss": 4.4759,
"step": 15900
},
{
"epoch": 7.145103324348607,
"grad_norm": 3.6875,
"learning_rate": 9.041191073119844e-05,
"loss": 4.4862,
"step": 15905
},
{
"epoch": 7.147349505840072,
"grad_norm": 4.1875,
"learning_rate": 9.032469127702375e-05,
"loss": 4.4843,
"step": 15910
},
{
"epoch": 7.149595687331536,
"grad_norm": 3.8125,
"learning_rate": 9.023751671163673e-05,
"loss": 4.4575,
"step": 15915
},
{
"epoch": 7.151841868823001,
"grad_norm": 4.0625,
"learning_rate": 9.015038708743986e-05,
"loss": 4.4978,
"step": 15920
},
{
"epoch": 7.154088050314465,
"grad_norm": 3.6875,
"learning_rate": 9.00633024568083e-05,
"loss": 4.4962,
"step": 15925
},
{
"epoch": 7.15633423180593,
"grad_norm": 3.75,
"learning_rate": 8.997626287209041e-05,
"loss": 4.4657,
"step": 15930
},
{
"epoch": 7.158580413297394,
"grad_norm": 4.03125,
"learning_rate": 8.988926838560742e-05,
"loss": 4.4657,
"step": 15935
},
{
"epoch": 7.160826594788859,
"grad_norm": 3.78125,
"learning_rate": 8.980231904965333e-05,
"loss": 4.5124,
"step": 15940
},
{
"epoch": 7.163072776280323,
"grad_norm": 3.671875,
"learning_rate": 8.971541491649518e-05,
"loss": 4.5223,
"step": 15945
},
{
"epoch": 7.165318957771788,
"grad_norm": 3.859375,
"learning_rate": 8.962855603837264e-05,
"loss": 4.4444,
"step": 15950
},
{
"epoch": 7.167565139263252,
"grad_norm": 3.765625,
"learning_rate": 8.954174246749835e-05,
"loss": 4.4397,
"step": 15955
},
{
"epoch": 7.169811320754717,
"grad_norm": 3.875,
"learning_rate": 8.945497425605765e-05,
"loss": 4.4983,
"step": 15960
},
{
"epoch": 7.172057502246181,
"grad_norm": 4.03125,
"learning_rate": 8.936825145620855e-05,
"loss": 4.4257,
"step": 15965
},
{
"epoch": 7.174303683737646,
"grad_norm": 3.734375,
"learning_rate": 8.92815741200819e-05,
"loss": 4.4325,
"step": 15970
},
{
"epoch": 7.17654986522911,
"grad_norm": 3.75,
"learning_rate": 8.919494229978106e-05,
"loss": 4.5093,
"step": 15975
},
{
"epoch": 7.178796046720575,
"grad_norm": 4.03125,
"learning_rate": 8.910835604738218e-05,
"loss": 4.4844,
"step": 15980
},
{
"epoch": 7.181042228212039,
"grad_norm": 3.8125,
"learning_rate": 8.902181541493386e-05,
"loss": 4.4911,
"step": 15985
},
{
"epoch": 7.183288409703504,
"grad_norm": 4.15625,
"learning_rate": 8.893532045445743e-05,
"loss": 4.5168,
"step": 15990
},
{
"epoch": 7.185534591194968,
"grad_norm": 3.796875,
"learning_rate": 8.884887121794674e-05,
"loss": 4.5234,
"step": 15995
},
{
"epoch": 7.187780772686433,
"grad_norm": 3.828125,
"learning_rate": 8.876246775736802e-05,
"loss": 4.5854,
"step": 16000
},
{
"epoch": 7.187780772686433,
"eval_loss": 4.78458833694458,
"eval_runtime": 16.0382,
"eval_samples_per_second": 1933.692,
"eval_steps_per_second": 241.735,
"step": 16000
},
{
"epoch": 7.190026954177897,
"grad_norm": 3.921875,
"learning_rate": 8.867611012466018e-05,
"loss": 4.4497,
"step": 16005
},
{
"epoch": 7.192273135669362,
"grad_norm": 3.8125,
"learning_rate": 8.85897983717344e-05,
"loss": 4.5458,
"step": 16010
},
{
"epoch": 7.194519317160827,
"grad_norm": 3.640625,
"learning_rate": 8.850353255047437e-05,
"loss": 4.5325,
"step": 16015
},
{
"epoch": 7.196765498652291,
"grad_norm": 3.90625,
"learning_rate": 8.841731271273623e-05,
"loss": 4.5205,
"step": 16020
},
{
"epoch": 7.199011680143756,
"grad_norm": 3.875,
"learning_rate": 8.833113891034832e-05,
"loss": 4.4578,
"step": 16025
},
{
"epoch": 7.20125786163522,
"grad_norm": 3.953125,
"learning_rate": 8.824501119511147e-05,
"loss": 4.4487,
"step": 16030
},
{
"epoch": 7.203504043126685,
"grad_norm": 3.875,
"learning_rate": 8.815892961879865e-05,
"loss": 4.4484,
"step": 16035
},
{
"epoch": 7.205750224618149,
"grad_norm": 4.09375,
"learning_rate": 8.807289423315524e-05,
"loss": 4.4684,
"step": 16040
},
{
"epoch": 7.207996406109614,
"grad_norm": 4.125,
"learning_rate": 8.798690508989883e-05,
"loss": 4.4769,
"step": 16045
},
{
"epoch": 7.210242587601078,
"grad_norm": 3.921875,
"learning_rate": 8.790096224071905e-05,
"loss": 4.4799,
"step": 16050
},
{
"epoch": 7.212488769092543,
"grad_norm": 3.59375,
"learning_rate": 8.781506573727798e-05,
"loss": 4.4449,
"step": 16055
},
{
"epoch": 7.214734950584007,
"grad_norm": 3.859375,
"learning_rate": 8.772921563120957e-05,
"loss": 4.5143,
"step": 16060
},
{
"epoch": 7.216981132075472,
"grad_norm": 3.5,
"learning_rate": 8.764341197412002e-05,
"loss": 4.5065,
"step": 16065
},
{
"epoch": 7.219227313566936,
"grad_norm": 3.890625,
"learning_rate": 8.755765481758765e-05,
"loss": 4.4876,
"step": 16070
},
{
"epoch": 7.221473495058401,
"grad_norm": 3.9375,
"learning_rate": 8.747194421316264e-05,
"loss": 4.5722,
"step": 16075
},
{
"epoch": 7.223719676549865,
"grad_norm": 3.515625,
"learning_rate": 8.738628021236748e-05,
"loss": 4.5264,
"step": 16080
},
{
"epoch": 7.22596585804133,
"grad_norm": 3.703125,
"learning_rate": 8.730066286669631e-05,
"loss": 4.4706,
"step": 16085
},
{
"epoch": 7.228212039532794,
"grad_norm": 3.75,
"learning_rate": 8.721509222761553e-05,
"loss": 4.4608,
"step": 16090
},
{
"epoch": 7.230458221024259,
"grad_norm": 3.953125,
"learning_rate": 8.712956834656318e-05,
"loss": 4.5088,
"step": 16095
},
{
"epoch": 7.232704402515723,
"grad_norm": 4.09375,
"learning_rate": 8.704409127494942e-05,
"loss": 4.4941,
"step": 16100
},
{
"epoch": 7.234950584007188,
"grad_norm": 3.96875,
"learning_rate": 8.695866106415623e-05,
"loss": 4.5131,
"step": 16105
},
{
"epoch": 7.237196765498652,
"grad_norm": 3.984375,
"learning_rate": 8.687327776553726e-05,
"loss": 4.5158,
"step": 16110
},
{
"epoch": 7.239442946990117,
"grad_norm": 4.03125,
"learning_rate": 8.678794143041821e-05,
"loss": 4.5603,
"step": 16115
},
{
"epoch": 7.241689128481581,
"grad_norm": 3.6875,
"learning_rate": 8.670265211009633e-05,
"loss": 4.5157,
"step": 16120
},
{
"epoch": 7.243935309973046,
"grad_norm": 4.09375,
"learning_rate": 8.661740985584074e-05,
"loss": 4.541,
"step": 16125
},
{
"epoch": 7.24618149146451,
"grad_norm": 3.953125,
"learning_rate": 8.653221471889221e-05,
"loss": 4.5027,
"step": 16130
},
{
"epoch": 7.248427672955975,
"grad_norm": 4.03125,
"learning_rate": 8.644706675046313e-05,
"loss": 4.447,
"step": 16135
},
{
"epoch": 7.250673854447439,
"grad_norm": 4.15625,
"learning_rate": 8.63619660017378e-05,
"loss": 4.5064,
"step": 16140
},
{
"epoch": 7.252920035938904,
"grad_norm": 3.96875,
"learning_rate": 8.627691252387174e-05,
"loss": 4.5067,
"step": 16145
},
{
"epoch": 7.255166217430368,
"grad_norm": 4.0,
"learning_rate": 8.61919063679924e-05,
"loss": 4.483,
"step": 16150
},
{
"epoch": 7.257412398921833,
"grad_norm": 3.65625,
"learning_rate": 8.610694758519852e-05,
"loss": 4.5225,
"step": 16155
},
{
"epoch": 7.259658580413298,
"grad_norm": 3.65625,
"learning_rate": 8.602203622656055e-05,
"loss": 4.4459,
"step": 16160
},
{
"epoch": 7.261904761904762,
"grad_norm": 3.78125,
"learning_rate": 8.593717234312045e-05,
"loss": 4.4964,
"step": 16165
},
{
"epoch": 7.264150943396227,
"grad_norm": 3.515625,
"learning_rate": 8.585235598589144e-05,
"loss": 4.5308,
"step": 16170
},
{
"epoch": 7.266397124887691,
"grad_norm": 3.640625,
"learning_rate": 8.576758720585835e-05,
"loss": 4.4536,
"step": 16175
},
{
"epoch": 7.268643306379156,
"grad_norm": 3.765625,
"learning_rate": 8.568286605397726e-05,
"loss": 4.4493,
"step": 16180
},
{
"epoch": 7.27088948787062,
"grad_norm": 3.875,
"learning_rate": 8.559819258117578e-05,
"loss": 4.5189,
"step": 16185
},
{
"epoch": 7.273135669362085,
"grad_norm": 4.03125,
"learning_rate": 8.551356683835285e-05,
"loss": 4.5442,
"step": 16190
},
{
"epoch": 7.275381850853549,
"grad_norm": 3.59375,
"learning_rate": 8.542898887637855e-05,
"loss": 4.454,
"step": 16195
},
{
"epoch": 7.277628032345014,
"grad_norm": 4.03125,
"learning_rate": 8.53444587460944e-05,
"loss": 4.4647,
"step": 16200
},
{
"epoch": 7.279874213836478,
"grad_norm": 3.78125,
"learning_rate": 8.52599764983131e-05,
"loss": 4.4834,
"step": 16205
},
{
"epoch": 7.282120395327943,
"grad_norm": 3.84375,
"learning_rate": 8.517554218381856e-05,
"loss": 4.4616,
"step": 16210
},
{
"epoch": 7.284366576819407,
"grad_norm": 3.84375,
"learning_rate": 8.509115585336598e-05,
"loss": 4.4305,
"step": 16215
},
{
"epoch": 7.286612758310872,
"grad_norm": 3.859375,
"learning_rate": 8.500681755768151e-05,
"loss": 4.4802,
"step": 16220
},
{
"epoch": 7.288858939802336,
"grad_norm": 3.90625,
"learning_rate": 8.492252734746268e-05,
"loss": 4.4653,
"step": 16225
},
{
"epoch": 7.291105121293801,
"grad_norm": 3.90625,
"learning_rate": 8.483828527337787e-05,
"loss": 4.5672,
"step": 16230
},
{
"epoch": 7.293351302785265,
"grad_norm": 3.9375,
"learning_rate": 8.47540913860667e-05,
"loss": 4.4814,
"step": 16235
},
{
"epoch": 7.29559748427673,
"grad_norm": 4.0625,
"learning_rate": 8.466994573613974e-05,
"loss": 4.5198,
"step": 16240
},
{
"epoch": 7.297843665768194,
"grad_norm": 3.578125,
"learning_rate": 8.458584837417858e-05,
"loss": 4.4652,
"step": 16245
},
{
"epoch": 7.300089847259659,
"grad_norm": 3.9375,
"learning_rate": 8.450179935073583e-05,
"loss": 4.5406,
"step": 16250
},
{
"epoch": 7.302336028751123,
"grad_norm": 4.0625,
"learning_rate": 8.441779871633491e-05,
"loss": 4.4671,
"step": 16255
},
{
"epoch": 7.304582210242588,
"grad_norm": 4.09375,
"learning_rate": 8.433384652147037e-05,
"loss": 4.5091,
"step": 16260
},
{
"epoch": 7.306828391734052,
"grad_norm": 3.859375,
"learning_rate": 8.424994281660739e-05,
"loss": 4.5249,
"step": 16265
},
{
"epoch": 7.309074573225517,
"grad_norm": 3.9375,
"learning_rate": 8.416608765218223e-05,
"loss": 4.4371,
"step": 16270
},
{
"epoch": 7.311320754716981,
"grad_norm": 4.03125,
"learning_rate": 8.40822810786018e-05,
"loss": 4.5291,
"step": 16275
},
{
"epoch": 7.313566936208446,
"grad_norm": 3.859375,
"learning_rate": 8.399852314624385e-05,
"loss": 4.4864,
"step": 16280
},
{
"epoch": 7.315813117699911,
"grad_norm": 4.125,
"learning_rate": 8.391481390545704e-05,
"loss": 4.452,
"step": 16285
},
{
"epoch": 7.318059299191375,
"grad_norm": 4.15625,
"learning_rate": 8.383115340656048e-05,
"loss": 4.5127,
"step": 16290
},
{
"epoch": 7.320305480682839,
"grad_norm": 3.78125,
"learning_rate": 8.374754169984422e-05,
"loss": 4.5017,
"step": 16295
},
{
"epoch": 7.322551662174304,
"grad_norm": 4.0625,
"learning_rate": 8.366397883556883e-05,
"loss": 4.4903,
"step": 16300
},
{
"epoch": 7.324797843665769,
"grad_norm": 3.875,
"learning_rate": 8.358046486396564e-05,
"loss": 4.5602,
"step": 16305
},
{
"epoch": 7.327044025157233,
"grad_norm": 3.875,
"learning_rate": 8.349699983523654e-05,
"loss": 4.5111,
"step": 16310
},
{
"epoch": 7.329290206648698,
"grad_norm": 4.0625,
"learning_rate": 8.341358379955392e-05,
"loss": 4.4737,
"step": 16315
},
{
"epoch": 7.331536388140162,
"grad_norm": 3.703125,
"learning_rate": 8.333021680706085e-05,
"loss": 4.4634,
"step": 16320
},
{
"epoch": 7.333782569631627,
"grad_norm": 3.84375,
"learning_rate": 8.324689890787086e-05,
"loss": 4.548,
"step": 16325
},
{
"epoch": 7.336028751123091,
"grad_norm": 3.921875,
"learning_rate": 8.316363015206787e-05,
"loss": 4.4421,
"step": 16330
},
{
"epoch": 7.3382749326145555,
"grad_norm": 3.828125,
"learning_rate": 8.30804105897065e-05,
"loss": 4.5086,
"step": 16335
},
{
"epoch": 7.34052111410602,
"grad_norm": 4.0,
"learning_rate": 8.299724027081154e-05,
"loss": 4.4927,
"step": 16340
},
{
"epoch": 7.3427672955974845,
"grad_norm": 3.9375,
"learning_rate": 8.291411924537838e-05,
"loss": 4.4961,
"step": 16345
},
{
"epoch": 7.345013477088949,
"grad_norm": 3.65625,
"learning_rate": 8.283104756337261e-05,
"loss": 4.4941,
"step": 16350
},
{
"epoch": 7.3472596585804135,
"grad_norm": 3.578125,
"learning_rate": 8.274802527473027e-05,
"loss": 4.4502,
"step": 16355
},
{
"epoch": 7.349505840071878,
"grad_norm": 4.03125,
"learning_rate": 8.266505242935777e-05,
"loss": 4.4218,
"step": 16360
},
{
"epoch": 7.3517520215633425,
"grad_norm": 3.765625,
"learning_rate": 8.258212907713158e-05,
"loss": 4.4552,
"step": 16365
},
{
"epoch": 7.353998203054807,
"grad_norm": 3.828125,
"learning_rate": 8.249925526789864e-05,
"loss": 4.4254,
"step": 16370
},
{
"epoch": 7.3562443845462715,
"grad_norm": 4.09375,
"learning_rate": 8.241643105147594e-05,
"loss": 4.4798,
"step": 16375
},
{
"epoch": 7.3584905660377355,
"grad_norm": 4.03125,
"learning_rate": 8.233365647765082e-05,
"loss": 4.5004,
"step": 16380
},
{
"epoch": 7.3607367475292005,
"grad_norm": 3.828125,
"learning_rate": 8.225093159618059e-05,
"loss": 4.4728,
"step": 16385
},
{
"epoch": 7.3629829290206645,
"grad_norm": 3.75,
"learning_rate": 8.216825645679288e-05,
"loss": 4.5282,
"step": 16390
},
{
"epoch": 7.3652291105121295,
"grad_norm": 4.125,
"learning_rate": 8.208563110918534e-05,
"loss": 4.4876,
"step": 16395
},
{
"epoch": 7.3674752920035935,
"grad_norm": 3.75,
"learning_rate": 8.20030556030256e-05,
"loss": 4.4282,
"step": 16400
},
{
"epoch": 7.3697214734950585,
"grad_norm": 3.890625,
"learning_rate": 8.192052998795149e-05,
"loss": 4.4644,
"step": 16405
},
{
"epoch": 7.3719676549865225,
"grad_norm": 3.84375,
"learning_rate": 8.18380543135707e-05,
"loss": 4.5372,
"step": 16410
},
{
"epoch": 7.3742138364779874,
"grad_norm": 3.65625,
"learning_rate": 8.175562862946102e-05,
"loss": 4.4788,
"step": 16415
},
{
"epoch": 7.3764600179694515,
"grad_norm": 3.984375,
"learning_rate": 8.167325298517015e-05,
"loss": 4.5196,
"step": 16420
},
{
"epoch": 7.378706199460916,
"grad_norm": 3.625,
"learning_rate": 8.159092743021566e-05,
"loss": 4.5199,
"step": 16425
},
{
"epoch": 7.380952380952381,
"grad_norm": 3.90625,
"learning_rate": 8.15086520140851e-05,
"loss": 4.4835,
"step": 16430
},
{
"epoch": 7.383198562443845,
"grad_norm": 4.25,
"learning_rate": 8.142642678623576e-05,
"loss": 4.4979,
"step": 16435
},
{
"epoch": 7.38544474393531,
"grad_norm": 3.875,
"learning_rate": 8.134425179609489e-05,
"loss": 4.4895,
"step": 16440
},
{
"epoch": 7.387690925426774,
"grad_norm": 4.09375,
"learning_rate": 8.126212709305946e-05,
"loss": 4.4258,
"step": 16445
},
{
"epoch": 7.389937106918239,
"grad_norm": 3.9375,
"learning_rate": 8.118005272649622e-05,
"loss": 4.4786,
"step": 16450
},
{
"epoch": 7.392183288409703,
"grad_norm": 3.734375,
"learning_rate": 8.109802874574171e-05,
"loss": 4.4982,
"step": 16455
},
{
"epoch": 7.394429469901168,
"grad_norm": 3.796875,
"learning_rate": 8.101605520010212e-05,
"loss": 4.5045,
"step": 16460
},
{
"epoch": 7.396675651392632,
"grad_norm": 3.765625,
"learning_rate": 8.09341321388534e-05,
"loss": 4.5082,
"step": 16465
},
{
"epoch": 7.398921832884097,
"grad_norm": 4.0,
"learning_rate": 8.0852259611241e-05,
"loss": 4.4946,
"step": 16470
},
{
"epoch": 7.401168014375561,
"grad_norm": 3.96875,
"learning_rate": 8.077043766648025e-05,
"loss": 4.4921,
"step": 16475
},
{
"epoch": 7.403414195867026,
"grad_norm": 4.34375,
"learning_rate": 8.068866635375575e-05,
"loss": 4.4717,
"step": 16480
},
{
"epoch": 7.40566037735849,
"grad_norm": 3.78125,
"learning_rate": 8.060694572222198e-05,
"loss": 4.4662,
"step": 16485
},
{
"epoch": 7.407906558849955,
"grad_norm": 3.828125,
"learning_rate": 8.052527582100275e-05,
"loss": 4.4932,
"step": 16490
},
{
"epoch": 7.410152740341419,
"grad_norm": 3.734375,
"learning_rate": 8.044365669919137e-05,
"loss": 4.4941,
"step": 16495
},
{
"epoch": 7.412398921832884,
"grad_norm": 3.75,
"learning_rate": 8.036208840585076e-05,
"loss": 4.5339,
"step": 16500
},
{
"epoch": 7.414645103324348,
"grad_norm": 4.03125,
"learning_rate": 8.028057099001324e-05,
"loss": 4.4626,
"step": 16505
},
{
"epoch": 7.416891284815813,
"grad_norm": 3.71875,
"learning_rate": 8.019910450068046e-05,
"loss": 4.565,
"step": 16510
},
{
"epoch": 7.419137466307277,
"grad_norm": 3.953125,
"learning_rate": 8.011768898682357e-05,
"loss": 4.538,
"step": 16515
},
{
"epoch": 7.421383647798742,
"grad_norm": 4.0,
"learning_rate": 8.003632449738297e-05,
"loss": 4.4717,
"step": 16520
},
{
"epoch": 7.423629829290206,
"grad_norm": 3.984375,
"learning_rate": 7.995501108126851e-05,
"loss": 4.4681,
"step": 16525
},
{
"epoch": 7.425876010781671,
"grad_norm": 3.84375,
"learning_rate": 7.987374878735922e-05,
"loss": 4.4855,
"step": 16530
},
{
"epoch": 7.428122192273135,
"grad_norm": 3.796875,
"learning_rate": 7.979253766450347e-05,
"loss": 4.4463,
"step": 16535
},
{
"epoch": 7.4303683737646,
"grad_norm": 3.921875,
"learning_rate": 7.971137776151891e-05,
"loss": 4.4767,
"step": 16540
},
{
"epoch": 7.432614555256064,
"grad_norm": 3.796875,
"learning_rate": 7.963026912719223e-05,
"loss": 4.5125,
"step": 16545
},
{
"epoch": 7.434860736747529,
"grad_norm": 3.828125,
"learning_rate": 7.954921181027953e-05,
"loss": 4.4621,
"step": 16550
},
{
"epoch": 7.437106918238994,
"grad_norm": 4.03125,
"learning_rate": 7.946820585950587e-05,
"loss": 4.4562,
"step": 16555
},
{
"epoch": 7.439353099730458,
"grad_norm": 3.703125,
"learning_rate": 7.938725132356549e-05,
"loss": 4.4049,
"step": 16560
},
{
"epoch": 7.441599281221922,
"grad_norm": 3.84375,
"learning_rate": 7.930634825112187e-05,
"loss": 4.4527,
"step": 16565
},
{
"epoch": 7.443845462713387,
"grad_norm": 3.953125,
"learning_rate": 7.92254966908073e-05,
"loss": 4.5469,
"step": 16570
},
{
"epoch": 7.446091644204852,
"grad_norm": 3.953125,
"learning_rate": 7.914469669122331e-05,
"loss": 4.5527,
"step": 16575
},
{
"epoch": 7.448337825696316,
"grad_norm": 3.890625,
"learning_rate": 7.906394830094031e-05,
"loss": 4.4505,
"step": 16580
},
{
"epoch": 7.450584007187781,
"grad_norm": 3.78125,
"learning_rate": 7.898325156849779e-05,
"loss": 4.5039,
"step": 16585
},
{
"epoch": 7.452830188679245,
"grad_norm": 3.953125,
"learning_rate": 7.890260654240407e-05,
"loss": 4.4809,
"step": 16590
},
{
"epoch": 7.45507637017071,
"grad_norm": 3.890625,
"learning_rate": 7.882201327113644e-05,
"loss": 4.4749,
"step": 16595
},
{
"epoch": 7.457322551662174,
"grad_norm": 3.828125,
"learning_rate": 7.87414718031412e-05,
"loss": 4.447,
"step": 16600
},
{
"epoch": 7.459568733153639,
"grad_norm": 3.859375,
"learning_rate": 7.86609821868333e-05,
"loss": 4.5102,
"step": 16605
},
{
"epoch": 7.461814914645103,
"grad_norm": 3.703125,
"learning_rate": 7.858054447059671e-05,
"loss": 4.4386,
"step": 16610
},
{
"epoch": 7.464061096136568,
"grad_norm": 3.921875,
"learning_rate": 7.850015870278398e-05,
"loss": 4.5193,
"step": 16615
},
{
"epoch": 7.466307277628032,
"grad_norm": 4.0625,
"learning_rate": 7.841982493171671e-05,
"loss": 4.4742,
"step": 16620
},
{
"epoch": 7.468553459119497,
"grad_norm": 3.953125,
"learning_rate": 7.833954320568498e-05,
"loss": 4.5049,
"step": 16625
},
{
"epoch": 7.470799640610961,
"grad_norm": 3.65625,
"learning_rate": 7.825931357294777e-05,
"loss": 4.5168,
"step": 16630
},
{
"epoch": 7.473045822102426,
"grad_norm": 3.921875,
"learning_rate": 7.81791360817327e-05,
"loss": 4.4544,
"step": 16635
},
{
"epoch": 7.47529200359389,
"grad_norm": 3.828125,
"learning_rate": 7.809901078023598e-05,
"loss": 4.4673,
"step": 16640
},
{
"epoch": 7.477538185085355,
"grad_norm": 3.703125,
"learning_rate": 7.801893771662253e-05,
"loss": 4.5267,
"step": 16645
},
{
"epoch": 7.479784366576819,
"grad_norm": 3.765625,
"learning_rate": 7.793891693902582e-05,
"loss": 4.4746,
"step": 16650
},
{
"epoch": 7.482030548068284,
"grad_norm": 3.578125,
"learning_rate": 7.785894849554785e-05,
"loss": 4.4841,
"step": 16655
},
{
"epoch": 7.484276729559748,
"grad_norm": 4.0,
"learning_rate": 7.777903243425933e-05,
"loss": 4.4935,
"step": 16660
},
{
"epoch": 7.486522911051213,
"grad_norm": 3.796875,
"learning_rate": 7.769916880319925e-05,
"loss": 4.403,
"step": 16665
},
{
"epoch": 7.488769092542677,
"grad_norm": 4.15625,
"learning_rate": 7.761935765037527e-05,
"loss": 4.5541,
"step": 16670
},
{
"epoch": 7.491015274034142,
"grad_norm": 3.71875,
"learning_rate": 7.753959902376338e-05,
"loss": 4.5181,
"step": 16675
},
{
"epoch": 7.493261455525606,
"grad_norm": 3.859375,
"learning_rate": 7.745989297130808e-05,
"loss": 4.4491,
"step": 16680
},
{
"epoch": 7.495507637017071,
"grad_norm": 3.71875,
"learning_rate": 7.738023954092229e-05,
"loss": 4.4856,
"step": 16685
},
{
"epoch": 7.497753818508535,
"grad_norm": 3.890625,
"learning_rate": 7.730063878048717e-05,
"loss": 4.5341,
"step": 16690
},
{
"epoch": 7.5,
"grad_norm": 4.125,
"learning_rate": 7.722109073785234e-05,
"loss": 4.4758,
"step": 16695
},
{
"epoch": 7.502246181491465,
"grad_norm": 3.796875,
"learning_rate": 7.71415954608356e-05,
"loss": 4.5248,
"step": 16700
},
{
"epoch": 7.504492362982929,
"grad_norm": 4.03125,
"learning_rate": 7.706215299722321e-05,
"loss": 4.4835,
"step": 16705
},
{
"epoch": 7.506738544474393,
"grad_norm": 3.953125,
"learning_rate": 7.698276339476957e-05,
"loss": 4.511,
"step": 16710
},
{
"epoch": 7.508984725965858,
"grad_norm": 3.59375,
"learning_rate": 7.690342670119726e-05,
"loss": 4.4425,
"step": 16715
},
{
"epoch": 7.511230907457323,
"grad_norm": 4.0625,
"learning_rate": 7.682414296419724e-05,
"loss": 4.4964,
"step": 16720
},
{
"epoch": 7.513477088948787,
"grad_norm": 4.03125,
"learning_rate": 7.674491223142836e-05,
"loss": 4.46,
"step": 16725
},
{
"epoch": 7.515723270440252,
"grad_norm": 3.859375,
"learning_rate": 7.666573455051789e-05,
"loss": 4.4496,
"step": 16730
},
{
"epoch": 7.517969451931716,
"grad_norm": 4.09375,
"learning_rate": 7.658660996906097e-05,
"loss": 4.5062,
"step": 16735
},
{
"epoch": 7.520215633423181,
"grad_norm": 3.96875,
"learning_rate": 7.650753853462101e-05,
"loss": 4.4952,
"step": 16740
},
{
"epoch": 7.522461814914645,
"grad_norm": 3.84375,
"learning_rate": 7.642852029472939e-05,
"loss": 4.5598,
"step": 16745
},
{
"epoch": 7.52470799640611,
"grad_norm": 3.59375,
"learning_rate": 7.63495552968855e-05,
"loss": 4.4556,
"step": 16750
},
{
"epoch": 7.526954177897574,
"grad_norm": 3.765625,
"learning_rate": 7.627064358855677e-05,
"loss": 4.4857,
"step": 16755
},
{
"epoch": 7.529200359389039,
"grad_norm": 4.03125,
"learning_rate": 7.619178521717853e-05,
"loss": 4.4772,
"step": 16760
},
{
"epoch": 7.531446540880503,
"grad_norm": 3.875,
"learning_rate": 7.611298023015408e-05,
"loss": 4.4325,
"step": 16765
},
{
"epoch": 7.533692722371968,
"grad_norm": 3.890625,
"learning_rate": 7.603422867485472e-05,
"loss": 4.4809,
"step": 16770
},
{
"epoch": 7.535938903863432,
"grad_norm": 3.9375,
"learning_rate": 7.595553059861946e-05,
"loss": 4.4866,
"step": 16775
},
{
"epoch": 7.538185085354897,
"grad_norm": 3.953125,
"learning_rate": 7.587688604875534e-05,
"loss": 4.5252,
"step": 16780
},
{
"epoch": 7.540431266846361,
"grad_norm": 3.828125,
"learning_rate": 7.579829507253702e-05,
"loss": 4.4775,
"step": 16785
},
{
"epoch": 7.542677448337826,
"grad_norm": 3.90625,
"learning_rate": 7.571975771720719e-05,
"loss": 4.4986,
"step": 16790
},
{
"epoch": 7.54492362982929,
"grad_norm": 3.796875,
"learning_rate": 7.564127402997607e-05,
"loss": 4.414,
"step": 16795
},
{
"epoch": 7.547169811320755,
"grad_norm": 3.78125,
"learning_rate": 7.556284405802187e-05,
"loss": 4.5513,
"step": 16800
},
{
"epoch": 7.549415992812219,
"grad_norm": 3.953125,
"learning_rate": 7.548446784849028e-05,
"loss": 4.4816,
"step": 16805
},
{
"epoch": 7.551662174303684,
"grad_norm": 3.921875,
"learning_rate": 7.54061454484948e-05,
"loss": 4.4992,
"step": 16810
},
{
"epoch": 7.553908355795148,
"grad_norm": 3.96875,
"learning_rate": 7.532787690511656e-05,
"loss": 4.5391,
"step": 16815
},
{
"epoch": 7.556154537286613,
"grad_norm": 3.703125,
"learning_rate": 7.524966226540434e-05,
"loss": 4.5233,
"step": 16820
},
{
"epoch": 7.558400718778078,
"grad_norm": 3.71875,
"learning_rate": 7.51715015763744e-05,
"loss": 4.5246,
"step": 16825
},
{
"epoch": 7.560646900269542,
"grad_norm": 3.671875,
"learning_rate": 7.509339488501077e-05,
"loss": 4.4983,
"step": 16830
},
{
"epoch": 7.562893081761006,
"grad_norm": 3.84375,
"learning_rate": 7.501534223826481e-05,
"loss": 4.5314,
"step": 16835
},
{
"epoch": 7.565139263252471,
"grad_norm": 4.09375,
"learning_rate": 7.49373436830556e-05,
"loss": 4.4689,
"step": 16840
},
{
"epoch": 7.567385444743936,
"grad_norm": 3.921875,
"learning_rate": 7.485939926626948e-05,
"loss": 4.5512,
"step": 16845
},
{
"epoch": 7.5696316262354,
"grad_norm": 3.65625,
"learning_rate": 7.478150903476043e-05,
"loss": 4.5035,
"step": 16850
},
{
"epoch": 7.571877807726865,
"grad_norm": 3.8125,
"learning_rate": 7.47036730353498e-05,
"loss": 4.5341,
"step": 16855
},
{
"epoch": 7.574123989218329,
"grad_norm": 4.03125,
"learning_rate": 7.462589131482628e-05,
"loss": 4.4958,
"step": 16860
},
{
"epoch": 7.576370170709794,
"grad_norm": 4.21875,
"learning_rate": 7.454816391994604e-05,
"loss": 4.4979,
"step": 16865
},
{
"epoch": 7.578616352201258,
"grad_norm": 4.0,
"learning_rate": 7.447049089743247e-05,
"loss": 4.4834,
"step": 16870
},
{
"epoch": 7.580862533692723,
"grad_norm": 4.03125,
"learning_rate": 7.439287229397642e-05,
"loss": 4.5267,
"step": 16875
},
{
"epoch": 7.583108715184187,
"grad_norm": 3.859375,
"learning_rate": 7.431530815623586e-05,
"loss": 4.4695,
"step": 16880
},
{
"epoch": 7.585354896675652,
"grad_norm": 3.65625,
"learning_rate": 7.423779853083618e-05,
"loss": 4.5421,
"step": 16885
},
{
"epoch": 7.587601078167116,
"grad_norm": 3.578125,
"learning_rate": 7.416034346436994e-05,
"loss": 4.4031,
"step": 16890
},
{
"epoch": 7.589847259658581,
"grad_norm": 4.0625,
"learning_rate": 7.408294300339682e-05,
"loss": 4.477,
"step": 16895
},
{
"epoch": 7.592093441150045,
"grad_norm": 3.984375,
"learning_rate": 7.400559719444382e-05,
"loss": 4.4463,
"step": 16900
},
{
"epoch": 7.59433962264151,
"grad_norm": 3.578125,
"learning_rate": 7.392830608400499e-05,
"loss": 4.4661,
"step": 16905
},
{
"epoch": 7.596585804132974,
"grad_norm": 3.796875,
"learning_rate": 7.385106971854148e-05,
"loss": 4.4802,
"step": 16910
},
{
"epoch": 7.598831985624439,
"grad_norm": 3.75,
"learning_rate": 7.37738881444817e-05,
"loss": 4.4845,
"step": 16915
},
{
"epoch": 7.601078167115903,
"grad_norm": 3.859375,
"learning_rate": 7.369676140822088e-05,
"loss": 4.5179,
"step": 16920
},
{
"epoch": 7.603324348607368,
"grad_norm": 4.40625,
"learning_rate": 7.361968955612151e-05,
"loss": 4.5927,
"step": 16925
},
{
"epoch": 7.605570530098832,
"grad_norm": 4.28125,
"learning_rate": 7.354267263451288e-05,
"loss": 4.4907,
"step": 16930
},
{
"epoch": 7.607816711590297,
"grad_norm": 3.828125,
"learning_rate": 7.346571068969147e-05,
"loss": 4.4642,
"step": 16935
},
{
"epoch": 7.610062893081761,
"grad_norm": 3.78125,
"learning_rate": 7.338880376792052e-05,
"loss": 4.4903,
"step": 16940
},
{
"epoch": 7.612309074573226,
"grad_norm": 3.828125,
"learning_rate": 7.331195191543033e-05,
"loss": 4.4774,
"step": 16945
},
{
"epoch": 7.6145552560646905,
"grad_norm": 4.0,
"learning_rate": 7.323515517841807e-05,
"loss": 4.5547,
"step": 16950
},
{
"epoch": 7.616801437556155,
"grad_norm": 4.34375,
"learning_rate": 7.315841360304773e-05,
"loss": 4.4924,
"step": 16955
},
{
"epoch": 7.619047619047619,
"grad_norm": 3.953125,
"learning_rate": 7.308172723545019e-05,
"loss": 4.5127,
"step": 16960
},
{
"epoch": 7.621293800539084,
"grad_norm": 3.921875,
"learning_rate": 7.300509612172313e-05,
"loss": 4.5099,
"step": 16965
},
{
"epoch": 7.6235399820305485,
"grad_norm": 3.78125,
"learning_rate": 7.292852030793095e-05,
"loss": 4.5229,
"step": 16970
},
{
"epoch": 7.6257861635220126,
"grad_norm": 4.84375,
"learning_rate": 7.285199984010494e-05,
"loss": 4.5042,
"step": 16975
},
{
"epoch": 7.628032345013477,
"grad_norm": 3.953125,
"learning_rate": 7.277553476424299e-05,
"loss": 4.4774,
"step": 16980
},
{
"epoch": 7.6302785265049415,
"grad_norm": 3.96875,
"learning_rate": 7.26991251263098e-05,
"loss": 4.497,
"step": 16985
},
{
"epoch": 7.6325247079964065,
"grad_norm": 3.921875,
"learning_rate": 7.262277097223665e-05,
"loss": 4.4846,
"step": 16990
},
{
"epoch": 7.6347708894878705,
"grad_norm": 3.890625,
"learning_rate": 7.254647234792155e-05,
"loss": 4.5027,
"step": 16995
},
{
"epoch": 7.6370170709793355,
"grad_norm": 4.0,
"learning_rate": 7.247022929922913e-05,
"loss": 4.4799,
"step": 17000
},
{
"epoch": 7.6370170709793355,
"eval_loss": 4.779622554779053,
"eval_runtime": 16.0334,
"eval_samples_per_second": 1934.274,
"eval_steps_per_second": 241.808,
"step": 17000
},
{
"epoch": 7.6392632524707995,
"grad_norm": 3.890625,
"learning_rate": 7.239404187199049e-05,
"loss": 4.4591,
"step": 17005
},
{
"epoch": 7.6415094339622645,
"grad_norm": 3.890625,
"learning_rate": 7.231791011200347e-05,
"loss": 4.4489,
"step": 17010
},
{
"epoch": 7.6437556154537285,
"grad_norm": 3.8125,
"learning_rate": 7.224183406503228e-05,
"loss": 4.4788,
"step": 17015
},
{
"epoch": 7.646001796945193,
"grad_norm": 3.875,
"learning_rate": 7.216581377680779e-05,
"loss": 4.4854,
"step": 17020
},
{
"epoch": 7.6482479784366575,
"grad_norm": 4.03125,
"learning_rate": 7.208984929302719e-05,
"loss": 4.4835,
"step": 17025
},
{
"epoch": 7.650494159928122,
"grad_norm": 4.03125,
"learning_rate": 7.201394065935427e-05,
"loss": 4.4524,
"step": 17030
},
{
"epoch": 7.6527403414195865,
"grad_norm": 4.03125,
"learning_rate": 7.193808792141926e-05,
"loss": 4.473,
"step": 17035
},
{
"epoch": 7.654986522911051,
"grad_norm": 3.828125,
"learning_rate": 7.186229112481861e-05,
"loss": 4.4807,
"step": 17040
},
{
"epoch": 7.6572327044025155,
"grad_norm": 3.59375,
"learning_rate": 7.178655031511534e-05,
"loss": 4.5386,
"step": 17045
},
{
"epoch": 7.65947888589398,
"grad_norm": 3.71875,
"learning_rate": 7.171086553783866e-05,
"loss": 4.573,
"step": 17050
},
{
"epoch": 7.6617250673854445,
"grad_norm": 3.890625,
"learning_rate": 7.163523683848418e-05,
"loss": 4.4759,
"step": 17055
},
{
"epoch": 7.663971248876909,
"grad_norm": 3.921875,
"learning_rate": 7.155966426251387e-05,
"loss": 4.4932,
"step": 17060
},
{
"epoch": 7.666217430368373,
"grad_norm": 3.734375,
"learning_rate": 7.14841478553558e-05,
"loss": 4.4657,
"step": 17065
},
{
"epoch": 7.668463611859838,
"grad_norm": 3.8125,
"learning_rate": 7.140868766240443e-05,
"loss": 4.4906,
"step": 17070
},
{
"epoch": 7.670709793351302,
"grad_norm": 4.25,
"learning_rate": 7.133328372902025e-05,
"loss": 4.5175,
"step": 17075
},
{
"epoch": 7.672955974842767,
"grad_norm": 3.859375,
"learning_rate": 7.125793610053015e-05,
"loss": 4.5268,
"step": 17080
},
{
"epoch": 7.675202156334231,
"grad_norm": 4.15625,
"learning_rate": 7.118264482222697e-05,
"loss": 4.5015,
"step": 17085
},
{
"epoch": 7.677448337825696,
"grad_norm": 4.0625,
"learning_rate": 7.110740993936981e-05,
"loss": 4.4872,
"step": 17090
},
{
"epoch": 7.679694519317161,
"grad_norm": 3.859375,
"learning_rate": 7.103223149718387e-05,
"loss": 4.4893,
"step": 17095
},
{
"epoch": 7.681940700808625,
"grad_norm": 3.90625,
"learning_rate": 7.095710954086032e-05,
"loss": 4.4821,
"step": 17100
},
{
"epoch": 7.684186882300089,
"grad_norm": 3.953125,
"learning_rate": 7.088204411555647e-05,
"loss": 4.515,
"step": 17105
},
{
"epoch": 7.686433063791554,
"grad_norm": 3.75,
"learning_rate": 7.080703526639556e-05,
"loss": 4.4783,
"step": 17110
},
{
"epoch": 7.688679245283019,
"grad_norm": 4.21875,
"learning_rate": 7.073208303846694e-05,
"loss": 4.5,
"step": 17115
},
{
"epoch": 7.690925426774483,
"grad_norm": 4.0625,
"learning_rate": 7.06571874768259e-05,
"loss": 4.5088,
"step": 17120
},
{
"epoch": 7.693171608265948,
"grad_norm": 3.703125,
"learning_rate": 7.05823486264935e-05,
"loss": 4.3995,
"step": 17125
},
{
"epoch": 7.695417789757412,
"grad_norm": 3.828125,
"learning_rate": 7.050756653245693e-05,
"loss": 4.4503,
"step": 17130
},
{
"epoch": 7.697663971248877,
"grad_norm": 3.828125,
"learning_rate": 7.04328412396691e-05,
"loss": 4.5101,
"step": 17135
},
{
"epoch": 7.699910152740341,
"grad_norm": 3.921875,
"learning_rate": 7.035817279304888e-05,
"loss": 4.5276,
"step": 17140
},
{
"epoch": 7.702156334231806,
"grad_norm": 3.90625,
"learning_rate": 7.028356123748097e-05,
"loss": 4.5155,
"step": 17145
},
{
"epoch": 7.70440251572327,
"grad_norm": 3.875,
"learning_rate": 7.020900661781576e-05,
"loss": 4.4453,
"step": 17150
},
{
"epoch": 7.706648697214735,
"grad_norm": 3.828125,
"learning_rate": 7.013450897886958e-05,
"loss": 4.558,
"step": 17155
},
{
"epoch": 7.708894878706199,
"grad_norm": 3.765625,
"learning_rate": 7.006006836542431e-05,
"loss": 4.4913,
"step": 17160
},
{
"epoch": 7.711141060197664,
"grad_norm": 3.984375,
"learning_rate": 6.998568482222771e-05,
"loss": 4.4458,
"step": 17165
},
{
"epoch": 7.713387241689128,
"grad_norm": 3.921875,
"learning_rate": 6.991135839399322e-05,
"loss": 4.4604,
"step": 17170
},
{
"epoch": 7.715633423180593,
"grad_norm": 3.5625,
"learning_rate": 6.983708912539985e-05,
"loss": 4.4622,
"step": 17175
},
{
"epoch": 7.717879604672057,
"grad_norm": 3.828125,
"learning_rate": 6.976287706109237e-05,
"loss": 4.4235,
"step": 17180
},
{
"epoch": 7.720125786163522,
"grad_norm": 3.875,
"learning_rate": 6.968872224568103e-05,
"loss": 4.5184,
"step": 17185
},
{
"epoch": 7.722371967654986,
"grad_norm": 3.796875,
"learning_rate": 6.961462472374179e-05,
"loss": 4.5468,
"step": 17190
},
{
"epoch": 7.724618149146451,
"grad_norm": 3.9375,
"learning_rate": 6.954058453981609e-05,
"loss": 4.4982,
"step": 17195
},
{
"epoch": 7.726864330637915,
"grad_norm": 4.28125,
"learning_rate": 6.946660173841093e-05,
"loss": 4.4807,
"step": 17200
},
{
"epoch": 7.72911051212938,
"grad_norm": 3.96875,
"learning_rate": 6.939267636399888e-05,
"loss": 4.5004,
"step": 17205
},
{
"epoch": 7.731356693620844,
"grad_norm": 3.78125,
"learning_rate": 6.931880846101783e-05,
"loss": 4.4918,
"step": 17210
},
{
"epoch": 7.733602875112309,
"grad_norm": 3.75,
"learning_rate": 6.924499807387132e-05,
"loss": 4.4224,
"step": 17215
},
{
"epoch": 7.735849056603773,
"grad_norm": 3.8125,
"learning_rate": 6.917124524692812e-05,
"loss": 4.4814,
"step": 17220
},
{
"epoch": 7.738095238095238,
"grad_norm": 3.96875,
"learning_rate": 6.909755002452258e-05,
"loss": 4.4893,
"step": 17225
},
{
"epoch": 7.740341419586702,
"grad_norm": 3.8125,
"learning_rate": 6.902391245095426e-05,
"loss": 4.4256,
"step": 17230
},
{
"epoch": 7.742587601078167,
"grad_norm": 4.0625,
"learning_rate": 6.89503325704882e-05,
"loss": 4.4999,
"step": 17235
},
{
"epoch": 7.744833782569632,
"grad_norm": 3.75,
"learning_rate": 6.887681042735472e-05,
"loss": 4.4886,
"step": 17240
},
{
"epoch": 7.747079964061096,
"grad_norm": 4.0,
"learning_rate": 6.880334606574935e-05,
"loss": 4.4772,
"step": 17245
},
{
"epoch": 7.74932614555256,
"grad_norm": 3.640625,
"learning_rate": 6.872993952983303e-05,
"loss": 4.5186,
"step": 17250
},
{
"epoch": 7.751572327044025,
"grad_norm": 3.828125,
"learning_rate": 6.865659086373179e-05,
"loss": 4.5021,
"step": 17255
},
{
"epoch": 7.75381850853549,
"grad_norm": 4.0625,
"learning_rate": 6.858330011153697e-05,
"loss": 4.4495,
"step": 17260
},
{
"epoch": 7.756064690026954,
"grad_norm": 3.875,
"learning_rate": 6.851006731730514e-05,
"loss": 4.4986,
"step": 17265
},
{
"epoch": 7.758310871518419,
"grad_norm": 3.78125,
"learning_rate": 6.843689252505787e-05,
"loss": 4.4514,
"step": 17270
},
{
"epoch": 7.760557053009883,
"grad_norm": 4.0625,
"learning_rate": 6.836377577878207e-05,
"loss": 4.445,
"step": 17275
},
{
"epoch": 7.762803234501348,
"grad_norm": 3.84375,
"learning_rate": 6.82907171224295e-05,
"loss": 4.4659,
"step": 17280
},
{
"epoch": 7.765049415992812,
"grad_norm": 3.921875,
"learning_rate": 6.821771659991722e-05,
"loss": 4.4338,
"step": 17285
},
{
"epoch": 7.767295597484277,
"grad_norm": 3.90625,
"learning_rate": 6.81447742551273e-05,
"loss": 4.5099,
"step": 17290
},
{
"epoch": 7.769541778975741,
"grad_norm": 3.640625,
"learning_rate": 6.807189013190675e-05,
"loss": 4.4715,
"step": 17295
},
{
"epoch": 7.771787960467206,
"grad_norm": 4.0,
"learning_rate": 6.799906427406771e-05,
"loss": 4.469,
"step": 17300
},
{
"epoch": 7.77403414195867,
"grad_norm": 3.90625,
"learning_rate": 6.792629672538715e-05,
"loss": 4.5603,
"step": 17305
},
{
"epoch": 7.776280323450135,
"grad_norm": 3.8125,
"learning_rate": 6.78535875296071e-05,
"loss": 4.499,
"step": 17310
},
{
"epoch": 7.778526504941599,
"grad_norm": 3.90625,
"learning_rate": 6.778093673043453e-05,
"loss": 4.4613,
"step": 17315
},
{
"epoch": 7.780772686433064,
"grad_norm": 4.375,
"learning_rate": 6.770834437154115e-05,
"loss": 4.4875,
"step": 17320
},
{
"epoch": 7.783018867924528,
"grad_norm": 3.796875,
"learning_rate": 6.763581049656376e-05,
"loss": 4.5347,
"step": 17325
},
{
"epoch": 7.785265049415993,
"grad_norm": 3.953125,
"learning_rate": 6.756333514910379e-05,
"loss": 4.4883,
"step": 17330
},
{
"epoch": 7.787511230907457,
"grad_norm": 3.828125,
"learning_rate": 6.749091837272767e-05,
"loss": 4.5301,
"step": 17335
},
{
"epoch": 7.789757412398922,
"grad_norm": 4.21875,
"learning_rate": 6.741856021096647e-05,
"loss": 4.4522,
"step": 17340
},
{
"epoch": 7.792003593890386,
"grad_norm": 3.921875,
"learning_rate": 6.734626070731612e-05,
"loss": 4.4783,
"step": 17345
},
{
"epoch": 7.794249775381851,
"grad_norm": 4.0625,
"learning_rate": 6.727401990523731e-05,
"loss": 4.463,
"step": 17350
},
{
"epoch": 7.796495956873315,
"grad_norm": 3.859375,
"learning_rate": 6.720183784815531e-05,
"loss": 4.4799,
"step": 17355
},
{
"epoch": 7.79874213836478,
"grad_norm": 4.0625,
"learning_rate": 6.712971457946027e-05,
"loss": 4.4696,
"step": 17360
},
{
"epoch": 7.800988319856245,
"grad_norm": 3.84375,
"learning_rate": 6.70576501425068e-05,
"loss": 4.4441,
"step": 17365
},
{
"epoch": 7.803234501347709,
"grad_norm": 3.625,
"learning_rate": 6.698564458061429e-05,
"loss": 4.4721,
"step": 17370
},
{
"epoch": 7.805480682839173,
"grad_norm": 3.96875,
"learning_rate": 6.691369793706672e-05,
"loss": 4.4785,
"step": 17375
},
{
"epoch": 7.807726864330638,
"grad_norm": 3.890625,
"learning_rate": 6.684181025511254e-05,
"loss": 4.5319,
"step": 17380
},
{
"epoch": 7.809973045822103,
"grad_norm": 3.6875,
"learning_rate": 6.676998157796493e-05,
"loss": 4.5144,
"step": 17385
},
{
"epoch": 7.812219227313567,
"grad_norm": 3.9375,
"learning_rate": 6.669821194880144e-05,
"loss": 4.4935,
"step": 17390
},
{
"epoch": 7.814465408805032,
"grad_norm": 4.21875,
"learning_rate": 6.662650141076426e-05,
"loss": 4.4763,
"step": 17395
},
{
"epoch": 7.816711590296496,
"grad_norm": 3.8125,
"learning_rate": 6.655485000695993e-05,
"loss": 4.4597,
"step": 17400
},
{
"epoch": 7.818957771787961,
"grad_norm": 3.890625,
"learning_rate": 6.648325778045954e-05,
"loss": 4.4522,
"step": 17405
},
{
"epoch": 7.821203953279425,
"grad_norm": 3.96875,
"learning_rate": 6.641172477429864e-05,
"loss": 4.4301,
"step": 17410
},
{
"epoch": 7.82345013477089,
"grad_norm": 3.84375,
"learning_rate": 6.634025103147698e-05,
"loss": 4.4444,
"step": 17415
},
{
"epoch": 7.825696316262354,
"grad_norm": 3.875,
"learning_rate": 6.626883659495897e-05,
"loss": 4.5345,
"step": 17420
},
{
"epoch": 7.827942497753819,
"grad_norm": 3.90625,
"learning_rate": 6.61974815076731e-05,
"loss": 4.5533,
"step": 17425
},
{
"epoch": 7.830188679245283,
"grad_norm": 3.90625,
"learning_rate": 6.612618581251243e-05,
"loss": 4.4798,
"step": 17430
},
{
"epoch": 7.832434860736748,
"grad_norm": 3.953125,
"learning_rate": 6.605494955233412e-05,
"loss": 4.5134,
"step": 17435
},
{
"epoch": 7.834681042228212,
"grad_norm": 3.875,
"learning_rate": 6.598377276995963e-05,
"loss": 4.4622,
"step": 17440
},
{
"epoch": 7.836927223719677,
"grad_norm": 3.875,
"learning_rate": 6.591265550817483e-05,
"loss": 4.4938,
"step": 17445
},
{
"epoch": 7.839173405211141,
"grad_norm": 4.03125,
"learning_rate": 6.584159780972958e-05,
"loss": 4.4976,
"step": 17450
},
{
"epoch": 7.841419586702606,
"grad_norm": 4.09375,
"learning_rate": 6.577059971733813e-05,
"loss": 4.493,
"step": 17455
},
{
"epoch": 7.84366576819407,
"grad_norm": 4.0625,
"learning_rate": 6.569966127367885e-05,
"loss": 4.4949,
"step": 17460
},
{
"epoch": 7.845911949685535,
"grad_norm": 4.125,
"learning_rate": 6.562878252139411e-05,
"loss": 4.4956,
"step": 17465
},
{
"epoch": 7.848158131176999,
"grad_norm": 4.0,
"learning_rate": 6.555796350309065e-05,
"loss": 4.4718,
"step": 17470
},
{
"epoch": 7.850404312668464,
"grad_norm": 3.734375,
"learning_rate": 6.548720426133902e-05,
"loss": 4.4235,
"step": 17475
},
{
"epoch": 7.852650494159928,
"grad_norm": 3.796875,
"learning_rate": 6.541650483867413e-05,
"loss": 4.5444,
"step": 17480
},
{
"epoch": 7.854896675651393,
"grad_norm": 4.15625,
"learning_rate": 6.534586527759466e-05,
"loss": 4.5028,
"step": 17485
},
{
"epoch": 7.857142857142857,
"grad_norm": 3.953125,
"learning_rate": 6.52752856205635e-05,
"loss": 4.5085,
"step": 17490
},
{
"epoch": 7.859389038634322,
"grad_norm": 3.734375,
"learning_rate": 6.520476591000746e-05,
"loss": 4.4457,
"step": 17495
},
{
"epoch": 7.861635220125786,
"grad_norm": 3.953125,
"learning_rate": 6.51343061883173e-05,
"loss": 4.5553,
"step": 17500
},
{
"epoch": 7.863881401617251,
"grad_norm": 3.75,
"learning_rate": 6.506390649784776e-05,
"loss": 4.459,
"step": 17505
},
{
"epoch": 7.866127583108716,
"grad_norm": 4.28125,
"learning_rate": 6.499356688091743e-05,
"loss": 4.5444,
"step": 17510
},
{
"epoch": 7.86837376460018,
"grad_norm": 3.859375,
"learning_rate": 6.492328737980882e-05,
"loss": 4.5072,
"step": 17515
},
{
"epoch": 7.870619946091644,
"grad_norm": 3.9375,
"learning_rate": 6.48530680367684e-05,
"loss": 4.4354,
"step": 17520
},
{
"epoch": 7.872866127583109,
"grad_norm": 3.78125,
"learning_rate": 6.478290889400627e-05,
"loss": 4.5007,
"step": 17525
},
{
"epoch": 7.875112309074574,
"grad_norm": 3.84375,
"learning_rate": 6.471280999369657e-05,
"loss": 4.5356,
"step": 17530
},
{
"epoch": 7.877358490566038,
"grad_norm": 3.84375,
"learning_rate": 6.464277137797706e-05,
"loss": 4.5208,
"step": 17535
},
{
"epoch": 7.879604672057503,
"grad_norm": 3.90625,
"learning_rate": 6.457279308894932e-05,
"loss": 4.4958,
"step": 17540
},
{
"epoch": 7.881850853548967,
"grad_norm": 3.828125,
"learning_rate": 6.450287516867868e-05,
"loss": 4.5001,
"step": 17545
},
{
"epoch": 7.884097035040432,
"grad_norm": 3.953125,
"learning_rate": 6.443301765919417e-05,
"loss": 4.4255,
"step": 17550
},
{
"epoch": 7.886343216531896,
"grad_norm": 4.0625,
"learning_rate": 6.436322060248853e-05,
"loss": 4.4363,
"step": 17555
},
{
"epoch": 7.888589398023361,
"grad_norm": 3.859375,
"learning_rate": 6.429348404051806e-05,
"loss": 4.4416,
"step": 17560
},
{
"epoch": 7.890835579514825,
"grad_norm": 3.9375,
"learning_rate": 6.422380801520287e-05,
"loss": 4.4451,
"step": 17565
},
{
"epoch": 7.8930817610062896,
"grad_norm": 4.03125,
"learning_rate": 6.415419256842646e-05,
"loss": 4.4769,
"step": 17570
},
{
"epoch": 7.895327942497754,
"grad_norm": 4.125,
"learning_rate": 6.408463774203619e-05,
"loss": 4.5419,
"step": 17575
},
{
"epoch": 7.8975741239892185,
"grad_norm": 3.921875,
"learning_rate": 6.401514357784267e-05,
"loss": 4.5194,
"step": 17580
},
{
"epoch": 7.899820305480683,
"grad_norm": 3.6875,
"learning_rate": 6.394571011762029e-05,
"loss": 4.4502,
"step": 17585
},
{
"epoch": 7.9020664869721475,
"grad_norm": 3.984375,
"learning_rate": 6.387633740310687e-05,
"loss": 4.4928,
"step": 17590
},
{
"epoch": 7.904312668463612,
"grad_norm": 4.1875,
"learning_rate": 6.380702547600368e-05,
"loss": 4.497,
"step": 17595
},
{
"epoch": 7.9065588499550765,
"grad_norm": 4.0625,
"learning_rate": 6.373777437797543e-05,
"loss": 4.4634,
"step": 17600
},
{
"epoch": 7.908805031446541,
"grad_norm": 3.96875,
"learning_rate": 6.366858415065036e-05,
"loss": 4.5166,
"step": 17605
},
{
"epoch": 7.9110512129380055,
"grad_norm": 4.0,
"learning_rate": 6.359945483562007e-05,
"loss": 4.503,
"step": 17610
},
{
"epoch": 7.9132973944294696,
"grad_norm": 3.765625,
"learning_rate": 6.353038647443952e-05,
"loss": 4.4697,
"step": 17615
},
{
"epoch": 7.9155435759209345,
"grad_norm": 4.03125,
"learning_rate": 6.346137910862707e-05,
"loss": 4.4233,
"step": 17620
},
{
"epoch": 7.9177897574123985,
"grad_norm": 3.875,
"learning_rate": 6.339243277966438e-05,
"loss": 4.4729,
"step": 17625
},
{
"epoch": 7.9200359389038635,
"grad_norm": 4.15625,
"learning_rate": 6.332354752899643e-05,
"loss": 4.5412,
"step": 17630
},
{
"epoch": 7.922282120395328,
"grad_norm": 3.84375,
"learning_rate": 6.325472339803149e-05,
"loss": 4.5044,
"step": 17635
},
{
"epoch": 7.9245283018867925,
"grad_norm": 4.03125,
"learning_rate": 6.318596042814116e-05,
"loss": 4.4582,
"step": 17640
},
{
"epoch": 7.9267744833782565,
"grad_norm": 3.8125,
"learning_rate": 6.311725866066012e-05,
"loss": 4.4888,
"step": 17645
},
{
"epoch": 7.9290206648697215,
"grad_norm": 4.15625,
"learning_rate": 6.304861813688639e-05,
"loss": 4.5026,
"step": 17650
},
{
"epoch": 7.931266846361186,
"grad_norm": 3.875,
"learning_rate": 6.298003889808108e-05,
"loss": 4.5242,
"step": 17655
},
{
"epoch": 7.9335130278526504,
"grad_norm": 3.71875,
"learning_rate": 6.291152098546856e-05,
"loss": 4.4611,
"step": 17660
},
{
"epoch": 7.935759209344115,
"grad_norm": 3.953125,
"learning_rate": 6.28430644402363e-05,
"loss": 4.3842,
"step": 17665
},
{
"epoch": 7.938005390835579,
"grad_norm": 3.71875,
"learning_rate": 6.277466930353481e-05,
"loss": 4.5028,
"step": 17670
},
{
"epoch": 7.940251572327044,
"grad_norm": 3.828125,
"learning_rate": 6.270633561647781e-05,
"loss": 4.4906,
"step": 17675
},
{
"epoch": 7.942497753818508,
"grad_norm": 3.921875,
"learning_rate": 6.263806342014195e-05,
"loss": 4.4686,
"step": 17680
},
{
"epoch": 7.944743935309973,
"grad_norm": 4.125,
"learning_rate": 6.256985275556704e-05,
"loss": 4.5212,
"step": 17685
},
{
"epoch": 7.946990116801437,
"grad_norm": 3.921875,
"learning_rate": 6.250170366375578e-05,
"loss": 4.471,
"step": 17690
},
{
"epoch": 7.949236298292902,
"grad_norm": 3.921875,
"learning_rate": 6.243361618567395e-05,
"loss": 4.5291,
"step": 17695
},
{
"epoch": 7.951482479784366,
"grad_norm": 4.09375,
"learning_rate": 6.236559036225033e-05,
"loss": 4.4481,
"step": 17700
},
{
"epoch": 7.953728661275831,
"grad_norm": 3.796875,
"learning_rate": 6.229762623437642e-05,
"loss": 4.4311,
"step": 17705
},
{
"epoch": 7.955974842767295,
"grad_norm": 3.875,
"learning_rate": 6.222972384290699e-05,
"loss": 4.455,
"step": 17710
},
{
"epoch": 7.95822102425876,
"grad_norm": 3.78125,
"learning_rate": 6.21618832286593e-05,
"loss": 4.5304,
"step": 17715
},
{
"epoch": 7.960467205750224,
"grad_norm": 3.734375,
"learning_rate": 6.209410443241376e-05,
"loss": 4.5395,
"step": 17720
},
{
"epoch": 7.962713387241689,
"grad_norm": 3.875,
"learning_rate": 6.202638749491355e-05,
"loss": 4.4199,
"step": 17725
},
{
"epoch": 7.964959568733153,
"grad_norm": 4.0,
"learning_rate": 6.19587324568646e-05,
"loss": 4.4652,
"step": 17730
},
{
"epoch": 7.967205750224618,
"grad_norm": 3.796875,
"learning_rate": 6.189113935893571e-05,
"loss": 4.5211,
"step": 17735
},
{
"epoch": 7.969451931716082,
"grad_norm": 3.859375,
"learning_rate": 6.182360824175837e-05,
"loss": 4.4731,
"step": 17740
},
{
"epoch": 7.971698113207547,
"grad_norm": 3.859375,
"learning_rate": 6.175613914592691e-05,
"loss": 4.4723,
"step": 17745
},
{
"epoch": 7.973944294699011,
"grad_norm": 4.03125,
"learning_rate": 6.168873211199829e-05,
"loss": 4.4953,
"step": 17750
},
{
"epoch": 7.976190476190476,
"grad_norm": 3.890625,
"learning_rate": 6.162138718049216e-05,
"loss": 4.4922,
"step": 17755
},
{
"epoch": 7.97843665768194,
"grad_norm": 3.859375,
"learning_rate": 6.155410439189095e-05,
"loss": 4.4632,
"step": 17760
},
{
"epoch": 7.980682839173405,
"grad_norm": 3.9375,
"learning_rate": 6.148688378663958e-05,
"loss": 4.4894,
"step": 17765
},
{
"epoch": 7.982929020664869,
"grad_norm": 3.953125,
"learning_rate": 6.141972540514572e-05,
"loss": 4.5014,
"step": 17770
},
{
"epoch": 7.985175202156334,
"grad_norm": 4.03125,
"learning_rate": 6.135262928777962e-05,
"loss": 4.4312,
"step": 17775
},
{
"epoch": 7.987421383647799,
"grad_norm": 4.03125,
"learning_rate": 6.128559547487397e-05,
"loss": 4.4561,
"step": 17780
},
{
"epoch": 7.989667565139263,
"grad_norm": 3.96875,
"learning_rate": 6.12186240067242e-05,
"loss": 4.5294,
"step": 17785
},
{
"epoch": 7.991913746630727,
"grad_norm": 3.90625,
"learning_rate": 6.115171492358809e-05,
"loss": 4.4025,
"step": 17790
},
{
"epoch": 7.994159928122192,
"grad_norm": 3.8125,
"learning_rate": 6.108486826568607e-05,
"loss": 4.4746,
"step": 17795
},
{
"epoch": 7.996406109613657,
"grad_norm": 3.890625,
"learning_rate": 6.1018084073200906e-05,
"loss": 4.5202,
"step": 17800
},
{
"epoch": 7.998652291105121,
"grad_norm": 4.0,
"learning_rate": 6.095136238627792e-05,
"loss": 4.5089,
"step": 17805
},
{
"epoch": 8.000898472596585,
"grad_norm": 4.125,
"learning_rate": 6.088470324502486e-05,
"loss": 4.4478,
"step": 17810
},
{
"epoch": 8.00314465408805,
"grad_norm": 3.796875,
"learning_rate": 6.081810668951174e-05,
"loss": 4.4606,
"step": 17815
},
{
"epoch": 8.005390835579515,
"grad_norm": 4.03125,
"learning_rate": 6.0751572759771165e-05,
"loss": 4.5093,
"step": 17820
},
{
"epoch": 8.00763701707098,
"grad_norm": 3.84375,
"learning_rate": 6.068510149579786e-05,
"loss": 4.5073,
"step": 17825
},
{
"epoch": 8.009883198562443,
"grad_norm": 3.625,
"learning_rate": 6.0618692937549105e-05,
"loss": 4.5232,
"step": 17830
},
{
"epoch": 8.012129380053908,
"grad_norm": 3.984375,
"learning_rate": 6.055234712494431e-05,
"loss": 4.473,
"step": 17835
},
{
"epoch": 8.014375561545373,
"grad_norm": 3.984375,
"learning_rate": 6.0486064097865263e-05,
"loss": 4.428,
"step": 17840
},
{
"epoch": 8.016621743036838,
"grad_norm": 3.6875,
"learning_rate": 6.041984389615605e-05,
"loss": 4.4789,
"step": 17845
},
{
"epoch": 8.018867924528301,
"grad_norm": 3.953125,
"learning_rate": 6.0353686559622816e-05,
"loss": 4.5115,
"step": 17850
},
{
"epoch": 8.021114106019766,
"grad_norm": 3.9375,
"learning_rate": 6.0287592128034146e-05,
"loss": 4.4342,
"step": 17855
},
{
"epoch": 8.023360287511231,
"grad_norm": 4.03125,
"learning_rate": 6.022156064112057e-05,
"loss": 4.4225,
"step": 17860
},
{
"epoch": 8.025606469002696,
"grad_norm": 3.9375,
"learning_rate": 6.0155592138574985e-05,
"loss": 4.499,
"step": 17865
},
{
"epoch": 8.02785265049416,
"grad_norm": 4.03125,
"learning_rate": 6.0089686660052366e-05,
"loss": 4.4441,
"step": 17870
},
{
"epoch": 8.030098831985624,
"grad_norm": 3.953125,
"learning_rate": 6.0023844245169716e-05,
"loss": 4.54,
"step": 17875
},
{
"epoch": 8.032345013477089,
"grad_norm": 3.796875,
"learning_rate": 5.9958064933506276e-05,
"loss": 4.4182,
"step": 17880
},
{
"epoch": 8.034591194968554,
"grad_norm": 4.15625,
"learning_rate": 5.9892348764603184e-05,
"loss": 4.4358,
"step": 17885
},
{
"epoch": 8.036837376460017,
"grad_norm": 4.125,
"learning_rate": 5.9826695777963815e-05,
"loss": 4.481,
"step": 17890
},
{
"epoch": 8.039083557951482,
"grad_norm": 3.875,
"learning_rate": 5.976110601305337e-05,
"loss": 4.5304,
"step": 17895
},
{
"epoch": 8.041329739442947,
"grad_norm": 4.0625,
"learning_rate": 5.969557950929916e-05,
"loss": 4.3914,
"step": 17900
},
{
"epoch": 8.043575920934412,
"grad_norm": 4.0,
"learning_rate": 5.9630116306090515e-05,
"loss": 4.5243,
"step": 17905
},
{
"epoch": 8.045822102425875,
"grad_norm": 4.125,
"learning_rate": 5.95647164427786e-05,
"loss": 4.5211,
"step": 17910
},
{
"epoch": 8.04806828391734,
"grad_norm": 4.09375,
"learning_rate": 5.94993799586765e-05,
"loss": 4.4862,
"step": 17915
},
{
"epoch": 8.050314465408805,
"grad_norm": 3.703125,
"learning_rate": 5.943410689305936e-05,
"loss": 4.484,
"step": 17920
},
{
"epoch": 8.05256064690027,
"grad_norm": 3.640625,
"learning_rate": 5.936889728516398e-05,
"loss": 4.4546,
"step": 17925
},
{
"epoch": 8.054806828391735,
"grad_norm": 3.9375,
"learning_rate": 5.9303751174189235e-05,
"loss": 4.5075,
"step": 17930
},
{
"epoch": 8.057053009883198,
"grad_norm": 3.84375,
"learning_rate": 5.923866859929563e-05,
"loss": 4.5261,
"step": 17935
},
{
"epoch": 8.059299191374663,
"grad_norm": 3.859375,
"learning_rate": 5.9173649599605665e-05,
"loss": 4.5253,
"step": 17940
},
{
"epoch": 8.061545372866128,
"grad_norm": 3.953125,
"learning_rate": 5.9108694214203454e-05,
"loss": 4.4496,
"step": 17945
},
{
"epoch": 8.063791554357593,
"grad_norm": 3.796875,
"learning_rate": 5.904380248213497e-05,
"loss": 4.4981,
"step": 17950
},
{
"epoch": 8.066037735849056,
"grad_norm": 3.828125,
"learning_rate": 5.8978974442407945e-05,
"loss": 4.4188,
"step": 17955
},
{
"epoch": 8.068283917340521,
"grad_norm": 4.03125,
"learning_rate": 5.891421013399173e-05,
"loss": 4.496,
"step": 17960
},
{
"epoch": 8.070530098831986,
"grad_norm": 4.09375,
"learning_rate": 5.884950959581748e-05,
"loss": 4.4931,
"step": 17965
},
{
"epoch": 8.07277628032345,
"grad_norm": 4.03125,
"learning_rate": 5.878487286677785e-05,
"loss": 4.5287,
"step": 17970
},
{
"epoch": 8.075022461814914,
"grad_norm": 3.984375,
"learning_rate": 5.872029998572735e-05,
"loss": 4.4908,
"step": 17975
},
{
"epoch": 8.077268643306379,
"grad_norm": 3.84375,
"learning_rate": 5.86557909914819e-05,
"loss": 4.4313,
"step": 17980
},
{
"epoch": 8.079514824797844,
"grad_norm": 4.0625,
"learning_rate": 5.859134592281918e-05,
"loss": 4.4793,
"step": 17985
},
{
"epoch": 8.081761006289309,
"grad_norm": 3.859375,
"learning_rate": 5.8526964818478395e-05,
"loss": 4.4486,
"step": 17990
},
{
"epoch": 8.084007187780772,
"grad_norm": 3.828125,
"learning_rate": 5.846264771716024e-05,
"loss": 4.4387,
"step": 17995
},
{
"epoch": 8.086253369272237,
"grad_norm": 4.0,
"learning_rate": 5.839839465752702e-05,
"loss": 4.5099,
"step": 18000
},
{
"epoch": 8.086253369272237,
"eval_loss": 4.780518054962158,
"eval_runtime": 15.9734,
"eval_samples_per_second": 1941.539,
"eval_steps_per_second": 242.716,
"step": 18000
},
{
"epoch": 8.088499550763702,
"grad_norm": 4.0,
"learning_rate": 5.8334205678202464e-05,
"loss": 4.4045,
"step": 18005
},
{
"epoch": 8.090745732255167,
"grad_norm": 4.03125,
"learning_rate": 5.827008081777183e-05,
"loss": 4.4716,
"step": 18010
},
{
"epoch": 8.09299191374663,
"grad_norm": 3.859375,
"learning_rate": 5.8206020114781895e-05,
"loss": 4.4368,
"step": 18015
},
{
"epoch": 8.095238095238095,
"grad_norm": 3.796875,
"learning_rate": 5.8142023607740695e-05,
"loss": 4.427,
"step": 18020
},
{
"epoch": 8.09748427672956,
"grad_norm": 4.0,
"learning_rate": 5.807809133511786e-05,
"loss": 4.4108,
"step": 18025
},
{
"epoch": 8.099730458221025,
"grad_norm": 4.0,
"learning_rate": 5.801422333534426e-05,
"loss": 4.4607,
"step": 18030
},
{
"epoch": 8.101976639712488,
"grad_norm": 4.0625,
"learning_rate": 5.7950419646812294e-05,
"loss": 4.4477,
"step": 18035
},
{
"epoch": 8.104222821203953,
"grad_norm": 3.890625,
"learning_rate": 5.788668030787551e-05,
"loss": 4.4343,
"step": 18040
},
{
"epoch": 8.106469002695418,
"grad_norm": 3.8125,
"learning_rate": 5.782300535684891e-05,
"loss": 4.4925,
"step": 18045
},
{
"epoch": 8.108715184186883,
"grad_norm": 3.953125,
"learning_rate": 5.7759394832008776e-05,
"loss": 4.4611,
"step": 18050
},
{
"epoch": 8.110961365678348,
"grad_norm": 4.09375,
"learning_rate": 5.76958487715926e-05,
"loss": 4.4711,
"step": 18055
},
{
"epoch": 8.11320754716981,
"grad_norm": 3.890625,
"learning_rate": 5.763236721379919e-05,
"loss": 4.4197,
"step": 18060
},
{
"epoch": 8.115453728661276,
"grad_norm": 3.71875,
"learning_rate": 5.756895019678849e-05,
"loss": 4.493,
"step": 18065
},
{
"epoch": 8.11769991015274,
"grad_norm": 4.0,
"learning_rate": 5.750559775868181e-05,
"loss": 4.4368,
"step": 18070
},
{
"epoch": 8.119946091644206,
"grad_norm": 4.03125,
"learning_rate": 5.744230993756148e-05,
"loss": 4.4406,
"step": 18075
},
{
"epoch": 8.122192273135669,
"grad_norm": 3.78125,
"learning_rate": 5.737908677147101e-05,
"loss": 4.4617,
"step": 18080
},
{
"epoch": 8.124438454627134,
"grad_norm": 4.03125,
"learning_rate": 5.731592829841516e-05,
"loss": 4.511,
"step": 18085
},
{
"epoch": 8.126684636118599,
"grad_norm": 3.90625,
"learning_rate": 5.725283455635965e-05,
"loss": 4.4128,
"step": 18090
},
{
"epoch": 8.128930817610064,
"grad_norm": 3.921875,
"learning_rate": 5.718980558323139e-05,
"loss": 4.4366,
"step": 18095
},
{
"epoch": 8.131176999101527,
"grad_norm": 4.0625,
"learning_rate": 5.712684141691836e-05,
"loss": 4.4686,
"step": 18100
},
{
"epoch": 8.133423180592992,
"grad_norm": 3.96875,
"learning_rate": 5.7063942095269505e-05,
"loss": 4.4801,
"step": 18105
},
{
"epoch": 8.135669362084457,
"grad_norm": 3.953125,
"learning_rate": 5.7001107656094893e-05,
"loss": 4.4497,
"step": 18110
},
{
"epoch": 8.137915543575922,
"grad_norm": 4.09375,
"learning_rate": 5.693833813716546e-05,
"loss": 4.4774,
"step": 18115
},
{
"epoch": 8.140161725067385,
"grad_norm": 3.84375,
"learning_rate": 5.687563357621321e-05,
"loss": 4.4912,
"step": 18120
},
{
"epoch": 8.14240790655885,
"grad_norm": 3.9375,
"learning_rate": 5.6812994010931146e-05,
"loss": 4.4964,
"step": 18125
},
{
"epoch": 8.144654088050315,
"grad_norm": 3.953125,
"learning_rate": 5.675041947897303e-05,
"loss": 4.4952,
"step": 18130
},
{
"epoch": 8.14690026954178,
"grad_norm": 3.859375,
"learning_rate": 5.6687910017953755e-05,
"loss": 4.4608,
"step": 18135
},
{
"epoch": 8.149146451033243,
"grad_norm": 3.953125,
"learning_rate": 5.662546566544886e-05,
"loss": 4.4754,
"step": 18140
},
{
"epoch": 8.151392632524708,
"grad_norm": 3.96875,
"learning_rate": 5.656308645899498e-05,
"loss": 4.4565,
"step": 18145
},
{
"epoch": 8.153638814016173,
"grad_norm": 3.765625,
"learning_rate": 5.650077243608937e-05,
"loss": 4.4769,
"step": 18150
},
{
"epoch": 8.155884995507638,
"grad_norm": 4.125,
"learning_rate": 5.643852363419027e-05,
"loss": 4.4776,
"step": 18155
},
{
"epoch": 8.1581311769991,
"grad_norm": 4.125,
"learning_rate": 5.637634009071666e-05,
"loss": 4.4136,
"step": 18160
},
{
"epoch": 8.160377358490566,
"grad_norm": 3.9375,
"learning_rate": 5.631422184304822e-05,
"loss": 4.5088,
"step": 18165
},
{
"epoch": 8.16262353998203,
"grad_norm": 3.84375,
"learning_rate": 5.625216892852553e-05,
"loss": 4.5145,
"step": 18170
},
{
"epoch": 8.164869721473496,
"grad_norm": 4.1875,
"learning_rate": 5.6190181384449726e-05,
"loss": 4.458,
"step": 18175
},
{
"epoch": 8.167115902964959,
"grad_norm": 3.96875,
"learning_rate": 5.6128259248082795e-05,
"loss": 4.4935,
"step": 18180
},
{
"epoch": 8.169362084456424,
"grad_norm": 4.03125,
"learning_rate": 5.6066402556647306e-05,
"loss": 4.439,
"step": 18185
},
{
"epoch": 8.171608265947889,
"grad_norm": 3.828125,
"learning_rate": 5.600461134732651e-05,
"loss": 4.4209,
"step": 18190
},
{
"epoch": 8.173854447439354,
"grad_norm": 3.921875,
"learning_rate": 5.5942885657264406e-05,
"loss": 4.4214,
"step": 18195
},
{
"epoch": 8.176100628930818,
"grad_norm": 3.859375,
"learning_rate": 5.588122552356538e-05,
"loss": 4.4687,
"step": 18200
},
{
"epoch": 8.178346810422282,
"grad_norm": 3.953125,
"learning_rate": 5.5819630983294655e-05,
"loss": 4.4344,
"step": 18205
},
{
"epoch": 8.180592991913747,
"grad_norm": 3.84375,
"learning_rate": 5.575810207347785e-05,
"loss": 4.4538,
"step": 18210
},
{
"epoch": 8.182839173405211,
"grad_norm": 4.15625,
"learning_rate": 5.569663883110118e-05,
"loss": 4.4291,
"step": 18215
},
{
"epoch": 8.185085354896676,
"grad_norm": 4.09375,
"learning_rate": 5.563524129311149e-05,
"loss": 4.4979,
"step": 18220
},
{
"epoch": 8.18733153638814,
"grad_norm": 4.21875,
"learning_rate": 5.557390949641598e-05,
"loss": 4.4595,
"step": 18225
},
{
"epoch": 8.189577717879605,
"grad_norm": 4.09375,
"learning_rate": 5.551264347788241e-05,
"loss": 4.4759,
"step": 18230
},
{
"epoch": 8.19182389937107,
"grad_norm": 4.0625,
"learning_rate": 5.5451443274338915e-05,
"loss": 4.4531,
"step": 18235
},
{
"epoch": 8.194070080862534,
"grad_norm": 3.859375,
"learning_rate": 5.53903089225742e-05,
"loss": 4.4832,
"step": 18240
},
{
"epoch": 8.196316262353998,
"grad_norm": 3.75,
"learning_rate": 5.5329240459337316e-05,
"loss": 4.4841,
"step": 18245
},
{
"epoch": 8.198562443845463,
"grad_norm": 3.734375,
"learning_rate": 5.5268237921337674e-05,
"loss": 4.4415,
"step": 18250
},
{
"epoch": 8.200808625336927,
"grad_norm": 3.9375,
"learning_rate": 5.5207301345245166e-05,
"loss": 4.4937,
"step": 18255
},
{
"epoch": 8.203054806828392,
"grad_norm": 4.0625,
"learning_rate": 5.514643076768986e-05,
"loss": 4.3784,
"step": 18260
},
{
"epoch": 8.205300988319856,
"grad_norm": 4.09375,
"learning_rate": 5.5085626225262305e-05,
"loss": 4.5438,
"step": 18265
},
{
"epoch": 8.20754716981132,
"grad_norm": 3.78125,
"learning_rate": 5.5024887754513314e-05,
"loss": 4.4229,
"step": 18270
},
{
"epoch": 8.209793351302785,
"grad_norm": 4.09375,
"learning_rate": 5.496421539195394e-05,
"loss": 4.4617,
"step": 18275
},
{
"epoch": 8.21203953279425,
"grad_norm": 4.3125,
"learning_rate": 5.4903609174055566e-05,
"loss": 4.4654,
"step": 18280
},
{
"epoch": 8.214285714285714,
"grad_norm": 4.125,
"learning_rate": 5.4843069137249694e-05,
"loss": 4.4163,
"step": 18285
},
{
"epoch": 8.216531895777178,
"grad_norm": 3.765625,
"learning_rate": 5.4782595317928205e-05,
"loss": 4.5016,
"step": 18290
},
{
"epoch": 8.218778077268643,
"grad_norm": 4.09375,
"learning_rate": 5.472218775244305e-05,
"loss": 4.3907,
"step": 18295
},
{
"epoch": 8.221024258760108,
"grad_norm": 3.796875,
"learning_rate": 5.46618464771064e-05,
"loss": 4.4023,
"step": 18300
},
{
"epoch": 8.223270440251572,
"grad_norm": 3.8125,
"learning_rate": 5.460157152819061e-05,
"loss": 4.4586,
"step": 18305
},
{
"epoch": 8.225516621743036,
"grad_norm": 4.21875,
"learning_rate": 5.4541362941928076e-05,
"loss": 4.4928,
"step": 18310
},
{
"epoch": 8.227762803234501,
"grad_norm": 4.28125,
"learning_rate": 5.4481220754511434e-05,
"loss": 4.3868,
"step": 18315
},
{
"epoch": 8.230008984725966,
"grad_norm": 3.828125,
"learning_rate": 5.442114500209324e-05,
"loss": 4.4299,
"step": 18320
},
{
"epoch": 8.232255166217431,
"grad_norm": 4.0625,
"learning_rate": 5.436113572078625e-05,
"loss": 4.4576,
"step": 18325
},
{
"epoch": 8.234501347708894,
"grad_norm": 3.953125,
"learning_rate": 5.43011929466632e-05,
"loss": 4.4762,
"step": 18330
},
{
"epoch": 8.23674752920036,
"grad_norm": 4.15625,
"learning_rate": 5.424131671575686e-05,
"loss": 4.5095,
"step": 18335
},
{
"epoch": 8.238993710691824,
"grad_norm": 3.890625,
"learning_rate": 5.418150706406007e-05,
"loss": 4.4108,
"step": 18340
},
{
"epoch": 8.24123989218329,
"grad_norm": 3.71875,
"learning_rate": 5.412176402752546e-05,
"loss": 4.3927,
"step": 18345
},
{
"epoch": 8.243486073674752,
"grad_norm": 3.890625,
"learning_rate": 5.406208764206585e-05,
"loss": 4.4245,
"step": 18350
},
{
"epoch": 8.245732255166217,
"grad_norm": 3.6875,
"learning_rate": 5.4002477943553816e-05,
"loss": 4.465,
"step": 18355
},
{
"epoch": 8.247978436657682,
"grad_norm": 3.671875,
"learning_rate": 5.39429349678219e-05,
"loss": 4.4656,
"step": 18360
},
{
"epoch": 8.250224618149147,
"grad_norm": 3.890625,
"learning_rate": 5.388345875066264e-05,
"loss": 4.479,
"step": 18365
},
{
"epoch": 8.25247079964061,
"grad_norm": 3.859375,
"learning_rate": 5.3824049327828245e-05,
"loss": 4.3851,
"step": 18370
},
{
"epoch": 8.254716981132075,
"grad_norm": 3.90625,
"learning_rate": 5.376470673503096e-05,
"loss": 4.4901,
"step": 18375
},
{
"epoch": 8.25696316262354,
"grad_norm": 4.0,
"learning_rate": 5.370543100794273e-05,
"loss": 4.4664,
"step": 18380
},
{
"epoch": 8.259209344115005,
"grad_norm": 4.0,
"learning_rate": 5.3646222182195366e-05,
"loss": 4.4547,
"step": 18385
},
{
"epoch": 8.261455525606468,
"grad_norm": 3.78125,
"learning_rate": 5.358708029338048e-05,
"loss": 4.4895,
"step": 18390
},
{
"epoch": 8.263701707097933,
"grad_norm": 4.125,
"learning_rate": 5.352800537704936e-05,
"loss": 4.4985,
"step": 18395
},
{
"epoch": 8.265947888589398,
"grad_norm": 4.09375,
"learning_rate": 5.346899746871313e-05,
"loss": 4.4499,
"step": 18400
},
{
"epoch": 8.268194070080863,
"grad_norm": 4.28125,
"learning_rate": 5.341005660384257e-05,
"loss": 4.5527,
"step": 18405
},
{
"epoch": 8.270440251572326,
"grad_norm": 3.875,
"learning_rate": 5.3351182817868216e-05,
"loss": 4.4724,
"step": 18410
},
{
"epoch": 8.272686433063791,
"grad_norm": 3.96875,
"learning_rate": 5.329237614618028e-05,
"loss": 4.4431,
"step": 18415
},
{
"epoch": 8.274932614555256,
"grad_norm": 4.0,
"learning_rate": 5.323363662412851e-05,
"loss": 4.4714,
"step": 18420
},
{
"epoch": 8.277178796046721,
"grad_norm": 4.28125,
"learning_rate": 5.3174964287022474e-05,
"loss": 4.4337,
"step": 18425
},
{
"epoch": 8.279424977538184,
"grad_norm": 3.96875,
"learning_rate": 5.311635917013118e-05,
"loss": 4.4529,
"step": 18430
},
{
"epoch": 8.28167115902965,
"grad_norm": 3.9375,
"learning_rate": 5.305782130868341e-05,
"loss": 4.435,
"step": 18435
},
{
"epoch": 8.283917340521114,
"grad_norm": 3.84375,
"learning_rate": 5.2999350737867296e-05,
"loss": 4.4485,
"step": 18440
},
{
"epoch": 8.286163522012579,
"grad_norm": 3.953125,
"learning_rate": 5.294094749283072e-05,
"loss": 4.4753,
"step": 18445
},
{
"epoch": 8.288409703504042,
"grad_norm": 4.0,
"learning_rate": 5.2882611608681024e-05,
"loss": 4.4287,
"step": 18450
},
{
"epoch": 8.290655884995507,
"grad_norm": 3.671875,
"learning_rate": 5.282434312048499e-05,
"loss": 4.4659,
"step": 18455
},
{
"epoch": 8.292902066486972,
"grad_norm": 3.890625,
"learning_rate": 5.276614206326898e-05,
"loss": 4.4461,
"step": 18460
},
{
"epoch": 8.295148247978437,
"grad_norm": 3.96875,
"learning_rate": 5.2708008472018786e-05,
"loss": 4.4968,
"step": 18465
},
{
"epoch": 8.297394429469902,
"grad_norm": 3.953125,
"learning_rate": 5.2649942381679626e-05,
"loss": 4.4595,
"step": 18470
},
{
"epoch": 8.299640610961365,
"grad_norm": 4.03125,
"learning_rate": 5.259194382715623e-05,
"loss": 4.4837,
"step": 18475
},
{
"epoch": 8.30188679245283,
"grad_norm": 4.125,
"learning_rate": 5.253401284331256e-05,
"loss": 4.4814,
"step": 18480
},
{
"epoch": 8.304132973944295,
"grad_norm": 4.03125,
"learning_rate": 5.247614946497215e-05,
"loss": 4.4858,
"step": 18485
},
{
"epoch": 8.30637915543576,
"grad_norm": 3.734375,
"learning_rate": 5.241835372691774e-05,
"loss": 4.4735,
"step": 18490
},
{
"epoch": 8.308625336927223,
"grad_norm": 4.25,
"learning_rate": 5.236062566389155e-05,
"loss": 4.4556,
"step": 18495
},
{
"epoch": 8.310871518418688,
"grad_norm": 4.0625,
"learning_rate": 5.230296531059497e-05,
"loss": 4.3911,
"step": 18500
},
{
"epoch": 8.313117699910153,
"grad_norm": 3.953125,
"learning_rate": 5.22453727016888e-05,
"loss": 4.4873,
"step": 18505
},
{
"epoch": 8.315363881401618,
"grad_norm": 3.703125,
"learning_rate": 5.2187847871793134e-05,
"loss": 4.4996,
"step": 18510
},
{
"epoch": 8.317610062893081,
"grad_norm": 3.9375,
"learning_rate": 5.213039085548716e-05,
"loss": 4.4138,
"step": 18515
},
{
"epoch": 8.319856244384546,
"grad_norm": 3.859375,
"learning_rate": 5.207300168730952e-05,
"loss": 4.4796,
"step": 18520
},
{
"epoch": 8.322102425876011,
"grad_norm": 4.0625,
"learning_rate": 5.20156804017579e-05,
"loss": 4.4773,
"step": 18525
},
{
"epoch": 8.324348607367476,
"grad_norm": 3.859375,
"learning_rate": 5.1958427033289304e-05,
"loss": 4.468,
"step": 18530
},
{
"epoch": 8.326594788858939,
"grad_norm": 3.84375,
"learning_rate": 5.190124161631977e-05,
"loss": 4.4441,
"step": 18535
},
{
"epoch": 8.328840970350404,
"grad_norm": 3.875,
"learning_rate": 5.18441241852246e-05,
"loss": 4.4903,
"step": 18540
},
{
"epoch": 8.331087151841869,
"grad_norm": 4.09375,
"learning_rate": 5.178707477433829e-05,
"loss": 4.4684,
"step": 18545
},
{
"epoch": 8.333333333333334,
"grad_norm": 4.0,
"learning_rate": 5.1730093417954214e-05,
"loss": 4.3903,
"step": 18550
},
{
"epoch": 8.335579514824797,
"grad_norm": 3.953125,
"learning_rate": 5.167318015032504e-05,
"loss": 4.4638,
"step": 18555
},
{
"epoch": 8.337825696316262,
"grad_norm": 3.875,
"learning_rate": 5.161633500566249e-05,
"loss": 4.4414,
"step": 18560
},
{
"epoch": 8.340071877807727,
"grad_norm": 4.0,
"learning_rate": 5.155955801813721e-05,
"loss": 4.4694,
"step": 18565
},
{
"epoch": 8.342318059299192,
"grad_norm": 3.96875,
"learning_rate": 5.150284922187902e-05,
"loss": 4.4675,
"step": 18570
},
{
"epoch": 8.344564240790655,
"grad_norm": 3.8125,
"learning_rate": 5.1446208650976645e-05,
"loss": 4.4494,
"step": 18575
},
{
"epoch": 8.34681042228212,
"grad_norm": 3.859375,
"learning_rate": 5.138963633947789e-05,
"loss": 4.4237,
"step": 18580
},
{
"epoch": 8.349056603773585,
"grad_norm": 4.15625,
"learning_rate": 5.133313232138942e-05,
"loss": 4.4784,
"step": 18585
},
{
"epoch": 8.35130278526505,
"grad_norm": 3.890625,
"learning_rate": 5.127669663067691e-05,
"loss": 4.4938,
"step": 18590
},
{
"epoch": 8.353548966756513,
"grad_norm": 4.0,
"learning_rate": 5.122032930126502e-05,
"loss": 4.4414,
"step": 18595
},
{
"epoch": 8.355795148247978,
"grad_norm": 4.09375,
"learning_rate": 5.1164030367037166e-05,
"loss": 4.4891,
"step": 18600
},
{
"epoch": 8.358041329739443,
"grad_norm": 3.984375,
"learning_rate": 5.1107799861835827e-05,
"loss": 4.4619,
"step": 18605
},
{
"epoch": 8.360287511230908,
"grad_norm": 4.25,
"learning_rate": 5.105163781946217e-05,
"loss": 4.4723,
"step": 18610
},
{
"epoch": 8.362533692722373,
"grad_norm": 4.0625,
"learning_rate": 5.0995544273676335e-05,
"loss": 4.462,
"step": 18615
},
{
"epoch": 8.364779874213836,
"grad_norm": 4.21875,
"learning_rate": 5.0939519258197314e-05,
"loss": 4.4285,
"step": 18620
},
{
"epoch": 8.367026055705301,
"grad_norm": 3.890625,
"learning_rate": 5.0883562806702725e-05,
"loss": 4.4085,
"step": 18625
},
{
"epoch": 8.369272237196766,
"grad_norm": 3.96875,
"learning_rate": 5.082767495282917e-05,
"loss": 4.4501,
"step": 18630
},
{
"epoch": 8.37151841868823,
"grad_norm": 3.703125,
"learning_rate": 5.077185573017186e-05,
"loss": 4.5868,
"step": 18635
},
{
"epoch": 8.373764600179694,
"grad_norm": 4.125,
"learning_rate": 5.071610517228491e-05,
"loss": 4.4669,
"step": 18640
},
{
"epoch": 8.376010781671159,
"grad_norm": 3.9375,
"learning_rate": 5.066042331268099e-05,
"loss": 4.5506,
"step": 18645
},
{
"epoch": 8.378256963162624,
"grad_norm": 4.0625,
"learning_rate": 5.060481018483157e-05,
"loss": 4.4666,
"step": 18650
},
{
"epoch": 8.380503144654089,
"grad_norm": 3.875,
"learning_rate": 5.054926582216683e-05,
"loss": 4.513,
"step": 18655
},
{
"epoch": 8.382749326145552,
"grad_norm": 3.921875,
"learning_rate": 5.049379025807553e-05,
"loss": 4.4393,
"step": 18660
},
{
"epoch": 8.384995507637017,
"grad_norm": 4.0625,
"learning_rate": 5.043838352590515e-05,
"loss": 4.4618,
"step": 18665
},
{
"epoch": 8.387241689128482,
"grad_norm": 4.15625,
"learning_rate": 5.0383045658961694e-05,
"loss": 4.5318,
"step": 18670
},
{
"epoch": 8.389487870619947,
"grad_norm": 4.0625,
"learning_rate": 5.032777669050993e-05,
"loss": 4.4933,
"step": 18675
},
{
"epoch": 8.39173405211141,
"grad_norm": 4.09375,
"learning_rate": 5.0272576653773034e-05,
"loss": 4.4422,
"step": 18680
},
{
"epoch": 8.393980233602875,
"grad_norm": 4.0625,
"learning_rate": 5.021744558193286e-05,
"loss": 4.4074,
"step": 18685
},
{
"epoch": 8.39622641509434,
"grad_norm": 4.4375,
"learning_rate": 5.0162383508129806e-05,
"loss": 4.4848,
"step": 18690
},
{
"epoch": 8.398472596585805,
"grad_norm": 3.953125,
"learning_rate": 5.01073904654627e-05,
"loss": 4.4374,
"step": 18695
},
{
"epoch": 8.400718778077268,
"grad_norm": 3.9375,
"learning_rate": 5.005246648698898e-05,
"loss": 4.4559,
"step": 18700
},
{
"epoch": 8.402964959568733,
"grad_norm": 4.15625,
"learning_rate": 4.9997611605724496e-05,
"loss": 4.4243,
"step": 18705
},
{
"epoch": 8.405211141060198,
"grad_norm": 4.1875,
"learning_rate": 4.994282585464359e-05,
"loss": 4.4827,
"step": 18710
},
{
"epoch": 8.407457322551663,
"grad_norm": 3.9375,
"learning_rate": 4.9888109266679086e-05,
"loss": 4.5147,
"step": 18715
},
{
"epoch": 8.409703504043126,
"grad_norm": 3.953125,
"learning_rate": 4.9833461874722125e-05,
"loss": 4.4535,
"step": 18720
},
{
"epoch": 8.41194968553459,
"grad_norm": 3.6875,
"learning_rate": 4.977888371162237e-05,
"loss": 4.4727,
"step": 18725
},
{
"epoch": 8.414195867026056,
"grad_norm": 3.875,
"learning_rate": 4.972437481018783e-05,
"loss": 4.487,
"step": 18730
},
{
"epoch": 8.41644204851752,
"grad_norm": 4.09375,
"learning_rate": 4.966993520318484e-05,
"loss": 4.4624,
"step": 18735
},
{
"epoch": 8.418688230008986,
"grad_norm": 4.0625,
"learning_rate": 4.961556492333816e-05,
"loss": 4.4402,
"step": 18740
},
{
"epoch": 8.420934411500449,
"grad_norm": 4.09375,
"learning_rate": 4.956126400333076e-05,
"loss": 4.5195,
"step": 18745
},
{
"epoch": 8.423180592991914,
"grad_norm": 4.0,
"learning_rate": 4.950703247580404e-05,
"loss": 4.514,
"step": 18750
},
{
"epoch": 8.425426774483379,
"grad_norm": 3.96875,
"learning_rate": 4.945287037335759e-05,
"loss": 4.4338,
"step": 18755
},
{
"epoch": 8.427672955974844,
"grad_norm": 3.9375,
"learning_rate": 4.939877772854933e-05,
"loss": 4.4666,
"step": 18760
},
{
"epoch": 8.429919137466307,
"grad_norm": 3.890625,
"learning_rate": 4.934475457389543e-05,
"loss": 4.4787,
"step": 18765
},
{
"epoch": 8.432165318957772,
"grad_norm": 3.84375,
"learning_rate": 4.9290800941870225e-05,
"loss": 4.5359,
"step": 18770
},
{
"epoch": 8.434411500449237,
"grad_norm": 3.96875,
"learning_rate": 4.923691686490631e-05,
"loss": 4.456,
"step": 18775
},
{
"epoch": 8.436657681940702,
"grad_norm": 3.890625,
"learning_rate": 4.918310237539447e-05,
"loss": 4.4697,
"step": 18780
},
{
"epoch": 8.438903863432165,
"grad_norm": 3.859375,
"learning_rate": 4.912935750568365e-05,
"loss": 4.5175,
"step": 18785
},
{
"epoch": 8.44115004492363,
"grad_norm": 3.953125,
"learning_rate": 4.907568228808087e-05,
"loss": 4.5084,
"step": 18790
},
{
"epoch": 8.443396226415095,
"grad_norm": 3.953125,
"learning_rate": 4.9022076754851436e-05,
"loss": 4.4899,
"step": 18795
},
{
"epoch": 8.44564240790656,
"grad_norm": 4.0,
"learning_rate": 4.896854093821869e-05,
"loss": 4.5063,
"step": 18800
},
{
"epoch": 8.447888589398023,
"grad_norm": 3.96875,
"learning_rate": 4.891507487036399e-05,
"loss": 4.4922,
"step": 18805
},
{
"epoch": 8.450134770889488,
"grad_norm": 3.984375,
"learning_rate": 4.88616785834269e-05,
"loss": 4.514,
"step": 18810
},
{
"epoch": 8.452380952380953,
"grad_norm": 3.953125,
"learning_rate": 4.880835210950491e-05,
"loss": 4.4305,
"step": 18815
},
{
"epoch": 8.454627133872417,
"grad_norm": 3.890625,
"learning_rate": 4.875509548065362e-05,
"loss": 4.5128,
"step": 18820
},
{
"epoch": 8.45687331536388,
"grad_norm": 4.125,
"learning_rate": 4.87019087288867e-05,
"loss": 4.448,
"step": 18825
},
{
"epoch": 8.459119496855346,
"grad_norm": 3.96875,
"learning_rate": 4.864879188617565e-05,
"loss": 4.4806,
"step": 18830
},
{
"epoch": 8.46136567834681,
"grad_norm": 3.8125,
"learning_rate": 4.859574498445011e-05,
"loss": 4.4896,
"step": 18835
},
{
"epoch": 8.463611859838275,
"grad_norm": 3.875,
"learning_rate": 4.854276805559757e-05,
"loss": 4.5605,
"step": 18840
},
{
"epoch": 8.465858041329739,
"grad_norm": 3.953125,
"learning_rate": 4.848986113146352e-05,
"loss": 4.4716,
"step": 18845
},
{
"epoch": 8.468104222821204,
"grad_norm": 4.15625,
"learning_rate": 4.843702424385133e-05,
"loss": 4.4414,
"step": 18850
},
{
"epoch": 8.470350404312669,
"grad_norm": 3.890625,
"learning_rate": 4.838425742452228e-05,
"loss": 4.4549,
"step": 18855
},
{
"epoch": 8.472596585804133,
"grad_norm": 3.953125,
"learning_rate": 4.8331560705195614e-05,
"loss": 4.5185,
"step": 18860
},
{
"epoch": 8.474842767295598,
"grad_norm": 3.9375,
"learning_rate": 4.827893411754824e-05,
"loss": 4.5196,
"step": 18865
},
{
"epoch": 8.477088948787062,
"grad_norm": 3.671875,
"learning_rate": 4.82263776932151e-05,
"loss": 4.484,
"step": 18870
},
{
"epoch": 8.479335130278526,
"grad_norm": 3.828125,
"learning_rate": 4.8173891463788884e-05,
"loss": 4.4651,
"step": 18875
},
{
"epoch": 8.481581311769991,
"grad_norm": 3.828125,
"learning_rate": 4.812147546082006e-05,
"loss": 4.4591,
"step": 18880
},
{
"epoch": 8.483827493261456,
"grad_norm": 4.03125,
"learning_rate": 4.806912971581695e-05,
"loss": 4.45,
"step": 18885
},
{
"epoch": 8.48607367475292,
"grad_norm": 3.859375,
"learning_rate": 4.801685426024555e-05,
"loss": 4.4575,
"step": 18890
},
{
"epoch": 8.488319856244384,
"grad_norm": 3.90625,
"learning_rate": 4.796464912552974e-05,
"loss": 4.5483,
"step": 18895
},
{
"epoch": 8.49056603773585,
"grad_norm": 3.96875,
"learning_rate": 4.791251434305097e-05,
"loss": 4.4674,
"step": 18900
},
{
"epoch": 8.492812219227314,
"grad_norm": 3.9375,
"learning_rate": 4.786044994414851e-05,
"loss": 4.487,
"step": 18905
},
{
"epoch": 8.495058400718777,
"grad_norm": 3.9375,
"learning_rate": 4.780845596011932e-05,
"loss": 4.4694,
"step": 18910
},
{
"epoch": 8.497304582210242,
"grad_norm": 3.8125,
"learning_rate": 4.775653242221791e-05,
"loss": 4.4402,
"step": 18915
},
{
"epoch": 8.499550763701707,
"grad_norm": 3.796875,
"learning_rate": 4.770467936165665e-05,
"loss": 4.5017,
"step": 18920
},
{
"epoch": 8.501796945193172,
"grad_norm": 3.9375,
"learning_rate": 4.765289680960533e-05,
"loss": 4.4096,
"step": 18925
},
{
"epoch": 8.504043126684635,
"grad_norm": 4.0625,
"learning_rate": 4.7601184797191506e-05,
"loss": 4.4218,
"step": 18930
},
{
"epoch": 8.5062893081761,
"grad_norm": 3.671875,
"learning_rate": 4.754954335550026e-05,
"loss": 4.4643,
"step": 18935
},
{
"epoch": 8.508535489667565,
"grad_norm": 3.765625,
"learning_rate": 4.749797251557426e-05,
"loss": 4.4153,
"step": 18940
},
{
"epoch": 8.51078167115903,
"grad_norm": 3.9375,
"learning_rate": 4.744647230841379e-05,
"loss": 4.472,
"step": 18945
},
{
"epoch": 8.513027852650493,
"grad_norm": 3.953125,
"learning_rate": 4.739504276497658e-05,
"loss": 4.446,
"step": 18950
},
{
"epoch": 8.515274034141958,
"grad_norm": 3.953125,
"learning_rate": 4.7343683916177994e-05,
"loss": 4.4649,
"step": 18955
},
{
"epoch": 8.517520215633423,
"grad_norm": 3.890625,
"learning_rate": 4.7292395792890765e-05,
"loss": 4.5418,
"step": 18960
},
{
"epoch": 8.519766397124888,
"grad_norm": 3.828125,
"learning_rate": 4.7241178425945247e-05,
"loss": 4.4404,
"step": 18965
},
{
"epoch": 8.522012578616351,
"grad_norm": 4.15625,
"learning_rate": 4.719003184612919e-05,
"loss": 4.4466,
"step": 18970
},
{
"epoch": 8.524258760107816,
"grad_norm": 4.15625,
"learning_rate": 4.713895608418777e-05,
"loss": 4.476,
"step": 18975
},
{
"epoch": 8.526504941599281,
"grad_norm": 4.125,
"learning_rate": 4.7087951170823675e-05,
"loss": 4.5428,
"step": 18980
},
{
"epoch": 8.528751123090746,
"grad_norm": 4.375,
"learning_rate": 4.7037017136696905e-05,
"loss": 4.443,
"step": 18985
},
{
"epoch": 8.530997304582211,
"grad_norm": 4.03125,
"learning_rate": 4.698615401242495e-05,
"loss": 4.4697,
"step": 18990
},
{
"epoch": 8.533243486073674,
"grad_norm": 3.90625,
"learning_rate": 4.693536182858256e-05,
"loss": 4.4363,
"step": 18995
},
{
"epoch": 8.53548966756514,
"grad_norm": 3.90625,
"learning_rate": 4.688464061570198e-05,
"loss": 4.5175,
"step": 19000
},
{
"epoch": 8.53548966756514,
"eval_loss": 4.779257297515869,
"eval_runtime": 16.0653,
"eval_samples_per_second": 1930.435,
"eval_steps_per_second": 241.328,
"step": 19000
},
{
"epoch": 8.537735849056604,
"grad_norm": 4.03125,
"learning_rate": 4.6833990404272724e-05,
"loss": 4.4766,
"step": 19005
},
{
"epoch": 8.539982030548067,
"grad_norm": 3.90625,
"learning_rate": 4.678341122474156e-05,
"loss": 4.4872,
"step": 19010
},
{
"epoch": 8.542228212039532,
"grad_norm": 4.34375,
"learning_rate": 4.673290310751268e-05,
"loss": 4.5014,
"step": 19015
},
{
"epoch": 8.544474393530997,
"grad_norm": 4.125,
"learning_rate": 4.668246608294749e-05,
"loss": 4.5077,
"step": 19020
},
{
"epoch": 8.546720575022462,
"grad_norm": 3.890625,
"learning_rate": 4.663210018136464e-05,
"loss": 4.5305,
"step": 19025
},
{
"epoch": 8.548966756513927,
"grad_norm": 3.734375,
"learning_rate": 4.658180543304009e-05,
"loss": 4.4317,
"step": 19030
},
{
"epoch": 8.55121293800539,
"grad_norm": 4.125,
"learning_rate": 4.653158186820696e-05,
"loss": 4.5103,
"step": 19035
},
{
"epoch": 8.553459119496855,
"grad_norm": 4.09375,
"learning_rate": 4.6481429517055675e-05,
"loss": 4.5253,
"step": 19040
},
{
"epoch": 8.55570530098832,
"grad_norm": 3.984375,
"learning_rate": 4.643134840973374e-05,
"loss": 4.4771,
"step": 19045
},
{
"epoch": 8.557951482479785,
"grad_norm": 3.96875,
"learning_rate": 4.638133857634589e-05,
"loss": 4.4538,
"step": 19050
},
{
"epoch": 8.560197663971248,
"grad_norm": 3.953125,
"learning_rate": 4.633140004695407e-05,
"loss": 4.4769,
"step": 19055
},
{
"epoch": 8.562443845462713,
"grad_norm": 3.875,
"learning_rate": 4.628153285157725e-05,
"loss": 4.4907,
"step": 19060
},
{
"epoch": 8.564690026954178,
"grad_norm": 3.796875,
"learning_rate": 4.623173702019159e-05,
"loss": 4.5098,
"step": 19065
},
{
"epoch": 8.566936208445643,
"grad_norm": 4.0,
"learning_rate": 4.618201258273034e-05,
"loss": 4.4628,
"step": 19070
},
{
"epoch": 8.569182389937106,
"grad_norm": 3.96875,
"learning_rate": 4.6132359569083816e-05,
"loss": 4.5161,
"step": 19075
},
{
"epoch": 8.571428571428571,
"grad_norm": 4.03125,
"learning_rate": 4.608277800909946e-05,
"loss": 4.4383,
"step": 19080
},
{
"epoch": 8.573674752920036,
"grad_norm": 3.953125,
"learning_rate": 4.603326793258167e-05,
"loss": 4.4597,
"step": 19085
},
{
"epoch": 8.575920934411501,
"grad_norm": 3.84375,
"learning_rate": 4.5983829369291956e-05,
"loss": 4.4562,
"step": 19090
},
{
"epoch": 8.578167115902964,
"grad_norm": 4.03125,
"learning_rate": 4.593446234894877e-05,
"loss": 4.4617,
"step": 19095
},
{
"epoch": 8.58041329739443,
"grad_norm": 4.03125,
"learning_rate": 4.5885166901227626e-05,
"loss": 4.4788,
"step": 19100
},
{
"epoch": 8.582659478885894,
"grad_norm": 3.90625,
"learning_rate": 4.583594305576096e-05,
"loss": 4.4296,
"step": 19105
},
{
"epoch": 8.584905660377359,
"grad_norm": 4.03125,
"learning_rate": 4.578679084213817e-05,
"loss": 4.5075,
"step": 19110
},
{
"epoch": 8.587151841868822,
"grad_norm": 3.53125,
"learning_rate": 4.5737710289905674e-05,
"loss": 4.5024,
"step": 19115
},
{
"epoch": 8.589398023360287,
"grad_norm": 4.375,
"learning_rate": 4.5688701428566685e-05,
"loss": 4.4307,
"step": 19120
},
{
"epoch": 8.591644204851752,
"grad_norm": 4.0625,
"learning_rate": 4.563976428758144e-05,
"loss": 4.4966,
"step": 19125
},
{
"epoch": 8.593890386343217,
"grad_norm": 4.03125,
"learning_rate": 4.5590898896366964e-05,
"loss": 4.4909,
"step": 19130
},
{
"epoch": 8.59613656783468,
"grad_norm": 3.953125,
"learning_rate": 4.5542105284297236e-05,
"loss": 4.4896,
"step": 19135
},
{
"epoch": 8.598382749326145,
"grad_norm": 4.125,
"learning_rate": 4.549338348070303e-05,
"loss": 4.4824,
"step": 19140
},
{
"epoch": 8.60062893081761,
"grad_norm": 4.1875,
"learning_rate": 4.544473351487196e-05,
"loss": 4.4483,
"step": 19145
},
{
"epoch": 8.602875112309075,
"grad_norm": 3.75,
"learning_rate": 4.5396155416048524e-05,
"loss": 4.4626,
"step": 19150
},
{
"epoch": 8.60512129380054,
"grad_norm": 3.984375,
"learning_rate": 4.5347649213433905e-05,
"loss": 4.4716,
"step": 19155
},
{
"epoch": 8.607367475292003,
"grad_norm": 3.890625,
"learning_rate": 4.529921493618618e-05,
"loss": 4.4124,
"step": 19160
},
{
"epoch": 8.609613656783468,
"grad_norm": 3.765625,
"learning_rate": 4.5250852613420094e-05,
"loss": 4.4807,
"step": 19165
},
{
"epoch": 8.611859838274933,
"grad_norm": 3.8125,
"learning_rate": 4.520256227420722e-05,
"loss": 4.4539,
"step": 19170
},
{
"epoch": 8.614106019766398,
"grad_norm": 3.984375,
"learning_rate": 4.515434394757586e-05,
"loss": 4.5185,
"step": 19175
},
{
"epoch": 8.616352201257861,
"grad_norm": 3.890625,
"learning_rate": 4.510619766251088e-05,
"loss": 4.5331,
"step": 19180
},
{
"epoch": 8.618598382749326,
"grad_norm": 4.03125,
"learning_rate": 4.505812344795407e-05,
"loss": 4.3877,
"step": 19185
},
{
"epoch": 8.620844564240791,
"grad_norm": 3.78125,
"learning_rate": 4.501012133280368e-05,
"loss": 4.4476,
"step": 19190
},
{
"epoch": 8.623090745732256,
"grad_norm": 4.0,
"learning_rate": 4.496219134591478e-05,
"loss": 4.4507,
"step": 19195
},
{
"epoch": 8.625336927223719,
"grad_norm": 3.796875,
"learning_rate": 4.4914333516099047e-05,
"loss": 4.4937,
"step": 19200
},
{
"epoch": 8.627583108715184,
"grad_norm": 3.890625,
"learning_rate": 4.4866547872124675e-05,
"loss": 4.4312,
"step": 19205
},
{
"epoch": 8.629829290206649,
"grad_norm": 3.9375,
"learning_rate": 4.481883444271663e-05,
"loss": 4.485,
"step": 19210
},
{
"epoch": 8.632075471698114,
"grad_norm": 4.28125,
"learning_rate": 4.477119325655633e-05,
"loss": 4.5384,
"step": 19215
},
{
"epoch": 8.634321653189577,
"grad_norm": 3.984375,
"learning_rate": 4.4723624342281845e-05,
"loss": 4.4882,
"step": 19220
},
{
"epoch": 8.636567834681042,
"grad_norm": 4.0625,
"learning_rate": 4.46761277284878e-05,
"loss": 4.4326,
"step": 19225
},
{
"epoch": 8.638814016172507,
"grad_norm": 3.953125,
"learning_rate": 4.46287034437253e-05,
"loss": 4.4201,
"step": 19230
},
{
"epoch": 8.641060197663972,
"grad_norm": 4.09375,
"learning_rate": 4.458135151650204e-05,
"loss": 4.4966,
"step": 19235
},
{
"epoch": 8.643306379155435,
"grad_norm": 3.9375,
"learning_rate": 4.4534071975282164e-05,
"loss": 4.4862,
"step": 19240
},
{
"epoch": 8.6455525606469,
"grad_norm": 3.984375,
"learning_rate": 4.448686484848638e-05,
"loss": 4.4492,
"step": 19245
},
{
"epoch": 8.647798742138365,
"grad_norm": 3.90625,
"learning_rate": 4.443973016449173e-05,
"loss": 4.4572,
"step": 19250
},
{
"epoch": 8.65004492362983,
"grad_norm": 3.75,
"learning_rate": 4.4392667951631835e-05,
"loss": 4.5163,
"step": 19255
},
{
"epoch": 8.652291105121293,
"grad_norm": 4.03125,
"learning_rate": 4.434567823819675e-05,
"loss": 4.4779,
"step": 19260
},
{
"epoch": 8.654537286612758,
"grad_norm": 3.953125,
"learning_rate": 4.429876105243285e-05,
"loss": 4.4685,
"step": 19265
},
{
"epoch": 8.656783468104223,
"grad_norm": 3.8125,
"learning_rate": 4.4251916422543015e-05,
"loss": 4.4743,
"step": 19270
},
{
"epoch": 8.659029649595688,
"grad_norm": 4.0625,
"learning_rate": 4.420514437668643e-05,
"loss": 4.4898,
"step": 19275
},
{
"epoch": 8.661275831087153,
"grad_norm": 4.0625,
"learning_rate": 4.415844494297874e-05,
"loss": 4.4705,
"step": 19280
},
{
"epoch": 8.663522012578616,
"grad_norm": 3.828125,
"learning_rate": 4.411181814949184e-05,
"loss": 4.4875,
"step": 19285
},
{
"epoch": 8.66576819407008,
"grad_norm": 3.796875,
"learning_rate": 4.406526402425399e-05,
"loss": 4.4229,
"step": 19290
},
{
"epoch": 8.668014375561546,
"grad_norm": 3.828125,
"learning_rate": 4.4018782595249866e-05,
"loss": 4.442,
"step": 19295
},
{
"epoch": 8.67026055705301,
"grad_norm": 4.1875,
"learning_rate": 4.397237389042028e-05,
"loss": 4.4163,
"step": 19300
},
{
"epoch": 8.672506738544474,
"grad_norm": 4.03125,
"learning_rate": 4.392603793766247e-05,
"loss": 4.4643,
"step": 19305
},
{
"epoch": 8.674752920035939,
"grad_norm": 4.0625,
"learning_rate": 4.387977476482983e-05,
"loss": 4.4177,
"step": 19310
},
{
"epoch": 8.676999101527404,
"grad_norm": 4.0,
"learning_rate": 4.383358439973209e-05,
"loss": 4.4912,
"step": 19315
},
{
"epoch": 8.679245283018869,
"grad_norm": 4.0625,
"learning_rate": 4.37874668701352e-05,
"loss": 4.454,
"step": 19320
},
{
"epoch": 8.681491464510332,
"grad_norm": 3.9375,
"learning_rate": 4.374142220376125e-05,
"loss": 4.4142,
"step": 19325
},
{
"epoch": 8.683737646001797,
"grad_norm": 4.03125,
"learning_rate": 4.369545042828868e-05,
"loss": 4.5224,
"step": 19330
},
{
"epoch": 8.685983827493262,
"grad_norm": 4.09375,
"learning_rate": 4.364955157135195e-05,
"loss": 4.465,
"step": 19335
},
{
"epoch": 8.688230008984727,
"grad_norm": 3.8125,
"learning_rate": 4.3603725660541736e-05,
"loss": 4.4517,
"step": 19340
},
{
"epoch": 8.69047619047619,
"grad_norm": 3.90625,
"learning_rate": 4.355797272340497e-05,
"loss": 4.5375,
"step": 19345
},
{
"epoch": 8.692722371967655,
"grad_norm": 3.84375,
"learning_rate": 4.3512292787444564e-05,
"loss": 4.4509,
"step": 19350
},
{
"epoch": 8.69496855345912,
"grad_norm": 4.09375,
"learning_rate": 4.346668588011968e-05,
"loss": 4.4416,
"step": 19355
},
{
"epoch": 8.697214734950585,
"grad_norm": 4.0,
"learning_rate": 4.342115202884548e-05,
"loss": 4.4413,
"step": 19360
},
{
"epoch": 8.699460916442048,
"grad_norm": 3.78125,
"learning_rate": 4.337569126099326e-05,
"loss": 4.5407,
"step": 19365
},
{
"epoch": 8.701707097933513,
"grad_norm": 3.875,
"learning_rate": 4.3330303603890414e-05,
"loss": 4.4828,
"step": 19370
},
{
"epoch": 8.703953279424978,
"grad_norm": 3.765625,
"learning_rate": 4.328498908482028e-05,
"loss": 4.4302,
"step": 19375
},
{
"epoch": 8.706199460916443,
"grad_norm": 3.59375,
"learning_rate": 4.323974773102238e-05,
"loss": 4.4499,
"step": 19380
},
{
"epoch": 8.708445642407906,
"grad_norm": 3.859375,
"learning_rate": 4.319457956969211e-05,
"loss": 4.4702,
"step": 19385
},
{
"epoch": 8.71069182389937,
"grad_norm": 3.984375,
"learning_rate": 4.314948462798098e-05,
"loss": 4.5252,
"step": 19390
},
{
"epoch": 8.712938005390836,
"grad_norm": 3.890625,
"learning_rate": 4.310446293299639e-05,
"loss": 4.4829,
"step": 19395
},
{
"epoch": 8.7151841868823,
"grad_norm": 4.0,
"learning_rate": 4.3059514511801805e-05,
"loss": 4.4736,
"step": 19400
},
{
"epoch": 8.717430368373766,
"grad_norm": 3.765625,
"learning_rate": 4.3014639391416595e-05,
"loss": 4.482,
"step": 19405
},
{
"epoch": 8.719676549865229,
"grad_norm": 3.734375,
"learning_rate": 4.296983759881606e-05,
"loss": 4.5219,
"step": 19410
},
{
"epoch": 8.721922731356694,
"grad_norm": 3.59375,
"learning_rate": 4.292510916093144e-05,
"loss": 4.547,
"step": 19415
},
{
"epoch": 8.724168912848159,
"grad_norm": 3.984375,
"learning_rate": 4.288045410464986e-05,
"loss": 4.3926,
"step": 19420
},
{
"epoch": 8.726415094339622,
"grad_norm": 3.84375,
"learning_rate": 4.2835872456814366e-05,
"loss": 4.4436,
"step": 19425
},
{
"epoch": 8.728661275831087,
"grad_norm": 3.765625,
"learning_rate": 4.279136424422385e-05,
"loss": 4.5177,
"step": 19430
},
{
"epoch": 8.730907457322552,
"grad_norm": 3.859375,
"learning_rate": 4.274692949363307e-05,
"loss": 4.5139,
"step": 19435
},
{
"epoch": 8.733153638814017,
"grad_norm": 4.0625,
"learning_rate": 4.270256823175264e-05,
"loss": 4.4525,
"step": 19440
},
{
"epoch": 8.735399820305481,
"grad_norm": 3.796875,
"learning_rate": 4.265828048524892e-05,
"loss": 4.5383,
"step": 19445
},
{
"epoch": 8.737646001796945,
"grad_norm": 4.0,
"learning_rate": 4.261406628074422e-05,
"loss": 4.418,
"step": 19450
},
{
"epoch": 8.73989218328841,
"grad_norm": 3.921875,
"learning_rate": 4.256992564481649e-05,
"loss": 4.434,
"step": 19455
},
{
"epoch": 8.742138364779874,
"grad_norm": 3.96875,
"learning_rate": 4.252585860399959e-05,
"loss": 4.4382,
"step": 19460
},
{
"epoch": 8.74438454627134,
"grad_norm": 3.84375,
"learning_rate": 4.248186518478307e-05,
"loss": 4.4914,
"step": 19465
},
{
"epoch": 8.746630727762803,
"grad_norm": 3.609375,
"learning_rate": 4.2437945413612184e-05,
"loss": 4.4838,
"step": 19470
},
{
"epoch": 8.748876909254268,
"grad_norm": 4.0625,
"learning_rate": 4.239409931688803e-05,
"loss": 4.4763,
"step": 19475
},
{
"epoch": 8.751123090745732,
"grad_norm": 4.0,
"learning_rate": 4.235032692096729e-05,
"loss": 4.4874,
"step": 19480
},
{
"epoch": 8.753369272237197,
"grad_norm": 3.859375,
"learning_rate": 4.230662825216248e-05,
"loss": 4.4227,
"step": 19485
},
{
"epoch": 8.75561545372866,
"grad_norm": 3.75,
"learning_rate": 4.2263003336741655e-05,
"loss": 4.4311,
"step": 19490
},
{
"epoch": 8.757861635220126,
"grad_norm": 4.0,
"learning_rate": 4.2219452200928656e-05,
"loss": 4.4769,
"step": 19495
},
{
"epoch": 8.76010781671159,
"grad_norm": 3.890625,
"learning_rate": 4.21759748709029e-05,
"loss": 4.4972,
"step": 19500
},
{
"epoch": 8.762353998203055,
"grad_norm": 3.90625,
"learning_rate": 4.213257137279943e-05,
"loss": 4.4786,
"step": 19505
},
{
"epoch": 8.764600179694519,
"grad_norm": 3.96875,
"learning_rate": 4.208924173270897e-05,
"loss": 4.4806,
"step": 19510
},
{
"epoch": 8.766846361185983,
"grad_norm": 4.25,
"learning_rate": 4.204598597667785e-05,
"loss": 4.4575,
"step": 19515
},
{
"epoch": 8.769092542677448,
"grad_norm": 4.21875,
"learning_rate": 4.2002804130707865e-05,
"loss": 4.4575,
"step": 19520
},
{
"epoch": 8.771338724168913,
"grad_norm": 3.90625,
"learning_rate": 4.1959696220756545e-05,
"loss": 4.5042,
"step": 19525
},
{
"epoch": 8.773584905660378,
"grad_norm": 4.15625,
"learning_rate": 4.191666227273683e-05,
"loss": 4.4264,
"step": 19530
},
{
"epoch": 8.775831087151841,
"grad_norm": 4.09375,
"learning_rate": 4.187370231251735e-05,
"loss": 4.4629,
"step": 19535
},
{
"epoch": 8.778077268643306,
"grad_norm": 3.65625,
"learning_rate": 4.183081636592208e-05,
"loss": 4.4501,
"step": 19540
},
{
"epoch": 8.780323450134771,
"grad_norm": 4.15625,
"learning_rate": 4.178800445873066e-05,
"loss": 4.4359,
"step": 19545
},
{
"epoch": 8.782569631626234,
"grad_norm": 3.890625,
"learning_rate": 4.174526661667818e-05,
"loss": 4.4594,
"step": 19550
},
{
"epoch": 8.7848158131177,
"grad_norm": 3.765625,
"learning_rate": 4.1702602865455136e-05,
"loss": 4.4939,
"step": 19555
},
{
"epoch": 8.787061994609164,
"grad_norm": 3.984375,
"learning_rate": 4.166001323070761e-05,
"loss": 4.4454,
"step": 19560
},
{
"epoch": 8.78930817610063,
"grad_norm": 4.03125,
"learning_rate": 4.161749773803698e-05,
"loss": 4.5035,
"step": 19565
},
{
"epoch": 8.791554357592094,
"grad_norm": 3.6875,
"learning_rate": 4.15750564130002e-05,
"loss": 4.3936,
"step": 19570
},
{
"epoch": 8.793800539083557,
"grad_norm": 4.03125,
"learning_rate": 4.153268928110961e-05,
"loss": 4.4261,
"step": 19575
},
{
"epoch": 8.796046720575022,
"grad_norm": 3.875,
"learning_rate": 4.149039636783283e-05,
"loss": 4.5206,
"step": 19580
},
{
"epoch": 8.798292902066487,
"grad_norm": 4.15625,
"learning_rate": 4.144817769859303e-05,
"loss": 4.4361,
"step": 19585
},
{
"epoch": 8.800539083557952,
"grad_norm": 4.03125,
"learning_rate": 4.140603329876861e-05,
"loss": 4.4497,
"step": 19590
},
{
"epoch": 8.802785265049415,
"grad_norm": 3.890625,
"learning_rate": 4.1363963193693495e-05,
"loss": 4.4402,
"step": 19595
},
{
"epoch": 8.80503144654088,
"grad_norm": 4.0,
"learning_rate": 4.132196740865674e-05,
"loss": 4.4708,
"step": 19600
},
{
"epoch": 8.807277628032345,
"grad_norm": 3.796875,
"learning_rate": 4.12800459689029e-05,
"loss": 4.4493,
"step": 19605
},
{
"epoch": 8.80952380952381,
"grad_norm": 3.90625,
"learning_rate": 4.123819889963176e-05,
"loss": 4.4969,
"step": 19610
},
{
"epoch": 8.811769991015273,
"grad_norm": 3.78125,
"learning_rate": 4.1196426225998374e-05,
"loss": 4.4436,
"step": 19615
},
{
"epoch": 8.814016172506738,
"grad_norm": 3.953125,
"learning_rate": 4.115472797311318e-05,
"loss": 4.459,
"step": 19620
},
{
"epoch": 8.816262353998203,
"grad_norm": 4.15625,
"learning_rate": 4.1113104166041736e-05,
"loss": 4.4202,
"step": 19625
},
{
"epoch": 8.818508535489668,
"grad_norm": 4.34375,
"learning_rate": 4.107155482980499e-05,
"loss": 4.5383,
"step": 19630
},
{
"epoch": 8.820754716981131,
"grad_norm": 4.03125,
"learning_rate": 4.103007998937901e-05,
"loss": 4.4495,
"step": 19635
},
{
"epoch": 8.823000898472596,
"grad_norm": 4.28125,
"learning_rate": 4.098867966969516e-05,
"loss": 4.4994,
"step": 19640
},
{
"epoch": 8.825247079964061,
"grad_norm": 4.03125,
"learning_rate": 4.094735389564e-05,
"loss": 4.4804,
"step": 19645
},
{
"epoch": 8.827493261455526,
"grad_norm": 3.90625,
"learning_rate": 4.090610269205524e-05,
"loss": 4.5048,
"step": 19650
},
{
"epoch": 8.82973944294699,
"grad_norm": 3.765625,
"learning_rate": 4.086492608373776e-05,
"loss": 4.445,
"step": 19655
},
{
"epoch": 8.831985624438454,
"grad_norm": 4.15625,
"learning_rate": 4.0823824095439674e-05,
"loss": 4.4986,
"step": 19660
},
{
"epoch": 8.83423180592992,
"grad_norm": 3.953125,
"learning_rate": 4.078279675186814e-05,
"loss": 4.5123,
"step": 19665
},
{
"epoch": 8.836477987421384,
"grad_norm": 4.0625,
"learning_rate": 4.074184407768554e-05,
"loss": 4.5034,
"step": 19670
},
{
"epoch": 8.838724168912847,
"grad_norm": 3.78125,
"learning_rate": 4.07009660975093e-05,
"loss": 4.4861,
"step": 19675
},
{
"epoch": 8.840970350404312,
"grad_norm": 3.9375,
"learning_rate": 4.066016283591198e-05,
"loss": 4.4408,
"step": 19680
},
{
"epoch": 8.843216531895777,
"grad_norm": 4.0,
"learning_rate": 4.0619434317421205e-05,
"loss": 4.4868,
"step": 19685
},
{
"epoch": 8.845462713387242,
"grad_norm": 4.0,
"learning_rate": 4.0578780566519715e-05,
"loss": 4.4404,
"step": 19690
},
{
"epoch": 8.847708894878707,
"grad_norm": 3.9375,
"learning_rate": 4.053820160764526e-05,
"loss": 4.4598,
"step": 19695
},
{
"epoch": 8.84995507637017,
"grad_norm": 4.125,
"learning_rate": 4.0497697465190625e-05,
"loss": 4.564,
"step": 19700
},
{
"epoch": 8.852201257861635,
"grad_norm": 4.03125,
"learning_rate": 4.045726816350369e-05,
"loss": 4.4761,
"step": 19705
},
{
"epoch": 8.8544474393531,
"grad_norm": 3.90625,
"learning_rate": 4.0416913726887224e-05,
"loss": 4.4506,
"step": 19710
},
{
"epoch": 8.856693620844565,
"grad_norm": 4.0,
"learning_rate": 4.0376634179599135e-05,
"loss": 4.4752,
"step": 19715
},
{
"epoch": 8.858939802336028,
"grad_norm": 4.125,
"learning_rate": 4.033642954585224e-05,
"loss": 4.4495,
"step": 19720
},
{
"epoch": 8.861185983827493,
"grad_norm": 3.984375,
"learning_rate": 4.029629984981427e-05,
"loss": 4.4673,
"step": 19725
},
{
"epoch": 8.863432165318958,
"grad_norm": 4.0625,
"learning_rate": 4.025624511560806e-05,
"loss": 4.4779,
"step": 19730
},
{
"epoch": 8.865678346810423,
"grad_norm": 3.84375,
"learning_rate": 4.021626536731121e-05,
"loss": 4.4455,
"step": 19735
},
{
"epoch": 8.867924528301886,
"grad_norm": 3.71875,
"learning_rate": 4.0176360628956395e-05,
"loss": 4.4714,
"step": 19740
},
{
"epoch": 8.870170709793351,
"grad_norm": 3.6875,
"learning_rate": 4.0136530924531075e-05,
"loss": 4.5017,
"step": 19745
},
{
"epoch": 8.872416891284816,
"grad_norm": 4.46875,
"learning_rate": 4.009677627797768e-05,
"loss": 4.4362,
"step": 19750
},
{
"epoch": 8.874663072776281,
"grad_norm": 3.921875,
"learning_rate": 4.005709671319355e-05,
"loss": 4.4355,
"step": 19755
},
{
"epoch": 8.876909254267744,
"grad_norm": 4.0,
"learning_rate": 4.00174922540308e-05,
"loss": 4.4632,
"step": 19760
},
{
"epoch": 8.879155435759209,
"grad_norm": 3.90625,
"learning_rate": 3.997796292429645e-05,
"loss": 4.4354,
"step": 19765
},
{
"epoch": 8.881401617250674,
"grad_norm": 4.0,
"learning_rate": 3.993850874775237e-05,
"loss": 4.4726,
"step": 19770
},
{
"epoch": 8.883647798742139,
"grad_norm": 3.515625,
"learning_rate": 3.989912974811521e-05,
"loss": 4.5184,
"step": 19775
},
{
"epoch": 8.885893980233602,
"grad_norm": 4.15625,
"learning_rate": 3.98598259490565e-05,
"loss": 4.4391,
"step": 19780
},
{
"epoch": 8.888140161725067,
"grad_norm": 3.921875,
"learning_rate": 3.982059737420249e-05,
"loss": 4.4652,
"step": 19785
},
{
"epoch": 8.890386343216532,
"grad_norm": 3.8125,
"learning_rate": 3.978144404713424e-05,
"loss": 4.4924,
"step": 19790
},
{
"epoch": 8.892632524707997,
"grad_norm": 4.03125,
"learning_rate": 3.974236599138759e-05,
"loss": 4.4596,
"step": 19795
},
{
"epoch": 8.89487870619946,
"grad_norm": 3.9375,
"learning_rate": 3.970336323045314e-05,
"loss": 4.4982,
"step": 19800
},
{
"epoch": 8.897124887690925,
"grad_norm": 3.609375,
"learning_rate": 3.9664435787776164e-05,
"loss": 4.4997,
"step": 19805
},
{
"epoch": 8.89937106918239,
"grad_norm": 3.765625,
"learning_rate": 3.9625583686756766e-05,
"loss": 4.4742,
"step": 19810
},
{
"epoch": 8.901617250673855,
"grad_norm": 4.0,
"learning_rate": 3.958680695074968e-05,
"loss": 4.4771,
"step": 19815
},
{
"epoch": 8.90386343216532,
"grad_norm": 3.921875,
"learning_rate": 3.954810560306433e-05,
"loss": 4.53,
"step": 19820
},
{
"epoch": 8.906109613656783,
"grad_norm": 3.921875,
"learning_rate": 3.950947966696488e-05,
"loss": 4.3864,
"step": 19825
},
{
"epoch": 8.908355795148248,
"grad_norm": 3.9375,
"learning_rate": 3.947092916567015e-05,
"loss": 4.606,
"step": 19830
},
{
"epoch": 8.910601976639713,
"grad_norm": 3.671875,
"learning_rate": 3.943245412235356e-05,
"loss": 4.4162,
"step": 19835
},
{
"epoch": 8.912848158131178,
"grad_norm": 4.0,
"learning_rate": 3.939405456014328e-05,
"loss": 4.4389,
"step": 19840
},
{
"epoch": 8.915094339622641,
"grad_norm": 4.125,
"learning_rate": 3.935573050212193e-05,
"loss": 4.4811,
"step": 19845
},
{
"epoch": 8.917340521114106,
"grad_norm": 3.90625,
"learning_rate": 3.931748197132697e-05,
"loss": 4.4041,
"step": 19850
},
{
"epoch": 8.91958670260557,
"grad_norm": 4.0625,
"learning_rate": 3.9279308990750244e-05,
"loss": 4.4089,
"step": 19855
},
{
"epoch": 8.921832884097036,
"grad_norm": 3.8125,
"learning_rate": 3.924121158333831e-05,
"loss": 4.4629,
"step": 19860
},
{
"epoch": 8.924079065588499,
"grad_norm": 3.796875,
"learning_rate": 3.92031897719923e-05,
"loss": 4.5195,
"step": 19865
},
{
"epoch": 8.926325247079964,
"grad_norm": 4.03125,
"learning_rate": 3.916524357956781e-05,
"loss": 4.4806,
"step": 19870
},
{
"epoch": 8.928571428571429,
"grad_norm": 3.921875,
"learning_rate": 3.9127373028875096e-05,
"loss": 4.4477,
"step": 19875
},
{
"epoch": 8.930817610062894,
"grad_norm": 3.984375,
"learning_rate": 3.908957814267883e-05,
"loss": 4.5098,
"step": 19880
},
{
"epoch": 8.933063791554357,
"grad_norm": 4.0625,
"learning_rate": 3.90518589436983e-05,
"loss": 4.5037,
"step": 19885
},
{
"epoch": 8.935309973045822,
"grad_norm": 3.65625,
"learning_rate": 3.901421545460721e-05,
"loss": 4.408,
"step": 19890
},
{
"epoch": 8.937556154537287,
"grad_norm": 3.984375,
"learning_rate": 3.8976647698033825e-05,
"loss": 4.4674,
"step": 19895
},
{
"epoch": 8.939802336028752,
"grad_norm": 3.96875,
"learning_rate": 3.8939155696560876e-05,
"loss": 4.4573,
"step": 19900
},
{
"epoch": 8.942048517520215,
"grad_norm": 3.984375,
"learning_rate": 3.8901739472725504e-05,
"loss": 4.3736,
"step": 19905
},
{
"epoch": 8.94429469901168,
"grad_norm": 4.25,
"learning_rate": 3.8864399049019366e-05,
"loss": 4.4087,
"step": 19910
},
{
"epoch": 8.946540880503145,
"grad_norm": 3.828125,
"learning_rate": 3.8827134447888464e-05,
"loss": 4.4721,
"step": 19915
},
{
"epoch": 8.94878706199461,
"grad_norm": 4.03125,
"learning_rate": 3.8789945691733335e-05,
"loss": 4.443,
"step": 19920
},
{
"epoch": 8.951033243486073,
"grad_norm": 3.859375,
"learning_rate": 3.875283280290885e-05,
"loss": 4.4484,
"step": 19925
},
{
"epoch": 8.953279424977538,
"grad_norm": 4.09375,
"learning_rate": 3.871579580372429e-05,
"loss": 4.4923,
"step": 19930
},
{
"epoch": 8.955525606469003,
"grad_norm": 3.984375,
"learning_rate": 3.8678834716443316e-05,
"loss": 4.4644,
"step": 19935
},
{
"epoch": 8.957771787960468,
"grad_norm": 3.8125,
"learning_rate": 3.8641949563283965e-05,
"loss": 4.4395,
"step": 19940
},
{
"epoch": 8.960017969451933,
"grad_norm": 4.03125,
"learning_rate": 3.8605140366418616e-05,
"loss": 4.4335,
"step": 19945
},
{
"epoch": 8.962264150943396,
"grad_norm": 3.96875,
"learning_rate": 3.8568407147973994e-05,
"loss": 4.4393,
"step": 19950
},
{
"epoch": 8.96451033243486,
"grad_norm": 3.96875,
"learning_rate": 3.8531749930031154e-05,
"loss": 4.4614,
"step": 19955
},
{
"epoch": 8.966756513926326,
"grad_norm": 3.75,
"learning_rate": 3.84951687346255e-05,
"loss": 4.5097,
"step": 19960
},
{
"epoch": 8.969002695417789,
"grad_norm": 4.125,
"learning_rate": 3.8458663583746685e-05,
"loss": 4.5181,
"step": 19965
},
{
"epoch": 8.971248876909254,
"grad_norm": 4.0,
"learning_rate": 3.8422234499338634e-05,
"loss": 4.4566,
"step": 19970
},
{
"epoch": 8.973495058400719,
"grad_norm": 4.03125,
"learning_rate": 3.838588150329963e-05,
"loss": 4.52,
"step": 19975
},
{
"epoch": 8.975741239892184,
"grad_norm": 4.375,
"learning_rate": 3.834960461748213e-05,
"loss": 4.4874,
"step": 19980
},
{
"epoch": 8.977987421383649,
"grad_norm": 3.796875,
"learning_rate": 3.8313403863692926e-05,
"loss": 4.4312,
"step": 19985
},
{
"epoch": 8.980233602875112,
"grad_norm": 4.0,
"learning_rate": 3.8277279263692926e-05,
"loss": 4.4864,
"step": 19990
},
{
"epoch": 8.982479784366577,
"grad_norm": 4.03125,
"learning_rate": 3.824123083919743e-05,
"loss": 4.4817,
"step": 19995
},
{
"epoch": 8.984725965858042,
"grad_norm": 3.90625,
"learning_rate": 3.820525861187575e-05,
"loss": 4.5165,
"step": 20000
},
{
"epoch": 8.984725965858042,
"eval_loss": 4.77856969833374,
"eval_runtime": 16.1346,
"eval_samples_per_second": 1922.144,
"eval_steps_per_second": 240.291,
"step": 20000
},
{
"epoch": 8.986972147349507,
"grad_norm": 4.15625,
"learning_rate": 3.816936260335156e-05,
"loss": 4.4603,
"step": 20005
},
{
"epoch": 8.98921832884097,
"grad_norm": 4.0,
"learning_rate": 3.8133542835202646e-05,
"loss": 4.3947,
"step": 20010
},
{
"epoch": 8.991464510332435,
"grad_norm": 4.0,
"learning_rate": 3.809779932896095e-05,
"loss": 4.4674,
"step": 20015
},
{
"epoch": 8.9937106918239,
"grad_norm": 3.953125,
"learning_rate": 3.8062132106112625e-05,
"loss": 4.4778,
"step": 20020
},
{
"epoch": 8.995956873315365,
"grad_norm": 4.0625,
"learning_rate": 3.802654118809788e-05,
"loss": 4.5111,
"step": 20025
},
{
"epoch": 8.998203054806828,
"grad_norm": 4.0625,
"learning_rate": 3.7991026596311175e-05,
"loss": 4.5153,
"step": 20030
},
{
"epoch": 9.000449236298293,
"grad_norm": 4.15625,
"learning_rate": 3.795558835210098e-05,
"loss": 4.4467,
"step": 20035
},
{
"epoch": 9.002695417789758,
"grad_norm": 3.703125,
"learning_rate": 3.7920226476769924e-05,
"loss": 4.4839,
"step": 20040
},
{
"epoch": 9.004941599281223,
"grad_norm": 3.9375,
"learning_rate": 3.788494099157474e-05,
"loss": 4.4578,
"step": 20045
},
{
"epoch": 9.007187780772686,
"grad_norm": 4.0625,
"learning_rate": 3.7849731917726205e-05,
"loss": 4.4219,
"step": 20050
},
{
"epoch": 9.00943396226415,
"grad_norm": 3.90625,
"learning_rate": 3.78145992763892e-05,
"loss": 4.4764,
"step": 20055
},
{
"epoch": 9.011680143755616,
"grad_norm": 4.15625,
"learning_rate": 3.777954308868263e-05,
"loss": 4.4311,
"step": 20060
},
{
"epoch": 9.01392632524708,
"grad_norm": 3.71875,
"learning_rate": 3.774456337567944e-05,
"loss": 4.446,
"step": 20065
},
{
"epoch": 9.016172506738544,
"grad_norm": 4.125,
"learning_rate": 3.770966015840665e-05,
"loss": 4.4513,
"step": 20070
},
{
"epoch": 9.018418688230009,
"grad_norm": 4.25,
"learning_rate": 3.767483345784523e-05,
"loss": 4.4918,
"step": 20075
},
{
"epoch": 9.020664869721474,
"grad_norm": 4.0,
"learning_rate": 3.764008329493025e-05,
"loss": 4.4665,
"step": 20080
},
{
"epoch": 9.022911051212938,
"grad_norm": 4.09375,
"learning_rate": 3.760540969055065e-05,
"loss": 4.4682,
"step": 20085
},
{
"epoch": 9.025157232704403,
"grad_norm": 3.90625,
"learning_rate": 3.7570812665549446e-05,
"loss": 4.4813,
"step": 20090
},
{
"epoch": 9.027403414195867,
"grad_norm": 3.828125,
"learning_rate": 3.753629224072356e-05,
"loss": 4.4417,
"step": 20095
},
{
"epoch": 9.029649595687331,
"grad_norm": 4.03125,
"learning_rate": 3.750184843682391e-05,
"loss": 4.4383,
"step": 20100
},
{
"epoch": 9.031895777178796,
"grad_norm": 3.765625,
"learning_rate": 3.746748127455536e-05,
"loss": 4.4592,
"step": 20105
},
{
"epoch": 9.034141958670261,
"grad_norm": 4.09375,
"learning_rate": 3.7433190774576636e-05,
"loss": 4.4933,
"step": 20110
},
{
"epoch": 9.036388140161725,
"grad_norm": 3.875,
"learning_rate": 3.739897695750048e-05,
"loss": 4.4422,
"step": 20115
},
{
"epoch": 9.03863432165319,
"grad_norm": 3.921875,
"learning_rate": 3.7364839843893435e-05,
"loss": 4.4862,
"step": 20120
},
{
"epoch": 9.040880503144654,
"grad_norm": 3.8125,
"learning_rate": 3.733077945427603e-05,
"loss": 4.4359,
"step": 20125
},
{
"epoch": 9.04312668463612,
"grad_norm": 3.59375,
"learning_rate": 3.729679580912262e-05,
"loss": 4.5098,
"step": 20130
},
{
"epoch": 9.045372866127583,
"grad_norm": 3.953125,
"learning_rate": 3.726288892886141e-05,
"loss": 4.4862,
"step": 20135
},
{
"epoch": 9.047619047619047,
"grad_norm": 4.0,
"learning_rate": 3.7229058833874525e-05,
"loss": 4.3972,
"step": 20140
},
{
"epoch": 9.049865229110512,
"grad_norm": 3.703125,
"learning_rate": 3.7195305544497864e-05,
"loss": 4.4993,
"step": 20145
},
{
"epoch": 9.052111410601977,
"grad_norm": 3.890625,
"learning_rate": 3.71616290810212e-05,
"loss": 4.4783,
"step": 20150
},
{
"epoch": 9.05435759209344,
"grad_norm": 3.984375,
"learning_rate": 3.712802946368816e-05,
"loss": 4.4746,
"step": 20155
},
{
"epoch": 9.056603773584905,
"grad_norm": 3.921875,
"learning_rate": 3.709450671269606e-05,
"loss": 4.5272,
"step": 20160
},
{
"epoch": 9.05884995507637,
"grad_norm": 4.0625,
"learning_rate": 3.706106084819612e-05,
"loss": 4.4499,
"step": 20165
},
{
"epoch": 9.061096136567835,
"grad_norm": 3.96875,
"learning_rate": 3.7027691890293285e-05,
"loss": 4.4562,
"step": 20170
},
{
"epoch": 9.063342318059298,
"grad_norm": 4.0625,
"learning_rate": 3.6994399859046304e-05,
"loss": 4.5038,
"step": 20175
},
{
"epoch": 9.065588499550763,
"grad_norm": 3.96875,
"learning_rate": 3.6961184774467666e-05,
"loss": 4.4168,
"step": 20180
},
{
"epoch": 9.067834681042228,
"grad_norm": 3.90625,
"learning_rate": 3.69280466565236e-05,
"loss": 4.4731,
"step": 20185
},
{
"epoch": 9.070080862533693,
"grad_norm": 4.0625,
"learning_rate": 3.689498552513408e-05,
"loss": 4.4363,
"step": 20190
},
{
"epoch": 9.072327044025156,
"grad_norm": 4.15625,
"learning_rate": 3.68620014001728e-05,
"loss": 4.4345,
"step": 20195
},
{
"epoch": 9.074573225516621,
"grad_norm": 3.984375,
"learning_rate": 3.682909430146715e-05,
"loss": 4.4821,
"step": 20200
},
{
"epoch": 9.076819407008086,
"grad_norm": 4.125,
"learning_rate": 3.679626424879825e-05,
"loss": 4.403,
"step": 20205
},
{
"epoch": 9.079065588499551,
"grad_norm": 3.921875,
"learning_rate": 3.676351126190086e-05,
"loss": 4.4852,
"step": 20210
},
{
"epoch": 9.081311769991014,
"grad_norm": 4.15625,
"learning_rate": 3.6730835360463476e-05,
"loss": 4.4251,
"step": 20215
},
{
"epoch": 9.08355795148248,
"grad_norm": 4.0625,
"learning_rate": 3.6698236564128184e-05,
"loss": 4.4048,
"step": 20220
},
{
"epoch": 9.085804132973944,
"grad_norm": 4.03125,
"learning_rate": 3.666571489249081e-05,
"loss": 4.4439,
"step": 20225
},
{
"epoch": 9.08805031446541,
"grad_norm": 3.796875,
"learning_rate": 3.6633270365100696e-05,
"loss": 4.4273,
"step": 20230
},
{
"epoch": 9.090296495956874,
"grad_norm": 3.984375,
"learning_rate": 3.6600903001460934e-05,
"loss": 4.4236,
"step": 20235
},
{
"epoch": 9.092542677448337,
"grad_norm": 3.90625,
"learning_rate": 3.656861282102816e-05,
"loss": 4.4979,
"step": 20240
},
{
"epoch": 9.094788858939802,
"grad_norm": 4.03125,
"learning_rate": 3.653639984321262e-05,
"loss": 4.4669,
"step": 20245
},
{
"epoch": 9.097035040431267,
"grad_norm": 3.84375,
"learning_rate": 3.65042640873782e-05,
"loss": 4.3567,
"step": 20250
},
{
"epoch": 9.099281221922732,
"grad_norm": 4.1875,
"learning_rate": 3.6472205572842304e-05,
"loss": 4.5483,
"step": 20255
},
{
"epoch": 9.101527403414195,
"grad_norm": 3.796875,
"learning_rate": 3.6440224318875944e-05,
"loss": 4.3891,
"step": 20260
},
{
"epoch": 9.10377358490566,
"grad_norm": 4.21875,
"learning_rate": 3.640832034470366e-05,
"loss": 4.4508,
"step": 20265
},
{
"epoch": 9.106019766397125,
"grad_norm": 4.0625,
"learning_rate": 3.637649366950357e-05,
"loss": 4.5023,
"step": 20270
},
{
"epoch": 9.10826594788859,
"grad_norm": 3.90625,
"learning_rate": 3.6344744312407325e-05,
"loss": 4.4781,
"step": 20275
},
{
"epoch": 9.110512129380053,
"grad_norm": 4.15625,
"learning_rate": 3.631307229250003e-05,
"loss": 4.4442,
"step": 20280
},
{
"epoch": 9.112758310871518,
"grad_norm": 3.828125,
"learning_rate": 3.628147762882046e-05,
"loss": 4.4639,
"step": 20285
},
{
"epoch": 9.115004492362983,
"grad_norm": 4.125,
"learning_rate": 3.624996034036065e-05,
"loss": 4.5477,
"step": 20290
},
{
"epoch": 9.117250673854448,
"grad_norm": 4.0,
"learning_rate": 3.6218520446066334e-05,
"loss": 4.4283,
"step": 20295
},
{
"epoch": 9.119496855345911,
"grad_norm": 3.96875,
"learning_rate": 3.6187157964836664e-05,
"loss": 4.4165,
"step": 20300
},
{
"epoch": 9.121743036837376,
"grad_norm": 3.859375,
"learning_rate": 3.6155872915524195e-05,
"loss": 4.4519,
"step": 20305
},
{
"epoch": 9.123989218328841,
"grad_norm": 3.890625,
"learning_rate": 3.6124665316935e-05,
"loss": 4.4878,
"step": 20310
},
{
"epoch": 9.126235399820306,
"grad_norm": 4.125,
"learning_rate": 3.609353518782858e-05,
"loss": 4.4873,
"step": 20315
},
{
"epoch": 9.12848158131177,
"grad_norm": 4.09375,
"learning_rate": 3.6062482546917844e-05,
"loss": 4.4115,
"step": 20320
},
{
"epoch": 9.130727762803234,
"grad_norm": 3.765625,
"learning_rate": 3.603150741286919e-05,
"loss": 4.5559,
"step": 20325
},
{
"epoch": 9.132973944294699,
"grad_norm": 3.78125,
"learning_rate": 3.600060980430231e-05,
"loss": 4.4489,
"step": 20330
},
{
"epoch": 9.135220125786164,
"grad_norm": 3.875,
"learning_rate": 3.5969789739790416e-05,
"loss": 4.4855,
"step": 20335
},
{
"epoch": 9.137466307277627,
"grad_norm": 3.90625,
"learning_rate": 3.593904723786002e-05,
"loss": 4.4637,
"step": 20340
},
{
"epoch": 9.139712488769092,
"grad_norm": 3.953125,
"learning_rate": 3.5908382316991077e-05,
"loss": 4.418,
"step": 20345
},
{
"epoch": 9.141958670260557,
"grad_norm": 4.1875,
"learning_rate": 3.5877794995616825e-05,
"loss": 4.3961,
"step": 20350
},
{
"epoch": 9.144204851752022,
"grad_norm": 4.0,
"learning_rate": 3.5847285292123914e-05,
"loss": 4.4948,
"step": 20355
},
{
"epoch": 9.146451033243487,
"grad_norm": 3.90625,
"learning_rate": 3.581685322485234e-05,
"loss": 4.438,
"step": 20360
},
{
"epoch": 9.14869721473495,
"grad_norm": 3.921875,
"learning_rate": 3.5786498812095394e-05,
"loss": 4.4238,
"step": 20365
},
{
"epoch": 9.150943396226415,
"grad_norm": 3.890625,
"learning_rate": 3.5756222072099744e-05,
"loss": 4.4014,
"step": 20370
},
{
"epoch": 9.15318957771788,
"grad_norm": 3.703125,
"learning_rate": 3.572602302306527e-05,
"loss": 4.5125,
"step": 20375
},
{
"epoch": 9.155435759209345,
"grad_norm": 3.796875,
"learning_rate": 3.569590168314526e-05,
"loss": 4.4185,
"step": 20380
},
{
"epoch": 9.157681940700808,
"grad_norm": 4.15625,
"learning_rate": 3.566585807044621e-05,
"loss": 4.4771,
"step": 20385
},
{
"epoch": 9.159928122192273,
"grad_norm": 3.5625,
"learning_rate": 3.563589220302793e-05,
"loss": 4.4659,
"step": 20390
},
{
"epoch": 9.162174303683738,
"grad_norm": 4.125,
"learning_rate": 3.560600409890352e-05,
"loss": 4.4403,
"step": 20395
},
{
"epoch": 9.164420485175203,
"grad_norm": 4.09375,
"learning_rate": 3.5576193776039254e-05,
"loss": 4.5184,
"step": 20400
},
{
"epoch": 9.166666666666666,
"grad_norm": 4.0,
"learning_rate": 3.554646125235475e-05,
"loss": 4.4681,
"step": 20405
},
{
"epoch": 9.168912848158131,
"grad_norm": 3.75,
"learning_rate": 3.551680654572276e-05,
"loss": 4.4061,
"step": 20410
},
{
"epoch": 9.171159029649596,
"grad_norm": 3.953125,
"learning_rate": 3.548722967396934e-05,
"loss": 4.4493,
"step": 20415
},
{
"epoch": 9.173405211141061,
"grad_norm": 4.0,
"learning_rate": 3.5457730654873724e-05,
"loss": 4.4838,
"step": 20420
},
{
"epoch": 9.175651392632524,
"grad_norm": 3.90625,
"learning_rate": 3.5428309506168314e-05,
"loss": 4.4205,
"step": 20425
},
{
"epoch": 9.177897574123989,
"grad_norm": 4.125,
"learning_rate": 3.5398966245538796e-05,
"loss": 4.4896,
"step": 20430
},
{
"epoch": 9.180143755615454,
"grad_norm": 4.15625,
"learning_rate": 3.5369700890623934e-05,
"loss": 4.4758,
"step": 20435
},
{
"epoch": 9.182389937106919,
"grad_norm": 4.0625,
"learning_rate": 3.534051345901573e-05,
"loss": 4.493,
"step": 20440
},
{
"epoch": 9.184636118598382,
"grad_norm": 4.0,
"learning_rate": 3.53114039682593e-05,
"loss": 4.4232,
"step": 20445
},
{
"epoch": 9.186882300089847,
"grad_norm": 3.8125,
"learning_rate": 3.5282372435852935e-05,
"loss": 4.437,
"step": 20450
},
{
"epoch": 9.189128481581312,
"grad_norm": 4.0625,
"learning_rate": 3.5253418879248056e-05,
"loss": 4.5357,
"step": 20455
},
{
"epoch": 9.191374663072777,
"grad_norm": 4.15625,
"learning_rate": 3.52245433158492e-05,
"loss": 4.4931,
"step": 20460
},
{
"epoch": 9.19362084456424,
"grad_norm": 4.0,
"learning_rate": 3.519574576301405e-05,
"loss": 4.4336,
"step": 20465
},
{
"epoch": 9.195867026055705,
"grad_norm": 4.0,
"learning_rate": 3.516702623805339e-05,
"loss": 4.4578,
"step": 20470
},
{
"epoch": 9.19811320754717,
"grad_norm": 3.96875,
"learning_rate": 3.5138384758231055e-05,
"loss": 4.4428,
"step": 20475
},
{
"epoch": 9.200359389038635,
"grad_norm": 4.09375,
"learning_rate": 3.5109821340764016e-05,
"loss": 4.4737,
"step": 20480
},
{
"epoch": 9.202605570530098,
"grad_norm": 3.9375,
"learning_rate": 3.50813360028223e-05,
"loss": 4.5054,
"step": 20485
},
{
"epoch": 9.204851752021563,
"grad_norm": 4.0625,
"learning_rate": 3.5052928761529004e-05,
"loss": 4.4779,
"step": 20490
},
{
"epoch": 9.207097933513028,
"grad_norm": 3.921875,
"learning_rate": 3.502459963396027e-05,
"loss": 4.4074,
"step": 20495
},
{
"epoch": 9.209344115004493,
"grad_norm": 3.734375,
"learning_rate": 3.4996348637145285e-05,
"loss": 4.5052,
"step": 20500
},
{
"epoch": 9.211590296495958,
"grad_norm": 4.46875,
"learning_rate": 3.49681757880663e-05,
"loss": 4.4463,
"step": 20505
},
{
"epoch": 9.213836477987421,
"grad_norm": 3.984375,
"learning_rate": 3.4940081103658546e-05,
"loss": 4.4598,
"step": 20510
},
{
"epoch": 9.216082659478886,
"grad_norm": 4.0,
"learning_rate": 3.491206460081028e-05,
"loss": 4.4375,
"step": 20515
},
{
"epoch": 9.21832884097035,
"grad_norm": 4.0625,
"learning_rate": 3.4884126296362766e-05,
"loss": 4.4494,
"step": 20520
},
{
"epoch": 9.220575022461816,
"grad_norm": 3.90625,
"learning_rate": 3.48562662071103e-05,
"loss": 4.4271,
"step": 20525
},
{
"epoch": 9.222821203953279,
"grad_norm": 4.03125,
"learning_rate": 3.48284843498001e-05,
"loss": 4.4001,
"step": 20530
},
{
"epoch": 9.225067385444744,
"grad_norm": 4.3125,
"learning_rate": 3.4800780741132374e-05,
"loss": 4.4296,
"step": 20535
},
{
"epoch": 9.227313566936209,
"grad_norm": 3.765625,
"learning_rate": 3.477315539776034e-05,
"loss": 4.4655,
"step": 20540
},
{
"epoch": 9.229559748427674,
"grad_norm": 3.890625,
"learning_rate": 3.474560833629008e-05,
"loss": 4.4903,
"step": 20545
},
{
"epoch": 9.231805929919137,
"grad_norm": 4.21875,
"learning_rate": 3.471813957328072e-05,
"loss": 4.4421,
"step": 20550
},
{
"epoch": 9.234052111410602,
"grad_norm": 3.609375,
"learning_rate": 3.4690749125244233e-05,
"loss": 4.4794,
"step": 20555
},
{
"epoch": 9.236298292902067,
"grad_norm": 4.21875,
"learning_rate": 3.466343700864557e-05,
"loss": 4.5095,
"step": 20560
},
{
"epoch": 9.238544474393532,
"grad_norm": 4.0625,
"learning_rate": 3.46362032399026e-05,
"loss": 4.4533,
"step": 20565
},
{
"epoch": 9.240790655884995,
"grad_norm": 3.75,
"learning_rate": 3.460904783538602e-05,
"loss": 4.486,
"step": 20570
},
{
"epoch": 9.24303683737646,
"grad_norm": 3.734375,
"learning_rate": 3.4581970811419535e-05,
"loss": 4.4751,
"step": 20575
},
{
"epoch": 9.245283018867925,
"grad_norm": 4.0,
"learning_rate": 3.4554972184279635e-05,
"loss": 4.472,
"step": 20580
},
{
"epoch": 9.24752920035939,
"grad_norm": 4.125,
"learning_rate": 3.452805197019573e-05,
"loss": 4.4159,
"step": 20585
},
{
"epoch": 9.249775381850853,
"grad_norm": 4.0625,
"learning_rate": 3.450121018535008e-05,
"loss": 4.4319,
"step": 20590
},
{
"epoch": 9.252021563342318,
"grad_norm": 4.0,
"learning_rate": 3.447444684587781e-05,
"loss": 4.4907,
"step": 20595
},
{
"epoch": 9.254267744833783,
"grad_norm": 4.1875,
"learning_rate": 3.4447761967866926e-05,
"loss": 4.4595,
"step": 20600
},
{
"epoch": 9.256513926325248,
"grad_norm": 4.09375,
"learning_rate": 3.442115556735816e-05,
"loss": 4.5408,
"step": 20605
},
{
"epoch": 9.25876010781671,
"grad_norm": 3.734375,
"learning_rate": 3.439462766034518e-05,
"loss": 4.4267,
"step": 20610
},
{
"epoch": 9.261006289308176,
"grad_norm": 3.8125,
"learning_rate": 3.4368178262774435e-05,
"loss": 4.4546,
"step": 20615
},
{
"epoch": 9.26325247079964,
"grad_norm": 3.65625,
"learning_rate": 3.434180739054515e-05,
"loss": 4.4058,
"step": 20620
},
{
"epoch": 9.265498652291106,
"grad_norm": 4.125,
"learning_rate": 3.4315515059509406e-05,
"loss": 4.4089,
"step": 20625
},
{
"epoch": 9.267744833782569,
"grad_norm": 4.09375,
"learning_rate": 3.4289301285471984e-05,
"loss": 4.489,
"step": 20630
},
{
"epoch": 9.269991015274034,
"grad_norm": 4.0,
"learning_rate": 3.4263166084190556e-05,
"loss": 4.4804,
"step": 20635
},
{
"epoch": 9.272237196765499,
"grad_norm": 4.25,
"learning_rate": 3.423710947137547e-05,
"loss": 4.4848,
"step": 20640
},
{
"epoch": 9.274483378256964,
"grad_norm": 4.1875,
"learning_rate": 3.421113146268986e-05,
"loss": 4.4427,
"step": 20645
},
{
"epoch": 9.276729559748428,
"grad_norm": 4.0625,
"learning_rate": 3.418523207374963e-05,
"loss": 4.4495,
"step": 20650
},
{
"epoch": 9.278975741239892,
"grad_norm": 4.03125,
"learning_rate": 3.4159411320123404e-05,
"loss": 4.5049,
"step": 20655
},
{
"epoch": 9.281221922731357,
"grad_norm": 3.859375,
"learning_rate": 3.413366921733255e-05,
"loss": 4.4524,
"step": 20660
},
{
"epoch": 9.283468104222822,
"grad_norm": 3.953125,
"learning_rate": 3.410800578085113e-05,
"loss": 4.4708,
"step": 20665
},
{
"epoch": 9.285714285714286,
"grad_norm": 3.921875,
"learning_rate": 3.408242102610594e-05,
"loss": 4.4779,
"step": 20670
},
{
"epoch": 9.28796046720575,
"grad_norm": 4.1875,
"learning_rate": 3.405691496847651e-05,
"loss": 4.5344,
"step": 20675
},
{
"epoch": 9.290206648697215,
"grad_norm": 3.9375,
"learning_rate": 3.403148762329497e-05,
"loss": 4.4469,
"step": 20680
},
{
"epoch": 9.29245283018868,
"grad_norm": 3.578125,
"learning_rate": 3.4006139005846275e-05,
"loss": 4.4822,
"step": 20685
},
{
"epoch": 9.294699011680144,
"grad_norm": 4.03125,
"learning_rate": 3.398086913136789e-05,
"loss": 4.4309,
"step": 20690
},
{
"epoch": 9.296945193171608,
"grad_norm": 4.09375,
"learning_rate": 3.3955678015050085e-05,
"loss": 4.4726,
"step": 20695
},
{
"epoch": 9.299191374663073,
"grad_norm": 3.921875,
"learning_rate": 3.3930565672035704e-05,
"loss": 4.4887,
"step": 20700
},
{
"epoch": 9.301437556154537,
"grad_norm": 4.03125,
"learning_rate": 3.3905532117420285e-05,
"loss": 4.4062,
"step": 20705
},
{
"epoch": 9.303683737646002,
"grad_norm": 3.953125,
"learning_rate": 3.388057736625198e-05,
"loss": 4.4559,
"step": 20710
},
{
"epoch": 9.305929919137466,
"grad_norm": 4.0,
"learning_rate": 3.3855701433531565e-05,
"loss": 4.4399,
"step": 20715
},
{
"epoch": 9.30817610062893,
"grad_norm": 4.0625,
"learning_rate": 3.383090433421249e-05,
"loss": 4.4522,
"step": 20720
},
{
"epoch": 9.310422282120395,
"grad_norm": 4.15625,
"learning_rate": 3.380618608320073e-05,
"loss": 4.5295,
"step": 20725
},
{
"epoch": 9.31266846361186,
"grad_norm": 4.0,
"learning_rate": 3.378154669535494e-05,
"loss": 4.4936,
"step": 20730
},
{
"epoch": 9.314914645103324,
"grad_norm": 3.828125,
"learning_rate": 3.3756986185486315e-05,
"loss": 4.4374,
"step": 20735
},
{
"epoch": 9.317160826594789,
"grad_norm": 4.09375,
"learning_rate": 3.373250456835867e-05,
"loss": 4.4132,
"step": 20740
},
{
"epoch": 9.319407008086253,
"grad_norm": 3.9375,
"learning_rate": 3.37081018586884e-05,
"loss": 4.4009,
"step": 20745
},
{
"epoch": 9.321653189577718,
"grad_norm": 4.1875,
"learning_rate": 3.368377807114441e-05,
"loss": 4.4279,
"step": 20750
},
{
"epoch": 9.323899371069182,
"grad_norm": 3.921875,
"learning_rate": 3.365953322034823e-05,
"loss": 4.4392,
"step": 20755
},
{
"epoch": 9.326145552560646,
"grad_norm": 3.828125,
"learning_rate": 3.3635367320873925e-05,
"loss": 4.4438,
"step": 20760
},
{
"epoch": 9.328391734052111,
"grad_norm": 4.125,
"learning_rate": 3.361128038724807e-05,
"loss": 4.4955,
"step": 20765
},
{
"epoch": 9.330637915543576,
"grad_norm": 3.796875,
"learning_rate": 3.3587272433949785e-05,
"loss": 4.4454,
"step": 20770
},
{
"epoch": 9.332884097035041,
"grad_norm": 3.953125,
"learning_rate": 3.356334347541074e-05,
"loss": 4.4515,
"step": 20775
},
{
"epoch": 9.335130278526504,
"grad_norm": 4.03125,
"learning_rate": 3.3539493526015084e-05,
"loss": 4.4467,
"step": 20780
},
{
"epoch": 9.33737646001797,
"grad_norm": 4.125,
"learning_rate": 3.351572260009951e-05,
"loss": 4.4908,
"step": 20785
},
{
"epoch": 9.339622641509434,
"grad_norm": 4.09375,
"learning_rate": 3.3492030711953147e-05,
"loss": 4.4666,
"step": 20790
},
{
"epoch": 9.3418688230009,
"grad_norm": 3.734375,
"learning_rate": 3.3468417875817694e-05,
"loss": 4.4878,
"step": 20795
},
{
"epoch": 9.344115004492362,
"grad_norm": 4.03125,
"learning_rate": 3.3444884105887275e-05,
"loss": 4.502,
"step": 20800
},
{
"epoch": 9.346361185983827,
"grad_norm": 4.03125,
"learning_rate": 3.3421429416308485e-05,
"loss": 4.4959,
"step": 20805
},
{
"epoch": 9.348607367475292,
"grad_norm": 3.9375,
"learning_rate": 3.3398053821180397e-05,
"loss": 4.4766,
"step": 20810
},
{
"epoch": 9.350853548966757,
"grad_norm": 4.03125,
"learning_rate": 3.337475733455456e-05,
"loss": 4.4401,
"step": 20815
},
{
"epoch": 9.35309973045822,
"grad_norm": 3.859375,
"learning_rate": 3.335153997043494e-05,
"loss": 4.5102,
"step": 20820
},
{
"epoch": 9.355345911949685,
"grad_norm": 3.703125,
"learning_rate": 3.332840174277793e-05,
"loss": 4.437,
"step": 20825
},
{
"epoch": 9.35759209344115,
"grad_norm": 4.0,
"learning_rate": 3.3305342665492403e-05,
"loss": 4.4891,
"step": 20830
},
{
"epoch": 9.359838274932615,
"grad_norm": 3.96875,
"learning_rate": 3.328236275243958e-05,
"loss": 4.3987,
"step": 20835
},
{
"epoch": 9.362084456424078,
"grad_norm": 4.125,
"learning_rate": 3.325946201743317e-05,
"loss": 4.5082,
"step": 20840
},
{
"epoch": 9.364330637915543,
"grad_norm": 4.78125,
"learning_rate": 3.323664047423924e-05,
"loss": 4.4234,
"step": 20845
},
{
"epoch": 9.366576819407008,
"grad_norm": 3.796875,
"learning_rate": 3.321389813657625e-05,
"loss": 4.5046,
"step": 20850
},
{
"epoch": 9.368823000898473,
"grad_norm": 3.96875,
"learning_rate": 3.319123501811511e-05,
"loss": 4.4412,
"step": 20855
},
{
"epoch": 9.371069182389936,
"grad_norm": 3.859375,
"learning_rate": 3.3168651132479e-05,
"loss": 4.4695,
"step": 20860
},
{
"epoch": 9.373315363881401,
"grad_norm": 4.09375,
"learning_rate": 3.314614649324361e-05,
"loss": 4.4956,
"step": 20865
},
{
"epoch": 9.375561545372866,
"grad_norm": 3.609375,
"learning_rate": 3.312372111393684e-05,
"loss": 4.4805,
"step": 20870
},
{
"epoch": 9.377807726864331,
"grad_norm": 3.9375,
"learning_rate": 3.310137500803907e-05,
"loss": 4.4551,
"step": 20875
},
{
"epoch": 9.380053908355794,
"grad_norm": 4.125,
"learning_rate": 3.3079108188982986e-05,
"loss": 4.5872,
"step": 20880
},
{
"epoch": 9.38230008984726,
"grad_norm": 3.75,
"learning_rate": 3.305692067015358e-05,
"loss": 4.4619,
"step": 20885
},
{
"epoch": 9.384546271338724,
"grad_norm": 4.0625,
"learning_rate": 3.303481246488822e-05,
"loss": 4.4675,
"step": 20890
},
{
"epoch": 9.38679245283019,
"grad_norm": 4.15625,
"learning_rate": 3.301278358647659e-05,
"loss": 4.4347,
"step": 20895
},
{
"epoch": 9.389038634321654,
"grad_norm": 3.953125,
"learning_rate": 3.299083404816066e-05,
"loss": 4.4454,
"step": 20900
},
{
"epoch": 9.391284815813117,
"grad_norm": 4.03125,
"learning_rate": 3.296896386313473e-05,
"loss": 4.4842,
"step": 20905
},
{
"epoch": 9.393530997304582,
"grad_norm": 3.96875,
"learning_rate": 3.294717304454539e-05,
"loss": 4.4931,
"step": 20910
},
{
"epoch": 9.395777178796047,
"grad_norm": 3.875,
"learning_rate": 3.292546160549158e-05,
"loss": 4.46,
"step": 20915
},
{
"epoch": 9.398023360287512,
"grad_norm": 4.0625,
"learning_rate": 3.290382955902438e-05,
"loss": 4.4622,
"step": 20920
},
{
"epoch": 9.400269541778975,
"grad_norm": 3.953125,
"learning_rate": 3.288227691814729e-05,
"loss": 4.432,
"step": 20925
},
{
"epoch": 9.40251572327044,
"grad_norm": 3.9375,
"learning_rate": 3.286080369581602e-05,
"loss": 4.4224,
"step": 20930
},
{
"epoch": 9.404761904761905,
"grad_norm": 4.0625,
"learning_rate": 3.2839409904938546e-05,
"loss": 4.4739,
"step": 20935
},
{
"epoch": 9.40700808625337,
"grad_norm": 4.0,
"learning_rate": 3.281809555837509e-05,
"loss": 4.449,
"step": 20940
},
{
"epoch": 9.409254267744833,
"grad_norm": 3.953125,
"learning_rate": 3.2796860668938105e-05,
"loss": 4.4706,
"step": 20945
},
{
"epoch": 9.411500449236298,
"grad_norm": 4.0625,
"learning_rate": 3.277570524939233e-05,
"loss": 4.4801,
"step": 20950
},
{
"epoch": 9.413746630727763,
"grad_norm": 4.125,
"learning_rate": 3.275462931245467e-05,
"loss": 4.4234,
"step": 20955
},
{
"epoch": 9.415992812219228,
"grad_norm": 4.125,
"learning_rate": 3.273363287079431e-05,
"loss": 4.418,
"step": 20960
},
{
"epoch": 9.418238993710691,
"grad_norm": 4.125,
"learning_rate": 3.271271593703261e-05,
"loss": 4.4828,
"step": 20965
},
{
"epoch": 9.420485175202156,
"grad_norm": 3.78125,
"learning_rate": 3.2691878523743136e-05,
"loss": 4.4232,
"step": 20970
},
{
"epoch": 9.422731356693621,
"grad_norm": 4.15625,
"learning_rate": 3.267112064345168e-05,
"loss": 4.444,
"step": 20975
},
{
"epoch": 9.424977538185086,
"grad_norm": 4.21875,
"learning_rate": 3.265044230863621e-05,
"loss": 4.428,
"step": 20980
},
{
"epoch": 9.42722371967655,
"grad_norm": 3.859375,
"learning_rate": 3.262984353172687e-05,
"loss": 4.5171,
"step": 20985
},
{
"epoch": 9.429469901168014,
"grad_norm": 4.09375,
"learning_rate": 3.2609324325105996e-05,
"loss": 4.4546,
"step": 20990
},
{
"epoch": 9.431716082659479,
"grad_norm": 3.921875,
"learning_rate": 3.258888470110805e-05,
"loss": 4.4345,
"step": 20995
},
{
"epoch": 9.433962264150944,
"grad_norm": 3.796875,
"learning_rate": 3.2568524672019736e-05,
"loss": 4.4649,
"step": 21000
},
{
"epoch": 9.433962264150944,
"eval_loss": 4.779551982879639,
"eval_runtime": 16.1437,
"eval_samples_per_second": 1921.061,
"eval_steps_per_second": 240.156,
"step": 21000
},
{
"epoch": 9.436208445642407,
"grad_norm": 4.21875,
"learning_rate": 3.254824425007984e-05,
"loss": 4.4247,
"step": 21005
},
{
"epoch": 9.438454627133872,
"grad_norm": 3.890625,
"learning_rate": 3.252804344747934e-05,
"loss": 4.467,
"step": 21010
},
{
"epoch": 9.440700808625337,
"grad_norm": 3.9375,
"learning_rate": 3.250792227636132e-05,
"loss": 4.4517,
"step": 21015
},
{
"epoch": 9.442946990116802,
"grad_norm": 3.875,
"learning_rate": 3.2487880748820984e-05,
"loss": 4.483,
"step": 21020
},
{
"epoch": 9.445193171608265,
"grad_norm": 3.859375,
"learning_rate": 3.2467918876905736e-05,
"loss": 4.4234,
"step": 21025
},
{
"epoch": 9.44743935309973,
"grad_norm": 3.65625,
"learning_rate": 3.244803667261501e-05,
"loss": 4.5102,
"step": 21030
},
{
"epoch": 9.449685534591195,
"grad_norm": 3.84375,
"learning_rate": 3.242823414790042e-05,
"loss": 4.3925,
"step": 21035
},
{
"epoch": 9.45193171608266,
"grad_norm": 4.21875,
"learning_rate": 3.24085113146656e-05,
"loss": 4.5181,
"step": 21040
},
{
"epoch": 9.454177897574123,
"grad_norm": 3.984375,
"learning_rate": 3.238886818476639e-05,
"loss": 4.4318,
"step": 21045
},
{
"epoch": 9.456424079065588,
"grad_norm": 3.859375,
"learning_rate": 3.236930477001061e-05,
"loss": 4.4973,
"step": 21050
},
{
"epoch": 9.458670260557053,
"grad_norm": 4.09375,
"learning_rate": 3.2349821082158236e-05,
"loss": 4.4727,
"step": 21055
},
{
"epoch": 9.460916442048518,
"grad_norm": 3.859375,
"learning_rate": 3.23304171329213e-05,
"loss": 4.4771,
"step": 21060
},
{
"epoch": 9.463162623539983,
"grad_norm": 3.84375,
"learning_rate": 3.2311092933963865e-05,
"loss": 4.4805,
"step": 21065
},
{
"epoch": 9.465408805031446,
"grad_norm": 3.921875,
"learning_rate": 3.229184849690212e-05,
"loss": 4.4046,
"step": 21070
},
{
"epoch": 9.467654986522911,
"grad_norm": 4.0,
"learning_rate": 3.227268383330426e-05,
"loss": 4.4095,
"step": 21075
},
{
"epoch": 9.469901168014376,
"grad_norm": 4.125,
"learning_rate": 3.225359895469053e-05,
"loss": 4.5492,
"step": 21080
},
{
"epoch": 9.47214734950584,
"grad_norm": 3.921875,
"learning_rate": 3.2234593872533225e-05,
"loss": 4.3571,
"step": 21085
},
{
"epoch": 9.474393530997304,
"grad_norm": 4.15625,
"learning_rate": 3.221566859825667e-05,
"loss": 4.4533,
"step": 21090
},
{
"epoch": 9.476639712488769,
"grad_norm": 4.15625,
"learning_rate": 3.219682314323724e-05,
"loss": 4.434,
"step": 21095
},
{
"epoch": 9.478885893980234,
"grad_norm": 3.9375,
"learning_rate": 3.2178057518803274e-05,
"loss": 4.4256,
"step": 21100
},
{
"epoch": 9.481132075471699,
"grad_norm": 3.71875,
"learning_rate": 3.215937173623517e-05,
"loss": 4.4841,
"step": 21105
},
{
"epoch": 9.483378256963162,
"grad_norm": 3.9375,
"learning_rate": 3.214076580676533e-05,
"loss": 4.4873,
"step": 21110
},
{
"epoch": 9.485624438454627,
"grad_norm": 4.21875,
"learning_rate": 3.212223974157812e-05,
"loss": 4.4543,
"step": 21115
},
{
"epoch": 9.487870619946092,
"grad_norm": 3.5625,
"learning_rate": 3.210379355180993e-05,
"loss": 4.505,
"step": 21120
},
{
"epoch": 9.490116801437557,
"grad_norm": 4.0625,
"learning_rate": 3.208542724854913e-05,
"loss": 4.4396,
"step": 21125
},
{
"epoch": 9.49236298292902,
"grad_norm": 4.34375,
"learning_rate": 3.206714084283605e-05,
"loss": 4.4859,
"step": 21130
},
{
"epoch": 9.494609164420485,
"grad_norm": 4.03125,
"learning_rate": 3.204893434566302e-05,
"loss": 4.5148,
"step": 21135
},
{
"epoch": 9.49685534591195,
"grad_norm": 3.9375,
"learning_rate": 3.203080776797432e-05,
"loss": 4.5325,
"step": 21140
},
{
"epoch": 9.499101527403415,
"grad_norm": 4.25,
"learning_rate": 3.2012761120666185e-05,
"loss": 4.4119,
"step": 21145
},
{
"epoch": 9.501347708894878,
"grad_norm": 4.03125,
"learning_rate": 3.199479441458679e-05,
"loss": 4.4054,
"step": 21150
},
{
"epoch": 9.503593890386343,
"grad_norm": 3.984375,
"learning_rate": 3.197690766053632e-05,
"loss": 4.4224,
"step": 21155
},
{
"epoch": 9.505840071877808,
"grad_norm": 3.953125,
"learning_rate": 3.1959100869266814e-05,
"loss": 4.4602,
"step": 21160
},
{
"epoch": 9.508086253369273,
"grad_norm": 4.1875,
"learning_rate": 3.194137405148229e-05,
"loss": 4.478,
"step": 21165
},
{
"epoch": 9.510332434860736,
"grad_norm": 3.953125,
"learning_rate": 3.192372721783868e-05,
"loss": 4.4035,
"step": 21170
},
{
"epoch": 9.5125786163522,
"grad_norm": 4.0625,
"learning_rate": 3.1906160378943865e-05,
"loss": 4.4875,
"step": 21175
},
{
"epoch": 9.514824797843666,
"grad_norm": 3.9375,
"learning_rate": 3.188867354535759e-05,
"loss": 4.4576,
"step": 21180
},
{
"epoch": 9.51707097933513,
"grad_norm": 4.25,
"learning_rate": 3.187126672759153e-05,
"loss": 4.4914,
"step": 21185
},
{
"epoch": 9.519317160826596,
"grad_norm": 4.03125,
"learning_rate": 3.185393993610929e-05,
"loss": 4.4658,
"step": 21190
},
{
"epoch": 9.521563342318059,
"grad_norm": 3.921875,
"learning_rate": 3.183669318132632e-05,
"loss": 4.4875,
"step": 21195
},
{
"epoch": 9.523809523809524,
"grad_norm": 3.859375,
"learning_rate": 3.181952647360999e-05,
"loss": 4.4896,
"step": 21200
},
{
"epoch": 9.526055705300989,
"grad_norm": 3.640625,
"learning_rate": 3.1802439823279534e-05,
"loss": 4.4181,
"step": 21205
},
{
"epoch": 9.528301886792454,
"grad_norm": 4.34375,
"learning_rate": 3.1785433240606084e-05,
"loss": 4.5146,
"step": 21210
},
{
"epoch": 9.530548068283917,
"grad_norm": 3.90625,
"learning_rate": 3.176850673581264e-05,
"loss": 4.4964,
"step": 21215
},
{
"epoch": 9.532794249775382,
"grad_norm": 3.828125,
"learning_rate": 3.175166031907402e-05,
"loss": 4.4304,
"step": 21220
},
{
"epoch": 9.535040431266847,
"grad_norm": 4.03125,
"learning_rate": 3.173489400051695e-05,
"loss": 4.4662,
"step": 21225
},
{
"epoch": 9.537286612758312,
"grad_norm": 3.828125,
"learning_rate": 3.1718207790220025e-05,
"loss": 4.4908,
"step": 21230
},
{
"epoch": 9.539532794249775,
"grad_norm": 3.65625,
"learning_rate": 3.1701601698213606e-05,
"loss": 4.4367,
"step": 21235
},
{
"epoch": 9.54177897574124,
"grad_norm": 3.765625,
"learning_rate": 3.168507573447995e-05,
"loss": 4.3619,
"step": 21240
},
{
"epoch": 9.544025157232705,
"grad_norm": 3.9375,
"learning_rate": 3.166862990895315e-05,
"loss": 4.4482,
"step": 21245
},
{
"epoch": 9.54627133872417,
"grad_norm": 3.90625,
"learning_rate": 3.1652264231519106e-05,
"loss": 4.4297,
"step": 21250
},
{
"epoch": 9.548517520215633,
"grad_norm": 3.71875,
"learning_rate": 3.163597871201555e-05,
"loss": 4.4541,
"step": 21255
},
{
"epoch": 9.550763701707098,
"grad_norm": 4.15625,
"learning_rate": 3.161977336023201e-05,
"loss": 4.4299,
"step": 21260
},
{
"epoch": 9.553009883198563,
"grad_norm": 3.90625,
"learning_rate": 3.1603648185909876e-05,
"loss": 4.43,
"step": 21265
},
{
"epoch": 9.555256064690028,
"grad_norm": 3.90625,
"learning_rate": 3.158760319874226e-05,
"loss": 4.5526,
"step": 21270
},
{
"epoch": 9.55750224618149,
"grad_norm": 3.890625,
"learning_rate": 3.1571638408374145e-05,
"loss": 4.4884,
"step": 21275
},
{
"epoch": 9.559748427672956,
"grad_norm": 4.1875,
"learning_rate": 3.155575382440228e-05,
"loss": 4.5047,
"step": 21280
},
{
"epoch": 9.56199460916442,
"grad_norm": 4.03125,
"learning_rate": 3.153994945637519e-05,
"loss": 4.4524,
"step": 21285
},
{
"epoch": 9.564240790655886,
"grad_norm": 4.0,
"learning_rate": 3.1524225313793195e-05,
"loss": 4.3926,
"step": 21290
},
{
"epoch": 9.566486972147349,
"grad_norm": 4.15625,
"learning_rate": 3.1508581406108374e-05,
"loss": 4.5939,
"step": 21295
},
{
"epoch": 9.568733153638814,
"grad_norm": 4.0,
"learning_rate": 3.1493017742724605e-05,
"loss": 4.4458,
"step": 21300
},
{
"epoch": 9.570979335130279,
"grad_norm": 4.15625,
"learning_rate": 3.147753433299748e-05,
"loss": 4.4331,
"step": 21305
},
{
"epoch": 9.573225516621743,
"grad_norm": 3.90625,
"learning_rate": 3.146213118623441e-05,
"loss": 4.4841,
"step": 21310
},
{
"epoch": 9.575471698113208,
"grad_norm": 3.953125,
"learning_rate": 3.144680831169452e-05,
"loss": 4.4666,
"step": 21315
},
{
"epoch": 9.577717879604672,
"grad_norm": 3.96875,
"learning_rate": 3.143156571858868e-05,
"loss": 4.478,
"step": 21320
},
{
"epoch": 9.579964061096137,
"grad_norm": 4.15625,
"learning_rate": 3.1416403416079505e-05,
"loss": 4.4336,
"step": 21325
},
{
"epoch": 9.582210242587601,
"grad_norm": 3.953125,
"learning_rate": 3.140132141328138e-05,
"loss": 4.471,
"step": 21330
},
{
"epoch": 9.584456424079066,
"grad_norm": 3.828125,
"learning_rate": 3.138631971926037e-05,
"loss": 4.4267,
"step": 21335
},
{
"epoch": 9.58670260557053,
"grad_norm": 4.25,
"learning_rate": 3.1371398343034266e-05,
"loss": 4.4679,
"step": 21340
},
{
"epoch": 9.588948787061994,
"grad_norm": 4.03125,
"learning_rate": 3.135655729357265e-05,
"loss": 4.4145,
"step": 21345
},
{
"epoch": 9.59119496855346,
"grad_norm": 4.03125,
"learning_rate": 3.134179657979672e-05,
"loss": 4.4745,
"step": 21350
},
{
"epoch": 9.593441150044924,
"grad_norm": 3.96875,
"learning_rate": 3.1327116210579456e-05,
"loss": 4.4523,
"step": 21355
},
{
"epoch": 9.595687331536388,
"grad_norm": 3.84375,
"learning_rate": 3.131251619474552e-05,
"loss": 4.3996,
"step": 21360
},
{
"epoch": 9.597933513027852,
"grad_norm": 3.921875,
"learning_rate": 3.129799654107124e-05,
"loss": 4.4422,
"step": 21365
},
{
"epoch": 9.600179694519317,
"grad_norm": 4.125,
"learning_rate": 3.128355725828468e-05,
"loss": 4.5367,
"step": 21370
},
{
"epoch": 9.602425876010782,
"grad_norm": 3.828125,
"learning_rate": 3.126919835506558e-05,
"loss": 4.4527,
"step": 21375
},
{
"epoch": 9.604672057502246,
"grad_norm": 4.15625,
"learning_rate": 3.1254919840045356e-05,
"loss": 4.4232,
"step": 21380
},
{
"epoch": 9.60691823899371,
"grad_norm": 3.65625,
"learning_rate": 3.124072172180709e-05,
"loss": 4.464,
"step": 21385
},
{
"epoch": 9.609164420485175,
"grad_norm": 4.25,
"learning_rate": 3.1226604008885566e-05,
"loss": 4.5189,
"step": 21390
},
{
"epoch": 9.61141060197664,
"grad_norm": 4.0,
"learning_rate": 3.121256670976719e-05,
"loss": 4.4456,
"step": 21395
},
{
"epoch": 9.613656783468103,
"grad_norm": 4.03125,
"learning_rate": 3.1198609832890093e-05,
"loss": 4.4784,
"step": 21400
},
{
"epoch": 9.615902964959568,
"grad_norm": 3.921875,
"learning_rate": 3.1184733386643994e-05,
"loss": 4.4983,
"step": 21405
},
{
"epoch": 9.618149146451033,
"grad_norm": 4.0,
"learning_rate": 3.1170937379370314e-05,
"loss": 4.4304,
"step": 21410
},
{
"epoch": 9.620395327942498,
"grad_norm": 4.1875,
"learning_rate": 3.115722181936209e-05,
"loss": 4.4549,
"step": 21415
},
{
"epoch": 9.622641509433961,
"grad_norm": 4.21875,
"learning_rate": 3.114358671486403e-05,
"loss": 4.4759,
"step": 21420
},
{
"epoch": 9.624887690925426,
"grad_norm": 3.890625,
"learning_rate": 3.1130032074072465e-05,
"loss": 4.4785,
"step": 21425
},
{
"epoch": 9.627133872416891,
"grad_norm": 4.09375,
"learning_rate": 3.111655790513532e-05,
"loss": 4.4772,
"step": 21430
},
{
"epoch": 9.629380053908356,
"grad_norm": 3.90625,
"learning_rate": 3.110316421615223e-05,
"loss": 4.532,
"step": 21435
},
{
"epoch": 9.631626235399821,
"grad_norm": 4.0,
"learning_rate": 3.108985101517435e-05,
"loss": 4.4875,
"step": 21440
},
{
"epoch": 9.633872416891284,
"grad_norm": 3.9375,
"learning_rate": 3.107661831020455e-05,
"loss": 4.4913,
"step": 21445
},
{
"epoch": 9.63611859838275,
"grad_norm": 3.75,
"learning_rate": 3.1063466109197236e-05,
"loss": 4.5703,
"step": 21450
},
{
"epoch": 9.638364779874214,
"grad_norm": 3.9375,
"learning_rate": 3.105039442005847e-05,
"loss": 4.4012,
"step": 21455
},
{
"epoch": 9.640610961365677,
"grad_norm": 3.984375,
"learning_rate": 3.1037403250645916e-05,
"loss": 4.404,
"step": 21460
},
{
"epoch": 9.642857142857142,
"grad_norm": 4.0625,
"learning_rate": 3.102449260876879e-05,
"loss": 4.4471,
"step": 21465
},
{
"epoch": 9.645103324348607,
"grad_norm": 3.921875,
"learning_rate": 3.1011662502187955e-05,
"loss": 4.4435,
"step": 21470
},
{
"epoch": 9.647349505840072,
"grad_norm": 3.734375,
"learning_rate": 3.0998912938615795e-05,
"loss": 4.4458,
"step": 21475
},
{
"epoch": 9.649595687331537,
"grad_norm": 3.6875,
"learning_rate": 3.0986243925716375e-05,
"loss": 4.5468,
"step": 21480
},
{
"epoch": 9.651841868823,
"grad_norm": 3.828125,
"learning_rate": 3.097365547110527e-05,
"loss": 4.4227,
"step": 21485
},
{
"epoch": 9.654088050314465,
"grad_norm": 4.03125,
"learning_rate": 3.096114758234962e-05,
"loss": 4.4105,
"step": 21490
},
{
"epoch": 9.65633423180593,
"grad_norm": 3.953125,
"learning_rate": 3.09487202669682e-05,
"loss": 4.4524,
"step": 21495
},
{
"epoch": 9.658580413297395,
"grad_norm": 3.640625,
"learning_rate": 3.0936373532431294e-05,
"loss": 4.4503,
"step": 21500
},
{
"epoch": 9.660826594788858,
"grad_norm": 3.9375,
"learning_rate": 3.092410738616075e-05,
"loss": 4.4434,
"step": 21505
},
{
"epoch": 9.663072776280323,
"grad_norm": 4.09375,
"learning_rate": 3.0911921835530025e-05,
"loss": 4.4463,
"step": 21510
},
{
"epoch": 9.665318957771788,
"grad_norm": 4.03125,
"learning_rate": 3.089981688786405e-05,
"loss": 4.4196,
"step": 21515
},
{
"epoch": 9.667565139263253,
"grad_norm": 3.890625,
"learning_rate": 3.0887792550439384e-05,
"loss": 4.4454,
"step": 21520
},
{
"epoch": 9.669811320754716,
"grad_norm": 3.984375,
"learning_rate": 3.0875848830484056e-05,
"loss": 4.4653,
"step": 21525
},
{
"epoch": 9.672057502246181,
"grad_norm": 4.15625,
"learning_rate": 3.0863985735177713e-05,
"loss": 4.4771,
"step": 21530
},
{
"epoch": 9.674303683737646,
"grad_norm": 4.125,
"learning_rate": 3.0852203271651446e-05,
"loss": 4.5219,
"step": 21535
},
{
"epoch": 9.676549865229111,
"grad_norm": 4.125,
"learning_rate": 3.084050144698795e-05,
"loss": 4.5028,
"step": 21540
},
{
"epoch": 9.678796046720574,
"grad_norm": 4.25,
"learning_rate": 3.0828880268221423e-05,
"loss": 4.5004,
"step": 21545
},
{
"epoch": 9.68104222821204,
"grad_norm": 4.03125,
"learning_rate": 3.081733974233757e-05,
"loss": 4.3945,
"step": 21550
},
{
"epoch": 9.683288409703504,
"grad_norm": 4.125,
"learning_rate": 3.080587987627364e-05,
"loss": 4.4239,
"step": 21555
},
{
"epoch": 9.685534591194969,
"grad_norm": 4.0,
"learning_rate": 3.079450067691836e-05,
"loss": 4.4172,
"step": 21560
},
{
"epoch": 9.687780772686434,
"grad_norm": 3.84375,
"learning_rate": 3.0783202151112005e-05,
"loss": 4.4264,
"step": 21565
},
{
"epoch": 9.690026954177897,
"grad_norm": 4.09375,
"learning_rate": 3.077198430564635e-05,
"loss": 4.4771,
"step": 21570
},
{
"epoch": 9.692273135669362,
"grad_norm": 3.890625,
"learning_rate": 3.0760847147264654e-05,
"loss": 4.4455,
"step": 21575
},
{
"epoch": 9.694519317160827,
"grad_norm": 3.921875,
"learning_rate": 3.074979068266168e-05,
"loss": 4.4437,
"step": 21580
},
{
"epoch": 9.69676549865229,
"grad_norm": 3.890625,
"learning_rate": 3.073881491848366e-05,
"loss": 4.4892,
"step": 21585
},
{
"epoch": 9.699011680143755,
"grad_norm": 4.125,
"learning_rate": 3.072791986132838e-05,
"loss": 4.403,
"step": 21590
},
{
"epoch": 9.70125786163522,
"grad_norm": 4.125,
"learning_rate": 3.071710551774505e-05,
"loss": 4.4125,
"step": 21595
},
{
"epoch": 9.703504043126685,
"grad_norm": 4.4375,
"learning_rate": 3.0706371894234375e-05,
"loss": 4.5058,
"step": 21600
},
{
"epoch": 9.70575022461815,
"grad_norm": 4.0625,
"learning_rate": 3.0695718997248576e-05,
"loss": 4.451,
"step": 21605
},
{
"epoch": 9.707996406109613,
"grad_norm": 3.75,
"learning_rate": 3.068514683319129e-05,
"loss": 4.4327,
"step": 21610
},
{
"epoch": 9.710242587601078,
"grad_norm": 3.984375,
"learning_rate": 3.067465540841767e-05,
"loss": 4.4968,
"step": 21615
},
{
"epoch": 9.712488769092543,
"grad_norm": 3.796875,
"learning_rate": 3.06642447292343e-05,
"loss": 4.4585,
"step": 21620
},
{
"epoch": 9.714734950584008,
"grad_norm": 4.03125,
"learning_rate": 3.0653914801899244e-05,
"loss": 4.4387,
"step": 21625
},
{
"epoch": 9.716981132075471,
"grad_norm": 3.984375,
"learning_rate": 3.064366563262204e-05,
"loss": 4.4181,
"step": 21630
},
{
"epoch": 9.719227313566936,
"grad_norm": 4.09375,
"learning_rate": 3.0633497227563644e-05,
"loss": 4.4826,
"step": 21635
},
{
"epoch": 9.721473495058401,
"grad_norm": 4.09375,
"learning_rate": 3.062340959283652e-05,
"loss": 4.4405,
"step": 21640
},
{
"epoch": 9.723719676549866,
"grad_norm": 3.890625,
"learning_rate": 3.0613402734504484e-05,
"loss": 4.4622,
"step": 21645
},
{
"epoch": 9.725965858041329,
"grad_norm": 3.984375,
"learning_rate": 3.0603476658582896e-05,
"loss": 4.4617,
"step": 21650
},
{
"epoch": 9.728212039532794,
"grad_norm": 3.90625,
"learning_rate": 3.059363137103851e-05,
"loss": 4.4626,
"step": 21655
},
{
"epoch": 9.730458221024259,
"grad_norm": 4.1875,
"learning_rate": 3.05838668777895e-05,
"loss": 4.4759,
"step": 21660
},
{
"epoch": 9.732704402515724,
"grad_norm": 4.0,
"learning_rate": 3.057418318470553e-05,
"loss": 4.6199,
"step": 21665
},
{
"epoch": 9.734950584007187,
"grad_norm": 4.1875,
"learning_rate": 3.0564580297607615e-05,
"loss": 4.4436,
"step": 21670
},
{
"epoch": 9.737196765498652,
"grad_norm": 3.796875,
"learning_rate": 3.055505822226827e-05,
"loss": 4.4397,
"step": 21675
},
{
"epoch": 9.739442946990117,
"grad_norm": 4.03125,
"learning_rate": 3.054561696441139e-05,
"loss": 4.431,
"step": 21680
},
{
"epoch": 9.741689128481582,
"grad_norm": 4.34375,
"learning_rate": 3.0536256529712295e-05,
"loss": 4.4939,
"step": 21685
},
{
"epoch": 9.743935309973045,
"grad_norm": 3.921875,
"learning_rate": 3.052697692379772e-05,
"loss": 4.5042,
"step": 21690
},
{
"epoch": 9.74618149146451,
"grad_norm": 3.78125,
"learning_rate": 3.0517778152245845e-05,
"loss": 4.4137,
"step": 21695
},
{
"epoch": 9.748427672955975,
"grad_norm": 3.953125,
"learning_rate": 3.050866022058619e-05,
"loss": 4.4318,
"step": 21700
},
{
"epoch": 9.75067385444744,
"grad_norm": 3.953125,
"learning_rate": 3.049962313429976e-05,
"loss": 4.4799,
"step": 21705
},
{
"epoch": 9.752920035938903,
"grad_norm": 4.125,
"learning_rate": 3.0490666898818908e-05,
"loss": 4.5333,
"step": 21710
},
{
"epoch": 9.755166217430368,
"grad_norm": 3.921875,
"learning_rate": 3.0481791519527406e-05,
"loss": 4.436,
"step": 21715
},
{
"epoch": 9.757412398921833,
"grad_norm": 3.921875,
"learning_rate": 3.047299700176042e-05,
"loss": 4.4265,
"step": 21720
},
{
"epoch": 9.759658580413298,
"grad_norm": 4.0625,
"learning_rate": 3.0464283350804495e-05,
"loss": 4.464,
"step": 21725
},
{
"epoch": 9.761904761904763,
"grad_norm": 3.90625,
"learning_rate": 3.0455650571897578e-05,
"loss": 4.3911,
"step": 21730
},
{
"epoch": 9.764150943396226,
"grad_norm": 4.09375,
"learning_rate": 3.0447098670229016e-05,
"loss": 4.4912,
"step": 21735
},
{
"epoch": 9.76639712488769,
"grad_norm": 3.765625,
"learning_rate": 3.0438627650939498e-05,
"loss": 4.4731,
"step": 21740
},
{
"epoch": 9.768643306379156,
"grad_norm": 3.609375,
"learning_rate": 3.0430237519121147e-05,
"loss": 4.4542,
"step": 21745
},
{
"epoch": 9.77088948787062,
"grad_norm": 3.828125,
"learning_rate": 3.042192827981744e-05,
"loss": 4.4322,
"step": 21750
},
{
"epoch": 9.773135669362084,
"grad_norm": 3.859375,
"learning_rate": 3.041369993802318e-05,
"loss": 4.5,
"step": 21755
},
{
"epoch": 9.775381850853549,
"grad_norm": 4.28125,
"learning_rate": 3.0405552498684635e-05,
"loss": 4.5075,
"step": 21760
},
{
"epoch": 9.777628032345014,
"grad_norm": 3.953125,
"learning_rate": 3.0397485966699357e-05,
"loss": 4.4934,
"step": 21765
},
{
"epoch": 9.779874213836479,
"grad_norm": 3.9375,
"learning_rate": 3.0389500346916307e-05,
"loss": 4.4909,
"step": 21770
},
{
"epoch": 9.782120395327942,
"grad_norm": 3.921875,
"learning_rate": 3.0381595644135787e-05,
"loss": 4.5213,
"step": 21775
},
{
"epoch": 9.784366576819407,
"grad_norm": 4.3125,
"learning_rate": 3.0373771863109488e-05,
"loss": 4.5062,
"step": 21780
},
{
"epoch": 9.786612758310872,
"grad_norm": 3.90625,
"learning_rate": 3.036602900854044e-05,
"loss": 4.499,
"step": 21785
},
{
"epoch": 9.788858939802337,
"grad_norm": 4.0625,
"learning_rate": 3.0358367085082986e-05,
"loss": 4.4432,
"step": 21790
},
{
"epoch": 9.7911051212938,
"grad_norm": 4.09375,
"learning_rate": 3.0350786097342906e-05,
"loss": 4.475,
"step": 21795
},
{
"epoch": 9.793351302785265,
"grad_norm": 4.0625,
"learning_rate": 3.0343286049877233e-05,
"loss": 4.4657,
"step": 21800
},
{
"epoch": 9.79559748427673,
"grad_norm": 3.8125,
"learning_rate": 3.033586694719443e-05,
"loss": 4.4604,
"step": 21805
},
{
"epoch": 9.797843665768195,
"grad_norm": 3.828125,
"learning_rate": 3.032852879375425e-05,
"loss": 4.4558,
"step": 21810
},
{
"epoch": 9.800089847259658,
"grad_norm": 3.78125,
"learning_rate": 3.0321271593967798e-05,
"loss": 4.4586,
"step": 21815
},
{
"epoch": 9.802336028751123,
"grad_norm": 4.1875,
"learning_rate": 3.0314095352197537e-05,
"loss": 4.492,
"step": 21820
},
{
"epoch": 9.804582210242588,
"grad_norm": 3.859375,
"learning_rate": 3.0307000072757216e-05,
"loss": 4.4882,
"step": 21825
},
{
"epoch": 9.806828391734053,
"grad_norm": 4.125,
"learning_rate": 3.0299985759911967e-05,
"loss": 4.4841,
"step": 21830
},
{
"epoch": 9.809074573225516,
"grad_norm": 4.03125,
"learning_rate": 3.0293052417878228e-05,
"loss": 4.4587,
"step": 21835
},
{
"epoch": 9.81132075471698,
"grad_norm": 3.875,
"learning_rate": 3.0286200050823747e-05,
"loss": 4.4934,
"step": 21840
},
{
"epoch": 9.813566936208446,
"grad_norm": 4.0625,
"learning_rate": 3.0279428662867646e-05,
"loss": 4.4899,
"step": 21845
},
{
"epoch": 9.81581311769991,
"grad_norm": 3.984375,
"learning_rate": 3.0272738258080327e-05,
"loss": 4.4076,
"step": 21850
},
{
"epoch": 9.818059299191376,
"grad_norm": 4.0,
"learning_rate": 3.026612884048352e-05,
"loss": 4.4671,
"step": 21855
},
{
"epoch": 9.820305480682839,
"grad_norm": 4.03125,
"learning_rate": 3.0259600414050278e-05,
"loss": 4.3938,
"step": 21860
},
{
"epoch": 9.822551662174304,
"grad_norm": 3.984375,
"learning_rate": 3.025315298270496e-05,
"loss": 4.5627,
"step": 21865
},
{
"epoch": 9.824797843665769,
"grad_norm": 4.15625,
"learning_rate": 3.024678655032324e-05,
"loss": 4.5109,
"step": 21870
},
{
"epoch": 9.827044025157234,
"grad_norm": 3.984375,
"learning_rate": 3.0240501120732125e-05,
"loss": 4.499,
"step": 21875
},
{
"epoch": 9.829290206648697,
"grad_norm": 4.09375,
"learning_rate": 3.0234296697709894e-05,
"loss": 4.4657,
"step": 21880
},
{
"epoch": 9.831536388140162,
"grad_norm": 4.1875,
"learning_rate": 3.022817328498614e-05,
"loss": 4.4049,
"step": 21885
},
{
"epoch": 9.833782569631627,
"grad_norm": 3.890625,
"learning_rate": 3.022213088624178e-05,
"loss": 4.508,
"step": 21890
},
{
"epoch": 9.836028751123091,
"grad_norm": 3.953125,
"learning_rate": 3.0216169505109022e-05,
"loss": 4.4213,
"step": 21895
},
{
"epoch": 9.838274932614555,
"grad_norm": 3.984375,
"learning_rate": 3.021028914517134e-05,
"loss": 4.3869,
"step": 21900
},
{
"epoch": 9.84052111410602,
"grad_norm": 4.1875,
"learning_rate": 3.020448980996358e-05,
"loss": 4.5112,
"step": 21905
},
{
"epoch": 9.842767295597485,
"grad_norm": 4.21875,
"learning_rate": 3.0198771502971783e-05,
"loss": 4.5125,
"step": 21910
},
{
"epoch": 9.84501347708895,
"grad_norm": 3.9375,
"learning_rate": 3.019313422763338e-05,
"loss": 4.4641,
"step": 21915
},
{
"epoch": 9.847259658580413,
"grad_norm": 3.65625,
"learning_rate": 3.0187577987337025e-05,
"loss": 4.481,
"step": 21920
},
{
"epoch": 9.849505840071878,
"grad_norm": 3.875,
"learning_rate": 3.0182102785422673e-05,
"loss": 4.4802,
"step": 21925
},
{
"epoch": 9.851752021563343,
"grad_norm": 3.96875,
"learning_rate": 3.017670862518161e-05,
"loss": 4.4696,
"step": 21930
},
{
"epoch": 9.853998203054807,
"grad_norm": 4.03125,
"learning_rate": 3.017139550985634e-05,
"loss": 4.4655,
"step": 21935
},
{
"epoch": 9.85624438454627,
"grad_norm": 3.71875,
"learning_rate": 3.0166163442640687e-05,
"loss": 4.4801,
"step": 21940
},
{
"epoch": 9.858490566037736,
"grad_norm": 4.0625,
"learning_rate": 3.016101242667975e-05,
"loss": 4.4328,
"step": 21945
},
{
"epoch": 9.8607367475292,
"grad_norm": 4.15625,
"learning_rate": 3.015594246506991e-05,
"loss": 4.5086,
"step": 21950
},
{
"epoch": 9.862982929020665,
"grad_norm": 3.890625,
"learning_rate": 3.0150953560858822e-05,
"loss": 4.4391,
"step": 21955
},
{
"epoch": 9.865229110512129,
"grad_norm": 3.53125,
"learning_rate": 3.0146045717045403e-05,
"loss": 4.4804,
"step": 21960
},
{
"epoch": 9.867475292003594,
"grad_norm": 4.09375,
"learning_rate": 3.014121893657986e-05,
"loss": 4.4691,
"step": 21965
},
{
"epoch": 9.869721473495058,
"grad_norm": 4.0,
"learning_rate": 3.0136473222363663e-05,
"loss": 4.4258,
"step": 21970
},
{
"epoch": 9.871967654986523,
"grad_norm": 4.0625,
"learning_rate": 3.0131808577249562e-05,
"loss": 4.4583,
"step": 21975
},
{
"epoch": 9.874213836477988,
"grad_norm": 4.125,
"learning_rate": 3.0127225004041557e-05,
"loss": 4.467,
"step": 21980
},
{
"epoch": 9.876460017969451,
"grad_norm": 4.0625,
"learning_rate": 3.012272250549493e-05,
"loss": 4.4482,
"step": 21985
},
{
"epoch": 9.878706199460916,
"grad_norm": 3.609375,
"learning_rate": 3.011830108431621e-05,
"loss": 4.4542,
"step": 21990
},
{
"epoch": 9.880952380952381,
"grad_norm": 4.0,
"learning_rate": 3.011396074316322e-05,
"loss": 4.4514,
"step": 21995
},
{
"epoch": 9.883198562443845,
"grad_norm": 3.71875,
"learning_rate": 3.0109701484644995e-05,
"loss": 4.4542,
"step": 22000
},
{
"epoch": 9.883198562443845,
"eval_loss": 4.778508186340332,
"eval_runtime": 16.0782,
"eval_samples_per_second": 1928.886,
"eval_steps_per_second": 241.134,
"step": 22000
},
{
"epoch": 9.88544474393531,
"grad_norm": 4.03125,
"learning_rate": 3.010552331132188e-05,
"loss": 4.4514,
"step": 22005
},
{
"epoch": 9.887690925426774,
"grad_norm": 4.03125,
"learning_rate": 3.0101426225705458e-05,
"loss": 4.4856,
"step": 22010
},
{
"epoch": 9.88993710691824,
"grad_norm": 4.28125,
"learning_rate": 3.0097410230258556e-05,
"loss": 4.4761,
"step": 22015
},
{
"epoch": 9.892183288409704,
"grad_norm": 3.984375,
"learning_rate": 3.0093475327395274e-05,
"loss": 4.4174,
"step": 22020
},
{
"epoch": 9.894429469901167,
"grad_norm": 4.1875,
"learning_rate": 3.0089621519480948e-05,
"loss": 4.4723,
"step": 22025
},
{
"epoch": 9.896675651392632,
"grad_norm": 4.125,
"learning_rate": 3.0085848808832195e-05,
"loss": 4.4188,
"step": 22030
},
{
"epoch": 9.898921832884097,
"grad_norm": 3.875,
"learning_rate": 3.008215719771684e-05,
"loss": 4.4611,
"step": 22035
},
{
"epoch": 9.901168014375562,
"grad_norm": 4.0,
"learning_rate": 3.0078546688354016e-05,
"loss": 4.4664,
"step": 22040
},
{
"epoch": 9.903414195867025,
"grad_norm": 4.09375,
"learning_rate": 3.0075017282914045e-05,
"loss": 4.4478,
"step": 22045
},
{
"epoch": 9.90566037735849,
"grad_norm": 4.375,
"learning_rate": 3.0071568983518542e-05,
"loss": 4.5441,
"step": 22050
},
{
"epoch": 9.907906558849955,
"grad_norm": 3.96875,
"learning_rate": 3.0068201792240328e-05,
"loss": 4.451,
"step": 22055
},
{
"epoch": 9.91015274034142,
"grad_norm": 4.09375,
"learning_rate": 3.006491571110349e-05,
"loss": 4.5322,
"step": 22060
},
{
"epoch": 9.912398921832883,
"grad_norm": 4.1875,
"learning_rate": 3.0061710742083364e-05,
"loss": 4.3999,
"step": 22065
},
{
"epoch": 9.914645103324348,
"grad_norm": 3.921875,
"learning_rate": 3.005858688710651e-05,
"loss": 4.4591,
"step": 22070
},
{
"epoch": 9.916891284815813,
"grad_norm": 3.984375,
"learning_rate": 3.005554414805075e-05,
"loss": 4.4283,
"step": 22075
},
{
"epoch": 9.919137466307278,
"grad_norm": 4.0,
"learning_rate": 3.0052582526745136e-05,
"loss": 4.4817,
"step": 22080
},
{
"epoch": 9.921383647798741,
"grad_norm": 3.9375,
"learning_rate": 3.004970202496993e-05,
"loss": 4.4711,
"step": 22085
},
{
"epoch": 9.923629829290206,
"grad_norm": 3.984375,
"learning_rate": 3.0046902644456684e-05,
"loss": 4.4818,
"step": 22090
},
{
"epoch": 9.925876010781671,
"grad_norm": 3.875,
"learning_rate": 3.0044184386888162e-05,
"loss": 4.4919,
"step": 22095
},
{
"epoch": 9.928122192273136,
"grad_norm": 4.125,
"learning_rate": 3.004154725389835e-05,
"loss": 4.4638,
"step": 22100
},
{
"epoch": 9.9303683737646,
"grad_norm": 4.03125,
"learning_rate": 3.0038991247072477e-05,
"loss": 4.4633,
"step": 22105
},
{
"epoch": 9.932614555256064,
"grad_norm": 3.859375,
"learning_rate": 3.0036516367947006e-05,
"loss": 4.4803,
"step": 22110
},
{
"epoch": 9.93486073674753,
"grad_norm": 4.25,
"learning_rate": 3.0034122618009663e-05,
"loss": 4.4513,
"step": 22115
},
{
"epoch": 9.937106918238994,
"grad_norm": 3.953125,
"learning_rate": 3.003180999869934e-05,
"loss": 4.3843,
"step": 22120
},
{
"epoch": 9.939353099730457,
"grad_norm": 3.96875,
"learning_rate": 3.0029578511406223e-05,
"loss": 4.4749,
"step": 22125
},
{
"epoch": 9.941599281221922,
"grad_norm": 3.921875,
"learning_rate": 3.002742815747171e-05,
"loss": 4.4282,
"step": 22130
},
{
"epoch": 9.943845462713387,
"grad_norm": 3.984375,
"learning_rate": 3.0025358938188387e-05,
"loss": 4.4134,
"step": 22135
},
{
"epoch": 9.946091644204852,
"grad_norm": 4.09375,
"learning_rate": 3.0023370854800123e-05,
"loss": 4.5129,
"step": 22140
},
{
"epoch": 9.948337825696317,
"grad_norm": 4.1875,
"learning_rate": 3.002146390850201e-05,
"loss": 4.523,
"step": 22145
},
{
"epoch": 9.95058400718778,
"grad_norm": 3.875,
"learning_rate": 3.0019638100440313e-05,
"loss": 4.4539,
"step": 22150
},
{
"epoch": 9.952830188679245,
"grad_norm": 4.0625,
"learning_rate": 3.00178934317126e-05,
"loss": 4.43,
"step": 22155
},
{
"epoch": 9.95507637017071,
"grad_norm": 3.890625,
"learning_rate": 3.0016229903367582e-05,
"loss": 4.4415,
"step": 22160
},
{
"epoch": 9.957322551662175,
"grad_norm": 3.84375,
"learning_rate": 3.0014647516405286e-05,
"loss": 4.5016,
"step": 22165
},
{
"epoch": 9.959568733153638,
"grad_norm": 4.15625,
"learning_rate": 3.0013146271776875e-05,
"loss": 4.506,
"step": 22170
},
{
"epoch": 9.961814914645103,
"grad_norm": 3.90625,
"learning_rate": 3.001172617038481e-05,
"loss": 4.4784,
"step": 22175
},
{
"epoch": 9.964061096136568,
"grad_norm": 3.65625,
"learning_rate": 3.0010387213082716e-05,
"loss": 4.4242,
"step": 22180
},
{
"epoch": 9.966307277628033,
"grad_norm": 4.15625,
"learning_rate": 3.0009129400675485e-05,
"loss": 4.4398,
"step": 22185
},
{
"epoch": 9.968553459119496,
"grad_norm": 3.9375,
"learning_rate": 3.0007952733919195e-05,
"loss": 4.4891,
"step": 22190
},
{
"epoch": 9.970799640610961,
"grad_norm": 4.1875,
"learning_rate": 3.0006857213521175e-05,
"loss": 4.4433,
"step": 22195
},
{
"epoch": 9.973045822102426,
"grad_norm": 3.953125,
"learning_rate": 3.0005842840139957e-05,
"loss": 4.3985,
"step": 22200
},
{
"epoch": 9.975292003593891,
"grad_norm": 3.8125,
"learning_rate": 3.0004909614385315e-05,
"loss": 4.4239,
"step": 22205
},
{
"epoch": 9.977538185085354,
"grad_norm": 4.0625,
"learning_rate": 3.0004057536818215e-05,
"loss": 4.4357,
"step": 22210
},
{
"epoch": 9.979784366576819,
"grad_norm": 4.0,
"learning_rate": 3.000328660795086e-05,
"loss": 4.4589,
"step": 22215
},
{
"epoch": 9.982030548068284,
"grad_norm": 4.375,
"learning_rate": 3.0002596828246667e-05,
"loss": 4.4735,
"step": 22220
},
{
"epoch": 9.984276729559749,
"grad_norm": 3.921875,
"learning_rate": 3.0001988198120287e-05,
"loss": 4.4521,
"step": 22225
},
{
"epoch": 9.986522911051212,
"grad_norm": 3.921875,
"learning_rate": 3.0001460717937572e-05,
"loss": 4.4655,
"step": 22230
},
{
"epoch": 9.988769092542677,
"grad_norm": 3.84375,
"learning_rate": 3.0001014388015603e-05,
"loss": 4.4577,
"step": 22235
},
{
"epoch": 9.991015274034142,
"grad_norm": 3.953125,
"learning_rate": 3.0000649208622676e-05,
"loss": 4.4676,
"step": 22240
},
{
"epoch": 9.993261455525607,
"grad_norm": 3.984375,
"learning_rate": 3.000036517997831e-05,
"loss": 4.4503,
"step": 22245
},
{
"epoch": 9.99550763701707,
"grad_norm": 3.953125,
"learning_rate": 3.0000162302253235e-05,
"loss": 4.401,
"step": 22250
},
{
"epoch": 9.997753818508535,
"grad_norm": 4.0,
"learning_rate": 3.0000040575569408e-05,
"loss": 4.4357,
"step": 22255
},
{
"epoch": 10.0,
"grad_norm": 8.625,
"learning_rate": 2.9999999999999997e-05,
"loss": 4.5633,
"step": 22260
}
],
"logging_steps": 5,
"max_steps": 22260,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.03170666514432e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}