{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1000, "global_step": 22260, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022461814914645105, "grad_norm": 54.0, "learning_rate": 7.499999999999999e-07, "loss": 10.989, "step": 5 }, { "epoch": 0.004492362982929021, "grad_norm": 52.75, "learning_rate": 1.4999999999999998e-06, "loss": 10.984, "step": 10 }, { "epoch": 0.006738544474393531, "grad_norm": 52.5, "learning_rate": 2.2499999999999996e-06, "loss": 10.9491, "step": 15 }, { "epoch": 0.008984725965858042, "grad_norm": 50.25, "learning_rate": 2.9999999999999997e-06, "loss": 10.8608, "step": 20 }, { "epoch": 0.011230907457322551, "grad_norm": 44.75, "learning_rate": 3.7499999999999997e-06, "loss": 10.7375, "step": 25 }, { "epoch": 0.013477088948787063, "grad_norm": 38.0, "learning_rate": 4.499999999999999e-06, "loss": 10.5621, "step": 30 }, { "epoch": 0.015723270440251572, "grad_norm": 25.5, "learning_rate": 5.25e-06, "loss": 10.3304, "step": 35 }, { "epoch": 0.017969451931716084, "grad_norm": 19.25, "learning_rate": 5.999999999999999e-06, "loss": 10.1403, "step": 40 }, { "epoch": 0.02021563342318059, "grad_norm": 13.8125, "learning_rate": 6.749999999999999e-06, "loss": 9.9521, "step": 45 }, { "epoch": 0.022461814914645103, "grad_norm": 11.1875, "learning_rate": 7.499999999999999e-06, "loss": 9.843, "step": 50 }, { "epoch": 0.024707996406109614, "grad_norm": 10.5, "learning_rate": 8.249999999999999e-06, "loss": 9.7584, "step": 55 }, { "epoch": 0.026954177897574125, "grad_norm": 10.0625, "learning_rate": 8.999999999999999e-06, "loss": 9.7293, "step": 60 }, { "epoch": 0.029200359389038633, "grad_norm": 9.1875, "learning_rate": 9.75e-06, "loss": 9.719, "step": 65 }, { "epoch": 0.031446540880503145, "grad_norm": 8.9375, "learning_rate": 1.05e-05, "loss": 9.6908, "step": 70 }, { "epoch": 0.03369272237196765, "grad_norm": 9.125, "learning_rate": 1.1249999999999999e-05, "loss": 9.6617, "step": 75 }, { "epoch": 0.03593890386343217, "grad_norm": 9.0625, "learning_rate": 1.1999999999999999e-05, "loss": 9.6228, "step": 80 }, { "epoch": 0.038185085354896675, "grad_norm": 9.125, "learning_rate": 1.275e-05, "loss": 9.6069, "step": 85 }, { "epoch": 0.04043126684636118, "grad_norm": 9.3125, "learning_rate": 1.3499999999999998e-05, "loss": 9.5342, "step": 90 }, { "epoch": 0.0426774483378257, "grad_norm": 8.9375, "learning_rate": 1.4249999999999999e-05, "loss": 9.5187, "step": 95 }, { "epoch": 0.044923629829290206, "grad_norm": 9.125, "learning_rate": 1.4999999999999999e-05, "loss": 9.4719, "step": 100 }, { "epoch": 0.04716981132075472, "grad_norm": 9.0, "learning_rate": 1.5749999999999997e-05, "loss": 9.4167, "step": 105 }, { "epoch": 0.04941599281221923, "grad_norm": 8.9375, "learning_rate": 1.6499999999999998e-05, "loss": 9.3825, "step": 110 }, { "epoch": 0.051662174303683736, "grad_norm": 8.75, "learning_rate": 1.725e-05, "loss": 9.3577, "step": 115 }, { "epoch": 0.05390835579514825, "grad_norm": 8.625, "learning_rate": 1.7999999999999997e-05, "loss": 9.3387, "step": 120 }, { "epoch": 0.05615453728661276, "grad_norm": 9.375, "learning_rate": 1.875e-05, "loss": 9.2947, "step": 125 }, { "epoch": 0.05840071877807727, "grad_norm": 8.75, "learning_rate": 1.95e-05, "loss": 9.2177, "step": 130 }, { "epoch": 0.06064690026954178, "grad_norm": 8.8125, "learning_rate": 2.025e-05, "loss": 9.1683, "step": 135 }, { "epoch": 0.06289308176100629, "grad_norm": 9.875, "learning_rate": 2.1e-05, "loss": 9.1444, "step": 140 }, { "epoch": 0.0651392632524708, "grad_norm": 9.625, "learning_rate": 2.1749999999999997e-05, "loss": 9.0632, "step": 145 }, { "epoch": 0.0673854447439353, "grad_norm": 8.8125, "learning_rate": 2.2499999999999998e-05, "loss": 9.0828, "step": 150 }, { "epoch": 0.06963162623539983, "grad_norm": 9.5625, "learning_rate": 2.325e-05, "loss": 9.0005, "step": 155 }, { "epoch": 0.07187780772686433, "grad_norm": 11.1875, "learning_rate": 2.3999999999999997e-05, "loss": 8.9463, "step": 160 }, { "epoch": 0.07412398921832884, "grad_norm": 9.3125, "learning_rate": 2.475e-05, "loss": 8.9145, "step": 165 }, { "epoch": 0.07637017070979335, "grad_norm": 8.1875, "learning_rate": 2.55e-05, "loss": 8.8803, "step": 170 }, { "epoch": 0.07861635220125786, "grad_norm": 7.65625, "learning_rate": 2.6249999999999998e-05, "loss": 8.8266, "step": 175 }, { "epoch": 0.08086253369272237, "grad_norm": 7.78125, "learning_rate": 2.6999999999999996e-05, "loss": 8.7826, "step": 180 }, { "epoch": 0.08310871518418689, "grad_norm": 8.875, "learning_rate": 2.7749999999999997e-05, "loss": 8.7463, "step": 185 }, { "epoch": 0.0853548966756514, "grad_norm": 8.375, "learning_rate": 2.8499999999999998e-05, "loss": 8.6836, "step": 190 }, { "epoch": 0.0876010781671159, "grad_norm": 8.5, "learning_rate": 2.925e-05, "loss": 8.6827, "step": 195 }, { "epoch": 0.08984725965858041, "grad_norm": 8.25, "learning_rate": 2.9999999999999997e-05, "loss": 8.588, "step": 200 }, { "epoch": 0.09209344115004492, "grad_norm": 8.3125, "learning_rate": 3.0749999999999995e-05, "loss": 8.5417, "step": 205 }, { "epoch": 0.09433962264150944, "grad_norm": 9.4375, "learning_rate": 3.149999999999999e-05, "loss": 8.5287, "step": 210 }, { "epoch": 0.09658580413297395, "grad_norm": 8.125, "learning_rate": 3.225e-05, "loss": 8.49, "step": 215 }, { "epoch": 0.09883198562443846, "grad_norm": 7.59375, "learning_rate": 3.2999999999999996e-05, "loss": 8.4025, "step": 220 }, { "epoch": 0.10107816711590296, "grad_norm": 8.75, "learning_rate": 3.375e-05, "loss": 8.3121, "step": 225 }, { "epoch": 0.10332434860736747, "grad_norm": 7.8125, "learning_rate": 3.45e-05, "loss": 8.2635, "step": 230 }, { "epoch": 0.10557053009883198, "grad_norm": 8.3125, "learning_rate": 3.5249999999999996e-05, "loss": 8.2691, "step": 235 }, { "epoch": 0.1078167115902965, "grad_norm": 9.4375, "learning_rate": 3.5999999999999994e-05, "loss": 8.1828, "step": 240 }, { "epoch": 0.11006289308176101, "grad_norm": 7.0625, "learning_rate": 3.675e-05, "loss": 8.0901, "step": 245 }, { "epoch": 0.11230907457322552, "grad_norm": 8.125, "learning_rate": 3.75e-05, "loss": 8.0418, "step": 250 }, { "epoch": 0.11455525606469003, "grad_norm": 7.0625, "learning_rate": 3.8249999999999995e-05, "loss": 8.0148, "step": 255 }, { "epoch": 0.11680143755615453, "grad_norm": 7.5625, "learning_rate": 3.9e-05, "loss": 7.9943, "step": 260 }, { "epoch": 0.11904761904761904, "grad_norm": 7.0625, "learning_rate": 3.975e-05, "loss": 7.852, "step": 265 }, { "epoch": 0.12129380053908356, "grad_norm": 6.6875, "learning_rate": 4.05e-05, "loss": 7.8506, "step": 270 }, { "epoch": 0.12353998203054807, "grad_norm": 7.46875, "learning_rate": 4.125e-05, "loss": 7.7912, "step": 275 }, { "epoch": 0.12578616352201258, "grad_norm": 6.0, "learning_rate": 4.2e-05, "loss": 7.7331, "step": 280 }, { "epoch": 0.1280323450134771, "grad_norm": 6.75, "learning_rate": 4.2749999999999996e-05, "loss": 7.6362, "step": 285 }, { "epoch": 0.1302785265049416, "grad_norm": 5.9375, "learning_rate": 4.3499999999999993e-05, "loss": 7.5867, "step": 290 }, { "epoch": 0.13252470799640612, "grad_norm": 6.40625, "learning_rate": 4.424999999999999e-05, "loss": 7.5268, "step": 295 }, { "epoch": 0.1347708894878706, "grad_norm": 5.71875, "learning_rate": 4.4999999999999996e-05, "loss": 7.5554, "step": 300 }, { "epoch": 0.13701707097933513, "grad_norm": 5.5, "learning_rate": 4.5749999999999994e-05, "loss": 7.4486, "step": 305 }, { "epoch": 0.13926325247079965, "grad_norm": 5.15625, "learning_rate": 4.65e-05, "loss": 7.4554, "step": 310 }, { "epoch": 0.14150943396226415, "grad_norm": 4.84375, "learning_rate": 4.7249999999999997e-05, "loss": 7.3681, "step": 315 }, { "epoch": 0.14375561545372867, "grad_norm": 7.625, "learning_rate": 4.7999999999999994e-05, "loss": 7.2977, "step": 320 }, { "epoch": 0.14600179694519316, "grad_norm": 5.25, "learning_rate": 4.875e-05, "loss": 7.2572, "step": 325 }, { "epoch": 0.14824797843665768, "grad_norm": 5.125, "learning_rate": 4.95e-05, "loss": 7.322, "step": 330 }, { "epoch": 0.15049415992812218, "grad_norm": 4.96875, "learning_rate": 5.025e-05, "loss": 7.2646, "step": 335 }, { "epoch": 0.1527403414195867, "grad_norm": 4.96875, "learning_rate": 5.1e-05, "loss": 7.32, "step": 340 }, { "epoch": 0.15498652291105122, "grad_norm": 5.3125, "learning_rate": 5.174999999999999e-05, "loss": 7.209, "step": 345 }, { "epoch": 0.15723270440251572, "grad_norm": 5.40625, "learning_rate": 5.2499999999999995e-05, "loss": 7.1961, "step": 350 }, { "epoch": 0.15947888589398024, "grad_norm": 4.15625, "learning_rate": 5.324999999999999e-05, "loss": 7.2062, "step": 355 }, { "epoch": 0.16172506738544473, "grad_norm": 4.65625, "learning_rate": 5.399999999999999e-05, "loss": 7.1401, "step": 360 }, { "epoch": 0.16397124887690925, "grad_norm": 5.71875, "learning_rate": 5.4749999999999996e-05, "loss": 7.1402, "step": 365 }, { "epoch": 0.16621743036837378, "grad_norm": 5.34375, "learning_rate": 5.5499999999999994e-05, "loss": 7.073, "step": 370 }, { "epoch": 0.16846361185983827, "grad_norm": 5.96875, "learning_rate": 5.625e-05, "loss": 7.115, "step": 375 }, { "epoch": 0.1707097933513028, "grad_norm": 4.625, "learning_rate": 5.6999999999999996e-05, "loss": 7.1363, "step": 380 }, { "epoch": 0.17295597484276728, "grad_norm": 5.34375, "learning_rate": 5.7749999999999994e-05, "loss": 7.1075, "step": 385 }, { "epoch": 0.1752021563342318, "grad_norm": 4.46875, "learning_rate": 5.85e-05, "loss": 7.0746, "step": 390 }, { "epoch": 0.17744833782569633, "grad_norm": 4.53125, "learning_rate": 5.925e-05, "loss": 7.0877, "step": 395 }, { "epoch": 0.17969451931716082, "grad_norm": 4.6875, "learning_rate": 5.9999999999999995e-05, "loss": 7.033, "step": 400 }, { "epoch": 0.18194070080862534, "grad_norm": 4.9375, "learning_rate": 6.075e-05, "loss": 7.0603, "step": 405 }, { "epoch": 0.18418688230008984, "grad_norm": 4.8125, "learning_rate": 6.149999999999999e-05, "loss": 7.0149, "step": 410 }, { "epoch": 0.18643306379155436, "grad_norm": 4.6875, "learning_rate": 6.225e-05, "loss": 6.9823, "step": 415 }, { "epoch": 0.18867924528301888, "grad_norm": 5.65625, "learning_rate": 6.299999999999999e-05, "loss": 7.0107, "step": 420 }, { "epoch": 0.19092542677448338, "grad_norm": 4.5625, "learning_rate": 6.374999999999999e-05, "loss": 7.0235, "step": 425 }, { "epoch": 0.1931716082659479, "grad_norm": 4.71875, "learning_rate": 6.45e-05, "loss": 6.9444, "step": 430 }, { "epoch": 0.1954177897574124, "grad_norm": 4.6875, "learning_rate": 6.525e-05, "loss": 6.9067, "step": 435 }, { "epoch": 0.1976639712488769, "grad_norm": 4.375, "learning_rate": 6.599999999999999e-05, "loss": 6.9952, "step": 440 }, { "epoch": 0.1999101527403414, "grad_norm": 4.46875, "learning_rate": 6.675e-05, "loss": 6.8992, "step": 445 }, { "epoch": 0.20215633423180593, "grad_norm": 4.875, "learning_rate": 6.75e-05, "loss": 6.931, "step": 450 }, { "epoch": 0.20440251572327045, "grad_norm": 4.6875, "learning_rate": 6.824999999999999e-05, "loss": 6.9036, "step": 455 }, { "epoch": 0.20664869721473494, "grad_norm": 4.75, "learning_rate": 6.9e-05, "loss": 6.9332, "step": 460 }, { "epoch": 0.20889487870619947, "grad_norm": 4.25, "learning_rate": 6.975e-05, "loss": 7.0612, "step": 465 }, { "epoch": 0.21114106019766396, "grad_norm": 4.59375, "learning_rate": 7.049999999999999e-05, "loss": 6.8777, "step": 470 }, { "epoch": 0.21338724168912848, "grad_norm": 4.59375, "learning_rate": 7.125e-05, "loss": 6.8593, "step": 475 }, { "epoch": 0.215633423180593, "grad_norm": 5.1875, "learning_rate": 7.199999999999999e-05, "loss": 6.9541, "step": 480 }, { "epoch": 0.2178796046720575, "grad_norm": 4.65625, "learning_rate": 7.274999999999999e-05, "loss": 6.878, "step": 485 }, { "epoch": 0.22012578616352202, "grad_norm": 5.1875, "learning_rate": 7.35e-05, "loss": 6.8284, "step": 490 }, { "epoch": 0.2223719676549865, "grad_norm": 3.9375, "learning_rate": 7.424999999999999e-05, "loss": 6.8567, "step": 495 }, { "epoch": 0.22461814914645103, "grad_norm": 5.1875, "learning_rate": 7.5e-05, "loss": 6.8235, "step": 500 }, { "epoch": 0.22686433063791556, "grad_norm": 4.65625, "learning_rate": 7.575e-05, "loss": 6.8903, "step": 505 }, { "epoch": 0.22911051212938005, "grad_norm": 5.875, "learning_rate": 7.649999999999999e-05, "loss": 6.8404, "step": 510 }, { "epoch": 0.23135669362084457, "grad_norm": 5.0625, "learning_rate": 7.725e-05, "loss": 6.8318, "step": 515 }, { "epoch": 0.23360287511230907, "grad_norm": 4.5625, "learning_rate": 7.8e-05, "loss": 6.8522, "step": 520 }, { "epoch": 0.2358490566037736, "grad_norm": 5.03125, "learning_rate": 7.874999999999999e-05, "loss": 6.859, "step": 525 }, { "epoch": 0.23809523809523808, "grad_norm": 4.71875, "learning_rate": 7.95e-05, "loss": 6.8336, "step": 530 }, { "epoch": 0.2403414195867026, "grad_norm": 4.875, "learning_rate": 8.025e-05, "loss": 6.7897, "step": 535 }, { "epoch": 0.24258760107816713, "grad_norm": 4.375, "learning_rate": 8.1e-05, "loss": 6.7873, "step": 540 }, { "epoch": 0.24483378256963162, "grad_norm": 4.34375, "learning_rate": 8.175e-05, "loss": 6.7691, "step": 545 }, { "epoch": 0.24707996406109614, "grad_norm": 4.40625, "learning_rate": 8.25e-05, "loss": 6.8252, "step": 550 }, { "epoch": 0.24932614555256064, "grad_norm": 4.6875, "learning_rate": 8.325e-05, "loss": 6.8071, "step": 555 }, { "epoch": 0.25157232704402516, "grad_norm": 4.65625, "learning_rate": 8.4e-05, "loss": 6.7156, "step": 560 }, { "epoch": 0.25381850853548965, "grad_norm": 4.875, "learning_rate": 8.474999999999999e-05, "loss": 6.8189, "step": 565 }, { "epoch": 0.2560646900269542, "grad_norm": 4.53125, "learning_rate": 8.549999999999999e-05, "loss": 6.8159, "step": 570 }, { "epoch": 0.2583108715184187, "grad_norm": 3.75, "learning_rate": 8.624999999999998e-05, "loss": 6.847, "step": 575 }, { "epoch": 0.2605570530098832, "grad_norm": 4.71875, "learning_rate": 8.699999999999999e-05, "loss": 6.7576, "step": 580 }, { "epoch": 0.2628032345013477, "grad_norm": 5.375, "learning_rate": 8.774999999999999e-05, "loss": 6.7211, "step": 585 }, { "epoch": 0.26504941599281223, "grad_norm": 4.875, "learning_rate": 8.849999999999998e-05, "loss": 6.7255, "step": 590 }, { "epoch": 0.2672955974842767, "grad_norm": 4.71875, "learning_rate": 8.924999999999999e-05, "loss": 6.6598, "step": 595 }, { "epoch": 0.2695417789757412, "grad_norm": 4.25, "learning_rate": 8.999999999999999e-05, "loss": 6.7735, "step": 600 }, { "epoch": 0.27178796046720577, "grad_norm": 4.3125, "learning_rate": 9.074999999999998e-05, "loss": 6.7253, "step": 605 }, { "epoch": 0.27403414195867026, "grad_norm": 4.8125, "learning_rate": 9.149999999999999e-05, "loss": 6.6825, "step": 610 }, { "epoch": 0.27628032345013476, "grad_norm": 4.40625, "learning_rate": 9.224999999999999e-05, "loss": 6.7523, "step": 615 }, { "epoch": 0.2785265049415993, "grad_norm": 4.46875, "learning_rate": 9.3e-05, "loss": 6.7212, "step": 620 }, { "epoch": 0.2807726864330638, "grad_norm": 4.875, "learning_rate": 9.374999999999999e-05, "loss": 6.7052, "step": 625 }, { "epoch": 0.2830188679245283, "grad_norm": 4.6875, "learning_rate": 9.449999999999999e-05, "loss": 6.7031, "step": 630 }, { "epoch": 0.2852650494159928, "grad_norm": 4.1875, "learning_rate": 9.525e-05, "loss": 6.7163, "step": 635 }, { "epoch": 0.28751123090745734, "grad_norm": 4.15625, "learning_rate": 9.599999999999999e-05, "loss": 6.7148, "step": 640 }, { "epoch": 0.28975741239892183, "grad_norm": 3.78125, "learning_rate": 9.675e-05, "loss": 6.7027, "step": 645 }, { "epoch": 0.2920035938903863, "grad_norm": 4.375, "learning_rate": 9.75e-05, "loss": 6.6511, "step": 650 }, { "epoch": 0.2942497753818509, "grad_norm": 4.0625, "learning_rate": 9.824999999999999e-05, "loss": 6.704, "step": 655 }, { "epoch": 0.29649595687331537, "grad_norm": 4.09375, "learning_rate": 9.9e-05, "loss": 6.689, "step": 660 }, { "epoch": 0.29874213836477986, "grad_norm": 3.875, "learning_rate": 9.975e-05, "loss": 6.6784, "step": 665 }, { "epoch": 0.30098831985624436, "grad_norm": 4.5, "learning_rate": 0.0001005, "loss": 6.597, "step": 670 }, { "epoch": 0.3032345013477089, "grad_norm": 4.3125, "learning_rate": 0.00010125, "loss": 6.6198, "step": 675 }, { "epoch": 0.3054806828391734, "grad_norm": 4.03125, "learning_rate": 0.000102, "loss": 6.6226, "step": 680 }, { "epoch": 0.3077268643306379, "grad_norm": 4.03125, "learning_rate": 0.00010275, "loss": 6.627, "step": 685 }, { "epoch": 0.30997304582210244, "grad_norm": 5.1875, "learning_rate": 0.00010349999999999998, "loss": 6.6003, "step": 690 }, { "epoch": 0.31221922731356694, "grad_norm": 3.640625, "learning_rate": 0.00010424999999999999, "loss": 6.5845, "step": 695 }, { "epoch": 0.31446540880503143, "grad_norm": 4.4375, "learning_rate": 0.00010499999999999999, "loss": 6.6143, "step": 700 }, { "epoch": 0.316711590296496, "grad_norm": 4.90625, "learning_rate": 0.00010574999999999998, "loss": 6.6305, "step": 705 }, { "epoch": 0.3189577717879605, "grad_norm": 4.3125, "learning_rate": 0.00010649999999999999, "loss": 6.5312, "step": 710 }, { "epoch": 0.32120395327942497, "grad_norm": 4.15625, "learning_rate": 0.00010724999999999999, "loss": 6.63, "step": 715 }, { "epoch": 0.32345013477088946, "grad_norm": 4.53125, "learning_rate": 0.00010799999999999998, "loss": 6.564, "step": 720 }, { "epoch": 0.325696316262354, "grad_norm": 4.03125, "learning_rate": 0.00010874999999999999, "loss": 6.6572, "step": 725 }, { "epoch": 0.3279424977538185, "grad_norm": 4.40625, "learning_rate": 0.00010949999999999999, "loss": 6.5728, "step": 730 }, { "epoch": 0.330188679245283, "grad_norm": 4.34375, "learning_rate": 0.00011024999999999998, "loss": 6.5245, "step": 735 }, { "epoch": 0.33243486073674755, "grad_norm": 5.5, "learning_rate": 0.00011099999999999999, "loss": 6.5883, "step": 740 }, { "epoch": 0.33468104222821204, "grad_norm": 5.53125, "learning_rate": 0.00011174999999999999, "loss": 6.5549, "step": 745 }, { "epoch": 0.33692722371967654, "grad_norm": 4.40625, "learning_rate": 0.0001125, "loss": 6.5269, "step": 750 }, { "epoch": 0.33917340521114103, "grad_norm": 4.65625, "learning_rate": 0.00011324999999999999, "loss": 6.5262, "step": 755 }, { "epoch": 0.3414195867026056, "grad_norm": 4.25, "learning_rate": 0.00011399999999999999, "loss": 6.4958, "step": 760 }, { "epoch": 0.3436657681940701, "grad_norm": 4.34375, "learning_rate": 0.00011475, "loss": 6.4719, "step": 765 }, { "epoch": 0.34591194968553457, "grad_norm": 3.828125, "learning_rate": 0.00011549999999999999, "loss": 6.4948, "step": 770 }, { "epoch": 0.3481581311769991, "grad_norm": 3.890625, "learning_rate": 0.00011624999999999999, "loss": 6.5652, "step": 775 }, { "epoch": 0.3504043126684636, "grad_norm": 3.828125, "learning_rate": 0.000117, "loss": 6.633, "step": 780 }, { "epoch": 0.3526504941599281, "grad_norm": 3.78125, "learning_rate": 0.00011774999999999999, "loss": 6.4617, "step": 785 }, { "epoch": 0.35489667565139266, "grad_norm": 3.9375, "learning_rate": 0.0001185, "loss": 6.524, "step": 790 }, { "epoch": 0.35714285714285715, "grad_norm": 7.25, "learning_rate": 0.00011925, "loss": 6.4985, "step": 795 }, { "epoch": 0.35938903863432164, "grad_norm": 3.828125, "learning_rate": 0.00011999999999999999, "loss": 6.4988, "step": 800 }, { "epoch": 0.36163522012578614, "grad_norm": 5.125, "learning_rate": 0.00012075, "loss": 6.5393, "step": 805 }, { "epoch": 0.3638814016172507, "grad_norm": 4.90625, "learning_rate": 0.0001215, "loss": 6.4869, "step": 810 }, { "epoch": 0.3661275831087152, "grad_norm": 4.1875, "learning_rate": 0.00012225, "loss": 6.4419, "step": 815 }, { "epoch": 0.3683737646001797, "grad_norm": 3.765625, "learning_rate": 0.00012299999999999998, "loss": 6.574, "step": 820 }, { "epoch": 0.3706199460916442, "grad_norm": 3.796875, "learning_rate": 0.00012374999999999997, "loss": 6.5063, "step": 825 }, { "epoch": 0.3728661275831087, "grad_norm": 3.734375, "learning_rate": 0.0001245, "loss": 6.5404, "step": 830 }, { "epoch": 0.3751123090745732, "grad_norm": 3.65625, "learning_rate": 0.00012524999999999998, "loss": 6.4726, "step": 835 }, { "epoch": 0.37735849056603776, "grad_norm": 4.0, "learning_rate": 0.00012599999999999997, "loss": 6.4099, "step": 840 }, { "epoch": 0.37960467205750226, "grad_norm": 4.25, "learning_rate": 0.00012675, "loss": 6.3966, "step": 845 }, { "epoch": 0.38185085354896675, "grad_norm": 3.828125, "learning_rate": 0.00012749999999999998, "loss": 6.4607, "step": 850 }, { "epoch": 0.38409703504043125, "grad_norm": 4.28125, "learning_rate": 0.00012824999999999997, "loss": 6.4718, "step": 855 }, { "epoch": 0.3863432165318958, "grad_norm": 4.71875, "learning_rate": 0.000129, "loss": 6.4569, "step": 860 }, { "epoch": 0.3885893980233603, "grad_norm": 4.375, "learning_rate": 0.00012974999999999998, "loss": 6.3576, "step": 865 }, { "epoch": 0.3908355795148248, "grad_norm": 4.65625, "learning_rate": 0.0001305, "loss": 6.4259, "step": 870 }, { "epoch": 0.39308176100628933, "grad_norm": 4.96875, "learning_rate": 0.00013125, "loss": 6.3831, "step": 875 }, { "epoch": 0.3953279424977538, "grad_norm": 3.90625, "learning_rate": 0.00013199999999999998, "loss": 6.4086, "step": 880 }, { "epoch": 0.3975741239892183, "grad_norm": 3.75, "learning_rate": 0.00013275, "loss": 6.3207, "step": 885 }, { "epoch": 0.3998203054806828, "grad_norm": 4.28125, "learning_rate": 0.0001335, "loss": 6.4129, "step": 890 }, { "epoch": 0.40206648697214736, "grad_norm": 3.8125, "learning_rate": 0.00013424999999999998, "loss": 6.4397, "step": 895 }, { "epoch": 0.40431266846361186, "grad_norm": 3.921875, "learning_rate": 0.000135, "loss": 6.4104, "step": 900 }, { "epoch": 0.40655884995507635, "grad_norm": 3.984375, "learning_rate": 0.00013575, "loss": 6.3327, "step": 905 }, { "epoch": 0.4088050314465409, "grad_norm": 3.859375, "learning_rate": 0.00013649999999999998, "loss": 6.3965, "step": 910 }, { "epoch": 0.4110512129380054, "grad_norm": 4.03125, "learning_rate": 0.00013725, "loss": 6.3614, "step": 915 }, { "epoch": 0.4132973944294699, "grad_norm": 3.734375, "learning_rate": 0.000138, "loss": 6.3743, "step": 920 }, { "epoch": 0.41554357592093444, "grad_norm": 3.984375, "learning_rate": 0.00013874999999999998, "loss": 6.4228, "step": 925 }, { "epoch": 0.41778975741239893, "grad_norm": 4.03125, "learning_rate": 0.0001395, "loss": 6.4047, "step": 930 }, { "epoch": 0.4200359389038634, "grad_norm": 3.984375, "learning_rate": 0.00014025, "loss": 6.3634, "step": 935 }, { "epoch": 0.4222821203953279, "grad_norm": 4.0, "learning_rate": 0.00014099999999999998, "loss": 6.3866, "step": 940 }, { "epoch": 0.42452830188679247, "grad_norm": 3.796875, "learning_rate": 0.00014174999999999998, "loss": 6.3599, "step": 945 }, { "epoch": 0.42677448337825696, "grad_norm": 4.03125, "learning_rate": 0.0001425, "loss": 6.3422, "step": 950 }, { "epoch": 0.42902066486972146, "grad_norm": 4.15625, "learning_rate": 0.00014324999999999999, "loss": 6.2791, "step": 955 }, { "epoch": 0.431266846361186, "grad_norm": 3.96875, "learning_rate": 0.00014399999999999998, "loss": 6.3505, "step": 960 }, { "epoch": 0.4335130278526505, "grad_norm": 4.5, "learning_rate": 0.00014475, "loss": 6.3671, "step": 965 }, { "epoch": 0.435759209344115, "grad_norm": 3.65625, "learning_rate": 0.00014549999999999999, "loss": 6.318, "step": 970 }, { "epoch": 0.4380053908355795, "grad_norm": 4.28125, "learning_rate": 0.00014624999999999998, "loss": 6.3299, "step": 975 }, { "epoch": 0.44025157232704404, "grad_norm": 3.578125, "learning_rate": 0.000147, "loss": 6.4073, "step": 980 }, { "epoch": 0.44249775381850853, "grad_norm": 3.734375, "learning_rate": 0.00014774999999999999, "loss": 6.4377, "step": 985 }, { "epoch": 0.444743935309973, "grad_norm": 3.765625, "learning_rate": 0.00014849999999999998, "loss": 6.2784, "step": 990 }, { "epoch": 0.4469901168014376, "grad_norm": 3.953125, "learning_rate": 0.00014925, "loss": 6.2901, "step": 995 }, { "epoch": 0.44923629829290207, "grad_norm": 4.375, "learning_rate": 0.00015, "loss": 6.2973, "step": 1000 }, { "epoch": 0.44923629829290207, "eval_loss": 6.229096412658691, "eval_runtime": 16.2469, "eval_samples_per_second": 1908.854, "eval_steps_per_second": 238.63, "step": 1000 }, { "epoch": 0.45148247978436656, "grad_norm": 3.78125, "learning_rate": 0.00015074999999999998, "loss": 6.3253, "step": 1005 }, { "epoch": 0.4537286612758311, "grad_norm": 3.953125, "learning_rate": 0.0001515, "loss": 6.2906, "step": 1010 }, { "epoch": 0.4559748427672956, "grad_norm": 3.90625, "learning_rate": 0.00015224999999999996, "loss": 6.3351, "step": 1015 }, { "epoch": 0.4582210242587601, "grad_norm": 3.6875, "learning_rate": 0.00015299999999999998, "loss": 6.368, "step": 1020 }, { "epoch": 0.4604672057502246, "grad_norm": 3.796875, "learning_rate": 0.00015374999999999997, "loss": 6.3008, "step": 1025 }, { "epoch": 0.46271338724168914, "grad_norm": 3.703125, "learning_rate": 0.0001545, "loss": 6.283, "step": 1030 }, { "epoch": 0.46495956873315364, "grad_norm": 3.734375, "learning_rate": 0.00015524999999999998, "loss": 6.3212, "step": 1035 }, { "epoch": 0.46720575022461813, "grad_norm": 4.15625, "learning_rate": 0.000156, "loss": 6.2874, "step": 1040 }, { "epoch": 0.4694519317160827, "grad_norm": 3.484375, "learning_rate": 0.00015675, "loss": 6.2944, "step": 1045 }, { "epoch": 0.4716981132075472, "grad_norm": 4.3125, "learning_rate": 0.00015749999999999998, "loss": 6.3099, "step": 1050 }, { "epoch": 0.47394429469901167, "grad_norm": 3.734375, "learning_rate": 0.00015824999999999997, "loss": 6.2531, "step": 1055 }, { "epoch": 0.47619047619047616, "grad_norm": 3.609375, "learning_rate": 0.000159, "loss": 6.2326, "step": 1060 }, { "epoch": 0.4784366576819407, "grad_norm": 3.8125, "learning_rate": 0.00015974999999999998, "loss": 6.2059, "step": 1065 }, { "epoch": 0.4806828391734052, "grad_norm": 3.625, "learning_rate": 0.0001605, "loss": 6.2798, "step": 1070 }, { "epoch": 0.4829290206648697, "grad_norm": 3.890625, "learning_rate": 0.00016125, "loss": 6.2814, "step": 1075 }, { "epoch": 0.48517520215633425, "grad_norm": 3.84375, "learning_rate": 0.000162, "loss": 6.1955, "step": 1080 }, { "epoch": 0.48742138364779874, "grad_norm": 4.0, "learning_rate": 0.00016274999999999997, "loss": 6.3142, "step": 1085 }, { "epoch": 0.48966756513926324, "grad_norm": 3.71875, "learning_rate": 0.0001635, "loss": 6.193, "step": 1090 }, { "epoch": 0.4919137466307278, "grad_norm": 4.0, "learning_rate": 0.00016424999999999998, "loss": 6.26, "step": 1095 }, { "epoch": 0.4941599281221923, "grad_norm": 4.0625, "learning_rate": 0.000165, "loss": 6.2443, "step": 1100 }, { "epoch": 0.4964061096136568, "grad_norm": 3.671875, "learning_rate": 0.00016575, "loss": 6.2278, "step": 1105 }, { "epoch": 0.49865229110512127, "grad_norm": 3.6875, "learning_rate": 0.0001665, "loss": 6.2254, "step": 1110 }, { "epoch": 0.5008984725965858, "grad_norm": 3.921875, "learning_rate": 0.00016724999999999997, "loss": 6.3325, "step": 1115 }, { "epoch": 0.5031446540880503, "grad_norm": 3.921875, "learning_rate": 0.000168, "loss": 6.186, "step": 1120 }, { "epoch": 0.5053908355795148, "grad_norm": 3.859375, "learning_rate": 0.00016874999999999998, "loss": 6.2389, "step": 1125 }, { "epoch": 0.5076370170709793, "grad_norm": 4.71875, "learning_rate": 0.00016949999999999997, "loss": 6.1268, "step": 1130 }, { "epoch": 0.5098831985624438, "grad_norm": 3.90625, "learning_rate": 0.00017025, "loss": 6.1445, "step": 1135 }, { "epoch": 0.5121293800539084, "grad_norm": 3.484375, "learning_rate": 0.00017099999999999998, "loss": 6.1658, "step": 1140 }, { "epoch": 0.5143755615453729, "grad_norm": 3.78125, "learning_rate": 0.00017175, "loss": 6.1832, "step": 1145 }, { "epoch": 0.5166217430368374, "grad_norm": 3.96875, "learning_rate": 0.00017249999999999996, "loss": 6.1621, "step": 1150 }, { "epoch": 0.5188679245283019, "grad_norm": 3.765625, "learning_rate": 0.00017324999999999998, "loss": 6.22, "step": 1155 }, { "epoch": 0.5211141060197664, "grad_norm": 3.890625, "learning_rate": 0.00017399999999999997, "loss": 6.1432, "step": 1160 }, { "epoch": 0.5233602875112309, "grad_norm": 3.59375, "learning_rate": 0.00017475, "loss": 6.1223, "step": 1165 }, { "epoch": 0.5256064690026954, "grad_norm": 3.28125, "learning_rate": 0.00017549999999999998, "loss": 6.1839, "step": 1170 }, { "epoch": 0.52785265049416, "grad_norm": 3.9375, "learning_rate": 0.00017625, "loss": 6.2021, "step": 1175 }, { "epoch": 0.5300988319856245, "grad_norm": 4.03125, "learning_rate": 0.00017699999999999997, "loss": 6.1947, "step": 1180 }, { "epoch": 0.532345013477089, "grad_norm": 4.5, "learning_rate": 0.00017774999999999998, "loss": 6.1474, "step": 1185 }, { "epoch": 0.5345911949685535, "grad_norm": 3.671875, "learning_rate": 0.00017849999999999997, "loss": 6.1488, "step": 1190 }, { "epoch": 0.536837376460018, "grad_norm": 3.734375, "learning_rate": 0.00017925, "loss": 6.1943, "step": 1195 }, { "epoch": 0.5390835579514824, "grad_norm": 3.8125, "learning_rate": 0.00017999999999999998, "loss": 6.13, "step": 1200 }, { "epoch": 0.541329739442947, "grad_norm": 3.828125, "learning_rate": 0.00018075, "loss": 6.0818, "step": 1205 }, { "epoch": 0.5435759209344115, "grad_norm": 3.546875, "learning_rate": 0.00018149999999999997, "loss": 6.1505, "step": 1210 }, { "epoch": 0.545822102425876, "grad_norm": 4.03125, "learning_rate": 0.00018224999999999998, "loss": 6.1578, "step": 1215 }, { "epoch": 0.5480682839173405, "grad_norm": 3.921875, "learning_rate": 0.00018299999999999998, "loss": 6.0904, "step": 1220 }, { "epoch": 0.550314465408805, "grad_norm": 4.1875, "learning_rate": 0.00018375, "loss": 6.0851, "step": 1225 }, { "epoch": 0.5525606469002695, "grad_norm": 4.21875, "learning_rate": 0.00018449999999999999, "loss": 6.1133, "step": 1230 }, { "epoch": 0.554806828391734, "grad_norm": 3.765625, "learning_rate": 0.00018525, "loss": 6.1453, "step": 1235 }, { "epoch": 0.5570530098831986, "grad_norm": 3.671875, "learning_rate": 0.000186, "loss": 6.1572, "step": 1240 }, { "epoch": 0.5592991913746631, "grad_norm": 3.8125, "learning_rate": 0.00018675, "loss": 6.2205, "step": 1245 }, { "epoch": 0.5615453728661276, "grad_norm": 4.4375, "learning_rate": 0.00018749999999999998, "loss": 6.1114, "step": 1250 }, { "epoch": 0.5637915543575921, "grad_norm": 4.03125, "learning_rate": 0.00018824999999999997, "loss": 6.1407, "step": 1255 }, { "epoch": 0.5660377358490566, "grad_norm": 4.1875, "learning_rate": 0.00018899999999999999, "loss": 6.1272, "step": 1260 }, { "epoch": 0.5682839173405211, "grad_norm": 4.03125, "learning_rate": 0.00018974999999999998, "loss": 6.1264, "step": 1265 }, { "epoch": 0.5705300988319856, "grad_norm": 4.09375, "learning_rate": 0.0001905, "loss": 6.0308, "step": 1270 }, { "epoch": 0.5727762803234502, "grad_norm": 3.421875, "learning_rate": 0.00019124999999999996, "loss": 6.1028, "step": 1275 }, { "epoch": 0.5750224618149147, "grad_norm": 3.953125, "learning_rate": 0.00019199999999999998, "loss": 6.1002, "step": 1280 }, { "epoch": 0.5772686433063792, "grad_norm": 4.1875, "learning_rate": 0.00019274999999999997, "loss": 6.1451, "step": 1285 }, { "epoch": 0.5795148247978437, "grad_norm": 4.0625, "learning_rate": 0.0001935, "loss": 6.0798, "step": 1290 }, { "epoch": 0.5817610062893082, "grad_norm": 3.609375, "learning_rate": 0.00019424999999999998, "loss": 6.0831, "step": 1295 }, { "epoch": 0.5840071877807727, "grad_norm": 3.671875, "learning_rate": 0.000195, "loss": 6.1054, "step": 1300 }, { "epoch": 0.5862533692722371, "grad_norm": 3.625, "learning_rate": 0.00019574999999999996, "loss": 6.0122, "step": 1305 }, { "epoch": 0.5884995507637018, "grad_norm": 4.0625, "learning_rate": 0.00019649999999999998, "loss": 6.0397, "step": 1310 }, { "epoch": 0.5907457322551662, "grad_norm": 3.59375, "learning_rate": 0.00019724999999999997, "loss": 5.9765, "step": 1315 }, { "epoch": 0.5929919137466307, "grad_norm": 3.296875, "learning_rate": 0.000198, "loss": 6.0359, "step": 1320 }, { "epoch": 0.5952380952380952, "grad_norm": 3.828125, "learning_rate": 0.00019874999999999998, "loss": 6.0552, "step": 1325 }, { "epoch": 0.5974842767295597, "grad_norm": 3.5625, "learning_rate": 0.0001995, "loss": 6.0254, "step": 1330 }, { "epoch": 0.5997304582210242, "grad_norm": 3.703125, "learning_rate": 0.00020025, "loss": 6.0575, "step": 1335 }, { "epoch": 0.6019766397124887, "grad_norm": 3.59375, "learning_rate": 0.000201, "loss": 6.004, "step": 1340 }, { "epoch": 0.6042228212039533, "grad_norm": 3.65625, "learning_rate": 0.00020174999999999997, "loss": 6.0784, "step": 1345 }, { "epoch": 0.6064690026954178, "grad_norm": 3.78125, "learning_rate": 0.0002025, "loss": 6.1157, "step": 1350 }, { "epoch": 0.6087151841868823, "grad_norm": 3.65625, "learning_rate": 0.00020324999999999998, "loss": 6.0583, "step": 1355 }, { "epoch": 0.6109613656783468, "grad_norm": 3.4375, "learning_rate": 0.000204, "loss": 6.0366, "step": 1360 }, { "epoch": 0.6132075471698113, "grad_norm": 3.4375, "learning_rate": 0.00020475, "loss": 6.1213, "step": 1365 }, { "epoch": 0.6154537286612758, "grad_norm": 3.8125, "learning_rate": 0.0002055, "loss": 6.1744, "step": 1370 }, { "epoch": 0.6176999101527404, "grad_norm": 3.8125, "learning_rate": 0.00020624999999999997, "loss": 6.0912, "step": 1375 }, { "epoch": 0.6199460916442049, "grad_norm": 3.421875, "learning_rate": 0.00020699999999999996, "loss": 5.9619, "step": 1380 }, { "epoch": 0.6221922731356694, "grad_norm": 3.78125, "learning_rate": 0.00020774999999999998, "loss": 5.9658, "step": 1385 }, { "epoch": 0.6244384546271339, "grad_norm": 3.484375, "learning_rate": 0.00020849999999999997, "loss": 6.0913, "step": 1390 }, { "epoch": 0.6266846361185984, "grad_norm": 3.484375, "learning_rate": 0.00020925, "loss": 6.0363, "step": 1395 }, { "epoch": 0.6289308176100629, "grad_norm": 3.890625, "learning_rate": 0.00020999999999999998, "loss": 5.9513, "step": 1400 }, { "epoch": 0.6311769991015274, "grad_norm": 4.0625, "learning_rate": 0.00021074999999999997, "loss": 5.9931, "step": 1405 }, { "epoch": 0.633423180592992, "grad_norm": 4.0, "learning_rate": 0.00021149999999999996, "loss": 5.9732, "step": 1410 }, { "epoch": 0.6356693620844565, "grad_norm": 3.671875, "learning_rate": 0.00021224999999999998, "loss": 6.0028, "step": 1415 }, { "epoch": 0.637915543575921, "grad_norm": 3.5, "learning_rate": 0.00021299999999999997, "loss": 6.0171, "step": 1420 }, { "epoch": 0.6401617250673854, "grad_norm": 3.421875, "learning_rate": 0.00021375, "loss": 5.9886, "step": 1425 }, { "epoch": 0.6424079065588499, "grad_norm": 3.875, "learning_rate": 0.00021449999999999998, "loss": 5.9436, "step": 1430 }, { "epoch": 0.6446540880503144, "grad_norm": 3.3125, "learning_rate": 0.00021525, "loss": 6.0565, "step": 1435 }, { "epoch": 0.6469002695417789, "grad_norm": 3.640625, "learning_rate": 0.00021599999999999996, "loss": 6.1117, "step": 1440 }, { "epoch": 0.6491464510332435, "grad_norm": 3.625, "learning_rate": 0.00021674999999999998, "loss": 5.9778, "step": 1445 }, { "epoch": 0.651392632524708, "grad_norm": 4.0625, "learning_rate": 0.00021749999999999997, "loss": 5.9706, "step": 1450 }, { "epoch": 0.6536388140161725, "grad_norm": 4.15625, "learning_rate": 0.00021825, "loss": 5.9358, "step": 1455 }, { "epoch": 0.655884995507637, "grad_norm": 3.5, "learning_rate": 0.00021899999999999998, "loss": 6.0584, "step": 1460 }, { "epoch": 0.6581311769991015, "grad_norm": 3.734375, "learning_rate": 0.00021975, "loss": 6.0055, "step": 1465 }, { "epoch": 0.660377358490566, "grad_norm": 3.78125, "learning_rate": 0.00022049999999999997, "loss": 5.9678, "step": 1470 }, { "epoch": 0.6626235399820305, "grad_norm": 3.703125, "learning_rate": 0.00022124999999999998, "loss": 5.9747, "step": 1475 }, { "epoch": 0.6648697214734951, "grad_norm": 3.46875, "learning_rate": 0.00022199999999999998, "loss": 5.9542, "step": 1480 }, { "epoch": 0.6671159029649596, "grad_norm": 3.34375, "learning_rate": 0.00022275, "loss": 5.9001, "step": 1485 }, { "epoch": 0.6693620844564241, "grad_norm": 3.65625, "learning_rate": 0.00022349999999999998, "loss": 5.9689, "step": 1490 }, { "epoch": 0.6716082659478886, "grad_norm": 3.953125, "learning_rate": 0.00022425, "loss": 5.9823, "step": 1495 }, { "epoch": 0.6738544474393531, "grad_norm": 3.53125, "learning_rate": 0.000225, "loss": 5.9758, "step": 1500 }, { "epoch": 0.6761006289308176, "grad_norm": 3.484375, "learning_rate": 0.00022574999999999996, "loss": 5.9994, "step": 1505 }, { "epoch": 0.6783468104222821, "grad_norm": 3.6875, "learning_rate": 0.00022649999999999998, "loss": 5.8979, "step": 1510 }, { "epoch": 0.6805929919137467, "grad_norm": 3.328125, "learning_rate": 0.00022724999999999997, "loss": 6.0046, "step": 1515 }, { "epoch": 0.6828391734052112, "grad_norm": 3.75, "learning_rate": 0.00022799999999999999, "loss": 5.9637, "step": 1520 }, { "epoch": 0.6850853548966757, "grad_norm": 3.296875, "learning_rate": 0.00022874999999999998, "loss": 5.939, "step": 1525 }, { "epoch": 0.6873315363881402, "grad_norm": 3.484375, "learning_rate": 0.0002295, "loss": 6.0089, "step": 1530 }, { "epoch": 0.6895777178796046, "grad_norm": 3.46875, "learning_rate": 0.00023024999999999996, "loss": 5.9247, "step": 1535 }, { "epoch": 0.6918238993710691, "grad_norm": 3.3125, "learning_rate": 0.00023099999999999998, "loss": 5.8969, "step": 1540 }, { "epoch": 0.6940700808625337, "grad_norm": 3.734375, "learning_rate": 0.00023174999999999997, "loss": 5.8485, "step": 1545 }, { "epoch": 0.6963162623539982, "grad_norm": 3.375, "learning_rate": 0.00023249999999999999, "loss": 5.9481, "step": 1550 }, { "epoch": 0.6985624438454627, "grad_norm": 3.5625, "learning_rate": 0.00023324999999999998, "loss": 5.9145, "step": 1555 }, { "epoch": 0.7008086253369272, "grad_norm": 3.5, "learning_rate": 0.000234, "loss": 5.8711, "step": 1560 }, { "epoch": 0.7030548068283917, "grad_norm": 3.703125, "learning_rate": 0.00023474999999999996, "loss": 5.9697, "step": 1565 }, { "epoch": 0.7053009883198562, "grad_norm": 3.75, "learning_rate": 0.00023549999999999998, "loss": 5.8905, "step": 1570 }, { "epoch": 0.7075471698113207, "grad_norm": 3.59375, "learning_rate": 0.00023624999999999997, "loss": 5.9357, "step": 1575 }, { "epoch": 0.7097933513027853, "grad_norm": 3.453125, "learning_rate": 0.000237, "loss": 5.8548, "step": 1580 }, { "epoch": 0.7120395327942498, "grad_norm": 3.484375, "learning_rate": 0.00023774999999999998, "loss": 5.9498, "step": 1585 }, { "epoch": 0.7142857142857143, "grad_norm": 3.78125, "learning_rate": 0.0002385, "loss": 5.8457, "step": 1590 }, { "epoch": 0.7165318957771788, "grad_norm": 3.5625, "learning_rate": 0.00023925, "loss": 5.8717, "step": 1595 }, { "epoch": 0.7187780772686433, "grad_norm": 3.328125, "learning_rate": 0.00023999999999999998, "loss": 5.8193, "step": 1600 }, { "epoch": 0.7210242587601078, "grad_norm": 3.296875, "learning_rate": 0.00024074999999999997, "loss": 5.8618, "step": 1605 }, { "epoch": 0.7232704402515723, "grad_norm": 3.625, "learning_rate": 0.0002415, "loss": 5.8882, "step": 1610 }, { "epoch": 0.7255166217430369, "grad_norm": 3.28125, "learning_rate": 0.00024224999999999998, "loss": 5.9087, "step": 1615 }, { "epoch": 0.7277628032345014, "grad_norm": 3.53125, "learning_rate": 0.000243, "loss": 5.8994, "step": 1620 }, { "epoch": 0.7300089847259659, "grad_norm": 3.34375, "learning_rate": 0.00024375, "loss": 5.9156, "step": 1625 }, { "epoch": 0.7322551662174304, "grad_norm": 3.78125, "learning_rate": 0.0002445, "loss": 5.889, "step": 1630 }, { "epoch": 0.7345013477088949, "grad_norm": 3.5, "learning_rate": 0.00024524999999999997, "loss": 5.8538, "step": 1635 }, { "epoch": 0.7367475292003594, "grad_norm": 3.53125, "learning_rate": 0.00024599999999999996, "loss": 5.914, "step": 1640 }, { "epoch": 0.7389937106918238, "grad_norm": 3.25, "learning_rate": 0.00024675, "loss": 5.8628, "step": 1645 }, { "epoch": 0.7412398921832885, "grad_norm": 3.5, "learning_rate": 0.00024749999999999994, "loss": 5.8555, "step": 1650 }, { "epoch": 0.743486073674753, "grad_norm": 3.4375, "learning_rate": 0.00024825, "loss": 5.8846, "step": 1655 }, { "epoch": 0.7457322551662174, "grad_norm": 3.703125, "learning_rate": 0.000249, "loss": 5.8957, "step": 1660 }, { "epoch": 0.7479784366576819, "grad_norm": 3.25, "learning_rate": 0.00024974999999999997, "loss": 5.8036, "step": 1665 }, { "epoch": 0.7502246181491464, "grad_norm": 3.375, "learning_rate": 0.00025049999999999996, "loss": 5.845, "step": 1670 }, { "epoch": 0.7524707996406109, "grad_norm": 3.1875, "learning_rate": 0.00025125, "loss": 5.8801, "step": 1675 }, { "epoch": 0.7547169811320755, "grad_norm": 3.53125, "learning_rate": 0.00025199999999999995, "loss": 5.8356, "step": 1680 }, { "epoch": 0.75696316262354, "grad_norm": 3.375, "learning_rate": 0.00025275, "loss": 5.851, "step": 1685 }, { "epoch": 0.7592093441150045, "grad_norm": 3.546875, "learning_rate": 0.0002535, "loss": 5.8647, "step": 1690 }, { "epoch": 0.761455525606469, "grad_norm": 3.4375, "learning_rate": 0.00025425, "loss": 5.8168, "step": 1695 }, { "epoch": 0.7637017070979335, "grad_norm": 3.609375, "learning_rate": 0.00025499999999999996, "loss": 5.8514, "step": 1700 }, { "epoch": 0.765947888589398, "grad_norm": 3.3125, "learning_rate": 0.00025575, "loss": 5.7495, "step": 1705 }, { "epoch": 0.7681940700808625, "grad_norm": 3.515625, "learning_rate": 0.00025649999999999995, "loss": 5.8702, "step": 1710 }, { "epoch": 0.7704402515723271, "grad_norm": 3.640625, "learning_rate": 0.00025725, "loss": 5.9178, "step": 1715 }, { "epoch": 0.7726864330637916, "grad_norm": 3.1875, "learning_rate": 0.000258, "loss": 5.82, "step": 1720 }, { "epoch": 0.7749326145552561, "grad_norm": 3.765625, "learning_rate": 0.00025875, "loss": 5.823, "step": 1725 }, { "epoch": 0.7771787960467206, "grad_norm": 3.4375, "learning_rate": 0.00025949999999999997, "loss": 5.8712, "step": 1730 }, { "epoch": 0.7794249775381851, "grad_norm": 3.140625, "learning_rate": 0.00026025, "loss": 5.8173, "step": 1735 }, { "epoch": 0.7816711590296496, "grad_norm": 3.28125, "learning_rate": 0.000261, "loss": 5.8169, "step": 1740 }, { "epoch": 0.7839173405211141, "grad_norm": 3.4375, "learning_rate": 0.00026175, "loss": 5.8047, "step": 1745 }, { "epoch": 0.7861635220125787, "grad_norm": 3.21875, "learning_rate": 0.0002625, "loss": 5.8384, "step": 1750 }, { "epoch": 0.7884097035040432, "grad_norm": 3.40625, "learning_rate": 0.00026325, "loss": 5.7996, "step": 1755 }, { "epoch": 0.7906558849955077, "grad_norm": 3.4375, "learning_rate": 0.00026399999999999997, "loss": 5.7611, "step": 1760 }, { "epoch": 0.7929020664869721, "grad_norm": 3.390625, "learning_rate": 0.00026474999999999996, "loss": 5.7925, "step": 1765 }, { "epoch": 0.7951482479784366, "grad_norm": 3.375, "learning_rate": 0.0002655, "loss": 5.8187, "step": 1770 }, { "epoch": 0.7973944294699011, "grad_norm": 3.53125, "learning_rate": 0.00026624999999999994, "loss": 5.7791, "step": 1775 }, { "epoch": 0.7996406109613656, "grad_norm": 3.8125, "learning_rate": 0.000267, "loss": 5.8063, "step": 1780 }, { "epoch": 0.8018867924528302, "grad_norm": 3.25, "learning_rate": 0.00026775, "loss": 5.8167, "step": 1785 }, { "epoch": 0.8041329739442947, "grad_norm": 3.46875, "learning_rate": 0.00026849999999999997, "loss": 5.7916, "step": 1790 }, { "epoch": 0.8063791554357592, "grad_norm": 3.28125, "learning_rate": 0.00026924999999999996, "loss": 5.8446, "step": 1795 }, { "epoch": 0.8086253369272237, "grad_norm": 3.65625, "learning_rate": 0.00027, "loss": 5.8757, "step": 1800 }, { "epoch": 0.8108715184186882, "grad_norm": 3.734375, "learning_rate": 0.00027074999999999994, "loss": 5.7271, "step": 1805 }, { "epoch": 0.8131176999101527, "grad_norm": 3.765625, "learning_rate": 0.0002715, "loss": 5.8397, "step": 1810 }, { "epoch": 0.8153638814016172, "grad_norm": 3.34375, "learning_rate": 0.00027225, "loss": 5.7838, "step": 1815 }, { "epoch": 0.8176100628930818, "grad_norm": 3.59375, "learning_rate": 0.00027299999999999997, "loss": 5.7907, "step": 1820 }, { "epoch": 0.8198562443845463, "grad_norm": 3.921875, "learning_rate": 0.00027374999999999996, "loss": 5.8579, "step": 1825 }, { "epoch": 0.8221024258760108, "grad_norm": 3.46875, "learning_rate": 0.0002745, "loss": 5.8342, "step": 1830 }, { "epoch": 0.8243486073674753, "grad_norm": 3.75, "learning_rate": 0.00027525, "loss": 5.7949, "step": 1835 }, { "epoch": 0.8265947888589398, "grad_norm": 3.4375, "learning_rate": 0.000276, "loss": 5.7715, "step": 1840 }, { "epoch": 0.8288409703504043, "grad_norm": 3.703125, "learning_rate": 0.00027675, "loss": 5.7804, "step": 1845 }, { "epoch": 0.8310871518418689, "grad_norm": 3.4375, "learning_rate": 0.00027749999999999997, "loss": 5.7288, "step": 1850 }, { "epoch": 0.8333333333333334, "grad_norm": 3.109375, "learning_rate": 0.00027824999999999996, "loss": 5.7319, "step": 1855 }, { "epoch": 0.8355795148247979, "grad_norm": 3.21875, "learning_rate": 0.000279, "loss": 5.7636, "step": 1860 }, { "epoch": 0.8378256963162624, "grad_norm": 3.234375, "learning_rate": 0.00027975, "loss": 5.7395, "step": 1865 }, { "epoch": 0.8400718778077269, "grad_norm": 3.6875, "learning_rate": 0.0002805, "loss": 5.7519, "step": 1870 }, { "epoch": 0.8423180592991913, "grad_norm": 3.265625, "learning_rate": 0.00028125, "loss": 5.706, "step": 1875 }, { "epoch": 0.8445642407906558, "grad_norm": 3.390625, "learning_rate": 0.00028199999999999997, "loss": 5.799, "step": 1880 }, { "epoch": 0.8468104222821204, "grad_norm": 3.265625, "learning_rate": 0.00028274999999999996, "loss": 5.7856, "step": 1885 }, { "epoch": 0.8490566037735849, "grad_norm": 3.421875, "learning_rate": 0.00028349999999999995, "loss": 5.8625, "step": 1890 }, { "epoch": 0.8513027852650494, "grad_norm": 3.203125, "learning_rate": 0.00028425, "loss": 5.7212, "step": 1895 }, { "epoch": 0.8535489667565139, "grad_norm": 3.296875, "learning_rate": 0.000285, "loss": 5.7326, "step": 1900 }, { "epoch": 0.8557951482479784, "grad_norm": 3.5, "learning_rate": 0.00028575, "loss": 5.7664, "step": 1905 }, { "epoch": 0.8580413297394429, "grad_norm": 3.34375, "learning_rate": 0.00028649999999999997, "loss": 5.7231, "step": 1910 }, { "epoch": 0.8602875112309074, "grad_norm": 3.40625, "learning_rate": 0.00028724999999999996, "loss": 5.7759, "step": 1915 }, { "epoch": 0.862533692722372, "grad_norm": 3.125, "learning_rate": 0.00028799999999999995, "loss": 5.7442, "step": 1920 }, { "epoch": 0.8647798742138365, "grad_norm": 3.15625, "learning_rate": 0.00028875, "loss": 5.7252, "step": 1925 }, { "epoch": 0.867026055705301, "grad_norm": 3.265625, "learning_rate": 0.0002895, "loss": 5.7196, "step": 1930 }, { "epoch": 0.8692722371967655, "grad_norm": 3.328125, "learning_rate": 0.00029025, "loss": 5.7376, "step": 1935 }, { "epoch": 0.87151841868823, "grad_norm": 3.1875, "learning_rate": 0.00029099999999999997, "loss": 5.8077, "step": 1940 }, { "epoch": 0.8737646001796945, "grad_norm": 3.625, "learning_rate": 0.00029174999999999996, "loss": 5.7826, "step": 1945 }, { "epoch": 0.876010781671159, "grad_norm": 3.609375, "learning_rate": 0.00029249999999999995, "loss": 5.736, "step": 1950 }, { "epoch": 0.8782569631626236, "grad_norm": 3.421875, "learning_rate": 0.00029325, "loss": 5.7531, "step": 1955 }, { "epoch": 0.8805031446540881, "grad_norm": 3.4375, "learning_rate": 0.000294, "loss": 5.7246, "step": 1960 }, { "epoch": 0.8827493261455526, "grad_norm": 3.375, "learning_rate": 0.00029475, "loss": 5.7786, "step": 1965 }, { "epoch": 0.8849955076370171, "grad_norm": 3.296875, "learning_rate": 0.00029549999999999997, "loss": 5.7237, "step": 1970 }, { "epoch": 0.8872416891284816, "grad_norm": 2.96875, "learning_rate": 0.00029624999999999996, "loss": 5.8053, "step": 1975 }, { "epoch": 0.889487870619946, "grad_norm": 3.328125, "learning_rate": 0.00029699999999999996, "loss": 5.6918, "step": 1980 }, { "epoch": 0.8917340521114105, "grad_norm": 3.015625, "learning_rate": 0.00029775, "loss": 5.8251, "step": 1985 }, { "epoch": 0.8939802336028752, "grad_norm": 3.78125, "learning_rate": 0.0002985, "loss": 5.7529, "step": 1990 }, { "epoch": 0.8962264150943396, "grad_norm": 3.640625, "learning_rate": 0.00029925, "loss": 5.7181, "step": 1995 }, { "epoch": 0.8984725965858041, "grad_norm": 3.234375, "learning_rate": 0.0003, "loss": 5.7413, "step": 2000 }, { "epoch": 0.8984725965858041, "eval_loss": 5.639461517333984, "eval_runtime": 16.0491, "eval_samples_per_second": 1932.383, "eval_steps_per_second": 241.571, "step": 2000 }, { "epoch": 0.9007187780772686, "grad_norm": 3.140625, "learning_rate": 0.00029999995942443054, "loss": 5.6436, "step": 2005 }, { "epoch": 0.9029649595687331, "grad_norm": 3.328125, "learning_rate": 0.00029999983769774674, "loss": 5.7627, "step": 2010 }, { "epoch": 0.9052111410601976, "grad_norm": 3.171875, "learning_rate": 0.0002999996348200217, "loss": 5.7181, "step": 2015 }, { "epoch": 0.9074573225516622, "grad_norm": 3.34375, "learning_rate": 0.0002999993507913773, "loss": 5.7097, "step": 2020 }, { "epoch": 0.9097035040431267, "grad_norm": 3.1875, "learning_rate": 0.0002999989856119844, "loss": 5.6407, "step": 2025 }, { "epoch": 0.9119496855345912, "grad_norm": 3.453125, "learning_rate": 0.0002999985392820624, "loss": 5.6532, "step": 2030 }, { "epoch": 0.9141958670260557, "grad_norm": 3.140625, "learning_rate": 0.0002999980118018797, "loss": 5.6993, "step": 2035 }, { "epoch": 0.9164420485175202, "grad_norm": 3.546875, "learning_rate": 0.0002999974031717533, "loss": 5.6507, "step": 2040 }, { "epoch": 0.9186882300089847, "grad_norm": 3.546875, "learning_rate": 0.0002999967133920491, "loss": 5.6629, "step": 2045 }, { "epoch": 0.9209344115004492, "grad_norm": 3.203125, "learning_rate": 0.0002999959424631818, "loss": 5.7172, "step": 2050 }, { "epoch": 0.9231805929919138, "grad_norm": 3.140625, "learning_rate": 0.0002999950903856147, "loss": 5.5766, "step": 2055 }, { "epoch": 0.9254267744833783, "grad_norm": 3.234375, "learning_rate": 0.00029999415715986, "loss": 5.6546, "step": 2060 }, { "epoch": 0.9276729559748428, "grad_norm": 3.34375, "learning_rate": 0.0002999931427864788, "loss": 5.6317, "step": 2065 }, { "epoch": 0.9299191374663073, "grad_norm": 3.1875, "learning_rate": 0.00029999204726608076, "loss": 5.6605, "step": 2070 }, { "epoch": 0.9321653189577718, "grad_norm": 3.40625, "learning_rate": 0.0002999908705993245, "loss": 5.6958, "step": 2075 }, { "epoch": 0.9344115004492363, "grad_norm": 3.046875, "learning_rate": 0.00029998961278691725, "loss": 5.6498, "step": 2080 }, { "epoch": 0.9366576819407008, "grad_norm": 3.203125, "learning_rate": 0.0002999882738296152, "loss": 5.6887, "step": 2085 }, { "epoch": 0.9389038634321654, "grad_norm": 3.453125, "learning_rate": 0.0002999868537282231, "loss": 5.617, "step": 2090 }, { "epoch": 0.9411500449236299, "grad_norm": 3.25, "learning_rate": 0.0002999853524835947, "loss": 5.7708, "step": 2095 }, { "epoch": 0.9433962264150944, "grad_norm": 3.421875, "learning_rate": 0.0002999837700966324, "loss": 5.6733, "step": 2100 }, { "epoch": 0.9456424079065588, "grad_norm": 3.359375, "learning_rate": 0.00029998210656828736, "loss": 5.7, "step": 2105 }, { "epoch": 0.9478885893980233, "grad_norm": 3.296875, "learning_rate": 0.0002999803618995596, "loss": 5.6652, "step": 2110 }, { "epoch": 0.9501347708894878, "grad_norm": 3.71875, "learning_rate": 0.00029997853609149797, "loss": 5.7413, "step": 2115 }, { "epoch": 0.9523809523809523, "grad_norm": 3.3125, "learning_rate": 0.00029997662914519983, "loss": 5.7038, "step": 2120 }, { "epoch": 0.9546271338724169, "grad_norm": 3.546875, "learning_rate": 0.0002999746410618116, "loss": 5.6402, "step": 2125 }, { "epoch": 0.9568733153638814, "grad_norm": 3.09375, "learning_rate": 0.00029997257184252827, "loss": 5.5762, "step": 2130 }, { "epoch": 0.9591194968553459, "grad_norm": 3.421875, "learning_rate": 0.00029997042148859374, "loss": 5.7327, "step": 2135 }, { "epoch": 0.9613656783468104, "grad_norm": 3.296875, "learning_rate": 0.0002999681900013006, "loss": 5.6974, "step": 2140 }, { "epoch": 0.9636118598382749, "grad_norm": 3.140625, "learning_rate": 0.0002999658773819903, "loss": 5.7185, "step": 2145 }, { "epoch": 0.9658580413297394, "grad_norm": 3.34375, "learning_rate": 0.00029996348363205296, "loss": 5.7269, "step": 2150 }, { "epoch": 0.968104222821204, "grad_norm": 3.0, "learning_rate": 0.0002999610087529275, "loss": 5.6719, "step": 2155 }, { "epoch": 0.9703504043126685, "grad_norm": 3.375, "learning_rate": 0.00029995845274610164, "loss": 5.6067, "step": 2160 }, { "epoch": 0.972596585804133, "grad_norm": 3.25, "learning_rate": 0.00029995581561311185, "loss": 5.612, "step": 2165 }, { "epoch": 0.9748427672955975, "grad_norm": 3.390625, "learning_rate": 0.00029995309735554327, "loss": 5.6163, "step": 2170 }, { "epoch": 0.977088948787062, "grad_norm": 3.265625, "learning_rate": 0.00029995029797503007, "loss": 5.6468, "step": 2175 }, { "epoch": 0.9793351302785265, "grad_norm": 3.03125, "learning_rate": 0.00029994741747325487, "loss": 5.6653, "step": 2180 }, { "epoch": 0.981581311769991, "grad_norm": 3.1875, "learning_rate": 0.00029994445585194925, "loss": 5.6416, "step": 2185 }, { "epoch": 0.9838274932614556, "grad_norm": 3.09375, "learning_rate": 0.00029994141311289347, "loss": 5.5982, "step": 2190 }, { "epoch": 0.9860736747529201, "grad_norm": 3.328125, "learning_rate": 0.00029993828925791664, "loss": 5.6288, "step": 2195 }, { "epoch": 0.9883198562443846, "grad_norm": 3.203125, "learning_rate": 0.0002999350842888965, "loss": 5.6725, "step": 2200 }, { "epoch": 0.9905660377358491, "grad_norm": 3.40625, "learning_rate": 0.0002999317982077596, "loss": 5.6444, "step": 2205 }, { "epoch": 0.9928122192273136, "grad_norm": 2.921875, "learning_rate": 0.00029992843101648144, "loss": 5.6642, "step": 2210 }, { "epoch": 0.995058400718778, "grad_norm": 3.015625, "learning_rate": 0.00029992498271708595, "loss": 5.6011, "step": 2215 }, { "epoch": 0.9973045822102425, "grad_norm": 2.90625, "learning_rate": 0.00029992145331164596, "loss": 5.6432, "step": 2220 }, { "epoch": 0.9995507637017071, "grad_norm": 3.140625, "learning_rate": 0.0002999178428022831, "loss": 5.6428, "step": 2225 }, { "epoch": 1.0017969451931716, "grad_norm": 3.265625, "learning_rate": 0.0002999141511911678, "loss": 5.5542, "step": 2230 }, { "epoch": 1.0040431266846361, "grad_norm": 3.296875, "learning_rate": 0.000299910378480519, "loss": 5.6403, "step": 2235 }, { "epoch": 1.0062893081761006, "grad_norm": 3.21875, "learning_rate": 0.0002999065246726047, "loss": 5.5451, "step": 2240 }, { "epoch": 1.0085354896675651, "grad_norm": 3.0625, "learning_rate": 0.0002999025897697414, "loss": 5.6575, "step": 2245 }, { "epoch": 1.0107816711590296, "grad_norm": 3.140625, "learning_rate": 0.0002998985737742945, "loss": 5.5892, "step": 2250 }, { "epoch": 1.013027852650494, "grad_norm": 3.203125, "learning_rate": 0.0002998944766886781, "loss": 5.6127, "step": 2255 }, { "epoch": 1.0152740341419586, "grad_norm": 3.078125, "learning_rate": 0.000299890298515355, "loss": 5.5885, "step": 2260 }, { "epoch": 1.017520215633423, "grad_norm": 3.265625, "learning_rate": 0.0002998860392568368, "loss": 5.5215, "step": 2265 }, { "epoch": 1.0197663971248876, "grad_norm": 3.171875, "learning_rate": 0.00029988169891568373, "loss": 5.6074, "step": 2270 }, { "epoch": 1.0220125786163523, "grad_norm": 3.171875, "learning_rate": 0.00029987727749450506, "loss": 5.6192, "step": 2275 }, { "epoch": 1.0242587601078168, "grad_norm": 3.328125, "learning_rate": 0.00029987277499595843, "loss": 5.5663, "step": 2280 }, { "epoch": 1.0265049415992813, "grad_norm": 3.265625, "learning_rate": 0.0002998681914227504, "loss": 5.5862, "step": 2285 }, { "epoch": 1.0287511230907458, "grad_norm": 3.0, "learning_rate": 0.0002998635267776363, "loss": 5.5536, "step": 2290 }, { "epoch": 1.0309973045822103, "grad_norm": 3.3125, "learning_rate": 0.0002998587810634201, "loss": 5.5818, "step": 2295 }, { "epoch": 1.0332434860736748, "grad_norm": 3.234375, "learning_rate": 0.0002998539542829546, "loss": 5.6147, "step": 2300 }, { "epoch": 1.0354896675651393, "grad_norm": 3.09375, "learning_rate": 0.00029984904643914114, "loss": 5.6629, "step": 2305 }, { "epoch": 1.0377358490566038, "grad_norm": 3.15625, "learning_rate": 0.00029984405753493006, "loss": 5.5412, "step": 2310 }, { "epoch": 1.0399820305480683, "grad_norm": 2.984375, "learning_rate": 0.00029983898757332024, "loss": 5.5598, "step": 2315 }, { "epoch": 1.0422282120395328, "grad_norm": 2.96875, "learning_rate": 0.0002998338365573593, "loss": 5.6111, "step": 2320 }, { "epoch": 1.0444743935309972, "grad_norm": 3.234375, "learning_rate": 0.0002998286044901436, "loss": 5.4899, "step": 2325 }, { "epoch": 1.0467205750224617, "grad_norm": 3.453125, "learning_rate": 0.0002998232913748184, "loss": 5.5567, "step": 2330 }, { "epoch": 1.0489667565139262, "grad_norm": 3.40625, "learning_rate": 0.0002998178972145773, "loss": 5.4968, "step": 2335 }, { "epoch": 1.0512129380053907, "grad_norm": 3.03125, "learning_rate": 0.000299812422012663, "loss": 5.6119, "step": 2340 }, { "epoch": 1.0534591194968554, "grad_norm": 3.15625, "learning_rate": 0.0002998068657723666, "loss": 5.5563, "step": 2345 }, { "epoch": 1.05570530098832, "grad_norm": 3.203125, "learning_rate": 0.0002998012284970282, "loss": 5.5985, "step": 2350 }, { "epoch": 1.0579514824797844, "grad_norm": 3.46875, "learning_rate": 0.00029979551019003643, "loss": 5.5002, "step": 2355 }, { "epoch": 1.060197663971249, "grad_norm": 3.046875, "learning_rate": 0.0002997897108548286, "loss": 5.6114, "step": 2360 }, { "epoch": 1.0624438454627134, "grad_norm": 3.140625, "learning_rate": 0.00029978383049489093, "loss": 5.5056, "step": 2365 }, { "epoch": 1.064690026954178, "grad_norm": 3.109375, "learning_rate": 0.0002997778691137582, "loss": 5.515, "step": 2370 }, { "epoch": 1.0669362084456424, "grad_norm": 3.15625, "learning_rate": 0.00029977182671501383, "loss": 5.5303, "step": 2375 }, { "epoch": 1.069182389937107, "grad_norm": 3.140625, "learning_rate": 0.00029976570330229006, "loss": 5.5147, "step": 2380 }, { "epoch": 1.0714285714285714, "grad_norm": 3.109375, "learning_rate": 0.00029975949887926784, "loss": 5.5098, "step": 2385 }, { "epoch": 1.073674752920036, "grad_norm": 3.046875, "learning_rate": 0.00029975321344967676, "loss": 5.5533, "step": 2390 }, { "epoch": 1.0759209344115004, "grad_norm": 3.28125, "learning_rate": 0.000299746847017295, "loss": 5.5429, "step": 2395 }, { "epoch": 1.0781671159029649, "grad_norm": 3.265625, "learning_rate": 0.00029974039958594967, "loss": 5.508, "step": 2400 }, { "epoch": 1.0804132973944294, "grad_norm": 3.1875, "learning_rate": 0.0002997338711595165, "loss": 5.5494, "step": 2405 }, { "epoch": 1.082659478885894, "grad_norm": 3.203125, "learning_rate": 0.00029972726174191965, "loss": 5.4273, "step": 2410 }, { "epoch": 1.0849056603773586, "grad_norm": 3.0625, "learning_rate": 0.00029972057133713235, "loss": 5.5474, "step": 2415 }, { "epoch": 1.087151841868823, "grad_norm": 2.84375, "learning_rate": 0.00029971379994917624, "loss": 5.5008, "step": 2420 }, { "epoch": 1.0893980233602876, "grad_norm": 3.359375, "learning_rate": 0.00029970694758212177, "loss": 5.4682, "step": 2425 }, { "epoch": 1.091644204851752, "grad_norm": 3.0, "learning_rate": 0.000299700014240088, "loss": 5.4666, "step": 2430 }, { "epoch": 1.0938903863432166, "grad_norm": 3.3125, "learning_rate": 0.00029969299992724273, "loss": 5.5844, "step": 2435 }, { "epoch": 1.096136567834681, "grad_norm": 3.3125, "learning_rate": 0.00029968590464780247, "loss": 5.5141, "step": 2440 }, { "epoch": 1.0983827493261455, "grad_norm": 3.046875, "learning_rate": 0.0002996787284060322, "loss": 5.4897, "step": 2445 }, { "epoch": 1.10062893081761, "grad_norm": 3.125, "learning_rate": 0.00029967147120624573, "loss": 5.4318, "step": 2450 }, { "epoch": 1.1028751123090745, "grad_norm": 3.4375, "learning_rate": 0.00029966413305280553, "loss": 5.506, "step": 2455 }, { "epoch": 1.105121293800539, "grad_norm": 3.390625, "learning_rate": 0.00029965671395012274, "loss": 5.4363, "step": 2460 }, { "epoch": 1.1073674752920035, "grad_norm": 3.265625, "learning_rate": 0.0002996492139026571, "loss": 5.4077, "step": 2465 }, { "epoch": 1.109613656783468, "grad_norm": 3.265625, "learning_rate": 0.000299641632914917, "loss": 5.4435, "step": 2470 }, { "epoch": 1.1118598382749325, "grad_norm": 3.078125, "learning_rate": 0.0002996339709914596, "loss": 5.4641, "step": 2475 }, { "epoch": 1.1141060197663972, "grad_norm": 3.015625, "learning_rate": 0.0002996262281368905, "loss": 5.5053, "step": 2480 }, { "epoch": 1.1163522012578617, "grad_norm": 3.34375, "learning_rate": 0.0002996184043558642, "loss": 5.3987, "step": 2485 }, { "epoch": 1.1185983827493262, "grad_norm": 3.03125, "learning_rate": 0.0002996104996530837, "loss": 5.6063, "step": 2490 }, { "epoch": 1.1208445642407907, "grad_norm": 3.328125, "learning_rate": 0.0002996025140333006, "loss": 5.4782, "step": 2495 }, { "epoch": 1.1230907457322552, "grad_norm": 3.25, "learning_rate": 0.00029959444750131533, "loss": 5.4836, "step": 2500 }, { "epoch": 1.1253369272237197, "grad_norm": 3.140625, "learning_rate": 0.0002995863000619768, "loss": 5.5181, "step": 2505 }, { "epoch": 1.1275831087151842, "grad_norm": 3.1875, "learning_rate": 0.0002995780717201825, "loss": 5.4469, "step": 2510 }, { "epoch": 1.1298292902066487, "grad_norm": 3.03125, "learning_rate": 0.0002995697624808788, "loss": 5.4445, "step": 2515 }, { "epoch": 1.1320754716981132, "grad_norm": 3.125, "learning_rate": 0.00029956137234906044, "loss": 5.4844, "step": 2520 }, { "epoch": 1.1343216531895777, "grad_norm": 2.953125, "learning_rate": 0.00029955290132977093, "loss": 5.5633, "step": 2525 }, { "epoch": 1.1365678346810422, "grad_norm": 3.109375, "learning_rate": 0.0002995443494281024, "loss": 5.4724, "step": 2530 }, { "epoch": 1.1388140161725067, "grad_norm": 3.34375, "learning_rate": 0.00029953571664919547, "loss": 5.4786, "step": 2535 }, { "epoch": 1.1410601976639712, "grad_norm": 3.328125, "learning_rate": 0.0002995270029982396, "loss": 5.5004, "step": 2540 }, { "epoch": 1.1433063791554359, "grad_norm": 3.0625, "learning_rate": 0.00029951820848047255, "loss": 5.4758, "step": 2545 }, { "epoch": 1.1455525606469004, "grad_norm": 3.0, "learning_rate": 0.0002995093331011811, "loss": 5.4789, "step": 2550 }, { "epoch": 1.1477987421383649, "grad_norm": 3.03125, "learning_rate": 0.00029950037686570023, "loss": 5.3991, "step": 2555 }, { "epoch": 1.1500449236298294, "grad_norm": 3.3125, "learning_rate": 0.0002994913397794138, "loss": 5.5046, "step": 2560 }, { "epoch": 1.1522911051212938, "grad_norm": 3.46875, "learning_rate": 0.00029948222184775415, "loss": 5.5293, "step": 2565 }, { "epoch": 1.1545372866127583, "grad_norm": 3.125, "learning_rate": 0.00029947302307620227, "loss": 5.4079, "step": 2570 }, { "epoch": 1.1567834681042228, "grad_norm": 3.203125, "learning_rate": 0.0002994637434702877, "loss": 5.425, "step": 2575 }, { "epoch": 1.1590296495956873, "grad_norm": 3.296875, "learning_rate": 0.0002994543830355886, "loss": 5.4591, "step": 2580 }, { "epoch": 1.1612758310871518, "grad_norm": 3.296875, "learning_rate": 0.0002994449417777317, "loss": 5.5263, "step": 2585 }, { "epoch": 1.1635220125786163, "grad_norm": 3.140625, "learning_rate": 0.00029943541970239233, "loss": 5.4458, "step": 2590 }, { "epoch": 1.1657681940700808, "grad_norm": 3.1875, "learning_rate": 0.00029942581681529447, "loss": 5.4449, "step": 2595 }, { "epoch": 1.1680143755615453, "grad_norm": 3.34375, "learning_rate": 0.00029941613312221046, "loss": 5.5558, "step": 2600 }, { "epoch": 1.1702605570530098, "grad_norm": 3.0, "learning_rate": 0.00029940636862896145, "loss": 5.5165, "step": 2605 }, { "epoch": 1.1725067385444743, "grad_norm": 3.3125, "learning_rate": 0.0002993965233414171, "loss": 5.4624, "step": 2610 }, { "epoch": 1.1747529200359388, "grad_norm": 3.203125, "learning_rate": 0.0002993865972654955, "loss": 5.4336, "step": 2615 }, { "epoch": 1.1769991015274035, "grad_norm": 3.5, "learning_rate": 0.0002993765904071635, "loss": 5.5293, "step": 2620 }, { "epoch": 1.179245283018868, "grad_norm": 3.15625, "learning_rate": 0.00029936650277243633, "loss": 5.5603, "step": 2625 }, { "epoch": 1.1814914645103325, "grad_norm": 3.140625, "learning_rate": 0.0002993563343673779, "loss": 5.4785, "step": 2630 }, { "epoch": 1.183737646001797, "grad_norm": 3.09375, "learning_rate": 0.0002993460851981007, "loss": 5.4188, "step": 2635 }, { "epoch": 1.1859838274932615, "grad_norm": 3.078125, "learning_rate": 0.00029933575527076565, "loss": 5.5139, "step": 2640 }, { "epoch": 1.188230008984726, "grad_norm": 3.015625, "learning_rate": 0.0002993253445915823, "loss": 5.3998, "step": 2645 }, { "epoch": 1.1904761904761905, "grad_norm": 3.328125, "learning_rate": 0.0002993148531668087, "loss": 5.5066, "step": 2650 }, { "epoch": 1.192722371967655, "grad_norm": 3.125, "learning_rate": 0.0002993042810027514, "loss": 5.416, "step": 2655 }, { "epoch": 1.1949685534591195, "grad_norm": 3.171875, "learning_rate": 0.0002992936281057656, "loss": 5.4367, "step": 2660 }, { "epoch": 1.197214734950584, "grad_norm": 3.125, "learning_rate": 0.000299282894482255, "loss": 5.3912, "step": 2665 }, { "epoch": 1.1994609164420484, "grad_norm": 2.9375, "learning_rate": 0.00029927208013867164, "loss": 5.4456, "step": 2670 }, { "epoch": 1.201707097933513, "grad_norm": 3.296875, "learning_rate": 0.0002992611850815163, "loss": 5.5036, "step": 2675 }, { "epoch": 1.2039532794249777, "grad_norm": 3.234375, "learning_rate": 0.0002992502093173383, "loss": 5.4467, "step": 2680 }, { "epoch": 1.2061994609164421, "grad_norm": 3.375, "learning_rate": 0.0002992391528527353, "loss": 5.3611, "step": 2685 }, { "epoch": 1.2084456424079066, "grad_norm": 3.359375, "learning_rate": 0.00029922801569435366, "loss": 5.4635, "step": 2690 }, { "epoch": 1.2106918238993711, "grad_norm": 3.671875, "learning_rate": 0.00029921679784888797, "loss": 5.4823, "step": 2695 }, { "epoch": 1.2129380053908356, "grad_norm": 2.875, "learning_rate": 0.0002992054993230816, "loss": 5.378, "step": 2700 }, { "epoch": 1.2151841868823001, "grad_norm": 2.765625, "learning_rate": 0.0002991941201237263, "loss": 5.4737, "step": 2705 }, { "epoch": 1.2174303683737646, "grad_norm": 3.0625, "learning_rate": 0.0002991826602576624, "loss": 5.4399, "step": 2710 }, { "epoch": 1.219676549865229, "grad_norm": 3.046875, "learning_rate": 0.00029917111973177857, "loss": 5.4663, "step": 2715 }, { "epoch": 1.2219227313566936, "grad_norm": 3.484375, "learning_rate": 0.00029915949855301204, "loss": 5.3946, "step": 2720 }, { "epoch": 1.224168912848158, "grad_norm": 2.953125, "learning_rate": 0.0002991477967283485, "loss": 5.4415, "step": 2725 }, { "epoch": 1.2264150943396226, "grad_norm": 3.125, "learning_rate": 0.00029913601426482226, "loss": 5.3648, "step": 2730 }, { "epoch": 1.228661275831087, "grad_norm": 2.953125, "learning_rate": 0.00029912415116951593, "loss": 5.4543, "step": 2735 }, { "epoch": 1.2309074573225516, "grad_norm": 2.921875, "learning_rate": 0.0002991122074495606, "loss": 5.381, "step": 2740 }, { "epoch": 1.233153638814016, "grad_norm": 3.015625, "learning_rate": 0.0002991001831121359, "loss": 5.4367, "step": 2745 }, { "epoch": 1.2353998203054806, "grad_norm": 3.796875, "learning_rate": 0.00029908807816446994, "loss": 5.5144, "step": 2750 }, { "epoch": 1.2376460017969453, "grad_norm": 3.140625, "learning_rate": 0.0002990758926138392, "loss": 5.4193, "step": 2755 }, { "epoch": 1.2398921832884098, "grad_norm": 3.078125, "learning_rate": 0.0002990636264675687, "loss": 5.4758, "step": 2760 }, { "epoch": 1.2421383647798743, "grad_norm": 3.265625, "learning_rate": 0.00029905127973303176, "loss": 5.4093, "step": 2765 }, { "epoch": 1.2443845462713388, "grad_norm": 3.015625, "learning_rate": 0.00029903885241765036, "loss": 5.4189, "step": 2770 }, { "epoch": 1.2466307277628033, "grad_norm": 2.90625, "learning_rate": 0.0002990263445288947, "loss": 5.4447, "step": 2775 }, { "epoch": 1.2488769092542678, "grad_norm": 3.03125, "learning_rate": 0.0002990137560742836, "loss": 5.3926, "step": 2780 }, { "epoch": 1.2511230907457322, "grad_norm": 3.203125, "learning_rate": 0.00029900108706138416, "loss": 5.3857, "step": 2785 }, { "epoch": 1.2533692722371967, "grad_norm": 2.890625, "learning_rate": 0.000298988337497812, "loss": 5.4141, "step": 2790 }, { "epoch": 1.2556154537286612, "grad_norm": 3.0625, "learning_rate": 0.0002989755073912311, "loss": 5.422, "step": 2795 }, { "epoch": 1.2578616352201257, "grad_norm": 3.1875, "learning_rate": 0.0002989625967493541, "loss": 5.3838, "step": 2800 }, { "epoch": 1.2601078167115902, "grad_norm": 3.046875, "learning_rate": 0.00029894960557994146, "loss": 5.5335, "step": 2805 }, { "epoch": 1.262353998203055, "grad_norm": 2.9375, "learning_rate": 0.00029893653389080274, "loss": 5.3528, "step": 2810 }, { "epoch": 1.2646001796945194, "grad_norm": 3.15625, "learning_rate": 0.0002989233816897954, "loss": 5.3309, "step": 2815 }, { "epoch": 1.266846361185984, "grad_norm": 3.09375, "learning_rate": 0.0002989101489848256, "loss": 5.4407, "step": 2820 }, { "epoch": 1.2690925426774484, "grad_norm": 3.421875, "learning_rate": 0.0002988968357838477, "loss": 5.3808, "step": 2825 }, { "epoch": 1.271338724168913, "grad_norm": 2.9375, "learning_rate": 0.0002988834420948647, "loss": 5.4058, "step": 2830 }, { "epoch": 1.2735849056603774, "grad_norm": 2.953125, "learning_rate": 0.0002988699679259275, "loss": 5.4674, "step": 2835 }, { "epoch": 1.275831087151842, "grad_norm": 3.0, "learning_rate": 0.00029885641328513594, "loss": 5.4242, "step": 2840 }, { "epoch": 1.2780772686433064, "grad_norm": 3.109375, "learning_rate": 0.0002988427781806379, "loss": 5.4332, "step": 2845 }, { "epoch": 1.280323450134771, "grad_norm": 2.953125, "learning_rate": 0.0002988290626206297, "loss": 5.3583, "step": 2850 }, { "epoch": 1.2825696316262354, "grad_norm": 3.328125, "learning_rate": 0.000298815266613356, "loss": 5.3448, "step": 2855 }, { "epoch": 1.2848158131176999, "grad_norm": 3.03125, "learning_rate": 0.0002988013901671099, "loss": 5.4957, "step": 2860 }, { "epoch": 1.2870619946091644, "grad_norm": 3.078125, "learning_rate": 0.0002987874332902328, "loss": 5.4692, "step": 2865 }, { "epoch": 1.2893081761006289, "grad_norm": 3.09375, "learning_rate": 0.0002987733959911144, "loss": 5.3743, "step": 2870 }, { "epoch": 1.2915543575920934, "grad_norm": 2.890625, "learning_rate": 0.00029875927827819286, "loss": 5.368, "step": 2875 }, { "epoch": 1.2938005390835579, "grad_norm": 3.046875, "learning_rate": 0.00029874508015995463, "loss": 5.3748, "step": 2880 }, { "epoch": 1.2960467205750223, "grad_norm": 3.140625, "learning_rate": 0.0002987308016449344, "loss": 5.3995, "step": 2885 }, { "epoch": 1.2982929020664868, "grad_norm": 3.1875, "learning_rate": 0.00029871644274171534, "loss": 5.3753, "step": 2890 }, { "epoch": 1.3005390835579516, "grad_norm": 3.234375, "learning_rate": 0.00029870200345892876, "loss": 5.4296, "step": 2895 }, { "epoch": 1.302785265049416, "grad_norm": 3.09375, "learning_rate": 0.00029868748380525444, "loss": 5.315, "step": 2900 }, { "epoch": 1.3050314465408805, "grad_norm": 3.125, "learning_rate": 0.0002986728837894205, "loss": 5.4592, "step": 2905 }, { "epoch": 1.307277628032345, "grad_norm": 3.203125, "learning_rate": 0.00029865820342020325, "loss": 5.4735, "step": 2910 }, { "epoch": 1.3095238095238095, "grad_norm": 3.109375, "learning_rate": 0.0002986434427064273, "loss": 5.3768, "step": 2915 }, { "epoch": 1.311769991015274, "grad_norm": 2.890625, "learning_rate": 0.0002986286016569657, "loss": 5.381, "step": 2920 }, { "epoch": 1.3140161725067385, "grad_norm": 2.890625, "learning_rate": 0.0002986136802807396, "loss": 5.4079, "step": 2925 }, { "epoch": 1.316262353998203, "grad_norm": 3.21875, "learning_rate": 0.00029859867858671857, "loss": 5.435, "step": 2930 }, { "epoch": 1.3185085354896675, "grad_norm": 3.171875, "learning_rate": 0.00029858359658392045, "loss": 5.4919, "step": 2935 }, { "epoch": 1.320754716981132, "grad_norm": 2.859375, "learning_rate": 0.00029856843428141127, "loss": 5.3849, "step": 2940 }, { "epoch": 1.3230008984725967, "grad_norm": 3.703125, "learning_rate": 0.00029855319168830543, "loss": 5.4001, "step": 2945 }, { "epoch": 1.3252470799640612, "grad_norm": 3.375, "learning_rate": 0.0002985378688137656, "loss": 5.5048, "step": 2950 }, { "epoch": 1.3274932614555257, "grad_norm": 3.03125, "learning_rate": 0.00029852246566700253, "loss": 5.367, "step": 2955 }, { "epoch": 1.3297394429469902, "grad_norm": 2.921875, "learning_rate": 0.0002985069822572754, "loss": 5.3137, "step": 2960 }, { "epoch": 1.3319856244384547, "grad_norm": 3.15625, "learning_rate": 0.0002984914185938916, "loss": 5.3961, "step": 2965 }, { "epoch": 1.3342318059299192, "grad_norm": 3.1875, "learning_rate": 0.0002984757746862068, "loss": 5.4488, "step": 2970 }, { "epoch": 1.3364779874213837, "grad_norm": 3.171875, "learning_rate": 0.00029846005054362474, "loss": 5.4318, "step": 2975 }, { "epoch": 1.3387241689128482, "grad_norm": 2.96875, "learning_rate": 0.0002984442461755977, "loss": 5.3834, "step": 2980 }, { "epoch": 1.3409703504043127, "grad_norm": 3.0625, "learning_rate": 0.00029842836159162583, "loss": 5.4205, "step": 2985 }, { "epoch": 1.3432165318957772, "grad_norm": 2.90625, "learning_rate": 0.0002984123968012577, "loss": 5.4352, "step": 2990 }, { "epoch": 1.3454627133872417, "grad_norm": 3.03125, "learning_rate": 0.0002983963518140901, "loss": 5.4451, "step": 2995 }, { "epoch": 1.3477088948787062, "grad_norm": 3.0625, "learning_rate": 0.00029838022663976793, "loss": 5.3171, "step": 3000 }, { "epoch": 1.3477088948787062, "eval_loss": 5.344548225402832, "eval_runtime": 16.0596, "eval_samples_per_second": 1931.124, "eval_steps_per_second": 241.414, "step": 3000 }, { "epoch": 1.3499550763701706, "grad_norm": 2.984375, "learning_rate": 0.0002983640212879844, "loss": 5.4371, "step": 3005 }, { "epoch": 1.3522012578616351, "grad_norm": 3.265625, "learning_rate": 0.0002983477357684809, "loss": 5.3769, "step": 3010 }, { "epoch": 1.3544474393530996, "grad_norm": 3.421875, "learning_rate": 0.0002983313700910468, "loss": 5.4952, "step": 3015 }, { "epoch": 1.3566936208445641, "grad_norm": 2.96875, "learning_rate": 0.00029831492426552, "loss": 5.3494, "step": 3020 }, { "epoch": 1.3589398023360286, "grad_norm": 3.0625, "learning_rate": 0.00029829839830178636, "loss": 5.4431, "step": 3025 }, { "epoch": 1.3611859838274933, "grad_norm": 2.953125, "learning_rate": 0.00029828179220977994, "loss": 5.3644, "step": 3030 }, { "epoch": 1.3634321653189578, "grad_norm": 3.1875, "learning_rate": 0.000298265105999483, "loss": 5.3982, "step": 3035 }, { "epoch": 1.3656783468104223, "grad_norm": 3.03125, "learning_rate": 0.00029824833968092595, "loss": 5.3913, "step": 3040 }, { "epoch": 1.3679245283018868, "grad_norm": 2.96875, "learning_rate": 0.00029823149326418735, "loss": 5.3851, "step": 3045 }, { "epoch": 1.3701707097933513, "grad_norm": 3.0, "learning_rate": 0.0002982145667593939, "loss": 5.3206, "step": 3050 }, { "epoch": 1.3724168912848158, "grad_norm": 3.203125, "learning_rate": 0.00029819756017672043, "loss": 5.3429, "step": 3055 }, { "epoch": 1.3746630727762803, "grad_norm": 3.25, "learning_rate": 0.00029818047352639, "loss": 5.4596, "step": 3060 }, { "epoch": 1.3769092542677448, "grad_norm": 3.078125, "learning_rate": 0.00029816330681867366, "loss": 5.3423, "step": 3065 }, { "epoch": 1.3791554357592093, "grad_norm": 2.875, "learning_rate": 0.0002981460600638907, "loss": 5.3283, "step": 3070 }, { "epoch": 1.3814016172506738, "grad_norm": 2.921875, "learning_rate": 0.00029812873327240844, "loss": 5.3159, "step": 3075 }, { "epoch": 1.3836477987421385, "grad_norm": 2.890625, "learning_rate": 0.0002981113264546424, "loss": 5.3529, "step": 3080 }, { "epoch": 1.385893980233603, "grad_norm": 3.125, "learning_rate": 0.0002980938396210561, "loss": 5.46, "step": 3085 }, { "epoch": 1.3881401617250675, "grad_norm": 2.890625, "learning_rate": 0.00029807627278216126, "loss": 5.4219, "step": 3090 }, { "epoch": 1.390386343216532, "grad_norm": 3.03125, "learning_rate": 0.0002980586259485177, "loss": 5.4519, "step": 3095 }, { "epoch": 1.3926325247079965, "grad_norm": 3.15625, "learning_rate": 0.00029804089913073315, "loss": 5.4067, "step": 3100 }, { "epoch": 1.394878706199461, "grad_norm": 3.046875, "learning_rate": 0.0002980230923394637, "loss": 5.348, "step": 3105 }, { "epoch": 1.3971248876909255, "grad_norm": 3.109375, "learning_rate": 0.00029800520558541317, "loss": 5.3693, "step": 3110 }, { "epoch": 1.39937106918239, "grad_norm": 2.96875, "learning_rate": 0.0002979872388793338, "loss": 5.3537, "step": 3115 }, { "epoch": 1.4016172506738545, "grad_norm": 2.75, "learning_rate": 0.00029796919223202563, "loss": 5.3571, "step": 3120 }, { "epoch": 1.403863432165319, "grad_norm": 3.0, "learning_rate": 0.0002979510656543369, "loss": 5.3759, "step": 3125 }, { "epoch": 1.4061096136567834, "grad_norm": 3.109375, "learning_rate": 0.0002979328591571639, "loss": 5.3222, "step": 3130 }, { "epoch": 1.408355795148248, "grad_norm": 3.0, "learning_rate": 0.00029791457275145085, "loss": 5.2987, "step": 3135 }, { "epoch": 1.4106019766397124, "grad_norm": 2.984375, "learning_rate": 0.00029789620644819005, "loss": 5.3843, "step": 3140 }, { "epoch": 1.412848158131177, "grad_norm": 3.03125, "learning_rate": 0.00029787776025842186, "loss": 5.3461, "step": 3145 }, { "epoch": 1.4150943396226414, "grad_norm": 3.015625, "learning_rate": 0.00029785923419323467, "loss": 5.3381, "step": 3150 }, { "epoch": 1.417340521114106, "grad_norm": 2.890625, "learning_rate": 0.0002978406282637648, "loss": 5.3985, "step": 3155 }, { "epoch": 1.4195867026055704, "grad_norm": 2.953125, "learning_rate": 0.0002978219424811967, "loss": 5.3383, "step": 3160 }, { "epoch": 1.4218328840970351, "grad_norm": 3.125, "learning_rate": 0.00029780317685676276, "loss": 5.4033, "step": 3165 }, { "epoch": 1.4240790655884996, "grad_norm": 3.03125, "learning_rate": 0.0002977843314017433, "loss": 5.4135, "step": 3170 }, { "epoch": 1.426325247079964, "grad_norm": 3.0, "learning_rate": 0.0002977654061274668, "loss": 5.3461, "step": 3175 }, { "epoch": 1.4285714285714286, "grad_norm": 3.0625, "learning_rate": 0.0002977464010453095, "loss": 5.281, "step": 3180 }, { "epoch": 1.430817610062893, "grad_norm": 3.359375, "learning_rate": 0.0002977273161666957, "loss": 5.4328, "step": 3185 }, { "epoch": 1.4330637915543576, "grad_norm": 3.0, "learning_rate": 0.00029770815150309787, "loss": 5.3081, "step": 3190 }, { "epoch": 1.435309973045822, "grad_norm": 2.984375, "learning_rate": 0.0002976889070660361, "loss": 5.4198, "step": 3195 }, { "epoch": 1.4375561545372866, "grad_norm": 2.90625, "learning_rate": 0.0002976695828670787, "loss": 5.3054, "step": 3200 }, { "epoch": 1.439802336028751, "grad_norm": 2.984375, "learning_rate": 0.00029765017891784175, "loss": 5.4182, "step": 3205 }, { "epoch": 1.4420485175202156, "grad_norm": 2.765625, "learning_rate": 0.00029763069522998936, "loss": 5.3818, "step": 3210 }, { "epoch": 1.44429469901168, "grad_norm": 2.78125, "learning_rate": 0.0002976111318152336, "loss": 5.34, "step": 3215 }, { "epoch": 1.4465408805031448, "grad_norm": 2.96875, "learning_rate": 0.0002975914886853344, "loss": 5.4218, "step": 3220 }, { "epoch": 1.4487870619946093, "grad_norm": 3.078125, "learning_rate": 0.00029757176585209957, "loss": 5.3399, "step": 3225 }, { "epoch": 1.4510332434860738, "grad_norm": 3.265625, "learning_rate": 0.000297551963327385, "loss": 5.2921, "step": 3230 }, { "epoch": 1.4532794249775383, "grad_norm": 2.890625, "learning_rate": 0.00029753208112309423, "loss": 5.3799, "step": 3235 }, { "epoch": 1.4555256064690028, "grad_norm": 2.890625, "learning_rate": 0.00029751211925117897, "loss": 5.2984, "step": 3240 }, { "epoch": 1.4577717879604672, "grad_norm": 3.265625, "learning_rate": 0.00029749207772363867, "loss": 5.379, "step": 3245 }, { "epoch": 1.4600179694519317, "grad_norm": 3.0, "learning_rate": 0.0002974719565525207, "loss": 5.3465, "step": 3250 }, { "epoch": 1.4622641509433962, "grad_norm": 2.90625, "learning_rate": 0.0002974517557499201, "loss": 5.413, "step": 3255 }, { "epoch": 1.4645103324348607, "grad_norm": 3.40625, "learning_rate": 0.00029743147532798023, "loss": 5.2814, "step": 3260 }, { "epoch": 1.4667565139263252, "grad_norm": 2.96875, "learning_rate": 0.00029741111529889194, "loss": 5.3454, "step": 3265 }, { "epoch": 1.4690026954177897, "grad_norm": 3.078125, "learning_rate": 0.000297390675674894, "loss": 5.3013, "step": 3270 }, { "epoch": 1.4712488769092542, "grad_norm": 3.09375, "learning_rate": 0.0002973701564682731, "loss": 5.2762, "step": 3275 }, { "epoch": 1.4734950584007187, "grad_norm": 3.015625, "learning_rate": 0.00029734955769136377, "loss": 5.3686, "step": 3280 }, { "epoch": 1.4757412398921832, "grad_norm": 3.140625, "learning_rate": 0.00029732887935654827, "loss": 5.3697, "step": 3285 }, { "epoch": 1.4779874213836477, "grad_norm": 2.953125, "learning_rate": 0.0002973081214762568, "loss": 5.2504, "step": 3290 }, { "epoch": 1.4802336028751122, "grad_norm": 2.9375, "learning_rate": 0.00029728728406296735, "loss": 5.3318, "step": 3295 }, { "epoch": 1.482479784366577, "grad_norm": 3.078125, "learning_rate": 0.00029726636712920564, "loss": 5.3078, "step": 3300 }, { "epoch": 1.4847259658580414, "grad_norm": 3.046875, "learning_rate": 0.0002972453706875453, "loss": 5.3814, "step": 3305 }, { "epoch": 1.486972147349506, "grad_norm": 2.875, "learning_rate": 0.0002972242947506076, "loss": 5.2753, "step": 3310 }, { "epoch": 1.4892183288409704, "grad_norm": 3.046875, "learning_rate": 0.0002972031393310619, "loss": 5.3256, "step": 3315 }, { "epoch": 1.4914645103324349, "grad_norm": 3.0625, "learning_rate": 0.0002971819044416249, "loss": 5.3758, "step": 3320 }, { "epoch": 1.4937106918238994, "grad_norm": 2.75, "learning_rate": 0.00029716059009506145, "loss": 5.3209, "step": 3325 }, { "epoch": 1.4959568733153639, "grad_norm": 3.109375, "learning_rate": 0.000297139196304184, "loss": 5.3075, "step": 3330 }, { "epoch": 1.4982030548068284, "grad_norm": 2.859375, "learning_rate": 0.0002971177230818527, "loss": 5.3805, "step": 3335 }, { "epoch": 1.5004492362982929, "grad_norm": 3.0625, "learning_rate": 0.0002970961704409756, "loss": 5.3156, "step": 3340 }, { "epoch": 1.5026954177897576, "grad_norm": 2.90625, "learning_rate": 0.0002970745383945084, "loss": 5.3465, "step": 3345 }, { "epoch": 1.504941599281222, "grad_norm": 3.015625, "learning_rate": 0.00029705282695545454, "loss": 5.3717, "step": 3350 }, { "epoch": 1.5071877807726866, "grad_norm": 3.09375, "learning_rate": 0.00029703103613686527, "loss": 5.2288, "step": 3355 }, { "epoch": 1.509433962264151, "grad_norm": 3.1875, "learning_rate": 0.0002970091659518393, "loss": 5.2978, "step": 3360 }, { "epoch": 1.5116801437556155, "grad_norm": 2.9375, "learning_rate": 0.0002969872164135234, "loss": 5.2993, "step": 3365 }, { "epoch": 1.51392632524708, "grad_norm": 3.0625, "learning_rate": 0.00029696518753511173, "loss": 5.3231, "step": 3370 }, { "epoch": 1.5161725067385445, "grad_norm": 2.796875, "learning_rate": 0.0002969430793298464, "loss": 5.334, "step": 3375 }, { "epoch": 1.518418688230009, "grad_norm": 3.03125, "learning_rate": 0.00029692089181101696, "loss": 5.2514, "step": 3380 }, { "epoch": 1.5206648697214735, "grad_norm": 2.890625, "learning_rate": 0.0002968986249919609, "loss": 5.3403, "step": 3385 }, { "epoch": 1.522911051212938, "grad_norm": 3.0625, "learning_rate": 0.0002968762788860631, "loss": 5.3209, "step": 3390 }, { "epoch": 1.5251572327044025, "grad_norm": 3.125, "learning_rate": 0.0002968538535067564, "loss": 5.3657, "step": 3395 }, { "epoch": 1.527403414195867, "grad_norm": 2.96875, "learning_rate": 0.000296831348867521, "loss": 5.3167, "step": 3400 }, { "epoch": 1.5296495956873315, "grad_norm": 2.953125, "learning_rate": 0.0002968087649818848, "loss": 5.2753, "step": 3405 }, { "epoch": 1.531895777178796, "grad_norm": 2.984375, "learning_rate": 0.0002967861018634237, "loss": 5.3678, "step": 3410 }, { "epoch": 1.5341419586702605, "grad_norm": 3.265625, "learning_rate": 0.00029676335952576074, "loss": 5.3243, "step": 3415 }, { "epoch": 1.536388140161725, "grad_norm": 3.109375, "learning_rate": 0.0002967405379825668, "loss": 5.2466, "step": 3420 }, { "epoch": 1.5386343216531895, "grad_norm": 2.953125, "learning_rate": 0.0002967176372475604, "loss": 5.2428, "step": 3425 }, { "epoch": 1.540880503144654, "grad_norm": 2.890625, "learning_rate": 0.0002966946573345076, "loss": 5.2614, "step": 3430 }, { "epoch": 1.5431266846361185, "grad_norm": 2.921875, "learning_rate": 0.00029667159825722206, "loss": 5.3399, "step": 3435 }, { "epoch": 1.545372866127583, "grad_norm": 3.0625, "learning_rate": 0.00029664846002956506, "loss": 5.2338, "step": 3440 }, { "epoch": 1.5476190476190477, "grad_norm": 3.09375, "learning_rate": 0.0002966252426654454, "loss": 5.3445, "step": 3445 }, { "epoch": 1.5498652291105122, "grad_norm": 2.875, "learning_rate": 0.0002966019461788196, "loss": 5.2916, "step": 3450 }, { "epoch": 1.5521114106019767, "grad_norm": 2.9375, "learning_rate": 0.0002965785705836915, "loss": 5.3159, "step": 3455 }, { "epoch": 1.5543575920934412, "grad_norm": 3.1875, "learning_rate": 0.0002965551158941127, "loss": 5.3027, "step": 3460 }, { "epoch": 1.5566037735849056, "grad_norm": 2.96875, "learning_rate": 0.0002965315821241823, "loss": 5.2319, "step": 3465 }, { "epoch": 1.5588499550763701, "grad_norm": 3.875, "learning_rate": 0.00029650796928804685, "loss": 5.3169, "step": 3470 }, { "epoch": 1.5610961365678346, "grad_norm": 3.03125, "learning_rate": 0.0002964842773999005, "loss": 5.2524, "step": 3475 }, { "epoch": 1.5633423180592994, "grad_norm": 2.921875, "learning_rate": 0.0002964605064739849, "loss": 5.3455, "step": 3480 }, { "epoch": 1.5655884995507638, "grad_norm": 3.015625, "learning_rate": 0.0002964366565245892, "loss": 5.3241, "step": 3485 }, { "epoch": 1.5678346810422283, "grad_norm": 3.015625, "learning_rate": 0.00029641272756605023, "loss": 5.301, "step": 3490 }, { "epoch": 1.5700808625336928, "grad_norm": 3.0, "learning_rate": 0.0002963887196127519, "loss": 5.2987, "step": 3495 }, { "epoch": 1.5723270440251573, "grad_norm": 2.96875, "learning_rate": 0.00029636463267912607, "loss": 5.2262, "step": 3500 }, { "epoch": 1.5745732255166218, "grad_norm": 2.90625, "learning_rate": 0.00029634046677965174, "loss": 5.2556, "step": 3505 }, { "epoch": 1.5768194070080863, "grad_norm": 2.90625, "learning_rate": 0.00029631622192885553, "loss": 5.3328, "step": 3510 }, { "epoch": 1.5790655884995508, "grad_norm": 3.078125, "learning_rate": 0.00029629189814131155, "loss": 5.3252, "step": 3515 }, { "epoch": 1.5813117699910153, "grad_norm": 3.03125, "learning_rate": 0.0002962674954316413, "loss": 5.2871, "step": 3520 }, { "epoch": 1.5835579514824798, "grad_norm": 2.890625, "learning_rate": 0.0002962430138145137, "loss": 5.2723, "step": 3525 }, { "epoch": 1.5858041329739443, "grad_norm": 2.765625, "learning_rate": 0.000296218453304645, "loss": 5.2836, "step": 3530 }, { "epoch": 1.5880503144654088, "grad_norm": 3.015625, "learning_rate": 0.00029619381391679923, "loss": 5.3014, "step": 3535 }, { "epoch": 1.5902964959568733, "grad_norm": 2.890625, "learning_rate": 0.00029616909566578746, "loss": 5.2194, "step": 3540 }, { "epoch": 1.5925426774483378, "grad_norm": 2.875, "learning_rate": 0.0002961442985664684, "loss": 5.3363, "step": 3545 }, { "epoch": 1.5947888589398023, "grad_norm": 2.875, "learning_rate": 0.000296119422633748, "loss": 5.2192, "step": 3550 }, { "epoch": 1.5970350404312668, "grad_norm": 3.109375, "learning_rate": 0.0002960944678825797, "loss": 5.2585, "step": 3555 }, { "epoch": 1.5992812219227313, "grad_norm": 3.40625, "learning_rate": 0.0002960694343279643, "loss": 5.4105, "step": 3560 }, { "epoch": 1.6015274034141957, "grad_norm": 2.953125, "learning_rate": 0.0002960443219849499, "loss": 5.2834, "step": 3565 }, { "epoch": 1.6037735849056602, "grad_norm": 2.953125, "learning_rate": 0.0002960191308686321, "loss": 5.2917, "step": 3570 }, { "epoch": 1.6060197663971247, "grad_norm": 2.953125, "learning_rate": 0.0002959938609941537, "loss": 5.3014, "step": 3575 }, { "epoch": 1.6082659478885895, "grad_norm": 3.0625, "learning_rate": 0.00029596851237670494, "loss": 5.2469, "step": 3580 }, { "epoch": 1.610512129380054, "grad_norm": 3.046875, "learning_rate": 0.00029594308503152344, "loss": 5.2651, "step": 3585 }, { "epoch": 1.6127583108715184, "grad_norm": 2.9375, "learning_rate": 0.00029591757897389403, "loss": 5.2144, "step": 3590 }, { "epoch": 1.615004492362983, "grad_norm": 3.015625, "learning_rate": 0.00029589199421914885, "loss": 5.2536, "step": 3595 }, { "epoch": 1.6172506738544474, "grad_norm": 2.90625, "learning_rate": 0.0002958663307826674, "loss": 5.2291, "step": 3600 }, { "epoch": 1.619496855345912, "grad_norm": 2.875, "learning_rate": 0.00029584058867987656, "loss": 5.2936, "step": 3605 }, { "epoch": 1.6217430368373764, "grad_norm": 3.171875, "learning_rate": 0.00029581476792625035, "loss": 5.3135, "step": 3610 }, { "epoch": 1.6239892183288411, "grad_norm": 3.078125, "learning_rate": 0.0002957888685373101, "loss": 5.2395, "step": 3615 }, { "epoch": 1.6262353998203056, "grad_norm": 3.015625, "learning_rate": 0.0002957628905286245, "loss": 5.2269, "step": 3620 }, { "epoch": 1.6284815813117701, "grad_norm": 2.953125, "learning_rate": 0.00029573683391580946, "loss": 5.2192, "step": 3625 }, { "epoch": 1.6307277628032346, "grad_norm": 3.109375, "learning_rate": 0.000295710698714528, "loss": 5.2539, "step": 3630 }, { "epoch": 1.632973944294699, "grad_norm": 3.03125, "learning_rate": 0.0002956844849404906, "loss": 5.2506, "step": 3635 }, { "epoch": 1.6352201257861636, "grad_norm": 2.78125, "learning_rate": 0.00029565819260945483, "loss": 5.2739, "step": 3640 }, { "epoch": 1.637466307277628, "grad_norm": 3.03125, "learning_rate": 0.00029563182173722555, "loss": 5.232, "step": 3645 }, { "epoch": 1.6397124887690926, "grad_norm": 2.890625, "learning_rate": 0.0002956053723396548, "loss": 5.3054, "step": 3650 }, { "epoch": 1.641958670260557, "grad_norm": 2.9375, "learning_rate": 0.0002955788444326418, "loss": 5.2955, "step": 3655 }, { "epoch": 1.6442048517520216, "grad_norm": 3.0, "learning_rate": 0.00029555223803213305, "loss": 5.2577, "step": 3660 }, { "epoch": 1.646451033243486, "grad_norm": 2.96875, "learning_rate": 0.00029552555315412216, "loss": 5.2796, "step": 3665 }, { "epoch": 1.6486972147349506, "grad_norm": 3.75, "learning_rate": 0.0002954987898146499, "loss": 5.3159, "step": 3670 }, { "epoch": 1.650943396226415, "grad_norm": 2.890625, "learning_rate": 0.0002954719480298043, "loss": 5.2639, "step": 3675 }, { "epoch": 1.6531895777178796, "grad_norm": 2.875, "learning_rate": 0.00029544502781572035, "loss": 5.2906, "step": 3680 }, { "epoch": 1.655435759209344, "grad_norm": 4.75, "learning_rate": 0.0002954180291885804, "loss": 5.299, "step": 3685 }, { "epoch": 1.6576819407008085, "grad_norm": 3.046875, "learning_rate": 0.00029539095216461395, "loss": 5.2026, "step": 3690 }, { "epoch": 1.659928122192273, "grad_norm": 2.859375, "learning_rate": 0.0002953637967600974, "loss": 5.2159, "step": 3695 }, { "epoch": 1.6621743036837375, "grad_norm": 2.953125, "learning_rate": 0.0002953365629913544, "loss": 5.22, "step": 3700 }, { "epoch": 1.664420485175202, "grad_norm": 3.015625, "learning_rate": 0.0002953092508747557, "loss": 5.1528, "step": 3705 }, { "epoch": 1.6666666666666665, "grad_norm": 3.09375, "learning_rate": 0.0002952818604267193, "loss": 5.234, "step": 3710 }, { "epoch": 1.668912848158131, "grad_norm": 3.421875, "learning_rate": 0.0002952543916637099, "loss": 5.263, "step": 3715 }, { "epoch": 1.6711590296495957, "grad_norm": 2.984375, "learning_rate": 0.00029522684460223965, "loss": 5.2879, "step": 3720 }, { "epoch": 1.6734052111410602, "grad_norm": 3.046875, "learning_rate": 0.0002951992192588676, "loss": 5.2081, "step": 3725 }, { "epoch": 1.6756513926325247, "grad_norm": 2.953125, "learning_rate": 0.0002951715156501999, "loss": 5.2688, "step": 3730 }, { "epoch": 1.6778975741239892, "grad_norm": 3.0, "learning_rate": 0.00029514373379288967, "loss": 5.2266, "step": 3735 }, { "epoch": 1.6801437556154537, "grad_norm": 2.859375, "learning_rate": 0.0002951158737036372, "loss": 5.2542, "step": 3740 }, { "epoch": 1.6823899371069182, "grad_norm": 2.984375, "learning_rate": 0.0002950879353991897, "loss": 5.2341, "step": 3745 }, { "epoch": 1.684636118598383, "grad_norm": 3.171875, "learning_rate": 0.0002950599188963414, "loss": 5.2238, "step": 3750 }, { "epoch": 1.6868823000898474, "grad_norm": 3.09375, "learning_rate": 0.0002950318242119337, "loss": 5.3397, "step": 3755 }, { "epoch": 1.689128481581312, "grad_norm": 3.015625, "learning_rate": 0.0002950036513628547, "loss": 5.2441, "step": 3760 }, { "epoch": 1.6913746630727764, "grad_norm": 2.859375, "learning_rate": 0.0002949754003660397, "loss": 5.3238, "step": 3765 }, { "epoch": 1.693620844564241, "grad_norm": 3.390625, "learning_rate": 0.00029494707123847095, "loss": 5.3302, "step": 3770 }, { "epoch": 1.6958670260557054, "grad_norm": 3.28125, "learning_rate": 0.0002949186639971777, "loss": 5.2831, "step": 3775 }, { "epoch": 1.6981132075471699, "grad_norm": 3.078125, "learning_rate": 0.00029489017865923597, "loss": 5.2566, "step": 3780 }, { "epoch": 1.7003593890386344, "grad_norm": 2.9375, "learning_rate": 0.00029486161524176893, "loss": 5.2631, "step": 3785 }, { "epoch": 1.7026055705300989, "grad_norm": 3.046875, "learning_rate": 0.0002948329737619466, "loss": 5.2597, "step": 3790 }, { "epoch": 1.7048517520215634, "grad_norm": 3.265625, "learning_rate": 0.0002948042542369859, "loss": 5.2838, "step": 3795 }, { "epoch": 1.7070979335130279, "grad_norm": 2.9375, "learning_rate": 0.0002947754566841508, "loss": 5.2681, "step": 3800 }, { "epoch": 1.7093441150044923, "grad_norm": 3.046875, "learning_rate": 0.00029474658112075197, "loss": 5.3089, "step": 3805 }, { "epoch": 1.7115902964959568, "grad_norm": 3.03125, "learning_rate": 0.00029471762756414703, "loss": 5.2663, "step": 3810 }, { "epoch": 1.7138364779874213, "grad_norm": 2.953125, "learning_rate": 0.00029468859603174065, "loss": 5.2597, "step": 3815 }, { "epoch": 1.7160826594788858, "grad_norm": 3.046875, "learning_rate": 0.00029465948654098427, "loss": 5.2646, "step": 3820 }, { "epoch": 1.7183288409703503, "grad_norm": 2.890625, "learning_rate": 0.0002946302991093761, "loss": 5.2662, "step": 3825 }, { "epoch": 1.7205750224618148, "grad_norm": 2.890625, "learning_rate": 0.00029460103375446116, "loss": 5.2176, "step": 3830 }, { "epoch": 1.7228212039532793, "grad_norm": 2.84375, "learning_rate": 0.00029457169049383164, "loss": 5.225, "step": 3835 }, { "epoch": 1.7250673854447438, "grad_norm": 3.09375, "learning_rate": 0.00029454226934512624, "loss": 5.2631, "step": 3840 }, { "epoch": 1.7273135669362083, "grad_norm": 2.8125, "learning_rate": 0.00029451277032603064, "loss": 5.2029, "step": 3845 }, { "epoch": 1.7295597484276728, "grad_norm": 2.921875, "learning_rate": 0.0002944831934542772, "loss": 5.2321, "step": 3850 }, { "epoch": 1.7318059299191375, "grad_norm": 3.03125, "learning_rate": 0.00029445353874764526, "loss": 5.2173, "step": 3855 }, { "epoch": 1.734052111410602, "grad_norm": 2.90625, "learning_rate": 0.00029442380622396073, "loss": 5.2293, "step": 3860 }, { "epoch": 1.7362982929020665, "grad_norm": 2.984375, "learning_rate": 0.00029439399590109645, "loss": 5.1509, "step": 3865 }, { "epoch": 1.738544474393531, "grad_norm": 2.890625, "learning_rate": 0.00029436410779697206, "loss": 5.2911, "step": 3870 }, { "epoch": 1.7407906558849955, "grad_norm": 3.03125, "learning_rate": 0.00029433414192955377, "loss": 5.1782, "step": 3875 }, { "epoch": 1.74303683737646, "grad_norm": 3.0, "learning_rate": 0.0002943040983168547, "loss": 5.2294, "step": 3880 }, { "epoch": 1.7452830188679245, "grad_norm": 3.171875, "learning_rate": 0.0002942739769769347, "loss": 5.2567, "step": 3885 }, { "epoch": 1.7475292003593892, "grad_norm": 3.546875, "learning_rate": 0.00029424377792790023, "loss": 5.2894, "step": 3890 }, { "epoch": 1.7497753818508537, "grad_norm": 2.953125, "learning_rate": 0.0002942135011879046, "loss": 5.3933, "step": 3895 }, { "epoch": 1.7520215633423182, "grad_norm": 3.1875, "learning_rate": 0.00029418314677514764, "loss": 5.295, "step": 3900 }, { "epoch": 1.7542677448337827, "grad_norm": 3.15625, "learning_rate": 0.0002941527147078761, "loss": 5.1949, "step": 3905 }, { "epoch": 1.7565139263252472, "grad_norm": 2.96875, "learning_rate": 0.00029412220500438317, "loss": 5.1329, "step": 3910 }, { "epoch": 1.7587601078167117, "grad_norm": 3.109375, "learning_rate": 0.0002940916176830089, "loss": 5.3141, "step": 3915 }, { "epoch": 1.7610062893081762, "grad_norm": 3.109375, "learning_rate": 0.0002940609527621399, "loss": 5.2578, "step": 3920 }, { "epoch": 1.7632524707996406, "grad_norm": 3.0, "learning_rate": 0.00029403021026020955, "loss": 5.2614, "step": 3925 }, { "epoch": 1.7654986522911051, "grad_norm": 3.109375, "learning_rate": 0.00029399939019569767, "loss": 5.2955, "step": 3930 }, { "epoch": 1.7677448337825696, "grad_norm": 2.9375, "learning_rate": 0.00029396849258713084, "loss": 5.2972, "step": 3935 }, { "epoch": 1.7699910152740341, "grad_norm": 3.09375, "learning_rate": 0.00029393751745308215, "loss": 5.2714, "step": 3940 }, { "epoch": 1.7722371967654986, "grad_norm": 3.234375, "learning_rate": 0.0002939064648121714, "loss": 5.2846, "step": 3945 }, { "epoch": 1.7744833782569631, "grad_norm": 2.90625, "learning_rate": 0.00029387533468306504, "loss": 5.263, "step": 3950 }, { "epoch": 1.7767295597484276, "grad_norm": 3.09375, "learning_rate": 0.0002938441270844758, "loss": 5.1442, "step": 3955 }, { "epoch": 1.778975741239892, "grad_norm": 2.859375, "learning_rate": 0.00029381284203516334, "loss": 5.209, "step": 3960 }, { "epoch": 1.7812219227313566, "grad_norm": 3.078125, "learning_rate": 0.00029378147955393363, "loss": 5.2285, "step": 3965 }, { "epoch": 1.783468104222821, "grad_norm": 3.171875, "learning_rate": 0.00029375003965963935, "loss": 5.2605, "step": 3970 }, { "epoch": 1.7857142857142856, "grad_norm": 2.921875, "learning_rate": 0.00029371852237117957, "loss": 5.2557, "step": 3975 }, { "epoch": 1.78796046720575, "grad_norm": 2.96875, "learning_rate": 0.00029368692770749994, "loss": 5.1953, "step": 3980 }, { "epoch": 1.7902066486972146, "grad_norm": 3.0, "learning_rate": 0.00029365525568759266, "loss": 5.2138, "step": 3985 }, { "epoch": 1.7924528301886793, "grad_norm": 3.03125, "learning_rate": 0.0002936235063304964, "loss": 5.2362, "step": 3990 }, { "epoch": 1.7946990116801438, "grad_norm": 3.703125, "learning_rate": 0.0002935916796552963, "loss": 5.238, "step": 3995 }, { "epoch": 1.7969451931716083, "grad_norm": 3.03125, "learning_rate": 0.00029355977568112403, "loss": 5.2092, "step": 4000 }, { "epoch": 1.7969451931716083, "eval_loss": 5.183039665222168, "eval_runtime": 16.1808, "eval_samples_per_second": 1916.649, "eval_steps_per_second": 239.604, "step": 4000 }, { "epoch": 1.7991913746630728, "grad_norm": 2.875, "learning_rate": 0.00029352779442715765, "loss": 5.2075, "step": 4005 }, { "epoch": 1.8014375561545373, "grad_norm": 3.0, "learning_rate": 0.0002934957359126218, "loss": 5.1898, "step": 4010 }, { "epoch": 1.8036837376460018, "grad_norm": 3.25, "learning_rate": 0.0002934636001567873, "loss": 5.2844, "step": 4015 }, { "epoch": 1.8059299191374663, "grad_norm": 3.109375, "learning_rate": 0.0002934313871789718, "loss": 5.2941, "step": 4020 }, { "epoch": 1.808176100628931, "grad_norm": 3.140625, "learning_rate": 0.00029339909699853904, "loss": 5.3192, "step": 4025 }, { "epoch": 1.8104222821203955, "grad_norm": 3.015625, "learning_rate": 0.00029336672963489925, "loss": 5.1957, "step": 4030 }, { "epoch": 1.81266846361186, "grad_norm": 2.890625, "learning_rate": 0.0002933342851075092, "loss": 5.2322, "step": 4035 }, { "epoch": 1.8149146451033245, "grad_norm": 2.921875, "learning_rate": 0.00029330176343587175, "loss": 5.124, "step": 4040 }, { "epoch": 1.817160826594789, "grad_norm": 2.921875, "learning_rate": 0.00029326916463953646, "loss": 5.195, "step": 4045 }, { "epoch": 1.8194070080862534, "grad_norm": 3.03125, "learning_rate": 0.0002932364887380991, "loss": 5.2398, "step": 4050 }, { "epoch": 1.821653189577718, "grad_norm": 3.03125, "learning_rate": 0.00029320373575120174, "loss": 5.1243, "step": 4055 }, { "epoch": 1.8238993710691824, "grad_norm": 2.921875, "learning_rate": 0.0002931709056985328, "loss": 5.1875, "step": 4060 }, { "epoch": 1.826145552560647, "grad_norm": 3.140625, "learning_rate": 0.0002931379985998272, "loss": 5.2679, "step": 4065 }, { "epoch": 1.8283917340521114, "grad_norm": 3.109375, "learning_rate": 0.0002931050144748659, "loss": 5.1371, "step": 4070 }, { "epoch": 1.830637915543576, "grad_norm": 2.921875, "learning_rate": 0.0002930719533434764, "loss": 5.2114, "step": 4075 }, { "epoch": 1.8328840970350404, "grad_norm": 2.984375, "learning_rate": 0.0002930388152255323, "loss": 5.2132, "step": 4080 }, { "epoch": 1.835130278526505, "grad_norm": 2.96875, "learning_rate": 0.0002930056001409537, "loss": 5.211, "step": 4085 }, { "epoch": 1.8373764600179694, "grad_norm": 2.921875, "learning_rate": 0.0002929723081097067, "loss": 5.1184, "step": 4090 }, { "epoch": 1.8396226415094339, "grad_norm": 2.796875, "learning_rate": 0.00029293893915180387, "loss": 5.1128, "step": 4095 }, { "epoch": 1.8418688230008984, "grad_norm": 3.078125, "learning_rate": 0.00029290549328730395, "loss": 5.2356, "step": 4100 }, { "epoch": 1.8441150044923629, "grad_norm": 3.140625, "learning_rate": 0.0002928719705363118, "loss": 5.1903, "step": 4105 }, { "epoch": 1.8463611859838274, "grad_norm": 2.9375, "learning_rate": 0.00029283837091897876, "loss": 5.1552, "step": 4110 }, { "epoch": 1.8486073674752919, "grad_norm": 3.015625, "learning_rate": 0.00029280469445550213, "loss": 5.1519, "step": 4115 }, { "epoch": 1.8508535489667564, "grad_norm": 3.09375, "learning_rate": 0.0002927709411661255, "loss": 5.181, "step": 4120 }, { "epoch": 1.853099730458221, "grad_norm": 3.15625, "learning_rate": 0.00029273711107113856, "loss": 5.1855, "step": 4125 }, { "epoch": 1.8553459119496856, "grad_norm": 3.015625, "learning_rate": 0.00029270320419087743, "loss": 5.2248, "step": 4130 }, { "epoch": 1.85759209344115, "grad_norm": 3.015625, "learning_rate": 0.00029266922054572395, "loss": 5.1783, "step": 4135 }, { "epoch": 1.8598382749326146, "grad_norm": 2.890625, "learning_rate": 0.00029263516015610655, "loss": 5.2069, "step": 4140 }, { "epoch": 1.862084456424079, "grad_norm": 3.078125, "learning_rate": 0.0002926010230424995, "loss": 5.1962, "step": 4145 }, { "epoch": 1.8643306379155435, "grad_norm": 3.0625, "learning_rate": 0.00029256680922542334, "loss": 5.1803, "step": 4150 }, { "epoch": 1.866576819407008, "grad_norm": 2.875, "learning_rate": 0.0002925325187254446, "loss": 5.2128, "step": 4155 }, { "epoch": 1.8688230008984728, "grad_norm": 2.78125, "learning_rate": 0.00029249815156317605, "loss": 5.184, "step": 4160 }, { "epoch": 1.8710691823899372, "grad_norm": 3.109375, "learning_rate": 0.0002924637077592764, "loss": 5.2263, "step": 4165 }, { "epoch": 1.8733153638814017, "grad_norm": 3.15625, "learning_rate": 0.0002924291873344505, "loss": 5.1901, "step": 4170 }, { "epoch": 1.8755615453728662, "grad_norm": 2.921875, "learning_rate": 0.00029239459030944935, "loss": 5.2521, "step": 4175 }, { "epoch": 1.8778077268643307, "grad_norm": 2.9375, "learning_rate": 0.0002923599167050697, "loss": 5.167, "step": 4180 }, { "epoch": 1.8800539083557952, "grad_norm": 2.9375, "learning_rate": 0.0002923251665421547, "loss": 5.1813, "step": 4185 }, { "epoch": 1.8823000898472597, "grad_norm": 2.8125, "learning_rate": 0.0002922903398415933, "loss": 5.2392, "step": 4190 }, { "epoch": 1.8845462713387242, "grad_norm": 3.0, "learning_rate": 0.0002922554366243205, "loss": 5.2032, "step": 4195 }, { "epoch": 1.8867924528301887, "grad_norm": 3.421875, "learning_rate": 0.00029222045691131737, "loss": 5.1849, "step": 4200 }, { "epoch": 1.8890386343216532, "grad_norm": 2.90625, "learning_rate": 0.00029218540072361074, "loss": 5.1958, "step": 4205 }, { "epoch": 1.8912848158131177, "grad_norm": 2.921875, "learning_rate": 0.0002921502680822738, "loss": 5.174, "step": 4210 }, { "epoch": 1.8935309973045822, "grad_norm": 3.25, "learning_rate": 0.0002921150590084252, "loss": 5.2986, "step": 4215 }, { "epoch": 1.8957771787960467, "grad_norm": 3.125, "learning_rate": 0.00029207977352323005, "loss": 5.1103, "step": 4220 }, { "epoch": 1.8980233602875112, "grad_norm": 2.796875, "learning_rate": 0.000292044411647899, "loss": 5.2693, "step": 4225 }, { "epoch": 1.9002695417789757, "grad_norm": 3.046875, "learning_rate": 0.00029200897340368883, "loss": 5.219, "step": 4230 }, { "epoch": 1.9025157232704402, "grad_norm": 2.921875, "learning_rate": 0.0002919734588119021, "loss": 5.1556, "step": 4235 }, { "epoch": 1.9047619047619047, "grad_norm": 3.15625, "learning_rate": 0.0002919378678938874, "loss": 5.202, "step": 4240 }, { "epoch": 1.9070080862533692, "grad_norm": 2.921875, "learning_rate": 0.000291902200671039, "loss": 5.1384, "step": 4245 }, { "epoch": 1.9092542677448336, "grad_norm": 3.140625, "learning_rate": 0.00029186645716479734, "loss": 5.1446, "step": 4250 }, { "epoch": 1.9115004492362981, "grad_norm": 3.3125, "learning_rate": 0.0002918306373966484, "loss": 5.3229, "step": 4255 }, { "epoch": 1.9137466307277629, "grad_norm": 2.96875, "learning_rate": 0.00029179474138812424, "loss": 5.1863, "step": 4260 }, { "epoch": 1.9159928122192273, "grad_norm": 3.046875, "learning_rate": 0.0002917587691608026, "loss": 5.1948, "step": 4265 }, { "epoch": 1.9182389937106918, "grad_norm": 3.25, "learning_rate": 0.00029172272073630707, "loss": 5.1398, "step": 4270 }, { "epoch": 1.9204851752021563, "grad_norm": 2.90625, "learning_rate": 0.000291686596136307, "loss": 5.2248, "step": 4275 }, { "epoch": 1.9227313566936208, "grad_norm": 2.921875, "learning_rate": 0.00029165039538251786, "loss": 5.2137, "step": 4280 }, { "epoch": 1.9249775381850853, "grad_norm": 3.046875, "learning_rate": 0.00029161411849670034, "loss": 5.2118, "step": 4285 }, { "epoch": 1.9272237196765498, "grad_norm": 3.09375, "learning_rate": 0.00029157776550066134, "loss": 5.1821, "step": 4290 }, { "epoch": 1.9294699011680145, "grad_norm": 2.890625, "learning_rate": 0.0002915413364162533, "loss": 5.1385, "step": 4295 }, { "epoch": 1.931716082659479, "grad_norm": 2.9375, "learning_rate": 0.00029150483126537445, "loss": 5.1265, "step": 4300 }, { "epoch": 1.9339622641509435, "grad_norm": 2.921875, "learning_rate": 0.0002914682500699688, "loss": 5.173, "step": 4305 }, { "epoch": 1.936208445642408, "grad_norm": 3.25, "learning_rate": 0.00029143159285202597, "loss": 5.175, "step": 4310 }, { "epoch": 1.9384546271338725, "grad_norm": 2.921875, "learning_rate": 0.0002913948596335814, "loss": 5.1925, "step": 4315 }, { "epoch": 1.940700808625337, "grad_norm": 3.21875, "learning_rate": 0.00029135805043671597, "loss": 5.1982, "step": 4320 }, { "epoch": 1.9429469901168015, "grad_norm": 3.015625, "learning_rate": 0.0002913211652835567, "loss": 5.1497, "step": 4325 }, { "epoch": 1.945193171608266, "grad_norm": 2.890625, "learning_rate": 0.00029128420419627566, "loss": 5.151, "step": 4330 }, { "epoch": 1.9474393530997305, "grad_norm": 3.015625, "learning_rate": 0.00029124716719709114, "loss": 5.1051, "step": 4335 }, { "epoch": 1.949685534591195, "grad_norm": 2.9375, "learning_rate": 0.0002912100543082666, "loss": 5.1568, "step": 4340 }, { "epoch": 1.9519317160826595, "grad_norm": 2.9375, "learning_rate": 0.0002911728655521115, "loss": 5.1824, "step": 4345 }, { "epoch": 1.954177897574124, "grad_norm": 3.03125, "learning_rate": 0.00029113560095098064, "loss": 5.1908, "step": 4350 }, { "epoch": 1.9564240790655885, "grad_norm": 3.046875, "learning_rate": 0.0002910982605272745, "loss": 5.1337, "step": 4355 }, { "epoch": 1.958670260557053, "grad_norm": 2.953125, "learning_rate": 0.0002910608443034391, "loss": 5.2017, "step": 4360 }, { "epoch": 1.9609164420485174, "grad_norm": 2.84375, "learning_rate": 0.00029102335230196615, "loss": 5.131, "step": 4365 }, { "epoch": 1.963162623539982, "grad_norm": 2.875, "learning_rate": 0.00029098578454539274, "loss": 5.1247, "step": 4370 }, { "epoch": 1.9654088050314464, "grad_norm": 3.046875, "learning_rate": 0.0002909481410563017, "loss": 5.1947, "step": 4375 }, { "epoch": 1.967654986522911, "grad_norm": 2.9375, "learning_rate": 0.0002909104218573211, "loss": 5.162, "step": 4380 }, { "epoch": 1.9699011680143754, "grad_norm": 2.9375, "learning_rate": 0.00029087262697112494, "loss": 5.1051, "step": 4385 }, { "epoch": 1.97214734950584, "grad_norm": 2.9375, "learning_rate": 0.00029083475642043216, "loss": 5.1855, "step": 4390 }, { "epoch": 1.9743935309973046, "grad_norm": 3.046875, "learning_rate": 0.0002907968102280077, "loss": 5.1933, "step": 4395 }, { "epoch": 1.9766397124887691, "grad_norm": 3.234375, "learning_rate": 0.0002907587884166616, "loss": 5.1138, "step": 4400 }, { "epoch": 1.9788858939802336, "grad_norm": 3.1875, "learning_rate": 0.0002907206910092498, "loss": 5.1579, "step": 4405 }, { "epoch": 1.9811320754716981, "grad_norm": 2.96875, "learning_rate": 0.000290682518028673, "loss": 5.1163, "step": 4410 }, { "epoch": 1.9833782569631626, "grad_norm": 2.921875, "learning_rate": 0.00029064426949787807, "loss": 5.1887, "step": 4415 }, { "epoch": 1.985624438454627, "grad_norm": 2.984375, "learning_rate": 0.0002906059454398567, "loss": 5.2164, "step": 4420 }, { "epoch": 1.9878706199460916, "grad_norm": 3.125, "learning_rate": 0.0002905675458776464, "loss": 5.0996, "step": 4425 }, { "epoch": 1.9901168014375563, "grad_norm": 3.03125, "learning_rate": 0.0002905290708343298, "loss": 5.1728, "step": 4430 }, { "epoch": 1.9923629829290208, "grad_norm": 3.0, "learning_rate": 0.00029049052033303514, "loss": 5.1126, "step": 4435 }, { "epoch": 1.9946091644204853, "grad_norm": 3.03125, "learning_rate": 0.00029045189439693564, "loss": 5.1486, "step": 4440 }, { "epoch": 1.9968553459119498, "grad_norm": 2.890625, "learning_rate": 0.00029041319304925036, "loss": 5.098, "step": 4445 }, { "epoch": 1.9991015274034143, "grad_norm": 3.0, "learning_rate": 0.0002903744163132432, "loss": 5.1236, "step": 4450 }, { "epoch": 2.001347708894879, "grad_norm": 3.171875, "learning_rate": 0.00029033556421222383, "loss": 5.1441, "step": 4455 }, { "epoch": 2.0035938903863433, "grad_norm": 2.984375, "learning_rate": 0.0002902966367695468, "loss": 5.0451, "step": 4460 }, { "epoch": 2.0058400718778078, "grad_norm": 2.859375, "learning_rate": 0.00029025763400861236, "loss": 5.104, "step": 4465 }, { "epoch": 2.0080862533692723, "grad_norm": 2.9375, "learning_rate": 0.00029021855595286574, "loss": 5.0897, "step": 4470 }, { "epoch": 2.0103324348607368, "grad_norm": 2.90625, "learning_rate": 0.0002901794026257975, "loss": 4.9517, "step": 4475 }, { "epoch": 2.0125786163522013, "grad_norm": 2.859375, "learning_rate": 0.0002901401740509435, "loss": 4.9774, "step": 4480 }, { "epoch": 2.0148247978436657, "grad_norm": 2.96875, "learning_rate": 0.0002901008702518848, "loss": 4.986, "step": 4485 }, { "epoch": 2.0170709793351302, "grad_norm": 3.015625, "learning_rate": 0.0002900614912522476, "loss": 5.0134, "step": 4490 }, { "epoch": 2.0193171608265947, "grad_norm": 3.3125, "learning_rate": 0.0002900220370757035, "loss": 5.0922, "step": 4495 }, { "epoch": 2.0215633423180592, "grad_norm": 2.8125, "learning_rate": 0.0002899825077459692, "loss": 5.0198, "step": 4500 }, { "epoch": 2.0238095238095237, "grad_norm": 3.203125, "learning_rate": 0.0002899429032868064, "loss": 5.1019, "step": 4505 }, { "epoch": 2.026055705300988, "grad_norm": 3.078125, "learning_rate": 0.0002899032237220223, "loss": 5.0552, "step": 4510 }, { "epoch": 2.0283018867924527, "grad_norm": 2.859375, "learning_rate": 0.0002898634690754689, "loss": 5.0344, "step": 4515 }, { "epoch": 2.030548068283917, "grad_norm": 2.9375, "learning_rate": 0.0002898236393710436, "loss": 5.04, "step": 4520 }, { "epoch": 2.0327942497753817, "grad_norm": 3.109375, "learning_rate": 0.00028978373463268883, "loss": 5.0868, "step": 4525 }, { "epoch": 2.035040431266846, "grad_norm": 3.234375, "learning_rate": 0.00028974375488439194, "loss": 5.0977, "step": 4530 }, { "epoch": 2.0372866127583107, "grad_norm": 2.96875, "learning_rate": 0.0002897037001501857, "loss": 5.0351, "step": 4535 }, { "epoch": 2.039532794249775, "grad_norm": 3.5, "learning_rate": 0.00028966357045414774, "loss": 5.115, "step": 4540 }, { "epoch": 2.0417789757412397, "grad_norm": 2.875, "learning_rate": 0.00028962336582040086, "loss": 5.137, "step": 4545 }, { "epoch": 2.0440251572327046, "grad_norm": 3.15625, "learning_rate": 0.0002895830862731127, "loss": 5.0389, "step": 4550 }, { "epoch": 2.046271338724169, "grad_norm": 2.921875, "learning_rate": 0.0002895427318364963, "loss": 5.045, "step": 4555 }, { "epoch": 2.0485175202156336, "grad_norm": 3.0, "learning_rate": 0.00028950230253480935, "loss": 5.0665, "step": 4560 }, { "epoch": 2.050763701707098, "grad_norm": 3.0, "learning_rate": 0.00028946179839235475, "loss": 4.9852, "step": 4565 }, { "epoch": 2.0530098831985626, "grad_norm": 2.890625, "learning_rate": 0.0002894212194334803, "loss": 5.1119, "step": 4570 }, { "epoch": 2.055256064690027, "grad_norm": 2.921875, "learning_rate": 0.00028938056568257874, "loss": 5.0799, "step": 4575 }, { "epoch": 2.0575022461814916, "grad_norm": 3.125, "learning_rate": 0.000289339837164088, "loss": 5.0597, "step": 4580 }, { "epoch": 2.059748427672956, "grad_norm": 2.859375, "learning_rate": 0.0002892990339024907, "loss": 5.0044, "step": 4585 }, { "epoch": 2.0619946091644206, "grad_norm": 3.09375, "learning_rate": 0.0002892581559223144, "loss": 5.0103, "step": 4590 }, { "epoch": 2.064240790655885, "grad_norm": 3.140625, "learning_rate": 0.00028921720324813185, "loss": 5.0157, "step": 4595 }, { "epoch": 2.0664869721473496, "grad_norm": 2.875, "learning_rate": 0.0002891761759045603, "loss": 5.0655, "step": 4600 }, { "epoch": 2.068733153638814, "grad_norm": 3.0, "learning_rate": 0.0002891350739162622, "loss": 5.1106, "step": 4605 }, { "epoch": 2.0709793351302785, "grad_norm": 3.125, "learning_rate": 0.0002890938973079447, "loss": 5.129, "step": 4610 }, { "epoch": 2.073225516621743, "grad_norm": 3.125, "learning_rate": 0.00028905264610436, "loss": 5.031, "step": 4615 }, { "epoch": 2.0754716981132075, "grad_norm": 2.859375, "learning_rate": 0.00028901132033030475, "loss": 5.0716, "step": 4620 }, { "epoch": 2.077717879604672, "grad_norm": 2.984375, "learning_rate": 0.000288969920010621, "loss": 5.0758, "step": 4625 }, { "epoch": 2.0799640610961365, "grad_norm": 3.0625, "learning_rate": 0.000288928445170195, "loss": 5.0436, "step": 4630 }, { "epoch": 2.082210242587601, "grad_norm": 2.84375, "learning_rate": 0.00028888689583395826, "loss": 5.0841, "step": 4635 }, { "epoch": 2.0844564240790655, "grad_norm": 3.140625, "learning_rate": 0.00028884527202688683, "loss": 5.0446, "step": 4640 }, { "epoch": 2.08670260557053, "grad_norm": 3.015625, "learning_rate": 0.0002888035737740016, "loss": 4.9765, "step": 4645 }, { "epoch": 2.0889487870619945, "grad_norm": 2.96875, "learning_rate": 0.00028876180110036823, "loss": 5.1058, "step": 4650 }, { "epoch": 2.091194968553459, "grad_norm": 2.953125, "learning_rate": 0.0002887199540310971, "loss": 5.0546, "step": 4655 }, { "epoch": 2.0934411500449235, "grad_norm": 2.828125, "learning_rate": 0.00028867803259134326, "loss": 4.9612, "step": 4660 }, { "epoch": 2.095687331536388, "grad_norm": 3.046875, "learning_rate": 0.00028863603680630653, "loss": 5.0064, "step": 4665 }, { "epoch": 2.0979335130278525, "grad_norm": 2.984375, "learning_rate": 0.00028859396670123135, "loss": 5.0299, "step": 4670 }, { "epoch": 2.100179694519317, "grad_norm": 3.171875, "learning_rate": 0.000288551822301407, "loss": 5.0889, "step": 4675 }, { "epoch": 2.1024258760107815, "grad_norm": 3.0, "learning_rate": 0.00028850960363216714, "loss": 5.0944, "step": 4680 }, { "epoch": 2.1046720575022464, "grad_norm": 2.953125, "learning_rate": 0.0002884673107188904, "loss": 4.9692, "step": 4685 }, { "epoch": 2.106918238993711, "grad_norm": 3.140625, "learning_rate": 0.00028842494358699973, "loss": 4.9994, "step": 4690 }, { "epoch": 2.1091644204851754, "grad_norm": 3.03125, "learning_rate": 0.000288382502261963, "loss": 5.0891, "step": 4695 }, { "epoch": 2.11141060197664, "grad_norm": 3.125, "learning_rate": 0.0002883399867692924, "loss": 5.0812, "step": 4700 }, { "epoch": 2.1136567834681044, "grad_norm": 3.1875, "learning_rate": 0.00028829739713454483, "loss": 5.0365, "step": 4705 }, { "epoch": 2.115902964959569, "grad_norm": 3.0625, "learning_rate": 0.0002882547333833218, "loss": 5.0654, "step": 4710 }, { "epoch": 2.1181491464510334, "grad_norm": 2.859375, "learning_rate": 0.00028821199554126934, "loss": 4.9854, "step": 4715 }, { "epoch": 2.120395327942498, "grad_norm": 3.046875, "learning_rate": 0.0002881691836340779, "loss": 5.0865, "step": 4720 }, { "epoch": 2.1226415094339623, "grad_norm": 2.9375, "learning_rate": 0.00028812629768748267, "loss": 5.045, "step": 4725 }, { "epoch": 2.124887690925427, "grad_norm": 3.390625, "learning_rate": 0.00028808333772726316, "loss": 5.0897, "step": 4730 }, { "epoch": 2.1271338724168913, "grad_norm": 3.265625, "learning_rate": 0.00028804030377924345, "loss": 5.0187, "step": 4735 }, { "epoch": 2.129380053908356, "grad_norm": 2.9375, "learning_rate": 0.0002879971958692921, "loss": 5.0898, "step": 4740 }, { "epoch": 2.1316262353998203, "grad_norm": 2.953125, "learning_rate": 0.00028795401402332215, "loss": 5.0058, "step": 4745 }, { "epoch": 2.133872416891285, "grad_norm": 2.984375, "learning_rate": 0.00028791075826729097, "loss": 5.0468, "step": 4750 }, { "epoch": 2.1361185983827493, "grad_norm": 3.109375, "learning_rate": 0.00028786742862720055, "loss": 5.0241, "step": 4755 }, { "epoch": 2.138364779874214, "grad_norm": 3.0625, "learning_rate": 0.0002878240251290971, "loss": 5.1405, "step": 4760 }, { "epoch": 2.1406109613656783, "grad_norm": 2.828125, "learning_rate": 0.0002877805477990713, "loss": 5.0095, "step": 4765 }, { "epoch": 2.142857142857143, "grad_norm": 3.203125, "learning_rate": 0.00028773699666325835, "loss": 5.0425, "step": 4770 }, { "epoch": 2.1451033243486073, "grad_norm": 3.078125, "learning_rate": 0.00028769337174783754, "loss": 5.0217, "step": 4775 }, { "epoch": 2.147349505840072, "grad_norm": 3.078125, "learning_rate": 0.0002876496730790327, "loss": 5.0803, "step": 4780 }, { "epoch": 2.1495956873315363, "grad_norm": 3.484375, "learning_rate": 0.00028760590068311194, "loss": 5.0487, "step": 4785 }, { "epoch": 2.1518418688230008, "grad_norm": 3.0625, "learning_rate": 0.00028756205458638776, "loss": 5.0174, "step": 4790 }, { "epoch": 2.1540880503144653, "grad_norm": 3.421875, "learning_rate": 0.00028751813481521694, "loss": 5.0855, "step": 4795 }, { "epoch": 2.1563342318059298, "grad_norm": 3.140625, "learning_rate": 0.00028747414139600034, "loss": 5.0706, "step": 4800 }, { "epoch": 2.1585804132973943, "grad_norm": 2.984375, "learning_rate": 0.0002874300743551835, "loss": 5.1177, "step": 4805 }, { "epoch": 2.1608265947888587, "grad_norm": 3.328125, "learning_rate": 0.0002873859337192558, "loss": 5.0589, "step": 4810 }, { "epoch": 2.1630727762803232, "grad_norm": 3.140625, "learning_rate": 0.00028734171951475104, "loss": 5.0959, "step": 4815 }, { "epoch": 2.165318957771788, "grad_norm": 3.078125, "learning_rate": 0.00028729743176824735, "loss": 5.0754, "step": 4820 }, { "epoch": 2.1675651392632527, "grad_norm": 3.03125, "learning_rate": 0.0002872530705063669, "loss": 5.0442, "step": 4825 }, { "epoch": 2.169811320754717, "grad_norm": 3.421875, "learning_rate": 0.00028720863575577615, "loss": 4.9739, "step": 4830 }, { "epoch": 2.1720575022461817, "grad_norm": 3.09375, "learning_rate": 0.0002871641275431856, "loss": 5.0175, "step": 4835 }, { "epoch": 2.174303683737646, "grad_norm": 3.15625, "learning_rate": 0.0002871195458953501, "loss": 5.0096, "step": 4840 }, { "epoch": 2.1765498652291106, "grad_norm": 3.125, "learning_rate": 0.0002870748908390686, "loss": 5.0525, "step": 4845 }, { "epoch": 2.178796046720575, "grad_norm": 2.890625, "learning_rate": 0.0002870301624011839, "loss": 5.0469, "step": 4850 }, { "epoch": 2.1810422282120396, "grad_norm": 3.0, "learning_rate": 0.0002869853606085834, "loss": 5.0679, "step": 4855 }, { "epoch": 2.183288409703504, "grad_norm": 2.9375, "learning_rate": 0.00028694048548819816, "loss": 5.0369, "step": 4860 }, { "epoch": 2.1855345911949686, "grad_norm": 3.078125, "learning_rate": 0.00028689553706700356, "loss": 5.0443, "step": 4865 }, { "epoch": 2.187780772686433, "grad_norm": 3.0625, "learning_rate": 0.000286850515372019, "loss": 4.9984, "step": 4870 }, { "epoch": 2.1900269541778976, "grad_norm": 2.9375, "learning_rate": 0.00028680542043030787, "loss": 4.9734, "step": 4875 }, { "epoch": 2.192273135669362, "grad_norm": 2.953125, "learning_rate": 0.0002867602522689776, "loss": 5.0096, "step": 4880 }, { "epoch": 2.1945193171608266, "grad_norm": 3.265625, "learning_rate": 0.00028671501091517967, "loss": 4.9606, "step": 4885 }, { "epoch": 2.196765498652291, "grad_norm": 3.171875, "learning_rate": 0.0002866696963961096, "loss": 5.072, "step": 4890 }, { "epoch": 2.1990116801437556, "grad_norm": 2.96875, "learning_rate": 0.0002866243087390067, "loss": 5.0319, "step": 4895 }, { "epoch": 2.20125786163522, "grad_norm": 3.125, "learning_rate": 0.0002865788479711545, "loss": 5.0198, "step": 4900 }, { "epoch": 2.2035040431266846, "grad_norm": 3.015625, "learning_rate": 0.00028653331411988034, "loss": 5.001, "step": 4905 }, { "epoch": 2.205750224618149, "grad_norm": 2.890625, "learning_rate": 0.00028648770721255543, "loss": 5.0652, "step": 4910 }, { "epoch": 2.2079964061096136, "grad_norm": 3.0625, "learning_rate": 0.000286442027276595, "loss": 4.9551, "step": 4915 }, { "epoch": 2.210242587601078, "grad_norm": 3.09375, "learning_rate": 0.0002863962743394583, "loss": 5.0335, "step": 4920 }, { "epoch": 2.2124887690925426, "grad_norm": 3.265625, "learning_rate": 0.00028635044842864805, "loss": 5.0267, "step": 4925 }, { "epoch": 2.214734950584007, "grad_norm": 3.34375, "learning_rate": 0.0002863045495717113, "loss": 5.0602, "step": 4930 }, { "epoch": 2.2169811320754715, "grad_norm": 3.09375, "learning_rate": 0.0002862585777962387, "loss": 5.0753, "step": 4935 }, { "epoch": 2.219227313566936, "grad_norm": 3.09375, "learning_rate": 0.0002862125331298648, "loss": 5.0716, "step": 4940 }, { "epoch": 2.2214734950584005, "grad_norm": 3.328125, "learning_rate": 0.0002861664156002679, "loss": 5.0408, "step": 4945 }, { "epoch": 2.223719676549865, "grad_norm": 3.25, "learning_rate": 0.00028612022523517015, "loss": 5.0705, "step": 4950 }, { "epoch": 2.22596585804133, "grad_norm": 2.921875, "learning_rate": 0.0002860739620623375, "loss": 5.06, "step": 4955 }, { "epoch": 2.2282120395327945, "grad_norm": 2.953125, "learning_rate": 0.00028602762610957966, "loss": 5.0575, "step": 4960 }, { "epoch": 2.230458221024259, "grad_norm": 2.90625, "learning_rate": 0.0002859812174047501, "loss": 5.0911, "step": 4965 }, { "epoch": 2.2327044025157234, "grad_norm": 3.0625, "learning_rate": 0.00028593473597574595, "loss": 5.0714, "step": 4970 }, { "epoch": 2.234950584007188, "grad_norm": 3.15625, "learning_rate": 0.00028588818185050816, "loss": 4.9425, "step": 4975 }, { "epoch": 2.2371967654986524, "grad_norm": 3.359375, "learning_rate": 0.00028584155505702124, "loss": 5.0257, "step": 4980 }, { "epoch": 2.239442946990117, "grad_norm": 3.015625, "learning_rate": 0.00028579485562331354, "loss": 4.9997, "step": 4985 }, { "epoch": 2.2416891284815814, "grad_norm": 3.0, "learning_rate": 0.00028574808357745697, "loss": 5.136, "step": 4990 }, { "epoch": 2.243935309973046, "grad_norm": 3.046875, "learning_rate": 0.0002857012389475671, "loss": 4.9934, "step": 4995 }, { "epoch": 2.2461814914645104, "grad_norm": 3.203125, "learning_rate": 0.0002856543217618033, "loss": 5.0804, "step": 5000 }, { "epoch": 2.2461814914645104, "eval_loss": 5.077792644500732, "eval_runtime": 16.1311, "eval_samples_per_second": 1922.556, "eval_steps_per_second": 240.343, "step": 5000 }, { "epoch": 2.248427672955975, "grad_norm": 3.21875, "learning_rate": 0.00028560733204836814, "loss": 5.0199, "step": 5005 }, { "epoch": 2.2506738544474394, "grad_norm": 3.03125, "learning_rate": 0.0002855602698355083, "loss": 4.9319, "step": 5010 }, { "epoch": 2.252920035938904, "grad_norm": 3.28125, "learning_rate": 0.0002855131351515136, "loss": 5.014, "step": 5015 }, { "epoch": 2.2551662174303684, "grad_norm": 2.96875, "learning_rate": 0.00028546592802471783, "loss": 4.9945, "step": 5020 }, { "epoch": 2.257412398921833, "grad_norm": 3.03125, "learning_rate": 0.0002854186484834979, "loss": 5.0318, "step": 5025 }, { "epoch": 2.2596585804132974, "grad_norm": 3.015625, "learning_rate": 0.0002853712965562747, "loss": 4.9928, "step": 5030 }, { "epoch": 2.261904761904762, "grad_norm": 3.25, "learning_rate": 0.0002853238722715122, "loss": 5.0388, "step": 5035 }, { "epoch": 2.2641509433962264, "grad_norm": 3.1875, "learning_rate": 0.0002852763756577181, "loss": 5.0577, "step": 5040 }, { "epoch": 2.266397124887691, "grad_norm": 2.984375, "learning_rate": 0.0002852288067434437, "loss": 5.0389, "step": 5045 }, { "epoch": 2.2686433063791553, "grad_norm": 3.265625, "learning_rate": 0.00028518116555728334, "loss": 5.0881, "step": 5050 }, { "epoch": 2.27088948787062, "grad_norm": 3.140625, "learning_rate": 0.0002851334521278753, "loss": 5.0352, "step": 5055 }, { "epoch": 2.2731356693620843, "grad_norm": 2.921875, "learning_rate": 0.0002850856664839009, "loss": 5.0594, "step": 5060 }, { "epoch": 2.275381850853549, "grad_norm": 3.078125, "learning_rate": 0.0002850378086540852, "loss": 5.0079, "step": 5065 }, { "epoch": 2.2776280323450133, "grad_norm": 2.984375, "learning_rate": 0.00028498987866719627, "loss": 4.9529, "step": 5070 }, { "epoch": 2.279874213836478, "grad_norm": 3.09375, "learning_rate": 0.00028494187655204594, "loss": 5.0234, "step": 5075 }, { "epoch": 2.2821203953279423, "grad_norm": 3.640625, "learning_rate": 0.00028489380233748913, "loss": 4.9815, "step": 5080 }, { "epoch": 2.284366576819407, "grad_norm": 3.03125, "learning_rate": 0.0002848456560524242, "loss": 5.0092, "step": 5085 }, { "epoch": 2.2866127583108717, "grad_norm": 2.78125, "learning_rate": 0.0002847974377257927, "loss": 5.024, "step": 5090 }, { "epoch": 2.288858939802336, "grad_norm": 3.078125, "learning_rate": 0.0002847491473865799, "loss": 4.9528, "step": 5095 }, { "epoch": 2.2911051212938007, "grad_norm": 3.15625, "learning_rate": 0.0002847007850638138, "loss": 5.0185, "step": 5100 }, { "epoch": 2.2933513027852652, "grad_norm": 3.078125, "learning_rate": 0.00028465235078656607, "loss": 5.066, "step": 5105 }, { "epoch": 2.2955974842767297, "grad_norm": 3.109375, "learning_rate": 0.00028460384458395147, "loss": 5.0169, "step": 5110 }, { "epoch": 2.297843665768194, "grad_norm": 3.015625, "learning_rate": 0.000284555266485128, "loss": 4.9659, "step": 5115 }, { "epoch": 2.3000898472596587, "grad_norm": 3.3125, "learning_rate": 0.00028450661651929695, "loss": 4.9802, "step": 5120 }, { "epoch": 2.302336028751123, "grad_norm": 3.09375, "learning_rate": 0.00028445789471570273, "loss": 5.035, "step": 5125 }, { "epoch": 2.3045822102425877, "grad_norm": 2.9375, "learning_rate": 0.00028440910110363296, "loss": 5.0922, "step": 5130 }, { "epoch": 2.306828391734052, "grad_norm": 3.046875, "learning_rate": 0.00028436023571241855, "loss": 5.0105, "step": 5135 }, { "epoch": 2.3090745732255167, "grad_norm": 3.3125, "learning_rate": 0.0002843112985714333, "loss": 4.9729, "step": 5140 }, { "epoch": 2.311320754716981, "grad_norm": 2.953125, "learning_rate": 0.00028426228971009426, "loss": 5.0841, "step": 5145 }, { "epoch": 2.3135669362084457, "grad_norm": 3.265625, "learning_rate": 0.0002842132091578618, "loss": 5.0193, "step": 5150 }, { "epoch": 2.31581311769991, "grad_norm": 3.03125, "learning_rate": 0.000284164056944239, "loss": 5.0195, "step": 5155 }, { "epoch": 2.3180592991913747, "grad_norm": 2.90625, "learning_rate": 0.00028411483309877234, "loss": 4.9968, "step": 5160 }, { "epoch": 2.320305480682839, "grad_norm": 3.015625, "learning_rate": 0.0002840655376510512, "loss": 5.0199, "step": 5165 }, { "epoch": 2.3225516621743036, "grad_norm": 2.96875, "learning_rate": 0.000284016170630708, "loss": 5.0053, "step": 5170 }, { "epoch": 2.324797843665768, "grad_norm": 3.078125, "learning_rate": 0.00028396673206741827, "loss": 5.0486, "step": 5175 }, { "epoch": 2.3270440251572326, "grad_norm": 3.15625, "learning_rate": 0.0002839172219909005, "loss": 5.0429, "step": 5180 }, { "epoch": 2.329290206648697, "grad_norm": 3.15625, "learning_rate": 0.0002838676404309162, "loss": 4.9361, "step": 5185 }, { "epoch": 2.3315363881401616, "grad_norm": 2.921875, "learning_rate": 0.00028381798741726965, "loss": 4.9766, "step": 5190 }, { "epoch": 2.333782569631626, "grad_norm": 3.21875, "learning_rate": 0.0002837682629798084, "loss": 4.9904, "step": 5195 }, { "epoch": 2.3360287511230906, "grad_norm": 3.078125, "learning_rate": 0.0002837184671484227, "loss": 5.0596, "step": 5200 }, { "epoch": 2.338274932614555, "grad_norm": 3.046875, "learning_rate": 0.0002836685999530459, "loss": 4.986, "step": 5205 }, { "epoch": 2.3405211141060196, "grad_norm": 2.96875, "learning_rate": 0.0002836186614236541, "loss": 4.9947, "step": 5210 }, { "epoch": 2.342767295597484, "grad_norm": 2.96875, "learning_rate": 0.0002835686515902663, "loss": 4.9349, "step": 5215 }, { "epoch": 2.3450134770889486, "grad_norm": 3.109375, "learning_rate": 0.0002835185704829443, "loss": 4.9609, "step": 5220 }, { "epoch": 2.3472596585804135, "grad_norm": 3.59375, "learning_rate": 0.000283468418131793, "loss": 5.0416, "step": 5225 }, { "epoch": 2.3495058400718776, "grad_norm": 3.1875, "learning_rate": 0.0002834181945669599, "loss": 5.0325, "step": 5230 }, { "epoch": 2.3517520215633425, "grad_norm": 3.0625, "learning_rate": 0.0002833678998186354, "loss": 5.0163, "step": 5235 }, { "epoch": 2.353998203054807, "grad_norm": 3.0625, "learning_rate": 0.0002833175339170525, "loss": 4.949, "step": 5240 }, { "epoch": 2.3562443845462715, "grad_norm": 3.140625, "learning_rate": 0.0002832670968924873, "loss": 4.976, "step": 5245 }, { "epoch": 2.358490566037736, "grad_norm": 3.109375, "learning_rate": 0.0002832165887752584, "loss": 5.017, "step": 5250 }, { "epoch": 2.3607367475292005, "grad_norm": 3.0625, "learning_rate": 0.00028316600959572727, "loss": 5.0021, "step": 5255 }, { "epoch": 2.362982929020665, "grad_norm": 3.03125, "learning_rate": 0.000283115359384298, "loss": 5.0264, "step": 5260 }, { "epoch": 2.3652291105121295, "grad_norm": 3.21875, "learning_rate": 0.00028306463817141743, "loss": 5.0308, "step": 5265 }, { "epoch": 2.367475292003594, "grad_norm": 3.28125, "learning_rate": 0.00028301384598757506, "loss": 5.0511, "step": 5270 }, { "epoch": 2.3697214734950585, "grad_norm": 2.8125, "learning_rate": 0.00028296298286330305, "loss": 5.0266, "step": 5275 }, { "epoch": 2.371967654986523, "grad_norm": 3.0625, "learning_rate": 0.0002829120488291763, "loss": 4.9507, "step": 5280 }, { "epoch": 2.3742138364779874, "grad_norm": 3.0625, "learning_rate": 0.0002828610439158122, "loss": 5.0018, "step": 5285 }, { "epoch": 2.376460017969452, "grad_norm": 2.984375, "learning_rate": 0.0002828099681538708, "loss": 5.0241, "step": 5290 }, { "epoch": 2.3787061994609164, "grad_norm": 3.109375, "learning_rate": 0.0002827588215740547, "loss": 5.0567, "step": 5295 }, { "epoch": 2.380952380952381, "grad_norm": 3.0, "learning_rate": 0.0002827076042071092, "loss": 4.9711, "step": 5300 }, { "epoch": 2.3831985624438454, "grad_norm": 3.078125, "learning_rate": 0.000282656316083822, "loss": 5.0117, "step": 5305 }, { "epoch": 2.38544474393531, "grad_norm": 3.03125, "learning_rate": 0.0002826049572350234, "loss": 4.9413, "step": 5310 }, { "epoch": 2.3876909254267744, "grad_norm": 3.234375, "learning_rate": 0.00028255352769158623, "loss": 5.0217, "step": 5315 }, { "epoch": 2.389937106918239, "grad_norm": 3.203125, "learning_rate": 0.0002825020274844257, "loss": 4.9966, "step": 5320 }, { "epoch": 2.3921832884097034, "grad_norm": 3.0625, "learning_rate": 0.00028245045664449973, "loss": 5.0136, "step": 5325 }, { "epoch": 2.394429469901168, "grad_norm": 3.109375, "learning_rate": 0.00028239881520280847, "loss": 5.0281, "step": 5330 }, { "epoch": 2.3966756513926324, "grad_norm": 3.609375, "learning_rate": 0.00028234710319039466, "loss": 5.0617, "step": 5335 }, { "epoch": 2.398921832884097, "grad_norm": 3.109375, "learning_rate": 0.00028229532063834336, "loss": 4.941, "step": 5340 }, { "epoch": 2.4011680143755614, "grad_norm": 3.171875, "learning_rate": 0.00028224346757778205, "loss": 5.0323, "step": 5345 }, { "epoch": 2.403414195867026, "grad_norm": 3.0, "learning_rate": 0.00028219154403988063, "loss": 4.9451, "step": 5350 }, { "epoch": 2.4056603773584904, "grad_norm": 2.859375, "learning_rate": 0.0002821395500558515, "loss": 5.037, "step": 5355 }, { "epoch": 2.4079065588499553, "grad_norm": 3.046875, "learning_rate": 0.000282087485656949, "loss": 4.9873, "step": 5360 }, { "epoch": 2.4101527403414194, "grad_norm": 2.921875, "learning_rate": 0.00028203535087447025, "loss": 5.0177, "step": 5365 }, { "epoch": 2.4123989218328843, "grad_norm": 3.171875, "learning_rate": 0.00028198314573975444, "loss": 5.1103, "step": 5370 }, { "epoch": 2.414645103324349, "grad_norm": 3.015625, "learning_rate": 0.00028193087028418305, "loss": 5.0471, "step": 5375 }, { "epoch": 2.4168912848158133, "grad_norm": 3.3125, "learning_rate": 0.00028187852453917994, "loss": 5.021, "step": 5380 }, { "epoch": 2.4191374663072778, "grad_norm": 2.9375, "learning_rate": 0.0002818261085362111, "loss": 4.9735, "step": 5385 }, { "epoch": 2.4213836477987423, "grad_norm": 3.296875, "learning_rate": 0.00028177362230678485, "loss": 4.9797, "step": 5390 }, { "epoch": 2.4236298292902068, "grad_norm": 3.0, "learning_rate": 0.00028172106588245175, "loss": 4.9648, "step": 5395 }, { "epoch": 2.4258760107816713, "grad_norm": 3.0, "learning_rate": 0.00028166843929480436, "loss": 5.1115, "step": 5400 }, { "epoch": 2.4281221922731357, "grad_norm": 5.5, "learning_rate": 0.00028161574257547765, "loss": 4.9531, "step": 5405 }, { "epoch": 2.4303683737646002, "grad_norm": 2.984375, "learning_rate": 0.00028156297575614864, "loss": 4.9762, "step": 5410 }, { "epoch": 2.4326145552560647, "grad_norm": 3.03125, "learning_rate": 0.00028151013886853647, "loss": 5.0298, "step": 5415 }, { "epoch": 2.4348607367475292, "grad_norm": 2.921875, "learning_rate": 0.0002814572319444024, "loss": 5.0413, "step": 5420 }, { "epoch": 2.4371069182389937, "grad_norm": 3.109375, "learning_rate": 0.0002814042550155499, "loss": 5.0045, "step": 5425 }, { "epoch": 2.439353099730458, "grad_norm": 2.921875, "learning_rate": 0.00028135120811382435, "loss": 5.0008, "step": 5430 }, { "epoch": 2.4415992812219227, "grad_norm": 3.15625, "learning_rate": 0.0002812980912711133, "loss": 5.0294, "step": 5435 }, { "epoch": 2.443845462713387, "grad_norm": 3.359375, "learning_rate": 0.00028124490451934635, "loss": 5.0769, "step": 5440 }, { "epoch": 2.4460916442048517, "grad_norm": 3.421875, "learning_rate": 0.000281191647890495, "loss": 5.0096, "step": 5445 }, { "epoch": 2.448337825696316, "grad_norm": 3.03125, "learning_rate": 0.0002811383214165731, "loss": 4.9805, "step": 5450 }, { "epoch": 2.4505840071877807, "grad_norm": 3.046875, "learning_rate": 0.000281084925129636, "loss": 4.949, "step": 5455 }, { "epoch": 2.452830188679245, "grad_norm": 2.9375, "learning_rate": 0.0002810314590617813, "loss": 4.9011, "step": 5460 }, { "epoch": 2.4550763701707097, "grad_norm": 2.953125, "learning_rate": 0.00028097792324514853, "loss": 4.9974, "step": 5465 }, { "epoch": 2.457322551662174, "grad_norm": 3.234375, "learning_rate": 0.0002809243177119191, "loss": 4.9817, "step": 5470 }, { "epoch": 2.4595687331536387, "grad_norm": 2.984375, "learning_rate": 0.0002808706424943164, "loss": 5.03, "step": 5475 }, { "epoch": 2.461814914645103, "grad_norm": 2.890625, "learning_rate": 0.00028081689762460553, "loss": 5.0556, "step": 5480 }, { "epoch": 2.4640610961365677, "grad_norm": 3.015625, "learning_rate": 0.00028076308313509365, "loss": 5.0748, "step": 5485 }, { "epoch": 2.466307277628032, "grad_norm": 2.984375, "learning_rate": 0.00028070919905812976, "loss": 4.9664, "step": 5490 }, { "epoch": 2.468553459119497, "grad_norm": 3.109375, "learning_rate": 0.00028065524542610456, "loss": 5.0278, "step": 5495 }, { "epoch": 2.470799640610961, "grad_norm": 2.984375, "learning_rate": 0.00028060122227145065, "loss": 5.0414, "step": 5500 }, { "epoch": 2.473045822102426, "grad_norm": 3.4375, "learning_rate": 0.0002805471296266424, "loss": 4.9741, "step": 5505 }, { "epoch": 2.4752920035938906, "grad_norm": 2.96875, "learning_rate": 0.00028049296752419593, "loss": 4.9957, "step": 5510 }, { "epoch": 2.477538185085355, "grad_norm": 2.90625, "learning_rate": 0.00028043873599666925, "loss": 5.0025, "step": 5515 }, { "epoch": 2.4797843665768196, "grad_norm": 2.921875, "learning_rate": 0.0002803844350766618, "loss": 4.9335, "step": 5520 }, { "epoch": 2.482030548068284, "grad_norm": 3.1875, "learning_rate": 0.0002803300647968152, "loss": 4.9839, "step": 5525 }, { "epoch": 2.4842767295597485, "grad_norm": 3.25, "learning_rate": 0.00028027562518981216, "loss": 4.9595, "step": 5530 }, { "epoch": 2.486522911051213, "grad_norm": 2.96875, "learning_rate": 0.0002802211162883776, "loss": 4.9742, "step": 5535 }, { "epoch": 2.4887690925426775, "grad_norm": 3.390625, "learning_rate": 0.0002801665381252779, "loss": 5.0239, "step": 5540 }, { "epoch": 2.491015274034142, "grad_norm": 2.890625, "learning_rate": 0.0002801118907333209, "loss": 4.9802, "step": 5545 }, { "epoch": 2.4932614555256065, "grad_norm": 3.015625, "learning_rate": 0.0002800571741453564, "loss": 4.9646, "step": 5550 }, { "epoch": 2.495507637017071, "grad_norm": 3.03125, "learning_rate": 0.0002800023883942755, "loss": 4.9662, "step": 5555 }, { "epoch": 2.4977538185085355, "grad_norm": 2.890625, "learning_rate": 0.000279947533513011, "loss": 5.0099, "step": 5560 }, { "epoch": 2.5, "grad_norm": 3.03125, "learning_rate": 0.0002798926095345373, "loss": 4.9691, "step": 5565 }, { "epoch": 2.5022461814914645, "grad_norm": 2.9375, "learning_rate": 0.00027983761649187015, "loss": 5.0081, "step": 5570 }, { "epoch": 2.504492362982929, "grad_norm": 3.0, "learning_rate": 0.00027978255441806713, "loss": 4.9875, "step": 5575 }, { "epoch": 2.5067385444743935, "grad_norm": 2.890625, "learning_rate": 0.00027972742334622696, "loss": 5.0318, "step": 5580 }, { "epoch": 2.508984725965858, "grad_norm": 2.984375, "learning_rate": 0.00027967222330949006, "loss": 4.9564, "step": 5585 }, { "epoch": 2.5112309074573225, "grad_norm": 2.96875, "learning_rate": 0.00027961695434103827, "loss": 4.9256, "step": 5590 }, { "epoch": 2.513477088948787, "grad_norm": 3.015625, "learning_rate": 0.00027956161647409486, "loss": 5.0247, "step": 5595 }, { "epoch": 2.5157232704402515, "grad_norm": 3.234375, "learning_rate": 0.00027950620974192446, "loss": 5.0235, "step": 5600 }, { "epoch": 2.517969451931716, "grad_norm": 3.125, "learning_rate": 0.00027945073417783315, "loss": 4.956, "step": 5605 }, { "epoch": 2.5202156334231804, "grad_norm": 3.046875, "learning_rate": 0.0002793951898151684, "loss": 4.94, "step": 5610 }, { "epoch": 2.522461814914645, "grad_norm": 3.125, "learning_rate": 0.00027933957668731897, "loss": 5.0343, "step": 5615 }, { "epoch": 2.52470799640611, "grad_norm": 3.1875, "learning_rate": 0.0002792838948277151, "loss": 4.958, "step": 5620 }, { "epoch": 2.526954177897574, "grad_norm": 3.078125, "learning_rate": 0.0002792281442698281, "loss": 4.9437, "step": 5625 }, { "epoch": 2.529200359389039, "grad_norm": 3.25, "learning_rate": 0.0002791723250471708, "loss": 4.9659, "step": 5630 }, { "epoch": 2.531446540880503, "grad_norm": 3.265625, "learning_rate": 0.00027911643719329723, "loss": 4.9834, "step": 5635 }, { "epoch": 2.533692722371968, "grad_norm": 3.171875, "learning_rate": 0.0002790604807418027, "loss": 4.9261, "step": 5640 }, { "epoch": 2.535938903863432, "grad_norm": 3.109375, "learning_rate": 0.0002790044557263236, "loss": 5.0249, "step": 5645 }, { "epoch": 2.538185085354897, "grad_norm": 3.015625, "learning_rate": 0.00027894836218053784, "loss": 4.9651, "step": 5650 }, { "epoch": 2.5404312668463613, "grad_norm": 3.1875, "learning_rate": 0.00027889220013816416, "loss": 5.0094, "step": 5655 }, { "epoch": 2.542677448337826, "grad_norm": 3.421875, "learning_rate": 0.0002788359696329628, "loss": 4.9669, "step": 5660 }, { "epoch": 2.5449236298292903, "grad_norm": 3.0625, "learning_rate": 0.00027877967069873494, "loss": 4.9674, "step": 5665 }, { "epoch": 2.547169811320755, "grad_norm": 3.25, "learning_rate": 0.0002787233033693231, "loss": 4.9891, "step": 5670 }, { "epoch": 2.5494159928122193, "grad_norm": 3.0625, "learning_rate": 0.0002786668676786106, "loss": 4.913, "step": 5675 }, { "epoch": 2.551662174303684, "grad_norm": 2.84375, "learning_rate": 0.00027861036366052215, "loss": 4.9285, "step": 5680 }, { "epoch": 2.5539083557951483, "grad_norm": 3.0, "learning_rate": 0.0002785537913490233, "loss": 4.9889, "step": 5685 }, { "epoch": 2.556154537286613, "grad_norm": 3.265625, "learning_rate": 0.000278497150778121, "loss": 4.9526, "step": 5690 }, { "epoch": 2.5584007187780773, "grad_norm": 3.109375, "learning_rate": 0.00027844044198186275, "loss": 5.052, "step": 5695 }, { "epoch": 2.560646900269542, "grad_norm": 3.375, "learning_rate": 0.00027838366499433753, "loss": 4.9283, "step": 5700 }, { "epoch": 2.5628930817610063, "grad_norm": 3.171875, "learning_rate": 0.00027832681984967493, "loss": 5.0854, "step": 5705 }, { "epoch": 2.5651392632524708, "grad_norm": 3.3125, "learning_rate": 0.00027826990658204575, "loss": 4.8997, "step": 5710 }, { "epoch": 2.5673854447439353, "grad_norm": 3.015625, "learning_rate": 0.0002782129252256617, "loss": 5.0075, "step": 5715 }, { "epoch": 2.5696316262353998, "grad_norm": 2.90625, "learning_rate": 0.0002781558758147754, "loss": 4.9988, "step": 5720 }, { "epoch": 2.5718778077268643, "grad_norm": 2.90625, "learning_rate": 0.0002780987583836802, "loss": 5.0213, "step": 5725 }, { "epoch": 2.5741239892183287, "grad_norm": 3.046875, "learning_rate": 0.0002780415729667107, "loss": 4.9287, "step": 5730 }, { "epoch": 2.5763701707097932, "grad_norm": 2.9375, "learning_rate": 0.0002779843195982421, "loss": 4.9767, "step": 5735 }, { "epoch": 2.5786163522012577, "grad_norm": 3.140625, "learning_rate": 0.00027792699831269044, "loss": 4.9909, "step": 5740 }, { "epoch": 2.5808625336927222, "grad_norm": 3.390625, "learning_rate": 0.00027786960914451286, "loss": 5.0782, "step": 5745 }, { "epoch": 2.5831087151841867, "grad_norm": 3.3125, "learning_rate": 0.00027781215212820684, "loss": 4.9401, "step": 5750 }, { "epoch": 2.5853548966756517, "grad_norm": 3.03125, "learning_rate": 0.0002777546272983112, "loss": 4.973, "step": 5755 }, { "epoch": 2.5876010781671157, "grad_norm": 2.875, "learning_rate": 0.000277697034689405, "loss": 4.9916, "step": 5760 }, { "epoch": 2.5898472596585806, "grad_norm": 3.265625, "learning_rate": 0.00027763937433610843, "loss": 4.9979, "step": 5765 }, { "epoch": 2.5920934411500447, "grad_norm": 2.984375, "learning_rate": 0.00027758164627308225, "loss": 4.9277, "step": 5770 }, { "epoch": 2.5943396226415096, "grad_norm": 3.453125, "learning_rate": 0.00027752385053502783, "loss": 5.0059, "step": 5775 }, { "epoch": 2.5965858041329737, "grad_norm": 3.15625, "learning_rate": 0.0002774659871566874, "loss": 4.9416, "step": 5780 }, { "epoch": 2.5988319856244386, "grad_norm": 3.046875, "learning_rate": 0.00027740805617284376, "loss": 4.9635, "step": 5785 }, { "epoch": 2.601078167115903, "grad_norm": 3.1875, "learning_rate": 0.0002773500576183203, "loss": 5.0296, "step": 5790 }, { "epoch": 2.6033243486073676, "grad_norm": 2.9375, "learning_rate": 0.0002772919915279812, "loss": 4.9442, "step": 5795 }, { "epoch": 2.605570530098832, "grad_norm": 3.125, "learning_rate": 0.000277233857936731, "loss": 4.9411, "step": 5800 }, { "epoch": 2.6078167115902966, "grad_norm": 3.328125, "learning_rate": 0.000277175656879515, "loss": 5.0361, "step": 5805 }, { "epoch": 2.610062893081761, "grad_norm": 3.0, "learning_rate": 0.00027711738839131895, "loss": 4.9689, "step": 5810 }, { "epoch": 2.6123090745732256, "grad_norm": 3.09375, "learning_rate": 0.00027705905250716926, "loss": 4.9799, "step": 5815 }, { "epoch": 2.61455525606469, "grad_norm": 3.015625, "learning_rate": 0.0002770006492621327, "loss": 5.0151, "step": 5820 }, { "epoch": 2.6168014375561546, "grad_norm": 2.984375, "learning_rate": 0.0002769421786913166, "loss": 4.9397, "step": 5825 }, { "epoch": 2.619047619047619, "grad_norm": 3.171875, "learning_rate": 0.0002768836408298688, "loss": 5.0281, "step": 5830 }, { "epoch": 2.6212938005390836, "grad_norm": 3.078125, "learning_rate": 0.0002768250357129775, "loss": 4.9577, "step": 5835 }, { "epoch": 2.623539982030548, "grad_norm": 2.890625, "learning_rate": 0.00027676636337587145, "loss": 4.9567, "step": 5840 }, { "epoch": 2.6257861635220126, "grad_norm": 3.140625, "learning_rate": 0.00027670762385381974, "loss": 4.9665, "step": 5845 }, { "epoch": 2.628032345013477, "grad_norm": 3.25, "learning_rate": 0.00027664881718213175, "loss": 4.9683, "step": 5850 }, { "epoch": 2.6302785265049415, "grad_norm": 3.0, "learning_rate": 0.0002765899433961574, "loss": 4.9659, "step": 5855 }, { "epoch": 2.632524707996406, "grad_norm": 3.265625, "learning_rate": 0.00027653100253128687, "loss": 4.9306, "step": 5860 }, { "epoch": 2.6347708894878705, "grad_norm": 3.53125, "learning_rate": 0.00027647199462295065, "loss": 4.9186, "step": 5865 }, { "epoch": 2.637017070979335, "grad_norm": 3.28125, "learning_rate": 0.00027641291970661953, "loss": 4.9603, "step": 5870 }, { "epoch": 2.6392632524707995, "grad_norm": 2.953125, "learning_rate": 0.00027635377781780465, "loss": 4.9935, "step": 5875 }, { "epoch": 2.641509433962264, "grad_norm": 3.015625, "learning_rate": 0.00027629456899205725, "loss": 5.0406, "step": 5880 }, { "epoch": 2.6437556154537285, "grad_norm": 3.203125, "learning_rate": 0.00027623529326496906, "loss": 4.9832, "step": 5885 }, { "epoch": 2.6460017969451934, "grad_norm": 3.609375, "learning_rate": 0.0002761759506721717, "loss": 4.9561, "step": 5890 }, { "epoch": 2.6482479784366575, "grad_norm": 2.921875, "learning_rate": 0.0002761165412493373, "loss": 4.9534, "step": 5895 }, { "epoch": 2.6504941599281224, "grad_norm": 3.015625, "learning_rate": 0.00027605706503217806, "loss": 4.9841, "step": 5900 }, { "epoch": 2.6527403414195865, "grad_norm": 3.234375, "learning_rate": 0.0002759975220564462, "loss": 5.1202, "step": 5905 }, { "epoch": 2.6549865229110514, "grad_norm": 3.171875, "learning_rate": 0.0002759379123579341, "loss": 4.9232, "step": 5910 }, { "epoch": 2.6572327044025155, "grad_norm": 3.140625, "learning_rate": 0.0002758782359724745, "loss": 4.923, "step": 5915 }, { "epoch": 2.6594788858939804, "grad_norm": 3.375, "learning_rate": 0.00027581849293593994, "loss": 4.9812, "step": 5920 }, { "epoch": 2.661725067385445, "grad_norm": 3.046875, "learning_rate": 0.00027575868328424307, "loss": 5.078, "step": 5925 }, { "epoch": 2.6639712488769094, "grad_norm": 3.28125, "learning_rate": 0.00027569880705333676, "loss": 4.9466, "step": 5930 }, { "epoch": 2.666217430368374, "grad_norm": 2.984375, "learning_rate": 0.00027563886427921377, "loss": 4.9025, "step": 5935 }, { "epoch": 2.6684636118598384, "grad_norm": 3.015625, "learning_rate": 0.00027557885499790674, "loss": 4.9652, "step": 5940 }, { "epoch": 2.670709793351303, "grad_norm": 3.0, "learning_rate": 0.00027551877924548854, "loss": 5.0297, "step": 5945 }, { "epoch": 2.6729559748427674, "grad_norm": 3.171875, "learning_rate": 0.0002754586370580719, "loss": 4.9552, "step": 5950 }, { "epoch": 2.675202156334232, "grad_norm": 3.328125, "learning_rate": 0.00027539842847180935, "loss": 5.0043, "step": 5955 }, { "epoch": 2.6774483378256964, "grad_norm": 2.90625, "learning_rate": 0.00027533815352289353, "loss": 4.9195, "step": 5960 }, { "epoch": 2.679694519317161, "grad_norm": 3.015625, "learning_rate": 0.00027527781224755696, "loss": 4.9867, "step": 5965 }, { "epoch": 2.6819407008086253, "grad_norm": 3.015625, "learning_rate": 0.0002752174046820718, "loss": 4.9602, "step": 5970 }, { "epoch": 2.68418688230009, "grad_norm": 3.03125, "learning_rate": 0.00027515693086275025, "loss": 4.9846, "step": 5975 }, { "epoch": 2.6864330637915543, "grad_norm": 3.0, "learning_rate": 0.0002750963908259445, "loss": 4.9414, "step": 5980 }, { "epoch": 2.688679245283019, "grad_norm": 3.0625, "learning_rate": 0.00027503578460804604, "loss": 4.9944, "step": 5985 }, { "epoch": 2.6909254267744833, "grad_norm": 2.96875, "learning_rate": 0.00027497511224548667, "loss": 5.0515, "step": 5990 }, { "epoch": 2.693171608265948, "grad_norm": 3.484375, "learning_rate": 0.0002749143737747377, "loss": 4.9846, "step": 5995 }, { "epoch": 2.6954177897574123, "grad_norm": 3.984375, "learning_rate": 0.00027485356923231014, "loss": 4.9318, "step": 6000 }, { "epoch": 2.6954177897574123, "eval_loss": 5.004312992095947, "eval_runtime": 16.1624, "eval_samples_per_second": 1918.841, "eval_steps_per_second": 239.878, "step": 6000 }, { "epoch": 2.697663971248877, "grad_norm": 3.078125, "learning_rate": 0.00027479269865475487, "loss": 4.9818, "step": 6005 }, { "epoch": 2.6999101527403413, "grad_norm": 3.0, "learning_rate": 0.0002747317620786623, "loss": 4.9563, "step": 6010 }, { "epoch": 2.702156334231806, "grad_norm": 3.53125, "learning_rate": 0.0002746707595406627, "loss": 4.9565, "step": 6015 }, { "epoch": 2.7044025157232703, "grad_norm": 2.890625, "learning_rate": 0.0002746096910774258, "loss": 4.955, "step": 6020 }, { "epoch": 2.706648697214735, "grad_norm": 2.890625, "learning_rate": 0.00027454855672566107, "loss": 4.948, "step": 6025 }, { "epoch": 2.7088948787061993, "grad_norm": 3.4375, "learning_rate": 0.0002744873565221176, "loss": 4.9657, "step": 6030 }, { "epoch": 2.711141060197664, "grad_norm": 2.953125, "learning_rate": 0.000274426090503584, "loss": 4.9173, "step": 6035 }, { "epoch": 2.7133872416891283, "grad_norm": 3.265625, "learning_rate": 0.00027436475870688847, "loss": 4.9704, "step": 6040 }, { "epoch": 2.715633423180593, "grad_norm": 2.96875, "learning_rate": 0.00027430336116889876, "loss": 4.9755, "step": 6045 }, { "epoch": 2.7178796046720572, "grad_norm": 3.1875, "learning_rate": 0.00027424189792652214, "loss": 4.9371, "step": 6050 }, { "epoch": 2.720125786163522, "grad_norm": 2.90625, "learning_rate": 0.00027418036901670533, "loss": 4.9885, "step": 6055 }, { "epoch": 2.7223719676549867, "grad_norm": 3.0625, "learning_rate": 0.00027411877447643454, "loss": 4.8649, "step": 6060 }, { "epoch": 2.724618149146451, "grad_norm": 3.203125, "learning_rate": 0.0002740571143427356, "loss": 4.9875, "step": 6065 }, { "epoch": 2.7268643306379157, "grad_norm": 3.078125, "learning_rate": 0.00027399538865267343, "loss": 4.9599, "step": 6070 }, { "epoch": 2.72911051212938, "grad_norm": 3.03125, "learning_rate": 0.0002739335974433527, "loss": 4.9258, "step": 6075 }, { "epoch": 2.7313566936208447, "grad_norm": 3.1875, "learning_rate": 0.0002738717407519172, "loss": 4.9696, "step": 6080 }, { "epoch": 2.733602875112309, "grad_norm": 3.6875, "learning_rate": 0.00027380981861555026, "loss": 4.9673, "step": 6085 }, { "epoch": 2.7358490566037736, "grad_norm": 3.09375, "learning_rate": 0.00027374783107147446, "loss": 5.0333, "step": 6090 }, { "epoch": 2.738095238095238, "grad_norm": 3.03125, "learning_rate": 0.00027368577815695176, "loss": 4.9061, "step": 6095 }, { "epoch": 2.7403414195867026, "grad_norm": 3.171875, "learning_rate": 0.0002736236599092833, "loss": 4.8905, "step": 6100 }, { "epoch": 2.742587601078167, "grad_norm": 3.09375, "learning_rate": 0.0002735614763658097, "loss": 4.9118, "step": 6105 }, { "epoch": 2.7448337825696316, "grad_norm": 3.4375, "learning_rate": 0.0002734992275639106, "loss": 5.0166, "step": 6110 }, { "epoch": 2.747079964061096, "grad_norm": 3.03125, "learning_rate": 0.000273436913541005, "loss": 4.9696, "step": 6115 }, { "epoch": 2.7493261455525606, "grad_norm": 3.109375, "learning_rate": 0.0002733745343345511, "loss": 4.9343, "step": 6120 }, { "epoch": 2.751572327044025, "grad_norm": 3.046875, "learning_rate": 0.00027331208998204623, "loss": 4.9587, "step": 6125 }, { "epoch": 2.7538185085354896, "grad_norm": 3.015625, "learning_rate": 0.00027324958052102696, "loss": 4.8604, "step": 6130 }, { "epoch": 2.756064690026954, "grad_norm": 3.140625, "learning_rate": 0.00027318700598906887, "loss": 4.8835, "step": 6135 }, { "epoch": 2.7583108715184186, "grad_norm": 3.0, "learning_rate": 0.0002731243664237868, "loss": 4.9459, "step": 6140 }, { "epoch": 2.760557053009883, "grad_norm": 3.09375, "learning_rate": 0.00027306166186283457, "loss": 4.912, "step": 6145 }, { "epoch": 2.7628032345013476, "grad_norm": 3.109375, "learning_rate": 0.00027299889234390514, "loss": 4.962, "step": 6150 }, { "epoch": 2.765049415992812, "grad_norm": 3.296875, "learning_rate": 0.0002729360579047305, "loss": 4.8934, "step": 6155 }, { "epoch": 2.767295597484277, "grad_norm": 3.15625, "learning_rate": 0.00027287315858308164, "loss": 4.9474, "step": 6160 }, { "epoch": 2.769541778975741, "grad_norm": 3.09375, "learning_rate": 0.00027281019441676856, "loss": 4.9378, "step": 6165 }, { "epoch": 2.771787960467206, "grad_norm": 2.984375, "learning_rate": 0.00027274716544364034, "loss": 4.9604, "step": 6170 }, { "epoch": 2.77403414195867, "grad_norm": 3.09375, "learning_rate": 0.00027268407170158486, "loss": 4.9024, "step": 6175 }, { "epoch": 2.776280323450135, "grad_norm": 2.859375, "learning_rate": 0.00027262091322852893, "loss": 4.9575, "step": 6180 }, { "epoch": 2.778526504941599, "grad_norm": 2.984375, "learning_rate": 0.00027255769006243855, "loss": 4.9526, "step": 6185 }, { "epoch": 2.780772686433064, "grad_norm": 2.875, "learning_rate": 0.00027249440224131813, "loss": 4.9385, "step": 6190 }, { "epoch": 2.7830188679245285, "grad_norm": 3.09375, "learning_rate": 0.0002724310498032115, "loss": 4.9669, "step": 6195 }, { "epoch": 2.785265049415993, "grad_norm": 3.0, "learning_rate": 0.0002723676327862008, "loss": 4.9392, "step": 6200 }, { "epoch": 2.7875112309074574, "grad_norm": 3.125, "learning_rate": 0.00027230415122840736, "loss": 4.873, "step": 6205 }, { "epoch": 2.789757412398922, "grad_norm": 3.390625, "learning_rate": 0.0002722406051679912, "loss": 4.9445, "step": 6210 }, { "epoch": 2.7920035938903864, "grad_norm": 3.25, "learning_rate": 0.00027217699464315105, "loss": 4.9907, "step": 6215 }, { "epoch": 2.794249775381851, "grad_norm": 3.171875, "learning_rate": 0.00027211331969212443, "loss": 5.0226, "step": 6220 }, { "epoch": 2.7964959568733154, "grad_norm": 3.125, "learning_rate": 0.00027204958035318766, "loss": 4.9335, "step": 6225 }, { "epoch": 2.79874213836478, "grad_norm": 3.203125, "learning_rate": 0.00027198577666465574, "loss": 4.9036, "step": 6230 }, { "epoch": 2.8009883198562444, "grad_norm": 3.125, "learning_rate": 0.0002719219086648821, "loss": 4.9858, "step": 6235 }, { "epoch": 2.803234501347709, "grad_norm": 3.484375, "learning_rate": 0.0002718579763922593, "loss": 4.9541, "step": 6240 }, { "epoch": 2.8054806828391734, "grad_norm": 3.703125, "learning_rate": 0.0002717939798852181, "loss": 4.918, "step": 6245 }, { "epoch": 2.807726864330638, "grad_norm": 3.09375, "learning_rate": 0.0002717299191822281, "loss": 4.9336, "step": 6250 }, { "epoch": 2.8099730458221024, "grad_norm": 3.109375, "learning_rate": 0.0002716657943217975, "loss": 4.9163, "step": 6255 }, { "epoch": 2.812219227313567, "grad_norm": 3.4375, "learning_rate": 0.000271601605342473, "loss": 4.8734, "step": 6260 }, { "epoch": 2.8144654088050314, "grad_norm": 3.265625, "learning_rate": 0.00027153735228283975, "loss": 4.9175, "step": 6265 }, { "epoch": 2.816711590296496, "grad_norm": 3.21875, "learning_rate": 0.0002714730351815216, "loss": 5.0152, "step": 6270 }, { "epoch": 2.8189577717879604, "grad_norm": 3.03125, "learning_rate": 0.0002714086540771808, "loss": 4.9391, "step": 6275 }, { "epoch": 2.821203953279425, "grad_norm": 3.03125, "learning_rate": 0.0002713442090085181, "loss": 4.9086, "step": 6280 }, { "epoch": 2.8234501347708894, "grad_norm": 3.109375, "learning_rate": 0.0002712797000142727, "loss": 4.9808, "step": 6285 }, { "epoch": 2.825696316262354, "grad_norm": 3.453125, "learning_rate": 0.0002712151271332222, "loss": 4.9397, "step": 6290 }, { "epoch": 2.827942497753819, "grad_norm": 3.203125, "learning_rate": 0.00027115049040418254, "loss": 4.8799, "step": 6295 }, { "epoch": 2.830188679245283, "grad_norm": 2.9375, "learning_rate": 0.0002710857898660082, "loss": 4.9463, "step": 6300 }, { "epoch": 2.8324348607367478, "grad_norm": 3.09375, "learning_rate": 0.00027102102555759205, "loss": 4.9161, "step": 6305 }, { "epoch": 2.834681042228212, "grad_norm": 2.828125, "learning_rate": 0.000270956197517865, "loss": 4.9685, "step": 6310 }, { "epoch": 2.8369272237196768, "grad_norm": 2.96875, "learning_rate": 0.0002708913057857965, "loss": 4.9146, "step": 6315 }, { "epoch": 2.839173405211141, "grad_norm": 2.921875, "learning_rate": 0.00027082635040039435, "loss": 4.9201, "step": 6320 }, { "epoch": 2.8414195867026057, "grad_norm": 3.0, "learning_rate": 0.0002707613314007044, "loss": 4.9169, "step": 6325 }, { "epoch": 2.8436657681940702, "grad_norm": 3.109375, "learning_rate": 0.00027069624882581077, "loss": 4.9624, "step": 6330 }, { "epoch": 2.8459119496855347, "grad_norm": 3.078125, "learning_rate": 0.000270631102714836, "loss": 4.9702, "step": 6335 }, { "epoch": 2.8481581311769992, "grad_norm": 3.015625, "learning_rate": 0.0002705658931069406, "loss": 4.9867, "step": 6340 }, { "epoch": 2.8504043126684637, "grad_norm": 2.90625, "learning_rate": 0.0002705006200413235, "loss": 4.9874, "step": 6345 }, { "epoch": 2.852650494159928, "grad_norm": 2.921875, "learning_rate": 0.00027043528355722135, "loss": 4.935, "step": 6350 }, { "epoch": 2.8548966756513927, "grad_norm": 3.03125, "learning_rate": 0.00027036988369390946, "loss": 4.8973, "step": 6355 }, { "epoch": 2.857142857142857, "grad_norm": 2.9375, "learning_rate": 0.00027030442049070076, "loss": 4.8817, "step": 6360 }, { "epoch": 2.8593890386343217, "grad_norm": 3.140625, "learning_rate": 0.0002702388939869466, "loss": 4.9394, "step": 6365 }, { "epoch": 2.861635220125786, "grad_norm": 3.125, "learning_rate": 0.00027017330422203614, "loss": 4.844, "step": 6370 }, { "epoch": 2.8638814016172507, "grad_norm": 3.015625, "learning_rate": 0.0002701076512353968, "loss": 4.8938, "step": 6375 }, { "epoch": 2.866127583108715, "grad_norm": 2.84375, "learning_rate": 0.00027004193506649374, "loss": 4.9167, "step": 6380 }, { "epoch": 2.8683737646001797, "grad_norm": 3.03125, "learning_rate": 0.00026997615575483026, "loss": 4.9192, "step": 6385 }, { "epoch": 2.870619946091644, "grad_norm": 3.109375, "learning_rate": 0.0002699103133399476, "loss": 4.9162, "step": 6390 }, { "epoch": 2.8728661275831087, "grad_norm": 3.015625, "learning_rate": 0.00026984440786142496, "loss": 4.9291, "step": 6395 }, { "epoch": 2.875112309074573, "grad_norm": 3.28125, "learning_rate": 0.0002697784393588794, "loss": 5.0111, "step": 6400 }, { "epoch": 2.8773584905660377, "grad_norm": 3.15625, "learning_rate": 0.0002697124078719659, "loss": 5.0144, "step": 6405 }, { "epoch": 2.879604672057502, "grad_norm": 2.921875, "learning_rate": 0.00026964631344037713, "loss": 4.9051, "step": 6410 }, { "epoch": 2.8818508535489666, "grad_norm": 3.25, "learning_rate": 0.00026958015610384394, "loss": 4.9883, "step": 6415 }, { "epoch": 2.884097035040431, "grad_norm": 3.078125, "learning_rate": 0.00026951393590213474, "loss": 5.031, "step": 6420 }, { "epoch": 2.8863432165318956, "grad_norm": 3.75, "learning_rate": 0.0002694476528750557, "loss": 4.9772, "step": 6425 }, { "epoch": 2.88858939802336, "grad_norm": 3.1875, "learning_rate": 0.0002693813070624509, "loss": 4.989, "step": 6430 }, { "epoch": 2.8908355795148246, "grad_norm": 3.109375, "learning_rate": 0.00026931489850420213, "loss": 4.9268, "step": 6435 }, { "epoch": 2.8930817610062896, "grad_norm": 3.03125, "learning_rate": 0.0002692484272402288, "loss": 4.9483, "step": 6440 }, { "epoch": 2.8953279424977536, "grad_norm": 3.15625, "learning_rate": 0.00026918189331048825, "loss": 4.9406, "step": 6445 }, { "epoch": 2.8975741239892185, "grad_norm": 2.9375, "learning_rate": 0.00026911529675497514, "loss": 4.9445, "step": 6450 }, { "epoch": 2.8998203054806826, "grad_norm": 3.296875, "learning_rate": 0.00026904863761372205, "loss": 4.8945, "step": 6455 }, { "epoch": 2.9020664869721475, "grad_norm": 3.171875, "learning_rate": 0.0002689819159267991, "loss": 4.944, "step": 6460 }, { "epoch": 2.904312668463612, "grad_norm": 3.0625, "learning_rate": 0.00026891513173431394, "loss": 4.8718, "step": 6465 }, { "epoch": 2.9065588499550765, "grad_norm": 3.328125, "learning_rate": 0.0002688482850764119, "loss": 4.9186, "step": 6470 }, { "epoch": 2.908805031446541, "grad_norm": 3.140625, "learning_rate": 0.0002687813759932758, "loss": 4.9278, "step": 6475 }, { "epoch": 2.9110512129380055, "grad_norm": 3.734375, "learning_rate": 0.000268714404525126, "loss": 4.9514, "step": 6480 }, { "epoch": 2.91329739442947, "grad_norm": 3.015625, "learning_rate": 0.0002686473707122204, "loss": 4.8925, "step": 6485 }, { "epoch": 2.9155435759209345, "grad_norm": 3.0, "learning_rate": 0.00026858027459485427, "loss": 4.864, "step": 6490 }, { "epoch": 2.917789757412399, "grad_norm": 3.125, "learning_rate": 0.0002685131162133604, "loss": 4.9311, "step": 6495 }, { "epoch": 2.9200359389038635, "grad_norm": 3.09375, "learning_rate": 0.0002684458956081091, "loss": 4.9183, "step": 6500 }, { "epoch": 2.922282120395328, "grad_norm": 3.296875, "learning_rate": 0.00026837861281950786, "loss": 5.0522, "step": 6505 }, { "epoch": 2.9245283018867925, "grad_norm": 3.1875, "learning_rate": 0.00026831126788800174, "loss": 4.8712, "step": 6510 }, { "epoch": 2.926774483378257, "grad_norm": 3.125, "learning_rate": 0.00026824386085407307, "loss": 4.9477, "step": 6515 }, { "epoch": 2.9290206648697215, "grad_norm": 3.03125, "learning_rate": 0.0002681763917582416, "loss": 4.8396, "step": 6520 }, { "epoch": 2.931266846361186, "grad_norm": 3.53125, "learning_rate": 0.00026810886064106425, "loss": 5.0069, "step": 6525 }, { "epoch": 2.9335130278526504, "grad_norm": 3.328125, "learning_rate": 0.00026804126754313533, "loss": 4.9952, "step": 6530 }, { "epoch": 2.935759209344115, "grad_norm": 3.078125, "learning_rate": 0.00026797361250508644, "loss": 4.8484, "step": 6535 }, { "epoch": 2.9380053908355794, "grad_norm": 3.890625, "learning_rate": 0.0002679058955675862, "loss": 5.0126, "step": 6540 }, { "epoch": 2.940251572327044, "grad_norm": 3.015625, "learning_rate": 0.00026783811677134065, "loss": 4.9527, "step": 6545 }, { "epoch": 2.9424977538185084, "grad_norm": 2.890625, "learning_rate": 0.00026777027615709304, "loss": 4.9737, "step": 6550 }, { "epoch": 2.944743935309973, "grad_norm": 3.796875, "learning_rate": 0.0002677023737656235, "loss": 4.9099, "step": 6555 }, { "epoch": 2.9469901168014374, "grad_norm": 2.984375, "learning_rate": 0.00026763440963774966, "loss": 4.8881, "step": 6560 }, { "epoch": 2.949236298292902, "grad_norm": 3.25, "learning_rate": 0.00026756638381432603, "loss": 4.9634, "step": 6565 }, { "epoch": 2.9514824797843664, "grad_norm": 3.015625, "learning_rate": 0.0002674982963362442, "loss": 4.8719, "step": 6570 }, { "epoch": 2.9537286612758313, "grad_norm": 3.125, "learning_rate": 0.00026743014724443293, "loss": 4.969, "step": 6575 }, { "epoch": 2.9559748427672954, "grad_norm": 3.03125, "learning_rate": 0.000267361936579858, "loss": 4.917, "step": 6580 }, { "epoch": 2.9582210242587603, "grad_norm": 3.109375, "learning_rate": 0.00026729366438352215, "loss": 4.9, "step": 6585 }, { "epoch": 2.9604672057502244, "grad_norm": 3.015625, "learning_rate": 0.0002672253306964651, "loss": 4.9521, "step": 6590 }, { "epoch": 2.9627133872416893, "grad_norm": 3.328125, "learning_rate": 0.0002671569355597637, "loss": 4.9743, "step": 6595 }, { "epoch": 2.964959568733154, "grad_norm": 3.09375, "learning_rate": 0.0002670884790145314, "loss": 4.9766, "step": 6600 }, { "epoch": 2.9672057502246183, "grad_norm": 3.109375, "learning_rate": 0.0002670199611019189, "loss": 4.96, "step": 6605 }, { "epoch": 2.969451931716083, "grad_norm": 3.046875, "learning_rate": 0.00026695138186311364, "loss": 4.8694, "step": 6610 }, { "epoch": 2.9716981132075473, "grad_norm": 3.15625, "learning_rate": 0.0002668827413393399, "loss": 4.8839, "step": 6615 }, { "epoch": 2.973944294699012, "grad_norm": 3.09375, "learning_rate": 0.0002668140395718588, "loss": 4.8949, "step": 6620 }, { "epoch": 2.9761904761904763, "grad_norm": 3.0625, "learning_rate": 0.0002667452766019685, "loss": 4.8834, "step": 6625 }, { "epoch": 2.9784366576819408, "grad_norm": 3.0625, "learning_rate": 0.00026667645247100357, "loss": 4.9984, "step": 6630 }, { "epoch": 2.9806828391734053, "grad_norm": 3.015625, "learning_rate": 0.0002666075672203356, "loss": 4.8534, "step": 6635 }, { "epoch": 2.9829290206648698, "grad_norm": 3.171875, "learning_rate": 0.00026653862089137296, "loss": 4.9614, "step": 6640 }, { "epoch": 2.9851752021563343, "grad_norm": 3.09375, "learning_rate": 0.0002664696135255605, "loss": 4.8943, "step": 6645 }, { "epoch": 2.9874213836477987, "grad_norm": 2.859375, "learning_rate": 0.00026640054516437997, "loss": 4.8697, "step": 6650 }, { "epoch": 2.9896675651392632, "grad_norm": 3.46875, "learning_rate": 0.0002663314158493496, "loss": 4.9618, "step": 6655 }, { "epoch": 2.9919137466307277, "grad_norm": 3.09375, "learning_rate": 0.00026626222562202456, "loss": 4.8952, "step": 6660 }, { "epoch": 2.9941599281221922, "grad_norm": 3.140625, "learning_rate": 0.00026619297452399633, "loss": 4.9478, "step": 6665 }, { "epoch": 2.9964061096136567, "grad_norm": 3.03125, "learning_rate": 0.0002661236625968931, "loss": 4.9124, "step": 6670 }, { "epoch": 2.998652291105121, "grad_norm": 3.328125, "learning_rate": 0.00026605428988237965, "loss": 4.9819, "step": 6675 }, { "epoch": 3.0008984725965857, "grad_norm": 3.375, "learning_rate": 0.0002659848564221573, "loss": 4.8854, "step": 6680 }, { "epoch": 3.00314465408805, "grad_norm": 3.296875, "learning_rate": 0.0002659153622579638, "loss": 4.8016, "step": 6685 }, { "epoch": 3.0053908355795147, "grad_norm": 3.859375, "learning_rate": 0.0002658458074315735, "loss": 4.8727, "step": 6690 }, { "epoch": 3.007637017070979, "grad_norm": 3.046875, "learning_rate": 0.0002657761919847971, "loss": 4.824, "step": 6695 }, { "epoch": 3.0098831985624437, "grad_norm": 3.015625, "learning_rate": 0.0002657065159594819, "loss": 4.8335, "step": 6700 }, { "epoch": 3.012129380053908, "grad_norm": 3.0, "learning_rate": 0.00026563677939751146, "loss": 4.8298, "step": 6705 }, { "epoch": 3.0143755615453727, "grad_norm": 3.125, "learning_rate": 0.00026556698234080577, "loss": 4.8667, "step": 6710 }, { "epoch": 3.016621743036837, "grad_norm": 3.328125, "learning_rate": 0.0002654971248313213, "loss": 4.8908, "step": 6715 }, { "epoch": 3.018867924528302, "grad_norm": 3.453125, "learning_rate": 0.0002654272069110507, "loss": 4.7931, "step": 6720 }, { "epoch": 3.0211141060197666, "grad_norm": 3.296875, "learning_rate": 0.0002653572286220229, "loss": 4.7856, "step": 6725 }, { "epoch": 3.023360287511231, "grad_norm": 3.078125, "learning_rate": 0.0002652871900063034, "loss": 4.7742, "step": 6730 }, { "epoch": 3.0256064690026956, "grad_norm": 3.203125, "learning_rate": 0.0002652170911059937, "loss": 4.8128, "step": 6735 }, { "epoch": 3.02785265049416, "grad_norm": 3.1875, "learning_rate": 0.0002651469319632316, "loss": 4.8296, "step": 6740 }, { "epoch": 3.0300988319856246, "grad_norm": 3.46875, "learning_rate": 0.00026507671262019115, "loss": 4.8438, "step": 6745 }, { "epoch": 3.032345013477089, "grad_norm": 3.078125, "learning_rate": 0.00026500643311908257, "loss": 4.7799, "step": 6750 }, { "epoch": 3.0345911949685536, "grad_norm": 3.171875, "learning_rate": 0.0002649360935021522, "loss": 4.845, "step": 6755 }, { "epoch": 3.036837376460018, "grad_norm": 3.28125, "learning_rate": 0.00026486569381168267, "loss": 4.8419, "step": 6760 }, { "epoch": 3.0390835579514826, "grad_norm": 3.265625, "learning_rate": 0.0002647952340899925, "loss": 4.8683, "step": 6765 }, { "epoch": 3.041329739442947, "grad_norm": 3.5, "learning_rate": 0.0002647247143794365, "loss": 4.8123, "step": 6770 }, { "epoch": 3.0435759209344115, "grad_norm": 3.015625, "learning_rate": 0.00026465413472240534, "loss": 4.8202, "step": 6775 }, { "epoch": 3.045822102425876, "grad_norm": 3.0625, "learning_rate": 0.0002645834951613259, "loss": 4.8717, "step": 6780 }, { "epoch": 3.0480682839173405, "grad_norm": 3.0625, "learning_rate": 0.00026451279573866095, "loss": 4.7503, "step": 6785 }, { "epoch": 3.050314465408805, "grad_norm": 3.140625, "learning_rate": 0.0002644420364969094, "loss": 4.8023, "step": 6790 }, { "epoch": 3.0525606469002695, "grad_norm": 3.25, "learning_rate": 0.0002643712174786059, "loss": 4.8349, "step": 6795 }, { "epoch": 3.054806828391734, "grad_norm": 3.109375, "learning_rate": 0.00026430033872632116, "loss": 4.7862, "step": 6800 }, { "epoch": 3.0570530098831985, "grad_norm": 4.0, "learning_rate": 0.00026422940028266183, "loss": 4.8704, "step": 6805 }, { "epoch": 3.059299191374663, "grad_norm": 3.40625, "learning_rate": 0.0002641584021902704, "loss": 4.8366, "step": 6810 }, { "epoch": 3.0615453728661275, "grad_norm": 3.296875, "learning_rate": 0.0002640873444918252, "loss": 4.8354, "step": 6815 }, { "epoch": 3.063791554357592, "grad_norm": 3.234375, "learning_rate": 0.00026401622723004034, "loss": 4.8597, "step": 6820 }, { "epoch": 3.0660377358490565, "grad_norm": 3.03125, "learning_rate": 0.00026394505044766587, "loss": 4.8656, "step": 6825 }, { "epoch": 3.068283917340521, "grad_norm": 3.203125, "learning_rate": 0.0002638738141874876, "loss": 4.8846, "step": 6830 }, { "epoch": 3.0705300988319855, "grad_norm": 3.140625, "learning_rate": 0.00026380251849232687, "loss": 4.8764, "step": 6835 }, { "epoch": 3.07277628032345, "grad_norm": 3.046875, "learning_rate": 0.00026373116340504103, "loss": 4.7835, "step": 6840 }, { "epoch": 3.0750224618149145, "grad_norm": 3.15625, "learning_rate": 0.00026365974896852296, "loss": 4.8515, "step": 6845 }, { "epoch": 3.077268643306379, "grad_norm": 3.234375, "learning_rate": 0.0002635882752257013, "loss": 4.8444, "step": 6850 }, { "epoch": 3.079514824797844, "grad_norm": 3.046875, "learning_rate": 0.00026351674221954043, "loss": 4.7379, "step": 6855 }, { "epoch": 3.0817610062893084, "grad_norm": 3.359375, "learning_rate": 0.0002634451499930401, "loss": 4.8479, "step": 6860 }, { "epoch": 3.084007187780773, "grad_norm": 3.203125, "learning_rate": 0.0002633734985892358, "loss": 4.7955, "step": 6865 }, { "epoch": 3.0862533692722374, "grad_norm": 2.953125, "learning_rate": 0.00026330178805119853, "loss": 4.8404, "step": 6870 }, { "epoch": 3.088499550763702, "grad_norm": 3.34375, "learning_rate": 0.00026323001842203504, "loss": 4.8335, "step": 6875 }, { "epoch": 3.0907457322551664, "grad_norm": 3.1875, "learning_rate": 0.00026315818974488744, "loss": 4.8428, "step": 6880 }, { "epoch": 3.092991913746631, "grad_norm": 3.21875, "learning_rate": 0.00026308630206293325, "loss": 4.7151, "step": 6885 }, { "epoch": 3.0952380952380953, "grad_norm": 3.046875, "learning_rate": 0.0002630143554193857, "loss": 4.8407, "step": 6890 }, { "epoch": 3.09748427672956, "grad_norm": 3.078125, "learning_rate": 0.00026294234985749313, "loss": 4.7312, "step": 6895 }, { "epoch": 3.0997304582210243, "grad_norm": 3.25, "learning_rate": 0.00026287028542053975, "loss": 4.7925, "step": 6900 }, { "epoch": 3.101976639712489, "grad_norm": 3.203125, "learning_rate": 0.0002627981621518447, "loss": 4.8265, "step": 6905 }, { "epoch": 3.1042228212039533, "grad_norm": 3.03125, "learning_rate": 0.0002627259800947627, "loss": 4.8757, "step": 6910 }, { "epoch": 3.106469002695418, "grad_norm": 3.015625, "learning_rate": 0.00026265373929268383, "loss": 4.8218, "step": 6915 }, { "epoch": 3.1087151841868823, "grad_norm": 3.046875, "learning_rate": 0.00026258143978903354, "loss": 4.8604, "step": 6920 }, { "epoch": 3.110961365678347, "grad_norm": 3.546875, "learning_rate": 0.00026250908162727234, "loss": 4.8454, "step": 6925 }, { "epoch": 3.1132075471698113, "grad_norm": 3.125, "learning_rate": 0.0002624366648508962, "loss": 4.8521, "step": 6930 }, { "epoch": 3.115453728661276, "grad_norm": 3.1875, "learning_rate": 0.00026236418950343623, "loss": 4.7329, "step": 6935 }, { "epoch": 3.1176999101527403, "grad_norm": 3.15625, "learning_rate": 0.0002622916556284588, "loss": 4.7403, "step": 6940 }, { "epoch": 3.1199460916442048, "grad_norm": 3.15625, "learning_rate": 0.0002622190632695655, "loss": 4.7682, "step": 6945 }, { "epoch": 3.1221922731356693, "grad_norm": 3.21875, "learning_rate": 0.0002621464124703929, "loss": 4.8178, "step": 6950 }, { "epoch": 3.1244384546271338, "grad_norm": 3.125, "learning_rate": 0.00026207370327461284, "loss": 4.8774, "step": 6955 }, { "epoch": 3.1266846361185983, "grad_norm": 3.578125, "learning_rate": 0.0002620009357259323, "loss": 4.8435, "step": 6960 }, { "epoch": 3.1289308176100628, "grad_norm": 3.109375, "learning_rate": 0.0002619281098680932, "loss": 4.8077, "step": 6965 }, { "epoch": 3.1311769991015272, "grad_norm": 3.296875, "learning_rate": 0.0002618552257448727, "loss": 4.7614, "step": 6970 }, { "epoch": 3.1334231805929917, "grad_norm": 3.21875, "learning_rate": 0.00026178228340008276, "loss": 4.8246, "step": 6975 }, { "epoch": 3.1356693620844562, "grad_norm": 3.546875, "learning_rate": 0.0002617092828775705, "loss": 4.8138, "step": 6980 }, { "epoch": 3.1379155435759207, "grad_norm": 3.046875, "learning_rate": 0.0002616362242212179, "loss": 4.8909, "step": 6985 }, { "epoch": 3.1401617250673857, "grad_norm": 3.390625, "learning_rate": 0.00026156310747494206, "loss": 4.826, "step": 6990 }, { "epoch": 3.14240790655885, "grad_norm": 3.140625, "learning_rate": 0.0002614899326826948, "loss": 4.7744, "step": 6995 }, { "epoch": 3.1446540880503147, "grad_norm": 3.296875, "learning_rate": 0.00026141669988846293, "loss": 4.9305, "step": 7000 }, { "epoch": 3.1446540880503147, "eval_loss": 4.942409038543701, "eval_runtime": 16.1167, "eval_samples_per_second": 1924.273, "eval_steps_per_second": 240.557, "step": 7000 }, { "epoch": 3.146900269541779, "grad_norm": 3.03125, "learning_rate": 0.00026134340913626814, "loss": 4.8665, "step": 7005 }, { "epoch": 3.1491464510332436, "grad_norm": 3.328125, "learning_rate": 0.00026127006047016693, "loss": 4.8994, "step": 7010 }, { "epoch": 3.151392632524708, "grad_norm": 3.25, "learning_rate": 0.0002611966539342506, "loss": 4.8005, "step": 7015 }, { "epoch": 3.1536388140161726, "grad_norm": 3.203125, "learning_rate": 0.0002611231895726453, "loss": 4.9289, "step": 7020 }, { "epoch": 3.155884995507637, "grad_norm": 3.359375, "learning_rate": 0.0002610496674295118, "loss": 4.7866, "step": 7025 }, { "epoch": 3.1581311769991016, "grad_norm": 3.015625, "learning_rate": 0.0002609760875490457, "loss": 4.791, "step": 7030 }, { "epoch": 3.160377358490566, "grad_norm": 3.0625, "learning_rate": 0.00026090244997547743, "loss": 4.7871, "step": 7035 }, { "epoch": 3.1626235399820306, "grad_norm": 3.015625, "learning_rate": 0.00026082875475307184, "loss": 4.7937, "step": 7040 }, { "epoch": 3.164869721473495, "grad_norm": 2.953125, "learning_rate": 0.0002607550019261287, "loss": 4.794, "step": 7045 }, { "epoch": 3.1671159029649596, "grad_norm": 3.3125, "learning_rate": 0.0002606811915389822, "loss": 4.8237, "step": 7050 }, { "epoch": 3.169362084456424, "grad_norm": 2.921875, "learning_rate": 0.00026060732363600113, "loss": 4.8981, "step": 7055 }, { "epoch": 3.1716082659478886, "grad_norm": 3.296875, "learning_rate": 0.00026053339826158904, "loss": 4.8026, "step": 7060 }, { "epoch": 3.173854447439353, "grad_norm": 3.296875, "learning_rate": 0.0002604594154601839, "loss": 4.8203, "step": 7065 }, { "epoch": 3.1761006289308176, "grad_norm": 3.3125, "learning_rate": 0.00026038537527625817, "loss": 4.8191, "step": 7070 }, { "epoch": 3.178346810422282, "grad_norm": 3.1875, "learning_rate": 0.00026031127775431894, "loss": 4.8258, "step": 7075 }, { "epoch": 3.1805929919137466, "grad_norm": 3.109375, "learning_rate": 0.0002602371229389076, "loss": 4.8744, "step": 7080 }, { "epoch": 3.182839173405211, "grad_norm": 3.296875, "learning_rate": 0.0002601629108746001, "loss": 4.8279, "step": 7085 }, { "epoch": 3.1850853548966755, "grad_norm": 3.390625, "learning_rate": 0.0002600886416060068, "loss": 4.7797, "step": 7090 }, { "epoch": 3.18733153638814, "grad_norm": 4.0625, "learning_rate": 0.00026001431517777226, "loss": 4.8642, "step": 7095 }, { "epoch": 3.1895777178796045, "grad_norm": 3.265625, "learning_rate": 0.0002599399316345757, "loss": 4.7916, "step": 7100 }, { "epoch": 3.191823899371069, "grad_norm": 3.171875, "learning_rate": 0.0002598654910211304, "loss": 4.779, "step": 7105 }, { "epoch": 3.1940700808625335, "grad_norm": 3.21875, "learning_rate": 0.0002597909933821842, "loss": 4.8629, "step": 7110 }, { "epoch": 3.196316262353998, "grad_norm": 3.21875, "learning_rate": 0.000259716438762519, "loss": 4.7861, "step": 7115 }, { "epoch": 3.1985624438454625, "grad_norm": 3.25, "learning_rate": 0.0002596418272069511, "loss": 4.8307, "step": 7120 }, { "epoch": 3.2008086253369274, "grad_norm": 3.15625, "learning_rate": 0.0002595671587603309, "loss": 4.7688, "step": 7125 }, { "epoch": 3.2030548068283915, "grad_norm": 3.125, "learning_rate": 0.00025949243346754306, "loss": 4.8044, "step": 7130 }, { "epoch": 3.2053009883198564, "grad_norm": 3.21875, "learning_rate": 0.00025941765137350647, "loss": 4.8947, "step": 7135 }, { "epoch": 3.207547169811321, "grad_norm": 3.28125, "learning_rate": 0.0002593428125231741, "loss": 4.8465, "step": 7140 }, { "epoch": 3.2097933513027854, "grad_norm": 3.171875, "learning_rate": 0.000259267916961533, "loss": 4.7989, "step": 7145 }, { "epoch": 3.21203953279425, "grad_norm": 3.203125, "learning_rate": 0.0002591929647336044, "loss": 4.8914, "step": 7150 }, { "epoch": 3.2142857142857144, "grad_norm": 3.125, "learning_rate": 0.00025911795588444354, "loss": 4.861, "step": 7155 }, { "epoch": 3.216531895777179, "grad_norm": 3.28125, "learning_rate": 0.00025904289045913966, "loss": 4.8461, "step": 7160 }, { "epoch": 3.2187780772686434, "grad_norm": 3.21875, "learning_rate": 0.0002589677685028161, "loss": 4.8025, "step": 7165 }, { "epoch": 3.221024258760108, "grad_norm": 3.265625, "learning_rate": 0.0002588925900606301, "loss": 4.7946, "step": 7170 }, { "epoch": 3.2232704402515724, "grad_norm": 3.296875, "learning_rate": 0.000258817355177773, "loss": 4.8217, "step": 7175 }, { "epoch": 3.225516621743037, "grad_norm": 3.3125, "learning_rate": 0.0002587420638994698, "loss": 4.7985, "step": 7180 }, { "epoch": 3.2277628032345014, "grad_norm": 3.328125, "learning_rate": 0.0002586667162709797, "loss": 4.7482, "step": 7185 }, { "epoch": 3.230008984725966, "grad_norm": 3.03125, "learning_rate": 0.0002585913123375956, "loss": 4.7843, "step": 7190 }, { "epoch": 3.2322551662174304, "grad_norm": 3.203125, "learning_rate": 0.00025851585214464414, "loss": 4.8181, "step": 7195 }, { "epoch": 3.234501347708895, "grad_norm": 3.234375, "learning_rate": 0.0002584403357374861, "loss": 4.8006, "step": 7200 }, { "epoch": 3.2367475292003594, "grad_norm": 3.3125, "learning_rate": 0.0002583647631615158, "loss": 4.9063, "step": 7205 }, { "epoch": 3.238993710691824, "grad_norm": 3.125, "learning_rate": 0.00025828913446216133, "loss": 4.8141, "step": 7210 }, { "epoch": 3.2412398921832883, "grad_norm": 3.140625, "learning_rate": 0.0002582134496848847, "loss": 4.8757, "step": 7215 }, { "epoch": 3.243486073674753, "grad_norm": 3.09375, "learning_rate": 0.0002581377088751814, "loss": 4.802, "step": 7220 }, { "epoch": 3.2457322551662173, "grad_norm": 3.125, "learning_rate": 0.00025806191207858076, "loss": 4.8018, "step": 7225 }, { "epoch": 3.247978436657682, "grad_norm": 3.390625, "learning_rate": 0.0002579860593406457, "loss": 4.8861, "step": 7230 }, { "epoch": 3.2502246181491463, "grad_norm": 3.1875, "learning_rate": 0.0002579101507069728, "loss": 4.8369, "step": 7235 }, { "epoch": 3.252470799640611, "grad_norm": 3.0625, "learning_rate": 0.0002578341862231922, "loss": 4.823, "step": 7240 }, { "epoch": 3.2547169811320753, "grad_norm": 3.203125, "learning_rate": 0.0002577581659349677, "loss": 4.8676, "step": 7245 }, { "epoch": 3.25696316262354, "grad_norm": 3.3125, "learning_rate": 0.0002576820898879965, "loss": 4.8252, "step": 7250 }, { "epoch": 3.2592093441150043, "grad_norm": 3.046875, "learning_rate": 0.0002576059581280095, "loss": 4.7799, "step": 7255 }, { "epoch": 3.2614555256064692, "grad_norm": 3.125, "learning_rate": 0.0002575297707007709, "loss": 4.8234, "step": 7260 }, { "epoch": 3.2637017070979333, "grad_norm": 3.1875, "learning_rate": 0.00025745352765207843, "loss": 4.7695, "step": 7265 }, { "epoch": 3.265947888589398, "grad_norm": 3.359375, "learning_rate": 0.0002573772290277633, "loss": 4.8183, "step": 7270 }, { "epoch": 3.2681940700808627, "grad_norm": 3.15625, "learning_rate": 0.0002573008748736902, "loss": 4.8566, "step": 7275 }, { "epoch": 3.270440251572327, "grad_norm": 3.125, "learning_rate": 0.00025722446523575705, "loss": 4.7541, "step": 7280 }, { "epoch": 3.2726864330637917, "grad_norm": 3.046875, "learning_rate": 0.00025714800015989506, "loss": 4.7738, "step": 7285 }, { "epoch": 3.274932614555256, "grad_norm": 3.359375, "learning_rate": 0.00025707147969206904, "loss": 4.8893, "step": 7290 }, { "epoch": 3.2771787960467207, "grad_norm": 3.421875, "learning_rate": 0.0002569949038782769, "loss": 4.7991, "step": 7295 }, { "epoch": 3.279424977538185, "grad_norm": 3.265625, "learning_rate": 0.0002569182727645498, "loss": 4.7758, "step": 7300 }, { "epoch": 3.2816711590296497, "grad_norm": 3.171875, "learning_rate": 0.0002568415863969522, "loss": 4.7917, "step": 7305 }, { "epoch": 3.283917340521114, "grad_norm": 3.390625, "learning_rate": 0.00025676484482158187, "loss": 4.7623, "step": 7310 }, { "epoch": 3.2861635220125787, "grad_norm": 3.265625, "learning_rate": 0.0002566880480845696, "loss": 4.7615, "step": 7315 }, { "epoch": 3.288409703504043, "grad_norm": 3.4375, "learning_rate": 0.00025661119623207943, "loss": 4.7924, "step": 7320 }, { "epoch": 3.2906558849955077, "grad_norm": 3.125, "learning_rate": 0.00025653428931030856, "loss": 4.8251, "step": 7325 }, { "epoch": 3.292902066486972, "grad_norm": 3.25, "learning_rate": 0.00025645732736548707, "loss": 4.7862, "step": 7330 }, { "epoch": 3.2951482479784366, "grad_norm": 3.296875, "learning_rate": 0.0002563803104438785, "loss": 4.8408, "step": 7335 }, { "epoch": 3.297394429469901, "grad_norm": 3.453125, "learning_rate": 0.0002563032385917791, "loss": 4.7965, "step": 7340 }, { "epoch": 3.2996406109613656, "grad_norm": 3.1875, "learning_rate": 0.00025622611185551825, "loss": 4.8009, "step": 7345 }, { "epoch": 3.30188679245283, "grad_norm": 3.03125, "learning_rate": 0.0002561489302814585, "loss": 4.7463, "step": 7350 }, { "epoch": 3.3041329739442946, "grad_norm": 3.296875, "learning_rate": 0.000256071693915995, "loss": 4.8728, "step": 7355 }, { "epoch": 3.306379155435759, "grad_norm": 3.640625, "learning_rate": 0.00025599440280555616, "loss": 4.8311, "step": 7360 }, { "epoch": 3.3086253369272236, "grad_norm": 3.171875, "learning_rate": 0.00025591705699660317, "loss": 4.8349, "step": 7365 }, { "epoch": 3.310871518418688, "grad_norm": 3.03125, "learning_rate": 0.00025583965653563006, "loss": 4.8007, "step": 7370 }, { "epoch": 3.3131176999101526, "grad_norm": 3.125, "learning_rate": 0.00025576220146916376, "loss": 4.8239, "step": 7375 }, { "epoch": 3.315363881401617, "grad_norm": 3.125, "learning_rate": 0.0002556846918437641, "loss": 4.7837, "step": 7380 }, { "epoch": 3.3176100628930816, "grad_norm": 3.1875, "learning_rate": 0.0002556071277060236, "loss": 4.8485, "step": 7385 }, { "epoch": 3.319856244384546, "grad_norm": 3.1875, "learning_rate": 0.0002555295091025675, "loss": 4.8199, "step": 7390 }, { "epoch": 3.322102425876011, "grad_norm": 3.203125, "learning_rate": 0.00025545183608005395, "loss": 4.7623, "step": 7395 }, { "epoch": 3.324348607367475, "grad_norm": 3.265625, "learning_rate": 0.0002553741086851737, "loss": 4.8978, "step": 7400 }, { "epoch": 3.32659478885894, "grad_norm": 3.25, "learning_rate": 0.0002552963269646502, "loss": 4.7921, "step": 7405 }, { "epoch": 3.3288409703504045, "grad_norm": 3.21875, "learning_rate": 0.0002552184909652396, "loss": 4.7919, "step": 7410 }, { "epoch": 3.331087151841869, "grad_norm": 3.203125, "learning_rate": 0.0002551406007337305, "loss": 4.8001, "step": 7415 }, { "epoch": 3.3333333333333335, "grad_norm": 3.296875, "learning_rate": 0.0002550626563169444, "loss": 4.7865, "step": 7420 }, { "epoch": 3.335579514824798, "grad_norm": 3.015625, "learning_rate": 0.0002549846577617352, "loss": 4.7541, "step": 7425 }, { "epoch": 3.3378256963162625, "grad_norm": 3.234375, "learning_rate": 0.00025490660511498926, "loss": 4.8102, "step": 7430 }, { "epoch": 3.340071877807727, "grad_norm": 3.03125, "learning_rate": 0.0002548284984236256, "loss": 4.7369, "step": 7435 }, { "epoch": 3.3423180592991915, "grad_norm": 3.234375, "learning_rate": 0.0002547503377345957, "loss": 4.8052, "step": 7440 }, { "epoch": 3.344564240790656, "grad_norm": 3.078125, "learning_rate": 0.00025467212309488347, "loss": 4.8815, "step": 7445 }, { "epoch": 3.3468104222821204, "grad_norm": 3.3125, "learning_rate": 0.0002545938545515052, "loss": 4.8297, "step": 7450 }, { "epoch": 3.349056603773585, "grad_norm": 3.375, "learning_rate": 0.00025451553215150973, "loss": 4.8577, "step": 7455 }, { "epoch": 3.3513027852650494, "grad_norm": 3.109375, "learning_rate": 0.0002544371559419781, "loss": 4.8335, "step": 7460 }, { "epoch": 3.353548966756514, "grad_norm": 3.109375, "learning_rate": 0.0002543587259700239, "loss": 4.8204, "step": 7465 }, { "epoch": 3.3557951482479784, "grad_norm": 3.15625, "learning_rate": 0.0002542802422827928, "loss": 4.8149, "step": 7470 }, { "epoch": 3.358041329739443, "grad_norm": 3.234375, "learning_rate": 0.00025420170492746293, "loss": 4.7721, "step": 7475 }, { "epoch": 3.3602875112309074, "grad_norm": 3.3125, "learning_rate": 0.0002541231139512447, "loss": 4.7729, "step": 7480 }, { "epoch": 3.362533692722372, "grad_norm": 3.125, "learning_rate": 0.0002540444694013805, "loss": 4.8291, "step": 7485 }, { "epoch": 3.3647798742138364, "grad_norm": 3.515625, "learning_rate": 0.00025396577132514523, "loss": 4.8063, "step": 7490 }, { "epoch": 3.367026055705301, "grad_norm": 3.234375, "learning_rate": 0.00025388701976984587, "loss": 4.7306, "step": 7495 }, { "epoch": 3.3692722371967654, "grad_norm": 3.125, "learning_rate": 0.0002538082147828214, "loss": 4.7863, "step": 7500 }, { "epoch": 3.37151841868823, "grad_norm": 2.96875, "learning_rate": 0.0002537293564114432, "loss": 4.8825, "step": 7505 }, { "epoch": 3.3737646001796944, "grad_norm": 3.0625, "learning_rate": 0.00025365044470311446, "loss": 4.8224, "step": 7510 }, { "epoch": 3.376010781671159, "grad_norm": 3.25, "learning_rate": 0.0002535714797052706, "loss": 4.8422, "step": 7515 }, { "epoch": 3.3782569631626234, "grad_norm": 3.265625, "learning_rate": 0.000253492461465379, "loss": 4.7746, "step": 7520 }, { "epoch": 3.380503144654088, "grad_norm": 3.640625, "learning_rate": 0.00025341339003093905, "loss": 4.8838, "step": 7525 }, { "epoch": 3.382749326145553, "grad_norm": 3.9375, "learning_rate": 0.00025333426544948214, "loss": 4.7781, "step": 7530 }, { "epoch": 3.384995507637017, "grad_norm": 3.3125, "learning_rate": 0.0002532550877685716, "loss": 4.808, "step": 7535 }, { "epoch": 3.387241689128482, "grad_norm": 3.09375, "learning_rate": 0.0002531758570358028, "loss": 4.8098, "step": 7540 }, { "epoch": 3.3894878706199463, "grad_norm": 3.1875, "learning_rate": 0.0002530965732988027, "loss": 4.7755, "step": 7545 }, { "epoch": 3.3917340521114108, "grad_norm": 3.15625, "learning_rate": 0.00025301723660523044, "loss": 4.7746, "step": 7550 }, { "epoch": 3.3939802336028753, "grad_norm": 3.0, "learning_rate": 0.00025293784700277673, "loss": 4.7915, "step": 7555 }, { "epoch": 3.3962264150943398, "grad_norm": 3.09375, "learning_rate": 0.0002528584045391644, "loss": 4.7881, "step": 7560 }, { "epoch": 3.3984725965858043, "grad_norm": 3.265625, "learning_rate": 0.00025277890926214767, "loss": 4.8263, "step": 7565 }, { "epoch": 3.4007187780772687, "grad_norm": 3.203125, "learning_rate": 0.0002526993612195128, "loss": 4.7642, "step": 7570 }, { "epoch": 3.4029649595687332, "grad_norm": 3.03125, "learning_rate": 0.0002526197604590777, "loss": 4.8737, "step": 7575 }, { "epoch": 3.4052111410601977, "grad_norm": 3.203125, "learning_rate": 0.00025254010702869194, "loss": 4.7828, "step": 7580 }, { "epoch": 3.4074573225516622, "grad_norm": 3.03125, "learning_rate": 0.0002524604009762366, "loss": 4.8156, "step": 7585 }, { "epoch": 3.4097035040431267, "grad_norm": 3.21875, "learning_rate": 0.00025238064234962474, "loss": 4.8054, "step": 7590 }, { "epoch": 3.411949685534591, "grad_norm": 3.09375, "learning_rate": 0.00025230083119680074, "loss": 4.8162, "step": 7595 }, { "epoch": 3.4141958670260557, "grad_norm": 3.0, "learning_rate": 0.0002522209675657407, "loss": 4.8375, "step": 7600 }, { "epoch": 3.41644204851752, "grad_norm": 3.359375, "learning_rate": 0.0002521410515044522, "loss": 4.7701, "step": 7605 }, { "epoch": 3.4186882300089847, "grad_norm": 3.328125, "learning_rate": 0.0002520610830609742, "loss": 4.7296, "step": 7610 }, { "epoch": 3.420934411500449, "grad_norm": 3.078125, "learning_rate": 0.0002519810622833775, "loss": 4.858, "step": 7615 }, { "epoch": 3.4231805929919137, "grad_norm": 3.28125, "learning_rate": 0.00025190098921976404, "loss": 4.852, "step": 7620 }, { "epoch": 3.425426774483378, "grad_norm": 3.234375, "learning_rate": 0.00025182086391826726, "loss": 4.7896, "step": 7625 }, { "epoch": 3.4276729559748427, "grad_norm": 3.125, "learning_rate": 0.0002517406864270522, "loss": 4.8046, "step": 7630 }, { "epoch": 3.429919137466307, "grad_norm": 3.109375, "learning_rate": 0.000251660456794315, "loss": 4.8123, "step": 7635 }, { "epoch": 3.4321653189577717, "grad_norm": 3.265625, "learning_rate": 0.0002515801750682833, "loss": 4.7836, "step": 7640 }, { "epoch": 3.434411500449236, "grad_norm": 3.265625, "learning_rate": 0.000251499841297216, "loss": 4.8299, "step": 7645 }, { "epoch": 3.4366576819407006, "grad_norm": 3.15625, "learning_rate": 0.0002514194555294033, "loss": 4.7713, "step": 7650 }, { "epoch": 3.438903863432165, "grad_norm": 3.109375, "learning_rate": 0.00025133901781316663, "loss": 4.7489, "step": 7655 }, { "epoch": 3.4411500449236296, "grad_norm": 3.046875, "learning_rate": 0.0002512585281968588, "loss": 4.8084, "step": 7660 }, { "epoch": 3.4433962264150946, "grad_norm": 3.234375, "learning_rate": 0.00025117798672886354, "loss": 4.7632, "step": 7665 }, { "epoch": 3.4456424079065586, "grad_norm": 3.640625, "learning_rate": 0.0002510973934575959, "loss": 4.7962, "step": 7670 }, { "epoch": 3.4478885893980236, "grad_norm": 3.203125, "learning_rate": 0.0002510167484315022, "loss": 4.7781, "step": 7675 }, { "epoch": 3.450134770889488, "grad_norm": 3.1875, "learning_rate": 0.0002509360516990597, "loss": 4.7245, "step": 7680 }, { "epoch": 3.4523809523809526, "grad_norm": 3.28125, "learning_rate": 0.00025085530330877666, "loss": 4.8373, "step": 7685 }, { "epoch": 3.454627133872417, "grad_norm": 3.28125, "learning_rate": 0.0002507745033091927, "loss": 4.7834, "step": 7690 }, { "epoch": 3.4568733153638815, "grad_norm": 3.140625, "learning_rate": 0.00025069365174887814, "loss": 4.8153, "step": 7695 }, { "epoch": 3.459119496855346, "grad_norm": 3.265625, "learning_rate": 0.0002506127486764345, "loss": 4.7421, "step": 7700 }, { "epoch": 3.4613656783468105, "grad_norm": 3.1875, "learning_rate": 0.00025053179414049416, "loss": 4.8132, "step": 7705 }, { "epoch": 3.463611859838275, "grad_norm": 3.046875, "learning_rate": 0.00025045078818972046, "loss": 4.7787, "step": 7710 }, { "epoch": 3.4658580413297395, "grad_norm": 3.5, "learning_rate": 0.0002503697308728077, "loss": 4.8161, "step": 7715 }, { "epoch": 3.468104222821204, "grad_norm": 3.34375, "learning_rate": 0.0002502886222384811, "loss": 4.7623, "step": 7720 }, { "epoch": 3.4703504043126685, "grad_norm": 3.15625, "learning_rate": 0.0002502074623354965, "loss": 4.9089, "step": 7725 }, { "epoch": 3.472596585804133, "grad_norm": 3.0625, "learning_rate": 0.0002501262512126408, "loss": 4.8818, "step": 7730 }, { "epoch": 3.4748427672955975, "grad_norm": 3.375, "learning_rate": 0.00025004498891873146, "loss": 4.8681, "step": 7735 }, { "epoch": 3.477088948787062, "grad_norm": 3.046875, "learning_rate": 0.000249963675502617, "loss": 4.7931, "step": 7740 }, { "epoch": 3.4793351302785265, "grad_norm": 3.109375, "learning_rate": 0.00024988231101317647, "loss": 4.7794, "step": 7745 }, { "epoch": 3.481581311769991, "grad_norm": 3.140625, "learning_rate": 0.00024980089549931955, "loss": 4.7498, "step": 7750 }, { "epoch": 3.4838274932614555, "grad_norm": 3.046875, "learning_rate": 0.0002497194290099868, "loss": 4.8458, "step": 7755 }, { "epoch": 3.48607367475292, "grad_norm": 3.34375, "learning_rate": 0.00024963791159414927, "loss": 4.8934, "step": 7760 }, { "epoch": 3.4883198562443845, "grad_norm": 3.375, "learning_rate": 0.00024955634330080863, "loss": 4.7986, "step": 7765 }, { "epoch": 3.490566037735849, "grad_norm": 3.0625, "learning_rate": 0.00024947472417899733, "loss": 4.7661, "step": 7770 }, { "epoch": 3.4928122192273134, "grad_norm": 3.296875, "learning_rate": 0.000249393054277778, "loss": 4.8551, "step": 7775 }, { "epoch": 3.495058400718778, "grad_norm": 3.078125, "learning_rate": 0.0002493113336462442, "loss": 4.8479, "step": 7780 }, { "epoch": 3.4973045822102424, "grad_norm": 3.25, "learning_rate": 0.00024922956233351976, "loss": 4.8853, "step": 7785 }, { "epoch": 3.499550763701707, "grad_norm": 3.171875, "learning_rate": 0.00024914774038875895, "loss": 4.7687, "step": 7790 }, { "epoch": 3.5017969451931714, "grad_norm": 3.296875, "learning_rate": 0.0002490658678611466, "loss": 4.7617, "step": 7795 }, { "epoch": 3.5040431266846364, "grad_norm": 3.078125, "learning_rate": 0.00024898394479989786, "loss": 4.7722, "step": 7800 }, { "epoch": 3.5062893081761004, "grad_norm": 3.109375, "learning_rate": 0.0002489019712542583, "loss": 4.8146, "step": 7805 }, { "epoch": 3.5085354896675653, "grad_norm": 3.15625, "learning_rate": 0.00024881994727350373, "loss": 4.8087, "step": 7810 }, { "epoch": 3.5107816711590294, "grad_norm": 3.125, "learning_rate": 0.0002487378729069405, "loss": 4.8572, "step": 7815 }, { "epoch": 3.5130278526504943, "grad_norm": 3.171875, "learning_rate": 0.0002486557482039051, "loss": 4.7661, "step": 7820 }, { "epoch": 3.515274034141959, "grad_norm": 3.09375, "learning_rate": 0.0002485735732137642, "loss": 4.8137, "step": 7825 }, { "epoch": 3.5175202156334233, "grad_norm": 3.125, "learning_rate": 0.00024849134798591487, "loss": 4.8203, "step": 7830 }, { "epoch": 3.519766397124888, "grad_norm": 3.140625, "learning_rate": 0.00024840907256978433, "loss": 4.7541, "step": 7835 }, { "epoch": 3.5220125786163523, "grad_norm": 3.03125, "learning_rate": 0.0002483267470148298, "loss": 4.7931, "step": 7840 }, { "epoch": 3.524258760107817, "grad_norm": 3.59375, "learning_rate": 0.0002482443713705389, "loss": 4.8011, "step": 7845 }, { "epoch": 3.5265049415992813, "grad_norm": 3.296875, "learning_rate": 0.0002481619456864293, "loss": 4.8371, "step": 7850 }, { "epoch": 3.528751123090746, "grad_norm": 3.265625, "learning_rate": 0.0002480794700120485, "loss": 4.7765, "step": 7855 }, { "epoch": 3.5309973045822103, "grad_norm": 3.4375, "learning_rate": 0.00024799694439697436, "loss": 4.7728, "step": 7860 }, { "epoch": 3.5332434860736748, "grad_norm": 3.125, "learning_rate": 0.00024791436889081466, "loss": 4.82, "step": 7865 }, { "epoch": 3.5354896675651393, "grad_norm": 3.28125, "learning_rate": 0.0002478317435432071, "loss": 4.8269, "step": 7870 }, { "epoch": 3.5377358490566038, "grad_norm": 3.1875, "learning_rate": 0.00024774906840381935, "loss": 4.8323, "step": 7875 }, { "epoch": 3.5399820305480683, "grad_norm": 3.1875, "learning_rate": 0.0002476663435223492, "loss": 4.769, "step": 7880 }, { "epoch": 3.5422282120395328, "grad_norm": 3.296875, "learning_rate": 0.00024758356894852404, "loss": 4.7889, "step": 7885 }, { "epoch": 3.5444743935309972, "grad_norm": 3.21875, "learning_rate": 0.00024750074473210134, "loss": 4.7562, "step": 7890 }, { "epoch": 3.5467205750224617, "grad_norm": 3.203125, "learning_rate": 0.0002474178709228684, "loss": 4.8241, "step": 7895 }, { "epoch": 3.5489667565139262, "grad_norm": 3.0, "learning_rate": 0.0002473349475706422, "loss": 4.7784, "step": 7900 }, { "epoch": 3.5512129380053907, "grad_norm": 2.984375, "learning_rate": 0.0002472519747252697, "loss": 4.7175, "step": 7905 }, { "epoch": 3.5534591194968552, "grad_norm": 3.15625, "learning_rate": 0.00024716895243662737, "loss": 4.7571, "step": 7910 }, { "epoch": 3.5557053009883197, "grad_norm": 3.234375, "learning_rate": 0.00024708588075462166, "loss": 4.7624, "step": 7915 }, { "epoch": 3.557951482479784, "grad_norm": 3.3125, "learning_rate": 0.0002470027597291885, "loss": 4.7763, "step": 7920 }, { "epoch": 3.560197663971249, "grad_norm": 3.140625, "learning_rate": 0.0002469195894102935, "loss": 4.8054, "step": 7925 }, { "epoch": 3.562443845462713, "grad_norm": 3.15625, "learning_rate": 0.0002468363698479321, "loss": 4.7957, "step": 7930 }, { "epoch": 3.564690026954178, "grad_norm": 3.390625, "learning_rate": 0.0002467531010921292, "loss": 4.7889, "step": 7935 }, { "epoch": 3.566936208445642, "grad_norm": 3.15625, "learning_rate": 0.00024666978319293914, "loss": 4.7675, "step": 7940 }, { "epoch": 3.569182389937107, "grad_norm": 3.21875, "learning_rate": 0.00024658641620044604, "loss": 4.843, "step": 7945 }, { "epoch": 3.571428571428571, "grad_norm": 3.1875, "learning_rate": 0.0002465030001647634, "loss": 4.7993, "step": 7950 }, { "epoch": 3.573674752920036, "grad_norm": 3.3125, "learning_rate": 0.0002464195351360343, "loss": 4.8091, "step": 7955 }, { "epoch": 3.5759209344115006, "grad_norm": 3.21875, "learning_rate": 0.0002463360211644311, "loss": 4.7122, "step": 7960 }, { "epoch": 3.578167115902965, "grad_norm": 3.046875, "learning_rate": 0.0002462524583001557, "loss": 4.8386, "step": 7965 }, { "epoch": 3.5804132973944296, "grad_norm": 3.203125, "learning_rate": 0.0002461688465934395, "loss": 4.752, "step": 7970 }, { "epoch": 3.582659478885894, "grad_norm": 3.125, "learning_rate": 0.00024608518609454293, "loss": 4.7621, "step": 7975 }, { "epoch": 3.5849056603773586, "grad_norm": 3.390625, "learning_rate": 0.0002460014768537561, "loss": 4.7538, "step": 7980 }, { "epoch": 3.587151841868823, "grad_norm": 3.625, "learning_rate": 0.00024591771892139817, "loss": 4.8875, "step": 7985 }, { "epoch": 3.5893980233602876, "grad_norm": 3.125, "learning_rate": 0.0002458339123478178, "loss": 4.8174, "step": 7990 }, { "epoch": 3.591644204851752, "grad_norm": 3.046875, "learning_rate": 0.00024575005718339255, "loss": 4.7648, "step": 7995 }, { "epoch": 3.5938903863432166, "grad_norm": 3.15625, "learning_rate": 0.00024566615347852965, "loss": 4.8448, "step": 8000 }, { "epoch": 3.5938903863432166, "eval_loss": 4.903536796569824, "eval_runtime": 16.0594, "eval_samples_per_second": 1931.145, "eval_steps_per_second": 241.416, "step": 8000 }, { "epoch": 3.596136567834681, "grad_norm": 3.265625, "learning_rate": 0.0002455822012836651, "loss": 4.8178, "step": 8005 }, { "epoch": 3.5983827493261455, "grad_norm": 3.015625, "learning_rate": 0.0002454982006492642, "loss": 4.7372, "step": 8010 }, { "epoch": 3.60062893081761, "grad_norm": 3.25, "learning_rate": 0.00024541415162582144, "loss": 4.8245, "step": 8015 }, { "epoch": 3.6028751123090745, "grad_norm": 3.421875, "learning_rate": 0.00024533005426386026, "loss": 4.8042, "step": 8020 }, { "epoch": 3.605121293800539, "grad_norm": 3.453125, "learning_rate": 0.0002452459086139333, "loss": 4.7305, "step": 8025 }, { "epoch": 3.6073674752920035, "grad_norm": 3.28125, "learning_rate": 0.0002451617147266221, "loss": 4.7429, "step": 8030 }, { "epoch": 3.609613656783468, "grad_norm": 2.96875, "learning_rate": 0.00024507747265253735, "loss": 4.8285, "step": 8035 }, { "epoch": 3.6118598382749325, "grad_norm": 3.234375, "learning_rate": 0.00024499318244231846, "loss": 4.7211, "step": 8040 }, { "epoch": 3.614106019766397, "grad_norm": 3.140625, "learning_rate": 0.00024490884414663406, "loss": 4.8138, "step": 8045 }, { "epoch": 3.6163522012578615, "grad_norm": 3.25, "learning_rate": 0.00024482445781618144, "loss": 4.782, "step": 8050 }, { "epoch": 3.618598382749326, "grad_norm": 3.46875, "learning_rate": 0.0002447400235016869, "loss": 4.823, "step": 8055 }, { "epoch": 3.620844564240791, "grad_norm": 3.5, "learning_rate": 0.00024465554125390566, "loss": 4.7987, "step": 8060 }, { "epoch": 3.623090745732255, "grad_norm": 3.09375, "learning_rate": 0.00024457101112362146, "loss": 4.8458, "step": 8065 }, { "epoch": 3.62533692722372, "grad_norm": 3.171875, "learning_rate": 0.00024448643316164715, "loss": 4.8168, "step": 8070 }, { "epoch": 3.627583108715184, "grad_norm": 3.265625, "learning_rate": 0.0002444018074188242, "loss": 4.819, "step": 8075 }, { "epoch": 3.629829290206649, "grad_norm": 3.203125, "learning_rate": 0.00024431713394602276, "loss": 4.8063, "step": 8080 }, { "epoch": 3.632075471698113, "grad_norm": 3.34375, "learning_rate": 0.0002442324127941417, "loss": 4.7975, "step": 8085 }, { "epoch": 3.634321653189578, "grad_norm": 3.25, "learning_rate": 0.00024414764401410854, "loss": 4.7946, "step": 8090 }, { "epoch": 3.6365678346810424, "grad_norm": 3.234375, "learning_rate": 0.00024406282765687952, "loss": 4.8146, "step": 8095 }, { "epoch": 3.638814016172507, "grad_norm": 3.3125, "learning_rate": 0.00024397796377343938, "loss": 4.8036, "step": 8100 }, { "epoch": 3.6410601976639714, "grad_norm": 3.25, "learning_rate": 0.00024389305241480144, "loss": 4.8005, "step": 8105 }, { "epoch": 3.643306379155436, "grad_norm": 3.28125, "learning_rate": 0.00024380809363200756, "loss": 4.83, "step": 8110 }, { "epoch": 3.6455525606469004, "grad_norm": 3.1875, "learning_rate": 0.0002437230874761282, "loss": 4.7698, "step": 8115 }, { "epoch": 3.647798742138365, "grad_norm": 3.328125, "learning_rate": 0.00024363803399826217, "loss": 4.7716, "step": 8120 }, { "epoch": 3.6500449236298294, "grad_norm": 3.171875, "learning_rate": 0.0002435529332495368, "loss": 4.7791, "step": 8125 }, { "epoch": 3.652291105121294, "grad_norm": 3.234375, "learning_rate": 0.0002434677852811078, "loss": 4.8309, "step": 8130 }, { "epoch": 3.6545372866127583, "grad_norm": 3.4375, "learning_rate": 0.00024338259014415923, "loss": 4.8274, "step": 8135 }, { "epoch": 3.656783468104223, "grad_norm": 3.265625, "learning_rate": 0.00024329734788990366, "loss": 4.7547, "step": 8140 }, { "epoch": 3.6590296495956873, "grad_norm": 3.265625, "learning_rate": 0.00024321205856958178, "loss": 4.7754, "step": 8145 }, { "epoch": 3.661275831087152, "grad_norm": 3.171875, "learning_rate": 0.00024312672223446272, "loss": 4.765, "step": 8150 }, { "epoch": 3.6635220125786163, "grad_norm": 3.1875, "learning_rate": 0.0002430413389358438, "loss": 4.7664, "step": 8155 }, { "epoch": 3.665768194070081, "grad_norm": 3.203125, "learning_rate": 0.00024295590872505055, "loss": 4.756, "step": 8160 }, { "epoch": 3.6680143755615453, "grad_norm": 3.328125, "learning_rate": 0.0002428704316534368, "loss": 4.7967, "step": 8165 }, { "epoch": 3.67026055705301, "grad_norm": 3.421875, "learning_rate": 0.00024278490777238448, "loss": 4.7841, "step": 8170 }, { "epoch": 3.6725067385444743, "grad_norm": 3.140625, "learning_rate": 0.0002426993371333037, "loss": 4.7319, "step": 8175 }, { "epoch": 3.674752920035939, "grad_norm": 3.296875, "learning_rate": 0.0002426137197876325, "loss": 4.8211, "step": 8180 }, { "epoch": 3.6769991015274033, "grad_norm": 3.09375, "learning_rate": 0.00024252805578683733, "loss": 4.7469, "step": 8185 }, { "epoch": 3.6792452830188678, "grad_norm": 3.28125, "learning_rate": 0.00024244234518241235, "loss": 4.7779, "step": 8190 }, { "epoch": 3.6814914645103327, "grad_norm": 3.296875, "learning_rate": 0.00024235658802587996, "loss": 4.7694, "step": 8195 }, { "epoch": 3.6837376460017968, "grad_norm": 3.171875, "learning_rate": 0.00024227078436879043, "loss": 4.8198, "step": 8200 }, { "epoch": 3.6859838274932617, "grad_norm": 3.875, "learning_rate": 0.00024218493426272203, "loss": 4.8039, "step": 8205 }, { "epoch": 3.6882300089847257, "grad_norm": 3.375, "learning_rate": 0.00024209903775928093, "loss": 4.758, "step": 8210 }, { "epoch": 3.6904761904761907, "grad_norm": 3.03125, "learning_rate": 0.0002420130949101012, "loss": 4.7523, "step": 8215 }, { "epoch": 3.6927223719676547, "grad_norm": 3.34375, "learning_rate": 0.00024192710576684476, "loss": 4.7552, "step": 8220 }, { "epoch": 3.6949685534591197, "grad_norm": 3.125, "learning_rate": 0.00024184107038120137, "loss": 4.8155, "step": 8225 }, { "epoch": 3.697214734950584, "grad_norm": 3.046875, "learning_rate": 0.00024175498880488856, "loss": 4.7569, "step": 8230 }, { "epoch": 3.6994609164420487, "grad_norm": 2.9375, "learning_rate": 0.00024166886108965168, "loss": 4.7675, "step": 8235 }, { "epoch": 3.701707097933513, "grad_norm": 3.8125, "learning_rate": 0.00024158268728726375, "loss": 4.7861, "step": 8240 }, { "epoch": 3.7039532794249777, "grad_norm": 3.3125, "learning_rate": 0.0002414964674495256, "loss": 4.8087, "step": 8245 }, { "epoch": 3.706199460916442, "grad_norm": 4.71875, "learning_rate": 0.00024141020162826558, "loss": 4.8768, "step": 8250 }, { "epoch": 3.7084456424079066, "grad_norm": 3.359375, "learning_rate": 0.0002413238898753398, "loss": 4.812, "step": 8255 }, { "epoch": 3.710691823899371, "grad_norm": 3.46875, "learning_rate": 0.00024123753224263193, "loss": 4.8085, "step": 8260 }, { "epoch": 3.7129380053908356, "grad_norm": 3.0625, "learning_rate": 0.00024115112878205321, "loss": 4.7944, "step": 8265 }, { "epoch": 3.7151841868823, "grad_norm": 3.21875, "learning_rate": 0.00024106467954554254, "loss": 4.7821, "step": 8270 }, { "epoch": 3.7174303683737646, "grad_norm": 3.140625, "learning_rate": 0.0002409781845850661, "loss": 4.7904, "step": 8275 }, { "epoch": 3.719676549865229, "grad_norm": 3.015625, "learning_rate": 0.00024089164395261784, "loss": 4.7672, "step": 8280 }, { "epoch": 3.7219227313566936, "grad_norm": 3.078125, "learning_rate": 0.0002408050577002189, "loss": 4.7801, "step": 8285 }, { "epoch": 3.724168912848158, "grad_norm": 3.125, "learning_rate": 0.00024071842587991806, "loss": 4.8116, "step": 8290 }, { "epoch": 3.7264150943396226, "grad_norm": 3.125, "learning_rate": 0.00024063174854379145, "loss": 4.8041, "step": 8295 }, { "epoch": 3.728661275831087, "grad_norm": 3.34375, "learning_rate": 0.00024054502574394235, "loss": 4.8149, "step": 8300 }, { "epoch": 3.7309074573225516, "grad_norm": 3.15625, "learning_rate": 0.0002404582575325016, "loss": 4.7656, "step": 8305 }, { "epoch": 3.733153638814016, "grad_norm": 3.046875, "learning_rate": 0.00024037144396162733, "loss": 4.8151, "step": 8310 }, { "epoch": 3.7353998203054806, "grad_norm": 3.390625, "learning_rate": 0.00024028458508350484, "loss": 4.836, "step": 8315 }, { "epoch": 3.737646001796945, "grad_norm": 3.28125, "learning_rate": 0.00024019768095034664, "loss": 4.7813, "step": 8320 }, { "epoch": 3.7398921832884096, "grad_norm": 3.234375, "learning_rate": 0.00024011073161439255, "loss": 4.8202, "step": 8325 }, { "epoch": 3.742138364779874, "grad_norm": 3.375, "learning_rate": 0.00024002373712790956, "loss": 4.7586, "step": 8330 }, { "epoch": 3.7443845462713385, "grad_norm": 3.296875, "learning_rate": 0.0002399366975431917, "loss": 4.7871, "step": 8335 }, { "epoch": 3.7466307277628035, "grad_norm": 3.28125, "learning_rate": 0.00023984961291256018, "loss": 4.7348, "step": 8340 }, { "epoch": 3.7488769092542675, "grad_norm": 3.171875, "learning_rate": 0.00023976248328836327, "loss": 4.7715, "step": 8345 }, { "epoch": 3.7511230907457325, "grad_norm": 3.421875, "learning_rate": 0.00023967530872297623, "loss": 4.76, "step": 8350 }, { "epoch": 3.7533692722371965, "grad_norm": 3.15625, "learning_rate": 0.0002395880892688015, "loss": 4.7845, "step": 8355 }, { "epoch": 3.7556154537286615, "grad_norm": 3.203125, "learning_rate": 0.00023950082497826842, "loss": 4.8077, "step": 8360 }, { "epoch": 3.757861635220126, "grad_norm": 3.265625, "learning_rate": 0.00023941351590383314, "loss": 4.7349, "step": 8365 }, { "epoch": 3.7601078167115904, "grad_norm": 3.859375, "learning_rate": 0.0002393261620979789, "loss": 4.7692, "step": 8370 }, { "epoch": 3.762353998203055, "grad_norm": 3.171875, "learning_rate": 0.00023923876361321583, "loss": 4.7812, "step": 8375 }, { "epoch": 3.7646001796945194, "grad_norm": 3.1875, "learning_rate": 0.0002391513205020808, "loss": 4.786, "step": 8380 }, { "epoch": 3.766846361185984, "grad_norm": 3.671875, "learning_rate": 0.00023906383281713757, "loss": 4.8124, "step": 8385 }, { "epoch": 3.7690925426774484, "grad_norm": 3.171875, "learning_rate": 0.00023897630061097677, "loss": 4.8414, "step": 8390 }, { "epoch": 3.771338724168913, "grad_norm": 3.1875, "learning_rate": 0.00023888872393621564, "loss": 4.7914, "step": 8395 }, { "epoch": 3.7735849056603774, "grad_norm": 3.40625, "learning_rate": 0.00023880110284549828, "loss": 4.8036, "step": 8400 }, { "epoch": 3.775831087151842, "grad_norm": 3.171875, "learning_rate": 0.0002387134373914954, "loss": 4.7247, "step": 8405 }, { "epoch": 3.7780772686433064, "grad_norm": 3.28125, "learning_rate": 0.00023862572762690452, "loss": 4.8149, "step": 8410 }, { "epoch": 3.780323450134771, "grad_norm": 3.28125, "learning_rate": 0.0002385379736044496, "loss": 4.7492, "step": 8415 }, { "epoch": 3.7825696316262354, "grad_norm": 3.46875, "learning_rate": 0.00023845017537688125, "loss": 4.763, "step": 8420 }, { "epoch": 3.7848158131177, "grad_norm": 3.15625, "learning_rate": 0.00023836233299697685, "loss": 4.7334, "step": 8425 }, { "epoch": 3.7870619946091644, "grad_norm": 3.484375, "learning_rate": 0.00023827444651754005, "loss": 4.7934, "step": 8430 }, { "epoch": 3.789308176100629, "grad_norm": 3.34375, "learning_rate": 0.00023818651599140115, "loss": 4.8119, "step": 8435 }, { "epoch": 3.7915543575920934, "grad_norm": 3.453125, "learning_rate": 0.00023809854147141695, "loss": 4.7902, "step": 8440 }, { "epoch": 3.793800539083558, "grad_norm": 3.65625, "learning_rate": 0.00023801052301047063, "loss": 4.8311, "step": 8445 }, { "epoch": 3.7960467205750223, "grad_norm": 3.390625, "learning_rate": 0.00023792246066147186, "loss": 4.7616, "step": 8450 }, { "epoch": 3.798292902066487, "grad_norm": 3.03125, "learning_rate": 0.00023783435447735657, "loss": 4.7668, "step": 8455 }, { "epoch": 3.8005390835579513, "grad_norm": 3.21875, "learning_rate": 0.00023774620451108707, "loss": 4.7531, "step": 8460 }, { "epoch": 3.802785265049416, "grad_norm": 3.203125, "learning_rate": 0.00023765801081565213, "loss": 4.8713, "step": 8465 }, { "epoch": 3.8050314465408803, "grad_norm": 3.140625, "learning_rate": 0.00023756977344406663, "loss": 4.878, "step": 8470 }, { "epoch": 3.8072776280323453, "grad_norm": 3.40625, "learning_rate": 0.00023748149244937186, "loss": 4.7309, "step": 8475 }, { "epoch": 3.8095238095238093, "grad_norm": 3.109375, "learning_rate": 0.00023739316788463517, "loss": 4.7774, "step": 8480 }, { "epoch": 3.8117699910152743, "grad_norm": 3.109375, "learning_rate": 0.00023730479980295022, "loss": 4.7595, "step": 8485 }, { "epoch": 3.8140161725067383, "grad_norm": 3.296875, "learning_rate": 0.0002372163882574368, "loss": 4.7536, "step": 8490 }, { "epoch": 3.8162623539982032, "grad_norm": 3.3125, "learning_rate": 0.00023712793330124077, "loss": 4.7914, "step": 8495 }, { "epoch": 3.8185085354896673, "grad_norm": 3.3125, "learning_rate": 0.00023703943498753417, "loss": 4.7724, "step": 8500 }, { "epoch": 3.8207547169811322, "grad_norm": 3.03125, "learning_rate": 0.00023695089336951507, "loss": 4.7454, "step": 8505 }, { "epoch": 3.8230008984725967, "grad_norm": 3.21875, "learning_rate": 0.00023686230850040758, "loss": 4.7626, "step": 8510 }, { "epoch": 3.825247079964061, "grad_norm": 3.296875, "learning_rate": 0.00023677368043346174, "loss": 4.773, "step": 8515 }, { "epoch": 3.8274932614555257, "grad_norm": 3.1875, "learning_rate": 0.0002366850092219537, "loss": 4.7775, "step": 8520 }, { "epoch": 3.82973944294699, "grad_norm": 3.09375, "learning_rate": 0.00023659629491918534, "loss": 4.7524, "step": 8525 }, { "epoch": 3.8319856244384547, "grad_norm": 3.15625, "learning_rate": 0.0002365075375784847, "loss": 4.783, "step": 8530 }, { "epoch": 3.834231805929919, "grad_norm": 3.28125, "learning_rate": 0.00023641873725320544, "loss": 4.7749, "step": 8535 }, { "epoch": 3.8364779874213837, "grad_norm": 3.21875, "learning_rate": 0.0002363298939967272, "loss": 4.7647, "step": 8540 }, { "epoch": 3.838724168912848, "grad_norm": 3.40625, "learning_rate": 0.00023624100786245547, "loss": 4.7626, "step": 8545 }, { "epoch": 3.8409703504043127, "grad_norm": 3.140625, "learning_rate": 0.0002361520789038213, "loss": 4.7869, "step": 8550 }, { "epoch": 3.843216531895777, "grad_norm": 3.34375, "learning_rate": 0.00023606310717428177, "loss": 4.8144, "step": 8555 }, { "epoch": 3.8454627133872417, "grad_norm": 3.015625, "learning_rate": 0.00023597409272731946, "loss": 4.791, "step": 8560 }, { "epoch": 3.847708894878706, "grad_norm": 3.0625, "learning_rate": 0.00023588503561644268, "loss": 4.86, "step": 8565 }, { "epoch": 3.8499550763701706, "grad_norm": 3.046875, "learning_rate": 0.0002357959358951854, "loss": 4.7741, "step": 8570 }, { "epoch": 3.852201257861635, "grad_norm": 3.234375, "learning_rate": 0.00023570679361710728, "loss": 4.7514, "step": 8575 }, { "epoch": 3.8544474393530996, "grad_norm": 3.296875, "learning_rate": 0.0002356176088357934, "loss": 4.7896, "step": 8580 }, { "epoch": 3.856693620844564, "grad_norm": 3.171875, "learning_rate": 0.00023552838160485453, "loss": 4.7116, "step": 8585 }, { "epoch": 3.8589398023360286, "grad_norm": 3.6875, "learning_rate": 0.00023543911197792682, "loss": 4.7938, "step": 8590 }, { "epoch": 3.861185983827493, "grad_norm": 3.296875, "learning_rate": 0.0002353498000086721, "loss": 4.7324, "step": 8595 }, { "epoch": 3.8634321653189576, "grad_norm": 3.234375, "learning_rate": 0.00023526044575077743, "loss": 4.7512, "step": 8600 }, { "epoch": 3.865678346810422, "grad_norm": 3.484375, "learning_rate": 0.0002351710492579555, "loss": 4.7148, "step": 8605 }, { "epoch": 3.867924528301887, "grad_norm": 3.09375, "learning_rate": 0.00023508161058394424, "loss": 4.7609, "step": 8610 }, { "epoch": 3.870170709793351, "grad_norm": 3.40625, "learning_rate": 0.00023499212978250696, "loss": 4.8106, "step": 8615 }, { "epoch": 3.872416891284816, "grad_norm": 3.234375, "learning_rate": 0.00023490260690743235, "loss": 4.8064, "step": 8620 }, { "epoch": 3.87466307277628, "grad_norm": 3.34375, "learning_rate": 0.00023481304201253438, "loss": 4.8099, "step": 8625 }, { "epoch": 3.876909254267745, "grad_norm": 3.46875, "learning_rate": 0.00023472343515165223, "loss": 4.8328, "step": 8630 }, { "epoch": 3.879155435759209, "grad_norm": 3.109375, "learning_rate": 0.00023463378637865036, "loss": 4.8231, "step": 8635 }, { "epoch": 3.881401617250674, "grad_norm": 3.125, "learning_rate": 0.00023454409574741843, "loss": 4.7911, "step": 8640 }, { "epoch": 3.8836477987421385, "grad_norm": 3.078125, "learning_rate": 0.00023445436331187108, "loss": 4.7646, "step": 8645 }, { "epoch": 3.885893980233603, "grad_norm": 3.21875, "learning_rate": 0.0002343645891259484, "loss": 4.8198, "step": 8650 }, { "epoch": 3.8881401617250675, "grad_norm": 3.375, "learning_rate": 0.00023427477324361532, "loss": 4.7684, "step": 8655 }, { "epoch": 3.890386343216532, "grad_norm": 3.21875, "learning_rate": 0.00023418491571886198, "loss": 4.7267, "step": 8660 }, { "epoch": 3.8926325247079965, "grad_norm": 3.203125, "learning_rate": 0.0002340950166057034, "loss": 4.8222, "step": 8665 }, { "epoch": 3.894878706199461, "grad_norm": 3.359375, "learning_rate": 0.0002340050759581798, "loss": 4.8394, "step": 8670 }, { "epoch": 3.8971248876909255, "grad_norm": 2.9375, "learning_rate": 0.00023391509383035618, "loss": 4.7634, "step": 8675 }, { "epoch": 3.89937106918239, "grad_norm": 3.25, "learning_rate": 0.00023382507027632264, "loss": 4.6996, "step": 8680 }, { "epoch": 3.9016172506738545, "grad_norm": 3.0, "learning_rate": 0.00023373500535019403, "loss": 4.8458, "step": 8685 }, { "epoch": 3.903863432165319, "grad_norm": 3.25, "learning_rate": 0.00023364489910611018, "loss": 4.7772, "step": 8690 }, { "epoch": 3.9061096136567834, "grad_norm": 3.15625, "learning_rate": 0.00023355475159823568, "loss": 4.8331, "step": 8695 }, { "epoch": 3.908355795148248, "grad_norm": 3.1875, "learning_rate": 0.00023346456288075995, "loss": 4.8063, "step": 8700 }, { "epoch": 3.9106019766397124, "grad_norm": 3.59375, "learning_rate": 0.00023337433300789725, "loss": 4.7402, "step": 8705 }, { "epoch": 3.912848158131177, "grad_norm": 3.046875, "learning_rate": 0.00023328406203388646, "loss": 4.7966, "step": 8710 }, { "epoch": 3.9150943396226414, "grad_norm": 3.5, "learning_rate": 0.00023319375001299125, "loss": 4.7583, "step": 8715 }, { "epoch": 3.917340521114106, "grad_norm": 3.15625, "learning_rate": 0.00023310339699949995, "loss": 4.8278, "step": 8720 }, { "epoch": 3.9195867026055704, "grad_norm": 3.359375, "learning_rate": 0.0002330130030477255, "loss": 4.7303, "step": 8725 }, { "epoch": 3.921832884097035, "grad_norm": 3.109375, "learning_rate": 0.00023292256821200546, "loss": 4.8432, "step": 8730 }, { "epoch": 3.9240790655884994, "grad_norm": 3.1875, "learning_rate": 0.00023283209254670203, "loss": 4.7292, "step": 8735 }, { "epoch": 3.926325247079964, "grad_norm": 3.265625, "learning_rate": 0.00023274157610620187, "loss": 4.7701, "step": 8740 }, { "epoch": 3.928571428571429, "grad_norm": 3.28125, "learning_rate": 0.00023265101894491623, "loss": 4.7612, "step": 8745 }, { "epoch": 3.930817610062893, "grad_norm": 4.0, "learning_rate": 0.0002325604211172807, "loss": 4.8229, "step": 8750 }, { "epoch": 3.933063791554358, "grad_norm": 3.21875, "learning_rate": 0.00023246978267775546, "loss": 4.7351, "step": 8755 }, { "epoch": 3.935309973045822, "grad_norm": 3.203125, "learning_rate": 0.00023237910368082503, "loss": 4.8045, "step": 8760 }, { "epoch": 3.937556154537287, "grad_norm": 3.40625, "learning_rate": 0.0002322883841809983, "loss": 4.8242, "step": 8765 }, { "epoch": 3.939802336028751, "grad_norm": 3.234375, "learning_rate": 0.00023219762423280863, "loss": 4.7605, "step": 8770 }, { "epoch": 3.942048517520216, "grad_norm": 3.15625, "learning_rate": 0.00023210682389081355, "loss": 4.7575, "step": 8775 }, { "epoch": 3.9442946990116803, "grad_norm": 3.15625, "learning_rate": 0.00023201598320959487, "loss": 4.7075, "step": 8780 }, { "epoch": 3.9465408805031448, "grad_norm": 3.46875, "learning_rate": 0.00023192510224375875, "loss": 4.7283, "step": 8785 }, { "epoch": 3.9487870619946093, "grad_norm": 3.203125, "learning_rate": 0.00023183418104793548, "loss": 4.7323, "step": 8790 }, { "epoch": 3.9510332434860738, "grad_norm": 3.21875, "learning_rate": 0.00023174321967677958, "loss": 4.7524, "step": 8795 }, { "epoch": 3.9532794249775383, "grad_norm": 3.265625, "learning_rate": 0.00023165221818496976, "loss": 4.7401, "step": 8800 }, { "epoch": 3.9555256064690028, "grad_norm": 3.421875, "learning_rate": 0.00023156117662720876, "loss": 4.8213, "step": 8805 }, { "epoch": 3.9577717879604672, "grad_norm": 3.375, "learning_rate": 0.0002314700950582234, "loss": 4.778, "step": 8810 }, { "epoch": 3.9600179694519317, "grad_norm": 3.3125, "learning_rate": 0.00023137897353276468, "loss": 4.7286, "step": 8815 }, { "epoch": 3.9622641509433962, "grad_norm": 3.296875, "learning_rate": 0.0002312878121056074, "loss": 4.7604, "step": 8820 }, { "epoch": 3.9645103324348607, "grad_norm": 3.578125, "learning_rate": 0.00023119661083155057, "loss": 4.7712, "step": 8825 }, { "epoch": 3.9667565139263252, "grad_norm": 3.546875, "learning_rate": 0.0002311053697654171, "loss": 4.7624, "step": 8830 }, { "epoch": 3.9690026954177897, "grad_norm": 3.109375, "learning_rate": 0.00023101408896205366, "loss": 4.7509, "step": 8835 }, { "epoch": 3.971248876909254, "grad_norm": 3.015625, "learning_rate": 0.00023092276847633101, "loss": 4.8025, "step": 8840 }, { "epoch": 3.9734950584007187, "grad_norm": 3.234375, "learning_rate": 0.00023083140836314367, "loss": 4.8212, "step": 8845 }, { "epoch": 3.975741239892183, "grad_norm": 3.203125, "learning_rate": 0.00023074000867740995, "loss": 4.6859, "step": 8850 }, { "epoch": 3.9779874213836477, "grad_norm": 3.390625, "learning_rate": 0.000230648569474072, "loss": 4.8425, "step": 8855 }, { "epoch": 3.980233602875112, "grad_norm": 3.375, "learning_rate": 0.0002305570908080957, "loss": 4.7836, "step": 8860 }, { "epoch": 3.9824797843665767, "grad_norm": 3.453125, "learning_rate": 0.00023046557273447075, "loss": 4.8095, "step": 8865 }, { "epoch": 3.984725965858041, "grad_norm": 3.296875, "learning_rate": 0.00023037401530821042, "loss": 4.772, "step": 8870 }, { "epoch": 3.9869721473495057, "grad_norm": 3.484375, "learning_rate": 0.00023028241858435154, "loss": 4.7742, "step": 8875 }, { "epoch": 3.9892183288409706, "grad_norm": 3.375, "learning_rate": 0.0002301907826179548, "loss": 4.7964, "step": 8880 }, { "epoch": 3.9914645103324347, "grad_norm": 3.25, "learning_rate": 0.00023009910746410442, "loss": 4.7904, "step": 8885 }, { "epoch": 3.9937106918238996, "grad_norm": 3.015625, "learning_rate": 0.00023000739317790805, "loss": 4.8029, "step": 8890 }, { "epoch": 3.9959568733153636, "grad_norm": 3.421875, "learning_rate": 0.00022991563981449693, "loss": 4.721, "step": 8895 }, { "epoch": 3.9982030548068286, "grad_norm": 3.109375, "learning_rate": 0.00022982384742902586, "loss": 4.7919, "step": 8900 }, { "epoch": 4.000449236298293, "grad_norm": 3.296875, "learning_rate": 0.00022973201607667297, "loss": 4.8092, "step": 8905 }, { "epoch": 4.002695417789758, "grad_norm": 3.28125, "learning_rate": 0.00022964014581263993, "loss": 4.6842, "step": 8910 }, { "epoch": 4.004941599281222, "grad_norm": 3.28125, "learning_rate": 0.0002295482366921517, "loss": 4.7022, "step": 8915 }, { "epoch": 4.007187780772687, "grad_norm": 3.203125, "learning_rate": 0.00022945628877045675, "loss": 4.6657, "step": 8920 }, { "epoch": 4.009433962264151, "grad_norm": 3.21875, "learning_rate": 0.00022936430210282674, "loss": 4.6306, "step": 8925 }, { "epoch": 4.0116801437556155, "grad_norm": 3.3125, "learning_rate": 0.00022927227674455653, "loss": 4.6586, "step": 8930 }, { "epoch": 4.01392632524708, "grad_norm": 3.265625, "learning_rate": 0.0002291802127509645, "loss": 4.7236, "step": 8935 }, { "epoch": 4.0161725067385445, "grad_norm": 3.296875, "learning_rate": 0.0002290881101773921, "loss": 4.7126, "step": 8940 }, { "epoch": 4.018418688230009, "grad_norm": 3.25, "learning_rate": 0.00022899596907920389, "loss": 4.7159, "step": 8945 }, { "epoch": 4.0206648697214735, "grad_norm": 3.46875, "learning_rate": 0.0002289037895117878, "loss": 4.7184, "step": 8950 }, { "epoch": 4.022911051212938, "grad_norm": 3.109375, "learning_rate": 0.0002288115715305547, "loss": 4.6531, "step": 8955 }, { "epoch": 4.0251572327044025, "grad_norm": 3.34375, "learning_rate": 0.00022871931519093867, "loss": 4.6687, "step": 8960 }, { "epoch": 4.0274034141958674, "grad_norm": 3.21875, "learning_rate": 0.00022862702054839674, "loss": 4.6754, "step": 8965 }, { "epoch": 4.0296495956873315, "grad_norm": 3.265625, "learning_rate": 0.00022853468765840907, "loss": 4.7094, "step": 8970 }, { "epoch": 4.031895777178796, "grad_norm": 3.25, "learning_rate": 0.00022844231657647874, "loss": 4.6216, "step": 8975 }, { "epoch": 4.0341419586702605, "grad_norm": 3.109375, "learning_rate": 0.00022834990735813186, "loss": 4.677, "step": 8980 }, { "epoch": 4.036388140161725, "grad_norm": 3.40625, "learning_rate": 0.0002282574600589174, "loss": 4.6951, "step": 8985 }, { "epoch": 4.0386343216531895, "grad_norm": 3.15625, "learning_rate": 0.00022816497473440717, "loss": 4.7113, "step": 8990 }, { "epoch": 4.040880503144654, "grad_norm": 3.4375, "learning_rate": 0.00022807245144019594, "loss": 4.6925, "step": 8995 }, { "epoch": 4.0431266846361185, "grad_norm": 3.328125, "learning_rate": 0.00022797989023190133, "loss": 4.6086, "step": 9000 }, { "epoch": 4.0431266846361185, "eval_loss": 4.870049476623535, "eval_runtime": 16.2198, "eval_samples_per_second": 1912.043, "eval_steps_per_second": 239.028, "step": 9000 }, { "epoch": 4.045372866127583, "grad_norm": 3.71875, "learning_rate": 0.00022788729116516364, "loss": 4.732, "step": 9005 }, { "epoch": 4.0476190476190474, "grad_norm": 3.4375, "learning_rate": 0.000227794654295646, "loss": 4.6506, "step": 9010 }, { "epoch": 4.049865229110512, "grad_norm": 3.171875, "learning_rate": 0.0002277019796790342, "loss": 4.6555, "step": 9015 }, { "epoch": 4.052111410601976, "grad_norm": 3.46875, "learning_rate": 0.00022760926737103683, "loss": 4.6926, "step": 9020 }, { "epoch": 4.054357592093441, "grad_norm": 3.28125, "learning_rate": 0.00022751651742738502, "loss": 4.7167, "step": 9025 }, { "epoch": 4.056603773584905, "grad_norm": 3.421875, "learning_rate": 0.00022742372990383261, "loss": 4.6789, "step": 9030 }, { "epoch": 4.05884995507637, "grad_norm": 3.3125, "learning_rate": 0.00022733090485615594, "loss": 4.693, "step": 9035 }, { "epoch": 4.061096136567834, "grad_norm": 3.140625, "learning_rate": 0.00022723804234015403, "loss": 4.6121, "step": 9040 }, { "epoch": 4.063342318059299, "grad_norm": 3.15625, "learning_rate": 0.00022714514241164825, "loss": 4.6792, "step": 9045 }, { "epoch": 4.065588499550763, "grad_norm": 3.3125, "learning_rate": 0.00022705220512648266, "loss": 4.7157, "step": 9050 }, { "epoch": 4.067834681042228, "grad_norm": 3.203125, "learning_rate": 0.0002269592305405237, "loss": 4.6604, "step": 9055 }, { "epoch": 4.070080862533692, "grad_norm": 3.46875, "learning_rate": 0.00022686621870966013, "loss": 4.7952, "step": 9060 }, { "epoch": 4.072327044025157, "grad_norm": 3.25, "learning_rate": 0.0002267731696898032, "loss": 4.6736, "step": 9065 }, { "epoch": 4.074573225516621, "grad_norm": 3.09375, "learning_rate": 0.0002266800835368865, "loss": 4.6698, "step": 9070 }, { "epoch": 4.076819407008086, "grad_norm": 3.515625, "learning_rate": 0.00022658696030686598, "loss": 4.7083, "step": 9075 }, { "epoch": 4.07906558849955, "grad_norm": 3.3125, "learning_rate": 0.00022649380005571975, "loss": 4.6677, "step": 9080 }, { "epoch": 4.081311769991015, "grad_norm": 3.375, "learning_rate": 0.0002264006028394483, "loss": 4.6811, "step": 9085 }, { "epoch": 4.083557951482479, "grad_norm": 3.1875, "learning_rate": 0.00022630736871407436, "loss": 4.7013, "step": 9090 }, { "epoch": 4.085804132973944, "grad_norm": 3.265625, "learning_rate": 0.00022621409773564269, "loss": 4.6296, "step": 9095 }, { "epoch": 4.088050314465409, "grad_norm": 3.453125, "learning_rate": 0.00022612078996022032, "loss": 4.7159, "step": 9100 }, { "epoch": 4.090296495956873, "grad_norm": 3.1875, "learning_rate": 0.0002260274454438964, "loss": 4.6755, "step": 9105 }, { "epoch": 4.092542677448338, "grad_norm": 3.328125, "learning_rate": 0.00022593406424278214, "loss": 4.6946, "step": 9110 }, { "epoch": 4.094788858939802, "grad_norm": 3.5625, "learning_rate": 0.0002258406464130108, "loss": 4.7235, "step": 9115 }, { "epoch": 4.097035040431267, "grad_norm": 3.1875, "learning_rate": 0.00022574719201073765, "loss": 4.6733, "step": 9120 }, { "epoch": 4.099281221922731, "grad_norm": 3.34375, "learning_rate": 0.00022565370109214, "loss": 4.6833, "step": 9125 }, { "epoch": 4.101527403414196, "grad_norm": 3.578125, "learning_rate": 0.00022556017371341703, "loss": 4.6202, "step": 9130 }, { "epoch": 4.10377358490566, "grad_norm": 3.265625, "learning_rate": 0.0002254666099307899, "loss": 4.7795, "step": 9135 }, { "epoch": 4.106019766397125, "grad_norm": 3.3125, "learning_rate": 0.00022537300980050157, "loss": 4.6459, "step": 9140 }, { "epoch": 4.108265947888589, "grad_norm": 3.265625, "learning_rate": 0.00022527937337881698, "loss": 4.7103, "step": 9145 }, { "epoch": 4.110512129380054, "grad_norm": 3.25, "learning_rate": 0.0002251857007220228, "loss": 4.6682, "step": 9150 }, { "epoch": 4.112758310871518, "grad_norm": 3.34375, "learning_rate": 0.00022509199188642747, "loss": 4.7415, "step": 9155 }, { "epoch": 4.115004492362983, "grad_norm": 3.140625, "learning_rate": 0.00022499824692836124, "loss": 4.688, "step": 9160 }, { "epoch": 4.117250673854447, "grad_norm": 3.28125, "learning_rate": 0.00022490446590417594, "loss": 4.654, "step": 9165 }, { "epoch": 4.119496855345912, "grad_norm": 3.265625, "learning_rate": 0.0002248106488702453, "loss": 4.7146, "step": 9170 }, { "epoch": 4.121743036837376, "grad_norm": 3.296875, "learning_rate": 0.00022471679588296456, "loss": 4.6695, "step": 9175 }, { "epoch": 4.123989218328841, "grad_norm": 3.234375, "learning_rate": 0.00022462290699875044, "loss": 4.712, "step": 9180 }, { "epoch": 4.126235399820305, "grad_norm": 3.140625, "learning_rate": 0.00022452898227404158, "loss": 4.6607, "step": 9185 }, { "epoch": 4.12848158131177, "grad_norm": 3.46875, "learning_rate": 0.00022443502176529783, "loss": 4.5974, "step": 9190 }, { "epoch": 4.130727762803234, "grad_norm": 3.3125, "learning_rate": 0.00022434102552900073, "loss": 4.6807, "step": 9195 }, { "epoch": 4.132973944294699, "grad_norm": 3.421875, "learning_rate": 0.0002242469936216533, "loss": 4.63, "step": 9200 }, { "epoch": 4.135220125786163, "grad_norm": 3.3125, "learning_rate": 0.00022415292609977988, "loss": 4.6643, "step": 9205 }, { "epoch": 4.137466307277628, "grad_norm": 3.375, "learning_rate": 0.00022405882301992637, "loss": 4.6353, "step": 9210 }, { "epoch": 4.139712488769092, "grad_norm": 3.265625, "learning_rate": 0.00022396468443865994, "loss": 4.6658, "step": 9215 }, { "epoch": 4.141958670260557, "grad_norm": 3.28125, "learning_rate": 0.00022387051041256907, "loss": 4.6706, "step": 9220 }, { "epoch": 4.144204851752021, "grad_norm": 3.125, "learning_rate": 0.00022377630099826366, "loss": 4.7247, "step": 9225 }, { "epoch": 4.146451033243486, "grad_norm": 3.1875, "learning_rate": 0.0002236820562523749, "loss": 4.6981, "step": 9230 }, { "epoch": 4.148697214734951, "grad_norm": 3.3125, "learning_rate": 0.00022358777623155505, "loss": 4.6588, "step": 9235 }, { "epoch": 4.150943396226415, "grad_norm": 3.15625, "learning_rate": 0.00022349346099247768, "loss": 4.7104, "step": 9240 }, { "epoch": 4.15318957771788, "grad_norm": 3.34375, "learning_rate": 0.00022339911059183763, "loss": 4.6564, "step": 9245 }, { "epoch": 4.155435759209344, "grad_norm": 3.34375, "learning_rate": 0.00022330472508635062, "loss": 4.6547, "step": 9250 }, { "epoch": 4.157681940700809, "grad_norm": 3.359375, "learning_rate": 0.0002232103045327537, "loss": 4.7364, "step": 9255 }, { "epoch": 4.159928122192273, "grad_norm": 3.25, "learning_rate": 0.00022311584898780494, "loss": 4.6378, "step": 9260 }, { "epoch": 4.162174303683738, "grad_norm": 3.25, "learning_rate": 0.00022302135850828337, "loss": 4.6988, "step": 9265 }, { "epoch": 4.164420485175202, "grad_norm": 3.25, "learning_rate": 0.00022292683315098904, "loss": 4.6526, "step": 9270 }, { "epoch": 4.166666666666667, "grad_norm": 3.484375, "learning_rate": 0.00022283227297274305, "loss": 4.6805, "step": 9275 }, { "epoch": 4.168912848158131, "grad_norm": 3.703125, "learning_rate": 0.00022273767803038727, "loss": 4.6883, "step": 9280 }, { "epoch": 4.171159029649596, "grad_norm": 3.390625, "learning_rate": 0.00022264304838078475, "loss": 4.6406, "step": 9285 }, { "epoch": 4.17340521114106, "grad_norm": 3.28125, "learning_rate": 0.00022254838408081908, "loss": 4.7056, "step": 9290 }, { "epoch": 4.175651392632525, "grad_norm": 3.515625, "learning_rate": 0.0002224536851873948, "loss": 4.7173, "step": 9295 }, { "epoch": 4.177897574123989, "grad_norm": 3.234375, "learning_rate": 0.00022235895175743743, "loss": 4.6716, "step": 9300 }, { "epoch": 4.180143755615454, "grad_norm": 3.53125, "learning_rate": 0.00022226418384789284, "loss": 4.6478, "step": 9305 }, { "epoch": 4.182389937106918, "grad_norm": 3.34375, "learning_rate": 0.00022216938151572814, "loss": 4.6902, "step": 9310 }, { "epoch": 4.184636118598383, "grad_norm": 3.46875, "learning_rate": 0.00022207454481793063, "loss": 4.694, "step": 9315 }, { "epoch": 4.186882300089847, "grad_norm": 3.21875, "learning_rate": 0.00022197967381150867, "loss": 4.6173, "step": 9320 }, { "epoch": 4.189128481581312, "grad_norm": 3.28125, "learning_rate": 0.00022188476855349102, "loss": 4.6479, "step": 9325 }, { "epoch": 4.191374663072776, "grad_norm": 3.53125, "learning_rate": 0.0002217898291009271, "loss": 4.7526, "step": 9330 }, { "epoch": 4.193620844564241, "grad_norm": 3.40625, "learning_rate": 0.00022169485551088678, "loss": 4.6498, "step": 9335 }, { "epoch": 4.195867026055705, "grad_norm": 3.640625, "learning_rate": 0.00022159984784046063, "loss": 4.6994, "step": 9340 }, { "epoch": 4.19811320754717, "grad_norm": 3.46875, "learning_rate": 0.00022150480614675962, "loss": 4.674, "step": 9345 }, { "epoch": 4.200359389038634, "grad_norm": 3.375, "learning_rate": 0.00022140973048691512, "loss": 4.6204, "step": 9350 }, { "epoch": 4.202605570530099, "grad_norm": 3.203125, "learning_rate": 0.00022131462091807904, "loss": 4.6888, "step": 9355 }, { "epoch": 4.204851752021563, "grad_norm": 3.328125, "learning_rate": 0.00022121947749742353, "loss": 4.7396, "step": 9360 }, { "epoch": 4.207097933513028, "grad_norm": 3.25, "learning_rate": 0.0002211243002821412, "loss": 4.7202, "step": 9365 }, { "epoch": 4.209344115004493, "grad_norm": 3.421875, "learning_rate": 0.00022102908932944488, "loss": 4.6249, "step": 9370 }, { "epoch": 4.211590296495957, "grad_norm": 3.140625, "learning_rate": 0.00022093384469656785, "loss": 4.6602, "step": 9375 }, { "epoch": 4.213836477987422, "grad_norm": 3.234375, "learning_rate": 0.00022083856644076338, "loss": 4.6307, "step": 9380 }, { "epoch": 4.216082659478886, "grad_norm": 3.40625, "learning_rate": 0.00022074325461930524, "loss": 4.678, "step": 9385 }, { "epoch": 4.218328840970351, "grad_norm": 3.0625, "learning_rate": 0.00022064790928948708, "loss": 4.6617, "step": 9390 }, { "epoch": 4.220575022461815, "grad_norm": 3.0625, "learning_rate": 0.00022055253050862295, "loss": 4.6534, "step": 9395 }, { "epoch": 4.22282120395328, "grad_norm": 3.265625, "learning_rate": 0.00022045711833404682, "loss": 4.6576, "step": 9400 }, { "epoch": 4.225067385444744, "grad_norm": 3.328125, "learning_rate": 0.0002203616728231129, "loss": 4.683, "step": 9405 }, { "epoch": 4.227313566936209, "grad_norm": 3.25, "learning_rate": 0.0002202661940331953, "loss": 4.7134, "step": 9410 }, { "epoch": 4.229559748427673, "grad_norm": 3.5, "learning_rate": 0.00022017068202168818, "loss": 4.6617, "step": 9415 }, { "epoch": 4.231805929919138, "grad_norm": 3.34375, "learning_rate": 0.0002200751368460057, "loss": 4.6808, "step": 9420 }, { "epoch": 4.234052111410602, "grad_norm": 3.265625, "learning_rate": 0.00021997955856358184, "loss": 4.6932, "step": 9425 }, { "epoch": 4.236298292902067, "grad_norm": 3.25, "learning_rate": 0.00021988394723187075, "loss": 4.6907, "step": 9430 }, { "epoch": 4.238544474393531, "grad_norm": 3.359375, "learning_rate": 0.00021978830290834614, "loss": 4.7321, "step": 9435 }, { "epoch": 4.240790655884996, "grad_norm": 3.453125, "learning_rate": 0.0002196926256505017, "loss": 4.714, "step": 9440 }, { "epoch": 4.24303683737646, "grad_norm": 3.28125, "learning_rate": 0.00021959691551585097, "loss": 4.7027, "step": 9445 }, { "epoch": 4.245283018867925, "grad_norm": 3.578125, "learning_rate": 0.0002195011725619271, "loss": 4.5976, "step": 9450 }, { "epoch": 4.247529200359389, "grad_norm": 3.234375, "learning_rate": 0.00021940539684628307, "loss": 4.629, "step": 9455 }, { "epoch": 4.249775381850854, "grad_norm": 3.546875, "learning_rate": 0.00021930958842649156, "loss": 4.6977, "step": 9460 }, { "epoch": 4.252021563342318, "grad_norm": 3.25, "learning_rate": 0.00021921374736014488, "loss": 4.6177, "step": 9465 }, { "epoch": 4.254267744833783, "grad_norm": 3.359375, "learning_rate": 0.00021911787370485497, "loss": 4.7463, "step": 9470 }, { "epoch": 4.256513926325247, "grad_norm": 4.34375, "learning_rate": 0.00021902196751825333, "loss": 4.6008, "step": 9475 }, { "epoch": 4.258760107816712, "grad_norm": 3.46875, "learning_rate": 0.0002189260288579911, "loss": 4.6715, "step": 9480 }, { "epoch": 4.261006289308176, "grad_norm": 3.109375, "learning_rate": 0.00021883005778173878, "loss": 4.692, "step": 9485 }, { "epoch": 4.263252470799641, "grad_norm": 3.265625, "learning_rate": 0.00021873405434718655, "loss": 4.7109, "step": 9490 }, { "epoch": 4.265498652291106, "grad_norm": 3.234375, "learning_rate": 0.00021863801861204393, "loss": 4.7958, "step": 9495 }, { "epoch": 4.26774483378257, "grad_norm": 3.484375, "learning_rate": 0.00021854195063403988, "loss": 4.682, "step": 9500 }, { "epoch": 4.269991015274034, "grad_norm": 3.3125, "learning_rate": 0.00021844585047092274, "loss": 4.6555, "step": 9505 }, { "epoch": 4.272237196765499, "grad_norm": 3.515625, "learning_rate": 0.00021834971818046018, "loss": 4.6723, "step": 9510 }, { "epoch": 4.274483378256964, "grad_norm": 3.53125, "learning_rate": 0.00021825355382043917, "loss": 4.688, "step": 9515 }, { "epoch": 4.276729559748428, "grad_norm": 3.25, "learning_rate": 0.0002181573574486661, "loss": 4.7326, "step": 9520 }, { "epoch": 4.2789757412398925, "grad_norm": 3.25, "learning_rate": 0.00021806112912296633, "loss": 4.6849, "step": 9525 }, { "epoch": 4.281221922731357, "grad_norm": 3.421875, "learning_rate": 0.00021796486890118474, "loss": 4.6588, "step": 9530 }, { "epoch": 4.2834681042228215, "grad_norm": 3.40625, "learning_rate": 0.00021786857684118514, "loss": 4.7288, "step": 9535 }, { "epoch": 4.285714285714286, "grad_norm": 3.21875, "learning_rate": 0.00021777225300085055, "loss": 4.749, "step": 9540 }, { "epoch": 4.2879604672057505, "grad_norm": 3.359375, "learning_rate": 0.0002176758974380832, "loss": 4.7648, "step": 9545 }, { "epoch": 4.290206648697215, "grad_norm": 3.59375, "learning_rate": 0.00021757951021080424, "loss": 4.7049, "step": 9550 }, { "epoch": 4.2924528301886795, "grad_norm": 3.25, "learning_rate": 0.00021748309137695394, "loss": 4.6978, "step": 9555 }, { "epoch": 4.294699011680144, "grad_norm": 3.4375, "learning_rate": 0.00021738664099449158, "loss": 4.7332, "step": 9560 }, { "epoch": 4.2969451931716085, "grad_norm": 3.25, "learning_rate": 0.0002172901591213953, "loss": 4.6581, "step": 9565 }, { "epoch": 4.2991913746630726, "grad_norm": 3.46875, "learning_rate": 0.00021719364581566225, "loss": 4.5986, "step": 9570 }, { "epoch": 4.3014375561545375, "grad_norm": 3.328125, "learning_rate": 0.00021709710113530851, "loss": 4.6496, "step": 9575 }, { "epoch": 4.3036837376460015, "grad_norm": 3.296875, "learning_rate": 0.00021700052513836892, "loss": 4.599, "step": 9580 }, { "epoch": 4.3059299191374665, "grad_norm": 3.59375, "learning_rate": 0.00021690391788289725, "loss": 4.6833, "step": 9585 }, { "epoch": 4.3081761006289305, "grad_norm": 3.4375, "learning_rate": 0.00021680727942696595, "loss": 4.6969, "step": 9590 }, { "epoch": 4.3104222821203955, "grad_norm": 3.515625, "learning_rate": 0.00021671060982866638, "loss": 4.6736, "step": 9595 }, { "epoch": 4.3126684636118595, "grad_norm": 3.140625, "learning_rate": 0.00021661390914610846, "loss": 4.7603, "step": 9600 }, { "epoch": 4.3149146451033245, "grad_norm": 3.28125, "learning_rate": 0.00021651717743742082, "loss": 4.6373, "step": 9605 }, { "epoch": 4.3171608265947885, "grad_norm": 3.40625, "learning_rate": 0.00021642041476075088, "loss": 4.6946, "step": 9610 }, { "epoch": 4.319407008086253, "grad_norm": 3.546875, "learning_rate": 0.0002163236211742645, "loss": 4.6837, "step": 9615 }, { "epoch": 4.3216531895777175, "grad_norm": 3.25, "learning_rate": 0.00021622679673614621, "loss": 4.7555, "step": 9620 }, { "epoch": 4.323899371069182, "grad_norm": 3.3125, "learning_rate": 0.0002161299415045991, "loss": 4.7089, "step": 9625 }, { "epoch": 4.3261455525606465, "grad_norm": 3.40625, "learning_rate": 0.00021603305553784472, "loss": 4.7033, "step": 9630 }, { "epoch": 4.328391734052111, "grad_norm": 3.171875, "learning_rate": 0.00021593613889412313, "loss": 4.6401, "step": 9635 }, { "epoch": 4.330637915543576, "grad_norm": 3.4375, "learning_rate": 0.00021583919163169286, "loss": 4.6932, "step": 9640 }, { "epoch": 4.33288409703504, "grad_norm": 3.25, "learning_rate": 0.00021574221380883072, "loss": 4.6643, "step": 9645 }, { "epoch": 4.335130278526505, "grad_norm": 3.484375, "learning_rate": 0.000215645205483832, "loss": 4.685, "step": 9650 }, { "epoch": 4.337376460017969, "grad_norm": 3.4375, "learning_rate": 0.00021554816671501034, "loss": 4.6756, "step": 9655 }, { "epoch": 4.339622641509434, "grad_norm": 3.5, "learning_rate": 0.0002154510975606976, "loss": 4.6344, "step": 9660 }, { "epoch": 4.341868823000898, "grad_norm": 3.296875, "learning_rate": 0.00021535399807924398, "loss": 4.6881, "step": 9665 }, { "epoch": 4.344115004492363, "grad_norm": 3.390625, "learning_rate": 0.0002152568683290178, "loss": 4.6898, "step": 9670 }, { "epoch": 4.346361185983827, "grad_norm": 3.109375, "learning_rate": 0.0002151597083684058, "loss": 4.7361, "step": 9675 }, { "epoch": 4.348607367475292, "grad_norm": 3.359375, "learning_rate": 0.00021506251825581255, "loss": 4.6747, "step": 9680 }, { "epoch": 4.350853548966756, "grad_norm": 3.34375, "learning_rate": 0.00021496529804966103, "loss": 4.7717, "step": 9685 }, { "epoch": 4.353099730458221, "grad_norm": 3.203125, "learning_rate": 0.00021486804780839226, "loss": 4.6896, "step": 9690 }, { "epoch": 4.355345911949685, "grad_norm": 3.78125, "learning_rate": 0.00021477076759046513, "loss": 4.621, "step": 9695 }, { "epoch": 4.35759209344115, "grad_norm": 3.5, "learning_rate": 0.00021467345745435678, "loss": 4.7051, "step": 9700 }, { "epoch": 4.359838274932614, "grad_norm": 3.203125, "learning_rate": 0.0002145761174585622, "loss": 4.6858, "step": 9705 }, { "epoch": 4.362084456424079, "grad_norm": 3.515625, "learning_rate": 0.00021447874766159433, "loss": 4.7533, "step": 9710 }, { "epoch": 4.364330637915543, "grad_norm": 3.265625, "learning_rate": 0.00021438134812198415, "loss": 4.7265, "step": 9715 }, { "epoch": 4.366576819407008, "grad_norm": 3.265625, "learning_rate": 0.00021428391889828034, "loss": 4.6811, "step": 9720 }, { "epoch": 4.368823000898472, "grad_norm": 3.453125, "learning_rate": 0.00021418646004904953, "loss": 4.6191, "step": 9725 }, { "epoch": 4.371069182389937, "grad_norm": 3.640625, "learning_rate": 0.00021408897163287615, "loss": 4.7698, "step": 9730 }, { "epoch": 4.373315363881401, "grad_norm": 3.296875, "learning_rate": 0.00021399145370836238, "loss": 4.7045, "step": 9735 }, { "epoch": 4.375561545372866, "grad_norm": 3.203125, "learning_rate": 0.0002138939063341282, "loss": 4.6745, "step": 9740 }, { "epoch": 4.37780772686433, "grad_norm": 3.5, "learning_rate": 0.00021379632956881116, "loss": 4.6972, "step": 9745 }, { "epoch": 4.380053908355795, "grad_norm": 3.296875, "learning_rate": 0.00021369872347106662, "loss": 4.6778, "step": 9750 }, { "epoch": 4.382300089847259, "grad_norm": 3.578125, "learning_rate": 0.00021360108809956752, "loss": 4.7863, "step": 9755 }, { "epoch": 4.384546271338724, "grad_norm": 3.171875, "learning_rate": 0.00021350342351300438, "loss": 4.6412, "step": 9760 }, { "epoch": 4.386792452830189, "grad_norm": 3.296875, "learning_rate": 0.00021340572977008524, "loss": 4.6175, "step": 9765 }, { "epoch": 4.389038634321653, "grad_norm": 3.09375, "learning_rate": 0.0002133080069295358, "loss": 4.7325, "step": 9770 }, { "epoch": 4.391284815813117, "grad_norm": 3.296875, "learning_rate": 0.0002132102550500991, "loss": 4.6518, "step": 9775 }, { "epoch": 4.393530997304582, "grad_norm": 3.328125, "learning_rate": 0.00021311247419053574, "loss": 4.701, "step": 9780 }, { "epoch": 4.395777178796047, "grad_norm": 3.21875, "learning_rate": 0.0002130146644096237, "loss": 4.6722, "step": 9785 }, { "epoch": 4.398023360287511, "grad_norm": 3.40625, "learning_rate": 0.00021291682576615837, "loss": 4.6529, "step": 9790 }, { "epoch": 4.400269541778976, "grad_norm": 3.25, "learning_rate": 0.00021281895831895247, "loss": 4.6846, "step": 9795 }, { "epoch": 4.40251572327044, "grad_norm": 3.34375, "learning_rate": 0.00021272106212683598, "loss": 4.674, "step": 9800 }, { "epoch": 4.404761904761905, "grad_norm": 3.34375, "learning_rate": 0.00021262313724865626, "loss": 4.6378, "step": 9805 }, { "epoch": 4.407008086253369, "grad_norm": 3.34375, "learning_rate": 0.0002125251837432779, "loss": 4.7018, "step": 9810 }, { "epoch": 4.409254267744834, "grad_norm": 3.265625, "learning_rate": 0.00021242720166958257, "loss": 4.68, "step": 9815 }, { "epoch": 4.411500449236298, "grad_norm": 3.515625, "learning_rate": 0.00021232919108646933, "loss": 4.6918, "step": 9820 }, { "epoch": 4.413746630727763, "grad_norm": 3.75, "learning_rate": 0.00021223115205285418, "loss": 4.6849, "step": 9825 }, { "epoch": 4.415992812219227, "grad_norm": 3.25, "learning_rate": 0.00021213308462767025, "loss": 4.6931, "step": 9830 }, { "epoch": 4.418238993710692, "grad_norm": 3.453125, "learning_rate": 0.00021203498886986793, "loss": 4.6482, "step": 9835 }, { "epoch": 4.420485175202156, "grad_norm": 3.34375, "learning_rate": 0.00021193686483841437, "loss": 4.6818, "step": 9840 }, { "epoch": 4.422731356693621, "grad_norm": 3.1875, "learning_rate": 0.00021183871259229393, "loss": 4.6446, "step": 9845 }, { "epoch": 4.424977538185085, "grad_norm": 3.453125, "learning_rate": 0.00021174053219050778, "loss": 4.6766, "step": 9850 }, { "epoch": 4.42722371967655, "grad_norm": 3.3125, "learning_rate": 0.0002116423236920741, "loss": 4.6678, "step": 9855 }, { "epoch": 4.429469901168014, "grad_norm": 3.390625, "learning_rate": 0.00021154408715602795, "loss": 4.7295, "step": 9860 }, { "epoch": 4.431716082659479, "grad_norm": 3.171875, "learning_rate": 0.00021144582264142123, "loss": 4.7037, "step": 9865 }, { "epoch": 4.433962264150943, "grad_norm": 3.4375, "learning_rate": 0.00021134753020732265, "loss": 4.6139, "step": 9870 }, { "epoch": 4.436208445642408, "grad_norm": 3.5625, "learning_rate": 0.00021124920991281778, "loss": 4.6233, "step": 9875 }, { "epoch": 4.438454627133872, "grad_norm": 3.359375, "learning_rate": 0.00021115086181700877, "loss": 4.6462, "step": 9880 }, { "epoch": 4.440700808625337, "grad_norm": 3.234375, "learning_rate": 0.00021105248597901456, "loss": 4.692, "step": 9885 }, { "epoch": 4.442946990116801, "grad_norm": 4.03125, "learning_rate": 0.00021095408245797094, "loss": 4.7333, "step": 9890 }, { "epoch": 4.445193171608266, "grad_norm": 3.390625, "learning_rate": 0.00021085565131303004, "loss": 4.7001, "step": 9895 }, { "epoch": 4.44743935309973, "grad_norm": 3.515625, "learning_rate": 0.00021075719260336086, "loss": 4.6733, "step": 9900 }, { "epoch": 4.449685534591195, "grad_norm": 3.375, "learning_rate": 0.00021065870638814875, "loss": 4.6761, "step": 9905 }, { "epoch": 4.45193171608266, "grad_norm": 3.65625, "learning_rate": 0.0002105601927265958, "loss": 4.7403, "step": 9910 }, { "epoch": 4.454177897574124, "grad_norm": 3.390625, "learning_rate": 0.0002104616516779204, "loss": 4.6371, "step": 9915 }, { "epoch": 4.456424079065589, "grad_norm": 3.296875, "learning_rate": 0.00021036308330135752, "loss": 4.6903, "step": 9920 }, { "epoch": 4.458670260557053, "grad_norm": 3.078125, "learning_rate": 0.00021026448765615866, "loss": 4.6965, "step": 9925 }, { "epoch": 4.460916442048518, "grad_norm": 3.375, "learning_rate": 0.00021016586480159145, "loss": 4.7419, "step": 9930 }, { "epoch": 4.463162623539982, "grad_norm": 3.46875, "learning_rate": 0.0002100672147969401, "loss": 4.7129, "step": 9935 }, { "epoch": 4.465408805031447, "grad_norm": 3.3125, "learning_rate": 0.00020996853770150495, "loss": 4.6758, "step": 9940 }, { "epoch": 4.467654986522911, "grad_norm": 3.171875, "learning_rate": 0.00020986983357460282, "loss": 4.6629, "step": 9945 }, { "epoch": 4.469901168014376, "grad_norm": 3.3125, "learning_rate": 0.00020977110247556667, "loss": 4.7698, "step": 9950 }, { "epoch": 4.47214734950584, "grad_norm": 3.21875, "learning_rate": 0.00020967234446374572, "loss": 4.7114, "step": 9955 }, { "epoch": 4.474393530997305, "grad_norm": 3.453125, "learning_rate": 0.0002095735595985053, "loss": 4.7192, "step": 9960 }, { "epoch": 4.476639712488769, "grad_norm": 3.296875, "learning_rate": 0.00020947474793922699, "loss": 4.7341, "step": 9965 }, { "epoch": 4.478885893980234, "grad_norm": 3.453125, "learning_rate": 0.00020937590954530827, "loss": 4.6937, "step": 9970 }, { "epoch": 4.481132075471698, "grad_norm": 3.515625, "learning_rate": 0.00020927704447616291, "loss": 4.6864, "step": 9975 }, { "epoch": 4.483378256963163, "grad_norm": 3.6875, "learning_rate": 0.0002091781527912207, "loss": 4.7076, "step": 9980 }, { "epoch": 4.485624438454627, "grad_norm": 3.375, "learning_rate": 0.00020907923454992729, "loss": 4.6991, "step": 9985 }, { "epoch": 4.487870619946092, "grad_norm": 3.1875, "learning_rate": 0.0002089802898117444, "loss": 4.64, "step": 9990 }, { "epoch": 4.490116801437556, "grad_norm": 3.265625, "learning_rate": 0.0002088813186361496, "loss": 4.656, "step": 9995 }, { "epoch": 4.492362982929021, "grad_norm": 3.578125, "learning_rate": 0.00020878232108263647, "loss": 4.7397, "step": 10000 }, { "epoch": 4.492362982929021, "eval_loss": 4.845594882965088, "eval_runtime": 16.0517, "eval_samples_per_second": 1932.072, "eval_steps_per_second": 241.532, "step": 10000 }, { "epoch": 4.494609164420485, "grad_norm": 3.453125, "learning_rate": 0.00020868329721071427, "loss": 4.6774, "step": 10005 }, { "epoch": 4.49685534591195, "grad_norm": 3.40625, "learning_rate": 0.00020858424707990828, "loss": 4.6391, "step": 10010 }, { "epoch": 4.499101527403414, "grad_norm": 3.40625, "learning_rate": 0.00020848517074975947, "loss": 4.7102, "step": 10015 }, { "epoch": 4.501347708894879, "grad_norm": 3.328125, "learning_rate": 0.00020838606827982452, "loss": 4.6926, "step": 10020 }, { "epoch": 4.503593890386343, "grad_norm": 3.40625, "learning_rate": 0.00020828693972967587, "loss": 4.7024, "step": 10025 }, { "epoch": 4.505840071877808, "grad_norm": 3.34375, "learning_rate": 0.0002081877851589016, "loss": 4.7202, "step": 10030 }, { "epoch": 4.508086253369273, "grad_norm": 3.5, "learning_rate": 0.00020808860462710556, "loss": 4.7569, "step": 10035 }, { "epoch": 4.510332434860737, "grad_norm": 3.53125, "learning_rate": 0.00020798939819390697, "loss": 4.6679, "step": 10040 }, { "epoch": 4.512578616352201, "grad_norm": 3.109375, "learning_rate": 0.00020789016591894085, "loss": 4.6852, "step": 10045 }, { "epoch": 4.514824797843666, "grad_norm": 3.234375, "learning_rate": 0.0002077909078618576, "loss": 4.6635, "step": 10050 }, { "epoch": 4.517070979335131, "grad_norm": 3.34375, "learning_rate": 0.00020769162408232326, "loss": 4.6376, "step": 10055 }, { "epoch": 4.519317160826595, "grad_norm": 3.453125, "learning_rate": 0.00020759231464001916, "loss": 4.7037, "step": 10060 }, { "epoch": 4.52156334231806, "grad_norm": 3.1875, "learning_rate": 0.0002074929795946422, "loss": 4.7064, "step": 10065 }, { "epoch": 4.523809523809524, "grad_norm": 3.25, "learning_rate": 0.0002073936190059046, "loss": 4.6143, "step": 10070 }, { "epoch": 4.526055705300989, "grad_norm": 3.46875, "learning_rate": 0.000207294232933534, "loss": 4.6545, "step": 10075 }, { "epoch": 4.528301886792453, "grad_norm": 3.359375, "learning_rate": 0.00020719482143727325, "loss": 4.6446, "step": 10080 }, { "epoch": 4.530548068283918, "grad_norm": 3.640625, "learning_rate": 0.00020709538457688054, "loss": 4.6623, "step": 10085 }, { "epoch": 4.532794249775382, "grad_norm": 3.453125, "learning_rate": 0.00020699592241212934, "loss": 4.7065, "step": 10090 }, { "epoch": 4.535040431266847, "grad_norm": 3.234375, "learning_rate": 0.0002068964350028083, "loss": 4.6512, "step": 10095 }, { "epoch": 4.537286612758311, "grad_norm": 3.25, "learning_rate": 0.00020679692240872124, "loss": 4.6402, "step": 10100 }, { "epoch": 4.539532794249776, "grad_norm": 3.40625, "learning_rate": 0.0002066973846896871, "loss": 4.687, "step": 10105 }, { "epoch": 4.54177897574124, "grad_norm": 3.546875, "learning_rate": 0.00020659782190554, "loss": 4.7121, "step": 10110 }, { "epoch": 4.544025157232705, "grad_norm": 3.375, "learning_rate": 0.0002064982341161291, "loss": 4.6398, "step": 10115 }, { "epoch": 4.546271338724169, "grad_norm": 3.421875, "learning_rate": 0.00020639862138131841, "loss": 4.664, "step": 10120 }, { "epoch": 4.548517520215634, "grad_norm": 3.40625, "learning_rate": 0.0002062989837609873, "loss": 4.6828, "step": 10125 }, { "epoch": 4.550763701707098, "grad_norm": 3.234375, "learning_rate": 0.00020619932131502974, "loss": 4.7815, "step": 10130 }, { "epoch": 4.553009883198563, "grad_norm": 3.5, "learning_rate": 0.00020609963410335485, "loss": 4.6678, "step": 10135 }, { "epoch": 4.555256064690027, "grad_norm": 3.390625, "learning_rate": 0.00020599992218588652, "loss": 4.6596, "step": 10140 }, { "epoch": 4.557502246181492, "grad_norm": 3.375, "learning_rate": 0.00020590018562256353, "loss": 4.6422, "step": 10145 }, { "epoch": 4.559748427672956, "grad_norm": 3.296875, "learning_rate": 0.00020580042447333952, "loss": 4.6899, "step": 10150 }, { "epoch": 4.561994609164421, "grad_norm": 3.65625, "learning_rate": 0.0002057006387981828, "loss": 4.6943, "step": 10155 }, { "epoch": 4.564240790655885, "grad_norm": 3.34375, "learning_rate": 0.0002056008286570766, "loss": 4.649, "step": 10160 }, { "epoch": 4.5664869721473496, "grad_norm": 3.359375, "learning_rate": 0.00020550099411001862, "loss": 4.6801, "step": 10165 }, { "epoch": 4.568733153638814, "grad_norm": 3.625, "learning_rate": 0.00020540113521702147, "loss": 4.6718, "step": 10170 }, { "epoch": 4.5709793351302785, "grad_norm": 3.359375, "learning_rate": 0.00020530125203811221, "loss": 4.6745, "step": 10175 }, { "epoch": 4.5732255166217435, "grad_norm": 3.515625, "learning_rate": 0.00020520134463333258, "loss": 4.6223, "step": 10180 }, { "epoch": 4.5754716981132075, "grad_norm": 3.40625, "learning_rate": 0.0002051014130627389, "loss": 4.6363, "step": 10185 }, { "epoch": 4.577717879604672, "grad_norm": 3.453125, "learning_rate": 0.00020500145738640198, "loss": 4.6598, "step": 10190 }, { "epoch": 4.5799640610961365, "grad_norm": 3.265625, "learning_rate": 0.00020490147766440714, "loss": 4.6446, "step": 10195 }, { "epoch": 4.5822102425876015, "grad_norm": 3.59375, "learning_rate": 0.0002048014739568541, "loss": 4.7159, "step": 10200 }, { "epoch": 4.5844564240790655, "grad_norm": 3.5, "learning_rate": 0.00020470144632385705, "loss": 4.6996, "step": 10205 }, { "epoch": 4.5867026055705304, "grad_norm": 3.640625, "learning_rate": 0.00020460139482554463, "loss": 4.7254, "step": 10210 }, { "epoch": 4.5889487870619945, "grad_norm": 3.40625, "learning_rate": 0.0002045013195220597, "loss": 4.6386, "step": 10215 }, { "epoch": 4.591194968553459, "grad_norm": 3.28125, "learning_rate": 0.00020440122047355946, "loss": 4.6847, "step": 10220 }, { "epoch": 4.5934411500449235, "grad_norm": 3.265625, "learning_rate": 0.00020430109774021547, "loss": 4.7036, "step": 10225 }, { "epoch": 4.595687331536388, "grad_norm": 3.421875, "learning_rate": 0.00020420095138221336, "loss": 4.6806, "step": 10230 }, { "epoch": 4.5979335130278525, "grad_norm": 3.3125, "learning_rate": 0.00020410078145975314, "loss": 4.673, "step": 10235 }, { "epoch": 4.600179694519317, "grad_norm": 3.3125, "learning_rate": 0.00020400058803304887, "loss": 4.6314, "step": 10240 }, { "epoch": 4.6024258760107815, "grad_norm": 3.203125, "learning_rate": 0.00020390037116232884, "loss": 4.6995, "step": 10245 }, { "epoch": 4.604672057502246, "grad_norm": 3.40625, "learning_rate": 0.00020380013090783532, "loss": 4.7094, "step": 10250 }, { "epoch": 4.6069182389937104, "grad_norm": 3.28125, "learning_rate": 0.00020369986732982472, "loss": 4.6444, "step": 10255 }, { "epoch": 4.609164420485175, "grad_norm": 3.359375, "learning_rate": 0.00020359958048856737, "loss": 4.6719, "step": 10260 }, { "epoch": 4.611410601976639, "grad_norm": 3.3125, "learning_rate": 0.00020349927044434774, "loss": 4.7002, "step": 10265 }, { "epoch": 4.613656783468104, "grad_norm": 3.234375, "learning_rate": 0.00020339893725746403, "loss": 4.7256, "step": 10270 }, { "epoch": 4.615902964959568, "grad_norm": 3.4375, "learning_rate": 0.00020329858098822861, "loss": 4.6411, "step": 10275 }, { "epoch": 4.618149146451033, "grad_norm": 3.265625, "learning_rate": 0.00020319820169696756, "loss": 4.7058, "step": 10280 }, { "epoch": 4.620395327942497, "grad_norm": 3.234375, "learning_rate": 0.00020309779944402079, "loss": 4.6442, "step": 10285 }, { "epoch": 4.622641509433962, "grad_norm": 3.390625, "learning_rate": 0.0002029973742897421, "loss": 4.7026, "step": 10290 }, { "epoch": 4.624887690925426, "grad_norm": 3.359375, "learning_rate": 0.000202896926294499, "loss": 4.6634, "step": 10295 }, { "epoch": 4.627133872416891, "grad_norm": 3.328125, "learning_rate": 0.00020279645551867276, "loss": 4.6369, "step": 10300 }, { "epoch": 4.629380053908356, "grad_norm": 3.3125, "learning_rate": 0.00020269596202265828, "loss": 4.6473, "step": 10305 }, { "epoch": 4.63162623539982, "grad_norm": 3.34375, "learning_rate": 0.0002025954458668642, "loss": 4.706, "step": 10310 }, { "epoch": 4.633872416891284, "grad_norm": 3.171875, "learning_rate": 0.00020249490711171276, "loss": 4.7147, "step": 10315 }, { "epoch": 4.636118598382749, "grad_norm": 3.78125, "learning_rate": 0.00020239434581763972, "loss": 4.6597, "step": 10320 }, { "epoch": 4.638364779874214, "grad_norm": 3.328125, "learning_rate": 0.0002022937620450945, "loss": 4.6389, "step": 10325 }, { "epoch": 4.640610961365678, "grad_norm": 3.3125, "learning_rate": 0.00020219315585453992, "loss": 4.7479, "step": 10330 }, { "epoch": 4.642857142857143, "grad_norm": 3.390625, "learning_rate": 0.00020209252730645234, "loss": 4.7021, "step": 10335 }, { "epoch": 4.645103324348607, "grad_norm": 3.140625, "learning_rate": 0.00020199187646132162, "loss": 4.5968, "step": 10340 }, { "epoch": 4.647349505840072, "grad_norm": 3.15625, "learning_rate": 0.00020189120337965082, "loss": 4.6683, "step": 10345 }, { "epoch": 4.649595687331536, "grad_norm": 3.421875, "learning_rate": 0.00020179050812195662, "loss": 4.7173, "step": 10350 }, { "epoch": 4.651841868823001, "grad_norm": 3.109375, "learning_rate": 0.0002016897907487688, "loss": 4.6803, "step": 10355 }, { "epoch": 4.654088050314465, "grad_norm": 3.390625, "learning_rate": 0.00020158905132063064, "loss": 4.7211, "step": 10360 }, { "epoch": 4.65633423180593, "grad_norm": 3.546875, "learning_rate": 0.0002014882898980985, "loss": 4.6781, "step": 10365 }, { "epoch": 4.658580413297394, "grad_norm": 3.453125, "learning_rate": 0.00020138750654174212, "loss": 4.6721, "step": 10370 }, { "epoch": 4.660826594788859, "grad_norm": 3.5625, "learning_rate": 0.00020128670131214427, "loss": 4.701, "step": 10375 }, { "epoch": 4.663072776280323, "grad_norm": 3.4375, "learning_rate": 0.0002011858742699009, "loss": 4.6892, "step": 10380 }, { "epoch": 4.665318957771788, "grad_norm": 3.46875, "learning_rate": 0.0002010850254756213, "loss": 4.6924, "step": 10385 }, { "epoch": 4.667565139263252, "grad_norm": 3.203125, "learning_rate": 0.00020098415498992752, "loss": 4.6235, "step": 10390 }, { "epoch": 4.669811320754717, "grad_norm": 3.28125, "learning_rate": 0.00020088326287345476, "loss": 4.6235, "step": 10395 }, { "epoch": 4.672057502246181, "grad_norm": 3.421875, "learning_rate": 0.00020078234918685133, "loss": 4.6504, "step": 10400 }, { "epoch": 4.674303683737646, "grad_norm": 3.171875, "learning_rate": 0.00020068141399077837, "loss": 4.641, "step": 10405 }, { "epoch": 4.67654986522911, "grad_norm": 3.375, "learning_rate": 0.00020058045734590998, "loss": 4.6779, "step": 10410 }, { "epoch": 4.678796046720575, "grad_norm": 3.59375, "learning_rate": 0.0002004794793129332, "loss": 4.7042, "step": 10415 }, { "epoch": 4.681042228212039, "grad_norm": 3.46875, "learning_rate": 0.0002003784799525479, "loss": 4.6635, "step": 10420 }, { "epoch": 4.683288409703504, "grad_norm": 3.375, "learning_rate": 0.00020027745932546677, "loss": 4.703, "step": 10425 }, { "epoch": 4.685534591194968, "grad_norm": 3.21875, "learning_rate": 0.00020017641749241533, "loss": 4.6605, "step": 10430 }, { "epoch": 4.687780772686433, "grad_norm": 3.578125, "learning_rate": 0.00020007535451413167, "loss": 4.6924, "step": 10435 }, { "epoch": 4.690026954177897, "grad_norm": 3.40625, "learning_rate": 0.00019997427045136687, "loss": 4.6854, "step": 10440 }, { "epoch": 4.692273135669362, "grad_norm": 3.359375, "learning_rate": 0.00019987316536488443, "loss": 4.7355, "step": 10445 }, { "epoch": 4.694519317160827, "grad_norm": 3.234375, "learning_rate": 0.00019977203931546063, "loss": 4.631, "step": 10450 }, { "epoch": 4.696765498652291, "grad_norm": 3.40625, "learning_rate": 0.00019967089236388433, "loss": 4.7017, "step": 10455 }, { "epoch": 4.699011680143755, "grad_norm": 3.21875, "learning_rate": 0.00019956972457095692, "loss": 4.6951, "step": 10460 }, { "epoch": 4.70125786163522, "grad_norm": 3.265625, "learning_rate": 0.00019946853599749233, "loss": 4.6927, "step": 10465 }, { "epoch": 4.703504043126685, "grad_norm": 3.375, "learning_rate": 0.00019936732670431702, "loss": 4.6368, "step": 10470 }, { "epoch": 4.705750224618149, "grad_norm": 3.40625, "learning_rate": 0.00019926609675226985, "loss": 4.6823, "step": 10475 }, { "epoch": 4.707996406109614, "grad_norm": 3.59375, "learning_rate": 0.00019916484620220213, "loss": 4.6877, "step": 10480 }, { "epoch": 4.710242587601078, "grad_norm": 3.46875, "learning_rate": 0.00019906357511497756, "loss": 4.6462, "step": 10485 }, { "epoch": 4.712488769092543, "grad_norm": 3.5, "learning_rate": 0.00019896228355147216, "loss": 4.7508, "step": 10490 }, { "epoch": 4.714734950584007, "grad_norm": 3.3125, "learning_rate": 0.00019886097157257427, "loss": 4.7164, "step": 10495 }, { "epoch": 4.716981132075472, "grad_norm": 3.34375, "learning_rate": 0.00019875963923918447, "loss": 4.5771, "step": 10500 }, { "epoch": 4.719227313566936, "grad_norm": 3.390625, "learning_rate": 0.00019865828661221564, "loss": 4.658, "step": 10505 }, { "epoch": 4.721473495058401, "grad_norm": 3.6875, "learning_rate": 0.00019855691375259284, "loss": 4.6967, "step": 10510 }, { "epoch": 4.723719676549865, "grad_norm": 3.34375, "learning_rate": 0.00019845552072125325, "loss": 4.6446, "step": 10515 }, { "epoch": 4.72596585804133, "grad_norm": 3.453125, "learning_rate": 0.00019835410757914617, "loss": 4.6289, "step": 10520 }, { "epoch": 4.728212039532794, "grad_norm": 3.28125, "learning_rate": 0.0001982526743872331, "loss": 4.6711, "step": 10525 }, { "epoch": 4.730458221024259, "grad_norm": 3.40625, "learning_rate": 0.00019815122120648743, "loss": 4.6641, "step": 10530 }, { "epoch": 4.732704402515723, "grad_norm": 3.5625, "learning_rate": 0.00019804974809789472, "loss": 4.6712, "step": 10535 }, { "epoch": 4.734950584007188, "grad_norm": 3.421875, "learning_rate": 0.00019794825512245244, "loss": 4.6884, "step": 10540 }, { "epoch": 4.737196765498652, "grad_norm": 3.609375, "learning_rate": 0.00019784674234116996, "loss": 4.6803, "step": 10545 }, { "epoch": 4.739442946990117, "grad_norm": 3.390625, "learning_rate": 0.00019774520981506857, "loss": 4.6987, "step": 10550 }, { "epoch": 4.741689128481581, "grad_norm": 3.25, "learning_rate": 0.00019764365760518152, "loss": 4.6352, "step": 10555 }, { "epoch": 4.743935309973046, "grad_norm": 3.25, "learning_rate": 0.00019754208577255384, "loss": 4.7026, "step": 10560 }, { "epoch": 4.74618149146451, "grad_norm": 3.46875, "learning_rate": 0.0001974404943782423, "loss": 4.6432, "step": 10565 }, { "epoch": 4.748427672955975, "grad_norm": 3.453125, "learning_rate": 0.00019733888348331545, "loss": 4.6899, "step": 10570 }, { "epoch": 4.75067385444744, "grad_norm": 3.421875, "learning_rate": 0.00019723725314885364, "loss": 4.6817, "step": 10575 }, { "epoch": 4.752920035938904, "grad_norm": 3.515625, "learning_rate": 0.00019713560343594884, "loss": 4.6453, "step": 10580 }, { "epoch": 4.755166217430368, "grad_norm": 3.296875, "learning_rate": 0.00019703393440570464, "loss": 4.6756, "step": 10585 }, { "epoch": 4.757412398921833, "grad_norm": 3.296875, "learning_rate": 0.00019693224611923632, "loss": 4.6776, "step": 10590 }, { "epoch": 4.759658580413298, "grad_norm": 3.359375, "learning_rate": 0.00019683053863767068, "loss": 4.6709, "step": 10595 }, { "epoch": 4.761904761904762, "grad_norm": 3.1875, "learning_rate": 0.00019672881202214616, "loss": 4.6625, "step": 10600 }, { "epoch": 4.764150943396227, "grad_norm": 3.375, "learning_rate": 0.00019662706633381244, "loss": 4.6974, "step": 10605 }, { "epoch": 4.766397124887691, "grad_norm": 3.3125, "learning_rate": 0.00019652530163383094, "loss": 4.6754, "step": 10610 }, { "epoch": 4.768643306379156, "grad_norm": 3.640625, "learning_rate": 0.00019642351798337444, "loss": 4.7521, "step": 10615 }, { "epoch": 4.77088948787062, "grad_norm": 3.5625, "learning_rate": 0.00019632171544362706, "loss": 4.7013, "step": 10620 }, { "epoch": 4.773135669362085, "grad_norm": 3.734375, "learning_rate": 0.00019621989407578425, "loss": 4.6938, "step": 10625 }, { "epoch": 4.775381850853549, "grad_norm": 3.40625, "learning_rate": 0.00019611805394105294, "loss": 4.6451, "step": 10630 }, { "epoch": 4.777628032345014, "grad_norm": 3.390625, "learning_rate": 0.00019601619510065108, "loss": 4.6984, "step": 10635 }, { "epoch": 4.779874213836478, "grad_norm": 3.421875, "learning_rate": 0.00019591431761580813, "loss": 4.7172, "step": 10640 }, { "epoch": 4.782120395327943, "grad_norm": 3.453125, "learning_rate": 0.00019581242154776454, "loss": 4.6923, "step": 10645 }, { "epoch": 4.784366576819407, "grad_norm": 3.671875, "learning_rate": 0.00019571050695777208, "loss": 4.6544, "step": 10650 }, { "epoch": 4.786612758310872, "grad_norm": 3.359375, "learning_rate": 0.00019560857390709362, "loss": 4.6593, "step": 10655 }, { "epoch": 4.788858939802336, "grad_norm": 3.375, "learning_rate": 0.0001955066224570031, "loss": 4.5734, "step": 10660 }, { "epoch": 4.791105121293801, "grad_norm": 3.671875, "learning_rate": 0.0001954046526687855, "loss": 4.6368, "step": 10665 }, { "epoch": 4.793351302785265, "grad_norm": 3.53125, "learning_rate": 0.00019530266460373685, "loss": 4.6785, "step": 10670 }, { "epoch": 4.79559748427673, "grad_norm": 3.546875, "learning_rate": 0.00019520065832316419, "loss": 4.6739, "step": 10675 }, { "epoch": 4.797843665768194, "grad_norm": 3.234375, "learning_rate": 0.00019509863388838552, "loss": 4.6251, "step": 10680 }, { "epoch": 4.800089847259659, "grad_norm": 3.390625, "learning_rate": 0.00019499659136072966, "loss": 4.7282, "step": 10685 }, { "epoch": 4.802336028751123, "grad_norm": 3.34375, "learning_rate": 0.0001948945308015364, "loss": 4.7018, "step": 10690 }, { "epoch": 4.804582210242588, "grad_norm": 3.265625, "learning_rate": 0.00019479245227215639, "loss": 4.6882, "step": 10695 }, { "epoch": 4.806828391734052, "grad_norm": 3.265625, "learning_rate": 0.00019469035583395087, "loss": 4.6144, "step": 10700 }, { "epoch": 4.809074573225517, "grad_norm": 3.921875, "learning_rate": 0.00019458824154829215, "loss": 4.6726, "step": 10705 }, { "epoch": 4.811320754716981, "grad_norm": 3.5, "learning_rate": 0.00019448610947656313, "loss": 4.7331, "step": 10710 }, { "epoch": 4.813566936208446, "grad_norm": 3.453125, "learning_rate": 0.0001943839596801573, "loss": 4.6982, "step": 10715 }, { "epoch": 4.815813117699911, "grad_norm": 3.359375, "learning_rate": 0.00019428179222047892, "loss": 4.6745, "step": 10720 }, { "epoch": 4.818059299191375, "grad_norm": 3.40625, "learning_rate": 0.00019417960715894294, "loss": 4.6347, "step": 10725 }, { "epoch": 4.820305480682839, "grad_norm": 3.359375, "learning_rate": 0.00019407740455697466, "loss": 4.6806, "step": 10730 }, { "epoch": 4.822551662174304, "grad_norm": 3.515625, "learning_rate": 0.0001939751844760102, "loss": 4.6906, "step": 10735 }, { "epoch": 4.824797843665769, "grad_norm": 3.375, "learning_rate": 0.00019387294697749592, "loss": 4.6839, "step": 10740 }, { "epoch": 4.827044025157233, "grad_norm": 3.234375, "learning_rate": 0.0001937706921228889, "loss": 4.6198, "step": 10745 }, { "epoch": 4.829290206648698, "grad_norm": 3.421875, "learning_rate": 0.00019366841997365647, "loss": 4.6804, "step": 10750 }, { "epoch": 4.831536388140162, "grad_norm": 3.578125, "learning_rate": 0.00019356613059127634, "loss": 4.6341, "step": 10755 }, { "epoch": 4.833782569631627, "grad_norm": 3.28125, "learning_rate": 0.00019346382403723683, "loss": 4.6665, "step": 10760 }, { "epoch": 4.836028751123091, "grad_norm": 3.359375, "learning_rate": 0.00019336150037303624, "loss": 4.6748, "step": 10765 }, { "epoch": 4.8382749326145555, "grad_norm": 3.21875, "learning_rate": 0.00019325915966018344, "loss": 4.6332, "step": 10770 }, { "epoch": 4.84052111410602, "grad_norm": 3.59375, "learning_rate": 0.0001931568019601974, "loss": 4.7308, "step": 10775 }, { "epoch": 4.8427672955974845, "grad_norm": 3.28125, "learning_rate": 0.00019305442733460733, "loss": 4.6666, "step": 10780 }, { "epoch": 4.845013477088949, "grad_norm": 3.359375, "learning_rate": 0.00019295203584495258, "loss": 4.7263, "step": 10785 }, { "epoch": 4.8472596585804135, "grad_norm": 3.328125, "learning_rate": 0.00019284962755278273, "loss": 4.745, "step": 10790 }, { "epoch": 4.849505840071878, "grad_norm": 3.625, "learning_rate": 0.0001927472025196574, "loss": 4.6752, "step": 10795 }, { "epoch": 4.8517520215633425, "grad_norm": 3.140625, "learning_rate": 0.00019264476080714627, "loss": 4.6648, "step": 10800 }, { "epoch": 4.853998203054807, "grad_norm": 3.25, "learning_rate": 0.000192542302476829, "loss": 4.6762, "step": 10805 }, { "epoch": 4.8562443845462715, "grad_norm": 3.40625, "learning_rate": 0.00019243982759029543, "loss": 4.7266, "step": 10810 }, { "epoch": 4.8584905660377355, "grad_norm": 3.484375, "learning_rate": 0.00019233733620914508, "loss": 4.6201, "step": 10815 }, { "epoch": 4.8607367475292005, "grad_norm": 3.3125, "learning_rate": 0.0001922348283949876, "loss": 4.6364, "step": 10820 }, { "epoch": 4.8629829290206645, "grad_norm": 3.546875, "learning_rate": 0.0001921323042094424, "loss": 4.6883, "step": 10825 }, { "epoch": 4.8652291105121295, "grad_norm": 3.546875, "learning_rate": 0.00019202976371413883, "loss": 4.666, "step": 10830 }, { "epoch": 4.8674752920035935, "grad_norm": 3.1875, "learning_rate": 0.00019192720697071595, "loss": 4.7158, "step": 10835 }, { "epoch": 4.8697214734950585, "grad_norm": 3.90625, "learning_rate": 0.0001918246340408226, "loss": 4.7058, "step": 10840 }, { "epoch": 4.871967654986523, "grad_norm": 3.484375, "learning_rate": 0.00019172204498611733, "loss": 4.6477, "step": 10845 }, { "epoch": 4.8742138364779874, "grad_norm": 3.359375, "learning_rate": 0.0001916194398682686, "loss": 4.6853, "step": 10850 }, { "epoch": 4.8764600179694515, "grad_norm": 3.265625, "learning_rate": 0.0001915168187489542, "loss": 4.6215, "step": 10855 }, { "epoch": 4.878706199460916, "grad_norm": 3.640625, "learning_rate": 0.0001914141816898617, "loss": 4.6465, "step": 10860 }, { "epoch": 4.880952380952381, "grad_norm": 3.234375, "learning_rate": 0.00019131152875268828, "loss": 4.6278, "step": 10865 }, { "epoch": 4.883198562443845, "grad_norm": 3.953125, "learning_rate": 0.00019120885999914067, "loss": 4.6461, "step": 10870 }, { "epoch": 4.8854447439353095, "grad_norm": 3.421875, "learning_rate": 0.00019110617549093493, "loss": 4.6566, "step": 10875 }, { "epoch": 4.887690925426774, "grad_norm": 3.359375, "learning_rate": 0.00019100347528979691, "loss": 4.6575, "step": 10880 }, { "epoch": 4.889937106918239, "grad_norm": 3.46875, "learning_rate": 0.00019090075945746152, "loss": 4.6042, "step": 10885 }, { "epoch": 4.892183288409703, "grad_norm": 3.4375, "learning_rate": 0.00019079802805567342, "loss": 4.6088, "step": 10890 }, { "epoch": 4.894429469901168, "grad_norm": 3.390625, "learning_rate": 0.00019069528114618636, "loss": 4.6826, "step": 10895 }, { "epoch": 4.896675651392632, "grad_norm": 3.84375, "learning_rate": 0.00019059251879076358, "loss": 4.6487, "step": 10900 }, { "epoch": 4.898921832884097, "grad_norm": 3.328125, "learning_rate": 0.00019048974105117744, "loss": 4.6943, "step": 10905 }, { "epoch": 4.901168014375561, "grad_norm": 3.796875, "learning_rate": 0.00019038694798920975, "loss": 4.7191, "step": 10910 }, { "epoch": 4.903414195867026, "grad_norm": 3.25, "learning_rate": 0.0001902841396666514, "loss": 4.6203, "step": 10915 }, { "epoch": 4.90566037735849, "grad_norm": 3.6875, "learning_rate": 0.00019018131614530244, "loss": 4.5897, "step": 10920 }, { "epoch": 4.907906558849955, "grad_norm": 3.3125, "learning_rate": 0.00019007847748697215, "loss": 4.6385, "step": 10925 }, { "epoch": 4.910152740341419, "grad_norm": 3.453125, "learning_rate": 0.00018997562375347882, "loss": 4.6484, "step": 10930 }, { "epoch": 4.912398921832884, "grad_norm": 3.3125, "learning_rate": 0.00018987275500664987, "loss": 4.6698, "step": 10935 }, { "epoch": 4.914645103324348, "grad_norm": 3.34375, "learning_rate": 0.00018976987130832172, "loss": 4.6663, "step": 10940 }, { "epoch": 4.916891284815813, "grad_norm": 3.4375, "learning_rate": 0.00018966697272033975, "loss": 4.6668, "step": 10945 }, { "epoch": 4.919137466307277, "grad_norm": 3.28125, "learning_rate": 0.0001895640593045583, "loss": 4.695, "step": 10950 }, { "epoch": 4.921383647798742, "grad_norm": 3.171875, "learning_rate": 0.00018946113112284073, "loss": 4.575, "step": 10955 }, { "epoch": 4.923629829290206, "grad_norm": 3.40625, "learning_rate": 0.000189358188237059, "loss": 4.5769, "step": 10960 }, { "epoch": 4.925876010781671, "grad_norm": 3.5, "learning_rate": 0.00018925523070909426, "loss": 4.7168, "step": 10965 }, { "epoch": 4.928122192273135, "grad_norm": 3.421875, "learning_rate": 0.0001891522586008362, "loss": 4.6358, "step": 10970 }, { "epoch": 4.9303683737646, "grad_norm": 3.25, "learning_rate": 0.0001890492719741834, "loss": 4.6406, "step": 10975 }, { "epoch": 4.932614555256064, "grad_norm": 3.484375, "learning_rate": 0.00018894627089104316, "loss": 4.6625, "step": 10980 }, { "epoch": 4.934860736747529, "grad_norm": 3.28125, "learning_rate": 0.00018884325541333142, "loss": 4.6533, "step": 10985 }, { "epoch": 4.937106918238994, "grad_norm": 3.296875, "learning_rate": 0.00018874022560297276, "loss": 4.6776, "step": 10990 }, { "epoch": 4.939353099730458, "grad_norm": 3.515625, "learning_rate": 0.00018863718152190045, "loss": 4.7341, "step": 10995 }, { "epoch": 4.941599281221922, "grad_norm": 3.40625, "learning_rate": 0.00018853412323205634, "loss": 4.6744, "step": 11000 }, { "epoch": 4.941599281221922, "eval_loss": 4.825163841247559, "eval_runtime": 16.1797, "eval_samples_per_second": 1916.78, "eval_steps_per_second": 239.621, "step": 11000 }, { "epoch": 4.943845462713387, "grad_norm": 3.328125, "learning_rate": 0.00018843105079539068, "loss": 4.6552, "step": 11005 }, { "epoch": 4.946091644204852, "grad_norm": 3.546875, "learning_rate": 0.0001883279642738624, "loss": 4.6436, "step": 11010 }, { "epoch": 4.948337825696316, "grad_norm": 3.21875, "learning_rate": 0.00018822486372943885, "loss": 4.6329, "step": 11015 }, { "epoch": 4.950584007187781, "grad_norm": 3.203125, "learning_rate": 0.00018812174922409566, "loss": 4.6874, "step": 11020 }, { "epoch": 4.952830188679245, "grad_norm": 3.34375, "learning_rate": 0.00018801862081981713, "loss": 4.6695, "step": 11025 }, { "epoch": 4.95507637017071, "grad_norm": 3.375, "learning_rate": 0.00018791547857859565, "loss": 4.703, "step": 11030 }, { "epoch": 4.957322551662174, "grad_norm": 3.6875, "learning_rate": 0.00018781232256243212, "loss": 4.6931, "step": 11035 }, { "epoch": 4.959568733153639, "grad_norm": 3.359375, "learning_rate": 0.00018770915283333555, "loss": 4.6586, "step": 11040 }, { "epoch": 4.961814914645103, "grad_norm": 3.359375, "learning_rate": 0.0001876059694533233, "loss": 4.7529, "step": 11045 }, { "epoch": 4.964061096136568, "grad_norm": 3.0, "learning_rate": 0.00018750277248442095, "loss": 4.7353, "step": 11050 }, { "epoch": 4.966307277628032, "grad_norm": 3.375, "learning_rate": 0.00018739956198866222, "loss": 4.6483, "step": 11055 }, { "epoch": 4.968553459119497, "grad_norm": 3.328125, "learning_rate": 0.00018729633802808894, "loss": 4.6358, "step": 11060 }, { "epoch": 4.970799640610961, "grad_norm": 3.453125, "learning_rate": 0.0001871931006647511, "loss": 4.6772, "step": 11065 }, { "epoch": 4.973045822102426, "grad_norm": 3.421875, "learning_rate": 0.00018708984996070662, "loss": 4.6191, "step": 11070 }, { "epoch": 4.97529200359389, "grad_norm": 3.375, "learning_rate": 0.0001869865859780215, "loss": 4.6018, "step": 11075 }, { "epoch": 4.977538185085355, "grad_norm": 3.21875, "learning_rate": 0.0001868833087787698, "loss": 4.6706, "step": 11080 }, { "epoch": 4.979784366576819, "grad_norm": 3.515625, "learning_rate": 0.00018678001842503347, "loss": 4.6274, "step": 11085 }, { "epoch": 4.982030548068284, "grad_norm": 3.484375, "learning_rate": 0.0001866767149789023, "loss": 4.7188, "step": 11090 }, { "epoch": 4.984276729559748, "grad_norm": 3.453125, "learning_rate": 0.00018657339850247407, "loss": 4.6799, "step": 11095 }, { "epoch": 4.986522911051213, "grad_norm": 4.9375, "learning_rate": 0.0001864700690578543, "loss": 4.6977, "step": 11100 }, { "epoch": 4.988769092542677, "grad_norm": 3.34375, "learning_rate": 0.00018636672670715632, "loss": 4.7117, "step": 11105 }, { "epoch": 4.991015274034142, "grad_norm": 3.25, "learning_rate": 0.0001862633715125013, "loss": 4.6384, "step": 11110 }, { "epoch": 4.993261455525607, "grad_norm": 3.25, "learning_rate": 0.00018616000353601804, "loss": 4.6931, "step": 11115 }, { "epoch": 4.995507637017071, "grad_norm": 3.421875, "learning_rate": 0.00018605662283984305, "loss": 4.702, "step": 11120 }, { "epoch": 4.997753818508535, "grad_norm": 3.46875, "learning_rate": 0.00018595322948612047, "loss": 4.7222, "step": 11125 }, { "epoch": 5.0, "grad_norm": 8.5, "learning_rate": 0.00018584982353700208, "loss": 4.6256, "step": 11130 }, { "epoch": 5.002246181491465, "grad_norm": 3.5, "learning_rate": 0.00018574640505464722, "loss": 4.5416, "step": 11135 }, { "epoch": 5.004492362982929, "grad_norm": 3.328125, "learning_rate": 0.00018564297410122272, "loss": 4.5626, "step": 11140 }, { "epoch": 5.006738544474394, "grad_norm": 3.6875, "learning_rate": 0.00018553953073890305, "loss": 4.6248, "step": 11145 }, { "epoch": 5.008984725965858, "grad_norm": 3.375, "learning_rate": 0.00018543607502986996, "loss": 4.6286, "step": 11150 }, { "epoch": 5.011230907457323, "grad_norm": 3.84375, "learning_rate": 0.00018533260703631265, "loss": 4.5629, "step": 11155 }, { "epoch": 5.013477088948787, "grad_norm": 3.375, "learning_rate": 0.00018522912682042786, "loss": 4.5586, "step": 11160 }, { "epoch": 5.015723270440252, "grad_norm": 3.765625, "learning_rate": 0.0001851256344444195, "loss": 4.6115, "step": 11165 }, { "epoch": 5.017969451931716, "grad_norm": 3.484375, "learning_rate": 0.00018502212997049893, "loss": 4.5836, "step": 11170 }, { "epoch": 5.020215633423181, "grad_norm": 3.484375, "learning_rate": 0.00018491861346088464, "loss": 4.538, "step": 11175 }, { "epoch": 5.022461814914645, "grad_norm": 3.53125, "learning_rate": 0.00018481508497780245, "loss": 4.5572, "step": 11180 }, { "epoch": 5.02470799640611, "grad_norm": 3.34375, "learning_rate": 0.00018471154458348538, "loss": 4.5747, "step": 11185 }, { "epoch": 5.026954177897574, "grad_norm": 3.546875, "learning_rate": 0.00018460799234017354, "loss": 4.5392, "step": 11190 }, { "epoch": 5.029200359389039, "grad_norm": 3.53125, "learning_rate": 0.0001845044283101142, "loss": 4.6448, "step": 11195 }, { "epoch": 5.031446540880503, "grad_norm": 3.578125, "learning_rate": 0.00018440085255556183, "loss": 4.5819, "step": 11200 }, { "epoch": 5.033692722371968, "grad_norm": 3.34375, "learning_rate": 0.00018429726513877773, "loss": 4.5667, "step": 11205 }, { "epoch": 5.035938903863432, "grad_norm": 3.46875, "learning_rate": 0.00018419366612203037, "loss": 4.5275, "step": 11210 }, { "epoch": 5.038185085354897, "grad_norm": 3.546875, "learning_rate": 0.00018409005556759513, "loss": 4.6251, "step": 11215 }, { "epoch": 5.040431266846361, "grad_norm": 3.4375, "learning_rate": 0.0001839864335377543, "loss": 4.5541, "step": 11220 }, { "epoch": 5.042677448337826, "grad_norm": 3.3125, "learning_rate": 0.00018388280009479718, "loss": 4.4648, "step": 11225 }, { "epoch": 5.04492362982929, "grad_norm": 3.703125, "learning_rate": 0.00018377915530101984, "loss": 4.5523, "step": 11230 }, { "epoch": 5.047169811320755, "grad_norm": 3.265625, "learning_rate": 0.00018367549921872512, "loss": 4.5651, "step": 11235 }, { "epoch": 5.049415992812219, "grad_norm": 3.3125, "learning_rate": 0.00018357183191022283, "loss": 4.6121, "step": 11240 }, { "epoch": 5.051662174303684, "grad_norm": 3.421875, "learning_rate": 0.00018346815343782936, "loss": 4.5563, "step": 11245 }, { "epoch": 5.053908355795148, "grad_norm": 3.46875, "learning_rate": 0.00018336446386386782, "loss": 4.6189, "step": 11250 }, { "epoch": 5.056154537286613, "grad_norm": 3.4375, "learning_rate": 0.00018326076325066808, "loss": 4.5833, "step": 11255 }, { "epoch": 5.058400718778077, "grad_norm": 3.453125, "learning_rate": 0.00018315705166056667, "loss": 4.5648, "step": 11260 }, { "epoch": 5.060646900269542, "grad_norm": 3.328125, "learning_rate": 0.0001830533291559066, "loss": 4.5622, "step": 11265 }, { "epoch": 5.062893081761007, "grad_norm": 3.359375, "learning_rate": 0.00018294959579903742, "loss": 4.5399, "step": 11270 }, { "epoch": 5.065139263252471, "grad_norm": 3.578125, "learning_rate": 0.0001828458516523154, "loss": 4.6647, "step": 11275 }, { "epoch": 5.067385444743936, "grad_norm": 3.703125, "learning_rate": 0.0001827420967781031, "loss": 4.5279, "step": 11280 }, { "epoch": 5.0696316262354, "grad_norm": 3.625, "learning_rate": 0.00018263833123876962, "loss": 4.6492, "step": 11285 }, { "epoch": 5.071877807726865, "grad_norm": 3.15625, "learning_rate": 0.00018253455509669047, "loss": 4.632, "step": 11290 }, { "epoch": 5.074123989218329, "grad_norm": 3.609375, "learning_rate": 0.00018243076841424754, "loss": 4.5971, "step": 11295 }, { "epoch": 5.076370170709794, "grad_norm": 3.515625, "learning_rate": 0.00018232697125382903, "loss": 4.5777, "step": 11300 }, { "epoch": 5.078616352201258, "grad_norm": 3.28125, "learning_rate": 0.0001822231636778293, "loss": 4.5621, "step": 11305 }, { "epoch": 5.080862533692723, "grad_norm": 3.71875, "learning_rate": 0.0001821193457486493, "loss": 4.584, "step": 11310 }, { "epoch": 5.083108715184187, "grad_norm": 3.4375, "learning_rate": 0.00018201551752869595, "loss": 4.5465, "step": 11315 }, { "epoch": 5.085354896675652, "grad_norm": 3.4375, "learning_rate": 0.0001819116790803824, "loss": 4.6135, "step": 11320 }, { "epoch": 5.087601078167116, "grad_norm": 3.375, "learning_rate": 0.00018180783046612797, "loss": 4.5585, "step": 11325 }, { "epoch": 5.089847259658581, "grad_norm": 3.421875, "learning_rate": 0.00018170397174835812, "loss": 4.6003, "step": 11330 }, { "epoch": 5.092093441150045, "grad_norm": 3.421875, "learning_rate": 0.00018160010298950432, "loss": 4.5887, "step": 11335 }, { "epoch": 5.09433962264151, "grad_norm": 3.71875, "learning_rate": 0.00018149622425200419, "loss": 4.6328, "step": 11340 }, { "epoch": 5.096585804132974, "grad_norm": 3.421875, "learning_rate": 0.00018139233559830118, "loss": 4.554, "step": 11345 }, { "epoch": 5.098831985624439, "grad_norm": 3.40625, "learning_rate": 0.00018128843709084484, "loss": 4.5458, "step": 11350 }, { "epoch": 5.101078167115903, "grad_norm": 3.375, "learning_rate": 0.00018118452879209055, "loss": 4.5481, "step": 11355 }, { "epoch": 5.103324348607368, "grad_norm": 3.609375, "learning_rate": 0.0001810806107644997, "loss": 4.5575, "step": 11360 }, { "epoch": 5.105570530098832, "grad_norm": 3.5, "learning_rate": 0.00018097668307053935, "loss": 4.6343, "step": 11365 }, { "epoch": 5.107816711590297, "grad_norm": 3.515625, "learning_rate": 0.00018087274577268246, "loss": 4.5504, "step": 11370 }, { "epoch": 5.110062893081761, "grad_norm": 3.359375, "learning_rate": 0.00018076879893340794, "loss": 4.6276, "step": 11375 }, { "epoch": 5.112309074573226, "grad_norm": 3.515625, "learning_rate": 0.0001806648426152001, "loss": 4.5617, "step": 11380 }, { "epoch": 5.11455525606469, "grad_norm": 3.46875, "learning_rate": 0.00018056087688054918, "loss": 4.617, "step": 11385 }, { "epoch": 5.116801437556155, "grad_norm": 3.609375, "learning_rate": 0.000180456901791951, "loss": 4.5855, "step": 11390 }, { "epoch": 5.119047619047619, "grad_norm": 3.453125, "learning_rate": 0.000180352917411907, "loss": 4.6229, "step": 11395 }, { "epoch": 5.121293800539084, "grad_norm": 3.53125, "learning_rate": 0.00018024892380292425, "loss": 4.576, "step": 11400 }, { "epoch": 5.1235399820305485, "grad_norm": 3.21875, "learning_rate": 0.00018014492102751535, "loss": 4.5866, "step": 11405 }, { "epoch": 5.1257861635220126, "grad_norm": 3.484375, "learning_rate": 0.00018004090914819837, "loss": 4.6268, "step": 11410 }, { "epoch": 5.1280323450134775, "grad_norm": 3.375, "learning_rate": 0.00017993688822749696, "loss": 4.5861, "step": 11415 }, { "epoch": 5.1302785265049415, "grad_norm": 3.453125, "learning_rate": 0.00017983285832794, "loss": 4.6557, "step": 11420 }, { "epoch": 5.1325247079964065, "grad_norm": 3.40625, "learning_rate": 0.00017972881951206193, "loss": 4.5783, "step": 11425 }, { "epoch": 5.1347708894878705, "grad_norm": 3.6875, "learning_rate": 0.00017962477184240263, "loss": 4.5894, "step": 11430 }, { "epoch": 5.1370170709793355, "grad_norm": 3.515625, "learning_rate": 0.0001795207153815071, "loss": 4.5467, "step": 11435 }, { "epoch": 5.1392632524707995, "grad_norm": 3.609375, "learning_rate": 0.0001794166501919257, "loss": 4.6068, "step": 11440 }, { "epoch": 5.1415094339622645, "grad_norm": 3.453125, "learning_rate": 0.00017931257633621404, "loss": 4.5561, "step": 11445 }, { "epoch": 5.1437556154537285, "grad_norm": 3.6875, "learning_rate": 0.00017920849387693307, "loss": 4.507, "step": 11450 }, { "epoch": 5.146001796945193, "grad_norm": 3.609375, "learning_rate": 0.0001791044028766486, "loss": 4.6283, "step": 11455 }, { "epoch": 5.1482479784366575, "grad_norm": 3.59375, "learning_rate": 0.00017900030339793193, "loss": 4.5579, "step": 11460 }, { "epoch": 5.150494159928122, "grad_norm": 3.578125, "learning_rate": 0.00017889619550335925, "loss": 4.6677, "step": 11465 }, { "epoch": 5.1527403414195865, "grad_norm": 3.546875, "learning_rate": 0.00017879207925551179, "loss": 4.6207, "step": 11470 }, { "epoch": 5.154986522911051, "grad_norm": 3.578125, "learning_rate": 0.00017868795471697588, "loss": 4.5903, "step": 11475 }, { "epoch": 5.1572327044025155, "grad_norm": 3.65625, "learning_rate": 0.00017858382195034284, "loss": 4.5039, "step": 11480 }, { "epoch": 5.15947888589398, "grad_norm": 3.5, "learning_rate": 0.0001784796810182089, "loss": 4.61, "step": 11485 }, { "epoch": 5.1617250673854445, "grad_norm": 3.90625, "learning_rate": 0.00017837553198317524, "loss": 4.6192, "step": 11490 }, { "epoch": 5.163971248876909, "grad_norm": 3.609375, "learning_rate": 0.00017827137490784788, "loss": 4.6387, "step": 11495 }, { "epoch": 5.166217430368373, "grad_norm": 3.328125, "learning_rate": 0.0001781672098548376, "loss": 4.5613, "step": 11500 }, { "epoch": 5.168463611859838, "grad_norm": 3.34375, "learning_rate": 0.00017806303688676012, "loss": 4.6324, "step": 11505 }, { "epoch": 5.170709793351302, "grad_norm": 3.53125, "learning_rate": 0.0001779588560662358, "loss": 4.6147, "step": 11510 }, { "epoch": 5.172955974842767, "grad_norm": 3.546875, "learning_rate": 0.00017785466745588984, "loss": 4.6576, "step": 11515 }, { "epoch": 5.175202156334231, "grad_norm": 3.625, "learning_rate": 0.000177750471118352, "loss": 4.5248, "step": 11520 }, { "epoch": 5.177448337825696, "grad_norm": 3.375, "learning_rate": 0.00017764626711625668, "loss": 4.5522, "step": 11525 }, { "epoch": 5.17969451931716, "grad_norm": 3.453125, "learning_rate": 0.0001775420555122431, "loss": 4.5719, "step": 11530 }, { "epoch": 5.181940700808625, "grad_norm": 3.40625, "learning_rate": 0.00017743783636895474, "loss": 4.593, "step": 11535 }, { "epoch": 5.184186882300089, "grad_norm": 3.5, "learning_rate": 0.00017733360974903984, "loss": 4.5872, "step": 11540 }, { "epoch": 5.186433063791554, "grad_norm": 3.40625, "learning_rate": 0.000177229375715151, "loss": 4.5696, "step": 11545 }, { "epoch": 5.188679245283019, "grad_norm": 3.65625, "learning_rate": 0.00017712513432994542, "loss": 4.5896, "step": 11550 }, { "epoch": 5.190925426774483, "grad_norm": 3.359375, "learning_rate": 0.00017702088565608459, "loss": 4.6056, "step": 11555 }, { "epoch": 5.193171608265948, "grad_norm": 3.546875, "learning_rate": 0.00017691662975623435, "loss": 4.5833, "step": 11560 }, { "epoch": 5.195417789757412, "grad_norm": 3.65625, "learning_rate": 0.0001768123666930651, "loss": 4.5589, "step": 11565 }, { "epoch": 5.197663971248877, "grad_norm": 3.5, "learning_rate": 0.00017670809652925128, "loss": 4.5404, "step": 11570 }, { "epoch": 5.199910152740341, "grad_norm": 3.453125, "learning_rate": 0.0001766038193274718, "loss": 4.5204, "step": 11575 }, { "epoch": 5.202156334231806, "grad_norm": 3.515625, "learning_rate": 0.00017649953515040976, "loss": 4.6046, "step": 11580 }, { "epoch": 5.20440251572327, "grad_norm": 3.53125, "learning_rate": 0.00017639524406075233, "loss": 4.6078, "step": 11585 }, { "epoch": 5.206648697214735, "grad_norm": 3.515625, "learning_rate": 0.00017629094612119098, "loss": 4.5427, "step": 11590 }, { "epoch": 5.208894878706199, "grad_norm": 3.578125, "learning_rate": 0.00017618664139442116, "loss": 4.6234, "step": 11595 }, { "epoch": 5.211141060197664, "grad_norm": 3.328125, "learning_rate": 0.00017608232994314254, "loss": 4.6195, "step": 11600 }, { "epoch": 5.213387241689128, "grad_norm": 3.625, "learning_rate": 0.0001759780118300588, "loss": 4.5626, "step": 11605 }, { "epoch": 5.215633423180593, "grad_norm": 3.390625, "learning_rate": 0.00017587368711787754, "loss": 4.5641, "step": 11610 }, { "epoch": 5.217879604672057, "grad_norm": 3.59375, "learning_rate": 0.00017576935586931046, "loss": 4.6135, "step": 11615 }, { "epoch": 5.220125786163522, "grad_norm": 3.609375, "learning_rate": 0.00017566501814707304, "loss": 4.6373, "step": 11620 }, { "epoch": 5.222371967654986, "grad_norm": 3.453125, "learning_rate": 0.00017556067401388467, "loss": 4.6505, "step": 11625 }, { "epoch": 5.224618149146451, "grad_norm": 3.625, "learning_rate": 0.00017545632353246882, "loss": 4.4975, "step": 11630 }, { "epoch": 5.226864330637915, "grad_norm": 3.625, "learning_rate": 0.00017535196676555248, "loss": 4.6227, "step": 11635 }, { "epoch": 5.22911051212938, "grad_norm": 3.40625, "learning_rate": 0.00017524760377586655, "loss": 4.545, "step": 11640 }, { "epoch": 5.231356693620844, "grad_norm": 3.609375, "learning_rate": 0.0001751432346261457, "loss": 4.6033, "step": 11645 }, { "epoch": 5.233602875112309, "grad_norm": 3.515625, "learning_rate": 0.00017503885937912824, "loss": 4.5615, "step": 11650 }, { "epoch": 5.235849056603773, "grad_norm": 3.5625, "learning_rate": 0.00017493447809755614, "loss": 4.5551, "step": 11655 }, { "epoch": 5.238095238095238, "grad_norm": 3.625, "learning_rate": 0.0001748300908441751, "loss": 4.5172, "step": 11660 }, { "epoch": 5.240341419586702, "grad_norm": 3.484375, "learning_rate": 0.00017472569768173436, "loss": 4.5991, "step": 11665 }, { "epoch": 5.242587601078167, "grad_norm": 3.84375, "learning_rate": 0.00017462129867298656, "loss": 4.6592, "step": 11670 }, { "epoch": 5.244833782569632, "grad_norm": 4.03125, "learning_rate": 0.00017451689388068813, "loss": 4.596, "step": 11675 }, { "epoch": 5.247079964061096, "grad_norm": 3.921875, "learning_rate": 0.00017441248336759872, "loss": 4.5997, "step": 11680 }, { "epoch": 5.249326145552561, "grad_norm": 3.625, "learning_rate": 0.00017430806719648153, "loss": 4.6422, "step": 11685 }, { "epoch": 5.251572327044025, "grad_norm": 3.6875, "learning_rate": 0.00017420364543010327, "loss": 4.6005, "step": 11690 }, { "epoch": 5.25381850853549, "grad_norm": 3.53125, "learning_rate": 0.0001740992181312339, "loss": 4.5657, "step": 11695 }, { "epoch": 5.256064690026954, "grad_norm": 4.0625, "learning_rate": 0.0001739947853626466, "loss": 4.5688, "step": 11700 }, { "epoch": 5.258310871518419, "grad_norm": 3.4375, "learning_rate": 0.00017389034718711795, "loss": 4.5535, "step": 11705 }, { "epoch": 5.260557053009883, "grad_norm": 3.84375, "learning_rate": 0.00017378590366742784, "loss": 4.6458, "step": 11710 }, { "epoch": 5.262803234501348, "grad_norm": 3.671875, "learning_rate": 0.00017368145486635933, "loss": 4.617, "step": 11715 }, { "epoch": 5.265049415992812, "grad_norm": 3.890625, "learning_rate": 0.00017357700084669862, "loss": 4.5719, "step": 11720 }, { "epoch": 5.267295597484277, "grad_norm": 3.515625, "learning_rate": 0.000173472541671235, "loss": 4.5821, "step": 11725 }, { "epoch": 5.269541778975741, "grad_norm": 3.359375, "learning_rate": 0.00017336807740276098, "loss": 4.6096, "step": 11730 }, { "epoch": 5.271787960467206, "grad_norm": 3.734375, "learning_rate": 0.00017326360810407214, "loss": 4.5613, "step": 11735 }, { "epoch": 5.27403414195867, "grad_norm": 3.625, "learning_rate": 0.00017315913383796685, "loss": 4.6638, "step": 11740 }, { "epoch": 5.276280323450135, "grad_norm": 3.5625, "learning_rate": 0.00017305465466724672, "loss": 4.6461, "step": 11745 }, { "epoch": 5.278526504941599, "grad_norm": 3.5, "learning_rate": 0.00017295017065471627, "loss": 4.6047, "step": 11750 }, { "epoch": 5.280772686433064, "grad_norm": 3.359375, "learning_rate": 0.00017284568186318286, "loss": 4.5866, "step": 11755 }, { "epoch": 5.283018867924528, "grad_norm": 3.546875, "learning_rate": 0.00017274118835545668, "loss": 4.5601, "step": 11760 }, { "epoch": 5.285265049415993, "grad_norm": 3.734375, "learning_rate": 0.0001726366901943509, "loss": 4.5449, "step": 11765 }, { "epoch": 5.287511230907457, "grad_norm": 3.59375, "learning_rate": 0.00017253218744268137, "loss": 4.6337, "step": 11770 }, { "epoch": 5.289757412398922, "grad_norm": 3.671875, "learning_rate": 0.0001724276801632667, "loss": 4.6278, "step": 11775 }, { "epoch": 5.292003593890386, "grad_norm": 3.546875, "learning_rate": 0.00017232316841892832, "loss": 4.626, "step": 11780 }, { "epoch": 5.294249775381851, "grad_norm": 3.484375, "learning_rate": 0.00017221865227249028, "loss": 4.6026, "step": 11785 }, { "epoch": 5.296495956873315, "grad_norm": 3.359375, "learning_rate": 0.00017211413178677923, "loss": 4.5839, "step": 11790 }, { "epoch": 5.29874213836478, "grad_norm": 3.609375, "learning_rate": 0.0001720096070246245, "loss": 4.5683, "step": 11795 }, { "epoch": 5.300988319856244, "grad_norm": 3.375, "learning_rate": 0.000171905078048858, "loss": 4.5679, "step": 11800 }, { "epoch": 5.303234501347709, "grad_norm": 3.59375, "learning_rate": 0.0001718005449223141, "loss": 4.633, "step": 11805 }, { "epoch": 5.305480682839173, "grad_norm": 3.65625, "learning_rate": 0.0001716960077078297, "loss": 4.6708, "step": 11810 }, { "epoch": 5.307726864330638, "grad_norm": 3.59375, "learning_rate": 0.0001715914664682442, "loss": 4.613, "step": 11815 }, { "epoch": 5.309973045822103, "grad_norm": 3.5, "learning_rate": 0.00017148692126639937, "loss": 4.6032, "step": 11820 }, { "epoch": 5.312219227313567, "grad_norm": 3.59375, "learning_rate": 0.00017138237216513937, "loss": 4.5737, "step": 11825 }, { "epoch": 5.314465408805032, "grad_norm": 3.515625, "learning_rate": 0.00017127781922731067, "loss": 4.5867, "step": 11830 }, { "epoch": 5.316711590296496, "grad_norm": 3.5625, "learning_rate": 0.00017117326251576216, "loss": 4.571, "step": 11835 }, { "epoch": 5.318957771787961, "grad_norm": 3.625, "learning_rate": 0.00017106870209334488, "loss": 4.5513, "step": 11840 }, { "epoch": 5.321203953279425, "grad_norm": 3.453125, "learning_rate": 0.00017096413802291212, "loss": 4.5808, "step": 11845 }, { "epoch": 5.32345013477089, "grad_norm": 3.578125, "learning_rate": 0.00017085957036731947, "loss": 4.5539, "step": 11850 }, { "epoch": 5.325696316262354, "grad_norm": 3.546875, "learning_rate": 0.0001707549991894245, "loss": 4.6171, "step": 11855 }, { "epoch": 5.327942497753819, "grad_norm": 3.609375, "learning_rate": 0.00017065042455208704, "loss": 4.5978, "step": 11860 }, { "epoch": 5.330188679245283, "grad_norm": 3.359375, "learning_rate": 0.0001705458465181689, "loss": 4.6106, "step": 11865 }, { "epoch": 5.332434860736748, "grad_norm": 3.625, "learning_rate": 0.00017044126515053403, "loss": 4.5804, "step": 11870 }, { "epoch": 5.334681042228212, "grad_norm": 3.53125, "learning_rate": 0.00017033668051204837, "loss": 4.5118, "step": 11875 }, { "epoch": 5.336927223719677, "grad_norm": 3.625, "learning_rate": 0.00017023209266557967, "loss": 4.6332, "step": 11880 }, { "epoch": 5.339173405211141, "grad_norm": 3.6875, "learning_rate": 0.00017012750167399781, "loss": 4.6062, "step": 11885 }, { "epoch": 5.341419586702606, "grad_norm": 3.53125, "learning_rate": 0.00017002290760017447, "loss": 4.603, "step": 11890 }, { "epoch": 5.34366576819407, "grad_norm": 3.515625, "learning_rate": 0.00016991831050698324, "loss": 4.5982, "step": 11895 }, { "epoch": 5.345911949685535, "grad_norm": 3.625, "learning_rate": 0.00016981371045729938, "loss": 4.5631, "step": 11900 }, { "epoch": 5.348158131176999, "grad_norm": 3.609375, "learning_rate": 0.00016970910751400007, "loss": 4.5913, "step": 11905 }, { "epoch": 5.350404312668464, "grad_norm": 3.515625, "learning_rate": 0.0001696045017399642, "loss": 4.622, "step": 11910 }, { "epoch": 5.352650494159928, "grad_norm": 3.53125, "learning_rate": 0.0001694998931980723, "loss": 4.549, "step": 11915 }, { "epoch": 5.354896675651393, "grad_norm": 3.484375, "learning_rate": 0.00016939528195120669, "loss": 4.552, "step": 11920 }, { "epoch": 5.357142857142857, "grad_norm": 3.609375, "learning_rate": 0.0001692906680622512, "loss": 4.6126, "step": 11925 }, { "epoch": 5.359389038634322, "grad_norm": 3.40625, "learning_rate": 0.0001691860515940912, "loss": 4.574, "step": 11930 }, { "epoch": 5.361635220125786, "grad_norm": 3.578125, "learning_rate": 0.00016908143260961387, "loss": 4.5608, "step": 11935 }, { "epoch": 5.363881401617251, "grad_norm": 3.78125, "learning_rate": 0.00016897681117170748, "loss": 4.5848, "step": 11940 }, { "epoch": 5.366127583108716, "grad_norm": 4.0, "learning_rate": 0.00016887218734326222, "loss": 4.6342, "step": 11945 }, { "epoch": 5.36837376460018, "grad_norm": 3.703125, "learning_rate": 0.0001687675611871695, "loss": 4.5628, "step": 11950 }, { "epoch": 5.370619946091645, "grad_norm": 3.515625, "learning_rate": 0.00016866293276632206, "loss": 4.7054, "step": 11955 }, { "epoch": 5.372866127583109, "grad_norm": 4.34375, "learning_rate": 0.00016855830214361416, "loss": 4.5294, "step": 11960 }, { "epoch": 5.375112309074574, "grad_norm": 3.796875, "learning_rate": 0.00016845366938194128, "loss": 4.6392, "step": 11965 }, { "epoch": 5.377358490566038, "grad_norm": 3.578125, "learning_rate": 0.00016834903454420022, "loss": 4.6763, "step": 11970 }, { "epoch": 5.379604672057503, "grad_norm": 3.421875, "learning_rate": 0.000168244397693289, "loss": 4.6158, "step": 11975 }, { "epoch": 5.381850853548967, "grad_norm": 3.578125, "learning_rate": 0.00016813975889210696, "loss": 4.6223, "step": 11980 }, { "epoch": 5.384097035040432, "grad_norm": 3.5, "learning_rate": 0.00016803511820355447, "loss": 4.6112, "step": 11985 }, { "epoch": 5.386343216531896, "grad_norm": 3.40625, "learning_rate": 0.0001679304756905331, "loss": 4.545, "step": 11990 }, { "epoch": 5.388589398023361, "grad_norm": 3.5, "learning_rate": 0.0001678258314159455, "loss": 4.5991, "step": 11995 }, { "epoch": 5.390835579514825, "grad_norm": 3.640625, "learning_rate": 0.0001677211854426954, "loss": 4.6018, "step": 12000 }, { "epoch": 5.390835579514825, "eval_loss": 4.814459800720215, "eval_runtime": 16.0452, "eval_samples_per_second": 1932.846, "eval_steps_per_second": 241.629, "step": 12000 }, { "epoch": 5.3930817610062896, "grad_norm": 3.578125, "learning_rate": 0.00016761653783368754, "loss": 4.6236, "step": 12005 }, { "epoch": 5.395327942497754, "grad_norm": 3.515625, "learning_rate": 0.00016751188865182765, "loss": 4.5989, "step": 12010 }, { "epoch": 5.3975741239892185, "grad_norm": 3.59375, "learning_rate": 0.0001674072379600224, "loss": 4.5864, "step": 12015 }, { "epoch": 5.399820305480683, "grad_norm": 3.59375, "learning_rate": 0.00016730258582117936, "loss": 4.6238, "step": 12020 }, { "epoch": 5.4020664869721475, "grad_norm": 3.5625, "learning_rate": 0.000167197932298207, "loss": 4.5502, "step": 12025 }, { "epoch": 5.404312668463612, "grad_norm": 3.515625, "learning_rate": 0.00016709327745401448, "loss": 4.6318, "step": 12030 }, { "epoch": 5.4065588499550765, "grad_norm": 3.515625, "learning_rate": 0.00016698862135151204, "loss": 4.6023, "step": 12035 }, { "epoch": 5.408805031446541, "grad_norm": 3.65625, "learning_rate": 0.00016688396405361043, "loss": 4.6592, "step": 12040 }, { "epoch": 5.4110512129380055, "grad_norm": 3.515625, "learning_rate": 0.00016677930562322119, "loss": 4.5976, "step": 12045 }, { "epoch": 5.4132973944294696, "grad_norm": 3.8125, "learning_rate": 0.00016667464612325658, "loss": 4.5946, "step": 12050 }, { "epoch": 5.4155435759209345, "grad_norm": 4.21875, "learning_rate": 0.0001665699856166294, "loss": 4.6208, "step": 12055 }, { "epoch": 5.4177897574123985, "grad_norm": 3.6875, "learning_rate": 0.00016646532416625322, "loss": 4.535, "step": 12060 }, { "epoch": 5.4200359389038635, "grad_norm": 3.359375, "learning_rate": 0.000166360661835042, "loss": 4.5717, "step": 12065 }, { "epoch": 5.4222821203953275, "grad_norm": 3.53125, "learning_rate": 0.0001662559986859104, "loss": 4.5988, "step": 12070 }, { "epoch": 5.4245283018867925, "grad_norm": 3.671875, "learning_rate": 0.00016615133478177342, "loss": 4.5987, "step": 12075 }, { "epoch": 5.4267744833782565, "grad_norm": 3.609375, "learning_rate": 0.00016604667018554661, "loss": 4.5958, "step": 12080 }, { "epoch": 5.4290206648697215, "grad_norm": 3.78125, "learning_rate": 0.0001659420049601459, "loss": 4.6424, "step": 12085 }, { "epoch": 5.431266846361186, "grad_norm": 3.421875, "learning_rate": 0.00016583733916848754, "loss": 4.5967, "step": 12090 }, { "epoch": 5.4335130278526504, "grad_norm": 3.640625, "learning_rate": 0.0001657326728734883, "loss": 4.584, "step": 12095 }, { "epoch": 5.435759209344115, "grad_norm": 3.453125, "learning_rate": 0.00016562800613806507, "loss": 4.5748, "step": 12100 }, { "epoch": 5.438005390835579, "grad_norm": 3.578125, "learning_rate": 0.00016552333902513505, "loss": 4.6481, "step": 12105 }, { "epoch": 5.440251572327044, "grad_norm": 3.328125, "learning_rate": 0.00016541867159761573, "loss": 4.5908, "step": 12110 }, { "epoch": 5.442497753818508, "grad_norm": 3.40625, "learning_rate": 0.0001653140039184247, "loss": 4.6422, "step": 12115 }, { "epoch": 5.444743935309973, "grad_norm": 3.59375, "learning_rate": 0.00016520933605047977, "loss": 4.576, "step": 12120 }, { "epoch": 5.446990116801437, "grad_norm": 3.765625, "learning_rate": 0.00016510466805669892, "loss": 4.619, "step": 12125 }, { "epoch": 5.449236298292902, "grad_norm": 3.328125, "learning_rate": 0.000165, "loss": 4.5754, "step": 12130 }, { "epoch": 5.451482479784366, "grad_norm": 3.484375, "learning_rate": 0.00016489533194330108, "loss": 4.5766, "step": 12135 }, { "epoch": 5.453728661275831, "grad_norm": 3.5625, "learning_rate": 0.00016479066394952017, "loss": 4.5739, "step": 12140 }, { "epoch": 5.455974842767295, "grad_norm": 3.640625, "learning_rate": 0.0001646859960815753, "loss": 4.631, "step": 12145 }, { "epoch": 5.45822102425876, "grad_norm": 3.6875, "learning_rate": 0.00016458132840238427, "loss": 4.5661, "step": 12150 }, { "epoch": 5.460467205750224, "grad_norm": 3.625, "learning_rate": 0.00016447666097486494, "loss": 4.6136, "step": 12155 }, { "epoch": 5.462713387241689, "grad_norm": 3.609375, "learning_rate": 0.00016437199386193493, "loss": 4.5473, "step": 12160 }, { "epoch": 5.464959568733153, "grad_norm": 3.484375, "learning_rate": 0.00016426732712651167, "loss": 4.5471, "step": 12165 }, { "epoch": 5.467205750224618, "grad_norm": 3.390625, "learning_rate": 0.00016416266083151243, "loss": 4.5728, "step": 12170 }, { "epoch": 5.469451931716082, "grad_norm": 3.5625, "learning_rate": 0.0001640579950398541, "loss": 4.5791, "step": 12175 }, { "epoch": 5.471698113207547, "grad_norm": 3.453125, "learning_rate": 0.00016395332981445336, "loss": 4.6452, "step": 12180 }, { "epoch": 5.473944294699011, "grad_norm": 3.484375, "learning_rate": 0.00016384866521822655, "loss": 4.5756, "step": 12185 }, { "epoch": 5.476190476190476, "grad_norm": 3.484375, "learning_rate": 0.00016374400131408958, "loss": 4.6024, "step": 12190 }, { "epoch": 5.47843665768194, "grad_norm": 3.5625, "learning_rate": 0.000163639338164958, "loss": 4.5795, "step": 12195 }, { "epoch": 5.480682839173405, "grad_norm": 3.5, "learning_rate": 0.00016353467583374675, "loss": 4.5962, "step": 12200 }, { "epoch": 5.482929020664869, "grad_norm": 3.671875, "learning_rate": 0.0001634300143833706, "loss": 4.5926, "step": 12205 }, { "epoch": 5.485175202156334, "grad_norm": 3.609375, "learning_rate": 0.0001633253538767435, "loss": 4.5774, "step": 12210 }, { "epoch": 5.487421383647799, "grad_norm": 3.4375, "learning_rate": 0.00016322069437677884, "loss": 4.5415, "step": 12215 }, { "epoch": 5.489667565139263, "grad_norm": 3.5625, "learning_rate": 0.00016311603594638962, "loss": 4.5944, "step": 12220 }, { "epoch": 5.491913746630728, "grad_norm": 3.546875, "learning_rate": 0.00016301137864848799, "loss": 4.6549, "step": 12225 }, { "epoch": 5.494159928122192, "grad_norm": 3.828125, "learning_rate": 0.00016290672254598552, "loss": 4.5839, "step": 12230 }, { "epoch": 5.496406109613657, "grad_norm": 3.484375, "learning_rate": 0.00016280206770179307, "loss": 4.5881, "step": 12235 }, { "epoch": 5.498652291105121, "grad_norm": 3.3125, "learning_rate": 0.00016269741417882064, "loss": 4.6375, "step": 12240 }, { "epoch": 5.500898472596586, "grad_norm": 3.4375, "learning_rate": 0.0001625927620399776, "loss": 4.5649, "step": 12245 }, { "epoch": 5.50314465408805, "grad_norm": 3.5625, "learning_rate": 0.00016248811134817235, "loss": 4.6003, "step": 12250 }, { "epoch": 5.505390835579515, "grad_norm": 3.5, "learning_rate": 0.00016238346216631246, "loss": 4.6594, "step": 12255 }, { "epoch": 5.507637017070979, "grad_norm": 3.59375, "learning_rate": 0.0001622788145573046, "loss": 4.638, "step": 12260 }, { "epoch": 5.509883198562444, "grad_norm": 3.890625, "learning_rate": 0.0001621741685840545, "loss": 4.6509, "step": 12265 }, { "epoch": 5.512129380053908, "grad_norm": 3.6875, "learning_rate": 0.0001620695243094669, "loss": 4.6186, "step": 12270 }, { "epoch": 5.514375561545373, "grad_norm": 3.421875, "learning_rate": 0.00016196488179644552, "loss": 4.7181, "step": 12275 }, { "epoch": 5.516621743036837, "grad_norm": 3.359375, "learning_rate": 0.00016186024110789304, "loss": 4.5435, "step": 12280 }, { "epoch": 5.518867924528302, "grad_norm": 3.546875, "learning_rate": 0.000161755602306711, "loss": 4.591, "step": 12285 }, { "epoch": 5.521114106019766, "grad_norm": 3.828125, "learning_rate": 0.00016165096545579978, "loss": 4.6252, "step": 12290 }, { "epoch": 5.523360287511231, "grad_norm": 3.484375, "learning_rate": 0.00016154633061805872, "loss": 4.5458, "step": 12295 }, { "epoch": 5.525606469002695, "grad_norm": 3.265625, "learning_rate": 0.00016144169785638584, "loss": 4.5708, "step": 12300 }, { "epoch": 5.52785265049416, "grad_norm": 3.453125, "learning_rate": 0.00016133706723367794, "loss": 4.6349, "step": 12305 }, { "epoch": 5.530098831985624, "grad_norm": 3.71875, "learning_rate": 0.0001612324388128305, "loss": 4.6165, "step": 12310 }, { "epoch": 5.532345013477089, "grad_norm": 3.5625, "learning_rate": 0.00016112781265673778, "loss": 4.612, "step": 12315 }, { "epoch": 5.534591194968553, "grad_norm": 3.75, "learning_rate": 0.0001610231888282925, "loss": 4.579, "step": 12320 }, { "epoch": 5.536837376460018, "grad_norm": 3.640625, "learning_rate": 0.0001609185673903862, "loss": 4.6182, "step": 12325 }, { "epoch": 5.539083557951482, "grad_norm": 3.40625, "learning_rate": 0.00016081394840590876, "loss": 4.5729, "step": 12330 }, { "epoch": 5.541329739442947, "grad_norm": 3.390625, "learning_rate": 0.0001607093319377488, "loss": 4.6167, "step": 12335 }, { "epoch": 5.543575920934412, "grad_norm": 3.59375, "learning_rate": 0.00016060471804879326, "loss": 4.618, "step": 12340 }, { "epoch": 5.545822102425876, "grad_norm": 3.46875, "learning_rate": 0.00016050010680192765, "loss": 4.68, "step": 12345 }, { "epoch": 5.54806828391734, "grad_norm": 3.546875, "learning_rate": 0.00016039549826003577, "loss": 4.5647, "step": 12350 }, { "epoch": 5.550314465408805, "grad_norm": 3.65625, "learning_rate": 0.0001602908924859999, "loss": 4.5273, "step": 12355 }, { "epoch": 5.55256064690027, "grad_norm": 4.15625, "learning_rate": 0.0001601862895427006, "loss": 4.5393, "step": 12360 }, { "epoch": 5.554806828391734, "grad_norm": 3.484375, "learning_rate": 0.00016008168949301676, "loss": 4.5627, "step": 12365 }, { "epoch": 5.557053009883199, "grad_norm": 3.34375, "learning_rate": 0.00015997709239982553, "loss": 4.6177, "step": 12370 }, { "epoch": 5.559299191374663, "grad_norm": 3.75, "learning_rate": 0.00015987249832600218, "loss": 4.6861, "step": 12375 }, { "epoch": 5.561545372866128, "grad_norm": 3.515625, "learning_rate": 0.00015976790733442036, "loss": 4.6145, "step": 12380 }, { "epoch": 5.563791554357592, "grad_norm": 3.46875, "learning_rate": 0.00015966331948795166, "loss": 4.5544, "step": 12385 }, { "epoch": 5.566037735849057, "grad_norm": 3.46875, "learning_rate": 0.00015955873484946597, "loss": 4.6375, "step": 12390 }, { "epoch": 5.568283917340521, "grad_norm": 3.578125, "learning_rate": 0.00015945415348183112, "loss": 4.5971, "step": 12395 }, { "epoch": 5.570530098831986, "grad_norm": 3.625, "learning_rate": 0.00015934957544791302, "loss": 4.575, "step": 12400 }, { "epoch": 5.57277628032345, "grad_norm": 3.578125, "learning_rate": 0.0001592450008105755, "loss": 4.6174, "step": 12405 }, { "epoch": 5.575022461814915, "grad_norm": 3.515625, "learning_rate": 0.00015914042963268053, "loss": 4.5955, "step": 12410 }, { "epoch": 5.577268643306379, "grad_norm": 3.75, "learning_rate": 0.00015903586197708788, "loss": 4.5669, "step": 12415 }, { "epoch": 5.579514824797844, "grad_norm": 3.671875, "learning_rate": 0.00015893129790665511, "loss": 4.5646, "step": 12420 }, { "epoch": 5.581761006289308, "grad_norm": 3.5625, "learning_rate": 0.00015882673748423784, "loss": 4.6166, "step": 12425 }, { "epoch": 5.584007187780773, "grad_norm": 3.734375, "learning_rate": 0.00015872218077268933, "loss": 4.5632, "step": 12430 }, { "epoch": 5.586253369272237, "grad_norm": 3.53125, "learning_rate": 0.00015861762783486063, "loss": 4.568, "step": 12435 }, { "epoch": 5.588499550763702, "grad_norm": 3.421875, "learning_rate": 0.0001585130787336006, "loss": 4.5876, "step": 12440 }, { "epoch": 5.590745732255166, "grad_norm": 3.625, "learning_rate": 0.0001584085335317558, "loss": 4.5641, "step": 12445 }, { "epoch": 5.592991913746631, "grad_norm": 3.609375, "learning_rate": 0.0001583039922921703, "loss": 4.5242, "step": 12450 }, { "epoch": 5.595238095238095, "grad_norm": 3.609375, "learning_rate": 0.0001581994550776859, "loss": 4.632, "step": 12455 }, { "epoch": 5.59748427672956, "grad_norm": 3.546875, "learning_rate": 0.000158094921951142, "loss": 4.5217, "step": 12460 }, { "epoch": 5.599730458221024, "grad_norm": 3.84375, "learning_rate": 0.00015799039297537544, "loss": 4.5639, "step": 12465 }, { "epoch": 5.601976639712489, "grad_norm": 3.609375, "learning_rate": 0.00015788586821322074, "loss": 4.5401, "step": 12470 }, { "epoch": 5.604222821203953, "grad_norm": 3.5625, "learning_rate": 0.00015778134772750972, "loss": 4.5897, "step": 12475 }, { "epoch": 5.606469002695418, "grad_norm": 3.421875, "learning_rate": 0.00015767683158107165, "loss": 4.5933, "step": 12480 }, { "epoch": 5.608715184186883, "grad_norm": 3.53125, "learning_rate": 0.00015757231983673327, "loss": 4.5536, "step": 12485 }, { "epoch": 5.610961365678347, "grad_norm": 3.4375, "learning_rate": 0.00015746781255731863, "loss": 4.6235, "step": 12490 }, { "epoch": 5.613207547169811, "grad_norm": 3.8125, "learning_rate": 0.0001573633098056491, "loss": 4.6643, "step": 12495 }, { "epoch": 5.615453728661276, "grad_norm": 3.5, "learning_rate": 0.0001572588116445433, "loss": 4.5779, "step": 12500 }, { "epoch": 5.617699910152741, "grad_norm": 3.515625, "learning_rate": 0.0001571543181368171, "loss": 4.5442, "step": 12505 }, { "epoch": 5.619946091644205, "grad_norm": 3.703125, "learning_rate": 0.00015704982934528367, "loss": 4.5921, "step": 12510 }, { "epoch": 5.62219227313567, "grad_norm": 3.421875, "learning_rate": 0.00015694534533275325, "loss": 4.6018, "step": 12515 }, { "epoch": 5.624438454627134, "grad_norm": 3.515625, "learning_rate": 0.00015684086616203313, "loss": 4.6134, "step": 12520 }, { "epoch": 5.626684636118599, "grad_norm": 3.71875, "learning_rate": 0.00015673639189592788, "loss": 4.571, "step": 12525 }, { "epoch": 5.628930817610063, "grad_norm": 3.75, "learning_rate": 0.00015663192259723904, "loss": 4.5947, "step": 12530 }, { "epoch": 5.631176999101528, "grad_norm": 3.71875, "learning_rate": 0.00015652745832876502, "loss": 4.6523, "step": 12535 }, { "epoch": 5.633423180592992, "grad_norm": 3.75, "learning_rate": 0.0001564229991533014, "loss": 4.6599, "step": 12540 }, { "epoch": 5.635669362084457, "grad_norm": 3.546875, "learning_rate": 0.00015631854513364066, "loss": 4.5874, "step": 12545 }, { "epoch": 5.637915543575921, "grad_norm": 3.609375, "learning_rate": 0.00015621409633257216, "loss": 4.563, "step": 12550 }, { "epoch": 5.640161725067386, "grad_norm": 3.40625, "learning_rate": 0.00015610965281288205, "loss": 4.5769, "step": 12555 }, { "epoch": 5.64240790655885, "grad_norm": 3.703125, "learning_rate": 0.00015600521463735346, "loss": 4.5632, "step": 12560 }, { "epoch": 5.644654088050315, "grad_norm": 3.421875, "learning_rate": 0.00015590078186876612, "loss": 4.5776, "step": 12565 }, { "epoch": 5.646900269541779, "grad_norm": 3.421875, "learning_rate": 0.0001557963545698967, "loss": 4.5644, "step": 12570 }, { "epoch": 5.649146451033244, "grad_norm": 3.546875, "learning_rate": 0.00015569193280351844, "loss": 4.6529, "step": 12575 }, { "epoch": 5.651392632524708, "grad_norm": 3.75, "learning_rate": 0.00015558751663240127, "loss": 4.6086, "step": 12580 }, { "epoch": 5.653638814016173, "grad_norm": 3.34375, "learning_rate": 0.0001554831061193119, "loss": 4.5621, "step": 12585 }, { "epoch": 5.655884995507637, "grad_norm": 3.640625, "learning_rate": 0.0001553787013270134, "loss": 4.5809, "step": 12590 }, { "epoch": 5.658131176999102, "grad_norm": 3.5, "learning_rate": 0.00015527430231826564, "loss": 4.5426, "step": 12595 }, { "epoch": 5.660377358490566, "grad_norm": 3.4375, "learning_rate": 0.00015516990915582487, "loss": 4.5945, "step": 12600 }, { "epoch": 5.662623539982031, "grad_norm": 3.71875, "learning_rate": 0.00015506552190244386, "loss": 4.6063, "step": 12605 }, { "epoch": 5.6648697214734955, "grad_norm": 3.453125, "learning_rate": 0.00015496114062087175, "loss": 4.5035, "step": 12610 }, { "epoch": 5.66711590296496, "grad_norm": 3.546875, "learning_rate": 0.0001548567653738543, "loss": 4.5661, "step": 12615 }, { "epoch": 5.669362084456424, "grad_norm": 3.71875, "learning_rate": 0.00015475239622413344, "loss": 4.6367, "step": 12620 }, { "epoch": 5.671608265947889, "grad_norm": 3.5, "learning_rate": 0.00015464803323444754, "loss": 4.5808, "step": 12625 }, { "epoch": 5.6738544474393535, "grad_norm": 3.546875, "learning_rate": 0.0001545436764675312, "loss": 4.6723, "step": 12630 }, { "epoch": 5.676100628930818, "grad_norm": 3.5625, "learning_rate": 0.0001544393259861153, "loss": 4.5973, "step": 12635 }, { "epoch": 5.678346810422282, "grad_norm": 3.796875, "learning_rate": 0.00015433498185292695, "loss": 4.4989, "step": 12640 }, { "epoch": 5.680592991913747, "grad_norm": 3.609375, "learning_rate": 0.00015423064413068953, "loss": 4.5821, "step": 12645 }, { "epoch": 5.6828391734052115, "grad_norm": 3.640625, "learning_rate": 0.00015412631288212243, "loss": 4.5362, "step": 12650 }, { "epoch": 5.6850853548966755, "grad_norm": 3.671875, "learning_rate": 0.0001540219881699412, "loss": 4.5634, "step": 12655 }, { "epoch": 5.6873315363881405, "grad_norm": 3.515625, "learning_rate": 0.00015391767005685744, "loss": 4.5416, "step": 12660 }, { "epoch": 5.6895777178796045, "grad_norm": 3.71875, "learning_rate": 0.0001538133586055788, "loss": 4.4939, "step": 12665 }, { "epoch": 5.6918238993710695, "grad_norm": 3.796875, "learning_rate": 0.00015370905387880905, "loss": 4.6136, "step": 12670 }, { "epoch": 5.6940700808625335, "grad_norm": 3.53125, "learning_rate": 0.00015360475593924764, "loss": 4.6813, "step": 12675 }, { "epoch": 5.6963162623539985, "grad_norm": 3.484375, "learning_rate": 0.00015350046484959023, "loss": 4.6002, "step": 12680 }, { "epoch": 5.6985624438454625, "grad_norm": 3.78125, "learning_rate": 0.0001533961806725282, "loss": 4.5868, "step": 12685 }, { "epoch": 5.7008086253369274, "grad_norm": 3.703125, "learning_rate": 0.00015329190347074871, "loss": 4.561, "step": 12690 }, { "epoch": 5.7030548068283915, "grad_norm": 3.453125, "learning_rate": 0.0001531876333069349, "loss": 4.5557, "step": 12695 }, { "epoch": 5.705300988319856, "grad_norm": 3.53125, "learning_rate": 0.00015308337024376564, "loss": 4.5689, "step": 12700 }, { "epoch": 5.7075471698113205, "grad_norm": 3.53125, "learning_rate": 0.0001529791143439155, "loss": 4.5652, "step": 12705 }, { "epoch": 5.709793351302785, "grad_norm": 3.59375, "learning_rate": 0.0001528748656700546, "loss": 4.6094, "step": 12710 }, { "epoch": 5.7120395327942495, "grad_norm": 3.65625, "learning_rate": 0.00015277062428484898, "loss": 4.6139, "step": 12715 }, { "epoch": 5.714285714285714, "grad_norm": 3.53125, "learning_rate": 0.0001526663902509602, "loss": 4.5639, "step": 12720 }, { "epoch": 5.7165318957771785, "grad_norm": 3.609375, "learning_rate": 0.00015256216363104526, "loss": 4.5862, "step": 12725 }, { "epoch": 5.718778077268643, "grad_norm": 3.46875, "learning_rate": 0.0001524579444877569, "loss": 4.5733, "step": 12730 }, { "epoch": 5.7210242587601075, "grad_norm": 3.9375, "learning_rate": 0.00015235373288374329, "loss": 4.5285, "step": 12735 }, { "epoch": 5.723270440251572, "grad_norm": 3.59375, "learning_rate": 0.000152249528881648, "loss": 4.6101, "step": 12740 }, { "epoch": 5.725516621743036, "grad_norm": 3.53125, "learning_rate": 0.00015214533254411016, "loss": 4.531, "step": 12745 }, { "epoch": 5.727762803234501, "grad_norm": 3.796875, "learning_rate": 0.0001520411439337642, "loss": 4.63, "step": 12750 }, { "epoch": 5.730008984725966, "grad_norm": 3.609375, "learning_rate": 0.00015193696311323988, "loss": 4.5898, "step": 12755 }, { "epoch": 5.73225516621743, "grad_norm": 3.53125, "learning_rate": 0.0001518327901451624, "loss": 4.6105, "step": 12760 }, { "epoch": 5.734501347708894, "grad_norm": 3.578125, "learning_rate": 0.00015172862509215215, "loss": 4.5676, "step": 12765 }, { "epoch": 5.736747529200359, "grad_norm": 3.734375, "learning_rate": 0.00015162446801682476, "loss": 4.5878, "step": 12770 }, { "epoch": 5.738993710691824, "grad_norm": 3.359375, "learning_rate": 0.0001515203189817911, "loss": 4.6522, "step": 12775 }, { "epoch": 5.741239892183288, "grad_norm": 3.484375, "learning_rate": 0.00015141617804965716, "loss": 4.5824, "step": 12780 }, { "epoch": 5.743486073674753, "grad_norm": 3.453125, "learning_rate": 0.00015131204528302412, "loss": 4.6432, "step": 12785 }, { "epoch": 5.745732255166217, "grad_norm": 3.75, "learning_rate": 0.0001512079207444882, "loss": 4.5879, "step": 12790 }, { "epoch": 5.747978436657682, "grad_norm": 3.390625, "learning_rate": 0.00015110380449664075, "loss": 4.574, "step": 12795 }, { "epoch": 5.750224618149146, "grad_norm": 4.34375, "learning_rate": 0.00015099969660206804, "loss": 4.6326, "step": 12800 }, { "epoch": 5.752470799640611, "grad_norm": 3.671875, "learning_rate": 0.00015089559712335135, "loss": 4.5741, "step": 12805 }, { "epoch": 5.754716981132075, "grad_norm": 3.546875, "learning_rate": 0.00015079150612306693, "loss": 4.6053, "step": 12810 }, { "epoch": 5.75696316262354, "grad_norm": 3.609375, "learning_rate": 0.00015068742366378587, "loss": 4.5251, "step": 12815 }, { "epoch": 5.759209344115004, "grad_norm": 3.8125, "learning_rate": 0.00015058334980807425, "loss": 4.6518, "step": 12820 }, { "epoch": 5.761455525606469, "grad_norm": 3.609375, "learning_rate": 0.00015047928461849286, "loss": 4.6316, "step": 12825 }, { "epoch": 5.763701707097933, "grad_norm": 3.546875, "learning_rate": 0.00015037522815759732, "loss": 4.5181, "step": 12830 }, { "epoch": 5.765947888589398, "grad_norm": 3.40625, "learning_rate": 0.000150271180487938, "loss": 4.5048, "step": 12835 }, { "epoch": 5.768194070080862, "grad_norm": 3.8125, "learning_rate": 0.00015016714167206, "loss": 4.6388, "step": 12840 }, { "epoch": 5.770440251572327, "grad_norm": 3.59375, "learning_rate": 0.0001500631117725031, "loss": 4.594, "step": 12845 }, { "epoch": 5.772686433063791, "grad_norm": 3.40625, "learning_rate": 0.00014995909085180163, "loss": 4.5535, "step": 12850 }, { "epoch": 5.774932614555256, "grad_norm": 3.546875, "learning_rate": 0.00014985507897248465, "loss": 4.566, "step": 12855 }, { "epoch": 5.77717879604672, "grad_norm": 3.6875, "learning_rate": 0.00014975107619707577, "loss": 4.5553, "step": 12860 }, { "epoch": 5.779424977538185, "grad_norm": 3.765625, "learning_rate": 0.000149647082588093, "loss": 4.5905, "step": 12865 }, { "epoch": 5.781671159029649, "grad_norm": 3.40625, "learning_rate": 0.000149543098208049, "loss": 4.5787, "step": 12870 }, { "epoch": 5.783917340521114, "grad_norm": 3.546875, "learning_rate": 0.00014943912311945085, "loss": 4.6191, "step": 12875 }, { "epoch": 5.786163522012579, "grad_norm": 3.578125, "learning_rate": 0.0001493351573847999, "loss": 4.6162, "step": 12880 }, { "epoch": 5.788409703504043, "grad_norm": 3.484375, "learning_rate": 0.00014923120106659205, "loss": 4.5889, "step": 12885 }, { "epoch": 5.790655884995507, "grad_norm": 3.40625, "learning_rate": 0.00014912725422731749, "loss": 4.6641, "step": 12890 }, { "epoch": 5.792902066486972, "grad_norm": 3.78125, "learning_rate": 0.00014902331692946065, "loss": 4.544, "step": 12895 }, { "epoch": 5.795148247978437, "grad_norm": 3.75, "learning_rate": 0.00014891938923550032, "loss": 4.4786, "step": 12900 }, { "epoch": 5.797394429469901, "grad_norm": 3.59375, "learning_rate": 0.00014881547120790945, "loss": 4.5937, "step": 12905 }, { "epoch": 5.799640610961365, "grad_norm": 3.9375, "learning_rate": 0.00014871156290915515, "loss": 4.6043, "step": 12910 }, { "epoch": 5.80188679245283, "grad_norm": 3.515625, "learning_rate": 0.00014860766440169881, "loss": 4.5986, "step": 12915 }, { "epoch": 5.804132973944295, "grad_norm": 3.6875, "learning_rate": 0.0001485037757479958, "loss": 4.5605, "step": 12920 }, { "epoch": 5.806379155435759, "grad_norm": 3.46875, "learning_rate": 0.00014839989701049563, "loss": 4.6339, "step": 12925 }, { "epoch": 5.808625336927224, "grad_norm": 3.421875, "learning_rate": 0.00014829602825164188, "loss": 4.5654, "step": 12930 }, { "epoch": 5.810871518418688, "grad_norm": 3.578125, "learning_rate": 0.000148192169533872, "loss": 4.5828, "step": 12935 }, { "epoch": 5.813117699910153, "grad_norm": 3.8125, "learning_rate": 0.0001480883209196176, "loss": 4.5773, "step": 12940 }, { "epoch": 5.815363881401617, "grad_norm": 3.53125, "learning_rate": 0.00014798448247130405, "loss": 4.6143, "step": 12945 }, { "epoch": 5.817610062893082, "grad_norm": 3.65625, "learning_rate": 0.0001478806542513507, "loss": 4.5957, "step": 12950 }, { "epoch": 5.819856244384546, "grad_norm": 3.640625, "learning_rate": 0.00014777683632217069, "loss": 4.6125, "step": 12955 }, { "epoch": 5.822102425876011, "grad_norm": 3.390625, "learning_rate": 0.00014767302874617096, "loss": 4.5948, "step": 12960 }, { "epoch": 5.824348607367475, "grad_norm": 3.546875, "learning_rate": 0.00014756923158575243, "loss": 4.5827, "step": 12965 }, { "epoch": 5.82659478885894, "grad_norm": 3.5625, "learning_rate": 0.00014746544490330945, "loss": 4.5534, "step": 12970 }, { "epoch": 5.828840970350404, "grad_norm": 3.453125, "learning_rate": 0.0001473616687612303, "loss": 4.5907, "step": 12975 }, { "epoch": 5.831087151841869, "grad_norm": 3.46875, "learning_rate": 0.00014725790322189688, "loss": 4.6352, "step": 12980 }, { "epoch": 5.833333333333333, "grad_norm": 3.59375, "learning_rate": 0.0001471541483476846, "loss": 4.5848, "step": 12985 }, { "epoch": 5.835579514824798, "grad_norm": 3.53125, "learning_rate": 0.00014705040420096252, "loss": 4.5885, "step": 12990 }, { "epoch": 5.837825696316262, "grad_norm": 3.5625, "learning_rate": 0.0001469466708440934, "loss": 4.6434, "step": 12995 }, { "epoch": 5.840071877807727, "grad_norm": 3.734375, "learning_rate": 0.0001468429483394333, "loss": 4.5689, "step": 13000 }, { "epoch": 5.840071877807727, "eval_loss": 4.79840087890625, "eval_runtime": 16.0175, "eval_samples_per_second": 1936.199, "eval_steps_per_second": 242.048, "step": 13000 }, { "epoch": 5.842318059299191, "grad_norm": 3.453125, "learning_rate": 0.00014673923674933192, "loss": 4.6216, "step": 13005 }, { "epoch": 5.844564240790656, "grad_norm": 3.578125, "learning_rate": 0.00014663553613613217, "loss": 4.5931, "step": 13010 }, { "epoch": 5.84681042228212, "grad_norm": 3.5625, "learning_rate": 0.00014653184656217066, "loss": 4.6131, "step": 13015 }, { "epoch": 5.849056603773585, "grad_norm": 3.90625, "learning_rate": 0.0001464281680897772, "loss": 4.605, "step": 13020 }, { "epoch": 5.85130278526505, "grad_norm": 3.53125, "learning_rate": 0.0001463245007812749, "loss": 4.5923, "step": 13025 }, { "epoch": 5.853548966756514, "grad_norm": 3.53125, "learning_rate": 0.0001462208446989802, "loss": 4.6215, "step": 13030 }, { "epoch": 5.855795148247978, "grad_norm": 3.546875, "learning_rate": 0.00014611719990520285, "loss": 4.6047, "step": 13035 }, { "epoch": 5.858041329739443, "grad_norm": 3.578125, "learning_rate": 0.0001460135664622457, "loss": 4.6222, "step": 13040 }, { "epoch": 5.860287511230908, "grad_norm": 3.4375, "learning_rate": 0.00014590994443240487, "loss": 4.5746, "step": 13045 }, { "epoch": 5.862533692722372, "grad_norm": 3.421875, "learning_rate": 0.00014580633387796966, "loss": 4.5708, "step": 13050 }, { "epoch": 5.864779874213837, "grad_norm": 3.515625, "learning_rate": 0.00014570273486122227, "loss": 4.5993, "step": 13055 }, { "epoch": 5.867026055705301, "grad_norm": 3.609375, "learning_rate": 0.0001455991474444382, "loss": 4.5421, "step": 13060 }, { "epoch": 5.869272237196766, "grad_norm": 3.5625, "learning_rate": 0.0001454955716898858, "loss": 4.5749, "step": 13065 }, { "epoch": 5.87151841868823, "grad_norm": 3.390625, "learning_rate": 0.00014539200765982646, "loss": 4.6254, "step": 13070 }, { "epoch": 5.873764600179695, "grad_norm": 3.421875, "learning_rate": 0.00014528845541651462, "loss": 4.5774, "step": 13075 }, { "epoch": 5.876010781671159, "grad_norm": 3.5625, "learning_rate": 0.00014518491502219752, "loss": 4.58, "step": 13080 }, { "epoch": 5.878256963162624, "grad_norm": 3.75, "learning_rate": 0.00014508138653911536, "loss": 4.5937, "step": 13085 }, { "epoch": 5.880503144654088, "grad_norm": 3.5, "learning_rate": 0.00014497787002950107, "loss": 4.6329, "step": 13090 }, { "epoch": 5.882749326145553, "grad_norm": 3.46875, "learning_rate": 0.00014487436555558046, "loss": 4.6449, "step": 13095 }, { "epoch": 5.884995507637017, "grad_norm": 3.46875, "learning_rate": 0.00014477087317957212, "loss": 4.6336, "step": 13100 }, { "epoch": 5.887241689128482, "grad_norm": 3.3125, "learning_rate": 0.00014466739296368732, "loss": 4.5848, "step": 13105 }, { "epoch": 5.889487870619946, "grad_norm": 3.40625, "learning_rate": 0.00014456392497013006, "loss": 4.5894, "step": 13110 }, { "epoch": 5.891734052111411, "grad_norm": 3.71875, "learning_rate": 0.00014446046926109695, "loss": 4.547, "step": 13115 }, { "epoch": 5.893980233602875, "grad_norm": 3.4375, "learning_rate": 0.00014435702589877725, "loss": 4.5287, "step": 13120 }, { "epoch": 5.89622641509434, "grad_norm": 3.609375, "learning_rate": 0.00014425359494535275, "loss": 4.5491, "step": 13125 }, { "epoch": 5.898472596585804, "grad_norm": 3.578125, "learning_rate": 0.00014415017646299792, "loss": 4.6074, "step": 13130 }, { "epoch": 5.900718778077269, "grad_norm": 3.59375, "learning_rate": 0.00014404677051387948, "loss": 4.5964, "step": 13135 }, { "epoch": 5.902964959568733, "grad_norm": 3.453125, "learning_rate": 0.00014394337716015692, "loss": 4.5608, "step": 13140 }, { "epoch": 5.905211141060198, "grad_norm": 3.46875, "learning_rate": 0.00014383999646398193, "loss": 4.5739, "step": 13145 }, { "epoch": 5.907457322551663, "grad_norm": 3.75, "learning_rate": 0.00014373662848749866, "loss": 4.6323, "step": 13150 }, { "epoch": 5.909703504043127, "grad_norm": 3.703125, "learning_rate": 0.00014363327329284362, "loss": 4.5731, "step": 13155 }, { "epoch": 5.911949685534591, "grad_norm": 3.390625, "learning_rate": 0.00014352993094214573, "loss": 4.5454, "step": 13160 }, { "epoch": 5.914195867026056, "grad_norm": 3.578125, "learning_rate": 0.00014342660149752596, "loss": 4.5988, "step": 13165 }, { "epoch": 5.916442048517521, "grad_norm": 3.703125, "learning_rate": 0.00014332328502109773, "loss": 4.5969, "step": 13170 }, { "epoch": 5.918688230008985, "grad_norm": 3.75, "learning_rate": 0.00014321998157496656, "loss": 4.5779, "step": 13175 }, { "epoch": 5.920934411500449, "grad_norm": 3.546875, "learning_rate": 0.00014311669122123023, "loss": 4.6475, "step": 13180 }, { "epoch": 5.923180592991914, "grad_norm": 3.59375, "learning_rate": 0.0001430134140219785, "loss": 4.5966, "step": 13185 }, { "epoch": 5.925426774483379, "grad_norm": 3.609375, "learning_rate": 0.00014291015003929343, "loss": 4.5979, "step": 13190 }, { "epoch": 5.927672955974843, "grad_norm": 3.53125, "learning_rate": 0.00014280689933524892, "loss": 4.5359, "step": 13195 }, { "epoch": 5.929919137466308, "grad_norm": 3.578125, "learning_rate": 0.00014270366197191104, "loss": 4.573, "step": 13200 }, { "epoch": 5.932165318957772, "grad_norm": 3.609375, "learning_rate": 0.00014260043801133773, "loss": 4.5386, "step": 13205 }, { "epoch": 5.934411500449237, "grad_norm": 3.578125, "learning_rate": 0.00014249722751557905, "loss": 4.5472, "step": 13210 }, { "epoch": 5.936657681940701, "grad_norm": 3.578125, "learning_rate": 0.00014239403054667668, "loss": 4.6398, "step": 13215 }, { "epoch": 5.938903863432166, "grad_norm": 3.4375, "learning_rate": 0.00014229084716666445, "loss": 4.5737, "step": 13220 }, { "epoch": 5.94115004492363, "grad_norm": 3.8125, "learning_rate": 0.0001421876774375679, "loss": 4.6144, "step": 13225 }, { "epoch": 5.943396226415095, "grad_norm": 3.5, "learning_rate": 0.00014208452142140435, "loss": 4.576, "step": 13230 }, { "epoch": 5.945642407906559, "grad_norm": 3.734375, "learning_rate": 0.00014198137918018287, "loss": 4.5736, "step": 13235 }, { "epoch": 5.947888589398024, "grad_norm": 3.640625, "learning_rate": 0.00014187825077590431, "loss": 4.5622, "step": 13240 }, { "epoch": 5.950134770889488, "grad_norm": 3.546875, "learning_rate": 0.00014177513627056115, "loss": 4.5501, "step": 13245 }, { "epoch": 5.9523809523809526, "grad_norm": 3.625, "learning_rate": 0.00014167203572613756, "loss": 4.5601, "step": 13250 }, { "epoch": 5.954627133872417, "grad_norm": 3.59375, "learning_rate": 0.00014156894920460932, "loss": 4.5828, "step": 13255 }, { "epoch": 5.9568733153638815, "grad_norm": 3.78125, "learning_rate": 0.00014146587676794366, "loss": 4.5764, "step": 13260 }, { "epoch": 5.959119496855346, "grad_norm": 3.5625, "learning_rate": 0.00014136281847809952, "loss": 4.5667, "step": 13265 }, { "epoch": 5.9613656783468105, "grad_norm": 3.390625, "learning_rate": 0.00014125977439702724, "loss": 4.5588, "step": 13270 }, { "epoch": 5.963611859838275, "grad_norm": 3.65625, "learning_rate": 0.00014115674458666858, "loss": 4.5707, "step": 13275 }, { "epoch": 5.9658580413297395, "grad_norm": 3.625, "learning_rate": 0.0001410537291089568, "loss": 4.6072, "step": 13280 }, { "epoch": 5.968104222821204, "grad_norm": 3.515625, "learning_rate": 0.00014095072802581656, "loss": 4.5488, "step": 13285 }, { "epoch": 5.9703504043126685, "grad_norm": 3.65625, "learning_rate": 0.00014084774139916378, "loss": 4.5971, "step": 13290 }, { "epoch": 5.972596585804133, "grad_norm": 3.5, "learning_rate": 0.0001407447692909057, "loss": 4.6285, "step": 13295 }, { "epoch": 5.9748427672955975, "grad_norm": 3.671875, "learning_rate": 0.00014064181176294096, "loss": 4.5695, "step": 13300 }, { "epoch": 5.9770889487870615, "grad_norm": 3.625, "learning_rate": 0.0001405388688771593, "loss": 4.6901, "step": 13305 }, { "epoch": 5.9793351302785265, "grad_norm": 3.703125, "learning_rate": 0.0001404359406954416, "loss": 4.5687, "step": 13310 }, { "epoch": 5.981581311769991, "grad_norm": 3.9375, "learning_rate": 0.0001403330272796602, "loss": 4.6383, "step": 13315 }, { "epoch": 5.9838274932614555, "grad_norm": 3.4375, "learning_rate": 0.00014023012869167828, "loss": 4.5998, "step": 13320 }, { "epoch": 5.98607367475292, "grad_norm": 3.5, "learning_rate": 0.00014012724499335013, "loss": 4.6224, "step": 13325 }, { "epoch": 5.9883198562443845, "grad_norm": 3.703125, "learning_rate": 0.00014002437624652118, "loss": 4.589, "step": 13330 }, { "epoch": 5.990566037735849, "grad_norm": 3.671875, "learning_rate": 0.00013992152251302784, "loss": 4.5993, "step": 13335 }, { "epoch": 5.992812219227313, "grad_norm": 3.53125, "learning_rate": 0.00013981868385469756, "loss": 4.6194, "step": 13340 }, { "epoch": 5.995058400718778, "grad_norm": 3.8125, "learning_rate": 0.00013971586033334864, "loss": 4.6192, "step": 13345 }, { "epoch": 5.997304582210242, "grad_norm": 3.96875, "learning_rate": 0.00013961305201079025, "loss": 4.6668, "step": 13350 }, { "epoch": 5.999550763701707, "grad_norm": 3.375, "learning_rate": 0.00013951025894882256, "loss": 4.5731, "step": 13355 }, { "epoch": 6.001796945193171, "grad_norm": 3.796875, "learning_rate": 0.00013940748120923641, "loss": 4.5212, "step": 13360 }, { "epoch": 6.004043126684636, "grad_norm": 3.703125, "learning_rate": 0.0001393047188538136, "loss": 4.521, "step": 13365 }, { "epoch": 6.0062893081761, "grad_norm": 3.640625, "learning_rate": 0.00013920197194432657, "loss": 4.5062, "step": 13370 }, { "epoch": 6.008535489667565, "grad_norm": 3.625, "learning_rate": 0.00013909924054253845, "loss": 4.4692, "step": 13375 }, { "epoch": 6.010781671159029, "grad_norm": 3.765625, "learning_rate": 0.00013899652471020308, "loss": 4.5304, "step": 13380 }, { "epoch": 6.013027852650494, "grad_norm": 3.734375, "learning_rate": 0.00013889382450906507, "loss": 4.5064, "step": 13385 }, { "epoch": 6.015274034141958, "grad_norm": 3.578125, "learning_rate": 0.00013879114000085933, "loss": 4.5232, "step": 13390 }, { "epoch": 6.017520215633423, "grad_norm": 3.65625, "learning_rate": 0.0001386884712473117, "loss": 4.5745, "step": 13395 }, { "epoch": 6.019766397124887, "grad_norm": 3.890625, "learning_rate": 0.0001385858183101383, "loss": 4.5067, "step": 13400 }, { "epoch": 6.022012578616352, "grad_norm": 3.546875, "learning_rate": 0.0001384831812510458, "loss": 4.5416, "step": 13405 }, { "epoch": 6.024258760107816, "grad_norm": 3.796875, "learning_rate": 0.00013838056013173143, "loss": 4.5799, "step": 13410 }, { "epoch": 6.026504941599281, "grad_norm": 3.46875, "learning_rate": 0.00013827795501388264, "loss": 4.5485, "step": 13415 }, { "epoch": 6.028751123090745, "grad_norm": 3.90625, "learning_rate": 0.00013817536595917742, "loss": 4.5435, "step": 13420 }, { "epoch": 6.03099730458221, "grad_norm": 3.8125, "learning_rate": 0.00013807279302928405, "loss": 4.5413, "step": 13425 }, { "epoch": 6.033243486073674, "grad_norm": 3.734375, "learning_rate": 0.0001379702362858611, "loss": 4.5275, "step": 13430 }, { "epoch": 6.035489667565139, "grad_norm": 3.734375, "learning_rate": 0.00013786769579055753, "loss": 4.5684, "step": 13435 }, { "epoch": 6.037735849056604, "grad_norm": 3.71875, "learning_rate": 0.00013776517160501238, "loss": 4.5337, "step": 13440 }, { "epoch": 6.039982030548068, "grad_norm": 3.4375, "learning_rate": 0.00013766266379085492, "loss": 4.5921, "step": 13445 }, { "epoch": 6.042228212039533, "grad_norm": 3.6875, "learning_rate": 0.00013756017240970457, "loss": 4.5239, "step": 13450 }, { "epoch": 6.044474393530997, "grad_norm": 3.453125, "learning_rate": 0.00013745769752317093, "loss": 4.5291, "step": 13455 }, { "epoch": 6.046720575022462, "grad_norm": 3.6875, "learning_rate": 0.0001373552391928537, "loss": 4.5423, "step": 13460 }, { "epoch": 6.048966756513926, "grad_norm": 3.84375, "learning_rate": 0.00013725279748034257, "loss": 4.5376, "step": 13465 }, { "epoch": 6.051212938005391, "grad_norm": 3.59375, "learning_rate": 0.00013715037244721725, "loss": 4.5239, "step": 13470 }, { "epoch": 6.053459119496855, "grad_norm": 3.578125, "learning_rate": 0.0001370479641550474, "loss": 4.557, "step": 13475 }, { "epoch": 6.05570530098832, "grad_norm": 3.75, "learning_rate": 0.0001369455726653927, "loss": 4.5622, "step": 13480 }, { "epoch": 6.057951482479784, "grad_norm": 3.515625, "learning_rate": 0.00013684319803980262, "loss": 4.5015, "step": 13485 }, { "epoch": 6.060197663971249, "grad_norm": 3.59375, "learning_rate": 0.00013674084033981655, "loss": 4.4932, "step": 13490 }, { "epoch": 6.062443845462713, "grad_norm": 3.578125, "learning_rate": 0.00013663849962696379, "loss": 4.551, "step": 13495 }, { "epoch": 6.064690026954178, "grad_norm": 4.53125, "learning_rate": 0.0001365361759627632, "loss": 4.5254, "step": 13500 }, { "epoch": 6.066936208445642, "grad_norm": 3.6875, "learning_rate": 0.00013643386940872363, "loss": 4.5218, "step": 13505 }, { "epoch": 6.069182389937107, "grad_norm": 3.796875, "learning_rate": 0.00013633158002634356, "loss": 4.5726, "step": 13510 }, { "epoch": 6.071428571428571, "grad_norm": 3.59375, "learning_rate": 0.0001362293078771111, "loss": 4.4735, "step": 13515 }, { "epoch": 6.073674752920036, "grad_norm": 3.703125, "learning_rate": 0.00013612705302250405, "loss": 4.5454, "step": 13520 }, { "epoch": 6.0759209344115, "grad_norm": 3.765625, "learning_rate": 0.00013602481552398983, "loss": 4.5265, "step": 13525 }, { "epoch": 6.078167115902965, "grad_norm": 3.6875, "learning_rate": 0.0001359225954430253, "loss": 4.5326, "step": 13530 }, { "epoch": 6.080413297394429, "grad_norm": 3.78125, "learning_rate": 0.00013582039284105706, "loss": 4.5025, "step": 13535 }, { "epoch": 6.082659478885894, "grad_norm": 3.71875, "learning_rate": 0.00013571820777952105, "loss": 4.5118, "step": 13540 }, { "epoch": 6.084905660377358, "grad_norm": 3.703125, "learning_rate": 0.00013561604031984268, "loss": 4.4428, "step": 13545 }, { "epoch": 6.087151841868823, "grad_norm": 3.640625, "learning_rate": 0.0001355138905234369, "loss": 4.5108, "step": 13550 }, { "epoch": 6.089398023360287, "grad_norm": 3.6875, "learning_rate": 0.00013541175845170785, "loss": 4.4825, "step": 13555 }, { "epoch": 6.091644204851752, "grad_norm": 3.328125, "learning_rate": 0.00013530964416604913, "loss": 4.5154, "step": 13560 }, { "epoch": 6.093890386343216, "grad_norm": 3.65625, "learning_rate": 0.0001352075477278436, "loss": 4.5195, "step": 13565 }, { "epoch": 6.096136567834681, "grad_norm": 3.484375, "learning_rate": 0.00013510546919846358, "loss": 4.5567, "step": 13570 }, { "epoch": 6.098382749326145, "grad_norm": 3.34375, "learning_rate": 0.0001350034086392703, "loss": 4.5383, "step": 13575 }, { "epoch": 6.10062893081761, "grad_norm": 3.78125, "learning_rate": 0.00013490136611161448, "loss": 4.5397, "step": 13580 }, { "epoch": 6.102875112309075, "grad_norm": 3.65625, "learning_rate": 0.00013479934167683579, "loss": 4.4954, "step": 13585 }, { "epoch": 6.105121293800539, "grad_norm": 3.53125, "learning_rate": 0.00013469733539626315, "loss": 4.4926, "step": 13590 }, { "epoch": 6.107367475292004, "grad_norm": 3.796875, "learning_rate": 0.00013459534733121448, "loss": 4.5163, "step": 13595 }, { "epoch": 6.109613656783468, "grad_norm": 3.703125, "learning_rate": 0.00013449337754299688, "loss": 4.5654, "step": 13600 }, { "epoch": 6.111859838274933, "grad_norm": 3.65625, "learning_rate": 0.00013439142609290633, "loss": 4.5667, "step": 13605 }, { "epoch": 6.114106019766397, "grad_norm": 3.84375, "learning_rate": 0.00013428949304222787, "loss": 4.5255, "step": 13610 }, { "epoch": 6.116352201257862, "grad_norm": 3.78125, "learning_rate": 0.00013418757845223546, "loss": 4.5519, "step": 13615 }, { "epoch": 6.118598382749326, "grad_norm": 3.515625, "learning_rate": 0.00013408568238419186, "loss": 4.5788, "step": 13620 }, { "epoch": 6.120844564240791, "grad_norm": 3.4375, "learning_rate": 0.00013398380489934892, "loss": 4.5207, "step": 13625 }, { "epoch": 6.123090745732255, "grad_norm": 3.75, "learning_rate": 0.00013388194605894703, "loss": 4.4663, "step": 13630 }, { "epoch": 6.12533692722372, "grad_norm": 3.8125, "learning_rate": 0.00013378010592421575, "loss": 4.5232, "step": 13635 }, { "epoch": 6.127583108715184, "grad_norm": 3.875, "learning_rate": 0.00013367828455637296, "loss": 4.48, "step": 13640 }, { "epoch": 6.129829290206649, "grad_norm": 3.796875, "learning_rate": 0.00013357648201662556, "loss": 4.5684, "step": 13645 }, { "epoch": 6.132075471698113, "grad_norm": 3.609375, "learning_rate": 0.00013347469836616906, "loss": 4.5237, "step": 13650 }, { "epoch": 6.134321653189578, "grad_norm": 3.53125, "learning_rate": 0.00013337293366618759, "loss": 4.5358, "step": 13655 }, { "epoch": 6.136567834681042, "grad_norm": 3.609375, "learning_rate": 0.00013327118797785392, "loss": 4.4897, "step": 13660 }, { "epoch": 6.138814016172507, "grad_norm": 3.65625, "learning_rate": 0.00013316946136232932, "loss": 4.4809, "step": 13665 }, { "epoch": 6.141060197663971, "grad_norm": 3.953125, "learning_rate": 0.00013306775388076367, "loss": 4.4886, "step": 13670 }, { "epoch": 6.143306379155436, "grad_norm": 3.78125, "learning_rate": 0.00013296606559429536, "loss": 4.4976, "step": 13675 }, { "epoch": 6.1455525606469, "grad_norm": 3.65625, "learning_rate": 0.00013286439656405116, "loss": 4.4976, "step": 13680 }, { "epoch": 6.147798742138365, "grad_norm": 3.484375, "learning_rate": 0.00013276274685114636, "loss": 4.5344, "step": 13685 }, { "epoch": 6.150044923629829, "grad_norm": 3.78125, "learning_rate": 0.00013266111651668455, "loss": 4.5032, "step": 13690 }, { "epoch": 6.152291105121294, "grad_norm": 3.734375, "learning_rate": 0.00013255950562175774, "loss": 4.579, "step": 13695 }, { "epoch": 6.154537286612758, "grad_norm": 3.640625, "learning_rate": 0.00013245791422744616, "loss": 4.5537, "step": 13700 }, { "epoch": 6.156783468104223, "grad_norm": 3.859375, "learning_rate": 0.00013235634239481848, "loss": 4.493, "step": 13705 }, { "epoch": 6.159029649595688, "grad_norm": 3.671875, "learning_rate": 0.0001322547901849314, "loss": 4.5021, "step": 13710 }, { "epoch": 6.161275831087152, "grad_norm": 3.5625, "learning_rate": 0.00013215325765883004, "loss": 4.5263, "step": 13715 }, { "epoch": 6.163522012578617, "grad_norm": 3.828125, "learning_rate": 0.00013205174487754756, "loss": 4.5108, "step": 13720 }, { "epoch": 6.165768194070081, "grad_norm": 3.640625, "learning_rate": 0.00013195025190210525, "loss": 4.5393, "step": 13725 }, { "epoch": 6.168014375561546, "grad_norm": 3.78125, "learning_rate": 0.00013184877879351256, "loss": 4.4821, "step": 13730 }, { "epoch": 6.17026055705301, "grad_norm": 3.671875, "learning_rate": 0.0001317473256127669, "loss": 4.5444, "step": 13735 }, { "epoch": 6.172506738544475, "grad_norm": 3.8125, "learning_rate": 0.0001316458924208538, "loss": 4.507, "step": 13740 }, { "epoch": 6.174752920035939, "grad_norm": 3.5625, "learning_rate": 0.00013154447927874675, "loss": 4.5028, "step": 13745 }, { "epoch": 6.176999101527404, "grad_norm": 3.65625, "learning_rate": 0.00013144308624740713, "loss": 4.4272, "step": 13750 }, { "epoch": 6.179245283018868, "grad_norm": 3.484375, "learning_rate": 0.00013134171338778433, "loss": 4.5698, "step": 13755 }, { "epoch": 6.181491464510333, "grad_norm": 3.859375, "learning_rate": 0.0001312403607608155, "loss": 4.4958, "step": 13760 }, { "epoch": 6.183737646001797, "grad_norm": 3.625, "learning_rate": 0.0001311390284274257, "loss": 4.5115, "step": 13765 }, { "epoch": 6.185983827493262, "grad_norm": 3.546875, "learning_rate": 0.0001310377164485278, "loss": 4.5206, "step": 13770 }, { "epoch": 6.188230008984726, "grad_norm": 3.75, "learning_rate": 0.00013093642488502238, "loss": 4.5121, "step": 13775 }, { "epoch": 6.190476190476191, "grad_norm": 3.8125, "learning_rate": 0.00013083515379779784, "loss": 4.5207, "step": 13780 }, { "epoch": 6.192722371967655, "grad_norm": 3.6875, "learning_rate": 0.00013073390324773012, "loss": 4.5614, "step": 13785 }, { "epoch": 6.19496855345912, "grad_norm": 3.65625, "learning_rate": 0.00013063267329568295, "loss": 4.4603, "step": 13790 }, { "epoch": 6.197214734950584, "grad_norm": 3.65625, "learning_rate": 0.0001305314640025077, "loss": 4.5404, "step": 13795 }, { "epoch": 6.199460916442049, "grad_norm": 3.65625, "learning_rate": 0.00013043027542904308, "loss": 4.5545, "step": 13800 }, { "epoch": 6.201707097933513, "grad_norm": 3.765625, "learning_rate": 0.0001303291076361157, "loss": 4.6362, "step": 13805 }, { "epoch": 6.203953279424978, "grad_norm": 3.796875, "learning_rate": 0.0001302279606845394, "loss": 4.4728, "step": 13810 }, { "epoch": 6.206199460916442, "grad_norm": 3.734375, "learning_rate": 0.0001301268346351156, "loss": 4.5516, "step": 13815 }, { "epoch": 6.208445642407907, "grad_norm": 3.765625, "learning_rate": 0.00013002572954863315, "loss": 4.5923, "step": 13820 }, { "epoch": 6.210691823899371, "grad_norm": 3.875, "learning_rate": 0.00012992464548586833, "loss": 4.5001, "step": 13825 }, { "epoch": 6.212938005390836, "grad_norm": 3.828125, "learning_rate": 0.0001298235825075847, "loss": 4.4834, "step": 13830 }, { "epoch": 6.2151841868823, "grad_norm": 3.796875, "learning_rate": 0.00012972254067453322, "loss": 4.4975, "step": 13835 }, { "epoch": 6.217430368373765, "grad_norm": 4.0, "learning_rate": 0.00012962152004745208, "loss": 4.5328, "step": 13840 }, { "epoch": 6.219676549865229, "grad_norm": 3.90625, "learning_rate": 0.00012952052068706678, "loss": 4.4904, "step": 13845 }, { "epoch": 6.221922731356694, "grad_norm": 3.84375, "learning_rate": 0.00012941954265409004, "loss": 4.6397, "step": 13850 }, { "epoch": 6.2241689128481585, "grad_norm": 3.71875, "learning_rate": 0.0001293185860092216, "loss": 4.5542, "step": 13855 }, { "epoch": 6.226415094339623, "grad_norm": 3.703125, "learning_rate": 0.00012921765081314865, "loss": 4.5427, "step": 13860 }, { "epoch": 6.2286612758310875, "grad_norm": 3.9375, "learning_rate": 0.0001291167371265452, "loss": 4.5675, "step": 13865 }, { "epoch": 6.230907457322552, "grad_norm": 3.6875, "learning_rate": 0.00012901584501007248, "loss": 4.556, "step": 13870 }, { "epoch": 6.2331536388140165, "grad_norm": 4.0, "learning_rate": 0.0001289149745243787, "loss": 4.5741, "step": 13875 }, { "epoch": 6.235399820305481, "grad_norm": 3.484375, "learning_rate": 0.00012881412573009904, "loss": 4.5604, "step": 13880 }, { "epoch": 6.2376460017969455, "grad_norm": 3.4375, "learning_rate": 0.00012871329868785572, "loss": 4.5488, "step": 13885 }, { "epoch": 6.2398921832884096, "grad_norm": 3.8125, "learning_rate": 0.00012861249345825788, "loss": 4.5015, "step": 13890 }, { "epoch": 6.2421383647798745, "grad_norm": 3.71875, "learning_rate": 0.00012851171010190148, "loss": 4.4744, "step": 13895 }, { "epoch": 6.2443845462713385, "grad_norm": 3.71875, "learning_rate": 0.00012841094867936935, "loss": 4.4872, "step": 13900 }, { "epoch": 6.2466307277628035, "grad_norm": 3.6875, "learning_rate": 0.00012831020925123117, "loss": 4.5236, "step": 13905 }, { "epoch": 6.2488769092542675, "grad_norm": 3.65625, "learning_rate": 0.00012820949187804337, "loss": 4.5709, "step": 13910 }, { "epoch": 6.2511230907457325, "grad_norm": 3.59375, "learning_rate": 0.00012810879662034915, "loss": 4.5999, "step": 13915 }, { "epoch": 6.2533692722371965, "grad_norm": 3.65625, "learning_rate": 0.00012800812353867835, "loss": 4.4844, "step": 13920 }, { "epoch": 6.2556154537286615, "grad_norm": 3.96875, "learning_rate": 0.0001279074726935476, "loss": 4.5329, "step": 13925 }, { "epoch": 6.2578616352201255, "grad_norm": 3.859375, "learning_rate": 0.00012780684414546005, "loss": 4.5309, "step": 13930 }, { "epoch": 6.2601078167115904, "grad_norm": 3.859375, "learning_rate": 0.0001277062379549055, "loss": 4.5303, "step": 13935 }, { "epoch": 6.2623539982030545, "grad_norm": 3.6875, "learning_rate": 0.00012760565418236023, "loss": 4.5207, "step": 13940 }, { "epoch": 6.264600179694519, "grad_norm": 3.515625, "learning_rate": 0.00012750509288828718, "loss": 4.5325, "step": 13945 }, { "epoch": 6.2668463611859835, "grad_norm": 3.515625, "learning_rate": 0.00012740455413313574, "loss": 4.5184, "step": 13950 }, { "epoch": 6.269092542677448, "grad_norm": 3.84375, "learning_rate": 0.00012730403797734172, "loss": 4.5426, "step": 13955 }, { "epoch": 6.2713387241689125, "grad_norm": 3.5625, "learning_rate": 0.0001272035444813273, "loss": 4.498, "step": 13960 }, { "epoch": 6.273584905660377, "grad_norm": 3.953125, "learning_rate": 0.000127103073705501, "loss": 4.5535, "step": 13965 }, { "epoch": 6.2758310871518415, "grad_norm": 3.671875, "learning_rate": 0.00012700262571025789, "loss": 4.4931, "step": 13970 }, { "epoch": 6.278077268643306, "grad_norm": 3.78125, "learning_rate": 0.0001269022005559792, "loss": 4.531, "step": 13975 }, { "epoch": 6.280323450134771, "grad_norm": 3.859375, "learning_rate": 0.00012680179830303244, "loss": 4.4948, "step": 13980 }, { "epoch": 6.282569631626235, "grad_norm": 3.78125, "learning_rate": 0.00012670141901177138, "loss": 4.5464, "step": 13985 }, { "epoch": 6.2848158131177, "grad_norm": 3.671875, "learning_rate": 0.00012660106274253597, "loss": 4.5584, "step": 13990 }, { "epoch": 6.287061994609164, "grad_norm": 3.734375, "learning_rate": 0.00012650072955565226, "loss": 4.5407, "step": 13995 }, { "epoch": 6.289308176100629, "grad_norm": 3.859375, "learning_rate": 0.00012640041951143263, "loss": 4.5669, "step": 14000 }, { "epoch": 6.289308176100629, "eval_loss": 4.793929100036621, "eval_runtime": 16.1025, "eval_samples_per_second": 1925.975, "eval_steps_per_second": 240.77, "step": 14000 }, { "epoch": 6.291554357592093, "grad_norm": 3.921875, "learning_rate": 0.00012630013267017528, "loss": 4.5684, "step": 14005 }, { "epoch": 6.293800539083558, "grad_norm": 3.6875, "learning_rate": 0.00012619986909216465, "loss": 4.5145, "step": 14010 }, { "epoch": 6.296046720575022, "grad_norm": 3.609375, "learning_rate": 0.00012609962883767113, "loss": 4.5817, "step": 14015 }, { "epoch": 6.298292902066487, "grad_norm": 3.796875, "learning_rate": 0.00012599941196695107, "loss": 4.5459, "step": 14020 }, { "epoch": 6.300539083557951, "grad_norm": 3.625, "learning_rate": 0.00012589921854024686, "loss": 4.5078, "step": 14025 }, { "epoch": 6.302785265049416, "grad_norm": 3.6875, "learning_rate": 0.00012579904861778661, "loss": 4.5486, "step": 14030 }, { "epoch": 6.30503144654088, "grad_norm": 3.65625, "learning_rate": 0.00012569890225978456, "loss": 4.4713, "step": 14035 }, { "epoch": 6.307277628032345, "grad_norm": 3.546875, "learning_rate": 0.00012559877952644053, "loss": 4.5605, "step": 14040 }, { "epoch": 6.309523809523809, "grad_norm": 3.703125, "learning_rate": 0.0001254986804779403, "loss": 4.5235, "step": 14045 }, { "epoch": 6.311769991015274, "grad_norm": 3.53125, "learning_rate": 0.00012539860517445537, "loss": 4.4912, "step": 14050 }, { "epoch": 6.314016172506738, "grad_norm": 3.75, "learning_rate": 0.00012529855367614294, "loss": 4.5545, "step": 14055 }, { "epoch": 6.316262353998203, "grad_norm": 3.484375, "learning_rate": 0.0001251985260431459, "loss": 4.5054, "step": 14060 }, { "epoch": 6.318508535489667, "grad_norm": 3.703125, "learning_rate": 0.00012509852233559286, "loss": 4.45, "step": 14065 }, { "epoch": 6.320754716981132, "grad_norm": 3.546875, "learning_rate": 0.00012499854261359799, "loss": 4.6074, "step": 14070 }, { "epoch": 6.323000898472596, "grad_norm": 3.75, "learning_rate": 0.00012489858693726108, "loss": 4.5045, "step": 14075 }, { "epoch": 6.325247079964061, "grad_norm": 3.703125, "learning_rate": 0.0001247986553666674, "loss": 4.5179, "step": 14080 }, { "epoch": 6.327493261455525, "grad_norm": 3.90625, "learning_rate": 0.00012469874796188778, "loss": 4.5399, "step": 14085 }, { "epoch": 6.32973944294699, "grad_norm": 3.484375, "learning_rate": 0.0001245988647829785, "loss": 4.5538, "step": 14090 }, { "epoch": 6.331985624438454, "grad_norm": 3.71875, "learning_rate": 0.00012449900588998132, "loss": 4.5488, "step": 14095 }, { "epoch": 6.334231805929919, "grad_norm": 3.96875, "learning_rate": 0.00012439917134292336, "loss": 4.5329, "step": 14100 }, { "epoch": 6.336477987421383, "grad_norm": 4.0, "learning_rate": 0.00012429936120181715, "loss": 4.5304, "step": 14105 }, { "epoch": 6.338724168912848, "grad_norm": 3.53125, "learning_rate": 0.00012419957552666048, "loss": 4.519, "step": 14110 }, { "epoch": 6.340970350404312, "grad_norm": 3.65625, "learning_rate": 0.0001240998143774365, "loss": 4.5323, "step": 14115 }, { "epoch": 6.343216531895777, "grad_norm": 3.9375, "learning_rate": 0.0001240000778141135, "loss": 4.5171, "step": 14120 }, { "epoch": 6.345462713387242, "grad_norm": 4.125, "learning_rate": 0.00012390036589664518, "loss": 4.5038, "step": 14125 }, { "epoch": 6.347708894878706, "grad_norm": 3.6875, "learning_rate": 0.0001238006786849703, "loss": 4.5938, "step": 14130 }, { "epoch": 6.349955076370171, "grad_norm": 3.8125, "learning_rate": 0.00012370101623901273, "loss": 4.4651, "step": 14135 }, { "epoch": 6.352201257861635, "grad_norm": 3.453125, "learning_rate": 0.00012360137861868156, "loss": 4.5762, "step": 14140 }, { "epoch": 6.3544474393531, "grad_norm": 3.53125, "learning_rate": 0.00012350176588387093, "loss": 4.4847, "step": 14145 }, { "epoch": 6.356693620844564, "grad_norm": 3.65625, "learning_rate": 0.00012340217809446, "loss": 4.5574, "step": 14150 }, { "epoch": 6.358939802336029, "grad_norm": 3.59375, "learning_rate": 0.00012330261531031287, "loss": 4.5025, "step": 14155 }, { "epoch": 6.361185983827493, "grad_norm": 3.890625, "learning_rate": 0.00012320307759127876, "loss": 4.5731, "step": 14160 }, { "epoch": 6.363432165318958, "grad_norm": 3.84375, "learning_rate": 0.0001231035649971917, "loss": 4.5366, "step": 14165 }, { "epoch": 6.365678346810422, "grad_norm": 3.78125, "learning_rate": 0.00012300407758787066, "loss": 4.4876, "step": 14170 }, { "epoch": 6.367924528301887, "grad_norm": 3.703125, "learning_rate": 0.00012290461542311946, "loss": 4.5827, "step": 14175 }, { "epoch": 6.370170709793351, "grad_norm": 3.59375, "learning_rate": 0.00012280517856272675, "loss": 4.4945, "step": 14180 }, { "epoch": 6.372416891284816, "grad_norm": 3.90625, "learning_rate": 0.000122705767066466, "loss": 4.5056, "step": 14185 }, { "epoch": 6.37466307277628, "grad_norm": 3.703125, "learning_rate": 0.00012260638099409536, "loss": 4.5234, "step": 14190 }, { "epoch": 6.376909254267745, "grad_norm": 3.59375, "learning_rate": 0.0001225070204053578, "loss": 4.466, "step": 14195 }, { "epoch": 6.379155435759209, "grad_norm": 3.640625, "learning_rate": 0.00012240768535998084, "loss": 4.4772, "step": 14200 }, { "epoch": 6.381401617250674, "grad_norm": 3.90625, "learning_rate": 0.00012230837591767672, "loss": 4.5287, "step": 14205 }, { "epoch": 6.383647798742138, "grad_norm": 3.8125, "learning_rate": 0.00012220909213814235, "loss": 4.4945, "step": 14210 }, { "epoch": 6.385893980233603, "grad_norm": 3.640625, "learning_rate": 0.00012210983408105915, "loss": 4.4599, "step": 14215 }, { "epoch": 6.388140161725067, "grad_norm": 3.578125, "learning_rate": 0.000122010601806093, "loss": 4.5246, "step": 14220 }, { "epoch": 6.390386343216532, "grad_norm": 3.953125, "learning_rate": 0.00012191139537289445, "loss": 4.4865, "step": 14225 }, { "epoch": 6.392632524707996, "grad_norm": 3.78125, "learning_rate": 0.00012181221484109835, "loss": 4.5183, "step": 14230 }, { "epoch": 6.394878706199461, "grad_norm": 3.515625, "learning_rate": 0.0001217130602703241, "loss": 4.5526, "step": 14235 }, { "epoch": 6.397124887690925, "grad_norm": 3.890625, "learning_rate": 0.00012161393172017542, "loss": 4.5366, "step": 14240 }, { "epoch": 6.39937106918239, "grad_norm": 3.84375, "learning_rate": 0.0001215148292502405, "loss": 4.5064, "step": 14245 }, { "epoch": 6.401617250673855, "grad_norm": 3.90625, "learning_rate": 0.00012141575292009165, "loss": 4.529, "step": 14250 }, { "epoch": 6.403863432165319, "grad_norm": 3.609375, "learning_rate": 0.00012131670278928569, "loss": 4.5554, "step": 14255 }, { "epoch": 6.406109613656783, "grad_norm": 3.796875, "learning_rate": 0.00012121767891736353, "loss": 4.5227, "step": 14260 }, { "epoch": 6.408355795148248, "grad_norm": 3.5625, "learning_rate": 0.00012111868136385037, "loss": 4.5264, "step": 14265 }, { "epoch": 6.410601976639713, "grad_norm": 3.8125, "learning_rate": 0.00012101971018825564, "loss": 4.5253, "step": 14270 }, { "epoch": 6.412848158131177, "grad_norm": 3.703125, "learning_rate": 0.00012092076545007273, "loss": 4.5086, "step": 14275 }, { "epoch": 6.415094339622642, "grad_norm": 3.609375, "learning_rate": 0.00012082184720877934, "loss": 4.5902, "step": 14280 }, { "epoch": 6.417340521114106, "grad_norm": 3.640625, "learning_rate": 0.00012072295552383708, "loss": 4.5578, "step": 14285 }, { "epoch": 6.419586702605571, "grad_norm": 3.859375, "learning_rate": 0.00012062409045469175, "loss": 4.5546, "step": 14290 }, { "epoch": 6.421832884097035, "grad_norm": 3.65625, "learning_rate": 0.00012052525206077305, "loss": 4.5276, "step": 14295 }, { "epoch": 6.4240790655885, "grad_norm": 3.515625, "learning_rate": 0.0001204264404014947, "loss": 4.5656, "step": 14300 }, { "epoch": 6.426325247079964, "grad_norm": 3.609375, "learning_rate": 0.00012032765553625428, "loss": 4.5248, "step": 14305 }, { "epoch": 6.428571428571429, "grad_norm": 3.765625, "learning_rate": 0.0001202288975244333, "loss": 4.5503, "step": 14310 }, { "epoch": 6.430817610062893, "grad_norm": 3.703125, "learning_rate": 0.00012013016642539715, "loss": 4.5641, "step": 14315 }, { "epoch": 6.433063791554358, "grad_norm": 3.734375, "learning_rate": 0.00012003146229849505, "loss": 4.4994, "step": 14320 }, { "epoch": 6.435309973045822, "grad_norm": 3.671875, "learning_rate": 0.00011993278520305992, "loss": 4.5326, "step": 14325 }, { "epoch": 6.437556154537287, "grad_norm": 3.890625, "learning_rate": 0.00011983413519840854, "loss": 4.4685, "step": 14330 }, { "epoch": 6.439802336028751, "grad_norm": 3.859375, "learning_rate": 0.00011973551234384135, "loss": 4.5315, "step": 14335 }, { "epoch": 6.442048517520216, "grad_norm": 3.640625, "learning_rate": 0.00011963691669864244, "loss": 4.4935, "step": 14340 }, { "epoch": 6.44429469901168, "grad_norm": 3.796875, "learning_rate": 0.00011953834832207957, "loss": 4.5003, "step": 14345 }, { "epoch": 6.446540880503145, "grad_norm": 3.953125, "learning_rate": 0.0001194398072734042, "loss": 4.5085, "step": 14350 }, { "epoch": 6.448787061994609, "grad_norm": 3.609375, "learning_rate": 0.00011934129361185124, "loss": 4.4885, "step": 14355 }, { "epoch": 6.451033243486074, "grad_norm": 3.765625, "learning_rate": 0.00011924280739663914, "loss": 4.4973, "step": 14360 }, { "epoch": 6.453279424977538, "grad_norm": 3.734375, "learning_rate": 0.00011914434868696995, "loss": 4.4664, "step": 14365 }, { "epoch": 6.455525606469003, "grad_norm": 4.03125, "learning_rate": 0.00011904591754202906, "loss": 4.5713, "step": 14370 }, { "epoch": 6.457771787960467, "grad_norm": 3.609375, "learning_rate": 0.0001189475140209854, "loss": 4.5177, "step": 14375 }, { "epoch": 6.460017969451932, "grad_norm": 3.375, "learning_rate": 0.00011884913818299123, "loss": 4.4828, "step": 14380 }, { "epoch": 6.462264150943396, "grad_norm": 3.75, "learning_rate": 0.00011875079008718222, "loss": 4.61, "step": 14385 }, { "epoch": 6.464510332434861, "grad_norm": 3.640625, "learning_rate": 0.00011865246979267728, "loss": 4.5747, "step": 14390 }, { "epoch": 6.466756513926326, "grad_norm": 3.9375, "learning_rate": 0.0001185541773585787, "loss": 4.5184, "step": 14395 }, { "epoch": 6.46900269541779, "grad_norm": 3.5625, "learning_rate": 0.000118455912843972, "loss": 4.5603, "step": 14400 }, { "epoch": 6.471248876909255, "grad_norm": 4.09375, "learning_rate": 0.00011835767630792586, "loss": 4.5171, "step": 14405 }, { "epoch": 6.473495058400719, "grad_norm": 3.875, "learning_rate": 0.00011825946780949216, "loss": 4.5148, "step": 14410 }, { "epoch": 6.475741239892184, "grad_norm": 3.59375, "learning_rate": 0.00011816128740770604, "loss": 4.5804, "step": 14415 }, { "epoch": 6.477987421383648, "grad_norm": 3.90625, "learning_rate": 0.00011806313516158559, "loss": 4.5033, "step": 14420 }, { "epoch": 6.480233602875113, "grad_norm": 3.859375, "learning_rate": 0.00011796501113013204, "loss": 4.5177, "step": 14425 }, { "epoch": 6.482479784366577, "grad_norm": 3.890625, "learning_rate": 0.00011786691537232975, "loss": 4.5909, "step": 14430 }, { "epoch": 6.484725965858042, "grad_norm": 3.625, "learning_rate": 0.00011776884794714586, "loss": 4.5206, "step": 14435 }, { "epoch": 6.486972147349506, "grad_norm": 3.6875, "learning_rate": 0.00011767080891353069, "loss": 4.5884, "step": 14440 }, { "epoch": 6.489218328840971, "grad_norm": 3.71875, "learning_rate": 0.00011757279833041742, "loss": 4.4961, "step": 14445 }, { "epoch": 6.491464510332435, "grad_norm": 3.703125, "learning_rate": 0.00011747481625672212, "loss": 4.5385, "step": 14450 }, { "epoch": 6.4937106918239, "grad_norm": 3.859375, "learning_rate": 0.00011737686275134372, "loss": 4.494, "step": 14455 }, { "epoch": 6.495956873315364, "grad_norm": 3.5, "learning_rate": 0.00011727893787316402, "loss": 4.551, "step": 14460 }, { "epoch": 6.498203054806829, "grad_norm": 3.65625, "learning_rate": 0.00011718104168104756, "loss": 4.4896, "step": 14465 }, { "epoch": 6.500449236298293, "grad_norm": 3.671875, "learning_rate": 0.00011708317423384163, "loss": 4.578, "step": 14470 }, { "epoch": 6.502695417789758, "grad_norm": 3.625, "learning_rate": 0.00011698533559037628, "loss": 4.4863, "step": 14475 }, { "epoch": 6.504941599281222, "grad_norm": 3.609375, "learning_rate": 0.00011688752580946425, "loss": 4.5179, "step": 14480 }, { "epoch": 6.507187780772687, "grad_norm": 3.6875, "learning_rate": 0.00011678974494990092, "loss": 4.5397, "step": 14485 }, { "epoch": 6.509433962264151, "grad_norm": 3.609375, "learning_rate": 0.0001166919930704642, "loss": 4.5018, "step": 14490 }, { "epoch": 6.5116801437556155, "grad_norm": 3.625, "learning_rate": 0.00011659427022991474, "loss": 4.447, "step": 14495 }, { "epoch": 6.51392632524708, "grad_norm": 3.90625, "learning_rate": 0.00011649657648699564, "loss": 4.5113, "step": 14500 }, { "epoch": 6.5161725067385445, "grad_norm": 3.578125, "learning_rate": 0.00011639891190043248, "loss": 4.4485, "step": 14505 }, { "epoch": 6.518418688230009, "grad_norm": 3.65625, "learning_rate": 0.00011630127652893336, "loss": 4.5122, "step": 14510 }, { "epoch": 6.5206648697214735, "grad_norm": 3.859375, "learning_rate": 0.00011620367043118884, "loss": 4.538, "step": 14515 }, { "epoch": 6.5229110512129385, "grad_norm": 4.0, "learning_rate": 0.00011610609366587179, "loss": 4.5288, "step": 14520 }, { "epoch": 6.5251572327044025, "grad_norm": 3.84375, "learning_rate": 0.00011600854629163758, "loss": 4.4971, "step": 14525 }, { "epoch": 6.527403414195867, "grad_norm": 3.796875, "learning_rate": 0.00011591102836712383, "loss": 4.5613, "step": 14530 }, { "epoch": 6.5296495956873315, "grad_norm": 3.703125, "learning_rate": 0.00011581353995095046, "loss": 4.5092, "step": 14535 }, { "epoch": 6.531895777178796, "grad_norm": 3.84375, "learning_rate": 0.00011571608110171965, "loss": 4.5608, "step": 14540 }, { "epoch": 6.5341419586702605, "grad_norm": 3.65625, "learning_rate": 0.00011561865187801587, "loss": 4.5326, "step": 14545 }, { "epoch": 6.536388140161725, "grad_norm": 3.75, "learning_rate": 0.00011552125233840563, "loss": 4.5099, "step": 14550 }, { "epoch": 6.5386343216531895, "grad_norm": 3.515625, "learning_rate": 0.00011542388254143775, "loss": 4.518, "step": 14555 }, { "epoch": 6.540880503144654, "grad_norm": 3.9375, "learning_rate": 0.00011532654254564316, "loss": 4.602, "step": 14560 }, { "epoch": 6.5431266846361185, "grad_norm": 3.84375, "learning_rate": 0.0001152292324095348, "loss": 4.5577, "step": 14565 }, { "epoch": 6.545372866127583, "grad_norm": 3.703125, "learning_rate": 0.0001151319521916077, "loss": 4.5812, "step": 14570 }, { "epoch": 6.5476190476190474, "grad_norm": 3.703125, "learning_rate": 0.00011503470195033893, "loss": 4.4876, "step": 14575 }, { "epoch": 6.549865229110512, "grad_norm": 3.859375, "learning_rate": 0.00011493748174418742, "loss": 4.4887, "step": 14580 }, { "epoch": 6.552111410601976, "grad_norm": 3.453125, "learning_rate": 0.00011484029163159424, "loss": 4.5311, "step": 14585 }, { "epoch": 6.554357592093441, "grad_norm": 3.78125, "learning_rate": 0.00011474313167098222, "loss": 4.5097, "step": 14590 }, { "epoch": 6.556603773584905, "grad_norm": 3.859375, "learning_rate": 0.00011464600192075608, "loss": 4.5612, "step": 14595 }, { "epoch": 6.55884995507637, "grad_norm": 3.609375, "learning_rate": 0.0001145489024393024, "loss": 4.556, "step": 14600 }, { "epoch": 6.561096136567834, "grad_norm": 3.5625, "learning_rate": 0.00011445183328498965, "loss": 4.5377, "step": 14605 }, { "epoch": 6.563342318059299, "grad_norm": 3.828125, "learning_rate": 0.00011435479451616801, "loss": 4.498, "step": 14610 }, { "epoch": 6.565588499550763, "grad_norm": 3.640625, "learning_rate": 0.00011425778619116928, "loss": 4.5996, "step": 14615 }, { "epoch": 6.567834681042228, "grad_norm": 3.59375, "learning_rate": 0.00011416080836830717, "loss": 4.5207, "step": 14620 }, { "epoch": 6.570080862533692, "grad_norm": 3.703125, "learning_rate": 0.00011406386110587684, "loss": 4.5385, "step": 14625 }, { "epoch": 6.572327044025157, "grad_norm": 3.65625, "learning_rate": 0.00011396694446215525, "loss": 4.5183, "step": 14630 }, { "epoch": 6.574573225516621, "grad_norm": 3.546875, "learning_rate": 0.00011387005849540086, "loss": 4.6191, "step": 14635 }, { "epoch": 6.576819407008086, "grad_norm": 3.65625, "learning_rate": 0.00011377320326385376, "loss": 4.5576, "step": 14640 }, { "epoch": 6.579065588499551, "grad_norm": 3.71875, "learning_rate": 0.00011367637882573548, "loss": 4.5206, "step": 14645 }, { "epoch": 6.581311769991015, "grad_norm": 3.65625, "learning_rate": 0.00011357958523924913, "loss": 4.5109, "step": 14650 }, { "epoch": 6.583557951482479, "grad_norm": 3.640625, "learning_rate": 0.00011348282256257918, "loss": 4.5546, "step": 14655 }, { "epoch": 6.585804132973944, "grad_norm": 3.5625, "learning_rate": 0.00011338609085389158, "loss": 4.4889, "step": 14660 }, { "epoch": 6.588050314465409, "grad_norm": 3.734375, "learning_rate": 0.00011328939017133358, "loss": 4.5036, "step": 14665 }, { "epoch": 6.590296495956873, "grad_norm": 3.609375, "learning_rate": 0.000113192720573034, "loss": 4.5495, "step": 14670 }, { "epoch": 6.592542677448337, "grad_norm": 3.859375, "learning_rate": 0.00011309608211710271, "loss": 4.5405, "step": 14675 }, { "epoch": 6.594788858939802, "grad_norm": 3.65625, "learning_rate": 0.00011299947486163105, "loss": 4.4951, "step": 14680 }, { "epoch": 6.597035040431267, "grad_norm": 3.4375, "learning_rate": 0.00011290289886469147, "loss": 4.5056, "step": 14685 }, { "epoch": 6.599281221922731, "grad_norm": 3.375, "learning_rate": 0.00011280635418433776, "loss": 4.5174, "step": 14690 }, { "epoch": 6.601527403414196, "grad_norm": 3.578125, "learning_rate": 0.00011270984087860467, "loss": 4.5037, "step": 14695 }, { "epoch": 6.60377358490566, "grad_norm": 3.71875, "learning_rate": 0.00011261335900550839, "loss": 4.5719, "step": 14700 }, { "epoch": 6.606019766397125, "grad_norm": 3.78125, "learning_rate": 0.000112516908623046, "loss": 4.5371, "step": 14705 }, { "epoch": 6.608265947888589, "grad_norm": 3.734375, "learning_rate": 0.0001124204897891957, "loss": 4.5445, "step": 14710 }, { "epoch": 6.610512129380054, "grad_norm": 3.578125, "learning_rate": 0.00011232410256191677, "loss": 4.5235, "step": 14715 }, { "epoch": 6.612758310871518, "grad_norm": 3.9375, "learning_rate": 0.00011222774699914941, "loss": 4.5053, "step": 14720 }, { "epoch": 6.615004492362983, "grad_norm": 3.71875, "learning_rate": 0.00011213142315881486, "loss": 4.6242, "step": 14725 }, { "epoch": 6.617250673854447, "grad_norm": 3.859375, "learning_rate": 0.00011203513109881524, "loss": 4.5777, "step": 14730 }, { "epoch": 6.619496855345912, "grad_norm": 3.9375, "learning_rate": 0.00011193887087703363, "loss": 4.4892, "step": 14735 }, { "epoch": 6.621743036837376, "grad_norm": 3.828125, "learning_rate": 0.00011184264255133388, "loss": 4.5864, "step": 14740 }, { "epoch": 6.623989218328841, "grad_norm": 3.5625, "learning_rate": 0.00011174644617956081, "loss": 4.537, "step": 14745 }, { "epoch": 6.626235399820305, "grad_norm": 3.65625, "learning_rate": 0.00011165028181953985, "loss": 4.5378, "step": 14750 }, { "epoch": 6.62848158131177, "grad_norm": 3.578125, "learning_rate": 0.00011155414952907728, "loss": 4.5097, "step": 14755 }, { "epoch": 6.630727762803234, "grad_norm": 3.953125, "learning_rate": 0.00011145804936596011, "loss": 4.5378, "step": 14760 }, { "epoch": 6.632973944294699, "grad_norm": 4.0, "learning_rate": 0.00011136198138795606, "loss": 4.6784, "step": 14765 }, { "epoch": 6.635220125786163, "grad_norm": 3.78125, "learning_rate": 0.00011126594565281345, "loss": 4.595, "step": 14770 }, { "epoch": 6.637466307277628, "grad_norm": 4.0, "learning_rate": 0.00011116994221826121, "loss": 4.5158, "step": 14775 }, { "epoch": 6.639712488769092, "grad_norm": 3.921875, "learning_rate": 0.00011107397114200892, "loss": 4.5673, "step": 14780 }, { "epoch": 6.641958670260557, "grad_norm": 3.765625, "learning_rate": 0.00011097803248174664, "loss": 4.5942, "step": 14785 }, { "epoch": 6.644204851752022, "grad_norm": 3.625, "learning_rate": 0.00011088212629514502, "loss": 4.5581, "step": 14790 }, { "epoch": 6.646451033243486, "grad_norm": 3.828125, "learning_rate": 0.00011078625263985509, "loss": 4.6212, "step": 14795 }, { "epoch": 6.64869721473495, "grad_norm": 4.0625, "learning_rate": 0.0001106904115735084, "loss": 4.4868, "step": 14800 }, { "epoch": 6.650943396226415, "grad_norm": 3.890625, "learning_rate": 0.00011059460315371693, "loss": 4.4921, "step": 14805 }, { "epoch": 6.65318957771788, "grad_norm": 3.625, "learning_rate": 0.00011049882743807289, "loss": 4.5305, "step": 14810 }, { "epoch": 6.655435759209344, "grad_norm": 3.703125, "learning_rate": 0.00011040308448414901, "loss": 4.4816, "step": 14815 }, { "epoch": 6.657681940700809, "grad_norm": 3.734375, "learning_rate": 0.00011030737434949829, "loss": 4.5401, "step": 14820 }, { "epoch": 6.659928122192273, "grad_norm": 3.59375, "learning_rate": 0.00011021169709165386, "loss": 4.5575, "step": 14825 }, { "epoch": 6.662174303683738, "grad_norm": 3.65625, "learning_rate": 0.00011011605276812926, "loss": 4.5486, "step": 14830 }, { "epoch": 6.664420485175202, "grad_norm": 3.8125, "learning_rate": 0.00011002044143641815, "loss": 4.5209, "step": 14835 }, { "epoch": 6.666666666666667, "grad_norm": 3.75, "learning_rate": 0.00010992486315399431, "loss": 4.5942, "step": 14840 }, { "epoch": 6.668912848158131, "grad_norm": 3.828125, "learning_rate": 0.00010982931797831182, "loss": 4.4669, "step": 14845 }, { "epoch": 6.671159029649596, "grad_norm": 3.890625, "learning_rate": 0.00010973380596680472, "loss": 4.5182, "step": 14850 }, { "epoch": 6.67340521114106, "grad_norm": 3.765625, "learning_rate": 0.00010963832717688711, "loss": 4.4873, "step": 14855 }, { "epoch": 6.675651392632525, "grad_norm": 3.828125, "learning_rate": 0.00010954288166595314, "loss": 4.528, "step": 14860 }, { "epoch": 6.677897574123989, "grad_norm": 3.734375, "learning_rate": 0.00010944746949137705, "loss": 4.5011, "step": 14865 }, { "epoch": 6.680143755615454, "grad_norm": 3.921875, "learning_rate": 0.00010935209071051289, "loss": 4.4619, "step": 14870 }, { "epoch": 6.682389937106918, "grad_norm": 3.75, "learning_rate": 0.00010925674538069476, "loss": 4.6037, "step": 14875 }, { "epoch": 6.684636118598383, "grad_norm": 3.625, "learning_rate": 0.00010916143355923657, "loss": 4.5853, "step": 14880 }, { "epoch": 6.686882300089847, "grad_norm": 3.5, "learning_rate": 0.00010906615530343216, "loss": 4.5759, "step": 14885 }, { "epoch": 6.689128481581312, "grad_norm": 3.65625, "learning_rate": 0.00010897091067055507, "loss": 4.4407, "step": 14890 }, { "epoch": 6.691374663072776, "grad_norm": 3.640625, "learning_rate": 0.00010887569971785877, "loss": 4.4905, "step": 14895 }, { "epoch": 6.693620844564241, "grad_norm": 3.8125, "learning_rate": 0.00010878052250257651, "loss": 4.5457, "step": 14900 }, { "epoch": 6.695867026055705, "grad_norm": 3.8125, "learning_rate": 0.00010868537908192096, "loss": 4.4999, "step": 14905 }, { "epoch": 6.69811320754717, "grad_norm": 3.6875, "learning_rate": 0.0001085902695130849, "loss": 4.5304, "step": 14910 }, { "epoch": 6.700359389038635, "grad_norm": 3.484375, "learning_rate": 0.0001084951938532404, "loss": 4.5785, "step": 14915 }, { "epoch": 6.702605570530099, "grad_norm": 3.78125, "learning_rate": 0.00010840015215953941, "loss": 4.5099, "step": 14920 }, { "epoch": 6.704851752021563, "grad_norm": 3.515625, "learning_rate": 0.00010830514448911326, "loss": 4.5237, "step": 14925 }, { "epoch": 6.707097933513028, "grad_norm": 3.6875, "learning_rate": 0.00010821017089907299, "loss": 4.5074, "step": 14930 }, { "epoch": 6.709344115004493, "grad_norm": 3.828125, "learning_rate": 0.000108115231446509, "loss": 4.5812, "step": 14935 }, { "epoch": 6.711590296495957, "grad_norm": 3.90625, "learning_rate": 0.0001080203261884913, "loss": 4.5318, "step": 14940 }, { "epoch": 6.713836477987421, "grad_norm": 3.703125, "learning_rate": 0.00010792545518206936, "loss": 4.5566, "step": 14945 }, { "epoch": 6.716082659478886, "grad_norm": 3.5625, "learning_rate": 0.00010783061848427187, "loss": 4.4695, "step": 14950 }, { "epoch": 6.718328840970351, "grad_norm": 3.671875, "learning_rate": 0.00010773581615210714, "loss": 4.5218, "step": 14955 }, { "epoch": 6.720575022461815, "grad_norm": 3.671875, "learning_rate": 0.00010764104824256261, "loss": 4.4921, "step": 14960 }, { "epoch": 6.72282120395328, "grad_norm": 3.65625, "learning_rate": 0.0001075463148126052, "loss": 4.4961, "step": 14965 }, { "epoch": 6.725067385444744, "grad_norm": 3.984375, "learning_rate": 0.00010745161591918092, "loss": 4.4951, "step": 14970 }, { "epoch": 6.727313566936209, "grad_norm": 3.8125, "learning_rate": 0.00010735695161921522, "loss": 4.5504, "step": 14975 }, { "epoch": 6.729559748427673, "grad_norm": 3.875, "learning_rate": 0.00010726232196961269, "loss": 4.5075, "step": 14980 }, { "epoch": 6.731805929919138, "grad_norm": 3.828125, "learning_rate": 0.00010716772702725692, "loss": 4.5834, "step": 14985 }, { "epoch": 6.734052111410602, "grad_norm": 3.484375, "learning_rate": 0.00010707316684901095, "loss": 4.4663, "step": 14990 }, { "epoch": 6.736298292902067, "grad_norm": 4.03125, "learning_rate": 0.00010697864149171663, "loss": 4.4827, "step": 14995 }, { "epoch": 6.738544474393531, "grad_norm": 3.75, "learning_rate": 0.00010688415101219502, "loss": 4.5397, "step": 15000 }, { "epoch": 6.738544474393531, "eval_loss": 4.787689685821533, "eval_runtime": 16.0548, "eval_samples_per_second": 1931.7, "eval_steps_per_second": 241.486, "step": 15000 }, { "epoch": 6.740790655884996, "grad_norm": 3.8125, "learning_rate": 0.00010678969546724628, "loss": 4.5253, "step": 15005 }, { "epoch": 6.74303683737646, "grad_norm": 3.765625, "learning_rate": 0.00010669527491364935, "loss": 4.5435, "step": 15010 }, { "epoch": 6.745283018867925, "grad_norm": 3.65625, "learning_rate": 0.00010660088940816236, "loss": 4.4933, "step": 15015 }, { "epoch": 6.747529200359389, "grad_norm": 3.734375, "learning_rate": 0.00010650653900752224, "loss": 4.541, "step": 15020 }, { "epoch": 6.749775381850854, "grad_norm": 3.5625, "learning_rate": 0.00010641222376844495, "loss": 4.5388, "step": 15025 }, { "epoch": 6.752021563342318, "grad_norm": 3.703125, "learning_rate": 0.00010631794374762507, "loss": 4.552, "step": 15030 }, { "epoch": 6.754267744833783, "grad_norm": 3.625, "learning_rate": 0.00010622369900173626, "loss": 4.518, "step": 15035 }, { "epoch": 6.756513926325247, "grad_norm": 3.671875, "learning_rate": 0.00010612948958743091, "loss": 4.5602, "step": 15040 }, { "epoch": 6.758760107816712, "grad_norm": 3.84375, "learning_rate": 0.00010603531556134006, "loss": 4.5835, "step": 15045 }, { "epoch": 6.761006289308176, "grad_norm": 4.0625, "learning_rate": 0.00010594117698007362, "loss": 4.5293, "step": 15050 }, { "epoch": 6.763252470799641, "grad_norm": 3.734375, "learning_rate": 0.00010584707390022008, "loss": 4.4825, "step": 15055 }, { "epoch": 6.765498652291106, "grad_norm": 3.8125, "learning_rate": 0.0001057530063783467, "loss": 4.5615, "step": 15060 }, { "epoch": 6.76774483378257, "grad_norm": 3.71875, "learning_rate": 0.00010565897447099929, "loss": 4.5061, "step": 15065 }, { "epoch": 6.769991015274034, "grad_norm": 3.75, "learning_rate": 0.00010556497823470215, "loss": 4.4721, "step": 15070 }, { "epoch": 6.772237196765499, "grad_norm": 3.9375, "learning_rate": 0.00010547101772595847, "loss": 4.5282, "step": 15075 }, { "epoch": 6.774483378256964, "grad_norm": 3.703125, "learning_rate": 0.00010537709300124956, "loss": 4.5139, "step": 15080 }, { "epoch": 6.776729559748428, "grad_norm": 3.859375, "learning_rate": 0.00010528320411703548, "loss": 4.5012, "step": 15085 }, { "epoch": 6.7789757412398925, "grad_norm": 3.859375, "learning_rate": 0.00010518935112975469, "loss": 4.5018, "step": 15090 }, { "epoch": 6.781221922731357, "grad_norm": 3.734375, "learning_rate": 0.00010509553409582404, "loss": 4.5257, "step": 15095 }, { "epoch": 6.7834681042228215, "grad_norm": 3.75, "learning_rate": 0.0001050017530716388, "loss": 4.5579, "step": 15100 }, { "epoch": 6.785714285714286, "grad_norm": 3.8125, "learning_rate": 0.00010490800811357252, "loss": 4.4895, "step": 15105 }, { "epoch": 6.7879604672057505, "grad_norm": 4.03125, "learning_rate": 0.00010481429927797716, "loss": 4.4847, "step": 15110 }, { "epoch": 6.790206648697215, "grad_norm": 3.5625, "learning_rate": 0.00010472062662118303, "loss": 4.4448, "step": 15115 }, { "epoch": 6.7924528301886795, "grad_norm": 3.796875, "learning_rate": 0.00010462699019949839, "loss": 4.5179, "step": 15120 }, { "epoch": 6.794699011680144, "grad_norm": 3.71875, "learning_rate": 0.00010453339006921012, "loss": 4.5188, "step": 15125 }, { "epoch": 6.7969451931716085, "grad_norm": 3.796875, "learning_rate": 0.00010443982628658295, "loss": 4.5691, "step": 15130 }, { "epoch": 6.7991913746630726, "grad_norm": 3.78125, "learning_rate": 0.00010434629890786, "loss": 4.4779, "step": 15135 }, { "epoch": 6.8014375561545375, "grad_norm": 3.625, "learning_rate": 0.00010425280798926233, "loss": 4.5947, "step": 15140 }, { "epoch": 6.8036837376460015, "grad_norm": 3.875, "learning_rate": 0.00010415935358698916, "loss": 4.5423, "step": 15145 }, { "epoch": 6.8059299191374665, "grad_norm": 4.125, "learning_rate": 0.00010406593575721785, "loss": 4.5479, "step": 15150 }, { "epoch": 6.8081761006289305, "grad_norm": 3.640625, "learning_rate": 0.00010397255455610357, "loss": 4.5018, "step": 15155 }, { "epoch": 6.8104222821203955, "grad_norm": 3.859375, "learning_rate": 0.00010387921003977968, "loss": 4.5712, "step": 15160 }, { "epoch": 6.8126684636118595, "grad_norm": 3.65625, "learning_rate": 0.00010378590226435731, "loss": 4.5021, "step": 15165 }, { "epoch": 6.8149146451033245, "grad_norm": 3.796875, "learning_rate": 0.00010369263128592566, "loss": 4.5527, "step": 15170 }, { "epoch": 6.8171608265947885, "grad_norm": 3.640625, "learning_rate": 0.00010359939716055165, "loss": 4.4868, "step": 15175 }, { "epoch": 6.819407008086253, "grad_norm": 3.734375, "learning_rate": 0.00010350619994428019, "loss": 4.5061, "step": 15180 }, { "epoch": 6.821653189577718, "grad_norm": 3.859375, "learning_rate": 0.00010341303969313401, "loss": 4.5157, "step": 15185 }, { "epoch": 6.823899371069182, "grad_norm": 3.796875, "learning_rate": 0.00010331991646311347, "loss": 4.5326, "step": 15190 }, { "epoch": 6.8261455525606465, "grad_norm": 3.484375, "learning_rate": 0.00010322683031019678, "loss": 4.578, "step": 15195 }, { "epoch": 6.828391734052111, "grad_norm": 3.703125, "learning_rate": 0.00010313378129033985, "loss": 4.5518, "step": 15200 }, { "epoch": 6.830637915543576, "grad_norm": 4.09375, "learning_rate": 0.00010304076945947624, "loss": 4.5308, "step": 15205 }, { "epoch": 6.83288409703504, "grad_norm": 3.609375, "learning_rate": 0.00010294779487351727, "loss": 4.5058, "step": 15210 }, { "epoch": 6.8351302785265045, "grad_norm": 3.5625, "learning_rate": 0.00010285485758835168, "loss": 4.5244, "step": 15215 }, { "epoch": 6.837376460017969, "grad_norm": 3.71875, "learning_rate": 0.00010276195765984605, "loss": 4.5473, "step": 15220 }, { "epoch": 6.839622641509434, "grad_norm": 3.734375, "learning_rate": 0.00010266909514384407, "loss": 4.5875, "step": 15225 }, { "epoch": 6.841868823000898, "grad_norm": 3.84375, "learning_rate": 0.00010257627009616741, "loss": 4.5284, "step": 15230 }, { "epoch": 6.844115004492363, "grad_norm": 3.640625, "learning_rate": 0.000102483482572615, "loss": 4.4934, "step": 15235 }, { "epoch": 6.846361185983827, "grad_norm": 3.71875, "learning_rate": 0.00010239073262896317, "loss": 4.5229, "step": 15240 }, { "epoch": 6.848607367475292, "grad_norm": 3.578125, "learning_rate": 0.00010229802032096582, "loss": 4.5418, "step": 15245 }, { "epoch": 6.850853548966756, "grad_norm": 3.53125, "learning_rate": 0.000102205345704354, "loss": 4.5674, "step": 15250 }, { "epoch": 6.853099730458221, "grad_norm": 3.421875, "learning_rate": 0.00010211270883483634, "loss": 4.5549, "step": 15255 }, { "epoch": 6.855345911949685, "grad_norm": 3.4375, "learning_rate": 0.00010202010976809868, "loss": 4.5274, "step": 15260 }, { "epoch": 6.85759209344115, "grad_norm": 3.6875, "learning_rate": 0.00010192754855980403, "loss": 4.5748, "step": 15265 }, { "epoch": 6.859838274932614, "grad_norm": 3.703125, "learning_rate": 0.00010183502526559287, "loss": 4.4829, "step": 15270 }, { "epoch": 6.862084456424079, "grad_norm": 3.6875, "learning_rate": 0.00010174253994108262, "loss": 4.6253, "step": 15275 }, { "epoch": 6.864330637915543, "grad_norm": 3.9375, "learning_rate": 0.00010165009264186815, "loss": 4.5681, "step": 15280 }, { "epoch": 6.866576819407008, "grad_norm": 3.671875, "learning_rate": 0.00010155768342352122, "loss": 4.4874, "step": 15285 }, { "epoch": 6.868823000898472, "grad_norm": 3.671875, "learning_rate": 0.0001014653123415909, "loss": 4.5273, "step": 15290 }, { "epoch": 6.871069182389937, "grad_norm": 3.75, "learning_rate": 0.00010137297945160326, "loss": 4.5341, "step": 15295 }, { "epoch": 6.873315363881401, "grad_norm": 3.921875, "learning_rate": 0.00010128068480906132, "loss": 4.4617, "step": 15300 }, { "epoch": 6.875561545372866, "grad_norm": 3.578125, "learning_rate": 0.00010118842846944532, "loss": 4.5259, "step": 15305 }, { "epoch": 6.87780772686433, "grad_norm": 3.6875, "learning_rate": 0.00010109621048821218, "loss": 4.5097, "step": 15310 }, { "epoch": 6.880053908355795, "grad_norm": 3.8125, "learning_rate": 0.00010100403092079611, "loss": 4.4702, "step": 15315 }, { "epoch": 6.882300089847259, "grad_norm": 3.828125, "learning_rate": 0.00010091188982260793, "loss": 4.4858, "step": 15320 }, { "epoch": 6.884546271338724, "grad_norm": 3.890625, "learning_rate": 0.00010081978724903546, "loss": 4.5182, "step": 15325 }, { "epoch": 6.886792452830189, "grad_norm": 3.6875, "learning_rate": 0.00010072772325544344, "loss": 4.4208, "step": 15330 }, { "epoch": 6.889038634321653, "grad_norm": 3.75, "learning_rate": 0.00010063569789717327, "loss": 4.5253, "step": 15335 }, { "epoch": 6.891284815813117, "grad_norm": 3.90625, "learning_rate": 0.00010054371122954323, "loss": 4.5032, "step": 15340 }, { "epoch": 6.893530997304582, "grad_norm": 3.640625, "learning_rate": 0.00010045176330784823, "loss": 4.4829, "step": 15345 }, { "epoch": 6.895777178796047, "grad_norm": 3.859375, "learning_rate": 0.00010035985418736004, "loss": 4.5584, "step": 15350 }, { "epoch": 6.898023360287511, "grad_norm": 3.953125, "learning_rate": 0.00010026798392332702, "loss": 4.4988, "step": 15355 }, { "epoch": 6.900269541778976, "grad_norm": 3.75, "learning_rate": 0.00010017615257097412, "loss": 4.5714, "step": 15360 }, { "epoch": 6.90251572327044, "grad_norm": 3.625, "learning_rate": 0.00010008436018550307, "loss": 4.549, "step": 15365 }, { "epoch": 6.904761904761905, "grad_norm": 3.8125, "learning_rate": 9.999260682209193e-05, "loss": 4.4937, "step": 15370 }, { "epoch": 6.907008086253369, "grad_norm": 3.765625, "learning_rate": 9.990089253589559e-05, "loss": 4.5615, "step": 15375 }, { "epoch": 6.909254267744834, "grad_norm": 3.765625, "learning_rate": 9.980921738204522e-05, "loss": 4.4389, "step": 15380 }, { "epoch": 6.911500449236298, "grad_norm": 3.71875, "learning_rate": 9.971758141564848e-05, "loss": 4.5287, "step": 15385 }, { "epoch": 6.913746630727763, "grad_norm": 3.75, "learning_rate": 9.962598469178966e-05, "loss": 4.4771, "step": 15390 }, { "epoch": 6.915992812219227, "grad_norm": 3.734375, "learning_rate": 9.953442726552923e-05, "loss": 4.5699, "step": 15395 }, { "epoch": 6.918238993710692, "grad_norm": 3.953125, "learning_rate": 9.944290919190425e-05, "loss": 4.467, "step": 15400 }, { "epoch": 6.920485175202156, "grad_norm": 3.71875, "learning_rate": 9.935143052592802e-05, "loss": 4.5499, "step": 15405 }, { "epoch": 6.922731356693621, "grad_norm": 3.765625, "learning_rate": 9.925999132259006e-05, "loss": 4.4919, "step": 15410 }, { "epoch": 6.924977538185085, "grad_norm": 3.890625, "learning_rate": 9.916859163685636e-05, "loss": 4.5208, "step": 15415 }, { "epoch": 6.92722371967655, "grad_norm": 3.828125, "learning_rate": 9.907723152366898e-05, "loss": 4.5282, "step": 15420 }, { "epoch": 6.929469901168014, "grad_norm": 3.890625, "learning_rate": 9.898591103794635e-05, "loss": 4.5708, "step": 15425 }, { "epoch": 6.931716082659479, "grad_norm": 4.28125, "learning_rate": 9.889463023458291e-05, "loss": 4.4944, "step": 15430 }, { "epoch": 6.933962264150943, "grad_norm": 3.6875, "learning_rate": 9.880338916844935e-05, "loss": 4.5565, "step": 15435 }, { "epoch": 6.936208445642408, "grad_norm": 3.578125, "learning_rate": 9.87121878943926e-05, "loss": 4.5056, "step": 15440 }, { "epoch": 6.938454627133872, "grad_norm": 3.75, "learning_rate": 9.862102646723533e-05, "loss": 4.5851, "step": 15445 }, { "epoch": 6.940700808625337, "grad_norm": 3.609375, "learning_rate": 9.85299049417766e-05, "loss": 4.4913, "step": 15450 }, { "epoch": 6.942946990116801, "grad_norm": 3.75, "learning_rate": 9.843882337279125e-05, "loss": 4.5163, "step": 15455 }, { "epoch": 6.945193171608266, "grad_norm": 3.859375, "learning_rate": 9.834778181503018e-05, "loss": 4.5936, "step": 15460 }, { "epoch": 6.94743935309973, "grad_norm": 3.8125, "learning_rate": 9.825678032322038e-05, "loss": 4.5088, "step": 15465 }, { "epoch": 6.949685534591195, "grad_norm": 3.671875, "learning_rate": 9.81658189520645e-05, "loss": 4.5256, "step": 15470 }, { "epoch": 6.95193171608266, "grad_norm": 3.65625, "learning_rate": 9.807489775624128e-05, "loss": 4.487, "step": 15475 }, { "epoch": 6.954177897574124, "grad_norm": 3.734375, "learning_rate": 9.798401679040511e-05, "loss": 4.5617, "step": 15480 }, { "epoch": 6.956424079065588, "grad_norm": 3.6875, "learning_rate": 9.789317610918647e-05, "loss": 4.4997, "step": 15485 }, { "epoch": 6.958670260557053, "grad_norm": 3.953125, "learning_rate": 9.780237576719134e-05, "loss": 4.5441, "step": 15490 }, { "epoch": 6.960916442048518, "grad_norm": 3.9375, "learning_rate": 9.771161581900161e-05, "loss": 4.5346, "step": 15495 }, { "epoch": 6.963162623539982, "grad_norm": 3.734375, "learning_rate": 9.762089631917495e-05, "loss": 4.5024, "step": 15500 }, { "epoch": 6.965408805031447, "grad_norm": 3.859375, "learning_rate": 9.75302173222445e-05, "loss": 4.5533, "step": 15505 }, { "epoch": 6.967654986522911, "grad_norm": 3.8125, "learning_rate": 9.743957888271931e-05, "loss": 4.5326, "step": 15510 }, { "epoch": 6.969901168014376, "grad_norm": 3.765625, "learning_rate": 9.734898105508373e-05, "loss": 4.5071, "step": 15515 }, { "epoch": 6.97214734950584, "grad_norm": 3.734375, "learning_rate": 9.725842389379808e-05, "loss": 4.5342, "step": 15520 }, { "epoch": 6.974393530997305, "grad_norm": 3.65625, "learning_rate": 9.716790745329793e-05, "loss": 4.5754, "step": 15525 }, { "epoch": 6.976639712488769, "grad_norm": 3.734375, "learning_rate": 9.707743178799446e-05, "loss": 4.5492, "step": 15530 }, { "epoch": 6.978885893980234, "grad_norm": 3.734375, "learning_rate": 9.698699695227454e-05, "loss": 4.494, "step": 15535 }, { "epoch": 6.981132075471698, "grad_norm": 3.53125, "learning_rate": 9.689660300050007e-05, "loss": 4.5579, "step": 15540 }, { "epoch": 6.983378256963163, "grad_norm": 3.96875, "learning_rate": 9.680624998700875e-05, "loss": 4.6125, "step": 15545 }, { "epoch": 6.985624438454627, "grad_norm": 3.90625, "learning_rate": 9.671593796611356e-05, "loss": 4.5417, "step": 15550 }, { "epoch": 6.987870619946092, "grad_norm": 3.859375, "learning_rate": 9.662566699210276e-05, "loss": 4.5106, "step": 15555 }, { "epoch": 6.990116801437556, "grad_norm": 3.640625, "learning_rate": 9.653543711924005e-05, "loss": 4.5051, "step": 15560 }, { "epoch": 6.992362982929021, "grad_norm": 3.828125, "learning_rate": 9.644524840176432e-05, "loss": 4.5207, "step": 15565 }, { "epoch": 6.994609164420485, "grad_norm": 3.796875, "learning_rate": 9.635510089388985e-05, "loss": 4.5393, "step": 15570 }, { "epoch": 6.99685534591195, "grad_norm": 3.65625, "learning_rate": 9.626499464980596e-05, "loss": 4.5064, "step": 15575 }, { "epoch": 6.999101527403414, "grad_norm": 3.765625, "learning_rate": 9.617492972367731e-05, "loss": 4.4867, "step": 15580 }, { "epoch": 7.001347708894879, "grad_norm": 3.859375, "learning_rate": 9.608490616964378e-05, "loss": 4.5065, "step": 15585 }, { "epoch": 7.003593890386343, "grad_norm": 3.65625, "learning_rate": 9.599492404182018e-05, "loss": 4.5735, "step": 15590 }, { "epoch": 7.005840071877808, "grad_norm": 3.875, "learning_rate": 9.590498339429659e-05, "loss": 4.4286, "step": 15595 }, { "epoch": 7.008086253369272, "grad_norm": 4.0625, "learning_rate": 9.581508428113803e-05, "loss": 4.4728, "step": 15600 }, { "epoch": 7.010332434860737, "grad_norm": 3.875, "learning_rate": 9.572522675638465e-05, "loss": 4.4923, "step": 15605 }, { "epoch": 7.012578616352202, "grad_norm": 3.875, "learning_rate": 9.56354108740516e-05, "loss": 4.509, "step": 15610 }, { "epoch": 7.014824797843666, "grad_norm": 3.75, "learning_rate": 9.554563668812888e-05, "loss": 4.5017, "step": 15615 }, { "epoch": 7.017070979335131, "grad_norm": 3.96875, "learning_rate": 9.545590425258161e-05, "loss": 4.4987, "step": 15620 }, { "epoch": 7.019317160826595, "grad_norm": 3.875, "learning_rate": 9.536621362134961e-05, "loss": 4.481, "step": 15625 }, { "epoch": 7.02156334231806, "grad_norm": 3.90625, "learning_rate": 9.527656484834776e-05, "loss": 4.5342, "step": 15630 }, { "epoch": 7.023809523809524, "grad_norm": 3.75, "learning_rate": 9.51869579874656e-05, "loss": 4.5017, "step": 15635 }, { "epoch": 7.026055705300989, "grad_norm": 3.875, "learning_rate": 9.50973930925676e-05, "loss": 4.4833, "step": 15640 }, { "epoch": 7.028301886792453, "grad_norm": 3.90625, "learning_rate": 9.500787021749303e-05, "loss": 4.5442, "step": 15645 }, { "epoch": 7.030548068283918, "grad_norm": 3.671875, "learning_rate": 9.491838941605575e-05, "loss": 4.4307, "step": 15650 }, { "epoch": 7.032794249775382, "grad_norm": 4.0625, "learning_rate": 9.482895074204451e-05, "loss": 4.4837, "step": 15655 }, { "epoch": 7.035040431266847, "grad_norm": 3.578125, "learning_rate": 9.473955424922253e-05, "loss": 4.5033, "step": 15660 }, { "epoch": 7.037286612758311, "grad_norm": 3.71875, "learning_rate": 9.465019999132792e-05, "loss": 4.5645, "step": 15665 }, { "epoch": 7.039532794249776, "grad_norm": 3.828125, "learning_rate": 9.456088802207314e-05, "loss": 4.476, "step": 15670 }, { "epoch": 7.04177897574124, "grad_norm": 3.984375, "learning_rate": 9.447161839514545e-05, "loss": 4.4451, "step": 15675 }, { "epoch": 7.044025157232705, "grad_norm": 3.640625, "learning_rate": 9.43823911642066e-05, "loss": 4.4924, "step": 15680 }, { "epoch": 7.046271338724169, "grad_norm": 3.78125, "learning_rate": 9.42932063828927e-05, "loss": 4.5004, "step": 15685 }, { "epoch": 7.048517520215634, "grad_norm": 4.0, "learning_rate": 9.420406410481456e-05, "loss": 4.522, "step": 15690 }, { "epoch": 7.050763701707098, "grad_norm": 3.578125, "learning_rate": 9.411496438355735e-05, "loss": 4.5235, "step": 15695 }, { "epoch": 7.053009883198563, "grad_norm": 4.65625, "learning_rate": 9.402590727268055e-05, "loss": 4.4144, "step": 15700 }, { "epoch": 7.055256064690027, "grad_norm": 4.0625, "learning_rate": 9.393689282571825e-05, "loss": 4.4762, "step": 15705 }, { "epoch": 7.057502246181492, "grad_norm": 3.9375, "learning_rate": 9.384792109617868e-05, "loss": 4.4985, "step": 15710 }, { "epoch": 7.059748427672956, "grad_norm": 4.125, "learning_rate": 9.375899213754453e-05, "loss": 4.4447, "step": 15715 }, { "epoch": 7.061994609164421, "grad_norm": 3.53125, "learning_rate": 9.36701060032728e-05, "loss": 4.4487, "step": 15720 }, { "epoch": 7.064240790655885, "grad_norm": 4.1875, "learning_rate": 9.358126274679453e-05, "loss": 4.4904, "step": 15725 }, { "epoch": 7.0664869721473496, "grad_norm": 3.765625, "learning_rate": 9.349246242151532e-05, "loss": 4.5504, "step": 15730 }, { "epoch": 7.068733153638814, "grad_norm": 3.875, "learning_rate": 9.340370508081463e-05, "loss": 4.5029, "step": 15735 }, { "epoch": 7.0709793351302785, "grad_norm": 3.84375, "learning_rate": 9.331499077804634e-05, "loss": 4.459, "step": 15740 }, { "epoch": 7.0732255166217435, "grad_norm": 3.796875, "learning_rate": 9.322631956653825e-05, "loss": 4.4855, "step": 15745 }, { "epoch": 7.0754716981132075, "grad_norm": 3.703125, "learning_rate": 9.31376914995924e-05, "loss": 4.4964, "step": 15750 }, { "epoch": 7.0777178796046725, "grad_norm": 3.390625, "learning_rate": 9.304910663048491e-05, "loss": 4.49, "step": 15755 }, { "epoch": 7.0799640610961365, "grad_norm": 4.0, "learning_rate": 9.296056501246579e-05, "loss": 4.5372, "step": 15760 }, { "epoch": 7.0822102425876015, "grad_norm": 3.796875, "learning_rate": 9.287206669875926e-05, "loss": 4.4935, "step": 15765 }, { "epoch": 7.0844564240790655, "grad_norm": 3.90625, "learning_rate": 9.27836117425632e-05, "loss": 4.455, "step": 15770 }, { "epoch": 7.0867026055705304, "grad_norm": 3.84375, "learning_rate": 9.26952001970498e-05, "loss": 4.4762, "step": 15775 }, { "epoch": 7.0889487870619945, "grad_norm": 3.765625, "learning_rate": 9.260683211536484e-05, "loss": 4.4771, "step": 15780 }, { "epoch": 7.091194968553459, "grad_norm": 3.828125, "learning_rate": 9.251850755062811e-05, "loss": 4.4781, "step": 15785 }, { "epoch": 7.0934411500449235, "grad_norm": 3.71875, "learning_rate": 9.243022655593334e-05, "loss": 4.4911, "step": 15790 }, { "epoch": 7.095687331536388, "grad_norm": 3.65625, "learning_rate": 9.234198918434785e-05, "loss": 4.4318, "step": 15795 }, { "epoch": 7.0979335130278525, "grad_norm": 3.703125, "learning_rate": 9.225379548891291e-05, "loss": 4.4665, "step": 15800 }, { "epoch": 7.100179694519317, "grad_norm": 3.921875, "learning_rate": 9.216564552264343e-05, "loss": 4.4929, "step": 15805 }, { "epoch": 7.1024258760107815, "grad_norm": 4.09375, "learning_rate": 9.207753933852811e-05, "loss": 4.4802, "step": 15810 }, { "epoch": 7.104672057502246, "grad_norm": 3.984375, "learning_rate": 9.198947698952933e-05, "loss": 4.4331, "step": 15815 }, { "epoch": 7.1069182389937104, "grad_norm": 3.546875, "learning_rate": 9.190145852858297e-05, "loss": 4.4975, "step": 15820 }, { "epoch": 7.109164420485175, "grad_norm": 4.0, "learning_rate": 9.181348400859882e-05, "loss": 4.4677, "step": 15825 }, { "epoch": 7.111410601976639, "grad_norm": 4.03125, "learning_rate": 9.172555348245992e-05, "loss": 4.4449, "step": 15830 }, { "epoch": 7.113656783468104, "grad_norm": 3.765625, "learning_rate": 9.163766700302316e-05, "loss": 4.4773, "step": 15835 }, { "epoch": 7.115902964959568, "grad_norm": 3.6875, "learning_rate": 9.15498246231187e-05, "loss": 4.4561, "step": 15840 }, { "epoch": 7.118149146451033, "grad_norm": 3.96875, "learning_rate": 9.146202639555036e-05, "loss": 4.4665, "step": 15845 }, { "epoch": 7.120395327942497, "grad_norm": 4.0625, "learning_rate": 9.137427237309552e-05, "loss": 4.5028, "step": 15850 }, { "epoch": 7.122641509433962, "grad_norm": 3.796875, "learning_rate": 9.128656260850459e-05, "loss": 4.568, "step": 15855 }, { "epoch": 7.124887690925426, "grad_norm": 3.921875, "learning_rate": 9.119889715450172e-05, "loss": 4.4987, "step": 15860 }, { "epoch": 7.127133872416891, "grad_norm": 4.03125, "learning_rate": 9.111127606378437e-05, "loss": 4.4623, "step": 15865 }, { "epoch": 7.129380053908355, "grad_norm": 3.703125, "learning_rate": 9.102369938902324e-05, "loss": 4.4772, "step": 15870 }, { "epoch": 7.13162623539982, "grad_norm": 3.984375, "learning_rate": 9.093616718286244e-05, "loss": 4.446, "step": 15875 }, { "epoch": 7.133872416891284, "grad_norm": 3.75, "learning_rate": 9.084867949791923e-05, "loss": 4.4265, "step": 15880 }, { "epoch": 7.136118598382749, "grad_norm": 3.5625, "learning_rate": 9.07612363867842e-05, "loss": 4.4706, "step": 15885 }, { "epoch": 7.138364779874214, "grad_norm": 3.640625, "learning_rate": 9.067383790202109e-05, "loss": 4.5364, "step": 15890 }, { "epoch": 7.140610961365678, "grad_norm": 3.84375, "learning_rate": 9.058648409616683e-05, "loss": 4.5555, "step": 15895 }, { "epoch": 7.142857142857143, "grad_norm": 3.765625, "learning_rate": 9.049917502173158e-05, "loss": 4.4759, "step": 15900 }, { "epoch": 7.145103324348607, "grad_norm": 3.6875, "learning_rate": 9.041191073119844e-05, "loss": 4.4862, "step": 15905 }, { "epoch": 7.147349505840072, "grad_norm": 4.1875, "learning_rate": 9.032469127702375e-05, "loss": 4.4843, "step": 15910 }, { "epoch": 7.149595687331536, "grad_norm": 3.8125, "learning_rate": 9.023751671163673e-05, "loss": 4.4575, "step": 15915 }, { "epoch": 7.151841868823001, "grad_norm": 4.0625, "learning_rate": 9.015038708743986e-05, "loss": 4.4978, "step": 15920 }, { "epoch": 7.154088050314465, "grad_norm": 3.6875, "learning_rate": 9.00633024568083e-05, "loss": 4.4962, "step": 15925 }, { "epoch": 7.15633423180593, "grad_norm": 3.75, "learning_rate": 8.997626287209041e-05, "loss": 4.4657, "step": 15930 }, { "epoch": 7.158580413297394, "grad_norm": 4.03125, "learning_rate": 8.988926838560742e-05, "loss": 4.4657, "step": 15935 }, { "epoch": 7.160826594788859, "grad_norm": 3.78125, "learning_rate": 8.980231904965333e-05, "loss": 4.5124, "step": 15940 }, { "epoch": 7.163072776280323, "grad_norm": 3.671875, "learning_rate": 8.971541491649518e-05, "loss": 4.5223, "step": 15945 }, { "epoch": 7.165318957771788, "grad_norm": 3.859375, "learning_rate": 8.962855603837264e-05, "loss": 4.4444, "step": 15950 }, { "epoch": 7.167565139263252, "grad_norm": 3.765625, "learning_rate": 8.954174246749835e-05, "loss": 4.4397, "step": 15955 }, { "epoch": 7.169811320754717, "grad_norm": 3.875, "learning_rate": 8.945497425605765e-05, "loss": 4.4983, "step": 15960 }, { "epoch": 7.172057502246181, "grad_norm": 4.03125, "learning_rate": 8.936825145620855e-05, "loss": 4.4257, "step": 15965 }, { "epoch": 7.174303683737646, "grad_norm": 3.734375, "learning_rate": 8.92815741200819e-05, "loss": 4.4325, "step": 15970 }, { "epoch": 7.17654986522911, "grad_norm": 3.75, "learning_rate": 8.919494229978106e-05, "loss": 4.5093, "step": 15975 }, { "epoch": 7.178796046720575, "grad_norm": 4.03125, "learning_rate": 8.910835604738218e-05, "loss": 4.4844, "step": 15980 }, { "epoch": 7.181042228212039, "grad_norm": 3.8125, "learning_rate": 8.902181541493386e-05, "loss": 4.4911, "step": 15985 }, { "epoch": 7.183288409703504, "grad_norm": 4.15625, "learning_rate": 8.893532045445743e-05, "loss": 4.5168, "step": 15990 }, { "epoch": 7.185534591194968, "grad_norm": 3.796875, "learning_rate": 8.884887121794674e-05, "loss": 4.5234, "step": 15995 }, { "epoch": 7.187780772686433, "grad_norm": 3.828125, "learning_rate": 8.876246775736802e-05, "loss": 4.5854, "step": 16000 }, { "epoch": 7.187780772686433, "eval_loss": 4.78458833694458, "eval_runtime": 16.0382, "eval_samples_per_second": 1933.692, "eval_steps_per_second": 241.735, "step": 16000 }, { "epoch": 7.190026954177897, "grad_norm": 3.921875, "learning_rate": 8.867611012466018e-05, "loss": 4.4497, "step": 16005 }, { "epoch": 7.192273135669362, "grad_norm": 3.8125, "learning_rate": 8.85897983717344e-05, "loss": 4.5458, "step": 16010 }, { "epoch": 7.194519317160827, "grad_norm": 3.640625, "learning_rate": 8.850353255047437e-05, "loss": 4.5325, "step": 16015 }, { "epoch": 7.196765498652291, "grad_norm": 3.90625, "learning_rate": 8.841731271273623e-05, "loss": 4.5205, "step": 16020 }, { "epoch": 7.199011680143756, "grad_norm": 3.875, "learning_rate": 8.833113891034832e-05, "loss": 4.4578, "step": 16025 }, { "epoch": 7.20125786163522, "grad_norm": 3.953125, "learning_rate": 8.824501119511147e-05, "loss": 4.4487, "step": 16030 }, { "epoch": 7.203504043126685, "grad_norm": 3.875, "learning_rate": 8.815892961879865e-05, "loss": 4.4484, "step": 16035 }, { "epoch": 7.205750224618149, "grad_norm": 4.09375, "learning_rate": 8.807289423315524e-05, "loss": 4.4684, "step": 16040 }, { "epoch": 7.207996406109614, "grad_norm": 4.125, "learning_rate": 8.798690508989883e-05, "loss": 4.4769, "step": 16045 }, { "epoch": 7.210242587601078, "grad_norm": 3.921875, "learning_rate": 8.790096224071905e-05, "loss": 4.4799, "step": 16050 }, { "epoch": 7.212488769092543, "grad_norm": 3.59375, "learning_rate": 8.781506573727798e-05, "loss": 4.4449, "step": 16055 }, { "epoch": 7.214734950584007, "grad_norm": 3.859375, "learning_rate": 8.772921563120957e-05, "loss": 4.5143, "step": 16060 }, { "epoch": 7.216981132075472, "grad_norm": 3.5, "learning_rate": 8.764341197412002e-05, "loss": 4.5065, "step": 16065 }, { "epoch": 7.219227313566936, "grad_norm": 3.890625, "learning_rate": 8.755765481758765e-05, "loss": 4.4876, "step": 16070 }, { "epoch": 7.221473495058401, "grad_norm": 3.9375, "learning_rate": 8.747194421316264e-05, "loss": 4.5722, "step": 16075 }, { "epoch": 7.223719676549865, "grad_norm": 3.515625, "learning_rate": 8.738628021236748e-05, "loss": 4.5264, "step": 16080 }, { "epoch": 7.22596585804133, "grad_norm": 3.703125, "learning_rate": 8.730066286669631e-05, "loss": 4.4706, "step": 16085 }, { "epoch": 7.228212039532794, "grad_norm": 3.75, "learning_rate": 8.721509222761553e-05, "loss": 4.4608, "step": 16090 }, { "epoch": 7.230458221024259, "grad_norm": 3.953125, "learning_rate": 8.712956834656318e-05, "loss": 4.5088, "step": 16095 }, { "epoch": 7.232704402515723, "grad_norm": 4.09375, "learning_rate": 8.704409127494942e-05, "loss": 4.4941, "step": 16100 }, { "epoch": 7.234950584007188, "grad_norm": 3.96875, "learning_rate": 8.695866106415623e-05, "loss": 4.5131, "step": 16105 }, { "epoch": 7.237196765498652, "grad_norm": 3.984375, "learning_rate": 8.687327776553726e-05, "loss": 4.5158, "step": 16110 }, { "epoch": 7.239442946990117, "grad_norm": 4.03125, "learning_rate": 8.678794143041821e-05, "loss": 4.5603, "step": 16115 }, { "epoch": 7.241689128481581, "grad_norm": 3.6875, "learning_rate": 8.670265211009633e-05, "loss": 4.5157, "step": 16120 }, { "epoch": 7.243935309973046, "grad_norm": 4.09375, "learning_rate": 8.661740985584074e-05, "loss": 4.541, "step": 16125 }, { "epoch": 7.24618149146451, "grad_norm": 3.953125, "learning_rate": 8.653221471889221e-05, "loss": 4.5027, "step": 16130 }, { "epoch": 7.248427672955975, "grad_norm": 4.03125, "learning_rate": 8.644706675046313e-05, "loss": 4.447, "step": 16135 }, { "epoch": 7.250673854447439, "grad_norm": 4.15625, "learning_rate": 8.63619660017378e-05, "loss": 4.5064, "step": 16140 }, { "epoch": 7.252920035938904, "grad_norm": 3.96875, "learning_rate": 8.627691252387174e-05, "loss": 4.5067, "step": 16145 }, { "epoch": 7.255166217430368, "grad_norm": 4.0, "learning_rate": 8.61919063679924e-05, "loss": 4.483, "step": 16150 }, { "epoch": 7.257412398921833, "grad_norm": 3.65625, "learning_rate": 8.610694758519852e-05, "loss": 4.5225, "step": 16155 }, { "epoch": 7.259658580413298, "grad_norm": 3.65625, "learning_rate": 8.602203622656055e-05, "loss": 4.4459, "step": 16160 }, { "epoch": 7.261904761904762, "grad_norm": 3.78125, "learning_rate": 8.593717234312045e-05, "loss": 4.4964, "step": 16165 }, { "epoch": 7.264150943396227, "grad_norm": 3.515625, "learning_rate": 8.585235598589144e-05, "loss": 4.5308, "step": 16170 }, { "epoch": 7.266397124887691, "grad_norm": 3.640625, "learning_rate": 8.576758720585835e-05, "loss": 4.4536, "step": 16175 }, { "epoch": 7.268643306379156, "grad_norm": 3.765625, "learning_rate": 8.568286605397726e-05, "loss": 4.4493, "step": 16180 }, { "epoch": 7.27088948787062, "grad_norm": 3.875, "learning_rate": 8.559819258117578e-05, "loss": 4.5189, "step": 16185 }, { "epoch": 7.273135669362085, "grad_norm": 4.03125, "learning_rate": 8.551356683835285e-05, "loss": 4.5442, "step": 16190 }, { "epoch": 7.275381850853549, "grad_norm": 3.59375, "learning_rate": 8.542898887637855e-05, "loss": 4.454, "step": 16195 }, { "epoch": 7.277628032345014, "grad_norm": 4.03125, "learning_rate": 8.53444587460944e-05, "loss": 4.4647, "step": 16200 }, { "epoch": 7.279874213836478, "grad_norm": 3.78125, "learning_rate": 8.52599764983131e-05, "loss": 4.4834, "step": 16205 }, { "epoch": 7.282120395327943, "grad_norm": 3.84375, "learning_rate": 8.517554218381856e-05, "loss": 4.4616, "step": 16210 }, { "epoch": 7.284366576819407, "grad_norm": 3.84375, "learning_rate": 8.509115585336598e-05, "loss": 4.4305, "step": 16215 }, { "epoch": 7.286612758310872, "grad_norm": 3.859375, "learning_rate": 8.500681755768151e-05, "loss": 4.4802, "step": 16220 }, { "epoch": 7.288858939802336, "grad_norm": 3.90625, "learning_rate": 8.492252734746268e-05, "loss": 4.4653, "step": 16225 }, { "epoch": 7.291105121293801, "grad_norm": 3.90625, "learning_rate": 8.483828527337787e-05, "loss": 4.5672, "step": 16230 }, { "epoch": 7.293351302785265, "grad_norm": 3.9375, "learning_rate": 8.47540913860667e-05, "loss": 4.4814, "step": 16235 }, { "epoch": 7.29559748427673, "grad_norm": 4.0625, "learning_rate": 8.466994573613974e-05, "loss": 4.5198, "step": 16240 }, { "epoch": 7.297843665768194, "grad_norm": 3.578125, "learning_rate": 8.458584837417858e-05, "loss": 4.4652, "step": 16245 }, { "epoch": 7.300089847259659, "grad_norm": 3.9375, "learning_rate": 8.450179935073583e-05, "loss": 4.5406, "step": 16250 }, { "epoch": 7.302336028751123, "grad_norm": 4.0625, "learning_rate": 8.441779871633491e-05, "loss": 4.4671, "step": 16255 }, { "epoch": 7.304582210242588, "grad_norm": 4.09375, "learning_rate": 8.433384652147037e-05, "loss": 4.5091, "step": 16260 }, { "epoch": 7.306828391734052, "grad_norm": 3.859375, "learning_rate": 8.424994281660739e-05, "loss": 4.5249, "step": 16265 }, { "epoch": 7.309074573225517, "grad_norm": 3.9375, "learning_rate": 8.416608765218223e-05, "loss": 4.4371, "step": 16270 }, { "epoch": 7.311320754716981, "grad_norm": 4.03125, "learning_rate": 8.40822810786018e-05, "loss": 4.5291, "step": 16275 }, { "epoch": 7.313566936208446, "grad_norm": 3.859375, "learning_rate": 8.399852314624385e-05, "loss": 4.4864, "step": 16280 }, { "epoch": 7.315813117699911, "grad_norm": 4.125, "learning_rate": 8.391481390545704e-05, "loss": 4.452, "step": 16285 }, { "epoch": 7.318059299191375, "grad_norm": 4.15625, "learning_rate": 8.383115340656048e-05, "loss": 4.5127, "step": 16290 }, { "epoch": 7.320305480682839, "grad_norm": 3.78125, "learning_rate": 8.374754169984422e-05, "loss": 4.5017, "step": 16295 }, { "epoch": 7.322551662174304, "grad_norm": 4.0625, "learning_rate": 8.366397883556883e-05, "loss": 4.4903, "step": 16300 }, { "epoch": 7.324797843665769, "grad_norm": 3.875, "learning_rate": 8.358046486396564e-05, "loss": 4.5602, "step": 16305 }, { "epoch": 7.327044025157233, "grad_norm": 3.875, "learning_rate": 8.349699983523654e-05, "loss": 4.5111, "step": 16310 }, { "epoch": 7.329290206648698, "grad_norm": 4.0625, "learning_rate": 8.341358379955392e-05, "loss": 4.4737, "step": 16315 }, { "epoch": 7.331536388140162, "grad_norm": 3.703125, "learning_rate": 8.333021680706085e-05, "loss": 4.4634, "step": 16320 }, { "epoch": 7.333782569631627, "grad_norm": 3.84375, "learning_rate": 8.324689890787086e-05, "loss": 4.548, "step": 16325 }, { "epoch": 7.336028751123091, "grad_norm": 3.921875, "learning_rate": 8.316363015206787e-05, "loss": 4.4421, "step": 16330 }, { "epoch": 7.3382749326145555, "grad_norm": 3.828125, "learning_rate": 8.30804105897065e-05, "loss": 4.5086, "step": 16335 }, { "epoch": 7.34052111410602, "grad_norm": 4.0, "learning_rate": 8.299724027081154e-05, "loss": 4.4927, "step": 16340 }, { "epoch": 7.3427672955974845, "grad_norm": 3.9375, "learning_rate": 8.291411924537838e-05, "loss": 4.4961, "step": 16345 }, { "epoch": 7.345013477088949, "grad_norm": 3.65625, "learning_rate": 8.283104756337261e-05, "loss": 4.4941, "step": 16350 }, { "epoch": 7.3472596585804135, "grad_norm": 3.578125, "learning_rate": 8.274802527473027e-05, "loss": 4.4502, "step": 16355 }, { "epoch": 7.349505840071878, "grad_norm": 4.03125, "learning_rate": 8.266505242935777e-05, "loss": 4.4218, "step": 16360 }, { "epoch": 7.3517520215633425, "grad_norm": 3.765625, "learning_rate": 8.258212907713158e-05, "loss": 4.4552, "step": 16365 }, { "epoch": 7.353998203054807, "grad_norm": 3.828125, "learning_rate": 8.249925526789864e-05, "loss": 4.4254, "step": 16370 }, { "epoch": 7.3562443845462715, "grad_norm": 4.09375, "learning_rate": 8.241643105147594e-05, "loss": 4.4798, "step": 16375 }, { "epoch": 7.3584905660377355, "grad_norm": 4.03125, "learning_rate": 8.233365647765082e-05, "loss": 4.5004, "step": 16380 }, { "epoch": 7.3607367475292005, "grad_norm": 3.828125, "learning_rate": 8.225093159618059e-05, "loss": 4.4728, "step": 16385 }, { "epoch": 7.3629829290206645, "grad_norm": 3.75, "learning_rate": 8.216825645679288e-05, "loss": 4.5282, "step": 16390 }, { "epoch": 7.3652291105121295, "grad_norm": 4.125, "learning_rate": 8.208563110918534e-05, "loss": 4.4876, "step": 16395 }, { "epoch": 7.3674752920035935, "grad_norm": 3.75, "learning_rate": 8.20030556030256e-05, "loss": 4.4282, "step": 16400 }, { "epoch": 7.3697214734950585, "grad_norm": 3.890625, "learning_rate": 8.192052998795149e-05, "loss": 4.4644, "step": 16405 }, { "epoch": 7.3719676549865225, "grad_norm": 3.84375, "learning_rate": 8.18380543135707e-05, "loss": 4.5372, "step": 16410 }, { "epoch": 7.3742138364779874, "grad_norm": 3.65625, "learning_rate": 8.175562862946102e-05, "loss": 4.4788, "step": 16415 }, { "epoch": 7.3764600179694515, "grad_norm": 3.984375, "learning_rate": 8.167325298517015e-05, "loss": 4.5196, "step": 16420 }, { "epoch": 7.378706199460916, "grad_norm": 3.625, "learning_rate": 8.159092743021566e-05, "loss": 4.5199, "step": 16425 }, { "epoch": 7.380952380952381, "grad_norm": 3.90625, "learning_rate": 8.15086520140851e-05, "loss": 4.4835, "step": 16430 }, { "epoch": 7.383198562443845, "grad_norm": 4.25, "learning_rate": 8.142642678623576e-05, "loss": 4.4979, "step": 16435 }, { "epoch": 7.38544474393531, "grad_norm": 3.875, "learning_rate": 8.134425179609489e-05, "loss": 4.4895, "step": 16440 }, { "epoch": 7.387690925426774, "grad_norm": 4.09375, "learning_rate": 8.126212709305946e-05, "loss": 4.4258, "step": 16445 }, { "epoch": 7.389937106918239, "grad_norm": 3.9375, "learning_rate": 8.118005272649622e-05, "loss": 4.4786, "step": 16450 }, { "epoch": 7.392183288409703, "grad_norm": 3.734375, "learning_rate": 8.109802874574171e-05, "loss": 4.4982, "step": 16455 }, { "epoch": 7.394429469901168, "grad_norm": 3.796875, "learning_rate": 8.101605520010212e-05, "loss": 4.5045, "step": 16460 }, { "epoch": 7.396675651392632, "grad_norm": 3.765625, "learning_rate": 8.09341321388534e-05, "loss": 4.5082, "step": 16465 }, { "epoch": 7.398921832884097, "grad_norm": 4.0, "learning_rate": 8.0852259611241e-05, "loss": 4.4946, "step": 16470 }, { "epoch": 7.401168014375561, "grad_norm": 3.96875, "learning_rate": 8.077043766648025e-05, "loss": 4.4921, "step": 16475 }, { "epoch": 7.403414195867026, "grad_norm": 4.34375, "learning_rate": 8.068866635375575e-05, "loss": 4.4717, "step": 16480 }, { "epoch": 7.40566037735849, "grad_norm": 3.78125, "learning_rate": 8.060694572222198e-05, "loss": 4.4662, "step": 16485 }, { "epoch": 7.407906558849955, "grad_norm": 3.828125, "learning_rate": 8.052527582100275e-05, "loss": 4.4932, "step": 16490 }, { "epoch": 7.410152740341419, "grad_norm": 3.734375, "learning_rate": 8.044365669919137e-05, "loss": 4.4941, "step": 16495 }, { "epoch": 7.412398921832884, "grad_norm": 3.75, "learning_rate": 8.036208840585076e-05, "loss": 4.5339, "step": 16500 }, { "epoch": 7.414645103324348, "grad_norm": 4.03125, "learning_rate": 8.028057099001324e-05, "loss": 4.4626, "step": 16505 }, { "epoch": 7.416891284815813, "grad_norm": 3.71875, "learning_rate": 8.019910450068046e-05, "loss": 4.565, "step": 16510 }, { "epoch": 7.419137466307277, "grad_norm": 3.953125, "learning_rate": 8.011768898682357e-05, "loss": 4.538, "step": 16515 }, { "epoch": 7.421383647798742, "grad_norm": 4.0, "learning_rate": 8.003632449738297e-05, "loss": 4.4717, "step": 16520 }, { "epoch": 7.423629829290206, "grad_norm": 3.984375, "learning_rate": 7.995501108126851e-05, "loss": 4.4681, "step": 16525 }, { "epoch": 7.425876010781671, "grad_norm": 3.84375, "learning_rate": 7.987374878735922e-05, "loss": 4.4855, "step": 16530 }, { "epoch": 7.428122192273135, "grad_norm": 3.796875, "learning_rate": 7.979253766450347e-05, "loss": 4.4463, "step": 16535 }, { "epoch": 7.4303683737646, "grad_norm": 3.921875, "learning_rate": 7.971137776151891e-05, "loss": 4.4767, "step": 16540 }, { "epoch": 7.432614555256064, "grad_norm": 3.796875, "learning_rate": 7.963026912719223e-05, "loss": 4.5125, "step": 16545 }, { "epoch": 7.434860736747529, "grad_norm": 3.828125, "learning_rate": 7.954921181027953e-05, "loss": 4.4621, "step": 16550 }, { "epoch": 7.437106918238994, "grad_norm": 4.03125, "learning_rate": 7.946820585950587e-05, "loss": 4.4562, "step": 16555 }, { "epoch": 7.439353099730458, "grad_norm": 3.703125, "learning_rate": 7.938725132356549e-05, "loss": 4.4049, "step": 16560 }, { "epoch": 7.441599281221922, "grad_norm": 3.84375, "learning_rate": 7.930634825112187e-05, "loss": 4.4527, "step": 16565 }, { "epoch": 7.443845462713387, "grad_norm": 3.953125, "learning_rate": 7.92254966908073e-05, "loss": 4.5469, "step": 16570 }, { "epoch": 7.446091644204852, "grad_norm": 3.953125, "learning_rate": 7.914469669122331e-05, "loss": 4.5527, "step": 16575 }, { "epoch": 7.448337825696316, "grad_norm": 3.890625, "learning_rate": 7.906394830094031e-05, "loss": 4.4505, "step": 16580 }, { "epoch": 7.450584007187781, "grad_norm": 3.78125, "learning_rate": 7.898325156849779e-05, "loss": 4.5039, "step": 16585 }, { "epoch": 7.452830188679245, "grad_norm": 3.953125, "learning_rate": 7.890260654240407e-05, "loss": 4.4809, "step": 16590 }, { "epoch": 7.45507637017071, "grad_norm": 3.890625, "learning_rate": 7.882201327113644e-05, "loss": 4.4749, "step": 16595 }, { "epoch": 7.457322551662174, "grad_norm": 3.828125, "learning_rate": 7.87414718031412e-05, "loss": 4.447, "step": 16600 }, { "epoch": 7.459568733153639, "grad_norm": 3.859375, "learning_rate": 7.86609821868333e-05, "loss": 4.5102, "step": 16605 }, { "epoch": 7.461814914645103, "grad_norm": 3.703125, "learning_rate": 7.858054447059671e-05, "loss": 4.4386, "step": 16610 }, { "epoch": 7.464061096136568, "grad_norm": 3.921875, "learning_rate": 7.850015870278398e-05, "loss": 4.5193, "step": 16615 }, { "epoch": 7.466307277628032, "grad_norm": 4.0625, "learning_rate": 7.841982493171671e-05, "loss": 4.4742, "step": 16620 }, { "epoch": 7.468553459119497, "grad_norm": 3.953125, "learning_rate": 7.833954320568498e-05, "loss": 4.5049, "step": 16625 }, { "epoch": 7.470799640610961, "grad_norm": 3.65625, "learning_rate": 7.825931357294777e-05, "loss": 4.5168, "step": 16630 }, { "epoch": 7.473045822102426, "grad_norm": 3.921875, "learning_rate": 7.81791360817327e-05, "loss": 4.4544, "step": 16635 }, { "epoch": 7.47529200359389, "grad_norm": 3.828125, "learning_rate": 7.809901078023598e-05, "loss": 4.4673, "step": 16640 }, { "epoch": 7.477538185085355, "grad_norm": 3.703125, "learning_rate": 7.801893771662253e-05, "loss": 4.5267, "step": 16645 }, { "epoch": 7.479784366576819, "grad_norm": 3.765625, "learning_rate": 7.793891693902582e-05, "loss": 4.4746, "step": 16650 }, { "epoch": 7.482030548068284, "grad_norm": 3.578125, "learning_rate": 7.785894849554785e-05, "loss": 4.4841, "step": 16655 }, { "epoch": 7.484276729559748, "grad_norm": 4.0, "learning_rate": 7.777903243425933e-05, "loss": 4.4935, "step": 16660 }, { "epoch": 7.486522911051213, "grad_norm": 3.796875, "learning_rate": 7.769916880319925e-05, "loss": 4.403, "step": 16665 }, { "epoch": 7.488769092542677, "grad_norm": 4.15625, "learning_rate": 7.761935765037527e-05, "loss": 4.5541, "step": 16670 }, { "epoch": 7.491015274034142, "grad_norm": 3.71875, "learning_rate": 7.753959902376338e-05, "loss": 4.5181, "step": 16675 }, { "epoch": 7.493261455525606, "grad_norm": 3.859375, "learning_rate": 7.745989297130808e-05, "loss": 4.4491, "step": 16680 }, { "epoch": 7.495507637017071, "grad_norm": 3.71875, "learning_rate": 7.738023954092229e-05, "loss": 4.4856, "step": 16685 }, { "epoch": 7.497753818508535, "grad_norm": 3.890625, "learning_rate": 7.730063878048717e-05, "loss": 4.5341, "step": 16690 }, { "epoch": 7.5, "grad_norm": 4.125, "learning_rate": 7.722109073785234e-05, "loss": 4.4758, "step": 16695 }, { "epoch": 7.502246181491465, "grad_norm": 3.796875, "learning_rate": 7.71415954608356e-05, "loss": 4.5248, "step": 16700 }, { "epoch": 7.504492362982929, "grad_norm": 4.03125, "learning_rate": 7.706215299722321e-05, "loss": 4.4835, "step": 16705 }, { "epoch": 7.506738544474393, "grad_norm": 3.953125, "learning_rate": 7.698276339476957e-05, "loss": 4.511, "step": 16710 }, { "epoch": 7.508984725965858, "grad_norm": 3.59375, "learning_rate": 7.690342670119726e-05, "loss": 4.4425, "step": 16715 }, { "epoch": 7.511230907457323, "grad_norm": 4.0625, "learning_rate": 7.682414296419724e-05, "loss": 4.4964, "step": 16720 }, { "epoch": 7.513477088948787, "grad_norm": 4.03125, "learning_rate": 7.674491223142836e-05, "loss": 4.46, "step": 16725 }, { "epoch": 7.515723270440252, "grad_norm": 3.859375, "learning_rate": 7.666573455051789e-05, "loss": 4.4496, "step": 16730 }, { "epoch": 7.517969451931716, "grad_norm": 4.09375, "learning_rate": 7.658660996906097e-05, "loss": 4.5062, "step": 16735 }, { "epoch": 7.520215633423181, "grad_norm": 3.96875, "learning_rate": 7.650753853462101e-05, "loss": 4.4952, "step": 16740 }, { "epoch": 7.522461814914645, "grad_norm": 3.84375, "learning_rate": 7.642852029472939e-05, "loss": 4.5598, "step": 16745 }, { "epoch": 7.52470799640611, "grad_norm": 3.59375, "learning_rate": 7.63495552968855e-05, "loss": 4.4556, "step": 16750 }, { "epoch": 7.526954177897574, "grad_norm": 3.765625, "learning_rate": 7.627064358855677e-05, "loss": 4.4857, "step": 16755 }, { "epoch": 7.529200359389039, "grad_norm": 4.03125, "learning_rate": 7.619178521717853e-05, "loss": 4.4772, "step": 16760 }, { "epoch": 7.531446540880503, "grad_norm": 3.875, "learning_rate": 7.611298023015408e-05, "loss": 4.4325, "step": 16765 }, { "epoch": 7.533692722371968, "grad_norm": 3.890625, "learning_rate": 7.603422867485472e-05, "loss": 4.4809, "step": 16770 }, { "epoch": 7.535938903863432, "grad_norm": 3.9375, "learning_rate": 7.595553059861946e-05, "loss": 4.4866, "step": 16775 }, { "epoch": 7.538185085354897, "grad_norm": 3.953125, "learning_rate": 7.587688604875534e-05, "loss": 4.5252, "step": 16780 }, { "epoch": 7.540431266846361, "grad_norm": 3.828125, "learning_rate": 7.579829507253702e-05, "loss": 4.4775, "step": 16785 }, { "epoch": 7.542677448337826, "grad_norm": 3.90625, "learning_rate": 7.571975771720719e-05, "loss": 4.4986, "step": 16790 }, { "epoch": 7.54492362982929, "grad_norm": 3.796875, "learning_rate": 7.564127402997607e-05, "loss": 4.414, "step": 16795 }, { "epoch": 7.547169811320755, "grad_norm": 3.78125, "learning_rate": 7.556284405802187e-05, "loss": 4.5513, "step": 16800 }, { "epoch": 7.549415992812219, "grad_norm": 3.953125, "learning_rate": 7.548446784849028e-05, "loss": 4.4816, "step": 16805 }, { "epoch": 7.551662174303684, "grad_norm": 3.921875, "learning_rate": 7.54061454484948e-05, "loss": 4.4992, "step": 16810 }, { "epoch": 7.553908355795148, "grad_norm": 3.96875, "learning_rate": 7.532787690511656e-05, "loss": 4.5391, "step": 16815 }, { "epoch": 7.556154537286613, "grad_norm": 3.703125, "learning_rate": 7.524966226540434e-05, "loss": 4.5233, "step": 16820 }, { "epoch": 7.558400718778078, "grad_norm": 3.71875, "learning_rate": 7.51715015763744e-05, "loss": 4.5246, "step": 16825 }, { "epoch": 7.560646900269542, "grad_norm": 3.671875, "learning_rate": 7.509339488501077e-05, "loss": 4.4983, "step": 16830 }, { "epoch": 7.562893081761006, "grad_norm": 3.84375, "learning_rate": 7.501534223826481e-05, "loss": 4.5314, "step": 16835 }, { "epoch": 7.565139263252471, "grad_norm": 4.09375, "learning_rate": 7.49373436830556e-05, "loss": 4.4689, "step": 16840 }, { "epoch": 7.567385444743936, "grad_norm": 3.921875, "learning_rate": 7.485939926626948e-05, "loss": 4.5512, "step": 16845 }, { "epoch": 7.5696316262354, "grad_norm": 3.65625, "learning_rate": 7.478150903476043e-05, "loss": 4.5035, "step": 16850 }, { "epoch": 7.571877807726865, "grad_norm": 3.8125, "learning_rate": 7.47036730353498e-05, "loss": 4.5341, "step": 16855 }, { "epoch": 7.574123989218329, "grad_norm": 4.03125, "learning_rate": 7.462589131482628e-05, "loss": 4.4958, "step": 16860 }, { "epoch": 7.576370170709794, "grad_norm": 4.21875, "learning_rate": 7.454816391994604e-05, "loss": 4.4979, "step": 16865 }, { "epoch": 7.578616352201258, "grad_norm": 4.0, "learning_rate": 7.447049089743247e-05, "loss": 4.4834, "step": 16870 }, { "epoch": 7.580862533692723, "grad_norm": 4.03125, "learning_rate": 7.439287229397642e-05, "loss": 4.5267, "step": 16875 }, { "epoch": 7.583108715184187, "grad_norm": 3.859375, "learning_rate": 7.431530815623586e-05, "loss": 4.4695, "step": 16880 }, { "epoch": 7.585354896675652, "grad_norm": 3.65625, "learning_rate": 7.423779853083618e-05, "loss": 4.5421, "step": 16885 }, { "epoch": 7.587601078167116, "grad_norm": 3.578125, "learning_rate": 7.416034346436994e-05, "loss": 4.4031, "step": 16890 }, { "epoch": 7.589847259658581, "grad_norm": 4.0625, "learning_rate": 7.408294300339682e-05, "loss": 4.477, "step": 16895 }, { "epoch": 7.592093441150045, "grad_norm": 3.984375, "learning_rate": 7.400559719444382e-05, "loss": 4.4463, "step": 16900 }, { "epoch": 7.59433962264151, "grad_norm": 3.578125, "learning_rate": 7.392830608400499e-05, "loss": 4.4661, "step": 16905 }, { "epoch": 7.596585804132974, "grad_norm": 3.796875, "learning_rate": 7.385106971854148e-05, "loss": 4.4802, "step": 16910 }, { "epoch": 7.598831985624439, "grad_norm": 3.75, "learning_rate": 7.37738881444817e-05, "loss": 4.4845, "step": 16915 }, { "epoch": 7.601078167115903, "grad_norm": 3.859375, "learning_rate": 7.369676140822088e-05, "loss": 4.5179, "step": 16920 }, { "epoch": 7.603324348607368, "grad_norm": 4.40625, "learning_rate": 7.361968955612151e-05, "loss": 4.5927, "step": 16925 }, { "epoch": 7.605570530098832, "grad_norm": 4.28125, "learning_rate": 7.354267263451288e-05, "loss": 4.4907, "step": 16930 }, { "epoch": 7.607816711590297, "grad_norm": 3.828125, "learning_rate": 7.346571068969147e-05, "loss": 4.4642, "step": 16935 }, { "epoch": 7.610062893081761, "grad_norm": 3.78125, "learning_rate": 7.338880376792052e-05, "loss": 4.4903, "step": 16940 }, { "epoch": 7.612309074573226, "grad_norm": 3.828125, "learning_rate": 7.331195191543033e-05, "loss": 4.4774, "step": 16945 }, { "epoch": 7.6145552560646905, "grad_norm": 4.0, "learning_rate": 7.323515517841807e-05, "loss": 4.5547, "step": 16950 }, { "epoch": 7.616801437556155, "grad_norm": 4.34375, "learning_rate": 7.315841360304773e-05, "loss": 4.4924, "step": 16955 }, { "epoch": 7.619047619047619, "grad_norm": 3.953125, "learning_rate": 7.308172723545019e-05, "loss": 4.5127, "step": 16960 }, { "epoch": 7.621293800539084, "grad_norm": 3.921875, "learning_rate": 7.300509612172313e-05, "loss": 4.5099, "step": 16965 }, { "epoch": 7.6235399820305485, "grad_norm": 3.78125, "learning_rate": 7.292852030793095e-05, "loss": 4.5229, "step": 16970 }, { "epoch": 7.6257861635220126, "grad_norm": 4.84375, "learning_rate": 7.285199984010494e-05, "loss": 4.5042, "step": 16975 }, { "epoch": 7.628032345013477, "grad_norm": 3.953125, "learning_rate": 7.277553476424299e-05, "loss": 4.4774, "step": 16980 }, { "epoch": 7.6302785265049415, "grad_norm": 3.96875, "learning_rate": 7.26991251263098e-05, "loss": 4.497, "step": 16985 }, { "epoch": 7.6325247079964065, "grad_norm": 3.921875, "learning_rate": 7.262277097223665e-05, "loss": 4.4846, "step": 16990 }, { "epoch": 7.6347708894878705, "grad_norm": 3.890625, "learning_rate": 7.254647234792155e-05, "loss": 4.5027, "step": 16995 }, { "epoch": 7.6370170709793355, "grad_norm": 4.0, "learning_rate": 7.247022929922913e-05, "loss": 4.4799, "step": 17000 }, { "epoch": 7.6370170709793355, "eval_loss": 4.779622554779053, "eval_runtime": 16.0334, "eval_samples_per_second": 1934.274, "eval_steps_per_second": 241.808, "step": 17000 }, { "epoch": 7.6392632524707995, "grad_norm": 3.890625, "learning_rate": 7.239404187199049e-05, "loss": 4.4591, "step": 17005 }, { "epoch": 7.6415094339622645, "grad_norm": 3.890625, "learning_rate": 7.231791011200347e-05, "loss": 4.4489, "step": 17010 }, { "epoch": 7.6437556154537285, "grad_norm": 3.8125, "learning_rate": 7.224183406503228e-05, "loss": 4.4788, "step": 17015 }, { "epoch": 7.646001796945193, "grad_norm": 3.875, "learning_rate": 7.216581377680779e-05, "loss": 4.4854, "step": 17020 }, { "epoch": 7.6482479784366575, "grad_norm": 4.03125, "learning_rate": 7.208984929302719e-05, "loss": 4.4835, "step": 17025 }, { "epoch": 7.650494159928122, "grad_norm": 4.03125, "learning_rate": 7.201394065935427e-05, "loss": 4.4524, "step": 17030 }, { "epoch": 7.6527403414195865, "grad_norm": 4.03125, "learning_rate": 7.193808792141926e-05, "loss": 4.473, "step": 17035 }, { "epoch": 7.654986522911051, "grad_norm": 3.828125, "learning_rate": 7.186229112481861e-05, "loss": 4.4807, "step": 17040 }, { "epoch": 7.6572327044025155, "grad_norm": 3.59375, "learning_rate": 7.178655031511534e-05, "loss": 4.5386, "step": 17045 }, { "epoch": 7.65947888589398, "grad_norm": 3.71875, "learning_rate": 7.171086553783866e-05, "loss": 4.573, "step": 17050 }, { "epoch": 7.6617250673854445, "grad_norm": 3.890625, "learning_rate": 7.163523683848418e-05, "loss": 4.4759, "step": 17055 }, { "epoch": 7.663971248876909, "grad_norm": 3.921875, "learning_rate": 7.155966426251387e-05, "loss": 4.4932, "step": 17060 }, { "epoch": 7.666217430368373, "grad_norm": 3.734375, "learning_rate": 7.14841478553558e-05, "loss": 4.4657, "step": 17065 }, { "epoch": 7.668463611859838, "grad_norm": 3.8125, "learning_rate": 7.140868766240443e-05, "loss": 4.4906, "step": 17070 }, { "epoch": 7.670709793351302, "grad_norm": 4.25, "learning_rate": 7.133328372902025e-05, "loss": 4.5175, "step": 17075 }, { "epoch": 7.672955974842767, "grad_norm": 3.859375, "learning_rate": 7.125793610053015e-05, "loss": 4.5268, "step": 17080 }, { "epoch": 7.675202156334231, "grad_norm": 4.15625, "learning_rate": 7.118264482222697e-05, "loss": 4.5015, "step": 17085 }, { "epoch": 7.677448337825696, "grad_norm": 4.0625, "learning_rate": 7.110740993936981e-05, "loss": 4.4872, "step": 17090 }, { "epoch": 7.679694519317161, "grad_norm": 3.859375, "learning_rate": 7.103223149718387e-05, "loss": 4.4893, "step": 17095 }, { "epoch": 7.681940700808625, "grad_norm": 3.90625, "learning_rate": 7.095710954086032e-05, "loss": 4.4821, "step": 17100 }, { "epoch": 7.684186882300089, "grad_norm": 3.953125, "learning_rate": 7.088204411555647e-05, "loss": 4.515, "step": 17105 }, { "epoch": 7.686433063791554, "grad_norm": 3.75, "learning_rate": 7.080703526639556e-05, "loss": 4.4783, "step": 17110 }, { "epoch": 7.688679245283019, "grad_norm": 4.21875, "learning_rate": 7.073208303846694e-05, "loss": 4.5, "step": 17115 }, { "epoch": 7.690925426774483, "grad_norm": 4.0625, "learning_rate": 7.06571874768259e-05, "loss": 4.5088, "step": 17120 }, { "epoch": 7.693171608265948, "grad_norm": 3.703125, "learning_rate": 7.05823486264935e-05, "loss": 4.3995, "step": 17125 }, { "epoch": 7.695417789757412, "grad_norm": 3.828125, "learning_rate": 7.050756653245693e-05, "loss": 4.4503, "step": 17130 }, { "epoch": 7.697663971248877, "grad_norm": 3.828125, "learning_rate": 7.04328412396691e-05, "loss": 4.5101, "step": 17135 }, { "epoch": 7.699910152740341, "grad_norm": 3.921875, "learning_rate": 7.035817279304888e-05, "loss": 4.5276, "step": 17140 }, { "epoch": 7.702156334231806, "grad_norm": 3.90625, "learning_rate": 7.028356123748097e-05, "loss": 4.5155, "step": 17145 }, { "epoch": 7.70440251572327, "grad_norm": 3.875, "learning_rate": 7.020900661781576e-05, "loss": 4.4453, "step": 17150 }, { "epoch": 7.706648697214735, "grad_norm": 3.828125, "learning_rate": 7.013450897886958e-05, "loss": 4.558, "step": 17155 }, { "epoch": 7.708894878706199, "grad_norm": 3.765625, "learning_rate": 7.006006836542431e-05, "loss": 4.4913, "step": 17160 }, { "epoch": 7.711141060197664, "grad_norm": 3.984375, "learning_rate": 6.998568482222771e-05, "loss": 4.4458, "step": 17165 }, { "epoch": 7.713387241689128, "grad_norm": 3.921875, "learning_rate": 6.991135839399322e-05, "loss": 4.4604, "step": 17170 }, { "epoch": 7.715633423180593, "grad_norm": 3.5625, "learning_rate": 6.983708912539985e-05, "loss": 4.4622, "step": 17175 }, { "epoch": 7.717879604672057, "grad_norm": 3.828125, "learning_rate": 6.976287706109237e-05, "loss": 4.4235, "step": 17180 }, { "epoch": 7.720125786163522, "grad_norm": 3.875, "learning_rate": 6.968872224568103e-05, "loss": 4.5184, "step": 17185 }, { "epoch": 7.722371967654986, "grad_norm": 3.796875, "learning_rate": 6.961462472374179e-05, "loss": 4.5468, "step": 17190 }, { "epoch": 7.724618149146451, "grad_norm": 3.9375, "learning_rate": 6.954058453981609e-05, "loss": 4.4982, "step": 17195 }, { "epoch": 7.726864330637915, "grad_norm": 4.28125, "learning_rate": 6.946660173841093e-05, "loss": 4.4807, "step": 17200 }, { "epoch": 7.72911051212938, "grad_norm": 3.96875, "learning_rate": 6.939267636399888e-05, "loss": 4.5004, "step": 17205 }, { "epoch": 7.731356693620844, "grad_norm": 3.78125, "learning_rate": 6.931880846101783e-05, "loss": 4.4918, "step": 17210 }, { "epoch": 7.733602875112309, "grad_norm": 3.75, "learning_rate": 6.924499807387132e-05, "loss": 4.4224, "step": 17215 }, { "epoch": 7.735849056603773, "grad_norm": 3.8125, "learning_rate": 6.917124524692812e-05, "loss": 4.4814, "step": 17220 }, { "epoch": 7.738095238095238, "grad_norm": 3.96875, "learning_rate": 6.909755002452258e-05, "loss": 4.4893, "step": 17225 }, { "epoch": 7.740341419586702, "grad_norm": 3.8125, "learning_rate": 6.902391245095426e-05, "loss": 4.4256, "step": 17230 }, { "epoch": 7.742587601078167, "grad_norm": 4.0625, "learning_rate": 6.89503325704882e-05, "loss": 4.4999, "step": 17235 }, { "epoch": 7.744833782569632, "grad_norm": 3.75, "learning_rate": 6.887681042735472e-05, "loss": 4.4886, "step": 17240 }, { "epoch": 7.747079964061096, "grad_norm": 4.0, "learning_rate": 6.880334606574935e-05, "loss": 4.4772, "step": 17245 }, { "epoch": 7.74932614555256, "grad_norm": 3.640625, "learning_rate": 6.872993952983303e-05, "loss": 4.5186, "step": 17250 }, { "epoch": 7.751572327044025, "grad_norm": 3.828125, "learning_rate": 6.865659086373179e-05, "loss": 4.5021, "step": 17255 }, { "epoch": 7.75381850853549, "grad_norm": 4.0625, "learning_rate": 6.858330011153697e-05, "loss": 4.4495, "step": 17260 }, { "epoch": 7.756064690026954, "grad_norm": 3.875, "learning_rate": 6.851006731730514e-05, "loss": 4.4986, "step": 17265 }, { "epoch": 7.758310871518419, "grad_norm": 3.78125, "learning_rate": 6.843689252505787e-05, "loss": 4.4514, "step": 17270 }, { "epoch": 7.760557053009883, "grad_norm": 4.0625, "learning_rate": 6.836377577878207e-05, "loss": 4.445, "step": 17275 }, { "epoch": 7.762803234501348, "grad_norm": 3.84375, "learning_rate": 6.82907171224295e-05, "loss": 4.4659, "step": 17280 }, { "epoch": 7.765049415992812, "grad_norm": 3.921875, "learning_rate": 6.821771659991722e-05, "loss": 4.4338, "step": 17285 }, { "epoch": 7.767295597484277, "grad_norm": 3.90625, "learning_rate": 6.81447742551273e-05, "loss": 4.5099, "step": 17290 }, { "epoch": 7.769541778975741, "grad_norm": 3.640625, "learning_rate": 6.807189013190675e-05, "loss": 4.4715, "step": 17295 }, { "epoch": 7.771787960467206, "grad_norm": 4.0, "learning_rate": 6.799906427406771e-05, "loss": 4.469, "step": 17300 }, { "epoch": 7.77403414195867, "grad_norm": 3.90625, "learning_rate": 6.792629672538715e-05, "loss": 4.5603, "step": 17305 }, { "epoch": 7.776280323450135, "grad_norm": 3.8125, "learning_rate": 6.78535875296071e-05, "loss": 4.499, "step": 17310 }, { "epoch": 7.778526504941599, "grad_norm": 3.90625, "learning_rate": 6.778093673043453e-05, "loss": 4.4613, "step": 17315 }, { "epoch": 7.780772686433064, "grad_norm": 4.375, "learning_rate": 6.770834437154115e-05, "loss": 4.4875, "step": 17320 }, { "epoch": 7.783018867924528, "grad_norm": 3.796875, "learning_rate": 6.763581049656376e-05, "loss": 4.5347, "step": 17325 }, { "epoch": 7.785265049415993, "grad_norm": 3.953125, "learning_rate": 6.756333514910379e-05, "loss": 4.4883, "step": 17330 }, { "epoch": 7.787511230907457, "grad_norm": 3.828125, "learning_rate": 6.749091837272767e-05, "loss": 4.5301, "step": 17335 }, { "epoch": 7.789757412398922, "grad_norm": 4.21875, "learning_rate": 6.741856021096647e-05, "loss": 4.4522, "step": 17340 }, { "epoch": 7.792003593890386, "grad_norm": 3.921875, "learning_rate": 6.734626070731612e-05, "loss": 4.4783, "step": 17345 }, { "epoch": 7.794249775381851, "grad_norm": 4.0625, "learning_rate": 6.727401990523731e-05, "loss": 4.463, "step": 17350 }, { "epoch": 7.796495956873315, "grad_norm": 3.859375, "learning_rate": 6.720183784815531e-05, "loss": 4.4799, "step": 17355 }, { "epoch": 7.79874213836478, "grad_norm": 4.0625, "learning_rate": 6.712971457946027e-05, "loss": 4.4696, "step": 17360 }, { "epoch": 7.800988319856245, "grad_norm": 3.84375, "learning_rate": 6.70576501425068e-05, "loss": 4.4441, "step": 17365 }, { "epoch": 7.803234501347709, "grad_norm": 3.625, "learning_rate": 6.698564458061429e-05, "loss": 4.4721, "step": 17370 }, { "epoch": 7.805480682839173, "grad_norm": 3.96875, "learning_rate": 6.691369793706672e-05, "loss": 4.4785, "step": 17375 }, { "epoch": 7.807726864330638, "grad_norm": 3.890625, "learning_rate": 6.684181025511254e-05, "loss": 4.5319, "step": 17380 }, { "epoch": 7.809973045822103, "grad_norm": 3.6875, "learning_rate": 6.676998157796493e-05, "loss": 4.5144, "step": 17385 }, { "epoch": 7.812219227313567, "grad_norm": 3.9375, "learning_rate": 6.669821194880144e-05, "loss": 4.4935, "step": 17390 }, { "epoch": 7.814465408805032, "grad_norm": 4.21875, "learning_rate": 6.662650141076426e-05, "loss": 4.4763, "step": 17395 }, { "epoch": 7.816711590296496, "grad_norm": 3.8125, "learning_rate": 6.655485000695993e-05, "loss": 4.4597, "step": 17400 }, { "epoch": 7.818957771787961, "grad_norm": 3.890625, "learning_rate": 6.648325778045954e-05, "loss": 4.4522, "step": 17405 }, { "epoch": 7.821203953279425, "grad_norm": 3.96875, "learning_rate": 6.641172477429864e-05, "loss": 4.4301, "step": 17410 }, { "epoch": 7.82345013477089, "grad_norm": 3.84375, "learning_rate": 6.634025103147698e-05, "loss": 4.4444, "step": 17415 }, { "epoch": 7.825696316262354, "grad_norm": 3.875, "learning_rate": 6.626883659495897e-05, "loss": 4.5345, "step": 17420 }, { "epoch": 7.827942497753819, "grad_norm": 3.90625, "learning_rate": 6.61974815076731e-05, "loss": 4.5533, "step": 17425 }, { "epoch": 7.830188679245283, "grad_norm": 3.90625, "learning_rate": 6.612618581251243e-05, "loss": 4.4798, "step": 17430 }, { "epoch": 7.832434860736748, "grad_norm": 3.953125, "learning_rate": 6.605494955233412e-05, "loss": 4.5134, "step": 17435 }, { "epoch": 7.834681042228212, "grad_norm": 3.875, "learning_rate": 6.598377276995963e-05, "loss": 4.4622, "step": 17440 }, { "epoch": 7.836927223719677, "grad_norm": 3.875, "learning_rate": 6.591265550817483e-05, "loss": 4.4938, "step": 17445 }, { "epoch": 7.839173405211141, "grad_norm": 4.03125, "learning_rate": 6.584159780972958e-05, "loss": 4.4976, "step": 17450 }, { "epoch": 7.841419586702606, "grad_norm": 4.09375, "learning_rate": 6.577059971733813e-05, "loss": 4.493, "step": 17455 }, { "epoch": 7.84366576819407, "grad_norm": 4.0625, "learning_rate": 6.569966127367885e-05, "loss": 4.4949, "step": 17460 }, { "epoch": 7.845911949685535, "grad_norm": 4.125, "learning_rate": 6.562878252139411e-05, "loss": 4.4956, "step": 17465 }, { "epoch": 7.848158131176999, "grad_norm": 4.0, "learning_rate": 6.555796350309065e-05, "loss": 4.4718, "step": 17470 }, { "epoch": 7.850404312668464, "grad_norm": 3.734375, "learning_rate": 6.548720426133902e-05, "loss": 4.4235, "step": 17475 }, { "epoch": 7.852650494159928, "grad_norm": 3.796875, "learning_rate": 6.541650483867413e-05, "loss": 4.5444, "step": 17480 }, { "epoch": 7.854896675651393, "grad_norm": 4.15625, "learning_rate": 6.534586527759466e-05, "loss": 4.5028, "step": 17485 }, { "epoch": 7.857142857142857, "grad_norm": 3.953125, "learning_rate": 6.52752856205635e-05, "loss": 4.5085, "step": 17490 }, { "epoch": 7.859389038634322, "grad_norm": 3.734375, "learning_rate": 6.520476591000746e-05, "loss": 4.4457, "step": 17495 }, { "epoch": 7.861635220125786, "grad_norm": 3.953125, "learning_rate": 6.51343061883173e-05, "loss": 4.5553, "step": 17500 }, { "epoch": 7.863881401617251, "grad_norm": 3.75, "learning_rate": 6.506390649784776e-05, "loss": 4.459, "step": 17505 }, { "epoch": 7.866127583108716, "grad_norm": 4.28125, "learning_rate": 6.499356688091743e-05, "loss": 4.5444, "step": 17510 }, { "epoch": 7.86837376460018, "grad_norm": 3.859375, "learning_rate": 6.492328737980882e-05, "loss": 4.5072, "step": 17515 }, { "epoch": 7.870619946091644, "grad_norm": 3.9375, "learning_rate": 6.48530680367684e-05, "loss": 4.4354, "step": 17520 }, { "epoch": 7.872866127583109, "grad_norm": 3.78125, "learning_rate": 6.478290889400627e-05, "loss": 4.5007, "step": 17525 }, { "epoch": 7.875112309074574, "grad_norm": 3.84375, "learning_rate": 6.471280999369657e-05, "loss": 4.5356, "step": 17530 }, { "epoch": 7.877358490566038, "grad_norm": 3.84375, "learning_rate": 6.464277137797706e-05, "loss": 4.5208, "step": 17535 }, { "epoch": 7.879604672057503, "grad_norm": 3.90625, "learning_rate": 6.457279308894932e-05, "loss": 4.4958, "step": 17540 }, { "epoch": 7.881850853548967, "grad_norm": 3.828125, "learning_rate": 6.450287516867868e-05, "loss": 4.5001, "step": 17545 }, { "epoch": 7.884097035040432, "grad_norm": 3.953125, "learning_rate": 6.443301765919417e-05, "loss": 4.4255, "step": 17550 }, { "epoch": 7.886343216531896, "grad_norm": 4.0625, "learning_rate": 6.436322060248853e-05, "loss": 4.4363, "step": 17555 }, { "epoch": 7.888589398023361, "grad_norm": 3.859375, "learning_rate": 6.429348404051806e-05, "loss": 4.4416, "step": 17560 }, { "epoch": 7.890835579514825, "grad_norm": 3.9375, "learning_rate": 6.422380801520287e-05, "loss": 4.4451, "step": 17565 }, { "epoch": 7.8930817610062896, "grad_norm": 4.03125, "learning_rate": 6.415419256842646e-05, "loss": 4.4769, "step": 17570 }, { "epoch": 7.895327942497754, "grad_norm": 4.125, "learning_rate": 6.408463774203619e-05, "loss": 4.5419, "step": 17575 }, { "epoch": 7.8975741239892185, "grad_norm": 3.921875, "learning_rate": 6.401514357784267e-05, "loss": 4.5194, "step": 17580 }, { "epoch": 7.899820305480683, "grad_norm": 3.6875, "learning_rate": 6.394571011762029e-05, "loss": 4.4502, "step": 17585 }, { "epoch": 7.9020664869721475, "grad_norm": 3.984375, "learning_rate": 6.387633740310687e-05, "loss": 4.4928, "step": 17590 }, { "epoch": 7.904312668463612, "grad_norm": 4.1875, "learning_rate": 6.380702547600368e-05, "loss": 4.497, "step": 17595 }, { "epoch": 7.9065588499550765, "grad_norm": 4.0625, "learning_rate": 6.373777437797543e-05, "loss": 4.4634, "step": 17600 }, { "epoch": 7.908805031446541, "grad_norm": 3.96875, "learning_rate": 6.366858415065036e-05, "loss": 4.5166, "step": 17605 }, { "epoch": 7.9110512129380055, "grad_norm": 4.0, "learning_rate": 6.359945483562007e-05, "loss": 4.503, "step": 17610 }, { "epoch": 7.9132973944294696, "grad_norm": 3.765625, "learning_rate": 6.353038647443952e-05, "loss": 4.4697, "step": 17615 }, { "epoch": 7.9155435759209345, "grad_norm": 4.03125, "learning_rate": 6.346137910862707e-05, "loss": 4.4233, "step": 17620 }, { "epoch": 7.9177897574123985, "grad_norm": 3.875, "learning_rate": 6.339243277966438e-05, "loss": 4.4729, "step": 17625 }, { "epoch": 7.9200359389038635, "grad_norm": 4.15625, "learning_rate": 6.332354752899643e-05, "loss": 4.5412, "step": 17630 }, { "epoch": 7.922282120395328, "grad_norm": 3.84375, "learning_rate": 6.325472339803149e-05, "loss": 4.5044, "step": 17635 }, { "epoch": 7.9245283018867925, "grad_norm": 4.03125, "learning_rate": 6.318596042814116e-05, "loss": 4.4582, "step": 17640 }, { "epoch": 7.9267744833782565, "grad_norm": 3.8125, "learning_rate": 6.311725866066012e-05, "loss": 4.4888, "step": 17645 }, { "epoch": 7.9290206648697215, "grad_norm": 4.15625, "learning_rate": 6.304861813688639e-05, "loss": 4.5026, "step": 17650 }, { "epoch": 7.931266846361186, "grad_norm": 3.875, "learning_rate": 6.298003889808108e-05, "loss": 4.5242, "step": 17655 }, { "epoch": 7.9335130278526504, "grad_norm": 3.71875, "learning_rate": 6.291152098546856e-05, "loss": 4.4611, "step": 17660 }, { "epoch": 7.935759209344115, "grad_norm": 3.953125, "learning_rate": 6.28430644402363e-05, "loss": 4.3842, "step": 17665 }, { "epoch": 7.938005390835579, "grad_norm": 3.71875, "learning_rate": 6.277466930353481e-05, "loss": 4.5028, "step": 17670 }, { "epoch": 7.940251572327044, "grad_norm": 3.828125, "learning_rate": 6.270633561647781e-05, "loss": 4.4906, "step": 17675 }, { "epoch": 7.942497753818508, "grad_norm": 3.921875, "learning_rate": 6.263806342014195e-05, "loss": 4.4686, "step": 17680 }, { "epoch": 7.944743935309973, "grad_norm": 4.125, "learning_rate": 6.256985275556704e-05, "loss": 4.5212, "step": 17685 }, { "epoch": 7.946990116801437, "grad_norm": 3.921875, "learning_rate": 6.250170366375578e-05, "loss": 4.471, "step": 17690 }, { "epoch": 7.949236298292902, "grad_norm": 3.921875, "learning_rate": 6.243361618567395e-05, "loss": 4.5291, "step": 17695 }, { "epoch": 7.951482479784366, "grad_norm": 4.09375, "learning_rate": 6.236559036225033e-05, "loss": 4.4481, "step": 17700 }, { "epoch": 7.953728661275831, "grad_norm": 3.796875, "learning_rate": 6.229762623437642e-05, "loss": 4.4311, "step": 17705 }, { "epoch": 7.955974842767295, "grad_norm": 3.875, "learning_rate": 6.222972384290699e-05, "loss": 4.455, "step": 17710 }, { "epoch": 7.95822102425876, "grad_norm": 3.78125, "learning_rate": 6.21618832286593e-05, "loss": 4.5304, "step": 17715 }, { "epoch": 7.960467205750224, "grad_norm": 3.734375, "learning_rate": 6.209410443241376e-05, "loss": 4.5395, "step": 17720 }, { "epoch": 7.962713387241689, "grad_norm": 3.875, "learning_rate": 6.202638749491355e-05, "loss": 4.4199, "step": 17725 }, { "epoch": 7.964959568733153, "grad_norm": 4.0, "learning_rate": 6.19587324568646e-05, "loss": 4.4652, "step": 17730 }, { "epoch": 7.967205750224618, "grad_norm": 3.796875, "learning_rate": 6.189113935893571e-05, "loss": 4.5211, "step": 17735 }, { "epoch": 7.969451931716082, "grad_norm": 3.859375, "learning_rate": 6.182360824175837e-05, "loss": 4.4731, "step": 17740 }, { "epoch": 7.971698113207547, "grad_norm": 3.859375, "learning_rate": 6.175613914592691e-05, "loss": 4.4723, "step": 17745 }, { "epoch": 7.973944294699011, "grad_norm": 4.03125, "learning_rate": 6.168873211199829e-05, "loss": 4.4953, "step": 17750 }, { "epoch": 7.976190476190476, "grad_norm": 3.890625, "learning_rate": 6.162138718049216e-05, "loss": 4.4922, "step": 17755 }, { "epoch": 7.97843665768194, "grad_norm": 3.859375, "learning_rate": 6.155410439189095e-05, "loss": 4.4632, "step": 17760 }, { "epoch": 7.980682839173405, "grad_norm": 3.9375, "learning_rate": 6.148688378663958e-05, "loss": 4.4894, "step": 17765 }, { "epoch": 7.982929020664869, "grad_norm": 3.953125, "learning_rate": 6.141972540514572e-05, "loss": 4.5014, "step": 17770 }, { "epoch": 7.985175202156334, "grad_norm": 4.03125, "learning_rate": 6.135262928777962e-05, "loss": 4.4312, "step": 17775 }, { "epoch": 7.987421383647799, "grad_norm": 4.03125, "learning_rate": 6.128559547487397e-05, "loss": 4.4561, "step": 17780 }, { "epoch": 7.989667565139263, "grad_norm": 3.96875, "learning_rate": 6.12186240067242e-05, "loss": 4.5294, "step": 17785 }, { "epoch": 7.991913746630727, "grad_norm": 3.90625, "learning_rate": 6.115171492358809e-05, "loss": 4.4025, "step": 17790 }, { "epoch": 7.994159928122192, "grad_norm": 3.8125, "learning_rate": 6.108486826568607e-05, "loss": 4.4746, "step": 17795 }, { "epoch": 7.996406109613657, "grad_norm": 3.890625, "learning_rate": 6.1018084073200906e-05, "loss": 4.5202, "step": 17800 }, { "epoch": 7.998652291105121, "grad_norm": 4.0, "learning_rate": 6.095136238627792e-05, "loss": 4.5089, "step": 17805 }, { "epoch": 8.000898472596585, "grad_norm": 4.125, "learning_rate": 6.088470324502486e-05, "loss": 4.4478, "step": 17810 }, { "epoch": 8.00314465408805, "grad_norm": 3.796875, "learning_rate": 6.081810668951174e-05, "loss": 4.4606, "step": 17815 }, { "epoch": 8.005390835579515, "grad_norm": 4.03125, "learning_rate": 6.0751572759771165e-05, "loss": 4.5093, "step": 17820 }, { "epoch": 8.00763701707098, "grad_norm": 3.84375, "learning_rate": 6.068510149579786e-05, "loss": 4.5073, "step": 17825 }, { "epoch": 8.009883198562443, "grad_norm": 3.625, "learning_rate": 6.0618692937549105e-05, "loss": 4.5232, "step": 17830 }, { "epoch": 8.012129380053908, "grad_norm": 3.984375, "learning_rate": 6.055234712494431e-05, "loss": 4.473, "step": 17835 }, { "epoch": 8.014375561545373, "grad_norm": 3.984375, "learning_rate": 6.0486064097865263e-05, "loss": 4.428, "step": 17840 }, { "epoch": 8.016621743036838, "grad_norm": 3.6875, "learning_rate": 6.041984389615605e-05, "loss": 4.4789, "step": 17845 }, { "epoch": 8.018867924528301, "grad_norm": 3.953125, "learning_rate": 6.0353686559622816e-05, "loss": 4.5115, "step": 17850 }, { "epoch": 8.021114106019766, "grad_norm": 3.9375, "learning_rate": 6.0287592128034146e-05, "loss": 4.4342, "step": 17855 }, { "epoch": 8.023360287511231, "grad_norm": 4.03125, "learning_rate": 6.022156064112057e-05, "loss": 4.4225, "step": 17860 }, { "epoch": 8.025606469002696, "grad_norm": 3.9375, "learning_rate": 6.0155592138574985e-05, "loss": 4.499, "step": 17865 }, { "epoch": 8.02785265049416, "grad_norm": 4.03125, "learning_rate": 6.0089686660052366e-05, "loss": 4.4441, "step": 17870 }, { "epoch": 8.030098831985624, "grad_norm": 3.953125, "learning_rate": 6.0023844245169716e-05, "loss": 4.54, "step": 17875 }, { "epoch": 8.032345013477089, "grad_norm": 3.796875, "learning_rate": 5.9958064933506276e-05, "loss": 4.4182, "step": 17880 }, { "epoch": 8.034591194968554, "grad_norm": 4.15625, "learning_rate": 5.9892348764603184e-05, "loss": 4.4358, "step": 17885 }, { "epoch": 8.036837376460017, "grad_norm": 4.125, "learning_rate": 5.9826695777963815e-05, "loss": 4.481, "step": 17890 }, { "epoch": 8.039083557951482, "grad_norm": 3.875, "learning_rate": 5.976110601305337e-05, "loss": 4.5304, "step": 17895 }, { "epoch": 8.041329739442947, "grad_norm": 4.0625, "learning_rate": 5.969557950929916e-05, "loss": 4.3914, "step": 17900 }, { "epoch": 8.043575920934412, "grad_norm": 4.0, "learning_rate": 5.9630116306090515e-05, "loss": 4.5243, "step": 17905 }, { "epoch": 8.045822102425875, "grad_norm": 4.125, "learning_rate": 5.95647164427786e-05, "loss": 4.5211, "step": 17910 }, { "epoch": 8.04806828391734, "grad_norm": 4.09375, "learning_rate": 5.94993799586765e-05, "loss": 4.4862, "step": 17915 }, { "epoch": 8.050314465408805, "grad_norm": 3.703125, "learning_rate": 5.943410689305936e-05, "loss": 4.484, "step": 17920 }, { "epoch": 8.05256064690027, "grad_norm": 3.640625, "learning_rate": 5.936889728516398e-05, "loss": 4.4546, "step": 17925 }, { "epoch": 8.054806828391735, "grad_norm": 3.9375, "learning_rate": 5.9303751174189235e-05, "loss": 4.5075, "step": 17930 }, { "epoch": 8.057053009883198, "grad_norm": 3.84375, "learning_rate": 5.923866859929563e-05, "loss": 4.5261, "step": 17935 }, { "epoch": 8.059299191374663, "grad_norm": 3.859375, "learning_rate": 5.9173649599605665e-05, "loss": 4.5253, "step": 17940 }, { "epoch": 8.061545372866128, "grad_norm": 3.953125, "learning_rate": 5.9108694214203454e-05, "loss": 4.4496, "step": 17945 }, { "epoch": 8.063791554357593, "grad_norm": 3.796875, "learning_rate": 5.904380248213497e-05, "loss": 4.4981, "step": 17950 }, { "epoch": 8.066037735849056, "grad_norm": 3.828125, "learning_rate": 5.8978974442407945e-05, "loss": 4.4188, "step": 17955 }, { "epoch": 8.068283917340521, "grad_norm": 4.03125, "learning_rate": 5.891421013399173e-05, "loss": 4.496, "step": 17960 }, { "epoch": 8.070530098831986, "grad_norm": 4.09375, "learning_rate": 5.884950959581748e-05, "loss": 4.4931, "step": 17965 }, { "epoch": 8.07277628032345, "grad_norm": 4.03125, "learning_rate": 5.878487286677785e-05, "loss": 4.5287, "step": 17970 }, { "epoch": 8.075022461814914, "grad_norm": 3.984375, "learning_rate": 5.872029998572735e-05, "loss": 4.4908, "step": 17975 }, { "epoch": 8.077268643306379, "grad_norm": 3.84375, "learning_rate": 5.86557909914819e-05, "loss": 4.4313, "step": 17980 }, { "epoch": 8.079514824797844, "grad_norm": 4.0625, "learning_rate": 5.859134592281918e-05, "loss": 4.4793, "step": 17985 }, { "epoch": 8.081761006289309, "grad_norm": 3.859375, "learning_rate": 5.8526964818478395e-05, "loss": 4.4486, "step": 17990 }, { "epoch": 8.084007187780772, "grad_norm": 3.828125, "learning_rate": 5.846264771716024e-05, "loss": 4.4387, "step": 17995 }, { "epoch": 8.086253369272237, "grad_norm": 4.0, "learning_rate": 5.839839465752702e-05, "loss": 4.5099, "step": 18000 }, { "epoch": 8.086253369272237, "eval_loss": 4.780518054962158, "eval_runtime": 15.9734, "eval_samples_per_second": 1941.539, "eval_steps_per_second": 242.716, "step": 18000 }, { "epoch": 8.088499550763702, "grad_norm": 4.0, "learning_rate": 5.8334205678202464e-05, "loss": 4.4045, "step": 18005 }, { "epoch": 8.090745732255167, "grad_norm": 4.03125, "learning_rate": 5.827008081777183e-05, "loss": 4.4716, "step": 18010 }, { "epoch": 8.09299191374663, "grad_norm": 3.859375, "learning_rate": 5.8206020114781895e-05, "loss": 4.4368, "step": 18015 }, { "epoch": 8.095238095238095, "grad_norm": 3.796875, "learning_rate": 5.8142023607740695e-05, "loss": 4.427, "step": 18020 }, { "epoch": 8.09748427672956, "grad_norm": 4.0, "learning_rate": 5.807809133511786e-05, "loss": 4.4108, "step": 18025 }, { "epoch": 8.099730458221025, "grad_norm": 4.0, "learning_rate": 5.801422333534426e-05, "loss": 4.4607, "step": 18030 }, { "epoch": 8.101976639712488, "grad_norm": 4.0625, "learning_rate": 5.7950419646812294e-05, "loss": 4.4477, "step": 18035 }, { "epoch": 8.104222821203953, "grad_norm": 3.890625, "learning_rate": 5.788668030787551e-05, "loss": 4.4343, "step": 18040 }, { "epoch": 8.106469002695418, "grad_norm": 3.8125, "learning_rate": 5.782300535684891e-05, "loss": 4.4925, "step": 18045 }, { "epoch": 8.108715184186883, "grad_norm": 3.953125, "learning_rate": 5.7759394832008776e-05, "loss": 4.4611, "step": 18050 }, { "epoch": 8.110961365678348, "grad_norm": 4.09375, "learning_rate": 5.76958487715926e-05, "loss": 4.4711, "step": 18055 }, { "epoch": 8.11320754716981, "grad_norm": 3.890625, "learning_rate": 5.763236721379919e-05, "loss": 4.4197, "step": 18060 }, { "epoch": 8.115453728661276, "grad_norm": 3.71875, "learning_rate": 5.756895019678849e-05, "loss": 4.493, "step": 18065 }, { "epoch": 8.11769991015274, "grad_norm": 4.0, "learning_rate": 5.750559775868181e-05, "loss": 4.4368, "step": 18070 }, { "epoch": 8.119946091644206, "grad_norm": 4.03125, "learning_rate": 5.744230993756148e-05, "loss": 4.4406, "step": 18075 }, { "epoch": 8.122192273135669, "grad_norm": 3.78125, "learning_rate": 5.737908677147101e-05, "loss": 4.4617, "step": 18080 }, { "epoch": 8.124438454627134, "grad_norm": 4.03125, "learning_rate": 5.731592829841516e-05, "loss": 4.511, "step": 18085 }, { "epoch": 8.126684636118599, "grad_norm": 3.90625, "learning_rate": 5.725283455635965e-05, "loss": 4.4128, "step": 18090 }, { "epoch": 8.128930817610064, "grad_norm": 3.921875, "learning_rate": 5.718980558323139e-05, "loss": 4.4366, "step": 18095 }, { "epoch": 8.131176999101527, "grad_norm": 4.0625, "learning_rate": 5.712684141691836e-05, "loss": 4.4686, "step": 18100 }, { "epoch": 8.133423180592992, "grad_norm": 3.96875, "learning_rate": 5.7063942095269505e-05, "loss": 4.4801, "step": 18105 }, { "epoch": 8.135669362084457, "grad_norm": 3.953125, "learning_rate": 5.7001107656094893e-05, "loss": 4.4497, "step": 18110 }, { "epoch": 8.137915543575922, "grad_norm": 4.09375, "learning_rate": 5.693833813716546e-05, "loss": 4.4774, "step": 18115 }, { "epoch": 8.140161725067385, "grad_norm": 3.84375, "learning_rate": 5.687563357621321e-05, "loss": 4.4912, "step": 18120 }, { "epoch": 8.14240790655885, "grad_norm": 3.9375, "learning_rate": 5.6812994010931146e-05, "loss": 4.4964, "step": 18125 }, { "epoch": 8.144654088050315, "grad_norm": 3.953125, "learning_rate": 5.675041947897303e-05, "loss": 4.4952, "step": 18130 }, { "epoch": 8.14690026954178, "grad_norm": 3.859375, "learning_rate": 5.6687910017953755e-05, "loss": 4.4608, "step": 18135 }, { "epoch": 8.149146451033243, "grad_norm": 3.953125, "learning_rate": 5.662546566544886e-05, "loss": 4.4754, "step": 18140 }, { "epoch": 8.151392632524708, "grad_norm": 3.96875, "learning_rate": 5.656308645899498e-05, "loss": 4.4565, "step": 18145 }, { "epoch": 8.153638814016173, "grad_norm": 3.765625, "learning_rate": 5.650077243608937e-05, "loss": 4.4769, "step": 18150 }, { "epoch": 8.155884995507638, "grad_norm": 4.125, "learning_rate": 5.643852363419027e-05, "loss": 4.4776, "step": 18155 }, { "epoch": 8.1581311769991, "grad_norm": 4.125, "learning_rate": 5.637634009071666e-05, "loss": 4.4136, "step": 18160 }, { "epoch": 8.160377358490566, "grad_norm": 3.9375, "learning_rate": 5.631422184304822e-05, "loss": 4.5088, "step": 18165 }, { "epoch": 8.16262353998203, "grad_norm": 3.84375, "learning_rate": 5.625216892852553e-05, "loss": 4.5145, "step": 18170 }, { "epoch": 8.164869721473496, "grad_norm": 4.1875, "learning_rate": 5.6190181384449726e-05, "loss": 4.458, "step": 18175 }, { "epoch": 8.167115902964959, "grad_norm": 3.96875, "learning_rate": 5.6128259248082795e-05, "loss": 4.4935, "step": 18180 }, { "epoch": 8.169362084456424, "grad_norm": 4.03125, "learning_rate": 5.6066402556647306e-05, "loss": 4.439, "step": 18185 }, { "epoch": 8.171608265947889, "grad_norm": 3.828125, "learning_rate": 5.600461134732651e-05, "loss": 4.4209, "step": 18190 }, { "epoch": 8.173854447439354, "grad_norm": 3.921875, "learning_rate": 5.5942885657264406e-05, "loss": 4.4214, "step": 18195 }, { "epoch": 8.176100628930818, "grad_norm": 3.859375, "learning_rate": 5.588122552356538e-05, "loss": 4.4687, "step": 18200 }, { "epoch": 8.178346810422282, "grad_norm": 3.953125, "learning_rate": 5.5819630983294655e-05, "loss": 4.4344, "step": 18205 }, { "epoch": 8.180592991913747, "grad_norm": 3.84375, "learning_rate": 5.575810207347785e-05, "loss": 4.4538, "step": 18210 }, { "epoch": 8.182839173405211, "grad_norm": 4.15625, "learning_rate": 5.569663883110118e-05, "loss": 4.4291, "step": 18215 }, { "epoch": 8.185085354896676, "grad_norm": 4.09375, "learning_rate": 5.563524129311149e-05, "loss": 4.4979, "step": 18220 }, { "epoch": 8.18733153638814, "grad_norm": 4.21875, "learning_rate": 5.557390949641598e-05, "loss": 4.4595, "step": 18225 }, { "epoch": 8.189577717879605, "grad_norm": 4.09375, "learning_rate": 5.551264347788241e-05, "loss": 4.4759, "step": 18230 }, { "epoch": 8.19182389937107, "grad_norm": 4.0625, "learning_rate": 5.5451443274338915e-05, "loss": 4.4531, "step": 18235 }, { "epoch": 8.194070080862534, "grad_norm": 3.859375, "learning_rate": 5.53903089225742e-05, "loss": 4.4832, "step": 18240 }, { "epoch": 8.196316262353998, "grad_norm": 3.75, "learning_rate": 5.5329240459337316e-05, "loss": 4.4841, "step": 18245 }, { "epoch": 8.198562443845463, "grad_norm": 3.734375, "learning_rate": 5.5268237921337674e-05, "loss": 4.4415, "step": 18250 }, { "epoch": 8.200808625336927, "grad_norm": 3.9375, "learning_rate": 5.5207301345245166e-05, "loss": 4.4937, "step": 18255 }, { "epoch": 8.203054806828392, "grad_norm": 4.0625, "learning_rate": 5.514643076768986e-05, "loss": 4.3784, "step": 18260 }, { "epoch": 8.205300988319856, "grad_norm": 4.09375, "learning_rate": 5.5085626225262305e-05, "loss": 4.5438, "step": 18265 }, { "epoch": 8.20754716981132, "grad_norm": 3.78125, "learning_rate": 5.5024887754513314e-05, "loss": 4.4229, "step": 18270 }, { "epoch": 8.209793351302785, "grad_norm": 4.09375, "learning_rate": 5.496421539195394e-05, "loss": 4.4617, "step": 18275 }, { "epoch": 8.21203953279425, "grad_norm": 4.3125, "learning_rate": 5.4903609174055566e-05, "loss": 4.4654, "step": 18280 }, { "epoch": 8.214285714285714, "grad_norm": 4.125, "learning_rate": 5.4843069137249694e-05, "loss": 4.4163, "step": 18285 }, { "epoch": 8.216531895777178, "grad_norm": 3.765625, "learning_rate": 5.4782595317928205e-05, "loss": 4.5016, "step": 18290 }, { "epoch": 8.218778077268643, "grad_norm": 4.09375, "learning_rate": 5.472218775244305e-05, "loss": 4.3907, "step": 18295 }, { "epoch": 8.221024258760108, "grad_norm": 3.796875, "learning_rate": 5.46618464771064e-05, "loss": 4.4023, "step": 18300 }, { "epoch": 8.223270440251572, "grad_norm": 3.8125, "learning_rate": 5.460157152819061e-05, "loss": 4.4586, "step": 18305 }, { "epoch": 8.225516621743036, "grad_norm": 4.21875, "learning_rate": 5.4541362941928076e-05, "loss": 4.4928, "step": 18310 }, { "epoch": 8.227762803234501, "grad_norm": 4.28125, "learning_rate": 5.4481220754511434e-05, "loss": 4.3868, "step": 18315 }, { "epoch": 8.230008984725966, "grad_norm": 3.828125, "learning_rate": 5.442114500209324e-05, "loss": 4.4299, "step": 18320 }, { "epoch": 8.232255166217431, "grad_norm": 4.0625, "learning_rate": 5.436113572078625e-05, "loss": 4.4576, "step": 18325 }, { "epoch": 8.234501347708894, "grad_norm": 3.953125, "learning_rate": 5.43011929466632e-05, "loss": 4.4762, "step": 18330 }, { "epoch": 8.23674752920036, "grad_norm": 4.15625, "learning_rate": 5.424131671575686e-05, "loss": 4.5095, "step": 18335 }, { "epoch": 8.238993710691824, "grad_norm": 3.890625, "learning_rate": 5.418150706406007e-05, "loss": 4.4108, "step": 18340 }, { "epoch": 8.24123989218329, "grad_norm": 3.71875, "learning_rate": 5.412176402752546e-05, "loss": 4.3927, "step": 18345 }, { "epoch": 8.243486073674752, "grad_norm": 3.890625, "learning_rate": 5.406208764206585e-05, "loss": 4.4245, "step": 18350 }, { "epoch": 8.245732255166217, "grad_norm": 3.6875, "learning_rate": 5.4002477943553816e-05, "loss": 4.465, "step": 18355 }, { "epoch": 8.247978436657682, "grad_norm": 3.671875, "learning_rate": 5.39429349678219e-05, "loss": 4.4656, "step": 18360 }, { "epoch": 8.250224618149147, "grad_norm": 3.890625, "learning_rate": 5.388345875066264e-05, "loss": 4.479, "step": 18365 }, { "epoch": 8.25247079964061, "grad_norm": 3.859375, "learning_rate": 5.3824049327828245e-05, "loss": 4.3851, "step": 18370 }, { "epoch": 8.254716981132075, "grad_norm": 3.90625, "learning_rate": 5.376470673503096e-05, "loss": 4.4901, "step": 18375 }, { "epoch": 8.25696316262354, "grad_norm": 4.0, "learning_rate": 5.370543100794273e-05, "loss": 4.4664, "step": 18380 }, { "epoch": 8.259209344115005, "grad_norm": 4.0, "learning_rate": 5.3646222182195366e-05, "loss": 4.4547, "step": 18385 }, { "epoch": 8.261455525606468, "grad_norm": 3.78125, "learning_rate": 5.358708029338048e-05, "loss": 4.4895, "step": 18390 }, { "epoch": 8.263701707097933, "grad_norm": 4.125, "learning_rate": 5.352800537704936e-05, "loss": 4.4985, "step": 18395 }, { "epoch": 8.265947888589398, "grad_norm": 4.09375, "learning_rate": 5.346899746871313e-05, "loss": 4.4499, "step": 18400 }, { "epoch": 8.268194070080863, "grad_norm": 4.28125, "learning_rate": 5.341005660384257e-05, "loss": 4.5527, "step": 18405 }, { "epoch": 8.270440251572326, "grad_norm": 3.875, "learning_rate": 5.3351182817868216e-05, "loss": 4.4724, "step": 18410 }, { "epoch": 8.272686433063791, "grad_norm": 3.96875, "learning_rate": 5.329237614618028e-05, "loss": 4.4431, "step": 18415 }, { "epoch": 8.274932614555256, "grad_norm": 4.0, "learning_rate": 5.323363662412851e-05, "loss": 4.4714, "step": 18420 }, { "epoch": 8.277178796046721, "grad_norm": 4.28125, "learning_rate": 5.3174964287022474e-05, "loss": 4.4337, "step": 18425 }, { "epoch": 8.279424977538184, "grad_norm": 3.96875, "learning_rate": 5.311635917013118e-05, "loss": 4.4529, "step": 18430 }, { "epoch": 8.28167115902965, "grad_norm": 3.9375, "learning_rate": 5.305782130868341e-05, "loss": 4.435, "step": 18435 }, { "epoch": 8.283917340521114, "grad_norm": 3.84375, "learning_rate": 5.2999350737867296e-05, "loss": 4.4485, "step": 18440 }, { "epoch": 8.286163522012579, "grad_norm": 3.953125, "learning_rate": 5.294094749283072e-05, "loss": 4.4753, "step": 18445 }, { "epoch": 8.288409703504042, "grad_norm": 4.0, "learning_rate": 5.2882611608681024e-05, "loss": 4.4287, "step": 18450 }, { "epoch": 8.290655884995507, "grad_norm": 3.671875, "learning_rate": 5.282434312048499e-05, "loss": 4.4659, "step": 18455 }, { "epoch": 8.292902066486972, "grad_norm": 3.890625, "learning_rate": 5.276614206326898e-05, "loss": 4.4461, "step": 18460 }, { "epoch": 8.295148247978437, "grad_norm": 3.96875, "learning_rate": 5.2708008472018786e-05, "loss": 4.4968, "step": 18465 }, { "epoch": 8.297394429469902, "grad_norm": 3.953125, "learning_rate": 5.2649942381679626e-05, "loss": 4.4595, "step": 18470 }, { "epoch": 8.299640610961365, "grad_norm": 4.03125, "learning_rate": 5.259194382715623e-05, "loss": 4.4837, "step": 18475 }, { "epoch": 8.30188679245283, "grad_norm": 4.125, "learning_rate": 5.253401284331256e-05, "loss": 4.4814, "step": 18480 }, { "epoch": 8.304132973944295, "grad_norm": 4.03125, "learning_rate": 5.247614946497215e-05, "loss": 4.4858, "step": 18485 }, { "epoch": 8.30637915543576, "grad_norm": 3.734375, "learning_rate": 5.241835372691774e-05, "loss": 4.4735, "step": 18490 }, { "epoch": 8.308625336927223, "grad_norm": 4.25, "learning_rate": 5.236062566389155e-05, "loss": 4.4556, "step": 18495 }, { "epoch": 8.310871518418688, "grad_norm": 4.0625, "learning_rate": 5.230296531059497e-05, "loss": 4.3911, "step": 18500 }, { "epoch": 8.313117699910153, "grad_norm": 3.953125, "learning_rate": 5.22453727016888e-05, "loss": 4.4873, "step": 18505 }, { "epoch": 8.315363881401618, "grad_norm": 3.703125, "learning_rate": 5.2187847871793134e-05, "loss": 4.4996, "step": 18510 }, { "epoch": 8.317610062893081, "grad_norm": 3.9375, "learning_rate": 5.213039085548716e-05, "loss": 4.4138, "step": 18515 }, { "epoch": 8.319856244384546, "grad_norm": 3.859375, "learning_rate": 5.207300168730952e-05, "loss": 4.4796, "step": 18520 }, { "epoch": 8.322102425876011, "grad_norm": 4.0625, "learning_rate": 5.20156804017579e-05, "loss": 4.4773, "step": 18525 }, { "epoch": 8.324348607367476, "grad_norm": 3.859375, "learning_rate": 5.1958427033289304e-05, "loss": 4.468, "step": 18530 }, { "epoch": 8.326594788858939, "grad_norm": 3.84375, "learning_rate": 5.190124161631977e-05, "loss": 4.4441, "step": 18535 }, { "epoch": 8.328840970350404, "grad_norm": 3.875, "learning_rate": 5.18441241852246e-05, "loss": 4.4903, "step": 18540 }, { "epoch": 8.331087151841869, "grad_norm": 4.09375, "learning_rate": 5.178707477433829e-05, "loss": 4.4684, "step": 18545 }, { "epoch": 8.333333333333334, "grad_norm": 4.0, "learning_rate": 5.1730093417954214e-05, "loss": 4.3903, "step": 18550 }, { "epoch": 8.335579514824797, "grad_norm": 3.953125, "learning_rate": 5.167318015032504e-05, "loss": 4.4638, "step": 18555 }, { "epoch": 8.337825696316262, "grad_norm": 3.875, "learning_rate": 5.161633500566249e-05, "loss": 4.4414, "step": 18560 }, { "epoch": 8.340071877807727, "grad_norm": 4.0, "learning_rate": 5.155955801813721e-05, "loss": 4.4694, "step": 18565 }, { "epoch": 8.342318059299192, "grad_norm": 3.96875, "learning_rate": 5.150284922187902e-05, "loss": 4.4675, "step": 18570 }, { "epoch": 8.344564240790655, "grad_norm": 3.8125, "learning_rate": 5.1446208650976645e-05, "loss": 4.4494, "step": 18575 }, { "epoch": 8.34681042228212, "grad_norm": 3.859375, "learning_rate": 5.138963633947789e-05, "loss": 4.4237, "step": 18580 }, { "epoch": 8.349056603773585, "grad_norm": 4.15625, "learning_rate": 5.133313232138942e-05, "loss": 4.4784, "step": 18585 }, { "epoch": 8.35130278526505, "grad_norm": 3.890625, "learning_rate": 5.127669663067691e-05, "loss": 4.4938, "step": 18590 }, { "epoch": 8.353548966756513, "grad_norm": 4.0, "learning_rate": 5.122032930126502e-05, "loss": 4.4414, "step": 18595 }, { "epoch": 8.355795148247978, "grad_norm": 4.09375, "learning_rate": 5.1164030367037166e-05, "loss": 4.4891, "step": 18600 }, { "epoch": 8.358041329739443, "grad_norm": 3.984375, "learning_rate": 5.1107799861835827e-05, "loss": 4.4619, "step": 18605 }, { "epoch": 8.360287511230908, "grad_norm": 4.25, "learning_rate": 5.105163781946217e-05, "loss": 4.4723, "step": 18610 }, { "epoch": 8.362533692722373, "grad_norm": 4.0625, "learning_rate": 5.0995544273676335e-05, "loss": 4.462, "step": 18615 }, { "epoch": 8.364779874213836, "grad_norm": 4.21875, "learning_rate": 5.0939519258197314e-05, "loss": 4.4285, "step": 18620 }, { "epoch": 8.367026055705301, "grad_norm": 3.890625, "learning_rate": 5.0883562806702725e-05, "loss": 4.4085, "step": 18625 }, { "epoch": 8.369272237196766, "grad_norm": 3.96875, "learning_rate": 5.082767495282917e-05, "loss": 4.4501, "step": 18630 }, { "epoch": 8.37151841868823, "grad_norm": 3.703125, "learning_rate": 5.077185573017186e-05, "loss": 4.5868, "step": 18635 }, { "epoch": 8.373764600179694, "grad_norm": 4.125, "learning_rate": 5.071610517228491e-05, "loss": 4.4669, "step": 18640 }, { "epoch": 8.376010781671159, "grad_norm": 3.9375, "learning_rate": 5.066042331268099e-05, "loss": 4.5506, "step": 18645 }, { "epoch": 8.378256963162624, "grad_norm": 4.0625, "learning_rate": 5.060481018483157e-05, "loss": 4.4666, "step": 18650 }, { "epoch": 8.380503144654089, "grad_norm": 3.875, "learning_rate": 5.054926582216683e-05, "loss": 4.513, "step": 18655 }, { "epoch": 8.382749326145552, "grad_norm": 3.921875, "learning_rate": 5.049379025807553e-05, "loss": 4.4393, "step": 18660 }, { "epoch": 8.384995507637017, "grad_norm": 4.0625, "learning_rate": 5.043838352590515e-05, "loss": 4.4618, "step": 18665 }, { "epoch": 8.387241689128482, "grad_norm": 4.15625, "learning_rate": 5.0383045658961694e-05, "loss": 4.5318, "step": 18670 }, { "epoch": 8.389487870619947, "grad_norm": 4.0625, "learning_rate": 5.032777669050993e-05, "loss": 4.4933, "step": 18675 }, { "epoch": 8.39173405211141, "grad_norm": 4.09375, "learning_rate": 5.0272576653773034e-05, "loss": 4.4422, "step": 18680 }, { "epoch": 8.393980233602875, "grad_norm": 4.0625, "learning_rate": 5.021744558193286e-05, "loss": 4.4074, "step": 18685 }, { "epoch": 8.39622641509434, "grad_norm": 4.4375, "learning_rate": 5.0162383508129806e-05, "loss": 4.4848, "step": 18690 }, { "epoch": 8.398472596585805, "grad_norm": 3.953125, "learning_rate": 5.01073904654627e-05, "loss": 4.4374, "step": 18695 }, { "epoch": 8.400718778077268, "grad_norm": 3.9375, "learning_rate": 5.005246648698898e-05, "loss": 4.4559, "step": 18700 }, { "epoch": 8.402964959568733, "grad_norm": 4.15625, "learning_rate": 4.9997611605724496e-05, "loss": 4.4243, "step": 18705 }, { "epoch": 8.405211141060198, "grad_norm": 4.1875, "learning_rate": 4.994282585464359e-05, "loss": 4.4827, "step": 18710 }, { "epoch": 8.407457322551663, "grad_norm": 3.9375, "learning_rate": 4.9888109266679086e-05, "loss": 4.5147, "step": 18715 }, { "epoch": 8.409703504043126, "grad_norm": 3.953125, "learning_rate": 4.9833461874722125e-05, "loss": 4.4535, "step": 18720 }, { "epoch": 8.41194968553459, "grad_norm": 3.6875, "learning_rate": 4.977888371162237e-05, "loss": 4.4727, "step": 18725 }, { "epoch": 8.414195867026056, "grad_norm": 3.875, "learning_rate": 4.972437481018783e-05, "loss": 4.487, "step": 18730 }, { "epoch": 8.41644204851752, "grad_norm": 4.09375, "learning_rate": 4.966993520318484e-05, "loss": 4.4624, "step": 18735 }, { "epoch": 8.418688230008986, "grad_norm": 4.0625, "learning_rate": 4.961556492333816e-05, "loss": 4.4402, "step": 18740 }, { "epoch": 8.420934411500449, "grad_norm": 4.09375, "learning_rate": 4.956126400333076e-05, "loss": 4.5195, "step": 18745 }, { "epoch": 8.423180592991914, "grad_norm": 4.0, "learning_rate": 4.950703247580404e-05, "loss": 4.514, "step": 18750 }, { "epoch": 8.425426774483379, "grad_norm": 3.96875, "learning_rate": 4.945287037335759e-05, "loss": 4.4338, "step": 18755 }, { "epoch": 8.427672955974844, "grad_norm": 3.9375, "learning_rate": 4.939877772854933e-05, "loss": 4.4666, "step": 18760 }, { "epoch": 8.429919137466307, "grad_norm": 3.890625, "learning_rate": 4.934475457389543e-05, "loss": 4.4787, "step": 18765 }, { "epoch": 8.432165318957772, "grad_norm": 3.84375, "learning_rate": 4.9290800941870225e-05, "loss": 4.5359, "step": 18770 }, { "epoch": 8.434411500449237, "grad_norm": 3.96875, "learning_rate": 4.923691686490631e-05, "loss": 4.456, "step": 18775 }, { "epoch": 8.436657681940702, "grad_norm": 3.890625, "learning_rate": 4.918310237539447e-05, "loss": 4.4697, "step": 18780 }, { "epoch": 8.438903863432165, "grad_norm": 3.859375, "learning_rate": 4.912935750568365e-05, "loss": 4.5175, "step": 18785 }, { "epoch": 8.44115004492363, "grad_norm": 3.953125, "learning_rate": 4.907568228808087e-05, "loss": 4.5084, "step": 18790 }, { "epoch": 8.443396226415095, "grad_norm": 3.953125, "learning_rate": 4.9022076754851436e-05, "loss": 4.4899, "step": 18795 }, { "epoch": 8.44564240790656, "grad_norm": 4.0, "learning_rate": 4.896854093821869e-05, "loss": 4.5063, "step": 18800 }, { "epoch": 8.447888589398023, "grad_norm": 3.96875, "learning_rate": 4.891507487036399e-05, "loss": 4.4922, "step": 18805 }, { "epoch": 8.450134770889488, "grad_norm": 3.984375, "learning_rate": 4.88616785834269e-05, "loss": 4.514, "step": 18810 }, { "epoch": 8.452380952380953, "grad_norm": 3.953125, "learning_rate": 4.880835210950491e-05, "loss": 4.4305, "step": 18815 }, { "epoch": 8.454627133872417, "grad_norm": 3.890625, "learning_rate": 4.875509548065362e-05, "loss": 4.5128, "step": 18820 }, { "epoch": 8.45687331536388, "grad_norm": 4.125, "learning_rate": 4.87019087288867e-05, "loss": 4.448, "step": 18825 }, { "epoch": 8.459119496855346, "grad_norm": 3.96875, "learning_rate": 4.864879188617565e-05, "loss": 4.4806, "step": 18830 }, { "epoch": 8.46136567834681, "grad_norm": 3.8125, "learning_rate": 4.859574498445011e-05, "loss": 4.4896, "step": 18835 }, { "epoch": 8.463611859838275, "grad_norm": 3.875, "learning_rate": 4.854276805559757e-05, "loss": 4.5605, "step": 18840 }, { "epoch": 8.465858041329739, "grad_norm": 3.953125, "learning_rate": 4.848986113146352e-05, "loss": 4.4716, "step": 18845 }, { "epoch": 8.468104222821204, "grad_norm": 4.15625, "learning_rate": 4.843702424385133e-05, "loss": 4.4414, "step": 18850 }, { "epoch": 8.470350404312669, "grad_norm": 3.890625, "learning_rate": 4.838425742452228e-05, "loss": 4.4549, "step": 18855 }, { "epoch": 8.472596585804133, "grad_norm": 3.953125, "learning_rate": 4.8331560705195614e-05, "loss": 4.5185, "step": 18860 }, { "epoch": 8.474842767295598, "grad_norm": 3.9375, "learning_rate": 4.827893411754824e-05, "loss": 4.5196, "step": 18865 }, { "epoch": 8.477088948787062, "grad_norm": 3.671875, "learning_rate": 4.82263776932151e-05, "loss": 4.484, "step": 18870 }, { "epoch": 8.479335130278526, "grad_norm": 3.828125, "learning_rate": 4.8173891463788884e-05, "loss": 4.4651, "step": 18875 }, { "epoch": 8.481581311769991, "grad_norm": 3.828125, "learning_rate": 4.812147546082006e-05, "loss": 4.4591, "step": 18880 }, { "epoch": 8.483827493261456, "grad_norm": 4.03125, "learning_rate": 4.806912971581695e-05, "loss": 4.45, "step": 18885 }, { "epoch": 8.48607367475292, "grad_norm": 3.859375, "learning_rate": 4.801685426024555e-05, "loss": 4.4575, "step": 18890 }, { "epoch": 8.488319856244384, "grad_norm": 3.90625, "learning_rate": 4.796464912552974e-05, "loss": 4.5483, "step": 18895 }, { "epoch": 8.49056603773585, "grad_norm": 3.96875, "learning_rate": 4.791251434305097e-05, "loss": 4.4674, "step": 18900 }, { "epoch": 8.492812219227314, "grad_norm": 3.9375, "learning_rate": 4.786044994414851e-05, "loss": 4.487, "step": 18905 }, { "epoch": 8.495058400718777, "grad_norm": 3.9375, "learning_rate": 4.780845596011932e-05, "loss": 4.4694, "step": 18910 }, { "epoch": 8.497304582210242, "grad_norm": 3.8125, "learning_rate": 4.775653242221791e-05, "loss": 4.4402, "step": 18915 }, { "epoch": 8.499550763701707, "grad_norm": 3.796875, "learning_rate": 4.770467936165665e-05, "loss": 4.5017, "step": 18920 }, { "epoch": 8.501796945193172, "grad_norm": 3.9375, "learning_rate": 4.765289680960533e-05, "loss": 4.4096, "step": 18925 }, { "epoch": 8.504043126684635, "grad_norm": 4.0625, "learning_rate": 4.7601184797191506e-05, "loss": 4.4218, "step": 18930 }, { "epoch": 8.5062893081761, "grad_norm": 3.671875, "learning_rate": 4.754954335550026e-05, "loss": 4.4643, "step": 18935 }, { "epoch": 8.508535489667565, "grad_norm": 3.765625, "learning_rate": 4.749797251557426e-05, "loss": 4.4153, "step": 18940 }, { "epoch": 8.51078167115903, "grad_norm": 3.9375, "learning_rate": 4.744647230841379e-05, "loss": 4.472, "step": 18945 }, { "epoch": 8.513027852650493, "grad_norm": 3.953125, "learning_rate": 4.739504276497658e-05, "loss": 4.446, "step": 18950 }, { "epoch": 8.515274034141958, "grad_norm": 3.953125, "learning_rate": 4.7343683916177994e-05, "loss": 4.4649, "step": 18955 }, { "epoch": 8.517520215633423, "grad_norm": 3.890625, "learning_rate": 4.7292395792890765e-05, "loss": 4.5418, "step": 18960 }, { "epoch": 8.519766397124888, "grad_norm": 3.828125, "learning_rate": 4.7241178425945247e-05, "loss": 4.4404, "step": 18965 }, { "epoch": 8.522012578616351, "grad_norm": 4.15625, "learning_rate": 4.719003184612919e-05, "loss": 4.4466, "step": 18970 }, { "epoch": 8.524258760107816, "grad_norm": 4.15625, "learning_rate": 4.713895608418777e-05, "loss": 4.476, "step": 18975 }, { "epoch": 8.526504941599281, "grad_norm": 4.125, "learning_rate": 4.7087951170823675e-05, "loss": 4.5428, "step": 18980 }, { "epoch": 8.528751123090746, "grad_norm": 4.375, "learning_rate": 4.7037017136696905e-05, "loss": 4.443, "step": 18985 }, { "epoch": 8.530997304582211, "grad_norm": 4.03125, "learning_rate": 4.698615401242495e-05, "loss": 4.4697, "step": 18990 }, { "epoch": 8.533243486073674, "grad_norm": 3.90625, "learning_rate": 4.693536182858256e-05, "loss": 4.4363, "step": 18995 }, { "epoch": 8.53548966756514, "grad_norm": 3.90625, "learning_rate": 4.688464061570198e-05, "loss": 4.5175, "step": 19000 }, { "epoch": 8.53548966756514, "eval_loss": 4.779257297515869, "eval_runtime": 16.0653, "eval_samples_per_second": 1930.435, "eval_steps_per_second": 241.328, "step": 19000 }, { "epoch": 8.537735849056604, "grad_norm": 4.03125, "learning_rate": 4.6833990404272724e-05, "loss": 4.4766, "step": 19005 }, { "epoch": 8.539982030548067, "grad_norm": 3.90625, "learning_rate": 4.678341122474156e-05, "loss": 4.4872, "step": 19010 }, { "epoch": 8.542228212039532, "grad_norm": 4.34375, "learning_rate": 4.673290310751268e-05, "loss": 4.5014, "step": 19015 }, { "epoch": 8.544474393530997, "grad_norm": 4.125, "learning_rate": 4.668246608294749e-05, "loss": 4.5077, "step": 19020 }, { "epoch": 8.546720575022462, "grad_norm": 3.890625, "learning_rate": 4.663210018136464e-05, "loss": 4.5305, "step": 19025 }, { "epoch": 8.548966756513927, "grad_norm": 3.734375, "learning_rate": 4.658180543304009e-05, "loss": 4.4317, "step": 19030 }, { "epoch": 8.55121293800539, "grad_norm": 4.125, "learning_rate": 4.653158186820696e-05, "loss": 4.5103, "step": 19035 }, { "epoch": 8.553459119496855, "grad_norm": 4.09375, "learning_rate": 4.6481429517055675e-05, "loss": 4.5253, "step": 19040 }, { "epoch": 8.55570530098832, "grad_norm": 3.984375, "learning_rate": 4.643134840973374e-05, "loss": 4.4771, "step": 19045 }, { "epoch": 8.557951482479785, "grad_norm": 3.96875, "learning_rate": 4.638133857634589e-05, "loss": 4.4538, "step": 19050 }, { "epoch": 8.560197663971248, "grad_norm": 3.953125, "learning_rate": 4.633140004695407e-05, "loss": 4.4769, "step": 19055 }, { "epoch": 8.562443845462713, "grad_norm": 3.875, "learning_rate": 4.628153285157725e-05, "loss": 4.4907, "step": 19060 }, { "epoch": 8.564690026954178, "grad_norm": 3.796875, "learning_rate": 4.623173702019159e-05, "loss": 4.5098, "step": 19065 }, { "epoch": 8.566936208445643, "grad_norm": 4.0, "learning_rate": 4.618201258273034e-05, "loss": 4.4628, "step": 19070 }, { "epoch": 8.569182389937106, "grad_norm": 3.96875, "learning_rate": 4.6132359569083816e-05, "loss": 4.5161, "step": 19075 }, { "epoch": 8.571428571428571, "grad_norm": 4.03125, "learning_rate": 4.608277800909946e-05, "loss": 4.4383, "step": 19080 }, { "epoch": 8.573674752920036, "grad_norm": 3.953125, "learning_rate": 4.603326793258167e-05, "loss": 4.4597, "step": 19085 }, { "epoch": 8.575920934411501, "grad_norm": 3.84375, "learning_rate": 4.5983829369291956e-05, "loss": 4.4562, "step": 19090 }, { "epoch": 8.578167115902964, "grad_norm": 4.03125, "learning_rate": 4.593446234894877e-05, "loss": 4.4617, "step": 19095 }, { "epoch": 8.58041329739443, "grad_norm": 4.03125, "learning_rate": 4.5885166901227626e-05, "loss": 4.4788, "step": 19100 }, { "epoch": 8.582659478885894, "grad_norm": 3.90625, "learning_rate": 4.583594305576096e-05, "loss": 4.4296, "step": 19105 }, { "epoch": 8.584905660377359, "grad_norm": 4.03125, "learning_rate": 4.578679084213817e-05, "loss": 4.5075, "step": 19110 }, { "epoch": 8.587151841868822, "grad_norm": 3.53125, "learning_rate": 4.5737710289905674e-05, "loss": 4.5024, "step": 19115 }, { "epoch": 8.589398023360287, "grad_norm": 4.375, "learning_rate": 4.5688701428566685e-05, "loss": 4.4307, "step": 19120 }, { "epoch": 8.591644204851752, "grad_norm": 4.0625, "learning_rate": 4.563976428758144e-05, "loss": 4.4966, "step": 19125 }, { "epoch": 8.593890386343217, "grad_norm": 4.03125, "learning_rate": 4.5590898896366964e-05, "loss": 4.4909, "step": 19130 }, { "epoch": 8.59613656783468, "grad_norm": 3.953125, "learning_rate": 4.5542105284297236e-05, "loss": 4.4896, "step": 19135 }, { "epoch": 8.598382749326145, "grad_norm": 4.125, "learning_rate": 4.549338348070303e-05, "loss": 4.4824, "step": 19140 }, { "epoch": 8.60062893081761, "grad_norm": 4.1875, "learning_rate": 4.544473351487196e-05, "loss": 4.4483, "step": 19145 }, { "epoch": 8.602875112309075, "grad_norm": 3.75, "learning_rate": 4.5396155416048524e-05, "loss": 4.4626, "step": 19150 }, { "epoch": 8.60512129380054, "grad_norm": 3.984375, "learning_rate": 4.5347649213433905e-05, "loss": 4.4716, "step": 19155 }, { "epoch": 8.607367475292003, "grad_norm": 3.890625, "learning_rate": 4.529921493618618e-05, "loss": 4.4124, "step": 19160 }, { "epoch": 8.609613656783468, "grad_norm": 3.765625, "learning_rate": 4.5250852613420094e-05, "loss": 4.4807, "step": 19165 }, { "epoch": 8.611859838274933, "grad_norm": 3.8125, "learning_rate": 4.520256227420722e-05, "loss": 4.4539, "step": 19170 }, { "epoch": 8.614106019766398, "grad_norm": 3.984375, "learning_rate": 4.515434394757586e-05, "loss": 4.5185, "step": 19175 }, { "epoch": 8.616352201257861, "grad_norm": 3.890625, "learning_rate": 4.510619766251088e-05, "loss": 4.5331, "step": 19180 }, { "epoch": 8.618598382749326, "grad_norm": 4.03125, "learning_rate": 4.505812344795407e-05, "loss": 4.3877, "step": 19185 }, { "epoch": 8.620844564240791, "grad_norm": 3.78125, "learning_rate": 4.501012133280368e-05, "loss": 4.4476, "step": 19190 }, { "epoch": 8.623090745732256, "grad_norm": 4.0, "learning_rate": 4.496219134591478e-05, "loss": 4.4507, "step": 19195 }, { "epoch": 8.625336927223719, "grad_norm": 3.796875, "learning_rate": 4.4914333516099047e-05, "loss": 4.4937, "step": 19200 }, { "epoch": 8.627583108715184, "grad_norm": 3.890625, "learning_rate": 4.4866547872124675e-05, "loss": 4.4312, "step": 19205 }, { "epoch": 8.629829290206649, "grad_norm": 3.9375, "learning_rate": 4.481883444271663e-05, "loss": 4.485, "step": 19210 }, { "epoch": 8.632075471698114, "grad_norm": 4.28125, "learning_rate": 4.477119325655633e-05, "loss": 4.5384, "step": 19215 }, { "epoch": 8.634321653189577, "grad_norm": 3.984375, "learning_rate": 4.4723624342281845e-05, "loss": 4.4882, "step": 19220 }, { "epoch": 8.636567834681042, "grad_norm": 4.0625, "learning_rate": 4.46761277284878e-05, "loss": 4.4326, "step": 19225 }, { "epoch": 8.638814016172507, "grad_norm": 3.953125, "learning_rate": 4.46287034437253e-05, "loss": 4.4201, "step": 19230 }, { "epoch": 8.641060197663972, "grad_norm": 4.09375, "learning_rate": 4.458135151650204e-05, "loss": 4.4966, "step": 19235 }, { "epoch": 8.643306379155435, "grad_norm": 3.9375, "learning_rate": 4.4534071975282164e-05, "loss": 4.4862, "step": 19240 }, { "epoch": 8.6455525606469, "grad_norm": 3.984375, "learning_rate": 4.448686484848638e-05, "loss": 4.4492, "step": 19245 }, { "epoch": 8.647798742138365, "grad_norm": 3.90625, "learning_rate": 4.443973016449173e-05, "loss": 4.4572, "step": 19250 }, { "epoch": 8.65004492362983, "grad_norm": 3.75, "learning_rate": 4.4392667951631835e-05, "loss": 4.5163, "step": 19255 }, { "epoch": 8.652291105121293, "grad_norm": 4.03125, "learning_rate": 4.434567823819675e-05, "loss": 4.4779, "step": 19260 }, { "epoch": 8.654537286612758, "grad_norm": 3.953125, "learning_rate": 4.429876105243285e-05, "loss": 4.4685, "step": 19265 }, { "epoch": 8.656783468104223, "grad_norm": 3.8125, "learning_rate": 4.4251916422543015e-05, "loss": 4.4743, "step": 19270 }, { "epoch": 8.659029649595688, "grad_norm": 4.0625, "learning_rate": 4.420514437668643e-05, "loss": 4.4898, "step": 19275 }, { "epoch": 8.661275831087153, "grad_norm": 4.0625, "learning_rate": 4.415844494297874e-05, "loss": 4.4705, "step": 19280 }, { "epoch": 8.663522012578616, "grad_norm": 3.828125, "learning_rate": 4.411181814949184e-05, "loss": 4.4875, "step": 19285 }, { "epoch": 8.66576819407008, "grad_norm": 3.796875, "learning_rate": 4.406526402425399e-05, "loss": 4.4229, "step": 19290 }, { "epoch": 8.668014375561546, "grad_norm": 3.828125, "learning_rate": 4.4018782595249866e-05, "loss": 4.442, "step": 19295 }, { "epoch": 8.67026055705301, "grad_norm": 4.1875, "learning_rate": 4.397237389042028e-05, "loss": 4.4163, "step": 19300 }, { "epoch": 8.672506738544474, "grad_norm": 4.03125, "learning_rate": 4.392603793766247e-05, "loss": 4.4643, "step": 19305 }, { "epoch": 8.674752920035939, "grad_norm": 4.0625, "learning_rate": 4.387977476482983e-05, "loss": 4.4177, "step": 19310 }, { "epoch": 8.676999101527404, "grad_norm": 4.0, "learning_rate": 4.383358439973209e-05, "loss": 4.4912, "step": 19315 }, { "epoch": 8.679245283018869, "grad_norm": 4.0625, "learning_rate": 4.37874668701352e-05, "loss": 4.454, "step": 19320 }, { "epoch": 8.681491464510332, "grad_norm": 3.9375, "learning_rate": 4.374142220376125e-05, "loss": 4.4142, "step": 19325 }, { "epoch": 8.683737646001797, "grad_norm": 4.03125, "learning_rate": 4.369545042828868e-05, "loss": 4.5224, "step": 19330 }, { "epoch": 8.685983827493262, "grad_norm": 4.09375, "learning_rate": 4.364955157135195e-05, "loss": 4.465, "step": 19335 }, { "epoch": 8.688230008984727, "grad_norm": 3.8125, "learning_rate": 4.3603725660541736e-05, "loss": 4.4517, "step": 19340 }, { "epoch": 8.69047619047619, "grad_norm": 3.90625, "learning_rate": 4.355797272340497e-05, "loss": 4.5375, "step": 19345 }, { "epoch": 8.692722371967655, "grad_norm": 3.84375, "learning_rate": 4.3512292787444564e-05, "loss": 4.4509, "step": 19350 }, { "epoch": 8.69496855345912, "grad_norm": 4.09375, "learning_rate": 4.346668588011968e-05, "loss": 4.4416, "step": 19355 }, { "epoch": 8.697214734950585, "grad_norm": 4.0, "learning_rate": 4.342115202884548e-05, "loss": 4.4413, "step": 19360 }, { "epoch": 8.699460916442048, "grad_norm": 3.78125, "learning_rate": 4.337569126099326e-05, "loss": 4.5407, "step": 19365 }, { "epoch": 8.701707097933513, "grad_norm": 3.875, "learning_rate": 4.3330303603890414e-05, "loss": 4.4828, "step": 19370 }, { "epoch": 8.703953279424978, "grad_norm": 3.765625, "learning_rate": 4.328498908482028e-05, "loss": 4.4302, "step": 19375 }, { "epoch": 8.706199460916443, "grad_norm": 3.59375, "learning_rate": 4.323974773102238e-05, "loss": 4.4499, "step": 19380 }, { "epoch": 8.708445642407906, "grad_norm": 3.859375, "learning_rate": 4.319457956969211e-05, "loss": 4.4702, "step": 19385 }, { "epoch": 8.71069182389937, "grad_norm": 3.984375, "learning_rate": 4.314948462798098e-05, "loss": 4.5252, "step": 19390 }, { "epoch": 8.712938005390836, "grad_norm": 3.890625, "learning_rate": 4.310446293299639e-05, "loss": 4.4829, "step": 19395 }, { "epoch": 8.7151841868823, "grad_norm": 4.0, "learning_rate": 4.3059514511801805e-05, "loss": 4.4736, "step": 19400 }, { "epoch": 8.717430368373766, "grad_norm": 3.765625, "learning_rate": 4.3014639391416595e-05, "loss": 4.482, "step": 19405 }, { "epoch": 8.719676549865229, "grad_norm": 3.734375, "learning_rate": 4.296983759881606e-05, "loss": 4.5219, "step": 19410 }, { "epoch": 8.721922731356694, "grad_norm": 3.59375, "learning_rate": 4.292510916093144e-05, "loss": 4.547, "step": 19415 }, { "epoch": 8.724168912848159, "grad_norm": 3.984375, "learning_rate": 4.288045410464986e-05, "loss": 4.3926, "step": 19420 }, { "epoch": 8.726415094339622, "grad_norm": 3.84375, "learning_rate": 4.2835872456814366e-05, "loss": 4.4436, "step": 19425 }, { "epoch": 8.728661275831087, "grad_norm": 3.765625, "learning_rate": 4.279136424422385e-05, "loss": 4.5177, "step": 19430 }, { "epoch": 8.730907457322552, "grad_norm": 3.859375, "learning_rate": 4.274692949363307e-05, "loss": 4.5139, "step": 19435 }, { "epoch": 8.733153638814017, "grad_norm": 4.0625, "learning_rate": 4.270256823175264e-05, "loss": 4.4525, "step": 19440 }, { "epoch": 8.735399820305481, "grad_norm": 3.796875, "learning_rate": 4.265828048524892e-05, "loss": 4.5383, "step": 19445 }, { "epoch": 8.737646001796945, "grad_norm": 4.0, "learning_rate": 4.261406628074422e-05, "loss": 4.418, "step": 19450 }, { "epoch": 8.73989218328841, "grad_norm": 3.921875, "learning_rate": 4.256992564481649e-05, "loss": 4.434, "step": 19455 }, { "epoch": 8.742138364779874, "grad_norm": 3.96875, "learning_rate": 4.252585860399959e-05, "loss": 4.4382, "step": 19460 }, { "epoch": 8.74438454627134, "grad_norm": 3.84375, "learning_rate": 4.248186518478307e-05, "loss": 4.4914, "step": 19465 }, { "epoch": 8.746630727762803, "grad_norm": 3.609375, "learning_rate": 4.2437945413612184e-05, "loss": 4.4838, "step": 19470 }, { "epoch": 8.748876909254268, "grad_norm": 4.0625, "learning_rate": 4.239409931688803e-05, "loss": 4.4763, "step": 19475 }, { "epoch": 8.751123090745732, "grad_norm": 4.0, "learning_rate": 4.235032692096729e-05, "loss": 4.4874, "step": 19480 }, { "epoch": 8.753369272237197, "grad_norm": 3.859375, "learning_rate": 4.230662825216248e-05, "loss": 4.4227, "step": 19485 }, { "epoch": 8.75561545372866, "grad_norm": 3.75, "learning_rate": 4.2263003336741655e-05, "loss": 4.4311, "step": 19490 }, { "epoch": 8.757861635220126, "grad_norm": 4.0, "learning_rate": 4.2219452200928656e-05, "loss": 4.4769, "step": 19495 }, { "epoch": 8.76010781671159, "grad_norm": 3.890625, "learning_rate": 4.21759748709029e-05, "loss": 4.4972, "step": 19500 }, { "epoch": 8.762353998203055, "grad_norm": 3.90625, "learning_rate": 4.213257137279943e-05, "loss": 4.4786, "step": 19505 }, { "epoch": 8.764600179694519, "grad_norm": 3.96875, "learning_rate": 4.208924173270897e-05, "loss": 4.4806, "step": 19510 }, { "epoch": 8.766846361185983, "grad_norm": 4.25, "learning_rate": 4.204598597667785e-05, "loss": 4.4575, "step": 19515 }, { "epoch": 8.769092542677448, "grad_norm": 4.21875, "learning_rate": 4.2002804130707865e-05, "loss": 4.4575, "step": 19520 }, { "epoch": 8.771338724168913, "grad_norm": 3.90625, "learning_rate": 4.1959696220756545e-05, "loss": 4.5042, "step": 19525 }, { "epoch": 8.773584905660378, "grad_norm": 4.15625, "learning_rate": 4.191666227273683e-05, "loss": 4.4264, "step": 19530 }, { "epoch": 8.775831087151841, "grad_norm": 4.09375, "learning_rate": 4.187370231251735e-05, "loss": 4.4629, "step": 19535 }, { "epoch": 8.778077268643306, "grad_norm": 3.65625, "learning_rate": 4.183081636592208e-05, "loss": 4.4501, "step": 19540 }, { "epoch": 8.780323450134771, "grad_norm": 4.15625, "learning_rate": 4.178800445873066e-05, "loss": 4.4359, "step": 19545 }, { "epoch": 8.782569631626234, "grad_norm": 3.890625, "learning_rate": 4.174526661667818e-05, "loss": 4.4594, "step": 19550 }, { "epoch": 8.7848158131177, "grad_norm": 3.765625, "learning_rate": 4.1702602865455136e-05, "loss": 4.4939, "step": 19555 }, { "epoch": 8.787061994609164, "grad_norm": 3.984375, "learning_rate": 4.166001323070761e-05, "loss": 4.4454, "step": 19560 }, { "epoch": 8.78930817610063, "grad_norm": 4.03125, "learning_rate": 4.161749773803698e-05, "loss": 4.5035, "step": 19565 }, { "epoch": 8.791554357592094, "grad_norm": 3.6875, "learning_rate": 4.15750564130002e-05, "loss": 4.3936, "step": 19570 }, { "epoch": 8.793800539083557, "grad_norm": 4.03125, "learning_rate": 4.153268928110961e-05, "loss": 4.4261, "step": 19575 }, { "epoch": 8.796046720575022, "grad_norm": 3.875, "learning_rate": 4.149039636783283e-05, "loss": 4.5206, "step": 19580 }, { "epoch": 8.798292902066487, "grad_norm": 4.15625, "learning_rate": 4.144817769859303e-05, "loss": 4.4361, "step": 19585 }, { "epoch": 8.800539083557952, "grad_norm": 4.03125, "learning_rate": 4.140603329876861e-05, "loss": 4.4497, "step": 19590 }, { "epoch": 8.802785265049415, "grad_norm": 3.890625, "learning_rate": 4.1363963193693495e-05, "loss": 4.4402, "step": 19595 }, { "epoch": 8.80503144654088, "grad_norm": 4.0, "learning_rate": 4.132196740865674e-05, "loss": 4.4708, "step": 19600 }, { "epoch": 8.807277628032345, "grad_norm": 3.796875, "learning_rate": 4.12800459689029e-05, "loss": 4.4493, "step": 19605 }, { "epoch": 8.80952380952381, "grad_norm": 3.90625, "learning_rate": 4.123819889963176e-05, "loss": 4.4969, "step": 19610 }, { "epoch": 8.811769991015273, "grad_norm": 3.78125, "learning_rate": 4.1196426225998374e-05, "loss": 4.4436, "step": 19615 }, { "epoch": 8.814016172506738, "grad_norm": 3.953125, "learning_rate": 4.115472797311318e-05, "loss": 4.459, "step": 19620 }, { "epoch": 8.816262353998203, "grad_norm": 4.15625, "learning_rate": 4.1113104166041736e-05, "loss": 4.4202, "step": 19625 }, { "epoch": 8.818508535489668, "grad_norm": 4.34375, "learning_rate": 4.107155482980499e-05, "loss": 4.5383, "step": 19630 }, { "epoch": 8.820754716981131, "grad_norm": 4.03125, "learning_rate": 4.103007998937901e-05, "loss": 4.4495, "step": 19635 }, { "epoch": 8.823000898472596, "grad_norm": 4.28125, "learning_rate": 4.098867966969516e-05, "loss": 4.4994, "step": 19640 }, { "epoch": 8.825247079964061, "grad_norm": 4.03125, "learning_rate": 4.094735389564e-05, "loss": 4.4804, "step": 19645 }, { "epoch": 8.827493261455526, "grad_norm": 3.90625, "learning_rate": 4.090610269205524e-05, "loss": 4.5048, "step": 19650 }, { "epoch": 8.82973944294699, "grad_norm": 3.765625, "learning_rate": 4.086492608373776e-05, "loss": 4.445, "step": 19655 }, { "epoch": 8.831985624438454, "grad_norm": 4.15625, "learning_rate": 4.0823824095439674e-05, "loss": 4.4986, "step": 19660 }, { "epoch": 8.83423180592992, "grad_norm": 3.953125, "learning_rate": 4.078279675186814e-05, "loss": 4.5123, "step": 19665 }, { "epoch": 8.836477987421384, "grad_norm": 4.0625, "learning_rate": 4.074184407768554e-05, "loss": 4.5034, "step": 19670 }, { "epoch": 8.838724168912847, "grad_norm": 3.78125, "learning_rate": 4.07009660975093e-05, "loss": 4.4861, "step": 19675 }, { "epoch": 8.840970350404312, "grad_norm": 3.9375, "learning_rate": 4.066016283591198e-05, "loss": 4.4408, "step": 19680 }, { "epoch": 8.843216531895777, "grad_norm": 4.0, "learning_rate": 4.0619434317421205e-05, "loss": 4.4868, "step": 19685 }, { "epoch": 8.845462713387242, "grad_norm": 4.0, "learning_rate": 4.0578780566519715e-05, "loss": 4.4404, "step": 19690 }, { "epoch": 8.847708894878707, "grad_norm": 3.9375, "learning_rate": 4.053820160764526e-05, "loss": 4.4598, "step": 19695 }, { "epoch": 8.84995507637017, "grad_norm": 4.125, "learning_rate": 4.0497697465190625e-05, "loss": 4.564, "step": 19700 }, { "epoch": 8.852201257861635, "grad_norm": 4.03125, "learning_rate": 4.045726816350369e-05, "loss": 4.4761, "step": 19705 }, { "epoch": 8.8544474393531, "grad_norm": 3.90625, "learning_rate": 4.0416913726887224e-05, "loss": 4.4506, "step": 19710 }, { "epoch": 8.856693620844565, "grad_norm": 4.0, "learning_rate": 4.0376634179599135e-05, "loss": 4.4752, "step": 19715 }, { "epoch": 8.858939802336028, "grad_norm": 4.125, "learning_rate": 4.033642954585224e-05, "loss": 4.4495, "step": 19720 }, { "epoch": 8.861185983827493, "grad_norm": 3.984375, "learning_rate": 4.029629984981427e-05, "loss": 4.4673, "step": 19725 }, { "epoch": 8.863432165318958, "grad_norm": 4.0625, "learning_rate": 4.025624511560806e-05, "loss": 4.4779, "step": 19730 }, { "epoch": 8.865678346810423, "grad_norm": 3.84375, "learning_rate": 4.021626536731121e-05, "loss": 4.4455, "step": 19735 }, { "epoch": 8.867924528301886, "grad_norm": 3.71875, "learning_rate": 4.0176360628956395e-05, "loss": 4.4714, "step": 19740 }, { "epoch": 8.870170709793351, "grad_norm": 3.6875, "learning_rate": 4.0136530924531075e-05, "loss": 4.5017, "step": 19745 }, { "epoch": 8.872416891284816, "grad_norm": 4.46875, "learning_rate": 4.009677627797768e-05, "loss": 4.4362, "step": 19750 }, { "epoch": 8.874663072776281, "grad_norm": 3.921875, "learning_rate": 4.005709671319355e-05, "loss": 4.4355, "step": 19755 }, { "epoch": 8.876909254267744, "grad_norm": 4.0, "learning_rate": 4.00174922540308e-05, "loss": 4.4632, "step": 19760 }, { "epoch": 8.879155435759209, "grad_norm": 3.90625, "learning_rate": 3.997796292429645e-05, "loss": 4.4354, "step": 19765 }, { "epoch": 8.881401617250674, "grad_norm": 4.0, "learning_rate": 3.993850874775237e-05, "loss": 4.4726, "step": 19770 }, { "epoch": 8.883647798742139, "grad_norm": 3.515625, "learning_rate": 3.989912974811521e-05, "loss": 4.5184, "step": 19775 }, { "epoch": 8.885893980233602, "grad_norm": 4.15625, "learning_rate": 3.98598259490565e-05, "loss": 4.4391, "step": 19780 }, { "epoch": 8.888140161725067, "grad_norm": 3.921875, "learning_rate": 3.982059737420249e-05, "loss": 4.4652, "step": 19785 }, { "epoch": 8.890386343216532, "grad_norm": 3.8125, "learning_rate": 3.978144404713424e-05, "loss": 4.4924, "step": 19790 }, { "epoch": 8.892632524707997, "grad_norm": 4.03125, "learning_rate": 3.974236599138759e-05, "loss": 4.4596, "step": 19795 }, { "epoch": 8.89487870619946, "grad_norm": 3.9375, "learning_rate": 3.970336323045314e-05, "loss": 4.4982, "step": 19800 }, { "epoch": 8.897124887690925, "grad_norm": 3.609375, "learning_rate": 3.9664435787776164e-05, "loss": 4.4997, "step": 19805 }, { "epoch": 8.89937106918239, "grad_norm": 3.765625, "learning_rate": 3.9625583686756766e-05, "loss": 4.4742, "step": 19810 }, { "epoch": 8.901617250673855, "grad_norm": 4.0, "learning_rate": 3.958680695074968e-05, "loss": 4.4771, "step": 19815 }, { "epoch": 8.90386343216532, "grad_norm": 3.921875, "learning_rate": 3.954810560306433e-05, "loss": 4.53, "step": 19820 }, { "epoch": 8.906109613656783, "grad_norm": 3.921875, "learning_rate": 3.950947966696488e-05, "loss": 4.3864, "step": 19825 }, { "epoch": 8.908355795148248, "grad_norm": 3.9375, "learning_rate": 3.947092916567015e-05, "loss": 4.606, "step": 19830 }, { "epoch": 8.910601976639713, "grad_norm": 3.671875, "learning_rate": 3.943245412235356e-05, "loss": 4.4162, "step": 19835 }, { "epoch": 8.912848158131178, "grad_norm": 4.0, "learning_rate": 3.939405456014328e-05, "loss": 4.4389, "step": 19840 }, { "epoch": 8.915094339622641, "grad_norm": 4.125, "learning_rate": 3.935573050212193e-05, "loss": 4.4811, "step": 19845 }, { "epoch": 8.917340521114106, "grad_norm": 3.90625, "learning_rate": 3.931748197132697e-05, "loss": 4.4041, "step": 19850 }, { "epoch": 8.91958670260557, "grad_norm": 4.0625, "learning_rate": 3.9279308990750244e-05, "loss": 4.4089, "step": 19855 }, { "epoch": 8.921832884097036, "grad_norm": 3.8125, "learning_rate": 3.924121158333831e-05, "loss": 4.4629, "step": 19860 }, { "epoch": 8.924079065588499, "grad_norm": 3.796875, "learning_rate": 3.92031897719923e-05, "loss": 4.5195, "step": 19865 }, { "epoch": 8.926325247079964, "grad_norm": 4.03125, "learning_rate": 3.916524357956781e-05, "loss": 4.4806, "step": 19870 }, { "epoch": 8.928571428571429, "grad_norm": 3.921875, "learning_rate": 3.9127373028875096e-05, "loss": 4.4477, "step": 19875 }, { "epoch": 8.930817610062894, "grad_norm": 3.984375, "learning_rate": 3.908957814267883e-05, "loss": 4.5098, "step": 19880 }, { "epoch": 8.933063791554357, "grad_norm": 4.0625, "learning_rate": 3.90518589436983e-05, "loss": 4.5037, "step": 19885 }, { "epoch": 8.935309973045822, "grad_norm": 3.65625, "learning_rate": 3.901421545460721e-05, "loss": 4.408, "step": 19890 }, { "epoch": 8.937556154537287, "grad_norm": 3.984375, "learning_rate": 3.8976647698033825e-05, "loss": 4.4674, "step": 19895 }, { "epoch": 8.939802336028752, "grad_norm": 3.96875, "learning_rate": 3.8939155696560876e-05, "loss": 4.4573, "step": 19900 }, { "epoch": 8.942048517520215, "grad_norm": 3.984375, "learning_rate": 3.8901739472725504e-05, "loss": 4.3736, "step": 19905 }, { "epoch": 8.94429469901168, "grad_norm": 4.25, "learning_rate": 3.8864399049019366e-05, "loss": 4.4087, "step": 19910 }, { "epoch": 8.946540880503145, "grad_norm": 3.828125, "learning_rate": 3.8827134447888464e-05, "loss": 4.4721, "step": 19915 }, { "epoch": 8.94878706199461, "grad_norm": 4.03125, "learning_rate": 3.8789945691733335e-05, "loss": 4.443, "step": 19920 }, { "epoch": 8.951033243486073, "grad_norm": 3.859375, "learning_rate": 3.875283280290885e-05, "loss": 4.4484, "step": 19925 }, { "epoch": 8.953279424977538, "grad_norm": 4.09375, "learning_rate": 3.871579580372429e-05, "loss": 4.4923, "step": 19930 }, { "epoch": 8.955525606469003, "grad_norm": 3.984375, "learning_rate": 3.8678834716443316e-05, "loss": 4.4644, "step": 19935 }, { "epoch": 8.957771787960468, "grad_norm": 3.8125, "learning_rate": 3.8641949563283965e-05, "loss": 4.4395, "step": 19940 }, { "epoch": 8.960017969451933, "grad_norm": 4.03125, "learning_rate": 3.8605140366418616e-05, "loss": 4.4335, "step": 19945 }, { "epoch": 8.962264150943396, "grad_norm": 3.96875, "learning_rate": 3.8568407147973994e-05, "loss": 4.4393, "step": 19950 }, { "epoch": 8.96451033243486, "grad_norm": 3.96875, "learning_rate": 3.8531749930031154e-05, "loss": 4.4614, "step": 19955 }, { "epoch": 8.966756513926326, "grad_norm": 3.75, "learning_rate": 3.84951687346255e-05, "loss": 4.5097, "step": 19960 }, { "epoch": 8.969002695417789, "grad_norm": 4.125, "learning_rate": 3.8458663583746685e-05, "loss": 4.5181, "step": 19965 }, { "epoch": 8.971248876909254, "grad_norm": 4.0, "learning_rate": 3.8422234499338634e-05, "loss": 4.4566, "step": 19970 }, { "epoch": 8.973495058400719, "grad_norm": 4.03125, "learning_rate": 3.838588150329963e-05, "loss": 4.52, "step": 19975 }, { "epoch": 8.975741239892184, "grad_norm": 4.375, "learning_rate": 3.834960461748213e-05, "loss": 4.4874, "step": 19980 }, { "epoch": 8.977987421383649, "grad_norm": 3.796875, "learning_rate": 3.8313403863692926e-05, "loss": 4.4312, "step": 19985 }, { "epoch": 8.980233602875112, "grad_norm": 4.0, "learning_rate": 3.8277279263692926e-05, "loss": 4.4864, "step": 19990 }, { "epoch": 8.982479784366577, "grad_norm": 4.03125, "learning_rate": 3.824123083919743e-05, "loss": 4.4817, "step": 19995 }, { "epoch": 8.984725965858042, "grad_norm": 3.90625, "learning_rate": 3.820525861187575e-05, "loss": 4.5165, "step": 20000 }, { "epoch": 8.984725965858042, "eval_loss": 4.77856969833374, "eval_runtime": 16.1346, "eval_samples_per_second": 1922.144, "eval_steps_per_second": 240.291, "step": 20000 }, { "epoch": 8.986972147349507, "grad_norm": 4.15625, "learning_rate": 3.816936260335156e-05, "loss": 4.4603, "step": 20005 }, { "epoch": 8.98921832884097, "grad_norm": 4.0, "learning_rate": 3.8133542835202646e-05, "loss": 4.3947, "step": 20010 }, { "epoch": 8.991464510332435, "grad_norm": 4.0, "learning_rate": 3.809779932896095e-05, "loss": 4.4674, "step": 20015 }, { "epoch": 8.9937106918239, "grad_norm": 3.953125, "learning_rate": 3.8062132106112625e-05, "loss": 4.4778, "step": 20020 }, { "epoch": 8.995956873315365, "grad_norm": 4.0625, "learning_rate": 3.802654118809788e-05, "loss": 4.5111, "step": 20025 }, { "epoch": 8.998203054806828, "grad_norm": 4.0625, "learning_rate": 3.7991026596311175e-05, "loss": 4.5153, "step": 20030 }, { "epoch": 9.000449236298293, "grad_norm": 4.15625, "learning_rate": 3.795558835210098e-05, "loss": 4.4467, "step": 20035 }, { "epoch": 9.002695417789758, "grad_norm": 3.703125, "learning_rate": 3.7920226476769924e-05, "loss": 4.4839, "step": 20040 }, { "epoch": 9.004941599281223, "grad_norm": 3.9375, "learning_rate": 3.788494099157474e-05, "loss": 4.4578, "step": 20045 }, { "epoch": 9.007187780772686, "grad_norm": 4.0625, "learning_rate": 3.7849731917726205e-05, "loss": 4.4219, "step": 20050 }, { "epoch": 9.00943396226415, "grad_norm": 3.90625, "learning_rate": 3.78145992763892e-05, "loss": 4.4764, "step": 20055 }, { "epoch": 9.011680143755616, "grad_norm": 4.15625, "learning_rate": 3.777954308868263e-05, "loss": 4.4311, "step": 20060 }, { "epoch": 9.01392632524708, "grad_norm": 3.71875, "learning_rate": 3.774456337567944e-05, "loss": 4.446, "step": 20065 }, { "epoch": 9.016172506738544, "grad_norm": 4.125, "learning_rate": 3.770966015840665e-05, "loss": 4.4513, "step": 20070 }, { "epoch": 9.018418688230009, "grad_norm": 4.25, "learning_rate": 3.767483345784523e-05, "loss": 4.4918, "step": 20075 }, { "epoch": 9.020664869721474, "grad_norm": 4.0, "learning_rate": 3.764008329493025e-05, "loss": 4.4665, "step": 20080 }, { "epoch": 9.022911051212938, "grad_norm": 4.09375, "learning_rate": 3.760540969055065e-05, "loss": 4.4682, "step": 20085 }, { "epoch": 9.025157232704403, "grad_norm": 3.90625, "learning_rate": 3.7570812665549446e-05, "loss": 4.4813, "step": 20090 }, { "epoch": 9.027403414195867, "grad_norm": 3.828125, "learning_rate": 3.753629224072356e-05, "loss": 4.4417, "step": 20095 }, { "epoch": 9.029649595687331, "grad_norm": 4.03125, "learning_rate": 3.750184843682391e-05, "loss": 4.4383, "step": 20100 }, { "epoch": 9.031895777178796, "grad_norm": 3.765625, "learning_rate": 3.746748127455536e-05, "loss": 4.4592, "step": 20105 }, { "epoch": 9.034141958670261, "grad_norm": 4.09375, "learning_rate": 3.7433190774576636e-05, "loss": 4.4933, "step": 20110 }, { "epoch": 9.036388140161725, "grad_norm": 3.875, "learning_rate": 3.739897695750048e-05, "loss": 4.4422, "step": 20115 }, { "epoch": 9.03863432165319, "grad_norm": 3.921875, "learning_rate": 3.7364839843893435e-05, "loss": 4.4862, "step": 20120 }, { "epoch": 9.040880503144654, "grad_norm": 3.8125, "learning_rate": 3.733077945427603e-05, "loss": 4.4359, "step": 20125 }, { "epoch": 9.04312668463612, "grad_norm": 3.59375, "learning_rate": 3.729679580912262e-05, "loss": 4.5098, "step": 20130 }, { "epoch": 9.045372866127583, "grad_norm": 3.953125, "learning_rate": 3.726288892886141e-05, "loss": 4.4862, "step": 20135 }, { "epoch": 9.047619047619047, "grad_norm": 4.0, "learning_rate": 3.7229058833874525e-05, "loss": 4.3972, "step": 20140 }, { "epoch": 9.049865229110512, "grad_norm": 3.703125, "learning_rate": 3.7195305544497864e-05, "loss": 4.4993, "step": 20145 }, { "epoch": 9.052111410601977, "grad_norm": 3.890625, "learning_rate": 3.71616290810212e-05, "loss": 4.4783, "step": 20150 }, { "epoch": 9.05435759209344, "grad_norm": 3.984375, "learning_rate": 3.712802946368816e-05, "loss": 4.4746, "step": 20155 }, { "epoch": 9.056603773584905, "grad_norm": 3.921875, "learning_rate": 3.709450671269606e-05, "loss": 4.5272, "step": 20160 }, { "epoch": 9.05884995507637, "grad_norm": 4.0625, "learning_rate": 3.706106084819612e-05, "loss": 4.4499, "step": 20165 }, { "epoch": 9.061096136567835, "grad_norm": 3.96875, "learning_rate": 3.7027691890293285e-05, "loss": 4.4562, "step": 20170 }, { "epoch": 9.063342318059298, "grad_norm": 4.0625, "learning_rate": 3.6994399859046304e-05, "loss": 4.5038, "step": 20175 }, { "epoch": 9.065588499550763, "grad_norm": 3.96875, "learning_rate": 3.6961184774467666e-05, "loss": 4.4168, "step": 20180 }, { "epoch": 9.067834681042228, "grad_norm": 3.90625, "learning_rate": 3.69280466565236e-05, "loss": 4.4731, "step": 20185 }, { "epoch": 9.070080862533693, "grad_norm": 4.0625, "learning_rate": 3.689498552513408e-05, "loss": 4.4363, "step": 20190 }, { "epoch": 9.072327044025156, "grad_norm": 4.15625, "learning_rate": 3.68620014001728e-05, "loss": 4.4345, "step": 20195 }, { "epoch": 9.074573225516621, "grad_norm": 3.984375, "learning_rate": 3.682909430146715e-05, "loss": 4.4821, "step": 20200 }, { "epoch": 9.076819407008086, "grad_norm": 4.125, "learning_rate": 3.679626424879825e-05, "loss": 4.403, "step": 20205 }, { "epoch": 9.079065588499551, "grad_norm": 3.921875, "learning_rate": 3.676351126190086e-05, "loss": 4.4852, "step": 20210 }, { "epoch": 9.081311769991014, "grad_norm": 4.15625, "learning_rate": 3.6730835360463476e-05, "loss": 4.4251, "step": 20215 }, { "epoch": 9.08355795148248, "grad_norm": 4.0625, "learning_rate": 3.6698236564128184e-05, "loss": 4.4048, "step": 20220 }, { "epoch": 9.085804132973944, "grad_norm": 4.03125, "learning_rate": 3.666571489249081e-05, "loss": 4.4439, "step": 20225 }, { "epoch": 9.08805031446541, "grad_norm": 3.796875, "learning_rate": 3.6633270365100696e-05, "loss": 4.4273, "step": 20230 }, { "epoch": 9.090296495956874, "grad_norm": 3.984375, "learning_rate": 3.6600903001460934e-05, "loss": 4.4236, "step": 20235 }, { "epoch": 9.092542677448337, "grad_norm": 3.90625, "learning_rate": 3.656861282102816e-05, "loss": 4.4979, "step": 20240 }, { "epoch": 9.094788858939802, "grad_norm": 4.03125, "learning_rate": 3.653639984321262e-05, "loss": 4.4669, "step": 20245 }, { "epoch": 9.097035040431267, "grad_norm": 3.84375, "learning_rate": 3.65042640873782e-05, "loss": 4.3567, "step": 20250 }, { "epoch": 9.099281221922732, "grad_norm": 4.1875, "learning_rate": 3.6472205572842304e-05, "loss": 4.5483, "step": 20255 }, { "epoch": 9.101527403414195, "grad_norm": 3.796875, "learning_rate": 3.6440224318875944e-05, "loss": 4.3891, "step": 20260 }, { "epoch": 9.10377358490566, "grad_norm": 4.21875, "learning_rate": 3.640832034470366e-05, "loss": 4.4508, "step": 20265 }, { "epoch": 9.106019766397125, "grad_norm": 4.0625, "learning_rate": 3.637649366950357e-05, "loss": 4.5023, "step": 20270 }, { "epoch": 9.10826594788859, "grad_norm": 3.90625, "learning_rate": 3.6344744312407325e-05, "loss": 4.4781, "step": 20275 }, { "epoch": 9.110512129380053, "grad_norm": 4.15625, "learning_rate": 3.631307229250003e-05, "loss": 4.4442, "step": 20280 }, { "epoch": 9.112758310871518, "grad_norm": 3.828125, "learning_rate": 3.628147762882046e-05, "loss": 4.4639, "step": 20285 }, { "epoch": 9.115004492362983, "grad_norm": 4.125, "learning_rate": 3.624996034036065e-05, "loss": 4.5477, "step": 20290 }, { "epoch": 9.117250673854448, "grad_norm": 4.0, "learning_rate": 3.6218520446066334e-05, "loss": 4.4283, "step": 20295 }, { "epoch": 9.119496855345911, "grad_norm": 3.96875, "learning_rate": 3.6187157964836664e-05, "loss": 4.4165, "step": 20300 }, { "epoch": 9.121743036837376, "grad_norm": 3.859375, "learning_rate": 3.6155872915524195e-05, "loss": 4.4519, "step": 20305 }, { "epoch": 9.123989218328841, "grad_norm": 3.890625, "learning_rate": 3.6124665316935e-05, "loss": 4.4878, "step": 20310 }, { "epoch": 9.126235399820306, "grad_norm": 4.125, "learning_rate": 3.609353518782858e-05, "loss": 4.4873, "step": 20315 }, { "epoch": 9.12848158131177, "grad_norm": 4.09375, "learning_rate": 3.6062482546917844e-05, "loss": 4.4115, "step": 20320 }, { "epoch": 9.130727762803234, "grad_norm": 3.765625, "learning_rate": 3.603150741286919e-05, "loss": 4.5559, "step": 20325 }, { "epoch": 9.132973944294699, "grad_norm": 3.78125, "learning_rate": 3.600060980430231e-05, "loss": 4.4489, "step": 20330 }, { "epoch": 9.135220125786164, "grad_norm": 3.875, "learning_rate": 3.5969789739790416e-05, "loss": 4.4855, "step": 20335 }, { "epoch": 9.137466307277627, "grad_norm": 3.90625, "learning_rate": 3.593904723786002e-05, "loss": 4.4637, "step": 20340 }, { "epoch": 9.139712488769092, "grad_norm": 3.953125, "learning_rate": 3.5908382316991077e-05, "loss": 4.418, "step": 20345 }, { "epoch": 9.141958670260557, "grad_norm": 4.1875, "learning_rate": 3.5877794995616825e-05, "loss": 4.3961, "step": 20350 }, { "epoch": 9.144204851752022, "grad_norm": 4.0, "learning_rate": 3.5847285292123914e-05, "loss": 4.4948, "step": 20355 }, { "epoch": 9.146451033243487, "grad_norm": 3.90625, "learning_rate": 3.581685322485234e-05, "loss": 4.438, "step": 20360 }, { "epoch": 9.14869721473495, "grad_norm": 3.921875, "learning_rate": 3.5786498812095394e-05, "loss": 4.4238, "step": 20365 }, { "epoch": 9.150943396226415, "grad_norm": 3.890625, "learning_rate": 3.5756222072099744e-05, "loss": 4.4014, "step": 20370 }, { "epoch": 9.15318957771788, "grad_norm": 3.703125, "learning_rate": 3.572602302306527e-05, "loss": 4.5125, "step": 20375 }, { "epoch": 9.155435759209345, "grad_norm": 3.796875, "learning_rate": 3.569590168314526e-05, "loss": 4.4185, "step": 20380 }, { "epoch": 9.157681940700808, "grad_norm": 4.15625, "learning_rate": 3.566585807044621e-05, "loss": 4.4771, "step": 20385 }, { "epoch": 9.159928122192273, "grad_norm": 3.5625, "learning_rate": 3.563589220302793e-05, "loss": 4.4659, "step": 20390 }, { "epoch": 9.162174303683738, "grad_norm": 4.125, "learning_rate": 3.560600409890352e-05, "loss": 4.4403, "step": 20395 }, { "epoch": 9.164420485175203, "grad_norm": 4.09375, "learning_rate": 3.5576193776039254e-05, "loss": 4.5184, "step": 20400 }, { "epoch": 9.166666666666666, "grad_norm": 4.0, "learning_rate": 3.554646125235475e-05, "loss": 4.4681, "step": 20405 }, { "epoch": 9.168912848158131, "grad_norm": 3.75, "learning_rate": 3.551680654572276e-05, "loss": 4.4061, "step": 20410 }, { "epoch": 9.171159029649596, "grad_norm": 3.953125, "learning_rate": 3.548722967396934e-05, "loss": 4.4493, "step": 20415 }, { "epoch": 9.173405211141061, "grad_norm": 4.0, "learning_rate": 3.5457730654873724e-05, "loss": 4.4838, "step": 20420 }, { "epoch": 9.175651392632524, "grad_norm": 3.90625, "learning_rate": 3.5428309506168314e-05, "loss": 4.4205, "step": 20425 }, { "epoch": 9.177897574123989, "grad_norm": 4.125, "learning_rate": 3.5398966245538796e-05, "loss": 4.4896, "step": 20430 }, { "epoch": 9.180143755615454, "grad_norm": 4.15625, "learning_rate": 3.5369700890623934e-05, "loss": 4.4758, "step": 20435 }, { "epoch": 9.182389937106919, "grad_norm": 4.0625, "learning_rate": 3.534051345901573e-05, "loss": 4.493, "step": 20440 }, { "epoch": 9.184636118598382, "grad_norm": 4.0, "learning_rate": 3.53114039682593e-05, "loss": 4.4232, "step": 20445 }, { "epoch": 9.186882300089847, "grad_norm": 3.8125, "learning_rate": 3.5282372435852935e-05, "loss": 4.437, "step": 20450 }, { "epoch": 9.189128481581312, "grad_norm": 4.0625, "learning_rate": 3.5253418879248056e-05, "loss": 4.5357, "step": 20455 }, { "epoch": 9.191374663072777, "grad_norm": 4.15625, "learning_rate": 3.52245433158492e-05, "loss": 4.4931, "step": 20460 }, { "epoch": 9.19362084456424, "grad_norm": 4.0, "learning_rate": 3.519574576301405e-05, "loss": 4.4336, "step": 20465 }, { "epoch": 9.195867026055705, "grad_norm": 4.0, "learning_rate": 3.516702623805339e-05, "loss": 4.4578, "step": 20470 }, { "epoch": 9.19811320754717, "grad_norm": 3.96875, "learning_rate": 3.5138384758231055e-05, "loss": 4.4428, "step": 20475 }, { "epoch": 9.200359389038635, "grad_norm": 4.09375, "learning_rate": 3.5109821340764016e-05, "loss": 4.4737, "step": 20480 }, { "epoch": 9.202605570530098, "grad_norm": 3.9375, "learning_rate": 3.50813360028223e-05, "loss": 4.5054, "step": 20485 }, { "epoch": 9.204851752021563, "grad_norm": 4.0625, "learning_rate": 3.5052928761529004e-05, "loss": 4.4779, "step": 20490 }, { "epoch": 9.207097933513028, "grad_norm": 3.921875, "learning_rate": 3.502459963396027e-05, "loss": 4.4074, "step": 20495 }, { "epoch": 9.209344115004493, "grad_norm": 3.734375, "learning_rate": 3.4996348637145285e-05, "loss": 4.5052, "step": 20500 }, { "epoch": 9.211590296495958, "grad_norm": 4.46875, "learning_rate": 3.49681757880663e-05, "loss": 4.4463, "step": 20505 }, { "epoch": 9.213836477987421, "grad_norm": 3.984375, "learning_rate": 3.4940081103658546e-05, "loss": 4.4598, "step": 20510 }, { "epoch": 9.216082659478886, "grad_norm": 4.0, "learning_rate": 3.491206460081028e-05, "loss": 4.4375, "step": 20515 }, { "epoch": 9.21832884097035, "grad_norm": 4.0625, "learning_rate": 3.4884126296362766e-05, "loss": 4.4494, "step": 20520 }, { "epoch": 9.220575022461816, "grad_norm": 3.90625, "learning_rate": 3.48562662071103e-05, "loss": 4.4271, "step": 20525 }, { "epoch": 9.222821203953279, "grad_norm": 4.03125, "learning_rate": 3.48284843498001e-05, "loss": 4.4001, "step": 20530 }, { "epoch": 9.225067385444744, "grad_norm": 4.3125, "learning_rate": 3.4800780741132374e-05, "loss": 4.4296, "step": 20535 }, { "epoch": 9.227313566936209, "grad_norm": 3.765625, "learning_rate": 3.477315539776034e-05, "loss": 4.4655, "step": 20540 }, { "epoch": 9.229559748427674, "grad_norm": 3.890625, "learning_rate": 3.474560833629008e-05, "loss": 4.4903, "step": 20545 }, { "epoch": 9.231805929919137, "grad_norm": 4.21875, "learning_rate": 3.471813957328072e-05, "loss": 4.4421, "step": 20550 }, { "epoch": 9.234052111410602, "grad_norm": 3.609375, "learning_rate": 3.4690749125244233e-05, "loss": 4.4794, "step": 20555 }, { "epoch": 9.236298292902067, "grad_norm": 4.21875, "learning_rate": 3.466343700864557e-05, "loss": 4.5095, "step": 20560 }, { "epoch": 9.238544474393532, "grad_norm": 4.0625, "learning_rate": 3.46362032399026e-05, "loss": 4.4533, "step": 20565 }, { "epoch": 9.240790655884995, "grad_norm": 3.75, "learning_rate": 3.460904783538602e-05, "loss": 4.486, "step": 20570 }, { "epoch": 9.24303683737646, "grad_norm": 3.734375, "learning_rate": 3.4581970811419535e-05, "loss": 4.4751, "step": 20575 }, { "epoch": 9.245283018867925, "grad_norm": 4.0, "learning_rate": 3.4554972184279635e-05, "loss": 4.472, "step": 20580 }, { "epoch": 9.24752920035939, "grad_norm": 4.125, "learning_rate": 3.452805197019573e-05, "loss": 4.4159, "step": 20585 }, { "epoch": 9.249775381850853, "grad_norm": 4.0625, "learning_rate": 3.450121018535008e-05, "loss": 4.4319, "step": 20590 }, { "epoch": 9.252021563342318, "grad_norm": 4.0, "learning_rate": 3.447444684587781e-05, "loss": 4.4907, "step": 20595 }, { "epoch": 9.254267744833783, "grad_norm": 4.1875, "learning_rate": 3.4447761967866926e-05, "loss": 4.4595, "step": 20600 }, { "epoch": 9.256513926325248, "grad_norm": 4.09375, "learning_rate": 3.442115556735816e-05, "loss": 4.5408, "step": 20605 }, { "epoch": 9.25876010781671, "grad_norm": 3.734375, "learning_rate": 3.439462766034518e-05, "loss": 4.4267, "step": 20610 }, { "epoch": 9.261006289308176, "grad_norm": 3.8125, "learning_rate": 3.4368178262774435e-05, "loss": 4.4546, "step": 20615 }, { "epoch": 9.26325247079964, "grad_norm": 3.65625, "learning_rate": 3.434180739054515e-05, "loss": 4.4058, "step": 20620 }, { "epoch": 9.265498652291106, "grad_norm": 4.125, "learning_rate": 3.4315515059509406e-05, "loss": 4.4089, "step": 20625 }, { "epoch": 9.267744833782569, "grad_norm": 4.09375, "learning_rate": 3.4289301285471984e-05, "loss": 4.489, "step": 20630 }, { "epoch": 9.269991015274034, "grad_norm": 4.0, "learning_rate": 3.4263166084190556e-05, "loss": 4.4804, "step": 20635 }, { "epoch": 9.272237196765499, "grad_norm": 4.25, "learning_rate": 3.423710947137547e-05, "loss": 4.4848, "step": 20640 }, { "epoch": 9.274483378256964, "grad_norm": 4.1875, "learning_rate": 3.421113146268986e-05, "loss": 4.4427, "step": 20645 }, { "epoch": 9.276729559748428, "grad_norm": 4.0625, "learning_rate": 3.418523207374963e-05, "loss": 4.4495, "step": 20650 }, { "epoch": 9.278975741239892, "grad_norm": 4.03125, "learning_rate": 3.4159411320123404e-05, "loss": 4.5049, "step": 20655 }, { "epoch": 9.281221922731357, "grad_norm": 3.859375, "learning_rate": 3.413366921733255e-05, "loss": 4.4524, "step": 20660 }, { "epoch": 9.283468104222822, "grad_norm": 3.953125, "learning_rate": 3.410800578085113e-05, "loss": 4.4708, "step": 20665 }, { "epoch": 9.285714285714286, "grad_norm": 3.921875, "learning_rate": 3.408242102610594e-05, "loss": 4.4779, "step": 20670 }, { "epoch": 9.28796046720575, "grad_norm": 4.1875, "learning_rate": 3.405691496847651e-05, "loss": 4.5344, "step": 20675 }, { "epoch": 9.290206648697215, "grad_norm": 3.9375, "learning_rate": 3.403148762329497e-05, "loss": 4.4469, "step": 20680 }, { "epoch": 9.29245283018868, "grad_norm": 3.578125, "learning_rate": 3.4006139005846275e-05, "loss": 4.4822, "step": 20685 }, { "epoch": 9.294699011680144, "grad_norm": 4.03125, "learning_rate": 3.398086913136789e-05, "loss": 4.4309, "step": 20690 }, { "epoch": 9.296945193171608, "grad_norm": 4.09375, "learning_rate": 3.3955678015050085e-05, "loss": 4.4726, "step": 20695 }, { "epoch": 9.299191374663073, "grad_norm": 3.921875, "learning_rate": 3.3930565672035704e-05, "loss": 4.4887, "step": 20700 }, { "epoch": 9.301437556154537, "grad_norm": 4.03125, "learning_rate": 3.3905532117420285e-05, "loss": 4.4062, "step": 20705 }, { "epoch": 9.303683737646002, "grad_norm": 3.953125, "learning_rate": 3.388057736625198e-05, "loss": 4.4559, "step": 20710 }, { "epoch": 9.305929919137466, "grad_norm": 4.0, "learning_rate": 3.3855701433531565e-05, "loss": 4.4399, "step": 20715 }, { "epoch": 9.30817610062893, "grad_norm": 4.0625, "learning_rate": 3.383090433421249e-05, "loss": 4.4522, "step": 20720 }, { "epoch": 9.310422282120395, "grad_norm": 4.15625, "learning_rate": 3.380618608320073e-05, "loss": 4.5295, "step": 20725 }, { "epoch": 9.31266846361186, "grad_norm": 4.0, "learning_rate": 3.378154669535494e-05, "loss": 4.4936, "step": 20730 }, { "epoch": 9.314914645103324, "grad_norm": 3.828125, "learning_rate": 3.3756986185486315e-05, "loss": 4.4374, "step": 20735 }, { "epoch": 9.317160826594789, "grad_norm": 4.09375, "learning_rate": 3.373250456835867e-05, "loss": 4.4132, "step": 20740 }, { "epoch": 9.319407008086253, "grad_norm": 3.9375, "learning_rate": 3.37081018586884e-05, "loss": 4.4009, "step": 20745 }, { "epoch": 9.321653189577718, "grad_norm": 4.1875, "learning_rate": 3.368377807114441e-05, "loss": 4.4279, "step": 20750 }, { "epoch": 9.323899371069182, "grad_norm": 3.921875, "learning_rate": 3.365953322034823e-05, "loss": 4.4392, "step": 20755 }, { "epoch": 9.326145552560646, "grad_norm": 3.828125, "learning_rate": 3.3635367320873925e-05, "loss": 4.4438, "step": 20760 }, { "epoch": 9.328391734052111, "grad_norm": 4.125, "learning_rate": 3.361128038724807e-05, "loss": 4.4955, "step": 20765 }, { "epoch": 9.330637915543576, "grad_norm": 3.796875, "learning_rate": 3.3587272433949785e-05, "loss": 4.4454, "step": 20770 }, { "epoch": 9.332884097035041, "grad_norm": 3.953125, "learning_rate": 3.356334347541074e-05, "loss": 4.4515, "step": 20775 }, { "epoch": 9.335130278526504, "grad_norm": 4.03125, "learning_rate": 3.3539493526015084e-05, "loss": 4.4467, "step": 20780 }, { "epoch": 9.33737646001797, "grad_norm": 4.125, "learning_rate": 3.351572260009951e-05, "loss": 4.4908, "step": 20785 }, { "epoch": 9.339622641509434, "grad_norm": 4.09375, "learning_rate": 3.3492030711953147e-05, "loss": 4.4666, "step": 20790 }, { "epoch": 9.3418688230009, "grad_norm": 3.734375, "learning_rate": 3.3468417875817694e-05, "loss": 4.4878, "step": 20795 }, { "epoch": 9.344115004492362, "grad_norm": 4.03125, "learning_rate": 3.3444884105887275e-05, "loss": 4.502, "step": 20800 }, { "epoch": 9.346361185983827, "grad_norm": 4.03125, "learning_rate": 3.3421429416308485e-05, "loss": 4.4959, "step": 20805 }, { "epoch": 9.348607367475292, "grad_norm": 3.9375, "learning_rate": 3.3398053821180397e-05, "loss": 4.4766, "step": 20810 }, { "epoch": 9.350853548966757, "grad_norm": 4.03125, "learning_rate": 3.337475733455456e-05, "loss": 4.4401, "step": 20815 }, { "epoch": 9.35309973045822, "grad_norm": 3.859375, "learning_rate": 3.335153997043494e-05, "loss": 4.5102, "step": 20820 }, { "epoch": 9.355345911949685, "grad_norm": 3.703125, "learning_rate": 3.332840174277793e-05, "loss": 4.437, "step": 20825 }, { "epoch": 9.35759209344115, "grad_norm": 4.0, "learning_rate": 3.3305342665492403e-05, "loss": 4.4891, "step": 20830 }, { "epoch": 9.359838274932615, "grad_norm": 3.96875, "learning_rate": 3.328236275243958e-05, "loss": 4.3987, "step": 20835 }, { "epoch": 9.362084456424078, "grad_norm": 4.125, "learning_rate": 3.325946201743317e-05, "loss": 4.5082, "step": 20840 }, { "epoch": 9.364330637915543, "grad_norm": 4.78125, "learning_rate": 3.323664047423924e-05, "loss": 4.4234, "step": 20845 }, { "epoch": 9.366576819407008, "grad_norm": 3.796875, "learning_rate": 3.321389813657625e-05, "loss": 4.5046, "step": 20850 }, { "epoch": 9.368823000898473, "grad_norm": 3.96875, "learning_rate": 3.319123501811511e-05, "loss": 4.4412, "step": 20855 }, { "epoch": 9.371069182389936, "grad_norm": 3.859375, "learning_rate": 3.3168651132479e-05, "loss": 4.4695, "step": 20860 }, { "epoch": 9.373315363881401, "grad_norm": 4.09375, "learning_rate": 3.314614649324361e-05, "loss": 4.4956, "step": 20865 }, { "epoch": 9.375561545372866, "grad_norm": 3.609375, "learning_rate": 3.312372111393684e-05, "loss": 4.4805, "step": 20870 }, { "epoch": 9.377807726864331, "grad_norm": 3.9375, "learning_rate": 3.310137500803907e-05, "loss": 4.4551, "step": 20875 }, { "epoch": 9.380053908355794, "grad_norm": 4.125, "learning_rate": 3.3079108188982986e-05, "loss": 4.5872, "step": 20880 }, { "epoch": 9.38230008984726, "grad_norm": 3.75, "learning_rate": 3.305692067015358e-05, "loss": 4.4619, "step": 20885 }, { "epoch": 9.384546271338724, "grad_norm": 4.0625, "learning_rate": 3.303481246488822e-05, "loss": 4.4675, "step": 20890 }, { "epoch": 9.38679245283019, "grad_norm": 4.15625, "learning_rate": 3.301278358647659e-05, "loss": 4.4347, "step": 20895 }, { "epoch": 9.389038634321654, "grad_norm": 3.953125, "learning_rate": 3.299083404816066e-05, "loss": 4.4454, "step": 20900 }, { "epoch": 9.391284815813117, "grad_norm": 4.03125, "learning_rate": 3.296896386313473e-05, "loss": 4.4842, "step": 20905 }, { "epoch": 9.393530997304582, "grad_norm": 3.96875, "learning_rate": 3.294717304454539e-05, "loss": 4.4931, "step": 20910 }, { "epoch": 9.395777178796047, "grad_norm": 3.875, "learning_rate": 3.292546160549158e-05, "loss": 4.46, "step": 20915 }, { "epoch": 9.398023360287512, "grad_norm": 4.0625, "learning_rate": 3.290382955902438e-05, "loss": 4.4622, "step": 20920 }, { "epoch": 9.400269541778975, "grad_norm": 3.953125, "learning_rate": 3.288227691814729e-05, "loss": 4.432, "step": 20925 }, { "epoch": 9.40251572327044, "grad_norm": 3.9375, "learning_rate": 3.286080369581602e-05, "loss": 4.4224, "step": 20930 }, { "epoch": 9.404761904761905, "grad_norm": 4.0625, "learning_rate": 3.2839409904938546e-05, "loss": 4.4739, "step": 20935 }, { "epoch": 9.40700808625337, "grad_norm": 4.0, "learning_rate": 3.281809555837509e-05, "loss": 4.449, "step": 20940 }, { "epoch": 9.409254267744833, "grad_norm": 3.953125, "learning_rate": 3.2796860668938105e-05, "loss": 4.4706, "step": 20945 }, { "epoch": 9.411500449236298, "grad_norm": 4.0625, "learning_rate": 3.277570524939233e-05, "loss": 4.4801, "step": 20950 }, { "epoch": 9.413746630727763, "grad_norm": 4.125, "learning_rate": 3.275462931245467e-05, "loss": 4.4234, "step": 20955 }, { "epoch": 9.415992812219228, "grad_norm": 4.125, "learning_rate": 3.273363287079431e-05, "loss": 4.418, "step": 20960 }, { "epoch": 9.418238993710691, "grad_norm": 4.125, "learning_rate": 3.271271593703261e-05, "loss": 4.4828, "step": 20965 }, { "epoch": 9.420485175202156, "grad_norm": 3.78125, "learning_rate": 3.2691878523743136e-05, "loss": 4.4232, "step": 20970 }, { "epoch": 9.422731356693621, "grad_norm": 4.15625, "learning_rate": 3.267112064345168e-05, "loss": 4.444, "step": 20975 }, { "epoch": 9.424977538185086, "grad_norm": 4.21875, "learning_rate": 3.265044230863621e-05, "loss": 4.428, "step": 20980 }, { "epoch": 9.42722371967655, "grad_norm": 3.859375, "learning_rate": 3.262984353172687e-05, "loss": 4.5171, "step": 20985 }, { "epoch": 9.429469901168014, "grad_norm": 4.09375, "learning_rate": 3.2609324325105996e-05, "loss": 4.4546, "step": 20990 }, { "epoch": 9.431716082659479, "grad_norm": 3.921875, "learning_rate": 3.258888470110805e-05, "loss": 4.4345, "step": 20995 }, { "epoch": 9.433962264150944, "grad_norm": 3.796875, "learning_rate": 3.2568524672019736e-05, "loss": 4.4649, "step": 21000 }, { "epoch": 9.433962264150944, "eval_loss": 4.779551982879639, "eval_runtime": 16.1437, "eval_samples_per_second": 1921.061, "eval_steps_per_second": 240.156, "step": 21000 }, { "epoch": 9.436208445642407, "grad_norm": 4.21875, "learning_rate": 3.254824425007984e-05, "loss": 4.4247, "step": 21005 }, { "epoch": 9.438454627133872, "grad_norm": 3.890625, "learning_rate": 3.252804344747934e-05, "loss": 4.467, "step": 21010 }, { "epoch": 9.440700808625337, "grad_norm": 3.9375, "learning_rate": 3.250792227636132e-05, "loss": 4.4517, "step": 21015 }, { "epoch": 9.442946990116802, "grad_norm": 3.875, "learning_rate": 3.2487880748820984e-05, "loss": 4.483, "step": 21020 }, { "epoch": 9.445193171608265, "grad_norm": 3.859375, "learning_rate": 3.2467918876905736e-05, "loss": 4.4234, "step": 21025 }, { "epoch": 9.44743935309973, "grad_norm": 3.65625, "learning_rate": 3.244803667261501e-05, "loss": 4.5102, "step": 21030 }, { "epoch": 9.449685534591195, "grad_norm": 3.84375, "learning_rate": 3.242823414790042e-05, "loss": 4.3925, "step": 21035 }, { "epoch": 9.45193171608266, "grad_norm": 4.21875, "learning_rate": 3.24085113146656e-05, "loss": 4.5181, "step": 21040 }, { "epoch": 9.454177897574123, "grad_norm": 3.984375, "learning_rate": 3.238886818476639e-05, "loss": 4.4318, "step": 21045 }, { "epoch": 9.456424079065588, "grad_norm": 3.859375, "learning_rate": 3.236930477001061e-05, "loss": 4.4973, "step": 21050 }, { "epoch": 9.458670260557053, "grad_norm": 4.09375, "learning_rate": 3.2349821082158236e-05, "loss": 4.4727, "step": 21055 }, { "epoch": 9.460916442048518, "grad_norm": 3.859375, "learning_rate": 3.23304171329213e-05, "loss": 4.4771, "step": 21060 }, { "epoch": 9.463162623539983, "grad_norm": 3.84375, "learning_rate": 3.2311092933963865e-05, "loss": 4.4805, "step": 21065 }, { "epoch": 9.465408805031446, "grad_norm": 3.921875, "learning_rate": 3.229184849690212e-05, "loss": 4.4046, "step": 21070 }, { "epoch": 9.467654986522911, "grad_norm": 4.0, "learning_rate": 3.227268383330426e-05, "loss": 4.4095, "step": 21075 }, { "epoch": 9.469901168014376, "grad_norm": 4.125, "learning_rate": 3.225359895469053e-05, "loss": 4.5492, "step": 21080 }, { "epoch": 9.47214734950584, "grad_norm": 3.921875, "learning_rate": 3.2234593872533225e-05, "loss": 4.3571, "step": 21085 }, { "epoch": 9.474393530997304, "grad_norm": 4.15625, "learning_rate": 3.221566859825667e-05, "loss": 4.4533, "step": 21090 }, { "epoch": 9.476639712488769, "grad_norm": 4.15625, "learning_rate": 3.219682314323724e-05, "loss": 4.434, "step": 21095 }, { "epoch": 9.478885893980234, "grad_norm": 3.9375, "learning_rate": 3.2178057518803274e-05, "loss": 4.4256, "step": 21100 }, { "epoch": 9.481132075471699, "grad_norm": 3.71875, "learning_rate": 3.215937173623517e-05, "loss": 4.4841, "step": 21105 }, { "epoch": 9.483378256963162, "grad_norm": 3.9375, "learning_rate": 3.214076580676533e-05, "loss": 4.4873, "step": 21110 }, { "epoch": 9.485624438454627, "grad_norm": 4.21875, "learning_rate": 3.212223974157812e-05, "loss": 4.4543, "step": 21115 }, { "epoch": 9.487870619946092, "grad_norm": 3.5625, "learning_rate": 3.210379355180993e-05, "loss": 4.505, "step": 21120 }, { "epoch": 9.490116801437557, "grad_norm": 4.0625, "learning_rate": 3.208542724854913e-05, "loss": 4.4396, "step": 21125 }, { "epoch": 9.49236298292902, "grad_norm": 4.34375, "learning_rate": 3.206714084283605e-05, "loss": 4.4859, "step": 21130 }, { "epoch": 9.494609164420485, "grad_norm": 4.03125, "learning_rate": 3.204893434566302e-05, "loss": 4.5148, "step": 21135 }, { "epoch": 9.49685534591195, "grad_norm": 3.9375, "learning_rate": 3.203080776797432e-05, "loss": 4.5325, "step": 21140 }, { "epoch": 9.499101527403415, "grad_norm": 4.25, "learning_rate": 3.2012761120666185e-05, "loss": 4.4119, "step": 21145 }, { "epoch": 9.501347708894878, "grad_norm": 4.03125, "learning_rate": 3.199479441458679e-05, "loss": 4.4054, "step": 21150 }, { "epoch": 9.503593890386343, "grad_norm": 3.984375, "learning_rate": 3.197690766053632e-05, "loss": 4.4224, "step": 21155 }, { "epoch": 9.505840071877808, "grad_norm": 3.953125, "learning_rate": 3.1959100869266814e-05, "loss": 4.4602, "step": 21160 }, { "epoch": 9.508086253369273, "grad_norm": 4.1875, "learning_rate": 3.194137405148229e-05, "loss": 4.478, "step": 21165 }, { "epoch": 9.510332434860736, "grad_norm": 3.953125, "learning_rate": 3.192372721783868e-05, "loss": 4.4035, "step": 21170 }, { "epoch": 9.5125786163522, "grad_norm": 4.0625, "learning_rate": 3.1906160378943865e-05, "loss": 4.4875, "step": 21175 }, { "epoch": 9.514824797843666, "grad_norm": 3.9375, "learning_rate": 3.188867354535759e-05, "loss": 4.4576, "step": 21180 }, { "epoch": 9.51707097933513, "grad_norm": 4.25, "learning_rate": 3.187126672759153e-05, "loss": 4.4914, "step": 21185 }, { "epoch": 9.519317160826596, "grad_norm": 4.03125, "learning_rate": 3.185393993610929e-05, "loss": 4.4658, "step": 21190 }, { "epoch": 9.521563342318059, "grad_norm": 3.921875, "learning_rate": 3.183669318132632e-05, "loss": 4.4875, "step": 21195 }, { "epoch": 9.523809523809524, "grad_norm": 3.859375, "learning_rate": 3.181952647360999e-05, "loss": 4.4896, "step": 21200 }, { "epoch": 9.526055705300989, "grad_norm": 3.640625, "learning_rate": 3.1802439823279534e-05, "loss": 4.4181, "step": 21205 }, { "epoch": 9.528301886792454, "grad_norm": 4.34375, "learning_rate": 3.1785433240606084e-05, "loss": 4.5146, "step": 21210 }, { "epoch": 9.530548068283917, "grad_norm": 3.90625, "learning_rate": 3.176850673581264e-05, "loss": 4.4964, "step": 21215 }, { "epoch": 9.532794249775382, "grad_norm": 3.828125, "learning_rate": 3.175166031907402e-05, "loss": 4.4304, "step": 21220 }, { "epoch": 9.535040431266847, "grad_norm": 4.03125, "learning_rate": 3.173489400051695e-05, "loss": 4.4662, "step": 21225 }, { "epoch": 9.537286612758312, "grad_norm": 3.828125, "learning_rate": 3.1718207790220025e-05, "loss": 4.4908, "step": 21230 }, { "epoch": 9.539532794249775, "grad_norm": 3.65625, "learning_rate": 3.1701601698213606e-05, "loss": 4.4367, "step": 21235 }, { "epoch": 9.54177897574124, "grad_norm": 3.765625, "learning_rate": 3.168507573447995e-05, "loss": 4.3619, "step": 21240 }, { "epoch": 9.544025157232705, "grad_norm": 3.9375, "learning_rate": 3.166862990895315e-05, "loss": 4.4482, "step": 21245 }, { "epoch": 9.54627133872417, "grad_norm": 3.90625, "learning_rate": 3.1652264231519106e-05, "loss": 4.4297, "step": 21250 }, { "epoch": 9.548517520215633, "grad_norm": 3.71875, "learning_rate": 3.163597871201555e-05, "loss": 4.4541, "step": 21255 }, { "epoch": 9.550763701707098, "grad_norm": 4.15625, "learning_rate": 3.161977336023201e-05, "loss": 4.4299, "step": 21260 }, { "epoch": 9.553009883198563, "grad_norm": 3.90625, "learning_rate": 3.1603648185909876e-05, "loss": 4.43, "step": 21265 }, { "epoch": 9.555256064690028, "grad_norm": 3.90625, "learning_rate": 3.158760319874226e-05, "loss": 4.5526, "step": 21270 }, { "epoch": 9.55750224618149, "grad_norm": 3.890625, "learning_rate": 3.1571638408374145e-05, "loss": 4.4884, "step": 21275 }, { "epoch": 9.559748427672956, "grad_norm": 4.1875, "learning_rate": 3.155575382440228e-05, "loss": 4.5047, "step": 21280 }, { "epoch": 9.56199460916442, "grad_norm": 4.03125, "learning_rate": 3.153994945637519e-05, "loss": 4.4524, "step": 21285 }, { "epoch": 9.564240790655886, "grad_norm": 4.0, "learning_rate": 3.1524225313793195e-05, "loss": 4.3926, "step": 21290 }, { "epoch": 9.566486972147349, "grad_norm": 4.15625, "learning_rate": 3.1508581406108374e-05, "loss": 4.5939, "step": 21295 }, { "epoch": 9.568733153638814, "grad_norm": 4.0, "learning_rate": 3.1493017742724605e-05, "loss": 4.4458, "step": 21300 }, { "epoch": 9.570979335130279, "grad_norm": 4.15625, "learning_rate": 3.147753433299748e-05, "loss": 4.4331, "step": 21305 }, { "epoch": 9.573225516621743, "grad_norm": 3.90625, "learning_rate": 3.146213118623441e-05, "loss": 4.4841, "step": 21310 }, { "epoch": 9.575471698113208, "grad_norm": 3.953125, "learning_rate": 3.144680831169452e-05, "loss": 4.4666, "step": 21315 }, { "epoch": 9.577717879604672, "grad_norm": 3.96875, "learning_rate": 3.143156571858868e-05, "loss": 4.478, "step": 21320 }, { "epoch": 9.579964061096137, "grad_norm": 4.15625, "learning_rate": 3.1416403416079505e-05, "loss": 4.4336, "step": 21325 }, { "epoch": 9.582210242587601, "grad_norm": 3.953125, "learning_rate": 3.140132141328138e-05, "loss": 4.471, "step": 21330 }, { "epoch": 9.584456424079066, "grad_norm": 3.828125, "learning_rate": 3.138631971926037e-05, "loss": 4.4267, "step": 21335 }, { "epoch": 9.58670260557053, "grad_norm": 4.25, "learning_rate": 3.1371398343034266e-05, "loss": 4.4679, "step": 21340 }, { "epoch": 9.588948787061994, "grad_norm": 4.03125, "learning_rate": 3.135655729357265e-05, "loss": 4.4145, "step": 21345 }, { "epoch": 9.59119496855346, "grad_norm": 4.03125, "learning_rate": 3.134179657979672e-05, "loss": 4.4745, "step": 21350 }, { "epoch": 9.593441150044924, "grad_norm": 3.96875, "learning_rate": 3.1327116210579456e-05, "loss": 4.4523, "step": 21355 }, { "epoch": 9.595687331536388, "grad_norm": 3.84375, "learning_rate": 3.131251619474552e-05, "loss": 4.3996, "step": 21360 }, { "epoch": 9.597933513027852, "grad_norm": 3.921875, "learning_rate": 3.129799654107124e-05, "loss": 4.4422, "step": 21365 }, { "epoch": 9.600179694519317, "grad_norm": 4.125, "learning_rate": 3.128355725828468e-05, "loss": 4.5367, "step": 21370 }, { "epoch": 9.602425876010782, "grad_norm": 3.828125, "learning_rate": 3.126919835506558e-05, "loss": 4.4527, "step": 21375 }, { "epoch": 9.604672057502246, "grad_norm": 4.15625, "learning_rate": 3.1254919840045356e-05, "loss": 4.4232, "step": 21380 }, { "epoch": 9.60691823899371, "grad_norm": 3.65625, "learning_rate": 3.124072172180709e-05, "loss": 4.464, "step": 21385 }, { "epoch": 9.609164420485175, "grad_norm": 4.25, "learning_rate": 3.1226604008885566e-05, "loss": 4.5189, "step": 21390 }, { "epoch": 9.61141060197664, "grad_norm": 4.0, "learning_rate": 3.121256670976719e-05, "loss": 4.4456, "step": 21395 }, { "epoch": 9.613656783468103, "grad_norm": 4.03125, "learning_rate": 3.1198609832890093e-05, "loss": 4.4784, "step": 21400 }, { "epoch": 9.615902964959568, "grad_norm": 3.921875, "learning_rate": 3.1184733386643994e-05, "loss": 4.4983, "step": 21405 }, { "epoch": 9.618149146451033, "grad_norm": 4.0, "learning_rate": 3.1170937379370314e-05, "loss": 4.4304, "step": 21410 }, { "epoch": 9.620395327942498, "grad_norm": 4.1875, "learning_rate": 3.115722181936209e-05, "loss": 4.4549, "step": 21415 }, { "epoch": 9.622641509433961, "grad_norm": 4.21875, "learning_rate": 3.114358671486403e-05, "loss": 4.4759, "step": 21420 }, { "epoch": 9.624887690925426, "grad_norm": 3.890625, "learning_rate": 3.1130032074072465e-05, "loss": 4.4785, "step": 21425 }, { "epoch": 9.627133872416891, "grad_norm": 4.09375, "learning_rate": 3.111655790513532e-05, "loss": 4.4772, "step": 21430 }, { "epoch": 9.629380053908356, "grad_norm": 3.90625, "learning_rate": 3.110316421615223e-05, "loss": 4.532, "step": 21435 }, { "epoch": 9.631626235399821, "grad_norm": 4.0, "learning_rate": 3.108985101517435e-05, "loss": 4.4875, "step": 21440 }, { "epoch": 9.633872416891284, "grad_norm": 3.9375, "learning_rate": 3.107661831020455e-05, "loss": 4.4913, "step": 21445 }, { "epoch": 9.63611859838275, "grad_norm": 3.75, "learning_rate": 3.1063466109197236e-05, "loss": 4.5703, "step": 21450 }, { "epoch": 9.638364779874214, "grad_norm": 3.9375, "learning_rate": 3.105039442005847e-05, "loss": 4.4012, "step": 21455 }, { "epoch": 9.640610961365677, "grad_norm": 3.984375, "learning_rate": 3.1037403250645916e-05, "loss": 4.404, "step": 21460 }, { "epoch": 9.642857142857142, "grad_norm": 4.0625, "learning_rate": 3.102449260876879e-05, "loss": 4.4471, "step": 21465 }, { "epoch": 9.645103324348607, "grad_norm": 3.921875, "learning_rate": 3.1011662502187955e-05, "loss": 4.4435, "step": 21470 }, { "epoch": 9.647349505840072, "grad_norm": 3.734375, "learning_rate": 3.0998912938615795e-05, "loss": 4.4458, "step": 21475 }, { "epoch": 9.649595687331537, "grad_norm": 3.6875, "learning_rate": 3.0986243925716375e-05, "loss": 4.5468, "step": 21480 }, { "epoch": 9.651841868823, "grad_norm": 3.828125, "learning_rate": 3.097365547110527e-05, "loss": 4.4227, "step": 21485 }, { "epoch": 9.654088050314465, "grad_norm": 4.03125, "learning_rate": 3.096114758234962e-05, "loss": 4.4105, "step": 21490 }, { "epoch": 9.65633423180593, "grad_norm": 3.953125, "learning_rate": 3.09487202669682e-05, "loss": 4.4524, "step": 21495 }, { "epoch": 9.658580413297395, "grad_norm": 3.640625, "learning_rate": 3.0936373532431294e-05, "loss": 4.4503, "step": 21500 }, { "epoch": 9.660826594788858, "grad_norm": 3.9375, "learning_rate": 3.092410738616075e-05, "loss": 4.4434, "step": 21505 }, { "epoch": 9.663072776280323, "grad_norm": 4.09375, "learning_rate": 3.0911921835530025e-05, "loss": 4.4463, "step": 21510 }, { "epoch": 9.665318957771788, "grad_norm": 4.03125, "learning_rate": 3.089981688786405e-05, "loss": 4.4196, "step": 21515 }, { "epoch": 9.667565139263253, "grad_norm": 3.890625, "learning_rate": 3.0887792550439384e-05, "loss": 4.4454, "step": 21520 }, { "epoch": 9.669811320754716, "grad_norm": 3.984375, "learning_rate": 3.0875848830484056e-05, "loss": 4.4653, "step": 21525 }, { "epoch": 9.672057502246181, "grad_norm": 4.15625, "learning_rate": 3.0863985735177713e-05, "loss": 4.4771, "step": 21530 }, { "epoch": 9.674303683737646, "grad_norm": 4.125, "learning_rate": 3.0852203271651446e-05, "loss": 4.5219, "step": 21535 }, { "epoch": 9.676549865229111, "grad_norm": 4.125, "learning_rate": 3.084050144698795e-05, "loss": 4.5028, "step": 21540 }, { "epoch": 9.678796046720574, "grad_norm": 4.25, "learning_rate": 3.0828880268221423e-05, "loss": 4.5004, "step": 21545 }, { "epoch": 9.68104222821204, "grad_norm": 4.03125, "learning_rate": 3.081733974233757e-05, "loss": 4.3945, "step": 21550 }, { "epoch": 9.683288409703504, "grad_norm": 4.125, "learning_rate": 3.080587987627364e-05, "loss": 4.4239, "step": 21555 }, { "epoch": 9.685534591194969, "grad_norm": 4.0, "learning_rate": 3.079450067691836e-05, "loss": 4.4172, "step": 21560 }, { "epoch": 9.687780772686434, "grad_norm": 3.84375, "learning_rate": 3.0783202151112005e-05, "loss": 4.4264, "step": 21565 }, { "epoch": 9.690026954177897, "grad_norm": 4.09375, "learning_rate": 3.077198430564635e-05, "loss": 4.4771, "step": 21570 }, { "epoch": 9.692273135669362, "grad_norm": 3.890625, "learning_rate": 3.0760847147264654e-05, "loss": 4.4455, "step": 21575 }, { "epoch": 9.694519317160827, "grad_norm": 3.921875, "learning_rate": 3.074979068266168e-05, "loss": 4.4437, "step": 21580 }, { "epoch": 9.69676549865229, "grad_norm": 3.890625, "learning_rate": 3.073881491848366e-05, "loss": 4.4892, "step": 21585 }, { "epoch": 9.699011680143755, "grad_norm": 4.125, "learning_rate": 3.072791986132838e-05, "loss": 4.403, "step": 21590 }, { "epoch": 9.70125786163522, "grad_norm": 4.125, "learning_rate": 3.071710551774505e-05, "loss": 4.4125, "step": 21595 }, { "epoch": 9.703504043126685, "grad_norm": 4.4375, "learning_rate": 3.0706371894234375e-05, "loss": 4.5058, "step": 21600 }, { "epoch": 9.70575022461815, "grad_norm": 4.0625, "learning_rate": 3.0695718997248576e-05, "loss": 4.451, "step": 21605 }, { "epoch": 9.707996406109613, "grad_norm": 3.75, "learning_rate": 3.068514683319129e-05, "loss": 4.4327, "step": 21610 }, { "epoch": 9.710242587601078, "grad_norm": 3.984375, "learning_rate": 3.067465540841767e-05, "loss": 4.4968, "step": 21615 }, { "epoch": 9.712488769092543, "grad_norm": 3.796875, "learning_rate": 3.06642447292343e-05, "loss": 4.4585, "step": 21620 }, { "epoch": 9.714734950584008, "grad_norm": 4.03125, "learning_rate": 3.0653914801899244e-05, "loss": 4.4387, "step": 21625 }, { "epoch": 9.716981132075471, "grad_norm": 3.984375, "learning_rate": 3.064366563262204e-05, "loss": 4.4181, "step": 21630 }, { "epoch": 9.719227313566936, "grad_norm": 4.09375, "learning_rate": 3.0633497227563644e-05, "loss": 4.4826, "step": 21635 }, { "epoch": 9.721473495058401, "grad_norm": 4.09375, "learning_rate": 3.062340959283652e-05, "loss": 4.4405, "step": 21640 }, { "epoch": 9.723719676549866, "grad_norm": 3.890625, "learning_rate": 3.0613402734504484e-05, "loss": 4.4622, "step": 21645 }, { "epoch": 9.725965858041329, "grad_norm": 3.984375, "learning_rate": 3.0603476658582896e-05, "loss": 4.4617, "step": 21650 }, { "epoch": 9.728212039532794, "grad_norm": 3.90625, "learning_rate": 3.059363137103851e-05, "loss": 4.4626, "step": 21655 }, { "epoch": 9.730458221024259, "grad_norm": 4.1875, "learning_rate": 3.05838668777895e-05, "loss": 4.4759, "step": 21660 }, { "epoch": 9.732704402515724, "grad_norm": 4.0, "learning_rate": 3.057418318470553e-05, "loss": 4.6199, "step": 21665 }, { "epoch": 9.734950584007187, "grad_norm": 4.1875, "learning_rate": 3.0564580297607615e-05, "loss": 4.4436, "step": 21670 }, { "epoch": 9.737196765498652, "grad_norm": 3.796875, "learning_rate": 3.055505822226827e-05, "loss": 4.4397, "step": 21675 }, { "epoch": 9.739442946990117, "grad_norm": 4.03125, "learning_rate": 3.054561696441139e-05, "loss": 4.431, "step": 21680 }, { "epoch": 9.741689128481582, "grad_norm": 4.34375, "learning_rate": 3.0536256529712295e-05, "loss": 4.4939, "step": 21685 }, { "epoch": 9.743935309973045, "grad_norm": 3.921875, "learning_rate": 3.052697692379772e-05, "loss": 4.5042, "step": 21690 }, { "epoch": 9.74618149146451, "grad_norm": 3.78125, "learning_rate": 3.0517778152245845e-05, "loss": 4.4137, "step": 21695 }, { "epoch": 9.748427672955975, "grad_norm": 3.953125, "learning_rate": 3.050866022058619e-05, "loss": 4.4318, "step": 21700 }, { "epoch": 9.75067385444744, "grad_norm": 3.953125, "learning_rate": 3.049962313429976e-05, "loss": 4.4799, "step": 21705 }, { "epoch": 9.752920035938903, "grad_norm": 4.125, "learning_rate": 3.0490666898818908e-05, "loss": 4.5333, "step": 21710 }, { "epoch": 9.755166217430368, "grad_norm": 3.921875, "learning_rate": 3.0481791519527406e-05, "loss": 4.436, "step": 21715 }, { "epoch": 9.757412398921833, "grad_norm": 3.921875, "learning_rate": 3.047299700176042e-05, "loss": 4.4265, "step": 21720 }, { "epoch": 9.759658580413298, "grad_norm": 4.0625, "learning_rate": 3.0464283350804495e-05, "loss": 4.464, "step": 21725 }, { "epoch": 9.761904761904763, "grad_norm": 3.90625, "learning_rate": 3.0455650571897578e-05, "loss": 4.3911, "step": 21730 }, { "epoch": 9.764150943396226, "grad_norm": 4.09375, "learning_rate": 3.0447098670229016e-05, "loss": 4.4912, "step": 21735 }, { "epoch": 9.76639712488769, "grad_norm": 3.765625, "learning_rate": 3.0438627650939498e-05, "loss": 4.4731, "step": 21740 }, { "epoch": 9.768643306379156, "grad_norm": 3.609375, "learning_rate": 3.0430237519121147e-05, "loss": 4.4542, "step": 21745 }, { "epoch": 9.77088948787062, "grad_norm": 3.828125, "learning_rate": 3.042192827981744e-05, "loss": 4.4322, "step": 21750 }, { "epoch": 9.773135669362084, "grad_norm": 3.859375, "learning_rate": 3.041369993802318e-05, "loss": 4.5, "step": 21755 }, { "epoch": 9.775381850853549, "grad_norm": 4.28125, "learning_rate": 3.0405552498684635e-05, "loss": 4.5075, "step": 21760 }, { "epoch": 9.777628032345014, "grad_norm": 3.953125, "learning_rate": 3.0397485966699357e-05, "loss": 4.4934, "step": 21765 }, { "epoch": 9.779874213836479, "grad_norm": 3.9375, "learning_rate": 3.0389500346916307e-05, "loss": 4.4909, "step": 21770 }, { "epoch": 9.782120395327942, "grad_norm": 3.921875, "learning_rate": 3.0381595644135787e-05, "loss": 4.5213, "step": 21775 }, { "epoch": 9.784366576819407, "grad_norm": 4.3125, "learning_rate": 3.0373771863109488e-05, "loss": 4.5062, "step": 21780 }, { "epoch": 9.786612758310872, "grad_norm": 3.90625, "learning_rate": 3.036602900854044e-05, "loss": 4.499, "step": 21785 }, { "epoch": 9.788858939802337, "grad_norm": 4.0625, "learning_rate": 3.0358367085082986e-05, "loss": 4.4432, "step": 21790 }, { "epoch": 9.7911051212938, "grad_norm": 4.09375, "learning_rate": 3.0350786097342906e-05, "loss": 4.475, "step": 21795 }, { "epoch": 9.793351302785265, "grad_norm": 4.0625, "learning_rate": 3.0343286049877233e-05, "loss": 4.4657, "step": 21800 }, { "epoch": 9.79559748427673, "grad_norm": 3.8125, "learning_rate": 3.033586694719443e-05, "loss": 4.4604, "step": 21805 }, { "epoch": 9.797843665768195, "grad_norm": 3.828125, "learning_rate": 3.032852879375425e-05, "loss": 4.4558, "step": 21810 }, { "epoch": 9.800089847259658, "grad_norm": 3.78125, "learning_rate": 3.0321271593967798e-05, "loss": 4.4586, "step": 21815 }, { "epoch": 9.802336028751123, "grad_norm": 4.1875, "learning_rate": 3.0314095352197537e-05, "loss": 4.492, "step": 21820 }, { "epoch": 9.804582210242588, "grad_norm": 3.859375, "learning_rate": 3.0307000072757216e-05, "loss": 4.4882, "step": 21825 }, { "epoch": 9.806828391734053, "grad_norm": 4.125, "learning_rate": 3.0299985759911967e-05, "loss": 4.4841, "step": 21830 }, { "epoch": 9.809074573225516, "grad_norm": 4.03125, "learning_rate": 3.0293052417878228e-05, "loss": 4.4587, "step": 21835 }, { "epoch": 9.81132075471698, "grad_norm": 3.875, "learning_rate": 3.0286200050823747e-05, "loss": 4.4934, "step": 21840 }, { "epoch": 9.813566936208446, "grad_norm": 4.0625, "learning_rate": 3.0279428662867646e-05, "loss": 4.4899, "step": 21845 }, { "epoch": 9.81581311769991, "grad_norm": 3.984375, "learning_rate": 3.0272738258080327e-05, "loss": 4.4076, "step": 21850 }, { "epoch": 9.818059299191376, "grad_norm": 4.0, "learning_rate": 3.026612884048352e-05, "loss": 4.4671, "step": 21855 }, { "epoch": 9.820305480682839, "grad_norm": 4.03125, "learning_rate": 3.0259600414050278e-05, "loss": 4.3938, "step": 21860 }, { "epoch": 9.822551662174304, "grad_norm": 3.984375, "learning_rate": 3.025315298270496e-05, "loss": 4.5627, "step": 21865 }, { "epoch": 9.824797843665769, "grad_norm": 4.15625, "learning_rate": 3.024678655032324e-05, "loss": 4.5109, "step": 21870 }, { "epoch": 9.827044025157234, "grad_norm": 3.984375, "learning_rate": 3.0240501120732125e-05, "loss": 4.499, "step": 21875 }, { "epoch": 9.829290206648697, "grad_norm": 4.09375, "learning_rate": 3.0234296697709894e-05, "loss": 4.4657, "step": 21880 }, { "epoch": 9.831536388140162, "grad_norm": 4.1875, "learning_rate": 3.022817328498614e-05, "loss": 4.4049, "step": 21885 }, { "epoch": 9.833782569631627, "grad_norm": 3.890625, "learning_rate": 3.022213088624178e-05, "loss": 4.508, "step": 21890 }, { "epoch": 9.836028751123091, "grad_norm": 3.953125, "learning_rate": 3.0216169505109022e-05, "loss": 4.4213, "step": 21895 }, { "epoch": 9.838274932614555, "grad_norm": 3.984375, "learning_rate": 3.021028914517134e-05, "loss": 4.3869, "step": 21900 }, { "epoch": 9.84052111410602, "grad_norm": 4.1875, "learning_rate": 3.020448980996358e-05, "loss": 4.5112, "step": 21905 }, { "epoch": 9.842767295597485, "grad_norm": 4.21875, "learning_rate": 3.0198771502971783e-05, "loss": 4.5125, "step": 21910 }, { "epoch": 9.84501347708895, "grad_norm": 3.9375, "learning_rate": 3.019313422763338e-05, "loss": 4.4641, "step": 21915 }, { "epoch": 9.847259658580413, "grad_norm": 3.65625, "learning_rate": 3.0187577987337025e-05, "loss": 4.481, "step": 21920 }, { "epoch": 9.849505840071878, "grad_norm": 3.875, "learning_rate": 3.0182102785422673e-05, "loss": 4.4802, "step": 21925 }, { "epoch": 9.851752021563343, "grad_norm": 3.96875, "learning_rate": 3.017670862518161e-05, "loss": 4.4696, "step": 21930 }, { "epoch": 9.853998203054807, "grad_norm": 4.03125, "learning_rate": 3.017139550985634e-05, "loss": 4.4655, "step": 21935 }, { "epoch": 9.85624438454627, "grad_norm": 3.71875, "learning_rate": 3.0166163442640687e-05, "loss": 4.4801, "step": 21940 }, { "epoch": 9.858490566037736, "grad_norm": 4.0625, "learning_rate": 3.016101242667975e-05, "loss": 4.4328, "step": 21945 }, { "epoch": 9.8607367475292, "grad_norm": 4.15625, "learning_rate": 3.015594246506991e-05, "loss": 4.5086, "step": 21950 }, { "epoch": 9.862982929020665, "grad_norm": 3.890625, "learning_rate": 3.0150953560858822e-05, "loss": 4.4391, "step": 21955 }, { "epoch": 9.865229110512129, "grad_norm": 3.53125, "learning_rate": 3.0146045717045403e-05, "loss": 4.4804, "step": 21960 }, { "epoch": 9.867475292003594, "grad_norm": 4.09375, "learning_rate": 3.014121893657986e-05, "loss": 4.4691, "step": 21965 }, { "epoch": 9.869721473495058, "grad_norm": 4.0, "learning_rate": 3.0136473222363663e-05, "loss": 4.4258, "step": 21970 }, { "epoch": 9.871967654986523, "grad_norm": 4.0625, "learning_rate": 3.0131808577249562e-05, "loss": 4.4583, "step": 21975 }, { "epoch": 9.874213836477988, "grad_norm": 4.125, "learning_rate": 3.0127225004041557e-05, "loss": 4.467, "step": 21980 }, { "epoch": 9.876460017969451, "grad_norm": 4.0625, "learning_rate": 3.012272250549493e-05, "loss": 4.4482, "step": 21985 }, { "epoch": 9.878706199460916, "grad_norm": 3.609375, "learning_rate": 3.011830108431621e-05, "loss": 4.4542, "step": 21990 }, { "epoch": 9.880952380952381, "grad_norm": 4.0, "learning_rate": 3.011396074316322e-05, "loss": 4.4514, "step": 21995 }, { "epoch": 9.883198562443845, "grad_norm": 3.71875, "learning_rate": 3.0109701484644995e-05, "loss": 4.4542, "step": 22000 }, { "epoch": 9.883198562443845, "eval_loss": 4.778508186340332, "eval_runtime": 16.0782, "eval_samples_per_second": 1928.886, "eval_steps_per_second": 241.134, "step": 22000 }, { "epoch": 9.88544474393531, "grad_norm": 4.03125, "learning_rate": 3.010552331132188e-05, "loss": 4.4514, "step": 22005 }, { "epoch": 9.887690925426774, "grad_norm": 4.03125, "learning_rate": 3.0101426225705458e-05, "loss": 4.4856, "step": 22010 }, { "epoch": 9.88993710691824, "grad_norm": 4.28125, "learning_rate": 3.0097410230258556e-05, "loss": 4.4761, "step": 22015 }, { "epoch": 9.892183288409704, "grad_norm": 3.984375, "learning_rate": 3.0093475327395274e-05, "loss": 4.4174, "step": 22020 }, { "epoch": 9.894429469901167, "grad_norm": 4.1875, "learning_rate": 3.0089621519480948e-05, "loss": 4.4723, "step": 22025 }, { "epoch": 9.896675651392632, "grad_norm": 4.125, "learning_rate": 3.0085848808832195e-05, "loss": 4.4188, "step": 22030 }, { "epoch": 9.898921832884097, "grad_norm": 3.875, "learning_rate": 3.008215719771684e-05, "loss": 4.4611, "step": 22035 }, { "epoch": 9.901168014375562, "grad_norm": 4.0, "learning_rate": 3.0078546688354016e-05, "loss": 4.4664, "step": 22040 }, { "epoch": 9.903414195867025, "grad_norm": 4.09375, "learning_rate": 3.0075017282914045e-05, "loss": 4.4478, "step": 22045 }, { "epoch": 9.90566037735849, "grad_norm": 4.375, "learning_rate": 3.0071568983518542e-05, "loss": 4.5441, "step": 22050 }, { "epoch": 9.907906558849955, "grad_norm": 3.96875, "learning_rate": 3.0068201792240328e-05, "loss": 4.451, "step": 22055 }, { "epoch": 9.91015274034142, "grad_norm": 4.09375, "learning_rate": 3.006491571110349e-05, "loss": 4.5322, "step": 22060 }, { "epoch": 9.912398921832883, "grad_norm": 4.1875, "learning_rate": 3.0061710742083364e-05, "loss": 4.3999, "step": 22065 }, { "epoch": 9.914645103324348, "grad_norm": 3.921875, "learning_rate": 3.005858688710651e-05, "loss": 4.4591, "step": 22070 }, { "epoch": 9.916891284815813, "grad_norm": 3.984375, "learning_rate": 3.005554414805075e-05, "loss": 4.4283, "step": 22075 }, { "epoch": 9.919137466307278, "grad_norm": 4.0, "learning_rate": 3.0052582526745136e-05, "loss": 4.4817, "step": 22080 }, { "epoch": 9.921383647798741, "grad_norm": 3.9375, "learning_rate": 3.004970202496993e-05, "loss": 4.4711, "step": 22085 }, { "epoch": 9.923629829290206, "grad_norm": 3.984375, "learning_rate": 3.0046902644456684e-05, "loss": 4.4818, "step": 22090 }, { "epoch": 9.925876010781671, "grad_norm": 3.875, "learning_rate": 3.0044184386888162e-05, "loss": 4.4919, "step": 22095 }, { "epoch": 9.928122192273136, "grad_norm": 4.125, "learning_rate": 3.004154725389835e-05, "loss": 4.4638, "step": 22100 }, { "epoch": 9.9303683737646, "grad_norm": 4.03125, "learning_rate": 3.0038991247072477e-05, "loss": 4.4633, "step": 22105 }, { "epoch": 9.932614555256064, "grad_norm": 3.859375, "learning_rate": 3.0036516367947006e-05, "loss": 4.4803, "step": 22110 }, { "epoch": 9.93486073674753, "grad_norm": 4.25, "learning_rate": 3.0034122618009663e-05, "loss": 4.4513, "step": 22115 }, { "epoch": 9.937106918238994, "grad_norm": 3.953125, "learning_rate": 3.003180999869934e-05, "loss": 4.3843, "step": 22120 }, { "epoch": 9.939353099730457, "grad_norm": 3.96875, "learning_rate": 3.0029578511406223e-05, "loss": 4.4749, "step": 22125 }, { "epoch": 9.941599281221922, "grad_norm": 3.921875, "learning_rate": 3.002742815747171e-05, "loss": 4.4282, "step": 22130 }, { "epoch": 9.943845462713387, "grad_norm": 3.984375, "learning_rate": 3.0025358938188387e-05, "loss": 4.4134, "step": 22135 }, { "epoch": 9.946091644204852, "grad_norm": 4.09375, "learning_rate": 3.0023370854800123e-05, "loss": 4.5129, "step": 22140 }, { "epoch": 9.948337825696317, "grad_norm": 4.1875, "learning_rate": 3.002146390850201e-05, "loss": 4.523, "step": 22145 }, { "epoch": 9.95058400718778, "grad_norm": 3.875, "learning_rate": 3.0019638100440313e-05, "loss": 4.4539, "step": 22150 }, { "epoch": 9.952830188679245, "grad_norm": 4.0625, "learning_rate": 3.00178934317126e-05, "loss": 4.43, "step": 22155 }, { "epoch": 9.95507637017071, "grad_norm": 3.890625, "learning_rate": 3.0016229903367582e-05, "loss": 4.4415, "step": 22160 }, { "epoch": 9.957322551662175, "grad_norm": 3.84375, "learning_rate": 3.0014647516405286e-05, "loss": 4.5016, "step": 22165 }, { "epoch": 9.959568733153638, "grad_norm": 4.15625, "learning_rate": 3.0013146271776875e-05, "loss": 4.506, "step": 22170 }, { "epoch": 9.961814914645103, "grad_norm": 3.90625, "learning_rate": 3.001172617038481e-05, "loss": 4.4784, "step": 22175 }, { "epoch": 9.964061096136568, "grad_norm": 3.65625, "learning_rate": 3.0010387213082716e-05, "loss": 4.4242, "step": 22180 }, { "epoch": 9.966307277628033, "grad_norm": 4.15625, "learning_rate": 3.0009129400675485e-05, "loss": 4.4398, "step": 22185 }, { "epoch": 9.968553459119496, "grad_norm": 3.9375, "learning_rate": 3.0007952733919195e-05, "loss": 4.4891, "step": 22190 }, { "epoch": 9.970799640610961, "grad_norm": 4.1875, "learning_rate": 3.0006857213521175e-05, "loss": 4.4433, "step": 22195 }, { "epoch": 9.973045822102426, "grad_norm": 3.953125, "learning_rate": 3.0005842840139957e-05, "loss": 4.3985, "step": 22200 }, { "epoch": 9.975292003593891, "grad_norm": 3.8125, "learning_rate": 3.0004909614385315e-05, "loss": 4.4239, "step": 22205 }, { "epoch": 9.977538185085354, "grad_norm": 4.0625, "learning_rate": 3.0004057536818215e-05, "loss": 4.4357, "step": 22210 }, { "epoch": 9.979784366576819, "grad_norm": 4.0, "learning_rate": 3.000328660795086e-05, "loss": 4.4589, "step": 22215 }, { "epoch": 9.982030548068284, "grad_norm": 4.375, "learning_rate": 3.0002596828246667e-05, "loss": 4.4735, "step": 22220 }, { "epoch": 9.984276729559749, "grad_norm": 3.921875, "learning_rate": 3.0001988198120287e-05, "loss": 4.4521, "step": 22225 }, { "epoch": 9.986522911051212, "grad_norm": 3.921875, "learning_rate": 3.0001460717937572e-05, "loss": 4.4655, "step": 22230 }, { "epoch": 9.988769092542677, "grad_norm": 3.84375, "learning_rate": 3.0001014388015603e-05, "loss": 4.4577, "step": 22235 }, { "epoch": 9.991015274034142, "grad_norm": 3.953125, "learning_rate": 3.0000649208622676e-05, "loss": 4.4676, "step": 22240 }, { "epoch": 9.993261455525607, "grad_norm": 3.984375, "learning_rate": 3.000036517997831e-05, "loss": 4.4503, "step": 22245 }, { "epoch": 9.99550763701707, "grad_norm": 3.953125, "learning_rate": 3.0000162302253235e-05, "loss": 4.401, "step": 22250 }, { "epoch": 9.997753818508535, "grad_norm": 4.0, "learning_rate": 3.0000040575569408e-05, "loss": 4.4357, "step": 22255 }, { "epoch": 10.0, "grad_norm": 8.625, "learning_rate": 2.9999999999999997e-05, "loss": 4.5633, "step": 22260 } ], "logging_steps": 5, "max_steps": 22260, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.03170666514432e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }