{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2461814914645104, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022461814914645105, "grad_norm": 54.0, "learning_rate": 7.499999999999999e-07, "loss": 10.989, "step": 5 }, { "epoch": 0.004492362982929021, "grad_norm": 52.75, "learning_rate": 1.4999999999999998e-06, "loss": 10.984, "step": 10 }, { "epoch": 0.006738544474393531, "grad_norm": 52.5, "learning_rate": 2.2499999999999996e-06, "loss": 10.9491, "step": 15 }, { "epoch": 0.008984725965858042, "grad_norm": 50.25, "learning_rate": 2.9999999999999997e-06, "loss": 10.8608, "step": 20 }, { "epoch": 0.011230907457322551, "grad_norm": 44.75, "learning_rate": 3.7499999999999997e-06, "loss": 10.7375, "step": 25 }, { "epoch": 0.013477088948787063, "grad_norm": 38.0, "learning_rate": 4.499999999999999e-06, "loss": 10.5621, "step": 30 }, { "epoch": 0.015723270440251572, "grad_norm": 25.5, "learning_rate": 5.25e-06, "loss": 10.3304, "step": 35 }, { "epoch": 0.017969451931716084, "grad_norm": 19.25, "learning_rate": 5.999999999999999e-06, "loss": 10.1403, "step": 40 }, { "epoch": 0.02021563342318059, "grad_norm": 13.8125, "learning_rate": 6.749999999999999e-06, "loss": 9.9521, "step": 45 }, { "epoch": 0.022461814914645103, "grad_norm": 11.1875, "learning_rate": 7.499999999999999e-06, "loss": 9.843, "step": 50 }, { "epoch": 0.024707996406109614, "grad_norm": 10.5, "learning_rate": 8.249999999999999e-06, "loss": 9.7584, "step": 55 }, { "epoch": 0.026954177897574125, "grad_norm": 10.0625, "learning_rate": 8.999999999999999e-06, "loss": 9.7293, "step": 60 }, { "epoch": 0.029200359389038633, "grad_norm": 9.1875, "learning_rate": 9.75e-06, "loss": 9.719, "step": 65 }, { "epoch": 0.031446540880503145, "grad_norm": 8.9375, "learning_rate": 1.05e-05, "loss": 9.6908, "step": 70 }, { "epoch": 0.03369272237196765, "grad_norm": 9.125, "learning_rate": 1.1249999999999999e-05, "loss": 9.6617, "step": 75 }, { "epoch": 0.03593890386343217, "grad_norm": 9.0625, "learning_rate": 1.1999999999999999e-05, "loss": 9.6228, "step": 80 }, { "epoch": 0.038185085354896675, "grad_norm": 9.125, "learning_rate": 1.275e-05, "loss": 9.6069, "step": 85 }, { "epoch": 0.04043126684636118, "grad_norm": 9.3125, "learning_rate": 1.3499999999999998e-05, "loss": 9.5342, "step": 90 }, { "epoch": 0.0426774483378257, "grad_norm": 8.9375, "learning_rate": 1.4249999999999999e-05, "loss": 9.5187, "step": 95 }, { "epoch": 0.044923629829290206, "grad_norm": 9.125, "learning_rate": 1.4999999999999999e-05, "loss": 9.4719, "step": 100 }, { "epoch": 0.04716981132075472, "grad_norm": 9.0, "learning_rate": 1.5749999999999997e-05, "loss": 9.4167, "step": 105 }, { "epoch": 0.04941599281221923, "grad_norm": 8.9375, "learning_rate": 1.6499999999999998e-05, "loss": 9.3825, "step": 110 }, { "epoch": 0.051662174303683736, "grad_norm": 8.75, "learning_rate": 1.725e-05, "loss": 9.3577, "step": 115 }, { "epoch": 0.05390835579514825, "grad_norm": 8.625, "learning_rate": 1.7999999999999997e-05, "loss": 9.3387, "step": 120 }, { "epoch": 0.05615453728661276, "grad_norm": 9.375, "learning_rate": 1.875e-05, "loss": 9.2947, "step": 125 }, { "epoch": 0.05840071877807727, "grad_norm": 8.75, "learning_rate": 1.95e-05, "loss": 9.2177, "step": 130 }, { "epoch": 0.06064690026954178, "grad_norm": 8.8125, "learning_rate": 2.025e-05, "loss": 9.1683, "step": 135 }, { "epoch": 0.06289308176100629, "grad_norm": 9.875, "learning_rate": 2.1e-05, "loss": 9.1444, "step": 140 }, { "epoch": 0.0651392632524708, "grad_norm": 9.625, "learning_rate": 2.1749999999999997e-05, "loss": 9.0632, "step": 145 }, { "epoch": 0.0673854447439353, "grad_norm": 8.8125, "learning_rate": 2.2499999999999998e-05, "loss": 9.0828, "step": 150 }, { "epoch": 0.06963162623539983, "grad_norm": 9.5625, "learning_rate": 2.325e-05, "loss": 9.0005, "step": 155 }, { "epoch": 0.07187780772686433, "grad_norm": 11.1875, "learning_rate": 2.3999999999999997e-05, "loss": 8.9463, "step": 160 }, { "epoch": 0.07412398921832884, "grad_norm": 9.3125, "learning_rate": 2.475e-05, "loss": 8.9145, "step": 165 }, { "epoch": 0.07637017070979335, "grad_norm": 8.1875, "learning_rate": 2.55e-05, "loss": 8.8803, "step": 170 }, { "epoch": 0.07861635220125786, "grad_norm": 7.65625, "learning_rate": 2.6249999999999998e-05, "loss": 8.8266, "step": 175 }, { "epoch": 0.08086253369272237, "grad_norm": 7.78125, "learning_rate": 2.6999999999999996e-05, "loss": 8.7826, "step": 180 }, { "epoch": 0.08310871518418689, "grad_norm": 8.875, "learning_rate": 2.7749999999999997e-05, "loss": 8.7463, "step": 185 }, { "epoch": 0.0853548966756514, "grad_norm": 8.375, "learning_rate": 2.8499999999999998e-05, "loss": 8.6836, "step": 190 }, { "epoch": 0.0876010781671159, "grad_norm": 8.5, "learning_rate": 2.925e-05, "loss": 8.6827, "step": 195 }, { "epoch": 0.08984725965858041, "grad_norm": 8.25, "learning_rate": 2.9999999999999997e-05, "loss": 8.588, "step": 200 }, { "epoch": 0.09209344115004492, "grad_norm": 8.3125, "learning_rate": 3.0749999999999995e-05, "loss": 8.5417, "step": 205 }, { "epoch": 0.09433962264150944, "grad_norm": 9.4375, "learning_rate": 3.149999999999999e-05, "loss": 8.5287, "step": 210 }, { "epoch": 0.09658580413297395, "grad_norm": 8.125, "learning_rate": 3.225e-05, "loss": 8.49, "step": 215 }, { "epoch": 0.09883198562443846, "grad_norm": 7.59375, "learning_rate": 3.2999999999999996e-05, "loss": 8.4025, "step": 220 }, { "epoch": 0.10107816711590296, "grad_norm": 8.75, "learning_rate": 3.375e-05, "loss": 8.3121, "step": 225 }, { "epoch": 0.10332434860736747, "grad_norm": 7.8125, "learning_rate": 3.45e-05, "loss": 8.2635, "step": 230 }, { "epoch": 0.10557053009883198, "grad_norm": 8.3125, "learning_rate": 3.5249999999999996e-05, "loss": 8.2691, "step": 235 }, { "epoch": 0.1078167115902965, "grad_norm": 9.4375, "learning_rate": 3.5999999999999994e-05, "loss": 8.1828, "step": 240 }, { "epoch": 0.11006289308176101, "grad_norm": 7.0625, "learning_rate": 3.675e-05, "loss": 8.0901, "step": 245 }, { "epoch": 0.11230907457322552, "grad_norm": 8.125, "learning_rate": 3.75e-05, "loss": 8.0418, "step": 250 }, { "epoch": 0.11455525606469003, "grad_norm": 7.0625, "learning_rate": 3.8249999999999995e-05, "loss": 8.0148, "step": 255 }, { "epoch": 0.11680143755615453, "grad_norm": 7.5625, "learning_rate": 3.9e-05, "loss": 7.9943, "step": 260 }, { "epoch": 0.11904761904761904, "grad_norm": 7.0625, "learning_rate": 3.975e-05, "loss": 7.852, "step": 265 }, { "epoch": 0.12129380053908356, "grad_norm": 6.6875, "learning_rate": 4.05e-05, "loss": 7.8506, "step": 270 }, { "epoch": 0.12353998203054807, "grad_norm": 7.46875, "learning_rate": 4.125e-05, "loss": 7.7912, "step": 275 }, { "epoch": 0.12578616352201258, "grad_norm": 6.0, "learning_rate": 4.2e-05, "loss": 7.7331, "step": 280 }, { "epoch": 0.1280323450134771, "grad_norm": 6.75, "learning_rate": 4.2749999999999996e-05, "loss": 7.6362, "step": 285 }, { "epoch": 0.1302785265049416, "grad_norm": 5.9375, "learning_rate": 4.3499999999999993e-05, "loss": 7.5867, "step": 290 }, { "epoch": 0.13252470799640612, "grad_norm": 6.40625, "learning_rate": 4.424999999999999e-05, "loss": 7.5268, "step": 295 }, { "epoch": 0.1347708894878706, "grad_norm": 5.71875, "learning_rate": 4.4999999999999996e-05, "loss": 7.5554, "step": 300 }, { "epoch": 0.13701707097933513, "grad_norm": 5.5, "learning_rate": 4.5749999999999994e-05, "loss": 7.4486, "step": 305 }, { "epoch": 0.13926325247079965, "grad_norm": 5.15625, "learning_rate": 4.65e-05, "loss": 7.4554, "step": 310 }, { "epoch": 0.14150943396226415, "grad_norm": 4.84375, "learning_rate": 4.7249999999999997e-05, "loss": 7.3681, "step": 315 }, { "epoch": 0.14375561545372867, "grad_norm": 7.625, "learning_rate": 4.7999999999999994e-05, "loss": 7.2977, "step": 320 }, { "epoch": 0.14600179694519316, "grad_norm": 5.25, "learning_rate": 4.875e-05, "loss": 7.2572, "step": 325 }, { "epoch": 0.14824797843665768, "grad_norm": 5.125, "learning_rate": 4.95e-05, "loss": 7.322, "step": 330 }, { "epoch": 0.15049415992812218, "grad_norm": 4.96875, "learning_rate": 5.025e-05, "loss": 7.2646, "step": 335 }, { "epoch": 0.1527403414195867, "grad_norm": 4.96875, "learning_rate": 5.1e-05, "loss": 7.32, "step": 340 }, { "epoch": 0.15498652291105122, "grad_norm": 5.3125, "learning_rate": 5.174999999999999e-05, "loss": 7.209, "step": 345 }, { "epoch": 0.15723270440251572, "grad_norm": 5.40625, "learning_rate": 5.2499999999999995e-05, "loss": 7.1961, "step": 350 }, { "epoch": 0.15947888589398024, "grad_norm": 4.15625, "learning_rate": 5.324999999999999e-05, "loss": 7.2062, "step": 355 }, { "epoch": 0.16172506738544473, "grad_norm": 4.65625, "learning_rate": 5.399999999999999e-05, "loss": 7.1401, "step": 360 }, { "epoch": 0.16397124887690925, "grad_norm": 5.71875, "learning_rate": 5.4749999999999996e-05, "loss": 7.1402, "step": 365 }, { "epoch": 0.16621743036837378, "grad_norm": 5.34375, "learning_rate": 5.5499999999999994e-05, "loss": 7.073, "step": 370 }, { "epoch": 0.16846361185983827, "grad_norm": 5.96875, "learning_rate": 5.625e-05, "loss": 7.115, "step": 375 }, { "epoch": 0.1707097933513028, "grad_norm": 4.625, "learning_rate": 5.6999999999999996e-05, "loss": 7.1363, "step": 380 }, { "epoch": 0.17295597484276728, "grad_norm": 5.34375, "learning_rate": 5.7749999999999994e-05, "loss": 7.1075, "step": 385 }, { "epoch": 0.1752021563342318, "grad_norm": 4.46875, "learning_rate": 5.85e-05, "loss": 7.0746, "step": 390 }, { "epoch": 0.17744833782569633, "grad_norm": 4.53125, "learning_rate": 5.925e-05, "loss": 7.0877, "step": 395 }, { "epoch": 0.17969451931716082, "grad_norm": 4.6875, "learning_rate": 5.9999999999999995e-05, "loss": 7.033, "step": 400 }, { "epoch": 0.18194070080862534, "grad_norm": 4.9375, "learning_rate": 6.075e-05, "loss": 7.0603, "step": 405 }, { "epoch": 0.18418688230008984, "grad_norm": 4.8125, "learning_rate": 6.149999999999999e-05, "loss": 7.0149, "step": 410 }, { "epoch": 0.18643306379155436, "grad_norm": 4.6875, "learning_rate": 6.225e-05, "loss": 6.9823, "step": 415 }, { "epoch": 0.18867924528301888, "grad_norm": 5.65625, "learning_rate": 6.299999999999999e-05, "loss": 7.0107, "step": 420 }, { "epoch": 0.19092542677448338, "grad_norm": 4.5625, "learning_rate": 6.374999999999999e-05, "loss": 7.0235, "step": 425 }, { "epoch": 0.1931716082659479, "grad_norm": 4.71875, "learning_rate": 6.45e-05, "loss": 6.9444, "step": 430 }, { "epoch": 0.1954177897574124, "grad_norm": 4.6875, "learning_rate": 6.525e-05, "loss": 6.9067, "step": 435 }, { "epoch": 0.1976639712488769, "grad_norm": 4.375, "learning_rate": 6.599999999999999e-05, "loss": 6.9952, "step": 440 }, { "epoch": 0.1999101527403414, "grad_norm": 4.46875, "learning_rate": 6.675e-05, "loss": 6.8992, "step": 445 }, { "epoch": 0.20215633423180593, "grad_norm": 4.875, "learning_rate": 6.75e-05, "loss": 6.931, "step": 450 }, { "epoch": 0.20440251572327045, "grad_norm": 4.6875, "learning_rate": 6.824999999999999e-05, "loss": 6.9036, "step": 455 }, { "epoch": 0.20664869721473494, "grad_norm": 4.75, "learning_rate": 6.9e-05, "loss": 6.9332, "step": 460 }, { "epoch": 0.20889487870619947, "grad_norm": 4.25, "learning_rate": 6.975e-05, "loss": 7.0612, "step": 465 }, { "epoch": 0.21114106019766396, "grad_norm": 4.59375, "learning_rate": 7.049999999999999e-05, "loss": 6.8777, "step": 470 }, { "epoch": 0.21338724168912848, "grad_norm": 4.59375, "learning_rate": 7.125e-05, "loss": 6.8593, "step": 475 }, { "epoch": 0.215633423180593, "grad_norm": 5.1875, "learning_rate": 7.199999999999999e-05, "loss": 6.9541, "step": 480 }, { "epoch": 0.2178796046720575, "grad_norm": 4.65625, "learning_rate": 7.274999999999999e-05, "loss": 6.878, "step": 485 }, { "epoch": 0.22012578616352202, "grad_norm": 5.1875, "learning_rate": 7.35e-05, "loss": 6.8284, "step": 490 }, { "epoch": 0.2223719676549865, "grad_norm": 3.9375, "learning_rate": 7.424999999999999e-05, "loss": 6.8567, "step": 495 }, { "epoch": 0.22461814914645103, "grad_norm": 5.1875, "learning_rate": 7.5e-05, "loss": 6.8235, "step": 500 }, { "epoch": 0.22686433063791556, "grad_norm": 4.65625, "learning_rate": 7.575e-05, "loss": 6.8903, "step": 505 }, { "epoch": 0.22911051212938005, "grad_norm": 5.875, "learning_rate": 7.649999999999999e-05, "loss": 6.8404, "step": 510 }, { "epoch": 0.23135669362084457, "grad_norm": 5.0625, "learning_rate": 7.725e-05, "loss": 6.8318, "step": 515 }, { "epoch": 0.23360287511230907, "grad_norm": 4.5625, "learning_rate": 7.8e-05, "loss": 6.8522, "step": 520 }, { "epoch": 0.2358490566037736, "grad_norm": 5.03125, "learning_rate": 7.874999999999999e-05, "loss": 6.859, "step": 525 }, { "epoch": 0.23809523809523808, "grad_norm": 4.71875, "learning_rate": 7.95e-05, "loss": 6.8336, "step": 530 }, { "epoch": 0.2403414195867026, "grad_norm": 4.875, "learning_rate": 8.025e-05, "loss": 6.7897, "step": 535 }, { "epoch": 0.24258760107816713, "grad_norm": 4.375, "learning_rate": 8.1e-05, "loss": 6.7873, "step": 540 }, { "epoch": 0.24483378256963162, "grad_norm": 4.34375, "learning_rate": 8.175e-05, "loss": 6.7691, "step": 545 }, { "epoch": 0.24707996406109614, "grad_norm": 4.40625, "learning_rate": 8.25e-05, "loss": 6.8252, "step": 550 }, { "epoch": 0.24932614555256064, "grad_norm": 4.6875, "learning_rate": 8.325e-05, "loss": 6.8071, "step": 555 }, { "epoch": 0.25157232704402516, "grad_norm": 4.65625, "learning_rate": 8.4e-05, "loss": 6.7156, "step": 560 }, { "epoch": 0.25381850853548965, "grad_norm": 4.875, "learning_rate": 8.474999999999999e-05, "loss": 6.8189, "step": 565 }, { "epoch": 0.2560646900269542, "grad_norm": 4.53125, "learning_rate": 8.549999999999999e-05, "loss": 6.8159, "step": 570 }, { "epoch": 0.2583108715184187, "grad_norm": 3.75, "learning_rate": 8.624999999999998e-05, "loss": 6.847, "step": 575 }, { "epoch": 0.2605570530098832, "grad_norm": 4.71875, "learning_rate": 8.699999999999999e-05, "loss": 6.7576, "step": 580 }, { "epoch": 0.2628032345013477, "grad_norm": 5.375, "learning_rate": 8.774999999999999e-05, "loss": 6.7211, "step": 585 }, { "epoch": 0.26504941599281223, "grad_norm": 4.875, "learning_rate": 8.849999999999998e-05, "loss": 6.7255, "step": 590 }, { "epoch": 0.2672955974842767, "grad_norm": 4.71875, "learning_rate": 8.924999999999999e-05, "loss": 6.6598, "step": 595 }, { "epoch": 0.2695417789757412, "grad_norm": 4.25, "learning_rate": 8.999999999999999e-05, "loss": 6.7735, "step": 600 }, { "epoch": 0.27178796046720577, "grad_norm": 4.3125, "learning_rate": 9.074999999999998e-05, "loss": 6.7253, "step": 605 }, { "epoch": 0.27403414195867026, "grad_norm": 4.8125, "learning_rate": 9.149999999999999e-05, "loss": 6.6825, "step": 610 }, { "epoch": 0.27628032345013476, "grad_norm": 4.40625, "learning_rate": 9.224999999999999e-05, "loss": 6.7523, "step": 615 }, { "epoch": 0.2785265049415993, "grad_norm": 4.46875, "learning_rate": 9.3e-05, "loss": 6.7212, "step": 620 }, { "epoch": 0.2807726864330638, "grad_norm": 4.875, "learning_rate": 9.374999999999999e-05, "loss": 6.7052, "step": 625 }, { "epoch": 0.2830188679245283, "grad_norm": 4.6875, "learning_rate": 9.449999999999999e-05, "loss": 6.7031, "step": 630 }, { "epoch": 0.2852650494159928, "grad_norm": 4.1875, "learning_rate": 9.525e-05, "loss": 6.7163, "step": 635 }, { "epoch": 0.28751123090745734, "grad_norm": 4.15625, "learning_rate": 9.599999999999999e-05, "loss": 6.7148, "step": 640 }, { "epoch": 0.28975741239892183, "grad_norm": 3.78125, "learning_rate": 9.675e-05, "loss": 6.7027, "step": 645 }, { "epoch": 0.2920035938903863, "grad_norm": 4.375, "learning_rate": 9.75e-05, "loss": 6.6511, "step": 650 }, { "epoch": 0.2942497753818509, "grad_norm": 4.0625, "learning_rate": 9.824999999999999e-05, "loss": 6.704, "step": 655 }, { "epoch": 0.29649595687331537, "grad_norm": 4.09375, "learning_rate": 9.9e-05, "loss": 6.689, "step": 660 }, { "epoch": 0.29874213836477986, "grad_norm": 3.875, "learning_rate": 9.975e-05, "loss": 6.6784, "step": 665 }, { "epoch": 0.30098831985624436, "grad_norm": 4.5, "learning_rate": 0.0001005, "loss": 6.597, "step": 670 }, { "epoch": 0.3032345013477089, "grad_norm": 4.3125, "learning_rate": 0.00010125, "loss": 6.6198, "step": 675 }, { "epoch": 0.3054806828391734, "grad_norm": 4.03125, "learning_rate": 0.000102, "loss": 6.6226, "step": 680 }, { "epoch": 0.3077268643306379, "grad_norm": 4.03125, "learning_rate": 0.00010275, "loss": 6.627, "step": 685 }, { "epoch": 0.30997304582210244, "grad_norm": 5.1875, "learning_rate": 0.00010349999999999998, "loss": 6.6003, "step": 690 }, { "epoch": 0.31221922731356694, "grad_norm": 3.640625, "learning_rate": 0.00010424999999999999, "loss": 6.5845, "step": 695 }, { "epoch": 0.31446540880503143, "grad_norm": 4.4375, "learning_rate": 0.00010499999999999999, "loss": 6.6143, "step": 700 }, { "epoch": 0.316711590296496, "grad_norm": 4.90625, "learning_rate": 0.00010574999999999998, "loss": 6.6305, "step": 705 }, { "epoch": 0.3189577717879605, "grad_norm": 4.3125, "learning_rate": 0.00010649999999999999, "loss": 6.5312, "step": 710 }, { "epoch": 0.32120395327942497, "grad_norm": 4.15625, "learning_rate": 0.00010724999999999999, "loss": 6.63, "step": 715 }, { "epoch": 0.32345013477088946, "grad_norm": 4.53125, "learning_rate": 0.00010799999999999998, "loss": 6.564, "step": 720 }, { "epoch": 0.325696316262354, "grad_norm": 4.03125, "learning_rate": 0.00010874999999999999, "loss": 6.6572, "step": 725 }, { "epoch": 0.3279424977538185, "grad_norm": 4.40625, "learning_rate": 0.00010949999999999999, "loss": 6.5728, "step": 730 }, { "epoch": 0.330188679245283, "grad_norm": 4.34375, "learning_rate": 0.00011024999999999998, "loss": 6.5245, "step": 735 }, { "epoch": 0.33243486073674755, "grad_norm": 5.5, "learning_rate": 0.00011099999999999999, "loss": 6.5883, "step": 740 }, { "epoch": 0.33468104222821204, "grad_norm": 5.53125, "learning_rate": 0.00011174999999999999, "loss": 6.5549, "step": 745 }, { "epoch": 0.33692722371967654, "grad_norm": 4.40625, "learning_rate": 0.0001125, "loss": 6.5269, "step": 750 }, { "epoch": 0.33917340521114103, "grad_norm": 4.65625, "learning_rate": 0.00011324999999999999, "loss": 6.5262, "step": 755 }, { "epoch": 0.3414195867026056, "grad_norm": 4.25, "learning_rate": 0.00011399999999999999, "loss": 6.4958, "step": 760 }, { "epoch": 0.3436657681940701, "grad_norm": 4.34375, "learning_rate": 0.00011475, "loss": 6.4719, "step": 765 }, { "epoch": 0.34591194968553457, "grad_norm": 3.828125, "learning_rate": 0.00011549999999999999, "loss": 6.4948, "step": 770 }, { "epoch": 0.3481581311769991, "grad_norm": 3.890625, "learning_rate": 0.00011624999999999999, "loss": 6.5652, "step": 775 }, { "epoch": 0.3504043126684636, "grad_norm": 3.828125, "learning_rate": 0.000117, "loss": 6.633, "step": 780 }, { "epoch": 0.3526504941599281, "grad_norm": 3.78125, "learning_rate": 0.00011774999999999999, "loss": 6.4617, "step": 785 }, { "epoch": 0.35489667565139266, "grad_norm": 3.9375, "learning_rate": 0.0001185, "loss": 6.524, "step": 790 }, { "epoch": 0.35714285714285715, "grad_norm": 7.25, "learning_rate": 0.00011925, "loss": 6.4985, "step": 795 }, { "epoch": 0.35938903863432164, "grad_norm": 3.828125, "learning_rate": 0.00011999999999999999, "loss": 6.4988, "step": 800 }, { "epoch": 0.36163522012578614, "grad_norm": 5.125, "learning_rate": 0.00012075, "loss": 6.5393, "step": 805 }, { "epoch": 0.3638814016172507, "grad_norm": 4.90625, "learning_rate": 0.0001215, "loss": 6.4869, "step": 810 }, { "epoch": 0.3661275831087152, "grad_norm": 4.1875, "learning_rate": 0.00012225, "loss": 6.4419, "step": 815 }, { "epoch": 0.3683737646001797, "grad_norm": 3.765625, "learning_rate": 0.00012299999999999998, "loss": 6.574, "step": 820 }, { "epoch": 0.3706199460916442, "grad_norm": 3.796875, "learning_rate": 0.00012374999999999997, "loss": 6.5063, "step": 825 }, { "epoch": 0.3728661275831087, "grad_norm": 3.734375, "learning_rate": 0.0001245, "loss": 6.5404, "step": 830 }, { "epoch": 0.3751123090745732, "grad_norm": 3.65625, "learning_rate": 0.00012524999999999998, "loss": 6.4726, "step": 835 }, { "epoch": 0.37735849056603776, "grad_norm": 4.0, "learning_rate": 0.00012599999999999997, "loss": 6.4099, "step": 840 }, { "epoch": 0.37960467205750226, "grad_norm": 4.25, "learning_rate": 0.00012675, "loss": 6.3966, "step": 845 }, { "epoch": 0.38185085354896675, "grad_norm": 3.828125, "learning_rate": 0.00012749999999999998, "loss": 6.4607, "step": 850 }, { "epoch": 0.38409703504043125, "grad_norm": 4.28125, "learning_rate": 0.00012824999999999997, "loss": 6.4718, "step": 855 }, { "epoch": 0.3863432165318958, "grad_norm": 4.71875, "learning_rate": 0.000129, "loss": 6.4569, "step": 860 }, { "epoch": 0.3885893980233603, "grad_norm": 4.375, "learning_rate": 0.00012974999999999998, "loss": 6.3576, "step": 865 }, { "epoch": 0.3908355795148248, "grad_norm": 4.65625, "learning_rate": 0.0001305, "loss": 6.4259, "step": 870 }, { "epoch": 0.39308176100628933, "grad_norm": 4.96875, "learning_rate": 0.00013125, "loss": 6.3831, "step": 875 }, { "epoch": 0.3953279424977538, "grad_norm": 3.90625, "learning_rate": 0.00013199999999999998, "loss": 6.4086, "step": 880 }, { "epoch": 0.3975741239892183, "grad_norm": 3.75, "learning_rate": 0.00013275, "loss": 6.3207, "step": 885 }, { "epoch": 0.3998203054806828, "grad_norm": 4.28125, "learning_rate": 0.0001335, "loss": 6.4129, "step": 890 }, { "epoch": 0.40206648697214736, "grad_norm": 3.8125, "learning_rate": 0.00013424999999999998, "loss": 6.4397, "step": 895 }, { "epoch": 0.40431266846361186, "grad_norm": 3.921875, "learning_rate": 0.000135, "loss": 6.4104, "step": 900 }, { "epoch": 0.40655884995507635, "grad_norm": 3.984375, "learning_rate": 0.00013575, "loss": 6.3327, "step": 905 }, { "epoch": 0.4088050314465409, "grad_norm": 3.859375, "learning_rate": 0.00013649999999999998, "loss": 6.3965, "step": 910 }, { "epoch": 0.4110512129380054, "grad_norm": 4.03125, "learning_rate": 0.00013725, "loss": 6.3614, "step": 915 }, { "epoch": 0.4132973944294699, "grad_norm": 3.734375, "learning_rate": 0.000138, "loss": 6.3743, "step": 920 }, { "epoch": 0.41554357592093444, "grad_norm": 3.984375, "learning_rate": 0.00013874999999999998, "loss": 6.4228, "step": 925 }, { "epoch": 0.41778975741239893, "grad_norm": 4.03125, "learning_rate": 0.0001395, "loss": 6.4047, "step": 930 }, { "epoch": 0.4200359389038634, "grad_norm": 3.984375, "learning_rate": 0.00014025, "loss": 6.3634, "step": 935 }, { "epoch": 0.4222821203953279, "grad_norm": 4.0, "learning_rate": 0.00014099999999999998, "loss": 6.3866, "step": 940 }, { "epoch": 0.42452830188679247, "grad_norm": 3.796875, "learning_rate": 0.00014174999999999998, "loss": 6.3599, "step": 945 }, { "epoch": 0.42677448337825696, "grad_norm": 4.03125, "learning_rate": 0.0001425, "loss": 6.3422, "step": 950 }, { "epoch": 0.42902066486972146, "grad_norm": 4.15625, "learning_rate": 0.00014324999999999999, "loss": 6.2791, "step": 955 }, { "epoch": 0.431266846361186, "grad_norm": 3.96875, "learning_rate": 0.00014399999999999998, "loss": 6.3505, "step": 960 }, { "epoch": 0.4335130278526505, "grad_norm": 4.5, "learning_rate": 0.00014475, "loss": 6.3671, "step": 965 }, { "epoch": 0.435759209344115, "grad_norm": 3.65625, "learning_rate": 0.00014549999999999999, "loss": 6.318, "step": 970 }, { "epoch": 0.4380053908355795, "grad_norm": 4.28125, "learning_rate": 0.00014624999999999998, "loss": 6.3299, "step": 975 }, { "epoch": 0.44025157232704404, "grad_norm": 3.578125, "learning_rate": 0.000147, "loss": 6.4073, "step": 980 }, { "epoch": 0.44249775381850853, "grad_norm": 3.734375, "learning_rate": 0.00014774999999999999, "loss": 6.4377, "step": 985 }, { "epoch": 0.444743935309973, "grad_norm": 3.765625, "learning_rate": 0.00014849999999999998, "loss": 6.2784, "step": 990 }, { "epoch": 0.4469901168014376, "grad_norm": 3.953125, "learning_rate": 0.00014925, "loss": 6.2901, "step": 995 }, { "epoch": 0.44923629829290207, "grad_norm": 4.375, "learning_rate": 0.00015, "loss": 6.2973, "step": 1000 }, { "epoch": 0.44923629829290207, "eval_loss": 6.229096412658691, "eval_runtime": 16.2469, "eval_samples_per_second": 1908.854, "eval_steps_per_second": 238.63, "step": 1000 }, { "epoch": 0.45148247978436656, "grad_norm": 3.78125, "learning_rate": 0.00015074999999999998, "loss": 6.3253, "step": 1005 }, { "epoch": 0.4537286612758311, "grad_norm": 3.953125, "learning_rate": 0.0001515, "loss": 6.2906, "step": 1010 }, { "epoch": 0.4559748427672956, "grad_norm": 3.90625, "learning_rate": 0.00015224999999999996, "loss": 6.3351, "step": 1015 }, { "epoch": 0.4582210242587601, "grad_norm": 3.6875, "learning_rate": 0.00015299999999999998, "loss": 6.368, "step": 1020 }, { "epoch": 0.4604672057502246, "grad_norm": 3.796875, "learning_rate": 0.00015374999999999997, "loss": 6.3008, "step": 1025 }, { "epoch": 0.46271338724168914, "grad_norm": 3.703125, "learning_rate": 0.0001545, "loss": 6.283, "step": 1030 }, { "epoch": 0.46495956873315364, "grad_norm": 3.734375, "learning_rate": 0.00015524999999999998, "loss": 6.3212, "step": 1035 }, { "epoch": 0.46720575022461813, "grad_norm": 4.15625, "learning_rate": 0.000156, "loss": 6.2874, "step": 1040 }, { "epoch": 0.4694519317160827, "grad_norm": 3.484375, "learning_rate": 0.00015675, "loss": 6.2944, "step": 1045 }, { "epoch": 0.4716981132075472, "grad_norm": 4.3125, "learning_rate": 0.00015749999999999998, "loss": 6.3099, "step": 1050 }, { "epoch": 0.47394429469901167, "grad_norm": 3.734375, "learning_rate": 0.00015824999999999997, "loss": 6.2531, "step": 1055 }, { "epoch": 0.47619047619047616, "grad_norm": 3.609375, "learning_rate": 0.000159, "loss": 6.2326, "step": 1060 }, { "epoch": 0.4784366576819407, "grad_norm": 3.8125, "learning_rate": 0.00015974999999999998, "loss": 6.2059, "step": 1065 }, { "epoch": 0.4806828391734052, "grad_norm": 3.625, "learning_rate": 0.0001605, "loss": 6.2798, "step": 1070 }, { "epoch": 0.4829290206648697, "grad_norm": 3.890625, "learning_rate": 0.00016125, "loss": 6.2814, "step": 1075 }, { "epoch": 0.48517520215633425, "grad_norm": 3.84375, "learning_rate": 0.000162, "loss": 6.1955, "step": 1080 }, { "epoch": 0.48742138364779874, "grad_norm": 4.0, "learning_rate": 0.00016274999999999997, "loss": 6.3142, "step": 1085 }, { "epoch": 0.48966756513926324, "grad_norm": 3.71875, "learning_rate": 0.0001635, "loss": 6.193, "step": 1090 }, { "epoch": 0.4919137466307278, "grad_norm": 4.0, "learning_rate": 0.00016424999999999998, "loss": 6.26, "step": 1095 }, { "epoch": 0.4941599281221923, "grad_norm": 4.0625, "learning_rate": 0.000165, "loss": 6.2443, "step": 1100 }, { "epoch": 0.4964061096136568, "grad_norm": 3.671875, "learning_rate": 0.00016575, "loss": 6.2278, "step": 1105 }, { "epoch": 0.49865229110512127, "grad_norm": 3.6875, "learning_rate": 0.0001665, "loss": 6.2254, "step": 1110 }, { "epoch": 0.5008984725965858, "grad_norm": 3.921875, "learning_rate": 0.00016724999999999997, "loss": 6.3325, "step": 1115 }, { "epoch": 0.5031446540880503, "grad_norm": 3.921875, "learning_rate": 0.000168, "loss": 6.186, "step": 1120 }, { "epoch": 0.5053908355795148, "grad_norm": 3.859375, "learning_rate": 0.00016874999999999998, "loss": 6.2389, "step": 1125 }, { "epoch": 0.5076370170709793, "grad_norm": 4.71875, "learning_rate": 0.00016949999999999997, "loss": 6.1268, "step": 1130 }, { "epoch": 0.5098831985624438, "grad_norm": 3.90625, "learning_rate": 0.00017025, "loss": 6.1445, "step": 1135 }, { "epoch": 0.5121293800539084, "grad_norm": 3.484375, "learning_rate": 0.00017099999999999998, "loss": 6.1658, "step": 1140 }, { "epoch": 0.5143755615453729, "grad_norm": 3.78125, "learning_rate": 0.00017175, "loss": 6.1832, "step": 1145 }, { "epoch": 0.5166217430368374, "grad_norm": 3.96875, "learning_rate": 0.00017249999999999996, "loss": 6.1621, "step": 1150 }, { "epoch": 0.5188679245283019, "grad_norm": 3.765625, "learning_rate": 0.00017324999999999998, "loss": 6.22, "step": 1155 }, { "epoch": 0.5211141060197664, "grad_norm": 3.890625, "learning_rate": 0.00017399999999999997, "loss": 6.1432, "step": 1160 }, { "epoch": 0.5233602875112309, "grad_norm": 3.59375, "learning_rate": 0.00017475, "loss": 6.1223, "step": 1165 }, { "epoch": 0.5256064690026954, "grad_norm": 3.28125, "learning_rate": 0.00017549999999999998, "loss": 6.1839, "step": 1170 }, { "epoch": 0.52785265049416, "grad_norm": 3.9375, "learning_rate": 0.00017625, "loss": 6.2021, "step": 1175 }, { "epoch": 0.5300988319856245, "grad_norm": 4.03125, "learning_rate": 0.00017699999999999997, "loss": 6.1947, "step": 1180 }, { "epoch": 0.532345013477089, "grad_norm": 4.5, "learning_rate": 0.00017774999999999998, "loss": 6.1474, "step": 1185 }, { "epoch": 0.5345911949685535, "grad_norm": 3.671875, "learning_rate": 0.00017849999999999997, "loss": 6.1488, "step": 1190 }, { "epoch": 0.536837376460018, "grad_norm": 3.734375, "learning_rate": 0.00017925, "loss": 6.1943, "step": 1195 }, { "epoch": 0.5390835579514824, "grad_norm": 3.8125, "learning_rate": 0.00017999999999999998, "loss": 6.13, "step": 1200 }, { "epoch": 0.541329739442947, "grad_norm": 3.828125, "learning_rate": 0.00018075, "loss": 6.0818, "step": 1205 }, { "epoch": 0.5435759209344115, "grad_norm": 3.546875, "learning_rate": 0.00018149999999999997, "loss": 6.1505, "step": 1210 }, { "epoch": 0.545822102425876, "grad_norm": 4.03125, "learning_rate": 0.00018224999999999998, "loss": 6.1578, "step": 1215 }, { "epoch": 0.5480682839173405, "grad_norm": 3.921875, "learning_rate": 0.00018299999999999998, "loss": 6.0904, "step": 1220 }, { "epoch": 0.550314465408805, "grad_norm": 4.1875, "learning_rate": 0.00018375, "loss": 6.0851, "step": 1225 }, { "epoch": 0.5525606469002695, "grad_norm": 4.21875, "learning_rate": 0.00018449999999999999, "loss": 6.1133, "step": 1230 }, { "epoch": 0.554806828391734, "grad_norm": 3.765625, "learning_rate": 0.00018525, "loss": 6.1453, "step": 1235 }, { "epoch": 0.5570530098831986, "grad_norm": 3.671875, "learning_rate": 0.000186, "loss": 6.1572, "step": 1240 }, { "epoch": 0.5592991913746631, "grad_norm": 3.8125, "learning_rate": 0.00018675, "loss": 6.2205, "step": 1245 }, { "epoch": 0.5615453728661276, "grad_norm": 4.4375, "learning_rate": 0.00018749999999999998, "loss": 6.1114, "step": 1250 }, { "epoch": 0.5637915543575921, "grad_norm": 4.03125, "learning_rate": 0.00018824999999999997, "loss": 6.1407, "step": 1255 }, { "epoch": 0.5660377358490566, "grad_norm": 4.1875, "learning_rate": 0.00018899999999999999, "loss": 6.1272, "step": 1260 }, { "epoch": 0.5682839173405211, "grad_norm": 4.03125, "learning_rate": 0.00018974999999999998, "loss": 6.1264, "step": 1265 }, { "epoch": 0.5705300988319856, "grad_norm": 4.09375, "learning_rate": 0.0001905, "loss": 6.0308, "step": 1270 }, { "epoch": 0.5727762803234502, "grad_norm": 3.421875, "learning_rate": 0.00019124999999999996, "loss": 6.1028, "step": 1275 }, { "epoch": 0.5750224618149147, "grad_norm": 3.953125, "learning_rate": 0.00019199999999999998, "loss": 6.1002, "step": 1280 }, { "epoch": 0.5772686433063792, "grad_norm": 4.1875, "learning_rate": 0.00019274999999999997, "loss": 6.1451, "step": 1285 }, { "epoch": 0.5795148247978437, "grad_norm": 4.0625, "learning_rate": 0.0001935, "loss": 6.0798, "step": 1290 }, { "epoch": 0.5817610062893082, "grad_norm": 3.609375, "learning_rate": 0.00019424999999999998, "loss": 6.0831, "step": 1295 }, { "epoch": 0.5840071877807727, "grad_norm": 3.671875, "learning_rate": 0.000195, "loss": 6.1054, "step": 1300 }, { "epoch": 0.5862533692722371, "grad_norm": 3.625, "learning_rate": 0.00019574999999999996, "loss": 6.0122, "step": 1305 }, { "epoch": 0.5884995507637018, "grad_norm": 4.0625, "learning_rate": 0.00019649999999999998, "loss": 6.0397, "step": 1310 }, { "epoch": 0.5907457322551662, "grad_norm": 3.59375, "learning_rate": 0.00019724999999999997, "loss": 5.9765, "step": 1315 }, { "epoch": 0.5929919137466307, "grad_norm": 3.296875, "learning_rate": 0.000198, "loss": 6.0359, "step": 1320 }, { "epoch": 0.5952380952380952, "grad_norm": 3.828125, "learning_rate": 0.00019874999999999998, "loss": 6.0552, "step": 1325 }, { "epoch": 0.5974842767295597, "grad_norm": 3.5625, "learning_rate": 0.0001995, "loss": 6.0254, "step": 1330 }, { "epoch": 0.5997304582210242, "grad_norm": 3.703125, "learning_rate": 0.00020025, "loss": 6.0575, "step": 1335 }, { "epoch": 0.6019766397124887, "grad_norm": 3.59375, "learning_rate": 0.000201, "loss": 6.004, "step": 1340 }, { "epoch": 0.6042228212039533, "grad_norm": 3.65625, "learning_rate": 0.00020174999999999997, "loss": 6.0784, "step": 1345 }, { "epoch": 0.6064690026954178, "grad_norm": 3.78125, "learning_rate": 0.0002025, "loss": 6.1157, "step": 1350 }, { "epoch": 0.6087151841868823, "grad_norm": 3.65625, "learning_rate": 0.00020324999999999998, "loss": 6.0583, "step": 1355 }, { "epoch": 0.6109613656783468, "grad_norm": 3.4375, "learning_rate": 0.000204, "loss": 6.0366, "step": 1360 }, { "epoch": 0.6132075471698113, "grad_norm": 3.4375, "learning_rate": 0.00020475, "loss": 6.1213, "step": 1365 }, { "epoch": 0.6154537286612758, "grad_norm": 3.8125, "learning_rate": 0.0002055, "loss": 6.1744, "step": 1370 }, { "epoch": 0.6176999101527404, "grad_norm": 3.8125, "learning_rate": 0.00020624999999999997, "loss": 6.0912, "step": 1375 }, { "epoch": 0.6199460916442049, "grad_norm": 3.421875, "learning_rate": 0.00020699999999999996, "loss": 5.9619, "step": 1380 }, { "epoch": 0.6221922731356694, "grad_norm": 3.78125, "learning_rate": 0.00020774999999999998, "loss": 5.9658, "step": 1385 }, { "epoch": 0.6244384546271339, "grad_norm": 3.484375, "learning_rate": 0.00020849999999999997, "loss": 6.0913, "step": 1390 }, { "epoch": 0.6266846361185984, "grad_norm": 3.484375, "learning_rate": 0.00020925, "loss": 6.0363, "step": 1395 }, { "epoch": 0.6289308176100629, "grad_norm": 3.890625, "learning_rate": 0.00020999999999999998, "loss": 5.9513, "step": 1400 }, { "epoch": 0.6311769991015274, "grad_norm": 4.0625, "learning_rate": 0.00021074999999999997, "loss": 5.9931, "step": 1405 }, { "epoch": 0.633423180592992, "grad_norm": 4.0, "learning_rate": 0.00021149999999999996, "loss": 5.9732, "step": 1410 }, { "epoch": 0.6356693620844565, "grad_norm": 3.671875, "learning_rate": 0.00021224999999999998, "loss": 6.0028, "step": 1415 }, { "epoch": 0.637915543575921, "grad_norm": 3.5, "learning_rate": 0.00021299999999999997, "loss": 6.0171, "step": 1420 }, { "epoch": 0.6401617250673854, "grad_norm": 3.421875, "learning_rate": 0.00021375, "loss": 5.9886, "step": 1425 }, { "epoch": 0.6424079065588499, "grad_norm": 3.875, "learning_rate": 0.00021449999999999998, "loss": 5.9436, "step": 1430 }, { "epoch": 0.6446540880503144, "grad_norm": 3.3125, "learning_rate": 0.00021525, "loss": 6.0565, "step": 1435 }, { "epoch": 0.6469002695417789, "grad_norm": 3.640625, "learning_rate": 0.00021599999999999996, "loss": 6.1117, "step": 1440 }, { "epoch": 0.6491464510332435, "grad_norm": 3.625, "learning_rate": 0.00021674999999999998, "loss": 5.9778, "step": 1445 }, { "epoch": 0.651392632524708, "grad_norm": 4.0625, "learning_rate": 0.00021749999999999997, "loss": 5.9706, "step": 1450 }, { "epoch": 0.6536388140161725, "grad_norm": 4.15625, "learning_rate": 0.00021825, "loss": 5.9358, "step": 1455 }, { "epoch": 0.655884995507637, "grad_norm": 3.5, "learning_rate": 0.00021899999999999998, "loss": 6.0584, "step": 1460 }, { "epoch": 0.6581311769991015, "grad_norm": 3.734375, "learning_rate": 0.00021975, "loss": 6.0055, "step": 1465 }, { "epoch": 0.660377358490566, "grad_norm": 3.78125, "learning_rate": 0.00022049999999999997, "loss": 5.9678, "step": 1470 }, { "epoch": 0.6626235399820305, "grad_norm": 3.703125, "learning_rate": 0.00022124999999999998, "loss": 5.9747, "step": 1475 }, { "epoch": 0.6648697214734951, "grad_norm": 3.46875, "learning_rate": 0.00022199999999999998, "loss": 5.9542, "step": 1480 }, { "epoch": 0.6671159029649596, "grad_norm": 3.34375, "learning_rate": 0.00022275, "loss": 5.9001, "step": 1485 }, { "epoch": 0.6693620844564241, "grad_norm": 3.65625, "learning_rate": 0.00022349999999999998, "loss": 5.9689, "step": 1490 }, { "epoch": 0.6716082659478886, "grad_norm": 3.953125, "learning_rate": 0.00022425, "loss": 5.9823, "step": 1495 }, { "epoch": 0.6738544474393531, "grad_norm": 3.53125, "learning_rate": 0.000225, "loss": 5.9758, "step": 1500 }, { "epoch": 0.6761006289308176, "grad_norm": 3.484375, "learning_rate": 0.00022574999999999996, "loss": 5.9994, "step": 1505 }, { "epoch": 0.6783468104222821, "grad_norm": 3.6875, "learning_rate": 0.00022649999999999998, "loss": 5.8979, "step": 1510 }, { "epoch": 0.6805929919137467, "grad_norm": 3.328125, "learning_rate": 0.00022724999999999997, "loss": 6.0046, "step": 1515 }, { "epoch": 0.6828391734052112, "grad_norm": 3.75, "learning_rate": 0.00022799999999999999, "loss": 5.9637, "step": 1520 }, { "epoch": 0.6850853548966757, "grad_norm": 3.296875, "learning_rate": 0.00022874999999999998, "loss": 5.939, "step": 1525 }, { "epoch": 0.6873315363881402, "grad_norm": 3.484375, "learning_rate": 0.0002295, "loss": 6.0089, "step": 1530 }, { "epoch": 0.6895777178796046, "grad_norm": 3.46875, "learning_rate": 0.00023024999999999996, "loss": 5.9247, "step": 1535 }, { "epoch": 0.6918238993710691, "grad_norm": 3.3125, "learning_rate": 0.00023099999999999998, "loss": 5.8969, "step": 1540 }, { "epoch": 0.6940700808625337, "grad_norm": 3.734375, "learning_rate": 0.00023174999999999997, "loss": 5.8485, "step": 1545 }, { "epoch": 0.6963162623539982, "grad_norm": 3.375, "learning_rate": 0.00023249999999999999, "loss": 5.9481, "step": 1550 }, { "epoch": 0.6985624438454627, "grad_norm": 3.5625, "learning_rate": 0.00023324999999999998, "loss": 5.9145, "step": 1555 }, { "epoch": 0.7008086253369272, "grad_norm": 3.5, "learning_rate": 0.000234, "loss": 5.8711, "step": 1560 }, { "epoch": 0.7030548068283917, "grad_norm": 3.703125, "learning_rate": 0.00023474999999999996, "loss": 5.9697, "step": 1565 }, { "epoch": 0.7053009883198562, "grad_norm": 3.75, "learning_rate": 0.00023549999999999998, "loss": 5.8905, "step": 1570 }, { "epoch": 0.7075471698113207, "grad_norm": 3.59375, "learning_rate": 0.00023624999999999997, "loss": 5.9357, "step": 1575 }, { "epoch": 0.7097933513027853, "grad_norm": 3.453125, "learning_rate": 0.000237, "loss": 5.8548, "step": 1580 }, { "epoch": 0.7120395327942498, "grad_norm": 3.484375, "learning_rate": 0.00023774999999999998, "loss": 5.9498, "step": 1585 }, { "epoch": 0.7142857142857143, "grad_norm": 3.78125, "learning_rate": 0.0002385, "loss": 5.8457, "step": 1590 }, { "epoch": 0.7165318957771788, "grad_norm": 3.5625, "learning_rate": 0.00023925, "loss": 5.8717, "step": 1595 }, { "epoch": 0.7187780772686433, "grad_norm": 3.328125, "learning_rate": 0.00023999999999999998, "loss": 5.8193, "step": 1600 }, { "epoch": 0.7210242587601078, "grad_norm": 3.296875, "learning_rate": 0.00024074999999999997, "loss": 5.8618, "step": 1605 }, { "epoch": 0.7232704402515723, "grad_norm": 3.625, "learning_rate": 0.0002415, "loss": 5.8882, "step": 1610 }, { "epoch": 0.7255166217430369, "grad_norm": 3.28125, "learning_rate": 0.00024224999999999998, "loss": 5.9087, "step": 1615 }, { "epoch": 0.7277628032345014, "grad_norm": 3.53125, "learning_rate": 0.000243, "loss": 5.8994, "step": 1620 }, { "epoch": 0.7300089847259659, "grad_norm": 3.34375, "learning_rate": 0.00024375, "loss": 5.9156, "step": 1625 }, { "epoch": 0.7322551662174304, "grad_norm": 3.78125, "learning_rate": 0.0002445, "loss": 5.889, "step": 1630 }, { "epoch": 0.7345013477088949, "grad_norm": 3.5, "learning_rate": 0.00024524999999999997, "loss": 5.8538, "step": 1635 }, { "epoch": 0.7367475292003594, "grad_norm": 3.53125, "learning_rate": 0.00024599999999999996, "loss": 5.914, "step": 1640 }, { "epoch": 0.7389937106918238, "grad_norm": 3.25, "learning_rate": 0.00024675, "loss": 5.8628, "step": 1645 }, { "epoch": 0.7412398921832885, "grad_norm": 3.5, "learning_rate": 0.00024749999999999994, "loss": 5.8555, "step": 1650 }, { "epoch": 0.743486073674753, "grad_norm": 3.4375, "learning_rate": 0.00024825, "loss": 5.8846, "step": 1655 }, { "epoch": 0.7457322551662174, "grad_norm": 3.703125, "learning_rate": 0.000249, "loss": 5.8957, "step": 1660 }, { "epoch": 0.7479784366576819, "grad_norm": 3.25, "learning_rate": 0.00024974999999999997, "loss": 5.8036, "step": 1665 }, { "epoch": 0.7502246181491464, "grad_norm": 3.375, "learning_rate": 0.00025049999999999996, "loss": 5.845, "step": 1670 }, { "epoch": 0.7524707996406109, "grad_norm": 3.1875, "learning_rate": 0.00025125, "loss": 5.8801, "step": 1675 }, { "epoch": 0.7547169811320755, "grad_norm": 3.53125, "learning_rate": 0.00025199999999999995, "loss": 5.8356, "step": 1680 }, { "epoch": 0.75696316262354, "grad_norm": 3.375, "learning_rate": 0.00025275, "loss": 5.851, "step": 1685 }, { "epoch": 0.7592093441150045, "grad_norm": 3.546875, "learning_rate": 0.0002535, "loss": 5.8647, "step": 1690 }, { "epoch": 0.761455525606469, "grad_norm": 3.4375, "learning_rate": 0.00025425, "loss": 5.8168, "step": 1695 }, { "epoch": 0.7637017070979335, "grad_norm": 3.609375, "learning_rate": 0.00025499999999999996, "loss": 5.8514, "step": 1700 }, { "epoch": 0.765947888589398, "grad_norm": 3.3125, "learning_rate": 0.00025575, "loss": 5.7495, "step": 1705 }, { "epoch": 0.7681940700808625, "grad_norm": 3.515625, "learning_rate": 0.00025649999999999995, "loss": 5.8702, "step": 1710 }, { "epoch": 0.7704402515723271, "grad_norm": 3.640625, "learning_rate": 0.00025725, "loss": 5.9178, "step": 1715 }, { "epoch": 0.7726864330637916, "grad_norm": 3.1875, "learning_rate": 0.000258, "loss": 5.82, "step": 1720 }, { "epoch": 0.7749326145552561, "grad_norm": 3.765625, "learning_rate": 0.00025875, "loss": 5.823, "step": 1725 }, { "epoch": 0.7771787960467206, "grad_norm": 3.4375, "learning_rate": 0.00025949999999999997, "loss": 5.8712, "step": 1730 }, { "epoch": 0.7794249775381851, "grad_norm": 3.140625, "learning_rate": 0.00026025, "loss": 5.8173, "step": 1735 }, { "epoch": 0.7816711590296496, "grad_norm": 3.28125, "learning_rate": 0.000261, "loss": 5.8169, "step": 1740 }, { "epoch": 0.7839173405211141, "grad_norm": 3.4375, "learning_rate": 0.00026175, "loss": 5.8047, "step": 1745 }, { "epoch": 0.7861635220125787, "grad_norm": 3.21875, "learning_rate": 0.0002625, "loss": 5.8384, "step": 1750 }, { "epoch": 0.7884097035040432, "grad_norm": 3.40625, "learning_rate": 0.00026325, "loss": 5.7996, "step": 1755 }, { "epoch": 0.7906558849955077, "grad_norm": 3.4375, "learning_rate": 0.00026399999999999997, "loss": 5.7611, "step": 1760 }, { "epoch": 0.7929020664869721, "grad_norm": 3.390625, "learning_rate": 0.00026474999999999996, "loss": 5.7925, "step": 1765 }, { "epoch": 0.7951482479784366, "grad_norm": 3.375, "learning_rate": 0.0002655, "loss": 5.8187, "step": 1770 }, { "epoch": 0.7973944294699011, "grad_norm": 3.53125, "learning_rate": 0.00026624999999999994, "loss": 5.7791, "step": 1775 }, { "epoch": 0.7996406109613656, "grad_norm": 3.8125, "learning_rate": 0.000267, "loss": 5.8063, "step": 1780 }, { "epoch": 0.8018867924528302, "grad_norm": 3.25, "learning_rate": 0.00026775, "loss": 5.8167, "step": 1785 }, { "epoch": 0.8041329739442947, "grad_norm": 3.46875, "learning_rate": 0.00026849999999999997, "loss": 5.7916, "step": 1790 }, { "epoch": 0.8063791554357592, "grad_norm": 3.28125, "learning_rate": 0.00026924999999999996, "loss": 5.8446, "step": 1795 }, { "epoch": 0.8086253369272237, "grad_norm": 3.65625, "learning_rate": 0.00027, "loss": 5.8757, "step": 1800 }, { "epoch": 0.8108715184186882, "grad_norm": 3.734375, "learning_rate": 0.00027074999999999994, "loss": 5.7271, "step": 1805 }, { "epoch": 0.8131176999101527, "grad_norm": 3.765625, "learning_rate": 0.0002715, "loss": 5.8397, "step": 1810 }, { "epoch": 0.8153638814016172, "grad_norm": 3.34375, "learning_rate": 0.00027225, "loss": 5.7838, "step": 1815 }, { "epoch": 0.8176100628930818, "grad_norm": 3.59375, "learning_rate": 0.00027299999999999997, "loss": 5.7907, "step": 1820 }, { "epoch": 0.8198562443845463, "grad_norm": 3.921875, "learning_rate": 0.00027374999999999996, "loss": 5.8579, "step": 1825 }, { "epoch": 0.8221024258760108, "grad_norm": 3.46875, "learning_rate": 0.0002745, "loss": 5.8342, "step": 1830 }, { "epoch": 0.8243486073674753, "grad_norm": 3.75, "learning_rate": 0.00027525, "loss": 5.7949, "step": 1835 }, { "epoch": 0.8265947888589398, "grad_norm": 3.4375, "learning_rate": 0.000276, "loss": 5.7715, "step": 1840 }, { "epoch": 0.8288409703504043, "grad_norm": 3.703125, "learning_rate": 0.00027675, "loss": 5.7804, "step": 1845 }, { "epoch": 0.8310871518418689, "grad_norm": 3.4375, "learning_rate": 0.00027749999999999997, "loss": 5.7288, "step": 1850 }, { "epoch": 0.8333333333333334, "grad_norm": 3.109375, "learning_rate": 0.00027824999999999996, "loss": 5.7319, "step": 1855 }, { "epoch": 0.8355795148247979, "grad_norm": 3.21875, "learning_rate": 0.000279, "loss": 5.7636, "step": 1860 }, { "epoch": 0.8378256963162624, "grad_norm": 3.234375, "learning_rate": 0.00027975, "loss": 5.7395, "step": 1865 }, { "epoch": 0.8400718778077269, "grad_norm": 3.6875, "learning_rate": 0.0002805, "loss": 5.7519, "step": 1870 }, { "epoch": 0.8423180592991913, "grad_norm": 3.265625, "learning_rate": 0.00028125, "loss": 5.706, "step": 1875 }, { "epoch": 0.8445642407906558, "grad_norm": 3.390625, "learning_rate": 0.00028199999999999997, "loss": 5.799, "step": 1880 }, { "epoch": 0.8468104222821204, "grad_norm": 3.265625, "learning_rate": 0.00028274999999999996, "loss": 5.7856, "step": 1885 }, { "epoch": 0.8490566037735849, "grad_norm": 3.421875, "learning_rate": 0.00028349999999999995, "loss": 5.8625, "step": 1890 }, { "epoch": 0.8513027852650494, "grad_norm": 3.203125, "learning_rate": 0.00028425, "loss": 5.7212, "step": 1895 }, { "epoch": 0.8535489667565139, "grad_norm": 3.296875, "learning_rate": 0.000285, "loss": 5.7326, "step": 1900 }, { "epoch": 0.8557951482479784, "grad_norm": 3.5, "learning_rate": 0.00028575, "loss": 5.7664, "step": 1905 }, { "epoch": 0.8580413297394429, "grad_norm": 3.34375, "learning_rate": 0.00028649999999999997, "loss": 5.7231, "step": 1910 }, { "epoch": 0.8602875112309074, "grad_norm": 3.40625, "learning_rate": 0.00028724999999999996, "loss": 5.7759, "step": 1915 }, { "epoch": 0.862533692722372, "grad_norm": 3.125, "learning_rate": 0.00028799999999999995, "loss": 5.7442, "step": 1920 }, { "epoch": 0.8647798742138365, "grad_norm": 3.15625, "learning_rate": 0.00028875, "loss": 5.7252, "step": 1925 }, { "epoch": 0.867026055705301, "grad_norm": 3.265625, "learning_rate": 0.0002895, "loss": 5.7196, "step": 1930 }, { "epoch": 0.8692722371967655, "grad_norm": 3.328125, "learning_rate": 0.00029025, "loss": 5.7376, "step": 1935 }, { "epoch": 0.87151841868823, "grad_norm": 3.1875, "learning_rate": 0.00029099999999999997, "loss": 5.8077, "step": 1940 }, { "epoch": 0.8737646001796945, "grad_norm": 3.625, "learning_rate": 0.00029174999999999996, "loss": 5.7826, "step": 1945 }, { "epoch": 0.876010781671159, "grad_norm": 3.609375, "learning_rate": 0.00029249999999999995, "loss": 5.736, "step": 1950 }, { "epoch": 0.8782569631626236, "grad_norm": 3.421875, "learning_rate": 0.00029325, "loss": 5.7531, "step": 1955 }, { "epoch": 0.8805031446540881, "grad_norm": 3.4375, "learning_rate": 0.000294, "loss": 5.7246, "step": 1960 }, { "epoch": 0.8827493261455526, "grad_norm": 3.375, "learning_rate": 0.00029475, "loss": 5.7786, "step": 1965 }, { "epoch": 0.8849955076370171, "grad_norm": 3.296875, "learning_rate": 0.00029549999999999997, "loss": 5.7237, "step": 1970 }, { "epoch": 0.8872416891284816, "grad_norm": 2.96875, "learning_rate": 0.00029624999999999996, "loss": 5.8053, "step": 1975 }, { "epoch": 0.889487870619946, "grad_norm": 3.328125, "learning_rate": 0.00029699999999999996, "loss": 5.6918, "step": 1980 }, { "epoch": 0.8917340521114105, "grad_norm": 3.015625, "learning_rate": 0.00029775, "loss": 5.8251, "step": 1985 }, { "epoch": 0.8939802336028752, "grad_norm": 3.78125, "learning_rate": 0.0002985, "loss": 5.7529, "step": 1990 }, { "epoch": 0.8962264150943396, "grad_norm": 3.640625, "learning_rate": 0.00029925, "loss": 5.7181, "step": 1995 }, { "epoch": 0.8984725965858041, "grad_norm": 3.234375, "learning_rate": 0.0003, "loss": 5.7413, "step": 2000 }, { "epoch": 0.8984725965858041, "eval_loss": 5.639461517333984, "eval_runtime": 16.0491, "eval_samples_per_second": 1932.383, "eval_steps_per_second": 241.571, "step": 2000 }, { "epoch": 0.9007187780772686, "grad_norm": 3.140625, "learning_rate": 0.00029999995942443054, "loss": 5.6436, "step": 2005 }, { "epoch": 0.9029649595687331, "grad_norm": 3.328125, "learning_rate": 0.00029999983769774674, "loss": 5.7627, "step": 2010 }, { "epoch": 0.9052111410601976, "grad_norm": 3.171875, "learning_rate": 0.0002999996348200217, "loss": 5.7181, "step": 2015 }, { "epoch": 0.9074573225516622, "grad_norm": 3.34375, "learning_rate": 0.0002999993507913773, "loss": 5.7097, "step": 2020 }, { "epoch": 0.9097035040431267, "grad_norm": 3.1875, "learning_rate": 0.0002999989856119844, "loss": 5.6407, "step": 2025 }, { "epoch": 0.9119496855345912, "grad_norm": 3.453125, "learning_rate": 0.0002999985392820624, "loss": 5.6532, "step": 2030 }, { "epoch": 0.9141958670260557, "grad_norm": 3.140625, "learning_rate": 0.0002999980118018797, "loss": 5.6993, "step": 2035 }, { "epoch": 0.9164420485175202, "grad_norm": 3.546875, "learning_rate": 0.0002999974031717533, "loss": 5.6507, "step": 2040 }, { "epoch": 0.9186882300089847, "grad_norm": 3.546875, "learning_rate": 0.0002999967133920491, "loss": 5.6629, "step": 2045 }, { "epoch": 0.9209344115004492, "grad_norm": 3.203125, "learning_rate": 0.0002999959424631818, "loss": 5.7172, "step": 2050 }, { "epoch": 0.9231805929919138, "grad_norm": 3.140625, "learning_rate": 0.0002999950903856147, "loss": 5.5766, "step": 2055 }, { "epoch": 0.9254267744833783, "grad_norm": 3.234375, "learning_rate": 0.00029999415715986, "loss": 5.6546, "step": 2060 }, { "epoch": 0.9276729559748428, "grad_norm": 3.34375, "learning_rate": 0.0002999931427864788, "loss": 5.6317, "step": 2065 }, { "epoch": 0.9299191374663073, "grad_norm": 3.1875, "learning_rate": 0.00029999204726608076, "loss": 5.6605, "step": 2070 }, { "epoch": 0.9321653189577718, "grad_norm": 3.40625, "learning_rate": 0.0002999908705993245, "loss": 5.6958, "step": 2075 }, { "epoch": 0.9344115004492363, "grad_norm": 3.046875, "learning_rate": 0.00029998961278691725, "loss": 5.6498, "step": 2080 }, { "epoch": 0.9366576819407008, "grad_norm": 3.203125, "learning_rate": 0.0002999882738296152, "loss": 5.6887, "step": 2085 }, { "epoch": 0.9389038634321654, "grad_norm": 3.453125, "learning_rate": 0.0002999868537282231, "loss": 5.617, "step": 2090 }, { "epoch": 0.9411500449236299, "grad_norm": 3.25, "learning_rate": 0.0002999853524835947, "loss": 5.7708, "step": 2095 }, { "epoch": 0.9433962264150944, "grad_norm": 3.421875, "learning_rate": 0.0002999837700966324, "loss": 5.6733, "step": 2100 }, { "epoch": 0.9456424079065588, "grad_norm": 3.359375, "learning_rate": 0.00029998210656828736, "loss": 5.7, "step": 2105 }, { "epoch": 0.9478885893980233, "grad_norm": 3.296875, "learning_rate": 0.0002999803618995596, "loss": 5.6652, "step": 2110 }, { "epoch": 0.9501347708894878, "grad_norm": 3.71875, "learning_rate": 0.00029997853609149797, "loss": 5.7413, "step": 2115 }, { "epoch": 0.9523809523809523, "grad_norm": 3.3125, "learning_rate": 0.00029997662914519983, "loss": 5.7038, "step": 2120 }, { "epoch": 0.9546271338724169, "grad_norm": 3.546875, "learning_rate": 0.0002999746410618116, "loss": 5.6402, "step": 2125 }, { "epoch": 0.9568733153638814, "grad_norm": 3.09375, "learning_rate": 0.00029997257184252827, "loss": 5.5762, "step": 2130 }, { "epoch": 0.9591194968553459, "grad_norm": 3.421875, "learning_rate": 0.00029997042148859374, "loss": 5.7327, "step": 2135 }, { "epoch": 0.9613656783468104, "grad_norm": 3.296875, "learning_rate": 0.0002999681900013006, "loss": 5.6974, "step": 2140 }, { "epoch": 0.9636118598382749, "grad_norm": 3.140625, "learning_rate": 0.0002999658773819903, "loss": 5.7185, "step": 2145 }, { "epoch": 0.9658580413297394, "grad_norm": 3.34375, "learning_rate": 0.00029996348363205296, "loss": 5.7269, "step": 2150 }, { "epoch": 0.968104222821204, "grad_norm": 3.0, "learning_rate": 0.0002999610087529275, "loss": 5.6719, "step": 2155 }, { "epoch": 0.9703504043126685, "grad_norm": 3.375, "learning_rate": 0.00029995845274610164, "loss": 5.6067, "step": 2160 }, { "epoch": 0.972596585804133, "grad_norm": 3.25, "learning_rate": 0.00029995581561311185, "loss": 5.612, "step": 2165 }, { "epoch": 0.9748427672955975, "grad_norm": 3.390625, "learning_rate": 0.00029995309735554327, "loss": 5.6163, "step": 2170 }, { "epoch": 0.977088948787062, "grad_norm": 3.265625, "learning_rate": 0.00029995029797503007, "loss": 5.6468, "step": 2175 }, { "epoch": 0.9793351302785265, "grad_norm": 3.03125, "learning_rate": 0.00029994741747325487, "loss": 5.6653, "step": 2180 }, { "epoch": 0.981581311769991, "grad_norm": 3.1875, "learning_rate": 0.00029994445585194925, "loss": 5.6416, "step": 2185 }, { "epoch": 0.9838274932614556, "grad_norm": 3.09375, "learning_rate": 0.00029994141311289347, "loss": 5.5982, "step": 2190 }, { "epoch": 0.9860736747529201, "grad_norm": 3.328125, "learning_rate": 0.00029993828925791664, "loss": 5.6288, "step": 2195 }, { "epoch": 0.9883198562443846, "grad_norm": 3.203125, "learning_rate": 0.0002999350842888965, "loss": 5.6725, "step": 2200 }, { "epoch": 0.9905660377358491, "grad_norm": 3.40625, "learning_rate": 0.0002999317982077596, "loss": 5.6444, "step": 2205 }, { "epoch": 0.9928122192273136, "grad_norm": 2.921875, "learning_rate": 0.00029992843101648144, "loss": 5.6642, "step": 2210 }, { "epoch": 0.995058400718778, "grad_norm": 3.015625, "learning_rate": 0.00029992498271708595, "loss": 5.6011, "step": 2215 }, { "epoch": 0.9973045822102425, "grad_norm": 2.90625, "learning_rate": 0.00029992145331164596, "loss": 5.6432, "step": 2220 }, { "epoch": 0.9995507637017071, "grad_norm": 3.140625, "learning_rate": 0.0002999178428022831, "loss": 5.6428, "step": 2225 }, { "epoch": 1.0017969451931716, "grad_norm": 3.265625, "learning_rate": 0.0002999141511911678, "loss": 5.5542, "step": 2230 }, { "epoch": 1.0040431266846361, "grad_norm": 3.296875, "learning_rate": 0.000299910378480519, "loss": 5.6403, "step": 2235 }, { "epoch": 1.0062893081761006, "grad_norm": 3.21875, "learning_rate": 0.0002999065246726047, "loss": 5.5451, "step": 2240 }, { "epoch": 1.0085354896675651, "grad_norm": 3.0625, "learning_rate": 0.0002999025897697414, "loss": 5.6575, "step": 2245 }, { "epoch": 1.0107816711590296, "grad_norm": 3.140625, "learning_rate": 0.0002998985737742945, "loss": 5.5892, "step": 2250 }, { "epoch": 1.013027852650494, "grad_norm": 3.203125, "learning_rate": 0.0002998944766886781, "loss": 5.6127, "step": 2255 }, { "epoch": 1.0152740341419586, "grad_norm": 3.078125, "learning_rate": 0.000299890298515355, "loss": 5.5885, "step": 2260 }, { "epoch": 1.017520215633423, "grad_norm": 3.265625, "learning_rate": 0.0002998860392568368, "loss": 5.5215, "step": 2265 }, { "epoch": 1.0197663971248876, "grad_norm": 3.171875, "learning_rate": 0.00029988169891568373, "loss": 5.6074, "step": 2270 }, { "epoch": 1.0220125786163523, "grad_norm": 3.171875, "learning_rate": 0.00029987727749450506, "loss": 5.6192, "step": 2275 }, { "epoch": 1.0242587601078168, "grad_norm": 3.328125, "learning_rate": 0.00029987277499595843, "loss": 5.5663, "step": 2280 }, { "epoch": 1.0265049415992813, "grad_norm": 3.265625, "learning_rate": 0.0002998681914227504, "loss": 5.5862, "step": 2285 }, { "epoch": 1.0287511230907458, "grad_norm": 3.0, "learning_rate": 0.0002998635267776363, "loss": 5.5536, "step": 2290 }, { "epoch": 1.0309973045822103, "grad_norm": 3.3125, "learning_rate": 0.0002998587810634201, "loss": 5.5818, "step": 2295 }, { "epoch": 1.0332434860736748, "grad_norm": 3.234375, "learning_rate": 0.0002998539542829546, "loss": 5.6147, "step": 2300 }, { "epoch": 1.0354896675651393, "grad_norm": 3.09375, "learning_rate": 0.00029984904643914114, "loss": 5.6629, "step": 2305 }, { "epoch": 1.0377358490566038, "grad_norm": 3.15625, "learning_rate": 0.00029984405753493006, "loss": 5.5412, "step": 2310 }, { "epoch": 1.0399820305480683, "grad_norm": 2.984375, "learning_rate": 0.00029983898757332024, "loss": 5.5598, "step": 2315 }, { "epoch": 1.0422282120395328, "grad_norm": 2.96875, "learning_rate": 0.0002998338365573593, "loss": 5.6111, "step": 2320 }, { "epoch": 1.0444743935309972, "grad_norm": 3.234375, "learning_rate": 0.0002998286044901436, "loss": 5.4899, "step": 2325 }, { "epoch": 1.0467205750224617, "grad_norm": 3.453125, "learning_rate": 0.0002998232913748184, "loss": 5.5567, "step": 2330 }, { "epoch": 1.0489667565139262, "grad_norm": 3.40625, "learning_rate": 0.0002998178972145773, "loss": 5.4968, "step": 2335 }, { "epoch": 1.0512129380053907, "grad_norm": 3.03125, "learning_rate": 0.000299812422012663, "loss": 5.6119, "step": 2340 }, { "epoch": 1.0534591194968554, "grad_norm": 3.15625, "learning_rate": 0.0002998068657723666, "loss": 5.5563, "step": 2345 }, { "epoch": 1.05570530098832, "grad_norm": 3.203125, "learning_rate": 0.0002998012284970282, "loss": 5.5985, "step": 2350 }, { "epoch": 1.0579514824797844, "grad_norm": 3.46875, "learning_rate": 0.00029979551019003643, "loss": 5.5002, "step": 2355 }, { "epoch": 1.060197663971249, "grad_norm": 3.046875, "learning_rate": 0.0002997897108548286, "loss": 5.6114, "step": 2360 }, { "epoch": 1.0624438454627134, "grad_norm": 3.140625, "learning_rate": 0.00029978383049489093, "loss": 5.5056, "step": 2365 }, { "epoch": 1.064690026954178, "grad_norm": 3.109375, "learning_rate": 0.0002997778691137582, "loss": 5.515, "step": 2370 }, { "epoch": 1.0669362084456424, "grad_norm": 3.15625, "learning_rate": 0.00029977182671501383, "loss": 5.5303, "step": 2375 }, { "epoch": 1.069182389937107, "grad_norm": 3.140625, "learning_rate": 0.00029976570330229006, "loss": 5.5147, "step": 2380 }, { "epoch": 1.0714285714285714, "grad_norm": 3.109375, "learning_rate": 0.00029975949887926784, "loss": 5.5098, "step": 2385 }, { "epoch": 1.073674752920036, "grad_norm": 3.046875, "learning_rate": 0.00029975321344967676, "loss": 5.5533, "step": 2390 }, { "epoch": 1.0759209344115004, "grad_norm": 3.28125, "learning_rate": 0.000299746847017295, "loss": 5.5429, "step": 2395 }, { "epoch": 1.0781671159029649, "grad_norm": 3.265625, "learning_rate": 0.00029974039958594967, "loss": 5.508, "step": 2400 }, { "epoch": 1.0804132973944294, "grad_norm": 3.1875, "learning_rate": 0.0002997338711595165, "loss": 5.5494, "step": 2405 }, { "epoch": 1.082659478885894, "grad_norm": 3.203125, "learning_rate": 0.00029972726174191965, "loss": 5.4273, "step": 2410 }, { "epoch": 1.0849056603773586, "grad_norm": 3.0625, "learning_rate": 0.00029972057133713235, "loss": 5.5474, "step": 2415 }, { "epoch": 1.087151841868823, "grad_norm": 2.84375, "learning_rate": 0.00029971379994917624, "loss": 5.5008, "step": 2420 }, { "epoch": 1.0893980233602876, "grad_norm": 3.359375, "learning_rate": 0.00029970694758212177, "loss": 5.4682, "step": 2425 }, { "epoch": 1.091644204851752, "grad_norm": 3.0, "learning_rate": 0.000299700014240088, "loss": 5.4666, "step": 2430 }, { "epoch": 1.0938903863432166, "grad_norm": 3.3125, "learning_rate": 0.00029969299992724273, "loss": 5.5844, "step": 2435 }, { "epoch": 1.096136567834681, "grad_norm": 3.3125, "learning_rate": 0.00029968590464780247, "loss": 5.5141, "step": 2440 }, { "epoch": 1.0983827493261455, "grad_norm": 3.046875, "learning_rate": 0.0002996787284060322, "loss": 5.4897, "step": 2445 }, { "epoch": 1.10062893081761, "grad_norm": 3.125, "learning_rate": 0.00029967147120624573, "loss": 5.4318, "step": 2450 }, { "epoch": 1.1028751123090745, "grad_norm": 3.4375, "learning_rate": 0.00029966413305280553, "loss": 5.506, "step": 2455 }, { "epoch": 1.105121293800539, "grad_norm": 3.390625, "learning_rate": 0.00029965671395012274, "loss": 5.4363, "step": 2460 }, { "epoch": 1.1073674752920035, "grad_norm": 3.265625, "learning_rate": 0.0002996492139026571, "loss": 5.4077, "step": 2465 }, { "epoch": 1.109613656783468, "grad_norm": 3.265625, "learning_rate": 0.000299641632914917, "loss": 5.4435, "step": 2470 }, { "epoch": 1.1118598382749325, "grad_norm": 3.078125, "learning_rate": 0.0002996339709914596, "loss": 5.4641, "step": 2475 }, { "epoch": 1.1141060197663972, "grad_norm": 3.015625, "learning_rate": 0.0002996262281368905, "loss": 5.5053, "step": 2480 }, { "epoch": 1.1163522012578617, "grad_norm": 3.34375, "learning_rate": 0.0002996184043558642, "loss": 5.3987, "step": 2485 }, { "epoch": 1.1185983827493262, "grad_norm": 3.03125, "learning_rate": 0.0002996104996530837, "loss": 5.6063, "step": 2490 }, { "epoch": 1.1208445642407907, "grad_norm": 3.328125, "learning_rate": 0.0002996025140333006, "loss": 5.4782, "step": 2495 }, { "epoch": 1.1230907457322552, "grad_norm": 3.25, "learning_rate": 0.00029959444750131533, "loss": 5.4836, "step": 2500 }, { "epoch": 1.1253369272237197, "grad_norm": 3.140625, "learning_rate": 0.0002995863000619768, "loss": 5.5181, "step": 2505 }, { "epoch": 1.1275831087151842, "grad_norm": 3.1875, "learning_rate": 0.0002995780717201825, "loss": 5.4469, "step": 2510 }, { "epoch": 1.1298292902066487, "grad_norm": 3.03125, "learning_rate": 0.0002995697624808788, "loss": 5.4445, "step": 2515 }, { "epoch": 1.1320754716981132, "grad_norm": 3.125, "learning_rate": 0.00029956137234906044, "loss": 5.4844, "step": 2520 }, { "epoch": 1.1343216531895777, "grad_norm": 2.953125, "learning_rate": 0.00029955290132977093, "loss": 5.5633, "step": 2525 }, { "epoch": 1.1365678346810422, "grad_norm": 3.109375, "learning_rate": 0.0002995443494281024, "loss": 5.4724, "step": 2530 }, { "epoch": 1.1388140161725067, "grad_norm": 3.34375, "learning_rate": 0.00029953571664919547, "loss": 5.4786, "step": 2535 }, { "epoch": 1.1410601976639712, "grad_norm": 3.328125, "learning_rate": 0.0002995270029982396, "loss": 5.5004, "step": 2540 }, { "epoch": 1.1433063791554359, "grad_norm": 3.0625, "learning_rate": 0.00029951820848047255, "loss": 5.4758, "step": 2545 }, { "epoch": 1.1455525606469004, "grad_norm": 3.0, "learning_rate": 0.0002995093331011811, "loss": 5.4789, "step": 2550 }, { "epoch": 1.1477987421383649, "grad_norm": 3.03125, "learning_rate": 0.00029950037686570023, "loss": 5.3991, "step": 2555 }, { "epoch": 1.1500449236298294, "grad_norm": 3.3125, "learning_rate": 0.0002994913397794138, "loss": 5.5046, "step": 2560 }, { "epoch": 1.1522911051212938, "grad_norm": 3.46875, "learning_rate": 0.00029948222184775415, "loss": 5.5293, "step": 2565 }, { "epoch": 1.1545372866127583, "grad_norm": 3.125, "learning_rate": 0.00029947302307620227, "loss": 5.4079, "step": 2570 }, { "epoch": 1.1567834681042228, "grad_norm": 3.203125, "learning_rate": 0.0002994637434702877, "loss": 5.425, "step": 2575 }, { "epoch": 1.1590296495956873, "grad_norm": 3.296875, "learning_rate": 0.0002994543830355886, "loss": 5.4591, "step": 2580 }, { "epoch": 1.1612758310871518, "grad_norm": 3.296875, "learning_rate": 0.0002994449417777317, "loss": 5.5263, "step": 2585 }, { "epoch": 1.1635220125786163, "grad_norm": 3.140625, "learning_rate": 0.00029943541970239233, "loss": 5.4458, "step": 2590 }, { "epoch": 1.1657681940700808, "grad_norm": 3.1875, "learning_rate": 0.00029942581681529447, "loss": 5.4449, "step": 2595 }, { "epoch": 1.1680143755615453, "grad_norm": 3.34375, "learning_rate": 0.00029941613312221046, "loss": 5.5558, "step": 2600 }, { "epoch": 1.1702605570530098, "grad_norm": 3.0, "learning_rate": 0.00029940636862896145, "loss": 5.5165, "step": 2605 }, { "epoch": 1.1725067385444743, "grad_norm": 3.3125, "learning_rate": 0.0002993965233414171, "loss": 5.4624, "step": 2610 }, { "epoch": 1.1747529200359388, "grad_norm": 3.203125, "learning_rate": 0.0002993865972654955, "loss": 5.4336, "step": 2615 }, { "epoch": 1.1769991015274035, "grad_norm": 3.5, "learning_rate": 0.0002993765904071635, "loss": 5.5293, "step": 2620 }, { "epoch": 1.179245283018868, "grad_norm": 3.15625, "learning_rate": 0.00029936650277243633, "loss": 5.5603, "step": 2625 }, { "epoch": 1.1814914645103325, "grad_norm": 3.140625, "learning_rate": 0.0002993563343673779, "loss": 5.4785, "step": 2630 }, { "epoch": 1.183737646001797, "grad_norm": 3.09375, "learning_rate": 0.0002993460851981007, "loss": 5.4188, "step": 2635 }, { "epoch": 1.1859838274932615, "grad_norm": 3.078125, "learning_rate": 0.00029933575527076565, "loss": 5.5139, "step": 2640 }, { "epoch": 1.188230008984726, "grad_norm": 3.015625, "learning_rate": 0.0002993253445915823, "loss": 5.3998, "step": 2645 }, { "epoch": 1.1904761904761905, "grad_norm": 3.328125, "learning_rate": 0.0002993148531668087, "loss": 5.5066, "step": 2650 }, { "epoch": 1.192722371967655, "grad_norm": 3.125, "learning_rate": 0.0002993042810027514, "loss": 5.416, "step": 2655 }, { "epoch": 1.1949685534591195, "grad_norm": 3.171875, "learning_rate": 0.0002992936281057656, "loss": 5.4367, "step": 2660 }, { "epoch": 1.197214734950584, "grad_norm": 3.125, "learning_rate": 0.000299282894482255, "loss": 5.3912, "step": 2665 }, { "epoch": 1.1994609164420484, "grad_norm": 2.9375, "learning_rate": 0.00029927208013867164, "loss": 5.4456, "step": 2670 }, { "epoch": 1.201707097933513, "grad_norm": 3.296875, "learning_rate": 0.0002992611850815163, "loss": 5.5036, "step": 2675 }, { "epoch": 1.2039532794249777, "grad_norm": 3.234375, "learning_rate": 0.0002992502093173383, "loss": 5.4467, "step": 2680 }, { "epoch": 1.2061994609164421, "grad_norm": 3.375, "learning_rate": 0.0002992391528527353, "loss": 5.3611, "step": 2685 }, { "epoch": 1.2084456424079066, "grad_norm": 3.359375, "learning_rate": 0.00029922801569435366, "loss": 5.4635, "step": 2690 }, { "epoch": 1.2106918238993711, "grad_norm": 3.671875, "learning_rate": 0.00029921679784888797, "loss": 5.4823, "step": 2695 }, { "epoch": 1.2129380053908356, "grad_norm": 2.875, "learning_rate": 0.0002992054993230816, "loss": 5.378, "step": 2700 }, { "epoch": 1.2151841868823001, "grad_norm": 2.765625, "learning_rate": 0.0002991941201237263, "loss": 5.4737, "step": 2705 }, { "epoch": 1.2174303683737646, "grad_norm": 3.0625, "learning_rate": 0.0002991826602576624, "loss": 5.4399, "step": 2710 }, { "epoch": 1.219676549865229, "grad_norm": 3.046875, "learning_rate": 0.00029917111973177857, "loss": 5.4663, "step": 2715 }, { "epoch": 1.2219227313566936, "grad_norm": 3.484375, "learning_rate": 0.00029915949855301204, "loss": 5.3946, "step": 2720 }, { "epoch": 1.224168912848158, "grad_norm": 2.953125, "learning_rate": 0.0002991477967283485, "loss": 5.4415, "step": 2725 }, { "epoch": 1.2264150943396226, "grad_norm": 3.125, "learning_rate": 0.00029913601426482226, "loss": 5.3648, "step": 2730 }, { "epoch": 1.228661275831087, "grad_norm": 2.953125, "learning_rate": 0.00029912415116951593, "loss": 5.4543, "step": 2735 }, { "epoch": 1.2309074573225516, "grad_norm": 2.921875, "learning_rate": 0.0002991122074495606, "loss": 5.381, "step": 2740 }, { "epoch": 1.233153638814016, "grad_norm": 3.015625, "learning_rate": 0.0002991001831121359, "loss": 5.4367, "step": 2745 }, { "epoch": 1.2353998203054806, "grad_norm": 3.796875, "learning_rate": 0.00029908807816446994, "loss": 5.5144, "step": 2750 }, { "epoch": 1.2376460017969453, "grad_norm": 3.140625, "learning_rate": 0.0002990758926138392, "loss": 5.4193, "step": 2755 }, { "epoch": 1.2398921832884098, "grad_norm": 3.078125, "learning_rate": 0.0002990636264675687, "loss": 5.4758, "step": 2760 }, { "epoch": 1.2421383647798743, "grad_norm": 3.265625, "learning_rate": 0.00029905127973303176, "loss": 5.4093, "step": 2765 }, { "epoch": 1.2443845462713388, "grad_norm": 3.015625, "learning_rate": 0.00029903885241765036, "loss": 5.4189, "step": 2770 }, { "epoch": 1.2466307277628033, "grad_norm": 2.90625, "learning_rate": 0.0002990263445288947, "loss": 5.4447, "step": 2775 }, { "epoch": 1.2488769092542678, "grad_norm": 3.03125, "learning_rate": 0.0002990137560742836, "loss": 5.3926, "step": 2780 }, { "epoch": 1.2511230907457322, "grad_norm": 3.203125, "learning_rate": 0.00029900108706138416, "loss": 5.3857, "step": 2785 }, { "epoch": 1.2533692722371967, "grad_norm": 2.890625, "learning_rate": 0.000298988337497812, "loss": 5.4141, "step": 2790 }, { "epoch": 1.2556154537286612, "grad_norm": 3.0625, "learning_rate": 0.0002989755073912311, "loss": 5.422, "step": 2795 }, { "epoch": 1.2578616352201257, "grad_norm": 3.1875, "learning_rate": 0.0002989625967493541, "loss": 5.3838, "step": 2800 }, { "epoch": 1.2601078167115902, "grad_norm": 3.046875, "learning_rate": 0.00029894960557994146, "loss": 5.5335, "step": 2805 }, { "epoch": 1.262353998203055, "grad_norm": 2.9375, "learning_rate": 0.00029893653389080274, "loss": 5.3528, "step": 2810 }, { "epoch": 1.2646001796945194, "grad_norm": 3.15625, "learning_rate": 0.0002989233816897954, "loss": 5.3309, "step": 2815 }, { "epoch": 1.266846361185984, "grad_norm": 3.09375, "learning_rate": 0.0002989101489848256, "loss": 5.4407, "step": 2820 }, { "epoch": 1.2690925426774484, "grad_norm": 3.421875, "learning_rate": 0.0002988968357838477, "loss": 5.3808, "step": 2825 }, { "epoch": 1.271338724168913, "grad_norm": 2.9375, "learning_rate": 0.0002988834420948647, "loss": 5.4058, "step": 2830 }, { "epoch": 1.2735849056603774, "grad_norm": 2.953125, "learning_rate": 0.0002988699679259275, "loss": 5.4674, "step": 2835 }, { "epoch": 1.275831087151842, "grad_norm": 3.0, "learning_rate": 0.00029885641328513594, "loss": 5.4242, "step": 2840 }, { "epoch": 1.2780772686433064, "grad_norm": 3.109375, "learning_rate": 0.0002988427781806379, "loss": 5.4332, "step": 2845 }, { "epoch": 1.280323450134771, "grad_norm": 2.953125, "learning_rate": 0.0002988290626206297, "loss": 5.3583, "step": 2850 }, { "epoch": 1.2825696316262354, "grad_norm": 3.328125, "learning_rate": 0.000298815266613356, "loss": 5.3448, "step": 2855 }, { "epoch": 1.2848158131176999, "grad_norm": 3.03125, "learning_rate": 0.0002988013901671099, "loss": 5.4957, "step": 2860 }, { "epoch": 1.2870619946091644, "grad_norm": 3.078125, "learning_rate": 0.0002987874332902328, "loss": 5.4692, "step": 2865 }, { "epoch": 1.2893081761006289, "grad_norm": 3.09375, "learning_rate": 0.0002987733959911144, "loss": 5.3743, "step": 2870 }, { "epoch": 1.2915543575920934, "grad_norm": 2.890625, "learning_rate": 0.00029875927827819286, "loss": 5.368, "step": 2875 }, { "epoch": 1.2938005390835579, "grad_norm": 3.046875, "learning_rate": 0.00029874508015995463, "loss": 5.3748, "step": 2880 }, { "epoch": 1.2960467205750223, "grad_norm": 3.140625, "learning_rate": 0.0002987308016449344, "loss": 5.3995, "step": 2885 }, { "epoch": 1.2982929020664868, "grad_norm": 3.1875, "learning_rate": 0.00029871644274171534, "loss": 5.3753, "step": 2890 }, { "epoch": 1.3005390835579516, "grad_norm": 3.234375, "learning_rate": 0.00029870200345892876, "loss": 5.4296, "step": 2895 }, { "epoch": 1.302785265049416, "grad_norm": 3.09375, "learning_rate": 0.00029868748380525444, "loss": 5.315, "step": 2900 }, { "epoch": 1.3050314465408805, "grad_norm": 3.125, "learning_rate": 0.0002986728837894205, "loss": 5.4592, "step": 2905 }, { "epoch": 1.307277628032345, "grad_norm": 3.203125, "learning_rate": 0.00029865820342020325, "loss": 5.4735, "step": 2910 }, { "epoch": 1.3095238095238095, "grad_norm": 3.109375, "learning_rate": 0.0002986434427064273, "loss": 5.3768, "step": 2915 }, { "epoch": 1.311769991015274, "grad_norm": 2.890625, "learning_rate": 0.0002986286016569657, "loss": 5.381, "step": 2920 }, { "epoch": 1.3140161725067385, "grad_norm": 2.890625, "learning_rate": 0.0002986136802807396, "loss": 5.4079, "step": 2925 }, { "epoch": 1.316262353998203, "grad_norm": 3.21875, "learning_rate": 0.00029859867858671857, "loss": 5.435, "step": 2930 }, { "epoch": 1.3185085354896675, "grad_norm": 3.171875, "learning_rate": 0.00029858359658392045, "loss": 5.4919, "step": 2935 }, { "epoch": 1.320754716981132, "grad_norm": 2.859375, "learning_rate": 0.00029856843428141127, "loss": 5.3849, "step": 2940 }, { "epoch": 1.3230008984725967, "grad_norm": 3.703125, "learning_rate": 0.00029855319168830543, "loss": 5.4001, "step": 2945 }, { "epoch": 1.3252470799640612, "grad_norm": 3.375, "learning_rate": 0.0002985378688137656, "loss": 5.5048, "step": 2950 }, { "epoch": 1.3274932614555257, "grad_norm": 3.03125, "learning_rate": 0.00029852246566700253, "loss": 5.367, "step": 2955 }, { "epoch": 1.3297394429469902, "grad_norm": 2.921875, "learning_rate": 0.0002985069822572754, "loss": 5.3137, "step": 2960 }, { "epoch": 1.3319856244384547, "grad_norm": 3.15625, "learning_rate": 0.0002984914185938916, "loss": 5.3961, "step": 2965 }, { "epoch": 1.3342318059299192, "grad_norm": 3.1875, "learning_rate": 0.0002984757746862068, "loss": 5.4488, "step": 2970 }, { "epoch": 1.3364779874213837, "grad_norm": 3.171875, "learning_rate": 0.00029846005054362474, "loss": 5.4318, "step": 2975 }, { "epoch": 1.3387241689128482, "grad_norm": 2.96875, "learning_rate": 0.0002984442461755977, "loss": 5.3834, "step": 2980 }, { "epoch": 1.3409703504043127, "grad_norm": 3.0625, "learning_rate": 0.00029842836159162583, "loss": 5.4205, "step": 2985 }, { "epoch": 1.3432165318957772, "grad_norm": 2.90625, "learning_rate": 0.0002984123968012577, "loss": 5.4352, "step": 2990 }, { "epoch": 1.3454627133872417, "grad_norm": 3.03125, "learning_rate": 0.0002983963518140901, "loss": 5.4451, "step": 2995 }, { "epoch": 1.3477088948787062, "grad_norm": 3.0625, "learning_rate": 0.00029838022663976793, "loss": 5.3171, "step": 3000 }, { "epoch": 1.3477088948787062, "eval_loss": 5.344548225402832, "eval_runtime": 16.0596, "eval_samples_per_second": 1931.124, "eval_steps_per_second": 241.414, "step": 3000 }, { "epoch": 1.3499550763701706, "grad_norm": 2.984375, "learning_rate": 0.0002983640212879844, "loss": 5.4371, "step": 3005 }, { "epoch": 1.3522012578616351, "grad_norm": 3.265625, "learning_rate": 0.0002983477357684809, "loss": 5.3769, "step": 3010 }, { "epoch": 1.3544474393530996, "grad_norm": 3.421875, "learning_rate": 0.0002983313700910468, "loss": 5.4952, "step": 3015 }, { "epoch": 1.3566936208445641, "grad_norm": 2.96875, "learning_rate": 0.00029831492426552, "loss": 5.3494, "step": 3020 }, { "epoch": 1.3589398023360286, "grad_norm": 3.0625, "learning_rate": 0.00029829839830178636, "loss": 5.4431, "step": 3025 }, { "epoch": 1.3611859838274933, "grad_norm": 2.953125, "learning_rate": 0.00029828179220977994, "loss": 5.3644, "step": 3030 }, { "epoch": 1.3634321653189578, "grad_norm": 3.1875, "learning_rate": 0.000298265105999483, "loss": 5.3982, "step": 3035 }, { "epoch": 1.3656783468104223, "grad_norm": 3.03125, "learning_rate": 0.00029824833968092595, "loss": 5.3913, "step": 3040 }, { "epoch": 1.3679245283018868, "grad_norm": 2.96875, "learning_rate": 0.00029823149326418735, "loss": 5.3851, "step": 3045 }, { "epoch": 1.3701707097933513, "grad_norm": 3.0, "learning_rate": 0.0002982145667593939, "loss": 5.3206, "step": 3050 }, { "epoch": 1.3724168912848158, "grad_norm": 3.203125, "learning_rate": 0.00029819756017672043, "loss": 5.3429, "step": 3055 }, { "epoch": 1.3746630727762803, "grad_norm": 3.25, "learning_rate": 0.00029818047352639, "loss": 5.4596, "step": 3060 }, { "epoch": 1.3769092542677448, "grad_norm": 3.078125, "learning_rate": 0.00029816330681867366, "loss": 5.3423, "step": 3065 }, { "epoch": 1.3791554357592093, "grad_norm": 2.875, "learning_rate": 0.0002981460600638907, "loss": 5.3283, "step": 3070 }, { "epoch": 1.3814016172506738, "grad_norm": 2.921875, "learning_rate": 0.00029812873327240844, "loss": 5.3159, "step": 3075 }, { "epoch": 1.3836477987421385, "grad_norm": 2.890625, "learning_rate": 0.0002981113264546424, "loss": 5.3529, "step": 3080 }, { "epoch": 1.385893980233603, "grad_norm": 3.125, "learning_rate": 0.0002980938396210561, "loss": 5.46, "step": 3085 }, { "epoch": 1.3881401617250675, "grad_norm": 2.890625, "learning_rate": 0.00029807627278216126, "loss": 5.4219, "step": 3090 }, { "epoch": 1.390386343216532, "grad_norm": 3.03125, "learning_rate": 0.0002980586259485177, "loss": 5.4519, "step": 3095 }, { "epoch": 1.3926325247079965, "grad_norm": 3.15625, "learning_rate": 0.00029804089913073315, "loss": 5.4067, "step": 3100 }, { "epoch": 1.394878706199461, "grad_norm": 3.046875, "learning_rate": 0.0002980230923394637, "loss": 5.348, "step": 3105 }, { "epoch": 1.3971248876909255, "grad_norm": 3.109375, "learning_rate": 0.00029800520558541317, "loss": 5.3693, "step": 3110 }, { "epoch": 1.39937106918239, "grad_norm": 2.96875, "learning_rate": 0.0002979872388793338, "loss": 5.3537, "step": 3115 }, { "epoch": 1.4016172506738545, "grad_norm": 2.75, "learning_rate": 0.00029796919223202563, "loss": 5.3571, "step": 3120 }, { "epoch": 1.403863432165319, "grad_norm": 3.0, "learning_rate": 0.0002979510656543369, "loss": 5.3759, "step": 3125 }, { "epoch": 1.4061096136567834, "grad_norm": 3.109375, "learning_rate": 0.0002979328591571639, "loss": 5.3222, "step": 3130 }, { "epoch": 1.408355795148248, "grad_norm": 3.0, "learning_rate": 0.00029791457275145085, "loss": 5.2987, "step": 3135 }, { "epoch": 1.4106019766397124, "grad_norm": 2.984375, "learning_rate": 0.00029789620644819005, "loss": 5.3843, "step": 3140 }, { "epoch": 1.412848158131177, "grad_norm": 3.03125, "learning_rate": 0.00029787776025842186, "loss": 5.3461, "step": 3145 }, { "epoch": 1.4150943396226414, "grad_norm": 3.015625, "learning_rate": 0.00029785923419323467, "loss": 5.3381, "step": 3150 }, { "epoch": 1.417340521114106, "grad_norm": 2.890625, "learning_rate": 0.0002978406282637648, "loss": 5.3985, "step": 3155 }, { "epoch": 1.4195867026055704, "grad_norm": 2.953125, "learning_rate": 0.0002978219424811967, "loss": 5.3383, "step": 3160 }, { "epoch": 1.4218328840970351, "grad_norm": 3.125, "learning_rate": 0.00029780317685676276, "loss": 5.4033, "step": 3165 }, { "epoch": 1.4240790655884996, "grad_norm": 3.03125, "learning_rate": 0.0002977843314017433, "loss": 5.4135, "step": 3170 }, { "epoch": 1.426325247079964, "grad_norm": 3.0, "learning_rate": 0.0002977654061274668, "loss": 5.3461, "step": 3175 }, { "epoch": 1.4285714285714286, "grad_norm": 3.0625, "learning_rate": 0.0002977464010453095, "loss": 5.281, "step": 3180 }, { "epoch": 1.430817610062893, "grad_norm": 3.359375, "learning_rate": 0.0002977273161666957, "loss": 5.4328, "step": 3185 }, { "epoch": 1.4330637915543576, "grad_norm": 3.0, "learning_rate": 0.00029770815150309787, "loss": 5.3081, "step": 3190 }, { "epoch": 1.435309973045822, "grad_norm": 2.984375, "learning_rate": 0.0002976889070660361, "loss": 5.4198, "step": 3195 }, { "epoch": 1.4375561545372866, "grad_norm": 2.90625, "learning_rate": 0.0002976695828670787, "loss": 5.3054, "step": 3200 }, { "epoch": 1.439802336028751, "grad_norm": 2.984375, "learning_rate": 0.00029765017891784175, "loss": 5.4182, "step": 3205 }, { "epoch": 1.4420485175202156, "grad_norm": 2.765625, "learning_rate": 0.00029763069522998936, "loss": 5.3818, "step": 3210 }, { "epoch": 1.44429469901168, "grad_norm": 2.78125, "learning_rate": 0.0002976111318152336, "loss": 5.34, "step": 3215 }, { "epoch": 1.4465408805031448, "grad_norm": 2.96875, "learning_rate": 0.0002975914886853344, "loss": 5.4218, "step": 3220 }, { "epoch": 1.4487870619946093, "grad_norm": 3.078125, "learning_rate": 0.00029757176585209957, "loss": 5.3399, "step": 3225 }, { "epoch": 1.4510332434860738, "grad_norm": 3.265625, "learning_rate": 0.000297551963327385, "loss": 5.2921, "step": 3230 }, { "epoch": 1.4532794249775383, "grad_norm": 2.890625, "learning_rate": 0.00029753208112309423, "loss": 5.3799, "step": 3235 }, { "epoch": 1.4555256064690028, "grad_norm": 2.890625, "learning_rate": 0.00029751211925117897, "loss": 5.2984, "step": 3240 }, { "epoch": 1.4577717879604672, "grad_norm": 3.265625, "learning_rate": 0.00029749207772363867, "loss": 5.379, "step": 3245 }, { "epoch": 1.4600179694519317, "grad_norm": 3.0, "learning_rate": 0.0002974719565525207, "loss": 5.3465, "step": 3250 }, { "epoch": 1.4622641509433962, "grad_norm": 2.90625, "learning_rate": 0.0002974517557499201, "loss": 5.413, "step": 3255 }, { "epoch": 1.4645103324348607, "grad_norm": 3.40625, "learning_rate": 0.00029743147532798023, "loss": 5.2814, "step": 3260 }, { "epoch": 1.4667565139263252, "grad_norm": 2.96875, "learning_rate": 0.00029741111529889194, "loss": 5.3454, "step": 3265 }, { "epoch": 1.4690026954177897, "grad_norm": 3.078125, "learning_rate": 0.000297390675674894, "loss": 5.3013, "step": 3270 }, { "epoch": 1.4712488769092542, "grad_norm": 3.09375, "learning_rate": 0.0002973701564682731, "loss": 5.2762, "step": 3275 }, { "epoch": 1.4734950584007187, "grad_norm": 3.015625, "learning_rate": 0.00029734955769136377, "loss": 5.3686, "step": 3280 }, { "epoch": 1.4757412398921832, "grad_norm": 3.140625, "learning_rate": 0.00029732887935654827, "loss": 5.3697, "step": 3285 }, { "epoch": 1.4779874213836477, "grad_norm": 2.953125, "learning_rate": 0.0002973081214762568, "loss": 5.2504, "step": 3290 }, { "epoch": 1.4802336028751122, "grad_norm": 2.9375, "learning_rate": 0.00029728728406296735, "loss": 5.3318, "step": 3295 }, { "epoch": 1.482479784366577, "grad_norm": 3.078125, "learning_rate": 0.00029726636712920564, "loss": 5.3078, "step": 3300 }, { "epoch": 1.4847259658580414, "grad_norm": 3.046875, "learning_rate": 0.0002972453706875453, "loss": 5.3814, "step": 3305 }, { "epoch": 1.486972147349506, "grad_norm": 2.875, "learning_rate": 0.0002972242947506076, "loss": 5.2753, "step": 3310 }, { "epoch": 1.4892183288409704, "grad_norm": 3.046875, "learning_rate": 0.0002972031393310619, "loss": 5.3256, "step": 3315 }, { "epoch": 1.4914645103324349, "grad_norm": 3.0625, "learning_rate": 0.0002971819044416249, "loss": 5.3758, "step": 3320 }, { "epoch": 1.4937106918238994, "grad_norm": 2.75, "learning_rate": 0.00029716059009506145, "loss": 5.3209, "step": 3325 }, { "epoch": 1.4959568733153639, "grad_norm": 3.109375, "learning_rate": 0.000297139196304184, "loss": 5.3075, "step": 3330 }, { "epoch": 1.4982030548068284, "grad_norm": 2.859375, "learning_rate": 0.0002971177230818527, "loss": 5.3805, "step": 3335 }, { "epoch": 1.5004492362982929, "grad_norm": 3.0625, "learning_rate": 0.0002970961704409756, "loss": 5.3156, "step": 3340 }, { "epoch": 1.5026954177897576, "grad_norm": 2.90625, "learning_rate": 0.0002970745383945084, "loss": 5.3465, "step": 3345 }, { "epoch": 1.504941599281222, "grad_norm": 3.015625, "learning_rate": 0.00029705282695545454, "loss": 5.3717, "step": 3350 }, { "epoch": 1.5071877807726866, "grad_norm": 3.09375, "learning_rate": 0.00029703103613686527, "loss": 5.2288, "step": 3355 }, { "epoch": 1.509433962264151, "grad_norm": 3.1875, "learning_rate": 0.0002970091659518393, "loss": 5.2978, "step": 3360 }, { "epoch": 1.5116801437556155, "grad_norm": 2.9375, "learning_rate": 0.0002969872164135234, "loss": 5.2993, "step": 3365 }, { "epoch": 1.51392632524708, "grad_norm": 3.0625, "learning_rate": 0.00029696518753511173, "loss": 5.3231, "step": 3370 }, { "epoch": 1.5161725067385445, "grad_norm": 2.796875, "learning_rate": 0.0002969430793298464, "loss": 5.334, "step": 3375 }, { "epoch": 1.518418688230009, "grad_norm": 3.03125, "learning_rate": 0.00029692089181101696, "loss": 5.2514, "step": 3380 }, { "epoch": 1.5206648697214735, "grad_norm": 2.890625, "learning_rate": 0.0002968986249919609, "loss": 5.3403, "step": 3385 }, { "epoch": 1.522911051212938, "grad_norm": 3.0625, "learning_rate": 0.0002968762788860631, "loss": 5.3209, "step": 3390 }, { "epoch": 1.5251572327044025, "grad_norm": 3.125, "learning_rate": 0.0002968538535067564, "loss": 5.3657, "step": 3395 }, { "epoch": 1.527403414195867, "grad_norm": 2.96875, "learning_rate": 0.000296831348867521, "loss": 5.3167, "step": 3400 }, { "epoch": 1.5296495956873315, "grad_norm": 2.953125, "learning_rate": 0.0002968087649818848, "loss": 5.2753, "step": 3405 }, { "epoch": 1.531895777178796, "grad_norm": 2.984375, "learning_rate": 0.0002967861018634237, "loss": 5.3678, "step": 3410 }, { "epoch": 1.5341419586702605, "grad_norm": 3.265625, "learning_rate": 0.00029676335952576074, "loss": 5.3243, "step": 3415 }, { "epoch": 1.536388140161725, "grad_norm": 3.109375, "learning_rate": 0.0002967405379825668, "loss": 5.2466, "step": 3420 }, { "epoch": 1.5386343216531895, "grad_norm": 2.953125, "learning_rate": 0.0002967176372475604, "loss": 5.2428, "step": 3425 }, { "epoch": 1.540880503144654, "grad_norm": 2.890625, "learning_rate": 0.0002966946573345076, "loss": 5.2614, "step": 3430 }, { "epoch": 1.5431266846361185, "grad_norm": 2.921875, "learning_rate": 0.00029667159825722206, "loss": 5.3399, "step": 3435 }, { "epoch": 1.545372866127583, "grad_norm": 3.0625, "learning_rate": 0.00029664846002956506, "loss": 5.2338, "step": 3440 }, { "epoch": 1.5476190476190477, "grad_norm": 3.09375, "learning_rate": 0.0002966252426654454, "loss": 5.3445, "step": 3445 }, { "epoch": 1.5498652291105122, "grad_norm": 2.875, "learning_rate": 0.0002966019461788196, "loss": 5.2916, "step": 3450 }, { "epoch": 1.5521114106019767, "grad_norm": 2.9375, "learning_rate": 0.0002965785705836915, "loss": 5.3159, "step": 3455 }, { "epoch": 1.5543575920934412, "grad_norm": 3.1875, "learning_rate": 0.0002965551158941127, "loss": 5.3027, "step": 3460 }, { "epoch": 1.5566037735849056, "grad_norm": 2.96875, "learning_rate": 0.0002965315821241823, "loss": 5.2319, "step": 3465 }, { "epoch": 1.5588499550763701, "grad_norm": 3.875, "learning_rate": 0.00029650796928804685, "loss": 5.3169, "step": 3470 }, { "epoch": 1.5610961365678346, "grad_norm": 3.03125, "learning_rate": 0.0002964842773999005, "loss": 5.2524, "step": 3475 }, { "epoch": 1.5633423180592994, "grad_norm": 2.921875, "learning_rate": 0.0002964605064739849, "loss": 5.3455, "step": 3480 }, { "epoch": 1.5655884995507638, "grad_norm": 3.015625, "learning_rate": 0.0002964366565245892, "loss": 5.3241, "step": 3485 }, { "epoch": 1.5678346810422283, "grad_norm": 3.015625, "learning_rate": 0.00029641272756605023, "loss": 5.301, "step": 3490 }, { "epoch": 1.5700808625336928, "grad_norm": 3.0, "learning_rate": 0.0002963887196127519, "loss": 5.2987, "step": 3495 }, { "epoch": 1.5723270440251573, "grad_norm": 2.96875, "learning_rate": 0.00029636463267912607, "loss": 5.2262, "step": 3500 }, { "epoch": 1.5745732255166218, "grad_norm": 2.90625, "learning_rate": 0.00029634046677965174, "loss": 5.2556, "step": 3505 }, { "epoch": 1.5768194070080863, "grad_norm": 2.90625, "learning_rate": 0.00029631622192885553, "loss": 5.3328, "step": 3510 }, { "epoch": 1.5790655884995508, "grad_norm": 3.078125, "learning_rate": 0.00029629189814131155, "loss": 5.3252, "step": 3515 }, { "epoch": 1.5813117699910153, "grad_norm": 3.03125, "learning_rate": 0.0002962674954316413, "loss": 5.2871, "step": 3520 }, { "epoch": 1.5835579514824798, "grad_norm": 2.890625, "learning_rate": 0.0002962430138145137, "loss": 5.2723, "step": 3525 }, { "epoch": 1.5858041329739443, "grad_norm": 2.765625, "learning_rate": 0.000296218453304645, "loss": 5.2836, "step": 3530 }, { "epoch": 1.5880503144654088, "grad_norm": 3.015625, "learning_rate": 0.00029619381391679923, "loss": 5.3014, "step": 3535 }, { "epoch": 1.5902964959568733, "grad_norm": 2.890625, "learning_rate": 0.00029616909566578746, "loss": 5.2194, "step": 3540 }, { "epoch": 1.5925426774483378, "grad_norm": 2.875, "learning_rate": 0.0002961442985664684, "loss": 5.3363, "step": 3545 }, { "epoch": 1.5947888589398023, "grad_norm": 2.875, "learning_rate": 0.000296119422633748, "loss": 5.2192, "step": 3550 }, { "epoch": 1.5970350404312668, "grad_norm": 3.109375, "learning_rate": 0.0002960944678825797, "loss": 5.2585, "step": 3555 }, { "epoch": 1.5992812219227313, "grad_norm": 3.40625, "learning_rate": 0.0002960694343279643, "loss": 5.4105, "step": 3560 }, { "epoch": 1.6015274034141957, "grad_norm": 2.953125, "learning_rate": 0.0002960443219849499, "loss": 5.2834, "step": 3565 }, { "epoch": 1.6037735849056602, "grad_norm": 2.953125, "learning_rate": 0.0002960191308686321, "loss": 5.2917, "step": 3570 }, { "epoch": 1.6060197663971247, "grad_norm": 2.953125, "learning_rate": 0.0002959938609941537, "loss": 5.3014, "step": 3575 }, { "epoch": 1.6082659478885895, "grad_norm": 3.0625, "learning_rate": 0.00029596851237670494, "loss": 5.2469, "step": 3580 }, { "epoch": 1.610512129380054, "grad_norm": 3.046875, "learning_rate": 0.00029594308503152344, "loss": 5.2651, "step": 3585 }, { "epoch": 1.6127583108715184, "grad_norm": 2.9375, "learning_rate": 0.00029591757897389403, "loss": 5.2144, "step": 3590 }, { "epoch": 1.615004492362983, "grad_norm": 3.015625, "learning_rate": 0.00029589199421914885, "loss": 5.2536, "step": 3595 }, { "epoch": 1.6172506738544474, "grad_norm": 2.90625, "learning_rate": 0.0002958663307826674, "loss": 5.2291, "step": 3600 }, { "epoch": 1.619496855345912, "grad_norm": 2.875, "learning_rate": 0.00029584058867987656, "loss": 5.2936, "step": 3605 }, { "epoch": 1.6217430368373764, "grad_norm": 3.171875, "learning_rate": 0.00029581476792625035, "loss": 5.3135, "step": 3610 }, { "epoch": 1.6239892183288411, "grad_norm": 3.078125, "learning_rate": 0.0002957888685373101, "loss": 5.2395, "step": 3615 }, { "epoch": 1.6262353998203056, "grad_norm": 3.015625, "learning_rate": 0.0002957628905286245, "loss": 5.2269, "step": 3620 }, { "epoch": 1.6284815813117701, "grad_norm": 2.953125, "learning_rate": 0.00029573683391580946, "loss": 5.2192, "step": 3625 }, { "epoch": 1.6307277628032346, "grad_norm": 3.109375, "learning_rate": 0.000295710698714528, "loss": 5.2539, "step": 3630 }, { "epoch": 1.632973944294699, "grad_norm": 3.03125, "learning_rate": 0.0002956844849404906, "loss": 5.2506, "step": 3635 }, { "epoch": 1.6352201257861636, "grad_norm": 2.78125, "learning_rate": 0.00029565819260945483, "loss": 5.2739, "step": 3640 }, { "epoch": 1.637466307277628, "grad_norm": 3.03125, "learning_rate": 0.00029563182173722555, "loss": 5.232, "step": 3645 }, { "epoch": 1.6397124887690926, "grad_norm": 2.890625, "learning_rate": 0.0002956053723396548, "loss": 5.3054, "step": 3650 }, { "epoch": 1.641958670260557, "grad_norm": 2.9375, "learning_rate": 0.0002955788444326418, "loss": 5.2955, "step": 3655 }, { "epoch": 1.6442048517520216, "grad_norm": 3.0, "learning_rate": 0.00029555223803213305, "loss": 5.2577, "step": 3660 }, { "epoch": 1.646451033243486, "grad_norm": 2.96875, "learning_rate": 0.00029552555315412216, "loss": 5.2796, "step": 3665 }, { "epoch": 1.6486972147349506, "grad_norm": 3.75, "learning_rate": 0.0002954987898146499, "loss": 5.3159, "step": 3670 }, { "epoch": 1.650943396226415, "grad_norm": 2.890625, "learning_rate": 0.0002954719480298043, "loss": 5.2639, "step": 3675 }, { "epoch": 1.6531895777178796, "grad_norm": 2.875, "learning_rate": 0.00029544502781572035, "loss": 5.2906, "step": 3680 }, { "epoch": 1.655435759209344, "grad_norm": 4.75, "learning_rate": 0.0002954180291885804, "loss": 5.299, "step": 3685 }, { "epoch": 1.6576819407008085, "grad_norm": 3.046875, "learning_rate": 0.00029539095216461395, "loss": 5.2026, "step": 3690 }, { "epoch": 1.659928122192273, "grad_norm": 2.859375, "learning_rate": 0.0002953637967600974, "loss": 5.2159, "step": 3695 }, { "epoch": 1.6621743036837375, "grad_norm": 2.953125, "learning_rate": 0.0002953365629913544, "loss": 5.22, "step": 3700 }, { "epoch": 1.664420485175202, "grad_norm": 3.015625, "learning_rate": 0.0002953092508747557, "loss": 5.1528, "step": 3705 }, { "epoch": 1.6666666666666665, "grad_norm": 3.09375, "learning_rate": 0.0002952818604267193, "loss": 5.234, "step": 3710 }, { "epoch": 1.668912848158131, "grad_norm": 3.421875, "learning_rate": 0.0002952543916637099, "loss": 5.263, "step": 3715 }, { "epoch": 1.6711590296495957, "grad_norm": 2.984375, "learning_rate": 0.00029522684460223965, "loss": 5.2879, "step": 3720 }, { "epoch": 1.6734052111410602, "grad_norm": 3.046875, "learning_rate": 0.0002951992192588676, "loss": 5.2081, "step": 3725 }, { "epoch": 1.6756513926325247, "grad_norm": 2.953125, "learning_rate": 0.0002951715156501999, "loss": 5.2688, "step": 3730 }, { "epoch": 1.6778975741239892, "grad_norm": 3.0, "learning_rate": 0.00029514373379288967, "loss": 5.2266, "step": 3735 }, { "epoch": 1.6801437556154537, "grad_norm": 2.859375, "learning_rate": 0.0002951158737036372, "loss": 5.2542, "step": 3740 }, { "epoch": 1.6823899371069182, "grad_norm": 2.984375, "learning_rate": 0.0002950879353991897, "loss": 5.2341, "step": 3745 }, { "epoch": 1.684636118598383, "grad_norm": 3.171875, "learning_rate": 0.0002950599188963414, "loss": 5.2238, "step": 3750 }, { "epoch": 1.6868823000898474, "grad_norm": 3.09375, "learning_rate": 0.0002950318242119337, "loss": 5.3397, "step": 3755 }, { "epoch": 1.689128481581312, "grad_norm": 3.015625, "learning_rate": 0.0002950036513628547, "loss": 5.2441, "step": 3760 }, { "epoch": 1.6913746630727764, "grad_norm": 2.859375, "learning_rate": 0.0002949754003660397, "loss": 5.3238, "step": 3765 }, { "epoch": 1.693620844564241, "grad_norm": 3.390625, "learning_rate": 0.00029494707123847095, "loss": 5.3302, "step": 3770 }, { "epoch": 1.6958670260557054, "grad_norm": 3.28125, "learning_rate": 0.0002949186639971777, "loss": 5.2831, "step": 3775 }, { "epoch": 1.6981132075471699, "grad_norm": 3.078125, "learning_rate": 0.00029489017865923597, "loss": 5.2566, "step": 3780 }, { "epoch": 1.7003593890386344, "grad_norm": 2.9375, "learning_rate": 0.00029486161524176893, "loss": 5.2631, "step": 3785 }, { "epoch": 1.7026055705300989, "grad_norm": 3.046875, "learning_rate": 0.0002948329737619466, "loss": 5.2597, "step": 3790 }, { "epoch": 1.7048517520215634, "grad_norm": 3.265625, "learning_rate": 0.0002948042542369859, "loss": 5.2838, "step": 3795 }, { "epoch": 1.7070979335130279, "grad_norm": 2.9375, "learning_rate": 0.0002947754566841508, "loss": 5.2681, "step": 3800 }, { "epoch": 1.7093441150044923, "grad_norm": 3.046875, "learning_rate": 0.00029474658112075197, "loss": 5.3089, "step": 3805 }, { "epoch": 1.7115902964959568, "grad_norm": 3.03125, "learning_rate": 0.00029471762756414703, "loss": 5.2663, "step": 3810 }, { "epoch": 1.7138364779874213, "grad_norm": 2.953125, "learning_rate": 0.00029468859603174065, "loss": 5.2597, "step": 3815 }, { "epoch": 1.7160826594788858, "grad_norm": 3.046875, "learning_rate": 0.00029465948654098427, "loss": 5.2646, "step": 3820 }, { "epoch": 1.7183288409703503, "grad_norm": 2.890625, "learning_rate": 0.0002946302991093761, "loss": 5.2662, "step": 3825 }, { "epoch": 1.7205750224618148, "grad_norm": 2.890625, "learning_rate": 0.00029460103375446116, "loss": 5.2176, "step": 3830 }, { "epoch": 1.7228212039532793, "grad_norm": 2.84375, "learning_rate": 0.00029457169049383164, "loss": 5.225, "step": 3835 }, { "epoch": 1.7250673854447438, "grad_norm": 3.09375, "learning_rate": 0.00029454226934512624, "loss": 5.2631, "step": 3840 }, { "epoch": 1.7273135669362083, "grad_norm": 2.8125, "learning_rate": 0.00029451277032603064, "loss": 5.2029, "step": 3845 }, { "epoch": 1.7295597484276728, "grad_norm": 2.921875, "learning_rate": 0.0002944831934542772, "loss": 5.2321, "step": 3850 }, { "epoch": 1.7318059299191375, "grad_norm": 3.03125, "learning_rate": 0.00029445353874764526, "loss": 5.2173, "step": 3855 }, { "epoch": 1.734052111410602, "grad_norm": 2.90625, "learning_rate": 0.00029442380622396073, "loss": 5.2293, "step": 3860 }, { "epoch": 1.7362982929020665, "grad_norm": 2.984375, "learning_rate": 0.00029439399590109645, "loss": 5.1509, "step": 3865 }, { "epoch": 1.738544474393531, "grad_norm": 2.890625, "learning_rate": 0.00029436410779697206, "loss": 5.2911, "step": 3870 }, { "epoch": 1.7407906558849955, "grad_norm": 3.03125, "learning_rate": 0.00029433414192955377, "loss": 5.1782, "step": 3875 }, { "epoch": 1.74303683737646, "grad_norm": 3.0, "learning_rate": 0.0002943040983168547, "loss": 5.2294, "step": 3880 }, { "epoch": 1.7452830188679245, "grad_norm": 3.171875, "learning_rate": 0.0002942739769769347, "loss": 5.2567, "step": 3885 }, { "epoch": 1.7475292003593892, "grad_norm": 3.546875, "learning_rate": 0.00029424377792790023, "loss": 5.2894, "step": 3890 }, { "epoch": 1.7497753818508537, "grad_norm": 2.953125, "learning_rate": 0.0002942135011879046, "loss": 5.3933, "step": 3895 }, { "epoch": 1.7520215633423182, "grad_norm": 3.1875, "learning_rate": 0.00029418314677514764, "loss": 5.295, "step": 3900 }, { "epoch": 1.7542677448337827, "grad_norm": 3.15625, "learning_rate": 0.0002941527147078761, "loss": 5.1949, "step": 3905 }, { "epoch": 1.7565139263252472, "grad_norm": 2.96875, "learning_rate": 0.00029412220500438317, "loss": 5.1329, "step": 3910 }, { "epoch": 1.7587601078167117, "grad_norm": 3.109375, "learning_rate": 0.0002940916176830089, "loss": 5.3141, "step": 3915 }, { "epoch": 1.7610062893081762, "grad_norm": 3.109375, "learning_rate": 0.0002940609527621399, "loss": 5.2578, "step": 3920 }, { "epoch": 1.7632524707996406, "grad_norm": 3.0, "learning_rate": 0.00029403021026020955, "loss": 5.2614, "step": 3925 }, { "epoch": 1.7654986522911051, "grad_norm": 3.109375, "learning_rate": 0.00029399939019569767, "loss": 5.2955, "step": 3930 }, { "epoch": 1.7677448337825696, "grad_norm": 2.9375, "learning_rate": 0.00029396849258713084, "loss": 5.2972, "step": 3935 }, { "epoch": 1.7699910152740341, "grad_norm": 3.09375, "learning_rate": 0.00029393751745308215, "loss": 5.2714, "step": 3940 }, { "epoch": 1.7722371967654986, "grad_norm": 3.234375, "learning_rate": 0.0002939064648121714, "loss": 5.2846, "step": 3945 }, { "epoch": 1.7744833782569631, "grad_norm": 2.90625, "learning_rate": 0.00029387533468306504, "loss": 5.263, "step": 3950 }, { "epoch": 1.7767295597484276, "grad_norm": 3.09375, "learning_rate": 0.0002938441270844758, "loss": 5.1442, "step": 3955 }, { "epoch": 1.778975741239892, "grad_norm": 2.859375, "learning_rate": 0.00029381284203516334, "loss": 5.209, "step": 3960 }, { "epoch": 1.7812219227313566, "grad_norm": 3.078125, "learning_rate": 0.00029378147955393363, "loss": 5.2285, "step": 3965 }, { "epoch": 1.783468104222821, "grad_norm": 3.171875, "learning_rate": 0.00029375003965963935, "loss": 5.2605, "step": 3970 }, { "epoch": 1.7857142857142856, "grad_norm": 2.921875, "learning_rate": 0.00029371852237117957, "loss": 5.2557, "step": 3975 }, { "epoch": 1.78796046720575, "grad_norm": 2.96875, "learning_rate": 0.00029368692770749994, "loss": 5.1953, "step": 3980 }, { "epoch": 1.7902066486972146, "grad_norm": 3.0, "learning_rate": 0.00029365525568759266, "loss": 5.2138, "step": 3985 }, { "epoch": 1.7924528301886793, "grad_norm": 3.03125, "learning_rate": 0.0002936235063304964, "loss": 5.2362, "step": 3990 }, { "epoch": 1.7946990116801438, "grad_norm": 3.703125, "learning_rate": 0.0002935916796552963, "loss": 5.238, "step": 3995 }, { "epoch": 1.7969451931716083, "grad_norm": 3.03125, "learning_rate": 0.00029355977568112403, "loss": 5.2092, "step": 4000 }, { "epoch": 1.7969451931716083, "eval_loss": 5.183039665222168, "eval_runtime": 16.1808, "eval_samples_per_second": 1916.649, "eval_steps_per_second": 239.604, "step": 4000 }, { "epoch": 1.7991913746630728, "grad_norm": 2.875, "learning_rate": 0.00029352779442715765, "loss": 5.2075, "step": 4005 }, { "epoch": 1.8014375561545373, "grad_norm": 3.0, "learning_rate": 0.0002934957359126218, "loss": 5.1898, "step": 4010 }, { "epoch": 1.8036837376460018, "grad_norm": 3.25, "learning_rate": 0.0002934636001567873, "loss": 5.2844, "step": 4015 }, { "epoch": 1.8059299191374663, "grad_norm": 3.109375, "learning_rate": 0.0002934313871789718, "loss": 5.2941, "step": 4020 }, { "epoch": 1.808176100628931, "grad_norm": 3.140625, "learning_rate": 0.00029339909699853904, "loss": 5.3192, "step": 4025 }, { "epoch": 1.8104222821203955, "grad_norm": 3.015625, "learning_rate": 0.00029336672963489925, "loss": 5.1957, "step": 4030 }, { "epoch": 1.81266846361186, "grad_norm": 2.890625, "learning_rate": 0.0002933342851075092, "loss": 5.2322, "step": 4035 }, { "epoch": 1.8149146451033245, "grad_norm": 2.921875, "learning_rate": 0.00029330176343587175, "loss": 5.124, "step": 4040 }, { "epoch": 1.817160826594789, "grad_norm": 2.921875, "learning_rate": 0.00029326916463953646, "loss": 5.195, "step": 4045 }, { "epoch": 1.8194070080862534, "grad_norm": 3.03125, "learning_rate": 0.0002932364887380991, "loss": 5.2398, "step": 4050 }, { "epoch": 1.821653189577718, "grad_norm": 3.03125, "learning_rate": 0.00029320373575120174, "loss": 5.1243, "step": 4055 }, { "epoch": 1.8238993710691824, "grad_norm": 2.921875, "learning_rate": 0.0002931709056985328, "loss": 5.1875, "step": 4060 }, { "epoch": 1.826145552560647, "grad_norm": 3.140625, "learning_rate": 0.0002931379985998272, "loss": 5.2679, "step": 4065 }, { "epoch": 1.8283917340521114, "grad_norm": 3.109375, "learning_rate": 0.0002931050144748659, "loss": 5.1371, "step": 4070 }, { "epoch": 1.830637915543576, "grad_norm": 2.921875, "learning_rate": 0.0002930719533434764, "loss": 5.2114, "step": 4075 }, { "epoch": 1.8328840970350404, "grad_norm": 2.984375, "learning_rate": 0.0002930388152255323, "loss": 5.2132, "step": 4080 }, { "epoch": 1.835130278526505, "grad_norm": 2.96875, "learning_rate": 0.0002930056001409537, "loss": 5.211, "step": 4085 }, { "epoch": 1.8373764600179694, "grad_norm": 2.921875, "learning_rate": 0.0002929723081097067, "loss": 5.1184, "step": 4090 }, { "epoch": 1.8396226415094339, "grad_norm": 2.796875, "learning_rate": 0.00029293893915180387, "loss": 5.1128, "step": 4095 }, { "epoch": 1.8418688230008984, "grad_norm": 3.078125, "learning_rate": 0.00029290549328730395, "loss": 5.2356, "step": 4100 }, { "epoch": 1.8441150044923629, "grad_norm": 3.140625, "learning_rate": 0.0002928719705363118, "loss": 5.1903, "step": 4105 }, { "epoch": 1.8463611859838274, "grad_norm": 2.9375, "learning_rate": 0.00029283837091897876, "loss": 5.1552, "step": 4110 }, { "epoch": 1.8486073674752919, "grad_norm": 3.015625, "learning_rate": 0.00029280469445550213, "loss": 5.1519, "step": 4115 }, { "epoch": 1.8508535489667564, "grad_norm": 3.09375, "learning_rate": 0.0002927709411661255, "loss": 5.181, "step": 4120 }, { "epoch": 1.853099730458221, "grad_norm": 3.15625, "learning_rate": 0.00029273711107113856, "loss": 5.1855, "step": 4125 }, { "epoch": 1.8553459119496856, "grad_norm": 3.015625, "learning_rate": 0.00029270320419087743, "loss": 5.2248, "step": 4130 }, { "epoch": 1.85759209344115, "grad_norm": 3.015625, "learning_rate": 0.00029266922054572395, "loss": 5.1783, "step": 4135 }, { "epoch": 1.8598382749326146, "grad_norm": 2.890625, "learning_rate": 0.00029263516015610655, "loss": 5.2069, "step": 4140 }, { "epoch": 1.862084456424079, "grad_norm": 3.078125, "learning_rate": 0.0002926010230424995, "loss": 5.1962, "step": 4145 }, { "epoch": 1.8643306379155435, "grad_norm": 3.0625, "learning_rate": 0.00029256680922542334, "loss": 5.1803, "step": 4150 }, { "epoch": 1.866576819407008, "grad_norm": 2.875, "learning_rate": 0.0002925325187254446, "loss": 5.2128, "step": 4155 }, { "epoch": 1.8688230008984728, "grad_norm": 2.78125, "learning_rate": 0.00029249815156317605, "loss": 5.184, "step": 4160 }, { "epoch": 1.8710691823899372, "grad_norm": 3.109375, "learning_rate": 0.0002924637077592764, "loss": 5.2263, "step": 4165 }, { "epoch": 1.8733153638814017, "grad_norm": 3.15625, "learning_rate": 0.0002924291873344505, "loss": 5.1901, "step": 4170 }, { "epoch": 1.8755615453728662, "grad_norm": 2.921875, "learning_rate": 0.00029239459030944935, "loss": 5.2521, "step": 4175 }, { "epoch": 1.8778077268643307, "grad_norm": 2.9375, "learning_rate": 0.0002923599167050697, "loss": 5.167, "step": 4180 }, { "epoch": 1.8800539083557952, "grad_norm": 2.9375, "learning_rate": 0.0002923251665421547, "loss": 5.1813, "step": 4185 }, { "epoch": 1.8823000898472597, "grad_norm": 2.8125, "learning_rate": 0.0002922903398415933, "loss": 5.2392, "step": 4190 }, { "epoch": 1.8845462713387242, "grad_norm": 3.0, "learning_rate": 0.0002922554366243205, "loss": 5.2032, "step": 4195 }, { "epoch": 1.8867924528301887, "grad_norm": 3.421875, "learning_rate": 0.00029222045691131737, "loss": 5.1849, "step": 4200 }, { "epoch": 1.8890386343216532, "grad_norm": 2.90625, "learning_rate": 0.00029218540072361074, "loss": 5.1958, "step": 4205 }, { "epoch": 1.8912848158131177, "grad_norm": 2.921875, "learning_rate": 0.0002921502680822738, "loss": 5.174, "step": 4210 }, { "epoch": 1.8935309973045822, "grad_norm": 3.25, "learning_rate": 0.0002921150590084252, "loss": 5.2986, "step": 4215 }, { "epoch": 1.8957771787960467, "grad_norm": 3.125, "learning_rate": 0.00029207977352323005, "loss": 5.1103, "step": 4220 }, { "epoch": 1.8980233602875112, "grad_norm": 2.796875, "learning_rate": 0.000292044411647899, "loss": 5.2693, "step": 4225 }, { "epoch": 1.9002695417789757, "grad_norm": 3.046875, "learning_rate": 0.00029200897340368883, "loss": 5.219, "step": 4230 }, { "epoch": 1.9025157232704402, "grad_norm": 2.921875, "learning_rate": 0.0002919734588119021, "loss": 5.1556, "step": 4235 }, { "epoch": 1.9047619047619047, "grad_norm": 3.15625, "learning_rate": 0.0002919378678938874, "loss": 5.202, "step": 4240 }, { "epoch": 1.9070080862533692, "grad_norm": 2.921875, "learning_rate": 0.000291902200671039, "loss": 5.1384, "step": 4245 }, { "epoch": 1.9092542677448336, "grad_norm": 3.140625, "learning_rate": 0.00029186645716479734, "loss": 5.1446, "step": 4250 }, { "epoch": 1.9115004492362981, "grad_norm": 3.3125, "learning_rate": 0.0002918306373966484, "loss": 5.3229, "step": 4255 }, { "epoch": 1.9137466307277629, "grad_norm": 2.96875, "learning_rate": 0.00029179474138812424, "loss": 5.1863, "step": 4260 }, { "epoch": 1.9159928122192273, "grad_norm": 3.046875, "learning_rate": 0.0002917587691608026, "loss": 5.1948, "step": 4265 }, { "epoch": 1.9182389937106918, "grad_norm": 3.25, "learning_rate": 0.00029172272073630707, "loss": 5.1398, "step": 4270 }, { "epoch": 1.9204851752021563, "grad_norm": 2.90625, "learning_rate": 0.000291686596136307, "loss": 5.2248, "step": 4275 }, { "epoch": 1.9227313566936208, "grad_norm": 2.921875, "learning_rate": 0.00029165039538251786, "loss": 5.2137, "step": 4280 }, { "epoch": 1.9249775381850853, "grad_norm": 3.046875, "learning_rate": 0.00029161411849670034, "loss": 5.2118, "step": 4285 }, { "epoch": 1.9272237196765498, "grad_norm": 3.09375, "learning_rate": 0.00029157776550066134, "loss": 5.1821, "step": 4290 }, { "epoch": 1.9294699011680145, "grad_norm": 2.890625, "learning_rate": 0.0002915413364162533, "loss": 5.1385, "step": 4295 }, { "epoch": 1.931716082659479, "grad_norm": 2.9375, "learning_rate": 0.00029150483126537445, "loss": 5.1265, "step": 4300 }, { "epoch": 1.9339622641509435, "grad_norm": 2.921875, "learning_rate": 0.0002914682500699688, "loss": 5.173, "step": 4305 }, { "epoch": 1.936208445642408, "grad_norm": 3.25, "learning_rate": 0.00029143159285202597, "loss": 5.175, "step": 4310 }, { "epoch": 1.9384546271338725, "grad_norm": 2.921875, "learning_rate": 0.0002913948596335814, "loss": 5.1925, "step": 4315 }, { "epoch": 1.940700808625337, "grad_norm": 3.21875, "learning_rate": 0.00029135805043671597, "loss": 5.1982, "step": 4320 }, { "epoch": 1.9429469901168015, "grad_norm": 3.015625, "learning_rate": 0.0002913211652835567, "loss": 5.1497, "step": 4325 }, { "epoch": 1.945193171608266, "grad_norm": 2.890625, "learning_rate": 0.00029128420419627566, "loss": 5.151, "step": 4330 }, { "epoch": 1.9474393530997305, "grad_norm": 3.015625, "learning_rate": 0.00029124716719709114, "loss": 5.1051, "step": 4335 }, { "epoch": 1.949685534591195, "grad_norm": 2.9375, "learning_rate": 0.0002912100543082666, "loss": 5.1568, "step": 4340 }, { "epoch": 1.9519317160826595, "grad_norm": 2.9375, "learning_rate": 0.0002911728655521115, "loss": 5.1824, "step": 4345 }, { "epoch": 1.954177897574124, "grad_norm": 3.03125, "learning_rate": 0.00029113560095098064, "loss": 5.1908, "step": 4350 }, { "epoch": 1.9564240790655885, "grad_norm": 3.046875, "learning_rate": 0.0002910982605272745, "loss": 5.1337, "step": 4355 }, { "epoch": 1.958670260557053, "grad_norm": 2.953125, "learning_rate": 0.0002910608443034391, "loss": 5.2017, "step": 4360 }, { "epoch": 1.9609164420485174, "grad_norm": 2.84375, "learning_rate": 0.00029102335230196615, "loss": 5.131, "step": 4365 }, { "epoch": 1.963162623539982, "grad_norm": 2.875, "learning_rate": 0.00029098578454539274, "loss": 5.1247, "step": 4370 }, { "epoch": 1.9654088050314464, "grad_norm": 3.046875, "learning_rate": 0.0002909481410563017, "loss": 5.1947, "step": 4375 }, { "epoch": 1.967654986522911, "grad_norm": 2.9375, "learning_rate": 0.0002909104218573211, "loss": 5.162, "step": 4380 }, { "epoch": 1.9699011680143754, "grad_norm": 2.9375, "learning_rate": 0.00029087262697112494, "loss": 5.1051, "step": 4385 }, { "epoch": 1.97214734950584, "grad_norm": 2.9375, "learning_rate": 0.00029083475642043216, "loss": 5.1855, "step": 4390 }, { "epoch": 1.9743935309973046, "grad_norm": 3.046875, "learning_rate": 0.0002907968102280077, "loss": 5.1933, "step": 4395 }, { "epoch": 1.9766397124887691, "grad_norm": 3.234375, "learning_rate": 0.0002907587884166616, "loss": 5.1138, "step": 4400 }, { "epoch": 1.9788858939802336, "grad_norm": 3.1875, "learning_rate": 0.0002907206910092498, "loss": 5.1579, "step": 4405 }, { "epoch": 1.9811320754716981, "grad_norm": 2.96875, "learning_rate": 0.000290682518028673, "loss": 5.1163, "step": 4410 }, { "epoch": 1.9833782569631626, "grad_norm": 2.921875, "learning_rate": 0.00029064426949787807, "loss": 5.1887, "step": 4415 }, { "epoch": 1.985624438454627, "grad_norm": 2.984375, "learning_rate": 0.0002906059454398567, "loss": 5.2164, "step": 4420 }, { "epoch": 1.9878706199460916, "grad_norm": 3.125, "learning_rate": 0.0002905675458776464, "loss": 5.0996, "step": 4425 }, { "epoch": 1.9901168014375563, "grad_norm": 3.03125, "learning_rate": 0.0002905290708343298, "loss": 5.1728, "step": 4430 }, { "epoch": 1.9923629829290208, "grad_norm": 3.0, "learning_rate": 0.00029049052033303514, "loss": 5.1126, "step": 4435 }, { "epoch": 1.9946091644204853, "grad_norm": 3.03125, "learning_rate": 0.00029045189439693564, "loss": 5.1486, "step": 4440 }, { "epoch": 1.9968553459119498, "grad_norm": 2.890625, "learning_rate": 0.00029041319304925036, "loss": 5.098, "step": 4445 }, { "epoch": 1.9991015274034143, "grad_norm": 3.0, "learning_rate": 0.0002903744163132432, "loss": 5.1236, "step": 4450 }, { "epoch": 2.001347708894879, "grad_norm": 3.171875, "learning_rate": 0.00029033556421222383, "loss": 5.1441, "step": 4455 }, { "epoch": 2.0035938903863433, "grad_norm": 2.984375, "learning_rate": 0.0002902966367695468, "loss": 5.0451, "step": 4460 }, { "epoch": 2.0058400718778078, "grad_norm": 2.859375, "learning_rate": 0.00029025763400861236, "loss": 5.104, "step": 4465 }, { "epoch": 2.0080862533692723, "grad_norm": 2.9375, "learning_rate": 0.00029021855595286574, "loss": 5.0897, "step": 4470 }, { "epoch": 2.0103324348607368, "grad_norm": 2.90625, "learning_rate": 0.0002901794026257975, "loss": 4.9517, "step": 4475 }, { "epoch": 2.0125786163522013, "grad_norm": 2.859375, "learning_rate": 0.0002901401740509435, "loss": 4.9774, "step": 4480 }, { "epoch": 2.0148247978436657, "grad_norm": 2.96875, "learning_rate": 0.0002901008702518848, "loss": 4.986, "step": 4485 }, { "epoch": 2.0170709793351302, "grad_norm": 3.015625, "learning_rate": 0.0002900614912522476, "loss": 5.0134, "step": 4490 }, { "epoch": 2.0193171608265947, "grad_norm": 3.3125, "learning_rate": 0.0002900220370757035, "loss": 5.0922, "step": 4495 }, { "epoch": 2.0215633423180592, "grad_norm": 2.8125, "learning_rate": 0.0002899825077459692, "loss": 5.0198, "step": 4500 }, { "epoch": 2.0238095238095237, "grad_norm": 3.203125, "learning_rate": 0.0002899429032868064, "loss": 5.1019, "step": 4505 }, { "epoch": 2.026055705300988, "grad_norm": 3.078125, "learning_rate": 0.0002899032237220223, "loss": 5.0552, "step": 4510 }, { "epoch": 2.0283018867924527, "grad_norm": 2.859375, "learning_rate": 0.0002898634690754689, "loss": 5.0344, "step": 4515 }, { "epoch": 2.030548068283917, "grad_norm": 2.9375, "learning_rate": 0.0002898236393710436, "loss": 5.04, "step": 4520 }, { "epoch": 2.0327942497753817, "grad_norm": 3.109375, "learning_rate": 0.00028978373463268883, "loss": 5.0868, "step": 4525 }, { "epoch": 2.035040431266846, "grad_norm": 3.234375, "learning_rate": 0.00028974375488439194, "loss": 5.0977, "step": 4530 }, { "epoch": 2.0372866127583107, "grad_norm": 2.96875, "learning_rate": 0.0002897037001501857, "loss": 5.0351, "step": 4535 }, { "epoch": 2.039532794249775, "grad_norm": 3.5, "learning_rate": 0.00028966357045414774, "loss": 5.115, "step": 4540 }, { "epoch": 2.0417789757412397, "grad_norm": 2.875, "learning_rate": 0.00028962336582040086, "loss": 5.137, "step": 4545 }, { "epoch": 2.0440251572327046, "grad_norm": 3.15625, "learning_rate": 0.0002895830862731127, "loss": 5.0389, "step": 4550 }, { "epoch": 2.046271338724169, "grad_norm": 2.921875, "learning_rate": 0.0002895427318364963, "loss": 5.045, "step": 4555 }, { "epoch": 2.0485175202156336, "grad_norm": 3.0, "learning_rate": 0.00028950230253480935, "loss": 5.0665, "step": 4560 }, { "epoch": 2.050763701707098, "grad_norm": 3.0, "learning_rate": 0.00028946179839235475, "loss": 4.9852, "step": 4565 }, { "epoch": 2.0530098831985626, "grad_norm": 2.890625, "learning_rate": 0.0002894212194334803, "loss": 5.1119, "step": 4570 }, { "epoch": 2.055256064690027, "grad_norm": 2.921875, "learning_rate": 0.00028938056568257874, "loss": 5.0799, "step": 4575 }, { "epoch": 2.0575022461814916, "grad_norm": 3.125, "learning_rate": 0.000289339837164088, "loss": 5.0597, "step": 4580 }, { "epoch": 2.059748427672956, "grad_norm": 2.859375, "learning_rate": 0.0002892990339024907, "loss": 5.0044, "step": 4585 }, { "epoch": 2.0619946091644206, "grad_norm": 3.09375, "learning_rate": 0.0002892581559223144, "loss": 5.0103, "step": 4590 }, { "epoch": 2.064240790655885, "grad_norm": 3.140625, "learning_rate": 0.00028921720324813185, "loss": 5.0157, "step": 4595 }, { "epoch": 2.0664869721473496, "grad_norm": 2.875, "learning_rate": 0.0002891761759045603, "loss": 5.0655, "step": 4600 }, { "epoch": 2.068733153638814, "grad_norm": 3.0, "learning_rate": 0.0002891350739162622, "loss": 5.1106, "step": 4605 }, { "epoch": 2.0709793351302785, "grad_norm": 3.125, "learning_rate": 0.0002890938973079447, "loss": 5.129, "step": 4610 }, { "epoch": 2.073225516621743, "grad_norm": 3.125, "learning_rate": 0.00028905264610436, "loss": 5.031, "step": 4615 }, { "epoch": 2.0754716981132075, "grad_norm": 2.859375, "learning_rate": 0.00028901132033030475, "loss": 5.0716, "step": 4620 }, { "epoch": 2.077717879604672, "grad_norm": 2.984375, "learning_rate": 0.000288969920010621, "loss": 5.0758, "step": 4625 }, { "epoch": 2.0799640610961365, "grad_norm": 3.0625, "learning_rate": 0.000288928445170195, "loss": 5.0436, "step": 4630 }, { "epoch": 2.082210242587601, "grad_norm": 2.84375, "learning_rate": 0.00028888689583395826, "loss": 5.0841, "step": 4635 }, { "epoch": 2.0844564240790655, "grad_norm": 3.140625, "learning_rate": 0.00028884527202688683, "loss": 5.0446, "step": 4640 }, { "epoch": 2.08670260557053, "grad_norm": 3.015625, "learning_rate": 0.0002888035737740016, "loss": 4.9765, "step": 4645 }, { "epoch": 2.0889487870619945, "grad_norm": 2.96875, "learning_rate": 0.00028876180110036823, "loss": 5.1058, "step": 4650 }, { "epoch": 2.091194968553459, "grad_norm": 2.953125, "learning_rate": 0.0002887199540310971, "loss": 5.0546, "step": 4655 }, { "epoch": 2.0934411500449235, "grad_norm": 2.828125, "learning_rate": 0.00028867803259134326, "loss": 4.9612, "step": 4660 }, { "epoch": 2.095687331536388, "grad_norm": 3.046875, "learning_rate": 0.00028863603680630653, "loss": 5.0064, "step": 4665 }, { "epoch": 2.0979335130278525, "grad_norm": 2.984375, "learning_rate": 0.00028859396670123135, "loss": 5.0299, "step": 4670 }, { "epoch": 2.100179694519317, "grad_norm": 3.171875, "learning_rate": 0.000288551822301407, "loss": 5.0889, "step": 4675 }, { "epoch": 2.1024258760107815, "grad_norm": 3.0, "learning_rate": 0.00028850960363216714, "loss": 5.0944, "step": 4680 }, { "epoch": 2.1046720575022464, "grad_norm": 2.953125, "learning_rate": 0.0002884673107188904, "loss": 4.9692, "step": 4685 }, { "epoch": 2.106918238993711, "grad_norm": 3.140625, "learning_rate": 0.00028842494358699973, "loss": 4.9994, "step": 4690 }, { "epoch": 2.1091644204851754, "grad_norm": 3.03125, "learning_rate": 0.000288382502261963, "loss": 5.0891, "step": 4695 }, { "epoch": 2.11141060197664, "grad_norm": 3.125, "learning_rate": 0.0002883399867692924, "loss": 5.0812, "step": 4700 }, { "epoch": 2.1136567834681044, "grad_norm": 3.1875, "learning_rate": 0.00028829739713454483, "loss": 5.0365, "step": 4705 }, { "epoch": 2.115902964959569, "grad_norm": 3.0625, "learning_rate": 0.0002882547333833218, "loss": 5.0654, "step": 4710 }, { "epoch": 2.1181491464510334, "grad_norm": 2.859375, "learning_rate": 0.00028821199554126934, "loss": 4.9854, "step": 4715 }, { "epoch": 2.120395327942498, "grad_norm": 3.046875, "learning_rate": 0.0002881691836340779, "loss": 5.0865, "step": 4720 }, { "epoch": 2.1226415094339623, "grad_norm": 2.9375, "learning_rate": 0.00028812629768748267, "loss": 5.045, "step": 4725 }, { "epoch": 2.124887690925427, "grad_norm": 3.390625, "learning_rate": 0.00028808333772726316, "loss": 5.0897, "step": 4730 }, { "epoch": 2.1271338724168913, "grad_norm": 3.265625, "learning_rate": 0.00028804030377924345, "loss": 5.0187, "step": 4735 }, { "epoch": 2.129380053908356, "grad_norm": 2.9375, "learning_rate": 0.0002879971958692921, "loss": 5.0898, "step": 4740 }, { "epoch": 2.1316262353998203, "grad_norm": 2.953125, "learning_rate": 0.00028795401402332215, "loss": 5.0058, "step": 4745 }, { "epoch": 2.133872416891285, "grad_norm": 2.984375, "learning_rate": 0.00028791075826729097, "loss": 5.0468, "step": 4750 }, { "epoch": 2.1361185983827493, "grad_norm": 3.109375, "learning_rate": 0.00028786742862720055, "loss": 5.0241, "step": 4755 }, { "epoch": 2.138364779874214, "grad_norm": 3.0625, "learning_rate": 0.0002878240251290971, "loss": 5.1405, "step": 4760 }, { "epoch": 2.1406109613656783, "grad_norm": 2.828125, "learning_rate": 0.0002877805477990713, "loss": 5.0095, "step": 4765 }, { "epoch": 2.142857142857143, "grad_norm": 3.203125, "learning_rate": 0.00028773699666325835, "loss": 5.0425, "step": 4770 }, { "epoch": 2.1451033243486073, "grad_norm": 3.078125, "learning_rate": 0.00028769337174783754, "loss": 5.0217, "step": 4775 }, { "epoch": 2.147349505840072, "grad_norm": 3.078125, "learning_rate": 0.0002876496730790327, "loss": 5.0803, "step": 4780 }, { "epoch": 2.1495956873315363, "grad_norm": 3.484375, "learning_rate": 0.00028760590068311194, "loss": 5.0487, "step": 4785 }, { "epoch": 2.1518418688230008, "grad_norm": 3.0625, "learning_rate": 0.00028756205458638776, "loss": 5.0174, "step": 4790 }, { "epoch": 2.1540880503144653, "grad_norm": 3.421875, "learning_rate": 0.00028751813481521694, "loss": 5.0855, "step": 4795 }, { "epoch": 2.1563342318059298, "grad_norm": 3.140625, "learning_rate": 0.00028747414139600034, "loss": 5.0706, "step": 4800 }, { "epoch": 2.1585804132973943, "grad_norm": 2.984375, "learning_rate": 0.0002874300743551835, "loss": 5.1177, "step": 4805 }, { "epoch": 2.1608265947888587, "grad_norm": 3.328125, "learning_rate": 0.0002873859337192558, "loss": 5.0589, "step": 4810 }, { "epoch": 2.1630727762803232, "grad_norm": 3.140625, "learning_rate": 0.00028734171951475104, "loss": 5.0959, "step": 4815 }, { "epoch": 2.165318957771788, "grad_norm": 3.078125, "learning_rate": 0.00028729743176824735, "loss": 5.0754, "step": 4820 }, { "epoch": 2.1675651392632527, "grad_norm": 3.03125, "learning_rate": 0.0002872530705063669, "loss": 5.0442, "step": 4825 }, { "epoch": 2.169811320754717, "grad_norm": 3.421875, "learning_rate": 0.00028720863575577615, "loss": 4.9739, "step": 4830 }, { "epoch": 2.1720575022461817, "grad_norm": 3.09375, "learning_rate": 0.0002871641275431856, "loss": 5.0175, "step": 4835 }, { "epoch": 2.174303683737646, "grad_norm": 3.15625, "learning_rate": 0.0002871195458953501, "loss": 5.0096, "step": 4840 }, { "epoch": 2.1765498652291106, "grad_norm": 3.125, "learning_rate": 0.0002870748908390686, "loss": 5.0525, "step": 4845 }, { "epoch": 2.178796046720575, "grad_norm": 2.890625, "learning_rate": 0.0002870301624011839, "loss": 5.0469, "step": 4850 }, { "epoch": 2.1810422282120396, "grad_norm": 3.0, "learning_rate": 0.0002869853606085834, "loss": 5.0679, "step": 4855 }, { "epoch": 2.183288409703504, "grad_norm": 2.9375, "learning_rate": 0.00028694048548819816, "loss": 5.0369, "step": 4860 }, { "epoch": 2.1855345911949686, "grad_norm": 3.078125, "learning_rate": 0.00028689553706700356, "loss": 5.0443, "step": 4865 }, { "epoch": 2.187780772686433, "grad_norm": 3.0625, "learning_rate": 0.000286850515372019, "loss": 4.9984, "step": 4870 }, { "epoch": 2.1900269541778976, "grad_norm": 2.9375, "learning_rate": 0.00028680542043030787, "loss": 4.9734, "step": 4875 }, { "epoch": 2.192273135669362, "grad_norm": 2.953125, "learning_rate": 0.0002867602522689776, "loss": 5.0096, "step": 4880 }, { "epoch": 2.1945193171608266, "grad_norm": 3.265625, "learning_rate": 0.00028671501091517967, "loss": 4.9606, "step": 4885 }, { "epoch": 2.196765498652291, "grad_norm": 3.171875, "learning_rate": 0.0002866696963961096, "loss": 5.072, "step": 4890 }, { "epoch": 2.1990116801437556, "grad_norm": 2.96875, "learning_rate": 0.0002866243087390067, "loss": 5.0319, "step": 4895 }, { "epoch": 2.20125786163522, "grad_norm": 3.125, "learning_rate": 0.0002865788479711545, "loss": 5.0198, "step": 4900 }, { "epoch": 2.2035040431266846, "grad_norm": 3.015625, "learning_rate": 0.00028653331411988034, "loss": 5.001, "step": 4905 }, { "epoch": 2.205750224618149, "grad_norm": 2.890625, "learning_rate": 0.00028648770721255543, "loss": 5.0652, "step": 4910 }, { "epoch": 2.2079964061096136, "grad_norm": 3.0625, "learning_rate": 0.000286442027276595, "loss": 4.9551, "step": 4915 }, { "epoch": 2.210242587601078, "grad_norm": 3.09375, "learning_rate": 0.0002863962743394583, "loss": 5.0335, "step": 4920 }, { "epoch": 2.2124887690925426, "grad_norm": 3.265625, "learning_rate": 0.00028635044842864805, "loss": 5.0267, "step": 4925 }, { "epoch": 2.214734950584007, "grad_norm": 3.34375, "learning_rate": 0.0002863045495717113, "loss": 5.0602, "step": 4930 }, { "epoch": 2.2169811320754715, "grad_norm": 3.09375, "learning_rate": 0.0002862585777962387, "loss": 5.0753, "step": 4935 }, { "epoch": 2.219227313566936, "grad_norm": 3.09375, "learning_rate": 0.0002862125331298648, "loss": 5.0716, "step": 4940 }, { "epoch": 2.2214734950584005, "grad_norm": 3.328125, "learning_rate": 0.0002861664156002679, "loss": 5.0408, "step": 4945 }, { "epoch": 2.223719676549865, "grad_norm": 3.25, "learning_rate": 0.00028612022523517015, "loss": 5.0705, "step": 4950 }, { "epoch": 2.22596585804133, "grad_norm": 2.921875, "learning_rate": 0.0002860739620623375, "loss": 5.06, "step": 4955 }, { "epoch": 2.2282120395327945, "grad_norm": 2.953125, "learning_rate": 0.00028602762610957966, "loss": 5.0575, "step": 4960 }, { "epoch": 2.230458221024259, "grad_norm": 2.90625, "learning_rate": 0.0002859812174047501, "loss": 5.0911, "step": 4965 }, { "epoch": 2.2327044025157234, "grad_norm": 3.0625, "learning_rate": 0.00028593473597574595, "loss": 5.0714, "step": 4970 }, { "epoch": 2.234950584007188, "grad_norm": 3.15625, "learning_rate": 0.00028588818185050816, "loss": 4.9425, "step": 4975 }, { "epoch": 2.2371967654986524, "grad_norm": 3.359375, "learning_rate": 0.00028584155505702124, "loss": 5.0257, "step": 4980 }, { "epoch": 2.239442946990117, "grad_norm": 3.015625, "learning_rate": 0.00028579485562331354, "loss": 4.9997, "step": 4985 }, { "epoch": 2.2416891284815814, "grad_norm": 3.0, "learning_rate": 0.00028574808357745697, "loss": 5.136, "step": 4990 }, { "epoch": 2.243935309973046, "grad_norm": 3.046875, "learning_rate": 0.0002857012389475671, "loss": 4.9934, "step": 4995 }, { "epoch": 2.2461814914645104, "grad_norm": 3.203125, "learning_rate": 0.0002856543217618033, "loss": 5.0804, "step": 5000 }, { "epoch": 2.2461814914645104, "eval_loss": 5.077792644500732, "eval_runtime": 16.1311, "eval_samples_per_second": 1922.556, "eval_steps_per_second": 240.343, "step": 5000 } ], "logging_steps": 5, "max_steps": 22260, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.35397447273728e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }