{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.952286282306163, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009940357852882704, "grad_norm": 7.9375, "learning_rate": 1e-05, "loss": 10.9268, "step": 5 }, { "epoch": 0.019880715705765408, "grad_norm": 7.125, "learning_rate": 2e-05, "loss": 10.8658, "step": 10 }, { "epoch": 0.02982107355864811, "grad_norm": 5.34375, "learning_rate": 3e-05, "loss": 10.6834, "step": 15 }, { "epoch": 0.039761431411530816, "grad_norm": 3.921875, "learning_rate": 4e-05, "loss": 10.4582, "step": 20 }, { "epoch": 0.04970178926441352, "grad_norm": 3.21875, "learning_rate": 5e-05, "loss": 10.2988, "step": 25 }, { "epoch": 0.05964214711729622, "grad_norm": 2.75, "learning_rate": 6e-05, "loss": 10.2331, "step": 30 }, { "epoch": 0.06958250497017893, "grad_norm": 2.859375, "learning_rate": 7.000000000000001e-05, "loss": 10.0708, "step": 35 }, { "epoch": 0.07952286282306163, "grad_norm": 2.8125, "learning_rate": 8e-05, "loss": 9.92, "step": 40 }, { "epoch": 0.08946322067594434, "grad_norm": 2.734375, "learning_rate": 8.999999999999999e-05, "loss": 9.7669, "step": 45 }, { "epoch": 0.09940357852882704, "grad_norm": 2.578125, "learning_rate": 0.0001, "loss": 9.5628, "step": 50 }, { "epoch": 0.10934393638170974, "grad_norm": 2.40625, "learning_rate": 0.00011, "loss": 9.4269, "step": 55 }, { "epoch": 0.11928429423459244, "grad_norm": 2.109375, "learning_rate": 0.00012, "loss": 9.1929, "step": 60 }, { "epoch": 0.12922465208747516, "grad_norm": 2.015625, "learning_rate": 0.00013000000000000002, "loss": 8.9863, "step": 65 }, { "epoch": 0.13916500994035785, "grad_norm": 1.6171875, "learning_rate": 0.00014000000000000001, "loss": 8.8215, "step": 70 }, { "epoch": 0.14910536779324055, "grad_norm": 1.390625, "learning_rate": 0.00015, "loss": 8.6842, "step": 75 }, { "epoch": 0.15904572564612326, "grad_norm": 1.234375, "learning_rate": 0.00016, "loss": 8.5892, "step": 80 }, { "epoch": 0.16898608349900596, "grad_norm": 1.2734375, "learning_rate": 0.00017, "loss": 8.4767, "step": 85 }, { "epoch": 0.17892644135188868, "grad_norm": 1.09375, "learning_rate": 0.00017999999999999998, "loss": 8.4466, "step": 90 }, { "epoch": 0.18886679920477137, "grad_norm": 1.4140625, "learning_rate": 0.00019, "loss": 8.4066, "step": 95 }, { "epoch": 0.1988071570576541, "grad_norm": 1.0625, "learning_rate": 0.0002, "loss": 8.4231, "step": 100 }, { "epoch": 0.20874751491053678, "grad_norm": 1.140625, "learning_rate": 0.00021, "loss": 8.3899, "step": 105 }, { "epoch": 0.21868787276341947, "grad_norm": 1.46875, "learning_rate": 0.00022, "loss": 8.4064, "step": 110 }, { "epoch": 0.2286282306163022, "grad_norm": 1.890625, "learning_rate": 0.00023, "loss": 8.3748, "step": 115 }, { "epoch": 0.23856858846918488, "grad_norm": 1.59375, "learning_rate": 0.00024, "loss": 8.3608, "step": 120 }, { "epoch": 0.2485089463220676, "grad_norm": 1.421875, "learning_rate": 0.00025, "loss": 8.297, "step": 125 }, { "epoch": 0.2584493041749503, "grad_norm": 1.296875, "learning_rate": 0.00026000000000000003, "loss": 8.3535, "step": 130 }, { "epoch": 0.268389662027833, "grad_norm": 1.6953125, "learning_rate": 0.00027, "loss": 8.2688, "step": 135 }, { "epoch": 0.2783300198807157, "grad_norm": 3.09375, "learning_rate": 0.00028000000000000003, "loss": 8.2854, "step": 140 }, { "epoch": 0.2882703777335984, "grad_norm": 1.921875, "learning_rate": 0.00029, "loss": 8.1947, "step": 145 }, { "epoch": 0.2982107355864811, "grad_norm": 1.4921875, "learning_rate": 0.0003, "loss": 8.1805, "step": 150 }, { "epoch": 0.3081510934393638, "grad_norm": 1.578125, "learning_rate": 0.00031, "loss": 8.2251, "step": 155 }, { "epoch": 0.31809145129224653, "grad_norm": 1.4140625, "learning_rate": 0.00032, "loss": 8.1777, "step": 160 }, { "epoch": 0.32803180914512925, "grad_norm": 1.5078125, "learning_rate": 0.00033, "loss": 8.1526, "step": 165 }, { "epoch": 0.3379721669980119, "grad_norm": 2.8125, "learning_rate": 0.00034, "loss": 8.1298, "step": 170 }, { "epoch": 0.34791252485089463, "grad_norm": 2.03125, "learning_rate": 0.00035, "loss": 8.0809, "step": 175 }, { "epoch": 0.35785288270377735, "grad_norm": 1.625, "learning_rate": 0.00035999999999999997, "loss": 8.0967, "step": 180 }, { "epoch": 0.36779324055666, "grad_norm": 1.5625, "learning_rate": 0.00037, "loss": 8.0563, "step": 185 }, { "epoch": 0.37773359840954274, "grad_norm": 1.6484375, "learning_rate": 0.00038, "loss": 8.0444, "step": 190 }, { "epoch": 0.38767395626242546, "grad_norm": 1.609375, "learning_rate": 0.00039000000000000005, "loss": 8.08, "step": 195 }, { "epoch": 0.3976143141153082, "grad_norm": 1.828125, "learning_rate": 0.0004, "loss": 8.0104, "step": 200 }, { "epoch": 0.40755467196819084, "grad_norm": 1.40625, "learning_rate": 0.00041, "loss": 7.9939, "step": 205 }, { "epoch": 0.41749502982107356, "grad_norm": 1.4609375, "learning_rate": 0.00042, "loss": 7.9853, "step": 210 }, { "epoch": 0.4274353876739563, "grad_norm": 1.734375, "learning_rate": 0.00043, "loss": 7.9695, "step": 215 }, { "epoch": 0.43737574552683894, "grad_norm": 1.5859375, "learning_rate": 0.00044, "loss": 7.9762, "step": 220 }, { "epoch": 0.44731610337972166, "grad_norm": 1.6875, "learning_rate": 0.00045000000000000004, "loss": 7.9428, "step": 225 }, { "epoch": 0.4572564612326044, "grad_norm": 1.3671875, "learning_rate": 0.00046, "loss": 7.8805, "step": 230 }, { "epoch": 0.4671968190854871, "grad_norm": 1.6796875, "learning_rate": 0.00047, "loss": 7.8542, "step": 235 }, { "epoch": 0.47713717693836977, "grad_norm": 1.6015625, "learning_rate": 0.00048, "loss": 7.9372, "step": 240 }, { "epoch": 0.4870775347912525, "grad_norm": 1.421875, "learning_rate": 0.00049, "loss": 7.8429, "step": 245 }, { "epoch": 0.4970178926441352, "grad_norm": 1.5390625, "learning_rate": 0.0005, "loss": 7.797, "step": 250 }, { "epoch": 0.5069582504970179, "grad_norm": 2.03125, "learning_rate": 0.00051, "loss": 7.8132, "step": 255 }, { "epoch": 0.5168986083499006, "grad_norm": 1.546875, "learning_rate": 0.0005200000000000001, "loss": 7.8295, "step": 260 }, { "epoch": 0.5268389662027833, "grad_norm": 1.5703125, "learning_rate": 0.0005300000000000001, "loss": 7.777, "step": 265 }, { "epoch": 0.536779324055666, "grad_norm": 1.5546875, "learning_rate": 0.00054, "loss": 7.7526, "step": 270 }, { "epoch": 0.5467196819085487, "grad_norm": 1.6953125, "learning_rate": 0.00055, "loss": 7.7913, "step": 275 }, { "epoch": 0.5566600397614314, "grad_norm": 1.78125, "learning_rate": 0.0005600000000000001, "loss": 7.7696, "step": 280 }, { "epoch": 0.5666003976143141, "grad_norm": 1.75, "learning_rate": 0.00057, "loss": 7.7029, "step": 285 }, { "epoch": 0.5765407554671969, "grad_norm": 1.4765625, "learning_rate": 0.00058, "loss": 7.7941, "step": 290 }, { "epoch": 0.5864811133200796, "grad_norm": 1.734375, "learning_rate": 0.00059, "loss": 7.7566, "step": 295 }, { "epoch": 0.5964214711729622, "grad_norm": 1.671875, "learning_rate": 0.0006, "loss": 7.6959, "step": 300 }, { "epoch": 0.6063618290258449, "grad_norm": 1.5234375, "learning_rate": 0.00061, "loss": 7.6751, "step": 305 }, { "epoch": 0.6163021868787276, "grad_norm": 1.453125, "learning_rate": 0.00062, "loss": 7.6124, "step": 310 }, { "epoch": 0.6262425447316103, "grad_norm": 1.484375, "learning_rate": 0.00063, "loss": 7.6487, "step": 315 }, { "epoch": 0.6361829025844931, "grad_norm": 1.53125, "learning_rate": 0.00064, "loss": 7.5655, "step": 320 }, { "epoch": 0.6461232604373758, "grad_norm": 2.046875, "learning_rate": 0.0006500000000000001, "loss": 7.673, "step": 325 }, { "epoch": 0.6560636182902585, "grad_norm": 1.859375, "learning_rate": 0.00066, "loss": 7.6439, "step": 330 }, { "epoch": 0.6660039761431411, "grad_norm": 1.609375, "learning_rate": 0.00067, "loss": 7.5623, "step": 335 }, { "epoch": 0.6759443339960238, "grad_norm": 1.7265625, "learning_rate": 0.00068, "loss": 7.611, "step": 340 }, { "epoch": 0.6858846918489065, "grad_norm": 1.59375, "learning_rate": 0.00069, "loss": 7.5456, "step": 345 }, { "epoch": 0.6958250497017893, "grad_norm": 1.8046875, "learning_rate": 0.0007, "loss": 7.6076, "step": 350 }, { "epoch": 0.705765407554672, "grad_norm": 1.6015625, "learning_rate": 0.00071, "loss": 7.5633, "step": 355 }, { "epoch": 0.7157057654075547, "grad_norm": 1.546875, "learning_rate": 0.0007199999999999999, "loss": 7.5467, "step": 360 }, { "epoch": 0.7256461232604374, "grad_norm": 1.59375, "learning_rate": 0.00073, "loss": 7.524, "step": 365 }, { "epoch": 0.73558648111332, "grad_norm": 1.546875, "learning_rate": 0.00074, "loss": 7.5077, "step": 370 }, { "epoch": 0.7455268389662028, "grad_norm": 1.4375, "learning_rate": 0.00075, "loss": 7.418, "step": 375 }, { "epoch": 0.7554671968190855, "grad_norm": 1.6171875, "learning_rate": 0.00076, "loss": 7.4668, "step": 380 }, { "epoch": 0.7654075546719682, "grad_norm": 1.6875, "learning_rate": 0.0007700000000000001, "loss": 7.5067, "step": 385 }, { "epoch": 0.7753479125248509, "grad_norm": 2.03125, "learning_rate": 0.0007800000000000001, "loss": 7.4381, "step": 390 }, { "epoch": 0.7852882703777336, "grad_norm": 1.7578125, "learning_rate": 0.00079, "loss": 7.4506, "step": 395 }, { "epoch": 0.7952286282306164, "grad_norm": 1.6796875, "learning_rate": 0.0008, "loss": 7.414, "step": 400 }, { "epoch": 0.805168986083499, "grad_norm": 1.453125, "learning_rate": 0.0008100000000000001, "loss": 7.4373, "step": 405 }, { "epoch": 0.8151093439363817, "grad_norm": 1.6015625, "learning_rate": 0.00082, "loss": 7.4198, "step": 410 }, { "epoch": 0.8250497017892644, "grad_norm": 1.53125, "learning_rate": 0.00083, "loss": 7.432, "step": 415 }, { "epoch": 0.8349900596421471, "grad_norm": 1.6796875, "learning_rate": 0.00084, "loss": 7.3658, "step": 420 }, { "epoch": 0.8449304174950298, "grad_norm": 1.5, "learning_rate": 0.00085, "loss": 7.4176, "step": 425 }, { "epoch": 0.8548707753479126, "grad_norm": 1.546875, "learning_rate": 0.00086, "loss": 7.3672, "step": 430 }, { "epoch": 0.8648111332007953, "grad_norm": 1.5546875, "learning_rate": 0.00087, "loss": 7.192, "step": 435 }, { "epoch": 0.8747514910536779, "grad_norm": 1.375, "learning_rate": 0.00088, "loss": 7.3876, "step": 440 }, { "epoch": 0.8846918489065606, "grad_norm": 1.4296875, "learning_rate": 0.0008900000000000001, "loss": 7.3364, "step": 445 }, { "epoch": 0.8946322067594433, "grad_norm": 1.453125, "learning_rate": 0.0009000000000000001, "loss": 7.4146, "step": 450 }, { "epoch": 0.904572564612326, "grad_norm": 1.4921875, "learning_rate": 0.00091, "loss": 7.3658, "step": 455 }, { "epoch": 0.9145129224652088, "grad_norm": 1.4921875, "learning_rate": 0.00092, "loss": 7.4027, "step": 460 }, { "epoch": 0.9244532803180915, "grad_norm": 1.484375, "learning_rate": 0.00093, "loss": 7.389, "step": 465 }, { "epoch": 0.9343936381709742, "grad_norm": 1.7265625, "learning_rate": 0.00094, "loss": 7.2114, "step": 470 }, { "epoch": 0.9443339960238568, "grad_norm": 1.640625, "learning_rate": 0.00095, "loss": 7.2808, "step": 475 }, { "epoch": 0.9542743538767395, "grad_norm": 1.5078125, "learning_rate": 0.00096, "loss": 7.2878, "step": 480 }, { "epoch": 0.9642147117296223, "grad_norm": 1.4921875, "learning_rate": 0.0009699999999999999, "loss": 7.2404, "step": 485 }, { "epoch": 0.974155069582505, "grad_norm": 1.4921875, "learning_rate": 0.00098, "loss": 7.1975, "step": 490 }, { "epoch": 0.9840954274353877, "grad_norm": 1.6796875, "learning_rate": 0.00099, "loss": 7.3406, "step": 495 }, { "epoch": 0.9940357852882704, "grad_norm": 1.4296875, "learning_rate": 0.001, "loss": 7.2716, "step": 500 }, { "epoch": 0.9940357852882704, "eval_loss": 7.370969295501709, "eval_runtime": 1.0019, "eval_samples_per_second": 3458.582, "eval_steps_per_second": 433.196, "step": 500 }, { "epoch": 1.0039761431411531, "grad_norm": 1.5703125, "learning_rate": 0.0009999972946377045, "loss": 7.1343, "step": 505 }, { "epoch": 1.0139165009940359, "grad_norm": 1.4375, "learning_rate": 0.0009999891785833469, "loss": 6.9134, "step": 510 }, { "epoch": 1.0238568588469186, "grad_norm": 2.671875, "learning_rate": 0.0009999756519345133, "loss": 7.1282, "step": 515 }, { "epoch": 1.0337972166998013, "grad_norm": 1.4453125, "learning_rate": 0.0009999567148538456, "loss": 7.0434, "step": 520 }, { "epoch": 1.0437375745526838, "grad_norm": 1.5234375, "learning_rate": 0.0009999323675690406, "loss": 6.9941, "step": 525 }, { "epoch": 1.0536779324055665, "grad_norm": 1.4921875, "learning_rate": 0.0009999026103728454, "loss": 7.0054, "step": 530 }, { "epoch": 1.0636182902584492, "grad_norm": 1.484375, "learning_rate": 0.0009998674436230558, "loss": 7.066, "step": 535 }, { "epoch": 1.073558648111332, "grad_norm": 2.640625, "learning_rate": 0.000999826867742511, "loss": 7.0517, "step": 540 }, { "epoch": 1.0834990059642147, "grad_norm": 1.5, "learning_rate": 0.0009997808832190884, "loss": 7.0508, "step": 545 }, { "epoch": 1.0934393638170974, "grad_norm": 1.625, "learning_rate": 0.0009997294906056982, "loss": 7.0144, "step": 550 }, { "epoch": 1.10337972166998, "grad_norm": 1.46875, "learning_rate": 0.000999672690520277, "loss": 7.0313, "step": 555 }, { "epoch": 1.1133200795228628, "grad_norm": 1.3046875, "learning_rate": 0.000999610483645779, "loss": 6.9706, "step": 560 }, { "epoch": 1.1232604373757455, "grad_norm": 1.4453125, "learning_rate": 0.0009995428707301694, "loss": 6.8887, "step": 565 }, { "epoch": 1.1332007952286283, "grad_norm": 1.453125, "learning_rate": 0.0009994698525864147, "loss": 7.0384, "step": 570 }, { "epoch": 1.143141153081511, "grad_norm": 1.4921875, "learning_rate": 0.0009993914300924726, "loss": 7.0223, "step": 575 }, { "epoch": 1.1530815109343937, "grad_norm": 1.453125, "learning_rate": 0.000999307604191282, "loss": 6.9796, "step": 580 }, { "epoch": 1.1630218687872764, "grad_norm": 1.390625, "learning_rate": 0.0009992183758907518, "loss": 6.9398, "step": 585 }, { "epoch": 1.1729622266401591, "grad_norm": 1.671875, "learning_rate": 0.0009991237462637478, "loss": 7.0023, "step": 590 }, { "epoch": 1.1829025844930419, "grad_norm": 1.6328125, "learning_rate": 0.000999023716448081, "loss": 6.9849, "step": 595 }, { "epoch": 1.1928429423459244, "grad_norm": 1.3203125, "learning_rate": 0.0009989182876464931, "loss": 6.7996, "step": 600 }, { "epoch": 1.202783300198807, "grad_norm": 1.5625, "learning_rate": 0.0009988074611266423, "loss": 6.9645, "step": 605 }, { "epoch": 1.2127236580516898, "grad_norm": 1.5390625, "learning_rate": 0.000998691238221088, "loss": 7.0538, "step": 610 }, { "epoch": 1.2226640159045725, "grad_norm": 1.4453125, "learning_rate": 0.0009985696203272752, "loss": 6.9212, "step": 615 }, { "epoch": 1.2326043737574552, "grad_norm": 1.3359375, "learning_rate": 0.0009984426089075168, "loss": 6.9601, "step": 620 }, { "epoch": 1.242544731610338, "grad_norm": 1.4375, "learning_rate": 0.000998310205488977, "loss": 7.0175, "step": 625 }, { "epoch": 1.2524850894632207, "grad_norm": 1.375, "learning_rate": 0.0009981724116636525, "loss": 6.9285, "step": 630 }, { "epoch": 1.2624254473161034, "grad_norm": 1.515625, "learning_rate": 0.0009980292290883526, "loss": 6.9276, "step": 635 }, { "epoch": 1.2723658051689861, "grad_norm": 1.4609375, "learning_rate": 0.000997880659484681, "loss": 6.898, "step": 640 }, { "epoch": 1.2823061630218688, "grad_norm": 1.328125, "learning_rate": 0.0009977267046390138, "loss": 6.9623, "step": 645 }, { "epoch": 1.2922465208747516, "grad_norm": 1.3671875, "learning_rate": 0.000997567366402478, "loss": 6.9184, "step": 650 }, { "epoch": 1.302186878727634, "grad_norm": 1.40625, "learning_rate": 0.0009974026466909299, "loss": 6.8785, "step": 655 }, { "epoch": 1.3121272365805168, "grad_norm": 1.484375, "learning_rate": 0.000997232547484932, "loss": 6.9804, "step": 660 }, { "epoch": 1.3220675944333995, "grad_norm": 1.921875, "learning_rate": 0.0009970570708297281, "loss": 6.9272, "step": 665 }, { "epoch": 1.3320079522862822, "grad_norm": 2.34375, "learning_rate": 0.0009968762188352208, "loss": 6.8377, "step": 670 }, { "epoch": 1.341948310139165, "grad_norm": 1.578125, "learning_rate": 0.0009966899936759436, "loss": 6.8753, "step": 675 }, { "epoch": 1.3518886679920477, "grad_norm": 1.328125, "learning_rate": 0.0009964983975910369, "loss": 6.8766, "step": 680 }, { "epoch": 1.3618290258449304, "grad_norm": 1.5234375, "learning_rate": 0.0009963014328842196, "loss": 6.7019, "step": 685 }, { "epoch": 1.371769383697813, "grad_norm": 1.4375, "learning_rate": 0.0009960991019237627, "loss": 6.8576, "step": 690 }, { "epoch": 1.3817097415506958, "grad_norm": 1.3125, "learning_rate": 0.0009958914071424596, "loss": 6.8327, "step": 695 }, { "epoch": 1.3916500994035785, "grad_norm": 1.3984375, "learning_rate": 0.0009956783510375975, "loss": 6.8854, "step": 700 }, { "epoch": 1.4015904572564613, "grad_norm": 1.375, "learning_rate": 0.0009954599361709276, "loss": 6.8141, "step": 705 }, { "epoch": 1.411530815109344, "grad_norm": 1.40625, "learning_rate": 0.0009952361651686331, "loss": 6.8819, "step": 710 }, { "epoch": 1.4214711729622267, "grad_norm": 1.390625, "learning_rate": 0.0009950070407212996, "loss": 6.7342, "step": 715 }, { "epoch": 1.4314115308151094, "grad_norm": 1.4375, "learning_rate": 0.0009947725655838806, "loss": 6.9538, "step": 720 }, { "epoch": 1.4413518886679921, "grad_norm": 1.34375, "learning_rate": 0.0009945327425756661, "loss": 6.7637, "step": 725 }, { "epoch": 1.4512922465208749, "grad_norm": 1.3671875, "learning_rate": 0.000994287574580248, "loss": 6.8231, "step": 730 }, { "epoch": 1.4612326043737576, "grad_norm": 1.3828125, "learning_rate": 0.0009940370645454848, "loss": 6.7927, "step": 735 }, { "epoch": 1.4711729622266403, "grad_norm": 1.3984375, "learning_rate": 0.000993781215483467, "loss": 6.7573, "step": 740 }, { "epoch": 1.4811133200795228, "grad_norm": 1.4765625, "learning_rate": 0.0009935200304704815, "loss": 6.8108, "step": 745 }, { "epoch": 1.4910536779324055, "grad_norm": 1.3359375, "learning_rate": 0.0009932535126469725, "loss": 6.7694, "step": 750 }, { "epoch": 1.5009940357852882, "grad_norm": 1.4765625, "learning_rate": 0.0009929816652175063, "loss": 6.8232, "step": 755 }, { "epoch": 1.510934393638171, "grad_norm": 1.28125, "learning_rate": 0.00099270449145073, "loss": 6.7734, "step": 760 }, { "epoch": 1.5208747514910537, "grad_norm": 1.46875, "learning_rate": 0.0009924219946793353, "loss": 6.7136, "step": 765 }, { "epoch": 1.5308151093439364, "grad_norm": 1.328125, "learning_rate": 0.0009921341783000158, "loss": 6.7783, "step": 770 }, { "epoch": 1.540755467196819, "grad_norm": 1.3828125, "learning_rate": 0.000991841045773427, "loss": 6.7885, "step": 775 }, { "epoch": 1.5506958250497018, "grad_norm": 1.3046875, "learning_rate": 0.000991542600624146, "loss": 6.8011, "step": 780 }, { "epoch": 1.5606361829025845, "grad_norm": 1.3984375, "learning_rate": 0.0009912388464406265, "loss": 6.7833, "step": 785 }, { "epoch": 1.570576540755467, "grad_norm": 1.4375, "learning_rate": 0.0009909297868751585, "loss": 6.8117, "step": 790 }, { "epoch": 1.5805168986083498, "grad_norm": 1.375, "learning_rate": 0.0009906154256438223, "loss": 6.7266, "step": 795 }, { "epoch": 1.5904572564612325, "grad_norm": 1.28125, "learning_rate": 0.0009902957665264443, "loss": 6.7726, "step": 800 }, { "epoch": 1.6003976143141152, "grad_norm": 1.3046875, "learning_rate": 0.0009899708133665529, "loss": 6.6905, "step": 805 }, { "epoch": 1.610337972166998, "grad_norm": 1.4140625, "learning_rate": 0.0009896405700713295, "loss": 6.7339, "step": 810 }, { "epoch": 1.6202783300198806, "grad_norm": 1.28125, "learning_rate": 0.000989305040611565, "loss": 6.7015, "step": 815 }, { "epoch": 1.6302186878727634, "grad_norm": 1.3359375, "learning_rate": 0.0009889642290216085, "loss": 6.8003, "step": 820 }, { "epoch": 1.640159045725646, "grad_norm": 1.390625, "learning_rate": 0.0009886181393993223, "loss": 6.5606, "step": 825 }, { "epoch": 1.6500994035785288, "grad_norm": 1.421875, "learning_rate": 0.0009882667759060298, "loss": 6.6673, "step": 830 }, { "epoch": 1.6600397614314115, "grad_norm": 1.453125, "learning_rate": 0.0009879101427664662, "loss": 6.756, "step": 835 }, { "epoch": 1.6699801192842942, "grad_norm": 1.359375, "learning_rate": 0.0009875482442687294, "loss": 6.7176, "step": 840 }, { "epoch": 1.679920477137177, "grad_norm": 1.3125, "learning_rate": 0.0009871810847642258, "loss": 6.7124, "step": 845 }, { "epoch": 1.6898608349900597, "grad_norm": 1.28125, "learning_rate": 0.00098680866866762, "loss": 6.6817, "step": 850 }, { "epoch": 1.6998011928429424, "grad_norm": 1.34375, "learning_rate": 0.0009864310004567807, "loss": 6.6662, "step": 855 }, { "epoch": 1.7097415506958251, "grad_norm": 1.3046875, "learning_rate": 0.000986048084672727, "loss": 6.7013, "step": 860 }, { "epoch": 1.7196819085487078, "grad_norm": 1.3515625, "learning_rate": 0.0009856599259195741, "loss": 6.7312, "step": 865 }, { "epoch": 1.7296222664015906, "grad_norm": 1.4296875, "learning_rate": 0.0009852665288644783, "loss": 6.733, "step": 870 }, { "epoch": 1.7395626242544733, "grad_norm": 1.3984375, "learning_rate": 0.000984867898237579, "loss": 6.7149, "step": 875 }, { "epoch": 1.749502982107356, "grad_norm": 1.34375, "learning_rate": 0.000984464038831945, "loss": 6.6369, "step": 880 }, { "epoch": 1.7594433399602387, "grad_norm": 1.2578125, "learning_rate": 0.0009840549555035136, "loss": 6.6816, "step": 885 }, { "epoch": 1.7693836978131214, "grad_norm": 1.4375, "learning_rate": 0.0009836406531710342, "loss": 6.6142, "step": 890 }, { "epoch": 1.779324055666004, "grad_norm": 1.3984375, "learning_rate": 0.0009832211368160087, "loss": 6.661, "step": 895 }, { "epoch": 1.7892644135188867, "grad_norm": 1.4453125, "learning_rate": 0.0009827964114826314, "loss": 6.7227, "step": 900 }, { "epoch": 1.7992047713717694, "grad_norm": 1.28125, "learning_rate": 0.0009823664822777285, "loss": 6.6658, "step": 905 }, { "epoch": 1.809145129224652, "grad_norm": 1.265625, "learning_rate": 0.000981931354370697, "loss": 6.6343, "step": 910 }, { "epoch": 1.8190854870775348, "grad_norm": 1.3984375, "learning_rate": 0.0009814910329934414, "loss": 6.6246, "step": 915 }, { "epoch": 1.8290258449304175, "grad_norm": 2.109375, "learning_rate": 0.0009810455234403126, "loss": 6.6103, "step": 920 }, { "epoch": 1.8389662027833003, "grad_norm": 1.546875, "learning_rate": 0.000980594831068043, "loss": 6.6444, "step": 925 }, { "epoch": 1.8489065606361827, "grad_norm": 1.3203125, "learning_rate": 0.0009801389612956815, "loss": 6.58, "step": 930 }, { "epoch": 1.8588469184890655, "grad_norm": 1.21875, "learning_rate": 0.0009796779196045303, "loss": 6.4323, "step": 935 }, { "epoch": 1.8687872763419482, "grad_norm": 1.375, "learning_rate": 0.0009792117115380774, "loss": 6.5837, "step": 940 }, { "epoch": 1.878727634194831, "grad_norm": 1.3515625, "learning_rate": 0.0009787403427019303, "loss": 6.5917, "step": 945 }, { "epoch": 1.8886679920477136, "grad_norm": 1.234375, "learning_rate": 0.000978263818763749, "loss": 6.5789, "step": 950 }, { "epoch": 1.8986083499005963, "grad_norm": 1.3203125, "learning_rate": 0.0009777821454531775, "loss": 6.6152, "step": 955 }, { "epoch": 1.908548707753479, "grad_norm": 1.1953125, "learning_rate": 0.0009772953285617748, "loss": 6.5594, "step": 960 }, { "epoch": 1.9184890656063618, "grad_norm": 1.3203125, "learning_rate": 0.0009768033739429459, "loss": 6.5918, "step": 965 }, { "epoch": 1.9284294234592445, "grad_norm": 1.4140625, "learning_rate": 0.0009763062875118706, "loss": 6.5099, "step": 970 }, { "epoch": 1.9383697813121272, "grad_norm": 1.21875, "learning_rate": 0.0009758040752454326, "loss": 6.5962, "step": 975 }, { "epoch": 1.94831013916501, "grad_norm": 1.296875, "learning_rate": 0.0009752967431821485, "loss": 6.6482, "step": 980 }, { "epoch": 1.9582504970178927, "grad_norm": 1.359375, "learning_rate": 0.0009747842974220936, "loss": 6.5015, "step": 985 }, { "epoch": 1.9681908548707754, "grad_norm": 1.25, "learning_rate": 0.00097426674412683, "loss": 6.5312, "step": 990 }, { "epoch": 1.978131212723658, "grad_norm": 1.359375, "learning_rate": 0.0009737440895193317, "loss": 6.7128, "step": 995 }, { "epoch": 1.9880715705765408, "grad_norm": 1.3125, "learning_rate": 0.0009732163398839106, "loss": 6.4696, "step": 1000 }, { "epoch": 1.9880715705765408, "eval_loss": 6.7939839363098145, "eval_runtime": 0.998, "eval_samples_per_second": 3471.994, "eval_steps_per_second": 434.876, "step": 1000 }, { "epoch": 1.9980119284294235, "grad_norm": 1.4765625, "learning_rate": 0.0009726835015661391, "loss": 6.5883, "step": 1005 }, { "epoch": 2.0079522862823063, "grad_norm": 1.25, "learning_rate": 0.0009721455809727765, "loss": 6.1925, "step": 1010 }, { "epoch": 2.017892644135189, "grad_norm": 1.3359375, "learning_rate": 0.0009716025845716894, "loss": 5.9461, "step": 1015 }, { "epoch": 2.0278330019880717, "grad_norm": 1.3046875, "learning_rate": 0.0009710545188917757, "loss": 6.1817, "step": 1020 }, { "epoch": 2.0377733598409544, "grad_norm": 1.40625, "learning_rate": 0.0009705013905228854, "loss": 6.1649, "step": 1025 }, { "epoch": 2.047713717693837, "grad_norm": 1.3359375, "learning_rate": 0.0009699432061157414, "loss": 6.1353, "step": 1030 }, { "epoch": 2.05765407554672, "grad_norm": 1.265625, "learning_rate": 0.0009693799723818591, "loss": 6.259, "step": 1035 }, { "epoch": 2.0675944333996026, "grad_norm": 1.4375, "learning_rate": 0.0009688116960934669, "loss": 6.2553, "step": 1040 }, { "epoch": 2.0775347912524853, "grad_norm": 1.1953125, "learning_rate": 0.0009682383840834234, "loss": 6.1891, "step": 1045 }, { "epoch": 2.0874751491053676, "grad_norm": 2.21875, "learning_rate": 0.0009676600432451364, "loss": 6.1401, "step": 1050 }, { "epoch": 2.0974155069582503, "grad_norm": 1.453125, "learning_rate": 0.0009670766805324789, "loss": 6.2547, "step": 1055 }, { "epoch": 2.107355864811133, "grad_norm": 1.3046875, "learning_rate": 0.0009664883029597066, "loss": 6.1731, "step": 1060 }, { "epoch": 2.1172962226640157, "grad_norm": 1.3984375, "learning_rate": 0.0009658949176013729, "loss": 6.1449, "step": 1065 }, { "epoch": 2.1272365805168985, "grad_norm": 1.328125, "learning_rate": 0.0009652965315922438, "loss": 6.1843, "step": 1070 }, { "epoch": 2.137176938369781, "grad_norm": 1.3984375, "learning_rate": 0.0009646931521272123, "loss": 6.1649, "step": 1075 }, { "epoch": 2.147117296222664, "grad_norm": 1.359375, "learning_rate": 0.0009640847864612124, "loss": 6.0978, "step": 1080 }, { "epoch": 2.1570576540755466, "grad_norm": 1.4296875, "learning_rate": 0.0009634714419091302, "loss": 6.1817, "step": 1085 }, { "epoch": 2.1669980119284293, "grad_norm": 1.28125, "learning_rate": 0.0009628531258457185, "loss": 6.2633, "step": 1090 }, { "epoch": 2.176938369781312, "grad_norm": 1.4453125, "learning_rate": 0.0009622298457055056, "loss": 6.0898, "step": 1095 }, { "epoch": 2.1868787276341948, "grad_norm": 1.515625, "learning_rate": 0.0009616016089827078, "loss": 6.1527, "step": 1100 }, { "epoch": 2.1968190854870775, "grad_norm": 1.515625, "learning_rate": 0.0009609684232311378, "loss": 6.0889, "step": 1105 }, { "epoch": 2.20675944333996, "grad_norm": 1.34375, "learning_rate": 0.0009603302960641154, "loss": 6.0709, "step": 1110 }, { "epoch": 2.216699801192843, "grad_norm": 1.390625, "learning_rate": 0.0009596872351543742, "loss": 6.1364, "step": 1115 }, { "epoch": 2.2266401590457257, "grad_norm": 1.3828125, "learning_rate": 0.0009590392482339713, "loss": 6.0234, "step": 1120 }, { "epoch": 2.2365805168986084, "grad_norm": 1.390625, "learning_rate": 0.0009583863430941926, "loss": 6.0574, "step": 1125 }, { "epoch": 2.246520874751491, "grad_norm": 1.453125, "learning_rate": 0.0009577285275854602, "loss": 6.0683, "step": 1130 }, { "epoch": 2.256461232604374, "grad_norm": 1.4296875, "learning_rate": 0.0009570658096172374, "loss": 6.1882, "step": 1135 }, { "epoch": 2.2664015904572565, "grad_norm": 1.375, "learning_rate": 0.0009563981971579342, "loss": 6.209, "step": 1140 }, { "epoch": 2.2763419483101393, "grad_norm": 1.3515625, "learning_rate": 0.0009557256982348107, "loss": 6.1383, "step": 1145 }, { "epoch": 2.286282306163022, "grad_norm": 1.296875, "learning_rate": 0.0009550483209338814, "loss": 6.2263, "step": 1150 }, { "epoch": 2.2962226640159047, "grad_norm": 1.3125, "learning_rate": 0.0009543660733998174, "loss": 6.1201, "step": 1155 }, { "epoch": 2.3061630218687874, "grad_norm": 1.4140625, "learning_rate": 0.0009536789638358488, "loss": 6.1434, "step": 1160 }, { "epoch": 2.31610337972167, "grad_norm": 1.4140625, "learning_rate": 0.000952987000503666, "loss": 6.1207, "step": 1165 }, { "epoch": 2.326043737574553, "grad_norm": 1.3828125, "learning_rate": 0.0009522901917233196, "loss": 5.9989, "step": 1170 }, { "epoch": 2.3359840954274356, "grad_norm": 1.390625, "learning_rate": 0.000951588545873122, "loss": 6.0769, "step": 1175 }, { "epoch": 2.3459244532803183, "grad_norm": 1.3203125, "learning_rate": 0.0009508820713895454, "loss": 6.0889, "step": 1180 }, { "epoch": 2.355864811133201, "grad_norm": 1.3671875, "learning_rate": 0.0009501707767671204, "loss": 6.207, "step": 1185 }, { "epoch": 2.3658051689860837, "grad_norm": 1.328125, "learning_rate": 0.0009494546705583344, "loss": 6.2066, "step": 1190 }, { "epoch": 2.3757455268389664, "grad_norm": 1.34375, "learning_rate": 0.0009487337613735288, "loss": 6.1389, "step": 1195 }, { "epoch": 2.3856858846918487, "grad_norm": 1.34375, "learning_rate": 0.0009480080578807941, "loss": 6.0482, "step": 1200 }, { "epoch": 2.3956262425447314, "grad_norm": 1.4140625, "learning_rate": 0.0009472775688058681, "loss": 6.157, "step": 1205 }, { "epoch": 2.405566600397614, "grad_norm": 1.3515625, "learning_rate": 0.0009465423029320288, "loss": 6.1028, "step": 1210 }, { "epoch": 2.415506958250497, "grad_norm": 1.390625, "learning_rate": 0.0009458022690999899, "loss": 6.091, "step": 1215 }, { "epoch": 2.4254473161033796, "grad_norm": 1.25, "learning_rate": 0.000945057476207794, "loss": 6.2019, "step": 1220 }, { "epoch": 2.4353876739562623, "grad_norm": 1.28125, "learning_rate": 0.0009443079332107064, "loss": 6.0989, "step": 1225 }, { "epoch": 2.445328031809145, "grad_norm": 1.296875, "learning_rate": 0.0009435536491211062, "loss": 6.127, "step": 1230 }, { "epoch": 2.4552683896620278, "grad_norm": 1.5390625, "learning_rate": 0.0009427946330083791, "loss": 6.2099, "step": 1235 }, { "epoch": 2.4652087475149105, "grad_norm": 1.375, "learning_rate": 0.0009420308939988073, "loss": 6.1458, "step": 1240 }, { "epoch": 2.475149105367793, "grad_norm": 1.234375, "learning_rate": 0.000941262441275461, "loss": 6.2149, "step": 1245 }, { "epoch": 2.485089463220676, "grad_norm": 1.5, "learning_rate": 0.0009404892840780868, "loss": 6.1122, "step": 1250 }, { "epoch": 2.4950298210735586, "grad_norm": 1.3359375, "learning_rate": 0.0009397114317029974, "loss": 6.0771, "step": 1255 }, { "epoch": 2.5049701789264414, "grad_norm": 1.53125, "learning_rate": 0.0009389288935029595, "loss": 6.1618, "step": 1260 }, { "epoch": 2.514910536779324, "grad_norm": 1.328125, "learning_rate": 0.0009381416788870807, "loss": 6.2248, "step": 1265 }, { "epoch": 2.524850894632207, "grad_norm": 1.453125, "learning_rate": 0.0009373497973206984, "loss": 6.1093, "step": 1270 }, { "epoch": 2.5347912524850895, "grad_norm": 1.359375, "learning_rate": 0.0009365532583252634, "loss": 6.148, "step": 1275 }, { "epoch": 2.5447316103379722, "grad_norm": 1.3671875, "learning_rate": 0.0009357520714782273, "loss": 6.1547, "step": 1280 }, { "epoch": 2.554671968190855, "grad_norm": 1.3515625, "learning_rate": 0.0009349462464129264, "loss": 6.1477, "step": 1285 }, { "epoch": 2.5646123260437377, "grad_norm": 1.390625, "learning_rate": 0.000934135792818466, "loss": 6.0481, "step": 1290 }, { "epoch": 2.5745526838966204, "grad_norm": 1.734375, "learning_rate": 0.0009333207204396049, "loss": 6.1616, "step": 1295 }, { "epoch": 2.584493041749503, "grad_norm": 1.4453125, "learning_rate": 0.0009325010390766362, "loss": 6.0867, "step": 1300 }, { "epoch": 2.594433399602386, "grad_norm": 1.3984375, "learning_rate": 0.0009316767585852716, "loss": 6.1963, "step": 1305 }, { "epoch": 2.604373757455268, "grad_norm": 1.421875, "learning_rate": 0.0009308478888765214, "loss": 6.0798, "step": 1310 }, { "epoch": 2.614314115308151, "grad_norm": 1.3359375, "learning_rate": 0.0009300144399165763, "loss": 6.0295, "step": 1315 }, { "epoch": 2.6242544731610336, "grad_norm": 1.4453125, "learning_rate": 0.0009291764217266869, "loss": 6.1378, "step": 1320 }, { "epoch": 2.6341948310139163, "grad_norm": 1.21875, "learning_rate": 0.0009283338443830432, "loss": 6.2136, "step": 1325 }, { "epoch": 2.644135188866799, "grad_norm": 1.3046875, "learning_rate": 0.0009274867180166542, "loss": 6.0917, "step": 1330 }, { "epoch": 2.6540755467196817, "grad_norm": 1.4375, "learning_rate": 0.0009266350528132253, "loss": 6.1464, "step": 1335 }, { "epoch": 2.6640159045725644, "grad_norm": 1.3828125, "learning_rate": 0.0009257788590130365, "loss": 6.1729, "step": 1340 }, { "epoch": 2.673956262425447, "grad_norm": 1.5, "learning_rate": 0.0009249181469108181, "loss": 6.1581, "step": 1345 }, { "epoch": 2.68389662027833, "grad_norm": 1.3359375, "learning_rate": 0.0009240529268556283, "loss": 6.1723, "step": 1350 }, { "epoch": 2.6938369781312126, "grad_norm": 1.3828125, "learning_rate": 0.0009231832092507283, "loss": 6.0568, "step": 1355 }, { "epoch": 2.7037773359840953, "grad_norm": 1.4609375, "learning_rate": 0.0009223090045534567, "loss": 6.1985, "step": 1360 }, { "epoch": 2.713717693836978, "grad_norm": 1.390625, "learning_rate": 0.0009214303232751044, "loss": 6.0739, "step": 1365 }, { "epoch": 2.7236580516898607, "grad_norm": 1.3046875, "learning_rate": 0.0009205471759807874, "loss": 5.9764, "step": 1370 }, { "epoch": 2.7335984095427435, "grad_norm": 1.3671875, "learning_rate": 0.0009196595732893213, "loss": 6.1165, "step": 1375 }, { "epoch": 2.743538767395626, "grad_norm": 1.3359375, "learning_rate": 0.0009187675258730918, "loss": 6.1814, "step": 1380 }, { "epoch": 2.753479125248509, "grad_norm": 1.40625, "learning_rate": 0.0009178710444579277, "loss": 6.1311, "step": 1385 }, { "epoch": 2.7634194831013916, "grad_norm": 1.421875, "learning_rate": 0.0009169701398229713, "loss": 6.1689, "step": 1390 }, { "epoch": 2.7733598409542743, "grad_norm": 1.390625, "learning_rate": 0.000916064822800549, "loss": 6.1812, "step": 1395 }, { "epoch": 2.783300198807157, "grad_norm": 1.3828125, "learning_rate": 0.0009151551042760408, "loss": 6.0078, "step": 1400 }, { "epoch": 2.79324055666004, "grad_norm": 1.4140625, "learning_rate": 0.0009142409951877497, "loss": 6.1, "step": 1405 }, { "epoch": 2.8031809145129225, "grad_norm": 1.203125, "learning_rate": 0.0009133225065267707, "loss": 6.1481, "step": 1410 }, { "epoch": 2.8131212723658052, "grad_norm": 1.3203125, "learning_rate": 0.000912399649336857, "loss": 6.1574, "step": 1415 }, { "epoch": 2.823061630218688, "grad_norm": 1.375, "learning_rate": 0.0009114724347142892, "loss": 6.0991, "step": 1420 }, { "epoch": 2.8330019880715707, "grad_norm": 1.375, "learning_rate": 0.0009105408738077402, "loss": 6.1241, "step": 1425 }, { "epoch": 2.8429423459244534, "grad_norm": 1.78125, "learning_rate": 0.0009096049778181426, "loss": 6.0701, "step": 1430 }, { "epoch": 2.852882703777336, "grad_norm": 1.4296875, "learning_rate": 0.0009086647579985526, "loss": 5.9656, "step": 1435 }, { "epoch": 2.862823061630219, "grad_norm": 2.859375, "learning_rate": 0.0009077202256540159, "loss": 5.9564, "step": 1440 }, { "epoch": 2.8727634194831015, "grad_norm": 1.3671875, "learning_rate": 0.0009067713921414313, "loss": 6.1301, "step": 1445 }, { "epoch": 2.8827037773359843, "grad_norm": 1.2421875, "learning_rate": 0.0009058182688694137, "loss": 6.1673, "step": 1450 }, { "epoch": 2.892644135188867, "grad_norm": 1.3515625, "learning_rate": 0.0009048608672981576, "loss": 6.0005, "step": 1455 }, { "epoch": 2.9025844930417497, "grad_norm": 1.2734375, "learning_rate": 0.0009038991989392992, "loss": 6.1207, "step": 1460 }, { "epoch": 2.9125248508946324, "grad_norm": 1.359375, "learning_rate": 0.0009029332753557776, "loss": 6.0328, "step": 1465 }, { "epoch": 2.922465208747515, "grad_norm": 1.6015625, "learning_rate": 0.0009019631081616963, "loss": 6.1406, "step": 1470 }, { "epoch": 2.932405566600398, "grad_norm": 1.34375, "learning_rate": 0.0009009887090221828, "loss": 6.117, "step": 1475 }, { "epoch": 2.9423459244532806, "grad_norm": 1.3359375, "learning_rate": 0.0009000100896532492, "loss": 6.164, "step": 1480 }, { "epoch": 2.952286282306163, "grad_norm": 1.3984375, "learning_rate": 0.0008990272618216508, "loss": 6.1009, "step": 1485 }, { "epoch": 2.9622266401590456, "grad_norm": 1.28125, "learning_rate": 0.0008980402373447446, "loss": 5.9912, "step": 1490 }, { "epoch": 2.9721669980119283, "grad_norm": 1.3125, "learning_rate": 0.0008970490280903477, "loss": 6.1235, "step": 1495 }, { "epoch": 2.982107355864811, "grad_norm": 1.3359375, "learning_rate": 0.000896053645976594, "loss": 6.06, "step": 1500 }, { "epoch": 2.982107355864811, "eval_loss": 6.611093044281006, "eval_runtime": 0.9899, "eval_samples_per_second": 3500.511, "eval_steps_per_second": 438.448, "step": 1500 }, { "epoch": 2.9920477137176937, "grad_norm": 1.3125, "learning_rate": 0.0008950541029717912, "loss": 6.1042, "step": 1505 }, { "epoch": 3.0019880715705765, "grad_norm": 1.34375, "learning_rate": 0.0008940504110942771, "loss": 6.0011, "step": 1510 }, { "epoch": 3.011928429423459, "grad_norm": 1.3671875, "learning_rate": 0.0008930425824122744, "loss": 5.6549, "step": 1515 }, { "epoch": 3.021868787276342, "grad_norm": 1.2734375, "learning_rate": 0.0008920306290437462, "loss": 5.6925, "step": 1520 }, { "epoch": 3.0318091451292246, "grad_norm": 1.359375, "learning_rate": 0.0008910145631562507, "loss": 5.671, "step": 1525 }, { "epoch": 3.0417495029821073, "grad_norm": 1.375, "learning_rate": 0.0008899943969667932, "loss": 5.6636, "step": 1530 }, { "epoch": 3.05168986083499, "grad_norm": 1.3984375, "learning_rate": 0.0008889701427416815, "loss": 5.6678, "step": 1535 }, { "epoch": 3.0616302186878728, "grad_norm": 1.3046875, "learning_rate": 0.0008879418127963767, "loss": 5.6669, "step": 1540 }, { "epoch": 3.0715705765407555, "grad_norm": 1.375, "learning_rate": 0.0008869094194953455, "loss": 5.6231, "step": 1545 }, { "epoch": 3.081510934393638, "grad_norm": 1.359375, "learning_rate": 0.0008858729752519121, "loss": 5.6009, "step": 1550 }, { "epoch": 3.091451292246521, "grad_norm": 1.3046875, "learning_rate": 0.0008848324925281085, "loss": 5.6402, "step": 1555 }, { "epoch": 3.1013916500994037, "grad_norm": 1.359375, "learning_rate": 0.0008837879838345245, "loss": 5.53, "step": 1560 }, { "epoch": 3.1113320079522864, "grad_norm": 1.390625, "learning_rate": 0.0008827394617301576, "loss": 5.6391, "step": 1565 }, { "epoch": 3.121272365805169, "grad_norm": 1.453125, "learning_rate": 0.0008816869388222618, "loss": 5.6437, "step": 1570 }, { "epoch": 3.131212723658052, "grad_norm": 1.421875, "learning_rate": 0.0008806304277661964, "loss": 5.6641, "step": 1575 }, { "epoch": 3.1411530815109345, "grad_norm": 1.5234375, "learning_rate": 0.0008795699412652732, "loss": 5.7165, "step": 1580 }, { "epoch": 3.1510934393638173, "grad_norm": 1.4765625, "learning_rate": 0.0008785054920706039, "loss": 5.5469, "step": 1585 }, { "epoch": 3.1610337972167, "grad_norm": 1.3515625, "learning_rate": 0.0008774370929809475, "loss": 5.6662, "step": 1590 }, { "epoch": 3.1709741550695827, "grad_norm": 1.390625, "learning_rate": 0.0008763647568425557, "loss": 5.6708, "step": 1595 }, { "epoch": 3.1809145129224654, "grad_norm": 1.4609375, "learning_rate": 0.0008752884965490185, "loss": 5.7795, "step": 1600 }, { "epoch": 3.1908548707753477, "grad_norm": 1.5234375, "learning_rate": 0.0008742083250411091, "loss": 5.7075, "step": 1605 }, { "epoch": 3.2007952286282304, "grad_norm": 1.40625, "learning_rate": 0.0008731242553066287, "loss": 5.7067, "step": 1610 }, { "epoch": 3.210735586481113, "grad_norm": 1.40625, "learning_rate": 0.0008720363003802503, "loss": 5.6134, "step": 1615 }, { "epoch": 3.220675944333996, "grad_norm": 1.3984375, "learning_rate": 0.0008709444733433617, "loss": 5.7087, "step": 1620 }, { "epoch": 3.2306163021868786, "grad_norm": 1.25, "learning_rate": 0.0008698487873239079, "loss": 5.6931, "step": 1625 }, { "epoch": 3.2405566600397613, "grad_norm": 1.3203125, "learning_rate": 0.0008687492554962345, "loss": 5.6945, "step": 1630 }, { "epoch": 3.250497017892644, "grad_norm": 1.4140625, "learning_rate": 0.0008676458910809273, "loss": 5.551, "step": 1635 }, { "epoch": 3.2604373757455267, "grad_norm": 1.2734375, "learning_rate": 0.0008665387073446556, "loss": 5.6124, "step": 1640 }, { "epoch": 3.2703777335984094, "grad_norm": 1.3359375, "learning_rate": 0.000865427717600011, "loss": 5.7552, "step": 1645 }, { "epoch": 3.280318091451292, "grad_norm": 1.375, "learning_rate": 0.0008643129352053478, "loss": 5.5644, "step": 1650 }, { "epoch": 3.290258449304175, "grad_norm": 1.453125, "learning_rate": 0.0008631943735646231, "loss": 5.5357, "step": 1655 }, { "epoch": 3.3001988071570576, "grad_norm": 1.4296875, "learning_rate": 0.0008620720461272344, "loss": 5.64, "step": 1660 }, { "epoch": 3.3101391650099403, "grad_norm": 1.5234375, "learning_rate": 0.0008609459663878586, "loss": 5.6495, "step": 1665 }, { "epoch": 3.320079522862823, "grad_norm": 1.2890625, "learning_rate": 0.00085981614788629, "loss": 5.6324, "step": 1670 }, { "epoch": 3.3300198807157058, "grad_norm": 1.3671875, "learning_rate": 0.0008586826042072768, "loss": 5.7362, "step": 1675 }, { "epoch": 3.3399602385685885, "grad_norm": 1.3828125, "learning_rate": 0.0008575453489803583, "loss": 5.6626, "step": 1680 }, { "epoch": 3.349900596421471, "grad_norm": 1.4296875, "learning_rate": 0.0008564043958797008, "loss": 5.6344, "step": 1685 }, { "epoch": 3.359840954274354, "grad_norm": 1.328125, "learning_rate": 0.0008552597586239333, "loss": 5.7596, "step": 1690 }, { "epoch": 3.3697813121272366, "grad_norm": 1.4375, "learning_rate": 0.0008541114509759821, "loss": 5.7374, "step": 1695 }, { "epoch": 3.3797216699801194, "grad_norm": 1.328125, "learning_rate": 0.0008529594867429059, "loss": 5.7173, "step": 1700 }, { "epoch": 3.389662027833002, "grad_norm": 1.578125, "learning_rate": 0.0008518038797757299, "loss": 5.7058, "step": 1705 }, { "epoch": 3.399602385685885, "grad_norm": 1.46875, "learning_rate": 0.0008506446439692784, "loss": 5.6639, "step": 1710 }, { "epoch": 3.4095427435387675, "grad_norm": 1.4296875, "learning_rate": 0.0008494817932620086, "loss": 5.524, "step": 1715 }, { "epoch": 3.4194831013916502, "grad_norm": 1.375, "learning_rate": 0.0008483153416358423, "loss": 5.7222, "step": 1720 }, { "epoch": 3.429423459244533, "grad_norm": 1.4453125, "learning_rate": 0.0008471453031159987, "loss": 5.7485, "step": 1725 }, { "epoch": 3.4393638170974157, "grad_norm": 1.3828125, "learning_rate": 0.0008459716917708248, "loss": 5.6887, "step": 1730 }, { "epoch": 3.4493041749502984, "grad_norm": 1.3984375, "learning_rate": 0.0008447945217116265, "loss": 5.674, "step": 1735 }, { "epoch": 3.459244532803181, "grad_norm": 1.4609375, "learning_rate": 0.0008436138070924997, "loss": 5.7669, "step": 1740 }, { "epoch": 3.469184890656064, "grad_norm": 1.390625, "learning_rate": 0.000842429562110159, "loss": 5.7269, "step": 1745 }, { "epoch": 3.4791252485089466, "grad_norm": 1.296875, "learning_rate": 0.0008412418010037673, "loss": 5.6269, "step": 1750 }, { "epoch": 3.4890656063618293, "grad_norm": 1.3203125, "learning_rate": 0.0008400505380547655, "loss": 5.6623, "step": 1755 }, { "epoch": 3.4990059642147116, "grad_norm": 1.4375, "learning_rate": 0.0008388557875866995, "loss": 5.6936, "step": 1760 }, { "epoch": 3.5089463220675943, "grad_norm": 1.46875, "learning_rate": 0.0008376575639650489, "loss": 5.6208, "step": 1765 }, { "epoch": 3.518886679920477, "grad_norm": 1.40625, "learning_rate": 0.0008364558815970536, "loss": 5.7239, "step": 1770 }, { "epoch": 3.5288270377733597, "grad_norm": 1.4140625, "learning_rate": 0.0008352507549315407, "loss": 5.5989, "step": 1775 }, { "epoch": 3.5387673956262424, "grad_norm": 1.3046875, "learning_rate": 0.0008340421984587517, "loss": 5.7711, "step": 1780 }, { "epoch": 3.548707753479125, "grad_norm": 1.40625, "learning_rate": 0.000832830226710167, "loss": 5.6808, "step": 1785 }, { "epoch": 3.558648111332008, "grad_norm": 1.3359375, "learning_rate": 0.0008316148542583319, "loss": 5.7268, "step": 1790 }, { "epoch": 3.5685884691848906, "grad_norm": 2.078125, "learning_rate": 0.000830396095716681, "loss": 5.4095, "step": 1795 }, { "epoch": 3.5785288270377733, "grad_norm": 1.484375, "learning_rate": 0.0008291739657393626, "loss": 5.4744, "step": 1800 }, { "epoch": 3.588469184890656, "grad_norm": 1.3125, "learning_rate": 0.0008279484790210632, "loss": 5.7428, "step": 1805 }, { "epoch": 3.5984095427435387, "grad_norm": 1.3984375, "learning_rate": 0.000826719650296829, "loss": 5.6654, "step": 1810 }, { "epoch": 3.6083499005964215, "grad_norm": 1.53125, "learning_rate": 0.0008254874943418914, "loss": 5.4806, "step": 1815 }, { "epoch": 3.618290258449304, "grad_norm": 1.4140625, "learning_rate": 0.0008242520259714868, "loss": 5.6981, "step": 1820 }, { "epoch": 3.628230616302187, "grad_norm": 1.421875, "learning_rate": 0.00082301326004068, "loss": 5.7074, "step": 1825 }, { "epoch": 3.6381709741550696, "grad_norm": 1.359375, "learning_rate": 0.0008217712114441846, "loss": 5.6836, "step": 1830 }, { "epoch": 3.6481113320079523, "grad_norm": 1.3671875, "learning_rate": 0.0008205258951161852, "loss": 5.6227, "step": 1835 }, { "epoch": 3.658051689860835, "grad_norm": 1.3671875, "learning_rate": 0.0008192773260301564, "loss": 5.7191, "step": 1840 }, { "epoch": 3.667992047713718, "grad_norm": 1.3828125, "learning_rate": 0.0008180255191986837, "loss": 5.718, "step": 1845 }, { "epoch": 3.6779324055666005, "grad_norm": 1.5078125, "learning_rate": 0.0008167704896732828, "loss": 5.7583, "step": 1850 }, { "epoch": 3.6878727634194832, "grad_norm": 1.4453125, "learning_rate": 0.0008155122525442182, "loss": 5.6963, "step": 1855 }, { "epoch": 3.697813121272366, "grad_norm": 1.5234375, "learning_rate": 0.0008142508229403225, "loss": 5.7905, "step": 1860 }, { "epoch": 3.7077534791252487, "grad_norm": 1.25, "learning_rate": 0.0008129862160288137, "loss": 5.7075, "step": 1865 }, { "epoch": 3.717693836978131, "grad_norm": 1.390625, "learning_rate": 0.0008117184470151134, "loss": 5.7883, "step": 1870 }, { "epoch": 3.7276341948310137, "grad_norm": 1.3984375, "learning_rate": 0.000810447531142664, "loss": 5.7276, "step": 1875 }, { "epoch": 3.7375745526838964, "grad_norm": 1.3984375, "learning_rate": 0.0008091734836927447, "loss": 5.6329, "step": 1880 }, { "epoch": 3.747514910536779, "grad_norm": 1.328125, "learning_rate": 0.0008078963199842886, "loss": 5.7745, "step": 1885 }, { "epoch": 3.757455268389662, "grad_norm": 1.3203125, "learning_rate": 0.000806616055373698, "loss": 5.7704, "step": 1890 }, { "epoch": 3.7673956262425445, "grad_norm": 1.4609375, "learning_rate": 0.0008053327052546605, "loss": 5.7274, "step": 1895 }, { "epoch": 3.7773359840954273, "grad_norm": 1.40625, "learning_rate": 0.0008040462850579625, "loss": 5.7801, "step": 1900 }, { "epoch": 3.78727634194831, "grad_norm": 1.328125, "learning_rate": 0.000802756810251305, "loss": 5.6725, "step": 1905 }, { "epoch": 3.7972166998011927, "grad_norm": 1.40625, "learning_rate": 0.0008014642963391168, "loss": 5.6716, "step": 1910 }, { "epoch": 3.8071570576540754, "grad_norm": 1.4609375, "learning_rate": 0.0008001687588623686, "loss": 5.6628, "step": 1915 }, { "epoch": 3.817097415506958, "grad_norm": 1.4453125, "learning_rate": 0.0007988702133983861, "loss": 5.7119, "step": 1920 }, { "epoch": 3.827037773359841, "grad_norm": 1.5703125, "learning_rate": 0.0007975686755606623, "loss": 5.6313, "step": 1925 }, { "epoch": 3.8369781312127236, "grad_norm": 1.5390625, "learning_rate": 0.0007962641609986703, "loss": 5.6761, "step": 1930 }, { "epoch": 3.8469184890656063, "grad_norm": 1.5078125, "learning_rate": 0.0007949566853976738, "loss": 5.625, "step": 1935 }, { "epoch": 3.856858846918489, "grad_norm": 1.359375, "learning_rate": 0.0007936462644785413, "loss": 5.7207, "step": 1940 }, { "epoch": 3.8667992047713717, "grad_norm": 1.3515625, "learning_rate": 0.0007923329139975537, "loss": 5.7018, "step": 1945 }, { "epoch": 3.8767395626242545, "grad_norm": 1.4765625, "learning_rate": 0.0007910166497462173, "loss": 5.7197, "step": 1950 }, { "epoch": 3.886679920477137, "grad_norm": 1.390625, "learning_rate": 0.0007896974875510731, "loss": 5.7637, "step": 1955 }, { "epoch": 3.89662027833002, "grad_norm": 1.5625, "learning_rate": 0.0007883754432735058, "loss": 5.7297, "step": 1960 }, { "epoch": 3.9065606361829026, "grad_norm": 1.3828125, "learning_rate": 0.0007870505328095545, "loss": 5.6832, "step": 1965 }, { "epoch": 3.9165009940357853, "grad_norm": 1.390625, "learning_rate": 0.0007857227720897207, "loss": 5.7105, "step": 1970 }, { "epoch": 3.926441351888668, "grad_norm": 1.3203125, "learning_rate": 0.0007843921770787765, "loss": 5.7642, "step": 1975 }, { "epoch": 3.9363817097415508, "grad_norm": 1.4765625, "learning_rate": 0.0007830587637755736, "loss": 5.7092, "step": 1980 }, { "epoch": 3.9463220675944335, "grad_norm": 1.3125, "learning_rate": 0.00078172254821285, "loss": 5.6186, "step": 1985 }, { "epoch": 3.956262425447316, "grad_norm": 1.390625, "learning_rate": 0.0007803835464570379, "loss": 5.8184, "step": 1990 }, { "epoch": 3.966202783300199, "grad_norm": 1.4140625, "learning_rate": 0.0007790417746080698, "loss": 5.7464, "step": 1995 }, { "epoch": 3.9761431411530817, "grad_norm": 1.453125, "learning_rate": 0.0007776972487991857, "loss": 5.7122, "step": 2000 }, { "epoch": 3.9761431411530817, "eval_loss": 6.55804967880249, "eval_runtime": 0.997, "eval_samples_per_second": 3475.347, "eval_steps_per_second": 435.296, "step": 2000 }, { "epoch": 3.9860834990059644, "grad_norm": 1.3046875, "learning_rate": 0.0007763499851967385, "loss": 5.7407, "step": 2005 }, { "epoch": 3.996023856858847, "grad_norm": 2.015625, "learning_rate": 0.0007750000000000001, "loss": 5.6136, "step": 2010 }, { "epoch": 4.00596421471173, "grad_norm": 1.2265625, "learning_rate": 0.000773647309440966, "loss": 5.5336, "step": 2015 }, { "epoch": 4.0159045725646125, "grad_norm": 1.1875, "learning_rate": 0.0007722919297841613, "loss": 5.2728, "step": 2020 }, { "epoch": 4.025844930417495, "grad_norm": 1.2109375, "learning_rate": 0.0007709338773264435, "loss": 5.1443, "step": 2025 }, { "epoch": 4.035785288270378, "grad_norm": 1.4609375, "learning_rate": 0.0007695731683968077, "loss": 5.2748, "step": 2030 }, { "epoch": 4.045725646123261, "grad_norm": 1.4609375, "learning_rate": 0.0007682098193561904, "loss": 5.2884, "step": 2035 }, { "epoch": 4.055666003976143, "grad_norm": 1.4375, "learning_rate": 0.0007668438465972717, "loss": 5.1426, "step": 2040 }, { "epoch": 4.065606361829026, "grad_norm": 1.453125, "learning_rate": 0.0007654752665442794, "loss": 5.2264, "step": 2045 }, { "epoch": 4.075546719681909, "grad_norm": 1.2421875, "learning_rate": 0.0007641040956527904, "loss": 5.3447, "step": 2050 }, { "epoch": 4.085487077534792, "grad_norm": 1.546875, "learning_rate": 0.0007627303504095341, "loss": 5.3463, "step": 2055 }, { "epoch": 4.095427435387674, "grad_norm": 1.421875, "learning_rate": 0.0007613540473321927, "loss": 5.1995, "step": 2060 }, { "epoch": 4.105367793240557, "grad_norm": 1.359375, "learning_rate": 0.0007599752029692041, "loss": 5.2133, "step": 2065 }, { "epoch": 4.11530815109344, "grad_norm": 1.453125, "learning_rate": 0.0007585938338995616, "loss": 5.2684, "step": 2070 }, { "epoch": 4.1252485089463224, "grad_norm": 1.515625, "learning_rate": 0.0007572099567326158, "loss": 5.3087, "step": 2075 }, { "epoch": 4.135188866799205, "grad_norm": 1.4375, "learning_rate": 0.0007558235881078734, "loss": 5.3412, "step": 2080 }, { "epoch": 4.145129224652088, "grad_norm": 1.4296875, "learning_rate": 0.0007544347446947986, "loss": 5.3484, "step": 2085 }, { "epoch": 4.155069582504971, "grad_norm": 1.390625, "learning_rate": 0.0007530434431926118, "loss": 5.3476, "step": 2090 }, { "epoch": 4.165009940357853, "grad_norm": 1.3359375, "learning_rate": 0.0007516497003300892, "loss": 5.2869, "step": 2095 }, { "epoch": 4.174950298210735, "grad_norm": 1.453125, "learning_rate": 0.0007502535328653615, "loss": 5.3405, "step": 2100 }, { "epoch": 4.184890656063618, "grad_norm": 1.421875, "learning_rate": 0.0007488549575857124, "loss": 5.2971, "step": 2105 }, { "epoch": 4.194831013916501, "grad_norm": 1.3828125, "learning_rate": 0.0007474539913073764, "loss": 5.2878, "step": 2110 }, { "epoch": 4.204771371769383, "grad_norm": 1.4375, "learning_rate": 0.0007460506508753373, "loss": 5.3329, "step": 2115 }, { "epoch": 4.214711729622266, "grad_norm": 1.3671875, "learning_rate": 0.0007446449531631255, "loss": 5.3297, "step": 2120 }, { "epoch": 4.224652087475149, "grad_norm": 1.4765625, "learning_rate": 0.0007432369150726146, "loss": 5.3451, "step": 2125 }, { "epoch": 4.2345924453280315, "grad_norm": 1.4140625, "learning_rate": 0.0007418265535338187, "loss": 5.275, "step": 2130 }, { "epoch": 4.244532803180914, "grad_norm": 1.453125, "learning_rate": 0.0007404138855046884, "loss": 5.3323, "step": 2135 }, { "epoch": 4.254473161033797, "grad_norm": 1.4765625, "learning_rate": 0.0007389989279709077, "loss": 5.2785, "step": 2140 }, { "epoch": 4.26441351888668, "grad_norm": 1.5859375, "learning_rate": 0.0007375816979456887, "loss": 5.3797, "step": 2145 }, { "epoch": 4.274353876739562, "grad_norm": 1.3515625, "learning_rate": 0.0007361622124695677, "loss": 5.3003, "step": 2150 }, { "epoch": 4.284294234592445, "grad_norm": 1.5234375, "learning_rate": 0.0007347404886102002, "loss": 5.3552, "step": 2155 }, { "epoch": 4.294234592445328, "grad_norm": 1.421875, "learning_rate": 0.0007333165434621556, "loss": 5.3648, "step": 2160 }, { "epoch": 4.3041749502982105, "grad_norm": 1.3671875, "learning_rate": 0.0007318903941467119, "loss": 5.3271, "step": 2165 }, { "epoch": 4.314115308151093, "grad_norm": 1.484375, "learning_rate": 0.0007304620578116493, "loss": 5.3462, "step": 2170 }, { "epoch": 4.324055666003976, "grad_norm": 1.5390625, "learning_rate": 0.0007290315516310445, "loss": 5.3889, "step": 2175 }, { "epoch": 4.333996023856859, "grad_norm": 1.59375, "learning_rate": 0.0007275988928050645, "loss": 5.375, "step": 2180 }, { "epoch": 4.343936381709741, "grad_norm": 1.515625, "learning_rate": 0.0007261640985597584, "loss": 5.3061, "step": 2185 }, { "epoch": 4.353876739562624, "grad_norm": 1.4453125, "learning_rate": 0.0007247271861468522, "loss": 5.3972, "step": 2190 }, { "epoch": 4.363817097415507, "grad_norm": 1.59375, "learning_rate": 0.0007232881728435397, "loss": 5.3455, "step": 2195 }, { "epoch": 4.3737574552683895, "grad_norm": 1.5, "learning_rate": 0.0007218470759522759, "loss": 5.3268, "step": 2200 }, { "epoch": 4.383697813121272, "grad_norm": 1.5, "learning_rate": 0.0007204039128005682, "loss": 5.0667, "step": 2205 }, { "epoch": 4.393638170974155, "grad_norm": 1.578125, "learning_rate": 0.0007189587007407686, "loss": 5.3382, "step": 2210 }, { "epoch": 4.403578528827038, "grad_norm": 1.5234375, "learning_rate": 0.0007175114571498644, "loss": 5.3345, "step": 2215 }, { "epoch": 4.41351888667992, "grad_norm": 1.46875, "learning_rate": 0.0007160621994292706, "loss": 5.3493, "step": 2220 }, { "epoch": 4.423459244532803, "grad_norm": 1.5390625, "learning_rate": 0.0007146109450046187, "loss": 5.3449, "step": 2225 }, { "epoch": 4.433399602385686, "grad_norm": 1.34375, "learning_rate": 0.0007131577113255489, "loss": 5.288, "step": 2230 }, { "epoch": 4.443339960238569, "grad_norm": 1.421875, "learning_rate": 0.0007117025158654991, "loss": 5.443, "step": 2235 }, { "epoch": 4.453280318091451, "grad_norm": 1.3828125, "learning_rate": 0.0007102453761214961, "loss": 5.3436, "step": 2240 }, { "epoch": 4.463220675944334, "grad_norm": 1.328125, "learning_rate": 0.0007087863096139438, "loss": 5.3078, "step": 2245 }, { "epoch": 4.473161033797217, "grad_norm": 1.4140625, "learning_rate": 0.0007073253338864137, "loss": 5.3483, "step": 2250 }, { "epoch": 4.4831013916500995, "grad_norm": 1.6015625, "learning_rate": 0.0007058624665054326, "loss": 5.3732, "step": 2255 }, { "epoch": 4.493041749502982, "grad_norm": 1.3984375, "learning_rate": 0.0007043977250602732, "loss": 5.3702, "step": 2260 }, { "epoch": 4.502982107355865, "grad_norm": 1.375, "learning_rate": 0.0007029311271627408, "loss": 5.337, "step": 2265 }, { "epoch": 4.512922465208748, "grad_norm": 1.34375, "learning_rate": 0.0007014626904469629, "loss": 5.4367, "step": 2270 }, { "epoch": 4.52286282306163, "grad_norm": 1.5078125, "learning_rate": 0.0006999924325691765, "loss": 5.3764, "step": 2275 }, { "epoch": 4.532803180914513, "grad_norm": 1.4921875, "learning_rate": 0.0006985203712075161, "loss": 5.2262, "step": 2280 }, { "epoch": 4.542743538767396, "grad_norm": 1.4609375, "learning_rate": 0.0006970465240618006, "loss": 5.4132, "step": 2285 }, { "epoch": 4.5526838966202785, "grad_norm": 1.6015625, "learning_rate": 0.0006955709088533212, "loss": 5.3537, "step": 2290 }, { "epoch": 4.562624254473161, "grad_norm": 1.3984375, "learning_rate": 0.0006940935433246279, "loss": 5.4217, "step": 2295 }, { "epoch": 4.572564612326044, "grad_norm": 1.5390625, "learning_rate": 0.0006926144452393163, "loss": 5.2409, "step": 2300 }, { "epoch": 4.582504970178927, "grad_norm": 1.359375, "learning_rate": 0.0006911336323818137, "loss": 5.2503, "step": 2305 }, { "epoch": 4.592445328031809, "grad_norm": 1.3671875, "learning_rate": 0.000689651122557166, "loss": 5.362, "step": 2310 }, { "epoch": 4.602385685884692, "grad_norm": 1.4765625, "learning_rate": 0.0006881669335908229, "loss": 5.4085, "step": 2315 }, { "epoch": 4.612326043737575, "grad_norm": 1.5, "learning_rate": 0.0006866810833284234, "loss": 5.3714, "step": 2320 }, { "epoch": 4.6222664015904575, "grad_norm": 1.453125, "learning_rate": 0.0006851935896355827, "loss": 5.3056, "step": 2325 }, { "epoch": 4.63220675944334, "grad_norm": 1.46875, "learning_rate": 0.0006837044703976754, "loss": 5.3678, "step": 2330 }, { "epoch": 4.642147117296223, "grad_norm": 1.3828125, "learning_rate": 0.0006822137435196214, "loss": 5.2933, "step": 2335 }, { "epoch": 4.652087475149106, "grad_norm": 1.515625, "learning_rate": 0.0006807214269256713, "loss": 5.4022, "step": 2340 }, { "epoch": 4.662027833001988, "grad_norm": 1.5, "learning_rate": 0.0006792275385591895, "loss": 5.1277, "step": 2345 }, { "epoch": 4.671968190854871, "grad_norm": 1.5, "learning_rate": 0.0006777320963824396, "loss": 5.381, "step": 2350 }, { "epoch": 4.681908548707754, "grad_norm": 1.75, "learning_rate": 0.0006762351183763674, "loss": 5.4134, "step": 2355 }, { "epoch": 4.691848906560637, "grad_norm": 1.46875, "learning_rate": 0.0006747366225403858, "loss": 5.3593, "step": 2360 }, { "epoch": 4.701789264413518, "grad_norm": 1.5234375, "learning_rate": 0.0006732366268921576, "loss": 5.2874, "step": 2365 }, { "epoch": 4.711729622266402, "grad_norm": 1.4609375, "learning_rate": 0.0006717351494673791, "loss": 5.3487, "step": 2370 }, { "epoch": 4.721669980119284, "grad_norm": 1.3984375, "learning_rate": 0.0006702322083195633, "loss": 5.3187, "step": 2375 }, { "epoch": 4.7316103379721675, "grad_norm": 1.546875, "learning_rate": 0.0006687278215198226, "loss": 5.3978, "step": 2380 }, { "epoch": 4.741550695825049, "grad_norm": 1.5234375, "learning_rate": 0.000667222007156652, "loss": 5.215, "step": 2385 }, { "epoch": 4.751491053677933, "grad_norm": 1.515625, "learning_rate": 0.0006657147833357107, "loss": 5.4158, "step": 2390 }, { "epoch": 4.761431411530815, "grad_norm": 1.4375, "learning_rate": 0.0006642061681796056, "loss": 5.312, "step": 2395 }, { "epoch": 4.7713717693836974, "grad_norm": 1.46875, "learning_rate": 0.0006626961798276726, "loss": 5.3738, "step": 2400 }, { "epoch": 4.78131212723658, "grad_norm": 1.4296875, "learning_rate": 0.0006611848364357584, "loss": 5.3342, "step": 2405 }, { "epoch": 4.791252485089463, "grad_norm": 1.4921875, "learning_rate": 0.0006596721561760028, "loss": 5.2798, "step": 2410 }, { "epoch": 4.801192842942346, "grad_norm": 1.3984375, "learning_rate": 0.0006581581572366196, "loss": 5.3128, "step": 2415 }, { "epoch": 4.811133200795228, "grad_norm": 1.453125, "learning_rate": 0.0006566428578216785, "loss": 5.1778, "step": 2420 }, { "epoch": 4.821073558648111, "grad_norm": 1.4375, "learning_rate": 0.0006551262761508857, "loss": 5.4791, "step": 2425 }, { "epoch": 4.831013916500994, "grad_norm": 1.4921875, "learning_rate": 0.0006536084304593652, "loss": 5.3982, "step": 2430 }, { "epoch": 4.8409542743538765, "grad_norm": 1.546875, "learning_rate": 0.000652089338997439, "loss": 5.3334, "step": 2435 }, { "epoch": 4.850894632206759, "grad_norm": 1.484375, "learning_rate": 0.0006505690200304083, "loss": 5.4211, "step": 2440 }, { "epoch": 4.860834990059642, "grad_norm": 1.5859375, "learning_rate": 0.0006490474918383339, "loss": 5.2573, "step": 2445 }, { "epoch": 4.870775347912525, "grad_norm": 1.34375, "learning_rate": 0.0006475247727158154, "loss": 5.3309, "step": 2450 }, { "epoch": 4.880715705765407, "grad_norm": 1.453125, "learning_rate": 0.0006460008809717727, "loss": 5.3371, "step": 2455 }, { "epoch": 4.89065606361829, "grad_norm": 1.4140625, "learning_rate": 0.0006444758349292244, "loss": 5.4302, "step": 2460 }, { "epoch": 4.900596421471173, "grad_norm": 1.453125, "learning_rate": 0.0006429496529250689, "loss": 5.4542, "step": 2465 }, { "epoch": 4.9105367793240555, "grad_norm": 1.6328125, "learning_rate": 0.0006414223533098627, "loss": 5.3545, "step": 2470 }, { "epoch": 4.920477137176938, "grad_norm": 1.4140625, "learning_rate": 0.0006398939544476005, "loss": 5.4528, "step": 2475 }, { "epoch": 4.930417495029821, "grad_norm": 1.4609375, "learning_rate": 0.000638364474715494, "loss": 5.2046, "step": 2480 }, { "epoch": 4.940357852882704, "grad_norm": 1.4765625, "learning_rate": 0.0006368339325037513, "loss": 5.1874, "step": 2485 }, { "epoch": 4.950298210735586, "grad_norm": 1.4765625, "learning_rate": 0.0006353023462153552, "loss": 5.42, "step": 2490 }, { "epoch": 4.960238568588469, "grad_norm": 1.3984375, "learning_rate": 0.0006337697342658431, "loss": 5.3834, "step": 2495 }, { "epoch": 4.970178926441352, "grad_norm": 1.3828125, "learning_rate": 0.0006322361150830839, "loss": 5.4399, "step": 2500 }, { "epoch": 4.970178926441352, "eval_loss": 6.549376964569092, "eval_runtime": 0.998, "eval_samples_per_second": 3471.846, "eval_steps_per_second": 434.857, "step": 2500 }, { "epoch": 4.980119284294235, "grad_norm": 1.5, "learning_rate": 0.0006307015071070575, "loss": 5.3366, "step": 2505 }, { "epoch": 4.990059642147117, "grad_norm": 1.453125, "learning_rate": 0.0006291659287896334, "loss": 5.3573, "step": 2510 }, { "epoch": 5.0, "grad_norm": 1.9375, "learning_rate": 0.0006276293985943478, "loss": 5.4082, "step": 2515 }, { "epoch": 5.009940357852883, "grad_norm": 1.390625, "learning_rate": 0.0006260919349961824, "loss": 4.9887, "step": 2520 }, { "epoch": 5.019880715705765, "grad_norm": 1.53125, "learning_rate": 0.0006245535564813417, "loss": 4.9016, "step": 2525 }, { "epoch": 5.029821073558648, "grad_norm": 1.484375, "learning_rate": 0.0006230142815470312, "loss": 4.9046, "step": 2530 }, { "epoch": 5.039761431411531, "grad_norm": 1.3046875, "learning_rate": 0.0006214741287012348, "loss": 5.0343, "step": 2535 }, { "epoch": 5.049701789264414, "grad_norm": 1.4765625, "learning_rate": 0.0006199331164624922, "loss": 5.0157, "step": 2540 }, { "epoch": 5.059642147117296, "grad_norm": 1.453125, "learning_rate": 0.0006183912633596763, "loss": 4.9286, "step": 2545 }, { "epoch": 5.069582504970179, "grad_norm": 1.5390625, "learning_rate": 0.0006168485879317707, "loss": 5.0776, "step": 2550 }, { "epoch": 5.079522862823062, "grad_norm": 1.4453125, "learning_rate": 0.0006153051087276458, "loss": 4.83, "step": 2555 }, { "epoch": 5.0894632206759445, "grad_norm": 1.40625, "learning_rate": 0.0006137608443058371, "loss": 5.1353, "step": 2560 }, { "epoch": 5.099403578528827, "grad_norm": 1.4921875, "learning_rate": 0.0006122158132343213, "loss": 5.0341, "step": 2565 }, { "epoch": 5.10934393638171, "grad_norm": 1.390625, "learning_rate": 0.000610670034090293, "loss": 4.984, "step": 2570 }, { "epoch": 5.119284294234593, "grad_norm": 1.5546875, "learning_rate": 0.0006091235254599417, "loss": 5.0181, "step": 2575 }, { "epoch": 5.129224652087475, "grad_norm": 1.4375, "learning_rate": 0.0006075763059382278, "loss": 5.0337, "step": 2580 }, { "epoch": 5.139165009940358, "grad_norm": 1.4921875, "learning_rate": 0.0006060283941286597, "loss": 5.0363, "step": 2585 }, { "epoch": 5.149105367793241, "grad_norm": 1.484375, "learning_rate": 0.0006044798086430697, "loss": 5.0434, "step": 2590 }, { "epoch": 5.1590457256461235, "grad_norm": 1.453125, "learning_rate": 0.00060293056810139, "loss": 5.0536, "step": 2595 }, { "epoch": 5.168986083499006, "grad_norm": 1.46875, "learning_rate": 0.0006013806911314293, "loss": 5.0753, "step": 2600 }, { "epoch": 5.178926441351889, "grad_norm": 1.6171875, "learning_rate": 0.0005998301963686485, "loss": 5.0426, "step": 2605 }, { "epoch": 5.188866799204772, "grad_norm": 1.59375, "learning_rate": 0.0005982791024559371, "loss": 5.1351, "step": 2610 }, { "epoch": 5.198807157057654, "grad_norm": 1.4609375, "learning_rate": 0.0005967274280433881, "loss": 5.0403, "step": 2615 }, { "epoch": 5.208747514910537, "grad_norm": 1.4375, "learning_rate": 0.0005951751917880747, "loss": 5.0975, "step": 2620 }, { "epoch": 5.21868787276342, "grad_norm": 1.5546875, "learning_rate": 0.0005936224123538254, "loss": 4.8919, "step": 2625 }, { "epoch": 5.2286282306163026, "grad_norm": 1.5546875, "learning_rate": 0.000592069108411, "loss": 5.0733, "step": 2630 }, { "epoch": 5.238568588469185, "grad_norm": 1.3828125, "learning_rate": 0.0005905152986362649, "loss": 5.1395, "step": 2635 }, { "epoch": 5.248508946322068, "grad_norm": 1.6328125, "learning_rate": 0.0005889610017123685, "loss": 5.0554, "step": 2640 }, { "epoch": 5.258449304174951, "grad_norm": 1.546875, "learning_rate": 0.0005874062363279164, "loss": 4.8689, "step": 2645 }, { "epoch": 5.2683896620278325, "grad_norm": 1.4296875, "learning_rate": 0.0005858510211771469, "loss": 5.1814, "step": 2650 }, { "epoch": 5.278330019880716, "grad_norm": 1.5625, "learning_rate": 0.0005842953749597065, "loss": 5.0087, "step": 2655 }, { "epoch": 5.288270377733598, "grad_norm": 1.5390625, "learning_rate": 0.0005827393163804249, "loss": 4.9926, "step": 2660 }, { "epoch": 5.298210735586481, "grad_norm": 1.546875, "learning_rate": 0.0005811828641490892, "loss": 5.0692, "step": 2665 }, { "epoch": 5.308151093439363, "grad_norm": 1.546875, "learning_rate": 0.0005796260369802205, "loss": 4.9736, "step": 2670 }, { "epoch": 5.318091451292246, "grad_norm": 1.5859375, "learning_rate": 0.0005780688535928478, "loss": 5.0796, "step": 2675 }, { "epoch": 5.328031809145129, "grad_norm": 1.515625, "learning_rate": 0.0005765113327102831, "loss": 5.1267, "step": 2680 }, { "epoch": 5.337972166998012, "grad_norm": 1.53125, "learning_rate": 0.0005749534930598966, "loss": 4.9451, "step": 2685 }, { "epoch": 5.347912524850894, "grad_norm": 1.4296875, "learning_rate": 0.0005733953533728912, "loss": 5.0916, "step": 2690 }, { "epoch": 5.357852882703777, "grad_norm": 1.5390625, "learning_rate": 0.0005718369323840773, "loss": 5.0723, "step": 2695 }, { "epoch": 5.36779324055666, "grad_norm": 1.6484375, "learning_rate": 0.0005702782488316478, "loss": 5.1471, "step": 2700 }, { "epoch": 5.3777335984095425, "grad_norm": 1.546875, "learning_rate": 0.0005687193214569524, "loss": 4.9716, "step": 2705 }, { "epoch": 5.387673956262425, "grad_norm": 1.421875, "learning_rate": 0.0005671601690042727, "loss": 4.9593, "step": 2710 }, { "epoch": 5.397614314115308, "grad_norm": 1.546875, "learning_rate": 0.0005656008102205966, "loss": 5.0421, "step": 2715 }, { "epoch": 5.407554671968191, "grad_norm": 1.5390625, "learning_rate": 0.0005640412638553927, "loss": 5.0378, "step": 2720 }, { "epoch": 5.417495029821073, "grad_norm": 1.4453125, "learning_rate": 0.000562481548660385, "loss": 5.0324, "step": 2725 }, { "epoch": 5.427435387673956, "grad_norm": 1.578125, "learning_rate": 0.000560921683389328, "loss": 5.0774, "step": 2730 }, { "epoch": 5.437375745526839, "grad_norm": 1.4375, "learning_rate": 0.0005593616867977801, "loss": 4.975, "step": 2735 }, { "epoch": 5.4473161033797215, "grad_norm": 1.5234375, "learning_rate": 0.000557801577642879, "loss": 5.0673, "step": 2740 }, { "epoch": 5.457256461232604, "grad_norm": 1.546875, "learning_rate": 0.0005562413746831156, "loss": 5.0886, "step": 2745 }, { "epoch": 5.467196819085487, "grad_norm": 1.484375, "learning_rate": 0.000554681096678109, "loss": 5.1069, "step": 2750 }, { "epoch": 5.47713717693837, "grad_norm": 1.46875, "learning_rate": 0.0005531207623883801, "loss": 4.9482, "step": 2755 }, { "epoch": 5.487077534791252, "grad_norm": 1.4453125, "learning_rate": 0.0005515603905751276, "loss": 5.046, "step": 2760 }, { "epoch": 5.497017892644135, "grad_norm": 1.5546875, "learning_rate": 0.00055, "loss": 5.0091, "step": 2765 }, { "epoch": 5.506958250497018, "grad_norm": 1.3515625, "learning_rate": 0.0005484396094248726, "loss": 4.8669, "step": 2770 }, { "epoch": 5.5168986083499005, "grad_norm": 1.4375, "learning_rate": 0.0005468792376116198, "loss": 5.0538, "step": 2775 }, { "epoch": 5.526838966202783, "grad_norm": 1.5234375, "learning_rate": 0.0005453189033218912, "loss": 5.0069, "step": 2780 }, { "epoch": 5.536779324055666, "grad_norm": 1.53125, "learning_rate": 0.0005437586253168845, "loss": 5.0555, "step": 2785 }, { "epoch": 5.546719681908549, "grad_norm": 1.484375, "learning_rate": 0.0005421984223571211, "loss": 5.0909, "step": 2790 }, { "epoch": 5.556660039761431, "grad_norm": 1.5390625, "learning_rate": 0.0005406383132022199, "loss": 5.1069, "step": 2795 }, { "epoch": 5.566600397614314, "grad_norm": 1.3359375, "learning_rate": 0.000539078316610672, "loss": 5.204, "step": 2800 }, { "epoch": 5.576540755467197, "grad_norm": 1.6015625, "learning_rate": 0.000537518451339615, "loss": 5.1035, "step": 2805 }, { "epoch": 5.58648111332008, "grad_norm": 1.40625, "learning_rate": 0.0005359587361446073, "loss": 5.1064, "step": 2810 }, { "epoch": 5.596421471172962, "grad_norm": 1.46875, "learning_rate": 0.0005343991897794036, "loss": 4.9599, "step": 2815 }, { "epoch": 5.606361829025845, "grad_norm": 1.40625, "learning_rate": 0.0005328398309957274, "loss": 5.1179, "step": 2820 }, { "epoch": 5.616302186878728, "grad_norm": 1.6171875, "learning_rate": 0.0005312806785430478, "loss": 5.0926, "step": 2825 }, { "epoch": 5.6262425447316105, "grad_norm": 1.5234375, "learning_rate": 0.0005297217511683524, "loss": 5.1679, "step": 2830 }, { "epoch": 5.636182902584493, "grad_norm": 1.4453125, "learning_rate": 0.0005281630676159228, "loss": 4.9398, "step": 2835 }, { "epoch": 5.646123260437376, "grad_norm": 1.4375, "learning_rate": 0.0005266046466271089, "loss": 5.0491, "step": 2840 }, { "epoch": 5.656063618290259, "grad_norm": 1.671875, "learning_rate": 0.0005250465069401034, "loss": 5.0645, "step": 2845 }, { "epoch": 5.666003976143141, "grad_norm": 1.5, "learning_rate": 0.000523488667289717, "loss": 4.8957, "step": 2850 }, { "epoch": 5.675944333996024, "grad_norm": 1.5234375, "learning_rate": 0.0005219311464071524, "loss": 5.0396, "step": 2855 }, { "epoch": 5.685884691848907, "grad_norm": 1.5, "learning_rate": 0.0005203739630197796, "loss": 5.0879, "step": 2860 }, { "epoch": 5.6958250497017895, "grad_norm": 1.4609375, "learning_rate": 0.0005188171358509109, "loss": 5.1446, "step": 2865 }, { "epoch": 5.705765407554672, "grad_norm": 1.5703125, "learning_rate": 0.0005172606836195753, "loss": 5.0362, "step": 2870 }, { "epoch": 5.715705765407555, "grad_norm": 1.40625, "learning_rate": 0.0005157046250402936, "loss": 5.0523, "step": 2875 }, { "epoch": 5.725646123260438, "grad_norm": 1.6171875, "learning_rate": 0.0005141489788228533, "loss": 5.1221, "step": 2880 }, { "epoch": 5.73558648111332, "grad_norm": 1.4921875, "learning_rate": 0.0005125937636720838, "loss": 5.1486, "step": 2885 }, { "epoch": 5.745526838966203, "grad_norm": 1.5390625, "learning_rate": 0.0005110389982876316, "loss": 5.1022, "step": 2890 }, { "epoch": 5.755467196819086, "grad_norm": 1.546875, "learning_rate": 0.000509484701363735, "loss": 5.0928, "step": 2895 }, { "epoch": 5.7654075546719685, "grad_norm": 1.625, "learning_rate": 0.000507930891589, "loss": 5.0699, "step": 2900 }, { "epoch": 5.775347912524851, "grad_norm": 1.515625, "learning_rate": 0.0005063775876461746, "loss": 5.0418, "step": 2905 }, { "epoch": 5.785288270377734, "grad_norm": 1.578125, "learning_rate": 0.0005048248082119253, "loss": 5.1046, "step": 2910 }, { "epoch": 5.795228628230617, "grad_norm": 1.34375, "learning_rate": 0.000503272571956612, "loss": 5.0661, "step": 2915 }, { "epoch": 5.805168986083499, "grad_norm": 1.5234375, "learning_rate": 0.000501720897544063, "loss": 4.8625, "step": 2920 }, { "epoch": 5.815109343936381, "grad_norm": 1.4609375, "learning_rate": 0.0005001698036313514, "loss": 4.9658, "step": 2925 }, { "epoch": 5.825049701789265, "grad_norm": 1.5, "learning_rate": 0.0004986193088685708, "loss": 5.0999, "step": 2930 }, { "epoch": 5.834990059642147, "grad_norm": 1.5546875, "learning_rate": 0.0004970694318986101, "loss": 5.0693, "step": 2935 }, { "epoch": 5.84493041749503, "grad_norm": 1.59375, "learning_rate": 0.0004955201913569304, "loss": 4.9639, "step": 2940 }, { "epoch": 5.854870775347912, "grad_norm": 1.53125, "learning_rate": 0.0004939716058713404, "loss": 5.2055, "step": 2945 }, { "epoch": 5.864811133200796, "grad_norm": 1.5, "learning_rate": 0.0004924236940617722, "loss": 4.9903, "step": 2950 }, { "epoch": 5.8747514910536776, "grad_norm": 1.515625, "learning_rate": 0.0004908764745400584, "loss": 5.1406, "step": 2955 }, { "epoch": 5.88469184890656, "grad_norm": 1.5546875, "learning_rate": 0.000489329965909707, "loss": 5.1008, "step": 2960 }, { "epoch": 5.894632206759443, "grad_norm": 1.4296875, "learning_rate": 0.0004877841867656788, "loss": 5.0674, "step": 2965 }, { "epoch": 5.904572564612326, "grad_norm": 1.484375, "learning_rate": 0.000486239155694163, "loss": 4.9296, "step": 2970 }, { "epoch": 5.914512922465208, "grad_norm": 1.484375, "learning_rate": 0.00048469489127235424, "loss": 5.1554, "step": 2975 }, { "epoch": 5.924453280318091, "grad_norm": 1.5703125, "learning_rate": 0.00048315141206822944, "loss": 4.9096, "step": 2980 }, { "epoch": 5.934393638170974, "grad_norm": 1.5390625, "learning_rate": 0.0004816087366403237, "loss": 5.0318, "step": 2985 }, { "epoch": 5.944333996023857, "grad_norm": 1.5546875, "learning_rate": 0.0004800668835375078, "loss": 5.0878, "step": 2990 }, { "epoch": 5.954274353876739, "grad_norm": 1.6015625, "learning_rate": 0.0004785258712987651, "loss": 5.1169, "step": 2995 }, { "epoch": 5.964214711729622, "grad_norm": 1.6328125, "learning_rate": 0.0004769857184529688, "loss": 5.1109, "step": 3000 }, { "epoch": 5.964214711729622, "eval_loss": 6.561558246612549, "eval_runtime": 0.9954, "eval_samples_per_second": 3481.004, "eval_steps_per_second": 436.005, "step": 3000 }, { "epoch": 5.974155069582505, "grad_norm": 1.59375, "learning_rate": 0.0004754464435186583, "loss": 5.1491, "step": 3005 }, { "epoch": 5.9840954274353875, "grad_norm": 1.3984375, "learning_rate": 0.00047390806500381753, "loss": 5.181, "step": 3010 }, { "epoch": 5.99403578528827, "grad_norm": 1.5078125, "learning_rate": 0.0004723706014056522, "loss": 5.1196, "step": 3015 }, { "epoch": 6.003976143141153, "grad_norm": 1.4375, "learning_rate": 0.0004708340712103667, "loss": 4.9136, "step": 3020 }, { "epoch": 6.013916500994036, "grad_norm": 1.4453125, "learning_rate": 0.0004692984928929426, "loss": 4.878, "step": 3025 }, { "epoch": 6.023856858846918, "grad_norm": 1.5546875, "learning_rate": 0.00046776388491691633, "loss": 4.7533, "step": 3030 }, { "epoch": 6.033797216699801, "grad_norm": 1.5390625, "learning_rate": 0.00046623026573415716, "loss": 4.8227, "step": 3035 }, { "epoch": 6.043737574552684, "grad_norm": 1.5, "learning_rate": 0.0004646976537846449, "loss": 4.8174, "step": 3040 }, { "epoch": 6.0536779324055665, "grad_norm": 1.5, "learning_rate": 0.0004631660674962489, "loss": 4.8375, "step": 3045 }, { "epoch": 6.063618290258449, "grad_norm": 1.46875, "learning_rate": 0.00046163552528450617, "loss": 4.7929, "step": 3050 }, { "epoch": 6.073558648111332, "grad_norm": 1.4765625, "learning_rate": 0.00046010604555239964, "loss": 4.8151, "step": 3055 }, { "epoch": 6.083499005964215, "grad_norm": 1.7734375, "learning_rate": 0.00045857764669013736, "loss": 4.8373, "step": 3060 }, { "epoch": 6.093439363817097, "grad_norm": 1.5078125, "learning_rate": 0.0004570503470749312, "loss": 4.5439, "step": 3065 }, { "epoch": 6.10337972166998, "grad_norm": 1.546875, "learning_rate": 0.00045552416507077564, "loss": 4.8271, "step": 3070 }, { "epoch": 6.113320079522863, "grad_norm": 1.4765625, "learning_rate": 0.00045399911902822745, "loss": 4.8319, "step": 3075 }, { "epoch": 6.1232604373757455, "grad_norm": 1.640625, "learning_rate": 0.00045247522728418467, "loss": 4.819, "step": 3080 }, { "epoch": 6.133200795228628, "grad_norm": 1.5859375, "learning_rate": 0.00045095250816166624, "loss": 4.8703, "step": 3085 }, { "epoch": 6.143141153081511, "grad_norm": 1.6328125, "learning_rate": 0.0004494309799695916, "loss": 4.799, "step": 3090 }, { "epoch": 6.153081510934394, "grad_norm": 1.421875, "learning_rate": 0.00044791066100256105, "loss": 4.7608, "step": 3095 }, { "epoch": 6.163021868787276, "grad_norm": 1.625, "learning_rate": 0.00044639156954063484, "loss": 4.8488, "step": 3100 }, { "epoch": 6.172962226640159, "grad_norm": 1.6953125, "learning_rate": 0.0004448737238491143, "loss": 4.8453, "step": 3105 }, { "epoch": 6.182902584493042, "grad_norm": 1.4296875, "learning_rate": 0.0004433571421783216, "loss": 4.8493, "step": 3110 }, { "epoch": 6.192842942345925, "grad_norm": 1.4453125, "learning_rate": 0.00044184184276338046, "loss": 4.777, "step": 3115 }, { "epoch": 6.202783300198807, "grad_norm": 1.640625, "learning_rate": 0.0004403278438239975, "loss": 4.8673, "step": 3120 }, { "epoch": 6.21272365805169, "grad_norm": 1.5625, "learning_rate": 0.0004388151635642418, "loss": 4.8473, "step": 3125 }, { "epoch": 6.222664015904573, "grad_norm": 1.6171875, "learning_rate": 0.0004373038201723276, "loss": 4.7034, "step": 3130 }, { "epoch": 6.2326043737574555, "grad_norm": 1.625, "learning_rate": 0.00043579383182039443, "loss": 4.8992, "step": 3135 }, { "epoch": 6.242544731610338, "grad_norm": 1.4921875, "learning_rate": 0.00043428521666428945, "loss": 4.9339, "step": 3140 }, { "epoch": 6.252485089463221, "grad_norm": 1.546875, "learning_rate": 0.0004327779928433482, "loss": 4.8042, "step": 3145 }, { "epoch": 6.262425447316104, "grad_norm": 1.59375, "learning_rate": 0.00043127217848017743, "loss": 4.7408, "step": 3150 }, { "epoch": 6.272365805168986, "grad_norm": 1.625, "learning_rate": 0.00042976779168043676, "loss": 4.8389, "step": 3155 }, { "epoch": 6.282306163021869, "grad_norm": 1.5703125, "learning_rate": 0.000428264850532621, "loss": 4.7711, "step": 3160 }, { "epoch": 6.292246520874752, "grad_norm": 1.6328125, "learning_rate": 0.0004267633731078425, "loss": 4.783, "step": 3165 }, { "epoch": 6.3021868787276345, "grad_norm": 1.703125, "learning_rate": 0.0004252633774596143, "loss": 4.8478, "step": 3170 }, { "epoch": 6.312127236580517, "grad_norm": 1.6328125, "learning_rate": 0.0004237648816236328, "loss": 4.7584, "step": 3175 }, { "epoch": 6.3220675944334, "grad_norm": 1.703125, "learning_rate": 0.0004222679036175605, "loss": 4.816, "step": 3180 }, { "epoch": 6.332007952286283, "grad_norm": 1.5, "learning_rate": 0.0004207724614408105, "loss": 4.8031, "step": 3185 }, { "epoch": 6.341948310139165, "grad_norm": 1.7421875, "learning_rate": 0.0004192785730743287, "loss": 4.9174, "step": 3190 }, { "epoch": 6.351888667992048, "grad_norm": 1.5859375, "learning_rate": 0.0004177862564803785, "loss": 4.8877, "step": 3195 }, { "epoch": 6.361829025844931, "grad_norm": 1.4765625, "learning_rate": 0.0004162955296023246, "loss": 4.7547, "step": 3200 }, { "epoch": 6.3717693836978135, "grad_norm": 1.75, "learning_rate": 0.00041480641036441724, "loss": 4.8575, "step": 3205 }, { "epoch": 6.381709741550695, "grad_norm": 1.6484375, "learning_rate": 0.0004133189166715766, "loss": 4.9182, "step": 3210 }, { "epoch": 6.391650099403579, "grad_norm": 1.453125, "learning_rate": 0.00041183306640917727, "loss": 4.899, "step": 3215 }, { "epoch": 6.401590457256461, "grad_norm": 1.484375, "learning_rate": 0.0004103488774428341, "loss": 4.7395, "step": 3220 }, { "epoch": 6.4115308151093435, "grad_norm": 1.3515625, "learning_rate": 0.0004088663676181864, "loss": 4.6612, "step": 3225 }, { "epoch": 6.421471172962226, "grad_norm": 1.6484375, "learning_rate": 0.00040738555476068386, "loss": 4.8349, "step": 3230 }, { "epoch": 6.431411530815109, "grad_norm": 1.6796875, "learning_rate": 0.0004059064566753722, "loss": 4.8487, "step": 3235 }, { "epoch": 6.441351888667992, "grad_norm": 1.6328125, "learning_rate": 0.0004044290911466789, "loss": 4.827, "step": 3240 }, { "epoch": 6.451292246520874, "grad_norm": 1.5703125, "learning_rate": 0.00040295347593819955, "loss": 4.8536, "step": 3245 }, { "epoch": 6.461232604373757, "grad_norm": 1.5625, "learning_rate": 0.000401479628792484, "loss": 4.8688, "step": 3250 }, { "epoch": 6.47117296222664, "grad_norm": 1.6484375, "learning_rate": 0.00040000756743082354, "loss": 4.9422, "step": 3255 }, { "epoch": 6.481113320079523, "grad_norm": 1.625, "learning_rate": 0.00039853730955303725, "loss": 4.8351, "step": 3260 }, { "epoch": 6.491053677932405, "grad_norm": 1.4921875, "learning_rate": 0.00039706887283725943, "loss": 4.9323, "step": 3265 }, { "epoch": 6.500994035785288, "grad_norm": 1.6484375, "learning_rate": 0.000395602274939727, "loss": 4.7852, "step": 3270 }, { "epoch": 6.510934393638171, "grad_norm": 1.5703125, "learning_rate": 0.0003941375334945675, "loss": 4.8949, "step": 3275 }, { "epoch": 6.5208747514910534, "grad_norm": 1.6015625, "learning_rate": 0.00039267466611358636, "loss": 4.8897, "step": 3280 }, { "epoch": 6.530815109343936, "grad_norm": 1.703125, "learning_rate": 0.000391213690386056, "loss": 4.8623, "step": 3285 }, { "epoch": 6.540755467196819, "grad_norm": 1.6171875, "learning_rate": 0.0003897546238785039, "loss": 4.8776, "step": 3290 }, { "epoch": 6.550695825049702, "grad_norm": 1.3984375, "learning_rate": 0.00038829748413450095, "loss": 4.7904, "step": 3295 }, { "epoch": 6.560636182902584, "grad_norm": 1.59375, "learning_rate": 0.00038684228867445135, "loss": 4.9014, "step": 3300 }, { "epoch": 6.570576540755467, "grad_norm": 1.5703125, "learning_rate": 0.00038538905499538144, "loss": 4.8478, "step": 3305 }, { "epoch": 6.58051689860835, "grad_norm": 2.640625, "learning_rate": 0.0003839378005707297, "loss": 4.7246, "step": 3310 }, { "epoch": 6.5904572564612325, "grad_norm": 1.53125, "learning_rate": 0.00038248854285013567, "loss": 4.8388, "step": 3315 }, { "epoch": 6.600397614314115, "grad_norm": 1.4765625, "learning_rate": 0.0003810412992592317, "loss": 4.6584, "step": 3320 }, { "epoch": 6.610337972166998, "grad_norm": 1.6171875, "learning_rate": 0.000379596087199432, "loss": 4.6995, "step": 3325 }, { "epoch": 6.620278330019881, "grad_norm": 1.6484375, "learning_rate": 0.0003781529240477243, "loss": 4.7442, "step": 3330 }, { "epoch": 6.630218687872763, "grad_norm": 1.5390625, "learning_rate": 0.00037671182715646036, "loss": 4.8731, "step": 3335 }, { "epoch": 6.640159045725646, "grad_norm": 1.4453125, "learning_rate": 0.0003752728138531479, "loss": 4.8889, "step": 3340 }, { "epoch": 6.650099403578529, "grad_norm": 1.5078125, "learning_rate": 0.0003738359014402417, "loss": 4.8921, "step": 3345 }, { "epoch": 6.6600397614314115, "grad_norm": 1.5625, "learning_rate": 0.0003724011071949357, "loss": 4.7939, "step": 3350 }, { "epoch": 6.669980119284294, "grad_norm": 1.6171875, "learning_rate": 0.00037096844836895546, "loss": 4.8604, "step": 3355 }, { "epoch": 6.679920477137177, "grad_norm": 1.5078125, "learning_rate": 0.0003695379421883509, "loss": 4.7357, "step": 3360 }, { "epoch": 6.68986083499006, "grad_norm": 1.796875, "learning_rate": 0.00036810960585328836, "loss": 4.8192, "step": 3365 }, { "epoch": 6.699801192842942, "grad_norm": 1.5546875, "learning_rate": 0.0003666834565378444, "loss": 4.8656, "step": 3370 }, { "epoch": 6.709741550695825, "grad_norm": 1.640625, "learning_rate": 0.00036525951138979986, "loss": 4.8806, "step": 3375 }, { "epoch": 6.719681908548708, "grad_norm": 1.578125, "learning_rate": 0.0003638377875304324, "loss": 4.8788, "step": 3380 }, { "epoch": 6.729622266401591, "grad_norm": 1.3671875, "learning_rate": 0.00036241830205431134, "loss": 4.8286, "step": 3385 }, { "epoch": 6.739562624254473, "grad_norm": 1.7109375, "learning_rate": 0.0003610010720290923, "loss": 4.8831, "step": 3390 }, { "epoch": 6.749502982107356, "grad_norm": 1.734375, "learning_rate": 0.0003595861144953115, "loss": 4.8835, "step": 3395 }, { "epoch": 6.759443339960239, "grad_norm": 1.625, "learning_rate": 0.00035817344646618134, "loss": 4.8331, "step": 3400 }, { "epoch": 6.769383697813121, "grad_norm": 1.7578125, "learning_rate": 0.0003567630849273854, "loss": 4.89, "step": 3405 }, { "epoch": 6.779324055666004, "grad_norm": 1.625, "learning_rate": 0.00035535504683687467, "loss": 4.8879, "step": 3410 }, { "epoch": 6.789264413518887, "grad_norm": 1.484375, "learning_rate": 0.0003539493491246628, "loss": 4.8563, "step": 3415 }, { "epoch": 6.79920477137177, "grad_norm": 1.59375, "learning_rate": 0.0003525460086926239, "loss": 4.8114, "step": 3420 }, { "epoch": 6.809145129224652, "grad_norm": 1.5703125, "learning_rate": 0.0003511450424142878, "loss": 4.7707, "step": 3425 }, { "epoch": 6.819085487077535, "grad_norm": 1.6953125, "learning_rate": 0.00034974646713463854, "loss": 4.8998, "step": 3430 }, { "epoch": 6.829025844930418, "grad_norm": 1.6640625, "learning_rate": 0.00034835029966991083, "loss": 4.8483, "step": 3435 }, { "epoch": 6.8389662027833005, "grad_norm": 1.484375, "learning_rate": 0.0003469565568073884, "loss": 4.8809, "step": 3440 }, { "epoch": 6.848906560636183, "grad_norm": 1.546875, "learning_rate": 0.00034556525530520166, "loss": 4.8783, "step": 3445 }, { "epoch": 6.858846918489066, "grad_norm": 1.6328125, "learning_rate": 0.0003441764118921268, "loss": 4.9983, "step": 3450 }, { "epoch": 6.868787276341949, "grad_norm": 1.578125, "learning_rate": 0.00034279004326738446, "loss": 4.887, "step": 3455 }, { "epoch": 6.878727634194831, "grad_norm": 1.484375, "learning_rate": 0.0003414061661004383, "loss": 4.8204, "step": 3460 }, { "epoch": 6.888667992047714, "grad_norm": 1.4609375, "learning_rate": 0.00034002479703079593, "loss": 4.8597, "step": 3465 }, { "epoch": 6.898608349900597, "grad_norm": 1.59375, "learning_rate": 0.00033864595266780727, "loss": 4.8343, "step": 3470 }, { "epoch": 6.9085487077534795, "grad_norm": 1.59375, "learning_rate": 0.00033726964959046596, "loss": 4.9038, "step": 3475 }, { "epoch": 6.918489065606362, "grad_norm": 1.609375, "learning_rate": 0.0003358959043472096, "loss": 4.8918, "step": 3480 }, { "epoch": 6.928429423459244, "grad_norm": 1.5234375, "learning_rate": 0.00033452473345572064, "loss": 4.9477, "step": 3485 }, { "epoch": 6.938369781312128, "grad_norm": 1.625, "learning_rate": 0.00033315615340272827, "loss": 4.8718, "step": 3490 }, { "epoch": 6.9483101391650095, "grad_norm": 1.6640625, "learning_rate": 0.00033179018064380964, "loss": 4.8683, "step": 3495 }, { "epoch": 6.958250497017893, "grad_norm": 1.6171875, "learning_rate": 0.0003304268316031922, "loss": 4.9095, "step": 3500 }, { "epoch": 6.958250497017893, "eval_loss": 6.577323913574219, "eval_runtime": 0.9906, "eval_samples_per_second": 3497.799, "eval_steps_per_second": 438.108, "step": 3500 }, { "epoch": 6.968190854870775, "grad_norm": 1.5546875, "learning_rate": 0.00032906612267355673, "loss": 4.9377, "step": 3505 }, { "epoch": 6.9781312127236585, "grad_norm": 1.578125, "learning_rate": 0.0003277080702158389, "loss": 4.9013, "step": 3510 }, { "epoch": 6.98807157057654, "grad_norm": 1.671875, "learning_rate": 0.000326352690559034, "loss": 4.8689, "step": 3515 }, { "epoch": 6.998011928429423, "grad_norm": 1.4296875, "learning_rate": 0.00032500000000000015, "loss": 4.7685, "step": 3520 }, { "epoch": 7.007952286282306, "grad_norm": 1.578125, "learning_rate": 0.0003236500148032616, "loss": 4.5525, "step": 3525 }, { "epoch": 7.0178926441351885, "grad_norm": 1.390625, "learning_rate": 0.00032230275120081445, "loss": 4.6655, "step": 3530 }, { "epoch": 7.027833001988071, "grad_norm": 1.5859375, "learning_rate": 0.0003209582253919302, "loss": 4.5633, "step": 3535 }, { "epoch": 7.037773359840954, "grad_norm": 1.4375, "learning_rate": 0.00031961645354296214, "loss": 4.645, "step": 3540 }, { "epoch": 7.047713717693837, "grad_norm": 1.4921875, "learning_rate": 0.00031827745178714996, "loss": 4.6067, "step": 3545 }, { "epoch": 7.057654075546719, "grad_norm": 1.6015625, "learning_rate": 0.00031694123622442647, "loss": 4.586, "step": 3550 }, { "epoch": 7.067594433399602, "grad_norm": 1.53125, "learning_rate": 0.0003156078229212236, "loss": 4.6739, "step": 3555 }, { "epoch": 7.077534791252485, "grad_norm": 1.484375, "learning_rate": 0.00031427722791027953, "loss": 4.7466, "step": 3560 }, { "epoch": 7.087475149105368, "grad_norm": 1.5390625, "learning_rate": 0.0003129494671904457, "loss": 4.6642, "step": 3565 }, { "epoch": 7.09741550695825, "grad_norm": 1.671875, "learning_rate": 0.0003116245567264944, "loss": 4.7098, "step": 3570 }, { "epoch": 7.107355864811133, "grad_norm": 1.609375, "learning_rate": 0.00031030251244892714, "loss": 4.7158, "step": 3575 }, { "epoch": 7.117296222664016, "grad_norm": 1.6796875, "learning_rate": 0.00030898335025378277, "loss": 4.713, "step": 3580 }, { "epoch": 7.1272365805168985, "grad_norm": 1.4609375, "learning_rate": 0.0003076670860024464, "loss": 4.7384, "step": 3585 }, { "epoch": 7.137176938369781, "grad_norm": 1.6015625, "learning_rate": 0.0003063537355214588, "loss": 4.6543, "step": 3590 }, { "epoch": 7.147117296222664, "grad_norm": 1.6796875, "learning_rate": 0.0003050433146023259, "loss": 4.6485, "step": 3595 }, { "epoch": 7.157057654075547, "grad_norm": 1.53125, "learning_rate": 0.00030373583900132975, "loss": 4.7452, "step": 3600 }, { "epoch": 7.166998011928429, "grad_norm": 1.7109375, "learning_rate": 0.0003024313244393377, "loss": 4.6436, "step": 3605 }, { "epoch": 7.176938369781312, "grad_norm": 1.7421875, "learning_rate": 0.00030112978660161395, "loss": 4.5875, "step": 3610 }, { "epoch": 7.186878727634195, "grad_norm": 1.6953125, "learning_rate": 0.0002998312411376315, "loss": 4.7453, "step": 3615 }, { "epoch": 7.1968190854870775, "grad_norm": 1.484375, "learning_rate": 0.00029853570366088336, "loss": 4.6465, "step": 3620 }, { "epoch": 7.20675944333996, "grad_norm": 1.4765625, "learning_rate": 0.0002972431897486952, "loss": 4.7295, "step": 3625 }, { "epoch": 7.216699801192843, "grad_norm": 1.7265625, "learning_rate": 0.00029595371494203754, "loss": 4.7584, "step": 3630 }, { "epoch": 7.226640159045726, "grad_norm": 1.78125, "learning_rate": 0.0002946672947453395, "loss": 4.7102, "step": 3635 }, { "epoch": 7.236580516898608, "grad_norm": 1.625, "learning_rate": 0.0002933839446263019, "loss": 4.711, "step": 3640 }, { "epoch": 7.246520874751491, "grad_norm": 1.5078125, "learning_rate": 0.0002921036800157115, "loss": 4.731, "step": 3645 }, { "epoch": 7.256461232604374, "grad_norm": 1.5234375, "learning_rate": 0.0002908265163072554, "loss": 4.6717, "step": 3650 }, { "epoch": 7.2664015904572565, "grad_norm": 1.5546875, "learning_rate": 0.0002895524688573361, "loss": 4.5894, "step": 3655 }, { "epoch": 7.276341948310139, "grad_norm": 1.375, "learning_rate": 0.00028828155298488655, "loss": 4.5726, "step": 3660 }, { "epoch": 7.286282306163022, "grad_norm": 1.6015625, "learning_rate": 0.0002870137839711864, "loss": 4.7588, "step": 3665 }, { "epoch": 7.296222664015905, "grad_norm": 1.625, "learning_rate": 0.00028574917705967765, "loss": 4.6552, "step": 3670 }, { "epoch": 7.306163021868787, "grad_norm": 1.40625, "learning_rate": 0.0002844877474557819, "loss": 4.6187, "step": 3675 }, { "epoch": 7.31610337972167, "grad_norm": 1.609375, "learning_rate": 0.00028322951032671727, "loss": 4.612, "step": 3680 }, { "epoch": 7.326043737574553, "grad_norm": 1.6640625, "learning_rate": 0.00028197448080131634, "loss": 4.7434, "step": 3685 }, { "epoch": 7.335984095427436, "grad_norm": 1.6328125, "learning_rate": 0.0002807226739698437, "loss": 4.6918, "step": 3690 }, { "epoch": 7.345924453280318, "grad_norm": 1.796875, "learning_rate": 0.0002794741048838149, "loss": 4.6909, "step": 3695 }, { "epoch": 7.355864811133201, "grad_norm": 1.46875, "learning_rate": 0.0002782287885558155, "loss": 4.7203, "step": 3700 }, { "epoch": 7.365805168986084, "grad_norm": 1.5546875, "learning_rate": 0.0002769867399593201, "loss": 4.7569, "step": 3705 }, { "epoch": 7.3757455268389664, "grad_norm": 1.7265625, "learning_rate": 0.00027574797402851313, "loss": 4.7278, "step": 3710 }, { "epoch": 7.385685884691849, "grad_norm": 1.734375, "learning_rate": 0.00027451250565810857, "loss": 4.7496, "step": 3715 }, { "epoch": 7.395626242544732, "grad_norm": 1.609375, "learning_rate": 0.000273280349703171, "loss": 4.7328, "step": 3720 }, { "epoch": 7.405566600397615, "grad_norm": 1.421875, "learning_rate": 0.00027205152097893695, "loss": 4.7538, "step": 3725 }, { "epoch": 7.415506958250497, "grad_norm": 1.7109375, "learning_rate": 0.00027082603426063735, "loss": 4.5678, "step": 3730 }, { "epoch": 7.42544731610338, "grad_norm": 1.78125, "learning_rate": 0.00026960390428331906, "loss": 4.6914, "step": 3735 }, { "epoch": 7.435387673956263, "grad_norm": 1.71875, "learning_rate": 0.00026838514574166814, "loss": 4.579, "step": 3740 }, { "epoch": 7.4453280318091455, "grad_norm": 1.7734375, "learning_rate": 0.0002671697732898329, "loss": 4.7473, "step": 3745 }, { "epoch": 7.455268389662028, "grad_norm": 1.6171875, "learning_rate": 0.0002659578015412483, "loss": 4.6904, "step": 3750 }, { "epoch": 7.465208747514911, "grad_norm": 1.65625, "learning_rate": 0.00026474924506845934, "loss": 4.6641, "step": 3755 }, { "epoch": 7.475149105367794, "grad_norm": 1.78125, "learning_rate": 0.0002635441184029466, "loss": 4.7065, "step": 3760 }, { "epoch": 7.485089463220676, "grad_norm": 1.7421875, "learning_rate": 0.00026234243603495125, "loss": 4.6873, "step": 3765 }, { "epoch": 7.495029821073558, "grad_norm": 1.4140625, "learning_rate": 0.0002611442124133005, "loss": 4.6487, "step": 3770 }, { "epoch": 7.504970178926442, "grad_norm": 1.53125, "learning_rate": 0.0002599494619452345, "loss": 4.6771, "step": 3775 }, { "epoch": 7.514910536779324, "grad_norm": 1.578125, "learning_rate": 0.0002587581989962328, "loss": 4.652, "step": 3780 }, { "epoch": 7.524850894632207, "grad_norm": 1.59375, "learning_rate": 0.00025757043788984113, "loss": 4.7845, "step": 3785 }, { "epoch": 7.534791252485089, "grad_norm": 1.6015625, "learning_rate": 0.0002563861929075003, "loss": 4.7395, "step": 3790 }, { "epoch": 7.544731610337972, "grad_norm": 1.4921875, "learning_rate": 0.00025520547828837347, "loss": 4.6688, "step": 3795 }, { "epoch": 7.5546719681908545, "grad_norm": 1.7421875, "learning_rate": 0.0002540283082291754, "loss": 4.6914, "step": 3800 }, { "epoch": 7.564612326043737, "grad_norm": 1.578125, "learning_rate": 0.0002528546968840014, "loss": 4.7399, "step": 3805 }, { "epoch": 7.57455268389662, "grad_norm": 1.609375, "learning_rate": 0.00025168465836415785, "loss": 4.8495, "step": 3810 }, { "epoch": 7.584493041749503, "grad_norm": 1.578125, "learning_rate": 0.00025051820673799166, "loss": 4.7914, "step": 3815 }, { "epoch": 7.594433399602385, "grad_norm": 1.546875, "learning_rate": 0.00024935535603072176, "loss": 4.6663, "step": 3820 }, { "epoch": 7.604373757455268, "grad_norm": 1.5234375, "learning_rate": 0.00024819612022427027, "loss": 4.8146, "step": 3825 }, { "epoch": 7.614314115308151, "grad_norm": 1.59375, "learning_rate": 0.00024704051325709407, "loss": 4.7236, "step": 3830 }, { "epoch": 7.6242544731610336, "grad_norm": 1.6328125, "learning_rate": 0.00024588854902401797, "loss": 4.5924, "step": 3835 }, { "epoch": 7.634194831013916, "grad_norm": 1.859375, "learning_rate": 0.0002447402413760668, "loss": 4.7034, "step": 3840 }, { "epoch": 7.644135188866799, "grad_norm": 1.609375, "learning_rate": 0.00024359560412029913, "loss": 4.5523, "step": 3845 }, { "epoch": 7.654075546719682, "grad_norm": 1.46875, "learning_rate": 0.00024245465101964164, "loss": 4.5733, "step": 3850 }, { "epoch": 7.664015904572564, "grad_norm": 1.6875, "learning_rate": 0.00024131739579272317, "loss": 4.6514, "step": 3855 }, { "epoch": 7.673956262425447, "grad_norm": 1.234375, "learning_rate": 0.00024018385211371, "loss": 4.5405, "step": 3860 }, { "epoch": 7.68389662027833, "grad_norm": 1.5390625, "learning_rate": 0.00023905403361214144, "loss": 4.625, "step": 3865 }, { "epoch": 7.693836978131213, "grad_norm": 1.390625, "learning_rate": 0.0002379279538727657, "loss": 4.676, "step": 3870 }, { "epoch": 7.703777335984095, "grad_norm": 1.609375, "learning_rate": 0.00023680562643537689, "loss": 4.742, "step": 3875 }, { "epoch": 7.713717693836978, "grad_norm": 1.6171875, "learning_rate": 0.00023568706479465214, "loss": 4.708, "step": 3880 }, { "epoch": 7.723658051689861, "grad_norm": 1.515625, "learning_rate": 0.00023457228239998906, "loss": 4.8227, "step": 3885 }, { "epoch": 7.7335984095427435, "grad_norm": 1.6015625, "learning_rate": 0.00023346129265534442, "loss": 4.6784, "step": 3890 }, { "epoch": 7.743538767395626, "grad_norm": 1.5625, "learning_rate": 0.0002323541089190727, "loss": 4.6197, "step": 3895 }, { "epoch": 7.753479125248509, "grad_norm": 1.546875, "learning_rate": 0.0002312507445037658, "loss": 4.7725, "step": 3900 }, { "epoch": 7.763419483101392, "grad_norm": 1.6875, "learning_rate": 0.00023015121267609218, "loss": 4.6471, "step": 3905 }, { "epoch": 7.773359840954274, "grad_norm": 1.578125, "learning_rate": 0.0002290555266566385, "loss": 4.7204, "step": 3910 }, { "epoch": 7.783300198807157, "grad_norm": 1.578125, "learning_rate": 0.00022796369961974977, "loss": 4.7072, "step": 3915 }, { "epoch": 7.79324055666004, "grad_norm": 1.6171875, "learning_rate": 0.00022687574469337145, "loss": 4.621, "step": 3920 }, { "epoch": 7.8031809145129225, "grad_norm": 1.6328125, "learning_rate": 0.00022579167495889114, "loss": 4.7284, "step": 3925 }, { "epoch": 7.813121272365805, "grad_norm": 1.453125, "learning_rate": 0.00022471150345098175, "loss": 4.652, "step": 3930 }, { "epoch": 7.823061630218688, "grad_norm": 1.6484375, "learning_rate": 0.00022363524315744437, "loss": 4.723, "step": 3935 }, { "epoch": 7.833001988071571, "grad_norm": 1.6796875, "learning_rate": 0.00022256290701905254, "loss": 4.7788, "step": 3940 }, { "epoch": 7.842942345924453, "grad_norm": 1.8125, "learning_rate": 0.0002214945079293962, "loss": 4.6438, "step": 3945 }, { "epoch": 7.852882703777336, "grad_norm": 1.640625, "learning_rate": 0.000220430058734727, "loss": 4.6901, "step": 3950 }, { "epoch": 7.862823061630219, "grad_norm": 1.5859375, "learning_rate": 0.00021936957223380368, "loss": 4.7047, "step": 3955 }, { "epoch": 7.8727634194831015, "grad_norm": 1.421875, "learning_rate": 0.0002183130611777382, "loss": 4.64, "step": 3960 }, { "epoch": 7.882703777335984, "grad_norm": 1.734375, "learning_rate": 0.00021726053826984248, "loss": 4.6828, "step": 3965 }, { "epoch": 7.892644135188867, "grad_norm": 1.6875, "learning_rate": 0.00021621201616547548, "loss": 4.7413, "step": 3970 }, { "epoch": 7.90258449304175, "grad_norm": 1.65625, "learning_rate": 0.00021516750747189146, "loss": 4.711, "step": 3975 }, { "epoch": 7.912524850894632, "grad_norm": 1.640625, "learning_rate": 0.00021412702474808782, "loss": 4.7372, "step": 3980 }, { "epoch": 7.922465208747515, "grad_norm": 1.5546875, "learning_rate": 0.00021309058050465447, "loss": 4.5801, "step": 3985 }, { "epoch": 7.932405566600398, "grad_norm": 1.6796875, "learning_rate": 0.0002120581872036233, "loss": 4.7983, "step": 3990 }, { "epoch": 7.942345924453281, "grad_norm": 1.703125, "learning_rate": 0.00021102985725831848, "loss": 4.712, "step": 3995 }, { "epoch": 7.952286282306163, "grad_norm": 1.5703125, "learning_rate": 0.00021000560303320687, "loss": 4.6951, "step": 4000 }, { "epoch": 7.952286282306163, "eval_loss": 6.589858531951904, "eval_runtime": 0.9934, "eval_samples_per_second": 3488.121, "eval_steps_per_second": 436.896, "step": 4000 } ], "logging_steps": 5, "max_steps": 5030, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5554263168645120.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }