{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9880715705765408, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009940357852882704, "grad_norm": 7.53125, "learning_rate": 1e-05, "loss": 10.9235, "step": 5 }, { "epoch": 0.019880715705765408, "grad_norm": 7.125, "learning_rate": 2e-05, "loss": 10.8542, "step": 10 }, { "epoch": 0.02982107355864811, "grad_norm": 5.28125, "learning_rate": 3e-05, "loss": 10.6296, "step": 15 }, { "epoch": 0.039761431411530816, "grad_norm": 3.734375, "learning_rate": 4e-05, "loss": 10.4421, "step": 20 }, { "epoch": 0.04970178926441352, "grad_norm": 3.109375, "learning_rate": 5e-05, "loss": 10.3131, "step": 25 }, { "epoch": 0.05964214711729622, "grad_norm": 2.921875, "learning_rate": 6e-05, "loss": 10.2138, "step": 30 }, { "epoch": 0.06958250497017893, "grad_norm": 2.921875, "learning_rate": 7.000000000000001e-05, "loss": 10.061, "step": 35 }, { "epoch": 0.07952286282306163, "grad_norm": 2.671875, "learning_rate": 8e-05, "loss": 9.9397, "step": 40 }, { "epoch": 0.08946322067594434, "grad_norm": 2.625, "learning_rate": 8.999999999999999e-05, "loss": 9.7387, "step": 45 }, { "epoch": 0.09940357852882704, "grad_norm": 2.59375, "learning_rate": 0.0001, "loss": 9.5704, "step": 50 }, { "epoch": 0.10934393638170974, "grad_norm": 2.375, "learning_rate": 0.00011, "loss": 9.3733, "step": 55 }, { "epoch": 0.11928429423459244, "grad_norm": 2.34375, "learning_rate": 0.00012, "loss": 9.1876, "step": 60 }, { "epoch": 0.12922465208747516, "grad_norm": 1.84375, "learning_rate": 0.00013000000000000002, "loss": 9.041, "step": 65 }, { "epoch": 0.13916500994035785, "grad_norm": 1.671875, "learning_rate": 0.00014000000000000001, "loss": 8.8545, "step": 70 }, { "epoch": 0.14910536779324055, "grad_norm": 1.53125, "learning_rate": 0.00015, "loss": 8.6955, "step": 75 }, { "epoch": 0.15904572564612326, "grad_norm": 1.2421875, "learning_rate": 0.00016, "loss": 8.5583, "step": 80 }, { "epoch": 0.16898608349900596, "grad_norm": 1.046875, "learning_rate": 0.00017, "loss": 8.4812, "step": 85 }, { "epoch": 0.17892644135188868, "grad_norm": 1.28125, "learning_rate": 0.00017999999999999998, "loss": 8.4265, "step": 90 }, { "epoch": 0.18886679920477137, "grad_norm": 1.2109375, "learning_rate": 0.00019, "loss": 8.4638, "step": 95 }, { "epoch": 0.1988071570576541, "grad_norm": 1.2734375, "learning_rate": 0.0002, "loss": 8.3998, "step": 100 }, { "epoch": 0.20874751491053678, "grad_norm": 1.265625, "learning_rate": 0.00021, "loss": 8.3605, "step": 105 }, { "epoch": 0.21868787276341947, "grad_norm": 1.3203125, "learning_rate": 0.00022, "loss": 8.3752, "step": 110 }, { "epoch": 0.2286282306163022, "grad_norm": 1.5, "learning_rate": 0.00023, "loss": 8.3556, "step": 115 }, { "epoch": 0.23856858846918488, "grad_norm": 1.3671875, "learning_rate": 0.00024, "loss": 8.3474, "step": 120 }, { "epoch": 0.2485089463220676, "grad_norm": 1.3203125, "learning_rate": 0.00025, "loss": 8.2942, "step": 125 }, { "epoch": 0.2584493041749503, "grad_norm": 1.65625, "learning_rate": 0.00026000000000000003, "loss": 8.316, "step": 130 }, { "epoch": 0.268389662027833, "grad_norm": 1.5234375, "learning_rate": 0.00027, "loss": 8.2375, "step": 135 }, { "epoch": 0.2783300198807157, "grad_norm": 1.734375, "learning_rate": 0.00028000000000000003, "loss": 8.2676, "step": 140 }, { "epoch": 0.2882703777335984, "grad_norm": 1.9375, "learning_rate": 0.00029, "loss": 8.1987, "step": 145 }, { "epoch": 0.2982107355864811, "grad_norm": 1.96875, "learning_rate": 0.0003, "loss": 8.1847, "step": 150 }, { "epoch": 0.3081510934393638, "grad_norm": 1.703125, "learning_rate": 0.00031, "loss": 8.2578, "step": 155 }, { "epoch": 0.31809145129224653, "grad_norm": 1.578125, "learning_rate": 0.00032, "loss": 8.1946, "step": 160 }, { "epoch": 0.32803180914512925, "grad_norm": 1.5859375, "learning_rate": 0.00033, "loss": 8.1504, "step": 165 }, { "epoch": 0.3379721669980119, "grad_norm": 1.640625, "learning_rate": 0.00034, "loss": 8.126, "step": 170 }, { "epoch": 0.34791252485089463, "grad_norm": 1.3359375, "learning_rate": 0.00035, "loss": 8.0893, "step": 175 }, { "epoch": 0.35785288270377735, "grad_norm": 1.75, "learning_rate": 0.00035999999999999997, "loss": 8.0522, "step": 180 }, { "epoch": 0.36779324055666, "grad_norm": 1.640625, "learning_rate": 0.00037, "loss": 8.0653, "step": 185 }, { "epoch": 0.37773359840954274, "grad_norm": 1.515625, "learning_rate": 0.00038, "loss": 8.0899, "step": 190 }, { "epoch": 0.38767395626242546, "grad_norm": 1.5546875, "learning_rate": 0.00039000000000000005, "loss": 8.0308, "step": 195 }, { "epoch": 0.3976143141153082, "grad_norm": 1.703125, "learning_rate": 0.0004, "loss": 7.9695, "step": 200 }, { "epoch": 0.40755467196819084, "grad_norm": 1.5546875, "learning_rate": 0.00041, "loss": 7.9639, "step": 205 }, { "epoch": 0.41749502982107356, "grad_norm": 2.203125, "learning_rate": 0.00042, "loss": 7.9662, "step": 210 }, { "epoch": 0.4274353876739563, "grad_norm": 1.65625, "learning_rate": 0.00043, "loss": 7.9049, "step": 215 }, { "epoch": 0.43737574552683894, "grad_norm": 1.515625, "learning_rate": 0.00044, "loss": 7.9815, "step": 220 }, { "epoch": 0.44731610337972166, "grad_norm": 1.5, "learning_rate": 0.00045000000000000004, "loss": 7.9026, "step": 225 }, { "epoch": 0.4572564612326044, "grad_norm": 1.6484375, "learning_rate": 0.00046, "loss": 7.8753, "step": 230 }, { "epoch": 0.4671968190854871, "grad_norm": 1.515625, "learning_rate": 0.00047, "loss": 7.8852, "step": 235 }, { "epoch": 0.47713717693836977, "grad_norm": 1.625, "learning_rate": 0.00048, "loss": 7.9331, "step": 240 }, { "epoch": 0.4870775347912525, "grad_norm": 1.78125, "learning_rate": 0.00049, "loss": 7.7972, "step": 245 }, { "epoch": 0.4970178926441352, "grad_norm": 1.5234375, "learning_rate": 0.0005, "loss": 7.8323, "step": 250 }, { "epoch": 0.5069582504970179, "grad_norm": 2.375, "learning_rate": 0.00051, "loss": 7.8246, "step": 255 }, { "epoch": 0.5168986083499006, "grad_norm": 1.5546875, "learning_rate": 0.0005200000000000001, "loss": 7.864, "step": 260 }, { "epoch": 0.5268389662027833, "grad_norm": 1.46875, "learning_rate": 0.0005300000000000001, "loss": 7.9518, "step": 265 }, { "epoch": 0.536779324055666, "grad_norm": 1.703125, "learning_rate": 0.00054, "loss": 7.7417, "step": 270 }, { "epoch": 0.5467196819085487, "grad_norm": 1.5078125, "learning_rate": 0.00055, "loss": 7.7927, "step": 275 }, { "epoch": 0.5566600397614314, "grad_norm": 1.546875, "learning_rate": 0.0005600000000000001, "loss": 7.7389, "step": 280 }, { "epoch": 0.5666003976143141, "grad_norm": 1.5625, "learning_rate": 0.00057, "loss": 7.7131, "step": 285 }, { "epoch": 0.5765407554671969, "grad_norm": 1.3125, "learning_rate": 0.00058, "loss": 7.6988, "step": 290 }, { "epoch": 0.5864811133200796, "grad_norm": 1.7421875, "learning_rate": 0.00059, "loss": 7.7104, "step": 295 }, { "epoch": 0.5964214711729622, "grad_norm": 1.859375, "learning_rate": 0.0006, "loss": 7.6605, "step": 300 }, { "epoch": 0.6063618290258449, "grad_norm": 1.59375, "learning_rate": 0.00061, "loss": 7.7188, "step": 305 }, { "epoch": 0.6163021868787276, "grad_norm": 1.734375, "learning_rate": 0.00062, "loss": 7.6618, "step": 310 }, { "epoch": 0.6262425447316103, "grad_norm": 1.46875, "learning_rate": 0.00063, "loss": 7.7635, "step": 315 }, { "epoch": 0.6361829025844931, "grad_norm": 2.265625, "learning_rate": 0.00064, "loss": 7.6721, "step": 320 }, { "epoch": 0.6461232604373758, "grad_norm": 2.953125, "learning_rate": 0.0006500000000000001, "loss": 7.6399, "step": 325 }, { "epoch": 0.6560636182902585, "grad_norm": 1.578125, "learning_rate": 0.00066, "loss": 7.5828, "step": 330 }, { "epoch": 0.6660039761431411, "grad_norm": 1.515625, "learning_rate": 0.00067, "loss": 7.6427, "step": 335 }, { "epoch": 0.6759443339960238, "grad_norm": 1.515625, "learning_rate": 0.00068, "loss": 7.4545, "step": 340 }, { "epoch": 0.6858846918489065, "grad_norm": 1.59375, "learning_rate": 0.00069, "loss": 7.5338, "step": 345 }, { "epoch": 0.6958250497017893, "grad_norm": 1.7265625, "learning_rate": 0.0007, "loss": 7.5311, "step": 350 }, { "epoch": 0.705765407554672, "grad_norm": 2.171875, "learning_rate": 0.00071, "loss": 7.5899, "step": 355 }, { "epoch": 0.7157057654075547, "grad_norm": 1.4375, "learning_rate": 0.0007199999999999999, "loss": 7.5128, "step": 360 }, { "epoch": 0.7256461232604374, "grad_norm": 1.671875, "learning_rate": 0.00073, "loss": 7.4893, "step": 365 }, { "epoch": 0.73558648111332, "grad_norm": 1.59375, "learning_rate": 0.00074, "loss": 7.5447, "step": 370 }, { "epoch": 0.7455268389662028, "grad_norm": 1.609375, "learning_rate": 0.00075, "loss": 7.4271, "step": 375 }, { "epoch": 0.7554671968190855, "grad_norm": 1.46875, "learning_rate": 0.00076, "loss": 7.5216, "step": 380 }, { "epoch": 0.7654075546719682, "grad_norm": 1.515625, "learning_rate": 0.0007700000000000001, "loss": 7.4923, "step": 385 }, { "epoch": 0.7753479125248509, "grad_norm": 1.4453125, "learning_rate": 0.0007800000000000001, "loss": 7.4305, "step": 390 }, { "epoch": 0.7852882703777336, "grad_norm": 1.484375, "learning_rate": 0.00079, "loss": 7.4223, "step": 395 }, { "epoch": 0.7952286282306164, "grad_norm": 1.5703125, "learning_rate": 0.0008, "loss": 7.4875, "step": 400 }, { "epoch": 0.805168986083499, "grad_norm": 1.4921875, "learning_rate": 0.0008100000000000001, "loss": 7.4465, "step": 405 }, { "epoch": 0.8151093439363817, "grad_norm": 1.5625, "learning_rate": 0.00082, "loss": 7.4046, "step": 410 }, { "epoch": 0.8250497017892644, "grad_norm": 1.5, "learning_rate": 0.00083, "loss": 7.384, "step": 415 }, { "epoch": 0.8349900596421471, "grad_norm": 1.5546875, "learning_rate": 0.00084, "loss": 7.3318, "step": 420 }, { "epoch": 0.8449304174950298, "grad_norm": 1.421875, "learning_rate": 0.00085, "loss": 7.4465, "step": 425 }, { "epoch": 0.8548707753479126, "grad_norm": 1.4296875, "learning_rate": 0.00086, "loss": 7.3554, "step": 430 }, { "epoch": 0.8648111332007953, "grad_norm": 1.5078125, "learning_rate": 0.00087, "loss": 7.348, "step": 435 }, { "epoch": 0.8747514910536779, "grad_norm": 1.4921875, "learning_rate": 0.00088, "loss": 7.3536, "step": 440 }, { "epoch": 0.8846918489065606, "grad_norm": 1.59375, "learning_rate": 0.0008900000000000001, "loss": 7.3074, "step": 445 }, { "epoch": 0.8946322067594433, "grad_norm": 1.546875, "learning_rate": 0.0009000000000000001, "loss": 7.4301, "step": 450 }, { "epoch": 0.904572564612326, "grad_norm": 1.5625, "learning_rate": 0.00091, "loss": 7.2948, "step": 455 }, { "epoch": 0.9145129224652088, "grad_norm": 1.6953125, "learning_rate": 0.00092, "loss": 7.4022, "step": 460 }, { "epoch": 0.9244532803180915, "grad_norm": 1.78125, "learning_rate": 0.00093, "loss": 7.3491, "step": 465 }, { "epoch": 0.9343936381709742, "grad_norm": 1.4140625, "learning_rate": 0.00094, "loss": 7.3304, "step": 470 }, { "epoch": 0.9443339960238568, "grad_norm": 1.4296875, "learning_rate": 0.00095, "loss": 7.3213, "step": 475 }, { "epoch": 0.9542743538767395, "grad_norm": 1.53125, "learning_rate": 0.00096, "loss": 7.3184, "step": 480 }, { "epoch": 0.9642147117296223, "grad_norm": 1.4609375, "learning_rate": 0.0009699999999999999, "loss": 7.2904, "step": 485 }, { "epoch": 0.974155069582505, "grad_norm": 1.3671875, "learning_rate": 0.00098, "loss": 7.2904, "step": 490 }, { "epoch": 0.9840954274353877, "grad_norm": 1.3359375, "learning_rate": 0.00099, "loss": 7.2536, "step": 495 }, { "epoch": 0.9940357852882704, "grad_norm": 1.859375, "learning_rate": 0.001, "loss": 7.2545, "step": 500 }, { "epoch": 0.9940357852882704, "eval_loss": 7.392611503601074, "eval_runtime": 0.9938, "eval_samples_per_second": 3486.498, "eval_steps_per_second": 436.693, "step": 500 }, { "epoch": 1.0039761431411531, "grad_norm": 1.546875, "learning_rate": 0.0009999972946377045, "loss": 7.1713, "step": 505 }, { "epoch": 1.0139165009940359, "grad_norm": 1.5390625, "learning_rate": 0.0009999891785833469, "loss": 7.0401, "step": 510 }, { "epoch": 1.0238568588469186, "grad_norm": 1.40625, "learning_rate": 0.0009999756519345133, "loss": 7.0191, "step": 515 }, { "epoch": 1.0337972166998013, "grad_norm": 1.453125, "learning_rate": 0.0009999567148538456, "loss": 7.0774, "step": 520 }, { "epoch": 1.0437375745526838, "grad_norm": 1.5703125, "learning_rate": 0.0009999323675690406, "loss": 7.1122, "step": 525 }, { "epoch": 1.0536779324055665, "grad_norm": 1.65625, "learning_rate": 0.0009999026103728454, "loss": 7.0297, "step": 530 }, { "epoch": 1.0636182902584492, "grad_norm": 1.6875, "learning_rate": 0.0009998674436230558, "loss": 7.0478, "step": 535 }, { "epoch": 1.073558648111332, "grad_norm": 1.5234375, "learning_rate": 0.000999826867742511, "loss": 7.0749, "step": 540 }, { "epoch": 1.0834990059642147, "grad_norm": 1.3984375, "learning_rate": 0.0009997808832190884, "loss": 6.9982, "step": 545 }, { "epoch": 1.0934393638170974, "grad_norm": 1.5, "learning_rate": 0.0009997294906056982, "loss": 7.0269, "step": 550 }, { "epoch": 1.10337972166998, "grad_norm": 1.328125, "learning_rate": 0.000999672690520277, "loss": 7.0031, "step": 555 }, { "epoch": 1.1133200795228628, "grad_norm": 1.28125, "learning_rate": 0.000999610483645779, "loss": 6.9229, "step": 560 }, { "epoch": 1.1232604373757455, "grad_norm": 1.421875, "learning_rate": 0.0009995428707301694, "loss": 6.989, "step": 565 }, { "epoch": 1.1332007952286283, "grad_norm": 1.421875, "learning_rate": 0.0009994698525864147, "loss": 7.0723, "step": 570 }, { "epoch": 1.143141153081511, "grad_norm": 1.453125, "learning_rate": 0.0009993914300924726, "loss": 7.0914, "step": 575 }, { "epoch": 1.1530815109343937, "grad_norm": 1.4609375, "learning_rate": 0.000999307604191282, "loss": 6.9886, "step": 580 }, { "epoch": 1.1630218687872764, "grad_norm": 1.4609375, "learning_rate": 0.0009992183758907518, "loss": 6.993, "step": 585 }, { "epoch": 1.1729622266401591, "grad_norm": 1.4453125, "learning_rate": 0.0009991237462637478, "loss": 6.9879, "step": 590 }, { "epoch": 1.1829025844930419, "grad_norm": 1.328125, "learning_rate": 0.000999023716448081, "loss": 7.034, "step": 595 }, { "epoch": 1.1928429423459244, "grad_norm": 1.5390625, "learning_rate": 0.0009989182876464931, "loss": 6.9752, "step": 600 }, { "epoch": 1.202783300198807, "grad_norm": 1.546875, "learning_rate": 0.0009988074611266423, "loss": 6.8754, "step": 605 }, { "epoch": 1.2127236580516898, "grad_norm": 1.4453125, "learning_rate": 0.000998691238221088, "loss": 6.9923, "step": 610 }, { "epoch": 1.2226640159045725, "grad_norm": 1.375, "learning_rate": 0.0009985696203272752, "loss": 6.885, "step": 615 }, { "epoch": 1.2326043737574552, "grad_norm": 1.53125, "learning_rate": 0.0009984426089075168, "loss": 6.9113, "step": 620 }, { "epoch": 1.242544731610338, "grad_norm": 1.5, "learning_rate": 0.000998310205488977, "loss": 6.9467, "step": 625 }, { "epoch": 1.2524850894632207, "grad_norm": 1.421875, "learning_rate": 0.0009981724116636525, "loss": 6.91, "step": 630 }, { "epoch": 1.2624254473161034, "grad_norm": 1.390625, "learning_rate": 0.0009980292290883526, "loss": 6.9814, "step": 635 }, { "epoch": 1.2723658051689861, "grad_norm": 1.3515625, "learning_rate": 0.000997880659484681, "loss": 6.9393, "step": 640 }, { "epoch": 1.2823061630218688, "grad_norm": 1.5234375, "learning_rate": 0.0009977267046390138, "loss": 6.9344, "step": 645 }, { "epoch": 1.2922465208747516, "grad_norm": 1.484375, "learning_rate": 0.000997567366402478, "loss": 6.8575, "step": 650 }, { "epoch": 1.302186878727634, "grad_norm": 1.3125, "learning_rate": 0.0009974026466909299, "loss": 6.85, "step": 655 }, { "epoch": 1.3121272365805168, "grad_norm": 1.3984375, "learning_rate": 0.000997232547484932, "loss": 6.9196, "step": 660 }, { "epoch": 1.3220675944333995, "grad_norm": 1.6875, "learning_rate": 0.0009970570708297281, "loss": 6.8259, "step": 665 }, { "epoch": 1.3320079522862822, "grad_norm": 1.5390625, "learning_rate": 0.0009968762188352208, "loss": 6.8472, "step": 670 }, { "epoch": 1.341948310139165, "grad_norm": 1.3828125, "learning_rate": 0.0009966899936759436, "loss": 6.8573, "step": 675 }, { "epoch": 1.3518886679920477, "grad_norm": 2.78125, "learning_rate": 0.0009964983975910369, "loss": 6.9833, "step": 680 }, { "epoch": 1.3618290258449304, "grad_norm": 1.3125, "learning_rate": 0.0009963014328842196, "loss": 6.9976, "step": 685 }, { "epoch": 1.371769383697813, "grad_norm": 1.4296875, "learning_rate": 0.0009960991019237627, "loss": 6.8598, "step": 690 }, { "epoch": 1.3817097415506958, "grad_norm": 1.359375, "learning_rate": 0.0009958914071424596, "loss": 6.8171, "step": 695 }, { "epoch": 1.3916500994035785, "grad_norm": 1.578125, "learning_rate": 0.0009956783510375975, "loss": 6.8734, "step": 700 }, { "epoch": 1.4015904572564613, "grad_norm": 1.328125, "learning_rate": 0.0009954599361709276, "loss": 6.8877, "step": 705 }, { "epoch": 1.411530815109344, "grad_norm": 1.34375, "learning_rate": 0.0009952361651686331, "loss": 6.7897, "step": 710 }, { "epoch": 1.4214711729622267, "grad_norm": 1.5546875, "learning_rate": 0.0009950070407212996, "loss": 6.9605, "step": 715 }, { "epoch": 1.4314115308151094, "grad_norm": 1.4140625, "learning_rate": 0.0009947725655838806, "loss": 6.8834, "step": 720 }, { "epoch": 1.4413518886679921, "grad_norm": 1.3984375, "learning_rate": 0.0009945327425756661, "loss": 6.8195, "step": 725 }, { "epoch": 1.4512922465208749, "grad_norm": 1.3828125, "learning_rate": 0.000994287574580248, "loss": 6.8148, "step": 730 }, { "epoch": 1.4612326043737576, "grad_norm": 1.421875, "learning_rate": 0.0009940370645454848, "loss": 6.8626, "step": 735 }, { "epoch": 1.4711729622266403, "grad_norm": 1.453125, "learning_rate": 0.000993781215483467, "loss": 6.8765, "step": 740 }, { "epoch": 1.4811133200795228, "grad_norm": 1.46875, "learning_rate": 0.0009935200304704815, "loss": 6.7831, "step": 745 }, { "epoch": 1.4910536779324055, "grad_norm": 1.4765625, "learning_rate": 0.0009932535126469725, "loss": 6.8274, "step": 750 }, { "epoch": 1.5009940357852882, "grad_norm": 1.515625, "learning_rate": 0.0009929816652175063, "loss": 6.8189, "step": 755 }, { "epoch": 1.510934393638171, "grad_norm": 1.328125, "learning_rate": 0.00099270449145073, "loss": 6.7934, "step": 760 }, { "epoch": 1.5208747514910537, "grad_norm": 1.4140625, "learning_rate": 0.0009924219946793353, "loss": 6.6405, "step": 765 }, { "epoch": 1.5308151093439364, "grad_norm": 1.71875, "learning_rate": 0.0009921341783000158, "loss": 6.6862, "step": 770 }, { "epoch": 1.540755467196819, "grad_norm": 1.3671875, "learning_rate": 0.000991841045773427, "loss": 6.7518, "step": 775 }, { "epoch": 1.5506958250497018, "grad_norm": 1.4296875, "learning_rate": 0.000991542600624146, "loss": 6.7292, "step": 780 }, { "epoch": 1.5606361829025845, "grad_norm": 1.546875, "learning_rate": 0.0009912388464406265, "loss": 6.7062, "step": 785 }, { "epoch": 1.570576540755467, "grad_norm": 1.5, "learning_rate": 0.0009909297868751585, "loss": 6.6082, "step": 790 }, { "epoch": 1.5805168986083498, "grad_norm": 1.3203125, "learning_rate": 0.0009906154256438223, "loss": 6.7426, "step": 795 }, { "epoch": 1.5904572564612325, "grad_norm": 1.484375, "learning_rate": 0.0009902957665264443, "loss": 6.8086, "step": 800 }, { "epoch": 1.6003976143141152, "grad_norm": 1.78125, "learning_rate": 0.0009899708133665529, "loss": 6.736, "step": 805 }, { "epoch": 1.610337972166998, "grad_norm": 1.3203125, "learning_rate": 0.0009896405700713295, "loss": 6.7488, "step": 810 }, { "epoch": 1.6202783300198806, "grad_norm": 1.3203125, "learning_rate": 0.000989305040611565, "loss": 6.7246, "step": 815 }, { "epoch": 1.6302186878727634, "grad_norm": 1.3984375, "learning_rate": 0.0009889642290216085, "loss": 6.7968, "step": 820 }, { "epoch": 1.640159045725646, "grad_norm": 1.4453125, "learning_rate": 0.0009886181393993223, "loss": 6.6922, "step": 825 }, { "epoch": 1.6500994035785288, "grad_norm": 1.4140625, "learning_rate": 0.0009882667759060298, "loss": 6.6635, "step": 830 }, { "epoch": 1.6600397614314115, "grad_norm": 1.3515625, "learning_rate": 0.0009879101427664662, "loss": 6.6233, "step": 835 }, { "epoch": 1.6699801192842942, "grad_norm": 1.375, "learning_rate": 0.0009875482442687294, "loss": 6.7173, "step": 840 }, { "epoch": 1.679920477137177, "grad_norm": 2.03125, "learning_rate": 0.0009871810847642258, "loss": 6.7099, "step": 845 }, { "epoch": 1.6898608349900597, "grad_norm": 1.453125, "learning_rate": 0.00098680866866762, "loss": 6.6863, "step": 850 }, { "epoch": 1.6998011928429424, "grad_norm": 1.3984375, "learning_rate": 0.0009864310004567807, "loss": 6.728, "step": 855 }, { "epoch": 1.7097415506958251, "grad_norm": 1.4296875, "learning_rate": 0.000986048084672727, "loss": 6.6503, "step": 860 }, { "epoch": 1.7196819085487078, "grad_norm": 1.265625, "learning_rate": 0.0009856599259195741, "loss": 6.6758, "step": 865 }, { "epoch": 1.7296222664015906, "grad_norm": 1.4609375, "learning_rate": 0.0009852665288644783, "loss": 6.6894, "step": 870 }, { "epoch": 1.7395626242544733, "grad_norm": 1.5078125, "learning_rate": 0.000984867898237579, "loss": 6.6299, "step": 875 }, { "epoch": 1.749502982107356, "grad_norm": 1.3203125, "learning_rate": 0.000984464038831945, "loss": 6.6652, "step": 880 }, { "epoch": 1.7594433399602387, "grad_norm": 1.53125, "learning_rate": 0.0009840549555035136, "loss": 6.6375, "step": 885 }, { "epoch": 1.7693836978131214, "grad_norm": 1.34375, "learning_rate": 0.0009836406531710342, "loss": 6.6245, "step": 890 }, { "epoch": 1.779324055666004, "grad_norm": 1.359375, "learning_rate": 0.0009832211368160087, "loss": 6.6434, "step": 895 }, { "epoch": 1.7892644135188867, "grad_norm": 1.4375, "learning_rate": 0.0009827964114826314, "loss": 6.5907, "step": 900 }, { "epoch": 1.7992047713717694, "grad_norm": 1.3359375, "learning_rate": 0.0009823664822777285, "loss": 6.6743, "step": 905 }, { "epoch": 1.809145129224652, "grad_norm": 1.3203125, "learning_rate": 0.000981931354370697, "loss": 6.6238, "step": 910 }, { "epoch": 1.8190854870775348, "grad_norm": 1.3828125, "learning_rate": 0.0009814910329934414, "loss": 6.5983, "step": 915 }, { "epoch": 1.8290258449304175, "grad_norm": 1.3984375, "learning_rate": 0.0009810455234403126, "loss": 6.6457, "step": 920 }, { "epoch": 1.8389662027833003, "grad_norm": 1.3125, "learning_rate": 0.000980594831068043, "loss": 6.4873, "step": 925 }, { "epoch": 1.8489065606361827, "grad_norm": 1.234375, "learning_rate": 0.0009801389612956815, "loss": 6.5629, "step": 930 }, { "epoch": 1.8588469184890655, "grad_norm": 1.4921875, "learning_rate": 0.0009796779196045303, "loss": 6.6765, "step": 935 }, { "epoch": 1.8687872763419482, "grad_norm": 1.5234375, "learning_rate": 0.0009792117115380774, "loss": 6.5999, "step": 940 }, { "epoch": 1.878727634194831, "grad_norm": 1.390625, "learning_rate": 0.0009787403427019303, "loss": 6.6639, "step": 945 }, { "epoch": 1.8886679920477136, "grad_norm": 1.3359375, "learning_rate": 0.000978263818763749, "loss": 6.6352, "step": 950 }, { "epoch": 1.8986083499005963, "grad_norm": 1.5859375, "learning_rate": 0.0009777821454531775, "loss": 6.6011, "step": 955 }, { "epoch": 1.908548707753479, "grad_norm": 1.3984375, "learning_rate": 0.0009772953285617748, "loss": 6.5817, "step": 960 }, { "epoch": 1.9184890656063618, "grad_norm": 1.40625, "learning_rate": 0.0009768033739429459, "loss": 6.6113, "step": 965 }, { "epoch": 1.9284294234592445, "grad_norm": 1.328125, "learning_rate": 0.0009763062875118706, "loss": 6.5931, "step": 970 }, { "epoch": 1.9383697813121272, "grad_norm": 1.46875, "learning_rate": 0.0009758040752454326, "loss": 6.6421, "step": 975 }, { "epoch": 1.94831013916501, "grad_norm": 1.234375, "learning_rate": 0.0009752967431821485, "loss": 6.6209, "step": 980 }, { "epoch": 1.9582504970178927, "grad_norm": 1.515625, "learning_rate": 0.0009747842974220936, "loss": 6.5526, "step": 985 }, { "epoch": 1.9681908548707754, "grad_norm": 1.4609375, "learning_rate": 0.00097426674412683, "loss": 6.6085, "step": 990 }, { "epoch": 1.978131212723658, "grad_norm": 1.40625, "learning_rate": 0.0009737440895193317, "loss": 6.548, "step": 995 }, { "epoch": 1.9880715705765408, "grad_norm": 1.3515625, "learning_rate": 0.0009732163398839106, "loss": 6.5648, "step": 1000 }, { "epoch": 1.9880715705765408, "eval_loss": 6.81672477722168, "eval_runtime": 0.9933, "eval_samples_per_second": 3488.505, "eval_steps_per_second": 436.944, "step": 1000 } ], "logging_steps": 5, "max_steps": 5030, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1380849812951040.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }