{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8463973592359101, "eval_steps": 500, "global_step": 15641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005411401823642415, "grad_norm": 3.381409168243408, "learning_rate": 8.999999999999999e-05, "loss": 11.904257202148438, "num_input_tokens_seen": 5242880, "step": 10, "train_runtime": 98.6668, "train_tokens_per_second": 53137.211 }, { "epoch": 0.001082280364728483, "grad_norm": 3.160391092300415, "learning_rate": 0.00019, "loss": 11.522220611572266, "num_input_tokens_seen": 10485760, "step": 20, "train_runtime": 143.6354, "train_tokens_per_second": 73002.603 }, { "epoch": 0.0016234205470927244, "grad_norm": 2.81247878074646, "learning_rate": 0.00029, "loss": 10.88598403930664, "num_input_tokens_seen": 15728640, "step": 30, "train_runtime": 188.573, "train_tokens_per_second": 83408.77 }, { "epoch": 0.002164560729456966, "grad_norm": 2.100623846054077, "learning_rate": 0.00039, "loss": 10.268418884277343, "num_input_tokens_seen": 20971520, "step": 40, "train_runtime": 233.5046, "train_tokens_per_second": 89812.025 }, { "epoch": 0.002705700911821207, "grad_norm": 1.6848973035812378, "learning_rate": 0.00049, "loss": 9.741734313964844, "num_input_tokens_seen": 26214400, "step": 50, "train_runtime": 278.4485, "train_tokens_per_second": 94144.504 }, { "epoch": 0.003246841094185449, "grad_norm": 2.3044769763946533, "learning_rate": 0.00059, "loss": 9.239765167236328, "num_input_tokens_seen": 31457280, "step": 60, "train_runtime": 323.3989, "train_tokens_per_second": 97270.822 }, { "epoch": 0.00378798127654969, "grad_norm": 1.6891608238220215, "learning_rate": 0.0006900000000000001, "loss": 8.75671157836914, "num_input_tokens_seen": 36700160, "step": 70, "train_runtime": 368.3824, "train_tokens_per_second": 99625.16 }, { "epoch": 0.004329121458913932, "grad_norm": 1.304526448249817, "learning_rate": 0.00079, "loss": 8.304008483886719, "num_input_tokens_seen": 41943040, "step": 80, "train_runtime": 413.3956, "train_tokens_per_second": 101459.807 }, { "epoch": 0.004870261641278173, "grad_norm": 0.9729027152061462, "learning_rate": 0.00089, "loss": 7.881130981445312, "num_input_tokens_seen": 47185920, "step": 90, "train_runtime": 458.4234, "train_tokens_per_second": 102930.863 }, { "epoch": 0.005411401823642414, "grad_norm": 0.8812423944473267, "learning_rate": 0.00099, "loss": 7.507221984863281, "num_input_tokens_seen": 52428800, "step": 100, "train_runtime": 503.4688, "train_tokens_per_second": 104135.16 }, { "epoch": 0.005952542006006656, "grad_norm": 0.8653954863548279, "learning_rate": 0.00109, "loss": 7.197725677490235, "num_input_tokens_seen": 57671680, "step": 110, "train_runtime": 548.5148, "train_tokens_per_second": 105141.522 }, { "epoch": 0.006493682188370898, "grad_norm": 0.6527077555656433, "learning_rate": 0.0011899999999999999, "loss": 6.962340545654297, "num_input_tokens_seen": 62914560, "step": 120, "train_runtime": 593.6028, "train_tokens_per_second": 105987.634 }, { "epoch": 0.007034822370735139, "grad_norm": 0.9212841987609863, "learning_rate": 0.0012900000000000001, "loss": 6.748843383789063, "num_input_tokens_seen": 68157440, "step": 130, "train_runtime": 638.7162, "train_tokens_per_second": 106710.057 }, { "epoch": 0.00757596255309938, "grad_norm": 0.8431305885314941, "learning_rate": 0.0013900000000000002, "loss": 6.572750854492187, "num_input_tokens_seen": 73400320, "step": 140, "train_runtime": 683.8844, "train_tokens_per_second": 107328.553 }, { "epoch": 0.008117102735463622, "grad_norm": 1.0355322360992432, "learning_rate": 0.00149, "loss": 6.391636657714844, "num_input_tokens_seen": 78643200, "step": 150, "train_runtime": 729.0444, "train_tokens_per_second": 107871.613 }, { "epoch": 0.008658242917827864, "grad_norm": 1.3339749574661255, "learning_rate": 0.00159, "loss": 6.223532104492188, "num_input_tokens_seen": 83886080, "step": 160, "train_runtime": 774.2149, "train_tokens_per_second": 108349.867 }, { "epoch": 0.009199383100192105, "grad_norm": 1.152486801147461, "learning_rate": 0.00169, "loss": 6.081370162963867, "num_input_tokens_seen": 89128960, "step": 170, "train_runtime": 819.4009, "train_tokens_per_second": 108773.326 }, { "epoch": 0.009740523282556346, "grad_norm": 1.163500189781189, "learning_rate": 0.00179, "loss": 5.940819549560547, "num_input_tokens_seen": 94371840, "step": 180, "train_runtime": 864.5859, "train_tokens_per_second": 109152.654 }, { "epoch": 0.010281663464920588, "grad_norm": 1.2408533096313477, "learning_rate": 0.00189, "loss": 5.812583160400391, "num_input_tokens_seen": 99614720, "step": 190, "train_runtime": 909.7569, "train_tokens_per_second": 109495.979 }, { "epoch": 0.010822803647284829, "grad_norm": 1.1574287414550781, "learning_rate": 0.00199, "loss": 5.6905670166015625, "num_input_tokens_seen": 104857600, "step": 200, "train_runtime": 954.9646, "train_tokens_per_second": 109802.599 }, { "epoch": 0.011363943829649071, "grad_norm": 1.296819806098938, "learning_rate": 0.00209, "loss": 5.591656494140625, "num_input_tokens_seen": 110100480, "step": 210, "train_runtime": 1000.1348, "train_tokens_per_second": 110085.64 }, { "epoch": 0.011905084012013312, "grad_norm": 1.0654325485229492, "learning_rate": 0.00219, "loss": 5.485440444946289, "num_input_tokens_seen": 115343360, "step": 220, "train_runtime": 1045.3265, "train_tokens_per_second": 110341.942 }, { "epoch": 0.012446224194377553, "grad_norm": 1.1002130508422852, "learning_rate": 0.00229, "loss": 5.387198257446289, "num_input_tokens_seen": 120586240, "step": 230, "train_runtime": 1090.5293, "train_tokens_per_second": 110575.879 }, { "epoch": 0.012987364376741795, "grad_norm": 1.0023939609527588, "learning_rate": 0.0023899999999999998, "loss": 5.29501953125, "num_input_tokens_seen": 125829120, "step": 240, "train_runtime": 1135.7379, "train_tokens_per_second": 110790.637 }, { "epoch": 0.013528504559106036, "grad_norm": 0.8933797478675842, "learning_rate": 0.00249, "loss": 5.21459846496582, "num_input_tokens_seen": 131072000, "step": 250, "train_runtime": 1180.9487, "train_tokens_per_second": 110988.737 }, { "epoch": 0.014069644741470278, "grad_norm": 1.0700093507766724, "learning_rate": 0.0025900000000000003, "loss": 5.131219482421875, "num_input_tokens_seen": 136314880, "step": 260, "train_runtime": 1226.1688, "train_tokens_per_second": 111171.384 }, { "epoch": 0.01461078492383452, "grad_norm": 1.194157600402832, "learning_rate": 0.00269, "loss": 5.058176422119141, "num_input_tokens_seen": 141557760, "step": 270, "train_runtime": 1271.4137, "train_tokens_per_second": 111338.863 }, { "epoch": 0.01515192510619876, "grad_norm": 1.097806453704834, "learning_rate": 0.0027900000000000004, "loss": 4.985312652587891, "num_input_tokens_seen": 146800640, "step": 280, "train_runtime": 1316.6654, "train_tokens_per_second": 111494.264 }, { "epoch": 0.015693065288563002, "grad_norm": 0.9698807001113892, "learning_rate": 0.0028899999999999998, "loss": 4.905867004394532, "num_input_tokens_seen": 152043520, "step": 290, "train_runtime": 1361.9146, "train_tokens_per_second": 111639.537 }, { "epoch": 0.016234205470927243, "grad_norm": 0.9057841300964355, "learning_rate": 0.00299, "loss": 4.847021102905273, "num_input_tokens_seen": 157286400, "step": 300, "train_runtime": 1407.157, "train_tokens_per_second": 111776.016 }, { "epoch": 0.016775345653291484, "grad_norm": 0.9901854395866394, "learning_rate": 0.00309, "loss": 4.7939613342285154, "num_input_tokens_seen": 162529280, "step": 310, "train_runtime": 1452.4002, "train_tokens_per_second": 111903.927 }, { "epoch": 0.017316485835655728, "grad_norm": 0.9903898239135742, "learning_rate": 0.00319, "loss": 4.720642852783203, "num_input_tokens_seen": 167772160, "step": 320, "train_runtime": 1497.6772, "train_tokens_per_second": 112021.577 }, { "epoch": 0.01785762601801997, "grad_norm": 1.001007318496704, "learning_rate": 0.0032900000000000004, "loss": 4.6716564178466795, "num_input_tokens_seen": 173015040, "step": 330, "train_runtime": 1542.9302, "train_tokens_per_second": 112134.066 }, { "epoch": 0.01839876620038421, "grad_norm": 0.9968867897987366, "learning_rate": 0.0033900000000000002, "loss": 4.6183723449707035, "num_input_tokens_seen": 178257920, "step": 340, "train_runtime": 1588.1903, "train_tokens_per_second": 112239.647 }, { "epoch": 0.01893990638274845, "grad_norm": 0.9285950064659119, "learning_rate": 0.00349, "loss": 4.572103881835938, "num_input_tokens_seen": 183500800, "step": 350, "train_runtime": 1633.4577, "train_tokens_per_second": 112338.875 }, { "epoch": 0.01948104656511269, "grad_norm": 0.989262044429779, "learning_rate": 0.00359, "loss": 4.529151153564453, "num_input_tokens_seen": 188743680, "step": 360, "train_runtime": 1678.7231, "train_tokens_per_second": 112432.882 }, { "epoch": 0.020022186747476935, "grad_norm": 1.0208923816680908, "learning_rate": 0.00369, "loss": 4.4996803283691404, "num_input_tokens_seen": 193986560, "step": 370, "train_runtime": 1724.0055, "train_tokens_per_second": 112520.846 }, { "epoch": 0.020563326929841176, "grad_norm": 0.9494571089744568, "learning_rate": 0.00379, "loss": 4.4542186737060545, "num_input_tokens_seen": 199229440, "step": 380, "train_runtime": 1769.3003, "train_tokens_per_second": 112603.517 }, { "epoch": 0.021104467112205417, "grad_norm": 0.7988581657409668, "learning_rate": 0.0038900000000000002, "loss": 4.431841278076172, "num_input_tokens_seen": 204472320, "step": 390, "train_runtime": 1818.3866, "train_tokens_per_second": 112447.111 }, { "epoch": 0.021645607294569658, "grad_norm": 0.832046389579773, "learning_rate": 0.0039900000000000005, "loss": 4.396000671386719, "num_input_tokens_seen": 209715200, "step": 400, "train_runtime": 1863.6718, "train_tokens_per_second": 112527.967 }, { "epoch": 0.0221867474769339, "grad_norm": 0.8342320919036865, "learning_rate": 0.00409, "loss": 4.37408332824707, "num_input_tokens_seen": 214958080, "step": 410, "train_runtime": 1909.0219, "train_tokens_per_second": 112601.162 }, { "epoch": 0.022727887659298143, "grad_norm": 0.9766927361488342, "learning_rate": 0.00419, "loss": 4.35020637512207, "num_input_tokens_seen": 220200960, "step": 420, "train_runtime": 1954.3817, "train_tokens_per_second": 112670.397 }, { "epoch": 0.023269027841662383, "grad_norm": 0.8501082062721252, "learning_rate": 0.00429, "loss": 4.312299346923828, "num_input_tokens_seen": 225443840, "step": 430, "train_runtime": 1999.72, "train_tokens_per_second": 112737.702 }, { "epoch": 0.023810168024026624, "grad_norm": 0.8430765867233276, "learning_rate": 0.00439, "loss": 4.310842895507813, "num_input_tokens_seen": 230686720, "step": 440, "train_runtime": 2045.0616, "train_tokens_per_second": 112801.843 }, { "epoch": 0.024351308206390865, "grad_norm": 0.7848499417304993, "learning_rate": 0.00449, "loss": 4.2807456970214846, "num_input_tokens_seen": 235929600, "step": 450, "train_runtime": 2090.4316, "train_tokens_per_second": 112861.668 }, { "epoch": 0.024892448388755106, "grad_norm": 0.9066799879074097, "learning_rate": 0.00459, "loss": 4.257038879394531, "num_input_tokens_seen": 241172480, "step": 460, "train_runtime": 2135.8061, "train_tokens_per_second": 112918.713 }, { "epoch": 0.02543358857111935, "grad_norm": 0.7888091802597046, "learning_rate": 0.00469, "loss": 4.235981750488281, "num_input_tokens_seen": 246415360, "step": 470, "train_runtime": 2181.1434, "train_tokens_per_second": 112975.315 }, { "epoch": 0.02597472875348359, "grad_norm": 0.6987936496734619, "learning_rate": 0.00479, "loss": 4.215058135986328, "num_input_tokens_seen": 251658240, "step": 480, "train_runtime": 2226.5022, "train_tokens_per_second": 113028.517 }, { "epoch": 0.02651586893584783, "grad_norm": 0.8686115741729736, "learning_rate": 0.00489, "loss": 4.219595336914063, "num_input_tokens_seen": 256901120, "step": 490, "train_runtime": 2271.8741, "train_tokens_per_second": 113078.945 }, { "epoch": 0.027057009118212072, "grad_norm": 0.9207416772842407, "learning_rate": 0.0049900000000000005, "loss": 4.1927734375, "num_input_tokens_seen": 262144000, "step": 500, "train_runtime": 2317.2129, "train_tokens_per_second": 113129.008 }, { "epoch": 0.027057009118212072, "eval_loss": 4.101890563964844, "eval_runtime": 2.0015, "eval_samples_per_second": 249.812, "eval_steps_per_second": 3.997, "num_input_tokens_seen": 262144000, "step": 500 }, { "epoch": 0.027598149300576313, "grad_norm": 0.7904194593429565, "learning_rate": 0.0049999972179955365, "loss": 4.170513916015625, "num_input_tokens_seen": 267386880, "step": 510, "train_runtime": 2364.5371, "train_tokens_per_second": 113082.126 }, { "epoch": 0.028139289482940557, "grad_norm": 0.63487708568573, "learning_rate": 0.004999987601198816, "loss": 4.152132034301758, "num_input_tokens_seen": 272629760, "step": 520, "train_runtime": 2409.8408, "train_tokens_per_second": 113131.854 }, { "epoch": 0.028680429665304798, "grad_norm": 0.519443154335022, "learning_rate": 0.0049999711152934586, "loss": 4.145978164672852, "num_input_tokens_seen": 277872640, "step": 530, "train_runtime": 2455.1664, "train_tokens_per_second": 113178.739 }, { "epoch": 0.02922156984766904, "grad_norm": 0.5551373362541199, "learning_rate": 0.004999947760329793, "loss": 4.118004608154297, "num_input_tokens_seen": 283115520, "step": 540, "train_runtime": 2500.4657, "train_tokens_per_second": 113225.118 }, { "epoch": 0.02976271003003328, "grad_norm": 0.48466068506240845, "learning_rate": 0.004999917536379122, "loss": 4.0990447998046875, "num_input_tokens_seen": 288358400, "step": 550, "train_runtime": 2545.7721, "train_tokens_per_second": 113269.528 }, { "epoch": 0.03030385021239752, "grad_norm": 0.4300881624221802, "learning_rate": 0.004999880443533718, "loss": 4.095553207397461, "num_input_tokens_seen": 293601280, "step": 560, "train_runtime": 2591.05, "train_tokens_per_second": 113313.63 }, { "epoch": 0.030844990394761764, "grad_norm": 0.3266729414463043, "learning_rate": 0.004999836481906822, "loss": 4.074318313598633, "num_input_tokens_seen": 298844160, "step": 570, "train_runtime": 2636.411, "train_tokens_per_second": 113352.647 }, { "epoch": 0.031386130577126005, "grad_norm": 0.34210285544395447, "learning_rate": 0.004999785651632649, "loss": 4.055056762695313, "num_input_tokens_seen": 304087040, "step": 580, "train_runtime": 2681.696, "train_tokens_per_second": 113393.554 }, { "epoch": 0.03192727075949025, "grad_norm": 0.3171045482158661, "learning_rate": 0.004999727952866382, "loss": 4.028103637695312, "num_input_tokens_seen": 309329920, "step": 590, "train_runtime": 2726.9513, "train_tokens_per_second": 113434.342 }, { "epoch": 0.032468410941854486, "grad_norm": 0.28656497597694397, "learning_rate": 0.00499966338578417, "loss": 4.014062118530274, "num_input_tokens_seen": 314572800, "step": 600, "train_runtime": 2772.2472, "train_tokens_per_second": 113472.131 }, { "epoch": 0.03300955112421873, "grad_norm": 0.31004276871681213, "learning_rate": 0.004999591950583134, "loss": 4.000431060791016, "num_input_tokens_seen": 319815680, "step": 610, "train_runtime": 2817.5313, "train_tokens_per_second": 113509.186 }, { "epoch": 0.03355069130658297, "grad_norm": 0.29579785466194153, "learning_rate": 0.004999513647481364, "loss": 3.9810386657714845, "num_input_tokens_seen": 325058560, "step": 620, "train_runtime": 2862.8161, "train_tokens_per_second": 113545.036 }, { "epoch": 0.03409183148894721, "grad_norm": 0.28329184651374817, "learning_rate": 0.0049994284767179145, "loss": 3.975200653076172, "num_input_tokens_seen": 330301440, "step": 630, "train_runtime": 2908.1102, "train_tokens_per_second": 113579.411 }, { "epoch": 0.034632971671311456, "grad_norm": 0.2848559319972992, "learning_rate": 0.004999336438552809, "loss": 3.9574630737304686, "num_input_tokens_seen": 335544320, "step": 640, "train_runtime": 2953.403, "train_tokens_per_second": 113612.776 }, { "epoch": 0.035174111853675694, "grad_norm": 0.2778968811035156, "learning_rate": 0.004999237533267034, "loss": 3.951917266845703, "num_input_tokens_seen": 340787200, "step": 650, "train_runtime": 2998.7048, "train_tokens_per_second": 113644.799 }, { "epoch": 0.03571525203603994, "grad_norm": 0.28124260902404785, "learning_rate": 0.004999131761162544, "loss": 3.93038330078125, "num_input_tokens_seen": 346030080, "step": 660, "train_runtime": 3044.0205, "train_tokens_per_second": 113675.344 }, { "epoch": 0.036256392218404175, "grad_norm": 0.25421732664108276, "learning_rate": 0.004999019122562258, "loss": 3.9207611083984375, "num_input_tokens_seen": 351272960, "step": 670, "train_runtime": 3089.3299, "train_tokens_per_second": 113705.227 }, { "epoch": 0.03679753240076842, "grad_norm": 0.2740730345249176, "learning_rate": 0.0049988996178100525, "loss": 3.91453857421875, "num_input_tokens_seen": 356515840, "step": 680, "train_runtime": 3134.6017, "train_tokens_per_second": 113735.61 }, { "epoch": 0.037338672583132664, "grad_norm": 0.2670656740665436, "learning_rate": 0.004998773247270772, "loss": 3.884227752685547, "num_input_tokens_seen": 361758720, "step": 690, "train_runtime": 3179.9122, "train_tokens_per_second": 113763.746 }, { "epoch": 0.0378798127654969, "grad_norm": 0.2549172341823578, "learning_rate": 0.004998640011330221, "loss": 3.880903625488281, "num_input_tokens_seen": 367001600, "step": 700, "train_runtime": 3225.2126, "train_tokens_per_second": 113791.443 }, { "epoch": 0.038420952947861145, "grad_norm": 0.23274943232536316, "learning_rate": 0.004998499910395162, "loss": 3.8808818817138673, "num_input_tokens_seen": 372244480, "step": 710, "train_runtime": 3270.4782, "train_tokens_per_second": 113819.588 }, { "epoch": 0.03896209313022538, "grad_norm": 0.2661728858947754, "learning_rate": 0.004998352944893316, "loss": 3.860551452636719, "num_input_tokens_seen": 377487360, "step": 720, "train_runtime": 3315.7715, "train_tokens_per_second": 113846.012 }, { "epoch": 0.039503233312589627, "grad_norm": 0.27070483565330505, "learning_rate": 0.004998199115273362, "loss": 3.8578773498535157, "num_input_tokens_seen": 382730240, "step": 730, "train_runtime": 3361.0384, "train_tokens_per_second": 113872.616 }, { "epoch": 0.04004437349495387, "grad_norm": 0.2620537281036377, "learning_rate": 0.004998038422004937, "loss": 3.8334423065185548, "num_input_tokens_seen": 387973120, "step": 740, "train_runtime": 3406.3177, "train_tokens_per_second": 113898.102 }, { "epoch": 0.04058551367731811, "grad_norm": 0.24665935337543488, "learning_rate": 0.004997870865578627, "loss": 3.830191802978516, "num_input_tokens_seen": 393216000, "step": 750, "train_runtime": 3451.6094, "train_tokens_per_second": 113922.508 }, { "epoch": 0.04112665385968235, "grad_norm": 0.3058369755744934, "learning_rate": 0.004997696446505975, "loss": 3.81226806640625, "num_input_tokens_seen": 398458880, "step": 760, "train_runtime": 3496.8514, "train_tokens_per_second": 113947.901 }, { "epoch": 0.04166779404204659, "grad_norm": 0.24344538152217865, "learning_rate": 0.004997515165319476, "loss": 3.8191978454589846, "num_input_tokens_seen": 403701760, "step": 770, "train_runtime": 3545.7622, "train_tokens_per_second": 113854.718 }, { "epoch": 0.042208934224410834, "grad_norm": 0.26970189809799194, "learning_rate": 0.004997327022572571, "loss": 3.794965362548828, "num_input_tokens_seen": 408944640, "step": 780, "train_runtime": 3591.0695, "train_tokens_per_second": 113878.231 }, { "epoch": 0.04275007440677508, "grad_norm": 0.2699701189994812, "learning_rate": 0.0049971320188396525, "loss": 3.7990867614746096, "num_input_tokens_seen": 414187520, "step": 790, "train_runtime": 3636.3454, "train_tokens_per_second": 113902.14 }, { "epoch": 0.043291214589139315, "grad_norm": 0.24337078630924225, "learning_rate": 0.004996930154716057, "loss": 3.795510101318359, "num_input_tokens_seen": 419430400, "step": 800, "train_runtime": 3681.6305, "train_tokens_per_second": 113925.175 }, { "epoch": 0.04383235477150356, "grad_norm": 0.24991652369499207, "learning_rate": 0.004996721430818068, "loss": 3.7792850494384767, "num_input_tokens_seen": 424673280, "step": 810, "train_runtime": 3726.9273, "train_tokens_per_second": 113947.293 }, { "epoch": 0.0443734949538678, "grad_norm": 0.22850197553634644, "learning_rate": 0.004996505847782908, "loss": 3.7752288818359374, "num_input_tokens_seen": 429916160, "step": 820, "train_runtime": 3772.1962, "train_tokens_per_second": 113969.725 }, { "epoch": 0.04491463513623204, "grad_norm": 0.24704036116600037, "learning_rate": 0.004996283406268743, "loss": 3.7673095703125, "num_input_tokens_seen": 435159040, "step": 830, "train_runtime": 3817.4555, "train_tokens_per_second": 113991.908 }, { "epoch": 0.045455775318596285, "grad_norm": 0.24149645864963531, "learning_rate": 0.004996054106954677, "loss": 3.767901611328125, "num_input_tokens_seen": 440401920, "step": 840, "train_runtime": 3862.7306, "train_tokens_per_second": 114013.106 }, { "epoch": 0.04599691550096052, "grad_norm": 0.26389098167419434, "learning_rate": 0.004995817950540749, "loss": 3.765447998046875, "num_input_tokens_seen": 445644800, "step": 850, "train_runtime": 3908.0129, "train_tokens_per_second": 114033.605 }, { "epoch": 0.04653805568332477, "grad_norm": 0.2389504611492157, "learning_rate": 0.004995574937747936, "loss": 3.7446453094482424, "num_input_tokens_seen": 450887680, "step": 860, "train_runtime": 3953.2772, "train_tokens_per_second": 114054.151 }, { "epoch": 0.047079195865689004, "grad_norm": 0.21696795523166656, "learning_rate": 0.0049953250693181425, "loss": 3.7382736206054688, "num_input_tokens_seen": 456130560, "step": 870, "train_runtime": 3998.5453, "train_tokens_per_second": 114074.125 }, { "epoch": 0.04762033604805325, "grad_norm": 0.23217777907848358, "learning_rate": 0.004995068346014207, "loss": 3.7418495178222657, "num_input_tokens_seen": 461373440, "step": 880, "train_runtime": 4043.8212, "train_tokens_per_second": 114093.431 }, { "epoch": 0.04816147623041749, "grad_norm": 0.25520190596580505, "learning_rate": 0.004994804768619892, "loss": 3.7273784637451173, "num_input_tokens_seen": 466616320, "step": 890, "train_runtime": 4089.1251, "train_tokens_per_second": 114111.53 }, { "epoch": 0.04870261641278173, "grad_norm": 0.2495919018983841, "learning_rate": 0.004994534337939889, "loss": 3.7182594299316407, "num_input_tokens_seen": 471859200, "step": 900, "train_runtime": 4134.3995, "train_tokens_per_second": 114130.045 }, { "epoch": 0.049243756595145974, "grad_norm": 0.2571962773799896, "learning_rate": 0.00499425705479981, "loss": 3.7261619567871094, "num_input_tokens_seen": 477102080, "step": 910, "train_runtime": 4179.6624, "train_tokens_per_second": 114148.472 }, { "epoch": 0.04978489677751021, "grad_norm": 0.2216644585132599, "learning_rate": 0.004993972920046188, "loss": 3.705414581298828, "num_input_tokens_seen": 482344960, "step": 920, "train_runtime": 4224.9503, "train_tokens_per_second": 114165.831 }, { "epoch": 0.050326036959874455, "grad_norm": 0.2777004539966583, "learning_rate": 0.004993681934546471, "loss": 3.707286834716797, "num_input_tokens_seen": 487587840, "step": 930, "train_runtime": 4270.2223, "train_tokens_per_second": 114183.246 }, { "epoch": 0.0508671771422387, "grad_norm": 0.23501209914684296, "learning_rate": 0.004993384099189028, "loss": 3.7012203216552733, "num_input_tokens_seen": 492830720, "step": 940, "train_runtime": 4315.4919, "train_tokens_per_second": 114200.358 }, { "epoch": 0.05140831732460294, "grad_norm": 0.2504929304122925, "learning_rate": 0.004993079414883134, "loss": 3.7007171630859377, "num_input_tokens_seen": 498073600, "step": 950, "train_runtime": 4360.758, "train_tokens_per_second": 114217.207 }, { "epoch": 0.05194945750696718, "grad_norm": 0.265903502702713, "learning_rate": 0.004992767882558976, "loss": 3.6977813720703123, "num_input_tokens_seen": 503316480, "step": 960, "train_runtime": 4406.0254, "train_tokens_per_second": 114233.676 }, { "epoch": 0.05249059768933142, "grad_norm": 0.22946324944496155, "learning_rate": 0.00499244950316765, "loss": 3.6873912811279297, "num_input_tokens_seen": 508559360, "step": 970, "train_runtime": 4451.29, "train_tokens_per_second": 114249.883 }, { "epoch": 0.05303173787169566, "grad_norm": 0.2554706633090973, "learning_rate": 0.004992124277681152, "loss": 3.6791450500488283, "num_input_tokens_seen": 513802240, "step": 980, "train_runtime": 4496.5462, "train_tokens_per_second": 114265.975 }, { "epoch": 0.05357287805405991, "grad_norm": 0.22852079570293427, "learning_rate": 0.004991792207092381, "loss": 3.677058792114258, "num_input_tokens_seen": 519045120, "step": 990, "train_runtime": 4541.8028, "train_tokens_per_second": 114281.739 }, { "epoch": 0.054114018236424144, "grad_norm": 0.24798494577407837, "learning_rate": 0.004991453292415134, "loss": 3.657318115234375, "num_input_tokens_seen": 524288000, "step": 1000, "train_runtime": 4587.0445, "train_tokens_per_second": 114297.561 }, { "epoch": 0.054114018236424144, "eval_loss": 3.6001899242401123, "eval_runtime": 1.9848, "eval_samples_per_second": 251.913, "eval_steps_per_second": 4.031, "num_input_tokens_seen": 524288000, "step": 1000 }, { "epoch": 0.05465515841878839, "grad_norm": 0.223563551902771, "learning_rate": 0.0049911075346841, "loss": 3.666912841796875, "num_input_tokens_seen": 529530880, "step": 1010, "train_runtime": 4637.4751, "train_tokens_per_second": 114185.17 }, { "epoch": 0.055196298601152625, "grad_norm": 0.24604271352291107, "learning_rate": 0.004990754934954863, "loss": 3.6610164642333984, "num_input_tokens_seen": 534773760, "step": 1020, "train_runtime": 4682.7302, "train_tokens_per_second": 114201.276 }, { "epoch": 0.05573743878351687, "grad_norm": 0.2436058074235916, "learning_rate": 0.004990395494303893, "loss": 3.6538921356201173, "num_input_tokens_seen": 540016640, "step": 1030, "train_runtime": 4727.9737, "train_tokens_per_second": 114217.353 }, { "epoch": 0.056278578965881114, "grad_norm": 0.24788981676101685, "learning_rate": 0.004990029213828546, "loss": 3.6453926086425783, "num_input_tokens_seen": 545259520, "step": 1040, "train_runtime": 4773.2764, "train_tokens_per_second": 114231.708 }, { "epoch": 0.05681971914824535, "grad_norm": 0.2355376034975052, "learning_rate": 0.00498965609464706, "loss": 3.653607940673828, "num_input_tokens_seen": 550502400, "step": 1050, "train_runtime": 4818.5445, "train_tokens_per_second": 114246.615 }, { "epoch": 0.057360859330609595, "grad_norm": 0.24511760473251343, "learning_rate": 0.0049892761378985484, "loss": 3.655783462524414, "num_input_tokens_seen": 555745280, "step": 1060, "train_runtime": 4863.8191, "train_tokens_per_second": 114261.091 }, { "epoch": 0.05790199951297383, "grad_norm": 0.2463475465774536, "learning_rate": 0.004988889344743005, "loss": 3.6497840881347656, "num_input_tokens_seen": 560988160, "step": 1070, "train_runtime": 4909.1151, "train_tokens_per_second": 114274.804 }, { "epoch": 0.05844313969533808, "grad_norm": 0.24649877846240997, "learning_rate": 0.00498849571636129, "loss": 3.6289398193359377, "num_input_tokens_seen": 566231040, "step": 1080, "train_runtime": 4954.3585, "train_tokens_per_second": 114289.476 }, { "epoch": 0.05898427987770232, "grad_norm": 0.21440814435482025, "learning_rate": 0.004988095253955132, "loss": 3.6420303344726563, "num_input_tokens_seen": 571473920, "step": 1090, "train_runtime": 4999.6131, "train_tokens_per_second": 114303.629 }, { "epoch": 0.05952542006006656, "grad_norm": 0.23143576085567474, "learning_rate": 0.004987687958747124, "loss": 3.636464309692383, "num_input_tokens_seen": 576716800, "step": 1100, "train_runtime": 5044.8489, "train_tokens_per_second": 114317.952 }, { "epoch": 0.0600665602424308, "grad_norm": 0.216554194688797, "learning_rate": 0.0049872738319807226, "loss": 3.6284786224365235, "num_input_tokens_seen": 581959680, "step": 1110, "train_runtime": 5090.1116, "train_tokens_per_second": 114331.419 }, { "epoch": 0.06060770042479504, "grad_norm": 0.21454273164272308, "learning_rate": 0.004986852874920234, "loss": 3.628643035888672, "num_input_tokens_seen": 587202560, "step": 1120, "train_runtime": 5135.379, "train_tokens_per_second": 114344.542 }, { "epoch": 0.061148840607159284, "grad_norm": 0.22195634245872498, "learning_rate": 0.004986425088850824, "loss": 3.6224212646484375, "num_input_tokens_seen": 592445440, "step": 1130, "train_runtime": 5180.6463, "train_tokens_per_second": 114357.438 }, { "epoch": 0.06168998078952353, "grad_norm": 0.23462195694446564, "learning_rate": 0.004985990475078501, "loss": 3.614238739013672, "num_input_tokens_seen": 597688320, "step": 1140, "train_runtime": 5225.8696, "train_tokens_per_second": 114371.074 }, { "epoch": 0.062231120971887766, "grad_norm": 0.2454216629266739, "learning_rate": 0.004985549034930123, "loss": 3.6097618103027345, "num_input_tokens_seen": 602931200, "step": 1150, "train_runtime": 5274.713, "train_tokens_per_second": 114305.973 }, { "epoch": 0.06277226115425201, "grad_norm": 0.22363615036010742, "learning_rate": 0.004985100769753384, "loss": 3.605723571777344, "num_input_tokens_seen": 608174080, "step": 1160, "train_runtime": 5319.9954, "train_tokens_per_second": 114318.535 }, { "epoch": 0.06331340133661625, "grad_norm": 0.2078346163034439, "learning_rate": 0.00498464568091682, "loss": 3.602735900878906, "num_input_tokens_seen": 613416960, "step": 1170, "train_runtime": 5365.2147, "train_tokens_per_second": 114332.23 }, { "epoch": 0.0638545415189805, "grad_norm": 0.21972794830799103, "learning_rate": 0.004984183769809795, "loss": 3.598741912841797, "num_input_tokens_seen": 618659840, "step": 1180, "train_runtime": 5410.4312, "train_tokens_per_second": 114345.754 }, { "epoch": 0.06439568170134473, "grad_norm": 0.2427527755498886, "learning_rate": 0.0049837150378425005, "loss": 3.596208190917969, "num_input_tokens_seen": 623902720, "step": 1190, "train_runtime": 5455.6665, "train_tokens_per_second": 114358.662 }, { "epoch": 0.06493682188370897, "grad_norm": 0.2279594987630844, "learning_rate": 0.004983239486445956, "loss": 3.59366455078125, "num_input_tokens_seen": 629145600, "step": 1200, "train_runtime": 5500.922, "train_tokens_per_second": 114370.936 }, { "epoch": 0.06547796206607322, "grad_norm": 0.23130950331687927, "learning_rate": 0.004982757117071998, "loss": 3.592302703857422, "num_input_tokens_seen": 634388480, "step": 1210, "train_runtime": 5546.1769, "train_tokens_per_second": 114383.024 }, { "epoch": 0.06601910224843746, "grad_norm": 0.2286449670791626, "learning_rate": 0.004982267931193276, "loss": 3.5859790802001954, "num_input_tokens_seen": 639631360, "step": 1220, "train_runtime": 5591.4258, "train_tokens_per_second": 114395.038 }, { "epoch": 0.0665602424308017, "grad_norm": 0.23779889941215515, "learning_rate": 0.004981771930303254, "loss": 3.586525726318359, "num_input_tokens_seen": 644874240, "step": 1230, "train_runtime": 5636.6531, "train_tokens_per_second": 114407.297 }, { "epoch": 0.06710138261316594, "grad_norm": 0.23324504494667053, "learning_rate": 0.004981269115916199, "loss": 3.579142379760742, "num_input_tokens_seen": 650117120, "step": 1240, "train_runtime": 5681.8961, "train_tokens_per_second": 114419.044 }, { "epoch": 0.06764252279553018, "grad_norm": 0.2067607045173645, "learning_rate": 0.004980759489567181, "loss": 3.5813358306884764, "num_input_tokens_seen": 655360000, "step": 1250, "train_runtime": 5727.162, "train_tokens_per_second": 114430.148 }, { "epoch": 0.06818366297789442, "grad_norm": 0.20190924406051636, "learning_rate": 0.004980243052812064, "loss": 3.572435760498047, "num_input_tokens_seen": 660602880, "step": 1260, "train_runtime": 5772.4268, "train_tokens_per_second": 114441.102 }, { "epoch": 0.06872480316025867, "grad_norm": 0.19773253798484802, "learning_rate": 0.004979719807227508, "loss": 3.5610916137695314, "num_input_tokens_seen": 665845760, "step": 1270, "train_runtime": 5817.687, "train_tokens_per_second": 114451.974 }, { "epoch": 0.06926594334262291, "grad_norm": 0.21706561744213104, "learning_rate": 0.004979189754410956, "loss": 3.5655101776123046, "num_input_tokens_seen": 671088640, "step": 1280, "train_runtime": 5862.9264, "train_tokens_per_second": 114463.084 }, { "epoch": 0.06980708352498714, "grad_norm": 0.23492570221424103, "learning_rate": 0.004978652895980635, "loss": 3.571335220336914, "num_input_tokens_seen": 676331520, "step": 1290, "train_runtime": 5908.1779, "train_tokens_per_second": 114473.792 }, { "epoch": 0.07034822370735139, "grad_norm": 0.23728297650814056, "learning_rate": 0.004978109233575551, "loss": 3.5683116912841797, "num_input_tokens_seen": 681574400, "step": 1300, "train_runtime": 5953.4117, "train_tokens_per_second": 114484.674 }, { "epoch": 0.07088936388971563, "grad_norm": 0.2128531038761139, "learning_rate": 0.0049775587688554775, "loss": 3.553203582763672, "num_input_tokens_seen": 686817280, "step": 1310, "train_runtime": 5998.6316, "train_tokens_per_second": 114495.659 }, { "epoch": 0.07143050407207988, "grad_norm": 0.22945411503314972, "learning_rate": 0.004977001503500959, "loss": 3.5565677642822267, "num_input_tokens_seen": 692060160, "step": 1320, "train_runtime": 6043.8634, "train_tokens_per_second": 114506.254 }, { "epoch": 0.07197164425444412, "grad_norm": 0.21733802556991577, "learning_rate": 0.004976437439213302, "loss": 3.5509429931640626, "num_input_tokens_seen": 697303040, "step": 1330, "train_runtime": 6089.0954, "train_tokens_per_second": 114516.688 }, { "epoch": 0.07251278443680835, "grad_norm": 0.24347279965877533, "learning_rate": 0.004975866577714568, "loss": 3.54642333984375, "num_input_tokens_seen": 702545920, "step": 1340, "train_runtime": 6134.3055, "train_tokens_per_second": 114527.376 }, { "epoch": 0.0730539246191726, "grad_norm": 0.21520718932151794, "learning_rate": 0.004975288920747571, "loss": 3.550141143798828, "num_input_tokens_seen": 707788800, "step": 1350, "train_runtime": 6179.5539, "train_tokens_per_second": 114537.201 }, { "epoch": 0.07359506480153684, "grad_norm": 0.23248061537742615, "learning_rate": 0.0049747044700758705, "loss": 3.5488357543945312, "num_input_tokens_seen": 713031680, "step": 1360, "train_runtime": 6224.7742, "train_tokens_per_second": 114547.397 }, { "epoch": 0.07413620498390108, "grad_norm": 0.23462453484535217, "learning_rate": 0.004974113227483768, "loss": 3.5416290283203127, "num_input_tokens_seen": 718274560, "step": 1370, "train_runtime": 6269.9891, "train_tokens_per_second": 114557.544 }, { "epoch": 0.07467734516626533, "grad_norm": 0.2253153920173645, "learning_rate": 0.004973515194776301, "loss": 3.540643310546875, "num_input_tokens_seen": 723517440, "step": 1380, "train_runtime": 6315.2141, "train_tokens_per_second": 114567.365 }, { "epoch": 0.07521848534862956, "grad_norm": 0.21088625490665436, "learning_rate": 0.0049729103737792355, "loss": 3.543656921386719, "num_input_tokens_seen": 728760320, "step": 1390, "train_runtime": 6360.4473, "train_tokens_per_second": 114576.898 }, { "epoch": 0.0757596255309938, "grad_norm": 0.2161734253168106, "learning_rate": 0.00497229876633906, "loss": 3.5383941650390627, "num_input_tokens_seen": 734003200, "step": 1400, "train_runtime": 6405.6811, "train_tokens_per_second": 114586.285 }, { "epoch": 0.07630076571335805, "grad_norm": 0.20709098875522614, "learning_rate": 0.004971680374322986, "loss": 3.5313690185546873, "num_input_tokens_seen": 739246080, "step": 1410, "train_runtime": 6450.8882, "train_tokens_per_second": 114596.015 }, { "epoch": 0.07684190589572229, "grad_norm": 0.20399479568004608, "learning_rate": 0.004971055199618935, "loss": 3.525136184692383, "num_input_tokens_seen": 744488960, "step": 1420, "train_runtime": 6496.1128, "train_tokens_per_second": 114605.3 }, { "epoch": 0.07738304607808653, "grad_norm": 0.21809037029743195, "learning_rate": 0.004970423244135538, "loss": 3.53038330078125, "num_input_tokens_seen": 749731840, "step": 1430, "train_runtime": 6541.3447, "train_tokens_per_second": 114614.33 }, { "epoch": 0.07792418626045076, "grad_norm": 0.22500748932361603, "learning_rate": 0.004969784509802125, "loss": 3.5225593566894533, "num_input_tokens_seen": 754974720, "step": 1440, "train_runtime": 6586.5647, "train_tokens_per_second": 114623.442 }, { "epoch": 0.07846532644281501, "grad_norm": 0.22614286839962006, "learning_rate": 0.0049691389985687204, "loss": 3.5291175842285156, "num_input_tokens_seen": 760217600, "step": 1450, "train_runtime": 6631.7972, "train_tokens_per_second": 114632.215 }, { "epoch": 0.07900646662517925, "grad_norm": 0.2029477208852768, "learning_rate": 0.004968486712406044, "loss": 3.5189224243164063, "num_input_tokens_seen": 765460480, "step": 1460, "train_runtime": 6677.0236, "train_tokens_per_second": 114640.973 }, { "epoch": 0.0795476068075435, "grad_norm": 0.2242126613855362, "learning_rate": 0.004967827653305494, "loss": 3.504582977294922, "num_input_tokens_seen": 770703360, "step": 1470, "train_runtime": 6722.2724, "train_tokens_per_second": 114649.232 }, { "epoch": 0.08008874698990774, "grad_norm": 0.2245851755142212, "learning_rate": 0.004967161823279147, "loss": 3.5151710510253906, "num_input_tokens_seen": 775946240, "step": 1480, "train_runtime": 6767.5092, "train_tokens_per_second": 114657.582 }, { "epoch": 0.08062988717227197, "grad_norm": 0.21762171387672424, "learning_rate": 0.004966489224359752, "loss": 3.510267639160156, "num_input_tokens_seen": 781189120, "step": 1490, "train_runtime": 6812.7609, "train_tokens_per_second": 114665.571 }, { "epoch": 0.08117102735463622, "grad_norm": 0.22595250606536865, "learning_rate": 0.0049658098586007225, "loss": 3.515250396728516, "num_input_tokens_seen": 786432000, "step": 1500, "train_runtime": 6857.9968, "train_tokens_per_second": 114673.719 }, { "epoch": 0.08117102735463622, "eval_loss": 3.439197540283203, "eval_runtime": 1.986, "eval_samples_per_second": 251.759, "eval_steps_per_second": 4.028, "num_input_tokens_seen": 786432000, "step": 1500 }, { "epoch": 0.08171216753700046, "grad_norm": 0.2041054517030716, "learning_rate": 0.00496512372807613, "loss": 3.5051536560058594, "num_input_tokens_seen": 791674880, "step": 1510, "train_runtime": 6905.2279, "train_tokens_per_second": 114648.624 }, { "epoch": 0.0822533077193647, "grad_norm": 0.20704089105129242, "learning_rate": 0.004964430834880702, "loss": 3.498210906982422, "num_input_tokens_seen": 796917760, "step": 1520, "train_runtime": 6950.4841, "train_tokens_per_second": 114656.439 }, { "epoch": 0.08279444790172895, "grad_norm": 0.2235393226146698, "learning_rate": 0.0049637311811298055, "loss": 3.510853958129883, "num_input_tokens_seen": 802160640, "step": 1530, "train_runtime": 6999.4133, "train_tokens_per_second": 114603.982 }, { "epoch": 0.08333558808409318, "grad_norm": 0.19351601600646973, "learning_rate": 0.004963024768959454, "loss": 3.4970939636230467, "num_input_tokens_seen": 807403520, "step": 1540, "train_runtime": 7044.6706, "train_tokens_per_second": 114611.962 }, { "epoch": 0.08387672826645742, "grad_norm": 0.2104959934949875, "learning_rate": 0.0049623116005262915, "loss": 3.5016387939453124, "num_input_tokens_seen": 812646400, "step": 1550, "train_runtime": 7089.915, "train_tokens_per_second": 114620.048 }, { "epoch": 0.08441786844882167, "grad_norm": 0.25421494245529175, "learning_rate": 0.004961591678007588, "loss": 3.50089111328125, "num_input_tokens_seen": 817889280, "step": 1560, "train_runtime": 7135.1609, "train_tokens_per_second": 114628.008 }, { "epoch": 0.08495900863118591, "grad_norm": 0.2085292786359787, "learning_rate": 0.004960865003601232, "loss": 3.5003082275390627, "num_input_tokens_seen": 823132160, "step": 1570, "train_runtime": 7180.3935, "train_tokens_per_second": 114636.079 }, { "epoch": 0.08550014881355016, "grad_norm": 0.2042287439107895, "learning_rate": 0.00496013157952573, "loss": 3.495660400390625, "num_input_tokens_seen": 828375040, "step": 1580, "train_runtime": 7225.6594, "train_tokens_per_second": 114643.521 }, { "epoch": 0.08604128899591439, "grad_norm": 0.21099670231342316, "learning_rate": 0.004959391408020191, "loss": 3.4938674926757813, "num_input_tokens_seen": 833617920, "step": 1590, "train_runtime": 7270.8988, "train_tokens_per_second": 114651.29 }, { "epoch": 0.08658242917827863, "grad_norm": 0.19382232427597046, "learning_rate": 0.004958644491344324, "loss": 3.4875198364257813, "num_input_tokens_seen": 838860800, "step": 1600, "train_runtime": 7316.1586, "train_tokens_per_second": 114658.64 }, { "epoch": 0.08712356936064287, "grad_norm": 0.20325426757335663, "learning_rate": 0.0049578908317784295, "loss": 3.487265777587891, "num_input_tokens_seen": 844103680, "step": 1610, "train_runtime": 7361.3918, "train_tokens_per_second": 114666.316 }, { "epoch": 0.08766470954300712, "grad_norm": 0.21367783844470978, "learning_rate": 0.004957130431623399, "loss": 3.4908119201660157, "num_input_tokens_seen": 849346560, "step": 1620, "train_runtime": 7406.6443, "train_tokens_per_second": 114673.6 }, { "epoch": 0.08820584972537136, "grad_norm": 0.19166916608810425, "learning_rate": 0.004956363293200697, "loss": 3.478108215332031, "num_input_tokens_seen": 854589440, "step": 1630, "train_runtime": 7451.8697, "train_tokens_per_second": 114681.211 }, { "epoch": 0.0887469899077356, "grad_norm": 0.22475136816501617, "learning_rate": 0.004955589418852363, "loss": 3.4743488311767576, "num_input_tokens_seen": 859832320, "step": 1640, "train_runtime": 7497.1227, "train_tokens_per_second": 114688.308 }, { "epoch": 0.08928813009009984, "grad_norm": 0.22001579403877258, "learning_rate": 0.004954808810940998, "loss": 3.481397247314453, "num_input_tokens_seen": 865075200, "step": 1650, "train_runtime": 7542.3425, "train_tokens_per_second": 114695.826 }, { "epoch": 0.08982927027246408, "grad_norm": 0.20330502092838287, "learning_rate": 0.0049540214718497635, "loss": 3.47830810546875, "num_input_tokens_seen": 870318080, "step": 1660, "train_runtime": 7587.5621, "train_tokens_per_second": 114703.256 }, { "epoch": 0.09037041045482833, "grad_norm": 0.21012984216213226, "learning_rate": 0.00495322740398237, "loss": 3.470731735229492, "num_input_tokens_seen": 875560960, "step": 1670, "train_runtime": 7632.7891, "train_tokens_per_second": 114710.488 }, { "epoch": 0.09091155063719257, "grad_norm": 0.20543314516544342, "learning_rate": 0.004952426609763068, "loss": 3.4727859497070312, "num_input_tokens_seen": 880803840, "step": 1680, "train_runtime": 7678.0113, "train_tokens_per_second": 114717.706 }, { "epoch": 0.0914526908195568, "grad_norm": 0.2099304497241974, "learning_rate": 0.004951619091636649, "loss": 3.462004852294922, "num_input_tokens_seen": 886046720, "step": 1690, "train_runtime": 7723.225, "train_tokens_per_second": 114724.965 }, { "epoch": 0.09199383100192104, "grad_norm": 0.19785360991954803, "learning_rate": 0.004950804852068425, "loss": 3.468863677978516, "num_input_tokens_seen": 891289600, "step": 1700, "train_runtime": 7768.4346, "train_tokens_per_second": 114732.202 }, { "epoch": 0.09253497118428529, "grad_norm": 0.20223312079906464, "learning_rate": 0.004949983893544234, "loss": 3.47713623046875, "num_input_tokens_seen": 896532480, "step": 1710, "train_runtime": 7813.6346, "train_tokens_per_second": 114739.494 }, { "epoch": 0.09307611136664953, "grad_norm": 0.21073880791664124, "learning_rate": 0.004949156218570423, "loss": 3.4744213104248045, "num_input_tokens_seen": 901775360, "step": 1720, "train_runtime": 7858.8265, "train_tokens_per_second": 114746.822 }, { "epoch": 0.09361725154901378, "grad_norm": 0.21444642543792725, "learning_rate": 0.004948321829673847, "loss": 3.4606704711914062, "num_input_tokens_seen": 907018240, "step": 1730, "train_runtime": 7904.0563, "train_tokens_per_second": 114753.514 }, { "epoch": 0.09415839173137801, "grad_norm": 0.21360410749912262, "learning_rate": 0.004947480729401857, "loss": 3.468334197998047, "num_input_tokens_seen": 912261120, "step": 1740, "train_runtime": 7949.2813, "train_tokens_per_second": 114760.201 }, { "epoch": 0.09469953191374225, "grad_norm": 0.20467202365398407, "learning_rate": 0.0049466329203222935, "loss": 3.451313018798828, "num_input_tokens_seen": 917504000, "step": 1750, "train_runtime": 7994.4951, "train_tokens_per_second": 114766.973 }, { "epoch": 0.0952406720961065, "grad_norm": 0.2081199437379837, "learning_rate": 0.004945778405023478, "loss": 3.4613468170166017, "num_input_tokens_seen": 922746880, "step": 1760, "train_runtime": 8039.7393, "train_tokens_per_second": 114773.235 }, { "epoch": 0.09578181227847074, "grad_norm": 0.2067294865846634, "learning_rate": 0.004944917186114206, "loss": 3.4611587524414062, "num_input_tokens_seen": 927989760, "step": 1770, "train_runtime": 8084.9796, "train_tokens_per_second": 114779.481 }, { "epoch": 0.09632295246083498, "grad_norm": 0.19931091368198395, "learning_rate": 0.00494404926622374, "loss": 3.462744140625, "num_input_tokens_seen": 933232640, "step": 1780, "train_runtime": 8130.2373, "train_tokens_per_second": 114785.413 }, { "epoch": 0.09686409264319921, "grad_norm": 0.213568776845932, "learning_rate": 0.004943174648001798, "loss": 3.456349182128906, "num_input_tokens_seen": 938475520, "step": 1790, "train_runtime": 8175.469, "train_tokens_per_second": 114791.644 }, { "epoch": 0.09740523282556346, "grad_norm": 0.20064200460910797, "learning_rate": 0.004942293334118552, "loss": 3.4558643341064452, "num_input_tokens_seen": 943718400, "step": 1800, "train_runtime": 8220.7059, "train_tokens_per_second": 114797.732 }, { "epoch": 0.0979463730079277, "grad_norm": 0.21110571920871735, "learning_rate": 0.004941405327264611, "loss": 3.4533897399902345, "num_input_tokens_seen": 948961280, "step": 1810, "train_runtime": 8265.9542, "train_tokens_per_second": 114803.598 }, { "epoch": 0.09848751319029195, "grad_norm": 0.20624655485153198, "learning_rate": 0.0049405106301510186, "loss": 3.4500003814697267, "num_input_tokens_seen": 954204160, "step": 1820, "train_runtime": 8311.1987, "train_tokens_per_second": 114809.45 }, { "epoch": 0.09902865337265619, "grad_norm": 0.20945154130458832, "learning_rate": 0.004939609245509244, "loss": 3.440562057495117, "num_input_tokens_seen": 959447040, "step": 1830, "train_runtime": 8356.4084, "train_tokens_per_second": 114815.719 }, { "epoch": 0.09956979355502042, "grad_norm": 0.20618219673633575, "learning_rate": 0.004938701176091175, "loss": 3.4402488708496093, "num_input_tokens_seen": 964689920, "step": 1840, "train_runtime": 8401.6104, "train_tokens_per_second": 114822.025 }, { "epoch": 0.10011093373738467, "grad_norm": 0.21744751930236816, "learning_rate": 0.004937786424669103, "loss": 3.447218322753906, "num_input_tokens_seen": 969932800, "step": 1850, "train_runtime": 8446.8197, "train_tokens_per_second": 114828.165 }, { "epoch": 0.10065207391974891, "grad_norm": 0.207778662443161, "learning_rate": 0.004936864994035724, "loss": 3.4344856262207033, "num_input_tokens_seen": 975175680, "step": 1860, "train_runtime": 8492.045, "train_tokens_per_second": 114834.022 }, { "epoch": 0.10119321410211315, "grad_norm": 0.20873455703258514, "learning_rate": 0.004935936887004123, "loss": 3.4340728759765624, "num_input_tokens_seen": 980418560, "step": 1870, "train_runtime": 8537.2574, "train_tokens_per_second": 114839.99 }, { "epoch": 0.1017343542844774, "grad_norm": 0.21933899819850922, "learning_rate": 0.004935002106407768, "loss": 3.431113433837891, "num_input_tokens_seen": 985661440, "step": 1880, "train_runtime": 8582.5067, "train_tokens_per_second": 114845.403 }, { "epoch": 0.10227549446684163, "grad_norm": 0.19961251318454742, "learning_rate": 0.0049340606551005, "loss": 3.4356346130371094, "num_input_tokens_seen": 990904320, "step": 1890, "train_runtime": 8627.7245, "train_tokens_per_second": 114851.177 }, { "epoch": 0.10281663464920587, "grad_norm": 0.1902250349521637, "learning_rate": 0.004933112535956529, "loss": 3.432623291015625, "num_input_tokens_seen": 996147200, "step": 1900, "train_runtime": 8672.9813, "train_tokens_per_second": 114856.376 }, { "epoch": 0.10335777483157012, "grad_norm": 0.1946999877691269, "learning_rate": 0.004932157751870416, "loss": 3.435283660888672, "num_input_tokens_seen": 1001390080, "step": 1910, "train_runtime": 8722.0751, "train_tokens_per_second": 114810.99 }, { "epoch": 0.10389891501393436, "grad_norm": 0.21359668672084808, "learning_rate": 0.004931196305757076, "loss": 3.4397598266601563, "num_input_tokens_seen": 1006632960, "step": 1920, "train_runtime": 8767.3142, "train_tokens_per_second": 114816.572 }, { "epoch": 0.1044400551962986, "grad_norm": 0.188863143324852, "learning_rate": 0.004930228200551757, "loss": 3.428334045410156, "num_input_tokens_seen": 1011875840, "step": 1930, "train_runtime": 8812.53, "train_tokens_per_second": 114822.399 }, { "epoch": 0.10498119537866284, "grad_norm": 0.2043711096048355, "learning_rate": 0.0049292534392100405, "loss": 3.428396987915039, "num_input_tokens_seen": 1017118720, "step": 1940, "train_runtime": 8857.7449, "train_tokens_per_second": 114828.179 }, { "epoch": 0.10552233556102708, "grad_norm": 0.18800680339336395, "learning_rate": 0.00492827202470783, "loss": 3.423938751220703, "num_input_tokens_seen": 1022361600, "step": 1950, "train_runtime": 8902.9768, "train_tokens_per_second": 114833.682 }, { "epoch": 0.10606347574339133, "grad_norm": 0.20674094557762146, "learning_rate": 0.004927283960041336, "loss": 3.4255210876464846, "num_input_tokens_seen": 1027604480, "step": 1960, "train_runtime": 8948.2078, "train_tokens_per_second": 114839.14 }, { "epoch": 0.10660461592575557, "grad_norm": 0.19658301770687103, "learning_rate": 0.004926289248227076, "loss": 3.422502899169922, "num_input_tokens_seen": 1032847360, "step": 1970, "train_runtime": 8993.4331, "train_tokens_per_second": 114844.614 }, { "epoch": 0.10714575610811981, "grad_norm": 0.20629730820655823, "learning_rate": 0.00492528789230186, "loss": 3.419141387939453, "num_input_tokens_seen": 1038090240, "step": 1980, "train_runtime": 9038.6403, "train_tokens_per_second": 114850.266 }, { "epoch": 0.10768689629048404, "grad_norm": 0.20946621894836426, "learning_rate": 0.00492427989532278, "loss": 3.4206031799316405, "num_input_tokens_seen": 1043333120, "step": 1990, "train_runtime": 9083.8627, "train_tokens_per_second": 114855.668 }, { "epoch": 0.10822803647284829, "grad_norm": 0.2047351747751236, "learning_rate": 0.004923265260367205, "loss": 3.421718978881836, "num_input_tokens_seen": 1048576000, "step": 2000, "train_runtime": 9129.0866, "train_tokens_per_second": 114860.998 }, { "epoch": 0.10822803647284829, "eval_loss": 3.355583429336548, "eval_runtime": 1.9828, "eval_samples_per_second": 252.171, "eval_steps_per_second": 4.035, "num_input_tokens_seen": 1048576000, "step": 2000 }, { "epoch": 0.10876917665521253, "grad_norm": 0.18940046429634094, "learning_rate": 0.004922243990532769, "loss": 3.4131790161132813, "num_input_tokens_seen": 1053818880, "step": 2010, "train_runtime": 9178.8208, "train_tokens_per_second": 114809.833 }, { "epoch": 0.10931031683757678, "grad_norm": 0.2015410214662552, "learning_rate": 0.004921216088937362, "loss": 3.433843994140625, "num_input_tokens_seen": 1059061760, "step": 2020, "train_runtime": 9224.0155, "train_tokens_per_second": 114815.696 }, { "epoch": 0.10985145701994102, "grad_norm": 0.20246025919914246, "learning_rate": 0.0049201815587191205, "loss": 3.4257015228271483, "num_input_tokens_seen": 1064304640, "step": 2030, "train_runtime": 9269.2191, "train_tokens_per_second": 114821.392 }, { "epoch": 0.11039259720230525, "grad_norm": 0.1984010487794876, "learning_rate": 0.0049191404030364165, "loss": 3.407004547119141, "num_input_tokens_seen": 1069547520, "step": 2040, "train_runtime": 9314.4158, "train_tokens_per_second": 114827.118 }, { "epoch": 0.1109337373846695, "grad_norm": 0.2240104079246521, "learning_rate": 0.0049180926250678506, "loss": 3.413028335571289, "num_input_tokens_seen": 1074790400, "step": 2050, "train_runtime": 9359.6399, "train_tokens_per_second": 114832.452 }, { "epoch": 0.11147487756703374, "grad_norm": 0.20743729174137115, "learning_rate": 0.004917038228012243, "loss": 3.413587188720703, "num_input_tokens_seen": 1080033280, "step": 2060, "train_runtime": 9404.8754, "train_tokens_per_second": 114837.596 }, { "epoch": 0.11201601774939798, "grad_norm": 0.18610326945781708, "learning_rate": 0.004915977215088616, "loss": 3.4143035888671873, "num_input_tokens_seen": 1085276160, "step": 2070, "train_runtime": 9450.1196, "train_tokens_per_second": 114842.584 }, { "epoch": 0.11255715793176223, "grad_norm": 0.18289707601070404, "learning_rate": 0.004914909589536196, "loss": 3.4013748168945312, "num_input_tokens_seen": 1090519040, "step": 2080, "train_runtime": 9495.3594, "train_tokens_per_second": 114847.579 }, { "epoch": 0.11309829811412646, "grad_norm": 0.20693431794643402, "learning_rate": 0.0049138353546143935, "loss": 3.420492172241211, "num_input_tokens_seen": 1095761920, "step": 2090, "train_runtime": 9540.5793, "train_tokens_per_second": 114852.766 }, { "epoch": 0.1136394382964907, "grad_norm": 0.1881825178861618, "learning_rate": 0.0049127545136027975, "loss": 3.4042373657226563, "num_input_tokens_seen": 1101004800, "step": 2100, "train_runtime": 9585.774, "train_tokens_per_second": 114858.206 }, { "epoch": 0.11418057847885495, "grad_norm": 0.20557381212711334, "learning_rate": 0.004911667069801167, "loss": 3.395760345458984, "num_input_tokens_seen": 1106247680, "step": 2110, "train_runtime": 9630.9862, "train_tokens_per_second": 114863.385 }, { "epoch": 0.11472171866121919, "grad_norm": 0.20688888430595398, "learning_rate": 0.004910573026529419, "loss": 3.3946189880371094, "num_input_tokens_seen": 1111490560, "step": 2120, "train_runtime": 9676.207, "train_tokens_per_second": 114868.415 }, { "epoch": 0.11526285884358344, "grad_norm": 0.1814773976802826, "learning_rate": 0.004909472387127615, "loss": 3.405241775512695, "num_input_tokens_seen": 1116733440, "step": 2130, "train_runtime": 9721.4134, "train_tokens_per_second": 114873.568 }, { "epoch": 0.11580399902594767, "grad_norm": 0.1865544617176056, "learning_rate": 0.004908365154955957, "loss": 3.4098495483398437, "num_input_tokens_seen": 1121976320, "step": 2140, "train_runtime": 9766.6099, "train_tokens_per_second": 114878.789 }, { "epoch": 0.11634513920831191, "grad_norm": 0.2032928168773651, "learning_rate": 0.004907251333394776, "loss": 3.4024234771728517, "num_input_tokens_seen": 1127219200, "step": 2150, "train_runtime": 9811.8225, "train_tokens_per_second": 114883.774 }, { "epoch": 0.11688627939067615, "grad_norm": 0.1904987096786499, "learning_rate": 0.004906130925844515, "loss": 3.3986663818359375, "num_input_tokens_seen": 1132462080, "step": 2160, "train_runtime": 9857.0319, "train_tokens_per_second": 114888.751 }, { "epoch": 0.1174274195730404, "grad_norm": 0.1815112829208374, "learning_rate": 0.004905003935725728, "loss": 3.3947410583496094, "num_input_tokens_seen": 1137704960, "step": 2170, "train_runtime": 9902.2357, "train_tokens_per_second": 114893.746 }, { "epoch": 0.11796855975540464, "grad_norm": 0.20339980721473694, "learning_rate": 0.004903870366479064, "loss": 3.3956260681152344, "num_input_tokens_seen": 1142947840, "step": 2180, "train_runtime": 9947.4535, "train_tokens_per_second": 114898.536 }, { "epoch": 0.11850969993776887, "grad_norm": 0.18381614983081818, "learning_rate": 0.004902730221565258, "loss": 3.3980743408203127, "num_input_tokens_seen": 1148190720, "step": 2190, "train_runtime": 9992.6437, "train_tokens_per_second": 114903.599 }, { "epoch": 0.11905084012013312, "grad_norm": 0.19806860387325287, "learning_rate": 0.004901583504465119, "loss": 3.393767547607422, "num_input_tokens_seen": 1153433600, "step": 2200, "train_runtime": 10037.8153, "train_tokens_per_second": 114908.829 }, { "epoch": 0.11959198030249736, "grad_norm": 0.20329956710338593, "learning_rate": 0.004900430218679523, "loss": 3.3944183349609376, "num_input_tokens_seen": 1158676480, "step": 2210, "train_runtime": 10083.038, "train_tokens_per_second": 114913.43 }, { "epoch": 0.1201331204848616, "grad_norm": 0.2080426961183548, "learning_rate": 0.004899270367729398, "loss": 3.3978126525878904, "num_input_tokens_seen": 1163919360, "step": 2220, "train_runtime": 10128.2721, "train_tokens_per_second": 114917.86 }, { "epoch": 0.12067426066722585, "grad_norm": 0.19686874747276306, "learning_rate": 0.004898103955155715, "loss": 3.395246124267578, "num_input_tokens_seen": 1169162240, "step": 2230, "train_runtime": 10173.5097, "train_tokens_per_second": 114922.212 }, { "epoch": 0.12121540084959008, "grad_norm": 0.20237593352794647, "learning_rate": 0.004896930984519478, "loss": 3.3845314025878905, "num_input_tokens_seen": 1174405120, "step": 2240, "train_runtime": 10218.7376, "train_tokens_per_second": 114926.634 }, { "epoch": 0.12175654103195432, "grad_norm": 0.19075846672058105, "learning_rate": 0.004895751459401713, "loss": 3.380054473876953, "num_input_tokens_seen": 1179648000, "step": 2250, "train_runtime": 10263.9453, "train_tokens_per_second": 114931.243 }, { "epoch": 0.12229768121431857, "grad_norm": 0.18929868936538696, "learning_rate": 0.004894565383403456, "loss": 3.3817626953125, "num_input_tokens_seen": 1184890880, "step": 2260, "train_runtime": 10309.1491, "train_tokens_per_second": 114935.857 }, { "epoch": 0.12283882139668281, "grad_norm": 0.18654604256153107, "learning_rate": 0.0048933727601457415, "loss": 3.3808876037597657, "num_input_tokens_seen": 1190133760, "step": 2270, "train_runtime": 10354.365, "train_tokens_per_second": 114940.294 }, { "epoch": 0.12337996157904706, "grad_norm": 0.19212667644023895, "learning_rate": 0.004892173593269593, "loss": 3.378282928466797, "num_input_tokens_seen": 1195376640, "step": 2280, "train_runtime": 10399.5899, "train_tokens_per_second": 114944.594 }, { "epoch": 0.12392110176141129, "grad_norm": 0.19903384149074554, "learning_rate": 0.004890967886436014, "loss": 3.384090042114258, "num_input_tokens_seen": 1200619520, "step": 2290, "train_runtime": 10448.5527, "train_tokens_per_second": 114907.735 }, { "epoch": 0.12446224194377553, "grad_norm": 0.19387711584568024, "learning_rate": 0.004889755643325971, "loss": 3.380754089355469, "num_input_tokens_seen": 1205862400, "step": 2300, "train_runtime": 10493.7881, "train_tokens_per_second": 114912.021 }, { "epoch": 0.1250033821261398, "grad_norm": 0.20909279584884644, "learning_rate": 0.0048885368676403855, "loss": 3.3727947235107423, "num_input_tokens_seen": 1211105280, "step": 2310, "train_runtime": 10539.0351, "train_tokens_per_second": 114916.145 }, { "epoch": 0.12554452230850402, "grad_norm": 0.17621192336082458, "learning_rate": 0.004887311563100124, "loss": 3.384077453613281, "num_input_tokens_seen": 1216348160, "step": 2320, "train_runtime": 10584.2319, "train_tokens_per_second": 114920.777 }, { "epoch": 0.12608566249086825, "grad_norm": 0.19513387978076935, "learning_rate": 0.004886079733445985, "loss": 3.378644561767578, "num_input_tokens_seen": 1221591040, "step": 2330, "train_runtime": 10629.451, "train_tokens_per_second": 114925.13 }, { "epoch": 0.1266268026732325, "grad_norm": 0.19339337944984436, "learning_rate": 0.004884841382438689, "loss": 3.3802566528320312, "num_input_tokens_seen": 1226833920, "step": 2340, "train_runtime": 10674.6651, "train_tokens_per_second": 114929.5 }, { "epoch": 0.12716794285559674, "grad_norm": 0.20770232379436493, "learning_rate": 0.004883596513858863, "loss": 3.3678009033203127, "num_input_tokens_seen": 1232076800, "step": 2350, "train_runtime": 10719.8522, "train_tokens_per_second": 114934.122 }, { "epoch": 0.127709083037961, "grad_norm": 0.19512751698493958, "learning_rate": 0.004882345131507035, "loss": 3.3827003479003905, "num_input_tokens_seen": 1237319680, "step": 2360, "train_runtime": 10765.0276, "train_tokens_per_second": 114938.831 }, { "epoch": 0.12825022322032523, "grad_norm": 0.1894627958536148, "learning_rate": 0.004881087239203616, "loss": 3.377857208251953, "num_input_tokens_seen": 1242562560, "step": 2370, "train_runtime": 10810.2272, "train_tokens_per_second": 114943.242 }, { "epoch": 0.12879136340268946, "grad_norm": 0.18233883380889893, "learning_rate": 0.004879822840788895, "loss": 3.370525360107422, "num_input_tokens_seen": 1247805440, "step": 2380, "train_runtime": 10855.4267, "train_tokens_per_second": 114947.618 }, { "epoch": 0.12933250358505372, "grad_norm": 0.1876172125339508, "learning_rate": 0.00487855194012302, "loss": 3.3757583618164064, "num_input_tokens_seen": 1253048320, "step": 2390, "train_runtime": 10900.6108, "train_tokens_per_second": 114952.12 }, { "epoch": 0.12987364376741795, "grad_norm": 0.19061093032360077, "learning_rate": 0.0048772745410859955, "loss": 3.371135711669922, "num_input_tokens_seen": 1258291200, "step": 2400, "train_runtime": 10945.8277, "train_tokens_per_second": 114956.24 }, { "epoch": 0.1304147839497822, "grad_norm": 0.19320231676101685, "learning_rate": 0.004875990647577659, "loss": 3.376973342895508, "num_input_tokens_seen": 1263534080, "step": 2410, "train_runtime": 10991.0522, "train_tokens_per_second": 114960.247 }, { "epoch": 0.13095592413214643, "grad_norm": 0.18665140867233276, "learning_rate": 0.004874700263517679, "loss": 3.371229553222656, "num_input_tokens_seen": 1268776960, "step": 2420, "train_runtime": 11036.2334, "train_tokens_per_second": 114964.673 }, { "epoch": 0.13149706431451066, "grad_norm": 0.18199113011360168, "learning_rate": 0.004873403392845541, "loss": 3.361619567871094, "num_input_tokens_seen": 1274019840, "step": 2430, "train_runtime": 11081.4325, "train_tokens_per_second": 114968.877 }, { "epoch": 0.13203820449687492, "grad_norm": 0.19316934049129486, "learning_rate": 0.004872100039520528, "loss": 3.360996627807617, "num_input_tokens_seen": 1279262720, "step": 2440, "train_runtime": 11126.6221, "train_tokens_per_second": 114973.144 }, { "epoch": 0.13257934467923915, "grad_norm": 0.185124471783638, "learning_rate": 0.00487079020752172, "loss": 3.366575241088867, "num_input_tokens_seen": 1284505600, "step": 2450, "train_runtime": 11171.8004, "train_tokens_per_second": 114977.493 }, { "epoch": 0.1331204848616034, "grad_norm": 0.18804939091205597, "learning_rate": 0.004869473900847973, "loss": 3.3575817108154298, "num_input_tokens_seen": 1289748480, "step": 2460, "train_runtime": 11217.0182, "train_tokens_per_second": 114981.402 }, { "epoch": 0.13366162504396764, "grad_norm": 0.19206225872039795, "learning_rate": 0.004868151123517911, "loss": 3.3654083251953124, "num_input_tokens_seen": 1294991360, "step": 2470, "train_runtime": 11262.2319, "train_tokens_per_second": 114985.322 }, { "epoch": 0.13420276522633187, "grad_norm": 0.1901652067899704, "learning_rate": 0.004866821879569913, "loss": 3.3583431243896484, "num_input_tokens_seen": 1300234240, "step": 2480, "train_runtime": 11307.4375, "train_tokens_per_second": 114989.293 }, { "epoch": 0.13474390540869613, "grad_norm": 0.1906082034111023, "learning_rate": 0.004865486173062098, "loss": 3.3592803955078123, "num_input_tokens_seen": 1305477120, "step": 2490, "train_runtime": 11352.6608, "train_tokens_per_second": 114993.053 }, { "epoch": 0.13528504559106036, "grad_norm": 0.17934362590312958, "learning_rate": 0.004864144008072318, "loss": 3.3405136108398437, "num_input_tokens_seen": 1310720000, "step": 2500, "train_runtime": 11397.8619, "train_tokens_per_second": 114997.007 }, { "epoch": 0.13528504559106036, "eval_loss": 3.2980093955993652, "eval_runtime": 1.9858, "eval_samples_per_second": 251.785, "eval_steps_per_second": 4.029, "num_input_tokens_seen": 1310720000, "step": 2500 }, { "epoch": 0.13582618577342462, "grad_norm": 0.18693317472934723, "learning_rate": 0.00486279538869814, "loss": 3.367817687988281, "num_input_tokens_seen": 1315962880, "step": 2510, "train_runtime": 11445.0335, "train_tokens_per_second": 114981.13 }, { "epoch": 0.13636732595578885, "grad_norm": 0.18467511236667633, "learning_rate": 0.004861440319056837, "loss": 3.355504608154297, "num_input_tokens_seen": 1321205760, "step": 2520, "train_runtime": 11490.248, "train_tokens_per_second": 114984.965 }, { "epoch": 0.13690846613815308, "grad_norm": 0.17405100166797638, "learning_rate": 0.004860078803285375, "loss": 3.3486671447753906, "num_input_tokens_seen": 1326448640, "step": 2530, "train_runtime": 11535.4773, "train_tokens_per_second": 114988.622 }, { "epoch": 0.13744960632051734, "grad_norm": 0.1991710066795349, "learning_rate": 0.0048587108455403994, "loss": 3.3470123291015623, "num_input_tokens_seen": 1331691520, "step": 2540, "train_runtime": 11580.7091, "train_tokens_per_second": 114992.226 }, { "epoch": 0.13799074650288157, "grad_norm": 0.19981589913368225, "learning_rate": 0.004857336449998221, "loss": 3.355559539794922, "num_input_tokens_seen": 1336934400, "step": 2550, "train_runtime": 11625.9198, "train_tokens_per_second": 114996.011 }, { "epoch": 0.13853188668524583, "grad_norm": 0.19031056761741638, "learning_rate": 0.004855955620854806, "loss": 3.359702301025391, "num_input_tokens_seen": 1342177280, "step": 2560, "train_runtime": 11671.1393, "train_tokens_per_second": 114999.68 }, { "epoch": 0.13907302686761006, "grad_norm": 0.1825476437807083, "learning_rate": 0.004854568362325763, "loss": 3.3532974243164064, "num_input_tokens_seen": 1347420160, "step": 2570, "train_runtime": 11716.3486, "train_tokens_per_second": 115003.42 }, { "epoch": 0.13961416704997429, "grad_norm": 0.1943996399641037, "learning_rate": 0.004853174678646328, "loss": 3.3549442291259766, "num_input_tokens_seen": 1352663040, "step": 2580, "train_runtime": 11761.5665, "train_tokens_per_second": 115007.048 }, { "epoch": 0.14015530723233854, "grad_norm": 0.18165603280067444, "learning_rate": 0.004851774574071355, "loss": 3.345872497558594, "num_input_tokens_seen": 1357905920, "step": 2590, "train_runtime": 11806.7776, "train_tokens_per_second": 115010.714 }, { "epoch": 0.14069644741470277, "grad_norm": 0.19906878471374512, "learning_rate": 0.004850368052875296, "loss": 3.3501548767089844, "num_input_tokens_seen": 1363148800, "step": 2600, "train_runtime": 11851.978, "train_tokens_per_second": 115014.456 }, { "epoch": 0.14123758759706703, "grad_norm": 0.1889800876379013, "learning_rate": 0.004848955119352198, "loss": 3.357212448120117, "num_input_tokens_seen": 1368391680, "step": 2610, "train_runtime": 11897.1651, "train_tokens_per_second": 115018.298 }, { "epoch": 0.14177872777943126, "grad_norm": 0.1907270848751068, "learning_rate": 0.00484753577781568, "loss": 3.3405483245849608, "num_input_tokens_seen": 1373634560, "step": 2620, "train_runtime": 11942.3695, "train_tokens_per_second": 115021.945 }, { "epoch": 0.1423198679617955, "grad_norm": 0.18622975051403046, "learning_rate": 0.004846110032598928, "loss": 3.344770050048828, "num_input_tokens_seen": 1378877440, "step": 2630, "train_runtime": 11987.559, "train_tokens_per_second": 115025.706 }, { "epoch": 0.14286100814415975, "grad_norm": 0.20059405267238617, "learning_rate": 0.004844677888054675, "loss": 3.344530487060547, "num_input_tokens_seen": 1384120320, "step": 2640, "train_runtime": 12032.7386, "train_tokens_per_second": 115029.535 }, { "epoch": 0.14340214832652398, "grad_norm": 0.18230022490024567, "learning_rate": 0.004843239348555194, "loss": 3.340105438232422, "num_input_tokens_seen": 1389363200, "step": 2650, "train_runtime": 12077.95, "train_tokens_per_second": 115033.032 }, { "epoch": 0.14394328850888824, "grad_norm": 0.1814001351594925, "learning_rate": 0.004841794418492279, "loss": 3.3359622955322266, "num_input_tokens_seen": 1394606080, "step": 2660, "train_runtime": 12123.1531, "train_tokens_per_second": 115036.581 }, { "epoch": 0.14448442869125247, "grad_norm": 0.20956675708293915, "learning_rate": 0.004840343102277236, "loss": 3.3457298278808594, "num_input_tokens_seen": 1399848960, "step": 2670, "train_runtime": 12168.3992, "train_tokens_per_second": 115039.697 }, { "epoch": 0.1450255688736167, "grad_norm": 0.19602610170841217, "learning_rate": 0.004838885404340865, "loss": 3.337678909301758, "num_input_tokens_seen": 1405091840, "step": 2680, "train_runtime": 12217.5728, "train_tokens_per_second": 115005.809 }, { "epoch": 0.14556670905598096, "grad_norm": 0.18408642709255219, "learning_rate": 0.00483742132913345, "loss": 3.3393791198730467, "num_input_tokens_seen": 1410334720, "step": 2690, "train_runtime": 12262.7736, "train_tokens_per_second": 115009.439 }, { "epoch": 0.1461078492383452, "grad_norm": 0.21065284311771393, "learning_rate": 0.00483595088112475, "loss": 3.3346370697021483, "num_input_tokens_seen": 1415577600, "step": 2700, "train_runtime": 12307.9818, "train_tokens_per_second": 115012.974 }, { "epoch": 0.14664898942070945, "grad_norm": 0.18873485922813416, "learning_rate": 0.00483447406480397, "loss": 3.33388671875, "num_input_tokens_seen": 1420820480, "step": 2710, "train_runtime": 12353.2004, "train_tokens_per_second": 115016.387 }, { "epoch": 0.14719012960307368, "grad_norm": 0.18760992586612701, "learning_rate": 0.004832990884679764, "loss": 3.3374618530273437, "num_input_tokens_seen": 1426063360, "step": 2720, "train_runtime": 12398.393, "train_tokens_per_second": 115020.016 }, { "epoch": 0.1477312697854379, "grad_norm": 0.192196786403656, "learning_rate": 0.004831501345280215, "loss": 3.3331283569335937, "num_input_tokens_seen": 1431306240, "step": 2730, "train_runtime": 12443.5889, "train_tokens_per_second": 115023.588 }, { "epoch": 0.14827240996780217, "grad_norm": 0.18086469173431396, "learning_rate": 0.004830005451152815, "loss": 3.342273712158203, "num_input_tokens_seen": 1436549120, "step": 2740, "train_runtime": 12488.8002, "train_tokens_per_second": 115026.992 }, { "epoch": 0.1488135501501664, "grad_norm": 0.20178896188735962, "learning_rate": 0.004828503206864461, "loss": 3.340282440185547, "num_input_tokens_seen": 1441792000, "step": 2750, "train_runtime": 12534.0299, "train_tokens_per_second": 115030.203 }, { "epoch": 0.14935469033253065, "grad_norm": 0.1862727403640747, "learning_rate": 0.004826994617001436, "loss": 3.333884048461914, "num_input_tokens_seen": 1447034880, "step": 2760, "train_runtime": 12579.232, "train_tokens_per_second": 115033.643 }, { "epoch": 0.14989583051489488, "grad_norm": 0.18692044913768768, "learning_rate": 0.004825479686169395, "loss": 3.3313224792480467, "num_input_tokens_seen": 1452277760, "step": 2770, "train_runtime": 12624.4525, "train_tokens_per_second": 115036.89 }, { "epoch": 0.15043697069725911, "grad_norm": 0.19887301325798035, "learning_rate": 0.004823958418993353, "loss": 3.3318561553955077, "num_input_tokens_seen": 1457520640, "step": 2780, "train_runtime": 12669.6266, "train_tokens_per_second": 115040.536 }, { "epoch": 0.15097811087962337, "grad_norm": 0.19694387912750244, "learning_rate": 0.004822430820117667, "loss": 3.324603271484375, "num_input_tokens_seen": 1462763520, "step": 2790, "train_runtime": 12714.8383, "train_tokens_per_second": 115043.816 }, { "epoch": 0.1515192510619876, "grad_norm": 0.18864646553993225, "learning_rate": 0.0048208968942060285, "loss": 3.329520416259766, "num_input_tokens_seen": 1468006400, "step": 2800, "train_runtime": 12760.0555, "train_tokens_per_second": 115047.023 }, { "epoch": 0.15206039124435186, "grad_norm": 0.19097571074962616, "learning_rate": 0.004819356645941442, "loss": 3.334062194824219, "num_input_tokens_seen": 1473249280, "step": 2810, "train_runtime": 12805.2599, "train_tokens_per_second": 115050.323 }, { "epoch": 0.1526015314267161, "grad_norm": 0.18825189769268036, "learning_rate": 0.004817810080026213, "loss": 3.3339500427246094, "num_input_tokens_seen": 1478492160, "step": 2820, "train_runtime": 12850.4647, "train_tokens_per_second": 115053.595 }, { "epoch": 0.15314267160908032, "grad_norm": 0.17853769659996033, "learning_rate": 0.004816257201181937, "loss": 3.3271289825439454, "num_input_tokens_seen": 1483735040, "step": 2830, "train_runtime": 12895.6915, "train_tokens_per_second": 115056.648 }, { "epoch": 0.15368381179144458, "grad_norm": 0.1959368884563446, "learning_rate": 0.004814698014149483, "loss": 3.3293079376220702, "num_input_tokens_seen": 1488977920, "step": 2840, "train_runtime": 12940.9039, "train_tokens_per_second": 115059.808 }, { "epoch": 0.1542249519738088, "grad_norm": 0.1917344182729721, "learning_rate": 0.0048131325236889745, "loss": 3.3289634704589846, "num_input_tokens_seen": 1494220800, "step": 2850, "train_runtime": 12986.1305, "train_tokens_per_second": 115062.82 }, { "epoch": 0.15476609215617307, "grad_norm": 0.18901459872722626, "learning_rate": 0.004811560734579785, "loss": 3.3151206970214844, "num_input_tokens_seen": 1499463680, "step": 2860, "train_runtime": 13031.3423, "train_tokens_per_second": 115065.943 }, { "epoch": 0.1553072323385373, "grad_norm": 0.1884569674730301, "learning_rate": 0.004809982651620513, "loss": 3.321660614013672, "num_input_tokens_seen": 1504706560, "step": 2870, "train_runtime": 13076.553, "train_tokens_per_second": 115069.052 }, { "epoch": 0.15584837252090153, "grad_norm": 0.1960553228855133, "learning_rate": 0.004808398279628971, "loss": 3.326691436767578, "num_input_tokens_seen": 1509949440, "step": 2880, "train_runtime": 13121.764, "train_tokens_per_second": 115072.138 }, { "epoch": 0.1563895127032658, "grad_norm": 0.19503499567508698, "learning_rate": 0.004806807623442178, "loss": 3.321258544921875, "num_input_tokens_seen": 1515192320, "step": 2890, "train_runtime": 13166.9951, "train_tokens_per_second": 115075.027 }, { "epoch": 0.15693065288563002, "grad_norm": 0.19287334382534027, "learning_rate": 0.004805210687916331, "loss": 3.3227684020996096, "num_input_tokens_seen": 1520435200, "step": 2900, "train_runtime": 13212.1814, "train_tokens_per_second": 115078.287 }, { "epoch": 0.15747179306799428, "grad_norm": 0.19813010096549988, "learning_rate": 0.004803607477926801, "loss": 3.3109420776367187, "num_input_tokens_seen": 1525678080, "step": 2910, "train_runtime": 13257.3873, "train_tokens_per_second": 115081.354 }, { "epoch": 0.1580129332503585, "grad_norm": 0.19769278168678284, "learning_rate": 0.004801997998368116, "loss": 3.317332458496094, "num_input_tokens_seen": 1530920960, "step": 2920, "train_runtime": 13302.5829, "train_tokens_per_second": 115084.489 }, { "epoch": 0.15855407343272274, "grad_norm": 0.21613162755966187, "learning_rate": 0.0048003822541539416, "loss": 3.3125213623046874, "num_input_tokens_seen": 1536163840, "step": 2930, "train_runtime": 13347.7691, "train_tokens_per_second": 115087.684 }, { "epoch": 0.159095213615087, "grad_norm": 0.19285152852535248, "learning_rate": 0.004798760250217072, "loss": 3.3247020721435545, "num_input_tokens_seen": 1541406720, "step": 2940, "train_runtime": 13392.9574, "train_tokens_per_second": 115090.84 }, { "epoch": 0.15963635379745122, "grad_norm": 0.19629855453968048, "learning_rate": 0.004797131991509409, "loss": 3.3183937072753906, "num_input_tokens_seen": 1546649600, "step": 2950, "train_runtime": 13438.1692, "train_tokens_per_second": 115093.773 }, { "epoch": 0.16017749397981548, "grad_norm": 0.1971038430929184, "learning_rate": 0.004795497483001952, "loss": 3.3157825469970703, "num_input_tokens_seen": 1551892480, "step": 2960, "train_runtime": 13483.3787, "train_tokens_per_second": 115096.706 }, { "epoch": 0.1607186341621797, "grad_norm": 0.2136721909046173, "learning_rate": 0.0047938567296847805, "loss": 3.3181556701660155, "num_input_tokens_seen": 1557135360, "step": 2970, "train_runtime": 13528.5747, "train_tokens_per_second": 115099.734 }, { "epoch": 0.16125977434454394, "grad_norm": 0.18783020973205566, "learning_rate": 0.004792209736567038, "loss": 3.3050804138183594, "num_input_tokens_seen": 1562378240, "step": 2980, "train_runtime": 13573.7919, "train_tokens_per_second": 115102.563 }, { "epoch": 0.1618009145269082, "grad_norm": 0.17761750519275665, "learning_rate": 0.0047905565086769205, "loss": 3.313432312011719, "num_input_tokens_seen": 1567621120, "step": 2990, "train_runtime": 13618.9923, "train_tokens_per_second": 115105.515 }, { "epoch": 0.16234205470927243, "grad_norm": 0.1785641759634018, "learning_rate": 0.004788897051061655, "loss": 3.317774200439453, "num_input_tokens_seen": 1572864000, "step": 3000, "train_runtime": 13664.2112, "train_tokens_per_second": 115108.291 }, { "epoch": 0.16234205470927243, "eval_loss": 3.2526497840881348, "eval_runtime": 1.9863, "eval_samples_per_second": 251.723, "eval_steps_per_second": 4.028, "num_input_tokens_seen": 1572864000, "step": 3000 }, { "epoch": 0.1628831948916367, "grad_norm": 0.19272948801517487, "learning_rate": 0.004787231368787491, "loss": 3.3128257751464845, "num_input_tokens_seen": 1578106880, "step": 3010, "train_runtime": 13714.1711, "train_tokens_per_second": 115071.255 }, { "epoch": 0.16342433507400092, "grad_norm": 0.1745939403772354, "learning_rate": 0.004785559466939679, "loss": 3.31363525390625, "num_input_tokens_seen": 1583349760, "step": 3020, "train_runtime": 13759.3609, "train_tokens_per_second": 115074.368 }, { "epoch": 0.16396547525636515, "grad_norm": 0.20123089849948883, "learning_rate": 0.0047838813506224575, "loss": 3.3179275512695314, "num_input_tokens_seen": 1588592640, "step": 3030, "train_runtime": 13804.5657, "train_tokens_per_second": 115077.335 }, { "epoch": 0.1645066154387294, "grad_norm": 0.19240304827690125, "learning_rate": 0.004782197024959039, "loss": 3.3164352416992187, "num_input_tokens_seen": 1593835520, "step": 3040, "train_runtime": 13849.7747, "train_tokens_per_second": 115080.249 }, { "epoch": 0.16504775562109364, "grad_norm": 0.19316141307353973, "learning_rate": 0.004780506495091593, "loss": 3.316120147705078, "num_input_tokens_seen": 1599078400, "step": 3050, "train_runtime": 13894.9935, "train_tokens_per_second": 115083.062 }, { "epoch": 0.1655888958034579, "grad_norm": 0.19889311492443085, "learning_rate": 0.004778809766181229, "loss": 3.3089508056640624, "num_input_tokens_seen": 1604321280, "step": 3060, "train_runtime": 13943.9286, "train_tokens_per_second": 115055.185 }, { "epoch": 0.16613003598582213, "grad_norm": 0.19427727162837982, "learning_rate": 0.004777106843407982, "loss": 3.3107887268066407, "num_input_tokens_seen": 1609564160, "step": 3070, "train_runtime": 13989.1559, "train_tokens_per_second": 115057.99 }, { "epoch": 0.16667117616818636, "grad_norm": 0.1867746263742447, "learning_rate": 0.004775397731970797, "loss": 3.306330108642578, "num_input_tokens_seen": 1614807040, "step": 3080, "train_runtime": 14034.3821, "train_tokens_per_second": 115060.787 }, { "epoch": 0.16721231635055062, "grad_norm": 0.18061718344688416, "learning_rate": 0.0047736824370875125, "loss": 3.3135826110839846, "num_input_tokens_seen": 1620049920, "step": 3090, "train_runtime": 14079.6171, "train_tokens_per_second": 115063.493 }, { "epoch": 0.16775345653291485, "grad_norm": 0.1992795616388321, "learning_rate": 0.004771960963994845, "loss": 3.2958747863769533, "num_input_tokens_seen": 1625292800, "step": 3100, "train_runtime": 14124.8552, "train_tokens_per_second": 115066.157 }, { "epoch": 0.1682945967152791, "grad_norm": 0.1932191550731659, "learning_rate": 0.004770233317948373, "loss": 3.305771255493164, "num_input_tokens_seen": 1630535680, "step": 3110, "train_runtime": 14170.1029, "train_tokens_per_second": 115068.725 }, { "epoch": 0.16883573689764333, "grad_norm": 0.18210622668266296, "learning_rate": 0.00476849950422252, "loss": 3.309395599365234, "num_input_tokens_seen": 1635778560, "step": 3120, "train_runtime": 14215.3361, "train_tokens_per_second": 115071.396 }, { "epoch": 0.16937687708000757, "grad_norm": 0.19367872178554535, "learning_rate": 0.004766759528110539, "loss": 3.302986907958984, "num_input_tokens_seen": 1641021440, "step": 3130, "train_runtime": 14260.5763, "train_tokens_per_second": 115073.992 }, { "epoch": 0.16991801726237182, "grad_norm": 0.19194577634334564, "learning_rate": 0.004765013394924499, "loss": 3.304148864746094, "num_input_tokens_seen": 1646264320, "step": 3140, "train_runtime": 14305.8047, "train_tokens_per_second": 115076.667 }, { "epoch": 0.17045915744473605, "grad_norm": 0.18965749442577362, "learning_rate": 0.0047632611099952624, "loss": 3.298334503173828, "num_input_tokens_seen": 1651507200, "step": 3150, "train_runtime": 14351.0274, "train_tokens_per_second": 115079.371 }, { "epoch": 0.1710002976271003, "grad_norm": 0.17816977202892303, "learning_rate": 0.004761502678672474, "loss": 3.300872802734375, "num_input_tokens_seen": 1656750080, "step": 3160, "train_runtime": 14396.2711, "train_tokens_per_second": 115081.889 }, { "epoch": 0.17154143780946454, "grad_norm": 0.19343526661396027, "learning_rate": 0.004759738106324546, "loss": 3.2991104125976562, "num_input_tokens_seen": 1661992960, "step": 3170, "train_runtime": 14441.5146, "train_tokens_per_second": 115084.394 }, { "epoch": 0.17208257799182877, "grad_norm": 0.18815293908119202, "learning_rate": 0.004757967398338635, "loss": 3.307733154296875, "num_input_tokens_seen": 1667235840, "step": 3180, "train_runtime": 14486.7506, "train_tokens_per_second": 115086.943 }, { "epoch": 0.17262371817419303, "grad_norm": 0.18241587281227112, "learning_rate": 0.004756190560120631, "loss": 3.2984477996826174, "num_input_tokens_seen": 1672478720, "step": 3190, "train_runtime": 14531.9861, "train_tokens_per_second": 115089.48 }, { "epoch": 0.17316485835655726, "grad_norm": 0.18176402151584625, "learning_rate": 0.00475440759709514, "loss": 3.300640869140625, "num_input_tokens_seen": 1677721600, "step": 3200, "train_runtime": 14577.2147, "train_tokens_per_second": 115092.055 }, { "epoch": 0.17370599853892152, "grad_norm": 0.20022694766521454, "learning_rate": 0.004752618514705466, "loss": 3.300579071044922, "num_input_tokens_seen": 1682964480, "step": 3210, "train_runtime": 14622.4411, "train_tokens_per_second": 115094.632 }, { "epoch": 0.17424713872128575, "grad_norm": 0.18792809545993805, "learning_rate": 0.0047508233184135945, "loss": 3.295984649658203, "num_input_tokens_seen": 1688207360, "step": 3220, "train_runtime": 14667.6619, "train_tokens_per_second": 115097.237 }, { "epoch": 0.17478827890364998, "grad_norm": 0.200827494263649, "learning_rate": 0.0047490220137001785, "loss": 3.2906261444091798, "num_input_tokens_seen": 1693450240, "step": 3230, "train_runtime": 14712.8844, "train_tokens_per_second": 115099.813 }, { "epoch": 0.17532941908601424, "grad_norm": 0.19141127169132233, "learning_rate": 0.004747214606064517, "loss": 3.2837890625, "num_input_tokens_seen": 1698693120, "step": 3240, "train_runtime": 14758.1057, "train_tokens_per_second": 115102.382 }, { "epoch": 0.17587055926837847, "grad_norm": 0.18976351618766785, "learning_rate": 0.0047454011010245436, "loss": 3.287107467651367, "num_input_tokens_seen": 1703936000, "step": 3250, "train_runtime": 14803.3273, "train_tokens_per_second": 115104.933 }, { "epoch": 0.17641169945074273, "grad_norm": 0.19698546826839447, "learning_rate": 0.004743581504116804, "loss": 3.2882354736328123, "num_input_tokens_seen": 1709178880, "step": 3260, "train_runtime": 14848.5302, "train_tokens_per_second": 115107.614 }, { "epoch": 0.17695283963310696, "grad_norm": 0.17822493612766266, "learning_rate": 0.004741755820896446, "loss": 3.2927810668945314, "num_input_tokens_seen": 1714421760, "step": 3270, "train_runtime": 14893.7537, "train_tokens_per_second": 115110.119 }, { "epoch": 0.1774939798154712, "grad_norm": 0.1720447987318039, "learning_rate": 0.004739924056937195, "loss": 3.2904899597167967, "num_input_tokens_seen": 1719664640, "step": 3280, "train_runtime": 14938.9717, "train_tokens_per_second": 115112.652 }, { "epoch": 0.17803511999783544, "grad_norm": 0.18303626775741577, "learning_rate": 0.004738086217831344, "loss": 3.28282470703125, "num_input_tokens_seen": 1724907520, "step": 3290, "train_runtime": 14984.1992, "train_tokens_per_second": 115115.096 }, { "epoch": 0.17857626018019968, "grad_norm": 0.176763117313385, "learning_rate": 0.004736242309189728, "loss": 3.286945343017578, "num_input_tokens_seen": 1730150400, "step": 3300, "train_runtime": 15029.4297, "train_tokens_per_second": 115117.502 }, { "epoch": 0.17911740036256393, "grad_norm": 0.19218850135803223, "learning_rate": 0.004734392336641718, "loss": 3.290885162353516, "num_input_tokens_seen": 1735393280, "step": 3310, "train_runtime": 15074.6639, "train_tokens_per_second": 115119.866 }, { "epoch": 0.17965854054492816, "grad_norm": 0.180914968252182, "learning_rate": 0.004732536305835194, "loss": 3.2893463134765626, "num_input_tokens_seen": 1740636160, "step": 3320, "train_runtime": 15119.9044, "train_tokens_per_second": 115122.167 }, { "epoch": 0.1801996807272924, "grad_norm": 0.1835494488477707, "learning_rate": 0.0047306742224365326, "loss": 3.2857479095458983, "num_input_tokens_seen": 1745879040, "step": 3330, "train_runtime": 15165.1285, "train_tokens_per_second": 115124.579 }, { "epoch": 0.18074082090965665, "grad_norm": 0.1805170625448227, "learning_rate": 0.004728806092130589, "loss": 3.2880077362060547, "num_input_tokens_seen": 1751121920, "step": 3340, "train_runtime": 15210.3276, "train_tokens_per_second": 115127.166 }, { "epoch": 0.18128196109202088, "grad_norm": 0.18228840827941895, "learning_rate": 0.00472693192062068, "loss": 3.286875915527344, "num_input_tokens_seen": 1756364800, "step": 3350, "train_runtime": 15255.5307, "train_tokens_per_second": 115129.709 }, { "epoch": 0.18182310127438514, "grad_norm": 0.20272916555404663, "learning_rate": 0.0047250517136285634, "loss": 3.2986392974853516, "num_input_tokens_seen": 1761607680, "step": 3360, "train_runtime": 15300.7449, "train_tokens_per_second": 115132.151 }, { "epoch": 0.18236424145674937, "grad_norm": 0.17199651896953583, "learning_rate": 0.0047231654768944255, "loss": 3.2849578857421875, "num_input_tokens_seen": 1766850560, "step": 3370, "train_runtime": 15345.9591, "train_tokens_per_second": 115134.58 }, { "epoch": 0.1829053816391136, "grad_norm": 0.18118058145046234, "learning_rate": 0.00472127321617686, "loss": 3.2900650024414064, "num_input_tokens_seen": 1772093440, "step": 3380, "train_runtime": 15391.1925, "train_tokens_per_second": 115136.851 }, { "epoch": 0.18344652182147786, "grad_norm": 0.19814889132976532, "learning_rate": 0.004719374937252852, "loss": 3.280558776855469, "num_input_tokens_seen": 1777336320, "step": 3390, "train_runtime": 15436.4025, "train_tokens_per_second": 115139.283 }, { "epoch": 0.1839876620038421, "grad_norm": 0.2015380561351776, "learning_rate": 0.00471747064591776, "loss": 3.30006103515625, "num_input_tokens_seen": 1782579200, "step": 3400, "train_runtime": 15481.5971, "train_tokens_per_second": 115141.816 }, { "epoch": 0.18452880218620635, "grad_norm": 0.16767387092113495, "learning_rate": 0.0047155603479852965, "loss": 3.2787837982177734, "num_input_tokens_seen": 1787822080, "step": 3410, "train_runtime": 15526.8015, "train_tokens_per_second": 115144.261 }, { "epoch": 0.18506994236857058, "grad_norm": 0.169756680727005, "learning_rate": 0.0047136440492875145, "loss": 3.283340072631836, "num_input_tokens_seen": 1793064960, "step": 3420, "train_runtime": 15572.0514, "train_tokens_per_second": 115146.355 }, { "epoch": 0.1856110825509348, "grad_norm": 0.18903492391109467, "learning_rate": 0.004711721755674787, "loss": 3.289379119873047, "num_input_tokens_seen": 1798307840, "step": 3430, "train_runtime": 15617.2557, "train_tokens_per_second": 115148.774 }, { "epoch": 0.18615222273329907, "grad_norm": 0.19553808867931366, "learning_rate": 0.004709793473015785, "loss": 3.277596664428711, "num_input_tokens_seen": 1803550720, "step": 3440, "train_runtime": 15666.2718, "train_tokens_per_second": 115123.16 }, { "epoch": 0.1866933629156633, "grad_norm": 0.17524783313274384, "learning_rate": 0.004707859207197468, "loss": 3.272700881958008, "num_input_tokens_seen": 1808793600, "step": 3450, "train_runtime": 15711.4431, "train_tokens_per_second": 115125.873 }, { "epoch": 0.18723450309802755, "grad_norm": 0.1725703924894333, "learning_rate": 0.004705918964125061, "loss": 3.2771453857421875, "num_input_tokens_seen": 1814036480, "step": 3460, "train_runtime": 15756.6281, "train_tokens_per_second": 115128.47 }, { "epoch": 0.18777564328039179, "grad_norm": 0.18361718952655792, "learning_rate": 0.004703972749722038, "loss": 3.2812034606933596, "num_input_tokens_seen": 1819279360, "step": 3470, "train_runtime": 15801.8225, "train_tokens_per_second": 115130.983 }, { "epoch": 0.18831678346275602, "grad_norm": 0.18993116915225983, "learning_rate": 0.004702020569930098, "loss": 3.2690109252929687, "num_input_tokens_seen": 1824522240, "step": 3480, "train_runtime": 15847.0133, "train_tokens_per_second": 115133.508 }, { "epoch": 0.18885792364512027, "grad_norm": 0.1982622891664505, "learning_rate": 0.004700062430709161, "loss": 3.2883895874023437, "num_input_tokens_seen": 1829765120, "step": 3490, "train_runtime": 15892.1956, "train_tokens_per_second": 115136.081 }, { "epoch": 0.1893990638274845, "grad_norm": 0.1953168362379074, "learning_rate": 0.004698098338037333, "loss": 3.2819141387939452, "num_input_tokens_seen": 1835008000, "step": 3500, "train_runtime": 15937.3587, "train_tokens_per_second": 115138.778 }, { "epoch": 0.1893990638274845, "eval_loss": 3.2193782329559326, "eval_runtime": 1.9829, "eval_samples_per_second": 252.151, "eval_steps_per_second": 4.034, "num_input_tokens_seen": 1835008000, "step": 3500 }, { "epoch": 0.18994020400984876, "grad_norm": 0.17765532433986664, "learning_rate": 0.004696128297910899, "loss": 3.2748733520507813, "num_input_tokens_seen": 1840250880, "step": 3510, "train_runtime": 15984.532, "train_tokens_per_second": 115126.979 }, { "epoch": 0.190481344192213, "grad_norm": 0.1692020744085312, "learning_rate": 0.0046941523163443015, "loss": 3.282354736328125, "num_input_tokens_seen": 1845493760, "step": 3520, "train_runtime": 16029.707, "train_tokens_per_second": 115129.601 }, { "epoch": 0.19102248437457722, "grad_norm": 0.17890560626983643, "learning_rate": 0.00469217039937012, "loss": 3.27266845703125, "num_input_tokens_seen": 1850736640, "step": 3530, "train_runtime": 16074.8942, "train_tokens_per_second": 115132.119 }, { "epoch": 0.19156362455694148, "grad_norm": 0.17925257980823517, "learning_rate": 0.004690182553039058, "loss": 3.28330078125, "num_input_tokens_seen": 1855979520, "step": 3540, "train_runtime": 16120.1066, "train_tokens_per_second": 115134.445 }, { "epoch": 0.1921047647393057, "grad_norm": 0.1788860410451889, "learning_rate": 0.004688188783419917, "loss": 3.2885406494140623, "num_input_tokens_seen": 1861222400, "step": 3550, "train_runtime": 16165.2878, "train_tokens_per_second": 115136.979 }, { "epoch": 0.19264590492166997, "grad_norm": 0.1811930388212204, "learning_rate": 0.004686189096599585, "loss": 3.2768978118896483, "num_input_tokens_seen": 1866465280, "step": 3560, "train_runtime": 16210.475, "train_tokens_per_second": 115139.456 }, { "epoch": 0.1931870451040342, "grad_norm": 0.2033502757549286, "learning_rate": 0.004684183498683013, "loss": 3.2799072265625, "num_input_tokens_seen": 1871708160, "step": 3570, "train_runtime": 16255.6518, "train_tokens_per_second": 115141.994 }, { "epoch": 0.19372818528639843, "grad_norm": 0.1871407926082611, "learning_rate": 0.0046821719957932, "loss": 3.2745807647705076, "num_input_tokens_seen": 1876951040, "step": 3580, "train_runtime": 16300.8306, "train_tokens_per_second": 115144.503 }, { "epoch": 0.1942693254687627, "grad_norm": 0.18156413733959198, "learning_rate": 0.004680154594071171, "loss": 3.275892639160156, "num_input_tokens_seen": 1882193920, "step": 3590, "train_runtime": 16346.0348, "train_tokens_per_second": 115146.819 }, { "epoch": 0.19481046565112692, "grad_norm": 0.17189612984657288, "learning_rate": 0.004678131299675962, "loss": 3.278411102294922, "num_input_tokens_seen": 1887436800, "step": 3600, "train_runtime": 16391.2075, "train_tokens_per_second": 115149.344 }, { "epoch": 0.19535160583349118, "grad_norm": 0.18602769076824188, "learning_rate": 0.004676102118784596, "loss": 3.2600128173828127, "num_input_tokens_seen": 1892679680, "step": 3610, "train_runtime": 16436.3754, "train_tokens_per_second": 115151.89 }, { "epoch": 0.1958927460158554, "grad_norm": 0.18247896432876587, "learning_rate": 0.0046740670575920705, "loss": 3.263835906982422, "num_input_tokens_seen": 1897922560, "step": 3620, "train_runtime": 16481.5414, "train_tokens_per_second": 115154.434 }, { "epoch": 0.19643388619821964, "grad_norm": 0.17899157106876373, "learning_rate": 0.004672026122311332, "loss": 3.266416549682617, "num_input_tokens_seen": 1903165440, "step": 3630, "train_runtime": 16526.6863, "train_tokens_per_second": 115157.111 }, { "epoch": 0.1969750263805839, "grad_norm": 0.19543124735355377, "learning_rate": 0.004669979319173264, "loss": 3.261871337890625, "num_input_tokens_seen": 1908408320, "step": 3640, "train_runtime": 16571.8633, "train_tokens_per_second": 115159.55 }, { "epoch": 0.19751616656294813, "grad_norm": 0.18458126485347748, "learning_rate": 0.004667926654426661, "loss": 3.2731971740722656, "num_input_tokens_seen": 1913651200, "step": 3650, "train_runtime": 16617.0658, "train_tokens_per_second": 115161.799 }, { "epoch": 0.19805730674531238, "grad_norm": 0.18683847784996033, "learning_rate": 0.004665868134338213, "loss": 3.2641891479492187, "num_input_tokens_seen": 1918894080, "step": 3660, "train_runtime": 16662.2485, "train_tokens_per_second": 115164.173 }, { "epoch": 0.19859844692767661, "grad_norm": 0.18460538983345032, "learning_rate": 0.00466380376519249, "loss": 3.261058044433594, "num_input_tokens_seen": 1924136960, "step": 3670, "train_runtime": 16707.4259, "train_tokens_per_second": 115166.572 }, { "epoch": 0.19913958711004084, "grad_norm": 0.17181532084941864, "learning_rate": 0.004661733553291914, "loss": 3.2611160278320312, "num_input_tokens_seen": 1929379840, "step": 3680, "train_runtime": 16752.6562, "train_tokens_per_second": 115168.593 }, { "epoch": 0.1996807272924051, "grad_norm": 0.19158703088760376, "learning_rate": 0.004659657504956747, "loss": 3.2646514892578127, "num_input_tokens_seen": 1934622720, "step": 3690, "train_runtime": 16797.8998, "train_tokens_per_second": 115170.512 }, { "epoch": 0.20022186747476933, "grad_norm": 0.18142655491828918, "learning_rate": 0.004657575626525069, "loss": 3.258336639404297, "num_input_tokens_seen": 1939865600, "step": 3700, "train_runtime": 16843.1356, "train_tokens_per_second": 115172.474 }, { "epoch": 0.2007630076571336, "grad_norm": 0.1888807713985443, "learning_rate": 0.00465548792435276, "loss": 3.256613922119141, "num_input_tokens_seen": 1945108480, "step": 3710, "train_runtime": 16888.3732, "train_tokens_per_second": 115174.414 }, { "epoch": 0.20130414783949782, "grad_norm": 0.17357957363128662, "learning_rate": 0.004653394404813478, "loss": 3.2642303466796876, "num_input_tokens_seen": 1950351360, "step": 3720, "train_runtime": 16933.6199, "train_tokens_per_second": 115176.281 }, { "epoch": 0.20184528802186205, "grad_norm": 0.17942315340042114, "learning_rate": 0.004651295074298641, "loss": 3.254298782348633, "num_input_tokens_seen": 1955594240, "step": 3730, "train_runtime": 16978.845, "train_tokens_per_second": 115178.284 }, { "epoch": 0.2023864282042263, "grad_norm": 0.17983372509479523, "learning_rate": 0.00464918993921741, "loss": 3.2564628601074217, "num_input_tokens_seen": 1960837120, "step": 3740, "train_runtime": 17024.0569, "train_tokens_per_second": 115180.367 }, { "epoch": 0.20292756838659054, "grad_norm": 0.19154661893844604, "learning_rate": 0.004647079005996664, "loss": 3.2626083374023436, "num_input_tokens_seen": 1966080000, "step": 3750, "train_runtime": 17069.2756, "train_tokens_per_second": 115182.392 }, { "epoch": 0.2034687085689548, "grad_norm": 0.16907712817192078, "learning_rate": 0.0046449622810809865, "loss": 3.2560802459716798, "num_input_tokens_seen": 1971322880, "step": 3760, "train_runtime": 17114.512, "train_tokens_per_second": 115184.288 }, { "epoch": 0.20400984875131903, "grad_norm": 0.1877511590719223, "learning_rate": 0.004642839770932641, "loss": 3.2611919403076173, "num_input_tokens_seen": 1976565760, "step": 3770, "train_runtime": 17159.7356, "train_tokens_per_second": 115186.26 }, { "epoch": 0.20455098893368326, "grad_norm": 0.1924838423728943, "learning_rate": 0.004640711482031552, "loss": 3.259069061279297, "num_input_tokens_seen": 1981808640, "step": 3780, "train_runtime": 17204.9712, "train_tokens_per_second": 115188.14 }, { "epoch": 0.20509212911604752, "grad_norm": 0.17791348695755005, "learning_rate": 0.00463857742087529, "loss": 3.2603363037109374, "num_input_tokens_seen": 1987051520, "step": 3790, "train_runtime": 17250.1988, "train_tokens_per_second": 115190.065 }, { "epoch": 0.20563326929841175, "grad_norm": 0.18873880803585052, "learning_rate": 0.004636437593979043, "loss": 3.260697937011719, "num_input_tokens_seen": 1992294400, "step": 3800, "train_runtime": 17295.4319, "train_tokens_per_second": 115191.943 }, { "epoch": 0.206174409480776, "grad_norm": 0.1765436977148056, "learning_rate": 0.004634292007875606, "loss": 3.25205078125, "num_input_tokens_seen": 1997537280, "step": 3810, "train_runtime": 17340.6638, "train_tokens_per_second": 115193.819 }, { "epoch": 0.20671554966314024, "grad_norm": 0.17367282509803772, "learning_rate": 0.004632140669115353, "loss": 3.2628250122070312, "num_input_tokens_seen": 2002780160, "step": 3820, "train_runtime": 17390.4197, "train_tokens_per_second": 115165.717 }, { "epoch": 0.20725668984550447, "grad_norm": 0.1866482049226761, "learning_rate": 0.004629983584266224, "loss": 3.255748748779297, "num_input_tokens_seen": 2008023040, "step": 3830, "train_runtime": 17435.5785, "train_tokens_per_second": 115168.134 }, { "epoch": 0.20779783002786872, "grad_norm": 0.18709656596183777, "learning_rate": 0.004627820759913699, "loss": 3.2663009643554686, "num_input_tokens_seen": 2013265920, "step": 3840, "train_runtime": 17480.7483, "train_tokens_per_second": 115170.465 }, { "epoch": 0.20833897021023295, "grad_norm": 0.19228561222553253, "learning_rate": 0.0046256522026607814, "loss": 3.2513301849365233, "num_input_tokens_seen": 2018508800, "step": 3850, "train_runtime": 17525.9241, "train_tokens_per_second": 115172.746 }, { "epoch": 0.2088801103925972, "grad_norm": 0.17255409061908722, "learning_rate": 0.004623477919127976, "loss": 3.243180847167969, "num_input_tokens_seen": 2023751680, "step": 3860, "train_runtime": 17571.1102, "train_tokens_per_second": 115174.947 }, { "epoch": 0.20942125057496144, "grad_norm": 0.176405668258667, "learning_rate": 0.004621297915953271, "loss": 3.2499061584472657, "num_input_tokens_seen": 2028994560, "step": 3870, "train_runtime": 17616.3069, "train_tokens_per_second": 115177.067 }, { "epoch": 0.20996239075732567, "grad_norm": 0.17931312322616577, "learning_rate": 0.004619112199792115, "loss": 3.263928985595703, "num_input_tokens_seen": 2034237440, "step": 3880, "train_runtime": 17661.4804, "train_tokens_per_second": 115179.328 }, { "epoch": 0.21050353093968993, "grad_norm": 0.1927679032087326, "learning_rate": 0.004616920777317401, "loss": 3.243641662597656, "num_input_tokens_seen": 2039480320, "step": 3890, "train_runtime": 17706.6588, "train_tokens_per_second": 115181.545 }, { "epoch": 0.21104467112205416, "grad_norm": 0.1834532767534256, "learning_rate": 0.00461472365521944, "loss": 3.2529090881347655, "num_input_tokens_seen": 2044723200, "step": 3900, "train_runtime": 17751.8407, "train_tokens_per_second": 115183.728 }, { "epoch": 0.21158581130441842, "grad_norm": 0.18204724788665771, "learning_rate": 0.004612520840205942, "loss": 3.252873992919922, "num_input_tokens_seen": 2049966080, "step": 3910, "train_runtime": 17797.0199, "train_tokens_per_second": 115185.918 }, { "epoch": 0.21212695148678265, "grad_norm": 0.17959408462047577, "learning_rate": 0.0046103123390020045, "loss": 3.2571083068847657, "num_input_tokens_seen": 2055208960, "step": 3920, "train_runtime": 17842.2041, "train_tokens_per_second": 115188.065 }, { "epoch": 0.21266809166914688, "grad_norm": 0.18271717429161072, "learning_rate": 0.004608098158350076, "loss": 3.2583240509033202, "num_input_tokens_seen": 2060451840, "step": 3930, "train_runtime": 17887.3836, "train_tokens_per_second": 115190.23 }, { "epoch": 0.21320923185151114, "grad_norm": 0.1708153486251831, "learning_rate": 0.004605878305009951, "loss": 3.2490577697753906, "num_input_tokens_seen": 2065694720, "step": 3940, "train_runtime": 17932.5711, "train_tokens_per_second": 115192.334 }, { "epoch": 0.21375037203387537, "grad_norm": 0.17891845107078552, "learning_rate": 0.004603652785758739, "loss": 3.253165435791016, "num_input_tokens_seen": 2070937600, "step": 3950, "train_runtime": 17977.7786, "train_tokens_per_second": 115194.299 }, { "epoch": 0.21429151221623963, "grad_norm": 0.19264911115169525, "learning_rate": 0.0046014216073908465, "loss": 3.252245330810547, "num_input_tokens_seen": 2076180480, "step": 3960, "train_runtime": 18022.9578, "train_tokens_per_second": 115196.434 }, { "epoch": 0.21483265239860386, "grad_norm": 0.17727237939834595, "learning_rate": 0.00459918477671796, "loss": 3.2557418823242186, "num_input_tokens_seen": 2081423360, "step": 3970, "train_runtime": 18068.1382, "train_tokens_per_second": 115198.552 }, { "epoch": 0.2153737925809681, "grad_norm": 0.18832355737686157, "learning_rate": 0.00459694230056902, "loss": 3.2547958374023436, "num_input_tokens_seen": 2086666240, "step": 3980, "train_runtime": 18113.3099, "train_tokens_per_second": 115200.714 }, { "epoch": 0.21591493276333235, "grad_norm": 0.1745108813047409, "learning_rate": 0.004594694185790203, "loss": 3.2427162170410155, "num_input_tokens_seen": 2091909120, "step": 3990, "train_runtime": 18158.476, "train_tokens_per_second": 115202.901 }, { "epoch": 0.21645607294569658, "grad_norm": 0.18806034326553345, "learning_rate": 0.004592440439244901, "loss": 3.247505950927734, "num_input_tokens_seen": 2097152000, "step": 4000, "train_runtime": 18203.6569, "train_tokens_per_second": 115204.984 }, { "epoch": 0.21645607294569658, "eval_loss": 3.1910831928253174, "eval_runtime": 1.9924, "eval_samples_per_second": 250.957, "eval_steps_per_second": 4.015, "num_input_tokens_seen": 2097152000, "step": 4000 }, { "epoch": 0.21699721312806083, "grad_norm": 0.18182304501533508, "learning_rate": 0.004590181067813696, "loss": 3.2401611328125, "num_input_tokens_seen": 2102394880, "step": 4010, "train_runtime": 18253.3021, "train_tokens_per_second": 115178.879 }, { "epoch": 0.21753835331042506, "grad_norm": 0.19632118940353394, "learning_rate": 0.004587916078394347, "loss": 3.248242950439453, "num_input_tokens_seen": 2107637760, "step": 4020, "train_runtime": 18298.4585, "train_tokens_per_second": 115181.164 }, { "epoch": 0.2180794934927893, "grad_norm": 0.17511795461177826, "learning_rate": 0.004585645477901763, "loss": 3.2442108154296876, "num_input_tokens_seen": 2112880640, "step": 4030, "train_runtime": 18343.616, "train_tokens_per_second": 115183.432 }, { "epoch": 0.21862063367515355, "grad_norm": 0.18919962644577026, "learning_rate": 0.004583369273267981, "loss": 3.2474128723144533, "num_input_tokens_seen": 2118123520, "step": 4040, "train_runtime": 18388.8196, "train_tokens_per_second": 115185.399 }, { "epoch": 0.21916177385751778, "grad_norm": 0.1883443295955658, "learning_rate": 0.00458108747144215, "loss": 3.2397232055664062, "num_input_tokens_seen": 2123366400, "step": 4050, "train_runtime": 18433.9903, "train_tokens_per_second": 115187.562 }, { "epoch": 0.21970291403988204, "grad_norm": 0.16874343156814575, "learning_rate": 0.004578800079390506, "loss": 3.243609619140625, "num_input_tokens_seen": 2128609280, "step": 4060, "train_runtime": 18479.1743, "train_tokens_per_second": 115189.632 }, { "epoch": 0.22024405422224627, "grad_norm": 0.1780671924352646, "learning_rate": 0.004576507104096353, "loss": 3.249961090087891, "num_input_tokens_seen": 2133852160, "step": 4070, "train_runtime": 18524.3424, "train_tokens_per_second": 115191.79 }, { "epoch": 0.2207851944046105, "grad_norm": 0.1814623475074768, "learning_rate": 0.0045742085525600365, "loss": 3.247069549560547, "num_input_tokens_seen": 2139095040, "step": 4080, "train_runtime": 18569.5288, "train_tokens_per_second": 115193.825 }, { "epoch": 0.22132633458697476, "grad_norm": 0.18335077166557312, "learning_rate": 0.004571904431798931, "loss": 3.241147994995117, "num_input_tokens_seen": 2144337920, "step": 4090, "train_runtime": 18614.7052, "train_tokens_per_second": 115195.911 }, { "epoch": 0.221867474769339, "grad_norm": 0.16724441945552826, "learning_rate": 0.004569594748847409, "loss": 3.24347038269043, "num_input_tokens_seen": 2149580800, "step": 4100, "train_runtime": 18659.889, "train_tokens_per_second": 115197.942 }, { "epoch": 0.22240861495170325, "grad_norm": 0.17200608551502228, "learning_rate": 0.004567279510756828, "loss": 3.2341545104980467, "num_input_tokens_seen": 2154823680, "step": 4110, "train_runtime": 18705.0717, "train_tokens_per_second": 115199.969 }, { "epoch": 0.22294975513406748, "grad_norm": 0.178621307015419, "learning_rate": 0.0045649587245955026, "loss": 3.2321949005126953, "num_input_tokens_seen": 2160066560, "step": 4120, "train_runtime": 18750.2407, "train_tokens_per_second": 115202.071 }, { "epoch": 0.2234908953164317, "grad_norm": 0.18516632914543152, "learning_rate": 0.0045626323974486864, "loss": 3.238597869873047, "num_input_tokens_seen": 2165309440, "step": 4130, "train_runtime": 18795.4162, "train_tokens_per_second": 115204.123 }, { "epoch": 0.22403203549879597, "grad_norm": 0.20164692401885986, "learning_rate": 0.004560300536418549, "loss": 3.237165832519531, "num_input_tokens_seen": 2170552320, "step": 4140, "train_runtime": 18840.5926, "train_tokens_per_second": 115206.16 }, { "epoch": 0.2245731756811602, "grad_norm": 0.1872573047876358, "learning_rate": 0.004557963148624155, "loss": 3.2406959533691406, "num_input_tokens_seen": 2175795200, "step": 4150, "train_runtime": 18885.7554, "train_tokens_per_second": 115208.269 }, { "epoch": 0.22511431586352446, "grad_norm": 0.1811392605304718, "learning_rate": 0.0045556202412014414, "loss": 3.235840606689453, "num_input_tokens_seen": 2181038080, "step": 4160, "train_runtime": 18930.9167, "train_tokens_per_second": 115210.379 }, { "epoch": 0.22565545604588869, "grad_norm": 0.1595960259437561, "learning_rate": 0.0045532718213031976, "loss": 3.2397125244140623, "num_input_tokens_seen": 2186280960, "step": 4170, "train_runtime": 18976.0748, "train_tokens_per_second": 115212.497 }, { "epoch": 0.22619659622825292, "grad_norm": 0.16895633935928345, "learning_rate": 0.00455091789609904, "loss": 3.2353279113769533, "num_input_tokens_seen": 2191523840, "step": 4180, "train_runtime": 19021.2321, "train_tokens_per_second": 115214.61 }, { "epoch": 0.22673773641061717, "grad_norm": 0.17124150693416595, "learning_rate": 0.004548558472775396, "loss": 3.2387535095214846, "num_input_tokens_seen": 2196766720, "step": 4190, "train_runtime": 19066.399, "train_tokens_per_second": 115216.656 }, { "epoch": 0.2272788765929814, "grad_norm": 0.1730462908744812, "learning_rate": 0.004546193558535476, "loss": 3.228282165527344, "num_input_tokens_seen": 2202009600, "step": 4200, "train_runtime": 19115.1785, "train_tokens_per_second": 115196.916 }, { "epoch": 0.22782001677534566, "grad_norm": 0.2116994708776474, "learning_rate": 0.004543823160599253, "loss": 3.228871154785156, "num_input_tokens_seen": 2207252480, "step": 4210, "train_runtime": 19160.343, "train_tokens_per_second": 115199.007 }, { "epoch": 0.2283611569577099, "grad_norm": 0.1870228499174118, "learning_rate": 0.004541447286203444, "loss": 3.2268039703369142, "num_input_tokens_seen": 2212495360, "step": 4220, "train_runtime": 19205.5057, "train_tokens_per_second": 115201.099 }, { "epoch": 0.22890229714007412, "grad_norm": 0.18021926283836365, "learning_rate": 0.004539065942601484, "loss": 3.2385711669921875, "num_input_tokens_seen": 2217738240, "step": 4230, "train_runtime": 19250.6698, "train_tokens_per_second": 115203.173 }, { "epoch": 0.22944343732243838, "grad_norm": 0.178096741437912, "learning_rate": 0.004536679137063506, "loss": 3.2425048828125, "num_input_tokens_seen": 2222981120, "step": 4240, "train_runtime": 19295.8409, "train_tokens_per_second": 115205.195 }, { "epoch": 0.2299845775048026, "grad_norm": 0.17963331937789917, "learning_rate": 0.004534286876876316, "loss": 3.2272270202636717, "num_input_tokens_seen": 2228224000, "step": 4250, "train_runtime": 19341.0226, "train_tokens_per_second": 115207.145 }, { "epoch": 0.23052571768716687, "grad_norm": 0.1644730269908905, "learning_rate": 0.004531889169343374, "loss": 3.232299041748047, "num_input_tokens_seen": 2233466880, "step": 4260, "train_runtime": 19386.1815, "train_tokens_per_second": 115209.221 }, { "epoch": 0.2310668578695311, "grad_norm": 0.18202030658721924, "learning_rate": 0.004529486021784774, "loss": 3.232588195800781, "num_input_tokens_seen": 2238709760, "step": 4270, "train_runtime": 19431.3552, "train_tokens_per_second": 115211.201 }, { "epoch": 0.23160799805189533, "grad_norm": 0.1674603968858719, "learning_rate": 0.004527077441537213, "loss": 3.2268638610839844, "num_input_tokens_seen": 2243952640, "step": 4280, "train_runtime": 19476.5366, "train_tokens_per_second": 115213.125 }, { "epoch": 0.2321491382342596, "grad_norm": 0.17482970654964447, "learning_rate": 0.004524663435953974, "loss": 3.231060791015625, "num_input_tokens_seen": 2249195520, "step": 4290, "train_runtime": 19521.6994, "train_tokens_per_second": 115215.15 }, { "epoch": 0.23269027841662382, "grad_norm": 0.1693650186061859, "learning_rate": 0.004522244012404908, "loss": 3.2219474792480467, "num_input_tokens_seen": 2254438400, "step": 4300, "train_runtime": 19566.837, "train_tokens_per_second": 115217.314 }, { "epoch": 0.23323141859898808, "grad_norm": 0.16282694041728973, "learning_rate": 0.004519819178276401, "loss": 3.214075469970703, "num_input_tokens_seen": 2259681280, "step": 4310, "train_runtime": 19611.992, "train_tokens_per_second": 115219.366 }, { "epoch": 0.2337725587813523, "grad_norm": 0.17166836559772491, "learning_rate": 0.004517388940971363, "loss": 3.229071044921875, "num_input_tokens_seen": 2264924160, "step": 4320, "train_runtime": 19657.1365, "train_tokens_per_second": 115221.47 }, { "epoch": 0.23431369896371654, "grad_norm": 0.1811853051185608, "learning_rate": 0.004514953307909195, "loss": 3.2278045654296874, "num_input_tokens_seen": 2270167040, "step": 4330, "train_runtime": 19702.2886, "train_tokens_per_second": 115223.52 }, { "epoch": 0.2348548391460808, "grad_norm": 0.18831849098205566, "learning_rate": 0.0045125122865257725, "loss": 3.2335960388183596, "num_input_tokens_seen": 2275409920, "step": 4340, "train_runtime": 19747.4554, "train_tokens_per_second": 115225.475 }, { "epoch": 0.23539597932844503, "grad_norm": 0.18200556933879852, "learning_rate": 0.004510065884273422, "loss": 3.230799102783203, "num_input_tokens_seen": 2280652800, "step": 4350, "train_runtime": 19792.6287, "train_tokens_per_second": 115227.383 }, { "epoch": 0.23593711951080928, "grad_norm": 0.18054424226284027, "learning_rate": 0.004507614108620896, "loss": 3.2332107543945314, "num_input_tokens_seen": 2285895680, "step": 4360, "train_runtime": 19837.7879, "train_tokens_per_second": 115229.364 }, { "epoch": 0.23647825969317351, "grad_norm": 0.17672619223594666, "learning_rate": 0.004505156967053355, "loss": 3.229229736328125, "num_input_tokens_seen": 2291138560, "step": 4370, "train_runtime": 19882.9214, "train_tokens_per_second": 115231.485 }, { "epoch": 0.23701939987553775, "grad_norm": 0.18023458123207092, "learning_rate": 0.004502694467072336, "loss": 3.221567916870117, "num_input_tokens_seen": 2296381440, "step": 4380, "train_runtime": 19928.0618, "train_tokens_per_second": 115233.557 }, { "epoch": 0.237560540057902, "grad_norm": 0.18084236979484558, "learning_rate": 0.0045002266161957415, "loss": 3.2244552612304687, "num_input_tokens_seen": 2301624320, "step": 4390, "train_runtime": 19973.1892, "train_tokens_per_second": 115235.694 }, { "epoch": 0.23810168024026623, "grad_norm": 0.17632804811000824, "learning_rate": 0.004497753421957804, "loss": 3.2264179229736327, "num_input_tokens_seen": 2306867200, "step": 4400, "train_runtime": 20018.2945, "train_tokens_per_second": 115237.949 }, { "epoch": 0.2386428204226305, "grad_norm": 0.18496030569076538, "learning_rate": 0.004495274891909074, "loss": 3.2306861877441406, "num_input_tokens_seen": 2312110080, "step": 4410, "train_runtime": 20063.4387, "train_tokens_per_second": 115239.97 }, { "epoch": 0.23918396060499472, "grad_norm": 0.19217975437641144, "learning_rate": 0.004492791033616388, "loss": 3.2278289794921875, "num_input_tokens_seen": 2317352960, "step": 4420, "train_runtime": 20108.5903, "train_tokens_per_second": 115241.94 }, { "epoch": 0.23972510078735895, "grad_norm": 0.17978626489639282, "learning_rate": 0.004490301854662851, "loss": 3.222820281982422, "num_input_tokens_seen": 2322595840, "step": 4430, "train_runtime": 20153.7635, "train_tokens_per_second": 115243.778 }, { "epoch": 0.2402662409697232, "grad_norm": 0.1925116330385208, "learning_rate": 0.0044878073626478145, "loss": 3.216511535644531, "num_input_tokens_seen": 2327838720, "step": 4440, "train_runtime": 20198.9336, "train_tokens_per_second": 115245.625 }, { "epoch": 0.24080738115208744, "grad_norm": 0.1764633059501648, "learning_rate": 0.004485307565186844, "loss": 3.2247901916503907, "num_input_tokens_seen": 2333081600, "step": 4450, "train_runtime": 20244.1177, "train_tokens_per_second": 115247.383 }, { "epoch": 0.2413485213344517, "grad_norm": 0.18181581795215607, "learning_rate": 0.0044828024699117095, "loss": 3.2144775390625, "num_input_tokens_seen": 2338324480, "step": 4460, "train_runtime": 20289.2915, "train_tokens_per_second": 115249.193 }, { "epoch": 0.24188966151681593, "grad_norm": 0.1797225922346115, "learning_rate": 0.0044802920844703486, "loss": 3.2179054260253905, "num_input_tokens_seen": 2343567360, "step": 4470, "train_runtime": 20334.4596, "train_tokens_per_second": 115251.028 }, { "epoch": 0.24243080169918016, "grad_norm": 0.17131617665290833, "learning_rate": 0.004477776416526856, "loss": 3.2136348724365233, "num_input_tokens_seen": 2348810240, "step": 4480, "train_runtime": 20379.6251, "train_tokens_per_second": 115252.868 }, { "epoch": 0.24297194188154442, "grad_norm": 0.17490802705287933, "learning_rate": 0.004475255473761447, "loss": 3.223601531982422, "num_input_tokens_seen": 2354053120, "step": 4490, "train_runtime": 20424.7932, "train_tokens_per_second": 115254.686 }, { "epoch": 0.24351308206390865, "grad_norm": 0.18434712290763855, "learning_rate": 0.004472729263870446, "loss": 3.219706726074219, "num_input_tokens_seen": 2359296000, "step": 4500, "train_runtime": 20469.9487, "train_tokens_per_second": 115256.566 }, { "epoch": 0.24351308206390865, "eval_loss": 3.1681437492370605, "eval_runtime": 1.9847, "eval_samples_per_second": 251.921, "eval_steps_per_second": 4.031, "num_input_tokens_seen": 2359296000, "step": 4500 }, { "epoch": 0.2440542222462729, "grad_norm": 0.18588702380657196, "learning_rate": 0.0044701977945662535, "loss": 3.2231178283691406, "num_input_tokens_seen": 2364538880, "step": 4510, "train_runtime": 20517.077, "train_tokens_per_second": 115247.356 }, { "epoch": 0.24459536242863714, "grad_norm": 0.1629338264465332, "learning_rate": 0.004467661073577332, "loss": 3.2203128814697264, "num_input_tokens_seen": 2369781760, "step": 4520, "train_runtime": 20562.2414, "train_tokens_per_second": 115249.195 }, { "epoch": 0.24513650261100137, "grad_norm": 0.19198022782802582, "learning_rate": 0.00446511910864817, "loss": 3.2169837951660156, "num_input_tokens_seen": 2375024640, "step": 4530, "train_runtime": 20607.3939, "train_tokens_per_second": 115251.092 }, { "epoch": 0.24567764279336562, "grad_norm": 0.17990648746490479, "learning_rate": 0.004462571907539273, "loss": 3.2237472534179688, "num_input_tokens_seen": 2380267520, "step": 4540, "train_runtime": 20652.5476, "train_tokens_per_second": 115252.974 }, { "epoch": 0.24621878297572986, "grad_norm": 0.17402444779872894, "learning_rate": 0.004460019478027127, "loss": 3.2200748443603517, "num_input_tokens_seen": 2385510400, "step": 4550, "train_runtime": 20697.698, "train_tokens_per_second": 115254.865 }, { "epoch": 0.2467599231580941, "grad_norm": 0.17017342150211334, "learning_rate": 0.004457461827904183, "loss": 3.2241039276123047, "num_input_tokens_seen": 2390753280, "step": 4560, "train_runtime": 20742.8484, "train_tokens_per_second": 115256.749 }, { "epoch": 0.24730106334045834, "grad_norm": 0.17307622730731964, "learning_rate": 0.004454898964978828, "loss": 3.2237174987792967, "num_input_tokens_seen": 2395996160, "step": 4570, "train_runtime": 20788.0181, "train_tokens_per_second": 115258.518 }, { "epoch": 0.24784220352282257, "grad_norm": 0.18426761031150818, "learning_rate": 0.004452330897075365, "loss": 3.2148464202880858, "num_input_tokens_seen": 2401239040, "step": 4580, "train_runtime": 20836.8458, "train_tokens_per_second": 115240.045 }, { "epoch": 0.24838334370518683, "grad_norm": 0.18935158848762512, "learning_rate": 0.004449757632033987, "loss": 3.2203189849853517, "num_input_tokens_seen": 2406481920, "step": 4590, "train_runtime": 20882.0043, "train_tokens_per_second": 115241.903 }, { "epoch": 0.24892448388755106, "grad_norm": 0.17916643619537354, "learning_rate": 0.004447179177710755, "loss": 3.220214080810547, "num_input_tokens_seen": 2411724800, "step": 4600, "train_runtime": 20927.1721, "train_tokens_per_second": 115243.703 }, { "epoch": 0.24946562406991532, "grad_norm": 0.1714351922273636, "learning_rate": 0.0044445955419775696, "loss": 3.2130130767822265, "num_input_tokens_seen": 2416967680, "step": 4610, "train_runtime": 20972.3937, "train_tokens_per_second": 115245.199 }, { "epoch": 0.2500067642522796, "grad_norm": 0.17399680614471436, "learning_rate": 0.004442006732722152, "loss": 3.2150115966796875, "num_input_tokens_seen": 2422210560, "step": 4620, "train_runtime": 21017.5875, "train_tokens_per_second": 115246.841 }, { "epoch": 0.2505479044346438, "grad_norm": 0.18769405782222748, "learning_rate": 0.00443941275784802, "loss": 3.2187454223632814, "num_input_tokens_seen": 2427453440, "step": 4630, "train_runtime": 21062.785, "train_tokens_per_second": 115248.456 }, { "epoch": 0.25108904461700804, "grad_norm": 0.1909925937652588, "learning_rate": 0.004436813625274458, "loss": 3.228108215332031, "num_input_tokens_seen": 2432696320, "step": 4640, "train_runtime": 21107.9893, "train_tokens_per_second": 115250.026 }, { "epoch": 0.25163018479937227, "grad_norm": 0.16732154786586761, "learning_rate": 0.004434209342936497, "loss": 3.213469314575195, "num_input_tokens_seen": 2437939200, "step": 4650, "train_runtime": 21153.1981, "train_tokens_per_second": 115251.566 }, { "epoch": 0.2521713249817365, "grad_norm": 0.17457814514636993, "learning_rate": 0.0044315999187848915, "loss": 3.224944305419922, "num_input_tokens_seen": 2443182080, "step": 4660, "train_runtime": 21198.4016, "train_tokens_per_second": 115253.127 }, { "epoch": 0.2527124651641008, "grad_norm": 0.192255899310112, "learning_rate": 0.004428985360786096, "loss": 3.227398681640625, "num_input_tokens_seen": 2448424960, "step": 4670, "train_runtime": 21243.6232, "train_tokens_per_second": 115254.584 }, { "epoch": 0.253253605346465, "grad_norm": 0.17784617841243744, "learning_rate": 0.004426365676922234, "loss": 3.2128623962402343, "num_input_tokens_seen": 2453667840, "step": 4680, "train_runtime": 21288.8398, "train_tokens_per_second": 115256.062 }, { "epoch": 0.25379474552882925, "grad_norm": 0.17195752263069153, "learning_rate": 0.00442374087519108, "loss": 3.2142982482910156, "num_input_tokens_seen": 2458910720, "step": 4690, "train_runtime": 21334.0638, "train_tokens_per_second": 115257.493 }, { "epoch": 0.2543358857111935, "grad_norm": 0.1722942292690277, "learning_rate": 0.004421110963606032, "loss": 3.210185241699219, "num_input_tokens_seen": 2464153600, "step": 4700, "train_runtime": 21379.267, "train_tokens_per_second": 115259.031 }, { "epoch": 0.2548770258935577, "grad_norm": 0.16966107487678528, "learning_rate": 0.00441847595019609, "loss": 3.2123428344726563, "num_input_tokens_seen": 2469396480, "step": 4710, "train_runtime": 21424.483, "train_tokens_per_second": 115260.493 }, { "epoch": 0.255418166075922, "grad_norm": 0.18033796548843384, "learning_rate": 0.004415835843005828, "loss": 3.2065505981445312, "num_input_tokens_seen": 2474639360, "step": 4720, "train_runtime": 21469.6877, "train_tokens_per_second": 115262.01 }, { "epoch": 0.2559593062582862, "grad_norm": 0.18272215127944946, "learning_rate": 0.004413190650095373, "loss": 3.2069171905517577, "num_input_tokens_seen": 2479882240, "step": 4730, "train_runtime": 21514.9, "train_tokens_per_second": 115263.48 }, { "epoch": 0.25650044644065045, "grad_norm": 0.17386901378631592, "learning_rate": 0.004410540379540377, "loss": 3.2177162170410156, "num_input_tokens_seen": 2485125120, "step": 4740, "train_runtime": 21560.142, "train_tokens_per_second": 115264.784 }, { "epoch": 0.2570415866230147, "grad_norm": 0.17471922934055328, "learning_rate": 0.0044078850394319935, "loss": 3.2096931457519533, "num_input_tokens_seen": 2490368000, "step": 4750, "train_runtime": 21605.3695, "train_tokens_per_second": 115266.161 }, { "epoch": 0.2575827268053789, "grad_norm": 0.188929483294487, "learning_rate": 0.004405224637876854, "loss": 3.215177536010742, "num_input_tokens_seen": 2495610880, "step": 4760, "train_runtime": 21650.575, "train_tokens_per_second": 115267.649 }, { "epoch": 0.2581238669877432, "grad_norm": 0.18201977014541626, "learning_rate": 0.0044025591829970415, "loss": 3.2025718688964844, "num_input_tokens_seen": 2500853760, "step": 4770, "train_runtime": 21695.7857, "train_tokens_per_second": 115269.103 }, { "epoch": 0.25866500717010743, "grad_norm": 0.18745562434196472, "learning_rate": 0.004399888682930069, "loss": 3.2124725341796876, "num_input_tokens_seen": 2506096640, "step": 4780, "train_runtime": 21740.9904, "train_tokens_per_second": 115270.583 }, { "epoch": 0.25920614735247166, "grad_norm": 0.18076451122760773, "learning_rate": 0.004397213145828847, "loss": 3.2005435943603517, "num_input_tokens_seen": 2511339520, "step": 4790, "train_runtime": 21786.1967, "train_tokens_per_second": 115272.049 }, { "epoch": 0.2597472875348359, "grad_norm": 0.16596098244190216, "learning_rate": 0.004394532579861671, "loss": 3.197236251831055, "num_input_tokens_seen": 2516582400, "step": 4800, "train_runtime": 21831.4029, "train_tokens_per_second": 115273.508 }, { "epoch": 0.2602884277172001, "grad_norm": 0.17128406465053558, "learning_rate": 0.004391846993212182, "loss": 3.2089080810546875, "num_input_tokens_seen": 2521825280, "step": 4810, "train_runtime": 21876.6005, "train_tokens_per_second": 115275.007 }, { "epoch": 0.2608295678995644, "grad_norm": 0.17832306027412415, "learning_rate": 0.004389156394079355, "loss": 3.202547073364258, "num_input_tokens_seen": 2527068160, "step": 4820, "train_runtime": 21921.8037, "train_tokens_per_second": 115276.471 }, { "epoch": 0.26137070808192864, "grad_norm": 0.16681405901908875, "learning_rate": 0.004386460790677465, "loss": 3.2106822967529296, "num_input_tokens_seen": 2532311040, "step": 4830, "train_runtime": 21967.0048, "train_tokens_per_second": 115277.939 }, { "epoch": 0.26191184826429287, "grad_norm": 0.17566899955272675, "learning_rate": 0.004383760191236065, "loss": 3.2070526123046874, "num_input_tokens_seen": 2537553920, "step": 4840, "train_runtime": 22012.208, "train_tokens_per_second": 115279.39 }, { "epoch": 0.2624529884466571, "grad_norm": 0.17574016749858856, "learning_rate": 0.00438105460399996, "loss": 3.203447723388672, "num_input_tokens_seen": 2542796800, "step": 4850, "train_runtime": 22057.4092, "train_tokens_per_second": 115280.846 }, { "epoch": 0.26299412862902133, "grad_norm": 0.16241556406021118, "learning_rate": 0.004378344037229184, "loss": 3.2026832580566404, "num_input_tokens_seen": 2548039680, "step": 4860, "train_runtime": 22102.6211, "train_tokens_per_second": 115282.24 }, { "epoch": 0.2635352688113856, "grad_norm": 0.1805507242679596, "learning_rate": 0.004375628499198973, "loss": 3.2010284423828126, "num_input_tokens_seen": 2553282560, "step": 4870, "train_runtime": 22147.8116, "train_tokens_per_second": 115283.74 }, { "epoch": 0.26407640899374984, "grad_norm": 0.16756032407283783, "learning_rate": 0.004372907998199739, "loss": 3.2070991516113283, "num_input_tokens_seen": 2558525440, "step": 4880, "train_runtime": 22192.9705, "train_tokens_per_second": 115285.398 }, { "epoch": 0.2646175491761141, "grad_norm": 0.18972600996494293, "learning_rate": 0.004370182542537047, "loss": 3.214699554443359, "num_input_tokens_seen": 2563768320, "step": 4890, "train_runtime": 22238.1209, "train_tokens_per_second": 115287.094 }, { "epoch": 0.2651586893584783, "grad_norm": 0.1896647959947586, "learning_rate": 0.004367452140531587, "loss": 3.205576705932617, "num_input_tokens_seen": 2569011200, "step": 4900, "train_runtime": 22283.3129, "train_tokens_per_second": 115288.566 }, { "epoch": 0.26569982954084254, "grad_norm": 0.18498484790325165, "learning_rate": 0.004364716800519152, "loss": 3.2080978393554687, "num_input_tokens_seen": 2574254080, "step": 4910, "train_runtime": 22328.4859, "train_tokens_per_second": 115290.132 }, { "epoch": 0.2662409697232068, "grad_norm": 0.1854403018951416, "learning_rate": 0.0043619765308506074, "loss": 3.203238677978516, "num_input_tokens_seen": 2579496960, "step": 4920, "train_runtime": 22373.6522, "train_tokens_per_second": 115291.725 }, { "epoch": 0.26678210990557105, "grad_norm": 0.1691334992647171, "learning_rate": 0.004359231339891872, "loss": 3.1914302825927736, "num_input_tokens_seen": 2584739840, "step": 4930, "train_runtime": 22418.8106, "train_tokens_per_second": 115293.353 }, { "epoch": 0.2673232500879353, "grad_norm": 0.17332448065280914, "learning_rate": 0.004356481236023887, "loss": 3.2087932586669923, "num_input_tokens_seen": 2589982720, "step": 4940, "train_runtime": 22463.9738, "train_tokens_per_second": 115294.949 }, { "epoch": 0.2678643902702995, "grad_norm": 0.1679113507270813, "learning_rate": 0.004353726227642593, "loss": 3.2014122009277344, "num_input_tokens_seen": 2595225600, "step": 4950, "train_runtime": 22509.1287, "train_tokens_per_second": 115296.582 }, { "epoch": 0.26840553045266374, "grad_norm": 0.16913928091526031, "learning_rate": 0.004350966323158903, "loss": 3.1890819549560545, "num_input_tokens_seen": 2600468480, "step": 4960, "train_runtime": 22554.2873, "train_tokens_per_second": 115298.189 }, { "epoch": 0.26894667063502803, "grad_norm": 0.16906581819057465, "learning_rate": 0.00434820153099868, "loss": 3.202825927734375, "num_input_tokens_seen": 2605711360, "step": 4970, "train_runtime": 22602.9949, "train_tokens_per_second": 115281.686 }, { "epoch": 0.26948781081739226, "grad_norm": 0.16878265142440796, "learning_rate": 0.004345431859602706, "loss": 3.200624465942383, "num_input_tokens_seen": 2610954240, "step": 4980, "train_runtime": 22648.1981, "train_tokens_per_second": 115283.089 }, { "epoch": 0.2700289509997565, "grad_norm": 0.1862846463918686, "learning_rate": 0.004342657317426662, "loss": 3.206439971923828, "num_input_tokens_seen": 2616197120, "step": 4990, "train_runtime": 22693.3935, "train_tokens_per_second": 115284.526 }, { "epoch": 0.2705700911821207, "grad_norm": 0.16954657435417175, "learning_rate": 0.004339877912941097, "loss": 3.199533462524414, "num_input_tokens_seen": 2621440000, "step": 5000, "train_runtime": 22738.6005, "train_tokens_per_second": 115285.899 }, { "epoch": 0.2705700911821207, "eval_loss": 3.146559715270996, "eval_runtime": 1.9859, "eval_samples_per_second": 251.773, "eval_steps_per_second": 4.028, "num_input_tokens_seen": 2621440000, "step": 5000 }, { "epoch": 0.27111123136448495, "grad_norm": 0.1746288388967514, "learning_rate": 0.004337093654631402, "loss": 3.195170593261719, "num_input_tokens_seen": 2626682880, "step": 5010, "train_runtime": 22788.1861, "train_tokens_per_second": 115265.114 }, { "epoch": 0.27165237154684924, "grad_norm": 0.182390496134758, "learning_rate": 0.004334304550997793, "loss": 3.184975433349609, "num_input_tokens_seen": 2631925760, "step": 5020, "train_runtime": 22833.4175, "train_tokens_per_second": 115266.397 }, { "epoch": 0.27219351172921347, "grad_norm": 0.18103830516338348, "learning_rate": 0.004331510610555275, "loss": 3.190489959716797, "num_input_tokens_seen": 2637168640, "step": 5030, "train_runtime": 22878.6263, "train_tokens_per_second": 115267.788 }, { "epoch": 0.2727346519115777, "grad_norm": 0.1782936155796051, "learning_rate": 0.004328711841833618, "loss": 3.196137237548828, "num_input_tokens_seen": 2642411520, "step": 5040, "train_runtime": 22923.8218, "train_tokens_per_second": 115269.24 }, { "epoch": 0.2732757920939419, "grad_norm": 0.185542032122612, "learning_rate": 0.0043259082533773354, "loss": 3.190313720703125, "num_input_tokens_seen": 2647654400, "step": 5050, "train_runtime": 22969.0255, "train_tokens_per_second": 115270.646 }, { "epoch": 0.27381693227630616, "grad_norm": 0.16143307089805603, "learning_rate": 0.0043230998537456536, "loss": 3.2025264739990233, "num_input_tokens_seen": 2652897280, "step": 5060, "train_runtime": 23014.1965, "train_tokens_per_second": 115272.209 }, { "epoch": 0.27435807245867044, "grad_norm": 0.16813282668590546, "learning_rate": 0.004320286651512486, "loss": 3.1958364486694335, "num_input_tokens_seen": 2658140160, "step": 5070, "train_runtime": 23059.3886, "train_tokens_per_second": 115273.662 }, { "epoch": 0.2748992126410347, "grad_norm": 0.18112315237522125, "learning_rate": 0.004317468655266412, "loss": 3.194669723510742, "num_input_tokens_seen": 2663383040, "step": 5080, "train_runtime": 23104.5863, "train_tokens_per_second": 115275.08 }, { "epoch": 0.2754403528233989, "grad_norm": 0.18187521398067474, "learning_rate": 0.004314645873610643, "loss": 3.1878196716308596, "num_input_tokens_seen": 2668625920, "step": 5090, "train_runtime": 23149.7951, "train_tokens_per_second": 115276.438 }, { "epoch": 0.27598149300576313, "grad_norm": 0.16804195940494537, "learning_rate": 0.004311818315163001, "loss": 3.2023330688476563, "num_input_tokens_seen": 2673868800, "step": 5100, "train_runtime": 23194.9829, "train_tokens_per_second": 115277.895 }, { "epoch": 0.27652263318812736, "grad_norm": 0.16169899702072144, "learning_rate": 0.004308985988555892, "loss": 3.195353889465332, "num_input_tokens_seen": 2679111680, "step": 5110, "train_runtime": 23240.1686, "train_tokens_per_second": 115279.356 }, { "epoch": 0.27706377337049165, "grad_norm": 0.1690625697374344, "learning_rate": 0.004306148902436281, "loss": 3.1894439697265624, "num_input_tokens_seen": 2684354560, "step": 5120, "train_runtime": 23285.3699, "train_tokens_per_second": 115280.735 }, { "epoch": 0.2776049135528559, "grad_norm": 0.1880822330713272, "learning_rate": 0.00430330706546566, "loss": 3.1982452392578127, "num_input_tokens_seen": 2689597440, "step": 5130, "train_runtime": 23330.5682, "train_tokens_per_second": 115282.123 }, { "epoch": 0.2781460537352201, "grad_norm": 0.1790066808462143, "learning_rate": 0.004300460486320026, "loss": 3.1980308532714843, "num_input_tokens_seen": 2694840320, "step": 5140, "train_runtime": 23375.7756, "train_tokens_per_second": 115283.461 }, { "epoch": 0.27868719391758434, "grad_norm": 0.16601233184337616, "learning_rate": 0.004297609173689855, "loss": 3.197714996337891, "num_input_tokens_seen": 2700083200, "step": 5150, "train_runtime": 23420.9835, "train_tokens_per_second": 115284.792 }, { "epoch": 0.27922833409994857, "grad_norm": 0.16672180593013763, "learning_rate": 0.0042947531362800715, "loss": 3.1988187789916993, "num_input_tokens_seen": 2705326080, "step": 5160, "train_runtime": 23466.1808, "train_tokens_per_second": 115286.169 }, { "epoch": 0.27976947428231286, "grad_norm": 0.19832877814769745, "learning_rate": 0.00429189238281003, "loss": 3.1931121826171873, "num_input_tokens_seen": 2710568960, "step": 5170, "train_runtime": 23511.3921, "train_tokens_per_second": 115287.472 }, { "epoch": 0.2803106144646771, "grad_norm": 0.1923927664756775, "learning_rate": 0.004289026922013475, "loss": 3.1957611083984374, "num_input_tokens_seen": 2715811840, "step": 5180, "train_runtime": 23556.5915, "train_tokens_per_second": 115288.829 }, { "epoch": 0.2808517546470413, "grad_norm": 0.17779354751110077, "learning_rate": 0.00428615676263853, "loss": 3.181416702270508, "num_input_tokens_seen": 2721054720, "step": 5190, "train_runtime": 23601.8043, "train_tokens_per_second": 115290.115 }, { "epoch": 0.28139289482940555, "grad_norm": 0.16895556449890137, "learning_rate": 0.004283281913447657, "loss": 3.1839942932128906, "num_input_tokens_seen": 2726297600, "step": 5200, "train_runtime": 23647.0206, "train_tokens_per_second": 115291.379 }, { "epoch": 0.2819340350117698, "grad_norm": 0.17021089792251587, "learning_rate": 0.004280402383217639, "loss": 3.193735122680664, "num_input_tokens_seen": 2731540480, "step": 5210, "train_runtime": 23692.2429, "train_tokens_per_second": 115292.608 }, { "epoch": 0.28247517519413406, "grad_norm": 0.16943930089473724, "learning_rate": 0.00427751818073955, "loss": 3.1817481994628904, "num_input_tokens_seen": 2736783360, "step": 5220, "train_runtime": 23737.4424, "train_tokens_per_second": 115293.944 }, { "epoch": 0.2830163153764983, "grad_norm": 0.15319055318832397, "learning_rate": 0.004274629314818728, "loss": 3.1803112030029297, "num_input_tokens_seen": 2742026240, "step": 5230, "train_runtime": 23782.6783, "train_tokens_per_second": 115295.099 }, { "epoch": 0.2835574555588625, "grad_norm": 0.1702287793159485, "learning_rate": 0.004271735794274746, "loss": 3.1876094818115233, "num_input_tokens_seen": 2747269120, "step": 5240, "train_runtime": 23827.881, "train_tokens_per_second": 115296.409 }, { "epoch": 0.28409859574122676, "grad_norm": 0.18369406461715698, "learning_rate": 0.00426883762794139, "loss": 3.1819345474243166, "num_input_tokens_seen": 2752512000, "step": 5250, "train_runtime": 23873.0945, "train_tokens_per_second": 115297.663 }, { "epoch": 0.284639735923591, "grad_norm": 0.1792212277650833, "learning_rate": 0.004265934824666628, "loss": 3.1884128570556642, "num_input_tokens_seen": 2757754880, "step": 5260, "train_runtime": 23918.3193, "train_tokens_per_second": 115298.857 }, { "epoch": 0.28518087610595527, "grad_norm": 0.17727087438106537, "learning_rate": 0.0042630273933125865, "loss": 3.194817543029785, "num_input_tokens_seen": 2762997760, "step": 5270, "train_runtime": 23963.5296, "train_tokens_per_second": 115300.117 }, { "epoch": 0.2857220162883195, "grad_norm": 0.15836812555789948, "learning_rate": 0.004260115342755518, "loss": 3.1808521270751955, "num_input_tokens_seen": 2768240640, "step": 5280, "train_runtime": 24008.7153, "train_tokens_per_second": 115301.49 }, { "epoch": 0.28626315647068373, "grad_norm": 0.17416301369667053, "learning_rate": 0.00425719868188578, "loss": 3.1919151306152345, "num_input_tokens_seen": 2773483520, "step": 5290, "train_runtime": 24053.9251, "train_tokens_per_second": 115302.742 }, { "epoch": 0.28680429665304796, "grad_norm": 0.16871845722198486, "learning_rate": 0.004254277419607802, "loss": 3.182635498046875, "num_input_tokens_seen": 2778726400, "step": 5300, "train_runtime": 24099.1331, "train_tokens_per_second": 115303.998 }, { "epoch": 0.2873454368354122, "grad_norm": 0.1787181943655014, "learning_rate": 0.004251351564840067, "loss": 3.18890495300293, "num_input_tokens_seen": 2783969280, "step": 5310, "train_runtime": 24144.3426, "train_tokens_per_second": 115305.243 }, { "epoch": 0.2878865770177765, "grad_norm": 0.1912972331047058, "learning_rate": 0.00424842112651507, "loss": 3.1834373474121094, "num_input_tokens_seen": 2789212160, "step": 5320, "train_runtime": 24189.5491, "train_tokens_per_second": 115306.496 }, { "epoch": 0.2884277172001407, "grad_norm": 0.16879980266094208, "learning_rate": 0.004245486113579308, "loss": 3.1814502716064452, "num_input_tokens_seen": 2794455040, "step": 5330, "train_runtime": 24234.754, "train_tokens_per_second": 115307.753 }, { "epoch": 0.28896885738250494, "grad_norm": 0.17917132377624512, "learning_rate": 0.00424254653499324, "loss": 3.188125228881836, "num_input_tokens_seen": 2799697920, "step": 5340, "train_runtime": 24279.9641, "train_tokens_per_second": 115308.981 }, { "epoch": 0.28950999756486917, "grad_norm": 0.17311090230941772, "learning_rate": 0.004239602399731263, "loss": 3.1844112396240236, "num_input_tokens_seen": 2804940800, "step": 5350, "train_runtime": 24328.7709, "train_tokens_per_second": 115293.157 }, { "epoch": 0.2900511377472334, "grad_norm": 0.17229342460632324, "learning_rate": 0.004236653716781689, "loss": 3.185770797729492, "num_input_tokens_seen": 2810183680, "step": 5360, "train_runtime": 24373.9674, "train_tokens_per_second": 115294.471 }, { "epoch": 0.2905922779295977, "grad_norm": 0.180856391787529, "learning_rate": 0.0042337004951467075, "loss": 3.1889812469482424, "num_input_tokens_seen": 2815426560, "step": 5370, "train_runtime": 24419.1733, "train_tokens_per_second": 115295.736 }, { "epoch": 0.2911334181119619, "grad_norm": 0.16839343309402466, "learning_rate": 0.004230742743842371, "loss": 3.1733203887939454, "num_input_tokens_seen": 2820669440, "step": 5380, "train_runtime": 24464.3893, "train_tokens_per_second": 115296.949 }, { "epoch": 0.29167455829432615, "grad_norm": 0.16889749467372894, "learning_rate": 0.004227780471898559, "loss": 3.1818462371826173, "num_input_tokens_seen": 2825912320, "step": 5390, "train_runtime": 24509.5858, "train_tokens_per_second": 115298.249 }, { "epoch": 0.2922156984766904, "grad_norm": 0.17744433879852295, "learning_rate": 0.004224813688358949, "loss": 3.1864446640014648, "num_input_tokens_seen": 2831155200, "step": 5400, "train_runtime": 24554.7949, "train_tokens_per_second": 115299.485 }, { "epoch": 0.2927568386590546, "grad_norm": 0.1737280935049057, "learning_rate": 0.004221842402280996, "loss": 3.180088424682617, "num_input_tokens_seen": 2836398080, "step": 5410, "train_runtime": 24599.9993, "train_tokens_per_second": 115300.738 }, { "epoch": 0.2932979788414189, "grad_norm": 0.16631047427654266, "learning_rate": 0.004218866622735898, "loss": 3.175667572021484, "num_input_tokens_seen": 2841640960, "step": 5420, "train_runtime": 24645.2212, "train_tokens_per_second": 115301.905 }, { "epoch": 0.2938391190237831, "grad_norm": 0.17272046208381653, "learning_rate": 0.004215886358808577, "loss": 3.185796546936035, "num_input_tokens_seen": 2846883840, "step": 5430, "train_runtime": 24690.432, "train_tokens_per_second": 115303.12 }, { "epoch": 0.29438025920614735, "grad_norm": 0.1690651923418045, "learning_rate": 0.004212901619597638, "loss": 3.1886520385742188, "num_input_tokens_seen": 2852126720, "step": 5440, "train_runtime": 24735.6453, "train_tokens_per_second": 115304.318 }, { "epoch": 0.2949213993885116, "grad_norm": 0.19146323204040527, "learning_rate": 0.0042099124142153535, "loss": 3.1789478302001952, "num_input_tokens_seen": 2857369600, "step": 5450, "train_runtime": 24780.8456, "train_tokens_per_second": 115305.573 }, { "epoch": 0.2954625395708758, "grad_norm": 0.1788649708032608, "learning_rate": 0.00420691875178763, "loss": 3.1887844085693358, "num_input_tokens_seen": 2862612480, "step": 5460, "train_runtime": 24826.0467, "train_tokens_per_second": 115306.819 }, { "epoch": 0.2960036797532401, "grad_norm": 0.19091546535491943, "learning_rate": 0.004203920641453982, "loss": 3.175608253479004, "num_input_tokens_seen": 2867855360, "step": 5470, "train_runtime": 24871.2591, "train_tokens_per_second": 115308.009 }, { "epoch": 0.29654481993560433, "grad_norm": 0.16818441450595856, "learning_rate": 0.004200918092367501, "loss": 3.1859344482421874, "num_input_tokens_seen": 2873098240, "step": 5480, "train_runtime": 24916.485, "train_tokens_per_second": 115309.131 }, { "epoch": 0.29708596011796856, "grad_norm": 0.1913134902715683, "learning_rate": 0.0041979111136948325, "loss": 3.1723804473876953, "num_input_tokens_seen": 2878341120, "step": 5490, "train_runtime": 24961.6704, "train_tokens_per_second": 115310.437 }, { "epoch": 0.2976271003003328, "grad_norm": 0.18261617422103882, "learning_rate": 0.004194899714616144, "loss": 3.179214286804199, "num_input_tokens_seen": 2883584000, "step": 5500, "train_runtime": 25006.8704, "train_tokens_per_second": 115311.67 }, { "epoch": 0.2976271003003328, "eval_loss": 3.126129388809204, "eval_runtime": 1.9962, "eval_samples_per_second": 250.471, "eval_steps_per_second": 4.008, "num_input_tokens_seen": 2883584000, "step": 5500 }, { "epoch": 0.298168240482697, "grad_norm": 0.18416427075862885, "learning_rate": 0.004191883904325097, "loss": 3.1846160888671875, "num_input_tokens_seen": 2888826880, "step": 5510, "train_runtime": 25054.1224, "train_tokens_per_second": 115303.455 }, { "epoch": 0.2987093806650613, "grad_norm": 0.16038469970226288, "learning_rate": 0.004188863692028823, "loss": 3.180740737915039, "num_input_tokens_seen": 2894069760, "step": 5520, "train_runtime": 25099.3557, "train_tokens_per_second": 115304.544 }, { "epoch": 0.29925052084742554, "grad_norm": 0.16605685651302338, "learning_rate": 0.004185839086947891, "loss": 3.1796802520751952, "num_input_tokens_seen": 2899312640, "step": 5530, "train_runtime": 25144.6135, "train_tokens_per_second": 115305.516 }, { "epoch": 0.29979166102978977, "grad_norm": 0.1819118857383728, "learning_rate": 0.004182810098316281, "loss": 3.1764299392700197, "num_input_tokens_seen": 2904555520, "step": 5540, "train_runtime": 25189.8702, "train_tokens_per_second": 115306.49 }, { "epoch": 0.300332801212154, "grad_norm": 0.1876569390296936, "learning_rate": 0.004179776735381355, "loss": 3.18255500793457, "num_input_tokens_seen": 2909798400, "step": 5550, "train_runtime": 25235.1612, "train_tokens_per_second": 115307.304 }, { "epoch": 0.30087394139451823, "grad_norm": 0.1661430448293686, "learning_rate": 0.004176739007403832, "loss": 3.172201156616211, "num_input_tokens_seen": 2915041280, "step": 5560, "train_runtime": 25280.4455, "train_tokens_per_second": 115308.145 }, { "epoch": 0.3014150815768825, "grad_norm": 0.17655618488788605, "learning_rate": 0.004173696923657755, "loss": 3.17954158782959, "num_input_tokens_seen": 2920284160, "step": 5570, "train_runtime": 25325.7247, "train_tokens_per_second": 115309.007 }, { "epoch": 0.30195622175924675, "grad_norm": 0.17908194661140442, "learning_rate": 0.0041706504934304655, "loss": 3.1723983764648436, "num_input_tokens_seen": 2925527040, "step": 5580, "train_runtime": 25370.9962, "train_tokens_per_second": 115309.9 }, { "epoch": 0.302497361941611, "grad_norm": 0.17515423893928528, "learning_rate": 0.004167599726022575, "loss": 3.183238220214844, "num_input_tokens_seen": 2930769920, "step": 5590, "train_runtime": 25416.2839, "train_tokens_per_second": 115310.717 }, { "epoch": 0.3030385021239752, "grad_norm": 0.1749441921710968, "learning_rate": 0.004164544630747937, "loss": 3.185963821411133, "num_input_tokens_seen": 2936012800, "step": 5600, "train_runtime": 25461.5455, "train_tokens_per_second": 115311.649 }, { "epoch": 0.30357964230633944, "grad_norm": 0.1578006148338318, "learning_rate": 0.004161485216933615, "loss": 3.177383041381836, "num_input_tokens_seen": 2941255680, "step": 5610, "train_runtime": 25506.8309, "train_tokens_per_second": 115312.47 }, { "epoch": 0.3041207824887037, "grad_norm": 0.1903323382139206, "learning_rate": 0.00415842149391986, "loss": 3.179554748535156, "num_input_tokens_seen": 2946498560, "step": 5620, "train_runtime": 25552.099, "train_tokens_per_second": 115313.367 }, { "epoch": 0.30466192267106795, "grad_norm": 0.16383005678653717, "learning_rate": 0.004155353471060077, "loss": 3.160336494445801, "num_input_tokens_seen": 2951741440, "step": 5630, "train_runtime": 25597.3865, "train_tokens_per_second": 115314.172 }, { "epoch": 0.3052030628534322, "grad_norm": 0.1735740303993225, "learning_rate": 0.004152281157720798, "loss": 3.172795867919922, "num_input_tokens_seen": 2956984320, "step": 5640, "train_runtime": 25642.6481, "train_tokens_per_second": 115315.092 }, { "epoch": 0.3057442030357964, "grad_norm": 0.19910795986652374, "learning_rate": 0.004149204563281657, "loss": 3.1711971282958986, "num_input_tokens_seen": 2962227200, "step": 5650, "train_runtime": 25687.9012, "train_tokens_per_second": 115316.046 }, { "epoch": 0.30628534321816064, "grad_norm": 0.18566472828388214, "learning_rate": 0.004146123697135352, "loss": 3.177423095703125, "num_input_tokens_seen": 2967470080, "step": 5660, "train_runtime": 25733.1722, "train_tokens_per_second": 115316.917 }, { "epoch": 0.30682648340052493, "grad_norm": 0.16724054515361786, "learning_rate": 0.004143038568687626, "loss": 3.174397277832031, "num_input_tokens_seen": 2972712960, "step": 5670, "train_runtime": 25778.4366, "train_tokens_per_second": 115317.814 }, { "epoch": 0.30736762358288916, "grad_norm": 0.18052591383457184, "learning_rate": 0.004139949187357236, "loss": 3.172323226928711, "num_input_tokens_seen": 2977955840, "step": 5680, "train_runtime": 25823.6944, "train_tokens_per_second": 115318.738 }, { "epoch": 0.3079087637652534, "grad_norm": 0.1707129180431366, "learning_rate": 0.004136855562575921, "loss": 3.1627834320068358, "num_input_tokens_seen": 2983198720, "step": 5690, "train_runtime": 25868.9566, "train_tokens_per_second": 115319.638 }, { "epoch": 0.3084499039476176, "grad_norm": 0.18003937602043152, "learning_rate": 0.004133757703788374, "loss": 3.175765609741211, "num_input_tokens_seen": 2988441600, "step": 5700, "train_runtime": 25914.2132, "train_tokens_per_second": 115320.561 }, { "epoch": 0.30899104412998185, "grad_norm": 0.17585334181785583, "learning_rate": 0.004130655620452215, "loss": 3.1611761093139648, "num_input_tokens_seen": 2993684480, "step": 5710, "train_runtime": 25959.4637, "train_tokens_per_second": 115321.507 }, { "epoch": 0.30953218431234614, "grad_norm": 0.17584700882434845, "learning_rate": 0.004127549322037963, "loss": 3.1710134506225587, "num_input_tokens_seen": 2998927360, "step": 5720, "train_runtime": 26004.7204, "train_tokens_per_second": 115322.423 }, { "epoch": 0.31007332449471037, "grad_norm": 0.1671862006187439, "learning_rate": 0.004124438818029003, "loss": 3.171963691711426, "num_input_tokens_seen": 3004170240, "step": 5730, "train_runtime": 26053.4016, "train_tokens_per_second": 115308.177 }, { "epoch": 0.3106144646770746, "grad_norm": 0.17120610177516937, "learning_rate": 0.004121324117921561, "loss": 3.171039581298828, "num_input_tokens_seen": 3009413120, "step": 5740, "train_runtime": 26098.5909, "train_tokens_per_second": 115309.41 }, { "epoch": 0.31115560485943883, "grad_norm": 0.17267778515815735, "learning_rate": 0.004118205231224675, "loss": 3.1711191177368163, "num_input_tokens_seen": 3014656000, "step": 5750, "train_runtime": 26143.7653, "train_tokens_per_second": 115310.705 }, { "epoch": 0.31169674504180306, "grad_norm": 0.17473942041397095, "learning_rate": 0.004115082167460159, "loss": 3.1646095275878907, "num_input_tokens_seen": 3019898880, "step": 5760, "train_runtime": 26188.9631, "train_tokens_per_second": 115311.892 }, { "epoch": 0.31223788522416734, "grad_norm": 0.17137861251831055, "learning_rate": 0.004111954936162586, "loss": 3.1746740341186523, "num_input_tokens_seen": 3025141760, "step": 5770, "train_runtime": 26234.3115, "train_tokens_per_second": 115312.413 }, { "epoch": 0.3127790254065316, "grad_norm": 0.16885042190551758, "learning_rate": 0.004108823546879249, "loss": 3.162841033935547, "num_input_tokens_seen": 3030384640, "step": 5780, "train_runtime": 26279.5704, "train_tokens_per_second": 115313.325 }, { "epoch": 0.3133201655888958, "grad_norm": 0.15897022187709808, "learning_rate": 0.004105688009170134, "loss": 3.1719465255737305, "num_input_tokens_seen": 3035627520, "step": 5790, "train_runtime": 26324.8012, "train_tokens_per_second": 115314.357 }, { "epoch": 0.31386130577126004, "grad_norm": 0.1866680085659027, "learning_rate": 0.004102548332607894, "loss": 3.1683422088623048, "num_input_tokens_seen": 3040870400, "step": 5800, "train_runtime": 26370.0195, "train_tokens_per_second": 115315.44 }, { "epoch": 0.31440244595362427, "grad_norm": 0.18191276490688324, "learning_rate": 0.004099404526777816, "loss": 3.1652973175048826, "num_input_tokens_seen": 3046113280, "step": 5810, "train_runtime": 26415.2343, "train_tokens_per_second": 115316.535 }, { "epoch": 0.31494358613598855, "grad_norm": 0.16286683082580566, "learning_rate": 0.004096256601277797, "loss": 3.1653570175170898, "num_input_tokens_seen": 3051356160, "step": 5820, "train_runtime": 26460.4377, "train_tokens_per_second": 115317.675 }, { "epoch": 0.3154847263183528, "grad_norm": 0.15786544978618622, "learning_rate": 0.004093104565718307, "loss": 3.171334457397461, "num_input_tokens_seen": 3056599040, "step": 5830, "train_runtime": 26505.6409, "train_tokens_per_second": 115318.813 }, { "epoch": 0.316025866500717, "grad_norm": 0.16940993070602417, "learning_rate": 0.0040899484297223666, "loss": 3.16903076171875, "num_input_tokens_seen": 3061841920, "step": 5840, "train_runtime": 26550.8652, "train_tokens_per_second": 115319.855 }, { "epoch": 0.31656700668308124, "grad_norm": 0.1778353452682495, "learning_rate": 0.004086788202925512, "loss": 3.163807678222656, "num_input_tokens_seen": 3067084800, "step": 5850, "train_runtime": 26596.0801, "train_tokens_per_second": 115320.934 }, { "epoch": 0.3171081468654455, "grad_norm": 0.18578499555587769, "learning_rate": 0.004083623894975773, "loss": 3.1687942504882813, "num_input_tokens_seen": 3072327680, "step": 5860, "train_runtime": 26641.289, "train_tokens_per_second": 115322.036 }, { "epoch": 0.31764928704780976, "grad_norm": 0.17534473538398743, "learning_rate": 0.004080455515533633, "loss": 3.1645458221435545, "num_input_tokens_seen": 3077570560, "step": 5870, "train_runtime": 26686.5065, "train_tokens_per_second": 115323.096 }, { "epoch": 0.318190427230174, "grad_norm": 0.16227850317955017, "learning_rate": 0.004077283074272012, "loss": 3.1695529937744142, "num_input_tokens_seen": 3082813440, "step": 5880, "train_runtime": 26731.6901, "train_tokens_per_second": 115324.3 }, { "epoch": 0.3187315674125382, "grad_norm": 0.17972981929779053, "learning_rate": 0.004074106580876226, "loss": 3.164577102661133, "num_input_tokens_seen": 3088056320, "step": 5890, "train_runtime": 26776.8465, "train_tokens_per_second": 115325.616 }, { "epoch": 0.31927270759490245, "grad_norm": 0.17186778783798218, "learning_rate": 0.0040709260450439615, "loss": 3.168431854248047, "num_input_tokens_seen": 3093299200, "step": 5900, "train_runtime": 26822.0301, "train_tokens_per_second": 115326.811 }, { "epoch": 0.3198138477772667, "grad_norm": 0.16803112626075745, "learning_rate": 0.0040677414764852485, "loss": 3.1673011779785156, "num_input_tokens_seen": 3098542080, "step": 5910, "train_runtime": 26867.197, "train_tokens_per_second": 115328.074 }, { "epoch": 0.32035498795963097, "grad_norm": 0.16622225940227509, "learning_rate": 0.00406455288492243, "loss": 3.156739616394043, "num_input_tokens_seen": 3103784960, "step": 5920, "train_runtime": 26912.3879, "train_tokens_per_second": 115329.229 }, { "epoch": 0.3208961281419952, "grad_norm": 0.18976053595542908, "learning_rate": 0.004061360280090129, "loss": 3.166844940185547, "num_input_tokens_seen": 3109027840, "step": 5930, "train_runtime": 26957.5834, "train_tokens_per_second": 115330.361 }, { "epoch": 0.3214372683243594, "grad_norm": 0.16867531836032867, "learning_rate": 0.00405816367173522, "loss": 3.1626731872558596, "num_input_tokens_seen": 3114270720, "step": 5940, "train_runtime": 27002.7901, "train_tokens_per_second": 115331.442 }, { "epoch": 0.32197840850672366, "grad_norm": 0.20071354508399963, "learning_rate": 0.004054963069616803, "loss": 3.169915199279785, "num_input_tokens_seen": 3119513600, "step": 5950, "train_runtime": 27047.9883, "train_tokens_per_second": 115332.555 }, { "epoch": 0.3225195486890879, "grad_norm": 0.16498495638370514, "learning_rate": 0.0040517584835061664, "loss": 3.1712413787841798, "num_input_tokens_seen": 3124756480, "step": 5960, "train_runtime": 27093.2042, "train_tokens_per_second": 115333.589 }, { "epoch": 0.3230606888714522, "grad_norm": 0.17592206597328186, "learning_rate": 0.004048549923186767, "loss": 3.1624687194824217, "num_input_tokens_seen": 3129999360, "step": 5970, "train_runtime": 27138.4223, "train_tokens_per_second": 115334.61 }, { "epoch": 0.3236018290538164, "grad_norm": 0.15470415353775024, "learning_rate": 0.00404533739845419, "loss": 3.155242347717285, "num_input_tokens_seen": 3135242240, "step": 5980, "train_runtime": 27183.6528, "train_tokens_per_second": 115335.575 }, { "epoch": 0.32414296923618063, "grad_norm": 0.16501109302043915, "learning_rate": 0.004042120919116126, "loss": 3.1598865509033205, "num_input_tokens_seen": 3140485120, "step": 5990, "train_runtime": 27228.8867, "train_tokens_per_second": 115336.523 }, { "epoch": 0.32468410941854486, "grad_norm": 0.16781945526599884, "learning_rate": 0.004038900494992339, "loss": 3.157525634765625, "num_input_tokens_seen": 3145728000, "step": 6000, "train_runtime": 27274.108, "train_tokens_per_second": 115337.521 }, { "epoch": 0.32468410941854486, "eval_loss": 3.111185073852539, "eval_runtime": 1.9872, "eval_samples_per_second": 251.614, "eval_steps_per_second": 4.026, "num_input_tokens_seen": 3145728000, "step": 6000 }, { "epoch": 0.3252252496009091, "grad_norm": 0.18414868414402008, "learning_rate": 0.004035676135914636, "loss": 3.170181655883789, "num_input_tokens_seen": 3150970880, "step": 6010, "train_runtime": 27323.9049, "train_tokens_per_second": 115319.201 }, { "epoch": 0.3257663897832734, "grad_norm": 0.1616990864276886, "learning_rate": 0.004032447851726835, "loss": 3.1585414886474608, "num_input_tokens_seen": 3156213760, "step": 6020, "train_runtime": 27369.1149, "train_tokens_per_second": 115320.272 }, { "epoch": 0.3263075299656376, "grad_norm": 0.16582000255584717, "learning_rate": 0.004029215652284741, "loss": 3.1622276306152344, "num_input_tokens_seen": 3161456640, "step": 6030, "train_runtime": 27414.3296, "train_tokens_per_second": 115321.319 }, { "epoch": 0.32684867014800184, "grad_norm": 0.17380478978157043, "learning_rate": 0.00402597954745611, "loss": 3.1608341217041014, "num_input_tokens_seen": 3166699520, "step": 6040, "train_runtime": 27459.5638, "train_tokens_per_second": 115322.281 }, { "epoch": 0.32738981033036607, "grad_norm": 0.18764927983283997, "learning_rate": 0.00402273954712062, "loss": 3.1758914947509767, "num_input_tokens_seen": 3171942400, "step": 6050, "train_runtime": 27504.7658, "train_tokens_per_second": 115323.374 }, { "epoch": 0.3279309505127303, "grad_norm": 0.1659294068813324, "learning_rate": 0.004019495661169844, "loss": 3.1681026458740233, "num_input_tokens_seen": 3177185280, "step": 6060, "train_runtime": 27549.978, "train_tokens_per_second": 115324.422 }, { "epoch": 0.3284720906950946, "grad_norm": 0.15407103300094604, "learning_rate": 0.004016247899507217, "loss": 3.1617177963256835, "num_input_tokens_seen": 3182428160, "step": 6070, "train_runtime": 27595.2039, "train_tokens_per_second": 115325.409 }, { "epoch": 0.3290132308774588, "grad_norm": 0.17896398901939392, "learning_rate": 0.004012996272048004, "loss": 3.163351631164551, "num_input_tokens_seen": 3187671040, "step": 6080, "train_runtime": 27640.4032, "train_tokens_per_second": 115326.503 }, { "epoch": 0.32955437105982305, "grad_norm": 0.17344997823238373, "learning_rate": 0.004009740788719276, "loss": 3.153501510620117, "num_input_tokens_seen": 3192913920, "step": 6090, "train_runtime": 27685.6168, "train_tokens_per_second": 115327.534 }, { "epoch": 0.3300955112421873, "grad_norm": 0.16279980540275574, "learning_rate": 0.004006481459459872, "loss": 3.160162162780762, "num_input_tokens_seen": 3198156800, "step": 6100, "train_runtime": 27730.837, "train_tokens_per_second": 115328.535 }, { "epoch": 0.3306366514245515, "grad_norm": 0.16896025836467743, "learning_rate": 0.0040032182942203775, "loss": 3.158255767822266, "num_input_tokens_seen": 3203399680, "step": 6110, "train_runtime": 27779.614, "train_tokens_per_second": 115314.766 }, { "epoch": 0.3311777916069158, "grad_norm": 0.18168415129184723, "learning_rate": 0.003999951302963083, "loss": 3.156180000305176, "num_input_tokens_seen": 3208642560, "step": 6120, "train_runtime": 27824.7398, "train_tokens_per_second": 115316.175 }, { "epoch": 0.33171893178928, "grad_norm": 0.17479564249515533, "learning_rate": 0.003996680495661963, "loss": 3.155413818359375, "num_input_tokens_seen": 3213885440, "step": 6130, "train_runtime": 27869.8597, "train_tokens_per_second": 115317.604 }, { "epoch": 0.33226007197164426, "grad_norm": 0.16649708151817322, "learning_rate": 0.003993405882302642, "loss": 3.162016677856445, "num_input_tokens_seen": 3219128320, "step": 6140, "train_runtime": 27914.9889, "train_tokens_per_second": 115318.99 }, { "epoch": 0.3328012121540085, "grad_norm": 0.17866738140583038, "learning_rate": 0.003990127472882364, "loss": 3.1546072006225585, "num_input_tokens_seen": 3224371200, "step": 6150, "train_runtime": 27960.113, "train_tokens_per_second": 115320.392 }, { "epoch": 0.3333423523363727, "grad_norm": 0.15289874374866486, "learning_rate": 0.0039868452774099615, "loss": 3.1471332550048827, "num_input_tokens_seen": 3229614080, "step": 6160, "train_runtime": 28005.2392, "train_tokens_per_second": 115321.782 }, { "epoch": 0.333883492518737, "grad_norm": 0.16488930583000183, "learning_rate": 0.003983559305905828, "loss": 3.1540958404541017, "num_input_tokens_seen": 3234856960, "step": 6170, "train_runtime": 28050.3655, "train_tokens_per_second": 115323.166 }, { "epoch": 0.33442463270110123, "grad_norm": 0.17347821593284607, "learning_rate": 0.003980269568401881, "loss": 3.153203010559082, "num_input_tokens_seen": 3240099840, "step": 6180, "train_runtime": 28095.5018, "train_tokens_per_second": 115324.505 }, { "epoch": 0.33496577288346546, "grad_norm": 0.16901230812072754, "learning_rate": 0.00397697607494154, "loss": 3.153574752807617, "num_input_tokens_seen": 3245342720, "step": 6190, "train_runtime": 28140.6287, "train_tokens_per_second": 115325.878 }, { "epoch": 0.3355069130658297, "grad_norm": 0.1725231409072876, "learning_rate": 0.0039736788355796875, "loss": 3.1607025146484373, "num_input_tokens_seen": 3250585600, "step": 6200, "train_runtime": 28185.7568, "train_tokens_per_second": 115327.242 }, { "epoch": 0.3360480532481939, "grad_norm": 0.17325183749198914, "learning_rate": 0.003970377860382644, "loss": 3.147405242919922, "num_input_tokens_seen": 3255828480, "step": 6210, "train_runtime": 28230.8843, "train_tokens_per_second": 115328.604 }, { "epoch": 0.3365891934305582, "grad_norm": 0.1715451180934906, "learning_rate": 0.003967073159428135, "loss": 3.150386428833008, "num_input_tokens_seen": 3261071360, "step": 6220, "train_runtime": 28276.018, "train_tokens_per_second": 115329.936 }, { "epoch": 0.33713033361292244, "grad_norm": 0.16657474637031555, "learning_rate": 0.003963764742805262, "loss": 3.1559564590454103, "num_input_tokens_seen": 3266314240, "step": 6230, "train_runtime": 28321.1527, "train_tokens_per_second": 115331.261 }, { "epoch": 0.33767147379528667, "grad_norm": 0.17756827175617218, "learning_rate": 0.003960452620614465, "loss": 3.1532052993774413, "num_input_tokens_seen": 3271557120, "step": 6240, "train_runtime": 28366.2774, "train_tokens_per_second": 115332.621 }, { "epoch": 0.3382126139776509, "grad_norm": 0.16704502701759338, "learning_rate": 0.003957136802967503, "loss": 3.145302581787109, "num_input_tokens_seen": 3276800000, "step": 6250, "train_runtime": 28411.4119, "train_tokens_per_second": 115333.937 }, { "epoch": 0.33875375416001513, "grad_norm": 0.16609609127044678, "learning_rate": 0.003953817299987416, "loss": 3.157614898681641, "num_input_tokens_seen": 3282042880, "step": 6260, "train_runtime": 28456.5404, "train_tokens_per_second": 115335.274 }, { "epoch": 0.3392948943423794, "grad_norm": 0.17776867747306824, "learning_rate": 0.003950494121808493, "loss": 3.1511157989501952, "num_input_tokens_seen": 3287285760, "step": 6270, "train_runtime": 28501.6688, "train_tokens_per_second": 115336.607 }, { "epoch": 0.33983603452474365, "grad_norm": 0.16619160771369934, "learning_rate": 0.003947167278576242, "loss": 3.1576236724853515, "num_input_tokens_seen": 3292528640, "step": 6280, "train_runtime": 28546.8015, "train_tokens_per_second": 115337.917 }, { "epoch": 0.3403771747071079, "grad_norm": 0.17923958599567413, "learning_rate": 0.003943836780447365, "loss": 3.1528648376464843, "num_input_tokens_seen": 3297771520, "step": 6290, "train_runtime": 28591.9231, "train_tokens_per_second": 115339.269 }, { "epoch": 0.3409183148894721, "grad_norm": 0.16474676132202148, "learning_rate": 0.003940502637589718, "loss": 3.1509103775024414, "num_input_tokens_seen": 3303014400, "step": 6300, "train_runtime": 28637.0641, "train_tokens_per_second": 115340.539 }, { "epoch": 0.34145945507183634, "grad_norm": 0.1639336794614792, "learning_rate": 0.0039371648601822865, "loss": 3.155986785888672, "num_input_tokens_seen": 3308257280, "step": 6310, "train_runtime": 28682.1884, "train_tokens_per_second": 115341.871 }, { "epoch": 0.3420005952542006, "grad_norm": 0.17124713957309723, "learning_rate": 0.003933823458415151, "loss": 3.147997283935547, "num_input_tokens_seen": 3313500160, "step": 6320, "train_runtime": 28727.3095, "train_tokens_per_second": 115343.212 }, { "epoch": 0.34254173543656485, "grad_norm": 0.17230060696601868, "learning_rate": 0.003930478442489458, "loss": 3.1527957916259766, "num_input_tokens_seen": 3318743040, "step": 6330, "train_runtime": 28772.44, "train_tokens_per_second": 115344.512 }, { "epoch": 0.3430828756189291, "grad_norm": 0.1681806594133377, "learning_rate": 0.003927129822617386, "loss": 3.1512054443359374, "num_input_tokens_seen": 3323985920, "step": 6340, "train_runtime": 28817.6293, "train_tokens_per_second": 115345.572 }, { "epoch": 0.3436240158012933, "grad_norm": 0.17435091733932495, "learning_rate": 0.003923777609022119, "loss": 3.153603744506836, "num_input_tokens_seen": 3329228800, "step": 6350, "train_runtime": 28862.8169, "train_tokens_per_second": 115346.635 }, { "epoch": 0.34416515598365754, "grad_norm": 0.1703469306230545, "learning_rate": 0.00392042181193781, "loss": 3.142818069458008, "num_input_tokens_seen": 3334471680, "step": 6360, "train_runtime": 28907.9957, "train_tokens_per_second": 115347.73 }, { "epoch": 0.34470629616602183, "grad_norm": 0.1682499647140503, "learning_rate": 0.0039170624416095525, "loss": 3.1417423248291017, "num_input_tokens_seen": 3339714560, "step": 6370, "train_runtime": 28953.1644, "train_tokens_per_second": 115348.862 }, { "epoch": 0.34524743634838606, "grad_norm": 0.16802842915058136, "learning_rate": 0.0039136995082933515, "loss": 3.1456912994384765, "num_input_tokens_seen": 3344957440, "step": 6380, "train_runtime": 28998.3264, "train_tokens_per_second": 115350.017 }, { "epoch": 0.3457885765307503, "grad_norm": 0.1582358479499817, "learning_rate": 0.003910333022256086, "loss": 3.1438793182373046, "num_input_tokens_seen": 3350200320, "step": 6390, "train_runtime": 29043.4985, "train_tokens_per_second": 115351.128 }, { "epoch": 0.3463297167131145, "grad_norm": 0.16883233189582825, "learning_rate": 0.003906962993775483, "loss": 3.1468482971191407, "num_input_tokens_seen": 3355443200, "step": 6400, "train_runtime": 29088.66, "train_tokens_per_second": 115352.278 }, { "epoch": 0.34687085689547875, "grad_norm": 0.18867318332195282, "learning_rate": 0.0039035894331400853, "loss": 3.147420883178711, "num_input_tokens_seen": 3360686080, "step": 6410, "train_runtime": 29133.8253, "train_tokens_per_second": 115353.409 }, { "epoch": 0.34741199707784304, "grad_norm": 0.16323506832122803, "learning_rate": 0.0039002123506492177, "loss": 3.145482063293457, "num_input_tokens_seen": 3365928960, "step": 6420, "train_runtime": 29179.0336, "train_tokens_per_second": 115354.367 }, { "epoch": 0.34795313726020727, "grad_norm": 0.1756802797317505, "learning_rate": 0.003896831756612958, "loss": 3.1475906372070312, "num_input_tokens_seen": 3371171840, "step": 6430, "train_runtime": 29224.2308, "train_tokens_per_second": 115355.366 }, { "epoch": 0.3484942774425715, "grad_norm": 0.17158783972263336, "learning_rate": 0.0038934476613521037, "loss": 3.142435073852539, "num_input_tokens_seen": 3376414720, "step": 6440, "train_runtime": 29269.4011, "train_tokens_per_second": 115356.467 }, { "epoch": 0.34903541762493573, "grad_norm": 0.16574952006340027, "learning_rate": 0.0038900600751981436, "loss": 3.1459327697753907, "num_input_tokens_seen": 3381657600, "step": 6450, "train_runtime": 29314.5687, "train_tokens_per_second": 115357.577 }, { "epoch": 0.34957655780729996, "grad_norm": 0.16016115248203278, "learning_rate": 0.0038866690084932206, "loss": 3.1540714263916017, "num_input_tokens_seen": 3386900480, "step": 6460, "train_runtime": 29359.7508, "train_tokens_per_second": 115358.625 }, { "epoch": 0.35011769798966424, "grad_norm": 0.1590614914894104, "learning_rate": 0.0038832744715901063, "loss": 3.138327789306641, "num_input_tokens_seen": 3392143360, "step": 6470, "train_runtime": 29404.9917, "train_tokens_per_second": 115359.439 }, { "epoch": 0.3506588381720285, "grad_norm": 0.1668478101491928, "learning_rate": 0.003879876474852164, "loss": 3.1390443801879884, "num_input_tokens_seen": 3397386240, "step": 6480, "train_runtime": 29450.2102, "train_tokens_per_second": 115360.339 }, { "epoch": 0.3511999783543927, "grad_norm": 0.16614961624145508, "learning_rate": 0.0038764750286533244, "loss": 3.1493562698364257, "num_input_tokens_seen": 3402629120, "step": 6490, "train_runtime": 29498.9151, "train_tokens_per_second": 115347.602 }, { "epoch": 0.35174111853675694, "grad_norm": 0.1770559698343277, "learning_rate": 0.003873070143378044, "loss": 3.1434371948242186, "num_input_tokens_seen": 3407872000, "step": 6500, "train_runtime": 29544.0364, "train_tokens_per_second": 115348.896 }, { "epoch": 0.35174111853675694, "eval_loss": 3.0966169834136963, "eval_runtime": 1.9851, "eval_samples_per_second": 251.881, "eval_steps_per_second": 4.03, "num_input_tokens_seen": 3407872000, "step": 6500 }, { "epoch": 0.35228225871912117, "grad_norm": 0.1724107414484024, "learning_rate": 0.0038696618294212816, "loss": 3.1477359771728515, "num_input_tokens_seen": 3413114880, "step": 6510, "train_runtime": 29591.1684, "train_tokens_per_second": 115342.349 }, { "epoch": 0.35282339890148545, "grad_norm": 0.17597156763076782, "learning_rate": 0.0038662500971884633, "loss": 3.1492542266845702, "num_input_tokens_seen": 3418357760, "step": 6520, "train_runtime": 29636.3254, "train_tokens_per_second": 115343.509 }, { "epoch": 0.3533645390838497, "grad_norm": 0.1612919569015503, "learning_rate": 0.0038628349570954497, "loss": 3.1426467895507812, "num_input_tokens_seen": 3423600640, "step": 6530, "train_runtime": 29681.4655, "train_tokens_per_second": 115344.73 }, { "epoch": 0.3539056792662139, "grad_norm": 0.16101430356502533, "learning_rate": 0.0038594164195685076, "loss": 3.137646484375, "num_input_tokens_seen": 3428843520, "step": 6540, "train_runtime": 29726.6035, "train_tokens_per_second": 115345.957 }, { "epoch": 0.35444681944857814, "grad_norm": 0.17293353378772736, "learning_rate": 0.003855994495044273, "loss": 3.1470672607421877, "num_input_tokens_seen": 3434086400, "step": 6550, "train_runtime": 29771.7425, "train_tokens_per_second": 115347.175 }, { "epoch": 0.3549879596309424, "grad_norm": 0.18171222507953644, "learning_rate": 0.0038525691939697267, "loss": 3.1423971176147463, "num_input_tokens_seen": 3439329280, "step": 6560, "train_runtime": 29816.873, "train_tokens_per_second": 115348.423 }, { "epoch": 0.35552909981330666, "grad_norm": 0.17078061401844025, "learning_rate": 0.0038491405268021523, "loss": 3.1396827697753906, "num_input_tokens_seen": 3444572160, "step": 6570, "train_runtime": 29862.0878, "train_tokens_per_second": 115349.341 }, { "epoch": 0.3560702399956709, "grad_norm": 0.17867809534072876, "learning_rate": 0.0038457085040091155, "loss": 3.1499147415161133, "num_input_tokens_seen": 3449815040, "step": 6580, "train_runtime": 29907.2427, "train_tokens_per_second": 115350.488 }, { "epoch": 0.3566113801780351, "grad_norm": 0.15178236365318298, "learning_rate": 0.003842273136068423, "loss": 3.13470344543457, "num_input_tokens_seen": 3455057920, "step": 6590, "train_runtime": 29952.42, "train_tokens_per_second": 115351.545 }, { "epoch": 0.35715252036039935, "grad_norm": 0.17382913827896118, "learning_rate": 0.0038388344334680936, "loss": 3.1436153411865235, "num_input_tokens_seen": 3460300800, "step": 6600, "train_runtime": 29997.6461, "train_tokens_per_second": 115352.411 }, { "epoch": 0.3576936605427636, "grad_norm": 0.17544035613536835, "learning_rate": 0.0038353924067063313, "loss": 3.1381744384765624, "num_input_tokens_seen": 3465543680, "step": 6610, "train_runtime": 30042.8233, "train_tokens_per_second": 115353.462 }, { "epoch": 0.35823480072512787, "grad_norm": 0.15095841884613037, "learning_rate": 0.003831947066291482, "loss": 3.1344669342041014, "num_input_tokens_seen": 3470786560, "step": 6620, "train_runtime": 30088.0009, "train_tokens_per_second": 115354.509 }, { "epoch": 0.3587759409074921, "grad_norm": 0.16399560868740082, "learning_rate": 0.0038284984227420146, "loss": 3.134235382080078, "num_input_tokens_seen": 3476029440, "step": 6630, "train_runtime": 30133.1894, "train_tokens_per_second": 115355.51 }, { "epoch": 0.3593170810898563, "grad_norm": 0.18398840725421906, "learning_rate": 0.003825046486586477, "loss": 3.131580924987793, "num_input_tokens_seen": 3481272320, "step": 6640, "train_runtime": 30178.3732, "train_tokens_per_second": 115356.527 }, { "epoch": 0.35985822127222056, "grad_norm": 0.16813096404075623, "learning_rate": 0.0038215912683634726, "loss": 3.1448497772216797, "num_input_tokens_seen": 3486515200, "step": 6650, "train_runtime": 30223.5423, "train_tokens_per_second": 115357.596 }, { "epoch": 0.3603993614545848, "grad_norm": 0.1649860441684723, "learning_rate": 0.003818132778621623, "loss": 3.14077091217041, "num_input_tokens_seen": 3491758080, "step": 6660, "train_runtime": 30268.7194, "train_tokens_per_second": 115358.633 }, { "epoch": 0.3609405016369491, "grad_norm": 0.17575252056121826, "learning_rate": 0.0038146710279195386, "loss": 3.1330080032348633, "num_input_tokens_seen": 3497000960, "step": 6670, "train_runtime": 30313.9788, "train_tokens_per_second": 115359.352 }, { "epoch": 0.3614816418193133, "grad_norm": 0.1742008924484253, "learning_rate": 0.003811206026825786, "loss": 3.155079460144043, "num_input_tokens_seen": 3502243840, "step": 6680, "train_runtime": 30359.1553, "train_tokens_per_second": 115360.385 }, { "epoch": 0.36202278200167753, "grad_norm": 0.1799112856388092, "learning_rate": 0.0038077377859188524, "loss": 3.1288970947265624, "num_input_tokens_seen": 3507486720, "step": 6690, "train_runtime": 30404.3262, "train_tokens_per_second": 115361.436 }, { "epoch": 0.36256392218404176, "grad_norm": 0.16728277504444122, "learning_rate": 0.003804266315787119, "loss": 3.137259864807129, "num_input_tokens_seen": 3512729600, "step": 6700, "train_runtime": 30449.5017, "train_tokens_per_second": 115362.466 }, { "epoch": 0.363105062366406, "grad_norm": 0.1766940951347351, "learning_rate": 0.0038007916270288234, "loss": 3.1414379119873046, "num_input_tokens_seen": 3517972480, "step": 6710, "train_runtime": 30494.6728, "train_tokens_per_second": 115363.51 }, { "epoch": 0.3636462025487703, "grad_norm": 0.17950496077537537, "learning_rate": 0.0037973137302520312, "loss": 3.141128730773926, "num_input_tokens_seen": 3523215360, "step": 6720, "train_runtime": 30539.8417, "train_tokens_per_second": 115364.559 }, { "epoch": 0.3641873427311345, "grad_norm": 0.17668098211288452, "learning_rate": 0.003793832636074601, "loss": 3.1354911804199217, "num_input_tokens_seen": 3528458240, "step": 6730, "train_runtime": 30585.0013, "train_tokens_per_second": 115365.64 }, { "epoch": 0.36472848291349874, "grad_norm": 0.17323218286037445, "learning_rate": 0.0037903483551241534, "loss": 3.1416683197021484, "num_input_tokens_seen": 3533701120, "step": 6740, "train_runtime": 30630.1549, "train_tokens_per_second": 115366.74 }, { "epoch": 0.36526962309586297, "grad_norm": 0.1715293824672699, "learning_rate": 0.003786860898038038, "loss": 3.133253288269043, "num_input_tokens_seen": 3538944000, "step": 6750, "train_runtime": 30675.3114, "train_tokens_per_second": 115367.826 }, { "epoch": 0.3658107632782272, "grad_norm": 0.16131816804409027, "learning_rate": 0.0037833702754633005, "loss": 3.137991714477539, "num_input_tokens_seen": 3544186880, "step": 6760, "train_runtime": 30720.4583, "train_tokens_per_second": 115368.945 }, { "epoch": 0.3663519034605915, "grad_norm": 0.16405366361141205, "learning_rate": 0.003779876498056652, "loss": 3.149972152709961, "num_input_tokens_seen": 3549429760, "step": 6770, "train_runtime": 30765.5763, "train_tokens_per_second": 115370.17 }, { "epoch": 0.3668930436429557, "grad_norm": 0.1677146553993225, "learning_rate": 0.0037763795764844317, "loss": 3.1432748794555665, "num_input_tokens_seen": 3554672640, "step": 6780, "train_runtime": 30810.7138, "train_tokens_per_second": 115371.317 }, { "epoch": 0.36743418382531995, "grad_norm": 0.1701316237449646, "learning_rate": 0.003772879521422583, "loss": 3.138026809692383, "num_input_tokens_seen": 3559915520, "step": 6790, "train_runtime": 30855.8357, "train_tokens_per_second": 115372.52 }, { "epoch": 0.3679753240076842, "grad_norm": 0.1724764108657837, "learning_rate": 0.0037693763435566125, "loss": 3.1394069671630858, "num_input_tokens_seen": 3565158400, "step": 6800, "train_runtime": 30900.9517, "train_tokens_per_second": 115373.741 }, { "epoch": 0.3685164641900484, "grad_norm": 0.16157887876033783, "learning_rate": 0.00376587005358156, "loss": 3.124007797241211, "num_input_tokens_seen": 3570401280, "step": 6810, "train_runtime": 30946.0772, "train_tokens_per_second": 115374.923 }, { "epoch": 0.3690576043724127, "grad_norm": 0.16729003190994263, "learning_rate": 0.0037623606622019675, "loss": 3.122986602783203, "num_input_tokens_seen": 3575644160, "step": 6820, "train_runtime": 30991.3846, "train_tokens_per_second": 115375.425 }, { "epoch": 0.3695987445547769, "grad_norm": 0.17239217460155487, "learning_rate": 0.003758848180131846, "loss": 3.1259433746337892, "num_input_tokens_seen": 3580887040, "step": 6830, "train_runtime": 31036.5265, "train_tokens_per_second": 115376.54 }, { "epoch": 0.37013988473714116, "grad_norm": 0.1540314108133316, "learning_rate": 0.003755332618094642, "loss": 3.128913688659668, "num_input_tokens_seen": 3586129920, "step": 6840, "train_runtime": 31081.6974, "train_tokens_per_second": 115377.544 }, { "epoch": 0.3706810249195054, "grad_norm": 0.16670770943164825, "learning_rate": 0.0037518139868232036, "loss": 3.1437910079956053, "num_input_tokens_seen": 3591372800, "step": 6850, "train_runtime": 31126.8444, "train_tokens_per_second": 115378.634 }, { "epoch": 0.3712221651018696, "grad_norm": 0.16100816428661346, "learning_rate": 0.0037482922970597512, "loss": 3.1303838729858398, "num_input_tokens_seen": 3596615680, "step": 6860, "train_runtime": 31172.0038, "train_tokens_per_second": 115379.675 }, { "epoch": 0.3717633052842339, "grad_norm": 0.1720798909664154, "learning_rate": 0.0037447675595558417, "loss": 3.139808464050293, "num_input_tokens_seen": 3601858560, "step": 6870, "train_runtime": 31220.5874, "train_tokens_per_second": 115368.059 }, { "epoch": 0.37230444546659813, "grad_norm": 0.15832237899303436, "learning_rate": 0.0037412397850723356, "loss": 3.1387088775634764, "num_input_tokens_seen": 3607101440, "step": 6880, "train_runtime": 31265.7548, "train_tokens_per_second": 115369.082 }, { "epoch": 0.37284558564896236, "grad_norm": 0.16572092473506927, "learning_rate": 0.0037377089843793664, "loss": 3.136234092712402, "num_input_tokens_seen": 3612344320, "step": 6890, "train_runtime": 31310.8828, "train_tokens_per_second": 115370.248 }, { "epoch": 0.3733867258313266, "grad_norm": 0.16967612504959106, "learning_rate": 0.0037341751682563075, "loss": 3.1306957244873046, "num_input_tokens_seen": 3617587200, "step": 6900, "train_runtime": 31356.0169, "train_tokens_per_second": 115371.388 }, { "epoch": 0.3739278660136908, "grad_norm": 0.16561359167099, "learning_rate": 0.0037306383474917356, "loss": 3.128021240234375, "num_input_tokens_seen": 3622830080, "step": 6910, "train_runtime": 31401.1695, "train_tokens_per_second": 115372.457 }, { "epoch": 0.3744690061960551, "grad_norm": 0.16602273285388947, "learning_rate": 0.0037270985328834013, "loss": 3.125231170654297, "num_input_tokens_seen": 3628072960, "step": 6920, "train_runtime": 31446.3403, "train_tokens_per_second": 115373.456 }, { "epoch": 0.37501014637841934, "grad_norm": 0.15461350977420807, "learning_rate": 0.0037235557352381975, "loss": 3.1283363342285155, "num_input_tokens_seen": 3633315840, "step": 6930, "train_runtime": 31491.4936, "train_tokens_per_second": 115374.516 }, { "epoch": 0.37555128656078357, "grad_norm": 0.17157427966594696, "learning_rate": 0.003720009965372121, "loss": 3.136751174926758, "num_input_tokens_seen": 3638558720, "step": 6940, "train_runtime": 31536.6325, "train_tokens_per_second": 115375.626 }, { "epoch": 0.3760924267431478, "grad_norm": 0.15815427899360657, "learning_rate": 0.0037164612341102445, "loss": 3.1335182189941406, "num_input_tokens_seen": 3643801600, "step": 6950, "train_runtime": 31581.7854, "train_tokens_per_second": 115376.682 }, { "epoch": 0.37663356692551203, "grad_norm": 0.16368745267391205, "learning_rate": 0.003712909552286681, "loss": 3.1299674987792967, "num_input_tokens_seen": 3649044480, "step": 6960, "train_runtime": 31626.953, "train_tokens_per_second": 115377.681 }, { "epoch": 0.3771747071078763, "grad_norm": 0.17233121395111084, "learning_rate": 0.003709354930744553, "loss": 3.1409616470336914, "num_input_tokens_seen": 3654287360, "step": 6970, "train_runtime": 31672.1101, "train_tokens_per_second": 115378.715 }, { "epoch": 0.37771584729024055, "grad_norm": 0.1784183382987976, "learning_rate": 0.0037057973803359553, "loss": 3.1445953369140627, "num_input_tokens_seen": 3659530240, "step": 6980, "train_runtime": 31717.2675, "train_tokens_per_second": 115379.745 }, { "epoch": 0.3782569874726048, "grad_norm": 0.1589273363351822, "learning_rate": 0.003702236911921925, "loss": 3.1336727142333984, "num_input_tokens_seen": 3664773120, "step": 6990, "train_runtime": 31762.4428, "train_tokens_per_second": 115380.707 }, { "epoch": 0.378798127654969, "grad_norm": 0.16604717075824738, "learning_rate": 0.00369867353637241, "loss": 3.125100326538086, "num_input_tokens_seen": 3670016000, "step": 7000, "train_runtime": 31807.6091, "train_tokens_per_second": 115381.7 }, { "epoch": 0.378798127654969, "eval_loss": 3.082562208175659, "eval_runtime": 1.983, "eval_samples_per_second": 252.143, "eval_steps_per_second": 4.034, "num_input_tokens_seen": 3670016000, "step": 7000 }, { "epoch": 0.37933926783733324, "grad_norm": 0.16016067564487457, "learning_rate": 0.003695107264566231, "loss": 3.132742691040039, "num_input_tokens_seen": 3675258880, "step": 7010, "train_runtime": 31857.0893, "train_tokens_per_second": 115367.065 }, { "epoch": 0.3798804080196975, "grad_norm": 0.17284226417541504, "learning_rate": 0.003691538107391052, "loss": 3.1309505462646485, "num_input_tokens_seen": 3680501760, "step": 7020, "train_runtime": 31902.2704, "train_tokens_per_second": 115368.02 }, { "epoch": 0.38042154820206175, "grad_norm": 0.16180108487606049, "learning_rate": 0.0036879660757433465, "loss": 3.1276824951171873, "num_input_tokens_seen": 3685744640, "step": 7030, "train_runtime": 31947.4422, "train_tokens_per_second": 115369.006 }, { "epoch": 0.380962688384426, "grad_norm": 0.16350635886192322, "learning_rate": 0.0036843911805283613, "loss": 3.127395248413086, "num_input_tokens_seen": 3690987520, "step": 7040, "train_runtime": 31992.5853, "train_tokens_per_second": 115370.092 }, { "epoch": 0.3815038285667902, "grad_norm": 0.15854142606258392, "learning_rate": 0.0036808134326600872, "loss": 3.1203243255615236, "num_input_tokens_seen": 3696230400, "step": 7050, "train_runtime": 32037.7375, "train_tokens_per_second": 115371.143 }, { "epoch": 0.38204496874915445, "grad_norm": 0.1765364557504654, "learning_rate": 0.0036772328430612245, "loss": 3.1236772537231445, "num_input_tokens_seen": 3701473280, "step": 7060, "train_runtime": 32082.8987, "train_tokens_per_second": 115372.159 }, { "epoch": 0.38258610893151873, "grad_norm": 0.16590341925621033, "learning_rate": 0.0036736494226631486, "loss": 3.1179275512695312, "num_input_tokens_seen": 3706716160, "step": 7070, "train_runtime": 32128.0461, "train_tokens_per_second": 115373.221 }, { "epoch": 0.38312724911388296, "grad_norm": 0.1656789630651474, "learning_rate": 0.0036700631824058763, "loss": 3.1220640182495116, "num_input_tokens_seen": 3711959040, "step": 7080, "train_runtime": 32173.2014, "train_tokens_per_second": 115374.252 }, { "epoch": 0.3836683892962472, "grad_norm": 0.18290071189403534, "learning_rate": 0.003666474133238036, "loss": 3.130259704589844, "num_input_tokens_seen": 3717201920, "step": 7090, "train_runtime": 32218.3695, "train_tokens_per_second": 115375.234 }, { "epoch": 0.3842095294786114, "grad_norm": 0.1678554117679596, "learning_rate": 0.003662882286116827, "loss": 3.128999137878418, "num_input_tokens_seen": 3722444800, "step": 7100, "train_runtime": 32263.5278, "train_tokens_per_second": 115376.248 }, { "epoch": 0.38475066966097565, "grad_norm": 0.16328170895576477, "learning_rate": 0.0036592876520079956, "loss": 3.1096935272216797, "num_input_tokens_seen": 3727687680, "step": 7110, "train_runtime": 32308.6892, "train_tokens_per_second": 115377.249 }, { "epoch": 0.38529180984333994, "grad_norm": 0.16377384960651398, "learning_rate": 0.0036556902418857927, "loss": 3.1283496856689452, "num_input_tokens_seen": 3732930560, "step": 7120, "train_runtime": 32353.8348, "train_tokens_per_second": 115378.303 }, { "epoch": 0.38583295002570417, "grad_norm": 0.17365527153015137, "learning_rate": 0.0036520900667329475, "loss": 3.1340274810791016, "num_input_tokens_seen": 3738173440, "step": 7130, "train_runtime": 32398.9948, "train_tokens_per_second": 115379.303 }, { "epoch": 0.3863740902080684, "grad_norm": 0.17289578914642334, "learning_rate": 0.003648487137540628, "loss": 3.126075553894043, "num_input_tokens_seen": 3743416320, "step": 7140, "train_runtime": 32444.1388, "train_tokens_per_second": 115380.357 }, { "epoch": 0.38691523039043263, "grad_norm": 0.1867065280675888, "learning_rate": 0.003644881465308411, "loss": 3.1279239654541016, "num_input_tokens_seen": 3748659200, "step": 7150, "train_runtime": 32489.3038, "train_tokens_per_second": 115381.334 }, { "epoch": 0.38745637057279686, "grad_norm": 0.16090157628059387, "learning_rate": 0.003641273061044249, "loss": 3.126418685913086, "num_input_tokens_seen": 3753902080, "step": 7160, "train_runtime": 32534.4706, "train_tokens_per_second": 115382.301 }, { "epoch": 0.38799751075516115, "grad_norm": 0.16933725774288177, "learning_rate": 0.003637661935764434, "loss": 3.1228607177734373, "num_input_tokens_seen": 3759144960, "step": 7170, "train_runtime": 32579.6304, "train_tokens_per_second": 115383.29 }, { "epoch": 0.3885386509375254, "grad_norm": 0.16463743150234222, "learning_rate": 0.003634048100493565, "loss": 3.1265775680541994, "num_input_tokens_seen": 3764387840, "step": 7180, "train_runtime": 32624.7971, "train_tokens_per_second": 115384.253 }, { "epoch": 0.3890797911198896, "grad_norm": 0.15814442932605743, "learning_rate": 0.003630431566264515, "loss": 3.126376724243164, "num_input_tokens_seen": 3769630720, "step": 7190, "train_runtime": 32669.9527, "train_tokens_per_second": 115385.252 }, { "epoch": 0.38962093130225384, "grad_norm": 0.16953812539577484, "learning_rate": 0.0036268123441183966, "loss": 3.1293899536132814, "num_input_tokens_seen": 3774873600, "step": 7200, "train_runtime": 32715.1316, "train_tokens_per_second": 115386.166 }, { "epoch": 0.39016207148461807, "grad_norm": 0.18077914416790009, "learning_rate": 0.003623190445104527, "loss": 3.130533218383789, "num_input_tokens_seen": 3780116480, "step": 7210, "train_runtime": 32760.3295, "train_tokens_per_second": 115387.01 }, { "epoch": 0.39070321166698235, "grad_norm": 0.17073588073253632, "learning_rate": 0.003619565880280401, "loss": 3.1266639709472654, "num_input_tokens_seen": 3785359360, "step": 7220, "train_runtime": 32805.4983, "train_tokens_per_second": 115387.955 }, { "epoch": 0.3912443518493466, "grad_norm": 0.16945651173591614, "learning_rate": 0.0036159386607116446, "loss": 3.1234695434570314, "num_input_tokens_seen": 3790602240, "step": 7230, "train_runtime": 32850.6502, "train_tokens_per_second": 115388.956 }, { "epoch": 0.3917854920317108, "grad_norm": 0.17761710286140442, "learning_rate": 0.0036123087974719937, "loss": 3.127792739868164, "num_input_tokens_seen": 3795845120, "step": 7240, "train_runtime": 32895.8256, "train_tokens_per_second": 115389.873 }, { "epoch": 0.39232663221407504, "grad_norm": 0.16878648102283478, "learning_rate": 0.0036086763016432545, "loss": 3.120273208618164, "num_input_tokens_seen": 3801088000, "step": 7250, "train_runtime": 32945.144, "train_tokens_per_second": 115376.275 }, { "epoch": 0.3928677723964393, "grad_norm": 0.15386980772018433, "learning_rate": 0.0036050411843152686, "loss": 3.1222068786621096, "num_input_tokens_seen": 3806330880, "step": 7260, "train_runtime": 32990.288, "train_tokens_per_second": 115377.316 }, { "epoch": 0.39340891257880356, "grad_norm": 0.16980594396591187, "learning_rate": 0.0036014034565858824, "loss": 3.1281028747558595, "num_input_tokens_seen": 3811573760, "step": 7270, "train_runtime": 33035.4429, "train_tokens_per_second": 115378.316 }, { "epoch": 0.3939500527611678, "grad_norm": 0.17536021769046783, "learning_rate": 0.003597763129560911, "loss": 3.1235652923583985, "num_input_tokens_seen": 3816816640, "step": 7280, "train_runtime": 33080.605, "train_tokens_per_second": 115379.288 }, { "epoch": 0.394491192943532, "grad_norm": 0.1680123209953308, "learning_rate": 0.0035941202143541053, "loss": 3.123764991760254, "num_input_tokens_seen": 3822059520, "step": 7290, "train_runtime": 33125.7503, "train_tokens_per_second": 115380.315 }, { "epoch": 0.39503233312589625, "grad_norm": 0.15840236842632294, "learning_rate": 0.003590474722087118, "loss": 3.124995803833008, "num_input_tokens_seen": 3827302400, "step": 7300, "train_runtime": 33170.9067, "train_tokens_per_second": 115381.302 }, { "epoch": 0.3955734733082605, "grad_norm": 0.1702660471200943, "learning_rate": 0.00358682666388947, "loss": 3.1230545043945312, "num_input_tokens_seen": 3832545280, "step": 7310, "train_runtime": 33216.0627, "train_tokens_per_second": 115382.287 }, { "epoch": 0.39611461349062477, "grad_norm": 0.14530692994594574, "learning_rate": 0.003583176050898514, "loss": 3.1195556640625, "num_input_tokens_seen": 3837788160, "step": 7320, "train_runtime": 33261.2169, "train_tokens_per_second": 115383.276 }, { "epoch": 0.396655753672989, "grad_norm": 0.16137973964214325, "learning_rate": 0.003579522894259404, "loss": 3.122934341430664, "num_input_tokens_seen": 3843031040, "step": 7330, "train_runtime": 33306.3711, "train_tokens_per_second": 115384.262 }, { "epoch": 0.39719689385535323, "grad_norm": 0.17957496643066406, "learning_rate": 0.0035758672051250597, "loss": 3.118304443359375, "num_input_tokens_seen": 3848273920, "step": 7340, "train_runtime": 33351.4951, "train_tokens_per_second": 115385.35 }, { "epoch": 0.39773803403771746, "grad_norm": 0.1619359254837036, "learning_rate": 0.003572208994656131, "loss": 3.126445007324219, "num_input_tokens_seen": 3853516800, "step": 7350, "train_runtime": 33396.6238, "train_tokens_per_second": 115386.418 }, { "epoch": 0.3982791742200817, "grad_norm": 0.17734915018081665, "learning_rate": 0.003568548274020967, "loss": 3.1167884826660157, "num_input_tokens_seen": 3858759680, "step": 7360, "train_runtime": 33441.7562, "train_tokens_per_second": 115387.471 }, { "epoch": 0.398820314402446, "grad_norm": 0.17586900293827057, "learning_rate": 0.0035648850543955773, "loss": 3.1228519439697267, "num_input_tokens_seen": 3864002560, "step": 7370, "train_runtime": 33486.9063, "train_tokens_per_second": 115388.46 }, { "epoch": 0.3993614545848102, "grad_norm": 0.17276950180530548, "learning_rate": 0.0035612193469636054, "loss": 3.1270915985107424, "num_input_tokens_seen": 3869245440, "step": 7380, "train_runtime": 33532.0567, "train_tokens_per_second": 115389.446 }, { "epoch": 0.39990259476717444, "grad_norm": 0.1578545719385147, "learning_rate": 0.0035575511629162876, "loss": 3.102129364013672, "num_input_tokens_seen": 3874488320, "step": 7390, "train_runtime": 33577.2022, "train_tokens_per_second": 115390.445 }, { "epoch": 0.40044373494953867, "grad_norm": 0.15498770773410797, "learning_rate": 0.0035538805134524183, "loss": 3.115239715576172, "num_input_tokens_seen": 3879731200, "step": 7400, "train_runtime": 33622.363, "train_tokens_per_second": 115391.39 }, { "epoch": 0.4009848751319029, "grad_norm": 0.15868115425109863, "learning_rate": 0.0035502074097783242, "loss": 3.1181896209716795, "num_input_tokens_seen": 3884974080, "step": 7410, "train_runtime": 33667.5163, "train_tokens_per_second": 115392.358 }, { "epoch": 0.4015260153142672, "grad_norm": 0.1605597585439682, "learning_rate": 0.0035465318631078204, "loss": 3.113156318664551, "num_input_tokens_seen": 3890216960, "step": 7420, "train_runtime": 33712.6623, "train_tokens_per_second": 115393.348 }, { "epoch": 0.4020671554966314, "grad_norm": 0.17280755937099457, "learning_rate": 0.003542853884662183, "loss": 3.1183053970336916, "num_input_tokens_seen": 3895459840, "step": 7430, "train_runtime": 33757.8255, "train_tokens_per_second": 115394.276 }, { "epoch": 0.40260829567899564, "grad_norm": 0.16187331080436707, "learning_rate": 0.0035391734856701092, "loss": 3.1163970947265627, "num_input_tokens_seen": 3900702720, "step": 7440, "train_runtime": 33802.989, "train_tokens_per_second": 115395.201 }, { "epoch": 0.4031494358613599, "grad_norm": 0.1724129021167755, "learning_rate": 0.0035354906773676894, "loss": 3.1170070648193358, "num_input_tokens_seen": 3905945600, "step": 7450, "train_runtime": 33848.1517, "train_tokens_per_second": 115396.127 }, { "epoch": 0.4036905760437241, "grad_norm": 0.17225228250026703, "learning_rate": 0.003531805470998366, "loss": 3.110821533203125, "num_input_tokens_seen": 3911188480, "step": 7460, "train_runtime": 33893.3266, "train_tokens_per_second": 115397.008 }, { "epoch": 0.4042317162260884, "grad_norm": 0.1592818796634674, "learning_rate": 0.0035281178778129073, "loss": 3.116873931884766, "num_input_tokens_seen": 3916431360, "step": 7470, "train_runtime": 33938.5013, "train_tokens_per_second": 115397.888 }, { "epoch": 0.4047728564084526, "grad_norm": 0.1658582091331482, "learning_rate": 0.0035244279090693633, "loss": 3.1268436431884767, "num_input_tokens_seen": 3921674240, "step": 7480, "train_runtime": 33983.671, "train_tokens_per_second": 115398.782 }, { "epoch": 0.40531399659081685, "grad_norm": 0.14836189150810242, "learning_rate": 0.00352073557603304, "loss": 3.114876556396484, "num_input_tokens_seen": 3926917120, "step": 7490, "train_runtime": 34028.8257, "train_tokens_per_second": 115399.725 }, { "epoch": 0.4058551367731811, "grad_norm": 0.16045086085796356, "learning_rate": 0.0035170408899764605, "loss": 3.1156852722167967, "num_input_tokens_seen": 3932160000, "step": 7500, "train_runtime": 34073.9726, "train_tokens_per_second": 115400.692 }, { "epoch": 0.4058551367731811, "eval_loss": 3.0682461261749268, "eval_runtime": 1.9852, "eval_samples_per_second": 251.858, "eval_steps_per_second": 4.03, "num_input_tokens_seen": 3932160000, "step": 7500 }, { "epoch": 0.4063962769555453, "grad_norm": 0.16971535980701447, "learning_rate": 0.0035133438621793296, "loss": 3.1024160385131836, "num_input_tokens_seen": 3937402880, "step": 7510, "train_runtime": 34121.1044, "train_tokens_per_second": 115394.943 }, { "epoch": 0.4069374171379096, "grad_norm": 0.16736076772212982, "learning_rate": 0.003509644503928506, "loss": 3.1206098556518556, "num_input_tokens_seen": 3942645760, "step": 7520, "train_runtime": 34166.2706, "train_tokens_per_second": 115395.848 }, { "epoch": 0.4074785573202738, "grad_norm": 0.16113705933094025, "learning_rate": 0.0035059428265179567, "loss": 3.117937469482422, "num_input_tokens_seen": 3947888640, "step": 7530, "train_runtime": 34211.4099, "train_tokens_per_second": 115396.841 }, { "epoch": 0.40801969750263806, "grad_norm": 0.17517107725143433, "learning_rate": 0.0035022388412487356, "loss": 3.1136932373046875, "num_input_tokens_seen": 3953131520, "step": 7540, "train_runtime": 34256.533, "train_tokens_per_second": 115397.887 }, { "epoch": 0.4085608376850023, "grad_norm": 0.18709343671798706, "learning_rate": 0.003498532559428938, "loss": 3.125676918029785, "num_input_tokens_seen": 3958374400, "step": 7550, "train_runtime": 34301.6505, "train_tokens_per_second": 115398.949 }, { "epoch": 0.4091019778673665, "grad_norm": 0.1633439064025879, "learning_rate": 0.0034948239923736713, "loss": 3.1128585815429686, "num_input_tokens_seen": 3963617280, "step": 7560, "train_runtime": 34346.7672, "train_tokens_per_second": 115400.01 }, { "epoch": 0.4096431180497308, "grad_norm": 0.16776174306869507, "learning_rate": 0.0034911131514050214, "loss": 3.114968681335449, "num_input_tokens_seen": 3968860160, "step": 7570, "train_runtime": 34391.881, "train_tokens_per_second": 115401.078 }, { "epoch": 0.41018425823209503, "grad_norm": 0.17015814781188965, "learning_rate": 0.0034874000478520148, "loss": 3.1098609924316407, "num_input_tokens_seen": 3974103040, "step": 7580, "train_runtime": 34437.0464, "train_tokens_per_second": 115401.971 }, { "epoch": 0.41072539841445926, "grad_norm": 0.1705334633588791, "learning_rate": 0.0034836846930505843, "loss": 3.1172601699829103, "num_input_tokens_seen": 3979345920, "step": 7590, "train_runtime": 34482.2174, "train_tokens_per_second": 115402.843 }, { "epoch": 0.4112665385968235, "grad_norm": 0.17283746600151062, "learning_rate": 0.0034799670983435395, "loss": 3.1093212127685548, "num_input_tokens_seen": 3984588800, "step": 7600, "train_runtime": 34527.3958, "train_tokens_per_second": 115403.688 }, { "epoch": 0.4118076787791877, "grad_norm": 0.1661679744720459, "learning_rate": 0.003476247275080524, "loss": 3.114109992980957, "num_input_tokens_seen": 3989831680, "step": 7610, "train_runtime": 34572.5667, "train_tokens_per_second": 115404.555 }, { "epoch": 0.412348818961552, "grad_norm": 0.16221173107624054, "learning_rate": 0.003472525234617988, "loss": 3.1130563735961916, "num_input_tokens_seen": 3995074560, "step": 7620, "train_runtime": 34617.7625, "train_tokens_per_second": 115405.337 }, { "epoch": 0.41288995914391624, "grad_norm": 0.16985613107681274, "learning_rate": 0.0034688009883191507, "loss": 3.1183204650878906, "num_input_tokens_seen": 4000317440, "step": 7630, "train_runtime": 34662.948, "train_tokens_per_second": 115406.152 }, { "epoch": 0.41343109932628047, "grad_norm": 0.15718990564346313, "learning_rate": 0.003465074547553963, "loss": 3.1192548751831053, "num_input_tokens_seen": 4005560320, "step": 7640, "train_runtime": 34711.7967, "train_tokens_per_second": 115394.785 }, { "epoch": 0.4139722395086447, "grad_norm": 0.16134141385555267, "learning_rate": 0.0034613459236990775, "loss": 3.1101545333862304, "num_input_tokens_seen": 4010803200, "step": 7650, "train_runtime": 34756.9452, "train_tokens_per_second": 115395.734 }, { "epoch": 0.41451337969100893, "grad_norm": 0.16892403364181519, "learning_rate": 0.0034576151281378127, "loss": 3.103810691833496, "num_input_tokens_seen": 4016046080, "step": 7660, "train_runtime": 34802.1069, "train_tokens_per_second": 115396.637 }, { "epoch": 0.4150545198733732, "grad_norm": 0.15722833573818207, "learning_rate": 0.003453882172260114, "loss": 3.109886360168457, "num_input_tokens_seen": 4021288960, "step": 7670, "train_runtime": 34847.275, "train_tokens_per_second": 115397.516 }, { "epoch": 0.41559566005573745, "grad_norm": 0.16605538129806519, "learning_rate": 0.0034501470674625258, "loss": 3.110805892944336, "num_input_tokens_seen": 4026531840, "step": 7680, "train_runtime": 34892.47, "train_tokens_per_second": 115398.303 }, { "epoch": 0.4161368002381017, "grad_norm": 0.1643964648246765, "learning_rate": 0.003446409825148149, "loss": 3.11865348815918, "num_input_tokens_seen": 4031774720, "step": 7690, "train_runtime": 34937.6366, "train_tokens_per_second": 115399.183 }, { "epoch": 0.4166779404204659, "grad_norm": 0.17231661081314087, "learning_rate": 0.003442670456726614, "loss": 3.117427444458008, "num_input_tokens_seen": 4037017600, "step": 7700, "train_runtime": 34982.8067, "train_tokens_per_second": 115400.049 }, { "epoch": 0.41721908060283014, "grad_norm": 0.16913042962551117, "learning_rate": 0.0034389289736140405, "loss": 3.1114864349365234, "num_input_tokens_seen": 4042260480, "step": 7710, "train_runtime": 35027.9883, "train_tokens_per_second": 115400.874 }, { "epoch": 0.4177602207851944, "grad_norm": 0.16182249784469604, "learning_rate": 0.0034351853872330042, "loss": 3.107219696044922, "num_input_tokens_seen": 4047503360, "step": 7720, "train_runtime": 35073.1627, "train_tokens_per_second": 115401.722 }, { "epoch": 0.41830136096755866, "grad_norm": 0.15614280104637146, "learning_rate": 0.003431439709012501, "loss": 3.10361385345459, "num_input_tokens_seen": 4052746240, "step": 7730, "train_runtime": 35118.3339, "train_tokens_per_second": 115402.577 }, { "epoch": 0.4188425011499229, "grad_norm": 0.16172853112220764, "learning_rate": 0.003427691950387916, "loss": 3.10665225982666, "num_input_tokens_seen": 4057989120, "step": 7740, "train_runtime": 35163.5186, "train_tokens_per_second": 115403.386 }, { "epoch": 0.4193836413322871, "grad_norm": 0.1584424078464508, "learning_rate": 0.0034239421228009826, "loss": 3.109303665161133, "num_input_tokens_seen": 4063232000, "step": 7750, "train_runtime": 35208.6903, "train_tokens_per_second": 115404.236 }, { "epoch": 0.41992478151465135, "grad_norm": 0.15736353397369385, "learning_rate": 0.0034201902376997523, "loss": 3.1072481155395506, "num_input_tokens_seen": 4068474880, "step": 7760, "train_runtime": 35253.8805, "train_tokens_per_second": 115405.023 }, { "epoch": 0.42046592169701563, "grad_norm": 0.158221036195755, "learning_rate": 0.0034164363065385577, "loss": 3.107033920288086, "num_input_tokens_seen": 4073717760, "step": 7770, "train_runtime": 35299.0377, "train_tokens_per_second": 115405.915 }, { "epoch": 0.42100706187937986, "grad_norm": 0.16100963950157166, "learning_rate": 0.0034126803407779783, "loss": 3.102493667602539, "num_input_tokens_seen": 4078960640, "step": 7780, "train_runtime": 35344.2177, "train_tokens_per_second": 115406.732 }, { "epoch": 0.4215482020617441, "grad_norm": 0.15508411824703217, "learning_rate": 0.0034089223518848043, "loss": 3.110720634460449, "num_input_tokens_seen": 4084203520, "step": 7790, "train_runtime": 35389.3807, "train_tokens_per_second": 115407.601 }, { "epoch": 0.4220893422441083, "grad_norm": 0.16234534978866577, "learning_rate": 0.0034051623513320028, "loss": 3.116852378845215, "num_input_tokens_seen": 4089446400, "step": 7800, "train_runtime": 35434.5473, "train_tokens_per_second": 115408.456 }, { "epoch": 0.42263048242647255, "grad_norm": 0.15150156617164612, "learning_rate": 0.003401400350598683, "loss": 3.110218048095703, "num_input_tokens_seen": 4094689280, "step": 7810, "train_runtime": 35479.7081, "train_tokens_per_second": 115409.328 }, { "epoch": 0.42317162260883684, "grad_norm": 0.16316647827625275, "learning_rate": 0.0033976363611700608, "loss": 3.099168395996094, "num_input_tokens_seen": 4099932160, "step": 7820, "train_runtime": 35524.9004, "train_tokens_per_second": 115410.096 }, { "epoch": 0.42371276279120107, "grad_norm": 0.15622437000274658, "learning_rate": 0.00339387039453742, "loss": 3.1079681396484373, "num_input_tokens_seen": 4105175040, "step": 7830, "train_runtime": 35570.568, "train_tokens_per_second": 115409.319 }, { "epoch": 0.4242539029735653, "grad_norm": 0.1611352562904358, "learning_rate": 0.0033901024621980865, "loss": 3.1027732849121095, "num_input_tokens_seen": 4110417920, "step": 7840, "train_runtime": 35628.7933, "train_tokens_per_second": 115367.868 }, { "epoch": 0.42479504315592953, "grad_norm": 0.1534154713153839, "learning_rate": 0.0033863325756553824, "loss": 3.1010990142822266, "num_input_tokens_seen": 4115660800, "step": 7850, "train_runtime": 35677.6783, "train_tokens_per_second": 115356.744 }, { "epoch": 0.42533618333829376, "grad_norm": 0.16484984755516052, "learning_rate": 0.0033825607464185994, "loss": 3.0935718536376955, "num_input_tokens_seen": 4120903680, "step": 7860, "train_runtime": 35722.8483, "train_tokens_per_second": 115357.646 }, { "epoch": 0.42587732352065805, "grad_norm": 0.15278859436511993, "learning_rate": 0.0033787869860029576, "loss": 3.095734405517578, "num_input_tokens_seen": 4126146560, "step": 7870, "train_runtime": 35768.0118, "train_tokens_per_second": 115358.566 }, { "epoch": 0.4264184637030223, "grad_norm": 0.16884206235408783, "learning_rate": 0.003375011305929574, "loss": 3.1056522369384765, "num_input_tokens_seen": 4131389440, "step": 7880, "train_runtime": 35813.1554, "train_tokens_per_second": 115359.549 }, { "epoch": 0.4269596038853865, "grad_norm": 0.15963584184646606, "learning_rate": 0.003371233717725426, "loss": 3.1040569305419923, "num_input_tokens_seen": 4136632320, "step": 7890, "train_runtime": 35858.3104, "train_tokens_per_second": 115360.492 }, { "epoch": 0.42750074406775074, "grad_norm": 0.1541411578655243, "learning_rate": 0.0033674542329233175, "loss": 3.1086753845214843, "num_input_tokens_seen": 4141875200, "step": 7900, "train_runtime": 35903.4547, "train_tokens_per_second": 115361.467 }, { "epoch": 0.42804188425011497, "grad_norm": 0.16819094121456146, "learning_rate": 0.003363672863061842, "loss": 3.108404350280762, "num_input_tokens_seen": 4147118080, "step": 7910, "train_runtime": 35948.5895, "train_tokens_per_second": 115362.47 }, { "epoch": 0.42858302443247925, "grad_norm": 0.15858127176761627, "learning_rate": 0.003359889619685346, "loss": 3.1061111450195313, "num_input_tokens_seen": 4152360960, "step": 7920, "train_runtime": 35993.7102, "train_tokens_per_second": 115363.516 }, { "epoch": 0.4291241646148435, "grad_norm": 0.15731550753116608, "learning_rate": 0.003356104514343899, "loss": 3.1057785034179686, "num_input_tokens_seen": 4157603840, "step": 7930, "train_runtime": 36038.862, "train_tokens_per_second": 115364.46 }, { "epoch": 0.4296653047972077, "grad_norm": 0.14610068500041962, "learning_rate": 0.0033523175585932524, "loss": 3.09300537109375, "num_input_tokens_seen": 4162846720, "step": 7940, "train_runtime": 36084.0029, "train_tokens_per_second": 115365.436 }, { "epoch": 0.43020644497957194, "grad_norm": 0.1672324687242508, "learning_rate": 0.003348528763994809, "loss": 3.1017438888549806, "num_input_tokens_seen": 4168089600, "step": 7950, "train_runtime": 36129.1342, "train_tokens_per_second": 115366.44 }, { "epoch": 0.4307475851619362, "grad_norm": 0.1700795590877533, "learning_rate": 0.003344738142115583, "loss": 3.0958410263061524, "num_input_tokens_seen": 4173332480, "step": 7960, "train_runtime": 36174.3238, "train_tokens_per_second": 115367.256 }, { "epoch": 0.43128872534430046, "grad_norm": 0.15165534615516663, "learning_rate": 0.00334094570452817, "loss": 3.101241874694824, "num_input_tokens_seen": 4178575360, "step": 7970, "train_runtime": 36219.5195, "train_tokens_per_second": 115368.051 }, { "epoch": 0.4318298655266647, "grad_norm": 0.1584347039461136, "learning_rate": 0.0033371514628107073, "loss": 3.101197052001953, "num_input_tokens_seen": 4183818240, "step": 7980, "train_runtime": 36264.6831, "train_tokens_per_second": 115368.945 }, { "epoch": 0.4323710057090289, "grad_norm": 0.15928910672664642, "learning_rate": 0.0033333554285468387, "loss": 3.1082935333251953, "num_input_tokens_seen": 4189061120, "step": 7990, "train_runtime": 36309.973, "train_tokens_per_second": 115369.436 }, { "epoch": 0.43291214589139315, "grad_norm": 0.15439751744270325, "learning_rate": 0.003329557613325685, "loss": 3.1111793518066406, "num_input_tokens_seen": 4194304000, "step": 8000, "train_runtime": 36355.1728, "train_tokens_per_second": 115370.212 }, { "epoch": 0.43291214589139315, "eval_loss": 3.0542824268341064, "eval_runtime": 1.9899, "eval_samples_per_second": 251.266, "eval_steps_per_second": 4.02, "num_input_tokens_seen": 4194304000, "step": 8000 }, { "epoch": 0.4334532860737574, "grad_norm": 0.1620936542749405, "learning_rate": 0.0033257580287417987, "loss": 3.1044567108154295, "num_input_tokens_seen": 4199546880, "step": 8010, "train_runtime": 36404.8051, "train_tokens_per_second": 115356.939 }, { "epoch": 0.43399442625612167, "grad_norm": 0.16753153502941132, "learning_rate": 0.0033219566863951383, "loss": 3.0971731185913085, "num_input_tokens_seen": 4204789760, "step": 8020, "train_runtime": 36454.2975, "train_tokens_per_second": 115344.144 }, { "epoch": 0.4345355664384859, "grad_norm": 0.15557527542114258, "learning_rate": 0.0033181535978910265, "loss": 3.099981689453125, "num_input_tokens_seen": 4210032640, "step": 8030, "train_runtime": 36499.5202, "train_tokens_per_second": 115344.876 }, { "epoch": 0.43507670662085013, "grad_norm": 0.15290997922420502, "learning_rate": 0.0033143487748401174, "loss": 3.1018728256225585, "num_input_tokens_seen": 4215275520, "step": 8040, "train_runtime": 36544.6707, "train_tokens_per_second": 115345.834 }, { "epoch": 0.43561784680321436, "grad_norm": 0.15225110948085785, "learning_rate": 0.0033105422288583616, "loss": 3.09820671081543, "num_input_tokens_seen": 4220518400, "step": 8050, "train_runtime": 36589.8576, "train_tokens_per_second": 115346.675 }, { "epoch": 0.4361589869855786, "grad_norm": 0.16044707596302032, "learning_rate": 0.003306733971566968, "loss": 3.1036590576171874, "num_input_tokens_seen": 4225761280, "step": 8060, "train_runtime": 36635.0375, "train_tokens_per_second": 115347.535 }, { "epoch": 0.4367001271679429, "grad_norm": 0.1795279085636139, "learning_rate": 0.0033029240145923708, "loss": 3.102092170715332, "num_input_tokens_seen": 4231004160, "step": 8070, "train_runtime": 36680.2289, "train_tokens_per_second": 115348.357 }, { "epoch": 0.4372412673503071, "grad_norm": 0.16536639630794525, "learning_rate": 0.003299112369566194, "loss": 3.101215934753418, "num_input_tokens_seen": 4236247040, "step": 8080, "train_runtime": 36725.4044, "train_tokens_per_second": 115349.228 }, { "epoch": 0.43778240753267134, "grad_norm": 0.1551489681005478, "learning_rate": 0.003295299048125215, "loss": 3.1048954010009764, "num_input_tokens_seen": 4241489920, "step": 8090, "train_runtime": 36770.5602, "train_tokens_per_second": 115350.158 }, { "epoch": 0.43832354771503557, "grad_norm": 0.15728254616260529, "learning_rate": 0.0032914840619113267, "loss": 3.0963891983032226, "num_input_tokens_seen": 4246732800, "step": 8100, "train_runtime": 36815.7389, "train_tokens_per_second": 115351.014 }, { "epoch": 0.4388646878973998, "grad_norm": 0.16317813098430634, "learning_rate": 0.0032876674225715092, "loss": 3.095835876464844, "num_input_tokens_seen": 4251975680, "step": 8110, "train_runtime": 36860.9041, "train_tokens_per_second": 115351.91 }, { "epoch": 0.4394058280797641, "grad_norm": 0.16513289511203766, "learning_rate": 0.0032838491417577845, "loss": 3.100272369384766, "num_input_tokens_seen": 4257218560, "step": 8120, "train_runtime": 36906.0614, "train_tokens_per_second": 115352.828 }, { "epoch": 0.4399469682621283, "grad_norm": 0.154524028301239, "learning_rate": 0.003280029231127189, "loss": 3.1007152557373048, "num_input_tokens_seen": 4262461440, "step": 8130, "train_runtime": 36951.2345, "train_tokens_per_second": 115353.695 }, { "epoch": 0.44048810844449254, "grad_norm": 0.16192783415317535, "learning_rate": 0.003276207702341735, "loss": 3.1067665100097654, "num_input_tokens_seen": 4267704320, "step": 8140, "train_runtime": 36996.3989, "train_tokens_per_second": 115354.587 }, { "epoch": 0.4410292486268568, "grad_norm": 0.1726672351360321, "learning_rate": 0.003272384567068373, "loss": 3.098089027404785, "num_input_tokens_seen": 4272947200, "step": 8150, "train_runtime": 37041.5682, "train_tokens_per_second": 115355.462 }, { "epoch": 0.441570388809221, "grad_norm": 0.14850319921970367, "learning_rate": 0.00326855983697896, "loss": 3.0921985626220705, "num_input_tokens_seen": 4278190080, "step": 8160, "train_runtime": 37086.741, "train_tokens_per_second": 115356.323 }, { "epoch": 0.4421115289915853, "grad_norm": 0.15166330337524414, "learning_rate": 0.0032647335237502195, "loss": 3.101424789428711, "num_input_tokens_seen": 4283432960, "step": 8170, "train_runtime": 37131.9267, "train_tokens_per_second": 115357.142 }, { "epoch": 0.4426526691739495, "grad_norm": 0.15639446675777435, "learning_rate": 0.0032609056390637114, "loss": 3.098773193359375, "num_input_tokens_seen": 4288675840, "step": 8180, "train_runtime": 37177.0966, "train_tokens_per_second": 115358.009 }, { "epoch": 0.44319380935631375, "grad_norm": 0.1532983034849167, "learning_rate": 0.003257076194605791, "loss": 3.1019330978393556, "num_input_tokens_seen": 4293918720, "step": 8190, "train_runtime": 37222.2634, "train_tokens_per_second": 115358.883 }, { "epoch": 0.443734949538678, "grad_norm": 0.17717613279819489, "learning_rate": 0.0032532452020675763, "loss": 3.099607467651367, "num_input_tokens_seen": 4299161600, "step": 8200, "train_runtime": 37267.4247, "train_tokens_per_second": 115359.772 }, { "epoch": 0.4442760897210422, "grad_norm": 0.15027566254138947, "learning_rate": 0.00324941267314491, "loss": 3.107021713256836, "num_input_tokens_seen": 4304404480, "step": 8210, "train_runtime": 37312.5891, "train_tokens_per_second": 115360.649 }, { "epoch": 0.4448172299034065, "grad_norm": 0.16586042940616608, "learning_rate": 0.0032455786195383285, "loss": 3.0993444442749025, "num_input_tokens_seen": 4309647360, "step": 8220, "train_runtime": 37357.762, "train_tokens_per_second": 115361.497 }, { "epoch": 0.4453583700857707, "grad_norm": 0.16227947175502777, "learning_rate": 0.00324174305295302, "loss": 3.0918336868286134, "num_input_tokens_seen": 4314890240, "step": 8230, "train_runtime": 37402.9165, "train_tokens_per_second": 115362.401 }, { "epoch": 0.44589951026813496, "grad_norm": 0.16855213046073914, "learning_rate": 0.0032379059850987926, "loss": 3.0997894287109373, "num_input_tokens_seen": 4320133120, "step": 8240, "train_runtime": 37448.08, "train_tokens_per_second": 115363.274 }, { "epoch": 0.4464406504504992, "grad_norm": 0.15484896302223206, "learning_rate": 0.003234067427690039, "loss": 3.0965702056884767, "num_input_tokens_seen": 4325376000, "step": 8250, "train_runtime": 37493.242, "train_tokens_per_second": 115364.15 }, { "epoch": 0.4469817906328634, "grad_norm": 0.16154596209526062, "learning_rate": 0.0032302273924456966, "loss": 3.0933055877685547, "num_input_tokens_seen": 4330618880, "step": 8260, "train_runtime": 37538.3815, "train_tokens_per_second": 115365.093 }, { "epoch": 0.4475229308152277, "grad_norm": 0.1575249284505844, "learning_rate": 0.003226385891089219, "loss": 3.0924747467041014, "num_input_tokens_seen": 4335861760, "step": 8270, "train_runtime": 37583.5299, "train_tokens_per_second": 115366.007 }, { "epoch": 0.44806407099759193, "grad_norm": 0.160599023103714, "learning_rate": 0.0032225429353485296, "loss": 3.096691131591797, "num_input_tokens_seen": 4341104640, "step": 8280, "train_runtime": 37628.6879, "train_tokens_per_second": 115366.889 }, { "epoch": 0.44860521117995616, "grad_norm": 0.15803949534893036, "learning_rate": 0.003218698536955999, "loss": 3.1002126693725587, "num_input_tokens_seen": 4346347520, "step": 8290, "train_runtime": 37673.8359, "train_tokens_per_second": 115367.799 }, { "epoch": 0.4491463513623204, "grad_norm": 0.1458759903907776, "learning_rate": 0.0032148527076483963, "loss": 3.0890472412109373, "num_input_tokens_seen": 4351590400, "step": 8300, "train_runtime": 37718.9873, "train_tokens_per_second": 115368.697 }, { "epoch": 0.4496874915446846, "grad_norm": 0.15396055579185486, "learning_rate": 0.0032110054591668624, "loss": 3.0894855499267577, "num_input_tokens_seen": 4356833280, "step": 8310, "train_runtime": 37764.1751, "train_tokens_per_second": 115369.481 }, { "epoch": 0.4502286317270489, "grad_norm": 0.15826280415058136, "learning_rate": 0.0032071568032568704, "loss": 3.1003223419189454, "num_input_tokens_seen": 4362076160, "step": 8320, "train_runtime": 37809.3568, "train_tokens_per_second": 115370.282 }, { "epoch": 0.45076977190941314, "grad_norm": 0.16446684300899506, "learning_rate": 0.003203306751668188, "loss": 3.093168258666992, "num_input_tokens_seen": 4367319040, "step": 8330, "train_runtime": 37854.5343, "train_tokens_per_second": 115371.094 }, { "epoch": 0.45131091209177737, "grad_norm": 0.1580137461423874, "learning_rate": 0.0031994553161548474, "loss": 3.101323699951172, "num_input_tokens_seen": 4372561920, "step": 8340, "train_runtime": 37899.7112, "train_tokens_per_second": 115371.906 }, { "epoch": 0.4518520522741416, "grad_norm": 0.15007439255714417, "learning_rate": 0.003195602508475103, "loss": 3.0974876403808596, "num_input_tokens_seen": 4377804800, "step": 8350, "train_runtime": 37944.8699, "train_tokens_per_second": 115372.771 }, { "epoch": 0.45239319245650583, "grad_norm": 0.16612504422664642, "learning_rate": 0.0031917483403914, "loss": 3.097567558288574, "num_input_tokens_seen": 4383047680, "step": 8360, "train_runtime": 37990.0345, "train_tokens_per_second": 115373.617 }, { "epoch": 0.4529343326388701, "grad_norm": 0.152009978890419, "learning_rate": 0.0031878928236703354, "loss": 3.09008674621582, "num_input_tokens_seen": 4388290560, "step": 8370, "train_runtime": 38035.2108, "train_tokens_per_second": 115374.425 }, { "epoch": 0.45347547282123435, "grad_norm": 0.14635391533374786, "learning_rate": 0.003184035970082625, "loss": 3.0835281372070313, "num_input_tokens_seen": 4393533440, "step": 8380, "train_runtime": 38080.3814, "train_tokens_per_second": 115375.248 }, { "epoch": 0.4540166130035986, "grad_norm": 0.16529026627540588, "learning_rate": 0.0031801777914030657, "loss": 3.0935291290283202, "num_input_tokens_seen": 4398776320, "step": 8390, "train_runtime": 38125.5235, "train_tokens_per_second": 115376.155 }, { "epoch": 0.4545577531859628, "grad_norm": 0.15716882050037384, "learning_rate": 0.003176318299410499, "loss": 3.0900102615356446, "num_input_tokens_seen": 4404019200, "step": 8400, "train_runtime": 38175.1087, "train_tokens_per_second": 115363.632 }, { "epoch": 0.45509889336832704, "grad_norm": 0.15623128414154053, "learning_rate": 0.003172457505887777, "loss": 3.0833271026611326, "num_input_tokens_seen": 4409262080, "step": 8410, "train_runtime": 38220.3198, "train_tokens_per_second": 115364.343 }, { "epoch": 0.4556400335506913, "grad_norm": 0.1591528207063675, "learning_rate": 0.0031685954226217234, "loss": 3.0901105880737303, "num_input_tokens_seen": 4414504960, "step": 8420, "train_runtime": 38265.5513, "train_tokens_per_second": 115364.99 }, { "epoch": 0.45618117373305556, "grad_norm": 0.16141286492347717, "learning_rate": 0.003164732061403102, "loss": 3.0906259536743166, "num_input_tokens_seen": 4419747840, "step": 8430, "train_runtime": 38310.7985, "train_tokens_per_second": 115365.589 }, { "epoch": 0.4567223139154198, "grad_norm": 0.15204599499702454, "learning_rate": 0.0031608674340265768, "loss": 3.084097671508789, "num_input_tokens_seen": 4424990720, "step": 8440, "train_runtime": 38356.0563, "train_tokens_per_second": 115366.154 }, { "epoch": 0.457263454097784, "grad_norm": 0.15592141449451447, "learning_rate": 0.003157001552290677, "loss": 3.0875980377197267, "num_input_tokens_seen": 4430233600, "step": 8450, "train_runtime": 38401.2772, "train_tokens_per_second": 115366.829 }, { "epoch": 0.45780459428014825, "grad_norm": 0.15805137157440186, "learning_rate": 0.0031531344279977615, "loss": 3.0840667724609374, "num_input_tokens_seen": 4435476480, "step": 8460, "train_runtime": 38446.5182, "train_tokens_per_second": 115367.443 }, { "epoch": 0.45834573446251253, "grad_norm": 0.1569671630859375, "learning_rate": 0.003149266072953983, "loss": 3.095382308959961, "num_input_tokens_seen": 4440719360, "step": 8470, "train_runtime": 38491.7485, "train_tokens_per_second": 115368.086 }, { "epoch": 0.45888687464487676, "grad_norm": 0.15263330936431885, "learning_rate": 0.0031453964989692517, "loss": 3.0909893035888674, "num_input_tokens_seen": 4445962240, "step": 8480, "train_runtime": 38536.9781, "train_tokens_per_second": 115368.73 }, { "epoch": 0.459428014827241, "grad_norm": 0.16741898655891418, "learning_rate": 0.0031415257178571986, "loss": 3.091363525390625, "num_input_tokens_seen": 4451205120, "step": 8490, "train_runtime": 38582.2145, "train_tokens_per_second": 115369.353 }, { "epoch": 0.4599691550096052, "grad_norm": 0.1621858924627304, "learning_rate": 0.0031376537414351414, "loss": 3.0860706329345704, "num_input_tokens_seen": 4456448000, "step": 8500, "train_runtime": 38627.4547, "train_tokens_per_second": 115369.963 }, { "epoch": 0.4599691550096052, "eval_loss": 3.044252634048462, "eval_runtime": 1.9941, "eval_samples_per_second": 250.745, "eval_steps_per_second": 4.012, "num_input_tokens_seen": 4456448000, "step": 8500 }, { "epoch": 0.46051029519196945, "grad_norm": 0.1537708044052124, "learning_rate": 0.0031337805815240443, "loss": 3.0971357345581056, "num_input_tokens_seen": 4461690880, "step": 8510, "train_runtime": 38674.6947, "train_tokens_per_second": 115364.605 }, { "epoch": 0.46105143537433374, "grad_norm": 0.1561785489320755, "learning_rate": 0.0031299062499484886, "loss": 3.095275115966797, "num_input_tokens_seen": 4466933760, "step": 8520, "train_runtime": 38719.9322, "train_tokens_per_second": 115365.227 }, { "epoch": 0.46159257555669797, "grad_norm": 0.15904352068901062, "learning_rate": 0.0031260307585366277, "loss": 3.093882942199707, "num_input_tokens_seen": 4472176640, "step": 8530, "train_runtime": 38765.1761, "train_tokens_per_second": 115365.828 }, { "epoch": 0.4621337157390622, "grad_norm": 0.1586320400238037, "learning_rate": 0.00312215411912016, "loss": 3.0862545013427733, "num_input_tokens_seen": 4477419520, "step": 8540, "train_runtime": 38810.3531, "train_tokens_per_second": 115366.627 }, { "epoch": 0.46267485592142643, "grad_norm": 0.15955059230327606, "learning_rate": 0.003118276343534288, "loss": 3.09029598236084, "num_input_tokens_seen": 4482662400, "step": 8550, "train_runtime": 38855.5252, "train_tokens_per_second": 115367.438 }, { "epoch": 0.46321599610379066, "grad_norm": 0.16844794154167175, "learning_rate": 0.0031143974436176804, "loss": 3.08276252746582, "num_input_tokens_seen": 4487905280, "step": 8560, "train_runtime": 38900.7107, "train_tokens_per_second": 115368.208 }, { "epoch": 0.46375713628615495, "grad_norm": 0.15490221977233887, "learning_rate": 0.003110517431212442, "loss": 3.096157455444336, "num_input_tokens_seen": 4493148160, "step": 8570, "train_runtime": 38945.895, "train_tokens_per_second": 115368.979 }, { "epoch": 0.4642982764685192, "grad_norm": 0.1643703430891037, "learning_rate": 0.0031066363181640705, "loss": 3.094961929321289, "num_input_tokens_seen": 4498391040, "step": 8580, "train_runtime": 38991.0775, "train_tokens_per_second": 115369.754 }, { "epoch": 0.4648394166508834, "grad_norm": 0.16452111303806305, "learning_rate": 0.003102754116321427, "loss": 3.0949285507202147, "num_input_tokens_seen": 4503633920, "step": 8590, "train_runtime": 39036.2496, "train_tokens_per_second": 115370.558 }, { "epoch": 0.46538055683324764, "grad_norm": 0.15409614145755768, "learning_rate": 0.003098870837536694, "loss": 3.083492660522461, "num_input_tokens_seen": 4508876800, "step": 8600, "train_runtime": 39081.4455, "train_tokens_per_second": 115371.29 }, { "epoch": 0.46592169701561187, "grad_norm": 0.16283227503299713, "learning_rate": 0.0030949864936653444, "loss": 3.0859600067138673, "num_input_tokens_seen": 4514119680, "step": 8610, "train_runtime": 39126.6271, "train_tokens_per_second": 115372.063 }, { "epoch": 0.46646283719797615, "grad_norm": 0.15932060778141022, "learning_rate": 0.0030911010965660995, "loss": 3.0858314514160154, "num_input_tokens_seen": 4519362560, "step": 8620, "train_runtime": 39171.8041, "train_tokens_per_second": 115372.847 }, { "epoch": 0.4670039773803404, "grad_norm": 0.15321630239486694, "learning_rate": 0.0030872146581008993, "loss": 3.0855281829833983, "num_input_tokens_seen": 4524605440, "step": 8630, "train_runtime": 39217.0067, "train_tokens_per_second": 115373.554 }, { "epoch": 0.4675451175627046, "grad_norm": 0.15142542123794556, "learning_rate": 0.0030833271901348604, "loss": 3.0922718048095703, "num_input_tokens_seen": 4529848320, "step": 8640, "train_runtime": 39262.1839, "train_tokens_per_second": 115374.334 }, { "epoch": 0.46808625774506885, "grad_norm": 0.15679921209812164, "learning_rate": 0.0030794387045362448, "loss": 3.089971923828125, "num_input_tokens_seen": 4535091200, "step": 8650, "train_runtime": 39307.3714, "train_tokens_per_second": 115375.082 }, { "epoch": 0.4686273979274331, "grad_norm": 0.149771049618721, "learning_rate": 0.0030755492131764196, "loss": 3.0910947799682615, "num_input_tokens_seen": 4540334080, "step": 8660, "train_runtime": 39352.6168, "train_tokens_per_second": 115375.659 }, { "epoch": 0.46916853810979736, "grad_norm": 0.15804412961006165, "learning_rate": 0.003071658727929823, "loss": 3.096923065185547, "num_input_tokens_seen": 4545576960, "step": 8670, "train_runtime": 39397.8863, "train_tokens_per_second": 115376.163 }, { "epoch": 0.4697096782921616, "grad_norm": 0.17401130497455597, "learning_rate": 0.003067767260673929, "loss": 3.0941158294677735, "num_input_tokens_seen": 4550819840, "step": 8680, "train_runtime": 39443.148, "train_tokens_per_second": 115376.69 }, { "epoch": 0.4702508184745258, "grad_norm": 0.16347981989383698, "learning_rate": 0.003063874823289205, "loss": 3.0893718719482424, "num_input_tokens_seen": 4556062720, "step": 8690, "train_runtime": 39488.3966, "train_tokens_per_second": 115377.253 }, { "epoch": 0.47079195865689005, "grad_norm": 0.16059865057468414, "learning_rate": 0.003059981427659086, "loss": 3.0792430877685546, "num_input_tokens_seen": 4561305600, "step": 8700, "train_runtime": 39533.6411, "train_tokens_per_second": 115377.827 }, { "epoch": 0.4713330988392543, "grad_norm": 0.15623228251934052, "learning_rate": 0.0030560870856699285, "loss": 3.0796392440795897, "num_input_tokens_seen": 4566548480, "step": 8710, "train_runtime": 39578.8987, "train_tokens_per_second": 115378.361 }, { "epoch": 0.47187423902161857, "grad_norm": 0.17271380126476288, "learning_rate": 0.003052191809210979, "loss": 3.0749179840087892, "num_input_tokens_seen": 4571791360, "step": 8720, "train_runtime": 39624.1426, "train_tokens_per_second": 115378.935 }, { "epoch": 0.4724153792039828, "grad_norm": 0.1617797613143921, "learning_rate": 0.0030482956101743385, "loss": 3.077177047729492, "num_input_tokens_seen": 4577034240, "step": 8730, "train_runtime": 39669.3874, "train_tokens_per_second": 115379.504 }, { "epoch": 0.47295651938634703, "grad_norm": 0.15339480340480804, "learning_rate": 0.0030443985004549234, "loss": 3.0854717254638673, "num_input_tokens_seen": 4582277120, "step": 8740, "train_runtime": 39714.6736, "train_tokens_per_second": 115379.952 }, { "epoch": 0.47349765956871126, "grad_norm": 0.1538633406162262, "learning_rate": 0.00304050049195043, "loss": 3.0906457901000977, "num_input_tokens_seen": 4587520000, "step": 8750, "train_runtime": 39759.8518, "train_tokens_per_second": 115380.712 }, { "epoch": 0.4740387997510755, "grad_norm": 0.16446056962013245, "learning_rate": 0.0030366015965612976, "loss": 3.0834827423095703, "num_input_tokens_seen": 4592762880, "step": 8760, "train_runtime": 39805.0284, "train_tokens_per_second": 115381.475 }, { "epoch": 0.4745799399334398, "grad_norm": 0.15852907299995422, "learning_rate": 0.003032701826190677, "loss": 3.077737808227539, "num_input_tokens_seen": 4598005760, "step": 8770, "train_runtime": 39850.2201, "train_tokens_per_second": 115382.192 }, { "epoch": 0.475121080115804, "grad_norm": 0.15762916207313538, "learning_rate": 0.003028801192744386, "loss": 3.074782943725586, "num_input_tokens_seen": 4603248640, "step": 8780, "train_runtime": 39899.2419, "train_tokens_per_second": 115371.832 }, { "epoch": 0.47566222029816824, "grad_norm": 0.1619185209274292, "learning_rate": 0.0030248997081308788, "loss": 3.0825977325439453, "num_input_tokens_seen": 4608491520, "step": 8790, "train_runtime": 39944.5192, "train_tokens_per_second": 115372.312 }, { "epoch": 0.47620336048053247, "grad_norm": 0.16285711526870728, "learning_rate": 0.0030209973842612097, "loss": 3.080776405334473, "num_input_tokens_seen": 4613734400, "step": 8800, "train_runtime": 39989.7251, "train_tokens_per_second": 115372.996 }, { "epoch": 0.4767445006628967, "grad_norm": 0.17198577523231506, "learning_rate": 0.003017094233048994, "loss": 3.0829303741455076, "num_input_tokens_seen": 4618977280, "step": 8810, "train_runtime": 40034.9384, "train_tokens_per_second": 115373.658 }, { "epoch": 0.477285640845261, "grad_norm": 0.16893431544303894, "learning_rate": 0.003013190266410372, "loss": 3.0930507659912108, "num_input_tokens_seen": 4624220160, "step": 8820, "train_runtime": 40080.146, "train_tokens_per_second": 115374.334 }, { "epoch": 0.4778267810276252, "grad_norm": 0.1534847915172577, "learning_rate": 0.003009285496263973, "loss": 3.086047554016113, "num_input_tokens_seen": 4629463040, "step": 8830, "train_runtime": 40125.3234, "train_tokens_per_second": 115375.096 }, { "epoch": 0.47836792120998944, "grad_norm": 0.16068360209465027, "learning_rate": 0.003005379934530884, "loss": 3.0864025115966798, "num_input_tokens_seen": 4634705920, "step": 8840, "train_runtime": 40170.4754, "train_tokens_per_second": 115375.929 }, { "epoch": 0.4789090613923537, "grad_norm": 0.1436556577682495, "learning_rate": 0.003001473593134602, "loss": 3.0830524444580076, "num_input_tokens_seen": 4639948800, "step": 8850, "train_runtime": 40215.6345, "train_tokens_per_second": 115376.74 }, { "epoch": 0.4794502015747179, "grad_norm": 0.14522488415241241, "learning_rate": 0.0029975664840010104, "loss": 3.0799121856689453, "num_input_tokens_seen": 4645191680, "step": 8860, "train_runtime": 40260.7882, "train_tokens_per_second": 115377.564 }, { "epoch": 0.4799913417570822, "grad_norm": 0.16195201873779297, "learning_rate": 0.002993658619058331, "loss": 3.071552848815918, "num_input_tokens_seen": 4650434560, "step": 8870, "train_runtime": 40305.9418, "train_tokens_per_second": 115378.387 }, { "epoch": 0.4805324819394464, "grad_norm": 0.14948424696922302, "learning_rate": 0.0029897500102370974, "loss": 3.0818138122558594, "num_input_tokens_seen": 4655677440, "step": 8880, "train_runtime": 40351.0976, "train_tokens_per_second": 115379.202 }, { "epoch": 0.48107362212181065, "grad_norm": 0.1461019665002823, "learning_rate": 0.0029858406694701117, "loss": 3.082274627685547, "num_input_tokens_seen": 4660920320, "step": 8890, "train_runtime": 40396.2463, "train_tokens_per_second": 115380.035 }, { "epoch": 0.4816147623041749, "grad_norm": 0.1501043438911438, "learning_rate": 0.0029819306086924127, "loss": 3.083462142944336, "num_input_tokens_seen": 4666163200, "step": 8900, "train_runtime": 40441.3988, "train_tokens_per_second": 115380.856 }, { "epoch": 0.4821559024865391, "grad_norm": 0.15929782390594482, "learning_rate": 0.002978019839841233, "loss": 3.0869064331054688, "num_input_tokens_seen": 4671406080, "step": 8910, "train_runtime": 40486.5499, "train_tokens_per_second": 115381.678 }, { "epoch": 0.4826970426689034, "grad_norm": 0.15982982516288757, "learning_rate": 0.002974108374855974, "loss": 3.082635688781738, "num_input_tokens_seen": 4676648960, "step": 8920, "train_runtime": 40531.6946, "train_tokens_per_second": 115382.518 }, { "epoch": 0.48323818285126763, "grad_norm": 0.15398921072483063, "learning_rate": 0.0029701962256781555, "loss": 3.069881820678711, "num_input_tokens_seen": 4681891840, "step": 8930, "train_runtime": 40576.8387, "train_tokens_per_second": 115383.356 }, { "epoch": 0.48377932303363186, "grad_norm": 0.16303351521492004, "learning_rate": 0.0029662834042513903, "loss": 3.078609085083008, "num_input_tokens_seen": 4687134720, "step": 8940, "train_runtime": 40621.9936, "train_tokens_per_second": 115384.163 }, { "epoch": 0.4843204632159961, "grad_norm": 0.16261766850948334, "learning_rate": 0.0029623699225213417, "loss": 3.072034454345703, "num_input_tokens_seen": 4692377600, "step": 8950, "train_runtime": 40667.1772, "train_tokens_per_second": 115384.886 }, { "epoch": 0.4848616033983603, "grad_norm": 0.14818759262561798, "learning_rate": 0.002958455792435689, "loss": 3.077336883544922, "num_input_tokens_seen": 4697620480, "step": 8960, "train_runtime": 40712.3367, "train_tokens_per_second": 115385.676 }, { "epoch": 0.4854027435807246, "grad_norm": 0.1536484807729721, "learning_rate": 0.002954541025944093, "loss": 3.0622703552246096, "num_input_tokens_seen": 4702863360, "step": 8970, "train_runtime": 40757.516, "train_tokens_per_second": 115386.408 }, { "epoch": 0.48594388376308884, "grad_norm": 0.1563226282596588, "learning_rate": 0.002950625634998154, "loss": 3.0665721893310547, "num_input_tokens_seen": 4708106240, "step": 8980, "train_runtime": 40802.7032, "train_tokens_per_second": 115387.116 }, { "epoch": 0.48648502394545307, "grad_norm": 0.15256533026695251, "learning_rate": 0.0029467096315513802, "loss": 3.0700511932373047, "num_input_tokens_seen": 4713349120, "step": 8990, "train_runtime": 40847.8776, "train_tokens_per_second": 115387.859 }, { "epoch": 0.4870261641278173, "grad_norm": 0.1605551838874817, "learning_rate": 0.0029427930275591515, "loss": 3.076490592956543, "num_input_tokens_seen": 4718592000, "step": 9000, "train_runtime": 40893.0303, "train_tokens_per_second": 115388.661 }, { "epoch": 0.4870261641278173, "eval_loss": 3.034029483795166, "eval_runtime": 1.9847, "eval_samples_per_second": 251.93, "eval_steps_per_second": 4.031, "num_input_tokens_seen": 4718592000, "step": 9000 }, { "epoch": 0.4875673043101815, "grad_norm": 0.16099503636360168, "learning_rate": 0.0029388758349786787, "loss": 3.081180953979492, "num_input_tokens_seen": 4723834880, "step": 9010, "train_runtime": 40942.5164, "train_tokens_per_second": 115377.248 }, { "epoch": 0.4881084444925458, "grad_norm": 0.1605864018201828, "learning_rate": 0.0029349580657689707, "loss": 3.0802078247070312, "num_input_tokens_seen": 4729077760, "step": 9020, "train_runtime": 40987.6712, "train_tokens_per_second": 115378.054 }, { "epoch": 0.48864958467491004, "grad_norm": 0.15125182271003723, "learning_rate": 0.0029310397318907965, "loss": 3.090005111694336, "num_input_tokens_seen": 4734320640, "step": 9030, "train_runtime": 41032.8355, "train_tokens_per_second": 115378.832 }, { "epoch": 0.4891907248572743, "grad_norm": 0.16026070713996887, "learning_rate": 0.002927120845306649, "loss": 3.087236785888672, "num_input_tokens_seen": 4739563520, "step": 9040, "train_runtime": 41077.9854, "train_tokens_per_second": 115379.649 }, { "epoch": 0.4897318650396385, "grad_norm": 0.1533481776714325, "learning_rate": 0.0029232014179807098, "loss": 3.0772159576416014, "num_input_tokens_seen": 4744806400, "step": 9050, "train_runtime": 41123.1477, "train_tokens_per_second": 115380.429 }, { "epoch": 0.49027300522200273, "grad_norm": 0.16315831243991852, "learning_rate": 0.002919281461878809, "loss": 3.080950927734375, "num_input_tokens_seen": 4750049280, "step": 9060, "train_runtime": 41168.3038, "train_tokens_per_second": 115381.224 }, { "epoch": 0.490814145404367, "grad_norm": 0.16064807772636414, "learning_rate": 0.0029153609889683934, "loss": 3.077931594848633, "num_input_tokens_seen": 4755292160, "step": 9070, "train_runtime": 41213.4562, "train_tokens_per_second": 115382.028 }, { "epoch": 0.49135528558673125, "grad_norm": 0.15968522429466248, "learning_rate": 0.0029114400112184857, "loss": 3.0715621948242187, "num_input_tokens_seen": 4760535040, "step": 9080, "train_runtime": 41258.6304, "train_tokens_per_second": 115382.769 }, { "epoch": 0.4918964257690955, "grad_norm": 0.21646282076835632, "learning_rate": 0.0029075185405996497, "loss": 3.070268249511719, "num_input_tokens_seen": 4765777920, "step": 9090, "train_runtime": 41303.7917, "train_tokens_per_second": 115383.545 }, { "epoch": 0.4924375659514597, "grad_norm": 0.14861001074314117, "learning_rate": 0.0029035965890839566, "loss": 3.0772144317626955, "num_input_tokens_seen": 4771020800, "step": 9100, "train_runtime": 41348.9385, "train_tokens_per_second": 115384.36 }, { "epoch": 0.49297870613382394, "grad_norm": 0.15690362453460693, "learning_rate": 0.0028996741686449427, "loss": 3.079457092285156, "num_input_tokens_seen": 4776263680, "step": 9110, "train_runtime": 41394.0963, "train_tokens_per_second": 115385.142 }, { "epoch": 0.4935198463161882, "grad_norm": 0.1561357080936432, "learning_rate": 0.0028957512912575777, "loss": 3.081951141357422, "num_input_tokens_seen": 4781506560, "step": 9120, "train_runtime": 41439.2561, "train_tokens_per_second": 115385.917 }, { "epoch": 0.49406098649855246, "grad_norm": 0.15572723746299744, "learning_rate": 0.002891827968898225, "loss": 3.0684499740600586, "num_input_tokens_seen": 4786749440, "step": 9130, "train_runtime": 41484.4245, "train_tokens_per_second": 115386.666 }, { "epoch": 0.4946021266809167, "grad_norm": 0.14807617664337158, "learning_rate": 0.0028879042135446092, "loss": 3.0712486267089845, "num_input_tokens_seen": 4791992320, "step": 9140, "train_runtime": 41529.5787, "train_tokens_per_second": 115387.453 }, { "epoch": 0.4951432668632809, "grad_norm": 0.15159912407398224, "learning_rate": 0.0028839800371757724, "loss": 3.0685661315917967, "num_input_tokens_seen": 4797235200, "step": 9150, "train_runtime": 41574.7343, "train_tokens_per_second": 115388.235 }, { "epoch": 0.49568440704564515, "grad_norm": 0.15283788740634918, "learning_rate": 0.0028800554517720467, "loss": 3.066938591003418, "num_input_tokens_seen": 4802478080, "step": 9160, "train_runtime": 41623.4446, "train_tokens_per_second": 115379.16 }, { "epoch": 0.49622554722800943, "grad_norm": 0.14532612264156342, "learning_rate": 0.0028761304693150093, "loss": 3.0726764678955076, "num_input_tokens_seen": 4807720960, "step": 9170, "train_runtime": 41668.6161, "train_tokens_per_second": 115379.905 }, { "epoch": 0.49676668741037366, "grad_norm": 0.15642227232456207, "learning_rate": 0.0028722051017874514, "loss": 3.075974464416504, "num_input_tokens_seen": 4812963840, "step": 9180, "train_runtime": 41713.7758, "train_tokens_per_second": 115380.681 }, { "epoch": 0.4973078275927379, "grad_norm": 0.15629522502422333, "learning_rate": 0.00286827936117334, "loss": 3.0699131011962892, "num_input_tokens_seen": 4818206720, "step": 9190, "train_runtime": 41758.9432, "train_tokens_per_second": 115381.433 }, { "epoch": 0.4978489677751021, "grad_norm": 0.15175525844097137, "learning_rate": 0.00286435325945778, "loss": 3.0690542221069337, "num_input_tokens_seen": 4823449600, "step": 9200, "train_runtime": 41804.1291, "train_tokens_per_second": 115382.133 }, { "epoch": 0.49839010795746636, "grad_norm": 0.14598695933818817, "learning_rate": 0.0028604268086269793, "loss": 3.072031021118164, "num_input_tokens_seen": 4828692480, "step": 9210, "train_runtime": 41849.2968, "train_tokens_per_second": 115382.882 }, { "epoch": 0.49893124813983064, "grad_norm": 0.14812366664409637, "learning_rate": 0.0028565000206682125, "loss": 3.074822998046875, "num_input_tokens_seen": 4833935360, "step": 9220, "train_runtime": 41894.4632, "train_tokens_per_second": 115383.633 }, { "epoch": 0.49947238832219487, "grad_norm": 0.1581128090620041, "learning_rate": 0.0028525729075697813, "loss": 3.071183967590332, "num_input_tokens_seen": 4839178240, "step": 9230, "train_runtime": 41939.6385, "train_tokens_per_second": 115384.357 }, { "epoch": 0.5000135285045592, "grad_norm": 0.160576730966568, "learning_rate": 0.002848645481320983, "loss": 3.079146385192871, "num_input_tokens_seen": 4844421120, "step": 9240, "train_runtime": 41984.8089, "train_tokens_per_second": 115385.094 }, { "epoch": 0.5005546686869233, "grad_norm": 0.1423659771680832, "learning_rate": 0.002844717753912068, "loss": 3.0759227752685545, "num_input_tokens_seen": 4849664000, "step": 9250, "train_runtime": 42029.9847, "train_tokens_per_second": 115385.814 }, { "epoch": 0.5010958088692876, "grad_norm": 0.14177994430065155, "learning_rate": 0.0028407897373342074, "loss": 3.076811599731445, "num_input_tokens_seen": 4854906880, "step": 9260, "train_runtime": 42075.1549, "train_tokens_per_second": 115386.548 }, { "epoch": 0.5016369490516518, "grad_norm": 0.14039497077465057, "learning_rate": 0.002836861443579456, "loss": 3.0762613296508787, "num_input_tokens_seen": 4860149760, "step": 9270, "train_runtime": 42120.3369, "train_tokens_per_second": 115387.248 }, { "epoch": 0.5021780892340161, "grad_norm": 0.15003980696201324, "learning_rate": 0.0028329328846407125, "loss": 3.0661956787109377, "num_input_tokens_seen": 4865392640, "step": 9280, "train_runtime": 42165.525, "train_tokens_per_second": 115387.93 }, { "epoch": 0.5027192294163804, "grad_norm": 0.1639668196439743, "learning_rate": 0.0028290040725116876, "loss": 3.077253723144531, "num_input_tokens_seen": 4870635520, "step": 9290, "train_runtime": 42210.7142, "train_tokens_per_second": 115388.607 }, { "epoch": 0.5032603695987445, "grad_norm": 0.15424971282482147, "learning_rate": 0.002825075019186865, "loss": 3.0679557800292967, "num_input_tokens_seen": 4875878400, "step": 9300, "train_runtime": 42255.9028, "train_tokens_per_second": 115389.285 }, { "epoch": 0.5038015097811088, "grad_norm": 0.1511068195104599, "learning_rate": 0.0028211457366614607, "loss": 3.0695865631103514, "num_input_tokens_seen": 4881121280, "step": 9310, "train_runtime": 42301.0768, "train_tokens_per_second": 115390.001 }, { "epoch": 0.504342649963473, "grad_norm": 0.15356752276420593, "learning_rate": 0.002817216236931397, "loss": 3.073322296142578, "num_input_tokens_seen": 4886364160, "step": 9320, "train_runtime": 42346.2459, "train_tokens_per_second": 115390.728 }, { "epoch": 0.5048837901458373, "grad_norm": 0.14986173808574677, "learning_rate": 0.002813286531993253, "loss": 3.07531681060791, "num_input_tokens_seen": 4891607040, "step": 9330, "train_runtime": 42391.4328, "train_tokens_per_second": 115391.406 }, { "epoch": 0.5054249303282016, "grad_norm": 0.14537614583969116, "learning_rate": 0.0028093566338442395, "loss": 3.0746026992797852, "num_input_tokens_seen": 4896849920, "step": 9340, "train_runtime": 42436.5896, "train_tokens_per_second": 115392.164 }, { "epoch": 0.5059660705105657, "grad_norm": 0.15007568895816803, "learning_rate": 0.0028054265544821522, "loss": 3.0845333099365235, "num_input_tokens_seen": 4902092800, "step": 9350, "train_runtime": 42481.7468, "train_tokens_per_second": 115392.92 }, { "epoch": 0.50650721069293, "grad_norm": 0.15155982971191406, "learning_rate": 0.0028014963059053446, "loss": 3.0744888305664064, "num_input_tokens_seen": 4907335680, "step": 9360, "train_runtime": 42526.8939, "train_tokens_per_second": 115393.701 }, { "epoch": 0.5070483508752942, "grad_norm": 0.15760909020900726, "learning_rate": 0.002797565900112684, "loss": 3.0650793075561524, "num_input_tokens_seen": 4912578560, "step": 9370, "train_runtime": 42572.0476, "train_tokens_per_second": 115394.463 }, { "epoch": 0.5075894910576585, "grad_norm": 0.156438410282135, "learning_rate": 0.0027936353491035183, "loss": 3.0668895721435545, "num_input_tokens_seen": 4917821440, "step": 9380, "train_runtime": 42617.1956, "train_tokens_per_second": 115395.238 }, { "epoch": 0.5081306312400228, "grad_norm": 0.16406750679016113, "learning_rate": 0.0027897046648776395, "loss": 3.061408042907715, "num_input_tokens_seen": 4923064320, "step": 9390, "train_runtime": 42662.3399, "train_tokens_per_second": 115396.022 }, { "epoch": 0.508671771422387, "grad_norm": 0.13937264680862427, "learning_rate": 0.002785773859435245, "loss": 3.069793128967285, "num_input_tokens_seen": 4928307200, "step": 9400, "train_runtime": 42707.5213, "train_tokens_per_second": 115396.704 }, { "epoch": 0.5092129116047512, "grad_norm": 0.15395483374595642, "learning_rate": 0.0027818429447769044, "loss": 3.071869659423828, "num_input_tokens_seen": 4933550080, "step": 9410, "train_runtime": 42752.6998, "train_tokens_per_second": 115397.392 }, { "epoch": 0.5097540517871154, "grad_norm": 0.15119874477386475, "learning_rate": 0.0027779119329035167, "loss": 3.067423629760742, "num_input_tokens_seen": 4938792960, "step": 9420, "train_runtime": 42797.8655, "train_tokens_per_second": 115398.114 }, { "epoch": 0.5102951919694797, "grad_norm": 0.1615118384361267, "learning_rate": 0.002773980835816284, "loss": 3.0653512954711912, "num_input_tokens_seen": 4944035840, "step": 9430, "train_runtime": 42843.0316, "train_tokens_per_second": 115398.833 }, { "epoch": 0.510836332151844, "grad_norm": 0.16538918018341064, "learning_rate": 0.0027700496655166614, "loss": 3.067237663269043, "num_input_tokens_seen": 4949278720, "step": 9440, "train_runtime": 42888.2044, "train_tokens_per_second": 115399.532 }, { "epoch": 0.5113774723342082, "grad_norm": 0.14385169744491577, "learning_rate": 0.002766118434006332, "loss": 3.078049087524414, "num_input_tokens_seen": 4954521600, "step": 9450, "train_runtime": 42933.3584, "train_tokens_per_second": 115400.281 }, { "epoch": 0.5119186125165724, "grad_norm": 0.15073780715465546, "learning_rate": 0.0027621871532871657, "loss": 3.06368350982666, "num_input_tokens_seen": 4959764480, "step": 9460, "train_runtime": 42978.5149, "train_tokens_per_second": 115401.02 }, { "epoch": 0.5124597526989366, "grad_norm": 0.15621446073055267, "learning_rate": 0.0027582558353611802, "loss": 3.0653354644775392, "num_input_tokens_seen": 4965007360, "step": 9470, "train_runtime": 43023.685, "train_tokens_per_second": 115401.722 }, { "epoch": 0.5130008928813009, "grad_norm": 0.15570929646492004, "learning_rate": 0.0027543244922305105, "loss": 3.0613819122314454, "num_input_tokens_seen": 4970250240, "step": 9480, "train_runtime": 43068.8573, "train_tokens_per_second": 115402.417 }, { "epoch": 0.5135420330636652, "grad_norm": 0.1389538198709488, "learning_rate": 0.0027503931358973644, "loss": 3.0687282562255858, "num_input_tokens_seen": 4975493120, "step": 9490, "train_runtime": 43114.0132, "train_tokens_per_second": 115403.154 }, { "epoch": 0.5140831732460294, "grad_norm": 0.15439286828041077, "learning_rate": 0.002746461778363992, "loss": 3.0685733795166015, "num_input_tokens_seen": 4980736000, "step": 9500, "train_runtime": 43159.1877, "train_tokens_per_second": 115403.84 }, { "epoch": 0.5140831732460294, "eval_loss": 3.021251916885376, "eval_runtime": 1.9829, "eval_samples_per_second": 252.155, "eval_steps_per_second": 4.034, "num_input_tokens_seen": 4980736000, "step": 9500 }, { "epoch": 0.5146243134283937, "grad_norm": 0.16106902062892914, "learning_rate": 0.0027425304316326484, "loss": 3.076310729980469, "num_input_tokens_seen": 4985978880, "step": 9510, "train_runtime": 43206.3354, "train_tokens_per_second": 115399.254 }, { "epoch": 0.5151654536107578, "grad_norm": 0.16451045870780945, "learning_rate": 0.0027385991077055532, "loss": 3.0650386810302734, "num_input_tokens_seen": 4991221760, "step": 9520, "train_runtime": 43251.4993, "train_tokens_per_second": 115399.971 }, { "epoch": 0.5157065937931221, "grad_norm": 0.141593337059021, "learning_rate": 0.002734667818584858, "loss": 3.0678850173950196, "num_input_tokens_seen": 4996464640, "step": 9530, "train_runtime": 43296.675, "train_tokens_per_second": 115400.655 }, { "epoch": 0.5162477339754864, "grad_norm": 0.15153639018535614, "learning_rate": 0.002730736576272606, "loss": 3.0637826919555664, "num_input_tokens_seen": 5001707520, "step": 9540, "train_runtime": 43345.4631, "train_tokens_per_second": 115391.719 }, { "epoch": 0.5167888741578506, "grad_norm": 0.1510300487279892, "learning_rate": 0.0027268053927707015, "loss": 3.066213607788086, "num_input_tokens_seen": 5006950400, "step": 9550, "train_runtime": 43390.6485, "train_tokens_per_second": 115392.385 }, { "epoch": 0.5173300143402149, "grad_norm": 0.14986655116081238, "learning_rate": 0.0027228742800808657, "loss": 3.069229507446289, "num_input_tokens_seen": 5012193280, "step": 9560, "train_runtime": 43435.8008, "train_tokens_per_second": 115393.136 }, { "epoch": 0.517871154522579, "grad_norm": 0.15248462557792664, "learning_rate": 0.002718943250204604, "loss": 3.0567092895507812, "num_input_tokens_seen": 5017436160, "step": 9570, "train_runtime": 43480.9543, "train_tokens_per_second": 115393.883 }, { "epoch": 0.5184122947049433, "grad_norm": 0.1401718556880951, "learning_rate": 0.0027150123151431717, "loss": 3.0642112731933593, "num_input_tokens_seen": 5022679040, "step": 9580, "train_runtime": 43526.1319, "train_tokens_per_second": 115394.565 }, { "epoch": 0.5189534348873076, "grad_norm": 0.1594904363155365, "learning_rate": 0.002711081486897532, "loss": 3.077428436279297, "num_input_tokens_seen": 5027921920, "step": 9590, "train_runtime": 43571.3028, "train_tokens_per_second": 115395.262 }, { "epoch": 0.5194945750696718, "grad_norm": 0.16368508338928223, "learning_rate": 0.0027071507774683217, "loss": 3.0642780303955077, "num_input_tokens_seen": 5033164800, "step": 9600, "train_runtime": 43616.4759, "train_tokens_per_second": 115395.953 }, { "epoch": 0.5200357152520361, "grad_norm": 0.15867675840854645, "learning_rate": 0.0027032201988558165, "loss": 3.056943893432617, "num_input_tokens_seen": 5038407680, "step": 9610, "train_runtime": 43661.6654, "train_tokens_per_second": 115396.599 }, { "epoch": 0.5205768554344002, "grad_norm": 0.14183756709098816, "learning_rate": 0.0026992897630598927, "loss": 3.0706558227539062, "num_input_tokens_seen": 5043650560, "step": 9620, "train_runtime": 43706.8375, "train_tokens_per_second": 115397.289 }, { "epoch": 0.5211179956167645, "grad_norm": 0.15698400139808655, "learning_rate": 0.002695359482079989, "loss": 3.0621952056884765, "num_input_tokens_seen": 5048893440, "step": 9630, "train_runtime": 43752.0038, "train_tokens_per_second": 115397.993 }, { "epoch": 0.5216591357991288, "grad_norm": 0.15258745849132538, "learning_rate": 0.002691429367915072, "loss": 3.0683521270751952, "num_input_tokens_seen": 5054136320, "step": 9640, "train_runtime": 43797.1921, "train_tokens_per_second": 115398.638 }, { "epoch": 0.522200275981493, "grad_norm": 0.1572786569595337, "learning_rate": 0.0026874994325636016, "loss": 3.0657506942749024, "num_input_tokens_seen": 5059379200, "step": 9650, "train_runtime": 43842.367, "train_tokens_per_second": 115399.317 }, { "epoch": 0.5227414161638573, "grad_norm": 0.1440490484237671, "learning_rate": 0.002683569688023488, "loss": 3.057964324951172, "num_input_tokens_seen": 5064622080, "step": 9660, "train_runtime": 43887.5251, "train_tokens_per_second": 115400.038 }, { "epoch": 0.5232825563462215, "grad_norm": 0.1437375247478485, "learning_rate": 0.002679640146292061, "loss": 3.067335510253906, "num_input_tokens_seen": 5069864960, "step": 9670, "train_runtime": 43932.6993, "train_tokens_per_second": 115400.716 }, { "epoch": 0.5238236965285857, "grad_norm": 0.14964227378368378, "learning_rate": 0.0026757108193660294, "loss": 3.0661109924316405, "num_input_tokens_seen": 5075107840, "step": 9680, "train_runtime": 43977.8845, "train_tokens_per_second": 115401.364 }, { "epoch": 0.52436483671095, "grad_norm": 0.15807990729808807, "learning_rate": 0.0026717817192414496, "loss": 3.0581291198730467, "num_input_tokens_seen": 5080350720, "step": 9690, "train_runtime": 44023.0423, "train_tokens_per_second": 115402.082 }, { "epoch": 0.5249059768933142, "grad_norm": 0.1513347625732422, "learning_rate": 0.0026678528579136833, "loss": 3.0648067474365233, "num_input_tokens_seen": 5085593600, "step": 9700, "train_runtime": 44068.1745, "train_tokens_per_second": 115402.865 }, { "epoch": 0.5254471170756785, "grad_norm": 0.13990284502506256, "learning_rate": 0.002663924247377361, "loss": 3.06469841003418, "num_input_tokens_seen": 5090836480, "step": 9710, "train_runtime": 44113.2866, "train_tokens_per_second": 115403.7 }, { "epoch": 0.5259882572580427, "grad_norm": 0.16054664552211761, "learning_rate": 0.002659995899626353, "loss": 3.070522689819336, "num_input_tokens_seen": 5096079360, "step": 9720, "train_runtime": 44158.4184, "train_tokens_per_second": 115404.481 }, { "epoch": 0.5265293974404069, "grad_norm": 0.15888893604278564, "learning_rate": 0.0026560678266537223, "loss": 3.061862564086914, "num_input_tokens_seen": 5101322240, "step": 9730, "train_runtime": 44203.5783, "train_tokens_per_second": 115405.188 }, { "epoch": 0.5270705376227712, "grad_norm": 0.1485973447561264, "learning_rate": 0.002652140040451696, "loss": 3.0686100006103514, "num_input_tokens_seen": 5106565120, "step": 9740, "train_runtime": 44248.7463, "train_tokens_per_second": 115405.871 }, { "epoch": 0.5276116778051354, "grad_norm": 0.1576089709997177, "learning_rate": 0.002648212553011623, "loss": 3.062734603881836, "num_input_tokens_seen": 5111808000, "step": 9750, "train_runtime": 44293.9122, "train_tokens_per_second": 115406.559 }, { "epoch": 0.5281528179874997, "grad_norm": 0.14466626942157745, "learning_rate": 0.0026442853763239444, "loss": 3.0534576416015624, "num_input_tokens_seen": 5117050880, "step": 9760, "train_runtime": 44339.0625, "train_tokens_per_second": 115407.286 }, { "epoch": 0.5286939581698639, "grad_norm": 0.13870450854301453, "learning_rate": 0.0026403585223781483, "loss": 3.0488052368164062, "num_input_tokens_seen": 5122293760, "step": 9770, "train_runtime": 44384.2076, "train_tokens_per_second": 115408.025 }, { "epoch": 0.5292350983522282, "grad_norm": 0.15322810411453247, "learning_rate": 0.0026364320031627385, "loss": 3.056787109375, "num_input_tokens_seen": 5127536640, "step": 9780, "train_runtime": 44429.3763, "train_tokens_per_second": 115408.702 }, { "epoch": 0.5297762385345924, "grad_norm": 0.1526278853416443, "learning_rate": 0.0026325058306652, "loss": 3.068254089355469, "num_input_tokens_seen": 5132779520, "step": 9790, "train_runtime": 44474.5291, "train_tokens_per_second": 115409.418 }, { "epoch": 0.5303173787169566, "grad_norm": 0.14957252144813538, "learning_rate": 0.002628580016871954, "loss": 3.0630029678344726, "num_input_tokens_seen": 5138022400, "step": 9800, "train_runtime": 44519.6838, "train_tokens_per_second": 115410.128 }, { "epoch": 0.5308585188993209, "grad_norm": 0.1606789082288742, "learning_rate": 0.002624654573768332, "loss": 3.0618038177490234, "num_input_tokens_seen": 5143265280, "step": 9810, "train_runtime": 44564.8296, "train_tokens_per_second": 115410.859 }, { "epoch": 0.5313996590816851, "grad_norm": 0.148385152220726, "learning_rate": 0.002620729513338529, "loss": 3.06335334777832, "num_input_tokens_seen": 5148508160, "step": 9820, "train_runtime": 44609.9754, "train_tokens_per_second": 115411.589 }, { "epoch": 0.5319407992640494, "grad_norm": 0.1520962417125702, "learning_rate": 0.002616804847565574, "loss": 3.061989593505859, "num_input_tokens_seen": 5153751040, "step": 9830, "train_runtime": 44655.138, "train_tokens_per_second": 115412.275 }, { "epoch": 0.5324819394464136, "grad_norm": 0.14803871512413025, "learning_rate": 0.002612880588431294, "loss": 3.062520980834961, "num_input_tokens_seen": 5158993920, "step": 9840, "train_runtime": 44700.2918, "train_tokens_per_second": 115412.981 }, { "epoch": 0.5330230796287778, "grad_norm": 0.14693519473075867, "learning_rate": 0.002608956747916268, "loss": 3.053732681274414, "num_input_tokens_seen": 5164236800, "step": 9850, "train_runtime": 44745.4454, "train_tokens_per_second": 115413.686 }, { "epoch": 0.5335642198111421, "grad_norm": 0.14247646927833557, "learning_rate": 0.0026050333379998014, "loss": 3.0687253952026365, "num_input_tokens_seen": 5169479680, "step": 9860, "train_runtime": 44790.614, "train_tokens_per_second": 115414.352 }, { "epoch": 0.5341053599935063, "grad_norm": 0.1414949595928192, "learning_rate": 0.0026011103706598867, "loss": 3.0667953491210938, "num_input_tokens_seen": 5174722560, "step": 9870, "train_runtime": 44835.7697, "train_tokens_per_second": 115415.049 }, { "epoch": 0.5346465001758706, "grad_norm": 0.15308037400245667, "learning_rate": 0.00259718785787316, "loss": 3.0632091522216798, "num_input_tokens_seen": 5179965440, "step": 9880, "train_runtime": 44880.9299, "train_tokens_per_second": 115415.733 }, { "epoch": 0.5351876403582349, "grad_norm": 0.1401015669107437, "learning_rate": 0.002593265811614872, "loss": 3.054189682006836, "num_input_tokens_seen": 5185208320, "step": 9890, "train_runtime": 44926.1001, "train_tokens_per_second": 115416.391 }, { "epoch": 0.535728780540599, "grad_norm": 0.15150010585784912, "learning_rate": 0.0025893442438588523, "loss": 3.0516624450683594, "num_input_tokens_seen": 5190451200, "step": 9900, "train_runtime": 44971.2488, "train_tokens_per_second": 115417.102 }, { "epoch": 0.5362699207229633, "grad_norm": 0.17257611453533173, "learning_rate": 0.0025854231665774653, "loss": 3.0537059783935545, "num_input_tokens_seen": 5195694080, "step": 9910, "train_runtime": 45016.4036, "train_tokens_per_second": 115417.796 }, { "epoch": 0.5368110609053275, "grad_norm": 0.14506685733795166, "learning_rate": 0.002581502591741579, "loss": 3.0568138122558595, "num_input_tokens_seen": 5200936960, "step": 9920, "train_runtime": 45065.2284, "train_tokens_per_second": 115409.089 }, { "epoch": 0.5373522010876918, "grad_norm": 0.14898552000522614, "learning_rate": 0.002577582531320528, "loss": 3.0490861892700196, "num_input_tokens_seen": 5206179840, "step": 9930, "train_runtime": 45110.3898, "train_tokens_per_second": 115409.773 }, { "epoch": 0.5378933412700561, "grad_norm": 0.14676256477832794, "learning_rate": 0.0025736629972820785, "loss": 3.067533493041992, "num_input_tokens_seen": 5211422720, "step": 9940, "train_runtime": 45155.5461, "train_tokens_per_second": 115410.468 }, { "epoch": 0.5384344814524202, "grad_norm": 0.15546375513076782, "learning_rate": 0.002569744001592385, "loss": 3.053817367553711, "num_input_tokens_seen": 5216665600, "step": 9950, "train_runtime": 45200.7043, "train_tokens_per_second": 115411.157 }, { "epoch": 0.5389756216347845, "grad_norm": 0.16900601983070374, "learning_rate": 0.002565825556215962, "loss": 3.062000274658203, "num_input_tokens_seen": 5221908480, "step": 9960, "train_runtime": 45245.886, "train_tokens_per_second": 115411.785 }, { "epoch": 0.5395167618171487, "grad_norm": 0.14602519571781158, "learning_rate": 0.0025619076731156444, "loss": 3.0598079681396486, "num_input_tokens_seen": 5227151360, "step": 9970, "train_runtime": 45291.0267, "train_tokens_per_second": 115412.516 }, { "epoch": 0.540057901999513, "grad_norm": 0.15503665804862976, "learning_rate": 0.002557990364252547, "loss": 3.047740936279297, "num_input_tokens_seen": 5232394240, "step": 9980, "train_runtime": 45336.1788, "train_tokens_per_second": 115413.217 }, { "epoch": 0.5405990421818773, "grad_norm": 0.15822327136993408, "learning_rate": 0.0025540736415860343, "loss": 3.0622173309326173, "num_input_tokens_seen": 5237637120, "step": 9990, "train_runtime": 45381.3445, "train_tokens_per_second": 115413.882 }, { "epoch": 0.5411401823642414, "grad_norm": 0.13620983064174652, "learning_rate": 0.0025501575170736803, "loss": 3.0480823516845703, "num_input_tokens_seen": 5242880000, "step": 10000, "train_runtime": 45426.5099, "train_tokens_per_second": 115414.546 }, { "epoch": 0.5411401823642414, "eval_loss": 3.0108466148376465, "eval_runtime": 1.9863, "eval_samples_per_second": 251.722, "eval_steps_per_second": 4.028, "num_input_tokens_seen": 5242880000, "step": 10000 }, { "epoch": 0.5416813225466057, "grad_norm": 0.14582324028015137, "learning_rate": 0.002546242002671233, "loss": 3.0488231658935545, "num_input_tokens_seen": 5248122880, "step": 10010, "train_runtime": 45475.9888, "train_tokens_per_second": 115404.261 }, { "epoch": 0.5422224627289699, "grad_norm": 0.14079852402210236, "learning_rate": 0.0025423271103325786, "loss": 3.0604705810546875, "num_input_tokens_seen": 5253365760, "step": 10020, "train_runtime": 45521.0949, "train_tokens_per_second": 115405.083 }, { "epoch": 0.5427636029113342, "grad_norm": 0.15606586635112762, "learning_rate": 0.002538412852009702, "loss": 3.0583091735839845, "num_input_tokens_seen": 5258608640, "step": 10030, "train_runtime": 45566.226, "train_tokens_per_second": 115405.841 }, { "epoch": 0.5433047430936985, "grad_norm": 0.14329582452774048, "learning_rate": 0.002534499239652654, "loss": 3.0502853393554688, "num_input_tokens_seen": 5263851520, "step": 10040, "train_runtime": 45611.3663, "train_tokens_per_second": 115406.574 }, { "epoch": 0.5438458832760626, "grad_norm": 0.14503344893455505, "learning_rate": 0.0025305862852095145, "loss": 3.0582489013671874, "num_input_tokens_seen": 5269094400, "step": 10050, "train_runtime": 45656.5231, "train_tokens_per_second": 115407.264 }, { "epoch": 0.5443870234584269, "grad_norm": 0.153029665350914, "learning_rate": 0.002526674000626352, "loss": 3.052793502807617, "num_input_tokens_seen": 5274337280, "step": 10060, "train_runtime": 45701.6761, "train_tokens_per_second": 115407.962 }, { "epoch": 0.5449281636407911, "grad_norm": 0.14204776287078857, "learning_rate": 0.00252276239784719, "loss": 3.052510643005371, "num_input_tokens_seen": 5279580160, "step": 10070, "train_runtime": 45746.8203, "train_tokens_per_second": 115408.68 }, { "epoch": 0.5454693038231554, "grad_norm": 0.14915040135383606, "learning_rate": 0.0025188514888139757, "loss": 3.058329391479492, "num_input_tokens_seen": 5284823040, "step": 10080, "train_runtime": 45791.9522, "train_tokens_per_second": 115409.429 }, { "epoch": 0.5460104440055197, "grad_norm": 0.14046527445316315, "learning_rate": 0.0025149412854665316, "loss": 3.0549495697021483, "num_input_tokens_seen": 5290065920, "step": 10090, "train_runtime": 45837.0921, "train_tokens_per_second": 115410.155 }, { "epoch": 0.5465515841878839, "grad_norm": 0.15560267865657806, "learning_rate": 0.0025110317997425295, "loss": 3.0544879913330076, "num_input_tokens_seen": 5295308800, "step": 10100, "train_runtime": 45882.2338, "train_tokens_per_second": 115410.876 }, { "epoch": 0.5470927243702481, "grad_norm": 0.15250231325626373, "learning_rate": 0.002507123043577449, "loss": 3.0573678970336915, "num_input_tokens_seen": 5300551680, "step": 10110, "train_runtime": 45927.3738, "train_tokens_per_second": 115411.6 }, { "epoch": 0.5476338645526123, "grad_norm": 0.13873432576656342, "learning_rate": 0.002503215028904543, "loss": 3.0521045684814454, "num_input_tokens_seen": 5305794560, "step": 10120, "train_runtime": 45972.5283, "train_tokens_per_second": 115412.285 }, { "epoch": 0.5481750047349766, "grad_norm": 0.15664595365524292, "learning_rate": 0.0024993077676548014, "loss": 3.0536930084228517, "num_input_tokens_seen": 5311037440, "step": 10130, "train_runtime": 46017.6749, "train_tokens_per_second": 115412.99 }, { "epoch": 0.5487161449173409, "grad_norm": 0.15226289629936218, "learning_rate": 0.002495401271756911, "loss": 3.0586917877197264, "num_input_tokens_seen": 5316280320, "step": 10140, "train_runtime": 46062.8137, "train_tokens_per_second": 115413.712 }, { "epoch": 0.5492572850997051, "grad_norm": 0.14132562279701233, "learning_rate": 0.0024914955531372264, "loss": 3.0555648803710938, "num_input_tokens_seen": 5321523200, "step": 10150, "train_runtime": 46107.9535, "train_tokens_per_second": 115414.43 }, { "epoch": 0.5497984252820693, "grad_norm": 0.15198439359664917, "learning_rate": 0.002487590623719726, "loss": 3.0600481033325195, "num_input_tokens_seen": 5326766080, "step": 10160, "train_runtime": 46153.0991, "train_tokens_per_second": 115415.133 }, { "epoch": 0.5503395654644335, "grad_norm": 0.1487119495868683, "learning_rate": 0.002483686495425979, "loss": 3.0562034606933595, "num_input_tokens_seen": 5332008960, "step": 10170, "train_runtime": 46198.2435, "train_tokens_per_second": 115415.837 }, { "epoch": 0.5508807056467978, "grad_norm": 0.1500309705734253, "learning_rate": 0.00247978318017511, "loss": 3.0558704376220702, "num_input_tokens_seen": 5337251840, "step": 10180, "train_runtime": 46243.3932, "train_tokens_per_second": 115416.527 }, { "epoch": 0.5514218458291621, "grad_norm": 0.14477477967739105, "learning_rate": 0.0024758806898837614, "loss": 3.0584625244140624, "num_input_tokens_seen": 5342494720, "step": 10190, "train_runtime": 46288.5341, "train_tokens_per_second": 115417.237 }, { "epoch": 0.5519629860115263, "grad_norm": 0.14965343475341797, "learning_rate": 0.0024719790364660555, "loss": 3.053845024108887, "num_input_tokens_seen": 5347737600, "step": 10200, "train_runtime": 46333.6743, "train_tokens_per_second": 115417.948 }, { "epoch": 0.5525041261938906, "grad_norm": 0.14595621824264526, "learning_rate": 0.002468078231833561, "loss": 3.0468162536621093, "num_input_tokens_seen": 5352980480, "step": 10210, "train_runtime": 46378.8185, "train_tokens_per_second": 115418.647 }, { "epoch": 0.5530452663762547, "grad_norm": 0.15934494137763977, "learning_rate": 0.002464178287895256, "loss": 3.0428611755371096, "num_input_tokens_seen": 5358223360, "step": 10220, "train_runtime": 46423.9466, "train_tokens_per_second": 115419.385 }, { "epoch": 0.553586406558619, "grad_norm": 0.14214660227298737, "learning_rate": 0.002460279216557488, "loss": 3.0542884826660157, "num_input_tokens_seen": 5363466240, "step": 10230, "train_runtime": 46469.0816, "train_tokens_per_second": 115420.104 }, { "epoch": 0.5541275467409833, "grad_norm": 0.16061817109584808, "learning_rate": 0.0024563810297239448, "loss": 3.0611974716186525, "num_input_tokens_seen": 5368709120, "step": 10240, "train_runtime": 46514.2276, "train_tokens_per_second": 115420.795 }, { "epoch": 0.5546686869233475, "grad_norm": 0.14743222296237946, "learning_rate": 0.0024524837392956088, "loss": 3.0409524917602537, "num_input_tokens_seen": 5373952000, "step": 10250, "train_runtime": 46559.373, "train_tokens_per_second": 115421.486 }, { "epoch": 0.5552098271057118, "grad_norm": 0.145145446062088, "learning_rate": 0.0024485873571707313, "loss": 3.0503875732421877, "num_input_tokens_seen": 5379194880, "step": 10260, "train_runtime": 46604.5364, "train_tokens_per_second": 115422.13 }, { "epoch": 0.5557509672880759, "grad_norm": 0.13785313069820404, "learning_rate": 0.0024446918952447856, "loss": 3.051102066040039, "num_input_tokens_seen": 5384437760, "step": 10270, "train_runtime": 46649.714, "train_tokens_per_second": 115422.739 }, { "epoch": 0.5562921074704402, "grad_norm": 0.15741367638111115, "learning_rate": 0.002440797365410437, "loss": 3.0524486541748046, "num_input_tokens_seen": 5389680640, "step": 10280, "train_runtime": 46694.8766, "train_tokens_per_second": 115423.383 }, { "epoch": 0.5568332476528045, "grad_norm": 0.14624185860157013, "learning_rate": 0.002436903779557509, "loss": 3.041734313964844, "num_input_tokens_seen": 5394923520, "step": 10290, "train_runtime": 46740.04, "train_tokens_per_second": 115424.024 }, { "epoch": 0.5573743878351687, "grad_norm": 0.15357740223407745, "learning_rate": 0.002433011149572938, "loss": 3.05377254486084, "num_input_tokens_seen": 5400166400, "step": 10300, "train_runtime": 46785.1999, "train_tokens_per_second": 115424.673 }, { "epoch": 0.557915528017533, "grad_norm": 0.1404609978199005, "learning_rate": 0.002429119487340744, "loss": 3.0517080307006834, "num_input_tokens_seen": 5405409280, "step": 10310, "train_runtime": 46834.0267, "train_tokens_per_second": 115416.283 }, { "epoch": 0.5584566681998971, "grad_norm": 0.13460350036621094, "learning_rate": 0.0024252288047419933, "loss": 3.047005462646484, "num_input_tokens_seen": 5410652160, "step": 10320, "train_runtime": 46879.179, "train_tokens_per_second": 115416.956 }, { "epoch": 0.5589978083822614, "grad_norm": 0.13896240293979645, "learning_rate": 0.002421339113654761, "loss": 3.0483970642089844, "num_input_tokens_seen": 5415895040, "step": 10330, "train_runtime": 46924.3535, "train_tokens_per_second": 115417.574 }, { "epoch": 0.5595389485646257, "grad_norm": 0.1555888056755066, "learning_rate": 0.0024174504259540965, "loss": 3.045535087585449, "num_input_tokens_seen": 5421137920, "step": 10340, "train_runtime": 46969.5132, "train_tokens_per_second": 115418.227 }, { "epoch": 0.5600800887469899, "grad_norm": 0.1442544311285019, "learning_rate": 0.002413562753511982, "loss": 3.0400226593017576, "num_input_tokens_seen": 5426380800, "step": 10350, "train_runtime": 47014.6917, "train_tokens_per_second": 115418.832 }, { "epoch": 0.5606212289293542, "grad_norm": 0.16144470870494843, "learning_rate": 0.002409676108197302, "loss": 3.044460678100586, "num_input_tokens_seen": 5431623680, "step": 10360, "train_runtime": 47059.8715, "train_tokens_per_second": 115419.433 }, { "epoch": 0.5611623691117184, "grad_norm": 0.1435036063194275, "learning_rate": 0.0024057905018758097, "loss": 3.051218032836914, "num_input_tokens_seen": 5436866560, "step": 10370, "train_runtime": 47105.0206, "train_tokens_per_second": 115420.108 }, { "epoch": 0.5617035092940826, "grad_norm": 0.14437657594680786, "learning_rate": 0.0024019059464100794, "loss": 3.049814987182617, "num_input_tokens_seen": 5442109440, "step": 10380, "train_runtime": 47150.1709, "train_tokens_per_second": 115420.779 }, { "epoch": 0.5622446494764469, "grad_norm": 0.1483716368675232, "learning_rate": 0.0023980224536594803, "loss": 3.051362991333008, "num_input_tokens_seen": 5447352320, "step": 10390, "train_runtime": 47195.3171, "train_tokens_per_second": 115421.458 }, { "epoch": 0.5627857896588111, "grad_norm": 0.1392650008201599, "learning_rate": 0.002394140035480139, "loss": 3.05356502532959, "num_input_tokens_seen": 5452595200, "step": 10400, "train_runtime": 47240.4745, "train_tokens_per_second": 115422.109 }, { "epoch": 0.5633269298411754, "grad_norm": 0.13990668952465057, "learning_rate": 0.002390258703724898, "loss": 3.053313064575195, "num_input_tokens_seen": 5457838080, "step": 10410, "train_runtime": 47285.6159, "train_tokens_per_second": 115422.798 }, { "epoch": 0.5638680700235396, "grad_norm": 0.1470736563205719, "learning_rate": 0.002386378470243285, "loss": 3.050541305541992, "num_input_tokens_seen": 5463080960, "step": 10420, "train_runtime": 47330.7575, "train_tokens_per_second": 115423.485 }, { "epoch": 0.5644092102059038, "grad_norm": 0.15061776340007782, "learning_rate": 0.0023824993468814734, "loss": 3.0488460540771483, "num_input_tokens_seen": 5468323840, "step": 10430, "train_runtime": 47375.9168, "train_tokens_per_second": 115424.127 }, { "epoch": 0.5649503503882681, "grad_norm": 0.14553044736385345, "learning_rate": 0.0023786213454822496, "loss": 3.0426799774169924, "num_input_tokens_seen": 5473566720, "step": 10440, "train_runtime": 47421.0774, "train_tokens_per_second": 115424.765 }, { "epoch": 0.5654914905706323, "grad_norm": 0.1505778431892395, "learning_rate": 0.002374744477884974, "loss": 3.0493221282958984, "num_input_tokens_seen": 5478809600, "step": 10450, "train_runtime": 47466.2416, "train_tokens_per_second": 115425.393 }, { "epoch": 0.5660326307529966, "grad_norm": 0.14359861612319946, "learning_rate": 0.002370868755925543, "loss": 3.048199272155762, "num_input_tokens_seen": 5484052480, "step": 10460, "train_runtime": 47511.3938, "train_tokens_per_second": 115426.049 }, { "epoch": 0.5665737709353608, "grad_norm": 0.1411399245262146, "learning_rate": 0.0023669941914363597, "loss": 3.0590206146240235, "num_input_tokens_seen": 5489295360, "step": 10470, "train_runtime": 47556.5734, "train_tokens_per_second": 115426.638 }, { "epoch": 0.567114911117725, "grad_norm": 0.14005140960216522, "learning_rate": 0.0023631207962462905, "loss": 3.052465057373047, "num_input_tokens_seen": 5494538240, "step": 10480, "train_runtime": 47601.7335, "train_tokens_per_second": 115427.272 }, { "epoch": 0.5676560513000893, "grad_norm": 0.15281735360622406, "learning_rate": 0.0023592485821806314, "loss": 3.0543212890625, "num_input_tokens_seen": 5499781120, "step": 10490, "train_runtime": 47646.8804, "train_tokens_per_second": 115427.937 }, { "epoch": 0.5681971914824535, "grad_norm": 0.15110628306865692, "learning_rate": 0.0023553775610610744, "loss": 3.0445037841796876, "num_input_tokens_seen": 5505024000, "step": 10500, "train_runtime": 47692.0533, "train_tokens_per_second": 115428.538 }, { "epoch": 0.5681971914824535, "eval_loss": 3.0027830600738525, "eval_runtime": 1.9832, "eval_samples_per_second": 252.115, "eval_steps_per_second": 4.034, "num_input_tokens_seen": 5505024000, "step": 10500 }, { "epoch": 0.5687383316648178, "grad_norm": 0.14011938869953156, "learning_rate": 0.0023515077447056705, "loss": 3.0531822204589845, "num_input_tokens_seen": 5510266880, "step": 10510, "train_runtime": 47739.2068, "train_tokens_per_second": 115424.349 }, { "epoch": 0.569279471847182, "grad_norm": 0.14612512290477753, "learning_rate": 0.002347639144928789, "loss": 3.051645278930664, "num_input_tokens_seen": 5515509760, "step": 10520, "train_runtime": 47784.361, "train_tokens_per_second": 115424.998 }, { "epoch": 0.5698206120295463, "grad_norm": 0.15726540982723236, "learning_rate": 0.0023437717735410872, "loss": 3.0477500915527345, "num_input_tokens_seen": 5520752640, "step": 10530, "train_runtime": 47829.5028, "train_tokens_per_second": 115425.675 }, { "epoch": 0.5703617522119105, "grad_norm": 0.14600247144699097, "learning_rate": 0.002339905642349474, "loss": 3.0487768173217775, "num_input_tokens_seen": 5525995520, "step": 10540, "train_runtime": 47874.6792, "train_tokens_per_second": 115426.267 }, { "epoch": 0.5709028923942747, "grad_norm": 0.1526118814945221, "learning_rate": 0.0023360407631570685, "loss": 3.0494321823120116, "num_input_tokens_seen": 5531238400, "step": 10550, "train_runtime": 47919.8527, "train_tokens_per_second": 115426.866 }, { "epoch": 0.571444032576639, "grad_norm": 0.14721056818962097, "learning_rate": 0.0023321771477631693, "loss": 3.046247100830078, "num_input_tokens_seen": 5536481280, "step": 10560, "train_runtime": 47965.0143, "train_tokens_per_second": 115427.492 }, { "epoch": 0.5719851727590032, "grad_norm": 0.15114431083202362, "learning_rate": 0.0023283148079632156, "loss": 3.0407901763916017, "num_input_tokens_seen": 5541724160, "step": 10570, "train_runtime": 48010.1749, "train_tokens_per_second": 115428.119 }, { "epoch": 0.5725263129413675, "grad_norm": 0.14631570875644684, "learning_rate": 0.0023244537555487544, "loss": 3.0476711273193358, "num_input_tokens_seen": 5546967040, "step": 10580, "train_runtime": 48055.3433, "train_tokens_per_second": 115428.726 }, { "epoch": 0.5730674531237318, "grad_norm": 0.14861121773719788, "learning_rate": 0.0023205940023074013, "loss": 3.049782562255859, "num_input_tokens_seen": 5552209920, "step": 10590, "train_runtime": 48100.5075, "train_tokens_per_second": 115429.342 }, { "epoch": 0.5736085933060959, "grad_norm": 0.13904227316379547, "learning_rate": 0.002316735560022804, "loss": 3.055135726928711, "num_input_tokens_seen": 5557452800, "step": 10600, "train_runtime": 48145.6693, "train_tokens_per_second": 115429.962 }, { "epoch": 0.5741497334884602, "grad_norm": 0.13765645027160645, "learning_rate": 0.00231287844047461, "loss": 3.047852325439453, "num_input_tokens_seen": 5562695680, "step": 10610, "train_runtime": 48190.8309, "train_tokens_per_second": 115430.582 }, { "epoch": 0.5746908736708244, "grad_norm": 0.14210833609104156, "learning_rate": 0.0023090226554384288, "loss": 3.0472042083740236, "num_input_tokens_seen": 5567938560, "step": 10620, "train_runtime": 48235.9849, "train_tokens_per_second": 115431.22 }, { "epoch": 0.5752320138531887, "grad_norm": 0.149732306599617, "learning_rate": 0.0023051682166857937, "loss": 3.0454326629638673, "num_input_tokens_seen": 5573181440, "step": 10630, "train_runtime": 48281.1275, "train_tokens_per_second": 115431.883 }, { "epoch": 0.575773154035553, "grad_norm": 0.1392926275730133, "learning_rate": 0.002301315135984128, "loss": 3.0390705108642577, "num_input_tokens_seen": 5578424320, "step": 10640, "train_runtime": 48326.2477, "train_tokens_per_second": 115432.598 }, { "epoch": 0.5763142942179171, "grad_norm": 0.13122397661209106, "learning_rate": 0.0022974634250967113, "loss": 3.036616897583008, "num_input_tokens_seen": 5583667200, "step": 10650, "train_runtime": 48371.3879, "train_tokens_per_second": 115433.264 }, { "epoch": 0.5768554344002814, "grad_norm": 0.14650028944015503, "learning_rate": 0.0022936130957826395, "loss": 3.04638786315918, "num_input_tokens_seen": 5588910080, "step": 10660, "train_runtime": 48416.5301, "train_tokens_per_second": 115433.924 }, { "epoch": 0.5773965745826456, "grad_norm": 0.13940243422985077, "learning_rate": 0.002289764159796791, "loss": 3.049785614013672, "num_input_tokens_seen": 5594152960, "step": 10670, "train_runtime": 48461.6807, "train_tokens_per_second": 115434.564 }, { "epoch": 0.5779377147650099, "grad_norm": 0.13686427474021912, "learning_rate": 0.0022859166288897895, "loss": 3.0434268951416015, "num_input_tokens_seen": 5599395840, "step": 10680, "train_runtime": 48506.8227, "train_tokens_per_second": 115435.222 }, { "epoch": 0.5784788549473742, "grad_norm": 0.14600345492362976, "learning_rate": 0.0022820705148079703, "loss": 3.052047538757324, "num_input_tokens_seen": 5604638720, "step": 10690, "train_runtime": 48555.616, "train_tokens_per_second": 115427.198 }, { "epoch": 0.5790199951297383, "grad_norm": 0.14253467321395874, "learning_rate": 0.0022782258292933432, "loss": 3.0317237854003904, "num_input_tokens_seen": 5609881600, "step": 10700, "train_runtime": 48600.7657, "train_tokens_per_second": 115427.844 }, { "epoch": 0.5795611353121026, "grad_norm": 0.13422608375549316, "learning_rate": 0.0022743825840835542, "loss": 3.038676071166992, "num_input_tokens_seen": 5615124480, "step": 10710, "train_runtime": 48645.891, "train_tokens_per_second": 115428.546 }, { "epoch": 0.5801022754944668, "grad_norm": 0.14621081948280334, "learning_rate": 0.0022705407909118574, "loss": 3.0488845825195314, "num_input_tokens_seen": 5620367360, "step": 10720, "train_runtime": 48691.0214, "train_tokens_per_second": 115429.235 }, { "epoch": 0.5806434156768311, "grad_norm": 0.14328445494174957, "learning_rate": 0.002266700461507069, "loss": 3.039694595336914, "num_input_tokens_seen": 5625610240, "step": 10730, "train_runtime": 48736.1623, "train_tokens_per_second": 115429.898 }, { "epoch": 0.5811845558591954, "grad_norm": 0.1407247930765152, "learning_rate": 0.0022628616075935377, "loss": 3.0443794250488283, "num_input_tokens_seen": 5630853120, "step": 10740, "train_runtime": 48781.2843, "train_tokens_per_second": 115430.604 }, { "epoch": 0.5817256960415595, "grad_norm": 0.15159763395786285, "learning_rate": 0.0022590242408911066, "loss": 3.0392004013061524, "num_input_tokens_seen": 5636096000, "step": 10750, "train_runtime": 48826.4182, "train_tokens_per_second": 115431.281 }, { "epoch": 0.5822668362239238, "grad_norm": 0.14982502162456512, "learning_rate": 0.0022551883731150822, "loss": 3.041204833984375, "num_input_tokens_seen": 5641338880, "step": 10760, "train_runtime": 48871.5523, "train_tokens_per_second": 115431.956 }, { "epoch": 0.582807976406288, "grad_norm": 0.14223578572273254, "learning_rate": 0.0022513540159761927, "loss": 3.058414840698242, "num_input_tokens_seen": 5646581760, "step": 10770, "train_runtime": 48916.6973, "train_tokens_per_second": 115432.604 }, { "epoch": 0.5833491165886523, "grad_norm": 0.14026111364364624, "learning_rate": 0.0022475211811805508, "loss": 3.040976715087891, "num_input_tokens_seen": 5651824640, "step": 10780, "train_runtime": 48961.8335, "train_tokens_per_second": 115433.272 }, { "epoch": 0.5838902567710166, "grad_norm": 0.1421726644039154, "learning_rate": 0.0022436898804296273, "loss": 3.0329113006591797, "num_input_tokens_seen": 5657067520, "step": 10790, "train_runtime": 49006.9777, "train_tokens_per_second": 115433.92 }, { "epoch": 0.5844313969533808, "grad_norm": 0.14624913036823273, "learning_rate": 0.0022398601254202074, "loss": 3.0412059783935548, "num_input_tokens_seen": 5662310400, "step": 10800, "train_runtime": 49052.1103, "train_tokens_per_second": 115434.593 }, { "epoch": 0.584972537135745, "grad_norm": 0.14961951971054077, "learning_rate": 0.0022360319278443555, "loss": 3.039783477783203, "num_input_tokens_seen": 5667553280, "step": 10810, "train_runtime": 49097.2475, "train_tokens_per_second": 115435.255 }, { "epoch": 0.5855136773181092, "grad_norm": 0.13819707930088043, "learning_rate": 0.0022322052993893828, "loss": 3.0379779815673826, "num_input_tokens_seen": 5672796160, "step": 10820, "train_runtime": 49142.4002, "train_tokens_per_second": 115435.879 }, { "epoch": 0.5860548175004735, "grad_norm": 0.1506834328174591, "learning_rate": 0.002228380251737811, "loss": 3.038629341125488, "num_input_tokens_seen": 5678039040, "step": 10830, "train_runtime": 49187.5417, "train_tokens_per_second": 115436.528 }, { "epoch": 0.5865959576828378, "grad_norm": 0.1385858803987503, "learning_rate": 0.0022245567965673346, "loss": 3.0388534545898436, "num_input_tokens_seen": 5683281920, "step": 10840, "train_runtime": 49232.6855, "train_tokens_per_second": 115437.171 }, { "epoch": 0.587137097865202, "grad_norm": 0.14153380692005157, "learning_rate": 0.002220734945550785, "loss": 3.040701675415039, "num_input_tokens_seen": 5688524800, "step": 10850, "train_runtime": 49277.8278, "train_tokens_per_second": 115437.816 }, { "epoch": 0.5876782380475662, "grad_norm": 0.1447627693414688, "learning_rate": 0.002216914710356098, "loss": 3.0347267150878907, "num_input_tokens_seen": 5693767680, "step": 10860, "train_runtime": 49322.9704, "train_tokens_per_second": 115438.459 }, { "epoch": 0.5882193782299304, "grad_norm": 0.15700754523277283, "learning_rate": 0.0022130961026462772, "loss": 3.0408071517944335, "num_input_tokens_seen": 5699010560, "step": 10870, "train_runtime": 49368.1084, "train_tokens_per_second": 115439.111 }, { "epoch": 0.5887605184122947, "grad_norm": 0.1373981535434723, "learning_rate": 0.002209279134079355, "loss": 3.0413372039794924, "num_input_tokens_seen": 5704253440, "step": 10880, "train_runtime": 49413.2385, "train_tokens_per_second": 115439.781 }, { "epoch": 0.589301658594659, "grad_norm": 0.13982577621936798, "learning_rate": 0.0022054638163083607, "loss": 3.0364784240722655, "num_input_tokens_seen": 5709496320, "step": 10890, "train_runtime": 49458.3538, "train_tokens_per_second": 115440.484 }, { "epoch": 0.5898427987770232, "grad_norm": 0.1471603363752365, "learning_rate": 0.0022016501609812846, "loss": 3.02860107421875, "num_input_tokens_seen": 5714739200, "step": 10900, "train_runtime": 49503.4854, "train_tokens_per_second": 115441.148 }, { "epoch": 0.5903839389593875, "grad_norm": 0.140976682305336, "learning_rate": 0.002197838179741041, "loss": 3.048592948913574, "num_input_tokens_seen": 5719982080, "step": 10910, "train_runtime": 49548.6066, "train_tokens_per_second": 115441.835 }, { "epoch": 0.5909250791417516, "grad_norm": 0.1391858160495758, "learning_rate": 0.0021940278842254336, "loss": 3.0438766479492188, "num_input_tokens_seen": 5725224960, "step": 10920, "train_runtime": 49593.7284, "train_tokens_per_second": 115442.519 }, { "epoch": 0.5914662193241159, "grad_norm": 0.13984552025794983, "learning_rate": 0.0021902192860671172, "loss": 3.032778739929199, "num_input_tokens_seen": 5730467840, "step": 10930, "train_runtime": 49638.8587, "train_tokens_per_second": 115443.183 }, { "epoch": 0.5920073595064802, "grad_norm": 0.15091544389724731, "learning_rate": 0.0021864123968935696, "loss": 3.0441143035888674, "num_input_tokens_seen": 5735710720, "step": 10940, "train_runtime": 49683.9877, "train_tokens_per_second": 115443.848 }, { "epoch": 0.5925484996888444, "grad_norm": 0.1554540991783142, "learning_rate": 0.0021826072283270465, "loss": 3.028913116455078, "num_input_tokens_seen": 5740953600, "step": 10950, "train_runtime": 49729.1048, "train_tokens_per_second": 115444.539 }, { "epoch": 0.5930896398712087, "grad_norm": 0.14302626252174377, "learning_rate": 0.0021788037919845526, "loss": 3.0385337829589845, "num_input_tokens_seen": 5746196480, "step": 10960, "train_runtime": 49774.2324, "train_tokens_per_second": 115445.205 }, { "epoch": 0.5936307800535728, "grad_norm": 0.14910683035850525, "learning_rate": 0.0021750020994778054, "loss": 3.0436506271362305, "num_input_tokens_seen": 5751439360, "step": 10970, "train_runtime": 49819.3456, "train_tokens_per_second": 115445.903 }, { "epoch": 0.5941719202359371, "grad_norm": 0.15283723175525665, "learning_rate": 0.002171202162413195, "loss": 3.047803497314453, "num_input_tokens_seen": 5756682240, "step": 10980, "train_runtime": 49864.471, "train_tokens_per_second": 115446.572 }, { "epoch": 0.5947130604183014, "grad_norm": 0.14589117467403412, "learning_rate": 0.002167403992391757, "loss": 3.0425289154052733, "num_input_tokens_seen": 5761925120, "step": 10990, "train_runtime": 49909.6019, "train_tokens_per_second": 115447.227 }, { "epoch": 0.5952542006006656, "grad_norm": 0.1394151747226715, "learning_rate": 0.0021636076010091276, "loss": 3.0472259521484375, "num_input_tokens_seen": 5767168000, "step": 11000, "train_runtime": 49954.7308, "train_tokens_per_second": 115447.885 }, { "epoch": 0.5952542006006656, "eval_loss": 2.9926395416259766, "eval_runtime": 1.9832, "eval_samples_per_second": 252.121, "eval_steps_per_second": 4.034, "num_input_tokens_seen": 5767168000, "step": 11000 }, { "epoch": 0.5957953407830299, "grad_norm": 0.1461019068956375, "learning_rate": 0.002159812999855516, "loss": 3.034767913818359, "num_input_tokens_seen": 5772410880, "step": 11010, "train_runtime": 50004.2941, "train_tokens_per_second": 115438.304 }, { "epoch": 0.596336480965394, "grad_norm": 0.13988643884658813, "learning_rate": 0.002156020200515666, "loss": 3.0288986206054687, "num_input_tokens_seen": 5777653760, "step": 11020, "train_runtime": 50049.449, "train_tokens_per_second": 115438.908 }, { "epoch": 0.5968776211477583, "grad_norm": 0.13354118168354034, "learning_rate": 0.002152229214568817, "loss": 3.0315704345703125, "num_input_tokens_seen": 5782896640, "step": 11030, "train_runtime": 50094.5853, "train_tokens_per_second": 115439.555 }, { "epoch": 0.5974187613301226, "grad_norm": 0.1455395370721817, "learning_rate": 0.0021484400535886766, "loss": 3.0255619049072267, "num_input_tokens_seen": 5788139520, "step": 11040, "train_runtime": 50139.709, "train_tokens_per_second": 115440.23 }, { "epoch": 0.5979599015124868, "grad_norm": 0.1527973711490631, "learning_rate": 0.002144652729143379, "loss": 3.0323816299438477, "num_input_tokens_seen": 5793382400, "step": 11050, "train_runtime": 50184.8687, "train_tokens_per_second": 115440.82 }, { "epoch": 0.5985010416948511, "grad_norm": 0.14457052946090698, "learning_rate": 0.0021408672527954502, "loss": 3.0245555877685546, "num_input_tokens_seen": 5798625280, "step": 11060, "train_runtime": 50230.0184, "train_tokens_per_second": 115441.432 }, { "epoch": 0.5990421818772153, "grad_norm": 0.1389371007680893, "learning_rate": 0.0021370836361017764, "loss": 3.036094856262207, "num_input_tokens_seen": 5803868160, "step": 11070, "train_runtime": 50278.8251, "train_tokens_per_second": 115433.647 }, { "epoch": 0.5995833220595795, "grad_norm": 0.15234215557575226, "learning_rate": 0.002133301890613565, "loss": 3.0295217514038084, "num_input_tokens_seen": 5809111040, "step": 11080, "train_runtime": 50323.981, "train_tokens_per_second": 115434.251 }, { "epoch": 0.6001244622419438, "grad_norm": 0.1413789838552475, "learning_rate": 0.002129522027876311, "loss": 3.021541404724121, "num_input_tokens_seen": 5814353920, "step": 11090, "train_runtime": 50369.1453, "train_tokens_per_second": 115434.834 }, { "epoch": 0.600665602424308, "grad_norm": 0.15141098201274872, "learning_rate": 0.0021257440594297607, "loss": 3.026825714111328, "num_input_tokens_seen": 5819596800, "step": 11100, "train_runtime": 50414.335, "train_tokens_per_second": 115435.358 }, { "epoch": 0.6012067426066723, "grad_norm": 0.1444009244441986, "learning_rate": 0.00212196799680788, "loss": 3.033803939819336, "num_input_tokens_seen": 5824839680, "step": 11110, "train_runtime": 50459.5415, "train_tokens_per_second": 115435.842 }, { "epoch": 0.6017478827890365, "grad_norm": 0.14855210483074188, "learning_rate": 0.002118193851538812, "loss": 3.0400081634521485, "num_input_tokens_seen": 5830082560, "step": 11120, "train_runtime": 50504.7336, "train_tokens_per_second": 115436.359 }, { "epoch": 0.6022890229714007, "grad_norm": 0.13804234564304352, "learning_rate": 0.002114421635144851, "loss": 3.0301578521728514, "num_input_tokens_seen": 5835325440, "step": 11130, "train_runtime": 50549.9225, "train_tokens_per_second": 115436.882 }, { "epoch": 0.602830163153765, "grad_norm": 0.14875908195972443, "learning_rate": 0.0021106513591423967, "loss": 3.032312774658203, "num_input_tokens_seen": 5840568320, "step": 11140, "train_runtime": 50595.1041, "train_tokens_per_second": 115437.421 }, { "epoch": 0.6033713033361292, "grad_norm": 0.14444148540496826, "learning_rate": 0.0021068830350419315, "loss": 3.038595199584961, "num_input_tokens_seen": 5845811200, "step": 11150, "train_runtime": 50640.2767, "train_tokens_per_second": 115437.979 }, { "epoch": 0.6039124435184935, "grad_norm": 0.14319837093353271, "learning_rate": 0.002103116674347975, "loss": 3.0365222930908202, "num_input_tokens_seen": 5851054080, "step": 11160, "train_runtime": 50685.4574, "train_tokens_per_second": 115438.518 }, { "epoch": 0.6044535837008577, "grad_norm": 0.1528901755809784, "learning_rate": 0.002099352288559052, "loss": 3.0367916107177733, "num_input_tokens_seen": 5856296960, "step": 11170, "train_runtime": 50730.6306, "train_tokens_per_second": 115439.073 }, { "epoch": 0.604994723883222, "grad_norm": 0.1397976279258728, "learning_rate": 0.002095589889167659, "loss": 3.026215744018555, "num_input_tokens_seen": 5861539840, "step": 11180, "train_runtime": 50775.8006, "train_tokens_per_second": 115439.634 }, { "epoch": 0.6055358640655862, "grad_norm": 0.1472213864326477, "learning_rate": 0.0020918294876602294, "loss": 3.0274309158325194, "num_input_tokens_seen": 5866782720, "step": 11190, "train_runtime": 50820.9715, "train_tokens_per_second": 115440.192 }, { "epoch": 0.6060770042479504, "grad_norm": 0.14902907609939575, "learning_rate": 0.0020880710955170955, "loss": 3.0351707458496096, "num_input_tokens_seen": 5872025600, "step": 11200, "train_runtime": 50866.1392, "train_tokens_per_second": 115440.757 }, { "epoch": 0.6066181444303147, "grad_norm": 0.1408633142709732, "learning_rate": 0.0020843147242124555, "loss": 3.029071807861328, "num_input_tokens_seen": 5877268480, "step": 11210, "train_runtime": 50911.3053, "train_tokens_per_second": 115441.324 }, { "epoch": 0.6071592846126789, "grad_norm": 0.14094239473342896, "learning_rate": 0.0020805603852143383, "loss": 3.032915496826172, "num_input_tokens_seen": 5882511360, "step": 11220, "train_runtime": 50956.4731, "train_tokens_per_second": 115441.886 }, { "epoch": 0.6077004247950432, "grad_norm": 0.14471176266670227, "learning_rate": 0.0020768080899845687, "loss": 3.0328413009643556, "num_input_tokens_seen": 5887754240, "step": 11230, "train_runtime": 51001.6333, "train_tokens_per_second": 115442.465 }, { "epoch": 0.6082415649774074, "grad_norm": 0.1309700757265091, "learning_rate": 0.00207305784997873, "loss": 3.0344516754150392, "num_input_tokens_seen": 5892997120, "step": 11240, "train_runtime": 51046.7842, "train_tokens_per_second": 115443.063 }, { "epoch": 0.6087827051597716, "grad_norm": 0.14067348837852478, "learning_rate": 0.0020693096766461333, "loss": 3.0316375732421874, "num_input_tokens_seen": 5898240000, "step": 11250, "train_runtime": 51091.9272, "train_tokens_per_second": 115443.678 }, { "epoch": 0.6093238453421359, "grad_norm": 0.14874261617660522, "learning_rate": 0.00206556358142978, "loss": 3.0260826110839845, "num_input_tokens_seen": 5903482880, "step": 11260, "train_runtime": 51137.0716, "train_tokens_per_second": 115444.289 }, { "epoch": 0.6098649855245001, "grad_norm": 0.1435764729976654, "learning_rate": 0.002061819575766326, "loss": 3.0409059524536133, "num_input_tokens_seen": 5908725760, "step": 11270, "train_runtime": 51182.2082, "train_tokens_per_second": 115444.917 }, { "epoch": 0.6104061257068644, "grad_norm": 0.1484087109565735, "learning_rate": 0.002058077671086047, "loss": 3.0283117294311523, "num_input_tokens_seen": 5913968640, "step": 11280, "train_runtime": 51227.347, "train_tokens_per_second": 115445.538 }, { "epoch": 0.6109472658892287, "grad_norm": 0.140775665640831, "learning_rate": 0.002054337878812808, "loss": 3.026752471923828, "num_input_tokens_seen": 5919211520, "step": 11290, "train_runtime": 51272.4758, "train_tokens_per_second": 115446.181 }, { "epoch": 0.6114884060715928, "grad_norm": 0.14487655460834503, "learning_rate": 0.002050600210364022, "loss": 3.0381233215332033, "num_input_tokens_seen": 5924454400, "step": 11300, "train_runtime": 51317.6163, "train_tokens_per_second": 115446.796 }, { "epoch": 0.6120295462539571, "grad_norm": 0.13244298100471497, "learning_rate": 0.0020468646771506184, "loss": 3.037242889404297, "num_input_tokens_seen": 5929697280, "step": 11310, "train_runtime": 51362.7685, "train_tokens_per_second": 115447.384 }, { "epoch": 0.6125706864363213, "grad_norm": 0.13805389404296875, "learning_rate": 0.002043131290577007, "loss": 3.034191703796387, "num_input_tokens_seen": 5934940160, "step": 11320, "train_runtime": 51407.9443, "train_tokens_per_second": 115447.919 }, { "epoch": 0.6131118266186856, "grad_norm": 0.13927042484283447, "learning_rate": 0.002039400062041048, "loss": 3.0405059814453126, "num_input_tokens_seen": 5940183040, "step": 11330, "train_runtime": 51453.1187, "train_tokens_per_second": 115448.455 }, { "epoch": 0.6136529668010499, "grad_norm": 0.13962484896183014, "learning_rate": 0.0020356710029340096, "loss": 3.0331016540527345, "num_input_tokens_seen": 5945425920, "step": 11340, "train_runtime": 51498.2896, "train_tokens_per_second": 115448.998 }, { "epoch": 0.614194106983414, "grad_norm": 0.2101336121559143, "learning_rate": 0.0020319441246405357, "loss": 3.028001594543457, "num_input_tokens_seen": 5950668800, "step": 11350, "train_runtime": 51543.4451, "train_tokens_per_second": 115449.574 }, { "epoch": 0.6147352471657783, "grad_norm": 0.14625418186187744, "learning_rate": 0.0020282194385386173, "loss": 3.0344852447509765, "num_input_tokens_seen": 5955911680, "step": 11360, "train_runtime": 51588.6188, "train_tokens_per_second": 115450.109 }, { "epoch": 0.6152763873481425, "grad_norm": 0.1353636085987091, "learning_rate": 0.002024496955999548, "loss": 3.0306270599365233, "num_input_tokens_seen": 5961154560, "step": 11370, "train_runtime": 51633.7613, "train_tokens_per_second": 115450.713 }, { "epoch": 0.6158175275305068, "grad_norm": 0.1368403434753418, "learning_rate": 0.0020207766883878955, "loss": 3.0311580657958985, "num_input_tokens_seen": 5966397440, "step": 11380, "train_runtime": 51678.9232, "train_tokens_per_second": 115451.272 }, { "epoch": 0.6163586677128711, "grad_norm": 0.14600516855716705, "learning_rate": 0.0020170586470614656, "loss": 3.0117847442626955, "num_input_tokens_seen": 5971640320, "step": 11390, "train_runtime": 51724.0749, "train_tokens_per_second": 115451.853 }, { "epoch": 0.6168998078952352, "grad_norm": 0.14564567804336548, "learning_rate": 0.002013342843371269, "loss": 3.037702941894531, "num_input_tokens_seen": 5976883200, "step": 11400, "train_runtime": 51769.2278, "train_tokens_per_second": 115452.431 }, { "epoch": 0.6174409480775995, "grad_norm": 0.1405801773071289, "learning_rate": 0.0020096292886614825, "loss": 3.0343984603881835, "num_input_tokens_seen": 5982126080, "step": 11410, "train_runtime": 51814.3769, "train_tokens_per_second": 115453.016 }, { "epoch": 0.6179820882599637, "grad_norm": 0.14390794932842255, "learning_rate": 0.002005917994269417, "loss": 3.023337173461914, "num_input_tokens_seen": 5987368960, "step": 11420, "train_runtime": 51859.5123, "train_tokens_per_second": 115453.63 }, { "epoch": 0.618523228442328, "grad_norm": 0.14093852043151855, "learning_rate": 0.0020022089715254847, "loss": 3.0304771423339845, "num_input_tokens_seen": 5992611840, "step": 11430, "train_runtime": 51904.644, "train_tokens_per_second": 115454.252 }, { "epoch": 0.6190643686246923, "grad_norm": 0.1447506844997406, "learning_rate": 0.001998502231753161, "loss": 3.030156898498535, "num_input_tokens_seen": 5997854720, "step": 11440, "train_runtime": 51949.7701, "train_tokens_per_second": 115454.885 }, { "epoch": 0.6196055088070564, "grad_norm": 0.1445121169090271, "learning_rate": 0.001994797786268952, "loss": 3.0251228332519533, "num_input_tokens_seen": 6003097600, "step": 11450, "train_runtime": 51998.5254, "train_tokens_per_second": 115447.458 }, { "epoch": 0.6201466489894207, "grad_norm": 0.15314067900180817, "learning_rate": 0.0019910956463823587, "loss": 3.022572135925293, "num_input_tokens_seen": 6008340480, "step": 11460, "train_runtime": 52043.6251, "train_tokens_per_second": 115448.155 }, { "epoch": 0.6206877891717849, "grad_norm": 0.1409369558095932, "learning_rate": 0.0019873958233958444, "loss": 3.024155044555664, "num_input_tokens_seen": 6013583360, "step": 11470, "train_runtime": 52088.7237, "train_tokens_per_second": 115448.852 }, { "epoch": 0.6212289293541492, "grad_norm": 0.15012867748737335, "learning_rate": 0.0019836983286047995, "loss": 3.0334211349487306, "num_input_tokens_seen": 6018826240, "step": 11480, "train_runtime": 52133.8249, "train_tokens_per_second": 115449.543 }, { "epoch": 0.6217700695365135, "grad_norm": 0.14085648953914642, "learning_rate": 0.0019800031732975032, "loss": 3.0264703750610353, "num_input_tokens_seen": 6024069120, "step": 11490, "train_runtime": 52178.9281, "train_tokens_per_second": 115450.228 }, { "epoch": 0.6223112097188777, "grad_norm": 0.14266955852508545, "learning_rate": 0.001976310368755096, "loss": 3.032570648193359, "num_input_tokens_seen": 6029312000, "step": 11500, "train_runtime": 52224.0378, "train_tokens_per_second": 115450.897 }, { "epoch": 0.6223112097188777, "eval_loss": 2.984975814819336, "eval_runtime": 1.9819, "eval_samples_per_second": 252.288, "eval_steps_per_second": 4.037, "num_input_tokens_seen": 6029312000, "step": 11500 }, { "epoch": 0.6228523499012419, "grad_norm": 0.13947011530399323, "learning_rate": 0.001972619926251541, "loss": 3.0404077529907227, "num_input_tokens_seen": 6034554880, "step": 11510, "train_runtime": 52271.1258, "train_tokens_per_second": 115447.195 }, { "epoch": 0.6233934900836061, "grad_norm": 0.1446669101715088, "learning_rate": 0.001968931857053588, "loss": 3.021891784667969, "num_input_tokens_seen": 6039797760, "step": 11520, "train_runtime": 52316.2406, "train_tokens_per_second": 115447.855 }, { "epoch": 0.6239346302659704, "grad_norm": 0.13946829736232758, "learning_rate": 0.0019652461724207425, "loss": 3.0241966247558594, "num_input_tokens_seen": 6045040640, "step": 11530, "train_runtime": 52361.3587, "train_tokens_per_second": 115448.506 }, { "epoch": 0.6244757704483347, "grad_norm": 0.14458313584327698, "learning_rate": 0.0019615628836052324, "loss": 3.0141645431518556, "num_input_tokens_seen": 6050283520, "step": 11540, "train_runtime": 52406.4606, "train_tokens_per_second": 115449.192 }, { "epoch": 0.6250169106306989, "grad_norm": 0.14436115324497223, "learning_rate": 0.0019578820018519663, "loss": 3.0331525802612305, "num_input_tokens_seen": 6055526400, "step": 11550, "train_runtime": 52451.5704, "train_tokens_per_second": 115449.859 }, { "epoch": 0.6255580508130631, "grad_norm": 0.14012649655342102, "learning_rate": 0.0019542035383985083, "loss": 3.043803405761719, "num_input_tokens_seen": 6060769280, "step": 11560, "train_runtime": 52496.6939, "train_tokens_per_second": 115450.495 }, { "epoch": 0.6260991909954273, "grad_norm": 0.14854469895362854, "learning_rate": 0.0019505275044750371, "loss": 3.0200592041015626, "num_input_tokens_seen": 6066012160, "step": 11570, "train_runtime": 52541.8152, "train_tokens_per_second": 115451.134 }, { "epoch": 0.6266403311777916, "grad_norm": 0.15853960812091827, "learning_rate": 0.0019468539113043166, "loss": 3.020526885986328, "num_input_tokens_seen": 6071255040, "step": 11580, "train_runtime": 52586.931, "train_tokens_per_second": 115451.785 }, { "epoch": 0.6271814713601559, "grad_norm": 0.14197298884391785, "learning_rate": 0.0019431827701016575, "loss": 3.0370616912841797, "num_input_tokens_seen": 6076497920, "step": 11590, "train_runtime": 52632.0665, "train_tokens_per_second": 115452.391 }, { "epoch": 0.6277226115425201, "grad_norm": 0.1305466592311859, "learning_rate": 0.0019395140920748827, "loss": 3.023914337158203, "num_input_tokens_seen": 6081740800, "step": 11600, "train_runtime": 52677.2162, "train_tokens_per_second": 115452.965 }, { "epoch": 0.6282637517248844, "grad_norm": 0.1447763293981552, "learning_rate": 0.0019358478884243008, "loss": 3.024199676513672, "num_input_tokens_seen": 6086983680, "step": 11610, "train_runtime": 52722.3572, "train_tokens_per_second": 115453.557 }, { "epoch": 0.6288048919072485, "grad_norm": 0.14126408100128174, "learning_rate": 0.0019321841703426608, "loss": 3.022255706787109, "num_input_tokens_seen": 6092226560, "step": 11620, "train_runtime": 52767.4813, "train_tokens_per_second": 115454.185 }, { "epoch": 0.6293460320896128, "grad_norm": 0.1334850788116455, "learning_rate": 0.0019285229490151263, "loss": 3.0233287811279297, "num_input_tokens_seen": 6097469440, "step": 11630, "train_runtime": 52812.6435, "train_tokens_per_second": 115454.729 }, { "epoch": 0.6298871722719771, "grad_norm": 0.14635370671749115, "learning_rate": 0.0019248642356192365, "loss": 3.03590087890625, "num_input_tokens_seen": 6102712320, "step": 11640, "train_runtime": 52857.8692, "train_tokens_per_second": 115455.133 }, { "epoch": 0.6304283124543413, "grad_norm": 0.13621026277542114, "learning_rate": 0.0019212080413248762, "loss": 3.023410415649414, "num_input_tokens_seen": 6107955200, "step": 11650, "train_runtime": 52903.1239, "train_tokens_per_second": 115455.473 }, { "epoch": 0.6309694526367056, "grad_norm": 0.14006845653057098, "learning_rate": 0.0019175543772942383, "loss": 3.020222473144531, "num_input_tokens_seen": 6113198080, "step": 11660, "train_runtime": 52948.2709, "train_tokens_per_second": 115456.047 }, { "epoch": 0.6315105928190697, "grad_norm": 0.13746832311153412, "learning_rate": 0.0019139032546817902, "loss": 3.0225994110107424, "num_input_tokens_seen": 6118440960, "step": 11670, "train_runtime": 52993.4226, "train_tokens_per_second": 115456.611 }, { "epoch": 0.632051733001434, "grad_norm": 0.13812102377414703, "learning_rate": 0.0019102546846342411, "loss": 3.0324447631835936, "num_input_tokens_seen": 6123683840, "step": 11680, "train_runtime": 53038.5588, "train_tokens_per_second": 115457.207 }, { "epoch": 0.6325928731837983, "grad_norm": 0.14019303023815155, "learning_rate": 0.0019066086782905097, "loss": 3.022325897216797, "num_input_tokens_seen": 6128926720, "step": 11690, "train_runtime": 53083.7143, "train_tokens_per_second": 115457.76 }, { "epoch": 0.6331340133661625, "grad_norm": 0.1436738818883896, "learning_rate": 0.0019029652467816838, "loss": 3.0244091033935545, "num_input_tokens_seen": 6134169600, "step": 11700, "train_runtime": 53128.868, "train_tokens_per_second": 115458.315 }, { "epoch": 0.6336751535485268, "grad_norm": 0.14594176411628723, "learning_rate": 0.0018993244012309913, "loss": 3.025048828125, "num_input_tokens_seen": 6139412480, "step": 11710, "train_runtime": 53173.9948, "train_tokens_per_second": 115458.929 }, { "epoch": 0.634216293730891, "grad_norm": 0.15385597944259644, "learning_rate": 0.0018956861527537688, "loss": 3.0213130950927733, "num_input_tokens_seen": 6144655360, "step": 11720, "train_runtime": 53219.1405, "train_tokens_per_second": 115459.5 }, { "epoch": 0.6347574339132552, "grad_norm": 0.14445240795612335, "learning_rate": 0.0018920505124574195, "loss": 3.029845428466797, "num_input_tokens_seen": 6149898240, "step": 11730, "train_runtime": 53264.2928, "train_tokens_per_second": 115460.056 }, { "epoch": 0.6352985740956195, "grad_norm": 0.1384369432926178, "learning_rate": 0.001888417491441387, "loss": 3.0266345977783202, "num_input_tokens_seen": 6155141120, "step": 11740, "train_runtime": 53309.4606, "train_tokens_per_second": 115460.578 }, { "epoch": 0.6358397142779837, "grad_norm": 0.14229294657707214, "learning_rate": 0.0018847871007971163, "loss": 3.017131042480469, "num_input_tokens_seen": 6160384000, "step": 11750, "train_runtime": 53354.6359, "train_tokens_per_second": 115461.082 }, { "epoch": 0.636380854460348, "grad_norm": 0.14127928018569946, "learning_rate": 0.0018811593516080234, "loss": 3.021234703063965, "num_input_tokens_seen": 6165626880, "step": 11760, "train_runtime": 53399.8028, "train_tokens_per_second": 115461.604 }, { "epoch": 0.6369219946427122, "grad_norm": 0.13989216089248657, "learning_rate": 0.0018775342549494606, "loss": 3.0207067489624024, "num_input_tokens_seen": 6170869760, "step": 11770, "train_runtime": 53444.9593, "train_tokens_per_second": 115462.147 }, { "epoch": 0.6374631348250764, "grad_norm": 0.141075000166893, "learning_rate": 0.0018739118218886802, "loss": 3.017308807373047, "num_input_tokens_seen": 6176112640, "step": 11780, "train_runtime": 53490.1129, "train_tokens_per_second": 115462.696 }, { "epoch": 0.6380042750074407, "grad_norm": 0.1446276307106018, "learning_rate": 0.0018702920634848035, "loss": 3.0272090911865233, "num_input_tokens_seen": 6181355520, "step": 11790, "train_runtime": 53535.2546, "train_tokens_per_second": 115463.269 }, { "epoch": 0.6385454151898049, "grad_norm": 0.14022940397262573, "learning_rate": 0.001866674990788788, "loss": 3.0206020355224608, "num_input_tokens_seen": 6186598400, "step": 11800, "train_runtime": 53580.432, "train_tokens_per_second": 115463.765 }, { "epoch": 0.6390865553721692, "grad_norm": 0.1391611397266388, "learning_rate": 0.0018630606148433892, "loss": 3.0259307861328124, "num_input_tokens_seen": 6191841280, "step": 11810, "train_runtime": 53625.6025, "train_tokens_per_second": 115464.274 }, { "epoch": 0.6396276955545334, "grad_norm": 0.1375313252210617, "learning_rate": 0.0018594489466831293, "loss": 3.019388198852539, "num_input_tokens_seen": 6197084160, "step": 11820, "train_runtime": 53670.752, "train_tokens_per_second": 115464.828 }, { "epoch": 0.6401688357368976, "grad_norm": 0.13326410949230194, "learning_rate": 0.0018558399973342677, "loss": 3.0195072174072264, "num_input_tokens_seen": 6202327040, "step": 11830, "train_runtime": 53719.4728, "train_tokens_per_second": 115457.705 }, { "epoch": 0.6407099759192619, "grad_norm": 0.1425514966249466, "learning_rate": 0.0018522337778147586, "loss": 3.012344741821289, "num_input_tokens_seen": 6207569920, "step": 11840, "train_runtime": 53764.8643, "train_tokens_per_second": 115457.744 }, { "epoch": 0.6412511161016261, "grad_norm": 0.14373312890529633, "learning_rate": 0.001848630299134224, "loss": 3.0200828552246093, "num_input_tokens_seen": 6212812800, "step": 11850, "train_runtime": 53810.7174, "train_tokens_per_second": 115456.792 }, { "epoch": 0.6417922562839904, "grad_norm": 0.14393045008182526, "learning_rate": 0.0018450295722939214, "loss": 3.0205759048461913, "num_input_tokens_seen": 6218055680, "step": 11860, "train_runtime": 53856.71, "train_tokens_per_second": 115455.543 }, { "epoch": 0.6423333964663546, "grad_norm": 0.13831470906734467, "learning_rate": 0.0018414316082867015, "loss": 3.018105697631836, "num_input_tokens_seen": 6223298560, "step": 11870, "train_runtime": 53902.8725, "train_tokens_per_second": 115453.932 }, { "epoch": 0.6428745366487189, "grad_norm": 0.14368562400341034, "learning_rate": 0.0018378364180969837, "loss": 3.0205171585083006, "num_input_tokens_seen": 6228541440, "step": 11880, "train_runtime": 53949.0344, "train_tokens_per_second": 115452.325 }, { "epoch": 0.6434156768310831, "grad_norm": 0.134961798787117, "learning_rate": 0.0018342440127007181, "loss": 3.0208873748779297, "num_input_tokens_seen": 6233784320, "step": 11890, "train_runtime": 53994.8144, "train_tokens_per_second": 115451.537 }, { "epoch": 0.6439568170134473, "grad_norm": 0.139762744307518, "learning_rate": 0.0018306544030653531, "loss": 3.0138370513916017, "num_input_tokens_seen": 6239027200, "step": 11900, "train_runtime": 54040.0794, "train_tokens_per_second": 115451.851 }, { "epoch": 0.6444979571958116, "grad_norm": 0.15458019077777863, "learning_rate": 0.0018270676001498033, "loss": 3.025080108642578, "num_input_tokens_seen": 6244270080, "step": 11910, "train_runtime": 54085.3315, "train_tokens_per_second": 115452.192 }, { "epoch": 0.6450390973781758, "grad_norm": 0.13538894057273865, "learning_rate": 0.001823483614904411, "loss": 3.016307830810547, "num_input_tokens_seen": 6249512960, "step": 11920, "train_runtime": 54130.6042, "train_tokens_per_second": 115452.489 }, { "epoch": 0.6455802375605401, "grad_norm": 0.13436593115329742, "learning_rate": 0.0018199024582709177, "loss": 3.0229183197021485, "num_input_tokens_seen": 6254755840, "step": 11930, "train_runtime": 54175.8479, "train_tokens_per_second": 115452.846 }, { "epoch": 0.6461213777429043, "grad_norm": 0.1262059211730957, "learning_rate": 0.0018163241411824327, "loss": 3.0243408203125, "num_input_tokens_seen": 6259998720, "step": 11940, "train_runtime": 54221.0877, "train_tokens_per_second": 115453.212 }, { "epoch": 0.6466625179252685, "grad_norm": 0.14077694714069366, "learning_rate": 0.0018127486745633914, "loss": 3.009103775024414, "num_input_tokens_seen": 6265241600, "step": 11950, "train_runtime": 54266.3714, "train_tokens_per_second": 115453.483 }, { "epoch": 0.6472036581076328, "grad_norm": 0.14338544011116028, "learning_rate": 0.001809176069329529, "loss": 3.019987106323242, "num_input_tokens_seen": 6270484480, "step": 11960, "train_runtime": 54311.6327, "train_tokens_per_second": 115453.802 }, { "epoch": 0.647744798289997, "grad_norm": 0.1309393346309662, "learning_rate": 0.001805606336387845, "loss": 3.0178783416748045, "num_input_tokens_seen": 6275727360, "step": 11970, "train_runtime": 54356.8873, "train_tokens_per_second": 115454.134 }, { "epoch": 0.6482859384723613, "grad_norm": 0.1303347647190094, "learning_rate": 0.0018020394866365714, "loss": 3.0253570556640623, "num_input_tokens_seen": 6280970240, "step": 11980, "train_runtime": 54402.1335, "train_tokens_per_second": 115454.484 }, { "epoch": 0.6488270786547256, "grad_norm": 0.14178113639354706, "learning_rate": 0.0017984755309651346, "loss": 3.0267719268798827, "num_input_tokens_seen": 6286213120, "step": 11990, "train_runtime": 54447.3835, "train_tokens_per_second": 115454.825 }, { "epoch": 0.6493682188370897, "grad_norm": 0.1430656611919403, "learning_rate": 0.0017949144802541274, "loss": 3.0143644332885744, "num_input_tokens_seen": 6291456000, "step": 12000, "train_runtime": 54492.6535, "train_tokens_per_second": 115455.123 }, { "epoch": 0.6493682188370897, "eval_loss": 2.9761710166931152, "eval_runtime": 1.9875, "eval_samples_per_second": 251.575, "eval_steps_per_second": 4.025, "num_input_tokens_seen": 6291456000, "step": 12000 }, { "epoch": 0.649909359019454, "grad_norm": 0.14500592648983002, "learning_rate": 0.0017913563453752746, "loss": 3.018670654296875, "num_input_tokens_seen": 6296698880, "step": 12010, "train_runtime": 54542.2937, "train_tokens_per_second": 115446.169 }, { "epoch": 0.6504504992018182, "grad_norm": 0.1448933333158493, "learning_rate": 0.0017878011371913977, "loss": 3.0202388763427734, "num_input_tokens_seen": 6301941760, "step": 12020, "train_runtime": 54587.5091, "train_tokens_per_second": 115446.59 }, { "epoch": 0.6509916393841825, "grad_norm": 0.1533222645521164, "learning_rate": 0.0017842488665563833, "loss": 3.025776672363281, "num_input_tokens_seen": 6307184640, "step": 12030, "train_runtime": 54632.7175, "train_tokens_per_second": 115447.024 }, { "epoch": 0.6515327795665468, "grad_norm": 0.13312490284442902, "learning_rate": 0.0017806995443151524, "loss": 3.0187503814697267, "num_input_tokens_seen": 6312427520, "step": 12040, "train_runtime": 54677.8786, "train_tokens_per_second": 115447.557 }, { "epoch": 0.6520739197489109, "grad_norm": 0.13797084987163544, "learning_rate": 0.0017771531813036206, "loss": 3.019959259033203, "num_input_tokens_seen": 6317670400, "step": 12050, "train_runtime": 54723.0506, "train_tokens_per_second": 115448.067 }, { "epoch": 0.6526150599312752, "grad_norm": 0.13628187775611877, "learning_rate": 0.0017736097883486713, "loss": 3.012210655212402, "num_input_tokens_seen": 6322913280, "step": 12060, "train_runtime": 54768.2437, "train_tokens_per_second": 115448.531 }, { "epoch": 0.6531562001136394, "grad_norm": 0.13764619827270508, "learning_rate": 0.001770069376268119, "loss": 3.0185993194580076, "num_input_tokens_seen": 6328156160, "step": 12070, "train_runtime": 54813.436, "train_tokens_per_second": 115448.996 }, { "epoch": 0.6536973402960037, "grad_norm": 0.14087094366550446, "learning_rate": 0.001766531955870682, "loss": 3.0167076110839846, "num_input_tokens_seen": 6333399040, "step": 12080, "train_runtime": 54858.6498, "train_tokens_per_second": 115449.415 }, { "epoch": 0.654238480478368, "grad_norm": 0.13622906804084778, "learning_rate": 0.0017629975379559405, "loss": 3.021717643737793, "num_input_tokens_seen": 6338641920, "step": 12090, "train_runtime": 54903.8659, "train_tokens_per_second": 115449.829 }, { "epoch": 0.6547796206607321, "grad_norm": 0.13120146095752716, "learning_rate": 0.001759466133314308, "loss": 3.0197391510009766, "num_input_tokens_seen": 6343884800, "step": 12100, "train_runtime": 54949.0889, "train_tokens_per_second": 115450.227 }, { "epoch": 0.6553207608430964, "grad_norm": 0.139594167470932, "learning_rate": 0.001755937752727003, "loss": 3.0223533630371096, "num_input_tokens_seen": 6349127680, "step": 12110, "train_runtime": 54994.2934, "train_tokens_per_second": 115450.664 }, { "epoch": 0.6558619010254606, "grad_norm": 0.15013989806175232, "learning_rate": 0.001752412406966008, "loss": 3.0148881912231444, "num_input_tokens_seen": 6354370560, "step": 12120, "train_runtime": 55039.5071, "train_tokens_per_second": 115451.08 }, { "epoch": 0.6564030412078249, "grad_norm": 0.13876710832118988, "learning_rate": 0.0017488901067940416, "loss": 3.0114933013916017, "num_input_tokens_seen": 6359613440, "step": 12130, "train_runtime": 55084.7162, "train_tokens_per_second": 115451.506 }, { "epoch": 0.6569441813901892, "grad_norm": 0.13125662505626678, "learning_rate": 0.0017453708629645238, "loss": 3.004977226257324, "num_input_tokens_seen": 6364856320, "step": 12140, "train_runtime": 55129.9188, "train_tokens_per_second": 115451.944 }, { "epoch": 0.6574853215725533, "grad_norm": 0.14310745894908905, "learning_rate": 0.0017418546862215448, "loss": 3.0219293594360352, "num_input_tokens_seen": 6370099200, "step": 12150, "train_runtime": 55175.1468, "train_tokens_per_second": 115452.329 }, { "epoch": 0.6580264617549176, "grad_norm": 0.1343064159154892, "learning_rate": 0.0017383415872998303, "loss": 3.017044258117676, "num_input_tokens_seen": 6375342080, "step": 12160, "train_runtime": 55220.3693, "train_tokens_per_second": 115452.724 }, { "epoch": 0.6585676019372818, "grad_norm": 0.13533759117126465, "learning_rate": 0.0017348315769247086, "loss": 3.0149707794189453, "num_input_tokens_seen": 6380584960, "step": 12170, "train_runtime": 55265.5973, "train_tokens_per_second": 115453.108 }, { "epoch": 0.6591087421196461, "grad_norm": 0.1386122703552246, "learning_rate": 0.0017313246658120804, "loss": 3.0143962860107423, "num_input_tokens_seen": 6385827840, "step": 12180, "train_runtime": 55310.8039, "train_tokens_per_second": 115453.535 }, { "epoch": 0.6596498823020104, "grad_norm": 0.1343347579240799, "learning_rate": 0.0017278208646683856, "loss": 3.0179080963134766, "num_input_tokens_seen": 6391070720, "step": 12190, "train_runtime": 55356.0418, "train_tokens_per_second": 115453.896 }, { "epoch": 0.6601910224843746, "grad_norm": 0.14169900119304657, "learning_rate": 0.0017243201841905666, "loss": 3.0247045516967774, "num_input_tokens_seen": 6396313600, "step": 12200, "train_runtime": 55401.2599, "train_tokens_per_second": 115454.299 }, { "epoch": 0.6607321626667388, "grad_norm": 0.13514377176761627, "learning_rate": 0.0017208226350660391, "loss": 3.0104536056518554, "num_input_tokens_seen": 6401556480, "step": 12210, "train_runtime": 55450.8592, "train_tokens_per_second": 115445.578 }, { "epoch": 0.661273302849103, "grad_norm": 0.13756819069385529, "learning_rate": 0.0017173282279726609, "loss": 3.0194664001464844, "num_input_tokens_seen": 6406799360, "step": 12220, "train_runtime": 55496.1437, "train_tokens_per_second": 115445.848 }, { "epoch": 0.6618144430314673, "grad_norm": 0.13056276738643646, "learning_rate": 0.0017138369735786954, "loss": 3.0248437881469727, "num_input_tokens_seen": 6412042240, "step": 12230, "train_runtime": 55541.3669, "train_tokens_per_second": 115446.245 }, { "epoch": 0.6623555832138316, "grad_norm": 0.13981449604034424, "learning_rate": 0.0017103488825427826, "loss": 3.0129575729370117, "num_input_tokens_seen": 6417285120, "step": 12240, "train_runtime": 55586.7139, "train_tokens_per_second": 115446.384 }, { "epoch": 0.6628967233961958, "grad_norm": 0.1439344733953476, "learning_rate": 0.0017068639655139026, "loss": 3.022663116455078, "num_input_tokens_seen": 6422528000, "step": 12250, "train_runtime": 55632.0146, "train_tokens_per_second": 115446.619 }, { "epoch": 0.66343786357856, "grad_norm": 0.15030835568904877, "learning_rate": 0.001703382233131348, "loss": 3.012424850463867, "num_input_tokens_seen": 6427770880, "step": 12260, "train_runtime": 55677.2926, "train_tokens_per_second": 115446.901 }, { "epoch": 0.6639790037609242, "grad_norm": 0.13960550725460052, "learning_rate": 0.0016999036960246871, "loss": 3.0081478118896485, "num_input_tokens_seen": 6433013760, "step": 12270, "train_runtime": 55722.6028, "train_tokens_per_second": 115447.115 }, { "epoch": 0.6645201439432885, "grad_norm": 0.13627994060516357, "learning_rate": 0.0016964283648137329, "loss": 3.0084842681884765, "num_input_tokens_seen": 6438256640, "step": 12280, "train_runtime": 55767.8798, "train_tokens_per_second": 115447.398 }, { "epoch": 0.6650612841256528, "grad_norm": 0.14768123626708984, "learning_rate": 0.0016929562501085123, "loss": 3.013652801513672, "num_input_tokens_seen": 6443499520, "step": 12290, "train_runtime": 55813.1427, "train_tokens_per_second": 115447.71 }, { "epoch": 0.665602424308017, "grad_norm": 0.14207823574543, "learning_rate": 0.0016894873625092333, "loss": 3.0111804962158204, "num_input_tokens_seen": 6448742400, "step": 12300, "train_runtime": 55858.4112, "train_tokens_per_second": 115448.01 }, { "epoch": 0.6661435644903813, "grad_norm": 0.1379329413175583, "learning_rate": 0.0016860217126062479, "loss": 3.0187799453735353, "num_input_tokens_seen": 6453985280, "step": 12310, "train_runtime": 55903.6646, "train_tokens_per_second": 115448.34 }, { "epoch": 0.6666847046727454, "grad_norm": 0.14401622116565704, "learning_rate": 0.0016825593109800264, "loss": 3.0228382110595704, "num_input_tokens_seen": 6459228160, "step": 12320, "train_runtime": 55948.9475, "train_tokens_per_second": 115448.609 }, { "epoch": 0.6672258448551097, "grad_norm": 0.12955419719219208, "learning_rate": 0.0016791001682011227, "loss": 3.0097047805786135, "num_input_tokens_seen": 6464471040, "step": 12330, "train_runtime": 55994.314, "train_tokens_per_second": 115448.705 }, { "epoch": 0.667766985037474, "grad_norm": 0.14710277318954468, "learning_rate": 0.0016756442948301386, "loss": 3.0169065475463865, "num_input_tokens_seen": 6469713920, "step": 12340, "train_runtime": 56039.7555, "train_tokens_per_second": 115448.646 }, { "epoch": 0.6683081252198382, "grad_norm": 0.1326688975095749, "learning_rate": 0.0016721917014176982, "loss": 3.009653663635254, "num_input_tokens_seen": 6474956800, "step": 12350, "train_runtime": 56085.2141, "train_tokens_per_second": 115448.553 }, { "epoch": 0.6688492654022025, "grad_norm": 0.13903285562992096, "learning_rate": 0.0016687423985044109, "loss": 3.019660758972168, "num_input_tokens_seen": 6480199680, "step": 12360, "train_runtime": 56130.7366, "train_tokens_per_second": 115448.328 }, { "epoch": 0.6693904055845666, "grad_norm": 0.13976383209228516, "learning_rate": 0.0016652963966208385, "loss": 3.0172367095947266, "num_input_tokens_seen": 6485442560, "step": 12370, "train_runtime": 56176.2788, "train_tokens_per_second": 115448.063 }, { "epoch": 0.6699315457669309, "grad_norm": 0.13633348047733307, "learning_rate": 0.0016618537062874665, "loss": 3.004638671875, "num_input_tokens_seen": 6490685440, "step": 12380, "train_runtime": 56221.949, "train_tokens_per_second": 115447.535 }, { "epoch": 0.6704726859492952, "grad_norm": 0.14074033498764038, "learning_rate": 0.001658414338014669, "loss": 3.019020843505859, "num_input_tokens_seen": 6495928320, "step": 12390, "train_runtime": 56267.7615, "train_tokens_per_second": 115446.717 }, { "epoch": 0.6710138261316594, "grad_norm": 0.1326296180486679, "learning_rate": 0.0016549783023026808, "loss": 3.0110851287841798, "num_input_tokens_seen": 6501171200, "step": 12400, "train_runtime": 56313.0806, "train_tokens_per_second": 115446.911 }, { "epoch": 0.6715549663140237, "grad_norm": 0.13860943913459778, "learning_rate": 0.001651545609641561, "loss": 3.0090118408203126, "num_input_tokens_seen": 6506414080, "step": 12410, "train_runtime": 56358.3912, "train_tokens_per_second": 115447.122 }, { "epoch": 0.6720961064963878, "grad_norm": 0.1410975605249405, "learning_rate": 0.0016481162705111604, "loss": 3.0008705139160154, "num_input_tokens_seen": 6511656960, "step": 12420, "train_runtime": 56403.6982, "train_tokens_per_second": 115447.341 }, { "epoch": 0.6726372466787521, "grad_norm": 0.13546454906463623, "learning_rate": 0.0016446902953810964, "loss": 3.013086700439453, "num_input_tokens_seen": 6516899840, "step": 12430, "train_runtime": 56448.9891, "train_tokens_per_second": 115447.592 }, { "epoch": 0.6731783868611164, "grad_norm": 0.13547931611537933, "learning_rate": 0.0016412676947107113, "loss": 3.004133605957031, "num_input_tokens_seen": 6522142720, "step": 12440, "train_runtime": 56494.2857, "train_tokens_per_second": 115447.831 }, { "epoch": 0.6737195270434806, "grad_norm": 0.13898716866970062, "learning_rate": 0.0016378484789490479, "loss": 3.015100860595703, "num_input_tokens_seen": 6527385600, "step": 12450, "train_runtime": 56539.5755, "train_tokens_per_second": 115448.083 }, { "epoch": 0.6742606672258449, "grad_norm": 0.1385628879070282, "learning_rate": 0.0016344326585348147, "loss": 3.018421936035156, "num_input_tokens_seen": 6532628480, "step": 12460, "train_runtime": 56584.8917, "train_tokens_per_second": 115448.281 }, { "epoch": 0.674801807408209, "grad_norm": 0.13880495727062225, "learning_rate": 0.001631020243896355, "loss": 3.0016693115234374, "num_input_tokens_seen": 6537871360, "step": 12470, "train_runtime": 56630.196, "train_tokens_per_second": 115448.503 }, { "epoch": 0.6753429475905733, "grad_norm": 0.1371801793575287, "learning_rate": 0.0016276112454516134, "loss": 3.0135356903076174, "num_input_tokens_seen": 6543114240, "step": 12480, "train_runtime": 56675.5074, "train_tokens_per_second": 115448.71 }, { "epoch": 0.6758840877729376, "grad_norm": 0.1398102194070816, "learning_rate": 0.001624205673608104, "loss": 3.0212148666381835, "num_input_tokens_seen": 6548357120, "step": 12490, "train_runtime": 56720.8046, "train_tokens_per_second": 115448.946 }, { "epoch": 0.6764252279553018, "grad_norm": 0.1300211250782013, "learning_rate": 0.0016208035387628825, "loss": 3.0142328262329103, "num_input_tokens_seen": 6553600000, "step": 12500, "train_runtime": 56766.0883, "train_tokens_per_second": 115449.209 }, { "epoch": 0.6764252279553018, "eval_loss": 2.968597412109375, "eval_runtime": 1.9925, "eval_samples_per_second": 250.941, "eval_steps_per_second": 4.015, "num_input_tokens_seen": 6553600000, "step": 12500 }, { "epoch": 0.6769663681376661, "grad_norm": 0.14369215071201324, "learning_rate": 0.0016174048513025103, "loss": 3.0048513412475586, "num_input_tokens_seen": 6558842880, "step": 12510, "train_runtime": 56813.3987, "train_tokens_per_second": 115445.353 }, { "epoch": 0.6775075083200303, "grad_norm": 0.14692343771457672, "learning_rate": 0.0016140096216030232, "loss": 3.0137935638427735, "num_input_tokens_seen": 6564085760, "step": 12520, "train_runtime": 56858.6904, "train_tokens_per_second": 115445.602 }, { "epoch": 0.6780486485023945, "grad_norm": 0.14028270542621613, "learning_rate": 0.0016106178600299001, "loss": 3.010356140136719, "num_input_tokens_seen": 6569328640, "step": 12530, "train_runtime": 56903.9761, "train_tokens_per_second": 115445.863 }, { "epoch": 0.6785897886847588, "grad_norm": 0.12822629511356354, "learning_rate": 0.0016072295769380353, "loss": 3.0003124237060548, "num_input_tokens_seen": 6574571520, "step": 12540, "train_runtime": 56949.26, "train_tokens_per_second": 115446.127 }, { "epoch": 0.679130928867123, "grad_norm": 0.1369100958108902, "learning_rate": 0.0016038447826716993, "loss": 3.0066249847412108, "num_input_tokens_seen": 6579814400, "step": 12550, "train_runtime": 56994.5681, "train_tokens_per_second": 115446.342 }, { "epoch": 0.6796720690494873, "grad_norm": 0.14047878980636597, "learning_rate": 0.001600463487564515, "loss": 3.0145965576171876, "num_input_tokens_seen": 6585057280, "step": 12560, "train_runtime": 57039.861, "train_tokens_per_second": 115446.587 }, { "epoch": 0.6802132092318515, "grad_norm": 0.14242438971996307, "learning_rate": 0.001597085701939419, "loss": 3.0166095733642577, "num_input_tokens_seen": 6590300160, "step": 12570, "train_runtime": 57085.1398, "train_tokens_per_second": 115446.86 }, { "epoch": 0.6807543494142158, "grad_norm": 0.1383470743894577, "learning_rate": 0.0015937114361086369, "loss": 3.0075637817382814, "num_input_tokens_seen": 6595543040, "step": 12580, "train_runtime": 57130.4343, "train_tokens_per_second": 115447.101 }, { "epoch": 0.68129548959658, "grad_norm": 0.1291186362504959, "learning_rate": 0.0015903407003736466, "loss": 3.01377010345459, "num_input_tokens_seen": 6600785920, "step": 12590, "train_runtime": 57180.0264, "train_tokens_per_second": 115438.665 }, { "epoch": 0.6818366297789442, "grad_norm": 0.13580311834812164, "learning_rate": 0.0015869735050251489, "loss": 3.0099231719970705, "num_input_tokens_seen": 6606028800, "step": 12600, "train_runtime": 57225.3092, "train_tokens_per_second": 115438.936 }, { "epoch": 0.6823777699613085, "grad_norm": 0.1437922716140747, "learning_rate": 0.0015836098603430357, "loss": 3.0034923553466797, "num_input_tokens_seen": 6611271680, "step": 12610, "train_runtime": 57270.5349, "train_tokens_per_second": 115439.321 }, { "epoch": 0.6829189101436727, "grad_norm": 0.13526742160320282, "learning_rate": 0.0015802497765963614, "loss": 3.00305061340332, "num_input_tokens_seen": 6616514560, "step": 12620, "train_runtime": 57315.7589, "train_tokens_per_second": 115439.709 }, { "epoch": 0.683460050326037, "grad_norm": 0.1404607594013214, "learning_rate": 0.0015768932640433059, "loss": 3.0041690826416017, "num_input_tokens_seen": 6621757440, "step": 12630, "train_runtime": 57360.9936, "train_tokens_per_second": 115440.076 }, { "epoch": 0.6840011905084012, "grad_norm": 0.13756705820560455, "learning_rate": 0.0015735403329311469, "loss": 2.9982038497924806, "num_input_tokens_seen": 6627000320, "step": 12640, "train_runtime": 57406.2268, "train_tokens_per_second": 115440.444 }, { "epoch": 0.6845423306907654, "grad_norm": 0.14006656408309937, "learning_rate": 0.0015701909934962305, "loss": 3.009762763977051, "num_input_tokens_seen": 6632243200, "step": 12650, "train_runtime": 57451.4583, "train_tokens_per_second": 115440.816 }, { "epoch": 0.6850834708731297, "grad_norm": 0.13317948579788208, "learning_rate": 0.001566845255963934, "loss": 3.0151742935180663, "num_input_tokens_seen": 6637486080, "step": 12660, "train_runtime": 57496.7057, "train_tokens_per_second": 115441.154 }, { "epoch": 0.6856246110554939, "grad_norm": 0.13669337332248688, "learning_rate": 0.0015635031305486417, "loss": 3.000714874267578, "num_input_tokens_seen": 6642728960, "step": 12670, "train_runtime": 57541.9394, "train_tokens_per_second": 115441.52 }, { "epoch": 0.6861657512378582, "grad_norm": 0.13967348635196686, "learning_rate": 0.0015601646274537087, "loss": 3.0043874740600587, "num_input_tokens_seen": 6647971840, "step": 12680, "train_runtime": 57587.1773, "train_tokens_per_second": 115441.877 }, { "epoch": 0.6867068914202225, "grad_norm": 0.13815197348594666, "learning_rate": 0.0015568297568714312, "loss": 3.010976219177246, "num_input_tokens_seen": 6653214720, "step": 12690, "train_runtime": 57632.4045, "train_tokens_per_second": 115442.255 }, { "epoch": 0.6872480316025866, "grad_norm": 0.1381223499774933, "learning_rate": 0.001553498528983015, "loss": 3.013303756713867, "num_input_tokens_seen": 6658457600, "step": 12700, "train_runtime": 57677.6438, "train_tokens_per_second": 115442.608 }, { "epoch": 0.6877891717849509, "grad_norm": 0.13350199162960052, "learning_rate": 0.0015501709539585454, "loss": 3.012788009643555, "num_input_tokens_seen": 6663700480, "step": 12710, "train_runtime": 57722.8853, "train_tokens_per_second": 115442.956 }, { "epoch": 0.6883303119673151, "grad_norm": 0.13979476690292358, "learning_rate": 0.0015468470419569564, "loss": 3.0098241806030273, "num_input_tokens_seen": 6668943360, "step": 12720, "train_runtime": 57768.1112, "train_tokens_per_second": 115443.334 }, { "epoch": 0.6888714521496794, "grad_norm": 0.13748957216739655, "learning_rate": 0.0015435268031259992, "loss": 3.009090805053711, "num_input_tokens_seen": 6674186240, "step": 12730, "train_runtime": 57813.3636, "train_tokens_per_second": 115443.659 }, { "epoch": 0.6894125923320437, "grad_norm": 0.13561367988586426, "learning_rate": 0.0015402102476022095, "loss": 3.008078765869141, "num_input_tokens_seen": 6679429120, "step": 12740, "train_runtime": 57858.572, "train_tokens_per_second": 115444.072 }, { "epoch": 0.6899537325144078, "grad_norm": 0.12914767861366272, "learning_rate": 0.0015368973855108782, "loss": 3.0018003463745115, "num_input_tokens_seen": 6684672000, "step": 12750, "train_runtime": 57903.8186, "train_tokens_per_second": 115444.407 }, { "epoch": 0.6904948726967721, "grad_norm": 0.14038655161857605, "learning_rate": 0.0015335882269660217, "loss": 3.004079818725586, "num_input_tokens_seen": 6689914880, "step": 12760, "train_runtime": 57949.0509, "train_tokens_per_second": 115444.771 }, { "epoch": 0.6910360128791363, "grad_norm": 0.13866056501865387, "learning_rate": 0.001530282782070348, "loss": 3.009323310852051, "num_input_tokens_seen": 6695157760, "step": 12770, "train_runtime": 57994.2931, "train_tokens_per_second": 115445.114 }, { "epoch": 0.6915771530615006, "grad_norm": 0.1286270171403885, "learning_rate": 0.001526981060915229, "loss": 3.000651550292969, "num_input_tokens_seen": 6700400640, "step": 12780, "train_runtime": 58039.518, "train_tokens_per_second": 115445.491 }, { "epoch": 0.6921182932438649, "grad_norm": 0.13248993456363678, "learning_rate": 0.0015236830735806679, "loss": 3.0101812362670897, "num_input_tokens_seen": 6705643520, "step": 12790, "train_runtime": 58084.7779, "train_tokens_per_second": 115445.798 }, { "epoch": 0.692659433426229, "grad_norm": 0.1369810700416565, "learning_rate": 0.0015203888301352675, "loss": 3.004811477661133, "num_input_tokens_seen": 6710886400, "step": 12800, "train_runtime": 58130.0044, "train_tokens_per_second": 115446.171 }, { "epoch": 0.6932005736085933, "grad_norm": 0.14264971017837524, "learning_rate": 0.001517098340636202, "loss": 3.010848808288574, "num_input_tokens_seen": 6716129280, "step": 12810, "train_runtime": 58175.241, "train_tokens_per_second": 115446.523 }, { "epoch": 0.6937417137909575, "grad_norm": 0.1406365931034088, "learning_rate": 0.0015138116151291825, "loss": 3.0090103149414062, "num_input_tokens_seen": 6721372160, "step": 12820, "train_runtime": 58220.4724, "train_tokens_per_second": 115446.885 }, { "epoch": 0.6942828539733218, "grad_norm": 0.13356050848960876, "learning_rate": 0.0015105286636484334, "loss": 2.999258613586426, "num_input_tokens_seen": 6726615040, "step": 12830, "train_runtime": 58265.7054, "train_tokens_per_second": 115447.243 }, { "epoch": 0.6948239941556861, "grad_norm": 0.13091513514518738, "learning_rate": 0.001507249496216654, "loss": 3.005986785888672, "num_input_tokens_seen": 6731857920, "step": 12840, "train_runtime": 58310.9354, "train_tokens_per_second": 115447.606 }, { "epoch": 0.6953651343380503, "grad_norm": 0.1335466355085373, "learning_rate": 0.0015039741228449904, "loss": 2.9974597930908202, "num_input_tokens_seen": 6737100800, "step": 12850, "train_runtime": 58356.1736, "train_tokens_per_second": 115447.953 }, { "epoch": 0.6959062745204145, "grad_norm": 0.1375114917755127, "learning_rate": 0.0015007025535330083, "loss": 3.0074440002441407, "num_input_tokens_seen": 6742343680, "step": 12860, "train_runtime": 58401.3717, "train_tokens_per_second": 115448.379 }, { "epoch": 0.6964474147027787, "grad_norm": 0.15171852707862854, "learning_rate": 0.001497434798268658, "loss": 2.996272659301758, "num_input_tokens_seen": 6747586560, "step": 12870, "train_runtime": 58446.5932, "train_tokens_per_second": 115448.757 }, { "epoch": 0.696988554885143, "grad_norm": 0.13725285232067108, "learning_rate": 0.0014941708670282445, "loss": 3.0174352645874025, "num_input_tokens_seen": 6752829440, "step": 12880, "train_runtime": 58491.8411, "train_tokens_per_second": 115449.083 }, { "epoch": 0.6975296950675073, "grad_norm": 0.1326073855161667, "learning_rate": 0.0014909107697764006, "loss": 3.006754684448242, "num_input_tokens_seen": 6758072320, "step": 12890, "train_runtime": 58537.0682, "train_tokens_per_second": 115449.45 }, { "epoch": 0.6980708352498715, "grad_norm": 0.1453487128019333, "learning_rate": 0.0014876545164660543, "loss": 3.003281021118164, "num_input_tokens_seen": 6763315200, "step": 12900, "train_runtime": 58582.3109, "train_tokens_per_second": 115449.785 }, { "epoch": 0.6986119754322357, "grad_norm": 0.13233183324337006, "learning_rate": 0.001484402117038397, "loss": 3.0117160797119142, "num_input_tokens_seen": 6768558080, "step": 12910, "train_runtime": 58627.5472, "train_tokens_per_second": 115450.132 }, { "epoch": 0.6991531156145999, "grad_norm": 0.1383819729089737, "learning_rate": 0.0014811535814228522, "loss": 3.0003276824951173, "num_input_tokens_seen": 6773800960, "step": 12920, "train_runtime": 58672.7881, "train_tokens_per_second": 115450.47 }, { "epoch": 0.6996942557969642, "grad_norm": 0.13273653388023376, "learning_rate": 0.0014779089195370515, "loss": 3.006727600097656, "num_input_tokens_seen": 6779043840, "step": 12930, "train_runtime": 58718.0154, "train_tokens_per_second": 115450.834 }, { "epoch": 0.7002353959793285, "grad_norm": 0.13412410020828247, "learning_rate": 0.0014746681412867993, "loss": 2.9990608215332033, "num_input_tokens_seen": 6784286720, "step": 12940, "train_runtime": 58763.2242, "train_tokens_per_second": 115451.234 }, { "epoch": 0.7007765361616927, "grad_norm": 0.13567864894866943, "learning_rate": 0.0014714312565660412, "loss": 3.001424789428711, "num_input_tokens_seen": 6789529600, "step": 12950, "train_runtime": 58808.4491, "train_tokens_per_second": 115451.601 }, { "epoch": 0.701317676344057, "grad_norm": 0.12947793304920197, "learning_rate": 0.0014681982752568368, "loss": 2.9996448516845704, "num_input_tokens_seen": 6794772480, "step": 12960, "train_runtime": 58853.6594, "train_tokens_per_second": 115451.997 }, { "epoch": 0.7018588165264211, "grad_norm": 0.1319398730993271, "learning_rate": 0.001464969207229331, "loss": 3.0077224731445313, "num_input_tokens_seen": 6800015360, "step": 12970, "train_runtime": 58898.8938, "train_tokens_per_second": 115452.344 }, { "epoch": 0.7023999567087854, "grad_norm": 0.14026153087615967, "learning_rate": 0.0014617440623417178, "loss": 2.999114227294922, "num_input_tokens_seen": 6805258240, "step": 12980, "train_runtime": 58948.6295, "train_tokens_per_second": 115443.875 }, { "epoch": 0.7029410968911497, "grad_norm": 0.14495210349559784, "learning_rate": 0.0014585228504402185, "loss": 3.005875015258789, "num_input_tokens_seen": 6810501120, "step": 12990, "train_runtime": 58994.0959, "train_tokens_per_second": 115443.775 }, { "epoch": 0.7034822370735139, "grad_norm": 0.13643252849578857, "learning_rate": 0.001455305581359043, "loss": 2.997660255432129, "num_input_tokens_seen": 6815744000, "step": 13000, "train_runtime": 59039.5206, "train_tokens_per_second": 115443.756 }, { "epoch": 0.7034822370735139, "eval_loss": 2.960465669631958, "eval_runtime": 1.987, "eval_samples_per_second": 251.641, "eval_steps_per_second": 4.026, "num_input_tokens_seen": 6815744000, "step": 13000 }, { "epoch": 0.7040233772558782, "grad_norm": 0.130798801779747, "learning_rate": 0.001452092264920367, "loss": 3.0002573013305662, "num_input_tokens_seen": 6820986880, "step": 13010, "train_runtime": 59089.599, "train_tokens_per_second": 115434.645 }, { "epoch": 0.7045645174382423, "grad_norm": 0.13077320158481598, "learning_rate": 0.001448882910934297, "loss": 3.00850830078125, "num_input_tokens_seen": 6826229760, "step": 13020, "train_runtime": 59135.0207, "train_tokens_per_second": 115434.639 }, { "epoch": 0.7051056576206066, "grad_norm": 0.14131614565849304, "learning_rate": 0.0014456775291988434, "loss": 3.0077110290527345, "num_input_tokens_seen": 6831472640, "step": 13030, "train_runtime": 59180.4577, "train_tokens_per_second": 115434.603 }, { "epoch": 0.7056467978029709, "grad_norm": 0.13815636932849884, "learning_rate": 0.0014424761294998883, "loss": 3.00131778717041, "num_input_tokens_seen": 6836715520, "step": 13040, "train_runtime": 59225.9087, "train_tokens_per_second": 115434.54 }, { "epoch": 0.7061879379853351, "grad_norm": 0.1329071819782257, "learning_rate": 0.0014392787216111597, "loss": 2.994339370727539, "num_input_tokens_seen": 6841958400, "step": 13050, "train_runtime": 59271.3336, "train_tokens_per_second": 115434.528 }, { "epoch": 0.7067290781676994, "grad_norm": 0.13561072945594788, "learning_rate": 0.0014360853152941958, "loss": 3.0034358978271483, "num_input_tokens_seen": 6847201280, "step": 13060, "train_runtime": 59316.7359, "train_tokens_per_second": 115434.56 }, { "epoch": 0.7072702183500635, "grad_norm": 0.13618333637714386, "learning_rate": 0.0014328959202983182, "loss": 3.0087270736694336, "num_input_tokens_seen": 6852444160, "step": 13070, "train_runtime": 59362.09, "train_tokens_per_second": 115434.685 }, { "epoch": 0.7078113585324278, "grad_norm": 0.1365492194890976, "learning_rate": 0.0014297105463606044, "loss": 3.0061859130859374, "num_input_tokens_seen": 6857687040, "step": 13080, "train_runtime": 59407.4452, "train_tokens_per_second": 115434.808 }, { "epoch": 0.7083524987147921, "grad_norm": 0.13774985074996948, "learning_rate": 0.001426529203205853, "loss": 3.010288429260254, "num_input_tokens_seen": 6862929920, "step": 13090, "train_runtime": 59452.8193, "train_tokens_per_second": 115434.894 }, { "epoch": 0.7088936388971563, "grad_norm": 0.1349509209394455, "learning_rate": 0.00142335190054656, "loss": 3.000904846191406, "num_input_tokens_seen": 6868172800, "step": 13100, "train_runtime": 59498.1377, "train_tokens_per_second": 115435.089 }, { "epoch": 0.7094347790795206, "grad_norm": 0.1314682513475418, "learning_rate": 0.0014201786480828838, "loss": 3.0022382736206055, "num_input_tokens_seen": 6873415680, "step": 13110, "train_runtime": 59543.4355, "train_tokens_per_second": 115435.322 }, { "epoch": 0.7099759192618847, "grad_norm": 0.14362597465515137, "learning_rate": 0.0014170094555026182, "loss": 2.9901851654052733, "num_input_tokens_seen": 6878658560, "step": 13120, "train_runtime": 59588.6836, "train_tokens_per_second": 115435.652 }, { "epoch": 0.710517059444249, "grad_norm": 0.13301101326942444, "learning_rate": 0.0014138443324811618, "loss": 3.0021732330322264, "num_input_tokens_seen": 6883901440, "step": 13130, "train_runtime": 59633.9351, "train_tokens_per_second": 115435.975 }, { "epoch": 0.7110581996266133, "grad_norm": 0.13076400756835938, "learning_rate": 0.0014106832886814891, "loss": 3.0049604415893554, "num_input_tokens_seen": 6889144320, "step": 13140, "train_runtime": 59679.1572, "train_tokens_per_second": 115436.354 }, { "epoch": 0.7115993398089775, "grad_norm": 0.13057680428028107, "learning_rate": 0.0014075263337541223, "loss": 3.009153938293457, "num_input_tokens_seen": 6894387200, "step": 13150, "train_runtime": 59724.3952, "train_tokens_per_second": 115436.702 }, { "epoch": 0.7121404799913418, "grad_norm": 0.13498692214488983, "learning_rate": 0.0014043734773370997, "loss": 2.996112060546875, "num_input_tokens_seen": 6899630080, "step": 13160, "train_runtime": 59769.5992, "train_tokens_per_second": 115437.115 }, { "epoch": 0.712681620173706, "grad_norm": 0.13407272100448608, "learning_rate": 0.0014012247290559466, "loss": 3.0008213043212892, "num_input_tokens_seen": 6904872960, "step": 13170, "train_runtime": 59814.8054, "train_tokens_per_second": 115437.523 }, { "epoch": 0.7132227603560702, "grad_norm": 0.14042150974273682, "learning_rate": 0.0013980800985236468, "loss": 2.9953586578369142, "num_input_tokens_seen": 6910115840, "step": 13180, "train_runtime": 59859.9779, "train_tokens_per_second": 115437.995 }, { "epoch": 0.7137639005384345, "grad_norm": 0.13807494938373566, "learning_rate": 0.0013949395953406127, "loss": 2.9886444091796873, "num_input_tokens_seen": 6915358720, "step": 13190, "train_runtime": 59905.1537, "train_tokens_per_second": 115438.461 }, { "epoch": 0.7143050407207987, "grad_norm": 0.13666392862796783, "learning_rate": 0.0013918032290946552, "loss": 3.0074825286865234, "num_input_tokens_seen": 6920601600, "step": 13200, "train_runtime": 59950.322, "train_tokens_per_second": 115438.94 }, { "epoch": 0.714846180903163, "grad_norm": 0.12777790427207947, "learning_rate": 0.0013886710093609566, "loss": 2.9995635986328124, "num_input_tokens_seen": 6925844480, "step": 13210, "train_runtime": 59995.4811, "train_tokens_per_second": 115439.436 }, { "epoch": 0.7153873210855272, "grad_norm": 0.13057056069374084, "learning_rate": 0.0013855429457020408, "loss": 2.993345260620117, "num_input_tokens_seen": 6931087360, "step": 13220, "train_runtime": 60040.6669, "train_tokens_per_second": 115439.88 }, { "epoch": 0.7159284612678914, "grad_norm": 0.13309696316719055, "learning_rate": 0.0013824190476677417, "loss": 2.9962528228759764, "num_input_tokens_seen": 6936330240, "step": 13230, "train_runtime": 60085.8338, "train_tokens_per_second": 115440.359 }, { "epoch": 0.7164696014502557, "grad_norm": 0.13253308832645416, "learning_rate": 0.0013792993247951752, "loss": 3.001760482788086, "num_input_tokens_seen": 6941573120, "step": 13240, "train_runtime": 60130.9838, "train_tokens_per_second": 115440.871 }, { "epoch": 0.7170107416326199, "grad_norm": 0.14509917795658112, "learning_rate": 0.001376183786608712, "loss": 2.999083137512207, "num_input_tokens_seen": 6946816000, "step": 13250, "train_runtime": 60176.1243, "train_tokens_per_second": 115441.399 }, { "epoch": 0.7175518818149842, "grad_norm": 0.13013510406017303, "learning_rate": 0.001373072442619947, "loss": 3.0021896362304688, "num_input_tokens_seen": 6952058880, "step": 13260, "train_runtime": 60221.2777, "train_tokens_per_second": 115441.903 }, { "epoch": 0.7180930219973484, "grad_norm": 0.1433565616607666, "learning_rate": 0.0013699653023276715, "loss": 2.999072265625, "num_input_tokens_seen": 6957301760, "step": 13270, "train_runtime": 60266.4098, "train_tokens_per_second": 115442.446 }, { "epoch": 0.7186341621797127, "grad_norm": 0.13696636259555817, "learning_rate": 0.0013668623752178402, "loss": 2.991237258911133, "num_input_tokens_seen": 6962544640, "step": 13280, "train_runtime": 60311.5633, "train_tokens_per_second": 115442.948 }, { "epoch": 0.7191753023620769, "grad_norm": 0.134785458445549, "learning_rate": 0.0013637636707635485, "loss": 3.002344512939453, "num_input_tokens_seen": 6967787520, "step": 13290, "train_runtime": 60356.7015, "train_tokens_per_second": 115443.478 }, { "epoch": 0.7197164425444411, "grad_norm": 0.13965272903442383, "learning_rate": 0.0013606691984249973, "loss": 2.9921356201171876, "num_input_tokens_seen": 6973030400, "step": 13300, "train_runtime": 60401.8497, "train_tokens_per_second": 115443.988 }, { "epoch": 0.7202575827268054, "grad_norm": 0.1369258165359497, "learning_rate": 0.0013575789676494676, "loss": 2.9890642166137695, "num_input_tokens_seen": 6978273280, "step": 13310, "train_runtime": 60447.02, "train_tokens_per_second": 115444.455 }, { "epoch": 0.7207987229091696, "grad_norm": 0.1361692249774933, "learning_rate": 0.0013544929878712931, "loss": 3.0067501068115234, "num_input_tokens_seen": 6983516160, "step": 13320, "train_runtime": 60492.1531, "train_tokens_per_second": 115444.993 }, { "epoch": 0.7213398630915339, "grad_norm": 0.13645213842391968, "learning_rate": 0.0013514112685118279, "loss": 2.99460506439209, "num_input_tokens_seen": 6988759040, "step": 13330, "train_runtime": 60537.2701, "train_tokens_per_second": 115445.56 }, { "epoch": 0.7218810032738981, "grad_norm": 0.13640370965003967, "learning_rate": 0.0013483338189794198, "loss": 3.0064407348632813, "num_input_tokens_seen": 6994001920, "step": 13340, "train_runtime": 60582.4237, "train_tokens_per_second": 115446.057 }, { "epoch": 0.7224221434562623, "grad_norm": 0.13847370445728302, "learning_rate": 0.0013452606486693793, "loss": 2.990389823913574, "num_input_tokens_seen": 6999244800, "step": 13350, "train_runtime": 60627.5832, "train_tokens_per_second": 115446.542 }, { "epoch": 0.7229632836386266, "grad_norm": 0.14565610885620117, "learning_rate": 0.001342191766963955, "loss": 2.9985219955444338, "num_input_tokens_seen": 7004487680, "step": 13360, "train_runtime": 60676.4805, "train_tokens_per_second": 115439.914 }, { "epoch": 0.7235044238209908, "grad_norm": 0.13583402335643768, "learning_rate": 0.0013391271832323016, "loss": 3.000563049316406, "num_input_tokens_seen": 7009730560, "step": 13370, "train_runtime": 60721.6176, "train_tokens_per_second": 115440.445 }, { "epoch": 0.7240455640033551, "grad_norm": 0.13164934515953064, "learning_rate": 0.0013360669068304526, "loss": 2.993762969970703, "num_input_tokens_seen": 7014973440, "step": 13380, "train_runtime": 60766.7453, "train_tokens_per_second": 115440.993 }, { "epoch": 0.7245867041857194, "grad_norm": 0.13159868121147156, "learning_rate": 0.001333010947101289, "loss": 2.9905731201171877, "num_input_tokens_seen": 7020216320, "step": 13390, "train_runtime": 60811.8966, "train_tokens_per_second": 115441.496 }, { "epoch": 0.7251278443680835, "grad_norm": 0.1346818059682846, "learning_rate": 0.001329959313374518, "loss": 3.002712631225586, "num_input_tokens_seen": 7025459200, "step": 13400, "train_runtime": 60857.0386, "train_tokens_per_second": 115442.016 }, { "epoch": 0.7256689845504478, "grad_norm": 0.1322467029094696, "learning_rate": 0.0013269120149666353, "loss": 2.9997226715087892, "num_input_tokens_seen": 7030702080, "step": 13410, "train_runtime": 60902.1814, "train_tokens_per_second": 115442.533 }, { "epoch": 0.726210124732812, "grad_norm": 0.13496780395507812, "learning_rate": 0.0013238690611809029, "loss": 3.00130615234375, "num_input_tokens_seen": 7035944960, "step": 13420, "train_runtime": 60947.3114, "train_tokens_per_second": 115443.074 }, { "epoch": 0.7267512649151763, "grad_norm": 0.13476966321468353, "learning_rate": 0.0013208304613073197, "loss": 2.9966285705566404, "num_input_tokens_seen": 7041187840, "step": 13430, "train_runtime": 60992.4581, "train_tokens_per_second": 115443.582 }, { "epoch": 0.7272924050975406, "grad_norm": 0.13049598038196564, "learning_rate": 0.0013177962246225905, "loss": 3.0012109756469725, "num_input_tokens_seen": 7046430720, "step": 13440, "train_runtime": 61037.614, "train_tokens_per_second": 115444.072 }, { "epoch": 0.7278335452799047, "grad_norm": 0.1286519169807434, "learning_rate": 0.0013147663603901006, "loss": 2.9998191833496093, "num_input_tokens_seen": 7051673600, "step": 13450, "train_runtime": 61082.7378, "train_tokens_per_second": 115444.622 }, { "epoch": 0.728374685462269, "grad_norm": 0.13326317071914673, "learning_rate": 0.0013117408778598853, "loss": 2.980904769897461, "num_input_tokens_seen": 7056916480, "step": 13460, "train_runtime": 61127.8727, "train_tokens_per_second": 115445.151 }, { "epoch": 0.7289158256446332, "grad_norm": 0.13441520929336548, "learning_rate": 0.001308719786268604, "loss": 3.0028324127197266, "num_input_tokens_seen": 7062159360, "step": 13470, "train_runtime": 61173.0008, "train_tokens_per_second": 115445.691 }, { "epoch": 0.7294569658269975, "grad_norm": 0.13160498440265656, "learning_rate": 0.0013057030948395115, "loss": 2.990519332885742, "num_input_tokens_seen": 7067402240, "step": 13480, "train_runtime": 61218.1024, "train_tokens_per_second": 115446.281 }, { "epoch": 0.7299981060093618, "grad_norm": 0.13775858283042908, "learning_rate": 0.001302690812782427, "loss": 3.006916046142578, "num_input_tokens_seen": 7072645120, "step": 13490, "train_runtime": 61263.2414, "train_tokens_per_second": 115446.799 }, { "epoch": 0.7305392461917259, "grad_norm": 0.13651160895824432, "learning_rate": 0.0012996829492937084, "loss": 3.000609016418457, "num_input_tokens_seen": 7077888000, "step": 13500, "train_runtime": 61308.388, "train_tokens_per_second": 115447.302 }, { "epoch": 0.7305392461917259, "eval_loss": 2.9539315700531006, "eval_runtime": 1.9872, "eval_samples_per_second": 251.611, "eval_steps_per_second": 4.026, "num_input_tokens_seen": 7077888000, "step": 13500 }, { "epoch": 0.7310803863740902, "grad_norm": 0.1339404284954071, "learning_rate": 0.001296679513556226, "loss": 2.9880565643310546, "num_input_tokens_seen": 7083130880, "step": 13510, "train_runtime": 61355.5007, "train_tokens_per_second": 115444.105 }, { "epoch": 0.7316215265564544, "grad_norm": 0.1354180872440338, "learning_rate": 0.0012936805147393292, "loss": 2.9919578552246096, "num_input_tokens_seen": 7088373760, "step": 13520, "train_runtime": 61400.641, "train_tokens_per_second": 115444.622 }, { "epoch": 0.7321626667388187, "grad_norm": 0.13503789901733398, "learning_rate": 0.0012906859619988247, "loss": 2.99132080078125, "num_input_tokens_seen": 7093616640, "step": 13530, "train_runtime": 61445.7513, "train_tokens_per_second": 115445.193 }, { "epoch": 0.732703806921183, "grad_norm": 0.13498766720294952, "learning_rate": 0.0012876958644769446, "loss": 2.9880552291870117, "num_input_tokens_seen": 7098859520, "step": 13540, "train_runtime": 61490.8935, "train_tokens_per_second": 115445.704 }, { "epoch": 0.7332449471035472, "grad_norm": 0.13910213112831116, "learning_rate": 0.0012847102313023185, "loss": 2.996448516845703, "num_input_tokens_seen": 7104102400, "step": 13550, "train_runtime": 61536.0395, "train_tokens_per_second": 115446.208 }, { "epoch": 0.7337860872859114, "grad_norm": 0.13978877663612366, "learning_rate": 0.0012817290715899468, "loss": 2.9948408126831056, "num_input_tokens_seen": 7109345280, "step": 13560, "train_runtime": 61581.1749, "train_tokens_per_second": 115446.73 }, { "epoch": 0.7343272274682756, "grad_norm": 0.12929198145866394, "learning_rate": 0.0012787523944411728, "loss": 2.990352821350098, "num_input_tokens_seen": 7114588160, "step": 13570, "train_runtime": 61626.3208, "train_tokens_per_second": 115447.232 }, { "epoch": 0.7348683676506399, "grad_norm": 0.12884965538978577, "learning_rate": 0.001275780208943655, "loss": 2.9938125610351562, "num_input_tokens_seen": 7119831040, "step": 13580, "train_runtime": 61671.467, "train_tokens_per_second": 115447.733 }, { "epoch": 0.7354095078330042, "grad_norm": 0.13231875002384186, "learning_rate": 0.0012728125241713403, "loss": 2.9899265289306642, "num_input_tokens_seen": 7125073920, "step": 13590, "train_runtime": 61716.5949, "train_tokens_per_second": 115448.267 }, { "epoch": 0.7359506480153684, "grad_norm": 0.13000380992889404, "learning_rate": 0.001269849349184432, "loss": 2.997477722167969, "num_input_tokens_seen": 7130316800, "step": 13600, "train_runtime": 61761.7628, "train_tokens_per_second": 115448.725 }, { "epoch": 0.7364917881977326, "grad_norm": 0.13756293058395386, "learning_rate": 0.0012668906930293686, "loss": 2.9921825408935545, "num_input_tokens_seen": 7135559680, "step": 13610, "train_runtime": 61806.8862, "train_tokens_per_second": 115449.266 }, { "epoch": 0.7370329283800968, "grad_norm": 0.134871244430542, "learning_rate": 0.0012639365647387907, "loss": 2.991608238220215, "num_input_tokens_seen": 7140802560, "step": 13620, "train_runtime": 61852.0353, "train_tokens_per_second": 115449.759 }, { "epoch": 0.7375740685624611, "grad_norm": 0.13307398557662964, "learning_rate": 0.0012609869733315145, "loss": 2.994303512573242, "num_input_tokens_seen": 7146045440, "step": 13630, "train_runtime": 61897.1942, "train_tokens_per_second": 115450.232 }, { "epoch": 0.7381152087448254, "grad_norm": 0.1326708197593689, "learning_rate": 0.0012580419278125086, "loss": 2.9904823303222656, "num_input_tokens_seen": 7151288320, "step": 13640, "train_runtime": 61942.3523, "train_tokens_per_second": 115450.706 }, { "epoch": 0.7386563489271896, "grad_norm": 0.13145731389522552, "learning_rate": 0.0012551014371728615, "loss": 2.991769790649414, "num_input_tokens_seen": 7156531200, "step": 13650, "train_runtime": 61987.491, "train_tokens_per_second": 115451.216 }, { "epoch": 0.7391974891095539, "grad_norm": 0.13033975660800934, "learning_rate": 0.0012521655103897556, "loss": 2.9962963104248046, "num_input_tokens_seen": 7161774080, "step": 13660, "train_runtime": 62032.6128, "train_tokens_per_second": 115451.756 }, { "epoch": 0.739738629291918, "grad_norm": 0.13624544441699982, "learning_rate": 0.0012492341564264394, "loss": 2.9916343688964844, "num_input_tokens_seen": 7167016960, "step": 13670, "train_runtime": 62077.7496, "train_tokens_per_second": 115452.268 }, { "epoch": 0.7402797694742823, "grad_norm": 0.12694226205348969, "learning_rate": 0.0012463073842322032, "loss": 2.9956790924072267, "num_input_tokens_seen": 7172259840, "step": 13680, "train_runtime": 62122.8901, "train_tokens_per_second": 115452.772 }, { "epoch": 0.7408209096566466, "grad_norm": 0.14218159019947052, "learning_rate": 0.0012433852027423462, "loss": 2.9924745559692383, "num_input_tokens_seen": 7177502720, "step": 13690, "train_runtime": 62168.0831, "train_tokens_per_second": 115453.177 }, { "epoch": 0.7413620498390108, "grad_norm": 0.13965629041194916, "learning_rate": 0.0012404676208781556, "loss": 2.9898683547973635, "num_input_tokens_seen": 7182745600, "step": 13700, "train_runtime": 62213.3158, "train_tokens_per_second": 115453.509 }, { "epoch": 0.7419031900213751, "grad_norm": 0.13439473509788513, "learning_rate": 0.0012375546475468736, "loss": 2.99302978515625, "num_input_tokens_seen": 7187988480, "step": 13710, "train_runtime": 62258.5518, "train_tokens_per_second": 115453.834 }, { "epoch": 0.7424443302037392, "grad_norm": 0.13322672247886658, "learning_rate": 0.0012346462916416746, "loss": 2.9867807388305665, "num_input_tokens_seen": 7193231360, "step": 13720, "train_runtime": 62303.7184, "train_tokens_per_second": 115454.287 }, { "epoch": 0.7429854703861035, "grad_norm": 0.13469451665878296, "learning_rate": 0.001231742562041635, "loss": 2.9933212280273436, "num_input_tokens_seen": 7198474240, "step": 13730, "train_runtime": 62348.8665, "train_tokens_per_second": 115454.773 }, { "epoch": 0.7435266105684678, "grad_norm": 0.1325179785490036, "learning_rate": 0.001228843467611706, "loss": 2.9945892333984374, "num_input_tokens_seen": 7203717120, "step": 13740, "train_runtime": 62397.9384, "train_tokens_per_second": 115447.999 }, { "epoch": 0.744067750750832, "grad_norm": 0.1386304348707199, "learning_rate": 0.0012259490172026927, "loss": 2.989889907836914, "num_input_tokens_seen": 7208960000, "step": 13750, "train_runtime": 62443.1321, "train_tokens_per_second": 115448.405 }, { "epoch": 0.7446088909331963, "grad_norm": 0.13061648607254028, "learning_rate": 0.0012230592196512174, "loss": 2.986536407470703, "num_input_tokens_seen": 7214202880, "step": 13760, "train_runtime": 62488.3343, "train_tokens_per_second": 115448.795 }, { "epoch": 0.7451500311155604, "grad_norm": 0.12978407740592957, "learning_rate": 0.0012201740837796992, "loss": 2.9931753158569334, "num_input_tokens_seen": 7219445760, "step": 13770, "train_runtime": 62533.544, "train_tokens_per_second": 115449.17 }, { "epoch": 0.7456911712979247, "grad_norm": 0.12974348664283752, "learning_rate": 0.0012172936183963243, "loss": 2.98385009765625, "num_input_tokens_seen": 7224688640, "step": 13780, "train_runtime": 62578.7317, "train_tokens_per_second": 115449.586 }, { "epoch": 0.746232311480289, "grad_norm": 0.1361524909734726, "learning_rate": 0.0012144178322950217, "loss": 2.996071624755859, "num_input_tokens_seen": 7229931520, "step": 13790, "train_runtime": 62623.945, "train_tokens_per_second": 115449.953 }, { "epoch": 0.7467734516626532, "grad_norm": 0.12753413617610931, "learning_rate": 0.0012115467342554353, "loss": 2.989743232727051, "num_input_tokens_seen": 7235174400, "step": 13800, "train_runtime": 62669.1454, "train_tokens_per_second": 115450.344 }, { "epoch": 0.7473145918450175, "grad_norm": 0.1313578486442566, "learning_rate": 0.0012086803330428942, "loss": 2.9922863006591798, "num_input_tokens_seen": 7240417280, "step": 13810, "train_runtime": 62714.3608, "train_tokens_per_second": 115450.707 }, { "epoch": 0.7478557320273816, "grad_norm": 0.13242116570472717, "learning_rate": 0.0012058186374083889, "loss": 2.9887691497802735, "num_input_tokens_seen": 7245660160, "step": 13820, "train_runtime": 62759.5959, "train_tokens_per_second": 115451.033 }, { "epoch": 0.7483968722097459, "grad_norm": 0.1344103366136551, "learning_rate": 0.0012029616560885453, "loss": 2.989380645751953, "num_input_tokens_seen": 7250903040, "step": 13830, "train_runtime": 62804.8179, "train_tokens_per_second": 115451.382 }, { "epoch": 0.7489380123921102, "grad_norm": 0.13286016881465912, "learning_rate": 0.001200109397805595, "loss": 2.9872367858886717, "num_input_tokens_seen": 7256145920, "step": 13840, "train_runtime": 62850.0273, "train_tokens_per_second": 115451.754 }, { "epoch": 0.7494791525744744, "grad_norm": 0.13758355379104614, "learning_rate": 0.0011972618712673526, "loss": 2.9894548416137696, "num_input_tokens_seen": 7261388800, "step": 13850, "train_runtime": 62895.244, "train_tokens_per_second": 115452.113 }, { "epoch": 0.7500202927568387, "grad_norm": 0.13310939073562622, "learning_rate": 0.0011944190851671855, "loss": 2.980154800415039, "num_input_tokens_seen": 7266631680, "step": 13860, "train_runtime": 62940.4589, "train_tokens_per_second": 115452.474 }, { "epoch": 0.7505614329392029, "grad_norm": 0.13724195957183838, "learning_rate": 0.0011915810481839884, "loss": 2.9957542419433594, "num_input_tokens_seen": 7271874560, "step": 13870, "train_runtime": 62985.6674, "train_tokens_per_second": 115452.846 }, { "epoch": 0.7511025731215671, "grad_norm": 0.13776428997516632, "learning_rate": 0.0011887477689821579, "loss": 2.9919281005859375, "num_input_tokens_seen": 7277117440, "step": 13880, "train_runtime": 63030.8734, "train_tokens_per_second": 115453.222 }, { "epoch": 0.7516437133039314, "grad_norm": 0.13441872596740723, "learning_rate": 0.001185919256211564, "loss": 2.9903282165527343, "num_input_tokens_seen": 7282360320, "step": 13890, "train_runtime": 63076.0694, "train_tokens_per_second": 115453.616 }, { "epoch": 0.7521848534862956, "grad_norm": 0.14160217344760895, "learning_rate": 0.001183095518507527, "loss": 2.9950998306274412, "num_input_tokens_seen": 7287603200, "step": 13900, "train_runtime": 63121.2819, "train_tokens_per_second": 115453.98 }, { "epoch": 0.7527259936686599, "grad_norm": 0.13321471214294434, "learning_rate": 0.001180276564490789, "loss": 2.9867202758789064, "num_input_tokens_seen": 7292846080, "step": 13910, "train_runtime": 63166.4818, "train_tokens_per_second": 115454.366 }, { "epoch": 0.7532671338510241, "grad_norm": 0.13260754942893982, "learning_rate": 0.001177462402767485, "loss": 2.9936323165893555, "num_input_tokens_seen": 7298088960, "step": 13920, "train_runtime": 63211.6992, "train_tokens_per_second": 115454.719 }, { "epoch": 0.7538082740333883, "grad_norm": 0.13385504484176636, "learning_rate": 0.0011746530419291235, "loss": 2.9826412200927734, "num_input_tokens_seen": 7303331840, "step": 13930, "train_runtime": 63256.8908, "train_tokens_per_second": 115455.119 }, { "epoch": 0.7543494142157526, "grad_norm": 0.1354595571756363, "learning_rate": 0.0011718484905525526, "loss": 2.9921710968017576, "num_input_tokens_seen": 7308574720, "step": 13940, "train_runtime": 63302.0738, "train_tokens_per_second": 115455.534 }, { "epoch": 0.7548905543981168, "grad_norm": 0.13242025673389435, "learning_rate": 0.0011690487571999377, "loss": 2.9915000915527346, "num_input_tokens_seen": 7313817600, "step": 13950, "train_runtime": 63347.2678, "train_tokens_per_second": 115455.928 }, { "epoch": 0.7554316945804811, "grad_norm": 0.1303345412015915, "learning_rate": 0.0011662538504187375, "loss": 2.992412567138672, "num_input_tokens_seen": 7319060480, "step": 13960, "train_runtime": 63392.4687, "train_tokens_per_second": 115456.309 }, { "epoch": 0.7559728347628453, "grad_norm": 0.1336052417755127, "learning_rate": 0.0011634637787416738, "loss": 2.9856544494628907, "num_input_tokens_seen": 7324303360, "step": 13970, "train_runtime": 63437.6413, "train_tokens_per_second": 115456.742 }, { "epoch": 0.7565139749452096, "grad_norm": 0.13160865008831024, "learning_rate": 0.0011606785506867066, "loss": 2.990740966796875, "num_input_tokens_seen": 7329546240, "step": 13980, "train_runtime": 63482.8312, "train_tokens_per_second": 115457.142 }, { "epoch": 0.7570551151275738, "grad_norm": 0.132036030292511, "learning_rate": 0.0011578981747570086, "loss": 2.9869890213012695, "num_input_tokens_seen": 7334789120, "step": 13990, "train_runtime": 63528.0172, "train_tokens_per_second": 115457.548 }, { "epoch": 0.757596255309938, "grad_norm": 0.13680653274059296, "learning_rate": 0.0011551226594409406, "loss": 2.9875946044921875, "num_input_tokens_seen": 7340032000, "step": 14000, "train_runtime": 63573.1915, "train_tokens_per_second": 115457.976 }, { "epoch": 0.757596255309938, "eval_loss": 2.948127031326294, "eval_runtime": 1.9851, "eval_samples_per_second": 251.872, "eval_steps_per_second": 4.03, "num_input_tokens_seen": 7340032000, "step": 14000 }, { "epoch": 0.7581373954923023, "grad_norm": 0.1333727240562439, "learning_rate": 0.0011523520132120217, "loss": 2.9936281204223634, "num_input_tokens_seen": 7345274880, "step": 14010, "train_runtime": 63622.81, "train_tokens_per_second": 115450.337 }, { "epoch": 0.7586785356746665, "grad_norm": 0.13183613121509552, "learning_rate": 0.0011495862445289092, "loss": 2.9838493347167967, "num_input_tokens_seen": 7350517760, "step": 14020, "train_runtime": 63667.9625, "train_tokens_per_second": 115450.809 }, { "epoch": 0.7592196758570308, "grad_norm": 0.13663019239902496, "learning_rate": 0.0011468253618353661, "loss": 2.9881641387939455, "num_input_tokens_seen": 7355760640, "step": 14030, "train_runtime": 63713.121, "train_tokens_per_second": 115451.269 }, { "epoch": 0.759760816039395, "grad_norm": 0.1334005743265152, "learning_rate": 0.0011440693735602413, "loss": 2.9827747344970703, "num_input_tokens_seen": 7361003520, "step": 14040, "train_runtime": 63758.2642, "train_tokens_per_second": 115451.755 }, { "epoch": 0.7603019562217592, "grad_norm": 0.1363915055990219, "learning_rate": 0.0011413182881174402, "loss": 2.976375961303711, "num_input_tokens_seen": 7366246400, "step": 14050, "train_runtime": 63803.3929, "train_tokens_per_second": 115452.268 }, { "epoch": 0.7608430964041235, "grad_norm": 0.13721340894699097, "learning_rate": 0.0011385721139058986, "loss": 3.0018871307373045, "num_input_tokens_seen": 7371489280, "step": 14060, "train_runtime": 63848.5329, "train_tokens_per_second": 115452.759 }, { "epoch": 0.7613842365864877, "grad_norm": 0.13170303404331207, "learning_rate": 0.0011358308593095617, "loss": 2.9844949722290037, "num_input_tokens_seen": 7376732160, "step": 14070, "train_runtime": 63893.6665, "train_tokens_per_second": 115453.261 }, { "epoch": 0.761925376768852, "grad_norm": 0.13645039498806, "learning_rate": 0.0011330945326973533, "loss": 2.9850318908691404, "num_input_tokens_seen": 7381975040, "step": 14080, "train_runtime": 63938.7823, "train_tokens_per_second": 115453.795 }, { "epoch": 0.7624665169512163, "grad_norm": 0.1297563761472702, "learning_rate": 0.0011303631424231526, "loss": 2.9895225524902345, "num_input_tokens_seen": 7387217920, "step": 14090, "train_runtime": 63983.9157, "train_tokens_per_second": 115454.296 }, { "epoch": 0.7630076571335804, "grad_norm": 0.13698382675647736, "learning_rate": 0.0011276366968257677, "loss": 2.9852466583251953, "num_input_tokens_seen": 7392460800, "step": 14100, "train_runtime": 64029.0446, "train_tokens_per_second": 115454.804 }, { "epoch": 0.7635487973159447, "grad_norm": 0.12868466973304749, "learning_rate": 0.001124915204228913, "loss": 2.982627105712891, "num_input_tokens_seen": 7397703680, "step": 14110, "train_runtime": 64074.169, "train_tokens_per_second": 115455.32 }, { "epoch": 0.7640899374983089, "grad_norm": 0.13413524627685547, "learning_rate": 0.0011221986729411787, "loss": 2.982726287841797, "num_input_tokens_seen": 7402946560, "step": 14120, "train_runtime": 64123.0569, "train_tokens_per_second": 115449.059 }, { "epoch": 0.7646310776806732, "grad_norm": 0.13302487134933472, "learning_rate": 0.0011194871112560113, "loss": 2.9999317169189452, "num_input_tokens_seen": 7408189440, "step": 14130, "train_runtime": 64168.1991, "train_tokens_per_second": 115449.546 }, { "epoch": 0.7651722178630375, "grad_norm": 0.13595032691955566, "learning_rate": 0.001116780527451682, "loss": 2.986163330078125, "num_input_tokens_seen": 7413432320, "step": 14140, "train_runtime": 64213.3563, "train_tokens_per_second": 115450.005 }, { "epoch": 0.7657133580454016, "grad_norm": 0.12740519642829895, "learning_rate": 0.0011140789297912688, "loss": 2.9861852645874025, "num_input_tokens_seen": 7418675200, "step": 14150, "train_runtime": 64258.4713, "train_tokens_per_second": 115450.54 }, { "epoch": 0.7662544982277659, "grad_norm": 0.13032016158103943, "learning_rate": 0.0011113823265226242, "loss": 2.9914901733398436, "num_input_tokens_seen": 7423918080, "step": 14160, "train_runtime": 64303.6051, "train_tokens_per_second": 115451.04 }, { "epoch": 0.7667956384101301, "grad_norm": 0.12856240570545197, "learning_rate": 0.0011086907258783525, "loss": 2.99139404296875, "num_input_tokens_seen": 7429160960, "step": 14170, "train_runtime": 64348.7292, "train_tokens_per_second": 115451.557 }, { "epoch": 0.7673367785924944, "grad_norm": 0.1300676167011261, "learning_rate": 0.001106004136075789, "loss": 2.980759620666504, "num_input_tokens_seen": 7434403840, "step": 14180, "train_runtime": 64393.8763, "train_tokens_per_second": 115452.032 }, { "epoch": 0.7678779187748587, "grad_norm": 0.13340207934379578, "learning_rate": 0.0011033225653169676, "loss": 2.979547882080078, "num_input_tokens_seen": 7439646720, "step": 14190, "train_runtime": 64439.0196, "train_tokens_per_second": 115452.513 }, { "epoch": 0.7684190589572228, "grad_norm": 0.1270836591720581, "learning_rate": 0.0011006460217886007, "loss": 2.9818099975585937, "num_input_tokens_seen": 7444889600, "step": 14200, "train_runtime": 64484.1553, "train_tokens_per_second": 115453.006 }, { "epoch": 0.7689601991395871, "grad_norm": 0.1316118985414505, "learning_rate": 0.001097974513662052, "loss": 2.9830299377441407, "num_input_tokens_seen": 7450132480, "step": 14210, "train_runtime": 64529.2695, "train_tokens_per_second": 115453.538 }, { "epoch": 0.7695013393219513, "grad_norm": 0.13914352655410767, "learning_rate": 0.0010953080490933129, "loss": 2.9925983428955076, "num_input_tokens_seen": 7455375360, "step": 14220, "train_runtime": 64574.3994, "train_tokens_per_second": 115454.041 }, { "epoch": 0.7700424795043156, "grad_norm": 0.13164092600345612, "learning_rate": 0.0010926466362229787, "loss": 2.9863054275512697, "num_input_tokens_seen": 7460618240, "step": 14230, "train_runtime": 64619.5117, "train_tokens_per_second": 115454.575 }, { "epoch": 0.7705836196866799, "grad_norm": 0.1346326619386673, "learning_rate": 0.001089990283176218, "loss": 2.9905773162841798, "num_input_tokens_seen": 7465861120, "step": 14240, "train_runtime": 64664.6395, "train_tokens_per_second": 115455.08 }, { "epoch": 0.771124759869044, "grad_norm": 0.1283544898033142, "learning_rate": 0.0010873389980627568, "loss": 2.9964345932006835, "num_input_tokens_seen": 7471104000, "step": 14250, "train_runtime": 64709.798, "train_tokens_per_second": 115455.53 }, { "epoch": 0.7716659000514083, "grad_norm": 0.13457883894443512, "learning_rate": 0.0010846927889768454, "loss": 2.9865245819091797, "num_input_tokens_seen": 7476346880, "step": 14260, "train_runtime": 64754.9357, "train_tokens_per_second": 115456.016 }, { "epoch": 0.7722070402337725, "grad_norm": 0.13008961081504822, "learning_rate": 0.0010820516639972377, "loss": 2.9932941436767577, "num_input_tokens_seen": 7481589760, "step": 14270, "train_runtime": 64800.0796, "train_tokens_per_second": 115456.49 }, { "epoch": 0.7727481804161368, "grad_norm": 0.13576596975326538, "learning_rate": 0.0010794156311871674, "loss": 2.975057601928711, "num_input_tokens_seen": 7486832640, "step": 14280, "train_runtime": 64845.2255, "train_tokens_per_second": 115456.96 }, { "epoch": 0.7732893205985011, "grad_norm": 0.13501375913619995, "learning_rate": 0.0010767846985943225, "loss": 2.983927536010742, "num_input_tokens_seen": 7492075520, "step": 14290, "train_runtime": 64890.3622, "train_tokens_per_second": 115457.446 }, { "epoch": 0.7738304607808653, "grad_norm": 0.1284349411725998, "learning_rate": 0.0010741588742508182, "loss": 2.994318199157715, "num_input_tokens_seen": 7497318400, "step": 14300, "train_runtime": 64935.5045, "train_tokens_per_second": 115457.922 }, { "epoch": 0.7743716009632295, "grad_norm": 0.13406863808631897, "learning_rate": 0.0010715381661731754, "loss": 2.9812191009521483, "num_input_tokens_seen": 7502561280, "step": 14310, "train_runtime": 64980.6813, "train_tokens_per_second": 115458.335 }, { "epoch": 0.7749127411455937, "grad_norm": 0.1352129429578781, "learning_rate": 0.0010689225823622948, "loss": 2.9968055725097655, "num_input_tokens_seen": 7507804160, "step": 14320, "train_runtime": 65025.8721, "train_tokens_per_second": 115458.723 }, { "epoch": 0.775453881327958, "grad_norm": 0.13681240379810333, "learning_rate": 0.0010663121308034337, "loss": 2.984090805053711, "num_input_tokens_seen": 7513047040, "step": 14330, "train_runtime": 65071.0195, "train_tokens_per_second": 115459.188 }, { "epoch": 0.7759950215103223, "grad_norm": 0.12757869064807892, "learning_rate": 0.0010637068194661817, "loss": 2.9872867584228517, "num_input_tokens_seen": 7518289920, "step": 14340, "train_runtime": 65116.166, "train_tokens_per_second": 115459.653 }, { "epoch": 0.7765361616926865, "grad_norm": 0.1297658532857895, "learning_rate": 0.0010611066563044331, "loss": 2.987481689453125, "num_input_tokens_seen": 7523532800, "step": 14350, "train_runtime": 65161.3132, "train_tokens_per_second": 115460.116 }, { "epoch": 0.7770773018750508, "grad_norm": 0.13100814819335938, "learning_rate": 0.0010585116492563672, "loss": 2.984407424926758, "num_input_tokens_seen": 7528775680, "step": 14360, "train_runtime": 65206.4518, "train_tokens_per_second": 115460.594 }, { "epoch": 0.7776184420574149, "grad_norm": 0.13708344101905823, "learning_rate": 0.0010559218062444215, "loss": 2.9803342819213867, "num_input_tokens_seen": 7534018560, "step": 14370, "train_runtime": 65251.6135, "train_tokens_per_second": 115461.031 }, { "epoch": 0.7781595822397792, "grad_norm": 0.13270463049411774, "learning_rate": 0.001053337135175266, "loss": 2.9783748626708983, "num_input_tokens_seen": 7539261440, "step": 14380, "train_runtime": 65296.782, "train_tokens_per_second": 115461.455 }, { "epoch": 0.7787007224221435, "grad_norm": 0.1348678022623062, "learning_rate": 0.001050757643939784, "loss": 2.985927963256836, "num_input_tokens_seen": 7544504320, "step": 14390, "train_runtime": 65341.9205, "train_tokens_per_second": 115461.931 }, { "epoch": 0.7792418626045077, "grad_norm": 0.1359061747789383, "learning_rate": 0.0010481833404130433, "loss": 2.977262496948242, "num_input_tokens_seen": 7549747200, "step": 14400, "train_runtime": 65387.0473, "train_tokens_per_second": 115462.427 }, { "epoch": 0.779783002786872, "grad_norm": 0.13489292562007904, "learning_rate": 0.0010456142324542742, "loss": 2.9768039703369142, "num_input_tokens_seen": 7554990080, "step": 14410, "train_runtime": 65432.1998, "train_tokens_per_second": 115462.878 }, { "epoch": 0.7803241429692361, "grad_norm": 0.13529463112354279, "learning_rate": 0.001043050327906844, "loss": 2.992759132385254, "num_input_tokens_seen": 7560232960, "step": 14420, "train_runtime": 65477.3624, "train_tokens_per_second": 115463.31 }, { "epoch": 0.7808652831516004, "grad_norm": 0.13989658653736115, "learning_rate": 0.0010404916345982372, "loss": 2.9861518859863283, "num_input_tokens_seen": 7565475840, "step": 14430, "train_runtime": 65522.5287, "train_tokens_per_second": 115463.734 }, { "epoch": 0.7814064233339647, "grad_norm": 0.13800008594989777, "learning_rate": 0.0010379381603400246, "loss": 2.983747100830078, "num_input_tokens_seen": 7570718720, "step": 14440, "train_runtime": 65567.6879, "train_tokens_per_second": 115464.171 }, { "epoch": 0.7819475635163289, "grad_norm": 0.14410988986492157, "learning_rate": 0.0010353899129278482, "loss": 2.986704444885254, "num_input_tokens_seen": 7575961600, "step": 14450, "train_runtime": 65612.8209, "train_tokens_per_second": 115464.653 }, { "epoch": 0.7824887036986932, "grad_norm": 0.13409604132175446, "learning_rate": 0.0010328469001413872, "loss": 2.9869441986083984, "num_input_tokens_seen": 7581204480, "step": 14460, "train_runtime": 65657.9605, "train_tokens_per_second": 115465.123 }, { "epoch": 0.7830298438810573, "grad_norm": 0.13234242796897888, "learning_rate": 0.0010303091297443453, "loss": 2.9890289306640625, "num_input_tokens_seen": 7586447360, "step": 14470, "train_runtime": 65703.0949, "train_tokens_per_second": 115465.601 }, { "epoch": 0.7835709840634216, "grad_norm": 0.13398636877536774, "learning_rate": 0.001027776609484418, "loss": 2.9826473236083983, "num_input_tokens_seen": 7591690240, "step": 14480, "train_runtime": 65748.2396, "train_tokens_per_second": 115466.061 }, { "epoch": 0.7841121242457859, "grad_norm": 0.13305144011974335, "learning_rate": 0.0010252493470932719, "loss": 2.9864757537841795, "num_input_tokens_seen": 7596933120, "step": 14490, "train_runtime": 65793.3795, "train_tokens_per_second": 115466.528 }, { "epoch": 0.7846532644281501, "grad_norm": 0.13172990083694458, "learning_rate": 0.0010227273502865237, "loss": 2.9912540435791017, "num_input_tokens_seen": 7602176000, "step": 14500, "train_runtime": 65842.395, "train_tokens_per_second": 115460.199 }, { "epoch": 0.7846532644281501, "eval_loss": 2.9429469108581543, "eval_runtime": 1.9893, "eval_samples_per_second": 251.343, "eval_steps_per_second": 4.021, "num_input_tokens_seen": 7602176000, "step": 14500 }, { "epoch": 0.7851944046105144, "grad_norm": 0.13013876974582672, "learning_rate": 0.0010202106267637142, "loss": 2.9870655059814455, "num_input_tokens_seen": 7607418880, "step": 14510, "train_runtime": 65889.5594, "train_tokens_per_second": 115457.122 }, { "epoch": 0.7857355447928785, "grad_norm": 0.14158159494400024, "learning_rate": 0.001017699184208284, "loss": 2.9855068206787108, "num_input_tokens_seen": 7612661760, "step": 14520, "train_runtime": 65934.7235, "train_tokens_per_second": 115457.552 }, { "epoch": 0.7862766849752428, "grad_norm": 0.12904150784015656, "learning_rate": 0.001015193030287551, "loss": 2.9784725189208983, "num_input_tokens_seen": 7617904640, "step": 14530, "train_runtime": 65979.8789, "train_tokens_per_second": 115457.997 }, { "epoch": 0.7868178251576071, "grad_norm": 0.1475485861301422, "learning_rate": 0.0010126921726526892, "loss": 2.9963218688964846, "num_input_tokens_seen": 7623147520, "step": 14540, "train_runtime": 66025.0052, "train_tokens_per_second": 115458.492 }, { "epoch": 0.7873589653399713, "grad_norm": 0.13277380168437958, "learning_rate": 0.0010101966189387007, "loss": 2.9872737884521485, "num_input_tokens_seen": 7628390400, "step": 14550, "train_runtime": 66070.1575, "train_tokens_per_second": 115458.941 }, { "epoch": 0.7879001055223356, "grad_norm": 0.13506442308425903, "learning_rate": 0.0010077063767643974, "loss": 2.9895917892456056, "num_input_tokens_seen": 7633633280, "step": 14560, "train_runtime": 66115.3068, "train_tokens_per_second": 115459.394 }, { "epoch": 0.7884412457046998, "grad_norm": 0.13273315131664276, "learning_rate": 0.0010052214537323724, "loss": 2.9872600555419924, "num_input_tokens_seen": 7638876160, "step": 14570, "train_runtime": 66160.4452, "train_tokens_per_second": 115459.866 }, { "epoch": 0.788982385887064, "grad_norm": 0.1311519294977188, "learning_rate": 0.0010027418574289832, "loss": 2.9747976303100585, "num_input_tokens_seen": 7644119040, "step": 14580, "train_runtime": 66205.59, "train_tokens_per_second": 115460.326 }, { "epoch": 0.7895235260694283, "grad_norm": 0.13237175345420837, "learning_rate": 0.0010002675954243225, "loss": 2.9707094192504884, "num_input_tokens_seen": 7649361920, "step": 14590, "train_runtime": 66250.7308, "train_tokens_per_second": 115460.793 }, { "epoch": 0.7900646662517925, "grad_norm": 0.13623256981372833, "learning_rate": 0.0009977986752721967, "loss": 2.9789360046386717, "num_input_tokens_seen": 7654604800, "step": 14600, "train_runtime": 66295.8847, "train_tokens_per_second": 115461.236 }, { "epoch": 0.7906058064341568, "grad_norm": 0.13563480973243713, "learning_rate": 0.0009953351045101087, "loss": 2.976993942260742, "num_input_tokens_seen": 7659847680, "step": 14610, "train_runtime": 66341.0194, "train_tokens_per_second": 115461.712 }, { "epoch": 0.791146946616521, "grad_norm": 0.1308317333459854, "learning_rate": 0.000992876890659225, "loss": 2.9876148223876955, "num_input_tokens_seen": 7665090560, "step": 14620, "train_runtime": 66386.152, "train_tokens_per_second": 115462.191 }, { "epoch": 0.7916880867988852, "grad_norm": 0.12994542717933655, "learning_rate": 0.0009904240412243594, "loss": 2.989145278930664, "num_input_tokens_seen": 7670333440, "step": 14630, "train_runtime": 66431.2999, "train_tokens_per_second": 115462.643 }, { "epoch": 0.7922292269812495, "grad_norm": 0.13062526285648346, "learning_rate": 0.0009879765636939479, "loss": 2.9790761947631834, "num_input_tokens_seen": 7675576320, "step": 14640, "train_runtime": 66476.4455, "train_tokens_per_second": 115463.098 }, { "epoch": 0.7927703671636137, "grad_norm": 0.13198526203632355, "learning_rate": 0.0009855344655400273, "loss": 2.991826629638672, "num_input_tokens_seen": 7680819200, "step": 14650, "train_runtime": 66521.5925, "train_tokens_per_second": 115463.55 }, { "epoch": 0.793311507345978, "grad_norm": 0.12981140613555908, "learning_rate": 0.0009830977542182112, "loss": 2.97564754486084, "num_input_tokens_seen": 7686062080, "step": 14660, "train_runtime": 66566.7229, "train_tokens_per_second": 115464.03 }, { "epoch": 0.7938526475283422, "grad_norm": 0.13640232384204865, "learning_rate": 0.0009806664371676665, "loss": 2.9895370483398436, "num_input_tokens_seen": 7691304960, "step": 14670, "train_runtime": 66611.843, "train_tokens_per_second": 115464.527 }, { "epoch": 0.7943937877107065, "grad_norm": 0.13942649960517883, "learning_rate": 0.0009782405218110937, "loss": 2.983687973022461, "num_input_tokens_seen": 7696547840, "step": 14680, "train_runtime": 66656.9717, "train_tokens_per_second": 115465.009 }, { "epoch": 0.7949349278930707, "grad_norm": 0.13253772258758545, "learning_rate": 0.0009758200155546995, "loss": 2.9805246353149415, "num_input_tokens_seen": 7701790720, "step": 14690, "train_runtime": 66702.1127, "train_tokens_per_second": 115465.469 }, { "epoch": 0.7954760680754349, "grad_norm": 0.14124181866645813, "learning_rate": 0.000973404925788178, "loss": 2.9745468139648437, "num_input_tokens_seen": 7707033600, "step": 14700, "train_runtime": 66747.2598, "train_tokens_per_second": 115465.918 }, { "epoch": 0.7960172082577992, "grad_norm": 0.14020085334777832, "learning_rate": 0.0009709952598846878, "loss": 2.978104019165039, "num_input_tokens_seen": 7712276480, "step": 14710, "train_runtime": 66792.381, "train_tokens_per_second": 115466.411 }, { "epoch": 0.7965583484401634, "grad_norm": 0.14543874561786652, "learning_rate": 0.0009685910252008282, "loss": 2.972671890258789, "num_input_tokens_seen": 7717519360, "step": 14720, "train_runtime": 66837.5213, "train_tokens_per_second": 115466.87 }, { "epoch": 0.7970994886225277, "grad_norm": 0.1361764669418335, "learning_rate": 0.0009661922290766168, "loss": 2.979312515258789, "num_input_tokens_seen": 7722762240, "step": 14730, "train_runtime": 66882.6798, "train_tokens_per_second": 115467.297 }, { "epoch": 0.797640628804892, "grad_norm": 0.1359523981809616, "learning_rate": 0.000963798878835467, "loss": 2.9832695007324217, "num_input_tokens_seen": 7728005120, "step": 14740, "train_runtime": 66927.821, "train_tokens_per_second": 115467.753 }, { "epoch": 0.7981817689872561, "grad_norm": 0.1312197595834732, "learning_rate": 0.0009614109817841685, "loss": 2.988373565673828, "num_input_tokens_seen": 7733248000, "step": 14750, "train_runtime": 66972.9704, "train_tokens_per_second": 115468.195 }, { "epoch": 0.7987229091696204, "grad_norm": 0.1324051469564438, "learning_rate": 0.00095902854521286, "loss": 2.9794536590576173, "num_input_tokens_seen": 7738490880, "step": 14760, "train_runtime": 67018.1103, "train_tokens_per_second": 115468.652 }, { "epoch": 0.7992640493519846, "grad_norm": 0.13141310214996338, "learning_rate": 0.0009566515763950114, "loss": 2.979531097412109, "num_input_tokens_seen": 7743733760, "step": 14770, "train_runtime": 67063.2657, "train_tokens_per_second": 115469.083 }, { "epoch": 0.7998051895343489, "grad_norm": 0.13311649858951569, "learning_rate": 0.0009542800825873985, "loss": 2.978958511352539, "num_input_tokens_seen": 7748976640, "step": 14780, "train_runtime": 67108.4044, "train_tokens_per_second": 115469.541 }, { "epoch": 0.8003463297167132, "grad_norm": 0.1344899833202362, "learning_rate": 0.0009519140710300836, "loss": 2.9761631011962892, "num_input_tokens_seen": 7754219520, "step": 14790, "train_runtime": 67153.558, "train_tokens_per_second": 115469.973 }, { "epoch": 0.8008874698990773, "grad_norm": 0.1314343363046646, "learning_rate": 0.0009495535489463907, "loss": 2.9750953674316407, "num_input_tokens_seen": 7759462400, "step": 14800, "train_runtime": 67198.7114, "train_tokens_per_second": 115470.405 }, { "epoch": 0.8014286100814416, "grad_norm": 0.13687878847122192, "learning_rate": 0.0009471985235428848, "loss": 2.977894973754883, "num_input_tokens_seen": 7764705280, "step": 14810, "train_runtime": 67243.8512, "train_tokens_per_second": 115470.859 }, { "epoch": 0.8019697502638058, "grad_norm": 0.13268278539180756, "learning_rate": 0.0009448490020093504, "loss": 2.983228302001953, "num_input_tokens_seen": 7769948160, "step": 14820, "train_runtime": 67288.9927, "train_tokens_per_second": 115471.31 }, { "epoch": 0.8025108904461701, "grad_norm": 0.13738638162612915, "learning_rate": 0.0009425049915187695, "loss": 2.98532657623291, "num_input_tokens_seen": 7775191040, "step": 14830, "train_runtime": 67334.146, "train_tokens_per_second": 115471.741 }, { "epoch": 0.8030520306285344, "grad_norm": 0.13537852466106415, "learning_rate": 0.0009401664992272974, "loss": 2.9814353942871095, "num_input_tokens_seen": 7780433920, "step": 14840, "train_runtime": 67379.3084, "train_tokens_per_second": 115472.155 }, { "epoch": 0.8035931708108985, "grad_norm": 0.13461166620254517, "learning_rate": 0.0009378335322742428, "loss": 2.988892364501953, "num_input_tokens_seen": 7785676800, "step": 14850, "train_runtime": 67424.4589, "train_tokens_per_second": 115472.589 }, { "epoch": 0.8041343109932628, "grad_norm": 0.1397952139377594, "learning_rate": 0.0009355060977820479, "loss": 2.981852149963379, "num_input_tokens_seen": 7790919680, "step": 14860, "train_runtime": 67469.6089, "train_tokens_per_second": 115473.023 }, { "epoch": 0.804675451175627, "grad_norm": 0.13720718026161194, "learning_rate": 0.000933184202856262, "loss": 2.9753461837768556, "num_input_tokens_seen": 7796162560, "step": 14870, "train_runtime": 67514.7478, "train_tokens_per_second": 115473.475 }, { "epoch": 0.8052165913579913, "grad_norm": 0.13194413483142853, "learning_rate": 0.0009308678545855248, "loss": 2.98673038482666, "num_input_tokens_seen": 7801405440, "step": 14880, "train_runtime": 67563.706, "train_tokens_per_second": 115467.4 }, { "epoch": 0.8057577315403556, "grad_norm": 0.13509796559810638, "learning_rate": 0.0009285570600415394, "loss": 2.9741546630859377, "num_input_tokens_seen": 7806648320, "step": 14890, "train_runtime": 67608.8064, "train_tokens_per_second": 115467.921 }, { "epoch": 0.8062988717227197, "grad_norm": 0.13570842146873474, "learning_rate": 0.0009262518262790568, "loss": 2.9908029556274416, "num_input_tokens_seen": 7811891200, "step": 14900, "train_runtime": 67653.9237, "train_tokens_per_second": 115468.413 }, { "epoch": 0.806840011905084, "grad_norm": 0.1328882873058319, "learning_rate": 0.0009239521603358486, "loss": 2.9901811599731447, "num_input_tokens_seen": 7817134080, "step": 14910, "train_runtime": 67699.0266, "train_tokens_per_second": 115468.929 }, { "epoch": 0.8073811520874482, "grad_norm": 0.13037438690662384, "learning_rate": 0.0009216580692326891, "loss": 2.9751874923706056, "num_input_tokens_seen": 7822376960, "step": 14920, "train_runtime": 67744.1354, "train_tokens_per_second": 115469.434 }, { "epoch": 0.8079222922698125, "grad_norm": 0.13509000837802887, "learning_rate": 0.0009193695599733333, "loss": 2.9760356903076173, "num_input_tokens_seen": 7827619840, "step": 14930, "train_runtime": 67789.236, "train_tokens_per_second": 115469.952 }, { "epoch": 0.8084634324521768, "grad_norm": 0.13353431224822998, "learning_rate": 0.0009170866395444952, "loss": 2.979950714111328, "num_input_tokens_seen": 7832862720, "step": 14940, "train_runtime": 67834.3595, "train_tokens_per_second": 115470.431 }, { "epoch": 0.809004572634541, "grad_norm": 0.13296596705913544, "learning_rate": 0.0009148093149158249, "loss": 2.9780080795288084, "num_input_tokens_seen": 7838105600, "step": 14950, "train_runtime": 67879.4629, "train_tokens_per_second": 115470.943 }, { "epoch": 0.8095457128169052, "grad_norm": 0.13199231028556824, "learning_rate": 0.0009125375930398896, "loss": 2.976139450073242, "num_input_tokens_seen": 7843348480, "step": 14960, "train_runtime": 67924.5642, "train_tokens_per_second": 115471.458 }, { "epoch": 0.8100868529992694, "grad_norm": 0.1304149031639099, "learning_rate": 0.0009102714808521528, "loss": 2.9799163818359373, "num_input_tokens_seen": 7848591360, "step": 14970, "train_runtime": 67969.6467, "train_tokens_per_second": 115472.005 }, { "epoch": 0.8106279931816337, "grad_norm": 0.13312670588493347, "learning_rate": 0.0009080109852709498, "loss": 2.9826412200927734, "num_input_tokens_seen": 7853834240, "step": 14980, "train_runtime": 68014.7473, "train_tokens_per_second": 115472.52 }, { "epoch": 0.811169133363998, "grad_norm": 0.13625964522361755, "learning_rate": 0.0009057561131974695, "loss": 2.974313735961914, "num_input_tokens_seen": 7859077120, "step": 14990, "train_runtime": 68059.848, "train_tokens_per_second": 115473.034 }, { "epoch": 0.8117102735463622, "grad_norm": 0.13586074113845825, "learning_rate": 0.000903506871515734, "loss": 2.9799150466918944, "num_input_tokens_seen": 7864320000, "step": 15000, "train_runtime": 68104.9508, "train_tokens_per_second": 115473.544 }, { "epoch": 0.8117102735463622, "eval_loss": 2.9381465911865234, "eval_runtime": 1.9846, "eval_samples_per_second": 251.945, "eval_steps_per_second": 4.031, "num_input_tokens_seen": 7864320000, "step": 15000 }, { "epoch": 0.8122514137287264, "grad_norm": 0.13391871750354767, "learning_rate": 0.0009012632670925736, "loss": 2.972438430786133, "num_input_tokens_seen": 7869562880, "step": 15010, "train_runtime": 68154.5217, "train_tokens_per_second": 115466.482 }, { "epoch": 0.8127925539110906, "grad_norm": 0.13467305898666382, "learning_rate": 0.0008990253067776095, "loss": 2.9732336044311523, "num_input_tokens_seen": 7874805760, "step": 15020, "train_runtime": 68199.7002, "train_tokens_per_second": 115466.868 }, { "epoch": 0.8133336940934549, "grad_norm": 0.13371260464191437, "learning_rate": 0.0008967929974032304, "loss": 2.9756675720214845, "num_input_tokens_seen": 7880048640, "step": 15030, "train_runtime": 68244.8815, "train_tokens_per_second": 115467.248 }, { "epoch": 0.8138748342758192, "grad_norm": 0.13191363215446472, "learning_rate": 0.0008945663457845765, "loss": 2.9834621429443358, "num_input_tokens_seen": 7885291520, "step": 15040, "train_runtime": 68290.0502, "train_tokens_per_second": 115467.649 }, { "epoch": 0.8144159744581834, "grad_norm": 0.1310187131166458, "learning_rate": 0.0008923453587195116, "loss": 2.9787324905395507, "num_input_tokens_seen": 7890534400, "step": 15050, "train_runtime": 68335.2323, "train_tokens_per_second": 115468.026 }, { "epoch": 0.8149571146405477, "grad_norm": 0.13005271553993225, "learning_rate": 0.0008901300429886064, "loss": 2.9818572998046875, "num_input_tokens_seen": 7895777280, "step": 15060, "train_runtime": 68380.4424, "train_tokens_per_second": 115468.356 }, { "epoch": 0.8154982548229118, "grad_norm": 0.13187964260578156, "learning_rate": 0.0008879204053551192, "loss": 2.9841533660888673, "num_input_tokens_seen": 7901020160, "step": 15070, "train_runtime": 68425.6233, "train_tokens_per_second": 115468.735 }, { "epoch": 0.8160393950052761, "grad_norm": 0.12774254381656647, "learning_rate": 0.0008857164525649706, "loss": 2.9738176345825194, "num_input_tokens_seen": 7906263040, "step": 15080, "train_runtime": 68470.8074, "train_tokens_per_second": 115469.108 }, { "epoch": 0.8165805351876404, "grad_norm": 0.13418236374855042, "learning_rate": 0.0008835181913467284, "loss": 2.9698516845703127, "num_input_tokens_seen": 7911505920, "step": 15090, "train_runtime": 68516.0039, "train_tokens_per_second": 115469.459 }, { "epoch": 0.8171216753700046, "grad_norm": 0.13305585086345673, "learning_rate": 0.000881325628411582, "loss": 2.9800113677978515, "num_input_tokens_seen": 7916748800, "step": 15100, "train_runtime": 68561.1978, "train_tokens_per_second": 115469.815 }, { "epoch": 0.8176628155523689, "grad_norm": 0.1298227459192276, "learning_rate": 0.0008791387704533261, "loss": 2.9894580841064453, "num_input_tokens_seen": 7921991680, "step": 15110, "train_runtime": 68606.3897, "train_tokens_per_second": 115470.173 }, { "epoch": 0.818203955734733, "grad_norm": 0.13746146857738495, "learning_rate": 0.0008769576241483369, "loss": 2.969521903991699, "num_input_tokens_seen": 7927234560, "step": 15120, "train_runtime": 68651.5837, "train_tokens_per_second": 115470.527 }, { "epoch": 0.8187450959170973, "grad_norm": 0.1307765245437622, "learning_rate": 0.0008747821961555536, "loss": 2.9746829986572267, "num_input_tokens_seen": 7932477440, "step": 15130, "train_runtime": 68696.7803, "train_tokens_per_second": 115470.877 }, { "epoch": 0.8192862360994616, "grad_norm": 0.12932413816452026, "learning_rate": 0.0008726124931164572, "loss": 2.980904388427734, "num_input_tokens_seen": 7937720320, "step": 15140, "train_runtime": 68741.9605, "train_tokens_per_second": 115471.253 }, { "epoch": 0.8198273762818258, "grad_norm": 0.13145951926708221, "learning_rate": 0.0008704485216550531, "loss": 2.977578544616699, "num_input_tokens_seen": 7942963200, "step": 15150, "train_runtime": 68787.1491, "train_tokens_per_second": 115471.615 }, { "epoch": 0.8203685164641901, "grad_norm": 0.13109584152698517, "learning_rate": 0.0008682902883778457, "loss": 2.973899078369141, "num_input_tokens_seen": 7948206080, "step": 15160, "train_runtime": 68832.3314, "train_tokens_per_second": 115471.987 }, { "epoch": 0.8209096566465542, "grad_norm": 0.1269070953130722, "learning_rate": 0.0008661377998738207, "loss": 2.9858329772949217, "num_input_tokens_seen": 7953448960, "step": 15170, "train_runtime": 68877.5165, "train_tokens_per_second": 115472.354 }, { "epoch": 0.8214507968289185, "grad_norm": 0.13239699602127075, "learning_rate": 0.0008639910627144282, "loss": 2.9783477783203125, "num_input_tokens_seen": 7958691840, "step": 15180, "train_runtime": 68922.6959, "train_tokens_per_second": 115472.73 }, { "epoch": 0.8219919370112828, "grad_norm": 0.129794642329216, "learning_rate": 0.0008618500834535568, "loss": 2.9712141036987303, "num_input_tokens_seen": 7963934720, "step": 15190, "train_runtime": 68967.862, "train_tokens_per_second": 115473.128 }, { "epoch": 0.822533077193647, "grad_norm": 0.13771747052669525, "learning_rate": 0.0008597148686275189, "loss": 2.984314727783203, "num_input_tokens_seen": 7969177600, "step": 15200, "train_runtime": 69013.0362, "train_tokens_per_second": 115473.511 }, { "epoch": 0.8230742173760113, "grad_norm": 0.13398458063602448, "learning_rate": 0.0008575854247550258, "loss": 2.9714584350585938, "num_input_tokens_seen": 7974420480, "step": 15210, "train_runtime": 69058.1959, "train_tokens_per_second": 115473.918 }, { "epoch": 0.8236153575583754, "grad_norm": 0.13028761744499207, "learning_rate": 0.0008554617583371726, "loss": 2.9726911544799806, "num_input_tokens_seen": 7979663360, "step": 15220, "train_runtime": 69103.3538, "train_tokens_per_second": 115474.328 }, { "epoch": 0.8241564977407397, "grad_norm": 0.13187240064144135, "learning_rate": 0.0008533438758574152, "loss": 2.9737316131591798, "num_input_tokens_seen": 7984906240, "step": 15230, "train_runtime": 69148.515, "train_tokens_per_second": 115474.732 }, { "epoch": 0.824697637923104, "grad_norm": 0.13035008311271667, "learning_rate": 0.0008512317837815503, "loss": 2.9657833099365236, "num_input_tokens_seen": 7990149120, "step": 15240, "train_runtime": 69193.6841, "train_tokens_per_second": 115475.122 }, { "epoch": 0.8252387781054682, "grad_norm": 0.1308414787054062, "learning_rate": 0.0008491254885576988, "loss": 2.968144416809082, "num_input_tokens_seen": 7995392000, "step": 15250, "train_runtime": 69238.862, "train_tokens_per_second": 115475.497 }, { "epoch": 0.8257799182878325, "grad_norm": 0.1312231868505478, "learning_rate": 0.0008470249966162835, "loss": 2.9749370574951173, "num_input_tokens_seen": 8000634880, "step": 15260, "train_runtime": 69287.9095, "train_tokens_per_second": 115469.422 }, { "epoch": 0.8263210584701967, "grad_norm": 0.13507384061813354, "learning_rate": 0.0008449303143700088, "loss": 2.9808319091796873, "num_input_tokens_seen": 8005877760, "step": 15270, "train_runtime": 69333.0664, "train_tokens_per_second": 115469.835 }, { "epoch": 0.8268621986525609, "grad_norm": 0.12942056357860565, "learning_rate": 0.0008428414482138435, "loss": 2.969392776489258, "num_input_tokens_seen": 8011120640, "step": 15280, "train_runtime": 69378.1613, "train_tokens_per_second": 115470.351 }, { "epoch": 0.8274033388349252, "grad_norm": 0.12837563455104828, "learning_rate": 0.0008407584045250001, "loss": 2.979315185546875, "num_input_tokens_seen": 8016363520, "step": 15290, "train_runtime": 69423.2721, "train_tokens_per_second": 115470.84 }, { "epoch": 0.8279444790172894, "grad_norm": 0.13300900161266327, "learning_rate": 0.0008386811896629143, "loss": 2.9644968032836916, "num_input_tokens_seen": 8021606400, "step": 15300, "train_runtime": 69468.3762, "train_tokens_per_second": 115471.339 }, { "epoch": 0.8284856191996537, "grad_norm": 0.12836603820323944, "learning_rate": 0.0008366098099692285, "loss": 2.972013473510742, "num_input_tokens_seen": 8026849280, "step": 15310, "train_runtime": 69513.475, "train_tokens_per_second": 115471.846 }, { "epoch": 0.8290267593820179, "grad_norm": 0.12967608869075775, "learning_rate": 0.0008345442717677699, "loss": 2.9776493072509767, "num_input_tokens_seen": 8032092160, "step": 15320, "train_runtime": 69558.5739, "train_tokens_per_second": 115472.352 }, { "epoch": 0.8295678995643821, "grad_norm": 0.12830476462841034, "learning_rate": 0.0008324845813645304, "loss": 2.9773494720458986, "num_input_tokens_seen": 8037335040, "step": 15330, "train_runtime": 69603.6687, "train_tokens_per_second": 115472.865 }, { "epoch": 0.8301090397467464, "grad_norm": 0.13105891644954681, "learning_rate": 0.0008304307450476511, "loss": 2.9748680114746096, "num_input_tokens_seen": 8042577920, "step": 15340, "train_runtime": 69648.769, "train_tokens_per_second": 115473.368 }, { "epoch": 0.8306501799291106, "grad_norm": 0.1301373690366745, "learning_rate": 0.0008283827690873988, "loss": 2.9727630615234375, "num_input_tokens_seen": 8047820800, "step": 15350, "train_runtime": 69693.862, "train_tokens_per_second": 115473.882 }, { "epoch": 0.8311913201114749, "grad_norm": 0.13162434101104736, "learning_rate": 0.0008263406597361503, "loss": 2.978099822998047, "num_input_tokens_seen": 8053063680, "step": 15360, "train_runtime": 69738.9614, "train_tokens_per_second": 115474.385 }, { "epoch": 0.8317324602938391, "grad_norm": 0.13288192451000214, "learning_rate": 0.0008243044232283723, "loss": 2.9758016586303713, "num_input_tokens_seen": 8058306560, "step": 15370, "train_runtime": 69784.0695, "train_tokens_per_second": 115474.873 }, { "epoch": 0.8322736004762034, "grad_norm": 0.136215478181839, "learning_rate": 0.0008222740657806005, "loss": 2.976166915893555, "num_input_tokens_seen": 8063549440, "step": 15380, "train_runtime": 69829.1841, "train_tokens_per_second": 115475.35 }, { "epoch": 0.8328147406585676, "grad_norm": 0.12879818677902222, "learning_rate": 0.000820249593591422, "loss": 2.9633615493774412, "num_input_tokens_seen": 8068792320, "step": 15390, "train_runtime": 69874.3003, "train_tokens_per_second": 115475.823 }, { "epoch": 0.8333558808409318, "grad_norm": 0.1428280621767044, "learning_rate": 0.0008182310128414587, "loss": 2.9798999786376954, "num_input_tokens_seen": 8074035200, "step": 15400, "train_runtime": 69919.3861, "train_tokens_per_second": 115476.346 }, { "epoch": 0.8338970210232961, "grad_norm": 0.1359853297472, "learning_rate": 0.0008162183296933439, "loss": 2.968707275390625, "num_input_tokens_seen": 8079278080, "step": 15410, "train_runtime": 69964.4955, "train_tokens_per_second": 115476.829 }, { "epoch": 0.8344381612056603, "grad_norm": 0.13050523400306702, "learning_rate": 0.0008142115502917066, "loss": 2.973996162414551, "num_input_tokens_seen": 8084520960, "step": 15420, "train_runtime": 70009.6056, "train_tokens_per_second": 115477.31 }, { "epoch": 0.8349793013880246, "grad_norm": 0.13029220700263977, "learning_rate": 0.0008122106807631529, "loss": 2.9792009353637696, "num_input_tokens_seen": 8089763840, "step": 15430, "train_runtime": 70054.706, "train_tokens_per_second": 115477.807 }, { "epoch": 0.8355204415703888, "grad_norm": 0.13232028484344482, "learning_rate": 0.0008102157272162447, "loss": 2.9753578186035154, "num_input_tokens_seen": 8095006720, "step": 15440, "train_runtime": 70099.8205, "train_tokens_per_second": 115478.28 }, { "epoch": 0.836061581752753, "grad_norm": 0.13095484673976898, "learning_rate": 0.0008082266957414837, "loss": 2.97320671081543, "num_input_tokens_seen": 8100249600, "step": 15450, "train_runtime": 70144.9322, "train_tokens_per_second": 115478.757 }, { "epoch": 0.8366027219351173, "grad_norm": 0.13523340225219727, "learning_rate": 0.0008062435924112902, "loss": 2.9681285858154296, "num_input_tokens_seen": 8105492480, "step": 15460, "train_runtime": 70190.0213, "train_tokens_per_second": 115479.271 }, { "epoch": 0.8371438621174815, "grad_norm": 0.13670340180397034, "learning_rate": 0.0008042664232799893, "loss": 2.9674022674560545, "num_input_tokens_seen": 8110735360, "step": 15470, "train_runtime": 70235.1367, "train_tokens_per_second": 115479.741 }, { "epoch": 0.8376850022998458, "grad_norm": 0.12936244904994965, "learning_rate": 0.0008022951943837868, "loss": 2.966217041015625, "num_input_tokens_seen": 8115978240, "step": 15480, "train_runtime": 70280.2433, "train_tokens_per_second": 115480.224 }, { "epoch": 0.8382261424822101, "grad_norm": 0.14200405776500702, "learning_rate": 0.0008003299117407532, "loss": 2.978799247741699, "num_input_tokens_seen": 8121221120, "step": 15490, "train_runtime": 70325.3302, "train_tokens_per_second": 115480.739 }, { "epoch": 0.8387672826645742, "grad_norm": 0.12791140377521515, "learning_rate": 0.0007983705813508069, "loss": 2.971164321899414, "num_input_tokens_seen": 8126464000, "step": 15500, "train_runtime": 70370.4812, "train_tokens_per_second": 115481.149 }, { "epoch": 0.8387672826645742, "eval_loss": 2.9325733184814453, "eval_runtime": 1.9901, "eval_samples_per_second": 251.238, "eval_steps_per_second": 4.02, "num_input_tokens_seen": 8126464000, "step": 15500 }, { "epoch": 0.8393084228469385, "grad_norm": 0.1335526406764984, "learning_rate": 0.0007964172091956926, "loss": 2.9691984176635744, "num_input_tokens_seen": 8131706880, "step": 15510, "train_runtime": 70417.588, "train_tokens_per_second": 115478.35 }, { "epoch": 0.8398495630293027, "grad_norm": 0.13724961876869202, "learning_rate": 0.0007944698012389664, "loss": 2.9696407318115234, "num_input_tokens_seen": 8136949760, "step": 15520, "train_runtime": 70462.6835, "train_tokens_per_second": 115478.851 }, { "epoch": 0.840390703211667, "grad_norm": 0.13106457889080048, "learning_rate": 0.0007925283634259745, "loss": 2.964072036743164, "num_input_tokens_seen": 8142192640, "step": 15530, "train_runtime": 70507.7742, "train_tokens_per_second": 115479.36 }, { "epoch": 0.8409318433940313, "grad_norm": 0.1346583068370819, "learning_rate": 0.000790592901683838, "loss": 2.9721302032470702, "num_input_tokens_seen": 8147435520, "step": 15540, "train_runtime": 70552.8789, "train_tokens_per_second": 115479.845 }, { "epoch": 0.8414729835763954, "grad_norm": 0.12788882851600647, "learning_rate": 0.0007886634219214321, "loss": 2.9774459838867187, "num_input_tokens_seen": 8152678400, "step": 15550, "train_runtime": 70597.9816, "train_tokens_per_second": 115480.333 }, { "epoch": 0.8420141237587597, "grad_norm": 0.1323845237493515, "learning_rate": 0.0007867399300293693, "loss": 2.971846008300781, "num_input_tokens_seen": 8157921280, "step": 15560, "train_runtime": 70643.081, "train_tokens_per_second": 115480.825 }, { "epoch": 0.8425552639411239, "grad_norm": 0.132669135928154, "learning_rate": 0.0007848224318799821, "loss": 2.9736881256103516, "num_input_tokens_seen": 8163164160, "step": 15570, "train_runtime": 70688.1702, "train_tokens_per_second": 115481.334 }, { "epoch": 0.8430964041234882, "grad_norm": 0.1315847635269165, "learning_rate": 0.0007829109333273051, "loss": 2.9581043243408205, "num_input_tokens_seen": 8168407040, "step": 15580, "train_runtime": 70733.2527, "train_tokens_per_second": 115481.852 }, { "epoch": 0.8436375443058525, "grad_norm": 0.13508620858192444, "learning_rate": 0.0007810054402070547, "loss": 2.967576789855957, "num_input_tokens_seen": 8173649920, "step": 15590, "train_runtime": 70778.3173, "train_tokens_per_second": 115482.4 }, { "epoch": 0.8441786844882166, "grad_norm": 0.13094158470630646, "learning_rate": 0.0007791059583366134, "loss": 2.969736671447754, "num_input_tokens_seen": 8178892800, "step": 15600, "train_runtime": 70823.3875, "train_tokens_per_second": 115482.937 }, { "epoch": 0.8447198246705809, "grad_norm": 0.13293389976024628, "learning_rate": 0.0007772124935150125, "loss": 2.9740530014038087, "num_input_tokens_seen": 8184135680, "step": 15610, "train_runtime": 70868.5107, "train_tokens_per_second": 115483.387 }, { "epoch": 0.8452609648529451, "grad_norm": 0.12885726988315582, "learning_rate": 0.0007753250515229127, "loss": 2.9699680328369142, "num_input_tokens_seen": 8189378560, "step": 15620, "train_runtime": 70913.6516, "train_tokens_per_second": 115483.808 }, { "epoch": 0.8458021050353094, "grad_norm": 0.13280688226222992, "learning_rate": 0.0007734436381225877, "loss": 2.9740190505981445, "num_input_tokens_seen": 8194621440, "step": 15630, "train_runtime": 70958.7738, "train_tokens_per_second": 115484.259 }, { "epoch": 0.8463432452176737, "grad_norm": 0.13439851999282837, "learning_rate": 0.0007715682590579061, "loss": 2.975991439819336, "num_input_tokens_seen": 8199864320, "step": 15640, "train_runtime": 71003.8731, "train_tokens_per_second": 115484.747 } ], "logging_steps": 10, "max_steps": 18480, "num_input_tokens_seen": 8200388608, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.287888719514173e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }