Files
llama-300M-v3-muon-original/ckpt-8-2b/trainer_state.json
ModelHub XC 53414de0a0 初始化项目,由ModelHub XC社区提供模型
Model: deqing/llama-300M-v3-muon-original
Source: Original Platform
2026-06-20 18:01:22 +08:00

15954 lines
480 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8463973592359101,
"eval_steps": 500,
"global_step": 15641,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005411401823642415,
"grad_norm": 3.381409168243408,
"learning_rate": 8.999999999999999e-05,
"loss": 11.904257202148438,
"num_input_tokens_seen": 5242880,
"step": 10,
"train_runtime": 98.6668,
"train_tokens_per_second": 53137.211
},
{
"epoch": 0.001082280364728483,
"grad_norm": 3.160391092300415,
"learning_rate": 0.00019,
"loss": 11.522220611572266,
"num_input_tokens_seen": 10485760,
"step": 20,
"train_runtime": 143.6354,
"train_tokens_per_second": 73002.603
},
{
"epoch": 0.0016234205470927244,
"grad_norm": 2.81247878074646,
"learning_rate": 0.00029,
"loss": 10.88598403930664,
"num_input_tokens_seen": 15728640,
"step": 30,
"train_runtime": 188.573,
"train_tokens_per_second": 83408.77
},
{
"epoch": 0.002164560729456966,
"grad_norm": 2.100623846054077,
"learning_rate": 0.00039,
"loss": 10.268418884277343,
"num_input_tokens_seen": 20971520,
"step": 40,
"train_runtime": 233.5046,
"train_tokens_per_second": 89812.025
},
{
"epoch": 0.002705700911821207,
"grad_norm": 1.6848973035812378,
"learning_rate": 0.00049,
"loss": 9.741734313964844,
"num_input_tokens_seen": 26214400,
"step": 50,
"train_runtime": 278.4485,
"train_tokens_per_second": 94144.504
},
{
"epoch": 0.003246841094185449,
"grad_norm": 2.3044769763946533,
"learning_rate": 0.00059,
"loss": 9.239765167236328,
"num_input_tokens_seen": 31457280,
"step": 60,
"train_runtime": 323.3989,
"train_tokens_per_second": 97270.822
},
{
"epoch": 0.00378798127654969,
"grad_norm": 1.6891608238220215,
"learning_rate": 0.0006900000000000001,
"loss": 8.75671157836914,
"num_input_tokens_seen": 36700160,
"step": 70,
"train_runtime": 368.3824,
"train_tokens_per_second": 99625.16
},
{
"epoch": 0.004329121458913932,
"grad_norm": 1.304526448249817,
"learning_rate": 0.00079,
"loss": 8.304008483886719,
"num_input_tokens_seen": 41943040,
"step": 80,
"train_runtime": 413.3956,
"train_tokens_per_second": 101459.807
},
{
"epoch": 0.004870261641278173,
"grad_norm": 0.9729027152061462,
"learning_rate": 0.00089,
"loss": 7.881130981445312,
"num_input_tokens_seen": 47185920,
"step": 90,
"train_runtime": 458.4234,
"train_tokens_per_second": 102930.863
},
{
"epoch": 0.005411401823642414,
"grad_norm": 0.8812423944473267,
"learning_rate": 0.00099,
"loss": 7.507221984863281,
"num_input_tokens_seen": 52428800,
"step": 100,
"train_runtime": 503.4688,
"train_tokens_per_second": 104135.16
},
{
"epoch": 0.005952542006006656,
"grad_norm": 0.8653954863548279,
"learning_rate": 0.00109,
"loss": 7.197725677490235,
"num_input_tokens_seen": 57671680,
"step": 110,
"train_runtime": 548.5148,
"train_tokens_per_second": 105141.522
},
{
"epoch": 0.006493682188370898,
"grad_norm": 0.6527077555656433,
"learning_rate": 0.0011899999999999999,
"loss": 6.962340545654297,
"num_input_tokens_seen": 62914560,
"step": 120,
"train_runtime": 593.6028,
"train_tokens_per_second": 105987.634
},
{
"epoch": 0.007034822370735139,
"grad_norm": 0.9212841987609863,
"learning_rate": 0.0012900000000000001,
"loss": 6.748843383789063,
"num_input_tokens_seen": 68157440,
"step": 130,
"train_runtime": 638.7162,
"train_tokens_per_second": 106710.057
},
{
"epoch": 0.00757596255309938,
"grad_norm": 0.8431305885314941,
"learning_rate": 0.0013900000000000002,
"loss": 6.572750854492187,
"num_input_tokens_seen": 73400320,
"step": 140,
"train_runtime": 683.8844,
"train_tokens_per_second": 107328.553
},
{
"epoch": 0.008117102735463622,
"grad_norm": 1.0355322360992432,
"learning_rate": 0.00149,
"loss": 6.391636657714844,
"num_input_tokens_seen": 78643200,
"step": 150,
"train_runtime": 729.0444,
"train_tokens_per_second": 107871.613
},
{
"epoch": 0.008658242917827864,
"grad_norm": 1.3339749574661255,
"learning_rate": 0.00159,
"loss": 6.223532104492188,
"num_input_tokens_seen": 83886080,
"step": 160,
"train_runtime": 774.2149,
"train_tokens_per_second": 108349.867
},
{
"epoch": 0.009199383100192105,
"grad_norm": 1.152486801147461,
"learning_rate": 0.00169,
"loss": 6.081370162963867,
"num_input_tokens_seen": 89128960,
"step": 170,
"train_runtime": 819.4009,
"train_tokens_per_second": 108773.326
},
{
"epoch": 0.009740523282556346,
"grad_norm": 1.163500189781189,
"learning_rate": 0.00179,
"loss": 5.940819549560547,
"num_input_tokens_seen": 94371840,
"step": 180,
"train_runtime": 864.5859,
"train_tokens_per_second": 109152.654
},
{
"epoch": 0.010281663464920588,
"grad_norm": 1.2408533096313477,
"learning_rate": 0.00189,
"loss": 5.812583160400391,
"num_input_tokens_seen": 99614720,
"step": 190,
"train_runtime": 909.7569,
"train_tokens_per_second": 109495.979
},
{
"epoch": 0.010822803647284829,
"grad_norm": 1.1574287414550781,
"learning_rate": 0.00199,
"loss": 5.6905670166015625,
"num_input_tokens_seen": 104857600,
"step": 200,
"train_runtime": 954.9646,
"train_tokens_per_second": 109802.599
},
{
"epoch": 0.011363943829649071,
"grad_norm": 1.296819806098938,
"learning_rate": 0.00209,
"loss": 5.591656494140625,
"num_input_tokens_seen": 110100480,
"step": 210,
"train_runtime": 1000.1348,
"train_tokens_per_second": 110085.64
},
{
"epoch": 0.011905084012013312,
"grad_norm": 1.0654325485229492,
"learning_rate": 0.00219,
"loss": 5.485440444946289,
"num_input_tokens_seen": 115343360,
"step": 220,
"train_runtime": 1045.3265,
"train_tokens_per_second": 110341.942
},
{
"epoch": 0.012446224194377553,
"grad_norm": 1.1002130508422852,
"learning_rate": 0.00229,
"loss": 5.387198257446289,
"num_input_tokens_seen": 120586240,
"step": 230,
"train_runtime": 1090.5293,
"train_tokens_per_second": 110575.879
},
{
"epoch": 0.012987364376741795,
"grad_norm": 1.0023939609527588,
"learning_rate": 0.0023899999999999998,
"loss": 5.29501953125,
"num_input_tokens_seen": 125829120,
"step": 240,
"train_runtime": 1135.7379,
"train_tokens_per_second": 110790.637
},
{
"epoch": 0.013528504559106036,
"grad_norm": 0.8933797478675842,
"learning_rate": 0.00249,
"loss": 5.21459846496582,
"num_input_tokens_seen": 131072000,
"step": 250,
"train_runtime": 1180.9487,
"train_tokens_per_second": 110988.737
},
{
"epoch": 0.014069644741470278,
"grad_norm": 1.0700093507766724,
"learning_rate": 0.0025900000000000003,
"loss": 5.131219482421875,
"num_input_tokens_seen": 136314880,
"step": 260,
"train_runtime": 1226.1688,
"train_tokens_per_second": 111171.384
},
{
"epoch": 0.01461078492383452,
"grad_norm": 1.194157600402832,
"learning_rate": 0.00269,
"loss": 5.058176422119141,
"num_input_tokens_seen": 141557760,
"step": 270,
"train_runtime": 1271.4137,
"train_tokens_per_second": 111338.863
},
{
"epoch": 0.01515192510619876,
"grad_norm": 1.097806453704834,
"learning_rate": 0.0027900000000000004,
"loss": 4.985312652587891,
"num_input_tokens_seen": 146800640,
"step": 280,
"train_runtime": 1316.6654,
"train_tokens_per_second": 111494.264
},
{
"epoch": 0.015693065288563002,
"grad_norm": 0.9698807001113892,
"learning_rate": 0.0028899999999999998,
"loss": 4.905867004394532,
"num_input_tokens_seen": 152043520,
"step": 290,
"train_runtime": 1361.9146,
"train_tokens_per_second": 111639.537
},
{
"epoch": 0.016234205470927243,
"grad_norm": 0.9057841300964355,
"learning_rate": 0.00299,
"loss": 4.847021102905273,
"num_input_tokens_seen": 157286400,
"step": 300,
"train_runtime": 1407.157,
"train_tokens_per_second": 111776.016
},
{
"epoch": 0.016775345653291484,
"grad_norm": 0.9901854395866394,
"learning_rate": 0.00309,
"loss": 4.7939613342285154,
"num_input_tokens_seen": 162529280,
"step": 310,
"train_runtime": 1452.4002,
"train_tokens_per_second": 111903.927
},
{
"epoch": 0.017316485835655728,
"grad_norm": 0.9903898239135742,
"learning_rate": 0.00319,
"loss": 4.720642852783203,
"num_input_tokens_seen": 167772160,
"step": 320,
"train_runtime": 1497.6772,
"train_tokens_per_second": 112021.577
},
{
"epoch": 0.01785762601801997,
"grad_norm": 1.001007318496704,
"learning_rate": 0.0032900000000000004,
"loss": 4.6716564178466795,
"num_input_tokens_seen": 173015040,
"step": 330,
"train_runtime": 1542.9302,
"train_tokens_per_second": 112134.066
},
{
"epoch": 0.01839876620038421,
"grad_norm": 0.9968867897987366,
"learning_rate": 0.0033900000000000002,
"loss": 4.6183723449707035,
"num_input_tokens_seen": 178257920,
"step": 340,
"train_runtime": 1588.1903,
"train_tokens_per_second": 112239.647
},
{
"epoch": 0.01893990638274845,
"grad_norm": 0.9285950064659119,
"learning_rate": 0.00349,
"loss": 4.572103881835938,
"num_input_tokens_seen": 183500800,
"step": 350,
"train_runtime": 1633.4577,
"train_tokens_per_second": 112338.875
},
{
"epoch": 0.01948104656511269,
"grad_norm": 0.989262044429779,
"learning_rate": 0.00359,
"loss": 4.529151153564453,
"num_input_tokens_seen": 188743680,
"step": 360,
"train_runtime": 1678.7231,
"train_tokens_per_second": 112432.882
},
{
"epoch": 0.020022186747476935,
"grad_norm": 1.0208923816680908,
"learning_rate": 0.00369,
"loss": 4.4996803283691404,
"num_input_tokens_seen": 193986560,
"step": 370,
"train_runtime": 1724.0055,
"train_tokens_per_second": 112520.846
},
{
"epoch": 0.020563326929841176,
"grad_norm": 0.9494571089744568,
"learning_rate": 0.00379,
"loss": 4.4542186737060545,
"num_input_tokens_seen": 199229440,
"step": 380,
"train_runtime": 1769.3003,
"train_tokens_per_second": 112603.517
},
{
"epoch": 0.021104467112205417,
"grad_norm": 0.7988581657409668,
"learning_rate": 0.0038900000000000002,
"loss": 4.431841278076172,
"num_input_tokens_seen": 204472320,
"step": 390,
"train_runtime": 1818.3866,
"train_tokens_per_second": 112447.111
},
{
"epoch": 0.021645607294569658,
"grad_norm": 0.832046389579773,
"learning_rate": 0.0039900000000000005,
"loss": 4.396000671386719,
"num_input_tokens_seen": 209715200,
"step": 400,
"train_runtime": 1863.6718,
"train_tokens_per_second": 112527.967
},
{
"epoch": 0.0221867474769339,
"grad_norm": 0.8342320919036865,
"learning_rate": 0.00409,
"loss": 4.37408332824707,
"num_input_tokens_seen": 214958080,
"step": 410,
"train_runtime": 1909.0219,
"train_tokens_per_second": 112601.162
},
{
"epoch": 0.022727887659298143,
"grad_norm": 0.9766927361488342,
"learning_rate": 0.00419,
"loss": 4.35020637512207,
"num_input_tokens_seen": 220200960,
"step": 420,
"train_runtime": 1954.3817,
"train_tokens_per_second": 112670.397
},
{
"epoch": 0.023269027841662383,
"grad_norm": 0.8501082062721252,
"learning_rate": 0.00429,
"loss": 4.312299346923828,
"num_input_tokens_seen": 225443840,
"step": 430,
"train_runtime": 1999.72,
"train_tokens_per_second": 112737.702
},
{
"epoch": 0.023810168024026624,
"grad_norm": 0.8430765867233276,
"learning_rate": 0.00439,
"loss": 4.310842895507813,
"num_input_tokens_seen": 230686720,
"step": 440,
"train_runtime": 2045.0616,
"train_tokens_per_second": 112801.843
},
{
"epoch": 0.024351308206390865,
"grad_norm": 0.7848499417304993,
"learning_rate": 0.00449,
"loss": 4.2807456970214846,
"num_input_tokens_seen": 235929600,
"step": 450,
"train_runtime": 2090.4316,
"train_tokens_per_second": 112861.668
},
{
"epoch": 0.024892448388755106,
"grad_norm": 0.9066799879074097,
"learning_rate": 0.00459,
"loss": 4.257038879394531,
"num_input_tokens_seen": 241172480,
"step": 460,
"train_runtime": 2135.8061,
"train_tokens_per_second": 112918.713
},
{
"epoch": 0.02543358857111935,
"grad_norm": 0.7888091802597046,
"learning_rate": 0.00469,
"loss": 4.235981750488281,
"num_input_tokens_seen": 246415360,
"step": 470,
"train_runtime": 2181.1434,
"train_tokens_per_second": 112975.315
},
{
"epoch": 0.02597472875348359,
"grad_norm": 0.6987936496734619,
"learning_rate": 0.00479,
"loss": 4.215058135986328,
"num_input_tokens_seen": 251658240,
"step": 480,
"train_runtime": 2226.5022,
"train_tokens_per_second": 113028.517
},
{
"epoch": 0.02651586893584783,
"grad_norm": 0.8686115741729736,
"learning_rate": 0.00489,
"loss": 4.219595336914063,
"num_input_tokens_seen": 256901120,
"step": 490,
"train_runtime": 2271.8741,
"train_tokens_per_second": 113078.945
},
{
"epoch": 0.027057009118212072,
"grad_norm": 0.9207416772842407,
"learning_rate": 0.0049900000000000005,
"loss": 4.1927734375,
"num_input_tokens_seen": 262144000,
"step": 500,
"train_runtime": 2317.2129,
"train_tokens_per_second": 113129.008
},
{
"epoch": 0.027057009118212072,
"eval_loss": 4.101890563964844,
"eval_runtime": 2.0015,
"eval_samples_per_second": 249.812,
"eval_steps_per_second": 3.997,
"num_input_tokens_seen": 262144000,
"step": 500
},
{
"epoch": 0.027598149300576313,
"grad_norm": 0.7904194593429565,
"learning_rate": 0.0049999972179955365,
"loss": 4.170513916015625,
"num_input_tokens_seen": 267386880,
"step": 510,
"train_runtime": 2364.5371,
"train_tokens_per_second": 113082.126
},
{
"epoch": 0.028139289482940557,
"grad_norm": 0.63487708568573,
"learning_rate": 0.004999987601198816,
"loss": 4.152132034301758,
"num_input_tokens_seen": 272629760,
"step": 520,
"train_runtime": 2409.8408,
"train_tokens_per_second": 113131.854
},
{
"epoch": 0.028680429665304798,
"grad_norm": 0.519443154335022,
"learning_rate": 0.0049999711152934586,
"loss": 4.145978164672852,
"num_input_tokens_seen": 277872640,
"step": 530,
"train_runtime": 2455.1664,
"train_tokens_per_second": 113178.739
},
{
"epoch": 0.02922156984766904,
"grad_norm": 0.5551373362541199,
"learning_rate": 0.004999947760329793,
"loss": 4.118004608154297,
"num_input_tokens_seen": 283115520,
"step": 540,
"train_runtime": 2500.4657,
"train_tokens_per_second": 113225.118
},
{
"epoch": 0.02976271003003328,
"grad_norm": 0.48466068506240845,
"learning_rate": 0.004999917536379122,
"loss": 4.0990447998046875,
"num_input_tokens_seen": 288358400,
"step": 550,
"train_runtime": 2545.7721,
"train_tokens_per_second": 113269.528
},
{
"epoch": 0.03030385021239752,
"grad_norm": 0.4300881624221802,
"learning_rate": 0.004999880443533718,
"loss": 4.095553207397461,
"num_input_tokens_seen": 293601280,
"step": 560,
"train_runtime": 2591.05,
"train_tokens_per_second": 113313.63
},
{
"epoch": 0.030844990394761764,
"grad_norm": 0.3266729414463043,
"learning_rate": 0.004999836481906822,
"loss": 4.074318313598633,
"num_input_tokens_seen": 298844160,
"step": 570,
"train_runtime": 2636.411,
"train_tokens_per_second": 113352.647
},
{
"epoch": 0.031386130577126005,
"grad_norm": 0.34210285544395447,
"learning_rate": 0.004999785651632649,
"loss": 4.055056762695313,
"num_input_tokens_seen": 304087040,
"step": 580,
"train_runtime": 2681.696,
"train_tokens_per_second": 113393.554
},
{
"epoch": 0.03192727075949025,
"grad_norm": 0.3171045482158661,
"learning_rate": 0.004999727952866382,
"loss": 4.028103637695312,
"num_input_tokens_seen": 309329920,
"step": 590,
"train_runtime": 2726.9513,
"train_tokens_per_second": 113434.342
},
{
"epoch": 0.032468410941854486,
"grad_norm": 0.28656497597694397,
"learning_rate": 0.00499966338578417,
"loss": 4.014062118530274,
"num_input_tokens_seen": 314572800,
"step": 600,
"train_runtime": 2772.2472,
"train_tokens_per_second": 113472.131
},
{
"epoch": 0.03300955112421873,
"grad_norm": 0.31004276871681213,
"learning_rate": 0.004999591950583134,
"loss": 4.000431060791016,
"num_input_tokens_seen": 319815680,
"step": 610,
"train_runtime": 2817.5313,
"train_tokens_per_second": 113509.186
},
{
"epoch": 0.03355069130658297,
"grad_norm": 0.29579785466194153,
"learning_rate": 0.004999513647481364,
"loss": 3.9810386657714845,
"num_input_tokens_seen": 325058560,
"step": 620,
"train_runtime": 2862.8161,
"train_tokens_per_second": 113545.036
},
{
"epoch": 0.03409183148894721,
"grad_norm": 0.28329184651374817,
"learning_rate": 0.0049994284767179145,
"loss": 3.975200653076172,
"num_input_tokens_seen": 330301440,
"step": 630,
"train_runtime": 2908.1102,
"train_tokens_per_second": 113579.411
},
{
"epoch": 0.034632971671311456,
"grad_norm": 0.2848559319972992,
"learning_rate": 0.004999336438552809,
"loss": 3.9574630737304686,
"num_input_tokens_seen": 335544320,
"step": 640,
"train_runtime": 2953.403,
"train_tokens_per_second": 113612.776
},
{
"epoch": 0.035174111853675694,
"grad_norm": 0.2778968811035156,
"learning_rate": 0.004999237533267034,
"loss": 3.951917266845703,
"num_input_tokens_seen": 340787200,
"step": 650,
"train_runtime": 2998.7048,
"train_tokens_per_second": 113644.799
},
{
"epoch": 0.03571525203603994,
"grad_norm": 0.28124260902404785,
"learning_rate": 0.004999131761162544,
"loss": 3.93038330078125,
"num_input_tokens_seen": 346030080,
"step": 660,
"train_runtime": 3044.0205,
"train_tokens_per_second": 113675.344
},
{
"epoch": 0.036256392218404175,
"grad_norm": 0.25421732664108276,
"learning_rate": 0.004999019122562258,
"loss": 3.9207611083984375,
"num_input_tokens_seen": 351272960,
"step": 670,
"train_runtime": 3089.3299,
"train_tokens_per_second": 113705.227
},
{
"epoch": 0.03679753240076842,
"grad_norm": 0.2740730345249176,
"learning_rate": 0.0049988996178100525,
"loss": 3.91453857421875,
"num_input_tokens_seen": 356515840,
"step": 680,
"train_runtime": 3134.6017,
"train_tokens_per_second": 113735.61
},
{
"epoch": 0.037338672583132664,
"grad_norm": 0.2670656740665436,
"learning_rate": 0.004998773247270772,
"loss": 3.884227752685547,
"num_input_tokens_seen": 361758720,
"step": 690,
"train_runtime": 3179.9122,
"train_tokens_per_second": 113763.746
},
{
"epoch": 0.0378798127654969,
"grad_norm": 0.2549172341823578,
"learning_rate": 0.004998640011330221,
"loss": 3.880903625488281,
"num_input_tokens_seen": 367001600,
"step": 700,
"train_runtime": 3225.2126,
"train_tokens_per_second": 113791.443
},
{
"epoch": 0.038420952947861145,
"grad_norm": 0.23274943232536316,
"learning_rate": 0.004998499910395162,
"loss": 3.8808818817138673,
"num_input_tokens_seen": 372244480,
"step": 710,
"train_runtime": 3270.4782,
"train_tokens_per_second": 113819.588
},
{
"epoch": 0.03896209313022538,
"grad_norm": 0.2661728858947754,
"learning_rate": 0.004998352944893316,
"loss": 3.860551452636719,
"num_input_tokens_seen": 377487360,
"step": 720,
"train_runtime": 3315.7715,
"train_tokens_per_second": 113846.012
},
{
"epoch": 0.039503233312589627,
"grad_norm": 0.27070483565330505,
"learning_rate": 0.004998199115273362,
"loss": 3.8578773498535157,
"num_input_tokens_seen": 382730240,
"step": 730,
"train_runtime": 3361.0384,
"train_tokens_per_second": 113872.616
},
{
"epoch": 0.04004437349495387,
"grad_norm": 0.2620537281036377,
"learning_rate": 0.004998038422004937,
"loss": 3.8334423065185548,
"num_input_tokens_seen": 387973120,
"step": 740,
"train_runtime": 3406.3177,
"train_tokens_per_second": 113898.102
},
{
"epoch": 0.04058551367731811,
"grad_norm": 0.24665935337543488,
"learning_rate": 0.004997870865578627,
"loss": 3.830191802978516,
"num_input_tokens_seen": 393216000,
"step": 750,
"train_runtime": 3451.6094,
"train_tokens_per_second": 113922.508
},
{
"epoch": 0.04112665385968235,
"grad_norm": 0.3058369755744934,
"learning_rate": 0.004997696446505975,
"loss": 3.81226806640625,
"num_input_tokens_seen": 398458880,
"step": 760,
"train_runtime": 3496.8514,
"train_tokens_per_second": 113947.901
},
{
"epoch": 0.04166779404204659,
"grad_norm": 0.24344538152217865,
"learning_rate": 0.004997515165319476,
"loss": 3.8191978454589846,
"num_input_tokens_seen": 403701760,
"step": 770,
"train_runtime": 3545.7622,
"train_tokens_per_second": 113854.718
},
{
"epoch": 0.042208934224410834,
"grad_norm": 0.26970189809799194,
"learning_rate": 0.004997327022572571,
"loss": 3.794965362548828,
"num_input_tokens_seen": 408944640,
"step": 780,
"train_runtime": 3591.0695,
"train_tokens_per_second": 113878.231
},
{
"epoch": 0.04275007440677508,
"grad_norm": 0.2699701189994812,
"learning_rate": 0.0049971320188396525,
"loss": 3.7990867614746096,
"num_input_tokens_seen": 414187520,
"step": 790,
"train_runtime": 3636.3454,
"train_tokens_per_second": 113902.14
},
{
"epoch": 0.043291214589139315,
"grad_norm": 0.24337078630924225,
"learning_rate": 0.004996930154716057,
"loss": 3.795510101318359,
"num_input_tokens_seen": 419430400,
"step": 800,
"train_runtime": 3681.6305,
"train_tokens_per_second": 113925.175
},
{
"epoch": 0.04383235477150356,
"grad_norm": 0.24991652369499207,
"learning_rate": 0.004996721430818068,
"loss": 3.7792850494384767,
"num_input_tokens_seen": 424673280,
"step": 810,
"train_runtime": 3726.9273,
"train_tokens_per_second": 113947.293
},
{
"epoch": 0.0443734949538678,
"grad_norm": 0.22850197553634644,
"learning_rate": 0.004996505847782908,
"loss": 3.7752288818359374,
"num_input_tokens_seen": 429916160,
"step": 820,
"train_runtime": 3772.1962,
"train_tokens_per_second": 113969.725
},
{
"epoch": 0.04491463513623204,
"grad_norm": 0.24704036116600037,
"learning_rate": 0.004996283406268743,
"loss": 3.7673095703125,
"num_input_tokens_seen": 435159040,
"step": 830,
"train_runtime": 3817.4555,
"train_tokens_per_second": 113991.908
},
{
"epoch": 0.045455775318596285,
"grad_norm": 0.24149645864963531,
"learning_rate": 0.004996054106954677,
"loss": 3.767901611328125,
"num_input_tokens_seen": 440401920,
"step": 840,
"train_runtime": 3862.7306,
"train_tokens_per_second": 114013.106
},
{
"epoch": 0.04599691550096052,
"grad_norm": 0.26389098167419434,
"learning_rate": 0.004995817950540749,
"loss": 3.765447998046875,
"num_input_tokens_seen": 445644800,
"step": 850,
"train_runtime": 3908.0129,
"train_tokens_per_second": 114033.605
},
{
"epoch": 0.04653805568332477,
"grad_norm": 0.2389504611492157,
"learning_rate": 0.004995574937747936,
"loss": 3.7446453094482424,
"num_input_tokens_seen": 450887680,
"step": 860,
"train_runtime": 3953.2772,
"train_tokens_per_second": 114054.151
},
{
"epoch": 0.047079195865689004,
"grad_norm": 0.21696795523166656,
"learning_rate": 0.0049953250693181425,
"loss": 3.7382736206054688,
"num_input_tokens_seen": 456130560,
"step": 870,
"train_runtime": 3998.5453,
"train_tokens_per_second": 114074.125
},
{
"epoch": 0.04762033604805325,
"grad_norm": 0.23217777907848358,
"learning_rate": 0.004995068346014207,
"loss": 3.7418495178222657,
"num_input_tokens_seen": 461373440,
"step": 880,
"train_runtime": 4043.8212,
"train_tokens_per_second": 114093.431
},
{
"epoch": 0.04816147623041749,
"grad_norm": 0.25520190596580505,
"learning_rate": 0.004994804768619892,
"loss": 3.7273784637451173,
"num_input_tokens_seen": 466616320,
"step": 890,
"train_runtime": 4089.1251,
"train_tokens_per_second": 114111.53
},
{
"epoch": 0.04870261641278173,
"grad_norm": 0.2495919018983841,
"learning_rate": 0.004994534337939889,
"loss": 3.7182594299316407,
"num_input_tokens_seen": 471859200,
"step": 900,
"train_runtime": 4134.3995,
"train_tokens_per_second": 114130.045
},
{
"epoch": 0.049243756595145974,
"grad_norm": 0.2571962773799896,
"learning_rate": 0.00499425705479981,
"loss": 3.7261619567871094,
"num_input_tokens_seen": 477102080,
"step": 910,
"train_runtime": 4179.6624,
"train_tokens_per_second": 114148.472
},
{
"epoch": 0.04978489677751021,
"grad_norm": 0.2216644585132599,
"learning_rate": 0.004993972920046188,
"loss": 3.705414581298828,
"num_input_tokens_seen": 482344960,
"step": 920,
"train_runtime": 4224.9503,
"train_tokens_per_second": 114165.831
},
{
"epoch": 0.050326036959874455,
"grad_norm": 0.2777004539966583,
"learning_rate": 0.004993681934546471,
"loss": 3.707286834716797,
"num_input_tokens_seen": 487587840,
"step": 930,
"train_runtime": 4270.2223,
"train_tokens_per_second": 114183.246
},
{
"epoch": 0.0508671771422387,
"grad_norm": 0.23501209914684296,
"learning_rate": 0.004993384099189028,
"loss": 3.7012203216552733,
"num_input_tokens_seen": 492830720,
"step": 940,
"train_runtime": 4315.4919,
"train_tokens_per_second": 114200.358
},
{
"epoch": 0.05140831732460294,
"grad_norm": 0.2504929304122925,
"learning_rate": 0.004993079414883134,
"loss": 3.7007171630859377,
"num_input_tokens_seen": 498073600,
"step": 950,
"train_runtime": 4360.758,
"train_tokens_per_second": 114217.207
},
{
"epoch": 0.05194945750696718,
"grad_norm": 0.265903502702713,
"learning_rate": 0.004992767882558976,
"loss": 3.6977813720703123,
"num_input_tokens_seen": 503316480,
"step": 960,
"train_runtime": 4406.0254,
"train_tokens_per_second": 114233.676
},
{
"epoch": 0.05249059768933142,
"grad_norm": 0.22946324944496155,
"learning_rate": 0.00499244950316765,
"loss": 3.6873912811279297,
"num_input_tokens_seen": 508559360,
"step": 970,
"train_runtime": 4451.29,
"train_tokens_per_second": 114249.883
},
{
"epoch": 0.05303173787169566,
"grad_norm": 0.2554706633090973,
"learning_rate": 0.004992124277681152,
"loss": 3.6791450500488283,
"num_input_tokens_seen": 513802240,
"step": 980,
"train_runtime": 4496.5462,
"train_tokens_per_second": 114265.975
},
{
"epoch": 0.05357287805405991,
"grad_norm": 0.22852079570293427,
"learning_rate": 0.004991792207092381,
"loss": 3.677058792114258,
"num_input_tokens_seen": 519045120,
"step": 990,
"train_runtime": 4541.8028,
"train_tokens_per_second": 114281.739
},
{
"epoch": 0.054114018236424144,
"grad_norm": 0.24798494577407837,
"learning_rate": 0.004991453292415134,
"loss": 3.657318115234375,
"num_input_tokens_seen": 524288000,
"step": 1000,
"train_runtime": 4587.0445,
"train_tokens_per_second": 114297.561
},
{
"epoch": 0.054114018236424144,
"eval_loss": 3.6001899242401123,
"eval_runtime": 1.9848,
"eval_samples_per_second": 251.913,
"eval_steps_per_second": 4.031,
"num_input_tokens_seen": 524288000,
"step": 1000
},
{
"epoch": 0.05465515841878839,
"grad_norm": 0.223563551902771,
"learning_rate": 0.0049911075346841,
"loss": 3.666912841796875,
"num_input_tokens_seen": 529530880,
"step": 1010,
"train_runtime": 4637.4751,
"train_tokens_per_second": 114185.17
},
{
"epoch": 0.055196298601152625,
"grad_norm": 0.24604271352291107,
"learning_rate": 0.004990754934954863,
"loss": 3.6610164642333984,
"num_input_tokens_seen": 534773760,
"step": 1020,
"train_runtime": 4682.7302,
"train_tokens_per_second": 114201.276
},
{
"epoch": 0.05573743878351687,
"grad_norm": 0.2436058074235916,
"learning_rate": 0.004990395494303893,
"loss": 3.6538921356201173,
"num_input_tokens_seen": 540016640,
"step": 1030,
"train_runtime": 4727.9737,
"train_tokens_per_second": 114217.353
},
{
"epoch": 0.056278578965881114,
"grad_norm": 0.24788981676101685,
"learning_rate": 0.004990029213828546,
"loss": 3.6453926086425783,
"num_input_tokens_seen": 545259520,
"step": 1040,
"train_runtime": 4773.2764,
"train_tokens_per_second": 114231.708
},
{
"epoch": 0.05681971914824535,
"grad_norm": 0.2355376034975052,
"learning_rate": 0.00498965609464706,
"loss": 3.653607940673828,
"num_input_tokens_seen": 550502400,
"step": 1050,
"train_runtime": 4818.5445,
"train_tokens_per_second": 114246.615
},
{
"epoch": 0.057360859330609595,
"grad_norm": 0.24511760473251343,
"learning_rate": 0.0049892761378985484,
"loss": 3.655783462524414,
"num_input_tokens_seen": 555745280,
"step": 1060,
"train_runtime": 4863.8191,
"train_tokens_per_second": 114261.091
},
{
"epoch": 0.05790199951297383,
"grad_norm": 0.2463475465774536,
"learning_rate": 0.004988889344743005,
"loss": 3.6497840881347656,
"num_input_tokens_seen": 560988160,
"step": 1070,
"train_runtime": 4909.1151,
"train_tokens_per_second": 114274.804
},
{
"epoch": 0.05844313969533808,
"grad_norm": 0.24649877846240997,
"learning_rate": 0.00498849571636129,
"loss": 3.6289398193359377,
"num_input_tokens_seen": 566231040,
"step": 1080,
"train_runtime": 4954.3585,
"train_tokens_per_second": 114289.476
},
{
"epoch": 0.05898427987770232,
"grad_norm": 0.21440814435482025,
"learning_rate": 0.004988095253955132,
"loss": 3.6420303344726563,
"num_input_tokens_seen": 571473920,
"step": 1090,
"train_runtime": 4999.6131,
"train_tokens_per_second": 114303.629
},
{
"epoch": 0.05952542006006656,
"grad_norm": 0.23143576085567474,
"learning_rate": 0.004987687958747124,
"loss": 3.636464309692383,
"num_input_tokens_seen": 576716800,
"step": 1100,
"train_runtime": 5044.8489,
"train_tokens_per_second": 114317.952
},
{
"epoch": 0.0600665602424308,
"grad_norm": 0.216554194688797,
"learning_rate": 0.0049872738319807226,
"loss": 3.6284786224365235,
"num_input_tokens_seen": 581959680,
"step": 1110,
"train_runtime": 5090.1116,
"train_tokens_per_second": 114331.419
},
{
"epoch": 0.06060770042479504,
"grad_norm": 0.21454273164272308,
"learning_rate": 0.004986852874920234,
"loss": 3.628643035888672,
"num_input_tokens_seen": 587202560,
"step": 1120,
"train_runtime": 5135.379,
"train_tokens_per_second": 114344.542
},
{
"epoch": 0.061148840607159284,
"grad_norm": 0.22195634245872498,
"learning_rate": 0.004986425088850824,
"loss": 3.6224212646484375,
"num_input_tokens_seen": 592445440,
"step": 1130,
"train_runtime": 5180.6463,
"train_tokens_per_second": 114357.438
},
{
"epoch": 0.06168998078952353,
"grad_norm": 0.23462195694446564,
"learning_rate": 0.004985990475078501,
"loss": 3.614238739013672,
"num_input_tokens_seen": 597688320,
"step": 1140,
"train_runtime": 5225.8696,
"train_tokens_per_second": 114371.074
},
{
"epoch": 0.062231120971887766,
"grad_norm": 0.2454216629266739,
"learning_rate": 0.004985549034930123,
"loss": 3.6097618103027345,
"num_input_tokens_seen": 602931200,
"step": 1150,
"train_runtime": 5274.713,
"train_tokens_per_second": 114305.973
},
{
"epoch": 0.06277226115425201,
"grad_norm": 0.22363615036010742,
"learning_rate": 0.004985100769753384,
"loss": 3.605723571777344,
"num_input_tokens_seen": 608174080,
"step": 1160,
"train_runtime": 5319.9954,
"train_tokens_per_second": 114318.535
},
{
"epoch": 0.06331340133661625,
"grad_norm": 0.2078346163034439,
"learning_rate": 0.00498464568091682,
"loss": 3.602735900878906,
"num_input_tokens_seen": 613416960,
"step": 1170,
"train_runtime": 5365.2147,
"train_tokens_per_second": 114332.23
},
{
"epoch": 0.0638545415189805,
"grad_norm": 0.21972794830799103,
"learning_rate": 0.004984183769809795,
"loss": 3.598741912841797,
"num_input_tokens_seen": 618659840,
"step": 1180,
"train_runtime": 5410.4312,
"train_tokens_per_second": 114345.754
},
{
"epoch": 0.06439568170134473,
"grad_norm": 0.2427527755498886,
"learning_rate": 0.0049837150378425005,
"loss": 3.596208190917969,
"num_input_tokens_seen": 623902720,
"step": 1190,
"train_runtime": 5455.6665,
"train_tokens_per_second": 114358.662
},
{
"epoch": 0.06493682188370897,
"grad_norm": 0.2279594987630844,
"learning_rate": 0.004983239486445956,
"loss": 3.59366455078125,
"num_input_tokens_seen": 629145600,
"step": 1200,
"train_runtime": 5500.922,
"train_tokens_per_second": 114370.936
},
{
"epoch": 0.06547796206607322,
"grad_norm": 0.23130950331687927,
"learning_rate": 0.004982757117071998,
"loss": 3.592302703857422,
"num_input_tokens_seen": 634388480,
"step": 1210,
"train_runtime": 5546.1769,
"train_tokens_per_second": 114383.024
},
{
"epoch": 0.06601910224843746,
"grad_norm": 0.2286449670791626,
"learning_rate": 0.004982267931193276,
"loss": 3.5859790802001954,
"num_input_tokens_seen": 639631360,
"step": 1220,
"train_runtime": 5591.4258,
"train_tokens_per_second": 114395.038
},
{
"epoch": 0.0665602424308017,
"grad_norm": 0.23779889941215515,
"learning_rate": 0.004981771930303254,
"loss": 3.586525726318359,
"num_input_tokens_seen": 644874240,
"step": 1230,
"train_runtime": 5636.6531,
"train_tokens_per_second": 114407.297
},
{
"epoch": 0.06710138261316594,
"grad_norm": 0.23324504494667053,
"learning_rate": 0.004981269115916199,
"loss": 3.579142379760742,
"num_input_tokens_seen": 650117120,
"step": 1240,
"train_runtime": 5681.8961,
"train_tokens_per_second": 114419.044
},
{
"epoch": 0.06764252279553018,
"grad_norm": 0.2067607045173645,
"learning_rate": 0.004980759489567181,
"loss": 3.5813358306884764,
"num_input_tokens_seen": 655360000,
"step": 1250,
"train_runtime": 5727.162,
"train_tokens_per_second": 114430.148
},
{
"epoch": 0.06818366297789442,
"grad_norm": 0.20190924406051636,
"learning_rate": 0.004980243052812064,
"loss": 3.572435760498047,
"num_input_tokens_seen": 660602880,
"step": 1260,
"train_runtime": 5772.4268,
"train_tokens_per_second": 114441.102
},
{
"epoch": 0.06872480316025867,
"grad_norm": 0.19773253798484802,
"learning_rate": 0.004979719807227508,
"loss": 3.5610916137695314,
"num_input_tokens_seen": 665845760,
"step": 1270,
"train_runtime": 5817.687,
"train_tokens_per_second": 114451.974
},
{
"epoch": 0.06926594334262291,
"grad_norm": 0.21706561744213104,
"learning_rate": 0.004979189754410956,
"loss": 3.5655101776123046,
"num_input_tokens_seen": 671088640,
"step": 1280,
"train_runtime": 5862.9264,
"train_tokens_per_second": 114463.084
},
{
"epoch": 0.06980708352498714,
"grad_norm": 0.23492570221424103,
"learning_rate": 0.004978652895980635,
"loss": 3.571335220336914,
"num_input_tokens_seen": 676331520,
"step": 1290,
"train_runtime": 5908.1779,
"train_tokens_per_second": 114473.792
},
{
"epoch": 0.07034822370735139,
"grad_norm": 0.23728297650814056,
"learning_rate": 0.004978109233575551,
"loss": 3.5683116912841797,
"num_input_tokens_seen": 681574400,
"step": 1300,
"train_runtime": 5953.4117,
"train_tokens_per_second": 114484.674
},
{
"epoch": 0.07088936388971563,
"grad_norm": 0.2128531038761139,
"learning_rate": 0.0049775587688554775,
"loss": 3.553203582763672,
"num_input_tokens_seen": 686817280,
"step": 1310,
"train_runtime": 5998.6316,
"train_tokens_per_second": 114495.659
},
{
"epoch": 0.07143050407207988,
"grad_norm": 0.22945411503314972,
"learning_rate": 0.004977001503500959,
"loss": 3.5565677642822267,
"num_input_tokens_seen": 692060160,
"step": 1320,
"train_runtime": 6043.8634,
"train_tokens_per_second": 114506.254
},
{
"epoch": 0.07197164425444412,
"grad_norm": 0.21733802556991577,
"learning_rate": 0.004976437439213302,
"loss": 3.5509429931640626,
"num_input_tokens_seen": 697303040,
"step": 1330,
"train_runtime": 6089.0954,
"train_tokens_per_second": 114516.688
},
{
"epoch": 0.07251278443680835,
"grad_norm": 0.24347279965877533,
"learning_rate": 0.004975866577714568,
"loss": 3.54642333984375,
"num_input_tokens_seen": 702545920,
"step": 1340,
"train_runtime": 6134.3055,
"train_tokens_per_second": 114527.376
},
{
"epoch": 0.0730539246191726,
"grad_norm": 0.21520718932151794,
"learning_rate": 0.004975288920747571,
"loss": 3.550141143798828,
"num_input_tokens_seen": 707788800,
"step": 1350,
"train_runtime": 6179.5539,
"train_tokens_per_second": 114537.201
},
{
"epoch": 0.07359506480153684,
"grad_norm": 0.23248061537742615,
"learning_rate": 0.0049747044700758705,
"loss": 3.5488357543945312,
"num_input_tokens_seen": 713031680,
"step": 1360,
"train_runtime": 6224.7742,
"train_tokens_per_second": 114547.397
},
{
"epoch": 0.07413620498390108,
"grad_norm": 0.23462453484535217,
"learning_rate": 0.004974113227483768,
"loss": 3.5416290283203127,
"num_input_tokens_seen": 718274560,
"step": 1370,
"train_runtime": 6269.9891,
"train_tokens_per_second": 114557.544
},
{
"epoch": 0.07467734516626533,
"grad_norm": 0.2253153920173645,
"learning_rate": 0.004973515194776301,
"loss": 3.540643310546875,
"num_input_tokens_seen": 723517440,
"step": 1380,
"train_runtime": 6315.2141,
"train_tokens_per_second": 114567.365
},
{
"epoch": 0.07521848534862956,
"grad_norm": 0.21088625490665436,
"learning_rate": 0.0049729103737792355,
"loss": 3.543656921386719,
"num_input_tokens_seen": 728760320,
"step": 1390,
"train_runtime": 6360.4473,
"train_tokens_per_second": 114576.898
},
{
"epoch": 0.0757596255309938,
"grad_norm": 0.2161734253168106,
"learning_rate": 0.00497229876633906,
"loss": 3.5383941650390627,
"num_input_tokens_seen": 734003200,
"step": 1400,
"train_runtime": 6405.6811,
"train_tokens_per_second": 114586.285
},
{
"epoch": 0.07630076571335805,
"grad_norm": 0.20709098875522614,
"learning_rate": 0.004971680374322986,
"loss": 3.5313690185546873,
"num_input_tokens_seen": 739246080,
"step": 1410,
"train_runtime": 6450.8882,
"train_tokens_per_second": 114596.015
},
{
"epoch": 0.07684190589572229,
"grad_norm": 0.20399479568004608,
"learning_rate": 0.004971055199618935,
"loss": 3.525136184692383,
"num_input_tokens_seen": 744488960,
"step": 1420,
"train_runtime": 6496.1128,
"train_tokens_per_second": 114605.3
},
{
"epoch": 0.07738304607808653,
"grad_norm": 0.21809037029743195,
"learning_rate": 0.004970423244135538,
"loss": 3.53038330078125,
"num_input_tokens_seen": 749731840,
"step": 1430,
"train_runtime": 6541.3447,
"train_tokens_per_second": 114614.33
},
{
"epoch": 0.07792418626045076,
"grad_norm": 0.22500748932361603,
"learning_rate": 0.004969784509802125,
"loss": 3.5225593566894533,
"num_input_tokens_seen": 754974720,
"step": 1440,
"train_runtime": 6586.5647,
"train_tokens_per_second": 114623.442
},
{
"epoch": 0.07846532644281501,
"grad_norm": 0.22614286839962006,
"learning_rate": 0.0049691389985687204,
"loss": 3.5291175842285156,
"num_input_tokens_seen": 760217600,
"step": 1450,
"train_runtime": 6631.7972,
"train_tokens_per_second": 114632.215
},
{
"epoch": 0.07900646662517925,
"grad_norm": 0.2029477208852768,
"learning_rate": 0.004968486712406044,
"loss": 3.5189224243164063,
"num_input_tokens_seen": 765460480,
"step": 1460,
"train_runtime": 6677.0236,
"train_tokens_per_second": 114640.973
},
{
"epoch": 0.0795476068075435,
"grad_norm": 0.2242126613855362,
"learning_rate": 0.004967827653305494,
"loss": 3.504582977294922,
"num_input_tokens_seen": 770703360,
"step": 1470,
"train_runtime": 6722.2724,
"train_tokens_per_second": 114649.232
},
{
"epoch": 0.08008874698990774,
"grad_norm": 0.2245851755142212,
"learning_rate": 0.004967161823279147,
"loss": 3.5151710510253906,
"num_input_tokens_seen": 775946240,
"step": 1480,
"train_runtime": 6767.5092,
"train_tokens_per_second": 114657.582
},
{
"epoch": 0.08062988717227197,
"grad_norm": 0.21762171387672424,
"learning_rate": 0.004966489224359752,
"loss": 3.510267639160156,
"num_input_tokens_seen": 781189120,
"step": 1490,
"train_runtime": 6812.7609,
"train_tokens_per_second": 114665.571
},
{
"epoch": 0.08117102735463622,
"grad_norm": 0.22595250606536865,
"learning_rate": 0.0049658098586007225,
"loss": 3.515250396728516,
"num_input_tokens_seen": 786432000,
"step": 1500,
"train_runtime": 6857.9968,
"train_tokens_per_second": 114673.719
},
{
"epoch": 0.08117102735463622,
"eval_loss": 3.439197540283203,
"eval_runtime": 1.986,
"eval_samples_per_second": 251.759,
"eval_steps_per_second": 4.028,
"num_input_tokens_seen": 786432000,
"step": 1500
},
{
"epoch": 0.08171216753700046,
"grad_norm": 0.2041054517030716,
"learning_rate": 0.00496512372807613,
"loss": 3.5051536560058594,
"num_input_tokens_seen": 791674880,
"step": 1510,
"train_runtime": 6905.2279,
"train_tokens_per_second": 114648.624
},
{
"epoch": 0.0822533077193647,
"grad_norm": 0.20704089105129242,
"learning_rate": 0.004964430834880702,
"loss": 3.498210906982422,
"num_input_tokens_seen": 796917760,
"step": 1520,
"train_runtime": 6950.4841,
"train_tokens_per_second": 114656.439
},
{
"epoch": 0.08279444790172895,
"grad_norm": 0.2235393226146698,
"learning_rate": 0.0049637311811298055,
"loss": 3.510853958129883,
"num_input_tokens_seen": 802160640,
"step": 1530,
"train_runtime": 6999.4133,
"train_tokens_per_second": 114603.982
},
{
"epoch": 0.08333558808409318,
"grad_norm": 0.19351601600646973,
"learning_rate": 0.004963024768959454,
"loss": 3.4970939636230467,
"num_input_tokens_seen": 807403520,
"step": 1540,
"train_runtime": 7044.6706,
"train_tokens_per_second": 114611.962
},
{
"epoch": 0.08387672826645742,
"grad_norm": 0.2104959934949875,
"learning_rate": 0.0049623116005262915,
"loss": 3.5016387939453124,
"num_input_tokens_seen": 812646400,
"step": 1550,
"train_runtime": 7089.915,
"train_tokens_per_second": 114620.048
},
{
"epoch": 0.08441786844882167,
"grad_norm": 0.25421494245529175,
"learning_rate": 0.004961591678007588,
"loss": 3.50089111328125,
"num_input_tokens_seen": 817889280,
"step": 1560,
"train_runtime": 7135.1609,
"train_tokens_per_second": 114628.008
},
{
"epoch": 0.08495900863118591,
"grad_norm": 0.2085292786359787,
"learning_rate": 0.004960865003601232,
"loss": 3.5003082275390627,
"num_input_tokens_seen": 823132160,
"step": 1570,
"train_runtime": 7180.3935,
"train_tokens_per_second": 114636.079
},
{
"epoch": 0.08550014881355016,
"grad_norm": 0.2042287439107895,
"learning_rate": 0.00496013157952573,
"loss": 3.495660400390625,
"num_input_tokens_seen": 828375040,
"step": 1580,
"train_runtime": 7225.6594,
"train_tokens_per_second": 114643.521
},
{
"epoch": 0.08604128899591439,
"grad_norm": 0.21099670231342316,
"learning_rate": 0.004959391408020191,
"loss": 3.4938674926757813,
"num_input_tokens_seen": 833617920,
"step": 1590,
"train_runtime": 7270.8988,
"train_tokens_per_second": 114651.29
},
{
"epoch": 0.08658242917827863,
"grad_norm": 0.19382232427597046,
"learning_rate": 0.004958644491344324,
"loss": 3.4875198364257813,
"num_input_tokens_seen": 838860800,
"step": 1600,
"train_runtime": 7316.1586,
"train_tokens_per_second": 114658.64
},
{
"epoch": 0.08712356936064287,
"grad_norm": 0.20325426757335663,
"learning_rate": 0.0049578908317784295,
"loss": 3.487265777587891,
"num_input_tokens_seen": 844103680,
"step": 1610,
"train_runtime": 7361.3918,
"train_tokens_per_second": 114666.316
},
{
"epoch": 0.08766470954300712,
"grad_norm": 0.21367783844470978,
"learning_rate": 0.004957130431623399,
"loss": 3.4908119201660157,
"num_input_tokens_seen": 849346560,
"step": 1620,
"train_runtime": 7406.6443,
"train_tokens_per_second": 114673.6
},
{
"epoch": 0.08820584972537136,
"grad_norm": 0.19166916608810425,
"learning_rate": 0.004956363293200697,
"loss": 3.478108215332031,
"num_input_tokens_seen": 854589440,
"step": 1630,
"train_runtime": 7451.8697,
"train_tokens_per_second": 114681.211
},
{
"epoch": 0.0887469899077356,
"grad_norm": 0.22475136816501617,
"learning_rate": 0.004955589418852363,
"loss": 3.4743488311767576,
"num_input_tokens_seen": 859832320,
"step": 1640,
"train_runtime": 7497.1227,
"train_tokens_per_second": 114688.308
},
{
"epoch": 0.08928813009009984,
"grad_norm": 0.22001579403877258,
"learning_rate": 0.004954808810940998,
"loss": 3.481397247314453,
"num_input_tokens_seen": 865075200,
"step": 1650,
"train_runtime": 7542.3425,
"train_tokens_per_second": 114695.826
},
{
"epoch": 0.08982927027246408,
"grad_norm": 0.20330502092838287,
"learning_rate": 0.0049540214718497635,
"loss": 3.47830810546875,
"num_input_tokens_seen": 870318080,
"step": 1660,
"train_runtime": 7587.5621,
"train_tokens_per_second": 114703.256
},
{
"epoch": 0.09037041045482833,
"grad_norm": 0.21012984216213226,
"learning_rate": 0.00495322740398237,
"loss": 3.470731735229492,
"num_input_tokens_seen": 875560960,
"step": 1670,
"train_runtime": 7632.7891,
"train_tokens_per_second": 114710.488
},
{
"epoch": 0.09091155063719257,
"grad_norm": 0.20543314516544342,
"learning_rate": 0.004952426609763068,
"loss": 3.4727859497070312,
"num_input_tokens_seen": 880803840,
"step": 1680,
"train_runtime": 7678.0113,
"train_tokens_per_second": 114717.706
},
{
"epoch": 0.0914526908195568,
"grad_norm": 0.2099304497241974,
"learning_rate": 0.004951619091636649,
"loss": 3.462004852294922,
"num_input_tokens_seen": 886046720,
"step": 1690,
"train_runtime": 7723.225,
"train_tokens_per_second": 114724.965
},
{
"epoch": 0.09199383100192104,
"grad_norm": 0.19785360991954803,
"learning_rate": 0.004950804852068425,
"loss": 3.468863677978516,
"num_input_tokens_seen": 891289600,
"step": 1700,
"train_runtime": 7768.4346,
"train_tokens_per_second": 114732.202
},
{
"epoch": 0.09253497118428529,
"grad_norm": 0.20223312079906464,
"learning_rate": 0.004949983893544234,
"loss": 3.47713623046875,
"num_input_tokens_seen": 896532480,
"step": 1710,
"train_runtime": 7813.6346,
"train_tokens_per_second": 114739.494
},
{
"epoch": 0.09307611136664953,
"grad_norm": 0.21073880791664124,
"learning_rate": 0.004949156218570423,
"loss": 3.4744213104248045,
"num_input_tokens_seen": 901775360,
"step": 1720,
"train_runtime": 7858.8265,
"train_tokens_per_second": 114746.822
},
{
"epoch": 0.09361725154901378,
"grad_norm": 0.21444642543792725,
"learning_rate": 0.004948321829673847,
"loss": 3.4606704711914062,
"num_input_tokens_seen": 907018240,
"step": 1730,
"train_runtime": 7904.0563,
"train_tokens_per_second": 114753.514
},
{
"epoch": 0.09415839173137801,
"grad_norm": 0.21360410749912262,
"learning_rate": 0.004947480729401857,
"loss": 3.468334197998047,
"num_input_tokens_seen": 912261120,
"step": 1740,
"train_runtime": 7949.2813,
"train_tokens_per_second": 114760.201
},
{
"epoch": 0.09469953191374225,
"grad_norm": 0.20467202365398407,
"learning_rate": 0.0049466329203222935,
"loss": 3.451313018798828,
"num_input_tokens_seen": 917504000,
"step": 1750,
"train_runtime": 7994.4951,
"train_tokens_per_second": 114766.973
},
{
"epoch": 0.0952406720961065,
"grad_norm": 0.2081199437379837,
"learning_rate": 0.004945778405023478,
"loss": 3.4613468170166017,
"num_input_tokens_seen": 922746880,
"step": 1760,
"train_runtime": 8039.7393,
"train_tokens_per_second": 114773.235
},
{
"epoch": 0.09578181227847074,
"grad_norm": 0.2067294865846634,
"learning_rate": 0.004944917186114206,
"loss": 3.4611587524414062,
"num_input_tokens_seen": 927989760,
"step": 1770,
"train_runtime": 8084.9796,
"train_tokens_per_second": 114779.481
},
{
"epoch": 0.09632295246083498,
"grad_norm": 0.19931091368198395,
"learning_rate": 0.00494404926622374,
"loss": 3.462744140625,
"num_input_tokens_seen": 933232640,
"step": 1780,
"train_runtime": 8130.2373,
"train_tokens_per_second": 114785.413
},
{
"epoch": 0.09686409264319921,
"grad_norm": 0.213568776845932,
"learning_rate": 0.004943174648001798,
"loss": 3.456349182128906,
"num_input_tokens_seen": 938475520,
"step": 1790,
"train_runtime": 8175.469,
"train_tokens_per_second": 114791.644
},
{
"epoch": 0.09740523282556346,
"grad_norm": 0.20064200460910797,
"learning_rate": 0.004942293334118552,
"loss": 3.4558643341064452,
"num_input_tokens_seen": 943718400,
"step": 1800,
"train_runtime": 8220.7059,
"train_tokens_per_second": 114797.732
},
{
"epoch": 0.0979463730079277,
"grad_norm": 0.21110571920871735,
"learning_rate": 0.004941405327264611,
"loss": 3.4533897399902345,
"num_input_tokens_seen": 948961280,
"step": 1810,
"train_runtime": 8265.9542,
"train_tokens_per_second": 114803.598
},
{
"epoch": 0.09848751319029195,
"grad_norm": 0.20624655485153198,
"learning_rate": 0.0049405106301510186,
"loss": 3.4500003814697267,
"num_input_tokens_seen": 954204160,
"step": 1820,
"train_runtime": 8311.1987,
"train_tokens_per_second": 114809.45
},
{
"epoch": 0.09902865337265619,
"grad_norm": 0.20945154130458832,
"learning_rate": 0.004939609245509244,
"loss": 3.440562057495117,
"num_input_tokens_seen": 959447040,
"step": 1830,
"train_runtime": 8356.4084,
"train_tokens_per_second": 114815.719
},
{
"epoch": 0.09956979355502042,
"grad_norm": 0.20618219673633575,
"learning_rate": 0.004938701176091175,
"loss": 3.4402488708496093,
"num_input_tokens_seen": 964689920,
"step": 1840,
"train_runtime": 8401.6104,
"train_tokens_per_second": 114822.025
},
{
"epoch": 0.10011093373738467,
"grad_norm": 0.21744751930236816,
"learning_rate": 0.004937786424669103,
"loss": 3.447218322753906,
"num_input_tokens_seen": 969932800,
"step": 1850,
"train_runtime": 8446.8197,
"train_tokens_per_second": 114828.165
},
{
"epoch": 0.10065207391974891,
"grad_norm": 0.207778662443161,
"learning_rate": 0.004936864994035724,
"loss": 3.4344856262207033,
"num_input_tokens_seen": 975175680,
"step": 1860,
"train_runtime": 8492.045,
"train_tokens_per_second": 114834.022
},
{
"epoch": 0.10119321410211315,
"grad_norm": 0.20873455703258514,
"learning_rate": 0.004935936887004123,
"loss": 3.4340728759765624,
"num_input_tokens_seen": 980418560,
"step": 1870,
"train_runtime": 8537.2574,
"train_tokens_per_second": 114839.99
},
{
"epoch": 0.1017343542844774,
"grad_norm": 0.21933899819850922,
"learning_rate": 0.004935002106407768,
"loss": 3.431113433837891,
"num_input_tokens_seen": 985661440,
"step": 1880,
"train_runtime": 8582.5067,
"train_tokens_per_second": 114845.403
},
{
"epoch": 0.10227549446684163,
"grad_norm": 0.19961251318454742,
"learning_rate": 0.0049340606551005,
"loss": 3.4356346130371094,
"num_input_tokens_seen": 990904320,
"step": 1890,
"train_runtime": 8627.7245,
"train_tokens_per_second": 114851.177
},
{
"epoch": 0.10281663464920587,
"grad_norm": 0.1902250349521637,
"learning_rate": 0.004933112535956529,
"loss": 3.432623291015625,
"num_input_tokens_seen": 996147200,
"step": 1900,
"train_runtime": 8672.9813,
"train_tokens_per_second": 114856.376
},
{
"epoch": 0.10335777483157012,
"grad_norm": 0.1946999877691269,
"learning_rate": 0.004932157751870416,
"loss": 3.435283660888672,
"num_input_tokens_seen": 1001390080,
"step": 1910,
"train_runtime": 8722.0751,
"train_tokens_per_second": 114810.99
},
{
"epoch": 0.10389891501393436,
"grad_norm": 0.21359668672084808,
"learning_rate": 0.004931196305757076,
"loss": 3.4397598266601563,
"num_input_tokens_seen": 1006632960,
"step": 1920,
"train_runtime": 8767.3142,
"train_tokens_per_second": 114816.572
},
{
"epoch": 0.1044400551962986,
"grad_norm": 0.188863143324852,
"learning_rate": 0.004930228200551757,
"loss": 3.428334045410156,
"num_input_tokens_seen": 1011875840,
"step": 1930,
"train_runtime": 8812.53,
"train_tokens_per_second": 114822.399
},
{
"epoch": 0.10498119537866284,
"grad_norm": 0.2043711096048355,
"learning_rate": 0.0049292534392100405,
"loss": 3.428396987915039,
"num_input_tokens_seen": 1017118720,
"step": 1940,
"train_runtime": 8857.7449,
"train_tokens_per_second": 114828.179
},
{
"epoch": 0.10552233556102708,
"grad_norm": 0.18800680339336395,
"learning_rate": 0.00492827202470783,
"loss": 3.423938751220703,
"num_input_tokens_seen": 1022361600,
"step": 1950,
"train_runtime": 8902.9768,
"train_tokens_per_second": 114833.682
},
{
"epoch": 0.10606347574339133,
"grad_norm": 0.20674094557762146,
"learning_rate": 0.004927283960041336,
"loss": 3.4255210876464846,
"num_input_tokens_seen": 1027604480,
"step": 1960,
"train_runtime": 8948.2078,
"train_tokens_per_second": 114839.14
},
{
"epoch": 0.10660461592575557,
"grad_norm": 0.19658301770687103,
"learning_rate": 0.004926289248227076,
"loss": 3.422502899169922,
"num_input_tokens_seen": 1032847360,
"step": 1970,
"train_runtime": 8993.4331,
"train_tokens_per_second": 114844.614
},
{
"epoch": 0.10714575610811981,
"grad_norm": 0.20629730820655823,
"learning_rate": 0.00492528789230186,
"loss": 3.419141387939453,
"num_input_tokens_seen": 1038090240,
"step": 1980,
"train_runtime": 9038.6403,
"train_tokens_per_second": 114850.266
},
{
"epoch": 0.10768689629048404,
"grad_norm": 0.20946621894836426,
"learning_rate": 0.00492427989532278,
"loss": 3.4206031799316405,
"num_input_tokens_seen": 1043333120,
"step": 1990,
"train_runtime": 9083.8627,
"train_tokens_per_second": 114855.668
},
{
"epoch": 0.10822803647284829,
"grad_norm": 0.2047351747751236,
"learning_rate": 0.004923265260367205,
"loss": 3.421718978881836,
"num_input_tokens_seen": 1048576000,
"step": 2000,
"train_runtime": 9129.0866,
"train_tokens_per_second": 114860.998
},
{
"epoch": 0.10822803647284829,
"eval_loss": 3.355583429336548,
"eval_runtime": 1.9828,
"eval_samples_per_second": 252.171,
"eval_steps_per_second": 4.035,
"num_input_tokens_seen": 1048576000,
"step": 2000
},
{
"epoch": 0.10876917665521253,
"grad_norm": 0.18940046429634094,
"learning_rate": 0.004922243990532769,
"loss": 3.4131790161132813,
"num_input_tokens_seen": 1053818880,
"step": 2010,
"train_runtime": 9178.8208,
"train_tokens_per_second": 114809.833
},
{
"epoch": 0.10931031683757678,
"grad_norm": 0.2015410214662552,
"learning_rate": 0.004921216088937362,
"loss": 3.433843994140625,
"num_input_tokens_seen": 1059061760,
"step": 2020,
"train_runtime": 9224.0155,
"train_tokens_per_second": 114815.696
},
{
"epoch": 0.10985145701994102,
"grad_norm": 0.20246025919914246,
"learning_rate": 0.0049201815587191205,
"loss": 3.4257015228271483,
"num_input_tokens_seen": 1064304640,
"step": 2030,
"train_runtime": 9269.2191,
"train_tokens_per_second": 114821.392
},
{
"epoch": 0.11039259720230525,
"grad_norm": 0.1984010487794876,
"learning_rate": 0.0049191404030364165,
"loss": 3.407004547119141,
"num_input_tokens_seen": 1069547520,
"step": 2040,
"train_runtime": 9314.4158,
"train_tokens_per_second": 114827.118
},
{
"epoch": 0.1109337373846695,
"grad_norm": 0.2240104079246521,
"learning_rate": 0.0049180926250678506,
"loss": 3.413028335571289,
"num_input_tokens_seen": 1074790400,
"step": 2050,
"train_runtime": 9359.6399,
"train_tokens_per_second": 114832.452
},
{
"epoch": 0.11147487756703374,
"grad_norm": 0.20743729174137115,
"learning_rate": 0.004917038228012243,
"loss": 3.413587188720703,
"num_input_tokens_seen": 1080033280,
"step": 2060,
"train_runtime": 9404.8754,
"train_tokens_per_second": 114837.596
},
{
"epoch": 0.11201601774939798,
"grad_norm": 0.18610326945781708,
"learning_rate": 0.004915977215088616,
"loss": 3.4143035888671873,
"num_input_tokens_seen": 1085276160,
"step": 2070,
"train_runtime": 9450.1196,
"train_tokens_per_second": 114842.584
},
{
"epoch": 0.11255715793176223,
"grad_norm": 0.18289707601070404,
"learning_rate": 0.004914909589536196,
"loss": 3.4013748168945312,
"num_input_tokens_seen": 1090519040,
"step": 2080,
"train_runtime": 9495.3594,
"train_tokens_per_second": 114847.579
},
{
"epoch": 0.11309829811412646,
"grad_norm": 0.20693431794643402,
"learning_rate": 0.0049138353546143935,
"loss": 3.420492172241211,
"num_input_tokens_seen": 1095761920,
"step": 2090,
"train_runtime": 9540.5793,
"train_tokens_per_second": 114852.766
},
{
"epoch": 0.1136394382964907,
"grad_norm": 0.1881825178861618,
"learning_rate": 0.0049127545136027975,
"loss": 3.4042373657226563,
"num_input_tokens_seen": 1101004800,
"step": 2100,
"train_runtime": 9585.774,
"train_tokens_per_second": 114858.206
},
{
"epoch": 0.11418057847885495,
"grad_norm": 0.20557381212711334,
"learning_rate": 0.004911667069801167,
"loss": 3.395760345458984,
"num_input_tokens_seen": 1106247680,
"step": 2110,
"train_runtime": 9630.9862,
"train_tokens_per_second": 114863.385
},
{
"epoch": 0.11472171866121919,
"grad_norm": 0.20688888430595398,
"learning_rate": 0.004910573026529419,
"loss": 3.3946189880371094,
"num_input_tokens_seen": 1111490560,
"step": 2120,
"train_runtime": 9676.207,
"train_tokens_per_second": 114868.415
},
{
"epoch": 0.11526285884358344,
"grad_norm": 0.1814773976802826,
"learning_rate": 0.004909472387127615,
"loss": 3.405241775512695,
"num_input_tokens_seen": 1116733440,
"step": 2130,
"train_runtime": 9721.4134,
"train_tokens_per_second": 114873.568
},
{
"epoch": 0.11580399902594767,
"grad_norm": 0.1865544617176056,
"learning_rate": 0.004908365154955957,
"loss": 3.4098495483398437,
"num_input_tokens_seen": 1121976320,
"step": 2140,
"train_runtime": 9766.6099,
"train_tokens_per_second": 114878.789
},
{
"epoch": 0.11634513920831191,
"grad_norm": 0.2032928168773651,
"learning_rate": 0.004907251333394776,
"loss": 3.4024234771728517,
"num_input_tokens_seen": 1127219200,
"step": 2150,
"train_runtime": 9811.8225,
"train_tokens_per_second": 114883.774
},
{
"epoch": 0.11688627939067615,
"grad_norm": 0.1904987096786499,
"learning_rate": 0.004906130925844515,
"loss": 3.3986663818359375,
"num_input_tokens_seen": 1132462080,
"step": 2160,
"train_runtime": 9857.0319,
"train_tokens_per_second": 114888.751
},
{
"epoch": 0.1174274195730404,
"grad_norm": 0.1815112829208374,
"learning_rate": 0.004905003935725728,
"loss": 3.3947410583496094,
"num_input_tokens_seen": 1137704960,
"step": 2170,
"train_runtime": 9902.2357,
"train_tokens_per_second": 114893.746
},
{
"epoch": 0.11796855975540464,
"grad_norm": 0.20339980721473694,
"learning_rate": 0.004903870366479064,
"loss": 3.3956260681152344,
"num_input_tokens_seen": 1142947840,
"step": 2180,
"train_runtime": 9947.4535,
"train_tokens_per_second": 114898.536
},
{
"epoch": 0.11850969993776887,
"grad_norm": 0.18381614983081818,
"learning_rate": 0.004902730221565258,
"loss": 3.3980743408203127,
"num_input_tokens_seen": 1148190720,
"step": 2190,
"train_runtime": 9992.6437,
"train_tokens_per_second": 114903.599
},
{
"epoch": 0.11905084012013312,
"grad_norm": 0.19806860387325287,
"learning_rate": 0.004901583504465119,
"loss": 3.393767547607422,
"num_input_tokens_seen": 1153433600,
"step": 2200,
"train_runtime": 10037.8153,
"train_tokens_per_second": 114908.829
},
{
"epoch": 0.11959198030249736,
"grad_norm": 0.20329956710338593,
"learning_rate": 0.004900430218679523,
"loss": 3.3944183349609376,
"num_input_tokens_seen": 1158676480,
"step": 2210,
"train_runtime": 10083.038,
"train_tokens_per_second": 114913.43
},
{
"epoch": 0.1201331204848616,
"grad_norm": 0.2080426961183548,
"learning_rate": 0.004899270367729398,
"loss": 3.3978126525878904,
"num_input_tokens_seen": 1163919360,
"step": 2220,
"train_runtime": 10128.2721,
"train_tokens_per_second": 114917.86
},
{
"epoch": 0.12067426066722585,
"grad_norm": 0.19686874747276306,
"learning_rate": 0.004898103955155715,
"loss": 3.395246124267578,
"num_input_tokens_seen": 1169162240,
"step": 2230,
"train_runtime": 10173.5097,
"train_tokens_per_second": 114922.212
},
{
"epoch": 0.12121540084959008,
"grad_norm": 0.20237593352794647,
"learning_rate": 0.004896930984519478,
"loss": 3.3845314025878905,
"num_input_tokens_seen": 1174405120,
"step": 2240,
"train_runtime": 10218.7376,
"train_tokens_per_second": 114926.634
},
{
"epoch": 0.12175654103195432,
"grad_norm": 0.19075846672058105,
"learning_rate": 0.004895751459401713,
"loss": 3.380054473876953,
"num_input_tokens_seen": 1179648000,
"step": 2250,
"train_runtime": 10263.9453,
"train_tokens_per_second": 114931.243
},
{
"epoch": 0.12229768121431857,
"grad_norm": 0.18929868936538696,
"learning_rate": 0.004894565383403456,
"loss": 3.3817626953125,
"num_input_tokens_seen": 1184890880,
"step": 2260,
"train_runtime": 10309.1491,
"train_tokens_per_second": 114935.857
},
{
"epoch": 0.12283882139668281,
"grad_norm": 0.18654604256153107,
"learning_rate": 0.0048933727601457415,
"loss": 3.3808876037597657,
"num_input_tokens_seen": 1190133760,
"step": 2270,
"train_runtime": 10354.365,
"train_tokens_per_second": 114940.294
},
{
"epoch": 0.12337996157904706,
"grad_norm": 0.19212667644023895,
"learning_rate": 0.004892173593269593,
"loss": 3.378282928466797,
"num_input_tokens_seen": 1195376640,
"step": 2280,
"train_runtime": 10399.5899,
"train_tokens_per_second": 114944.594
},
{
"epoch": 0.12392110176141129,
"grad_norm": 0.19903384149074554,
"learning_rate": 0.004890967886436014,
"loss": 3.384090042114258,
"num_input_tokens_seen": 1200619520,
"step": 2290,
"train_runtime": 10448.5527,
"train_tokens_per_second": 114907.735
},
{
"epoch": 0.12446224194377553,
"grad_norm": 0.19387711584568024,
"learning_rate": 0.004889755643325971,
"loss": 3.380754089355469,
"num_input_tokens_seen": 1205862400,
"step": 2300,
"train_runtime": 10493.7881,
"train_tokens_per_second": 114912.021
},
{
"epoch": 0.1250033821261398,
"grad_norm": 0.20909279584884644,
"learning_rate": 0.0048885368676403855,
"loss": 3.3727947235107423,
"num_input_tokens_seen": 1211105280,
"step": 2310,
"train_runtime": 10539.0351,
"train_tokens_per_second": 114916.145
},
{
"epoch": 0.12554452230850402,
"grad_norm": 0.17621192336082458,
"learning_rate": 0.004887311563100124,
"loss": 3.384077453613281,
"num_input_tokens_seen": 1216348160,
"step": 2320,
"train_runtime": 10584.2319,
"train_tokens_per_second": 114920.777
},
{
"epoch": 0.12608566249086825,
"grad_norm": 0.19513387978076935,
"learning_rate": 0.004886079733445985,
"loss": 3.378644561767578,
"num_input_tokens_seen": 1221591040,
"step": 2330,
"train_runtime": 10629.451,
"train_tokens_per_second": 114925.13
},
{
"epoch": 0.1266268026732325,
"grad_norm": 0.19339337944984436,
"learning_rate": 0.004884841382438689,
"loss": 3.3802566528320312,
"num_input_tokens_seen": 1226833920,
"step": 2340,
"train_runtime": 10674.6651,
"train_tokens_per_second": 114929.5
},
{
"epoch": 0.12716794285559674,
"grad_norm": 0.20770232379436493,
"learning_rate": 0.004883596513858863,
"loss": 3.3678009033203127,
"num_input_tokens_seen": 1232076800,
"step": 2350,
"train_runtime": 10719.8522,
"train_tokens_per_second": 114934.122
},
{
"epoch": 0.127709083037961,
"grad_norm": 0.19512751698493958,
"learning_rate": 0.004882345131507035,
"loss": 3.3827003479003905,
"num_input_tokens_seen": 1237319680,
"step": 2360,
"train_runtime": 10765.0276,
"train_tokens_per_second": 114938.831
},
{
"epoch": 0.12825022322032523,
"grad_norm": 0.1894627958536148,
"learning_rate": 0.004881087239203616,
"loss": 3.377857208251953,
"num_input_tokens_seen": 1242562560,
"step": 2370,
"train_runtime": 10810.2272,
"train_tokens_per_second": 114943.242
},
{
"epoch": 0.12879136340268946,
"grad_norm": 0.18233883380889893,
"learning_rate": 0.004879822840788895,
"loss": 3.370525360107422,
"num_input_tokens_seen": 1247805440,
"step": 2380,
"train_runtime": 10855.4267,
"train_tokens_per_second": 114947.618
},
{
"epoch": 0.12933250358505372,
"grad_norm": 0.1876172125339508,
"learning_rate": 0.00487855194012302,
"loss": 3.3757583618164064,
"num_input_tokens_seen": 1253048320,
"step": 2390,
"train_runtime": 10900.6108,
"train_tokens_per_second": 114952.12
},
{
"epoch": 0.12987364376741795,
"grad_norm": 0.19061093032360077,
"learning_rate": 0.0048772745410859955,
"loss": 3.371135711669922,
"num_input_tokens_seen": 1258291200,
"step": 2400,
"train_runtime": 10945.8277,
"train_tokens_per_second": 114956.24
},
{
"epoch": 0.1304147839497822,
"grad_norm": 0.19320231676101685,
"learning_rate": 0.004875990647577659,
"loss": 3.376973342895508,
"num_input_tokens_seen": 1263534080,
"step": 2410,
"train_runtime": 10991.0522,
"train_tokens_per_second": 114960.247
},
{
"epoch": 0.13095592413214643,
"grad_norm": 0.18665140867233276,
"learning_rate": 0.004874700263517679,
"loss": 3.371229553222656,
"num_input_tokens_seen": 1268776960,
"step": 2420,
"train_runtime": 11036.2334,
"train_tokens_per_second": 114964.673
},
{
"epoch": 0.13149706431451066,
"grad_norm": 0.18199113011360168,
"learning_rate": 0.004873403392845541,
"loss": 3.361619567871094,
"num_input_tokens_seen": 1274019840,
"step": 2430,
"train_runtime": 11081.4325,
"train_tokens_per_second": 114968.877
},
{
"epoch": 0.13203820449687492,
"grad_norm": 0.19316934049129486,
"learning_rate": 0.004872100039520528,
"loss": 3.360996627807617,
"num_input_tokens_seen": 1279262720,
"step": 2440,
"train_runtime": 11126.6221,
"train_tokens_per_second": 114973.144
},
{
"epoch": 0.13257934467923915,
"grad_norm": 0.185124471783638,
"learning_rate": 0.00487079020752172,
"loss": 3.366575241088867,
"num_input_tokens_seen": 1284505600,
"step": 2450,
"train_runtime": 11171.8004,
"train_tokens_per_second": 114977.493
},
{
"epoch": 0.1331204848616034,
"grad_norm": 0.18804939091205597,
"learning_rate": 0.004869473900847973,
"loss": 3.3575817108154298,
"num_input_tokens_seen": 1289748480,
"step": 2460,
"train_runtime": 11217.0182,
"train_tokens_per_second": 114981.402
},
{
"epoch": 0.13366162504396764,
"grad_norm": 0.19206225872039795,
"learning_rate": 0.004868151123517911,
"loss": 3.3654083251953124,
"num_input_tokens_seen": 1294991360,
"step": 2470,
"train_runtime": 11262.2319,
"train_tokens_per_second": 114985.322
},
{
"epoch": 0.13420276522633187,
"grad_norm": 0.1901652067899704,
"learning_rate": 0.004866821879569913,
"loss": 3.3583431243896484,
"num_input_tokens_seen": 1300234240,
"step": 2480,
"train_runtime": 11307.4375,
"train_tokens_per_second": 114989.293
},
{
"epoch": 0.13474390540869613,
"grad_norm": 0.1906082034111023,
"learning_rate": 0.004865486173062098,
"loss": 3.3592803955078123,
"num_input_tokens_seen": 1305477120,
"step": 2490,
"train_runtime": 11352.6608,
"train_tokens_per_second": 114993.053
},
{
"epoch": 0.13528504559106036,
"grad_norm": 0.17934362590312958,
"learning_rate": 0.004864144008072318,
"loss": 3.3405136108398437,
"num_input_tokens_seen": 1310720000,
"step": 2500,
"train_runtime": 11397.8619,
"train_tokens_per_second": 114997.007
},
{
"epoch": 0.13528504559106036,
"eval_loss": 3.2980093955993652,
"eval_runtime": 1.9858,
"eval_samples_per_second": 251.785,
"eval_steps_per_second": 4.029,
"num_input_tokens_seen": 1310720000,
"step": 2500
},
{
"epoch": 0.13582618577342462,
"grad_norm": 0.18693317472934723,
"learning_rate": 0.00486279538869814,
"loss": 3.367817687988281,
"num_input_tokens_seen": 1315962880,
"step": 2510,
"train_runtime": 11445.0335,
"train_tokens_per_second": 114981.13
},
{
"epoch": 0.13636732595578885,
"grad_norm": 0.18467511236667633,
"learning_rate": 0.004861440319056837,
"loss": 3.355504608154297,
"num_input_tokens_seen": 1321205760,
"step": 2520,
"train_runtime": 11490.248,
"train_tokens_per_second": 114984.965
},
{
"epoch": 0.13690846613815308,
"grad_norm": 0.17405100166797638,
"learning_rate": 0.004860078803285375,
"loss": 3.3486671447753906,
"num_input_tokens_seen": 1326448640,
"step": 2530,
"train_runtime": 11535.4773,
"train_tokens_per_second": 114988.622
},
{
"epoch": 0.13744960632051734,
"grad_norm": 0.1991710066795349,
"learning_rate": 0.0048587108455403994,
"loss": 3.3470123291015623,
"num_input_tokens_seen": 1331691520,
"step": 2540,
"train_runtime": 11580.7091,
"train_tokens_per_second": 114992.226
},
{
"epoch": 0.13799074650288157,
"grad_norm": 0.19981589913368225,
"learning_rate": 0.004857336449998221,
"loss": 3.355559539794922,
"num_input_tokens_seen": 1336934400,
"step": 2550,
"train_runtime": 11625.9198,
"train_tokens_per_second": 114996.011
},
{
"epoch": 0.13853188668524583,
"grad_norm": 0.19031056761741638,
"learning_rate": 0.004855955620854806,
"loss": 3.359702301025391,
"num_input_tokens_seen": 1342177280,
"step": 2560,
"train_runtime": 11671.1393,
"train_tokens_per_second": 114999.68
},
{
"epoch": 0.13907302686761006,
"grad_norm": 0.1825476437807083,
"learning_rate": 0.004854568362325763,
"loss": 3.3532974243164064,
"num_input_tokens_seen": 1347420160,
"step": 2570,
"train_runtime": 11716.3486,
"train_tokens_per_second": 115003.42
},
{
"epoch": 0.13961416704997429,
"grad_norm": 0.1943996399641037,
"learning_rate": 0.004853174678646328,
"loss": 3.3549442291259766,
"num_input_tokens_seen": 1352663040,
"step": 2580,
"train_runtime": 11761.5665,
"train_tokens_per_second": 115007.048
},
{
"epoch": 0.14015530723233854,
"grad_norm": 0.18165603280067444,
"learning_rate": 0.004851774574071355,
"loss": 3.345872497558594,
"num_input_tokens_seen": 1357905920,
"step": 2590,
"train_runtime": 11806.7776,
"train_tokens_per_second": 115010.714
},
{
"epoch": 0.14069644741470277,
"grad_norm": 0.19906878471374512,
"learning_rate": 0.004850368052875296,
"loss": 3.3501548767089844,
"num_input_tokens_seen": 1363148800,
"step": 2600,
"train_runtime": 11851.978,
"train_tokens_per_second": 115014.456
},
{
"epoch": 0.14123758759706703,
"grad_norm": 0.1889800876379013,
"learning_rate": 0.004848955119352198,
"loss": 3.357212448120117,
"num_input_tokens_seen": 1368391680,
"step": 2610,
"train_runtime": 11897.1651,
"train_tokens_per_second": 115018.298
},
{
"epoch": 0.14177872777943126,
"grad_norm": 0.1907270848751068,
"learning_rate": 0.00484753577781568,
"loss": 3.3405483245849608,
"num_input_tokens_seen": 1373634560,
"step": 2620,
"train_runtime": 11942.3695,
"train_tokens_per_second": 115021.945
},
{
"epoch": 0.1423198679617955,
"grad_norm": 0.18622975051403046,
"learning_rate": 0.004846110032598928,
"loss": 3.344770050048828,
"num_input_tokens_seen": 1378877440,
"step": 2630,
"train_runtime": 11987.559,
"train_tokens_per_second": 115025.706
},
{
"epoch": 0.14286100814415975,
"grad_norm": 0.20059405267238617,
"learning_rate": 0.004844677888054675,
"loss": 3.344530487060547,
"num_input_tokens_seen": 1384120320,
"step": 2640,
"train_runtime": 12032.7386,
"train_tokens_per_second": 115029.535
},
{
"epoch": 0.14340214832652398,
"grad_norm": 0.18230022490024567,
"learning_rate": 0.004843239348555194,
"loss": 3.340105438232422,
"num_input_tokens_seen": 1389363200,
"step": 2650,
"train_runtime": 12077.95,
"train_tokens_per_second": 115033.032
},
{
"epoch": 0.14394328850888824,
"grad_norm": 0.1814001351594925,
"learning_rate": 0.004841794418492279,
"loss": 3.3359622955322266,
"num_input_tokens_seen": 1394606080,
"step": 2660,
"train_runtime": 12123.1531,
"train_tokens_per_second": 115036.581
},
{
"epoch": 0.14448442869125247,
"grad_norm": 0.20956675708293915,
"learning_rate": 0.004840343102277236,
"loss": 3.3457298278808594,
"num_input_tokens_seen": 1399848960,
"step": 2670,
"train_runtime": 12168.3992,
"train_tokens_per_second": 115039.697
},
{
"epoch": 0.1450255688736167,
"grad_norm": 0.19602610170841217,
"learning_rate": 0.004838885404340865,
"loss": 3.337678909301758,
"num_input_tokens_seen": 1405091840,
"step": 2680,
"train_runtime": 12217.5728,
"train_tokens_per_second": 115005.809
},
{
"epoch": 0.14556670905598096,
"grad_norm": 0.18408642709255219,
"learning_rate": 0.00483742132913345,
"loss": 3.3393791198730467,
"num_input_tokens_seen": 1410334720,
"step": 2690,
"train_runtime": 12262.7736,
"train_tokens_per_second": 115009.439
},
{
"epoch": 0.1461078492383452,
"grad_norm": 0.21065284311771393,
"learning_rate": 0.00483595088112475,
"loss": 3.3346370697021483,
"num_input_tokens_seen": 1415577600,
"step": 2700,
"train_runtime": 12307.9818,
"train_tokens_per_second": 115012.974
},
{
"epoch": 0.14664898942070945,
"grad_norm": 0.18873485922813416,
"learning_rate": 0.00483447406480397,
"loss": 3.33388671875,
"num_input_tokens_seen": 1420820480,
"step": 2710,
"train_runtime": 12353.2004,
"train_tokens_per_second": 115016.387
},
{
"epoch": 0.14719012960307368,
"grad_norm": 0.18760992586612701,
"learning_rate": 0.004832990884679764,
"loss": 3.3374618530273437,
"num_input_tokens_seen": 1426063360,
"step": 2720,
"train_runtime": 12398.393,
"train_tokens_per_second": 115020.016
},
{
"epoch": 0.1477312697854379,
"grad_norm": 0.192196786403656,
"learning_rate": 0.004831501345280215,
"loss": 3.3331283569335937,
"num_input_tokens_seen": 1431306240,
"step": 2730,
"train_runtime": 12443.5889,
"train_tokens_per_second": 115023.588
},
{
"epoch": 0.14827240996780217,
"grad_norm": 0.18086469173431396,
"learning_rate": 0.004830005451152815,
"loss": 3.342273712158203,
"num_input_tokens_seen": 1436549120,
"step": 2740,
"train_runtime": 12488.8002,
"train_tokens_per_second": 115026.992
},
{
"epoch": 0.1488135501501664,
"grad_norm": 0.20178896188735962,
"learning_rate": 0.004828503206864461,
"loss": 3.340282440185547,
"num_input_tokens_seen": 1441792000,
"step": 2750,
"train_runtime": 12534.0299,
"train_tokens_per_second": 115030.203
},
{
"epoch": 0.14935469033253065,
"grad_norm": 0.1862727403640747,
"learning_rate": 0.004826994617001436,
"loss": 3.333884048461914,
"num_input_tokens_seen": 1447034880,
"step": 2760,
"train_runtime": 12579.232,
"train_tokens_per_second": 115033.643
},
{
"epoch": 0.14989583051489488,
"grad_norm": 0.18692044913768768,
"learning_rate": 0.004825479686169395,
"loss": 3.3313224792480467,
"num_input_tokens_seen": 1452277760,
"step": 2770,
"train_runtime": 12624.4525,
"train_tokens_per_second": 115036.89
},
{
"epoch": 0.15043697069725911,
"grad_norm": 0.19887301325798035,
"learning_rate": 0.004823958418993353,
"loss": 3.3318561553955077,
"num_input_tokens_seen": 1457520640,
"step": 2780,
"train_runtime": 12669.6266,
"train_tokens_per_second": 115040.536
},
{
"epoch": 0.15097811087962337,
"grad_norm": 0.19694387912750244,
"learning_rate": 0.004822430820117667,
"loss": 3.324603271484375,
"num_input_tokens_seen": 1462763520,
"step": 2790,
"train_runtime": 12714.8383,
"train_tokens_per_second": 115043.816
},
{
"epoch": 0.1515192510619876,
"grad_norm": 0.18864646553993225,
"learning_rate": 0.0048208968942060285,
"loss": 3.329520416259766,
"num_input_tokens_seen": 1468006400,
"step": 2800,
"train_runtime": 12760.0555,
"train_tokens_per_second": 115047.023
},
{
"epoch": 0.15206039124435186,
"grad_norm": 0.19097571074962616,
"learning_rate": 0.004819356645941442,
"loss": 3.334062194824219,
"num_input_tokens_seen": 1473249280,
"step": 2810,
"train_runtime": 12805.2599,
"train_tokens_per_second": 115050.323
},
{
"epoch": 0.1526015314267161,
"grad_norm": 0.18825189769268036,
"learning_rate": 0.004817810080026213,
"loss": 3.3339500427246094,
"num_input_tokens_seen": 1478492160,
"step": 2820,
"train_runtime": 12850.4647,
"train_tokens_per_second": 115053.595
},
{
"epoch": 0.15314267160908032,
"grad_norm": 0.17853769659996033,
"learning_rate": 0.004816257201181937,
"loss": 3.3271289825439454,
"num_input_tokens_seen": 1483735040,
"step": 2830,
"train_runtime": 12895.6915,
"train_tokens_per_second": 115056.648
},
{
"epoch": 0.15368381179144458,
"grad_norm": 0.1959368884563446,
"learning_rate": 0.004814698014149483,
"loss": 3.3293079376220702,
"num_input_tokens_seen": 1488977920,
"step": 2840,
"train_runtime": 12940.9039,
"train_tokens_per_second": 115059.808
},
{
"epoch": 0.1542249519738088,
"grad_norm": 0.1917344182729721,
"learning_rate": 0.0048131325236889745,
"loss": 3.3289634704589846,
"num_input_tokens_seen": 1494220800,
"step": 2850,
"train_runtime": 12986.1305,
"train_tokens_per_second": 115062.82
},
{
"epoch": 0.15476609215617307,
"grad_norm": 0.18901459872722626,
"learning_rate": 0.004811560734579785,
"loss": 3.3151206970214844,
"num_input_tokens_seen": 1499463680,
"step": 2860,
"train_runtime": 13031.3423,
"train_tokens_per_second": 115065.943
},
{
"epoch": 0.1553072323385373,
"grad_norm": 0.1884569674730301,
"learning_rate": 0.004809982651620513,
"loss": 3.321660614013672,
"num_input_tokens_seen": 1504706560,
"step": 2870,
"train_runtime": 13076.553,
"train_tokens_per_second": 115069.052
},
{
"epoch": 0.15584837252090153,
"grad_norm": 0.1960553228855133,
"learning_rate": 0.004808398279628971,
"loss": 3.326691436767578,
"num_input_tokens_seen": 1509949440,
"step": 2880,
"train_runtime": 13121.764,
"train_tokens_per_second": 115072.138
},
{
"epoch": 0.1563895127032658,
"grad_norm": 0.19503499567508698,
"learning_rate": 0.004806807623442178,
"loss": 3.321258544921875,
"num_input_tokens_seen": 1515192320,
"step": 2890,
"train_runtime": 13166.9951,
"train_tokens_per_second": 115075.027
},
{
"epoch": 0.15693065288563002,
"grad_norm": 0.19287334382534027,
"learning_rate": 0.004805210687916331,
"loss": 3.3227684020996096,
"num_input_tokens_seen": 1520435200,
"step": 2900,
"train_runtime": 13212.1814,
"train_tokens_per_second": 115078.287
},
{
"epoch": 0.15747179306799428,
"grad_norm": 0.19813010096549988,
"learning_rate": 0.004803607477926801,
"loss": 3.3109420776367187,
"num_input_tokens_seen": 1525678080,
"step": 2910,
"train_runtime": 13257.3873,
"train_tokens_per_second": 115081.354
},
{
"epoch": 0.1580129332503585,
"grad_norm": 0.19769278168678284,
"learning_rate": 0.004801997998368116,
"loss": 3.317332458496094,
"num_input_tokens_seen": 1530920960,
"step": 2920,
"train_runtime": 13302.5829,
"train_tokens_per_second": 115084.489
},
{
"epoch": 0.15855407343272274,
"grad_norm": 0.21613162755966187,
"learning_rate": 0.0048003822541539416,
"loss": 3.3125213623046874,
"num_input_tokens_seen": 1536163840,
"step": 2930,
"train_runtime": 13347.7691,
"train_tokens_per_second": 115087.684
},
{
"epoch": 0.159095213615087,
"grad_norm": 0.19285152852535248,
"learning_rate": 0.004798760250217072,
"loss": 3.3247020721435545,
"num_input_tokens_seen": 1541406720,
"step": 2940,
"train_runtime": 13392.9574,
"train_tokens_per_second": 115090.84
},
{
"epoch": 0.15963635379745122,
"grad_norm": 0.19629855453968048,
"learning_rate": 0.004797131991509409,
"loss": 3.3183937072753906,
"num_input_tokens_seen": 1546649600,
"step": 2950,
"train_runtime": 13438.1692,
"train_tokens_per_second": 115093.773
},
{
"epoch": 0.16017749397981548,
"grad_norm": 0.1971038430929184,
"learning_rate": 0.004795497483001952,
"loss": 3.3157825469970703,
"num_input_tokens_seen": 1551892480,
"step": 2960,
"train_runtime": 13483.3787,
"train_tokens_per_second": 115096.706
},
{
"epoch": 0.1607186341621797,
"grad_norm": 0.2136721909046173,
"learning_rate": 0.0047938567296847805,
"loss": 3.3181556701660155,
"num_input_tokens_seen": 1557135360,
"step": 2970,
"train_runtime": 13528.5747,
"train_tokens_per_second": 115099.734
},
{
"epoch": 0.16125977434454394,
"grad_norm": 0.18783020973205566,
"learning_rate": 0.004792209736567038,
"loss": 3.3050804138183594,
"num_input_tokens_seen": 1562378240,
"step": 2980,
"train_runtime": 13573.7919,
"train_tokens_per_second": 115102.563
},
{
"epoch": 0.1618009145269082,
"grad_norm": 0.17761750519275665,
"learning_rate": 0.0047905565086769205,
"loss": 3.313432312011719,
"num_input_tokens_seen": 1567621120,
"step": 2990,
"train_runtime": 13618.9923,
"train_tokens_per_second": 115105.515
},
{
"epoch": 0.16234205470927243,
"grad_norm": 0.1785641759634018,
"learning_rate": 0.004788897051061655,
"loss": 3.317774200439453,
"num_input_tokens_seen": 1572864000,
"step": 3000,
"train_runtime": 13664.2112,
"train_tokens_per_second": 115108.291
},
{
"epoch": 0.16234205470927243,
"eval_loss": 3.2526497840881348,
"eval_runtime": 1.9863,
"eval_samples_per_second": 251.723,
"eval_steps_per_second": 4.028,
"num_input_tokens_seen": 1572864000,
"step": 3000
},
{
"epoch": 0.1628831948916367,
"grad_norm": 0.19272948801517487,
"learning_rate": 0.004787231368787491,
"loss": 3.3128257751464845,
"num_input_tokens_seen": 1578106880,
"step": 3010,
"train_runtime": 13714.1711,
"train_tokens_per_second": 115071.255
},
{
"epoch": 0.16342433507400092,
"grad_norm": 0.1745939403772354,
"learning_rate": 0.004785559466939679,
"loss": 3.31363525390625,
"num_input_tokens_seen": 1583349760,
"step": 3020,
"train_runtime": 13759.3609,
"train_tokens_per_second": 115074.368
},
{
"epoch": 0.16396547525636515,
"grad_norm": 0.20123089849948883,
"learning_rate": 0.0047838813506224575,
"loss": 3.3179275512695314,
"num_input_tokens_seen": 1588592640,
"step": 3030,
"train_runtime": 13804.5657,
"train_tokens_per_second": 115077.335
},
{
"epoch": 0.1645066154387294,
"grad_norm": 0.19240304827690125,
"learning_rate": 0.004782197024959039,
"loss": 3.3164352416992187,
"num_input_tokens_seen": 1593835520,
"step": 3040,
"train_runtime": 13849.7747,
"train_tokens_per_second": 115080.249
},
{
"epoch": 0.16504775562109364,
"grad_norm": 0.19316141307353973,
"learning_rate": 0.004780506495091593,
"loss": 3.316120147705078,
"num_input_tokens_seen": 1599078400,
"step": 3050,
"train_runtime": 13894.9935,
"train_tokens_per_second": 115083.062
},
{
"epoch": 0.1655888958034579,
"grad_norm": 0.19889311492443085,
"learning_rate": 0.004778809766181229,
"loss": 3.3089508056640624,
"num_input_tokens_seen": 1604321280,
"step": 3060,
"train_runtime": 13943.9286,
"train_tokens_per_second": 115055.185
},
{
"epoch": 0.16613003598582213,
"grad_norm": 0.19427727162837982,
"learning_rate": 0.004777106843407982,
"loss": 3.3107887268066407,
"num_input_tokens_seen": 1609564160,
"step": 3070,
"train_runtime": 13989.1559,
"train_tokens_per_second": 115057.99
},
{
"epoch": 0.16667117616818636,
"grad_norm": 0.1867746263742447,
"learning_rate": 0.004775397731970797,
"loss": 3.306330108642578,
"num_input_tokens_seen": 1614807040,
"step": 3080,
"train_runtime": 14034.3821,
"train_tokens_per_second": 115060.787
},
{
"epoch": 0.16721231635055062,
"grad_norm": 0.18061718344688416,
"learning_rate": 0.0047736824370875125,
"loss": 3.3135826110839846,
"num_input_tokens_seen": 1620049920,
"step": 3090,
"train_runtime": 14079.6171,
"train_tokens_per_second": 115063.493
},
{
"epoch": 0.16775345653291485,
"grad_norm": 0.1992795616388321,
"learning_rate": 0.004771960963994845,
"loss": 3.2958747863769533,
"num_input_tokens_seen": 1625292800,
"step": 3100,
"train_runtime": 14124.8552,
"train_tokens_per_second": 115066.157
},
{
"epoch": 0.1682945967152791,
"grad_norm": 0.1932191550731659,
"learning_rate": 0.004770233317948373,
"loss": 3.305771255493164,
"num_input_tokens_seen": 1630535680,
"step": 3110,
"train_runtime": 14170.1029,
"train_tokens_per_second": 115068.725
},
{
"epoch": 0.16883573689764333,
"grad_norm": 0.18210622668266296,
"learning_rate": 0.00476849950422252,
"loss": 3.309395599365234,
"num_input_tokens_seen": 1635778560,
"step": 3120,
"train_runtime": 14215.3361,
"train_tokens_per_second": 115071.396
},
{
"epoch": 0.16937687708000757,
"grad_norm": 0.19367872178554535,
"learning_rate": 0.004766759528110539,
"loss": 3.302986907958984,
"num_input_tokens_seen": 1641021440,
"step": 3130,
"train_runtime": 14260.5763,
"train_tokens_per_second": 115073.992
},
{
"epoch": 0.16991801726237182,
"grad_norm": 0.19194577634334564,
"learning_rate": 0.004765013394924499,
"loss": 3.304148864746094,
"num_input_tokens_seen": 1646264320,
"step": 3140,
"train_runtime": 14305.8047,
"train_tokens_per_second": 115076.667
},
{
"epoch": 0.17045915744473605,
"grad_norm": 0.18965749442577362,
"learning_rate": 0.0047632611099952624,
"loss": 3.298334503173828,
"num_input_tokens_seen": 1651507200,
"step": 3150,
"train_runtime": 14351.0274,
"train_tokens_per_second": 115079.371
},
{
"epoch": 0.1710002976271003,
"grad_norm": 0.17816977202892303,
"learning_rate": 0.004761502678672474,
"loss": 3.300872802734375,
"num_input_tokens_seen": 1656750080,
"step": 3160,
"train_runtime": 14396.2711,
"train_tokens_per_second": 115081.889
},
{
"epoch": 0.17154143780946454,
"grad_norm": 0.19343526661396027,
"learning_rate": 0.004759738106324546,
"loss": 3.2991104125976562,
"num_input_tokens_seen": 1661992960,
"step": 3170,
"train_runtime": 14441.5146,
"train_tokens_per_second": 115084.394
},
{
"epoch": 0.17208257799182877,
"grad_norm": 0.18815293908119202,
"learning_rate": 0.004757967398338635,
"loss": 3.307733154296875,
"num_input_tokens_seen": 1667235840,
"step": 3180,
"train_runtime": 14486.7506,
"train_tokens_per_second": 115086.943
},
{
"epoch": 0.17262371817419303,
"grad_norm": 0.18241587281227112,
"learning_rate": 0.004756190560120631,
"loss": 3.2984477996826174,
"num_input_tokens_seen": 1672478720,
"step": 3190,
"train_runtime": 14531.9861,
"train_tokens_per_second": 115089.48
},
{
"epoch": 0.17316485835655726,
"grad_norm": 0.18176402151584625,
"learning_rate": 0.00475440759709514,
"loss": 3.300640869140625,
"num_input_tokens_seen": 1677721600,
"step": 3200,
"train_runtime": 14577.2147,
"train_tokens_per_second": 115092.055
},
{
"epoch": 0.17370599853892152,
"grad_norm": 0.20022694766521454,
"learning_rate": 0.004752618514705466,
"loss": 3.300579071044922,
"num_input_tokens_seen": 1682964480,
"step": 3210,
"train_runtime": 14622.4411,
"train_tokens_per_second": 115094.632
},
{
"epoch": 0.17424713872128575,
"grad_norm": 0.18792809545993805,
"learning_rate": 0.0047508233184135945,
"loss": 3.295984649658203,
"num_input_tokens_seen": 1688207360,
"step": 3220,
"train_runtime": 14667.6619,
"train_tokens_per_second": 115097.237
},
{
"epoch": 0.17478827890364998,
"grad_norm": 0.200827494263649,
"learning_rate": 0.0047490220137001785,
"loss": 3.2906261444091798,
"num_input_tokens_seen": 1693450240,
"step": 3230,
"train_runtime": 14712.8844,
"train_tokens_per_second": 115099.813
},
{
"epoch": 0.17532941908601424,
"grad_norm": 0.19141127169132233,
"learning_rate": 0.004747214606064517,
"loss": 3.2837890625,
"num_input_tokens_seen": 1698693120,
"step": 3240,
"train_runtime": 14758.1057,
"train_tokens_per_second": 115102.382
},
{
"epoch": 0.17587055926837847,
"grad_norm": 0.18976351618766785,
"learning_rate": 0.0047454011010245436,
"loss": 3.287107467651367,
"num_input_tokens_seen": 1703936000,
"step": 3250,
"train_runtime": 14803.3273,
"train_tokens_per_second": 115104.933
},
{
"epoch": 0.17641169945074273,
"grad_norm": 0.19698546826839447,
"learning_rate": 0.004743581504116804,
"loss": 3.2882354736328123,
"num_input_tokens_seen": 1709178880,
"step": 3260,
"train_runtime": 14848.5302,
"train_tokens_per_second": 115107.614
},
{
"epoch": 0.17695283963310696,
"grad_norm": 0.17822493612766266,
"learning_rate": 0.004741755820896446,
"loss": 3.2927810668945314,
"num_input_tokens_seen": 1714421760,
"step": 3270,
"train_runtime": 14893.7537,
"train_tokens_per_second": 115110.119
},
{
"epoch": 0.1774939798154712,
"grad_norm": 0.1720447987318039,
"learning_rate": 0.004739924056937195,
"loss": 3.2904899597167967,
"num_input_tokens_seen": 1719664640,
"step": 3280,
"train_runtime": 14938.9717,
"train_tokens_per_second": 115112.652
},
{
"epoch": 0.17803511999783544,
"grad_norm": 0.18303626775741577,
"learning_rate": 0.004738086217831344,
"loss": 3.28282470703125,
"num_input_tokens_seen": 1724907520,
"step": 3290,
"train_runtime": 14984.1992,
"train_tokens_per_second": 115115.096
},
{
"epoch": 0.17857626018019968,
"grad_norm": 0.176763117313385,
"learning_rate": 0.004736242309189728,
"loss": 3.286945343017578,
"num_input_tokens_seen": 1730150400,
"step": 3300,
"train_runtime": 15029.4297,
"train_tokens_per_second": 115117.502
},
{
"epoch": 0.17911740036256393,
"grad_norm": 0.19218850135803223,
"learning_rate": 0.004734392336641718,
"loss": 3.290885162353516,
"num_input_tokens_seen": 1735393280,
"step": 3310,
"train_runtime": 15074.6639,
"train_tokens_per_second": 115119.866
},
{
"epoch": 0.17965854054492816,
"grad_norm": 0.180914968252182,
"learning_rate": 0.004732536305835194,
"loss": 3.2893463134765626,
"num_input_tokens_seen": 1740636160,
"step": 3320,
"train_runtime": 15119.9044,
"train_tokens_per_second": 115122.167
},
{
"epoch": 0.1801996807272924,
"grad_norm": 0.1835494488477707,
"learning_rate": 0.0047306742224365326,
"loss": 3.2857479095458983,
"num_input_tokens_seen": 1745879040,
"step": 3330,
"train_runtime": 15165.1285,
"train_tokens_per_second": 115124.579
},
{
"epoch": 0.18074082090965665,
"grad_norm": 0.1805170625448227,
"learning_rate": 0.004728806092130589,
"loss": 3.2880077362060547,
"num_input_tokens_seen": 1751121920,
"step": 3340,
"train_runtime": 15210.3276,
"train_tokens_per_second": 115127.166
},
{
"epoch": 0.18128196109202088,
"grad_norm": 0.18228840827941895,
"learning_rate": 0.00472693192062068,
"loss": 3.286875915527344,
"num_input_tokens_seen": 1756364800,
"step": 3350,
"train_runtime": 15255.5307,
"train_tokens_per_second": 115129.709
},
{
"epoch": 0.18182310127438514,
"grad_norm": 0.20272916555404663,
"learning_rate": 0.0047250517136285634,
"loss": 3.2986392974853516,
"num_input_tokens_seen": 1761607680,
"step": 3360,
"train_runtime": 15300.7449,
"train_tokens_per_second": 115132.151
},
{
"epoch": 0.18236424145674937,
"grad_norm": 0.17199651896953583,
"learning_rate": 0.0047231654768944255,
"loss": 3.2849578857421875,
"num_input_tokens_seen": 1766850560,
"step": 3370,
"train_runtime": 15345.9591,
"train_tokens_per_second": 115134.58
},
{
"epoch": 0.1829053816391136,
"grad_norm": 0.18118058145046234,
"learning_rate": 0.00472127321617686,
"loss": 3.2900650024414064,
"num_input_tokens_seen": 1772093440,
"step": 3380,
"train_runtime": 15391.1925,
"train_tokens_per_second": 115136.851
},
{
"epoch": 0.18344652182147786,
"grad_norm": 0.19814889132976532,
"learning_rate": 0.004719374937252852,
"loss": 3.280558776855469,
"num_input_tokens_seen": 1777336320,
"step": 3390,
"train_runtime": 15436.4025,
"train_tokens_per_second": 115139.283
},
{
"epoch": 0.1839876620038421,
"grad_norm": 0.2015380561351776,
"learning_rate": 0.00471747064591776,
"loss": 3.30006103515625,
"num_input_tokens_seen": 1782579200,
"step": 3400,
"train_runtime": 15481.5971,
"train_tokens_per_second": 115141.816
},
{
"epoch": 0.18452880218620635,
"grad_norm": 0.16767387092113495,
"learning_rate": 0.0047155603479852965,
"loss": 3.2787837982177734,
"num_input_tokens_seen": 1787822080,
"step": 3410,
"train_runtime": 15526.8015,
"train_tokens_per_second": 115144.261
},
{
"epoch": 0.18506994236857058,
"grad_norm": 0.169756680727005,
"learning_rate": 0.0047136440492875145,
"loss": 3.283340072631836,
"num_input_tokens_seen": 1793064960,
"step": 3420,
"train_runtime": 15572.0514,
"train_tokens_per_second": 115146.355
},
{
"epoch": 0.1856110825509348,
"grad_norm": 0.18903492391109467,
"learning_rate": 0.004711721755674787,
"loss": 3.289379119873047,
"num_input_tokens_seen": 1798307840,
"step": 3430,
"train_runtime": 15617.2557,
"train_tokens_per_second": 115148.774
},
{
"epoch": 0.18615222273329907,
"grad_norm": 0.19553808867931366,
"learning_rate": 0.004709793473015785,
"loss": 3.277596664428711,
"num_input_tokens_seen": 1803550720,
"step": 3440,
"train_runtime": 15666.2718,
"train_tokens_per_second": 115123.16
},
{
"epoch": 0.1866933629156633,
"grad_norm": 0.17524783313274384,
"learning_rate": 0.004707859207197468,
"loss": 3.272700881958008,
"num_input_tokens_seen": 1808793600,
"step": 3450,
"train_runtime": 15711.4431,
"train_tokens_per_second": 115125.873
},
{
"epoch": 0.18723450309802755,
"grad_norm": 0.1725703924894333,
"learning_rate": 0.004705918964125061,
"loss": 3.2771453857421875,
"num_input_tokens_seen": 1814036480,
"step": 3460,
"train_runtime": 15756.6281,
"train_tokens_per_second": 115128.47
},
{
"epoch": 0.18777564328039179,
"grad_norm": 0.18361718952655792,
"learning_rate": 0.004703972749722038,
"loss": 3.2812034606933596,
"num_input_tokens_seen": 1819279360,
"step": 3470,
"train_runtime": 15801.8225,
"train_tokens_per_second": 115130.983
},
{
"epoch": 0.18831678346275602,
"grad_norm": 0.18993116915225983,
"learning_rate": 0.004702020569930098,
"loss": 3.2690109252929687,
"num_input_tokens_seen": 1824522240,
"step": 3480,
"train_runtime": 15847.0133,
"train_tokens_per_second": 115133.508
},
{
"epoch": 0.18885792364512027,
"grad_norm": 0.1982622891664505,
"learning_rate": 0.004700062430709161,
"loss": 3.2883895874023437,
"num_input_tokens_seen": 1829765120,
"step": 3490,
"train_runtime": 15892.1956,
"train_tokens_per_second": 115136.081
},
{
"epoch": 0.1893990638274845,
"grad_norm": 0.1953168362379074,
"learning_rate": 0.004698098338037333,
"loss": 3.2819141387939452,
"num_input_tokens_seen": 1835008000,
"step": 3500,
"train_runtime": 15937.3587,
"train_tokens_per_second": 115138.778
},
{
"epoch": 0.1893990638274845,
"eval_loss": 3.2193782329559326,
"eval_runtime": 1.9829,
"eval_samples_per_second": 252.151,
"eval_steps_per_second": 4.034,
"num_input_tokens_seen": 1835008000,
"step": 3500
},
{
"epoch": 0.18994020400984876,
"grad_norm": 0.17765532433986664,
"learning_rate": 0.004696128297910899,
"loss": 3.2748733520507813,
"num_input_tokens_seen": 1840250880,
"step": 3510,
"train_runtime": 15984.532,
"train_tokens_per_second": 115126.979
},
{
"epoch": 0.190481344192213,
"grad_norm": 0.1692020744085312,
"learning_rate": 0.0046941523163443015,
"loss": 3.282354736328125,
"num_input_tokens_seen": 1845493760,
"step": 3520,
"train_runtime": 16029.707,
"train_tokens_per_second": 115129.601
},
{
"epoch": 0.19102248437457722,
"grad_norm": 0.17890560626983643,
"learning_rate": 0.00469217039937012,
"loss": 3.27266845703125,
"num_input_tokens_seen": 1850736640,
"step": 3530,
"train_runtime": 16074.8942,
"train_tokens_per_second": 115132.119
},
{
"epoch": 0.19156362455694148,
"grad_norm": 0.17925257980823517,
"learning_rate": 0.004690182553039058,
"loss": 3.28330078125,
"num_input_tokens_seen": 1855979520,
"step": 3540,
"train_runtime": 16120.1066,
"train_tokens_per_second": 115134.445
},
{
"epoch": 0.1921047647393057,
"grad_norm": 0.1788860410451889,
"learning_rate": 0.004688188783419917,
"loss": 3.2885406494140623,
"num_input_tokens_seen": 1861222400,
"step": 3550,
"train_runtime": 16165.2878,
"train_tokens_per_second": 115136.979
},
{
"epoch": 0.19264590492166997,
"grad_norm": 0.1811930388212204,
"learning_rate": 0.004686189096599585,
"loss": 3.2768978118896483,
"num_input_tokens_seen": 1866465280,
"step": 3560,
"train_runtime": 16210.475,
"train_tokens_per_second": 115139.456
},
{
"epoch": 0.1931870451040342,
"grad_norm": 0.2033502757549286,
"learning_rate": 0.004684183498683013,
"loss": 3.2799072265625,
"num_input_tokens_seen": 1871708160,
"step": 3570,
"train_runtime": 16255.6518,
"train_tokens_per_second": 115141.994
},
{
"epoch": 0.19372818528639843,
"grad_norm": 0.1871407926082611,
"learning_rate": 0.0046821719957932,
"loss": 3.2745807647705076,
"num_input_tokens_seen": 1876951040,
"step": 3580,
"train_runtime": 16300.8306,
"train_tokens_per_second": 115144.503
},
{
"epoch": 0.1942693254687627,
"grad_norm": 0.18156413733959198,
"learning_rate": 0.004680154594071171,
"loss": 3.275892639160156,
"num_input_tokens_seen": 1882193920,
"step": 3590,
"train_runtime": 16346.0348,
"train_tokens_per_second": 115146.819
},
{
"epoch": 0.19481046565112692,
"grad_norm": 0.17189612984657288,
"learning_rate": 0.004678131299675962,
"loss": 3.278411102294922,
"num_input_tokens_seen": 1887436800,
"step": 3600,
"train_runtime": 16391.2075,
"train_tokens_per_second": 115149.344
},
{
"epoch": 0.19535160583349118,
"grad_norm": 0.18602769076824188,
"learning_rate": 0.004676102118784596,
"loss": 3.2600128173828127,
"num_input_tokens_seen": 1892679680,
"step": 3610,
"train_runtime": 16436.3754,
"train_tokens_per_second": 115151.89
},
{
"epoch": 0.1958927460158554,
"grad_norm": 0.18247896432876587,
"learning_rate": 0.0046740670575920705,
"loss": 3.263835906982422,
"num_input_tokens_seen": 1897922560,
"step": 3620,
"train_runtime": 16481.5414,
"train_tokens_per_second": 115154.434
},
{
"epoch": 0.19643388619821964,
"grad_norm": 0.17899157106876373,
"learning_rate": 0.004672026122311332,
"loss": 3.266416549682617,
"num_input_tokens_seen": 1903165440,
"step": 3630,
"train_runtime": 16526.6863,
"train_tokens_per_second": 115157.111
},
{
"epoch": 0.1969750263805839,
"grad_norm": 0.19543124735355377,
"learning_rate": 0.004669979319173264,
"loss": 3.261871337890625,
"num_input_tokens_seen": 1908408320,
"step": 3640,
"train_runtime": 16571.8633,
"train_tokens_per_second": 115159.55
},
{
"epoch": 0.19751616656294813,
"grad_norm": 0.18458126485347748,
"learning_rate": 0.004667926654426661,
"loss": 3.2731971740722656,
"num_input_tokens_seen": 1913651200,
"step": 3650,
"train_runtime": 16617.0658,
"train_tokens_per_second": 115161.799
},
{
"epoch": 0.19805730674531238,
"grad_norm": 0.18683847784996033,
"learning_rate": 0.004665868134338213,
"loss": 3.2641891479492187,
"num_input_tokens_seen": 1918894080,
"step": 3660,
"train_runtime": 16662.2485,
"train_tokens_per_second": 115164.173
},
{
"epoch": 0.19859844692767661,
"grad_norm": 0.18460538983345032,
"learning_rate": 0.00466380376519249,
"loss": 3.261058044433594,
"num_input_tokens_seen": 1924136960,
"step": 3670,
"train_runtime": 16707.4259,
"train_tokens_per_second": 115166.572
},
{
"epoch": 0.19913958711004084,
"grad_norm": 0.17181532084941864,
"learning_rate": 0.004661733553291914,
"loss": 3.2611160278320312,
"num_input_tokens_seen": 1929379840,
"step": 3680,
"train_runtime": 16752.6562,
"train_tokens_per_second": 115168.593
},
{
"epoch": 0.1996807272924051,
"grad_norm": 0.19158703088760376,
"learning_rate": 0.004659657504956747,
"loss": 3.2646514892578127,
"num_input_tokens_seen": 1934622720,
"step": 3690,
"train_runtime": 16797.8998,
"train_tokens_per_second": 115170.512
},
{
"epoch": 0.20022186747476933,
"grad_norm": 0.18142655491828918,
"learning_rate": 0.004657575626525069,
"loss": 3.258336639404297,
"num_input_tokens_seen": 1939865600,
"step": 3700,
"train_runtime": 16843.1356,
"train_tokens_per_second": 115172.474
},
{
"epoch": 0.2007630076571336,
"grad_norm": 0.1888807713985443,
"learning_rate": 0.00465548792435276,
"loss": 3.256613922119141,
"num_input_tokens_seen": 1945108480,
"step": 3710,
"train_runtime": 16888.3732,
"train_tokens_per_second": 115174.414
},
{
"epoch": 0.20130414783949782,
"grad_norm": 0.17357957363128662,
"learning_rate": 0.004653394404813478,
"loss": 3.2642303466796876,
"num_input_tokens_seen": 1950351360,
"step": 3720,
"train_runtime": 16933.6199,
"train_tokens_per_second": 115176.281
},
{
"epoch": 0.20184528802186205,
"grad_norm": 0.17942315340042114,
"learning_rate": 0.004651295074298641,
"loss": 3.254298782348633,
"num_input_tokens_seen": 1955594240,
"step": 3730,
"train_runtime": 16978.845,
"train_tokens_per_second": 115178.284
},
{
"epoch": 0.2023864282042263,
"grad_norm": 0.17983372509479523,
"learning_rate": 0.00464918993921741,
"loss": 3.2564628601074217,
"num_input_tokens_seen": 1960837120,
"step": 3740,
"train_runtime": 17024.0569,
"train_tokens_per_second": 115180.367
},
{
"epoch": 0.20292756838659054,
"grad_norm": 0.19154661893844604,
"learning_rate": 0.004647079005996664,
"loss": 3.2626083374023436,
"num_input_tokens_seen": 1966080000,
"step": 3750,
"train_runtime": 17069.2756,
"train_tokens_per_second": 115182.392
},
{
"epoch": 0.2034687085689548,
"grad_norm": 0.16907712817192078,
"learning_rate": 0.0046449622810809865,
"loss": 3.2560802459716798,
"num_input_tokens_seen": 1971322880,
"step": 3760,
"train_runtime": 17114.512,
"train_tokens_per_second": 115184.288
},
{
"epoch": 0.20400984875131903,
"grad_norm": 0.1877511590719223,
"learning_rate": 0.004642839770932641,
"loss": 3.2611919403076173,
"num_input_tokens_seen": 1976565760,
"step": 3770,
"train_runtime": 17159.7356,
"train_tokens_per_second": 115186.26
},
{
"epoch": 0.20455098893368326,
"grad_norm": 0.1924838423728943,
"learning_rate": 0.004640711482031552,
"loss": 3.259069061279297,
"num_input_tokens_seen": 1981808640,
"step": 3780,
"train_runtime": 17204.9712,
"train_tokens_per_second": 115188.14
},
{
"epoch": 0.20509212911604752,
"grad_norm": 0.17791348695755005,
"learning_rate": 0.00463857742087529,
"loss": 3.2603363037109374,
"num_input_tokens_seen": 1987051520,
"step": 3790,
"train_runtime": 17250.1988,
"train_tokens_per_second": 115190.065
},
{
"epoch": 0.20563326929841175,
"grad_norm": 0.18873880803585052,
"learning_rate": 0.004636437593979043,
"loss": 3.260697937011719,
"num_input_tokens_seen": 1992294400,
"step": 3800,
"train_runtime": 17295.4319,
"train_tokens_per_second": 115191.943
},
{
"epoch": 0.206174409480776,
"grad_norm": 0.1765436977148056,
"learning_rate": 0.004634292007875606,
"loss": 3.25205078125,
"num_input_tokens_seen": 1997537280,
"step": 3810,
"train_runtime": 17340.6638,
"train_tokens_per_second": 115193.819
},
{
"epoch": 0.20671554966314024,
"grad_norm": 0.17367282509803772,
"learning_rate": 0.004632140669115353,
"loss": 3.2628250122070312,
"num_input_tokens_seen": 2002780160,
"step": 3820,
"train_runtime": 17390.4197,
"train_tokens_per_second": 115165.717
},
{
"epoch": 0.20725668984550447,
"grad_norm": 0.1866482049226761,
"learning_rate": 0.004629983584266224,
"loss": 3.255748748779297,
"num_input_tokens_seen": 2008023040,
"step": 3830,
"train_runtime": 17435.5785,
"train_tokens_per_second": 115168.134
},
{
"epoch": 0.20779783002786872,
"grad_norm": 0.18709656596183777,
"learning_rate": 0.004627820759913699,
"loss": 3.2663009643554686,
"num_input_tokens_seen": 2013265920,
"step": 3840,
"train_runtime": 17480.7483,
"train_tokens_per_second": 115170.465
},
{
"epoch": 0.20833897021023295,
"grad_norm": 0.19228561222553253,
"learning_rate": 0.0046256522026607814,
"loss": 3.2513301849365233,
"num_input_tokens_seen": 2018508800,
"step": 3850,
"train_runtime": 17525.9241,
"train_tokens_per_second": 115172.746
},
{
"epoch": 0.2088801103925972,
"grad_norm": 0.17255409061908722,
"learning_rate": 0.004623477919127976,
"loss": 3.243180847167969,
"num_input_tokens_seen": 2023751680,
"step": 3860,
"train_runtime": 17571.1102,
"train_tokens_per_second": 115174.947
},
{
"epoch": 0.20942125057496144,
"grad_norm": 0.176405668258667,
"learning_rate": 0.004621297915953271,
"loss": 3.2499061584472657,
"num_input_tokens_seen": 2028994560,
"step": 3870,
"train_runtime": 17616.3069,
"train_tokens_per_second": 115177.067
},
{
"epoch": 0.20996239075732567,
"grad_norm": 0.17931312322616577,
"learning_rate": 0.004619112199792115,
"loss": 3.263928985595703,
"num_input_tokens_seen": 2034237440,
"step": 3880,
"train_runtime": 17661.4804,
"train_tokens_per_second": 115179.328
},
{
"epoch": 0.21050353093968993,
"grad_norm": 0.1927679032087326,
"learning_rate": 0.004616920777317401,
"loss": 3.243641662597656,
"num_input_tokens_seen": 2039480320,
"step": 3890,
"train_runtime": 17706.6588,
"train_tokens_per_second": 115181.545
},
{
"epoch": 0.21104467112205416,
"grad_norm": 0.1834532767534256,
"learning_rate": 0.00461472365521944,
"loss": 3.2529090881347655,
"num_input_tokens_seen": 2044723200,
"step": 3900,
"train_runtime": 17751.8407,
"train_tokens_per_second": 115183.728
},
{
"epoch": 0.21158581130441842,
"grad_norm": 0.18204724788665771,
"learning_rate": 0.004612520840205942,
"loss": 3.252873992919922,
"num_input_tokens_seen": 2049966080,
"step": 3910,
"train_runtime": 17797.0199,
"train_tokens_per_second": 115185.918
},
{
"epoch": 0.21212695148678265,
"grad_norm": 0.17959408462047577,
"learning_rate": 0.0046103123390020045,
"loss": 3.2571083068847657,
"num_input_tokens_seen": 2055208960,
"step": 3920,
"train_runtime": 17842.2041,
"train_tokens_per_second": 115188.065
},
{
"epoch": 0.21266809166914688,
"grad_norm": 0.18271717429161072,
"learning_rate": 0.004608098158350076,
"loss": 3.2583240509033202,
"num_input_tokens_seen": 2060451840,
"step": 3930,
"train_runtime": 17887.3836,
"train_tokens_per_second": 115190.23
},
{
"epoch": 0.21320923185151114,
"grad_norm": 0.1708153486251831,
"learning_rate": 0.004605878305009951,
"loss": 3.2490577697753906,
"num_input_tokens_seen": 2065694720,
"step": 3940,
"train_runtime": 17932.5711,
"train_tokens_per_second": 115192.334
},
{
"epoch": 0.21375037203387537,
"grad_norm": 0.17891845107078552,
"learning_rate": 0.004603652785758739,
"loss": 3.253165435791016,
"num_input_tokens_seen": 2070937600,
"step": 3950,
"train_runtime": 17977.7786,
"train_tokens_per_second": 115194.299
},
{
"epoch": 0.21429151221623963,
"grad_norm": 0.19264911115169525,
"learning_rate": 0.0046014216073908465,
"loss": 3.252245330810547,
"num_input_tokens_seen": 2076180480,
"step": 3960,
"train_runtime": 18022.9578,
"train_tokens_per_second": 115196.434
},
{
"epoch": 0.21483265239860386,
"grad_norm": 0.17727237939834595,
"learning_rate": 0.00459918477671796,
"loss": 3.2557418823242186,
"num_input_tokens_seen": 2081423360,
"step": 3970,
"train_runtime": 18068.1382,
"train_tokens_per_second": 115198.552
},
{
"epoch": 0.2153737925809681,
"grad_norm": 0.18832355737686157,
"learning_rate": 0.00459694230056902,
"loss": 3.2547958374023436,
"num_input_tokens_seen": 2086666240,
"step": 3980,
"train_runtime": 18113.3099,
"train_tokens_per_second": 115200.714
},
{
"epoch": 0.21591493276333235,
"grad_norm": 0.1745108813047409,
"learning_rate": 0.004594694185790203,
"loss": 3.2427162170410155,
"num_input_tokens_seen": 2091909120,
"step": 3990,
"train_runtime": 18158.476,
"train_tokens_per_second": 115202.901
},
{
"epoch": 0.21645607294569658,
"grad_norm": 0.18806034326553345,
"learning_rate": 0.004592440439244901,
"loss": 3.247505950927734,
"num_input_tokens_seen": 2097152000,
"step": 4000,
"train_runtime": 18203.6569,
"train_tokens_per_second": 115204.984
},
{
"epoch": 0.21645607294569658,
"eval_loss": 3.1910831928253174,
"eval_runtime": 1.9924,
"eval_samples_per_second": 250.957,
"eval_steps_per_second": 4.015,
"num_input_tokens_seen": 2097152000,
"step": 4000
},
{
"epoch": 0.21699721312806083,
"grad_norm": 0.18182304501533508,
"learning_rate": 0.004590181067813696,
"loss": 3.2401611328125,
"num_input_tokens_seen": 2102394880,
"step": 4010,
"train_runtime": 18253.3021,
"train_tokens_per_second": 115178.879
},
{
"epoch": 0.21753835331042506,
"grad_norm": 0.19632118940353394,
"learning_rate": 0.004587916078394347,
"loss": 3.248242950439453,
"num_input_tokens_seen": 2107637760,
"step": 4020,
"train_runtime": 18298.4585,
"train_tokens_per_second": 115181.164
},
{
"epoch": 0.2180794934927893,
"grad_norm": 0.17511795461177826,
"learning_rate": 0.004585645477901763,
"loss": 3.2442108154296876,
"num_input_tokens_seen": 2112880640,
"step": 4030,
"train_runtime": 18343.616,
"train_tokens_per_second": 115183.432
},
{
"epoch": 0.21862063367515355,
"grad_norm": 0.18919962644577026,
"learning_rate": 0.004583369273267981,
"loss": 3.2474128723144533,
"num_input_tokens_seen": 2118123520,
"step": 4040,
"train_runtime": 18388.8196,
"train_tokens_per_second": 115185.399
},
{
"epoch": 0.21916177385751778,
"grad_norm": 0.1883443295955658,
"learning_rate": 0.00458108747144215,
"loss": 3.2397232055664062,
"num_input_tokens_seen": 2123366400,
"step": 4050,
"train_runtime": 18433.9903,
"train_tokens_per_second": 115187.562
},
{
"epoch": 0.21970291403988204,
"grad_norm": 0.16874343156814575,
"learning_rate": 0.004578800079390506,
"loss": 3.243609619140625,
"num_input_tokens_seen": 2128609280,
"step": 4060,
"train_runtime": 18479.1743,
"train_tokens_per_second": 115189.632
},
{
"epoch": 0.22024405422224627,
"grad_norm": 0.1780671924352646,
"learning_rate": 0.004576507104096353,
"loss": 3.249961090087891,
"num_input_tokens_seen": 2133852160,
"step": 4070,
"train_runtime": 18524.3424,
"train_tokens_per_second": 115191.79
},
{
"epoch": 0.2207851944046105,
"grad_norm": 0.1814623475074768,
"learning_rate": 0.0045742085525600365,
"loss": 3.247069549560547,
"num_input_tokens_seen": 2139095040,
"step": 4080,
"train_runtime": 18569.5288,
"train_tokens_per_second": 115193.825
},
{
"epoch": 0.22132633458697476,
"grad_norm": 0.18335077166557312,
"learning_rate": 0.004571904431798931,
"loss": 3.241147994995117,
"num_input_tokens_seen": 2144337920,
"step": 4090,
"train_runtime": 18614.7052,
"train_tokens_per_second": 115195.911
},
{
"epoch": 0.221867474769339,
"grad_norm": 0.16724441945552826,
"learning_rate": 0.004569594748847409,
"loss": 3.24347038269043,
"num_input_tokens_seen": 2149580800,
"step": 4100,
"train_runtime": 18659.889,
"train_tokens_per_second": 115197.942
},
{
"epoch": 0.22240861495170325,
"grad_norm": 0.17200608551502228,
"learning_rate": 0.004567279510756828,
"loss": 3.2341545104980467,
"num_input_tokens_seen": 2154823680,
"step": 4110,
"train_runtime": 18705.0717,
"train_tokens_per_second": 115199.969
},
{
"epoch": 0.22294975513406748,
"grad_norm": 0.178621307015419,
"learning_rate": 0.0045649587245955026,
"loss": 3.2321949005126953,
"num_input_tokens_seen": 2160066560,
"step": 4120,
"train_runtime": 18750.2407,
"train_tokens_per_second": 115202.071
},
{
"epoch": 0.2234908953164317,
"grad_norm": 0.18516632914543152,
"learning_rate": 0.0045626323974486864,
"loss": 3.238597869873047,
"num_input_tokens_seen": 2165309440,
"step": 4130,
"train_runtime": 18795.4162,
"train_tokens_per_second": 115204.123
},
{
"epoch": 0.22403203549879597,
"grad_norm": 0.20164692401885986,
"learning_rate": 0.004560300536418549,
"loss": 3.237165832519531,
"num_input_tokens_seen": 2170552320,
"step": 4140,
"train_runtime": 18840.5926,
"train_tokens_per_second": 115206.16
},
{
"epoch": 0.2245731756811602,
"grad_norm": 0.1872573047876358,
"learning_rate": 0.004557963148624155,
"loss": 3.2406959533691406,
"num_input_tokens_seen": 2175795200,
"step": 4150,
"train_runtime": 18885.7554,
"train_tokens_per_second": 115208.269
},
{
"epoch": 0.22511431586352446,
"grad_norm": 0.1811392605304718,
"learning_rate": 0.0045556202412014414,
"loss": 3.235840606689453,
"num_input_tokens_seen": 2181038080,
"step": 4160,
"train_runtime": 18930.9167,
"train_tokens_per_second": 115210.379
},
{
"epoch": 0.22565545604588869,
"grad_norm": 0.1595960259437561,
"learning_rate": 0.0045532718213031976,
"loss": 3.2397125244140623,
"num_input_tokens_seen": 2186280960,
"step": 4170,
"train_runtime": 18976.0748,
"train_tokens_per_second": 115212.497
},
{
"epoch": 0.22619659622825292,
"grad_norm": 0.16895633935928345,
"learning_rate": 0.00455091789609904,
"loss": 3.2353279113769533,
"num_input_tokens_seen": 2191523840,
"step": 4180,
"train_runtime": 19021.2321,
"train_tokens_per_second": 115214.61
},
{
"epoch": 0.22673773641061717,
"grad_norm": 0.17124150693416595,
"learning_rate": 0.004548558472775396,
"loss": 3.2387535095214846,
"num_input_tokens_seen": 2196766720,
"step": 4190,
"train_runtime": 19066.399,
"train_tokens_per_second": 115216.656
},
{
"epoch": 0.2272788765929814,
"grad_norm": 0.1730462908744812,
"learning_rate": 0.004546193558535476,
"loss": 3.228282165527344,
"num_input_tokens_seen": 2202009600,
"step": 4200,
"train_runtime": 19115.1785,
"train_tokens_per_second": 115196.916
},
{
"epoch": 0.22782001677534566,
"grad_norm": 0.2116994708776474,
"learning_rate": 0.004543823160599253,
"loss": 3.228871154785156,
"num_input_tokens_seen": 2207252480,
"step": 4210,
"train_runtime": 19160.343,
"train_tokens_per_second": 115199.007
},
{
"epoch": 0.2283611569577099,
"grad_norm": 0.1870228499174118,
"learning_rate": 0.004541447286203444,
"loss": 3.2268039703369142,
"num_input_tokens_seen": 2212495360,
"step": 4220,
"train_runtime": 19205.5057,
"train_tokens_per_second": 115201.099
},
{
"epoch": 0.22890229714007412,
"grad_norm": 0.18021926283836365,
"learning_rate": 0.004539065942601484,
"loss": 3.2385711669921875,
"num_input_tokens_seen": 2217738240,
"step": 4230,
"train_runtime": 19250.6698,
"train_tokens_per_second": 115203.173
},
{
"epoch": 0.22944343732243838,
"grad_norm": 0.178096741437912,
"learning_rate": 0.004536679137063506,
"loss": 3.2425048828125,
"num_input_tokens_seen": 2222981120,
"step": 4240,
"train_runtime": 19295.8409,
"train_tokens_per_second": 115205.195
},
{
"epoch": 0.2299845775048026,
"grad_norm": 0.17963331937789917,
"learning_rate": 0.004534286876876316,
"loss": 3.2272270202636717,
"num_input_tokens_seen": 2228224000,
"step": 4250,
"train_runtime": 19341.0226,
"train_tokens_per_second": 115207.145
},
{
"epoch": 0.23052571768716687,
"grad_norm": 0.1644730269908905,
"learning_rate": 0.004531889169343374,
"loss": 3.232299041748047,
"num_input_tokens_seen": 2233466880,
"step": 4260,
"train_runtime": 19386.1815,
"train_tokens_per_second": 115209.221
},
{
"epoch": 0.2310668578695311,
"grad_norm": 0.18202030658721924,
"learning_rate": 0.004529486021784774,
"loss": 3.232588195800781,
"num_input_tokens_seen": 2238709760,
"step": 4270,
"train_runtime": 19431.3552,
"train_tokens_per_second": 115211.201
},
{
"epoch": 0.23160799805189533,
"grad_norm": 0.1674603968858719,
"learning_rate": 0.004527077441537213,
"loss": 3.2268638610839844,
"num_input_tokens_seen": 2243952640,
"step": 4280,
"train_runtime": 19476.5366,
"train_tokens_per_second": 115213.125
},
{
"epoch": 0.2321491382342596,
"grad_norm": 0.17482970654964447,
"learning_rate": 0.004524663435953974,
"loss": 3.231060791015625,
"num_input_tokens_seen": 2249195520,
"step": 4290,
"train_runtime": 19521.6994,
"train_tokens_per_second": 115215.15
},
{
"epoch": 0.23269027841662382,
"grad_norm": 0.1693650186061859,
"learning_rate": 0.004522244012404908,
"loss": 3.2219474792480467,
"num_input_tokens_seen": 2254438400,
"step": 4300,
"train_runtime": 19566.837,
"train_tokens_per_second": 115217.314
},
{
"epoch": 0.23323141859898808,
"grad_norm": 0.16282694041728973,
"learning_rate": 0.004519819178276401,
"loss": 3.214075469970703,
"num_input_tokens_seen": 2259681280,
"step": 4310,
"train_runtime": 19611.992,
"train_tokens_per_second": 115219.366
},
{
"epoch": 0.2337725587813523,
"grad_norm": 0.17166836559772491,
"learning_rate": 0.004517388940971363,
"loss": 3.229071044921875,
"num_input_tokens_seen": 2264924160,
"step": 4320,
"train_runtime": 19657.1365,
"train_tokens_per_second": 115221.47
},
{
"epoch": 0.23431369896371654,
"grad_norm": 0.1811853051185608,
"learning_rate": 0.004514953307909195,
"loss": 3.2278045654296874,
"num_input_tokens_seen": 2270167040,
"step": 4330,
"train_runtime": 19702.2886,
"train_tokens_per_second": 115223.52
},
{
"epoch": 0.2348548391460808,
"grad_norm": 0.18831849098205566,
"learning_rate": 0.0045125122865257725,
"loss": 3.2335960388183596,
"num_input_tokens_seen": 2275409920,
"step": 4340,
"train_runtime": 19747.4554,
"train_tokens_per_second": 115225.475
},
{
"epoch": 0.23539597932844503,
"grad_norm": 0.18200556933879852,
"learning_rate": 0.004510065884273422,
"loss": 3.230799102783203,
"num_input_tokens_seen": 2280652800,
"step": 4350,
"train_runtime": 19792.6287,
"train_tokens_per_second": 115227.383
},
{
"epoch": 0.23593711951080928,
"grad_norm": 0.18054424226284027,
"learning_rate": 0.004507614108620896,
"loss": 3.2332107543945314,
"num_input_tokens_seen": 2285895680,
"step": 4360,
"train_runtime": 19837.7879,
"train_tokens_per_second": 115229.364
},
{
"epoch": 0.23647825969317351,
"grad_norm": 0.17672619223594666,
"learning_rate": 0.004505156967053355,
"loss": 3.229229736328125,
"num_input_tokens_seen": 2291138560,
"step": 4370,
"train_runtime": 19882.9214,
"train_tokens_per_second": 115231.485
},
{
"epoch": 0.23701939987553775,
"grad_norm": 0.18023458123207092,
"learning_rate": 0.004502694467072336,
"loss": 3.221567916870117,
"num_input_tokens_seen": 2296381440,
"step": 4380,
"train_runtime": 19928.0618,
"train_tokens_per_second": 115233.557
},
{
"epoch": 0.237560540057902,
"grad_norm": 0.18084236979484558,
"learning_rate": 0.0045002266161957415,
"loss": 3.2244552612304687,
"num_input_tokens_seen": 2301624320,
"step": 4390,
"train_runtime": 19973.1892,
"train_tokens_per_second": 115235.694
},
{
"epoch": 0.23810168024026623,
"grad_norm": 0.17632804811000824,
"learning_rate": 0.004497753421957804,
"loss": 3.2264179229736327,
"num_input_tokens_seen": 2306867200,
"step": 4400,
"train_runtime": 20018.2945,
"train_tokens_per_second": 115237.949
},
{
"epoch": 0.2386428204226305,
"grad_norm": 0.18496030569076538,
"learning_rate": 0.004495274891909074,
"loss": 3.2306861877441406,
"num_input_tokens_seen": 2312110080,
"step": 4410,
"train_runtime": 20063.4387,
"train_tokens_per_second": 115239.97
},
{
"epoch": 0.23918396060499472,
"grad_norm": 0.19217975437641144,
"learning_rate": 0.004492791033616388,
"loss": 3.2278289794921875,
"num_input_tokens_seen": 2317352960,
"step": 4420,
"train_runtime": 20108.5903,
"train_tokens_per_second": 115241.94
},
{
"epoch": 0.23972510078735895,
"grad_norm": 0.17978626489639282,
"learning_rate": 0.004490301854662851,
"loss": 3.222820281982422,
"num_input_tokens_seen": 2322595840,
"step": 4430,
"train_runtime": 20153.7635,
"train_tokens_per_second": 115243.778
},
{
"epoch": 0.2402662409697232,
"grad_norm": 0.1925116330385208,
"learning_rate": 0.0044878073626478145,
"loss": 3.216511535644531,
"num_input_tokens_seen": 2327838720,
"step": 4440,
"train_runtime": 20198.9336,
"train_tokens_per_second": 115245.625
},
{
"epoch": 0.24080738115208744,
"grad_norm": 0.1764633059501648,
"learning_rate": 0.004485307565186844,
"loss": 3.2247901916503907,
"num_input_tokens_seen": 2333081600,
"step": 4450,
"train_runtime": 20244.1177,
"train_tokens_per_second": 115247.383
},
{
"epoch": 0.2413485213344517,
"grad_norm": 0.18181581795215607,
"learning_rate": 0.0044828024699117095,
"loss": 3.2144775390625,
"num_input_tokens_seen": 2338324480,
"step": 4460,
"train_runtime": 20289.2915,
"train_tokens_per_second": 115249.193
},
{
"epoch": 0.24188966151681593,
"grad_norm": 0.1797225922346115,
"learning_rate": 0.0044802920844703486,
"loss": 3.2179054260253905,
"num_input_tokens_seen": 2343567360,
"step": 4470,
"train_runtime": 20334.4596,
"train_tokens_per_second": 115251.028
},
{
"epoch": 0.24243080169918016,
"grad_norm": 0.17131617665290833,
"learning_rate": 0.004477776416526856,
"loss": 3.2136348724365233,
"num_input_tokens_seen": 2348810240,
"step": 4480,
"train_runtime": 20379.6251,
"train_tokens_per_second": 115252.868
},
{
"epoch": 0.24297194188154442,
"grad_norm": 0.17490802705287933,
"learning_rate": 0.004475255473761447,
"loss": 3.223601531982422,
"num_input_tokens_seen": 2354053120,
"step": 4490,
"train_runtime": 20424.7932,
"train_tokens_per_second": 115254.686
},
{
"epoch": 0.24351308206390865,
"grad_norm": 0.18434712290763855,
"learning_rate": 0.004472729263870446,
"loss": 3.219706726074219,
"num_input_tokens_seen": 2359296000,
"step": 4500,
"train_runtime": 20469.9487,
"train_tokens_per_second": 115256.566
},
{
"epoch": 0.24351308206390865,
"eval_loss": 3.1681437492370605,
"eval_runtime": 1.9847,
"eval_samples_per_second": 251.921,
"eval_steps_per_second": 4.031,
"num_input_tokens_seen": 2359296000,
"step": 4500
},
{
"epoch": 0.2440542222462729,
"grad_norm": 0.18588702380657196,
"learning_rate": 0.0044701977945662535,
"loss": 3.2231178283691406,
"num_input_tokens_seen": 2364538880,
"step": 4510,
"train_runtime": 20517.077,
"train_tokens_per_second": 115247.356
},
{
"epoch": 0.24459536242863714,
"grad_norm": 0.1629338264465332,
"learning_rate": 0.004467661073577332,
"loss": 3.2203128814697264,
"num_input_tokens_seen": 2369781760,
"step": 4520,
"train_runtime": 20562.2414,
"train_tokens_per_second": 115249.195
},
{
"epoch": 0.24513650261100137,
"grad_norm": 0.19198022782802582,
"learning_rate": 0.00446511910864817,
"loss": 3.2169837951660156,
"num_input_tokens_seen": 2375024640,
"step": 4530,
"train_runtime": 20607.3939,
"train_tokens_per_second": 115251.092
},
{
"epoch": 0.24567764279336562,
"grad_norm": 0.17990648746490479,
"learning_rate": 0.004462571907539273,
"loss": 3.2237472534179688,
"num_input_tokens_seen": 2380267520,
"step": 4540,
"train_runtime": 20652.5476,
"train_tokens_per_second": 115252.974
},
{
"epoch": 0.24621878297572986,
"grad_norm": 0.17402444779872894,
"learning_rate": 0.004460019478027127,
"loss": 3.2200748443603517,
"num_input_tokens_seen": 2385510400,
"step": 4550,
"train_runtime": 20697.698,
"train_tokens_per_second": 115254.865
},
{
"epoch": 0.2467599231580941,
"grad_norm": 0.17017342150211334,
"learning_rate": 0.004457461827904183,
"loss": 3.2241039276123047,
"num_input_tokens_seen": 2390753280,
"step": 4560,
"train_runtime": 20742.8484,
"train_tokens_per_second": 115256.749
},
{
"epoch": 0.24730106334045834,
"grad_norm": 0.17307622730731964,
"learning_rate": 0.004454898964978828,
"loss": 3.2237174987792967,
"num_input_tokens_seen": 2395996160,
"step": 4570,
"train_runtime": 20788.0181,
"train_tokens_per_second": 115258.518
},
{
"epoch": 0.24784220352282257,
"grad_norm": 0.18426761031150818,
"learning_rate": 0.004452330897075365,
"loss": 3.2148464202880858,
"num_input_tokens_seen": 2401239040,
"step": 4580,
"train_runtime": 20836.8458,
"train_tokens_per_second": 115240.045
},
{
"epoch": 0.24838334370518683,
"grad_norm": 0.18935158848762512,
"learning_rate": 0.004449757632033987,
"loss": 3.2203189849853517,
"num_input_tokens_seen": 2406481920,
"step": 4590,
"train_runtime": 20882.0043,
"train_tokens_per_second": 115241.903
},
{
"epoch": 0.24892448388755106,
"grad_norm": 0.17916643619537354,
"learning_rate": 0.004447179177710755,
"loss": 3.220214080810547,
"num_input_tokens_seen": 2411724800,
"step": 4600,
"train_runtime": 20927.1721,
"train_tokens_per_second": 115243.703
},
{
"epoch": 0.24946562406991532,
"grad_norm": 0.1714351922273636,
"learning_rate": 0.0044445955419775696,
"loss": 3.2130130767822265,
"num_input_tokens_seen": 2416967680,
"step": 4610,
"train_runtime": 20972.3937,
"train_tokens_per_second": 115245.199
},
{
"epoch": 0.2500067642522796,
"grad_norm": 0.17399680614471436,
"learning_rate": 0.004442006732722152,
"loss": 3.2150115966796875,
"num_input_tokens_seen": 2422210560,
"step": 4620,
"train_runtime": 21017.5875,
"train_tokens_per_second": 115246.841
},
{
"epoch": 0.2505479044346438,
"grad_norm": 0.18769405782222748,
"learning_rate": 0.00443941275784802,
"loss": 3.2187454223632814,
"num_input_tokens_seen": 2427453440,
"step": 4630,
"train_runtime": 21062.785,
"train_tokens_per_second": 115248.456
},
{
"epoch": 0.25108904461700804,
"grad_norm": 0.1909925937652588,
"learning_rate": 0.004436813625274458,
"loss": 3.228108215332031,
"num_input_tokens_seen": 2432696320,
"step": 4640,
"train_runtime": 21107.9893,
"train_tokens_per_second": 115250.026
},
{
"epoch": 0.25163018479937227,
"grad_norm": 0.16732154786586761,
"learning_rate": 0.004434209342936497,
"loss": 3.213469314575195,
"num_input_tokens_seen": 2437939200,
"step": 4650,
"train_runtime": 21153.1981,
"train_tokens_per_second": 115251.566
},
{
"epoch": 0.2521713249817365,
"grad_norm": 0.17457814514636993,
"learning_rate": 0.0044315999187848915,
"loss": 3.224944305419922,
"num_input_tokens_seen": 2443182080,
"step": 4660,
"train_runtime": 21198.4016,
"train_tokens_per_second": 115253.127
},
{
"epoch": 0.2527124651641008,
"grad_norm": 0.192255899310112,
"learning_rate": 0.004428985360786096,
"loss": 3.227398681640625,
"num_input_tokens_seen": 2448424960,
"step": 4670,
"train_runtime": 21243.6232,
"train_tokens_per_second": 115254.584
},
{
"epoch": 0.253253605346465,
"grad_norm": 0.17784617841243744,
"learning_rate": 0.004426365676922234,
"loss": 3.2128623962402343,
"num_input_tokens_seen": 2453667840,
"step": 4680,
"train_runtime": 21288.8398,
"train_tokens_per_second": 115256.062
},
{
"epoch": 0.25379474552882925,
"grad_norm": 0.17195752263069153,
"learning_rate": 0.00442374087519108,
"loss": 3.2142982482910156,
"num_input_tokens_seen": 2458910720,
"step": 4690,
"train_runtime": 21334.0638,
"train_tokens_per_second": 115257.493
},
{
"epoch": 0.2543358857111935,
"grad_norm": 0.1722942292690277,
"learning_rate": 0.004421110963606032,
"loss": 3.210185241699219,
"num_input_tokens_seen": 2464153600,
"step": 4700,
"train_runtime": 21379.267,
"train_tokens_per_second": 115259.031
},
{
"epoch": 0.2548770258935577,
"grad_norm": 0.16966107487678528,
"learning_rate": 0.00441847595019609,
"loss": 3.2123428344726563,
"num_input_tokens_seen": 2469396480,
"step": 4710,
"train_runtime": 21424.483,
"train_tokens_per_second": 115260.493
},
{
"epoch": 0.255418166075922,
"grad_norm": 0.18033796548843384,
"learning_rate": 0.004415835843005828,
"loss": 3.2065505981445312,
"num_input_tokens_seen": 2474639360,
"step": 4720,
"train_runtime": 21469.6877,
"train_tokens_per_second": 115262.01
},
{
"epoch": 0.2559593062582862,
"grad_norm": 0.18272215127944946,
"learning_rate": 0.004413190650095373,
"loss": 3.2069171905517577,
"num_input_tokens_seen": 2479882240,
"step": 4730,
"train_runtime": 21514.9,
"train_tokens_per_second": 115263.48
},
{
"epoch": 0.25650044644065045,
"grad_norm": 0.17386901378631592,
"learning_rate": 0.004410540379540377,
"loss": 3.2177162170410156,
"num_input_tokens_seen": 2485125120,
"step": 4740,
"train_runtime": 21560.142,
"train_tokens_per_second": 115264.784
},
{
"epoch": 0.2570415866230147,
"grad_norm": 0.17471922934055328,
"learning_rate": 0.0044078850394319935,
"loss": 3.2096931457519533,
"num_input_tokens_seen": 2490368000,
"step": 4750,
"train_runtime": 21605.3695,
"train_tokens_per_second": 115266.161
},
{
"epoch": 0.2575827268053789,
"grad_norm": 0.188929483294487,
"learning_rate": 0.004405224637876854,
"loss": 3.215177536010742,
"num_input_tokens_seen": 2495610880,
"step": 4760,
"train_runtime": 21650.575,
"train_tokens_per_second": 115267.649
},
{
"epoch": 0.2581238669877432,
"grad_norm": 0.18201977014541626,
"learning_rate": 0.0044025591829970415,
"loss": 3.2025718688964844,
"num_input_tokens_seen": 2500853760,
"step": 4770,
"train_runtime": 21695.7857,
"train_tokens_per_second": 115269.103
},
{
"epoch": 0.25866500717010743,
"grad_norm": 0.18745562434196472,
"learning_rate": 0.004399888682930069,
"loss": 3.2124725341796876,
"num_input_tokens_seen": 2506096640,
"step": 4780,
"train_runtime": 21740.9904,
"train_tokens_per_second": 115270.583
},
{
"epoch": 0.25920614735247166,
"grad_norm": 0.18076451122760773,
"learning_rate": 0.004397213145828847,
"loss": 3.2005435943603517,
"num_input_tokens_seen": 2511339520,
"step": 4790,
"train_runtime": 21786.1967,
"train_tokens_per_second": 115272.049
},
{
"epoch": 0.2597472875348359,
"grad_norm": 0.16596098244190216,
"learning_rate": 0.004394532579861671,
"loss": 3.197236251831055,
"num_input_tokens_seen": 2516582400,
"step": 4800,
"train_runtime": 21831.4029,
"train_tokens_per_second": 115273.508
},
{
"epoch": 0.2602884277172001,
"grad_norm": 0.17128406465053558,
"learning_rate": 0.004391846993212182,
"loss": 3.2089080810546875,
"num_input_tokens_seen": 2521825280,
"step": 4810,
"train_runtime": 21876.6005,
"train_tokens_per_second": 115275.007
},
{
"epoch": 0.2608295678995644,
"grad_norm": 0.17832306027412415,
"learning_rate": 0.004389156394079355,
"loss": 3.202547073364258,
"num_input_tokens_seen": 2527068160,
"step": 4820,
"train_runtime": 21921.8037,
"train_tokens_per_second": 115276.471
},
{
"epoch": 0.26137070808192864,
"grad_norm": 0.16681405901908875,
"learning_rate": 0.004386460790677465,
"loss": 3.2106822967529296,
"num_input_tokens_seen": 2532311040,
"step": 4830,
"train_runtime": 21967.0048,
"train_tokens_per_second": 115277.939
},
{
"epoch": 0.26191184826429287,
"grad_norm": 0.17566899955272675,
"learning_rate": 0.004383760191236065,
"loss": 3.2070526123046874,
"num_input_tokens_seen": 2537553920,
"step": 4840,
"train_runtime": 22012.208,
"train_tokens_per_second": 115279.39
},
{
"epoch": 0.2624529884466571,
"grad_norm": 0.17574016749858856,
"learning_rate": 0.00438105460399996,
"loss": 3.203447723388672,
"num_input_tokens_seen": 2542796800,
"step": 4850,
"train_runtime": 22057.4092,
"train_tokens_per_second": 115280.846
},
{
"epoch": 0.26299412862902133,
"grad_norm": 0.16241556406021118,
"learning_rate": 0.004378344037229184,
"loss": 3.2026832580566404,
"num_input_tokens_seen": 2548039680,
"step": 4860,
"train_runtime": 22102.6211,
"train_tokens_per_second": 115282.24
},
{
"epoch": 0.2635352688113856,
"grad_norm": 0.1805507242679596,
"learning_rate": 0.004375628499198973,
"loss": 3.2010284423828126,
"num_input_tokens_seen": 2553282560,
"step": 4870,
"train_runtime": 22147.8116,
"train_tokens_per_second": 115283.74
},
{
"epoch": 0.26407640899374984,
"grad_norm": 0.16756032407283783,
"learning_rate": 0.004372907998199739,
"loss": 3.2070991516113283,
"num_input_tokens_seen": 2558525440,
"step": 4880,
"train_runtime": 22192.9705,
"train_tokens_per_second": 115285.398
},
{
"epoch": 0.2646175491761141,
"grad_norm": 0.18972600996494293,
"learning_rate": 0.004370182542537047,
"loss": 3.214699554443359,
"num_input_tokens_seen": 2563768320,
"step": 4890,
"train_runtime": 22238.1209,
"train_tokens_per_second": 115287.094
},
{
"epoch": 0.2651586893584783,
"grad_norm": 0.1896647959947586,
"learning_rate": 0.004367452140531587,
"loss": 3.205576705932617,
"num_input_tokens_seen": 2569011200,
"step": 4900,
"train_runtime": 22283.3129,
"train_tokens_per_second": 115288.566
},
{
"epoch": 0.26569982954084254,
"grad_norm": 0.18498484790325165,
"learning_rate": 0.004364716800519152,
"loss": 3.2080978393554687,
"num_input_tokens_seen": 2574254080,
"step": 4910,
"train_runtime": 22328.4859,
"train_tokens_per_second": 115290.132
},
{
"epoch": 0.2662409697232068,
"grad_norm": 0.1854403018951416,
"learning_rate": 0.0043619765308506074,
"loss": 3.203238677978516,
"num_input_tokens_seen": 2579496960,
"step": 4920,
"train_runtime": 22373.6522,
"train_tokens_per_second": 115291.725
},
{
"epoch": 0.26678210990557105,
"grad_norm": 0.1691334992647171,
"learning_rate": 0.004359231339891872,
"loss": 3.1914302825927736,
"num_input_tokens_seen": 2584739840,
"step": 4930,
"train_runtime": 22418.8106,
"train_tokens_per_second": 115293.353
},
{
"epoch": 0.2673232500879353,
"grad_norm": 0.17332448065280914,
"learning_rate": 0.004356481236023887,
"loss": 3.2087932586669923,
"num_input_tokens_seen": 2589982720,
"step": 4940,
"train_runtime": 22463.9738,
"train_tokens_per_second": 115294.949
},
{
"epoch": 0.2678643902702995,
"grad_norm": 0.1679113507270813,
"learning_rate": 0.004353726227642593,
"loss": 3.2014122009277344,
"num_input_tokens_seen": 2595225600,
"step": 4950,
"train_runtime": 22509.1287,
"train_tokens_per_second": 115296.582
},
{
"epoch": 0.26840553045266374,
"grad_norm": 0.16913928091526031,
"learning_rate": 0.004350966323158903,
"loss": 3.1890819549560545,
"num_input_tokens_seen": 2600468480,
"step": 4960,
"train_runtime": 22554.2873,
"train_tokens_per_second": 115298.189
},
{
"epoch": 0.26894667063502803,
"grad_norm": 0.16906581819057465,
"learning_rate": 0.00434820153099868,
"loss": 3.202825927734375,
"num_input_tokens_seen": 2605711360,
"step": 4970,
"train_runtime": 22602.9949,
"train_tokens_per_second": 115281.686
},
{
"epoch": 0.26948781081739226,
"grad_norm": 0.16878265142440796,
"learning_rate": 0.004345431859602706,
"loss": 3.200624465942383,
"num_input_tokens_seen": 2610954240,
"step": 4980,
"train_runtime": 22648.1981,
"train_tokens_per_second": 115283.089
},
{
"epoch": 0.2700289509997565,
"grad_norm": 0.1862846463918686,
"learning_rate": 0.004342657317426662,
"loss": 3.206439971923828,
"num_input_tokens_seen": 2616197120,
"step": 4990,
"train_runtime": 22693.3935,
"train_tokens_per_second": 115284.526
},
{
"epoch": 0.2705700911821207,
"grad_norm": 0.16954657435417175,
"learning_rate": 0.004339877912941097,
"loss": 3.199533462524414,
"num_input_tokens_seen": 2621440000,
"step": 5000,
"train_runtime": 22738.6005,
"train_tokens_per_second": 115285.899
},
{
"epoch": 0.2705700911821207,
"eval_loss": 3.146559715270996,
"eval_runtime": 1.9859,
"eval_samples_per_second": 251.773,
"eval_steps_per_second": 4.028,
"num_input_tokens_seen": 2621440000,
"step": 5000
},
{
"epoch": 0.27111123136448495,
"grad_norm": 0.1746288388967514,
"learning_rate": 0.004337093654631402,
"loss": 3.195170593261719,
"num_input_tokens_seen": 2626682880,
"step": 5010,
"train_runtime": 22788.1861,
"train_tokens_per_second": 115265.114
},
{
"epoch": 0.27165237154684924,
"grad_norm": 0.182390496134758,
"learning_rate": 0.004334304550997793,
"loss": 3.184975433349609,
"num_input_tokens_seen": 2631925760,
"step": 5020,
"train_runtime": 22833.4175,
"train_tokens_per_second": 115266.397
},
{
"epoch": 0.27219351172921347,
"grad_norm": 0.18103830516338348,
"learning_rate": 0.004331510610555275,
"loss": 3.190489959716797,
"num_input_tokens_seen": 2637168640,
"step": 5030,
"train_runtime": 22878.6263,
"train_tokens_per_second": 115267.788
},
{
"epoch": 0.2727346519115777,
"grad_norm": 0.1782936155796051,
"learning_rate": 0.004328711841833618,
"loss": 3.196137237548828,
"num_input_tokens_seen": 2642411520,
"step": 5040,
"train_runtime": 22923.8218,
"train_tokens_per_second": 115269.24
},
{
"epoch": 0.2732757920939419,
"grad_norm": 0.185542032122612,
"learning_rate": 0.0043259082533773354,
"loss": 3.190313720703125,
"num_input_tokens_seen": 2647654400,
"step": 5050,
"train_runtime": 22969.0255,
"train_tokens_per_second": 115270.646
},
{
"epoch": 0.27381693227630616,
"grad_norm": 0.16143307089805603,
"learning_rate": 0.0043230998537456536,
"loss": 3.2025264739990233,
"num_input_tokens_seen": 2652897280,
"step": 5060,
"train_runtime": 23014.1965,
"train_tokens_per_second": 115272.209
},
{
"epoch": 0.27435807245867044,
"grad_norm": 0.16813282668590546,
"learning_rate": 0.004320286651512486,
"loss": 3.1958364486694335,
"num_input_tokens_seen": 2658140160,
"step": 5070,
"train_runtime": 23059.3886,
"train_tokens_per_second": 115273.662
},
{
"epoch": 0.2748992126410347,
"grad_norm": 0.18112315237522125,
"learning_rate": 0.004317468655266412,
"loss": 3.194669723510742,
"num_input_tokens_seen": 2663383040,
"step": 5080,
"train_runtime": 23104.5863,
"train_tokens_per_second": 115275.08
},
{
"epoch": 0.2754403528233989,
"grad_norm": 0.18187521398067474,
"learning_rate": 0.004314645873610643,
"loss": 3.1878196716308596,
"num_input_tokens_seen": 2668625920,
"step": 5090,
"train_runtime": 23149.7951,
"train_tokens_per_second": 115276.438
},
{
"epoch": 0.27598149300576313,
"grad_norm": 0.16804195940494537,
"learning_rate": 0.004311818315163001,
"loss": 3.2023330688476563,
"num_input_tokens_seen": 2673868800,
"step": 5100,
"train_runtime": 23194.9829,
"train_tokens_per_second": 115277.895
},
{
"epoch": 0.27652263318812736,
"grad_norm": 0.16169899702072144,
"learning_rate": 0.004308985988555892,
"loss": 3.195353889465332,
"num_input_tokens_seen": 2679111680,
"step": 5110,
"train_runtime": 23240.1686,
"train_tokens_per_second": 115279.356
},
{
"epoch": 0.27706377337049165,
"grad_norm": 0.1690625697374344,
"learning_rate": 0.004306148902436281,
"loss": 3.1894439697265624,
"num_input_tokens_seen": 2684354560,
"step": 5120,
"train_runtime": 23285.3699,
"train_tokens_per_second": 115280.735
},
{
"epoch": 0.2776049135528559,
"grad_norm": 0.1880822330713272,
"learning_rate": 0.00430330706546566,
"loss": 3.1982452392578127,
"num_input_tokens_seen": 2689597440,
"step": 5130,
"train_runtime": 23330.5682,
"train_tokens_per_second": 115282.123
},
{
"epoch": 0.2781460537352201,
"grad_norm": 0.1790066808462143,
"learning_rate": 0.004300460486320026,
"loss": 3.1980308532714843,
"num_input_tokens_seen": 2694840320,
"step": 5140,
"train_runtime": 23375.7756,
"train_tokens_per_second": 115283.461
},
{
"epoch": 0.27868719391758434,
"grad_norm": 0.16601233184337616,
"learning_rate": 0.004297609173689855,
"loss": 3.197714996337891,
"num_input_tokens_seen": 2700083200,
"step": 5150,
"train_runtime": 23420.9835,
"train_tokens_per_second": 115284.792
},
{
"epoch": 0.27922833409994857,
"grad_norm": 0.16672180593013763,
"learning_rate": 0.0042947531362800715,
"loss": 3.1988187789916993,
"num_input_tokens_seen": 2705326080,
"step": 5160,
"train_runtime": 23466.1808,
"train_tokens_per_second": 115286.169
},
{
"epoch": 0.27976947428231286,
"grad_norm": 0.19832877814769745,
"learning_rate": 0.00429189238281003,
"loss": 3.1931121826171873,
"num_input_tokens_seen": 2710568960,
"step": 5170,
"train_runtime": 23511.3921,
"train_tokens_per_second": 115287.472
},
{
"epoch": 0.2803106144646771,
"grad_norm": 0.1923927664756775,
"learning_rate": 0.004289026922013475,
"loss": 3.1957611083984374,
"num_input_tokens_seen": 2715811840,
"step": 5180,
"train_runtime": 23556.5915,
"train_tokens_per_second": 115288.829
},
{
"epoch": 0.2808517546470413,
"grad_norm": 0.17779354751110077,
"learning_rate": 0.00428615676263853,
"loss": 3.181416702270508,
"num_input_tokens_seen": 2721054720,
"step": 5190,
"train_runtime": 23601.8043,
"train_tokens_per_second": 115290.115
},
{
"epoch": 0.28139289482940555,
"grad_norm": 0.16895556449890137,
"learning_rate": 0.004283281913447657,
"loss": 3.1839942932128906,
"num_input_tokens_seen": 2726297600,
"step": 5200,
"train_runtime": 23647.0206,
"train_tokens_per_second": 115291.379
},
{
"epoch": 0.2819340350117698,
"grad_norm": 0.17021089792251587,
"learning_rate": 0.004280402383217639,
"loss": 3.193735122680664,
"num_input_tokens_seen": 2731540480,
"step": 5210,
"train_runtime": 23692.2429,
"train_tokens_per_second": 115292.608
},
{
"epoch": 0.28247517519413406,
"grad_norm": 0.16943930089473724,
"learning_rate": 0.00427751818073955,
"loss": 3.1817481994628904,
"num_input_tokens_seen": 2736783360,
"step": 5220,
"train_runtime": 23737.4424,
"train_tokens_per_second": 115293.944
},
{
"epoch": 0.2830163153764983,
"grad_norm": 0.15319055318832397,
"learning_rate": 0.004274629314818728,
"loss": 3.1803112030029297,
"num_input_tokens_seen": 2742026240,
"step": 5230,
"train_runtime": 23782.6783,
"train_tokens_per_second": 115295.099
},
{
"epoch": 0.2835574555588625,
"grad_norm": 0.1702287793159485,
"learning_rate": 0.004271735794274746,
"loss": 3.1876094818115233,
"num_input_tokens_seen": 2747269120,
"step": 5240,
"train_runtime": 23827.881,
"train_tokens_per_second": 115296.409
},
{
"epoch": 0.28409859574122676,
"grad_norm": 0.18369406461715698,
"learning_rate": 0.00426883762794139,
"loss": 3.1819345474243166,
"num_input_tokens_seen": 2752512000,
"step": 5250,
"train_runtime": 23873.0945,
"train_tokens_per_second": 115297.663
},
{
"epoch": 0.284639735923591,
"grad_norm": 0.1792212277650833,
"learning_rate": 0.004265934824666628,
"loss": 3.1884128570556642,
"num_input_tokens_seen": 2757754880,
"step": 5260,
"train_runtime": 23918.3193,
"train_tokens_per_second": 115298.857
},
{
"epoch": 0.28518087610595527,
"grad_norm": 0.17727087438106537,
"learning_rate": 0.0042630273933125865,
"loss": 3.194817543029785,
"num_input_tokens_seen": 2762997760,
"step": 5270,
"train_runtime": 23963.5296,
"train_tokens_per_second": 115300.117
},
{
"epoch": 0.2857220162883195,
"grad_norm": 0.15836812555789948,
"learning_rate": 0.004260115342755518,
"loss": 3.1808521270751955,
"num_input_tokens_seen": 2768240640,
"step": 5280,
"train_runtime": 24008.7153,
"train_tokens_per_second": 115301.49
},
{
"epoch": 0.28626315647068373,
"grad_norm": 0.17416301369667053,
"learning_rate": 0.00425719868188578,
"loss": 3.1919151306152345,
"num_input_tokens_seen": 2773483520,
"step": 5290,
"train_runtime": 24053.9251,
"train_tokens_per_second": 115302.742
},
{
"epoch": 0.28680429665304796,
"grad_norm": 0.16871845722198486,
"learning_rate": 0.004254277419607802,
"loss": 3.182635498046875,
"num_input_tokens_seen": 2778726400,
"step": 5300,
"train_runtime": 24099.1331,
"train_tokens_per_second": 115303.998
},
{
"epoch": 0.2873454368354122,
"grad_norm": 0.1787181943655014,
"learning_rate": 0.004251351564840067,
"loss": 3.18890495300293,
"num_input_tokens_seen": 2783969280,
"step": 5310,
"train_runtime": 24144.3426,
"train_tokens_per_second": 115305.243
},
{
"epoch": 0.2878865770177765,
"grad_norm": 0.1912972331047058,
"learning_rate": 0.00424842112651507,
"loss": 3.1834373474121094,
"num_input_tokens_seen": 2789212160,
"step": 5320,
"train_runtime": 24189.5491,
"train_tokens_per_second": 115306.496
},
{
"epoch": 0.2884277172001407,
"grad_norm": 0.16879980266094208,
"learning_rate": 0.004245486113579308,
"loss": 3.1814502716064452,
"num_input_tokens_seen": 2794455040,
"step": 5330,
"train_runtime": 24234.754,
"train_tokens_per_second": 115307.753
},
{
"epoch": 0.28896885738250494,
"grad_norm": 0.17917132377624512,
"learning_rate": 0.00424254653499324,
"loss": 3.188125228881836,
"num_input_tokens_seen": 2799697920,
"step": 5340,
"train_runtime": 24279.9641,
"train_tokens_per_second": 115308.981
},
{
"epoch": 0.28950999756486917,
"grad_norm": 0.17311090230941772,
"learning_rate": 0.004239602399731263,
"loss": 3.1844112396240236,
"num_input_tokens_seen": 2804940800,
"step": 5350,
"train_runtime": 24328.7709,
"train_tokens_per_second": 115293.157
},
{
"epoch": 0.2900511377472334,
"grad_norm": 0.17229342460632324,
"learning_rate": 0.004236653716781689,
"loss": 3.185770797729492,
"num_input_tokens_seen": 2810183680,
"step": 5360,
"train_runtime": 24373.9674,
"train_tokens_per_second": 115294.471
},
{
"epoch": 0.2905922779295977,
"grad_norm": 0.180856391787529,
"learning_rate": 0.0042337004951467075,
"loss": 3.1889812469482424,
"num_input_tokens_seen": 2815426560,
"step": 5370,
"train_runtime": 24419.1733,
"train_tokens_per_second": 115295.736
},
{
"epoch": 0.2911334181119619,
"grad_norm": 0.16839343309402466,
"learning_rate": 0.004230742743842371,
"loss": 3.1733203887939454,
"num_input_tokens_seen": 2820669440,
"step": 5380,
"train_runtime": 24464.3893,
"train_tokens_per_second": 115296.949
},
{
"epoch": 0.29167455829432615,
"grad_norm": 0.16889749467372894,
"learning_rate": 0.004227780471898559,
"loss": 3.1818462371826173,
"num_input_tokens_seen": 2825912320,
"step": 5390,
"train_runtime": 24509.5858,
"train_tokens_per_second": 115298.249
},
{
"epoch": 0.2922156984766904,
"grad_norm": 0.17744433879852295,
"learning_rate": 0.004224813688358949,
"loss": 3.1864446640014648,
"num_input_tokens_seen": 2831155200,
"step": 5400,
"train_runtime": 24554.7949,
"train_tokens_per_second": 115299.485
},
{
"epoch": 0.2927568386590546,
"grad_norm": 0.1737280935049057,
"learning_rate": 0.004221842402280996,
"loss": 3.180088424682617,
"num_input_tokens_seen": 2836398080,
"step": 5410,
"train_runtime": 24599.9993,
"train_tokens_per_second": 115300.738
},
{
"epoch": 0.2932979788414189,
"grad_norm": 0.16631047427654266,
"learning_rate": 0.004218866622735898,
"loss": 3.175667572021484,
"num_input_tokens_seen": 2841640960,
"step": 5420,
"train_runtime": 24645.2212,
"train_tokens_per_second": 115301.905
},
{
"epoch": 0.2938391190237831,
"grad_norm": 0.17272046208381653,
"learning_rate": 0.004215886358808577,
"loss": 3.185796546936035,
"num_input_tokens_seen": 2846883840,
"step": 5430,
"train_runtime": 24690.432,
"train_tokens_per_second": 115303.12
},
{
"epoch": 0.29438025920614735,
"grad_norm": 0.1690651923418045,
"learning_rate": 0.004212901619597638,
"loss": 3.1886520385742188,
"num_input_tokens_seen": 2852126720,
"step": 5440,
"train_runtime": 24735.6453,
"train_tokens_per_second": 115304.318
},
{
"epoch": 0.2949213993885116,
"grad_norm": 0.19146323204040527,
"learning_rate": 0.0042099124142153535,
"loss": 3.1789478302001952,
"num_input_tokens_seen": 2857369600,
"step": 5450,
"train_runtime": 24780.8456,
"train_tokens_per_second": 115305.573
},
{
"epoch": 0.2954625395708758,
"grad_norm": 0.1788649708032608,
"learning_rate": 0.00420691875178763,
"loss": 3.1887844085693358,
"num_input_tokens_seen": 2862612480,
"step": 5460,
"train_runtime": 24826.0467,
"train_tokens_per_second": 115306.819
},
{
"epoch": 0.2960036797532401,
"grad_norm": 0.19091546535491943,
"learning_rate": 0.004203920641453982,
"loss": 3.175608253479004,
"num_input_tokens_seen": 2867855360,
"step": 5470,
"train_runtime": 24871.2591,
"train_tokens_per_second": 115308.009
},
{
"epoch": 0.29654481993560433,
"grad_norm": 0.16818441450595856,
"learning_rate": 0.004200918092367501,
"loss": 3.1859344482421874,
"num_input_tokens_seen": 2873098240,
"step": 5480,
"train_runtime": 24916.485,
"train_tokens_per_second": 115309.131
},
{
"epoch": 0.29708596011796856,
"grad_norm": 0.1913134902715683,
"learning_rate": 0.0041979111136948325,
"loss": 3.1723804473876953,
"num_input_tokens_seen": 2878341120,
"step": 5490,
"train_runtime": 24961.6704,
"train_tokens_per_second": 115310.437
},
{
"epoch": 0.2976271003003328,
"grad_norm": 0.18261617422103882,
"learning_rate": 0.004194899714616144,
"loss": 3.179214286804199,
"num_input_tokens_seen": 2883584000,
"step": 5500,
"train_runtime": 25006.8704,
"train_tokens_per_second": 115311.67
},
{
"epoch": 0.2976271003003328,
"eval_loss": 3.126129388809204,
"eval_runtime": 1.9962,
"eval_samples_per_second": 250.471,
"eval_steps_per_second": 4.008,
"num_input_tokens_seen": 2883584000,
"step": 5500
},
{
"epoch": 0.298168240482697,
"grad_norm": 0.18416427075862885,
"learning_rate": 0.004191883904325097,
"loss": 3.1846160888671875,
"num_input_tokens_seen": 2888826880,
"step": 5510,
"train_runtime": 25054.1224,
"train_tokens_per_second": 115303.455
},
{
"epoch": 0.2987093806650613,
"grad_norm": 0.16038469970226288,
"learning_rate": 0.004188863692028823,
"loss": 3.180740737915039,
"num_input_tokens_seen": 2894069760,
"step": 5520,
"train_runtime": 25099.3557,
"train_tokens_per_second": 115304.544
},
{
"epoch": 0.29925052084742554,
"grad_norm": 0.16605685651302338,
"learning_rate": 0.004185839086947891,
"loss": 3.1796802520751952,
"num_input_tokens_seen": 2899312640,
"step": 5530,
"train_runtime": 25144.6135,
"train_tokens_per_second": 115305.516
},
{
"epoch": 0.29979166102978977,
"grad_norm": 0.1819118857383728,
"learning_rate": 0.004182810098316281,
"loss": 3.1764299392700197,
"num_input_tokens_seen": 2904555520,
"step": 5540,
"train_runtime": 25189.8702,
"train_tokens_per_second": 115306.49
},
{
"epoch": 0.300332801212154,
"grad_norm": 0.1876569390296936,
"learning_rate": 0.004179776735381355,
"loss": 3.18255500793457,
"num_input_tokens_seen": 2909798400,
"step": 5550,
"train_runtime": 25235.1612,
"train_tokens_per_second": 115307.304
},
{
"epoch": 0.30087394139451823,
"grad_norm": 0.1661430448293686,
"learning_rate": 0.004176739007403832,
"loss": 3.172201156616211,
"num_input_tokens_seen": 2915041280,
"step": 5560,
"train_runtime": 25280.4455,
"train_tokens_per_second": 115308.145
},
{
"epoch": 0.3014150815768825,
"grad_norm": 0.17655618488788605,
"learning_rate": 0.004173696923657755,
"loss": 3.17954158782959,
"num_input_tokens_seen": 2920284160,
"step": 5570,
"train_runtime": 25325.7247,
"train_tokens_per_second": 115309.007
},
{
"epoch": 0.30195622175924675,
"grad_norm": 0.17908194661140442,
"learning_rate": 0.0041706504934304655,
"loss": 3.1723983764648436,
"num_input_tokens_seen": 2925527040,
"step": 5580,
"train_runtime": 25370.9962,
"train_tokens_per_second": 115309.9
},
{
"epoch": 0.302497361941611,
"grad_norm": 0.17515423893928528,
"learning_rate": 0.004167599726022575,
"loss": 3.183238220214844,
"num_input_tokens_seen": 2930769920,
"step": 5590,
"train_runtime": 25416.2839,
"train_tokens_per_second": 115310.717
},
{
"epoch": 0.3030385021239752,
"grad_norm": 0.1749441921710968,
"learning_rate": 0.004164544630747937,
"loss": 3.185963821411133,
"num_input_tokens_seen": 2936012800,
"step": 5600,
"train_runtime": 25461.5455,
"train_tokens_per_second": 115311.649
},
{
"epoch": 0.30357964230633944,
"grad_norm": 0.1578006148338318,
"learning_rate": 0.004161485216933615,
"loss": 3.177383041381836,
"num_input_tokens_seen": 2941255680,
"step": 5610,
"train_runtime": 25506.8309,
"train_tokens_per_second": 115312.47
},
{
"epoch": 0.3041207824887037,
"grad_norm": 0.1903323382139206,
"learning_rate": 0.00415842149391986,
"loss": 3.179554748535156,
"num_input_tokens_seen": 2946498560,
"step": 5620,
"train_runtime": 25552.099,
"train_tokens_per_second": 115313.367
},
{
"epoch": 0.30466192267106795,
"grad_norm": 0.16383005678653717,
"learning_rate": 0.004155353471060077,
"loss": 3.160336494445801,
"num_input_tokens_seen": 2951741440,
"step": 5630,
"train_runtime": 25597.3865,
"train_tokens_per_second": 115314.172
},
{
"epoch": 0.3052030628534322,
"grad_norm": 0.1735740303993225,
"learning_rate": 0.004152281157720798,
"loss": 3.172795867919922,
"num_input_tokens_seen": 2956984320,
"step": 5640,
"train_runtime": 25642.6481,
"train_tokens_per_second": 115315.092
},
{
"epoch": 0.3057442030357964,
"grad_norm": 0.19910795986652374,
"learning_rate": 0.004149204563281657,
"loss": 3.1711971282958986,
"num_input_tokens_seen": 2962227200,
"step": 5650,
"train_runtime": 25687.9012,
"train_tokens_per_second": 115316.046
},
{
"epoch": 0.30628534321816064,
"grad_norm": 0.18566472828388214,
"learning_rate": 0.004146123697135352,
"loss": 3.177423095703125,
"num_input_tokens_seen": 2967470080,
"step": 5660,
"train_runtime": 25733.1722,
"train_tokens_per_second": 115316.917
},
{
"epoch": 0.30682648340052493,
"grad_norm": 0.16724054515361786,
"learning_rate": 0.004143038568687626,
"loss": 3.174397277832031,
"num_input_tokens_seen": 2972712960,
"step": 5670,
"train_runtime": 25778.4366,
"train_tokens_per_second": 115317.814
},
{
"epoch": 0.30736762358288916,
"grad_norm": 0.18052591383457184,
"learning_rate": 0.004139949187357236,
"loss": 3.172323226928711,
"num_input_tokens_seen": 2977955840,
"step": 5680,
"train_runtime": 25823.6944,
"train_tokens_per_second": 115318.738
},
{
"epoch": 0.3079087637652534,
"grad_norm": 0.1707129180431366,
"learning_rate": 0.004136855562575921,
"loss": 3.1627834320068358,
"num_input_tokens_seen": 2983198720,
"step": 5690,
"train_runtime": 25868.9566,
"train_tokens_per_second": 115319.638
},
{
"epoch": 0.3084499039476176,
"grad_norm": 0.18003937602043152,
"learning_rate": 0.004133757703788374,
"loss": 3.175765609741211,
"num_input_tokens_seen": 2988441600,
"step": 5700,
"train_runtime": 25914.2132,
"train_tokens_per_second": 115320.561
},
{
"epoch": 0.30899104412998185,
"grad_norm": 0.17585334181785583,
"learning_rate": 0.004130655620452215,
"loss": 3.1611761093139648,
"num_input_tokens_seen": 2993684480,
"step": 5710,
"train_runtime": 25959.4637,
"train_tokens_per_second": 115321.507
},
{
"epoch": 0.30953218431234614,
"grad_norm": 0.17584700882434845,
"learning_rate": 0.004127549322037963,
"loss": 3.1710134506225587,
"num_input_tokens_seen": 2998927360,
"step": 5720,
"train_runtime": 26004.7204,
"train_tokens_per_second": 115322.423
},
{
"epoch": 0.31007332449471037,
"grad_norm": 0.1671862006187439,
"learning_rate": 0.004124438818029003,
"loss": 3.171963691711426,
"num_input_tokens_seen": 3004170240,
"step": 5730,
"train_runtime": 26053.4016,
"train_tokens_per_second": 115308.177
},
{
"epoch": 0.3106144646770746,
"grad_norm": 0.17120610177516937,
"learning_rate": 0.004121324117921561,
"loss": 3.171039581298828,
"num_input_tokens_seen": 3009413120,
"step": 5740,
"train_runtime": 26098.5909,
"train_tokens_per_second": 115309.41
},
{
"epoch": 0.31115560485943883,
"grad_norm": 0.17267778515815735,
"learning_rate": 0.004118205231224675,
"loss": 3.1711191177368163,
"num_input_tokens_seen": 3014656000,
"step": 5750,
"train_runtime": 26143.7653,
"train_tokens_per_second": 115310.705
},
{
"epoch": 0.31169674504180306,
"grad_norm": 0.17473942041397095,
"learning_rate": 0.004115082167460159,
"loss": 3.1646095275878907,
"num_input_tokens_seen": 3019898880,
"step": 5760,
"train_runtime": 26188.9631,
"train_tokens_per_second": 115311.892
},
{
"epoch": 0.31223788522416734,
"grad_norm": 0.17137861251831055,
"learning_rate": 0.004111954936162586,
"loss": 3.1746740341186523,
"num_input_tokens_seen": 3025141760,
"step": 5770,
"train_runtime": 26234.3115,
"train_tokens_per_second": 115312.413
},
{
"epoch": 0.3127790254065316,
"grad_norm": 0.16885042190551758,
"learning_rate": 0.004108823546879249,
"loss": 3.162841033935547,
"num_input_tokens_seen": 3030384640,
"step": 5780,
"train_runtime": 26279.5704,
"train_tokens_per_second": 115313.325
},
{
"epoch": 0.3133201655888958,
"grad_norm": 0.15897022187709808,
"learning_rate": 0.004105688009170134,
"loss": 3.1719465255737305,
"num_input_tokens_seen": 3035627520,
"step": 5790,
"train_runtime": 26324.8012,
"train_tokens_per_second": 115314.357
},
{
"epoch": 0.31386130577126004,
"grad_norm": 0.1866680085659027,
"learning_rate": 0.004102548332607894,
"loss": 3.1683422088623048,
"num_input_tokens_seen": 3040870400,
"step": 5800,
"train_runtime": 26370.0195,
"train_tokens_per_second": 115315.44
},
{
"epoch": 0.31440244595362427,
"grad_norm": 0.18191276490688324,
"learning_rate": 0.004099404526777816,
"loss": 3.1652973175048826,
"num_input_tokens_seen": 3046113280,
"step": 5810,
"train_runtime": 26415.2343,
"train_tokens_per_second": 115316.535
},
{
"epoch": 0.31494358613598855,
"grad_norm": 0.16286683082580566,
"learning_rate": 0.004096256601277797,
"loss": 3.1653570175170898,
"num_input_tokens_seen": 3051356160,
"step": 5820,
"train_runtime": 26460.4377,
"train_tokens_per_second": 115317.675
},
{
"epoch": 0.3154847263183528,
"grad_norm": 0.15786544978618622,
"learning_rate": 0.004093104565718307,
"loss": 3.171334457397461,
"num_input_tokens_seen": 3056599040,
"step": 5830,
"train_runtime": 26505.6409,
"train_tokens_per_second": 115318.813
},
{
"epoch": 0.316025866500717,
"grad_norm": 0.16940993070602417,
"learning_rate": 0.0040899484297223666,
"loss": 3.16903076171875,
"num_input_tokens_seen": 3061841920,
"step": 5840,
"train_runtime": 26550.8652,
"train_tokens_per_second": 115319.855
},
{
"epoch": 0.31656700668308124,
"grad_norm": 0.1778353452682495,
"learning_rate": 0.004086788202925512,
"loss": 3.163807678222656,
"num_input_tokens_seen": 3067084800,
"step": 5850,
"train_runtime": 26596.0801,
"train_tokens_per_second": 115320.934
},
{
"epoch": 0.3171081468654455,
"grad_norm": 0.18578499555587769,
"learning_rate": 0.004083623894975773,
"loss": 3.1687942504882813,
"num_input_tokens_seen": 3072327680,
"step": 5860,
"train_runtime": 26641.289,
"train_tokens_per_second": 115322.036
},
{
"epoch": 0.31764928704780976,
"grad_norm": 0.17534473538398743,
"learning_rate": 0.004080455515533633,
"loss": 3.1645458221435545,
"num_input_tokens_seen": 3077570560,
"step": 5870,
"train_runtime": 26686.5065,
"train_tokens_per_second": 115323.096
},
{
"epoch": 0.318190427230174,
"grad_norm": 0.16227850317955017,
"learning_rate": 0.004077283074272012,
"loss": 3.1695529937744142,
"num_input_tokens_seen": 3082813440,
"step": 5880,
"train_runtime": 26731.6901,
"train_tokens_per_second": 115324.3
},
{
"epoch": 0.3187315674125382,
"grad_norm": 0.17972981929779053,
"learning_rate": 0.004074106580876226,
"loss": 3.164577102661133,
"num_input_tokens_seen": 3088056320,
"step": 5890,
"train_runtime": 26776.8465,
"train_tokens_per_second": 115325.616
},
{
"epoch": 0.31927270759490245,
"grad_norm": 0.17186778783798218,
"learning_rate": 0.0040709260450439615,
"loss": 3.168431854248047,
"num_input_tokens_seen": 3093299200,
"step": 5900,
"train_runtime": 26822.0301,
"train_tokens_per_second": 115326.811
},
{
"epoch": 0.3198138477772667,
"grad_norm": 0.16803112626075745,
"learning_rate": 0.0040677414764852485,
"loss": 3.1673011779785156,
"num_input_tokens_seen": 3098542080,
"step": 5910,
"train_runtime": 26867.197,
"train_tokens_per_second": 115328.074
},
{
"epoch": 0.32035498795963097,
"grad_norm": 0.16622225940227509,
"learning_rate": 0.00406455288492243,
"loss": 3.156739616394043,
"num_input_tokens_seen": 3103784960,
"step": 5920,
"train_runtime": 26912.3879,
"train_tokens_per_second": 115329.229
},
{
"epoch": 0.3208961281419952,
"grad_norm": 0.18976053595542908,
"learning_rate": 0.004061360280090129,
"loss": 3.166844940185547,
"num_input_tokens_seen": 3109027840,
"step": 5930,
"train_runtime": 26957.5834,
"train_tokens_per_second": 115330.361
},
{
"epoch": 0.3214372683243594,
"grad_norm": 0.16867531836032867,
"learning_rate": 0.00405816367173522,
"loss": 3.1626731872558596,
"num_input_tokens_seen": 3114270720,
"step": 5940,
"train_runtime": 27002.7901,
"train_tokens_per_second": 115331.442
},
{
"epoch": 0.32197840850672366,
"grad_norm": 0.20071354508399963,
"learning_rate": 0.004054963069616803,
"loss": 3.169915199279785,
"num_input_tokens_seen": 3119513600,
"step": 5950,
"train_runtime": 27047.9883,
"train_tokens_per_second": 115332.555
},
{
"epoch": 0.3225195486890879,
"grad_norm": 0.16498495638370514,
"learning_rate": 0.0040517584835061664,
"loss": 3.1712413787841798,
"num_input_tokens_seen": 3124756480,
"step": 5960,
"train_runtime": 27093.2042,
"train_tokens_per_second": 115333.589
},
{
"epoch": 0.3230606888714522,
"grad_norm": 0.17592206597328186,
"learning_rate": 0.004048549923186767,
"loss": 3.1624687194824217,
"num_input_tokens_seen": 3129999360,
"step": 5970,
"train_runtime": 27138.4223,
"train_tokens_per_second": 115334.61
},
{
"epoch": 0.3236018290538164,
"grad_norm": 0.15470415353775024,
"learning_rate": 0.00404533739845419,
"loss": 3.155242347717285,
"num_input_tokens_seen": 3135242240,
"step": 5980,
"train_runtime": 27183.6528,
"train_tokens_per_second": 115335.575
},
{
"epoch": 0.32414296923618063,
"grad_norm": 0.16501109302043915,
"learning_rate": 0.004042120919116126,
"loss": 3.1598865509033205,
"num_input_tokens_seen": 3140485120,
"step": 5990,
"train_runtime": 27228.8867,
"train_tokens_per_second": 115336.523
},
{
"epoch": 0.32468410941854486,
"grad_norm": 0.16781945526599884,
"learning_rate": 0.004038900494992339,
"loss": 3.157525634765625,
"num_input_tokens_seen": 3145728000,
"step": 6000,
"train_runtime": 27274.108,
"train_tokens_per_second": 115337.521
},
{
"epoch": 0.32468410941854486,
"eval_loss": 3.111185073852539,
"eval_runtime": 1.9872,
"eval_samples_per_second": 251.614,
"eval_steps_per_second": 4.026,
"num_input_tokens_seen": 3145728000,
"step": 6000
},
{
"epoch": 0.3252252496009091,
"grad_norm": 0.18414868414402008,
"learning_rate": 0.004035676135914636,
"loss": 3.170181655883789,
"num_input_tokens_seen": 3150970880,
"step": 6010,
"train_runtime": 27323.9049,
"train_tokens_per_second": 115319.201
},
{
"epoch": 0.3257663897832734,
"grad_norm": 0.1616990864276886,
"learning_rate": 0.004032447851726835,
"loss": 3.1585414886474608,
"num_input_tokens_seen": 3156213760,
"step": 6020,
"train_runtime": 27369.1149,
"train_tokens_per_second": 115320.272
},
{
"epoch": 0.3263075299656376,
"grad_norm": 0.16582000255584717,
"learning_rate": 0.004029215652284741,
"loss": 3.1622276306152344,
"num_input_tokens_seen": 3161456640,
"step": 6030,
"train_runtime": 27414.3296,
"train_tokens_per_second": 115321.319
},
{
"epoch": 0.32684867014800184,
"grad_norm": 0.17380478978157043,
"learning_rate": 0.00402597954745611,
"loss": 3.1608341217041014,
"num_input_tokens_seen": 3166699520,
"step": 6040,
"train_runtime": 27459.5638,
"train_tokens_per_second": 115322.281
},
{
"epoch": 0.32738981033036607,
"grad_norm": 0.18764927983283997,
"learning_rate": 0.00402273954712062,
"loss": 3.1758914947509767,
"num_input_tokens_seen": 3171942400,
"step": 6050,
"train_runtime": 27504.7658,
"train_tokens_per_second": 115323.374
},
{
"epoch": 0.3279309505127303,
"grad_norm": 0.1659294068813324,
"learning_rate": 0.004019495661169844,
"loss": 3.1681026458740233,
"num_input_tokens_seen": 3177185280,
"step": 6060,
"train_runtime": 27549.978,
"train_tokens_per_second": 115324.422
},
{
"epoch": 0.3284720906950946,
"grad_norm": 0.15407103300094604,
"learning_rate": 0.004016247899507217,
"loss": 3.1617177963256835,
"num_input_tokens_seen": 3182428160,
"step": 6070,
"train_runtime": 27595.2039,
"train_tokens_per_second": 115325.409
},
{
"epoch": 0.3290132308774588,
"grad_norm": 0.17896398901939392,
"learning_rate": 0.004012996272048004,
"loss": 3.163351631164551,
"num_input_tokens_seen": 3187671040,
"step": 6080,
"train_runtime": 27640.4032,
"train_tokens_per_second": 115326.503
},
{
"epoch": 0.32955437105982305,
"grad_norm": 0.17344997823238373,
"learning_rate": 0.004009740788719276,
"loss": 3.153501510620117,
"num_input_tokens_seen": 3192913920,
"step": 6090,
"train_runtime": 27685.6168,
"train_tokens_per_second": 115327.534
},
{
"epoch": 0.3300955112421873,
"grad_norm": 0.16279980540275574,
"learning_rate": 0.004006481459459872,
"loss": 3.160162162780762,
"num_input_tokens_seen": 3198156800,
"step": 6100,
"train_runtime": 27730.837,
"train_tokens_per_second": 115328.535
},
{
"epoch": 0.3306366514245515,
"grad_norm": 0.16896025836467743,
"learning_rate": 0.0040032182942203775,
"loss": 3.158255767822266,
"num_input_tokens_seen": 3203399680,
"step": 6110,
"train_runtime": 27779.614,
"train_tokens_per_second": 115314.766
},
{
"epoch": 0.3311777916069158,
"grad_norm": 0.18168415129184723,
"learning_rate": 0.003999951302963083,
"loss": 3.156180000305176,
"num_input_tokens_seen": 3208642560,
"step": 6120,
"train_runtime": 27824.7398,
"train_tokens_per_second": 115316.175
},
{
"epoch": 0.33171893178928,
"grad_norm": 0.17479564249515533,
"learning_rate": 0.003996680495661963,
"loss": 3.155413818359375,
"num_input_tokens_seen": 3213885440,
"step": 6130,
"train_runtime": 27869.8597,
"train_tokens_per_second": 115317.604
},
{
"epoch": 0.33226007197164426,
"grad_norm": 0.16649708151817322,
"learning_rate": 0.003993405882302642,
"loss": 3.162016677856445,
"num_input_tokens_seen": 3219128320,
"step": 6140,
"train_runtime": 27914.9889,
"train_tokens_per_second": 115318.99
},
{
"epoch": 0.3328012121540085,
"grad_norm": 0.17866738140583038,
"learning_rate": 0.003990127472882364,
"loss": 3.1546072006225585,
"num_input_tokens_seen": 3224371200,
"step": 6150,
"train_runtime": 27960.113,
"train_tokens_per_second": 115320.392
},
{
"epoch": 0.3333423523363727,
"grad_norm": 0.15289874374866486,
"learning_rate": 0.0039868452774099615,
"loss": 3.1471332550048827,
"num_input_tokens_seen": 3229614080,
"step": 6160,
"train_runtime": 28005.2392,
"train_tokens_per_second": 115321.782
},
{
"epoch": 0.333883492518737,
"grad_norm": 0.16488930583000183,
"learning_rate": 0.003983559305905828,
"loss": 3.1540958404541017,
"num_input_tokens_seen": 3234856960,
"step": 6170,
"train_runtime": 28050.3655,
"train_tokens_per_second": 115323.166
},
{
"epoch": 0.33442463270110123,
"grad_norm": 0.17347821593284607,
"learning_rate": 0.003980269568401881,
"loss": 3.153203010559082,
"num_input_tokens_seen": 3240099840,
"step": 6180,
"train_runtime": 28095.5018,
"train_tokens_per_second": 115324.505
},
{
"epoch": 0.33496577288346546,
"grad_norm": 0.16901230812072754,
"learning_rate": 0.00397697607494154,
"loss": 3.153574752807617,
"num_input_tokens_seen": 3245342720,
"step": 6190,
"train_runtime": 28140.6287,
"train_tokens_per_second": 115325.878
},
{
"epoch": 0.3355069130658297,
"grad_norm": 0.1725231409072876,
"learning_rate": 0.0039736788355796875,
"loss": 3.1607025146484373,
"num_input_tokens_seen": 3250585600,
"step": 6200,
"train_runtime": 28185.7568,
"train_tokens_per_second": 115327.242
},
{
"epoch": 0.3360480532481939,
"grad_norm": 0.17325183749198914,
"learning_rate": 0.003970377860382644,
"loss": 3.147405242919922,
"num_input_tokens_seen": 3255828480,
"step": 6210,
"train_runtime": 28230.8843,
"train_tokens_per_second": 115328.604
},
{
"epoch": 0.3365891934305582,
"grad_norm": 0.1715451180934906,
"learning_rate": 0.003967073159428135,
"loss": 3.150386428833008,
"num_input_tokens_seen": 3261071360,
"step": 6220,
"train_runtime": 28276.018,
"train_tokens_per_second": 115329.936
},
{
"epoch": 0.33713033361292244,
"grad_norm": 0.16657474637031555,
"learning_rate": 0.003963764742805262,
"loss": 3.1559564590454103,
"num_input_tokens_seen": 3266314240,
"step": 6230,
"train_runtime": 28321.1527,
"train_tokens_per_second": 115331.261
},
{
"epoch": 0.33767147379528667,
"grad_norm": 0.17756827175617218,
"learning_rate": 0.003960452620614465,
"loss": 3.1532052993774413,
"num_input_tokens_seen": 3271557120,
"step": 6240,
"train_runtime": 28366.2774,
"train_tokens_per_second": 115332.621
},
{
"epoch": 0.3382126139776509,
"grad_norm": 0.16704502701759338,
"learning_rate": 0.003957136802967503,
"loss": 3.145302581787109,
"num_input_tokens_seen": 3276800000,
"step": 6250,
"train_runtime": 28411.4119,
"train_tokens_per_second": 115333.937
},
{
"epoch": 0.33875375416001513,
"grad_norm": 0.16609609127044678,
"learning_rate": 0.003953817299987416,
"loss": 3.157614898681641,
"num_input_tokens_seen": 3282042880,
"step": 6260,
"train_runtime": 28456.5404,
"train_tokens_per_second": 115335.274
},
{
"epoch": 0.3392948943423794,
"grad_norm": 0.17776867747306824,
"learning_rate": 0.003950494121808493,
"loss": 3.1511157989501952,
"num_input_tokens_seen": 3287285760,
"step": 6270,
"train_runtime": 28501.6688,
"train_tokens_per_second": 115336.607
},
{
"epoch": 0.33983603452474365,
"grad_norm": 0.16619160771369934,
"learning_rate": 0.003947167278576242,
"loss": 3.1576236724853515,
"num_input_tokens_seen": 3292528640,
"step": 6280,
"train_runtime": 28546.8015,
"train_tokens_per_second": 115337.917
},
{
"epoch": 0.3403771747071079,
"grad_norm": 0.17923958599567413,
"learning_rate": 0.003943836780447365,
"loss": 3.1528648376464843,
"num_input_tokens_seen": 3297771520,
"step": 6290,
"train_runtime": 28591.9231,
"train_tokens_per_second": 115339.269
},
{
"epoch": 0.3409183148894721,
"grad_norm": 0.16474676132202148,
"learning_rate": 0.003940502637589718,
"loss": 3.1509103775024414,
"num_input_tokens_seen": 3303014400,
"step": 6300,
"train_runtime": 28637.0641,
"train_tokens_per_second": 115340.539
},
{
"epoch": 0.34145945507183634,
"grad_norm": 0.1639336794614792,
"learning_rate": 0.0039371648601822865,
"loss": 3.155986785888672,
"num_input_tokens_seen": 3308257280,
"step": 6310,
"train_runtime": 28682.1884,
"train_tokens_per_second": 115341.871
},
{
"epoch": 0.3420005952542006,
"grad_norm": 0.17124713957309723,
"learning_rate": 0.003933823458415151,
"loss": 3.147997283935547,
"num_input_tokens_seen": 3313500160,
"step": 6320,
"train_runtime": 28727.3095,
"train_tokens_per_second": 115343.212
},
{
"epoch": 0.34254173543656485,
"grad_norm": 0.17230060696601868,
"learning_rate": 0.003930478442489458,
"loss": 3.1527957916259766,
"num_input_tokens_seen": 3318743040,
"step": 6330,
"train_runtime": 28772.44,
"train_tokens_per_second": 115344.512
},
{
"epoch": 0.3430828756189291,
"grad_norm": 0.1681806594133377,
"learning_rate": 0.003927129822617386,
"loss": 3.1512054443359374,
"num_input_tokens_seen": 3323985920,
"step": 6340,
"train_runtime": 28817.6293,
"train_tokens_per_second": 115345.572
},
{
"epoch": 0.3436240158012933,
"grad_norm": 0.17435091733932495,
"learning_rate": 0.003923777609022119,
"loss": 3.153603744506836,
"num_input_tokens_seen": 3329228800,
"step": 6350,
"train_runtime": 28862.8169,
"train_tokens_per_second": 115346.635
},
{
"epoch": 0.34416515598365754,
"grad_norm": 0.1703469306230545,
"learning_rate": 0.00392042181193781,
"loss": 3.142818069458008,
"num_input_tokens_seen": 3334471680,
"step": 6360,
"train_runtime": 28907.9957,
"train_tokens_per_second": 115347.73
},
{
"epoch": 0.34470629616602183,
"grad_norm": 0.1682499647140503,
"learning_rate": 0.0039170624416095525,
"loss": 3.1417423248291017,
"num_input_tokens_seen": 3339714560,
"step": 6370,
"train_runtime": 28953.1644,
"train_tokens_per_second": 115348.862
},
{
"epoch": 0.34524743634838606,
"grad_norm": 0.16802842915058136,
"learning_rate": 0.0039136995082933515,
"loss": 3.1456912994384765,
"num_input_tokens_seen": 3344957440,
"step": 6380,
"train_runtime": 28998.3264,
"train_tokens_per_second": 115350.017
},
{
"epoch": 0.3457885765307503,
"grad_norm": 0.1582358479499817,
"learning_rate": 0.003910333022256086,
"loss": 3.1438793182373046,
"num_input_tokens_seen": 3350200320,
"step": 6390,
"train_runtime": 29043.4985,
"train_tokens_per_second": 115351.128
},
{
"epoch": 0.3463297167131145,
"grad_norm": 0.16883233189582825,
"learning_rate": 0.003906962993775483,
"loss": 3.1468482971191407,
"num_input_tokens_seen": 3355443200,
"step": 6400,
"train_runtime": 29088.66,
"train_tokens_per_second": 115352.278
},
{
"epoch": 0.34687085689547875,
"grad_norm": 0.18867318332195282,
"learning_rate": 0.0039035894331400853,
"loss": 3.147420883178711,
"num_input_tokens_seen": 3360686080,
"step": 6410,
"train_runtime": 29133.8253,
"train_tokens_per_second": 115353.409
},
{
"epoch": 0.34741199707784304,
"grad_norm": 0.16323506832122803,
"learning_rate": 0.0039002123506492177,
"loss": 3.145482063293457,
"num_input_tokens_seen": 3365928960,
"step": 6420,
"train_runtime": 29179.0336,
"train_tokens_per_second": 115354.367
},
{
"epoch": 0.34795313726020727,
"grad_norm": 0.1756802797317505,
"learning_rate": 0.003896831756612958,
"loss": 3.1475906372070312,
"num_input_tokens_seen": 3371171840,
"step": 6430,
"train_runtime": 29224.2308,
"train_tokens_per_second": 115355.366
},
{
"epoch": 0.3484942774425715,
"grad_norm": 0.17158783972263336,
"learning_rate": 0.0038934476613521037,
"loss": 3.142435073852539,
"num_input_tokens_seen": 3376414720,
"step": 6440,
"train_runtime": 29269.4011,
"train_tokens_per_second": 115356.467
},
{
"epoch": 0.34903541762493573,
"grad_norm": 0.16574952006340027,
"learning_rate": 0.0038900600751981436,
"loss": 3.1459327697753907,
"num_input_tokens_seen": 3381657600,
"step": 6450,
"train_runtime": 29314.5687,
"train_tokens_per_second": 115357.577
},
{
"epoch": 0.34957655780729996,
"grad_norm": 0.16016115248203278,
"learning_rate": 0.0038866690084932206,
"loss": 3.1540714263916017,
"num_input_tokens_seen": 3386900480,
"step": 6460,
"train_runtime": 29359.7508,
"train_tokens_per_second": 115358.625
},
{
"epoch": 0.35011769798966424,
"grad_norm": 0.1590614914894104,
"learning_rate": 0.0038832744715901063,
"loss": 3.138327789306641,
"num_input_tokens_seen": 3392143360,
"step": 6470,
"train_runtime": 29404.9917,
"train_tokens_per_second": 115359.439
},
{
"epoch": 0.3506588381720285,
"grad_norm": 0.1668478101491928,
"learning_rate": 0.003879876474852164,
"loss": 3.1390443801879884,
"num_input_tokens_seen": 3397386240,
"step": 6480,
"train_runtime": 29450.2102,
"train_tokens_per_second": 115360.339
},
{
"epoch": 0.3511999783543927,
"grad_norm": 0.16614961624145508,
"learning_rate": 0.0038764750286533244,
"loss": 3.1493562698364257,
"num_input_tokens_seen": 3402629120,
"step": 6490,
"train_runtime": 29498.9151,
"train_tokens_per_second": 115347.602
},
{
"epoch": 0.35174111853675694,
"grad_norm": 0.1770559698343277,
"learning_rate": 0.003873070143378044,
"loss": 3.1434371948242186,
"num_input_tokens_seen": 3407872000,
"step": 6500,
"train_runtime": 29544.0364,
"train_tokens_per_second": 115348.896
},
{
"epoch": 0.35174111853675694,
"eval_loss": 3.0966169834136963,
"eval_runtime": 1.9851,
"eval_samples_per_second": 251.881,
"eval_steps_per_second": 4.03,
"num_input_tokens_seen": 3407872000,
"step": 6500
},
{
"epoch": 0.35228225871912117,
"grad_norm": 0.1724107414484024,
"learning_rate": 0.0038696618294212816,
"loss": 3.1477359771728515,
"num_input_tokens_seen": 3413114880,
"step": 6510,
"train_runtime": 29591.1684,
"train_tokens_per_second": 115342.349
},
{
"epoch": 0.35282339890148545,
"grad_norm": 0.17597156763076782,
"learning_rate": 0.0038662500971884633,
"loss": 3.1492542266845702,
"num_input_tokens_seen": 3418357760,
"step": 6520,
"train_runtime": 29636.3254,
"train_tokens_per_second": 115343.509
},
{
"epoch": 0.3533645390838497,
"grad_norm": 0.1612919569015503,
"learning_rate": 0.0038628349570954497,
"loss": 3.1426467895507812,
"num_input_tokens_seen": 3423600640,
"step": 6530,
"train_runtime": 29681.4655,
"train_tokens_per_second": 115344.73
},
{
"epoch": 0.3539056792662139,
"grad_norm": 0.16101430356502533,
"learning_rate": 0.0038594164195685076,
"loss": 3.137646484375,
"num_input_tokens_seen": 3428843520,
"step": 6540,
"train_runtime": 29726.6035,
"train_tokens_per_second": 115345.957
},
{
"epoch": 0.35444681944857814,
"grad_norm": 0.17293353378772736,
"learning_rate": 0.003855994495044273,
"loss": 3.1470672607421877,
"num_input_tokens_seen": 3434086400,
"step": 6550,
"train_runtime": 29771.7425,
"train_tokens_per_second": 115347.175
},
{
"epoch": 0.3549879596309424,
"grad_norm": 0.18171222507953644,
"learning_rate": 0.0038525691939697267,
"loss": 3.1423971176147463,
"num_input_tokens_seen": 3439329280,
"step": 6560,
"train_runtime": 29816.873,
"train_tokens_per_second": 115348.423
},
{
"epoch": 0.35552909981330666,
"grad_norm": 0.17078061401844025,
"learning_rate": 0.0038491405268021523,
"loss": 3.1396827697753906,
"num_input_tokens_seen": 3444572160,
"step": 6570,
"train_runtime": 29862.0878,
"train_tokens_per_second": 115349.341
},
{
"epoch": 0.3560702399956709,
"grad_norm": 0.17867809534072876,
"learning_rate": 0.0038457085040091155,
"loss": 3.1499147415161133,
"num_input_tokens_seen": 3449815040,
"step": 6580,
"train_runtime": 29907.2427,
"train_tokens_per_second": 115350.488
},
{
"epoch": 0.3566113801780351,
"grad_norm": 0.15178236365318298,
"learning_rate": 0.003842273136068423,
"loss": 3.13470344543457,
"num_input_tokens_seen": 3455057920,
"step": 6590,
"train_runtime": 29952.42,
"train_tokens_per_second": 115351.545
},
{
"epoch": 0.35715252036039935,
"grad_norm": 0.17382913827896118,
"learning_rate": 0.0038388344334680936,
"loss": 3.1436153411865235,
"num_input_tokens_seen": 3460300800,
"step": 6600,
"train_runtime": 29997.6461,
"train_tokens_per_second": 115352.411
},
{
"epoch": 0.3576936605427636,
"grad_norm": 0.17544035613536835,
"learning_rate": 0.0038353924067063313,
"loss": 3.1381744384765624,
"num_input_tokens_seen": 3465543680,
"step": 6610,
"train_runtime": 30042.8233,
"train_tokens_per_second": 115353.462
},
{
"epoch": 0.35823480072512787,
"grad_norm": 0.15095841884613037,
"learning_rate": 0.003831947066291482,
"loss": 3.1344669342041014,
"num_input_tokens_seen": 3470786560,
"step": 6620,
"train_runtime": 30088.0009,
"train_tokens_per_second": 115354.509
},
{
"epoch": 0.3587759409074921,
"grad_norm": 0.16399560868740082,
"learning_rate": 0.0038284984227420146,
"loss": 3.134235382080078,
"num_input_tokens_seen": 3476029440,
"step": 6630,
"train_runtime": 30133.1894,
"train_tokens_per_second": 115355.51
},
{
"epoch": 0.3593170810898563,
"grad_norm": 0.18398840725421906,
"learning_rate": 0.003825046486586477,
"loss": 3.131580924987793,
"num_input_tokens_seen": 3481272320,
"step": 6640,
"train_runtime": 30178.3732,
"train_tokens_per_second": 115356.527
},
{
"epoch": 0.35985822127222056,
"grad_norm": 0.16813096404075623,
"learning_rate": 0.0038215912683634726,
"loss": 3.1448497772216797,
"num_input_tokens_seen": 3486515200,
"step": 6650,
"train_runtime": 30223.5423,
"train_tokens_per_second": 115357.596
},
{
"epoch": 0.3603993614545848,
"grad_norm": 0.1649860441684723,
"learning_rate": 0.003818132778621623,
"loss": 3.14077091217041,
"num_input_tokens_seen": 3491758080,
"step": 6660,
"train_runtime": 30268.7194,
"train_tokens_per_second": 115358.633
},
{
"epoch": 0.3609405016369491,
"grad_norm": 0.17575252056121826,
"learning_rate": 0.0038146710279195386,
"loss": 3.1330080032348633,
"num_input_tokens_seen": 3497000960,
"step": 6670,
"train_runtime": 30313.9788,
"train_tokens_per_second": 115359.352
},
{
"epoch": 0.3614816418193133,
"grad_norm": 0.1742008924484253,
"learning_rate": 0.003811206026825786,
"loss": 3.155079460144043,
"num_input_tokens_seen": 3502243840,
"step": 6680,
"train_runtime": 30359.1553,
"train_tokens_per_second": 115360.385
},
{
"epoch": 0.36202278200167753,
"grad_norm": 0.1799112856388092,
"learning_rate": 0.0038077377859188524,
"loss": 3.1288970947265624,
"num_input_tokens_seen": 3507486720,
"step": 6690,
"train_runtime": 30404.3262,
"train_tokens_per_second": 115361.436
},
{
"epoch": 0.36256392218404176,
"grad_norm": 0.16728277504444122,
"learning_rate": 0.003804266315787119,
"loss": 3.137259864807129,
"num_input_tokens_seen": 3512729600,
"step": 6700,
"train_runtime": 30449.5017,
"train_tokens_per_second": 115362.466
},
{
"epoch": 0.363105062366406,
"grad_norm": 0.1766940951347351,
"learning_rate": 0.0038007916270288234,
"loss": 3.1414379119873046,
"num_input_tokens_seen": 3517972480,
"step": 6710,
"train_runtime": 30494.6728,
"train_tokens_per_second": 115363.51
},
{
"epoch": 0.3636462025487703,
"grad_norm": 0.17950496077537537,
"learning_rate": 0.0037973137302520312,
"loss": 3.141128730773926,
"num_input_tokens_seen": 3523215360,
"step": 6720,
"train_runtime": 30539.8417,
"train_tokens_per_second": 115364.559
},
{
"epoch": 0.3641873427311345,
"grad_norm": 0.17668098211288452,
"learning_rate": 0.003793832636074601,
"loss": 3.1354911804199217,
"num_input_tokens_seen": 3528458240,
"step": 6730,
"train_runtime": 30585.0013,
"train_tokens_per_second": 115365.64
},
{
"epoch": 0.36472848291349874,
"grad_norm": 0.17323218286037445,
"learning_rate": 0.0037903483551241534,
"loss": 3.1416683197021484,
"num_input_tokens_seen": 3533701120,
"step": 6740,
"train_runtime": 30630.1549,
"train_tokens_per_second": 115366.74
},
{
"epoch": 0.36526962309586297,
"grad_norm": 0.1715293824672699,
"learning_rate": 0.003786860898038038,
"loss": 3.133253288269043,
"num_input_tokens_seen": 3538944000,
"step": 6750,
"train_runtime": 30675.3114,
"train_tokens_per_second": 115367.826
},
{
"epoch": 0.3658107632782272,
"grad_norm": 0.16131816804409027,
"learning_rate": 0.0037833702754633005,
"loss": 3.137991714477539,
"num_input_tokens_seen": 3544186880,
"step": 6760,
"train_runtime": 30720.4583,
"train_tokens_per_second": 115368.945
},
{
"epoch": 0.3663519034605915,
"grad_norm": 0.16405366361141205,
"learning_rate": 0.003779876498056652,
"loss": 3.149972152709961,
"num_input_tokens_seen": 3549429760,
"step": 6770,
"train_runtime": 30765.5763,
"train_tokens_per_second": 115370.17
},
{
"epoch": 0.3668930436429557,
"grad_norm": 0.1677146553993225,
"learning_rate": 0.0037763795764844317,
"loss": 3.1432748794555665,
"num_input_tokens_seen": 3554672640,
"step": 6780,
"train_runtime": 30810.7138,
"train_tokens_per_second": 115371.317
},
{
"epoch": 0.36743418382531995,
"grad_norm": 0.1701316237449646,
"learning_rate": 0.003772879521422583,
"loss": 3.138026809692383,
"num_input_tokens_seen": 3559915520,
"step": 6790,
"train_runtime": 30855.8357,
"train_tokens_per_second": 115372.52
},
{
"epoch": 0.3679753240076842,
"grad_norm": 0.1724764108657837,
"learning_rate": 0.0037693763435566125,
"loss": 3.1394069671630858,
"num_input_tokens_seen": 3565158400,
"step": 6800,
"train_runtime": 30900.9517,
"train_tokens_per_second": 115373.741
},
{
"epoch": 0.3685164641900484,
"grad_norm": 0.16157887876033783,
"learning_rate": 0.00376587005358156,
"loss": 3.124007797241211,
"num_input_tokens_seen": 3570401280,
"step": 6810,
"train_runtime": 30946.0772,
"train_tokens_per_second": 115374.923
},
{
"epoch": 0.3690576043724127,
"grad_norm": 0.16729003190994263,
"learning_rate": 0.0037623606622019675,
"loss": 3.122986602783203,
"num_input_tokens_seen": 3575644160,
"step": 6820,
"train_runtime": 30991.3846,
"train_tokens_per_second": 115375.425
},
{
"epoch": 0.3695987445547769,
"grad_norm": 0.17239217460155487,
"learning_rate": 0.003758848180131846,
"loss": 3.1259433746337892,
"num_input_tokens_seen": 3580887040,
"step": 6830,
"train_runtime": 31036.5265,
"train_tokens_per_second": 115376.54
},
{
"epoch": 0.37013988473714116,
"grad_norm": 0.1540314108133316,
"learning_rate": 0.003755332618094642,
"loss": 3.128913688659668,
"num_input_tokens_seen": 3586129920,
"step": 6840,
"train_runtime": 31081.6974,
"train_tokens_per_second": 115377.544
},
{
"epoch": 0.3706810249195054,
"grad_norm": 0.16670770943164825,
"learning_rate": 0.0037518139868232036,
"loss": 3.1437910079956053,
"num_input_tokens_seen": 3591372800,
"step": 6850,
"train_runtime": 31126.8444,
"train_tokens_per_second": 115378.634
},
{
"epoch": 0.3712221651018696,
"grad_norm": 0.16100816428661346,
"learning_rate": 0.0037482922970597512,
"loss": 3.1303838729858398,
"num_input_tokens_seen": 3596615680,
"step": 6860,
"train_runtime": 31172.0038,
"train_tokens_per_second": 115379.675
},
{
"epoch": 0.3717633052842339,
"grad_norm": 0.1720798909664154,
"learning_rate": 0.0037447675595558417,
"loss": 3.139808464050293,
"num_input_tokens_seen": 3601858560,
"step": 6870,
"train_runtime": 31220.5874,
"train_tokens_per_second": 115368.059
},
{
"epoch": 0.37230444546659813,
"grad_norm": 0.15832237899303436,
"learning_rate": 0.0037412397850723356,
"loss": 3.1387088775634764,
"num_input_tokens_seen": 3607101440,
"step": 6880,
"train_runtime": 31265.7548,
"train_tokens_per_second": 115369.082
},
{
"epoch": 0.37284558564896236,
"grad_norm": 0.16572092473506927,
"learning_rate": 0.0037377089843793664,
"loss": 3.136234092712402,
"num_input_tokens_seen": 3612344320,
"step": 6890,
"train_runtime": 31310.8828,
"train_tokens_per_second": 115370.248
},
{
"epoch": 0.3733867258313266,
"grad_norm": 0.16967612504959106,
"learning_rate": 0.0037341751682563075,
"loss": 3.1306957244873046,
"num_input_tokens_seen": 3617587200,
"step": 6900,
"train_runtime": 31356.0169,
"train_tokens_per_second": 115371.388
},
{
"epoch": 0.3739278660136908,
"grad_norm": 0.16561359167099,
"learning_rate": 0.0037306383474917356,
"loss": 3.128021240234375,
"num_input_tokens_seen": 3622830080,
"step": 6910,
"train_runtime": 31401.1695,
"train_tokens_per_second": 115372.457
},
{
"epoch": 0.3744690061960551,
"grad_norm": 0.16602273285388947,
"learning_rate": 0.0037270985328834013,
"loss": 3.125231170654297,
"num_input_tokens_seen": 3628072960,
"step": 6920,
"train_runtime": 31446.3403,
"train_tokens_per_second": 115373.456
},
{
"epoch": 0.37501014637841934,
"grad_norm": 0.15461350977420807,
"learning_rate": 0.0037235557352381975,
"loss": 3.1283363342285155,
"num_input_tokens_seen": 3633315840,
"step": 6930,
"train_runtime": 31491.4936,
"train_tokens_per_second": 115374.516
},
{
"epoch": 0.37555128656078357,
"grad_norm": 0.17157427966594696,
"learning_rate": 0.003720009965372121,
"loss": 3.136751174926758,
"num_input_tokens_seen": 3638558720,
"step": 6940,
"train_runtime": 31536.6325,
"train_tokens_per_second": 115375.626
},
{
"epoch": 0.3760924267431478,
"grad_norm": 0.15815427899360657,
"learning_rate": 0.0037164612341102445,
"loss": 3.1335182189941406,
"num_input_tokens_seen": 3643801600,
"step": 6950,
"train_runtime": 31581.7854,
"train_tokens_per_second": 115376.682
},
{
"epoch": 0.37663356692551203,
"grad_norm": 0.16368745267391205,
"learning_rate": 0.003712909552286681,
"loss": 3.1299674987792967,
"num_input_tokens_seen": 3649044480,
"step": 6960,
"train_runtime": 31626.953,
"train_tokens_per_second": 115377.681
},
{
"epoch": 0.3771747071078763,
"grad_norm": 0.17233121395111084,
"learning_rate": 0.003709354930744553,
"loss": 3.1409616470336914,
"num_input_tokens_seen": 3654287360,
"step": 6970,
"train_runtime": 31672.1101,
"train_tokens_per_second": 115378.715
},
{
"epoch": 0.37771584729024055,
"grad_norm": 0.1784183382987976,
"learning_rate": 0.0037057973803359553,
"loss": 3.1445953369140627,
"num_input_tokens_seen": 3659530240,
"step": 6980,
"train_runtime": 31717.2675,
"train_tokens_per_second": 115379.745
},
{
"epoch": 0.3782569874726048,
"grad_norm": 0.1589273363351822,
"learning_rate": 0.003702236911921925,
"loss": 3.1336727142333984,
"num_input_tokens_seen": 3664773120,
"step": 6990,
"train_runtime": 31762.4428,
"train_tokens_per_second": 115380.707
},
{
"epoch": 0.378798127654969,
"grad_norm": 0.16604717075824738,
"learning_rate": 0.00369867353637241,
"loss": 3.125100326538086,
"num_input_tokens_seen": 3670016000,
"step": 7000,
"train_runtime": 31807.6091,
"train_tokens_per_second": 115381.7
},
{
"epoch": 0.378798127654969,
"eval_loss": 3.082562208175659,
"eval_runtime": 1.983,
"eval_samples_per_second": 252.143,
"eval_steps_per_second": 4.034,
"num_input_tokens_seen": 3670016000,
"step": 7000
},
{
"epoch": 0.37933926783733324,
"grad_norm": 0.16016067564487457,
"learning_rate": 0.003695107264566231,
"loss": 3.132742691040039,
"num_input_tokens_seen": 3675258880,
"step": 7010,
"train_runtime": 31857.0893,
"train_tokens_per_second": 115367.065
},
{
"epoch": 0.3798804080196975,
"grad_norm": 0.17284226417541504,
"learning_rate": 0.003691538107391052,
"loss": 3.1309505462646485,
"num_input_tokens_seen": 3680501760,
"step": 7020,
"train_runtime": 31902.2704,
"train_tokens_per_second": 115368.02
},
{
"epoch": 0.38042154820206175,
"grad_norm": 0.16180108487606049,
"learning_rate": 0.0036879660757433465,
"loss": 3.1276824951171873,
"num_input_tokens_seen": 3685744640,
"step": 7030,
"train_runtime": 31947.4422,
"train_tokens_per_second": 115369.006
},
{
"epoch": 0.380962688384426,
"grad_norm": 0.16350635886192322,
"learning_rate": 0.0036843911805283613,
"loss": 3.127395248413086,
"num_input_tokens_seen": 3690987520,
"step": 7040,
"train_runtime": 31992.5853,
"train_tokens_per_second": 115370.092
},
{
"epoch": 0.3815038285667902,
"grad_norm": 0.15854142606258392,
"learning_rate": 0.0036808134326600872,
"loss": 3.1203243255615236,
"num_input_tokens_seen": 3696230400,
"step": 7050,
"train_runtime": 32037.7375,
"train_tokens_per_second": 115371.143
},
{
"epoch": 0.38204496874915445,
"grad_norm": 0.1765364557504654,
"learning_rate": 0.0036772328430612245,
"loss": 3.1236772537231445,
"num_input_tokens_seen": 3701473280,
"step": 7060,
"train_runtime": 32082.8987,
"train_tokens_per_second": 115372.159
},
{
"epoch": 0.38258610893151873,
"grad_norm": 0.16590341925621033,
"learning_rate": 0.0036736494226631486,
"loss": 3.1179275512695312,
"num_input_tokens_seen": 3706716160,
"step": 7070,
"train_runtime": 32128.0461,
"train_tokens_per_second": 115373.221
},
{
"epoch": 0.38312724911388296,
"grad_norm": 0.1656789630651474,
"learning_rate": 0.0036700631824058763,
"loss": 3.1220640182495116,
"num_input_tokens_seen": 3711959040,
"step": 7080,
"train_runtime": 32173.2014,
"train_tokens_per_second": 115374.252
},
{
"epoch": 0.3836683892962472,
"grad_norm": 0.18290071189403534,
"learning_rate": 0.003666474133238036,
"loss": 3.130259704589844,
"num_input_tokens_seen": 3717201920,
"step": 7090,
"train_runtime": 32218.3695,
"train_tokens_per_second": 115375.234
},
{
"epoch": 0.3842095294786114,
"grad_norm": 0.1678554117679596,
"learning_rate": 0.003662882286116827,
"loss": 3.128999137878418,
"num_input_tokens_seen": 3722444800,
"step": 7100,
"train_runtime": 32263.5278,
"train_tokens_per_second": 115376.248
},
{
"epoch": 0.38475066966097565,
"grad_norm": 0.16328170895576477,
"learning_rate": 0.0036592876520079956,
"loss": 3.1096935272216797,
"num_input_tokens_seen": 3727687680,
"step": 7110,
"train_runtime": 32308.6892,
"train_tokens_per_second": 115377.249
},
{
"epoch": 0.38529180984333994,
"grad_norm": 0.16377384960651398,
"learning_rate": 0.0036556902418857927,
"loss": 3.1283496856689452,
"num_input_tokens_seen": 3732930560,
"step": 7120,
"train_runtime": 32353.8348,
"train_tokens_per_second": 115378.303
},
{
"epoch": 0.38583295002570417,
"grad_norm": 0.17365527153015137,
"learning_rate": 0.0036520900667329475,
"loss": 3.1340274810791016,
"num_input_tokens_seen": 3738173440,
"step": 7130,
"train_runtime": 32398.9948,
"train_tokens_per_second": 115379.303
},
{
"epoch": 0.3863740902080684,
"grad_norm": 0.17289578914642334,
"learning_rate": 0.003648487137540628,
"loss": 3.126075553894043,
"num_input_tokens_seen": 3743416320,
"step": 7140,
"train_runtime": 32444.1388,
"train_tokens_per_second": 115380.357
},
{
"epoch": 0.38691523039043263,
"grad_norm": 0.1867065280675888,
"learning_rate": 0.003644881465308411,
"loss": 3.1279239654541016,
"num_input_tokens_seen": 3748659200,
"step": 7150,
"train_runtime": 32489.3038,
"train_tokens_per_second": 115381.334
},
{
"epoch": 0.38745637057279686,
"grad_norm": 0.16090157628059387,
"learning_rate": 0.003641273061044249,
"loss": 3.126418685913086,
"num_input_tokens_seen": 3753902080,
"step": 7160,
"train_runtime": 32534.4706,
"train_tokens_per_second": 115382.301
},
{
"epoch": 0.38799751075516115,
"grad_norm": 0.16933725774288177,
"learning_rate": 0.003637661935764434,
"loss": 3.1228607177734373,
"num_input_tokens_seen": 3759144960,
"step": 7170,
"train_runtime": 32579.6304,
"train_tokens_per_second": 115383.29
},
{
"epoch": 0.3885386509375254,
"grad_norm": 0.16463743150234222,
"learning_rate": 0.003634048100493565,
"loss": 3.1265775680541994,
"num_input_tokens_seen": 3764387840,
"step": 7180,
"train_runtime": 32624.7971,
"train_tokens_per_second": 115384.253
},
{
"epoch": 0.3890797911198896,
"grad_norm": 0.15814442932605743,
"learning_rate": 0.003630431566264515,
"loss": 3.126376724243164,
"num_input_tokens_seen": 3769630720,
"step": 7190,
"train_runtime": 32669.9527,
"train_tokens_per_second": 115385.252
},
{
"epoch": 0.38962093130225384,
"grad_norm": 0.16953812539577484,
"learning_rate": 0.0036268123441183966,
"loss": 3.1293899536132814,
"num_input_tokens_seen": 3774873600,
"step": 7200,
"train_runtime": 32715.1316,
"train_tokens_per_second": 115386.166
},
{
"epoch": 0.39016207148461807,
"grad_norm": 0.18077914416790009,
"learning_rate": 0.003623190445104527,
"loss": 3.130533218383789,
"num_input_tokens_seen": 3780116480,
"step": 7210,
"train_runtime": 32760.3295,
"train_tokens_per_second": 115387.01
},
{
"epoch": 0.39070321166698235,
"grad_norm": 0.17073588073253632,
"learning_rate": 0.003619565880280401,
"loss": 3.1266639709472654,
"num_input_tokens_seen": 3785359360,
"step": 7220,
"train_runtime": 32805.4983,
"train_tokens_per_second": 115387.955
},
{
"epoch": 0.3912443518493466,
"grad_norm": 0.16945651173591614,
"learning_rate": 0.0036159386607116446,
"loss": 3.1234695434570314,
"num_input_tokens_seen": 3790602240,
"step": 7230,
"train_runtime": 32850.6502,
"train_tokens_per_second": 115388.956
},
{
"epoch": 0.3917854920317108,
"grad_norm": 0.17761710286140442,
"learning_rate": 0.0036123087974719937,
"loss": 3.127792739868164,
"num_input_tokens_seen": 3795845120,
"step": 7240,
"train_runtime": 32895.8256,
"train_tokens_per_second": 115389.873
},
{
"epoch": 0.39232663221407504,
"grad_norm": 0.16878648102283478,
"learning_rate": 0.0036086763016432545,
"loss": 3.120273208618164,
"num_input_tokens_seen": 3801088000,
"step": 7250,
"train_runtime": 32945.144,
"train_tokens_per_second": 115376.275
},
{
"epoch": 0.3928677723964393,
"grad_norm": 0.15386980772018433,
"learning_rate": 0.0036050411843152686,
"loss": 3.1222068786621096,
"num_input_tokens_seen": 3806330880,
"step": 7260,
"train_runtime": 32990.288,
"train_tokens_per_second": 115377.316
},
{
"epoch": 0.39340891257880356,
"grad_norm": 0.16980594396591187,
"learning_rate": 0.0036014034565858824,
"loss": 3.1281028747558595,
"num_input_tokens_seen": 3811573760,
"step": 7270,
"train_runtime": 33035.4429,
"train_tokens_per_second": 115378.316
},
{
"epoch": 0.3939500527611678,
"grad_norm": 0.17536021769046783,
"learning_rate": 0.003597763129560911,
"loss": 3.1235652923583985,
"num_input_tokens_seen": 3816816640,
"step": 7280,
"train_runtime": 33080.605,
"train_tokens_per_second": 115379.288
},
{
"epoch": 0.394491192943532,
"grad_norm": 0.1680123209953308,
"learning_rate": 0.0035941202143541053,
"loss": 3.123764991760254,
"num_input_tokens_seen": 3822059520,
"step": 7290,
"train_runtime": 33125.7503,
"train_tokens_per_second": 115380.315
},
{
"epoch": 0.39503233312589625,
"grad_norm": 0.15840236842632294,
"learning_rate": 0.003590474722087118,
"loss": 3.124995803833008,
"num_input_tokens_seen": 3827302400,
"step": 7300,
"train_runtime": 33170.9067,
"train_tokens_per_second": 115381.302
},
{
"epoch": 0.3955734733082605,
"grad_norm": 0.1702660471200943,
"learning_rate": 0.00358682666388947,
"loss": 3.1230545043945312,
"num_input_tokens_seen": 3832545280,
"step": 7310,
"train_runtime": 33216.0627,
"train_tokens_per_second": 115382.287
},
{
"epoch": 0.39611461349062477,
"grad_norm": 0.14530692994594574,
"learning_rate": 0.003583176050898514,
"loss": 3.1195556640625,
"num_input_tokens_seen": 3837788160,
"step": 7320,
"train_runtime": 33261.2169,
"train_tokens_per_second": 115383.276
},
{
"epoch": 0.396655753672989,
"grad_norm": 0.16137973964214325,
"learning_rate": 0.003579522894259404,
"loss": 3.122934341430664,
"num_input_tokens_seen": 3843031040,
"step": 7330,
"train_runtime": 33306.3711,
"train_tokens_per_second": 115384.262
},
{
"epoch": 0.39719689385535323,
"grad_norm": 0.17957496643066406,
"learning_rate": 0.0035758672051250597,
"loss": 3.118304443359375,
"num_input_tokens_seen": 3848273920,
"step": 7340,
"train_runtime": 33351.4951,
"train_tokens_per_second": 115385.35
},
{
"epoch": 0.39773803403771746,
"grad_norm": 0.1619359254837036,
"learning_rate": 0.003572208994656131,
"loss": 3.126445007324219,
"num_input_tokens_seen": 3853516800,
"step": 7350,
"train_runtime": 33396.6238,
"train_tokens_per_second": 115386.418
},
{
"epoch": 0.3982791742200817,
"grad_norm": 0.17734915018081665,
"learning_rate": 0.003568548274020967,
"loss": 3.1167884826660157,
"num_input_tokens_seen": 3858759680,
"step": 7360,
"train_runtime": 33441.7562,
"train_tokens_per_second": 115387.471
},
{
"epoch": 0.398820314402446,
"grad_norm": 0.17586900293827057,
"learning_rate": 0.0035648850543955773,
"loss": 3.1228519439697267,
"num_input_tokens_seen": 3864002560,
"step": 7370,
"train_runtime": 33486.9063,
"train_tokens_per_second": 115388.46
},
{
"epoch": 0.3993614545848102,
"grad_norm": 0.17276950180530548,
"learning_rate": 0.0035612193469636054,
"loss": 3.1270915985107424,
"num_input_tokens_seen": 3869245440,
"step": 7380,
"train_runtime": 33532.0567,
"train_tokens_per_second": 115389.446
},
{
"epoch": 0.39990259476717444,
"grad_norm": 0.1578545719385147,
"learning_rate": 0.0035575511629162876,
"loss": 3.102129364013672,
"num_input_tokens_seen": 3874488320,
"step": 7390,
"train_runtime": 33577.2022,
"train_tokens_per_second": 115390.445
},
{
"epoch": 0.40044373494953867,
"grad_norm": 0.15498770773410797,
"learning_rate": 0.0035538805134524183,
"loss": 3.115239715576172,
"num_input_tokens_seen": 3879731200,
"step": 7400,
"train_runtime": 33622.363,
"train_tokens_per_second": 115391.39
},
{
"epoch": 0.4009848751319029,
"grad_norm": 0.15868115425109863,
"learning_rate": 0.0035502074097783242,
"loss": 3.1181896209716795,
"num_input_tokens_seen": 3884974080,
"step": 7410,
"train_runtime": 33667.5163,
"train_tokens_per_second": 115392.358
},
{
"epoch": 0.4015260153142672,
"grad_norm": 0.1605597585439682,
"learning_rate": 0.0035465318631078204,
"loss": 3.113156318664551,
"num_input_tokens_seen": 3890216960,
"step": 7420,
"train_runtime": 33712.6623,
"train_tokens_per_second": 115393.348
},
{
"epoch": 0.4020671554966314,
"grad_norm": 0.17280755937099457,
"learning_rate": 0.003542853884662183,
"loss": 3.1183053970336916,
"num_input_tokens_seen": 3895459840,
"step": 7430,
"train_runtime": 33757.8255,
"train_tokens_per_second": 115394.276
},
{
"epoch": 0.40260829567899564,
"grad_norm": 0.16187331080436707,
"learning_rate": 0.0035391734856701092,
"loss": 3.1163970947265627,
"num_input_tokens_seen": 3900702720,
"step": 7440,
"train_runtime": 33802.989,
"train_tokens_per_second": 115395.201
},
{
"epoch": 0.4031494358613599,
"grad_norm": 0.1724129021167755,
"learning_rate": 0.0035354906773676894,
"loss": 3.1170070648193358,
"num_input_tokens_seen": 3905945600,
"step": 7450,
"train_runtime": 33848.1517,
"train_tokens_per_second": 115396.127
},
{
"epoch": 0.4036905760437241,
"grad_norm": 0.17225228250026703,
"learning_rate": 0.003531805470998366,
"loss": 3.110821533203125,
"num_input_tokens_seen": 3911188480,
"step": 7460,
"train_runtime": 33893.3266,
"train_tokens_per_second": 115397.008
},
{
"epoch": 0.4042317162260884,
"grad_norm": 0.1592818796634674,
"learning_rate": 0.0035281178778129073,
"loss": 3.116873931884766,
"num_input_tokens_seen": 3916431360,
"step": 7470,
"train_runtime": 33938.5013,
"train_tokens_per_second": 115397.888
},
{
"epoch": 0.4047728564084526,
"grad_norm": 0.1658582091331482,
"learning_rate": 0.0035244279090693633,
"loss": 3.1268436431884767,
"num_input_tokens_seen": 3921674240,
"step": 7480,
"train_runtime": 33983.671,
"train_tokens_per_second": 115398.782
},
{
"epoch": 0.40531399659081685,
"grad_norm": 0.14836189150810242,
"learning_rate": 0.00352073557603304,
"loss": 3.114876556396484,
"num_input_tokens_seen": 3926917120,
"step": 7490,
"train_runtime": 34028.8257,
"train_tokens_per_second": 115399.725
},
{
"epoch": 0.4058551367731811,
"grad_norm": 0.16045086085796356,
"learning_rate": 0.0035170408899764605,
"loss": 3.1156852722167967,
"num_input_tokens_seen": 3932160000,
"step": 7500,
"train_runtime": 34073.9726,
"train_tokens_per_second": 115400.692
},
{
"epoch": 0.4058551367731811,
"eval_loss": 3.0682461261749268,
"eval_runtime": 1.9852,
"eval_samples_per_second": 251.858,
"eval_steps_per_second": 4.03,
"num_input_tokens_seen": 3932160000,
"step": 7500
},
{
"epoch": 0.4063962769555453,
"grad_norm": 0.16971535980701447,
"learning_rate": 0.0035133438621793296,
"loss": 3.1024160385131836,
"num_input_tokens_seen": 3937402880,
"step": 7510,
"train_runtime": 34121.1044,
"train_tokens_per_second": 115394.943
},
{
"epoch": 0.4069374171379096,
"grad_norm": 0.16736076772212982,
"learning_rate": 0.003509644503928506,
"loss": 3.1206098556518556,
"num_input_tokens_seen": 3942645760,
"step": 7520,
"train_runtime": 34166.2706,
"train_tokens_per_second": 115395.848
},
{
"epoch": 0.4074785573202738,
"grad_norm": 0.16113705933094025,
"learning_rate": 0.0035059428265179567,
"loss": 3.117937469482422,
"num_input_tokens_seen": 3947888640,
"step": 7530,
"train_runtime": 34211.4099,
"train_tokens_per_second": 115396.841
},
{
"epoch": 0.40801969750263806,
"grad_norm": 0.17517107725143433,
"learning_rate": 0.0035022388412487356,
"loss": 3.1136932373046875,
"num_input_tokens_seen": 3953131520,
"step": 7540,
"train_runtime": 34256.533,
"train_tokens_per_second": 115397.887
},
{
"epoch": 0.4085608376850023,
"grad_norm": 0.18709343671798706,
"learning_rate": 0.003498532559428938,
"loss": 3.125676918029785,
"num_input_tokens_seen": 3958374400,
"step": 7550,
"train_runtime": 34301.6505,
"train_tokens_per_second": 115398.949
},
{
"epoch": 0.4091019778673665,
"grad_norm": 0.1633439064025879,
"learning_rate": 0.0034948239923736713,
"loss": 3.1128585815429686,
"num_input_tokens_seen": 3963617280,
"step": 7560,
"train_runtime": 34346.7672,
"train_tokens_per_second": 115400.01
},
{
"epoch": 0.4096431180497308,
"grad_norm": 0.16776174306869507,
"learning_rate": 0.0034911131514050214,
"loss": 3.114968681335449,
"num_input_tokens_seen": 3968860160,
"step": 7570,
"train_runtime": 34391.881,
"train_tokens_per_second": 115401.078
},
{
"epoch": 0.41018425823209503,
"grad_norm": 0.17015814781188965,
"learning_rate": 0.0034874000478520148,
"loss": 3.1098609924316407,
"num_input_tokens_seen": 3974103040,
"step": 7580,
"train_runtime": 34437.0464,
"train_tokens_per_second": 115401.971
},
{
"epoch": 0.41072539841445926,
"grad_norm": 0.1705334633588791,
"learning_rate": 0.0034836846930505843,
"loss": 3.1172601699829103,
"num_input_tokens_seen": 3979345920,
"step": 7590,
"train_runtime": 34482.2174,
"train_tokens_per_second": 115402.843
},
{
"epoch": 0.4112665385968235,
"grad_norm": 0.17283746600151062,
"learning_rate": 0.0034799670983435395,
"loss": 3.1093212127685548,
"num_input_tokens_seen": 3984588800,
"step": 7600,
"train_runtime": 34527.3958,
"train_tokens_per_second": 115403.688
},
{
"epoch": 0.4118076787791877,
"grad_norm": 0.1661679744720459,
"learning_rate": 0.003476247275080524,
"loss": 3.114109992980957,
"num_input_tokens_seen": 3989831680,
"step": 7610,
"train_runtime": 34572.5667,
"train_tokens_per_second": 115404.555
},
{
"epoch": 0.412348818961552,
"grad_norm": 0.16221173107624054,
"learning_rate": 0.003472525234617988,
"loss": 3.1130563735961916,
"num_input_tokens_seen": 3995074560,
"step": 7620,
"train_runtime": 34617.7625,
"train_tokens_per_second": 115405.337
},
{
"epoch": 0.41288995914391624,
"grad_norm": 0.16985613107681274,
"learning_rate": 0.0034688009883191507,
"loss": 3.1183204650878906,
"num_input_tokens_seen": 4000317440,
"step": 7630,
"train_runtime": 34662.948,
"train_tokens_per_second": 115406.152
},
{
"epoch": 0.41343109932628047,
"grad_norm": 0.15718990564346313,
"learning_rate": 0.003465074547553963,
"loss": 3.1192548751831053,
"num_input_tokens_seen": 4005560320,
"step": 7640,
"train_runtime": 34711.7967,
"train_tokens_per_second": 115394.785
},
{
"epoch": 0.4139722395086447,
"grad_norm": 0.16134141385555267,
"learning_rate": 0.0034613459236990775,
"loss": 3.1101545333862304,
"num_input_tokens_seen": 4010803200,
"step": 7650,
"train_runtime": 34756.9452,
"train_tokens_per_second": 115395.734
},
{
"epoch": 0.41451337969100893,
"grad_norm": 0.16892403364181519,
"learning_rate": 0.0034576151281378127,
"loss": 3.103810691833496,
"num_input_tokens_seen": 4016046080,
"step": 7660,
"train_runtime": 34802.1069,
"train_tokens_per_second": 115396.637
},
{
"epoch": 0.4150545198733732,
"grad_norm": 0.15722833573818207,
"learning_rate": 0.003453882172260114,
"loss": 3.109886360168457,
"num_input_tokens_seen": 4021288960,
"step": 7670,
"train_runtime": 34847.275,
"train_tokens_per_second": 115397.516
},
{
"epoch": 0.41559566005573745,
"grad_norm": 0.16605538129806519,
"learning_rate": 0.0034501470674625258,
"loss": 3.110805892944336,
"num_input_tokens_seen": 4026531840,
"step": 7680,
"train_runtime": 34892.47,
"train_tokens_per_second": 115398.303
},
{
"epoch": 0.4161368002381017,
"grad_norm": 0.1643964648246765,
"learning_rate": 0.003446409825148149,
"loss": 3.11865348815918,
"num_input_tokens_seen": 4031774720,
"step": 7690,
"train_runtime": 34937.6366,
"train_tokens_per_second": 115399.183
},
{
"epoch": 0.4166779404204659,
"grad_norm": 0.17231661081314087,
"learning_rate": 0.003442670456726614,
"loss": 3.117427444458008,
"num_input_tokens_seen": 4037017600,
"step": 7700,
"train_runtime": 34982.8067,
"train_tokens_per_second": 115400.049
},
{
"epoch": 0.41721908060283014,
"grad_norm": 0.16913042962551117,
"learning_rate": 0.0034389289736140405,
"loss": 3.1114864349365234,
"num_input_tokens_seen": 4042260480,
"step": 7710,
"train_runtime": 35027.9883,
"train_tokens_per_second": 115400.874
},
{
"epoch": 0.4177602207851944,
"grad_norm": 0.16182249784469604,
"learning_rate": 0.0034351853872330042,
"loss": 3.107219696044922,
"num_input_tokens_seen": 4047503360,
"step": 7720,
"train_runtime": 35073.1627,
"train_tokens_per_second": 115401.722
},
{
"epoch": 0.41830136096755866,
"grad_norm": 0.15614280104637146,
"learning_rate": 0.003431439709012501,
"loss": 3.10361385345459,
"num_input_tokens_seen": 4052746240,
"step": 7730,
"train_runtime": 35118.3339,
"train_tokens_per_second": 115402.577
},
{
"epoch": 0.4188425011499229,
"grad_norm": 0.16172853112220764,
"learning_rate": 0.003427691950387916,
"loss": 3.10665225982666,
"num_input_tokens_seen": 4057989120,
"step": 7740,
"train_runtime": 35163.5186,
"train_tokens_per_second": 115403.386
},
{
"epoch": 0.4193836413322871,
"grad_norm": 0.1584424078464508,
"learning_rate": 0.0034239421228009826,
"loss": 3.109303665161133,
"num_input_tokens_seen": 4063232000,
"step": 7750,
"train_runtime": 35208.6903,
"train_tokens_per_second": 115404.236
},
{
"epoch": 0.41992478151465135,
"grad_norm": 0.15736353397369385,
"learning_rate": 0.0034201902376997523,
"loss": 3.1072481155395506,
"num_input_tokens_seen": 4068474880,
"step": 7760,
"train_runtime": 35253.8805,
"train_tokens_per_second": 115405.023
},
{
"epoch": 0.42046592169701563,
"grad_norm": 0.158221036195755,
"learning_rate": 0.0034164363065385577,
"loss": 3.107033920288086,
"num_input_tokens_seen": 4073717760,
"step": 7770,
"train_runtime": 35299.0377,
"train_tokens_per_second": 115405.915
},
{
"epoch": 0.42100706187937986,
"grad_norm": 0.16100963950157166,
"learning_rate": 0.0034126803407779783,
"loss": 3.102493667602539,
"num_input_tokens_seen": 4078960640,
"step": 7780,
"train_runtime": 35344.2177,
"train_tokens_per_second": 115406.732
},
{
"epoch": 0.4215482020617441,
"grad_norm": 0.15508411824703217,
"learning_rate": 0.0034089223518848043,
"loss": 3.110720634460449,
"num_input_tokens_seen": 4084203520,
"step": 7790,
"train_runtime": 35389.3807,
"train_tokens_per_second": 115407.601
},
{
"epoch": 0.4220893422441083,
"grad_norm": 0.16234534978866577,
"learning_rate": 0.0034051623513320028,
"loss": 3.116852378845215,
"num_input_tokens_seen": 4089446400,
"step": 7800,
"train_runtime": 35434.5473,
"train_tokens_per_second": 115408.456
},
{
"epoch": 0.42263048242647255,
"grad_norm": 0.15150156617164612,
"learning_rate": 0.003401400350598683,
"loss": 3.110218048095703,
"num_input_tokens_seen": 4094689280,
"step": 7810,
"train_runtime": 35479.7081,
"train_tokens_per_second": 115409.328
},
{
"epoch": 0.42317162260883684,
"grad_norm": 0.16316647827625275,
"learning_rate": 0.0033976363611700608,
"loss": 3.099168395996094,
"num_input_tokens_seen": 4099932160,
"step": 7820,
"train_runtime": 35524.9004,
"train_tokens_per_second": 115410.096
},
{
"epoch": 0.42371276279120107,
"grad_norm": 0.15622437000274658,
"learning_rate": 0.00339387039453742,
"loss": 3.1079681396484373,
"num_input_tokens_seen": 4105175040,
"step": 7830,
"train_runtime": 35570.568,
"train_tokens_per_second": 115409.319
},
{
"epoch": 0.4242539029735653,
"grad_norm": 0.1611352562904358,
"learning_rate": 0.0033901024621980865,
"loss": 3.1027732849121095,
"num_input_tokens_seen": 4110417920,
"step": 7840,
"train_runtime": 35628.7933,
"train_tokens_per_second": 115367.868
},
{
"epoch": 0.42479504315592953,
"grad_norm": 0.1534154713153839,
"learning_rate": 0.0033863325756553824,
"loss": 3.1010990142822266,
"num_input_tokens_seen": 4115660800,
"step": 7850,
"train_runtime": 35677.6783,
"train_tokens_per_second": 115356.744
},
{
"epoch": 0.42533618333829376,
"grad_norm": 0.16484984755516052,
"learning_rate": 0.0033825607464185994,
"loss": 3.0935718536376955,
"num_input_tokens_seen": 4120903680,
"step": 7860,
"train_runtime": 35722.8483,
"train_tokens_per_second": 115357.646
},
{
"epoch": 0.42587732352065805,
"grad_norm": 0.15278859436511993,
"learning_rate": 0.0033787869860029576,
"loss": 3.095734405517578,
"num_input_tokens_seen": 4126146560,
"step": 7870,
"train_runtime": 35768.0118,
"train_tokens_per_second": 115358.566
},
{
"epoch": 0.4264184637030223,
"grad_norm": 0.16884206235408783,
"learning_rate": 0.003375011305929574,
"loss": 3.1056522369384765,
"num_input_tokens_seen": 4131389440,
"step": 7880,
"train_runtime": 35813.1554,
"train_tokens_per_second": 115359.549
},
{
"epoch": 0.4269596038853865,
"grad_norm": 0.15963584184646606,
"learning_rate": 0.003371233717725426,
"loss": 3.1040569305419923,
"num_input_tokens_seen": 4136632320,
"step": 7890,
"train_runtime": 35858.3104,
"train_tokens_per_second": 115360.492
},
{
"epoch": 0.42750074406775074,
"grad_norm": 0.1541411578655243,
"learning_rate": 0.0033674542329233175,
"loss": 3.1086753845214843,
"num_input_tokens_seen": 4141875200,
"step": 7900,
"train_runtime": 35903.4547,
"train_tokens_per_second": 115361.467
},
{
"epoch": 0.42804188425011497,
"grad_norm": 0.16819094121456146,
"learning_rate": 0.003363672863061842,
"loss": 3.108404350280762,
"num_input_tokens_seen": 4147118080,
"step": 7910,
"train_runtime": 35948.5895,
"train_tokens_per_second": 115362.47
},
{
"epoch": 0.42858302443247925,
"grad_norm": 0.15858127176761627,
"learning_rate": 0.003359889619685346,
"loss": 3.1061111450195313,
"num_input_tokens_seen": 4152360960,
"step": 7920,
"train_runtime": 35993.7102,
"train_tokens_per_second": 115363.516
},
{
"epoch": 0.4291241646148435,
"grad_norm": 0.15731550753116608,
"learning_rate": 0.003356104514343899,
"loss": 3.1057785034179686,
"num_input_tokens_seen": 4157603840,
"step": 7930,
"train_runtime": 36038.862,
"train_tokens_per_second": 115364.46
},
{
"epoch": 0.4296653047972077,
"grad_norm": 0.14610068500041962,
"learning_rate": 0.0033523175585932524,
"loss": 3.09300537109375,
"num_input_tokens_seen": 4162846720,
"step": 7940,
"train_runtime": 36084.0029,
"train_tokens_per_second": 115365.436
},
{
"epoch": 0.43020644497957194,
"grad_norm": 0.1672324687242508,
"learning_rate": 0.003348528763994809,
"loss": 3.1017438888549806,
"num_input_tokens_seen": 4168089600,
"step": 7950,
"train_runtime": 36129.1342,
"train_tokens_per_second": 115366.44
},
{
"epoch": 0.4307475851619362,
"grad_norm": 0.1700795590877533,
"learning_rate": 0.003344738142115583,
"loss": 3.0958410263061524,
"num_input_tokens_seen": 4173332480,
"step": 7960,
"train_runtime": 36174.3238,
"train_tokens_per_second": 115367.256
},
{
"epoch": 0.43128872534430046,
"grad_norm": 0.15165534615516663,
"learning_rate": 0.00334094570452817,
"loss": 3.101241874694824,
"num_input_tokens_seen": 4178575360,
"step": 7970,
"train_runtime": 36219.5195,
"train_tokens_per_second": 115368.051
},
{
"epoch": 0.4318298655266647,
"grad_norm": 0.1584347039461136,
"learning_rate": 0.0033371514628107073,
"loss": 3.101197052001953,
"num_input_tokens_seen": 4183818240,
"step": 7980,
"train_runtime": 36264.6831,
"train_tokens_per_second": 115368.945
},
{
"epoch": 0.4323710057090289,
"grad_norm": 0.15928910672664642,
"learning_rate": 0.0033333554285468387,
"loss": 3.1082935333251953,
"num_input_tokens_seen": 4189061120,
"step": 7990,
"train_runtime": 36309.973,
"train_tokens_per_second": 115369.436
},
{
"epoch": 0.43291214589139315,
"grad_norm": 0.15439751744270325,
"learning_rate": 0.003329557613325685,
"loss": 3.1111793518066406,
"num_input_tokens_seen": 4194304000,
"step": 8000,
"train_runtime": 36355.1728,
"train_tokens_per_second": 115370.212
},
{
"epoch": 0.43291214589139315,
"eval_loss": 3.0542824268341064,
"eval_runtime": 1.9899,
"eval_samples_per_second": 251.266,
"eval_steps_per_second": 4.02,
"num_input_tokens_seen": 4194304000,
"step": 8000
},
{
"epoch": 0.4334532860737574,
"grad_norm": 0.1620936542749405,
"learning_rate": 0.0033257580287417987,
"loss": 3.1044567108154295,
"num_input_tokens_seen": 4199546880,
"step": 8010,
"train_runtime": 36404.8051,
"train_tokens_per_second": 115356.939
},
{
"epoch": 0.43399442625612167,
"grad_norm": 0.16753153502941132,
"learning_rate": 0.0033219566863951383,
"loss": 3.0971731185913085,
"num_input_tokens_seen": 4204789760,
"step": 8020,
"train_runtime": 36454.2975,
"train_tokens_per_second": 115344.144
},
{
"epoch": 0.4345355664384859,
"grad_norm": 0.15557527542114258,
"learning_rate": 0.0033181535978910265,
"loss": 3.099981689453125,
"num_input_tokens_seen": 4210032640,
"step": 8030,
"train_runtime": 36499.5202,
"train_tokens_per_second": 115344.876
},
{
"epoch": 0.43507670662085013,
"grad_norm": 0.15290997922420502,
"learning_rate": 0.0033143487748401174,
"loss": 3.1018728256225585,
"num_input_tokens_seen": 4215275520,
"step": 8040,
"train_runtime": 36544.6707,
"train_tokens_per_second": 115345.834
},
{
"epoch": 0.43561784680321436,
"grad_norm": 0.15225110948085785,
"learning_rate": 0.0033105422288583616,
"loss": 3.09820671081543,
"num_input_tokens_seen": 4220518400,
"step": 8050,
"train_runtime": 36589.8576,
"train_tokens_per_second": 115346.675
},
{
"epoch": 0.4361589869855786,
"grad_norm": 0.16044707596302032,
"learning_rate": 0.003306733971566968,
"loss": 3.1036590576171874,
"num_input_tokens_seen": 4225761280,
"step": 8060,
"train_runtime": 36635.0375,
"train_tokens_per_second": 115347.535
},
{
"epoch": 0.4367001271679429,
"grad_norm": 0.1795279085636139,
"learning_rate": 0.0033029240145923708,
"loss": 3.102092170715332,
"num_input_tokens_seen": 4231004160,
"step": 8070,
"train_runtime": 36680.2289,
"train_tokens_per_second": 115348.357
},
{
"epoch": 0.4372412673503071,
"grad_norm": 0.16536639630794525,
"learning_rate": 0.003299112369566194,
"loss": 3.101215934753418,
"num_input_tokens_seen": 4236247040,
"step": 8080,
"train_runtime": 36725.4044,
"train_tokens_per_second": 115349.228
},
{
"epoch": 0.43778240753267134,
"grad_norm": 0.1551489681005478,
"learning_rate": 0.003295299048125215,
"loss": 3.1048954010009764,
"num_input_tokens_seen": 4241489920,
"step": 8090,
"train_runtime": 36770.5602,
"train_tokens_per_second": 115350.158
},
{
"epoch": 0.43832354771503557,
"grad_norm": 0.15728254616260529,
"learning_rate": 0.0032914840619113267,
"loss": 3.0963891983032226,
"num_input_tokens_seen": 4246732800,
"step": 8100,
"train_runtime": 36815.7389,
"train_tokens_per_second": 115351.014
},
{
"epoch": 0.4388646878973998,
"grad_norm": 0.16317813098430634,
"learning_rate": 0.0032876674225715092,
"loss": 3.095835876464844,
"num_input_tokens_seen": 4251975680,
"step": 8110,
"train_runtime": 36860.9041,
"train_tokens_per_second": 115351.91
},
{
"epoch": 0.4394058280797641,
"grad_norm": 0.16513289511203766,
"learning_rate": 0.0032838491417577845,
"loss": 3.100272369384766,
"num_input_tokens_seen": 4257218560,
"step": 8120,
"train_runtime": 36906.0614,
"train_tokens_per_second": 115352.828
},
{
"epoch": 0.4399469682621283,
"grad_norm": 0.154524028301239,
"learning_rate": 0.003280029231127189,
"loss": 3.1007152557373048,
"num_input_tokens_seen": 4262461440,
"step": 8130,
"train_runtime": 36951.2345,
"train_tokens_per_second": 115353.695
},
{
"epoch": 0.44048810844449254,
"grad_norm": 0.16192783415317535,
"learning_rate": 0.003276207702341735,
"loss": 3.1067665100097654,
"num_input_tokens_seen": 4267704320,
"step": 8140,
"train_runtime": 36996.3989,
"train_tokens_per_second": 115354.587
},
{
"epoch": 0.4410292486268568,
"grad_norm": 0.1726672351360321,
"learning_rate": 0.003272384567068373,
"loss": 3.098089027404785,
"num_input_tokens_seen": 4272947200,
"step": 8150,
"train_runtime": 37041.5682,
"train_tokens_per_second": 115355.462
},
{
"epoch": 0.441570388809221,
"grad_norm": 0.14850319921970367,
"learning_rate": 0.00326855983697896,
"loss": 3.0921985626220705,
"num_input_tokens_seen": 4278190080,
"step": 8160,
"train_runtime": 37086.741,
"train_tokens_per_second": 115356.323
},
{
"epoch": 0.4421115289915853,
"grad_norm": 0.15166330337524414,
"learning_rate": 0.0032647335237502195,
"loss": 3.101424789428711,
"num_input_tokens_seen": 4283432960,
"step": 8170,
"train_runtime": 37131.9267,
"train_tokens_per_second": 115357.142
},
{
"epoch": 0.4426526691739495,
"grad_norm": 0.15639446675777435,
"learning_rate": 0.0032609056390637114,
"loss": 3.098773193359375,
"num_input_tokens_seen": 4288675840,
"step": 8180,
"train_runtime": 37177.0966,
"train_tokens_per_second": 115358.009
},
{
"epoch": 0.44319380935631375,
"grad_norm": 0.1532983034849167,
"learning_rate": 0.003257076194605791,
"loss": 3.1019330978393556,
"num_input_tokens_seen": 4293918720,
"step": 8190,
"train_runtime": 37222.2634,
"train_tokens_per_second": 115358.883
},
{
"epoch": 0.443734949538678,
"grad_norm": 0.17717613279819489,
"learning_rate": 0.0032532452020675763,
"loss": 3.099607467651367,
"num_input_tokens_seen": 4299161600,
"step": 8200,
"train_runtime": 37267.4247,
"train_tokens_per_second": 115359.772
},
{
"epoch": 0.4442760897210422,
"grad_norm": 0.15027566254138947,
"learning_rate": 0.00324941267314491,
"loss": 3.107021713256836,
"num_input_tokens_seen": 4304404480,
"step": 8210,
"train_runtime": 37312.5891,
"train_tokens_per_second": 115360.649
},
{
"epoch": 0.4448172299034065,
"grad_norm": 0.16586042940616608,
"learning_rate": 0.0032455786195383285,
"loss": 3.0993444442749025,
"num_input_tokens_seen": 4309647360,
"step": 8220,
"train_runtime": 37357.762,
"train_tokens_per_second": 115361.497
},
{
"epoch": 0.4453583700857707,
"grad_norm": 0.16227947175502777,
"learning_rate": 0.00324174305295302,
"loss": 3.0918336868286134,
"num_input_tokens_seen": 4314890240,
"step": 8230,
"train_runtime": 37402.9165,
"train_tokens_per_second": 115362.401
},
{
"epoch": 0.44589951026813496,
"grad_norm": 0.16855213046073914,
"learning_rate": 0.0032379059850987926,
"loss": 3.0997894287109373,
"num_input_tokens_seen": 4320133120,
"step": 8240,
"train_runtime": 37448.08,
"train_tokens_per_second": 115363.274
},
{
"epoch": 0.4464406504504992,
"grad_norm": 0.15484896302223206,
"learning_rate": 0.003234067427690039,
"loss": 3.0965702056884767,
"num_input_tokens_seen": 4325376000,
"step": 8250,
"train_runtime": 37493.242,
"train_tokens_per_second": 115364.15
},
{
"epoch": 0.4469817906328634,
"grad_norm": 0.16154596209526062,
"learning_rate": 0.0032302273924456966,
"loss": 3.0933055877685547,
"num_input_tokens_seen": 4330618880,
"step": 8260,
"train_runtime": 37538.3815,
"train_tokens_per_second": 115365.093
},
{
"epoch": 0.4475229308152277,
"grad_norm": 0.1575249284505844,
"learning_rate": 0.003226385891089219,
"loss": 3.0924747467041014,
"num_input_tokens_seen": 4335861760,
"step": 8270,
"train_runtime": 37583.5299,
"train_tokens_per_second": 115366.007
},
{
"epoch": 0.44806407099759193,
"grad_norm": 0.160599023103714,
"learning_rate": 0.0032225429353485296,
"loss": 3.096691131591797,
"num_input_tokens_seen": 4341104640,
"step": 8280,
"train_runtime": 37628.6879,
"train_tokens_per_second": 115366.889
},
{
"epoch": 0.44860521117995616,
"grad_norm": 0.15803949534893036,
"learning_rate": 0.003218698536955999,
"loss": 3.1002126693725587,
"num_input_tokens_seen": 4346347520,
"step": 8290,
"train_runtime": 37673.8359,
"train_tokens_per_second": 115367.799
},
{
"epoch": 0.4491463513623204,
"grad_norm": 0.1458759903907776,
"learning_rate": 0.0032148527076483963,
"loss": 3.0890472412109373,
"num_input_tokens_seen": 4351590400,
"step": 8300,
"train_runtime": 37718.9873,
"train_tokens_per_second": 115368.697
},
{
"epoch": 0.4496874915446846,
"grad_norm": 0.15396055579185486,
"learning_rate": 0.0032110054591668624,
"loss": 3.0894855499267577,
"num_input_tokens_seen": 4356833280,
"step": 8310,
"train_runtime": 37764.1751,
"train_tokens_per_second": 115369.481
},
{
"epoch": 0.4502286317270489,
"grad_norm": 0.15826280415058136,
"learning_rate": 0.0032071568032568704,
"loss": 3.1003223419189454,
"num_input_tokens_seen": 4362076160,
"step": 8320,
"train_runtime": 37809.3568,
"train_tokens_per_second": 115370.282
},
{
"epoch": 0.45076977190941314,
"grad_norm": 0.16446684300899506,
"learning_rate": 0.003203306751668188,
"loss": 3.093168258666992,
"num_input_tokens_seen": 4367319040,
"step": 8330,
"train_runtime": 37854.5343,
"train_tokens_per_second": 115371.094
},
{
"epoch": 0.45131091209177737,
"grad_norm": 0.1580137461423874,
"learning_rate": 0.0031994553161548474,
"loss": 3.101323699951172,
"num_input_tokens_seen": 4372561920,
"step": 8340,
"train_runtime": 37899.7112,
"train_tokens_per_second": 115371.906
},
{
"epoch": 0.4518520522741416,
"grad_norm": 0.15007439255714417,
"learning_rate": 0.003195602508475103,
"loss": 3.0974876403808596,
"num_input_tokens_seen": 4377804800,
"step": 8350,
"train_runtime": 37944.8699,
"train_tokens_per_second": 115372.771
},
{
"epoch": 0.45239319245650583,
"grad_norm": 0.16612504422664642,
"learning_rate": 0.0031917483403914,
"loss": 3.097567558288574,
"num_input_tokens_seen": 4383047680,
"step": 8360,
"train_runtime": 37990.0345,
"train_tokens_per_second": 115373.617
},
{
"epoch": 0.4529343326388701,
"grad_norm": 0.152009978890419,
"learning_rate": 0.0031878928236703354,
"loss": 3.09008674621582,
"num_input_tokens_seen": 4388290560,
"step": 8370,
"train_runtime": 38035.2108,
"train_tokens_per_second": 115374.425
},
{
"epoch": 0.45347547282123435,
"grad_norm": 0.14635391533374786,
"learning_rate": 0.003184035970082625,
"loss": 3.0835281372070313,
"num_input_tokens_seen": 4393533440,
"step": 8380,
"train_runtime": 38080.3814,
"train_tokens_per_second": 115375.248
},
{
"epoch": 0.4540166130035986,
"grad_norm": 0.16529026627540588,
"learning_rate": 0.0031801777914030657,
"loss": 3.0935291290283202,
"num_input_tokens_seen": 4398776320,
"step": 8390,
"train_runtime": 38125.5235,
"train_tokens_per_second": 115376.155
},
{
"epoch": 0.4545577531859628,
"grad_norm": 0.15716882050037384,
"learning_rate": 0.003176318299410499,
"loss": 3.0900102615356446,
"num_input_tokens_seen": 4404019200,
"step": 8400,
"train_runtime": 38175.1087,
"train_tokens_per_second": 115363.632
},
{
"epoch": 0.45509889336832704,
"grad_norm": 0.15623128414154053,
"learning_rate": 0.003172457505887777,
"loss": 3.0833271026611326,
"num_input_tokens_seen": 4409262080,
"step": 8410,
"train_runtime": 38220.3198,
"train_tokens_per_second": 115364.343
},
{
"epoch": 0.4556400335506913,
"grad_norm": 0.1591528207063675,
"learning_rate": 0.0031685954226217234,
"loss": 3.0901105880737303,
"num_input_tokens_seen": 4414504960,
"step": 8420,
"train_runtime": 38265.5513,
"train_tokens_per_second": 115364.99
},
{
"epoch": 0.45618117373305556,
"grad_norm": 0.16141286492347717,
"learning_rate": 0.003164732061403102,
"loss": 3.0906259536743166,
"num_input_tokens_seen": 4419747840,
"step": 8430,
"train_runtime": 38310.7985,
"train_tokens_per_second": 115365.589
},
{
"epoch": 0.4567223139154198,
"grad_norm": 0.15204599499702454,
"learning_rate": 0.0031608674340265768,
"loss": 3.084097671508789,
"num_input_tokens_seen": 4424990720,
"step": 8440,
"train_runtime": 38356.0563,
"train_tokens_per_second": 115366.154
},
{
"epoch": 0.457263454097784,
"grad_norm": 0.15592141449451447,
"learning_rate": 0.003157001552290677,
"loss": 3.0875980377197267,
"num_input_tokens_seen": 4430233600,
"step": 8450,
"train_runtime": 38401.2772,
"train_tokens_per_second": 115366.829
},
{
"epoch": 0.45780459428014825,
"grad_norm": 0.15805137157440186,
"learning_rate": 0.0031531344279977615,
"loss": 3.0840667724609374,
"num_input_tokens_seen": 4435476480,
"step": 8460,
"train_runtime": 38446.5182,
"train_tokens_per_second": 115367.443
},
{
"epoch": 0.45834573446251253,
"grad_norm": 0.1569671630859375,
"learning_rate": 0.003149266072953983,
"loss": 3.095382308959961,
"num_input_tokens_seen": 4440719360,
"step": 8470,
"train_runtime": 38491.7485,
"train_tokens_per_second": 115368.086
},
{
"epoch": 0.45888687464487676,
"grad_norm": 0.15263330936431885,
"learning_rate": 0.0031453964989692517,
"loss": 3.0909893035888674,
"num_input_tokens_seen": 4445962240,
"step": 8480,
"train_runtime": 38536.9781,
"train_tokens_per_second": 115368.73
},
{
"epoch": 0.459428014827241,
"grad_norm": 0.16741898655891418,
"learning_rate": 0.0031415257178571986,
"loss": 3.091363525390625,
"num_input_tokens_seen": 4451205120,
"step": 8490,
"train_runtime": 38582.2145,
"train_tokens_per_second": 115369.353
},
{
"epoch": 0.4599691550096052,
"grad_norm": 0.1621858924627304,
"learning_rate": 0.0031376537414351414,
"loss": 3.0860706329345704,
"num_input_tokens_seen": 4456448000,
"step": 8500,
"train_runtime": 38627.4547,
"train_tokens_per_second": 115369.963
},
{
"epoch": 0.4599691550096052,
"eval_loss": 3.044252634048462,
"eval_runtime": 1.9941,
"eval_samples_per_second": 250.745,
"eval_steps_per_second": 4.012,
"num_input_tokens_seen": 4456448000,
"step": 8500
},
{
"epoch": 0.46051029519196945,
"grad_norm": 0.1537708044052124,
"learning_rate": 0.0031337805815240443,
"loss": 3.0971357345581056,
"num_input_tokens_seen": 4461690880,
"step": 8510,
"train_runtime": 38674.6947,
"train_tokens_per_second": 115364.605
},
{
"epoch": 0.46105143537433374,
"grad_norm": 0.1561785489320755,
"learning_rate": 0.0031299062499484886,
"loss": 3.095275115966797,
"num_input_tokens_seen": 4466933760,
"step": 8520,
"train_runtime": 38719.9322,
"train_tokens_per_second": 115365.227
},
{
"epoch": 0.46159257555669797,
"grad_norm": 0.15904352068901062,
"learning_rate": 0.0031260307585366277,
"loss": 3.093882942199707,
"num_input_tokens_seen": 4472176640,
"step": 8530,
"train_runtime": 38765.1761,
"train_tokens_per_second": 115365.828
},
{
"epoch": 0.4621337157390622,
"grad_norm": 0.1586320400238037,
"learning_rate": 0.00312215411912016,
"loss": 3.0862545013427733,
"num_input_tokens_seen": 4477419520,
"step": 8540,
"train_runtime": 38810.3531,
"train_tokens_per_second": 115366.627
},
{
"epoch": 0.46267485592142643,
"grad_norm": 0.15955059230327606,
"learning_rate": 0.003118276343534288,
"loss": 3.09029598236084,
"num_input_tokens_seen": 4482662400,
"step": 8550,
"train_runtime": 38855.5252,
"train_tokens_per_second": 115367.438
},
{
"epoch": 0.46321599610379066,
"grad_norm": 0.16844794154167175,
"learning_rate": 0.0031143974436176804,
"loss": 3.08276252746582,
"num_input_tokens_seen": 4487905280,
"step": 8560,
"train_runtime": 38900.7107,
"train_tokens_per_second": 115368.208
},
{
"epoch": 0.46375713628615495,
"grad_norm": 0.15490221977233887,
"learning_rate": 0.003110517431212442,
"loss": 3.096157455444336,
"num_input_tokens_seen": 4493148160,
"step": 8570,
"train_runtime": 38945.895,
"train_tokens_per_second": 115368.979
},
{
"epoch": 0.4642982764685192,
"grad_norm": 0.1643703430891037,
"learning_rate": 0.0031066363181640705,
"loss": 3.094961929321289,
"num_input_tokens_seen": 4498391040,
"step": 8580,
"train_runtime": 38991.0775,
"train_tokens_per_second": 115369.754
},
{
"epoch": 0.4648394166508834,
"grad_norm": 0.16452111303806305,
"learning_rate": 0.003102754116321427,
"loss": 3.0949285507202147,
"num_input_tokens_seen": 4503633920,
"step": 8590,
"train_runtime": 39036.2496,
"train_tokens_per_second": 115370.558
},
{
"epoch": 0.46538055683324764,
"grad_norm": 0.15409614145755768,
"learning_rate": 0.003098870837536694,
"loss": 3.083492660522461,
"num_input_tokens_seen": 4508876800,
"step": 8600,
"train_runtime": 39081.4455,
"train_tokens_per_second": 115371.29
},
{
"epoch": 0.46592169701561187,
"grad_norm": 0.16283227503299713,
"learning_rate": 0.0030949864936653444,
"loss": 3.0859600067138673,
"num_input_tokens_seen": 4514119680,
"step": 8610,
"train_runtime": 39126.6271,
"train_tokens_per_second": 115372.063
},
{
"epoch": 0.46646283719797615,
"grad_norm": 0.15932060778141022,
"learning_rate": 0.0030911010965660995,
"loss": 3.0858314514160154,
"num_input_tokens_seen": 4519362560,
"step": 8620,
"train_runtime": 39171.8041,
"train_tokens_per_second": 115372.847
},
{
"epoch": 0.4670039773803404,
"grad_norm": 0.15321630239486694,
"learning_rate": 0.0030872146581008993,
"loss": 3.0855281829833983,
"num_input_tokens_seen": 4524605440,
"step": 8630,
"train_runtime": 39217.0067,
"train_tokens_per_second": 115373.554
},
{
"epoch": 0.4675451175627046,
"grad_norm": 0.15142542123794556,
"learning_rate": 0.0030833271901348604,
"loss": 3.0922718048095703,
"num_input_tokens_seen": 4529848320,
"step": 8640,
"train_runtime": 39262.1839,
"train_tokens_per_second": 115374.334
},
{
"epoch": 0.46808625774506885,
"grad_norm": 0.15679921209812164,
"learning_rate": 0.0030794387045362448,
"loss": 3.089971923828125,
"num_input_tokens_seen": 4535091200,
"step": 8650,
"train_runtime": 39307.3714,
"train_tokens_per_second": 115375.082
},
{
"epoch": 0.4686273979274331,
"grad_norm": 0.149771049618721,
"learning_rate": 0.0030755492131764196,
"loss": 3.0910947799682615,
"num_input_tokens_seen": 4540334080,
"step": 8660,
"train_runtime": 39352.6168,
"train_tokens_per_second": 115375.659
},
{
"epoch": 0.46916853810979736,
"grad_norm": 0.15804412961006165,
"learning_rate": 0.003071658727929823,
"loss": 3.096923065185547,
"num_input_tokens_seen": 4545576960,
"step": 8670,
"train_runtime": 39397.8863,
"train_tokens_per_second": 115376.163
},
{
"epoch": 0.4697096782921616,
"grad_norm": 0.17401130497455597,
"learning_rate": 0.003067767260673929,
"loss": 3.0941158294677735,
"num_input_tokens_seen": 4550819840,
"step": 8680,
"train_runtime": 39443.148,
"train_tokens_per_second": 115376.69
},
{
"epoch": 0.4702508184745258,
"grad_norm": 0.16347981989383698,
"learning_rate": 0.003063874823289205,
"loss": 3.0893718719482424,
"num_input_tokens_seen": 4556062720,
"step": 8690,
"train_runtime": 39488.3966,
"train_tokens_per_second": 115377.253
},
{
"epoch": 0.47079195865689005,
"grad_norm": 0.16059865057468414,
"learning_rate": 0.003059981427659086,
"loss": 3.0792430877685546,
"num_input_tokens_seen": 4561305600,
"step": 8700,
"train_runtime": 39533.6411,
"train_tokens_per_second": 115377.827
},
{
"epoch": 0.4713330988392543,
"grad_norm": 0.15623228251934052,
"learning_rate": 0.0030560870856699285,
"loss": 3.0796392440795897,
"num_input_tokens_seen": 4566548480,
"step": 8710,
"train_runtime": 39578.8987,
"train_tokens_per_second": 115378.361
},
{
"epoch": 0.47187423902161857,
"grad_norm": 0.17271380126476288,
"learning_rate": 0.003052191809210979,
"loss": 3.0749179840087892,
"num_input_tokens_seen": 4571791360,
"step": 8720,
"train_runtime": 39624.1426,
"train_tokens_per_second": 115378.935
},
{
"epoch": 0.4724153792039828,
"grad_norm": 0.1617797613143921,
"learning_rate": 0.0030482956101743385,
"loss": 3.077177047729492,
"num_input_tokens_seen": 4577034240,
"step": 8730,
"train_runtime": 39669.3874,
"train_tokens_per_second": 115379.504
},
{
"epoch": 0.47295651938634703,
"grad_norm": 0.15339480340480804,
"learning_rate": 0.0030443985004549234,
"loss": 3.0854717254638673,
"num_input_tokens_seen": 4582277120,
"step": 8740,
"train_runtime": 39714.6736,
"train_tokens_per_second": 115379.952
},
{
"epoch": 0.47349765956871126,
"grad_norm": 0.1538633406162262,
"learning_rate": 0.00304050049195043,
"loss": 3.0906457901000977,
"num_input_tokens_seen": 4587520000,
"step": 8750,
"train_runtime": 39759.8518,
"train_tokens_per_second": 115380.712
},
{
"epoch": 0.4740387997510755,
"grad_norm": 0.16446056962013245,
"learning_rate": 0.0030366015965612976,
"loss": 3.0834827423095703,
"num_input_tokens_seen": 4592762880,
"step": 8760,
"train_runtime": 39805.0284,
"train_tokens_per_second": 115381.475
},
{
"epoch": 0.4745799399334398,
"grad_norm": 0.15852907299995422,
"learning_rate": 0.003032701826190677,
"loss": 3.077737808227539,
"num_input_tokens_seen": 4598005760,
"step": 8770,
"train_runtime": 39850.2201,
"train_tokens_per_second": 115382.192
},
{
"epoch": 0.475121080115804,
"grad_norm": 0.15762916207313538,
"learning_rate": 0.003028801192744386,
"loss": 3.074782943725586,
"num_input_tokens_seen": 4603248640,
"step": 8780,
"train_runtime": 39899.2419,
"train_tokens_per_second": 115371.832
},
{
"epoch": 0.47566222029816824,
"grad_norm": 0.1619185209274292,
"learning_rate": 0.0030248997081308788,
"loss": 3.0825977325439453,
"num_input_tokens_seen": 4608491520,
"step": 8790,
"train_runtime": 39944.5192,
"train_tokens_per_second": 115372.312
},
{
"epoch": 0.47620336048053247,
"grad_norm": 0.16285711526870728,
"learning_rate": 0.0030209973842612097,
"loss": 3.080776405334473,
"num_input_tokens_seen": 4613734400,
"step": 8800,
"train_runtime": 39989.7251,
"train_tokens_per_second": 115372.996
},
{
"epoch": 0.4767445006628967,
"grad_norm": 0.17198577523231506,
"learning_rate": 0.003017094233048994,
"loss": 3.0829303741455076,
"num_input_tokens_seen": 4618977280,
"step": 8810,
"train_runtime": 40034.9384,
"train_tokens_per_second": 115373.658
},
{
"epoch": 0.477285640845261,
"grad_norm": 0.16893431544303894,
"learning_rate": 0.003013190266410372,
"loss": 3.0930507659912108,
"num_input_tokens_seen": 4624220160,
"step": 8820,
"train_runtime": 40080.146,
"train_tokens_per_second": 115374.334
},
{
"epoch": 0.4778267810276252,
"grad_norm": 0.1534847915172577,
"learning_rate": 0.003009285496263973,
"loss": 3.086047554016113,
"num_input_tokens_seen": 4629463040,
"step": 8830,
"train_runtime": 40125.3234,
"train_tokens_per_second": 115375.096
},
{
"epoch": 0.47836792120998944,
"grad_norm": 0.16068360209465027,
"learning_rate": 0.003005379934530884,
"loss": 3.0864025115966798,
"num_input_tokens_seen": 4634705920,
"step": 8840,
"train_runtime": 40170.4754,
"train_tokens_per_second": 115375.929
},
{
"epoch": 0.4789090613923537,
"grad_norm": 0.1436556577682495,
"learning_rate": 0.003001473593134602,
"loss": 3.0830524444580076,
"num_input_tokens_seen": 4639948800,
"step": 8850,
"train_runtime": 40215.6345,
"train_tokens_per_second": 115376.74
},
{
"epoch": 0.4794502015747179,
"grad_norm": 0.14522488415241241,
"learning_rate": 0.0029975664840010104,
"loss": 3.0799121856689453,
"num_input_tokens_seen": 4645191680,
"step": 8860,
"train_runtime": 40260.7882,
"train_tokens_per_second": 115377.564
},
{
"epoch": 0.4799913417570822,
"grad_norm": 0.16195201873779297,
"learning_rate": 0.002993658619058331,
"loss": 3.071552848815918,
"num_input_tokens_seen": 4650434560,
"step": 8870,
"train_runtime": 40305.9418,
"train_tokens_per_second": 115378.387
},
{
"epoch": 0.4805324819394464,
"grad_norm": 0.14948424696922302,
"learning_rate": 0.0029897500102370974,
"loss": 3.0818138122558594,
"num_input_tokens_seen": 4655677440,
"step": 8880,
"train_runtime": 40351.0976,
"train_tokens_per_second": 115379.202
},
{
"epoch": 0.48107362212181065,
"grad_norm": 0.1461019665002823,
"learning_rate": 0.0029858406694701117,
"loss": 3.082274627685547,
"num_input_tokens_seen": 4660920320,
"step": 8890,
"train_runtime": 40396.2463,
"train_tokens_per_second": 115380.035
},
{
"epoch": 0.4816147623041749,
"grad_norm": 0.1501043438911438,
"learning_rate": 0.0029819306086924127,
"loss": 3.083462142944336,
"num_input_tokens_seen": 4666163200,
"step": 8900,
"train_runtime": 40441.3988,
"train_tokens_per_second": 115380.856
},
{
"epoch": 0.4821559024865391,
"grad_norm": 0.15929782390594482,
"learning_rate": 0.002978019839841233,
"loss": 3.0869064331054688,
"num_input_tokens_seen": 4671406080,
"step": 8910,
"train_runtime": 40486.5499,
"train_tokens_per_second": 115381.678
},
{
"epoch": 0.4826970426689034,
"grad_norm": 0.15982982516288757,
"learning_rate": 0.002974108374855974,
"loss": 3.082635688781738,
"num_input_tokens_seen": 4676648960,
"step": 8920,
"train_runtime": 40531.6946,
"train_tokens_per_second": 115382.518
},
{
"epoch": 0.48323818285126763,
"grad_norm": 0.15398921072483063,
"learning_rate": 0.0029701962256781555,
"loss": 3.069881820678711,
"num_input_tokens_seen": 4681891840,
"step": 8930,
"train_runtime": 40576.8387,
"train_tokens_per_second": 115383.356
},
{
"epoch": 0.48377932303363186,
"grad_norm": 0.16303351521492004,
"learning_rate": 0.0029662834042513903,
"loss": 3.078609085083008,
"num_input_tokens_seen": 4687134720,
"step": 8940,
"train_runtime": 40621.9936,
"train_tokens_per_second": 115384.163
},
{
"epoch": 0.4843204632159961,
"grad_norm": 0.16261766850948334,
"learning_rate": 0.0029623699225213417,
"loss": 3.072034454345703,
"num_input_tokens_seen": 4692377600,
"step": 8950,
"train_runtime": 40667.1772,
"train_tokens_per_second": 115384.886
},
{
"epoch": 0.4848616033983603,
"grad_norm": 0.14818759262561798,
"learning_rate": 0.002958455792435689,
"loss": 3.077336883544922,
"num_input_tokens_seen": 4697620480,
"step": 8960,
"train_runtime": 40712.3367,
"train_tokens_per_second": 115385.676
},
{
"epoch": 0.4854027435807246,
"grad_norm": 0.1536484807729721,
"learning_rate": 0.002954541025944093,
"loss": 3.0622703552246096,
"num_input_tokens_seen": 4702863360,
"step": 8970,
"train_runtime": 40757.516,
"train_tokens_per_second": 115386.408
},
{
"epoch": 0.48594388376308884,
"grad_norm": 0.1563226282596588,
"learning_rate": 0.002950625634998154,
"loss": 3.0665721893310547,
"num_input_tokens_seen": 4708106240,
"step": 8980,
"train_runtime": 40802.7032,
"train_tokens_per_second": 115387.116
},
{
"epoch": 0.48648502394545307,
"grad_norm": 0.15256533026695251,
"learning_rate": 0.0029467096315513802,
"loss": 3.0700511932373047,
"num_input_tokens_seen": 4713349120,
"step": 8990,
"train_runtime": 40847.8776,
"train_tokens_per_second": 115387.859
},
{
"epoch": 0.4870261641278173,
"grad_norm": 0.1605551838874817,
"learning_rate": 0.0029427930275591515,
"loss": 3.076490592956543,
"num_input_tokens_seen": 4718592000,
"step": 9000,
"train_runtime": 40893.0303,
"train_tokens_per_second": 115388.661
},
{
"epoch": 0.4870261641278173,
"eval_loss": 3.034029483795166,
"eval_runtime": 1.9847,
"eval_samples_per_second": 251.93,
"eval_steps_per_second": 4.031,
"num_input_tokens_seen": 4718592000,
"step": 9000
},
{
"epoch": 0.4875673043101815,
"grad_norm": 0.16099503636360168,
"learning_rate": 0.0029388758349786787,
"loss": 3.081180953979492,
"num_input_tokens_seen": 4723834880,
"step": 9010,
"train_runtime": 40942.5164,
"train_tokens_per_second": 115377.248
},
{
"epoch": 0.4881084444925458,
"grad_norm": 0.1605864018201828,
"learning_rate": 0.0029349580657689707,
"loss": 3.0802078247070312,
"num_input_tokens_seen": 4729077760,
"step": 9020,
"train_runtime": 40987.6712,
"train_tokens_per_second": 115378.054
},
{
"epoch": 0.48864958467491004,
"grad_norm": 0.15125182271003723,
"learning_rate": 0.0029310397318907965,
"loss": 3.090005111694336,
"num_input_tokens_seen": 4734320640,
"step": 9030,
"train_runtime": 41032.8355,
"train_tokens_per_second": 115378.832
},
{
"epoch": 0.4891907248572743,
"grad_norm": 0.16026070713996887,
"learning_rate": 0.002927120845306649,
"loss": 3.087236785888672,
"num_input_tokens_seen": 4739563520,
"step": 9040,
"train_runtime": 41077.9854,
"train_tokens_per_second": 115379.649
},
{
"epoch": 0.4897318650396385,
"grad_norm": 0.1533481776714325,
"learning_rate": 0.0029232014179807098,
"loss": 3.0772159576416014,
"num_input_tokens_seen": 4744806400,
"step": 9050,
"train_runtime": 41123.1477,
"train_tokens_per_second": 115380.429
},
{
"epoch": 0.49027300522200273,
"grad_norm": 0.16315831243991852,
"learning_rate": 0.002919281461878809,
"loss": 3.080950927734375,
"num_input_tokens_seen": 4750049280,
"step": 9060,
"train_runtime": 41168.3038,
"train_tokens_per_second": 115381.224
},
{
"epoch": 0.490814145404367,
"grad_norm": 0.16064807772636414,
"learning_rate": 0.0029153609889683934,
"loss": 3.077931594848633,
"num_input_tokens_seen": 4755292160,
"step": 9070,
"train_runtime": 41213.4562,
"train_tokens_per_second": 115382.028
},
{
"epoch": 0.49135528558673125,
"grad_norm": 0.15968522429466248,
"learning_rate": 0.0029114400112184857,
"loss": 3.0715621948242187,
"num_input_tokens_seen": 4760535040,
"step": 9080,
"train_runtime": 41258.6304,
"train_tokens_per_second": 115382.769
},
{
"epoch": 0.4918964257690955,
"grad_norm": 0.21646282076835632,
"learning_rate": 0.0029075185405996497,
"loss": 3.070268249511719,
"num_input_tokens_seen": 4765777920,
"step": 9090,
"train_runtime": 41303.7917,
"train_tokens_per_second": 115383.545
},
{
"epoch": 0.4924375659514597,
"grad_norm": 0.14861001074314117,
"learning_rate": 0.0029035965890839566,
"loss": 3.0772144317626955,
"num_input_tokens_seen": 4771020800,
"step": 9100,
"train_runtime": 41348.9385,
"train_tokens_per_second": 115384.36
},
{
"epoch": 0.49297870613382394,
"grad_norm": 0.15690362453460693,
"learning_rate": 0.0028996741686449427,
"loss": 3.079457092285156,
"num_input_tokens_seen": 4776263680,
"step": 9110,
"train_runtime": 41394.0963,
"train_tokens_per_second": 115385.142
},
{
"epoch": 0.4935198463161882,
"grad_norm": 0.1561357080936432,
"learning_rate": 0.0028957512912575777,
"loss": 3.081951141357422,
"num_input_tokens_seen": 4781506560,
"step": 9120,
"train_runtime": 41439.2561,
"train_tokens_per_second": 115385.917
},
{
"epoch": 0.49406098649855246,
"grad_norm": 0.15572723746299744,
"learning_rate": 0.002891827968898225,
"loss": 3.0684499740600586,
"num_input_tokens_seen": 4786749440,
"step": 9130,
"train_runtime": 41484.4245,
"train_tokens_per_second": 115386.666
},
{
"epoch": 0.4946021266809167,
"grad_norm": 0.14807617664337158,
"learning_rate": 0.0028879042135446092,
"loss": 3.0712486267089845,
"num_input_tokens_seen": 4791992320,
"step": 9140,
"train_runtime": 41529.5787,
"train_tokens_per_second": 115387.453
},
{
"epoch": 0.4951432668632809,
"grad_norm": 0.15159912407398224,
"learning_rate": 0.0028839800371757724,
"loss": 3.0685661315917967,
"num_input_tokens_seen": 4797235200,
"step": 9150,
"train_runtime": 41574.7343,
"train_tokens_per_second": 115388.235
},
{
"epoch": 0.49568440704564515,
"grad_norm": 0.15283788740634918,
"learning_rate": 0.0028800554517720467,
"loss": 3.066938591003418,
"num_input_tokens_seen": 4802478080,
"step": 9160,
"train_runtime": 41623.4446,
"train_tokens_per_second": 115379.16
},
{
"epoch": 0.49622554722800943,
"grad_norm": 0.14532612264156342,
"learning_rate": 0.0028761304693150093,
"loss": 3.0726764678955076,
"num_input_tokens_seen": 4807720960,
"step": 9170,
"train_runtime": 41668.6161,
"train_tokens_per_second": 115379.905
},
{
"epoch": 0.49676668741037366,
"grad_norm": 0.15642227232456207,
"learning_rate": 0.0028722051017874514,
"loss": 3.075974464416504,
"num_input_tokens_seen": 4812963840,
"step": 9180,
"train_runtime": 41713.7758,
"train_tokens_per_second": 115380.681
},
{
"epoch": 0.4973078275927379,
"grad_norm": 0.15629522502422333,
"learning_rate": 0.00286827936117334,
"loss": 3.0699131011962892,
"num_input_tokens_seen": 4818206720,
"step": 9190,
"train_runtime": 41758.9432,
"train_tokens_per_second": 115381.433
},
{
"epoch": 0.4978489677751021,
"grad_norm": 0.15175525844097137,
"learning_rate": 0.00286435325945778,
"loss": 3.0690542221069337,
"num_input_tokens_seen": 4823449600,
"step": 9200,
"train_runtime": 41804.1291,
"train_tokens_per_second": 115382.133
},
{
"epoch": 0.49839010795746636,
"grad_norm": 0.14598695933818817,
"learning_rate": 0.0028604268086269793,
"loss": 3.072031021118164,
"num_input_tokens_seen": 4828692480,
"step": 9210,
"train_runtime": 41849.2968,
"train_tokens_per_second": 115382.882
},
{
"epoch": 0.49893124813983064,
"grad_norm": 0.14812366664409637,
"learning_rate": 0.0028565000206682125,
"loss": 3.074822998046875,
"num_input_tokens_seen": 4833935360,
"step": 9220,
"train_runtime": 41894.4632,
"train_tokens_per_second": 115383.633
},
{
"epoch": 0.49947238832219487,
"grad_norm": 0.1581128090620041,
"learning_rate": 0.0028525729075697813,
"loss": 3.071183967590332,
"num_input_tokens_seen": 4839178240,
"step": 9230,
"train_runtime": 41939.6385,
"train_tokens_per_second": 115384.357
},
{
"epoch": 0.5000135285045592,
"grad_norm": 0.160576730966568,
"learning_rate": 0.002848645481320983,
"loss": 3.079146385192871,
"num_input_tokens_seen": 4844421120,
"step": 9240,
"train_runtime": 41984.8089,
"train_tokens_per_second": 115385.094
},
{
"epoch": 0.5005546686869233,
"grad_norm": 0.1423659771680832,
"learning_rate": 0.002844717753912068,
"loss": 3.0759227752685545,
"num_input_tokens_seen": 4849664000,
"step": 9250,
"train_runtime": 42029.9847,
"train_tokens_per_second": 115385.814
},
{
"epoch": 0.5010958088692876,
"grad_norm": 0.14177994430065155,
"learning_rate": 0.0028407897373342074,
"loss": 3.076811599731445,
"num_input_tokens_seen": 4854906880,
"step": 9260,
"train_runtime": 42075.1549,
"train_tokens_per_second": 115386.548
},
{
"epoch": 0.5016369490516518,
"grad_norm": 0.14039497077465057,
"learning_rate": 0.002836861443579456,
"loss": 3.0762613296508787,
"num_input_tokens_seen": 4860149760,
"step": 9270,
"train_runtime": 42120.3369,
"train_tokens_per_second": 115387.248
},
{
"epoch": 0.5021780892340161,
"grad_norm": 0.15003980696201324,
"learning_rate": 0.0028329328846407125,
"loss": 3.0661956787109377,
"num_input_tokens_seen": 4865392640,
"step": 9280,
"train_runtime": 42165.525,
"train_tokens_per_second": 115387.93
},
{
"epoch": 0.5027192294163804,
"grad_norm": 0.1639668196439743,
"learning_rate": 0.0028290040725116876,
"loss": 3.077253723144531,
"num_input_tokens_seen": 4870635520,
"step": 9290,
"train_runtime": 42210.7142,
"train_tokens_per_second": 115388.607
},
{
"epoch": 0.5032603695987445,
"grad_norm": 0.15424971282482147,
"learning_rate": 0.002825075019186865,
"loss": 3.0679557800292967,
"num_input_tokens_seen": 4875878400,
"step": 9300,
"train_runtime": 42255.9028,
"train_tokens_per_second": 115389.285
},
{
"epoch": 0.5038015097811088,
"grad_norm": 0.1511068195104599,
"learning_rate": 0.0028211457366614607,
"loss": 3.0695865631103514,
"num_input_tokens_seen": 4881121280,
"step": 9310,
"train_runtime": 42301.0768,
"train_tokens_per_second": 115390.001
},
{
"epoch": 0.504342649963473,
"grad_norm": 0.15356752276420593,
"learning_rate": 0.002817216236931397,
"loss": 3.073322296142578,
"num_input_tokens_seen": 4886364160,
"step": 9320,
"train_runtime": 42346.2459,
"train_tokens_per_second": 115390.728
},
{
"epoch": 0.5048837901458373,
"grad_norm": 0.14986173808574677,
"learning_rate": 0.002813286531993253,
"loss": 3.07531681060791,
"num_input_tokens_seen": 4891607040,
"step": 9330,
"train_runtime": 42391.4328,
"train_tokens_per_second": 115391.406
},
{
"epoch": 0.5054249303282016,
"grad_norm": 0.14537614583969116,
"learning_rate": 0.0028093566338442395,
"loss": 3.0746026992797852,
"num_input_tokens_seen": 4896849920,
"step": 9340,
"train_runtime": 42436.5896,
"train_tokens_per_second": 115392.164
},
{
"epoch": 0.5059660705105657,
"grad_norm": 0.15007568895816803,
"learning_rate": 0.0028054265544821522,
"loss": 3.0845333099365235,
"num_input_tokens_seen": 4902092800,
"step": 9350,
"train_runtime": 42481.7468,
"train_tokens_per_second": 115392.92
},
{
"epoch": 0.50650721069293,
"grad_norm": 0.15155982971191406,
"learning_rate": 0.0028014963059053446,
"loss": 3.0744888305664064,
"num_input_tokens_seen": 4907335680,
"step": 9360,
"train_runtime": 42526.8939,
"train_tokens_per_second": 115393.701
},
{
"epoch": 0.5070483508752942,
"grad_norm": 0.15760909020900726,
"learning_rate": 0.002797565900112684,
"loss": 3.0650793075561524,
"num_input_tokens_seen": 4912578560,
"step": 9370,
"train_runtime": 42572.0476,
"train_tokens_per_second": 115394.463
},
{
"epoch": 0.5075894910576585,
"grad_norm": 0.156438410282135,
"learning_rate": 0.0027936353491035183,
"loss": 3.0668895721435545,
"num_input_tokens_seen": 4917821440,
"step": 9380,
"train_runtime": 42617.1956,
"train_tokens_per_second": 115395.238
},
{
"epoch": 0.5081306312400228,
"grad_norm": 0.16406750679016113,
"learning_rate": 0.0027897046648776395,
"loss": 3.061408042907715,
"num_input_tokens_seen": 4923064320,
"step": 9390,
"train_runtime": 42662.3399,
"train_tokens_per_second": 115396.022
},
{
"epoch": 0.508671771422387,
"grad_norm": 0.13937264680862427,
"learning_rate": 0.002785773859435245,
"loss": 3.069793128967285,
"num_input_tokens_seen": 4928307200,
"step": 9400,
"train_runtime": 42707.5213,
"train_tokens_per_second": 115396.704
},
{
"epoch": 0.5092129116047512,
"grad_norm": 0.15395483374595642,
"learning_rate": 0.0027818429447769044,
"loss": 3.071869659423828,
"num_input_tokens_seen": 4933550080,
"step": 9410,
"train_runtime": 42752.6998,
"train_tokens_per_second": 115397.392
},
{
"epoch": 0.5097540517871154,
"grad_norm": 0.15119874477386475,
"learning_rate": 0.0027779119329035167,
"loss": 3.067423629760742,
"num_input_tokens_seen": 4938792960,
"step": 9420,
"train_runtime": 42797.8655,
"train_tokens_per_second": 115398.114
},
{
"epoch": 0.5102951919694797,
"grad_norm": 0.1615118384361267,
"learning_rate": 0.002773980835816284,
"loss": 3.0653512954711912,
"num_input_tokens_seen": 4944035840,
"step": 9430,
"train_runtime": 42843.0316,
"train_tokens_per_second": 115398.833
},
{
"epoch": 0.510836332151844,
"grad_norm": 0.16538918018341064,
"learning_rate": 0.0027700496655166614,
"loss": 3.067237663269043,
"num_input_tokens_seen": 4949278720,
"step": 9440,
"train_runtime": 42888.2044,
"train_tokens_per_second": 115399.532
},
{
"epoch": 0.5113774723342082,
"grad_norm": 0.14385169744491577,
"learning_rate": 0.002766118434006332,
"loss": 3.078049087524414,
"num_input_tokens_seen": 4954521600,
"step": 9450,
"train_runtime": 42933.3584,
"train_tokens_per_second": 115400.281
},
{
"epoch": 0.5119186125165724,
"grad_norm": 0.15073780715465546,
"learning_rate": 0.0027621871532871657,
"loss": 3.06368350982666,
"num_input_tokens_seen": 4959764480,
"step": 9460,
"train_runtime": 42978.5149,
"train_tokens_per_second": 115401.02
},
{
"epoch": 0.5124597526989366,
"grad_norm": 0.15621446073055267,
"learning_rate": 0.0027582558353611802,
"loss": 3.0653354644775392,
"num_input_tokens_seen": 4965007360,
"step": 9470,
"train_runtime": 43023.685,
"train_tokens_per_second": 115401.722
},
{
"epoch": 0.5130008928813009,
"grad_norm": 0.15570929646492004,
"learning_rate": 0.0027543244922305105,
"loss": 3.0613819122314454,
"num_input_tokens_seen": 4970250240,
"step": 9480,
"train_runtime": 43068.8573,
"train_tokens_per_second": 115402.417
},
{
"epoch": 0.5135420330636652,
"grad_norm": 0.1389538198709488,
"learning_rate": 0.0027503931358973644,
"loss": 3.0687282562255858,
"num_input_tokens_seen": 4975493120,
"step": 9490,
"train_runtime": 43114.0132,
"train_tokens_per_second": 115403.154
},
{
"epoch": 0.5140831732460294,
"grad_norm": 0.15439286828041077,
"learning_rate": 0.002746461778363992,
"loss": 3.0685733795166015,
"num_input_tokens_seen": 4980736000,
"step": 9500,
"train_runtime": 43159.1877,
"train_tokens_per_second": 115403.84
},
{
"epoch": 0.5140831732460294,
"eval_loss": 3.021251916885376,
"eval_runtime": 1.9829,
"eval_samples_per_second": 252.155,
"eval_steps_per_second": 4.034,
"num_input_tokens_seen": 4980736000,
"step": 9500
},
{
"epoch": 0.5146243134283937,
"grad_norm": 0.16106902062892914,
"learning_rate": 0.0027425304316326484,
"loss": 3.076310729980469,
"num_input_tokens_seen": 4985978880,
"step": 9510,
"train_runtime": 43206.3354,
"train_tokens_per_second": 115399.254
},
{
"epoch": 0.5151654536107578,
"grad_norm": 0.16451045870780945,
"learning_rate": 0.0027385991077055532,
"loss": 3.0650386810302734,
"num_input_tokens_seen": 4991221760,
"step": 9520,
"train_runtime": 43251.4993,
"train_tokens_per_second": 115399.971
},
{
"epoch": 0.5157065937931221,
"grad_norm": 0.141593337059021,
"learning_rate": 0.002734667818584858,
"loss": 3.0678850173950196,
"num_input_tokens_seen": 4996464640,
"step": 9530,
"train_runtime": 43296.675,
"train_tokens_per_second": 115400.655
},
{
"epoch": 0.5162477339754864,
"grad_norm": 0.15153639018535614,
"learning_rate": 0.002730736576272606,
"loss": 3.0637826919555664,
"num_input_tokens_seen": 5001707520,
"step": 9540,
"train_runtime": 43345.4631,
"train_tokens_per_second": 115391.719
},
{
"epoch": 0.5167888741578506,
"grad_norm": 0.1510300487279892,
"learning_rate": 0.0027268053927707015,
"loss": 3.066213607788086,
"num_input_tokens_seen": 5006950400,
"step": 9550,
"train_runtime": 43390.6485,
"train_tokens_per_second": 115392.385
},
{
"epoch": 0.5173300143402149,
"grad_norm": 0.14986655116081238,
"learning_rate": 0.0027228742800808657,
"loss": 3.069229507446289,
"num_input_tokens_seen": 5012193280,
"step": 9560,
"train_runtime": 43435.8008,
"train_tokens_per_second": 115393.136
},
{
"epoch": 0.517871154522579,
"grad_norm": 0.15248462557792664,
"learning_rate": 0.002718943250204604,
"loss": 3.0567092895507812,
"num_input_tokens_seen": 5017436160,
"step": 9570,
"train_runtime": 43480.9543,
"train_tokens_per_second": 115393.883
},
{
"epoch": 0.5184122947049433,
"grad_norm": 0.1401718556880951,
"learning_rate": 0.0027150123151431717,
"loss": 3.0642112731933593,
"num_input_tokens_seen": 5022679040,
"step": 9580,
"train_runtime": 43526.1319,
"train_tokens_per_second": 115394.565
},
{
"epoch": 0.5189534348873076,
"grad_norm": 0.1594904363155365,
"learning_rate": 0.002711081486897532,
"loss": 3.077428436279297,
"num_input_tokens_seen": 5027921920,
"step": 9590,
"train_runtime": 43571.3028,
"train_tokens_per_second": 115395.262
},
{
"epoch": 0.5194945750696718,
"grad_norm": 0.16368508338928223,
"learning_rate": 0.0027071507774683217,
"loss": 3.0642780303955077,
"num_input_tokens_seen": 5033164800,
"step": 9600,
"train_runtime": 43616.4759,
"train_tokens_per_second": 115395.953
},
{
"epoch": 0.5200357152520361,
"grad_norm": 0.15867675840854645,
"learning_rate": 0.0027032201988558165,
"loss": 3.056943893432617,
"num_input_tokens_seen": 5038407680,
"step": 9610,
"train_runtime": 43661.6654,
"train_tokens_per_second": 115396.599
},
{
"epoch": 0.5205768554344002,
"grad_norm": 0.14183756709098816,
"learning_rate": 0.0026992897630598927,
"loss": 3.0706558227539062,
"num_input_tokens_seen": 5043650560,
"step": 9620,
"train_runtime": 43706.8375,
"train_tokens_per_second": 115397.289
},
{
"epoch": 0.5211179956167645,
"grad_norm": 0.15698400139808655,
"learning_rate": 0.002695359482079989,
"loss": 3.0621952056884765,
"num_input_tokens_seen": 5048893440,
"step": 9630,
"train_runtime": 43752.0038,
"train_tokens_per_second": 115397.993
},
{
"epoch": 0.5216591357991288,
"grad_norm": 0.15258745849132538,
"learning_rate": 0.002691429367915072,
"loss": 3.0683521270751952,
"num_input_tokens_seen": 5054136320,
"step": 9640,
"train_runtime": 43797.1921,
"train_tokens_per_second": 115398.638
},
{
"epoch": 0.522200275981493,
"grad_norm": 0.1572786569595337,
"learning_rate": 0.0026874994325636016,
"loss": 3.0657506942749024,
"num_input_tokens_seen": 5059379200,
"step": 9650,
"train_runtime": 43842.367,
"train_tokens_per_second": 115399.317
},
{
"epoch": 0.5227414161638573,
"grad_norm": 0.1440490484237671,
"learning_rate": 0.002683569688023488,
"loss": 3.057964324951172,
"num_input_tokens_seen": 5064622080,
"step": 9660,
"train_runtime": 43887.5251,
"train_tokens_per_second": 115400.038
},
{
"epoch": 0.5232825563462215,
"grad_norm": 0.1437375247478485,
"learning_rate": 0.002679640146292061,
"loss": 3.067335510253906,
"num_input_tokens_seen": 5069864960,
"step": 9670,
"train_runtime": 43932.6993,
"train_tokens_per_second": 115400.716
},
{
"epoch": 0.5238236965285857,
"grad_norm": 0.14964227378368378,
"learning_rate": 0.0026757108193660294,
"loss": 3.0661109924316405,
"num_input_tokens_seen": 5075107840,
"step": 9680,
"train_runtime": 43977.8845,
"train_tokens_per_second": 115401.364
},
{
"epoch": 0.52436483671095,
"grad_norm": 0.15807990729808807,
"learning_rate": 0.0026717817192414496,
"loss": 3.0581291198730467,
"num_input_tokens_seen": 5080350720,
"step": 9690,
"train_runtime": 44023.0423,
"train_tokens_per_second": 115402.082
},
{
"epoch": 0.5249059768933142,
"grad_norm": 0.1513347625732422,
"learning_rate": 0.0026678528579136833,
"loss": 3.0648067474365233,
"num_input_tokens_seen": 5085593600,
"step": 9700,
"train_runtime": 44068.1745,
"train_tokens_per_second": 115402.865
},
{
"epoch": 0.5254471170756785,
"grad_norm": 0.13990284502506256,
"learning_rate": 0.002663924247377361,
"loss": 3.06469841003418,
"num_input_tokens_seen": 5090836480,
"step": 9710,
"train_runtime": 44113.2866,
"train_tokens_per_second": 115403.7
},
{
"epoch": 0.5259882572580427,
"grad_norm": 0.16054664552211761,
"learning_rate": 0.002659995899626353,
"loss": 3.070522689819336,
"num_input_tokens_seen": 5096079360,
"step": 9720,
"train_runtime": 44158.4184,
"train_tokens_per_second": 115404.481
},
{
"epoch": 0.5265293974404069,
"grad_norm": 0.15888893604278564,
"learning_rate": 0.0026560678266537223,
"loss": 3.061862564086914,
"num_input_tokens_seen": 5101322240,
"step": 9730,
"train_runtime": 44203.5783,
"train_tokens_per_second": 115405.188
},
{
"epoch": 0.5270705376227712,
"grad_norm": 0.1485973447561264,
"learning_rate": 0.002652140040451696,
"loss": 3.0686100006103514,
"num_input_tokens_seen": 5106565120,
"step": 9740,
"train_runtime": 44248.7463,
"train_tokens_per_second": 115405.871
},
{
"epoch": 0.5276116778051354,
"grad_norm": 0.1576089709997177,
"learning_rate": 0.002648212553011623,
"loss": 3.062734603881836,
"num_input_tokens_seen": 5111808000,
"step": 9750,
"train_runtime": 44293.9122,
"train_tokens_per_second": 115406.559
},
{
"epoch": 0.5281528179874997,
"grad_norm": 0.14466626942157745,
"learning_rate": 0.0026442853763239444,
"loss": 3.0534576416015624,
"num_input_tokens_seen": 5117050880,
"step": 9760,
"train_runtime": 44339.0625,
"train_tokens_per_second": 115407.286
},
{
"epoch": 0.5286939581698639,
"grad_norm": 0.13870450854301453,
"learning_rate": 0.0026403585223781483,
"loss": 3.0488052368164062,
"num_input_tokens_seen": 5122293760,
"step": 9770,
"train_runtime": 44384.2076,
"train_tokens_per_second": 115408.025
},
{
"epoch": 0.5292350983522282,
"grad_norm": 0.15322810411453247,
"learning_rate": 0.0026364320031627385,
"loss": 3.056787109375,
"num_input_tokens_seen": 5127536640,
"step": 9780,
"train_runtime": 44429.3763,
"train_tokens_per_second": 115408.702
},
{
"epoch": 0.5297762385345924,
"grad_norm": 0.1526278853416443,
"learning_rate": 0.0026325058306652,
"loss": 3.068254089355469,
"num_input_tokens_seen": 5132779520,
"step": 9790,
"train_runtime": 44474.5291,
"train_tokens_per_second": 115409.418
},
{
"epoch": 0.5303173787169566,
"grad_norm": 0.14957252144813538,
"learning_rate": 0.002628580016871954,
"loss": 3.0630029678344726,
"num_input_tokens_seen": 5138022400,
"step": 9800,
"train_runtime": 44519.6838,
"train_tokens_per_second": 115410.128
},
{
"epoch": 0.5308585188993209,
"grad_norm": 0.1606789082288742,
"learning_rate": 0.002624654573768332,
"loss": 3.0618038177490234,
"num_input_tokens_seen": 5143265280,
"step": 9810,
"train_runtime": 44564.8296,
"train_tokens_per_second": 115410.859
},
{
"epoch": 0.5313996590816851,
"grad_norm": 0.148385152220726,
"learning_rate": 0.002620729513338529,
"loss": 3.06335334777832,
"num_input_tokens_seen": 5148508160,
"step": 9820,
"train_runtime": 44609.9754,
"train_tokens_per_second": 115411.589
},
{
"epoch": 0.5319407992640494,
"grad_norm": 0.1520962417125702,
"learning_rate": 0.002616804847565574,
"loss": 3.061989593505859,
"num_input_tokens_seen": 5153751040,
"step": 9830,
"train_runtime": 44655.138,
"train_tokens_per_second": 115412.275
},
{
"epoch": 0.5324819394464136,
"grad_norm": 0.14803871512413025,
"learning_rate": 0.002612880588431294,
"loss": 3.062520980834961,
"num_input_tokens_seen": 5158993920,
"step": 9840,
"train_runtime": 44700.2918,
"train_tokens_per_second": 115412.981
},
{
"epoch": 0.5330230796287778,
"grad_norm": 0.14693519473075867,
"learning_rate": 0.002608956747916268,
"loss": 3.053732681274414,
"num_input_tokens_seen": 5164236800,
"step": 9850,
"train_runtime": 44745.4454,
"train_tokens_per_second": 115413.686
},
{
"epoch": 0.5335642198111421,
"grad_norm": 0.14247646927833557,
"learning_rate": 0.0026050333379998014,
"loss": 3.0687253952026365,
"num_input_tokens_seen": 5169479680,
"step": 9860,
"train_runtime": 44790.614,
"train_tokens_per_second": 115414.352
},
{
"epoch": 0.5341053599935063,
"grad_norm": 0.1414949595928192,
"learning_rate": 0.0026011103706598867,
"loss": 3.0667953491210938,
"num_input_tokens_seen": 5174722560,
"step": 9870,
"train_runtime": 44835.7697,
"train_tokens_per_second": 115415.049
},
{
"epoch": 0.5346465001758706,
"grad_norm": 0.15308037400245667,
"learning_rate": 0.00259718785787316,
"loss": 3.0632091522216798,
"num_input_tokens_seen": 5179965440,
"step": 9880,
"train_runtime": 44880.9299,
"train_tokens_per_second": 115415.733
},
{
"epoch": 0.5351876403582349,
"grad_norm": 0.1401015669107437,
"learning_rate": 0.002593265811614872,
"loss": 3.054189682006836,
"num_input_tokens_seen": 5185208320,
"step": 9890,
"train_runtime": 44926.1001,
"train_tokens_per_second": 115416.391
},
{
"epoch": 0.535728780540599,
"grad_norm": 0.15150010585784912,
"learning_rate": 0.0025893442438588523,
"loss": 3.0516624450683594,
"num_input_tokens_seen": 5190451200,
"step": 9900,
"train_runtime": 44971.2488,
"train_tokens_per_second": 115417.102
},
{
"epoch": 0.5362699207229633,
"grad_norm": 0.17257611453533173,
"learning_rate": 0.0025854231665774653,
"loss": 3.0537059783935545,
"num_input_tokens_seen": 5195694080,
"step": 9910,
"train_runtime": 45016.4036,
"train_tokens_per_second": 115417.796
},
{
"epoch": 0.5368110609053275,
"grad_norm": 0.14506685733795166,
"learning_rate": 0.002581502591741579,
"loss": 3.0568138122558595,
"num_input_tokens_seen": 5200936960,
"step": 9920,
"train_runtime": 45065.2284,
"train_tokens_per_second": 115409.089
},
{
"epoch": 0.5373522010876918,
"grad_norm": 0.14898552000522614,
"learning_rate": 0.002577582531320528,
"loss": 3.0490861892700196,
"num_input_tokens_seen": 5206179840,
"step": 9930,
"train_runtime": 45110.3898,
"train_tokens_per_second": 115409.773
},
{
"epoch": 0.5378933412700561,
"grad_norm": 0.14676256477832794,
"learning_rate": 0.0025736629972820785,
"loss": 3.067533493041992,
"num_input_tokens_seen": 5211422720,
"step": 9940,
"train_runtime": 45155.5461,
"train_tokens_per_second": 115410.468
},
{
"epoch": 0.5384344814524202,
"grad_norm": 0.15546375513076782,
"learning_rate": 0.002569744001592385,
"loss": 3.053817367553711,
"num_input_tokens_seen": 5216665600,
"step": 9950,
"train_runtime": 45200.7043,
"train_tokens_per_second": 115411.157
},
{
"epoch": 0.5389756216347845,
"grad_norm": 0.16900601983070374,
"learning_rate": 0.002565825556215962,
"loss": 3.062000274658203,
"num_input_tokens_seen": 5221908480,
"step": 9960,
"train_runtime": 45245.886,
"train_tokens_per_second": 115411.785
},
{
"epoch": 0.5395167618171487,
"grad_norm": 0.14602519571781158,
"learning_rate": 0.0025619076731156444,
"loss": 3.0598079681396486,
"num_input_tokens_seen": 5227151360,
"step": 9970,
"train_runtime": 45291.0267,
"train_tokens_per_second": 115412.516
},
{
"epoch": 0.540057901999513,
"grad_norm": 0.15503665804862976,
"learning_rate": 0.002557990364252547,
"loss": 3.047740936279297,
"num_input_tokens_seen": 5232394240,
"step": 9980,
"train_runtime": 45336.1788,
"train_tokens_per_second": 115413.217
},
{
"epoch": 0.5405990421818773,
"grad_norm": 0.15822327136993408,
"learning_rate": 0.0025540736415860343,
"loss": 3.0622173309326173,
"num_input_tokens_seen": 5237637120,
"step": 9990,
"train_runtime": 45381.3445,
"train_tokens_per_second": 115413.882
},
{
"epoch": 0.5411401823642414,
"grad_norm": 0.13620983064174652,
"learning_rate": 0.0025501575170736803,
"loss": 3.0480823516845703,
"num_input_tokens_seen": 5242880000,
"step": 10000,
"train_runtime": 45426.5099,
"train_tokens_per_second": 115414.546
},
{
"epoch": 0.5411401823642414,
"eval_loss": 3.0108466148376465,
"eval_runtime": 1.9863,
"eval_samples_per_second": 251.722,
"eval_steps_per_second": 4.028,
"num_input_tokens_seen": 5242880000,
"step": 10000
},
{
"epoch": 0.5416813225466057,
"grad_norm": 0.14582324028015137,
"learning_rate": 0.002546242002671233,
"loss": 3.0488231658935545,
"num_input_tokens_seen": 5248122880,
"step": 10010,
"train_runtime": 45475.9888,
"train_tokens_per_second": 115404.261
},
{
"epoch": 0.5422224627289699,
"grad_norm": 0.14079852402210236,
"learning_rate": 0.0025423271103325786,
"loss": 3.0604705810546875,
"num_input_tokens_seen": 5253365760,
"step": 10020,
"train_runtime": 45521.0949,
"train_tokens_per_second": 115405.083
},
{
"epoch": 0.5427636029113342,
"grad_norm": 0.15606586635112762,
"learning_rate": 0.002538412852009702,
"loss": 3.0583091735839845,
"num_input_tokens_seen": 5258608640,
"step": 10030,
"train_runtime": 45566.226,
"train_tokens_per_second": 115405.841
},
{
"epoch": 0.5433047430936985,
"grad_norm": 0.14329582452774048,
"learning_rate": 0.002534499239652654,
"loss": 3.0502853393554688,
"num_input_tokens_seen": 5263851520,
"step": 10040,
"train_runtime": 45611.3663,
"train_tokens_per_second": 115406.574
},
{
"epoch": 0.5438458832760626,
"grad_norm": 0.14503344893455505,
"learning_rate": 0.0025305862852095145,
"loss": 3.0582489013671874,
"num_input_tokens_seen": 5269094400,
"step": 10050,
"train_runtime": 45656.5231,
"train_tokens_per_second": 115407.264
},
{
"epoch": 0.5443870234584269,
"grad_norm": 0.153029665350914,
"learning_rate": 0.002526674000626352,
"loss": 3.052793502807617,
"num_input_tokens_seen": 5274337280,
"step": 10060,
"train_runtime": 45701.6761,
"train_tokens_per_second": 115407.962
},
{
"epoch": 0.5449281636407911,
"grad_norm": 0.14204776287078857,
"learning_rate": 0.00252276239784719,
"loss": 3.052510643005371,
"num_input_tokens_seen": 5279580160,
"step": 10070,
"train_runtime": 45746.8203,
"train_tokens_per_second": 115408.68
},
{
"epoch": 0.5454693038231554,
"grad_norm": 0.14915040135383606,
"learning_rate": 0.0025188514888139757,
"loss": 3.058329391479492,
"num_input_tokens_seen": 5284823040,
"step": 10080,
"train_runtime": 45791.9522,
"train_tokens_per_second": 115409.429
},
{
"epoch": 0.5460104440055197,
"grad_norm": 0.14046527445316315,
"learning_rate": 0.0025149412854665316,
"loss": 3.0549495697021483,
"num_input_tokens_seen": 5290065920,
"step": 10090,
"train_runtime": 45837.0921,
"train_tokens_per_second": 115410.155
},
{
"epoch": 0.5465515841878839,
"grad_norm": 0.15560267865657806,
"learning_rate": 0.0025110317997425295,
"loss": 3.0544879913330076,
"num_input_tokens_seen": 5295308800,
"step": 10100,
"train_runtime": 45882.2338,
"train_tokens_per_second": 115410.876
},
{
"epoch": 0.5470927243702481,
"grad_norm": 0.15250231325626373,
"learning_rate": 0.002507123043577449,
"loss": 3.0573678970336915,
"num_input_tokens_seen": 5300551680,
"step": 10110,
"train_runtime": 45927.3738,
"train_tokens_per_second": 115411.6
},
{
"epoch": 0.5476338645526123,
"grad_norm": 0.13873432576656342,
"learning_rate": 0.002503215028904543,
"loss": 3.0521045684814454,
"num_input_tokens_seen": 5305794560,
"step": 10120,
"train_runtime": 45972.5283,
"train_tokens_per_second": 115412.285
},
{
"epoch": 0.5481750047349766,
"grad_norm": 0.15664595365524292,
"learning_rate": 0.0024993077676548014,
"loss": 3.0536930084228517,
"num_input_tokens_seen": 5311037440,
"step": 10130,
"train_runtime": 46017.6749,
"train_tokens_per_second": 115412.99
},
{
"epoch": 0.5487161449173409,
"grad_norm": 0.15226289629936218,
"learning_rate": 0.002495401271756911,
"loss": 3.0586917877197264,
"num_input_tokens_seen": 5316280320,
"step": 10140,
"train_runtime": 46062.8137,
"train_tokens_per_second": 115413.712
},
{
"epoch": 0.5492572850997051,
"grad_norm": 0.14132562279701233,
"learning_rate": 0.0024914955531372264,
"loss": 3.0555648803710938,
"num_input_tokens_seen": 5321523200,
"step": 10150,
"train_runtime": 46107.9535,
"train_tokens_per_second": 115414.43
},
{
"epoch": 0.5497984252820693,
"grad_norm": 0.15198439359664917,
"learning_rate": 0.002487590623719726,
"loss": 3.0600481033325195,
"num_input_tokens_seen": 5326766080,
"step": 10160,
"train_runtime": 46153.0991,
"train_tokens_per_second": 115415.133
},
{
"epoch": 0.5503395654644335,
"grad_norm": 0.1487119495868683,
"learning_rate": 0.002483686495425979,
"loss": 3.0562034606933595,
"num_input_tokens_seen": 5332008960,
"step": 10170,
"train_runtime": 46198.2435,
"train_tokens_per_second": 115415.837
},
{
"epoch": 0.5508807056467978,
"grad_norm": 0.1500309705734253,
"learning_rate": 0.00247978318017511,
"loss": 3.0558704376220702,
"num_input_tokens_seen": 5337251840,
"step": 10180,
"train_runtime": 46243.3932,
"train_tokens_per_second": 115416.527
},
{
"epoch": 0.5514218458291621,
"grad_norm": 0.14477477967739105,
"learning_rate": 0.0024758806898837614,
"loss": 3.0584625244140624,
"num_input_tokens_seen": 5342494720,
"step": 10190,
"train_runtime": 46288.5341,
"train_tokens_per_second": 115417.237
},
{
"epoch": 0.5519629860115263,
"grad_norm": 0.14965343475341797,
"learning_rate": 0.0024719790364660555,
"loss": 3.053845024108887,
"num_input_tokens_seen": 5347737600,
"step": 10200,
"train_runtime": 46333.6743,
"train_tokens_per_second": 115417.948
},
{
"epoch": 0.5525041261938906,
"grad_norm": 0.14595621824264526,
"learning_rate": 0.002468078231833561,
"loss": 3.0468162536621093,
"num_input_tokens_seen": 5352980480,
"step": 10210,
"train_runtime": 46378.8185,
"train_tokens_per_second": 115418.647
},
{
"epoch": 0.5530452663762547,
"grad_norm": 0.15934494137763977,
"learning_rate": 0.002464178287895256,
"loss": 3.0428611755371096,
"num_input_tokens_seen": 5358223360,
"step": 10220,
"train_runtime": 46423.9466,
"train_tokens_per_second": 115419.385
},
{
"epoch": 0.553586406558619,
"grad_norm": 0.14214660227298737,
"learning_rate": 0.002460279216557488,
"loss": 3.0542884826660157,
"num_input_tokens_seen": 5363466240,
"step": 10230,
"train_runtime": 46469.0816,
"train_tokens_per_second": 115420.104
},
{
"epoch": 0.5541275467409833,
"grad_norm": 0.16061817109584808,
"learning_rate": 0.0024563810297239448,
"loss": 3.0611974716186525,
"num_input_tokens_seen": 5368709120,
"step": 10240,
"train_runtime": 46514.2276,
"train_tokens_per_second": 115420.795
},
{
"epoch": 0.5546686869233475,
"grad_norm": 0.14743222296237946,
"learning_rate": 0.0024524837392956088,
"loss": 3.0409524917602537,
"num_input_tokens_seen": 5373952000,
"step": 10250,
"train_runtime": 46559.373,
"train_tokens_per_second": 115421.486
},
{
"epoch": 0.5552098271057118,
"grad_norm": 0.145145446062088,
"learning_rate": 0.0024485873571707313,
"loss": 3.0503875732421877,
"num_input_tokens_seen": 5379194880,
"step": 10260,
"train_runtime": 46604.5364,
"train_tokens_per_second": 115422.13
},
{
"epoch": 0.5557509672880759,
"grad_norm": 0.13785313069820404,
"learning_rate": 0.0024446918952447856,
"loss": 3.051102066040039,
"num_input_tokens_seen": 5384437760,
"step": 10270,
"train_runtime": 46649.714,
"train_tokens_per_second": 115422.739
},
{
"epoch": 0.5562921074704402,
"grad_norm": 0.15741367638111115,
"learning_rate": 0.002440797365410437,
"loss": 3.0524486541748046,
"num_input_tokens_seen": 5389680640,
"step": 10280,
"train_runtime": 46694.8766,
"train_tokens_per_second": 115423.383
},
{
"epoch": 0.5568332476528045,
"grad_norm": 0.14624185860157013,
"learning_rate": 0.002436903779557509,
"loss": 3.041734313964844,
"num_input_tokens_seen": 5394923520,
"step": 10290,
"train_runtime": 46740.04,
"train_tokens_per_second": 115424.024
},
{
"epoch": 0.5573743878351687,
"grad_norm": 0.15357740223407745,
"learning_rate": 0.002433011149572938,
"loss": 3.05377254486084,
"num_input_tokens_seen": 5400166400,
"step": 10300,
"train_runtime": 46785.1999,
"train_tokens_per_second": 115424.673
},
{
"epoch": 0.557915528017533,
"grad_norm": 0.1404609978199005,
"learning_rate": 0.002429119487340744,
"loss": 3.0517080307006834,
"num_input_tokens_seen": 5405409280,
"step": 10310,
"train_runtime": 46834.0267,
"train_tokens_per_second": 115416.283
},
{
"epoch": 0.5584566681998971,
"grad_norm": 0.13460350036621094,
"learning_rate": 0.0024252288047419933,
"loss": 3.047005462646484,
"num_input_tokens_seen": 5410652160,
"step": 10320,
"train_runtime": 46879.179,
"train_tokens_per_second": 115416.956
},
{
"epoch": 0.5589978083822614,
"grad_norm": 0.13896240293979645,
"learning_rate": 0.002421339113654761,
"loss": 3.0483970642089844,
"num_input_tokens_seen": 5415895040,
"step": 10330,
"train_runtime": 46924.3535,
"train_tokens_per_second": 115417.574
},
{
"epoch": 0.5595389485646257,
"grad_norm": 0.1555888056755066,
"learning_rate": 0.0024174504259540965,
"loss": 3.045535087585449,
"num_input_tokens_seen": 5421137920,
"step": 10340,
"train_runtime": 46969.5132,
"train_tokens_per_second": 115418.227
},
{
"epoch": 0.5600800887469899,
"grad_norm": 0.1442544311285019,
"learning_rate": 0.002413562753511982,
"loss": 3.0400226593017576,
"num_input_tokens_seen": 5426380800,
"step": 10350,
"train_runtime": 47014.6917,
"train_tokens_per_second": 115418.832
},
{
"epoch": 0.5606212289293542,
"grad_norm": 0.16144470870494843,
"learning_rate": 0.002409676108197302,
"loss": 3.044460678100586,
"num_input_tokens_seen": 5431623680,
"step": 10360,
"train_runtime": 47059.8715,
"train_tokens_per_second": 115419.433
},
{
"epoch": 0.5611623691117184,
"grad_norm": 0.1435036063194275,
"learning_rate": 0.0024057905018758097,
"loss": 3.051218032836914,
"num_input_tokens_seen": 5436866560,
"step": 10370,
"train_runtime": 47105.0206,
"train_tokens_per_second": 115420.108
},
{
"epoch": 0.5617035092940826,
"grad_norm": 0.14437657594680786,
"learning_rate": 0.0024019059464100794,
"loss": 3.049814987182617,
"num_input_tokens_seen": 5442109440,
"step": 10380,
"train_runtime": 47150.1709,
"train_tokens_per_second": 115420.779
},
{
"epoch": 0.5622446494764469,
"grad_norm": 0.1483716368675232,
"learning_rate": 0.0023980224536594803,
"loss": 3.051362991333008,
"num_input_tokens_seen": 5447352320,
"step": 10390,
"train_runtime": 47195.3171,
"train_tokens_per_second": 115421.458
},
{
"epoch": 0.5627857896588111,
"grad_norm": 0.1392650008201599,
"learning_rate": 0.002394140035480139,
"loss": 3.05356502532959,
"num_input_tokens_seen": 5452595200,
"step": 10400,
"train_runtime": 47240.4745,
"train_tokens_per_second": 115422.109
},
{
"epoch": 0.5633269298411754,
"grad_norm": 0.13990668952465057,
"learning_rate": 0.002390258703724898,
"loss": 3.053313064575195,
"num_input_tokens_seen": 5457838080,
"step": 10410,
"train_runtime": 47285.6159,
"train_tokens_per_second": 115422.798
},
{
"epoch": 0.5638680700235396,
"grad_norm": 0.1470736563205719,
"learning_rate": 0.002386378470243285,
"loss": 3.050541305541992,
"num_input_tokens_seen": 5463080960,
"step": 10420,
"train_runtime": 47330.7575,
"train_tokens_per_second": 115423.485
},
{
"epoch": 0.5644092102059038,
"grad_norm": 0.15061776340007782,
"learning_rate": 0.0023824993468814734,
"loss": 3.0488460540771483,
"num_input_tokens_seen": 5468323840,
"step": 10430,
"train_runtime": 47375.9168,
"train_tokens_per_second": 115424.127
},
{
"epoch": 0.5649503503882681,
"grad_norm": 0.14553044736385345,
"learning_rate": 0.0023786213454822496,
"loss": 3.0426799774169924,
"num_input_tokens_seen": 5473566720,
"step": 10440,
"train_runtime": 47421.0774,
"train_tokens_per_second": 115424.765
},
{
"epoch": 0.5654914905706323,
"grad_norm": 0.1505778431892395,
"learning_rate": 0.002374744477884974,
"loss": 3.0493221282958984,
"num_input_tokens_seen": 5478809600,
"step": 10450,
"train_runtime": 47466.2416,
"train_tokens_per_second": 115425.393
},
{
"epoch": 0.5660326307529966,
"grad_norm": 0.14359861612319946,
"learning_rate": 0.002370868755925543,
"loss": 3.048199272155762,
"num_input_tokens_seen": 5484052480,
"step": 10460,
"train_runtime": 47511.3938,
"train_tokens_per_second": 115426.049
},
{
"epoch": 0.5665737709353608,
"grad_norm": 0.1411399245262146,
"learning_rate": 0.0023669941914363597,
"loss": 3.0590206146240235,
"num_input_tokens_seen": 5489295360,
"step": 10470,
"train_runtime": 47556.5734,
"train_tokens_per_second": 115426.638
},
{
"epoch": 0.567114911117725,
"grad_norm": 0.14005140960216522,
"learning_rate": 0.0023631207962462905,
"loss": 3.052465057373047,
"num_input_tokens_seen": 5494538240,
"step": 10480,
"train_runtime": 47601.7335,
"train_tokens_per_second": 115427.272
},
{
"epoch": 0.5676560513000893,
"grad_norm": 0.15281735360622406,
"learning_rate": 0.0023592485821806314,
"loss": 3.0543212890625,
"num_input_tokens_seen": 5499781120,
"step": 10490,
"train_runtime": 47646.8804,
"train_tokens_per_second": 115427.937
},
{
"epoch": 0.5681971914824535,
"grad_norm": 0.15110628306865692,
"learning_rate": 0.0023553775610610744,
"loss": 3.0445037841796876,
"num_input_tokens_seen": 5505024000,
"step": 10500,
"train_runtime": 47692.0533,
"train_tokens_per_second": 115428.538
},
{
"epoch": 0.5681971914824535,
"eval_loss": 3.0027830600738525,
"eval_runtime": 1.9832,
"eval_samples_per_second": 252.115,
"eval_steps_per_second": 4.034,
"num_input_tokens_seen": 5505024000,
"step": 10500
},
{
"epoch": 0.5687383316648178,
"grad_norm": 0.14011938869953156,
"learning_rate": 0.0023515077447056705,
"loss": 3.0531822204589845,
"num_input_tokens_seen": 5510266880,
"step": 10510,
"train_runtime": 47739.2068,
"train_tokens_per_second": 115424.349
},
{
"epoch": 0.569279471847182,
"grad_norm": 0.14612512290477753,
"learning_rate": 0.002347639144928789,
"loss": 3.051645278930664,
"num_input_tokens_seen": 5515509760,
"step": 10520,
"train_runtime": 47784.361,
"train_tokens_per_second": 115424.998
},
{
"epoch": 0.5698206120295463,
"grad_norm": 0.15726540982723236,
"learning_rate": 0.0023437717735410872,
"loss": 3.0477500915527345,
"num_input_tokens_seen": 5520752640,
"step": 10530,
"train_runtime": 47829.5028,
"train_tokens_per_second": 115425.675
},
{
"epoch": 0.5703617522119105,
"grad_norm": 0.14600247144699097,
"learning_rate": 0.002339905642349474,
"loss": 3.0487768173217775,
"num_input_tokens_seen": 5525995520,
"step": 10540,
"train_runtime": 47874.6792,
"train_tokens_per_second": 115426.267
},
{
"epoch": 0.5709028923942747,
"grad_norm": 0.1526118814945221,
"learning_rate": 0.0023360407631570685,
"loss": 3.0494321823120116,
"num_input_tokens_seen": 5531238400,
"step": 10550,
"train_runtime": 47919.8527,
"train_tokens_per_second": 115426.866
},
{
"epoch": 0.571444032576639,
"grad_norm": 0.14721056818962097,
"learning_rate": 0.0023321771477631693,
"loss": 3.046247100830078,
"num_input_tokens_seen": 5536481280,
"step": 10560,
"train_runtime": 47965.0143,
"train_tokens_per_second": 115427.492
},
{
"epoch": 0.5719851727590032,
"grad_norm": 0.15114431083202362,
"learning_rate": 0.0023283148079632156,
"loss": 3.0407901763916017,
"num_input_tokens_seen": 5541724160,
"step": 10570,
"train_runtime": 48010.1749,
"train_tokens_per_second": 115428.119
},
{
"epoch": 0.5725263129413675,
"grad_norm": 0.14631570875644684,
"learning_rate": 0.0023244537555487544,
"loss": 3.0476711273193358,
"num_input_tokens_seen": 5546967040,
"step": 10580,
"train_runtime": 48055.3433,
"train_tokens_per_second": 115428.726
},
{
"epoch": 0.5730674531237318,
"grad_norm": 0.14861121773719788,
"learning_rate": 0.0023205940023074013,
"loss": 3.049782562255859,
"num_input_tokens_seen": 5552209920,
"step": 10590,
"train_runtime": 48100.5075,
"train_tokens_per_second": 115429.342
},
{
"epoch": 0.5736085933060959,
"grad_norm": 0.13904227316379547,
"learning_rate": 0.002316735560022804,
"loss": 3.055135726928711,
"num_input_tokens_seen": 5557452800,
"step": 10600,
"train_runtime": 48145.6693,
"train_tokens_per_second": 115429.962
},
{
"epoch": 0.5741497334884602,
"grad_norm": 0.13765645027160645,
"learning_rate": 0.00231287844047461,
"loss": 3.047852325439453,
"num_input_tokens_seen": 5562695680,
"step": 10610,
"train_runtime": 48190.8309,
"train_tokens_per_second": 115430.582
},
{
"epoch": 0.5746908736708244,
"grad_norm": 0.14210833609104156,
"learning_rate": 0.0023090226554384288,
"loss": 3.0472042083740236,
"num_input_tokens_seen": 5567938560,
"step": 10620,
"train_runtime": 48235.9849,
"train_tokens_per_second": 115431.22
},
{
"epoch": 0.5752320138531887,
"grad_norm": 0.149732306599617,
"learning_rate": 0.0023051682166857937,
"loss": 3.0454326629638673,
"num_input_tokens_seen": 5573181440,
"step": 10630,
"train_runtime": 48281.1275,
"train_tokens_per_second": 115431.883
},
{
"epoch": 0.575773154035553,
"grad_norm": 0.1392926275730133,
"learning_rate": 0.002301315135984128,
"loss": 3.0390705108642577,
"num_input_tokens_seen": 5578424320,
"step": 10640,
"train_runtime": 48326.2477,
"train_tokens_per_second": 115432.598
},
{
"epoch": 0.5763142942179171,
"grad_norm": 0.13122397661209106,
"learning_rate": 0.0022974634250967113,
"loss": 3.036616897583008,
"num_input_tokens_seen": 5583667200,
"step": 10650,
"train_runtime": 48371.3879,
"train_tokens_per_second": 115433.264
},
{
"epoch": 0.5768554344002814,
"grad_norm": 0.14650028944015503,
"learning_rate": 0.0022936130957826395,
"loss": 3.04638786315918,
"num_input_tokens_seen": 5588910080,
"step": 10660,
"train_runtime": 48416.5301,
"train_tokens_per_second": 115433.924
},
{
"epoch": 0.5773965745826456,
"grad_norm": 0.13940243422985077,
"learning_rate": 0.002289764159796791,
"loss": 3.049785614013672,
"num_input_tokens_seen": 5594152960,
"step": 10670,
"train_runtime": 48461.6807,
"train_tokens_per_second": 115434.564
},
{
"epoch": 0.5779377147650099,
"grad_norm": 0.13686427474021912,
"learning_rate": 0.0022859166288897895,
"loss": 3.0434268951416015,
"num_input_tokens_seen": 5599395840,
"step": 10680,
"train_runtime": 48506.8227,
"train_tokens_per_second": 115435.222
},
{
"epoch": 0.5784788549473742,
"grad_norm": 0.14600345492362976,
"learning_rate": 0.0022820705148079703,
"loss": 3.052047538757324,
"num_input_tokens_seen": 5604638720,
"step": 10690,
"train_runtime": 48555.616,
"train_tokens_per_second": 115427.198
},
{
"epoch": 0.5790199951297383,
"grad_norm": 0.14253467321395874,
"learning_rate": 0.0022782258292933432,
"loss": 3.0317237854003904,
"num_input_tokens_seen": 5609881600,
"step": 10700,
"train_runtime": 48600.7657,
"train_tokens_per_second": 115427.844
},
{
"epoch": 0.5795611353121026,
"grad_norm": 0.13422608375549316,
"learning_rate": 0.0022743825840835542,
"loss": 3.038676071166992,
"num_input_tokens_seen": 5615124480,
"step": 10710,
"train_runtime": 48645.891,
"train_tokens_per_second": 115428.546
},
{
"epoch": 0.5801022754944668,
"grad_norm": 0.14621081948280334,
"learning_rate": 0.0022705407909118574,
"loss": 3.0488845825195314,
"num_input_tokens_seen": 5620367360,
"step": 10720,
"train_runtime": 48691.0214,
"train_tokens_per_second": 115429.235
},
{
"epoch": 0.5806434156768311,
"grad_norm": 0.14328445494174957,
"learning_rate": 0.002266700461507069,
"loss": 3.039694595336914,
"num_input_tokens_seen": 5625610240,
"step": 10730,
"train_runtime": 48736.1623,
"train_tokens_per_second": 115429.898
},
{
"epoch": 0.5811845558591954,
"grad_norm": 0.1407247930765152,
"learning_rate": 0.0022628616075935377,
"loss": 3.0443794250488283,
"num_input_tokens_seen": 5630853120,
"step": 10740,
"train_runtime": 48781.2843,
"train_tokens_per_second": 115430.604
},
{
"epoch": 0.5817256960415595,
"grad_norm": 0.15159763395786285,
"learning_rate": 0.0022590242408911066,
"loss": 3.0392004013061524,
"num_input_tokens_seen": 5636096000,
"step": 10750,
"train_runtime": 48826.4182,
"train_tokens_per_second": 115431.281
},
{
"epoch": 0.5822668362239238,
"grad_norm": 0.14982502162456512,
"learning_rate": 0.0022551883731150822,
"loss": 3.041204833984375,
"num_input_tokens_seen": 5641338880,
"step": 10760,
"train_runtime": 48871.5523,
"train_tokens_per_second": 115431.956
},
{
"epoch": 0.582807976406288,
"grad_norm": 0.14223578572273254,
"learning_rate": 0.0022513540159761927,
"loss": 3.058414840698242,
"num_input_tokens_seen": 5646581760,
"step": 10770,
"train_runtime": 48916.6973,
"train_tokens_per_second": 115432.604
},
{
"epoch": 0.5833491165886523,
"grad_norm": 0.14026111364364624,
"learning_rate": 0.0022475211811805508,
"loss": 3.040976715087891,
"num_input_tokens_seen": 5651824640,
"step": 10780,
"train_runtime": 48961.8335,
"train_tokens_per_second": 115433.272
},
{
"epoch": 0.5838902567710166,
"grad_norm": 0.1421726644039154,
"learning_rate": 0.0022436898804296273,
"loss": 3.0329113006591797,
"num_input_tokens_seen": 5657067520,
"step": 10790,
"train_runtime": 49006.9777,
"train_tokens_per_second": 115433.92
},
{
"epoch": 0.5844313969533808,
"grad_norm": 0.14624913036823273,
"learning_rate": 0.0022398601254202074,
"loss": 3.0412059783935548,
"num_input_tokens_seen": 5662310400,
"step": 10800,
"train_runtime": 49052.1103,
"train_tokens_per_second": 115434.593
},
{
"epoch": 0.584972537135745,
"grad_norm": 0.14961951971054077,
"learning_rate": 0.0022360319278443555,
"loss": 3.039783477783203,
"num_input_tokens_seen": 5667553280,
"step": 10810,
"train_runtime": 49097.2475,
"train_tokens_per_second": 115435.255
},
{
"epoch": 0.5855136773181092,
"grad_norm": 0.13819707930088043,
"learning_rate": 0.0022322052993893828,
"loss": 3.0379779815673826,
"num_input_tokens_seen": 5672796160,
"step": 10820,
"train_runtime": 49142.4002,
"train_tokens_per_second": 115435.879
},
{
"epoch": 0.5860548175004735,
"grad_norm": 0.1506834328174591,
"learning_rate": 0.002228380251737811,
"loss": 3.038629341125488,
"num_input_tokens_seen": 5678039040,
"step": 10830,
"train_runtime": 49187.5417,
"train_tokens_per_second": 115436.528
},
{
"epoch": 0.5865959576828378,
"grad_norm": 0.1385858803987503,
"learning_rate": 0.0022245567965673346,
"loss": 3.0388534545898436,
"num_input_tokens_seen": 5683281920,
"step": 10840,
"train_runtime": 49232.6855,
"train_tokens_per_second": 115437.171
},
{
"epoch": 0.587137097865202,
"grad_norm": 0.14153380692005157,
"learning_rate": 0.002220734945550785,
"loss": 3.040701675415039,
"num_input_tokens_seen": 5688524800,
"step": 10850,
"train_runtime": 49277.8278,
"train_tokens_per_second": 115437.816
},
{
"epoch": 0.5876782380475662,
"grad_norm": 0.1447627693414688,
"learning_rate": 0.002216914710356098,
"loss": 3.0347267150878907,
"num_input_tokens_seen": 5693767680,
"step": 10860,
"train_runtime": 49322.9704,
"train_tokens_per_second": 115438.459
},
{
"epoch": 0.5882193782299304,
"grad_norm": 0.15700754523277283,
"learning_rate": 0.0022130961026462772,
"loss": 3.0408071517944335,
"num_input_tokens_seen": 5699010560,
"step": 10870,
"train_runtime": 49368.1084,
"train_tokens_per_second": 115439.111
},
{
"epoch": 0.5887605184122947,
"grad_norm": 0.1373981535434723,
"learning_rate": 0.002209279134079355,
"loss": 3.0413372039794924,
"num_input_tokens_seen": 5704253440,
"step": 10880,
"train_runtime": 49413.2385,
"train_tokens_per_second": 115439.781
},
{
"epoch": 0.589301658594659,
"grad_norm": 0.13982577621936798,
"learning_rate": 0.0022054638163083607,
"loss": 3.0364784240722655,
"num_input_tokens_seen": 5709496320,
"step": 10890,
"train_runtime": 49458.3538,
"train_tokens_per_second": 115440.484
},
{
"epoch": 0.5898427987770232,
"grad_norm": 0.1471603363752365,
"learning_rate": 0.0022016501609812846,
"loss": 3.02860107421875,
"num_input_tokens_seen": 5714739200,
"step": 10900,
"train_runtime": 49503.4854,
"train_tokens_per_second": 115441.148
},
{
"epoch": 0.5903839389593875,
"grad_norm": 0.140976682305336,
"learning_rate": 0.002197838179741041,
"loss": 3.048592948913574,
"num_input_tokens_seen": 5719982080,
"step": 10910,
"train_runtime": 49548.6066,
"train_tokens_per_second": 115441.835
},
{
"epoch": 0.5909250791417516,
"grad_norm": 0.1391858160495758,
"learning_rate": 0.0021940278842254336,
"loss": 3.0438766479492188,
"num_input_tokens_seen": 5725224960,
"step": 10920,
"train_runtime": 49593.7284,
"train_tokens_per_second": 115442.519
},
{
"epoch": 0.5914662193241159,
"grad_norm": 0.13984552025794983,
"learning_rate": 0.0021902192860671172,
"loss": 3.032778739929199,
"num_input_tokens_seen": 5730467840,
"step": 10930,
"train_runtime": 49638.8587,
"train_tokens_per_second": 115443.183
},
{
"epoch": 0.5920073595064802,
"grad_norm": 0.15091544389724731,
"learning_rate": 0.0021864123968935696,
"loss": 3.0441143035888674,
"num_input_tokens_seen": 5735710720,
"step": 10940,
"train_runtime": 49683.9877,
"train_tokens_per_second": 115443.848
},
{
"epoch": 0.5925484996888444,
"grad_norm": 0.1554540991783142,
"learning_rate": 0.0021826072283270465,
"loss": 3.028913116455078,
"num_input_tokens_seen": 5740953600,
"step": 10950,
"train_runtime": 49729.1048,
"train_tokens_per_second": 115444.539
},
{
"epoch": 0.5930896398712087,
"grad_norm": 0.14302626252174377,
"learning_rate": 0.0021788037919845526,
"loss": 3.0385337829589845,
"num_input_tokens_seen": 5746196480,
"step": 10960,
"train_runtime": 49774.2324,
"train_tokens_per_second": 115445.205
},
{
"epoch": 0.5936307800535728,
"grad_norm": 0.14910683035850525,
"learning_rate": 0.0021750020994778054,
"loss": 3.0436506271362305,
"num_input_tokens_seen": 5751439360,
"step": 10970,
"train_runtime": 49819.3456,
"train_tokens_per_second": 115445.903
},
{
"epoch": 0.5941719202359371,
"grad_norm": 0.15283723175525665,
"learning_rate": 0.002171202162413195,
"loss": 3.047803497314453,
"num_input_tokens_seen": 5756682240,
"step": 10980,
"train_runtime": 49864.471,
"train_tokens_per_second": 115446.572
},
{
"epoch": 0.5947130604183014,
"grad_norm": 0.14589117467403412,
"learning_rate": 0.002167403992391757,
"loss": 3.0425289154052733,
"num_input_tokens_seen": 5761925120,
"step": 10990,
"train_runtime": 49909.6019,
"train_tokens_per_second": 115447.227
},
{
"epoch": 0.5952542006006656,
"grad_norm": 0.1394151747226715,
"learning_rate": 0.0021636076010091276,
"loss": 3.0472259521484375,
"num_input_tokens_seen": 5767168000,
"step": 11000,
"train_runtime": 49954.7308,
"train_tokens_per_second": 115447.885
},
{
"epoch": 0.5952542006006656,
"eval_loss": 2.9926395416259766,
"eval_runtime": 1.9832,
"eval_samples_per_second": 252.121,
"eval_steps_per_second": 4.034,
"num_input_tokens_seen": 5767168000,
"step": 11000
},
{
"epoch": 0.5957953407830299,
"grad_norm": 0.1461019068956375,
"learning_rate": 0.002159812999855516,
"loss": 3.034767913818359,
"num_input_tokens_seen": 5772410880,
"step": 11010,
"train_runtime": 50004.2941,
"train_tokens_per_second": 115438.304
},
{
"epoch": 0.596336480965394,
"grad_norm": 0.13988643884658813,
"learning_rate": 0.002156020200515666,
"loss": 3.0288986206054687,
"num_input_tokens_seen": 5777653760,
"step": 11020,
"train_runtime": 50049.449,
"train_tokens_per_second": 115438.908
},
{
"epoch": 0.5968776211477583,
"grad_norm": 0.13354118168354034,
"learning_rate": 0.002152229214568817,
"loss": 3.0315704345703125,
"num_input_tokens_seen": 5782896640,
"step": 11030,
"train_runtime": 50094.5853,
"train_tokens_per_second": 115439.555
},
{
"epoch": 0.5974187613301226,
"grad_norm": 0.1455395370721817,
"learning_rate": 0.0021484400535886766,
"loss": 3.0255619049072267,
"num_input_tokens_seen": 5788139520,
"step": 11040,
"train_runtime": 50139.709,
"train_tokens_per_second": 115440.23
},
{
"epoch": 0.5979599015124868,
"grad_norm": 0.1527973711490631,
"learning_rate": 0.002144652729143379,
"loss": 3.0323816299438477,
"num_input_tokens_seen": 5793382400,
"step": 11050,
"train_runtime": 50184.8687,
"train_tokens_per_second": 115440.82
},
{
"epoch": 0.5985010416948511,
"grad_norm": 0.14457052946090698,
"learning_rate": 0.0021408672527954502,
"loss": 3.0245555877685546,
"num_input_tokens_seen": 5798625280,
"step": 11060,
"train_runtime": 50230.0184,
"train_tokens_per_second": 115441.432
},
{
"epoch": 0.5990421818772153,
"grad_norm": 0.1389371007680893,
"learning_rate": 0.0021370836361017764,
"loss": 3.036094856262207,
"num_input_tokens_seen": 5803868160,
"step": 11070,
"train_runtime": 50278.8251,
"train_tokens_per_second": 115433.647
},
{
"epoch": 0.5995833220595795,
"grad_norm": 0.15234215557575226,
"learning_rate": 0.002133301890613565,
"loss": 3.0295217514038084,
"num_input_tokens_seen": 5809111040,
"step": 11080,
"train_runtime": 50323.981,
"train_tokens_per_second": 115434.251
},
{
"epoch": 0.6001244622419438,
"grad_norm": 0.1413789838552475,
"learning_rate": 0.002129522027876311,
"loss": 3.021541404724121,
"num_input_tokens_seen": 5814353920,
"step": 11090,
"train_runtime": 50369.1453,
"train_tokens_per_second": 115434.834
},
{
"epoch": 0.600665602424308,
"grad_norm": 0.15141098201274872,
"learning_rate": 0.0021257440594297607,
"loss": 3.026825714111328,
"num_input_tokens_seen": 5819596800,
"step": 11100,
"train_runtime": 50414.335,
"train_tokens_per_second": 115435.358
},
{
"epoch": 0.6012067426066723,
"grad_norm": 0.1444009244441986,
"learning_rate": 0.00212196799680788,
"loss": 3.033803939819336,
"num_input_tokens_seen": 5824839680,
"step": 11110,
"train_runtime": 50459.5415,
"train_tokens_per_second": 115435.842
},
{
"epoch": 0.6017478827890365,
"grad_norm": 0.14855210483074188,
"learning_rate": 0.002118193851538812,
"loss": 3.0400081634521485,
"num_input_tokens_seen": 5830082560,
"step": 11120,
"train_runtime": 50504.7336,
"train_tokens_per_second": 115436.359
},
{
"epoch": 0.6022890229714007,
"grad_norm": 0.13804234564304352,
"learning_rate": 0.002114421635144851,
"loss": 3.0301578521728514,
"num_input_tokens_seen": 5835325440,
"step": 11130,
"train_runtime": 50549.9225,
"train_tokens_per_second": 115436.882
},
{
"epoch": 0.602830163153765,
"grad_norm": 0.14875908195972443,
"learning_rate": 0.0021106513591423967,
"loss": 3.032312774658203,
"num_input_tokens_seen": 5840568320,
"step": 11140,
"train_runtime": 50595.1041,
"train_tokens_per_second": 115437.421
},
{
"epoch": 0.6033713033361292,
"grad_norm": 0.14444148540496826,
"learning_rate": 0.0021068830350419315,
"loss": 3.038595199584961,
"num_input_tokens_seen": 5845811200,
"step": 11150,
"train_runtime": 50640.2767,
"train_tokens_per_second": 115437.979
},
{
"epoch": 0.6039124435184935,
"grad_norm": 0.14319837093353271,
"learning_rate": 0.002103116674347975,
"loss": 3.0365222930908202,
"num_input_tokens_seen": 5851054080,
"step": 11160,
"train_runtime": 50685.4574,
"train_tokens_per_second": 115438.518
},
{
"epoch": 0.6044535837008577,
"grad_norm": 0.1528901755809784,
"learning_rate": 0.002099352288559052,
"loss": 3.0367916107177733,
"num_input_tokens_seen": 5856296960,
"step": 11170,
"train_runtime": 50730.6306,
"train_tokens_per_second": 115439.073
},
{
"epoch": 0.604994723883222,
"grad_norm": 0.1397976279258728,
"learning_rate": 0.002095589889167659,
"loss": 3.026215744018555,
"num_input_tokens_seen": 5861539840,
"step": 11180,
"train_runtime": 50775.8006,
"train_tokens_per_second": 115439.634
},
{
"epoch": 0.6055358640655862,
"grad_norm": 0.1472213864326477,
"learning_rate": 0.0020918294876602294,
"loss": 3.0274309158325194,
"num_input_tokens_seen": 5866782720,
"step": 11190,
"train_runtime": 50820.9715,
"train_tokens_per_second": 115440.192
},
{
"epoch": 0.6060770042479504,
"grad_norm": 0.14902907609939575,
"learning_rate": 0.0020880710955170955,
"loss": 3.0351707458496096,
"num_input_tokens_seen": 5872025600,
"step": 11200,
"train_runtime": 50866.1392,
"train_tokens_per_second": 115440.757
},
{
"epoch": 0.6066181444303147,
"grad_norm": 0.1408633142709732,
"learning_rate": 0.0020843147242124555,
"loss": 3.029071807861328,
"num_input_tokens_seen": 5877268480,
"step": 11210,
"train_runtime": 50911.3053,
"train_tokens_per_second": 115441.324
},
{
"epoch": 0.6071592846126789,
"grad_norm": 0.14094239473342896,
"learning_rate": 0.0020805603852143383,
"loss": 3.032915496826172,
"num_input_tokens_seen": 5882511360,
"step": 11220,
"train_runtime": 50956.4731,
"train_tokens_per_second": 115441.886
},
{
"epoch": 0.6077004247950432,
"grad_norm": 0.14471176266670227,
"learning_rate": 0.0020768080899845687,
"loss": 3.0328413009643556,
"num_input_tokens_seen": 5887754240,
"step": 11230,
"train_runtime": 51001.6333,
"train_tokens_per_second": 115442.465
},
{
"epoch": 0.6082415649774074,
"grad_norm": 0.1309700757265091,
"learning_rate": 0.00207305784997873,
"loss": 3.0344516754150392,
"num_input_tokens_seen": 5892997120,
"step": 11240,
"train_runtime": 51046.7842,
"train_tokens_per_second": 115443.063
},
{
"epoch": 0.6087827051597716,
"grad_norm": 0.14067348837852478,
"learning_rate": 0.0020693096766461333,
"loss": 3.0316375732421874,
"num_input_tokens_seen": 5898240000,
"step": 11250,
"train_runtime": 51091.9272,
"train_tokens_per_second": 115443.678
},
{
"epoch": 0.6093238453421359,
"grad_norm": 0.14874261617660522,
"learning_rate": 0.00206556358142978,
"loss": 3.0260826110839845,
"num_input_tokens_seen": 5903482880,
"step": 11260,
"train_runtime": 51137.0716,
"train_tokens_per_second": 115444.289
},
{
"epoch": 0.6098649855245001,
"grad_norm": 0.1435764729976654,
"learning_rate": 0.002061819575766326,
"loss": 3.0409059524536133,
"num_input_tokens_seen": 5908725760,
"step": 11270,
"train_runtime": 51182.2082,
"train_tokens_per_second": 115444.917
},
{
"epoch": 0.6104061257068644,
"grad_norm": 0.1484087109565735,
"learning_rate": 0.002058077671086047,
"loss": 3.0283117294311523,
"num_input_tokens_seen": 5913968640,
"step": 11280,
"train_runtime": 51227.347,
"train_tokens_per_second": 115445.538
},
{
"epoch": 0.6109472658892287,
"grad_norm": 0.140775665640831,
"learning_rate": 0.002054337878812808,
"loss": 3.026752471923828,
"num_input_tokens_seen": 5919211520,
"step": 11290,
"train_runtime": 51272.4758,
"train_tokens_per_second": 115446.181
},
{
"epoch": 0.6114884060715928,
"grad_norm": 0.14487655460834503,
"learning_rate": 0.002050600210364022,
"loss": 3.0381233215332033,
"num_input_tokens_seen": 5924454400,
"step": 11300,
"train_runtime": 51317.6163,
"train_tokens_per_second": 115446.796
},
{
"epoch": 0.6120295462539571,
"grad_norm": 0.13244298100471497,
"learning_rate": 0.0020468646771506184,
"loss": 3.037242889404297,
"num_input_tokens_seen": 5929697280,
"step": 11310,
"train_runtime": 51362.7685,
"train_tokens_per_second": 115447.384
},
{
"epoch": 0.6125706864363213,
"grad_norm": 0.13805389404296875,
"learning_rate": 0.002043131290577007,
"loss": 3.034191703796387,
"num_input_tokens_seen": 5934940160,
"step": 11320,
"train_runtime": 51407.9443,
"train_tokens_per_second": 115447.919
},
{
"epoch": 0.6131118266186856,
"grad_norm": 0.13927042484283447,
"learning_rate": 0.002039400062041048,
"loss": 3.0405059814453126,
"num_input_tokens_seen": 5940183040,
"step": 11330,
"train_runtime": 51453.1187,
"train_tokens_per_second": 115448.455
},
{
"epoch": 0.6136529668010499,
"grad_norm": 0.13962484896183014,
"learning_rate": 0.0020356710029340096,
"loss": 3.0331016540527345,
"num_input_tokens_seen": 5945425920,
"step": 11340,
"train_runtime": 51498.2896,
"train_tokens_per_second": 115448.998
},
{
"epoch": 0.614194106983414,
"grad_norm": 0.2101336121559143,
"learning_rate": 0.0020319441246405357,
"loss": 3.028001594543457,
"num_input_tokens_seen": 5950668800,
"step": 11350,
"train_runtime": 51543.4451,
"train_tokens_per_second": 115449.574
},
{
"epoch": 0.6147352471657783,
"grad_norm": 0.14625418186187744,
"learning_rate": 0.0020282194385386173,
"loss": 3.0344852447509765,
"num_input_tokens_seen": 5955911680,
"step": 11360,
"train_runtime": 51588.6188,
"train_tokens_per_second": 115450.109
},
{
"epoch": 0.6152763873481425,
"grad_norm": 0.1353636085987091,
"learning_rate": 0.002024496955999548,
"loss": 3.0306270599365233,
"num_input_tokens_seen": 5961154560,
"step": 11370,
"train_runtime": 51633.7613,
"train_tokens_per_second": 115450.713
},
{
"epoch": 0.6158175275305068,
"grad_norm": 0.1368403434753418,
"learning_rate": 0.0020207766883878955,
"loss": 3.0311580657958985,
"num_input_tokens_seen": 5966397440,
"step": 11380,
"train_runtime": 51678.9232,
"train_tokens_per_second": 115451.272
},
{
"epoch": 0.6163586677128711,
"grad_norm": 0.14600516855716705,
"learning_rate": 0.0020170586470614656,
"loss": 3.0117847442626955,
"num_input_tokens_seen": 5971640320,
"step": 11390,
"train_runtime": 51724.0749,
"train_tokens_per_second": 115451.853
},
{
"epoch": 0.6168998078952352,
"grad_norm": 0.14564567804336548,
"learning_rate": 0.002013342843371269,
"loss": 3.037702941894531,
"num_input_tokens_seen": 5976883200,
"step": 11400,
"train_runtime": 51769.2278,
"train_tokens_per_second": 115452.431
},
{
"epoch": 0.6174409480775995,
"grad_norm": 0.1405801773071289,
"learning_rate": 0.0020096292886614825,
"loss": 3.0343984603881835,
"num_input_tokens_seen": 5982126080,
"step": 11410,
"train_runtime": 51814.3769,
"train_tokens_per_second": 115453.016
},
{
"epoch": 0.6179820882599637,
"grad_norm": 0.14390794932842255,
"learning_rate": 0.002005917994269417,
"loss": 3.023337173461914,
"num_input_tokens_seen": 5987368960,
"step": 11420,
"train_runtime": 51859.5123,
"train_tokens_per_second": 115453.63
},
{
"epoch": 0.618523228442328,
"grad_norm": 0.14093852043151855,
"learning_rate": 0.0020022089715254847,
"loss": 3.0304771423339845,
"num_input_tokens_seen": 5992611840,
"step": 11430,
"train_runtime": 51904.644,
"train_tokens_per_second": 115454.252
},
{
"epoch": 0.6190643686246923,
"grad_norm": 0.1447506844997406,
"learning_rate": 0.001998502231753161,
"loss": 3.030156898498535,
"num_input_tokens_seen": 5997854720,
"step": 11440,
"train_runtime": 51949.7701,
"train_tokens_per_second": 115454.885
},
{
"epoch": 0.6196055088070564,
"grad_norm": 0.1445121169090271,
"learning_rate": 0.001994797786268952,
"loss": 3.0251228332519533,
"num_input_tokens_seen": 6003097600,
"step": 11450,
"train_runtime": 51998.5254,
"train_tokens_per_second": 115447.458
},
{
"epoch": 0.6201466489894207,
"grad_norm": 0.15314067900180817,
"learning_rate": 0.0019910956463823587,
"loss": 3.022572135925293,
"num_input_tokens_seen": 6008340480,
"step": 11460,
"train_runtime": 52043.6251,
"train_tokens_per_second": 115448.155
},
{
"epoch": 0.6206877891717849,
"grad_norm": 0.1409369558095932,
"learning_rate": 0.0019873958233958444,
"loss": 3.024155044555664,
"num_input_tokens_seen": 6013583360,
"step": 11470,
"train_runtime": 52088.7237,
"train_tokens_per_second": 115448.852
},
{
"epoch": 0.6212289293541492,
"grad_norm": 0.15012867748737335,
"learning_rate": 0.0019836983286047995,
"loss": 3.0334211349487306,
"num_input_tokens_seen": 6018826240,
"step": 11480,
"train_runtime": 52133.8249,
"train_tokens_per_second": 115449.543
},
{
"epoch": 0.6217700695365135,
"grad_norm": 0.14085648953914642,
"learning_rate": 0.0019800031732975032,
"loss": 3.0264703750610353,
"num_input_tokens_seen": 6024069120,
"step": 11490,
"train_runtime": 52178.9281,
"train_tokens_per_second": 115450.228
},
{
"epoch": 0.6223112097188777,
"grad_norm": 0.14266955852508545,
"learning_rate": 0.001976310368755096,
"loss": 3.032570648193359,
"num_input_tokens_seen": 6029312000,
"step": 11500,
"train_runtime": 52224.0378,
"train_tokens_per_second": 115450.897
},
{
"epoch": 0.6223112097188777,
"eval_loss": 2.984975814819336,
"eval_runtime": 1.9819,
"eval_samples_per_second": 252.288,
"eval_steps_per_second": 4.037,
"num_input_tokens_seen": 6029312000,
"step": 11500
},
{
"epoch": 0.6228523499012419,
"grad_norm": 0.13947011530399323,
"learning_rate": 0.001972619926251541,
"loss": 3.0404077529907227,
"num_input_tokens_seen": 6034554880,
"step": 11510,
"train_runtime": 52271.1258,
"train_tokens_per_second": 115447.195
},
{
"epoch": 0.6233934900836061,
"grad_norm": 0.1446669101715088,
"learning_rate": 0.001968931857053588,
"loss": 3.021891784667969,
"num_input_tokens_seen": 6039797760,
"step": 11520,
"train_runtime": 52316.2406,
"train_tokens_per_second": 115447.855
},
{
"epoch": 0.6239346302659704,
"grad_norm": 0.13946829736232758,
"learning_rate": 0.0019652461724207425,
"loss": 3.0241966247558594,
"num_input_tokens_seen": 6045040640,
"step": 11530,
"train_runtime": 52361.3587,
"train_tokens_per_second": 115448.506
},
{
"epoch": 0.6244757704483347,
"grad_norm": 0.14458313584327698,
"learning_rate": 0.0019615628836052324,
"loss": 3.0141645431518556,
"num_input_tokens_seen": 6050283520,
"step": 11540,
"train_runtime": 52406.4606,
"train_tokens_per_second": 115449.192
},
{
"epoch": 0.6250169106306989,
"grad_norm": 0.14436115324497223,
"learning_rate": 0.0019578820018519663,
"loss": 3.0331525802612305,
"num_input_tokens_seen": 6055526400,
"step": 11550,
"train_runtime": 52451.5704,
"train_tokens_per_second": 115449.859
},
{
"epoch": 0.6255580508130631,
"grad_norm": 0.14012649655342102,
"learning_rate": 0.0019542035383985083,
"loss": 3.043803405761719,
"num_input_tokens_seen": 6060769280,
"step": 11560,
"train_runtime": 52496.6939,
"train_tokens_per_second": 115450.495
},
{
"epoch": 0.6260991909954273,
"grad_norm": 0.14854469895362854,
"learning_rate": 0.0019505275044750371,
"loss": 3.0200592041015626,
"num_input_tokens_seen": 6066012160,
"step": 11570,
"train_runtime": 52541.8152,
"train_tokens_per_second": 115451.134
},
{
"epoch": 0.6266403311777916,
"grad_norm": 0.15853960812091827,
"learning_rate": 0.0019468539113043166,
"loss": 3.020526885986328,
"num_input_tokens_seen": 6071255040,
"step": 11580,
"train_runtime": 52586.931,
"train_tokens_per_second": 115451.785
},
{
"epoch": 0.6271814713601559,
"grad_norm": 0.14197298884391785,
"learning_rate": 0.0019431827701016575,
"loss": 3.0370616912841797,
"num_input_tokens_seen": 6076497920,
"step": 11590,
"train_runtime": 52632.0665,
"train_tokens_per_second": 115452.391
},
{
"epoch": 0.6277226115425201,
"grad_norm": 0.1305466592311859,
"learning_rate": 0.0019395140920748827,
"loss": 3.023914337158203,
"num_input_tokens_seen": 6081740800,
"step": 11600,
"train_runtime": 52677.2162,
"train_tokens_per_second": 115452.965
},
{
"epoch": 0.6282637517248844,
"grad_norm": 0.1447763293981552,
"learning_rate": 0.0019358478884243008,
"loss": 3.024199676513672,
"num_input_tokens_seen": 6086983680,
"step": 11610,
"train_runtime": 52722.3572,
"train_tokens_per_second": 115453.557
},
{
"epoch": 0.6288048919072485,
"grad_norm": 0.14126408100128174,
"learning_rate": 0.0019321841703426608,
"loss": 3.022255706787109,
"num_input_tokens_seen": 6092226560,
"step": 11620,
"train_runtime": 52767.4813,
"train_tokens_per_second": 115454.185
},
{
"epoch": 0.6293460320896128,
"grad_norm": 0.1334850788116455,
"learning_rate": 0.0019285229490151263,
"loss": 3.0233287811279297,
"num_input_tokens_seen": 6097469440,
"step": 11630,
"train_runtime": 52812.6435,
"train_tokens_per_second": 115454.729
},
{
"epoch": 0.6298871722719771,
"grad_norm": 0.14635370671749115,
"learning_rate": 0.0019248642356192365,
"loss": 3.03590087890625,
"num_input_tokens_seen": 6102712320,
"step": 11640,
"train_runtime": 52857.8692,
"train_tokens_per_second": 115455.133
},
{
"epoch": 0.6304283124543413,
"grad_norm": 0.13621026277542114,
"learning_rate": 0.0019212080413248762,
"loss": 3.023410415649414,
"num_input_tokens_seen": 6107955200,
"step": 11650,
"train_runtime": 52903.1239,
"train_tokens_per_second": 115455.473
},
{
"epoch": 0.6309694526367056,
"grad_norm": 0.14006845653057098,
"learning_rate": 0.0019175543772942383,
"loss": 3.020222473144531,
"num_input_tokens_seen": 6113198080,
"step": 11660,
"train_runtime": 52948.2709,
"train_tokens_per_second": 115456.047
},
{
"epoch": 0.6315105928190697,
"grad_norm": 0.13746832311153412,
"learning_rate": 0.0019139032546817902,
"loss": 3.0225994110107424,
"num_input_tokens_seen": 6118440960,
"step": 11670,
"train_runtime": 52993.4226,
"train_tokens_per_second": 115456.611
},
{
"epoch": 0.632051733001434,
"grad_norm": 0.13812102377414703,
"learning_rate": 0.0019102546846342411,
"loss": 3.0324447631835936,
"num_input_tokens_seen": 6123683840,
"step": 11680,
"train_runtime": 53038.5588,
"train_tokens_per_second": 115457.207
},
{
"epoch": 0.6325928731837983,
"grad_norm": 0.14019303023815155,
"learning_rate": 0.0019066086782905097,
"loss": 3.022325897216797,
"num_input_tokens_seen": 6128926720,
"step": 11690,
"train_runtime": 53083.7143,
"train_tokens_per_second": 115457.76
},
{
"epoch": 0.6331340133661625,
"grad_norm": 0.1436738818883896,
"learning_rate": 0.0019029652467816838,
"loss": 3.0244091033935545,
"num_input_tokens_seen": 6134169600,
"step": 11700,
"train_runtime": 53128.868,
"train_tokens_per_second": 115458.315
},
{
"epoch": 0.6336751535485268,
"grad_norm": 0.14594176411628723,
"learning_rate": 0.0018993244012309913,
"loss": 3.025048828125,
"num_input_tokens_seen": 6139412480,
"step": 11710,
"train_runtime": 53173.9948,
"train_tokens_per_second": 115458.929
},
{
"epoch": 0.634216293730891,
"grad_norm": 0.15385597944259644,
"learning_rate": 0.0018956861527537688,
"loss": 3.0213130950927733,
"num_input_tokens_seen": 6144655360,
"step": 11720,
"train_runtime": 53219.1405,
"train_tokens_per_second": 115459.5
},
{
"epoch": 0.6347574339132552,
"grad_norm": 0.14445240795612335,
"learning_rate": 0.0018920505124574195,
"loss": 3.029845428466797,
"num_input_tokens_seen": 6149898240,
"step": 11730,
"train_runtime": 53264.2928,
"train_tokens_per_second": 115460.056
},
{
"epoch": 0.6352985740956195,
"grad_norm": 0.1384369432926178,
"learning_rate": 0.001888417491441387,
"loss": 3.0266345977783202,
"num_input_tokens_seen": 6155141120,
"step": 11740,
"train_runtime": 53309.4606,
"train_tokens_per_second": 115460.578
},
{
"epoch": 0.6358397142779837,
"grad_norm": 0.14229294657707214,
"learning_rate": 0.0018847871007971163,
"loss": 3.017131042480469,
"num_input_tokens_seen": 6160384000,
"step": 11750,
"train_runtime": 53354.6359,
"train_tokens_per_second": 115461.082
},
{
"epoch": 0.636380854460348,
"grad_norm": 0.14127928018569946,
"learning_rate": 0.0018811593516080234,
"loss": 3.021234703063965,
"num_input_tokens_seen": 6165626880,
"step": 11760,
"train_runtime": 53399.8028,
"train_tokens_per_second": 115461.604
},
{
"epoch": 0.6369219946427122,
"grad_norm": 0.13989216089248657,
"learning_rate": 0.0018775342549494606,
"loss": 3.0207067489624024,
"num_input_tokens_seen": 6170869760,
"step": 11770,
"train_runtime": 53444.9593,
"train_tokens_per_second": 115462.147
},
{
"epoch": 0.6374631348250764,
"grad_norm": 0.141075000166893,
"learning_rate": 0.0018739118218886802,
"loss": 3.017308807373047,
"num_input_tokens_seen": 6176112640,
"step": 11780,
"train_runtime": 53490.1129,
"train_tokens_per_second": 115462.696
},
{
"epoch": 0.6380042750074407,
"grad_norm": 0.1446276307106018,
"learning_rate": 0.0018702920634848035,
"loss": 3.0272090911865233,
"num_input_tokens_seen": 6181355520,
"step": 11790,
"train_runtime": 53535.2546,
"train_tokens_per_second": 115463.269
},
{
"epoch": 0.6385454151898049,
"grad_norm": 0.14022940397262573,
"learning_rate": 0.001866674990788788,
"loss": 3.0206020355224608,
"num_input_tokens_seen": 6186598400,
"step": 11800,
"train_runtime": 53580.432,
"train_tokens_per_second": 115463.765
},
{
"epoch": 0.6390865553721692,
"grad_norm": 0.1391611397266388,
"learning_rate": 0.0018630606148433892,
"loss": 3.0259307861328124,
"num_input_tokens_seen": 6191841280,
"step": 11810,
"train_runtime": 53625.6025,
"train_tokens_per_second": 115464.274
},
{
"epoch": 0.6396276955545334,
"grad_norm": 0.1375313252210617,
"learning_rate": 0.0018594489466831293,
"loss": 3.019388198852539,
"num_input_tokens_seen": 6197084160,
"step": 11820,
"train_runtime": 53670.752,
"train_tokens_per_second": 115464.828
},
{
"epoch": 0.6401688357368976,
"grad_norm": 0.13326410949230194,
"learning_rate": 0.0018558399973342677,
"loss": 3.0195072174072264,
"num_input_tokens_seen": 6202327040,
"step": 11830,
"train_runtime": 53719.4728,
"train_tokens_per_second": 115457.705
},
{
"epoch": 0.6407099759192619,
"grad_norm": 0.1425514966249466,
"learning_rate": 0.0018522337778147586,
"loss": 3.012344741821289,
"num_input_tokens_seen": 6207569920,
"step": 11840,
"train_runtime": 53764.8643,
"train_tokens_per_second": 115457.744
},
{
"epoch": 0.6412511161016261,
"grad_norm": 0.14373312890529633,
"learning_rate": 0.001848630299134224,
"loss": 3.0200828552246093,
"num_input_tokens_seen": 6212812800,
"step": 11850,
"train_runtime": 53810.7174,
"train_tokens_per_second": 115456.792
},
{
"epoch": 0.6417922562839904,
"grad_norm": 0.14393045008182526,
"learning_rate": 0.0018450295722939214,
"loss": 3.0205759048461913,
"num_input_tokens_seen": 6218055680,
"step": 11860,
"train_runtime": 53856.71,
"train_tokens_per_second": 115455.543
},
{
"epoch": 0.6423333964663546,
"grad_norm": 0.13831470906734467,
"learning_rate": 0.0018414316082867015,
"loss": 3.018105697631836,
"num_input_tokens_seen": 6223298560,
"step": 11870,
"train_runtime": 53902.8725,
"train_tokens_per_second": 115453.932
},
{
"epoch": 0.6428745366487189,
"grad_norm": 0.14368562400341034,
"learning_rate": 0.0018378364180969837,
"loss": 3.0205171585083006,
"num_input_tokens_seen": 6228541440,
"step": 11880,
"train_runtime": 53949.0344,
"train_tokens_per_second": 115452.325
},
{
"epoch": 0.6434156768310831,
"grad_norm": 0.134961798787117,
"learning_rate": 0.0018342440127007181,
"loss": 3.0208873748779297,
"num_input_tokens_seen": 6233784320,
"step": 11890,
"train_runtime": 53994.8144,
"train_tokens_per_second": 115451.537
},
{
"epoch": 0.6439568170134473,
"grad_norm": 0.139762744307518,
"learning_rate": 0.0018306544030653531,
"loss": 3.0138370513916017,
"num_input_tokens_seen": 6239027200,
"step": 11900,
"train_runtime": 54040.0794,
"train_tokens_per_second": 115451.851
},
{
"epoch": 0.6444979571958116,
"grad_norm": 0.15458019077777863,
"learning_rate": 0.0018270676001498033,
"loss": 3.025080108642578,
"num_input_tokens_seen": 6244270080,
"step": 11910,
"train_runtime": 54085.3315,
"train_tokens_per_second": 115452.192
},
{
"epoch": 0.6450390973781758,
"grad_norm": 0.13538894057273865,
"learning_rate": 0.001823483614904411,
"loss": 3.016307830810547,
"num_input_tokens_seen": 6249512960,
"step": 11920,
"train_runtime": 54130.6042,
"train_tokens_per_second": 115452.489
},
{
"epoch": 0.6455802375605401,
"grad_norm": 0.13436593115329742,
"learning_rate": 0.0018199024582709177,
"loss": 3.0229183197021485,
"num_input_tokens_seen": 6254755840,
"step": 11930,
"train_runtime": 54175.8479,
"train_tokens_per_second": 115452.846
},
{
"epoch": 0.6461213777429043,
"grad_norm": 0.1262059211730957,
"learning_rate": 0.0018163241411824327,
"loss": 3.0243408203125,
"num_input_tokens_seen": 6259998720,
"step": 11940,
"train_runtime": 54221.0877,
"train_tokens_per_second": 115453.212
},
{
"epoch": 0.6466625179252685,
"grad_norm": 0.14077694714069366,
"learning_rate": 0.0018127486745633914,
"loss": 3.009103775024414,
"num_input_tokens_seen": 6265241600,
"step": 11950,
"train_runtime": 54266.3714,
"train_tokens_per_second": 115453.483
},
{
"epoch": 0.6472036581076328,
"grad_norm": 0.14338544011116028,
"learning_rate": 0.001809176069329529,
"loss": 3.019987106323242,
"num_input_tokens_seen": 6270484480,
"step": 11960,
"train_runtime": 54311.6327,
"train_tokens_per_second": 115453.802
},
{
"epoch": 0.647744798289997,
"grad_norm": 0.1309393346309662,
"learning_rate": 0.001805606336387845,
"loss": 3.0178783416748045,
"num_input_tokens_seen": 6275727360,
"step": 11970,
"train_runtime": 54356.8873,
"train_tokens_per_second": 115454.134
},
{
"epoch": 0.6482859384723613,
"grad_norm": 0.1303347647190094,
"learning_rate": 0.0018020394866365714,
"loss": 3.0253570556640623,
"num_input_tokens_seen": 6280970240,
"step": 11980,
"train_runtime": 54402.1335,
"train_tokens_per_second": 115454.484
},
{
"epoch": 0.6488270786547256,
"grad_norm": 0.14178113639354706,
"learning_rate": 0.0017984755309651346,
"loss": 3.0267719268798827,
"num_input_tokens_seen": 6286213120,
"step": 11990,
"train_runtime": 54447.3835,
"train_tokens_per_second": 115454.825
},
{
"epoch": 0.6493682188370897,
"grad_norm": 0.1430656611919403,
"learning_rate": 0.0017949144802541274,
"loss": 3.0143644332885744,
"num_input_tokens_seen": 6291456000,
"step": 12000,
"train_runtime": 54492.6535,
"train_tokens_per_second": 115455.123
},
{
"epoch": 0.6493682188370897,
"eval_loss": 2.9761710166931152,
"eval_runtime": 1.9875,
"eval_samples_per_second": 251.575,
"eval_steps_per_second": 4.025,
"num_input_tokens_seen": 6291456000,
"step": 12000
},
{
"epoch": 0.649909359019454,
"grad_norm": 0.14500592648983002,
"learning_rate": 0.0017913563453752746,
"loss": 3.018670654296875,
"num_input_tokens_seen": 6296698880,
"step": 12010,
"train_runtime": 54542.2937,
"train_tokens_per_second": 115446.169
},
{
"epoch": 0.6504504992018182,
"grad_norm": 0.1448933333158493,
"learning_rate": 0.0017878011371913977,
"loss": 3.0202388763427734,
"num_input_tokens_seen": 6301941760,
"step": 12020,
"train_runtime": 54587.5091,
"train_tokens_per_second": 115446.59
},
{
"epoch": 0.6509916393841825,
"grad_norm": 0.1533222645521164,
"learning_rate": 0.0017842488665563833,
"loss": 3.025776672363281,
"num_input_tokens_seen": 6307184640,
"step": 12030,
"train_runtime": 54632.7175,
"train_tokens_per_second": 115447.024
},
{
"epoch": 0.6515327795665468,
"grad_norm": 0.13312490284442902,
"learning_rate": 0.0017806995443151524,
"loss": 3.0187503814697267,
"num_input_tokens_seen": 6312427520,
"step": 12040,
"train_runtime": 54677.8786,
"train_tokens_per_second": 115447.557
},
{
"epoch": 0.6520739197489109,
"grad_norm": 0.13797084987163544,
"learning_rate": 0.0017771531813036206,
"loss": 3.019959259033203,
"num_input_tokens_seen": 6317670400,
"step": 12050,
"train_runtime": 54723.0506,
"train_tokens_per_second": 115448.067
},
{
"epoch": 0.6526150599312752,
"grad_norm": 0.13628187775611877,
"learning_rate": 0.0017736097883486713,
"loss": 3.012210655212402,
"num_input_tokens_seen": 6322913280,
"step": 12060,
"train_runtime": 54768.2437,
"train_tokens_per_second": 115448.531
},
{
"epoch": 0.6531562001136394,
"grad_norm": 0.13764619827270508,
"learning_rate": 0.001770069376268119,
"loss": 3.0185993194580076,
"num_input_tokens_seen": 6328156160,
"step": 12070,
"train_runtime": 54813.436,
"train_tokens_per_second": 115448.996
},
{
"epoch": 0.6536973402960037,
"grad_norm": 0.14087094366550446,
"learning_rate": 0.001766531955870682,
"loss": 3.0167076110839846,
"num_input_tokens_seen": 6333399040,
"step": 12080,
"train_runtime": 54858.6498,
"train_tokens_per_second": 115449.415
},
{
"epoch": 0.654238480478368,
"grad_norm": 0.13622906804084778,
"learning_rate": 0.0017629975379559405,
"loss": 3.021717643737793,
"num_input_tokens_seen": 6338641920,
"step": 12090,
"train_runtime": 54903.8659,
"train_tokens_per_second": 115449.829
},
{
"epoch": 0.6547796206607321,
"grad_norm": 0.13120146095752716,
"learning_rate": 0.001759466133314308,
"loss": 3.0197391510009766,
"num_input_tokens_seen": 6343884800,
"step": 12100,
"train_runtime": 54949.0889,
"train_tokens_per_second": 115450.227
},
{
"epoch": 0.6553207608430964,
"grad_norm": 0.139594167470932,
"learning_rate": 0.001755937752727003,
"loss": 3.0223533630371096,
"num_input_tokens_seen": 6349127680,
"step": 12110,
"train_runtime": 54994.2934,
"train_tokens_per_second": 115450.664
},
{
"epoch": 0.6558619010254606,
"grad_norm": 0.15013989806175232,
"learning_rate": 0.001752412406966008,
"loss": 3.0148881912231444,
"num_input_tokens_seen": 6354370560,
"step": 12120,
"train_runtime": 55039.5071,
"train_tokens_per_second": 115451.08
},
{
"epoch": 0.6564030412078249,
"grad_norm": 0.13876710832118988,
"learning_rate": 0.0017488901067940416,
"loss": 3.0114933013916017,
"num_input_tokens_seen": 6359613440,
"step": 12130,
"train_runtime": 55084.7162,
"train_tokens_per_second": 115451.506
},
{
"epoch": 0.6569441813901892,
"grad_norm": 0.13125662505626678,
"learning_rate": 0.0017453708629645238,
"loss": 3.004977226257324,
"num_input_tokens_seen": 6364856320,
"step": 12140,
"train_runtime": 55129.9188,
"train_tokens_per_second": 115451.944
},
{
"epoch": 0.6574853215725533,
"grad_norm": 0.14310745894908905,
"learning_rate": 0.0017418546862215448,
"loss": 3.0219293594360352,
"num_input_tokens_seen": 6370099200,
"step": 12150,
"train_runtime": 55175.1468,
"train_tokens_per_second": 115452.329
},
{
"epoch": 0.6580264617549176,
"grad_norm": 0.1343064159154892,
"learning_rate": 0.0017383415872998303,
"loss": 3.017044258117676,
"num_input_tokens_seen": 6375342080,
"step": 12160,
"train_runtime": 55220.3693,
"train_tokens_per_second": 115452.724
},
{
"epoch": 0.6585676019372818,
"grad_norm": 0.13533759117126465,
"learning_rate": 0.0017348315769247086,
"loss": 3.0149707794189453,
"num_input_tokens_seen": 6380584960,
"step": 12170,
"train_runtime": 55265.5973,
"train_tokens_per_second": 115453.108
},
{
"epoch": 0.6591087421196461,
"grad_norm": 0.1386122703552246,
"learning_rate": 0.0017313246658120804,
"loss": 3.0143962860107423,
"num_input_tokens_seen": 6385827840,
"step": 12180,
"train_runtime": 55310.8039,
"train_tokens_per_second": 115453.535
},
{
"epoch": 0.6596498823020104,
"grad_norm": 0.1343347579240799,
"learning_rate": 0.0017278208646683856,
"loss": 3.0179080963134766,
"num_input_tokens_seen": 6391070720,
"step": 12190,
"train_runtime": 55356.0418,
"train_tokens_per_second": 115453.896
},
{
"epoch": 0.6601910224843746,
"grad_norm": 0.14169900119304657,
"learning_rate": 0.0017243201841905666,
"loss": 3.0247045516967774,
"num_input_tokens_seen": 6396313600,
"step": 12200,
"train_runtime": 55401.2599,
"train_tokens_per_second": 115454.299
},
{
"epoch": 0.6607321626667388,
"grad_norm": 0.13514377176761627,
"learning_rate": 0.0017208226350660391,
"loss": 3.0104536056518554,
"num_input_tokens_seen": 6401556480,
"step": 12210,
"train_runtime": 55450.8592,
"train_tokens_per_second": 115445.578
},
{
"epoch": 0.661273302849103,
"grad_norm": 0.13756819069385529,
"learning_rate": 0.0017173282279726609,
"loss": 3.0194664001464844,
"num_input_tokens_seen": 6406799360,
"step": 12220,
"train_runtime": 55496.1437,
"train_tokens_per_second": 115445.848
},
{
"epoch": 0.6618144430314673,
"grad_norm": 0.13056276738643646,
"learning_rate": 0.0017138369735786954,
"loss": 3.0248437881469727,
"num_input_tokens_seen": 6412042240,
"step": 12230,
"train_runtime": 55541.3669,
"train_tokens_per_second": 115446.245
},
{
"epoch": 0.6623555832138316,
"grad_norm": 0.13981449604034424,
"learning_rate": 0.0017103488825427826,
"loss": 3.0129575729370117,
"num_input_tokens_seen": 6417285120,
"step": 12240,
"train_runtime": 55586.7139,
"train_tokens_per_second": 115446.384
},
{
"epoch": 0.6628967233961958,
"grad_norm": 0.1439344733953476,
"learning_rate": 0.0017068639655139026,
"loss": 3.022663116455078,
"num_input_tokens_seen": 6422528000,
"step": 12250,
"train_runtime": 55632.0146,
"train_tokens_per_second": 115446.619
},
{
"epoch": 0.66343786357856,
"grad_norm": 0.15030835568904877,
"learning_rate": 0.001703382233131348,
"loss": 3.012424850463867,
"num_input_tokens_seen": 6427770880,
"step": 12260,
"train_runtime": 55677.2926,
"train_tokens_per_second": 115446.901
},
{
"epoch": 0.6639790037609242,
"grad_norm": 0.13960550725460052,
"learning_rate": 0.0016999036960246871,
"loss": 3.0081478118896485,
"num_input_tokens_seen": 6433013760,
"step": 12270,
"train_runtime": 55722.6028,
"train_tokens_per_second": 115447.115
},
{
"epoch": 0.6645201439432885,
"grad_norm": 0.13627994060516357,
"learning_rate": 0.0016964283648137329,
"loss": 3.0084842681884765,
"num_input_tokens_seen": 6438256640,
"step": 12280,
"train_runtime": 55767.8798,
"train_tokens_per_second": 115447.398
},
{
"epoch": 0.6650612841256528,
"grad_norm": 0.14768123626708984,
"learning_rate": 0.0016929562501085123,
"loss": 3.013652801513672,
"num_input_tokens_seen": 6443499520,
"step": 12290,
"train_runtime": 55813.1427,
"train_tokens_per_second": 115447.71
},
{
"epoch": 0.665602424308017,
"grad_norm": 0.14207823574543,
"learning_rate": 0.0016894873625092333,
"loss": 3.0111804962158204,
"num_input_tokens_seen": 6448742400,
"step": 12300,
"train_runtime": 55858.4112,
"train_tokens_per_second": 115448.01
},
{
"epoch": 0.6661435644903813,
"grad_norm": 0.1379329413175583,
"learning_rate": 0.0016860217126062479,
"loss": 3.0187799453735353,
"num_input_tokens_seen": 6453985280,
"step": 12310,
"train_runtime": 55903.6646,
"train_tokens_per_second": 115448.34
},
{
"epoch": 0.6666847046727454,
"grad_norm": 0.14401622116565704,
"learning_rate": 0.0016825593109800264,
"loss": 3.0228382110595704,
"num_input_tokens_seen": 6459228160,
"step": 12320,
"train_runtime": 55948.9475,
"train_tokens_per_second": 115448.609
},
{
"epoch": 0.6672258448551097,
"grad_norm": 0.12955419719219208,
"learning_rate": 0.0016791001682011227,
"loss": 3.0097047805786135,
"num_input_tokens_seen": 6464471040,
"step": 12330,
"train_runtime": 55994.314,
"train_tokens_per_second": 115448.705
},
{
"epoch": 0.667766985037474,
"grad_norm": 0.14710277318954468,
"learning_rate": 0.0016756442948301386,
"loss": 3.0169065475463865,
"num_input_tokens_seen": 6469713920,
"step": 12340,
"train_runtime": 56039.7555,
"train_tokens_per_second": 115448.646
},
{
"epoch": 0.6683081252198382,
"grad_norm": 0.1326688975095749,
"learning_rate": 0.0016721917014176982,
"loss": 3.009653663635254,
"num_input_tokens_seen": 6474956800,
"step": 12350,
"train_runtime": 56085.2141,
"train_tokens_per_second": 115448.553
},
{
"epoch": 0.6688492654022025,
"grad_norm": 0.13903285562992096,
"learning_rate": 0.0016687423985044109,
"loss": 3.019660758972168,
"num_input_tokens_seen": 6480199680,
"step": 12360,
"train_runtime": 56130.7366,
"train_tokens_per_second": 115448.328
},
{
"epoch": 0.6693904055845666,
"grad_norm": 0.13976383209228516,
"learning_rate": 0.0016652963966208385,
"loss": 3.0172367095947266,
"num_input_tokens_seen": 6485442560,
"step": 12370,
"train_runtime": 56176.2788,
"train_tokens_per_second": 115448.063
},
{
"epoch": 0.6699315457669309,
"grad_norm": 0.13633348047733307,
"learning_rate": 0.0016618537062874665,
"loss": 3.004638671875,
"num_input_tokens_seen": 6490685440,
"step": 12380,
"train_runtime": 56221.949,
"train_tokens_per_second": 115447.535
},
{
"epoch": 0.6704726859492952,
"grad_norm": 0.14074033498764038,
"learning_rate": 0.001658414338014669,
"loss": 3.019020843505859,
"num_input_tokens_seen": 6495928320,
"step": 12390,
"train_runtime": 56267.7615,
"train_tokens_per_second": 115446.717
},
{
"epoch": 0.6710138261316594,
"grad_norm": 0.1326296180486679,
"learning_rate": 0.0016549783023026808,
"loss": 3.0110851287841798,
"num_input_tokens_seen": 6501171200,
"step": 12400,
"train_runtime": 56313.0806,
"train_tokens_per_second": 115446.911
},
{
"epoch": 0.6715549663140237,
"grad_norm": 0.13860943913459778,
"learning_rate": 0.001651545609641561,
"loss": 3.0090118408203126,
"num_input_tokens_seen": 6506414080,
"step": 12410,
"train_runtime": 56358.3912,
"train_tokens_per_second": 115447.122
},
{
"epoch": 0.6720961064963878,
"grad_norm": 0.1410975605249405,
"learning_rate": 0.0016481162705111604,
"loss": 3.0008705139160154,
"num_input_tokens_seen": 6511656960,
"step": 12420,
"train_runtime": 56403.6982,
"train_tokens_per_second": 115447.341
},
{
"epoch": 0.6726372466787521,
"grad_norm": 0.13546454906463623,
"learning_rate": 0.0016446902953810964,
"loss": 3.013086700439453,
"num_input_tokens_seen": 6516899840,
"step": 12430,
"train_runtime": 56448.9891,
"train_tokens_per_second": 115447.592
},
{
"epoch": 0.6731783868611164,
"grad_norm": 0.13547931611537933,
"learning_rate": 0.0016412676947107113,
"loss": 3.004133605957031,
"num_input_tokens_seen": 6522142720,
"step": 12440,
"train_runtime": 56494.2857,
"train_tokens_per_second": 115447.831
},
{
"epoch": 0.6737195270434806,
"grad_norm": 0.13898716866970062,
"learning_rate": 0.0016378484789490479,
"loss": 3.015100860595703,
"num_input_tokens_seen": 6527385600,
"step": 12450,
"train_runtime": 56539.5755,
"train_tokens_per_second": 115448.083
},
{
"epoch": 0.6742606672258449,
"grad_norm": 0.1385628879070282,
"learning_rate": 0.0016344326585348147,
"loss": 3.018421936035156,
"num_input_tokens_seen": 6532628480,
"step": 12460,
"train_runtime": 56584.8917,
"train_tokens_per_second": 115448.281
},
{
"epoch": 0.674801807408209,
"grad_norm": 0.13880495727062225,
"learning_rate": 0.001631020243896355,
"loss": 3.0016693115234374,
"num_input_tokens_seen": 6537871360,
"step": 12470,
"train_runtime": 56630.196,
"train_tokens_per_second": 115448.503
},
{
"epoch": 0.6753429475905733,
"grad_norm": 0.1371801793575287,
"learning_rate": 0.0016276112454516134,
"loss": 3.0135356903076174,
"num_input_tokens_seen": 6543114240,
"step": 12480,
"train_runtime": 56675.5074,
"train_tokens_per_second": 115448.71
},
{
"epoch": 0.6758840877729376,
"grad_norm": 0.1398102194070816,
"learning_rate": 0.001624205673608104,
"loss": 3.0212148666381835,
"num_input_tokens_seen": 6548357120,
"step": 12490,
"train_runtime": 56720.8046,
"train_tokens_per_second": 115448.946
},
{
"epoch": 0.6764252279553018,
"grad_norm": 0.1300211250782013,
"learning_rate": 0.0016208035387628825,
"loss": 3.0142328262329103,
"num_input_tokens_seen": 6553600000,
"step": 12500,
"train_runtime": 56766.0883,
"train_tokens_per_second": 115449.209
},
{
"epoch": 0.6764252279553018,
"eval_loss": 2.968597412109375,
"eval_runtime": 1.9925,
"eval_samples_per_second": 250.941,
"eval_steps_per_second": 4.015,
"num_input_tokens_seen": 6553600000,
"step": 12500
},
{
"epoch": 0.6769663681376661,
"grad_norm": 0.14369215071201324,
"learning_rate": 0.0016174048513025103,
"loss": 3.0048513412475586,
"num_input_tokens_seen": 6558842880,
"step": 12510,
"train_runtime": 56813.3987,
"train_tokens_per_second": 115445.353
},
{
"epoch": 0.6775075083200303,
"grad_norm": 0.14692343771457672,
"learning_rate": 0.0016140096216030232,
"loss": 3.0137935638427735,
"num_input_tokens_seen": 6564085760,
"step": 12520,
"train_runtime": 56858.6904,
"train_tokens_per_second": 115445.602
},
{
"epoch": 0.6780486485023945,
"grad_norm": 0.14028270542621613,
"learning_rate": 0.0016106178600299001,
"loss": 3.010356140136719,
"num_input_tokens_seen": 6569328640,
"step": 12530,
"train_runtime": 56903.9761,
"train_tokens_per_second": 115445.863
},
{
"epoch": 0.6785897886847588,
"grad_norm": 0.12822629511356354,
"learning_rate": 0.0016072295769380353,
"loss": 3.0003124237060548,
"num_input_tokens_seen": 6574571520,
"step": 12540,
"train_runtime": 56949.26,
"train_tokens_per_second": 115446.127
},
{
"epoch": 0.679130928867123,
"grad_norm": 0.1369100958108902,
"learning_rate": 0.0016038447826716993,
"loss": 3.0066249847412108,
"num_input_tokens_seen": 6579814400,
"step": 12550,
"train_runtime": 56994.5681,
"train_tokens_per_second": 115446.342
},
{
"epoch": 0.6796720690494873,
"grad_norm": 0.14047878980636597,
"learning_rate": 0.001600463487564515,
"loss": 3.0145965576171876,
"num_input_tokens_seen": 6585057280,
"step": 12560,
"train_runtime": 57039.861,
"train_tokens_per_second": 115446.587
},
{
"epoch": 0.6802132092318515,
"grad_norm": 0.14242438971996307,
"learning_rate": 0.001597085701939419,
"loss": 3.0166095733642577,
"num_input_tokens_seen": 6590300160,
"step": 12570,
"train_runtime": 57085.1398,
"train_tokens_per_second": 115446.86
},
{
"epoch": 0.6807543494142158,
"grad_norm": 0.1383470743894577,
"learning_rate": 0.0015937114361086369,
"loss": 3.0075637817382814,
"num_input_tokens_seen": 6595543040,
"step": 12580,
"train_runtime": 57130.4343,
"train_tokens_per_second": 115447.101
},
{
"epoch": 0.68129548959658,
"grad_norm": 0.1291186362504959,
"learning_rate": 0.0015903407003736466,
"loss": 3.01377010345459,
"num_input_tokens_seen": 6600785920,
"step": 12590,
"train_runtime": 57180.0264,
"train_tokens_per_second": 115438.665
},
{
"epoch": 0.6818366297789442,
"grad_norm": 0.13580311834812164,
"learning_rate": 0.0015869735050251489,
"loss": 3.0099231719970705,
"num_input_tokens_seen": 6606028800,
"step": 12600,
"train_runtime": 57225.3092,
"train_tokens_per_second": 115438.936
},
{
"epoch": 0.6823777699613085,
"grad_norm": 0.1437922716140747,
"learning_rate": 0.0015836098603430357,
"loss": 3.0034923553466797,
"num_input_tokens_seen": 6611271680,
"step": 12610,
"train_runtime": 57270.5349,
"train_tokens_per_second": 115439.321
},
{
"epoch": 0.6829189101436727,
"grad_norm": 0.13526742160320282,
"learning_rate": 0.0015802497765963614,
"loss": 3.00305061340332,
"num_input_tokens_seen": 6616514560,
"step": 12620,
"train_runtime": 57315.7589,
"train_tokens_per_second": 115439.709
},
{
"epoch": 0.683460050326037,
"grad_norm": 0.1404607594013214,
"learning_rate": 0.0015768932640433059,
"loss": 3.0041690826416017,
"num_input_tokens_seen": 6621757440,
"step": 12630,
"train_runtime": 57360.9936,
"train_tokens_per_second": 115440.076
},
{
"epoch": 0.6840011905084012,
"grad_norm": 0.13756705820560455,
"learning_rate": 0.0015735403329311469,
"loss": 2.9982038497924806,
"num_input_tokens_seen": 6627000320,
"step": 12640,
"train_runtime": 57406.2268,
"train_tokens_per_second": 115440.444
},
{
"epoch": 0.6845423306907654,
"grad_norm": 0.14006656408309937,
"learning_rate": 0.0015701909934962305,
"loss": 3.009762763977051,
"num_input_tokens_seen": 6632243200,
"step": 12650,
"train_runtime": 57451.4583,
"train_tokens_per_second": 115440.816
},
{
"epoch": 0.6850834708731297,
"grad_norm": 0.13317948579788208,
"learning_rate": 0.001566845255963934,
"loss": 3.0151742935180663,
"num_input_tokens_seen": 6637486080,
"step": 12660,
"train_runtime": 57496.7057,
"train_tokens_per_second": 115441.154
},
{
"epoch": 0.6856246110554939,
"grad_norm": 0.13669337332248688,
"learning_rate": 0.0015635031305486417,
"loss": 3.000714874267578,
"num_input_tokens_seen": 6642728960,
"step": 12670,
"train_runtime": 57541.9394,
"train_tokens_per_second": 115441.52
},
{
"epoch": 0.6861657512378582,
"grad_norm": 0.13967348635196686,
"learning_rate": 0.0015601646274537087,
"loss": 3.0043874740600587,
"num_input_tokens_seen": 6647971840,
"step": 12680,
"train_runtime": 57587.1773,
"train_tokens_per_second": 115441.877
},
{
"epoch": 0.6867068914202225,
"grad_norm": 0.13815197348594666,
"learning_rate": 0.0015568297568714312,
"loss": 3.010976219177246,
"num_input_tokens_seen": 6653214720,
"step": 12690,
"train_runtime": 57632.4045,
"train_tokens_per_second": 115442.255
},
{
"epoch": 0.6872480316025866,
"grad_norm": 0.1381223499774933,
"learning_rate": 0.001553498528983015,
"loss": 3.013303756713867,
"num_input_tokens_seen": 6658457600,
"step": 12700,
"train_runtime": 57677.6438,
"train_tokens_per_second": 115442.608
},
{
"epoch": 0.6877891717849509,
"grad_norm": 0.13350199162960052,
"learning_rate": 0.0015501709539585454,
"loss": 3.012788009643555,
"num_input_tokens_seen": 6663700480,
"step": 12710,
"train_runtime": 57722.8853,
"train_tokens_per_second": 115442.956
},
{
"epoch": 0.6883303119673151,
"grad_norm": 0.13979476690292358,
"learning_rate": 0.0015468470419569564,
"loss": 3.0098241806030273,
"num_input_tokens_seen": 6668943360,
"step": 12720,
"train_runtime": 57768.1112,
"train_tokens_per_second": 115443.334
},
{
"epoch": 0.6888714521496794,
"grad_norm": 0.13748957216739655,
"learning_rate": 0.0015435268031259992,
"loss": 3.009090805053711,
"num_input_tokens_seen": 6674186240,
"step": 12730,
"train_runtime": 57813.3636,
"train_tokens_per_second": 115443.659
},
{
"epoch": 0.6894125923320437,
"grad_norm": 0.13561367988586426,
"learning_rate": 0.0015402102476022095,
"loss": 3.008078765869141,
"num_input_tokens_seen": 6679429120,
"step": 12740,
"train_runtime": 57858.572,
"train_tokens_per_second": 115444.072
},
{
"epoch": 0.6899537325144078,
"grad_norm": 0.12914767861366272,
"learning_rate": 0.0015368973855108782,
"loss": 3.0018003463745115,
"num_input_tokens_seen": 6684672000,
"step": 12750,
"train_runtime": 57903.8186,
"train_tokens_per_second": 115444.407
},
{
"epoch": 0.6904948726967721,
"grad_norm": 0.14038655161857605,
"learning_rate": 0.0015335882269660217,
"loss": 3.004079818725586,
"num_input_tokens_seen": 6689914880,
"step": 12760,
"train_runtime": 57949.0509,
"train_tokens_per_second": 115444.771
},
{
"epoch": 0.6910360128791363,
"grad_norm": 0.13866056501865387,
"learning_rate": 0.001530282782070348,
"loss": 3.009323310852051,
"num_input_tokens_seen": 6695157760,
"step": 12770,
"train_runtime": 57994.2931,
"train_tokens_per_second": 115445.114
},
{
"epoch": 0.6915771530615006,
"grad_norm": 0.1286270171403885,
"learning_rate": 0.001526981060915229,
"loss": 3.000651550292969,
"num_input_tokens_seen": 6700400640,
"step": 12780,
"train_runtime": 58039.518,
"train_tokens_per_second": 115445.491
},
{
"epoch": 0.6921182932438649,
"grad_norm": 0.13248993456363678,
"learning_rate": 0.0015236830735806679,
"loss": 3.0101812362670897,
"num_input_tokens_seen": 6705643520,
"step": 12790,
"train_runtime": 58084.7779,
"train_tokens_per_second": 115445.798
},
{
"epoch": 0.692659433426229,
"grad_norm": 0.1369810700416565,
"learning_rate": 0.0015203888301352675,
"loss": 3.004811477661133,
"num_input_tokens_seen": 6710886400,
"step": 12800,
"train_runtime": 58130.0044,
"train_tokens_per_second": 115446.171
},
{
"epoch": 0.6932005736085933,
"grad_norm": 0.14264971017837524,
"learning_rate": 0.001517098340636202,
"loss": 3.010848808288574,
"num_input_tokens_seen": 6716129280,
"step": 12810,
"train_runtime": 58175.241,
"train_tokens_per_second": 115446.523
},
{
"epoch": 0.6937417137909575,
"grad_norm": 0.1406365931034088,
"learning_rate": 0.0015138116151291825,
"loss": 3.0090103149414062,
"num_input_tokens_seen": 6721372160,
"step": 12820,
"train_runtime": 58220.4724,
"train_tokens_per_second": 115446.885
},
{
"epoch": 0.6942828539733218,
"grad_norm": 0.13356050848960876,
"learning_rate": 0.0015105286636484334,
"loss": 2.999258613586426,
"num_input_tokens_seen": 6726615040,
"step": 12830,
"train_runtime": 58265.7054,
"train_tokens_per_second": 115447.243
},
{
"epoch": 0.6948239941556861,
"grad_norm": 0.13091513514518738,
"learning_rate": 0.001507249496216654,
"loss": 3.005986785888672,
"num_input_tokens_seen": 6731857920,
"step": 12840,
"train_runtime": 58310.9354,
"train_tokens_per_second": 115447.606
},
{
"epoch": 0.6953651343380503,
"grad_norm": 0.1335466355085373,
"learning_rate": 0.0015039741228449904,
"loss": 2.9974597930908202,
"num_input_tokens_seen": 6737100800,
"step": 12850,
"train_runtime": 58356.1736,
"train_tokens_per_second": 115447.953
},
{
"epoch": 0.6959062745204145,
"grad_norm": 0.1375114917755127,
"learning_rate": 0.0015007025535330083,
"loss": 3.0074440002441407,
"num_input_tokens_seen": 6742343680,
"step": 12860,
"train_runtime": 58401.3717,
"train_tokens_per_second": 115448.379
},
{
"epoch": 0.6964474147027787,
"grad_norm": 0.15171852707862854,
"learning_rate": 0.001497434798268658,
"loss": 2.996272659301758,
"num_input_tokens_seen": 6747586560,
"step": 12870,
"train_runtime": 58446.5932,
"train_tokens_per_second": 115448.757
},
{
"epoch": 0.696988554885143,
"grad_norm": 0.13725285232067108,
"learning_rate": 0.0014941708670282445,
"loss": 3.0174352645874025,
"num_input_tokens_seen": 6752829440,
"step": 12880,
"train_runtime": 58491.8411,
"train_tokens_per_second": 115449.083
},
{
"epoch": 0.6975296950675073,
"grad_norm": 0.1326073855161667,
"learning_rate": 0.0014909107697764006,
"loss": 3.006754684448242,
"num_input_tokens_seen": 6758072320,
"step": 12890,
"train_runtime": 58537.0682,
"train_tokens_per_second": 115449.45
},
{
"epoch": 0.6980708352498715,
"grad_norm": 0.1453487128019333,
"learning_rate": 0.0014876545164660543,
"loss": 3.003281021118164,
"num_input_tokens_seen": 6763315200,
"step": 12900,
"train_runtime": 58582.3109,
"train_tokens_per_second": 115449.785
},
{
"epoch": 0.6986119754322357,
"grad_norm": 0.13233183324337006,
"learning_rate": 0.001484402117038397,
"loss": 3.0117160797119142,
"num_input_tokens_seen": 6768558080,
"step": 12910,
"train_runtime": 58627.5472,
"train_tokens_per_second": 115450.132
},
{
"epoch": 0.6991531156145999,
"grad_norm": 0.1383819729089737,
"learning_rate": 0.0014811535814228522,
"loss": 3.0003276824951173,
"num_input_tokens_seen": 6773800960,
"step": 12920,
"train_runtime": 58672.7881,
"train_tokens_per_second": 115450.47
},
{
"epoch": 0.6996942557969642,
"grad_norm": 0.13273653388023376,
"learning_rate": 0.0014779089195370515,
"loss": 3.006727600097656,
"num_input_tokens_seen": 6779043840,
"step": 12930,
"train_runtime": 58718.0154,
"train_tokens_per_second": 115450.834
},
{
"epoch": 0.7002353959793285,
"grad_norm": 0.13412410020828247,
"learning_rate": 0.0014746681412867993,
"loss": 2.9990608215332033,
"num_input_tokens_seen": 6784286720,
"step": 12940,
"train_runtime": 58763.2242,
"train_tokens_per_second": 115451.234
},
{
"epoch": 0.7007765361616927,
"grad_norm": 0.13567864894866943,
"learning_rate": 0.0014714312565660412,
"loss": 3.001424789428711,
"num_input_tokens_seen": 6789529600,
"step": 12950,
"train_runtime": 58808.4491,
"train_tokens_per_second": 115451.601
},
{
"epoch": 0.701317676344057,
"grad_norm": 0.12947793304920197,
"learning_rate": 0.0014681982752568368,
"loss": 2.9996448516845704,
"num_input_tokens_seen": 6794772480,
"step": 12960,
"train_runtime": 58853.6594,
"train_tokens_per_second": 115451.997
},
{
"epoch": 0.7018588165264211,
"grad_norm": 0.1319398730993271,
"learning_rate": 0.001464969207229331,
"loss": 3.0077224731445313,
"num_input_tokens_seen": 6800015360,
"step": 12970,
"train_runtime": 58898.8938,
"train_tokens_per_second": 115452.344
},
{
"epoch": 0.7023999567087854,
"grad_norm": 0.14026153087615967,
"learning_rate": 0.0014617440623417178,
"loss": 2.999114227294922,
"num_input_tokens_seen": 6805258240,
"step": 12980,
"train_runtime": 58948.6295,
"train_tokens_per_second": 115443.875
},
{
"epoch": 0.7029410968911497,
"grad_norm": 0.14495210349559784,
"learning_rate": 0.0014585228504402185,
"loss": 3.005875015258789,
"num_input_tokens_seen": 6810501120,
"step": 12990,
"train_runtime": 58994.0959,
"train_tokens_per_second": 115443.775
},
{
"epoch": 0.7034822370735139,
"grad_norm": 0.13643252849578857,
"learning_rate": 0.001455305581359043,
"loss": 2.997660255432129,
"num_input_tokens_seen": 6815744000,
"step": 13000,
"train_runtime": 59039.5206,
"train_tokens_per_second": 115443.756
},
{
"epoch": 0.7034822370735139,
"eval_loss": 2.960465669631958,
"eval_runtime": 1.987,
"eval_samples_per_second": 251.641,
"eval_steps_per_second": 4.026,
"num_input_tokens_seen": 6815744000,
"step": 13000
},
{
"epoch": 0.7040233772558782,
"grad_norm": 0.130798801779747,
"learning_rate": 0.001452092264920367,
"loss": 3.0002573013305662,
"num_input_tokens_seen": 6820986880,
"step": 13010,
"train_runtime": 59089.599,
"train_tokens_per_second": 115434.645
},
{
"epoch": 0.7045645174382423,
"grad_norm": 0.13077320158481598,
"learning_rate": 0.001448882910934297,
"loss": 3.00850830078125,
"num_input_tokens_seen": 6826229760,
"step": 13020,
"train_runtime": 59135.0207,
"train_tokens_per_second": 115434.639
},
{
"epoch": 0.7051056576206066,
"grad_norm": 0.14131614565849304,
"learning_rate": 0.0014456775291988434,
"loss": 3.0077110290527345,
"num_input_tokens_seen": 6831472640,
"step": 13030,
"train_runtime": 59180.4577,
"train_tokens_per_second": 115434.603
},
{
"epoch": 0.7056467978029709,
"grad_norm": 0.13815636932849884,
"learning_rate": 0.0014424761294998883,
"loss": 3.00131778717041,
"num_input_tokens_seen": 6836715520,
"step": 13040,
"train_runtime": 59225.9087,
"train_tokens_per_second": 115434.54
},
{
"epoch": 0.7061879379853351,
"grad_norm": 0.1329071819782257,
"learning_rate": 0.0014392787216111597,
"loss": 2.994339370727539,
"num_input_tokens_seen": 6841958400,
"step": 13050,
"train_runtime": 59271.3336,
"train_tokens_per_second": 115434.528
},
{
"epoch": 0.7067290781676994,
"grad_norm": 0.13561072945594788,
"learning_rate": 0.0014360853152941958,
"loss": 3.0034358978271483,
"num_input_tokens_seen": 6847201280,
"step": 13060,
"train_runtime": 59316.7359,
"train_tokens_per_second": 115434.56
},
{
"epoch": 0.7072702183500635,
"grad_norm": 0.13618333637714386,
"learning_rate": 0.0014328959202983182,
"loss": 3.0087270736694336,
"num_input_tokens_seen": 6852444160,
"step": 13070,
"train_runtime": 59362.09,
"train_tokens_per_second": 115434.685
},
{
"epoch": 0.7078113585324278,
"grad_norm": 0.1365492194890976,
"learning_rate": 0.0014297105463606044,
"loss": 3.0061859130859374,
"num_input_tokens_seen": 6857687040,
"step": 13080,
"train_runtime": 59407.4452,
"train_tokens_per_second": 115434.808
},
{
"epoch": 0.7083524987147921,
"grad_norm": 0.13774985074996948,
"learning_rate": 0.001426529203205853,
"loss": 3.010288429260254,
"num_input_tokens_seen": 6862929920,
"step": 13090,
"train_runtime": 59452.8193,
"train_tokens_per_second": 115434.894
},
{
"epoch": 0.7088936388971563,
"grad_norm": 0.1349509209394455,
"learning_rate": 0.00142335190054656,
"loss": 3.000904846191406,
"num_input_tokens_seen": 6868172800,
"step": 13100,
"train_runtime": 59498.1377,
"train_tokens_per_second": 115435.089
},
{
"epoch": 0.7094347790795206,
"grad_norm": 0.1314682513475418,
"learning_rate": 0.0014201786480828838,
"loss": 3.0022382736206055,
"num_input_tokens_seen": 6873415680,
"step": 13110,
"train_runtime": 59543.4355,
"train_tokens_per_second": 115435.322
},
{
"epoch": 0.7099759192618847,
"grad_norm": 0.14362597465515137,
"learning_rate": 0.0014170094555026182,
"loss": 2.9901851654052733,
"num_input_tokens_seen": 6878658560,
"step": 13120,
"train_runtime": 59588.6836,
"train_tokens_per_second": 115435.652
},
{
"epoch": 0.710517059444249,
"grad_norm": 0.13301101326942444,
"learning_rate": 0.0014138443324811618,
"loss": 3.0021732330322264,
"num_input_tokens_seen": 6883901440,
"step": 13130,
"train_runtime": 59633.9351,
"train_tokens_per_second": 115435.975
},
{
"epoch": 0.7110581996266133,
"grad_norm": 0.13076400756835938,
"learning_rate": 0.0014106832886814891,
"loss": 3.0049604415893554,
"num_input_tokens_seen": 6889144320,
"step": 13140,
"train_runtime": 59679.1572,
"train_tokens_per_second": 115436.354
},
{
"epoch": 0.7115993398089775,
"grad_norm": 0.13057680428028107,
"learning_rate": 0.0014075263337541223,
"loss": 3.009153938293457,
"num_input_tokens_seen": 6894387200,
"step": 13150,
"train_runtime": 59724.3952,
"train_tokens_per_second": 115436.702
},
{
"epoch": 0.7121404799913418,
"grad_norm": 0.13498692214488983,
"learning_rate": 0.0014043734773370997,
"loss": 2.996112060546875,
"num_input_tokens_seen": 6899630080,
"step": 13160,
"train_runtime": 59769.5992,
"train_tokens_per_second": 115437.115
},
{
"epoch": 0.712681620173706,
"grad_norm": 0.13407272100448608,
"learning_rate": 0.0014012247290559466,
"loss": 3.0008213043212892,
"num_input_tokens_seen": 6904872960,
"step": 13170,
"train_runtime": 59814.8054,
"train_tokens_per_second": 115437.523
},
{
"epoch": 0.7132227603560702,
"grad_norm": 0.14042150974273682,
"learning_rate": 0.0013980800985236468,
"loss": 2.9953586578369142,
"num_input_tokens_seen": 6910115840,
"step": 13180,
"train_runtime": 59859.9779,
"train_tokens_per_second": 115437.995
},
{
"epoch": 0.7137639005384345,
"grad_norm": 0.13807494938373566,
"learning_rate": 0.0013949395953406127,
"loss": 2.9886444091796873,
"num_input_tokens_seen": 6915358720,
"step": 13190,
"train_runtime": 59905.1537,
"train_tokens_per_second": 115438.461
},
{
"epoch": 0.7143050407207987,
"grad_norm": 0.13666392862796783,
"learning_rate": 0.0013918032290946552,
"loss": 3.0074825286865234,
"num_input_tokens_seen": 6920601600,
"step": 13200,
"train_runtime": 59950.322,
"train_tokens_per_second": 115438.94
},
{
"epoch": 0.714846180903163,
"grad_norm": 0.12777790427207947,
"learning_rate": 0.0013886710093609566,
"loss": 2.9995635986328124,
"num_input_tokens_seen": 6925844480,
"step": 13210,
"train_runtime": 59995.4811,
"train_tokens_per_second": 115439.436
},
{
"epoch": 0.7153873210855272,
"grad_norm": 0.13057056069374084,
"learning_rate": 0.0013855429457020408,
"loss": 2.993345260620117,
"num_input_tokens_seen": 6931087360,
"step": 13220,
"train_runtime": 60040.6669,
"train_tokens_per_second": 115439.88
},
{
"epoch": 0.7159284612678914,
"grad_norm": 0.13309696316719055,
"learning_rate": 0.0013824190476677417,
"loss": 2.9962528228759764,
"num_input_tokens_seen": 6936330240,
"step": 13230,
"train_runtime": 60085.8338,
"train_tokens_per_second": 115440.359
},
{
"epoch": 0.7164696014502557,
"grad_norm": 0.13253308832645416,
"learning_rate": 0.0013792993247951752,
"loss": 3.001760482788086,
"num_input_tokens_seen": 6941573120,
"step": 13240,
"train_runtime": 60130.9838,
"train_tokens_per_second": 115440.871
},
{
"epoch": 0.7170107416326199,
"grad_norm": 0.14509917795658112,
"learning_rate": 0.001376183786608712,
"loss": 2.999083137512207,
"num_input_tokens_seen": 6946816000,
"step": 13250,
"train_runtime": 60176.1243,
"train_tokens_per_second": 115441.399
},
{
"epoch": 0.7175518818149842,
"grad_norm": 0.13013510406017303,
"learning_rate": 0.001373072442619947,
"loss": 3.0021896362304688,
"num_input_tokens_seen": 6952058880,
"step": 13260,
"train_runtime": 60221.2777,
"train_tokens_per_second": 115441.903
},
{
"epoch": 0.7180930219973484,
"grad_norm": 0.1433565616607666,
"learning_rate": 0.0013699653023276715,
"loss": 2.999072265625,
"num_input_tokens_seen": 6957301760,
"step": 13270,
"train_runtime": 60266.4098,
"train_tokens_per_second": 115442.446
},
{
"epoch": 0.7186341621797127,
"grad_norm": 0.13696636259555817,
"learning_rate": 0.0013668623752178402,
"loss": 2.991237258911133,
"num_input_tokens_seen": 6962544640,
"step": 13280,
"train_runtime": 60311.5633,
"train_tokens_per_second": 115442.948
},
{
"epoch": 0.7191753023620769,
"grad_norm": 0.134785458445549,
"learning_rate": 0.0013637636707635485,
"loss": 3.002344512939453,
"num_input_tokens_seen": 6967787520,
"step": 13290,
"train_runtime": 60356.7015,
"train_tokens_per_second": 115443.478
},
{
"epoch": 0.7197164425444411,
"grad_norm": 0.13965272903442383,
"learning_rate": 0.0013606691984249973,
"loss": 2.9921356201171876,
"num_input_tokens_seen": 6973030400,
"step": 13300,
"train_runtime": 60401.8497,
"train_tokens_per_second": 115443.988
},
{
"epoch": 0.7202575827268054,
"grad_norm": 0.1369258165359497,
"learning_rate": 0.0013575789676494676,
"loss": 2.9890642166137695,
"num_input_tokens_seen": 6978273280,
"step": 13310,
"train_runtime": 60447.02,
"train_tokens_per_second": 115444.455
},
{
"epoch": 0.7207987229091696,
"grad_norm": 0.1361692249774933,
"learning_rate": 0.0013544929878712931,
"loss": 3.0067501068115234,
"num_input_tokens_seen": 6983516160,
"step": 13320,
"train_runtime": 60492.1531,
"train_tokens_per_second": 115444.993
},
{
"epoch": 0.7213398630915339,
"grad_norm": 0.13645213842391968,
"learning_rate": 0.0013514112685118279,
"loss": 2.99460506439209,
"num_input_tokens_seen": 6988759040,
"step": 13330,
"train_runtime": 60537.2701,
"train_tokens_per_second": 115445.56
},
{
"epoch": 0.7218810032738981,
"grad_norm": 0.13640370965003967,
"learning_rate": 0.0013483338189794198,
"loss": 3.0064407348632813,
"num_input_tokens_seen": 6994001920,
"step": 13340,
"train_runtime": 60582.4237,
"train_tokens_per_second": 115446.057
},
{
"epoch": 0.7224221434562623,
"grad_norm": 0.13847370445728302,
"learning_rate": 0.0013452606486693793,
"loss": 2.990389823913574,
"num_input_tokens_seen": 6999244800,
"step": 13350,
"train_runtime": 60627.5832,
"train_tokens_per_second": 115446.542
},
{
"epoch": 0.7229632836386266,
"grad_norm": 0.14565610885620117,
"learning_rate": 0.001342191766963955,
"loss": 2.9985219955444338,
"num_input_tokens_seen": 7004487680,
"step": 13360,
"train_runtime": 60676.4805,
"train_tokens_per_second": 115439.914
},
{
"epoch": 0.7235044238209908,
"grad_norm": 0.13583402335643768,
"learning_rate": 0.0013391271832323016,
"loss": 3.000563049316406,
"num_input_tokens_seen": 7009730560,
"step": 13370,
"train_runtime": 60721.6176,
"train_tokens_per_second": 115440.445
},
{
"epoch": 0.7240455640033551,
"grad_norm": 0.13164934515953064,
"learning_rate": 0.0013360669068304526,
"loss": 2.993762969970703,
"num_input_tokens_seen": 7014973440,
"step": 13380,
"train_runtime": 60766.7453,
"train_tokens_per_second": 115440.993
},
{
"epoch": 0.7245867041857194,
"grad_norm": 0.13159868121147156,
"learning_rate": 0.001333010947101289,
"loss": 2.9905731201171877,
"num_input_tokens_seen": 7020216320,
"step": 13390,
"train_runtime": 60811.8966,
"train_tokens_per_second": 115441.496
},
{
"epoch": 0.7251278443680835,
"grad_norm": 0.1346818059682846,
"learning_rate": 0.001329959313374518,
"loss": 3.002712631225586,
"num_input_tokens_seen": 7025459200,
"step": 13400,
"train_runtime": 60857.0386,
"train_tokens_per_second": 115442.016
},
{
"epoch": 0.7256689845504478,
"grad_norm": 0.1322467029094696,
"learning_rate": 0.0013269120149666353,
"loss": 2.9997226715087892,
"num_input_tokens_seen": 7030702080,
"step": 13410,
"train_runtime": 60902.1814,
"train_tokens_per_second": 115442.533
},
{
"epoch": 0.726210124732812,
"grad_norm": 0.13496780395507812,
"learning_rate": 0.0013238690611809029,
"loss": 3.00130615234375,
"num_input_tokens_seen": 7035944960,
"step": 13420,
"train_runtime": 60947.3114,
"train_tokens_per_second": 115443.074
},
{
"epoch": 0.7267512649151763,
"grad_norm": 0.13476966321468353,
"learning_rate": 0.0013208304613073197,
"loss": 2.9966285705566404,
"num_input_tokens_seen": 7041187840,
"step": 13430,
"train_runtime": 60992.4581,
"train_tokens_per_second": 115443.582
},
{
"epoch": 0.7272924050975406,
"grad_norm": 0.13049598038196564,
"learning_rate": 0.0013177962246225905,
"loss": 3.0012109756469725,
"num_input_tokens_seen": 7046430720,
"step": 13440,
"train_runtime": 61037.614,
"train_tokens_per_second": 115444.072
},
{
"epoch": 0.7278335452799047,
"grad_norm": 0.1286519169807434,
"learning_rate": 0.0013147663603901006,
"loss": 2.9998191833496093,
"num_input_tokens_seen": 7051673600,
"step": 13450,
"train_runtime": 61082.7378,
"train_tokens_per_second": 115444.622
},
{
"epoch": 0.728374685462269,
"grad_norm": 0.13326317071914673,
"learning_rate": 0.0013117408778598853,
"loss": 2.980904769897461,
"num_input_tokens_seen": 7056916480,
"step": 13460,
"train_runtime": 61127.8727,
"train_tokens_per_second": 115445.151
},
{
"epoch": 0.7289158256446332,
"grad_norm": 0.13441520929336548,
"learning_rate": 0.001308719786268604,
"loss": 3.0028324127197266,
"num_input_tokens_seen": 7062159360,
"step": 13470,
"train_runtime": 61173.0008,
"train_tokens_per_second": 115445.691
},
{
"epoch": 0.7294569658269975,
"grad_norm": 0.13160498440265656,
"learning_rate": 0.0013057030948395115,
"loss": 2.990519332885742,
"num_input_tokens_seen": 7067402240,
"step": 13480,
"train_runtime": 61218.1024,
"train_tokens_per_second": 115446.281
},
{
"epoch": 0.7299981060093618,
"grad_norm": 0.13775858283042908,
"learning_rate": 0.001302690812782427,
"loss": 3.006916046142578,
"num_input_tokens_seen": 7072645120,
"step": 13490,
"train_runtime": 61263.2414,
"train_tokens_per_second": 115446.799
},
{
"epoch": 0.7305392461917259,
"grad_norm": 0.13651160895824432,
"learning_rate": 0.0012996829492937084,
"loss": 3.000609016418457,
"num_input_tokens_seen": 7077888000,
"step": 13500,
"train_runtime": 61308.388,
"train_tokens_per_second": 115447.302
},
{
"epoch": 0.7305392461917259,
"eval_loss": 2.9539315700531006,
"eval_runtime": 1.9872,
"eval_samples_per_second": 251.611,
"eval_steps_per_second": 4.026,
"num_input_tokens_seen": 7077888000,
"step": 13500
},
{
"epoch": 0.7310803863740902,
"grad_norm": 0.1339404284954071,
"learning_rate": 0.001296679513556226,
"loss": 2.9880565643310546,
"num_input_tokens_seen": 7083130880,
"step": 13510,
"train_runtime": 61355.5007,
"train_tokens_per_second": 115444.105
},
{
"epoch": 0.7316215265564544,
"grad_norm": 0.1354180872440338,
"learning_rate": 0.0012936805147393292,
"loss": 2.9919578552246096,
"num_input_tokens_seen": 7088373760,
"step": 13520,
"train_runtime": 61400.641,
"train_tokens_per_second": 115444.622
},
{
"epoch": 0.7321626667388187,
"grad_norm": 0.13503789901733398,
"learning_rate": 0.0012906859619988247,
"loss": 2.99132080078125,
"num_input_tokens_seen": 7093616640,
"step": 13530,
"train_runtime": 61445.7513,
"train_tokens_per_second": 115445.193
},
{
"epoch": 0.732703806921183,
"grad_norm": 0.13498766720294952,
"learning_rate": 0.0012876958644769446,
"loss": 2.9880552291870117,
"num_input_tokens_seen": 7098859520,
"step": 13540,
"train_runtime": 61490.8935,
"train_tokens_per_second": 115445.704
},
{
"epoch": 0.7332449471035472,
"grad_norm": 0.13910213112831116,
"learning_rate": 0.0012847102313023185,
"loss": 2.996448516845703,
"num_input_tokens_seen": 7104102400,
"step": 13550,
"train_runtime": 61536.0395,
"train_tokens_per_second": 115446.208
},
{
"epoch": 0.7337860872859114,
"grad_norm": 0.13978877663612366,
"learning_rate": 0.0012817290715899468,
"loss": 2.9948408126831056,
"num_input_tokens_seen": 7109345280,
"step": 13560,
"train_runtime": 61581.1749,
"train_tokens_per_second": 115446.73
},
{
"epoch": 0.7343272274682756,
"grad_norm": 0.12929198145866394,
"learning_rate": 0.0012787523944411728,
"loss": 2.990352821350098,
"num_input_tokens_seen": 7114588160,
"step": 13570,
"train_runtime": 61626.3208,
"train_tokens_per_second": 115447.232
},
{
"epoch": 0.7348683676506399,
"grad_norm": 0.12884965538978577,
"learning_rate": 0.001275780208943655,
"loss": 2.9938125610351562,
"num_input_tokens_seen": 7119831040,
"step": 13580,
"train_runtime": 61671.467,
"train_tokens_per_second": 115447.733
},
{
"epoch": 0.7354095078330042,
"grad_norm": 0.13231875002384186,
"learning_rate": 0.0012728125241713403,
"loss": 2.9899265289306642,
"num_input_tokens_seen": 7125073920,
"step": 13590,
"train_runtime": 61716.5949,
"train_tokens_per_second": 115448.267
},
{
"epoch": 0.7359506480153684,
"grad_norm": 0.13000380992889404,
"learning_rate": 0.001269849349184432,
"loss": 2.997477722167969,
"num_input_tokens_seen": 7130316800,
"step": 13600,
"train_runtime": 61761.7628,
"train_tokens_per_second": 115448.725
},
{
"epoch": 0.7364917881977326,
"grad_norm": 0.13756293058395386,
"learning_rate": 0.0012668906930293686,
"loss": 2.9921825408935545,
"num_input_tokens_seen": 7135559680,
"step": 13610,
"train_runtime": 61806.8862,
"train_tokens_per_second": 115449.266
},
{
"epoch": 0.7370329283800968,
"grad_norm": 0.134871244430542,
"learning_rate": 0.0012639365647387907,
"loss": 2.991608238220215,
"num_input_tokens_seen": 7140802560,
"step": 13620,
"train_runtime": 61852.0353,
"train_tokens_per_second": 115449.759
},
{
"epoch": 0.7375740685624611,
"grad_norm": 0.13307398557662964,
"learning_rate": 0.0012609869733315145,
"loss": 2.994303512573242,
"num_input_tokens_seen": 7146045440,
"step": 13630,
"train_runtime": 61897.1942,
"train_tokens_per_second": 115450.232
},
{
"epoch": 0.7381152087448254,
"grad_norm": 0.1326708197593689,
"learning_rate": 0.0012580419278125086,
"loss": 2.9904823303222656,
"num_input_tokens_seen": 7151288320,
"step": 13640,
"train_runtime": 61942.3523,
"train_tokens_per_second": 115450.706
},
{
"epoch": 0.7386563489271896,
"grad_norm": 0.13145731389522552,
"learning_rate": 0.0012551014371728615,
"loss": 2.991769790649414,
"num_input_tokens_seen": 7156531200,
"step": 13650,
"train_runtime": 61987.491,
"train_tokens_per_second": 115451.216
},
{
"epoch": 0.7391974891095539,
"grad_norm": 0.13033975660800934,
"learning_rate": 0.0012521655103897556,
"loss": 2.9962963104248046,
"num_input_tokens_seen": 7161774080,
"step": 13660,
"train_runtime": 62032.6128,
"train_tokens_per_second": 115451.756
},
{
"epoch": 0.739738629291918,
"grad_norm": 0.13624544441699982,
"learning_rate": 0.0012492341564264394,
"loss": 2.9916343688964844,
"num_input_tokens_seen": 7167016960,
"step": 13670,
"train_runtime": 62077.7496,
"train_tokens_per_second": 115452.268
},
{
"epoch": 0.7402797694742823,
"grad_norm": 0.12694226205348969,
"learning_rate": 0.0012463073842322032,
"loss": 2.9956790924072267,
"num_input_tokens_seen": 7172259840,
"step": 13680,
"train_runtime": 62122.8901,
"train_tokens_per_second": 115452.772
},
{
"epoch": 0.7408209096566466,
"grad_norm": 0.14218159019947052,
"learning_rate": 0.0012433852027423462,
"loss": 2.9924745559692383,
"num_input_tokens_seen": 7177502720,
"step": 13690,
"train_runtime": 62168.0831,
"train_tokens_per_second": 115453.177
},
{
"epoch": 0.7413620498390108,
"grad_norm": 0.13965629041194916,
"learning_rate": 0.0012404676208781556,
"loss": 2.9898683547973635,
"num_input_tokens_seen": 7182745600,
"step": 13700,
"train_runtime": 62213.3158,
"train_tokens_per_second": 115453.509
},
{
"epoch": 0.7419031900213751,
"grad_norm": 0.13439473509788513,
"learning_rate": 0.0012375546475468736,
"loss": 2.99302978515625,
"num_input_tokens_seen": 7187988480,
"step": 13710,
"train_runtime": 62258.5518,
"train_tokens_per_second": 115453.834
},
{
"epoch": 0.7424443302037392,
"grad_norm": 0.13322672247886658,
"learning_rate": 0.0012346462916416746,
"loss": 2.9867807388305665,
"num_input_tokens_seen": 7193231360,
"step": 13720,
"train_runtime": 62303.7184,
"train_tokens_per_second": 115454.287
},
{
"epoch": 0.7429854703861035,
"grad_norm": 0.13469451665878296,
"learning_rate": 0.001231742562041635,
"loss": 2.9933212280273436,
"num_input_tokens_seen": 7198474240,
"step": 13730,
"train_runtime": 62348.8665,
"train_tokens_per_second": 115454.773
},
{
"epoch": 0.7435266105684678,
"grad_norm": 0.1325179785490036,
"learning_rate": 0.001228843467611706,
"loss": 2.9945892333984374,
"num_input_tokens_seen": 7203717120,
"step": 13740,
"train_runtime": 62397.9384,
"train_tokens_per_second": 115447.999
},
{
"epoch": 0.744067750750832,
"grad_norm": 0.1386304348707199,
"learning_rate": 0.0012259490172026927,
"loss": 2.989889907836914,
"num_input_tokens_seen": 7208960000,
"step": 13750,
"train_runtime": 62443.1321,
"train_tokens_per_second": 115448.405
},
{
"epoch": 0.7446088909331963,
"grad_norm": 0.13061648607254028,
"learning_rate": 0.0012230592196512174,
"loss": 2.986536407470703,
"num_input_tokens_seen": 7214202880,
"step": 13760,
"train_runtime": 62488.3343,
"train_tokens_per_second": 115448.795
},
{
"epoch": 0.7451500311155604,
"grad_norm": 0.12978407740592957,
"learning_rate": 0.0012201740837796992,
"loss": 2.9931753158569334,
"num_input_tokens_seen": 7219445760,
"step": 13770,
"train_runtime": 62533.544,
"train_tokens_per_second": 115449.17
},
{
"epoch": 0.7456911712979247,
"grad_norm": 0.12974348664283752,
"learning_rate": 0.0012172936183963243,
"loss": 2.98385009765625,
"num_input_tokens_seen": 7224688640,
"step": 13780,
"train_runtime": 62578.7317,
"train_tokens_per_second": 115449.586
},
{
"epoch": 0.746232311480289,
"grad_norm": 0.1361524909734726,
"learning_rate": 0.0012144178322950217,
"loss": 2.996071624755859,
"num_input_tokens_seen": 7229931520,
"step": 13790,
"train_runtime": 62623.945,
"train_tokens_per_second": 115449.953
},
{
"epoch": 0.7467734516626532,
"grad_norm": 0.12753413617610931,
"learning_rate": 0.0012115467342554353,
"loss": 2.989743232727051,
"num_input_tokens_seen": 7235174400,
"step": 13800,
"train_runtime": 62669.1454,
"train_tokens_per_second": 115450.344
},
{
"epoch": 0.7473145918450175,
"grad_norm": 0.1313578486442566,
"learning_rate": 0.0012086803330428942,
"loss": 2.9922863006591798,
"num_input_tokens_seen": 7240417280,
"step": 13810,
"train_runtime": 62714.3608,
"train_tokens_per_second": 115450.707
},
{
"epoch": 0.7478557320273816,
"grad_norm": 0.13242116570472717,
"learning_rate": 0.0012058186374083889,
"loss": 2.9887691497802735,
"num_input_tokens_seen": 7245660160,
"step": 13820,
"train_runtime": 62759.5959,
"train_tokens_per_second": 115451.033
},
{
"epoch": 0.7483968722097459,
"grad_norm": 0.1344103366136551,
"learning_rate": 0.0012029616560885453,
"loss": 2.989380645751953,
"num_input_tokens_seen": 7250903040,
"step": 13830,
"train_runtime": 62804.8179,
"train_tokens_per_second": 115451.382
},
{
"epoch": 0.7489380123921102,
"grad_norm": 0.13286016881465912,
"learning_rate": 0.001200109397805595,
"loss": 2.9872367858886717,
"num_input_tokens_seen": 7256145920,
"step": 13840,
"train_runtime": 62850.0273,
"train_tokens_per_second": 115451.754
},
{
"epoch": 0.7494791525744744,
"grad_norm": 0.13758355379104614,
"learning_rate": 0.0011972618712673526,
"loss": 2.9894548416137696,
"num_input_tokens_seen": 7261388800,
"step": 13850,
"train_runtime": 62895.244,
"train_tokens_per_second": 115452.113
},
{
"epoch": 0.7500202927568387,
"grad_norm": 0.13310939073562622,
"learning_rate": 0.0011944190851671855,
"loss": 2.980154800415039,
"num_input_tokens_seen": 7266631680,
"step": 13860,
"train_runtime": 62940.4589,
"train_tokens_per_second": 115452.474
},
{
"epoch": 0.7505614329392029,
"grad_norm": 0.13724195957183838,
"learning_rate": 0.0011915810481839884,
"loss": 2.9957542419433594,
"num_input_tokens_seen": 7271874560,
"step": 13870,
"train_runtime": 62985.6674,
"train_tokens_per_second": 115452.846
},
{
"epoch": 0.7511025731215671,
"grad_norm": 0.13776428997516632,
"learning_rate": 0.0011887477689821579,
"loss": 2.9919281005859375,
"num_input_tokens_seen": 7277117440,
"step": 13880,
"train_runtime": 63030.8734,
"train_tokens_per_second": 115453.222
},
{
"epoch": 0.7516437133039314,
"grad_norm": 0.13441872596740723,
"learning_rate": 0.001185919256211564,
"loss": 2.9903282165527343,
"num_input_tokens_seen": 7282360320,
"step": 13890,
"train_runtime": 63076.0694,
"train_tokens_per_second": 115453.616
},
{
"epoch": 0.7521848534862956,
"grad_norm": 0.14160217344760895,
"learning_rate": 0.001183095518507527,
"loss": 2.9950998306274412,
"num_input_tokens_seen": 7287603200,
"step": 13900,
"train_runtime": 63121.2819,
"train_tokens_per_second": 115453.98
},
{
"epoch": 0.7527259936686599,
"grad_norm": 0.13321471214294434,
"learning_rate": 0.001180276564490789,
"loss": 2.9867202758789064,
"num_input_tokens_seen": 7292846080,
"step": 13910,
"train_runtime": 63166.4818,
"train_tokens_per_second": 115454.366
},
{
"epoch": 0.7532671338510241,
"grad_norm": 0.13260754942893982,
"learning_rate": 0.001177462402767485,
"loss": 2.9936323165893555,
"num_input_tokens_seen": 7298088960,
"step": 13920,
"train_runtime": 63211.6992,
"train_tokens_per_second": 115454.719
},
{
"epoch": 0.7538082740333883,
"grad_norm": 0.13385504484176636,
"learning_rate": 0.0011746530419291235,
"loss": 2.9826412200927734,
"num_input_tokens_seen": 7303331840,
"step": 13930,
"train_runtime": 63256.8908,
"train_tokens_per_second": 115455.119
},
{
"epoch": 0.7543494142157526,
"grad_norm": 0.1354595571756363,
"learning_rate": 0.0011718484905525526,
"loss": 2.9921710968017576,
"num_input_tokens_seen": 7308574720,
"step": 13940,
"train_runtime": 63302.0738,
"train_tokens_per_second": 115455.534
},
{
"epoch": 0.7548905543981168,
"grad_norm": 0.13242025673389435,
"learning_rate": 0.0011690487571999377,
"loss": 2.9915000915527346,
"num_input_tokens_seen": 7313817600,
"step": 13950,
"train_runtime": 63347.2678,
"train_tokens_per_second": 115455.928
},
{
"epoch": 0.7554316945804811,
"grad_norm": 0.1303345412015915,
"learning_rate": 0.0011662538504187375,
"loss": 2.992412567138672,
"num_input_tokens_seen": 7319060480,
"step": 13960,
"train_runtime": 63392.4687,
"train_tokens_per_second": 115456.309
},
{
"epoch": 0.7559728347628453,
"grad_norm": 0.1336052417755127,
"learning_rate": 0.0011634637787416738,
"loss": 2.9856544494628907,
"num_input_tokens_seen": 7324303360,
"step": 13970,
"train_runtime": 63437.6413,
"train_tokens_per_second": 115456.742
},
{
"epoch": 0.7565139749452096,
"grad_norm": 0.13160865008831024,
"learning_rate": 0.0011606785506867066,
"loss": 2.990740966796875,
"num_input_tokens_seen": 7329546240,
"step": 13980,
"train_runtime": 63482.8312,
"train_tokens_per_second": 115457.142
},
{
"epoch": 0.7570551151275738,
"grad_norm": 0.132036030292511,
"learning_rate": 0.0011578981747570086,
"loss": 2.9869890213012695,
"num_input_tokens_seen": 7334789120,
"step": 13990,
"train_runtime": 63528.0172,
"train_tokens_per_second": 115457.548
},
{
"epoch": 0.757596255309938,
"grad_norm": 0.13680653274059296,
"learning_rate": 0.0011551226594409406,
"loss": 2.9875946044921875,
"num_input_tokens_seen": 7340032000,
"step": 14000,
"train_runtime": 63573.1915,
"train_tokens_per_second": 115457.976
},
{
"epoch": 0.757596255309938,
"eval_loss": 2.948127031326294,
"eval_runtime": 1.9851,
"eval_samples_per_second": 251.872,
"eval_steps_per_second": 4.03,
"num_input_tokens_seen": 7340032000,
"step": 14000
},
{
"epoch": 0.7581373954923023,
"grad_norm": 0.1333727240562439,
"learning_rate": 0.0011523520132120217,
"loss": 2.9936281204223634,
"num_input_tokens_seen": 7345274880,
"step": 14010,
"train_runtime": 63622.81,
"train_tokens_per_second": 115450.337
},
{
"epoch": 0.7586785356746665,
"grad_norm": 0.13183613121509552,
"learning_rate": 0.0011495862445289092,
"loss": 2.9838493347167967,
"num_input_tokens_seen": 7350517760,
"step": 14020,
"train_runtime": 63667.9625,
"train_tokens_per_second": 115450.809
},
{
"epoch": 0.7592196758570308,
"grad_norm": 0.13663019239902496,
"learning_rate": 0.0011468253618353661,
"loss": 2.9881641387939455,
"num_input_tokens_seen": 7355760640,
"step": 14030,
"train_runtime": 63713.121,
"train_tokens_per_second": 115451.269
},
{
"epoch": 0.759760816039395,
"grad_norm": 0.1334005743265152,
"learning_rate": 0.0011440693735602413,
"loss": 2.9827747344970703,
"num_input_tokens_seen": 7361003520,
"step": 14040,
"train_runtime": 63758.2642,
"train_tokens_per_second": 115451.755
},
{
"epoch": 0.7603019562217592,
"grad_norm": 0.1363915055990219,
"learning_rate": 0.0011413182881174402,
"loss": 2.976375961303711,
"num_input_tokens_seen": 7366246400,
"step": 14050,
"train_runtime": 63803.3929,
"train_tokens_per_second": 115452.268
},
{
"epoch": 0.7608430964041235,
"grad_norm": 0.13721340894699097,
"learning_rate": 0.0011385721139058986,
"loss": 3.0018871307373045,
"num_input_tokens_seen": 7371489280,
"step": 14060,
"train_runtime": 63848.5329,
"train_tokens_per_second": 115452.759
},
{
"epoch": 0.7613842365864877,
"grad_norm": 0.13170303404331207,
"learning_rate": 0.0011358308593095617,
"loss": 2.9844949722290037,
"num_input_tokens_seen": 7376732160,
"step": 14070,
"train_runtime": 63893.6665,
"train_tokens_per_second": 115453.261
},
{
"epoch": 0.761925376768852,
"grad_norm": 0.13645039498806,
"learning_rate": 0.0011330945326973533,
"loss": 2.9850318908691404,
"num_input_tokens_seen": 7381975040,
"step": 14080,
"train_runtime": 63938.7823,
"train_tokens_per_second": 115453.795
},
{
"epoch": 0.7624665169512163,
"grad_norm": 0.1297563761472702,
"learning_rate": 0.0011303631424231526,
"loss": 2.9895225524902345,
"num_input_tokens_seen": 7387217920,
"step": 14090,
"train_runtime": 63983.9157,
"train_tokens_per_second": 115454.296
},
{
"epoch": 0.7630076571335804,
"grad_norm": 0.13698382675647736,
"learning_rate": 0.0011276366968257677,
"loss": 2.9852466583251953,
"num_input_tokens_seen": 7392460800,
"step": 14100,
"train_runtime": 64029.0446,
"train_tokens_per_second": 115454.804
},
{
"epoch": 0.7635487973159447,
"grad_norm": 0.12868466973304749,
"learning_rate": 0.001124915204228913,
"loss": 2.982627105712891,
"num_input_tokens_seen": 7397703680,
"step": 14110,
"train_runtime": 64074.169,
"train_tokens_per_second": 115455.32
},
{
"epoch": 0.7640899374983089,
"grad_norm": 0.13413524627685547,
"learning_rate": 0.0011221986729411787,
"loss": 2.982726287841797,
"num_input_tokens_seen": 7402946560,
"step": 14120,
"train_runtime": 64123.0569,
"train_tokens_per_second": 115449.059
},
{
"epoch": 0.7646310776806732,
"grad_norm": 0.13302487134933472,
"learning_rate": 0.0011194871112560113,
"loss": 2.9999317169189452,
"num_input_tokens_seen": 7408189440,
"step": 14130,
"train_runtime": 64168.1991,
"train_tokens_per_second": 115449.546
},
{
"epoch": 0.7651722178630375,
"grad_norm": 0.13595032691955566,
"learning_rate": 0.001116780527451682,
"loss": 2.986163330078125,
"num_input_tokens_seen": 7413432320,
"step": 14140,
"train_runtime": 64213.3563,
"train_tokens_per_second": 115450.005
},
{
"epoch": 0.7657133580454016,
"grad_norm": 0.12740519642829895,
"learning_rate": 0.0011140789297912688,
"loss": 2.9861852645874025,
"num_input_tokens_seen": 7418675200,
"step": 14150,
"train_runtime": 64258.4713,
"train_tokens_per_second": 115450.54
},
{
"epoch": 0.7662544982277659,
"grad_norm": 0.13032016158103943,
"learning_rate": 0.0011113823265226242,
"loss": 2.9914901733398436,
"num_input_tokens_seen": 7423918080,
"step": 14160,
"train_runtime": 64303.6051,
"train_tokens_per_second": 115451.04
},
{
"epoch": 0.7667956384101301,
"grad_norm": 0.12856240570545197,
"learning_rate": 0.0011086907258783525,
"loss": 2.99139404296875,
"num_input_tokens_seen": 7429160960,
"step": 14170,
"train_runtime": 64348.7292,
"train_tokens_per_second": 115451.557
},
{
"epoch": 0.7673367785924944,
"grad_norm": 0.1300676167011261,
"learning_rate": 0.001106004136075789,
"loss": 2.980759620666504,
"num_input_tokens_seen": 7434403840,
"step": 14180,
"train_runtime": 64393.8763,
"train_tokens_per_second": 115452.032
},
{
"epoch": 0.7678779187748587,
"grad_norm": 0.13340207934379578,
"learning_rate": 0.0011033225653169676,
"loss": 2.979547882080078,
"num_input_tokens_seen": 7439646720,
"step": 14190,
"train_runtime": 64439.0196,
"train_tokens_per_second": 115452.513
},
{
"epoch": 0.7684190589572228,
"grad_norm": 0.1270836591720581,
"learning_rate": 0.0011006460217886007,
"loss": 2.9818099975585937,
"num_input_tokens_seen": 7444889600,
"step": 14200,
"train_runtime": 64484.1553,
"train_tokens_per_second": 115453.006
},
{
"epoch": 0.7689601991395871,
"grad_norm": 0.1316118985414505,
"learning_rate": 0.001097974513662052,
"loss": 2.9830299377441407,
"num_input_tokens_seen": 7450132480,
"step": 14210,
"train_runtime": 64529.2695,
"train_tokens_per_second": 115453.538
},
{
"epoch": 0.7695013393219513,
"grad_norm": 0.13914352655410767,
"learning_rate": 0.0010953080490933129,
"loss": 2.9925983428955076,
"num_input_tokens_seen": 7455375360,
"step": 14220,
"train_runtime": 64574.3994,
"train_tokens_per_second": 115454.041
},
{
"epoch": 0.7700424795043156,
"grad_norm": 0.13164092600345612,
"learning_rate": 0.0010926466362229787,
"loss": 2.9863054275512697,
"num_input_tokens_seen": 7460618240,
"step": 14230,
"train_runtime": 64619.5117,
"train_tokens_per_second": 115454.575
},
{
"epoch": 0.7705836196866799,
"grad_norm": 0.1346326619386673,
"learning_rate": 0.001089990283176218,
"loss": 2.9905773162841798,
"num_input_tokens_seen": 7465861120,
"step": 14240,
"train_runtime": 64664.6395,
"train_tokens_per_second": 115455.08
},
{
"epoch": 0.771124759869044,
"grad_norm": 0.1283544898033142,
"learning_rate": 0.0010873389980627568,
"loss": 2.9964345932006835,
"num_input_tokens_seen": 7471104000,
"step": 14250,
"train_runtime": 64709.798,
"train_tokens_per_second": 115455.53
},
{
"epoch": 0.7716659000514083,
"grad_norm": 0.13457883894443512,
"learning_rate": 0.0010846927889768454,
"loss": 2.9865245819091797,
"num_input_tokens_seen": 7476346880,
"step": 14260,
"train_runtime": 64754.9357,
"train_tokens_per_second": 115456.016
},
{
"epoch": 0.7722070402337725,
"grad_norm": 0.13008961081504822,
"learning_rate": 0.0010820516639972377,
"loss": 2.9932941436767577,
"num_input_tokens_seen": 7481589760,
"step": 14270,
"train_runtime": 64800.0796,
"train_tokens_per_second": 115456.49
},
{
"epoch": 0.7727481804161368,
"grad_norm": 0.13576596975326538,
"learning_rate": 0.0010794156311871674,
"loss": 2.975057601928711,
"num_input_tokens_seen": 7486832640,
"step": 14280,
"train_runtime": 64845.2255,
"train_tokens_per_second": 115456.96
},
{
"epoch": 0.7732893205985011,
"grad_norm": 0.13501375913619995,
"learning_rate": 0.0010767846985943225,
"loss": 2.983927536010742,
"num_input_tokens_seen": 7492075520,
"step": 14290,
"train_runtime": 64890.3622,
"train_tokens_per_second": 115457.446
},
{
"epoch": 0.7738304607808653,
"grad_norm": 0.1284349411725998,
"learning_rate": 0.0010741588742508182,
"loss": 2.994318199157715,
"num_input_tokens_seen": 7497318400,
"step": 14300,
"train_runtime": 64935.5045,
"train_tokens_per_second": 115457.922
},
{
"epoch": 0.7743716009632295,
"grad_norm": 0.13406863808631897,
"learning_rate": 0.0010715381661731754,
"loss": 2.9812191009521483,
"num_input_tokens_seen": 7502561280,
"step": 14310,
"train_runtime": 64980.6813,
"train_tokens_per_second": 115458.335
},
{
"epoch": 0.7749127411455937,
"grad_norm": 0.1352129429578781,
"learning_rate": 0.0010689225823622948,
"loss": 2.9968055725097655,
"num_input_tokens_seen": 7507804160,
"step": 14320,
"train_runtime": 65025.8721,
"train_tokens_per_second": 115458.723
},
{
"epoch": 0.775453881327958,
"grad_norm": 0.13681240379810333,
"learning_rate": 0.0010663121308034337,
"loss": 2.984090805053711,
"num_input_tokens_seen": 7513047040,
"step": 14330,
"train_runtime": 65071.0195,
"train_tokens_per_second": 115459.188
},
{
"epoch": 0.7759950215103223,
"grad_norm": 0.12757869064807892,
"learning_rate": 0.0010637068194661817,
"loss": 2.9872867584228517,
"num_input_tokens_seen": 7518289920,
"step": 14340,
"train_runtime": 65116.166,
"train_tokens_per_second": 115459.653
},
{
"epoch": 0.7765361616926865,
"grad_norm": 0.1297658532857895,
"learning_rate": 0.0010611066563044331,
"loss": 2.987481689453125,
"num_input_tokens_seen": 7523532800,
"step": 14350,
"train_runtime": 65161.3132,
"train_tokens_per_second": 115460.116
},
{
"epoch": 0.7770773018750508,
"grad_norm": 0.13100814819335938,
"learning_rate": 0.0010585116492563672,
"loss": 2.984407424926758,
"num_input_tokens_seen": 7528775680,
"step": 14360,
"train_runtime": 65206.4518,
"train_tokens_per_second": 115460.594
},
{
"epoch": 0.7776184420574149,
"grad_norm": 0.13708344101905823,
"learning_rate": 0.0010559218062444215,
"loss": 2.9803342819213867,
"num_input_tokens_seen": 7534018560,
"step": 14370,
"train_runtime": 65251.6135,
"train_tokens_per_second": 115461.031
},
{
"epoch": 0.7781595822397792,
"grad_norm": 0.13270463049411774,
"learning_rate": 0.001053337135175266,
"loss": 2.9783748626708983,
"num_input_tokens_seen": 7539261440,
"step": 14380,
"train_runtime": 65296.782,
"train_tokens_per_second": 115461.455
},
{
"epoch": 0.7787007224221435,
"grad_norm": 0.1348678022623062,
"learning_rate": 0.001050757643939784,
"loss": 2.985927963256836,
"num_input_tokens_seen": 7544504320,
"step": 14390,
"train_runtime": 65341.9205,
"train_tokens_per_second": 115461.931
},
{
"epoch": 0.7792418626045077,
"grad_norm": 0.1359061747789383,
"learning_rate": 0.0010481833404130433,
"loss": 2.977262496948242,
"num_input_tokens_seen": 7549747200,
"step": 14400,
"train_runtime": 65387.0473,
"train_tokens_per_second": 115462.427
},
{
"epoch": 0.779783002786872,
"grad_norm": 0.13489292562007904,
"learning_rate": 0.0010456142324542742,
"loss": 2.9768039703369142,
"num_input_tokens_seen": 7554990080,
"step": 14410,
"train_runtime": 65432.1998,
"train_tokens_per_second": 115462.878
},
{
"epoch": 0.7803241429692361,
"grad_norm": 0.13529463112354279,
"learning_rate": 0.001043050327906844,
"loss": 2.992759132385254,
"num_input_tokens_seen": 7560232960,
"step": 14420,
"train_runtime": 65477.3624,
"train_tokens_per_second": 115463.31
},
{
"epoch": 0.7808652831516004,
"grad_norm": 0.13989658653736115,
"learning_rate": 0.0010404916345982372,
"loss": 2.9861518859863283,
"num_input_tokens_seen": 7565475840,
"step": 14430,
"train_runtime": 65522.5287,
"train_tokens_per_second": 115463.734
},
{
"epoch": 0.7814064233339647,
"grad_norm": 0.13800008594989777,
"learning_rate": 0.0010379381603400246,
"loss": 2.983747100830078,
"num_input_tokens_seen": 7570718720,
"step": 14440,
"train_runtime": 65567.6879,
"train_tokens_per_second": 115464.171
},
{
"epoch": 0.7819475635163289,
"grad_norm": 0.14410988986492157,
"learning_rate": 0.0010353899129278482,
"loss": 2.986704444885254,
"num_input_tokens_seen": 7575961600,
"step": 14450,
"train_runtime": 65612.8209,
"train_tokens_per_second": 115464.653
},
{
"epoch": 0.7824887036986932,
"grad_norm": 0.13409604132175446,
"learning_rate": 0.0010328469001413872,
"loss": 2.9869441986083984,
"num_input_tokens_seen": 7581204480,
"step": 14460,
"train_runtime": 65657.9605,
"train_tokens_per_second": 115465.123
},
{
"epoch": 0.7830298438810573,
"grad_norm": 0.13234242796897888,
"learning_rate": 0.0010303091297443453,
"loss": 2.9890289306640625,
"num_input_tokens_seen": 7586447360,
"step": 14470,
"train_runtime": 65703.0949,
"train_tokens_per_second": 115465.601
},
{
"epoch": 0.7835709840634216,
"grad_norm": 0.13398636877536774,
"learning_rate": 0.001027776609484418,
"loss": 2.9826473236083983,
"num_input_tokens_seen": 7591690240,
"step": 14480,
"train_runtime": 65748.2396,
"train_tokens_per_second": 115466.061
},
{
"epoch": 0.7841121242457859,
"grad_norm": 0.13305144011974335,
"learning_rate": 0.0010252493470932719,
"loss": 2.9864757537841795,
"num_input_tokens_seen": 7596933120,
"step": 14490,
"train_runtime": 65793.3795,
"train_tokens_per_second": 115466.528
},
{
"epoch": 0.7846532644281501,
"grad_norm": 0.13172990083694458,
"learning_rate": 0.0010227273502865237,
"loss": 2.9912540435791017,
"num_input_tokens_seen": 7602176000,
"step": 14500,
"train_runtime": 65842.395,
"train_tokens_per_second": 115460.199
},
{
"epoch": 0.7846532644281501,
"eval_loss": 2.9429469108581543,
"eval_runtime": 1.9893,
"eval_samples_per_second": 251.343,
"eval_steps_per_second": 4.021,
"num_input_tokens_seen": 7602176000,
"step": 14500
},
{
"epoch": 0.7851944046105144,
"grad_norm": 0.13013876974582672,
"learning_rate": 0.0010202106267637142,
"loss": 2.9870655059814455,
"num_input_tokens_seen": 7607418880,
"step": 14510,
"train_runtime": 65889.5594,
"train_tokens_per_second": 115457.122
},
{
"epoch": 0.7857355447928785,
"grad_norm": 0.14158159494400024,
"learning_rate": 0.001017699184208284,
"loss": 2.9855068206787108,
"num_input_tokens_seen": 7612661760,
"step": 14520,
"train_runtime": 65934.7235,
"train_tokens_per_second": 115457.552
},
{
"epoch": 0.7862766849752428,
"grad_norm": 0.12904150784015656,
"learning_rate": 0.001015193030287551,
"loss": 2.9784725189208983,
"num_input_tokens_seen": 7617904640,
"step": 14530,
"train_runtime": 65979.8789,
"train_tokens_per_second": 115457.997
},
{
"epoch": 0.7868178251576071,
"grad_norm": 0.1475485861301422,
"learning_rate": 0.0010126921726526892,
"loss": 2.9963218688964846,
"num_input_tokens_seen": 7623147520,
"step": 14540,
"train_runtime": 66025.0052,
"train_tokens_per_second": 115458.492
},
{
"epoch": 0.7873589653399713,
"grad_norm": 0.13277380168437958,
"learning_rate": 0.0010101966189387007,
"loss": 2.9872737884521485,
"num_input_tokens_seen": 7628390400,
"step": 14550,
"train_runtime": 66070.1575,
"train_tokens_per_second": 115458.941
},
{
"epoch": 0.7879001055223356,
"grad_norm": 0.13506442308425903,
"learning_rate": 0.0010077063767643974,
"loss": 2.9895917892456056,
"num_input_tokens_seen": 7633633280,
"step": 14560,
"train_runtime": 66115.3068,
"train_tokens_per_second": 115459.394
},
{
"epoch": 0.7884412457046998,
"grad_norm": 0.13273315131664276,
"learning_rate": 0.0010052214537323724,
"loss": 2.9872600555419924,
"num_input_tokens_seen": 7638876160,
"step": 14570,
"train_runtime": 66160.4452,
"train_tokens_per_second": 115459.866
},
{
"epoch": 0.788982385887064,
"grad_norm": 0.1311519294977188,
"learning_rate": 0.0010027418574289832,
"loss": 2.9747976303100585,
"num_input_tokens_seen": 7644119040,
"step": 14580,
"train_runtime": 66205.59,
"train_tokens_per_second": 115460.326
},
{
"epoch": 0.7895235260694283,
"grad_norm": 0.13237175345420837,
"learning_rate": 0.0010002675954243225,
"loss": 2.9707094192504884,
"num_input_tokens_seen": 7649361920,
"step": 14590,
"train_runtime": 66250.7308,
"train_tokens_per_second": 115460.793
},
{
"epoch": 0.7900646662517925,
"grad_norm": 0.13623256981372833,
"learning_rate": 0.0009977986752721967,
"loss": 2.9789360046386717,
"num_input_tokens_seen": 7654604800,
"step": 14600,
"train_runtime": 66295.8847,
"train_tokens_per_second": 115461.236
},
{
"epoch": 0.7906058064341568,
"grad_norm": 0.13563480973243713,
"learning_rate": 0.0009953351045101087,
"loss": 2.976993942260742,
"num_input_tokens_seen": 7659847680,
"step": 14610,
"train_runtime": 66341.0194,
"train_tokens_per_second": 115461.712
},
{
"epoch": 0.791146946616521,
"grad_norm": 0.1308317333459854,
"learning_rate": 0.000992876890659225,
"loss": 2.9876148223876955,
"num_input_tokens_seen": 7665090560,
"step": 14620,
"train_runtime": 66386.152,
"train_tokens_per_second": 115462.191
},
{
"epoch": 0.7916880867988852,
"grad_norm": 0.12994542717933655,
"learning_rate": 0.0009904240412243594,
"loss": 2.989145278930664,
"num_input_tokens_seen": 7670333440,
"step": 14630,
"train_runtime": 66431.2999,
"train_tokens_per_second": 115462.643
},
{
"epoch": 0.7922292269812495,
"grad_norm": 0.13062526285648346,
"learning_rate": 0.0009879765636939479,
"loss": 2.9790761947631834,
"num_input_tokens_seen": 7675576320,
"step": 14640,
"train_runtime": 66476.4455,
"train_tokens_per_second": 115463.098
},
{
"epoch": 0.7927703671636137,
"grad_norm": 0.13198526203632355,
"learning_rate": 0.0009855344655400273,
"loss": 2.991826629638672,
"num_input_tokens_seen": 7680819200,
"step": 14650,
"train_runtime": 66521.5925,
"train_tokens_per_second": 115463.55
},
{
"epoch": 0.793311507345978,
"grad_norm": 0.12981140613555908,
"learning_rate": 0.0009830977542182112,
"loss": 2.97564754486084,
"num_input_tokens_seen": 7686062080,
"step": 14660,
"train_runtime": 66566.7229,
"train_tokens_per_second": 115464.03
},
{
"epoch": 0.7938526475283422,
"grad_norm": 0.13640232384204865,
"learning_rate": 0.0009806664371676665,
"loss": 2.9895370483398436,
"num_input_tokens_seen": 7691304960,
"step": 14670,
"train_runtime": 66611.843,
"train_tokens_per_second": 115464.527
},
{
"epoch": 0.7943937877107065,
"grad_norm": 0.13942649960517883,
"learning_rate": 0.0009782405218110937,
"loss": 2.983687973022461,
"num_input_tokens_seen": 7696547840,
"step": 14680,
"train_runtime": 66656.9717,
"train_tokens_per_second": 115465.009
},
{
"epoch": 0.7949349278930707,
"grad_norm": 0.13253772258758545,
"learning_rate": 0.0009758200155546995,
"loss": 2.9805246353149415,
"num_input_tokens_seen": 7701790720,
"step": 14690,
"train_runtime": 66702.1127,
"train_tokens_per_second": 115465.469
},
{
"epoch": 0.7954760680754349,
"grad_norm": 0.14124181866645813,
"learning_rate": 0.000973404925788178,
"loss": 2.9745468139648437,
"num_input_tokens_seen": 7707033600,
"step": 14700,
"train_runtime": 66747.2598,
"train_tokens_per_second": 115465.918
},
{
"epoch": 0.7960172082577992,
"grad_norm": 0.14020085334777832,
"learning_rate": 0.0009709952598846878,
"loss": 2.978104019165039,
"num_input_tokens_seen": 7712276480,
"step": 14710,
"train_runtime": 66792.381,
"train_tokens_per_second": 115466.411
},
{
"epoch": 0.7965583484401634,
"grad_norm": 0.14543874561786652,
"learning_rate": 0.0009685910252008282,
"loss": 2.972671890258789,
"num_input_tokens_seen": 7717519360,
"step": 14720,
"train_runtime": 66837.5213,
"train_tokens_per_second": 115466.87
},
{
"epoch": 0.7970994886225277,
"grad_norm": 0.1361764669418335,
"learning_rate": 0.0009661922290766168,
"loss": 2.979312515258789,
"num_input_tokens_seen": 7722762240,
"step": 14730,
"train_runtime": 66882.6798,
"train_tokens_per_second": 115467.297
},
{
"epoch": 0.797640628804892,
"grad_norm": 0.1359523981809616,
"learning_rate": 0.000963798878835467,
"loss": 2.9832695007324217,
"num_input_tokens_seen": 7728005120,
"step": 14740,
"train_runtime": 66927.821,
"train_tokens_per_second": 115467.753
},
{
"epoch": 0.7981817689872561,
"grad_norm": 0.1312197595834732,
"learning_rate": 0.0009614109817841685,
"loss": 2.988373565673828,
"num_input_tokens_seen": 7733248000,
"step": 14750,
"train_runtime": 66972.9704,
"train_tokens_per_second": 115468.195
},
{
"epoch": 0.7987229091696204,
"grad_norm": 0.1324051469564438,
"learning_rate": 0.00095902854521286,
"loss": 2.9794536590576173,
"num_input_tokens_seen": 7738490880,
"step": 14760,
"train_runtime": 67018.1103,
"train_tokens_per_second": 115468.652
},
{
"epoch": 0.7992640493519846,
"grad_norm": 0.13141310214996338,
"learning_rate": 0.0009566515763950114,
"loss": 2.979531097412109,
"num_input_tokens_seen": 7743733760,
"step": 14770,
"train_runtime": 67063.2657,
"train_tokens_per_second": 115469.083
},
{
"epoch": 0.7998051895343489,
"grad_norm": 0.13311649858951569,
"learning_rate": 0.0009542800825873985,
"loss": 2.978958511352539,
"num_input_tokens_seen": 7748976640,
"step": 14780,
"train_runtime": 67108.4044,
"train_tokens_per_second": 115469.541
},
{
"epoch": 0.8003463297167132,
"grad_norm": 0.1344899833202362,
"learning_rate": 0.0009519140710300836,
"loss": 2.9761631011962892,
"num_input_tokens_seen": 7754219520,
"step": 14790,
"train_runtime": 67153.558,
"train_tokens_per_second": 115469.973
},
{
"epoch": 0.8008874698990773,
"grad_norm": 0.1314343363046646,
"learning_rate": 0.0009495535489463907,
"loss": 2.9750953674316407,
"num_input_tokens_seen": 7759462400,
"step": 14800,
"train_runtime": 67198.7114,
"train_tokens_per_second": 115470.405
},
{
"epoch": 0.8014286100814416,
"grad_norm": 0.13687878847122192,
"learning_rate": 0.0009471985235428848,
"loss": 2.977894973754883,
"num_input_tokens_seen": 7764705280,
"step": 14810,
"train_runtime": 67243.8512,
"train_tokens_per_second": 115470.859
},
{
"epoch": 0.8019697502638058,
"grad_norm": 0.13268278539180756,
"learning_rate": 0.0009448490020093504,
"loss": 2.983228302001953,
"num_input_tokens_seen": 7769948160,
"step": 14820,
"train_runtime": 67288.9927,
"train_tokens_per_second": 115471.31
},
{
"epoch": 0.8025108904461701,
"grad_norm": 0.13738638162612915,
"learning_rate": 0.0009425049915187695,
"loss": 2.98532657623291,
"num_input_tokens_seen": 7775191040,
"step": 14830,
"train_runtime": 67334.146,
"train_tokens_per_second": 115471.741
},
{
"epoch": 0.8030520306285344,
"grad_norm": 0.13537852466106415,
"learning_rate": 0.0009401664992272974,
"loss": 2.9814353942871095,
"num_input_tokens_seen": 7780433920,
"step": 14840,
"train_runtime": 67379.3084,
"train_tokens_per_second": 115472.155
},
{
"epoch": 0.8035931708108985,
"grad_norm": 0.13461166620254517,
"learning_rate": 0.0009378335322742428,
"loss": 2.988892364501953,
"num_input_tokens_seen": 7785676800,
"step": 14850,
"train_runtime": 67424.4589,
"train_tokens_per_second": 115472.589
},
{
"epoch": 0.8041343109932628,
"grad_norm": 0.1397952139377594,
"learning_rate": 0.0009355060977820479,
"loss": 2.981852149963379,
"num_input_tokens_seen": 7790919680,
"step": 14860,
"train_runtime": 67469.6089,
"train_tokens_per_second": 115473.023
},
{
"epoch": 0.804675451175627,
"grad_norm": 0.13720718026161194,
"learning_rate": 0.000933184202856262,
"loss": 2.9753461837768556,
"num_input_tokens_seen": 7796162560,
"step": 14870,
"train_runtime": 67514.7478,
"train_tokens_per_second": 115473.475
},
{
"epoch": 0.8052165913579913,
"grad_norm": 0.13194413483142853,
"learning_rate": 0.0009308678545855248,
"loss": 2.98673038482666,
"num_input_tokens_seen": 7801405440,
"step": 14880,
"train_runtime": 67563.706,
"train_tokens_per_second": 115467.4
},
{
"epoch": 0.8057577315403556,
"grad_norm": 0.13509796559810638,
"learning_rate": 0.0009285570600415394,
"loss": 2.9741546630859377,
"num_input_tokens_seen": 7806648320,
"step": 14890,
"train_runtime": 67608.8064,
"train_tokens_per_second": 115467.921
},
{
"epoch": 0.8062988717227197,
"grad_norm": 0.13570842146873474,
"learning_rate": 0.0009262518262790568,
"loss": 2.9908029556274416,
"num_input_tokens_seen": 7811891200,
"step": 14900,
"train_runtime": 67653.9237,
"train_tokens_per_second": 115468.413
},
{
"epoch": 0.806840011905084,
"grad_norm": 0.1328882873058319,
"learning_rate": 0.0009239521603358486,
"loss": 2.9901811599731447,
"num_input_tokens_seen": 7817134080,
"step": 14910,
"train_runtime": 67699.0266,
"train_tokens_per_second": 115468.929
},
{
"epoch": 0.8073811520874482,
"grad_norm": 0.13037438690662384,
"learning_rate": 0.0009216580692326891,
"loss": 2.9751874923706056,
"num_input_tokens_seen": 7822376960,
"step": 14920,
"train_runtime": 67744.1354,
"train_tokens_per_second": 115469.434
},
{
"epoch": 0.8079222922698125,
"grad_norm": 0.13509000837802887,
"learning_rate": 0.0009193695599733333,
"loss": 2.9760356903076173,
"num_input_tokens_seen": 7827619840,
"step": 14930,
"train_runtime": 67789.236,
"train_tokens_per_second": 115469.952
},
{
"epoch": 0.8084634324521768,
"grad_norm": 0.13353431224822998,
"learning_rate": 0.0009170866395444952,
"loss": 2.979950714111328,
"num_input_tokens_seen": 7832862720,
"step": 14940,
"train_runtime": 67834.3595,
"train_tokens_per_second": 115470.431
},
{
"epoch": 0.809004572634541,
"grad_norm": 0.13296596705913544,
"learning_rate": 0.0009148093149158249,
"loss": 2.9780080795288084,
"num_input_tokens_seen": 7838105600,
"step": 14950,
"train_runtime": 67879.4629,
"train_tokens_per_second": 115470.943
},
{
"epoch": 0.8095457128169052,
"grad_norm": 0.13199231028556824,
"learning_rate": 0.0009125375930398896,
"loss": 2.976139450073242,
"num_input_tokens_seen": 7843348480,
"step": 14960,
"train_runtime": 67924.5642,
"train_tokens_per_second": 115471.458
},
{
"epoch": 0.8100868529992694,
"grad_norm": 0.1304149031639099,
"learning_rate": 0.0009102714808521528,
"loss": 2.9799163818359373,
"num_input_tokens_seen": 7848591360,
"step": 14970,
"train_runtime": 67969.6467,
"train_tokens_per_second": 115472.005
},
{
"epoch": 0.8106279931816337,
"grad_norm": 0.13312670588493347,
"learning_rate": 0.0009080109852709498,
"loss": 2.9826412200927734,
"num_input_tokens_seen": 7853834240,
"step": 14980,
"train_runtime": 68014.7473,
"train_tokens_per_second": 115472.52
},
{
"epoch": 0.811169133363998,
"grad_norm": 0.13625964522361755,
"learning_rate": 0.0009057561131974695,
"loss": 2.974313735961914,
"num_input_tokens_seen": 7859077120,
"step": 14990,
"train_runtime": 68059.848,
"train_tokens_per_second": 115473.034
},
{
"epoch": 0.8117102735463622,
"grad_norm": 0.13586074113845825,
"learning_rate": 0.000903506871515734,
"loss": 2.9799150466918944,
"num_input_tokens_seen": 7864320000,
"step": 15000,
"train_runtime": 68104.9508,
"train_tokens_per_second": 115473.544
},
{
"epoch": 0.8117102735463622,
"eval_loss": 2.9381465911865234,
"eval_runtime": 1.9846,
"eval_samples_per_second": 251.945,
"eval_steps_per_second": 4.031,
"num_input_tokens_seen": 7864320000,
"step": 15000
},
{
"epoch": 0.8122514137287264,
"grad_norm": 0.13391871750354767,
"learning_rate": 0.0009012632670925736,
"loss": 2.972438430786133,
"num_input_tokens_seen": 7869562880,
"step": 15010,
"train_runtime": 68154.5217,
"train_tokens_per_second": 115466.482
},
{
"epoch": 0.8127925539110906,
"grad_norm": 0.13467305898666382,
"learning_rate": 0.0008990253067776095,
"loss": 2.9732336044311523,
"num_input_tokens_seen": 7874805760,
"step": 15020,
"train_runtime": 68199.7002,
"train_tokens_per_second": 115466.868
},
{
"epoch": 0.8133336940934549,
"grad_norm": 0.13371260464191437,
"learning_rate": 0.0008967929974032304,
"loss": 2.9756675720214845,
"num_input_tokens_seen": 7880048640,
"step": 15030,
"train_runtime": 68244.8815,
"train_tokens_per_second": 115467.248
},
{
"epoch": 0.8138748342758192,
"grad_norm": 0.13191363215446472,
"learning_rate": 0.0008945663457845765,
"loss": 2.9834621429443358,
"num_input_tokens_seen": 7885291520,
"step": 15040,
"train_runtime": 68290.0502,
"train_tokens_per_second": 115467.649
},
{
"epoch": 0.8144159744581834,
"grad_norm": 0.1310187131166458,
"learning_rate": 0.0008923453587195116,
"loss": 2.9787324905395507,
"num_input_tokens_seen": 7890534400,
"step": 15050,
"train_runtime": 68335.2323,
"train_tokens_per_second": 115468.026
},
{
"epoch": 0.8149571146405477,
"grad_norm": 0.13005271553993225,
"learning_rate": 0.0008901300429886064,
"loss": 2.9818572998046875,
"num_input_tokens_seen": 7895777280,
"step": 15060,
"train_runtime": 68380.4424,
"train_tokens_per_second": 115468.356
},
{
"epoch": 0.8154982548229118,
"grad_norm": 0.13187964260578156,
"learning_rate": 0.0008879204053551192,
"loss": 2.9841533660888673,
"num_input_tokens_seen": 7901020160,
"step": 15070,
"train_runtime": 68425.6233,
"train_tokens_per_second": 115468.735
},
{
"epoch": 0.8160393950052761,
"grad_norm": 0.12774254381656647,
"learning_rate": 0.0008857164525649706,
"loss": 2.9738176345825194,
"num_input_tokens_seen": 7906263040,
"step": 15080,
"train_runtime": 68470.8074,
"train_tokens_per_second": 115469.108
},
{
"epoch": 0.8165805351876404,
"grad_norm": 0.13418236374855042,
"learning_rate": 0.0008835181913467284,
"loss": 2.9698516845703127,
"num_input_tokens_seen": 7911505920,
"step": 15090,
"train_runtime": 68516.0039,
"train_tokens_per_second": 115469.459
},
{
"epoch": 0.8171216753700046,
"grad_norm": 0.13305585086345673,
"learning_rate": 0.000881325628411582,
"loss": 2.9800113677978515,
"num_input_tokens_seen": 7916748800,
"step": 15100,
"train_runtime": 68561.1978,
"train_tokens_per_second": 115469.815
},
{
"epoch": 0.8176628155523689,
"grad_norm": 0.1298227459192276,
"learning_rate": 0.0008791387704533261,
"loss": 2.9894580841064453,
"num_input_tokens_seen": 7921991680,
"step": 15110,
"train_runtime": 68606.3897,
"train_tokens_per_second": 115470.173
},
{
"epoch": 0.818203955734733,
"grad_norm": 0.13746146857738495,
"learning_rate": 0.0008769576241483369,
"loss": 2.969521903991699,
"num_input_tokens_seen": 7927234560,
"step": 15120,
"train_runtime": 68651.5837,
"train_tokens_per_second": 115470.527
},
{
"epoch": 0.8187450959170973,
"grad_norm": 0.1307765245437622,
"learning_rate": 0.0008747821961555536,
"loss": 2.9746829986572267,
"num_input_tokens_seen": 7932477440,
"step": 15130,
"train_runtime": 68696.7803,
"train_tokens_per_second": 115470.877
},
{
"epoch": 0.8192862360994616,
"grad_norm": 0.12932413816452026,
"learning_rate": 0.0008726124931164572,
"loss": 2.980904388427734,
"num_input_tokens_seen": 7937720320,
"step": 15140,
"train_runtime": 68741.9605,
"train_tokens_per_second": 115471.253
},
{
"epoch": 0.8198273762818258,
"grad_norm": 0.13145951926708221,
"learning_rate": 0.0008704485216550531,
"loss": 2.977578544616699,
"num_input_tokens_seen": 7942963200,
"step": 15150,
"train_runtime": 68787.1491,
"train_tokens_per_second": 115471.615
},
{
"epoch": 0.8203685164641901,
"grad_norm": 0.13109584152698517,
"learning_rate": 0.0008682902883778457,
"loss": 2.973899078369141,
"num_input_tokens_seen": 7948206080,
"step": 15160,
"train_runtime": 68832.3314,
"train_tokens_per_second": 115471.987
},
{
"epoch": 0.8209096566465542,
"grad_norm": 0.1269070953130722,
"learning_rate": 0.0008661377998738207,
"loss": 2.9858329772949217,
"num_input_tokens_seen": 7953448960,
"step": 15170,
"train_runtime": 68877.5165,
"train_tokens_per_second": 115472.354
},
{
"epoch": 0.8214507968289185,
"grad_norm": 0.13239699602127075,
"learning_rate": 0.0008639910627144282,
"loss": 2.9783477783203125,
"num_input_tokens_seen": 7958691840,
"step": 15180,
"train_runtime": 68922.6959,
"train_tokens_per_second": 115472.73
},
{
"epoch": 0.8219919370112828,
"grad_norm": 0.129794642329216,
"learning_rate": 0.0008618500834535568,
"loss": 2.9712141036987303,
"num_input_tokens_seen": 7963934720,
"step": 15190,
"train_runtime": 68967.862,
"train_tokens_per_second": 115473.128
},
{
"epoch": 0.822533077193647,
"grad_norm": 0.13771747052669525,
"learning_rate": 0.0008597148686275189,
"loss": 2.984314727783203,
"num_input_tokens_seen": 7969177600,
"step": 15200,
"train_runtime": 69013.0362,
"train_tokens_per_second": 115473.511
},
{
"epoch": 0.8230742173760113,
"grad_norm": 0.13398458063602448,
"learning_rate": 0.0008575854247550258,
"loss": 2.9714584350585938,
"num_input_tokens_seen": 7974420480,
"step": 15210,
"train_runtime": 69058.1959,
"train_tokens_per_second": 115473.918
},
{
"epoch": 0.8236153575583754,
"grad_norm": 0.13028761744499207,
"learning_rate": 0.0008554617583371726,
"loss": 2.9726911544799806,
"num_input_tokens_seen": 7979663360,
"step": 15220,
"train_runtime": 69103.3538,
"train_tokens_per_second": 115474.328
},
{
"epoch": 0.8241564977407397,
"grad_norm": 0.13187240064144135,
"learning_rate": 0.0008533438758574152,
"loss": 2.9737316131591798,
"num_input_tokens_seen": 7984906240,
"step": 15230,
"train_runtime": 69148.515,
"train_tokens_per_second": 115474.732
},
{
"epoch": 0.824697637923104,
"grad_norm": 0.13035008311271667,
"learning_rate": 0.0008512317837815503,
"loss": 2.9657833099365236,
"num_input_tokens_seen": 7990149120,
"step": 15240,
"train_runtime": 69193.6841,
"train_tokens_per_second": 115475.122
},
{
"epoch": 0.8252387781054682,
"grad_norm": 0.1308414787054062,
"learning_rate": 0.0008491254885576988,
"loss": 2.968144416809082,
"num_input_tokens_seen": 7995392000,
"step": 15250,
"train_runtime": 69238.862,
"train_tokens_per_second": 115475.497
},
{
"epoch": 0.8257799182878325,
"grad_norm": 0.1312231868505478,
"learning_rate": 0.0008470249966162835,
"loss": 2.9749370574951173,
"num_input_tokens_seen": 8000634880,
"step": 15260,
"train_runtime": 69287.9095,
"train_tokens_per_second": 115469.422
},
{
"epoch": 0.8263210584701967,
"grad_norm": 0.13507384061813354,
"learning_rate": 0.0008449303143700088,
"loss": 2.9808319091796873,
"num_input_tokens_seen": 8005877760,
"step": 15270,
"train_runtime": 69333.0664,
"train_tokens_per_second": 115469.835
},
{
"epoch": 0.8268621986525609,
"grad_norm": 0.12942056357860565,
"learning_rate": 0.0008428414482138435,
"loss": 2.969392776489258,
"num_input_tokens_seen": 8011120640,
"step": 15280,
"train_runtime": 69378.1613,
"train_tokens_per_second": 115470.351
},
{
"epoch": 0.8274033388349252,
"grad_norm": 0.12837563455104828,
"learning_rate": 0.0008407584045250001,
"loss": 2.979315185546875,
"num_input_tokens_seen": 8016363520,
"step": 15290,
"train_runtime": 69423.2721,
"train_tokens_per_second": 115470.84
},
{
"epoch": 0.8279444790172894,
"grad_norm": 0.13300900161266327,
"learning_rate": 0.0008386811896629143,
"loss": 2.9644968032836916,
"num_input_tokens_seen": 8021606400,
"step": 15300,
"train_runtime": 69468.3762,
"train_tokens_per_second": 115471.339
},
{
"epoch": 0.8284856191996537,
"grad_norm": 0.12836603820323944,
"learning_rate": 0.0008366098099692285,
"loss": 2.972013473510742,
"num_input_tokens_seen": 8026849280,
"step": 15310,
"train_runtime": 69513.475,
"train_tokens_per_second": 115471.846
},
{
"epoch": 0.8290267593820179,
"grad_norm": 0.12967608869075775,
"learning_rate": 0.0008345442717677699,
"loss": 2.9776493072509767,
"num_input_tokens_seen": 8032092160,
"step": 15320,
"train_runtime": 69558.5739,
"train_tokens_per_second": 115472.352
},
{
"epoch": 0.8295678995643821,
"grad_norm": 0.12830476462841034,
"learning_rate": 0.0008324845813645304,
"loss": 2.9773494720458986,
"num_input_tokens_seen": 8037335040,
"step": 15330,
"train_runtime": 69603.6687,
"train_tokens_per_second": 115472.865
},
{
"epoch": 0.8301090397467464,
"grad_norm": 0.13105891644954681,
"learning_rate": 0.0008304307450476511,
"loss": 2.9748680114746096,
"num_input_tokens_seen": 8042577920,
"step": 15340,
"train_runtime": 69648.769,
"train_tokens_per_second": 115473.368
},
{
"epoch": 0.8306501799291106,
"grad_norm": 0.1301373690366745,
"learning_rate": 0.0008283827690873988,
"loss": 2.9727630615234375,
"num_input_tokens_seen": 8047820800,
"step": 15350,
"train_runtime": 69693.862,
"train_tokens_per_second": 115473.882
},
{
"epoch": 0.8311913201114749,
"grad_norm": 0.13162434101104736,
"learning_rate": 0.0008263406597361503,
"loss": 2.978099822998047,
"num_input_tokens_seen": 8053063680,
"step": 15360,
"train_runtime": 69738.9614,
"train_tokens_per_second": 115474.385
},
{
"epoch": 0.8317324602938391,
"grad_norm": 0.13288192451000214,
"learning_rate": 0.0008243044232283723,
"loss": 2.9758016586303713,
"num_input_tokens_seen": 8058306560,
"step": 15370,
"train_runtime": 69784.0695,
"train_tokens_per_second": 115474.873
},
{
"epoch": 0.8322736004762034,
"grad_norm": 0.136215478181839,
"learning_rate": 0.0008222740657806005,
"loss": 2.976166915893555,
"num_input_tokens_seen": 8063549440,
"step": 15380,
"train_runtime": 69829.1841,
"train_tokens_per_second": 115475.35
},
{
"epoch": 0.8328147406585676,
"grad_norm": 0.12879818677902222,
"learning_rate": 0.000820249593591422,
"loss": 2.9633615493774412,
"num_input_tokens_seen": 8068792320,
"step": 15390,
"train_runtime": 69874.3003,
"train_tokens_per_second": 115475.823
},
{
"epoch": 0.8333558808409318,
"grad_norm": 0.1428280621767044,
"learning_rate": 0.0008182310128414587,
"loss": 2.9798999786376954,
"num_input_tokens_seen": 8074035200,
"step": 15400,
"train_runtime": 69919.3861,
"train_tokens_per_second": 115476.346
},
{
"epoch": 0.8338970210232961,
"grad_norm": 0.1359853297472,
"learning_rate": 0.0008162183296933439,
"loss": 2.968707275390625,
"num_input_tokens_seen": 8079278080,
"step": 15410,
"train_runtime": 69964.4955,
"train_tokens_per_second": 115476.829
},
{
"epoch": 0.8344381612056603,
"grad_norm": 0.13050523400306702,
"learning_rate": 0.0008142115502917066,
"loss": 2.973996162414551,
"num_input_tokens_seen": 8084520960,
"step": 15420,
"train_runtime": 70009.6056,
"train_tokens_per_second": 115477.31
},
{
"epoch": 0.8349793013880246,
"grad_norm": 0.13029220700263977,
"learning_rate": 0.0008122106807631529,
"loss": 2.9792009353637696,
"num_input_tokens_seen": 8089763840,
"step": 15430,
"train_runtime": 70054.706,
"train_tokens_per_second": 115477.807
},
{
"epoch": 0.8355204415703888,
"grad_norm": 0.13232028484344482,
"learning_rate": 0.0008102157272162447,
"loss": 2.9753578186035154,
"num_input_tokens_seen": 8095006720,
"step": 15440,
"train_runtime": 70099.8205,
"train_tokens_per_second": 115478.28
},
{
"epoch": 0.836061581752753,
"grad_norm": 0.13095484673976898,
"learning_rate": 0.0008082266957414837,
"loss": 2.97320671081543,
"num_input_tokens_seen": 8100249600,
"step": 15450,
"train_runtime": 70144.9322,
"train_tokens_per_second": 115478.757
},
{
"epoch": 0.8366027219351173,
"grad_norm": 0.13523340225219727,
"learning_rate": 0.0008062435924112902,
"loss": 2.9681285858154296,
"num_input_tokens_seen": 8105492480,
"step": 15460,
"train_runtime": 70190.0213,
"train_tokens_per_second": 115479.271
},
{
"epoch": 0.8371438621174815,
"grad_norm": 0.13670340180397034,
"learning_rate": 0.0008042664232799893,
"loss": 2.9674022674560545,
"num_input_tokens_seen": 8110735360,
"step": 15470,
"train_runtime": 70235.1367,
"train_tokens_per_second": 115479.741
},
{
"epoch": 0.8376850022998458,
"grad_norm": 0.12936244904994965,
"learning_rate": 0.0008022951943837868,
"loss": 2.966217041015625,
"num_input_tokens_seen": 8115978240,
"step": 15480,
"train_runtime": 70280.2433,
"train_tokens_per_second": 115480.224
},
{
"epoch": 0.8382261424822101,
"grad_norm": 0.14200405776500702,
"learning_rate": 0.0008003299117407532,
"loss": 2.978799247741699,
"num_input_tokens_seen": 8121221120,
"step": 15490,
"train_runtime": 70325.3302,
"train_tokens_per_second": 115480.739
},
{
"epoch": 0.8387672826645742,
"grad_norm": 0.12791140377521515,
"learning_rate": 0.0007983705813508069,
"loss": 2.971164321899414,
"num_input_tokens_seen": 8126464000,
"step": 15500,
"train_runtime": 70370.4812,
"train_tokens_per_second": 115481.149
},
{
"epoch": 0.8387672826645742,
"eval_loss": 2.9325733184814453,
"eval_runtime": 1.9901,
"eval_samples_per_second": 251.238,
"eval_steps_per_second": 4.02,
"num_input_tokens_seen": 8126464000,
"step": 15500
},
{
"epoch": 0.8393084228469385,
"grad_norm": 0.1335526406764984,
"learning_rate": 0.0007964172091956926,
"loss": 2.9691984176635744,
"num_input_tokens_seen": 8131706880,
"step": 15510,
"train_runtime": 70417.588,
"train_tokens_per_second": 115478.35
},
{
"epoch": 0.8398495630293027,
"grad_norm": 0.13724961876869202,
"learning_rate": 0.0007944698012389664,
"loss": 2.9696407318115234,
"num_input_tokens_seen": 8136949760,
"step": 15520,
"train_runtime": 70462.6835,
"train_tokens_per_second": 115478.851
},
{
"epoch": 0.840390703211667,
"grad_norm": 0.13106457889080048,
"learning_rate": 0.0007925283634259745,
"loss": 2.964072036743164,
"num_input_tokens_seen": 8142192640,
"step": 15530,
"train_runtime": 70507.7742,
"train_tokens_per_second": 115479.36
},
{
"epoch": 0.8409318433940313,
"grad_norm": 0.1346583068370819,
"learning_rate": 0.000790592901683838,
"loss": 2.9721302032470702,
"num_input_tokens_seen": 8147435520,
"step": 15540,
"train_runtime": 70552.8789,
"train_tokens_per_second": 115479.845
},
{
"epoch": 0.8414729835763954,
"grad_norm": 0.12788882851600647,
"learning_rate": 0.0007886634219214321,
"loss": 2.9774459838867187,
"num_input_tokens_seen": 8152678400,
"step": 15550,
"train_runtime": 70597.9816,
"train_tokens_per_second": 115480.333
},
{
"epoch": 0.8420141237587597,
"grad_norm": 0.1323845237493515,
"learning_rate": 0.0007867399300293693,
"loss": 2.971846008300781,
"num_input_tokens_seen": 8157921280,
"step": 15560,
"train_runtime": 70643.081,
"train_tokens_per_second": 115480.825
},
{
"epoch": 0.8425552639411239,
"grad_norm": 0.132669135928154,
"learning_rate": 0.0007848224318799821,
"loss": 2.9736881256103516,
"num_input_tokens_seen": 8163164160,
"step": 15570,
"train_runtime": 70688.1702,
"train_tokens_per_second": 115481.334
},
{
"epoch": 0.8430964041234882,
"grad_norm": 0.1315847635269165,
"learning_rate": 0.0007829109333273051,
"loss": 2.9581043243408205,
"num_input_tokens_seen": 8168407040,
"step": 15580,
"train_runtime": 70733.2527,
"train_tokens_per_second": 115481.852
},
{
"epoch": 0.8436375443058525,
"grad_norm": 0.13508620858192444,
"learning_rate": 0.0007810054402070547,
"loss": 2.967576789855957,
"num_input_tokens_seen": 8173649920,
"step": 15590,
"train_runtime": 70778.3173,
"train_tokens_per_second": 115482.4
},
{
"epoch": 0.8441786844882166,
"grad_norm": 0.13094158470630646,
"learning_rate": 0.0007791059583366134,
"loss": 2.969736671447754,
"num_input_tokens_seen": 8178892800,
"step": 15600,
"train_runtime": 70823.3875,
"train_tokens_per_second": 115482.937
},
{
"epoch": 0.8447198246705809,
"grad_norm": 0.13293389976024628,
"learning_rate": 0.0007772124935150125,
"loss": 2.9740530014038087,
"num_input_tokens_seen": 8184135680,
"step": 15610,
"train_runtime": 70868.5107,
"train_tokens_per_second": 115483.387
},
{
"epoch": 0.8452609648529451,
"grad_norm": 0.12885726988315582,
"learning_rate": 0.0007753250515229127,
"loss": 2.9699680328369142,
"num_input_tokens_seen": 8189378560,
"step": 15620,
"train_runtime": 70913.6516,
"train_tokens_per_second": 115483.808
},
{
"epoch": 0.8458021050353094,
"grad_norm": 0.13280688226222992,
"learning_rate": 0.0007734436381225877,
"loss": 2.9740190505981445,
"num_input_tokens_seen": 8194621440,
"step": 15630,
"train_runtime": 70958.7738,
"train_tokens_per_second": 115484.259
},
{
"epoch": 0.8463432452176737,
"grad_norm": 0.13439851999282837,
"learning_rate": 0.0007715682590579061,
"loss": 2.975991439819336,
"num_input_tokens_seen": 8199864320,
"step": 15640,
"train_runtime": 71003.8731,
"train_tokens_per_second": 115484.747
}
],
"logging_steps": 10,
"max_steps": 18480,
"num_input_tokens_seen": 8200388608,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.287888719514173e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}