15954 lines
480 KiB
JSON
15954 lines
480 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.8463973592359101,
|
|
"eval_steps": 500,
|
|
"global_step": 15641,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0005411401823642415,
|
|
"grad_norm": 3.381409168243408,
|
|
"learning_rate": 8.999999999999999e-05,
|
|
"loss": 11.904257202148438,
|
|
"num_input_tokens_seen": 5242880,
|
|
"step": 10,
|
|
"train_runtime": 98.6668,
|
|
"train_tokens_per_second": 53137.211
|
|
},
|
|
{
|
|
"epoch": 0.001082280364728483,
|
|
"grad_norm": 3.160391092300415,
|
|
"learning_rate": 0.00019,
|
|
"loss": 11.522220611572266,
|
|
"num_input_tokens_seen": 10485760,
|
|
"step": 20,
|
|
"train_runtime": 143.6354,
|
|
"train_tokens_per_second": 73002.603
|
|
},
|
|
{
|
|
"epoch": 0.0016234205470927244,
|
|
"grad_norm": 2.81247878074646,
|
|
"learning_rate": 0.00029,
|
|
"loss": 10.88598403930664,
|
|
"num_input_tokens_seen": 15728640,
|
|
"step": 30,
|
|
"train_runtime": 188.573,
|
|
"train_tokens_per_second": 83408.77
|
|
},
|
|
{
|
|
"epoch": 0.002164560729456966,
|
|
"grad_norm": 2.100623846054077,
|
|
"learning_rate": 0.00039,
|
|
"loss": 10.268418884277343,
|
|
"num_input_tokens_seen": 20971520,
|
|
"step": 40,
|
|
"train_runtime": 233.5046,
|
|
"train_tokens_per_second": 89812.025
|
|
},
|
|
{
|
|
"epoch": 0.002705700911821207,
|
|
"grad_norm": 1.6848973035812378,
|
|
"learning_rate": 0.00049,
|
|
"loss": 9.741734313964844,
|
|
"num_input_tokens_seen": 26214400,
|
|
"step": 50,
|
|
"train_runtime": 278.4485,
|
|
"train_tokens_per_second": 94144.504
|
|
},
|
|
{
|
|
"epoch": 0.003246841094185449,
|
|
"grad_norm": 2.3044769763946533,
|
|
"learning_rate": 0.00059,
|
|
"loss": 9.239765167236328,
|
|
"num_input_tokens_seen": 31457280,
|
|
"step": 60,
|
|
"train_runtime": 323.3989,
|
|
"train_tokens_per_second": 97270.822
|
|
},
|
|
{
|
|
"epoch": 0.00378798127654969,
|
|
"grad_norm": 1.6891608238220215,
|
|
"learning_rate": 0.0006900000000000001,
|
|
"loss": 8.75671157836914,
|
|
"num_input_tokens_seen": 36700160,
|
|
"step": 70,
|
|
"train_runtime": 368.3824,
|
|
"train_tokens_per_second": 99625.16
|
|
},
|
|
{
|
|
"epoch": 0.004329121458913932,
|
|
"grad_norm": 1.304526448249817,
|
|
"learning_rate": 0.00079,
|
|
"loss": 8.304008483886719,
|
|
"num_input_tokens_seen": 41943040,
|
|
"step": 80,
|
|
"train_runtime": 413.3956,
|
|
"train_tokens_per_second": 101459.807
|
|
},
|
|
{
|
|
"epoch": 0.004870261641278173,
|
|
"grad_norm": 0.9729027152061462,
|
|
"learning_rate": 0.00089,
|
|
"loss": 7.881130981445312,
|
|
"num_input_tokens_seen": 47185920,
|
|
"step": 90,
|
|
"train_runtime": 458.4234,
|
|
"train_tokens_per_second": 102930.863
|
|
},
|
|
{
|
|
"epoch": 0.005411401823642414,
|
|
"grad_norm": 0.8812423944473267,
|
|
"learning_rate": 0.00099,
|
|
"loss": 7.507221984863281,
|
|
"num_input_tokens_seen": 52428800,
|
|
"step": 100,
|
|
"train_runtime": 503.4688,
|
|
"train_tokens_per_second": 104135.16
|
|
},
|
|
{
|
|
"epoch": 0.005952542006006656,
|
|
"grad_norm": 0.8653954863548279,
|
|
"learning_rate": 0.00109,
|
|
"loss": 7.197725677490235,
|
|
"num_input_tokens_seen": 57671680,
|
|
"step": 110,
|
|
"train_runtime": 548.5148,
|
|
"train_tokens_per_second": 105141.522
|
|
},
|
|
{
|
|
"epoch": 0.006493682188370898,
|
|
"grad_norm": 0.6527077555656433,
|
|
"learning_rate": 0.0011899999999999999,
|
|
"loss": 6.962340545654297,
|
|
"num_input_tokens_seen": 62914560,
|
|
"step": 120,
|
|
"train_runtime": 593.6028,
|
|
"train_tokens_per_second": 105987.634
|
|
},
|
|
{
|
|
"epoch": 0.007034822370735139,
|
|
"grad_norm": 0.9212841987609863,
|
|
"learning_rate": 0.0012900000000000001,
|
|
"loss": 6.748843383789063,
|
|
"num_input_tokens_seen": 68157440,
|
|
"step": 130,
|
|
"train_runtime": 638.7162,
|
|
"train_tokens_per_second": 106710.057
|
|
},
|
|
{
|
|
"epoch": 0.00757596255309938,
|
|
"grad_norm": 0.8431305885314941,
|
|
"learning_rate": 0.0013900000000000002,
|
|
"loss": 6.572750854492187,
|
|
"num_input_tokens_seen": 73400320,
|
|
"step": 140,
|
|
"train_runtime": 683.8844,
|
|
"train_tokens_per_second": 107328.553
|
|
},
|
|
{
|
|
"epoch": 0.008117102735463622,
|
|
"grad_norm": 1.0355322360992432,
|
|
"learning_rate": 0.00149,
|
|
"loss": 6.391636657714844,
|
|
"num_input_tokens_seen": 78643200,
|
|
"step": 150,
|
|
"train_runtime": 729.0444,
|
|
"train_tokens_per_second": 107871.613
|
|
},
|
|
{
|
|
"epoch": 0.008658242917827864,
|
|
"grad_norm": 1.3339749574661255,
|
|
"learning_rate": 0.00159,
|
|
"loss": 6.223532104492188,
|
|
"num_input_tokens_seen": 83886080,
|
|
"step": 160,
|
|
"train_runtime": 774.2149,
|
|
"train_tokens_per_second": 108349.867
|
|
},
|
|
{
|
|
"epoch": 0.009199383100192105,
|
|
"grad_norm": 1.152486801147461,
|
|
"learning_rate": 0.00169,
|
|
"loss": 6.081370162963867,
|
|
"num_input_tokens_seen": 89128960,
|
|
"step": 170,
|
|
"train_runtime": 819.4009,
|
|
"train_tokens_per_second": 108773.326
|
|
},
|
|
{
|
|
"epoch": 0.009740523282556346,
|
|
"grad_norm": 1.163500189781189,
|
|
"learning_rate": 0.00179,
|
|
"loss": 5.940819549560547,
|
|
"num_input_tokens_seen": 94371840,
|
|
"step": 180,
|
|
"train_runtime": 864.5859,
|
|
"train_tokens_per_second": 109152.654
|
|
},
|
|
{
|
|
"epoch": 0.010281663464920588,
|
|
"grad_norm": 1.2408533096313477,
|
|
"learning_rate": 0.00189,
|
|
"loss": 5.812583160400391,
|
|
"num_input_tokens_seen": 99614720,
|
|
"step": 190,
|
|
"train_runtime": 909.7569,
|
|
"train_tokens_per_second": 109495.979
|
|
},
|
|
{
|
|
"epoch": 0.010822803647284829,
|
|
"grad_norm": 1.1574287414550781,
|
|
"learning_rate": 0.00199,
|
|
"loss": 5.6905670166015625,
|
|
"num_input_tokens_seen": 104857600,
|
|
"step": 200,
|
|
"train_runtime": 954.9646,
|
|
"train_tokens_per_second": 109802.599
|
|
},
|
|
{
|
|
"epoch": 0.011363943829649071,
|
|
"grad_norm": 1.296819806098938,
|
|
"learning_rate": 0.00209,
|
|
"loss": 5.591656494140625,
|
|
"num_input_tokens_seen": 110100480,
|
|
"step": 210,
|
|
"train_runtime": 1000.1348,
|
|
"train_tokens_per_second": 110085.64
|
|
},
|
|
{
|
|
"epoch": 0.011905084012013312,
|
|
"grad_norm": 1.0654325485229492,
|
|
"learning_rate": 0.00219,
|
|
"loss": 5.485440444946289,
|
|
"num_input_tokens_seen": 115343360,
|
|
"step": 220,
|
|
"train_runtime": 1045.3265,
|
|
"train_tokens_per_second": 110341.942
|
|
},
|
|
{
|
|
"epoch": 0.012446224194377553,
|
|
"grad_norm": 1.1002130508422852,
|
|
"learning_rate": 0.00229,
|
|
"loss": 5.387198257446289,
|
|
"num_input_tokens_seen": 120586240,
|
|
"step": 230,
|
|
"train_runtime": 1090.5293,
|
|
"train_tokens_per_second": 110575.879
|
|
},
|
|
{
|
|
"epoch": 0.012987364376741795,
|
|
"grad_norm": 1.0023939609527588,
|
|
"learning_rate": 0.0023899999999999998,
|
|
"loss": 5.29501953125,
|
|
"num_input_tokens_seen": 125829120,
|
|
"step": 240,
|
|
"train_runtime": 1135.7379,
|
|
"train_tokens_per_second": 110790.637
|
|
},
|
|
{
|
|
"epoch": 0.013528504559106036,
|
|
"grad_norm": 0.8933797478675842,
|
|
"learning_rate": 0.00249,
|
|
"loss": 5.21459846496582,
|
|
"num_input_tokens_seen": 131072000,
|
|
"step": 250,
|
|
"train_runtime": 1180.9487,
|
|
"train_tokens_per_second": 110988.737
|
|
},
|
|
{
|
|
"epoch": 0.014069644741470278,
|
|
"grad_norm": 1.0700093507766724,
|
|
"learning_rate": 0.0025900000000000003,
|
|
"loss": 5.131219482421875,
|
|
"num_input_tokens_seen": 136314880,
|
|
"step": 260,
|
|
"train_runtime": 1226.1688,
|
|
"train_tokens_per_second": 111171.384
|
|
},
|
|
{
|
|
"epoch": 0.01461078492383452,
|
|
"grad_norm": 1.194157600402832,
|
|
"learning_rate": 0.00269,
|
|
"loss": 5.058176422119141,
|
|
"num_input_tokens_seen": 141557760,
|
|
"step": 270,
|
|
"train_runtime": 1271.4137,
|
|
"train_tokens_per_second": 111338.863
|
|
},
|
|
{
|
|
"epoch": 0.01515192510619876,
|
|
"grad_norm": 1.097806453704834,
|
|
"learning_rate": 0.0027900000000000004,
|
|
"loss": 4.985312652587891,
|
|
"num_input_tokens_seen": 146800640,
|
|
"step": 280,
|
|
"train_runtime": 1316.6654,
|
|
"train_tokens_per_second": 111494.264
|
|
},
|
|
{
|
|
"epoch": 0.015693065288563002,
|
|
"grad_norm": 0.9698807001113892,
|
|
"learning_rate": 0.0028899999999999998,
|
|
"loss": 4.905867004394532,
|
|
"num_input_tokens_seen": 152043520,
|
|
"step": 290,
|
|
"train_runtime": 1361.9146,
|
|
"train_tokens_per_second": 111639.537
|
|
},
|
|
{
|
|
"epoch": 0.016234205470927243,
|
|
"grad_norm": 0.9057841300964355,
|
|
"learning_rate": 0.00299,
|
|
"loss": 4.847021102905273,
|
|
"num_input_tokens_seen": 157286400,
|
|
"step": 300,
|
|
"train_runtime": 1407.157,
|
|
"train_tokens_per_second": 111776.016
|
|
},
|
|
{
|
|
"epoch": 0.016775345653291484,
|
|
"grad_norm": 0.9901854395866394,
|
|
"learning_rate": 0.00309,
|
|
"loss": 4.7939613342285154,
|
|
"num_input_tokens_seen": 162529280,
|
|
"step": 310,
|
|
"train_runtime": 1452.4002,
|
|
"train_tokens_per_second": 111903.927
|
|
},
|
|
{
|
|
"epoch": 0.017316485835655728,
|
|
"grad_norm": 0.9903898239135742,
|
|
"learning_rate": 0.00319,
|
|
"loss": 4.720642852783203,
|
|
"num_input_tokens_seen": 167772160,
|
|
"step": 320,
|
|
"train_runtime": 1497.6772,
|
|
"train_tokens_per_second": 112021.577
|
|
},
|
|
{
|
|
"epoch": 0.01785762601801997,
|
|
"grad_norm": 1.001007318496704,
|
|
"learning_rate": 0.0032900000000000004,
|
|
"loss": 4.6716564178466795,
|
|
"num_input_tokens_seen": 173015040,
|
|
"step": 330,
|
|
"train_runtime": 1542.9302,
|
|
"train_tokens_per_second": 112134.066
|
|
},
|
|
{
|
|
"epoch": 0.01839876620038421,
|
|
"grad_norm": 0.9968867897987366,
|
|
"learning_rate": 0.0033900000000000002,
|
|
"loss": 4.6183723449707035,
|
|
"num_input_tokens_seen": 178257920,
|
|
"step": 340,
|
|
"train_runtime": 1588.1903,
|
|
"train_tokens_per_second": 112239.647
|
|
},
|
|
{
|
|
"epoch": 0.01893990638274845,
|
|
"grad_norm": 0.9285950064659119,
|
|
"learning_rate": 0.00349,
|
|
"loss": 4.572103881835938,
|
|
"num_input_tokens_seen": 183500800,
|
|
"step": 350,
|
|
"train_runtime": 1633.4577,
|
|
"train_tokens_per_second": 112338.875
|
|
},
|
|
{
|
|
"epoch": 0.01948104656511269,
|
|
"grad_norm": 0.989262044429779,
|
|
"learning_rate": 0.00359,
|
|
"loss": 4.529151153564453,
|
|
"num_input_tokens_seen": 188743680,
|
|
"step": 360,
|
|
"train_runtime": 1678.7231,
|
|
"train_tokens_per_second": 112432.882
|
|
},
|
|
{
|
|
"epoch": 0.020022186747476935,
|
|
"grad_norm": 1.0208923816680908,
|
|
"learning_rate": 0.00369,
|
|
"loss": 4.4996803283691404,
|
|
"num_input_tokens_seen": 193986560,
|
|
"step": 370,
|
|
"train_runtime": 1724.0055,
|
|
"train_tokens_per_second": 112520.846
|
|
},
|
|
{
|
|
"epoch": 0.020563326929841176,
|
|
"grad_norm": 0.9494571089744568,
|
|
"learning_rate": 0.00379,
|
|
"loss": 4.4542186737060545,
|
|
"num_input_tokens_seen": 199229440,
|
|
"step": 380,
|
|
"train_runtime": 1769.3003,
|
|
"train_tokens_per_second": 112603.517
|
|
},
|
|
{
|
|
"epoch": 0.021104467112205417,
|
|
"grad_norm": 0.7988581657409668,
|
|
"learning_rate": 0.0038900000000000002,
|
|
"loss": 4.431841278076172,
|
|
"num_input_tokens_seen": 204472320,
|
|
"step": 390,
|
|
"train_runtime": 1818.3866,
|
|
"train_tokens_per_second": 112447.111
|
|
},
|
|
{
|
|
"epoch": 0.021645607294569658,
|
|
"grad_norm": 0.832046389579773,
|
|
"learning_rate": 0.0039900000000000005,
|
|
"loss": 4.396000671386719,
|
|
"num_input_tokens_seen": 209715200,
|
|
"step": 400,
|
|
"train_runtime": 1863.6718,
|
|
"train_tokens_per_second": 112527.967
|
|
},
|
|
{
|
|
"epoch": 0.0221867474769339,
|
|
"grad_norm": 0.8342320919036865,
|
|
"learning_rate": 0.00409,
|
|
"loss": 4.37408332824707,
|
|
"num_input_tokens_seen": 214958080,
|
|
"step": 410,
|
|
"train_runtime": 1909.0219,
|
|
"train_tokens_per_second": 112601.162
|
|
},
|
|
{
|
|
"epoch": 0.022727887659298143,
|
|
"grad_norm": 0.9766927361488342,
|
|
"learning_rate": 0.00419,
|
|
"loss": 4.35020637512207,
|
|
"num_input_tokens_seen": 220200960,
|
|
"step": 420,
|
|
"train_runtime": 1954.3817,
|
|
"train_tokens_per_second": 112670.397
|
|
},
|
|
{
|
|
"epoch": 0.023269027841662383,
|
|
"grad_norm": 0.8501082062721252,
|
|
"learning_rate": 0.00429,
|
|
"loss": 4.312299346923828,
|
|
"num_input_tokens_seen": 225443840,
|
|
"step": 430,
|
|
"train_runtime": 1999.72,
|
|
"train_tokens_per_second": 112737.702
|
|
},
|
|
{
|
|
"epoch": 0.023810168024026624,
|
|
"grad_norm": 0.8430765867233276,
|
|
"learning_rate": 0.00439,
|
|
"loss": 4.310842895507813,
|
|
"num_input_tokens_seen": 230686720,
|
|
"step": 440,
|
|
"train_runtime": 2045.0616,
|
|
"train_tokens_per_second": 112801.843
|
|
},
|
|
{
|
|
"epoch": 0.024351308206390865,
|
|
"grad_norm": 0.7848499417304993,
|
|
"learning_rate": 0.00449,
|
|
"loss": 4.2807456970214846,
|
|
"num_input_tokens_seen": 235929600,
|
|
"step": 450,
|
|
"train_runtime": 2090.4316,
|
|
"train_tokens_per_second": 112861.668
|
|
},
|
|
{
|
|
"epoch": 0.024892448388755106,
|
|
"grad_norm": 0.9066799879074097,
|
|
"learning_rate": 0.00459,
|
|
"loss": 4.257038879394531,
|
|
"num_input_tokens_seen": 241172480,
|
|
"step": 460,
|
|
"train_runtime": 2135.8061,
|
|
"train_tokens_per_second": 112918.713
|
|
},
|
|
{
|
|
"epoch": 0.02543358857111935,
|
|
"grad_norm": 0.7888091802597046,
|
|
"learning_rate": 0.00469,
|
|
"loss": 4.235981750488281,
|
|
"num_input_tokens_seen": 246415360,
|
|
"step": 470,
|
|
"train_runtime": 2181.1434,
|
|
"train_tokens_per_second": 112975.315
|
|
},
|
|
{
|
|
"epoch": 0.02597472875348359,
|
|
"grad_norm": 0.6987936496734619,
|
|
"learning_rate": 0.00479,
|
|
"loss": 4.215058135986328,
|
|
"num_input_tokens_seen": 251658240,
|
|
"step": 480,
|
|
"train_runtime": 2226.5022,
|
|
"train_tokens_per_second": 113028.517
|
|
},
|
|
{
|
|
"epoch": 0.02651586893584783,
|
|
"grad_norm": 0.8686115741729736,
|
|
"learning_rate": 0.00489,
|
|
"loss": 4.219595336914063,
|
|
"num_input_tokens_seen": 256901120,
|
|
"step": 490,
|
|
"train_runtime": 2271.8741,
|
|
"train_tokens_per_second": 113078.945
|
|
},
|
|
{
|
|
"epoch": 0.027057009118212072,
|
|
"grad_norm": 0.9207416772842407,
|
|
"learning_rate": 0.0049900000000000005,
|
|
"loss": 4.1927734375,
|
|
"num_input_tokens_seen": 262144000,
|
|
"step": 500,
|
|
"train_runtime": 2317.2129,
|
|
"train_tokens_per_second": 113129.008
|
|
},
|
|
{
|
|
"epoch": 0.027057009118212072,
|
|
"eval_loss": 4.101890563964844,
|
|
"eval_runtime": 2.0015,
|
|
"eval_samples_per_second": 249.812,
|
|
"eval_steps_per_second": 3.997,
|
|
"num_input_tokens_seen": 262144000,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.027598149300576313,
|
|
"grad_norm": 0.7904194593429565,
|
|
"learning_rate": 0.0049999972179955365,
|
|
"loss": 4.170513916015625,
|
|
"num_input_tokens_seen": 267386880,
|
|
"step": 510,
|
|
"train_runtime": 2364.5371,
|
|
"train_tokens_per_second": 113082.126
|
|
},
|
|
{
|
|
"epoch": 0.028139289482940557,
|
|
"grad_norm": 0.63487708568573,
|
|
"learning_rate": 0.004999987601198816,
|
|
"loss": 4.152132034301758,
|
|
"num_input_tokens_seen": 272629760,
|
|
"step": 520,
|
|
"train_runtime": 2409.8408,
|
|
"train_tokens_per_second": 113131.854
|
|
},
|
|
{
|
|
"epoch": 0.028680429665304798,
|
|
"grad_norm": 0.519443154335022,
|
|
"learning_rate": 0.0049999711152934586,
|
|
"loss": 4.145978164672852,
|
|
"num_input_tokens_seen": 277872640,
|
|
"step": 530,
|
|
"train_runtime": 2455.1664,
|
|
"train_tokens_per_second": 113178.739
|
|
},
|
|
{
|
|
"epoch": 0.02922156984766904,
|
|
"grad_norm": 0.5551373362541199,
|
|
"learning_rate": 0.004999947760329793,
|
|
"loss": 4.118004608154297,
|
|
"num_input_tokens_seen": 283115520,
|
|
"step": 540,
|
|
"train_runtime": 2500.4657,
|
|
"train_tokens_per_second": 113225.118
|
|
},
|
|
{
|
|
"epoch": 0.02976271003003328,
|
|
"grad_norm": 0.48466068506240845,
|
|
"learning_rate": 0.004999917536379122,
|
|
"loss": 4.0990447998046875,
|
|
"num_input_tokens_seen": 288358400,
|
|
"step": 550,
|
|
"train_runtime": 2545.7721,
|
|
"train_tokens_per_second": 113269.528
|
|
},
|
|
{
|
|
"epoch": 0.03030385021239752,
|
|
"grad_norm": 0.4300881624221802,
|
|
"learning_rate": 0.004999880443533718,
|
|
"loss": 4.095553207397461,
|
|
"num_input_tokens_seen": 293601280,
|
|
"step": 560,
|
|
"train_runtime": 2591.05,
|
|
"train_tokens_per_second": 113313.63
|
|
},
|
|
{
|
|
"epoch": 0.030844990394761764,
|
|
"grad_norm": 0.3266729414463043,
|
|
"learning_rate": 0.004999836481906822,
|
|
"loss": 4.074318313598633,
|
|
"num_input_tokens_seen": 298844160,
|
|
"step": 570,
|
|
"train_runtime": 2636.411,
|
|
"train_tokens_per_second": 113352.647
|
|
},
|
|
{
|
|
"epoch": 0.031386130577126005,
|
|
"grad_norm": 0.34210285544395447,
|
|
"learning_rate": 0.004999785651632649,
|
|
"loss": 4.055056762695313,
|
|
"num_input_tokens_seen": 304087040,
|
|
"step": 580,
|
|
"train_runtime": 2681.696,
|
|
"train_tokens_per_second": 113393.554
|
|
},
|
|
{
|
|
"epoch": 0.03192727075949025,
|
|
"grad_norm": 0.3171045482158661,
|
|
"learning_rate": 0.004999727952866382,
|
|
"loss": 4.028103637695312,
|
|
"num_input_tokens_seen": 309329920,
|
|
"step": 590,
|
|
"train_runtime": 2726.9513,
|
|
"train_tokens_per_second": 113434.342
|
|
},
|
|
{
|
|
"epoch": 0.032468410941854486,
|
|
"grad_norm": 0.28656497597694397,
|
|
"learning_rate": 0.00499966338578417,
|
|
"loss": 4.014062118530274,
|
|
"num_input_tokens_seen": 314572800,
|
|
"step": 600,
|
|
"train_runtime": 2772.2472,
|
|
"train_tokens_per_second": 113472.131
|
|
},
|
|
{
|
|
"epoch": 0.03300955112421873,
|
|
"grad_norm": 0.31004276871681213,
|
|
"learning_rate": 0.004999591950583134,
|
|
"loss": 4.000431060791016,
|
|
"num_input_tokens_seen": 319815680,
|
|
"step": 610,
|
|
"train_runtime": 2817.5313,
|
|
"train_tokens_per_second": 113509.186
|
|
},
|
|
{
|
|
"epoch": 0.03355069130658297,
|
|
"grad_norm": 0.29579785466194153,
|
|
"learning_rate": 0.004999513647481364,
|
|
"loss": 3.9810386657714845,
|
|
"num_input_tokens_seen": 325058560,
|
|
"step": 620,
|
|
"train_runtime": 2862.8161,
|
|
"train_tokens_per_second": 113545.036
|
|
},
|
|
{
|
|
"epoch": 0.03409183148894721,
|
|
"grad_norm": 0.28329184651374817,
|
|
"learning_rate": 0.0049994284767179145,
|
|
"loss": 3.975200653076172,
|
|
"num_input_tokens_seen": 330301440,
|
|
"step": 630,
|
|
"train_runtime": 2908.1102,
|
|
"train_tokens_per_second": 113579.411
|
|
},
|
|
{
|
|
"epoch": 0.034632971671311456,
|
|
"grad_norm": 0.2848559319972992,
|
|
"learning_rate": 0.004999336438552809,
|
|
"loss": 3.9574630737304686,
|
|
"num_input_tokens_seen": 335544320,
|
|
"step": 640,
|
|
"train_runtime": 2953.403,
|
|
"train_tokens_per_second": 113612.776
|
|
},
|
|
{
|
|
"epoch": 0.035174111853675694,
|
|
"grad_norm": 0.2778968811035156,
|
|
"learning_rate": 0.004999237533267034,
|
|
"loss": 3.951917266845703,
|
|
"num_input_tokens_seen": 340787200,
|
|
"step": 650,
|
|
"train_runtime": 2998.7048,
|
|
"train_tokens_per_second": 113644.799
|
|
},
|
|
{
|
|
"epoch": 0.03571525203603994,
|
|
"grad_norm": 0.28124260902404785,
|
|
"learning_rate": 0.004999131761162544,
|
|
"loss": 3.93038330078125,
|
|
"num_input_tokens_seen": 346030080,
|
|
"step": 660,
|
|
"train_runtime": 3044.0205,
|
|
"train_tokens_per_second": 113675.344
|
|
},
|
|
{
|
|
"epoch": 0.036256392218404175,
|
|
"grad_norm": 0.25421732664108276,
|
|
"learning_rate": 0.004999019122562258,
|
|
"loss": 3.9207611083984375,
|
|
"num_input_tokens_seen": 351272960,
|
|
"step": 670,
|
|
"train_runtime": 3089.3299,
|
|
"train_tokens_per_second": 113705.227
|
|
},
|
|
{
|
|
"epoch": 0.03679753240076842,
|
|
"grad_norm": 0.2740730345249176,
|
|
"learning_rate": 0.0049988996178100525,
|
|
"loss": 3.91453857421875,
|
|
"num_input_tokens_seen": 356515840,
|
|
"step": 680,
|
|
"train_runtime": 3134.6017,
|
|
"train_tokens_per_second": 113735.61
|
|
},
|
|
{
|
|
"epoch": 0.037338672583132664,
|
|
"grad_norm": 0.2670656740665436,
|
|
"learning_rate": 0.004998773247270772,
|
|
"loss": 3.884227752685547,
|
|
"num_input_tokens_seen": 361758720,
|
|
"step": 690,
|
|
"train_runtime": 3179.9122,
|
|
"train_tokens_per_second": 113763.746
|
|
},
|
|
{
|
|
"epoch": 0.0378798127654969,
|
|
"grad_norm": 0.2549172341823578,
|
|
"learning_rate": 0.004998640011330221,
|
|
"loss": 3.880903625488281,
|
|
"num_input_tokens_seen": 367001600,
|
|
"step": 700,
|
|
"train_runtime": 3225.2126,
|
|
"train_tokens_per_second": 113791.443
|
|
},
|
|
{
|
|
"epoch": 0.038420952947861145,
|
|
"grad_norm": 0.23274943232536316,
|
|
"learning_rate": 0.004998499910395162,
|
|
"loss": 3.8808818817138673,
|
|
"num_input_tokens_seen": 372244480,
|
|
"step": 710,
|
|
"train_runtime": 3270.4782,
|
|
"train_tokens_per_second": 113819.588
|
|
},
|
|
{
|
|
"epoch": 0.03896209313022538,
|
|
"grad_norm": 0.2661728858947754,
|
|
"learning_rate": 0.004998352944893316,
|
|
"loss": 3.860551452636719,
|
|
"num_input_tokens_seen": 377487360,
|
|
"step": 720,
|
|
"train_runtime": 3315.7715,
|
|
"train_tokens_per_second": 113846.012
|
|
},
|
|
{
|
|
"epoch": 0.039503233312589627,
|
|
"grad_norm": 0.27070483565330505,
|
|
"learning_rate": 0.004998199115273362,
|
|
"loss": 3.8578773498535157,
|
|
"num_input_tokens_seen": 382730240,
|
|
"step": 730,
|
|
"train_runtime": 3361.0384,
|
|
"train_tokens_per_second": 113872.616
|
|
},
|
|
{
|
|
"epoch": 0.04004437349495387,
|
|
"grad_norm": 0.2620537281036377,
|
|
"learning_rate": 0.004998038422004937,
|
|
"loss": 3.8334423065185548,
|
|
"num_input_tokens_seen": 387973120,
|
|
"step": 740,
|
|
"train_runtime": 3406.3177,
|
|
"train_tokens_per_second": 113898.102
|
|
},
|
|
{
|
|
"epoch": 0.04058551367731811,
|
|
"grad_norm": 0.24665935337543488,
|
|
"learning_rate": 0.004997870865578627,
|
|
"loss": 3.830191802978516,
|
|
"num_input_tokens_seen": 393216000,
|
|
"step": 750,
|
|
"train_runtime": 3451.6094,
|
|
"train_tokens_per_second": 113922.508
|
|
},
|
|
{
|
|
"epoch": 0.04112665385968235,
|
|
"grad_norm": 0.3058369755744934,
|
|
"learning_rate": 0.004997696446505975,
|
|
"loss": 3.81226806640625,
|
|
"num_input_tokens_seen": 398458880,
|
|
"step": 760,
|
|
"train_runtime": 3496.8514,
|
|
"train_tokens_per_second": 113947.901
|
|
},
|
|
{
|
|
"epoch": 0.04166779404204659,
|
|
"grad_norm": 0.24344538152217865,
|
|
"learning_rate": 0.004997515165319476,
|
|
"loss": 3.8191978454589846,
|
|
"num_input_tokens_seen": 403701760,
|
|
"step": 770,
|
|
"train_runtime": 3545.7622,
|
|
"train_tokens_per_second": 113854.718
|
|
},
|
|
{
|
|
"epoch": 0.042208934224410834,
|
|
"grad_norm": 0.26970189809799194,
|
|
"learning_rate": 0.004997327022572571,
|
|
"loss": 3.794965362548828,
|
|
"num_input_tokens_seen": 408944640,
|
|
"step": 780,
|
|
"train_runtime": 3591.0695,
|
|
"train_tokens_per_second": 113878.231
|
|
},
|
|
{
|
|
"epoch": 0.04275007440677508,
|
|
"grad_norm": 0.2699701189994812,
|
|
"learning_rate": 0.0049971320188396525,
|
|
"loss": 3.7990867614746096,
|
|
"num_input_tokens_seen": 414187520,
|
|
"step": 790,
|
|
"train_runtime": 3636.3454,
|
|
"train_tokens_per_second": 113902.14
|
|
},
|
|
{
|
|
"epoch": 0.043291214589139315,
|
|
"grad_norm": 0.24337078630924225,
|
|
"learning_rate": 0.004996930154716057,
|
|
"loss": 3.795510101318359,
|
|
"num_input_tokens_seen": 419430400,
|
|
"step": 800,
|
|
"train_runtime": 3681.6305,
|
|
"train_tokens_per_second": 113925.175
|
|
},
|
|
{
|
|
"epoch": 0.04383235477150356,
|
|
"grad_norm": 0.24991652369499207,
|
|
"learning_rate": 0.004996721430818068,
|
|
"loss": 3.7792850494384767,
|
|
"num_input_tokens_seen": 424673280,
|
|
"step": 810,
|
|
"train_runtime": 3726.9273,
|
|
"train_tokens_per_second": 113947.293
|
|
},
|
|
{
|
|
"epoch": 0.0443734949538678,
|
|
"grad_norm": 0.22850197553634644,
|
|
"learning_rate": 0.004996505847782908,
|
|
"loss": 3.7752288818359374,
|
|
"num_input_tokens_seen": 429916160,
|
|
"step": 820,
|
|
"train_runtime": 3772.1962,
|
|
"train_tokens_per_second": 113969.725
|
|
},
|
|
{
|
|
"epoch": 0.04491463513623204,
|
|
"grad_norm": 0.24704036116600037,
|
|
"learning_rate": 0.004996283406268743,
|
|
"loss": 3.7673095703125,
|
|
"num_input_tokens_seen": 435159040,
|
|
"step": 830,
|
|
"train_runtime": 3817.4555,
|
|
"train_tokens_per_second": 113991.908
|
|
},
|
|
{
|
|
"epoch": 0.045455775318596285,
|
|
"grad_norm": 0.24149645864963531,
|
|
"learning_rate": 0.004996054106954677,
|
|
"loss": 3.767901611328125,
|
|
"num_input_tokens_seen": 440401920,
|
|
"step": 840,
|
|
"train_runtime": 3862.7306,
|
|
"train_tokens_per_second": 114013.106
|
|
},
|
|
{
|
|
"epoch": 0.04599691550096052,
|
|
"grad_norm": 0.26389098167419434,
|
|
"learning_rate": 0.004995817950540749,
|
|
"loss": 3.765447998046875,
|
|
"num_input_tokens_seen": 445644800,
|
|
"step": 850,
|
|
"train_runtime": 3908.0129,
|
|
"train_tokens_per_second": 114033.605
|
|
},
|
|
{
|
|
"epoch": 0.04653805568332477,
|
|
"grad_norm": 0.2389504611492157,
|
|
"learning_rate": 0.004995574937747936,
|
|
"loss": 3.7446453094482424,
|
|
"num_input_tokens_seen": 450887680,
|
|
"step": 860,
|
|
"train_runtime": 3953.2772,
|
|
"train_tokens_per_second": 114054.151
|
|
},
|
|
{
|
|
"epoch": 0.047079195865689004,
|
|
"grad_norm": 0.21696795523166656,
|
|
"learning_rate": 0.0049953250693181425,
|
|
"loss": 3.7382736206054688,
|
|
"num_input_tokens_seen": 456130560,
|
|
"step": 870,
|
|
"train_runtime": 3998.5453,
|
|
"train_tokens_per_second": 114074.125
|
|
},
|
|
{
|
|
"epoch": 0.04762033604805325,
|
|
"grad_norm": 0.23217777907848358,
|
|
"learning_rate": 0.004995068346014207,
|
|
"loss": 3.7418495178222657,
|
|
"num_input_tokens_seen": 461373440,
|
|
"step": 880,
|
|
"train_runtime": 4043.8212,
|
|
"train_tokens_per_second": 114093.431
|
|
},
|
|
{
|
|
"epoch": 0.04816147623041749,
|
|
"grad_norm": 0.25520190596580505,
|
|
"learning_rate": 0.004994804768619892,
|
|
"loss": 3.7273784637451173,
|
|
"num_input_tokens_seen": 466616320,
|
|
"step": 890,
|
|
"train_runtime": 4089.1251,
|
|
"train_tokens_per_second": 114111.53
|
|
},
|
|
{
|
|
"epoch": 0.04870261641278173,
|
|
"grad_norm": 0.2495919018983841,
|
|
"learning_rate": 0.004994534337939889,
|
|
"loss": 3.7182594299316407,
|
|
"num_input_tokens_seen": 471859200,
|
|
"step": 900,
|
|
"train_runtime": 4134.3995,
|
|
"train_tokens_per_second": 114130.045
|
|
},
|
|
{
|
|
"epoch": 0.049243756595145974,
|
|
"grad_norm": 0.2571962773799896,
|
|
"learning_rate": 0.00499425705479981,
|
|
"loss": 3.7261619567871094,
|
|
"num_input_tokens_seen": 477102080,
|
|
"step": 910,
|
|
"train_runtime": 4179.6624,
|
|
"train_tokens_per_second": 114148.472
|
|
},
|
|
{
|
|
"epoch": 0.04978489677751021,
|
|
"grad_norm": 0.2216644585132599,
|
|
"learning_rate": 0.004993972920046188,
|
|
"loss": 3.705414581298828,
|
|
"num_input_tokens_seen": 482344960,
|
|
"step": 920,
|
|
"train_runtime": 4224.9503,
|
|
"train_tokens_per_second": 114165.831
|
|
},
|
|
{
|
|
"epoch": 0.050326036959874455,
|
|
"grad_norm": 0.2777004539966583,
|
|
"learning_rate": 0.004993681934546471,
|
|
"loss": 3.707286834716797,
|
|
"num_input_tokens_seen": 487587840,
|
|
"step": 930,
|
|
"train_runtime": 4270.2223,
|
|
"train_tokens_per_second": 114183.246
|
|
},
|
|
{
|
|
"epoch": 0.0508671771422387,
|
|
"grad_norm": 0.23501209914684296,
|
|
"learning_rate": 0.004993384099189028,
|
|
"loss": 3.7012203216552733,
|
|
"num_input_tokens_seen": 492830720,
|
|
"step": 940,
|
|
"train_runtime": 4315.4919,
|
|
"train_tokens_per_second": 114200.358
|
|
},
|
|
{
|
|
"epoch": 0.05140831732460294,
|
|
"grad_norm": 0.2504929304122925,
|
|
"learning_rate": 0.004993079414883134,
|
|
"loss": 3.7007171630859377,
|
|
"num_input_tokens_seen": 498073600,
|
|
"step": 950,
|
|
"train_runtime": 4360.758,
|
|
"train_tokens_per_second": 114217.207
|
|
},
|
|
{
|
|
"epoch": 0.05194945750696718,
|
|
"grad_norm": 0.265903502702713,
|
|
"learning_rate": 0.004992767882558976,
|
|
"loss": 3.6977813720703123,
|
|
"num_input_tokens_seen": 503316480,
|
|
"step": 960,
|
|
"train_runtime": 4406.0254,
|
|
"train_tokens_per_second": 114233.676
|
|
},
|
|
{
|
|
"epoch": 0.05249059768933142,
|
|
"grad_norm": 0.22946324944496155,
|
|
"learning_rate": 0.00499244950316765,
|
|
"loss": 3.6873912811279297,
|
|
"num_input_tokens_seen": 508559360,
|
|
"step": 970,
|
|
"train_runtime": 4451.29,
|
|
"train_tokens_per_second": 114249.883
|
|
},
|
|
{
|
|
"epoch": 0.05303173787169566,
|
|
"grad_norm": 0.2554706633090973,
|
|
"learning_rate": 0.004992124277681152,
|
|
"loss": 3.6791450500488283,
|
|
"num_input_tokens_seen": 513802240,
|
|
"step": 980,
|
|
"train_runtime": 4496.5462,
|
|
"train_tokens_per_second": 114265.975
|
|
},
|
|
{
|
|
"epoch": 0.05357287805405991,
|
|
"grad_norm": 0.22852079570293427,
|
|
"learning_rate": 0.004991792207092381,
|
|
"loss": 3.677058792114258,
|
|
"num_input_tokens_seen": 519045120,
|
|
"step": 990,
|
|
"train_runtime": 4541.8028,
|
|
"train_tokens_per_second": 114281.739
|
|
},
|
|
{
|
|
"epoch": 0.054114018236424144,
|
|
"grad_norm": 0.24798494577407837,
|
|
"learning_rate": 0.004991453292415134,
|
|
"loss": 3.657318115234375,
|
|
"num_input_tokens_seen": 524288000,
|
|
"step": 1000,
|
|
"train_runtime": 4587.0445,
|
|
"train_tokens_per_second": 114297.561
|
|
},
|
|
{
|
|
"epoch": 0.054114018236424144,
|
|
"eval_loss": 3.6001899242401123,
|
|
"eval_runtime": 1.9848,
|
|
"eval_samples_per_second": 251.913,
|
|
"eval_steps_per_second": 4.031,
|
|
"num_input_tokens_seen": 524288000,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.05465515841878839,
|
|
"grad_norm": 0.223563551902771,
|
|
"learning_rate": 0.0049911075346841,
|
|
"loss": 3.666912841796875,
|
|
"num_input_tokens_seen": 529530880,
|
|
"step": 1010,
|
|
"train_runtime": 4637.4751,
|
|
"train_tokens_per_second": 114185.17
|
|
},
|
|
{
|
|
"epoch": 0.055196298601152625,
|
|
"grad_norm": 0.24604271352291107,
|
|
"learning_rate": 0.004990754934954863,
|
|
"loss": 3.6610164642333984,
|
|
"num_input_tokens_seen": 534773760,
|
|
"step": 1020,
|
|
"train_runtime": 4682.7302,
|
|
"train_tokens_per_second": 114201.276
|
|
},
|
|
{
|
|
"epoch": 0.05573743878351687,
|
|
"grad_norm": 0.2436058074235916,
|
|
"learning_rate": 0.004990395494303893,
|
|
"loss": 3.6538921356201173,
|
|
"num_input_tokens_seen": 540016640,
|
|
"step": 1030,
|
|
"train_runtime": 4727.9737,
|
|
"train_tokens_per_second": 114217.353
|
|
},
|
|
{
|
|
"epoch": 0.056278578965881114,
|
|
"grad_norm": 0.24788981676101685,
|
|
"learning_rate": 0.004990029213828546,
|
|
"loss": 3.6453926086425783,
|
|
"num_input_tokens_seen": 545259520,
|
|
"step": 1040,
|
|
"train_runtime": 4773.2764,
|
|
"train_tokens_per_second": 114231.708
|
|
},
|
|
{
|
|
"epoch": 0.05681971914824535,
|
|
"grad_norm": 0.2355376034975052,
|
|
"learning_rate": 0.00498965609464706,
|
|
"loss": 3.653607940673828,
|
|
"num_input_tokens_seen": 550502400,
|
|
"step": 1050,
|
|
"train_runtime": 4818.5445,
|
|
"train_tokens_per_second": 114246.615
|
|
},
|
|
{
|
|
"epoch": 0.057360859330609595,
|
|
"grad_norm": 0.24511760473251343,
|
|
"learning_rate": 0.0049892761378985484,
|
|
"loss": 3.655783462524414,
|
|
"num_input_tokens_seen": 555745280,
|
|
"step": 1060,
|
|
"train_runtime": 4863.8191,
|
|
"train_tokens_per_second": 114261.091
|
|
},
|
|
{
|
|
"epoch": 0.05790199951297383,
|
|
"grad_norm": 0.2463475465774536,
|
|
"learning_rate": 0.004988889344743005,
|
|
"loss": 3.6497840881347656,
|
|
"num_input_tokens_seen": 560988160,
|
|
"step": 1070,
|
|
"train_runtime": 4909.1151,
|
|
"train_tokens_per_second": 114274.804
|
|
},
|
|
{
|
|
"epoch": 0.05844313969533808,
|
|
"grad_norm": 0.24649877846240997,
|
|
"learning_rate": 0.00498849571636129,
|
|
"loss": 3.6289398193359377,
|
|
"num_input_tokens_seen": 566231040,
|
|
"step": 1080,
|
|
"train_runtime": 4954.3585,
|
|
"train_tokens_per_second": 114289.476
|
|
},
|
|
{
|
|
"epoch": 0.05898427987770232,
|
|
"grad_norm": 0.21440814435482025,
|
|
"learning_rate": 0.004988095253955132,
|
|
"loss": 3.6420303344726563,
|
|
"num_input_tokens_seen": 571473920,
|
|
"step": 1090,
|
|
"train_runtime": 4999.6131,
|
|
"train_tokens_per_second": 114303.629
|
|
},
|
|
{
|
|
"epoch": 0.05952542006006656,
|
|
"grad_norm": 0.23143576085567474,
|
|
"learning_rate": 0.004987687958747124,
|
|
"loss": 3.636464309692383,
|
|
"num_input_tokens_seen": 576716800,
|
|
"step": 1100,
|
|
"train_runtime": 5044.8489,
|
|
"train_tokens_per_second": 114317.952
|
|
},
|
|
{
|
|
"epoch": 0.0600665602424308,
|
|
"grad_norm": 0.216554194688797,
|
|
"learning_rate": 0.0049872738319807226,
|
|
"loss": 3.6284786224365235,
|
|
"num_input_tokens_seen": 581959680,
|
|
"step": 1110,
|
|
"train_runtime": 5090.1116,
|
|
"train_tokens_per_second": 114331.419
|
|
},
|
|
{
|
|
"epoch": 0.06060770042479504,
|
|
"grad_norm": 0.21454273164272308,
|
|
"learning_rate": 0.004986852874920234,
|
|
"loss": 3.628643035888672,
|
|
"num_input_tokens_seen": 587202560,
|
|
"step": 1120,
|
|
"train_runtime": 5135.379,
|
|
"train_tokens_per_second": 114344.542
|
|
},
|
|
{
|
|
"epoch": 0.061148840607159284,
|
|
"grad_norm": 0.22195634245872498,
|
|
"learning_rate": 0.004986425088850824,
|
|
"loss": 3.6224212646484375,
|
|
"num_input_tokens_seen": 592445440,
|
|
"step": 1130,
|
|
"train_runtime": 5180.6463,
|
|
"train_tokens_per_second": 114357.438
|
|
},
|
|
{
|
|
"epoch": 0.06168998078952353,
|
|
"grad_norm": 0.23462195694446564,
|
|
"learning_rate": 0.004985990475078501,
|
|
"loss": 3.614238739013672,
|
|
"num_input_tokens_seen": 597688320,
|
|
"step": 1140,
|
|
"train_runtime": 5225.8696,
|
|
"train_tokens_per_second": 114371.074
|
|
},
|
|
{
|
|
"epoch": 0.062231120971887766,
|
|
"grad_norm": 0.2454216629266739,
|
|
"learning_rate": 0.004985549034930123,
|
|
"loss": 3.6097618103027345,
|
|
"num_input_tokens_seen": 602931200,
|
|
"step": 1150,
|
|
"train_runtime": 5274.713,
|
|
"train_tokens_per_second": 114305.973
|
|
},
|
|
{
|
|
"epoch": 0.06277226115425201,
|
|
"grad_norm": 0.22363615036010742,
|
|
"learning_rate": 0.004985100769753384,
|
|
"loss": 3.605723571777344,
|
|
"num_input_tokens_seen": 608174080,
|
|
"step": 1160,
|
|
"train_runtime": 5319.9954,
|
|
"train_tokens_per_second": 114318.535
|
|
},
|
|
{
|
|
"epoch": 0.06331340133661625,
|
|
"grad_norm": 0.2078346163034439,
|
|
"learning_rate": 0.00498464568091682,
|
|
"loss": 3.602735900878906,
|
|
"num_input_tokens_seen": 613416960,
|
|
"step": 1170,
|
|
"train_runtime": 5365.2147,
|
|
"train_tokens_per_second": 114332.23
|
|
},
|
|
{
|
|
"epoch": 0.0638545415189805,
|
|
"grad_norm": 0.21972794830799103,
|
|
"learning_rate": 0.004984183769809795,
|
|
"loss": 3.598741912841797,
|
|
"num_input_tokens_seen": 618659840,
|
|
"step": 1180,
|
|
"train_runtime": 5410.4312,
|
|
"train_tokens_per_second": 114345.754
|
|
},
|
|
{
|
|
"epoch": 0.06439568170134473,
|
|
"grad_norm": 0.2427527755498886,
|
|
"learning_rate": 0.0049837150378425005,
|
|
"loss": 3.596208190917969,
|
|
"num_input_tokens_seen": 623902720,
|
|
"step": 1190,
|
|
"train_runtime": 5455.6665,
|
|
"train_tokens_per_second": 114358.662
|
|
},
|
|
{
|
|
"epoch": 0.06493682188370897,
|
|
"grad_norm": 0.2279594987630844,
|
|
"learning_rate": 0.004983239486445956,
|
|
"loss": 3.59366455078125,
|
|
"num_input_tokens_seen": 629145600,
|
|
"step": 1200,
|
|
"train_runtime": 5500.922,
|
|
"train_tokens_per_second": 114370.936
|
|
},
|
|
{
|
|
"epoch": 0.06547796206607322,
|
|
"grad_norm": 0.23130950331687927,
|
|
"learning_rate": 0.004982757117071998,
|
|
"loss": 3.592302703857422,
|
|
"num_input_tokens_seen": 634388480,
|
|
"step": 1210,
|
|
"train_runtime": 5546.1769,
|
|
"train_tokens_per_second": 114383.024
|
|
},
|
|
{
|
|
"epoch": 0.06601910224843746,
|
|
"grad_norm": 0.2286449670791626,
|
|
"learning_rate": 0.004982267931193276,
|
|
"loss": 3.5859790802001954,
|
|
"num_input_tokens_seen": 639631360,
|
|
"step": 1220,
|
|
"train_runtime": 5591.4258,
|
|
"train_tokens_per_second": 114395.038
|
|
},
|
|
{
|
|
"epoch": 0.0665602424308017,
|
|
"grad_norm": 0.23779889941215515,
|
|
"learning_rate": 0.004981771930303254,
|
|
"loss": 3.586525726318359,
|
|
"num_input_tokens_seen": 644874240,
|
|
"step": 1230,
|
|
"train_runtime": 5636.6531,
|
|
"train_tokens_per_second": 114407.297
|
|
},
|
|
{
|
|
"epoch": 0.06710138261316594,
|
|
"grad_norm": 0.23324504494667053,
|
|
"learning_rate": 0.004981269115916199,
|
|
"loss": 3.579142379760742,
|
|
"num_input_tokens_seen": 650117120,
|
|
"step": 1240,
|
|
"train_runtime": 5681.8961,
|
|
"train_tokens_per_second": 114419.044
|
|
},
|
|
{
|
|
"epoch": 0.06764252279553018,
|
|
"grad_norm": 0.2067607045173645,
|
|
"learning_rate": 0.004980759489567181,
|
|
"loss": 3.5813358306884764,
|
|
"num_input_tokens_seen": 655360000,
|
|
"step": 1250,
|
|
"train_runtime": 5727.162,
|
|
"train_tokens_per_second": 114430.148
|
|
},
|
|
{
|
|
"epoch": 0.06818366297789442,
|
|
"grad_norm": 0.20190924406051636,
|
|
"learning_rate": 0.004980243052812064,
|
|
"loss": 3.572435760498047,
|
|
"num_input_tokens_seen": 660602880,
|
|
"step": 1260,
|
|
"train_runtime": 5772.4268,
|
|
"train_tokens_per_second": 114441.102
|
|
},
|
|
{
|
|
"epoch": 0.06872480316025867,
|
|
"grad_norm": 0.19773253798484802,
|
|
"learning_rate": 0.004979719807227508,
|
|
"loss": 3.5610916137695314,
|
|
"num_input_tokens_seen": 665845760,
|
|
"step": 1270,
|
|
"train_runtime": 5817.687,
|
|
"train_tokens_per_second": 114451.974
|
|
},
|
|
{
|
|
"epoch": 0.06926594334262291,
|
|
"grad_norm": 0.21706561744213104,
|
|
"learning_rate": 0.004979189754410956,
|
|
"loss": 3.5655101776123046,
|
|
"num_input_tokens_seen": 671088640,
|
|
"step": 1280,
|
|
"train_runtime": 5862.9264,
|
|
"train_tokens_per_second": 114463.084
|
|
},
|
|
{
|
|
"epoch": 0.06980708352498714,
|
|
"grad_norm": 0.23492570221424103,
|
|
"learning_rate": 0.004978652895980635,
|
|
"loss": 3.571335220336914,
|
|
"num_input_tokens_seen": 676331520,
|
|
"step": 1290,
|
|
"train_runtime": 5908.1779,
|
|
"train_tokens_per_second": 114473.792
|
|
},
|
|
{
|
|
"epoch": 0.07034822370735139,
|
|
"grad_norm": 0.23728297650814056,
|
|
"learning_rate": 0.004978109233575551,
|
|
"loss": 3.5683116912841797,
|
|
"num_input_tokens_seen": 681574400,
|
|
"step": 1300,
|
|
"train_runtime": 5953.4117,
|
|
"train_tokens_per_second": 114484.674
|
|
},
|
|
{
|
|
"epoch": 0.07088936388971563,
|
|
"grad_norm": 0.2128531038761139,
|
|
"learning_rate": 0.0049775587688554775,
|
|
"loss": 3.553203582763672,
|
|
"num_input_tokens_seen": 686817280,
|
|
"step": 1310,
|
|
"train_runtime": 5998.6316,
|
|
"train_tokens_per_second": 114495.659
|
|
},
|
|
{
|
|
"epoch": 0.07143050407207988,
|
|
"grad_norm": 0.22945411503314972,
|
|
"learning_rate": 0.004977001503500959,
|
|
"loss": 3.5565677642822267,
|
|
"num_input_tokens_seen": 692060160,
|
|
"step": 1320,
|
|
"train_runtime": 6043.8634,
|
|
"train_tokens_per_second": 114506.254
|
|
},
|
|
{
|
|
"epoch": 0.07197164425444412,
|
|
"grad_norm": 0.21733802556991577,
|
|
"learning_rate": 0.004976437439213302,
|
|
"loss": 3.5509429931640626,
|
|
"num_input_tokens_seen": 697303040,
|
|
"step": 1330,
|
|
"train_runtime": 6089.0954,
|
|
"train_tokens_per_second": 114516.688
|
|
},
|
|
{
|
|
"epoch": 0.07251278443680835,
|
|
"grad_norm": 0.24347279965877533,
|
|
"learning_rate": 0.004975866577714568,
|
|
"loss": 3.54642333984375,
|
|
"num_input_tokens_seen": 702545920,
|
|
"step": 1340,
|
|
"train_runtime": 6134.3055,
|
|
"train_tokens_per_second": 114527.376
|
|
},
|
|
{
|
|
"epoch": 0.0730539246191726,
|
|
"grad_norm": 0.21520718932151794,
|
|
"learning_rate": 0.004975288920747571,
|
|
"loss": 3.550141143798828,
|
|
"num_input_tokens_seen": 707788800,
|
|
"step": 1350,
|
|
"train_runtime": 6179.5539,
|
|
"train_tokens_per_second": 114537.201
|
|
},
|
|
{
|
|
"epoch": 0.07359506480153684,
|
|
"grad_norm": 0.23248061537742615,
|
|
"learning_rate": 0.0049747044700758705,
|
|
"loss": 3.5488357543945312,
|
|
"num_input_tokens_seen": 713031680,
|
|
"step": 1360,
|
|
"train_runtime": 6224.7742,
|
|
"train_tokens_per_second": 114547.397
|
|
},
|
|
{
|
|
"epoch": 0.07413620498390108,
|
|
"grad_norm": 0.23462453484535217,
|
|
"learning_rate": 0.004974113227483768,
|
|
"loss": 3.5416290283203127,
|
|
"num_input_tokens_seen": 718274560,
|
|
"step": 1370,
|
|
"train_runtime": 6269.9891,
|
|
"train_tokens_per_second": 114557.544
|
|
},
|
|
{
|
|
"epoch": 0.07467734516626533,
|
|
"grad_norm": 0.2253153920173645,
|
|
"learning_rate": 0.004973515194776301,
|
|
"loss": 3.540643310546875,
|
|
"num_input_tokens_seen": 723517440,
|
|
"step": 1380,
|
|
"train_runtime": 6315.2141,
|
|
"train_tokens_per_second": 114567.365
|
|
},
|
|
{
|
|
"epoch": 0.07521848534862956,
|
|
"grad_norm": 0.21088625490665436,
|
|
"learning_rate": 0.0049729103737792355,
|
|
"loss": 3.543656921386719,
|
|
"num_input_tokens_seen": 728760320,
|
|
"step": 1390,
|
|
"train_runtime": 6360.4473,
|
|
"train_tokens_per_second": 114576.898
|
|
},
|
|
{
|
|
"epoch": 0.0757596255309938,
|
|
"grad_norm": 0.2161734253168106,
|
|
"learning_rate": 0.00497229876633906,
|
|
"loss": 3.5383941650390627,
|
|
"num_input_tokens_seen": 734003200,
|
|
"step": 1400,
|
|
"train_runtime": 6405.6811,
|
|
"train_tokens_per_second": 114586.285
|
|
},
|
|
{
|
|
"epoch": 0.07630076571335805,
|
|
"grad_norm": 0.20709098875522614,
|
|
"learning_rate": 0.004971680374322986,
|
|
"loss": 3.5313690185546873,
|
|
"num_input_tokens_seen": 739246080,
|
|
"step": 1410,
|
|
"train_runtime": 6450.8882,
|
|
"train_tokens_per_second": 114596.015
|
|
},
|
|
{
|
|
"epoch": 0.07684190589572229,
|
|
"grad_norm": 0.20399479568004608,
|
|
"learning_rate": 0.004971055199618935,
|
|
"loss": 3.525136184692383,
|
|
"num_input_tokens_seen": 744488960,
|
|
"step": 1420,
|
|
"train_runtime": 6496.1128,
|
|
"train_tokens_per_second": 114605.3
|
|
},
|
|
{
|
|
"epoch": 0.07738304607808653,
|
|
"grad_norm": 0.21809037029743195,
|
|
"learning_rate": 0.004970423244135538,
|
|
"loss": 3.53038330078125,
|
|
"num_input_tokens_seen": 749731840,
|
|
"step": 1430,
|
|
"train_runtime": 6541.3447,
|
|
"train_tokens_per_second": 114614.33
|
|
},
|
|
{
|
|
"epoch": 0.07792418626045076,
|
|
"grad_norm": 0.22500748932361603,
|
|
"learning_rate": 0.004969784509802125,
|
|
"loss": 3.5225593566894533,
|
|
"num_input_tokens_seen": 754974720,
|
|
"step": 1440,
|
|
"train_runtime": 6586.5647,
|
|
"train_tokens_per_second": 114623.442
|
|
},
|
|
{
|
|
"epoch": 0.07846532644281501,
|
|
"grad_norm": 0.22614286839962006,
|
|
"learning_rate": 0.0049691389985687204,
|
|
"loss": 3.5291175842285156,
|
|
"num_input_tokens_seen": 760217600,
|
|
"step": 1450,
|
|
"train_runtime": 6631.7972,
|
|
"train_tokens_per_second": 114632.215
|
|
},
|
|
{
|
|
"epoch": 0.07900646662517925,
|
|
"grad_norm": 0.2029477208852768,
|
|
"learning_rate": 0.004968486712406044,
|
|
"loss": 3.5189224243164063,
|
|
"num_input_tokens_seen": 765460480,
|
|
"step": 1460,
|
|
"train_runtime": 6677.0236,
|
|
"train_tokens_per_second": 114640.973
|
|
},
|
|
{
|
|
"epoch": 0.0795476068075435,
|
|
"grad_norm": 0.2242126613855362,
|
|
"learning_rate": 0.004967827653305494,
|
|
"loss": 3.504582977294922,
|
|
"num_input_tokens_seen": 770703360,
|
|
"step": 1470,
|
|
"train_runtime": 6722.2724,
|
|
"train_tokens_per_second": 114649.232
|
|
},
|
|
{
|
|
"epoch": 0.08008874698990774,
|
|
"grad_norm": 0.2245851755142212,
|
|
"learning_rate": 0.004967161823279147,
|
|
"loss": 3.5151710510253906,
|
|
"num_input_tokens_seen": 775946240,
|
|
"step": 1480,
|
|
"train_runtime": 6767.5092,
|
|
"train_tokens_per_second": 114657.582
|
|
},
|
|
{
|
|
"epoch": 0.08062988717227197,
|
|
"grad_norm": 0.21762171387672424,
|
|
"learning_rate": 0.004966489224359752,
|
|
"loss": 3.510267639160156,
|
|
"num_input_tokens_seen": 781189120,
|
|
"step": 1490,
|
|
"train_runtime": 6812.7609,
|
|
"train_tokens_per_second": 114665.571
|
|
},
|
|
{
|
|
"epoch": 0.08117102735463622,
|
|
"grad_norm": 0.22595250606536865,
|
|
"learning_rate": 0.0049658098586007225,
|
|
"loss": 3.515250396728516,
|
|
"num_input_tokens_seen": 786432000,
|
|
"step": 1500,
|
|
"train_runtime": 6857.9968,
|
|
"train_tokens_per_second": 114673.719
|
|
},
|
|
{
|
|
"epoch": 0.08117102735463622,
|
|
"eval_loss": 3.439197540283203,
|
|
"eval_runtime": 1.986,
|
|
"eval_samples_per_second": 251.759,
|
|
"eval_steps_per_second": 4.028,
|
|
"num_input_tokens_seen": 786432000,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.08171216753700046,
|
|
"grad_norm": 0.2041054517030716,
|
|
"learning_rate": 0.00496512372807613,
|
|
"loss": 3.5051536560058594,
|
|
"num_input_tokens_seen": 791674880,
|
|
"step": 1510,
|
|
"train_runtime": 6905.2279,
|
|
"train_tokens_per_second": 114648.624
|
|
},
|
|
{
|
|
"epoch": 0.0822533077193647,
|
|
"grad_norm": 0.20704089105129242,
|
|
"learning_rate": 0.004964430834880702,
|
|
"loss": 3.498210906982422,
|
|
"num_input_tokens_seen": 796917760,
|
|
"step": 1520,
|
|
"train_runtime": 6950.4841,
|
|
"train_tokens_per_second": 114656.439
|
|
},
|
|
{
|
|
"epoch": 0.08279444790172895,
|
|
"grad_norm": 0.2235393226146698,
|
|
"learning_rate": 0.0049637311811298055,
|
|
"loss": 3.510853958129883,
|
|
"num_input_tokens_seen": 802160640,
|
|
"step": 1530,
|
|
"train_runtime": 6999.4133,
|
|
"train_tokens_per_second": 114603.982
|
|
},
|
|
{
|
|
"epoch": 0.08333558808409318,
|
|
"grad_norm": 0.19351601600646973,
|
|
"learning_rate": 0.004963024768959454,
|
|
"loss": 3.4970939636230467,
|
|
"num_input_tokens_seen": 807403520,
|
|
"step": 1540,
|
|
"train_runtime": 7044.6706,
|
|
"train_tokens_per_second": 114611.962
|
|
},
|
|
{
|
|
"epoch": 0.08387672826645742,
|
|
"grad_norm": 0.2104959934949875,
|
|
"learning_rate": 0.0049623116005262915,
|
|
"loss": 3.5016387939453124,
|
|
"num_input_tokens_seen": 812646400,
|
|
"step": 1550,
|
|
"train_runtime": 7089.915,
|
|
"train_tokens_per_second": 114620.048
|
|
},
|
|
{
|
|
"epoch": 0.08441786844882167,
|
|
"grad_norm": 0.25421494245529175,
|
|
"learning_rate": 0.004961591678007588,
|
|
"loss": 3.50089111328125,
|
|
"num_input_tokens_seen": 817889280,
|
|
"step": 1560,
|
|
"train_runtime": 7135.1609,
|
|
"train_tokens_per_second": 114628.008
|
|
},
|
|
{
|
|
"epoch": 0.08495900863118591,
|
|
"grad_norm": 0.2085292786359787,
|
|
"learning_rate": 0.004960865003601232,
|
|
"loss": 3.5003082275390627,
|
|
"num_input_tokens_seen": 823132160,
|
|
"step": 1570,
|
|
"train_runtime": 7180.3935,
|
|
"train_tokens_per_second": 114636.079
|
|
},
|
|
{
|
|
"epoch": 0.08550014881355016,
|
|
"grad_norm": 0.2042287439107895,
|
|
"learning_rate": 0.00496013157952573,
|
|
"loss": 3.495660400390625,
|
|
"num_input_tokens_seen": 828375040,
|
|
"step": 1580,
|
|
"train_runtime": 7225.6594,
|
|
"train_tokens_per_second": 114643.521
|
|
},
|
|
{
|
|
"epoch": 0.08604128899591439,
|
|
"grad_norm": 0.21099670231342316,
|
|
"learning_rate": 0.004959391408020191,
|
|
"loss": 3.4938674926757813,
|
|
"num_input_tokens_seen": 833617920,
|
|
"step": 1590,
|
|
"train_runtime": 7270.8988,
|
|
"train_tokens_per_second": 114651.29
|
|
},
|
|
{
|
|
"epoch": 0.08658242917827863,
|
|
"grad_norm": 0.19382232427597046,
|
|
"learning_rate": 0.004958644491344324,
|
|
"loss": 3.4875198364257813,
|
|
"num_input_tokens_seen": 838860800,
|
|
"step": 1600,
|
|
"train_runtime": 7316.1586,
|
|
"train_tokens_per_second": 114658.64
|
|
},
|
|
{
|
|
"epoch": 0.08712356936064287,
|
|
"grad_norm": 0.20325426757335663,
|
|
"learning_rate": 0.0049578908317784295,
|
|
"loss": 3.487265777587891,
|
|
"num_input_tokens_seen": 844103680,
|
|
"step": 1610,
|
|
"train_runtime": 7361.3918,
|
|
"train_tokens_per_second": 114666.316
|
|
},
|
|
{
|
|
"epoch": 0.08766470954300712,
|
|
"grad_norm": 0.21367783844470978,
|
|
"learning_rate": 0.004957130431623399,
|
|
"loss": 3.4908119201660157,
|
|
"num_input_tokens_seen": 849346560,
|
|
"step": 1620,
|
|
"train_runtime": 7406.6443,
|
|
"train_tokens_per_second": 114673.6
|
|
},
|
|
{
|
|
"epoch": 0.08820584972537136,
|
|
"grad_norm": 0.19166916608810425,
|
|
"learning_rate": 0.004956363293200697,
|
|
"loss": 3.478108215332031,
|
|
"num_input_tokens_seen": 854589440,
|
|
"step": 1630,
|
|
"train_runtime": 7451.8697,
|
|
"train_tokens_per_second": 114681.211
|
|
},
|
|
{
|
|
"epoch": 0.0887469899077356,
|
|
"grad_norm": 0.22475136816501617,
|
|
"learning_rate": 0.004955589418852363,
|
|
"loss": 3.4743488311767576,
|
|
"num_input_tokens_seen": 859832320,
|
|
"step": 1640,
|
|
"train_runtime": 7497.1227,
|
|
"train_tokens_per_second": 114688.308
|
|
},
|
|
{
|
|
"epoch": 0.08928813009009984,
|
|
"grad_norm": 0.22001579403877258,
|
|
"learning_rate": 0.004954808810940998,
|
|
"loss": 3.481397247314453,
|
|
"num_input_tokens_seen": 865075200,
|
|
"step": 1650,
|
|
"train_runtime": 7542.3425,
|
|
"train_tokens_per_second": 114695.826
|
|
},
|
|
{
|
|
"epoch": 0.08982927027246408,
|
|
"grad_norm": 0.20330502092838287,
|
|
"learning_rate": 0.0049540214718497635,
|
|
"loss": 3.47830810546875,
|
|
"num_input_tokens_seen": 870318080,
|
|
"step": 1660,
|
|
"train_runtime": 7587.5621,
|
|
"train_tokens_per_second": 114703.256
|
|
},
|
|
{
|
|
"epoch": 0.09037041045482833,
|
|
"grad_norm": 0.21012984216213226,
|
|
"learning_rate": 0.00495322740398237,
|
|
"loss": 3.470731735229492,
|
|
"num_input_tokens_seen": 875560960,
|
|
"step": 1670,
|
|
"train_runtime": 7632.7891,
|
|
"train_tokens_per_second": 114710.488
|
|
},
|
|
{
|
|
"epoch": 0.09091155063719257,
|
|
"grad_norm": 0.20543314516544342,
|
|
"learning_rate": 0.004952426609763068,
|
|
"loss": 3.4727859497070312,
|
|
"num_input_tokens_seen": 880803840,
|
|
"step": 1680,
|
|
"train_runtime": 7678.0113,
|
|
"train_tokens_per_second": 114717.706
|
|
},
|
|
{
|
|
"epoch": 0.0914526908195568,
|
|
"grad_norm": 0.2099304497241974,
|
|
"learning_rate": 0.004951619091636649,
|
|
"loss": 3.462004852294922,
|
|
"num_input_tokens_seen": 886046720,
|
|
"step": 1690,
|
|
"train_runtime": 7723.225,
|
|
"train_tokens_per_second": 114724.965
|
|
},
|
|
{
|
|
"epoch": 0.09199383100192104,
|
|
"grad_norm": 0.19785360991954803,
|
|
"learning_rate": 0.004950804852068425,
|
|
"loss": 3.468863677978516,
|
|
"num_input_tokens_seen": 891289600,
|
|
"step": 1700,
|
|
"train_runtime": 7768.4346,
|
|
"train_tokens_per_second": 114732.202
|
|
},
|
|
{
|
|
"epoch": 0.09253497118428529,
|
|
"grad_norm": 0.20223312079906464,
|
|
"learning_rate": 0.004949983893544234,
|
|
"loss": 3.47713623046875,
|
|
"num_input_tokens_seen": 896532480,
|
|
"step": 1710,
|
|
"train_runtime": 7813.6346,
|
|
"train_tokens_per_second": 114739.494
|
|
},
|
|
{
|
|
"epoch": 0.09307611136664953,
|
|
"grad_norm": 0.21073880791664124,
|
|
"learning_rate": 0.004949156218570423,
|
|
"loss": 3.4744213104248045,
|
|
"num_input_tokens_seen": 901775360,
|
|
"step": 1720,
|
|
"train_runtime": 7858.8265,
|
|
"train_tokens_per_second": 114746.822
|
|
},
|
|
{
|
|
"epoch": 0.09361725154901378,
|
|
"grad_norm": 0.21444642543792725,
|
|
"learning_rate": 0.004948321829673847,
|
|
"loss": 3.4606704711914062,
|
|
"num_input_tokens_seen": 907018240,
|
|
"step": 1730,
|
|
"train_runtime": 7904.0563,
|
|
"train_tokens_per_second": 114753.514
|
|
},
|
|
{
|
|
"epoch": 0.09415839173137801,
|
|
"grad_norm": 0.21360410749912262,
|
|
"learning_rate": 0.004947480729401857,
|
|
"loss": 3.468334197998047,
|
|
"num_input_tokens_seen": 912261120,
|
|
"step": 1740,
|
|
"train_runtime": 7949.2813,
|
|
"train_tokens_per_second": 114760.201
|
|
},
|
|
{
|
|
"epoch": 0.09469953191374225,
|
|
"grad_norm": 0.20467202365398407,
|
|
"learning_rate": 0.0049466329203222935,
|
|
"loss": 3.451313018798828,
|
|
"num_input_tokens_seen": 917504000,
|
|
"step": 1750,
|
|
"train_runtime": 7994.4951,
|
|
"train_tokens_per_second": 114766.973
|
|
},
|
|
{
|
|
"epoch": 0.0952406720961065,
|
|
"grad_norm": 0.2081199437379837,
|
|
"learning_rate": 0.004945778405023478,
|
|
"loss": 3.4613468170166017,
|
|
"num_input_tokens_seen": 922746880,
|
|
"step": 1760,
|
|
"train_runtime": 8039.7393,
|
|
"train_tokens_per_second": 114773.235
|
|
},
|
|
{
|
|
"epoch": 0.09578181227847074,
|
|
"grad_norm": 0.2067294865846634,
|
|
"learning_rate": 0.004944917186114206,
|
|
"loss": 3.4611587524414062,
|
|
"num_input_tokens_seen": 927989760,
|
|
"step": 1770,
|
|
"train_runtime": 8084.9796,
|
|
"train_tokens_per_second": 114779.481
|
|
},
|
|
{
|
|
"epoch": 0.09632295246083498,
|
|
"grad_norm": 0.19931091368198395,
|
|
"learning_rate": 0.00494404926622374,
|
|
"loss": 3.462744140625,
|
|
"num_input_tokens_seen": 933232640,
|
|
"step": 1780,
|
|
"train_runtime": 8130.2373,
|
|
"train_tokens_per_second": 114785.413
|
|
},
|
|
{
|
|
"epoch": 0.09686409264319921,
|
|
"grad_norm": 0.213568776845932,
|
|
"learning_rate": 0.004943174648001798,
|
|
"loss": 3.456349182128906,
|
|
"num_input_tokens_seen": 938475520,
|
|
"step": 1790,
|
|
"train_runtime": 8175.469,
|
|
"train_tokens_per_second": 114791.644
|
|
},
|
|
{
|
|
"epoch": 0.09740523282556346,
|
|
"grad_norm": 0.20064200460910797,
|
|
"learning_rate": 0.004942293334118552,
|
|
"loss": 3.4558643341064452,
|
|
"num_input_tokens_seen": 943718400,
|
|
"step": 1800,
|
|
"train_runtime": 8220.7059,
|
|
"train_tokens_per_second": 114797.732
|
|
},
|
|
{
|
|
"epoch": 0.0979463730079277,
|
|
"grad_norm": 0.21110571920871735,
|
|
"learning_rate": 0.004941405327264611,
|
|
"loss": 3.4533897399902345,
|
|
"num_input_tokens_seen": 948961280,
|
|
"step": 1810,
|
|
"train_runtime": 8265.9542,
|
|
"train_tokens_per_second": 114803.598
|
|
},
|
|
{
|
|
"epoch": 0.09848751319029195,
|
|
"grad_norm": 0.20624655485153198,
|
|
"learning_rate": 0.0049405106301510186,
|
|
"loss": 3.4500003814697267,
|
|
"num_input_tokens_seen": 954204160,
|
|
"step": 1820,
|
|
"train_runtime": 8311.1987,
|
|
"train_tokens_per_second": 114809.45
|
|
},
|
|
{
|
|
"epoch": 0.09902865337265619,
|
|
"grad_norm": 0.20945154130458832,
|
|
"learning_rate": 0.004939609245509244,
|
|
"loss": 3.440562057495117,
|
|
"num_input_tokens_seen": 959447040,
|
|
"step": 1830,
|
|
"train_runtime": 8356.4084,
|
|
"train_tokens_per_second": 114815.719
|
|
},
|
|
{
|
|
"epoch": 0.09956979355502042,
|
|
"grad_norm": 0.20618219673633575,
|
|
"learning_rate": 0.004938701176091175,
|
|
"loss": 3.4402488708496093,
|
|
"num_input_tokens_seen": 964689920,
|
|
"step": 1840,
|
|
"train_runtime": 8401.6104,
|
|
"train_tokens_per_second": 114822.025
|
|
},
|
|
{
|
|
"epoch": 0.10011093373738467,
|
|
"grad_norm": 0.21744751930236816,
|
|
"learning_rate": 0.004937786424669103,
|
|
"loss": 3.447218322753906,
|
|
"num_input_tokens_seen": 969932800,
|
|
"step": 1850,
|
|
"train_runtime": 8446.8197,
|
|
"train_tokens_per_second": 114828.165
|
|
},
|
|
{
|
|
"epoch": 0.10065207391974891,
|
|
"grad_norm": 0.207778662443161,
|
|
"learning_rate": 0.004936864994035724,
|
|
"loss": 3.4344856262207033,
|
|
"num_input_tokens_seen": 975175680,
|
|
"step": 1860,
|
|
"train_runtime": 8492.045,
|
|
"train_tokens_per_second": 114834.022
|
|
},
|
|
{
|
|
"epoch": 0.10119321410211315,
|
|
"grad_norm": 0.20873455703258514,
|
|
"learning_rate": 0.004935936887004123,
|
|
"loss": 3.4340728759765624,
|
|
"num_input_tokens_seen": 980418560,
|
|
"step": 1870,
|
|
"train_runtime": 8537.2574,
|
|
"train_tokens_per_second": 114839.99
|
|
},
|
|
{
|
|
"epoch": 0.1017343542844774,
|
|
"grad_norm": 0.21933899819850922,
|
|
"learning_rate": 0.004935002106407768,
|
|
"loss": 3.431113433837891,
|
|
"num_input_tokens_seen": 985661440,
|
|
"step": 1880,
|
|
"train_runtime": 8582.5067,
|
|
"train_tokens_per_second": 114845.403
|
|
},
|
|
{
|
|
"epoch": 0.10227549446684163,
|
|
"grad_norm": 0.19961251318454742,
|
|
"learning_rate": 0.0049340606551005,
|
|
"loss": 3.4356346130371094,
|
|
"num_input_tokens_seen": 990904320,
|
|
"step": 1890,
|
|
"train_runtime": 8627.7245,
|
|
"train_tokens_per_second": 114851.177
|
|
},
|
|
{
|
|
"epoch": 0.10281663464920587,
|
|
"grad_norm": 0.1902250349521637,
|
|
"learning_rate": 0.004933112535956529,
|
|
"loss": 3.432623291015625,
|
|
"num_input_tokens_seen": 996147200,
|
|
"step": 1900,
|
|
"train_runtime": 8672.9813,
|
|
"train_tokens_per_second": 114856.376
|
|
},
|
|
{
|
|
"epoch": 0.10335777483157012,
|
|
"grad_norm": 0.1946999877691269,
|
|
"learning_rate": 0.004932157751870416,
|
|
"loss": 3.435283660888672,
|
|
"num_input_tokens_seen": 1001390080,
|
|
"step": 1910,
|
|
"train_runtime": 8722.0751,
|
|
"train_tokens_per_second": 114810.99
|
|
},
|
|
{
|
|
"epoch": 0.10389891501393436,
|
|
"grad_norm": 0.21359668672084808,
|
|
"learning_rate": 0.004931196305757076,
|
|
"loss": 3.4397598266601563,
|
|
"num_input_tokens_seen": 1006632960,
|
|
"step": 1920,
|
|
"train_runtime": 8767.3142,
|
|
"train_tokens_per_second": 114816.572
|
|
},
|
|
{
|
|
"epoch": 0.1044400551962986,
|
|
"grad_norm": 0.188863143324852,
|
|
"learning_rate": 0.004930228200551757,
|
|
"loss": 3.428334045410156,
|
|
"num_input_tokens_seen": 1011875840,
|
|
"step": 1930,
|
|
"train_runtime": 8812.53,
|
|
"train_tokens_per_second": 114822.399
|
|
},
|
|
{
|
|
"epoch": 0.10498119537866284,
|
|
"grad_norm": 0.2043711096048355,
|
|
"learning_rate": 0.0049292534392100405,
|
|
"loss": 3.428396987915039,
|
|
"num_input_tokens_seen": 1017118720,
|
|
"step": 1940,
|
|
"train_runtime": 8857.7449,
|
|
"train_tokens_per_second": 114828.179
|
|
},
|
|
{
|
|
"epoch": 0.10552233556102708,
|
|
"grad_norm": 0.18800680339336395,
|
|
"learning_rate": 0.00492827202470783,
|
|
"loss": 3.423938751220703,
|
|
"num_input_tokens_seen": 1022361600,
|
|
"step": 1950,
|
|
"train_runtime": 8902.9768,
|
|
"train_tokens_per_second": 114833.682
|
|
},
|
|
{
|
|
"epoch": 0.10606347574339133,
|
|
"grad_norm": 0.20674094557762146,
|
|
"learning_rate": 0.004927283960041336,
|
|
"loss": 3.4255210876464846,
|
|
"num_input_tokens_seen": 1027604480,
|
|
"step": 1960,
|
|
"train_runtime": 8948.2078,
|
|
"train_tokens_per_second": 114839.14
|
|
},
|
|
{
|
|
"epoch": 0.10660461592575557,
|
|
"grad_norm": 0.19658301770687103,
|
|
"learning_rate": 0.004926289248227076,
|
|
"loss": 3.422502899169922,
|
|
"num_input_tokens_seen": 1032847360,
|
|
"step": 1970,
|
|
"train_runtime": 8993.4331,
|
|
"train_tokens_per_second": 114844.614
|
|
},
|
|
{
|
|
"epoch": 0.10714575610811981,
|
|
"grad_norm": 0.20629730820655823,
|
|
"learning_rate": 0.00492528789230186,
|
|
"loss": 3.419141387939453,
|
|
"num_input_tokens_seen": 1038090240,
|
|
"step": 1980,
|
|
"train_runtime": 9038.6403,
|
|
"train_tokens_per_second": 114850.266
|
|
},
|
|
{
|
|
"epoch": 0.10768689629048404,
|
|
"grad_norm": 0.20946621894836426,
|
|
"learning_rate": 0.00492427989532278,
|
|
"loss": 3.4206031799316405,
|
|
"num_input_tokens_seen": 1043333120,
|
|
"step": 1990,
|
|
"train_runtime": 9083.8627,
|
|
"train_tokens_per_second": 114855.668
|
|
},
|
|
{
|
|
"epoch": 0.10822803647284829,
|
|
"grad_norm": 0.2047351747751236,
|
|
"learning_rate": 0.004923265260367205,
|
|
"loss": 3.421718978881836,
|
|
"num_input_tokens_seen": 1048576000,
|
|
"step": 2000,
|
|
"train_runtime": 9129.0866,
|
|
"train_tokens_per_second": 114860.998
|
|
},
|
|
{
|
|
"epoch": 0.10822803647284829,
|
|
"eval_loss": 3.355583429336548,
|
|
"eval_runtime": 1.9828,
|
|
"eval_samples_per_second": 252.171,
|
|
"eval_steps_per_second": 4.035,
|
|
"num_input_tokens_seen": 1048576000,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.10876917665521253,
|
|
"grad_norm": 0.18940046429634094,
|
|
"learning_rate": 0.004922243990532769,
|
|
"loss": 3.4131790161132813,
|
|
"num_input_tokens_seen": 1053818880,
|
|
"step": 2010,
|
|
"train_runtime": 9178.8208,
|
|
"train_tokens_per_second": 114809.833
|
|
},
|
|
{
|
|
"epoch": 0.10931031683757678,
|
|
"grad_norm": 0.2015410214662552,
|
|
"learning_rate": 0.004921216088937362,
|
|
"loss": 3.433843994140625,
|
|
"num_input_tokens_seen": 1059061760,
|
|
"step": 2020,
|
|
"train_runtime": 9224.0155,
|
|
"train_tokens_per_second": 114815.696
|
|
},
|
|
{
|
|
"epoch": 0.10985145701994102,
|
|
"grad_norm": 0.20246025919914246,
|
|
"learning_rate": 0.0049201815587191205,
|
|
"loss": 3.4257015228271483,
|
|
"num_input_tokens_seen": 1064304640,
|
|
"step": 2030,
|
|
"train_runtime": 9269.2191,
|
|
"train_tokens_per_second": 114821.392
|
|
},
|
|
{
|
|
"epoch": 0.11039259720230525,
|
|
"grad_norm": 0.1984010487794876,
|
|
"learning_rate": 0.0049191404030364165,
|
|
"loss": 3.407004547119141,
|
|
"num_input_tokens_seen": 1069547520,
|
|
"step": 2040,
|
|
"train_runtime": 9314.4158,
|
|
"train_tokens_per_second": 114827.118
|
|
},
|
|
{
|
|
"epoch": 0.1109337373846695,
|
|
"grad_norm": 0.2240104079246521,
|
|
"learning_rate": 0.0049180926250678506,
|
|
"loss": 3.413028335571289,
|
|
"num_input_tokens_seen": 1074790400,
|
|
"step": 2050,
|
|
"train_runtime": 9359.6399,
|
|
"train_tokens_per_second": 114832.452
|
|
},
|
|
{
|
|
"epoch": 0.11147487756703374,
|
|
"grad_norm": 0.20743729174137115,
|
|
"learning_rate": 0.004917038228012243,
|
|
"loss": 3.413587188720703,
|
|
"num_input_tokens_seen": 1080033280,
|
|
"step": 2060,
|
|
"train_runtime": 9404.8754,
|
|
"train_tokens_per_second": 114837.596
|
|
},
|
|
{
|
|
"epoch": 0.11201601774939798,
|
|
"grad_norm": 0.18610326945781708,
|
|
"learning_rate": 0.004915977215088616,
|
|
"loss": 3.4143035888671873,
|
|
"num_input_tokens_seen": 1085276160,
|
|
"step": 2070,
|
|
"train_runtime": 9450.1196,
|
|
"train_tokens_per_second": 114842.584
|
|
},
|
|
{
|
|
"epoch": 0.11255715793176223,
|
|
"grad_norm": 0.18289707601070404,
|
|
"learning_rate": 0.004914909589536196,
|
|
"loss": 3.4013748168945312,
|
|
"num_input_tokens_seen": 1090519040,
|
|
"step": 2080,
|
|
"train_runtime": 9495.3594,
|
|
"train_tokens_per_second": 114847.579
|
|
},
|
|
{
|
|
"epoch": 0.11309829811412646,
|
|
"grad_norm": 0.20693431794643402,
|
|
"learning_rate": 0.0049138353546143935,
|
|
"loss": 3.420492172241211,
|
|
"num_input_tokens_seen": 1095761920,
|
|
"step": 2090,
|
|
"train_runtime": 9540.5793,
|
|
"train_tokens_per_second": 114852.766
|
|
},
|
|
{
|
|
"epoch": 0.1136394382964907,
|
|
"grad_norm": 0.1881825178861618,
|
|
"learning_rate": 0.0049127545136027975,
|
|
"loss": 3.4042373657226563,
|
|
"num_input_tokens_seen": 1101004800,
|
|
"step": 2100,
|
|
"train_runtime": 9585.774,
|
|
"train_tokens_per_second": 114858.206
|
|
},
|
|
{
|
|
"epoch": 0.11418057847885495,
|
|
"grad_norm": 0.20557381212711334,
|
|
"learning_rate": 0.004911667069801167,
|
|
"loss": 3.395760345458984,
|
|
"num_input_tokens_seen": 1106247680,
|
|
"step": 2110,
|
|
"train_runtime": 9630.9862,
|
|
"train_tokens_per_second": 114863.385
|
|
},
|
|
{
|
|
"epoch": 0.11472171866121919,
|
|
"grad_norm": 0.20688888430595398,
|
|
"learning_rate": 0.004910573026529419,
|
|
"loss": 3.3946189880371094,
|
|
"num_input_tokens_seen": 1111490560,
|
|
"step": 2120,
|
|
"train_runtime": 9676.207,
|
|
"train_tokens_per_second": 114868.415
|
|
},
|
|
{
|
|
"epoch": 0.11526285884358344,
|
|
"grad_norm": 0.1814773976802826,
|
|
"learning_rate": 0.004909472387127615,
|
|
"loss": 3.405241775512695,
|
|
"num_input_tokens_seen": 1116733440,
|
|
"step": 2130,
|
|
"train_runtime": 9721.4134,
|
|
"train_tokens_per_second": 114873.568
|
|
},
|
|
{
|
|
"epoch": 0.11580399902594767,
|
|
"grad_norm": 0.1865544617176056,
|
|
"learning_rate": 0.004908365154955957,
|
|
"loss": 3.4098495483398437,
|
|
"num_input_tokens_seen": 1121976320,
|
|
"step": 2140,
|
|
"train_runtime": 9766.6099,
|
|
"train_tokens_per_second": 114878.789
|
|
},
|
|
{
|
|
"epoch": 0.11634513920831191,
|
|
"grad_norm": 0.2032928168773651,
|
|
"learning_rate": 0.004907251333394776,
|
|
"loss": 3.4024234771728517,
|
|
"num_input_tokens_seen": 1127219200,
|
|
"step": 2150,
|
|
"train_runtime": 9811.8225,
|
|
"train_tokens_per_second": 114883.774
|
|
},
|
|
{
|
|
"epoch": 0.11688627939067615,
|
|
"grad_norm": 0.1904987096786499,
|
|
"learning_rate": 0.004906130925844515,
|
|
"loss": 3.3986663818359375,
|
|
"num_input_tokens_seen": 1132462080,
|
|
"step": 2160,
|
|
"train_runtime": 9857.0319,
|
|
"train_tokens_per_second": 114888.751
|
|
},
|
|
{
|
|
"epoch": 0.1174274195730404,
|
|
"grad_norm": 0.1815112829208374,
|
|
"learning_rate": 0.004905003935725728,
|
|
"loss": 3.3947410583496094,
|
|
"num_input_tokens_seen": 1137704960,
|
|
"step": 2170,
|
|
"train_runtime": 9902.2357,
|
|
"train_tokens_per_second": 114893.746
|
|
},
|
|
{
|
|
"epoch": 0.11796855975540464,
|
|
"grad_norm": 0.20339980721473694,
|
|
"learning_rate": 0.004903870366479064,
|
|
"loss": 3.3956260681152344,
|
|
"num_input_tokens_seen": 1142947840,
|
|
"step": 2180,
|
|
"train_runtime": 9947.4535,
|
|
"train_tokens_per_second": 114898.536
|
|
},
|
|
{
|
|
"epoch": 0.11850969993776887,
|
|
"grad_norm": 0.18381614983081818,
|
|
"learning_rate": 0.004902730221565258,
|
|
"loss": 3.3980743408203127,
|
|
"num_input_tokens_seen": 1148190720,
|
|
"step": 2190,
|
|
"train_runtime": 9992.6437,
|
|
"train_tokens_per_second": 114903.599
|
|
},
|
|
{
|
|
"epoch": 0.11905084012013312,
|
|
"grad_norm": 0.19806860387325287,
|
|
"learning_rate": 0.004901583504465119,
|
|
"loss": 3.393767547607422,
|
|
"num_input_tokens_seen": 1153433600,
|
|
"step": 2200,
|
|
"train_runtime": 10037.8153,
|
|
"train_tokens_per_second": 114908.829
|
|
},
|
|
{
|
|
"epoch": 0.11959198030249736,
|
|
"grad_norm": 0.20329956710338593,
|
|
"learning_rate": 0.004900430218679523,
|
|
"loss": 3.3944183349609376,
|
|
"num_input_tokens_seen": 1158676480,
|
|
"step": 2210,
|
|
"train_runtime": 10083.038,
|
|
"train_tokens_per_second": 114913.43
|
|
},
|
|
{
|
|
"epoch": 0.1201331204848616,
|
|
"grad_norm": 0.2080426961183548,
|
|
"learning_rate": 0.004899270367729398,
|
|
"loss": 3.3978126525878904,
|
|
"num_input_tokens_seen": 1163919360,
|
|
"step": 2220,
|
|
"train_runtime": 10128.2721,
|
|
"train_tokens_per_second": 114917.86
|
|
},
|
|
{
|
|
"epoch": 0.12067426066722585,
|
|
"grad_norm": 0.19686874747276306,
|
|
"learning_rate": 0.004898103955155715,
|
|
"loss": 3.395246124267578,
|
|
"num_input_tokens_seen": 1169162240,
|
|
"step": 2230,
|
|
"train_runtime": 10173.5097,
|
|
"train_tokens_per_second": 114922.212
|
|
},
|
|
{
|
|
"epoch": 0.12121540084959008,
|
|
"grad_norm": 0.20237593352794647,
|
|
"learning_rate": 0.004896930984519478,
|
|
"loss": 3.3845314025878905,
|
|
"num_input_tokens_seen": 1174405120,
|
|
"step": 2240,
|
|
"train_runtime": 10218.7376,
|
|
"train_tokens_per_second": 114926.634
|
|
},
|
|
{
|
|
"epoch": 0.12175654103195432,
|
|
"grad_norm": 0.19075846672058105,
|
|
"learning_rate": 0.004895751459401713,
|
|
"loss": 3.380054473876953,
|
|
"num_input_tokens_seen": 1179648000,
|
|
"step": 2250,
|
|
"train_runtime": 10263.9453,
|
|
"train_tokens_per_second": 114931.243
|
|
},
|
|
{
|
|
"epoch": 0.12229768121431857,
|
|
"grad_norm": 0.18929868936538696,
|
|
"learning_rate": 0.004894565383403456,
|
|
"loss": 3.3817626953125,
|
|
"num_input_tokens_seen": 1184890880,
|
|
"step": 2260,
|
|
"train_runtime": 10309.1491,
|
|
"train_tokens_per_second": 114935.857
|
|
},
|
|
{
|
|
"epoch": 0.12283882139668281,
|
|
"grad_norm": 0.18654604256153107,
|
|
"learning_rate": 0.0048933727601457415,
|
|
"loss": 3.3808876037597657,
|
|
"num_input_tokens_seen": 1190133760,
|
|
"step": 2270,
|
|
"train_runtime": 10354.365,
|
|
"train_tokens_per_second": 114940.294
|
|
},
|
|
{
|
|
"epoch": 0.12337996157904706,
|
|
"grad_norm": 0.19212667644023895,
|
|
"learning_rate": 0.004892173593269593,
|
|
"loss": 3.378282928466797,
|
|
"num_input_tokens_seen": 1195376640,
|
|
"step": 2280,
|
|
"train_runtime": 10399.5899,
|
|
"train_tokens_per_second": 114944.594
|
|
},
|
|
{
|
|
"epoch": 0.12392110176141129,
|
|
"grad_norm": 0.19903384149074554,
|
|
"learning_rate": 0.004890967886436014,
|
|
"loss": 3.384090042114258,
|
|
"num_input_tokens_seen": 1200619520,
|
|
"step": 2290,
|
|
"train_runtime": 10448.5527,
|
|
"train_tokens_per_second": 114907.735
|
|
},
|
|
{
|
|
"epoch": 0.12446224194377553,
|
|
"grad_norm": 0.19387711584568024,
|
|
"learning_rate": 0.004889755643325971,
|
|
"loss": 3.380754089355469,
|
|
"num_input_tokens_seen": 1205862400,
|
|
"step": 2300,
|
|
"train_runtime": 10493.7881,
|
|
"train_tokens_per_second": 114912.021
|
|
},
|
|
{
|
|
"epoch": 0.1250033821261398,
|
|
"grad_norm": 0.20909279584884644,
|
|
"learning_rate": 0.0048885368676403855,
|
|
"loss": 3.3727947235107423,
|
|
"num_input_tokens_seen": 1211105280,
|
|
"step": 2310,
|
|
"train_runtime": 10539.0351,
|
|
"train_tokens_per_second": 114916.145
|
|
},
|
|
{
|
|
"epoch": 0.12554452230850402,
|
|
"grad_norm": 0.17621192336082458,
|
|
"learning_rate": 0.004887311563100124,
|
|
"loss": 3.384077453613281,
|
|
"num_input_tokens_seen": 1216348160,
|
|
"step": 2320,
|
|
"train_runtime": 10584.2319,
|
|
"train_tokens_per_second": 114920.777
|
|
},
|
|
{
|
|
"epoch": 0.12608566249086825,
|
|
"grad_norm": 0.19513387978076935,
|
|
"learning_rate": 0.004886079733445985,
|
|
"loss": 3.378644561767578,
|
|
"num_input_tokens_seen": 1221591040,
|
|
"step": 2330,
|
|
"train_runtime": 10629.451,
|
|
"train_tokens_per_second": 114925.13
|
|
},
|
|
{
|
|
"epoch": 0.1266268026732325,
|
|
"grad_norm": 0.19339337944984436,
|
|
"learning_rate": 0.004884841382438689,
|
|
"loss": 3.3802566528320312,
|
|
"num_input_tokens_seen": 1226833920,
|
|
"step": 2340,
|
|
"train_runtime": 10674.6651,
|
|
"train_tokens_per_second": 114929.5
|
|
},
|
|
{
|
|
"epoch": 0.12716794285559674,
|
|
"grad_norm": 0.20770232379436493,
|
|
"learning_rate": 0.004883596513858863,
|
|
"loss": 3.3678009033203127,
|
|
"num_input_tokens_seen": 1232076800,
|
|
"step": 2350,
|
|
"train_runtime": 10719.8522,
|
|
"train_tokens_per_second": 114934.122
|
|
},
|
|
{
|
|
"epoch": 0.127709083037961,
|
|
"grad_norm": 0.19512751698493958,
|
|
"learning_rate": 0.004882345131507035,
|
|
"loss": 3.3827003479003905,
|
|
"num_input_tokens_seen": 1237319680,
|
|
"step": 2360,
|
|
"train_runtime": 10765.0276,
|
|
"train_tokens_per_second": 114938.831
|
|
},
|
|
{
|
|
"epoch": 0.12825022322032523,
|
|
"grad_norm": 0.1894627958536148,
|
|
"learning_rate": 0.004881087239203616,
|
|
"loss": 3.377857208251953,
|
|
"num_input_tokens_seen": 1242562560,
|
|
"step": 2370,
|
|
"train_runtime": 10810.2272,
|
|
"train_tokens_per_second": 114943.242
|
|
},
|
|
{
|
|
"epoch": 0.12879136340268946,
|
|
"grad_norm": 0.18233883380889893,
|
|
"learning_rate": 0.004879822840788895,
|
|
"loss": 3.370525360107422,
|
|
"num_input_tokens_seen": 1247805440,
|
|
"step": 2380,
|
|
"train_runtime": 10855.4267,
|
|
"train_tokens_per_second": 114947.618
|
|
},
|
|
{
|
|
"epoch": 0.12933250358505372,
|
|
"grad_norm": 0.1876172125339508,
|
|
"learning_rate": 0.00487855194012302,
|
|
"loss": 3.3757583618164064,
|
|
"num_input_tokens_seen": 1253048320,
|
|
"step": 2390,
|
|
"train_runtime": 10900.6108,
|
|
"train_tokens_per_second": 114952.12
|
|
},
|
|
{
|
|
"epoch": 0.12987364376741795,
|
|
"grad_norm": 0.19061093032360077,
|
|
"learning_rate": 0.0048772745410859955,
|
|
"loss": 3.371135711669922,
|
|
"num_input_tokens_seen": 1258291200,
|
|
"step": 2400,
|
|
"train_runtime": 10945.8277,
|
|
"train_tokens_per_second": 114956.24
|
|
},
|
|
{
|
|
"epoch": 0.1304147839497822,
|
|
"grad_norm": 0.19320231676101685,
|
|
"learning_rate": 0.004875990647577659,
|
|
"loss": 3.376973342895508,
|
|
"num_input_tokens_seen": 1263534080,
|
|
"step": 2410,
|
|
"train_runtime": 10991.0522,
|
|
"train_tokens_per_second": 114960.247
|
|
},
|
|
{
|
|
"epoch": 0.13095592413214643,
|
|
"grad_norm": 0.18665140867233276,
|
|
"learning_rate": 0.004874700263517679,
|
|
"loss": 3.371229553222656,
|
|
"num_input_tokens_seen": 1268776960,
|
|
"step": 2420,
|
|
"train_runtime": 11036.2334,
|
|
"train_tokens_per_second": 114964.673
|
|
},
|
|
{
|
|
"epoch": 0.13149706431451066,
|
|
"grad_norm": 0.18199113011360168,
|
|
"learning_rate": 0.004873403392845541,
|
|
"loss": 3.361619567871094,
|
|
"num_input_tokens_seen": 1274019840,
|
|
"step": 2430,
|
|
"train_runtime": 11081.4325,
|
|
"train_tokens_per_second": 114968.877
|
|
},
|
|
{
|
|
"epoch": 0.13203820449687492,
|
|
"grad_norm": 0.19316934049129486,
|
|
"learning_rate": 0.004872100039520528,
|
|
"loss": 3.360996627807617,
|
|
"num_input_tokens_seen": 1279262720,
|
|
"step": 2440,
|
|
"train_runtime": 11126.6221,
|
|
"train_tokens_per_second": 114973.144
|
|
},
|
|
{
|
|
"epoch": 0.13257934467923915,
|
|
"grad_norm": 0.185124471783638,
|
|
"learning_rate": 0.00487079020752172,
|
|
"loss": 3.366575241088867,
|
|
"num_input_tokens_seen": 1284505600,
|
|
"step": 2450,
|
|
"train_runtime": 11171.8004,
|
|
"train_tokens_per_second": 114977.493
|
|
},
|
|
{
|
|
"epoch": 0.1331204848616034,
|
|
"grad_norm": 0.18804939091205597,
|
|
"learning_rate": 0.004869473900847973,
|
|
"loss": 3.3575817108154298,
|
|
"num_input_tokens_seen": 1289748480,
|
|
"step": 2460,
|
|
"train_runtime": 11217.0182,
|
|
"train_tokens_per_second": 114981.402
|
|
},
|
|
{
|
|
"epoch": 0.13366162504396764,
|
|
"grad_norm": 0.19206225872039795,
|
|
"learning_rate": 0.004868151123517911,
|
|
"loss": 3.3654083251953124,
|
|
"num_input_tokens_seen": 1294991360,
|
|
"step": 2470,
|
|
"train_runtime": 11262.2319,
|
|
"train_tokens_per_second": 114985.322
|
|
},
|
|
{
|
|
"epoch": 0.13420276522633187,
|
|
"grad_norm": 0.1901652067899704,
|
|
"learning_rate": 0.004866821879569913,
|
|
"loss": 3.3583431243896484,
|
|
"num_input_tokens_seen": 1300234240,
|
|
"step": 2480,
|
|
"train_runtime": 11307.4375,
|
|
"train_tokens_per_second": 114989.293
|
|
},
|
|
{
|
|
"epoch": 0.13474390540869613,
|
|
"grad_norm": 0.1906082034111023,
|
|
"learning_rate": 0.004865486173062098,
|
|
"loss": 3.3592803955078123,
|
|
"num_input_tokens_seen": 1305477120,
|
|
"step": 2490,
|
|
"train_runtime": 11352.6608,
|
|
"train_tokens_per_second": 114993.053
|
|
},
|
|
{
|
|
"epoch": 0.13528504559106036,
|
|
"grad_norm": 0.17934362590312958,
|
|
"learning_rate": 0.004864144008072318,
|
|
"loss": 3.3405136108398437,
|
|
"num_input_tokens_seen": 1310720000,
|
|
"step": 2500,
|
|
"train_runtime": 11397.8619,
|
|
"train_tokens_per_second": 114997.007
|
|
},
|
|
{
|
|
"epoch": 0.13528504559106036,
|
|
"eval_loss": 3.2980093955993652,
|
|
"eval_runtime": 1.9858,
|
|
"eval_samples_per_second": 251.785,
|
|
"eval_steps_per_second": 4.029,
|
|
"num_input_tokens_seen": 1310720000,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.13582618577342462,
|
|
"grad_norm": 0.18693317472934723,
|
|
"learning_rate": 0.00486279538869814,
|
|
"loss": 3.367817687988281,
|
|
"num_input_tokens_seen": 1315962880,
|
|
"step": 2510,
|
|
"train_runtime": 11445.0335,
|
|
"train_tokens_per_second": 114981.13
|
|
},
|
|
{
|
|
"epoch": 0.13636732595578885,
|
|
"grad_norm": 0.18467511236667633,
|
|
"learning_rate": 0.004861440319056837,
|
|
"loss": 3.355504608154297,
|
|
"num_input_tokens_seen": 1321205760,
|
|
"step": 2520,
|
|
"train_runtime": 11490.248,
|
|
"train_tokens_per_second": 114984.965
|
|
},
|
|
{
|
|
"epoch": 0.13690846613815308,
|
|
"grad_norm": 0.17405100166797638,
|
|
"learning_rate": 0.004860078803285375,
|
|
"loss": 3.3486671447753906,
|
|
"num_input_tokens_seen": 1326448640,
|
|
"step": 2530,
|
|
"train_runtime": 11535.4773,
|
|
"train_tokens_per_second": 114988.622
|
|
},
|
|
{
|
|
"epoch": 0.13744960632051734,
|
|
"grad_norm": 0.1991710066795349,
|
|
"learning_rate": 0.0048587108455403994,
|
|
"loss": 3.3470123291015623,
|
|
"num_input_tokens_seen": 1331691520,
|
|
"step": 2540,
|
|
"train_runtime": 11580.7091,
|
|
"train_tokens_per_second": 114992.226
|
|
},
|
|
{
|
|
"epoch": 0.13799074650288157,
|
|
"grad_norm": 0.19981589913368225,
|
|
"learning_rate": 0.004857336449998221,
|
|
"loss": 3.355559539794922,
|
|
"num_input_tokens_seen": 1336934400,
|
|
"step": 2550,
|
|
"train_runtime": 11625.9198,
|
|
"train_tokens_per_second": 114996.011
|
|
},
|
|
{
|
|
"epoch": 0.13853188668524583,
|
|
"grad_norm": 0.19031056761741638,
|
|
"learning_rate": 0.004855955620854806,
|
|
"loss": 3.359702301025391,
|
|
"num_input_tokens_seen": 1342177280,
|
|
"step": 2560,
|
|
"train_runtime": 11671.1393,
|
|
"train_tokens_per_second": 114999.68
|
|
},
|
|
{
|
|
"epoch": 0.13907302686761006,
|
|
"grad_norm": 0.1825476437807083,
|
|
"learning_rate": 0.004854568362325763,
|
|
"loss": 3.3532974243164064,
|
|
"num_input_tokens_seen": 1347420160,
|
|
"step": 2570,
|
|
"train_runtime": 11716.3486,
|
|
"train_tokens_per_second": 115003.42
|
|
},
|
|
{
|
|
"epoch": 0.13961416704997429,
|
|
"grad_norm": 0.1943996399641037,
|
|
"learning_rate": 0.004853174678646328,
|
|
"loss": 3.3549442291259766,
|
|
"num_input_tokens_seen": 1352663040,
|
|
"step": 2580,
|
|
"train_runtime": 11761.5665,
|
|
"train_tokens_per_second": 115007.048
|
|
},
|
|
{
|
|
"epoch": 0.14015530723233854,
|
|
"grad_norm": 0.18165603280067444,
|
|
"learning_rate": 0.004851774574071355,
|
|
"loss": 3.345872497558594,
|
|
"num_input_tokens_seen": 1357905920,
|
|
"step": 2590,
|
|
"train_runtime": 11806.7776,
|
|
"train_tokens_per_second": 115010.714
|
|
},
|
|
{
|
|
"epoch": 0.14069644741470277,
|
|
"grad_norm": 0.19906878471374512,
|
|
"learning_rate": 0.004850368052875296,
|
|
"loss": 3.3501548767089844,
|
|
"num_input_tokens_seen": 1363148800,
|
|
"step": 2600,
|
|
"train_runtime": 11851.978,
|
|
"train_tokens_per_second": 115014.456
|
|
},
|
|
{
|
|
"epoch": 0.14123758759706703,
|
|
"grad_norm": 0.1889800876379013,
|
|
"learning_rate": 0.004848955119352198,
|
|
"loss": 3.357212448120117,
|
|
"num_input_tokens_seen": 1368391680,
|
|
"step": 2610,
|
|
"train_runtime": 11897.1651,
|
|
"train_tokens_per_second": 115018.298
|
|
},
|
|
{
|
|
"epoch": 0.14177872777943126,
|
|
"grad_norm": 0.1907270848751068,
|
|
"learning_rate": 0.00484753577781568,
|
|
"loss": 3.3405483245849608,
|
|
"num_input_tokens_seen": 1373634560,
|
|
"step": 2620,
|
|
"train_runtime": 11942.3695,
|
|
"train_tokens_per_second": 115021.945
|
|
},
|
|
{
|
|
"epoch": 0.1423198679617955,
|
|
"grad_norm": 0.18622975051403046,
|
|
"learning_rate": 0.004846110032598928,
|
|
"loss": 3.344770050048828,
|
|
"num_input_tokens_seen": 1378877440,
|
|
"step": 2630,
|
|
"train_runtime": 11987.559,
|
|
"train_tokens_per_second": 115025.706
|
|
},
|
|
{
|
|
"epoch": 0.14286100814415975,
|
|
"grad_norm": 0.20059405267238617,
|
|
"learning_rate": 0.004844677888054675,
|
|
"loss": 3.344530487060547,
|
|
"num_input_tokens_seen": 1384120320,
|
|
"step": 2640,
|
|
"train_runtime": 12032.7386,
|
|
"train_tokens_per_second": 115029.535
|
|
},
|
|
{
|
|
"epoch": 0.14340214832652398,
|
|
"grad_norm": 0.18230022490024567,
|
|
"learning_rate": 0.004843239348555194,
|
|
"loss": 3.340105438232422,
|
|
"num_input_tokens_seen": 1389363200,
|
|
"step": 2650,
|
|
"train_runtime": 12077.95,
|
|
"train_tokens_per_second": 115033.032
|
|
},
|
|
{
|
|
"epoch": 0.14394328850888824,
|
|
"grad_norm": 0.1814001351594925,
|
|
"learning_rate": 0.004841794418492279,
|
|
"loss": 3.3359622955322266,
|
|
"num_input_tokens_seen": 1394606080,
|
|
"step": 2660,
|
|
"train_runtime": 12123.1531,
|
|
"train_tokens_per_second": 115036.581
|
|
},
|
|
{
|
|
"epoch": 0.14448442869125247,
|
|
"grad_norm": 0.20956675708293915,
|
|
"learning_rate": 0.004840343102277236,
|
|
"loss": 3.3457298278808594,
|
|
"num_input_tokens_seen": 1399848960,
|
|
"step": 2670,
|
|
"train_runtime": 12168.3992,
|
|
"train_tokens_per_second": 115039.697
|
|
},
|
|
{
|
|
"epoch": 0.1450255688736167,
|
|
"grad_norm": 0.19602610170841217,
|
|
"learning_rate": 0.004838885404340865,
|
|
"loss": 3.337678909301758,
|
|
"num_input_tokens_seen": 1405091840,
|
|
"step": 2680,
|
|
"train_runtime": 12217.5728,
|
|
"train_tokens_per_second": 115005.809
|
|
},
|
|
{
|
|
"epoch": 0.14556670905598096,
|
|
"grad_norm": 0.18408642709255219,
|
|
"learning_rate": 0.00483742132913345,
|
|
"loss": 3.3393791198730467,
|
|
"num_input_tokens_seen": 1410334720,
|
|
"step": 2690,
|
|
"train_runtime": 12262.7736,
|
|
"train_tokens_per_second": 115009.439
|
|
},
|
|
{
|
|
"epoch": 0.1461078492383452,
|
|
"grad_norm": 0.21065284311771393,
|
|
"learning_rate": 0.00483595088112475,
|
|
"loss": 3.3346370697021483,
|
|
"num_input_tokens_seen": 1415577600,
|
|
"step": 2700,
|
|
"train_runtime": 12307.9818,
|
|
"train_tokens_per_second": 115012.974
|
|
},
|
|
{
|
|
"epoch": 0.14664898942070945,
|
|
"grad_norm": 0.18873485922813416,
|
|
"learning_rate": 0.00483447406480397,
|
|
"loss": 3.33388671875,
|
|
"num_input_tokens_seen": 1420820480,
|
|
"step": 2710,
|
|
"train_runtime": 12353.2004,
|
|
"train_tokens_per_second": 115016.387
|
|
},
|
|
{
|
|
"epoch": 0.14719012960307368,
|
|
"grad_norm": 0.18760992586612701,
|
|
"learning_rate": 0.004832990884679764,
|
|
"loss": 3.3374618530273437,
|
|
"num_input_tokens_seen": 1426063360,
|
|
"step": 2720,
|
|
"train_runtime": 12398.393,
|
|
"train_tokens_per_second": 115020.016
|
|
},
|
|
{
|
|
"epoch": 0.1477312697854379,
|
|
"grad_norm": 0.192196786403656,
|
|
"learning_rate": 0.004831501345280215,
|
|
"loss": 3.3331283569335937,
|
|
"num_input_tokens_seen": 1431306240,
|
|
"step": 2730,
|
|
"train_runtime": 12443.5889,
|
|
"train_tokens_per_second": 115023.588
|
|
},
|
|
{
|
|
"epoch": 0.14827240996780217,
|
|
"grad_norm": 0.18086469173431396,
|
|
"learning_rate": 0.004830005451152815,
|
|
"loss": 3.342273712158203,
|
|
"num_input_tokens_seen": 1436549120,
|
|
"step": 2740,
|
|
"train_runtime": 12488.8002,
|
|
"train_tokens_per_second": 115026.992
|
|
},
|
|
{
|
|
"epoch": 0.1488135501501664,
|
|
"grad_norm": 0.20178896188735962,
|
|
"learning_rate": 0.004828503206864461,
|
|
"loss": 3.340282440185547,
|
|
"num_input_tokens_seen": 1441792000,
|
|
"step": 2750,
|
|
"train_runtime": 12534.0299,
|
|
"train_tokens_per_second": 115030.203
|
|
},
|
|
{
|
|
"epoch": 0.14935469033253065,
|
|
"grad_norm": 0.1862727403640747,
|
|
"learning_rate": 0.004826994617001436,
|
|
"loss": 3.333884048461914,
|
|
"num_input_tokens_seen": 1447034880,
|
|
"step": 2760,
|
|
"train_runtime": 12579.232,
|
|
"train_tokens_per_second": 115033.643
|
|
},
|
|
{
|
|
"epoch": 0.14989583051489488,
|
|
"grad_norm": 0.18692044913768768,
|
|
"learning_rate": 0.004825479686169395,
|
|
"loss": 3.3313224792480467,
|
|
"num_input_tokens_seen": 1452277760,
|
|
"step": 2770,
|
|
"train_runtime": 12624.4525,
|
|
"train_tokens_per_second": 115036.89
|
|
},
|
|
{
|
|
"epoch": 0.15043697069725911,
|
|
"grad_norm": 0.19887301325798035,
|
|
"learning_rate": 0.004823958418993353,
|
|
"loss": 3.3318561553955077,
|
|
"num_input_tokens_seen": 1457520640,
|
|
"step": 2780,
|
|
"train_runtime": 12669.6266,
|
|
"train_tokens_per_second": 115040.536
|
|
},
|
|
{
|
|
"epoch": 0.15097811087962337,
|
|
"grad_norm": 0.19694387912750244,
|
|
"learning_rate": 0.004822430820117667,
|
|
"loss": 3.324603271484375,
|
|
"num_input_tokens_seen": 1462763520,
|
|
"step": 2790,
|
|
"train_runtime": 12714.8383,
|
|
"train_tokens_per_second": 115043.816
|
|
},
|
|
{
|
|
"epoch": 0.1515192510619876,
|
|
"grad_norm": 0.18864646553993225,
|
|
"learning_rate": 0.0048208968942060285,
|
|
"loss": 3.329520416259766,
|
|
"num_input_tokens_seen": 1468006400,
|
|
"step": 2800,
|
|
"train_runtime": 12760.0555,
|
|
"train_tokens_per_second": 115047.023
|
|
},
|
|
{
|
|
"epoch": 0.15206039124435186,
|
|
"grad_norm": 0.19097571074962616,
|
|
"learning_rate": 0.004819356645941442,
|
|
"loss": 3.334062194824219,
|
|
"num_input_tokens_seen": 1473249280,
|
|
"step": 2810,
|
|
"train_runtime": 12805.2599,
|
|
"train_tokens_per_second": 115050.323
|
|
},
|
|
{
|
|
"epoch": 0.1526015314267161,
|
|
"grad_norm": 0.18825189769268036,
|
|
"learning_rate": 0.004817810080026213,
|
|
"loss": 3.3339500427246094,
|
|
"num_input_tokens_seen": 1478492160,
|
|
"step": 2820,
|
|
"train_runtime": 12850.4647,
|
|
"train_tokens_per_second": 115053.595
|
|
},
|
|
{
|
|
"epoch": 0.15314267160908032,
|
|
"grad_norm": 0.17853769659996033,
|
|
"learning_rate": 0.004816257201181937,
|
|
"loss": 3.3271289825439454,
|
|
"num_input_tokens_seen": 1483735040,
|
|
"step": 2830,
|
|
"train_runtime": 12895.6915,
|
|
"train_tokens_per_second": 115056.648
|
|
},
|
|
{
|
|
"epoch": 0.15368381179144458,
|
|
"grad_norm": 0.1959368884563446,
|
|
"learning_rate": 0.004814698014149483,
|
|
"loss": 3.3293079376220702,
|
|
"num_input_tokens_seen": 1488977920,
|
|
"step": 2840,
|
|
"train_runtime": 12940.9039,
|
|
"train_tokens_per_second": 115059.808
|
|
},
|
|
{
|
|
"epoch": 0.1542249519738088,
|
|
"grad_norm": 0.1917344182729721,
|
|
"learning_rate": 0.0048131325236889745,
|
|
"loss": 3.3289634704589846,
|
|
"num_input_tokens_seen": 1494220800,
|
|
"step": 2850,
|
|
"train_runtime": 12986.1305,
|
|
"train_tokens_per_second": 115062.82
|
|
},
|
|
{
|
|
"epoch": 0.15476609215617307,
|
|
"grad_norm": 0.18901459872722626,
|
|
"learning_rate": 0.004811560734579785,
|
|
"loss": 3.3151206970214844,
|
|
"num_input_tokens_seen": 1499463680,
|
|
"step": 2860,
|
|
"train_runtime": 13031.3423,
|
|
"train_tokens_per_second": 115065.943
|
|
},
|
|
{
|
|
"epoch": 0.1553072323385373,
|
|
"grad_norm": 0.1884569674730301,
|
|
"learning_rate": 0.004809982651620513,
|
|
"loss": 3.321660614013672,
|
|
"num_input_tokens_seen": 1504706560,
|
|
"step": 2870,
|
|
"train_runtime": 13076.553,
|
|
"train_tokens_per_second": 115069.052
|
|
},
|
|
{
|
|
"epoch": 0.15584837252090153,
|
|
"grad_norm": 0.1960553228855133,
|
|
"learning_rate": 0.004808398279628971,
|
|
"loss": 3.326691436767578,
|
|
"num_input_tokens_seen": 1509949440,
|
|
"step": 2880,
|
|
"train_runtime": 13121.764,
|
|
"train_tokens_per_second": 115072.138
|
|
},
|
|
{
|
|
"epoch": 0.1563895127032658,
|
|
"grad_norm": 0.19503499567508698,
|
|
"learning_rate": 0.004806807623442178,
|
|
"loss": 3.321258544921875,
|
|
"num_input_tokens_seen": 1515192320,
|
|
"step": 2890,
|
|
"train_runtime": 13166.9951,
|
|
"train_tokens_per_second": 115075.027
|
|
},
|
|
{
|
|
"epoch": 0.15693065288563002,
|
|
"grad_norm": 0.19287334382534027,
|
|
"learning_rate": 0.004805210687916331,
|
|
"loss": 3.3227684020996096,
|
|
"num_input_tokens_seen": 1520435200,
|
|
"step": 2900,
|
|
"train_runtime": 13212.1814,
|
|
"train_tokens_per_second": 115078.287
|
|
},
|
|
{
|
|
"epoch": 0.15747179306799428,
|
|
"grad_norm": 0.19813010096549988,
|
|
"learning_rate": 0.004803607477926801,
|
|
"loss": 3.3109420776367187,
|
|
"num_input_tokens_seen": 1525678080,
|
|
"step": 2910,
|
|
"train_runtime": 13257.3873,
|
|
"train_tokens_per_second": 115081.354
|
|
},
|
|
{
|
|
"epoch": 0.1580129332503585,
|
|
"grad_norm": 0.19769278168678284,
|
|
"learning_rate": 0.004801997998368116,
|
|
"loss": 3.317332458496094,
|
|
"num_input_tokens_seen": 1530920960,
|
|
"step": 2920,
|
|
"train_runtime": 13302.5829,
|
|
"train_tokens_per_second": 115084.489
|
|
},
|
|
{
|
|
"epoch": 0.15855407343272274,
|
|
"grad_norm": 0.21613162755966187,
|
|
"learning_rate": 0.0048003822541539416,
|
|
"loss": 3.3125213623046874,
|
|
"num_input_tokens_seen": 1536163840,
|
|
"step": 2930,
|
|
"train_runtime": 13347.7691,
|
|
"train_tokens_per_second": 115087.684
|
|
},
|
|
{
|
|
"epoch": 0.159095213615087,
|
|
"grad_norm": 0.19285152852535248,
|
|
"learning_rate": 0.004798760250217072,
|
|
"loss": 3.3247020721435545,
|
|
"num_input_tokens_seen": 1541406720,
|
|
"step": 2940,
|
|
"train_runtime": 13392.9574,
|
|
"train_tokens_per_second": 115090.84
|
|
},
|
|
{
|
|
"epoch": 0.15963635379745122,
|
|
"grad_norm": 0.19629855453968048,
|
|
"learning_rate": 0.004797131991509409,
|
|
"loss": 3.3183937072753906,
|
|
"num_input_tokens_seen": 1546649600,
|
|
"step": 2950,
|
|
"train_runtime": 13438.1692,
|
|
"train_tokens_per_second": 115093.773
|
|
},
|
|
{
|
|
"epoch": 0.16017749397981548,
|
|
"grad_norm": 0.1971038430929184,
|
|
"learning_rate": 0.004795497483001952,
|
|
"loss": 3.3157825469970703,
|
|
"num_input_tokens_seen": 1551892480,
|
|
"step": 2960,
|
|
"train_runtime": 13483.3787,
|
|
"train_tokens_per_second": 115096.706
|
|
},
|
|
{
|
|
"epoch": 0.1607186341621797,
|
|
"grad_norm": 0.2136721909046173,
|
|
"learning_rate": 0.0047938567296847805,
|
|
"loss": 3.3181556701660155,
|
|
"num_input_tokens_seen": 1557135360,
|
|
"step": 2970,
|
|
"train_runtime": 13528.5747,
|
|
"train_tokens_per_second": 115099.734
|
|
},
|
|
{
|
|
"epoch": 0.16125977434454394,
|
|
"grad_norm": 0.18783020973205566,
|
|
"learning_rate": 0.004792209736567038,
|
|
"loss": 3.3050804138183594,
|
|
"num_input_tokens_seen": 1562378240,
|
|
"step": 2980,
|
|
"train_runtime": 13573.7919,
|
|
"train_tokens_per_second": 115102.563
|
|
},
|
|
{
|
|
"epoch": 0.1618009145269082,
|
|
"grad_norm": 0.17761750519275665,
|
|
"learning_rate": 0.0047905565086769205,
|
|
"loss": 3.313432312011719,
|
|
"num_input_tokens_seen": 1567621120,
|
|
"step": 2990,
|
|
"train_runtime": 13618.9923,
|
|
"train_tokens_per_second": 115105.515
|
|
},
|
|
{
|
|
"epoch": 0.16234205470927243,
|
|
"grad_norm": 0.1785641759634018,
|
|
"learning_rate": 0.004788897051061655,
|
|
"loss": 3.317774200439453,
|
|
"num_input_tokens_seen": 1572864000,
|
|
"step": 3000,
|
|
"train_runtime": 13664.2112,
|
|
"train_tokens_per_second": 115108.291
|
|
},
|
|
{
|
|
"epoch": 0.16234205470927243,
|
|
"eval_loss": 3.2526497840881348,
|
|
"eval_runtime": 1.9863,
|
|
"eval_samples_per_second": 251.723,
|
|
"eval_steps_per_second": 4.028,
|
|
"num_input_tokens_seen": 1572864000,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.1628831948916367,
|
|
"grad_norm": 0.19272948801517487,
|
|
"learning_rate": 0.004787231368787491,
|
|
"loss": 3.3128257751464845,
|
|
"num_input_tokens_seen": 1578106880,
|
|
"step": 3010,
|
|
"train_runtime": 13714.1711,
|
|
"train_tokens_per_second": 115071.255
|
|
},
|
|
{
|
|
"epoch": 0.16342433507400092,
|
|
"grad_norm": 0.1745939403772354,
|
|
"learning_rate": 0.004785559466939679,
|
|
"loss": 3.31363525390625,
|
|
"num_input_tokens_seen": 1583349760,
|
|
"step": 3020,
|
|
"train_runtime": 13759.3609,
|
|
"train_tokens_per_second": 115074.368
|
|
},
|
|
{
|
|
"epoch": 0.16396547525636515,
|
|
"grad_norm": 0.20123089849948883,
|
|
"learning_rate": 0.0047838813506224575,
|
|
"loss": 3.3179275512695314,
|
|
"num_input_tokens_seen": 1588592640,
|
|
"step": 3030,
|
|
"train_runtime": 13804.5657,
|
|
"train_tokens_per_second": 115077.335
|
|
},
|
|
{
|
|
"epoch": 0.1645066154387294,
|
|
"grad_norm": 0.19240304827690125,
|
|
"learning_rate": 0.004782197024959039,
|
|
"loss": 3.3164352416992187,
|
|
"num_input_tokens_seen": 1593835520,
|
|
"step": 3040,
|
|
"train_runtime": 13849.7747,
|
|
"train_tokens_per_second": 115080.249
|
|
},
|
|
{
|
|
"epoch": 0.16504775562109364,
|
|
"grad_norm": 0.19316141307353973,
|
|
"learning_rate": 0.004780506495091593,
|
|
"loss": 3.316120147705078,
|
|
"num_input_tokens_seen": 1599078400,
|
|
"step": 3050,
|
|
"train_runtime": 13894.9935,
|
|
"train_tokens_per_second": 115083.062
|
|
},
|
|
{
|
|
"epoch": 0.1655888958034579,
|
|
"grad_norm": 0.19889311492443085,
|
|
"learning_rate": 0.004778809766181229,
|
|
"loss": 3.3089508056640624,
|
|
"num_input_tokens_seen": 1604321280,
|
|
"step": 3060,
|
|
"train_runtime": 13943.9286,
|
|
"train_tokens_per_second": 115055.185
|
|
},
|
|
{
|
|
"epoch": 0.16613003598582213,
|
|
"grad_norm": 0.19427727162837982,
|
|
"learning_rate": 0.004777106843407982,
|
|
"loss": 3.3107887268066407,
|
|
"num_input_tokens_seen": 1609564160,
|
|
"step": 3070,
|
|
"train_runtime": 13989.1559,
|
|
"train_tokens_per_second": 115057.99
|
|
},
|
|
{
|
|
"epoch": 0.16667117616818636,
|
|
"grad_norm": 0.1867746263742447,
|
|
"learning_rate": 0.004775397731970797,
|
|
"loss": 3.306330108642578,
|
|
"num_input_tokens_seen": 1614807040,
|
|
"step": 3080,
|
|
"train_runtime": 14034.3821,
|
|
"train_tokens_per_second": 115060.787
|
|
},
|
|
{
|
|
"epoch": 0.16721231635055062,
|
|
"grad_norm": 0.18061718344688416,
|
|
"learning_rate": 0.0047736824370875125,
|
|
"loss": 3.3135826110839846,
|
|
"num_input_tokens_seen": 1620049920,
|
|
"step": 3090,
|
|
"train_runtime": 14079.6171,
|
|
"train_tokens_per_second": 115063.493
|
|
},
|
|
{
|
|
"epoch": 0.16775345653291485,
|
|
"grad_norm": 0.1992795616388321,
|
|
"learning_rate": 0.004771960963994845,
|
|
"loss": 3.2958747863769533,
|
|
"num_input_tokens_seen": 1625292800,
|
|
"step": 3100,
|
|
"train_runtime": 14124.8552,
|
|
"train_tokens_per_second": 115066.157
|
|
},
|
|
{
|
|
"epoch": 0.1682945967152791,
|
|
"grad_norm": 0.1932191550731659,
|
|
"learning_rate": 0.004770233317948373,
|
|
"loss": 3.305771255493164,
|
|
"num_input_tokens_seen": 1630535680,
|
|
"step": 3110,
|
|
"train_runtime": 14170.1029,
|
|
"train_tokens_per_second": 115068.725
|
|
},
|
|
{
|
|
"epoch": 0.16883573689764333,
|
|
"grad_norm": 0.18210622668266296,
|
|
"learning_rate": 0.00476849950422252,
|
|
"loss": 3.309395599365234,
|
|
"num_input_tokens_seen": 1635778560,
|
|
"step": 3120,
|
|
"train_runtime": 14215.3361,
|
|
"train_tokens_per_second": 115071.396
|
|
},
|
|
{
|
|
"epoch": 0.16937687708000757,
|
|
"grad_norm": 0.19367872178554535,
|
|
"learning_rate": 0.004766759528110539,
|
|
"loss": 3.302986907958984,
|
|
"num_input_tokens_seen": 1641021440,
|
|
"step": 3130,
|
|
"train_runtime": 14260.5763,
|
|
"train_tokens_per_second": 115073.992
|
|
},
|
|
{
|
|
"epoch": 0.16991801726237182,
|
|
"grad_norm": 0.19194577634334564,
|
|
"learning_rate": 0.004765013394924499,
|
|
"loss": 3.304148864746094,
|
|
"num_input_tokens_seen": 1646264320,
|
|
"step": 3140,
|
|
"train_runtime": 14305.8047,
|
|
"train_tokens_per_second": 115076.667
|
|
},
|
|
{
|
|
"epoch": 0.17045915744473605,
|
|
"grad_norm": 0.18965749442577362,
|
|
"learning_rate": 0.0047632611099952624,
|
|
"loss": 3.298334503173828,
|
|
"num_input_tokens_seen": 1651507200,
|
|
"step": 3150,
|
|
"train_runtime": 14351.0274,
|
|
"train_tokens_per_second": 115079.371
|
|
},
|
|
{
|
|
"epoch": 0.1710002976271003,
|
|
"grad_norm": 0.17816977202892303,
|
|
"learning_rate": 0.004761502678672474,
|
|
"loss": 3.300872802734375,
|
|
"num_input_tokens_seen": 1656750080,
|
|
"step": 3160,
|
|
"train_runtime": 14396.2711,
|
|
"train_tokens_per_second": 115081.889
|
|
},
|
|
{
|
|
"epoch": 0.17154143780946454,
|
|
"grad_norm": 0.19343526661396027,
|
|
"learning_rate": 0.004759738106324546,
|
|
"loss": 3.2991104125976562,
|
|
"num_input_tokens_seen": 1661992960,
|
|
"step": 3170,
|
|
"train_runtime": 14441.5146,
|
|
"train_tokens_per_second": 115084.394
|
|
},
|
|
{
|
|
"epoch": 0.17208257799182877,
|
|
"grad_norm": 0.18815293908119202,
|
|
"learning_rate": 0.004757967398338635,
|
|
"loss": 3.307733154296875,
|
|
"num_input_tokens_seen": 1667235840,
|
|
"step": 3180,
|
|
"train_runtime": 14486.7506,
|
|
"train_tokens_per_second": 115086.943
|
|
},
|
|
{
|
|
"epoch": 0.17262371817419303,
|
|
"grad_norm": 0.18241587281227112,
|
|
"learning_rate": 0.004756190560120631,
|
|
"loss": 3.2984477996826174,
|
|
"num_input_tokens_seen": 1672478720,
|
|
"step": 3190,
|
|
"train_runtime": 14531.9861,
|
|
"train_tokens_per_second": 115089.48
|
|
},
|
|
{
|
|
"epoch": 0.17316485835655726,
|
|
"grad_norm": 0.18176402151584625,
|
|
"learning_rate": 0.00475440759709514,
|
|
"loss": 3.300640869140625,
|
|
"num_input_tokens_seen": 1677721600,
|
|
"step": 3200,
|
|
"train_runtime": 14577.2147,
|
|
"train_tokens_per_second": 115092.055
|
|
},
|
|
{
|
|
"epoch": 0.17370599853892152,
|
|
"grad_norm": 0.20022694766521454,
|
|
"learning_rate": 0.004752618514705466,
|
|
"loss": 3.300579071044922,
|
|
"num_input_tokens_seen": 1682964480,
|
|
"step": 3210,
|
|
"train_runtime": 14622.4411,
|
|
"train_tokens_per_second": 115094.632
|
|
},
|
|
{
|
|
"epoch": 0.17424713872128575,
|
|
"grad_norm": 0.18792809545993805,
|
|
"learning_rate": 0.0047508233184135945,
|
|
"loss": 3.295984649658203,
|
|
"num_input_tokens_seen": 1688207360,
|
|
"step": 3220,
|
|
"train_runtime": 14667.6619,
|
|
"train_tokens_per_second": 115097.237
|
|
},
|
|
{
|
|
"epoch": 0.17478827890364998,
|
|
"grad_norm": 0.200827494263649,
|
|
"learning_rate": 0.0047490220137001785,
|
|
"loss": 3.2906261444091798,
|
|
"num_input_tokens_seen": 1693450240,
|
|
"step": 3230,
|
|
"train_runtime": 14712.8844,
|
|
"train_tokens_per_second": 115099.813
|
|
},
|
|
{
|
|
"epoch": 0.17532941908601424,
|
|
"grad_norm": 0.19141127169132233,
|
|
"learning_rate": 0.004747214606064517,
|
|
"loss": 3.2837890625,
|
|
"num_input_tokens_seen": 1698693120,
|
|
"step": 3240,
|
|
"train_runtime": 14758.1057,
|
|
"train_tokens_per_second": 115102.382
|
|
},
|
|
{
|
|
"epoch": 0.17587055926837847,
|
|
"grad_norm": 0.18976351618766785,
|
|
"learning_rate": 0.0047454011010245436,
|
|
"loss": 3.287107467651367,
|
|
"num_input_tokens_seen": 1703936000,
|
|
"step": 3250,
|
|
"train_runtime": 14803.3273,
|
|
"train_tokens_per_second": 115104.933
|
|
},
|
|
{
|
|
"epoch": 0.17641169945074273,
|
|
"grad_norm": 0.19698546826839447,
|
|
"learning_rate": 0.004743581504116804,
|
|
"loss": 3.2882354736328123,
|
|
"num_input_tokens_seen": 1709178880,
|
|
"step": 3260,
|
|
"train_runtime": 14848.5302,
|
|
"train_tokens_per_second": 115107.614
|
|
},
|
|
{
|
|
"epoch": 0.17695283963310696,
|
|
"grad_norm": 0.17822493612766266,
|
|
"learning_rate": 0.004741755820896446,
|
|
"loss": 3.2927810668945314,
|
|
"num_input_tokens_seen": 1714421760,
|
|
"step": 3270,
|
|
"train_runtime": 14893.7537,
|
|
"train_tokens_per_second": 115110.119
|
|
},
|
|
{
|
|
"epoch": 0.1774939798154712,
|
|
"grad_norm": 0.1720447987318039,
|
|
"learning_rate": 0.004739924056937195,
|
|
"loss": 3.2904899597167967,
|
|
"num_input_tokens_seen": 1719664640,
|
|
"step": 3280,
|
|
"train_runtime": 14938.9717,
|
|
"train_tokens_per_second": 115112.652
|
|
},
|
|
{
|
|
"epoch": 0.17803511999783544,
|
|
"grad_norm": 0.18303626775741577,
|
|
"learning_rate": 0.004738086217831344,
|
|
"loss": 3.28282470703125,
|
|
"num_input_tokens_seen": 1724907520,
|
|
"step": 3290,
|
|
"train_runtime": 14984.1992,
|
|
"train_tokens_per_second": 115115.096
|
|
},
|
|
{
|
|
"epoch": 0.17857626018019968,
|
|
"grad_norm": 0.176763117313385,
|
|
"learning_rate": 0.004736242309189728,
|
|
"loss": 3.286945343017578,
|
|
"num_input_tokens_seen": 1730150400,
|
|
"step": 3300,
|
|
"train_runtime": 15029.4297,
|
|
"train_tokens_per_second": 115117.502
|
|
},
|
|
{
|
|
"epoch": 0.17911740036256393,
|
|
"grad_norm": 0.19218850135803223,
|
|
"learning_rate": 0.004734392336641718,
|
|
"loss": 3.290885162353516,
|
|
"num_input_tokens_seen": 1735393280,
|
|
"step": 3310,
|
|
"train_runtime": 15074.6639,
|
|
"train_tokens_per_second": 115119.866
|
|
},
|
|
{
|
|
"epoch": 0.17965854054492816,
|
|
"grad_norm": 0.180914968252182,
|
|
"learning_rate": 0.004732536305835194,
|
|
"loss": 3.2893463134765626,
|
|
"num_input_tokens_seen": 1740636160,
|
|
"step": 3320,
|
|
"train_runtime": 15119.9044,
|
|
"train_tokens_per_second": 115122.167
|
|
},
|
|
{
|
|
"epoch": 0.1801996807272924,
|
|
"grad_norm": 0.1835494488477707,
|
|
"learning_rate": 0.0047306742224365326,
|
|
"loss": 3.2857479095458983,
|
|
"num_input_tokens_seen": 1745879040,
|
|
"step": 3330,
|
|
"train_runtime": 15165.1285,
|
|
"train_tokens_per_second": 115124.579
|
|
},
|
|
{
|
|
"epoch": 0.18074082090965665,
|
|
"grad_norm": 0.1805170625448227,
|
|
"learning_rate": 0.004728806092130589,
|
|
"loss": 3.2880077362060547,
|
|
"num_input_tokens_seen": 1751121920,
|
|
"step": 3340,
|
|
"train_runtime": 15210.3276,
|
|
"train_tokens_per_second": 115127.166
|
|
},
|
|
{
|
|
"epoch": 0.18128196109202088,
|
|
"grad_norm": 0.18228840827941895,
|
|
"learning_rate": 0.00472693192062068,
|
|
"loss": 3.286875915527344,
|
|
"num_input_tokens_seen": 1756364800,
|
|
"step": 3350,
|
|
"train_runtime": 15255.5307,
|
|
"train_tokens_per_second": 115129.709
|
|
},
|
|
{
|
|
"epoch": 0.18182310127438514,
|
|
"grad_norm": 0.20272916555404663,
|
|
"learning_rate": 0.0047250517136285634,
|
|
"loss": 3.2986392974853516,
|
|
"num_input_tokens_seen": 1761607680,
|
|
"step": 3360,
|
|
"train_runtime": 15300.7449,
|
|
"train_tokens_per_second": 115132.151
|
|
},
|
|
{
|
|
"epoch": 0.18236424145674937,
|
|
"grad_norm": 0.17199651896953583,
|
|
"learning_rate": 0.0047231654768944255,
|
|
"loss": 3.2849578857421875,
|
|
"num_input_tokens_seen": 1766850560,
|
|
"step": 3370,
|
|
"train_runtime": 15345.9591,
|
|
"train_tokens_per_second": 115134.58
|
|
},
|
|
{
|
|
"epoch": 0.1829053816391136,
|
|
"grad_norm": 0.18118058145046234,
|
|
"learning_rate": 0.00472127321617686,
|
|
"loss": 3.2900650024414064,
|
|
"num_input_tokens_seen": 1772093440,
|
|
"step": 3380,
|
|
"train_runtime": 15391.1925,
|
|
"train_tokens_per_second": 115136.851
|
|
},
|
|
{
|
|
"epoch": 0.18344652182147786,
|
|
"grad_norm": 0.19814889132976532,
|
|
"learning_rate": 0.004719374937252852,
|
|
"loss": 3.280558776855469,
|
|
"num_input_tokens_seen": 1777336320,
|
|
"step": 3390,
|
|
"train_runtime": 15436.4025,
|
|
"train_tokens_per_second": 115139.283
|
|
},
|
|
{
|
|
"epoch": 0.1839876620038421,
|
|
"grad_norm": 0.2015380561351776,
|
|
"learning_rate": 0.00471747064591776,
|
|
"loss": 3.30006103515625,
|
|
"num_input_tokens_seen": 1782579200,
|
|
"step": 3400,
|
|
"train_runtime": 15481.5971,
|
|
"train_tokens_per_second": 115141.816
|
|
},
|
|
{
|
|
"epoch": 0.18452880218620635,
|
|
"grad_norm": 0.16767387092113495,
|
|
"learning_rate": 0.0047155603479852965,
|
|
"loss": 3.2787837982177734,
|
|
"num_input_tokens_seen": 1787822080,
|
|
"step": 3410,
|
|
"train_runtime": 15526.8015,
|
|
"train_tokens_per_second": 115144.261
|
|
},
|
|
{
|
|
"epoch": 0.18506994236857058,
|
|
"grad_norm": 0.169756680727005,
|
|
"learning_rate": 0.0047136440492875145,
|
|
"loss": 3.283340072631836,
|
|
"num_input_tokens_seen": 1793064960,
|
|
"step": 3420,
|
|
"train_runtime": 15572.0514,
|
|
"train_tokens_per_second": 115146.355
|
|
},
|
|
{
|
|
"epoch": 0.1856110825509348,
|
|
"grad_norm": 0.18903492391109467,
|
|
"learning_rate": 0.004711721755674787,
|
|
"loss": 3.289379119873047,
|
|
"num_input_tokens_seen": 1798307840,
|
|
"step": 3430,
|
|
"train_runtime": 15617.2557,
|
|
"train_tokens_per_second": 115148.774
|
|
},
|
|
{
|
|
"epoch": 0.18615222273329907,
|
|
"grad_norm": 0.19553808867931366,
|
|
"learning_rate": 0.004709793473015785,
|
|
"loss": 3.277596664428711,
|
|
"num_input_tokens_seen": 1803550720,
|
|
"step": 3440,
|
|
"train_runtime": 15666.2718,
|
|
"train_tokens_per_second": 115123.16
|
|
},
|
|
{
|
|
"epoch": 0.1866933629156633,
|
|
"grad_norm": 0.17524783313274384,
|
|
"learning_rate": 0.004707859207197468,
|
|
"loss": 3.272700881958008,
|
|
"num_input_tokens_seen": 1808793600,
|
|
"step": 3450,
|
|
"train_runtime": 15711.4431,
|
|
"train_tokens_per_second": 115125.873
|
|
},
|
|
{
|
|
"epoch": 0.18723450309802755,
|
|
"grad_norm": 0.1725703924894333,
|
|
"learning_rate": 0.004705918964125061,
|
|
"loss": 3.2771453857421875,
|
|
"num_input_tokens_seen": 1814036480,
|
|
"step": 3460,
|
|
"train_runtime": 15756.6281,
|
|
"train_tokens_per_second": 115128.47
|
|
},
|
|
{
|
|
"epoch": 0.18777564328039179,
|
|
"grad_norm": 0.18361718952655792,
|
|
"learning_rate": 0.004703972749722038,
|
|
"loss": 3.2812034606933596,
|
|
"num_input_tokens_seen": 1819279360,
|
|
"step": 3470,
|
|
"train_runtime": 15801.8225,
|
|
"train_tokens_per_second": 115130.983
|
|
},
|
|
{
|
|
"epoch": 0.18831678346275602,
|
|
"grad_norm": 0.18993116915225983,
|
|
"learning_rate": 0.004702020569930098,
|
|
"loss": 3.2690109252929687,
|
|
"num_input_tokens_seen": 1824522240,
|
|
"step": 3480,
|
|
"train_runtime": 15847.0133,
|
|
"train_tokens_per_second": 115133.508
|
|
},
|
|
{
|
|
"epoch": 0.18885792364512027,
|
|
"grad_norm": 0.1982622891664505,
|
|
"learning_rate": 0.004700062430709161,
|
|
"loss": 3.2883895874023437,
|
|
"num_input_tokens_seen": 1829765120,
|
|
"step": 3490,
|
|
"train_runtime": 15892.1956,
|
|
"train_tokens_per_second": 115136.081
|
|
},
|
|
{
|
|
"epoch": 0.1893990638274845,
|
|
"grad_norm": 0.1953168362379074,
|
|
"learning_rate": 0.004698098338037333,
|
|
"loss": 3.2819141387939452,
|
|
"num_input_tokens_seen": 1835008000,
|
|
"step": 3500,
|
|
"train_runtime": 15937.3587,
|
|
"train_tokens_per_second": 115138.778
|
|
},
|
|
{
|
|
"epoch": 0.1893990638274845,
|
|
"eval_loss": 3.2193782329559326,
|
|
"eval_runtime": 1.9829,
|
|
"eval_samples_per_second": 252.151,
|
|
"eval_steps_per_second": 4.034,
|
|
"num_input_tokens_seen": 1835008000,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.18994020400984876,
|
|
"grad_norm": 0.17765532433986664,
|
|
"learning_rate": 0.004696128297910899,
|
|
"loss": 3.2748733520507813,
|
|
"num_input_tokens_seen": 1840250880,
|
|
"step": 3510,
|
|
"train_runtime": 15984.532,
|
|
"train_tokens_per_second": 115126.979
|
|
},
|
|
{
|
|
"epoch": 0.190481344192213,
|
|
"grad_norm": 0.1692020744085312,
|
|
"learning_rate": 0.0046941523163443015,
|
|
"loss": 3.282354736328125,
|
|
"num_input_tokens_seen": 1845493760,
|
|
"step": 3520,
|
|
"train_runtime": 16029.707,
|
|
"train_tokens_per_second": 115129.601
|
|
},
|
|
{
|
|
"epoch": 0.19102248437457722,
|
|
"grad_norm": 0.17890560626983643,
|
|
"learning_rate": 0.00469217039937012,
|
|
"loss": 3.27266845703125,
|
|
"num_input_tokens_seen": 1850736640,
|
|
"step": 3530,
|
|
"train_runtime": 16074.8942,
|
|
"train_tokens_per_second": 115132.119
|
|
},
|
|
{
|
|
"epoch": 0.19156362455694148,
|
|
"grad_norm": 0.17925257980823517,
|
|
"learning_rate": 0.004690182553039058,
|
|
"loss": 3.28330078125,
|
|
"num_input_tokens_seen": 1855979520,
|
|
"step": 3540,
|
|
"train_runtime": 16120.1066,
|
|
"train_tokens_per_second": 115134.445
|
|
},
|
|
{
|
|
"epoch": 0.1921047647393057,
|
|
"grad_norm": 0.1788860410451889,
|
|
"learning_rate": 0.004688188783419917,
|
|
"loss": 3.2885406494140623,
|
|
"num_input_tokens_seen": 1861222400,
|
|
"step": 3550,
|
|
"train_runtime": 16165.2878,
|
|
"train_tokens_per_second": 115136.979
|
|
},
|
|
{
|
|
"epoch": 0.19264590492166997,
|
|
"grad_norm": 0.1811930388212204,
|
|
"learning_rate": 0.004686189096599585,
|
|
"loss": 3.2768978118896483,
|
|
"num_input_tokens_seen": 1866465280,
|
|
"step": 3560,
|
|
"train_runtime": 16210.475,
|
|
"train_tokens_per_second": 115139.456
|
|
},
|
|
{
|
|
"epoch": 0.1931870451040342,
|
|
"grad_norm": 0.2033502757549286,
|
|
"learning_rate": 0.004684183498683013,
|
|
"loss": 3.2799072265625,
|
|
"num_input_tokens_seen": 1871708160,
|
|
"step": 3570,
|
|
"train_runtime": 16255.6518,
|
|
"train_tokens_per_second": 115141.994
|
|
},
|
|
{
|
|
"epoch": 0.19372818528639843,
|
|
"grad_norm": 0.1871407926082611,
|
|
"learning_rate": 0.0046821719957932,
|
|
"loss": 3.2745807647705076,
|
|
"num_input_tokens_seen": 1876951040,
|
|
"step": 3580,
|
|
"train_runtime": 16300.8306,
|
|
"train_tokens_per_second": 115144.503
|
|
},
|
|
{
|
|
"epoch": 0.1942693254687627,
|
|
"grad_norm": 0.18156413733959198,
|
|
"learning_rate": 0.004680154594071171,
|
|
"loss": 3.275892639160156,
|
|
"num_input_tokens_seen": 1882193920,
|
|
"step": 3590,
|
|
"train_runtime": 16346.0348,
|
|
"train_tokens_per_second": 115146.819
|
|
},
|
|
{
|
|
"epoch": 0.19481046565112692,
|
|
"grad_norm": 0.17189612984657288,
|
|
"learning_rate": 0.004678131299675962,
|
|
"loss": 3.278411102294922,
|
|
"num_input_tokens_seen": 1887436800,
|
|
"step": 3600,
|
|
"train_runtime": 16391.2075,
|
|
"train_tokens_per_second": 115149.344
|
|
},
|
|
{
|
|
"epoch": 0.19535160583349118,
|
|
"grad_norm": 0.18602769076824188,
|
|
"learning_rate": 0.004676102118784596,
|
|
"loss": 3.2600128173828127,
|
|
"num_input_tokens_seen": 1892679680,
|
|
"step": 3610,
|
|
"train_runtime": 16436.3754,
|
|
"train_tokens_per_second": 115151.89
|
|
},
|
|
{
|
|
"epoch": 0.1958927460158554,
|
|
"grad_norm": 0.18247896432876587,
|
|
"learning_rate": 0.0046740670575920705,
|
|
"loss": 3.263835906982422,
|
|
"num_input_tokens_seen": 1897922560,
|
|
"step": 3620,
|
|
"train_runtime": 16481.5414,
|
|
"train_tokens_per_second": 115154.434
|
|
},
|
|
{
|
|
"epoch": 0.19643388619821964,
|
|
"grad_norm": 0.17899157106876373,
|
|
"learning_rate": 0.004672026122311332,
|
|
"loss": 3.266416549682617,
|
|
"num_input_tokens_seen": 1903165440,
|
|
"step": 3630,
|
|
"train_runtime": 16526.6863,
|
|
"train_tokens_per_second": 115157.111
|
|
},
|
|
{
|
|
"epoch": 0.1969750263805839,
|
|
"grad_norm": 0.19543124735355377,
|
|
"learning_rate": 0.004669979319173264,
|
|
"loss": 3.261871337890625,
|
|
"num_input_tokens_seen": 1908408320,
|
|
"step": 3640,
|
|
"train_runtime": 16571.8633,
|
|
"train_tokens_per_second": 115159.55
|
|
},
|
|
{
|
|
"epoch": 0.19751616656294813,
|
|
"grad_norm": 0.18458126485347748,
|
|
"learning_rate": 0.004667926654426661,
|
|
"loss": 3.2731971740722656,
|
|
"num_input_tokens_seen": 1913651200,
|
|
"step": 3650,
|
|
"train_runtime": 16617.0658,
|
|
"train_tokens_per_second": 115161.799
|
|
},
|
|
{
|
|
"epoch": 0.19805730674531238,
|
|
"grad_norm": 0.18683847784996033,
|
|
"learning_rate": 0.004665868134338213,
|
|
"loss": 3.2641891479492187,
|
|
"num_input_tokens_seen": 1918894080,
|
|
"step": 3660,
|
|
"train_runtime": 16662.2485,
|
|
"train_tokens_per_second": 115164.173
|
|
},
|
|
{
|
|
"epoch": 0.19859844692767661,
|
|
"grad_norm": 0.18460538983345032,
|
|
"learning_rate": 0.00466380376519249,
|
|
"loss": 3.261058044433594,
|
|
"num_input_tokens_seen": 1924136960,
|
|
"step": 3670,
|
|
"train_runtime": 16707.4259,
|
|
"train_tokens_per_second": 115166.572
|
|
},
|
|
{
|
|
"epoch": 0.19913958711004084,
|
|
"grad_norm": 0.17181532084941864,
|
|
"learning_rate": 0.004661733553291914,
|
|
"loss": 3.2611160278320312,
|
|
"num_input_tokens_seen": 1929379840,
|
|
"step": 3680,
|
|
"train_runtime": 16752.6562,
|
|
"train_tokens_per_second": 115168.593
|
|
},
|
|
{
|
|
"epoch": 0.1996807272924051,
|
|
"grad_norm": 0.19158703088760376,
|
|
"learning_rate": 0.004659657504956747,
|
|
"loss": 3.2646514892578127,
|
|
"num_input_tokens_seen": 1934622720,
|
|
"step": 3690,
|
|
"train_runtime": 16797.8998,
|
|
"train_tokens_per_second": 115170.512
|
|
},
|
|
{
|
|
"epoch": 0.20022186747476933,
|
|
"grad_norm": 0.18142655491828918,
|
|
"learning_rate": 0.004657575626525069,
|
|
"loss": 3.258336639404297,
|
|
"num_input_tokens_seen": 1939865600,
|
|
"step": 3700,
|
|
"train_runtime": 16843.1356,
|
|
"train_tokens_per_second": 115172.474
|
|
},
|
|
{
|
|
"epoch": 0.2007630076571336,
|
|
"grad_norm": 0.1888807713985443,
|
|
"learning_rate": 0.00465548792435276,
|
|
"loss": 3.256613922119141,
|
|
"num_input_tokens_seen": 1945108480,
|
|
"step": 3710,
|
|
"train_runtime": 16888.3732,
|
|
"train_tokens_per_second": 115174.414
|
|
},
|
|
{
|
|
"epoch": 0.20130414783949782,
|
|
"grad_norm": 0.17357957363128662,
|
|
"learning_rate": 0.004653394404813478,
|
|
"loss": 3.2642303466796876,
|
|
"num_input_tokens_seen": 1950351360,
|
|
"step": 3720,
|
|
"train_runtime": 16933.6199,
|
|
"train_tokens_per_second": 115176.281
|
|
},
|
|
{
|
|
"epoch": 0.20184528802186205,
|
|
"grad_norm": 0.17942315340042114,
|
|
"learning_rate": 0.004651295074298641,
|
|
"loss": 3.254298782348633,
|
|
"num_input_tokens_seen": 1955594240,
|
|
"step": 3730,
|
|
"train_runtime": 16978.845,
|
|
"train_tokens_per_second": 115178.284
|
|
},
|
|
{
|
|
"epoch": 0.2023864282042263,
|
|
"grad_norm": 0.17983372509479523,
|
|
"learning_rate": 0.00464918993921741,
|
|
"loss": 3.2564628601074217,
|
|
"num_input_tokens_seen": 1960837120,
|
|
"step": 3740,
|
|
"train_runtime": 17024.0569,
|
|
"train_tokens_per_second": 115180.367
|
|
},
|
|
{
|
|
"epoch": 0.20292756838659054,
|
|
"grad_norm": 0.19154661893844604,
|
|
"learning_rate": 0.004647079005996664,
|
|
"loss": 3.2626083374023436,
|
|
"num_input_tokens_seen": 1966080000,
|
|
"step": 3750,
|
|
"train_runtime": 17069.2756,
|
|
"train_tokens_per_second": 115182.392
|
|
},
|
|
{
|
|
"epoch": 0.2034687085689548,
|
|
"grad_norm": 0.16907712817192078,
|
|
"learning_rate": 0.0046449622810809865,
|
|
"loss": 3.2560802459716798,
|
|
"num_input_tokens_seen": 1971322880,
|
|
"step": 3760,
|
|
"train_runtime": 17114.512,
|
|
"train_tokens_per_second": 115184.288
|
|
},
|
|
{
|
|
"epoch": 0.20400984875131903,
|
|
"grad_norm": 0.1877511590719223,
|
|
"learning_rate": 0.004642839770932641,
|
|
"loss": 3.2611919403076173,
|
|
"num_input_tokens_seen": 1976565760,
|
|
"step": 3770,
|
|
"train_runtime": 17159.7356,
|
|
"train_tokens_per_second": 115186.26
|
|
},
|
|
{
|
|
"epoch": 0.20455098893368326,
|
|
"grad_norm": 0.1924838423728943,
|
|
"learning_rate": 0.004640711482031552,
|
|
"loss": 3.259069061279297,
|
|
"num_input_tokens_seen": 1981808640,
|
|
"step": 3780,
|
|
"train_runtime": 17204.9712,
|
|
"train_tokens_per_second": 115188.14
|
|
},
|
|
{
|
|
"epoch": 0.20509212911604752,
|
|
"grad_norm": 0.17791348695755005,
|
|
"learning_rate": 0.00463857742087529,
|
|
"loss": 3.2603363037109374,
|
|
"num_input_tokens_seen": 1987051520,
|
|
"step": 3790,
|
|
"train_runtime": 17250.1988,
|
|
"train_tokens_per_second": 115190.065
|
|
},
|
|
{
|
|
"epoch": 0.20563326929841175,
|
|
"grad_norm": 0.18873880803585052,
|
|
"learning_rate": 0.004636437593979043,
|
|
"loss": 3.260697937011719,
|
|
"num_input_tokens_seen": 1992294400,
|
|
"step": 3800,
|
|
"train_runtime": 17295.4319,
|
|
"train_tokens_per_second": 115191.943
|
|
},
|
|
{
|
|
"epoch": 0.206174409480776,
|
|
"grad_norm": 0.1765436977148056,
|
|
"learning_rate": 0.004634292007875606,
|
|
"loss": 3.25205078125,
|
|
"num_input_tokens_seen": 1997537280,
|
|
"step": 3810,
|
|
"train_runtime": 17340.6638,
|
|
"train_tokens_per_second": 115193.819
|
|
},
|
|
{
|
|
"epoch": 0.20671554966314024,
|
|
"grad_norm": 0.17367282509803772,
|
|
"learning_rate": 0.004632140669115353,
|
|
"loss": 3.2628250122070312,
|
|
"num_input_tokens_seen": 2002780160,
|
|
"step": 3820,
|
|
"train_runtime": 17390.4197,
|
|
"train_tokens_per_second": 115165.717
|
|
},
|
|
{
|
|
"epoch": 0.20725668984550447,
|
|
"grad_norm": 0.1866482049226761,
|
|
"learning_rate": 0.004629983584266224,
|
|
"loss": 3.255748748779297,
|
|
"num_input_tokens_seen": 2008023040,
|
|
"step": 3830,
|
|
"train_runtime": 17435.5785,
|
|
"train_tokens_per_second": 115168.134
|
|
},
|
|
{
|
|
"epoch": 0.20779783002786872,
|
|
"grad_norm": 0.18709656596183777,
|
|
"learning_rate": 0.004627820759913699,
|
|
"loss": 3.2663009643554686,
|
|
"num_input_tokens_seen": 2013265920,
|
|
"step": 3840,
|
|
"train_runtime": 17480.7483,
|
|
"train_tokens_per_second": 115170.465
|
|
},
|
|
{
|
|
"epoch": 0.20833897021023295,
|
|
"grad_norm": 0.19228561222553253,
|
|
"learning_rate": 0.0046256522026607814,
|
|
"loss": 3.2513301849365233,
|
|
"num_input_tokens_seen": 2018508800,
|
|
"step": 3850,
|
|
"train_runtime": 17525.9241,
|
|
"train_tokens_per_second": 115172.746
|
|
},
|
|
{
|
|
"epoch": 0.2088801103925972,
|
|
"grad_norm": 0.17255409061908722,
|
|
"learning_rate": 0.004623477919127976,
|
|
"loss": 3.243180847167969,
|
|
"num_input_tokens_seen": 2023751680,
|
|
"step": 3860,
|
|
"train_runtime": 17571.1102,
|
|
"train_tokens_per_second": 115174.947
|
|
},
|
|
{
|
|
"epoch": 0.20942125057496144,
|
|
"grad_norm": 0.176405668258667,
|
|
"learning_rate": 0.004621297915953271,
|
|
"loss": 3.2499061584472657,
|
|
"num_input_tokens_seen": 2028994560,
|
|
"step": 3870,
|
|
"train_runtime": 17616.3069,
|
|
"train_tokens_per_second": 115177.067
|
|
},
|
|
{
|
|
"epoch": 0.20996239075732567,
|
|
"grad_norm": 0.17931312322616577,
|
|
"learning_rate": 0.004619112199792115,
|
|
"loss": 3.263928985595703,
|
|
"num_input_tokens_seen": 2034237440,
|
|
"step": 3880,
|
|
"train_runtime": 17661.4804,
|
|
"train_tokens_per_second": 115179.328
|
|
},
|
|
{
|
|
"epoch": 0.21050353093968993,
|
|
"grad_norm": 0.1927679032087326,
|
|
"learning_rate": 0.004616920777317401,
|
|
"loss": 3.243641662597656,
|
|
"num_input_tokens_seen": 2039480320,
|
|
"step": 3890,
|
|
"train_runtime": 17706.6588,
|
|
"train_tokens_per_second": 115181.545
|
|
},
|
|
{
|
|
"epoch": 0.21104467112205416,
|
|
"grad_norm": 0.1834532767534256,
|
|
"learning_rate": 0.00461472365521944,
|
|
"loss": 3.2529090881347655,
|
|
"num_input_tokens_seen": 2044723200,
|
|
"step": 3900,
|
|
"train_runtime": 17751.8407,
|
|
"train_tokens_per_second": 115183.728
|
|
},
|
|
{
|
|
"epoch": 0.21158581130441842,
|
|
"grad_norm": 0.18204724788665771,
|
|
"learning_rate": 0.004612520840205942,
|
|
"loss": 3.252873992919922,
|
|
"num_input_tokens_seen": 2049966080,
|
|
"step": 3910,
|
|
"train_runtime": 17797.0199,
|
|
"train_tokens_per_second": 115185.918
|
|
},
|
|
{
|
|
"epoch": 0.21212695148678265,
|
|
"grad_norm": 0.17959408462047577,
|
|
"learning_rate": 0.0046103123390020045,
|
|
"loss": 3.2571083068847657,
|
|
"num_input_tokens_seen": 2055208960,
|
|
"step": 3920,
|
|
"train_runtime": 17842.2041,
|
|
"train_tokens_per_second": 115188.065
|
|
},
|
|
{
|
|
"epoch": 0.21266809166914688,
|
|
"grad_norm": 0.18271717429161072,
|
|
"learning_rate": 0.004608098158350076,
|
|
"loss": 3.2583240509033202,
|
|
"num_input_tokens_seen": 2060451840,
|
|
"step": 3930,
|
|
"train_runtime": 17887.3836,
|
|
"train_tokens_per_second": 115190.23
|
|
},
|
|
{
|
|
"epoch": 0.21320923185151114,
|
|
"grad_norm": 0.1708153486251831,
|
|
"learning_rate": 0.004605878305009951,
|
|
"loss": 3.2490577697753906,
|
|
"num_input_tokens_seen": 2065694720,
|
|
"step": 3940,
|
|
"train_runtime": 17932.5711,
|
|
"train_tokens_per_second": 115192.334
|
|
},
|
|
{
|
|
"epoch": 0.21375037203387537,
|
|
"grad_norm": 0.17891845107078552,
|
|
"learning_rate": 0.004603652785758739,
|
|
"loss": 3.253165435791016,
|
|
"num_input_tokens_seen": 2070937600,
|
|
"step": 3950,
|
|
"train_runtime": 17977.7786,
|
|
"train_tokens_per_second": 115194.299
|
|
},
|
|
{
|
|
"epoch": 0.21429151221623963,
|
|
"grad_norm": 0.19264911115169525,
|
|
"learning_rate": 0.0046014216073908465,
|
|
"loss": 3.252245330810547,
|
|
"num_input_tokens_seen": 2076180480,
|
|
"step": 3960,
|
|
"train_runtime": 18022.9578,
|
|
"train_tokens_per_second": 115196.434
|
|
},
|
|
{
|
|
"epoch": 0.21483265239860386,
|
|
"grad_norm": 0.17727237939834595,
|
|
"learning_rate": 0.00459918477671796,
|
|
"loss": 3.2557418823242186,
|
|
"num_input_tokens_seen": 2081423360,
|
|
"step": 3970,
|
|
"train_runtime": 18068.1382,
|
|
"train_tokens_per_second": 115198.552
|
|
},
|
|
{
|
|
"epoch": 0.2153737925809681,
|
|
"grad_norm": 0.18832355737686157,
|
|
"learning_rate": 0.00459694230056902,
|
|
"loss": 3.2547958374023436,
|
|
"num_input_tokens_seen": 2086666240,
|
|
"step": 3980,
|
|
"train_runtime": 18113.3099,
|
|
"train_tokens_per_second": 115200.714
|
|
},
|
|
{
|
|
"epoch": 0.21591493276333235,
|
|
"grad_norm": 0.1745108813047409,
|
|
"learning_rate": 0.004594694185790203,
|
|
"loss": 3.2427162170410155,
|
|
"num_input_tokens_seen": 2091909120,
|
|
"step": 3990,
|
|
"train_runtime": 18158.476,
|
|
"train_tokens_per_second": 115202.901
|
|
},
|
|
{
|
|
"epoch": 0.21645607294569658,
|
|
"grad_norm": 0.18806034326553345,
|
|
"learning_rate": 0.004592440439244901,
|
|
"loss": 3.247505950927734,
|
|
"num_input_tokens_seen": 2097152000,
|
|
"step": 4000,
|
|
"train_runtime": 18203.6569,
|
|
"train_tokens_per_second": 115204.984
|
|
},
|
|
{
|
|
"epoch": 0.21645607294569658,
|
|
"eval_loss": 3.1910831928253174,
|
|
"eval_runtime": 1.9924,
|
|
"eval_samples_per_second": 250.957,
|
|
"eval_steps_per_second": 4.015,
|
|
"num_input_tokens_seen": 2097152000,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.21699721312806083,
|
|
"grad_norm": 0.18182304501533508,
|
|
"learning_rate": 0.004590181067813696,
|
|
"loss": 3.2401611328125,
|
|
"num_input_tokens_seen": 2102394880,
|
|
"step": 4010,
|
|
"train_runtime": 18253.3021,
|
|
"train_tokens_per_second": 115178.879
|
|
},
|
|
{
|
|
"epoch": 0.21753835331042506,
|
|
"grad_norm": 0.19632118940353394,
|
|
"learning_rate": 0.004587916078394347,
|
|
"loss": 3.248242950439453,
|
|
"num_input_tokens_seen": 2107637760,
|
|
"step": 4020,
|
|
"train_runtime": 18298.4585,
|
|
"train_tokens_per_second": 115181.164
|
|
},
|
|
{
|
|
"epoch": 0.2180794934927893,
|
|
"grad_norm": 0.17511795461177826,
|
|
"learning_rate": 0.004585645477901763,
|
|
"loss": 3.2442108154296876,
|
|
"num_input_tokens_seen": 2112880640,
|
|
"step": 4030,
|
|
"train_runtime": 18343.616,
|
|
"train_tokens_per_second": 115183.432
|
|
},
|
|
{
|
|
"epoch": 0.21862063367515355,
|
|
"grad_norm": 0.18919962644577026,
|
|
"learning_rate": 0.004583369273267981,
|
|
"loss": 3.2474128723144533,
|
|
"num_input_tokens_seen": 2118123520,
|
|
"step": 4040,
|
|
"train_runtime": 18388.8196,
|
|
"train_tokens_per_second": 115185.399
|
|
},
|
|
{
|
|
"epoch": 0.21916177385751778,
|
|
"grad_norm": 0.1883443295955658,
|
|
"learning_rate": 0.00458108747144215,
|
|
"loss": 3.2397232055664062,
|
|
"num_input_tokens_seen": 2123366400,
|
|
"step": 4050,
|
|
"train_runtime": 18433.9903,
|
|
"train_tokens_per_second": 115187.562
|
|
},
|
|
{
|
|
"epoch": 0.21970291403988204,
|
|
"grad_norm": 0.16874343156814575,
|
|
"learning_rate": 0.004578800079390506,
|
|
"loss": 3.243609619140625,
|
|
"num_input_tokens_seen": 2128609280,
|
|
"step": 4060,
|
|
"train_runtime": 18479.1743,
|
|
"train_tokens_per_second": 115189.632
|
|
},
|
|
{
|
|
"epoch": 0.22024405422224627,
|
|
"grad_norm": 0.1780671924352646,
|
|
"learning_rate": 0.004576507104096353,
|
|
"loss": 3.249961090087891,
|
|
"num_input_tokens_seen": 2133852160,
|
|
"step": 4070,
|
|
"train_runtime": 18524.3424,
|
|
"train_tokens_per_second": 115191.79
|
|
},
|
|
{
|
|
"epoch": 0.2207851944046105,
|
|
"grad_norm": 0.1814623475074768,
|
|
"learning_rate": 0.0045742085525600365,
|
|
"loss": 3.247069549560547,
|
|
"num_input_tokens_seen": 2139095040,
|
|
"step": 4080,
|
|
"train_runtime": 18569.5288,
|
|
"train_tokens_per_second": 115193.825
|
|
},
|
|
{
|
|
"epoch": 0.22132633458697476,
|
|
"grad_norm": 0.18335077166557312,
|
|
"learning_rate": 0.004571904431798931,
|
|
"loss": 3.241147994995117,
|
|
"num_input_tokens_seen": 2144337920,
|
|
"step": 4090,
|
|
"train_runtime": 18614.7052,
|
|
"train_tokens_per_second": 115195.911
|
|
},
|
|
{
|
|
"epoch": 0.221867474769339,
|
|
"grad_norm": 0.16724441945552826,
|
|
"learning_rate": 0.004569594748847409,
|
|
"loss": 3.24347038269043,
|
|
"num_input_tokens_seen": 2149580800,
|
|
"step": 4100,
|
|
"train_runtime": 18659.889,
|
|
"train_tokens_per_second": 115197.942
|
|
},
|
|
{
|
|
"epoch": 0.22240861495170325,
|
|
"grad_norm": 0.17200608551502228,
|
|
"learning_rate": 0.004567279510756828,
|
|
"loss": 3.2341545104980467,
|
|
"num_input_tokens_seen": 2154823680,
|
|
"step": 4110,
|
|
"train_runtime": 18705.0717,
|
|
"train_tokens_per_second": 115199.969
|
|
},
|
|
{
|
|
"epoch": 0.22294975513406748,
|
|
"grad_norm": 0.178621307015419,
|
|
"learning_rate": 0.0045649587245955026,
|
|
"loss": 3.2321949005126953,
|
|
"num_input_tokens_seen": 2160066560,
|
|
"step": 4120,
|
|
"train_runtime": 18750.2407,
|
|
"train_tokens_per_second": 115202.071
|
|
},
|
|
{
|
|
"epoch": 0.2234908953164317,
|
|
"grad_norm": 0.18516632914543152,
|
|
"learning_rate": 0.0045626323974486864,
|
|
"loss": 3.238597869873047,
|
|
"num_input_tokens_seen": 2165309440,
|
|
"step": 4130,
|
|
"train_runtime": 18795.4162,
|
|
"train_tokens_per_second": 115204.123
|
|
},
|
|
{
|
|
"epoch": 0.22403203549879597,
|
|
"grad_norm": 0.20164692401885986,
|
|
"learning_rate": 0.004560300536418549,
|
|
"loss": 3.237165832519531,
|
|
"num_input_tokens_seen": 2170552320,
|
|
"step": 4140,
|
|
"train_runtime": 18840.5926,
|
|
"train_tokens_per_second": 115206.16
|
|
},
|
|
{
|
|
"epoch": 0.2245731756811602,
|
|
"grad_norm": 0.1872573047876358,
|
|
"learning_rate": 0.004557963148624155,
|
|
"loss": 3.2406959533691406,
|
|
"num_input_tokens_seen": 2175795200,
|
|
"step": 4150,
|
|
"train_runtime": 18885.7554,
|
|
"train_tokens_per_second": 115208.269
|
|
},
|
|
{
|
|
"epoch": 0.22511431586352446,
|
|
"grad_norm": 0.1811392605304718,
|
|
"learning_rate": 0.0045556202412014414,
|
|
"loss": 3.235840606689453,
|
|
"num_input_tokens_seen": 2181038080,
|
|
"step": 4160,
|
|
"train_runtime": 18930.9167,
|
|
"train_tokens_per_second": 115210.379
|
|
},
|
|
{
|
|
"epoch": 0.22565545604588869,
|
|
"grad_norm": 0.1595960259437561,
|
|
"learning_rate": 0.0045532718213031976,
|
|
"loss": 3.2397125244140623,
|
|
"num_input_tokens_seen": 2186280960,
|
|
"step": 4170,
|
|
"train_runtime": 18976.0748,
|
|
"train_tokens_per_second": 115212.497
|
|
},
|
|
{
|
|
"epoch": 0.22619659622825292,
|
|
"grad_norm": 0.16895633935928345,
|
|
"learning_rate": 0.00455091789609904,
|
|
"loss": 3.2353279113769533,
|
|
"num_input_tokens_seen": 2191523840,
|
|
"step": 4180,
|
|
"train_runtime": 19021.2321,
|
|
"train_tokens_per_second": 115214.61
|
|
},
|
|
{
|
|
"epoch": 0.22673773641061717,
|
|
"grad_norm": 0.17124150693416595,
|
|
"learning_rate": 0.004548558472775396,
|
|
"loss": 3.2387535095214846,
|
|
"num_input_tokens_seen": 2196766720,
|
|
"step": 4190,
|
|
"train_runtime": 19066.399,
|
|
"train_tokens_per_second": 115216.656
|
|
},
|
|
{
|
|
"epoch": 0.2272788765929814,
|
|
"grad_norm": 0.1730462908744812,
|
|
"learning_rate": 0.004546193558535476,
|
|
"loss": 3.228282165527344,
|
|
"num_input_tokens_seen": 2202009600,
|
|
"step": 4200,
|
|
"train_runtime": 19115.1785,
|
|
"train_tokens_per_second": 115196.916
|
|
},
|
|
{
|
|
"epoch": 0.22782001677534566,
|
|
"grad_norm": 0.2116994708776474,
|
|
"learning_rate": 0.004543823160599253,
|
|
"loss": 3.228871154785156,
|
|
"num_input_tokens_seen": 2207252480,
|
|
"step": 4210,
|
|
"train_runtime": 19160.343,
|
|
"train_tokens_per_second": 115199.007
|
|
},
|
|
{
|
|
"epoch": 0.2283611569577099,
|
|
"grad_norm": 0.1870228499174118,
|
|
"learning_rate": 0.004541447286203444,
|
|
"loss": 3.2268039703369142,
|
|
"num_input_tokens_seen": 2212495360,
|
|
"step": 4220,
|
|
"train_runtime": 19205.5057,
|
|
"train_tokens_per_second": 115201.099
|
|
},
|
|
{
|
|
"epoch": 0.22890229714007412,
|
|
"grad_norm": 0.18021926283836365,
|
|
"learning_rate": 0.004539065942601484,
|
|
"loss": 3.2385711669921875,
|
|
"num_input_tokens_seen": 2217738240,
|
|
"step": 4230,
|
|
"train_runtime": 19250.6698,
|
|
"train_tokens_per_second": 115203.173
|
|
},
|
|
{
|
|
"epoch": 0.22944343732243838,
|
|
"grad_norm": 0.178096741437912,
|
|
"learning_rate": 0.004536679137063506,
|
|
"loss": 3.2425048828125,
|
|
"num_input_tokens_seen": 2222981120,
|
|
"step": 4240,
|
|
"train_runtime": 19295.8409,
|
|
"train_tokens_per_second": 115205.195
|
|
},
|
|
{
|
|
"epoch": 0.2299845775048026,
|
|
"grad_norm": 0.17963331937789917,
|
|
"learning_rate": 0.004534286876876316,
|
|
"loss": 3.2272270202636717,
|
|
"num_input_tokens_seen": 2228224000,
|
|
"step": 4250,
|
|
"train_runtime": 19341.0226,
|
|
"train_tokens_per_second": 115207.145
|
|
},
|
|
{
|
|
"epoch": 0.23052571768716687,
|
|
"grad_norm": 0.1644730269908905,
|
|
"learning_rate": 0.004531889169343374,
|
|
"loss": 3.232299041748047,
|
|
"num_input_tokens_seen": 2233466880,
|
|
"step": 4260,
|
|
"train_runtime": 19386.1815,
|
|
"train_tokens_per_second": 115209.221
|
|
},
|
|
{
|
|
"epoch": 0.2310668578695311,
|
|
"grad_norm": 0.18202030658721924,
|
|
"learning_rate": 0.004529486021784774,
|
|
"loss": 3.232588195800781,
|
|
"num_input_tokens_seen": 2238709760,
|
|
"step": 4270,
|
|
"train_runtime": 19431.3552,
|
|
"train_tokens_per_second": 115211.201
|
|
},
|
|
{
|
|
"epoch": 0.23160799805189533,
|
|
"grad_norm": 0.1674603968858719,
|
|
"learning_rate": 0.004527077441537213,
|
|
"loss": 3.2268638610839844,
|
|
"num_input_tokens_seen": 2243952640,
|
|
"step": 4280,
|
|
"train_runtime": 19476.5366,
|
|
"train_tokens_per_second": 115213.125
|
|
},
|
|
{
|
|
"epoch": 0.2321491382342596,
|
|
"grad_norm": 0.17482970654964447,
|
|
"learning_rate": 0.004524663435953974,
|
|
"loss": 3.231060791015625,
|
|
"num_input_tokens_seen": 2249195520,
|
|
"step": 4290,
|
|
"train_runtime": 19521.6994,
|
|
"train_tokens_per_second": 115215.15
|
|
},
|
|
{
|
|
"epoch": 0.23269027841662382,
|
|
"grad_norm": 0.1693650186061859,
|
|
"learning_rate": 0.004522244012404908,
|
|
"loss": 3.2219474792480467,
|
|
"num_input_tokens_seen": 2254438400,
|
|
"step": 4300,
|
|
"train_runtime": 19566.837,
|
|
"train_tokens_per_second": 115217.314
|
|
},
|
|
{
|
|
"epoch": 0.23323141859898808,
|
|
"grad_norm": 0.16282694041728973,
|
|
"learning_rate": 0.004519819178276401,
|
|
"loss": 3.214075469970703,
|
|
"num_input_tokens_seen": 2259681280,
|
|
"step": 4310,
|
|
"train_runtime": 19611.992,
|
|
"train_tokens_per_second": 115219.366
|
|
},
|
|
{
|
|
"epoch": 0.2337725587813523,
|
|
"grad_norm": 0.17166836559772491,
|
|
"learning_rate": 0.004517388940971363,
|
|
"loss": 3.229071044921875,
|
|
"num_input_tokens_seen": 2264924160,
|
|
"step": 4320,
|
|
"train_runtime": 19657.1365,
|
|
"train_tokens_per_second": 115221.47
|
|
},
|
|
{
|
|
"epoch": 0.23431369896371654,
|
|
"grad_norm": 0.1811853051185608,
|
|
"learning_rate": 0.004514953307909195,
|
|
"loss": 3.2278045654296874,
|
|
"num_input_tokens_seen": 2270167040,
|
|
"step": 4330,
|
|
"train_runtime": 19702.2886,
|
|
"train_tokens_per_second": 115223.52
|
|
},
|
|
{
|
|
"epoch": 0.2348548391460808,
|
|
"grad_norm": 0.18831849098205566,
|
|
"learning_rate": 0.0045125122865257725,
|
|
"loss": 3.2335960388183596,
|
|
"num_input_tokens_seen": 2275409920,
|
|
"step": 4340,
|
|
"train_runtime": 19747.4554,
|
|
"train_tokens_per_second": 115225.475
|
|
},
|
|
{
|
|
"epoch": 0.23539597932844503,
|
|
"grad_norm": 0.18200556933879852,
|
|
"learning_rate": 0.004510065884273422,
|
|
"loss": 3.230799102783203,
|
|
"num_input_tokens_seen": 2280652800,
|
|
"step": 4350,
|
|
"train_runtime": 19792.6287,
|
|
"train_tokens_per_second": 115227.383
|
|
},
|
|
{
|
|
"epoch": 0.23593711951080928,
|
|
"grad_norm": 0.18054424226284027,
|
|
"learning_rate": 0.004507614108620896,
|
|
"loss": 3.2332107543945314,
|
|
"num_input_tokens_seen": 2285895680,
|
|
"step": 4360,
|
|
"train_runtime": 19837.7879,
|
|
"train_tokens_per_second": 115229.364
|
|
},
|
|
{
|
|
"epoch": 0.23647825969317351,
|
|
"grad_norm": 0.17672619223594666,
|
|
"learning_rate": 0.004505156967053355,
|
|
"loss": 3.229229736328125,
|
|
"num_input_tokens_seen": 2291138560,
|
|
"step": 4370,
|
|
"train_runtime": 19882.9214,
|
|
"train_tokens_per_second": 115231.485
|
|
},
|
|
{
|
|
"epoch": 0.23701939987553775,
|
|
"grad_norm": 0.18023458123207092,
|
|
"learning_rate": 0.004502694467072336,
|
|
"loss": 3.221567916870117,
|
|
"num_input_tokens_seen": 2296381440,
|
|
"step": 4380,
|
|
"train_runtime": 19928.0618,
|
|
"train_tokens_per_second": 115233.557
|
|
},
|
|
{
|
|
"epoch": 0.237560540057902,
|
|
"grad_norm": 0.18084236979484558,
|
|
"learning_rate": 0.0045002266161957415,
|
|
"loss": 3.2244552612304687,
|
|
"num_input_tokens_seen": 2301624320,
|
|
"step": 4390,
|
|
"train_runtime": 19973.1892,
|
|
"train_tokens_per_second": 115235.694
|
|
},
|
|
{
|
|
"epoch": 0.23810168024026623,
|
|
"grad_norm": 0.17632804811000824,
|
|
"learning_rate": 0.004497753421957804,
|
|
"loss": 3.2264179229736327,
|
|
"num_input_tokens_seen": 2306867200,
|
|
"step": 4400,
|
|
"train_runtime": 20018.2945,
|
|
"train_tokens_per_second": 115237.949
|
|
},
|
|
{
|
|
"epoch": 0.2386428204226305,
|
|
"grad_norm": 0.18496030569076538,
|
|
"learning_rate": 0.004495274891909074,
|
|
"loss": 3.2306861877441406,
|
|
"num_input_tokens_seen": 2312110080,
|
|
"step": 4410,
|
|
"train_runtime": 20063.4387,
|
|
"train_tokens_per_second": 115239.97
|
|
},
|
|
{
|
|
"epoch": 0.23918396060499472,
|
|
"grad_norm": 0.19217975437641144,
|
|
"learning_rate": 0.004492791033616388,
|
|
"loss": 3.2278289794921875,
|
|
"num_input_tokens_seen": 2317352960,
|
|
"step": 4420,
|
|
"train_runtime": 20108.5903,
|
|
"train_tokens_per_second": 115241.94
|
|
},
|
|
{
|
|
"epoch": 0.23972510078735895,
|
|
"grad_norm": 0.17978626489639282,
|
|
"learning_rate": 0.004490301854662851,
|
|
"loss": 3.222820281982422,
|
|
"num_input_tokens_seen": 2322595840,
|
|
"step": 4430,
|
|
"train_runtime": 20153.7635,
|
|
"train_tokens_per_second": 115243.778
|
|
},
|
|
{
|
|
"epoch": 0.2402662409697232,
|
|
"grad_norm": 0.1925116330385208,
|
|
"learning_rate": 0.0044878073626478145,
|
|
"loss": 3.216511535644531,
|
|
"num_input_tokens_seen": 2327838720,
|
|
"step": 4440,
|
|
"train_runtime": 20198.9336,
|
|
"train_tokens_per_second": 115245.625
|
|
},
|
|
{
|
|
"epoch": 0.24080738115208744,
|
|
"grad_norm": 0.1764633059501648,
|
|
"learning_rate": 0.004485307565186844,
|
|
"loss": 3.2247901916503907,
|
|
"num_input_tokens_seen": 2333081600,
|
|
"step": 4450,
|
|
"train_runtime": 20244.1177,
|
|
"train_tokens_per_second": 115247.383
|
|
},
|
|
{
|
|
"epoch": 0.2413485213344517,
|
|
"grad_norm": 0.18181581795215607,
|
|
"learning_rate": 0.0044828024699117095,
|
|
"loss": 3.2144775390625,
|
|
"num_input_tokens_seen": 2338324480,
|
|
"step": 4460,
|
|
"train_runtime": 20289.2915,
|
|
"train_tokens_per_second": 115249.193
|
|
},
|
|
{
|
|
"epoch": 0.24188966151681593,
|
|
"grad_norm": 0.1797225922346115,
|
|
"learning_rate": 0.0044802920844703486,
|
|
"loss": 3.2179054260253905,
|
|
"num_input_tokens_seen": 2343567360,
|
|
"step": 4470,
|
|
"train_runtime": 20334.4596,
|
|
"train_tokens_per_second": 115251.028
|
|
},
|
|
{
|
|
"epoch": 0.24243080169918016,
|
|
"grad_norm": 0.17131617665290833,
|
|
"learning_rate": 0.004477776416526856,
|
|
"loss": 3.2136348724365233,
|
|
"num_input_tokens_seen": 2348810240,
|
|
"step": 4480,
|
|
"train_runtime": 20379.6251,
|
|
"train_tokens_per_second": 115252.868
|
|
},
|
|
{
|
|
"epoch": 0.24297194188154442,
|
|
"grad_norm": 0.17490802705287933,
|
|
"learning_rate": 0.004475255473761447,
|
|
"loss": 3.223601531982422,
|
|
"num_input_tokens_seen": 2354053120,
|
|
"step": 4490,
|
|
"train_runtime": 20424.7932,
|
|
"train_tokens_per_second": 115254.686
|
|
},
|
|
{
|
|
"epoch": 0.24351308206390865,
|
|
"grad_norm": 0.18434712290763855,
|
|
"learning_rate": 0.004472729263870446,
|
|
"loss": 3.219706726074219,
|
|
"num_input_tokens_seen": 2359296000,
|
|
"step": 4500,
|
|
"train_runtime": 20469.9487,
|
|
"train_tokens_per_second": 115256.566
|
|
},
|
|
{
|
|
"epoch": 0.24351308206390865,
|
|
"eval_loss": 3.1681437492370605,
|
|
"eval_runtime": 1.9847,
|
|
"eval_samples_per_second": 251.921,
|
|
"eval_steps_per_second": 4.031,
|
|
"num_input_tokens_seen": 2359296000,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.2440542222462729,
|
|
"grad_norm": 0.18588702380657196,
|
|
"learning_rate": 0.0044701977945662535,
|
|
"loss": 3.2231178283691406,
|
|
"num_input_tokens_seen": 2364538880,
|
|
"step": 4510,
|
|
"train_runtime": 20517.077,
|
|
"train_tokens_per_second": 115247.356
|
|
},
|
|
{
|
|
"epoch": 0.24459536242863714,
|
|
"grad_norm": 0.1629338264465332,
|
|
"learning_rate": 0.004467661073577332,
|
|
"loss": 3.2203128814697264,
|
|
"num_input_tokens_seen": 2369781760,
|
|
"step": 4520,
|
|
"train_runtime": 20562.2414,
|
|
"train_tokens_per_second": 115249.195
|
|
},
|
|
{
|
|
"epoch": 0.24513650261100137,
|
|
"grad_norm": 0.19198022782802582,
|
|
"learning_rate": 0.00446511910864817,
|
|
"loss": 3.2169837951660156,
|
|
"num_input_tokens_seen": 2375024640,
|
|
"step": 4530,
|
|
"train_runtime": 20607.3939,
|
|
"train_tokens_per_second": 115251.092
|
|
},
|
|
{
|
|
"epoch": 0.24567764279336562,
|
|
"grad_norm": 0.17990648746490479,
|
|
"learning_rate": 0.004462571907539273,
|
|
"loss": 3.2237472534179688,
|
|
"num_input_tokens_seen": 2380267520,
|
|
"step": 4540,
|
|
"train_runtime": 20652.5476,
|
|
"train_tokens_per_second": 115252.974
|
|
},
|
|
{
|
|
"epoch": 0.24621878297572986,
|
|
"grad_norm": 0.17402444779872894,
|
|
"learning_rate": 0.004460019478027127,
|
|
"loss": 3.2200748443603517,
|
|
"num_input_tokens_seen": 2385510400,
|
|
"step": 4550,
|
|
"train_runtime": 20697.698,
|
|
"train_tokens_per_second": 115254.865
|
|
},
|
|
{
|
|
"epoch": 0.2467599231580941,
|
|
"grad_norm": 0.17017342150211334,
|
|
"learning_rate": 0.004457461827904183,
|
|
"loss": 3.2241039276123047,
|
|
"num_input_tokens_seen": 2390753280,
|
|
"step": 4560,
|
|
"train_runtime": 20742.8484,
|
|
"train_tokens_per_second": 115256.749
|
|
},
|
|
{
|
|
"epoch": 0.24730106334045834,
|
|
"grad_norm": 0.17307622730731964,
|
|
"learning_rate": 0.004454898964978828,
|
|
"loss": 3.2237174987792967,
|
|
"num_input_tokens_seen": 2395996160,
|
|
"step": 4570,
|
|
"train_runtime": 20788.0181,
|
|
"train_tokens_per_second": 115258.518
|
|
},
|
|
{
|
|
"epoch": 0.24784220352282257,
|
|
"grad_norm": 0.18426761031150818,
|
|
"learning_rate": 0.004452330897075365,
|
|
"loss": 3.2148464202880858,
|
|
"num_input_tokens_seen": 2401239040,
|
|
"step": 4580,
|
|
"train_runtime": 20836.8458,
|
|
"train_tokens_per_second": 115240.045
|
|
},
|
|
{
|
|
"epoch": 0.24838334370518683,
|
|
"grad_norm": 0.18935158848762512,
|
|
"learning_rate": 0.004449757632033987,
|
|
"loss": 3.2203189849853517,
|
|
"num_input_tokens_seen": 2406481920,
|
|
"step": 4590,
|
|
"train_runtime": 20882.0043,
|
|
"train_tokens_per_second": 115241.903
|
|
},
|
|
{
|
|
"epoch": 0.24892448388755106,
|
|
"grad_norm": 0.17916643619537354,
|
|
"learning_rate": 0.004447179177710755,
|
|
"loss": 3.220214080810547,
|
|
"num_input_tokens_seen": 2411724800,
|
|
"step": 4600,
|
|
"train_runtime": 20927.1721,
|
|
"train_tokens_per_second": 115243.703
|
|
},
|
|
{
|
|
"epoch": 0.24946562406991532,
|
|
"grad_norm": 0.1714351922273636,
|
|
"learning_rate": 0.0044445955419775696,
|
|
"loss": 3.2130130767822265,
|
|
"num_input_tokens_seen": 2416967680,
|
|
"step": 4610,
|
|
"train_runtime": 20972.3937,
|
|
"train_tokens_per_second": 115245.199
|
|
},
|
|
{
|
|
"epoch": 0.2500067642522796,
|
|
"grad_norm": 0.17399680614471436,
|
|
"learning_rate": 0.004442006732722152,
|
|
"loss": 3.2150115966796875,
|
|
"num_input_tokens_seen": 2422210560,
|
|
"step": 4620,
|
|
"train_runtime": 21017.5875,
|
|
"train_tokens_per_second": 115246.841
|
|
},
|
|
{
|
|
"epoch": 0.2505479044346438,
|
|
"grad_norm": 0.18769405782222748,
|
|
"learning_rate": 0.00443941275784802,
|
|
"loss": 3.2187454223632814,
|
|
"num_input_tokens_seen": 2427453440,
|
|
"step": 4630,
|
|
"train_runtime": 21062.785,
|
|
"train_tokens_per_second": 115248.456
|
|
},
|
|
{
|
|
"epoch": 0.25108904461700804,
|
|
"grad_norm": 0.1909925937652588,
|
|
"learning_rate": 0.004436813625274458,
|
|
"loss": 3.228108215332031,
|
|
"num_input_tokens_seen": 2432696320,
|
|
"step": 4640,
|
|
"train_runtime": 21107.9893,
|
|
"train_tokens_per_second": 115250.026
|
|
},
|
|
{
|
|
"epoch": 0.25163018479937227,
|
|
"grad_norm": 0.16732154786586761,
|
|
"learning_rate": 0.004434209342936497,
|
|
"loss": 3.213469314575195,
|
|
"num_input_tokens_seen": 2437939200,
|
|
"step": 4650,
|
|
"train_runtime": 21153.1981,
|
|
"train_tokens_per_second": 115251.566
|
|
},
|
|
{
|
|
"epoch": 0.2521713249817365,
|
|
"grad_norm": 0.17457814514636993,
|
|
"learning_rate": 0.0044315999187848915,
|
|
"loss": 3.224944305419922,
|
|
"num_input_tokens_seen": 2443182080,
|
|
"step": 4660,
|
|
"train_runtime": 21198.4016,
|
|
"train_tokens_per_second": 115253.127
|
|
},
|
|
{
|
|
"epoch": 0.2527124651641008,
|
|
"grad_norm": 0.192255899310112,
|
|
"learning_rate": 0.004428985360786096,
|
|
"loss": 3.227398681640625,
|
|
"num_input_tokens_seen": 2448424960,
|
|
"step": 4670,
|
|
"train_runtime": 21243.6232,
|
|
"train_tokens_per_second": 115254.584
|
|
},
|
|
{
|
|
"epoch": 0.253253605346465,
|
|
"grad_norm": 0.17784617841243744,
|
|
"learning_rate": 0.004426365676922234,
|
|
"loss": 3.2128623962402343,
|
|
"num_input_tokens_seen": 2453667840,
|
|
"step": 4680,
|
|
"train_runtime": 21288.8398,
|
|
"train_tokens_per_second": 115256.062
|
|
},
|
|
{
|
|
"epoch": 0.25379474552882925,
|
|
"grad_norm": 0.17195752263069153,
|
|
"learning_rate": 0.00442374087519108,
|
|
"loss": 3.2142982482910156,
|
|
"num_input_tokens_seen": 2458910720,
|
|
"step": 4690,
|
|
"train_runtime": 21334.0638,
|
|
"train_tokens_per_second": 115257.493
|
|
},
|
|
{
|
|
"epoch": 0.2543358857111935,
|
|
"grad_norm": 0.1722942292690277,
|
|
"learning_rate": 0.004421110963606032,
|
|
"loss": 3.210185241699219,
|
|
"num_input_tokens_seen": 2464153600,
|
|
"step": 4700,
|
|
"train_runtime": 21379.267,
|
|
"train_tokens_per_second": 115259.031
|
|
},
|
|
{
|
|
"epoch": 0.2548770258935577,
|
|
"grad_norm": 0.16966107487678528,
|
|
"learning_rate": 0.00441847595019609,
|
|
"loss": 3.2123428344726563,
|
|
"num_input_tokens_seen": 2469396480,
|
|
"step": 4710,
|
|
"train_runtime": 21424.483,
|
|
"train_tokens_per_second": 115260.493
|
|
},
|
|
{
|
|
"epoch": 0.255418166075922,
|
|
"grad_norm": 0.18033796548843384,
|
|
"learning_rate": 0.004415835843005828,
|
|
"loss": 3.2065505981445312,
|
|
"num_input_tokens_seen": 2474639360,
|
|
"step": 4720,
|
|
"train_runtime": 21469.6877,
|
|
"train_tokens_per_second": 115262.01
|
|
},
|
|
{
|
|
"epoch": 0.2559593062582862,
|
|
"grad_norm": 0.18272215127944946,
|
|
"learning_rate": 0.004413190650095373,
|
|
"loss": 3.2069171905517577,
|
|
"num_input_tokens_seen": 2479882240,
|
|
"step": 4730,
|
|
"train_runtime": 21514.9,
|
|
"train_tokens_per_second": 115263.48
|
|
},
|
|
{
|
|
"epoch": 0.25650044644065045,
|
|
"grad_norm": 0.17386901378631592,
|
|
"learning_rate": 0.004410540379540377,
|
|
"loss": 3.2177162170410156,
|
|
"num_input_tokens_seen": 2485125120,
|
|
"step": 4740,
|
|
"train_runtime": 21560.142,
|
|
"train_tokens_per_second": 115264.784
|
|
},
|
|
{
|
|
"epoch": 0.2570415866230147,
|
|
"grad_norm": 0.17471922934055328,
|
|
"learning_rate": 0.0044078850394319935,
|
|
"loss": 3.2096931457519533,
|
|
"num_input_tokens_seen": 2490368000,
|
|
"step": 4750,
|
|
"train_runtime": 21605.3695,
|
|
"train_tokens_per_second": 115266.161
|
|
},
|
|
{
|
|
"epoch": 0.2575827268053789,
|
|
"grad_norm": 0.188929483294487,
|
|
"learning_rate": 0.004405224637876854,
|
|
"loss": 3.215177536010742,
|
|
"num_input_tokens_seen": 2495610880,
|
|
"step": 4760,
|
|
"train_runtime": 21650.575,
|
|
"train_tokens_per_second": 115267.649
|
|
},
|
|
{
|
|
"epoch": 0.2581238669877432,
|
|
"grad_norm": 0.18201977014541626,
|
|
"learning_rate": 0.0044025591829970415,
|
|
"loss": 3.2025718688964844,
|
|
"num_input_tokens_seen": 2500853760,
|
|
"step": 4770,
|
|
"train_runtime": 21695.7857,
|
|
"train_tokens_per_second": 115269.103
|
|
},
|
|
{
|
|
"epoch": 0.25866500717010743,
|
|
"grad_norm": 0.18745562434196472,
|
|
"learning_rate": 0.004399888682930069,
|
|
"loss": 3.2124725341796876,
|
|
"num_input_tokens_seen": 2506096640,
|
|
"step": 4780,
|
|
"train_runtime": 21740.9904,
|
|
"train_tokens_per_second": 115270.583
|
|
},
|
|
{
|
|
"epoch": 0.25920614735247166,
|
|
"grad_norm": 0.18076451122760773,
|
|
"learning_rate": 0.004397213145828847,
|
|
"loss": 3.2005435943603517,
|
|
"num_input_tokens_seen": 2511339520,
|
|
"step": 4790,
|
|
"train_runtime": 21786.1967,
|
|
"train_tokens_per_second": 115272.049
|
|
},
|
|
{
|
|
"epoch": 0.2597472875348359,
|
|
"grad_norm": 0.16596098244190216,
|
|
"learning_rate": 0.004394532579861671,
|
|
"loss": 3.197236251831055,
|
|
"num_input_tokens_seen": 2516582400,
|
|
"step": 4800,
|
|
"train_runtime": 21831.4029,
|
|
"train_tokens_per_second": 115273.508
|
|
},
|
|
{
|
|
"epoch": 0.2602884277172001,
|
|
"grad_norm": 0.17128406465053558,
|
|
"learning_rate": 0.004391846993212182,
|
|
"loss": 3.2089080810546875,
|
|
"num_input_tokens_seen": 2521825280,
|
|
"step": 4810,
|
|
"train_runtime": 21876.6005,
|
|
"train_tokens_per_second": 115275.007
|
|
},
|
|
{
|
|
"epoch": 0.2608295678995644,
|
|
"grad_norm": 0.17832306027412415,
|
|
"learning_rate": 0.004389156394079355,
|
|
"loss": 3.202547073364258,
|
|
"num_input_tokens_seen": 2527068160,
|
|
"step": 4820,
|
|
"train_runtime": 21921.8037,
|
|
"train_tokens_per_second": 115276.471
|
|
},
|
|
{
|
|
"epoch": 0.26137070808192864,
|
|
"grad_norm": 0.16681405901908875,
|
|
"learning_rate": 0.004386460790677465,
|
|
"loss": 3.2106822967529296,
|
|
"num_input_tokens_seen": 2532311040,
|
|
"step": 4830,
|
|
"train_runtime": 21967.0048,
|
|
"train_tokens_per_second": 115277.939
|
|
},
|
|
{
|
|
"epoch": 0.26191184826429287,
|
|
"grad_norm": 0.17566899955272675,
|
|
"learning_rate": 0.004383760191236065,
|
|
"loss": 3.2070526123046874,
|
|
"num_input_tokens_seen": 2537553920,
|
|
"step": 4840,
|
|
"train_runtime": 22012.208,
|
|
"train_tokens_per_second": 115279.39
|
|
},
|
|
{
|
|
"epoch": 0.2624529884466571,
|
|
"grad_norm": 0.17574016749858856,
|
|
"learning_rate": 0.00438105460399996,
|
|
"loss": 3.203447723388672,
|
|
"num_input_tokens_seen": 2542796800,
|
|
"step": 4850,
|
|
"train_runtime": 22057.4092,
|
|
"train_tokens_per_second": 115280.846
|
|
},
|
|
{
|
|
"epoch": 0.26299412862902133,
|
|
"grad_norm": 0.16241556406021118,
|
|
"learning_rate": 0.004378344037229184,
|
|
"loss": 3.2026832580566404,
|
|
"num_input_tokens_seen": 2548039680,
|
|
"step": 4860,
|
|
"train_runtime": 22102.6211,
|
|
"train_tokens_per_second": 115282.24
|
|
},
|
|
{
|
|
"epoch": 0.2635352688113856,
|
|
"grad_norm": 0.1805507242679596,
|
|
"learning_rate": 0.004375628499198973,
|
|
"loss": 3.2010284423828126,
|
|
"num_input_tokens_seen": 2553282560,
|
|
"step": 4870,
|
|
"train_runtime": 22147.8116,
|
|
"train_tokens_per_second": 115283.74
|
|
},
|
|
{
|
|
"epoch": 0.26407640899374984,
|
|
"grad_norm": 0.16756032407283783,
|
|
"learning_rate": 0.004372907998199739,
|
|
"loss": 3.2070991516113283,
|
|
"num_input_tokens_seen": 2558525440,
|
|
"step": 4880,
|
|
"train_runtime": 22192.9705,
|
|
"train_tokens_per_second": 115285.398
|
|
},
|
|
{
|
|
"epoch": 0.2646175491761141,
|
|
"grad_norm": 0.18972600996494293,
|
|
"learning_rate": 0.004370182542537047,
|
|
"loss": 3.214699554443359,
|
|
"num_input_tokens_seen": 2563768320,
|
|
"step": 4890,
|
|
"train_runtime": 22238.1209,
|
|
"train_tokens_per_second": 115287.094
|
|
},
|
|
{
|
|
"epoch": 0.2651586893584783,
|
|
"grad_norm": 0.1896647959947586,
|
|
"learning_rate": 0.004367452140531587,
|
|
"loss": 3.205576705932617,
|
|
"num_input_tokens_seen": 2569011200,
|
|
"step": 4900,
|
|
"train_runtime": 22283.3129,
|
|
"train_tokens_per_second": 115288.566
|
|
},
|
|
{
|
|
"epoch": 0.26569982954084254,
|
|
"grad_norm": 0.18498484790325165,
|
|
"learning_rate": 0.004364716800519152,
|
|
"loss": 3.2080978393554687,
|
|
"num_input_tokens_seen": 2574254080,
|
|
"step": 4910,
|
|
"train_runtime": 22328.4859,
|
|
"train_tokens_per_second": 115290.132
|
|
},
|
|
{
|
|
"epoch": 0.2662409697232068,
|
|
"grad_norm": 0.1854403018951416,
|
|
"learning_rate": 0.0043619765308506074,
|
|
"loss": 3.203238677978516,
|
|
"num_input_tokens_seen": 2579496960,
|
|
"step": 4920,
|
|
"train_runtime": 22373.6522,
|
|
"train_tokens_per_second": 115291.725
|
|
},
|
|
{
|
|
"epoch": 0.26678210990557105,
|
|
"grad_norm": 0.1691334992647171,
|
|
"learning_rate": 0.004359231339891872,
|
|
"loss": 3.1914302825927736,
|
|
"num_input_tokens_seen": 2584739840,
|
|
"step": 4930,
|
|
"train_runtime": 22418.8106,
|
|
"train_tokens_per_second": 115293.353
|
|
},
|
|
{
|
|
"epoch": 0.2673232500879353,
|
|
"grad_norm": 0.17332448065280914,
|
|
"learning_rate": 0.004356481236023887,
|
|
"loss": 3.2087932586669923,
|
|
"num_input_tokens_seen": 2589982720,
|
|
"step": 4940,
|
|
"train_runtime": 22463.9738,
|
|
"train_tokens_per_second": 115294.949
|
|
},
|
|
{
|
|
"epoch": 0.2678643902702995,
|
|
"grad_norm": 0.1679113507270813,
|
|
"learning_rate": 0.004353726227642593,
|
|
"loss": 3.2014122009277344,
|
|
"num_input_tokens_seen": 2595225600,
|
|
"step": 4950,
|
|
"train_runtime": 22509.1287,
|
|
"train_tokens_per_second": 115296.582
|
|
},
|
|
{
|
|
"epoch": 0.26840553045266374,
|
|
"grad_norm": 0.16913928091526031,
|
|
"learning_rate": 0.004350966323158903,
|
|
"loss": 3.1890819549560545,
|
|
"num_input_tokens_seen": 2600468480,
|
|
"step": 4960,
|
|
"train_runtime": 22554.2873,
|
|
"train_tokens_per_second": 115298.189
|
|
},
|
|
{
|
|
"epoch": 0.26894667063502803,
|
|
"grad_norm": 0.16906581819057465,
|
|
"learning_rate": 0.00434820153099868,
|
|
"loss": 3.202825927734375,
|
|
"num_input_tokens_seen": 2605711360,
|
|
"step": 4970,
|
|
"train_runtime": 22602.9949,
|
|
"train_tokens_per_second": 115281.686
|
|
},
|
|
{
|
|
"epoch": 0.26948781081739226,
|
|
"grad_norm": 0.16878265142440796,
|
|
"learning_rate": 0.004345431859602706,
|
|
"loss": 3.200624465942383,
|
|
"num_input_tokens_seen": 2610954240,
|
|
"step": 4980,
|
|
"train_runtime": 22648.1981,
|
|
"train_tokens_per_second": 115283.089
|
|
},
|
|
{
|
|
"epoch": 0.2700289509997565,
|
|
"grad_norm": 0.1862846463918686,
|
|
"learning_rate": 0.004342657317426662,
|
|
"loss": 3.206439971923828,
|
|
"num_input_tokens_seen": 2616197120,
|
|
"step": 4990,
|
|
"train_runtime": 22693.3935,
|
|
"train_tokens_per_second": 115284.526
|
|
},
|
|
{
|
|
"epoch": 0.2705700911821207,
|
|
"grad_norm": 0.16954657435417175,
|
|
"learning_rate": 0.004339877912941097,
|
|
"loss": 3.199533462524414,
|
|
"num_input_tokens_seen": 2621440000,
|
|
"step": 5000,
|
|
"train_runtime": 22738.6005,
|
|
"train_tokens_per_second": 115285.899
|
|
},
|
|
{
|
|
"epoch": 0.2705700911821207,
|
|
"eval_loss": 3.146559715270996,
|
|
"eval_runtime": 1.9859,
|
|
"eval_samples_per_second": 251.773,
|
|
"eval_steps_per_second": 4.028,
|
|
"num_input_tokens_seen": 2621440000,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.27111123136448495,
|
|
"grad_norm": 0.1746288388967514,
|
|
"learning_rate": 0.004337093654631402,
|
|
"loss": 3.195170593261719,
|
|
"num_input_tokens_seen": 2626682880,
|
|
"step": 5010,
|
|
"train_runtime": 22788.1861,
|
|
"train_tokens_per_second": 115265.114
|
|
},
|
|
{
|
|
"epoch": 0.27165237154684924,
|
|
"grad_norm": 0.182390496134758,
|
|
"learning_rate": 0.004334304550997793,
|
|
"loss": 3.184975433349609,
|
|
"num_input_tokens_seen": 2631925760,
|
|
"step": 5020,
|
|
"train_runtime": 22833.4175,
|
|
"train_tokens_per_second": 115266.397
|
|
},
|
|
{
|
|
"epoch": 0.27219351172921347,
|
|
"grad_norm": 0.18103830516338348,
|
|
"learning_rate": 0.004331510610555275,
|
|
"loss": 3.190489959716797,
|
|
"num_input_tokens_seen": 2637168640,
|
|
"step": 5030,
|
|
"train_runtime": 22878.6263,
|
|
"train_tokens_per_second": 115267.788
|
|
},
|
|
{
|
|
"epoch": 0.2727346519115777,
|
|
"grad_norm": 0.1782936155796051,
|
|
"learning_rate": 0.004328711841833618,
|
|
"loss": 3.196137237548828,
|
|
"num_input_tokens_seen": 2642411520,
|
|
"step": 5040,
|
|
"train_runtime": 22923.8218,
|
|
"train_tokens_per_second": 115269.24
|
|
},
|
|
{
|
|
"epoch": 0.2732757920939419,
|
|
"grad_norm": 0.185542032122612,
|
|
"learning_rate": 0.0043259082533773354,
|
|
"loss": 3.190313720703125,
|
|
"num_input_tokens_seen": 2647654400,
|
|
"step": 5050,
|
|
"train_runtime": 22969.0255,
|
|
"train_tokens_per_second": 115270.646
|
|
},
|
|
{
|
|
"epoch": 0.27381693227630616,
|
|
"grad_norm": 0.16143307089805603,
|
|
"learning_rate": 0.0043230998537456536,
|
|
"loss": 3.2025264739990233,
|
|
"num_input_tokens_seen": 2652897280,
|
|
"step": 5060,
|
|
"train_runtime": 23014.1965,
|
|
"train_tokens_per_second": 115272.209
|
|
},
|
|
{
|
|
"epoch": 0.27435807245867044,
|
|
"grad_norm": 0.16813282668590546,
|
|
"learning_rate": 0.004320286651512486,
|
|
"loss": 3.1958364486694335,
|
|
"num_input_tokens_seen": 2658140160,
|
|
"step": 5070,
|
|
"train_runtime": 23059.3886,
|
|
"train_tokens_per_second": 115273.662
|
|
},
|
|
{
|
|
"epoch": 0.2748992126410347,
|
|
"grad_norm": 0.18112315237522125,
|
|
"learning_rate": 0.004317468655266412,
|
|
"loss": 3.194669723510742,
|
|
"num_input_tokens_seen": 2663383040,
|
|
"step": 5080,
|
|
"train_runtime": 23104.5863,
|
|
"train_tokens_per_second": 115275.08
|
|
},
|
|
{
|
|
"epoch": 0.2754403528233989,
|
|
"grad_norm": 0.18187521398067474,
|
|
"learning_rate": 0.004314645873610643,
|
|
"loss": 3.1878196716308596,
|
|
"num_input_tokens_seen": 2668625920,
|
|
"step": 5090,
|
|
"train_runtime": 23149.7951,
|
|
"train_tokens_per_second": 115276.438
|
|
},
|
|
{
|
|
"epoch": 0.27598149300576313,
|
|
"grad_norm": 0.16804195940494537,
|
|
"learning_rate": 0.004311818315163001,
|
|
"loss": 3.2023330688476563,
|
|
"num_input_tokens_seen": 2673868800,
|
|
"step": 5100,
|
|
"train_runtime": 23194.9829,
|
|
"train_tokens_per_second": 115277.895
|
|
},
|
|
{
|
|
"epoch": 0.27652263318812736,
|
|
"grad_norm": 0.16169899702072144,
|
|
"learning_rate": 0.004308985988555892,
|
|
"loss": 3.195353889465332,
|
|
"num_input_tokens_seen": 2679111680,
|
|
"step": 5110,
|
|
"train_runtime": 23240.1686,
|
|
"train_tokens_per_second": 115279.356
|
|
},
|
|
{
|
|
"epoch": 0.27706377337049165,
|
|
"grad_norm": 0.1690625697374344,
|
|
"learning_rate": 0.004306148902436281,
|
|
"loss": 3.1894439697265624,
|
|
"num_input_tokens_seen": 2684354560,
|
|
"step": 5120,
|
|
"train_runtime": 23285.3699,
|
|
"train_tokens_per_second": 115280.735
|
|
},
|
|
{
|
|
"epoch": 0.2776049135528559,
|
|
"grad_norm": 0.1880822330713272,
|
|
"learning_rate": 0.00430330706546566,
|
|
"loss": 3.1982452392578127,
|
|
"num_input_tokens_seen": 2689597440,
|
|
"step": 5130,
|
|
"train_runtime": 23330.5682,
|
|
"train_tokens_per_second": 115282.123
|
|
},
|
|
{
|
|
"epoch": 0.2781460537352201,
|
|
"grad_norm": 0.1790066808462143,
|
|
"learning_rate": 0.004300460486320026,
|
|
"loss": 3.1980308532714843,
|
|
"num_input_tokens_seen": 2694840320,
|
|
"step": 5140,
|
|
"train_runtime": 23375.7756,
|
|
"train_tokens_per_second": 115283.461
|
|
},
|
|
{
|
|
"epoch": 0.27868719391758434,
|
|
"grad_norm": 0.16601233184337616,
|
|
"learning_rate": 0.004297609173689855,
|
|
"loss": 3.197714996337891,
|
|
"num_input_tokens_seen": 2700083200,
|
|
"step": 5150,
|
|
"train_runtime": 23420.9835,
|
|
"train_tokens_per_second": 115284.792
|
|
},
|
|
{
|
|
"epoch": 0.27922833409994857,
|
|
"grad_norm": 0.16672180593013763,
|
|
"learning_rate": 0.0042947531362800715,
|
|
"loss": 3.1988187789916993,
|
|
"num_input_tokens_seen": 2705326080,
|
|
"step": 5160,
|
|
"train_runtime": 23466.1808,
|
|
"train_tokens_per_second": 115286.169
|
|
},
|
|
{
|
|
"epoch": 0.27976947428231286,
|
|
"grad_norm": 0.19832877814769745,
|
|
"learning_rate": 0.00429189238281003,
|
|
"loss": 3.1931121826171873,
|
|
"num_input_tokens_seen": 2710568960,
|
|
"step": 5170,
|
|
"train_runtime": 23511.3921,
|
|
"train_tokens_per_second": 115287.472
|
|
},
|
|
{
|
|
"epoch": 0.2803106144646771,
|
|
"grad_norm": 0.1923927664756775,
|
|
"learning_rate": 0.004289026922013475,
|
|
"loss": 3.1957611083984374,
|
|
"num_input_tokens_seen": 2715811840,
|
|
"step": 5180,
|
|
"train_runtime": 23556.5915,
|
|
"train_tokens_per_second": 115288.829
|
|
},
|
|
{
|
|
"epoch": 0.2808517546470413,
|
|
"grad_norm": 0.17779354751110077,
|
|
"learning_rate": 0.00428615676263853,
|
|
"loss": 3.181416702270508,
|
|
"num_input_tokens_seen": 2721054720,
|
|
"step": 5190,
|
|
"train_runtime": 23601.8043,
|
|
"train_tokens_per_second": 115290.115
|
|
},
|
|
{
|
|
"epoch": 0.28139289482940555,
|
|
"grad_norm": 0.16895556449890137,
|
|
"learning_rate": 0.004283281913447657,
|
|
"loss": 3.1839942932128906,
|
|
"num_input_tokens_seen": 2726297600,
|
|
"step": 5200,
|
|
"train_runtime": 23647.0206,
|
|
"train_tokens_per_second": 115291.379
|
|
},
|
|
{
|
|
"epoch": 0.2819340350117698,
|
|
"grad_norm": 0.17021089792251587,
|
|
"learning_rate": 0.004280402383217639,
|
|
"loss": 3.193735122680664,
|
|
"num_input_tokens_seen": 2731540480,
|
|
"step": 5210,
|
|
"train_runtime": 23692.2429,
|
|
"train_tokens_per_second": 115292.608
|
|
},
|
|
{
|
|
"epoch": 0.28247517519413406,
|
|
"grad_norm": 0.16943930089473724,
|
|
"learning_rate": 0.00427751818073955,
|
|
"loss": 3.1817481994628904,
|
|
"num_input_tokens_seen": 2736783360,
|
|
"step": 5220,
|
|
"train_runtime": 23737.4424,
|
|
"train_tokens_per_second": 115293.944
|
|
},
|
|
{
|
|
"epoch": 0.2830163153764983,
|
|
"grad_norm": 0.15319055318832397,
|
|
"learning_rate": 0.004274629314818728,
|
|
"loss": 3.1803112030029297,
|
|
"num_input_tokens_seen": 2742026240,
|
|
"step": 5230,
|
|
"train_runtime": 23782.6783,
|
|
"train_tokens_per_second": 115295.099
|
|
},
|
|
{
|
|
"epoch": 0.2835574555588625,
|
|
"grad_norm": 0.1702287793159485,
|
|
"learning_rate": 0.004271735794274746,
|
|
"loss": 3.1876094818115233,
|
|
"num_input_tokens_seen": 2747269120,
|
|
"step": 5240,
|
|
"train_runtime": 23827.881,
|
|
"train_tokens_per_second": 115296.409
|
|
},
|
|
{
|
|
"epoch": 0.28409859574122676,
|
|
"grad_norm": 0.18369406461715698,
|
|
"learning_rate": 0.00426883762794139,
|
|
"loss": 3.1819345474243166,
|
|
"num_input_tokens_seen": 2752512000,
|
|
"step": 5250,
|
|
"train_runtime": 23873.0945,
|
|
"train_tokens_per_second": 115297.663
|
|
},
|
|
{
|
|
"epoch": 0.284639735923591,
|
|
"grad_norm": 0.1792212277650833,
|
|
"learning_rate": 0.004265934824666628,
|
|
"loss": 3.1884128570556642,
|
|
"num_input_tokens_seen": 2757754880,
|
|
"step": 5260,
|
|
"train_runtime": 23918.3193,
|
|
"train_tokens_per_second": 115298.857
|
|
},
|
|
{
|
|
"epoch": 0.28518087610595527,
|
|
"grad_norm": 0.17727087438106537,
|
|
"learning_rate": 0.0042630273933125865,
|
|
"loss": 3.194817543029785,
|
|
"num_input_tokens_seen": 2762997760,
|
|
"step": 5270,
|
|
"train_runtime": 23963.5296,
|
|
"train_tokens_per_second": 115300.117
|
|
},
|
|
{
|
|
"epoch": 0.2857220162883195,
|
|
"grad_norm": 0.15836812555789948,
|
|
"learning_rate": 0.004260115342755518,
|
|
"loss": 3.1808521270751955,
|
|
"num_input_tokens_seen": 2768240640,
|
|
"step": 5280,
|
|
"train_runtime": 24008.7153,
|
|
"train_tokens_per_second": 115301.49
|
|
},
|
|
{
|
|
"epoch": 0.28626315647068373,
|
|
"grad_norm": 0.17416301369667053,
|
|
"learning_rate": 0.00425719868188578,
|
|
"loss": 3.1919151306152345,
|
|
"num_input_tokens_seen": 2773483520,
|
|
"step": 5290,
|
|
"train_runtime": 24053.9251,
|
|
"train_tokens_per_second": 115302.742
|
|
},
|
|
{
|
|
"epoch": 0.28680429665304796,
|
|
"grad_norm": 0.16871845722198486,
|
|
"learning_rate": 0.004254277419607802,
|
|
"loss": 3.182635498046875,
|
|
"num_input_tokens_seen": 2778726400,
|
|
"step": 5300,
|
|
"train_runtime": 24099.1331,
|
|
"train_tokens_per_second": 115303.998
|
|
},
|
|
{
|
|
"epoch": 0.2873454368354122,
|
|
"grad_norm": 0.1787181943655014,
|
|
"learning_rate": 0.004251351564840067,
|
|
"loss": 3.18890495300293,
|
|
"num_input_tokens_seen": 2783969280,
|
|
"step": 5310,
|
|
"train_runtime": 24144.3426,
|
|
"train_tokens_per_second": 115305.243
|
|
},
|
|
{
|
|
"epoch": 0.2878865770177765,
|
|
"grad_norm": 0.1912972331047058,
|
|
"learning_rate": 0.00424842112651507,
|
|
"loss": 3.1834373474121094,
|
|
"num_input_tokens_seen": 2789212160,
|
|
"step": 5320,
|
|
"train_runtime": 24189.5491,
|
|
"train_tokens_per_second": 115306.496
|
|
},
|
|
{
|
|
"epoch": 0.2884277172001407,
|
|
"grad_norm": 0.16879980266094208,
|
|
"learning_rate": 0.004245486113579308,
|
|
"loss": 3.1814502716064452,
|
|
"num_input_tokens_seen": 2794455040,
|
|
"step": 5330,
|
|
"train_runtime": 24234.754,
|
|
"train_tokens_per_second": 115307.753
|
|
},
|
|
{
|
|
"epoch": 0.28896885738250494,
|
|
"grad_norm": 0.17917132377624512,
|
|
"learning_rate": 0.00424254653499324,
|
|
"loss": 3.188125228881836,
|
|
"num_input_tokens_seen": 2799697920,
|
|
"step": 5340,
|
|
"train_runtime": 24279.9641,
|
|
"train_tokens_per_second": 115308.981
|
|
},
|
|
{
|
|
"epoch": 0.28950999756486917,
|
|
"grad_norm": 0.17311090230941772,
|
|
"learning_rate": 0.004239602399731263,
|
|
"loss": 3.1844112396240236,
|
|
"num_input_tokens_seen": 2804940800,
|
|
"step": 5350,
|
|
"train_runtime": 24328.7709,
|
|
"train_tokens_per_second": 115293.157
|
|
},
|
|
{
|
|
"epoch": 0.2900511377472334,
|
|
"grad_norm": 0.17229342460632324,
|
|
"learning_rate": 0.004236653716781689,
|
|
"loss": 3.185770797729492,
|
|
"num_input_tokens_seen": 2810183680,
|
|
"step": 5360,
|
|
"train_runtime": 24373.9674,
|
|
"train_tokens_per_second": 115294.471
|
|
},
|
|
{
|
|
"epoch": 0.2905922779295977,
|
|
"grad_norm": 0.180856391787529,
|
|
"learning_rate": 0.0042337004951467075,
|
|
"loss": 3.1889812469482424,
|
|
"num_input_tokens_seen": 2815426560,
|
|
"step": 5370,
|
|
"train_runtime": 24419.1733,
|
|
"train_tokens_per_second": 115295.736
|
|
},
|
|
{
|
|
"epoch": 0.2911334181119619,
|
|
"grad_norm": 0.16839343309402466,
|
|
"learning_rate": 0.004230742743842371,
|
|
"loss": 3.1733203887939454,
|
|
"num_input_tokens_seen": 2820669440,
|
|
"step": 5380,
|
|
"train_runtime": 24464.3893,
|
|
"train_tokens_per_second": 115296.949
|
|
},
|
|
{
|
|
"epoch": 0.29167455829432615,
|
|
"grad_norm": 0.16889749467372894,
|
|
"learning_rate": 0.004227780471898559,
|
|
"loss": 3.1818462371826173,
|
|
"num_input_tokens_seen": 2825912320,
|
|
"step": 5390,
|
|
"train_runtime": 24509.5858,
|
|
"train_tokens_per_second": 115298.249
|
|
},
|
|
{
|
|
"epoch": 0.2922156984766904,
|
|
"grad_norm": 0.17744433879852295,
|
|
"learning_rate": 0.004224813688358949,
|
|
"loss": 3.1864446640014648,
|
|
"num_input_tokens_seen": 2831155200,
|
|
"step": 5400,
|
|
"train_runtime": 24554.7949,
|
|
"train_tokens_per_second": 115299.485
|
|
},
|
|
{
|
|
"epoch": 0.2927568386590546,
|
|
"grad_norm": 0.1737280935049057,
|
|
"learning_rate": 0.004221842402280996,
|
|
"loss": 3.180088424682617,
|
|
"num_input_tokens_seen": 2836398080,
|
|
"step": 5410,
|
|
"train_runtime": 24599.9993,
|
|
"train_tokens_per_second": 115300.738
|
|
},
|
|
{
|
|
"epoch": 0.2932979788414189,
|
|
"grad_norm": 0.16631047427654266,
|
|
"learning_rate": 0.004218866622735898,
|
|
"loss": 3.175667572021484,
|
|
"num_input_tokens_seen": 2841640960,
|
|
"step": 5420,
|
|
"train_runtime": 24645.2212,
|
|
"train_tokens_per_second": 115301.905
|
|
},
|
|
{
|
|
"epoch": 0.2938391190237831,
|
|
"grad_norm": 0.17272046208381653,
|
|
"learning_rate": 0.004215886358808577,
|
|
"loss": 3.185796546936035,
|
|
"num_input_tokens_seen": 2846883840,
|
|
"step": 5430,
|
|
"train_runtime": 24690.432,
|
|
"train_tokens_per_second": 115303.12
|
|
},
|
|
{
|
|
"epoch": 0.29438025920614735,
|
|
"grad_norm": 0.1690651923418045,
|
|
"learning_rate": 0.004212901619597638,
|
|
"loss": 3.1886520385742188,
|
|
"num_input_tokens_seen": 2852126720,
|
|
"step": 5440,
|
|
"train_runtime": 24735.6453,
|
|
"train_tokens_per_second": 115304.318
|
|
},
|
|
{
|
|
"epoch": 0.2949213993885116,
|
|
"grad_norm": 0.19146323204040527,
|
|
"learning_rate": 0.0042099124142153535,
|
|
"loss": 3.1789478302001952,
|
|
"num_input_tokens_seen": 2857369600,
|
|
"step": 5450,
|
|
"train_runtime": 24780.8456,
|
|
"train_tokens_per_second": 115305.573
|
|
},
|
|
{
|
|
"epoch": 0.2954625395708758,
|
|
"grad_norm": 0.1788649708032608,
|
|
"learning_rate": 0.00420691875178763,
|
|
"loss": 3.1887844085693358,
|
|
"num_input_tokens_seen": 2862612480,
|
|
"step": 5460,
|
|
"train_runtime": 24826.0467,
|
|
"train_tokens_per_second": 115306.819
|
|
},
|
|
{
|
|
"epoch": 0.2960036797532401,
|
|
"grad_norm": 0.19091546535491943,
|
|
"learning_rate": 0.004203920641453982,
|
|
"loss": 3.175608253479004,
|
|
"num_input_tokens_seen": 2867855360,
|
|
"step": 5470,
|
|
"train_runtime": 24871.2591,
|
|
"train_tokens_per_second": 115308.009
|
|
},
|
|
{
|
|
"epoch": 0.29654481993560433,
|
|
"grad_norm": 0.16818441450595856,
|
|
"learning_rate": 0.004200918092367501,
|
|
"loss": 3.1859344482421874,
|
|
"num_input_tokens_seen": 2873098240,
|
|
"step": 5480,
|
|
"train_runtime": 24916.485,
|
|
"train_tokens_per_second": 115309.131
|
|
},
|
|
{
|
|
"epoch": 0.29708596011796856,
|
|
"grad_norm": 0.1913134902715683,
|
|
"learning_rate": 0.0041979111136948325,
|
|
"loss": 3.1723804473876953,
|
|
"num_input_tokens_seen": 2878341120,
|
|
"step": 5490,
|
|
"train_runtime": 24961.6704,
|
|
"train_tokens_per_second": 115310.437
|
|
},
|
|
{
|
|
"epoch": 0.2976271003003328,
|
|
"grad_norm": 0.18261617422103882,
|
|
"learning_rate": 0.004194899714616144,
|
|
"loss": 3.179214286804199,
|
|
"num_input_tokens_seen": 2883584000,
|
|
"step": 5500,
|
|
"train_runtime": 25006.8704,
|
|
"train_tokens_per_second": 115311.67
|
|
},
|
|
{
|
|
"epoch": 0.2976271003003328,
|
|
"eval_loss": 3.126129388809204,
|
|
"eval_runtime": 1.9962,
|
|
"eval_samples_per_second": 250.471,
|
|
"eval_steps_per_second": 4.008,
|
|
"num_input_tokens_seen": 2883584000,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.298168240482697,
|
|
"grad_norm": 0.18416427075862885,
|
|
"learning_rate": 0.004191883904325097,
|
|
"loss": 3.1846160888671875,
|
|
"num_input_tokens_seen": 2888826880,
|
|
"step": 5510,
|
|
"train_runtime": 25054.1224,
|
|
"train_tokens_per_second": 115303.455
|
|
},
|
|
{
|
|
"epoch": 0.2987093806650613,
|
|
"grad_norm": 0.16038469970226288,
|
|
"learning_rate": 0.004188863692028823,
|
|
"loss": 3.180740737915039,
|
|
"num_input_tokens_seen": 2894069760,
|
|
"step": 5520,
|
|
"train_runtime": 25099.3557,
|
|
"train_tokens_per_second": 115304.544
|
|
},
|
|
{
|
|
"epoch": 0.29925052084742554,
|
|
"grad_norm": 0.16605685651302338,
|
|
"learning_rate": 0.004185839086947891,
|
|
"loss": 3.1796802520751952,
|
|
"num_input_tokens_seen": 2899312640,
|
|
"step": 5530,
|
|
"train_runtime": 25144.6135,
|
|
"train_tokens_per_second": 115305.516
|
|
},
|
|
{
|
|
"epoch": 0.29979166102978977,
|
|
"grad_norm": 0.1819118857383728,
|
|
"learning_rate": 0.004182810098316281,
|
|
"loss": 3.1764299392700197,
|
|
"num_input_tokens_seen": 2904555520,
|
|
"step": 5540,
|
|
"train_runtime": 25189.8702,
|
|
"train_tokens_per_second": 115306.49
|
|
},
|
|
{
|
|
"epoch": 0.300332801212154,
|
|
"grad_norm": 0.1876569390296936,
|
|
"learning_rate": 0.004179776735381355,
|
|
"loss": 3.18255500793457,
|
|
"num_input_tokens_seen": 2909798400,
|
|
"step": 5550,
|
|
"train_runtime": 25235.1612,
|
|
"train_tokens_per_second": 115307.304
|
|
},
|
|
{
|
|
"epoch": 0.30087394139451823,
|
|
"grad_norm": 0.1661430448293686,
|
|
"learning_rate": 0.004176739007403832,
|
|
"loss": 3.172201156616211,
|
|
"num_input_tokens_seen": 2915041280,
|
|
"step": 5560,
|
|
"train_runtime": 25280.4455,
|
|
"train_tokens_per_second": 115308.145
|
|
},
|
|
{
|
|
"epoch": 0.3014150815768825,
|
|
"grad_norm": 0.17655618488788605,
|
|
"learning_rate": 0.004173696923657755,
|
|
"loss": 3.17954158782959,
|
|
"num_input_tokens_seen": 2920284160,
|
|
"step": 5570,
|
|
"train_runtime": 25325.7247,
|
|
"train_tokens_per_second": 115309.007
|
|
},
|
|
{
|
|
"epoch": 0.30195622175924675,
|
|
"grad_norm": 0.17908194661140442,
|
|
"learning_rate": 0.0041706504934304655,
|
|
"loss": 3.1723983764648436,
|
|
"num_input_tokens_seen": 2925527040,
|
|
"step": 5580,
|
|
"train_runtime": 25370.9962,
|
|
"train_tokens_per_second": 115309.9
|
|
},
|
|
{
|
|
"epoch": 0.302497361941611,
|
|
"grad_norm": 0.17515423893928528,
|
|
"learning_rate": 0.004167599726022575,
|
|
"loss": 3.183238220214844,
|
|
"num_input_tokens_seen": 2930769920,
|
|
"step": 5590,
|
|
"train_runtime": 25416.2839,
|
|
"train_tokens_per_second": 115310.717
|
|
},
|
|
{
|
|
"epoch": 0.3030385021239752,
|
|
"grad_norm": 0.1749441921710968,
|
|
"learning_rate": 0.004164544630747937,
|
|
"loss": 3.185963821411133,
|
|
"num_input_tokens_seen": 2936012800,
|
|
"step": 5600,
|
|
"train_runtime": 25461.5455,
|
|
"train_tokens_per_second": 115311.649
|
|
},
|
|
{
|
|
"epoch": 0.30357964230633944,
|
|
"grad_norm": 0.1578006148338318,
|
|
"learning_rate": 0.004161485216933615,
|
|
"loss": 3.177383041381836,
|
|
"num_input_tokens_seen": 2941255680,
|
|
"step": 5610,
|
|
"train_runtime": 25506.8309,
|
|
"train_tokens_per_second": 115312.47
|
|
},
|
|
{
|
|
"epoch": 0.3041207824887037,
|
|
"grad_norm": 0.1903323382139206,
|
|
"learning_rate": 0.00415842149391986,
|
|
"loss": 3.179554748535156,
|
|
"num_input_tokens_seen": 2946498560,
|
|
"step": 5620,
|
|
"train_runtime": 25552.099,
|
|
"train_tokens_per_second": 115313.367
|
|
},
|
|
{
|
|
"epoch": 0.30466192267106795,
|
|
"grad_norm": 0.16383005678653717,
|
|
"learning_rate": 0.004155353471060077,
|
|
"loss": 3.160336494445801,
|
|
"num_input_tokens_seen": 2951741440,
|
|
"step": 5630,
|
|
"train_runtime": 25597.3865,
|
|
"train_tokens_per_second": 115314.172
|
|
},
|
|
{
|
|
"epoch": 0.3052030628534322,
|
|
"grad_norm": 0.1735740303993225,
|
|
"learning_rate": 0.004152281157720798,
|
|
"loss": 3.172795867919922,
|
|
"num_input_tokens_seen": 2956984320,
|
|
"step": 5640,
|
|
"train_runtime": 25642.6481,
|
|
"train_tokens_per_second": 115315.092
|
|
},
|
|
{
|
|
"epoch": 0.3057442030357964,
|
|
"grad_norm": 0.19910795986652374,
|
|
"learning_rate": 0.004149204563281657,
|
|
"loss": 3.1711971282958986,
|
|
"num_input_tokens_seen": 2962227200,
|
|
"step": 5650,
|
|
"train_runtime": 25687.9012,
|
|
"train_tokens_per_second": 115316.046
|
|
},
|
|
{
|
|
"epoch": 0.30628534321816064,
|
|
"grad_norm": 0.18566472828388214,
|
|
"learning_rate": 0.004146123697135352,
|
|
"loss": 3.177423095703125,
|
|
"num_input_tokens_seen": 2967470080,
|
|
"step": 5660,
|
|
"train_runtime": 25733.1722,
|
|
"train_tokens_per_second": 115316.917
|
|
},
|
|
{
|
|
"epoch": 0.30682648340052493,
|
|
"grad_norm": 0.16724054515361786,
|
|
"learning_rate": 0.004143038568687626,
|
|
"loss": 3.174397277832031,
|
|
"num_input_tokens_seen": 2972712960,
|
|
"step": 5670,
|
|
"train_runtime": 25778.4366,
|
|
"train_tokens_per_second": 115317.814
|
|
},
|
|
{
|
|
"epoch": 0.30736762358288916,
|
|
"grad_norm": 0.18052591383457184,
|
|
"learning_rate": 0.004139949187357236,
|
|
"loss": 3.172323226928711,
|
|
"num_input_tokens_seen": 2977955840,
|
|
"step": 5680,
|
|
"train_runtime": 25823.6944,
|
|
"train_tokens_per_second": 115318.738
|
|
},
|
|
{
|
|
"epoch": 0.3079087637652534,
|
|
"grad_norm": 0.1707129180431366,
|
|
"learning_rate": 0.004136855562575921,
|
|
"loss": 3.1627834320068358,
|
|
"num_input_tokens_seen": 2983198720,
|
|
"step": 5690,
|
|
"train_runtime": 25868.9566,
|
|
"train_tokens_per_second": 115319.638
|
|
},
|
|
{
|
|
"epoch": 0.3084499039476176,
|
|
"grad_norm": 0.18003937602043152,
|
|
"learning_rate": 0.004133757703788374,
|
|
"loss": 3.175765609741211,
|
|
"num_input_tokens_seen": 2988441600,
|
|
"step": 5700,
|
|
"train_runtime": 25914.2132,
|
|
"train_tokens_per_second": 115320.561
|
|
},
|
|
{
|
|
"epoch": 0.30899104412998185,
|
|
"grad_norm": 0.17585334181785583,
|
|
"learning_rate": 0.004130655620452215,
|
|
"loss": 3.1611761093139648,
|
|
"num_input_tokens_seen": 2993684480,
|
|
"step": 5710,
|
|
"train_runtime": 25959.4637,
|
|
"train_tokens_per_second": 115321.507
|
|
},
|
|
{
|
|
"epoch": 0.30953218431234614,
|
|
"grad_norm": 0.17584700882434845,
|
|
"learning_rate": 0.004127549322037963,
|
|
"loss": 3.1710134506225587,
|
|
"num_input_tokens_seen": 2998927360,
|
|
"step": 5720,
|
|
"train_runtime": 26004.7204,
|
|
"train_tokens_per_second": 115322.423
|
|
},
|
|
{
|
|
"epoch": 0.31007332449471037,
|
|
"grad_norm": 0.1671862006187439,
|
|
"learning_rate": 0.004124438818029003,
|
|
"loss": 3.171963691711426,
|
|
"num_input_tokens_seen": 3004170240,
|
|
"step": 5730,
|
|
"train_runtime": 26053.4016,
|
|
"train_tokens_per_second": 115308.177
|
|
},
|
|
{
|
|
"epoch": 0.3106144646770746,
|
|
"grad_norm": 0.17120610177516937,
|
|
"learning_rate": 0.004121324117921561,
|
|
"loss": 3.171039581298828,
|
|
"num_input_tokens_seen": 3009413120,
|
|
"step": 5740,
|
|
"train_runtime": 26098.5909,
|
|
"train_tokens_per_second": 115309.41
|
|
},
|
|
{
|
|
"epoch": 0.31115560485943883,
|
|
"grad_norm": 0.17267778515815735,
|
|
"learning_rate": 0.004118205231224675,
|
|
"loss": 3.1711191177368163,
|
|
"num_input_tokens_seen": 3014656000,
|
|
"step": 5750,
|
|
"train_runtime": 26143.7653,
|
|
"train_tokens_per_second": 115310.705
|
|
},
|
|
{
|
|
"epoch": 0.31169674504180306,
|
|
"grad_norm": 0.17473942041397095,
|
|
"learning_rate": 0.004115082167460159,
|
|
"loss": 3.1646095275878907,
|
|
"num_input_tokens_seen": 3019898880,
|
|
"step": 5760,
|
|
"train_runtime": 26188.9631,
|
|
"train_tokens_per_second": 115311.892
|
|
},
|
|
{
|
|
"epoch": 0.31223788522416734,
|
|
"grad_norm": 0.17137861251831055,
|
|
"learning_rate": 0.004111954936162586,
|
|
"loss": 3.1746740341186523,
|
|
"num_input_tokens_seen": 3025141760,
|
|
"step": 5770,
|
|
"train_runtime": 26234.3115,
|
|
"train_tokens_per_second": 115312.413
|
|
},
|
|
{
|
|
"epoch": 0.3127790254065316,
|
|
"grad_norm": 0.16885042190551758,
|
|
"learning_rate": 0.004108823546879249,
|
|
"loss": 3.162841033935547,
|
|
"num_input_tokens_seen": 3030384640,
|
|
"step": 5780,
|
|
"train_runtime": 26279.5704,
|
|
"train_tokens_per_second": 115313.325
|
|
},
|
|
{
|
|
"epoch": 0.3133201655888958,
|
|
"grad_norm": 0.15897022187709808,
|
|
"learning_rate": 0.004105688009170134,
|
|
"loss": 3.1719465255737305,
|
|
"num_input_tokens_seen": 3035627520,
|
|
"step": 5790,
|
|
"train_runtime": 26324.8012,
|
|
"train_tokens_per_second": 115314.357
|
|
},
|
|
{
|
|
"epoch": 0.31386130577126004,
|
|
"grad_norm": 0.1866680085659027,
|
|
"learning_rate": 0.004102548332607894,
|
|
"loss": 3.1683422088623048,
|
|
"num_input_tokens_seen": 3040870400,
|
|
"step": 5800,
|
|
"train_runtime": 26370.0195,
|
|
"train_tokens_per_second": 115315.44
|
|
},
|
|
{
|
|
"epoch": 0.31440244595362427,
|
|
"grad_norm": 0.18191276490688324,
|
|
"learning_rate": 0.004099404526777816,
|
|
"loss": 3.1652973175048826,
|
|
"num_input_tokens_seen": 3046113280,
|
|
"step": 5810,
|
|
"train_runtime": 26415.2343,
|
|
"train_tokens_per_second": 115316.535
|
|
},
|
|
{
|
|
"epoch": 0.31494358613598855,
|
|
"grad_norm": 0.16286683082580566,
|
|
"learning_rate": 0.004096256601277797,
|
|
"loss": 3.1653570175170898,
|
|
"num_input_tokens_seen": 3051356160,
|
|
"step": 5820,
|
|
"train_runtime": 26460.4377,
|
|
"train_tokens_per_second": 115317.675
|
|
},
|
|
{
|
|
"epoch": 0.3154847263183528,
|
|
"grad_norm": 0.15786544978618622,
|
|
"learning_rate": 0.004093104565718307,
|
|
"loss": 3.171334457397461,
|
|
"num_input_tokens_seen": 3056599040,
|
|
"step": 5830,
|
|
"train_runtime": 26505.6409,
|
|
"train_tokens_per_second": 115318.813
|
|
},
|
|
{
|
|
"epoch": 0.316025866500717,
|
|
"grad_norm": 0.16940993070602417,
|
|
"learning_rate": 0.0040899484297223666,
|
|
"loss": 3.16903076171875,
|
|
"num_input_tokens_seen": 3061841920,
|
|
"step": 5840,
|
|
"train_runtime": 26550.8652,
|
|
"train_tokens_per_second": 115319.855
|
|
},
|
|
{
|
|
"epoch": 0.31656700668308124,
|
|
"grad_norm": 0.1778353452682495,
|
|
"learning_rate": 0.004086788202925512,
|
|
"loss": 3.163807678222656,
|
|
"num_input_tokens_seen": 3067084800,
|
|
"step": 5850,
|
|
"train_runtime": 26596.0801,
|
|
"train_tokens_per_second": 115320.934
|
|
},
|
|
{
|
|
"epoch": 0.3171081468654455,
|
|
"grad_norm": 0.18578499555587769,
|
|
"learning_rate": 0.004083623894975773,
|
|
"loss": 3.1687942504882813,
|
|
"num_input_tokens_seen": 3072327680,
|
|
"step": 5860,
|
|
"train_runtime": 26641.289,
|
|
"train_tokens_per_second": 115322.036
|
|
},
|
|
{
|
|
"epoch": 0.31764928704780976,
|
|
"grad_norm": 0.17534473538398743,
|
|
"learning_rate": 0.004080455515533633,
|
|
"loss": 3.1645458221435545,
|
|
"num_input_tokens_seen": 3077570560,
|
|
"step": 5870,
|
|
"train_runtime": 26686.5065,
|
|
"train_tokens_per_second": 115323.096
|
|
},
|
|
{
|
|
"epoch": 0.318190427230174,
|
|
"grad_norm": 0.16227850317955017,
|
|
"learning_rate": 0.004077283074272012,
|
|
"loss": 3.1695529937744142,
|
|
"num_input_tokens_seen": 3082813440,
|
|
"step": 5880,
|
|
"train_runtime": 26731.6901,
|
|
"train_tokens_per_second": 115324.3
|
|
},
|
|
{
|
|
"epoch": 0.3187315674125382,
|
|
"grad_norm": 0.17972981929779053,
|
|
"learning_rate": 0.004074106580876226,
|
|
"loss": 3.164577102661133,
|
|
"num_input_tokens_seen": 3088056320,
|
|
"step": 5890,
|
|
"train_runtime": 26776.8465,
|
|
"train_tokens_per_second": 115325.616
|
|
},
|
|
{
|
|
"epoch": 0.31927270759490245,
|
|
"grad_norm": 0.17186778783798218,
|
|
"learning_rate": 0.0040709260450439615,
|
|
"loss": 3.168431854248047,
|
|
"num_input_tokens_seen": 3093299200,
|
|
"step": 5900,
|
|
"train_runtime": 26822.0301,
|
|
"train_tokens_per_second": 115326.811
|
|
},
|
|
{
|
|
"epoch": 0.3198138477772667,
|
|
"grad_norm": 0.16803112626075745,
|
|
"learning_rate": 0.0040677414764852485,
|
|
"loss": 3.1673011779785156,
|
|
"num_input_tokens_seen": 3098542080,
|
|
"step": 5910,
|
|
"train_runtime": 26867.197,
|
|
"train_tokens_per_second": 115328.074
|
|
},
|
|
{
|
|
"epoch": 0.32035498795963097,
|
|
"grad_norm": 0.16622225940227509,
|
|
"learning_rate": 0.00406455288492243,
|
|
"loss": 3.156739616394043,
|
|
"num_input_tokens_seen": 3103784960,
|
|
"step": 5920,
|
|
"train_runtime": 26912.3879,
|
|
"train_tokens_per_second": 115329.229
|
|
},
|
|
{
|
|
"epoch": 0.3208961281419952,
|
|
"grad_norm": 0.18976053595542908,
|
|
"learning_rate": 0.004061360280090129,
|
|
"loss": 3.166844940185547,
|
|
"num_input_tokens_seen": 3109027840,
|
|
"step": 5930,
|
|
"train_runtime": 26957.5834,
|
|
"train_tokens_per_second": 115330.361
|
|
},
|
|
{
|
|
"epoch": 0.3214372683243594,
|
|
"grad_norm": 0.16867531836032867,
|
|
"learning_rate": 0.00405816367173522,
|
|
"loss": 3.1626731872558596,
|
|
"num_input_tokens_seen": 3114270720,
|
|
"step": 5940,
|
|
"train_runtime": 27002.7901,
|
|
"train_tokens_per_second": 115331.442
|
|
},
|
|
{
|
|
"epoch": 0.32197840850672366,
|
|
"grad_norm": 0.20071354508399963,
|
|
"learning_rate": 0.004054963069616803,
|
|
"loss": 3.169915199279785,
|
|
"num_input_tokens_seen": 3119513600,
|
|
"step": 5950,
|
|
"train_runtime": 27047.9883,
|
|
"train_tokens_per_second": 115332.555
|
|
},
|
|
{
|
|
"epoch": 0.3225195486890879,
|
|
"grad_norm": 0.16498495638370514,
|
|
"learning_rate": 0.0040517584835061664,
|
|
"loss": 3.1712413787841798,
|
|
"num_input_tokens_seen": 3124756480,
|
|
"step": 5960,
|
|
"train_runtime": 27093.2042,
|
|
"train_tokens_per_second": 115333.589
|
|
},
|
|
{
|
|
"epoch": 0.3230606888714522,
|
|
"grad_norm": 0.17592206597328186,
|
|
"learning_rate": 0.004048549923186767,
|
|
"loss": 3.1624687194824217,
|
|
"num_input_tokens_seen": 3129999360,
|
|
"step": 5970,
|
|
"train_runtime": 27138.4223,
|
|
"train_tokens_per_second": 115334.61
|
|
},
|
|
{
|
|
"epoch": 0.3236018290538164,
|
|
"grad_norm": 0.15470415353775024,
|
|
"learning_rate": 0.00404533739845419,
|
|
"loss": 3.155242347717285,
|
|
"num_input_tokens_seen": 3135242240,
|
|
"step": 5980,
|
|
"train_runtime": 27183.6528,
|
|
"train_tokens_per_second": 115335.575
|
|
},
|
|
{
|
|
"epoch": 0.32414296923618063,
|
|
"grad_norm": 0.16501109302043915,
|
|
"learning_rate": 0.004042120919116126,
|
|
"loss": 3.1598865509033205,
|
|
"num_input_tokens_seen": 3140485120,
|
|
"step": 5990,
|
|
"train_runtime": 27228.8867,
|
|
"train_tokens_per_second": 115336.523
|
|
},
|
|
{
|
|
"epoch": 0.32468410941854486,
|
|
"grad_norm": 0.16781945526599884,
|
|
"learning_rate": 0.004038900494992339,
|
|
"loss": 3.157525634765625,
|
|
"num_input_tokens_seen": 3145728000,
|
|
"step": 6000,
|
|
"train_runtime": 27274.108,
|
|
"train_tokens_per_second": 115337.521
|
|
},
|
|
{
|
|
"epoch": 0.32468410941854486,
|
|
"eval_loss": 3.111185073852539,
|
|
"eval_runtime": 1.9872,
|
|
"eval_samples_per_second": 251.614,
|
|
"eval_steps_per_second": 4.026,
|
|
"num_input_tokens_seen": 3145728000,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.3252252496009091,
|
|
"grad_norm": 0.18414868414402008,
|
|
"learning_rate": 0.004035676135914636,
|
|
"loss": 3.170181655883789,
|
|
"num_input_tokens_seen": 3150970880,
|
|
"step": 6010,
|
|
"train_runtime": 27323.9049,
|
|
"train_tokens_per_second": 115319.201
|
|
},
|
|
{
|
|
"epoch": 0.3257663897832734,
|
|
"grad_norm": 0.1616990864276886,
|
|
"learning_rate": 0.004032447851726835,
|
|
"loss": 3.1585414886474608,
|
|
"num_input_tokens_seen": 3156213760,
|
|
"step": 6020,
|
|
"train_runtime": 27369.1149,
|
|
"train_tokens_per_second": 115320.272
|
|
},
|
|
{
|
|
"epoch": 0.3263075299656376,
|
|
"grad_norm": 0.16582000255584717,
|
|
"learning_rate": 0.004029215652284741,
|
|
"loss": 3.1622276306152344,
|
|
"num_input_tokens_seen": 3161456640,
|
|
"step": 6030,
|
|
"train_runtime": 27414.3296,
|
|
"train_tokens_per_second": 115321.319
|
|
},
|
|
{
|
|
"epoch": 0.32684867014800184,
|
|
"grad_norm": 0.17380478978157043,
|
|
"learning_rate": 0.00402597954745611,
|
|
"loss": 3.1608341217041014,
|
|
"num_input_tokens_seen": 3166699520,
|
|
"step": 6040,
|
|
"train_runtime": 27459.5638,
|
|
"train_tokens_per_second": 115322.281
|
|
},
|
|
{
|
|
"epoch": 0.32738981033036607,
|
|
"grad_norm": 0.18764927983283997,
|
|
"learning_rate": 0.00402273954712062,
|
|
"loss": 3.1758914947509767,
|
|
"num_input_tokens_seen": 3171942400,
|
|
"step": 6050,
|
|
"train_runtime": 27504.7658,
|
|
"train_tokens_per_second": 115323.374
|
|
},
|
|
{
|
|
"epoch": 0.3279309505127303,
|
|
"grad_norm": 0.1659294068813324,
|
|
"learning_rate": 0.004019495661169844,
|
|
"loss": 3.1681026458740233,
|
|
"num_input_tokens_seen": 3177185280,
|
|
"step": 6060,
|
|
"train_runtime": 27549.978,
|
|
"train_tokens_per_second": 115324.422
|
|
},
|
|
{
|
|
"epoch": 0.3284720906950946,
|
|
"grad_norm": 0.15407103300094604,
|
|
"learning_rate": 0.004016247899507217,
|
|
"loss": 3.1617177963256835,
|
|
"num_input_tokens_seen": 3182428160,
|
|
"step": 6070,
|
|
"train_runtime": 27595.2039,
|
|
"train_tokens_per_second": 115325.409
|
|
},
|
|
{
|
|
"epoch": 0.3290132308774588,
|
|
"grad_norm": 0.17896398901939392,
|
|
"learning_rate": 0.004012996272048004,
|
|
"loss": 3.163351631164551,
|
|
"num_input_tokens_seen": 3187671040,
|
|
"step": 6080,
|
|
"train_runtime": 27640.4032,
|
|
"train_tokens_per_second": 115326.503
|
|
},
|
|
{
|
|
"epoch": 0.32955437105982305,
|
|
"grad_norm": 0.17344997823238373,
|
|
"learning_rate": 0.004009740788719276,
|
|
"loss": 3.153501510620117,
|
|
"num_input_tokens_seen": 3192913920,
|
|
"step": 6090,
|
|
"train_runtime": 27685.6168,
|
|
"train_tokens_per_second": 115327.534
|
|
},
|
|
{
|
|
"epoch": 0.3300955112421873,
|
|
"grad_norm": 0.16279980540275574,
|
|
"learning_rate": 0.004006481459459872,
|
|
"loss": 3.160162162780762,
|
|
"num_input_tokens_seen": 3198156800,
|
|
"step": 6100,
|
|
"train_runtime": 27730.837,
|
|
"train_tokens_per_second": 115328.535
|
|
},
|
|
{
|
|
"epoch": 0.3306366514245515,
|
|
"grad_norm": 0.16896025836467743,
|
|
"learning_rate": 0.0040032182942203775,
|
|
"loss": 3.158255767822266,
|
|
"num_input_tokens_seen": 3203399680,
|
|
"step": 6110,
|
|
"train_runtime": 27779.614,
|
|
"train_tokens_per_second": 115314.766
|
|
},
|
|
{
|
|
"epoch": 0.3311777916069158,
|
|
"grad_norm": 0.18168415129184723,
|
|
"learning_rate": 0.003999951302963083,
|
|
"loss": 3.156180000305176,
|
|
"num_input_tokens_seen": 3208642560,
|
|
"step": 6120,
|
|
"train_runtime": 27824.7398,
|
|
"train_tokens_per_second": 115316.175
|
|
},
|
|
{
|
|
"epoch": 0.33171893178928,
|
|
"grad_norm": 0.17479564249515533,
|
|
"learning_rate": 0.003996680495661963,
|
|
"loss": 3.155413818359375,
|
|
"num_input_tokens_seen": 3213885440,
|
|
"step": 6130,
|
|
"train_runtime": 27869.8597,
|
|
"train_tokens_per_second": 115317.604
|
|
},
|
|
{
|
|
"epoch": 0.33226007197164426,
|
|
"grad_norm": 0.16649708151817322,
|
|
"learning_rate": 0.003993405882302642,
|
|
"loss": 3.162016677856445,
|
|
"num_input_tokens_seen": 3219128320,
|
|
"step": 6140,
|
|
"train_runtime": 27914.9889,
|
|
"train_tokens_per_second": 115318.99
|
|
},
|
|
{
|
|
"epoch": 0.3328012121540085,
|
|
"grad_norm": 0.17866738140583038,
|
|
"learning_rate": 0.003990127472882364,
|
|
"loss": 3.1546072006225585,
|
|
"num_input_tokens_seen": 3224371200,
|
|
"step": 6150,
|
|
"train_runtime": 27960.113,
|
|
"train_tokens_per_second": 115320.392
|
|
},
|
|
{
|
|
"epoch": 0.3333423523363727,
|
|
"grad_norm": 0.15289874374866486,
|
|
"learning_rate": 0.0039868452774099615,
|
|
"loss": 3.1471332550048827,
|
|
"num_input_tokens_seen": 3229614080,
|
|
"step": 6160,
|
|
"train_runtime": 28005.2392,
|
|
"train_tokens_per_second": 115321.782
|
|
},
|
|
{
|
|
"epoch": 0.333883492518737,
|
|
"grad_norm": 0.16488930583000183,
|
|
"learning_rate": 0.003983559305905828,
|
|
"loss": 3.1540958404541017,
|
|
"num_input_tokens_seen": 3234856960,
|
|
"step": 6170,
|
|
"train_runtime": 28050.3655,
|
|
"train_tokens_per_second": 115323.166
|
|
},
|
|
{
|
|
"epoch": 0.33442463270110123,
|
|
"grad_norm": 0.17347821593284607,
|
|
"learning_rate": 0.003980269568401881,
|
|
"loss": 3.153203010559082,
|
|
"num_input_tokens_seen": 3240099840,
|
|
"step": 6180,
|
|
"train_runtime": 28095.5018,
|
|
"train_tokens_per_second": 115324.505
|
|
},
|
|
{
|
|
"epoch": 0.33496577288346546,
|
|
"grad_norm": 0.16901230812072754,
|
|
"learning_rate": 0.00397697607494154,
|
|
"loss": 3.153574752807617,
|
|
"num_input_tokens_seen": 3245342720,
|
|
"step": 6190,
|
|
"train_runtime": 28140.6287,
|
|
"train_tokens_per_second": 115325.878
|
|
},
|
|
{
|
|
"epoch": 0.3355069130658297,
|
|
"grad_norm": 0.1725231409072876,
|
|
"learning_rate": 0.0039736788355796875,
|
|
"loss": 3.1607025146484373,
|
|
"num_input_tokens_seen": 3250585600,
|
|
"step": 6200,
|
|
"train_runtime": 28185.7568,
|
|
"train_tokens_per_second": 115327.242
|
|
},
|
|
{
|
|
"epoch": 0.3360480532481939,
|
|
"grad_norm": 0.17325183749198914,
|
|
"learning_rate": 0.003970377860382644,
|
|
"loss": 3.147405242919922,
|
|
"num_input_tokens_seen": 3255828480,
|
|
"step": 6210,
|
|
"train_runtime": 28230.8843,
|
|
"train_tokens_per_second": 115328.604
|
|
},
|
|
{
|
|
"epoch": 0.3365891934305582,
|
|
"grad_norm": 0.1715451180934906,
|
|
"learning_rate": 0.003967073159428135,
|
|
"loss": 3.150386428833008,
|
|
"num_input_tokens_seen": 3261071360,
|
|
"step": 6220,
|
|
"train_runtime": 28276.018,
|
|
"train_tokens_per_second": 115329.936
|
|
},
|
|
{
|
|
"epoch": 0.33713033361292244,
|
|
"grad_norm": 0.16657474637031555,
|
|
"learning_rate": 0.003963764742805262,
|
|
"loss": 3.1559564590454103,
|
|
"num_input_tokens_seen": 3266314240,
|
|
"step": 6230,
|
|
"train_runtime": 28321.1527,
|
|
"train_tokens_per_second": 115331.261
|
|
},
|
|
{
|
|
"epoch": 0.33767147379528667,
|
|
"grad_norm": 0.17756827175617218,
|
|
"learning_rate": 0.003960452620614465,
|
|
"loss": 3.1532052993774413,
|
|
"num_input_tokens_seen": 3271557120,
|
|
"step": 6240,
|
|
"train_runtime": 28366.2774,
|
|
"train_tokens_per_second": 115332.621
|
|
},
|
|
{
|
|
"epoch": 0.3382126139776509,
|
|
"grad_norm": 0.16704502701759338,
|
|
"learning_rate": 0.003957136802967503,
|
|
"loss": 3.145302581787109,
|
|
"num_input_tokens_seen": 3276800000,
|
|
"step": 6250,
|
|
"train_runtime": 28411.4119,
|
|
"train_tokens_per_second": 115333.937
|
|
},
|
|
{
|
|
"epoch": 0.33875375416001513,
|
|
"grad_norm": 0.16609609127044678,
|
|
"learning_rate": 0.003953817299987416,
|
|
"loss": 3.157614898681641,
|
|
"num_input_tokens_seen": 3282042880,
|
|
"step": 6260,
|
|
"train_runtime": 28456.5404,
|
|
"train_tokens_per_second": 115335.274
|
|
},
|
|
{
|
|
"epoch": 0.3392948943423794,
|
|
"grad_norm": 0.17776867747306824,
|
|
"learning_rate": 0.003950494121808493,
|
|
"loss": 3.1511157989501952,
|
|
"num_input_tokens_seen": 3287285760,
|
|
"step": 6270,
|
|
"train_runtime": 28501.6688,
|
|
"train_tokens_per_second": 115336.607
|
|
},
|
|
{
|
|
"epoch": 0.33983603452474365,
|
|
"grad_norm": 0.16619160771369934,
|
|
"learning_rate": 0.003947167278576242,
|
|
"loss": 3.1576236724853515,
|
|
"num_input_tokens_seen": 3292528640,
|
|
"step": 6280,
|
|
"train_runtime": 28546.8015,
|
|
"train_tokens_per_second": 115337.917
|
|
},
|
|
{
|
|
"epoch": 0.3403771747071079,
|
|
"grad_norm": 0.17923958599567413,
|
|
"learning_rate": 0.003943836780447365,
|
|
"loss": 3.1528648376464843,
|
|
"num_input_tokens_seen": 3297771520,
|
|
"step": 6290,
|
|
"train_runtime": 28591.9231,
|
|
"train_tokens_per_second": 115339.269
|
|
},
|
|
{
|
|
"epoch": 0.3409183148894721,
|
|
"grad_norm": 0.16474676132202148,
|
|
"learning_rate": 0.003940502637589718,
|
|
"loss": 3.1509103775024414,
|
|
"num_input_tokens_seen": 3303014400,
|
|
"step": 6300,
|
|
"train_runtime": 28637.0641,
|
|
"train_tokens_per_second": 115340.539
|
|
},
|
|
{
|
|
"epoch": 0.34145945507183634,
|
|
"grad_norm": 0.1639336794614792,
|
|
"learning_rate": 0.0039371648601822865,
|
|
"loss": 3.155986785888672,
|
|
"num_input_tokens_seen": 3308257280,
|
|
"step": 6310,
|
|
"train_runtime": 28682.1884,
|
|
"train_tokens_per_second": 115341.871
|
|
},
|
|
{
|
|
"epoch": 0.3420005952542006,
|
|
"grad_norm": 0.17124713957309723,
|
|
"learning_rate": 0.003933823458415151,
|
|
"loss": 3.147997283935547,
|
|
"num_input_tokens_seen": 3313500160,
|
|
"step": 6320,
|
|
"train_runtime": 28727.3095,
|
|
"train_tokens_per_second": 115343.212
|
|
},
|
|
{
|
|
"epoch": 0.34254173543656485,
|
|
"grad_norm": 0.17230060696601868,
|
|
"learning_rate": 0.003930478442489458,
|
|
"loss": 3.1527957916259766,
|
|
"num_input_tokens_seen": 3318743040,
|
|
"step": 6330,
|
|
"train_runtime": 28772.44,
|
|
"train_tokens_per_second": 115344.512
|
|
},
|
|
{
|
|
"epoch": 0.3430828756189291,
|
|
"grad_norm": 0.1681806594133377,
|
|
"learning_rate": 0.003927129822617386,
|
|
"loss": 3.1512054443359374,
|
|
"num_input_tokens_seen": 3323985920,
|
|
"step": 6340,
|
|
"train_runtime": 28817.6293,
|
|
"train_tokens_per_second": 115345.572
|
|
},
|
|
{
|
|
"epoch": 0.3436240158012933,
|
|
"grad_norm": 0.17435091733932495,
|
|
"learning_rate": 0.003923777609022119,
|
|
"loss": 3.153603744506836,
|
|
"num_input_tokens_seen": 3329228800,
|
|
"step": 6350,
|
|
"train_runtime": 28862.8169,
|
|
"train_tokens_per_second": 115346.635
|
|
},
|
|
{
|
|
"epoch": 0.34416515598365754,
|
|
"grad_norm": 0.1703469306230545,
|
|
"learning_rate": 0.00392042181193781,
|
|
"loss": 3.142818069458008,
|
|
"num_input_tokens_seen": 3334471680,
|
|
"step": 6360,
|
|
"train_runtime": 28907.9957,
|
|
"train_tokens_per_second": 115347.73
|
|
},
|
|
{
|
|
"epoch": 0.34470629616602183,
|
|
"grad_norm": 0.1682499647140503,
|
|
"learning_rate": 0.0039170624416095525,
|
|
"loss": 3.1417423248291017,
|
|
"num_input_tokens_seen": 3339714560,
|
|
"step": 6370,
|
|
"train_runtime": 28953.1644,
|
|
"train_tokens_per_second": 115348.862
|
|
},
|
|
{
|
|
"epoch": 0.34524743634838606,
|
|
"grad_norm": 0.16802842915058136,
|
|
"learning_rate": 0.0039136995082933515,
|
|
"loss": 3.1456912994384765,
|
|
"num_input_tokens_seen": 3344957440,
|
|
"step": 6380,
|
|
"train_runtime": 28998.3264,
|
|
"train_tokens_per_second": 115350.017
|
|
},
|
|
{
|
|
"epoch": 0.3457885765307503,
|
|
"grad_norm": 0.1582358479499817,
|
|
"learning_rate": 0.003910333022256086,
|
|
"loss": 3.1438793182373046,
|
|
"num_input_tokens_seen": 3350200320,
|
|
"step": 6390,
|
|
"train_runtime": 29043.4985,
|
|
"train_tokens_per_second": 115351.128
|
|
},
|
|
{
|
|
"epoch": 0.3463297167131145,
|
|
"grad_norm": 0.16883233189582825,
|
|
"learning_rate": 0.003906962993775483,
|
|
"loss": 3.1468482971191407,
|
|
"num_input_tokens_seen": 3355443200,
|
|
"step": 6400,
|
|
"train_runtime": 29088.66,
|
|
"train_tokens_per_second": 115352.278
|
|
},
|
|
{
|
|
"epoch": 0.34687085689547875,
|
|
"grad_norm": 0.18867318332195282,
|
|
"learning_rate": 0.0039035894331400853,
|
|
"loss": 3.147420883178711,
|
|
"num_input_tokens_seen": 3360686080,
|
|
"step": 6410,
|
|
"train_runtime": 29133.8253,
|
|
"train_tokens_per_second": 115353.409
|
|
},
|
|
{
|
|
"epoch": 0.34741199707784304,
|
|
"grad_norm": 0.16323506832122803,
|
|
"learning_rate": 0.0039002123506492177,
|
|
"loss": 3.145482063293457,
|
|
"num_input_tokens_seen": 3365928960,
|
|
"step": 6420,
|
|
"train_runtime": 29179.0336,
|
|
"train_tokens_per_second": 115354.367
|
|
},
|
|
{
|
|
"epoch": 0.34795313726020727,
|
|
"grad_norm": 0.1756802797317505,
|
|
"learning_rate": 0.003896831756612958,
|
|
"loss": 3.1475906372070312,
|
|
"num_input_tokens_seen": 3371171840,
|
|
"step": 6430,
|
|
"train_runtime": 29224.2308,
|
|
"train_tokens_per_second": 115355.366
|
|
},
|
|
{
|
|
"epoch": 0.3484942774425715,
|
|
"grad_norm": 0.17158783972263336,
|
|
"learning_rate": 0.0038934476613521037,
|
|
"loss": 3.142435073852539,
|
|
"num_input_tokens_seen": 3376414720,
|
|
"step": 6440,
|
|
"train_runtime": 29269.4011,
|
|
"train_tokens_per_second": 115356.467
|
|
},
|
|
{
|
|
"epoch": 0.34903541762493573,
|
|
"grad_norm": 0.16574952006340027,
|
|
"learning_rate": 0.0038900600751981436,
|
|
"loss": 3.1459327697753907,
|
|
"num_input_tokens_seen": 3381657600,
|
|
"step": 6450,
|
|
"train_runtime": 29314.5687,
|
|
"train_tokens_per_second": 115357.577
|
|
},
|
|
{
|
|
"epoch": 0.34957655780729996,
|
|
"grad_norm": 0.16016115248203278,
|
|
"learning_rate": 0.0038866690084932206,
|
|
"loss": 3.1540714263916017,
|
|
"num_input_tokens_seen": 3386900480,
|
|
"step": 6460,
|
|
"train_runtime": 29359.7508,
|
|
"train_tokens_per_second": 115358.625
|
|
},
|
|
{
|
|
"epoch": 0.35011769798966424,
|
|
"grad_norm": 0.1590614914894104,
|
|
"learning_rate": 0.0038832744715901063,
|
|
"loss": 3.138327789306641,
|
|
"num_input_tokens_seen": 3392143360,
|
|
"step": 6470,
|
|
"train_runtime": 29404.9917,
|
|
"train_tokens_per_second": 115359.439
|
|
},
|
|
{
|
|
"epoch": 0.3506588381720285,
|
|
"grad_norm": 0.1668478101491928,
|
|
"learning_rate": 0.003879876474852164,
|
|
"loss": 3.1390443801879884,
|
|
"num_input_tokens_seen": 3397386240,
|
|
"step": 6480,
|
|
"train_runtime": 29450.2102,
|
|
"train_tokens_per_second": 115360.339
|
|
},
|
|
{
|
|
"epoch": 0.3511999783543927,
|
|
"grad_norm": 0.16614961624145508,
|
|
"learning_rate": 0.0038764750286533244,
|
|
"loss": 3.1493562698364257,
|
|
"num_input_tokens_seen": 3402629120,
|
|
"step": 6490,
|
|
"train_runtime": 29498.9151,
|
|
"train_tokens_per_second": 115347.602
|
|
},
|
|
{
|
|
"epoch": 0.35174111853675694,
|
|
"grad_norm": 0.1770559698343277,
|
|
"learning_rate": 0.003873070143378044,
|
|
"loss": 3.1434371948242186,
|
|
"num_input_tokens_seen": 3407872000,
|
|
"step": 6500,
|
|
"train_runtime": 29544.0364,
|
|
"train_tokens_per_second": 115348.896
|
|
},
|
|
{
|
|
"epoch": 0.35174111853675694,
|
|
"eval_loss": 3.0966169834136963,
|
|
"eval_runtime": 1.9851,
|
|
"eval_samples_per_second": 251.881,
|
|
"eval_steps_per_second": 4.03,
|
|
"num_input_tokens_seen": 3407872000,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.35228225871912117,
|
|
"grad_norm": 0.1724107414484024,
|
|
"learning_rate": 0.0038696618294212816,
|
|
"loss": 3.1477359771728515,
|
|
"num_input_tokens_seen": 3413114880,
|
|
"step": 6510,
|
|
"train_runtime": 29591.1684,
|
|
"train_tokens_per_second": 115342.349
|
|
},
|
|
{
|
|
"epoch": 0.35282339890148545,
|
|
"grad_norm": 0.17597156763076782,
|
|
"learning_rate": 0.0038662500971884633,
|
|
"loss": 3.1492542266845702,
|
|
"num_input_tokens_seen": 3418357760,
|
|
"step": 6520,
|
|
"train_runtime": 29636.3254,
|
|
"train_tokens_per_second": 115343.509
|
|
},
|
|
{
|
|
"epoch": 0.3533645390838497,
|
|
"grad_norm": 0.1612919569015503,
|
|
"learning_rate": 0.0038628349570954497,
|
|
"loss": 3.1426467895507812,
|
|
"num_input_tokens_seen": 3423600640,
|
|
"step": 6530,
|
|
"train_runtime": 29681.4655,
|
|
"train_tokens_per_second": 115344.73
|
|
},
|
|
{
|
|
"epoch": 0.3539056792662139,
|
|
"grad_norm": 0.16101430356502533,
|
|
"learning_rate": 0.0038594164195685076,
|
|
"loss": 3.137646484375,
|
|
"num_input_tokens_seen": 3428843520,
|
|
"step": 6540,
|
|
"train_runtime": 29726.6035,
|
|
"train_tokens_per_second": 115345.957
|
|
},
|
|
{
|
|
"epoch": 0.35444681944857814,
|
|
"grad_norm": 0.17293353378772736,
|
|
"learning_rate": 0.003855994495044273,
|
|
"loss": 3.1470672607421877,
|
|
"num_input_tokens_seen": 3434086400,
|
|
"step": 6550,
|
|
"train_runtime": 29771.7425,
|
|
"train_tokens_per_second": 115347.175
|
|
},
|
|
{
|
|
"epoch": 0.3549879596309424,
|
|
"grad_norm": 0.18171222507953644,
|
|
"learning_rate": 0.0038525691939697267,
|
|
"loss": 3.1423971176147463,
|
|
"num_input_tokens_seen": 3439329280,
|
|
"step": 6560,
|
|
"train_runtime": 29816.873,
|
|
"train_tokens_per_second": 115348.423
|
|
},
|
|
{
|
|
"epoch": 0.35552909981330666,
|
|
"grad_norm": 0.17078061401844025,
|
|
"learning_rate": 0.0038491405268021523,
|
|
"loss": 3.1396827697753906,
|
|
"num_input_tokens_seen": 3444572160,
|
|
"step": 6570,
|
|
"train_runtime": 29862.0878,
|
|
"train_tokens_per_second": 115349.341
|
|
},
|
|
{
|
|
"epoch": 0.3560702399956709,
|
|
"grad_norm": 0.17867809534072876,
|
|
"learning_rate": 0.0038457085040091155,
|
|
"loss": 3.1499147415161133,
|
|
"num_input_tokens_seen": 3449815040,
|
|
"step": 6580,
|
|
"train_runtime": 29907.2427,
|
|
"train_tokens_per_second": 115350.488
|
|
},
|
|
{
|
|
"epoch": 0.3566113801780351,
|
|
"grad_norm": 0.15178236365318298,
|
|
"learning_rate": 0.003842273136068423,
|
|
"loss": 3.13470344543457,
|
|
"num_input_tokens_seen": 3455057920,
|
|
"step": 6590,
|
|
"train_runtime": 29952.42,
|
|
"train_tokens_per_second": 115351.545
|
|
},
|
|
{
|
|
"epoch": 0.35715252036039935,
|
|
"grad_norm": 0.17382913827896118,
|
|
"learning_rate": 0.0038388344334680936,
|
|
"loss": 3.1436153411865235,
|
|
"num_input_tokens_seen": 3460300800,
|
|
"step": 6600,
|
|
"train_runtime": 29997.6461,
|
|
"train_tokens_per_second": 115352.411
|
|
},
|
|
{
|
|
"epoch": 0.3576936605427636,
|
|
"grad_norm": 0.17544035613536835,
|
|
"learning_rate": 0.0038353924067063313,
|
|
"loss": 3.1381744384765624,
|
|
"num_input_tokens_seen": 3465543680,
|
|
"step": 6610,
|
|
"train_runtime": 30042.8233,
|
|
"train_tokens_per_second": 115353.462
|
|
},
|
|
{
|
|
"epoch": 0.35823480072512787,
|
|
"grad_norm": 0.15095841884613037,
|
|
"learning_rate": 0.003831947066291482,
|
|
"loss": 3.1344669342041014,
|
|
"num_input_tokens_seen": 3470786560,
|
|
"step": 6620,
|
|
"train_runtime": 30088.0009,
|
|
"train_tokens_per_second": 115354.509
|
|
},
|
|
{
|
|
"epoch": 0.3587759409074921,
|
|
"grad_norm": 0.16399560868740082,
|
|
"learning_rate": 0.0038284984227420146,
|
|
"loss": 3.134235382080078,
|
|
"num_input_tokens_seen": 3476029440,
|
|
"step": 6630,
|
|
"train_runtime": 30133.1894,
|
|
"train_tokens_per_second": 115355.51
|
|
},
|
|
{
|
|
"epoch": 0.3593170810898563,
|
|
"grad_norm": 0.18398840725421906,
|
|
"learning_rate": 0.003825046486586477,
|
|
"loss": 3.131580924987793,
|
|
"num_input_tokens_seen": 3481272320,
|
|
"step": 6640,
|
|
"train_runtime": 30178.3732,
|
|
"train_tokens_per_second": 115356.527
|
|
},
|
|
{
|
|
"epoch": 0.35985822127222056,
|
|
"grad_norm": 0.16813096404075623,
|
|
"learning_rate": 0.0038215912683634726,
|
|
"loss": 3.1448497772216797,
|
|
"num_input_tokens_seen": 3486515200,
|
|
"step": 6650,
|
|
"train_runtime": 30223.5423,
|
|
"train_tokens_per_second": 115357.596
|
|
},
|
|
{
|
|
"epoch": 0.3603993614545848,
|
|
"grad_norm": 0.1649860441684723,
|
|
"learning_rate": 0.003818132778621623,
|
|
"loss": 3.14077091217041,
|
|
"num_input_tokens_seen": 3491758080,
|
|
"step": 6660,
|
|
"train_runtime": 30268.7194,
|
|
"train_tokens_per_second": 115358.633
|
|
},
|
|
{
|
|
"epoch": 0.3609405016369491,
|
|
"grad_norm": 0.17575252056121826,
|
|
"learning_rate": 0.0038146710279195386,
|
|
"loss": 3.1330080032348633,
|
|
"num_input_tokens_seen": 3497000960,
|
|
"step": 6670,
|
|
"train_runtime": 30313.9788,
|
|
"train_tokens_per_second": 115359.352
|
|
},
|
|
{
|
|
"epoch": 0.3614816418193133,
|
|
"grad_norm": 0.1742008924484253,
|
|
"learning_rate": 0.003811206026825786,
|
|
"loss": 3.155079460144043,
|
|
"num_input_tokens_seen": 3502243840,
|
|
"step": 6680,
|
|
"train_runtime": 30359.1553,
|
|
"train_tokens_per_second": 115360.385
|
|
},
|
|
{
|
|
"epoch": 0.36202278200167753,
|
|
"grad_norm": 0.1799112856388092,
|
|
"learning_rate": 0.0038077377859188524,
|
|
"loss": 3.1288970947265624,
|
|
"num_input_tokens_seen": 3507486720,
|
|
"step": 6690,
|
|
"train_runtime": 30404.3262,
|
|
"train_tokens_per_second": 115361.436
|
|
},
|
|
{
|
|
"epoch": 0.36256392218404176,
|
|
"grad_norm": 0.16728277504444122,
|
|
"learning_rate": 0.003804266315787119,
|
|
"loss": 3.137259864807129,
|
|
"num_input_tokens_seen": 3512729600,
|
|
"step": 6700,
|
|
"train_runtime": 30449.5017,
|
|
"train_tokens_per_second": 115362.466
|
|
},
|
|
{
|
|
"epoch": 0.363105062366406,
|
|
"grad_norm": 0.1766940951347351,
|
|
"learning_rate": 0.0038007916270288234,
|
|
"loss": 3.1414379119873046,
|
|
"num_input_tokens_seen": 3517972480,
|
|
"step": 6710,
|
|
"train_runtime": 30494.6728,
|
|
"train_tokens_per_second": 115363.51
|
|
},
|
|
{
|
|
"epoch": 0.3636462025487703,
|
|
"grad_norm": 0.17950496077537537,
|
|
"learning_rate": 0.0037973137302520312,
|
|
"loss": 3.141128730773926,
|
|
"num_input_tokens_seen": 3523215360,
|
|
"step": 6720,
|
|
"train_runtime": 30539.8417,
|
|
"train_tokens_per_second": 115364.559
|
|
},
|
|
{
|
|
"epoch": 0.3641873427311345,
|
|
"grad_norm": 0.17668098211288452,
|
|
"learning_rate": 0.003793832636074601,
|
|
"loss": 3.1354911804199217,
|
|
"num_input_tokens_seen": 3528458240,
|
|
"step": 6730,
|
|
"train_runtime": 30585.0013,
|
|
"train_tokens_per_second": 115365.64
|
|
},
|
|
{
|
|
"epoch": 0.36472848291349874,
|
|
"grad_norm": 0.17323218286037445,
|
|
"learning_rate": 0.0037903483551241534,
|
|
"loss": 3.1416683197021484,
|
|
"num_input_tokens_seen": 3533701120,
|
|
"step": 6740,
|
|
"train_runtime": 30630.1549,
|
|
"train_tokens_per_second": 115366.74
|
|
},
|
|
{
|
|
"epoch": 0.36526962309586297,
|
|
"grad_norm": 0.1715293824672699,
|
|
"learning_rate": 0.003786860898038038,
|
|
"loss": 3.133253288269043,
|
|
"num_input_tokens_seen": 3538944000,
|
|
"step": 6750,
|
|
"train_runtime": 30675.3114,
|
|
"train_tokens_per_second": 115367.826
|
|
},
|
|
{
|
|
"epoch": 0.3658107632782272,
|
|
"grad_norm": 0.16131816804409027,
|
|
"learning_rate": 0.0037833702754633005,
|
|
"loss": 3.137991714477539,
|
|
"num_input_tokens_seen": 3544186880,
|
|
"step": 6760,
|
|
"train_runtime": 30720.4583,
|
|
"train_tokens_per_second": 115368.945
|
|
},
|
|
{
|
|
"epoch": 0.3663519034605915,
|
|
"grad_norm": 0.16405366361141205,
|
|
"learning_rate": 0.003779876498056652,
|
|
"loss": 3.149972152709961,
|
|
"num_input_tokens_seen": 3549429760,
|
|
"step": 6770,
|
|
"train_runtime": 30765.5763,
|
|
"train_tokens_per_second": 115370.17
|
|
},
|
|
{
|
|
"epoch": 0.3668930436429557,
|
|
"grad_norm": 0.1677146553993225,
|
|
"learning_rate": 0.0037763795764844317,
|
|
"loss": 3.1432748794555665,
|
|
"num_input_tokens_seen": 3554672640,
|
|
"step": 6780,
|
|
"train_runtime": 30810.7138,
|
|
"train_tokens_per_second": 115371.317
|
|
},
|
|
{
|
|
"epoch": 0.36743418382531995,
|
|
"grad_norm": 0.1701316237449646,
|
|
"learning_rate": 0.003772879521422583,
|
|
"loss": 3.138026809692383,
|
|
"num_input_tokens_seen": 3559915520,
|
|
"step": 6790,
|
|
"train_runtime": 30855.8357,
|
|
"train_tokens_per_second": 115372.52
|
|
},
|
|
{
|
|
"epoch": 0.3679753240076842,
|
|
"grad_norm": 0.1724764108657837,
|
|
"learning_rate": 0.0037693763435566125,
|
|
"loss": 3.1394069671630858,
|
|
"num_input_tokens_seen": 3565158400,
|
|
"step": 6800,
|
|
"train_runtime": 30900.9517,
|
|
"train_tokens_per_second": 115373.741
|
|
},
|
|
{
|
|
"epoch": 0.3685164641900484,
|
|
"grad_norm": 0.16157887876033783,
|
|
"learning_rate": 0.00376587005358156,
|
|
"loss": 3.124007797241211,
|
|
"num_input_tokens_seen": 3570401280,
|
|
"step": 6810,
|
|
"train_runtime": 30946.0772,
|
|
"train_tokens_per_second": 115374.923
|
|
},
|
|
{
|
|
"epoch": 0.3690576043724127,
|
|
"grad_norm": 0.16729003190994263,
|
|
"learning_rate": 0.0037623606622019675,
|
|
"loss": 3.122986602783203,
|
|
"num_input_tokens_seen": 3575644160,
|
|
"step": 6820,
|
|
"train_runtime": 30991.3846,
|
|
"train_tokens_per_second": 115375.425
|
|
},
|
|
{
|
|
"epoch": 0.3695987445547769,
|
|
"grad_norm": 0.17239217460155487,
|
|
"learning_rate": 0.003758848180131846,
|
|
"loss": 3.1259433746337892,
|
|
"num_input_tokens_seen": 3580887040,
|
|
"step": 6830,
|
|
"train_runtime": 31036.5265,
|
|
"train_tokens_per_second": 115376.54
|
|
},
|
|
{
|
|
"epoch": 0.37013988473714116,
|
|
"grad_norm": 0.1540314108133316,
|
|
"learning_rate": 0.003755332618094642,
|
|
"loss": 3.128913688659668,
|
|
"num_input_tokens_seen": 3586129920,
|
|
"step": 6840,
|
|
"train_runtime": 31081.6974,
|
|
"train_tokens_per_second": 115377.544
|
|
},
|
|
{
|
|
"epoch": 0.3706810249195054,
|
|
"grad_norm": 0.16670770943164825,
|
|
"learning_rate": 0.0037518139868232036,
|
|
"loss": 3.1437910079956053,
|
|
"num_input_tokens_seen": 3591372800,
|
|
"step": 6850,
|
|
"train_runtime": 31126.8444,
|
|
"train_tokens_per_second": 115378.634
|
|
},
|
|
{
|
|
"epoch": 0.3712221651018696,
|
|
"grad_norm": 0.16100816428661346,
|
|
"learning_rate": 0.0037482922970597512,
|
|
"loss": 3.1303838729858398,
|
|
"num_input_tokens_seen": 3596615680,
|
|
"step": 6860,
|
|
"train_runtime": 31172.0038,
|
|
"train_tokens_per_second": 115379.675
|
|
},
|
|
{
|
|
"epoch": 0.3717633052842339,
|
|
"grad_norm": 0.1720798909664154,
|
|
"learning_rate": 0.0037447675595558417,
|
|
"loss": 3.139808464050293,
|
|
"num_input_tokens_seen": 3601858560,
|
|
"step": 6870,
|
|
"train_runtime": 31220.5874,
|
|
"train_tokens_per_second": 115368.059
|
|
},
|
|
{
|
|
"epoch": 0.37230444546659813,
|
|
"grad_norm": 0.15832237899303436,
|
|
"learning_rate": 0.0037412397850723356,
|
|
"loss": 3.1387088775634764,
|
|
"num_input_tokens_seen": 3607101440,
|
|
"step": 6880,
|
|
"train_runtime": 31265.7548,
|
|
"train_tokens_per_second": 115369.082
|
|
},
|
|
{
|
|
"epoch": 0.37284558564896236,
|
|
"grad_norm": 0.16572092473506927,
|
|
"learning_rate": 0.0037377089843793664,
|
|
"loss": 3.136234092712402,
|
|
"num_input_tokens_seen": 3612344320,
|
|
"step": 6890,
|
|
"train_runtime": 31310.8828,
|
|
"train_tokens_per_second": 115370.248
|
|
},
|
|
{
|
|
"epoch": 0.3733867258313266,
|
|
"grad_norm": 0.16967612504959106,
|
|
"learning_rate": 0.0037341751682563075,
|
|
"loss": 3.1306957244873046,
|
|
"num_input_tokens_seen": 3617587200,
|
|
"step": 6900,
|
|
"train_runtime": 31356.0169,
|
|
"train_tokens_per_second": 115371.388
|
|
},
|
|
{
|
|
"epoch": 0.3739278660136908,
|
|
"grad_norm": 0.16561359167099,
|
|
"learning_rate": 0.0037306383474917356,
|
|
"loss": 3.128021240234375,
|
|
"num_input_tokens_seen": 3622830080,
|
|
"step": 6910,
|
|
"train_runtime": 31401.1695,
|
|
"train_tokens_per_second": 115372.457
|
|
},
|
|
{
|
|
"epoch": 0.3744690061960551,
|
|
"grad_norm": 0.16602273285388947,
|
|
"learning_rate": 0.0037270985328834013,
|
|
"loss": 3.125231170654297,
|
|
"num_input_tokens_seen": 3628072960,
|
|
"step": 6920,
|
|
"train_runtime": 31446.3403,
|
|
"train_tokens_per_second": 115373.456
|
|
},
|
|
{
|
|
"epoch": 0.37501014637841934,
|
|
"grad_norm": 0.15461350977420807,
|
|
"learning_rate": 0.0037235557352381975,
|
|
"loss": 3.1283363342285155,
|
|
"num_input_tokens_seen": 3633315840,
|
|
"step": 6930,
|
|
"train_runtime": 31491.4936,
|
|
"train_tokens_per_second": 115374.516
|
|
},
|
|
{
|
|
"epoch": 0.37555128656078357,
|
|
"grad_norm": 0.17157427966594696,
|
|
"learning_rate": 0.003720009965372121,
|
|
"loss": 3.136751174926758,
|
|
"num_input_tokens_seen": 3638558720,
|
|
"step": 6940,
|
|
"train_runtime": 31536.6325,
|
|
"train_tokens_per_second": 115375.626
|
|
},
|
|
{
|
|
"epoch": 0.3760924267431478,
|
|
"grad_norm": 0.15815427899360657,
|
|
"learning_rate": 0.0037164612341102445,
|
|
"loss": 3.1335182189941406,
|
|
"num_input_tokens_seen": 3643801600,
|
|
"step": 6950,
|
|
"train_runtime": 31581.7854,
|
|
"train_tokens_per_second": 115376.682
|
|
},
|
|
{
|
|
"epoch": 0.37663356692551203,
|
|
"grad_norm": 0.16368745267391205,
|
|
"learning_rate": 0.003712909552286681,
|
|
"loss": 3.1299674987792967,
|
|
"num_input_tokens_seen": 3649044480,
|
|
"step": 6960,
|
|
"train_runtime": 31626.953,
|
|
"train_tokens_per_second": 115377.681
|
|
},
|
|
{
|
|
"epoch": 0.3771747071078763,
|
|
"grad_norm": 0.17233121395111084,
|
|
"learning_rate": 0.003709354930744553,
|
|
"loss": 3.1409616470336914,
|
|
"num_input_tokens_seen": 3654287360,
|
|
"step": 6970,
|
|
"train_runtime": 31672.1101,
|
|
"train_tokens_per_second": 115378.715
|
|
},
|
|
{
|
|
"epoch": 0.37771584729024055,
|
|
"grad_norm": 0.1784183382987976,
|
|
"learning_rate": 0.0037057973803359553,
|
|
"loss": 3.1445953369140627,
|
|
"num_input_tokens_seen": 3659530240,
|
|
"step": 6980,
|
|
"train_runtime": 31717.2675,
|
|
"train_tokens_per_second": 115379.745
|
|
},
|
|
{
|
|
"epoch": 0.3782569874726048,
|
|
"grad_norm": 0.1589273363351822,
|
|
"learning_rate": 0.003702236911921925,
|
|
"loss": 3.1336727142333984,
|
|
"num_input_tokens_seen": 3664773120,
|
|
"step": 6990,
|
|
"train_runtime": 31762.4428,
|
|
"train_tokens_per_second": 115380.707
|
|
},
|
|
{
|
|
"epoch": 0.378798127654969,
|
|
"grad_norm": 0.16604717075824738,
|
|
"learning_rate": 0.00369867353637241,
|
|
"loss": 3.125100326538086,
|
|
"num_input_tokens_seen": 3670016000,
|
|
"step": 7000,
|
|
"train_runtime": 31807.6091,
|
|
"train_tokens_per_second": 115381.7
|
|
},
|
|
{
|
|
"epoch": 0.378798127654969,
|
|
"eval_loss": 3.082562208175659,
|
|
"eval_runtime": 1.983,
|
|
"eval_samples_per_second": 252.143,
|
|
"eval_steps_per_second": 4.034,
|
|
"num_input_tokens_seen": 3670016000,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.37933926783733324,
|
|
"grad_norm": 0.16016067564487457,
|
|
"learning_rate": 0.003695107264566231,
|
|
"loss": 3.132742691040039,
|
|
"num_input_tokens_seen": 3675258880,
|
|
"step": 7010,
|
|
"train_runtime": 31857.0893,
|
|
"train_tokens_per_second": 115367.065
|
|
},
|
|
{
|
|
"epoch": 0.3798804080196975,
|
|
"grad_norm": 0.17284226417541504,
|
|
"learning_rate": 0.003691538107391052,
|
|
"loss": 3.1309505462646485,
|
|
"num_input_tokens_seen": 3680501760,
|
|
"step": 7020,
|
|
"train_runtime": 31902.2704,
|
|
"train_tokens_per_second": 115368.02
|
|
},
|
|
{
|
|
"epoch": 0.38042154820206175,
|
|
"grad_norm": 0.16180108487606049,
|
|
"learning_rate": 0.0036879660757433465,
|
|
"loss": 3.1276824951171873,
|
|
"num_input_tokens_seen": 3685744640,
|
|
"step": 7030,
|
|
"train_runtime": 31947.4422,
|
|
"train_tokens_per_second": 115369.006
|
|
},
|
|
{
|
|
"epoch": 0.380962688384426,
|
|
"grad_norm": 0.16350635886192322,
|
|
"learning_rate": 0.0036843911805283613,
|
|
"loss": 3.127395248413086,
|
|
"num_input_tokens_seen": 3690987520,
|
|
"step": 7040,
|
|
"train_runtime": 31992.5853,
|
|
"train_tokens_per_second": 115370.092
|
|
},
|
|
{
|
|
"epoch": 0.3815038285667902,
|
|
"grad_norm": 0.15854142606258392,
|
|
"learning_rate": 0.0036808134326600872,
|
|
"loss": 3.1203243255615236,
|
|
"num_input_tokens_seen": 3696230400,
|
|
"step": 7050,
|
|
"train_runtime": 32037.7375,
|
|
"train_tokens_per_second": 115371.143
|
|
},
|
|
{
|
|
"epoch": 0.38204496874915445,
|
|
"grad_norm": 0.1765364557504654,
|
|
"learning_rate": 0.0036772328430612245,
|
|
"loss": 3.1236772537231445,
|
|
"num_input_tokens_seen": 3701473280,
|
|
"step": 7060,
|
|
"train_runtime": 32082.8987,
|
|
"train_tokens_per_second": 115372.159
|
|
},
|
|
{
|
|
"epoch": 0.38258610893151873,
|
|
"grad_norm": 0.16590341925621033,
|
|
"learning_rate": 0.0036736494226631486,
|
|
"loss": 3.1179275512695312,
|
|
"num_input_tokens_seen": 3706716160,
|
|
"step": 7070,
|
|
"train_runtime": 32128.0461,
|
|
"train_tokens_per_second": 115373.221
|
|
},
|
|
{
|
|
"epoch": 0.38312724911388296,
|
|
"grad_norm": 0.1656789630651474,
|
|
"learning_rate": 0.0036700631824058763,
|
|
"loss": 3.1220640182495116,
|
|
"num_input_tokens_seen": 3711959040,
|
|
"step": 7080,
|
|
"train_runtime": 32173.2014,
|
|
"train_tokens_per_second": 115374.252
|
|
},
|
|
{
|
|
"epoch": 0.3836683892962472,
|
|
"grad_norm": 0.18290071189403534,
|
|
"learning_rate": 0.003666474133238036,
|
|
"loss": 3.130259704589844,
|
|
"num_input_tokens_seen": 3717201920,
|
|
"step": 7090,
|
|
"train_runtime": 32218.3695,
|
|
"train_tokens_per_second": 115375.234
|
|
},
|
|
{
|
|
"epoch": 0.3842095294786114,
|
|
"grad_norm": 0.1678554117679596,
|
|
"learning_rate": 0.003662882286116827,
|
|
"loss": 3.128999137878418,
|
|
"num_input_tokens_seen": 3722444800,
|
|
"step": 7100,
|
|
"train_runtime": 32263.5278,
|
|
"train_tokens_per_second": 115376.248
|
|
},
|
|
{
|
|
"epoch": 0.38475066966097565,
|
|
"grad_norm": 0.16328170895576477,
|
|
"learning_rate": 0.0036592876520079956,
|
|
"loss": 3.1096935272216797,
|
|
"num_input_tokens_seen": 3727687680,
|
|
"step": 7110,
|
|
"train_runtime": 32308.6892,
|
|
"train_tokens_per_second": 115377.249
|
|
},
|
|
{
|
|
"epoch": 0.38529180984333994,
|
|
"grad_norm": 0.16377384960651398,
|
|
"learning_rate": 0.0036556902418857927,
|
|
"loss": 3.1283496856689452,
|
|
"num_input_tokens_seen": 3732930560,
|
|
"step": 7120,
|
|
"train_runtime": 32353.8348,
|
|
"train_tokens_per_second": 115378.303
|
|
},
|
|
{
|
|
"epoch": 0.38583295002570417,
|
|
"grad_norm": 0.17365527153015137,
|
|
"learning_rate": 0.0036520900667329475,
|
|
"loss": 3.1340274810791016,
|
|
"num_input_tokens_seen": 3738173440,
|
|
"step": 7130,
|
|
"train_runtime": 32398.9948,
|
|
"train_tokens_per_second": 115379.303
|
|
},
|
|
{
|
|
"epoch": 0.3863740902080684,
|
|
"grad_norm": 0.17289578914642334,
|
|
"learning_rate": 0.003648487137540628,
|
|
"loss": 3.126075553894043,
|
|
"num_input_tokens_seen": 3743416320,
|
|
"step": 7140,
|
|
"train_runtime": 32444.1388,
|
|
"train_tokens_per_second": 115380.357
|
|
},
|
|
{
|
|
"epoch": 0.38691523039043263,
|
|
"grad_norm": 0.1867065280675888,
|
|
"learning_rate": 0.003644881465308411,
|
|
"loss": 3.1279239654541016,
|
|
"num_input_tokens_seen": 3748659200,
|
|
"step": 7150,
|
|
"train_runtime": 32489.3038,
|
|
"train_tokens_per_second": 115381.334
|
|
},
|
|
{
|
|
"epoch": 0.38745637057279686,
|
|
"grad_norm": 0.16090157628059387,
|
|
"learning_rate": 0.003641273061044249,
|
|
"loss": 3.126418685913086,
|
|
"num_input_tokens_seen": 3753902080,
|
|
"step": 7160,
|
|
"train_runtime": 32534.4706,
|
|
"train_tokens_per_second": 115382.301
|
|
},
|
|
{
|
|
"epoch": 0.38799751075516115,
|
|
"grad_norm": 0.16933725774288177,
|
|
"learning_rate": 0.003637661935764434,
|
|
"loss": 3.1228607177734373,
|
|
"num_input_tokens_seen": 3759144960,
|
|
"step": 7170,
|
|
"train_runtime": 32579.6304,
|
|
"train_tokens_per_second": 115383.29
|
|
},
|
|
{
|
|
"epoch": 0.3885386509375254,
|
|
"grad_norm": 0.16463743150234222,
|
|
"learning_rate": 0.003634048100493565,
|
|
"loss": 3.1265775680541994,
|
|
"num_input_tokens_seen": 3764387840,
|
|
"step": 7180,
|
|
"train_runtime": 32624.7971,
|
|
"train_tokens_per_second": 115384.253
|
|
},
|
|
{
|
|
"epoch": 0.3890797911198896,
|
|
"grad_norm": 0.15814442932605743,
|
|
"learning_rate": 0.003630431566264515,
|
|
"loss": 3.126376724243164,
|
|
"num_input_tokens_seen": 3769630720,
|
|
"step": 7190,
|
|
"train_runtime": 32669.9527,
|
|
"train_tokens_per_second": 115385.252
|
|
},
|
|
{
|
|
"epoch": 0.38962093130225384,
|
|
"grad_norm": 0.16953812539577484,
|
|
"learning_rate": 0.0036268123441183966,
|
|
"loss": 3.1293899536132814,
|
|
"num_input_tokens_seen": 3774873600,
|
|
"step": 7200,
|
|
"train_runtime": 32715.1316,
|
|
"train_tokens_per_second": 115386.166
|
|
},
|
|
{
|
|
"epoch": 0.39016207148461807,
|
|
"grad_norm": 0.18077914416790009,
|
|
"learning_rate": 0.003623190445104527,
|
|
"loss": 3.130533218383789,
|
|
"num_input_tokens_seen": 3780116480,
|
|
"step": 7210,
|
|
"train_runtime": 32760.3295,
|
|
"train_tokens_per_second": 115387.01
|
|
},
|
|
{
|
|
"epoch": 0.39070321166698235,
|
|
"grad_norm": 0.17073588073253632,
|
|
"learning_rate": 0.003619565880280401,
|
|
"loss": 3.1266639709472654,
|
|
"num_input_tokens_seen": 3785359360,
|
|
"step": 7220,
|
|
"train_runtime": 32805.4983,
|
|
"train_tokens_per_second": 115387.955
|
|
},
|
|
{
|
|
"epoch": 0.3912443518493466,
|
|
"grad_norm": 0.16945651173591614,
|
|
"learning_rate": 0.0036159386607116446,
|
|
"loss": 3.1234695434570314,
|
|
"num_input_tokens_seen": 3790602240,
|
|
"step": 7230,
|
|
"train_runtime": 32850.6502,
|
|
"train_tokens_per_second": 115388.956
|
|
},
|
|
{
|
|
"epoch": 0.3917854920317108,
|
|
"grad_norm": 0.17761710286140442,
|
|
"learning_rate": 0.0036123087974719937,
|
|
"loss": 3.127792739868164,
|
|
"num_input_tokens_seen": 3795845120,
|
|
"step": 7240,
|
|
"train_runtime": 32895.8256,
|
|
"train_tokens_per_second": 115389.873
|
|
},
|
|
{
|
|
"epoch": 0.39232663221407504,
|
|
"grad_norm": 0.16878648102283478,
|
|
"learning_rate": 0.0036086763016432545,
|
|
"loss": 3.120273208618164,
|
|
"num_input_tokens_seen": 3801088000,
|
|
"step": 7250,
|
|
"train_runtime": 32945.144,
|
|
"train_tokens_per_second": 115376.275
|
|
},
|
|
{
|
|
"epoch": 0.3928677723964393,
|
|
"grad_norm": 0.15386980772018433,
|
|
"learning_rate": 0.0036050411843152686,
|
|
"loss": 3.1222068786621096,
|
|
"num_input_tokens_seen": 3806330880,
|
|
"step": 7260,
|
|
"train_runtime": 32990.288,
|
|
"train_tokens_per_second": 115377.316
|
|
},
|
|
{
|
|
"epoch": 0.39340891257880356,
|
|
"grad_norm": 0.16980594396591187,
|
|
"learning_rate": 0.0036014034565858824,
|
|
"loss": 3.1281028747558595,
|
|
"num_input_tokens_seen": 3811573760,
|
|
"step": 7270,
|
|
"train_runtime": 33035.4429,
|
|
"train_tokens_per_second": 115378.316
|
|
},
|
|
{
|
|
"epoch": 0.3939500527611678,
|
|
"grad_norm": 0.17536021769046783,
|
|
"learning_rate": 0.003597763129560911,
|
|
"loss": 3.1235652923583985,
|
|
"num_input_tokens_seen": 3816816640,
|
|
"step": 7280,
|
|
"train_runtime": 33080.605,
|
|
"train_tokens_per_second": 115379.288
|
|
},
|
|
{
|
|
"epoch": 0.394491192943532,
|
|
"grad_norm": 0.1680123209953308,
|
|
"learning_rate": 0.0035941202143541053,
|
|
"loss": 3.123764991760254,
|
|
"num_input_tokens_seen": 3822059520,
|
|
"step": 7290,
|
|
"train_runtime": 33125.7503,
|
|
"train_tokens_per_second": 115380.315
|
|
},
|
|
{
|
|
"epoch": 0.39503233312589625,
|
|
"grad_norm": 0.15840236842632294,
|
|
"learning_rate": 0.003590474722087118,
|
|
"loss": 3.124995803833008,
|
|
"num_input_tokens_seen": 3827302400,
|
|
"step": 7300,
|
|
"train_runtime": 33170.9067,
|
|
"train_tokens_per_second": 115381.302
|
|
},
|
|
{
|
|
"epoch": 0.3955734733082605,
|
|
"grad_norm": 0.1702660471200943,
|
|
"learning_rate": 0.00358682666388947,
|
|
"loss": 3.1230545043945312,
|
|
"num_input_tokens_seen": 3832545280,
|
|
"step": 7310,
|
|
"train_runtime": 33216.0627,
|
|
"train_tokens_per_second": 115382.287
|
|
},
|
|
{
|
|
"epoch": 0.39611461349062477,
|
|
"grad_norm": 0.14530692994594574,
|
|
"learning_rate": 0.003583176050898514,
|
|
"loss": 3.1195556640625,
|
|
"num_input_tokens_seen": 3837788160,
|
|
"step": 7320,
|
|
"train_runtime": 33261.2169,
|
|
"train_tokens_per_second": 115383.276
|
|
},
|
|
{
|
|
"epoch": 0.396655753672989,
|
|
"grad_norm": 0.16137973964214325,
|
|
"learning_rate": 0.003579522894259404,
|
|
"loss": 3.122934341430664,
|
|
"num_input_tokens_seen": 3843031040,
|
|
"step": 7330,
|
|
"train_runtime": 33306.3711,
|
|
"train_tokens_per_second": 115384.262
|
|
},
|
|
{
|
|
"epoch": 0.39719689385535323,
|
|
"grad_norm": 0.17957496643066406,
|
|
"learning_rate": 0.0035758672051250597,
|
|
"loss": 3.118304443359375,
|
|
"num_input_tokens_seen": 3848273920,
|
|
"step": 7340,
|
|
"train_runtime": 33351.4951,
|
|
"train_tokens_per_second": 115385.35
|
|
},
|
|
{
|
|
"epoch": 0.39773803403771746,
|
|
"grad_norm": 0.1619359254837036,
|
|
"learning_rate": 0.003572208994656131,
|
|
"loss": 3.126445007324219,
|
|
"num_input_tokens_seen": 3853516800,
|
|
"step": 7350,
|
|
"train_runtime": 33396.6238,
|
|
"train_tokens_per_second": 115386.418
|
|
},
|
|
{
|
|
"epoch": 0.3982791742200817,
|
|
"grad_norm": 0.17734915018081665,
|
|
"learning_rate": 0.003568548274020967,
|
|
"loss": 3.1167884826660157,
|
|
"num_input_tokens_seen": 3858759680,
|
|
"step": 7360,
|
|
"train_runtime": 33441.7562,
|
|
"train_tokens_per_second": 115387.471
|
|
},
|
|
{
|
|
"epoch": 0.398820314402446,
|
|
"grad_norm": 0.17586900293827057,
|
|
"learning_rate": 0.0035648850543955773,
|
|
"loss": 3.1228519439697267,
|
|
"num_input_tokens_seen": 3864002560,
|
|
"step": 7370,
|
|
"train_runtime": 33486.9063,
|
|
"train_tokens_per_second": 115388.46
|
|
},
|
|
{
|
|
"epoch": 0.3993614545848102,
|
|
"grad_norm": 0.17276950180530548,
|
|
"learning_rate": 0.0035612193469636054,
|
|
"loss": 3.1270915985107424,
|
|
"num_input_tokens_seen": 3869245440,
|
|
"step": 7380,
|
|
"train_runtime": 33532.0567,
|
|
"train_tokens_per_second": 115389.446
|
|
},
|
|
{
|
|
"epoch": 0.39990259476717444,
|
|
"grad_norm": 0.1578545719385147,
|
|
"learning_rate": 0.0035575511629162876,
|
|
"loss": 3.102129364013672,
|
|
"num_input_tokens_seen": 3874488320,
|
|
"step": 7390,
|
|
"train_runtime": 33577.2022,
|
|
"train_tokens_per_second": 115390.445
|
|
},
|
|
{
|
|
"epoch": 0.40044373494953867,
|
|
"grad_norm": 0.15498770773410797,
|
|
"learning_rate": 0.0035538805134524183,
|
|
"loss": 3.115239715576172,
|
|
"num_input_tokens_seen": 3879731200,
|
|
"step": 7400,
|
|
"train_runtime": 33622.363,
|
|
"train_tokens_per_second": 115391.39
|
|
},
|
|
{
|
|
"epoch": 0.4009848751319029,
|
|
"grad_norm": 0.15868115425109863,
|
|
"learning_rate": 0.0035502074097783242,
|
|
"loss": 3.1181896209716795,
|
|
"num_input_tokens_seen": 3884974080,
|
|
"step": 7410,
|
|
"train_runtime": 33667.5163,
|
|
"train_tokens_per_second": 115392.358
|
|
},
|
|
{
|
|
"epoch": 0.4015260153142672,
|
|
"grad_norm": 0.1605597585439682,
|
|
"learning_rate": 0.0035465318631078204,
|
|
"loss": 3.113156318664551,
|
|
"num_input_tokens_seen": 3890216960,
|
|
"step": 7420,
|
|
"train_runtime": 33712.6623,
|
|
"train_tokens_per_second": 115393.348
|
|
},
|
|
{
|
|
"epoch": 0.4020671554966314,
|
|
"grad_norm": 0.17280755937099457,
|
|
"learning_rate": 0.003542853884662183,
|
|
"loss": 3.1183053970336916,
|
|
"num_input_tokens_seen": 3895459840,
|
|
"step": 7430,
|
|
"train_runtime": 33757.8255,
|
|
"train_tokens_per_second": 115394.276
|
|
},
|
|
{
|
|
"epoch": 0.40260829567899564,
|
|
"grad_norm": 0.16187331080436707,
|
|
"learning_rate": 0.0035391734856701092,
|
|
"loss": 3.1163970947265627,
|
|
"num_input_tokens_seen": 3900702720,
|
|
"step": 7440,
|
|
"train_runtime": 33802.989,
|
|
"train_tokens_per_second": 115395.201
|
|
},
|
|
{
|
|
"epoch": 0.4031494358613599,
|
|
"grad_norm": 0.1724129021167755,
|
|
"learning_rate": 0.0035354906773676894,
|
|
"loss": 3.1170070648193358,
|
|
"num_input_tokens_seen": 3905945600,
|
|
"step": 7450,
|
|
"train_runtime": 33848.1517,
|
|
"train_tokens_per_second": 115396.127
|
|
},
|
|
{
|
|
"epoch": 0.4036905760437241,
|
|
"grad_norm": 0.17225228250026703,
|
|
"learning_rate": 0.003531805470998366,
|
|
"loss": 3.110821533203125,
|
|
"num_input_tokens_seen": 3911188480,
|
|
"step": 7460,
|
|
"train_runtime": 33893.3266,
|
|
"train_tokens_per_second": 115397.008
|
|
},
|
|
{
|
|
"epoch": 0.4042317162260884,
|
|
"grad_norm": 0.1592818796634674,
|
|
"learning_rate": 0.0035281178778129073,
|
|
"loss": 3.116873931884766,
|
|
"num_input_tokens_seen": 3916431360,
|
|
"step": 7470,
|
|
"train_runtime": 33938.5013,
|
|
"train_tokens_per_second": 115397.888
|
|
},
|
|
{
|
|
"epoch": 0.4047728564084526,
|
|
"grad_norm": 0.1658582091331482,
|
|
"learning_rate": 0.0035244279090693633,
|
|
"loss": 3.1268436431884767,
|
|
"num_input_tokens_seen": 3921674240,
|
|
"step": 7480,
|
|
"train_runtime": 33983.671,
|
|
"train_tokens_per_second": 115398.782
|
|
},
|
|
{
|
|
"epoch": 0.40531399659081685,
|
|
"grad_norm": 0.14836189150810242,
|
|
"learning_rate": 0.00352073557603304,
|
|
"loss": 3.114876556396484,
|
|
"num_input_tokens_seen": 3926917120,
|
|
"step": 7490,
|
|
"train_runtime": 34028.8257,
|
|
"train_tokens_per_second": 115399.725
|
|
},
|
|
{
|
|
"epoch": 0.4058551367731811,
|
|
"grad_norm": 0.16045086085796356,
|
|
"learning_rate": 0.0035170408899764605,
|
|
"loss": 3.1156852722167967,
|
|
"num_input_tokens_seen": 3932160000,
|
|
"step": 7500,
|
|
"train_runtime": 34073.9726,
|
|
"train_tokens_per_second": 115400.692
|
|
},
|
|
{
|
|
"epoch": 0.4058551367731811,
|
|
"eval_loss": 3.0682461261749268,
|
|
"eval_runtime": 1.9852,
|
|
"eval_samples_per_second": 251.858,
|
|
"eval_steps_per_second": 4.03,
|
|
"num_input_tokens_seen": 3932160000,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 0.4063962769555453,
|
|
"grad_norm": 0.16971535980701447,
|
|
"learning_rate": 0.0035133438621793296,
|
|
"loss": 3.1024160385131836,
|
|
"num_input_tokens_seen": 3937402880,
|
|
"step": 7510,
|
|
"train_runtime": 34121.1044,
|
|
"train_tokens_per_second": 115394.943
|
|
},
|
|
{
|
|
"epoch": 0.4069374171379096,
|
|
"grad_norm": 0.16736076772212982,
|
|
"learning_rate": 0.003509644503928506,
|
|
"loss": 3.1206098556518556,
|
|
"num_input_tokens_seen": 3942645760,
|
|
"step": 7520,
|
|
"train_runtime": 34166.2706,
|
|
"train_tokens_per_second": 115395.848
|
|
},
|
|
{
|
|
"epoch": 0.4074785573202738,
|
|
"grad_norm": 0.16113705933094025,
|
|
"learning_rate": 0.0035059428265179567,
|
|
"loss": 3.117937469482422,
|
|
"num_input_tokens_seen": 3947888640,
|
|
"step": 7530,
|
|
"train_runtime": 34211.4099,
|
|
"train_tokens_per_second": 115396.841
|
|
},
|
|
{
|
|
"epoch": 0.40801969750263806,
|
|
"grad_norm": 0.17517107725143433,
|
|
"learning_rate": 0.0035022388412487356,
|
|
"loss": 3.1136932373046875,
|
|
"num_input_tokens_seen": 3953131520,
|
|
"step": 7540,
|
|
"train_runtime": 34256.533,
|
|
"train_tokens_per_second": 115397.887
|
|
},
|
|
{
|
|
"epoch": 0.4085608376850023,
|
|
"grad_norm": 0.18709343671798706,
|
|
"learning_rate": 0.003498532559428938,
|
|
"loss": 3.125676918029785,
|
|
"num_input_tokens_seen": 3958374400,
|
|
"step": 7550,
|
|
"train_runtime": 34301.6505,
|
|
"train_tokens_per_second": 115398.949
|
|
},
|
|
{
|
|
"epoch": 0.4091019778673665,
|
|
"grad_norm": 0.1633439064025879,
|
|
"learning_rate": 0.0034948239923736713,
|
|
"loss": 3.1128585815429686,
|
|
"num_input_tokens_seen": 3963617280,
|
|
"step": 7560,
|
|
"train_runtime": 34346.7672,
|
|
"train_tokens_per_second": 115400.01
|
|
},
|
|
{
|
|
"epoch": 0.4096431180497308,
|
|
"grad_norm": 0.16776174306869507,
|
|
"learning_rate": 0.0034911131514050214,
|
|
"loss": 3.114968681335449,
|
|
"num_input_tokens_seen": 3968860160,
|
|
"step": 7570,
|
|
"train_runtime": 34391.881,
|
|
"train_tokens_per_second": 115401.078
|
|
},
|
|
{
|
|
"epoch": 0.41018425823209503,
|
|
"grad_norm": 0.17015814781188965,
|
|
"learning_rate": 0.0034874000478520148,
|
|
"loss": 3.1098609924316407,
|
|
"num_input_tokens_seen": 3974103040,
|
|
"step": 7580,
|
|
"train_runtime": 34437.0464,
|
|
"train_tokens_per_second": 115401.971
|
|
},
|
|
{
|
|
"epoch": 0.41072539841445926,
|
|
"grad_norm": 0.1705334633588791,
|
|
"learning_rate": 0.0034836846930505843,
|
|
"loss": 3.1172601699829103,
|
|
"num_input_tokens_seen": 3979345920,
|
|
"step": 7590,
|
|
"train_runtime": 34482.2174,
|
|
"train_tokens_per_second": 115402.843
|
|
},
|
|
{
|
|
"epoch": 0.4112665385968235,
|
|
"grad_norm": 0.17283746600151062,
|
|
"learning_rate": 0.0034799670983435395,
|
|
"loss": 3.1093212127685548,
|
|
"num_input_tokens_seen": 3984588800,
|
|
"step": 7600,
|
|
"train_runtime": 34527.3958,
|
|
"train_tokens_per_second": 115403.688
|
|
},
|
|
{
|
|
"epoch": 0.4118076787791877,
|
|
"grad_norm": 0.1661679744720459,
|
|
"learning_rate": 0.003476247275080524,
|
|
"loss": 3.114109992980957,
|
|
"num_input_tokens_seen": 3989831680,
|
|
"step": 7610,
|
|
"train_runtime": 34572.5667,
|
|
"train_tokens_per_second": 115404.555
|
|
},
|
|
{
|
|
"epoch": 0.412348818961552,
|
|
"grad_norm": 0.16221173107624054,
|
|
"learning_rate": 0.003472525234617988,
|
|
"loss": 3.1130563735961916,
|
|
"num_input_tokens_seen": 3995074560,
|
|
"step": 7620,
|
|
"train_runtime": 34617.7625,
|
|
"train_tokens_per_second": 115405.337
|
|
},
|
|
{
|
|
"epoch": 0.41288995914391624,
|
|
"grad_norm": 0.16985613107681274,
|
|
"learning_rate": 0.0034688009883191507,
|
|
"loss": 3.1183204650878906,
|
|
"num_input_tokens_seen": 4000317440,
|
|
"step": 7630,
|
|
"train_runtime": 34662.948,
|
|
"train_tokens_per_second": 115406.152
|
|
},
|
|
{
|
|
"epoch": 0.41343109932628047,
|
|
"grad_norm": 0.15718990564346313,
|
|
"learning_rate": 0.003465074547553963,
|
|
"loss": 3.1192548751831053,
|
|
"num_input_tokens_seen": 4005560320,
|
|
"step": 7640,
|
|
"train_runtime": 34711.7967,
|
|
"train_tokens_per_second": 115394.785
|
|
},
|
|
{
|
|
"epoch": 0.4139722395086447,
|
|
"grad_norm": 0.16134141385555267,
|
|
"learning_rate": 0.0034613459236990775,
|
|
"loss": 3.1101545333862304,
|
|
"num_input_tokens_seen": 4010803200,
|
|
"step": 7650,
|
|
"train_runtime": 34756.9452,
|
|
"train_tokens_per_second": 115395.734
|
|
},
|
|
{
|
|
"epoch": 0.41451337969100893,
|
|
"grad_norm": 0.16892403364181519,
|
|
"learning_rate": 0.0034576151281378127,
|
|
"loss": 3.103810691833496,
|
|
"num_input_tokens_seen": 4016046080,
|
|
"step": 7660,
|
|
"train_runtime": 34802.1069,
|
|
"train_tokens_per_second": 115396.637
|
|
},
|
|
{
|
|
"epoch": 0.4150545198733732,
|
|
"grad_norm": 0.15722833573818207,
|
|
"learning_rate": 0.003453882172260114,
|
|
"loss": 3.109886360168457,
|
|
"num_input_tokens_seen": 4021288960,
|
|
"step": 7670,
|
|
"train_runtime": 34847.275,
|
|
"train_tokens_per_second": 115397.516
|
|
},
|
|
{
|
|
"epoch": 0.41559566005573745,
|
|
"grad_norm": 0.16605538129806519,
|
|
"learning_rate": 0.0034501470674625258,
|
|
"loss": 3.110805892944336,
|
|
"num_input_tokens_seen": 4026531840,
|
|
"step": 7680,
|
|
"train_runtime": 34892.47,
|
|
"train_tokens_per_second": 115398.303
|
|
},
|
|
{
|
|
"epoch": 0.4161368002381017,
|
|
"grad_norm": 0.1643964648246765,
|
|
"learning_rate": 0.003446409825148149,
|
|
"loss": 3.11865348815918,
|
|
"num_input_tokens_seen": 4031774720,
|
|
"step": 7690,
|
|
"train_runtime": 34937.6366,
|
|
"train_tokens_per_second": 115399.183
|
|
},
|
|
{
|
|
"epoch": 0.4166779404204659,
|
|
"grad_norm": 0.17231661081314087,
|
|
"learning_rate": 0.003442670456726614,
|
|
"loss": 3.117427444458008,
|
|
"num_input_tokens_seen": 4037017600,
|
|
"step": 7700,
|
|
"train_runtime": 34982.8067,
|
|
"train_tokens_per_second": 115400.049
|
|
},
|
|
{
|
|
"epoch": 0.41721908060283014,
|
|
"grad_norm": 0.16913042962551117,
|
|
"learning_rate": 0.0034389289736140405,
|
|
"loss": 3.1114864349365234,
|
|
"num_input_tokens_seen": 4042260480,
|
|
"step": 7710,
|
|
"train_runtime": 35027.9883,
|
|
"train_tokens_per_second": 115400.874
|
|
},
|
|
{
|
|
"epoch": 0.4177602207851944,
|
|
"grad_norm": 0.16182249784469604,
|
|
"learning_rate": 0.0034351853872330042,
|
|
"loss": 3.107219696044922,
|
|
"num_input_tokens_seen": 4047503360,
|
|
"step": 7720,
|
|
"train_runtime": 35073.1627,
|
|
"train_tokens_per_second": 115401.722
|
|
},
|
|
{
|
|
"epoch": 0.41830136096755866,
|
|
"grad_norm": 0.15614280104637146,
|
|
"learning_rate": 0.003431439709012501,
|
|
"loss": 3.10361385345459,
|
|
"num_input_tokens_seen": 4052746240,
|
|
"step": 7730,
|
|
"train_runtime": 35118.3339,
|
|
"train_tokens_per_second": 115402.577
|
|
},
|
|
{
|
|
"epoch": 0.4188425011499229,
|
|
"grad_norm": 0.16172853112220764,
|
|
"learning_rate": 0.003427691950387916,
|
|
"loss": 3.10665225982666,
|
|
"num_input_tokens_seen": 4057989120,
|
|
"step": 7740,
|
|
"train_runtime": 35163.5186,
|
|
"train_tokens_per_second": 115403.386
|
|
},
|
|
{
|
|
"epoch": 0.4193836413322871,
|
|
"grad_norm": 0.1584424078464508,
|
|
"learning_rate": 0.0034239421228009826,
|
|
"loss": 3.109303665161133,
|
|
"num_input_tokens_seen": 4063232000,
|
|
"step": 7750,
|
|
"train_runtime": 35208.6903,
|
|
"train_tokens_per_second": 115404.236
|
|
},
|
|
{
|
|
"epoch": 0.41992478151465135,
|
|
"grad_norm": 0.15736353397369385,
|
|
"learning_rate": 0.0034201902376997523,
|
|
"loss": 3.1072481155395506,
|
|
"num_input_tokens_seen": 4068474880,
|
|
"step": 7760,
|
|
"train_runtime": 35253.8805,
|
|
"train_tokens_per_second": 115405.023
|
|
},
|
|
{
|
|
"epoch": 0.42046592169701563,
|
|
"grad_norm": 0.158221036195755,
|
|
"learning_rate": 0.0034164363065385577,
|
|
"loss": 3.107033920288086,
|
|
"num_input_tokens_seen": 4073717760,
|
|
"step": 7770,
|
|
"train_runtime": 35299.0377,
|
|
"train_tokens_per_second": 115405.915
|
|
},
|
|
{
|
|
"epoch": 0.42100706187937986,
|
|
"grad_norm": 0.16100963950157166,
|
|
"learning_rate": 0.0034126803407779783,
|
|
"loss": 3.102493667602539,
|
|
"num_input_tokens_seen": 4078960640,
|
|
"step": 7780,
|
|
"train_runtime": 35344.2177,
|
|
"train_tokens_per_second": 115406.732
|
|
},
|
|
{
|
|
"epoch": 0.4215482020617441,
|
|
"grad_norm": 0.15508411824703217,
|
|
"learning_rate": 0.0034089223518848043,
|
|
"loss": 3.110720634460449,
|
|
"num_input_tokens_seen": 4084203520,
|
|
"step": 7790,
|
|
"train_runtime": 35389.3807,
|
|
"train_tokens_per_second": 115407.601
|
|
},
|
|
{
|
|
"epoch": 0.4220893422441083,
|
|
"grad_norm": 0.16234534978866577,
|
|
"learning_rate": 0.0034051623513320028,
|
|
"loss": 3.116852378845215,
|
|
"num_input_tokens_seen": 4089446400,
|
|
"step": 7800,
|
|
"train_runtime": 35434.5473,
|
|
"train_tokens_per_second": 115408.456
|
|
},
|
|
{
|
|
"epoch": 0.42263048242647255,
|
|
"grad_norm": 0.15150156617164612,
|
|
"learning_rate": 0.003401400350598683,
|
|
"loss": 3.110218048095703,
|
|
"num_input_tokens_seen": 4094689280,
|
|
"step": 7810,
|
|
"train_runtime": 35479.7081,
|
|
"train_tokens_per_second": 115409.328
|
|
},
|
|
{
|
|
"epoch": 0.42317162260883684,
|
|
"grad_norm": 0.16316647827625275,
|
|
"learning_rate": 0.0033976363611700608,
|
|
"loss": 3.099168395996094,
|
|
"num_input_tokens_seen": 4099932160,
|
|
"step": 7820,
|
|
"train_runtime": 35524.9004,
|
|
"train_tokens_per_second": 115410.096
|
|
},
|
|
{
|
|
"epoch": 0.42371276279120107,
|
|
"grad_norm": 0.15622437000274658,
|
|
"learning_rate": 0.00339387039453742,
|
|
"loss": 3.1079681396484373,
|
|
"num_input_tokens_seen": 4105175040,
|
|
"step": 7830,
|
|
"train_runtime": 35570.568,
|
|
"train_tokens_per_second": 115409.319
|
|
},
|
|
{
|
|
"epoch": 0.4242539029735653,
|
|
"grad_norm": 0.1611352562904358,
|
|
"learning_rate": 0.0033901024621980865,
|
|
"loss": 3.1027732849121095,
|
|
"num_input_tokens_seen": 4110417920,
|
|
"step": 7840,
|
|
"train_runtime": 35628.7933,
|
|
"train_tokens_per_second": 115367.868
|
|
},
|
|
{
|
|
"epoch": 0.42479504315592953,
|
|
"grad_norm": 0.1534154713153839,
|
|
"learning_rate": 0.0033863325756553824,
|
|
"loss": 3.1010990142822266,
|
|
"num_input_tokens_seen": 4115660800,
|
|
"step": 7850,
|
|
"train_runtime": 35677.6783,
|
|
"train_tokens_per_second": 115356.744
|
|
},
|
|
{
|
|
"epoch": 0.42533618333829376,
|
|
"grad_norm": 0.16484984755516052,
|
|
"learning_rate": 0.0033825607464185994,
|
|
"loss": 3.0935718536376955,
|
|
"num_input_tokens_seen": 4120903680,
|
|
"step": 7860,
|
|
"train_runtime": 35722.8483,
|
|
"train_tokens_per_second": 115357.646
|
|
},
|
|
{
|
|
"epoch": 0.42587732352065805,
|
|
"grad_norm": 0.15278859436511993,
|
|
"learning_rate": 0.0033787869860029576,
|
|
"loss": 3.095734405517578,
|
|
"num_input_tokens_seen": 4126146560,
|
|
"step": 7870,
|
|
"train_runtime": 35768.0118,
|
|
"train_tokens_per_second": 115358.566
|
|
},
|
|
{
|
|
"epoch": 0.4264184637030223,
|
|
"grad_norm": 0.16884206235408783,
|
|
"learning_rate": 0.003375011305929574,
|
|
"loss": 3.1056522369384765,
|
|
"num_input_tokens_seen": 4131389440,
|
|
"step": 7880,
|
|
"train_runtime": 35813.1554,
|
|
"train_tokens_per_second": 115359.549
|
|
},
|
|
{
|
|
"epoch": 0.4269596038853865,
|
|
"grad_norm": 0.15963584184646606,
|
|
"learning_rate": 0.003371233717725426,
|
|
"loss": 3.1040569305419923,
|
|
"num_input_tokens_seen": 4136632320,
|
|
"step": 7890,
|
|
"train_runtime": 35858.3104,
|
|
"train_tokens_per_second": 115360.492
|
|
},
|
|
{
|
|
"epoch": 0.42750074406775074,
|
|
"grad_norm": 0.1541411578655243,
|
|
"learning_rate": 0.0033674542329233175,
|
|
"loss": 3.1086753845214843,
|
|
"num_input_tokens_seen": 4141875200,
|
|
"step": 7900,
|
|
"train_runtime": 35903.4547,
|
|
"train_tokens_per_second": 115361.467
|
|
},
|
|
{
|
|
"epoch": 0.42804188425011497,
|
|
"grad_norm": 0.16819094121456146,
|
|
"learning_rate": 0.003363672863061842,
|
|
"loss": 3.108404350280762,
|
|
"num_input_tokens_seen": 4147118080,
|
|
"step": 7910,
|
|
"train_runtime": 35948.5895,
|
|
"train_tokens_per_second": 115362.47
|
|
},
|
|
{
|
|
"epoch": 0.42858302443247925,
|
|
"grad_norm": 0.15858127176761627,
|
|
"learning_rate": 0.003359889619685346,
|
|
"loss": 3.1061111450195313,
|
|
"num_input_tokens_seen": 4152360960,
|
|
"step": 7920,
|
|
"train_runtime": 35993.7102,
|
|
"train_tokens_per_second": 115363.516
|
|
},
|
|
{
|
|
"epoch": 0.4291241646148435,
|
|
"grad_norm": 0.15731550753116608,
|
|
"learning_rate": 0.003356104514343899,
|
|
"loss": 3.1057785034179686,
|
|
"num_input_tokens_seen": 4157603840,
|
|
"step": 7930,
|
|
"train_runtime": 36038.862,
|
|
"train_tokens_per_second": 115364.46
|
|
},
|
|
{
|
|
"epoch": 0.4296653047972077,
|
|
"grad_norm": 0.14610068500041962,
|
|
"learning_rate": 0.0033523175585932524,
|
|
"loss": 3.09300537109375,
|
|
"num_input_tokens_seen": 4162846720,
|
|
"step": 7940,
|
|
"train_runtime": 36084.0029,
|
|
"train_tokens_per_second": 115365.436
|
|
},
|
|
{
|
|
"epoch": 0.43020644497957194,
|
|
"grad_norm": 0.1672324687242508,
|
|
"learning_rate": 0.003348528763994809,
|
|
"loss": 3.1017438888549806,
|
|
"num_input_tokens_seen": 4168089600,
|
|
"step": 7950,
|
|
"train_runtime": 36129.1342,
|
|
"train_tokens_per_second": 115366.44
|
|
},
|
|
{
|
|
"epoch": 0.4307475851619362,
|
|
"grad_norm": 0.1700795590877533,
|
|
"learning_rate": 0.003344738142115583,
|
|
"loss": 3.0958410263061524,
|
|
"num_input_tokens_seen": 4173332480,
|
|
"step": 7960,
|
|
"train_runtime": 36174.3238,
|
|
"train_tokens_per_second": 115367.256
|
|
},
|
|
{
|
|
"epoch": 0.43128872534430046,
|
|
"grad_norm": 0.15165534615516663,
|
|
"learning_rate": 0.00334094570452817,
|
|
"loss": 3.101241874694824,
|
|
"num_input_tokens_seen": 4178575360,
|
|
"step": 7970,
|
|
"train_runtime": 36219.5195,
|
|
"train_tokens_per_second": 115368.051
|
|
},
|
|
{
|
|
"epoch": 0.4318298655266647,
|
|
"grad_norm": 0.1584347039461136,
|
|
"learning_rate": 0.0033371514628107073,
|
|
"loss": 3.101197052001953,
|
|
"num_input_tokens_seen": 4183818240,
|
|
"step": 7980,
|
|
"train_runtime": 36264.6831,
|
|
"train_tokens_per_second": 115368.945
|
|
},
|
|
{
|
|
"epoch": 0.4323710057090289,
|
|
"grad_norm": 0.15928910672664642,
|
|
"learning_rate": 0.0033333554285468387,
|
|
"loss": 3.1082935333251953,
|
|
"num_input_tokens_seen": 4189061120,
|
|
"step": 7990,
|
|
"train_runtime": 36309.973,
|
|
"train_tokens_per_second": 115369.436
|
|
},
|
|
{
|
|
"epoch": 0.43291214589139315,
|
|
"grad_norm": 0.15439751744270325,
|
|
"learning_rate": 0.003329557613325685,
|
|
"loss": 3.1111793518066406,
|
|
"num_input_tokens_seen": 4194304000,
|
|
"step": 8000,
|
|
"train_runtime": 36355.1728,
|
|
"train_tokens_per_second": 115370.212
|
|
},
|
|
{
|
|
"epoch": 0.43291214589139315,
|
|
"eval_loss": 3.0542824268341064,
|
|
"eval_runtime": 1.9899,
|
|
"eval_samples_per_second": 251.266,
|
|
"eval_steps_per_second": 4.02,
|
|
"num_input_tokens_seen": 4194304000,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 0.4334532860737574,
|
|
"grad_norm": 0.1620936542749405,
|
|
"learning_rate": 0.0033257580287417987,
|
|
"loss": 3.1044567108154295,
|
|
"num_input_tokens_seen": 4199546880,
|
|
"step": 8010,
|
|
"train_runtime": 36404.8051,
|
|
"train_tokens_per_second": 115356.939
|
|
},
|
|
{
|
|
"epoch": 0.43399442625612167,
|
|
"grad_norm": 0.16753153502941132,
|
|
"learning_rate": 0.0033219566863951383,
|
|
"loss": 3.0971731185913085,
|
|
"num_input_tokens_seen": 4204789760,
|
|
"step": 8020,
|
|
"train_runtime": 36454.2975,
|
|
"train_tokens_per_second": 115344.144
|
|
},
|
|
{
|
|
"epoch": 0.4345355664384859,
|
|
"grad_norm": 0.15557527542114258,
|
|
"learning_rate": 0.0033181535978910265,
|
|
"loss": 3.099981689453125,
|
|
"num_input_tokens_seen": 4210032640,
|
|
"step": 8030,
|
|
"train_runtime": 36499.5202,
|
|
"train_tokens_per_second": 115344.876
|
|
},
|
|
{
|
|
"epoch": 0.43507670662085013,
|
|
"grad_norm": 0.15290997922420502,
|
|
"learning_rate": 0.0033143487748401174,
|
|
"loss": 3.1018728256225585,
|
|
"num_input_tokens_seen": 4215275520,
|
|
"step": 8040,
|
|
"train_runtime": 36544.6707,
|
|
"train_tokens_per_second": 115345.834
|
|
},
|
|
{
|
|
"epoch": 0.43561784680321436,
|
|
"grad_norm": 0.15225110948085785,
|
|
"learning_rate": 0.0033105422288583616,
|
|
"loss": 3.09820671081543,
|
|
"num_input_tokens_seen": 4220518400,
|
|
"step": 8050,
|
|
"train_runtime": 36589.8576,
|
|
"train_tokens_per_second": 115346.675
|
|
},
|
|
{
|
|
"epoch": 0.4361589869855786,
|
|
"grad_norm": 0.16044707596302032,
|
|
"learning_rate": 0.003306733971566968,
|
|
"loss": 3.1036590576171874,
|
|
"num_input_tokens_seen": 4225761280,
|
|
"step": 8060,
|
|
"train_runtime": 36635.0375,
|
|
"train_tokens_per_second": 115347.535
|
|
},
|
|
{
|
|
"epoch": 0.4367001271679429,
|
|
"grad_norm": 0.1795279085636139,
|
|
"learning_rate": 0.0033029240145923708,
|
|
"loss": 3.102092170715332,
|
|
"num_input_tokens_seen": 4231004160,
|
|
"step": 8070,
|
|
"train_runtime": 36680.2289,
|
|
"train_tokens_per_second": 115348.357
|
|
},
|
|
{
|
|
"epoch": 0.4372412673503071,
|
|
"grad_norm": 0.16536639630794525,
|
|
"learning_rate": 0.003299112369566194,
|
|
"loss": 3.101215934753418,
|
|
"num_input_tokens_seen": 4236247040,
|
|
"step": 8080,
|
|
"train_runtime": 36725.4044,
|
|
"train_tokens_per_second": 115349.228
|
|
},
|
|
{
|
|
"epoch": 0.43778240753267134,
|
|
"grad_norm": 0.1551489681005478,
|
|
"learning_rate": 0.003295299048125215,
|
|
"loss": 3.1048954010009764,
|
|
"num_input_tokens_seen": 4241489920,
|
|
"step": 8090,
|
|
"train_runtime": 36770.5602,
|
|
"train_tokens_per_second": 115350.158
|
|
},
|
|
{
|
|
"epoch": 0.43832354771503557,
|
|
"grad_norm": 0.15728254616260529,
|
|
"learning_rate": 0.0032914840619113267,
|
|
"loss": 3.0963891983032226,
|
|
"num_input_tokens_seen": 4246732800,
|
|
"step": 8100,
|
|
"train_runtime": 36815.7389,
|
|
"train_tokens_per_second": 115351.014
|
|
},
|
|
{
|
|
"epoch": 0.4388646878973998,
|
|
"grad_norm": 0.16317813098430634,
|
|
"learning_rate": 0.0032876674225715092,
|
|
"loss": 3.095835876464844,
|
|
"num_input_tokens_seen": 4251975680,
|
|
"step": 8110,
|
|
"train_runtime": 36860.9041,
|
|
"train_tokens_per_second": 115351.91
|
|
},
|
|
{
|
|
"epoch": 0.4394058280797641,
|
|
"grad_norm": 0.16513289511203766,
|
|
"learning_rate": 0.0032838491417577845,
|
|
"loss": 3.100272369384766,
|
|
"num_input_tokens_seen": 4257218560,
|
|
"step": 8120,
|
|
"train_runtime": 36906.0614,
|
|
"train_tokens_per_second": 115352.828
|
|
},
|
|
{
|
|
"epoch": 0.4399469682621283,
|
|
"grad_norm": 0.154524028301239,
|
|
"learning_rate": 0.003280029231127189,
|
|
"loss": 3.1007152557373048,
|
|
"num_input_tokens_seen": 4262461440,
|
|
"step": 8130,
|
|
"train_runtime": 36951.2345,
|
|
"train_tokens_per_second": 115353.695
|
|
},
|
|
{
|
|
"epoch": 0.44048810844449254,
|
|
"grad_norm": 0.16192783415317535,
|
|
"learning_rate": 0.003276207702341735,
|
|
"loss": 3.1067665100097654,
|
|
"num_input_tokens_seen": 4267704320,
|
|
"step": 8140,
|
|
"train_runtime": 36996.3989,
|
|
"train_tokens_per_second": 115354.587
|
|
},
|
|
{
|
|
"epoch": 0.4410292486268568,
|
|
"grad_norm": 0.1726672351360321,
|
|
"learning_rate": 0.003272384567068373,
|
|
"loss": 3.098089027404785,
|
|
"num_input_tokens_seen": 4272947200,
|
|
"step": 8150,
|
|
"train_runtime": 37041.5682,
|
|
"train_tokens_per_second": 115355.462
|
|
},
|
|
{
|
|
"epoch": 0.441570388809221,
|
|
"grad_norm": 0.14850319921970367,
|
|
"learning_rate": 0.00326855983697896,
|
|
"loss": 3.0921985626220705,
|
|
"num_input_tokens_seen": 4278190080,
|
|
"step": 8160,
|
|
"train_runtime": 37086.741,
|
|
"train_tokens_per_second": 115356.323
|
|
},
|
|
{
|
|
"epoch": 0.4421115289915853,
|
|
"grad_norm": 0.15166330337524414,
|
|
"learning_rate": 0.0032647335237502195,
|
|
"loss": 3.101424789428711,
|
|
"num_input_tokens_seen": 4283432960,
|
|
"step": 8170,
|
|
"train_runtime": 37131.9267,
|
|
"train_tokens_per_second": 115357.142
|
|
},
|
|
{
|
|
"epoch": 0.4426526691739495,
|
|
"grad_norm": 0.15639446675777435,
|
|
"learning_rate": 0.0032609056390637114,
|
|
"loss": 3.098773193359375,
|
|
"num_input_tokens_seen": 4288675840,
|
|
"step": 8180,
|
|
"train_runtime": 37177.0966,
|
|
"train_tokens_per_second": 115358.009
|
|
},
|
|
{
|
|
"epoch": 0.44319380935631375,
|
|
"grad_norm": 0.1532983034849167,
|
|
"learning_rate": 0.003257076194605791,
|
|
"loss": 3.1019330978393556,
|
|
"num_input_tokens_seen": 4293918720,
|
|
"step": 8190,
|
|
"train_runtime": 37222.2634,
|
|
"train_tokens_per_second": 115358.883
|
|
},
|
|
{
|
|
"epoch": 0.443734949538678,
|
|
"grad_norm": 0.17717613279819489,
|
|
"learning_rate": 0.0032532452020675763,
|
|
"loss": 3.099607467651367,
|
|
"num_input_tokens_seen": 4299161600,
|
|
"step": 8200,
|
|
"train_runtime": 37267.4247,
|
|
"train_tokens_per_second": 115359.772
|
|
},
|
|
{
|
|
"epoch": 0.4442760897210422,
|
|
"grad_norm": 0.15027566254138947,
|
|
"learning_rate": 0.00324941267314491,
|
|
"loss": 3.107021713256836,
|
|
"num_input_tokens_seen": 4304404480,
|
|
"step": 8210,
|
|
"train_runtime": 37312.5891,
|
|
"train_tokens_per_second": 115360.649
|
|
},
|
|
{
|
|
"epoch": 0.4448172299034065,
|
|
"grad_norm": 0.16586042940616608,
|
|
"learning_rate": 0.0032455786195383285,
|
|
"loss": 3.0993444442749025,
|
|
"num_input_tokens_seen": 4309647360,
|
|
"step": 8220,
|
|
"train_runtime": 37357.762,
|
|
"train_tokens_per_second": 115361.497
|
|
},
|
|
{
|
|
"epoch": 0.4453583700857707,
|
|
"grad_norm": 0.16227947175502777,
|
|
"learning_rate": 0.00324174305295302,
|
|
"loss": 3.0918336868286134,
|
|
"num_input_tokens_seen": 4314890240,
|
|
"step": 8230,
|
|
"train_runtime": 37402.9165,
|
|
"train_tokens_per_second": 115362.401
|
|
},
|
|
{
|
|
"epoch": 0.44589951026813496,
|
|
"grad_norm": 0.16855213046073914,
|
|
"learning_rate": 0.0032379059850987926,
|
|
"loss": 3.0997894287109373,
|
|
"num_input_tokens_seen": 4320133120,
|
|
"step": 8240,
|
|
"train_runtime": 37448.08,
|
|
"train_tokens_per_second": 115363.274
|
|
},
|
|
{
|
|
"epoch": 0.4464406504504992,
|
|
"grad_norm": 0.15484896302223206,
|
|
"learning_rate": 0.003234067427690039,
|
|
"loss": 3.0965702056884767,
|
|
"num_input_tokens_seen": 4325376000,
|
|
"step": 8250,
|
|
"train_runtime": 37493.242,
|
|
"train_tokens_per_second": 115364.15
|
|
},
|
|
{
|
|
"epoch": 0.4469817906328634,
|
|
"grad_norm": 0.16154596209526062,
|
|
"learning_rate": 0.0032302273924456966,
|
|
"loss": 3.0933055877685547,
|
|
"num_input_tokens_seen": 4330618880,
|
|
"step": 8260,
|
|
"train_runtime": 37538.3815,
|
|
"train_tokens_per_second": 115365.093
|
|
},
|
|
{
|
|
"epoch": 0.4475229308152277,
|
|
"grad_norm": 0.1575249284505844,
|
|
"learning_rate": 0.003226385891089219,
|
|
"loss": 3.0924747467041014,
|
|
"num_input_tokens_seen": 4335861760,
|
|
"step": 8270,
|
|
"train_runtime": 37583.5299,
|
|
"train_tokens_per_second": 115366.007
|
|
},
|
|
{
|
|
"epoch": 0.44806407099759193,
|
|
"grad_norm": 0.160599023103714,
|
|
"learning_rate": 0.0032225429353485296,
|
|
"loss": 3.096691131591797,
|
|
"num_input_tokens_seen": 4341104640,
|
|
"step": 8280,
|
|
"train_runtime": 37628.6879,
|
|
"train_tokens_per_second": 115366.889
|
|
},
|
|
{
|
|
"epoch": 0.44860521117995616,
|
|
"grad_norm": 0.15803949534893036,
|
|
"learning_rate": 0.003218698536955999,
|
|
"loss": 3.1002126693725587,
|
|
"num_input_tokens_seen": 4346347520,
|
|
"step": 8290,
|
|
"train_runtime": 37673.8359,
|
|
"train_tokens_per_second": 115367.799
|
|
},
|
|
{
|
|
"epoch": 0.4491463513623204,
|
|
"grad_norm": 0.1458759903907776,
|
|
"learning_rate": 0.0032148527076483963,
|
|
"loss": 3.0890472412109373,
|
|
"num_input_tokens_seen": 4351590400,
|
|
"step": 8300,
|
|
"train_runtime": 37718.9873,
|
|
"train_tokens_per_second": 115368.697
|
|
},
|
|
{
|
|
"epoch": 0.4496874915446846,
|
|
"grad_norm": 0.15396055579185486,
|
|
"learning_rate": 0.0032110054591668624,
|
|
"loss": 3.0894855499267577,
|
|
"num_input_tokens_seen": 4356833280,
|
|
"step": 8310,
|
|
"train_runtime": 37764.1751,
|
|
"train_tokens_per_second": 115369.481
|
|
},
|
|
{
|
|
"epoch": 0.4502286317270489,
|
|
"grad_norm": 0.15826280415058136,
|
|
"learning_rate": 0.0032071568032568704,
|
|
"loss": 3.1003223419189454,
|
|
"num_input_tokens_seen": 4362076160,
|
|
"step": 8320,
|
|
"train_runtime": 37809.3568,
|
|
"train_tokens_per_second": 115370.282
|
|
},
|
|
{
|
|
"epoch": 0.45076977190941314,
|
|
"grad_norm": 0.16446684300899506,
|
|
"learning_rate": 0.003203306751668188,
|
|
"loss": 3.093168258666992,
|
|
"num_input_tokens_seen": 4367319040,
|
|
"step": 8330,
|
|
"train_runtime": 37854.5343,
|
|
"train_tokens_per_second": 115371.094
|
|
},
|
|
{
|
|
"epoch": 0.45131091209177737,
|
|
"grad_norm": 0.1580137461423874,
|
|
"learning_rate": 0.0031994553161548474,
|
|
"loss": 3.101323699951172,
|
|
"num_input_tokens_seen": 4372561920,
|
|
"step": 8340,
|
|
"train_runtime": 37899.7112,
|
|
"train_tokens_per_second": 115371.906
|
|
},
|
|
{
|
|
"epoch": 0.4518520522741416,
|
|
"grad_norm": 0.15007439255714417,
|
|
"learning_rate": 0.003195602508475103,
|
|
"loss": 3.0974876403808596,
|
|
"num_input_tokens_seen": 4377804800,
|
|
"step": 8350,
|
|
"train_runtime": 37944.8699,
|
|
"train_tokens_per_second": 115372.771
|
|
},
|
|
{
|
|
"epoch": 0.45239319245650583,
|
|
"grad_norm": 0.16612504422664642,
|
|
"learning_rate": 0.0031917483403914,
|
|
"loss": 3.097567558288574,
|
|
"num_input_tokens_seen": 4383047680,
|
|
"step": 8360,
|
|
"train_runtime": 37990.0345,
|
|
"train_tokens_per_second": 115373.617
|
|
},
|
|
{
|
|
"epoch": 0.4529343326388701,
|
|
"grad_norm": 0.152009978890419,
|
|
"learning_rate": 0.0031878928236703354,
|
|
"loss": 3.09008674621582,
|
|
"num_input_tokens_seen": 4388290560,
|
|
"step": 8370,
|
|
"train_runtime": 38035.2108,
|
|
"train_tokens_per_second": 115374.425
|
|
},
|
|
{
|
|
"epoch": 0.45347547282123435,
|
|
"grad_norm": 0.14635391533374786,
|
|
"learning_rate": 0.003184035970082625,
|
|
"loss": 3.0835281372070313,
|
|
"num_input_tokens_seen": 4393533440,
|
|
"step": 8380,
|
|
"train_runtime": 38080.3814,
|
|
"train_tokens_per_second": 115375.248
|
|
},
|
|
{
|
|
"epoch": 0.4540166130035986,
|
|
"grad_norm": 0.16529026627540588,
|
|
"learning_rate": 0.0031801777914030657,
|
|
"loss": 3.0935291290283202,
|
|
"num_input_tokens_seen": 4398776320,
|
|
"step": 8390,
|
|
"train_runtime": 38125.5235,
|
|
"train_tokens_per_second": 115376.155
|
|
},
|
|
{
|
|
"epoch": 0.4545577531859628,
|
|
"grad_norm": 0.15716882050037384,
|
|
"learning_rate": 0.003176318299410499,
|
|
"loss": 3.0900102615356446,
|
|
"num_input_tokens_seen": 4404019200,
|
|
"step": 8400,
|
|
"train_runtime": 38175.1087,
|
|
"train_tokens_per_second": 115363.632
|
|
},
|
|
{
|
|
"epoch": 0.45509889336832704,
|
|
"grad_norm": 0.15623128414154053,
|
|
"learning_rate": 0.003172457505887777,
|
|
"loss": 3.0833271026611326,
|
|
"num_input_tokens_seen": 4409262080,
|
|
"step": 8410,
|
|
"train_runtime": 38220.3198,
|
|
"train_tokens_per_second": 115364.343
|
|
},
|
|
{
|
|
"epoch": 0.4556400335506913,
|
|
"grad_norm": 0.1591528207063675,
|
|
"learning_rate": 0.0031685954226217234,
|
|
"loss": 3.0901105880737303,
|
|
"num_input_tokens_seen": 4414504960,
|
|
"step": 8420,
|
|
"train_runtime": 38265.5513,
|
|
"train_tokens_per_second": 115364.99
|
|
},
|
|
{
|
|
"epoch": 0.45618117373305556,
|
|
"grad_norm": 0.16141286492347717,
|
|
"learning_rate": 0.003164732061403102,
|
|
"loss": 3.0906259536743166,
|
|
"num_input_tokens_seen": 4419747840,
|
|
"step": 8430,
|
|
"train_runtime": 38310.7985,
|
|
"train_tokens_per_second": 115365.589
|
|
},
|
|
{
|
|
"epoch": 0.4567223139154198,
|
|
"grad_norm": 0.15204599499702454,
|
|
"learning_rate": 0.0031608674340265768,
|
|
"loss": 3.084097671508789,
|
|
"num_input_tokens_seen": 4424990720,
|
|
"step": 8440,
|
|
"train_runtime": 38356.0563,
|
|
"train_tokens_per_second": 115366.154
|
|
},
|
|
{
|
|
"epoch": 0.457263454097784,
|
|
"grad_norm": 0.15592141449451447,
|
|
"learning_rate": 0.003157001552290677,
|
|
"loss": 3.0875980377197267,
|
|
"num_input_tokens_seen": 4430233600,
|
|
"step": 8450,
|
|
"train_runtime": 38401.2772,
|
|
"train_tokens_per_second": 115366.829
|
|
},
|
|
{
|
|
"epoch": 0.45780459428014825,
|
|
"grad_norm": 0.15805137157440186,
|
|
"learning_rate": 0.0031531344279977615,
|
|
"loss": 3.0840667724609374,
|
|
"num_input_tokens_seen": 4435476480,
|
|
"step": 8460,
|
|
"train_runtime": 38446.5182,
|
|
"train_tokens_per_second": 115367.443
|
|
},
|
|
{
|
|
"epoch": 0.45834573446251253,
|
|
"grad_norm": 0.1569671630859375,
|
|
"learning_rate": 0.003149266072953983,
|
|
"loss": 3.095382308959961,
|
|
"num_input_tokens_seen": 4440719360,
|
|
"step": 8470,
|
|
"train_runtime": 38491.7485,
|
|
"train_tokens_per_second": 115368.086
|
|
},
|
|
{
|
|
"epoch": 0.45888687464487676,
|
|
"grad_norm": 0.15263330936431885,
|
|
"learning_rate": 0.0031453964989692517,
|
|
"loss": 3.0909893035888674,
|
|
"num_input_tokens_seen": 4445962240,
|
|
"step": 8480,
|
|
"train_runtime": 38536.9781,
|
|
"train_tokens_per_second": 115368.73
|
|
},
|
|
{
|
|
"epoch": 0.459428014827241,
|
|
"grad_norm": 0.16741898655891418,
|
|
"learning_rate": 0.0031415257178571986,
|
|
"loss": 3.091363525390625,
|
|
"num_input_tokens_seen": 4451205120,
|
|
"step": 8490,
|
|
"train_runtime": 38582.2145,
|
|
"train_tokens_per_second": 115369.353
|
|
},
|
|
{
|
|
"epoch": 0.4599691550096052,
|
|
"grad_norm": 0.1621858924627304,
|
|
"learning_rate": 0.0031376537414351414,
|
|
"loss": 3.0860706329345704,
|
|
"num_input_tokens_seen": 4456448000,
|
|
"step": 8500,
|
|
"train_runtime": 38627.4547,
|
|
"train_tokens_per_second": 115369.963
|
|
},
|
|
{
|
|
"epoch": 0.4599691550096052,
|
|
"eval_loss": 3.044252634048462,
|
|
"eval_runtime": 1.9941,
|
|
"eval_samples_per_second": 250.745,
|
|
"eval_steps_per_second": 4.012,
|
|
"num_input_tokens_seen": 4456448000,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 0.46051029519196945,
|
|
"grad_norm": 0.1537708044052124,
|
|
"learning_rate": 0.0031337805815240443,
|
|
"loss": 3.0971357345581056,
|
|
"num_input_tokens_seen": 4461690880,
|
|
"step": 8510,
|
|
"train_runtime": 38674.6947,
|
|
"train_tokens_per_second": 115364.605
|
|
},
|
|
{
|
|
"epoch": 0.46105143537433374,
|
|
"grad_norm": 0.1561785489320755,
|
|
"learning_rate": 0.0031299062499484886,
|
|
"loss": 3.095275115966797,
|
|
"num_input_tokens_seen": 4466933760,
|
|
"step": 8520,
|
|
"train_runtime": 38719.9322,
|
|
"train_tokens_per_second": 115365.227
|
|
},
|
|
{
|
|
"epoch": 0.46159257555669797,
|
|
"grad_norm": 0.15904352068901062,
|
|
"learning_rate": 0.0031260307585366277,
|
|
"loss": 3.093882942199707,
|
|
"num_input_tokens_seen": 4472176640,
|
|
"step": 8530,
|
|
"train_runtime": 38765.1761,
|
|
"train_tokens_per_second": 115365.828
|
|
},
|
|
{
|
|
"epoch": 0.4621337157390622,
|
|
"grad_norm": 0.1586320400238037,
|
|
"learning_rate": 0.00312215411912016,
|
|
"loss": 3.0862545013427733,
|
|
"num_input_tokens_seen": 4477419520,
|
|
"step": 8540,
|
|
"train_runtime": 38810.3531,
|
|
"train_tokens_per_second": 115366.627
|
|
},
|
|
{
|
|
"epoch": 0.46267485592142643,
|
|
"grad_norm": 0.15955059230327606,
|
|
"learning_rate": 0.003118276343534288,
|
|
"loss": 3.09029598236084,
|
|
"num_input_tokens_seen": 4482662400,
|
|
"step": 8550,
|
|
"train_runtime": 38855.5252,
|
|
"train_tokens_per_second": 115367.438
|
|
},
|
|
{
|
|
"epoch": 0.46321599610379066,
|
|
"grad_norm": 0.16844794154167175,
|
|
"learning_rate": 0.0031143974436176804,
|
|
"loss": 3.08276252746582,
|
|
"num_input_tokens_seen": 4487905280,
|
|
"step": 8560,
|
|
"train_runtime": 38900.7107,
|
|
"train_tokens_per_second": 115368.208
|
|
},
|
|
{
|
|
"epoch": 0.46375713628615495,
|
|
"grad_norm": 0.15490221977233887,
|
|
"learning_rate": 0.003110517431212442,
|
|
"loss": 3.096157455444336,
|
|
"num_input_tokens_seen": 4493148160,
|
|
"step": 8570,
|
|
"train_runtime": 38945.895,
|
|
"train_tokens_per_second": 115368.979
|
|
},
|
|
{
|
|
"epoch": 0.4642982764685192,
|
|
"grad_norm": 0.1643703430891037,
|
|
"learning_rate": 0.0031066363181640705,
|
|
"loss": 3.094961929321289,
|
|
"num_input_tokens_seen": 4498391040,
|
|
"step": 8580,
|
|
"train_runtime": 38991.0775,
|
|
"train_tokens_per_second": 115369.754
|
|
},
|
|
{
|
|
"epoch": 0.4648394166508834,
|
|
"grad_norm": 0.16452111303806305,
|
|
"learning_rate": 0.003102754116321427,
|
|
"loss": 3.0949285507202147,
|
|
"num_input_tokens_seen": 4503633920,
|
|
"step": 8590,
|
|
"train_runtime": 39036.2496,
|
|
"train_tokens_per_second": 115370.558
|
|
},
|
|
{
|
|
"epoch": 0.46538055683324764,
|
|
"grad_norm": 0.15409614145755768,
|
|
"learning_rate": 0.003098870837536694,
|
|
"loss": 3.083492660522461,
|
|
"num_input_tokens_seen": 4508876800,
|
|
"step": 8600,
|
|
"train_runtime": 39081.4455,
|
|
"train_tokens_per_second": 115371.29
|
|
},
|
|
{
|
|
"epoch": 0.46592169701561187,
|
|
"grad_norm": 0.16283227503299713,
|
|
"learning_rate": 0.0030949864936653444,
|
|
"loss": 3.0859600067138673,
|
|
"num_input_tokens_seen": 4514119680,
|
|
"step": 8610,
|
|
"train_runtime": 39126.6271,
|
|
"train_tokens_per_second": 115372.063
|
|
},
|
|
{
|
|
"epoch": 0.46646283719797615,
|
|
"grad_norm": 0.15932060778141022,
|
|
"learning_rate": 0.0030911010965660995,
|
|
"loss": 3.0858314514160154,
|
|
"num_input_tokens_seen": 4519362560,
|
|
"step": 8620,
|
|
"train_runtime": 39171.8041,
|
|
"train_tokens_per_second": 115372.847
|
|
},
|
|
{
|
|
"epoch": 0.4670039773803404,
|
|
"grad_norm": 0.15321630239486694,
|
|
"learning_rate": 0.0030872146581008993,
|
|
"loss": 3.0855281829833983,
|
|
"num_input_tokens_seen": 4524605440,
|
|
"step": 8630,
|
|
"train_runtime": 39217.0067,
|
|
"train_tokens_per_second": 115373.554
|
|
},
|
|
{
|
|
"epoch": 0.4675451175627046,
|
|
"grad_norm": 0.15142542123794556,
|
|
"learning_rate": 0.0030833271901348604,
|
|
"loss": 3.0922718048095703,
|
|
"num_input_tokens_seen": 4529848320,
|
|
"step": 8640,
|
|
"train_runtime": 39262.1839,
|
|
"train_tokens_per_second": 115374.334
|
|
},
|
|
{
|
|
"epoch": 0.46808625774506885,
|
|
"grad_norm": 0.15679921209812164,
|
|
"learning_rate": 0.0030794387045362448,
|
|
"loss": 3.089971923828125,
|
|
"num_input_tokens_seen": 4535091200,
|
|
"step": 8650,
|
|
"train_runtime": 39307.3714,
|
|
"train_tokens_per_second": 115375.082
|
|
},
|
|
{
|
|
"epoch": 0.4686273979274331,
|
|
"grad_norm": 0.149771049618721,
|
|
"learning_rate": 0.0030755492131764196,
|
|
"loss": 3.0910947799682615,
|
|
"num_input_tokens_seen": 4540334080,
|
|
"step": 8660,
|
|
"train_runtime": 39352.6168,
|
|
"train_tokens_per_second": 115375.659
|
|
},
|
|
{
|
|
"epoch": 0.46916853810979736,
|
|
"grad_norm": 0.15804412961006165,
|
|
"learning_rate": 0.003071658727929823,
|
|
"loss": 3.096923065185547,
|
|
"num_input_tokens_seen": 4545576960,
|
|
"step": 8670,
|
|
"train_runtime": 39397.8863,
|
|
"train_tokens_per_second": 115376.163
|
|
},
|
|
{
|
|
"epoch": 0.4697096782921616,
|
|
"grad_norm": 0.17401130497455597,
|
|
"learning_rate": 0.003067767260673929,
|
|
"loss": 3.0941158294677735,
|
|
"num_input_tokens_seen": 4550819840,
|
|
"step": 8680,
|
|
"train_runtime": 39443.148,
|
|
"train_tokens_per_second": 115376.69
|
|
},
|
|
{
|
|
"epoch": 0.4702508184745258,
|
|
"grad_norm": 0.16347981989383698,
|
|
"learning_rate": 0.003063874823289205,
|
|
"loss": 3.0893718719482424,
|
|
"num_input_tokens_seen": 4556062720,
|
|
"step": 8690,
|
|
"train_runtime": 39488.3966,
|
|
"train_tokens_per_second": 115377.253
|
|
},
|
|
{
|
|
"epoch": 0.47079195865689005,
|
|
"grad_norm": 0.16059865057468414,
|
|
"learning_rate": 0.003059981427659086,
|
|
"loss": 3.0792430877685546,
|
|
"num_input_tokens_seen": 4561305600,
|
|
"step": 8700,
|
|
"train_runtime": 39533.6411,
|
|
"train_tokens_per_second": 115377.827
|
|
},
|
|
{
|
|
"epoch": 0.4713330988392543,
|
|
"grad_norm": 0.15623228251934052,
|
|
"learning_rate": 0.0030560870856699285,
|
|
"loss": 3.0796392440795897,
|
|
"num_input_tokens_seen": 4566548480,
|
|
"step": 8710,
|
|
"train_runtime": 39578.8987,
|
|
"train_tokens_per_second": 115378.361
|
|
},
|
|
{
|
|
"epoch": 0.47187423902161857,
|
|
"grad_norm": 0.17271380126476288,
|
|
"learning_rate": 0.003052191809210979,
|
|
"loss": 3.0749179840087892,
|
|
"num_input_tokens_seen": 4571791360,
|
|
"step": 8720,
|
|
"train_runtime": 39624.1426,
|
|
"train_tokens_per_second": 115378.935
|
|
},
|
|
{
|
|
"epoch": 0.4724153792039828,
|
|
"grad_norm": 0.1617797613143921,
|
|
"learning_rate": 0.0030482956101743385,
|
|
"loss": 3.077177047729492,
|
|
"num_input_tokens_seen": 4577034240,
|
|
"step": 8730,
|
|
"train_runtime": 39669.3874,
|
|
"train_tokens_per_second": 115379.504
|
|
},
|
|
{
|
|
"epoch": 0.47295651938634703,
|
|
"grad_norm": 0.15339480340480804,
|
|
"learning_rate": 0.0030443985004549234,
|
|
"loss": 3.0854717254638673,
|
|
"num_input_tokens_seen": 4582277120,
|
|
"step": 8740,
|
|
"train_runtime": 39714.6736,
|
|
"train_tokens_per_second": 115379.952
|
|
},
|
|
{
|
|
"epoch": 0.47349765956871126,
|
|
"grad_norm": 0.1538633406162262,
|
|
"learning_rate": 0.00304050049195043,
|
|
"loss": 3.0906457901000977,
|
|
"num_input_tokens_seen": 4587520000,
|
|
"step": 8750,
|
|
"train_runtime": 39759.8518,
|
|
"train_tokens_per_second": 115380.712
|
|
},
|
|
{
|
|
"epoch": 0.4740387997510755,
|
|
"grad_norm": 0.16446056962013245,
|
|
"learning_rate": 0.0030366015965612976,
|
|
"loss": 3.0834827423095703,
|
|
"num_input_tokens_seen": 4592762880,
|
|
"step": 8760,
|
|
"train_runtime": 39805.0284,
|
|
"train_tokens_per_second": 115381.475
|
|
},
|
|
{
|
|
"epoch": 0.4745799399334398,
|
|
"grad_norm": 0.15852907299995422,
|
|
"learning_rate": 0.003032701826190677,
|
|
"loss": 3.077737808227539,
|
|
"num_input_tokens_seen": 4598005760,
|
|
"step": 8770,
|
|
"train_runtime": 39850.2201,
|
|
"train_tokens_per_second": 115382.192
|
|
},
|
|
{
|
|
"epoch": 0.475121080115804,
|
|
"grad_norm": 0.15762916207313538,
|
|
"learning_rate": 0.003028801192744386,
|
|
"loss": 3.074782943725586,
|
|
"num_input_tokens_seen": 4603248640,
|
|
"step": 8780,
|
|
"train_runtime": 39899.2419,
|
|
"train_tokens_per_second": 115371.832
|
|
},
|
|
{
|
|
"epoch": 0.47566222029816824,
|
|
"grad_norm": 0.1619185209274292,
|
|
"learning_rate": 0.0030248997081308788,
|
|
"loss": 3.0825977325439453,
|
|
"num_input_tokens_seen": 4608491520,
|
|
"step": 8790,
|
|
"train_runtime": 39944.5192,
|
|
"train_tokens_per_second": 115372.312
|
|
},
|
|
{
|
|
"epoch": 0.47620336048053247,
|
|
"grad_norm": 0.16285711526870728,
|
|
"learning_rate": 0.0030209973842612097,
|
|
"loss": 3.080776405334473,
|
|
"num_input_tokens_seen": 4613734400,
|
|
"step": 8800,
|
|
"train_runtime": 39989.7251,
|
|
"train_tokens_per_second": 115372.996
|
|
},
|
|
{
|
|
"epoch": 0.4767445006628967,
|
|
"grad_norm": 0.17198577523231506,
|
|
"learning_rate": 0.003017094233048994,
|
|
"loss": 3.0829303741455076,
|
|
"num_input_tokens_seen": 4618977280,
|
|
"step": 8810,
|
|
"train_runtime": 40034.9384,
|
|
"train_tokens_per_second": 115373.658
|
|
},
|
|
{
|
|
"epoch": 0.477285640845261,
|
|
"grad_norm": 0.16893431544303894,
|
|
"learning_rate": 0.003013190266410372,
|
|
"loss": 3.0930507659912108,
|
|
"num_input_tokens_seen": 4624220160,
|
|
"step": 8820,
|
|
"train_runtime": 40080.146,
|
|
"train_tokens_per_second": 115374.334
|
|
},
|
|
{
|
|
"epoch": 0.4778267810276252,
|
|
"grad_norm": 0.1534847915172577,
|
|
"learning_rate": 0.003009285496263973,
|
|
"loss": 3.086047554016113,
|
|
"num_input_tokens_seen": 4629463040,
|
|
"step": 8830,
|
|
"train_runtime": 40125.3234,
|
|
"train_tokens_per_second": 115375.096
|
|
},
|
|
{
|
|
"epoch": 0.47836792120998944,
|
|
"grad_norm": 0.16068360209465027,
|
|
"learning_rate": 0.003005379934530884,
|
|
"loss": 3.0864025115966798,
|
|
"num_input_tokens_seen": 4634705920,
|
|
"step": 8840,
|
|
"train_runtime": 40170.4754,
|
|
"train_tokens_per_second": 115375.929
|
|
},
|
|
{
|
|
"epoch": 0.4789090613923537,
|
|
"grad_norm": 0.1436556577682495,
|
|
"learning_rate": 0.003001473593134602,
|
|
"loss": 3.0830524444580076,
|
|
"num_input_tokens_seen": 4639948800,
|
|
"step": 8850,
|
|
"train_runtime": 40215.6345,
|
|
"train_tokens_per_second": 115376.74
|
|
},
|
|
{
|
|
"epoch": 0.4794502015747179,
|
|
"grad_norm": 0.14522488415241241,
|
|
"learning_rate": 0.0029975664840010104,
|
|
"loss": 3.0799121856689453,
|
|
"num_input_tokens_seen": 4645191680,
|
|
"step": 8860,
|
|
"train_runtime": 40260.7882,
|
|
"train_tokens_per_second": 115377.564
|
|
},
|
|
{
|
|
"epoch": 0.4799913417570822,
|
|
"grad_norm": 0.16195201873779297,
|
|
"learning_rate": 0.002993658619058331,
|
|
"loss": 3.071552848815918,
|
|
"num_input_tokens_seen": 4650434560,
|
|
"step": 8870,
|
|
"train_runtime": 40305.9418,
|
|
"train_tokens_per_second": 115378.387
|
|
},
|
|
{
|
|
"epoch": 0.4805324819394464,
|
|
"grad_norm": 0.14948424696922302,
|
|
"learning_rate": 0.0029897500102370974,
|
|
"loss": 3.0818138122558594,
|
|
"num_input_tokens_seen": 4655677440,
|
|
"step": 8880,
|
|
"train_runtime": 40351.0976,
|
|
"train_tokens_per_second": 115379.202
|
|
},
|
|
{
|
|
"epoch": 0.48107362212181065,
|
|
"grad_norm": 0.1461019665002823,
|
|
"learning_rate": 0.0029858406694701117,
|
|
"loss": 3.082274627685547,
|
|
"num_input_tokens_seen": 4660920320,
|
|
"step": 8890,
|
|
"train_runtime": 40396.2463,
|
|
"train_tokens_per_second": 115380.035
|
|
},
|
|
{
|
|
"epoch": 0.4816147623041749,
|
|
"grad_norm": 0.1501043438911438,
|
|
"learning_rate": 0.0029819306086924127,
|
|
"loss": 3.083462142944336,
|
|
"num_input_tokens_seen": 4666163200,
|
|
"step": 8900,
|
|
"train_runtime": 40441.3988,
|
|
"train_tokens_per_second": 115380.856
|
|
},
|
|
{
|
|
"epoch": 0.4821559024865391,
|
|
"grad_norm": 0.15929782390594482,
|
|
"learning_rate": 0.002978019839841233,
|
|
"loss": 3.0869064331054688,
|
|
"num_input_tokens_seen": 4671406080,
|
|
"step": 8910,
|
|
"train_runtime": 40486.5499,
|
|
"train_tokens_per_second": 115381.678
|
|
},
|
|
{
|
|
"epoch": 0.4826970426689034,
|
|
"grad_norm": 0.15982982516288757,
|
|
"learning_rate": 0.002974108374855974,
|
|
"loss": 3.082635688781738,
|
|
"num_input_tokens_seen": 4676648960,
|
|
"step": 8920,
|
|
"train_runtime": 40531.6946,
|
|
"train_tokens_per_second": 115382.518
|
|
},
|
|
{
|
|
"epoch": 0.48323818285126763,
|
|
"grad_norm": 0.15398921072483063,
|
|
"learning_rate": 0.0029701962256781555,
|
|
"loss": 3.069881820678711,
|
|
"num_input_tokens_seen": 4681891840,
|
|
"step": 8930,
|
|
"train_runtime": 40576.8387,
|
|
"train_tokens_per_second": 115383.356
|
|
},
|
|
{
|
|
"epoch": 0.48377932303363186,
|
|
"grad_norm": 0.16303351521492004,
|
|
"learning_rate": 0.0029662834042513903,
|
|
"loss": 3.078609085083008,
|
|
"num_input_tokens_seen": 4687134720,
|
|
"step": 8940,
|
|
"train_runtime": 40621.9936,
|
|
"train_tokens_per_second": 115384.163
|
|
},
|
|
{
|
|
"epoch": 0.4843204632159961,
|
|
"grad_norm": 0.16261766850948334,
|
|
"learning_rate": 0.0029623699225213417,
|
|
"loss": 3.072034454345703,
|
|
"num_input_tokens_seen": 4692377600,
|
|
"step": 8950,
|
|
"train_runtime": 40667.1772,
|
|
"train_tokens_per_second": 115384.886
|
|
},
|
|
{
|
|
"epoch": 0.4848616033983603,
|
|
"grad_norm": 0.14818759262561798,
|
|
"learning_rate": 0.002958455792435689,
|
|
"loss": 3.077336883544922,
|
|
"num_input_tokens_seen": 4697620480,
|
|
"step": 8960,
|
|
"train_runtime": 40712.3367,
|
|
"train_tokens_per_second": 115385.676
|
|
},
|
|
{
|
|
"epoch": 0.4854027435807246,
|
|
"grad_norm": 0.1536484807729721,
|
|
"learning_rate": 0.002954541025944093,
|
|
"loss": 3.0622703552246096,
|
|
"num_input_tokens_seen": 4702863360,
|
|
"step": 8970,
|
|
"train_runtime": 40757.516,
|
|
"train_tokens_per_second": 115386.408
|
|
},
|
|
{
|
|
"epoch": 0.48594388376308884,
|
|
"grad_norm": 0.1563226282596588,
|
|
"learning_rate": 0.002950625634998154,
|
|
"loss": 3.0665721893310547,
|
|
"num_input_tokens_seen": 4708106240,
|
|
"step": 8980,
|
|
"train_runtime": 40802.7032,
|
|
"train_tokens_per_second": 115387.116
|
|
},
|
|
{
|
|
"epoch": 0.48648502394545307,
|
|
"grad_norm": 0.15256533026695251,
|
|
"learning_rate": 0.0029467096315513802,
|
|
"loss": 3.0700511932373047,
|
|
"num_input_tokens_seen": 4713349120,
|
|
"step": 8990,
|
|
"train_runtime": 40847.8776,
|
|
"train_tokens_per_second": 115387.859
|
|
},
|
|
{
|
|
"epoch": 0.4870261641278173,
|
|
"grad_norm": 0.1605551838874817,
|
|
"learning_rate": 0.0029427930275591515,
|
|
"loss": 3.076490592956543,
|
|
"num_input_tokens_seen": 4718592000,
|
|
"step": 9000,
|
|
"train_runtime": 40893.0303,
|
|
"train_tokens_per_second": 115388.661
|
|
},
|
|
{
|
|
"epoch": 0.4870261641278173,
|
|
"eval_loss": 3.034029483795166,
|
|
"eval_runtime": 1.9847,
|
|
"eval_samples_per_second": 251.93,
|
|
"eval_steps_per_second": 4.031,
|
|
"num_input_tokens_seen": 4718592000,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.4875673043101815,
|
|
"grad_norm": 0.16099503636360168,
|
|
"learning_rate": 0.0029388758349786787,
|
|
"loss": 3.081180953979492,
|
|
"num_input_tokens_seen": 4723834880,
|
|
"step": 9010,
|
|
"train_runtime": 40942.5164,
|
|
"train_tokens_per_second": 115377.248
|
|
},
|
|
{
|
|
"epoch": 0.4881084444925458,
|
|
"grad_norm": 0.1605864018201828,
|
|
"learning_rate": 0.0029349580657689707,
|
|
"loss": 3.0802078247070312,
|
|
"num_input_tokens_seen": 4729077760,
|
|
"step": 9020,
|
|
"train_runtime": 40987.6712,
|
|
"train_tokens_per_second": 115378.054
|
|
},
|
|
{
|
|
"epoch": 0.48864958467491004,
|
|
"grad_norm": 0.15125182271003723,
|
|
"learning_rate": 0.0029310397318907965,
|
|
"loss": 3.090005111694336,
|
|
"num_input_tokens_seen": 4734320640,
|
|
"step": 9030,
|
|
"train_runtime": 41032.8355,
|
|
"train_tokens_per_second": 115378.832
|
|
},
|
|
{
|
|
"epoch": 0.4891907248572743,
|
|
"grad_norm": 0.16026070713996887,
|
|
"learning_rate": 0.002927120845306649,
|
|
"loss": 3.087236785888672,
|
|
"num_input_tokens_seen": 4739563520,
|
|
"step": 9040,
|
|
"train_runtime": 41077.9854,
|
|
"train_tokens_per_second": 115379.649
|
|
},
|
|
{
|
|
"epoch": 0.4897318650396385,
|
|
"grad_norm": 0.1533481776714325,
|
|
"learning_rate": 0.0029232014179807098,
|
|
"loss": 3.0772159576416014,
|
|
"num_input_tokens_seen": 4744806400,
|
|
"step": 9050,
|
|
"train_runtime": 41123.1477,
|
|
"train_tokens_per_second": 115380.429
|
|
},
|
|
{
|
|
"epoch": 0.49027300522200273,
|
|
"grad_norm": 0.16315831243991852,
|
|
"learning_rate": 0.002919281461878809,
|
|
"loss": 3.080950927734375,
|
|
"num_input_tokens_seen": 4750049280,
|
|
"step": 9060,
|
|
"train_runtime": 41168.3038,
|
|
"train_tokens_per_second": 115381.224
|
|
},
|
|
{
|
|
"epoch": 0.490814145404367,
|
|
"grad_norm": 0.16064807772636414,
|
|
"learning_rate": 0.0029153609889683934,
|
|
"loss": 3.077931594848633,
|
|
"num_input_tokens_seen": 4755292160,
|
|
"step": 9070,
|
|
"train_runtime": 41213.4562,
|
|
"train_tokens_per_second": 115382.028
|
|
},
|
|
{
|
|
"epoch": 0.49135528558673125,
|
|
"grad_norm": 0.15968522429466248,
|
|
"learning_rate": 0.0029114400112184857,
|
|
"loss": 3.0715621948242187,
|
|
"num_input_tokens_seen": 4760535040,
|
|
"step": 9080,
|
|
"train_runtime": 41258.6304,
|
|
"train_tokens_per_second": 115382.769
|
|
},
|
|
{
|
|
"epoch": 0.4918964257690955,
|
|
"grad_norm": 0.21646282076835632,
|
|
"learning_rate": 0.0029075185405996497,
|
|
"loss": 3.070268249511719,
|
|
"num_input_tokens_seen": 4765777920,
|
|
"step": 9090,
|
|
"train_runtime": 41303.7917,
|
|
"train_tokens_per_second": 115383.545
|
|
},
|
|
{
|
|
"epoch": 0.4924375659514597,
|
|
"grad_norm": 0.14861001074314117,
|
|
"learning_rate": 0.0029035965890839566,
|
|
"loss": 3.0772144317626955,
|
|
"num_input_tokens_seen": 4771020800,
|
|
"step": 9100,
|
|
"train_runtime": 41348.9385,
|
|
"train_tokens_per_second": 115384.36
|
|
},
|
|
{
|
|
"epoch": 0.49297870613382394,
|
|
"grad_norm": 0.15690362453460693,
|
|
"learning_rate": 0.0028996741686449427,
|
|
"loss": 3.079457092285156,
|
|
"num_input_tokens_seen": 4776263680,
|
|
"step": 9110,
|
|
"train_runtime": 41394.0963,
|
|
"train_tokens_per_second": 115385.142
|
|
},
|
|
{
|
|
"epoch": 0.4935198463161882,
|
|
"grad_norm": 0.1561357080936432,
|
|
"learning_rate": 0.0028957512912575777,
|
|
"loss": 3.081951141357422,
|
|
"num_input_tokens_seen": 4781506560,
|
|
"step": 9120,
|
|
"train_runtime": 41439.2561,
|
|
"train_tokens_per_second": 115385.917
|
|
},
|
|
{
|
|
"epoch": 0.49406098649855246,
|
|
"grad_norm": 0.15572723746299744,
|
|
"learning_rate": 0.002891827968898225,
|
|
"loss": 3.0684499740600586,
|
|
"num_input_tokens_seen": 4786749440,
|
|
"step": 9130,
|
|
"train_runtime": 41484.4245,
|
|
"train_tokens_per_second": 115386.666
|
|
},
|
|
{
|
|
"epoch": 0.4946021266809167,
|
|
"grad_norm": 0.14807617664337158,
|
|
"learning_rate": 0.0028879042135446092,
|
|
"loss": 3.0712486267089845,
|
|
"num_input_tokens_seen": 4791992320,
|
|
"step": 9140,
|
|
"train_runtime": 41529.5787,
|
|
"train_tokens_per_second": 115387.453
|
|
},
|
|
{
|
|
"epoch": 0.4951432668632809,
|
|
"grad_norm": 0.15159912407398224,
|
|
"learning_rate": 0.0028839800371757724,
|
|
"loss": 3.0685661315917967,
|
|
"num_input_tokens_seen": 4797235200,
|
|
"step": 9150,
|
|
"train_runtime": 41574.7343,
|
|
"train_tokens_per_second": 115388.235
|
|
},
|
|
{
|
|
"epoch": 0.49568440704564515,
|
|
"grad_norm": 0.15283788740634918,
|
|
"learning_rate": 0.0028800554517720467,
|
|
"loss": 3.066938591003418,
|
|
"num_input_tokens_seen": 4802478080,
|
|
"step": 9160,
|
|
"train_runtime": 41623.4446,
|
|
"train_tokens_per_second": 115379.16
|
|
},
|
|
{
|
|
"epoch": 0.49622554722800943,
|
|
"grad_norm": 0.14532612264156342,
|
|
"learning_rate": 0.0028761304693150093,
|
|
"loss": 3.0726764678955076,
|
|
"num_input_tokens_seen": 4807720960,
|
|
"step": 9170,
|
|
"train_runtime": 41668.6161,
|
|
"train_tokens_per_second": 115379.905
|
|
},
|
|
{
|
|
"epoch": 0.49676668741037366,
|
|
"grad_norm": 0.15642227232456207,
|
|
"learning_rate": 0.0028722051017874514,
|
|
"loss": 3.075974464416504,
|
|
"num_input_tokens_seen": 4812963840,
|
|
"step": 9180,
|
|
"train_runtime": 41713.7758,
|
|
"train_tokens_per_second": 115380.681
|
|
},
|
|
{
|
|
"epoch": 0.4973078275927379,
|
|
"grad_norm": 0.15629522502422333,
|
|
"learning_rate": 0.00286827936117334,
|
|
"loss": 3.0699131011962892,
|
|
"num_input_tokens_seen": 4818206720,
|
|
"step": 9190,
|
|
"train_runtime": 41758.9432,
|
|
"train_tokens_per_second": 115381.433
|
|
},
|
|
{
|
|
"epoch": 0.4978489677751021,
|
|
"grad_norm": 0.15175525844097137,
|
|
"learning_rate": 0.00286435325945778,
|
|
"loss": 3.0690542221069337,
|
|
"num_input_tokens_seen": 4823449600,
|
|
"step": 9200,
|
|
"train_runtime": 41804.1291,
|
|
"train_tokens_per_second": 115382.133
|
|
},
|
|
{
|
|
"epoch": 0.49839010795746636,
|
|
"grad_norm": 0.14598695933818817,
|
|
"learning_rate": 0.0028604268086269793,
|
|
"loss": 3.072031021118164,
|
|
"num_input_tokens_seen": 4828692480,
|
|
"step": 9210,
|
|
"train_runtime": 41849.2968,
|
|
"train_tokens_per_second": 115382.882
|
|
},
|
|
{
|
|
"epoch": 0.49893124813983064,
|
|
"grad_norm": 0.14812366664409637,
|
|
"learning_rate": 0.0028565000206682125,
|
|
"loss": 3.074822998046875,
|
|
"num_input_tokens_seen": 4833935360,
|
|
"step": 9220,
|
|
"train_runtime": 41894.4632,
|
|
"train_tokens_per_second": 115383.633
|
|
},
|
|
{
|
|
"epoch": 0.49947238832219487,
|
|
"grad_norm": 0.1581128090620041,
|
|
"learning_rate": 0.0028525729075697813,
|
|
"loss": 3.071183967590332,
|
|
"num_input_tokens_seen": 4839178240,
|
|
"step": 9230,
|
|
"train_runtime": 41939.6385,
|
|
"train_tokens_per_second": 115384.357
|
|
},
|
|
{
|
|
"epoch": 0.5000135285045592,
|
|
"grad_norm": 0.160576730966568,
|
|
"learning_rate": 0.002848645481320983,
|
|
"loss": 3.079146385192871,
|
|
"num_input_tokens_seen": 4844421120,
|
|
"step": 9240,
|
|
"train_runtime": 41984.8089,
|
|
"train_tokens_per_second": 115385.094
|
|
},
|
|
{
|
|
"epoch": 0.5005546686869233,
|
|
"grad_norm": 0.1423659771680832,
|
|
"learning_rate": 0.002844717753912068,
|
|
"loss": 3.0759227752685545,
|
|
"num_input_tokens_seen": 4849664000,
|
|
"step": 9250,
|
|
"train_runtime": 42029.9847,
|
|
"train_tokens_per_second": 115385.814
|
|
},
|
|
{
|
|
"epoch": 0.5010958088692876,
|
|
"grad_norm": 0.14177994430065155,
|
|
"learning_rate": 0.0028407897373342074,
|
|
"loss": 3.076811599731445,
|
|
"num_input_tokens_seen": 4854906880,
|
|
"step": 9260,
|
|
"train_runtime": 42075.1549,
|
|
"train_tokens_per_second": 115386.548
|
|
},
|
|
{
|
|
"epoch": 0.5016369490516518,
|
|
"grad_norm": 0.14039497077465057,
|
|
"learning_rate": 0.002836861443579456,
|
|
"loss": 3.0762613296508787,
|
|
"num_input_tokens_seen": 4860149760,
|
|
"step": 9270,
|
|
"train_runtime": 42120.3369,
|
|
"train_tokens_per_second": 115387.248
|
|
},
|
|
{
|
|
"epoch": 0.5021780892340161,
|
|
"grad_norm": 0.15003980696201324,
|
|
"learning_rate": 0.0028329328846407125,
|
|
"loss": 3.0661956787109377,
|
|
"num_input_tokens_seen": 4865392640,
|
|
"step": 9280,
|
|
"train_runtime": 42165.525,
|
|
"train_tokens_per_second": 115387.93
|
|
},
|
|
{
|
|
"epoch": 0.5027192294163804,
|
|
"grad_norm": 0.1639668196439743,
|
|
"learning_rate": 0.0028290040725116876,
|
|
"loss": 3.077253723144531,
|
|
"num_input_tokens_seen": 4870635520,
|
|
"step": 9290,
|
|
"train_runtime": 42210.7142,
|
|
"train_tokens_per_second": 115388.607
|
|
},
|
|
{
|
|
"epoch": 0.5032603695987445,
|
|
"grad_norm": 0.15424971282482147,
|
|
"learning_rate": 0.002825075019186865,
|
|
"loss": 3.0679557800292967,
|
|
"num_input_tokens_seen": 4875878400,
|
|
"step": 9300,
|
|
"train_runtime": 42255.9028,
|
|
"train_tokens_per_second": 115389.285
|
|
},
|
|
{
|
|
"epoch": 0.5038015097811088,
|
|
"grad_norm": 0.1511068195104599,
|
|
"learning_rate": 0.0028211457366614607,
|
|
"loss": 3.0695865631103514,
|
|
"num_input_tokens_seen": 4881121280,
|
|
"step": 9310,
|
|
"train_runtime": 42301.0768,
|
|
"train_tokens_per_second": 115390.001
|
|
},
|
|
{
|
|
"epoch": 0.504342649963473,
|
|
"grad_norm": 0.15356752276420593,
|
|
"learning_rate": 0.002817216236931397,
|
|
"loss": 3.073322296142578,
|
|
"num_input_tokens_seen": 4886364160,
|
|
"step": 9320,
|
|
"train_runtime": 42346.2459,
|
|
"train_tokens_per_second": 115390.728
|
|
},
|
|
{
|
|
"epoch": 0.5048837901458373,
|
|
"grad_norm": 0.14986173808574677,
|
|
"learning_rate": 0.002813286531993253,
|
|
"loss": 3.07531681060791,
|
|
"num_input_tokens_seen": 4891607040,
|
|
"step": 9330,
|
|
"train_runtime": 42391.4328,
|
|
"train_tokens_per_second": 115391.406
|
|
},
|
|
{
|
|
"epoch": 0.5054249303282016,
|
|
"grad_norm": 0.14537614583969116,
|
|
"learning_rate": 0.0028093566338442395,
|
|
"loss": 3.0746026992797852,
|
|
"num_input_tokens_seen": 4896849920,
|
|
"step": 9340,
|
|
"train_runtime": 42436.5896,
|
|
"train_tokens_per_second": 115392.164
|
|
},
|
|
{
|
|
"epoch": 0.5059660705105657,
|
|
"grad_norm": 0.15007568895816803,
|
|
"learning_rate": 0.0028054265544821522,
|
|
"loss": 3.0845333099365235,
|
|
"num_input_tokens_seen": 4902092800,
|
|
"step": 9350,
|
|
"train_runtime": 42481.7468,
|
|
"train_tokens_per_second": 115392.92
|
|
},
|
|
{
|
|
"epoch": 0.50650721069293,
|
|
"grad_norm": 0.15155982971191406,
|
|
"learning_rate": 0.0028014963059053446,
|
|
"loss": 3.0744888305664064,
|
|
"num_input_tokens_seen": 4907335680,
|
|
"step": 9360,
|
|
"train_runtime": 42526.8939,
|
|
"train_tokens_per_second": 115393.701
|
|
},
|
|
{
|
|
"epoch": 0.5070483508752942,
|
|
"grad_norm": 0.15760909020900726,
|
|
"learning_rate": 0.002797565900112684,
|
|
"loss": 3.0650793075561524,
|
|
"num_input_tokens_seen": 4912578560,
|
|
"step": 9370,
|
|
"train_runtime": 42572.0476,
|
|
"train_tokens_per_second": 115394.463
|
|
},
|
|
{
|
|
"epoch": 0.5075894910576585,
|
|
"grad_norm": 0.156438410282135,
|
|
"learning_rate": 0.0027936353491035183,
|
|
"loss": 3.0668895721435545,
|
|
"num_input_tokens_seen": 4917821440,
|
|
"step": 9380,
|
|
"train_runtime": 42617.1956,
|
|
"train_tokens_per_second": 115395.238
|
|
},
|
|
{
|
|
"epoch": 0.5081306312400228,
|
|
"grad_norm": 0.16406750679016113,
|
|
"learning_rate": 0.0027897046648776395,
|
|
"loss": 3.061408042907715,
|
|
"num_input_tokens_seen": 4923064320,
|
|
"step": 9390,
|
|
"train_runtime": 42662.3399,
|
|
"train_tokens_per_second": 115396.022
|
|
},
|
|
{
|
|
"epoch": 0.508671771422387,
|
|
"grad_norm": 0.13937264680862427,
|
|
"learning_rate": 0.002785773859435245,
|
|
"loss": 3.069793128967285,
|
|
"num_input_tokens_seen": 4928307200,
|
|
"step": 9400,
|
|
"train_runtime": 42707.5213,
|
|
"train_tokens_per_second": 115396.704
|
|
},
|
|
{
|
|
"epoch": 0.5092129116047512,
|
|
"grad_norm": 0.15395483374595642,
|
|
"learning_rate": 0.0027818429447769044,
|
|
"loss": 3.071869659423828,
|
|
"num_input_tokens_seen": 4933550080,
|
|
"step": 9410,
|
|
"train_runtime": 42752.6998,
|
|
"train_tokens_per_second": 115397.392
|
|
},
|
|
{
|
|
"epoch": 0.5097540517871154,
|
|
"grad_norm": 0.15119874477386475,
|
|
"learning_rate": 0.0027779119329035167,
|
|
"loss": 3.067423629760742,
|
|
"num_input_tokens_seen": 4938792960,
|
|
"step": 9420,
|
|
"train_runtime": 42797.8655,
|
|
"train_tokens_per_second": 115398.114
|
|
},
|
|
{
|
|
"epoch": 0.5102951919694797,
|
|
"grad_norm": 0.1615118384361267,
|
|
"learning_rate": 0.002773980835816284,
|
|
"loss": 3.0653512954711912,
|
|
"num_input_tokens_seen": 4944035840,
|
|
"step": 9430,
|
|
"train_runtime": 42843.0316,
|
|
"train_tokens_per_second": 115398.833
|
|
},
|
|
{
|
|
"epoch": 0.510836332151844,
|
|
"grad_norm": 0.16538918018341064,
|
|
"learning_rate": 0.0027700496655166614,
|
|
"loss": 3.067237663269043,
|
|
"num_input_tokens_seen": 4949278720,
|
|
"step": 9440,
|
|
"train_runtime": 42888.2044,
|
|
"train_tokens_per_second": 115399.532
|
|
},
|
|
{
|
|
"epoch": 0.5113774723342082,
|
|
"grad_norm": 0.14385169744491577,
|
|
"learning_rate": 0.002766118434006332,
|
|
"loss": 3.078049087524414,
|
|
"num_input_tokens_seen": 4954521600,
|
|
"step": 9450,
|
|
"train_runtime": 42933.3584,
|
|
"train_tokens_per_second": 115400.281
|
|
},
|
|
{
|
|
"epoch": 0.5119186125165724,
|
|
"grad_norm": 0.15073780715465546,
|
|
"learning_rate": 0.0027621871532871657,
|
|
"loss": 3.06368350982666,
|
|
"num_input_tokens_seen": 4959764480,
|
|
"step": 9460,
|
|
"train_runtime": 42978.5149,
|
|
"train_tokens_per_second": 115401.02
|
|
},
|
|
{
|
|
"epoch": 0.5124597526989366,
|
|
"grad_norm": 0.15621446073055267,
|
|
"learning_rate": 0.0027582558353611802,
|
|
"loss": 3.0653354644775392,
|
|
"num_input_tokens_seen": 4965007360,
|
|
"step": 9470,
|
|
"train_runtime": 43023.685,
|
|
"train_tokens_per_second": 115401.722
|
|
},
|
|
{
|
|
"epoch": 0.5130008928813009,
|
|
"grad_norm": 0.15570929646492004,
|
|
"learning_rate": 0.0027543244922305105,
|
|
"loss": 3.0613819122314454,
|
|
"num_input_tokens_seen": 4970250240,
|
|
"step": 9480,
|
|
"train_runtime": 43068.8573,
|
|
"train_tokens_per_second": 115402.417
|
|
},
|
|
{
|
|
"epoch": 0.5135420330636652,
|
|
"grad_norm": 0.1389538198709488,
|
|
"learning_rate": 0.0027503931358973644,
|
|
"loss": 3.0687282562255858,
|
|
"num_input_tokens_seen": 4975493120,
|
|
"step": 9490,
|
|
"train_runtime": 43114.0132,
|
|
"train_tokens_per_second": 115403.154
|
|
},
|
|
{
|
|
"epoch": 0.5140831732460294,
|
|
"grad_norm": 0.15439286828041077,
|
|
"learning_rate": 0.002746461778363992,
|
|
"loss": 3.0685733795166015,
|
|
"num_input_tokens_seen": 4980736000,
|
|
"step": 9500,
|
|
"train_runtime": 43159.1877,
|
|
"train_tokens_per_second": 115403.84
|
|
},
|
|
{
|
|
"epoch": 0.5140831732460294,
|
|
"eval_loss": 3.021251916885376,
|
|
"eval_runtime": 1.9829,
|
|
"eval_samples_per_second": 252.155,
|
|
"eval_steps_per_second": 4.034,
|
|
"num_input_tokens_seen": 4980736000,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 0.5146243134283937,
|
|
"grad_norm": 0.16106902062892914,
|
|
"learning_rate": 0.0027425304316326484,
|
|
"loss": 3.076310729980469,
|
|
"num_input_tokens_seen": 4985978880,
|
|
"step": 9510,
|
|
"train_runtime": 43206.3354,
|
|
"train_tokens_per_second": 115399.254
|
|
},
|
|
{
|
|
"epoch": 0.5151654536107578,
|
|
"grad_norm": 0.16451045870780945,
|
|
"learning_rate": 0.0027385991077055532,
|
|
"loss": 3.0650386810302734,
|
|
"num_input_tokens_seen": 4991221760,
|
|
"step": 9520,
|
|
"train_runtime": 43251.4993,
|
|
"train_tokens_per_second": 115399.971
|
|
},
|
|
{
|
|
"epoch": 0.5157065937931221,
|
|
"grad_norm": 0.141593337059021,
|
|
"learning_rate": 0.002734667818584858,
|
|
"loss": 3.0678850173950196,
|
|
"num_input_tokens_seen": 4996464640,
|
|
"step": 9530,
|
|
"train_runtime": 43296.675,
|
|
"train_tokens_per_second": 115400.655
|
|
},
|
|
{
|
|
"epoch": 0.5162477339754864,
|
|
"grad_norm": 0.15153639018535614,
|
|
"learning_rate": 0.002730736576272606,
|
|
"loss": 3.0637826919555664,
|
|
"num_input_tokens_seen": 5001707520,
|
|
"step": 9540,
|
|
"train_runtime": 43345.4631,
|
|
"train_tokens_per_second": 115391.719
|
|
},
|
|
{
|
|
"epoch": 0.5167888741578506,
|
|
"grad_norm": 0.1510300487279892,
|
|
"learning_rate": 0.0027268053927707015,
|
|
"loss": 3.066213607788086,
|
|
"num_input_tokens_seen": 5006950400,
|
|
"step": 9550,
|
|
"train_runtime": 43390.6485,
|
|
"train_tokens_per_second": 115392.385
|
|
},
|
|
{
|
|
"epoch": 0.5173300143402149,
|
|
"grad_norm": 0.14986655116081238,
|
|
"learning_rate": 0.0027228742800808657,
|
|
"loss": 3.069229507446289,
|
|
"num_input_tokens_seen": 5012193280,
|
|
"step": 9560,
|
|
"train_runtime": 43435.8008,
|
|
"train_tokens_per_second": 115393.136
|
|
},
|
|
{
|
|
"epoch": 0.517871154522579,
|
|
"grad_norm": 0.15248462557792664,
|
|
"learning_rate": 0.002718943250204604,
|
|
"loss": 3.0567092895507812,
|
|
"num_input_tokens_seen": 5017436160,
|
|
"step": 9570,
|
|
"train_runtime": 43480.9543,
|
|
"train_tokens_per_second": 115393.883
|
|
},
|
|
{
|
|
"epoch": 0.5184122947049433,
|
|
"grad_norm": 0.1401718556880951,
|
|
"learning_rate": 0.0027150123151431717,
|
|
"loss": 3.0642112731933593,
|
|
"num_input_tokens_seen": 5022679040,
|
|
"step": 9580,
|
|
"train_runtime": 43526.1319,
|
|
"train_tokens_per_second": 115394.565
|
|
},
|
|
{
|
|
"epoch": 0.5189534348873076,
|
|
"grad_norm": 0.1594904363155365,
|
|
"learning_rate": 0.002711081486897532,
|
|
"loss": 3.077428436279297,
|
|
"num_input_tokens_seen": 5027921920,
|
|
"step": 9590,
|
|
"train_runtime": 43571.3028,
|
|
"train_tokens_per_second": 115395.262
|
|
},
|
|
{
|
|
"epoch": 0.5194945750696718,
|
|
"grad_norm": 0.16368508338928223,
|
|
"learning_rate": 0.0027071507774683217,
|
|
"loss": 3.0642780303955077,
|
|
"num_input_tokens_seen": 5033164800,
|
|
"step": 9600,
|
|
"train_runtime": 43616.4759,
|
|
"train_tokens_per_second": 115395.953
|
|
},
|
|
{
|
|
"epoch": 0.5200357152520361,
|
|
"grad_norm": 0.15867675840854645,
|
|
"learning_rate": 0.0027032201988558165,
|
|
"loss": 3.056943893432617,
|
|
"num_input_tokens_seen": 5038407680,
|
|
"step": 9610,
|
|
"train_runtime": 43661.6654,
|
|
"train_tokens_per_second": 115396.599
|
|
},
|
|
{
|
|
"epoch": 0.5205768554344002,
|
|
"grad_norm": 0.14183756709098816,
|
|
"learning_rate": 0.0026992897630598927,
|
|
"loss": 3.0706558227539062,
|
|
"num_input_tokens_seen": 5043650560,
|
|
"step": 9620,
|
|
"train_runtime": 43706.8375,
|
|
"train_tokens_per_second": 115397.289
|
|
},
|
|
{
|
|
"epoch": 0.5211179956167645,
|
|
"grad_norm": 0.15698400139808655,
|
|
"learning_rate": 0.002695359482079989,
|
|
"loss": 3.0621952056884765,
|
|
"num_input_tokens_seen": 5048893440,
|
|
"step": 9630,
|
|
"train_runtime": 43752.0038,
|
|
"train_tokens_per_second": 115397.993
|
|
},
|
|
{
|
|
"epoch": 0.5216591357991288,
|
|
"grad_norm": 0.15258745849132538,
|
|
"learning_rate": 0.002691429367915072,
|
|
"loss": 3.0683521270751952,
|
|
"num_input_tokens_seen": 5054136320,
|
|
"step": 9640,
|
|
"train_runtime": 43797.1921,
|
|
"train_tokens_per_second": 115398.638
|
|
},
|
|
{
|
|
"epoch": 0.522200275981493,
|
|
"grad_norm": 0.1572786569595337,
|
|
"learning_rate": 0.0026874994325636016,
|
|
"loss": 3.0657506942749024,
|
|
"num_input_tokens_seen": 5059379200,
|
|
"step": 9650,
|
|
"train_runtime": 43842.367,
|
|
"train_tokens_per_second": 115399.317
|
|
},
|
|
{
|
|
"epoch": 0.5227414161638573,
|
|
"grad_norm": 0.1440490484237671,
|
|
"learning_rate": 0.002683569688023488,
|
|
"loss": 3.057964324951172,
|
|
"num_input_tokens_seen": 5064622080,
|
|
"step": 9660,
|
|
"train_runtime": 43887.5251,
|
|
"train_tokens_per_second": 115400.038
|
|
},
|
|
{
|
|
"epoch": 0.5232825563462215,
|
|
"grad_norm": 0.1437375247478485,
|
|
"learning_rate": 0.002679640146292061,
|
|
"loss": 3.067335510253906,
|
|
"num_input_tokens_seen": 5069864960,
|
|
"step": 9670,
|
|
"train_runtime": 43932.6993,
|
|
"train_tokens_per_second": 115400.716
|
|
},
|
|
{
|
|
"epoch": 0.5238236965285857,
|
|
"grad_norm": 0.14964227378368378,
|
|
"learning_rate": 0.0026757108193660294,
|
|
"loss": 3.0661109924316405,
|
|
"num_input_tokens_seen": 5075107840,
|
|
"step": 9680,
|
|
"train_runtime": 43977.8845,
|
|
"train_tokens_per_second": 115401.364
|
|
},
|
|
{
|
|
"epoch": 0.52436483671095,
|
|
"grad_norm": 0.15807990729808807,
|
|
"learning_rate": 0.0026717817192414496,
|
|
"loss": 3.0581291198730467,
|
|
"num_input_tokens_seen": 5080350720,
|
|
"step": 9690,
|
|
"train_runtime": 44023.0423,
|
|
"train_tokens_per_second": 115402.082
|
|
},
|
|
{
|
|
"epoch": 0.5249059768933142,
|
|
"grad_norm": 0.1513347625732422,
|
|
"learning_rate": 0.0026678528579136833,
|
|
"loss": 3.0648067474365233,
|
|
"num_input_tokens_seen": 5085593600,
|
|
"step": 9700,
|
|
"train_runtime": 44068.1745,
|
|
"train_tokens_per_second": 115402.865
|
|
},
|
|
{
|
|
"epoch": 0.5254471170756785,
|
|
"grad_norm": 0.13990284502506256,
|
|
"learning_rate": 0.002663924247377361,
|
|
"loss": 3.06469841003418,
|
|
"num_input_tokens_seen": 5090836480,
|
|
"step": 9710,
|
|
"train_runtime": 44113.2866,
|
|
"train_tokens_per_second": 115403.7
|
|
},
|
|
{
|
|
"epoch": 0.5259882572580427,
|
|
"grad_norm": 0.16054664552211761,
|
|
"learning_rate": 0.002659995899626353,
|
|
"loss": 3.070522689819336,
|
|
"num_input_tokens_seen": 5096079360,
|
|
"step": 9720,
|
|
"train_runtime": 44158.4184,
|
|
"train_tokens_per_second": 115404.481
|
|
},
|
|
{
|
|
"epoch": 0.5265293974404069,
|
|
"grad_norm": 0.15888893604278564,
|
|
"learning_rate": 0.0026560678266537223,
|
|
"loss": 3.061862564086914,
|
|
"num_input_tokens_seen": 5101322240,
|
|
"step": 9730,
|
|
"train_runtime": 44203.5783,
|
|
"train_tokens_per_second": 115405.188
|
|
},
|
|
{
|
|
"epoch": 0.5270705376227712,
|
|
"grad_norm": 0.1485973447561264,
|
|
"learning_rate": 0.002652140040451696,
|
|
"loss": 3.0686100006103514,
|
|
"num_input_tokens_seen": 5106565120,
|
|
"step": 9740,
|
|
"train_runtime": 44248.7463,
|
|
"train_tokens_per_second": 115405.871
|
|
},
|
|
{
|
|
"epoch": 0.5276116778051354,
|
|
"grad_norm": 0.1576089709997177,
|
|
"learning_rate": 0.002648212553011623,
|
|
"loss": 3.062734603881836,
|
|
"num_input_tokens_seen": 5111808000,
|
|
"step": 9750,
|
|
"train_runtime": 44293.9122,
|
|
"train_tokens_per_second": 115406.559
|
|
},
|
|
{
|
|
"epoch": 0.5281528179874997,
|
|
"grad_norm": 0.14466626942157745,
|
|
"learning_rate": 0.0026442853763239444,
|
|
"loss": 3.0534576416015624,
|
|
"num_input_tokens_seen": 5117050880,
|
|
"step": 9760,
|
|
"train_runtime": 44339.0625,
|
|
"train_tokens_per_second": 115407.286
|
|
},
|
|
{
|
|
"epoch": 0.5286939581698639,
|
|
"grad_norm": 0.13870450854301453,
|
|
"learning_rate": 0.0026403585223781483,
|
|
"loss": 3.0488052368164062,
|
|
"num_input_tokens_seen": 5122293760,
|
|
"step": 9770,
|
|
"train_runtime": 44384.2076,
|
|
"train_tokens_per_second": 115408.025
|
|
},
|
|
{
|
|
"epoch": 0.5292350983522282,
|
|
"grad_norm": 0.15322810411453247,
|
|
"learning_rate": 0.0026364320031627385,
|
|
"loss": 3.056787109375,
|
|
"num_input_tokens_seen": 5127536640,
|
|
"step": 9780,
|
|
"train_runtime": 44429.3763,
|
|
"train_tokens_per_second": 115408.702
|
|
},
|
|
{
|
|
"epoch": 0.5297762385345924,
|
|
"grad_norm": 0.1526278853416443,
|
|
"learning_rate": 0.0026325058306652,
|
|
"loss": 3.068254089355469,
|
|
"num_input_tokens_seen": 5132779520,
|
|
"step": 9790,
|
|
"train_runtime": 44474.5291,
|
|
"train_tokens_per_second": 115409.418
|
|
},
|
|
{
|
|
"epoch": 0.5303173787169566,
|
|
"grad_norm": 0.14957252144813538,
|
|
"learning_rate": 0.002628580016871954,
|
|
"loss": 3.0630029678344726,
|
|
"num_input_tokens_seen": 5138022400,
|
|
"step": 9800,
|
|
"train_runtime": 44519.6838,
|
|
"train_tokens_per_second": 115410.128
|
|
},
|
|
{
|
|
"epoch": 0.5308585188993209,
|
|
"grad_norm": 0.1606789082288742,
|
|
"learning_rate": 0.002624654573768332,
|
|
"loss": 3.0618038177490234,
|
|
"num_input_tokens_seen": 5143265280,
|
|
"step": 9810,
|
|
"train_runtime": 44564.8296,
|
|
"train_tokens_per_second": 115410.859
|
|
},
|
|
{
|
|
"epoch": 0.5313996590816851,
|
|
"grad_norm": 0.148385152220726,
|
|
"learning_rate": 0.002620729513338529,
|
|
"loss": 3.06335334777832,
|
|
"num_input_tokens_seen": 5148508160,
|
|
"step": 9820,
|
|
"train_runtime": 44609.9754,
|
|
"train_tokens_per_second": 115411.589
|
|
},
|
|
{
|
|
"epoch": 0.5319407992640494,
|
|
"grad_norm": 0.1520962417125702,
|
|
"learning_rate": 0.002616804847565574,
|
|
"loss": 3.061989593505859,
|
|
"num_input_tokens_seen": 5153751040,
|
|
"step": 9830,
|
|
"train_runtime": 44655.138,
|
|
"train_tokens_per_second": 115412.275
|
|
},
|
|
{
|
|
"epoch": 0.5324819394464136,
|
|
"grad_norm": 0.14803871512413025,
|
|
"learning_rate": 0.002612880588431294,
|
|
"loss": 3.062520980834961,
|
|
"num_input_tokens_seen": 5158993920,
|
|
"step": 9840,
|
|
"train_runtime": 44700.2918,
|
|
"train_tokens_per_second": 115412.981
|
|
},
|
|
{
|
|
"epoch": 0.5330230796287778,
|
|
"grad_norm": 0.14693519473075867,
|
|
"learning_rate": 0.002608956747916268,
|
|
"loss": 3.053732681274414,
|
|
"num_input_tokens_seen": 5164236800,
|
|
"step": 9850,
|
|
"train_runtime": 44745.4454,
|
|
"train_tokens_per_second": 115413.686
|
|
},
|
|
{
|
|
"epoch": 0.5335642198111421,
|
|
"grad_norm": 0.14247646927833557,
|
|
"learning_rate": 0.0026050333379998014,
|
|
"loss": 3.0687253952026365,
|
|
"num_input_tokens_seen": 5169479680,
|
|
"step": 9860,
|
|
"train_runtime": 44790.614,
|
|
"train_tokens_per_second": 115414.352
|
|
},
|
|
{
|
|
"epoch": 0.5341053599935063,
|
|
"grad_norm": 0.1414949595928192,
|
|
"learning_rate": 0.0026011103706598867,
|
|
"loss": 3.0667953491210938,
|
|
"num_input_tokens_seen": 5174722560,
|
|
"step": 9870,
|
|
"train_runtime": 44835.7697,
|
|
"train_tokens_per_second": 115415.049
|
|
},
|
|
{
|
|
"epoch": 0.5346465001758706,
|
|
"grad_norm": 0.15308037400245667,
|
|
"learning_rate": 0.00259718785787316,
|
|
"loss": 3.0632091522216798,
|
|
"num_input_tokens_seen": 5179965440,
|
|
"step": 9880,
|
|
"train_runtime": 44880.9299,
|
|
"train_tokens_per_second": 115415.733
|
|
},
|
|
{
|
|
"epoch": 0.5351876403582349,
|
|
"grad_norm": 0.1401015669107437,
|
|
"learning_rate": 0.002593265811614872,
|
|
"loss": 3.054189682006836,
|
|
"num_input_tokens_seen": 5185208320,
|
|
"step": 9890,
|
|
"train_runtime": 44926.1001,
|
|
"train_tokens_per_second": 115416.391
|
|
},
|
|
{
|
|
"epoch": 0.535728780540599,
|
|
"grad_norm": 0.15150010585784912,
|
|
"learning_rate": 0.0025893442438588523,
|
|
"loss": 3.0516624450683594,
|
|
"num_input_tokens_seen": 5190451200,
|
|
"step": 9900,
|
|
"train_runtime": 44971.2488,
|
|
"train_tokens_per_second": 115417.102
|
|
},
|
|
{
|
|
"epoch": 0.5362699207229633,
|
|
"grad_norm": 0.17257611453533173,
|
|
"learning_rate": 0.0025854231665774653,
|
|
"loss": 3.0537059783935545,
|
|
"num_input_tokens_seen": 5195694080,
|
|
"step": 9910,
|
|
"train_runtime": 45016.4036,
|
|
"train_tokens_per_second": 115417.796
|
|
},
|
|
{
|
|
"epoch": 0.5368110609053275,
|
|
"grad_norm": 0.14506685733795166,
|
|
"learning_rate": 0.002581502591741579,
|
|
"loss": 3.0568138122558595,
|
|
"num_input_tokens_seen": 5200936960,
|
|
"step": 9920,
|
|
"train_runtime": 45065.2284,
|
|
"train_tokens_per_second": 115409.089
|
|
},
|
|
{
|
|
"epoch": 0.5373522010876918,
|
|
"grad_norm": 0.14898552000522614,
|
|
"learning_rate": 0.002577582531320528,
|
|
"loss": 3.0490861892700196,
|
|
"num_input_tokens_seen": 5206179840,
|
|
"step": 9930,
|
|
"train_runtime": 45110.3898,
|
|
"train_tokens_per_second": 115409.773
|
|
},
|
|
{
|
|
"epoch": 0.5378933412700561,
|
|
"grad_norm": 0.14676256477832794,
|
|
"learning_rate": 0.0025736629972820785,
|
|
"loss": 3.067533493041992,
|
|
"num_input_tokens_seen": 5211422720,
|
|
"step": 9940,
|
|
"train_runtime": 45155.5461,
|
|
"train_tokens_per_second": 115410.468
|
|
},
|
|
{
|
|
"epoch": 0.5384344814524202,
|
|
"grad_norm": 0.15546375513076782,
|
|
"learning_rate": 0.002569744001592385,
|
|
"loss": 3.053817367553711,
|
|
"num_input_tokens_seen": 5216665600,
|
|
"step": 9950,
|
|
"train_runtime": 45200.7043,
|
|
"train_tokens_per_second": 115411.157
|
|
},
|
|
{
|
|
"epoch": 0.5389756216347845,
|
|
"grad_norm": 0.16900601983070374,
|
|
"learning_rate": 0.002565825556215962,
|
|
"loss": 3.062000274658203,
|
|
"num_input_tokens_seen": 5221908480,
|
|
"step": 9960,
|
|
"train_runtime": 45245.886,
|
|
"train_tokens_per_second": 115411.785
|
|
},
|
|
{
|
|
"epoch": 0.5395167618171487,
|
|
"grad_norm": 0.14602519571781158,
|
|
"learning_rate": 0.0025619076731156444,
|
|
"loss": 3.0598079681396486,
|
|
"num_input_tokens_seen": 5227151360,
|
|
"step": 9970,
|
|
"train_runtime": 45291.0267,
|
|
"train_tokens_per_second": 115412.516
|
|
},
|
|
{
|
|
"epoch": 0.540057901999513,
|
|
"grad_norm": 0.15503665804862976,
|
|
"learning_rate": 0.002557990364252547,
|
|
"loss": 3.047740936279297,
|
|
"num_input_tokens_seen": 5232394240,
|
|
"step": 9980,
|
|
"train_runtime": 45336.1788,
|
|
"train_tokens_per_second": 115413.217
|
|
},
|
|
{
|
|
"epoch": 0.5405990421818773,
|
|
"grad_norm": 0.15822327136993408,
|
|
"learning_rate": 0.0025540736415860343,
|
|
"loss": 3.0622173309326173,
|
|
"num_input_tokens_seen": 5237637120,
|
|
"step": 9990,
|
|
"train_runtime": 45381.3445,
|
|
"train_tokens_per_second": 115413.882
|
|
},
|
|
{
|
|
"epoch": 0.5411401823642414,
|
|
"grad_norm": 0.13620983064174652,
|
|
"learning_rate": 0.0025501575170736803,
|
|
"loss": 3.0480823516845703,
|
|
"num_input_tokens_seen": 5242880000,
|
|
"step": 10000,
|
|
"train_runtime": 45426.5099,
|
|
"train_tokens_per_second": 115414.546
|
|
},
|
|
{
|
|
"epoch": 0.5411401823642414,
|
|
"eval_loss": 3.0108466148376465,
|
|
"eval_runtime": 1.9863,
|
|
"eval_samples_per_second": 251.722,
|
|
"eval_steps_per_second": 4.028,
|
|
"num_input_tokens_seen": 5242880000,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 0.5416813225466057,
|
|
"grad_norm": 0.14582324028015137,
|
|
"learning_rate": 0.002546242002671233,
|
|
"loss": 3.0488231658935545,
|
|
"num_input_tokens_seen": 5248122880,
|
|
"step": 10010,
|
|
"train_runtime": 45475.9888,
|
|
"train_tokens_per_second": 115404.261
|
|
},
|
|
{
|
|
"epoch": 0.5422224627289699,
|
|
"grad_norm": 0.14079852402210236,
|
|
"learning_rate": 0.0025423271103325786,
|
|
"loss": 3.0604705810546875,
|
|
"num_input_tokens_seen": 5253365760,
|
|
"step": 10020,
|
|
"train_runtime": 45521.0949,
|
|
"train_tokens_per_second": 115405.083
|
|
},
|
|
{
|
|
"epoch": 0.5427636029113342,
|
|
"grad_norm": 0.15606586635112762,
|
|
"learning_rate": 0.002538412852009702,
|
|
"loss": 3.0583091735839845,
|
|
"num_input_tokens_seen": 5258608640,
|
|
"step": 10030,
|
|
"train_runtime": 45566.226,
|
|
"train_tokens_per_second": 115405.841
|
|
},
|
|
{
|
|
"epoch": 0.5433047430936985,
|
|
"grad_norm": 0.14329582452774048,
|
|
"learning_rate": 0.002534499239652654,
|
|
"loss": 3.0502853393554688,
|
|
"num_input_tokens_seen": 5263851520,
|
|
"step": 10040,
|
|
"train_runtime": 45611.3663,
|
|
"train_tokens_per_second": 115406.574
|
|
},
|
|
{
|
|
"epoch": 0.5438458832760626,
|
|
"grad_norm": 0.14503344893455505,
|
|
"learning_rate": 0.0025305862852095145,
|
|
"loss": 3.0582489013671874,
|
|
"num_input_tokens_seen": 5269094400,
|
|
"step": 10050,
|
|
"train_runtime": 45656.5231,
|
|
"train_tokens_per_second": 115407.264
|
|
},
|
|
{
|
|
"epoch": 0.5443870234584269,
|
|
"grad_norm": 0.153029665350914,
|
|
"learning_rate": 0.002526674000626352,
|
|
"loss": 3.052793502807617,
|
|
"num_input_tokens_seen": 5274337280,
|
|
"step": 10060,
|
|
"train_runtime": 45701.6761,
|
|
"train_tokens_per_second": 115407.962
|
|
},
|
|
{
|
|
"epoch": 0.5449281636407911,
|
|
"grad_norm": 0.14204776287078857,
|
|
"learning_rate": 0.00252276239784719,
|
|
"loss": 3.052510643005371,
|
|
"num_input_tokens_seen": 5279580160,
|
|
"step": 10070,
|
|
"train_runtime": 45746.8203,
|
|
"train_tokens_per_second": 115408.68
|
|
},
|
|
{
|
|
"epoch": 0.5454693038231554,
|
|
"grad_norm": 0.14915040135383606,
|
|
"learning_rate": 0.0025188514888139757,
|
|
"loss": 3.058329391479492,
|
|
"num_input_tokens_seen": 5284823040,
|
|
"step": 10080,
|
|
"train_runtime": 45791.9522,
|
|
"train_tokens_per_second": 115409.429
|
|
},
|
|
{
|
|
"epoch": 0.5460104440055197,
|
|
"grad_norm": 0.14046527445316315,
|
|
"learning_rate": 0.0025149412854665316,
|
|
"loss": 3.0549495697021483,
|
|
"num_input_tokens_seen": 5290065920,
|
|
"step": 10090,
|
|
"train_runtime": 45837.0921,
|
|
"train_tokens_per_second": 115410.155
|
|
},
|
|
{
|
|
"epoch": 0.5465515841878839,
|
|
"grad_norm": 0.15560267865657806,
|
|
"learning_rate": 0.0025110317997425295,
|
|
"loss": 3.0544879913330076,
|
|
"num_input_tokens_seen": 5295308800,
|
|
"step": 10100,
|
|
"train_runtime": 45882.2338,
|
|
"train_tokens_per_second": 115410.876
|
|
},
|
|
{
|
|
"epoch": 0.5470927243702481,
|
|
"grad_norm": 0.15250231325626373,
|
|
"learning_rate": 0.002507123043577449,
|
|
"loss": 3.0573678970336915,
|
|
"num_input_tokens_seen": 5300551680,
|
|
"step": 10110,
|
|
"train_runtime": 45927.3738,
|
|
"train_tokens_per_second": 115411.6
|
|
},
|
|
{
|
|
"epoch": 0.5476338645526123,
|
|
"grad_norm": 0.13873432576656342,
|
|
"learning_rate": 0.002503215028904543,
|
|
"loss": 3.0521045684814454,
|
|
"num_input_tokens_seen": 5305794560,
|
|
"step": 10120,
|
|
"train_runtime": 45972.5283,
|
|
"train_tokens_per_second": 115412.285
|
|
},
|
|
{
|
|
"epoch": 0.5481750047349766,
|
|
"grad_norm": 0.15664595365524292,
|
|
"learning_rate": 0.0024993077676548014,
|
|
"loss": 3.0536930084228517,
|
|
"num_input_tokens_seen": 5311037440,
|
|
"step": 10130,
|
|
"train_runtime": 46017.6749,
|
|
"train_tokens_per_second": 115412.99
|
|
},
|
|
{
|
|
"epoch": 0.5487161449173409,
|
|
"grad_norm": 0.15226289629936218,
|
|
"learning_rate": 0.002495401271756911,
|
|
"loss": 3.0586917877197264,
|
|
"num_input_tokens_seen": 5316280320,
|
|
"step": 10140,
|
|
"train_runtime": 46062.8137,
|
|
"train_tokens_per_second": 115413.712
|
|
},
|
|
{
|
|
"epoch": 0.5492572850997051,
|
|
"grad_norm": 0.14132562279701233,
|
|
"learning_rate": 0.0024914955531372264,
|
|
"loss": 3.0555648803710938,
|
|
"num_input_tokens_seen": 5321523200,
|
|
"step": 10150,
|
|
"train_runtime": 46107.9535,
|
|
"train_tokens_per_second": 115414.43
|
|
},
|
|
{
|
|
"epoch": 0.5497984252820693,
|
|
"grad_norm": 0.15198439359664917,
|
|
"learning_rate": 0.002487590623719726,
|
|
"loss": 3.0600481033325195,
|
|
"num_input_tokens_seen": 5326766080,
|
|
"step": 10160,
|
|
"train_runtime": 46153.0991,
|
|
"train_tokens_per_second": 115415.133
|
|
},
|
|
{
|
|
"epoch": 0.5503395654644335,
|
|
"grad_norm": 0.1487119495868683,
|
|
"learning_rate": 0.002483686495425979,
|
|
"loss": 3.0562034606933595,
|
|
"num_input_tokens_seen": 5332008960,
|
|
"step": 10170,
|
|
"train_runtime": 46198.2435,
|
|
"train_tokens_per_second": 115415.837
|
|
},
|
|
{
|
|
"epoch": 0.5508807056467978,
|
|
"grad_norm": 0.1500309705734253,
|
|
"learning_rate": 0.00247978318017511,
|
|
"loss": 3.0558704376220702,
|
|
"num_input_tokens_seen": 5337251840,
|
|
"step": 10180,
|
|
"train_runtime": 46243.3932,
|
|
"train_tokens_per_second": 115416.527
|
|
},
|
|
{
|
|
"epoch": 0.5514218458291621,
|
|
"grad_norm": 0.14477477967739105,
|
|
"learning_rate": 0.0024758806898837614,
|
|
"loss": 3.0584625244140624,
|
|
"num_input_tokens_seen": 5342494720,
|
|
"step": 10190,
|
|
"train_runtime": 46288.5341,
|
|
"train_tokens_per_second": 115417.237
|
|
},
|
|
{
|
|
"epoch": 0.5519629860115263,
|
|
"grad_norm": 0.14965343475341797,
|
|
"learning_rate": 0.0024719790364660555,
|
|
"loss": 3.053845024108887,
|
|
"num_input_tokens_seen": 5347737600,
|
|
"step": 10200,
|
|
"train_runtime": 46333.6743,
|
|
"train_tokens_per_second": 115417.948
|
|
},
|
|
{
|
|
"epoch": 0.5525041261938906,
|
|
"grad_norm": 0.14595621824264526,
|
|
"learning_rate": 0.002468078231833561,
|
|
"loss": 3.0468162536621093,
|
|
"num_input_tokens_seen": 5352980480,
|
|
"step": 10210,
|
|
"train_runtime": 46378.8185,
|
|
"train_tokens_per_second": 115418.647
|
|
},
|
|
{
|
|
"epoch": 0.5530452663762547,
|
|
"grad_norm": 0.15934494137763977,
|
|
"learning_rate": 0.002464178287895256,
|
|
"loss": 3.0428611755371096,
|
|
"num_input_tokens_seen": 5358223360,
|
|
"step": 10220,
|
|
"train_runtime": 46423.9466,
|
|
"train_tokens_per_second": 115419.385
|
|
},
|
|
{
|
|
"epoch": 0.553586406558619,
|
|
"grad_norm": 0.14214660227298737,
|
|
"learning_rate": 0.002460279216557488,
|
|
"loss": 3.0542884826660157,
|
|
"num_input_tokens_seen": 5363466240,
|
|
"step": 10230,
|
|
"train_runtime": 46469.0816,
|
|
"train_tokens_per_second": 115420.104
|
|
},
|
|
{
|
|
"epoch": 0.5541275467409833,
|
|
"grad_norm": 0.16061817109584808,
|
|
"learning_rate": 0.0024563810297239448,
|
|
"loss": 3.0611974716186525,
|
|
"num_input_tokens_seen": 5368709120,
|
|
"step": 10240,
|
|
"train_runtime": 46514.2276,
|
|
"train_tokens_per_second": 115420.795
|
|
},
|
|
{
|
|
"epoch": 0.5546686869233475,
|
|
"grad_norm": 0.14743222296237946,
|
|
"learning_rate": 0.0024524837392956088,
|
|
"loss": 3.0409524917602537,
|
|
"num_input_tokens_seen": 5373952000,
|
|
"step": 10250,
|
|
"train_runtime": 46559.373,
|
|
"train_tokens_per_second": 115421.486
|
|
},
|
|
{
|
|
"epoch": 0.5552098271057118,
|
|
"grad_norm": 0.145145446062088,
|
|
"learning_rate": 0.0024485873571707313,
|
|
"loss": 3.0503875732421877,
|
|
"num_input_tokens_seen": 5379194880,
|
|
"step": 10260,
|
|
"train_runtime": 46604.5364,
|
|
"train_tokens_per_second": 115422.13
|
|
},
|
|
{
|
|
"epoch": 0.5557509672880759,
|
|
"grad_norm": 0.13785313069820404,
|
|
"learning_rate": 0.0024446918952447856,
|
|
"loss": 3.051102066040039,
|
|
"num_input_tokens_seen": 5384437760,
|
|
"step": 10270,
|
|
"train_runtime": 46649.714,
|
|
"train_tokens_per_second": 115422.739
|
|
},
|
|
{
|
|
"epoch": 0.5562921074704402,
|
|
"grad_norm": 0.15741367638111115,
|
|
"learning_rate": 0.002440797365410437,
|
|
"loss": 3.0524486541748046,
|
|
"num_input_tokens_seen": 5389680640,
|
|
"step": 10280,
|
|
"train_runtime": 46694.8766,
|
|
"train_tokens_per_second": 115423.383
|
|
},
|
|
{
|
|
"epoch": 0.5568332476528045,
|
|
"grad_norm": 0.14624185860157013,
|
|
"learning_rate": 0.002436903779557509,
|
|
"loss": 3.041734313964844,
|
|
"num_input_tokens_seen": 5394923520,
|
|
"step": 10290,
|
|
"train_runtime": 46740.04,
|
|
"train_tokens_per_second": 115424.024
|
|
},
|
|
{
|
|
"epoch": 0.5573743878351687,
|
|
"grad_norm": 0.15357740223407745,
|
|
"learning_rate": 0.002433011149572938,
|
|
"loss": 3.05377254486084,
|
|
"num_input_tokens_seen": 5400166400,
|
|
"step": 10300,
|
|
"train_runtime": 46785.1999,
|
|
"train_tokens_per_second": 115424.673
|
|
},
|
|
{
|
|
"epoch": 0.557915528017533,
|
|
"grad_norm": 0.1404609978199005,
|
|
"learning_rate": 0.002429119487340744,
|
|
"loss": 3.0517080307006834,
|
|
"num_input_tokens_seen": 5405409280,
|
|
"step": 10310,
|
|
"train_runtime": 46834.0267,
|
|
"train_tokens_per_second": 115416.283
|
|
},
|
|
{
|
|
"epoch": 0.5584566681998971,
|
|
"grad_norm": 0.13460350036621094,
|
|
"learning_rate": 0.0024252288047419933,
|
|
"loss": 3.047005462646484,
|
|
"num_input_tokens_seen": 5410652160,
|
|
"step": 10320,
|
|
"train_runtime": 46879.179,
|
|
"train_tokens_per_second": 115416.956
|
|
},
|
|
{
|
|
"epoch": 0.5589978083822614,
|
|
"grad_norm": 0.13896240293979645,
|
|
"learning_rate": 0.002421339113654761,
|
|
"loss": 3.0483970642089844,
|
|
"num_input_tokens_seen": 5415895040,
|
|
"step": 10330,
|
|
"train_runtime": 46924.3535,
|
|
"train_tokens_per_second": 115417.574
|
|
},
|
|
{
|
|
"epoch": 0.5595389485646257,
|
|
"grad_norm": 0.1555888056755066,
|
|
"learning_rate": 0.0024174504259540965,
|
|
"loss": 3.045535087585449,
|
|
"num_input_tokens_seen": 5421137920,
|
|
"step": 10340,
|
|
"train_runtime": 46969.5132,
|
|
"train_tokens_per_second": 115418.227
|
|
},
|
|
{
|
|
"epoch": 0.5600800887469899,
|
|
"grad_norm": 0.1442544311285019,
|
|
"learning_rate": 0.002413562753511982,
|
|
"loss": 3.0400226593017576,
|
|
"num_input_tokens_seen": 5426380800,
|
|
"step": 10350,
|
|
"train_runtime": 47014.6917,
|
|
"train_tokens_per_second": 115418.832
|
|
},
|
|
{
|
|
"epoch": 0.5606212289293542,
|
|
"grad_norm": 0.16144470870494843,
|
|
"learning_rate": 0.002409676108197302,
|
|
"loss": 3.044460678100586,
|
|
"num_input_tokens_seen": 5431623680,
|
|
"step": 10360,
|
|
"train_runtime": 47059.8715,
|
|
"train_tokens_per_second": 115419.433
|
|
},
|
|
{
|
|
"epoch": 0.5611623691117184,
|
|
"grad_norm": 0.1435036063194275,
|
|
"learning_rate": 0.0024057905018758097,
|
|
"loss": 3.051218032836914,
|
|
"num_input_tokens_seen": 5436866560,
|
|
"step": 10370,
|
|
"train_runtime": 47105.0206,
|
|
"train_tokens_per_second": 115420.108
|
|
},
|
|
{
|
|
"epoch": 0.5617035092940826,
|
|
"grad_norm": 0.14437657594680786,
|
|
"learning_rate": 0.0024019059464100794,
|
|
"loss": 3.049814987182617,
|
|
"num_input_tokens_seen": 5442109440,
|
|
"step": 10380,
|
|
"train_runtime": 47150.1709,
|
|
"train_tokens_per_second": 115420.779
|
|
},
|
|
{
|
|
"epoch": 0.5622446494764469,
|
|
"grad_norm": 0.1483716368675232,
|
|
"learning_rate": 0.0023980224536594803,
|
|
"loss": 3.051362991333008,
|
|
"num_input_tokens_seen": 5447352320,
|
|
"step": 10390,
|
|
"train_runtime": 47195.3171,
|
|
"train_tokens_per_second": 115421.458
|
|
},
|
|
{
|
|
"epoch": 0.5627857896588111,
|
|
"grad_norm": 0.1392650008201599,
|
|
"learning_rate": 0.002394140035480139,
|
|
"loss": 3.05356502532959,
|
|
"num_input_tokens_seen": 5452595200,
|
|
"step": 10400,
|
|
"train_runtime": 47240.4745,
|
|
"train_tokens_per_second": 115422.109
|
|
},
|
|
{
|
|
"epoch": 0.5633269298411754,
|
|
"grad_norm": 0.13990668952465057,
|
|
"learning_rate": 0.002390258703724898,
|
|
"loss": 3.053313064575195,
|
|
"num_input_tokens_seen": 5457838080,
|
|
"step": 10410,
|
|
"train_runtime": 47285.6159,
|
|
"train_tokens_per_second": 115422.798
|
|
},
|
|
{
|
|
"epoch": 0.5638680700235396,
|
|
"grad_norm": 0.1470736563205719,
|
|
"learning_rate": 0.002386378470243285,
|
|
"loss": 3.050541305541992,
|
|
"num_input_tokens_seen": 5463080960,
|
|
"step": 10420,
|
|
"train_runtime": 47330.7575,
|
|
"train_tokens_per_second": 115423.485
|
|
},
|
|
{
|
|
"epoch": 0.5644092102059038,
|
|
"grad_norm": 0.15061776340007782,
|
|
"learning_rate": 0.0023824993468814734,
|
|
"loss": 3.0488460540771483,
|
|
"num_input_tokens_seen": 5468323840,
|
|
"step": 10430,
|
|
"train_runtime": 47375.9168,
|
|
"train_tokens_per_second": 115424.127
|
|
},
|
|
{
|
|
"epoch": 0.5649503503882681,
|
|
"grad_norm": 0.14553044736385345,
|
|
"learning_rate": 0.0023786213454822496,
|
|
"loss": 3.0426799774169924,
|
|
"num_input_tokens_seen": 5473566720,
|
|
"step": 10440,
|
|
"train_runtime": 47421.0774,
|
|
"train_tokens_per_second": 115424.765
|
|
},
|
|
{
|
|
"epoch": 0.5654914905706323,
|
|
"grad_norm": 0.1505778431892395,
|
|
"learning_rate": 0.002374744477884974,
|
|
"loss": 3.0493221282958984,
|
|
"num_input_tokens_seen": 5478809600,
|
|
"step": 10450,
|
|
"train_runtime": 47466.2416,
|
|
"train_tokens_per_second": 115425.393
|
|
},
|
|
{
|
|
"epoch": 0.5660326307529966,
|
|
"grad_norm": 0.14359861612319946,
|
|
"learning_rate": 0.002370868755925543,
|
|
"loss": 3.048199272155762,
|
|
"num_input_tokens_seen": 5484052480,
|
|
"step": 10460,
|
|
"train_runtime": 47511.3938,
|
|
"train_tokens_per_second": 115426.049
|
|
},
|
|
{
|
|
"epoch": 0.5665737709353608,
|
|
"grad_norm": 0.1411399245262146,
|
|
"learning_rate": 0.0023669941914363597,
|
|
"loss": 3.0590206146240235,
|
|
"num_input_tokens_seen": 5489295360,
|
|
"step": 10470,
|
|
"train_runtime": 47556.5734,
|
|
"train_tokens_per_second": 115426.638
|
|
},
|
|
{
|
|
"epoch": 0.567114911117725,
|
|
"grad_norm": 0.14005140960216522,
|
|
"learning_rate": 0.0023631207962462905,
|
|
"loss": 3.052465057373047,
|
|
"num_input_tokens_seen": 5494538240,
|
|
"step": 10480,
|
|
"train_runtime": 47601.7335,
|
|
"train_tokens_per_second": 115427.272
|
|
},
|
|
{
|
|
"epoch": 0.5676560513000893,
|
|
"grad_norm": 0.15281735360622406,
|
|
"learning_rate": 0.0023592485821806314,
|
|
"loss": 3.0543212890625,
|
|
"num_input_tokens_seen": 5499781120,
|
|
"step": 10490,
|
|
"train_runtime": 47646.8804,
|
|
"train_tokens_per_second": 115427.937
|
|
},
|
|
{
|
|
"epoch": 0.5681971914824535,
|
|
"grad_norm": 0.15110628306865692,
|
|
"learning_rate": 0.0023553775610610744,
|
|
"loss": 3.0445037841796876,
|
|
"num_input_tokens_seen": 5505024000,
|
|
"step": 10500,
|
|
"train_runtime": 47692.0533,
|
|
"train_tokens_per_second": 115428.538
|
|
},
|
|
{
|
|
"epoch": 0.5681971914824535,
|
|
"eval_loss": 3.0027830600738525,
|
|
"eval_runtime": 1.9832,
|
|
"eval_samples_per_second": 252.115,
|
|
"eval_steps_per_second": 4.034,
|
|
"num_input_tokens_seen": 5505024000,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 0.5687383316648178,
|
|
"grad_norm": 0.14011938869953156,
|
|
"learning_rate": 0.0023515077447056705,
|
|
"loss": 3.0531822204589845,
|
|
"num_input_tokens_seen": 5510266880,
|
|
"step": 10510,
|
|
"train_runtime": 47739.2068,
|
|
"train_tokens_per_second": 115424.349
|
|
},
|
|
{
|
|
"epoch": 0.569279471847182,
|
|
"grad_norm": 0.14612512290477753,
|
|
"learning_rate": 0.002347639144928789,
|
|
"loss": 3.051645278930664,
|
|
"num_input_tokens_seen": 5515509760,
|
|
"step": 10520,
|
|
"train_runtime": 47784.361,
|
|
"train_tokens_per_second": 115424.998
|
|
},
|
|
{
|
|
"epoch": 0.5698206120295463,
|
|
"grad_norm": 0.15726540982723236,
|
|
"learning_rate": 0.0023437717735410872,
|
|
"loss": 3.0477500915527345,
|
|
"num_input_tokens_seen": 5520752640,
|
|
"step": 10530,
|
|
"train_runtime": 47829.5028,
|
|
"train_tokens_per_second": 115425.675
|
|
},
|
|
{
|
|
"epoch": 0.5703617522119105,
|
|
"grad_norm": 0.14600247144699097,
|
|
"learning_rate": 0.002339905642349474,
|
|
"loss": 3.0487768173217775,
|
|
"num_input_tokens_seen": 5525995520,
|
|
"step": 10540,
|
|
"train_runtime": 47874.6792,
|
|
"train_tokens_per_second": 115426.267
|
|
},
|
|
{
|
|
"epoch": 0.5709028923942747,
|
|
"grad_norm": 0.1526118814945221,
|
|
"learning_rate": 0.0023360407631570685,
|
|
"loss": 3.0494321823120116,
|
|
"num_input_tokens_seen": 5531238400,
|
|
"step": 10550,
|
|
"train_runtime": 47919.8527,
|
|
"train_tokens_per_second": 115426.866
|
|
},
|
|
{
|
|
"epoch": 0.571444032576639,
|
|
"grad_norm": 0.14721056818962097,
|
|
"learning_rate": 0.0023321771477631693,
|
|
"loss": 3.046247100830078,
|
|
"num_input_tokens_seen": 5536481280,
|
|
"step": 10560,
|
|
"train_runtime": 47965.0143,
|
|
"train_tokens_per_second": 115427.492
|
|
},
|
|
{
|
|
"epoch": 0.5719851727590032,
|
|
"grad_norm": 0.15114431083202362,
|
|
"learning_rate": 0.0023283148079632156,
|
|
"loss": 3.0407901763916017,
|
|
"num_input_tokens_seen": 5541724160,
|
|
"step": 10570,
|
|
"train_runtime": 48010.1749,
|
|
"train_tokens_per_second": 115428.119
|
|
},
|
|
{
|
|
"epoch": 0.5725263129413675,
|
|
"grad_norm": 0.14631570875644684,
|
|
"learning_rate": 0.0023244537555487544,
|
|
"loss": 3.0476711273193358,
|
|
"num_input_tokens_seen": 5546967040,
|
|
"step": 10580,
|
|
"train_runtime": 48055.3433,
|
|
"train_tokens_per_second": 115428.726
|
|
},
|
|
{
|
|
"epoch": 0.5730674531237318,
|
|
"grad_norm": 0.14861121773719788,
|
|
"learning_rate": 0.0023205940023074013,
|
|
"loss": 3.049782562255859,
|
|
"num_input_tokens_seen": 5552209920,
|
|
"step": 10590,
|
|
"train_runtime": 48100.5075,
|
|
"train_tokens_per_second": 115429.342
|
|
},
|
|
{
|
|
"epoch": 0.5736085933060959,
|
|
"grad_norm": 0.13904227316379547,
|
|
"learning_rate": 0.002316735560022804,
|
|
"loss": 3.055135726928711,
|
|
"num_input_tokens_seen": 5557452800,
|
|
"step": 10600,
|
|
"train_runtime": 48145.6693,
|
|
"train_tokens_per_second": 115429.962
|
|
},
|
|
{
|
|
"epoch": 0.5741497334884602,
|
|
"grad_norm": 0.13765645027160645,
|
|
"learning_rate": 0.00231287844047461,
|
|
"loss": 3.047852325439453,
|
|
"num_input_tokens_seen": 5562695680,
|
|
"step": 10610,
|
|
"train_runtime": 48190.8309,
|
|
"train_tokens_per_second": 115430.582
|
|
},
|
|
{
|
|
"epoch": 0.5746908736708244,
|
|
"grad_norm": 0.14210833609104156,
|
|
"learning_rate": 0.0023090226554384288,
|
|
"loss": 3.0472042083740236,
|
|
"num_input_tokens_seen": 5567938560,
|
|
"step": 10620,
|
|
"train_runtime": 48235.9849,
|
|
"train_tokens_per_second": 115431.22
|
|
},
|
|
{
|
|
"epoch": 0.5752320138531887,
|
|
"grad_norm": 0.149732306599617,
|
|
"learning_rate": 0.0023051682166857937,
|
|
"loss": 3.0454326629638673,
|
|
"num_input_tokens_seen": 5573181440,
|
|
"step": 10630,
|
|
"train_runtime": 48281.1275,
|
|
"train_tokens_per_second": 115431.883
|
|
},
|
|
{
|
|
"epoch": 0.575773154035553,
|
|
"grad_norm": 0.1392926275730133,
|
|
"learning_rate": 0.002301315135984128,
|
|
"loss": 3.0390705108642577,
|
|
"num_input_tokens_seen": 5578424320,
|
|
"step": 10640,
|
|
"train_runtime": 48326.2477,
|
|
"train_tokens_per_second": 115432.598
|
|
},
|
|
{
|
|
"epoch": 0.5763142942179171,
|
|
"grad_norm": 0.13122397661209106,
|
|
"learning_rate": 0.0022974634250967113,
|
|
"loss": 3.036616897583008,
|
|
"num_input_tokens_seen": 5583667200,
|
|
"step": 10650,
|
|
"train_runtime": 48371.3879,
|
|
"train_tokens_per_second": 115433.264
|
|
},
|
|
{
|
|
"epoch": 0.5768554344002814,
|
|
"grad_norm": 0.14650028944015503,
|
|
"learning_rate": 0.0022936130957826395,
|
|
"loss": 3.04638786315918,
|
|
"num_input_tokens_seen": 5588910080,
|
|
"step": 10660,
|
|
"train_runtime": 48416.5301,
|
|
"train_tokens_per_second": 115433.924
|
|
},
|
|
{
|
|
"epoch": 0.5773965745826456,
|
|
"grad_norm": 0.13940243422985077,
|
|
"learning_rate": 0.002289764159796791,
|
|
"loss": 3.049785614013672,
|
|
"num_input_tokens_seen": 5594152960,
|
|
"step": 10670,
|
|
"train_runtime": 48461.6807,
|
|
"train_tokens_per_second": 115434.564
|
|
},
|
|
{
|
|
"epoch": 0.5779377147650099,
|
|
"grad_norm": 0.13686427474021912,
|
|
"learning_rate": 0.0022859166288897895,
|
|
"loss": 3.0434268951416015,
|
|
"num_input_tokens_seen": 5599395840,
|
|
"step": 10680,
|
|
"train_runtime": 48506.8227,
|
|
"train_tokens_per_second": 115435.222
|
|
},
|
|
{
|
|
"epoch": 0.5784788549473742,
|
|
"grad_norm": 0.14600345492362976,
|
|
"learning_rate": 0.0022820705148079703,
|
|
"loss": 3.052047538757324,
|
|
"num_input_tokens_seen": 5604638720,
|
|
"step": 10690,
|
|
"train_runtime": 48555.616,
|
|
"train_tokens_per_second": 115427.198
|
|
},
|
|
{
|
|
"epoch": 0.5790199951297383,
|
|
"grad_norm": 0.14253467321395874,
|
|
"learning_rate": 0.0022782258292933432,
|
|
"loss": 3.0317237854003904,
|
|
"num_input_tokens_seen": 5609881600,
|
|
"step": 10700,
|
|
"train_runtime": 48600.7657,
|
|
"train_tokens_per_second": 115427.844
|
|
},
|
|
{
|
|
"epoch": 0.5795611353121026,
|
|
"grad_norm": 0.13422608375549316,
|
|
"learning_rate": 0.0022743825840835542,
|
|
"loss": 3.038676071166992,
|
|
"num_input_tokens_seen": 5615124480,
|
|
"step": 10710,
|
|
"train_runtime": 48645.891,
|
|
"train_tokens_per_second": 115428.546
|
|
},
|
|
{
|
|
"epoch": 0.5801022754944668,
|
|
"grad_norm": 0.14621081948280334,
|
|
"learning_rate": 0.0022705407909118574,
|
|
"loss": 3.0488845825195314,
|
|
"num_input_tokens_seen": 5620367360,
|
|
"step": 10720,
|
|
"train_runtime": 48691.0214,
|
|
"train_tokens_per_second": 115429.235
|
|
},
|
|
{
|
|
"epoch": 0.5806434156768311,
|
|
"grad_norm": 0.14328445494174957,
|
|
"learning_rate": 0.002266700461507069,
|
|
"loss": 3.039694595336914,
|
|
"num_input_tokens_seen": 5625610240,
|
|
"step": 10730,
|
|
"train_runtime": 48736.1623,
|
|
"train_tokens_per_second": 115429.898
|
|
},
|
|
{
|
|
"epoch": 0.5811845558591954,
|
|
"grad_norm": 0.1407247930765152,
|
|
"learning_rate": 0.0022628616075935377,
|
|
"loss": 3.0443794250488283,
|
|
"num_input_tokens_seen": 5630853120,
|
|
"step": 10740,
|
|
"train_runtime": 48781.2843,
|
|
"train_tokens_per_second": 115430.604
|
|
},
|
|
{
|
|
"epoch": 0.5817256960415595,
|
|
"grad_norm": 0.15159763395786285,
|
|
"learning_rate": 0.0022590242408911066,
|
|
"loss": 3.0392004013061524,
|
|
"num_input_tokens_seen": 5636096000,
|
|
"step": 10750,
|
|
"train_runtime": 48826.4182,
|
|
"train_tokens_per_second": 115431.281
|
|
},
|
|
{
|
|
"epoch": 0.5822668362239238,
|
|
"grad_norm": 0.14982502162456512,
|
|
"learning_rate": 0.0022551883731150822,
|
|
"loss": 3.041204833984375,
|
|
"num_input_tokens_seen": 5641338880,
|
|
"step": 10760,
|
|
"train_runtime": 48871.5523,
|
|
"train_tokens_per_second": 115431.956
|
|
},
|
|
{
|
|
"epoch": 0.582807976406288,
|
|
"grad_norm": 0.14223578572273254,
|
|
"learning_rate": 0.0022513540159761927,
|
|
"loss": 3.058414840698242,
|
|
"num_input_tokens_seen": 5646581760,
|
|
"step": 10770,
|
|
"train_runtime": 48916.6973,
|
|
"train_tokens_per_second": 115432.604
|
|
},
|
|
{
|
|
"epoch": 0.5833491165886523,
|
|
"grad_norm": 0.14026111364364624,
|
|
"learning_rate": 0.0022475211811805508,
|
|
"loss": 3.040976715087891,
|
|
"num_input_tokens_seen": 5651824640,
|
|
"step": 10780,
|
|
"train_runtime": 48961.8335,
|
|
"train_tokens_per_second": 115433.272
|
|
},
|
|
{
|
|
"epoch": 0.5838902567710166,
|
|
"grad_norm": 0.1421726644039154,
|
|
"learning_rate": 0.0022436898804296273,
|
|
"loss": 3.0329113006591797,
|
|
"num_input_tokens_seen": 5657067520,
|
|
"step": 10790,
|
|
"train_runtime": 49006.9777,
|
|
"train_tokens_per_second": 115433.92
|
|
},
|
|
{
|
|
"epoch": 0.5844313969533808,
|
|
"grad_norm": 0.14624913036823273,
|
|
"learning_rate": 0.0022398601254202074,
|
|
"loss": 3.0412059783935548,
|
|
"num_input_tokens_seen": 5662310400,
|
|
"step": 10800,
|
|
"train_runtime": 49052.1103,
|
|
"train_tokens_per_second": 115434.593
|
|
},
|
|
{
|
|
"epoch": 0.584972537135745,
|
|
"grad_norm": 0.14961951971054077,
|
|
"learning_rate": 0.0022360319278443555,
|
|
"loss": 3.039783477783203,
|
|
"num_input_tokens_seen": 5667553280,
|
|
"step": 10810,
|
|
"train_runtime": 49097.2475,
|
|
"train_tokens_per_second": 115435.255
|
|
},
|
|
{
|
|
"epoch": 0.5855136773181092,
|
|
"grad_norm": 0.13819707930088043,
|
|
"learning_rate": 0.0022322052993893828,
|
|
"loss": 3.0379779815673826,
|
|
"num_input_tokens_seen": 5672796160,
|
|
"step": 10820,
|
|
"train_runtime": 49142.4002,
|
|
"train_tokens_per_second": 115435.879
|
|
},
|
|
{
|
|
"epoch": 0.5860548175004735,
|
|
"grad_norm": 0.1506834328174591,
|
|
"learning_rate": 0.002228380251737811,
|
|
"loss": 3.038629341125488,
|
|
"num_input_tokens_seen": 5678039040,
|
|
"step": 10830,
|
|
"train_runtime": 49187.5417,
|
|
"train_tokens_per_second": 115436.528
|
|
},
|
|
{
|
|
"epoch": 0.5865959576828378,
|
|
"grad_norm": 0.1385858803987503,
|
|
"learning_rate": 0.0022245567965673346,
|
|
"loss": 3.0388534545898436,
|
|
"num_input_tokens_seen": 5683281920,
|
|
"step": 10840,
|
|
"train_runtime": 49232.6855,
|
|
"train_tokens_per_second": 115437.171
|
|
},
|
|
{
|
|
"epoch": 0.587137097865202,
|
|
"grad_norm": 0.14153380692005157,
|
|
"learning_rate": 0.002220734945550785,
|
|
"loss": 3.040701675415039,
|
|
"num_input_tokens_seen": 5688524800,
|
|
"step": 10850,
|
|
"train_runtime": 49277.8278,
|
|
"train_tokens_per_second": 115437.816
|
|
},
|
|
{
|
|
"epoch": 0.5876782380475662,
|
|
"grad_norm": 0.1447627693414688,
|
|
"learning_rate": 0.002216914710356098,
|
|
"loss": 3.0347267150878907,
|
|
"num_input_tokens_seen": 5693767680,
|
|
"step": 10860,
|
|
"train_runtime": 49322.9704,
|
|
"train_tokens_per_second": 115438.459
|
|
},
|
|
{
|
|
"epoch": 0.5882193782299304,
|
|
"grad_norm": 0.15700754523277283,
|
|
"learning_rate": 0.0022130961026462772,
|
|
"loss": 3.0408071517944335,
|
|
"num_input_tokens_seen": 5699010560,
|
|
"step": 10870,
|
|
"train_runtime": 49368.1084,
|
|
"train_tokens_per_second": 115439.111
|
|
},
|
|
{
|
|
"epoch": 0.5887605184122947,
|
|
"grad_norm": 0.1373981535434723,
|
|
"learning_rate": 0.002209279134079355,
|
|
"loss": 3.0413372039794924,
|
|
"num_input_tokens_seen": 5704253440,
|
|
"step": 10880,
|
|
"train_runtime": 49413.2385,
|
|
"train_tokens_per_second": 115439.781
|
|
},
|
|
{
|
|
"epoch": 0.589301658594659,
|
|
"grad_norm": 0.13982577621936798,
|
|
"learning_rate": 0.0022054638163083607,
|
|
"loss": 3.0364784240722655,
|
|
"num_input_tokens_seen": 5709496320,
|
|
"step": 10890,
|
|
"train_runtime": 49458.3538,
|
|
"train_tokens_per_second": 115440.484
|
|
},
|
|
{
|
|
"epoch": 0.5898427987770232,
|
|
"grad_norm": 0.1471603363752365,
|
|
"learning_rate": 0.0022016501609812846,
|
|
"loss": 3.02860107421875,
|
|
"num_input_tokens_seen": 5714739200,
|
|
"step": 10900,
|
|
"train_runtime": 49503.4854,
|
|
"train_tokens_per_second": 115441.148
|
|
},
|
|
{
|
|
"epoch": 0.5903839389593875,
|
|
"grad_norm": 0.140976682305336,
|
|
"learning_rate": 0.002197838179741041,
|
|
"loss": 3.048592948913574,
|
|
"num_input_tokens_seen": 5719982080,
|
|
"step": 10910,
|
|
"train_runtime": 49548.6066,
|
|
"train_tokens_per_second": 115441.835
|
|
},
|
|
{
|
|
"epoch": 0.5909250791417516,
|
|
"grad_norm": 0.1391858160495758,
|
|
"learning_rate": 0.0021940278842254336,
|
|
"loss": 3.0438766479492188,
|
|
"num_input_tokens_seen": 5725224960,
|
|
"step": 10920,
|
|
"train_runtime": 49593.7284,
|
|
"train_tokens_per_second": 115442.519
|
|
},
|
|
{
|
|
"epoch": 0.5914662193241159,
|
|
"grad_norm": 0.13984552025794983,
|
|
"learning_rate": 0.0021902192860671172,
|
|
"loss": 3.032778739929199,
|
|
"num_input_tokens_seen": 5730467840,
|
|
"step": 10930,
|
|
"train_runtime": 49638.8587,
|
|
"train_tokens_per_second": 115443.183
|
|
},
|
|
{
|
|
"epoch": 0.5920073595064802,
|
|
"grad_norm": 0.15091544389724731,
|
|
"learning_rate": 0.0021864123968935696,
|
|
"loss": 3.0441143035888674,
|
|
"num_input_tokens_seen": 5735710720,
|
|
"step": 10940,
|
|
"train_runtime": 49683.9877,
|
|
"train_tokens_per_second": 115443.848
|
|
},
|
|
{
|
|
"epoch": 0.5925484996888444,
|
|
"grad_norm": 0.1554540991783142,
|
|
"learning_rate": 0.0021826072283270465,
|
|
"loss": 3.028913116455078,
|
|
"num_input_tokens_seen": 5740953600,
|
|
"step": 10950,
|
|
"train_runtime": 49729.1048,
|
|
"train_tokens_per_second": 115444.539
|
|
},
|
|
{
|
|
"epoch": 0.5930896398712087,
|
|
"grad_norm": 0.14302626252174377,
|
|
"learning_rate": 0.0021788037919845526,
|
|
"loss": 3.0385337829589845,
|
|
"num_input_tokens_seen": 5746196480,
|
|
"step": 10960,
|
|
"train_runtime": 49774.2324,
|
|
"train_tokens_per_second": 115445.205
|
|
},
|
|
{
|
|
"epoch": 0.5936307800535728,
|
|
"grad_norm": 0.14910683035850525,
|
|
"learning_rate": 0.0021750020994778054,
|
|
"loss": 3.0436506271362305,
|
|
"num_input_tokens_seen": 5751439360,
|
|
"step": 10970,
|
|
"train_runtime": 49819.3456,
|
|
"train_tokens_per_second": 115445.903
|
|
},
|
|
{
|
|
"epoch": 0.5941719202359371,
|
|
"grad_norm": 0.15283723175525665,
|
|
"learning_rate": 0.002171202162413195,
|
|
"loss": 3.047803497314453,
|
|
"num_input_tokens_seen": 5756682240,
|
|
"step": 10980,
|
|
"train_runtime": 49864.471,
|
|
"train_tokens_per_second": 115446.572
|
|
},
|
|
{
|
|
"epoch": 0.5947130604183014,
|
|
"grad_norm": 0.14589117467403412,
|
|
"learning_rate": 0.002167403992391757,
|
|
"loss": 3.0425289154052733,
|
|
"num_input_tokens_seen": 5761925120,
|
|
"step": 10990,
|
|
"train_runtime": 49909.6019,
|
|
"train_tokens_per_second": 115447.227
|
|
},
|
|
{
|
|
"epoch": 0.5952542006006656,
|
|
"grad_norm": 0.1394151747226715,
|
|
"learning_rate": 0.0021636076010091276,
|
|
"loss": 3.0472259521484375,
|
|
"num_input_tokens_seen": 5767168000,
|
|
"step": 11000,
|
|
"train_runtime": 49954.7308,
|
|
"train_tokens_per_second": 115447.885
|
|
},
|
|
{
|
|
"epoch": 0.5952542006006656,
|
|
"eval_loss": 2.9926395416259766,
|
|
"eval_runtime": 1.9832,
|
|
"eval_samples_per_second": 252.121,
|
|
"eval_steps_per_second": 4.034,
|
|
"num_input_tokens_seen": 5767168000,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 0.5957953407830299,
|
|
"grad_norm": 0.1461019068956375,
|
|
"learning_rate": 0.002159812999855516,
|
|
"loss": 3.034767913818359,
|
|
"num_input_tokens_seen": 5772410880,
|
|
"step": 11010,
|
|
"train_runtime": 50004.2941,
|
|
"train_tokens_per_second": 115438.304
|
|
},
|
|
{
|
|
"epoch": 0.596336480965394,
|
|
"grad_norm": 0.13988643884658813,
|
|
"learning_rate": 0.002156020200515666,
|
|
"loss": 3.0288986206054687,
|
|
"num_input_tokens_seen": 5777653760,
|
|
"step": 11020,
|
|
"train_runtime": 50049.449,
|
|
"train_tokens_per_second": 115438.908
|
|
},
|
|
{
|
|
"epoch": 0.5968776211477583,
|
|
"grad_norm": 0.13354118168354034,
|
|
"learning_rate": 0.002152229214568817,
|
|
"loss": 3.0315704345703125,
|
|
"num_input_tokens_seen": 5782896640,
|
|
"step": 11030,
|
|
"train_runtime": 50094.5853,
|
|
"train_tokens_per_second": 115439.555
|
|
},
|
|
{
|
|
"epoch": 0.5974187613301226,
|
|
"grad_norm": 0.1455395370721817,
|
|
"learning_rate": 0.0021484400535886766,
|
|
"loss": 3.0255619049072267,
|
|
"num_input_tokens_seen": 5788139520,
|
|
"step": 11040,
|
|
"train_runtime": 50139.709,
|
|
"train_tokens_per_second": 115440.23
|
|
},
|
|
{
|
|
"epoch": 0.5979599015124868,
|
|
"grad_norm": 0.1527973711490631,
|
|
"learning_rate": 0.002144652729143379,
|
|
"loss": 3.0323816299438477,
|
|
"num_input_tokens_seen": 5793382400,
|
|
"step": 11050,
|
|
"train_runtime": 50184.8687,
|
|
"train_tokens_per_second": 115440.82
|
|
},
|
|
{
|
|
"epoch": 0.5985010416948511,
|
|
"grad_norm": 0.14457052946090698,
|
|
"learning_rate": 0.0021408672527954502,
|
|
"loss": 3.0245555877685546,
|
|
"num_input_tokens_seen": 5798625280,
|
|
"step": 11060,
|
|
"train_runtime": 50230.0184,
|
|
"train_tokens_per_second": 115441.432
|
|
},
|
|
{
|
|
"epoch": 0.5990421818772153,
|
|
"grad_norm": 0.1389371007680893,
|
|
"learning_rate": 0.0021370836361017764,
|
|
"loss": 3.036094856262207,
|
|
"num_input_tokens_seen": 5803868160,
|
|
"step": 11070,
|
|
"train_runtime": 50278.8251,
|
|
"train_tokens_per_second": 115433.647
|
|
},
|
|
{
|
|
"epoch": 0.5995833220595795,
|
|
"grad_norm": 0.15234215557575226,
|
|
"learning_rate": 0.002133301890613565,
|
|
"loss": 3.0295217514038084,
|
|
"num_input_tokens_seen": 5809111040,
|
|
"step": 11080,
|
|
"train_runtime": 50323.981,
|
|
"train_tokens_per_second": 115434.251
|
|
},
|
|
{
|
|
"epoch": 0.6001244622419438,
|
|
"grad_norm": 0.1413789838552475,
|
|
"learning_rate": 0.002129522027876311,
|
|
"loss": 3.021541404724121,
|
|
"num_input_tokens_seen": 5814353920,
|
|
"step": 11090,
|
|
"train_runtime": 50369.1453,
|
|
"train_tokens_per_second": 115434.834
|
|
},
|
|
{
|
|
"epoch": 0.600665602424308,
|
|
"grad_norm": 0.15141098201274872,
|
|
"learning_rate": 0.0021257440594297607,
|
|
"loss": 3.026825714111328,
|
|
"num_input_tokens_seen": 5819596800,
|
|
"step": 11100,
|
|
"train_runtime": 50414.335,
|
|
"train_tokens_per_second": 115435.358
|
|
},
|
|
{
|
|
"epoch": 0.6012067426066723,
|
|
"grad_norm": 0.1444009244441986,
|
|
"learning_rate": 0.00212196799680788,
|
|
"loss": 3.033803939819336,
|
|
"num_input_tokens_seen": 5824839680,
|
|
"step": 11110,
|
|
"train_runtime": 50459.5415,
|
|
"train_tokens_per_second": 115435.842
|
|
},
|
|
{
|
|
"epoch": 0.6017478827890365,
|
|
"grad_norm": 0.14855210483074188,
|
|
"learning_rate": 0.002118193851538812,
|
|
"loss": 3.0400081634521485,
|
|
"num_input_tokens_seen": 5830082560,
|
|
"step": 11120,
|
|
"train_runtime": 50504.7336,
|
|
"train_tokens_per_second": 115436.359
|
|
},
|
|
{
|
|
"epoch": 0.6022890229714007,
|
|
"grad_norm": 0.13804234564304352,
|
|
"learning_rate": 0.002114421635144851,
|
|
"loss": 3.0301578521728514,
|
|
"num_input_tokens_seen": 5835325440,
|
|
"step": 11130,
|
|
"train_runtime": 50549.9225,
|
|
"train_tokens_per_second": 115436.882
|
|
},
|
|
{
|
|
"epoch": 0.602830163153765,
|
|
"grad_norm": 0.14875908195972443,
|
|
"learning_rate": 0.0021106513591423967,
|
|
"loss": 3.032312774658203,
|
|
"num_input_tokens_seen": 5840568320,
|
|
"step": 11140,
|
|
"train_runtime": 50595.1041,
|
|
"train_tokens_per_second": 115437.421
|
|
},
|
|
{
|
|
"epoch": 0.6033713033361292,
|
|
"grad_norm": 0.14444148540496826,
|
|
"learning_rate": 0.0021068830350419315,
|
|
"loss": 3.038595199584961,
|
|
"num_input_tokens_seen": 5845811200,
|
|
"step": 11150,
|
|
"train_runtime": 50640.2767,
|
|
"train_tokens_per_second": 115437.979
|
|
},
|
|
{
|
|
"epoch": 0.6039124435184935,
|
|
"grad_norm": 0.14319837093353271,
|
|
"learning_rate": 0.002103116674347975,
|
|
"loss": 3.0365222930908202,
|
|
"num_input_tokens_seen": 5851054080,
|
|
"step": 11160,
|
|
"train_runtime": 50685.4574,
|
|
"train_tokens_per_second": 115438.518
|
|
},
|
|
{
|
|
"epoch": 0.6044535837008577,
|
|
"grad_norm": 0.1528901755809784,
|
|
"learning_rate": 0.002099352288559052,
|
|
"loss": 3.0367916107177733,
|
|
"num_input_tokens_seen": 5856296960,
|
|
"step": 11170,
|
|
"train_runtime": 50730.6306,
|
|
"train_tokens_per_second": 115439.073
|
|
},
|
|
{
|
|
"epoch": 0.604994723883222,
|
|
"grad_norm": 0.1397976279258728,
|
|
"learning_rate": 0.002095589889167659,
|
|
"loss": 3.026215744018555,
|
|
"num_input_tokens_seen": 5861539840,
|
|
"step": 11180,
|
|
"train_runtime": 50775.8006,
|
|
"train_tokens_per_second": 115439.634
|
|
},
|
|
{
|
|
"epoch": 0.6055358640655862,
|
|
"grad_norm": 0.1472213864326477,
|
|
"learning_rate": 0.0020918294876602294,
|
|
"loss": 3.0274309158325194,
|
|
"num_input_tokens_seen": 5866782720,
|
|
"step": 11190,
|
|
"train_runtime": 50820.9715,
|
|
"train_tokens_per_second": 115440.192
|
|
},
|
|
{
|
|
"epoch": 0.6060770042479504,
|
|
"grad_norm": 0.14902907609939575,
|
|
"learning_rate": 0.0020880710955170955,
|
|
"loss": 3.0351707458496096,
|
|
"num_input_tokens_seen": 5872025600,
|
|
"step": 11200,
|
|
"train_runtime": 50866.1392,
|
|
"train_tokens_per_second": 115440.757
|
|
},
|
|
{
|
|
"epoch": 0.6066181444303147,
|
|
"grad_norm": 0.1408633142709732,
|
|
"learning_rate": 0.0020843147242124555,
|
|
"loss": 3.029071807861328,
|
|
"num_input_tokens_seen": 5877268480,
|
|
"step": 11210,
|
|
"train_runtime": 50911.3053,
|
|
"train_tokens_per_second": 115441.324
|
|
},
|
|
{
|
|
"epoch": 0.6071592846126789,
|
|
"grad_norm": 0.14094239473342896,
|
|
"learning_rate": 0.0020805603852143383,
|
|
"loss": 3.032915496826172,
|
|
"num_input_tokens_seen": 5882511360,
|
|
"step": 11220,
|
|
"train_runtime": 50956.4731,
|
|
"train_tokens_per_second": 115441.886
|
|
},
|
|
{
|
|
"epoch": 0.6077004247950432,
|
|
"grad_norm": 0.14471176266670227,
|
|
"learning_rate": 0.0020768080899845687,
|
|
"loss": 3.0328413009643556,
|
|
"num_input_tokens_seen": 5887754240,
|
|
"step": 11230,
|
|
"train_runtime": 51001.6333,
|
|
"train_tokens_per_second": 115442.465
|
|
},
|
|
{
|
|
"epoch": 0.6082415649774074,
|
|
"grad_norm": 0.1309700757265091,
|
|
"learning_rate": 0.00207305784997873,
|
|
"loss": 3.0344516754150392,
|
|
"num_input_tokens_seen": 5892997120,
|
|
"step": 11240,
|
|
"train_runtime": 51046.7842,
|
|
"train_tokens_per_second": 115443.063
|
|
},
|
|
{
|
|
"epoch": 0.6087827051597716,
|
|
"grad_norm": 0.14067348837852478,
|
|
"learning_rate": 0.0020693096766461333,
|
|
"loss": 3.0316375732421874,
|
|
"num_input_tokens_seen": 5898240000,
|
|
"step": 11250,
|
|
"train_runtime": 51091.9272,
|
|
"train_tokens_per_second": 115443.678
|
|
},
|
|
{
|
|
"epoch": 0.6093238453421359,
|
|
"grad_norm": 0.14874261617660522,
|
|
"learning_rate": 0.00206556358142978,
|
|
"loss": 3.0260826110839845,
|
|
"num_input_tokens_seen": 5903482880,
|
|
"step": 11260,
|
|
"train_runtime": 51137.0716,
|
|
"train_tokens_per_second": 115444.289
|
|
},
|
|
{
|
|
"epoch": 0.6098649855245001,
|
|
"grad_norm": 0.1435764729976654,
|
|
"learning_rate": 0.002061819575766326,
|
|
"loss": 3.0409059524536133,
|
|
"num_input_tokens_seen": 5908725760,
|
|
"step": 11270,
|
|
"train_runtime": 51182.2082,
|
|
"train_tokens_per_second": 115444.917
|
|
},
|
|
{
|
|
"epoch": 0.6104061257068644,
|
|
"grad_norm": 0.1484087109565735,
|
|
"learning_rate": 0.002058077671086047,
|
|
"loss": 3.0283117294311523,
|
|
"num_input_tokens_seen": 5913968640,
|
|
"step": 11280,
|
|
"train_runtime": 51227.347,
|
|
"train_tokens_per_second": 115445.538
|
|
},
|
|
{
|
|
"epoch": 0.6109472658892287,
|
|
"grad_norm": 0.140775665640831,
|
|
"learning_rate": 0.002054337878812808,
|
|
"loss": 3.026752471923828,
|
|
"num_input_tokens_seen": 5919211520,
|
|
"step": 11290,
|
|
"train_runtime": 51272.4758,
|
|
"train_tokens_per_second": 115446.181
|
|
},
|
|
{
|
|
"epoch": 0.6114884060715928,
|
|
"grad_norm": 0.14487655460834503,
|
|
"learning_rate": 0.002050600210364022,
|
|
"loss": 3.0381233215332033,
|
|
"num_input_tokens_seen": 5924454400,
|
|
"step": 11300,
|
|
"train_runtime": 51317.6163,
|
|
"train_tokens_per_second": 115446.796
|
|
},
|
|
{
|
|
"epoch": 0.6120295462539571,
|
|
"grad_norm": 0.13244298100471497,
|
|
"learning_rate": 0.0020468646771506184,
|
|
"loss": 3.037242889404297,
|
|
"num_input_tokens_seen": 5929697280,
|
|
"step": 11310,
|
|
"train_runtime": 51362.7685,
|
|
"train_tokens_per_second": 115447.384
|
|
},
|
|
{
|
|
"epoch": 0.6125706864363213,
|
|
"grad_norm": 0.13805389404296875,
|
|
"learning_rate": 0.002043131290577007,
|
|
"loss": 3.034191703796387,
|
|
"num_input_tokens_seen": 5934940160,
|
|
"step": 11320,
|
|
"train_runtime": 51407.9443,
|
|
"train_tokens_per_second": 115447.919
|
|
},
|
|
{
|
|
"epoch": 0.6131118266186856,
|
|
"grad_norm": 0.13927042484283447,
|
|
"learning_rate": 0.002039400062041048,
|
|
"loss": 3.0405059814453126,
|
|
"num_input_tokens_seen": 5940183040,
|
|
"step": 11330,
|
|
"train_runtime": 51453.1187,
|
|
"train_tokens_per_second": 115448.455
|
|
},
|
|
{
|
|
"epoch": 0.6136529668010499,
|
|
"grad_norm": 0.13962484896183014,
|
|
"learning_rate": 0.0020356710029340096,
|
|
"loss": 3.0331016540527345,
|
|
"num_input_tokens_seen": 5945425920,
|
|
"step": 11340,
|
|
"train_runtime": 51498.2896,
|
|
"train_tokens_per_second": 115448.998
|
|
},
|
|
{
|
|
"epoch": 0.614194106983414,
|
|
"grad_norm": 0.2101336121559143,
|
|
"learning_rate": 0.0020319441246405357,
|
|
"loss": 3.028001594543457,
|
|
"num_input_tokens_seen": 5950668800,
|
|
"step": 11350,
|
|
"train_runtime": 51543.4451,
|
|
"train_tokens_per_second": 115449.574
|
|
},
|
|
{
|
|
"epoch": 0.6147352471657783,
|
|
"grad_norm": 0.14625418186187744,
|
|
"learning_rate": 0.0020282194385386173,
|
|
"loss": 3.0344852447509765,
|
|
"num_input_tokens_seen": 5955911680,
|
|
"step": 11360,
|
|
"train_runtime": 51588.6188,
|
|
"train_tokens_per_second": 115450.109
|
|
},
|
|
{
|
|
"epoch": 0.6152763873481425,
|
|
"grad_norm": 0.1353636085987091,
|
|
"learning_rate": 0.002024496955999548,
|
|
"loss": 3.0306270599365233,
|
|
"num_input_tokens_seen": 5961154560,
|
|
"step": 11370,
|
|
"train_runtime": 51633.7613,
|
|
"train_tokens_per_second": 115450.713
|
|
},
|
|
{
|
|
"epoch": 0.6158175275305068,
|
|
"grad_norm": 0.1368403434753418,
|
|
"learning_rate": 0.0020207766883878955,
|
|
"loss": 3.0311580657958985,
|
|
"num_input_tokens_seen": 5966397440,
|
|
"step": 11380,
|
|
"train_runtime": 51678.9232,
|
|
"train_tokens_per_second": 115451.272
|
|
},
|
|
{
|
|
"epoch": 0.6163586677128711,
|
|
"grad_norm": 0.14600516855716705,
|
|
"learning_rate": 0.0020170586470614656,
|
|
"loss": 3.0117847442626955,
|
|
"num_input_tokens_seen": 5971640320,
|
|
"step": 11390,
|
|
"train_runtime": 51724.0749,
|
|
"train_tokens_per_second": 115451.853
|
|
},
|
|
{
|
|
"epoch": 0.6168998078952352,
|
|
"grad_norm": 0.14564567804336548,
|
|
"learning_rate": 0.002013342843371269,
|
|
"loss": 3.037702941894531,
|
|
"num_input_tokens_seen": 5976883200,
|
|
"step": 11400,
|
|
"train_runtime": 51769.2278,
|
|
"train_tokens_per_second": 115452.431
|
|
},
|
|
{
|
|
"epoch": 0.6174409480775995,
|
|
"grad_norm": 0.1405801773071289,
|
|
"learning_rate": 0.0020096292886614825,
|
|
"loss": 3.0343984603881835,
|
|
"num_input_tokens_seen": 5982126080,
|
|
"step": 11410,
|
|
"train_runtime": 51814.3769,
|
|
"train_tokens_per_second": 115453.016
|
|
},
|
|
{
|
|
"epoch": 0.6179820882599637,
|
|
"grad_norm": 0.14390794932842255,
|
|
"learning_rate": 0.002005917994269417,
|
|
"loss": 3.023337173461914,
|
|
"num_input_tokens_seen": 5987368960,
|
|
"step": 11420,
|
|
"train_runtime": 51859.5123,
|
|
"train_tokens_per_second": 115453.63
|
|
},
|
|
{
|
|
"epoch": 0.618523228442328,
|
|
"grad_norm": 0.14093852043151855,
|
|
"learning_rate": 0.0020022089715254847,
|
|
"loss": 3.0304771423339845,
|
|
"num_input_tokens_seen": 5992611840,
|
|
"step": 11430,
|
|
"train_runtime": 51904.644,
|
|
"train_tokens_per_second": 115454.252
|
|
},
|
|
{
|
|
"epoch": 0.6190643686246923,
|
|
"grad_norm": 0.1447506844997406,
|
|
"learning_rate": 0.001998502231753161,
|
|
"loss": 3.030156898498535,
|
|
"num_input_tokens_seen": 5997854720,
|
|
"step": 11440,
|
|
"train_runtime": 51949.7701,
|
|
"train_tokens_per_second": 115454.885
|
|
},
|
|
{
|
|
"epoch": 0.6196055088070564,
|
|
"grad_norm": 0.1445121169090271,
|
|
"learning_rate": 0.001994797786268952,
|
|
"loss": 3.0251228332519533,
|
|
"num_input_tokens_seen": 6003097600,
|
|
"step": 11450,
|
|
"train_runtime": 51998.5254,
|
|
"train_tokens_per_second": 115447.458
|
|
},
|
|
{
|
|
"epoch": 0.6201466489894207,
|
|
"grad_norm": 0.15314067900180817,
|
|
"learning_rate": 0.0019910956463823587,
|
|
"loss": 3.022572135925293,
|
|
"num_input_tokens_seen": 6008340480,
|
|
"step": 11460,
|
|
"train_runtime": 52043.6251,
|
|
"train_tokens_per_second": 115448.155
|
|
},
|
|
{
|
|
"epoch": 0.6206877891717849,
|
|
"grad_norm": 0.1409369558095932,
|
|
"learning_rate": 0.0019873958233958444,
|
|
"loss": 3.024155044555664,
|
|
"num_input_tokens_seen": 6013583360,
|
|
"step": 11470,
|
|
"train_runtime": 52088.7237,
|
|
"train_tokens_per_second": 115448.852
|
|
},
|
|
{
|
|
"epoch": 0.6212289293541492,
|
|
"grad_norm": 0.15012867748737335,
|
|
"learning_rate": 0.0019836983286047995,
|
|
"loss": 3.0334211349487306,
|
|
"num_input_tokens_seen": 6018826240,
|
|
"step": 11480,
|
|
"train_runtime": 52133.8249,
|
|
"train_tokens_per_second": 115449.543
|
|
},
|
|
{
|
|
"epoch": 0.6217700695365135,
|
|
"grad_norm": 0.14085648953914642,
|
|
"learning_rate": 0.0019800031732975032,
|
|
"loss": 3.0264703750610353,
|
|
"num_input_tokens_seen": 6024069120,
|
|
"step": 11490,
|
|
"train_runtime": 52178.9281,
|
|
"train_tokens_per_second": 115450.228
|
|
},
|
|
{
|
|
"epoch": 0.6223112097188777,
|
|
"grad_norm": 0.14266955852508545,
|
|
"learning_rate": 0.001976310368755096,
|
|
"loss": 3.032570648193359,
|
|
"num_input_tokens_seen": 6029312000,
|
|
"step": 11500,
|
|
"train_runtime": 52224.0378,
|
|
"train_tokens_per_second": 115450.897
|
|
},
|
|
{
|
|
"epoch": 0.6223112097188777,
|
|
"eval_loss": 2.984975814819336,
|
|
"eval_runtime": 1.9819,
|
|
"eval_samples_per_second": 252.288,
|
|
"eval_steps_per_second": 4.037,
|
|
"num_input_tokens_seen": 6029312000,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 0.6228523499012419,
|
|
"grad_norm": 0.13947011530399323,
|
|
"learning_rate": 0.001972619926251541,
|
|
"loss": 3.0404077529907227,
|
|
"num_input_tokens_seen": 6034554880,
|
|
"step": 11510,
|
|
"train_runtime": 52271.1258,
|
|
"train_tokens_per_second": 115447.195
|
|
},
|
|
{
|
|
"epoch": 0.6233934900836061,
|
|
"grad_norm": 0.1446669101715088,
|
|
"learning_rate": 0.001968931857053588,
|
|
"loss": 3.021891784667969,
|
|
"num_input_tokens_seen": 6039797760,
|
|
"step": 11520,
|
|
"train_runtime": 52316.2406,
|
|
"train_tokens_per_second": 115447.855
|
|
},
|
|
{
|
|
"epoch": 0.6239346302659704,
|
|
"grad_norm": 0.13946829736232758,
|
|
"learning_rate": 0.0019652461724207425,
|
|
"loss": 3.0241966247558594,
|
|
"num_input_tokens_seen": 6045040640,
|
|
"step": 11530,
|
|
"train_runtime": 52361.3587,
|
|
"train_tokens_per_second": 115448.506
|
|
},
|
|
{
|
|
"epoch": 0.6244757704483347,
|
|
"grad_norm": 0.14458313584327698,
|
|
"learning_rate": 0.0019615628836052324,
|
|
"loss": 3.0141645431518556,
|
|
"num_input_tokens_seen": 6050283520,
|
|
"step": 11540,
|
|
"train_runtime": 52406.4606,
|
|
"train_tokens_per_second": 115449.192
|
|
},
|
|
{
|
|
"epoch": 0.6250169106306989,
|
|
"grad_norm": 0.14436115324497223,
|
|
"learning_rate": 0.0019578820018519663,
|
|
"loss": 3.0331525802612305,
|
|
"num_input_tokens_seen": 6055526400,
|
|
"step": 11550,
|
|
"train_runtime": 52451.5704,
|
|
"train_tokens_per_second": 115449.859
|
|
},
|
|
{
|
|
"epoch": 0.6255580508130631,
|
|
"grad_norm": 0.14012649655342102,
|
|
"learning_rate": 0.0019542035383985083,
|
|
"loss": 3.043803405761719,
|
|
"num_input_tokens_seen": 6060769280,
|
|
"step": 11560,
|
|
"train_runtime": 52496.6939,
|
|
"train_tokens_per_second": 115450.495
|
|
},
|
|
{
|
|
"epoch": 0.6260991909954273,
|
|
"grad_norm": 0.14854469895362854,
|
|
"learning_rate": 0.0019505275044750371,
|
|
"loss": 3.0200592041015626,
|
|
"num_input_tokens_seen": 6066012160,
|
|
"step": 11570,
|
|
"train_runtime": 52541.8152,
|
|
"train_tokens_per_second": 115451.134
|
|
},
|
|
{
|
|
"epoch": 0.6266403311777916,
|
|
"grad_norm": 0.15853960812091827,
|
|
"learning_rate": 0.0019468539113043166,
|
|
"loss": 3.020526885986328,
|
|
"num_input_tokens_seen": 6071255040,
|
|
"step": 11580,
|
|
"train_runtime": 52586.931,
|
|
"train_tokens_per_second": 115451.785
|
|
},
|
|
{
|
|
"epoch": 0.6271814713601559,
|
|
"grad_norm": 0.14197298884391785,
|
|
"learning_rate": 0.0019431827701016575,
|
|
"loss": 3.0370616912841797,
|
|
"num_input_tokens_seen": 6076497920,
|
|
"step": 11590,
|
|
"train_runtime": 52632.0665,
|
|
"train_tokens_per_second": 115452.391
|
|
},
|
|
{
|
|
"epoch": 0.6277226115425201,
|
|
"grad_norm": 0.1305466592311859,
|
|
"learning_rate": 0.0019395140920748827,
|
|
"loss": 3.023914337158203,
|
|
"num_input_tokens_seen": 6081740800,
|
|
"step": 11600,
|
|
"train_runtime": 52677.2162,
|
|
"train_tokens_per_second": 115452.965
|
|
},
|
|
{
|
|
"epoch": 0.6282637517248844,
|
|
"grad_norm": 0.1447763293981552,
|
|
"learning_rate": 0.0019358478884243008,
|
|
"loss": 3.024199676513672,
|
|
"num_input_tokens_seen": 6086983680,
|
|
"step": 11610,
|
|
"train_runtime": 52722.3572,
|
|
"train_tokens_per_second": 115453.557
|
|
},
|
|
{
|
|
"epoch": 0.6288048919072485,
|
|
"grad_norm": 0.14126408100128174,
|
|
"learning_rate": 0.0019321841703426608,
|
|
"loss": 3.022255706787109,
|
|
"num_input_tokens_seen": 6092226560,
|
|
"step": 11620,
|
|
"train_runtime": 52767.4813,
|
|
"train_tokens_per_second": 115454.185
|
|
},
|
|
{
|
|
"epoch": 0.6293460320896128,
|
|
"grad_norm": 0.1334850788116455,
|
|
"learning_rate": 0.0019285229490151263,
|
|
"loss": 3.0233287811279297,
|
|
"num_input_tokens_seen": 6097469440,
|
|
"step": 11630,
|
|
"train_runtime": 52812.6435,
|
|
"train_tokens_per_second": 115454.729
|
|
},
|
|
{
|
|
"epoch": 0.6298871722719771,
|
|
"grad_norm": 0.14635370671749115,
|
|
"learning_rate": 0.0019248642356192365,
|
|
"loss": 3.03590087890625,
|
|
"num_input_tokens_seen": 6102712320,
|
|
"step": 11640,
|
|
"train_runtime": 52857.8692,
|
|
"train_tokens_per_second": 115455.133
|
|
},
|
|
{
|
|
"epoch": 0.6304283124543413,
|
|
"grad_norm": 0.13621026277542114,
|
|
"learning_rate": 0.0019212080413248762,
|
|
"loss": 3.023410415649414,
|
|
"num_input_tokens_seen": 6107955200,
|
|
"step": 11650,
|
|
"train_runtime": 52903.1239,
|
|
"train_tokens_per_second": 115455.473
|
|
},
|
|
{
|
|
"epoch": 0.6309694526367056,
|
|
"grad_norm": 0.14006845653057098,
|
|
"learning_rate": 0.0019175543772942383,
|
|
"loss": 3.020222473144531,
|
|
"num_input_tokens_seen": 6113198080,
|
|
"step": 11660,
|
|
"train_runtime": 52948.2709,
|
|
"train_tokens_per_second": 115456.047
|
|
},
|
|
{
|
|
"epoch": 0.6315105928190697,
|
|
"grad_norm": 0.13746832311153412,
|
|
"learning_rate": 0.0019139032546817902,
|
|
"loss": 3.0225994110107424,
|
|
"num_input_tokens_seen": 6118440960,
|
|
"step": 11670,
|
|
"train_runtime": 52993.4226,
|
|
"train_tokens_per_second": 115456.611
|
|
},
|
|
{
|
|
"epoch": 0.632051733001434,
|
|
"grad_norm": 0.13812102377414703,
|
|
"learning_rate": 0.0019102546846342411,
|
|
"loss": 3.0324447631835936,
|
|
"num_input_tokens_seen": 6123683840,
|
|
"step": 11680,
|
|
"train_runtime": 53038.5588,
|
|
"train_tokens_per_second": 115457.207
|
|
},
|
|
{
|
|
"epoch": 0.6325928731837983,
|
|
"grad_norm": 0.14019303023815155,
|
|
"learning_rate": 0.0019066086782905097,
|
|
"loss": 3.022325897216797,
|
|
"num_input_tokens_seen": 6128926720,
|
|
"step": 11690,
|
|
"train_runtime": 53083.7143,
|
|
"train_tokens_per_second": 115457.76
|
|
},
|
|
{
|
|
"epoch": 0.6331340133661625,
|
|
"grad_norm": 0.1436738818883896,
|
|
"learning_rate": 0.0019029652467816838,
|
|
"loss": 3.0244091033935545,
|
|
"num_input_tokens_seen": 6134169600,
|
|
"step": 11700,
|
|
"train_runtime": 53128.868,
|
|
"train_tokens_per_second": 115458.315
|
|
},
|
|
{
|
|
"epoch": 0.6336751535485268,
|
|
"grad_norm": 0.14594176411628723,
|
|
"learning_rate": 0.0018993244012309913,
|
|
"loss": 3.025048828125,
|
|
"num_input_tokens_seen": 6139412480,
|
|
"step": 11710,
|
|
"train_runtime": 53173.9948,
|
|
"train_tokens_per_second": 115458.929
|
|
},
|
|
{
|
|
"epoch": 0.634216293730891,
|
|
"grad_norm": 0.15385597944259644,
|
|
"learning_rate": 0.0018956861527537688,
|
|
"loss": 3.0213130950927733,
|
|
"num_input_tokens_seen": 6144655360,
|
|
"step": 11720,
|
|
"train_runtime": 53219.1405,
|
|
"train_tokens_per_second": 115459.5
|
|
},
|
|
{
|
|
"epoch": 0.6347574339132552,
|
|
"grad_norm": 0.14445240795612335,
|
|
"learning_rate": 0.0018920505124574195,
|
|
"loss": 3.029845428466797,
|
|
"num_input_tokens_seen": 6149898240,
|
|
"step": 11730,
|
|
"train_runtime": 53264.2928,
|
|
"train_tokens_per_second": 115460.056
|
|
},
|
|
{
|
|
"epoch": 0.6352985740956195,
|
|
"grad_norm": 0.1384369432926178,
|
|
"learning_rate": 0.001888417491441387,
|
|
"loss": 3.0266345977783202,
|
|
"num_input_tokens_seen": 6155141120,
|
|
"step": 11740,
|
|
"train_runtime": 53309.4606,
|
|
"train_tokens_per_second": 115460.578
|
|
},
|
|
{
|
|
"epoch": 0.6358397142779837,
|
|
"grad_norm": 0.14229294657707214,
|
|
"learning_rate": 0.0018847871007971163,
|
|
"loss": 3.017131042480469,
|
|
"num_input_tokens_seen": 6160384000,
|
|
"step": 11750,
|
|
"train_runtime": 53354.6359,
|
|
"train_tokens_per_second": 115461.082
|
|
},
|
|
{
|
|
"epoch": 0.636380854460348,
|
|
"grad_norm": 0.14127928018569946,
|
|
"learning_rate": 0.0018811593516080234,
|
|
"loss": 3.021234703063965,
|
|
"num_input_tokens_seen": 6165626880,
|
|
"step": 11760,
|
|
"train_runtime": 53399.8028,
|
|
"train_tokens_per_second": 115461.604
|
|
},
|
|
{
|
|
"epoch": 0.6369219946427122,
|
|
"grad_norm": 0.13989216089248657,
|
|
"learning_rate": 0.0018775342549494606,
|
|
"loss": 3.0207067489624024,
|
|
"num_input_tokens_seen": 6170869760,
|
|
"step": 11770,
|
|
"train_runtime": 53444.9593,
|
|
"train_tokens_per_second": 115462.147
|
|
},
|
|
{
|
|
"epoch": 0.6374631348250764,
|
|
"grad_norm": 0.141075000166893,
|
|
"learning_rate": 0.0018739118218886802,
|
|
"loss": 3.017308807373047,
|
|
"num_input_tokens_seen": 6176112640,
|
|
"step": 11780,
|
|
"train_runtime": 53490.1129,
|
|
"train_tokens_per_second": 115462.696
|
|
},
|
|
{
|
|
"epoch": 0.6380042750074407,
|
|
"grad_norm": 0.1446276307106018,
|
|
"learning_rate": 0.0018702920634848035,
|
|
"loss": 3.0272090911865233,
|
|
"num_input_tokens_seen": 6181355520,
|
|
"step": 11790,
|
|
"train_runtime": 53535.2546,
|
|
"train_tokens_per_second": 115463.269
|
|
},
|
|
{
|
|
"epoch": 0.6385454151898049,
|
|
"grad_norm": 0.14022940397262573,
|
|
"learning_rate": 0.001866674990788788,
|
|
"loss": 3.0206020355224608,
|
|
"num_input_tokens_seen": 6186598400,
|
|
"step": 11800,
|
|
"train_runtime": 53580.432,
|
|
"train_tokens_per_second": 115463.765
|
|
},
|
|
{
|
|
"epoch": 0.6390865553721692,
|
|
"grad_norm": 0.1391611397266388,
|
|
"learning_rate": 0.0018630606148433892,
|
|
"loss": 3.0259307861328124,
|
|
"num_input_tokens_seen": 6191841280,
|
|
"step": 11810,
|
|
"train_runtime": 53625.6025,
|
|
"train_tokens_per_second": 115464.274
|
|
},
|
|
{
|
|
"epoch": 0.6396276955545334,
|
|
"grad_norm": 0.1375313252210617,
|
|
"learning_rate": 0.0018594489466831293,
|
|
"loss": 3.019388198852539,
|
|
"num_input_tokens_seen": 6197084160,
|
|
"step": 11820,
|
|
"train_runtime": 53670.752,
|
|
"train_tokens_per_second": 115464.828
|
|
},
|
|
{
|
|
"epoch": 0.6401688357368976,
|
|
"grad_norm": 0.13326410949230194,
|
|
"learning_rate": 0.0018558399973342677,
|
|
"loss": 3.0195072174072264,
|
|
"num_input_tokens_seen": 6202327040,
|
|
"step": 11830,
|
|
"train_runtime": 53719.4728,
|
|
"train_tokens_per_second": 115457.705
|
|
},
|
|
{
|
|
"epoch": 0.6407099759192619,
|
|
"grad_norm": 0.1425514966249466,
|
|
"learning_rate": 0.0018522337778147586,
|
|
"loss": 3.012344741821289,
|
|
"num_input_tokens_seen": 6207569920,
|
|
"step": 11840,
|
|
"train_runtime": 53764.8643,
|
|
"train_tokens_per_second": 115457.744
|
|
},
|
|
{
|
|
"epoch": 0.6412511161016261,
|
|
"grad_norm": 0.14373312890529633,
|
|
"learning_rate": 0.001848630299134224,
|
|
"loss": 3.0200828552246093,
|
|
"num_input_tokens_seen": 6212812800,
|
|
"step": 11850,
|
|
"train_runtime": 53810.7174,
|
|
"train_tokens_per_second": 115456.792
|
|
},
|
|
{
|
|
"epoch": 0.6417922562839904,
|
|
"grad_norm": 0.14393045008182526,
|
|
"learning_rate": 0.0018450295722939214,
|
|
"loss": 3.0205759048461913,
|
|
"num_input_tokens_seen": 6218055680,
|
|
"step": 11860,
|
|
"train_runtime": 53856.71,
|
|
"train_tokens_per_second": 115455.543
|
|
},
|
|
{
|
|
"epoch": 0.6423333964663546,
|
|
"grad_norm": 0.13831470906734467,
|
|
"learning_rate": 0.0018414316082867015,
|
|
"loss": 3.018105697631836,
|
|
"num_input_tokens_seen": 6223298560,
|
|
"step": 11870,
|
|
"train_runtime": 53902.8725,
|
|
"train_tokens_per_second": 115453.932
|
|
},
|
|
{
|
|
"epoch": 0.6428745366487189,
|
|
"grad_norm": 0.14368562400341034,
|
|
"learning_rate": 0.0018378364180969837,
|
|
"loss": 3.0205171585083006,
|
|
"num_input_tokens_seen": 6228541440,
|
|
"step": 11880,
|
|
"train_runtime": 53949.0344,
|
|
"train_tokens_per_second": 115452.325
|
|
},
|
|
{
|
|
"epoch": 0.6434156768310831,
|
|
"grad_norm": 0.134961798787117,
|
|
"learning_rate": 0.0018342440127007181,
|
|
"loss": 3.0208873748779297,
|
|
"num_input_tokens_seen": 6233784320,
|
|
"step": 11890,
|
|
"train_runtime": 53994.8144,
|
|
"train_tokens_per_second": 115451.537
|
|
},
|
|
{
|
|
"epoch": 0.6439568170134473,
|
|
"grad_norm": 0.139762744307518,
|
|
"learning_rate": 0.0018306544030653531,
|
|
"loss": 3.0138370513916017,
|
|
"num_input_tokens_seen": 6239027200,
|
|
"step": 11900,
|
|
"train_runtime": 54040.0794,
|
|
"train_tokens_per_second": 115451.851
|
|
},
|
|
{
|
|
"epoch": 0.6444979571958116,
|
|
"grad_norm": 0.15458019077777863,
|
|
"learning_rate": 0.0018270676001498033,
|
|
"loss": 3.025080108642578,
|
|
"num_input_tokens_seen": 6244270080,
|
|
"step": 11910,
|
|
"train_runtime": 54085.3315,
|
|
"train_tokens_per_second": 115452.192
|
|
},
|
|
{
|
|
"epoch": 0.6450390973781758,
|
|
"grad_norm": 0.13538894057273865,
|
|
"learning_rate": 0.001823483614904411,
|
|
"loss": 3.016307830810547,
|
|
"num_input_tokens_seen": 6249512960,
|
|
"step": 11920,
|
|
"train_runtime": 54130.6042,
|
|
"train_tokens_per_second": 115452.489
|
|
},
|
|
{
|
|
"epoch": 0.6455802375605401,
|
|
"grad_norm": 0.13436593115329742,
|
|
"learning_rate": 0.0018199024582709177,
|
|
"loss": 3.0229183197021485,
|
|
"num_input_tokens_seen": 6254755840,
|
|
"step": 11930,
|
|
"train_runtime": 54175.8479,
|
|
"train_tokens_per_second": 115452.846
|
|
},
|
|
{
|
|
"epoch": 0.6461213777429043,
|
|
"grad_norm": 0.1262059211730957,
|
|
"learning_rate": 0.0018163241411824327,
|
|
"loss": 3.0243408203125,
|
|
"num_input_tokens_seen": 6259998720,
|
|
"step": 11940,
|
|
"train_runtime": 54221.0877,
|
|
"train_tokens_per_second": 115453.212
|
|
},
|
|
{
|
|
"epoch": 0.6466625179252685,
|
|
"grad_norm": 0.14077694714069366,
|
|
"learning_rate": 0.0018127486745633914,
|
|
"loss": 3.009103775024414,
|
|
"num_input_tokens_seen": 6265241600,
|
|
"step": 11950,
|
|
"train_runtime": 54266.3714,
|
|
"train_tokens_per_second": 115453.483
|
|
},
|
|
{
|
|
"epoch": 0.6472036581076328,
|
|
"grad_norm": 0.14338544011116028,
|
|
"learning_rate": 0.001809176069329529,
|
|
"loss": 3.019987106323242,
|
|
"num_input_tokens_seen": 6270484480,
|
|
"step": 11960,
|
|
"train_runtime": 54311.6327,
|
|
"train_tokens_per_second": 115453.802
|
|
},
|
|
{
|
|
"epoch": 0.647744798289997,
|
|
"grad_norm": 0.1309393346309662,
|
|
"learning_rate": 0.001805606336387845,
|
|
"loss": 3.0178783416748045,
|
|
"num_input_tokens_seen": 6275727360,
|
|
"step": 11970,
|
|
"train_runtime": 54356.8873,
|
|
"train_tokens_per_second": 115454.134
|
|
},
|
|
{
|
|
"epoch": 0.6482859384723613,
|
|
"grad_norm": 0.1303347647190094,
|
|
"learning_rate": 0.0018020394866365714,
|
|
"loss": 3.0253570556640623,
|
|
"num_input_tokens_seen": 6280970240,
|
|
"step": 11980,
|
|
"train_runtime": 54402.1335,
|
|
"train_tokens_per_second": 115454.484
|
|
},
|
|
{
|
|
"epoch": 0.6488270786547256,
|
|
"grad_norm": 0.14178113639354706,
|
|
"learning_rate": 0.0017984755309651346,
|
|
"loss": 3.0267719268798827,
|
|
"num_input_tokens_seen": 6286213120,
|
|
"step": 11990,
|
|
"train_runtime": 54447.3835,
|
|
"train_tokens_per_second": 115454.825
|
|
},
|
|
{
|
|
"epoch": 0.6493682188370897,
|
|
"grad_norm": 0.1430656611919403,
|
|
"learning_rate": 0.0017949144802541274,
|
|
"loss": 3.0143644332885744,
|
|
"num_input_tokens_seen": 6291456000,
|
|
"step": 12000,
|
|
"train_runtime": 54492.6535,
|
|
"train_tokens_per_second": 115455.123
|
|
},
|
|
{
|
|
"epoch": 0.6493682188370897,
|
|
"eval_loss": 2.9761710166931152,
|
|
"eval_runtime": 1.9875,
|
|
"eval_samples_per_second": 251.575,
|
|
"eval_steps_per_second": 4.025,
|
|
"num_input_tokens_seen": 6291456000,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 0.649909359019454,
|
|
"grad_norm": 0.14500592648983002,
|
|
"learning_rate": 0.0017913563453752746,
|
|
"loss": 3.018670654296875,
|
|
"num_input_tokens_seen": 6296698880,
|
|
"step": 12010,
|
|
"train_runtime": 54542.2937,
|
|
"train_tokens_per_second": 115446.169
|
|
},
|
|
{
|
|
"epoch": 0.6504504992018182,
|
|
"grad_norm": 0.1448933333158493,
|
|
"learning_rate": 0.0017878011371913977,
|
|
"loss": 3.0202388763427734,
|
|
"num_input_tokens_seen": 6301941760,
|
|
"step": 12020,
|
|
"train_runtime": 54587.5091,
|
|
"train_tokens_per_second": 115446.59
|
|
},
|
|
{
|
|
"epoch": 0.6509916393841825,
|
|
"grad_norm": 0.1533222645521164,
|
|
"learning_rate": 0.0017842488665563833,
|
|
"loss": 3.025776672363281,
|
|
"num_input_tokens_seen": 6307184640,
|
|
"step": 12030,
|
|
"train_runtime": 54632.7175,
|
|
"train_tokens_per_second": 115447.024
|
|
},
|
|
{
|
|
"epoch": 0.6515327795665468,
|
|
"grad_norm": 0.13312490284442902,
|
|
"learning_rate": 0.0017806995443151524,
|
|
"loss": 3.0187503814697267,
|
|
"num_input_tokens_seen": 6312427520,
|
|
"step": 12040,
|
|
"train_runtime": 54677.8786,
|
|
"train_tokens_per_second": 115447.557
|
|
},
|
|
{
|
|
"epoch": 0.6520739197489109,
|
|
"grad_norm": 0.13797084987163544,
|
|
"learning_rate": 0.0017771531813036206,
|
|
"loss": 3.019959259033203,
|
|
"num_input_tokens_seen": 6317670400,
|
|
"step": 12050,
|
|
"train_runtime": 54723.0506,
|
|
"train_tokens_per_second": 115448.067
|
|
},
|
|
{
|
|
"epoch": 0.6526150599312752,
|
|
"grad_norm": 0.13628187775611877,
|
|
"learning_rate": 0.0017736097883486713,
|
|
"loss": 3.012210655212402,
|
|
"num_input_tokens_seen": 6322913280,
|
|
"step": 12060,
|
|
"train_runtime": 54768.2437,
|
|
"train_tokens_per_second": 115448.531
|
|
},
|
|
{
|
|
"epoch": 0.6531562001136394,
|
|
"grad_norm": 0.13764619827270508,
|
|
"learning_rate": 0.001770069376268119,
|
|
"loss": 3.0185993194580076,
|
|
"num_input_tokens_seen": 6328156160,
|
|
"step": 12070,
|
|
"train_runtime": 54813.436,
|
|
"train_tokens_per_second": 115448.996
|
|
},
|
|
{
|
|
"epoch": 0.6536973402960037,
|
|
"grad_norm": 0.14087094366550446,
|
|
"learning_rate": 0.001766531955870682,
|
|
"loss": 3.0167076110839846,
|
|
"num_input_tokens_seen": 6333399040,
|
|
"step": 12080,
|
|
"train_runtime": 54858.6498,
|
|
"train_tokens_per_second": 115449.415
|
|
},
|
|
{
|
|
"epoch": 0.654238480478368,
|
|
"grad_norm": 0.13622906804084778,
|
|
"learning_rate": 0.0017629975379559405,
|
|
"loss": 3.021717643737793,
|
|
"num_input_tokens_seen": 6338641920,
|
|
"step": 12090,
|
|
"train_runtime": 54903.8659,
|
|
"train_tokens_per_second": 115449.829
|
|
},
|
|
{
|
|
"epoch": 0.6547796206607321,
|
|
"grad_norm": 0.13120146095752716,
|
|
"learning_rate": 0.001759466133314308,
|
|
"loss": 3.0197391510009766,
|
|
"num_input_tokens_seen": 6343884800,
|
|
"step": 12100,
|
|
"train_runtime": 54949.0889,
|
|
"train_tokens_per_second": 115450.227
|
|
},
|
|
{
|
|
"epoch": 0.6553207608430964,
|
|
"grad_norm": 0.139594167470932,
|
|
"learning_rate": 0.001755937752727003,
|
|
"loss": 3.0223533630371096,
|
|
"num_input_tokens_seen": 6349127680,
|
|
"step": 12110,
|
|
"train_runtime": 54994.2934,
|
|
"train_tokens_per_second": 115450.664
|
|
},
|
|
{
|
|
"epoch": 0.6558619010254606,
|
|
"grad_norm": 0.15013989806175232,
|
|
"learning_rate": 0.001752412406966008,
|
|
"loss": 3.0148881912231444,
|
|
"num_input_tokens_seen": 6354370560,
|
|
"step": 12120,
|
|
"train_runtime": 55039.5071,
|
|
"train_tokens_per_second": 115451.08
|
|
},
|
|
{
|
|
"epoch": 0.6564030412078249,
|
|
"grad_norm": 0.13876710832118988,
|
|
"learning_rate": 0.0017488901067940416,
|
|
"loss": 3.0114933013916017,
|
|
"num_input_tokens_seen": 6359613440,
|
|
"step": 12130,
|
|
"train_runtime": 55084.7162,
|
|
"train_tokens_per_second": 115451.506
|
|
},
|
|
{
|
|
"epoch": 0.6569441813901892,
|
|
"grad_norm": 0.13125662505626678,
|
|
"learning_rate": 0.0017453708629645238,
|
|
"loss": 3.004977226257324,
|
|
"num_input_tokens_seen": 6364856320,
|
|
"step": 12140,
|
|
"train_runtime": 55129.9188,
|
|
"train_tokens_per_second": 115451.944
|
|
},
|
|
{
|
|
"epoch": 0.6574853215725533,
|
|
"grad_norm": 0.14310745894908905,
|
|
"learning_rate": 0.0017418546862215448,
|
|
"loss": 3.0219293594360352,
|
|
"num_input_tokens_seen": 6370099200,
|
|
"step": 12150,
|
|
"train_runtime": 55175.1468,
|
|
"train_tokens_per_second": 115452.329
|
|
},
|
|
{
|
|
"epoch": 0.6580264617549176,
|
|
"grad_norm": 0.1343064159154892,
|
|
"learning_rate": 0.0017383415872998303,
|
|
"loss": 3.017044258117676,
|
|
"num_input_tokens_seen": 6375342080,
|
|
"step": 12160,
|
|
"train_runtime": 55220.3693,
|
|
"train_tokens_per_second": 115452.724
|
|
},
|
|
{
|
|
"epoch": 0.6585676019372818,
|
|
"grad_norm": 0.13533759117126465,
|
|
"learning_rate": 0.0017348315769247086,
|
|
"loss": 3.0149707794189453,
|
|
"num_input_tokens_seen": 6380584960,
|
|
"step": 12170,
|
|
"train_runtime": 55265.5973,
|
|
"train_tokens_per_second": 115453.108
|
|
},
|
|
{
|
|
"epoch": 0.6591087421196461,
|
|
"grad_norm": 0.1386122703552246,
|
|
"learning_rate": 0.0017313246658120804,
|
|
"loss": 3.0143962860107423,
|
|
"num_input_tokens_seen": 6385827840,
|
|
"step": 12180,
|
|
"train_runtime": 55310.8039,
|
|
"train_tokens_per_second": 115453.535
|
|
},
|
|
{
|
|
"epoch": 0.6596498823020104,
|
|
"grad_norm": 0.1343347579240799,
|
|
"learning_rate": 0.0017278208646683856,
|
|
"loss": 3.0179080963134766,
|
|
"num_input_tokens_seen": 6391070720,
|
|
"step": 12190,
|
|
"train_runtime": 55356.0418,
|
|
"train_tokens_per_second": 115453.896
|
|
},
|
|
{
|
|
"epoch": 0.6601910224843746,
|
|
"grad_norm": 0.14169900119304657,
|
|
"learning_rate": 0.0017243201841905666,
|
|
"loss": 3.0247045516967774,
|
|
"num_input_tokens_seen": 6396313600,
|
|
"step": 12200,
|
|
"train_runtime": 55401.2599,
|
|
"train_tokens_per_second": 115454.299
|
|
},
|
|
{
|
|
"epoch": 0.6607321626667388,
|
|
"grad_norm": 0.13514377176761627,
|
|
"learning_rate": 0.0017208226350660391,
|
|
"loss": 3.0104536056518554,
|
|
"num_input_tokens_seen": 6401556480,
|
|
"step": 12210,
|
|
"train_runtime": 55450.8592,
|
|
"train_tokens_per_second": 115445.578
|
|
},
|
|
{
|
|
"epoch": 0.661273302849103,
|
|
"grad_norm": 0.13756819069385529,
|
|
"learning_rate": 0.0017173282279726609,
|
|
"loss": 3.0194664001464844,
|
|
"num_input_tokens_seen": 6406799360,
|
|
"step": 12220,
|
|
"train_runtime": 55496.1437,
|
|
"train_tokens_per_second": 115445.848
|
|
},
|
|
{
|
|
"epoch": 0.6618144430314673,
|
|
"grad_norm": 0.13056276738643646,
|
|
"learning_rate": 0.0017138369735786954,
|
|
"loss": 3.0248437881469727,
|
|
"num_input_tokens_seen": 6412042240,
|
|
"step": 12230,
|
|
"train_runtime": 55541.3669,
|
|
"train_tokens_per_second": 115446.245
|
|
},
|
|
{
|
|
"epoch": 0.6623555832138316,
|
|
"grad_norm": 0.13981449604034424,
|
|
"learning_rate": 0.0017103488825427826,
|
|
"loss": 3.0129575729370117,
|
|
"num_input_tokens_seen": 6417285120,
|
|
"step": 12240,
|
|
"train_runtime": 55586.7139,
|
|
"train_tokens_per_second": 115446.384
|
|
},
|
|
{
|
|
"epoch": 0.6628967233961958,
|
|
"grad_norm": 0.1439344733953476,
|
|
"learning_rate": 0.0017068639655139026,
|
|
"loss": 3.022663116455078,
|
|
"num_input_tokens_seen": 6422528000,
|
|
"step": 12250,
|
|
"train_runtime": 55632.0146,
|
|
"train_tokens_per_second": 115446.619
|
|
},
|
|
{
|
|
"epoch": 0.66343786357856,
|
|
"grad_norm": 0.15030835568904877,
|
|
"learning_rate": 0.001703382233131348,
|
|
"loss": 3.012424850463867,
|
|
"num_input_tokens_seen": 6427770880,
|
|
"step": 12260,
|
|
"train_runtime": 55677.2926,
|
|
"train_tokens_per_second": 115446.901
|
|
},
|
|
{
|
|
"epoch": 0.6639790037609242,
|
|
"grad_norm": 0.13960550725460052,
|
|
"learning_rate": 0.0016999036960246871,
|
|
"loss": 3.0081478118896485,
|
|
"num_input_tokens_seen": 6433013760,
|
|
"step": 12270,
|
|
"train_runtime": 55722.6028,
|
|
"train_tokens_per_second": 115447.115
|
|
},
|
|
{
|
|
"epoch": 0.6645201439432885,
|
|
"grad_norm": 0.13627994060516357,
|
|
"learning_rate": 0.0016964283648137329,
|
|
"loss": 3.0084842681884765,
|
|
"num_input_tokens_seen": 6438256640,
|
|
"step": 12280,
|
|
"train_runtime": 55767.8798,
|
|
"train_tokens_per_second": 115447.398
|
|
},
|
|
{
|
|
"epoch": 0.6650612841256528,
|
|
"grad_norm": 0.14768123626708984,
|
|
"learning_rate": 0.0016929562501085123,
|
|
"loss": 3.013652801513672,
|
|
"num_input_tokens_seen": 6443499520,
|
|
"step": 12290,
|
|
"train_runtime": 55813.1427,
|
|
"train_tokens_per_second": 115447.71
|
|
},
|
|
{
|
|
"epoch": 0.665602424308017,
|
|
"grad_norm": 0.14207823574543,
|
|
"learning_rate": 0.0016894873625092333,
|
|
"loss": 3.0111804962158204,
|
|
"num_input_tokens_seen": 6448742400,
|
|
"step": 12300,
|
|
"train_runtime": 55858.4112,
|
|
"train_tokens_per_second": 115448.01
|
|
},
|
|
{
|
|
"epoch": 0.6661435644903813,
|
|
"grad_norm": 0.1379329413175583,
|
|
"learning_rate": 0.0016860217126062479,
|
|
"loss": 3.0187799453735353,
|
|
"num_input_tokens_seen": 6453985280,
|
|
"step": 12310,
|
|
"train_runtime": 55903.6646,
|
|
"train_tokens_per_second": 115448.34
|
|
},
|
|
{
|
|
"epoch": 0.6666847046727454,
|
|
"grad_norm": 0.14401622116565704,
|
|
"learning_rate": 0.0016825593109800264,
|
|
"loss": 3.0228382110595704,
|
|
"num_input_tokens_seen": 6459228160,
|
|
"step": 12320,
|
|
"train_runtime": 55948.9475,
|
|
"train_tokens_per_second": 115448.609
|
|
},
|
|
{
|
|
"epoch": 0.6672258448551097,
|
|
"grad_norm": 0.12955419719219208,
|
|
"learning_rate": 0.0016791001682011227,
|
|
"loss": 3.0097047805786135,
|
|
"num_input_tokens_seen": 6464471040,
|
|
"step": 12330,
|
|
"train_runtime": 55994.314,
|
|
"train_tokens_per_second": 115448.705
|
|
},
|
|
{
|
|
"epoch": 0.667766985037474,
|
|
"grad_norm": 0.14710277318954468,
|
|
"learning_rate": 0.0016756442948301386,
|
|
"loss": 3.0169065475463865,
|
|
"num_input_tokens_seen": 6469713920,
|
|
"step": 12340,
|
|
"train_runtime": 56039.7555,
|
|
"train_tokens_per_second": 115448.646
|
|
},
|
|
{
|
|
"epoch": 0.6683081252198382,
|
|
"grad_norm": 0.1326688975095749,
|
|
"learning_rate": 0.0016721917014176982,
|
|
"loss": 3.009653663635254,
|
|
"num_input_tokens_seen": 6474956800,
|
|
"step": 12350,
|
|
"train_runtime": 56085.2141,
|
|
"train_tokens_per_second": 115448.553
|
|
},
|
|
{
|
|
"epoch": 0.6688492654022025,
|
|
"grad_norm": 0.13903285562992096,
|
|
"learning_rate": 0.0016687423985044109,
|
|
"loss": 3.019660758972168,
|
|
"num_input_tokens_seen": 6480199680,
|
|
"step": 12360,
|
|
"train_runtime": 56130.7366,
|
|
"train_tokens_per_second": 115448.328
|
|
},
|
|
{
|
|
"epoch": 0.6693904055845666,
|
|
"grad_norm": 0.13976383209228516,
|
|
"learning_rate": 0.0016652963966208385,
|
|
"loss": 3.0172367095947266,
|
|
"num_input_tokens_seen": 6485442560,
|
|
"step": 12370,
|
|
"train_runtime": 56176.2788,
|
|
"train_tokens_per_second": 115448.063
|
|
},
|
|
{
|
|
"epoch": 0.6699315457669309,
|
|
"grad_norm": 0.13633348047733307,
|
|
"learning_rate": 0.0016618537062874665,
|
|
"loss": 3.004638671875,
|
|
"num_input_tokens_seen": 6490685440,
|
|
"step": 12380,
|
|
"train_runtime": 56221.949,
|
|
"train_tokens_per_second": 115447.535
|
|
},
|
|
{
|
|
"epoch": 0.6704726859492952,
|
|
"grad_norm": 0.14074033498764038,
|
|
"learning_rate": 0.001658414338014669,
|
|
"loss": 3.019020843505859,
|
|
"num_input_tokens_seen": 6495928320,
|
|
"step": 12390,
|
|
"train_runtime": 56267.7615,
|
|
"train_tokens_per_second": 115446.717
|
|
},
|
|
{
|
|
"epoch": 0.6710138261316594,
|
|
"grad_norm": 0.1326296180486679,
|
|
"learning_rate": 0.0016549783023026808,
|
|
"loss": 3.0110851287841798,
|
|
"num_input_tokens_seen": 6501171200,
|
|
"step": 12400,
|
|
"train_runtime": 56313.0806,
|
|
"train_tokens_per_second": 115446.911
|
|
},
|
|
{
|
|
"epoch": 0.6715549663140237,
|
|
"grad_norm": 0.13860943913459778,
|
|
"learning_rate": 0.001651545609641561,
|
|
"loss": 3.0090118408203126,
|
|
"num_input_tokens_seen": 6506414080,
|
|
"step": 12410,
|
|
"train_runtime": 56358.3912,
|
|
"train_tokens_per_second": 115447.122
|
|
},
|
|
{
|
|
"epoch": 0.6720961064963878,
|
|
"grad_norm": 0.1410975605249405,
|
|
"learning_rate": 0.0016481162705111604,
|
|
"loss": 3.0008705139160154,
|
|
"num_input_tokens_seen": 6511656960,
|
|
"step": 12420,
|
|
"train_runtime": 56403.6982,
|
|
"train_tokens_per_second": 115447.341
|
|
},
|
|
{
|
|
"epoch": 0.6726372466787521,
|
|
"grad_norm": 0.13546454906463623,
|
|
"learning_rate": 0.0016446902953810964,
|
|
"loss": 3.013086700439453,
|
|
"num_input_tokens_seen": 6516899840,
|
|
"step": 12430,
|
|
"train_runtime": 56448.9891,
|
|
"train_tokens_per_second": 115447.592
|
|
},
|
|
{
|
|
"epoch": 0.6731783868611164,
|
|
"grad_norm": 0.13547931611537933,
|
|
"learning_rate": 0.0016412676947107113,
|
|
"loss": 3.004133605957031,
|
|
"num_input_tokens_seen": 6522142720,
|
|
"step": 12440,
|
|
"train_runtime": 56494.2857,
|
|
"train_tokens_per_second": 115447.831
|
|
},
|
|
{
|
|
"epoch": 0.6737195270434806,
|
|
"grad_norm": 0.13898716866970062,
|
|
"learning_rate": 0.0016378484789490479,
|
|
"loss": 3.015100860595703,
|
|
"num_input_tokens_seen": 6527385600,
|
|
"step": 12450,
|
|
"train_runtime": 56539.5755,
|
|
"train_tokens_per_second": 115448.083
|
|
},
|
|
{
|
|
"epoch": 0.6742606672258449,
|
|
"grad_norm": 0.1385628879070282,
|
|
"learning_rate": 0.0016344326585348147,
|
|
"loss": 3.018421936035156,
|
|
"num_input_tokens_seen": 6532628480,
|
|
"step": 12460,
|
|
"train_runtime": 56584.8917,
|
|
"train_tokens_per_second": 115448.281
|
|
},
|
|
{
|
|
"epoch": 0.674801807408209,
|
|
"grad_norm": 0.13880495727062225,
|
|
"learning_rate": 0.001631020243896355,
|
|
"loss": 3.0016693115234374,
|
|
"num_input_tokens_seen": 6537871360,
|
|
"step": 12470,
|
|
"train_runtime": 56630.196,
|
|
"train_tokens_per_second": 115448.503
|
|
},
|
|
{
|
|
"epoch": 0.6753429475905733,
|
|
"grad_norm": 0.1371801793575287,
|
|
"learning_rate": 0.0016276112454516134,
|
|
"loss": 3.0135356903076174,
|
|
"num_input_tokens_seen": 6543114240,
|
|
"step": 12480,
|
|
"train_runtime": 56675.5074,
|
|
"train_tokens_per_second": 115448.71
|
|
},
|
|
{
|
|
"epoch": 0.6758840877729376,
|
|
"grad_norm": 0.1398102194070816,
|
|
"learning_rate": 0.001624205673608104,
|
|
"loss": 3.0212148666381835,
|
|
"num_input_tokens_seen": 6548357120,
|
|
"step": 12490,
|
|
"train_runtime": 56720.8046,
|
|
"train_tokens_per_second": 115448.946
|
|
},
|
|
{
|
|
"epoch": 0.6764252279553018,
|
|
"grad_norm": 0.1300211250782013,
|
|
"learning_rate": 0.0016208035387628825,
|
|
"loss": 3.0142328262329103,
|
|
"num_input_tokens_seen": 6553600000,
|
|
"step": 12500,
|
|
"train_runtime": 56766.0883,
|
|
"train_tokens_per_second": 115449.209
|
|
},
|
|
{
|
|
"epoch": 0.6764252279553018,
|
|
"eval_loss": 2.968597412109375,
|
|
"eval_runtime": 1.9925,
|
|
"eval_samples_per_second": 250.941,
|
|
"eval_steps_per_second": 4.015,
|
|
"num_input_tokens_seen": 6553600000,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 0.6769663681376661,
|
|
"grad_norm": 0.14369215071201324,
|
|
"learning_rate": 0.0016174048513025103,
|
|
"loss": 3.0048513412475586,
|
|
"num_input_tokens_seen": 6558842880,
|
|
"step": 12510,
|
|
"train_runtime": 56813.3987,
|
|
"train_tokens_per_second": 115445.353
|
|
},
|
|
{
|
|
"epoch": 0.6775075083200303,
|
|
"grad_norm": 0.14692343771457672,
|
|
"learning_rate": 0.0016140096216030232,
|
|
"loss": 3.0137935638427735,
|
|
"num_input_tokens_seen": 6564085760,
|
|
"step": 12520,
|
|
"train_runtime": 56858.6904,
|
|
"train_tokens_per_second": 115445.602
|
|
},
|
|
{
|
|
"epoch": 0.6780486485023945,
|
|
"grad_norm": 0.14028270542621613,
|
|
"learning_rate": 0.0016106178600299001,
|
|
"loss": 3.010356140136719,
|
|
"num_input_tokens_seen": 6569328640,
|
|
"step": 12530,
|
|
"train_runtime": 56903.9761,
|
|
"train_tokens_per_second": 115445.863
|
|
},
|
|
{
|
|
"epoch": 0.6785897886847588,
|
|
"grad_norm": 0.12822629511356354,
|
|
"learning_rate": 0.0016072295769380353,
|
|
"loss": 3.0003124237060548,
|
|
"num_input_tokens_seen": 6574571520,
|
|
"step": 12540,
|
|
"train_runtime": 56949.26,
|
|
"train_tokens_per_second": 115446.127
|
|
},
|
|
{
|
|
"epoch": 0.679130928867123,
|
|
"grad_norm": 0.1369100958108902,
|
|
"learning_rate": 0.0016038447826716993,
|
|
"loss": 3.0066249847412108,
|
|
"num_input_tokens_seen": 6579814400,
|
|
"step": 12550,
|
|
"train_runtime": 56994.5681,
|
|
"train_tokens_per_second": 115446.342
|
|
},
|
|
{
|
|
"epoch": 0.6796720690494873,
|
|
"grad_norm": 0.14047878980636597,
|
|
"learning_rate": 0.001600463487564515,
|
|
"loss": 3.0145965576171876,
|
|
"num_input_tokens_seen": 6585057280,
|
|
"step": 12560,
|
|
"train_runtime": 57039.861,
|
|
"train_tokens_per_second": 115446.587
|
|
},
|
|
{
|
|
"epoch": 0.6802132092318515,
|
|
"grad_norm": 0.14242438971996307,
|
|
"learning_rate": 0.001597085701939419,
|
|
"loss": 3.0166095733642577,
|
|
"num_input_tokens_seen": 6590300160,
|
|
"step": 12570,
|
|
"train_runtime": 57085.1398,
|
|
"train_tokens_per_second": 115446.86
|
|
},
|
|
{
|
|
"epoch": 0.6807543494142158,
|
|
"grad_norm": 0.1383470743894577,
|
|
"learning_rate": 0.0015937114361086369,
|
|
"loss": 3.0075637817382814,
|
|
"num_input_tokens_seen": 6595543040,
|
|
"step": 12580,
|
|
"train_runtime": 57130.4343,
|
|
"train_tokens_per_second": 115447.101
|
|
},
|
|
{
|
|
"epoch": 0.68129548959658,
|
|
"grad_norm": 0.1291186362504959,
|
|
"learning_rate": 0.0015903407003736466,
|
|
"loss": 3.01377010345459,
|
|
"num_input_tokens_seen": 6600785920,
|
|
"step": 12590,
|
|
"train_runtime": 57180.0264,
|
|
"train_tokens_per_second": 115438.665
|
|
},
|
|
{
|
|
"epoch": 0.6818366297789442,
|
|
"grad_norm": 0.13580311834812164,
|
|
"learning_rate": 0.0015869735050251489,
|
|
"loss": 3.0099231719970705,
|
|
"num_input_tokens_seen": 6606028800,
|
|
"step": 12600,
|
|
"train_runtime": 57225.3092,
|
|
"train_tokens_per_second": 115438.936
|
|
},
|
|
{
|
|
"epoch": 0.6823777699613085,
|
|
"grad_norm": 0.1437922716140747,
|
|
"learning_rate": 0.0015836098603430357,
|
|
"loss": 3.0034923553466797,
|
|
"num_input_tokens_seen": 6611271680,
|
|
"step": 12610,
|
|
"train_runtime": 57270.5349,
|
|
"train_tokens_per_second": 115439.321
|
|
},
|
|
{
|
|
"epoch": 0.6829189101436727,
|
|
"grad_norm": 0.13526742160320282,
|
|
"learning_rate": 0.0015802497765963614,
|
|
"loss": 3.00305061340332,
|
|
"num_input_tokens_seen": 6616514560,
|
|
"step": 12620,
|
|
"train_runtime": 57315.7589,
|
|
"train_tokens_per_second": 115439.709
|
|
},
|
|
{
|
|
"epoch": 0.683460050326037,
|
|
"grad_norm": 0.1404607594013214,
|
|
"learning_rate": 0.0015768932640433059,
|
|
"loss": 3.0041690826416017,
|
|
"num_input_tokens_seen": 6621757440,
|
|
"step": 12630,
|
|
"train_runtime": 57360.9936,
|
|
"train_tokens_per_second": 115440.076
|
|
},
|
|
{
|
|
"epoch": 0.6840011905084012,
|
|
"grad_norm": 0.13756705820560455,
|
|
"learning_rate": 0.0015735403329311469,
|
|
"loss": 2.9982038497924806,
|
|
"num_input_tokens_seen": 6627000320,
|
|
"step": 12640,
|
|
"train_runtime": 57406.2268,
|
|
"train_tokens_per_second": 115440.444
|
|
},
|
|
{
|
|
"epoch": 0.6845423306907654,
|
|
"grad_norm": 0.14006656408309937,
|
|
"learning_rate": 0.0015701909934962305,
|
|
"loss": 3.009762763977051,
|
|
"num_input_tokens_seen": 6632243200,
|
|
"step": 12650,
|
|
"train_runtime": 57451.4583,
|
|
"train_tokens_per_second": 115440.816
|
|
},
|
|
{
|
|
"epoch": 0.6850834708731297,
|
|
"grad_norm": 0.13317948579788208,
|
|
"learning_rate": 0.001566845255963934,
|
|
"loss": 3.0151742935180663,
|
|
"num_input_tokens_seen": 6637486080,
|
|
"step": 12660,
|
|
"train_runtime": 57496.7057,
|
|
"train_tokens_per_second": 115441.154
|
|
},
|
|
{
|
|
"epoch": 0.6856246110554939,
|
|
"grad_norm": 0.13669337332248688,
|
|
"learning_rate": 0.0015635031305486417,
|
|
"loss": 3.000714874267578,
|
|
"num_input_tokens_seen": 6642728960,
|
|
"step": 12670,
|
|
"train_runtime": 57541.9394,
|
|
"train_tokens_per_second": 115441.52
|
|
},
|
|
{
|
|
"epoch": 0.6861657512378582,
|
|
"grad_norm": 0.13967348635196686,
|
|
"learning_rate": 0.0015601646274537087,
|
|
"loss": 3.0043874740600587,
|
|
"num_input_tokens_seen": 6647971840,
|
|
"step": 12680,
|
|
"train_runtime": 57587.1773,
|
|
"train_tokens_per_second": 115441.877
|
|
},
|
|
{
|
|
"epoch": 0.6867068914202225,
|
|
"grad_norm": 0.13815197348594666,
|
|
"learning_rate": 0.0015568297568714312,
|
|
"loss": 3.010976219177246,
|
|
"num_input_tokens_seen": 6653214720,
|
|
"step": 12690,
|
|
"train_runtime": 57632.4045,
|
|
"train_tokens_per_second": 115442.255
|
|
},
|
|
{
|
|
"epoch": 0.6872480316025866,
|
|
"grad_norm": 0.1381223499774933,
|
|
"learning_rate": 0.001553498528983015,
|
|
"loss": 3.013303756713867,
|
|
"num_input_tokens_seen": 6658457600,
|
|
"step": 12700,
|
|
"train_runtime": 57677.6438,
|
|
"train_tokens_per_second": 115442.608
|
|
},
|
|
{
|
|
"epoch": 0.6877891717849509,
|
|
"grad_norm": 0.13350199162960052,
|
|
"learning_rate": 0.0015501709539585454,
|
|
"loss": 3.012788009643555,
|
|
"num_input_tokens_seen": 6663700480,
|
|
"step": 12710,
|
|
"train_runtime": 57722.8853,
|
|
"train_tokens_per_second": 115442.956
|
|
},
|
|
{
|
|
"epoch": 0.6883303119673151,
|
|
"grad_norm": 0.13979476690292358,
|
|
"learning_rate": 0.0015468470419569564,
|
|
"loss": 3.0098241806030273,
|
|
"num_input_tokens_seen": 6668943360,
|
|
"step": 12720,
|
|
"train_runtime": 57768.1112,
|
|
"train_tokens_per_second": 115443.334
|
|
},
|
|
{
|
|
"epoch": 0.6888714521496794,
|
|
"grad_norm": 0.13748957216739655,
|
|
"learning_rate": 0.0015435268031259992,
|
|
"loss": 3.009090805053711,
|
|
"num_input_tokens_seen": 6674186240,
|
|
"step": 12730,
|
|
"train_runtime": 57813.3636,
|
|
"train_tokens_per_second": 115443.659
|
|
},
|
|
{
|
|
"epoch": 0.6894125923320437,
|
|
"grad_norm": 0.13561367988586426,
|
|
"learning_rate": 0.0015402102476022095,
|
|
"loss": 3.008078765869141,
|
|
"num_input_tokens_seen": 6679429120,
|
|
"step": 12740,
|
|
"train_runtime": 57858.572,
|
|
"train_tokens_per_second": 115444.072
|
|
},
|
|
{
|
|
"epoch": 0.6899537325144078,
|
|
"grad_norm": 0.12914767861366272,
|
|
"learning_rate": 0.0015368973855108782,
|
|
"loss": 3.0018003463745115,
|
|
"num_input_tokens_seen": 6684672000,
|
|
"step": 12750,
|
|
"train_runtime": 57903.8186,
|
|
"train_tokens_per_second": 115444.407
|
|
},
|
|
{
|
|
"epoch": 0.6904948726967721,
|
|
"grad_norm": 0.14038655161857605,
|
|
"learning_rate": 0.0015335882269660217,
|
|
"loss": 3.004079818725586,
|
|
"num_input_tokens_seen": 6689914880,
|
|
"step": 12760,
|
|
"train_runtime": 57949.0509,
|
|
"train_tokens_per_second": 115444.771
|
|
},
|
|
{
|
|
"epoch": 0.6910360128791363,
|
|
"grad_norm": 0.13866056501865387,
|
|
"learning_rate": 0.001530282782070348,
|
|
"loss": 3.009323310852051,
|
|
"num_input_tokens_seen": 6695157760,
|
|
"step": 12770,
|
|
"train_runtime": 57994.2931,
|
|
"train_tokens_per_second": 115445.114
|
|
},
|
|
{
|
|
"epoch": 0.6915771530615006,
|
|
"grad_norm": 0.1286270171403885,
|
|
"learning_rate": 0.001526981060915229,
|
|
"loss": 3.000651550292969,
|
|
"num_input_tokens_seen": 6700400640,
|
|
"step": 12780,
|
|
"train_runtime": 58039.518,
|
|
"train_tokens_per_second": 115445.491
|
|
},
|
|
{
|
|
"epoch": 0.6921182932438649,
|
|
"grad_norm": 0.13248993456363678,
|
|
"learning_rate": 0.0015236830735806679,
|
|
"loss": 3.0101812362670897,
|
|
"num_input_tokens_seen": 6705643520,
|
|
"step": 12790,
|
|
"train_runtime": 58084.7779,
|
|
"train_tokens_per_second": 115445.798
|
|
},
|
|
{
|
|
"epoch": 0.692659433426229,
|
|
"grad_norm": 0.1369810700416565,
|
|
"learning_rate": 0.0015203888301352675,
|
|
"loss": 3.004811477661133,
|
|
"num_input_tokens_seen": 6710886400,
|
|
"step": 12800,
|
|
"train_runtime": 58130.0044,
|
|
"train_tokens_per_second": 115446.171
|
|
},
|
|
{
|
|
"epoch": 0.6932005736085933,
|
|
"grad_norm": 0.14264971017837524,
|
|
"learning_rate": 0.001517098340636202,
|
|
"loss": 3.010848808288574,
|
|
"num_input_tokens_seen": 6716129280,
|
|
"step": 12810,
|
|
"train_runtime": 58175.241,
|
|
"train_tokens_per_second": 115446.523
|
|
},
|
|
{
|
|
"epoch": 0.6937417137909575,
|
|
"grad_norm": 0.1406365931034088,
|
|
"learning_rate": 0.0015138116151291825,
|
|
"loss": 3.0090103149414062,
|
|
"num_input_tokens_seen": 6721372160,
|
|
"step": 12820,
|
|
"train_runtime": 58220.4724,
|
|
"train_tokens_per_second": 115446.885
|
|
},
|
|
{
|
|
"epoch": 0.6942828539733218,
|
|
"grad_norm": 0.13356050848960876,
|
|
"learning_rate": 0.0015105286636484334,
|
|
"loss": 2.999258613586426,
|
|
"num_input_tokens_seen": 6726615040,
|
|
"step": 12830,
|
|
"train_runtime": 58265.7054,
|
|
"train_tokens_per_second": 115447.243
|
|
},
|
|
{
|
|
"epoch": 0.6948239941556861,
|
|
"grad_norm": 0.13091513514518738,
|
|
"learning_rate": 0.001507249496216654,
|
|
"loss": 3.005986785888672,
|
|
"num_input_tokens_seen": 6731857920,
|
|
"step": 12840,
|
|
"train_runtime": 58310.9354,
|
|
"train_tokens_per_second": 115447.606
|
|
},
|
|
{
|
|
"epoch": 0.6953651343380503,
|
|
"grad_norm": 0.1335466355085373,
|
|
"learning_rate": 0.0015039741228449904,
|
|
"loss": 2.9974597930908202,
|
|
"num_input_tokens_seen": 6737100800,
|
|
"step": 12850,
|
|
"train_runtime": 58356.1736,
|
|
"train_tokens_per_second": 115447.953
|
|
},
|
|
{
|
|
"epoch": 0.6959062745204145,
|
|
"grad_norm": 0.1375114917755127,
|
|
"learning_rate": 0.0015007025535330083,
|
|
"loss": 3.0074440002441407,
|
|
"num_input_tokens_seen": 6742343680,
|
|
"step": 12860,
|
|
"train_runtime": 58401.3717,
|
|
"train_tokens_per_second": 115448.379
|
|
},
|
|
{
|
|
"epoch": 0.6964474147027787,
|
|
"grad_norm": 0.15171852707862854,
|
|
"learning_rate": 0.001497434798268658,
|
|
"loss": 2.996272659301758,
|
|
"num_input_tokens_seen": 6747586560,
|
|
"step": 12870,
|
|
"train_runtime": 58446.5932,
|
|
"train_tokens_per_second": 115448.757
|
|
},
|
|
{
|
|
"epoch": 0.696988554885143,
|
|
"grad_norm": 0.13725285232067108,
|
|
"learning_rate": 0.0014941708670282445,
|
|
"loss": 3.0174352645874025,
|
|
"num_input_tokens_seen": 6752829440,
|
|
"step": 12880,
|
|
"train_runtime": 58491.8411,
|
|
"train_tokens_per_second": 115449.083
|
|
},
|
|
{
|
|
"epoch": 0.6975296950675073,
|
|
"grad_norm": 0.1326073855161667,
|
|
"learning_rate": 0.0014909107697764006,
|
|
"loss": 3.006754684448242,
|
|
"num_input_tokens_seen": 6758072320,
|
|
"step": 12890,
|
|
"train_runtime": 58537.0682,
|
|
"train_tokens_per_second": 115449.45
|
|
},
|
|
{
|
|
"epoch": 0.6980708352498715,
|
|
"grad_norm": 0.1453487128019333,
|
|
"learning_rate": 0.0014876545164660543,
|
|
"loss": 3.003281021118164,
|
|
"num_input_tokens_seen": 6763315200,
|
|
"step": 12900,
|
|
"train_runtime": 58582.3109,
|
|
"train_tokens_per_second": 115449.785
|
|
},
|
|
{
|
|
"epoch": 0.6986119754322357,
|
|
"grad_norm": 0.13233183324337006,
|
|
"learning_rate": 0.001484402117038397,
|
|
"loss": 3.0117160797119142,
|
|
"num_input_tokens_seen": 6768558080,
|
|
"step": 12910,
|
|
"train_runtime": 58627.5472,
|
|
"train_tokens_per_second": 115450.132
|
|
},
|
|
{
|
|
"epoch": 0.6991531156145999,
|
|
"grad_norm": 0.1383819729089737,
|
|
"learning_rate": 0.0014811535814228522,
|
|
"loss": 3.0003276824951173,
|
|
"num_input_tokens_seen": 6773800960,
|
|
"step": 12920,
|
|
"train_runtime": 58672.7881,
|
|
"train_tokens_per_second": 115450.47
|
|
},
|
|
{
|
|
"epoch": 0.6996942557969642,
|
|
"grad_norm": 0.13273653388023376,
|
|
"learning_rate": 0.0014779089195370515,
|
|
"loss": 3.006727600097656,
|
|
"num_input_tokens_seen": 6779043840,
|
|
"step": 12930,
|
|
"train_runtime": 58718.0154,
|
|
"train_tokens_per_second": 115450.834
|
|
},
|
|
{
|
|
"epoch": 0.7002353959793285,
|
|
"grad_norm": 0.13412410020828247,
|
|
"learning_rate": 0.0014746681412867993,
|
|
"loss": 2.9990608215332033,
|
|
"num_input_tokens_seen": 6784286720,
|
|
"step": 12940,
|
|
"train_runtime": 58763.2242,
|
|
"train_tokens_per_second": 115451.234
|
|
},
|
|
{
|
|
"epoch": 0.7007765361616927,
|
|
"grad_norm": 0.13567864894866943,
|
|
"learning_rate": 0.0014714312565660412,
|
|
"loss": 3.001424789428711,
|
|
"num_input_tokens_seen": 6789529600,
|
|
"step": 12950,
|
|
"train_runtime": 58808.4491,
|
|
"train_tokens_per_second": 115451.601
|
|
},
|
|
{
|
|
"epoch": 0.701317676344057,
|
|
"grad_norm": 0.12947793304920197,
|
|
"learning_rate": 0.0014681982752568368,
|
|
"loss": 2.9996448516845704,
|
|
"num_input_tokens_seen": 6794772480,
|
|
"step": 12960,
|
|
"train_runtime": 58853.6594,
|
|
"train_tokens_per_second": 115451.997
|
|
},
|
|
{
|
|
"epoch": 0.7018588165264211,
|
|
"grad_norm": 0.1319398730993271,
|
|
"learning_rate": 0.001464969207229331,
|
|
"loss": 3.0077224731445313,
|
|
"num_input_tokens_seen": 6800015360,
|
|
"step": 12970,
|
|
"train_runtime": 58898.8938,
|
|
"train_tokens_per_second": 115452.344
|
|
},
|
|
{
|
|
"epoch": 0.7023999567087854,
|
|
"grad_norm": 0.14026153087615967,
|
|
"learning_rate": 0.0014617440623417178,
|
|
"loss": 2.999114227294922,
|
|
"num_input_tokens_seen": 6805258240,
|
|
"step": 12980,
|
|
"train_runtime": 58948.6295,
|
|
"train_tokens_per_second": 115443.875
|
|
},
|
|
{
|
|
"epoch": 0.7029410968911497,
|
|
"grad_norm": 0.14495210349559784,
|
|
"learning_rate": 0.0014585228504402185,
|
|
"loss": 3.005875015258789,
|
|
"num_input_tokens_seen": 6810501120,
|
|
"step": 12990,
|
|
"train_runtime": 58994.0959,
|
|
"train_tokens_per_second": 115443.775
|
|
},
|
|
{
|
|
"epoch": 0.7034822370735139,
|
|
"grad_norm": 0.13643252849578857,
|
|
"learning_rate": 0.001455305581359043,
|
|
"loss": 2.997660255432129,
|
|
"num_input_tokens_seen": 6815744000,
|
|
"step": 13000,
|
|
"train_runtime": 59039.5206,
|
|
"train_tokens_per_second": 115443.756
|
|
},
|
|
{
|
|
"epoch": 0.7034822370735139,
|
|
"eval_loss": 2.960465669631958,
|
|
"eval_runtime": 1.987,
|
|
"eval_samples_per_second": 251.641,
|
|
"eval_steps_per_second": 4.026,
|
|
"num_input_tokens_seen": 6815744000,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 0.7040233772558782,
|
|
"grad_norm": 0.130798801779747,
|
|
"learning_rate": 0.001452092264920367,
|
|
"loss": 3.0002573013305662,
|
|
"num_input_tokens_seen": 6820986880,
|
|
"step": 13010,
|
|
"train_runtime": 59089.599,
|
|
"train_tokens_per_second": 115434.645
|
|
},
|
|
{
|
|
"epoch": 0.7045645174382423,
|
|
"grad_norm": 0.13077320158481598,
|
|
"learning_rate": 0.001448882910934297,
|
|
"loss": 3.00850830078125,
|
|
"num_input_tokens_seen": 6826229760,
|
|
"step": 13020,
|
|
"train_runtime": 59135.0207,
|
|
"train_tokens_per_second": 115434.639
|
|
},
|
|
{
|
|
"epoch": 0.7051056576206066,
|
|
"grad_norm": 0.14131614565849304,
|
|
"learning_rate": 0.0014456775291988434,
|
|
"loss": 3.0077110290527345,
|
|
"num_input_tokens_seen": 6831472640,
|
|
"step": 13030,
|
|
"train_runtime": 59180.4577,
|
|
"train_tokens_per_second": 115434.603
|
|
},
|
|
{
|
|
"epoch": 0.7056467978029709,
|
|
"grad_norm": 0.13815636932849884,
|
|
"learning_rate": 0.0014424761294998883,
|
|
"loss": 3.00131778717041,
|
|
"num_input_tokens_seen": 6836715520,
|
|
"step": 13040,
|
|
"train_runtime": 59225.9087,
|
|
"train_tokens_per_second": 115434.54
|
|
},
|
|
{
|
|
"epoch": 0.7061879379853351,
|
|
"grad_norm": 0.1329071819782257,
|
|
"learning_rate": 0.0014392787216111597,
|
|
"loss": 2.994339370727539,
|
|
"num_input_tokens_seen": 6841958400,
|
|
"step": 13050,
|
|
"train_runtime": 59271.3336,
|
|
"train_tokens_per_second": 115434.528
|
|
},
|
|
{
|
|
"epoch": 0.7067290781676994,
|
|
"grad_norm": 0.13561072945594788,
|
|
"learning_rate": 0.0014360853152941958,
|
|
"loss": 3.0034358978271483,
|
|
"num_input_tokens_seen": 6847201280,
|
|
"step": 13060,
|
|
"train_runtime": 59316.7359,
|
|
"train_tokens_per_second": 115434.56
|
|
},
|
|
{
|
|
"epoch": 0.7072702183500635,
|
|
"grad_norm": 0.13618333637714386,
|
|
"learning_rate": 0.0014328959202983182,
|
|
"loss": 3.0087270736694336,
|
|
"num_input_tokens_seen": 6852444160,
|
|
"step": 13070,
|
|
"train_runtime": 59362.09,
|
|
"train_tokens_per_second": 115434.685
|
|
},
|
|
{
|
|
"epoch": 0.7078113585324278,
|
|
"grad_norm": 0.1365492194890976,
|
|
"learning_rate": 0.0014297105463606044,
|
|
"loss": 3.0061859130859374,
|
|
"num_input_tokens_seen": 6857687040,
|
|
"step": 13080,
|
|
"train_runtime": 59407.4452,
|
|
"train_tokens_per_second": 115434.808
|
|
},
|
|
{
|
|
"epoch": 0.7083524987147921,
|
|
"grad_norm": 0.13774985074996948,
|
|
"learning_rate": 0.001426529203205853,
|
|
"loss": 3.010288429260254,
|
|
"num_input_tokens_seen": 6862929920,
|
|
"step": 13090,
|
|
"train_runtime": 59452.8193,
|
|
"train_tokens_per_second": 115434.894
|
|
},
|
|
{
|
|
"epoch": 0.7088936388971563,
|
|
"grad_norm": 0.1349509209394455,
|
|
"learning_rate": 0.00142335190054656,
|
|
"loss": 3.000904846191406,
|
|
"num_input_tokens_seen": 6868172800,
|
|
"step": 13100,
|
|
"train_runtime": 59498.1377,
|
|
"train_tokens_per_second": 115435.089
|
|
},
|
|
{
|
|
"epoch": 0.7094347790795206,
|
|
"grad_norm": 0.1314682513475418,
|
|
"learning_rate": 0.0014201786480828838,
|
|
"loss": 3.0022382736206055,
|
|
"num_input_tokens_seen": 6873415680,
|
|
"step": 13110,
|
|
"train_runtime": 59543.4355,
|
|
"train_tokens_per_second": 115435.322
|
|
},
|
|
{
|
|
"epoch": 0.7099759192618847,
|
|
"grad_norm": 0.14362597465515137,
|
|
"learning_rate": 0.0014170094555026182,
|
|
"loss": 2.9901851654052733,
|
|
"num_input_tokens_seen": 6878658560,
|
|
"step": 13120,
|
|
"train_runtime": 59588.6836,
|
|
"train_tokens_per_second": 115435.652
|
|
},
|
|
{
|
|
"epoch": 0.710517059444249,
|
|
"grad_norm": 0.13301101326942444,
|
|
"learning_rate": 0.0014138443324811618,
|
|
"loss": 3.0021732330322264,
|
|
"num_input_tokens_seen": 6883901440,
|
|
"step": 13130,
|
|
"train_runtime": 59633.9351,
|
|
"train_tokens_per_second": 115435.975
|
|
},
|
|
{
|
|
"epoch": 0.7110581996266133,
|
|
"grad_norm": 0.13076400756835938,
|
|
"learning_rate": 0.0014106832886814891,
|
|
"loss": 3.0049604415893554,
|
|
"num_input_tokens_seen": 6889144320,
|
|
"step": 13140,
|
|
"train_runtime": 59679.1572,
|
|
"train_tokens_per_second": 115436.354
|
|
},
|
|
{
|
|
"epoch": 0.7115993398089775,
|
|
"grad_norm": 0.13057680428028107,
|
|
"learning_rate": 0.0014075263337541223,
|
|
"loss": 3.009153938293457,
|
|
"num_input_tokens_seen": 6894387200,
|
|
"step": 13150,
|
|
"train_runtime": 59724.3952,
|
|
"train_tokens_per_second": 115436.702
|
|
},
|
|
{
|
|
"epoch": 0.7121404799913418,
|
|
"grad_norm": 0.13498692214488983,
|
|
"learning_rate": 0.0014043734773370997,
|
|
"loss": 2.996112060546875,
|
|
"num_input_tokens_seen": 6899630080,
|
|
"step": 13160,
|
|
"train_runtime": 59769.5992,
|
|
"train_tokens_per_second": 115437.115
|
|
},
|
|
{
|
|
"epoch": 0.712681620173706,
|
|
"grad_norm": 0.13407272100448608,
|
|
"learning_rate": 0.0014012247290559466,
|
|
"loss": 3.0008213043212892,
|
|
"num_input_tokens_seen": 6904872960,
|
|
"step": 13170,
|
|
"train_runtime": 59814.8054,
|
|
"train_tokens_per_second": 115437.523
|
|
},
|
|
{
|
|
"epoch": 0.7132227603560702,
|
|
"grad_norm": 0.14042150974273682,
|
|
"learning_rate": 0.0013980800985236468,
|
|
"loss": 2.9953586578369142,
|
|
"num_input_tokens_seen": 6910115840,
|
|
"step": 13180,
|
|
"train_runtime": 59859.9779,
|
|
"train_tokens_per_second": 115437.995
|
|
},
|
|
{
|
|
"epoch": 0.7137639005384345,
|
|
"grad_norm": 0.13807494938373566,
|
|
"learning_rate": 0.0013949395953406127,
|
|
"loss": 2.9886444091796873,
|
|
"num_input_tokens_seen": 6915358720,
|
|
"step": 13190,
|
|
"train_runtime": 59905.1537,
|
|
"train_tokens_per_second": 115438.461
|
|
},
|
|
{
|
|
"epoch": 0.7143050407207987,
|
|
"grad_norm": 0.13666392862796783,
|
|
"learning_rate": 0.0013918032290946552,
|
|
"loss": 3.0074825286865234,
|
|
"num_input_tokens_seen": 6920601600,
|
|
"step": 13200,
|
|
"train_runtime": 59950.322,
|
|
"train_tokens_per_second": 115438.94
|
|
},
|
|
{
|
|
"epoch": 0.714846180903163,
|
|
"grad_norm": 0.12777790427207947,
|
|
"learning_rate": 0.0013886710093609566,
|
|
"loss": 2.9995635986328124,
|
|
"num_input_tokens_seen": 6925844480,
|
|
"step": 13210,
|
|
"train_runtime": 59995.4811,
|
|
"train_tokens_per_second": 115439.436
|
|
},
|
|
{
|
|
"epoch": 0.7153873210855272,
|
|
"grad_norm": 0.13057056069374084,
|
|
"learning_rate": 0.0013855429457020408,
|
|
"loss": 2.993345260620117,
|
|
"num_input_tokens_seen": 6931087360,
|
|
"step": 13220,
|
|
"train_runtime": 60040.6669,
|
|
"train_tokens_per_second": 115439.88
|
|
},
|
|
{
|
|
"epoch": 0.7159284612678914,
|
|
"grad_norm": 0.13309696316719055,
|
|
"learning_rate": 0.0013824190476677417,
|
|
"loss": 2.9962528228759764,
|
|
"num_input_tokens_seen": 6936330240,
|
|
"step": 13230,
|
|
"train_runtime": 60085.8338,
|
|
"train_tokens_per_second": 115440.359
|
|
},
|
|
{
|
|
"epoch": 0.7164696014502557,
|
|
"grad_norm": 0.13253308832645416,
|
|
"learning_rate": 0.0013792993247951752,
|
|
"loss": 3.001760482788086,
|
|
"num_input_tokens_seen": 6941573120,
|
|
"step": 13240,
|
|
"train_runtime": 60130.9838,
|
|
"train_tokens_per_second": 115440.871
|
|
},
|
|
{
|
|
"epoch": 0.7170107416326199,
|
|
"grad_norm": 0.14509917795658112,
|
|
"learning_rate": 0.001376183786608712,
|
|
"loss": 2.999083137512207,
|
|
"num_input_tokens_seen": 6946816000,
|
|
"step": 13250,
|
|
"train_runtime": 60176.1243,
|
|
"train_tokens_per_second": 115441.399
|
|
},
|
|
{
|
|
"epoch": 0.7175518818149842,
|
|
"grad_norm": 0.13013510406017303,
|
|
"learning_rate": 0.001373072442619947,
|
|
"loss": 3.0021896362304688,
|
|
"num_input_tokens_seen": 6952058880,
|
|
"step": 13260,
|
|
"train_runtime": 60221.2777,
|
|
"train_tokens_per_second": 115441.903
|
|
},
|
|
{
|
|
"epoch": 0.7180930219973484,
|
|
"grad_norm": 0.1433565616607666,
|
|
"learning_rate": 0.0013699653023276715,
|
|
"loss": 2.999072265625,
|
|
"num_input_tokens_seen": 6957301760,
|
|
"step": 13270,
|
|
"train_runtime": 60266.4098,
|
|
"train_tokens_per_second": 115442.446
|
|
},
|
|
{
|
|
"epoch": 0.7186341621797127,
|
|
"grad_norm": 0.13696636259555817,
|
|
"learning_rate": 0.0013668623752178402,
|
|
"loss": 2.991237258911133,
|
|
"num_input_tokens_seen": 6962544640,
|
|
"step": 13280,
|
|
"train_runtime": 60311.5633,
|
|
"train_tokens_per_second": 115442.948
|
|
},
|
|
{
|
|
"epoch": 0.7191753023620769,
|
|
"grad_norm": 0.134785458445549,
|
|
"learning_rate": 0.0013637636707635485,
|
|
"loss": 3.002344512939453,
|
|
"num_input_tokens_seen": 6967787520,
|
|
"step": 13290,
|
|
"train_runtime": 60356.7015,
|
|
"train_tokens_per_second": 115443.478
|
|
},
|
|
{
|
|
"epoch": 0.7197164425444411,
|
|
"grad_norm": 0.13965272903442383,
|
|
"learning_rate": 0.0013606691984249973,
|
|
"loss": 2.9921356201171876,
|
|
"num_input_tokens_seen": 6973030400,
|
|
"step": 13300,
|
|
"train_runtime": 60401.8497,
|
|
"train_tokens_per_second": 115443.988
|
|
},
|
|
{
|
|
"epoch": 0.7202575827268054,
|
|
"grad_norm": 0.1369258165359497,
|
|
"learning_rate": 0.0013575789676494676,
|
|
"loss": 2.9890642166137695,
|
|
"num_input_tokens_seen": 6978273280,
|
|
"step": 13310,
|
|
"train_runtime": 60447.02,
|
|
"train_tokens_per_second": 115444.455
|
|
},
|
|
{
|
|
"epoch": 0.7207987229091696,
|
|
"grad_norm": 0.1361692249774933,
|
|
"learning_rate": 0.0013544929878712931,
|
|
"loss": 3.0067501068115234,
|
|
"num_input_tokens_seen": 6983516160,
|
|
"step": 13320,
|
|
"train_runtime": 60492.1531,
|
|
"train_tokens_per_second": 115444.993
|
|
},
|
|
{
|
|
"epoch": 0.7213398630915339,
|
|
"grad_norm": 0.13645213842391968,
|
|
"learning_rate": 0.0013514112685118279,
|
|
"loss": 2.99460506439209,
|
|
"num_input_tokens_seen": 6988759040,
|
|
"step": 13330,
|
|
"train_runtime": 60537.2701,
|
|
"train_tokens_per_second": 115445.56
|
|
},
|
|
{
|
|
"epoch": 0.7218810032738981,
|
|
"grad_norm": 0.13640370965003967,
|
|
"learning_rate": 0.0013483338189794198,
|
|
"loss": 3.0064407348632813,
|
|
"num_input_tokens_seen": 6994001920,
|
|
"step": 13340,
|
|
"train_runtime": 60582.4237,
|
|
"train_tokens_per_second": 115446.057
|
|
},
|
|
{
|
|
"epoch": 0.7224221434562623,
|
|
"grad_norm": 0.13847370445728302,
|
|
"learning_rate": 0.0013452606486693793,
|
|
"loss": 2.990389823913574,
|
|
"num_input_tokens_seen": 6999244800,
|
|
"step": 13350,
|
|
"train_runtime": 60627.5832,
|
|
"train_tokens_per_second": 115446.542
|
|
},
|
|
{
|
|
"epoch": 0.7229632836386266,
|
|
"grad_norm": 0.14565610885620117,
|
|
"learning_rate": 0.001342191766963955,
|
|
"loss": 2.9985219955444338,
|
|
"num_input_tokens_seen": 7004487680,
|
|
"step": 13360,
|
|
"train_runtime": 60676.4805,
|
|
"train_tokens_per_second": 115439.914
|
|
},
|
|
{
|
|
"epoch": 0.7235044238209908,
|
|
"grad_norm": 0.13583402335643768,
|
|
"learning_rate": 0.0013391271832323016,
|
|
"loss": 3.000563049316406,
|
|
"num_input_tokens_seen": 7009730560,
|
|
"step": 13370,
|
|
"train_runtime": 60721.6176,
|
|
"train_tokens_per_second": 115440.445
|
|
},
|
|
{
|
|
"epoch": 0.7240455640033551,
|
|
"grad_norm": 0.13164934515953064,
|
|
"learning_rate": 0.0013360669068304526,
|
|
"loss": 2.993762969970703,
|
|
"num_input_tokens_seen": 7014973440,
|
|
"step": 13380,
|
|
"train_runtime": 60766.7453,
|
|
"train_tokens_per_second": 115440.993
|
|
},
|
|
{
|
|
"epoch": 0.7245867041857194,
|
|
"grad_norm": 0.13159868121147156,
|
|
"learning_rate": 0.001333010947101289,
|
|
"loss": 2.9905731201171877,
|
|
"num_input_tokens_seen": 7020216320,
|
|
"step": 13390,
|
|
"train_runtime": 60811.8966,
|
|
"train_tokens_per_second": 115441.496
|
|
},
|
|
{
|
|
"epoch": 0.7251278443680835,
|
|
"grad_norm": 0.1346818059682846,
|
|
"learning_rate": 0.001329959313374518,
|
|
"loss": 3.002712631225586,
|
|
"num_input_tokens_seen": 7025459200,
|
|
"step": 13400,
|
|
"train_runtime": 60857.0386,
|
|
"train_tokens_per_second": 115442.016
|
|
},
|
|
{
|
|
"epoch": 0.7256689845504478,
|
|
"grad_norm": 0.1322467029094696,
|
|
"learning_rate": 0.0013269120149666353,
|
|
"loss": 2.9997226715087892,
|
|
"num_input_tokens_seen": 7030702080,
|
|
"step": 13410,
|
|
"train_runtime": 60902.1814,
|
|
"train_tokens_per_second": 115442.533
|
|
},
|
|
{
|
|
"epoch": 0.726210124732812,
|
|
"grad_norm": 0.13496780395507812,
|
|
"learning_rate": 0.0013238690611809029,
|
|
"loss": 3.00130615234375,
|
|
"num_input_tokens_seen": 7035944960,
|
|
"step": 13420,
|
|
"train_runtime": 60947.3114,
|
|
"train_tokens_per_second": 115443.074
|
|
},
|
|
{
|
|
"epoch": 0.7267512649151763,
|
|
"grad_norm": 0.13476966321468353,
|
|
"learning_rate": 0.0013208304613073197,
|
|
"loss": 2.9966285705566404,
|
|
"num_input_tokens_seen": 7041187840,
|
|
"step": 13430,
|
|
"train_runtime": 60992.4581,
|
|
"train_tokens_per_second": 115443.582
|
|
},
|
|
{
|
|
"epoch": 0.7272924050975406,
|
|
"grad_norm": 0.13049598038196564,
|
|
"learning_rate": 0.0013177962246225905,
|
|
"loss": 3.0012109756469725,
|
|
"num_input_tokens_seen": 7046430720,
|
|
"step": 13440,
|
|
"train_runtime": 61037.614,
|
|
"train_tokens_per_second": 115444.072
|
|
},
|
|
{
|
|
"epoch": 0.7278335452799047,
|
|
"grad_norm": 0.1286519169807434,
|
|
"learning_rate": 0.0013147663603901006,
|
|
"loss": 2.9998191833496093,
|
|
"num_input_tokens_seen": 7051673600,
|
|
"step": 13450,
|
|
"train_runtime": 61082.7378,
|
|
"train_tokens_per_second": 115444.622
|
|
},
|
|
{
|
|
"epoch": 0.728374685462269,
|
|
"grad_norm": 0.13326317071914673,
|
|
"learning_rate": 0.0013117408778598853,
|
|
"loss": 2.980904769897461,
|
|
"num_input_tokens_seen": 7056916480,
|
|
"step": 13460,
|
|
"train_runtime": 61127.8727,
|
|
"train_tokens_per_second": 115445.151
|
|
},
|
|
{
|
|
"epoch": 0.7289158256446332,
|
|
"grad_norm": 0.13441520929336548,
|
|
"learning_rate": 0.001308719786268604,
|
|
"loss": 3.0028324127197266,
|
|
"num_input_tokens_seen": 7062159360,
|
|
"step": 13470,
|
|
"train_runtime": 61173.0008,
|
|
"train_tokens_per_second": 115445.691
|
|
},
|
|
{
|
|
"epoch": 0.7294569658269975,
|
|
"grad_norm": 0.13160498440265656,
|
|
"learning_rate": 0.0013057030948395115,
|
|
"loss": 2.990519332885742,
|
|
"num_input_tokens_seen": 7067402240,
|
|
"step": 13480,
|
|
"train_runtime": 61218.1024,
|
|
"train_tokens_per_second": 115446.281
|
|
},
|
|
{
|
|
"epoch": 0.7299981060093618,
|
|
"grad_norm": 0.13775858283042908,
|
|
"learning_rate": 0.001302690812782427,
|
|
"loss": 3.006916046142578,
|
|
"num_input_tokens_seen": 7072645120,
|
|
"step": 13490,
|
|
"train_runtime": 61263.2414,
|
|
"train_tokens_per_second": 115446.799
|
|
},
|
|
{
|
|
"epoch": 0.7305392461917259,
|
|
"grad_norm": 0.13651160895824432,
|
|
"learning_rate": 0.0012996829492937084,
|
|
"loss": 3.000609016418457,
|
|
"num_input_tokens_seen": 7077888000,
|
|
"step": 13500,
|
|
"train_runtime": 61308.388,
|
|
"train_tokens_per_second": 115447.302
|
|
},
|
|
{
|
|
"epoch": 0.7305392461917259,
|
|
"eval_loss": 2.9539315700531006,
|
|
"eval_runtime": 1.9872,
|
|
"eval_samples_per_second": 251.611,
|
|
"eval_steps_per_second": 4.026,
|
|
"num_input_tokens_seen": 7077888000,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 0.7310803863740902,
|
|
"grad_norm": 0.1339404284954071,
|
|
"learning_rate": 0.001296679513556226,
|
|
"loss": 2.9880565643310546,
|
|
"num_input_tokens_seen": 7083130880,
|
|
"step": 13510,
|
|
"train_runtime": 61355.5007,
|
|
"train_tokens_per_second": 115444.105
|
|
},
|
|
{
|
|
"epoch": 0.7316215265564544,
|
|
"grad_norm": 0.1354180872440338,
|
|
"learning_rate": 0.0012936805147393292,
|
|
"loss": 2.9919578552246096,
|
|
"num_input_tokens_seen": 7088373760,
|
|
"step": 13520,
|
|
"train_runtime": 61400.641,
|
|
"train_tokens_per_second": 115444.622
|
|
},
|
|
{
|
|
"epoch": 0.7321626667388187,
|
|
"grad_norm": 0.13503789901733398,
|
|
"learning_rate": 0.0012906859619988247,
|
|
"loss": 2.99132080078125,
|
|
"num_input_tokens_seen": 7093616640,
|
|
"step": 13530,
|
|
"train_runtime": 61445.7513,
|
|
"train_tokens_per_second": 115445.193
|
|
},
|
|
{
|
|
"epoch": 0.732703806921183,
|
|
"grad_norm": 0.13498766720294952,
|
|
"learning_rate": 0.0012876958644769446,
|
|
"loss": 2.9880552291870117,
|
|
"num_input_tokens_seen": 7098859520,
|
|
"step": 13540,
|
|
"train_runtime": 61490.8935,
|
|
"train_tokens_per_second": 115445.704
|
|
},
|
|
{
|
|
"epoch": 0.7332449471035472,
|
|
"grad_norm": 0.13910213112831116,
|
|
"learning_rate": 0.0012847102313023185,
|
|
"loss": 2.996448516845703,
|
|
"num_input_tokens_seen": 7104102400,
|
|
"step": 13550,
|
|
"train_runtime": 61536.0395,
|
|
"train_tokens_per_second": 115446.208
|
|
},
|
|
{
|
|
"epoch": 0.7337860872859114,
|
|
"grad_norm": 0.13978877663612366,
|
|
"learning_rate": 0.0012817290715899468,
|
|
"loss": 2.9948408126831056,
|
|
"num_input_tokens_seen": 7109345280,
|
|
"step": 13560,
|
|
"train_runtime": 61581.1749,
|
|
"train_tokens_per_second": 115446.73
|
|
},
|
|
{
|
|
"epoch": 0.7343272274682756,
|
|
"grad_norm": 0.12929198145866394,
|
|
"learning_rate": 0.0012787523944411728,
|
|
"loss": 2.990352821350098,
|
|
"num_input_tokens_seen": 7114588160,
|
|
"step": 13570,
|
|
"train_runtime": 61626.3208,
|
|
"train_tokens_per_second": 115447.232
|
|
},
|
|
{
|
|
"epoch": 0.7348683676506399,
|
|
"grad_norm": 0.12884965538978577,
|
|
"learning_rate": 0.001275780208943655,
|
|
"loss": 2.9938125610351562,
|
|
"num_input_tokens_seen": 7119831040,
|
|
"step": 13580,
|
|
"train_runtime": 61671.467,
|
|
"train_tokens_per_second": 115447.733
|
|
},
|
|
{
|
|
"epoch": 0.7354095078330042,
|
|
"grad_norm": 0.13231875002384186,
|
|
"learning_rate": 0.0012728125241713403,
|
|
"loss": 2.9899265289306642,
|
|
"num_input_tokens_seen": 7125073920,
|
|
"step": 13590,
|
|
"train_runtime": 61716.5949,
|
|
"train_tokens_per_second": 115448.267
|
|
},
|
|
{
|
|
"epoch": 0.7359506480153684,
|
|
"grad_norm": 0.13000380992889404,
|
|
"learning_rate": 0.001269849349184432,
|
|
"loss": 2.997477722167969,
|
|
"num_input_tokens_seen": 7130316800,
|
|
"step": 13600,
|
|
"train_runtime": 61761.7628,
|
|
"train_tokens_per_second": 115448.725
|
|
},
|
|
{
|
|
"epoch": 0.7364917881977326,
|
|
"grad_norm": 0.13756293058395386,
|
|
"learning_rate": 0.0012668906930293686,
|
|
"loss": 2.9921825408935545,
|
|
"num_input_tokens_seen": 7135559680,
|
|
"step": 13610,
|
|
"train_runtime": 61806.8862,
|
|
"train_tokens_per_second": 115449.266
|
|
},
|
|
{
|
|
"epoch": 0.7370329283800968,
|
|
"grad_norm": 0.134871244430542,
|
|
"learning_rate": 0.0012639365647387907,
|
|
"loss": 2.991608238220215,
|
|
"num_input_tokens_seen": 7140802560,
|
|
"step": 13620,
|
|
"train_runtime": 61852.0353,
|
|
"train_tokens_per_second": 115449.759
|
|
},
|
|
{
|
|
"epoch": 0.7375740685624611,
|
|
"grad_norm": 0.13307398557662964,
|
|
"learning_rate": 0.0012609869733315145,
|
|
"loss": 2.994303512573242,
|
|
"num_input_tokens_seen": 7146045440,
|
|
"step": 13630,
|
|
"train_runtime": 61897.1942,
|
|
"train_tokens_per_second": 115450.232
|
|
},
|
|
{
|
|
"epoch": 0.7381152087448254,
|
|
"grad_norm": 0.1326708197593689,
|
|
"learning_rate": 0.0012580419278125086,
|
|
"loss": 2.9904823303222656,
|
|
"num_input_tokens_seen": 7151288320,
|
|
"step": 13640,
|
|
"train_runtime": 61942.3523,
|
|
"train_tokens_per_second": 115450.706
|
|
},
|
|
{
|
|
"epoch": 0.7386563489271896,
|
|
"grad_norm": 0.13145731389522552,
|
|
"learning_rate": 0.0012551014371728615,
|
|
"loss": 2.991769790649414,
|
|
"num_input_tokens_seen": 7156531200,
|
|
"step": 13650,
|
|
"train_runtime": 61987.491,
|
|
"train_tokens_per_second": 115451.216
|
|
},
|
|
{
|
|
"epoch": 0.7391974891095539,
|
|
"grad_norm": 0.13033975660800934,
|
|
"learning_rate": 0.0012521655103897556,
|
|
"loss": 2.9962963104248046,
|
|
"num_input_tokens_seen": 7161774080,
|
|
"step": 13660,
|
|
"train_runtime": 62032.6128,
|
|
"train_tokens_per_second": 115451.756
|
|
},
|
|
{
|
|
"epoch": 0.739738629291918,
|
|
"grad_norm": 0.13624544441699982,
|
|
"learning_rate": 0.0012492341564264394,
|
|
"loss": 2.9916343688964844,
|
|
"num_input_tokens_seen": 7167016960,
|
|
"step": 13670,
|
|
"train_runtime": 62077.7496,
|
|
"train_tokens_per_second": 115452.268
|
|
},
|
|
{
|
|
"epoch": 0.7402797694742823,
|
|
"grad_norm": 0.12694226205348969,
|
|
"learning_rate": 0.0012463073842322032,
|
|
"loss": 2.9956790924072267,
|
|
"num_input_tokens_seen": 7172259840,
|
|
"step": 13680,
|
|
"train_runtime": 62122.8901,
|
|
"train_tokens_per_second": 115452.772
|
|
},
|
|
{
|
|
"epoch": 0.7408209096566466,
|
|
"grad_norm": 0.14218159019947052,
|
|
"learning_rate": 0.0012433852027423462,
|
|
"loss": 2.9924745559692383,
|
|
"num_input_tokens_seen": 7177502720,
|
|
"step": 13690,
|
|
"train_runtime": 62168.0831,
|
|
"train_tokens_per_second": 115453.177
|
|
},
|
|
{
|
|
"epoch": 0.7413620498390108,
|
|
"grad_norm": 0.13965629041194916,
|
|
"learning_rate": 0.0012404676208781556,
|
|
"loss": 2.9898683547973635,
|
|
"num_input_tokens_seen": 7182745600,
|
|
"step": 13700,
|
|
"train_runtime": 62213.3158,
|
|
"train_tokens_per_second": 115453.509
|
|
},
|
|
{
|
|
"epoch": 0.7419031900213751,
|
|
"grad_norm": 0.13439473509788513,
|
|
"learning_rate": 0.0012375546475468736,
|
|
"loss": 2.99302978515625,
|
|
"num_input_tokens_seen": 7187988480,
|
|
"step": 13710,
|
|
"train_runtime": 62258.5518,
|
|
"train_tokens_per_second": 115453.834
|
|
},
|
|
{
|
|
"epoch": 0.7424443302037392,
|
|
"grad_norm": 0.13322672247886658,
|
|
"learning_rate": 0.0012346462916416746,
|
|
"loss": 2.9867807388305665,
|
|
"num_input_tokens_seen": 7193231360,
|
|
"step": 13720,
|
|
"train_runtime": 62303.7184,
|
|
"train_tokens_per_second": 115454.287
|
|
},
|
|
{
|
|
"epoch": 0.7429854703861035,
|
|
"grad_norm": 0.13469451665878296,
|
|
"learning_rate": 0.001231742562041635,
|
|
"loss": 2.9933212280273436,
|
|
"num_input_tokens_seen": 7198474240,
|
|
"step": 13730,
|
|
"train_runtime": 62348.8665,
|
|
"train_tokens_per_second": 115454.773
|
|
},
|
|
{
|
|
"epoch": 0.7435266105684678,
|
|
"grad_norm": 0.1325179785490036,
|
|
"learning_rate": 0.001228843467611706,
|
|
"loss": 2.9945892333984374,
|
|
"num_input_tokens_seen": 7203717120,
|
|
"step": 13740,
|
|
"train_runtime": 62397.9384,
|
|
"train_tokens_per_second": 115447.999
|
|
},
|
|
{
|
|
"epoch": 0.744067750750832,
|
|
"grad_norm": 0.1386304348707199,
|
|
"learning_rate": 0.0012259490172026927,
|
|
"loss": 2.989889907836914,
|
|
"num_input_tokens_seen": 7208960000,
|
|
"step": 13750,
|
|
"train_runtime": 62443.1321,
|
|
"train_tokens_per_second": 115448.405
|
|
},
|
|
{
|
|
"epoch": 0.7446088909331963,
|
|
"grad_norm": 0.13061648607254028,
|
|
"learning_rate": 0.0012230592196512174,
|
|
"loss": 2.986536407470703,
|
|
"num_input_tokens_seen": 7214202880,
|
|
"step": 13760,
|
|
"train_runtime": 62488.3343,
|
|
"train_tokens_per_second": 115448.795
|
|
},
|
|
{
|
|
"epoch": 0.7451500311155604,
|
|
"grad_norm": 0.12978407740592957,
|
|
"learning_rate": 0.0012201740837796992,
|
|
"loss": 2.9931753158569334,
|
|
"num_input_tokens_seen": 7219445760,
|
|
"step": 13770,
|
|
"train_runtime": 62533.544,
|
|
"train_tokens_per_second": 115449.17
|
|
},
|
|
{
|
|
"epoch": 0.7456911712979247,
|
|
"grad_norm": 0.12974348664283752,
|
|
"learning_rate": 0.0012172936183963243,
|
|
"loss": 2.98385009765625,
|
|
"num_input_tokens_seen": 7224688640,
|
|
"step": 13780,
|
|
"train_runtime": 62578.7317,
|
|
"train_tokens_per_second": 115449.586
|
|
},
|
|
{
|
|
"epoch": 0.746232311480289,
|
|
"grad_norm": 0.1361524909734726,
|
|
"learning_rate": 0.0012144178322950217,
|
|
"loss": 2.996071624755859,
|
|
"num_input_tokens_seen": 7229931520,
|
|
"step": 13790,
|
|
"train_runtime": 62623.945,
|
|
"train_tokens_per_second": 115449.953
|
|
},
|
|
{
|
|
"epoch": 0.7467734516626532,
|
|
"grad_norm": 0.12753413617610931,
|
|
"learning_rate": 0.0012115467342554353,
|
|
"loss": 2.989743232727051,
|
|
"num_input_tokens_seen": 7235174400,
|
|
"step": 13800,
|
|
"train_runtime": 62669.1454,
|
|
"train_tokens_per_second": 115450.344
|
|
},
|
|
{
|
|
"epoch": 0.7473145918450175,
|
|
"grad_norm": 0.1313578486442566,
|
|
"learning_rate": 0.0012086803330428942,
|
|
"loss": 2.9922863006591798,
|
|
"num_input_tokens_seen": 7240417280,
|
|
"step": 13810,
|
|
"train_runtime": 62714.3608,
|
|
"train_tokens_per_second": 115450.707
|
|
},
|
|
{
|
|
"epoch": 0.7478557320273816,
|
|
"grad_norm": 0.13242116570472717,
|
|
"learning_rate": 0.0012058186374083889,
|
|
"loss": 2.9887691497802735,
|
|
"num_input_tokens_seen": 7245660160,
|
|
"step": 13820,
|
|
"train_runtime": 62759.5959,
|
|
"train_tokens_per_second": 115451.033
|
|
},
|
|
{
|
|
"epoch": 0.7483968722097459,
|
|
"grad_norm": 0.1344103366136551,
|
|
"learning_rate": 0.0012029616560885453,
|
|
"loss": 2.989380645751953,
|
|
"num_input_tokens_seen": 7250903040,
|
|
"step": 13830,
|
|
"train_runtime": 62804.8179,
|
|
"train_tokens_per_second": 115451.382
|
|
},
|
|
{
|
|
"epoch": 0.7489380123921102,
|
|
"grad_norm": 0.13286016881465912,
|
|
"learning_rate": 0.001200109397805595,
|
|
"loss": 2.9872367858886717,
|
|
"num_input_tokens_seen": 7256145920,
|
|
"step": 13840,
|
|
"train_runtime": 62850.0273,
|
|
"train_tokens_per_second": 115451.754
|
|
},
|
|
{
|
|
"epoch": 0.7494791525744744,
|
|
"grad_norm": 0.13758355379104614,
|
|
"learning_rate": 0.0011972618712673526,
|
|
"loss": 2.9894548416137696,
|
|
"num_input_tokens_seen": 7261388800,
|
|
"step": 13850,
|
|
"train_runtime": 62895.244,
|
|
"train_tokens_per_second": 115452.113
|
|
},
|
|
{
|
|
"epoch": 0.7500202927568387,
|
|
"grad_norm": 0.13310939073562622,
|
|
"learning_rate": 0.0011944190851671855,
|
|
"loss": 2.980154800415039,
|
|
"num_input_tokens_seen": 7266631680,
|
|
"step": 13860,
|
|
"train_runtime": 62940.4589,
|
|
"train_tokens_per_second": 115452.474
|
|
},
|
|
{
|
|
"epoch": 0.7505614329392029,
|
|
"grad_norm": 0.13724195957183838,
|
|
"learning_rate": 0.0011915810481839884,
|
|
"loss": 2.9957542419433594,
|
|
"num_input_tokens_seen": 7271874560,
|
|
"step": 13870,
|
|
"train_runtime": 62985.6674,
|
|
"train_tokens_per_second": 115452.846
|
|
},
|
|
{
|
|
"epoch": 0.7511025731215671,
|
|
"grad_norm": 0.13776428997516632,
|
|
"learning_rate": 0.0011887477689821579,
|
|
"loss": 2.9919281005859375,
|
|
"num_input_tokens_seen": 7277117440,
|
|
"step": 13880,
|
|
"train_runtime": 63030.8734,
|
|
"train_tokens_per_second": 115453.222
|
|
},
|
|
{
|
|
"epoch": 0.7516437133039314,
|
|
"grad_norm": 0.13441872596740723,
|
|
"learning_rate": 0.001185919256211564,
|
|
"loss": 2.9903282165527343,
|
|
"num_input_tokens_seen": 7282360320,
|
|
"step": 13890,
|
|
"train_runtime": 63076.0694,
|
|
"train_tokens_per_second": 115453.616
|
|
},
|
|
{
|
|
"epoch": 0.7521848534862956,
|
|
"grad_norm": 0.14160217344760895,
|
|
"learning_rate": 0.001183095518507527,
|
|
"loss": 2.9950998306274412,
|
|
"num_input_tokens_seen": 7287603200,
|
|
"step": 13900,
|
|
"train_runtime": 63121.2819,
|
|
"train_tokens_per_second": 115453.98
|
|
},
|
|
{
|
|
"epoch": 0.7527259936686599,
|
|
"grad_norm": 0.13321471214294434,
|
|
"learning_rate": 0.001180276564490789,
|
|
"loss": 2.9867202758789064,
|
|
"num_input_tokens_seen": 7292846080,
|
|
"step": 13910,
|
|
"train_runtime": 63166.4818,
|
|
"train_tokens_per_second": 115454.366
|
|
},
|
|
{
|
|
"epoch": 0.7532671338510241,
|
|
"grad_norm": 0.13260754942893982,
|
|
"learning_rate": 0.001177462402767485,
|
|
"loss": 2.9936323165893555,
|
|
"num_input_tokens_seen": 7298088960,
|
|
"step": 13920,
|
|
"train_runtime": 63211.6992,
|
|
"train_tokens_per_second": 115454.719
|
|
},
|
|
{
|
|
"epoch": 0.7538082740333883,
|
|
"grad_norm": 0.13385504484176636,
|
|
"learning_rate": 0.0011746530419291235,
|
|
"loss": 2.9826412200927734,
|
|
"num_input_tokens_seen": 7303331840,
|
|
"step": 13930,
|
|
"train_runtime": 63256.8908,
|
|
"train_tokens_per_second": 115455.119
|
|
},
|
|
{
|
|
"epoch": 0.7543494142157526,
|
|
"grad_norm": 0.1354595571756363,
|
|
"learning_rate": 0.0011718484905525526,
|
|
"loss": 2.9921710968017576,
|
|
"num_input_tokens_seen": 7308574720,
|
|
"step": 13940,
|
|
"train_runtime": 63302.0738,
|
|
"train_tokens_per_second": 115455.534
|
|
},
|
|
{
|
|
"epoch": 0.7548905543981168,
|
|
"grad_norm": 0.13242025673389435,
|
|
"learning_rate": 0.0011690487571999377,
|
|
"loss": 2.9915000915527346,
|
|
"num_input_tokens_seen": 7313817600,
|
|
"step": 13950,
|
|
"train_runtime": 63347.2678,
|
|
"train_tokens_per_second": 115455.928
|
|
},
|
|
{
|
|
"epoch": 0.7554316945804811,
|
|
"grad_norm": 0.1303345412015915,
|
|
"learning_rate": 0.0011662538504187375,
|
|
"loss": 2.992412567138672,
|
|
"num_input_tokens_seen": 7319060480,
|
|
"step": 13960,
|
|
"train_runtime": 63392.4687,
|
|
"train_tokens_per_second": 115456.309
|
|
},
|
|
{
|
|
"epoch": 0.7559728347628453,
|
|
"grad_norm": 0.1336052417755127,
|
|
"learning_rate": 0.0011634637787416738,
|
|
"loss": 2.9856544494628907,
|
|
"num_input_tokens_seen": 7324303360,
|
|
"step": 13970,
|
|
"train_runtime": 63437.6413,
|
|
"train_tokens_per_second": 115456.742
|
|
},
|
|
{
|
|
"epoch": 0.7565139749452096,
|
|
"grad_norm": 0.13160865008831024,
|
|
"learning_rate": 0.0011606785506867066,
|
|
"loss": 2.990740966796875,
|
|
"num_input_tokens_seen": 7329546240,
|
|
"step": 13980,
|
|
"train_runtime": 63482.8312,
|
|
"train_tokens_per_second": 115457.142
|
|
},
|
|
{
|
|
"epoch": 0.7570551151275738,
|
|
"grad_norm": 0.132036030292511,
|
|
"learning_rate": 0.0011578981747570086,
|
|
"loss": 2.9869890213012695,
|
|
"num_input_tokens_seen": 7334789120,
|
|
"step": 13990,
|
|
"train_runtime": 63528.0172,
|
|
"train_tokens_per_second": 115457.548
|
|
},
|
|
{
|
|
"epoch": 0.757596255309938,
|
|
"grad_norm": 0.13680653274059296,
|
|
"learning_rate": 0.0011551226594409406,
|
|
"loss": 2.9875946044921875,
|
|
"num_input_tokens_seen": 7340032000,
|
|
"step": 14000,
|
|
"train_runtime": 63573.1915,
|
|
"train_tokens_per_second": 115457.976
|
|
},
|
|
{
|
|
"epoch": 0.757596255309938,
|
|
"eval_loss": 2.948127031326294,
|
|
"eval_runtime": 1.9851,
|
|
"eval_samples_per_second": 251.872,
|
|
"eval_steps_per_second": 4.03,
|
|
"num_input_tokens_seen": 7340032000,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 0.7581373954923023,
|
|
"grad_norm": 0.1333727240562439,
|
|
"learning_rate": 0.0011523520132120217,
|
|
"loss": 2.9936281204223634,
|
|
"num_input_tokens_seen": 7345274880,
|
|
"step": 14010,
|
|
"train_runtime": 63622.81,
|
|
"train_tokens_per_second": 115450.337
|
|
},
|
|
{
|
|
"epoch": 0.7586785356746665,
|
|
"grad_norm": 0.13183613121509552,
|
|
"learning_rate": 0.0011495862445289092,
|
|
"loss": 2.9838493347167967,
|
|
"num_input_tokens_seen": 7350517760,
|
|
"step": 14020,
|
|
"train_runtime": 63667.9625,
|
|
"train_tokens_per_second": 115450.809
|
|
},
|
|
{
|
|
"epoch": 0.7592196758570308,
|
|
"grad_norm": 0.13663019239902496,
|
|
"learning_rate": 0.0011468253618353661,
|
|
"loss": 2.9881641387939455,
|
|
"num_input_tokens_seen": 7355760640,
|
|
"step": 14030,
|
|
"train_runtime": 63713.121,
|
|
"train_tokens_per_second": 115451.269
|
|
},
|
|
{
|
|
"epoch": 0.759760816039395,
|
|
"grad_norm": 0.1334005743265152,
|
|
"learning_rate": 0.0011440693735602413,
|
|
"loss": 2.9827747344970703,
|
|
"num_input_tokens_seen": 7361003520,
|
|
"step": 14040,
|
|
"train_runtime": 63758.2642,
|
|
"train_tokens_per_second": 115451.755
|
|
},
|
|
{
|
|
"epoch": 0.7603019562217592,
|
|
"grad_norm": 0.1363915055990219,
|
|
"learning_rate": 0.0011413182881174402,
|
|
"loss": 2.976375961303711,
|
|
"num_input_tokens_seen": 7366246400,
|
|
"step": 14050,
|
|
"train_runtime": 63803.3929,
|
|
"train_tokens_per_second": 115452.268
|
|
},
|
|
{
|
|
"epoch": 0.7608430964041235,
|
|
"grad_norm": 0.13721340894699097,
|
|
"learning_rate": 0.0011385721139058986,
|
|
"loss": 3.0018871307373045,
|
|
"num_input_tokens_seen": 7371489280,
|
|
"step": 14060,
|
|
"train_runtime": 63848.5329,
|
|
"train_tokens_per_second": 115452.759
|
|
},
|
|
{
|
|
"epoch": 0.7613842365864877,
|
|
"grad_norm": 0.13170303404331207,
|
|
"learning_rate": 0.0011358308593095617,
|
|
"loss": 2.9844949722290037,
|
|
"num_input_tokens_seen": 7376732160,
|
|
"step": 14070,
|
|
"train_runtime": 63893.6665,
|
|
"train_tokens_per_second": 115453.261
|
|
},
|
|
{
|
|
"epoch": 0.761925376768852,
|
|
"grad_norm": 0.13645039498806,
|
|
"learning_rate": 0.0011330945326973533,
|
|
"loss": 2.9850318908691404,
|
|
"num_input_tokens_seen": 7381975040,
|
|
"step": 14080,
|
|
"train_runtime": 63938.7823,
|
|
"train_tokens_per_second": 115453.795
|
|
},
|
|
{
|
|
"epoch": 0.7624665169512163,
|
|
"grad_norm": 0.1297563761472702,
|
|
"learning_rate": 0.0011303631424231526,
|
|
"loss": 2.9895225524902345,
|
|
"num_input_tokens_seen": 7387217920,
|
|
"step": 14090,
|
|
"train_runtime": 63983.9157,
|
|
"train_tokens_per_second": 115454.296
|
|
},
|
|
{
|
|
"epoch": 0.7630076571335804,
|
|
"grad_norm": 0.13698382675647736,
|
|
"learning_rate": 0.0011276366968257677,
|
|
"loss": 2.9852466583251953,
|
|
"num_input_tokens_seen": 7392460800,
|
|
"step": 14100,
|
|
"train_runtime": 64029.0446,
|
|
"train_tokens_per_second": 115454.804
|
|
},
|
|
{
|
|
"epoch": 0.7635487973159447,
|
|
"grad_norm": 0.12868466973304749,
|
|
"learning_rate": 0.001124915204228913,
|
|
"loss": 2.982627105712891,
|
|
"num_input_tokens_seen": 7397703680,
|
|
"step": 14110,
|
|
"train_runtime": 64074.169,
|
|
"train_tokens_per_second": 115455.32
|
|
},
|
|
{
|
|
"epoch": 0.7640899374983089,
|
|
"grad_norm": 0.13413524627685547,
|
|
"learning_rate": 0.0011221986729411787,
|
|
"loss": 2.982726287841797,
|
|
"num_input_tokens_seen": 7402946560,
|
|
"step": 14120,
|
|
"train_runtime": 64123.0569,
|
|
"train_tokens_per_second": 115449.059
|
|
},
|
|
{
|
|
"epoch": 0.7646310776806732,
|
|
"grad_norm": 0.13302487134933472,
|
|
"learning_rate": 0.0011194871112560113,
|
|
"loss": 2.9999317169189452,
|
|
"num_input_tokens_seen": 7408189440,
|
|
"step": 14130,
|
|
"train_runtime": 64168.1991,
|
|
"train_tokens_per_second": 115449.546
|
|
},
|
|
{
|
|
"epoch": 0.7651722178630375,
|
|
"grad_norm": 0.13595032691955566,
|
|
"learning_rate": 0.001116780527451682,
|
|
"loss": 2.986163330078125,
|
|
"num_input_tokens_seen": 7413432320,
|
|
"step": 14140,
|
|
"train_runtime": 64213.3563,
|
|
"train_tokens_per_second": 115450.005
|
|
},
|
|
{
|
|
"epoch": 0.7657133580454016,
|
|
"grad_norm": 0.12740519642829895,
|
|
"learning_rate": 0.0011140789297912688,
|
|
"loss": 2.9861852645874025,
|
|
"num_input_tokens_seen": 7418675200,
|
|
"step": 14150,
|
|
"train_runtime": 64258.4713,
|
|
"train_tokens_per_second": 115450.54
|
|
},
|
|
{
|
|
"epoch": 0.7662544982277659,
|
|
"grad_norm": 0.13032016158103943,
|
|
"learning_rate": 0.0011113823265226242,
|
|
"loss": 2.9914901733398436,
|
|
"num_input_tokens_seen": 7423918080,
|
|
"step": 14160,
|
|
"train_runtime": 64303.6051,
|
|
"train_tokens_per_second": 115451.04
|
|
},
|
|
{
|
|
"epoch": 0.7667956384101301,
|
|
"grad_norm": 0.12856240570545197,
|
|
"learning_rate": 0.0011086907258783525,
|
|
"loss": 2.99139404296875,
|
|
"num_input_tokens_seen": 7429160960,
|
|
"step": 14170,
|
|
"train_runtime": 64348.7292,
|
|
"train_tokens_per_second": 115451.557
|
|
},
|
|
{
|
|
"epoch": 0.7673367785924944,
|
|
"grad_norm": 0.1300676167011261,
|
|
"learning_rate": 0.001106004136075789,
|
|
"loss": 2.980759620666504,
|
|
"num_input_tokens_seen": 7434403840,
|
|
"step": 14180,
|
|
"train_runtime": 64393.8763,
|
|
"train_tokens_per_second": 115452.032
|
|
},
|
|
{
|
|
"epoch": 0.7678779187748587,
|
|
"grad_norm": 0.13340207934379578,
|
|
"learning_rate": 0.0011033225653169676,
|
|
"loss": 2.979547882080078,
|
|
"num_input_tokens_seen": 7439646720,
|
|
"step": 14190,
|
|
"train_runtime": 64439.0196,
|
|
"train_tokens_per_second": 115452.513
|
|
},
|
|
{
|
|
"epoch": 0.7684190589572228,
|
|
"grad_norm": 0.1270836591720581,
|
|
"learning_rate": 0.0011006460217886007,
|
|
"loss": 2.9818099975585937,
|
|
"num_input_tokens_seen": 7444889600,
|
|
"step": 14200,
|
|
"train_runtime": 64484.1553,
|
|
"train_tokens_per_second": 115453.006
|
|
},
|
|
{
|
|
"epoch": 0.7689601991395871,
|
|
"grad_norm": 0.1316118985414505,
|
|
"learning_rate": 0.001097974513662052,
|
|
"loss": 2.9830299377441407,
|
|
"num_input_tokens_seen": 7450132480,
|
|
"step": 14210,
|
|
"train_runtime": 64529.2695,
|
|
"train_tokens_per_second": 115453.538
|
|
},
|
|
{
|
|
"epoch": 0.7695013393219513,
|
|
"grad_norm": 0.13914352655410767,
|
|
"learning_rate": 0.0010953080490933129,
|
|
"loss": 2.9925983428955076,
|
|
"num_input_tokens_seen": 7455375360,
|
|
"step": 14220,
|
|
"train_runtime": 64574.3994,
|
|
"train_tokens_per_second": 115454.041
|
|
},
|
|
{
|
|
"epoch": 0.7700424795043156,
|
|
"grad_norm": 0.13164092600345612,
|
|
"learning_rate": 0.0010926466362229787,
|
|
"loss": 2.9863054275512697,
|
|
"num_input_tokens_seen": 7460618240,
|
|
"step": 14230,
|
|
"train_runtime": 64619.5117,
|
|
"train_tokens_per_second": 115454.575
|
|
},
|
|
{
|
|
"epoch": 0.7705836196866799,
|
|
"grad_norm": 0.1346326619386673,
|
|
"learning_rate": 0.001089990283176218,
|
|
"loss": 2.9905773162841798,
|
|
"num_input_tokens_seen": 7465861120,
|
|
"step": 14240,
|
|
"train_runtime": 64664.6395,
|
|
"train_tokens_per_second": 115455.08
|
|
},
|
|
{
|
|
"epoch": 0.771124759869044,
|
|
"grad_norm": 0.1283544898033142,
|
|
"learning_rate": 0.0010873389980627568,
|
|
"loss": 2.9964345932006835,
|
|
"num_input_tokens_seen": 7471104000,
|
|
"step": 14250,
|
|
"train_runtime": 64709.798,
|
|
"train_tokens_per_second": 115455.53
|
|
},
|
|
{
|
|
"epoch": 0.7716659000514083,
|
|
"grad_norm": 0.13457883894443512,
|
|
"learning_rate": 0.0010846927889768454,
|
|
"loss": 2.9865245819091797,
|
|
"num_input_tokens_seen": 7476346880,
|
|
"step": 14260,
|
|
"train_runtime": 64754.9357,
|
|
"train_tokens_per_second": 115456.016
|
|
},
|
|
{
|
|
"epoch": 0.7722070402337725,
|
|
"grad_norm": 0.13008961081504822,
|
|
"learning_rate": 0.0010820516639972377,
|
|
"loss": 2.9932941436767577,
|
|
"num_input_tokens_seen": 7481589760,
|
|
"step": 14270,
|
|
"train_runtime": 64800.0796,
|
|
"train_tokens_per_second": 115456.49
|
|
},
|
|
{
|
|
"epoch": 0.7727481804161368,
|
|
"grad_norm": 0.13576596975326538,
|
|
"learning_rate": 0.0010794156311871674,
|
|
"loss": 2.975057601928711,
|
|
"num_input_tokens_seen": 7486832640,
|
|
"step": 14280,
|
|
"train_runtime": 64845.2255,
|
|
"train_tokens_per_second": 115456.96
|
|
},
|
|
{
|
|
"epoch": 0.7732893205985011,
|
|
"grad_norm": 0.13501375913619995,
|
|
"learning_rate": 0.0010767846985943225,
|
|
"loss": 2.983927536010742,
|
|
"num_input_tokens_seen": 7492075520,
|
|
"step": 14290,
|
|
"train_runtime": 64890.3622,
|
|
"train_tokens_per_second": 115457.446
|
|
},
|
|
{
|
|
"epoch": 0.7738304607808653,
|
|
"grad_norm": 0.1284349411725998,
|
|
"learning_rate": 0.0010741588742508182,
|
|
"loss": 2.994318199157715,
|
|
"num_input_tokens_seen": 7497318400,
|
|
"step": 14300,
|
|
"train_runtime": 64935.5045,
|
|
"train_tokens_per_second": 115457.922
|
|
},
|
|
{
|
|
"epoch": 0.7743716009632295,
|
|
"grad_norm": 0.13406863808631897,
|
|
"learning_rate": 0.0010715381661731754,
|
|
"loss": 2.9812191009521483,
|
|
"num_input_tokens_seen": 7502561280,
|
|
"step": 14310,
|
|
"train_runtime": 64980.6813,
|
|
"train_tokens_per_second": 115458.335
|
|
},
|
|
{
|
|
"epoch": 0.7749127411455937,
|
|
"grad_norm": 0.1352129429578781,
|
|
"learning_rate": 0.0010689225823622948,
|
|
"loss": 2.9968055725097655,
|
|
"num_input_tokens_seen": 7507804160,
|
|
"step": 14320,
|
|
"train_runtime": 65025.8721,
|
|
"train_tokens_per_second": 115458.723
|
|
},
|
|
{
|
|
"epoch": 0.775453881327958,
|
|
"grad_norm": 0.13681240379810333,
|
|
"learning_rate": 0.0010663121308034337,
|
|
"loss": 2.984090805053711,
|
|
"num_input_tokens_seen": 7513047040,
|
|
"step": 14330,
|
|
"train_runtime": 65071.0195,
|
|
"train_tokens_per_second": 115459.188
|
|
},
|
|
{
|
|
"epoch": 0.7759950215103223,
|
|
"grad_norm": 0.12757869064807892,
|
|
"learning_rate": 0.0010637068194661817,
|
|
"loss": 2.9872867584228517,
|
|
"num_input_tokens_seen": 7518289920,
|
|
"step": 14340,
|
|
"train_runtime": 65116.166,
|
|
"train_tokens_per_second": 115459.653
|
|
},
|
|
{
|
|
"epoch": 0.7765361616926865,
|
|
"grad_norm": 0.1297658532857895,
|
|
"learning_rate": 0.0010611066563044331,
|
|
"loss": 2.987481689453125,
|
|
"num_input_tokens_seen": 7523532800,
|
|
"step": 14350,
|
|
"train_runtime": 65161.3132,
|
|
"train_tokens_per_second": 115460.116
|
|
},
|
|
{
|
|
"epoch": 0.7770773018750508,
|
|
"grad_norm": 0.13100814819335938,
|
|
"learning_rate": 0.0010585116492563672,
|
|
"loss": 2.984407424926758,
|
|
"num_input_tokens_seen": 7528775680,
|
|
"step": 14360,
|
|
"train_runtime": 65206.4518,
|
|
"train_tokens_per_second": 115460.594
|
|
},
|
|
{
|
|
"epoch": 0.7776184420574149,
|
|
"grad_norm": 0.13708344101905823,
|
|
"learning_rate": 0.0010559218062444215,
|
|
"loss": 2.9803342819213867,
|
|
"num_input_tokens_seen": 7534018560,
|
|
"step": 14370,
|
|
"train_runtime": 65251.6135,
|
|
"train_tokens_per_second": 115461.031
|
|
},
|
|
{
|
|
"epoch": 0.7781595822397792,
|
|
"grad_norm": 0.13270463049411774,
|
|
"learning_rate": 0.001053337135175266,
|
|
"loss": 2.9783748626708983,
|
|
"num_input_tokens_seen": 7539261440,
|
|
"step": 14380,
|
|
"train_runtime": 65296.782,
|
|
"train_tokens_per_second": 115461.455
|
|
},
|
|
{
|
|
"epoch": 0.7787007224221435,
|
|
"grad_norm": 0.1348678022623062,
|
|
"learning_rate": 0.001050757643939784,
|
|
"loss": 2.985927963256836,
|
|
"num_input_tokens_seen": 7544504320,
|
|
"step": 14390,
|
|
"train_runtime": 65341.9205,
|
|
"train_tokens_per_second": 115461.931
|
|
},
|
|
{
|
|
"epoch": 0.7792418626045077,
|
|
"grad_norm": 0.1359061747789383,
|
|
"learning_rate": 0.0010481833404130433,
|
|
"loss": 2.977262496948242,
|
|
"num_input_tokens_seen": 7549747200,
|
|
"step": 14400,
|
|
"train_runtime": 65387.0473,
|
|
"train_tokens_per_second": 115462.427
|
|
},
|
|
{
|
|
"epoch": 0.779783002786872,
|
|
"grad_norm": 0.13489292562007904,
|
|
"learning_rate": 0.0010456142324542742,
|
|
"loss": 2.9768039703369142,
|
|
"num_input_tokens_seen": 7554990080,
|
|
"step": 14410,
|
|
"train_runtime": 65432.1998,
|
|
"train_tokens_per_second": 115462.878
|
|
},
|
|
{
|
|
"epoch": 0.7803241429692361,
|
|
"grad_norm": 0.13529463112354279,
|
|
"learning_rate": 0.001043050327906844,
|
|
"loss": 2.992759132385254,
|
|
"num_input_tokens_seen": 7560232960,
|
|
"step": 14420,
|
|
"train_runtime": 65477.3624,
|
|
"train_tokens_per_second": 115463.31
|
|
},
|
|
{
|
|
"epoch": 0.7808652831516004,
|
|
"grad_norm": 0.13989658653736115,
|
|
"learning_rate": 0.0010404916345982372,
|
|
"loss": 2.9861518859863283,
|
|
"num_input_tokens_seen": 7565475840,
|
|
"step": 14430,
|
|
"train_runtime": 65522.5287,
|
|
"train_tokens_per_second": 115463.734
|
|
},
|
|
{
|
|
"epoch": 0.7814064233339647,
|
|
"grad_norm": 0.13800008594989777,
|
|
"learning_rate": 0.0010379381603400246,
|
|
"loss": 2.983747100830078,
|
|
"num_input_tokens_seen": 7570718720,
|
|
"step": 14440,
|
|
"train_runtime": 65567.6879,
|
|
"train_tokens_per_second": 115464.171
|
|
},
|
|
{
|
|
"epoch": 0.7819475635163289,
|
|
"grad_norm": 0.14410988986492157,
|
|
"learning_rate": 0.0010353899129278482,
|
|
"loss": 2.986704444885254,
|
|
"num_input_tokens_seen": 7575961600,
|
|
"step": 14450,
|
|
"train_runtime": 65612.8209,
|
|
"train_tokens_per_second": 115464.653
|
|
},
|
|
{
|
|
"epoch": 0.7824887036986932,
|
|
"grad_norm": 0.13409604132175446,
|
|
"learning_rate": 0.0010328469001413872,
|
|
"loss": 2.9869441986083984,
|
|
"num_input_tokens_seen": 7581204480,
|
|
"step": 14460,
|
|
"train_runtime": 65657.9605,
|
|
"train_tokens_per_second": 115465.123
|
|
},
|
|
{
|
|
"epoch": 0.7830298438810573,
|
|
"grad_norm": 0.13234242796897888,
|
|
"learning_rate": 0.0010303091297443453,
|
|
"loss": 2.9890289306640625,
|
|
"num_input_tokens_seen": 7586447360,
|
|
"step": 14470,
|
|
"train_runtime": 65703.0949,
|
|
"train_tokens_per_second": 115465.601
|
|
},
|
|
{
|
|
"epoch": 0.7835709840634216,
|
|
"grad_norm": 0.13398636877536774,
|
|
"learning_rate": 0.001027776609484418,
|
|
"loss": 2.9826473236083983,
|
|
"num_input_tokens_seen": 7591690240,
|
|
"step": 14480,
|
|
"train_runtime": 65748.2396,
|
|
"train_tokens_per_second": 115466.061
|
|
},
|
|
{
|
|
"epoch": 0.7841121242457859,
|
|
"grad_norm": 0.13305144011974335,
|
|
"learning_rate": 0.0010252493470932719,
|
|
"loss": 2.9864757537841795,
|
|
"num_input_tokens_seen": 7596933120,
|
|
"step": 14490,
|
|
"train_runtime": 65793.3795,
|
|
"train_tokens_per_second": 115466.528
|
|
},
|
|
{
|
|
"epoch": 0.7846532644281501,
|
|
"grad_norm": 0.13172990083694458,
|
|
"learning_rate": 0.0010227273502865237,
|
|
"loss": 2.9912540435791017,
|
|
"num_input_tokens_seen": 7602176000,
|
|
"step": 14500,
|
|
"train_runtime": 65842.395,
|
|
"train_tokens_per_second": 115460.199
|
|
},
|
|
{
|
|
"epoch": 0.7846532644281501,
|
|
"eval_loss": 2.9429469108581543,
|
|
"eval_runtime": 1.9893,
|
|
"eval_samples_per_second": 251.343,
|
|
"eval_steps_per_second": 4.021,
|
|
"num_input_tokens_seen": 7602176000,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 0.7851944046105144,
|
|
"grad_norm": 0.13013876974582672,
|
|
"learning_rate": 0.0010202106267637142,
|
|
"loss": 2.9870655059814455,
|
|
"num_input_tokens_seen": 7607418880,
|
|
"step": 14510,
|
|
"train_runtime": 65889.5594,
|
|
"train_tokens_per_second": 115457.122
|
|
},
|
|
{
|
|
"epoch": 0.7857355447928785,
|
|
"grad_norm": 0.14158159494400024,
|
|
"learning_rate": 0.001017699184208284,
|
|
"loss": 2.9855068206787108,
|
|
"num_input_tokens_seen": 7612661760,
|
|
"step": 14520,
|
|
"train_runtime": 65934.7235,
|
|
"train_tokens_per_second": 115457.552
|
|
},
|
|
{
|
|
"epoch": 0.7862766849752428,
|
|
"grad_norm": 0.12904150784015656,
|
|
"learning_rate": 0.001015193030287551,
|
|
"loss": 2.9784725189208983,
|
|
"num_input_tokens_seen": 7617904640,
|
|
"step": 14530,
|
|
"train_runtime": 65979.8789,
|
|
"train_tokens_per_second": 115457.997
|
|
},
|
|
{
|
|
"epoch": 0.7868178251576071,
|
|
"grad_norm": 0.1475485861301422,
|
|
"learning_rate": 0.0010126921726526892,
|
|
"loss": 2.9963218688964846,
|
|
"num_input_tokens_seen": 7623147520,
|
|
"step": 14540,
|
|
"train_runtime": 66025.0052,
|
|
"train_tokens_per_second": 115458.492
|
|
},
|
|
{
|
|
"epoch": 0.7873589653399713,
|
|
"grad_norm": 0.13277380168437958,
|
|
"learning_rate": 0.0010101966189387007,
|
|
"loss": 2.9872737884521485,
|
|
"num_input_tokens_seen": 7628390400,
|
|
"step": 14550,
|
|
"train_runtime": 66070.1575,
|
|
"train_tokens_per_second": 115458.941
|
|
},
|
|
{
|
|
"epoch": 0.7879001055223356,
|
|
"grad_norm": 0.13506442308425903,
|
|
"learning_rate": 0.0010077063767643974,
|
|
"loss": 2.9895917892456056,
|
|
"num_input_tokens_seen": 7633633280,
|
|
"step": 14560,
|
|
"train_runtime": 66115.3068,
|
|
"train_tokens_per_second": 115459.394
|
|
},
|
|
{
|
|
"epoch": 0.7884412457046998,
|
|
"grad_norm": 0.13273315131664276,
|
|
"learning_rate": 0.0010052214537323724,
|
|
"loss": 2.9872600555419924,
|
|
"num_input_tokens_seen": 7638876160,
|
|
"step": 14570,
|
|
"train_runtime": 66160.4452,
|
|
"train_tokens_per_second": 115459.866
|
|
},
|
|
{
|
|
"epoch": 0.788982385887064,
|
|
"grad_norm": 0.1311519294977188,
|
|
"learning_rate": 0.0010027418574289832,
|
|
"loss": 2.9747976303100585,
|
|
"num_input_tokens_seen": 7644119040,
|
|
"step": 14580,
|
|
"train_runtime": 66205.59,
|
|
"train_tokens_per_second": 115460.326
|
|
},
|
|
{
|
|
"epoch": 0.7895235260694283,
|
|
"grad_norm": 0.13237175345420837,
|
|
"learning_rate": 0.0010002675954243225,
|
|
"loss": 2.9707094192504884,
|
|
"num_input_tokens_seen": 7649361920,
|
|
"step": 14590,
|
|
"train_runtime": 66250.7308,
|
|
"train_tokens_per_second": 115460.793
|
|
},
|
|
{
|
|
"epoch": 0.7900646662517925,
|
|
"grad_norm": 0.13623256981372833,
|
|
"learning_rate": 0.0009977986752721967,
|
|
"loss": 2.9789360046386717,
|
|
"num_input_tokens_seen": 7654604800,
|
|
"step": 14600,
|
|
"train_runtime": 66295.8847,
|
|
"train_tokens_per_second": 115461.236
|
|
},
|
|
{
|
|
"epoch": 0.7906058064341568,
|
|
"grad_norm": 0.13563480973243713,
|
|
"learning_rate": 0.0009953351045101087,
|
|
"loss": 2.976993942260742,
|
|
"num_input_tokens_seen": 7659847680,
|
|
"step": 14610,
|
|
"train_runtime": 66341.0194,
|
|
"train_tokens_per_second": 115461.712
|
|
},
|
|
{
|
|
"epoch": 0.791146946616521,
|
|
"grad_norm": 0.1308317333459854,
|
|
"learning_rate": 0.000992876890659225,
|
|
"loss": 2.9876148223876955,
|
|
"num_input_tokens_seen": 7665090560,
|
|
"step": 14620,
|
|
"train_runtime": 66386.152,
|
|
"train_tokens_per_second": 115462.191
|
|
},
|
|
{
|
|
"epoch": 0.7916880867988852,
|
|
"grad_norm": 0.12994542717933655,
|
|
"learning_rate": 0.0009904240412243594,
|
|
"loss": 2.989145278930664,
|
|
"num_input_tokens_seen": 7670333440,
|
|
"step": 14630,
|
|
"train_runtime": 66431.2999,
|
|
"train_tokens_per_second": 115462.643
|
|
},
|
|
{
|
|
"epoch": 0.7922292269812495,
|
|
"grad_norm": 0.13062526285648346,
|
|
"learning_rate": 0.0009879765636939479,
|
|
"loss": 2.9790761947631834,
|
|
"num_input_tokens_seen": 7675576320,
|
|
"step": 14640,
|
|
"train_runtime": 66476.4455,
|
|
"train_tokens_per_second": 115463.098
|
|
},
|
|
{
|
|
"epoch": 0.7927703671636137,
|
|
"grad_norm": 0.13198526203632355,
|
|
"learning_rate": 0.0009855344655400273,
|
|
"loss": 2.991826629638672,
|
|
"num_input_tokens_seen": 7680819200,
|
|
"step": 14650,
|
|
"train_runtime": 66521.5925,
|
|
"train_tokens_per_second": 115463.55
|
|
},
|
|
{
|
|
"epoch": 0.793311507345978,
|
|
"grad_norm": 0.12981140613555908,
|
|
"learning_rate": 0.0009830977542182112,
|
|
"loss": 2.97564754486084,
|
|
"num_input_tokens_seen": 7686062080,
|
|
"step": 14660,
|
|
"train_runtime": 66566.7229,
|
|
"train_tokens_per_second": 115464.03
|
|
},
|
|
{
|
|
"epoch": 0.7938526475283422,
|
|
"grad_norm": 0.13640232384204865,
|
|
"learning_rate": 0.0009806664371676665,
|
|
"loss": 2.9895370483398436,
|
|
"num_input_tokens_seen": 7691304960,
|
|
"step": 14670,
|
|
"train_runtime": 66611.843,
|
|
"train_tokens_per_second": 115464.527
|
|
},
|
|
{
|
|
"epoch": 0.7943937877107065,
|
|
"grad_norm": 0.13942649960517883,
|
|
"learning_rate": 0.0009782405218110937,
|
|
"loss": 2.983687973022461,
|
|
"num_input_tokens_seen": 7696547840,
|
|
"step": 14680,
|
|
"train_runtime": 66656.9717,
|
|
"train_tokens_per_second": 115465.009
|
|
},
|
|
{
|
|
"epoch": 0.7949349278930707,
|
|
"grad_norm": 0.13253772258758545,
|
|
"learning_rate": 0.0009758200155546995,
|
|
"loss": 2.9805246353149415,
|
|
"num_input_tokens_seen": 7701790720,
|
|
"step": 14690,
|
|
"train_runtime": 66702.1127,
|
|
"train_tokens_per_second": 115465.469
|
|
},
|
|
{
|
|
"epoch": 0.7954760680754349,
|
|
"grad_norm": 0.14124181866645813,
|
|
"learning_rate": 0.000973404925788178,
|
|
"loss": 2.9745468139648437,
|
|
"num_input_tokens_seen": 7707033600,
|
|
"step": 14700,
|
|
"train_runtime": 66747.2598,
|
|
"train_tokens_per_second": 115465.918
|
|
},
|
|
{
|
|
"epoch": 0.7960172082577992,
|
|
"grad_norm": 0.14020085334777832,
|
|
"learning_rate": 0.0009709952598846878,
|
|
"loss": 2.978104019165039,
|
|
"num_input_tokens_seen": 7712276480,
|
|
"step": 14710,
|
|
"train_runtime": 66792.381,
|
|
"train_tokens_per_second": 115466.411
|
|
},
|
|
{
|
|
"epoch": 0.7965583484401634,
|
|
"grad_norm": 0.14543874561786652,
|
|
"learning_rate": 0.0009685910252008282,
|
|
"loss": 2.972671890258789,
|
|
"num_input_tokens_seen": 7717519360,
|
|
"step": 14720,
|
|
"train_runtime": 66837.5213,
|
|
"train_tokens_per_second": 115466.87
|
|
},
|
|
{
|
|
"epoch": 0.7970994886225277,
|
|
"grad_norm": 0.1361764669418335,
|
|
"learning_rate": 0.0009661922290766168,
|
|
"loss": 2.979312515258789,
|
|
"num_input_tokens_seen": 7722762240,
|
|
"step": 14730,
|
|
"train_runtime": 66882.6798,
|
|
"train_tokens_per_second": 115467.297
|
|
},
|
|
{
|
|
"epoch": 0.797640628804892,
|
|
"grad_norm": 0.1359523981809616,
|
|
"learning_rate": 0.000963798878835467,
|
|
"loss": 2.9832695007324217,
|
|
"num_input_tokens_seen": 7728005120,
|
|
"step": 14740,
|
|
"train_runtime": 66927.821,
|
|
"train_tokens_per_second": 115467.753
|
|
},
|
|
{
|
|
"epoch": 0.7981817689872561,
|
|
"grad_norm": 0.1312197595834732,
|
|
"learning_rate": 0.0009614109817841685,
|
|
"loss": 2.988373565673828,
|
|
"num_input_tokens_seen": 7733248000,
|
|
"step": 14750,
|
|
"train_runtime": 66972.9704,
|
|
"train_tokens_per_second": 115468.195
|
|
},
|
|
{
|
|
"epoch": 0.7987229091696204,
|
|
"grad_norm": 0.1324051469564438,
|
|
"learning_rate": 0.00095902854521286,
|
|
"loss": 2.9794536590576173,
|
|
"num_input_tokens_seen": 7738490880,
|
|
"step": 14760,
|
|
"train_runtime": 67018.1103,
|
|
"train_tokens_per_second": 115468.652
|
|
},
|
|
{
|
|
"epoch": 0.7992640493519846,
|
|
"grad_norm": 0.13141310214996338,
|
|
"learning_rate": 0.0009566515763950114,
|
|
"loss": 2.979531097412109,
|
|
"num_input_tokens_seen": 7743733760,
|
|
"step": 14770,
|
|
"train_runtime": 67063.2657,
|
|
"train_tokens_per_second": 115469.083
|
|
},
|
|
{
|
|
"epoch": 0.7998051895343489,
|
|
"grad_norm": 0.13311649858951569,
|
|
"learning_rate": 0.0009542800825873985,
|
|
"loss": 2.978958511352539,
|
|
"num_input_tokens_seen": 7748976640,
|
|
"step": 14780,
|
|
"train_runtime": 67108.4044,
|
|
"train_tokens_per_second": 115469.541
|
|
},
|
|
{
|
|
"epoch": 0.8003463297167132,
|
|
"grad_norm": 0.1344899833202362,
|
|
"learning_rate": 0.0009519140710300836,
|
|
"loss": 2.9761631011962892,
|
|
"num_input_tokens_seen": 7754219520,
|
|
"step": 14790,
|
|
"train_runtime": 67153.558,
|
|
"train_tokens_per_second": 115469.973
|
|
},
|
|
{
|
|
"epoch": 0.8008874698990773,
|
|
"grad_norm": 0.1314343363046646,
|
|
"learning_rate": 0.0009495535489463907,
|
|
"loss": 2.9750953674316407,
|
|
"num_input_tokens_seen": 7759462400,
|
|
"step": 14800,
|
|
"train_runtime": 67198.7114,
|
|
"train_tokens_per_second": 115470.405
|
|
},
|
|
{
|
|
"epoch": 0.8014286100814416,
|
|
"grad_norm": 0.13687878847122192,
|
|
"learning_rate": 0.0009471985235428848,
|
|
"loss": 2.977894973754883,
|
|
"num_input_tokens_seen": 7764705280,
|
|
"step": 14810,
|
|
"train_runtime": 67243.8512,
|
|
"train_tokens_per_second": 115470.859
|
|
},
|
|
{
|
|
"epoch": 0.8019697502638058,
|
|
"grad_norm": 0.13268278539180756,
|
|
"learning_rate": 0.0009448490020093504,
|
|
"loss": 2.983228302001953,
|
|
"num_input_tokens_seen": 7769948160,
|
|
"step": 14820,
|
|
"train_runtime": 67288.9927,
|
|
"train_tokens_per_second": 115471.31
|
|
},
|
|
{
|
|
"epoch": 0.8025108904461701,
|
|
"grad_norm": 0.13738638162612915,
|
|
"learning_rate": 0.0009425049915187695,
|
|
"loss": 2.98532657623291,
|
|
"num_input_tokens_seen": 7775191040,
|
|
"step": 14830,
|
|
"train_runtime": 67334.146,
|
|
"train_tokens_per_second": 115471.741
|
|
},
|
|
{
|
|
"epoch": 0.8030520306285344,
|
|
"grad_norm": 0.13537852466106415,
|
|
"learning_rate": 0.0009401664992272974,
|
|
"loss": 2.9814353942871095,
|
|
"num_input_tokens_seen": 7780433920,
|
|
"step": 14840,
|
|
"train_runtime": 67379.3084,
|
|
"train_tokens_per_second": 115472.155
|
|
},
|
|
{
|
|
"epoch": 0.8035931708108985,
|
|
"grad_norm": 0.13461166620254517,
|
|
"learning_rate": 0.0009378335322742428,
|
|
"loss": 2.988892364501953,
|
|
"num_input_tokens_seen": 7785676800,
|
|
"step": 14850,
|
|
"train_runtime": 67424.4589,
|
|
"train_tokens_per_second": 115472.589
|
|
},
|
|
{
|
|
"epoch": 0.8041343109932628,
|
|
"grad_norm": 0.1397952139377594,
|
|
"learning_rate": 0.0009355060977820479,
|
|
"loss": 2.981852149963379,
|
|
"num_input_tokens_seen": 7790919680,
|
|
"step": 14860,
|
|
"train_runtime": 67469.6089,
|
|
"train_tokens_per_second": 115473.023
|
|
},
|
|
{
|
|
"epoch": 0.804675451175627,
|
|
"grad_norm": 0.13720718026161194,
|
|
"learning_rate": 0.000933184202856262,
|
|
"loss": 2.9753461837768556,
|
|
"num_input_tokens_seen": 7796162560,
|
|
"step": 14870,
|
|
"train_runtime": 67514.7478,
|
|
"train_tokens_per_second": 115473.475
|
|
},
|
|
{
|
|
"epoch": 0.8052165913579913,
|
|
"grad_norm": 0.13194413483142853,
|
|
"learning_rate": 0.0009308678545855248,
|
|
"loss": 2.98673038482666,
|
|
"num_input_tokens_seen": 7801405440,
|
|
"step": 14880,
|
|
"train_runtime": 67563.706,
|
|
"train_tokens_per_second": 115467.4
|
|
},
|
|
{
|
|
"epoch": 0.8057577315403556,
|
|
"grad_norm": 0.13509796559810638,
|
|
"learning_rate": 0.0009285570600415394,
|
|
"loss": 2.9741546630859377,
|
|
"num_input_tokens_seen": 7806648320,
|
|
"step": 14890,
|
|
"train_runtime": 67608.8064,
|
|
"train_tokens_per_second": 115467.921
|
|
},
|
|
{
|
|
"epoch": 0.8062988717227197,
|
|
"grad_norm": 0.13570842146873474,
|
|
"learning_rate": 0.0009262518262790568,
|
|
"loss": 2.9908029556274416,
|
|
"num_input_tokens_seen": 7811891200,
|
|
"step": 14900,
|
|
"train_runtime": 67653.9237,
|
|
"train_tokens_per_second": 115468.413
|
|
},
|
|
{
|
|
"epoch": 0.806840011905084,
|
|
"grad_norm": 0.1328882873058319,
|
|
"learning_rate": 0.0009239521603358486,
|
|
"loss": 2.9901811599731447,
|
|
"num_input_tokens_seen": 7817134080,
|
|
"step": 14910,
|
|
"train_runtime": 67699.0266,
|
|
"train_tokens_per_second": 115468.929
|
|
},
|
|
{
|
|
"epoch": 0.8073811520874482,
|
|
"grad_norm": 0.13037438690662384,
|
|
"learning_rate": 0.0009216580692326891,
|
|
"loss": 2.9751874923706056,
|
|
"num_input_tokens_seen": 7822376960,
|
|
"step": 14920,
|
|
"train_runtime": 67744.1354,
|
|
"train_tokens_per_second": 115469.434
|
|
},
|
|
{
|
|
"epoch": 0.8079222922698125,
|
|
"grad_norm": 0.13509000837802887,
|
|
"learning_rate": 0.0009193695599733333,
|
|
"loss": 2.9760356903076173,
|
|
"num_input_tokens_seen": 7827619840,
|
|
"step": 14930,
|
|
"train_runtime": 67789.236,
|
|
"train_tokens_per_second": 115469.952
|
|
},
|
|
{
|
|
"epoch": 0.8084634324521768,
|
|
"grad_norm": 0.13353431224822998,
|
|
"learning_rate": 0.0009170866395444952,
|
|
"loss": 2.979950714111328,
|
|
"num_input_tokens_seen": 7832862720,
|
|
"step": 14940,
|
|
"train_runtime": 67834.3595,
|
|
"train_tokens_per_second": 115470.431
|
|
},
|
|
{
|
|
"epoch": 0.809004572634541,
|
|
"grad_norm": 0.13296596705913544,
|
|
"learning_rate": 0.0009148093149158249,
|
|
"loss": 2.9780080795288084,
|
|
"num_input_tokens_seen": 7838105600,
|
|
"step": 14950,
|
|
"train_runtime": 67879.4629,
|
|
"train_tokens_per_second": 115470.943
|
|
},
|
|
{
|
|
"epoch": 0.8095457128169052,
|
|
"grad_norm": 0.13199231028556824,
|
|
"learning_rate": 0.0009125375930398896,
|
|
"loss": 2.976139450073242,
|
|
"num_input_tokens_seen": 7843348480,
|
|
"step": 14960,
|
|
"train_runtime": 67924.5642,
|
|
"train_tokens_per_second": 115471.458
|
|
},
|
|
{
|
|
"epoch": 0.8100868529992694,
|
|
"grad_norm": 0.1304149031639099,
|
|
"learning_rate": 0.0009102714808521528,
|
|
"loss": 2.9799163818359373,
|
|
"num_input_tokens_seen": 7848591360,
|
|
"step": 14970,
|
|
"train_runtime": 67969.6467,
|
|
"train_tokens_per_second": 115472.005
|
|
},
|
|
{
|
|
"epoch": 0.8106279931816337,
|
|
"grad_norm": 0.13312670588493347,
|
|
"learning_rate": 0.0009080109852709498,
|
|
"loss": 2.9826412200927734,
|
|
"num_input_tokens_seen": 7853834240,
|
|
"step": 14980,
|
|
"train_runtime": 68014.7473,
|
|
"train_tokens_per_second": 115472.52
|
|
},
|
|
{
|
|
"epoch": 0.811169133363998,
|
|
"grad_norm": 0.13625964522361755,
|
|
"learning_rate": 0.0009057561131974695,
|
|
"loss": 2.974313735961914,
|
|
"num_input_tokens_seen": 7859077120,
|
|
"step": 14990,
|
|
"train_runtime": 68059.848,
|
|
"train_tokens_per_second": 115473.034
|
|
},
|
|
{
|
|
"epoch": 0.8117102735463622,
|
|
"grad_norm": 0.13586074113845825,
|
|
"learning_rate": 0.000903506871515734,
|
|
"loss": 2.9799150466918944,
|
|
"num_input_tokens_seen": 7864320000,
|
|
"step": 15000,
|
|
"train_runtime": 68104.9508,
|
|
"train_tokens_per_second": 115473.544
|
|
},
|
|
{
|
|
"epoch": 0.8117102735463622,
|
|
"eval_loss": 2.9381465911865234,
|
|
"eval_runtime": 1.9846,
|
|
"eval_samples_per_second": 251.945,
|
|
"eval_steps_per_second": 4.031,
|
|
"num_input_tokens_seen": 7864320000,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 0.8122514137287264,
|
|
"grad_norm": 0.13391871750354767,
|
|
"learning_rate": 0.0009012632670925736,
|
|
"loss": 2.972438430786133,
|
|
"num_input_tokens_seen": 7869562880,
|
|
"step": 15010,
|
|
"train_runtime": 68154.5217,
|
|
"train_tokens_per_second": 115466.482
|
|
},
|
|
{
|
|
"epoch": 0.8127925539110906,
|
|
"grad_norm": 0.13467305898666382,
|
|
"learning_rate": 0.0008990253067776095,
|
|
"loss": 2.9732336044311523,
|
|
"num_input_tokens_seen": 7874805760,
|
|
"step": 15020,
|
|
"train_runtime": 68199.7002,
|
|
"train_tokens_per_second": 115466.868
|
|
},
|
|
{
|
|
"epoch": 0.8133336940934549,
|
|
"grad_norm": 0.13371260464191437,
|
|
"learning_rate": 0.0008967929974032304,
|
|
"loss": 2.9756675720214845,
|
|
"num_input_tokens_seen": 7880048640,
|
|
"step": 15030,
|
|
"train_runtime": 68244.8815,
|
|
"train_tokens_per_second": 115467.248
|
|
},
|
|
{
|
|
"epoch": 0.8138748342758192,
|
|
"grad_norm": 0.13191363215446472,
|
|
"learning_rate": 0.0008945663457845765,
|
|
"loss": 2.9834621429443358,
|
|
"num_input_tokens_seen": 7885291520,
|
|
"step": 15040,
|
|
"train_runtime": 68290.0502,
|
|
"train_tokens_per_second": 115467.649
|
|
},
|
|
{
|
|
"epoch": 0.8144159744581834,
|
|
"grad_norm": 0.1310187131166458,
|
|
"learning_rate": 0.0008923453587195116,
|
|
"loss": 2.9787324905395507,
|
|
"num_input_tokens_seen": 7890534400,
|
|
"step": 15050,
|
|
"train_runtime": 68335.2323,
|
|
"train_tokens_per_second": 115468.026
|
|
},
|
|
{
|
|
"epoch": 0.8149571146405477,
|
|
"grad_norm": 0.13005271553993225,
|
|
"learning_rate": 0.0008901300429886064,
|
|
"loss": 2.9818572998046875,
|
|
"num_input_tokens_seen": 7895777280,
|
|
"step": 15060,
|
|
"train_runtime": 68380.4424,
|
|
"train_tokens_per_second": 115468.356
|
|
},
|
|
{
|
|
"epoch": 0.8154982548229118,
|
|
"grad_norm": 0.13187964260578156,
|
|
"learning_rate": 0.0008879204053551192,
|
|
"loss": 2.9841533660888673,
|
|
"num_input_tokens_seen": 7901020160,
|
|
"step": 15070,
|
|
"train_runtime": 68425.6233,
|
|
"train_tokens_per_second": 115468.735
|
|
},
|
|
{
|
|
"epoch": 0.8160393950052761,
|
|
"grad_norm": 0.12774254381656647,
|
|
"learning_rate": 0.0008857164525649706,
|
|
"loss": 2.9738176345825194,
|
|
"num_input_tokens_seen": 7906263040,
|
|
"step": 15080,
|
|
"train_runtime": 68470.8074,
|
|
"train_tokens_per_second": 115469.108
|
|
},
|
|
{
|
|
"epoch": 0.8165805351876404,
|
|
"grad_norm": 0.13418236374855042,
|
|
"learning_rate": 0.0008835181913467284,
|
|
"loss": 2.9698516845703127,
|
|
"num_input_tokens_seen": 7911505920,
|
|
"step": 15090,
|
|
"train_runtime": 68516.0039,
|
|
"train_tokens_per_second": 115469.459
|
|
},
|
|
{
|
|
"epoch": 0.8171216753700046,
|
|
"grad_norm": 0.13305585086345673,
|
|
"learning_rate": 0.000881325628411582,
|
|
"loss": 2.9800113677978515,
|
|
"num_input_tokens_seen": 7916748800,
|
|
"step": 15100,
|
|
"train_runtime": 68561.1978,
|
|
"train_tokens_per_second": 115469.815
|
|
},
|
|
{
|
|
"epoch": 0.8176628155523689,
|
|
"grad_norm": 0.1298227459192276,
|
|
"learning_rate": 0.0008791387704533261,
|
|
"loss": 2.9894580841064453,
|
|
"num_input_tokens_seen": 7921991680,
|
|
"step": 15110,
|
|
"train_runtime": 68606.3897,
|
|
"train_tokens_per_second": 115470.173
|
|
},
|
|
{
|
|
"epoch": 0.818203955734733,
|
|
"grad_norm": 0.13746146857738495,
|
|
"learning_rate": 0.0008769576241483369,
|
|
"loss": 2.969521903991699,
|
|
"num_input_tokens_seen": 7927234560,
|
|
"step": 15120,
|
|
"train_runtime": 68651.5837,
|
|
"train_tokens_per_second": 115470.527
|
|
},
|
|
{
|
|
"epoch": 0.8187450959170973,
|
|
"grad_norm": 0.1307765245437622,
|
|
"learning_rate": 0.0008747821961555536,
|
|
"loss": 2.9746829986572267,
|
|
"num_input_tokens_seen": 7932477440,
|
|
"step": 15130,
|
|
"train_runtime": 68696.7803,
|
|
"train_tokens_per_second": 115470.877
|
|
},
|
|
{
|
|
"epoch": 0.8192862360994616,
|
|
"grad_norm": 0.12932413816452026,
|
|
"learning_rate": 0.0008726124931164572,
|
|
"loss": 2.980904388427734,
|
|
"num_input_tokens_seen": 7937720320,
|
|
"step": 15140,
|
|
"train_runtime": 68741.9605,
|
|
"train_tokens_per_second": 115471.253
|
|
},
|
|
{
|
|
"epoch": 0.8198273762818258,
|
|
"grad_norm": 0.13145951926708221,
|
|
"learning_rate": 0.0008704485216550531,
|
|
"loss": 2.977578544616699,
|
|
"num_input_tokens_seen": 7942963200,
|
|
"step": 15150,
|
|
"train_runtime": 68787.1491,
|
|
"train_tokens_per_second": 115471.615
|
|
},
|
|
{
|
|
"epoch": 0.8203685164641901,
|
|
"grad_norm": 0.13109584152698517,
|
|
"learning_rate": 0.0008682902883778457,
|
|
"loss": 2.973899078369141,
|
|
"num_input_tokens_seen": 7948206080,
|
|
"step": 15160,
|
|
"train_runtime": 68832.3314,
|
|
"train_tokens_per_second": 115471.987
|
|
},
|
|
{
|
|
"epoch": 0.8209096566465542,
|
|
"grad_norm": 0.1269070953130722,
|
|
"learning_rate": 0.0008661377998738207,
|
|
"loss": 2.9858329772949217,
|
|
"num_input_tokens_seen": 7953448960,
|
|
"step": 15170,
|
|
"train_runtime": 68877.5165,
|
|
"train_tokens_per_second": 115472.354
|
|
},
|
|
{
|
|
"epoch": 0.8214507968289185,
|
|
"grad_norm": 0.13239699602127075,
|
|
"learning_rate": 0.0008639910627144282,
|
|
"loss": 2.9783477783203125,
|
|
"num_input_tokens_seen": 7958691840,
|
|
"step": 15180,
|
|
"train_runtime": 68922.6959,
|
|
"train_tokens_per_second": 115472.73
|
|
},
|
|
{
|
|
"epoch": 0.8219919370112828,
|
|
"grad_norm": 0.129794642329216,
|
|
"learning_rate": 0.0008618500834535568,
|
|
"loss": 2.9712141036987303,
|
|
"num_input_tokens_seen": 7963934720,
|
|
"step": 15190,
|
|
"train_runtime": 68967.862,
|
|
"train_tokens_per_second": 115473.128
|
|
},
|
|
{
|
|
"epoch": 0.822533077193647,
|
|
"grad_norm": 0.13771747052669525,
|
|
"learning_rate": 0.0008597148686275189,
|
|
"loss": 2.984314727783203,
|
|
"num_input_tokens_seen": 7969177600,
|
|
"step": 15200,
|
|
"train_runtime": 69013.0362,
|
|
"train_tokens_per_second": 115473.511
|
|
},
|
|
{
|
|
"epoch": 0.8230742173760113,
|
|
"grad_norm": 0.13398458063602448,
|
|
"learning_rate": 0.0008575854247550258,
|
|
"loss": 2.9714584350585938,
|
|
"num_input_tokens_seen": 7974420480,
|
|
"step": 15210,
|
|
"train_runtime": 69058.1959,
|
|
"train_tokens_per_second": 115473.918
|
|
},
|
|
{
|
|
"epoch": 0.8236153575583754,
|
|
"grad_norm": 0.13028761744499207,
|
|
"learning_rate": 0.0008554617583371726,
|
|
"loss": 2.9726911544799806,
|
|
"num_input_tokens_seen": 7979663360,
|
|
"step": 15220,
|
|
"train_runtime": 69103.3538,
|
|
"train_tokens_per_second": 115474.328
|
|
},
|
|
{
|
|
"epoch": 0.8241564977407397,
|
|
"grad_norm": 0.13187240064144135,
|
|
"learning_rate": 0.0008533438758574152,
|
|
"loss": 2.9737316131591798,
|
|
"num_input_tokens_seen": 7984906240,
|
|
"step": 15230,
|
|
"train_runtime": 69148.515,
|
|
"train_tokens_per_second": 115474.732
|
|
},
|
|
{
|
|
"epoch": 0.824697637923104,
|
|
"grad_norm": 0.13035008311271667,
|
|
"learning_rate": 0.0008512317837815503,
|
|
"loss": 2.9657833099365236,
|
|
"num_input_tokens_seen": 7990149120,
|
|
"step": 15240,
|
|
"train_runtime": 69193.6841,
|
|
"train_tokens_per_second": 115475.122
|
|
},
|
|
{
|
|
"epoch": 0.8252387781054682,
|
|
"grad_norm": 0.1308414787054062,
|
|
"learning_rate": 0.0008491254885576988,
|
|
"loss": 2.968144416809082,
|
|
"num_input_tokens_seen": 7995392000,
|
|
"step": 15250,
|
|
"train_runtime": 69238.862,
|
|
"train_tokens_per_second": 115475.497
|
|
},
|
|
{
|
|
"epoch": 0.8257799182878325,
|
|
"grad_norm": 0.1312231868505478,
|
|
"learning_rate": 0.0008470249966162835,
|
|
"loss": 2.9749370574951173,
|
|
"num_input_tokens_seen": 8000634880,
|
|
"step": 15260,
|
|
"train_runtime": 69287.9095,
|
|
"train_tokens_per_second": 115469.422
|
|
},
|
|
{
|
|
"epoch": 0.8263210584701967,
|
|
"grad_norm": 0.13507384061813354,
|
|
"learning_rate": 0.0008449303143700088,
|
|
"loss": 2.9808319091796873,
|
|
"num_input_tokens_seen": 8005877760,
|
|
"step": 15270,
|
|
"train_runtime": 69333.0664,
|
|
"train_tokens_per_second": 115469.835
|
|
},
|
|
{
|
|
"epoch": 0.8268621986525609,
|
|
"grad_norm": 0.12942056357860565,
|
|
"learning_rate": 0.0008428414482138435,
|
|
"loss": 2.969392776489258,
|
|
"num_input_tokens_seen": 8011120640,
|
|
"step": 15280,
|
|
"train_runtime": 69378.1613,
|
|
"train_tokens_per_second": 115470.351
|
|
},
|
|
{
|
|
"epoch": 0.8274033388349252,
|
|
"grad_norm": 0.12837563455104828,
|
|
"learning_rate": 0.0008407584045250001,
|
|
"loss": 2.979315185546875,
|
|
"num_input_tokens_seen": 8016363520,
|
|
"step": 15290,
|
|
"train_runtime": 69423.2721,
|
|
"train_tokens_per_second": 115470.84
|
|
},
|
|
{
|
|
"epoch": 0.8279444790172894,
|
|
"grad_norm": 0.13300900161266327,
|
|
"learning_rate": 0.0008386811896629143,
|
|
"loss": 2.9644968032836916,
|
|
"num_input_tokens_seen": 8021606400,
|
|
"step": 15300,
|
|
"train_runtime": 69468.3762,
|
|
"train_tokens_per_second": 115471.339
|
|
},
|
|
{
|
|
"epoch": 0.8284856191996537,
|
|
"grad_norm": 0.12836603820323944,
|
|
"learning_rate": 0.0008366098099692285,
|
|
"loss": 2.972013473510742,
|
|
"num_input_tokens_seen": 8026849280,
|
|
"step": 15310,
|
|
"train_runtime": 69513.475,
|
|
"train_tokens_per_second": 115471.846
|
|
},
|
|
{
|
|
"epoch": 0.8290267593820179,
|
|
"grad_norm": 0.12967608869075775,
|
|
"learning_rate": 0.0008345442717677699,
|
|
"loss": 2.9776493072509767,
|
|
"num_input_tokens_seen": 8032092160,
|
|
"step": 15320,
|
|
"train_runtime": 69558.5739,
|
|
"train_tokens_per_second": 115472.352
|
|
},
|
|
{
|
|
"epoch": 0.8295678995643821,
|
|
"grad_norm": 0.12830476462841034,
|
|
"learning_rate": 0.0008324845813645304,
|
|
"loss": 2.9773494720458986,
|
|
"num_input_tokens_seen": 8037335040,
|
|
"step": 15330,
|
|
"train_runtime": 69603.6687,
|
|
"train_tokens_per_second": 115472.865
|
|
},
|
|
{
|
|
"epoch": 0.8301090397467464,
|
|
"grad_norm": 0.13105891644954681,
|
|
"learning_rate": 0.0008304307450476511,
|
|
"loss": 2.9748680114746096,
|
|
"num_input_tokens_seen": 8042577920,
|
|
"step": 15340,
|
|
"train_runtime": 69648.769,
|
|
"train_tokens_per_second": 115473.368
|
|
},
|
|
{
|
|
"epoch": 0.8306501799291106,
|
|
"grad_norm": 0.1301373690366745,
|
|
"learning_rate": 0.0008283827690873988,
|
|
"loss": 2.9727630615234375,
|
|
"num_input_tokens_seen": 8047820800,
|
|
"step": 15350,
|
|
"train_runtime": 69693.862,
|
|
"train_tokens_per_second": 115473.882
|
|
},
|
|
{
|
|
"epoch": 0.8311913201114749,
|
|
"grad_norm": 0.13162434101104736,
|
|
"learning_rate": 0.0008263406597361503,
|
|
"loss": 2.978099822998047,
|
|
"num_input_tokens_seen": 8053063680,
|
|
"step": 15360,
|
|
"train_runtime": 69738.9614,
|
|
"train_tokens_per_second": 115474.385
|
|
},
|
|
{
|
|
"epoch": 0.8317324602938391,
|
|
"grad_norm": 0.13288192451000214,
|
|
"learning_rate": 0.0008243044232283723,
|
|
"loss": 2.9758016586303713,
|
|
"num_input_tokens_seen": 8058306560,
|
|
"step": 15370,
|
|
"train_runtime": 69784.0695,
|
|
"train_tokens_per_second": 115474.873
|
|
},
|
|
{
|
|
"epoch": 0.8322736004762034,
|
|
"grad_norm": 0.136215478181839,
|
|
"learning_rate": 0.0008222740657806005,
|
|
"loss": 2.976166915893555,
|
|
"num_input_tokens_seen": 8063549440,
|
|
"step": 15380,
|
|
"train_runtime": 69829.1841,
|
|
"train_tokens_per_second": 115475.35
|
|
},
|
|
{
|
|
"epoch": 0.8328147406585676,
|
|
"grad_norm": 0.12879818677902222,
|
|
"learning_rate": 0.000820249593591422,
|
|
"loss": 2.9633615493774412,
|
|
"num_input_tokens_seen": 8068792320,
|
|
"step": 15390,
|
|
"train_runtime": 69874.3003,
|
|
"train_tokens_per_second": 115475.823
|
|
},
|
|
{
|
|
"epoch": 0.8333558808409318,
|
|
"grad_norm": 0.1428280621767044,
|
|
"learning_rate": 0.0008182310128414587,
|
|
"loss": 2.9798999786376954,
|
|
"num_input_tokens_seen": 8074035200,
|
|
"step": 15400,
|
|
"train_runtime": 69919.3861,
|
|
"train_tokens_per_second": 115476.346
|
|
},
|
|
{
|
|
"epoch": 0.8338970210232961,
|
|
"grad_norm": 0.1359853297472,
|
|
"learning_rate": 0.0008162183296933439,
|
|
"loss": 2.968707275390625,
|
|
"num_input_tokens_seen": 8079278080,
|
|
"step": 15410,
|
|
"train_runtime": 69964.4955,
|
|
"train_tokens_per_second": 115476.829
|
|
},
|
|
{
|
|
"epoch": 0.8344381612056603,
|
|
"grad_norm": 0.13050523400306702,
|
|
"learning_rate": 0.0008142115502917066,
|
|
"loss": 2.973996162414551,
|
|
"num_input_tokens_seen": 8084520960,
|
|
"step": 15420,
|
|
"train_runtime": 70009.6056,
|
|
"train_tokens_per_second": 115477.31
|
|
},
|
|
{
|
|
"epoch": 0.8349793013880246,
|
|
"grad_norm": 0.13029220700263977,
|
|
"learning_rate": 0.0008122106807631529,
|
|
"loss": 2.9792009353637696,
|
|
"num_input_tokens_seen": 8089763840,
|
|
"step": 15430,
|
|
"train_runtime": 70054.706,
|
|
"train_tokens_per_second": 115477.807
|
|
},
|
|
{
|
|
"epoch": 0.8355204415703888,
|
|
"grad_norm": 0.13232028484344482,
|
|
"learning_rate": 0.0008102157272162447,
|
|
"loss": 2.9753578186035154,
|
|
"num_input_tokens_seen": 8095006720,
|
|
"step": 15440,
|
|
"train_runtime": 70099.8205,
|
|
"train_tokens_per_second": 115478.28
|
|
},
|
|
{
|
|
"epoch": 0.836061581752753,
|
|
"grad_norm": 0.13095484673976898,
|
|
"learning_rate": 0.0008082266957414837,
|
|
"loss": 2.97320671081543,
|
|
"num_input_tokens_seen": 8100249600,
|
|
"step": 15450,
|
|
"train_runtime": 70144.9322,
|
|
"train_tokens_per_second": 115478.757
|
|
},
|
|
{
|
|
"epoch": 0.8366027219351173,
|
|
"grad_norm": 0.13523340225219727,
|
|
"learning_rate": 0.0008062435924112902,
|
|
"loss": 2.9681285858154296,
|
|
"num_input_tokens_seen": 8105492480,
|
|
"step": 15460,
|
|
"train_runtime": 70190.0213,
|
|
"train_tokens_per_second": 115479.271
|
|
},
|
|
{
|
|
"epoch": 0.8371438621174815,
|
|
"grad_norm": 0.13670340180397034,
|
|
"learning_rate": 0.0008042664232799893,
|
|
"loss": 2.9674022674560545,
|
|
"num_input_tokens_seen": 8110735360,
|
|
"step": 15470,
|
|
"train_runtime": 70235.1367,
|
|
"train_tokens_per_second": 115479.741
|
|
},
|
|
{
|
|
"epoch": 0.8376850022998458,
|
|
"grad_norm": 0.12936244904994965,
|
|
"learning_rate": 0.0008022951943837868,
|
|
"loss": 2.966217041015625,
|
|
"num_input_tokens_seen": 8115978240,
|
|
"step": 15480,
|
|
"train_runtime": 70280.2433,
|
|
"train_tokens_per_second": 115480.224
|
|
},
|
|
{
|
|
"epoch": 0.8382261424822101,
|
|
"grad_norm": 0.14200405776500702,
|
|
"learning_rate": 0.0008003299117407532,
|
|
"loss": 2.978799247741699,
|
|
"num_input_tokens_seen": 8121221120,
|
|
"step": 15490,
|
|
"train_runtime": 70325.3302,
|
|
"train_tokens_per_second": 115480.739
|
|
},
|
|
{
|
|
"epoch": 0.8387672826645742,
|
|
"grad_norm": 0.12791140377521515,
|
|
"learning_rate": 0.0007983705813508069,
|
|
"loss": 2.971164321899414,
|
|
"num_input_tokens_seen": 8126464000,
|
|
"step": 15500,
|
|
"train_runtime": 70370.4812,
|
|
"train_tokens_per_second": 115481.149
|
|
},
|
|
{
|
|
"epoch": 0.8387672826645742,
|
|
"eval_loss": 2.9325733184814453,
|
|
"eval_runtime": 1.9901,
|
|
"eval_samples_per_second": 251.238,
|
|
"eval_steps_per_second": 4.02,
|
|
"num_input_tokens_seen": 8126464000,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 0.8393084228469385,
|
|
"grad_norm": 0.1335526406764984,
|
|
"learning_rate": 0.0007964172091956926,
|
|
"loss": 2.9691984176635744,
|
|
"num_input_tokens_seen": 8131706880,
|
|
"step": 15510,
|
|
"train_runtime": 70417.588,
|
|
"train_tokens_per_second": 115478.35
|
|
},
|
|
{
|
|
"epoch": 0.8398495630293027,
|
|
"grad_norm": 0.13724961876869202,
|
|
"learning_rate": 0.0007944698012389664,
|
|
"loss": 2.9696407318115234,
|
|
"num_input_tokens_seen": 8136949760,
|
|
"step": 15520,
|
|
"train_runtime": 70462.6835,
|
|
"train_tokens_per_second": 115478.851
|
|
},
|
|
{
|
|
"epoch": 0.840390703211667,
|
|
"grad_norm": 0.13106457889080048,
|
|
"learning_rate": 0.0007925283634259745,
|
|
"loss": 2.964072036743164,
|
|
"num_input_tokens_seen": 8142192640,
|
|
"step": 15530,
|
|
"train_runtime": 70507.7742,
|
|
"train_tokens_per_second": 115479.36
|
|
},
|
|
{
|
|
"epoch": 0.8409318433940313,
|
|
"grad_norm": 0.1346583068370819,
|
|
"learning_rate": 0.000790592901683838,
|
|
"loss": 2.9721302032470702,
|
|
"num_input_tokens_seen": 8147435520,
|
|
"step": 15540,
|
|
"train_runtime": 70552.8789,
|
|
"train_tokens_per_second": 115479.845
|
|
},
|
|
{
|
|
"epoch": 0.8414729835763954,
|
|
"grad_norm": 0.12788882851600647,
|
|
"learning_rate": 0.0007886634219214321,
|
|
"loss": 2.9774459838867187,
|
|
"num_input_tokens_seen": 8152678400,
|
|
"step": 15550,
|
|
"train_runtime": 70597.9816,
|
|
"train_tokens_per_second": 115480.333
|
|
},
|
|
{
|
|
"epoch": 0.8420141237587597,
|
|
"grad_norm": 0.1323845237493515,
|
|
"learning_rate": 0.0007867399300293693,
|
|
"loss": 2.971846008300781,
|
|
"num_input_tokens_seen": 8157921280,
|
|
"step": 15560,
|
|
"train_runtime": 70643.081,
|
|
"train_tokens_per_second": 115480.825
|
|
},
|
|
{
|
|
"epoch": 0.8425552639411239,
|
|
"grad_norm": 0.132669135928154,
|
|
"learning_rate": 0.0007848224318799821,
|
|
"loss": 2.9736881256103516,
|
|
"num_input_tokens_seen": 8163164160,
|
|
"step": 15570,
|
|
"train_runtime": 70688.1702,
|
|
"train_tokens_per_second": 115481.334
|
|
},
|
|
{
|
|
"epoch": 0.8430964041234882,
|
|
"grad_norm": 0.1315847635269165,
|
|
"learning_rate": 0.0007829109333273051,
|
|
"loss": 2.9581043243408205,
|
|
"num_input_tokens_seen": 8168407040,
|
|
"step": 15580,
|
|
"train_runtime": 70733.2527,
|
|
"train_tokens_per_second": 115481.852
|
|
},
|
|
{
|
|
"epoch": 0.8436375443058525,
|
|
"grad_norm": 0.13508620858192444,
|
|
"learning_rate": 0.0007810054402070547,
|
|
"loss": 2.967576789855957,
|
|
"num_input_tokens_seen": 8173649920,
|
|
"step": 15590,
|
|
"train_runtime": 70778.3173,
|
|
"train_tokens_per_second": 115482.4
|
|
},
|
|
{
|
|
"epoch": 0.8441786844882166,
|
|
"grad_norm": 0.13094158470630646,
|
|
"learning_rate": 0.0007791059583366134,
|
|
"loss": 2.969736671447754,
|
|
"num_input_tokens_seen": 8178892800,
|
|
"step": 15600,
|
|
"train_runtime": 70823.3875,
|
|
"train_tokens_per_second": 115482.937
|
|
},
|
|
{
|
|
"epoch": 0.8447198246705809,
|
|
"grad_norm": 0.13293389976024628,
|
|
"learning_rate": 0.0007772124935150125,
|
|
"loss": 2.9740530014038087,
|
|
"num_input_tokens_seen": 8184135680,
|
|
"step": 15610,
|
|
"train_runtime": 70868.5107,
|
|
"train_tokens_per_second": 115483.387
|
|
},
|
|
{
|
|
"epoch": 0.8452609648529451,
|
|
"grad_norm": 0.12885726988315582,
|
|
"learning_rate": 0.0007753250515229127,
|
|
"loss": 2.9699680328369142,
|
|
"num_input_tokens_seen": 8189378560,
|
|
"step": 15620,
|
|
"train_runtime": 70913.6516,
|
|
"train_tokens_per_second": 115483.808
|
|
},
|
|
{
|
|
"epoch": 0.8458021050353094,
|
|
"grad_norm": 0.13280688226222992,
|
|
"learning_rate": 0.0007734436381225877,
|
|
"loss": 2.9740190505981445,
|
|
"num_input_tokens_seen": 8194621440,
|
|
"step": 15630,
|
|
"train_runtime": 70958.7738,
|
|
"train_tokens_per_second": 115484.259
|
|
},
|
|
{
|
|
"epoch": 0.8463432452176737,
|
|
"grad_norm": 0.13439851999282837,
|
|
"learning_rate": 0.0007715682590579061,
|
|
"loss": 2.975991439819336,
|
|
"num_input_tokens_seen": 8199864320,
|
|
"step": 15640,
|
|
"train_runtime": 71003.8731,
|
|
"train_tokens_per_second": 115484.747
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 18480,
|
|
"num_input_tokens_seen": 8200388608,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 1000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 9.287888719514173e+18,
|
|
"train_batch_size": 32,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|