19064 lines
499 KiB
JSON
19064 lines
499 KiB
JSON
{
|
|
"best_global_step": 10620,
|
|
"best_metric": 0.05219457671046257,
|
|
"best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_qnli_42_1779286681/checkpoint-10620",
|
|
"epoch": 1.0,
|
|
"eval_steps": 590,
|
|
"global_step": 11784,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0004243041412084182,
|
|
"grad_norm": 248.84678649902344,
|
|
"learning_rate": 6.785411365564037e-09,
|
|
"loss": 0.8749,
|
|
"num_input_tokens_seen": 4992,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.0008486082824168364,
|
|
"grad_norm": 288.7929382324219,
|
|
"learning_rate": 1.526717557251908e-08,
|
|
"loss": 0.957,
|
|
"num_input_tokens_seen": 9536,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0012729124236252546,
|
|
"grad_norm": 252.544921875,
|
|
"learning_rate": 2.374893977947413e-08,
|
|
"loss": 0.8848,
|
|
"num_input_tokens_seen": 14016,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.0016972165648336728,
|
|
"grad_norm": 305.71954345703125,
|
|
"learning_rate": 3.223070398642917e-08,
|
|
"loss": 0.8918,
|
|
"num_input_tokens_seen": 19648,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.002121520706042091,
|
|
"grad_norm": 272.7145080566406,
|
|
"learning_rate": 4.0712468193384224e-08,
|
|
"loss": 0.8435,
|
|
"num_input_tokens_seen": 24768,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.0025458248472505093,
|
|
"grad_norm": 294.4197692871094,
|
|
"learning_rate": 4.919423240033927e-08,
|
|
"loss": 0.8892,
|
|
"num_input_tokens_seen": 29952,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.0029701289884589274,
|
|
"grad_norm": 269.132568359375,
|
|
"learning_rate": 5.767599660729432e-08,
|
|
"loss": 0.8939,
|
|
"num_input_tokens_seen": 34304,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.0033944331296673455,
|
|
"grad_norm": 301.6712646484375,
|
|
"learning_rate": 6.615776081424935e-08,
|
|
"loss": 0.831,
|
|
"num_input_tokens_seen": 39360,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.0038187372708757637,
|
|
"grad_norm": 254.58384704589844,
|
|
"learning_rate": 7.463952502120441e-08,
|
|
"loss": 0.8454,
|
|
"num_input_tokens_seen": 44480,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.004243041412084182,
|
|
"grad_norm": 201.4508056640625,
|
|
"learning_rate": 8.312128922815945e-08,
|
|
"loss": 0.688,
|
|
"num_input_tokens_seen": 49152,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.0046673455532926,
|
|
"grad_norm": 152.2429656982422,
|
|
"learning_rate": 9.16030534351145e-08,
|
|
"loss": 0.5179,
|
|
"num_input_tokens_seen": 53696,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.0050916496945010185,
|
|
"grad_norm": 152.25869750976562,
|
|
"learning_rate": 1.0008481764206955e-07,
|
|
"loss": 0.4914,
|
|
"num_input_tokens_seen": 58560,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.005515953835709436,
|
|
"grad_norm": 108.48810577392578,
|
|
"learning_rate": 1.085665818490246e-07,
|
|
"loss": 0.428,
|
|
"num_input_tokens_seen": 63808,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.005940257976917855,
|
|
"grad_norm": 30.594669342041016,
|
|
"learning_rate": 1.1704834605597964e-07,
|
|
"loss": 0.3349,
|
|
"num_input_tokens_seen": 68096,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.006364562118126273,
|
|
"grad_norm": 19.845989227294922,
|
|
"learning_rate": 1.2553011026293469e-07,
|
|
"loss": 0.2364,
|
|
"num_input_tokens_seen": 72448,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.006788866259334691,
|
|
"grad_norm": 21.346294403076172,
|
|
"learning_rate": 1.3401187446988974e-07,
|
|
"loss": 0.2142,
|
|
"num_input_tokens_seen": 78336,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.00721317040054311,
|
|
"grad_norm": 29.769168853759766,
|
|
"learning_rate": 1.4249363867684477e-07,
|
|
"loss": 0.1614,
|
|
"num_input_tokens_seen": 83072,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.007637474541751527,
|
|
"grad_norm": 19.523893356323242,
|
|
"learning_rate": 1.509754028837998e-07,
|
|
"loss": 0.1774,
|
|
"num_input_tokens_seen": 88128,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.008061778682959946,
|
|
"grad_norm": 60.22774887084961,
|
|
"learning_rate": 1.594571670907549e-07,
|
|
"loss": 0.1676,
|
|
"num_input_tokens_seen": 92992,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.008486082824168364,
|
|
"grad_norm": 34.41898727416992,
|
|
"learning_rate": 1.6793893129770992e-07,
|
|
"loss": 0.1614,
|
|
"num_input_tokens_seen": 98112,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.008910386965376781,
|
|
"grad_norm": 20.954418182373047,
|
|
"learning_rate": 1.7642069550466495e-07,
|
|
"loss": 0.1435,
|
|
"num_input_tokens_seen": 102720,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.0093346911065852,
|
|
"grad_norm": 12.607270240783691,
|
|
"learning_rate": 1.8490245971162e-07,
|
|
"loss": 0.1604,
|
|
"num_input_tokens_seen": 107520,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.009758995247793618,
|
|
"grad_norm": 22.071306228637695,
|
|
"learning_rate": 1.9338422391857507e-07,
|
|
"loss": 0.1692,
|
|
"num_input_tokens_seen": 112064,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.010183299389002037,
|
|
"grad_norm": 11.848485946655273,
|
|
"learning_rate": 2.018659881255301e-07,
|
|
"loss": 0.146,
|
|
"num_input_tokens_seen": 117184,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.010607603530210456,
|
|
"grad_norm": 22.220943450927734,
|
|
"learning_rate": 2.1034775233248513e-07,
|
|
"loss": 0.152,
|
|
"num_input_tokens_seen": 121792,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.011031907671418872,
|
|
"grad_norm": 23.014427185058594,
|
|
"learning_rate": 2.188295165394402e-07,
|
|
"loss": 0.1515,
|
|
"num_input_tokens_seen": 126272,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.011456211812627291,
|
|
"grad_norm": 41.91979217529297,
|
|
"learning_rate": 2.2731128074639524e-07,
|
|
"loss": 0.1476,
|
|
"num_input_tokens_seen": 130880,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.01188051595383571,
|
|
"grad_norm": 23.48094367980957,
|
|
"learning_rate": 2.3579304495335027e-07,
|
|
"loss": 0.1536,
|
|
"num_input_tokens_seen": 135552,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.012304820095044128,
|
|
"grad_norm": 19.954736709594727,
|
|
"learning_rate": 2.442748091603053e-07,
|
|
"loss": 0.1438,
|
|
"num_input_tokens_seen": 140928,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.012729124236252547,
|
|
"grad_norm": 50.20425033569336,
|
|
"learning_rate": 2.5275657336726036e-07,
|
|
"loss": 0.1373,
|
|
"num_input_tokens_seen": 146176,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.013153428377460964,
|
|
"grad_norm": 20.869951248168945,
|
|
"learning_rate": 2.612383375742154e-07,
|
|
"loss": 0.1685,
|
|
"num_input_tokens_seen": 151680,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.013577732518669382,
|
|
"grad_norm": 53.046871185302734,
|
|
"learning_rate": 2.697201017811705e-07,
|
|
"loss": 0.1578,
|
|
"num_input_tokens_seen": 156480,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.0140020366598778,
|
|
"grad_norm": 92.15911865234375,
|
|
"learning_rate": 2.782018659881255e-07,
|
|
"loss": 0.1476,
|
|
"num_input_tokens_seen": 161024,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.01442634080108622,
|
|
"grad_norm": 36.3692626953125,
|
|
"learning_rate": 2.866836301950806e-07,
|
|
"loss": 0.1207,
|
|
"num_input_tokens_seen": 165760,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.014850644942294636,
|
|
"grad_norm": 15.135072708129883,
|
|
"learning_rate": 2.951653944020356e-07,
|
|
"loss": 0.119,
|
|
"num_input_tokens_seen": 169984,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.015274949083503055,
|
|
"grad_norm": 15.601729393005371,
|
|
"learning_rate": 3.0364715860899065e-07,
|
|
"loss": 0.1558,
|
|
"num_input_tokens_seen": 174528,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.01569925322471147,
|
|
"grad_norm": 75.5220947265625,
|
|
"learning_rate": 3.121289228159457e-07,
|
|
"loss": 0.1104,
|
|
"num_input_tokens_seen": 179136,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.016123557365919892,
|
|
"grad_norm": 30.463899612426758,
|
|
"learning_rate": 3.206106870229007e-07,
|
|
"loss": 0.1274,
|
|
"num_input_tokens_seen": 183424,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.01654786150712831,
|
|
"grad_norm": 12.01662826538086,
|
|
"learning_rate": 3.2909245122985577e-07,
|
|
"loss": 0.0972,
|
|
"num_input_tokens_seen": 187968,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.01697216564833673,
|
|
"grad_norm": 23.106531143188477,
|
|
"learning_rate": 3.375742154368109e-07,
|
|
"loss": 0.1191,
|
|
"num_input_tokens_seen": 193152,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.017396469789545146,
|
|
"grad_norm": 46.63650131225586,
|
|
"learning_rate": 3.460559796437659e-07,
|
|
"loss": 0.1308,
|
|
"num_input_tokens_seen": 197632,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.017820773930753563,
|
|
"grad_norm": 16.362655639648438,
|
|
"learning_rate": 3.5453774385072094e-07,
|
|
"loss": 0.0999,
|
|
"num_input_tokens_seen": 202304,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.018245078071961983,
|
|
"grad_norm": 54.664066314697266,
|
|
"learning_rate": 3.63019508057676e-07,
|
|
"loss": 0.1436,
|
|
"num_input_tokens_seen": 207040,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.0186693822131704,
|
|
"grad_norm": 30.950855255126953,
|
|
"learning_rate": 3.71501272264631e-07,
|
|
"loss": 0.1029,
|
|
"num_input_tokens_seen": 212480,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.01909368635437882,
|
|
"grad_norm": 68.33805847167969,
|
|
"learning_rate": 3.7998303647158606e-07,
|
|
"loss": 0.1388,
|
|
"num_input_tokens_seen": 217728,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.019517990495587237,
|
|
"grad_norm": 26.92445182800293,
|
|
"learning_rate": 3.8846480067854107e-07,
|
|
"loss": 0.0905,
|
|
"num_input_tokens_seen": 221888,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.019942294636795654,
|
|
"grad_norm": 7.124783515930176,
|
|
"learning_rate": 3.969465648854962e-07,
|
|
"loss": 0.0759,
|
|
"num_input_tokens_seen": 226496,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.020366598778004074,
|
|
"grad_norm": 27.623140335083008,
|
|
"learning_rate": 4.0542832909245124e-07,
|
|
"loss": 0.1287,
|
|
"num_input_tokens_seen": 230720,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.02079090291921249,
|
|
"grad_norm": 134.61000061035156,
|
|
"learning_rate": 4.1391009329940624e-07,
|
|
"loss": 0.2617,
|
|
"num_input_tokens_seen": 235584,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.02121520706042091,
|
|
"grad_norm": 13.19383716583252,
|
|
"learning_rate": 4.223918575063613e-07,
|
|
"loss": 0.0804,
|
|
"num_input_tokens_seen": 241088,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.021639511201629328,
|
|
"grad_norm": 26.174644470214844,
|
|
"learning_rate": 4.3087362171331635e-07,
|
|
"loss": 0.0637,
|
|
"num_input_tokens_seen": 245568,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.022063815342837745,
|
|
"grad_norm": 59.06278610229492,
|
|
"learning_rate": 4.3935538592027136e-07,
|
|
"loss": 0.0632,
|
|
"num_input_tokens_seen": 250304,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.022488119484046165,
|
|
"grad_norm": 36.08182907104492,
|
|
"learning_rate": 4.4783715012722647e-07,
|
|
"loss": 0.0964,
|
|
"num_input_tokens_seen": 255232,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.022912423625254582,
|
|
"grad_norm": 9.69312572479248,
|
|
"learning_rate": 4.5631891433418153e-07,
|
|
"loss": 0.0901,
|
|
"num_input_tokens_seen": 259840,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.023336727766463002,
|
|
"grad_norm": 28.35649871826172,
|
|
"learning_rate": 4.6480067854113653e-07,
|
|
"loss": 0.0805,
|
|
"num_input_tokens_seen": 264768,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.02376103190767142,
|
|
"grad_norm": 41.839508056640625,
|
|
"learning_rate": 4.732824427480916e-07,
|
|
"loss": 0.1056,
|
|
"num_input_tokens_seen": 270016,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.024185336048879836,
|
|
"grad_norm": 31.131669998168945,
|
|
"learning_rate": 4.817642069550466e-07,
|
|
"loss": 0.0957,
|
|
"num_input_tokens_seen": 274496,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.024609640190088256,
|
|
"grad_norm": 20.508243560791016,
|
|
"learning_rate": 4.902459711620017e-07,
|
|
"loss": 0.0838,
|
|
"num_input_tokens_seen": 279296,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.025033944331296673,
|
|
"grad_norm": 69.49073791503906,
|
|
"learning_rate": 4.987277353689568e-07,
|
|
"loss": 0.0754,
|
|
"num_input_tokens_seen": 284288,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.025458248472505093,
|
|
"grad_norm": 31.049354553222656,
|
|
"learning_rate": 5.072094995759117e-07,
|
|
"loss": 0.1018,
|
|
"num_input_tokens_seen": 289088,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.02588255261371351,
|
|
"grad_norm": 33.53955078125,
|
|
"learning_rate": 5.156912637828668e-07,
|
|
"loss": 0.1238,
|
|
"num_input_tokens_seen": 293632,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.026306856754921927,
|
|
"grad_norm": 12.766319274902344,
|
|
"learning_rate": 5.241730279898219e-07,
|
|
"loss": 0.0808,
|
|
"num_input_tokens_seen": 298176,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.026731160896130347,
|
|
"grad_norm": 41.98097229003906,
|
|
"learning_rate": 5.326547921967769e-07,
|
|
"loss": 0.1489,
|
|
"num_input_tokens_seen": 302720,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.027155465037338764,
|
|
"grad_norm": 67.35812377929688,
|
|
"learning_rate": 5.411365564037319e-07,
|
|
"loss": 0.1374,
|
|
"num_input_tokens_seen": 307648,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.02757976917854718,
|
|
"grad_norm": 10.051847457885742,
|
|
"learning_rate": 5.49618320610687e-07,
|
|
"loss": 0.0861,
|
|
"num_input_tokens_seen": 312832,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.0280040733197556,
|
|
"grad_norm": 48.343536376953125,
|
|
"learning_rate": 5.581000848176421e-07,
|
|
"loss": 0.0949,
|
|
"num_input_tokens_seen": 317376,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.028428377460964018,
|
|
"grad_norm": 15.865279197692871,
|
|
"learning_rate": 5.66581849024597e-07,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 322560,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.02885268160217244,
|
|
"grad_norm": 29.660648345947266,
|
|
"learning_rate": 5.750636132315522e-07,
|
|
"loss": 0.0636,
|
|
"num_input_tokens_seen": 327104,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.029276985743380855,
|
|
"grad_norm": 35.54970932006836,
|
|
"learning_rate": 5.835453774385072e-07,
|
|
"loss": 0.0763,
|
|
"num_input_tokens_seen": 332160,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.029701289884589272,
|
|
"grad_norm": 67.17460632324219,
|
|
"learning_rate": 5.920271416454622e-07,
|
|
"loss": 0.1048,
|
|
"num_input_tokens_seen": 336960,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.030125594025797692,
|
|
"grad_norm": 16.28141975402832,
|
|
"learning_rate": 6.005089058524173e-07,
|
|
"loss": 0.1619,
|
|
"num_input_tokens_seen": 341696,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.03054989816700611,
|
|
"grad_norm": 44.869937896728516,
|
|
"learning_rate": 6.089906700593723e-07,
|
|
"loss": 0.1708,
|
|
"num_input_tokens_seen": 347008,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.03097420230821453,
|
|
"grad_norm": 43.148014068603516,
|
|
"learning_rate": 6.174724342663274e-07,
|
|
"loss": 0.1124,
|
|
"num_input_tokens_seen": 352256,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.03139850644942294,
|
|
"grad_norm": 14.010305404663086,
|
|
"learning_rate": 6.259541984732824e-07,
|
|
"loss": 0.0544,
|
|
"num_input_tokens_seen": 357312,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.03182281059063136,
|
|
"grad_norm": 44.6069221496582,
|
|
"learning_rate": 6.344359626802375e-07,
|
|
"loss": 0.1461,
|
|
"num_input_tokens_seen": 361728,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.032247114731839784,
|
|
"grad_norm": 32.66260528564453,
|
|
"learning_rate": 6.429177268871925e-07,
|
|
"loss": 0.1233,
|
|
"num_input_tokens_seen": 366592,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.032671418873048204,
|
|
"grad_norm": 39.97212219238281,
|
|
"learning_rate": 6.513994910941476e-07,
|
|
"loss": 0.2,
|
|
"num_input_tokens_seen": 371392,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.03309572301425662,
|
|
"grad_norm": 16.408811569213867,
|
|
"learning_rate": 6.598812553011026e-07,
|
|
"loss": 0.0928,
|
|
"num_input_tokens_seen": 376640,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.03352002715546504,
|
|
"grad_norm": 36.4805793762207,
|
|
"learning_rate": 6.683630195080576e-07,
|
|
"loss": 0.1433,
|
|
"num_input_tokens_seen": 381504,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.03394433129667346,
|
|
"grad_norm": 39.27249526977539,
|
|
"learning_rate": 6.768447837150128e-07,
|
|
"loss": 0.1205,
|
|
"num_input_tokens_seen": 385920,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.03436863543788187,
|
|
"grad_norm": 48.95637512207031,
|
|
"learning_rate": 6.853265479219677e-07,
|
|
"loss": 0.1098,
|
|
"num_input_tokens_seen": 390272,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.03479293957909029,
|
|
"grad_norm": 9.907062530517578,
|
|
"learning_rate": 6.938083121289228e-07,
|
|
"loss": 0.1179,
|
|
"num_input_tokens_seen": 396160,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.03521724372029871,
|
|
"grad_norm": 11.443187713623047,
|
|
"learning_rate": 7.022900763358778e-07,
|
|
"loss": 0.0879,
|
|
"num_input_tokens_seen": 400768,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.035641547861507125,
|
|
"grad_norm": 13.214274406433105,
|
|
"learning_rate": 7.107718405428329e-07,
|
|
"loss": 0.0615,
|
|
"num_input_tokens_seen": 405504,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.036065852002715545,
|
|
"grad_norm": 22.10379409790039,
|
|
"learning_rate": 7.192536047497879e-07,
|
|
"loss": 0.0365,
|
|
"num_input_tokens_seen": 410176,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.036490156143923966,
|
|
"grad_norm": 4.392693996429443,
|
|
"learning_rate": 7.277353689567429e-07,
|
|
"loss": 0.0658,
|
|
"num_input_tokens_seen": 415040,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.036914460285132386,
|
|
"grad_norm": 64.98529052734375,
|
|
"learning_rate": 7.36217133163698e-07,
|
|
"loss": 0.1176,
|
|
"num_input_tokens_seen": 419968,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.0373387644263408,
|
|
"grad_norm": 73.72738647460938,
|
|
"learning_rate": 7.446988973706531e-07,
|
|
"loss": 0.2028,
|
|
"num_input_tokens_seen": 424832,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.03776306856754922,
|
|
"grad_norm": 46.626014709472656,
|
|
"learning_rate": 7.531806615776081e-07,
|
|
"loss": 0.3221,
|
|
"num_input_tokens_seen": 432064,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.03818737270875764,
|
|
"grad_norm": 27.6806583404541,
|
|
"learning_rate": 7.616624257845632e-07,
|
|
"loss": 0.083,
|
|
"num_input_tokens_seen": 437184,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.03861167684996605,
|
|
"grad_norm": 19.583559036254883,
|
|
"learning_rate": 7.701441899915182e-07,
|
|
"loss": 0.0965,
|
|
"num_input_tokens_seen": 442432,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.039035980991174474,
|
|
"grad_norm": 27.98273277282715,
|
|
"learning_rate": 7.786259541984732e-07,
|
|
"loss": 0.0456,
|
|
"num_input_tokens_seen": 447040,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.039460285132382894,
|
|
"grad_norm": 54.28982162475586,
|
|
"learning_rate": 7.871077184054283e-07,
|
|
"loss": 0.132,
|
|
"num_input_tokens_seen": 452032,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.03988458927359131,
|
|
"grad_norm": 22.877153396606445,
|
|
"learning_rate": 7.955894826123833e-07,
|
|
"loss": 0.0625,
|
|
"num_input_tokens_seen": 457344,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.04030889341479973,
|
|
"grad_norm": 28.471019744873047,
|
|
"learning_rate": 8.040712468193384e-07,
|
|
"loss": 0.134,
|
|
"num_input_tokens_seen": 462336,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.04073319755600815,
|
|
"grad_norm": 23.736265182495117,
|
|
"learning_rate": 8.125530110262935e-07,
|
|
"loss": 0.0886,
|
|
"num_input_tokens_seen": 467072,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.04115750169721656,
|
|
"grad_norm": 24.191547393798828,
|
|
"learning_rate": 8.210347752332485e-07,
|
|
"loss": 0.0983,
|
|
"num_input_tokens_seen": 471616,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.04158180583842498,
|
|
"grad_norm": 30.258466720581055,
|
|
"learning_rate": 8.295165394402035e-07,
|
|
"loss": 0.0824,
|
|
"num_input_tokens_seen": 476480,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.0420061099796334,
|
|
"grad_norm": 28.902149200439453,
|
|
"learning_rate": 8.379983036471586e-07,
|
|
"loss": 0.0954,
|
|
"num_input_tokens_seen": 481536,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.04243041412084182,
|
|
"grad_norm": 15.66128921508789,
|
|
"learning_rate": 8.464800678541136e-07,
|
|
"loss": 0.1224,
|
|
"num_input_tokens_seen": 486336,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.042854718262050236,
|
|
"grad_norm": 37.88010787963867,
|
|
"learning_rate": 8.549618320610686e-07,
|
|
"loss": 0.1126,
|
|
"num_input_tokens_seen": 490944,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.043279022403258656,
|
|
"grad_norm": 7.317612171173096,
|
|
"learning_rate": 8.634435962680237e-07,
|
|
"loss": 0.0607,
|
|
"num_input_tokens_seen": 495488,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.043703326544467076,
|
|
"grad_norm": 16.47148323059082,
|
|
"learning_rate": 8.719253604749788e-07,
|
|
"loss": 0.1001,
|
|
"num_input_tokens_seen": 499840,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.04412763068567549,
|
|
"grad_norm": 25.302709579467773,
|
|
"learning_rate": 8.804071246819338e-07,
|
|
"loss": 0.0827,
|
|
"num_input_tokens_seen": 504512,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.04455193482688391,
|
|
"grad_norm": 26.834787368774414,
|
|
"learning_rate": 8.888888888888888e-07,
|
|
"loss": 0.0701,
|
|
"num_input_tokens_seen": 509376,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.04497623896809233,
|
|
"grad_norm": 32.22453308105469,
|
|
"learning_rate": 8.973706530958439e-07,
|
|
"loss": 0.0729,
|
|
"num_input_tokens_seen": 513856,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.045400543109300744,
|
|
"grad_norm": 31.164321899414062,
|
|
"learning_rate": 9.058524173027989e-07,
|
|
"loss": 0.1096,
|
|
"num_input_tokens_seen": 518976,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.045824847250509164,
|
|
"grad_norm": 70.79345703125,
|
|
"learning_rate": 9.143341815097539e-07,
|
|
"loss": 0.0947,
|
|
"num_input_tokens_seen": 524160,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.046249151391717584,
|
|
"grad_norm": 29.031068801879883,
|
|
"learning_rate": 9.228159457167091e-07,
|
|
"loss": 0.1678,
|
|
"num_input_tokens_seen": 529152,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.046673455532926005,
|
|
"grad_norm": 67.99354553222656,
|
|
"learning_rate": 9.312977099236641e-07,
|
|
"loss": 0.179,
|
|
"num_input_tokens_seen": 533824,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.04709775967413442,
|
|
"grad_norm": 16.346111297607422,
|
|
"learning_rate": 9.397794741306191e-07,
|
|
"loss": 0.0989,
|
|
"num_input_tokens_seen": 538560,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.04752206381534284,
|
|
"grad_norm": 21.097190856933594,
|
|
"learning_rate": 9.482612383375742e-07,
|
|
"loss": 0.1025,
|
|
"num_input_tokens_seen": 542784,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.04794636795655126,
|
|
"grad_norm": 12.773321151733398,
|
|
"learning_rate": 9.567430025445291e-07,
|
|
"loss": 0.1057,
|
|
"num_input_tokens_seen": 547840,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.04837067209775967,
|
|
"grad_norm": 10.032249450683594,
|
|
"learning_rate": 9.652247667514842e-07,
|
|
"loss": 0.0799,
|
|
"num_input_tokens_seen": 552448,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.04879497623896809,
|
|
"grad_norm": 11.74986457824707,
|
|
"learning_rate": 9.737065309584394e-07,
|
|
"loss": 0.1164,
|
|
"num_input_tokens_seen": 557120,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.04921928038017651,
|
|
"grad_norm": 11.373242378234863,
|
|
"learning_rate": 9.821882951653943e-07,
|
|
"loss": 0.1132,
|
|
"num_input_tokens_seen": 561536,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.049643584521384926,
|
|
"grad_norm": 16.64359474182129,
|
|
"learning_rate": 9.906700593723493e-07,
|
|
"loss": 0.0724,
|
|
"num_input_tokens_seen": 566336,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.050067888662593346,
|
|
"grad_norm": 11.130979537963867,
|
|
"learning_rate": 9.991518235793044e-07,
|
|
"loss": 0.093,
|
|
"num_input_tokens_seen": 571072,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.050067888662593346,
|
|
"eval_loss": 0.08049111068248749,
|
|
"eval_runtime": 16.5132,
|
|
"eval_samples_per_second": 634.341,
|
|
"eval_steps_per_second": 79.33,
|
|
"num_input_tokens_seen": 571072,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.050492192803801766,
|
|
"grad_norm": 21.55449676513672,
|
|
"learning_rate": 1.0076335877862595e-06,
|
|
"loss": 0.0322,
|
|
"num_input_tokens_seen": 576192,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.05091649694501019,
|
|
"grad_norm": 38.285030364990234,
|
|
"learning_rate": 1.0161153519932147e-06,
|
|
"loss": 0.1183,
|
|
"num_input_tokens_seen": 580928,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.0513408010862186,
|
|
"grad_norm": 16.391883850097656,
|
|
"learning_rate": 1.0245971162001696e-06,
|
|
"loss": 0.1237,
|
|
"num_input_tokens_seen": 585728,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.05176510522742702,
|
|
"grad_norm": 49.11682891845703,
|
|
"learning_rate": 1.0330788804071246e-06,
|
|
"loss": 0.0873,
|
|
"num_input_tokens_seen": 591040,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.05218940936863544,
|
|
"grad_norm": 33.878257751464844,
|
|
"learning_rate": 1.0415606446140797e-06,
|
|
"loss": 0.1661,
|
|
"num_input_tokens_seen": 595584,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.052613713509843854,
|
|
"grad_norm": 27.837505340576172,
|
|
"learning_rate": 1.0500424088210348e-06,
|
|
"loss": 0.144,
|
|
"num_input_tokens_seen": 600384,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.053038017651052274,
|
|
"grad_norm": 19.002344131469727,
|
|
"learning_rate": 1.0585241730279896e-06,
|
|
"loss": 0.0583,
|
|
"num_input_tokens_seen": 605248,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.053462321792260695,
|
|
"grad_norm": 7.967550754547119,
|
|
"learning_rate": 1.0670059372349449e-06,
|
|
"loss": 0.0787,
|
|
"num_input_tokens_seen": 609856,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.05388662593346911,
|
|
"grad_norm": 9.882608413696289,
|
|
"learning_rate": 1.0754877014419e-06,
|
|
"loss": 0.0427,
|
|
"num_input_tokens_seen": 614976,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.05431093007467753,
|
|
"grad_norm": 31.60948371887207,
|
|
"learning_rate": 1.083969465648855e-06,
|
|
"loss": 0.089,
|
|
"num_input_tokens_seen": 619648,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.05473523421588595,
|
|
"grad_norm": 29.103275299072266,
|
|
"learning_rate": 1.09245122985581e-06,
|
|
"loss": 0.0489,
|
|
"num_input_tokens_seen": 624896,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.05515953835709436,
|
|
"grad_norm": 17.185047149658203,
|
|
"learning_rate": 1.1009329940627649e-06,
|
|
"loss": 0.1496,
|
|
"num_input_tokens_seen": 629632,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.05558384249830278,
|
|
"grad_norm": 20.95003318786621,
|
|
"learning_rate": 1.10941475826972e-06,
|
|
"loss": 0.0988,
|
|
"num_input_tokens_seen": 634624,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.0560081466395112,
|
|
"grad_norm": 28.643211364746094,
|
|
"learning_rate": 1.1178965224766752e-06,
|
|
"loss": 0.0975,
|
|
"num_input_tokens_seen": 639360,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.05643245078071962,
|
|
"grad_norm": 20.850797653198242,
|
|
"learning_rate": 1.1263782866836303e-06,
|
|
"loss": 0.0373,
|
|
"num_input_tokens_seen": 644032,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.056856754921928036,
|
|
"grad_norm": 17.269245147705078,
|
|
"learning_rate": 1.1348600508905853e-06,
|
|
"loss": 0.0982,
|
|
"num_input_tokens_seen": 648256,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.05728105906313646,
|
|
"grad_norm": 28.658626556396484,
|
|
"learning_rate": 1.1433418150975402e-06,
|
|
"loss": 0.1079,
|
|
"num_input_tokens_seen": 653056,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.05770536320434488,
|
|
"grad_norm": 38.10745620727539,
|
|
"learning_rate": 1.1518235793044952e-06,
|
|
"loss": 0.0768,
|
|
"num_input_tokens_seen": 657664,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.05812966734555329,
|
|
"grad_norm": 28.944671630859375,
|
|
"learning_rate": 1.1603053435114503e-06,
|
|
"loss": 0.0351,
|
|
"num_input_tokens_seen": 662016,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.05855397148676171,
|
|
"grad_norm": 6.559541702270508,
|
|
"learning_rate": 1.1687871077184053e-06,
|
|
"loss": 0.1103,
|
|
"num_input_tokens_seen": 666368,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.05897827562797013,
|
|
"grad_norm": 25.122549057006836,
|
|
"learning_rate": 1.1772688719253606e-06,
|
|
"loss": 0.1014,
|
|
"num_input_tokens_seen": 671616,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.059402579769178544,
|
|
"grad_norm": 1.912219524383545,
|
|
"learning_rate": 1.1857506361323155e-06,
|
|
"loss": 0.0408,
|
|
"num_input_tokens_seen": 676288,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.059826883910386965,
|
|
"grad_norm": 6.720822334289551,
|
|
"learning_rate": 1.1942324003392705e-06,
|
|
"loss": 0.0794,
|
|
"num_input_tokens_seen": 680960,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.060251188051595385,
|
|
"grad_norm": 17.048614501953125,
|
|
"learning_rate": 1.2027141645462256e-06,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 685440,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.060675492192803805,
|
|
"grad_norm": 29.455345153808594,
|
|
"learning_rate": 1.2111959287531806e-06,
|
|
"loss": 0.105,
|
|
"num_input_tokens_seen": 690304,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.06109979633401222,
|
|
"grad_norm": 24.197946548461914,
|
|
"learning_rate": 1.2196776929601355e-06,
|
|
"loss": 0.0794,
|
|
"num_input_tokens_seen": 695040,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.06152410047522064,
|
|
"grad_norm": 11.903913497924805,
|
|
"learning_rate": 1.2281594571670907e-06,
|
|
"loss": 0.0293,
|
|
"num_input_tokens_seen": 699456,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.06194840461642906,
|
|
"grad_norm": 33.0831184387207,
|
|
"learning_rate": 1.2366412213740458e-06,
|
|
"loss": 0.1498,
|
|
"num_input_tokens_seen": 704064,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.06237270875763747,
|
|
"grad_norm": 3.6401190757751465,
|
|
"learning_rate": 1.2451229855810009e-06,
|
|
"loss": 0.0321,
|
|
"num_input_tokens_seen": 708544,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.06279701289884589,
|
|
"grad_norm": 38.597625732421875,
|
|
"learning_rate": 1.253604749787956e-06,
|
|
"loss": 0.2043,
|
|
"num_input_tokens_seen": 713088,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.0632213170400543,
|
|
"grad_norm": 16.462156295776367,
|
|
"learning_rate": 1.2620865139949108e-06,
|
|
"loss": 0.1073,
|
|
"num_input_tokens_seen": 718016,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.06364562118126273,
|
|
"grad_norm": 25.767261505126953,
|
|
"learning_rate": 1.2705682782018658e-06,
|
|
"loss": 0.0483,
|
|
"num_input_tokens_seen": 722752,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.06406992532247115,
|
|
"grad_norm": 16.970993041992188,
|
|
"learning_rate": 1.279050042408821e-06,
|
|
"loss": 0.0701,
|
|
"num_input_tokens_seen": 727872,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.06449422946367957,
|
|
"grad_norm": 21.721702575683594,
|
|
"learning_rate": 1.2875318066157761e-06,
|
|
"loss": 0.1137,
|
|
"num_input_tokens_seen": 732480,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.06491853360488799,
|
|
"grad_norm": 3.266345739364624,
|
|
"learning_rate": 1.2960135708227312e-06,
|
|
"loss": 0.0502,
|
|
"num_input_tokens_seen": 736704,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.06534283774609641,
|
|
"grad_norm": 9.310620307922363,
|
|
"learning_rate": 1.304495335029686e-06,
|
|
"loss": 0.0577,
|
|
"num_input_tokens_seen": 742848,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.06576714188730481,
|
|
"grad_norm": 3.2148964405059814,
|
|
"learning_rate": 1.3129770992366411e-06,
|
|
"loss": 0.1158,
|
|
"num_input_tokens_seen": 747072,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.06619144602851323,
|
|
"grad_norm": 1.0874511003494263,
|
|
"learning_rate": 1.3214588634435962e-06,
|
|
"loss": 0.03,
|
|
"num_input_tokens_seen": 751808,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.06661575016972165,
|
|
"grad_norm": 37.209373474121094,
|
|
"learning_rate": 1.3299406276505512e-06,
|
|
"loss": 0.0934,
|
|
"num_input_tokens_seen": 756800,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.06704005431093008,
|
|
"grad_norm": 30.665420532226562,
|
|
"learning_rate": 1.3384223918575063e-06,
|
|
"loss": 0.1509,
|
|
"num_input_tokens_seen": 761536,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.0674643584521385,
|
|
"grad_norm": 31.810848236083984,
|
|
"learning_rate": 1.3469041560644613e-06,
|
|
"loss": 0.1107,
|
|
"num_input_tokens_seen": 765824,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.06788866259334692,
|
|
"grad_norm": 1.9104334115982056,
|
|
"learning_rate": 1.3553859202714164e-06,
|
|
"loss": 0.343,
|
|
"num_input_tokens_seen": 770240,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.06831296673455534,
|
|
"grad_norm": 2.776639938354492,
|
|
"learning_rate": 1.3638676844783715e-06,
|
|
"loss": 0.117,
|
|
"num_input_tokens_seen": 775424,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.06873727087576374,
|
|
"grad_norm": 11.082282066345215,
|
|
"learning_rate": 1.3723494486853265e-06,
|
|
"loss": 0.0574,
|
|
"num_input_tokens_seen": 779904,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.06916157501697216,
|
|
"grad_norm": 10.873330116271973,
|
|
"learning_rate": 1.3808312128922814e-06,
|
|
"loss": 0.0708,
|
|
"num_input_tokens_seen": 784512,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.06958587915818058,
|
|
"grad_norm": 24.102693557739258,
|
|
"learning_rate": 1.3893129770992366e-06,
|
|
"loss": 0.0787,
|
|
"num_input_tokens_seen": 789440,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.070010183299389,
|
|
"grad_norm": 20.938024520874023,
|
|
"learning_rate": 1.3977947413061917e-06,
|
|
"loss": 0.1107,
|
|
"num_input_tokens_seen": 794240,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.07043448744059742,
|
|
"grad_norm": 9.698646545410156,
|
|
"learning_rate": 1.4062765055131467e-06,
|
|
"loss": 0.1381,
|
|
"num_input_tokens_seen": 798592,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.07085879158180584,
|
|
"grad_norm": 33.49916076660156,
|
|
"learning_rate": 1.4147582697201018e-06,
|
|
"loss": 0.1233,
|
|
"num_input_tokens_seen": 803776,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.07128309572301425,
|
|
"grad_norm": 6.460715293884277,
|
|
"learning_rate": 1.4232400339270566e-06,
|
|
"loss": 0.0565,
|
|
"num_input_tokens_seen": 809088,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.07170739986422267,
|
|
"grad_norm": 16.81247329711914,
|
|
"learning_rate": 1.4317217981340117e-06,
|
|
"loss": 0.0557,
|
|
"num_input_tokens_seen": 813824,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.07213170400543109,
|
|
"grad_norm": 7.92487907409668,
|
|
"learning_rate": 1.440203562340967e-06,
|
|
"loss": 0.0619,
|
|
"num_input_tokens_seen": 818240,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.07255600814663951,
|
|
"grad_norm": 14.726147651672363,
|
|
"learning_rate": 1.448685326547922e-06,
|
|
"loss": 0.1466,
|
|
"num_input_tokens_seen": 822400,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.07298031228784793,
|
|
"grad_norm": 44.7206916809082,
|
|
"learning_rate": 1.457167090754877e-06,
|
|
"loss": 0.1331,
|
|
"num_input_tokens_seen": 827328,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.07340461642905635,
|
|
"grad_norm": 13.617090225219727,
|
|
"learning_rate": 1.465648854961832e-06,
|
|
"loss": 0.1477,
|
|
"num_input_tokens_seen": 832320,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.07382892057026477,
|
|
"grad_norm": 21.184486389160156,
|
|
"learning_rate": 1.474130619168787e-06,
|
|
"loss": 0.0978,
|
|
"num_input_tokens_seen": 837376,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.07425322471147318,
|
|
"grad_norm": 17.4393367767334,
|
|
"learning_rate": 1.482612383375742e-06,
|
|
"loss": 0.0627,
|
|
"num_input_tokens_seen": 843008,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.0746775288526816,
|
|
"grad_norm": 11.663973808288574,
|
|
"learning_rate": 1.491094147582697e-06,
|
|
"loss": 0.106,
|
|
"num_input_tokens_seen": 847680,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.07510183299389002,
|
|
"grad_norm": 7.6909260749816895,
|
|
"learning_rate": 1.4995759117896522e-06,
|
|
"loss": 0.0601,
|
|
"num_input_tokens_seen": 852288,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.07552613713509844,
|
|
"grad_norm": 18.841503143310547,
|
|
"learning_rate": 1.5080576759966072e-06,
|
|
"loss": 0.0866,
|
|
"num_input_tokens_seen": 857280,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.07595044127630686,
|
|
"grad_norm": 15.288080215454102,
|
|
"learning_rate": 1.5165394402035623e-06,
|
|
"loss": 0.0427,
|
|
"num_input_tokens_seen": 862080,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.07637474541751528,
|
|
"grad_norm": 22.42888069152832,
|
|
"learning_rate": 1.5250212044105173e-06,
|
|
"loss": 0.1675,
|
|
"num_input_tokens_seen": 866624,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.07679904955872369,
|
|
"grad_norm": 19.062753677368164,
|
|
"learning_rate": 1.5335029686174724e-06,
|
|
"loss": 0.0776,
|
|
"num_input_tokens_seen": 871360,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.0772233536999321,
|
|
"grad_norm": 23.211612701416016,
|
|
"learning_rate": 1.5419847328244272e-06,
|
|
"loss": 0.1421,
|
|
"num_input_tokens_seen": 877632,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.07764765784114053,
|
|
"grad_norm": 10.103326797485352,
|
|
"learning_rate": 1.5504664970313825e-06,
|
|
"loss": 0.0654,
|
|
"num_input_tokens_seen": 881600,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.07807196198234895,
|
|
"grad_norm": 10.27497386932373,
|
|
"learning_rate": 1.5589482612383376e-06,
|
|
"loss": 0.061,
|
|
"num_input_tokens_seen": 886400,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.07849626612355737,
|
|
"grad_norm": 19.188682556152344,
|
|
"learning_rate": 1.5674300254452926e-06,
|
|
"loss": 0.0973,
|
|
"num_input_tokens_seen": 890880,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.07892057026476579,
|
|
"grad_norm": 18.3917236328125,
|
|
"learning_rate": 1.5759117896522477e-06,
|
|
"loss": 0.0841,
|
|
"num_input_tokens_seen": 895744,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.07934487440597421,
|
|
"grad_norm": 30.841327667236328,
|
|
"learning_rate": 1.5843935538592025e-06,
|
|
"loss": 0.1163,
|
|
"num_input_tokens_seen": 900224,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.07976917854718261,
|
|
"grad_norm": 7.467300891876221,
|
|
"learning_rate": 1.5928753180661576e-06,
|
|
"loss": 0.0599,
|
|
"num_input_tokens_seen": 905280,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.08019348268839104,
|
|
"grad_norm": 4.249171257019043,
|
|
"learning_rate": 1.6013570822731128e-06,
|
|
"loss": 0.0928,
|
|
"num_input_tokens_seen": 910336,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.08061778682959946,
|
|
"grad_norm": 12.576790809631348,
|
|
"learning_rate": 1.609838846480068e-06,
|
|
"loss": 0.0593,
|
|
"num_input_tokens_seen": 915776,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.08104209097080788,
|
|
"grad_norm": 9.395928382873535,
|
|
"learning_rate": 1.618320610687023e-06,
|
|
"loss": 0.0919,
|
|
"num_input_tokens_seen": 920512,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.0814663951120163,
|
|
"grad_norm": 45.098934173583984,
|
|
"learning_rate": 1.6268023748939778e-06,
|
|
"loss": 0.1401,
|
|
"num_input_tokens_seen": 924992,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.08189069925322472,
|
|
"grad_norm": 20.58957290649414,
|
|
"learning_rate": 1.6352841391009329e-06,
|
|
"loss": 0.141,
|
|
"num_input_tokens_seen": 929792,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.08231500339443312,
|
|
"grad_norm": 3.283543586730957,
|
|
"learning_rate": 1.643765903307888e-06,
|
|
"loss": 0.1226,
|
|
"num_input_tokens_seen": 934208,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.08273930753564154,
|
|
"grad_norm": 5.7450737953186035,
|
|
"learning_rate": 1.652247667514843e-06,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 938624,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.08316361167684996,
|
|
"grad_norm": 2.2620418071746826,
|
|
"learning_rate": 1.660729431721798e-06,
|
|
"loss": 0.0201,
|
|
"num_input_tokens_seen": 943168,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.08358791581805838,
|
|
"grad_norm": 53.71507263183594,
|
|
"learning_rate": 1.669211195928753e-06,
|
|
"loss": 0.1146,
|
|
"num_input_tokens_seen": 948032,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.0840122199592668,
|
|
"grad_norm": 23.383724212646484,
|
|
"learning_rate": 1.6776929601357082e-06,
|
|
"loss": 0.0903,
|
|
"num_input_tokens_seen": 954176,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.08443652410047522,
|
|
"grad_norm": 22.752714157104492,
|
|
"learning_rate": 1.6861747243426632e-06,
|
|
"loss": 0.1022,
|
|
"num_input_tokens_seen": 958912,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.08486082824168364,
|
|
"grad_norm": 16.344005584716797,
|
|
"learning_rate": 1.6946564885496183e-06,
|
|
"loss": 0.1969,
|
|
"num_input_tokens_seen": 963264,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.08528513238289205,
|
|
"grad_norm": 13.522501945495605,
|
|
"learning_rate": 1.7031382527565731e-06,
|
|
"loss": 0.0756,
|
|
"num_input_tokens_seen": 968256,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.08570943652410047,
|
|
"grad_norm": 25.14958381652832,
|
|
"learning_rate": 1.7116200169635284e-06,
|
|
"loss": 0.1334,
|
|
"num_input_tokens_seen": 972608,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.08613374066530889,
|
|
"grad_norm": 18.22603416442871,
|
|
"learning_rate": 1.7201017811704834e-06,
|
|
"loss": 0.0777,
|
|
"num_input_tokens_seen": 977856,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.08655804480651731,
|
|
"grad_norm": 22.536724090576172,
|
|
"learning_rate": 1.7285835453774385e-06,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 982720,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.08698234894772573,
|
|
"grad_norm": 35.503116607666016,
|
|
"learning_rate": 1.7370653095843936e-06,
|
|
"loss": 0.1303,
|
|
"num_input_tokens_seen": 987584,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.08740665308893415,
|
|
"grad_norm": 18.759981155395508,
|
|
"learning_rate": 1.7455470737913484e-06,
|
|
"loss": 0.0839,
|
|
"num_input_tokens_seen": 992448,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.08783095723014257,
|
|
"grad_norm": 24.25473403930664,
|
|
"learning_rate": 1.7540288379983035e-06,
|
|
"loss": 0.1162,
|
|
"num_input_tokens_seen": 997184,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.08825526137135098,
|
|
"grad_norm": 11.328585624694824,
|
|
"learning_rate": 1.7625106022052587e-06,
|
|
"loss": 0.0671,
|
|
"num_input_tokens_seen": 1002432,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.0886795655125594,
|
|
"grad_norm": 25.29698371887207,
|
|
"learning_rate": 1.7709923664122138e-06,
|
|
"loss": 0.0825,
|
|
"num_input_tokens_seen": 1007360,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.08910386965376782,
|
|
"grad_norm": 0.5227817893028259,
|
|
"learning_rate": 1.7794741306191686e-06,
|
|
"loss": 0.0592,
|
|
"num_input_tokens_seen": 1011968,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.08952817379497624,
|
|
"grad_norm": 40.1595458984375,
|
|
"learning_rate": 1.7879558948261237e-06,
|
|
"loss": 0.0965,
|
|
"num_input_tokens_seen": 1016896,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.08995247793618466,
|
|
"grad_norm": 4.811716556549072,
|
|
"learning_rate": 1.7964376590330787e-06,
|
|
"loss": 0.1044,
|
|
"num_input_tokens_seen": 1021952,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.09037678207739308,
|
|
"grad_norm": 20.576797485351562,
|
|
"learning_rate": 1.8049194232400338e-06,
|
|
"loss": 0.2072,
|
|
"num_input_tokens_seen": 1026560,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.09080108621860149,
|
|
"grad_norm": 1.8417634963989258,
|
|
"learning_rate": 1.813401187446989e-06,
|
|
"loss": 0.0772,
|
|
"num_input_tokens_seen": 1031360,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.09122539035980991,
|
|
"grad_norm": 30.97926139831543,
|
|
"learning_rate": 1.821882951653944e-06,
|
|
"loss": 0.1088,
|
|
"num_input_tokens_seen": 1036480,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.09164969450101833,
|
|
"grad_norm": 25.159650802612305,
|
|
"learning_rate": 1.830364715860899e-06,
|
|
"loss": 0.0824,
|
|
"num_input_tokens_seen": 1041024,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.09207399864222675,
|
|
"grad_norm": 4.7825026512146,
|
|
"learning_rate": 1.838846480067854e-06,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 1045312,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.09249830278343517,
|
|
"grad_norm": 17.933177947998047,
|
|
"learning_rate": 1.847328244274809e-06,
|
|
"loss": 0.0823,
|
|
"num_input_tokens_seen": 1050240,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.09292260692464359,
|
|
"grad_norm": 9.649979591369629,
|
|
"learning_rate": 1.8558100084817641e-06,
|
|
"loss": 0.0944,
|
|
"num_input_tokens_seen": 1055744,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.09334691106585201,
|
|
"grad_norm": 18.115781784057617,
|
|
"learning_rate": 1.864291772688719e-06,
|
|
"loss": 0.0633,
|
|
"num_input_tokens_seen": 1060352,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.09377121520706042,
|
|
"grad_norm": 8.423828125,
|
|
"learning_rate": 1.8727735368956743e-06,
|
|
"loss": 0.0993,
|
|
"num_input_tokens_seen": 1065472,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.09419551934826884,
|
|
"grad_norm": 2.74596905708313,
|
|
"learning_rate": 1.8812553011026293e-06,
|
|
"loss": 0.0645,
|
|
"num_input_tokens_seen": 1070144,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.09461982348947726,
|
|
"grad_norm": 0.3166660964488983,
|
|
"learning_rate": 1.8897370653095844e-06,
|
|
"loss": 0.0774,
|
|
"num_input_tokens_seen": 1074688,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.09504412763068568,
|
|
"grad_norm": 0.2743240296840668,
|
|
"learning_rate": 1.8982188295165394e-06,
|
|
"loss": 0.0562,
|
|
"num_input_tokens_seen": 1079040,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.0954684317718941,
|
|
"grad_norm": 40.89002990722656,
|
|
"learning_rate": 1.9067005937234943e-06,
|
|
"loss": 0.0789,
|
|
"num_input_tokens_seen": 1083456,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.09589273591310252,
|
|
"grad_norm": 19.67974090576172,
|
|
"learning_rate": 1.9151823579304493e-06,
|
|
"loss": 0.1173,
|
|
"num_input_tokens_seen": 1088064,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.09631704005431092,
|
|
"grad_norm": 27.459165573120117,
|
|
"learning_rate": 1.9236641221374044e-06,
|
|
"loss": 0.0631,
|
|
"num_input_tokens_seen": 1092544,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.09674134419551934,
|
|
"grad_norm": 3.5195469856262207,
|
|
"learning_rate": 1.9321458863443595e-06,
|
|
"loss": 0.1012,
|
|
"num_input_tokens_seen": 1097792,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.09716564833672776,
|
|
"grad_norm": 14.352964401245117,
|
|
"learning_rate": 1.9406276505513145e-06,
|
|
"loss": 0.1322,
|
|
"num_input_tokens_seen": 1102912,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.09758995247793618,
|
|
"grad_norm": 11.895064353942871,
|
|
"learning_rate": 1.9491094147582696e-06,
|
|
"loss": 0.0303,
|
|
"num_input_tokens_seen": 1107840,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.0980142566191446,
|
|
"grad_norm": 33.21426010131836,
|
|
"learning_rate": 1.9575911789652246e-06,
|
|
"loss": 0.1073,
|
|
"num_input_tokens_seen": 1112448,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.09843856076035302,
|
|
"grad_norm": 24.82822036743164,
|
|
"learning_rate": 1.9660729431721797e-06,
|
|
"loss": 0.1041,
|
|
"num_input_tokens_seen": 1117248,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.09886286490156145,
|
|
"grad_norm": 17.306224822998047,
|
|
"learning_rate": 1.9745547073791347e-06,
|
|
"loss": 0.0874,
|
|
"num_input_tokens_seen": 1121984,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.09928716904276985,
|
|
"grad_norm": 12.544316291809082,
|
|
"learning_rate": 1.98303647158609e-06,
|
|
"loss": 0.1026,
|
|
"num_input_tokens_seen": 1127040,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.09971147318397827,
|
|
"grad_norm": 10.098424911499023,
|
|
"learning_rate": 1.991518235793045e-06,
|
|
"loss": 0.0562,
|
|
"num_input_tokens_seen": 1131904,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.10013577732518669,
|
|
"grad_norm": 18.721099853515625,
|
|
"learning_rate": 2e-06,
|
|
"loss": 0.0911,
|
|
"num_input_tokens_seen": 1136384,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.10013577732518669,
|
|
"eval_loss": 0.06557556986808777,
|
|
"eval_runtime": 16.5972,
|
|
"eval_samples_per_second": 631.131,
|
|
"eval_steps_per_second": 78.929,
|
|
"num_input_tokens_seen": 1136384,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.10056008146639511,
|
|
"grad_norm": 6.374276638031006,
|
|
"learning_rate": 1.999998903046209e-06,
|
|
"loss": 0.0434,
|
|
"num_input_tokens_seen": 1140864,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.10098438560760353,
|
|
"grad_norm": 24.823820114135742,
|
|
"learning_rate": 1.999995612187243e-06,
|
|
"loss": 0.1327,
|
|
"num_input_tokens_seen": 1145408,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.10140868974881195,
|
|
"grad_norm": 37.46735763549805,
|
|
"learning_rate": 1.9999901274303226e-06,
|
|
"loss": 0.1281,
|
|
"num_input_tokens_seen": 1150400,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.10183299389002037,
|
|
"grad_norm": 47.0611457824707,
|
|
"learning_rate": 1.9999824487874795e-06,
|
|
"loss": 0.1961,
|
|
"num_input_tokens_seen": 1154880,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.10225729803122878,
|
|
"grad_norm": 1.3428001403808594,
|
|
"learning_rate": 1.999972576275561e-06,
|
|
"loss": 0.1219,
|
|
"num_input_tokens_seen": 1159552,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.1026816021724372,
|
|
"grad_norm": 13.852201461791992,
|
|
"learning_rate": 1.999960509916226e-06,
|
|
"loss": 0.0353,
|
|
"num_input_tokens_seen": 1164800,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.10310590631364562,
|
|
"grad_norm": 25.563032150268555,
|
|
"learning_rate": 1.9999462497359463e-06,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 1170304,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.10353021045485404,
|
|
"grad_norm": 28.24983787536621,
|
|
"learning_rate": 1.999929795766009e-06,
|
|
"loss": 0.0671,
|
|
"num_input_tokens_seen": 1175040,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.10395451459606246,
|
|
"grad_norm": 0.2810249924659729,
|
|
"learning_rate": 1.999911148042511e-06,
|
|
"loss": 0.006,
|
|
"num_input_tokens_seen": 1180288,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.10437881873727088,
|
|
"grad_norm": 33.06572723388672,
|
|
"learning_rate": 1.999890306606365e-06,
|
|
"loss": 0.1144,
|
|
"num_input_tokens_seen": 1185088,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.10480312287847929,
|
|
"grad_norm": 0.12836876511573792,
|
|
"learning_rate": 1.9998672715032944e-06,
|
|
"loss": 0.0921,
|
|
"num_input_tokens_seen": 1189504,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.10522742701968771,
|
|
"grad_norm": 5.7705888748168945,
|
|
"learning_rate": 1.999842042783836e-06,
|
|
"loss": 0.1853,
|
|
"num_input_tokens_seen": 1194304,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.10565173116089613,
|
|
"grad_norm": 0.7299937605857849,
|
|
"learning_rate": 1.99981462050334e-06,
|
|
"loss": 0.0506,
|
|
"num_input_tokens_seen": 1198976,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.10607603530210455,
|
|
"grad_norm": 19.693466186523438,
|
|
"learning_rate": 1.999785004721968e-06,
|
|
"loss": 0.0843,
|
|
"num_input_tokens_seen": 1203520,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.10650033944331297,
|
|
"grad_norm": 8.684469223022461,
|
|
"learning_rate": 1.9997531955046936e-06,
|
|
"loss": 0.177,
|
|
"num_input_tokens_seen": 1207808,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.10692464358452139,
|
|
"grad_norm": 2.3564505577087402,
|
|
"learning_rate": 1.9997191929213044e-06,
|
|
"loss": 0.0899,
|
|
"num_input_tokens_seen": 1212992,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.10734894772572981,
|
|
"grad_norm": 30.763608932495117,
|
|
"learning_rate": 1.999682997046398e-06,
|
|
"loss": 0.1214,
|
|
"num_input_tokens_seen": 1217344,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.10777325186693822,
|
|
"grad_norm": 35.35347366333008,
|
|
"learning_rate": 1.9996446079593855e-06,
|
|
"loss": 0.076,
|
|
"num_input_tokens_seen": 1222080,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.10819755600814664,
|
|
"grad_norm": 38.572689056396484,
|
|
"learning_rate": 1.999604025744489e-06,
|
|
"loss": 0.1441,
|
|
"num_input_tokens_seen": 1226752,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.10862186014935506,
|
|
"grad_norm": 33.32093048095703,
|
|
"learning_rate": 1.9995612504907414e-06,
|
|
"loss": 0.1371,
|
|
"num_input_tokens_seen": 1231808,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.10904616429056348,
|
|
"grad_norm": 38.79288864135742,
|
|
"learning_rate": 1.999516282291988e-06,
|
|
"loss": 0.0911,
|
|
"num_input_tokens_seen": 1236352,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.1094704684317719,
|
|
"grad_norm": 26.980831146240234,
|
|
"learning_rate": 1.9994691212468853e-06,
|
|
"loss": 0.0998,
|
|
"num_input_tokens_seen": 1241088,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.10989477257298032,
|
|
"grad_norm": 1.7668942213058472,
|
|
"learning_rate": 1.9994197674588997e-06,
|
|
"loss": 0.047,
|
|
"num_input_tokens_seen": 1246336,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.11031907671418872,
|
|
"grad_norm": 6.187965393066406,
|
|
"learning_rate": 1.999368221036309e-06,
|
|
"loss": 0.1001,
|
|
"num_input_tokens_seen": 1251648,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.11074338085539714,
|
|
"grad_norm": 22.688926696777344,
|
|
"learning_rate": 1.9993144820922015e-06,
|
|
"loss": 0.0829,
|
|
"num_input_tokens_seen": 1256448,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.11116768499660556,
|
|
"grad_norm": 3.2010271549224854,
|
|
"learning_rate": 1.9992585507444757e-06,
|
|
"loss": 0.0956,
|
|
"num_input_tokens_seen": 1261184,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.11159198913781398,
|
|
"grad_norm": 44.48208236694336,
|
|
"learning_rate": 1.999200427115839e-06,
|
|
"loss": 0.0874,
|
|
"num_input_tokens_seen": 1266304,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.1120162932790224,
|
|
"grad_norm": 7.88449239730835,
|
|
"learning_rate": 1.99914011133381e-06,
|
|
"loss": 0.0527,
|
|
"num_input_tokens_seen": 1270848,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.11244059742023083,
|
|
"grad_norm": 15.53889274597168,
|
|
"learning_rate": 1.999077603530716e-06,
|
|
"loss": 0.0292,
|
|
"num_input_tokens_seen": 1275712,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.11286490156143925,
|
|
"grad_norm": 25.560720443725586,
|
|
"learning_rate": 1.999012903843693e-06,
|
|
"loss": 0.0534,
|
|
"num_input_tokens_seen": 1280000,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.11328920570264765,
|
|
"grad_norm": 23.02499008178711,
|
|
"learning_rate": 1.9989460124146854e-06,
|
|
"loss": 0.0866,
|
|
"num_input_tokens_seen": 1285440,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.11371350984385607,
|
|
"grad_norm": 0.4588419795036316,
|
|
"learning_rate": 1.998876929390448e-06,
|
|
"loss": 0.076,
|
|
"num_input_tokens_seen": 1290176,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.11413781398506449,
|
|
"grad_norm": 37.34756851196289,
|
|
"learning_rate": 1.9988056549225423e-06,
|
|
"loss": 0.1023,
|
|
"num_input_tokens_seen": 1294912,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.11456211812627291,
|
|
"grad_norm": 6.717406749725342,
|
|
"learning_rate": 1.9987321891673375e-06,
|
|
"loss": 0.0677,
|
|
"num_input_tokens_seen": 1299136,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.11498642226748133,
|
|
"grad_norm": 9.831482887268066,
|
|
"learning_rate": 1.9986565322860116e-06,
|
|
"loss": 0.1077,
|
|
"num_input_tokens_seen": 1303936,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.11541072640868975,
|
|
"grad_norm": 21.746313095092773,
|
|
"learning_rate": 1.9985786844445474e-06,
|
|
"loss": 0.0449,
|
|
"num_input_tokens_seen": 1308928,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.11583503054989816,
|
|
"grad_norm": 31.04161262512207,
|
|
"learning_rate": 1.9984986458137366e-06,
|
|
"loss": 0.0645,
|
|
"num_input_tokens_seen": 1313728,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.11625933469110658,
|
|
"grad_norm": 1.728615403175354,
|
|
"learning_rate": 1.998416416569177e-06,
|
|
"loss": 0.079,
|
|
"num_input_tokens_seen": 1318400,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.116683638832315,
|
|
"grad_norm": 21.65105438232422,
|
|
"learning_rate": 1.9983319968912714e-06,
|
|
"loss": 0.0972,
|
|
"num_input_tokens_seen": 1322752,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.11710794297352342,
|
|
"grad_norm": 16.821231842041016,
|
|
"learning_rate": 1.9982453869652286e-06,
|
|
"loss": 0.0519,
|
|
"num_input_tokens_seen": 1327552,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.11753224711473184,
|
|
"grad_norm": 42.55325698852539,
|
|
"learning_rate": 1.9981565869810637e-06,
|
|
"loss": 0.0686,
|
|
"num_input_tokens_seen": 1331776,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.11795655125594026,
|
|
"grad_norm": 9.326438903808594,
|
|
"learning_rate": 1.998065597133594e-06,
|
|
"loss": 0.0555,
|
|
"num_input_tokens_seen": 1336128,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.11838085539714868,
|
|
"grad_norm": 22.309532165527344,
|
|
"learning_rate": 1.9979724176224447e-06,
|
|
"loss": 0.0791,
|
|
"num_input_tokens_seen": 1340800,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.11880515953835709,
|
|
"grad_norm": 8.651053428649902,
|
|
"learning_rate": 1.997877048652042e-06,
|
|
"loss": 0.1258,
|
|
"num_input_tokens_seen": 1345408,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.11922946367956551,
|
|
"grad_norm": 9.907795906066895,
|
|
"learning_rate": 1.9977794904316163e-06,
|
|
"loss": 0.1394,
|
|
"num_input_tokens_seen": 1350208,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.11965376782077393,
|
|
"grad_norm": 9.335816383361816,
|
|
"learning_rate": 1.9976797431752023e-06,
|
|
"loss": 0.0656,
|
|
"num_input_tokens_seen": 1354624,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.12007807196198235,
|
|
"grad_norm": 5.561280250549316,
|
|
"learning_rate": 1.9975778071016357e-06,
|
|
"loss": 0.0545,
|
|
"num_input_tokens_seen": 1359232,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.12050237610319077,
|
|
"grad_norm": 15.503613471984863,
|
|
"learning_rate": 1.997473682434555e-06,
|
|
"loss": 0.0829,
|
|
"num_input_tokens_seen": 1363904,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.12092668024439919,
|
|
"grad_norm": 6.7993879318237305,
|
|
"learning_rate": 1.9973673694023998e-06,
|
|
"loss": 0.0593,
|
|
"num_input_tokens_seen": 1368448,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.12135098438560761,
|
|
"grad_norm": 13.170554161071777,
|
|
"learning_rate": 1.997258868238411e-06,
|
|
"loss": 0.0838,
|
|
"num_input_tokens_seen": 1372864,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.12177528852681602,
|
|
"grad_norm": 18.046785354614258,
|
|
"learning_rate": 1.997148179180631e-06,
|
|
"loss": 0.0844,
|
|
"num_input_tokens_seen": 1377920,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.12219959266802444,
|
|
"grad_norm": 19.454439163208008,
|
|
"learning_rate": 1.9970353024719003e-06,
|
|
"loss": 0.0889,
|
|
"num_input_tokens_seen": 1382464,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.12262389680923286,
|
|
"grad_norm": 23.560047149658203,
|
|
"learning_rate": 1.9969202383598605e-06,
|
|
"loss": 0.0692,
|
|
"num_input_tokens_seen": 1387072,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.12304820095044128,
|
|
"grad_norm": 16.24759292602539,
|
|
"learning_rate": 1.996802987096952e-06,
|
|
"loss": 0.0405,
|
|
"num_input_tokens_seen": 1391488,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.1234725050916497,
|
|
"grad_norm": 18.184036254882812,
|
|
"learning_rate": 1.9966835489404123e-06,
|
|
"loss": 0.1054,
|
|
"num_input_tokens_seen": 1397440,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.12389680923285812,
|
|
"grad_norm": 0.7342911958694458,
|
|
"learning_rate": 1.996561924152278e-06,
|
|
"loss": 0.0332,
|
|
"num_input_tokens_seen": 1402048,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.12432111337406652,
|
|
"grad_norm": 37.05031967163086,
|
|
"learning_rate": 1.996438112999383e-06,
|
|
"loss": 0.06,
|
|
"num_input_tokens_seen": 1406784,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.12474541751527495,
|
|
"grad_norm": 53.91966247558594,
|
|
"learning_rate": 1.9963121157533573e-06,
|
|
"loss": 0.097,
|
|
"num_input_tokens_seen": 1411328,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.12516972165648338,
|
|
"grad_norm": 14.047359466552734,
|
|
"learning_rate": 1.9961839326906272e-06,
|
|
"loss": 0.147,
|
|
"num_input_tokens_seen": 1415936,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.12559402579769177,
|
|
"grad_norm": 45.842193603515625,
|
|
"learning_rate": 1.9960535640924146e-06,
|
|
"loss": 0.1169,
|
|
"num_input_tokens_seen": 1421248,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.1260183299389002,
|
|
"grad_norm": 11.323390007019043,
|
|
"learning_rate": 1.995921010244736e-06,
|
|
"loss": 0.0447,
|
|
"num_input_tokens_seen": 1425728,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.1264426340801086,
|
|
"grad_norm": 9.317659378051758,
|
|
"learning_rate": 1.9957862714384025e-06,
|
|
"loss": 0.0698,
|
|
"num_input_tokens_seen": 1431296,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.12686693822131703,
|
|
"grad_norm": 9.703478813171387,
|
|
"learning_rate": 1.9956493479690188e-06,
|
|
"loss": 0.0719,
|
|
"num_input_tokens_seen": 1436160,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.12729124236252545,
|
|
"grad_norm": 32.284366607666016,
|
|
"learning_rate": 1.9955102401369814e-06,
|
|
"loss": 0.0953,
|
|
"num_input_tokens_seen": 1440960,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.12771554650373387,
|
|
"grad_norm": 15.180221557617188,
|
|
"learning_rate": 1.9953689482474806e-06,
|
|
"loss": 0.0475,
|
|
"num_input_tokens_seen": 1445760,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.1281398506449423,
|
|
"grad_norm": 0.9210243225097656,
|
|
"learning_rate": 1.995225472610498e-06,
|
|
"loss": 0.0238,
|
|
"num_input_tokens_seen": 1450688,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.12856415478615071,
|
|
"grad_norm": 53.84825134277344,
|
|
"learning_rate": 1.9950798135408057e-06,
|
|
"loss": 0.1675,
|
|
"num_input_tokens_seen": 1455552,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.12898845892735913,
|
|
"grad_norm": 25.23078727722168,
|
|
"learning_rate": 1.994931971357966e-06,
|
|
"loss": 0.0899,
|
|
"num_input_tokens_seen": 1460416,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.12941276306856755,
|
|
"grad_norm": 3.3620643615722656,
|
|
"learning_rate": 1.9947819463863316e-06,
|
|
"loss": 0.0524,
|
|
"num_input_tokens_seen": 1466432,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.12983706720977597,
|
|
"grad_norm": 26.906789779663086,
|
|
"learning_rate": 1.9946297389550432e-06,
|
|
"loss": 0.1081,
|
|
"num_input_tokens_seen": 1471232,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.1302613713509844,
|
|
"grad_norm": 18.103910446166992,
|
|
"learning_rate": 1.9944753493980292e-06,
|
|
"loss": 0.0741,
|
|
"num_input_tokens_seen": 1476160,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.13068567549219282,
|
|
"grad_norm": 21.42559242248535,
|
|
"learning_rate": 1.9943187780540062e-06,
|
|
"loss": 0.1012,
|
|
"num_input_tokens_seen": 1481152,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.13110997963340124,
|
|
"grad_norm": 7.43846321105957,
|
|
"learning_rate": 1.994160025266478e-06,
|
|
"loss": 0.1522,
|
|
"num_input_tokens_seen": 1486336,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.13153428377460963,
|
|
"grad_norm": 14.285847663879395,
|
|
"learning_rate": 1.9939990913837327e-06,
|
|
"loss": 0.0736,
|
|
"num_input_tokens_seen": 1491264,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.13195858791581805,
|
|
"grad_norm": 9.191274642944336,
|
|
"learning_rate": 1.993835976758845e-06,
|
|
"loss": 0.0732,
|
|
"num_input_tokens_seen": 1495680,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.13238289205702647,
|
|
"grad_norm": 36.20352554321289,
|
|
"learning_rate": 1.993670681749673e-06,
|
|
"loss": 0.1633,
|
|
"num_input_tokens_seen": 1501376,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.1328071961982349,
|
|
"grad_norm": 2.3607308864593506,
|
|
"learning_rate": 1.9935032067188587e-06,
|
|
"loss": 0.1059,
|
|
"num_input_tokens_seen": 1506176,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.1332315003394433,
|
|
"grad_norm": 20.173490524291992,
|
|
"learning_rate": 1.993333552033827e-06,
|
|
"loss": 0.0609,
|
|
"num_input_tokens_seen": 1511808,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.13365580448065173,
|
|
"grad_norm": 12.452956199645996,
|
|
"learning_rate": 1.9931617180667844e-06,
|
|
"loss": 0.044,
|
|
"num_input_tokens_seen": 1516608,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.13408010862186015,
|
|
"grad_norm": 19.202735900878906,
|
|
"learning_rate": 1.992987705194719e-06,
|
|
"loss": 0.1053,
|
|
"num_input_tokens_seen": 1521280,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.13450441276306857,
|
|
"grad_norm": 4.530014991760254,
|
|
"learning_rate": 1.9928115137993983e-06,
|
|
"loss": 0.0665,
|
|
"num_input_tokens_seen": 1526080,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.134928716904277,
|
|
"grad_norm": 19.237306594848633,
|
|
"learning_rate": 1.9926331442673703e-06,
|
|
"loss": 0.0412,
|
|
"num_input_tokens_seen": 1530944,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.1353530210454854,
|
|
"grad_norm": 28.674280166625977,
|
|
"learning_rate": 1.992452596989962e-06,
|
|
"loss": 0.0928,
|
|
"num_input_tokens_seen": 1536256,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.13577732518669383,
|
|
"grad_norm": 27.277175903320312,
|
|
"learning_rate": 1.9922698723632763e-06,
|
|
"loss": 0.0756,
|
|
"num_input_tokens_seen": 1540864,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.13620162932790225,
|
|
"grad_norm": 16.0638484954834,
|
|
"learning_rate": 1.992084970788195e-06,
|
|
"loss": 0.0712,
|
|
"num_input_tokens_seen": 1545536,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.13662593346911067,
|
|
"grad_norm": 16.823604583740234,
|
|
"learning_rate": 1.991897892670375e-06,
|
|
"loss": 0.1171,
|
|
"num_input_tokens_seen": 1550144,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.13705023761031906,
|
|
"grad_norm": 0.8663851022720337,
|
|
"learning_rate": 1.9917086384202475e-06,
|
|
"loss": 0.0522,
|
|
"num_input_tokens_seen": 1554624,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.13747454175152748,
|
|
"grad_norm": 21.397823333740234,
|
|
"learning_rate": 1.9915172084530195e-06,
|
|
"loss": 0.0926,
|
|
"num_input_tokens_seen": 1559168,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.1378988458927359,
|
|
"grad_norm": 17.87856101989746,
|
|
"learning_rate": 1.9913236031886707e-06,
|
|
"loss": 0.1126,
|
|
"num_input_tokens_seen": 1564032,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.13832315003394433,
|
|
"grad_norm": 4.989554405212402,
|
|
"learning_rate": 1.9911278230519533e-06,
|
|
"loss": 0.0996,
|
|
"num_input_tokens_seen": 1568896,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.13874745417515275,
|
|
"grad_norm": 14.12590217590332,
|
|
"learning_rate": 1.9909298684723905e-06,
|
|
"loss": 0.0811,
|
|
"num_input_tokens_seen": 1573888,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.13917175831636117,
|
|
"grad_norm": 0.8297198414802551,
|
|
"learning_rate": 1.9907297398842764e-06,
|
|
"loss": 0.0792,
|
|
"num_input_tokens_seen": 1578496,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.1395960624575696,
|
|
"grad_norm": 39.59033966064453,
|
|
"learning_rate": 1.9905274377266744e-06,
|
|
"loss": 0.0503,
|
|
"num_input_tokens_seen": 1583104,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.140020366598778,
|
|
"grad_norm": 47.28880310058594,
|
|
"learning_rate": 1.9903229624434174e-06,
|
|
"loss": 0.1076,
|
|
"num_input_tokens_seen": 1587648,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.14044467073998643,
|
|
"grad_norm": 17.418458938598633,
|
|
"learning_rate": 1.9901163144831047e-06,
|
|
"loss": 0.1521,
|
|
"num_input_tokens_seen": 1593216,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.14086897488119485,
|
|
"grad_norm": 34.3858757019043,
|
|
"learning_rate": 1.989907494299103e-06,
|
|
"loss": 0.0168,
|
|
"num_input_tokens_seen": 1598208,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.14129327902240327,
|
|
"grad_norm": 0.5869410037994385,
|
|
"learning_rate": 1.989696502349545e-06,
|
|
"loss": 0.0033,
|
|
"num_input_tokens_seen": 1602688,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.1417175831636117,
|
|
"grad_norm": 28.96558380126953,
|
|
"learning_rate": 1.9894833390973266e-06,
|
|
"loss": 0.1475,
|
|
"num_input_tokens_seen": 1606784,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.1421418873048201,
|
|
"grad_norm": 39.61481475830078,
|
|
"learning_rate": 1.9892680050101085e-06,
|
|
"loss": 0.1722,
|
|
"num_input_tokens_seen": 1611584,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.1425661914460285,
|
|
"grad_norm": 16.260469436645508,
|
|
"learning_rate": 1.9890505005603146e-06,
|
|
"loss": 0.1062,
|
|
"num_input_tokens_seen": 1616576,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.14299049558723692,
|
|
"grad_norm": 15.272690773010254,
|
|
"learning_rate": 1.9888308262251284e-06,
|
|
"loss": 0.0872,
|
|
"num_input_tokens_seen": 1621440,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.14341479972844534,
|
|
"grad_norm": 10.720672607421875,
|
|
"learning_rate": 1.9886089824864956e-06,
|
|
"loss": 0.076,
|
|
"num_input_tokens_seen": 1626368,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.14383910386965376,
|
|
"grad_norm": 18.04888916015625,
|
|
"learning_rate": 1.9883849698311213e-06,
|
|
"loss": 0.0351,
|
|
"num_input_tokens_seen": 1630784,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.14426340801086218,
|
|
"grad_norm": 14.322383880615234,
|
|
"learning_rate": 1.988158788750468e-06,
|
|
"loss": 0.0776,
|
|
"num_input_tokens_seen": 1635776,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.1446877121520706,
|
|
"grad_norm": 17.74030113220215,
|
|
"learning_rate": 1.9879304397407566e-06,
|
|
"loss": 0.0998,
|
|
"num_input_tokens_seen": 1640448,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.14511201629327902,
|
|
"grad_norm": 24.689001083374023,
|
|
"learning_rate": 1.987699923302963e-06,
|
|
"loss": 0.0343,
|
|
"num_input_tokens_seen": 1645440,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.14553632043448744,
|
|
"grad_norm": 50.191375732421875,
|
|
"learning_rate": 1.9874672399428195e-06,
|
|
"loss": 0.0939,
|
|
"num_input_tokens_seen": 1649984,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.14596062457569586,
|
|
"grad_norm": 45.67329788208008,
|
|
"learning_rate": 1.9872323901708116e-06,
|
|
"loss": 0.1503,
|
|
"num_input_tokens_seen": 1654720,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.14638492871690428,
|
|
"grad_norm": 16.70132064819336,
|
|
"learning_rate": 1.9869953745021785e-06,
|
|
"loss": 0.1076,
|
|
"num_input_tokens_seen": 1659648,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.1468092328581127,
|
|
"grad_norm": 0.9173280596733093,
|
|
"learning_rate": 1.9867561934569103e-06,
|
|
"loss": 0.061,
|
|
"num_input_tokens_seen": 1664896,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.14723353699932112,
|
|
"grad_norm": 5.715014457702637,
|
|
"learning_rate": 1.9865148475597475e-06,
|
|
"loss": 0.0399,
|
|
"num_input_tokens_seen": 1669568,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.14765784114052954,
|
|
"grad_norm": 3.6975412368774414,
|
|
"learning_rate": 1.986271337340182e-06,
|
|
"loss": 0.0846,
|
|
"num_input_tokens_seen": 1674432,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.14808214528173794,
|
|
"grad_norm": 9.450813293457031,
|
|
"learning_rate": 1.9860256633324513e-06,
|
|
"loss": 0.0118,
|
|
"num_input_tokens_seen": 1678720,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.14850644942294636,
|
|
"grad_norm": 9.659982681274414,
|
|
"learning_rate": 1.9857778260755426e-06,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 1683904,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.14893075356415478,
|
|
"grad_norm": 11.003069877624512,
|
|
"learning_rate": 1.9855278261131876e-06,
|
|
"loss": 0.0806,
|
|
"num_input_tokens_seen": 1689024,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.1493550577053632,
|
|
"grad_norm": 2.595398426055908,
|
|
"learning_rate": 1.985275663993863e-06,
|
|
"loss": 0.0878,
|
|
"num_input_tokens_seen": 1693632,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.14977936184657162,
|
|
"grad_norm": 20.050880432128906,
|
|
"learning_rate": 1.9850213402707888e-06,
|
|
"loss": 0.0456,
|
|
"num_input_tokens_seen": 1698304,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.15020366598778004,
|
|
"grad_norm": 30.87312126159668,
|
|
"learning_rate": 1.9847648555019286e-06,
|
|
"loss": 0.1636,
|
|
"num_input_tokens_seen": 1703808,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.15020366598778004,
|
|
"eval_loss": 0.07898761332035065,
|
|
"eval_runtime": 16.6934,
|
|
"eval_samples_per_second": 627.493,
|
|
"eval_steps_per_second": 78.474,
|
|
"num_input_tokens_seen": 1703808,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.15062797012898846,
|
|
"grad_norm": 7.343339443206787,
|
|
"learning_rate": 1.9845062102499858e-06,
|
|
"loss": 0.0886,
|
|
"num_input_tokens_seen": 1708992,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.15105227427019688,
|
|
"grad_norm": 2.2905964851379395,
|
|
"learning_rate": 1.9842454050824043e-06,
|
|
"loss": 0.0666,
|
|
"num_input_tokens_seen": 1713600,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.1514765784114053,
|
|
"grad_norm": 21.04395294189453,
|
|
"learning_rate": 1.9839824405713663e-06,
|
|
"loss": 0.0856,
|
|
"num_input_tokens_seen": 1718208,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.15190088255261372,
|
|
"grad_norm": 17.02180290222168,
|
|
"learning_rate": 1.983717317293792e-06,
|
|
"loss": 0.1331,
|
|
"num_input_tokens_seen": 1722560,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.15232518669382214,
|
|
"grad_norm": 3.3538708686828613,
|
|
"learning_rate": 1.983450035831337e-06,
|
|
"loss": 0.0212,
|
|
"num_input_tokens_seen": 1727296,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.15274949083503056,
|
|
"grad_norm": 12.217562675476074,
|
|
"learning_rate": 1.983180596770392e-06,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 1732608,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.15317379497623898,
|
|
"grad_norm": 0.4397600293159485,
|
|
"learning_rate": 1.982909000702082e-06,
|
|
"loss": 0.0374,
|
|
"num_input_tokens_seen": 1737280,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.15359809911744737,
|
|
"grad_norm": 1.4056516885757446,
|
|
"learning_rate": 1.982635248222264e-06,
|
|
"loss": 0.0789,
|
|
"num_input_tokens_seen": 1741440,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.1540224032586558,
|
|
"grad_norm": 39.428802490234375,
|
|
"learning_rate": 1.982359339931524e-06,
|
|
"loss": 0.053,
|
|
"num_input_tokens_seen": 1746176,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.1544467073998642,
|
|
"grad_norm": 32.33784484863281,
|
|
"learning_rate": 1.9820812764351804e-06,
|
|
"loss": 0.0999,
|
|
"num_input_tokens_seen": 1751680,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.15487101154107263,
|
|
"grad_norm": 0.360365092754364,
|
|
"learning_rate": 1.981801058343279e-06,
|
|
"loss": 0.0435,
|
|
"num_input_tokens_seen": 1756416,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.15529531568228105,
|
|
"grad_norm": 0.27506718039512634,
|
|
"learning_rate": 1.981518686270592e-06,
|
|
"loss": 0.129,
|
|
"num_input_tokens_seen": 1760960,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.15571961982348947,
|
|
"grad_norm": 36.94242477416992,
|
|
"learning_rate": 1.9812341608366183e-06,
|
|
"loss": 0.1015,
|
|
"num_input_tokens_seen": 1766208,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.1561439239646979,
|
|
"grad_norm": 5.320521831512451,
|
|
"learning_rate": 1.980947482665579e-06,
|
|
"loss": 0.0396,
|
|
"num_input_tokens_seen": 1771264,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.15656822810590632,
|
|
"grad_norm": 9.891328811645508,
|
|
"learning_rate": 1.980658652386421e-06,
|
|
"loss": 0.1701,
|
|
"num_input_tokens_seen": 1776192,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.15699253224711474,
|
|
"grad_norm": 2.218855857849121,
|
|
"learning_rate": 1.9803676706328102e-06,
|
|
"loss": 0.0937,
|
|
"num_input_tokens_seen": 1780992,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.15741683638832316,
|
|
"grad_norm": 13.614046096801758,
|
|
"learning_rate": 1.980074538043134e-06,
|
|
"loss": 0.0559,
|
|
"num_input_tokens_seen": 1785408,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.15784114052953158,
|
|
"grad_norm": 21.1054744720459,
|
|
"learning_rate": 1.9797792552604985e-06,
|
|
"loss": 0.106,
|
|
"num_input_tokens_seen": 1790208,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.15826544467074,
|
|
"grad_norm": 27.84735107421875,
|
|
"learning_rate": 1.9794818229327266e-06,
|
|
"loss": 0.0207,
|
|
"num_input_tokens_seen": 1795264,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 0.15868974881194842,
|
|
"grad_norm": 31.6236572265625,
|
|
"learning_rate": 1.9791822417123576e-06,
|
|
"loss": 0.0464,
|
|
"num_input_tokens_seen": 1800064,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.1591140529531568,
|
|
"grad_norm": 6.656075477600098,
|
|
"learning_rate": 1.9788805122566445e-06,
|
|
"loss": 0.073,
|
|
"num_input_tokens_seen": 1804672,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 0.15953835709436523,
|
|
"grad_norm": 39.10996627807617,
|
|
"learning_rate": 1.9785766352275538e-06,
|
|
"loss": 0.1065,
|
|
"num_input_tokens_seen": 1809408,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.15996266123557365,
|
|
"grad_norm": 29.846555709838867,
|
|
"learning_rate": 1.9782706112917643e-06,
|
|
"loss": 0.1504,
|
|
"num_input_tokens_seen": 1813824,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 0.16038696537678207,
|
|
"grad_norm": 9.747427940368652,
|
|
"learning_rate": 1.977962441120664e-06,
|
|
"loss": 0.0695,
|
|
"num_input_tokens_seen": 1818176,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.1608112695179905,
|
|
"grad_norm": 0.5947927832603455,
|
|
"learning_rate": 1.9776521253903492e-06,
|
|
"loss": 0.0607,
|
|
"num_input_tokens_seen": 1822784,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 0.1612355736591989,
|
|
"grad_norm": 8.875264167785645,
|
|
"learning_rate": 1.9773396647816246e-06,
|
|
"loss": 0.1294,
|
|
"num_input_tokens_seen": 1827520,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.16165987780040733,
|
|
"grad_norm": 17.60746192932129,
|
|
"learning_rate": 1.97702505998e-06,
|
|
"loss": 0.0832,
|
|
"num_input_tokens_seen": 1832256,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 0.16208418194161575,
|
|
"grad_norm": 11.24863338470459,
|
|
"learning_rate": 1.976708311675688e-06,
|
|
"loss": 0.0674,
|
|
"num_input_tokens_seen": 1836864,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.16250848608282417,
|
|
"grad_norm": 7.13277006149292,
|
|
"learning_rate": 1.976389420563607e-06,
|
|
"loss": 0.0357,
|
|
"num_input_tokens_seen": 1841280,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 0.1629327902240326,
|
|
"grad_norm": 8.245125770568848,
|
|
"learning_rate": 1.9760683873433734e-06,
|
|
"loss": 0.0758,
|
|
"num_input_tokens_seen": 1846080,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.163357094365241,
|
|
"grad_norm": 0.40325310826301575,
|
|
"learning_rate": 1.9757452127193043e-06,
|
|
"loss": 0.0643,
|
|
"num_input_tokens_seen": 1850816,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 0.16378139850644943,
|
|
"grad_norm": 6.0531721115112305,
|
|
"learning_rate": 1.9754198974004156e-06,
|
|
"loss": 0.0518,
|
|
"num_input_tokens_seen": 1855232,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.16420570264765785,
|
|
"grad_norm": 6.1818528175354,
|
|
"learning_rate": 1.975092442100419e-06,
|
|
"loss": 0.0657,
|
|
"num_input_tokens_seen": 1860160,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 0.16463000678886625,
|
|
"grad_norm": 11.560309410095215,
|
|
"learning_rate": 1.9747628475377204e-06,
|
|
"loss": 0.0385,
|
|
"num_input_tokens_seen": 1865024,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.16505431093007467,
|
|
"grad_norm": 46.3505859375,
|
|
"learning_rate": 1.9744311144354208e-06,
|
|
"loss": 0.0816,
|
|
"num_input_tokens_seen": 1869888,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 0.16547861507128309,
|
|
"grad_norm": 0.5927810668945312,
|
|
"learning_rate": 1.9740972435213112e-06,
|
|
"loss": 0.1011,
|
|
"num_input_tokens_seen": 1874624,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.1659029192124915,
|
|
"grad_norm": 22.69270896911621,
|
|
"learning_rate": 1.973761235527874e-06,
|
|
"loss": 0.0588,
|
|
"num_input_tokens_seen": 1879168,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 0.16632722335369993,
|
|
"grad_norm": 73.64214324951172,
|
|
"learning_rate": 1.9734230911922795e-06,
|
|
"loss": 0.1602,
|
|
"num_input_tokens_seen": 1884096,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.16675152749490835,
|
|
"grad_norm": 42.7552604675293,
|
|
"learning_rate": 1.9730828112563852e-06,
|
|
"loss": 0.0629,
|
|
"num_input_tokens_seen": 1888832,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 0.16717583163611677,
|
|
"grad_norm": 40.55192947387695,
|
|
"learning_rate": 1.972740396466734e-06,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 1893696,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.1676001357773252,
|
|
"grad_norm": 6.033570766448975,
|
|
"learning_rate": 1.972395847574552e-06,
|
|
"loss": 0.0552,
|
|
"num_input_tokens_seen": 1898176,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 0.1680244399185336,
|
|
"grad_norm": 6.996667385101318,
|
|
"learning_rate": 1.972049165335747e-06,
|
|
"loss": 0.0815,
|
|
"num_input_tokens_seen": 1902720,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.16844874405974203,
|
|
"grad_norm": 36.47344970703125,
|
|
"learning_rate": 1.9717003505109094e-06,
|
|
"loss": 0.0606,
|
|
"num_input_tokens_seen": 1907520,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 0.16887304820095045,
|
|
"grad_norm": 22.555927276611328,
|
|
"learning_rate": 1.9713494038653054e-06,
|
|
"loss": 0.0846,
|
|
"num_input_tokens_seen": 1912000,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.16929735234215887,
|
|
"grad_norm": 7.1749467849731445,
|
|
"learning_rate": 1.97099632616888e-06,
|
|
"loss": 0.059,
|
|
"num_input_tokens_seen": 1916224,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 0.1697216564833673,
|
|
"grad_norm": 1.437082290649414,
|
|
"learning_rate": 1.9706411181962534e-06,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 1920896,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.1701459606245757,
|
|
"grad_norm": 25.92020034790039,
|
|
"learning_rate": 1.970283780726718e-06,
|
|
"loss": 0.0372,
|
|
"num_input_tokens_seen": 1925312,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 0.1705702647657841,
|
|
"grad_norm": 0.5283477902412415,
|
|
"learning_rate": 1.9699243145442397e-06,
|
|
"loss": 0.0983,
|
|
"num_input_tokens_seen": 1929920,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.17099456890699252,
|
|
"grad_norm": 27.817808151245117,
|
|
"learning_rate": 1.9695627204374544e-06,
|
|
"loss": 0.0984,
|
|
"num_input_tokens_seen": 1934720,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 0.17141887304820094,
|
|
"grad_norm": 27.818002700805664,
|
|
"learning_rate": 1.969198999199666e-06,
|
|
"loss": 0.0461,
|
|
"num_input_tokens_seen": 1939584,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.17184317718940936,
|
|
"grad_norm": 11.765695571899414,
|
|
"learning_rate": 1.968833151628845e-06,
|
|
"loss": 0.1322,
|
|
"num_input_tokens_seen": 1944576,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 0.17226748133061778,
|
|
"grad_norm": 26.49534034729004,
|
|
"learning_rate": 1.968465178527628e-06,
|
|
"loss": 0.1071,
|
|
"num_input_tokens_seen": 1948928,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.1726917854718262,
|
|
"grad_norm": 46.503177642822266,
|
|
"learning_rate": 1.9680950807033124e-06,
|
|
"loss": 0.0843,
|
|
"num_input_tokens_seen": 1953600,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 0.17311608961303462,
|
|
"grad_norm": 5.747588157653809,
|
|
"learning_rate": 1.96772285896786e-06,
|
|
"loss": 0.0991,
|
|
"num_input_tokens_seen": 1958592,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.17354039375424304,
|
|
"grad_norm": 1.6777392625808716,
|
|
"learning_rate": 1.9673485141378904e-06,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 1962752,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 0.17396469789545146,
|
|
"grad_norm": 14.377074241638184,
|
|
"learning_rate": 1.9669720470346817e-06,
|
|
"loss": 0.1303,
|
|
"num_input_tokens_seen": 1967424,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.17438900203665988,
|
|
"grad_norm": 33.348052978515625,
|
|
"learning_rate": 1.966593458484168e-06,
|
|
"loss": 0.0643,
|
|
"num_input_tokens_seen": 1972736,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 0.1748133061778683,
|
|
"grad_norm": 18.610492706298828,
|
|
"learning_rate": 1.9662127493169367e-06,
|
|
"loss": 0.041,
|
|
"num_input_tokens_seen": 1977408,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.17523761031907673,
|
|
"grad_norm": 10.424017906188965,
|
|
"learning_rate": 1.96582992036823e-06,
|
|
"loss": 0.0251,
|
|
"num_input_tokens_seen": 1982016,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 0.17566191446028515,
|
|
"grad_norm": 12.599979400634766,
|
|
"learning_rate": 1.9654449724779387e-06,
|
|
"loss": 0.0838,
|
|
"num_input_tokens_seen": 1987392,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.17608621860149354,
|
|
"grad_norm": 10.004508018493652,
|
|
"learning_rate": 1.965057906490602e-06,
|
|
"loss": 0.083,
|
|
"num_input_tokens_seen": 1992064,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 0.17651052274270196,
|
|
"grad_norm": 18.495250701904297,
|
|
"learning_rate": 1.964668723255408e-06,
|
|
"loss": 0.0856,
|
|
"num_input_tokens_seen": 1997120,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.17693482688391038,
|
|
"grad_norm": 8.914718627929688,
|
|
"learning_rate": 1.964277423626188e-06,
|
|
"loss": 0.0407,
|
|
"num_input_tokens_seen": 2001664,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 0.1773591310251188,
|
|
"grad_norm": 24.79432487487793,
|
|
"learning_rate": 1.9638840084614178e-06,
|
|
"loss": 0.0946,
|
|
"num_input_tokens_seen": 2006336,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.17778343516632722,
|
|
"grad_norm": 24.861055374145508,
|
|
"learning_rate": 1.963488478624214e-06,
|
|
"loss": 0.0772,
|
|
"num_input_tokens_seen": 2011264,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 0.17820773930753564,
|
|
"grad_norm": 26.96556282043457,
|
|
"learning_rate": 1.9630908349823315e-06,
|
|
"loss": 0.0967,
|
|
"num_input_tokens_seen": 2015680,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.17863204344874406,
|
|
"grad_norm": 18.672582626342773,
|
|
"learning_rate": 1.9626910784081647e-06,
|
|
"loss": 0.1303,
|
|
"num_input_tokens_seen": 2020352,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 0.17905634758995248,
|
|
"grad_norm": 8.335457801818848,
|
|
"learning_rate": 1.9622892097787426e-06,
|
|
"loss": 0.0429,
|
|
"num_input_tokens_seen": 2024832,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.1794806517311609,
|
|
"grad_norm": 9.511039733886719,
|
|
"learning_rate": 1.961885229975727e-06,
|
|
"loss": 0.036,
|
|
"num_input_tokens_seen": 2029184,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 0.17990495587236932,
|
|
"grad_norm": 10.767931938171387,
|
|
"learning_rate": 1.9614791398854133e-06,
|
|
"loss": 0.0837,
|
|
"num_input_tokens_seen": 2034048,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.18032926001357774,
|
|
"grad_norm": 12.731311798095703,
|
|
"learning_rate": 1.9610709403987244e-06,
|
|
"loss": 0.0394,
|
|
"num_input_tokens_seen": 2039232,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 0.18075356415478616,
|
|
"grad_norm": 12.151493072509766,
|
|
"learning_rate": 1.9606606324112134e-06,
|
|
"loss": 0.0937,
|
|
"num_input_tokens_seen": 2043712,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.18117786829599458,
|
|
"grad_norm": 23.468860626220703,
|
|
"learning_rate": 1.9602482168230576e-06,
|
|
"loss": 0.1525,
|
|
"num_input_tokens_seen": 2048576,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 0.18160217243720297,
|
|
"grad_norm": 12.770451545715332,
|
|
"learning_rate": 1.9598336945390584e-06,
|
|
"loss": 0.0841,
|
|
"num_input_tokens_seen": 2053440,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.1820264765784114,
|
|
"grad_norm": 2.36533784866333,
|
|
"learning_rate": 1.95941706646864e-06,
|
|
"loss": 0.061,
|
|
"num_input_tokens_seen": 2058304,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 0.18245078071961982,
|
|
"grad_norm": 11.240355491638184,
|
|
"learning_rate": 1.9589983335258457e-06,
|
|
"loss": 0.0666,
|
|
"num_input_tokens_seen": 2062720,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.18287508486082824,
|
|
"grad_norm": 11.598411560058594,
|
|
"learning_rate": 1.9585774966293365e-06,
|
|
"loss": 0.0806,
|
|
"num_input_tokens_seen": 2067264,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 0.18329938900203666,
|
|
"grad_norm": 7.5163068771362305,
|
|
"learning_rate": 1.95815455670239e-06,
|
|
"loss": 0.085,
|
|
"num_input_tokens_seen": 2071744,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.18372369314324508,
|
|
"grad_norm": 13.197871208190918,
|
|
"learning_rate": 1.957729514672897e-06,
|
|
"loss": 0.0733,
|
|
"num_input_tokens_seen": 2076352,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 0.1841479972844535,
|
|
"grad_norm": 20.85205078125,
|
|
"learning_rate": 1.957302371473361e-06,
|
|
"loss": 0.0596,
|
|
"num_input_tokens_seen": 2081088,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.18457230142566192,
|
|
"grad_norm": 11.822310447692871,
|
|
"learning_rate": 1.9568731280408945e-06,
|
|
"loss": 0.0876,
|
|
"num_input_tokens_seen": 2085760,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 0.18499660556687034,
|
|
"grad_norm": 9.391737937927246,
|
|
"learning_rate": 1.956441785317217e-06,
|
|
"loss": 0.1062,
|
|
"num_input_tokens_seen": 2090624,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.18542090970807876,
|
|
"grad_norm": 10.104798316955566,
|
|
"learning_rate": 1.9560083442486565e-06,
|
|
"loss": 0.0561,
|
|
"num_input_tokens_seen": 2095936,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 0.18584521384928718,
|
|
"grad_norm": 8.834094047546387,
|
|
"learning_rate": 1.955572805786141e-06,
|
|
"loss": 0.0777,
|
|
"num_input_tokens_seen": 2100608,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.1862695179904956,
|
|
"grad_norm": 29.391233444213867,
|
|
"learning_rate": 1.9551351708852015e-06,
|
|
"loss": 0.0766,
|
|
"num_input_tokens_seen": 2105856,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 0.18669382213170402,
|
|
"grad_norm": 8.212447166442871,
|
|
"learning_rate": 1.9546954405059697e-06,
|
|
"loss": 0.055,
|
|
"num_input_tokens_seen": 2110464,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.1871181262729124,
|
|
"grad_norm": 21.514095306396484,
|
|
"learning_rate": 1.954253615613173e-06,
|
|
"loss": 0.0969,
|
|
"num_input_tokens_seen": 2115648,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 0.18754243041412083,
|
|
"grad_norm": 1.061404824256897,
|
|
"learning_rate": 1.9538096971761343e-06,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 2120256,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.18796673455532925,
|
|
"grad_norm": 8.798285484313965,
|
|
"learning_rate": 1.9533636861687696e-06,
|
|
"loss": 0.1158,
|
|
"num_input_tokens_seen": 2124672,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 0.18839103869653767,
|
|
"grad_norm": 8.102149963378906,
|
|
"learning_rate": 1.9529155835695855e-06,
|
|
"loss": 0.0527,
|
|
"num_input_tokens_seen": 2129344,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.1888153428377461,
|
|
"grad_norm": 26.47767448425293,
|
|
"learning_rate": 1.952465390361678e-06,
|
|
"loss": 0.0956,
|
|
"num_input_tokens_seen": 2134336,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 0.1892396469789545,
|
|
"grad_norm": 8.625823020935059,
|
|
"learning_rate": 1.95201310753273e-06,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 2139456,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.18966395112016293,
|
|
"grad_norm": 3.1748905181884766,
|
|
"learning_rate": 1.9515587360750068e-06,
|
|
"loss": 0.1159,
|
|
"num_input_tokens_seen": 2144640,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 0.19008825526137135,
|
|
"grad_norm": 12.466878890991211,
|
|
"learning_rate": 1.9511022769853586e-06,
|
|
"loss": 0.1253,
|
|
"num_input_tokens_seen": 2149760,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.19051255940257977,
|
|
"grad_norm": 31.816240310668945,
|
|
"learning_rate": 1.9506437312652144e-06,
|
|
"loss": 0.1579,
|
|
"num_input_tokens_seen": 2155200,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 0.1909368635437882,
|
|
"grad_norm": 4.647470474243164,
|
|
"learning_rate": 1.9501830999205806e-06,
|
|
"loss": 0.1539,
|
|
"num_input_tokens_seen": 2159872,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.1913611676849966,
|
|
"grad_norm": 21.403379440307617,
|
|
"learning_rate": 1.9497203839620398e-06,
|
|
"loss": 0.0693,
|
|
"num_input_tokens_seen": 2164544,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 0.19178547182620503,
|
|
"grad_norm": 3.0394349098205566,
|
|
"learning_rate": 1.9492555844047483e-06,
|
|
"loss": 0.0465,
|
|
"num_input_tokens_seen": 2169856,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.19220977596741345,
|
|
"grad_norm": 1.1695024967193604,
|
|
"learning_rate": 1.9487887022684334e-06,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 2174400,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 0.19263408010862185,
|
|
"grad_norm": 16.506855010986328,
|
|
"learning_rate": 1.9483197385773913e-06,
|
|
"loss": 0.0993,
|
|
"num_input_tokens_seen": 2179200,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.19305838424983027,
|
|
"grad_norm": 10.504711151123047,
|
|
"learning_rate": 1.947848694360485e-06,
|
|
"loss": 0.0643,
|
|
"num_input_tokens_seen": 2184768,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 0.1934826883910387,
|
|
"grad_norm": 65.77010345458984,
|
|
"learning_rate": 1.947375570651142e-06,
|
|
"loss": 0.0588,
|
|
"num_input_tokens_seen": 2189824,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.1939069925322471,
|
|
"grad_norm": 1.5039589405059814,
|
|
"learning_rate": 1.9469003684873514e-06,
|
|
"loss": 0.0534,
|
|
"num_input_tokens_seen": 2194752,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 0.19433129667345553,
|
|
"grad_norm": 3.758580207824707,
|
|
"learning_rate": 1.946423088911664e-06,
|
|
"loss": 0.0862,
|
|
"num_input_tokens_seen": 2199552,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.19475560081466395,
|
|
"grad_norm": 15.222334861755371,
|
|
"learning_rate": 1.9459437329711865e-06,
|
|
"loss": 0.2186,
|
|
"num_input_tokens_seen": 2204288,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 0.19517990495587237,
|
|
"grad_norm": 26.070444107055664,
|
|
"learning_rate": 1.945462301717581e-06,
|
|
"loss": 0.1068,
|
|
"num_input_tokens_seen": 2209344,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.1956042090970808,
|
|
"grad_norm": 6.4612345695495605,
|
|
"learning_rate": 1.944978796207064e-06,
|
|
"loss": 0.0747,
|
|
"num_input_tokens_seen": 2214208,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 0.1960285132382892,
|
|
"grad_norm": 11.475109100341797,
|
|
"learning_rate": 1.9444932175004017e-06,
|
|
"loss": 0.0991,
|
|
"num_input_tokens_seen": 2218624,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.19645281737949763,
|
|
"grad_norm": 15.514081954956055,
|
|
"learning_rate": 1.9440055666629087e-06,
|
|
"loss": 0.0871,
|
|
"num_input_tokens_seen": 2223872,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 0.19687712152070605,
|
|
"grad_norm": 9.900314331054688,
|
|
"learning_rate": 1.943515844764446e-06,
|
|
"loss": 0.0537,
|
|
"num_input_tokens_seen": 2228096,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.19730142566191447,
|
|
"grad_norm": 19.002424240112305,
|
|
"learning_rate": 1.943024052879418e-06,
|
|
"loss": 0.1341,
|
|
"num_input_tokens_seen": 2232384,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 0.1977257298031229,
|
|
"grad_norm": 1.5769349336624146,
|
|
"learning_rate": 1.9425301920867703e-06,
|
|
"loss": 0.0329,
|
|
"num_input_tokens_seen": 2237184,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.19815003394433128,
|
|
"grad_norm": 16.940818786621094,
|
|
"learning_rate": 1.942034263469989e-06,
|
|
"loss": 0.0955,
|
|
"num_input_tokens_seen": 2242048,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 0.1985743380855397,
|
|
"grad_norm": 10.212929725646973,
|
|
"learning_rate": 1.941536268117095e-06,
|
|
"loss": 0.0699,
|
|
"num_input_tokens_seen": 2247104,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.19899864222674812,
|
|
"grad_norm": 9.484726905822754,
|
|
"learning_rate": 1.9410362071206436e-06,
|
|
"loss": 0.0602,
|
|
"num_input_tokens_seen": 2251840,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 0.19942294636795654,
|
|
"grad_norm": 56.59994125366211,
|
|
"learning_rate": 1.9405340815777232e-06,
|
|
"loss": 0.1239,
|
|
"num_input_tokens_seen": 2256832,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.19984725050916496,
|
|
"grad_norm": 22.974138259887695,
|
|
"learning_rate": 1.9400298925899505e-06,
|
|
"loss": 0.0264,
|
|
"num_input_tokens_seen": 2261120,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 0.20027155465037338,
|
|
"grad_norm": 7.911641597747803,
|
|
"learning_rate": 1.939523641263469e-06,
|
|
"loss": 0.138,
|
|
"num_input_tokens_seen": 2266496,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.20027155465037338,
|
|
"eval_loss": 0.07734645158052444,
|
|
"eval_runtime": 16.8483,
|
|
"eval_samples_per_second": 621.724,
|
|
"eval_steps_per_second": 77.753,
|
|
"num_input_tokens_seen": 2266496,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.2006958587915818,
|
|
"grad_norm": 1.336319923400879,
|
|
"learning_rate": 1.9390153287089485e-06,
|
|
"loss": 0.0205,
|
|
"num_input_tokens_seen": 2271040,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 0.20112016293279023,
|
|
"grad_norm": 10.518303871154785,
|
|
"learning_rate": 1.938504956041579e-06,
|
|
"loss": 0.0794,
|
|
"num_input_tokens_seen": 2276096,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.20154446707399865,
|
|
"grad_norm": 23.853179931640625,
|
|
"learning_rate": 1.937992524381071e-06,
|
|
"loss": 0.0384,
|
|
"num_input_tokens_seen": 2280192,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 0.20196877121520707,
|
|
"grad_norm": 44.31528854370117,
|
|
"learning_rate": 1.9374780348516525e-06,
|
|
"loss": 0.0313,
|
|
"num_input_tokens_seen": 2284672,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.20239307535641549,
|
|
"grad_norm": 12.919401168823242,
|
|
"learning_rate": 1.9369614885820657e-06,
|
|
"loss": 0.1268,
|
|
"num_input_tokens_seen": 2289664,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 0.2028173794976239,
|
|
"grad_norm": 22.413991928100586,
|
|
"learning_rate": 1.9364428867055655e-06,
|
|
"loss": 0.0992,
|
|
"num_input_tokens_seen": 2294976,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.20324168363883233,
|
|
"grad_norm": 0.6512446403503418,
|
|
"learning_rate": 1.935922230359916e-06,
|
|
"loss": 0.0393,
|
|
"num_input_tokens_seen": 2299584,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 0.20366598778004075,
|
|
"grad_norm": 14.452363967895508,
|
|
"learning_rate": 1.9353995206873898e-06,
|
|
"loss": 0.0795,
|
|
"num_input_tokens_seen": 2304320,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.20409029192124914,
|
|
"grad_norm": 17.264039993286133,
|
|
"learning_rate": 1.9348747588347637e-06,
|
|
"loss": 0.1607,
|
|
"num_input_tokens_seen": 2308928,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 0.20451459606245756,
|
|
"grad_norm": 10.974727630615234,
|
|
"learning_rate": 1.9343479459533157e-06,
|
|
"loss": 0.0689,
|
|
"num_input_tokens_seen": 2313280,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.20493890020366598,
|
|
"grad_norm": 22.506288528442383,
|
|
"learning_rate": 1.933819083198826e-06,
|
|
"loss": 0.0949,
|
|
"num_input_tokens_seen": 2318400,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 0.2053632043448744,
|
|
"grad_norm": 17.46939468383789,
|
|
"learning_rate": 1.9332881717315694e-06,
|
|
"loss": 0.0688,
|
|
"num_input_tokens_seen": 2323712,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.20578750848608282,
|
|
"grad_norm": 7.412353515625,
|
|
"learning_rate": 1.9327552127163172e-06,
|
|
"loss": 0.0332,
|
|
"num_input_tokens_seen": 2327936,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 0.20621181262729124,
|
|
"grad_norm": 0.3739156126976013,
|
|
"learning_rate": 1.932220207322332e-06,
|
|
"loss": 0.0511,
|
|
"num_input_tokens_seen": 2332224,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.20663611676849966,
|
|
"grad_norm": 27.0732421875,
|
|
"learning_rate": 1.931683156723366e-06,
|
|
"loss": 0.0518,
|
|
"num_input_tokens_seen": 2336704,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 0.20706042090970808,
|
|
"grad_norm": 0.7127615213394165,
|
|
"learning_rate": 1.9311440620976595e-06,
|
|
"loss": 0.0638,
|
|
"num_input_tokens_seen": 2341888,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.2074847250509165,
|
|
"grad_norm": 4.951570987701416,
|
|
"learning_rate": 1.930602924627935e-06,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 2346432,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 0.20790902919212492,
|
|
"grad_norm": 0.4228869676589966,
|
|
"learning_rate": 1.930059745501399e-06,
|
|
"loss": 0.0439,
|
|
"num_input_tokens_seen": 2350656,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.20833333333333334,
|
|
"grad_norm": 41.80000305175781,
|
|
"learning_rate": 1.9295145259097362e-06,
|
|
"loss": 0.0371,
|
|
"num_input_tokens_seen": 2355392,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 0.20875763747454176,
|
|
"grad_norm": 0.12598325312137604,
|
|
"learning_rate": 1.9289672670491076e-06,
|
|
"loss": 0.0202,
|
|
"num_input_tokens_seen": 2360320,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.20918194161575018,
|
|
"grad_norm": 12.554540634155273,
|
|
"learning_rate": 1.928417970120149e-06,
|
|
"loss": 0.0071,
|
|
"num_input_tokens_seen": 2365120,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 0.20960624575695858,
|
|
"grad_norm": 0.3128456473350525,
|
|
"learning_rate": 1.9278666363279664e-06,
|
|
"loss": 0.1438,
|
|
"num_input_tokens_seen": 2369920,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.210030549898167,
|
|
"grad_norm": 71.60822296142578,
|
|
"learning_rate": 1.9273132668821363e-06,
|
|
"loss": 0.1111,
|
|
"num_input_tokens_seen": 2374848,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 0.21045485403937542,
|
|
"grad_norm": 0.13212421536445618,
|
|
"learning_rate": 1.926757862996699e-06,
|
|
"loss": 0.0566,
|
|
"num_input_tokens_seen": 2380032,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.21087915818058384,
|
|
"grad_norm": 38.85576248168945,
|
|
"learning_rate": 1.92620042589016e-06,
|
|
"loss": 0.094,
|
|
"num_input_tokens_seen": 2384704,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 0.21130346232179226,
|
|
"grad_norm": 2.293707847595215,
|
|
"learning_rate": 1.9256409567854847e-06,
|
|
"loss": 0.0463,
|
|
"num_input_tokens_seen": 2389568,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.21172776646300068,
|
|
"grad_norm": 53.859130859375,
|
|
"learning_rate": 1.9250794569100963e-06,
|
|
"loss": 0.1581,
|
|
"num_input_tokens_seen": 2394560,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 0.2121520706042091,
|
|
"grad_norm": 9.447288513183594,
|
|
"learning_rate": 1.9245159274958737e-06,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 2399232,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.21257637474541752,
|
|
"grad_norm": 13.274367332458496,
|
|
"learning_rate": 1.9239503697791487e-06,
|
|
"loss": 0.1743,
|
|
"num_input_tokens_seen": 2404032,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 0.21300067888662594,
|
|
"grad_norm": 15.30685043334961,
|
|
"learning_rate": 1.9233827850007024e-06,
|
|
"loss": 0.0828,
|
|
"num_input_tokens_seen": 2408576,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.21342498302783436,
|
|
"grad_norm": 29.790966033935547,
|
|
"learning_rate": 1.9228131744057633e-06,
|
|
"loss": 0.0904,
|
|
"num_input_tokens_seen": 2413440,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 0.21384928716904278,
|
|
"grad_norm": 7.731272220611572,
|
|
"learning_rate": 1.922241539244005e-06,
|
|
"loss": 0.042,
|
|
"num_input_tokens_seen": 2417664,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.2142735913102512,
|
|
"grad_norm": 8.477807998657227,
|
|
"learning_rate": 1.921667880769541e-06,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 2422464,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 0.21469789545145962,
|
|
"grad_norm": 21.304777145385742,
|
|
"learning_rate": 1.921092200240926e-06,
|
|
"loss": 0.0652,
|
|
"num_input_tokens_seen": 2427200,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.215122199592668,
|
|
"grad_norm": 0.6191572546958923,
|
|
"learning_rate": 1.9205144989211495e-06,
|
|
"loss": 0.0376,
|
|
"num_input_tokens_seen": 2431616,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 0.21554650373387643,
|
|
"grad_norm": 11.073234558105469,
|
|
"learning_rate": 1.919934778077635e-06,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 2436352,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.21597080787508485,
|
|
"grad_norm": 8.410868644714355,
|
|
"learning_rate": 1.9193530389822362e-06,
|
|
"loss": 0.0497,
|
|
"num_input_tokens_seen": 2440704,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 0.21639511201629327,
|
|
"grad_norm": 18.243743896484375,
|
|
"learning_rate": 1.918769282911235e-06,
|
|
"loss": 0.0283,
|
|
"num_input_tokens_seen": 2446272,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.2168194161575017,
|
|
"grad_norm": 7.846030235290527,
|
|
"learning_rate": 1.9181835111453383e-06,
|
|
"loss": 0.0559,
|
|
"num_input_tokens_seen": 2450496,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 0.2172437202987101,
|
|
"grad_norm": 52.11254119873047,
|
|
"learning_rate": 1.9175957249696755e-06,
|
|
"loss": 0.0334,
|
|
"num_input_tokens_seen": 2455424,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.21766802443991853,
|
|
"grad_norm": 36.6031608581543,
|
|
"learning_rate": 1.9170059256737946e-06,
|
|
"loss": 0.0724,
|
|
"num_input_tokens_seen": 2460352,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 0.21809232858112695,
|
|
"grad_norm": 43.663421630859375,
|
|
"learning_rate": 1.9164141145516613e-06,
|
|
"loss": 0.1301,
|
|
"num_input_tokens_seen": 2464896,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.21851663272233537,
|
|
"grad_norm": 0.19026194512844086,
|
|
"learning_rate": 1.915820292901654e-06,
|
|
"loss": 0.0941,
|
|
"num_input_tokens_seen": 2469824,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 0.2189409368635438,
|
|
"grad_norm": 36.45212936401367,
|
|
"learning_rate": 1.915224462026563e-06,
|
|
"loss": 0.1681,
|
|
"num_input_tokens_seen": 2474176,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.21936524100475221,
|
|
"grad_norm": 12.824152946472168,
|
|
"learning_rate": 1.9146266232335854e-06,
|
|
"loss": 0.1582,
|
|
"num_input_tokens_seen": 2479232,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 0.21978954514596064,
|
|
"grad_norm": 32.643001556396484,
|
|
"learning_rate": 1.914026777834325e-06,
|
|
"loss": 0.0963,
|
|
"num_input_tokens_seen": 2483648,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.22021384928716906,
|
|
"grad_norm": 17.95181655883789,
|
|
"learning_rate": 1.9134249271447872e-06,
|
|
"loss": 0.0632,
|
|
"num_input_tokens_seen": 2488064,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 0.22063815342837745,
|
|
"grad_norm": 1.8412888050079346,
|
|
"learning_rate": 1.9128210724853765e-06,
|
|
"loss": 0.0532,
|
|
"num_input_tokens_seen": 2493184,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.22106245756958587,
|
|
"grad_norm": 16.18398094177246,
|
|
"learning_rate": 1.912215215180894e-06,
|
|
"loss": 0.0981,
|
|
"num_input_tokens_seen": 2498048,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 0.2214867617107943,
|
|
"grad_norm": 19.46906852722168,
|
|
"learning_rate": 1.9116073565605347e-06,
|
|
"loss": 0.1506,
|
|
"num_input_tokens_seen": 2504064,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.2219110658520027,
|
|
"grad_norm": 9.773946762084961,
|
|
"learning_rate": 1.9109974979578847e-06,
|
|
"loss": 0.0656,
|
|
"num_input_tokens_seen": 2508800,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 0.22233536999321113,
|
|
"grad_norm": 3.967522382736206,
|
|
"learning_rate": 1.9103856407109172e-06,
|
|
"loss": 0.0211,
|
|
"num_input_tokens_seen": 2513280,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.22275967413441955,
|
|
"grad_norm": 8.614876747131348,
|
|
"learning_rate": 1.9097717861619907e-06,
|
|
"loss": 0.0466,
|
|
"num_input_tokens_seen": 2518080,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 0.22318397827562797,
|
|
"grad_norm": 8.669894218444824,
|
|
"learning_rate": 1.9091559356578445e-06,
|
|
"loss": 0.0785,
|
|
"num_input_tokens_seen": 2522688,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.2236082824168364,
|
|
"grad_norm": 0.16939519345760345,
|
|
"learning_rate": 1.9085380905495985e-06,
|
|
"loss": 0.0883,
|
|
"num_input_tokens_seen": 2527040,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 0.2240325865580448,
|
|
"grad_norm": 16.47370719909668,
|
|
"learning_rate": 1.9079182521927475e-06,
|
|
"loss": 0.1056,
|
|
"num_input_tokens_seen": 2531584,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.22445689069925323,
|
|
"grad_norm": 5.028382778167725,
|
|
"learning_rate": 1.9072964219471594e-06,
|
|
"loss": 0.1089,
|
|
"num_input_tokens_seen": 2536384,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 0.22488119484046165,
|
|
"grad_norm": 24.354970932006836,
|
|
"learning_rate": 1.9066726011770724e-06,
|
|
"loss": 0.0584,
|
|
"num_input_tokens_seen": 2540800,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.22530549898167007,
|
|
"grad_norm": 0.9733284115791321,
|
|
"learning_rate": 1.906046791251092e-06,
|
|
"loss": 0.0234,
|
|
"num_input_tokens_seen": 2545600,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 0.2257298031228785,
|
|
"grad_norm": 24.195287704467773,
|
|
"learning_rate": 1.9054189935421868e-06,
|
|
"loss": 0.1114,
|
|
"num_input_tokens_seen": 2550528,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.22615410726408688,
|
|
"grad_norm": 19.755464553833008,
|
|
"learning_rate": 1.9047892094276871e-06,
|
|
"loss": 0.0336,
|
|
"num_input_tokens_seen": 2555328,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 0.2265784114052953,
|
|
"grad_norm": 12.556132316589355,
|
|
"learning_rate": 1.9041574402892813e-06,
|
|
"loss": 0.1189,
|
|
"num_input_tokens_seen": 2560320,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.22700271554650372,
|
|
"grad_norm": 3.3276801109313965,
|
|
"learning_rate": 1.903523687513012e-06,
|
|
"loss": 0.0536,
|
|
"num_input_tokens_seen": 2564800,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 0.22742701968771215,
|
|
"grad_norm": 35.183223724365234,
|
|
"learning_rate": 1.902887952489275e-06,
|
|
"loss": 0.0745,
|
|
"num_input_tokens_seen": 2569664,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.22785132382892057,
|
|
"grad_norm": 1.181957721710205,
|
|
"learning_rate": 1.9022502366128132e-06,
|
|
"loss": 0.0412,
|
|
"num_input_tokens_seen": 2574592,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 0.22827562797012899,
|
|
"grad_norm": 25.119983673095703,
|
|
"learning_rate": 1.9016105412827173e-06,
|
|
"loss": 0.0882,
|
|
"num_input_tokens_seen": 2579328,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.2286999321113374,
|
|
"grad_norm": 23.016719818115234,
|
|
"learning_rate": 1.9009688679024189e-06,
|
|
"loss": 0.0212,
|
|
"num_input_tokens_seen": 2584896,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 0.22912423625254583,
|
|
"grad_norm": 20.473459243774414,
|
|
"learning_rate": 1.9003252178796907e-06,
|
|
"loss": 0.1346,
|
|
"num_input_tokens_seen": 2589504,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.22954854039375425,
|
|
"grad_norm": 17.718975067138672,
|
|
"learning_rate": 1.8996795926266412e-06,
|
|
"loss": 0.0705,
|
|
"num_input_tokens_seen": 2594304,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 0.22997284453496267,
|
|
"grad_norm": 25.074861526489258,
|
|
"learning_rate": 1.899031993559712e-06,
|
|
"loss": 0.0412,
|
|
"num_input_tokens_seen": 2598784,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.2303971486761711,
|
|
"grad_norm": 17.30613899230957,
|
|
"learning_rate": 1.8983824220996764e-06,
|
|
"loss": 0.0795,
|
|
"num_input_tokens_seen": 2603712,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 0.2308214528173795,
|
|
"grad_norm": 17.876710891723633,
|
|
"learning_rate": 1.8977308796716338e-06,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 2608320,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.23124575695858793,
|
|
"grad_norm": 1.3810087442398071,
|
|
"learning_rate": 1.897077367705008e-06,
|
|
"loss": 0.0266,
|
|
"num_input_tokens_seen": 2613248,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 0.23167006109979632,
|
|
"grad_norm": 26.5897159576416,
|
|
"learning_rate": 1.896421887633544e-06,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 2617664,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.23209436524100474,
|
|
"grad_norm": 7.933111667633057,
|
|
"learning_rate": 1.8957644408953044e-06,
|
|
"loss": 0.0523,
|
|
"num_input_tokens_seen": 2622016,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"epoch": 0.23251866938221316,
|
|
"grad_norm": 17.223819732666016,
|
|
"learning_rate": 1.8951050289326664e-06,
|
|
"loss": 0.0392,
|
|
"num_input_tokens_seen": 2626368,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.23294297352342158,
|
|
"grad_norm": 1.360128402709961,
|
|
"learning_rate": 1.8944436531923193e-06,
|
|
"loss": 0.013,
|
|
"num_input_tokens_seen": 2630720,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"epoch": 0.23336727766463,
|
|
"grad_norm": 9.804512977600098,
|
|
"learning_rate": 1.8937803151252603e-06,
|
|
"loss": 0.1161,
|
|
"num_input_tokens_seen": 2635456,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.23379158180583842,
|
|
"grad_norm": 39.875648498535156,
|
|
"learning_rate": 1.8931150161867915e-06,
|
|
"loss": 0.0651,
|
|
"num_input_tokens_seen": 2639744,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"epoch": 0.23421588594704684,
|
|
"grad_norm": 41.242069244384766,
|
|
"learning_rate": 1.8924477578365177e-06,
|
|
"loss": 0.0779,
|
|
"num_input_tokens_seen": 2643904,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.23464019008825526,
|
|
"grad_norm": 7.9552388191223145,
|
|
"learning_rate": 1.8917785415383415e-06,
|
|
"loss": 0.0454,
|
|
"num_input_tokens_seen": 2648768,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"epoch": 0.23506449422946368,
|
|
"grad_norm": 31.668724060058594,
|
|
"learning_rate": 1.8911073687604622e-06,
|
|
"loss": 0.0256,
|
|
"num_input_tokens_seen": 2653952,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.2354887983706721,
|
|
"grad_norm": 0.03098186105489731,
|
|
"learning_rate": 1.8904342409753703e-06,
|
|
"loss": 0.0074,
|
|
"num_input_tokens_seen": 2658432,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"epoch": 0.23591310251188052,
|
|
"grad_norm": 0.2730647027492523,
|
|
"learning_rate": 1.8897591596598464e-06,
|
|
"loss": 0.0391,
|
|
"num_input_tokens_seen": 2663168,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.23633740665308894,
|
|
"grad_norm": 18.097095489501953,
|
|
"learning_rate": 1.8890821262949564e-06,
|
|
"loss": 0.1325,
|
|
"num_input_tokens_seen": 2668096,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"epoch": 0.23676171079429736,
|
|
"grad_norm": 0.6063529849052429,
|
|
"learning_rate": 1.8884031423660488e-06,
|
|
"loss": 0.1382,
|
|
"num_input_tokens_seen": 2672576,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.23718601493550576,
|
|
"grad_norm": 22.42915916442871,
|
|
"learning_rate": 1.8877222093627517e-06,
|
|
"loss": 0.0854,
|
|
"num_input_tokens_seen": 2678336,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"epoch": 0.23761031907671418,
|
|
"grad_norm": 12.197434425354004,
|
|
"learning_rate": 1.8870393287789694e-06,
|
|
"loss": 0.1233,
|
|
"num_input_tokens_seen": 2684288,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.2380346232179226,
|
|
"grad_norm": 15.095528602600098,
|
|
"learning_rate": 1.8863545021128781e-06,
|
|
"loss": 0.0402,
|
|
"num_input_tokens_seen": 2688704,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"epoch": 0.23845892735913102,
|
|
"grad_norm": 10.655545234680176,
|
|
"learning_rate": 1.885667730866925e-06,
|
|
"loss": 0.075,
|
|
"num_input_tokens_seen": 2693056,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.23888323150033944,
|
|
"grad_norm": 18.9387264251709,
|
|
"learning_rate": 1.884979016547822e-06,
|
|
"loss": 0.0854,
|
|
"num_input_tokens_seen": 2697280,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"epoch": 0.23930753564154786,
|
|
"grad_norm": 6.936161518096924,
|
|
"learning_rate": 1.8842883606665457e-06,
|
|
"loss": 0.0838,
|
|
"num_input_tokens_seen": 2701440,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.23973183978275628,
|
|
"grad_norm": 3.464217185974121,
|
|
"learning_rate": 1.88359576473833e-06,
|
|
"loss": 0.0935,
|
|
"num_input_tokens_seen": 2707264,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"epoch": 0.2401561439239647,
|
|
"grad_norm": 17.284015655517578,
|
|
"learning_rate": 1.8829012302826674e-06,
|
|
"loss": 0.1009,
|
|
"num_input_tokens_seen": 2711680,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.24058044806517312,
|
|
"grad_norm": 4.557666778564453,
|
|
"learning_rate": 1.8822047588233017e-06,
|
|
"loss": 0.0196,
|
|
"num_input_tokens_seen": 2716352,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"epoch": 0.24100475220638154,
|
|
"grad_norm": 5.856059551239014,
|
|
"learning_rate": 1.881506351888227e-06,
|
|
"loss": 0.0709,
|
|
"num_input_tokens_seen": 2721024,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.24142905634758996,
|
|
"grad_norm": 14.097959518432617,
|
|
"learning_rate": 1.8808060110096839e-06,
|
|
"loss": 0.0848,
|
|
"num_input_tokens_seen": 2725696,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"epoch": 0.24185336048879838,
|
|
"grad_norm": 19.945884704589844,
|
|
"learning_rate": 1.8801037377241553e-06,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 2731328,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.2422776646300068,
|
|
"grad_norm": 2.3079118728637695,
|
|
"learning_rate": 1.879399533572364e-06,
|
|
"loss": 0.0905,
|
|
"num_input_tokens_seen": 2736192,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"epoch": 0.24270196877121522,
|
|
"grad_norm": 23.200849533081055,
|
|
"learning_rate": 1.8786934000992688e-06,
|
|
"loss": 0.0869,
|
|
"num_input_tokens_seen": 2740544,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.2431262729124236,
|
|
"grad_norm": 16.469356536865234,
|
|
"learning_rate": 1.877985338854061e-06,
|
|
"loss": 0.0745,
|
|
"num_input_tokens_seen": 2745472,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"epoch": 0.24355057705363203,
|
|
"grad_norm": 2.2663915157318115,
|
|
"learning_rate": 1.877275351390162e-06,
|
|
"loss": 0.0094,
|
|
"num_input_tokens_seen": 2750528,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.24397488119484045,
|
|
"grad_norm": 5.6446533203125,
|
|
"learning_rate": 1.8765634392652183e-06,
|
|
"loss": 0.0381,
|
|
"num_input_tokens_seen": 2755328,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"epoch": 0.24439918533604887,
|
|
"grad_norm": 12.059006690979004,
|
|
"learning_rate": 1.8758496040410998e-06,
|
|
"loss": 0.0899,
|
|
"num_input_tokens_seen": 2760128,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.2448234894772573,
|
|
"grad_norm": 0.30668967962265015,
|
|
"learning_rate": 1.8751338472838942e-06,
|
|
"loss": 0.024,
|
|
"num_input_tokens_seen": 2764992,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"epoch": 0.24524779361846571,
|
|
"grad_norm": 24.139650344848633,
|
|
"learning_rate": 1.8744161705639065e-06,
|
|
"loss": 0.0506,
|
|
"num_input_tokens_seen": 2769408,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.24567209775967414,
|
|
"grad_norm": 13.709477424621582,
|
|
"learning_rate": 1.8736965754556526e-06,
|
|
"loss": 0.1089,
|
|
"num_input_tokens_seen": 2774336,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"epoch": 0.24609640190088256,
|
|
"grad_norm": 24.1279239654541,
|
|
"learning_rate": 1.8729750635378578e-06,
|
|
"loss": 0.1844,
|
|
"num_input_tokens_seen": 2779136,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.24652070604209098,
|
|
"grad_norm": 35.80702209472656,
|
|
"learning_rate": 1.872251636393453e-06,
|
|
"loss": 0.0891,
|
|
"num_input_tokens_seen": 2784256,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"epoch": 0.2469450101832994,
|
|
"grad_norm": 30.313777923583984,
|
|
"learning_rate": 1.8715262956095694e-06,
|
|
"loss": 0.1011,
|
|
"num_input_tokens_seen": 2788864,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.24736931432450782,
|
|
"grad_norm": 5.44834566116333,
|
|
"learning_rate": 1.8707990427775386e-06,
|
|
"loss": 0.0441,
|
|
"num_input_tokens_seen": 2793152,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"epoch": 0.24779361846571624,
|
|
"grad_norm": 12.391961097717285,
|
|
"learning_rate": 1.870069879492886e-06,
|
|
"loss": 0.0628,
|
|
"num_input_tokens_seen": 2798144,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.24821792260692466,
|
|
"grad_norm": 0.23529499769210815,
|
|
"learning_rate": 1.869338807355328e-06,
|
|
"loss": 0.0632,
|
|
"num_input_tokens_seen": 2803136,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"epoch": 0.24864222674813305,
|
|
"grad_norm": 22.1816349029541,
|
|
"learning_rate": 1.8686058279687699e-06,
|
|
"loss": 0.071,
|
|
"num_input_tokens_seen": 2807744,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.24906653088934147,
|
|
"grad_norm": 0.9351317286491394,
|
|
"learning_rate": 1.8678709429413e-06,
|
|
"loss": 0.0478,
|
|
"num_input_tokens_seen": 2812928,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"epoch": 0.2494908350305499,
|
|
"grad_norm": 21.68981170654297,
|
|
"learning_rate": 1.867134153885189e-06,
|
|
"loss": 0.0602,
|
|
"num_input_tokens_seen": 2817536,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.2499151391717583,
|
|
"grad_norm": 1.3147538900375366,
|
|
"learning_rate": 1.8663954624168832e-06,
|
|
"loss": 0.0879,
|
|
"num_input_tokens_seen": 2822784,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"epoch": 0.25033944331296676,
|
|
"grad_norm": 0.3805413842201233,
|
|
"learning_rate": 1.8656548701570039e-06,
|
|
"loss": 0.0654,
|
|
"num_input_tokens_seen": 2827328,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.25033944331296676,
|
|
"eval_loss": 0.08211036771535873,
|
|
"eval_runtime": 16.6052,
|
|
"eval_samples_per_second": 630.826,
|
|
"eval_steps_per_second": 78.891,
|
|
"num_input_tokens_seen": 2827328,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.2507637474541752,
|
|
"grad_norm": 12.028523445129395,
|
|
"learning_rate": 1.864912378730342e-06,
|
|
"loss": 0.0797,
|
|
"num_input_tokens_seen": 2832128,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"epoch": 0.25118805159538354,
|
|
"grad_norm": 24.954545974731445,
|
|
"learning_rate": 1.8641679897658551e-06,
|
|
"loss": 0.1429,
|
|
"num_input_tokens_seen": 2837824,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.25161235573659196,
|
|
"grad_norm": 2.813983917236328,
|
|
"learning_rate": 1.8634217048966633e-06,
|
|
"loss": 0.0657,
|
|
"num_input_tokens_seen": 2842240,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"epoch": 0.2520366598778004,
|
|
"grad_norm": 24.6806697845459,
|
|
"learning_rate": 1.8626735257600475e-06,
|
|
"loss": 0.0584,
|
|
"num_input_tokens_seen": 2846848,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.2524609640190088,
|
|
"grad_norm": 13.69776439666748,
|
|
"learning_rate": 1.8619234539974429e-06,
|
|
"loss": 0.1142,
|
|
"num_input_tokens_seen": 2851392,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"epoch": 0.2528852681602172,
|
|
"grad_norm": 19.420358657836914,
|
|
"learning_rate": 1.8611714912544376e-06,
|
|
"loss": 0.055,
|
|
"num_input_tokens_seen": 2855680,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.25330957230142565,
|
|
"grad_norm": 9.22887134552002,
|
|
"learning_rate": 1.860417639180769e-06,
|
|
"loss": 0.0552,
|
|
"num_input_tokens_seen": 2860544,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"epoch": 0.25373387644263407,
|
|
"grad_norm": 5.3496479988098145,
|
|
"learning_rate": 1.8596618994303183e-06,
|
|
"loss": 0.0516,
|
|
"num_input_tokens_seen": 2865152,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.2541581805838425,
|
|
"grad_norm": 2.134282112121582,
|
|
"learning_rate": 1.858904273661109e-06,
|
|
"loss": 0.0612,
|
|
"num_input_tokens_seen": 2870144,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"epoch": 0.2545824847250509,
|
|
"grad_norm": 33.44880676269531,
|
|
"learning_rate": 1.8581447635353019e-06,
|
|
"loss": 0.0597,
|
|
"num_input_tokens_seen": 2875200,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.2550067888662593,
|
|
"grad_norm": 38.41618728637695,
|
|
"learning_rate": 1.8573833707191918e-06,
|
|
"loss": 0.0776,
|
|
"num_input_tokens_seen": 2880192,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"epoch": 0.25543109300746775,
|
|
"grad_norm": 2.3458704948425293,
|
|
"learning_rate": 1.8566200968832044e-06,
|
|
"loss": 0.1305,
|
|
"num_input_tokens_seen": 2884800,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.25585539714867617,
|
|
"grad_norm": 27.157550811767578,
|
|
"learning_rate": 1.855854943701892e-06,
|
|
"loss": 0.0961,
|
|
"num_input_tokens_seen": 2890176,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"epoch": 0.2562797012898846,
|
|
"grad_norm": 14.84155559539795,
|
|
"learning_rate": 1.85508791285393e-06,
|
|
"loss": 0.1078,
|
|
"num_input_tokens_seen": 2895040,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.256704005431093,
|
|
"grad_norm": 5.116229057312012,
|
|
"learning_rate": 1.8543190060221125e-06,
|
|
"loss": 0.0569,
|
|
"num_input_tokens_seen": 2899776,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"epoch": 0.25712830957230143,
|
|
"grad_norm": 7.373476028442383,
|
|
"learning_rate": 1.853548224893351e-06,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 2904064,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.25755261371350985,
|
|
"grad_norm": 17.929779052734375,
|
|
"learning_rate": 1.8527755711586678e-06,
|
|
"loss": 0.0511,
|
|
"num_input_tokens_seen": 2908800,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"epoch": 0.25797691785471827,
|
|
"grad_norm": 13.803712844848633,
|
|
"learning_rate": 1.8520010465131935e-06,
|
|
"loss": 0.0522,
|
|
"num_input_tokens_seen": 2913216,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.2584012219959267,
|
|
"grad_norm": 9.763543128967285,
|
|
"learning_rate": 1.8512246526561636e-06,
|
|
"loss": 0.0808,
|
|
"num_input_tokens_seen": 2917504,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"epoch": 0.2588255261371351,
|
|
"grad_norm": 27.986560821533203,
|
|
"learning_rate": 1.8504463912909149e-06,
|
|
"loss": 0.1602,
|
|
"num_input_tokens_seen": 2922752,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.25924983027834353,
|
|
"grad_norm": 8.609024047851562,
|
|
"learning_rate": 1.8496662641248807e-06,
|
|
"loss": 0.0262,
|
|
"num_input_tokens_seen": 2928000,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"epoch": 0.25967413441955195,
|
|
"grad_norm": 27.452054977416992,
|
|
"learning_rate": 1.8488842728695874e-06,
|
|
"loss": 0.0392,
|
|
"num_input_tokens_seen": 2932736,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.26009843856076037,
|
|
"grad_norm": 31.19993782043457,
|
|
"learning_rate": 1.8481004192406525e-06,
|
|
"loss": 0.1022,
|
|
"num_input_tokens_seen": 2937664,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"epoch": 0.2605227427019688,
|
|
"grad_norm": 43.040924072265625,
|
|
"learning_rate": 1.8473147049577773e-06,
|
|
"loss": 0.0863,
|
|
"num_input_tokens_seen": 2942784,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.2609470468431772,
|
|
"grad_norm": 13.672484397888184,
|
|
"learning_rate": 1.8465271317447474e-06,
|
|
"loss": 0.1081,
|
|
"num_input_tokens_seen": 2947840,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"epoch": 0.26137135098438563,
|
|
"grad_norm": 21.428726196289062,
|
|
"learning_rate": 1.845737701329425e-06,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 2952448,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.26179565512559405,
|
|
"grad_norm": 4.190557479858398,
|
|
"learning_rate": 1.8449464154437475e-06,
|
|
"loss": 0.0307,
|
|
"num_input_tokens_seen": 2957120,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"epoch": 0.26221995926680247,
|
|
"grad_norm": 14.91804027557373,
|
|
"learning_rate": 1.8441532758237233e-06,
|
|
"loss": 0.0965,
|
|
"num_input_tokens_seen": 2961728,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.26264426340801084,
|
|
"grad_norm": 7.925253868103027,
|
|
"learning_rate": 1.8433582842094273e-06,
|
|
"loss": 0.0859,
|
|
"num_input_tokens_seen": 2966208,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"epoch": 0.26306856754921926,
|
|
"grad_norm": 22.033737182617188,
|
|
"learning_rate": 1.8425614423449974e-06,
|
|
"loss": 0.0416,
|
|
"num_input_tokens_seen": 2972288,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.2634928716904277,
|
|
"grad_norm": 12.092147827148438,
|
|
"learning_rate": 1.8417627519786313e-06,
|
|
"loss": 0.0855,
|
|
"num_input_tokens_seen": 2976512,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"epoch": 0.2639171758316361,
|
|
"grad_norm": 8.595088005065918,
|
|
"learning_rate": 1.840962214862582e-06,
|
|
"loss": 0.0655,
|
|
"num_input_tokens_seen": 2981248,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.2643414799728445,
|
|
"grad_norm": 3.676570177078247,
|
|
"learning_rate": 1.8401598327531533e-06,
|
|
"loss": 0.0851,
|
|
"num_input_tokens_seen": 2985728,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"epoch": 0.26476578411405294,
|
|
"grad_norm": 23.423982620239258,
|
|
"learning_rate": 1.839355607410698e-06,
|
|
"loss": 0.0621,
|
|
"num_input_tokens_seen": 2990144,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.26519008825526136,
|
|
"grad_norm": 11.561528205871582,
|
|
"learning_rate": 1.8385495405996119e-06,
|
|
"loss": 0.0783,
|
|
"num_input_tokens_seen": 2994560,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"epoch": 0.2656143923964698,
|
|
"grad_norm": 26.278846740722656,
|
|
"learning_rate": 1.8377416340883312e-06,
|
|
"loss": 0.0582,
|
|
"num_input_tokens_seen": 2999488,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.2660386965376782,
|
|
"grad_norm": 7.513156414031982,
|
|
"learning_rate": 1.836931889649328e-06,
|
|
"loss": 0.0616,
|
|
"num_input_tokens_seen": 3004096,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"epoch": 0.2664630006788866,
|
|
"grad_norm": 22.986509323120117,
|
|
"learning_rate": 1.8361203090591068e-06,
|
|
"loss": 0.1094,
|
|
"num_input_tokens_seen": 3008512,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.26688730482009504,
|
|
"grad_norm": 11.799283027648926,
|
|
"learning_rate": 1.8353068940982006e-06,
|
|
"loss": 0.0684,
|
|
"num_input_tokens_seen": 3013504,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"epoch": 0.26731160896130346,
|
|
"grad_norm": 0.2482283115386963,
|
|
"learning_rate": 1.8344916465511664e-06,
|
|
"loss": 0.0213,
|
|
"num_input_tokens_seen": 3018112,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.2677359131025119,
|
|
"grad_norm": 11.042163848876953,
|
|
"learning_rate": 1.833674568206582e-06,
|
|
"loss": 0.1079,
|
|
"num_input_tokens_seen": 3023168,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"epoch": 0.2681602172437203,
|
|
"grad_norm": 10.574271202087402,
|
|
"learning_rate": 1.832855660857042e-06,
|
|
"loss": 0.0687,
|
|
"num_input_tokens_seen": 3027840,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.2685845213849287,
|
|
"grad_norm": 12.392441749572754,
|
|
"learning_rate": 1.8320349262991532e-06,
|
|
"loss": 0.1278,
|
|
"num_input_tokens_seen": 3034176,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"epoch": 0.26900882552613714,
|
|
"grad_norm": 15.403036117553711,
|
|
"learning_rate": 1.8312123663335316e-06,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 3038464,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.26943312966734556,
|
|
"grad_norm": 10.009440422058105,
|
|
"learning_rate": 1.8303879827647974e-06,
|
|
"loss": 0.0785,
|
|
"num_input_tokens_seen": 3042944,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"epoch": 0.269857433808554,
|
|
"grad_norm": 5.536763668060303,
|
|
"learning_rate": 1.8295617774015724e-06,
|
|
"loss": 0.0687,
|
|
"num_input_tokens_seen": 3048000,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.2702817379497624,
|
|
"grad_norm": 17.349336624145508,
|
|
"learning_rate": 1.8287337520564744e-06,
|
|
"loss": 0.0628,
|
|
"num_input_tokens_seen": 3053056,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"epoch": 0.2707060420909708,
|
|
"grad_norm": 13.206275939941406,
|
|
"learning_rate": 1.8279039085461148e-06,
|
|
"loss": 0.1059,
|
|
"num_input_tokens_seen": 3057792,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.27113034623217924,
|
|
"grad_norm": 2.6067545413970947,
|
|
"learning_rate": 1.8270722486910933e-06,
|
|
"loss": 0.137,
|
|
"num_input_tokens_seen": 3062784,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"epoch": 0.27155465037338766,
|
|
"grad_norm": 19.8438777923584,
|
|
"learning_rate": 1.8262387743159948e-06,
|
|
"loss": 0.0767,
|
|
"num_input_tokens_seen": 3067712,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.2719789545145961,
|
|
"grad_norm": 16.484792709350586,
|
|
"learning_rate": 1.8254034872493853e-06,
|
|
"loss": 0.057,
|
|
"num_input_tokens_seen": 3072000,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"epoch": 0.2724032586558045,
|
|
"grad_norm": 0.5902109742164612,
|
|
"learning_rate": 1.8245663893238072e-06,
|
|
"loss": 0.0498,
|
|
"num_input_tokens_seen": 3076416,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.2728275627970129,
|
|
"grad_norm": 19.477684020996094,
|
|
"learning_rate": 1.823727482375776e-06,
|
|
"loss": 0.0717,
|
|
"num_input_tokens_seen": 3081792,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"epoch": 0.27325186693822134,
|
|
"grad_norm": 1.8754899501800537,
|
|
"learning_rate": 1.8228867682457762e-06,
|
|
"loss": 0.1278,
|
|
"num_input_tokens_seen": 3086656,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.2736761710794297,
|
|
"grad_norm": 9.842918395996094,
|
|
"learning_rate": 1.8220442487782565e-06,
|
|
"loss": 0.1001,
|
|
"num_input_tokens_seen": 3091328,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"epoch": 0.27410047522063813,
|
|
"grad_norm": 15.38632869720459,
|
|
"learning_rate": 1.8211999258216273e-06,
|
|
"loss": 0.0101,
|
|
"num_input_tokens_seen": 3096448,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.27452477936184655,
|
|
"grad_norm": 7.185582637786865,
|
|
"learning_rate": 1.8203538012282548e-06,
|
|
"loss": 0.0698,
|
|
"num_input_tokens_seen": 3102400,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"epoch": 0.27494908350305497,
|
|
"grad_norm": 2.286411762237549,
|
|
"learning_rate": 1.8195058768544583e-06,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 3107008,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.2753733876442634,
|
|
"grad_norm": 0.3740554451942444,
|
|
"learning_rate": 1.8186561545605052e-06,
|
|
"loss": 0.0359,
|
|
"num_input_tokens_seen": 3111872,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"epoch": 0.2757976917854718,
|
|
"grad_norm": 30.406129837036133,
|
|
"learning_rate": 1.8178046362106083e-06,
|
|
"loss": 0.0247,
|
|
"num_input_tokens_seen": 3116544,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.27622199592668023,
|
|
"grad_norm": 13.679139137268066,
|
|
"learning_rate": 1.8169513236729195e-06,
|
|
"loss": 0.0942,
|
|
"num_input_tokens_seen": 3121024,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"epoch": 0.27664630006788865,
|
|
"grad_norm": 21.39982795715332,
|
|
"learning_rate": 1.8160962188195278e-06,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 3125696,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.27707060420909707,
|
|
"grad_norm": 7.936905860900879,
|
|
"learning_rate": 1.8152393235264545e-06,
|
|
"loss": 0.0994,
|
|
"num_input_tokens_seen": 3130752,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"epoch": 0.2774949083503055,
|
|
"grad_norm": 19.371158599853516,
|
|
"learning_rate": 1.8143806396736486e-06,
|
|
"loss": 0.1005,
|
|
"num_input_tokens_seen": 3135360,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.2779192124915139,
|
|
"grad_norm": 19.735645294189453,
|
|
"learning_rate": 1.813520169144983e-06,
|
|
"loss": 0.1055,
|
|
"num_input_tokens_seen": 3140032,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"epoch": 0.27834351663272233,
|
|
"grad_norm": 0.4279995858669281,
|
|
"learning_rate": 1.8126579138282501e-06,
|
|
"loss": 0.0714,
|
|
"num_input_tokens_seen": 3144960,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.27876782077393075,
|
|
"grad_norm": 0.9434179663658142,
|
|
"learning_rate": 1.8117938756151592e-06,
|
|
"loss": 0.0644,
|
|
"num_input_tokens_seen": 3150016,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"epoch": 0.2791921249151392,
|
|
"grad_norm": 19.84819793701172,
|
|
"learning_rate": 1.8109280564013297e-06,
|
|
"loss": 0.1493,
|
|
"num_input_tokens_seen": 3155200,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.2796164290563476,
|
|
"grad_norm": 23.324520111083984,
|
|
"learning_rate": 1.8100604580862898e-06,
|
|
"loss": 0.057,
|
|
"num_input_tokens_seen": 3160000,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"epoch": 0.280040733197556,
|
|
"grad_norm": 0.755753755569458,
|
|
"learning_rate": 1.8091910825734686e-06,
|
|
"loss": 0.0528,
|
|
"num_input_tokens_seen": 3164672,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.28046503733876443,
|
|
"grad_norm": 1.2135066986083984,
|
|
"learning_rate": 1.808319931770197e-06,
|
|
"loss": 0.0386,
|
|
"num_input_tokens_seen": 3169152,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"epoch": 0.28088934147997285,
|
|
"grad_norm": 34.18061828613281,
|
|
"learning_rate": 1.8074470075876983e-06,
|
|
"loss": 0.0529,
|
|
"num_input_tokens_seen": 3173888,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.2813136456211813,
|
|
"grad_norm": 0.2154167890548706,
|
|
"learning_rate": 1.8065723119410884e-06,
|
|
"loss": 0.0567,
|
|
"num_input_tokens_seen": 3179072,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"epoch": 0.2817379497623897,
|
|
"grad_norm": 20.250518798828125,
|
|
"learning_rate": 1.8056958467493678e-06,
|
|
"loss": 0.0953,
|
|
"num_input_tokens_seen": 3183552,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.2821622539035981,
|
|
"grad_norm": 2.8390376567840576,
|
|
"learning_rate": 1.8048176139354207e-06,
|
|
"loss": 0.0349,
|
|
"num_input_tokens_seen": 3187968,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"epoch": 0.28258655804480654,
|
|
"grad_norm": 0.6544375419616699,
|
|
"learning_rate": 1.8039376154260086e-06,
|
|
"loss": 0.092,
|
|
"num_input_tokens_seen": 3192704,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.28301086218601496,
|
|
"grad_norm": 2.831908941268921,
|
|
"learning_rate": 1.803055853151767e-06,
|
|
"loss": 0.0937,
|
|
"num_input_tokens_seen": 3197760,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"epoch": 0.2834351663272234,
|
|
"grad_norm": 15.582389831542969,
|
|
"learning_rate": 1.8021723290472007e-06,
|
|
"loss": 0.0314,
|
|
"num_input_tokens_seen": 3202368,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.2838594704684318,
|
|
"grad_norm": 14.190438270568848,
|
|
"learning_rate": 1.8012870450506798e-06,
|
|
"loss": 0.1856,
|
|
"num_input_tokens_seen": 3207360,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"epoch": 0.2842837746096402,
|
|
"grad_norm": 15.410075187683105,
|
|
"learning_rate": 1.800400003104436e-06,
|
|
"loss": 0.0936,
|
|
"num_input_tokens_seen": 3213632,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.2847080787508486,
|
|
"grad_norm": 10.997910499572754,
|
|
"learning_rate": 1.799511205154557e-06,
|
|
"loss": 0.0553,
|
|
"num_input_tokens_seen": 3219136,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"epoch": 0.285132382892057,
|
|
"grad_norm": 1.6344224214553833,
|
|
"learning_rate": 1.7986206531509835e-06,
|
|
"loss": 0.0747,
|
|
"num_input_tokens_seen": 3225088,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.2855566870332654,
|
|
"grad_norm": 7.749290943145752,
|
|
"learning_rate": 1.7977283490475043e-06,
|
|
"loss": 0.0769,
|
|
"num_input_tokens_seen": 3229504,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"epoch": 0.28598099117447384,
|
|
"grad_norm": 7.653243541717529,
|
|
"learning_rate": 1.796834294801752e-06,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 3234368,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.28640529531568226,
|
|
"grad_norm": 8.196025848388672,
|
|
"learning_rate": 1.7959384923751993e-06,
|
|
"loss": 0.0916,
|
|
"num_input_tokens_seen": 3239232,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"epoch": 0.2868295994568907,
|
|
"grad_norm": 24.483734130859375,
|
|
"learning_rate": 1.7950409437331535e-06,
|
|
"loss": 0.0843,
|
|
"num_input_tokens_seen": 3244288,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.2872539035980991,
|
|
"grad_norm": 9.452868461608887,
|
|
"learning_rate": 1.7941416508447534e-06,
|
|
"loss": 0.1443,
|
|
"num_input_tokens_seen": 3248640,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"epoch": 0.2876782077393075,
|
|
"grad_norm": 0.3759312331676483,
|
|
"learning_rate": 1.7932406156829649e-06,
|
|
"loss": 0.0315,
|
|
"num_input_tokens_seen": 3253440,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.28810251188051594,
|
|
"grad_norm": 13.523527145385742,
|
|
"learning_rate": 1.7923378402245756e-06,
|
|
"loss": 0.0669,
|
|
"num_input_tokens_seen": 3258048,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"epoch": 0.28852681602172436,
|
|
"grad_norm": 9.800745010375977,
|
|
"learning_rate": 1.7914333264501913e-06,
|
|
"loss": 0.0275,
|
|
"num_input_tokens_seen": 3262912,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.2889511201629328,
|
|
"grad_norm": 25.192153930664062,
|
|
"learning_rate": 1.790527076344232e-06,
|
|
"loss": 0.0653,
|
|
"num_input_tokens_seen": 3267776,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"epoch": 0.2893754243041412,
|
|
"grad_norm": 23.029342651367188,
|
|
"learning_rate": 1.7896190918949266e-06,
|
|
"loss": 0.0589,
|
|
"num_input_tokens_seen": 3272448,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.2897997284453496,
|
|
"grad_norm": 15.206321716308594,
|
|
"learning_rate": 1.7887093750943088e-06,
|
|
"loss": 0.0813,
|
|
"num_input_tokens_seen": 3277056,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"epoch": 0.29022403258655805,
|
|
"grad_norm": 8.484832763671875,
|
|
"learning_rate": 1.7877979279382131e-06,
|
|
"loss": 0.0809,
|
|
"num_input_tokens_seen": 3282048,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.29064833672776647,
|
|
"grad_norm": 12.240069389343262,
|
|
"learning_rate": 1.7868847524262708e-06,
|
|
"loss": 0.1146,
|
|
"num_input_tokens_seen": 3286336,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"epoch": 0.2910726408689749,
|
|
"grad_norm": 5.32802677154541,
|
|
"learning_rate": 1.7859698505619043e-06,
|
|
"loss": 0.0252,
|
|
"num_input_tokens_seen": 3290880,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.2914969450101833,
|
|
"grad_norm": 0.42785537242889404,
|
|
"learning_rate": 1.7850532243523238e-06,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 3295360,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"epoch": 0.2919212491513917,
|
|
"grad_norm": 0.23996587097644806,
|
|
"learning_rate": 1.7841348758085224e-06,
|
|
"loss": 0.0161,
|
|
"num_input_tokens_seen": 3299840,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.29234555329260015,
|
|
"grad_norm": 0.7062512636184692,
|
|
"learning_rate": 1.7832148069452719e-06,
|
|
"loss": 0.0229,
|
|
"num_input_tokens_seen": 3304448,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"epoch": 0.29276985743380857,
|
|
"grad_norm": 0.12156375497579575,
|
|
"learning_rate": 1.7822930197811186e-06,
|
|
"loss": 0.1007,
|
|
"num_input_tokens_seen": 3308928,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.293194161575017,
|
|
"grad_norm": 7.842189311981201,
|
|
"learning_rate": 1.781369516338378e-06,
|
|
"loss": 0.0997,
|
|
"num_input_tokens_seen": 3313408,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"epoch": 0.2936184657162254,
|
|
"grad_norm": 2.187845468521118,
|
|
"learning_rate": 1.7804442986431317e-06,
|
|
"loss": 0.0756,
|
|
"num_input_tokens_seen": 3318080,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.29404276985743383,
|
|
"grad_norm": 12.173555374145508,
|
|
"learning_rate": 1.7795173687252213e-06,
|
|
"loss": 0.0801,
|
|
"num_input_tokens_seen": 3323136,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"epoch": 0.29446707399864225,
|
|
"grad_norm": 10.94555950164795,
|
|
"learning_rate": 1.778588728618246e-06,
|
|
"loss": 0.0184,
|
|
"num_input_tokens_seen": 3327936,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.29489137813985067,
|
|
"grad_norm": 27.10161018371582,
|
|
"learning_rate": 1.777658380359556e-06,
|
|
"loss": 0.0703,
|
|
"num_input_tokens_seen": 3332864,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"epoch": 0.2953156822810591,
|
|
"grad_norm": 0.14804022014141083,
|
|
"learning_rate": 1.7767263259902494e-06,
|
|
"loss": 0.0622,
|
|
"num_input_tokens_seen": 3338048,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.2957399864222675,
|
|
"grad_norm": 49.195499420166016,
|
|
"learning_rate": 1.7757925675551672e-06,
|
|
"loss": 0.1846,
|
|
"num_input_tokens_seen": 3343104,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"epoch": 0.2961642905634759,
|
|
"grad_norm": 11.58652114868164,
|
|
"learning_rate": 1.7748571071028898e-06,
|
|
"loss": 0.0706,
|
|
"num_input_tokens_seen": 3347712,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.2965885947046843,
|
|
"grad_norm": 12.993041038513184,
|
|
"learning_rate": 1.7739199466857301e-06,
|
|
"loss": 0.0544,
|
|
"num_input_tokens_seen": 3352000,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"epoch": 0.2970128988458927,
|
|
"grad_norm": 2.848405599594116,
|
|
"learning_rate": 1.772981088359732e-06,
|
|
"loss": 0.0226,
|
|
"num_input_tokens_seen": 3356480,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.29743720298710113,
|
|
"grad_norm": 23.85163688659668,
|
|
"learning_rate": 1.7720405341846636e-06,
|
|
"loss": 0.1146,
|
|
"num_input_tokens_seen": 3361536,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"epoch": 0.29786150712830956,
|
|
"grad_norm": 10.137134552001953,
|
|
"learning_rate": 1.771098286224014e-06,
|
|
"loss": 0.0916,
|
|
"num_input_tokens_seen": 3367296,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.298285811269518,
|
|
"grad_norm": 1.443406105041504,
|
|
"learning_rate": 1.7701543465449884e-06,
|
|
"loss": 0.0331,
|
|
"num_input_tokens_seen": 3372096,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"epoch": 0.2987101154107264,
|
|
"grad_norm": 0.2798957824707031,
|
|
"learning_rate": 1.7692087172185026e-06,
|
|
"loss": 0.0789,
|
|
"num_input_tokens_seen": 3376384,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.2991344195519348,
|
|
"grad_norm": 7.460265636444092,
|
|
"learning_rate": 1.7682614003191805e-06,
|
|
"loss": 0.0325,
|
|
"num_input_tokens_seen": 3381504,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"epoch": 0.29955872369314324,
|
|
"grad_norm": 11.331478118896484,
|
|
"learning_rate": 1.7673123979253475e-06,
|
|
"loss": 0.0504,
|
|
"num_input_tokens_seen": 3386112,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.29998302783435166,
|
|
"grad_norm": 0.4614429771900177,
|
|
"learning_rate": 1.7663617121190271e-06,
|
|
"loss": 0.0483,
|
|
"num_input_tokens_seen": 3395072,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"epoch": 0.3004073319755601,
|
|
"grad_norm": 0.418489933013916,
|
|
"learning_rate": 1.7654093449859367e-06,
|
|
"loss": 0.0218,
|
|
"num_input_tokens_seen": 3399808,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.3004073319755601,
|
|
"eval_loss": 0.07402241230010986,
|
|
"eval_runtime": 16.7134,
|
|
"eval_samples_per_second": 626.742,
|
|
"eval_steps_per_second": 78.38,
|
|
"num_input_tokens_seen": 3399808,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.3008316361167685,
|
|
"grad_norm": 0.6309323906898499,
|
|
"learning_rate": 1.764455298615481e-06,
|
|
"loss": 0.0419,
|
|
"num_input_tokens_seen": 3404544,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"epoch": 0.3012559402579769,
|
|
"grad_norm": 15.243474006652832,
|
|
"learning_rate": 1.7634995751007499e-06,
|
|
"loss": 0.1042,
|
|
"num_input_tokens_seen": 3408896,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.30168024439918534,
|
|
"grad_norm": 17.286970138549805,
|
|
"learning_rate": 1.7625421765385124e-06,
|
|
"loss": 0.0971,
|
|
"num_input_tokens_seen": 3413824,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"epoch": 0.30210454854039376,
|
|
"grad_norm": 21.38294219970703,
|
|
"learning_rate": 1.7615831050292127e-06,
|
|
"loss": 0.0934,
|
|
"num_input_tokens_seen": 3418240,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.3025288526816022,
|
|
"grad_norm": 8.97788143157959,
|
|
"learning_rate": 1.760622362676965e-06,
|
|
"loss": 0.1297,
|
|
"num_input_tokens_seen": 3423168,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"epoch": 0.3029531568228106,
|
|
"grad_norm": 61.9466552734375,
|
|
"learning_rate": 1.7596599515895486e-06,
|
|
"loss": 0.0857,
|
|
"num_input_tokens_seen": 3428224,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.303377460964019,
|
|
"grad_norm": 0.1966453343629837,
|
|
"learning_rate": 1.7586958738784055e-06,
|
|
"loss": 0.0494,
|
|
"num_input_tokens_seen": 3432896,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"epoch": 0.30380176510522744,
|
|
"grad_norm": 37.761756896972656,
|
|
"learning_rate": 1.7577301316586323e-06,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 3437632,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.30422606924643586,
|
|
"grad_norm": 18.96601104736328,
|
|
"learning_rate": 1.7567627270489787e-06,
|
|
"loss": 0.0591,
|
|
"num_input_tokens_seen": 3442112,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"epoch": 0.3046503733876443,
|
|
"grad_norm": 3.731330156326294,
|
|
"learning_rate": 1.7557936621718406e-06,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 3448064,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.3050746775288527,
|
|
"grad_norm": 52.596981048583984,
|
|
"learning_rate": 1.754822939153257e-06,
|
|
"loss": 0.1215,
|
|
"num_input_tokens_seen": 3452800,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"epoch": 0.3054989816700611,
|
|
"grad_norm": 31.76228141784668,
|
|
"learning_rate": 1.7538505601229043e-06,
|
|
"loss": 0.1255,
|
|
"num_input_tokens_seen": 3457856,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.30592328581126954,
|
|
"grad_norm": 0.8424116969108582,
|
|
"learning_rate": 1.7528765272140927e-06,
|
|
"loss": 0.0426,
|
|
"num_input_tokens_seen": 3462720,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"epoch": 0.30634758995247796,
|
|
"grad_norm": 13.302154541015625,
|
|
"learning_rate": 1.7519008425637597e-06,
|
|
"loss": 0.1186,
|
|
"num_input_tokens_seen": 3467264,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.3067718940936864,
|
|
"grad_norm": 20.225831985473633,
|
|
"learning_rate": 1.7509235083124679e-06,
|
|
"loss": 0.0711,
|
|
"num_input_tokens_seen": 3472832,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"epoch": 0.30719619823489475,
|
|
"grad_norm": 7.1020827293396,
|
|
"learning_rate": 1.749944526604398e-06,
|
|
"loss": 0.0436,
|
|
"num_input_tokens_seen": 3478016,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.30762050237610317,
|
|
"grad_norm": 0.9091804027557373,
|
|
"learning_rate": 1.7489638995873453e-06,
|
|
"loss": 0.0482,
|
|
"num_input_tokens_seen": 3482688,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"epoch": 0.3080448065173116,
|
|
"grad_norm": 13.077895164489746,
|
|
"learning_rate": 1.7479816294127149e-06,
|
|
"loss": 0.011,
|
|
"num_input_tokens_seen": 3487296,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.30846911065852,
|
|
"grad_norm": 0.42384710907936096,
|
|
"learning_rate": 1.746997718235517e-06,
|
|
"loss": 0.0607,
|
|
"num_input_tokens_seen": 3491712,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"epoch": 0.3088934147997284,
|
|
"grad_norm": 6.420424461364746,
|
|
"learning_rate": 1.7460121682143616e-06,
|
|
"loss": 0.09,
|
|
"num_input_tokens_seen": 3496256,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.30931771894093685,
|
|
"grad_norm": 28.44765853881836,
|
|
"learning_rate": 1.7450249815114545e-06,
|
|
"loss": 0.1042,
|
|
"num_input_tokens_seen": 3500672,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"epoch": 0.30974202308214527,
|
|
"grad_norm": 0.14803655445575714,
|
|
"learning_rate": 1.744036160292592e-06,
|
|
"loss": 0.0076,
|
|
"num_input_tokens_seen": 3505536,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.3101663272233537,
|
|
"grad_norm": 4.641674995422363,
|
|
"learning_rate": 1.7430457067271563e-06,
|
|
"loss": 0.0662,
|
|
"num_input_tokens_seen": 3510400,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"epoch": 0.3105906313645621,
|
|
"grad_norm": 0.13145208358764648,
|
|
"learning_rate": 1.742053622988111e-06,
|
|
"loss": 0.0375,
|
|
"num_input_tokens_seen": 3514880,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.31101493550577053,
|
|
"grad_norm": 29.60529327392578,
|
|
"learning_rate": 1.7410599112519969e-06,
|
|
"loss": 0.1194,
|
|
"num_input_tokens_seen": 3520192,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"epoch": 0.31143923964697895,
|
|
"grad_norm": 6.398006916046143,
|
|
"learning_rate": 1.7400645736989246e-06,
|
|
"loss": 0.0676,
|
|
"num_input_tokens_seen": 3524544,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.31186354378818737,
|
|
"grad_norm": 1.1614575386047363,
|
|
"learning_rate": 1.7390676125125733e-06,
|
|
"loss": 0.0285,
|
|
"num_input_tokens_seen": 3528896,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"epoch": 0.3122878479293958,
|
|
"grad_norm": 0.23471052944660187,
|
|
"learning_rate": 1.7380690298801836e-06,
|
|
"loss": 0.0717,
|
|
"num_input_tokens_seen": 3533568,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.3127121520706042,
|
|
"grad_norm": 24.79547882080078,
|
|
"learning_rate": 1.7370688279925538e-06,
|
|
"loss": 0.0404,
|
|
"num_input_tokens_seen": 3539008,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"epoch": 0.31313645621181263,
|
|
"grad_norm": 0.13652589917182922,
|
|
"learning_rate": 1.736067009044034e-06,
|
|
"loss": 0.0216,
|
|
"num_input_tokens_seen": 3544448,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.31356076035302105,
|
|
"grad_norm": 18.10015296936035,
|
|
"learning_rate": 1.7350635752325222e-06,
|
|
"loss": 0.1353,
|
|
"num_input_tokens_seen": 3549184,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"epoch": 0.31398506449422947,
|
|
"grad_norm": 0.6508774161338806,
|
|
"learning_rate": 1.7340585287594603e-06,
|
|
"loss": 0.0321,
|
|
"num_input_tokens_seen": 3554176,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.3144093686354379,
|
|
"grad_norm": 17.291688919067383,
|
|
"learning_rate": 1.733051871829826e-06,
|
|
"loss": 0.0656,
|
|
"num_input_tokens_seen": 3558720,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"epoch": 0.3148336727766463,
|
|
"grad_norm": 7.416316986083984,
|
|
"learning_rate": 1.7320436066521333e-06,
|
|
"loss": 0.0802,
|
|
"num_input_tokens_seen": 3563648,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.31525797691785473,
|
|
"grad_norm": 20.006744384765625,
|
|
"learning_rate": 1.7310337354384214e-06,
|
|
"loss": 0.0491,
|
|
"num_input_tokens_seen": 3568704,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"epoch": 0.31568228105906315,
|
|
"grad_norm": 6.35284948348999,
|
|
"learning_rate": 1.7300222604042552e-06,
|
|
"loss": 0.0452,
|
|
"num_input_tokens_seen": 3573184,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.3161065852002716,
|
|
"grad_norm": 26.828872680664062,
|
|
"learning_rate": 1.7290091837687172e-06,
|
|
"loss": 0.0971,
|
|
"num_input_tokens_seen": 3578432,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"epoch": 0.31653088934148,
|
|
"grad_norm": 1.0489623546600342,
|
|
"learning_rate": 1.7279945077544036e-06,
|
|
"loss": 0.0129,
|
|
"num_input_tokens_seen": 3582848,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.3169551934826884,
|
|
"grad_norm": 0.16082455217838287,
|
|
"learning_rate": 1.7269782345874203e-06,
|
|
"loss": 0.0383,
|
|
"num_input_tokens_seen": 3587840,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"epoch": 0.31737949762389683,
|
|
"grad_norm": 0.19137334823608398,
|
|
"learning_rate": 1.7259603664973766e-06,
|
|
"loss": 0.1176,
|
|
"num_input_tokens_seen": 3592576,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.31780380176510525,
|
|
"grad_norm": 19.070852279663086,
|
|
"learning_rate": 1.7249409057173806e-06,
|
|
"loss": 0.0933,
|
|
"num_input_tokens_seen": 3597376,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"epoch": 0.3182281059063136,
|
|
"grad_norm": 2.2940688133239746,
|
|
"learning_rate": 1.7239198544840354e-06,
|
|
"loss": 0.0922,
|
|
"num_input_tokens_seen": 3601728,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.31865241004752204,
|
|
"grad_norm": 12.90054702758789,
|
|
"learning_rate": 1.7228972150374332e-06,
|
|
"loss": 0.1311,
|
|
"num_input_tokens_seen": 3606592,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"epoch": 0.31907671418873046,
|
|
"grad_norm": 20.113847732543945,
|
|
"learning_rate": 1.7218729896211504e-06,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 3611328,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.3195010183299389,
|
|
"grad_norm": 0.45969074964523315,
|
|
"learning_rate": 1.7208471804822425e-06,
|
|
"loss": 0.0452,
|
|
"num_input_tokens_seen": 3616000,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"epoch": 0.3199253224711473,
|
|
"grad_norm": 1.610163927078247,
|
|
"learning_rate": 1.71981978987124e-06,
|
|
"loss": 0.0215,
|
|
"num_input_tokens_seen": 3620288,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.3203496266123557,
|
|
"grad_norm": 0.5942493081092834,
|
|
"learning_rate": 1.7187908200421432e-06,
|
|
"loss": 0.0609,
|
|
"num_input_tokens_seen": 3624704,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"epoch": 0.32077393075356414,
|
|
"grad_norm": 0.18212255835533142,
|
|
"learning_rate": 1.717760273252417e-06,
|
|
"loss": 0.0529,
|
|
"num_input_tokens_seen": 3628736,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.32119823489477256,
|
|
"grad_norm": 1.2153847217559814,
|
|
"learning_rate": 1.7167281517629854e-06,
|
|
"loss": 0.0684,
|
|
"num_input_tokens_seen": 3633664,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"epoch": 0.321622539035981,
|
|
"grad_norm": 19.054046630859375,
|
|
"learning_rate": 1.7156944578382277e-06,
|
|
"loss": 0.0817,
|
|
"num_input_tokens_seen": 3638400,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.3220468431771894,
|
|
"grad_norm": 8.007206916809082,
|
|
"learning_rate": 1.7146591937459732e-06,
|
|
"loss": 0.1081,
|
|
"num_input_tokens_seen": 3643200,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"epoch": 0.3224711473183978,
|
|
"grad_norm": 27.179113388061523,
|
|
"learning_rate": 1.713622361757495e-06,
|
|
"loss": 0.1423,
|
|
"num_input_tokens_seen": 3647680,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.32289545145960624,
|
|
"grad_norm": 0.31179356575012207,
|
|
"learning_rate": 1.712583964147507e-06,
|
|
"loss": 0.0286,
|
|
"num_input_tokens_seen": 3653120,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"epoch": 0.32331975560081466,
|
|
"grad_norm": 2.882406234741211,
|
|
"learning_rate": 1.7115440031941572e-06,
|
|
"loss": 0.1418,
|
|
"num_input_tokens_seen": 3657856,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.3237440597420231,
|
|
"grad_norm": 13.141288757324219,
|
|
"learning_rate": 1.7105024811790248e-06,
|
|
"loss": 0.1054,
|
|
"num_input_tokens_seen": 3662656,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"epoch": 0.3241683638832315,
|
|
"grad_norm": 4.509273529052734,
|
|
"learning_rate": 1.7094594003871116e-06,
|
|
"loss": 0.076,
|
|
"num_input_tokens_seen": 3668096,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.3245926680244399,
|
|
"grad_norm": 15.30965805053711,
|
|
"learning_rate": 1.7084147631068415e-06,
|
|
"loss": 0.0246,
|
|
"num_input_tokens_seen": 3673280,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"epoch": 0.32501697216564834,
|
|
"grad_norm": 17.180437088012695,
|
|
"learning_rate": 1.7073685716300517e-06,
|
|
"loss": 0.0878,
|
|
"num_input_tokens_seen": 3677824,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.32544127630685676,
|
|
"grad_norm": 1.513655662536621,
|
|
"learning_rate": 1.7063208282519894e-06,
|
|
"loss": 0.0833,
|
|
"num_input_tokens_seen": 3682624,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"epoch": 0.3258655804480652,
|
|
"grad_norm": 15.271832466125488,
|
|
"learning_rate": 1.7052715352713074e-06,
|
|
"loss": 0.067,
|
|
"num_input_tokens_seen": 3687296,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.3262898845892736,
|
|
"grad_norm": 45.52177810668945,
|
|
"learning_rate": 1.7042206949900568e-06,
|
|
"loss": 0.0788,
|
|
"num_input_tokens_seen": 3692352,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"epoch": 0.326714188730482,
|
|
"grad_norm": 13.412903785705566,
|
|
"learning_rate": 1.703168309713684e-06,
|
|
"loss": 0.0409,
|
|
"num_input_tokens_seen": 3697472,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.32713849287169044,
|
|
"grad_norm": 11.4611234664917,
|
|
"learning_rate": 1.7021143817510262e-06,
|
|
"loss": 0.0666,
|
|
"num_input_tokens_seen": 3702528,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"epoch": 0.32756279701289887,
|
|
"grad_norm": 17.81032943725586,
|
|
"learning_rate": 1.7010589134143025e-06,
|
|
"loss": 0.1032,
|
|
"num_input_tokens_seen": 3706560,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.3279871011541073,
|
|
"grad_norm": 11.173263549804688,
|
|
"learning_rate": 1.7000019070191138e-06,
|
|
"loss": 0.0693,
|
|
"num_input_tokens_seen": 3711104,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"epoch": 0.3284114052953157,
|
|
"grad_norm": 7.260513782501221,
|
|
"learning_rate": 1.698943364884434e-06,
|
|
"loss": 0.0774,
|
|
"num_input_tokens_seen": 3715712,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.3288357094365241,
|
|
"grad_norm": 4.175707817077637,
|
|
"learning_rate": 1.697883289332607e-06,
|
|
"loss": 0.0253,
|
|
"num_input_tokens_seen": 3720960,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"epoch": 0.3292600135777325,
|
|
"grad_norm": 1.0138492584228516,
|
|
"learning_rate": 1.6968216826893405e-06,
|
|
"loss": 0.0591,
|
|
"num_input_tokens_seen": 3725504,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.3296843177189409,
|
|
"grad_norm": 16.145221710205078,
|
|
"learning_rate": 1.6957585472837014e-06,
|
|
"loss": 0.0627,
|
|
"num_input_tokens_seen": 3729856,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"epoch": 0.33010862186014933,
|
|
"grad_norm": 1.1031025648117065,
|
|
"learning_rate": 1.6946938854481103e-06,
|
|
"loss": 0.1141,
|
|
"num_input_tokens_seen": 3735040,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.33053292600135775,
|
|
"grad_norm": 10.453174591064453,
|
|
"learning_rate": 1.6936276995183371e-06,
|
|
"loss": 0.0126,
|
|
"num_input_tokens_seen": 3740288,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"epoch": 0.33095723014256617,
|
|
"grad_norm": 1.4971797466278076,
|
|
"learning_rate": 1.6925599918334954e-06,
|
|
"loss": 0.0736,
|
|
"num_input_tokens_seen": 3744960,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.3313815342837746,
|
|
"grad_norm": 0.29560044407844543,
|
|
"learning_rate": 1.6914907647360367e-06,
|
|
"loss": 0.0762,
|
|
"num_input_tokens_seen": 3749120,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"epoch": 0.331805838424983,
|
|
"grad_norm": 12.087617874145508,
|
|
"learning_rate": 1.6904200205717467e-06,
|
|
"loss": 0.0914,
|
|
"num_input_tokens_seen": 3753984,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.33223014256619143,
|
|
"grad_norm": 0.22274813055992126,
|
|
"learning_rate": 1.689347761689739e-06,
|
|
"loss": 0.0227,
|
|
"num_input_tokens_seen": 3758464,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"epoch": 0.33265444670739985,
|
|
"grad_norm": 17.105592727661133,
|
|
"learning_rate": 1.6882739904424507e-06,
|
|
"loss": 0.0793,
|
|
"num_input_tokens_seen": 3762624,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.3330787508486083,
|
|
"grad_norm": 23.649341583251953,
|
|
"learning_rate": 1.6871987091856366e-06,
|
|
"loss": 0.1138,
|
|
"num_input_tokens_seen": 3767616,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"epoch": 0.3335030549898167,
|
|
"grad_norm": 2.6922779083251953,
|
|
"learning_rate": 1.6861219202783644e-06,
|
|
"loss": 0.0032,
|
|
"num_input_tokens_seen": 3772864,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.3339273591310251,
|
|
"grad_norm": 11.042834281921387,
|
|
"learning_rate": 1.6850436260830093e-06,
|
|
"loss": 0.1166,
|
|
"num_input_tokens_seen": 3777728,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"epoch": 0.33435166327223353,
|
|
"grad_norm": 26.489280700683594,
|
|
"learning_rate": 1.683963828965249e-06,
|
|
"loss": 0.0663,
|
|
"num_input_tokens_seen": 3782912,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.33477596741344195,
|
|
"grad_norm": 33.38413619995117,
|
|
"learning_rate": 1.6828825312940592e-06,
|
|
"loss": 0.068,
|
|
"num_input_tokens_seen": 3788160,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"epoch": 0.3352002715546504,
|
|
"grad_norm": 23.24782943725586,
|
|
"learning_rate": 1.6817997354417066e-06,
|
|
"loss": 0.0783,
|
|
"num_input_tokens_seen": 3792448,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.3356245756958588,
|
|
"grad_norm": 0.514280378818512,
|
|
"learning_rate": 1.6807154437837453e-06,
|
|
"loss": 0.0654,
|
|
"num_input_tokens_seen": 3797376,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"epoch": 0.3360488798370672,
|
|
"grad_norm": 13.792230606079102,
|
|
"learning_rate": 1.6796296586990108e-06,
|
|
"loss": 0.0982,
|
|
"num_input_tokens_seen": 3802496,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.33647318397827564,
|
|
"grad_norm": 2.523439645767212,
|
|
"learning_rate": 1.6785423825696156e-06,
|
|
"loss": 0.0673,
|
|
"num_input_tokens_seen": 3806912,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"epoch": 0.33689748811948406,
|
|
"grad_norm": 0.8217607140541077,
|
|
"learning_rate": 1.6774536177809426e-06,
|
|
"loss": 0.0466,
|
|
"num_input_tokens_seen": 3811648,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.3373217922606925,
|
|
"grad_norm": 0.2619585394859314,
|
|
"learning_rate": 1.6763633667216416e-06,
|
|
"loss": 0.0878,
|
|
"num_input_tokens_seen": 3817024,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"epoch": 0.3377460964019009,
|
|
"grad_norm": 39.12828063964844,
|
|
"learning_rate": 1.6752716317836226e-06,
|
|
"loss": 0.0377,
|
|
"num_input_tokens_seen": 3821440,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.3381704005431093,
|
|
"grad_norm": 0.2853908836841583,
|
|
"learning_rate": 1.6741784153620508e-06,
|
|
"loss": 0.0438,
|
|
"num_input_tokens_seen": 3825984,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"epoch": 0.33859470468431774,
|
|
"grad_norm": 32.900543212890625,
|
|
"learning_rate": 1.6730837198553422e-06,
|
|
"loss": 0.0616,
|
|
"num_input_tokens_seen": 3831104,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.33901900882552616,
|
|
"grad_norm": 8.256237983703613,
|
|
"learning_rate": 1.6719875476651577e-06,
|
|
"loss": 0.0798,
|
|
"num_input_tokens_seen": 3836160,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"epoch": 0.3394433129667346,
|
|
"grad_norm": 8.798226356506348,
|
|
"learning_rate": 1.6708899011963978e-06,
|
|
"loss": 0.0798,
|
|
"num_input_tokens_seen": 3840640,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.339867617107943,
|
|
"grad_norm": 18.878061294555664,
|
|
"learning_rate": 1.6697907828571966e-06,
|
|
"loss": 0.05,
|
|
"num_input_tokens_seen": 3845440,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"epoch": 0.3402919212491514,
|
|
"grad_norm": 17.28947639465332,
|
|
"learning_rate": 1.6686901950589193e-06,
|
|
"loss": 0.0974,
|
|
"num_input_tokens_seen": 3850368,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.3407162253903598,
|
|
"grad_norm": 5.869365692138672,
|
|
"learning_rate": 1.6675881402161536e-06,
|
|
"loss": 0.0453,
|
|
"num_input_tokens_seen": 3855296,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"epoch": 0.3411405295315682,
|
|
"grad_norm": 1.2320866584777832,
|
|
"learning_rate": 1.6664846207467054e-06,
|
|
"loss": 0.065,
|
|
"num_input_tokens_seen": 3859648,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.3415648336727766,
|
|
"grad_norm": 16.332191467285156,
|
|
"learning_rate": 1.665379639071595e-06,
|
|
"loss": 0.0307,
|
|
"num_input_tokens_seen": 3864512,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"epoch": 0.34198913781398504,
|
|
"grad_norm": 11.825613021850586,
|
|
"learning_rate": 1.6642731976150492e-06,
|
|
"loss": 0.0518,
|
|
"num_input_tokens_seen": 3868800,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.34241344195519346,
|
|
"grad_norm": 20.51567840576172,
|
|
"learning_rate": 1.6631652988044995e-06,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 3873664,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"epoch": 0.3428377460964019,
|
|
"grad_norm": 28.659391403198242,
|
|
"learning_rate": 1.6620559450705728e-06,
|
|
"loss": 0.0704,
|
|
"num_input_tokens_seen": 3878528,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.3432620502376103,
|
|
"grad_norm": 28.424604415893555,
|
|
"learning_rate": 1.6609451388470885e-06,
|
|
"loss": 0.0975,
|
|
"num_input_tokens_seen": 3883136,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"epoch": 0.3436863543788187,
|
|
"grad_norm": 9.961625099182129,
|
|
"learning_rate": 1.6598328825710533e-06,
|
|
"loss": 0.0922,
|
|
"num_input_tokens_seen": 3888384,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.34411065852002715,
|
|
"grad_norm": 1.2856426239013672,
|
|
"learning_rate": 1.6587191786826543e-06,
|
|
"loss": 0.009,
|
|
"num_input_tokens_seen": 3893056,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"epoch": 0.34453496266123557,
|
|
"grad_norm": 0.14181415736675262,
|
|
"learning_rate": 1.6576040296252553e-06,
|
|
"loss": 0.0588,
|
|
"num_input_tokens_seen": 3897600,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.344959266802444,
|
|
"grad_norm": 24.613985061645508,
|
|
"learning_rate": 1.65648743784539e-06,
|
|
"loss": 0.0862,
|
|
"num_input_tokens_seen": 3902080,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"epoch": 0.3453835709436524,
|
|
"grad_norm": 9.96705150604248,
|
|
"learning_rate": 1.6553694057927573e-06,
|
|
"loss": 0.0624,
|
|
"num_input_tokens_seen": 3906880,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.3458078750848608,
|
|
"grad_norm": 17.452342987060547,
|
|
"learning_rate": 1.654249935920217e-06,
|
|
"loss": 0.1157,
|
|
"num_input_tokens_seen": 3911040,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"epoch": 0.34623217922606925,
|
|
"grad_norm": 6.189785003662109,
|
|
"learning_rate": 1.6531290306837817e-06,
|
|
"loss": 0.0545,
|
|
"num_input_tokens_seen": 3915712,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.34665648336727767,
|
|
"grad_norm": 4.528266429901123,
|
|
"learning_rate": 1.6520066925426143e-06,
|
|
"loss": 0.0784,
|
|
"num_input_tokens_seen": 3919936,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"epoch": 0.3470807875084861,
|
|
"grad_norm": 1.8457293510437012,
|
|
"learning_rate": 1.650882923959021e-06,
|
|
"loss": 0.0925,
|
|
"num_input_tokens_seen": 3924480,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.3475050916496945,
|
|
"grad_norm": 7.639169216156006,
|
|
"learning_rate": 1.649757727398446e-06,
|
|
"loss": 0.1517,
|
|
"num_input_tokens_seen": 3929728,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"epoch": 0.34792939579090293,
|
|
"grad_norm": 7.614553928375244,
|
|
"learning_rate": 1.6486311053294669e-06,
|
|
"loss": 0.0333,
|
|
"num_input_tokens_seen": 3934080,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.34835369993211135,
|
|
"grad_norm": 8.772347450256348,
|
|
"learning_rate": 1.6475030602237876e-06,
|
|
"loss": 0.0888,
|
|
"num_input_tokens_seen": 3938624,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"epoch": 0.34877800407331977,
|
|
"grad_norm": 17.036487579345703,
|
|
"learning_rate": 1.646373594556236e-06,
|
|
"loss": 0.0974,
|
|
"num_input_tokens_seen": 3944448,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.3492023082145282,
|
|
"grad_norm": 14.555418968200684,
|
|
"learning_rate": 1.6452427108047542e-06,
|
|
"loss": 0.0257,
|
|
"num_input_tokens_seen": 3949184,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"epoch": 0.3496266123557366,
|
|
"grad_norm": 11.33698844909668,
|
|
"learning_rate": 1.6441104114503977e-06,
|
|
"loss": 0.0604,
|
|
"num_input_tokens_seen": 3953664,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.35005091649694503,
|
|
"grad_norm": 7.524898529052734,
|
|
"learning_rate": 1.642976698977326e-06,
|
|
"loss": 0.0515,
|
|
"num_input_tokens_seen": 3958336,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"epoch": 0.35047522063815345,
|
|
"grad_norm": 35.69285202026367,
|
|
"learning_rate": 1.6418415758727995e-06,
|
|
"loss": 0.0142,
|
|
"num_input_tokens_seen": 3963584,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.35047522063815345,
|
|
"eval_loss": 0.07080087810754776,
|
|
"eval_runtime": 16.7283,
|
|
"eval_samples_per_second": 626.183,
|
|
"eval_steps_per_second": 78.31,
|
|
"num_input_tokens_seen": 3963584,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.35089952477936187,
|
|
"grad_norm": 6.572513580322266,
|
|
"learning_rate": 1.6407050446271738e-06,
|
|
"loss": 0.0851,
|
|
"num_input_tokens_seen": 3968896,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"epoch": 0.3513238289205703,
|
|
"grad_norm": 8.945390701293945,
|
|
"learning_rate": 1.6395671077338928e-06,
|
|
"loss": 0.1231,
|
|
"num_input_tokens_seen": 3973440,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.35174813306177866,
|
|
"grad_norm": 24.750896453857422,
|
|
"learning_rate": 1.6384277676894855e-06,
|
|
"loss": 0.0503,
|
|
"num_input_tokens_seen": 3978176,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"epoch": 0.3521724372029871,
|
|
"grad_norm": 20.2032527923584,
|
|
"learning_rate": 1.6372870269935583e-06,
|
|
"loss": 0.1018,
|
|
"num_input_tokens_seen": 3983360,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.3525967413441955,
|
|
"grad_norm": 21.14984893798828,
|
|
"learning_rate": 1.6361448881487912e-06,
|
|
"loss": 0.0591,
|
|
"num_input_tokens_seen": 3987584,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"epoch": 0.3530210454854039,
|
|
"grad_norm": 0.25816965103149414,
|
|
"learning_rate": 1.6350013536609307e-06,
|
|
"loss": 0.0428,
|
|
"num_input_tokens_seen": 3992576,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.35344534962661234,
|
|
"grad_norm": 0.6929789781570435,
|
|
"learning_rate": 1.6338564260387861e-06,
|
|
"loss": 0.0598,
|
|
"num_input_tokens_seen": 3997824,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"epoch": 0.35386965376782076,
|
|
"grad_norm": 19.583251953125,
|
|
"learning_rate": 1.6327101077942228e-06,
|
|
"loss": 0.0944,
|
|
"num_input_tokens_seen": 4002048,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.3542939579090292,
|
|
"grad_norm": 43.634063720703125,
|
|
"learning_rate": 1.631562401442157e-06,
|
|
"loss": 0.0685,
|
|
"num_input_tokens_seen": 4006656,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"epoch": 0.3547182620502376,
|
|
"grad_norm": 46.133480072021484,
|
|
"learning_rate": 1.6304133095005505e-06,
|
|
"loss": 0.1111,
|
|
"num_input_tokens_seen": 4011136,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.355142566191446,
|
|
"grad_norm": 55.2512321472168,
|
|
"learning_rate": 1.6292628344904048e-06,
|
|
"loss": 0.0772,
|
|
"num_input_tokens_seen": 4015808,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"epoch": 0.35556687033265444,
|
|
"grad_norm": 28.896337509155273,
|
|
"learning_rate": 1.628110978935756e-06,
|
|
"loss": 0.0808,
|
|
"num_input_tokens_seen": 4020480,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.35599117447386286,
|
|
"grad_norm": 1.751360297203064,
|
|
"learning_rate": 1.626957745363668e-06,
|
|
"loss": 0.053,
|
|
"num_input_tokens_seen": 4025088,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"epoch": 0.3564154786150713,
|
|
"grad_norm": 5.575546741485596,
|
|
"learning_rate": 1.6258031363042291e-06,
|
|
"loss": 0.0486,
|
|
"num_input_tokens_seen": 4030272,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.3568397827562797,
|
|
"grad_norm": 7.347448348999023,
|
|
"learning_rate": 1.624647154290545e-06,
|
|
"loss": 0.0786,
|
|
"num_input_tokens_seen": 4035072,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"epoch": 0.3572640868974881,
|
|
"grad_norm": 1.669090986251831,
|
|
"learning_rate": 1.6234898018587336e-06,
|
|
"loss": 0.0577,
|
|
"num_input_tokens_seen": 4039488,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.35768839103869654,
|
|
"grad_norm": 14.9818115234375,
|
|
"learning_rate": 1.6223310815479186e-06,
|
|
"loss": 0.1174,
|
|
"num_input_tokens_seen": 4044480,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"epoch": 0.35811269517990496,
|
|
"grad_norm": 0.21652163565158844,
|
|
"learning_rate": 1.6211709959002255e-06,
|
|
"loss": 0.0636,
|
|
"num_input_tokens_seen": 4048768,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.3585369993211134,
|
|
"grad_norm": 1.3993791341781616,
|
|
"learning_rate": 1.620009547460775e-06,
|
|
"loss": 0.0514,
|
|
"num_input_tokens_seen": 4053504,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"epoch": 0.3589613034623218,
|
|
"grad_norm": 8.678082466125488,
|
|
"learning_rate": 1.6188467387776779e-06,
|
|
"loss": 0.0311,
|
|
"num_input_tokens_seen": 4057856,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.3593856076035302,
|
|
"grad_norm": 6.082565784454346,
|
|
"learning_rate": 1.6176825724020286e-06,
|
|
"loss": 0.0254,
|
|
"num_input_tokens_seen": 4062784,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"epoch": 0.35980991174473864,
|
|
"grad_norm": 0.4407521188259125,
|
|
"learning_rate": 1.6165170508879007e-06,
|
|
"loss": 0.0296,
|
|
"num_input_tokens_seen": 4067328,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.36023421588594706,
|
|
"grad_norm": 26.19462776184082,
|
|
"learning_rate": 1.6153501767923408e-06,
|
|
"loss": 0.0801,
|
|
"num_input_tokens_seen": 4072704,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"epoch": 0.3606585200271555,
|
|
"grad_norm": 16.313873291015625,
|
|
"learning_rate": 1.6141819526753626e-06,
|
|
"loss": 0.1075,
|
|
"num_input_tokens_seen": 4077504,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.3610828241683639,
|
|
"grad_norm": 0.1650468409061432,
|
|
"learning_rate": 1.613012381099942e-06,
|
|
"loss": 0.0992,
|
|
"num_input_tokens_seen": 4082240,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"epoch": 0.3615071283095723,
|
|
"grad_norm": 7.224881172180176,
|
|
"learning_rate": 1.6118414646320111e-06,
|
|
"loss": 0.0631,
|
|
"num_input_tokens_seen": 4086272,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.36193143245078074,
|
|
"grad_norm": 16.345388412475586,
|
|
"learning_rate": 1.6106692058404518e-06,
|
|
"loss": 0.0587,
|
|
"num_input_tokens_seen": 4090880,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"epoch": 0.36235573659198916,
|
|
"grad_norm": 16.976482391357422,
|
|
"learning_rate": 1.6094956072970924e-06,
|
|
"loss": 0.0658,
|
|
"num_input_tokens_seen": 4095552,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.36278004073319753,
|
|
"grad_norm": 0.15847325325012207,
|
|
"learning_rate": 1.608320671576699e-06,
|
|
"loss": 0.0618,
|
|
"num_input_tokens_seen": 4100352,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"epoch": 0.36320434487440595,
|
|
"grad_norm": 14.864291191101074,
|
|
"learning_rate": 1.6071444012569723e-06,
|
|
"loss": 0.0495,
|
|
"num_input_tokens_seen": 4104704,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.36362864901561437,
|
|
"grad_norm": 15.263455390930176,
|
|
"learning_rate": 1.6059667989185405e-06,
|
|
"loss": 0.1172,
|
|
"num_input_tokens_seen": 4109376,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"epoch": 0.3640529531568228,
|
|
"grad_norm": 10.064188003540039,
|
|
"learning_rate": 1.6047878671449544e-06,
|
|
"loss": 0.0851,
|
|
"num_input_tokens_seen": 4114496,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.3644772572980312,
|
|
"grad_norm": 19.330795288085938,
|
|
"learning_rate": 1.6036076085226812e-06,
|
|
"loss": 0.0434,
|
|
"num_input_tokens_seen": 4119552,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"epoch": 0.36490156143923963,
|
|
"grad_norm": 1.7003531455993652,
|
|
"learning_rate": 1.6024260256410995e-06,
|
|
"loss": 0.0246,
|
|
"num_input_tokens_seen": 4124352,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.36532586558044805,
|
|
"grad_norm": 19.22929573059082,
|
|
"learning_rate": 1.601243121092493e-06,
|
|
"loss": 0.0164,
|
|
"num_input_tokens_seen": 4129152,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"epoch": 0.36575016972165647,
|
|
"grad_norm": 7.802242755889893,
|
|
"learning_rate": 1.6000588974720443e-06,
|
|
"loss": 0.048,
|
|
"num_input_tokens_seen": 4134144,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.3661744738628649,
|
|
"grad_norm": 12.478157997131348,
|
|
"learning_rate": 1.5988733573778314e-06,
|
|
"loss": 0.1202,
|
|
"num_input_tokens_seen": 4138816,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"epoch": 0.3665987780040733,
|
|
"grad_norm": 33.31437683105469,
|
|
"learning_rate": 1.597686503410819e-06,
|
|
"loss": 0.1003,
|
|
"num_input_tokens_seen": 4143680,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.36702308214528173,
|
|
"grad_norm": 12.636787414550781,
|
|
"learning_rate": 1.596498338174856e-06,
|
|
"loss": 0.0842,
|
|
"num_input_tokens_seen": 4149120,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"epoch": 0.36744738628649015,
|
|
"grad_norm": 19.710073471069336,
|
|
"learning_rate": 1.595308864276666e-06,
|
|
"loss": 0.0358,
|
|
"num_input_tokens_seen": 4154432,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.36787169042769857,
|
|
"grad_norm": 0.14161965250968933,
|
|
"learning_rate": 1.5941180843258452e-06,
|
|
"loss": 0.0514,
|
|
"num_input_tokens_seen": 4159360,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"epoch": 0.368295994568907,
|
|
"grad_norm": 3.578963041305542,
|
|
"learning_rate": 1.5929260009348551e-06,
|
|
"loss": 0.0224,
|
|
"num_input_tokens_seen": 4163520,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.3687202987101154,
|
|
"grad_norm": 15.590179443359375,
|
|
"learning_rate": 1.5917326167190163e-06,
|
|
"loss": 0.0829,
|
|
"num_input_tokens_seen": 4168640,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"epoch": 0.36914460285132383,
|
|
"grad_norm": 52.827056884765625,
|
|
"learning_rate": 1.5905379342965033e-06,
|
|
"loss": 0.1176,
|
|
"num_input_tokens_seen": 4173312,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.36956890699253225,
|
|
"grad_norm": 9.324585914611816,
|
|
"learning_rate": 1.589341956288339e-06,
|
|
"loss": 0.1304,
|
|
"num_input_tokens_seen": 4177664,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"epoch": 0.3699932111337407,
|
|
"grad_norm": 0.6234789490699768,
|
|
"learning_rate": 1.5881446853183888e-06,
|
|
"loss": 0.0388,
|
|
"num_input_tokens_seen": 4182016,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.3704175152749491,
|
|
"grad_norm": 1.5286539793014526,
|
|
"learning_rate": 1.586946124013354e-06,
|
|
"loss": 0.0042,
|
|
"num_input_tokens_seen": 4186624,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"epoch": 0.3708418194161575,
|
|
"grad_norm": 7.857736110687256,
|
|
"learning_rate": 1.585746275002768e-06,
|
|
"loss": 0.0721,
|
|
"num_input_tokens_seen": 4191936,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.37126612355736593,
|
|
"grad_norm": 10.920904159545898,
|
|
"learning_rate": 1.5845451409189887e-06,
|
|
"loss": 0.0805,
|
|
"num_input_tokens_seen": 4196352,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"epoch": 0.37169042769857435,
|
|
"grad_norm": 22.163360595703125,
|
|
"learning_rate": 1.5833427243971927e-06,
|
|
"loss": 0.0616,
|
|
"num_input_tokens_seen": 4200960,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.3721147318397828,
|
|
"grad_norm": 0.1889847368001938,
|
|
"learning_rate": 1.582139028075371e-06,
|
|
"loss": 0.0615,
|
|
"num_input_tokens_seen": 4206208,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"epoch": 0.3725390359809912,
|
|
"grad_norm": 18.363664627075195,
|
|
"learning_rate": 1.580934054594322e-06,
|
|
"loss": 0.0458,
|
|
"num_input_tokens_seen": 4210688,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.3729633401221996,
|
|
"grad_norm": 7.274831295013428,
|
|
"learning_rate": 1.5797278065976463e-06,
|
|
"loss": 0.0502,
|
|
"num_input_tokens_seen": 4215424,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"epoch": 0.37338764426340804,
|
|
"grad_norm": 17.620458602905273,
|
|
"learning_rate": 1.5785202867317407e-06,
|
|
"loss": 0.0088,
|
|
"num_input_tokens_seen": 4220160,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.37381194840461646,
|
|
"grad_norm": 0.7554625868797302,
|
|
"learning_rate": 1.5773114976457915e-06,
|
|
"loss": 0.1113,
|
|
"num_input_tokens_seen": 4224832,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"epoch": 0.3742362525458248,
|
|
"grad_norm": 19.074024200439453,
|
|
"learning_rate": 1.576101441991771e-06,
|
|
"loss": 0.1088,
|
|
"num_input_tokens_seen": 4230464,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.37466055668703324,
|
|
"grad_norm": 20.73314094543457,
|
|
"learning_rate": 1.574890122424429e-06,
|
|
"loss": 0.092,
|
|
"num_input_tokens_seen": 4234816,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"epoch": 0.37508486082824166,
|
|
"grad_norm": 4.108742713928223,
|
|
"learning_rate": 1.573677541601289e-06,
|
|
"loss": 0.1154,
|
|
"num_input_tokens_seen": 4239424,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.3755091649694501,
|
|
"grad_norm": 46.613624572753906,
|
|
"learning_rate": 1.5724637021826409e-06,
|
|
"loss": 0.0449,
|
|
"num_input_tokens_seen": 4244160,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"epoch": 0.3759334691106585,
|
|
"grad_norm": 3.298067808151245,
|
|
"learning_rate": 1.5712486068315367e-06,
|
|
"loss": 0.0804,
|
|
"num_input_tokens_seen": 4248320,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.3763577732518669,
|
|
"grad_norm": 30.74022674560547,
|
|
"learning_rate": 1.5700322582137826e-06,
|
|
"loss": 0.0146,
|
|
"num_input_tokens_seen": 4252672,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"epoch": 0.37678207739307534,
|
|
"grad_norm": 0.15361003577709198,
|
|
"learning_rate": 1.5688146589979358e-06,
|
|
"loss": 0.0993,
|
|
"num_input_tokens_seen": 4257280,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.37720638153428376,
|
|
"grad_norm": 1.218698263168335,
|
|
"learning_rate": 1.5675958118552962e-06,
|
|
"loss": 0.035,
|
|
"num_input_tokens_seen": 4262592,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"epoch": 0.3776306856754922,
|
|
"grad_norm": 12.991599082946777,
|
|
"learning_rate": 1.5663757194599013e-06,
|
|
"loss": 0.0887,
|
|
"num_input_tokens_seen": 4267840,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.3780549898167006,
|
|
"grad_norm": 0.4754124581813812,
|
|
"learning_rate": 1.5651543844885216e-06,
|
|
"loss": 0.0662,
|
|
"num_input_tokens_seen": 4272576,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"epoch": 0.378479293957909,
|
|
"grad_norm": 42.31813049316406,
|
|
"learning_rate": 1.5639318096206533e-06,
|
|
"loss": 0.0402,
|
|
"num_input_tokens_seen": 4277568,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.37890359809911744,
|
|
"grad_norm": 13.319600105285645,
|
|
"learning_rate": 1.562707997538512e-06,
|
|
"loss": 0.086,
|
|
"num_input_tokens_seen": 4282880,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"epoch": 0.37932790224032586,
|
|
"grad_norm": 30.01394271850586,
|
|
"learning_rate": 1.5614829509270288e-06,
|
|
"loss": 0.0974,
|
|
"num_input_tokens_seen": 4287296,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.3797522063815343,
|
|
"grad_norm": 0.09843635559082031,
|
|
"learning_rate": 1.5602566724738426e-06,
|
|
"loss": 0.0648,
|
|
"num_input_tokens_seen": 4291712,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"epoch": 0.3801765105227427,
|
|
"grad_norm": 21.0961971282959,
|
|
"learning_rate": 1.5590291648692952e-06,
|
|
"loss": 0.0891,
|
|
"num_input_tokens_seen": 4297088,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.3806008146639511,
|
|
"grad_norm": 30.536773681640625,
|
|
"learning_rate": 1.5578004308064245e-06,
|
|
"loss": 0.0539,
|
|
"num_input_tokens_seen": 4302784,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"epoch": 0.38102511880515955,
|
|
"grad_norm": 15.418561935424805,
|
|
"learning_rate": 1.55657047298096e-06,
|
|
"loss": 0.0554,
|
|
"num_input_tokens_seen": 4307584,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.38144942294636797,
|
|
"grad_norm": 0.33246198296546936,
|
|
"learning_rate": 1.5553392940913148e-06,
|
|
"loss": 0.0194,
|
|
"num_input_tokens_seen": 4312064,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"epoch": 0.3818737270875764,
|
|
"grad_norm": 0.3118828535079956,
|
|
"learning_rate": 1.554106896838582e-06,
|
|
"loss": 0.0279,
|
|
"num_input_tokens_seen": 4316672,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.3822980312287848,
|
|
"grad_norm": 32.034019470214844,
|
|
"learning_rate": 1.5528732839265272e-06,
|
|
"loss": 0.057,
|
|
"num_input_tokens_seen": 4321088,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"epoch": 0.3827223353699932,
|
|
"grad_norm": 28.745563507080078,
|
|
"learning_rate": 1.5516384580615832e-06,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 4326208,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.38314663951120165,
|
|
"grad_norm": 0.07716988027095795,
|
|
"learning_rate": 1.5504024219528437e-06,
|
|
"loss": 0.0615,
|
|
"num_input_tokens_seen": 4331008,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"epoch": 0.38357094365241007,
|
|
"grad_norm": 15.384174346923828,
|
|
"learning_rate": 1.5491651783120578e-06,
|
|
"loss": 0.0182,
|
|
"num_input_tokens_seen": 4335936,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.3839952477936185,
|
|
"grad_norm": 0.0799812600016594,
|
|
"learning_rate": 1.5479267298536238e-06,
|
|
"loss": 0.0459,
|
|
"num_input_tokens_seen": 4340608,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"epoch": 0.3844195519348269,
|
|
"grad_norm": 23.019872665405273,
|
|
"learning_rate": 1.5466870792945828e-06,
|
|
"loss": 0.0559,
|
|
"num_input_tokens_seen": 4345344,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.38484385607603533,
|
|
"grad_norm": 7.923668384552002,
|
|
"learning_rate": 1.545446229354614e-06,
|
|
"loss": 0.0846,
|
|
"num_input_tokens_seen": 4351680,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"epoch": 0.3852681602172437,
|
|
"grad_norm": 35.16139221191406,
|
|
"learning_rate": 1.5442041827560272e-06,
|
|
"loss": 0.1022,
|
|
"num_input_tokens_seen": 4356032,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.3856924643584521,
|
|
"grad_norm": 27.721284866333008,
|
|
"learning_rate": 1.542960942223758e-06,
|
|
"loss": 0.0553,
|
|
"num_input_tokens_seen": 4360576,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"epoch": 0.38611676849966053,
|
|
"grad_norm": 32.3733024597168,
|
|
"learning_rate": 1.541716510485361e-06,
|
|
"loss": 0.1058,
|
|
"num_input_tokens_seen": 4365376,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.38654107264086895,
|
|
"grad_norm": 0.24614597856998444,
|
|
"learning_rate": 1.5404708902710048e-06,
|
|
"loss": 0.0121,
|
|
"num_input_tokens_seen": 4370368,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"epoch": 0.3869653767820774,
|
|
"grad_norm": 7.687742710113525,
|
|
"learning_rate": 1.5392240843134648e-06,
|
|
"loss": 0.0926,
|
|
"num_input_tokens_seen": 4375488,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.3873896809232858,
|
|
"grad_norm": 34.76953125,
|
|
"learning_rate": 1.5379760953481178e-06,
|
|
"loss": 0.1102,
|
|
"num_input_tokens_seen": 4380352,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"epoch": 0.3878139850644942,
|
|
"grad_norm": 25.13454818725586,
|
|
"learning_rate": 1.5367269261129367e-06,
|
|
"loss": 0.0729,
|
|
"num_input_tokens_seen": 4385664,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.38823828920570264,
|
|
"grad_norm": 16.465362548828125,
|
|
"learning_rate": 1.5354765793484831e-06,
|
|
"loss": 0.1266,
|
|
"num_input_tokens_seen": 4390528,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"epoch": 0.38866259334691106,
|
|
"grad_norm": 0.2559778094291687,
|
|
"learning_rate": 1.5342250577979023e-06,
|
|
"loss": 0.0143,
|
|
"num_input_tokens_seen": 4395136,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.3890868974881195,
|
|
"grad_norm": 8.67482852935791,
|
|
"learning_rate": 1.532972364206917e-06,
|
|
"loss": 0.0737,
|
|
"num_input_tokens_seen": 4400320,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"epoch": 0.3895112016293279,
|
|
"grad_norm": 0.35467416048049927,
|
|
"learning_rate": 1.5317185013238209e-06,
|
|
"loss": 0.0596,
|
|
"num_input_tokens_seen": 4405056,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.3899355057705363,
|
|
"grad_norm": 33.41234588623047,
|
|
"learning_rate": 1.5304634718994738e-06,
|
|
"loss": 0.0734,
|
|
"num_input_tokens_seen": 4410624,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"epoch": 0.39035980991174474,
|
|
"grad_norm": 11.3428316116333,
|
|
"learning_rate": 1.5292072786872938e-06,
|
|
"loss": 0.0625,
|
|
"num_input_tokens_seen": 4415424,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.39078411405295316,
|
|
"grad_norm": 8.808953285217285,
|
|
"learning_rate": 1.527949924443253e-06,
|
|
"loss": 0.0476,
|
|
"num_input_tokens_seen": 4420224,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"epoch": 0.3912084181941616,
|
|
"grad_norm": 13.808575630187988,
|
|
"learning_rate": 1.52669141192587e-06,
|
|
"loss": 0.0841,
|
|
"num_input_tokens_seen": 4424896,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.39163272233537,
|
|
"grad_norm": 18.995351791381836,
|
|
"learning_rate": 1.5254317438962052e-06,
|
|
"loss": 0.105,
|
|
"num_input_tokens_seen": 4429312,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"epoch": 0.3920570264765784,
|
|
"grad_norm": 6.78574800491333,
|
|
"learning_rate": 1.5241709231178539e-06,
|
|
"loss": 0.1105,
|
|
"num_input_tokens_seen": 4433920,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.39248133061778684,
|
|
"grad_norm": 11.566998481750488,
|
|
"learning_rate": 1.5229089523569405e-06,
|
|
"loss": 0.07,
|
|
"num_input_tokens_seen": 4438464,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"epoch": 0.39290563475899526,
|
|
"grad_norm": 5.315352439880371,
|
|
"learning_rate": 1.5216458343821122e-06,
|
|
"loss": 0.043,
|
|
"num_input_tokens_seen": 4443584,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.3933299389002037,
|
|
"grad_norm": 1.3771780729293823,
|
|
"learning_rate": 1.5203815719645328e-06,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 4448960,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"epoch": 0.3937542430414121,
|
|
"grad_norm": 0.76198810338974,
|
|
"learning_rate": 1.5191161678778773e-06,
|
|
"loss": 0.0259,
|
|
"num_input_tokens_seen": 4453504,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.3941785471826205,
|
|
"grad_norm": 24.077634811401367,
|
|
"learning_rate": 1.5178496248983251e-06,
|
|
"loss": 0.0853,
|
|
"num_input_tokens_seen": 4458048,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"epoch": 0.39460285132382894,
|
|
"grad_norm": 14.514925003051758,
|
|
"learning_rate": 1.5165819458045554e-06,
|
|
"loss": 0.039,
|
|
"num_input_tokens_seen": 4463168,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.39502715546503736,
|
|
"grad_norm": 8.085332870483398,
|
|
"learning_rate": 1.5153131333777377e-06,
|
|
"loss": 0.1405,
|
|
"num_input_tokens_seen": 4467520,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"epoch": 0.3954514596062458,
|
|
"grad_norm": 39.1966438293457,
|
|
"learning_rate": 1.51404319040153e-06,
|
|
"loss": 0.076,
|
|
"num_input_tokens_seen": 4472960,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.3958757637474542,
|
|
"grad_norm": 22.962270736694336,
|
|
"learning_rate": 1.5127721196620697e-06,
|
|
"loss": 0.0622,
|
|
"num_input_tokens_seen": 4477312,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"epoch": 0.39630006788866257,
|
|
"grad_norm": 1.0136003494262695,
|
|
"learning_rate": 1.5114999239479685e-06,
|
|
"loss": 0.0216,
|
|
"num_input_tokens_seen": 4482432,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.396724372029871,
|
|
"grad_norm": 24.0020694732666,
|
|
"learning_rate": 1.5102266060503063e-06,
|
|
"loss": 0.0665,
|
|
"num_input_tokens_seen": 4486464,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"epoch": 0.3971486761710794,
|
|
"grad_norm": 7.203904628753662,
|
|
"learning_rate": 1.508952168762624e-06,
|
|
"loss": 0.0947,
|
|
"num_input_tokens_seen": 4491520,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.3975729803122878,
|
|
"grad_norm": 29.702621459960938,
|
|
"learning_rate": 1.5076766148809209e-06,
|
|
"loss": 0.0837,
|
|
"num_input_tokens_seen": 4496256,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"epoch": 0.39799728445349625,
|
|
"grad_norm": 8.779284477233887,
|
|
"learning_rate": 1.506399947203643e-06,
|
|
"loss": 0.0599,
|
|
"num_input_tokens_seen": 4500416,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.39842158859470467,
|
|
"grad_norm": 44.37575149536133,
|
|
"learning_rate": 1.5051221685316815e-06,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 4505536,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"epoch": 0.3988458927359131,
|
|
"grad_norm": 8.90916633605957,
|
|
"learning_rate": 1.5038432816683652e-06,
|
|
"loss": 0.1264,
|
|
"num_input_tokens_seen": 4510400,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.3992701968771215,
|
|
"grad_norm": 2.163975477218628,
|
|
"learning_rate": 1.5025632894194532e-06,
|
|
"loss": 0.0898,
|
|
"num_input_tokens_seen": 4515904,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"epoch": 0.39969450101832993,
|
|
"grad_norm": 19.975688934326172,
|
|
"learning_rate": 1.5012821945931303e-06,
|
|
"loss": 0.0969,
|
|
"num_input_tokens_seen": 4520448,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.40011880515953835,
|
|
"grad_norm": 17.143659591674805,
|
|
"learning_rate": 1.5e-06,
|
|
"loss": 0.0681,
|
|
"num_input_tokens_seen": 4525824,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"epoch": 0.40054310930074677,
|
|
"grad_norm": 29.65882682800293,
|
|
"learning_rate": 1.498716708453079e-06,
|
|
"loss": 0.0847,
|
|
"num_input_tokens_seen": 4530304,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.40054310930074677,
|
|
"eval_loss": 0.05730587989091873,
|
|
"eval_runtime": 16.5764,
|
|
"eval_samples_per_second": 631.923,
|
|
"eval_steps_per_second": 79.028,
|
|
"num_input_tokens_seen": 4530304,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.4009674134419552,
|
|
"grad_norm": 12.743217468261719,
|
|
"learning_rate": 1.4974323227677903e-06,
|
|
"loss": 0.0852,
|
|
"num_input_tokens_seen": 4534720,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"epoch": 0.4013917175831636,
|
|
"grad_norm": 12.805205345153809,
|
|
"learning_rate": 1.4961468457619575e-06,
|
|
"loss": 0.101,
|
|
"num_input_tokens_seen": 4539520,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.40181602172437203,
|
|
"grad_norm": 0.24352645874023438,
|
|
"learning_rate": 1.4948602802557982e-06,
|
|
"loss": 0.0093,
|
|
"num_input_tokens_seen": 4544448,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"epoch": 0.40224032586558045,
|
|
"grad_norm": 21.318071365356445,
|
|
"learning_rate": 1.4935726290719177e-06,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 4549632,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.40266463000678887,
|
|
"grad_norm": 3.4728119373321533,
|
|
"learning_rate": 1.492283895035305e-06,
|
|
"loss": 0.0724,
|
|
"num_input_tokens_seen": 4554560,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"epoch": 0.4030889341479973,
|
|
"grad_norm": 1.816063642501831,
|
|
"learning_rate": 1.490994080973322e-06,
|
|
"loss": 0.0549,
|
|
"num_input_tokens_seen": 4559168,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.4035132382892057,
|
|
"grad_norm": 29.83852767944336,
|
|
"learning_rate": 1.4897031897157025e-06,
|
|
"loss": 0.066,
|
|
"num_input_tokens_seen": 4563968,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"epoch": 0.40393754243041413,
|
|
"grad_norm": 14.2861328125,
|
|
"learning_rate": 1.4884112240945425e-06,
|
|
"loss": 0.0544,
|
|
"num_input_tokens_seen": 4569408,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.40436184657162255,
|
|
"grad_norm": 0.583966851234436,
|
|
"learning_rate": 1.4871181869442952e-06,
|
|
"loss": 0.0728,
|
|
"num_input_tokens_seen": 4573824,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"epoch": 0.40478615071283097,
|
|
"grad_norm": 20.67850112915039,
|
|
"learning_rate": 1.485824081101764e-06,
|
|
"loss": 0.1322,
|
|
"num_input_tokens_seen": 4578368,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.4052104548540394,
|
|
"grad_norm": 42.52288818359375,
|
|
"learning_rate": 1.4845289094060984e-06,
|
|
"loss": 0.0845,
|
|
"num_input_tokens_seen": 4583040,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"epoch": 0.4056347589952478,
|
|
"grad_norm": 17.58137321472168,
|
|
"learning_rate": 1.4832326746987846e-06,
|
|
"loss": 0.1076,
|
|
"num_input_tokens_seen": 4587968,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.40605906313645623,
|
|
"grad_norm": 10.369521141052246,
|
|
"learning_rate": 1.4819353798236424e-06,
|
|
"loss": 0.1363,
|
|
"num_input_tokens_seen": 4593216,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"epoch": 0.40648336727766465,
|
|
"grad_norm": 13.735040664672852,
|
|
"learning_rate": 1.4806370276268163e-06,
|
|
"loss": 0.0876,
|
|
"num_input_tokens_seen": 4597824,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.4069076714188731,
|
|
"grad_norm": 16.774799346923828,
|
|
"learning_rate": 1.4793376209567714e-06,
|
|
"loss": 0.0581,
|
|
"num_input_tokens_seen": 4602880,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"epoch": 0.4073319755600815,
|
|
"grad_norm": 6.756897449493408,
|
|
"learning_rate": 1.4780371626642858e-06,
|
|
"loss": 0.0181,
|
|
"num_input_tokens_seen": 4607744,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.40775627970128986,
|
|
"grad_norm": 7.229116916656494,
|
|
"learning_rate": 1.4767356556024448e-06,
|
|
"loss": 0.1136,
|
|
"num_input_tokens_seen": 4612224,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"epoch": 0.4081805838424983,
|
|
"grad_norm": 14.638116836547852,
|
|
"learning_rate": 1.4754331026266344e-06,
|
|
"loss": 0.0631,
|
|
"num_input_tokens_seen": 4616704,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.4086048879837067,
|
|
"grad_norm": 7.521239280700684,
|
|
"learning_rate": 1.474129506594536e-06,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 4621568,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"epoch": 0.4090291921249151,
|
|
"grad_norm": 19.64031219482422,
|
|
"learning_rate": 1.472824870366118e-06,
|
|
"loss": 0.0475,
|
|
"num_input_tokens_seen": 4626176,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.40945349626612354,
|
|
"grad_norm": 7.8665924072265625,
|
|
"learning_rate": 1.4715191968036324e-06,
|
|
"loss": 0.0625,
|
|
"num_input_tokens_seen": 4630400,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"epoch": 0.40987780040733196,
|
|
"grad_norm": 29.786128997802734,
|
|
"learning_rate": 1.4702124887716058e-06,
|
|
"loss": 0.055,
|
|
"num_input_tokens_seen": 4634688,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.4103021045485404,
|
|
"grad_norm": 2.612760066986084,
|
|
"learning_rate": 1.4689047491368354e-06,
|
|
"loss": 0.0478,
|
|
"num_input_tokens_seen": 4639104,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"epoch": 0.4107264086897488,
|
|
"grad_norm": 38.5584716796875,
|
|
"learning_rate": 1.4675959807683808e-06,
|
|
"loss": 0.0919,
|
|
"num_input_tokens_seen": 4643328,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.4111507128309572,
|
|
"grad_norm": 9.823929786682129,
|
|
"learning_rate": 1.4662861865375588e-06,
|
|
"loss": 0.0496,
|
|
"num_input_tokens_seen": 4648320,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"epoch": 0.41157501697216564,
|
|
"grad_norm": 2.0782713890075684,
|
|
"learning_rate": 1.4649753693179373e-06,
|
|
"loss": 0.0994,
|
|
"num_input_tokens_seen": 4653120,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.41199932111337406,
|
|
"grad_norm": 27.476821899414062,
|
|
"learning_rate": 1.4636635319853272e-06,
|
|
"loss": 0.0693,
|
|
"num_input_tokens_seen": 4658048,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"epoch": 0.4124236252545825,
|
|
"grad_norm": 1.055557370185852,
|
|
"learning_rate": 1.4623506774177796e-06,
|
|
"loss": 0.0279,
|
|
"num_input_tokens_seen": 4662976,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.4128479293957909,
|
|
"grad_norm": 20.705156326293945,
|
|
"learning_rate": 1.4610368084955748e-06,
|
|
"loss": 0.0648,
|
|
"num_input_tokens_seen": 4667840,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"epoch": 0.4132722335369993,
|
|
"grad_norm": 56.287742614746094,
|
|
"learning_rate": 1.4597219281012208e-06,
|
|
"loss": 0.0903,
|
|
"num_input_tokens_seen": 4673408,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.41369653767820774,
|
|
"grad_norm": 29.7710018157959,
|
|
"learning_rate": 1.4584060391194436e-06,
|
|
"loss": 0.102,
|
|
"num_input_tokens_seen": 4679552,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"epoch": 0.41412084181941616,
|
|
"grad_norm": 0.7720720767974854,
|
|
"learning_rate": 1.4570891444371814e-06,
|
|
"loss": 0.0925,
|
|
"num_input_tokens_seen": 4684352,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.4145451459606246,
|
|
"grad_norm": 3.5888407230377197,
|
|
"learning_rate": 1.4557712469435797e-06,
|
|
"loss": 0.0633,
|
|
"num_input_tokens_seen": 4688704,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"epoch": 0.414969450101833,
|
|
"grad_norm": 13.542695999145508,
|
|
"learning_rate": 1.4544523495299841e-06,
|
|
"loss": 0.0528,
|
|
"num_input_tokens_seen": 4693440,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.4153937542430414,
|
|
"grad_norm": 2.202542304992676,
|
|
"learning_rate": 1.4531324550899333e-06,
|
|
"loss": 0.1069,
|
|
"num_input_tokens_seen": 4698496,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"epoch": 0.41581805838424984,
|
|
"grad_norm": 20.359878540039062,
|
|
"learning_rate": 1.451811566519154e-06,
|
|
"loss": 0.1123,
|
|
"num_input_tokens_seen": 4703616,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.41624236252545826,
|
|
"grad_norm": 5.599926948547363,
|
|
"learning_rate": 1.450489686715553e-06,
|
|
"loss": 0.0755,
|
|
"num_input_tokens_seen": 4707584,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"epoch": 0.4166666666666667,
|
|
"grad_norm": 33.29148864746094,
|
|
"learning_rate": 1.4491668185792131e-06,
|
|
"loss": 0.1261,
|
|
"num_input_tokens_seen": 4711936,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.4170909708078751,
|
|
"grad_norm": 2.7609353065490723,
|
|
"learning_rate": 1.4478429650123851e-06,
|
|
"loss": 0.0594,
|
|
"num_input_tokens_seen": 4716992,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"epoch": 0.4175152749490835,
|
|
"grad_norm": 20.4354190826416,
|
|
"learning_rate": 1.44651812891948e-06,
|
|
"loss": 0.0584,
|
|
"num_input_tokens_seen": 4721792,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.41793957909029195,
|
|
"grad_norm": 12.78392505645752,
|
|
"learning_rate": 1.4451923132070669e-06,
|
|
"loss": 0.0707,
|
|
"num_input_tokens_seen": 4726208,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"epoch": 0.41836388323150037,
|
|
"grad_norm": 23.896026611328125,
|
|
"learning_rate": 1.4438655207838628e-06,
|
|
"loss": 0.0552,
|
|
"num_input_tokens_seen": 4730880,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.41878818737270873,
|
|
"grad_norm": 2.820502758026123,
|
|
"learning_rate": 1.4425377545607275e-06,
|
|
"loss": 0.0935,
|
|
"num_input_tokens_seen": 4736128,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"epoch": 0.41921249151391715,
|
|
"grad_norm": 0.9861387610435486,
|
|
"learning_rate": 1.4412090174506567e-06,
|
|
"loss": 0.031,
|
|
"num_input_tokens_seen": 4740608,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.41963679565512557,
|
|
"grad_norm": 0.9733887314796448,
|
|
"learning_rate": 1.4398793123687777e-06,
|
|
"loss": 0.0755,
|
|
"num_input_tokens_seen": 4745408,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"epoch": 0.420061099796334,
|
|
"grad_norm": 0.27657586336135864,
|
|
"learning_rate": 1.4385486422323404e-06,
|
|
"loss": 0.0051,
|
|
"num_input_tokens_seen": 4750400,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.4204854039375424,
|
|
"grad_norm": 0.5651236176490784,
|
|
"learning_rate": 1.4372170099607123e-06,
|
|
"loss": 0.0511,
|
|
"num_input_tokens_seen": 4755008,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"epoch": 0.42090970807875083,
|
|
"grad_norm": 7.516690731048584,
|
|
"learning_rate": 1.435884418475371e-06,
|
|
"loss": 0.1285,
|
|
"num_input_tokens_seen": 4759424,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.42133401221995925,
|
|
"grad_norm": 11.492887496948242,
|
|
"learning_rate": 1.4345508706998994e-06,
|
|
"loss": 0.0389,
|
|
"num_input_tokens_seen": 4763968,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"epoch": 0.4217583163611677,
|
|
"grad_norm": 22.433889389038086,
|
|
"learning_rate": 1.433216369559978e-06,
|
|
"loss": 0.0866,
|
|
"num_input_tokens_seen": 4768960,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.4221826205023761,
|
|
"grad_norm": 3.125324010848999,
|
|
"learning_rate": 1.4318809179833791e-06,
|
|
"loss": 0.0609,
|
|
"num_input_tokens_seen": 4774592,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"epoch": 0.4226069246435845,
|
|
"grad_norm": 14.641558647155762,
|
|
"learning_rate": 1.4305445188999596e-06,
|
|
"loss": 0.1017,
|
|
"num_input_tokens_seen": 4778944,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.42303122878479293,
|
|
"grad_norm": 2.0290579795837402,
|
|
"learning_rate": 1.4292071752416558e-06,
|
|
"loss": 0.088,
|
|
"num_input_tokens_seen": 4783488,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"epoch": 0.42345553292600135,
|
|
"grad_norm": 2.063349962234497,
|
|
"learning_rate": 1.4278688899424764e-06,
|
|
"loss": 0.0369,
|
|
"num_input_tokens_seen": 4788288,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.4238798370672098,
|
|
"grad_norm": 0.79680997133255,
|
|
"learning_rate": 1.4265296659384953e-06,
|
|
"loss": 0.0197,
|
|
"num_input_tokens_seen": 4792896,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"epoch": 0.4243041412084182,
|
|
"grad_norm": 3.5232551097869873,
|
|
"learning_rate": 1.4251895061678463e-06,
|
|
"loss": 0.1148,
|
|
"num_input_tokens_seen": 4797056,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.4247284453496266,
|
|
"grad_norm": 1.2261766195297241,
|
|
"learning_rate": 1.4238484135707162e-06,
|
|
"loss": 0.0636,
|
|
"num_input_tokens_seen": 4802304,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"epoch": 0.42515274949083504,
|
|
"grad_norm": 7.105469226837158,
|
|
"learning_rate": 1.4225063910893384e-06,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 4807424,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.42557705363204346,
|
|
"grad_norm": 38.291893005371094,
|
|
"learning_rate": 1.4211634416679855e-06,
|
|
"loss": 0.0701,
|
|
"num_input_tokens_seen": 4812224,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"epoch": 0.4260013577732519,
|
|
"grad_norm": 33.8049201965332,
|
|
"learning_rate": 1.419819568252965e-06,
|
|
"loss": 0.1662,
|
|
"num_input_tokens_seen": 4817216,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.4264256619144603,
|
|
"grad_norm": 0.6801443696022034,
|
|
"learning_rate": 1.418474773792611e-06,
|
|
"loss": 0.0381,
|
|
"num_input_tokens_seen": 4822336,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"epoch": 0.4268499660556687,
|
|
"grad_norm": 44.115394592285156,
|
|
"learning_rate": 1.4171290612372779e-06,
|
|
"loss": 0.0319,
|
|
"num_input_tokens_seen": 4827328,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.42727427019687714,
|
|
"grad_norm": 11.847403526306152,
|
|
"learning_rate": 1.4157824335393349e-06,
|
|
"loss": 0.0712,
|
|
"num_input_tokens_seen": 4831808,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"epoch": 0.42769857433808556,
|
|
"grad_norm": 0.15182752907276154,
|
|
"learning_rate": 1.4144348936531588e-06,
|
|
"loss": 0.0301,
|
|
"num_input_tokens_seen": 4836096,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.428122878479294,
|
|
"grad_norm": 37.98849105834961,
|
|
"learning_rate": 1.413086444535127e-06,
|
|
"loss": 0.047,
|
|
"num_input_tokens_seen": 4840448,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"epoch": 0.4285471826205024,
|
|
"grad_norm": 0.8894705176353455,
|
|
"learning_rate": 1.4117370891436133e-06,
|
|
"loss": 0.0485,
|
|
"num_input_tokens_seen": 4845184,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.4289714867617108,
|
|
"grad_norm": 10.877485275268555,
|
|
"learning_rate": 1.410386830438978e-06,
|
|
"loss": 0.1229,
|
|
"num_input_tokens_seen": 4849920,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"epoch": 0.42939579090291924,
|
|
"grad_norm": 50.73531723022461,
|
|
"learning_rate": 1.4090356713835635e-06,
|
|
"loss": 0.101,
|
|
"num_input_tokens_seen": 4854400,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.4298200950441276,
|
|
"grad_norm": 6.57769775390625,
|
|
"learning_rate": 1.4076836149416886e-06,
|
|
"loss": 0.086,
|
|
"num_input_tokens_seen": 4859392,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"epoch": 0.430244399185336,
|
|
"grad_norm": 0.7123314738273621,
|
|
"learning_rate": 1.4063306640796404e-06,
|
|
"loss": 0.1207,
|
|
"num_input_tokens_seen": 4864512,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.43066870332654444,
|
|
"grad_norm": 16.502891540527344,
|
|
"learning_rate": 1.4049768217656674e-06,
|
|
"loss": 0.0951,
|
|
"num_input_tokens_seen": 4869888,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"epoch": 0.43109300746775286,
|
|
"grad_norm": 10.743193626403809,
|
|
"learning_rate": 1.4036220909699748e-06,
|
|
"loss": 0.0824,
|
|
"num_input_tokens_seen": 4874304,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.4315173116089613,
|
|
"grad_norm": 24.676218032836914,
|
|
"learning_rate": 1.4022664746647168e-06,
|
|
"loss": 0.1119,
|
|
"num_input_tokens_seen": 4879360,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"epoch": 0.4319416157501697,
|
|
"grad_norm": 8.201478004455566,
|
|
"learning_rate": 1.40090997582399e-06,
|
|
"loss": 0.0834,
|
|
"num_input_tokens_seen": 4883520,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.4323659198913781,
|
|
"grad_norm": 2.385830879211426,
|
|
"learning_rate": 1.3995525974238278e-06,
|
|
"loss": 0.0373,
|
|
"num_input_tokens_seen": 4888320,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"epoch": 0.43279022403258655,
|
|
"grad_norm": 5.756046772003174,
|
|
"learning_rate": 1.398194342442193e-06,
|
|
"loss": 0.0141,
|
|
"num_input_tokens_seen": 4893824,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.43321452817379497,
|
|
"grad_norm": 11.290924072265625,
|
|
"learning_rate": 1.396835213858971e-06,
|
|
"loss": 0.0637,
|
|
"num_input_tokens_seen": 4898432,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"epoch": 0.4336388323150034,
|
|
"grad_norm": 16.06569480895996,
|
|
"learning_rate": 1.395475214655965e-06,
|
|
"loss": 0.0878,
|
|
"num_input_tokens_seen": 4903040,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.4340631364562118,
|
|
"grad_norm": 18.569889068603516,
|
|
"learning_rate": 1.394114347816887e-06,
|
|
"loss": 0.0592,
|
|
"num_input_tokens_seen": 4908096,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"epoch": 0.4344874405974202,
|
|
"grad_norm": 1.079769492149353,
|
|
"learning_rate": 1.3927526163273538e-06,
|
|
"loss": 0.0772,
|
|
"num_input_tokens_seen": 4912640,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.43491174473862865,
|
|
"grad_norm": 22.124479293823242,
|
|
"learning_rate": 1.3913900231748776e-06,
|
|
"loss": 0.0423,
|
|
"num_input_tokens_seen": 4917504,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"epoch": 0.43533604887983707,
|
|
"grad_norm": 0.39653217792510986,
|
|
"learning_rate": 1.3900265713488623e-06,
|
|
"loss": 0.0191,
|
|
"num_input_tokens_seen": 4922688,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.4357603530210455,
|
|
"grad_norm": 2.302971363067627,
|
|
"learning_rate": 1.3886622638405952e-06,
|
|
"loss": 0.0524,
|
|
"num_input_tokens_seen": 4926976,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"epoch": 0.4361846571622539,
|
|
"grad_norm": 2.3500657081604004,
|
|
"learning_rate": 1.3872971036432406e-06,
|
|
"loss": 0.0568,
|
|
"num_input_tokens_seen": 4931456,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.43660896130346233,
|
|
"grad_norm": 38.778648376464844,
|
|
"learning_rate": 1.385931093751834e-06,
|
|
"loss": 0.0942,
|
|
"num_input_tokens_seen": 4936000,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"epoch": 0.43703326544467075,
|
|
"grad_norm": 21.554256439208984,
|
|
"learning_rate": 1.384564237163275e-06,
|
|
"loss": 0.0848,
|
|
"num_input_tokens_seen": 4940288,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.43745756958587917,
|
|
"grad_norm": 7.437337875366211,
|
|
"learning_rate": 1.3831965368763203e-06,
|
|
"loss": 0.0489,
|
|
"num_input_tokens_seen": 4944576,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"epoch": 0.4378818737270876,
|
|
"grad_norm": 6.118746757507324,
|
|
"learning_rate": 1.3818279958915785e-06,
|
|
"loss": 0.0689,
|
|
"num_input_tokens_seen": 4948992,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.438306177868296,
|
|
"grad_norm": 17.52583885192871,
|
|
"learning_rate": 1.3804586172115015e-06,
|
|
"loss": 0.063,
|
|
"num_input_tokens_seen": 4953728,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"epoch": 0.43873048200950443,
|
|
"grad_norm": 13.240470886230469,
|
|
"learning_rate": 1.3790884038403793e-06,
|
|
"loss": 0.069,
|
|
"num_input_tokens_seen": 4958720,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.43915478615071285,
|
|
"grad_norm": 26.129886627197266,
|
|
"learning_rate": 1.3777173587843341e-06,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 4963840,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"epoch": 0.43957909029192127,
|
|
"grad_norm": 4.156742572784424,
|
|
"learning_rate": 1.3763454850513122e-06,
|
|
"loss": 0.0059,
|
|
"num_input_tokens_seen": 4968512,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.4400033944331297,
|
|
"grad_norm": 16.35133934020996,
|
|
"learning_rate": 1.3749727856510766e-06,
|
|
"loss": 0.0783,
|
|
"num_input_tokens_seen": 4972928,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"epoch": 0.4404276985743381,
|
|
"grad_norm": 11.311882972717285,
|
|
"learning_rate": 1.373599263595204e-06,
|
|
"loss": 0.0996,
|
|
"num_input_tokens_seen": 4977664,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.4408520027155465,
|
|
"grad_norm": 1.374624490737915,
|
|
"learning_rate": 1.3722249218970744e-06,
|
|
"loss": 0.1072,
|
|
"num_input_tokens_seen": 4982912,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"epoch": 0.4412763068567549,
|
|
"grad_norm": 21.538982391357422,
|
|
"learning_rate": 1.3708497635718672e-06,
|
|
"loss": 0.047,
|
|
"num_input_tokens_seen": 4988416,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.4417006109979633,
|
|
"grad_norm": 11.306914329528809,
|
|
"learning_rate": 1.3694737916365515e-06,
|
|
"loss": 0.0895,
|
|
"num_input_tokens_seen": 4993472,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"epoch": 0.44212491513917174,
|
|
"grad_norm": 0.20420613884925842,
|
|
"learning_rate": 1.3680970091098832e-06,
|
|
"loss": 0.0403,
|
|
"num_input_tokens_seen": 4998208,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.44254921928038016,
|
|
"grad_norm": 14.116876602172852,
|
|
"learning_rate": 1.366719419012396e-06,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 5003008,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"epoch": 0.4429735234215886,
|
|
"grad_norm": 16.650503158569336,
|
|
"learning_rate": 1.3653410243663951e-06,
|
|
"loss": 0.0273,
|
|
"num_input_tokens_seen": 5008704,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.443397827562797,
|
|
"grad_norm": 31.029172897338867,
|
|
"learning_rate": 1.363961828195951e-06,
|
|
"loss": 0.1021,
|
|
"num_input_tokens_seen": 5013120,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"epoch": 0.4438221317040054,
|
|
"grad_norm": 0.18527749180793762,
|
|
"learning_rate": 1.3625818335268923e-06,
|
|
"loss": 0.0594,
|
|
"num_input_tokens_seen": 5017664,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.44424643584521384,
|
|
"grad_norm": 15.43123722076416,
|
|
"learning_rate": 1.3612010433868004e-06,
|
|
"loss": 0.0598,
|
|
"num_input_tokens_seen": 5022528,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"epoch": 0.44467073998642226,
|
|
"grad_norm": 4.617176532745361,
|
|
"learning_rate": 1.3598194608050008e-06,
|
|
"loss": 0.0941,
|
|
"num_input_tokens_seen": 5027072,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.4450950441276307,
|
|
"grad_norm": 0.34717246890068054,
|
|
"learning_rate": 1.3584370888125583e-06,
|
|
"loss": 0.0098,
|
|
"num_input_tokens_seen": 5031424,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"epoch": 0.4455193482688391,
|
|
"grad_norm": 25.9036865234375,
|
|
"learning_rate": 1.357053930442269e-06,
|
|
"loss": 0.0426,
|
|
"num_input_tokens_seen": 5036480,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.4459436524100475,
|
|
"grad_norm": 12.893452644348145,
|
|
"learning_rate": 1.355669988728655e-06,
|
|
"loss": 0.0651,
|
|
"num_input_tokens_seen": 5041792,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"epoch": 0.44636795655125594,
|
|
"grad_norm": 0.674833357334137,
|
|
"learning_rate": 1.3542852667079557e-06,
|
|
"loss": 0.0048,
|
|
"num_input_tokens_seen": 5046592,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.44679226069246436,
|
|
"grad_norm": 21.5000057220459,
|
|
"learning_rate": 1.352899767418124e-06,
|
|
"loss": 0.1007,
|
|
"num_input_tokens_seen": 5051456,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"epoch": 0.4472165648336728,
|
|
"grad_norm": 19.427629470825195,
|
|
"learning_rate": 1.3515134938988168e-06,
|
|
"loss": 0.0992,
|
|
"num_input_tokens_seen": 5056320,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.4476408689748812,
|
|
"grad_norm": 0.4970286190509796,
|
|
"learning_rate": 1.3501264491913906e-06,
|
|
"loss": 0.0152,
|
|
"num_input_tokens_seen": 5061248,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"epoch": 0.4480651731160896,
|
|
"grad_norm": 22.036640167236328,
|
|
"learning_rate": 1.348738636338893e-06,
|
|
"loss": 0.0759,
|
|
"num_input_tokens_seen": 5065664,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.44848947725729804,
|
|
"grad_norm": 13.6568021774292,
|
|
"learning_rate": 1.3473500583860568e-06,
|
|
"loss": 0.0608,
|
|
"num_input_tokens_seen": 5071552,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"epoch": 0.44891378139850646,
|
|
"grad_norm": 0.3351401388645172,
|
|
"learning_rate": 1.3459607183792945e-06,
|
|
"loss": 0.0623,
|
|
"num_input_tokens_seen": 5076032,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.4493380855397149,
|
|
"grad_norm": 13.974827766418457,
|
|
"learning_rate": 1.344570619366689e-06,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 5080384,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"epoch": 0.4497623896809233,
|
|
"grad_norm": 12.229580879211426,
|
|
"learning_rate": 1.3431797643979894e-06,
|
|
"loss": 0.0469,
|
|
"num_input_tokens_seen": 5085376,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.4501866938221317,
|
|
"grad_norm": 13.92734432220459,
|
|
"learning_rate": 1.3417881565246027e-06,
|
|
"loss": 0.0507,
|
|
"num_input_tokens_seen": 5090112,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"epoch": 0.45061099796334014,
|
|
"grad_norm": 35.575260162353516,
|
|
"learning_rate": 1.3403957987995882e-06,
|
|
"loss": 0.0674,
|
|
"num_input_tokens_seen": 5095424,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.45061099796334014,
|
|
"eval_loss": 0.06524531543254852,
|
|
"eval_runtime": 16.6853,
|
|
"eval_samples_per_second": 627.799,
|
|
"eval_steps_per_second": 78.512,
|
|
"num_input_tokens_seen": 5095424,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.45103530210454856,
|
|
"grad_norm": 2.302506446838379,
|
|
"learning_rate": 1.33900269427765e-06,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 5100864,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"epoch": 0.451459606245757,
|
|
"grad_norm": 12.176279067993164,
|
|
"learning_rate": 1.3376088460151306e-06,
|
|
"loss": 0.0871,
|
|
"num_input_tokens_seen": 5105088,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.4518839103869654,
|
|
"grad_norm": 17.406301498413086,
|
|
"learning_rate": 1.336214257070004e-06,
|
|
"loss": 0.0402,
|
|
"num_input_tokens_seen": 5109760,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"epoch": 0.45230821452817377,
|
|
"grad_norm": 17.82622718811035,
|
|
"learning_rate": 1.3348189305018702e-06,
|
|
"loss": 0.1007,
|
|
"num_input_tokens_seen": 5114176,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.4527325186693822,
|
|
"grad_norm": 13.202062606811523,
|
|
"learning_rate": 1.3334228693719464e-06,
|
|
"loss": 0.0155,
|
|
"num_input_tokens_seen": 5118592,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"epoch": 0.4531568228105906,
|
|
"grad_norm": 0.5704994201660156,
|
|
"learning_rate": 1.3320260767430614e-06,
|
|
"loss": 0.107,
|
|
"num_input_tokens_seen": 5123584,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.45358112695179903,
|
|
"grad_norm": 12.38845157623291,
|
|
"learning_rate": 1.3306285556796492e-06,
|
|
"loss": 0.038,
|
|
"num_input_tokens_seen": 5128192,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"epoch": 0.45400543109300745,
|
|
"grad_norm": 1.7336963415145874,
|
|
"learning_rate": 1.3292303092477424e-06,
|
|
"loss": 0.0968,
|
|
"num_input_tokens_seen": 5132864,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.45442973523421587,
|
|
"grad_norm": 1.0668820142745972,
|
|
"learning_rate": 1.3278313405149638e-06,
|
|
"loss": 0.0392,
|
|
"num_input_tokens_seen": 5137216,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"epoch": 0.4548540393754243,
|
|
"grad_norm": 7.79935884475708,
|
|
"learning_rate": 1.3264316525505216e-06,
|
|
"loss": 0.0651,
|
|
"num_input_tokens_seen": 5142528,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.4552783435166327,
|
|
"grad_norm": 14.068997383117676,
|
|
"learning_rate": 1.3250312484252021e-06,
|
|
"loss": 0.0093,
|
|
"num_input_tokens_seen": 5147968,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"epoch": 0.45570264765784113,
|
|
"grad_norm": 0.11554321646690369,
|
|
"learning_rate": 1.3236301312113627e-06,
|
|
"loss": 0.0303,
|
|
"num_input_tokens_seen": 5152384,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.45612695179904955,
|
|
"grad_norm": 32.751644134521484,
|
|
"learning_rate": 1.3222283039829247e-06,
|
|
"loss": 0.1354,
|
|
"num_input_tokens_seen": 5156992,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"epoch": 0.45655125594025797,
|
|
"grad_norm": 0.36955803632736206,
|
|
"learning_rate": 1.3208257698153676e-06,
|
|
"loss": 0.1,
|
|
"num_input_tokens_seen": 5161984,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.4569755600814664,
|
|
"grad_norm": 21.537946701049805,
|
|
"learning_rate": 1.3194225317857216e-06,
|
|
"loss": 0.0496,
|
|
"num_input_tokens_seen": 5167040,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"epoch": 0.4573998642226748,
|
|
"grad_norm": 22.948135375976562,
|
|
"learning_rate": 1.3180185929725616e-06,
|
|
"loss": 0.0496,
|
|
"num_input_tokens_seen": 5171776,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.45782416836388323,
|
|
"grad_norm": 20.067203521728516,
|
|
"learning_rate": 1.3166139564559992e-06,
|
|
"loss": 0.1042,
|
|
"num_input_tokens_seen": 5176896,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"epoch": 0.45824847250509165,
|
|
"grad_norm": 24.42470932006836,
|
|
"learning_rate": 1.3152086253176773e-06,
|
|
"loss": 0.0425,
|
|
"num_input_tokens_seen": 5181312,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.4586727766463001,
|
|
"grad_norm": 0.8795569539070129,
|
|
"learning_rate": 1.313802602640763e-06,
|
|
"loss": 0.005,
|
|
"num_input_tokens_seen": 5186112,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"epoch": 0.4590970807875085,
|
|
"grad_norm": 0.6361150145530701,
|
|
"learning_rate": 1.3123958915099392e-06,
|
|
"loss": 0.0412,
|
|
"num_input_tokens_seen": 5191040,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.4595213849287169,
|
|
"grad_norm": 0.39102301001548767,
|
|
"learning_rate": 1.3109884950114005e-06,
|
|
"loss": 0.051,
|
|
"num_input_tokens_seen": 5195584,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"epoch": 0.45994568906992533,
|
|
"grad_norm": 10.772954940795898,
|
|
"learning_rate": 1.309580416232845e-06,
|
|
"loss": 0.0764,
|
|
"num_input_tokens_seen": 5200256,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.46036999321113375,
|
|
"grad_norm": 34.4044303894043,
|
|
"learning_rate": 1.3081716582634672e-06,
|
|
"loss": 0.0486,
|
|
"num_input_tokens_seen": 5205376,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"epoch": 0.4607942973523422,
|
|
"grad_norm": 13.2206392288208,
|
|
"learning_rate": 1.3067622241939518e-06,
|
|
"loss": 0.0323,
|
|
"num_input_tokens_seen": 5211392,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.4612186014935506,
|
|
"grad_norm": 3.4941582679748535,
|
|
"learning_rate": 1.305352117116467e-06,
|
|
"loss": 0.0334,
|
|
"num_input_tokens_seen": 5215616,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"epoch": 0.461642905634759,
|
|
"grad_norm": 10.900171279907227,
|
|
"learning_rate": 1.3039413401246576e-06,
|
|
"loss": 0.0965,
|
|
"num_input_tokens_seen": 5220608,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.46206720977596744,
|
|
"grad_norm": 7.250054359436035,
|
|
"learning_rate": 1.3025298963136377e-06,
|
|
"loss": 0.1161,
|
|
"num_input_tokens_seen": 5225344,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"epoch": 0.46249151391717586,
|
|
"grad_norm": 25.070987701416016,
|
|
"learning_rate": 1.3011177887799844e-06,
|
|
"loss": 0.0413,
|
|
"num_input_tokens_seen": 5230464,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.4629158180583843,
|
|
"grad_norm": 26.161561965942383,
|
|
"learning_rate": 1.2997050206217315e-06,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 5235264,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"epoch": 0.46334012219959264,
|
|
"grad_norm": 16.561805725097656,
|
|
"learning_rate": 1.2982915949383614e-06,
|
|
"loss": 0.0957,
|
|
"num_input_tokens_seen": 5239808,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.46376442634080106,
|
|
"grad_norm": 16.152515411376953,
|
|
"learning_rate": 1.2968775148308002e-06,
|
|
"loss": 0.0722,
|
|
"num_input_tokens_seen": 5244416,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"epoch": 0.4641887304820095,
|
|
"grad_norm": 16.322086334228516,
|
|
"learning_rate": 1.295462783401408e-06,
|
|
"loss": 0.1038,
|
|
"num_input_tokens_seen": 5249280,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.4646130346232179,
|
|
"grad_norm": 13.543170928955078,
|
|
"learning_rate": 1.2940474037539755e-06,
|
|
"loss": 0.1078,
|
|
"num_input_tokens_seen": 5254080,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"epoch": 0.4650373387644263,
|
|
"grad_norm": 0.8383488655090332,
|
|
"learning_rate": 1.2926313789937143e-06,
|
|
"loss": 0.0207,
|
|
"num_input_tokens_seen": 5259136,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.46546164290563474,
|
|
"grad_norm": 22.7758731842041,
|
|
"learning_rate": 1.2912147122272522e-06,
|
|
"loss": 0.057,
|
|
"num_input_tokens_seen": 5263744,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"epoch": 0.46588594704684316,
|
|
"grad_norm": 0.8880257606506348,
|
|
"learning_rate": 1.289797406562625e-06,
|
|
"loss": 0.0488,
|
|
"num_input_tokens_seen": 5268544,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.4663102511880516,
|
|
"grad_norm": 2.692805290222168,
|
|
"learning_rate": 1.2883794651092704e-06,
|
|
"loss": 0.0245,
|
|
"num_input_tokens_seen": 5273280,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"epoch": 0.46673455532926,
|
|
"grad_norm": 18.147809982299805,
|
|
"learning_rate": 1.2869608909780212e-06,
|
|
"loss": 0.0231,
|
|
"num_input_tokens_seen": 5277888,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.4671588594704684,
|
|
"grad_norm": 15.168424606323242,
|
|
"learning_rate": 1.2855416872810973e-06,
|
|
"loss": 0.0624,
|
|
"num_input_tokens_seen": 5282432,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"epoch": 0.46758316361167684,
|
|
"grad_norm": 9.290873527526855,
|
|
"learning_rate": 1.284121857132101e-06,
|
|
"loss": 0.0089,
|
|
"num_input_tokens_seen": 5288512,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.46800746775288526,
|
|
"grad_norm": 10.560848236083984,
|
|
"learning_rate": 1.2827014036460082e-06,
|
|
"loss": 0.0197,
|
|
"num_input_tokens_seen": 5292800,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"epoch": 0.4684317718940937,
|
|
"grad_norm": 1.7550711631774902,
|
|
"learning_rate": 1.2812803299391628e-06,
|
|
"loss": 0.0232,
|
|
"num_input_tokens_seen": 5297856,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.4688560760353021,
|
|
"grad_norm": 7.6489410400390625,
|
|
"learning_rate": 1.2798586391292689e-06,
|
|
"loss": 0.104,
|
|
"num_input_tokens_seen": 5302784,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"epoch": 0.4692803801765105,
|
|
"grad_norm": 37.18665313720703,
|
|
"learning_rate": 1.2784363343353848e-06,
|
|
"loss": 0.0538,
|
|
"num_input_tokens_seen": 5307648,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.46970468431771895,
|
|
"grad_norm": 7.301642417907715,
|
|
"learning_rate": 1.2770134186779158e-06,
|
|
"loss": 0.0554,
|
|
"num_input_tokens_seen": 5311680,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"epoch": 0.47012898845892737,
|
|
"grad_norm": 8.521341323852539,
|
|
"learning_rate": 1.2755898952786076e-06,
|
|
"loss": 0.053,
|
|
"num_input_tokens_seen": 5316288,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.4705532926001358,
|
|
"grad_norm": 5.190464496612549,
|
|
"learning_rate": 1.2741657672605385e-06,
|
|
"loss": 0.0156,
|
|
"num_input_tokens_seen": 5320448,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"epoch": 0.4709775967413442,
|
|
"grad_norm": 0.09276895970106125,
|
|
"learning_rate": 1.272741037748114e-06,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 5324928,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.4714019008825526,
|
|
"grad_norm": 15.722267150878906,
|
|
"learning_rate": 1.2713157098670588e-06,
|
|
"loss": 0.0487,
|
|
"num_input_tokens_seen": 5329792,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"epoch": 0.47182620502376105,
|
|
"grad_norm": 15.788847923278809,
|
|
"learning_rate": 1.2698897867444112e-06,
|
|
"loss": 0.0939,
|
|
"num_input_tokens_seen": 5334720,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.47225050916496947,
|
|
"grad_norm": 8.967011451721191,
|
|
"learning_rate": 1.268463271508514e-06,
|
|
"loss": 0.0931,
|
|
"num_input_tokens_seen": 5339968,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"epoch": 0.4726748133061779,
|
|
"grad_norm": 0.2631434500217438,
|
|
"learning_rate": 1.2670361672890099e-06,
|
|
"loss": 0.0061,
|
|
"num_input_tokens_seen": 5345216,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.4730991174473863,
|
|
"grad_norm": 30.442209243774414,
|
|
"learning_rate": 1.265608477216834e-06,
|
|
"loss": 0.0806,
|
|
"num_input_tokens_seen": 5350400,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"epoch": 0.47352342158859473,
|
|
"grad_norm": 5.9296417236328125,
|
|
"learning_rate": 1.2641802044242065e-06,
|
|
"loss": 0.114,
|
|
"num_input_tokens_seen": 5354944,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.47394772572980315,
|
|
"grad_norm": 0.6069964170455933,
|
|
"learning_rate": 1.2627513520446252e-06,
|
|
"loss": 0.1269,
|
|
"num_input_tokens_seen": 5359040,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"epoch": 0.4743720298710115,
|
|
"grad_norm": 1.3093260526657104,
|
|
"learning_rate": 1.2613219232128608e-06,
|
|
"loss": 0.1031,
|
|
"num_input_tokens_seen": 5363584,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.47479633401221993,
|
|
"grad_norm": 12.471885681152344,
|
|
"learning_rate": 1.2598919210649475e-06,
|
|
"loss": 0.0159,
|
|
"num_input_tokens_seen": 5368256,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"epoch": 0.47522063815342835,
|
|
"grad_norm": 2.1485166549682617,
|
|
"learning_rate": 1.2584613487381787e-06,
|
|
"loss": 0.058,
|
|
"num_input_tokens_seen": 5372800,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.4756449422946368,
|
|
"grad_norm": 16.98457145690918,
|
|
"learning_rate": 1.257030209371097e-06,
|
|
"loss": 0.0218,
|
|
"num_input_tokens_seen": 5377280,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"epoch": 0.4760692464358452,
|
|
"grad_norm": 6.924666404724121,
|
|
"learning_rate": 1.2555985061034902e-06,
|
|
"loss": 0.1367,
|
|
"num_input_tokens_seen": 5382208,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.4764935505770536,
|
|
"grad_norm": 13.678959846496582,
|
|
"learning_rate": 1.2541662420763832e-06,
|
|
"loss": 0.0931,
|
|
"num_input_tokens_seen": 5386816,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"epoch": 0.47691785471826204,
|
|
"grad_norm": 4.235424041748047,
|
|
"learning_rate": 1.2527334204320306e-06,
|
|
"loss": 0.0499,
|
|
"num_input_tokens_seen": 5391360,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.47734215885947046,
|
|
"grad_norm": 15.993348121643066,
|
|
"learning_rate": 1.251300044313911e-06,
|
|
"loss": 0.0622,
|
|
"num_input_tokens_seen": 5395904,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"epoch": 0.4777664630006789,
|
|
"grad_norm": 0.5639759302139282,
|
|
"learning_rate": 1.2498661168667188e-06,
|
|
"loss": 0.064,
|
|
"num_input_tokens_seen": 5400448,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.4781907671418873,
|
|
"grad_norm": 13.044215202331543,
|
|
"learning_rate": 1.2484316412363585e-06,
|
|
"loss": 0.0427,
|
|
"num_input_tokens_seen": 5405824,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"epoch": 0.4786150712830957,
|
|
"grad_norm": 0.6441131830215454,
|
|
"learning_rate": 1.246996620569937e-06,
|
|
"loss": 0.0631,
|
|
"num_input_tokens_seen": 5410688,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.47903937542430414,
|
|
"grad_norm": 6.8543171882629395,
|
|
"learning_rate": 1.245561058015757e-06,
|
|
"loss": 0.0676,
|
|
"num_input_tokens_seen": 5415296,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"epoch": 0.47946367956551256,
|
|
"grad_norm": 8.087605476379395,
|
|
"learning_rate": 1.2441249567233098e-06,
|
|
"loss": 0.1068,
|
|
"num_input_tokens_seen": 5419648,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.479887983706721,
|
|
"grad_norm": 1.8520663976669312,
|
|
"learning_rate": 1.2426883198432696e-06,
|
|
"loss": 0.0388,
|
|
"num_input_tokens_seen": 5424576,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"epoch": 0.4803122878479294,
|
|
"grad_norm": 3.021974802017212,
|
|
"learning_rate": 1.2412511505274844e-06,
|
|
"loss": 0.0356,
|
|
"num_input_tokens_seen": 5429184,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.4807365919891378,
|
|
"grad_norm": 0.46111950278282166,
|
|
"learning_rate": 1.2398134519289708e-06,
|
|
"loss": 0.0564,
|
|
"num_input_tokens_seen": 5433536,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"epoch": 0.48116089613034624,
|
|
"grad_norm": 19.531221389770508,
|
|
"learning_rate": 1.2383752272019071e-06,
|
|
"loss": 0.1217,
|
|
"num_input_tokens_seen": 5438464,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.48158520027155466,
|
|
"grad_norm": 11.817866325378418,
|
|
"learning_rate": 1.2369364795016252e-06,
|
|
"loss": 0.0716,
|
|
"num_input_tokens_seen": 5443136,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"epoch": 0.4820095044127631,
|
|
"grad_norm": 0.5358498692512512,
|
|
"learning_rate": 1.2354972119846045e-06,
|
|
"loss": 0.0289,
|
|
"num_input_tokens_seen": 5447744,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.4824338085539715,
|
|
"grad_norm": 2.3041980266571045,
|
|
"learning_rate": 1.2340574278084648e-06,
|
|
"loss": 0.1111,
|
|
"num_input_tokens_seen": 5452800,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"epoch": 0.4828581126951799,
|
|
"grad_norm": 10.597290992736816,
|
|
"learning_rate": 1.23261713013196e-06,
|
|
"loss": 0.0655,
|
|
"num_input_tokens_seen": 5457472,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.48328241683638834,
|
|
"grad_norm": 10.942108154296875,
|
|
"learning_rate": 1.2311763221149697e-06,
|
|
"loss": 0.0745,
|
|
"num_input_tokens_seen": 5462272,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"epoch": 0.48370672097759676,
|
|
"grad_norm": 15.187726974487305,
|
|
"learning_rate": 1.2297350069184935e-06,
|
|
"loss": 0.019,
|
|
"num_input_tokens_seen": 5466880,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.4841310251188052,
|
|
"grad_norm": 7.880878448486328,
|
|
"learning_rate": 1.228293187704644e-06,
|
|
"loss": 0.0438,
|
|
"num_input_tokens_seen": 5471616,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"epoch": 0.4845553292600136,
|
|
"grad_norm": 16.48000144958496,
|
|
"learning_rate": 1.2268508676366393e-06,
|
|
"loss": 0.072,
|
|
"num_input_tokens_seen": 5476160,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.484979633401222,
|
|
"grad_norm": 19.825992584228516,
|
|
"learning_rate": 1.225408049878796e-06,
|
|
"loss": 0.1309,
|
|
"num_input_tokens_seen": 5480960,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"epoch": 0.48540393754243044,
|
|
"grad_norm": 14.43404769897461,
|
|
"learning_rate": 1.223964737596523e-06,
|
|
"loss": 0.0657,
|
|
"num_input_tokens_seen": 5486528,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.4858282416836388,
|
|
"grad_norm": 17.72797203063965,
|
|
"learning_rate": 1.2225209339563143e-06,
|
|
"loss": 0.0385,
|
|
"num_input_tokens_seen": 5491456,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"epoch": 0.4862525458248472,
|
|
"grad_norm": 1.156896710395813,
|
|
"learning_rate": 1.2210766421257419e-06,
|
|
"loss": 0.0591,
|
|
"num_input_tokens_seen": 5496640,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.48667684996605565,
|
|
"grad_norm": 0.5983453392982483,
|
|
"learning_rate": 1.2196318652734477e-06,
|
|
"loss": 0.0327,
|
|
"num_input_tokens_seen": 5501376,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"epoch": 0.48710115410726407,
|
|
"grad_norm": 17.34876823425293,
|
|
"learning_rate": 1.2181866065691392e-06,
|
|
"loss": 0.0476,
|
|
"num_input_tokens_seen": 5505856,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.4875254582484725,
|
|
"grad_norm": 18.48384666442871,
|
|
"learning_rate": 1.2167408691835807e-06,
|
|
"loss": 0.0595,
|
|
"num_input_tokens_seen": 5510720,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"epoch": 0.4879497623896809,
|
|
"grad_norm": 7.38417911529541,
|
|
"learning_rate": 1.2152946562885857e-06,
|
|
"loss": 0.052,
|
|
"num_input_tokens_seen": 5514880,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.48837406653088933,
|
|
"grad_norm": 0.5416074991226196,
|
|
"learning_rate": 1.2138479710570123e-06,
|
|
"loss": 0.0505,
|
|
"num_input_tokens_seen": 5519616,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"epoch": 0.48879837067209775,
|
|
"grad_norm": 21.0726318359375,
|
|
"learning_rate": 1.2124008166627535e-06,
|
|
"loss": 0.0764,
|
|
"num_input_tokens_seen": 5523968,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.48922267481330617,
|
|
"grad_norm": 0.5404296517372131,
|
|
"learning_rate": 1.2109531962807332e-06,
|
|
"loss": 0.0676,
|
|
"num_input_tokens_seen": 5528960,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"epoch": 0.4896469789545146,
|
|
"grad_norm": 0.10004238039255142,
|
|
"learning_rate": 1.2095051130868959e-06,
|
|
"loss": 0.0836,
|
|
"num_input_tokens_seen": 5535488,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.490071283095723,
|
|
"grad_norm": 2.8323614597320557,
|
|
"learning_rate": 1.2080565702582027e-06,
|
|
"loss": 0.0975,
|
|
"num_input_tokens_seen": 5540288,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"epoch": 0.49049558723693143,
|
|
"grad_norm": 28.27829933166504,
|
|
"learning_rate": 1.2066075709726225e-06,
|
|
"loss": 0.0487,
|
|
"num_input_tokens_seen": 5545792,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.49091989137813985,
|
|
"grad_norm": 18.033672332763672,
|
|
"learning_rate": 1.2051581184091263e-06,
|
|
"loss": 0.0773,
|
|
"num_input_tokens_seen": 5550336,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"epoch": 0.49134419551934827,
|
|
"grad_norm": 10.371143341064453,
|
|
"learning_rate": 1.2037082157476782e-06,
|
|
"loss": 0.0662,
|
|
"num_input_tokens_seen": 5555328,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.4917684996605567,
|
|
"grad_norm": 12.590456008911133,
|
|
"learning_rate": 1.2022578661692312e-06,
|
|
"loss": 0.0464,
|
|
"num_input_tokens_seen": 5560896,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"epoch": 0.4921928038017651,
|
|
"grad_norm": 1.6708661317825317,
|
|
"learning_rate": 1.2008070728557185e-06,
|
|
"loss": 0.0572,
|
|
"num_input_tokens_seen": 5565824,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.49261710794297353,
|
|
"grad_norm": 1.8818459510803223,
|
|
"learning_rate": 1.1993558389900462e-06,
|
|
"loss": 0.0479,
|
|
"num_input_tokens_seen": 5570368,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"epoch": 0.49304141208418195,
|
|
"grad_norm": 15.59914493560791,
|
|
"learning_rate": 1.197904167756087e-06,
|
|
"loss": 0.0743,
|
|
"num_input_tokens_seen": 5574848,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.49346571622539037,
|
|
"grad_norm": 3.4572269916534424,
|
|
"learning_rate": 1.1964520623386741e-06,
|
|
"loss": 0.0797,
|
|
"num_input_tokens_seen": 5579456,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"epoch": 0.4938900203665988,
|
|
"grad_norm": 3.4140560626983643,
|
|
"learning_rate": 1.1949995259235919e-06,
|
|
"loss": 0.0476,
|
|
"num_input_tokens_seen": 5584384,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.4943143245078072,
|
|
"grad_norm": 26.14369010925293,
|
|
"learning_rate": 1.1935465616975716e-06,
|
|
"loss": 0.1255,
|
|
"num_input_tokens_seen": 5589632,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"epoch": 0.49473862864901563,
|
|
"grad_norm": 31.085243225097656,
|
|
"learning_rate": 1.192093172848282e-06,
|
|
"loss": 0.0316,
|
|
"num_input_tokens_seen": 5594048,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.49516293279022405,
|
|
"grad_norm": 11.811629295349121,
|
|
"learning_rate": 1.1906393625643242e-06,
|
|
"loss": 0.0546,
|
|
"num_input_tokens_seen": 5598720,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"epoch": 0.4955872369314325,
|
|
"grad_norm": 15.55712890625,
|
|
"learning_rate": 1.1891851340352235e-06,
|
|
"loss": 0.1087,
|
|
"num_input_tokens_seen": 5603136,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.4960115410726409,
|
|
"grad_norm": 0.4599844813346863,
|
|
"learning_rate": 1.1877304904514232e-06,
|
|
"loss": 0.0484,
|
|
"num_input_tokens_seen": 5607872,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"epoch": 0.4964358452138493,
|
|
"grad_norm": 7.1125617027282715,
|
|
"learning_rate": 1.1862754350042764e-06,
|
|
"loss": 0.0441,
|
|
"num_input_tokens_seen": 5612352,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.4968601493550577,
|
|
"grad_norm": 0.2599692642688751,
|
|
"learning_rate": 1.1848199708860404e-06,
|
|
"loss": 0.0593,
|
|
"num_input_tokens_seen": 5617472,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"epoch": 0.4972844534962661,
|
|
"grad_norm": 9.090217590332031,
|
|
"learning_rate": 1.183364101289869e-06,
|
|
"loss": 0.1157,
|
|
"num_input_tokens_seen": 5621824,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.4977087576374745,
|
|
"grad_norm": 0.8271796703338623,
|
|
"learning_rate": 1.1819078294098057e-06,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 5626304,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"epoch": 0.49813306177868294,
|
|
"grad_norm": 24.290382385253906,
|
|
"learning_rate": 1.180451158440776e-06,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 5631680,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.49855736591989136,
|
|
"grad_norm": 7.5667290687561035,
|
|
"learning_rate": 1.1789940915785823e-06,
|
|
"loss": 0.109,
|
|
"num_input_tokens_seen": 5635904,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"epoch": 0.4989816700610998,
|
|
"grad_norm": 0.7740042805671692,
|
|
"learning_rate": 1.177536632019894e-06,
|
|
"loss": 0.0431,
|
|
"num_input_tokens_seen": 5640512,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.4994059742023082,
|
|
"grad_norm": 6.739123821258545,
|
|
"learning_rate": 1.1760787829622423e-06,
|
|
"loss": 0.0815,
|
|
"num_input_tokens_seen": 5646464,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"epoch": 0.4998302783435166,
|
|
"grad_norm": 0.21364034712314606,
|
|
"learning_rate": 1.1746205476040137e-06,
|
|
"loss": 0.0265,
|
|
"num_input_tokens_seen": 5651008,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.5002545824847251,
|
|
"grad_norm": 19.09162139892578,
|
|
"learning_rate": 1.173161929144442e-06,
|
|
"loss": 0.0663,
|
|
"num_input_tokens_seen": 5655616,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"epoch": 0.5006788866259335,
|
|
"grad_norm": 0.20594583451747894,
|
|
"learning_rate": 1.171702930783601e-06,
|
|
"loss": 0.0262,
|
|
"num_input_tokens_seen": 5660352,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.5006788866259335,
|
|
"eval_loss": 0.05282815173268318,
|
|
"eval_runtime": 16.7878,
|
|
"eval_samples_per_second": 623.966,
|
|
"eval_steps_per_second": 78.033,
|
|
"num_input_tokens_seen": 5660352,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.5011031907671419,
|
|
"grad_norm": 3.6819992065429688,
|
|
"learning_rate": 1.1702435557223986e-06,
|
|
"loss": 0.0716,
|
|
"num_input_tokens_seen": 5664832,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"epoch": 0.5015274949083504,
|
|
"grad_norm": 5.246142387390137,
|
|
"learning_rate": 1.1687838071625684e-06,
|
|
"loss": 0.0852,
|
|
"num_input_tokens_seen": 5669824,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.5019517990495588,
|
|
"grad_norm": 0.4738549292087555,
|
|
"learning_rate": 1.167323688306664e-06,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 5674240,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"epoch": 0.5023761031907671,
|
|
"grad_norm": 22.23293113708496,
|
|
"learning_rate": 1.1658632023580515e-06,
|
|
"loss": 0.0717,
|
|
"num_input_tokens_seen": 5679296,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.5028004073319755,
|
|
"grad_norm": 19.83220672607422,
|
|
"learning_rate": 1.1644023525209014e-06,
|
|
"loss": 0.0458,
|
|
"num_input_tokens_seen": 5683840,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"epoch": 0.5032247114731839,
|
|
"grad_norm": 12.404871940612793,
|
|
"learning_rate": 1.162941142000184e-06,
|
|
"loss": 0.0744,
|
|
"num_input_tokens_seen": 5688896,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.5036490156143923,
|
|
"grad_norm": 5.345841884613037,
|
|
"learning_rate": 1.1614795740016598e-06,
|
|
"loss": 0.0472,
|
|
"num_input_tokens_seen": 5693440,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"epoch": 0.5040733197556008,
|
|
"grad_norm": 0.944007396697998,
|
|
"learning_rate": 1.160017651731874e-06,
|
|
"loss": 0.0627,
|
|
"num_input_tokens_seen": 5697920,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.5044976238968092,
|
|
"grad_norm": 0.2512733042240143,
|
|
"learning_rate": 1.1585553783981486e-06,
|
|
"loss": 0.0626,
|
|
"num_input_tokens_seen": 5702528,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"epoch": 0.5049219280380176,
|
|
"grad_norm": 19.54583740234375,
|
|
"learning_rate": 1.1570927572085766e-06,
|
|
"loss": 0.0769,
|
|
"num_input_tokens_seen": 5707584,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.505346232179226,
|
|
"grad_norm": 12.366445541381836,
|
|
"learning_rate": 1.1556297913720137e-06,
|
|
"loss": 0.0429,
|
|
"num_input_tokens_seen": 5712192,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"epoch": 0.5057705363204344,
|
|
"grad_norm": 21.00571632385254,
|
|
"learning_rate": 1.1541664840980715e-06,
|
|
"loss": 0.1499,
|
|
"num_input_tokens_seen": 5717632,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.5061948404616429,
|
|
"grad_norm": 8.767720222473145,
|
|
"learning_rate": 1.1527028385971107e-06,
|
|
"loss": 0.0824,
|
|
"num_input_tokens_seen": 5722176,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"epoch": 0.5066191446028513,
|
|
"grad_norm": 15.041585922241211,
|
|
"learning_rate": 1.1512388580802348e-06,
|
|
"loss": 0.0546,
|
|
"num_input_tokens_seen": 5726720,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.5070434487440597,
|
|
"grad_norm": 16.397735595703125,
|
|
"learning_rate": 1.1497745457592815e-06,
|
|
"loss": 0.0791,
|
|
"num_input_tokens_seen": 5731328,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"epoch": 0.5074677528852681,
|
|
"grad_norm": 4.624978065490723,
|
|
"learning_rate": 1.1483099048468168e-06,
|
|
"loss": 0.0565,
|
|
"num_input_tokens_seen": 5736256,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.5078920570264766,
|
|
"grad_norm": 9.767938613891602,
|
|
"learning_rate": 1.1468449385561272e-06,
|
|
"loss": 0.0667,
|
|
"num_input_tokens_seen": 5741248,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"epoch": 0.508316361167685,
|
|
"grad_norm": 5.947503089904785,
|
|
"learning_rate": 1.145379650101214e-06,
|
|
"loss": 0.0406,
|
|
"num_input_tokens_seen": 5746304,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.5087406653088934,
|
|
"grad_norm": 1.009181022644043,
|
|
"learning_rate": 1.143914042696784e-06,
|
|
"loss": 0.102,
|
|
"num_input_tokens_seen": 5751552,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"epoch": 0.5091649694501018,
|
|
"grad_norm": 6.548674583435059,
|
|
"learning_rate": 1.1424481195582445e-06,
|
|
"loss": 0.056,
|
|
"num_input_tokens_seen": 5756032,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5095892735913102,
|
|
"grad_norm": 17.75074005126953,
|
|
"learning_rate": 1.1409818839016958e-06,
|
|
"loss": 0.0839,
|
|
"num_input_tokens_seen": 5761600,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"epoch": 0.5100135777325187,
|
|
"grad_norm": 2.6038055419921875,
|
|
"learning_rate": 1.1395153389439231e-06,
|
|
"loss": 0.0264,
|
|
"num_input_tokens_seen": 5766336,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 0.5104378818737271,
|
|
"grad_norm": 14.719752311706543,
|
|
"learning_rate": 1.1380484879023903e-06,
|
|
"loss": 0.0816,
|
|
"num_input_tokens_seen": 5771392,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"epoch": 0.5108621860149355,
|
|
"grad_norm": 20.255233764648438,
|
|
"learning_rate": 1.1365813339952334e-06,
|
|
"loss": 0.0274,
|
|
"num_input_tokens_seen": 5775808,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 0.5112864901561439,
|
|
"grad_norm": 3.6059844493865967,
|
|
"learning_rate": 1.1351138804412524e-06,
|
|
"loss": 0.0714,
|
|
"num_input_tokens_seen": 5780800,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"epoch": 0.5117107942973523,
|
|
"grad_norm": 18.04718017578125,
|
|
"learning_rate": 1.1336461304599047e-06,
|
|
"loss": 0.0186,
|
|
"num_input_tokens_seen": 5786304,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 0.5121350984385608,
|
|
"grad_norm": 0.28144174814224243,
|
|
"learning_rate": 1.1321780872712983e-06,
|
|
"loss": 0.0454,
|
|
"num_input_tokens_seen": 5791360,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"epoch": 0.5125594025797692,
|
|
"grad_norm": 1.852778434753418,
|
|
"learning_rate": 1.1307097540961838e-06,
|
|
"loss": 0.0297,
|
|
"num_input_tokens_seen": 5795840,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 0.5129837067209776,
|
|
"grad_norm": 5.90084171295166,
|
|
"learning_rate": 1.129241134155949e-06,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 5800576,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"epoch": 0.513408010862186,
|
|
"grad_norm": 0.2609613537788391,
|
|
"learning_rate": 1.1277722306726103e-06,
|
|
"loss": 0.0494,
|
|
"num_input_tokens_seen": 5805632,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 0.5138323150033944,
|
|
"grad_norm": 0.17078615725040436,
|
|
"learning_rate": 1.1263030468688057e-06,
|
|
"loss": 0.028,
|
|
"num_input_tokens_seen": 5810688,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"epoch": 0.5142566191446029,
|
|
"grad_norm": 12.094003677368164,
|
|
"learning_rate": 1.1248335859677891e-06,
|
|
"loss": 0.0581,
|
|
"num_input_tokens_seen": 5815616,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 0.5146809232858113,
|
|
"grad_norm": 16.548694610595703,
|
|
"learning_rate": 1.1233638511934218e-06,
|
|
"loss": 0.0844,
|
|
"num_input_tokens_seen": 5820672,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"epoch": 0.5151052274270197,
|
|
"grad_norm": 9.757255554199219,
|
|
"learning_rate": 1.121893845770166e-06,
|
|
"loss": 0.0661,
|
|
"num_input_tokens_seen": 5824896,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 0.5155295315682281,
|
|
"grad_norm": 2.3751487731933594,
|
|
"learning_rate": 1.120423572923078e-06,
|
|
"loss": 0.0485,
|
|
"num_input_tokens_seen": 5829632,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"epoch": 0.5159538357094365,
|
|
"grad_norm": 12.690343856811523,
|
|
"learning_rate": 1.1189530358778004e-06,
|
|
"loss": 0.0311,
|
|
"num_input_tokens_seen": 5834240,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 0.516378139850645,
|
|
"grad_norm": 1.417817234992981,
|
|
"learning_rate": 1.1174822378605551e-06,
|
|
"loss": 0.0884,
|
|
"num_input_tokens_seen": 5838784,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"epoch": 0.5168024439918534,
|
|
"grad_norm": 16.369613647460938,
|
|
"learning_rate": 1.116011182098138e-06,
|
|
"loss": 0.0638,
|
|
"num_input_tokens_seen": 5843072,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 0.5172267481330618,
|
|
"grad_norm": 19.029876708984375,
|
|
"learning_rate": 1.1145398718179085e-06,
|
|
"loss": 0.0438,
|
|
"num_input_tokens_seen": 5847360,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"epoch": 0.5176510522742702,
|
|
"grad_norm": 0.24218542873859406,
|
|
"learning_rate": 1.1130683102477862e-06,
|
|
"loss": 0.0131,
|
|
"num_input_tokens_seen": 5852224,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.5180753564154786,
|
|
"grad_norm": 0.6491793394088745,
|
|
"learning_rate": 1.1115965006162405e-06,
|
|
"loss": 0.0231,
|
|
"num_input_tokens_seen": 5857152,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"epoch": 0.5184996605566871,
|
|
"grad_norm": 0.04124971851706505,
|
|
"learning_rate": 1.110124446152286e-06,
|
|
"loss": 0.1033,
|
|
"num_input_tokens_seen": 5861888,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 0.5189239646978955,
|
|
"grad_norm": 15.396041870117188,
|
|
"learning_rate": 1.1086521500854744e-06,
|
|
"loss": 0.1292,
|
|
"num_input_tokens_seen": 5866496,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"epoch": 0.5193482688391039,
|
|
"grad_norm": 21.97252655029297,
|
|
"learning_rate": 1.1071796156458868e-06,
|
|
"loss": 0.0559,
|
|
"num_input_tokens_seen": 5870912,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 0.5197725729803123,
|
|
"grad_norm": 5.2591938972473145,
|
|
"learning_rate": 1.1057068460641281e-06,
|
|
"loss": 0.0806,
|
|
"num_input_tokens_seen": 5876672,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"epoch": 0.5201968771215207,
|
|
"grad_norm": 14.965441703796387,
|
|
"learning_rate": 1.1042338445713183e-06,
|
|
"loss": 0.0336,
|
|
"num_input_tokens_seen": 5881024,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 0.5206211812627292,
|
|
"grad_norm": 7.191347599029541,
|
|
"learning_rate": 1.1027606143990867e-06,
|
|
"loss": 0.1725,
|
|
"num_input_tokens_seen": 5886080,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"epoch": 0.5210454854039376,
|
|
"grad_norm": 18.443265914916992,
|
|
"learning_rate": 1.1012871587795638e-06,
|
|
"loss": 0.0484,
|
|
"num_input_tokens_seen": 5890880,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 0.521469789545146,
|
|
"grad_norm": 4.920425891876221,
|
|
"learning_rate": 1.0998134809453756e-06,
|
|
"loss": 0.024,
|
|
"num_input_tokens_seen": 5895424,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"epoch": 0.5218940936863544,
|
|
"grad_norm": 0.9843973517417908,
|
|
"learning_rate": 1.0983395841296347e-06,
|
|
"loss": 0.0789,
|
|
"num_input_tokens_seen": 5900352,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 0.5223183978275628,
|
|
"grad_norm": 19.042985916137695,
|
|
"learning_rate": 1.0968654715659347e-06,
|
|
"loss": 0.0784,
|
|
"num_input_tokens_seen": 5904960,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"epoch": 0.5227427019687713,
|
|
"grad_norm": 7.392970085144043,
|
|
"learning_rate": 1.095391146488342e-06,
|
|
"loss": 0.0708,
|
|
"num_input_tokens_seen": 5910016,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 0.5231670061099797,
|
|
"grad_norm": 12.981468200683594,
|
|
"learning_rate": 1.09391661213139e-06,
|
|
"loss": 0.0644,
|
|
"num_input_tokens_seen": 5914944,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"epoch": 0.5235913102511881,
|
|
"grad_norm": 9.963648796081543,
|
|
"learning_rate": 1.0924418717300707e-06,
|
|
"loss": 0.0499,
|
|
"num_input_tokens_seen": 5920448,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 0.5240156143923965,
|
|
"grad_norm": 14.596358299255371,
|
|
"learning_rate": 1.090966928519828e-06,
|
|
"loss": 0.0327,
|
|
"num_input_tokens_seen": 5925696,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"epoch": 0.5244399185336049,
|
|
"grad_norm": 20.38301658630371,
|
|
"learning_rate": 1.0894917857365511e-06,
|
|
"loss": 0.0361,
|
|
"num_input_tokens_seen": 5930624,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 0.5248642226748133,
|
|
"grad_norm": 10.057507514953613,
|
|
"learning_rate": 1.0880164466165673e-06,
|
|
"loss": 0.0682,
|
|
"num_input_tokens_seen": 5935168,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"epoch": 0.5252885268160217,
|
|
"grad_norm": 0.2300967425107956,
|
|
"learning_rate": 1.0865409143966338e-06,
|
|
"loss": 0.0491,
|
|
"num_input_tokens_seen": 5939712,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 0.5257128309572301,
|
|
"grad_norm": 2.914245367050171,
|
|
"learning_rate": 1.0850651923139317e-06,
|
|
"loss": 0.0088,
|
|
"num_input_tokens_seen": 5944576,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"epoch": 0.5261371350984385,
|
|
"grad_norm": 30.218713760375977,
|
|
"learning_rate": 1.0835892836060598e-06,
|
|
"loss": 0.0573,
|
|
"num_input_tokens_seen": 5949184,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.5265614392396469,
|
|
"grad_norm": 0.9535674452781677,
|
|
"learning_rate": 1.0821131915110246e-06,
|
|
"loss": 0.0488,
|
|
"num_input_tokens_seen": 5954176,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"epoch": 0.5269857433808554,
|
|
"grad_norm": 18.00348663330078,
|
|
"learning_rate": 1.080636919267236e-06,
|
|
"loss": 0.0792,
|
|
"num_input_tokens_seen": 5958656,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 0.5274100475220638,
|
|
"grad_norm": 14.509427070617676,
|
|
"learning_rate": 1.079160470113499e-06,
|
|
"loss": 0.0934,
|
|
"num_input_tokens_seen": 5963264,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"epoch": 0.5278343516632722,
|
|
"grad_norm": 10.986270904541016,
|
|
"learning_rate": 1.0776838472890064e-06,
|
|
"loss": 0.0869,
|
|
"num_input_tokens_seen": 5968576,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 0.5282586558044806,
|
|
"grad_norm": 0.9491480588912964,
|
|
"learning_rate": 1.0762070540333322e-06,
|
|
"loss": 0.0059,
|
|
"num_input_tokens_seen": 5973248,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"epoch": 0.528682959945689,
|
|
"grad_norm": 0.7369577884674072,
|
|
"learning_rate": 1.0747300935864243e-06,
|
|
"loss": 0.0649,
|
|
"num_input_tokens_seen": 5977920,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 0.5291072640868975,
|
|
"grad_norm": 9.539855003356934,
|
|
"learning_rate": 1.0732529691885977e-06,
|
|
"loss": 0.0558,
|
|
"num_input_tokens_seen": 5982656,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"epoch": 0.5295315682281059,
|
|
"grad_norm": 3.556328058242798,
|
|
"learning_rate": 1.0717756840805263e-06,
|
|
"loss": 0.0415,
|
|
"num_input_tokens_seen": 5987392,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 0.5299558723693143,
|
|
"grad_norm": 0.5369057059288025,
|
|
"learning_rate": 1.0702982415032378e-06,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 5993280,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"epoch": 0.5303801765105227,
|
|
"grad_norm": 23.516464233398438,
|
|
"learning_rate": 1.068820644698104e-06,
|
|
"loss": 0.0366,
|
|
"num_input_tokens_seen": 5998272,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 0.5308044806517311,
|
|
"grad_norm": 8.132739067077637,
|
|
"learning_rate": 1.0673428969068363e-06,
|
|
"loss": 0.0304,
|
|
"num_input_tokens_seen": 6002816,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"epoch": 0.5312287847929396,
|
|
"grad_norm": 16.282440185546875,
|
|
"learning_rate": 1.0658650013714765e-06,
|
|
"loss": 0.0206,
|
|
"num_input_tokens_seen": 6007744,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 0.531653088934148,
|
|
"grad_norm": 0.06456945836544037,
|
|
"learning_rate": 1.0643869613343906e-06,
|
|
"loss": 0.09,
|
|
"num_input_tokens_seen": 6011776,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"epoch": 0.5320773930753564,
|
|
"grad_norm": 0.0371355265378952,
|
|
"learning_rate": 1.062908780038262e-06,
|
|
"loss": 0.1175,
|
|
"num_input_tokens_seen": 6017344,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.5325016972165648,
|
|
"grad_norm": 9.185993194580078,
|
|
"learning_rate": 1.0614304607260843e-06,
|
|
"loss": 0.0414,
|
|
"num_input_tokens_seen": 6022144,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"epoch": 0.5329260013577732,
|
|
"grad_norm": 18.560842514038086,
|
|
"learning_rate": 1.0599520066411529e-06,
|
|
"loss": 0.0277,
|
|
"num_input_tokens_seen": 6027712,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 0.5333503054989817,
|
|
"grad_norm": 27.039613723754883,
|
|
"learning_rate": 1.0584734210270597e-06,
|
|
"loss": 0.0459,
|
|
"num_input_tokens_seen": 6032064,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"epoch": 0.5337746096401901,
|
|
"grad_norm": 0.044652462005615234,
|
|
"learning_rate": 1.0569947071276845e-06,
|
|
"loss": 0.0491,
|
|
"num_input_tokens_seen": 6036288,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 0.5341989137813985,
|
|
"grad_norm": 13.476823806762695,
|
|
"learning_rate": 1.0555158681871897e-06,
|
|
"loss": 0.0962,
|
|
"num_input_tokens_seen": 6040960,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"epoch": 0.5346232179226069,
|
|
"grad_norm": 54.4954833984375,
|
|
"learning_rate": 1.0540369074500103e-06,
|
|
"loss": 0.0443,
|
|
"num_input_tokens_seen": 6045376,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.5350475220638153,
|
|
"grad_norm": 10.807979583740234,
|
|
"learning_rate": 1.0525578281608503e-06,
|
|
"loss": 0.0797,
|
|
"num_input_tokens_seen": 6049856,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"epoch": 0.5354718262050238,
|
|
"grad_norm": 1.0867037773132324,
|
|
"learning_rate": 1.0510786335646725e-06,
|
|
"loss": 0.0225,
|
|
"num_input_tokens_seen": 6054144,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 0.5358961303462322,
|
|
"grad_norm": 15.076601028442383,
|
|
"learning_rate": 1.0495993269066935e-06,
|
|
"loss": 0.046,
|
|
"num_input_tokens_seen": 6060032,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"epoch": 0.5363204344874406,
|
|
"grad_norm": 15.721909523010254,
|
|
"learning_rate": 1.0481199114323746e-06,
|
|
"loss": 0.1197,
|
|
"num_input_tokens_seen": 6065280,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 0.536744738628649,
|
|
"grad_norm": 0.3251340687274933,
|
|
"learning_rate": 1.0466403903874175e-06,
|
|
"loss": 0.1007,
|
|
"num_input_tokens_seen": 6070080,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"epoch": 0.5371690427698574,
|
|
"grad_norm": 9.55562973022461,
|
|
"learning_rate": 1.0451607670177543e-06,
|
|
"loss": 0.0638,
|
|
"num_input_tokens_seen": 6076032,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 0.5375933469110659,
|
|
"grad_norm": 10.594807624816895,
|
|
"learning_rate": 1.0436810445695421e-06,
|
|
"loss": 0.0613,
|
|
"num_input_tokens_seen": 6080768,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"epoch": 0.5380176510522743,
|
|
"grad_norm": 10.208658218383789,
|
|
"learning_rate": 1.0422012262891548e-06,
|
|
"loss": 0.0435,
|
|
"num_input_tokens_seen": 6085312,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 0.5384419551934827,
|
|
"grad_norm": 28.246740341186523,
|
|
"learning_rate": 1.0407213154231774e-06,
|
|
"loss": 0.0255,
|
|
"num_input_tokens_seen": 6090048,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"epoch": 0.5388662593346911,
|
|
"grad_norm": 39.74928283691406,
|
|
"learning_rate": 1.0392413152183973e-06,
|
|
"loss": 0.0789,
|
|
"num_input_tokens_seen": 6094720,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 0.5392905634758995,
|
|
"grad_norm": 35.197784423828125,
|
|
"learning_rate": 1.0377612289217982e-06,
|
|
"loss": 0.1252,
|
|
"num_input_tokens_seen": 6099456,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"epoch": 0.539714867617108,
|
|
"grad_norm": 0.0801963061094284,
|
|
"learning_rate": 1.0362810597805524e-06,
|
|
"loss": 0.0352,
|
|
"num_input_tokens_seen": 6104448,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 0.5401391717583164,
|
|
"grad_norm": 0.3187190890312195,
|
|
"learning_rate": 1.0348008110420149e-06,
|
|
"loss": 0.0373,
|
|
"num_input_tokens_seen": 6109056,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"epoch": 0.5405634758995248,
|
|
"grad_norm": 16.217567443847656,
|
|
"learning_rate": 1.0333204859537142e-06,
|
|
"loss": 0.104,
|
|
"num_input_tokens_seen": 6114496,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 0.5409877800407332,
|
|
"grad_norm": 0.10464701801538467,
|
|
"learning_rate": 1.0318400877633466e-06,
|
|
"loss": 0.0795,
|
|
"num_input_tokens_seen": 6119360,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"epoch": 0.5414120841819416,
|
|
"grad_norm": 11.15638256072998,
|
|
"learning_rate": 1.030359619718769e-06,
|
|
"loss": 0.0844,
|
|
"num_input_tokens_seen": 6124352,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 0.5418363883231501,
|
|
"grad_norm": 2.5510356426239014,
|
|
"learning_rate": 1.0288790850679916e-06,
|
|
"loss": 0.0485,
|
|
"num_input_tokens_seen": 6128832,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"epoch": 0.5422606924643585,
|
|
"grad_norm": 32.599891662597656,
|
|
"learning_rate": 1.0273984870591706e-06,
|
|
"loss": 0.0922,
|
|
"num_input_tokens_seen": 6133312,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 0.5426849966055669,
|
|
"grad_norm": 29.06612205505371,
|
|
"learning_rate": 1.025917828940601e-06,
|
|
"loss": 0.08,
|
|
"num_input_tokens_seen": 6137600,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"epoch": 0.5431093007467753,
|
|
"grad_norm": 9.78400993347168,
|
|
"learning_rate": 1.02443711396071e-06,
|
|
"loss": 0.0499,
|
|
"num_input_tokens_seen": 6142464,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 0.5435336048879837,
|
|
"grad_norm": 0.38985809683799744,
|
|
"learning_rate": 1.0229563453680495e-06,
|
|
"loss": 0.0652,
|
|
"num_input_tokens_seen": 6147072,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"epoch": 0.5439579090291922,
|
|
"grad_norm": 24.73426055908203,
|
|
"learning_rate": 1.021475526411289e-06,
|
|
"loss": 0.0541,
|
|
"num_input_tokens_seen": 6151744,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 0.5443822131704006,
|
|
"grad_norm": 6.6672892570495605,
|
|
"learning_rate": 1.0199946603392078e-06,
|
|
"loss": 0.0837,
|
|
"num_input_tokens_seen": 6156672,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"epoch": 0.544806517311609,
|
|
"grad_norm": 28.79816246032715,
|
|
"learning_rate": 1.01851375040069e-06,
|
|
"loss": 0.0547,
|
|
"num_input_tokens_seen": 6161600,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 0.5452308214528174,
|
|
"grad_norm": 0.37170231342315674,
|
|
"learning_rate": 1.0170327998447149e-06,
|
|
"loss": 0.0404,
|
|
"num_input_tokens_seen": 6165760,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"epoch": 0.5456551255940258,
|
|
"grad_norm": 1.676621913909912,
|
|
"learning_rate": 1.015551811920351e-06,
|
|
"loss": 0.0183,
|
|
"num_input_tokens_seen": 6174912,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 0.5460794297352343,
|
|
"grad_norm": 13.310209274291992,
|
|
"learning_rate": 1.014070789876749e-06,
|
|
"loss": 0.0781,
|
|
"num_input_tokens_seen": 6179136,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"epoch": 0.5465037338764427,
|
|
"grad_norm": 2.580735206604004,
|
|
"learning_rate": 1.0125897369631342e-06,
|
|
"loss": 0.1215,
|
|
"num_input_tokens_seen": 6183680,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 0.546928038017651,
|
|
"grad_norm": 31.31711769104004,
|
|
"learning_rate": 1.0111086564288003e-06,
|
|
"loss": 0.049,
|
|
"num_input_tokens_seen": 6188608,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"epoch": 0.5473523421588594,
|
|
"grad_norm": 1.1383436918258667,
|
|
"learning_rate": 1.009627551523101e-06,
|
|
"loss": 0.0609,
|
|
"num_input_tokens_seen": 6193600,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 0.5477766463000678,
|
|
"grad_norm": 0.15425796806812286,
|
|
"learning_rate": 1.008146425495443e-06,
|
|
"loss": 0.0417,
|
|
"num_input_tokens_seen": 6198528,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"epoch": 0.5482009504412763,
|
|
"grad_norm": 0.34240061044692993,
|
|
"learning_rate": 1.0066652815952805e-06,
|
|
"loss": 0.0294,
|
|
"num_input_tokens_seen": 6204096,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 0.5486252545824847,
|
|
"grad_norm": 5.866513252258301,
|
|
"learning_rate": 1.0051841230721063e-06,
|
|
"loss": 0.0446,
|
|
"num_input_tokens_seen": 6208704,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"epoch": 0.5490495587236931,
|
|
"grad_norm": 6.472537040710449,
|
|
"learning_rate": 1.0037029531754453e-06,
|
|
"loss": 0.096,
|
|
"num_input_tokens_seen": 6213440,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 0.5494738628649015,
|
|
"grad_norm": 0.4093743562698364,
|
|
"learning_rate": 1.002221775154847e-06,
|
|
"loss": 0.0276,
|
|
"num_input_tokens_seen": 6218368,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"epoch": 0.5498981670061099,
|
|
"grad_norm": 1.324775218963623,
|
|
"learning_rate": 1.0007405922598793e-06,
|
|
"loss": 0.0676,
|
|
"num_input_tokens_seen": 6223616,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 0.5503224711473184,
|
|
"grad_norm": 0.5380284786224365,
|
|
"learning_rate": 9.992594077401208e-07,
|
|
"loss": 0.1,
|
|
"num_input_tokens_seen": 6228480,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"epoch": 0.5507467752885268,
|
|
"grad_norm": 7.3268303871154785,
|
|
"learning_rate": 9.977782248451534e-07,
|
|
"loss": 0.0577,
|
|
"num_input_tokens_seen": 6232896,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 0.5507467752885268,
|
|
"eval_loss": 0.05731356516480446,
|
|
"eval_runtime": 16.6093,
|
|
"eval_samples_per_second": 630.671,
|
|
"eval_steps_per_second": 78.871,
|
|
"num_input_tokens_seen": 6232896,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 0.5511710794297352,
|
|
"grad_norm": 16.78019905090332,
|
|
"learning_rate": 9.962970468245548e-07,
|
|
"loss": 0.1202,
|
|
"num_input_tokens_seen": 6237696,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"epoch": 0.5515953835709436,
|
|
"grad_norm": 0.7755606174468994,
|
|
"learning_rate": 9.948158769278939e-07,
|
|
"loss": 0.032,
|
|
"num_input_tokens_seen": 6242304,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.552019687712152,
|
|
"grad_norm": 2.2955169677734375,
|
|
"learning_rate": 9.933347184047194e-07,
|
|
"loss": 0.0531,
|
|
"num_input_tokens_seen": 6246976,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"epoch": 0.5524439918533605,
|
|
"grad_norm": 20.1527156829834,
|
|
"learning_rate": 9.918535745045571e-07,
|
|
"loss": 0.0587,
|
|
"num_input_tokens_seen": 6251264,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 0.5528682959945689,
|
|
"grad_norm": 8.775801658630371,
|
|
"learning_rate": 9.903724484768991e-07,
|
|
"loss": 0.0075,
|
|
"num_input_tokens_seen": 6255872,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"epoch": 0.5532926001357773,
|
|
"grad_norm": 1.5770235061645508,
|
|
"learning_rate": 9.888913435711996e-07,
|
|
"loss": 0.044,
|
|
"num_input_tokens_seen": 6260928,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 0.5537169042769857,
|
|
"grad_norm": 2.763223648071289,
|
|
"learning_rate": 9.874102630368658e-07,
|
|
"loss": 0.0311,
|
|
"num_input_tokens_seen": 6265600,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"epoch": 0.5541412084181941,
|
|
"grad_norm": 1.272904872894287,
|
|
"learning_rate": 9.859292101232514e-07,
|
|
"loss": 0.0991,
|
|
"num_input_tokens_seen": 6270464,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 0.5545655125594026,
|
|
"grad_norm": 38.17815399169922,
|
|
"learning_rate": 9.84448188079649e-07,
|
|
"loss": 0.1166,
|
|
"num_input_tokens_seen": 6274944,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"epoch": 0.554989816700611,
|
|
"grad_norm": 7.1392951011657715,
|
|
"learning_rate": 9.829672001552853e-07,
|
|
"loss": 0.0637,
|
|
"num_input_tokens_seen": 6279424,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 0.5554141208418194,
|
|
"grad_norm": 6.693841934204102,
|
|
"learning_rate": 9.8148624959931e-07,
|
|
"loss": 0.125,
|
|
"num_input_tokens_seen": 6284096,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"epoch": 0.5558384249830278,
|
|
"grad_norm": 55.63361358642578,
|
|
"learning_rate": 9.80005339660792e-07,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 6289728,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 0.5562627291242362,
|
|
"grad_norm": 51.95858383178711,
|
|
"learning_rate": 9.785244735887112e-07,
|
|
"loss": 0.0526,
|
|
"num_input_tokens_seen": 6294400,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"epoch": 0.5566870332654447,
|
|
"grad_norm": 10.492825508117676,
|
|
"learning_rate": 9.770436546319504e-07,
|
|
"loss": 0.0582,
|
|
"num_input_tokens_seen": 6298880,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 0.5571113374066531,
|
|
"grad_norm": 37.061561584472656,
|
|
"learning_rate": 9.755628860392901e-07,
|
|
"loss": 0.0408,
|
|
"num_input_tokens_seen": 6303424,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"epoch": 0.5575356415478615,
|
|
"grad_norm": 18.075298309326172,
|
|
"learning_rate": 9.740821710593988e-07,
|
|
"loss": 0.0379,
|
|
"num_input_tokens_seen": 6308032,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 0.5579599456890699,
|
|
"grad_norm": 0.29117563366889954,
|
|
"learning_rate": 9.726015129408296e-07,
|
|
"loss": 0.0486,
|
|
"num_input_tokens_seen": 6312832,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"epoch": 0.5583842498302783,
|
|
"grad_norm": 10.321760177612305,
|
|
"learning_rate": 9.711209149320083e-07,
|
|
"loss": 0.0549,
|
|
"num_input_tokens_seen": 6317312,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 0.5588085539714868,
|
|
"grad_norm": 7.484567165374756,
|
|
"learning_rate": 9.69640380281231e-07,
|
|
"loss": 0.0244,
|
|
"num_input_tokens_seen": 6321920,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"epoch": 0.5592328581126952,
|
|
"grad_norm": 2.416494607925415,
|
|
"learning_rate": 9.681599122366533e-07,
|
|
"loss": 0.1148,
|
|
"num_input_tokens_seen": 6326336,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 0.5596571622539036,
|
|
"grad_norm": 0.7220749258995056,
|
|
"learning_rate": 9.66679514046286e-07,
|
|
"loss": 0.0885,
|
|
"num_input_tokens_seen": 6331008,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"epoch": 0.560081466395112,
|
|
"grad_norm": 10.408479690551758,
|
|
"learning_rate": 9.65199188957985e-07,
|
|
"loss": 0.0855,
|
|
"num_input_tokens_seen": 6335744,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 0.5605057705363204,
|
|
"grad_norm": 1.3107913732528687,
|
|
"learning_rate": 9.637189402194475e-07,
|
|
"loss": 0.0502,
|
|
"num_input_tokens_seen": 6340736,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"epoch": 0.5609300746775289,
|
|
"grad_norm": 15.155287742614746,
|
|
"learning_rate": 9.622387710782017e-07,
|
|
"loss": 0.0917,
|
|
"num_input_tokens_seen": 6345216,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 0.5613543788187373,
|
|
"grad_norm": 2.518677234649658,
|
|
"learning_rate": 9.607586847816029e-07,
|
|
"loss": 0.0395,
|
|
"num_input_tokens_seen": 6350080,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"epoch": 0.5617786829599457,
|
|
"grad_norm": 25.84773826599121,
|
|
"learning_rate": 9.592786845768225e-07,
|
|
"loss": 0.0646,
|
|
"num_input_tokens_seen": 6354816,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 0.5622029871011541,
|
|
"grad_norm": 3.7157983779907227,
|
|
"learning_rate": 9.577987737108454e-07,
|
|
"loss": 0.0668,
|
|
"num_input_tokens_seen": 6360000,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"epoch": 0.5626272912423625,
|
|
"grad_norm": 8.483981132507324,
|
|
"learning_rate": 9.563189554304578e-07,
|
|
"loss": 0.1156,
|
|
"num_input_tokens_seen": 6364672,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 0.563051595383571,
|
|
"grad_norm": 0.3436071276664734,
|
|
"learning_rate": 9.548392329822456e-07,
|
|
"loss": 0.0845,
|
|
"num_input_tokens_seen": 6369408,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"epoch": 0.5634758995247794,
|
|
"grad_norm": 11.109560012817383,
|
|
"learning_rate": 9.533596096125825e-07,
|
|
"loss": 0.0495,
|
|
"num_input_tokens_seen": 6374080,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 0.5639002036659878,
|
|
"grad_norm": 1.3223705291748047,
|
|
"learning_rate": 9.518800885676256e-07,
|
|
"loss": 0.052,
|
|
"num_input_tokens_seen": 6379200,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"epoch": 0.5643245078071962,
|
|
"grad_norm": 0.2177697867155075,
|
|
"learning_rate": 9.504006730933068e-07,
|
|
"loss": 0.0333,
|
|
"num_input_tokens_seen": 6384576,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 0.5647488119484046,
|
|
"grad_norm": 8.883828163146973,
|
|
"learning_rate": 9.489213664353276e-07,
|
|
"loss": 0.075,
|
|
"num_input_tokens_seen": 6389760,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"epoch": 0.5651731160896131,
|
|
"grad_norm": 23.963851928710938,
|
|
"learning_rate": 9.474421718391497e-07,
|
|
"loss": 0.1436,
|
|
"num_input_tokens_seen": 6394176,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 0.5655974202308215,
|
|
"grad_norm": 15.651252746582031,
|
|
"learning_rate": 9.459630925499897e-07,
|
|
"loss": 0.073,
|
|
"num_input_tokens_seen": 6398976,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"epoch": 0.5660217243720299,
|
|
"grad_norm": 18.78067398071289,
|
|
"learning_rate": 9.444841318128103e-07,
|
|
"loss": 0.0525,
|
|
"num_input_tokens_seen": 6403264,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 0.5664460285132383,
|
|
"grad_norm": 13.996537208557129,
|
|
"learning_rate": 9.430052928723152e-07,
|
|
"loss": 0.0774,
|
|
"num_input_tokens_seen": 6408128,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"epoch": 0.5668703326544468,
|
|
"grad_norm": 9.029537200927734,
|
|
"learning_rate": 9.415265789729403e-07,
|
|
"loss": 0.1095,
|
|
"num_input_tokens_seen": 6412672,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 0.5672946367956552,
|
|
"grad_norm": 0.7508693933486938,
|
|
"learning_rate": 9.400479933588468e-07,
|
|
"loss": 0.0786,
|
|
"num_input_tokens_seen": 6417088,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"epoch": 0.5677189409368636,
|
|
"grad_norm": 0.8404874801635742,
|
|
"learning_rate": 9.385695392739156e-07,
|
|
"loss": 0.0848,
|
|
"num_input_tokens_seen": 6421824,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 0.568143245078072,
|
|
"grad_norm": 1.3852840662002563,
|
|
"learning_rate": 9.370912199617376e-07,
|
|
"loss": 0.0414,
|
|
"num_input_tokens_seen": 6426560,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"epoch": 0.5685675492192804,
|
|
"grad_norm": 39.68050765991211,
|
|
"learning_rate": 9.356130386656093e-07,
|
|
"loss": 0.085,
|
|
"num_input_tokens_seen": 6431040,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 0.5689918533604889,
|
|
"grad_norm": 27.244190216064453,
|
|
"learning_rate": 9.341349986285234e-07,
|
|
"loss": 0.0348,
|
|
"num_input_tokens_seen": 6435968,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"epoch": 0.5694161575016972,
|
|
"grad_norm": 14.122902870178223,
|
|
"learning_rate": 9.326571030931636e-07,
|
|
"loss": 0.1352,
|
|
"num_input_tokens_seen": 6440640,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 0.5698404616429056,
|
|
"grad_norm": 0.6096320152282715,
|
|
"learning_rate": 9.311793553018958e-07,
|
|
"loss": 0.0731,
|
|
"num_input_tokens_seen": 6445504,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"epoch": 0.570264765784114,
|
|
"grad_norm": 9.897034645080566,
|
|
"learning_rate": 9.297017584967624e-07,
|
|
"loss": 0.0378,
|
|
"num_input_tokens_seen": 6449600,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 0.5706890699253224,
|
|
"grad_norm": 3.9690327644348145,
|
|
"learning_rate": 9.282243159194734e-07,
|
|
"loss": 0.0608,
|
|
"num_input_tokens_seen": 6454528,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"epoch": 0.5711133740665308,
|
|
"grad_norm": 0.2730451226234436,
|
|
"learning_rate": 9.267470308114025e-07,
|
|
"loss": 0.0828,
|
|
"num_input_tokens_seen": 6459264,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 0.5715376782077393,
|
|
"grad_norm": 13.87447738647461,
|
|
"learning_rate": 9.252699064135758e-07,
|
|
"loss": 0.0718,
|
|
"num_input_tokens_seen": 6463552,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"epoch": 0.5719619823489477,
|
|
"grad_norm": 0.289069265127182,
|
|
"learning_rate": 9.23792945966668e-07,
|
|
"loss": 0.0057,
|
|
"num_input_tokens_seen": 6468608,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 0.5723862864901561,
|
|
"grad_norm": 1.9764337539672852,
|
|
"learning_rate": 9.223161527109936e-07,
|
|
"loss": 0.0274,
|
|
"num_input_tokens_seen": 6473408,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"epoch": 0.5728105906313645,
|
|
"grad_norm": 0.2257399559020996,
|
|
"learning_rate": 9.208395298865014e-07,
|
|
"loss": 0.0165,
|
|
"num_input_tokens_seen": 6478656,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"epoch": 0.573234894772573,
|
|
"grad_norm": 6.887660026550293,
|
|
"learning_rate": 9.19363080732764e-07,
|
|
"loss": 0.1115,
|
|
"num_input_tokens_seen": 6483328,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"epoch": 0.5736591989137814,
|
|
"grad_norm": 0.04757938161492348,
|
|
"learning_rate": 9.178868084889756e-07,
|
|
"loss": 0.0175,
|
|
"num_input_tokens_seen": 6488064,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 0.5740835030549898,
|
|
"grad_norm": 19.896671295166016,
|
|
"learning_rate": 9.164107163939401e-07,
|
|
"loss": 0.1151,
|
|
"num_input_tokens_seen": 6492864,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"epoch": 0.5745078071961982,
|
|
"grad_norm": 11.189603805541992,
|
|
"learning_rate": 9.149348076860685e-07,
|
|
"loss": 0.0451,
|
|
"num_input_tokens_seen": 6497216,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"epoch": 0.5749321113374066,
|
|
"grad_norm": 6.523300647735596,
|
|
"learning_rate": 9.134590856033664e-07,
|
|
"loss": 0.0378,
|
|
"num_input_tokens_seen": 6501888,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"epoch": 0.575356415478615,
|
|
"grad_norm": 17.663593292236328,
|
|
"learning_rate": 9.11983553383433e-07,
|
|
"loss": 0.0864,
|
|
"num_input_tokens_seen": 6507200,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 0.5757807196198235,
|
|
"grad_norm": 0.15700259804725647,
|
|
"learning_rate": 9.105082142634489e-07,
|
|
"loss": 0.0119,
|
|
"num_input_tokens_seen": 6515840,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"epoch": 0.5762050237610319,
|
|
"grad_norm": 10.5796537399292,
|
|
"learning_rate": 9.090330714801723e-07,
|
|
"loss": 0.117,
|
|
"num_input_tokens_seen": 6520384,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"epoch": 0.5766293279022403,
|
|
"grad_norm": 5.669445991516113,
|
|
"learning_rate": 9.075581282699294e-07,
|
|
"loss": 0.0825,
|
|
"num_input_tokens_seen": 6524992,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"epoch": 0.5770536320434487,
|
|
"grad_norm": 15.555697441101074,
|
|
"learning_rate": 9.060833878686098e-07,
|
|
"loss": 0.1268,
|
|
"num_input_tokens_seen": 6532160,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 0.5774779361846571,
|
|
"grad_norm": 1.5832566022872925,
|
|
"learning_rate": 9.046088535116581e-07,
|
|
"loss": 0.0236,
|
|
"num_input_tokens_seen": 6536384,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"epoch": 0.5779022403258656,
|
|
"grad_norm": 3.34737229347229,
|
|
"learning_rate": 9.031345284340652e-07,
|
|
"loss": 0.0107,
|
|
"num_input_tokens_seen": 6540800,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"epoch": 0.578326544467074,
|
|
"grad_norm": 14.524471282958984,
|
|
"learning_rate": 9.016604158703654e-07,
|
|
"loss": 0.1481,
|
|
"num_input_tokens_seen": 6545216,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"epoch": 0.5787508486082824,
|
|
"grad_norm": 0.1422017216682434,
|
|
"learning_rate": 9.001865190546244e-07,
|
|
"loss": 0.0529,
|
|
"num_input_tokens_seen": 6550400,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 0.5791751527494908,
|
|
"grad_norm": 13.241820335388184,
|
|
"learning_rate": 8.987128412204363e-07,
|
|
"loss": 0.0324,
|
|
"num_input_tokens_seen": 6554752,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"epoch": 0.5795994568906992,
|
|
"grad_norm": 9.32215690612793,
|
|
"learning_rate": 8.972393856009132e-07,
|
|
"loss": 0.0484,
|
|
"num_input_tokens_seen": 6559616,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"epoch": 0.5800237610319077,
|
|
"grad_norm": 11.575126647949219,
|
|
"learning_rate": 8.957661554286817e-07,
|
|
"loss": 0.0353,
|
|
"num_input_tokens_seen": 6564608,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"epoch": 0.5804480651731161,
|
|
"grad_norm": 2.039041042327881,
|
|
"learning_rate": 8.942931539358718e-07,
|
|
"loss": 0.0315,
|
|
"num_input_tokens_seen": 6569024,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 0.5808723693143245,
|
|
"grad_norm": 0.281807005405426,
|
|
"learning_rate": 8.928203843541131e-07,
|
|
"loss": 0.049,
|
|
"num_input_tokens_seen": 6574016,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"epoch": 0.5812966734555329,
|
|
"grad_norm": 22.7374267578125,
|
|
"learning_rate": 8.913478499145254e-07,
|
|
"loss": 0.012,
|
|
"num_input_tokens_seen": 6578944,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"epoch": 0.5817209775967414,
|
|
"grad_norm": 18.85504913330078,
|
|
"learning_rate": 8.898755538477138e-07,
|
|
"loss": 0.0403,
|
|
"num_input_tokens_seen": 6584192,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"epoch": 0.5821452817379498,
|
|
"grad_norm": 14.321463584899902,
|
|
"learning_rate": 8.884034993837594e-07,
|
|
"loss": 0.101,
|
|
"num_input_tokens_seen": 6589056,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 0.5825695858791582,
|
|
"grad_norm": 29.105127334594727,
|
|
"learning_rate": 8.869316897522141e-07,
|
|
"loss": 0.0699,
|
|
"num_input_tokens_seen": 6593536,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"epoch": 0.5829938900203666,
|
|
"grad_norm": 11.10505485534668,
|
|
"learning_rate": 8.854601281820914e-07,
|
|
"loss": 0.0861,
|
|
"num_input_tokens_seen": 6600128,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"epoch": 0.583418194161575,
|
|
"grad_norm": 19.249723434448242,
|
|
"learning_rate": 8.839888179018621e-07,
|
|
"loss": 0.0448,
|
|
"num_input_tokens_seen": 6604864,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"epoch": 0.5838424983027835,
|
|
"grad_norm": 22.291515350341797,
|
|
"learning_rate": 8.825177621394449e-07,
|
|
"loss": 0.0219,
|
|
"num_input_tokens_seen": 6609728,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 0.5842668024439919,
|
|
"grad_norm": 0.8249548077583313,
|
|
"learning_rate": 8.810469641222001e-07,
|
|
"loss": 0.0289,
|
|
"num_input_tokens_seen": 6615104,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"epoch": 0.5846911065852003,
|
|
"grad_norm": 0.4608720541000366,
|
|
"learning_rate": 8.795764270769221e-07,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 6620096,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"epoch": 0.5851154107264087,
|
|
"grad_norm": 26.97224998474121,
|
|
"learning_rate": 8.781061542298341e-07,
|
|
"loss": 0.1021,
|
|
"num_input_tokens_seen": 6624448,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"epoch": 0.5855397148676171,
|
|
"grad_norm": 18.762897491455078,
|
|
"learning_rate": 8.766361488065783e-07,
|
|
"loss": 0.1331,
|
|
"num_input_tokens_seen": 6628800,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.5859640190088256,
|
|
"grad_norm": 16.53778076171875,
|
|
"learning_rate": 8.751664140322112e-07,
|
|
"loss": 0.0463,
|
|
"num_input_tokens_seen": 6633664,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"epoch": 0.586388323150034,
|
|
"grad_norm": 29.110645294189453,
|
|
"learning_rate": 8.736969531311942e-07,
|
|
"loss": 0.099,
|
|
"num_input_tokens_seen": 6638720,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"epoch": 0.5868126272912424,
|
|
"grad_norm": 0.4162554442882538,
|
|
"learning_rate": 8.7222776932739e-07,
|
|
"loss": 0.0489,
|
|
"num_input_tokens_seen": 6643008,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"epoch": 0.5872369314324508,
|
|
"grad_norm": 2.022207260131836,
|
|
"learning_rate": 8.70758865844051e-07,
|
|
"loss": 0.043,
|
|
"num_input_tokens_seen": 6647360,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 0.5876612355736592,
|
|
"grad_norm": 0.21920643746852875,
|
|
"learning_rate": 8.69290245903816e-07,
|
|
"loss": 0.0418,
|
|
"num_input_tokens_seen": 6652032,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"epoch": 0.5880855397148677,
|
|
"grad_norm": 30.20033073425293,
|
|
"learning_rate": 8.678219127287018e-07,
|
|
"loss": 0.1014,
|
|
"num_input_tokens_seen": 6656320,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"epoch": 0.5885098438560761,
|
|
"grad_norm": 3.282747983932495,
|
|
"learning_rate": 8.663538695400951e-07,
|
|
"loss": 0.0698,
|
|
"num_input_tokens_seen": 6660928,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"epoch": 0.5889341479972845,
|
|
"grad_norm": 0.4774364233016968,
|
|
"learning_rate": 8.648861195587475e-07,
|
|
"loss": 0.0335,
|
|
"num_input_tokens_seen": 6665856,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 0.5893584521384929,
|
|
"grad_norm": 0.6090132594108582,
|
|
"learning_rate": 8.634186660047663e-07,
|
|
"loss": 0.0438,
|
|
"num_input_tokens_seen": 6670144,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"epoch": 0.5897827562797013,
|
|
"grad_norm": 23.73943519592285,
|
|
"learning_rate": 8.619515120976097e-07,
|
|
"loss": 0.0814,
|
|
"num_input_tokens_seen": 6675264,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"epoch": 0.5902070604209098,
|
|
"grad_norm": 11.576618194580078,
|
|
"learning_rate": 8.60484661056077e-07,
|
|
"loss": 0.1433,
|
|
"num_input_tokens_seen": 6679552,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"epoch": 0.5906313645621182,
|
|
"grad_norm": 10.501605987548828,
|
|
"learning_rate": 8.590181160983043e-07,
|
|
"loss": 0.0439,
|
|
"num_input_tokens_seen": 6683904,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 0.5910556687033266,
|
|
"grad_norm": 9.076457977294922,
|
|
"learning_rate": 8.575518804417552e-07,
|
|
"loss": 0.0574,
|
|
"num_input_tokens_seen": 6688320,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"epoch": 0.591479972844535,
|
|
"grad_norm": 1.5596237182617188,
|
|
"learning_rate": 8.560859573032161e-07,
|
|
"loss": 0.0296,
|
|
"num_input_tokens_seen": 6693696,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"epoch": 0.5919042769857433,
|
|
"grad_norm": 2.413376808166504,
|
|
"learning_rate": 8.546203498987861e-07,
|
|
"loss": 0.0478,
|
|
"num_input_tokens_seen": 6698496,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"epoch": 0.5923285811269517,
|
|
"grad_norm": 24.137184143066406,
|
|
"learning_rate": 8.531550614438729e-07,
|
|
"loss": 0.0289,
|
|
"num_input_tokens_seen": 6704192,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 0.5927528852681602,
|
|
"grad_norm": 1.2536343336105347,
|
|
"learning_rate": 8.516900951531832e-07,
|
|
"loss": 0.0125,
|
|
"num_input_tokens_seen": 6708480,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"epoch": 0.5931771894093686,
|
|
"grad_norm": 0.4651804566383362,
|
|
"learning_rate": 8.502254542407185e-07,
|
|
"loss": 0.0571,
|
|
"num_input_tokens_seen": 6713856,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"epoch": 0.593601493550577,
|
|
"grad_norm": 6.357785224914551,
|
|
"learning_rate": 8.487611419197653e-07,
|
|
"loss": 0.115,
|
|
"num_input_tokens_seen": 6719104,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"epoch": 0.5940257976917854,
|
|
"grad_norm": 15.31405258178711,
|
|
"learning_rate": 8.472971614028895e-07,
|
|
"loss": 0.0612,
|
|
"num_input_tokens_seen": 6723328,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.5944501018329938,
|
|
"grad_norm": 0.09603298455476761,
|
|
"learning_rate": 8.458335159019288e-07,
|
|
"loss": 0.0476,
|
|
"num_input_tokens_seen": 6728064,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"epoch": 0.5948744059742023,
|
|
"grad_norm": 1.3734678030014038,
|
|
"learning_rate": 8.443702086279866e-07,
|
|
"loss": 0.0814,
|
|
"num_input_tokens_seen": 6732864,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"epoch": 0.5952987101154107,
|
|
"grad_norm": 2.1125996112823486,
|
|
"learning_rate": 8.429072427914235e-07,
|
|
"loss": 0.0204,
|
|
"num_input_tokens_seen": 6737792,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"epoch": 0.5957230142566191,
|
|
"grad_norm": 1.3404954671859741,
|
|
"learning_rate": 8.414446216018516e-07,
|
|
"loss": 0.046,
|
|
"num_input_tokens_seen": 6742848,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 0.5961473183978275,
|
|
"grad_norm": 0.4186006784439087,
|
|
"learning_rate": 8.399823482681261e-07,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 6748160,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"epoch": 0.596571622539036,
|
|
"grad_norm": 2.4096479415893555,
|
|
"learning_rate": 8.385204259983403e-07,
|
|
"loss": 0.0812,
|
|
"num_input_tokens_seen": 6752960,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"epoch": 0.5969959266802444,
|
|
"grad_norm": 7.440178871154785,
|
|
"learning_rate": 8.37058857999816e-07,
|
|
"loss": 0.051,
|
|
"num_input_tokens_seen": 6758016,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"epoch": 0.5974202308214528,
|
|
"grad_norm": 13.362556457519531,
|
|
"learning_rate": 8.355976474790987e-07,
|
|
"loss": 0.0947,
|
|
"num_input_tokens_seen": 6762688,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 0.5978445349626612,
|
|
"grad_norm": 23.323942184448242,
|
|
"learning_rate": 8.341367976419485e-07,
|
|
"loss": 0.0404,
|
|
"num_input_tokens_seen": 6767424,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"epoch": 0.5982688391038696,
|
|
"grad_norm": 9.576716423034668,
|
|
"learning_rate": 8.326763116933359e-07,
|
|
"loss": 0.0808,
|
|
"num_input_tokens_seen": 6771648,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"epoch": 0.598693143245078,
|
|
"grad_norm": 10.22767162322998,
|
|
"learning_rate": 8.312161928374317e-07,
|
|
"loss": 0.0497,
|
|
"num_input_tokens_seen": 6776832,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"epoch": 0.5991174473862865,
|
|
"grad_norm": 0.2496548891067505,
|
|
"learning_rate": 8.297564442776012e-07,
|
|
"loss": 0.029,
|
|
"num_input_tokens_seen": 6781120,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 0.5995417515274949,
|
|
"grad_norm": 0.28316208720207214,
|
|
"learning_rate": 8.282970692163988e-07,
|
|
"loss": 0.0345,
|
|
"num_input_tokens_seen": 6785472,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"epoch": 0.5999660556687033,
|
|
"grad_norm": 12.713273048400879,
|
|
"learning_rate": 8.268380708555579e-07,
|
|
"loss": 0.0943,
|
|
"num_input_tokens_seen": 6791488,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"epoch": 0.6003903598099117,
|
|
"grad_norm": 0.26963818073272705,
|
|
"learning_rate": 8.253794523959863e-07,
|
|
"loss": 0.0802,
|
|
"num_input_tokens_seen": 6797120,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"epoch": 0.6008146639511202,
|
|
"grad_norm": 8.560358047485352,
|
|
"learning_rate": 8.239212170377576e-07,
|
|
"loss": 0.042,
|
|
"num_input_tokens_seen": 6801984,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 0.6008146639511202,
|
|
"eval_loss": 0.06250116229057312,
|
|
"eval_runtime": 16.5825,
|
|
"eval_samples_per_second": 631.692,
|
|
"eval_steps_per_second": 78.999,
|
|
"num_input_tokens_seen": 6801984,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 0.6012389680923286,
|
|
"grad_norm": 7.389669418334961,
|
|
"learning_rate": 8.224633679801062e-07,
|
|
"loss": 0.0869,
|
|
"num_input_tokens_seen": 6806528,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"epoch": 0.601663272233537,
|
|
"grad_norm": 11.655424118041992,
|
|
"learning_rate": 8.210059084214176e-07,
|
|
"loss": 0.1458,
|
|
"num_input_tokens_seen": 6811456,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"epoch": 0.6020875763747454,
|
|
"grad_norm": 0.5460314154624939,
|
|
"learning_rate": 8.195488415592237e-07,
|
|
"loss": 0.0059,
|
|
"num_input_tokens_seen": 6815872,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"epoch": 0.6025118805159538,
|
|
"grad_norm": 0.6214194893836975,
|
|
"learning_rate": 8.180921705901941e-07,
|
|
"loss": 0.0067,
|
|
"num_input_tokens_seen": 6821376,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 0.6029361846571623,
|
|
"grad_norm": 7.452787399291992,
|
|
"learning_rate": 8.16635898710131e-07,
|
|
"loss": 0.05,
|
|
"num_input_tokens_seen": 6826688,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"epoch": 0.6033604887983707,
|
|
"grad_norm": 0.3094349503517151,
|
|
"learning_rate": 8.151800291139596e-07,
|
|
"loss": 0.0477,
|
|
"num_input_tokens_seen": 6831680,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"epoch": 0.6037847929395791,
|
|
"grad_norm": 0.4478607773780823,
|
|
"learning_rate": 8.137245649957239e-07,
|
|
"loss": 0.0201,
|
|
"num_input_tokens_seen": 6836032,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"epoch": 0.6042090970807875,
|
|
"grad_norm": 15.664519309997559,
|
|
"learning_rate": 8.122695095485767e-07,
|
|
"loss": 0.0532,
|
|
"num_input_tokens_seen": 6840576,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 0.6046334012219959,
|
|
"grad_norm": 0.12322332710027695,
|
|
"learning_rate": 8.108148659647764e-07,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 6845696,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"epoch": 0.6050577053632044,
|
|
"grad_norm": 0.32104411721229553,
|
|
"learning_rate": 8.093606374356758e-07,
|
|
"loss": 0.0707,
|
|
"num_input_tokens_seen": 6849984,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"epoch": 0.6054820095044128,
|
|
"grad_norm": 0.27167341113090515,
|
|
"learning_rate": 8.079068271517182e-07,
|
|
"loss": 0.0376,
|
|
"num_input_tokens_seen": 6854592,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"epoch": 0.6059063136456212,
|
|
"grad_norm": 35.615360260009766,
|
|
"learning_rate": 8.064534383024284e-07,
|
|
"loss": 0.1343,
|
|
"num_input_tokens_seen": 6859584,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 0.6063306177868296,
|
|
"grad_norm": 4.047654151916504,
|
|
"learning_rate": 8.050004740764082e-07,
|
|
"loss": 0.0905,
|
|
"num_input_tokens_seen": 6864320,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"epoch": 0.606754921928038,
|
|
"grad_norm": 7.075096130371094,
|
|
"learning_rate": 8.035479376613261e-07,
|
|
"loss": 0.0268,
|
|
"num_input_tokens_seen": 6868672,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"epoch": 0.6071792260692465,
|
|
"grad_norm": 22.66986846923828,
|
|
"learning_rate": 8.020958322439132e-07,
|
|
"loss": 0.0716,
|
|
"num_input_tokens_seen": 6873088,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"epoch": 0.6076035302104549,
|
|
"grad_norm": 17.093496322631836,
|
|
"learning_rate": 8.006441610099539e-07,
|
|
"loss": 0.0323,
|
|
"num_input_tokens_seen": 6877568,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 0.6080278343516633,
|
|
"grad_norm": 6.461370468139648,
|
|
"learning_rate": 7.991929271442817e-07,
|
|
"loss": 0.1091,
|
|
"num_input_tokens_seen": 6882112,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"epoch": 0.6084521384928717,
|
|
"grad_norm": 11.036733627319336,
|
|
"learning_rate": 7.977421338307687e-07,
|
|
"loss": 0.1082,
|
|
"num_input_tokens_seen": 6886720,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"epoch": 0.6088764426340801,
|
|
"grad_norm": 1.370815634727478,
|
|
"learning_rate": 7.962917842523215e-07,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 6891584,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"epoch": 0.6093007467752886,
|
|
"grad_norm": 27.122100830078125,
|
|
"learning_rate": 7.94841881590874e-07,
|
|
"loss": 0.0287,
|
|
"num_input_tokens_seen": 6896320,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 0.609725050916497,
|
|
"grad_norm": 12.820626258850098,
|
|
"learning_rate": 7.933924290273774e-07,
|
|
"loss": 0.1227,
|
|
"num_input_tokens_seen": 6900992,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"epoch": 0.6101493550577054,
|
|
"grad_norm": 1.8752328157424927,
|
|
"learning_rate": 7.919434297417976e-07,
|
|
"loss": 0.0692,
|
|
"num_input_tokens_seen": 6905600,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"epoch": 0.6105736591989138,
|
|
"grad_norm": 14.645645141601562,
|
|
"learning_rate": 7.904948869131039e-07,
|
|
"loss": 0.0946,
|
|
"num_input_tokens_seen": 6910208,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"epoch": 0.6109979633401222,
|
|
"grad_norm": 3.813603639602661,
|
|
"learning_rate": 7.89046803719267e-07,
|
|
"loss": 0.0417,
|
|
"num_input_tokens_seen": 6915264,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.6114222674813307,
|
|
"grad_norm": 4.853946685791016,
|
|
"learning_rate": 7.875991833372463e-07,
|
|
"loss": 0.0165,
|
|
"num_input_tokens_seen": 6919808,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"epoch": 0.6118465716225391,
|
|
"grad_norm": 6.343606472015381,
|
|
"learning_rate": 7.861520289429879e-07,
|
|
"loss": 0.039,
|
|
"num_input_tokens_seen": 6924608,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"epoch": 0.6122708757637475,
|
|
"grad_norm": 4.9066691398620605,
|
|
"learning_rate": 7.847053437114141e-07,
|
|
"loss": 0.0376,
|
|
"num_input_tokens_seen": 6929344,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"epoch": 0.6126951799049559,
|
|
"grad_norm": 11.412519454956055,
|
|
"learning_rate": 7.832591308164193e-07,
|
|
"loss": 0.1282,
|
|
"num_input_tokens_seen": 6934464,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 0.6131194840461643,
|
|
"grad_norm": 26.504152297973633,
|
|
"learning_rate": 7.818133934308606e-07,
|
|
"loss": 0.0867,
|
|
"num_input_tokens_seen": 6939456,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"epoch": 0.6135437881873728,
|
|
"grad_norm": 0.2936766445636749,
|
|
"learning_rate": 7.803681347265524e-07,
|
|
"loss": 0.004,
|
|
"num_input_tokens_seen": 6943808,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"epoch": 0.6139680923285811,
|
|
"grad_norm": 0.2572391927242279,
|
|
"learning_rate": 7.789233578742583e-07,
|
|
"loss": 0.0286,
|
|
"num_input_tokens_seen": 6948736,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"epoch": 0.6143923964697895,
|
|
"grad_norm": 18.245132446289062,
|
|
"learning_rate": 7.774790660436857e-07,
|
|
"loss": 0.0959,
|
|
"num_input_tokens_seen": 6953792,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 0.6148167006109979,
|
|
"grad_norm": 0.10369865596294403,
|
|
"learning_rate": 7.760352624034769e-07,
|
|
"loss": 0.0231,
|
|
"num_input_tokens_seen": 6958656,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"epoch": 0.6152410047522063,
|
|
"grad_norm": 22.0079402923584,
|
|
"learning_rate": 7.745919501212043e-07,
|
|
"loss": 0.0454,
|
|
"num_input_tokens_seen": 6963200,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"epoch": 0.6156653088934148,
|
|
"grad_norm": 15.568541526794434,
|
|
"learning_rate": 7.731491323633608e-07,
|
|
"loss": 0.0239,
|
|
"num_input_tokens_seen": 6968448,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"epoch": 0.6160896130346232,
|
|
"grad_norm": 10.313304901123047,
|
|
"learning_rate": 7.71706812295356e-07,
|
|
"loss": 0.0891,
|
|
"num_input_tokens_seen": 6973888,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 0.6165139171758316,
|
|
"grad_norm": 7.665482044219971,
|
|
"learning_rate": 7.702649930815065e-07,
|
|
"loss": 0.0806,
|
|
"num_input_tokens_seen": 6978304,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"epoch": 0.61693822131704,
|
|
"grad_norm": 32.76676559448242,
|
|
"learning_rate": 7.688236778850306e-07,
|
|
"loss": 0.0711,
|
|
"num_input_tokens_seen": 6983168,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"epoch": 0.6173625254582484,
|
|
"grad_norm": 0.3456765413284302,
|
|
"learning_rate": 7.6738286986804e-07,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 6988224,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"epoch": 0.6177868295994569,
|
|
"grad_norm": 1.373316764831543,
|
|
"learning_rate": 7.659425721915351e-07,
|
|
"loss": 0.0878,
|
|
"num_input_tokens_seen": 6992448,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 0.6182111337406653,
|
|
"grad_norm": 8.865594863891602,
|
|
"learning_rate": 7.645027880153956e-07,
|
|
"loss": 0.0552,
|
|
"num_input_tokens_seen": 6996864,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"epoch": 0.6186354378818737,
|
|
"grad_norm": 9.038241386413574,
|
|
"learning_rate": 7.63063520498375e-07,
|
|
"loss": 0.0331,
|
|
"num_input_tokens_seen": 7001984,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"epoch": 0.6190597420230821,
|
|
"grad_norm": 10.943050384521484,
|
|
"learning_rate": 7.616247727980927e-07,
|
|
"loss": 0.0884,
|
|
"num_input_tokens_seen": 7007168,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"epoch": 0.6194840461642905,
|
|
"grad_norm": 0.30838364362716675,
|
|
"learning_rate": 7.601865480710289e-07,
|
|
"loss": 0.0308,
|
|
"num_input_tokens_seen": 7012416,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 0.619908350305499,
|
|
"grad_norm": 10.412365913391113,
|
|
"learning_rate": 7.587488494725156e-07,
|
|
"loss": 0.0686,
|
|
"num_input_tokens_seen": 7016576,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"epoch": 0.6203326544467074,
|
|
"grad_norm": 6.440368175506592,
|
|
"learning_rate": 7.573116801567301e-07,
|
|
"loss": 0.0491,
|
|
"num_input_tokens_seen": 7021056,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"epoch": 0.6207569585879158,
|
|
"grad_norm": 1.5355511903762817,
|
|
"learning_rate": 7.558750432766901e-07,
|
|
"loss": 0.0869,
|
|
"num_input_tokens_seen": 7025984,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"epoch": 0.6211812627291242,
|
|
"grad_norm": 20.502912521362305,
|
|
"learning_rate": 7.544389419842429e-07,
|
|
"loss": 0.0134,
|
|
"num_input_tokens_seen": 7031104,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 0.6216055668703326,
|
|
"grad_norm": 0.8127592206001282,
|
|
"learning_rate": 7.530033794300631e-07,
|
|
"loss": 0.0437,
|
|
"num_input_tokens_seen": 7035904,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"epoch": 0.6220298710115411,
|
|
"grad_norm": 7.8356242179870605,
|
|
"learning_rate": 7.515683587636412e-07,
|
|
"loss": 0.1112,
|
|
"num_input_tokens_seen": 7040384,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"epoch": 0.6224541751527495,
|
|
"grad_norm": 0.13826069235801697,
|
|
"learning_rate": 7.501338831332813e-07,
|
|
"loss": 0.0151,
|
|
"num_input_tokens_seen": 7045312,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"epoch": 0.6228784792939579,
|
|
"grad_norm": 19.061513900756836,
|
|
"learning_rate": 7.486999556860889e-07,
|
|
"loss": 0.06,
|
|
"num_input_tokens_seen": 7050048,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 0.6233027834351663,
|
|
"grad_norm": 0.10979703813791275,
|
|
"learning_rate": 7.472665795679694e-07,
|
|
"loss": 0.0236,
|
|
"num_input_tokens_seen": 7054592,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"epoch": 0.6237270875763747,
|
|
"grad_norm": 26.652597427368164,
|
|
"learning_rate": 7.458337579236168e-07,
|
|
"loss": 0.0927,
|
|
"num_input_tokens_seen": 7059392,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"epoch": 0.6241513917175832,
|
|
"grad_norm": 5.441608428955078,
|
|
"learning_rate": 7.4440149389651e-07,
|
|
"loss": 0.1195,
|
|
"num_input_tokens_seen": 7063552,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"epoch": 0.6245756958587916,
|
|
"grad_norm": 22.899656295776367,
|
|
"learning_rate": 7.429697906289029e-07,
|
|
"loss": 0.0531,
|
|
"num_input_tokens_seen": 7068288,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 0.625,
|
|
"grad_norm": 16.39104652404785,
|
|
"learning_rate": 7.415386512618216e-07,
|
|
"loss": 0.0146,
|
|
"num_input_tokens_seen": 7073216,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"epoch": 0.6254243041412084,
|
|
"grad_norm": 9.947188377380371,
|
|
"learning_rate": 7.401080789350525e-07,
|
|
"loss": 0.0707,
|
|
"num_input_tokens_seen": 7077824,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"epoch": 0.6258486082824168,
|
|
"grad_norm": 0.26620033383369446,
|
|
"learning_rate": 7.386780767871396e-07,
|
|
"loss": 0.0244,
|
|
"num_input_tokens_seen": 7082240,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"epoch": 0.6262729124236253,
|
|
"grad_norm": 31.630956649780273,
|
|
"learning_rate": 7.372486479553748e-07,
|
|
"loss": 0.0797,
|
|
"num_input_tokens_seen": 7087360,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 0.6266972165648337,
|
|
"grad_norm": 0.16265109181404114,
|
|
"learning_rate": 7.358197955757939e-07,
|
|
"loss": 0.0338,
|
|
"num_input_tokens_seen": 7092288,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"epoch": 0.6271215207060421,
|
|
"grad_norm": 18.763938903808594,
|
|
"learning_rate": 7.343915227831661e-07,
|
|
"loss": 0.0369,
|
|
"num_input_tokens_seen": 7096768,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"epoch": 0.6275458248472505,
|
|
"grad_norm": 28.4324951171875,
|
|
"learning_rate": 7.329638327109902e-07,
|
|
"loss": 0.0778,
|
|
"num_input_tokens_seen": 7101312,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"epoch": 0.6279701289884589,
|
|
"grad_norm": 7.5484466552734375,
|
|
"learning_rate": 7.315367284914861e-07,
|
|
"loss": 0.0852,
|
|
"num_input_tokens_seen": 7105664,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 0.6283944331296674,
|
|
"grad_norm": 1.3473660945892334,
|
|
"learning_rate": 7.301102132555891e-07,
|
|
"loss": 0.0622,
|
|
"num_input_tokens_seen": 7110208,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"epoch": 0.6288187372708758,
|
|
"grad_norm": 0.4494148790836334,
|
|
"learning_rate": 7.286842901329412e-07,
|
|
"loss": 0.0032,
|
|
"num_input_tokens_seen": 7115136,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"epoch": 0.6292430414120842,
|
|
"grad_norm": 0.6983975768089294,
|
|
"learning_rate": 7.272589622518863e-07,
|
|
"loss": 0.0368,
|
|
"num_input_tokens_seen": 7119552,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"epoch": 0.6296673455532926,
|
|
"grad_norm": 8.335647583007812,
|
|
"learning_rate": 7.258342327394616e-07,
|
|
"loss": 0.0652,
|
|
"num_input_tokens_seen": 7124352,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 0.630091649694501,
|
|
"grad_norm": 0.135335311293602,
|
|
"learning_rate": 7.244101047213927e-07,
|
|
"loss": 0.0263,
|
|
"num_input_tokens_seen": 7128768,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"epoch": 0.6305159538357095,
|
|
"grad_norm": 39.84171676635742,
|
|
"learning_rate": 7.229865813220843e-07,
|
|
"loss": 0.0752,
|
|
"num_input_tokens_seen": 7133568,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"epoch": 0.6309402579769179,
|
|
"grad_norm": 9.711935997009277,
|
|
"learning_rate": 7.215636656646151e-07,
|
|
"loss": 0.0693,
|
|
"num_input_tokens_seen": 7138112,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"epoch": 0.6313645621181263,
|
|
"grad_norm": 32.676143646240234,
|
|
"learning_rate": 7.201413608707312e-07,
|
|
"loss": 0.0568,
|
|
"num_input_tokens_seen": 7142848,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 0.6317888662593347,
|
|
"grad_norm": 0.1600508689880371,
|
|
"learning_rate": 7.187196700608372e-07,
|
|
"loss": 0.0759,
|
|
"num_input_tokens_seen": 7147584,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"epoch": 0.6322131704005431,
|
|
"grad_norm": 0.4176275134086609,
|
|
"learning_rate": 7.172985963539919e-07,
|
|
"loss": 0.0819,
|
|
"num_input_tokens_seen": 7152192,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"epoch": 0.6326374745417516,
|
|
"grad_norm": 8.17668628692627,
|
|
"learning_rate": 7.158781428678989e-07,
|
|
"loss": 0.0581,
|
|
"num_input_tokens_seen": 7156672,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"epoch": 0.63306177868296,
|
|
"grad_norm": 23.636079788208008,
|
|
"learning_rate": 7.144583127189028e-07,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 7161664,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 0.6334860828241684,
|
|
"grad_norm": 28.82695960998535,
|
|
"learning_rate": 7.130391090219789e-07,
|
|
"loss": 0.0409,
|
|
"num_input_tokens_seen": 7166016,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"epoch": 0.6339103869653768,
|
|
"grad_norm": 33.98797607421875,
|
|
"learning_rate": 7.116205348907298e-07,
|
|
"loss": 0.0595,
|
|
"num_input_tokens_seen": 7170752,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"epoch": 0.6343346911065852,
|
|
"grad_norm": 7.144901752471924,
|
|
"learning_rate": 7.10202593437375e-07,
|
|
"loss": 0.0523,
|
|
"num_input_tokens_seen": 7176064,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"epoch": 0.6347589952477937,
|
|
"grad_norm": 41.93625259399414,
|
|
"learning_rate": 7.08785287772748e-07,
|
|
"loss": 0.071,
|
|
"num_input_tokens_seen": 7181312,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 0.6351832993890021,
|
|
"grad_norm": 38.973453521728516,
|
|
"learning_rate": 7.073686210062859e-07,
|
|
"loss": 0.0724,
|
|
"num_input_tokens_seen": 7186176,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"epoch": 0.6356076035302105,
|
|
"grad_norm": 0.981127917766571,
|
|
"learning_rate": 7.059525962460248e-07,
|
|
"loss": 0.0424,
|
|
"num_input_tokens_seen": 7190784,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"epoch": 0.6360319076714189,
|
|
"grad_norm": 9.044798851013184,
|
|
"learning_rate": 7.045372165985919e-07,
|
|
"loss": 0.0896,
|
|
"num_input_tokens_seen": 7195840,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"epoch": 0.6364562118126272,
|
|
"grad_norm": 15.579463958740234,
|
|
"learning_rate": 7.031224851691999e-07,
|
|
"loss": 0.0447,
|
|
"num_input_tokens_seen": 7201024,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 0.6368805159538357,
|
|
"grad_norm": 0.566196620464325,
|
|
"learning_rate": 7.017084050616385e-07,
|
|
"loss": 0.0698,
|
|
"num_input_tokens_seen": 7205760,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"epoch": 0.6373048200950441,
|
|
"grad_norm": 7.486564636230469,
|
|
"learning_rate": 7.002949793782686e-07,
|
|
"loss": 0.0505,
|
|
"num_input_tokens_seen": 7210560,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"epoch": 0.6377291242362525,
|
|
"grad_norm": 15.557743072509766,
|
|
"learning_rate": 6.988822112200156e-07,
|
|
"loss": 0.0558,
|
|
"num_input_tokens_seen": 7215488,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"epoch": 0.6381534283774609,
|
|
"grad_norm": 9.049367904663086,
|
|
"learning_rate": 6.974701036863626e-07,
|
|
"loss": 0.0672,
|
|
"num_input_tokens_seen": 7220608,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 0.6385777325186693,
|
|
"grad_norm": 0.36959829926490784,
|
|
"learning_rate": 6.960586598753426e-07,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 7225280,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"epoch": 0.6390020366598778,
|
|
"grad_norm": 0.25574615597724915,
|
|
"learning_rate": 6.946478828835331e-07,
|
|
"loss": 0.0435,
|
|
"num_input_tokens_seen": 7229696,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"epoch": 0.6394263408010862,
|
|
"grad_norm": 10.568726539611816,
|
|
"learning_rate": 6.932377758060481e-07,
|
|
"loss": 0.0416,
|
|
"num_input_tokens_seen": 7233984,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"epoch": 0.6398506449422946,
|
|
"grad_norm": 15.565662384033203,
|
|
"learning_rate": 6.91828341736533e-07,
|
|
"loss": 0.0493,
|
|
"num_input_tokens_seen": 7239936,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 0.640274949083503,
|
|
"grad_norm": 27.50994300842285,
|
|
"learning_rate": 6.904195837671552e-07,
|
|
"loss": 0.0335,
|
|
"num_input_tokens_seen": 7244480,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"epoch": 0.6406992532247114,
|
|
"grad_norm": 0.2827405035495758,
|
|
"learning_rate": 6.890115049885994e-07,
|
|
"loss": 0.0681,
|
|
"num_input_tokens_seen": 7248960,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"epoch": 0.6411235573659199,
|
|
"grad_norm": 27.523212432861328,
|
|
"learning_rate": 6.87604108490061e-07,
|
|
"loss": 0.0409,
|
|
"num_input_tokens_seen": 7253888,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"epoch": 0.6415478615071283,
|
|
"grad_norm": 0.722808837890625,
|
|
"learning_rate": 6.861973973592372e-07,
|
|
"loss": 0.0364,
|
|
"num_input_tokens_seen": 7259200,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"epoch": 0.6419721656483367,
|
|
"grad_norm": 0.7283973097801208,
|
|
"learning_rate": 6.847913746823227e-07,
|
|
"loss": 0.0163,
|
|
"num_input_tokens_seen": 7263808,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"epoch": 0.6423964697895451,
|
|
"grad_norm": 12.485780715942383,
|
|
"learning_rate": 6.833860435440006e-07,
|
|
"loss": 0.0146,
|
|
"num_input_tokens_seen": 7269248,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"epoch": 0.6428207739307535,
|
|
"grad_norm": 32.6278076171875,
|
|
"learning_rate": 6.819814070274384e-07,
|
|
"loss": 0.0184,
|
|
"num_input_tokens_seen": 7274496,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"epoch": 0.643245078071962,
|
|
"grad_norm": 6.817934513092041,
|
|
"learning_rate": 6.805774682142782e-07,
|
|
"loss": 0.1752,
|
|
"num_input_tokens_seen": 7279552,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"epoch": 0.6436693822131704,
|
|
"grad_norm": 10.113386154174805,
|
|
"learning_rate": 6.791742301846325e-07,
|
|
"loss": 0.0834,
|
|
"num_input_tokens_seen": 7284096,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"epoch": 0.6440936863543788,
|
|
"grad_norm": 0.13432703912258148,
|
|
"learning_rate": 6.777716960170752e-07,
|
|
"loss": 0.0504,
|
|
"num_input_tokens_seen": 7289088,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"epoch": 0.6445179904955872,
|
|
"grad_norm": 6.147345066070557,
|
|
"learning_rate": 6.763698687886372e-07,
|
|
"loss": 0.1006,
|
|
"num_input_tokens_seen": 7293696,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"epoch": 0.6449422946367956,
|
|
"grad_norm": 0.24970202147960663,
|
|
"learning_rate": 6.749687515747977e-07,
|
|
"loss": 0.0467,
|
|
"num_input_tokens_seen": 7298816,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 0.6453665987780041,
|
|
"grad_norm": 6.173451900482178,
|
|
"learning_rate": 6.735683474494784e-07,
|
|
"loss": 0.0689,
|
|
"num_input_tokens_seen": 7303232,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"epoch": 0.6457909029192125,
|
|
"grad_norm": 18.60088348388672,
|
|
"learning_rate": 6.721686594850362e-07,
|
|
"loss": 0.0431,
|
|
"num_input_tokens_seen": 7308416,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"epoch": 0.6462152070604209,
|
|
"grad_norm": 0.18760794401168823,
|
|
"learning_rate": 6.707696907522577e-07,
|
|
"loss": 0.0699,
|
|
"num_input_tokens_seen": 7313024,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"epoch": 0.6466395112016293,
|
|
"grad_norm": 9.816882133483887,
|
|
"learning_rate": 6.693714443203507e-07,
|
|
"loss": 0.0856,
|
|
"num_input_tokens_seen": 7317760,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"epoch": 0.6470638153428377,
|
|
"grad_norm": 28.292943954467773,
|
|
"learning_rate": 6.679739232569388e-07,
|
|
"loss": 0.0443,
|
|
"num_input_tokens_seen": 7322624,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"epoch": 0.6474881194840462,
|
|
"grad_norm": 31.246423721313477,
|
|
"learning_rate": 6.665771306280537e-07,
|
|
"loss": 0.0577,
|
|
"num_input_tokens_seen": 7327104,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"epoch": 0.6479124236252546,
|
|
"grad_norm": 8.723485946655273,
|
|
"learning_rate": 6.651810694981299e-07,
|
|
"loss": 0.0678,
|
|
"num_input_tokens_seen": 7331520,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"epoch": 0.648336727766463,
|
|
"grad_norm": 12.17953872680664,
|
|
"learning_rate": 6.637857429299958e-07,
|
|
"loss": 0.0671,
|
|
"num_input_tokens_seen": 7336448,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"epoch": 0.6487610319076714,
|
|
"grad_norm": 0.20854049921035767,
|
|
"learning_rate": 6.623911539848697e-07,
|
|
"loss": 0.0677,
|
|
"num_input_tokens_seen": 7341248,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"epoch": 0.6491853360488798,
|
|
"grad_norm": 12.072310447692871,
|
|
"learning_rate": 6.6099730572235e-07,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 7345920,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"epoch": 0.6496096401900883,
|
|
"grad_norm": 0.5409800410270691,
|
|
"learning_rate": 6.596042012004119e-07,
|
|
"loss": 0.0198,
|
|
"num_input_tokens_seen": 7350464,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"epoch": 0.6500339443312967,
|
|
"grad_norm": 19.375585556030273,
|
|
"learning_rate": 6.582118434753973e-07,
|
|
"loss": 0.0948,
|
|
"num_input_tokens_seen": 7355008,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"epoch": 0.6504582484725051,
|
|
"grad_norm": 20.44685935974121,
|
|
"learning_rate": 6.568202356020108e-07,
|
|
"loss": 0.0469,
|
|
"num_input_tokens_seen": 7359680,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"epoch": 0.6508825526137135,
|
|
"grad_norm": 3.251227855682373,
|
|
"learning_rate": 6.554293806333109e-07,
|
|
"loss": 0.0449,
|
|
"num_input_tokens_seen": 7363968,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"epoch": 0.6508825526137135,
|
|
"eval_loss": 0.056983742862939835,
|
|
"eval_runtime": 16.7468,
|
|
"eval_samples_per_second": 625.491,
|
|
"eval_steps_per_second": 78.224,
|
|
"num_input_tokens_seen": 7363968,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"epoch": 0.651306856754922,
|
|
"grad_norm": 9.164955139160156,
|
|
"learning_rate": 6.540392816207054e-07,
|
|
"loss": 0.1121,
|
|
"num_input_tokens_seen": 7368064,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"epoch": 0.6517311608961304,
|
|
"grad_norm": 8.18824577331543,
|
|
"learning_rate": 6.52649941613943e-07,
|
|
"loss": 0.0478,
|
|
"num_input_tokens_seen": 7372736,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"epoch": 0.6521554650373388,
|
|
"grad_norm": 1.0434948205947876,
|
|
"learning_rate": 6.512613636611068e-07,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 7377600,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"epoch": 0.6525797691785472,
|
|
"grad_norm": 19.809101104736328,
|
|
"learning_rate": 6.498735508086093e-07,
|
|
"loss": 0.0357,
|
|
"num_input_tokens_seen": 7381952,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"epoch": 0.6530040733197556,
|
|
"grad_norm": 3.345973014831543,
|
|
"learning_rate": 6.484865061011829e-07,
|
|
"loss": 0.0372,
|
|
"num_input_tokens_seen": 7387264,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"epoch": 0.653428377460964,
|
|
"grad_norm": 0.6435390114784241,
|
|
"learning_rate": 6.471002325818761e-07,
|
|
"loss": 0.0668,
|
|
"num_input_tokens_seen": 7391936,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 0.6538526816021725,
|
|
"grad_norm": 1.903062105178833,
|
|
"learning_rate": 6.45714733292044e-07,
|
|
"loss": 0.0665,
|
|
"num_input_tokens_seen": 7397248,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"epoch": 0.6542769857433809,
|
|
"grad_norm": 15.047941207885742,
|
|
"learning_rate": 6.443300112713452e-07,
|
|
"loss": 0.0696,
|
|
"num_input_tokens_seen": 7401920,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"epoch": 0.6547012898845893,
|
|
"grad_norm": 18.8006534576416,
|
|
"learning_rate": 6.429460695577309e-07,
|
|
"loss": 0.0674,
|
|
"num_input_tokens_seen": 7406912,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"epoch": 0.6551255940257977,
|
|
"grad_norm": 8.167941093444824,
|
|
"learning_rate": 6.415629111874418e-07,
|
|
"loss": 0.0706,
|
|
"num_input_tokens_seen": 7411776,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"epoch": 0.6555498981670062,
|
|
"grad_norm": 1.3768784999847412,
|
|
"learning_rate": 6.401805391949989e-07,
|
|
"loss": 0.0654,
|
|
"num_input_tokens_seen": 7416128,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"epoch": 0.6559742023082146,
|
|
"grad_norm": 0.10121601074934006,
|
|
"learning_rate": 6.387989566131996e-07,
|
|
"loss": 0.0299,
|
|
"num_input_tokens_seen": 7422208,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"epoch": 0.656398506449423,
|
|
"grad_norm": 8.636445045471191,
|
|
"learning_rate": 6.374181664731076e-07,
|
|
"loss": 0.0584,
|
|
"num_input_tokens_seen": 7427008,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"epoch": 0.6568228105906314,
|
|
"grad_norm": 4.268195629119873,
|
|
"learning_rate": 6.360381718040493e-07,
|
|
"loss": 0.0515,
|
|
"num_input_tokens_seen": 7431488,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"epoch": 0.6572471147318398,
|
|
"grad_norm": 2.7408447265625,
|
|
"learning_rate": 6.34658975633605e-07,
|
|
"loss": 0.0585,
|
|
"num_input_tokens_seen": 7436544,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"epoch": 0.6576714188730483,
|
|
"grad_norm": 0.8293548822402954,
|
|
"learning_rate": 6.332805809876041e-07,
|
|
"loss": 0.0633,
|
|
"num_input_tokens_seen": 7440896,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"epoch": 0.6580957230142567,
|
|
"grad_norm": 30.720596313476562,
|
|
"learning_rate": 6.319029908901168e-07,
|
|
"loss": 0.1136,
|
|
"num_input_tokens_seen": 7445824,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"epoch": 0.658520027155465,
|
|
"grad_norm": 25.254791259765625,
|
|
"learning_rate": 6.305262083634487e-07,
|
|
"loss": 0.0956,
|
|
"num_input_tokens_seen": 7450368,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"epoch": 0.6589443312966734,
|
|
"grad_norm": 22.178647994995117,
|
|
"learning_rate": 6.29150236428133e-07,
|
|
"loss": 0.0769,
|
|
"num_input_tokens_seen": 7454720,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"epoch": 0.6593686354378818,
|
|
"grad_norm": 11.562461853027344,
|
|
"learning_rate": 6.277750781029254e-07,
|
|
"loss": 0.088,
|
|
"num_input_tokens_seen": 7459264,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"epoch": 0.6597929395790902,
|
|
"grad_norm": 2.147352457046509,
|
|
"learning_rate": 6.26400736404796e-07,
|
|
"loss": 0.0372,
|
|
"num_input_tokens_seen": 7463744,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"epoch": 0.6602172437202987,
|
|
"grad_norm": 7.912290096282959,
|
|
"learning_rate": 6.250272143489236e-07,
|
|
"loss": 0.0222,
|
|
"num_input_tokens_seen": 7468800,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"epoch": 0.6606415478615071,
|
|
"grad_norm": 0.38751131296157837,
|
|
"learning_rate": 6.23654514948688e-07,
|
|
"loss": 0.073,
|
|
"num_input_tokens_seen": 7473856,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"epoch": 0.6610658520027155,
|
|
"grad_norm": 15.395218849182129,
|
|
"learning_rate": 6.222826412156659e-07,
|
|
"loss": 0.0614,
|
|
"num_input_tokens_seen": 7478144,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"epoch": 0.6614901561439239,
|
|
"grad_norm": 15.553271293640137,
|
|
"learning_rate": 6.209115961596207e-07,
|
|
"loss": 0.0651,
|
|
"num_input_tokens_seen": 7482432,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"epoch": 0.6619144602851323,
|
|
"grad_norm": 7.672582149505615,
|
|
"learning_rate": 6.195413827884986e-07,
|
|
"loss": 0.0825,
|
|
"num_input_tokens_seen": 7487488,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 0.6623387644263408,
|
|
"grad_norm": 7.964916706085205,
|
|
"learning_rate": 6.181720041084216e-07,
|
|
"loss": 0.0766,
|
|
"num_input_tokens_seen": 7492032,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"epoch": 0.6627630685675492,
|
|
"grad_norm": 18.966585159301758,
|
|
"learning_rate": 6.168034631236794e-07,
|
|
"loss": 0.0316,
|
|
"num_input_tokens_seen": 7496576,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"epoch": 0.6631873727087576,
|
|
"grad_norm": 0.4047878384590149,
|
|
"learning_rate": 6.154357628367251e-07,
|
|
"loss": 0.0282,
|
|
"num_input_tokens_seen": 7501056,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"epoch": 0.663611676849966,
|
|
"grad_norm": 17.005958557128906,
|
|
"learning_rate": 6.140689062481657e-07,
|
|
"loss": 0.0768,
|
|
"num_input_tokens_seen": 7505792,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"epoch": 0.6640359809911744,
|
|
"grad_norm": 13.057510375976562,
|
|
"learning_rate": 6.127028963567593e-07,
|
|
"loss": 0.0388,
|
|
"num_input_tokens_seen": 7510528,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"epoch": 0.6644602851323829,
|
|
"grad_norm": 15.667932510375977,
|
|
"learning_rate": 6.113377361594048e-07,
|
|
"loss": 0.0698,
|
|
"num_input_tokens_seen": 7515200,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"epoch": 0.6648845892735913,
|
|
"grad_norm": 10.725001335144043,
|
|
"learning_rate": 6.099734286511378e-07,
|
|
"loss": 0.0265,
|
|
"num_input_tokens_seen": 7520128,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"epoch": 0.6653088934147997,
|
|
"grad_norm": 6.3311991691589355,
|
|
"learning_rate": 6.086099768251222e-07,
|
|
"loss": 0.0384,
|
|
"num_input_tokens_seen": 7524544,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"epoch": 0.6657331975560081,
|
|
"grad_norm": 24.02566146850586,
|
|
"learning_rate": 6.072473836726461e-07,
|
|
"loss": 0.1199,
|
|
"num_input_tokens_seen": 7529664,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"epoch": 0.6661575016972165,
|
|
"grad_norm": 7.502722263336182,
|
|
"learning_rate": 6.058856521831126e-07,
|
|
"loss": 0.0495,
|
|
"num_input_tokens_seen": 7533760,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"epoch": 0.666581805838425,
|
|
"grad_norm": 0.14090940356254578,
|
|
"learning_rate": 6.045247853440349e-07,
|
|
"loss": 0.0281,
|
|
"num_input_tokens_seen": 7538432,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"epoch": 0.6670061099796334,
|
|
"grad_norm": 9.162590026855469,
|
|
"learning_rate": 6.031647861410287e-07,
|
|
"loss": 0.0557,
|
|
"num_input_tokens_seen": 7543168,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"epoch": 0.6674304141208418,
|
|
"grad_norm": 28.743383407592773,
|
|
"learning_rate": 6.018056575578074e-07,
|
|
"loss": 0.0596,
|
|
"num_input_tokens_seen": 7547840,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"epoch": 0.6678547182620502,
|
|
"grad_norm": 19.00650405883789,
|
|
"learning_rate": 6.004474025761723e-07,
|
|
"loss": 0.0152,
|
|
"num_input_tokens_seen": 7552768,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"epoch": 0.6682790224032586,
|
|
"grad_norm": 19.516019821166992,
|
|
"learning_rate": 5.990900241760102e-07,
|
|
"loss": 0.1233,
|
|
"num_input_tokens_seen": 7557312,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"epoch": 0.6687033265444671,
|
|
"grad_norm": 14.39048957824707,
|
|
"learning_rate": 5.977335253352833e-07,
|
|
"loss": 0.0976,
|
|
"num_input_tokens_seen": 7561920,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"epoch": 0.6691276306856755,
|
|
"grad_norm": 31.64281463623047,
|
|
"learning_rate": 5.963779090300254e-07,
|
|
"loss": 0.0702,
|
|
"num_input_tokens_seen": 7567104,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"epoch": 0.6695519348268839,
|
|
"grad_norm": 6.729279518127441,
|
|
"learning_rate": 5.950231782343326e-07,
|
|
"loss": 0.0453,
|
|
"num_input_tokens_seen": 7571584,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"epoch": 0.6699762389680923,
|
|
"grad_norm": 12.365065574645996,
|
|
"learning_rate": 5.936693359203597e-07,
|
|
"loss": 0.0216,
|
|
"num_input_tokens_seen": 7576256,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"epoch": 0.6704005431093008,
|
|
"grad_norm": 8.738225936889648,
|
|
"learning_rate": 5.923163850583113e-07,
|
|
"loss": 0.0625,
|
|
"num_input_tokens_seen": 7581248,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 0.6708248472505092,
|
|
"grad_norm": 14.904520034790039,
|
|
"learning_rate": 5.909643286164367e-07,
|
|
"loss": 0.044,
|
|
"num_input_tokens_seen": 7585984,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"epoch": 0.6712491513917176,
|
|
"grad_norm": 10.954987525939941,
|
|
"learning_rate": 5.896131695610223e-07,
|
|
"loss": 0.0904,
|
|
"num_input_tokens_seen": 7590528,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"epoch": 0.671673455532926,
|
|
"grad_norm": 13.684930801391602,
|
|
"learning_rate": 5.88262910856387e-07,
|
|
"loss": 0.0285,
|
|
"num_input_tokens_seen": 7595328,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"epoch": 0.6720977596741344,
|
|
"grad_norm": 6.646015167236328,
|
|
"learning_rate": 5.869135554648728e-07,
|
|
"loss": 0.0699,
|
|
"num_input_tokens_seen": 7600128,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"epoch": 0.6725220638153429,
|
|
"grad_norm": 0.9344552159309387,
|
|
"learning_rate": 5.855651063468411e-07,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 7604288,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"epoch": 0.6729463679565513,
|
|
"grad_norm": 19.725372314453125,
|
|
"learning_rate": 5.84217566460665e-07,
|
|
"loss": 0.0355,
|
|
"num_input_tokens_seen": 7609280,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"epoch": 0.6733706720977597,
|
|
"grad_norm": 14.636306762695312,
|
|
"learning_rate": 5.828709387627217e-07,
|
|
"loss": 0.04,
|
|
"num_input_tokens_seen": 7614528,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"epoch": 0.6737949762389681,
|
|
"grad_norm": 13.225370407104492,
|
|
"learning_rate": 5.815252262073891e-07,
|
|
"loss": 0.1098,
|
|
"num_input_tokens_seen": 7619264,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"epoch": 0.6742192803801765,
|
|
"grad_norm": 0.19334328174591064,
|
|
"learning_rate": 5.801804317470349e-07,
|
|
"loss": 0.0032,
|
|
"num_input_tokens_seen": 7624192,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"epoch": 0.674643584521385,
|
|
"grad_norm": 10.048741340637207,
|
|
"learning_rate": 5.788365583320144e-07,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 7629120,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"epoch": 0.6750678886625934,
|
|
"grad_norm": 14.358065605163574,
|
|
"learning_rate": 5.774936089106617e-07,
|
|
"loss": 0.0486,
|
|
"num_input_tokens_seen": 7633984,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"epoch": 0.6754921928038018,
|
|
"grad_norm": 0.5867604613304138,
|
|
"learning_rate": 5.761515864292835e-07,
|
|
"loss": 0.0189,
|
|
"num_input_tokens_seen": 7638976,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"epoch": 0.6759164969450102,
|
|
"grad_norm": 0.13423283398151398,
|
|
"learning_rate": 5.748104938321534e-07,
|
|
"loss": 0.001,
|
|
"num_input_tokens_seen": 7643520,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"epoch": 0.6763408010862186,
|
|
"grad_norm": 0.5499201416969299,
|
|
"learning_rate": 5.734703340615049e-07,
|
|
"loss": 0.0669,
|
|
"num_input_tokens_seen": 7648576,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"epoch": 0.676765105227427,
|
|
"grad_norm": 0.3033323585987091,
|
|
"learning_rate": 5.721311100575235e-07,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 7653632,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"epoch": 0.6771894093686355,
|
|
"grad_norm": 0.13103389739990234,
|
|
"learning_rate": 5.707928247583444e-07,
|
|
"loss": 0.0035,
|
|
"num_input_tokens_seen": 7658176,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"epoch": 0.6776137135098439,
|
|
"grad_norm": 15.915654182434082,
|
|
"learning_rate": 5.694554811000407e-07,
|
|
"loss": 0.1087,
|
|
"num_input_tokens_seen": 7663040,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"epoch": 0.6780380176510523,
|
|
"grad_norm": 18.92255973815918,
|
|
"learning_rate": 5.681190820166213e-07,
|
|
"loss": 0.0706,
|
|
"num_input_tokens_seen": 7668160,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"epoch": 0.6784623217922607,
|
|
"grad_norm": 20.302759170532227,
|
|
"learning_rate": 5.667836304400221e-07,
|
|
"loss": 0.036,
|
|
"num_input_tokens_seen": 7673024,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"epoch": 0.6788866259334692,
|
|
"grad_norm": 0.7758719325065613,
|
|
"learning_rate": 5.654491293001005e-07,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 7677696,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 0.6793109300746776,
|
|
"grad_norm": 16.18206214904785,
|
|
"learning_rate": 5.641155815246289e-07,
|
|
"loss": 0.0334,
|
|
"num_input_tokens_seen": 7682752,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"epoch": 0.679735234215886,
|
|
"grad_norm": 0.8303351402282715,
|
|
"learning_rate": 5.62782990039288e-07,
|
|
"loss": 0.0892,
|
|
"num_input_tokens_seen": 7687872,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"epoch": 0.6801595383570944,
|
|
"grad_norm": 10.10102653503418,
|
|
"learning_rate": 5.614513577676592e-07,
|
|
"loss": 0.0831,
|
|
"num_input_tokens_seen": 7692800,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"epoch": 0.6805838424983028,
|
|
"grad_norm": 6.513079643249512,
|
|
"learning_rate": 5.601206876312223e-07,
|
|
"loss": 0.0417,
|
|
"num_input_tokens_seen": 7697472,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"epoch": 0.6810081466395111,
|
|
"grad_norm": 0.1420639455318451,
|
|
"learning_rate": 5.587909825493433e-07,
|
|
"loss": 0.0675,
|
|
"num_input_tokens_seen": 7702336,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"epoch": 0.6814324507807196,
|
|
"grad_norm": 29.473529815673828,
|
|
"learning_rate": 5.57462245439273e-07,
|
|
"loss": 0.0766,
|
|
"num_input_tokens_seen": 7707136,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"epoch": 0.681856754921928,
|
|
"grad_norm": 20.80841064453125,
|
|
"learning_rate": 5.561344792161373e-07,
|
|
"loss": 0.1067,
|
|
"num_input_tokens_seen": 7711808,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"epoch": 0.6822810590631364,
|
|
"grad_norm": 1.2022435665130615,
|
|
"learning_rate": 5.54807686792933e-07,
|
|
"loss": 0.0566,
|
|
"num_input_tokens_seen": 7716352,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"epoch": 0.6827053632043448,
|
|
"grad_norm": 45.30147171020508,
|
|
"learning_rate": 5.534818710805198e-07,
|
|
"loss": 0.115,
|
|
"num_input_tokens_seen": 7721024,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"epoch": 0.6831296673455532,
|
|
"grad_norm": 6.771976470947266,
|
|
"learning_rate": 5.52157034987615e-07,
|
|
"loss": 0.0836,
|
|
"num_input_tokens_seen": 7726208,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"epoch": 0.6835539714867617,
|
|
"grad_norm": 0.2241697609424591,
|
|
"learning_rate": 5.508331814207864e-07,
|
|
"loss": 0.0869,
|
|
"num_input_tokens_seen": 7731136,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"epoch": 0.6839782756279701,
|
|
"grad_norm": 9.992411613464355,
|
|
"learning_rate": 5.495103132844466e-07,
|
|
"loss": 0.1127,
|
|
"num_input_tokens_seen": 7735552,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"epoch": 0.6844025797691785,
|
|
"grad_norm": 0.10987231880426407,
|
|
"learning_rate": 5.481884334808463e-07,
|
|
"loss": 0.0752,
|
|
"num_input_tokens_seen": 7740032,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"epoch": 0.6848268839103869,
|
|
"grad_norm": 29.561595916748047,
|
|
"learning_rate": 5.468675449100664e-07,
|
|
"loss": 0.0359,
|
|
"num_input_tokens_seen": 7744960,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"epoch": 0.6852511880515954,
|
|
"grad_norm": 7.352428913116455,
|
|
"learning_rate": 5.455476504700161e-07,
|
|
"loss": 0.1355,
|
|
"num_input_tokens_seen": 7749632,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"epoch": 0.6856754921928038,
|
|
"grad_norm": 8.440690040588379,
|
|
"learning_rate": 5.442287530564203e-07,
|
|
"loss": 0.078,
|
|
"num_input_tokens_seen": 7754176,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"epoch": 0.6860997963340122,
|
|
"grad_norm": 17.962841033935547,
|
|
"learning_rate": 5.429108555628186e-07,
|
|
"loss": 0.1161,
|
|
"num_input_tokens_seen": 7759360,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"epoch": 0.6865241004752206,
|
|
"grad_norm": 11.928627014160156,
|
|
"learning_rate": 5.415939608805564e-07,
|
|
"loss": 0.0741,
|
|
"num_input_tokens_seen": 7763904,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"epoch": 0.686948404616429,
|
|
"grad_norm": 0.7417271137237549,
|
|
"learning_rate": 5.402780718987789e-07,
|
|
"loss": 0.0152,
|
|
"num_input_tokens_seen": 7768320,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"epoch": 0.6873727087576375,
|
|
"grad_norm": 22.295618057250977,
|
|
"learning_rate": 5.389631915044249e-07,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 7772928,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 0.6877970128988459,
|
|
"grad_norm": 0.626780092716217,
|
|
"learning_rate": 5.376493225822208e-07,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 7777280,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"epoch": 0.6882213170400543,
|
|
"grad_norm": 3.957951545715332,
|
|
"learning_rate": 5.363364680146725e-07,
|
|
"loss": 0.0534,
|
|
"num_input_tokens_seen": 7782080,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"epoch": 0.6886456211812627,
|
|
"grad_norm": 27.052066802978516,
|
|
"learning_rate": 5.350246306820632e-07,
|
|
"loss": 0.0668,
|
|
"num_input_tokens_seen": 7786880,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"epoch": 0.6890699253224711,
|
|
"grad_norm": 3.0049777030944824,
|
|
"learning_rate": 5.337138134624412e-07,
|
|
"loss": 0.0265,
|
|
"num_input_tokens_seen": 7791232,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"epoch": 0.6894942294636796,
|
|
"grad_norm": 0.7207580208778381,
|
|
"learning_rate": 5.324040192316193e-07,
|
|
"loss": 0.0115,
|
|
"num_input_tokens_seen": 7796160,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"epoch": 0.689918533604888,
|
|
"grad_norm": 1.550174355506897,
|
|
"learning_rate": 5.310952508631645e-07,
|
|
"loss": 0.0314,
|
|
"num_input_tokens_seen": 7801024,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"epoch": 0.6903428377460964,
|
|
"grad_norm": 10.424066543579102,
|
|
"learning_rate": 5.297875112283941e-07,
|
|
"loss": 0.067,
|
|
"num_input_tokens_seen": 7805952,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"epoch": 0.6907671418873048,
|
|
"grad_norm": 22.461483001708984,
|
|
"learning_rate": 5.284808031963676e-07,
|
|
"loss": 0.0658,
|
|
"num_input_tokens_seen": 7811008,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"epoch": 0.6911914460285132,
|
|
"grad_norm": 0.32975611090660095,
|
|
"learning_rate": 5.271751296338822e-07,
|
|
"loss": 0.0104,
|
|
"num_input_tokens_seen": 7815296,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"epoch": 0.6916157501697217,
|
|
"grad_norm": 2.9173388481140137,
|
|
"learning_rate": 5.25870493405464e-07,
|
|
"loss": 0.0286,
|
|
"num_input_tokens_seen": 7820864,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"epoch": 0.6920400543109301,
|
|
"grad_norm": 27.80228042602539,
|
|
"learning_rate": 5.245668973733657e-07,
|
|
"loss": 0.0598,
|
|
"num_input_tokens_seen": 7825856,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"epoch": 0.6924643584521385,
|
|
"grad_norm": 11.682685852050781,
|
|
"learning_rate": 5.232643443975553e-07,
|
|
"loss": 0.0352,
|
|
"num_input_tokens_seen": 7830400,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"epoch": 0.6928886625933469,
|
|
"grad_norm": 0.9110048413276672,
|
|
"learning_rate": 5.219628373357142e-07,
|
|
"loss": 0.006,
|
|
"num_input_tokens_seen": 7835776,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"epoch": 0.6933129667345553,
|
|
"grad_norm": 0.14536383748054504,
|
|
"learning_rate": 5.206623790432285e-07,
|
|
"loss": 0.0763,
|
|
"num_input_tokens_seen": 7840640,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"epoch": 0.6937372708757638,
|
|
"grad_norm": 35.49247741699219,
|
|
"learning_rate": 5.193629723731837e-07,
|
|
"loss": 0.077,
|
|
"num_input_tokens_seen": 7845120,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"epoch": 0.6941615750169722,
|
|
"grad_norm": 12.557273864746094,
|
|
"learning_rate": 5.180646201763577e-07,
|
|
"loss": 0.0517,
|
|
"num_input_tokens_seen": 7849664,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"epoch": 0.6945858791581806,
|
|
"grad_norm": 15.135461807250977,
|
|
"learning_rate": 5.167673253012152e-07,
|
|
"loss": 0.0192,
|
|
"num_input_tokens_seen": 7854080,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"epoch": 0.695010183299389,
|
|
"grad_norm": 11.858826637268066,
|
|
"learning_rate": 5.154710905939015e-07,
|
|
"loss": 0.0317,
|
|
"num_input_tokens_seen": 7858176,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"epoch": 0.6954344874405974,
|
|
"grad_norm": 13.808652877807617,
|
|
"learning_rate": 5.141759188982356e-07,
|
|
"loss": 0.0603,
|
|
"num_input_tokens_seen": 7862912,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"epoch": 0.6958587915818059,
|
|
"grad_norm": 29.41330909729004,
|
|
"learning_rate": 5.12881813055705e-07,
|
|
"loss": 0.0554,
|
|
"num_input_tokens_seen": 7867648,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 0.6962830957230143,
|
|
"grad_norm": 1.930039644241333,
|
|
"learning_rate": 5.115887759054571e-07,
|
|
"loss": 0.1073,
|
|
"num_input_tokens_seen": 7873152,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"epoch": 0.6967073998642227,
|
|
"grad_norm": 25.302221298217773,
|
|
"learning_rate": 5.102968102842973e-07,
|
|
"loss": 0.0761,
|
|
"num_input_tokens_seen": 7877824,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"epoch": 0.6971317040054311,
|
|
"grad_norm": 10.491656303405762,
|
|
"learning_rate": 5.090059190266779e-07,
|
|
"loss": 0.1041,
|
|
"num_input_tokens_seen": 7882688,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"epoch": 0.6975560081466395,
|
|
"grad_norm": 1.2530505657196045,
|
|
"learning_rate": 5.077161049646951e-07,
|
|
"loss": 0.0246,
|
|
"num_input_tokens_seen": 7887424,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"epoch": 0.697980312287848,
|
|
"grad_norm": 0.47119423747062683,
|
|
"learning_rate": 5.06427370928082e-07,
|
|
"loss": 0.026,
|
|
"num_input_tokens_seen": 7891584,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"epoch": 0.6984046164290564,
|
|
"grad_norm": 31.259077072143555,
|
|
"learning_rate": 5.05139719744202e-07,
|
|
"loss": 0.122,
|
|
"num_input_tokens_seen": 7896576,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"epoch": 0.6988289205702648,
|
|
"grad_norm": 0.2719421684741974,
|
|
"learning_rate": 5.038531542380425e-07,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 7901376,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"epoch": 0.6992532247114732,
|
|
"grad_norm": 0.593479335308075,
|
|
"learning_rate": 5.025676772322099e-07,
|
|
"loss": 0.0309,
|
|
"num_input_tokens_seen": 7906368,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"epoch": 0.6996775288526816,
|
|
"grad_norm": 34.5064811706543,
|
|
"learning_rate": 5.012832915469207e-07,
|
|
"loss": 0.0767,
|
|
"num_input_tokens_seen": 7911232,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"epoch": 0.7001018329938901,
|
|
"grad_norm": 14.479310035705566,
|
|
"learning_rate": 5.000000000000002e-07,
|
|
"loss": 0.0687,
|
|
"num_input_tokens_seen": 7915776,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"epoch": 0.7005261371350985,
|
|
"grad_norm": 32.0446662902832,
|
|
"learning_rate": 4.987178054068699e-07,
|
|
"loss": 0.0235,
|
|
"num_input_tokens_seen": 7920192,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"epoch": 0.7009504412763069,
|
|
"grad_norm": 2.59600830078125,
|
|
"learning_rate": 4.97436710580547e-07,
|
|
"loss": 0.0589,
|
|
"num_input_tokens_seen": 7924800,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"epoch": 0.7009504412763069,
|
|
"eval_loss": 0.05402500182390213,
|
|
"eval_runtime": 16.7834,
|
|
"eval_samples_per_second": 624.128,
|
|
"eval_steps_per_second": 78.053,
|
|
"num_input_tokens_seen": 7924800,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"epoch": 0.7013747454175153,
|
|
"grad_norm": 0.33619779348373413,
|
|
"learning_rate": 4.961567183316348e-07,
|
|
"loss": 0.0328,
|
|
"num_input_tokens_seen": 7929664,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"epoch": 0.7017990495587237,
|
|
"grad_norm": 14.767923355102539,
|
|
"learning_rate": 4.948778314683183e-07,
|
|
"loss": 0.1087,
|
|
"num_input_tokens_seen": 7934464,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"epoch": 0.7022233536999322,
|
|
"grad_norm": 25.432844161987305,
|
|
"learning_rate": 4.93600052796357e-07,
|
|
"loss": 0.067,
|
|
"num_input_tokens_seen": 7939264,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"epoch": 0.7026476578411406,
|
|
"grad_norm": 19.076519012451172,
|
|
"learning_rate": 4.923233851190794e-07,
|
|
"loss": 0.0767,
|
|
"num_input_tokens_seen": 7943552,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"epoch": 0.703071961982349,
|
|
"grad_norm": 4.825868129730225,
|
|
"learning_rate": 4.910478312373756e-07,
|
|
"loss": 0.0304,
|
|
"num_input_tokens_seen": 7948032,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"epoch": 0.7034962661235573,
|
|
"grad_norm": 0.5740851163864136,
|
|
"learning_rate": 4.897733939496942e-07,
|
|
"loss": 0.0578,
|
|
"num_input_tokens_seen": 7953408,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"epoch": 0.7039205702647657,
|
|
"grad_norm": 0.20119158923625946,
|
|
"learning_rate": 4.885000760520317e-07,
|
|
"loss": 0.0615,
|
|
"num_input_tokens_seen": 7957760,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"epoch": 0.7043448744059742,
|
|
"grad_norm": 16.458707809448242,
|
|
"learning_rate": 4.872278803379299e-07,
|
|
"loss": 0.0255,
|
|
"num_input_tokens_seen": 7963712,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 0.7047691785471826,
|
|
"grad_norm": 1.7117456197738647,
|
|
"learning_rate": 4.8595680959847e-07,
|
|
"loss": 0.0582,
|
|
"num_input_tokens_seen": 7968576,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"epoch": 0.705193482688391,
|
|
"grad_norm": 8.526166915893555,
|
|
"learning_rate": 4.846868666222622e-07,
|
|
"loss": 0.0872,
|
|
"num_input_tokens_seen": 7973184,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"epoch": 0.7056177868295994,
|
|
"grad_norm": 0.6074093580245972,
|
|
"learning_rate": 4.834180541954447e-07,
|
|
"loss": 0.0425,
|
|
"num_input_tokens_seen": 7977792,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"epoch": 0.7060420909708078,
|
|
"grad_norm": 0.9059328436851501,
|
|
"learning_rate": 4.821503751016745e-07,
|
|
"loss": 0.025,
|
|
"num_input_tokens_seen": 7981824,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"epoch": 0.7064663951120163,
|
|
"grad_norm": 4.6199116706848145,
|
|
"learning_rate": 4.808838321221226e-07,
|
|
"loss": 0.05,
|
|
"num_input_tokens_seen": 7986112,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"epoch": 0.7068906992532247,
|
|
"grad_norm": 10.900896072387695,
|
|
"learning_rate": 4.79618428035467e-07,
|
|
"loss": 0.1184,
|
|
"num_input_tokens_seen": 7990592,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"epoch": 0.7073150033944331,
|
|
"grad_norm": 0.8516868948936462,
|
|
"learning_rate": 4.78354165617888e-07,
|
|
"loss": 0.1291,
|
|
"num_input_tokens_seen": 7995136,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"epoch": 0.7077393075356415,
|
|
"grad_norm": 9.084482192993164,
|
|
"learning_rate": 4.77091047643059e-07,
|
|
"loss": 0.0624,
|
|
"num_input_tokens_seen": 7999872,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"epoch": 0.7081636116768499,
|
|
"grad_norm": 22.805496215820312,
|
|
"learning_rate": 4.7582907688214593e-07,
|
|
"loss": 0.0425,
|
|
"num_input_tokens_seen": 8004608,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"epoch": 0.7085879158180584,
|
|
"grad_norm": 30.081192016601562,
|
|
"learning_rate": 4.745682561037947e-07,
|
|
"loss": 0.0204,
|
|
"num_input_tokens_seen": 8009024,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"epoch": 0.7090122199592668,
|
|
"grad_norm": 22.23179054260254,
|
|
"learning_rate": 4.733085880741301e-07,
|
|
"loss": 0.0655,
|
|
"num_input_tokens_seen": 8013504,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"epoch": 0.7094365241004752,
|
|
"grad_norm": 11.346170425415039,
|
|
"learning_rate": 4.7205007555674714e-07,
|
|
"loss": 0.1174,
|
|
"num_input_tokens_seen": 8018432,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"epoch": 0.7098608282416836,
|
|
"grad_norm": 0.08944053202867508,
|
|
"learning_rate": 4.707927213127062e-07,
|
|
"loss": 0.0022,
|
|
"num_input_tokens_seen": 8022848,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"epoch": 0.710285132382892,
|
|
"grad_norm": 7.584676265716553,
|
|
"learning_rate": 4.6953652810052615e-07,
|
|
"loss": 0.0746,
|
|
"num_input_tokens_seen": 8027392,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"epoch": 0.7107094365241005,
|
|
"grad_norm": 11.053536415100098,
|
|
"learning_rate": 4.682814986761792e-07,
|
|
"loss": 0.1568,
|
|
"num_input_tokens_seen": 8032640,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"epoch": 0.7111337406653089,
|
|
"grad_norm": 0.47501927614212036,
|
|
"learning_rate": 4.670276357930828e-07,
|
|
"loss": 0.0424,
|
|
"num_input_tokens_seen": 8037312,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"epoch": 0.7115580448065173,
|
|
"grad_norm": 6.653781890869141,
|
|
"learning_rate": 4.657749422020979e-07,
|
|
"loss": 0.1158,
|
|
"num_input_tokens_seen": 8042240,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"epoch": 0.7119823489477257,
|
|
"grad_norm": 0.5156369805335999,
|
|
"learning_rate": 4.6452342065151704e-07,
|
|
"loss": 0.0395,
|
|
"num_input_tokens_seen": 8047360,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"epoch": 0.7124066530889341,
|
|
"grad_norm": 18.825040817260742,
|
|
"learning_rate": 4.632730738870634e-07,
|
|
"loss": 0.0758,
|
|
"num_input_tokens_seen": 8052096,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"epoch": 0.7128309572301426,
|
|
"grad_norm": 32.614601135253906,
|
|
"learning_rate": 4.6202390465188225e-07,
|
|
"loss": 0.0353,
|
|
"num_input_tokens_seen": 8056960,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 0.713255261371351,
|
|
"grad_norm": 33.05833053588867,
|
|
"learning_rate": 4.6077591568653575e-07,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 8061568,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"epoch": 0.7136795655125594,
|
|
"grad_norm": 33.38587188720703,
|
|
"learning_rate": 4.595291097289952e-07,
|
|
"loss": 0.0632,
|
|
"num_input_tokens_seen": 8066624,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"epoch": 0.7141038696537678,
|
|
"grad_norm": 17.00100326538086,
|
|
"learning_rate": 4.582834895146391e-07,
|
|
"loss": 0.0673,
|
|
"num_input_tokens_seen": 8071680,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"epoch": 0.7145281737949762,
|
|
"grad_norm": 13.670267105102539,
|
|
"learning_rate": 4.5703905777624184e-07,
|
|
"loss": 0.0537,
|
|
"num_input_tokens_seen": 8076480,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"epoch": 0.7149524779361847,
|
|
"grad_norm": 0.7490288615226746,
|
|
"learning_rate": 4.5579581724397255e-07,
|
|
"loss": 0.0152,
|
|
"num_input_tokens_seen": 8080768,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"epoch": 0.7153767820773931,
|
|
"grad_norm": 1.8543779850006104,
|
|
"learning_rate": 4.5455377064538603e-07,
|
|
"loss": 0.0533,
|
|
"num_input_tokens_seen": 8085248,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"epoch": 0.7158010862186015,
|
|
"grad_norm": 15.36892318725586,
|
|
"learning_rate": 4.533129207054167e-07,
|
|
"loss": 0.0402,
|
|
"num_input_tokens_seen": 8090176,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"epoch": 0.7162253903598099,
|
|
"grad_norm": 3.8697879314422607,
|
|
"learning_rate": 4.520732701463762e-07,
|
|
"loss": 0.0755,
|
|
"num_input_tokens_seen": 8094464,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"epoch": 0.7166496945010183,
|
|
"grad_norm": 0.8485707640647888,
|
|
"learning_rate": 4.508348216879421e-07,
|
|
"loss": 0.0949,
|
|
"num_input_tokens_seen": 8099008,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"epoch": 0.7170739986422268,
|
|
"grad_norm": 7.10463285446167,
|
|
"learning_rate": 4.4959757804715613e-07,
|
|
"loss": 0.0616,
|
|
"num_input_tokens_seen": 8104256,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"epoch": 0.7174983027834352,
|
|
"grad_norm": 17.09888458251953,
|
|
"learning_rate": 4.483615419384167e-07,
|
|
"loss": 0.0722,
|
|
"num_input_tokens_seen": 8109056,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"epoch": 0.7179226069246436,
|
|
"grad_norm": 0.9063630700111389,
|
|
"learning_rate": 4.4712671607347307e-07,
|
|
"loss": 0.1141,
|
|
"num_input_tokens_seen": 8114176,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"epoch": 0.718346911065852,
|
|
"grad_norm": 0.6737499833106995,
|
|
"learning_rate": 4.458931031614179e-07,
|
|
"loss": 0.0832,
|
|
"num_input_tokens_seen": 8119232,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"epoch": 0.7187712152070604,
|
|
"grad_norm": 0.8057066202163696,
|
|
"learning_rate": 4.4466070590868543e-07,
|
|
"loss": 0.0657,
|
|
"num_input_tokens_seen": 8123840,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"epoch": 0.7191955193482689,
|
|
"grad_norm": 6.584603786468506,
|
|
"learning_rate": 4.434295270190402e-07,
|
|
"loss": 0.1181,
|
|
"num_input_tokens_seen": 8128256,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"epoch": 0.7196198234894773,
|
|
"grad_norm": 29.911685943603516,
|
|
"learning_rate": 4.4219956919357546e-07,
|
|
"loss": 0.0305,
|
|
"num_input_tokens_seen": 8133504,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"epoch": 0.7200441276306857,
|
|
"grad_norm": 0.1387253999710083,
|
|
"learning_rate": 4.409708351307049e-07,
|
|
"loss": 0.0066,
|
|
"num_input_tokens_seen": 8138176,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"epoch": 0.7204684317718941,
|
|
"grad_norm": 13.161700248718262,
|
|
"learning_rate": 4.3974332752615727e-07,
|
|
"loss": 0.0553,
|
|
"num_input_tokens_seen": 8143104,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"epoch": 0.7208927359131025,
|
|
"grad_norm": 17.81656265258789,
|
|
"learning_rate": 4.385170490729712e-07,
|
|
"loss": 0.0336,
|
|
"num_input_tokens_seen": 8147584,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"epoch": 0.721317040054311,
|
|
"grad_norm": 25.293027877807617,
|
|
"learning_rate": 4.3729200246148835e-07,
|
|
"loss": 0.1065,
|
|
"num_input_tokens_seen": 8152896,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 0.7217413441955194,
|
|
"grad_norm": 1.8136391639709473,
|
|
"learning_rate": 4.3606819037934673e-07,
|
|
"loss": 0.0257,
|
|
"num_input_tokens_seen": 8158400,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"epoch": 0.7221656483367278,
|
|
"grad_norm": 27.48452377319336,
|
|
"learning_rate": 4.348456155114786e-07,
|
|
"loss": 0.0809,
|
|
"num_input_tokens_seen": 8163520,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"epoch": 0.7225899524779362,
|
|
"grad_norm": 24.924903869628906,
|
|
"learning_rate": 4.336242805400989e-07,
|
|
"loss": 0.0764,
|
|
"num_input_tokens_seen": 8168256,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"epoch": 0.7230142566191446,
|
|
"grad_norm": 1.556986689567566,
|
|
"learning_rate": 4.324041881447041e-07,
|
|
"loss": 0.0274,
|
|
"num_input_tokens_seen": 8172928,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"epoch": 0.7234385607603531,
|
|
"grad_norm": 6.988257884979248,
|
|
"learning_rate": 4.311853410020643e-07,
|
|
"loss": 0.0438,
|
|
"num_input_tokens_seen": 8177728,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"epoch": 0.7238628649015615,
|
|
"grad_norm": 9.756321907043457,
|
|
"learning_rate": 4.299677417862173e-07,
|
|
"loss": 0.0347,
|
|
"num_input_tokens_seen": 8182272,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"epoch": 0.7242871690427699,
|
|
"grad_norm": 20.44253158569336,
|
|
"learning_rate": 4.287513931684634e-07,
|
|
"loss": 0.1273,
|
|
"num_input_tokens_seen": 8187776,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"epoch": 0.7247114731839783,
|
|
"grad_norm": 0.4510215222835541,
|
|
"learning_rate": 4.2753629781735936e-07,
|
|
"loss": 0.0719,
|
|
"num_input_tokens_seen": 8192960,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"epoch": 0.7251357773251867,
|
|
"grad_norm": 35.3104362487793,
|
|
"learning_rate": 4.2632245839871095e-07,
|
|
"loss": 0.1479,
|
|
"num_input_tokens_seen": 8197824,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"epoch": 0.7255600814663951,
|
|
"grad_norm": 26.06441879272461,
|
|
"learning_rate": 4.251098775755708e-07,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 8202880,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"epoch": 0.7259843856076035,
|
|
"grad_norm": 11.877837181091309,
|
|
"learning_rate": 4.238985580082293e-07,
|
|
"loss": 0.1098,
|
|
"num_input_tokens_seen": 8207552,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"epoch": 0.7264086897488119,
|
|
"grad_norm": 7.760819435119629,
|
|
"learning_rate": 4.2268850235420827e-07,
|
|
"loss": 0.0777,
|
|
"num_input_tokens_seen": 8212672,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"epoch": 0.7268329938900203,
|
|
"grad_norm": 7.423666477203369,
|
|
"learning_rate": 4.214797132682596e-07,
|
|
"loss": 0.0436,
|
|
"num_input_tokens_seen": 8219392,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"epoch": 0.7272572980312287,
|
|
"grad_norm": 2.0695018768310547,
|
|
"learning_rate": 4.202721934023536e-07,
|
|
"loss": 0.0622,
|
|
"num_input_tokens_seen": 8224064,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"epoch": 0.7276816021724372,
|
|
"grad_norm": 8.464885711669922,
|
|
"learning_rate": 4.19065945405678e-07,
|
|
"loss": 0.0636,
|
|
"num_input_tokens_seen": 8229376,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"epoch": 0.7281059063136456,
|
|
"grad_norm": 0.46809127926826477,
|
|
"learning_rate": 4.17860971924629e-07,
|
|
"loss": 0.0214,
|
|
"num_input_tokens_seen": 8234816,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"epoch": 0.728530210454854,
|
|
"grad_norm": 19.159351348876953,
|
|
"learning_rate": 4.166572756028073e-07,
|
|
"loss": 0.0821,
|
|
"num_input_tokens_seen": 8239552,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"epoch": 0.7289545145960624,
|
|
"grad_norm": 14.807759284973145,
|
|
"learning_rate": 4.154548590810113e-07,
|
|
"loss": 0.0445,
|
|
"num_input_tokens_seen": 8243840,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"epoch": 0.7293788187372708,
|
|
"grad_norm": 33.179752349853516,
|
|
"learning_rate": 4.14253724997232e-07,
|
|
"loss": 0.1071,
|
|
"num_input_tokens_seen": 8248960,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"epoch": 0.7298031228784793,
|
|
"grad_norm": 1.7420438528060913,
|
|
"learning_rate": 4.1305387598664567e-07,
|
|
"loss": 0.0536,
|
|
"num_input_tokens_seen": 8253888,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 0.7302274270196877,
|
|
"grad_norm": 25.70371437072754,
|
|
"learning_rate": 4.118553146816115e-07,
|
|
"loss": 0.0646,
|
|
"num_input_tokens_seen": 8258816,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"epoch": 0.7306517311608961,
|
|
"grad_norm": 10.947419166564941,
|
|
"learning_rate": 4.1065804371166114e-07,
|
|
"loss": 0.0697,
|
|
"num_input_tokens_seen": 8263424,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"epoch": 0.7310760353021045,
|
|
"grad_norm": 0.6111663579940796,
|
|
"learning_rate": 4.0946206570349685e-07,
|
|
"loss": 0.0673,
|
|
"num_input_tokens_seen": 8268288,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"epoch": 0.7315003394433129,
|
|
"grad_norm": 22.681745529174805,
|
|
"learning_rate": 4.082673832809838e-07,
|
|
"loss": 0.0984,
|
|
"num_input_tokens_seen": 8273152,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"epoch": 0.7319246435845214,
|
|
"grad_norm": 0.2827793061733246,
|
|
"learning_rate": 4.0707399906514483e-07,
|
|
"loss": 0.0205,
|
|
"num_input_tokens_seen": 8278016,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"epoch": 0.7323489477257298,
|
|
"grad_norm": 2.0013508796691895,
|
|
"learning_rate": 4.058819156741545e-07,
|
|
"loss": 0.0476,
|
|
"num_input_tokens_seen": 8283392,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"epoch": 0.7327732518669382,
|
|
"grad_norm": 12.871187210083008,
|
|
"learning_rate": 4.0469113572333426e-07,
|
|
"loss": 0.0493,
|
|
"num_input_tokens_seen": 8288064,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"epoch": 0.7331975560081466,
|
|
"grad_norm": 12.48461627960205,
|
|
"learning_rate": 4.03501661825144e-07,
|
|
"loss": 0.0703,
|
|
"num_input_tokens_seen": 8292672,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"epoch": 0.733621860149355,
|
|
"grad_norm": 7.887132167816162,
|
|
"learning_rate": 4.023134965891809e-07,
|
|
"loss": 0.0966,
|
|
"num_input_tokens_seen": 8297344,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"epoch": 0.7340461642905635,
|
|
"grad_norm": 9.461837768554688,
|
|
"learning_rate": 4.0112664262216866e-07,
|
|
"loss": 0.0438,
|
|
"num_input_tokens_seen": 8302080,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"epoch": 0.7344704684317719,
|
|
"grad_norm": 25.976781845092773,
|
|
"learning_rate": 3.9994110252795563e-07,
|
|
"loss": 0.1172,
|
|
"num_input_tokens_seen": 8307008,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"epoch": 0.7348947725729803,
|
|
"grad_norm": 20.410432815551758,
|
|
"learning_rate": 3.987568789075072e-07,
|
|
"loss": 0.0663,
|
|
"num_input_tokens_seen": 8312064,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"epoch": 0.7353190767141887,
|
|
"grad_norm": 0.0960959941148758,
|
|
"learning_rate": 3.975739743589004e-07,
|
|
"loss": 0.03,
|
|
"num_input_tokens_seen": 8316480,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"epoch": 0.7357433808553971,
|
|
"grad_norm": 12.293548583984375,
|
|
"learning_rate": 3.9639239147731864e-07,
|
|
"loss": 0.0245,
|
|
"num_input_tokens_seen": 8321664,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"epoch": 0.7361676849966056,
|
|
"grad_norm": 2.2200615406036377,
|
|
"learning_rate": 3.952121328550455e-07,
|
|
"loss": 0.0737,
|
|
"num_input_tokens_seen": 8326016,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"epoch": 0.736591989137814,
|
|
"grad_norm": 0.2970643937587738,
|
|
"learning_rate": 3.9403320108145943e-07,
|
|
"loss": 0.0137,
|
|
"num_input_tokens_seen": 8330688,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"epoch": 0.7370162932790224,
|
|
"grad_norm": 0.20511916279792786,
|
|
"learning_rate": 3.928555987430275e-07,
|
|
"loss": 0.0223,
|
|
"num_input_tokens_seen": 8335552,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"epoch": 0.7374405974202308,
|
|
"grad_norm": 0.9357122182846069,
|
|
"learning_rate": 3.916793284233011e-07,
|
|
"loss": 0.0315,
|
|
"num_input_tokens_seen": 8340224,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"epoch": 0.7378649015614392,
|
|
"grad_norm": 7.138563632965088,
|
|
"learning_rate": 3.9050439270290733e-07,
|
|
"loss": 0.0604,
|
|
"num_input_tokens_seen": 8345024,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"epoch": 0.7382892057026477,
|
|
"grad_norm": 0.30067044496536255,
|
|
"learning_rate": 3.8933079415954805e-07,
|
|
"loss": 0.0482,
|
|
"num_input_tokens_seen": 8349632,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 0.7387135098438561,
|
|
"grad_norm": 0.2555783689022064,
|
|
"learning_rate": 3.8815853536798905e-07,
|
|
"loss": 0.055,
|
|
"num_input_tokens_seen": 8354624,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"epoch": 0.7391378139850645,
|
|
"grad_norm": 36.78571701049805,
|
|
"learning_rate": 3.8698761890005794e-07,
|
|
"loss": 0.046,
|
|
"num_input_tokens_seen": 8359168,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"epoch": 0.7395621181262729,
|
|
"grad_norm": 0.5426282286643982,
|
|
"learning_rate": 3.858180473246373e-07,
|
|
"loss": 0.0098,
|
|
"num_input_tokens_seen": 8363968,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"epoch": 0.7399864222674813,
|
|
"grad_norm": 8.021255493164062,
|
|
"learning_rate": 3.8464982320765906e-07,
|
|
"loss": 0.0385,
|
|
"num_input_tokens_seen": 8369152,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"epoch": 0.7404107264086898,
|
|
"grad_norm": 1.8594862222671509,
|
|
"learning_rate": 3.834829491120991e-07,
|
|
"loss": 0.0266,
|
|
"num_input_tokens_seen": 8373952,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"epoch": 0.7408350305498982,
|
|
"grad_norm": 28.227519989013672,
|
|
"learning_rate": 3.8231742759797157e-07,
|
|
"loss": 0.1069,
|
|
"num_input_tokens_seen": 8379456,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"epoch": 0.7412593346911066,
|
|
"grad_norm": 16.04825782775879,
|
|
"learning_rate": 3.811532612223219e-07,
|
|
"loss": 0.044,
|
|
"num_input_tokens_seen": 8383936,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"epoch": 0.741683638832315,
|
|
"grad_norm": 10.720365524291992,
|
|
"learning_rate": 3.7999045253922504e-07,
|
|
"loss": 0.0709,
|
|
"num_input_tokens_seen": 8388864,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"epoch": 0.7421079429735234,
|
|
"grad_norm": 26.067113876342773,
|
|
"learning_rate": 3.788290040997746e-07,
|
|
"loss": 0.0603,
|
|
"num_input_tokens_seen": 8393920,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"epoch": 0.7425322471147319,
|
|
"grad_norm": 0.8638919591903687,
|
|
"learning_rate": 3.776689184520815e-07,
|
|
"loss": 0.0856,
|
|
"num_input_tokens_seen": 8398848,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"epoch": 0.7429565512559403,
|
|
"grad_norm": 33.85160446166992,
|
|
"learning_rate": 3.765101981412665e-07,
|
|
"loss": 0.1218,
|
|
"num_input_tokens_seen": 8404160,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"epoch": 0.7433808553971487,
|
|
"grad_norm": 22.34695053100586,
|
|
"learning_rate": 3.753528457094548e-07,
|
|
"loss": 0.0223,
|
|
"num_input_tokens_seen": 8408896,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"epoch": 0.7438051595383571,
|
|
"grad_norm": 9.60935115814209,
|
|
"learning_rate": 3.7419686369577076e-07,
|
|
"loss": 0.1072,
|
|
"num_input_tokens_seen": 8413312,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"epoch": 0.7442294636795656,
|
|
"grad_norm": 0.12352371960878372,
|
|
"learning_rate": 3.730422546363323e-07,
|
|
"loss": 0.0284,
|
|
"num_input_tokens_seen": 8417920,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"epoch": 0.744653767820774,
|
|
"grad_norm": 33.97532653808594,
|
|
"learning_rate": 3.7188902106424414e-07,
|
|
"loss": 0.05,
|
|
"num_input_tokens_seen": 8422720,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"epoch": 0.7450780719619824,
|
|
"grad_norm": 0.438525915145874,
|
|
"learning_rate": 3.7073716550959533e-07,
|
|
"loss": 0.051,
|
|
"num_input_tokens_seen": 8426944,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"epoch": 0.7455023761031908,
|
|
"grad_norm": 18.7294979095459,
|
|
"learning_rate": 3.6958669049944956e-07,
|
|
"loss": 0.0423,
|
|
"num_input_tokens_seen": 8431104,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"epoch": 0.7459266802443992,
|
|
"grad_norm": 19.43697166442871,
|
|
"learning_rate": 3.684375985578431e-07,
|
|
"loss": 0.0669,
|
|
"num_input_tokens_seen": 8435520,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"epoch": 0.7463509843856077,
|
|
"grad_norm": 2.5459182262420654,
|
|
"learning_rate": 3.672898922057773e-07,
|
|
"loss": 0.0337,
|
|
"num_input_tokens_seen": 8441152,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"epoch": 0.7467752885268161,
|
|
"grad_norm": 24.723230361938477,
|
|
"learning_rate": 3.66143573961214e-07,
|
|
"loss": 0.0383,
|
|
"num_input_tokens_seen": 8445824,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 0.7471995926680245,
|
|
"grad_norm": 18.451658248901367,
|
|
"learning_rate": 3.649986463390694e-07,
|
|
"loss": 0.0763,
|
|
"num_input_tokens_seen": 8450752,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"epoch": 0.7476238968092329,
|
|
"grad_norm": 17.006040573120117,
|
|
"learning_rate": 3.6385511185120885e-07,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 8455296,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"epoch": 0.7480482009504412,
|
|
"grad_norm": 6.782704830169678,
|
|
"learning_rate": 3.6271297300644156e-07,
|
|
"loss": 0.0375,
|
|
"num_input_tokens_seen": 8460480,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"epoch": 0.7484725050916496,
|
|
"grad_norm": 0.11466942727565765,
|
|
"learning_rate": 3.6157223231051426e-07,
|
|
"loss": 0.0917,
|
|
"num_input_tokens_seen": 8465152,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"epoch": 0.7488968092328581,
|
|
"grad_norm": 1.0368287563323975,
|
|
"learning_rate": 3.6043289226610717e-07,
|
|
"loss": 0.0562,
|
|
"num_input_tokens_seen": 8470144,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"epoch": 0.7493211133740665,
|
|
"grad_norm": 21.403564453125,
|
|
"learning_rate": 3.5929495537282596e-07,
|
|
"loss": 0.0427,
|
|
"num_input_tokens_seen": 8475840,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"epoch": 0.7497454175152749,
|
|
"grad_norm": 0.3052919805049896,
|
|
"learning_rate": 3.5815842412720045e-07,
|
|
"loss": 0.0703,
|
|
"num_input_tokens_seen": 8480256,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"epoch": 0.7501697216564833,
|
|
"grad_norm": 0.6065476536750793,
|
|
"learning_rate": 3.57023301022674e-07,
|
|
"loss": 0.0649,
|
|
"num_input_tokens_seen": 8485568,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"epoch": 0.7505940257976917,
|
|
"grad_norm": 13.807206153869629,
|
|
"learning_rate": 3.558895885496023e-07,
|
|
"loss": 0.078,
|
|
"num_input_tokens_seen": 8490048,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"epoch": 0.7510183299389002,
|
|
"grad_norm": 1.1367219686508179,
|
|
"learning_rate": 3.547572891952456e-07,
|
|
"loss": 0.0597,
|
|
"num_input_tokens_seen": 8494720,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"epoch": 0.7510183299389002,
|
|
"eval_loss": 0.05400332435965538,
|
|
"eval_runtime": 16.632,
|
|
"eval_samples_per_second": 629.811,
|
|
"eval_steps_per_second": 78.764,
|
|
"num_input_tokens_seen": 8494720,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"epoch": 0.7514426340801086,
|
|
"grad_norm": 8.593124389648438,
|
|
"learning_rate": 3.536264054437641e-07,
|
|
"loss": 0.0286,
|
|
"num_input_tokens_seen": 8499392,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"epoch": 0.751866938221317,
|
|
"grad_norm": 5.905686855316162,
|
|
"learning_rate": 3.524969397762122e-07,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 8504256,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"epoch": 0.7522912423625254,
|
|
"grad_norm": 24.596603393554688,
|
|
"learning_rate": 3.5136889467053353e-07,
|
|
"loss": 0.0749,
|
|
"num_input_tokens_seen": 8508864,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"epoch": 0.7527155465037338,
|
|
"grad_norm": 11.573902130126953,
|
|
"learning_rate": 3.5024227260155383e-07,
|
|
"loss": 0.0817,
|
|
"num_input_tokens_seen": 8514048,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"epoch": 0.7531398506449423,
|
|
"grad_norm": 13.811308860778809,
|
|
"learning_rate": 3.4911707604097916e-07,
|
|
"loss": 0.0268,
|
|
"num_input_tokens_seen": 8518656,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"epoch": 0.7535641547861507,
|
|
"grad_norm": 23.726560592651367,
|
|
"learning_rate": 3.4799330745738573e-07,
|
|
"loss": 0.0357,
|
|
"num_input_tokens_seen": 8523520,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"epoch": 0.7539884589273591,
|
|
"grad_norm": 15.262434005737305,
|
|
"learning_rate": 3.468709693162183e-07,
|
|
"loss": 0.1716,
|
|
"num_input_tokens_seen": 8528320,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"epoch": 0.7544127630685675,
|
|
"grad_norm": 11.677915573120117,
|
|
"learning_rate": 3.4575006407978304e-07,
|
|
"loss": 0.0288,
|
|
"num_input_tokens_seen": 8533184,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"epoch": 0.754837067209776,
|
|
"grad_norm": 21.703073501586914,
|
|
"learning_rate": 3.446305942072425e-07,
|
|
"loss": 0.0115,
|
|
"num_input_tokens_seen": 8537536,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"epoch": 0.7552613713509844,
|
|
"grad_norm": 19.767536163330078,
|
|
"learning_rate": 3.4351256215461e-07,
|
|
"loss": 0.0841,
|
|
"num_input_tokens_seen": 8541696,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 0.7556856754921928,
|
|
"grad_norm": 11.243904113769531,
|
|
"learning_rate": 3.423959703747449e-07,
|
|
"loss": 0.0227,
|
|
"num_input_tokens_seen": 8546176,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"epoch": 0.7561099796334012,
|
|
"grad_norm": 0.27069732546806335,
|
|
"learning_rate": 3.4128082131734535e-07,
|
|
"loss": 0.0367,
|
|
"num_input_tokens_seen": 8550592,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"epoch": 0.7565342837746096,
|
|
"grad_norm": 1.3123117685317993,
|
|
"learning_rate": 3.401671174289469e-07,
|
|
"loss": 0.0721,
|
|
"num_input_tokens_seen": 8555648,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"epoch": 0.756958587915818,
|
|
"grad_norm": 20.45379066467285,
|
|
"learning_rate": 3.390548611529116e-07,
|
|
"loss": 0.0899,
|
|
"num_input_tokens_seen": 8560640,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"epoch": 0.7573828920570265,
|
|
"grad_norm": 8.115303039550781,
|
|
"learning_rate": 3.3794405492942713e-07,
|
|
"loss": 0.0728,
|
|
"num_input_tokens_seen": 8565376,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"epoch": 0.7578071961982349,
|
|
"grad_norm": 6.984683990478516,
|
|
"learning_rate": 3.368347011955006e-07,
|
|
"loss": 0.0466,
|
|
"num_input_tokens_seen": 8569728,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"epoch": 0.7582315003394433,
|
|
"grad_norm": 51.41347122192383,
|
|
"learning_rate": 3.3572680238495064e-07,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 8574656,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"epoch": 0.7586558044806517,
|
|
"grad_norm": 0.6196117401123047,
|
|
"learning_rate": 3.346203609284053e-07,
|
|
"loss": 0.0537,
|
|
"num_input_tokens_seen": 8579072,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"epoch": 0.7590801086218602,
|
|
"grad_norm": 18.85371208190918,
|
|
"learning_rate": 3.335153792532945e-07,
|
|
"loss": 0.0298,
|
|
"num_input_tokens_seen": 8584000,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"epoch": 0.7595044127630686,
|
|
"grad_norm": 11.970200538635254,
|
|
"learning_rate": 3.324118597838463e-07,
|
|
"loss": 0.0857,
|
|
"num_input_tokens_seen": 8589248,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"epoch": 0.759928716904277,
|
|
"grad_norm": 0.09408985823392868,
|
|
"learning_rate": 3.313098049410803e-07,
|
|
"loss": 0.0645,
|
|
"num_input_tokens_seen": 8593472,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"epoch": 0.7603530210454854,
|
|
"grad_norm": 0.11534211039543152,
|
|
"learning_rate": 3.3020921714280325e-07,
|
|
"loss": 0.0149,
|
|
"num_input_tokens_seen": 8597952,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"epoch": 0.7607773251866938,
|
|
"grad_norm": 3.628263235092163,
|
|
"learning_rate": 3.291100988036022e-07,
|
|
"loss": 0.0393,
|
|
"num_input_tokens_seen": 8602816,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"epoch": 0.7612016293279023,
|
|
"grad_norm": 0.3252236545085907,
|
|
"learning_rate": 3.280124523348423e-07,
|
|
"loss": 0.0058,
|
|
"num_input_tokens_seen": 8608256,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"epoch": 0.7616259334691107,
|
|
"grad_norm": 16.109745025634766,
|
|
"learning_rate": 3.269162801446578e-07,
|
|
"loss": 0.0307,
|
|
"num_input_tokens_seen": 8612928,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"epoch": 0.7620502376103191,
|
|
"grad_norm": 0.15706217288970947,
|
|
"learning_rate": 3.258215846379492e-07,
|
|
"loss": 0.112,
|
|
"num_input_tokens_seen": 8617280,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"epoch": 0.7624745417515275,
|
|
"grad_norm": 0.4153687655925751,
|
|
"learning_rate": 3.247283682163774e-07,
|
|
"loss": 0.0043,
|
|
"num_input_tokens_seen": 8621952,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"epoch": 0.7628988458927359,
|
|
"grad_norm": 0.5190339088439941,
|
|
"learning_rate": 3.2363663327835855e-07,
|
|
"loss": 0.0165,
|
|
"num_input_tokens_seen": 8627136,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"epoch": 0.7633231500339444,
|
|
"grad_norm": 0.05832768231630325,
|
|
"learning_rate": 3.2254638221905716e-07,
|
|
"loss": 0.0913,
|
|
"num_input_tokens_seen": 8631872,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"epoch": 0.7637474541751528,
|
|
"grad_norm": 52.145225524902344,
|
|
"learning_rate": 3.214576174303846e-07,
|
|
"loss": 0.0842,
|
|
"num_input_tokens_seen": 8636864,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.7641717583163612,
|
|
"grad_norm": 0.5064007639884949,
|
|
"learning_rate": 3.2037034130098905e-07,
|
|
"loss": 0.0356,
|
|
"num_input_tokens_seen": 8641792,
|
|
"step": 9005
|
|
},
|
|
{
|
|
"epoch": 0.7645960624575696,
|
|
"grad_norm": 7.524738311767578,
|
|
"learning_rate": 3.192845562162549e-07,
|
|
"loss": 0.0393,
|
|
"num_input_tokens_seen": 8646400,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"epoch": 0.765020366598778,
|
|
"grad_norm": 7.0111308097839355,
|
|
"learning_rate": 3.1820026455829353e-07,
|
|
"loss": 0.0504,
|
|
"num_input_tokens_seen": 8650944,
|
|
"step": 9015
|
|
},
|
|
{
|
|
"epoch": 0.7654446707399865,
|
|
"grad_norm": 0.2216712385416031,
|
|
"learning_rate": 3.171174687059408e-07,
|
|
"loss": 0.0735,
|
|
"num_input_tokens_seen": 8656704,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"epoch": 0.7658689748811949,
|
|
"grad_norm": 18.536693572998047,
|
|
"learning_rate": 3.160361710347508e-07,
|
|
"loss": 0.0965,
|
|
"num_input_tokens_seen": 8661120,
|
|
"step": 9025
|
|
},
|
|
{
|
|
"epoch": 0.7662932790224033,
|
|
"grad_norm": 2.144017219543457,
|
|
"learning_rate": 3.14956373916991e-07,
|
|
"loss": 0.0366,
|
|
"num_input_tokens_seen": 8665664,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"epoch": 0.7667175831636117,
|
|
"grad_norm": 23.03734016418457,
|
|
"learning_rate": 3.138780797216356e-07,
|
|
"loss": 0.0522,
|
|
"num_input_tokens_seen": 8670592,
|
|
"step": 9035
|
|
},
|
|
{
|
|
"epoch": 0.7671418873048201,
|
|
"grad_norm": 8.834620475769043,
|
|
"learning_rate": 3.128012908143636e-07,
|
|
"loss": 0.061,
|
|
"num_input_tokens_seen": 8677120,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"epoch": 0.7675661914460286,
|
|
"grad_norm": 0.49720409512519836,
|
|
"learning_rate": 3.1172600955754935e-07,
|
|
"loss": 0.0675,
|
|
"num_input_tokens_seen": 8682176,
|
|
"step": 9045
|
|
},
|
|
{
|
|
"epoch": 0.767990495587237,
|
|
"grad_norm": 6.353196620941162,
|
|
"learning_rate": 3.1065223831026066e-07,
|
|
"loss": 0.0914,
|
|
"num_input_tokens_seen": 8686976,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"epoch": 0.7684147997284454,
|
|
"grad_norm": 24.20335578918457,
|
|
"learning_rate": 3.095799794282533e-07,
|
|
"loss": 0.0759,
|
|
"num_input_tokens_seen": 8691904,
|
|
"step": 9055
|
|
},
|
|
{
|
|
"epoch": 0.7688391038696538,
|
|
"grad_norm": 5.444301128387451,
|
|
"learning_rate": 3.0850923526396334e-07,
|
|
"loss": 0.0464,
|
|
"num_input_tokens_seen": 8700928,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"epoch": 0.7692634080108622,
|
|
"grad_norm": 20.29702377319336,
|
|
"learning_rate": 3.0744000816650464e-07,
|
|
"loss": 0.0332,
|
|
"num_input_tokens_seen": 8705344,
|
|
"step": 9065
|
|
},
|
|
{
|
|
"epoch": 0.7696877121520707,
|
|
"grad_norm": 6.054849624633789,
|
|
"learning_rate": 3.0637230048166263e-07,
|
|
"loss": 0.0576,
|
|
"num_input_tokens_seen": 8710784,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"epoch": 0.770112016293279,
|
|
"grad_norm": 7.025826930999756,
|
|
"learning_rate": 3.0530611455188946e-07,
|
|
"loss": 0.0291,
|
|
"num_input_tokens_seen": 8716032,
|
|
"step": 9075
|
|
},
|
|
{
|
|
"epoch": 0.7705363204344874,
|
|
"grad_norm": 15.280085563659668,
|
|
"learning_rate": 3.0424145271629844e-07,
|
|
"loss": 0.0479,
|
|
"num_input_tokens_seen": 8721088,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"epoch": 0.7709606245756958,
|
|
"grad_norm": 0.3522752821445465,
|
|
"learning_rate": 3.031783173106596e-07,
|
|
"loss": 0.0193,
|
|
"num_input_tokens_seen": 8725632,
|
|
"step": 9085
|
|
},
|
|
{
|
|
"epoch": 0.7713849287169042,
|
|
"grad_norm": 21.421838760375977,
|
|
"learning_rate": 3.0211671066739276e-07,
|
|
"loss": 0.0248,
|
|
"num_input_tokens_seen": 8730560,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"epoch": 0.7718092328581126,
|
|
"grad_norm": 7.205770969390869,
|
|
"learning_rate": 3.01056635115566e-07,
|
|
"loss": 0.0482,
|
|
"num_input_tokens_seen": 8736640,
|
|
"step": 9095
|
|
},
|
|
{
|
|
"epoch": 0.7722335369993211,
|
|
"grad_norm": 0.7924282550811768,
|
|
"learning_rate": 2.999980929808863e-07,
|
|
"loss": 0.0223,
|
|
"num_input_tokens_seen": 8741248,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"epoch": 0.7726578411405295,
|
|
"grad_norm": 7.874751567840576,
|
|
"learning_rate": 2.989410865856975e-07,
|
|
"loss": 0.0359,
|
|
"num_input_tokens_seen": 8745856,
|
|
"step": 9105
|
|
},
|
|
{
|
|
"epoch": 0.7730821452817379,
|
|
"grad_norm": 21.02619171142578,
|
|
"learning_rate": 2.9788561824897397e-07,
|
|
"loss": 0.1285,
|
|
"num_input_tokens_seen": 8750016,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"epoch": 0.7735064494229463,
|
|
"grad_norm": 41.974952697753906,
|
|
"learning_rate": 2.968316902863157e-07,
|
|
"loss": 0.035,
|
|
"num_input_tokens_seen": 8754944,
|
|
"step": 9115
|
|
},
|
|
{
|
|
"epoch": 0.7739307535641547,
|
|
"grad_norm": 13.249424934387207,
|
|
"learning_rate": 2.957793050099433e-07,
|
|
"loss": 0.1247,
|
|
"num_input_tokens_seen": 8759488,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"epoch": 0.7743550577053632,
|
|
"grad_norm": 0.23977115750312805,
|
|
"learning_rate": 2.9472846472869295e-07,
|
|
"loss": 0.0905,
|
|
"num_input_tokens_seen": 8763840,
|
|
"step": 9125
|
|
},
|
|
{
|
|
"epoch": 0.7747793618465716,
|
|
"grad_norm": 0.12379216402769089,
|
|
"learning_rate": 2.936791717480104e-07,
|
|
"loss": 0.0139,
|
|
"num_input_tokens_seen": 8768320,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"epoch": 0.77520366598778,
|
|
"grad_norm": 3.6582441329956055,
|
|
"learning_rate": 2.9263142836994845e-07,
|
|
"loss": 0.0156,
|
|
"num_input_tokens_seen": 8773568,
|
|
"step": 9135
|
|
},
|
|
{
|
|
"epoch": 0.7756279701289884,
|
|
"grad_norm": 8.32474136352539,
|
|
"learning_rate": 2.915852368931585e-07,
|
|
"loss": 0.1032,
|
|
"num_input_tokens_seen": 8778176,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"epoch": 0.7760522742701969,
|
|
"grad_norm": 19.66060447692871,
|
|
"learning_rate": 2.905405996128882e-07,
|
|
"loss": 0.1074,
|
|
"num_input_tokens_seen": 8782784,
|
|
"step": 9145
|
|
},
|
|
{
|
|
"epoch": 0.7764765784114053,
|
|
"grad_norm": 0.7193723320960999,
|
|
"learning_rate": 2.894975188209754e-07,
|
|
"loss": 0.0399,
|
|
"num_input_tokens_seen": 8787456,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"epoch": 0.7769008825526137,
|
|
"grad_norm": 25.011789321899414,
|
|
"learning_rate": 2.8845599680584265e-07,
|
|
"loss": 0.0571,
|
|
"num_input_tokens_seen": 8792256,
|
|
"step": 9155
|
|
},
|
|
{
|
|
"epoch": 0.7773251866938221,
|
|
"grad_norm": 19.858861923217773,
|
|
"learning_rate": 2.8741603585249306e-07,
|
|
"loss": 0.1204,
|
|
"num_input_tokens_seen": 8797568,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"epoch": 0.7777494908350305,
|
|
"grad_norm": 0.0711502954363823,
|
|
"learning_rate": 2.8637763824250507e-07,
|
|
"loss": 0.0529,
|
|
"num_input_tokens_seen": 8802112,
|
|
"step": 9165
|
|
},
|
|
{
|
|
"epoch": 0.778173794976239,
|
|
"grad_norm": 0.8686927556991577,
|
|
"learning_rate": 2.8534080625402677e-07,
|
|
"loss": 0.0134,
|
|
"num_input_tokens_seen": 8806784,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"epoch": 0.7785980991174474,
|
|
"grad_norm": 20.83006477355957,
|
|
"learning_rate": 2.8430554216177203e-07,
|
|
"loss": 0.0053,
|
|
"num_input_tokens_seen": 8812096,
|
|
"step": 9175
|
|
},
|
|
{
|
|
"epoch": 0.7790224032586558,
|
|
"grad_norm": 11.683786392211914,
|
|
"learning_rate": 2.8327184823701464e-07,
|
|
"loss": 0.112,
|
|
"num_input_tokens_seen": 8817024,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"epoch": 0.7794467073998642,
|
|
"grad_norm": 14.229178428649902,
|
|
"learning_rate": 2.822397267475827e-07,
|
|
"loss": 0.1502,
|
|
"num_input_tokens_seen": 8821696,
|
|
"step": 9185
|
|
},
|
|
{
|
|
"epoch": 0.7798710115410726,
|
|
"grad_norm": 30.228673934936523,
|
|
"learning_rate": 2.812091799578566e-07,
|
|
"loss": 0.1192,
|
|
"num_input_tokens_seen": 8826560,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"epoch": 0.780295315682281,
|
|
"grad_norm": 0.3975675702095032,
|
|
"learning_rate": 2.8018021012875995e-07,
|
|
"loss": 0.0307,
|
|
"num_input_tokens_seen": 8831552,
|
|
"step": 9195
|
|
},
|
|
{
|
|
"epoch": 0.7807196198234895,
|
|
"grad_norm": 7.830092430114746,
|
|
"learning_rate": 2.791528195177576e-07,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 8836480,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"epoch": 0.7811439239646979,
|
|
"grad_norm": 15.838147163391113,
|
|
"learning_rate": 2.7812701037884964e-07,
|
|
"loss": 0.0662,
|
|
"num_input_tokens_seen": 8840832,
|
|
"step": 9205
|
|
},
|
|
{
|
|
"epoch": 0.7815682281059063,
|
|
"grad_norm": 2.2456212043762207,
|
|
"learning_rate": 2.7710278496256665e-07,
|
|
"loss": 0.0571,
|
|
"num_input_tokens_seen": 8845376,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"epoch": 0.7819925322471147,
|
|
"grad_norm": 0.13089334964752197,
|
|
"learning_rate": 2.7608014551596437e-07,
|
|
"loss": 0.0549,
|
|
"num_input_tokens_seen": 8850176,
|
|
"step": 9215
|
|
},
|
|
{
|
|
"epoch": 0.7824168363883232,
|
|
"grad_norm": 3.927847385406494,
|
|
"learning_rate": 2.7505909428261956e-07,
|
|
"loss": 0.0811,
|
|
"num_input_tokens_seen": 8854976,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"epoch": 0.7828411405295316,
|
|
"grad_norm": 18.77018165588379,
|
|
"learning_rate": 2.740396335026234e-07,
|
|
"loss": 0.042,
|
|
"num_input_tokens_seen": 8859392,
|
|
"step": 9225
|
|
},
|
|
{
|
|
"epoch": 0.78326544467074,
|
|
"grad_norm": 0.586846649646759,
|
|
"learning_rate": 2.7302176541257984e-07,
|
|
"loss": 0.0143,
|
|
"num_input_tokens_seen": 8863936,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"epoch": 0.7836897488119484,
|
|
"grad_norm": 1.3635629415512085,
|
|
"learning_rate": 2.720054922455964e-07,
|
|
"loss": 0.1062,
|
|
"num_input_tokens_seen": 8868480,
|
|
"step": 9235
|
|
},
|
|
{
|
|
"epoch": 0.7841140529531568,
|
|
"grad_norm": 53.56965255737305,
|
|
"learning_rate": 2.7099081623128294e-07,
|
|
"loss": 0.0882,
|
|
"num_input_tokens_seen": 8873344,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"epoch": 0.7845383570943653,
|
|
"grad_norm": 23.173128128051758,
|
|
"learning_rate": 2.699777395957449e-07,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 8878016,
|
|
"step": 9245
|
|
},
|
|
{
|
|
"epoch": 0.7849626612355737,
|
|
"grad_norm": 12.536267280578613,
|
|
"learning_rate": 2.6896626456157846e-07,
|
|
"loss": 0.1384,
|
|
"num_input_tokens_seen": 8882816,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"epoch": 0.7853869653767821,
|
|
"grad_norm": 0.33267757296562195,
|
|
"learning_rate": 2.679563933478667e-07,
|
|
"loss": 0.0021,
|
|
"num_input_tokens_seen": 8887936,
|
|
"step": 9255
|
|
},
|
|
{
|
|
"epoch": 0.7858112695179905,
|
|
"grad_norm": 8.61290168762207,
|
|
"learning_rate": 2.6694812817017387e-07,
|
|
"loss": 0.0199,
|
|
"num_input_tokens_seen": 8892672,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"epoch": 0.7862355736591989,
|
|
"grad_norm": 2.99057936668396,
|
|
"learning_rate": 2.659414712405398e-07,
|
|
"loss": 0.0474,
|
|
"num_input_tokens_seen": 8897152,
|
|
"step": 9265
|
|
},
|
|
{
|
|
"epoch": 0.7866598778004074,
|
|
"grad_norm": 0.39016193151474,
|
|
"learning_rate": 2.649364247674779e-07,
|
|
"loss": 0.0298,
|
|
"num_input_tokens_seen": 8901504,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"epoch": 0.7870841819416158,
|
|
"grad_norm": 16.9559326171875,
|
|
"learning_rate": 2.639329909559662e-07,
|
|
"loss": 0.0533,
|
|
"num_input_tokens_seen": 8906176,
|
|
"step": 9275
|
|
},
|
|
{
|
|
"epoch": 0.7875084860828242,
|
|
"grad_norm": 0.9116821885108948,
|
|
"learning_rate": 2.6293117200744643e-07,
|
|
"loss": 0.0354,
|
|
"num_input_tokens_seen": 8910720,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"epoch": 0.7879327902240326,
|
|
"grad_norm": 6.958780765533447,
|
|
"learning_rate": 2.6193097011981635e-07,
|
|
"loss": 0.0483,
|
|
"num_input_tokens_seen": 8915200,
|
|
"step": 9285
|
|
},
|
|
{
|
|
"epoch": 0.788357094365241,
|
|
"grad_norm": 23.019777297973633,
|
|
"learning_rate": 2.609323874874266e-07,
|
|
"loss": 0.0445,
|
|
"num_input_tokens_seen": 8920256,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"epoch": 0.7887813985064495,
|
|
"grad_norm": 12.305475234985352,
|
|
"learning_rate": 2.5993542630107533e-07,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 8924864,
|
|
"step": 9295
|
|
},
|
|
{
|
|
"epoch": 0.7892057026476579,
|
|
"grad_norm": 1.3774256706237793,
|
|
"learning_rate": 2.589400887480032e-07,
|
|
"loss": 0.0683,
|
|
"num_input_tokens_seen": 8929856,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"epoch": 0.7896300067888663,
|
|
"grad_norm": 17.66566276550293,
|
|
"learning_rate": 2.579463770118887e-07,
|
|
"loss": 0.0124,
|
|
"num_input_tokens_seen": 8936640,
|
|
"step": 9305
|
|
},
|
|
{
|
|
"epoch": 0.7900543109300747,
|
|
"grad_norm": 0.3400714695453644,
|
|
"learning_rate": 2.569542932728436e-07,
|
|
"loss": 0.0347,
|
|
"num_input_tokens_seen": 8940992,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"epoch": 0.7904786150712831,
|
|
"grad_norm": 44.02901840209961,
|
|
"learning_rate": 2.5596383970740833e-07,
|
|
"loss": 0.0816,
|
|
"num_input_tokens_seen": 8945664,
|
|
"step": 9315
|
|
},
|
|
{
|
|
"epoch": 0.7909029192124916,
|
|
"grad_norm": 10.015303611755371,
|
|
"learning_rate": 2.549750184885454e-07,
|
|
"loss": 0.1272,
|
|
"num_input_tokens_seen": 8950016,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"epoch": 0.7913272233537,
|
|
"grad_norm": 24.959270477294922,
|
|
"learning_rate": 2.5398783178563844e-07,
|
|
"loss": 0.0794,
|
|
"num_input_tokens_seen": 8954880,
|
|
"step": 9325
|
|
},
|
|
{
|
|
"epoch": 0.7917515274949084,
|
|
"grad_norm": 9.257530212402344,
|
|
"learning_rate": 2.5300228176448304e-07,
|
|
"loss": 0.0726,
|
|
"num_input_tokens_seen": 8960128,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"epoch": 0.7921758316361168,
|
|
"grad_norm": 1.9245659112930298,
|
|
"learning_rate": 2.52018370587285e-07,
|
|
"loss": 0.06,
|
|
"num_input_tokens_seen": 8964864,
|
|
"step": 9335
|
|
},
|
|
{
|
|
"epoch": 0.7926001357773251,
|
|
"grad_norm": 7.326444625854492,
|
|
"learning_rate": 2.5103610041265475e-07,
|
|
"loss": 0.1015,
|
|
"num_input_tokens_seen": 8970112,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"epoch": 0.7930244399185336,
|
|
"grad_norm": 6.067197799682617,
|
|
"learning_rate": 2.5005547339560207e-07,
|
|
"loss": 0.0543,
|
|
"num_input_tokens_seen": 8975104,
|
|
"step": 9345
|
|
},
|
|
{
|
|
"epoch": 0.793448744059742,
|
|
"grad_norm": 6.701343059539795,
|
|
"learning_rate": 2.4907649168753197e-07,
|
|
"loss": 0.0538,
|
|
"num_input_tokens_seen": 8980160,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"epoch": 0.7938730482009504,
|
|
"grad_norm": 41.52684783935547,
|
|
"learning_rate": 2.480991574362403e-07,
|
|
"loss": 0.1063,
|
|
"num_input_tokens_seen": 8984320,
|
|
"step": 9355
|
|
},
|
|
{
|
|
"epoch": 0.7942973523421588,
|
|
"grad_norm": 18.54804229736328,
|
|
"learning_rate": 2.471234727859072e-07,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 8989376,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"epoch": 0.7947216564833672,
|
|
"grad_norm": 1.706449031829834,
|
|
"learning_rate": 2.461494398770957e-07,
|
|
"loss": 0.0241,
|
|
"num_input_tokens_seen": 8993920,
|
|
"step": 9365
|
|
},
|
|
{
|
|
"epoch": 0.7951459606245757,
|
|
"grad_norm": 0.3474697768688202,
|
|
"learning_rate": 2.4517706084674316e-07,
|
|
"loss": 0.0691,
|
|
"num_input_tokens_seen": 8998848,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"epoch": 0.7955702647657841,
|
|
"grad_norm": 0.049774639308452606,
|
|
"learning_rate": 2.4420633782815945e-07,
|
|
"loss": 0.0246,
|
|
"num_input_tokens_seen": 9003712,
|
|
"step": 9375
|
|
},
|
|
{
|
|
"epoch": 0.7959945689069925,
|
|
"grad_norm": 0.6143336892127991,
|
|
"learning_rate": 2.432372729510214e-07,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 9009472,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"epoch": 0.7964188730482009,
|
|
"grad_norm": 3.855057954788208,
|
|
"learning_rate": 2.4226986834136763e-07,
|
|
"loss": 0.035,
|
|
"num_input_tokens_seen": 9014144,
|
|
"step": 9385
|
|
},
|
|
{
|
|
"epoch": 0.7968431771894093,
|
|
"grad_norm": 21.495492935180664,
|
|
"learning_rate": 2.4130412612159445e-07,
|
|
"loss": 0.0149,
|
|
"num_input_tokens_seen": 9018688,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"epoch": 0.7972674813306178,
|
|
"grad_norm": 10.586645126342773,
|
|
"learning_rate": 2.403400484104514e-07,
|
|
"loss": 0.0315,
|
|
"num_input_tokens_seen": 9023040,
|
|
"step": 9395
|
|
},
|
|
{
|
|
"epoch": 0.7976917854718262,
|
|
"grad_norm": 0.1582435965538025,
|
|
"learning_rate": 2.3937763732303504e-07,
|
|
"loss": 0.0228,
|
|
"num_input_tokens_seen": 9027328,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"epoch": 0.7981160896130346,
|
|
"grad_norm": 0.30041149258613586,
|
|
"learning_rate": 2.3841689497078742e-07,
|
|
"loss": 0.0262,
|
|
"num_input_tokens_seen": 9031552,
|
|
"step": 9405
|
|
},
|
|
{
|
|
"epoch": 0.798540393754243,
|
|
"grad_norm": 0.5289284586906433,
|
|
"learning_rate": 2.3745782346148756e-07,
|
|
"loss": 0.0287,
|
|
"num_input_tokens_seen": 9036672,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"epoch": 0.7989646978954514,
|
|
"grad_norm": 1.5053603649139404,
|
|
"learning_rate": 2.3650042489924992e-07,
|
|
"loss": 0.0498,
|
|
"num_input_tokens_seen": 9041472,
|
|
"step": 9415
|
|
},
|
|
{
|
|
"epoch": 0.7993890020366599,
|
|
"grad_norm": 34.69329833984375,
|
|
"learning_rate": 2.3554470138451909e-07,
|
|
"loss": 0.0857,
|
|
"num_input_tokens_seen": 9046912,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"epoch": 0.7998133061778683,
|
|
"grad_norm": 10.27662467956543,
|
|
"learning_rate": 2.345906550140634e-07,
|
|
"loss": 0.0525,
|
|
"num_input_tokens_seen": 9051712,
|
|
"step": 9425
|
|
},
|
|
{
|
|
"epoch": 0.8002376103190767,
|
|
"grad_norm": 15.842827796936035,
|
|
"learning_rate": 2.3363828788097274e-07,
|
|
"loss": 0.0897,
|
|
"num_input_tokens_seen": 9056256,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"epoch": 0.8006619144602851,
|
|
"grad_norm": 14.575736045837402,
|
|
"learning_rate": 2.3268760207465244e-07,
|
|
"loss": 0.0399,
|
|
"num_input_tokens_seen": 9061376,
|
|
"step": 9435
|
|
},
|
|
{
|
|
"epoch": 0.8010862186014935,
|
|
"grad_norm": 13.162969589233398,
|
|
"learning_rate": 2.3173859968081944e-07,
|
|
"loss": 0.1148,
|
|
"num_input_tokens_seen": 9066048,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"epoch": 0.8010862186014935,
|
|
"eval_loss": 0.05443469434976578,
|
|
"eval_runtime": 16.7899,
|
|
"eval_samples_per_second": 623.886,
|
|
"eval_steps_per_second": 78.023,
|
|
"num_input_tokens_seen": 9066048,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"epoch": 0.801510522742702,
|
|
"grad_norm": 17.026670455932617,
|
|
"learning_rate": 2.3079128278149717e-07,
|
|
"loss": 0.0254,
|
|
"num_input_tokens_seen": 9071232,
|
|
"step": 9445
|
|
},
|
|
{
|
|
"epoch": 0.8019348268839104,
|
|
"grad_norm": 24.76031494140625,
|
|
"learning_rate": 2.2984565345501172e-07,
|
|
"loss": 0.0486,
|
|
"num_input_tokens_seen": 9075520,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"epoch": 0.8023591310251188,
|
|
"grad_norm": 12.294041633605957,
|
|
"learning_rate": 2.2890171377598556e-07,
|
|
"loss": 0.0667,
|
|
"num_input_tokens_seen": 9080192,
|
|
"step": 9455
|
|
},
|
|
{
|
|
"epoch": 0.8027834351663272,
|
|
"grad_norm": 4.340590000152588,
|
|
"learning_rate": 2.2795946581533632e-07,
|
|
"loss": 0.003,
|
|
"num_input_tokens_seen": 9085696,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"epoch": 0.8032077393075356,
|
|
"grad_norm": 12.62264633178711,
|
|
"learning_rate": 2.27018911640268e-07,
|
|
"loss": 0.0713,
|
|
"num_input_tokens_seen": 9090432,
|
|
"step": 9465
|
|
},
|
|
{
|
|
"epoch": 0.8036320434487441,
|
|
"grad_norm": 9.145902633666992,
|
|
"learning_rate": 2.2608005331426982e-07,
|
|
"loss": 0.0628,
|
|
"num_input_tokens_seen": 9094976,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"epoch": 0.8040563475899525,
|
|
"grad_norm": 10.370466232299805,
|
|
"learning_rate": 2.251428928971102e-07,
|
|
"loss": 0.0873,
|
|
"num_input_tokens_seen": 9100096,
|
|
"step": 9475
|
|
},
|
|
{
|
|
"epoch": 0.8044806517311609,
|
|
"grad_norm": 8.401834487915039,
|
|
"learning_rate": 2.2420743244483253e-07,
|
|
"loss": 0.0783,
|
|
"num_input_tokens_seen": 9105408,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"epoch": 0.8049049558723693,
|
|
"grad_norm": 7.3723907470703125,
|
|
"learning_rate": 2.2327367400975051e-07,
|
|
"loss": 0.0437,
|
|
"num_input_tokens_seen": 9110144,
|
|
"step": 9485
|
|
},
|
|
{
|
|
"epoch": 0.8053292600135777,
|
|
"grad_norm": 0.3261149823665619,
|
|
"learning_rate": 2.2234161964044417e-07,
|
|
"loss": 0.0436,
|
|
"num_input_tokens_seen": 9115520,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"epoch": 0.8057535641547862,
|
|
"grad_norm": 0.42327404022216797,
|
|
"learning_rate": 2.2141127138175386e-07,
|
|
"loss": 0.0339,
|
|
"num_input_tokens_seen": 9119808,
|
|
"step": 9495
|
|
},
|
|
{
|
|
"epoch": 0.8061778682959946,
|
|
"grad_norm": 14.385973930358887,
|
|
"learning_rate": 2.2048263127477861e-07,
|
|
"loss": 0.0722,
|
|
"num_input_tokens_seen": 9124672,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 0.806602172437203,
|
|
"grad_norm": 15.243555068969727,
|
|
"learning_rate": 2.195557013568684e-07,
|
|
"loss": 0.0921,
|
|
"num_input_tokens_seen": 9129216,
|
|
"step": 9505
|
|
},
|
|
{
|
|
"epoch": 0.8070264765784114,
|
|
"grad_norm": 1.754729151725769,
|
|
"learning_rate": 2.1863048366162207e-07,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 9133952,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"epoch": 0.8074507807196198,
|
|
"grad_norm": 31.17840576171875,
|
|
"learning_rate": 2.1770698021888145e-07,
|
|
"loss": 0.052,
|
|
"num_input_tokens_seen": 9138240,
|
|
"step": 9515
|
|
},
|
|
{
|
|
"epoch": 0.8078750848608283,
|
|
"grad_norm": 8.251001358032227,
|
|
"learning_rate": 2.167851930547283e-07,
|
|
"loss": 0.0921,
|
|
"num_input_tokens_seen": 9142656,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"epoch": 0.8082993890020367,
|
|
"grad_norm": 2.2644736766815186,
|
|
"learning_rate": 2.1586512419147763e-07,
|
|
"loss": 0.0642,
|
|
"num_input_tokens_seen": 9147456,
|
|
"step": 9525
|
|
},
|
|
{
|
|
"epoch": 0.8087236931432451,
|
|
"grad_norm": 10.39969253540039,
|
|
"learning_rate": 2.149467756476765e-07,
|
|
"loss": 0.0265,
|
|
"num_input_tokens_seen": 9152064,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"epoch": 0.8091479972844535,
|
|
"grad_norm": 18.1652889251709,
|
|
"learning_rate": 2.140301494380956e-07,
|
|
"loss": 0.0699,
|
|
"num_input_tokens_seen": 9156544,
|
|
"step": 9535
|
|
},
|
|
{
|
|
"epoch": 0.8095723014256619,
|
|
"grad_norm": 4.475470542907715,
|
|
"learning_rate": 2.1311524757372901e-07,
|
|
"loss": 0.0371,
|
|
"num_input_tokens_seen": 9161088,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"epoch": 0.8099966055668704,
|
|
"grad_norm": 14.571686744689941,
|
|
"learning_rate": 2.1220207206178685e-07,
|
|
"loss": 0.0588,
|
|
"num_input_tokens_seen": 9165440,
|
|
"step": 9545
|
|
},
|
|
{
|
|
"epoch": 0.8104209097080788,
|
|
"grad_norm": 2.090883731842041,
|
|
"learning_rate": 2.1129062490569106e-07,
|
|
"loss": 0.0273,
|
|
"num_input_tokens_seen": 9170496,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"epoch": 0.8108452138492872,
|
|
"grad_norm": 0.35549458861351013,
|
|
"learning_rate": 2.1038090810507348e-07,
|
|
"loss": 0.0275,
|
|
"num_input_tokens_seen": 9175360,
|
|
"step": 9555
|
|
},
|
|
{
|
|
"epoch": 0.8112695179904956,
|
|
"grad_norm": 25.009790420532227,
|
|
"learning_rate": 2.0947292365576785e-07,
|
|
"loss": 0.051,
|
|
"num_input_tokens_seen": 9179776,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"epoch": 0.811693822131704,
|
|
"grad_norm": 0.32450568675994873,
|
|
"learning_rate": 2.085666735498085e-07,
|
|
"loss": 0.0297,
|
|
"num_input_tokens_seen": 9185536,
|
|
"step": 9565
|
|
},
|
|
{
|
|
"epoch": 0.8121181262729125,
|
|
"grad_norm": 11.590928077697754,
|
|
"learning_rate": 2.0766215977542435e-07,
|
|
"loss": 0.0289,
|
|
"num_input_tokens_seen": 9190528,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"epoch": 0.8125424304141209,
|
|
"grad_norm": 46.6675910949707,
|
|
"learning_rate": 2.0675938431703532e-07,
|
|
"loss": 0.0969,
|
|
"num_input_tokens_seen": 9195264,
|
|
"step": 9575
|
|
},
|
|
{
|
|
"epoch": 0.8129667345553293,
|
|
"grad_norm": 18.320871353149414,
|
|
"learning_rate": 2.0585834915524646e-07,
|
|
"loss": 0.0797,
|
|
"num_input_tokens_seen": 9200192,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"epoch": 0.8133910386965377,
|
|
"grad_norm": 0.4181262254714966,
|
|
"learning_rate": 2.0495905626684674e-07,
|
|
"loss": 0.0214,
|
|
"num_input_tokens_seen": 9205056,
|
|
"step": 9585
|
|
},
|
|
{
|
|
"epoch": 0.8138153428377461,
|
|
"grad_norm": 23.465944290161133,
|
|
"learning_rate": 2.0406150762480089e-07,
|
|
"loss": 0.0363,
|
|
"num_input_tokens_seen": 9209856,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"epoch": 0.8142396469789546,
|
|
"grad_norm": 13.115691184997559,
|
|
"learning_rate": 2.0316570519824806e-07,
|
|
"loss": 0.0478,
|
|
"num_input_tokens_seen": 9214464,
|
|
"step": 9595
|
|
},
|
|
{
|
|
"epoch": 0.814663951120163,
|
|
"grad_norm": 8.08517837524414,
|
|
"learning_rate": 2.0227165095249564e-07,
|
|
"loss": 0.0483,
|
|
"num_input_tokens_seen": 9219072,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"epoch": 0.8150882552613713,
|
|
"grad_norm": 1.0336787700653076,
|
|
"learning_rate": 2.0137934684901636e-07,
|
|
"loss": 0.0728,
|
|
"num_input_tokens_seen": 9224768,
|
|
"step": 9605
|
|
},
|
|
{
|
|
"epoch": 0.8155125594025797,
|
|
"grad_norm": 39.788063049316406,
|
|
"learning_rate": 2.0048879484544279e-07,
|
|
"loss": 0.0564,
|
|
"num_input_tokens_seen": 9229696,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"epoch": 0.8159368635437881,
|
|
"grad_norm": 0.04208716005086899,
|
|
"learning_rate": 1.9959999689556407e-07,
|
|
"loss": 0.095,
|
|
"num_input_tokens_seen": 9235072,
|
|
"step": 9615
|
|
},
|
|
{
|
|
"epoch": 0.8163611676849966,
|
|
"grad_norm": 0.4682616889476776,
|
|
"learning_rate": 1.9871295494931994e-07,
|
|
"loss": 0.0964,
|
|
"num_input_tokens_seen": 9240320,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"epoch": 0.816785471826205,
|
|
"grad_norm": 0.28814592957496643,
|
|
"learning_rate": 1.978276709527994e-07,
|
|
"loss": 0.0296,
|
|
"num_input_tokens_seen": 9244928,
|
|
"step": 9625
|
|
},
|
|
{
|
|
"epoch": 0.8172097759674134,
|
|
"grad_norm": 13.587329864501953,
|
|
"learning_rate": 1.9694414684823313e-07,
|
|
"loss": 0.1009,
|
|
"num_input_tokens_seen": 9249792,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"epoch": 0.8176340801086218,
|
|
"grad_norm": 1.9363512992858887,
|
|
"learning_rate": 1.960623845739914e-07,
|
|
"loss": 0.0672,
|
|
"num_input_tokens_seen": 9254848,
|
|
"step": 9635
|
|
},
|
|
{
|
|
"epoch": 0.8180583842498302,
|
|
"grad_norm": 17.151487350463867,
|
|
"learning_rate": 1.9518238606457925e-07,
|
|
"loss": 0.0419,
|
|
"num_input_tokens_seen": 9259392,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"epoch": 0.8184826883910387,
|
|
"grad_norm": 0.4146888852119446,
|
|
"learning_rate": 1.943041532506322e-07,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 9263872,
|
|
"step": 9645
|
|
},
|
|
{
|
|
"epoch": 0.8189069925322471,
|
|
"grad_norm": 17.29326820373535,
|
|
"learning_rate": 1.9342768805891173e-07,
|
|
"loss": 0.0404,
|
|
"num_input_tokens_seen": 9268800,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"epoch": 0.8193312966734555,
|
|
"grad_norm": 9.275403022766113,
|
|
"learning_rate": 1.9255299241230182e-07,
|
|
"loss": 0.1197,
|
|
"num_input_tokens_seen": 9273408,
|
|
"step": 9655
|
|
},
|
|
{
|
|
"epoch": 0.8197556008146639,
|
|
"grad_norm": 13.459712982177734,
|
|
"learning_rate": 1.91680068229803e-07,
|
|
"loss": 0.0605,
|
|
"num_input_tokens_seen": 9278208,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"epoch": 0.8201799049558723,
|
|
"grad_norm": 0.23458638787269592,
|
|
"learning_rate": 1.9080891742653105e-07,
|
|
"loss": 0.0632,
|
|
"num_input_tokens_seen": 9282944,
|
|
"step": 9665
|
|
},
|
|
{
|
|
"epoch": 0.8206042090970808,
|
|
"grad_norm": 0.45489996671676636,
|
|
"learning_rate": 1.8993954191371042e-07,
|
|
"loss": 0.01,
|
|
"num_input_tokens_seen": 9288064,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"epoch": 0.8210285132382892,
|
|
"grad_norm": 0.10732559859752655,
|
|
"learning_rate": 1.8907194359866986e-07,
|
|
"loss": 0.0302,
|
|
"num_input_tokens_seen": 9293120,
|
|
"step": 9675
|
|
},
|
|
{
|
|
"epoch": 0.8214528173794976,
|
|
"grad_norm": 15.584017753601074,
|
|
"learning_rate": 1.8820612438484075e-07,
|
|
"loss": 0.0712,
|
|
"num_input_tokens_seen": 9297472,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"epoch": 0.821877121520706,
|
|
"grad_norm": 8.304574966430664,
|
|
"learning_rate": 1.8734208617174986e-07,
|
|
"loss": 0.0562,
|
|
"num_input_tokens_seen": 9302144,
|
|
"step": 9685
|
|
},
|
|
{
|
|
"epoch": 0.8223014256619144,
|
|
"grad_norm": 0.9647732377052307,
|
|
"learning_rate": 1.864798308550173e-07,
|
|
"loss": 0.0651,
|
|
"num_input_tokens_seen": 9307200,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"epoch": 0.8227257298031229,
|
|
"grad_norm": 11.009183883666992,
|
|
"learning_rate": 1.856193603263515e-07,
|
|
"loss": 0.0223,
|
|
"num_input_tokens_seen": 9312384,
|
|
"step": 9695
|
|
},
|
|
{
|
|
"epoch": 0.8231500339443313,
|
|
"grad_norm": 2.9790282249450684,
|
|
"learning_rate": 1.8476067647354553e-07,
|
|
"loss": 0.0088,
|
|
"num_input_tokens_seen": 9317120,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"epoch": 0.8235743380855397,
|
|
"grad_norm": 40.640438079833984,
|
|
"learning_rate": 1.8390378118047213e-07,
|
|
"loss": 0.0793,
|
|
"num_input_tokens_seen": 9321664,
|
|
"step": 9705
|
|
},
|
|
{
|
|
"epoch": 0.8239986422267481,
|
|
"grad_norm": 21.46381950378418,
|
|
"learning_rate": 1.8304867632708077e-07,
|
|
"loss": 0.0337,
|
|
"num_input_tokens_seen": 9326208,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"epoch": 0.8244229463679565,
|
|
"grad_norm": 0.20169152319431305,
|
|
"learning_rate": 1.821953637893917e-07,
|
|
"loss": 0.0883,
|
|
"num_input_tokens_seen": 9331264,
|
|
"step": 9715
|
|
},
|
|
{
|
|
"epoch": 0.824847250509165,
|
|
"grad_norm": 10.603561401367188,
|
|
"learning_rate": 1.8134384543949478e-07,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 9336704,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"epoch": 0.8252715546503734,
|
|
"grad_norm": 8.128856658935547,
|
|
"learning_rate": 1.804941231455417e-07,
|
|
"loss": 0.093,
|
|
"num_input_tokens_seen": 9342016,
|
|
"step": 9725
|
|
},
|
|
{
|
|
"epoch": 0.8256958587915818,
|
|
"grad_norm": 15.758296012878418,
|
|
"learning_rate": 1.7964619877174513e-07,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 9346752,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"epoch": 0.8261201629327902,
|
|
"grad_norm": 0.16209304332733154,
|
|
"learning_rate": 1.788000741783725e-07,
|
|
"loss": 0.0442,
|
|
"num_input_tokens_seen": 9351296,
|
|
"step": 9735
|
|
},
|
|
{
|
|
"epoch": 0.8265444670739986,
|
|
"grad_norm": 11.122468948364258,
|
|
"learning_rate": 1.7795575122174323e-07,
|
|
"loss": 0.0757,
|
|
"num_input_tokens_seen": 9355712,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"epoch": 0.8269687712152071,
|
|
"grad_norm": 6.385353088378906,
|
|
"learning_rate": 1.7711323175422376e-07,
|
|
"loss": 0.0848,
|
|
"num_input_tokens_seen": 9360384,
|
|
"step": 9745
|
|
},
|
|
{
|
|
"epoch": 0.8273930753564155,
|
|
"grad_norm": 8.261764526367188,
|
|
"learning_rate": 1.7627251762422413e-07,
|
|
"loss": 0.0516,
|
|
"num_input_tokens_seen": 9364608,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"epoch": 0.8278173794976239,
|
|
"grad_norm": 11.625663757324219,
|
|
"learning_rate": 1.7543361067619267e-07,
|
|
"loss": 0.0178,
|
|
"num_input_tokens_seen": 9369728,
|
|
"step": 9755
|
|
},
|
|
{
|
|
"epoch": 0.8282416836388323,
|
|
"grad_norm": 8.714664459228516,
|
|
"learning_rate": 1.7459651275061483e-07,
|
|
"loss": 0.0882,
|
|
"num_input_tokens_seen": 9374592,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"epoch": 0.8286659877800407,
|
|
"grad_norm": 9.188426971435547,
|
|
"learning_rate": 1.737612256840053e-07,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 9380160,
|
|
"step": 9765
|
|
},
|
|
{
|
|
"epoch": 0.8290902919212492,
|
|
"grad_norm": 16.724620819091797,
|
|
"learning_rate": 1.729277513089068e-07,
|
|
"loss": 0.0598,
|
|
"num_input_tokens_seen": 9385216,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"epoch": 0.8295145960624576,
|
|
"grad_norm": 37.224849700927734,
|
|
"learning_rate": 1.7209609145388538e-07,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 9390080,
|
|
"step": 9775
|
|
},
|
|
{
|
|
"epoch": 0.829938900203666,
|
|
"grad_norm": 12.946894645690918,
|
|
"learning_rate": 1.7126624794352563e-07,
|
|
"loss": 0.0181,
|
|
"num_input_tokens_seen": 9394304,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"epoch": 0.8303632043448744,
|
|
"grad_norm": 2.938326835632324,
|
|
"learning_rate": 1.7043822259842766e-07,
|
|
"loss": 0.0626,
|
|
"num_input_tokens_seen": 9399360,
|
|
"step": 9785
|
|
},
|
|
{
|
|
"epoch": 0.8307875084860828,
|
|
"grad_norm": 0.6280015707015991,
|
|
"learning_rate": 1.6961201723520247e-07,
|
|
"loss": 0.0377,
|
|
"num_input_tokens_seen": 9404352,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"epoch": 0.8312118126272913,
|
|
"grad_norm": 13.002229690551758,
|
|
"learning_rate": 1.6878763366646832e-07,
|
|
"loss": 0.0396,
|
|
"num_input_tokens_seen": 9409152,
|
|
"step": 9795
|
|
},
|
|
{
|
|
"epoch": 0.8316361167684997,
|
|
"grad_norm": 8.750516891479492,
|
|
"learning_rate": 1.6796507370084656e-07,
|
|
"loss": 0.0491,
|
|
"num_input_tokens_seen": 9414208,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"epoch": 0.8320604209097081,
|
|
"grad_norm": 0.28925469517707825,
|
|
"learning_rate": 1.671443391429581e-07,
|
|
"loss": 0.0614,
|
|
"num_input_tokens_seen": 9419008,
|
|
"step": 9805
|
|
},
|
|
{
|
|
"epoch": 0.8324847250509165,
|
|
"grad_norm": 0.9624937176704407,
|
|
"learning_rate": 1.6632543179341772e-07,
|
|
"loss": 0.0349,
|
|
"num_input_tokens_seen": 9424192,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"epoch": 0.832909029192125,
|
|
"grad_norm": 39.57783508300781,
|
|
"learning_rate": 1.6550835344883364e-07,
|
|
"loss": 0.0264,
|
|
"num_input_tokens_seen": 9429056,
|
|
"step": 9815
|
|
},
|
|
{
|
|
"epoch": 0.8333333333333334,
|
|
"grad_norm": 15.192575454711914,
|
|
"learning_rate": 1.646931059017994e-07,
|
|
"loss": 0.0433,
|
|
"num_input_tokens_seen": 9433024,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"epoch": 0.8337576374745418,
|
|
"grad_norm": 19.527372360229492,
|
|
"learning_rate": 1.6387969094089317e-07,
|
|
"loss": 0.0432,
|
|
"num_input_tokens_seen": 9437696,
|
|
"step": 9825
|
|
},
|
|
{
|
|
"epoch": 0.8341819416157502,
|
|
"grad_norm": 0.6833714246749878,
|
|
"learning_rate": 1.6306811035067203e-07,
|
|
"loss": 0.0486,
|
|
"num_input_tokens_seen": 9442944,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"epoch": 0.8346062457569586,
|
|
"grad_norm": 0.6262238025665283,
|
|
"learning_rate": 1.6225836591166886e-07,
|
|
"loss": 0.0475,
|
|
"num_input_tokens_seen": 9447680,
|
|
"step": 9835
|
|
},
|
|
{
|
|
"epoch": 0.835030549898167,
|
|
"grad_norm": 1.716418981552124,
|
|
"learning_rate": 1.6145045940038803e-07,
|
|
"loss": 0.0774,
|
|
"num_input_tokens_seen": 9452288,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"epoch": 0.8354548540393755,
|
|
"grad_norm": 1.837746500968933,
|
|
"learning_rate": 1.6064439258930217e-07,
|
|
"loss": 0.0675,
|
|
"num_input_tokens_seen": 9456896,
|
|
"step": 9845
|
|
},
|
|
{
|
|
"epoch": 0.8358791581805839,
|
|
"grad_norm": 28.929399490356445,
|
|
"learning_rate": 1.5984016724684658e-07,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 9461632,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"epoch": 0.8363034623217923,
|
|
"grad_norm": 0.06956404447555542,
|
|
"learning_rate": 1.5903778513741816e-07,
|
|
"loss": 0.073,
|
|
"num_input_tokens_seen": 9466560,
|
|
"step": 9855
|
|
},
|
|
{
|
|
"epoch": 0.8367277664630007,
|
|
"grad_norm": 0.49247103929519653,
|
|
"learning_rate": 1.5823724802136862e-07,
|
|
"loss": 0.0395,
|
|
"num_input_tokens_seen": 9471168,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"epoch": 0.837152070604209,
|
|
"grad_norm": 11.84592342376709,
|
|
"learning_rate": 1.5743855765500258e-07,
|
|
"loss": 0.1041,
|
|
"num_input_tokens_seen": 9475968,
|
|
"step": 9865
|
|
},
|
|
{
|
|
"epoch": 0.8375763747454175,
|
|
"grad_norm": 19.03890609741211,
|
|
"learning_rate": 1.5664171579057273e-07,
|
|
"loss": 0.0334,
|
|
"num_input_tokens_seen": 9481280,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"epoch": 0.8380006788866259,
|
|
"grad_norm": 1.0064212083816528,
|
|
"learning_rate": 1.5584672417627665e-07,
|
|
"loss": 0.0259,
|
|
"num_input_tokens_seen": 9485952,
|
|
"step": 9875
|
|
},
|
|
{
|
|
"epoch": 0.8384249830278343,
|
|
"grad_norm": 1.3949761390686035,
|
|
"learning_rate": 1.5505358455625229e-07,
|
|
"loss": 0.0627,
|
|
"num_input_tokens_seen": 9490048,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"epoch": 0.8388492871690427,
|
|
"grad_norm": 17.959156036376953,
|
|
"learning_rate": 1.5426229867057516e-07,
|
|
"loss": 0.0839,
|
|
"num_input_tokens_seen": 9495360,
|
|
"step": 9885
|
|
},
|
|
{
|
|
"epoch": 0.8392735913102511,
|
|
"grad_norm": 13.035515785217285,
|
|
"learning_rate": 1.5347286825525252e-07,
|
|
"loss": 0.0968,
|
|
"num_input_tokens_seen": 9499968,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"epoch": 0.8396978954514596,
|
|
"grad_norm": 8.93185806274414,
|
|
"learning_rate": 1.526852950422226e-07,
|
|
"loss": 0.0538,
|
|
"num_input_tokens_seen": 9504704,
|
|
"step": 9895
|
|
},
|
|
{
|
|
"epoch": 0.840122199592668,
|
|
"grad_norm": 0.12074297666549683,
|
|
"learning_rate": 1.5189958075934771e-07,
|
|
"loss": 0.0269,
|
|
"num_input_tokens_seen": 9509184,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"epoch": 0.8405465037338764,
|
|
"grad_norm": 14.160613059997559,
|
|
"learning_rate": 1.5111572713041253e-07,
|
|
"loss": 0.0477,
|
|
"num_input_tokens_seen": 9514048,
|
|
"step": 9905
|
|
},
|
|
{
|
|
"epoch": 0.8409708078750848,
|
|
"grad_norm": 7.754039764404297,
|
|
"learning_rate": 1.5033373587511944e-07,
|
|
"loss": 0.0206,
|
|
"num_input_tokens_seen": 9519104,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"epoch": 0.8413951120162932,
|
|
"grad_norm": 0.5104118585586548,
|
|
"learning_rate": 1.4955360870908505e-07,
|
|
"loss": 0.0651,
|
|
"num_input_tokens_seen": 9523840,
|
|
"step": 9915
|
|
},
|
|
{
|
|
"epoch": 0.8418194161575017,
|
|
"grad_norm": 8.219719886779785,
|
|
"learning_rate": 1.4877534734383624e-07,
|
|
"loss": 0.0562,
|
|
"num_input_tokens_seen": 9528384,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"epoch": 0.8422437202987101,
|
|
"grad_norm": 19.17828369140625,
|
|
"learning_rate": 1.4799895348680647e-07,
|
|
"loss": 0.0745,
|
|
"num_input_tokens_seen": 9533184,
|
|
"step": 9925
|
|
},
|
|
{
|
|
"epoch": 0.8426680244399185,
|
|
"grad_norm": 0.587627112865448,
|
|
"learning_rate": 1.4722442884133214e-07,
|
|
"loss": 0.0446,
|
|
"num_input_tokens_seen": 9538944,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"epoch": 0.8430923285811269,
|
|
"grad_norm": 2.815011739730835,
|
|
"learning_rate": 1.4645177510664886e-07,
|
|
"loss": 0.0198,
|
|
"num_input_tokens_seen": 9543296,
|
|
"step": 9935
|
|
},
|
|
{
|
|
"epoch": 0.8435166327223353,
|
|
"grad_norm": 0.25716152787208557,
|
|
"learning_rate": 1.4568099397788746e-07,
|
|
"loss": 0.0716,
|
|
"num_input_tokens_seen": 9547840,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"epoch": 0.8439409368635438,
|
|
"grad_norm": 1.403857946395874,
|
|
"learning_rate": 1.4491208714607016e-07,
|
|
"loss": 0.096,
|
|
"num_input_tokens_seen": 9552704,
|
|
"step": 9945
|
|
},
|
|
{
|
|
"epoch": 0.8443652410047522,
|
|
"grad_norm": 22.68338966369629,
|
|
"learning_rate": 1.4414505629810813e-07,
|
|
"loss": 0.0814,
|
|
"num_input_tokens_seen": 9558272,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"epoch": 0.8447895451459606,
|
|
"grad_norm": 30.75401496887207,
|
|
"learning_rate": 1.433799031167957e-07,
|
|
"loss": 0.104,
|
|
"num_input_tokens_seen": 9563264,
|
|
"step": 9955
|
|
},
|
|
{
|
|
"epoch": 0.845213849287169,
|
|
"grad_norm": 0.21932555735111237,
|
|
"learning_rate": 1.426166292808083e-07,
|
|
"loss": 0.0254,
|
|
"num_input_tokens_seen": 9567680,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"epoch": 0.8456381534283774,
|
|
"grad_norm": 0.2317563146352768,
|
|
"learning_rate": 1.4185523646469821e-07,
|
|
"loss": 0.039,
|
|
"num_input_tokens_seen": 9572608,
|
|
"step": 9965
|
|
},
|
|
{
|
|
"epoch": 0.8460624575695859,
|
|
"grad_norm": 13.577017784118652,
|
|
"learning_rate": 1.410957263388909e-07,
|
|
"loss": 0.0657,
|
|
"num_input_tokens_seen": 9577088,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"epoch": 0.8464867617107943,
|
|
"grad_norm": 0.7426764369010925,
|
|
"learning_rate": 1.4033810056968155e-07,
|
|
"loss": 0.0426,
|
|
"num_input_tokens_seen": 9581952,
|
|
"step": 9975
|
|
},
|
|
{
|
|
"epoch": 0.8469110658520027,
|
|
"grad_norm": 0.2905764579772949,
|
|
"learning_rate": 1.3958236081923102e-07,
|
|
"loss": 0.0968,
|
|
"num_input_tokens_seen": 9586496,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"epoch": 0.8473353699932111,
|
|
"grad_norm": 0.06987245380878448,
|
|
"learning_rate": 1.3882850874556207e-07,
|
|
"loss": 0.0292,
|
|
"num_input_tokens_seen": 9591296,
|
|
"step": 9985
|
|
},
|
|
{
|
|
"epoch": 0.8477596741344195,
|
|
"grad_norm": 0.7430753707885742,
|
|
"learning_rate": 1.3807654600255713e-07,
|
|
"loss": 0.0629,
|
|
"num_input_tokens_seen": 9595904,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"epoch": 0.848183978275628,
|
|
"grad_norm": 62.958980560302734,
|
|
"learning_rate": 1.373264742399526e-07,
|
|
"loss": 0.127,
|
|
"num_input_tokens_seen": 9600640,
|
|
"step": 9995
|
|
},
|
|
{
|
|
"epoch": 0.8486082824168364,
|
|
"grad_norm": 0.3665136694908142,
|
|
"learning_rate": 1.3657829510333652e-07,
|
|
"loss": 0.0693,
|
|
"num_input_tokens_seen": 9605056,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 0.8490325865580448,
|
|
"grad_norm": 7.840846061706543,
|
|
"learning_rate": 1.3583201023414493e-07,
|
|
"loss": 0.0068,
|
|
"num_input_tokens_seen": 9610112,
|
|
"step": 10005
|
|
},
|
|
{
|
|
"epoch": 0.8494568906992532,
|
|
"grad_norm": 0.26021483540534973,
|
|
"learning_rate": 1.350876212696579e-07,
|
|
"loss": 0.0306,
|
|
"num_input_tokens_seen": 9615744,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"epoch": 0.8498811948404617,
|
|
"grad_norm": 17.282150268554688,
|
|
"learning_rate": 1.3434512984299596e-07,
|
|
"loss": 0.0076,
|
|
"num_input_tokens_seen": 9620288,
|
|
"step": 10015
|
|
},
|
|
{
|
|
"epoch": 0.8503054989816701,
|
|
"grad_norm": 0.36491698026657104,
|
|
"learning_rate": 1.3360453758311686e-07,
|
|
"loss": 0.0939,
|
|
"num_input_tokens_seen": 9625024,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"epoch": 0.8507298031228785,
|
|
"grad_norm": 0.6906639337539673,
|
|
"learning_rate": 1.32865846114811e-07,
|
|
"loss": 0.0308,
|
|
"num_input_tokens_seen": 9630144,
|
|
"step": 10025
|
|
},
|
|
{
|
|
"epoch": 0.8511541072640869,
|
|
"grad_norm": 36.29147720336914,
|
|
"learning_rate": 1.321290570586999e-07,
|
|
"loss": 0.0761,
|
|
"num_input_tokens_seen": 9634624,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"epoch": 0.8511541072640869,
|
|
"eval_loss": 0.052773453295230865,
|
|
"eval_runtime": 16.7153,
|
|
"eval_samples_per_second": 626.672,
|
|
"eval_steps_per_second": 78.371,
|
|
"num_input_tokens_seen": 9634624,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"epoch": 0.8515784114052953,
|
|
"grad_norm": 0.11966277658939362,
|
|
"learning_rate": 1.3139417203123027e-07,
|
|
"loss": 0.012,
|
|
"num_input_tokens_seen": 9639744,
|
|
"step": 10035
|
|
},
|
|
{
|
|
"epoch": 0.8520027155465038,
|
|
"grad_norm": 0.27449285984039307,
|
|
"learning_rate": 1.306611926446718e-07,
|
|
"loss": 0.0175,
|
|
"num_input_tokens_seen": 9644480,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"epoch": 0.8524270196877122,
|
|
"grad_norm": 0.16298529505729675,
|
|
"learning_rate": 1.2993012050711406e-07,
|
|
"loss": 0.0472,
|
|
"num_input_tokens_seen": 9649408,
|
|
"step": 10045
|
|
},
|
|
{
|
|
"epoch": 0.8528513238289206,
|
|
"grad_norm": 10.504528999328613,
|
|
"learning_rate": 1.292009572224614e-07,
|
|
"loss": 0.0905,
|
|
"num_input_tokens_seen": 9653440,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"epoch": 0.853275627970129,
|
|
"grad_norm": 2.1599643230438232,
|
|
"learning_rate": 1.284737043904306e-07,
|
|
"loss": 0.0366,
|
|
"num_input_tokens_seen": 9658176,
|
|
"step": 10055
|
|
},
|
|
{
|
|
"epoch": 0.8536999321113374,
|
|
"grad_norm": 0.09390539675951004,
|
|
"learning_rate": 1.2774836360654717e-07,
|
|
"loss": 0.089,
|
|
"num_input_tokens_seen": 9662848,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"epoch": 0.8541242362525459,
|
|
"grad_norm": 33.03879165649414,
|
|
"learning_rate": 1.2702493646214207e-07,
|
|
"loss": 0.0755,
|
|
"num_input_tokens_seen": 9667392,
|
|
"step": 10065
|
|
},
|
|
{
|
|
"epoch": 0.8545485403937543,
|
|
"grad_norm": 16.217477798461914,
|
|
"learning_rate": 1.2630342454434728e-07,
|
|
"loss": 0.153,
|
|
"num_input_tokens_seen": 9672384,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"epoch": 0.8549728445349627,
|
|
"grad_norm": 11.52825927734375,
|
|
"learning_rate": 1.2558382943609357e-07,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 9677248,
|
|
"step": 10075
|
|
},
|
|
{
|
|
"epoch": 0.8553971486761711,
|
|
"grad_norm": 17.614547729492188,
|
|
"learning_rate": 1.2486615271610558e-07,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 9681536,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"epoch": 0.8558214528173795,
|
|
"grad_norm": 33.50824737548828,
|
|
"learning_rate": 1.241503959589003e-07,
|
|
"loss": 0.0862,
|
|
"num_input_tokens_seen": 9686592,
|
|
"step": 10085
|
|
},
|
|
{
|
|
"epoch": 0.856245756958588,
|
|
"grad_norm": 16.574861526489258,
|
|
"learning_rate": 1.234365607347816e-07,
|
|
"loss": 0.0419,
|
|
"num_input_tokens_seen": 9691648,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"epoch": 0.8566700610997964,
|
|
"grad_norm": 27.496414184570312,
|
|
"learning_rate": 1.22724648609838e-07,
|
|
"loss": 0.0414,
|
|
"num_input_tokens_seen": 9696768,
|
|
"step": 10095
|
|
},
|
|
{
|
|
"epoch": 0.8570943652410048,
|
|
"grad_norm": 9.780006408691406,
|
|
"learning_rate": 1.2201466114593884e-07,
|
|
"loss": 0.0586,
|
|
"num_input_tokens_seen": 9701376,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"epoch": 0.8575186693822132,
|
|
"grad_norm": 0.3296249508857727,
|
|
"learning_rate": 1.2130659990073144e-07,
|
|
"loss": 0.0624,
|
|
"num_input_tokens_seen": 9705408,
|
|
"step": 10105
|
|
},
|
|
{
|
|
"epoch": 0.8579429735234216,
|
|
"grad_norm": 19.46929931640625,
|
|
"learning_rate": 1.206004664276359e-07,
|
|
"loss": 0.0695,
|
|
"num_input_tokens_seen": 9709824,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"epoch": 0.8583672776646301,
|
|
"grad_norm": 0.7439113259315491,
|
|
"learning_rate": 1.198962622758447e-07,
|
|
"loss": 0.084,
|
|
"num_input_tokens_seen": 9715072,
|
|
"step": 10115
|
|
},
|
|
{
|
|
"epoch": 0.8587915818058385,
|
|
"grad_norm": 6.692914962768555,
|
|
"learning_rate": 1.1919398899031585e-07,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 9720000,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"epoch": 0.8592158859470469,
|
|
"grad_norm": 0.35738176107406616,
|
|
"learning_rate": 1.1849364811177288e-07,
|
|
"loss": 0.0035,
|
|
"num_input_tokens_seen": 9724288,
|
|
"step": 10125
|
|
},
|
|
{
|
|
"epoch": 0.8596401900882552,
|
|
"grad_norm": 0.8352645039558411,
|
|
"learning_rate": 1.1779524117669837e-07,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 9729280,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"epoch": 0.8600644942294636,
|
|
"grad_norm": 9.030378341674805,
|
|
"learning_rate": 1.1709876971733269e-07,
|
|
"loss": 0.0709,
|
|
"num_input_tokens_seen": 9733696,
|
|
"step": 10135
|
|
},
|
|
{
|
|
"epoch": 0.860488798370672,
|
|
"grad_norm": 0.14551109075546265,
|
|
"learning_rate": 1.1640423526166987e-07,
|
|
"loss": 0.0245,
|
|
"num_input_tokens_seen": 9738624,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"epoch": 0.8609131025118805,
|
|
"grad_norm": 0.5628594160079956,
|
|
"learning_rate": 1.1571163933345462e-07,
|
|
"loss": 0.0646,
|
|
"num_input_tokens_seen": 9743488,
|
|
"step": 10145
|
|
},
|
|
{
|
|
"epoch": 0.8613374066530889,
|
|
"grad_norm": 3.808892011642456,
|
|
"learning_rate": 1.150209834521777e-07,
|
|
"loss": 0.0162,
|
|
"num_input_tokens_seen": 9749632,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"epoch": 0.8617617107942973,
|
|
"grad_norm": 12.639211654663086,
|
|
"learning_rate": 1.1433226913307514e-07,
|
|
"loss": 0.0287,
|
|
"num_input_tokens_seen": 9754432,
|
|
"step": 10155
|
|
},
|
|
{
|
|
"epoch": 0.8621860149355057,
|
|
"grad_norm": 0.8272728323936462,
|
|
"learning_rate": 1.1364549788712185e-07,
|
|
"loss": 0.0365,
|
|
"num_input_tokens_seen": 9759168,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"epoch": 0.8626103190767141,
|
|
"grad_norm": 1.1137412786483765,
|
|
"learning_rate": 1.1296067122103059e-07,
|
|
"loss": 0.0108,
|
|
"num_input_tokens_seen": 9764096,
|
|
"step": 10165
|
|
},
|
|
{
|
|
"epoch": 0.8630346232179226,
|
|
"grad_norm": 0.6005673408508301,
|
|
"learning_rate": 1.1227779063724818e-07,
|
|
"loss": 0.0415,
|
|
"num_input_tokens_seen": 9768768,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"epoch": 0.863458927359131,
|
|
"grad_norm": 16.006635665893555,
|
|
"learning_rate": 1.115968576339511e-07,
|
|
"loss": 0.0172,
|
|
"num_input_tokens_seen": 9773184,
|
|
"step": 10175
|
|
},
|
|
{
|
|
"epoch": 0.8638832315003394,
|
|
"grad_norm": 8.376733779907227,
|
|
"learning_rate": 1.1091787370504347e-07,
|
|
"loss": 0.0685,
|
|
"num_input_tokens_seen": 9778688,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"epoch": 0.8643075356415478,
|
|
"grad_norm": 28.529516220092773,
|
|
"learning_rate": 1.1024084034015347e-07,
|
|
"loss": 0.0368,
|
|
"num_input_tokens_seen": 9783168,
|
|
"step": 10185
|
|
},
|
|
{
|
|
"epoch": 0.8647318397827563,
|
|
"grad_norm": 16.253131866455078,
|
|
"learning_rate": 1.095657590246295e-07,
|
|
"loss": 0.0582,
|
|
"num_input_tokens_seen": 9787712,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"epoch": 0.8651561439239647,
|
|
"grad_norm": 13.5687894821167,
|
|
"learning_rate": 1.0889263123953773e-07,
|
|
"loss": 0.049,
|
|
"num_input_tokens_seen": 9792384,
|
|
"step": 10195
|
|
},
|
|
{
|
|
"epoch": 0.8655804480651731,
|
|
"grad_norm": 7.556813716888428,
|
|
"learning_rate": 1.0822145846165853e-07,
|
|
"loss": 0.1039,
|
|
"num_input_tokens_seen": 9797824,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"epoch": 0.8660047522063815,
|
|
"grad_norm": 4.758289337158203,
|
|
"learning_rate": 1.0755224216348235e-07,
|
|
"loss": 0.1392,
|
|
"num_input_tokens_seen": 9802880,
|
|
"step": 10205
|
|
},
|
|
{
|
|
"epoch": 0.8664290563475899,
|
|
"grad_norm": 10.589117050170898,
|
|
"learning_rate": 1.0688498381320854e-07,
|
|
"loss": 0.0936,
|
|
"num_input_tokens_seen": 9807424,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"epoch": 0.8668533604887984,
|
|
"grad_norm": 38.80707550048828,
|
|
"learning_rate": 1.0621968487473975e-07,
|
|
"loss": 0.0622,
|
|
"num_input_tokens_seen": 9812480,
|
|
"step": 10215
|
|
},
|
|
{
|
|
"epoch": 0.8672776646300068,
|
|
"grad_norm": 27.11156463623047,
|
|
"learning_rate": 1.0555634680768066e-07,
|
|
"loss": 0.0179,
|
|
"num_input_tokens_seen": 9816960,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"epoch": 0.8677019687712152,
|
|
"grad_norm": 0.24414733052253723,
|
|
"learning_rate": 1.0489497106733347e-07,
|
|
"loss": 0.0195,
|
|
"num_input_tokens_seen": 9821568,
|
|
"step": 10225
|
|
},
|
|
{
|
|
"epoch": 0.8681262729124236,
|
|
"grad_norm": 11.67682933807373,
|
|
"learning_rate": 1.0423555910469561e-07,
|
|
"loss": 0.0823,
|
|
"num_input_tokens_seen": 9826048,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"epoch": 0.868550577053632,
|
|
"grad_norm": 1.417206883430481,
|
|
"learning_rate": 1.0357811236645597e-07,
|
|
"loss": 0.0293,
|
|
"num_input_tokens_seen": 9830720,
|
|
"step": 10235
|
|
},
|
|
{
|
|
"epoch": 0.8689748811948405,
|
|
"grad_norm": 2.3976857662200928,
|
|
"learning_rate": 1.0292263229499209e-07,
|
|
"loss": 0.0659,
|
|
"num_input_tokens_seen": 9835648,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"epoch": 0.8693991853360489,
|
|
"grad_norm": 10.028768539428711,
|
|
"learning_rate": 1.022691203283661e-07,
|
|
"loss": 0.1169,
|
|
"num_input_tokens_seen": 9839936,
|
|
"step": 10245
|
|
},
|
|
{
|
|
"epoch": 0.8698234894772573,
|
|
"grad_norm": 0.18101167678833008,
|
|
"learning_rate": 1.0161757790032355e-07,
|
|
"loss": 0.0551,
|
|
"num_input_tokens_seen": 9844608,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"epoch": 0.8702477936184657,
|
|
"grad_norm": 14.05859088897705,
|
|
"learning_rate": 1.0096800644028791e-07,
|
|
"loss": 0.0281,
|
|
"num_input_tokens_seen": 9848896,
|
|
"step": 10255
|
|
},
|
|
{
|
|
"epoch": 0.8706720977596741,
|
|
"grad_norm": 7.950769424438477,
|
|
"learning_rate": 1.003204073733589e-07,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 9853184,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"epoch": 0.8710964019008826,
|
|
"grad_norm": 0.30992770195007324,
|
|
"learning_rate": 9.967478212030923e-08,
|
|
"loss": 0.0794,
|
|
"num_input_tokens_seen": 9857856,
|
|
"step": 10265
|
|
},
|
|
{
|
|
"epoch": 0.871520706042091,
|
|
"grad_norm": 19.903745651245117,
|
|
"learning_rate": 9.903113209758096e-08,
|
|
"loss": 0.0708,
|
|
"num_input_tokens_seen": 9862592,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"epoch": 0.8719450101832994,
|
|
"grad_norm": 15.889472007751465,
|
|
"learning_rate": 9.838945871728266e-08,
|
|
"loss": 0.0589,
|
|
"num_input_tokens_seen": 9867584,
|
|
"step": 10275
|
|
},
|
|
{
|
|
"epoch": 0.8723693143245078,
|
|
"grad_norm": 27.542219161987305,
|
|
"learning_rate": 9.774976338718677e-08,
|
|
"loss": 0.0168,
|
|
"num_input_tokens_seen": 9872384,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"epoch": 0.8727936184657162,
|
|
"grad_norm": 33.8953971862793,
|
|
"learning_rate": 9.711204751072499e-08,
|
|
"loss": 0.0941,
|
|
"num_input_tokens_seen": 9876672,
|
|
"step": 10285
|
|
},
|
|
{
|
|
"epoch": 0.8732179226069247,
|
|
"grad_norm": 27.397008895874023,
|
|
"learning_rate": 9.647631248698773e-08,
|
|
"loss": 0.0395,
|
|
"num_input_tokens_seen": 9881792,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"epoch": 0.8736422267481331,
|
|
"grad_norm": 18.55731773376465,
|
|
"learning_rate": 9.584255971071886e-08,
|
|
"loss": 0.1179,
|
|
"num_input_tokens_seen": 9886464,
|
|
"step": 10295
|
|
},
|
|
{
|
|
"epoch": 0.8740665308893415,
|
|
"grad_norm": 20.62680435180664,
|
|
"learning_rate": 9.521079057231274e-08,
|
|
"loss": 0.0103,
|
|
"num_input_tokens_seen": 9891264,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"epoch": 0.8744908350305499,
|
|
"grad_norm": 16.10218620300293,
|
|
"learning_rate": 9.45810064578133e-08,
|
|
"loss": 0.0409,
|
|
"num_input_tokens_seen": 9896320,
|
|
"step": 10305
|
|
},
|
|
{
|
|
"epoch": 0.8749151391717583,
|
|
"grad_norm": 33.963340759277344,
|
|
"learning_rate": 9.39532087489081e-08,
|
|
"loss": 0.0852,
|
|
"num_input_tokens_seen": 9901504,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"epoch": 0.8753394433129668,
|
|
"grad_norm": 2.835461378097534,
|
|
"learning_rate": 9.33273988229275e-08,
|
|
"loss": 0.0619,
|
|
"num_input_tokens_seen": 9907008,
|
|
"step": 10315
|
|
},
|
|
{
|
|
"epoch": 0.8757637474541752,
|
|
"grad_norm": 8.292951583862305,
|
|
"learning_rate": 9.270357805284057e-08,
|
|
"loss": 0.1125,
|
|
"num_input_tokens_seen": 9911744,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"epoch": 0.8761880515953836,
|
|
"grad_norm": 7.592496395111084,
|
|
"learning_rate": 9.208174780725253e-08,
|
|
"loss": 0.1024,
|
|
"num_input_tokens_seen": 9916096,
|
|
"step": 10325
|
|
},
|
|
{
|
|
"epoch": 0.876612355736592,
|
|
"grad_norm": 0.9127132892608643,
|
|
"learning_rate": 9.146190945040145e-08,
|
|
"loss": 0.0264,
|
|
"num_input_tokens_seen": 9920448,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"epoch": 0.8770366598778004,
|
|
"grad_norm": 14.537149429321289,
|
|
"learning_rate": 9.084406434215553e-08,
|
|
"loss": 0.0894,
|
|
"num_input_tokens_seen": 9925312,
|
|
"step": 10335
|
|
},
|
|
{
|
|
"epoch": 0.8774609640190089,
|
|
"grad_norm": 23.973838806152344,
|
|
"learning_rate": 9.022821383800926e-08,
|
|
"loss": 0.0864,
|
|
"num_input_tokens_seen": 9929920,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"epoch": 0.8778852681602173,
|
|
"grad_norm": 3.539787769317627,
|
|
"learning_rate": 8.961435928908267e-08,
|
|
"loss": 0.0043,
|
|
"num_input_tokens_seen": 9934912,
|
|
"step": 10345
|
|
},
|
|
{
|
|
"epoch": 0.8783095723014257,
|
|
"grad_norm": 17.533660888671875,
|
|
"learning_rate": 8.900250204211513e-08,
|
|
"loss": 0.0804,
|
|
"num_input_tokens_seen": 9939520,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"epoch": 0.8787338764426341,
|
|
"grad_norm": 12.81690788269043,
|
|
"learning_rate": 8.839264343946506e-08,
|
|
"loss": 0.0352,
|
|
"num_input_tokens_seen": 9944384,
|
|
"step": 10355
|
|
},
|
|
{
|
|
"epoch": 0.8791581805838425,
|
|
"grad_norm": 13.96080493927002,
|
|
"learning_rate": 8.778478481910611e-08,
|
|
"loss": 0.0408,
|
|
"num_input_tokens_seen": 9949056,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"epoch": 0.879582484725051,
|
|
"grad_norm": 11.22106647491455,
|
|
"learning_rate": 8.717892751462363e-08,
|
|
"loss": 0.0414,
|
|
"num_input_tokens_seen": 9954176,
|
|
"step": 10365
|
|
},
|
|
{
|
|
"epoch": 0.8800067888662594,
|
|
"grad_norm": 17.186817169189453,
|
|
"learning_rate": 8.657507285521281e-08,
|
|
"loss": 0.0449,
|
|
"num_input_tokens_seen": 9958912,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"epoch": 0.8804310930074678,
|
|
"grad_norm": 14.373452186584473,
|
|
"learning_rate": 8.597322216567493e-08,
|
|
"loss": 0.0799,
|
|
"num_input_tokens_seen": 9963648,
|
|
"step": 10375
|
|
},
|
|
{
|
|
"epoch": 0.8808553971486762,
|
|
"grad_norm": 2.835481882095337,
|
|
"learning_rate": 8.537337676641442e-08,
|
|
"loss": 0.007,
|
|
"num_input_tokens_seen": 9968256,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"epoch": 0.8812797012898846,
|
|
"grad_norm": 0.21918939054012299,
|
|
"learning_rate": 8.477553797343728e-08,
|
|
"loss": 0.0475,
|
|
"num_input_tokens_seen": 9973376,
|
|
"step": 10385
|
|
},
|
|
{
|
|
"epoch": 0.881704005431093,
|
|
"grad_norm": 19.67888069152832,
|
|
"learning_rate": 8.41797070983461e-08,
|
|
"loss": 0.0888,
|
|
"num_input_tokens_seen": 9978240,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"epoch": 0.8821283095723014,
|
|
"grad_norm": 0.1844080090522766,
|
|
"learning_rate": 8.358588544833877e-08,
|
|
"loss": 0.0033,
|
|
"num_input_tokens_seen": 9982784,
|
|
"step": 10395
|
|
},
|
|
{
|
|
"epoch": 0.8825526137135098,
|
|
"grad_norm": 6.164559364318848,
|
|
"learning_rate": 8.29940743262052e-08,
|
|
"loss": 0.0871,
|
|
"num_input_tokens_seen": 9987008,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"epoch": 0.8829769178547182,
|
|
"grad_norm": 13.286710739135742,
|
|
"learning_rate": 8.240427503032443e-08,
|
|
"loss": 0.0751,
|
|
"num_input_tokens_seen": 9992640,
|
|
"step": 10405
|
|
},
|
|
{
|
|
"epoch": 0.8834012219959266,
|
|
"grad_norm": 1.0005886554718018,
|
|
"learning_rate": 8.181648885466141e-08,
|
|
"loss": 0.0663,
|
|
"num_input_tokens_seen": 9996672,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"epoch": 0.883825526137135,
|
|
"grad_norm": 6.181687831878662,
|
|
"learning_rate": 8.123071708876473e-08,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 10001216,
|
|
"step": 10415
|
|
},
|
|
{
|
|
"epoch": 0.8842498302783435,
|
|
"grad_norm": 1.2369662523269653,
|
|
"learning_rate": 8.064696101776358e-08,
|
|
"loss": 0.0393,
|
|
"num_input_tokens_seen": 10006144,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"epoch": 0.8846741344195519,
|
|
"grad_norm": 14.8165922164917,
|
|
"learning_rate": 8.006522192236487e-08,
|
|
"loss": 0.0351,
|
|
"num_input_tokens_seen": 10011520,
|
|
"step": 10425
|
|
},
|
|
{
|
|
"epoch": 0.8850984385607603,
|
|
"grad_norm": 0.3576453626155853,
|
|
"learning_rate": 7.948550107885043e-08,
|
|
"loss": 0.0142,
|
|
"num_input_tokens_seen": 10016512,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"epoch": 0.8855227427019687,
|
|
"grad_norm": 24.576852798461914,
|
|
"learning_rate": 7.89077997590738e-08,
|
|
"loss": 0.0854,
|
|
"num_input_tokens_seen": 10020928,
|
|
"step": 10435
|
|
},
|
|
{
|
|
"epoch": 0.8859470468431772,
|
|
"grad_norm": 8.919859886169434,
|
|
"learning_rate": 7.833211923045891e-08,
|
|
"loss": 0.0432,
|
|
"num_input_tokens_seen": 10025920,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"epoch": 0.8863713509843856,
|
|
"grad_norm": 16.309457778930664,
|
|
"learning_rate": 7.775846075599524e-08,
|
|
"loss": 0.0467,
|
|
"num_input_tokens_seen": 10030464,
|
|
"step": 10445
|
|
},
|
|
{
|
|
"epoch": 0.886795655125594,
|
|
"grad_norm": 3.415705919265747,
|
|
"learning_rate": 7.718682559423651e-08,
|
|
"loss": 0.0435,
|
|
"num_input_tokens_seen": 10035328,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"epoch": 0.8872199592668024,
|
|
"grad_norm": 31.442087173461914,
|
|
"learning_rate": 7.661721499929752e-08,
|
|
"loss": 0.073,
|
|
"num_input_tokens_seen": 10040384,
|
|
"step": 10455
|
|
},
|
|
{
|
|
"epoch": 0.8876442634080108,
|
|
"grad_norm": 1.2248891592025757,
|
|
"learning_rate": 7.60496302208512e-08,
|
|
"loss": 0.0381,
|
|
"num_input_tokens_seen": 10045440,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"epoch": 0.8880685675492193,
|
|
"grad_norm": 9.187003135681152,
|
|
"learning_rate": 7.548407250412614e-08,
|
|
"loss": 0.0302,
|
|
"num_input_tokens_seen": 10050432,
|
|
"step": 10465
|
|
},
|
|
{
|
|
"epoch": 0.8884928716904277,
|
|
"grad_norm": 19.261823654174805,
|
|
"learning_rate": 7.492054308990381e-08,
|
|
"loss": 0.0426,
|
|
"num_input_tokens_seen": 10055296,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"epoch": 0.8889171758316361,
|
|
"grad_norm": 5.663764953613281,
|
|
"learning_rate": 7.435904321451524e-08,
|
|
"loss": 0.0498,
|
|
"num_input_tokens_seen": 10060416,
|
|
"step": 10475
|
|
},
|
|
{
|
|
"epoch": 0.8893414799728445,
|
|
"grad_norm": 0.7374381422996521,
|
|
"learning_rate": 7.379957410983995e-08,
|
|
"loss": 0.037,
|
|
"num_input_tokens_seen": 10065472,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"epoch": 0.8897657841140529,
|
|
"grad_norm": 6.194554805755615,
|
|
"learning_rate": 7.324213700330095e-08,
|
|
"loss": 0.0222,
|
|
"num_input_tokens_seen": 10070784,
|
|
"step": 10485
|
|
},
|
|
{
|
|
"epoch": 0.8901900882552614,
|
|
"grad_norm": 0.0684567540884018,
|
|
"learning_rate": 7.268673311786378e-08,
|
|
"loss": 0.0165,
|
|
"num_input_tokens_seen": 10075904,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"epoch": 0.8906143923964698,
|
|
"grad_norm": 6.540678977966309,
|
|
"learning_rate": 7.213336367203338e-08,
|
|
"loss": 0.0171,
|
|
"num_input_tokens_seen": 10080768,
|
|
"step": 10495
|
|
},
|
|
{
|
|
"epoch": 0.8910386965376782,
|
|
"grad_norm": 1.62698233127594,
|
|
"learning_rate": 7.158202987985106e-08,
|
|
"loss": 0.0552,
|
|
"num_input_tokens_seen": 10085312,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 0.8914630006788866,
|
|
"grad_norm": 1.3050782680511475,
|
|
"learning_rate": 7.10327329508923e-08,
|
|
"loss": 0.1105,
|
|
"num_input_tokens_seen": 10089792,
|
|
"step": 10505
|
|
},
|
|
{
|
|
"epoch": 0.891887304820095,
|
|
"grad_norm": 9.819490432739258,
|
|
"learning_rate": 7.048547409026384e-08,
|
|
"loss": 0.0839,
|
|
"num_input_tokens_seen": 10094976,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"epoch": 0.8923116089613035,
|
|
"grad_norm": 0.16964447498321533,
|
|
"learning_rate": 6.994025449860064e-08,
|
|
"loss": 0.0256,
|
|
"num_input_tokens_seen": 10099200,
|
|
"step": 10515
|
|
},
|
|
{
|
|
"epoch": 0.8927359131025119,
|
|
"grad_norm": 25.967426300048828,
|
|
"learning_rate": 6.939707537206485e-08,
|
|
"loss": 0.0736,
|
|
"num_input_tokens_seen": 10104320,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"epoch": 0.8931602172437203,
|
|
"grad_norm": 0.1696673482656479,
|
|
"learning_rate": 6.885593790234056e-08,
|
|
"loss": 0.0392,
|
|
"num_input_tokens_seen": 10109312,
|
|
"step": 10525
|
|
},
|
|
{
|
|
"epoch": 0.8935845213849287,
|
|
"grad_norm": 0.5850779414176941,
|
|
"learning_rate": 6.831684327663367e-08,
|
|
"loss": 0.0273,
|
|
"num_input_tokens_seen": 10113600,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"epoch": 0.8940088255261371,
|
|
"grad_norm": 1.523559808731079,
|
|
"learning_rate": 6.777979267766786e-08,
|
|
"loss": 0.0035,
|
|
"num_input_tokens_seen": 10118272,
|
|
"step": 10535
|
|
},
|
|
{
|
|
"epoch": 0.8944331296673456,
|
|
"grad_norm": 37.65926742553711,
|
|
"learning_rate": 6.724478728368277e-08,
|
|
"loss": 0.0707,
|
|
"num_input_tokens_seen": 10122688,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"epoch": 0.894857433808554,
|
|
"grad_norm": 6.3507981300354,
|
|
"learning_rate": 6.671182826843047e-08,
|
|
"loss": 0.1104,
|
|
"num_input_tokens_seen": 10128000,
|
|
"step": 10545
|
|
},
|
|
{
|
|
"epoch": 0.8952817379497624,
|
|
"grad_norm": 7.7042646408081055,
|
|
"learning_rate": 6.618091680117399e-08,
|
|
"loss": 0.0534,
|
|
"num_input_tokens_seen": 10132544,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"epoch": 0.8957060420909708,
|
|
"grad_norm": 8.093523025512695,
|
|
"learning_rate": 6.565205404668395e-08,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 10138496,
|
|
"step": 10555
|
|
},
|
|
{
|
|
"epoch": 0.8961303462321792,
|
|
"grad_norm": 5.756014347076416,
|
|
"learning_rate": 6.512524116523633e-08,
|
|
"loss": 0.0369,
|
|
"num_input_tokens_seen": 10143424,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"epoch": 0.8965546503733877,
|
|
"grad_norm": 29.61779022216797,
|
|
"learning_rate": 6.460047931261003e-08,
|
|
"loss": 0.1143,
|
|
"num_input_tokens_seen": 10148672,
|
|
"step": 10565
|
|
},
|
|
{
|
|
"epoch": 0.8969789545145961,
|
|
"grad_norm": 18.97088623046875,
|
|
"learning_rate": 6.407776964008383e-08,
|
|
"loss": 0.0177,
|
|
"num_input_tokens_seen": 10153408,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"epoch": 0.8974032586558045,
|
|
"grad_norm": 0.6002587676048279,
|
|
"learning_rate": 6.355711329443481e-08,
|
|
"loss": 0.0462,
|
|
"num_input_tokens_seen": 10157952,
|
|
"step": 10575
|
|
},
|
|
{
|
|
"epoch": 0.8978275627970129,
|
|
"grad_norm": 0.13313351571559906,
|
|
"learning_rate": 6.303851141793437e-08,
|
|
"loss": 0.0389,
|
|
"num_input_tokens_seen": 10163712,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"epoch": 0.8982518669382213,
|
|
"grad_norm": 27.324981689453125,
|
|
"learning_rate": 6.252196514834751e-08,
|
|
"loss": 0.0264,
|
|
"num_input_tokens_seen": 10168512,
|
|
"step": 10585
|
|
},
|
|
{
|
|
"epoch": 0.8986761710794298,
|
|
"grad_norm": 8.916873931884766,
|
|
"learning_rate": 6.200747561892882e-08,
|
|
"loss": 0.0365,
|
|
"num_input_tokens_seen": 10173120,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"epoch": 0.8991004752206382,
|
|
"grad_norm": 6.430027008056641,
|
|
"learning_rate": 6.149504395842087e-08,
|
|
"loss": 0.0647,
|
|
"num_input_tokens_seen": 10177856,
|
|
"step": 10595
|
|
},
|
|
{
|
|
"epoch": 0.8995247793618466,
|
|
"grad_norm": 1.3249238729476929,
|
|
"learning_rate": 6.098467129105123e-08,
|
|
"loss": 0.0304,
|
|
"num_input_tokens_seen": 10182080,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"epoch": 0.899949083503055,
|
|
"grad_norm": 36.652278900146484,
|
|
"learning_rate": 6.047635873653068e-08,
|
|
"loss": 0.084,
|
|
"num_input_tokens_seen": 10186496,
|
|
"step": 10605
|
|
},
|
|
{
|
|
"epoch": 0.9003733876442634,
|
|
"grad_norm": 24.058101654052734,
|
|
"learning_rate": 5.997010741004949e-08,
|
|
"loss": 0.0413,
|
|
"num_input_tokens_seen": 10190912,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"epoch": 0.9007976917854719,
|
|
"grad_norm": 27.96577262878418,
|
|
"learning_rate": 5.946591842227677e-08,
|
|
"loss": 0.0636,
|
|
"num_input_tokens_seen": 10195008,
|
|
"step": 10615
|
|
},
|
|
{
|
|
"epoch": 0.9012219959266803,
|
|
"grad_norm": 14.411796569824219,
|
|
"learning_rate": 5.8963792879356265e-08,
|
|
"loss": 0.0919,
|
|
"num_input_tokens_seen": 10199424,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"epoch": 0.9012219959266803,
|
|
"eval_loss": 0.05219457671046257,
|
|
"eval_runtime": 16.6424,
|
|
"eval_samples_per_second": 629.415,
|
|
"eval_steps_per_second": 78.714,
|
|
"num_input_tokens_seen": 10199424,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"epoch": 0.9016463000678887,
|
|
"grad_norm": 16.386627197265625,
|
|
"learning_rate": 5.84637318829051e-08,
|
|
"loss": 0.1083,
|
|
"num_input_tokens_seen": 10203968,
|
|
"step": 10625
|
|
},
|
|
{
|
|
"epoch": 0.9020706042090971,
|
|
"grad_norm": 16.634204864501953,
|
|
"learning_rate": 5.796573653001091e-08,
|
|
"loss": 0.1181,
|
|
"num_input_tokens_seen": 10208640,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"epoch": 0.9024949083503055,
|
|
"grad_norm": 0.31451216340065,
|
|
"learning_rate": 5.746980791322942e-08,
|
|
"loss": 0.0162,
|
|
"num_input_tokens_seen": 10213504,
|
|
"step": 10635
|
|
},
|
|
{
|
|
"epoch": 0.902919212491514,
|
|
"grad_norm": 9.605644226074219,
|
|
"learning_rate": 5.697594712058218e-08,
|
|
"loss": 0.0502,
|
|
"num_input_tokens_seen": 10218432,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"epoch": 0.9033435166327224,
|
|
"grad_norm": 0.3374364674091339,
|
|
"learning_rate": 5.6484155235554275e-08,
|
|
"loss": 0.0942,
|
|
"num_input_tokens_seen": 10223296,
|
|
"step": 10645
|
|
},
|
|
{
|
|
"epoch": 0.9037678207739308,
|
|
"grad_norm": 10.687803268432617,
|
|
"learning_rate": 5.599443333709131e-08,
|
|
"loss": 0.1258,
|
|
"num_input_tokens_seen": 10227904,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"epoch": 0.9041921249151391,
|
|
"grad_norm": 1.1299291849136353,
|
|
"learning_rate": 5.5506782499598394e-08,
|
|
"loss": 0.0361,
|
|
"num_input_tokens_seen": 10232640,
|
|
"step": 10655
|
|
},
|
|
{
|
|
"epoch": 0.9046164290563475,
|
|
"grad_norm": 42.264034271240234,
|
|
"learning_rate": 5.502120379293585e-08,
|
|
"loss": 0.1016,
|
|
"num_input_tokens_seen": 10236864,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"epoch": 0.905040733197556,
|
|
"grad_norm": 0.647139847278595,
|
|
"learning_rate": 5.453769828241872e-08,
|
|
"loss": 0.056,
|
|
"num_input_tokens_seen": 10241216,
|
|
"step": 10665
|
|
},
|
|
{
|
|
"epoch": 0.9054650373387644,
|
|
"grad_norm": 25.90431785583496,
|
|
"learning_rate": 5.4056267028813606e-08,
|
|
"loss": 0.0501,
|
|
"num_input_tokens_seen": 10245952,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"epoch": 0.9058893414799728,
|
|
"grad_norm": 24.600618362426758,
|
|
"learning_rate": 5.357691108833584e-08,
|
|
"loss": 0.0468,
|
|
"num_input_tokens_seen": 10251392,
|
|
"step": 10675
|
|
},
|
|
{
|
|
"epoch": 0.9063136456211812,
|
|
"grad_norm": 6.45746374130249,
|
|
"learning_rate": 5.309963151264829e-08,
|
|
"loss": 0.1057,
|
|
"num_input_tokens_seen": 10256128,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"epoch": 0.9067379497623896,
|
|
"grad_norm": 2.4365360736846924,
|
|
"learning_rate": 5.262442934885813e-08,
|
|
"loss": 0.0452,
|
|
"num_input_tokens_seen": 10260352,
|
|
"step": 10685
|
|
},
|
|
{
|
|
"epoch": 0.9071622539035981,
|
|
"grad_norm": 0.09468277543783188,
|
|
"learning_rate": 5.21513056395152e-08,
|
|
"loss": 0.0126,
|
|
"num_input_tokens_seen": 10265344,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"epoch": 0.9075865580448065,
|
|
"grad_norm": 14.318769454956055,
|
|
"learning_rate": 5.168026142260862e-08,
|
|
"loss": 0.0545,
|
|
"num_input_tokens_seen": 10269632,
|
|
"step": 10695
|
|
},
|
|
{
|
|
"epoch": 0.9080108621860149,
|
|
"grad_norm": 21.60463523864746,
|
|
"learning_rate": 5.121129773156663e-08,
|
|
"loss": 0.0179,
|
|
"num_input_tokens_seen": 10274560,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 0.9084351663272233,
|
|
"grad_norm": 0.18397925794124603,
|
|
"learning_rate": 5.074441559525167e-08,
|
|
"loss": 0.0366,
|
|
"num_input_tokens_seen": 10279552,
|
|
"step": 10705
|
|
},
|
|
{
|
|
"epoch": 0.9088594704684317,
|
|
"grad_norm": 1.6498712301254272,
|
|
"learning_rate": 5.027961603796027e-08,
|
|
"loss": 0.0427,
|
|
"num_input_tokens_seen": 10284288,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"epoch": 0.9092837746096402,
|
|
"grad_norm": 1.6799012422561646,
|
|
"learning_rate": 4.981690007941952e-08,
|
|
"loss": 0.0356,
|
|
"num_input_tokens_seen": 10289600,
|
|
"step": 10715
|
|
},
|
|
{
|
|
"epoch": 0.9097080787508486,
|
|
"grad_norm": 7.8889570236206055,
|
|
"learning_rate": 4.93562687347856e-08,
|
|
"loss": 0.0221,
|
|
"num_input_tokens_seen": 10294144,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"epoch": 0.910132382892057,
|
|
"grad_norm": 34.05645751953125,
|
|
"learning_rate": 4.889772301464112e-08,
|
|
"loss": 0.0435,
|
|
"num_input_tokens_seen": 10298752,
|
|
"step": 10725
|
|
},
|
|
{
|
|
"epoch": 0.9105566870332654,
|
|
"grad_norm": 15.844537734985352,
|
|
"learning_rate": 4.844126392499304e-08,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 10303424,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"epoch": 0.9109809911744738,
|
|
"grad_norm": 15.672754287719727,
|
|
"learning_rate": 4.7986892467270057e-08,
|
|
"loss": 0.0533,
|
|
"num_input_tokens_seen": 10308096,
|
|
"step": 10735
|
|
},
|
|
{
|
|
"epoch": 0.9114052953156823,
|
|
"grad_norm": 8.268491744995117,
|
|
"learning_rate": 4.7534609638321785e-08,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 10313152,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"epoch": 0.9118295994568907,
|
|
"grad_norm": 15.37960147857666,
|
|
"learning_rate": 4.70844164304145e-08,
|
|
"loss": 0.029,
|
|
"num_input_tokens_seen": 10318016,
|
|
"step": 10745
|
|
},
|
|
{
|
|
"epoch": 0.9122539035980991,
|
|
"grad_norm": 0.5389252305030823,
|
|
"learning_rate": 4.663631383123057e-08,
|
|
"loss": 0.0327,
|
|
"num_input_tokens_seen": 10322432,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"epoch": 0.9126782077393075,
|
|
"grad_norm": 0.18049681186676025,
|
|
"learning_rate": 4.61903028238656e-08,
|
|
"loss": 0.0111,
|
|
"num_input_tokens_seen": 10327168,
|
|
"step": 10755
|
|
},
|
|
{
|
|
"epoch": 0.9131025118805159,
|
|
"grad_norm": 0.9188250303268433,
|
|
"learning_rate": 4.5746384386826767e-08,
|
|
"loss": 0.085,
|
|
"num_input_tokens_seen": 10331584,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"epoch": 0.9135268160217244,
|
|
"grad_norm": 0.3090452551841736,
|
|
"learning_rate": 4.5304559494030004e-08,
|
|
"loss": 0.0423,
|
|
"num_input_tokens_seen": 10336640,
|
|
"step": 10765
|
|
},
|
|
{
|
|
"epoch": 0.9139511201629328,
|
|
"grad_norm": 10.45613956451416,
|
|
"learning_rate": 4.486482911479839e-08,
|
|
"loss": 0.0531,
|
|
"num_input_tokens_seen": 10341440,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"epoch": 0.9143754243041412,
|
|
"grad_norm": 1.134630560874939,
|
|
"learning_rate": 4.442719421385921e-08,
|
|
"loss": 0.0093,
|
|
"num_input_tokens_seen": 10346624,
|
|
"step": 10775
|
|
},
|
|
{
|
|
"epoch": 0.9147997284453496,
|
|
"grad_norm": 0.4645881652832031,
|
|
"learning_rate": 4.399165575134378e-08,
|
|
"loss": 0.0072,
|
|
"num_input_tokens_seen": 10351552,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"epoch": 0.915224032586558,
|
|
"grad_norm": 9.196870803833008,
|
|
"learning_rate": 4.3558214682782645e-08,
|
|
"loss": 0.0666,
|
|
"num_input_tokens_seen": 10356352,
|
|
"step": 10785
|
|
},
|
|
{
|
|
"epoch": 0.9156483367277665,
|
|
"grad_norm": 0.09253697842359543,
|
|
"learning_rate": 4.312687195910558e-08,
|
|
"loss": 0.0704,
|
|
"num_input_tokens_seen": 10361920,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"epoch": 0.9160726408689749,
|
|
"grad_norm": 7.137597560882568,
|
|
"learning_rate": 4.269762852663894e-08,
|
|
"loss": 0.0733,
|
|
"num_input_tokens_seen": 10366272,
|
|
"step": 10795
|
|
},
|
|
{
|
|
"epoch": 0.9164969450101833,
|
|
"grad_norm": 32.30122756958008,
|
|
"learning_rate": 4.227048532710287e-08,
|
|
"loss": 0.0074,
|
|
"num_input_tokens_seen": 10371328,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 0.9169212491513917,
|
|
"grad_norm": 10.809283256530762,
|
|
"learning_rate": 4.184544329761008e-08,
|
|
"loss": 0.0557,
|
|
"num_input_tokens_seen": 10376384,
|
|
"step": 10805
|
|
},
|
|
{
|
|
"epoch": 0.9173455532926001,
|
|
"grad_norm": 10.046077728271484,
|
|
"learning_rate": 4.1422503370663553e-08,
|
|
"loss": 0.081,
|
|
"num_input_tokens_seen": 10381120,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"epoch": 0.9177698574338086,
|
|
"grad_norm": 0.16478992998600006,
|
|
"learning_rate": 4.100166647415437e-08,
|
|
"loss": 0.0242,
|
|
"num_input_tokens_seen": 10385536,
|
|
"step": 10815
|
|
},
|
|
{
|
|
"epoch": 0.918194161575017,
|
|
"grad_norm": 0.41408753395080566,
|
|
"learning_rate": 4.058293353135988e-08,
|
|
"loss": 0.0198,
|
|
"num_input_tokens_seen": 10390208,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"epoch": 0.9186184657162254,
|
|
"grad_norm": 0.22838032245635986,
|
|
"learning_rate": 4.016630546094158e-08,
|
|
"loss": 0.0538,
|
|
"num_input_tokens_seen": 10394560,
|
|
"step": 10825
|
|
},
|
|
{
|
|
"epoch": 0.9190427698574338,
|
|
"grad_norm": 1.2464312314987183,
|
|
"learning_rate": 3.975178317694239e-08,
|
|
"loss": 0.0745,
|
|
"num_input_tokens_seen": 10398848,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"epoch": 0.9194670739986422,
|
|
"grad_norm": 10.291000366210938,
|
|
"learning_rate": 3.9339367588786644e-08,
|
|
"loss": 0.05,
|
|
"num_input_tokens_seen": 10404160,
|
|
"step": 10835
|
|
},
|
|
{
|
|
"epoch": 0.9198913781398507,
|
|
"grad_norm": 7.965226650238037,
|
|
"learning_rate": 3.892905960127546e-08,
|
|
"loss": 0.051,
|
|
"num_input_tokens_seen": 10408704,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"epoch": 0.9203156822810591,
|
|
"grad_norm": 27.655311584472656,
|
|
"learning_rate": 3.852086011458688e-08,
|
|
"loss": 0.0546,
|
|
"num_input_tokens_seen": 10413312,
|
|
"step": 10845
|
|
},
|
|
{
|
|
"epoch": 0.9207399864222675,
|
|
"grad_norm": 33.12567138671875,
|
|
"learning_rate": 3.811477002427288e-08,
|
|
"loss": 0.0619,
|
|
"num_input_tokens_seen": 10418048,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"epoch": 0.9211642905634759,
|
|
"grad_norm": 0.47668957710266113,
|
|
"learning_rate": 3.771079022125745e-08,
|
|
"loss": 0.0272,
|
|
"num_input_tokens_seen": 10422464,
|
|
"step": 10855
|
|
},
|
|
{
|
|
"epoch": 0.9215885947046843,
|
|
"grad_norm": 6.440741062164307,
|
|
"learning_rate": 3.7308921591835074e-08,
|
|
"loss": 0.1013,
|
|
"num_input_tokens_seen": 10426880,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"epoch": 0.9220128988458928,
|
|
"grad_norm": 16.58060073852539,
|
|
"learning_rate": 3.6909165017668385e-08,
|
|
"loss": 0.1054,
|
|
"num_input_tokens_seen": 10431232,
|
|
"step": 10865
|
|
},
|
|
{
|
|
"epoch": 0.9224372029871012,
|
|
"grad_norm": 28.490877151489258,
|
|
"learning_rate": 3.651152137578617e-08,
|
|
"loss": 0.124,
|
|
"num_input_tokens_seen": 10437120,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"epoch": 0.9228615071283096,
|
|
"grad_norm": 11.938909530639648,
|
|
"learning_rate": 3.611599153858214e-08,
|
|
"loss": 0.1294,
|
|
"num_input_tokens_seen": 10443456,
|
|
"step": 10875
|
|
},
|
|
{
|
|
"epoch": 0.923285811269518,
|
|
"grad_norm": 7.236656665802002,
|
|
"learning_rate": 3.572257637381182e-08,
|
|
"loss": 0.059,
|
|
"num_input_tokens_seen": 10448576,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"epoch": 0.9237101154107265,
|
|
"grad_norm": 1.7884862422943115,
|
|
"learning_rate": 3.533127674459202e-08,
|
|
"loss": 0.0306,
|
|
"num_input_tokens_seen": 10453440,
|
|
"step": 10885
|
|
},
|
|
{
|
|
"epoch": 0.9241344195519349,
|
|
"grad_norm": 0.2581826448440552,
|
|
"learning_rate": 3.494209350939792e-08,
|
|
"loss": 0.0249,
|
|
"num_input_tokens_seen": 10458176,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"epoch": 0.9245587236931433,
|
|
"grad_norm": 11.267096519470215,
|
|
"learning_rate": 3.455502752206152e-08,
|
|
"loss": 0.0697,
|
|
"num_input_tokens_seen": 10463232,
|
|
"step": 10895
|
|
},
|
|
{
|
|
"epoch": 0.9249830278343517,
|
|
"grad_norm": 0.24631553888320923,
|
|
"learning_rate": 3.4170079631769764e-08,
|
|
"loss": 0.092,
|
|
"num_input_tokens_seen": 10468032,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 0.9254073319755601,
|
|
"grad_norm": 8.674886703491211,
|
|
"learning_rate": 3.378725068306298e-08,
|
|
"loss": 0.0149,
|
|
"num_input_tokens_seen": 10472832,
|
|
"step": 10905
|
|
},
|
|
{
|
|
"epoch": 0.9258316361167686,
|
|
"grad_norm": 0.41779080033302307,
|
|
"learning_rate": 3.3406541515832e-08,
|
|
"loss": 0.0242,
|
|
"num_input_tokens_seen": 10477248,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"epoch": 0.926255940257977,
|
|
"grad_norm": 11.405782699584961,
|
|
"learning_rate": 3.302795296531813e-08,
|
|
"loss": 0.0648,
|
|
"num_input_tokens_seen": 10481920,
|
|
"step": 10915
|
|
},
|
|
{
|
|
"epoch": 0.9266802443991853,
|
|
"grad_norm": 6.852872848510742,
|
|
"learning_rate": 3.265148586210942e-08,
|
|
"loss": 0.0702,
|
|
"num_input_tokens_seen": 10486976,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"epoch": 0.9271045485403937,
|
|
"grad_norm": 26.437047958374023,
|
|
"learning_rate": 3.2277141032139746e-08,
|
|
"loss": 0.0606,
|
|
"num_input_tokens_seen": 10491712,
|
|
"step": 10925
|
|
},
|
|
{
|
|
"epoch": 0.9275288526816021,
|
|
"grad_norm": 31.492311477661133,
|
|
"learning_rate": 3.190491929668748e-08,
|
|
"loss": 0.0879,
|
|
"num_input_tokens_seen": 10496960,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"epoch": 0.9279531568228105,
|
|
"grad_norm": 29.28728675842285,
|
|
"learning_rate": 3.15348214723723e-08,
|
|
"loss": 0.0618,
|
|
"num_input_tokens_seen": 10501120,
|
|
"step": 10935
|
|
},
|
|
{
|
|
"epoch": 0.928377460964019,
|
|
"grad_norm": 0.35964885354042053,
|
|
"learning_rate": 3.11668483711548e-08,
|
|
"loss": 0.025,
|
|
"num_input_tokens_seen": 10507008,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"epoch": 0.9288017651052274,
|
|
"grad_norm": 18.892667770385742,
|
|
"learning_rate": 3.0801000800333876e-08,
|
|
"loss": 0.0437,
|
|
"num_input_tokens_seen": 10511424,
|
|
"step": 10945
|
|
},
|
|
{
|
|
"epoch": 0.9292260692464358,
|
|
"grad_norm": 2.3423001766204834,
|
|
"learning_rate": 3.043727956254538e-08,
|
|
"loss": 0.0336,
|
|
"num_input_tokens_seen": 10515968,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"epoch": 0.9296503733876442,
|
|
"grad_norm": 17.98472785949707,
|
|
"learning_rate": 3.007568545576011e-08,
|
|
"loss": 0.1059,
|
|
"num_input_tokens_seen": 10520576,
|
|
"step": 10955
|
|
},
|
|
{
|
|
"epoch": 0.9300746775288526,
|
|
"grad_norm": 5.375370979309082,
|
|
"learning_rate": 2.971621927328216e-08,
|
|
"loss": 0.0473,
|
|
"num_input_tokens_seen": 10525504,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"epoch": 0.9304989816700611,
|
|
"grad_norm": 0.5201108455657959,
|
|
"learning_rate": 2.9358881803746794e-08,
|
|
"loss": 0.1271,
|
|
"num_input_tokens_seen": 10530112,
|
|
"step": 10965
|
|
},
|
|
{
|
|
"epoch": 0.9309232858112695,
|
|
"grad_norm": 0.5180962681770325,
|
|
"learning_rate": 2.900367383111979e-08,
|
|
"loss": 0.0701,
|
|
"num_input_tokens_seen": 10535232,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"epoch": 0.9313475899524779,
|
|
"grad_norm": 13.562655448913574,
|
|
"learning_rate": 2.865059613469434e-08,
|
|
"loss": 0.0287,
|
|
"num_input_tokens_seen": 10539712,
|
|
"step": 10975
|
|
},
|
|
{
|
|
"epoch": 0.9317718940936863,
|
|
"grad_norm": 1.6060277223587036,
|
|
"learning_rate": 2.829964948909047e-08,
|
|
"loss": 0.0591,
|
|
"num_input_tokens_seen": 10544128,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"epoch": 0.9321961982348947,
|
|
"grad_norm": 9.424308776855469,
|
|
"learning_rate": 2.795083466425252e-08,
|
|
"loss": 0.0196,
|
|
"num_input_tokens_seen": 10548288,
|
|
"step": 10985
|
|
},
|
|
{
|
|
"epoch": 0.9326205023761032,
|
|
"grad_norm": 30.277997970581055,
|
|
"learning_rate": 2.760415242544811e-08,
|
|
"loss": 0.0193,
|
|
"num_input_tokens_seen": 10552512,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"epoch": 0.9330448065173116,
|
|
"grad_norm": 1.7532597780227661,
|
|
"learning_rate": 2.7259603533266063e-08,
|
|
"loss": 0.0724,
|
|
"num_input_tokens_seen": 10557952,
|
|
"step": 10995
|
|
},
|
|
{
|
|
"epoch": 0.93346911065852,
|
|
"grad_norm": 3.480756998062134,
|
|
"learning_rate": 2.6917188743614704e-08,
|
|
"loss": 0.0539,
|
|
"num_input_tokens_seen": 10562240,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 0.9338934147997284,
|
|
"grad_norm": 7.805656433105469,
|
|
"learning_rate": 2.6576908807720233e-08,
|
|
"loss": 0.0825,
|
|
"num_input_tokens_seen": 10566976,
|
|
"step": 11005
|
|
},
|
|
{
|
|
"epoch": 0.9343177189409368,
|
|
"grad_norm": 20.04058074951172,
|
|
"learning_rate": 2.623876447212592e-08,
|
|
"loss": 0.0341,
|
|
"num_input_tokens_seen": 10571584,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"epoch": 0.9347420230821453,
|
|
"grad_norm": 18.889392852783203,
|
|
"learning_rate": 2.590275647868867e-08,
|
|
"loss": 0.0808,
|
|
"num_input_tokens_seen": 10576832,
|
|
"step": 11015
|
|
},
|
|
{
|
|
"epoch": 0.9351663272233537,
|
|
"grad_norm": 33.9774284362793,
|
|
"learning_rate": 2.5568885564579258e-08,
|
|
"loss": 0.0354,
|
|
"num_input_tokens_seen": 10581184,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"epoch": 0.9355906313645621,
|
|
"grad_norm": 1.0020630359649658,
|
|
"learning_rate": 2.5237152462279532e-08,
|
|
"loss": 0.0384,
|
|
"num_input_tokens_seen": 10585792,
|
|
"step": 11025
|
|
},
|
|
{
|
|
"epoch": 0.9360149355057705,
|
|
"grad_norm": 0.8174886703491211,
|
|
"learning_rate": 2.4907557899581212e-08,
|
|
"loss": 0.0124,
|
|
"num_input_tokens_seen": 10590016,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"epoch": 0.936439239646979,
|
|
"grad_norm": 1.332942008972168,
|
|
"learning_rate": 2.4580102599584317e-08,
|
|
"loss": 0.0155,
|
|
"num_input_tokens_seen": 10594624,
|
|
"step": 11035
|
|
},
|
|
{
|
|
"epoch": 0.9368635437881874,
|
|
"grad_norm": 15.656383514404297,
|
|
"learning_rate": 2.425478728069552e-08,
|
|
"loss": 0.1189,
|
|
"num_input_tokens_seen": 10598912,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"epoch": 0.9372878479293958,
|
|
"grad_norm": 26.110389709472656,
|
|
"learning_rate": 2.3931612656626688e-08,
|
|
"loss": 0.0298,
|
|
"num_input_tokens_seen": 10603648,
|
|
"step": 11045
|
|
},
|
|
{
|
|
"epoch": 0.9377121520706042,
|
|
"grad_norm": 1.4835513830184937,
|
|
"learning_rate": 2.3610579436392996e-08,
|
|
"loss": 0.0182,
|
|
"num_input_tokens_seen": 10608640,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"epoch": 0.9381364562118126,
|
|
"grad_norm": 13.837592124938965,
|
|
"learning_rate": 2.329168832431161e-08,
|
|
"loss": 0.0991,
|
|
"num_input_tokens_seen": 10613312,
|
|
"step": 11055
|
|
},
|
|
{
|
|
"epoch": 0.938560760353021,
|
|
"grad_norm": 0.43932077288627625,
|
|
"learning_rate": 2.2974940020000112e-08,
|
|
"loss": 0.0121,
|
|
"num_input_tokens_seen": 10617600,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"epoch": 0.9389850644942295,
|
|
"grad_norm": 33.94478225708008,
|
|
"learning_rate": 2.266033521837529e-08,
|
|
"loss": 0.0846,
|
|
"num_input_tokens_seen": 10622144,
|
|
"step": 11065
|
|
},
|
|
{
|
|
"epoch": 0.9394093686354379,
|
|
"grad_norm": 2.8713018894195557,
|
|
"learning_rate": 2.2347874609650596e-08,
|
|
"loss": 0.0031,
|
|
"num_input_tokens_seen": 10626880,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"epoch": 0.9398336727766463,
|
|
"grad_norm": 21.58746910095215,
|
|
"learning_rate": 2.2037558879336004e-08,
|
|
"loss": 0.0855,
|
|
"num_input_tokens_seen": 10632128,
|
|
"step": 11075
|
|
},
|
|
{
|
|
"epoch": 0.9402579769178547,
|
|
"grad_norm": 6.199968338012695,
|
|
"learning_rate": 2.1729388708235485e-08,
|
|
"loss": 0.0285,
|
|
"num_input_tokens_seen": 10636800,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"epoch": 0.9406822810590632,
|
|
"grad_norm": 0.1813025027513504,
|
|
"learning_rate": 2.1423364772445886e-08,
|
|
"loss": 0.0396,
|
|
"num_input_tokens_seen": 10641408,
|
|
"step": 11085
|
|
},
|
|
{
|
|
"epoch": 0.9411065852002716,
|
|
"grad_norm": 1.2281008958816528,
|
|
"learning_rate": 2.111948774335548e-08,
|
|
"loss": 0.0049,
|
|
"num_input_tokens_seen": 10646400,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"epoch": 0.94153088934148,
|
|
"grad_norm": 39.907047271728516,
|
|
"learning_rate": 2.081775828764254e-08,
|
|
"loss": 0.1104,
|
|
"num_input_tokens_seen": 10650816,
|
|
"step": 11095
|
|
},
|
|
{
|
|
"epoch": 0.9419551934826884,
|
|
"grad_norm": 0.5580568313598633,
|
|
"learning_rate": 2.0518177067273103e-08,
|
|
"loss": 0.0055,
|
|
"num_input_tokens_seen": 10655424,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 0.9423794976238968,
|
|
"grad_norm": 0.8987800478935242,
|
|
"learning_rate": 2.0220744739501305e-08,
|
|
"loss": 0.0872,
|
|
"num_input_tokens_seen": 10660416,
|
|
"step": 11105
|
|
},
|
|
{
|
|
"epoch": 0.9428038017651053,
|
|
"grad_norm": 7.984748363494873,
|
|
"learning_rate": 1.992546195686573e-08,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 10665088,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"epoch": 0.9432281059063137,
|
|
"grad_norm": 0.11250204592943192,
|
|
"learning_rate": 1.9632329367189725e-08,
|
|
"loss": 0.0376,
|
|
"num_input_tokens_seen": 10669632,
|
|
"step": 11115
|
|
},
|
|
{
|
|
"epoch": 0.9436524100475221,
|
|
"grad_norm": 4.553378105163574,
|
|
"learning_rate": 1.9341347613579086e-08,
|
|
"loss": 0.0242,
|
|
"num_input_tokens_seen": 10674752,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"epoch": 0.9440767141887305,
|
|
"grad_norm": 3.254133701324463,
|
|
"learning_rate": 1.9052517334420704e-08,
|
|
"loss": 0.0308,
|
|
"num_input_tokens_seen": 10679296,
|
|
"step": 11125
|
|
},
|
|
{
|
|
"epoch": 0.9445010183299389,
|
|
"grad_norm": 29.89137077331543,
|
|
"learning_rate": 1.8765839163381815e-08,
|
|
"loss": 0.1052,
|
|
"num_input_tokens_seen": 10683968,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"epoch": 0.9449253224711474,
|
|
"grad_norm": 0.5620746612548828,
|
|
"learning_rate": 1.8481313729407645e-08,
|
|
"loss": 0.0585,
|
|
"num_input_tokens_seen": 10688512,
|
|
"step": 11135
|
|
},
|
|
{
|
|
"epoch": 0.9453496266123558,
|
|
"grad_norm": 9.976305961608887,
|
|
"learning_rate": 1.8198941656720646e-08,
|
|
"loss": 0.1007,
|
|
"num_input_tokens_seen": 10693312,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"epoch": 0.9457739307535642,
|
|
"grad_norm": 4.825209617614746,
|
|
"learning_rate": 1.7918723564819272e-08,
|
|
"loss": 0.1047,
|
|
"num_input_tokens_seen": 10698688,
|
|
"step": 11145
|
|
},
|
|
{
|
|
"epoch": 0.9461982348947726,
|
|
"grad_norm": 13.057853698730469,
|
|
"learning_rate": 1.7640660068475976e-08,
|
|
"loss": 0.1101,
|
|
"num_input_tokens_seen": 10704448,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"epoch": 0.946622539035981,
|
|
"grad_norm": 0.24543944001197815,
|
|
"learning_rate": 1.7364751777736332e-08,
|
|
"loss": 0.0517,
|
|
"num_input_tokens_seen": 10709312,
|
|
"step": 11155
|
|
},
|
|
{
|
|
"epoch": 0.9470468431771895,
|
|
"grad_norm": 16.951963424682617,
|
|
"learning_rate": 1.7090999297917684e-08,
|
|
"loss": 0.0444,
|
|
"num_input_tokens_seen": 10714368,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"epoch": 0.9474711473183979,
|
|
"grad_norm": 57.081661224365234,
|
|
"learning_rate": 1.6819403229607732e-08,
|
|
"loss": 0.0684,
|
|
"num_input_tokens_seen": 10718848,
|
|
"step": 11165
|
|
},
|
|
{
|
|
"epoch": 0.9478954514596063,
|
|
"grad_norm": 0.26134440302848816,
|
|
"learning_rate": 1.6549964168663054e-08,
|
|
"loss": 0.0475,
|
|
"num_input_tokens_seen": 10723712,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"epoch": 0.9483197556008147,
|
|
"grad_norm": 6.485694408416748,
|
|
"learning_rate": 1.6282682706208028e-08,
|
|
"loss": 0.0466,
|
|
"num_input_tokens_seen": 10728384,
|
|
"step": 11175
|
|
},
|
|
{
|
|
"epoch": 0.948744059742023,
|
|
"grad_norm": 0.27115127444267273,
|
|
"learning_rate": 1.6017559428633588e-08,
|
|
"loss": 0.0178,
|
|
"num_input_tokens_seen": 10733632,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"epoch": 0.9491683638832314,
|
|
"grad_norm": 0.25695866346359253,
|
|
"learning_rate": 1.5754594917595564e-08,
|
|
"loss": 0.0516,
|
|
"num_input_tokens_seen": 10738880,
|
|
"step": 11185
|
|
},
|
|
{
|
|
"epoch": 0.9495926680244399,
|
|
"grad_norm": 18.23794937133789,
|
|
"learning_rate": 1.549378975001403e-08,
|
|
"loss": 0.0705,
|
|
"num_input_tokens_seen": 10744384,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"epoch": 0.9500169721656483,
|
|
"grad_norm": 15.828742980957031,
|
|
"learning_rate": 1.5235144498071172e-08,
|
|
"loss": 0.0316,
|
|
"num_input_tokens_seen": 10749632,
|
|
"step": 11195
|
|
},
|
|
{
|
|
"epoch": 0.9504412763068567,
|
|
"grad_norm": 18.126686096191406,
|
|
"learning_rate": 1.4978659729210974e-08,
|
|
"loss": 0.0319,
|
|
"num_input_tokens_seen": 10754624,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 0.9508655804480651,
|
|
"grad_norm": 26.420827865600586,
|
|
"learning_rate": 1.4724336006137094e-08,
|
|
"loss": 0.0789,
|
|
"num_input_tokens_seen": 10759488,
|
|
"step": 11205
|
|
},
|
|
{
|
|
"epoch": 0.9512898845892735,
|
|
"grad_norm": 0.19809313118457794,
|
|
"learning_rate": 1.4472173886812433e-08,
|
|
"loss": 0.0039,
|
|
"num_input_tokens_seen": 10764096,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"epoch": 0.9512898845892735,
|
|
"eval_loss": 0.05228019878268242,
|
|
"eval_runtime": 16.7783,
|
|
"eval_samples_per_second": 624.318,
|
|
"eval_steps_per_second": 78.077,
|
|
"num_input_tokens_seen": 10764096,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"epoch": 0.951714188730482,
|
|
"grad_norm": 17.760011672973633,
|
|
"learning_rate": 1.4222173924457348e-08,
|
|
"loss": 0.0557,
|
|
"num_input_tokens_seen": 10768640,
|
|
"step": 11215
|
|
},
|
|
{
|
|
"epoch": 0.9521384928716904,
|
|
"grad_norm": 0.12983626127243042,
|
|
"learning_rate": 1.3974336667548659e-08,
|
|
"loss": 0.0454,
|
|
"num_input_tokens_seen": 10774016,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"epoch": 0.9525627970128988,
|
|
"grad_norm": 0.3437061607837677,
|
|
"learning_rate": 1.3728662659818201e-08,
|
|
"loss": 0.0033,
|
|
"num_input_tokens_seen": 10779072,
|
|
"step": 11225
|
|
},
|
|
{
|
|
"epoch": 0.9529871011541072,
|
|
"grad_norm": 22.00689125061035,
|
|
"learning_rate": 1.3485152440252389e-08,
|
|
"loss": 0.0656,
|
|
"num_input_tokens_seen": 10784000,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"epoch": 0.9534114052953157,
|
|
"grad_norm": 8.476043701171875,
|
|
"learning_rate": 1.3243806543089875e-08,
|
|
"loss": 0.0665,
|
|
"num_input_tokens_seen": 10788928,
|
|
"step": 11235
|
|
},
|
|
{
|
|
"epoch": 0.9538357094365241,
|
|
"grad_norm": 8.972405433654785,
|
|
"learning_rate": 1.3004625497821553e-08,
|
|
"loss": 0.0257,
|
|
"num_input_tokens_seen": 10794304,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"epoch": 0.9542600135777325,
|
|
"grad_norm": 6.371352195739746,
|
|
"learning_rate": 1.276760982918812e-08,
|
|
"loss": 0.0183,
|
|
"num_input_tokens_seen": 10798528,
|
|
"step": 11245
|
|
},
|
|
{
|
|
"epoch": 0.9546843177189409,
|
|
"grad_norm": 0.2125733196735382,
|
|
"learning_rate": 1.2532760057180291e-08,
|
|
"loss": 0.0544,
|
|
"num_input_tokens_seen": 10803200,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"epoch": 0.9551086218601493,
|
|
"grad_norm": 0.3916779160499573,
|
|
"learning_rate": 1.230007669703681e-08,
|
|
"loss": 0.0297,
|
|
"num_input_tokens_seen": 10807680,
|
|
"step": 11255
|
|
},
|
|
{
|
|
"epoch": 0.9555329260013578,
|
|
"grad_norm": 0.18220025300979614,
|
|
"learning_rate": 1.2069560259243328e-08,
|
|
"loss": 0.0291,
|
|
"num_input_tokens_seen": 10812224,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"epoch": 0.9559572301425662,
|
|
"grad_norm": 7.1517653465271,
|
|
"learning_rate": 1.1841211249531636e-08,
|
|
"loss": 0.0587,
|
|
"num_input_tokens_seen": 10816640,
|
|
"step": 11265
|
|
},
|
|
{
|
|
"epoch": 0.9563815342837746,
|
|
"grad_norm": 0.09612993150949478,
|
|
"learning_rate": 1.1615030168878438e-08,
|
|
"loss": 0.0291,
|
|
"num_input_tokens_seen": 10821440,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"epoch": 0.956805838424983,
|
|
"grad_norm": 79.60122680664062,
|
|
"learning_rate": 1.139101751350402e-08,
|
|
"loss": 0.0627,
|
|
"num_input_tokens_seen": 10826560,
|
|
"step": 11275
|
|
},
|
|
{
|
|
"epoch": 0.9572301425661914,
|
|
"grad_norm": 24.210397720336914,
|
|
"learning_rate": 1.1169173774871477e-08,
|
|
"loss": 0.0722,
|
|
"num_input_tokens_seen": 10831552,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"epoch": 0.9576544467073999,
|
|
"grad_norm": 0.4012939929962158,
|
|
"learning_rate": 1.0949499439685483e-08,
|
|
"loss": 0.0397,
|
|
"num_input_tokens_seen": 10835968,
|
|
"step": 11285
|
|
},
|
|
{
|
|
"epoch": 0.9580787508486083,
|
|
"grad_norm": 12.18429183959961,
|
|
"learning_rate": 1.0731994989891302e-08,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 10840064,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"epoch": 0.9585030549898167,
|
|
"grad_norm": 10.551094055175781,
|
|
"learning_rate": 1.0516660902673446e-08,
|
|
"loss": 0.0437,
|
|
"num_input_tokens_seen": 10844672,
|
|
"step": 11295
|
|
},
|
|
{
|
|
"epoch": 0.9589273591310251,
|
|
"grad_norm": 0.16135410964488983,
|
|
"learning_rate": 1.0303497650455128e-08,
|
|
"loss": 0.0642,
|
|
"num_input_tokens_seen": 10850304,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 0.9593516632722335,
|
|
"grad_norm": 3.2537622451782227,
|
|
"learning_rate": 1.0092505700896703e-08,
|
|
"loss": 0.0309,
|
|
"num_input_tokens_seen": 10854912,
|
|
"step": 11305
|
|
},
|
|
{
|
|
"epoch": 0.959775967413442,
|
|
"grad_norm": 23.024616241455078,
|
|
"learning_rate": 9.883685516895113e-09,
|
|
"loss": 0.0377,
|
|
"num_input_tokens_seen": 10859904,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"epoch": 0.9602002715546504,
|
|
"grad_norm": 11.104696273803711,
|
|
"learning_rate": 9.677037556582557e-09,
|
|
"loss": 0.0522,
|
|
"num_input_tokens_seen": 10864704,
|
|
"step": 11315
|
|
},
|
|
{
|
|
"epoch": 0.9606245756958588,
|
|
"grad_norm": 0.35379403829574585,
|
|
"learning_rate": 9.47256227332538e-09,
|
|
"loss": 0.0078,
|
|
"num_input_tokens_seen": 10869248,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"epoch": 0.9610488798370672,
|
|
"grad_norm": 0.46451863646507263,
|
|
"learning_rate": 9.270260115723739e-09,
|
|
"loss": 0.0572,
|
|
"num_input_tokens_seen": 10873984,
|
|
"step": 11325
|
|
},
|
|
{
|
|
"epoch": 0.9614731839782756,
|
|
"grad_norm": 0.07047951221466064,
|
|
"learning_rate": 9.070131527609603e-09,
|
|
"loss": 0.0753,
|
|
"num_input_tokens_seen": 10878592,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"epoch": 0.9618974881194841,
|
|
"grad_norm": 9.00889778137207,
|
|
"learning_rate": 8.872176948046761e-09,
|
|
"loss": 0.0636,
|
|
"num_input_tokens_seen": 10883264,
|
|
"step": 11335
|
|
},
|
|
{
|
|
"epoch": 0.9623217922606925,
|
|
"grad_norm": 1.0891268253326416,
|
|
"learning_rate": 8.676396811329145e-09,
|
|
"loss": 0.0409,
|
|
"num_input_tokens_seen": 10887552,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"epoch": 0.9627460964019009,
|
|
"grad_norm": 7.38459587097168,
|
|
"learning_rate": 8.482791546980506e-09,
|
|
"loss": 0.0758,
|
|
"num_input_tokens_seen": 10892800,
|
|
"step": 11345
|
|
},
|
|
{
|
|
"epoch": 0.9631704005431093,
|
|
"grad_norm": 0.6855674982070923,
|
|
"learning_rate": 8.291361579752631e-09,
|
|
"loss": 0.0589,
|
|
"num_input_tokens_seen": 10897280,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"epoch": 0.9635947046843177,
|
|
"grad_norm": 0.7800962924957275,
|
|
"learning_rate": 8.102107329625351e-09,
|
|
"loss": 0.0539,
|
|
"num_input_tokens_seen": 10902144,
|
|
"step": 11355
|
|
},
|
|
{
|
|
"epoch": 0.9640190088255262,
|
|
"grad_norm": 16.97479820251465,
|
|
"learning_rate": 7.91502921180487e-09,
|
|
"loss": 0.0877,
|
|
"num_input_tokens_seen": 10907264,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"epoch": 0.9644433129667346,
|
|
"grad_norm": 0.42612016201019287,
|
|
"learning_rate": 7.730127636723538e-09,
|
|
"loss": 0.0131,
|
|
"num_input_tokens_seen": 10911808,
|
|
"step": 11365
|
|
},
|
|
{
|
|
"epoch": 0.964867617107943,
|
|
"grad_norm": 17.453168869018555,
|
|
"learning_rate": 7.547403010037978e-09,
|
|
"loss": 0.0949,
|
|
"num_input_tokens_seen": 10916544,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"epoch": 0.9652919212491514,
|
|
"grad_norm": 20.739978790283203,
|
|
"learning_rate": 7.366855732629407e-09,
|
|
"loss": 0.0857,
|
|
"num_input_tokens_seen": 10921024,
|
|
"step": 11375
|
|
},
|
|
{
|
|
"epoch": 0.9657162253903598,
|
|
"grad_norm": 8.788089752197266,
|
|
"learning_rate": 7.1884862006017514e-09,
|
|
"loss": 0.0595,
|
|
"num_input_tokens_seen": 10926016,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"epoch": 0.9661405295315683,
|
|
"grad_norm": 6.86511754989624,
|
|
"learning_rate": 7.012294805281205e-09,
|
|
"loss": 0.0522,
|
|
"num_input_tokens_seen": 10931200,
|
|
"step": 11385
|
|
},
|
|
{
|
|
"epoch": 0.9665648336727767,
|
|
"grad_norm": 11.662250518798828,
|
|
"learning_rate": 6.838281933215562e-09,
|
|
"loss": 0.0681,
|
|
"num_input_tokens_seen": 10936384,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"epoch": 0.9669891378139851,
|
|
"grad_norm": 24.44302749633789,
|
|
"learning_rate": 6.6664479661729944e-09,
|
|
"loss": 0.0259,
|
|
"num_input_tokens_seen": 10941184,
|
|
"step": 11395
|
|
},
|
|
{
|
|
"epoch": 0.9674134419551935,
|
|
"grad_norm": 8.620841026306152,
|
|
"learning_rate": 6.496793281141055e-09,
|
|
"loss": 0.071,
|
|
"num_input_tokens_seen": 10945408,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 0.9678377460964019,
|
|
"grad_norm": 23.96091651916504,
|
|
"learning_rate": 6.329318250326898e-09,
|
|
"loss": 0.0914,
|
|
"num_input_tokens_seen": 10950144,
|
|
"step": 11405
|
|
},
|
|
{
|
|
"epoch": 0.9682620502376104,
|
|
"grad_norm": 2.312323570251465,
|
|
"learning_rate": 6.164023241154837e-09,
|
|
"loss": 0.1124,
|
|
"num_input_tokens_seen": 10955264,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"epoch": 0.9686863543788188,
|
|
"grad_norm": 5.044360160827637,
|
|
"learning_rate": 6.000908616267009e-09,
|
|
"loss": 0.0089,
|
|
"num_input_tokens_seen": 10960256,
|
|
"step": 11415
|
|
},
|
|
{
|
|
"epoch": 0.9691106585200272,
|
|
"grad_norm": 20.324050903320312,
|
|
"learning_rate": 5.839974733522046e-09,
|
|
"loss": 0.0679,
|
|
"num_input_tokens_seen": 10964672,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"epoch": 0.9695349626612356,
|
|
"grad_norm": 0.4777509272098541,
|
|
"learning_rate": 5.68122194599363e-09,
|
|
"loss": 0.036,
|
|
"num_input_tokens_seen": 10969728,
|
|
"step": 11425
|
|
},
|
|
{
|
|
"epoch": 0.969959266802444,
|
|
"grad_norm": 0.21774613857269287,
|
|
"learning_rate": 5.5246506019709374e-09,
|
|
"loss": 0.0904,
|
|
"num_input_tokens_seen": 10974720,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"epoch": 0.9703835709436525,
|
|
"grad_norm": 12.210954666137695,
|
|
"learning_rate": 5.370261044956969e-09,
|
|
"loss": 0.0365,
|
|
"num_input_tokens_seen": 10979008,
|
|
"step": 11435
|
|
},
|
|
{
|
|
"epoch": 0.9708078750848609,
|
|
"grad_norm": 6.143990993499756,
|
|
"learning_rate": 5.218053613668116e-09,
|
|
"loss": 0.0372,
|
|
"num_input_tokens_seen": 10984128,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"epoch": 0.9712321792260692,
|
|
"grad_norm": 0.47982257604599,
|
|
"learning_rate": 5.068028642033595e-09,
|
|
"loss": 0.0733,
|
|
"num_input_tokens_seen": 10988480,
|
|
"step": 11445
|
|
},
|
|
{
|
|
"epoch": 0.9716564833672776,
|
|
"grad_norm": 16.406841278076172,
|
|
"learning_rate": 4.92018645919412e-09,
|
|
"loss": 0.0465,
|
|
"num_input_tokens_seen": 10992896,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"epoch": 0.972080787508486,
|
|
"grad_norm": 26.278352737426758,
|
|
"learning_rate": 4.774527389501681e-09,
|
|
"loss": 0.0816,
|
|
"num_input_tokens_seen": 10997696,
|
|
"step": 11455
|
|
},
|
|
{
|
|
"epoch": 0.9725050916496945,
|
|
"grad_norm": 2.272444009780884,
|
|
"learning_rate": 4.63105175251921e-09,
|
|
"loss": 0.0623,
|
|
"num_input_tokens_seen": 11002944,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"epoch": 0.9729293957909029,
|
|
"grad_norm": 21.18509864807129,
|
|
"learning_rate": 4.489759863018583e-09,
|
|
"loss": 0.089,
|
|
"num_input_tokens_seen": 11008384,
|
|
"step": 11465
|
|
},
|
|
{
|
|
"epoch": 0.9733536999321113,
|
|
"grad_norm": 28.060937881469727,
|
|
"learning_rate": 4.350652030981394e-09,
|
|
"loss": 0.1202,
|
|
"num_input_tokens_seen": 11012800,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"epoch": 0.9737780040733197,
|
|
"grad_norm": 17.513378143310547,
|
|
"learning_rate": 4.213728561597296e-09,
|
|
"loss": 0.1186,
|
|
"num_input_tokens_seen": 11017216,
|
|
"step": 11475
|
|
},
|
|
{
|
|
"epoch": 0.9742023082145281,
|
|
"grad_norm": 6.082868576049805,
|
|
"learning_rate": 4.0789897552637735e-09,
|
|
"loss": 0.0924,
|
|
"num_input_tokens_seen": 11021504,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"epoch": 0.9746266123557366,
|
|
"grad_norm": 15.211711883544922,
|
|
"learning_rate": 3.946435907585255e-09,
|
|
"loss": 0.0909,
|
|
"num_input_tokens_seen": 11026816,
|
|
"step": 11485
|
|
},
|
|
{
|
|
"epoch": 0.975050916496945,
|
|
"grad_norm": 9.289284706115723,
|
|
"learning_rate": 3.816067309372673e-09,
|
|
"loss": 0.0712,
|
|
"num_input_tokens_seen": 11032448,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"epoch": 0.9754752206381534,
|
|
"grad_norm": 8.785874366760254,
|
|
"learning_rate": 3.68788424664257e-09,
|
|
"loss": 0.115,
|
|
"num_input_tokens_seen": 11037056,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"epoch": 0.9758995247793618,
|
|
"grad_norm": 0.4425792396068573,
|
|
"learning_rate": 3.561887000616881e-09,
|
|
"loss": 0.098,
|
|
"num_input_tokens_seen": 11041152,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 0.9763238289205702,
|
|
"grad_norm": 19.240644454956055,
|
|
"learning_rate": 3.438075847721933e-09,
|
|
"loss": 0.0343,
|
|
"num_input_tokens_seen": 11045504,
|
|
"step": 11505
|
|
},
|
|
{
|
|
"epoch": 0.9767481330617787,
|
|
"grad_norm": 11.093944549560547,
|
|
"learning_rate": 3.316451059587777e-09,
|
|
"loss": 0.0737,
|
|
"num_input_tokens_seen": 11049792,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"epoch": 0.9771724372029871,
|
|
"grad_norm": 0.2328728288412094,
|
|
"learning_rate": 3.1970129030481907e-09,
|
|
"loss": 0.0391,
|
|
"num_input_tokens_seen": 11053952,
|
|
"step": 11515
|
|
},
|
|
{
|
|
"epoch": 0.9775967413441955,
|
|
"grad_norm": 0.33652907609939575,
|
|
"learning_rate": 3.0797616401392335e-09,
|
|
"loss": 0.0531,
|
|
"num_input_tokens_seen": 11058240,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"epoch": 0.9780210454854039,
|
|
"grad_norm": 0.9989842176437378,
|
|
"learning_rate": 2.964697528099469e-09,
|
|
"loss": 0.0639,
|
|
"num_input_tokens_seen": 11063488,
|
|
"step": 11525
|
|
},
|
|
{
|
|
"epoch": 0.9784453496266123,
|
|
"grad_norm": 12.054276466369629,
|
|
"learning_rate": 2.8518208193689664e-09,
|
|
"loss": 0.0469,
|
|
"num_input_tokens_seen": 11068096,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"epoch": 0.9788696537678208,
|
|
"grad_norm": 17.85956573486328,
|
|
"learning_rate": 2.741131761588522e-09,
|
|
"loss": 0.0953,
|
|
"num_input_tokens_seen": 11073728,
|
|
"step": 11535
|
|
},
|
|
{
|
|
"epoch": 0.9792939579090292,
|
|
"grad_norm": 10.166435241699219,
|
|
"learning_rate": 2.632630597600105e-09,
|
|
"loss": 0.0743,
|
|
"num_input_tokens_seen": 11078144,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"epoch": 0.9797182620502376,
|
|
"grad_norm": 5.313485622406006,
|
|
"learning_rate": 2.526317565444969e-09,
|
|
"loss": 0.009,
|
|
"num_input_tokens_seen": 11083328,
|
|
"step": 11545
|
|
},
|
|
{
|
|
"epoch": 0.980142566191446,
|
|
"grad_norm": 17.92743682861328,
|
|
"learning_rate": 2.422192898364095e-09,
|
|
"loss": 0.0973,
|
|
"num_input_tokens_seen": 11088064,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"epoch": 0.9805668703326544,
|
|
"grad_norm": 18.377792358398438,
|
|
"learning_rate": 2.3202568247974175e-09,
|
|
"loss": 0.023,
|
|
"num_input_tokens_seen": 11092736,
|
|
"step": 11555
|
|
},
|
|
{
|
|
"epoch": 0.9809911744738629,
|
|
"grad_norm": 21.25553321838379,
|
|
"learning_rate": 2.2205095683833774e-09,
|
|
"loss": 0.0487,
|
|
"num_input_tokens_seen": 11098176,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"epoch": 0.9814154786150713,
|
|
"grad_norm": 25.436758041381836,
|
|
"learning_rate": 2.122951347958035e-09,
|
|
"loss": 0.0612,
|
|
"num_input_tokens_seen": 11102912,
|
|
"step": 11565
|
|
},
|
|
{
|
|
"epoch": 0.9818397827562797,
|
|
"grad_norm": 7.503152370452881,
|
|
"learning_rate": 2.0275823775551817e-09,
|
|
"loss": 0.1063,
|
|
"num_input_tokens_seen": 11107392,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"epoch": 0.9822640868974881,
|
|
"grad_norm": 23.585283279418945,
|
|
"learning_rate": 1.934402866405671e-09,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 11112256,
|
|
"step": 11575
|
|
},
|
|
{
|
|
"epoch": 0.9826883910386965,
|
|
"grad_norm": 2.8121836185455322,
|
|
"learning_rate": 1.843413018936535e-09,
|
|
"loss": 0.0275,
|
|
"num_input_tokens_seen": 11117376,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"epoch": 0.983112695179905,
|
|
"grad_norm": 21.14581298828125,
|
|
"learning_rate": 1.7546130347712018e-09,
|
|
"loss": 0.0834,
|
|
"num_input_tokens_seen": 11122624,
|
|
"step": 11585
|
|
},
|
|
{
|
|
"epoch": 0.9835369993211134,
|
|
"grad_norm": 8.43548583984375,
|
|
"learning_rate": 1.6680031087286106e-09,
|
|
"loss": 0.0221,
|
|
"num_input_tokens_seen": 11127680,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"epoch": 0.9839613034623218,
|
|
"grad_norm": 16.386993408203125,
|
|
"learning_rate": 1.5835834308228768e-09,
|
|
"loss": 0.0967,
|
|
"num_input_tokens_seen": 11132672,
|
|
"step": 11595
|
|
},
|
|
{
|
|
"epoch": 0.9843856076035302,
|
|
"grad_norm": 12.732009887695312,
|
|
"learning_rate": 1.5013541862630708e-09,
|
|
"loss": 0.0922,
|
|
"num_input_tokens_seen": 11137408,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 0.9848099117447386,
|
|
"grad_norm": 0.8235511183738708,
|
|
"learning_rate": 1.4213155554525513e-09,
|
|
"loss": 0.0145,
|
|
"num_input_tokens_seen": 11142016,
|
|
"step": 11605
|
|
},
|
|
{
|
|
"epoch": 0.9852342158859471,
|
|
"grad_norm": 22.232534408569336,
|
|
"learning_rate": 1.343467713988522e-09,
|
|
"loss": 0.0414,
|
|
"num_input_tokens_seen": 11146624,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"epoch": 0.9856585200271555,
|
|
"grad_norm": 14.48363971710205,
|
|
"learning_rate": 1.2678108326621418e-09,
|
|
"loss": 0.1344,
|
|
"num_input_tokens_seen": 11151680,
|
|
"step": 11615
|
|
},
|
|
{
|
|
"epoch": 0.9860828241683639,
|
|
"grad_norm": 0.5381573438644409,
|
|
"learning_rate": 1.1943450774574148e-09,
|
|
"loss": 0.0762,
|
|
"num_input_tokens_seen": 11156096,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"epoch": 0.9865071283095723,
|
|
"grad_norm": 11.544407844543457,
|
|
"learning_rate": 1.1230706095516352e-09,
|
|
"loss": 0.0637,
|
|
"num_input_tokens_seen": 11160256,
|
|
"step": 11625
|
|
},
|
|
{
|
|
"epoch": 0.9869314324507807,
|
|
"grad_norm": 0.07506151497364044,
|
|
"learning_rate": 1.0539875853142754e-09,
|
|
"loss": 0.0354,
|
|
"num_input_tokens_seen": 11165568,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"epoch": 0.9873557365919892,
|
|
"grad_norm": 9.081315040588379,
|
|
"learning_rate": 9.8709615630721e-10,
|
|
"loss": 0.0572,
|
|
"num_input_tokens_seen": 11170432,
|
|
"step": 11635
|
|
},
|
|
{
|
|
"epoch": 0.9877800407331976,
|
|
"grad_norm": 3.4365997314453125,
|
|
"learning_rate": 9.22396469283937e-10,
|
|
"loss": 0.0744,
|
|
"num_input_tokens_seen": 11175104,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"epoch": 0.988204344874406,
|
|
"grad_norm": 0.12847605347633362,
|
|
"learning_rate": 8.598886661895787e-10,
|
|
"loss": 0.0493,
|
|
"num_input_tokens_seen": 11179584,
|
|
"step": 11645
|
|
},
|
|
{
|
|
"epoch": 0.9886286490156144,
|
|
"grad_norm": 22.073558807373047,
|
|
"learning_rate": 7.995728841605487e-10,
|
|
"loss": 0.0609,
|
|
"num_input_tokens_seen": 11184960,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"epoch": 0.9890529531568228,
|
|
"grad_norm": 0.8127935528755188,
|
|
"learning_rate": 7.41449255524107e-10,
|
|
"loss": 0.0896,
|
|
"num_input_tokens_seen": 11190464,
|
|
"step": 11655
|
|
},
|
|
{
|
|
"epoch": 0.9894772572980313,
|
|
"grad_norm": 8.514037132263184,
|
|
"learning_rate": 6.855179077981388e-10,
|
|
"loss": 0.0836,
|
|
"num_input_tokens_seen": 11195392,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"epoch": 0.9899015614392397,
|
|
"grad_norm": 0.5979529619216919,
|
|
"learning_rate": 6.3177896369071e-10,
|
|
"loss": 0.0488,
|
|
"num_input_tokens_seen": 11200128,
|
|
"step": 11665
|
|
},
|
|
{
|
|
"epoch": 0.9903258655804481,
|
|
"grad_norm": 0.6341665983200073,
|
|
"learning_rate": 5.802325411001785e-10,
|
|
"loss": 0.0546,
|
|
"num_input_tokens_seen": 11204800,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"epoch": 0.9907501697216565,
|
|
"grad_norm": 14.741137504577637,
|
|
"learning_rate": 5.308787531147496e-10,
|
|
"loss": 0.0896,
|
|
"num_input_tokens_seen": 11209472,
|
|
"step": 11675
|
|
},
|
|
{
|
|
"epoch": 0.991174473862865,
|
|
"grad_norm": 1.8743528127670288,
|
|
"learning_rate": 4.837177080119214e-10,
|
|
"loss": 0.0667,
|
|
"num_input_tokens_seen": 11214016,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"epoch": 0.9915987780040734,
|
|
"grad_norm": 6.062661647796631,
|
|
"learning_rate": 4.387495092587068e-10,
|
|
"loss": 0.071,
|
|
"num_input_tokens_seen": 11218752,
|
|
"step": 11685
|
|
},
|
|
{
|
|
"epoch": 0.9920230821452818,
|
|
"grad_norm": 11.430667877197266,
|
|
"learning_rate": 3.959742555111889e-10,
|
|
"loss": 0.1201,
|
|
"num_input_tokens_seen": 11223552,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"epoch": 0.9924473862864902,
|
|
"grad_norm": 43.32329177856445,
|
|
"learning_rate": 3.553920406144106e-10,
|
|
"loss": 0.0678,
|
|
"num_input_tokens_seen": 11228032,
|
|
"step": 11695
|
|
},
|
|
{
|
|
"epoch": 0.9928716904276986,
|
|
"grad_norm": 1.586906909942627,
|
|
"learning_rate": 3.1700295360181927e-10,
|
|
"loss": 0.0473,
|
|
"num_input_tokens_seen": 11232448,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 0.993295994568907,
|
|
"grad_norm": 38.332176208496094,
|
|
"learning_rate": 2.808070786955996e-10,
|
|
"loss": 0.091,
|
|
"num_input_tokens_seen": 11237440,
|
|
"step": 11705
|
|
},
|
|
{
|
|
"epoch": 0.9937202987101154,
|
|
"grad_norm": 1.260671854019165,
|
|
"learning_rate": 2.4680449530622984e-10,
|
|
"loss": 0.0588,
|
|
"num_input_tokens_seen": 11242048,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"epoch": 0.9941446028513238,
|
|
"grad_norm": 9.481032371520996,
|
|
"learning_rate": 2.1499527803214846e-10,
|
|
"loss": 0.0834,
|
|
"num_input_tokens_seen": 11246400,
|
|
"step": 11715
|
|
},
|
|
{
|
|
"epoch": 0.9945689069925322,
|
|
"grad_norm": 8.571615219116211,
|
|
"learning_rate": 1.8537949665997642e-10,
|
|
"loss": 0.0513,
|
|
"num_input_tokens_seen": 11251456,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"epoch": 0.9949932111337406,
|
|
"grad_norm": 0.35297322273254395,
|
|
"learning_rate": 1.5795721616373992e-10,
|
|
"loss": 0.0241,
|
|
"num_input_tokens_seen": 11256128,
|
|
"step": 11725
|
|
},
|
|
{
|
|
"epoch": 0.995417515274949,
|
|
"grad_norm": 14.675886154174805,
|
|
"learning_rate": 1.3272849670564746e-10,
|
|
"loss": 0.0464,
|
|
"num_input_tokens_seen": 11260736,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"epoch": 0.9958418194161575,
|
|
"grad_norm": 9.873900413513184,
|
|
"learning_rate": 1.0969339363497975e-10,
|
|
"loss": 0.0742,
|
|
"num_input_tokens_seen": 11264896,
|
|
"step": 11735
|
|
},
|
|
{
|
|
"epoch": 0.9962661235573659,
|
|
"grad_norm": 13.793900489807129,
|
|
"learning_rate": 8.885195748875584e-11,
|
|
"loss": 0.0248,
|
|
"num_input_tokens_seen": 11270400,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"epoch": 0.9966904276985743,
|
|
"grad_norm": 0.5965880155563354,
|
|
"learning_rate": 7.020423399117791e-11,
|
|
"loss": 0.0181,
|
|
"num_input_tokens_seen": 11275008,
|
|
"step": 11745
|
|
},
|
|
{
|
|
"epoch": 0.9971147318397827,
|
|
"grad_norm": 0.791724681854248,
|
|
"learning_rate": 5.375026405352034e-11,
|
|
"loss": 0.0732,
|
|
"num_input_tokens_seen": 11279808,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"epoch": 0.9975390359809911,
|
|
"grad_norm": 12.67089557647705,
|
|
"learning_rate": 3.949008377424068e-11,
|
|
"loss": 0.0392,
|
|
"num_input_tokens_seen": 11283904,
|
|
"step": 11755
|
|
},
|
|
{
|
|
"epoch": 0.9979633401221996,
|
|
"grad_norm": 25.567686080932617,
|
|
"learning_rate": 2.742372443909069e-11,
|
|
"loss": 0.0411,
|
|
"num_input_tokens_seen": 11288512,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"epoch": 0.998387644263408,
|
|
"grad_norm": 18.052406311035156,
|
|
"learning_rate": 1.7551212520339197e-11,
|
|
"loss": 0.1038,
|
|
"num_input_tokens_seen": 11293568,
|
|
"step": 11765
|
|
},
|
|
{
|
|
"epoch": 0.9988119484046164,
|
|
"grad_norm": 1.8384336233139038,
|
|
"learning_rate": 9.872569677438213e-12,
|
|
"loss": 0.1264,
|
|
"num_input_tokens_seen": 11298752,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"epoch": 0.9992362525458248,
|
|
"grad_norm": 26.06826400756836,
|
|
"learning_rate": 4.387812756578846e-12,
|
|
"loss": 0.0502,
|
|
"num_input_tokens_seen": 11303680,
|
|
"step": 11775
|
|
},
|
|
{
|
|
"epoch": 0.9996605566870332,
|
|
"grad_norm": 0.45706942677497864,
|
|
"learning_rate": 1.0969537908023242e-12,
|
|
"loss": 0.0049,
|
|
"num_input_tokens_seen": 11308288,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"num_input_tokens_seen": 11312256,
|
|
"step": 11784,
|
|
"total_flos": 6.605086766609203e+16,
|
|
"train_loss": 0.07284873837158741,
|
|
"train_runtime": 2049.6539,
|
|
"train_samples_per_second": 45.992,
|
|
"train_steps_per_second": 5.749
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 11784,
|
|
"num_input_tokens_seen": 11312256,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 590,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 6.605086766609203e+16,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|