25208 lines
662 KiB
JSON
25208 lines
662 KiB
JSON
{
|
|
"best_global_step": 14858,
|
|
"best_metric": 0.3537425398826599,
|
|
"best_model_checkpoint": "saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_record_42_1779354540/checkpoint-14858",
|
|
"epoch": 1.0,
|
|
"eval_steps": 782,
|
|
"global_step": 15621,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0003200819409768901,
|
|
"grad_norm": 655.673828125,
|
|
"learning_rate": 5.118362124120281e-09,
|
|
"loss": 2.1603,
|
|
"num_input_tokens_seen": 15360,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.0006401638819537802,
|
|
"grad_norm": 461.5277099609375,
|
|
"learning_rate": 1.1516314779270634e-08,
|
|
"loss": 2.344,
|
|
"num_input_tokens_seen": 31104,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0009602458229306702,
|
|
"grad_norm": 540.4111938476562,
|
|
"learning_rate": 1.7914267434420987e-08,
|
|
"loss": 2.115,
|
|
"num_input_tokens_seen": 46208,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.0012803277639075604,
|
|
"grad_norm": 371.52410888671875,
|
|
"learning_rate": 2.431222008957134e-08,
|
|
"loss": 2.741,
|
|
"num_input_tokens_seen": 62464,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.0016004097048844504,
|
|
"grad_norm": 420.4732666015625,
|
|
"learning_rate": 3.071017274472169e-08,
|
|
"loss": 2.0952,
|
|
"num_input_tokens_seen": 79104,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.0019204916458613404,
|
|
"grad_norm": 360.0107421875,
|
|
"learning_rate": 3.710812539987204e-08,
|
|
"loss": 2.1934,
|
|
"num_input_tokens_seen": 94912,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.0022405735868382304,
|
|
"grad_norm": 533.6338500976562,
|
|
"learning_rate": 4.350607805502239e-08,
|
|
"loss": 2.3371,
|
|
"num_input_tokens_seen": 110784,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.002560655527815121,
|
|
"grad_norm": 316.48663330078125,
|
|
"learning_rate": 4.990403071017274e-08,
|
|
"loss": 2.1424,
|
|
"num_input_tokens_seen": 125696,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.002880737468792011,
|
|
"grad_norm": 393.286865234375,
|
|
"learning_rate": 5.6301983365323095e-08,
|
|
"loss": 2.0945,
|
|
"num_input_tokens_seen": 140672,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.003200819409768901,
|
|
"grad_norm": 386.58819580078125,
|
|
"learning_rate": 6.269993602047345e-08,
|
|
"loss": 2.0027,
|
|
"num_input_tokens_seen": 155456,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.003520901350745791,
|
|
"grad_norm": 368.06884765625,
|
|
"learning_rate": 6.90978886756238e-08,
|
|
"loss": 1.915,
|
|
"num_input_tokens_seen": 170816,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.003840983291722681,
|
|
"grad_norm": 332.34002685546875,
|
|
"learning_rate": 7.549584133077414e-08,
|
|
"loss": 2.0244,
|
|
"num_input_tokens_seen": 185088,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.004161065232699571,
|
|
"grad_norm": 362.953125,
|
|
"learning_rate": 8.18937939859245e-08,
|
|
"loss": 1.6385,
|
|
"num_input_tokens_seen": 200384,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.004481147173676461,
|
|
"grad_norm": 266.420166015625,
|
|
"learning_rate": 8.829174664107485e-08,
|
|
"loss": 1.6591,
|
|
"num_input_tokens_seen": 215744,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.004801229114653352,
|
|
"grad_norm": 168.38560485839844,
|
|
"learning_rate": 9.468969929622521e-08,
|
|
"loss": 1.6555,
|
|
"num_input_tokens_seen": 230400,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.005121311055630242,
|
|
"grad_norm": 282.5287780761719,
|
|
"learning_rate": 1.0108765195137556e-07,
|
|
"loss": 1.3232,
|
|
"num_input_tokens_seen": 246592,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.005441392996607132,
|
|
"grad_norm": 106.96839141845703,
|
|
"learning_rate": 1.074856046065259e-07,
|
|
"loss": 1.1532,
|
|
"num_input_tokens_seen": 262272,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.005761474937584022,
|
|
"grad_norm": 119.50403594970703,
|
|
"learning_rate": 1.1388355726167625e-07,
|
|
"loss": 1.0452,
|
|
"num_input_tokens_seen": 277760,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.006081556878560912,
|
|
"grad_norm": 165.79542541503906,
|
|
"learning_rate": 1.202815099168266e-07,
|
|
"loss": 1.2493,
|
|
"num_input_tokens_seen": 292992,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.006401638819537802,
|
|
"grad_norm": 155.2497100830078,
|
|
"learning_rate": 1.2667946257197694e-07,
|
|
"loss": 1.1191,
|
|
"num_input_tokens_seen": 307840,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.006721720760514692,
|
|
"grad_norm": 112.60347747802734,
|
|
"learning_rate": 1.3307741522712732e-07,
|
|
"loss": 1.0359,
|
|
"num_input_tokens_seen": 323008,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.007041802701491582,
|
|
"grad_norm": 88.95298767089844,
|
|
"learning_rate": 1.3947536788227767e-07,
|
|
"loss": 1.0546,
|
|
"num_input_tokens_seen": 339456,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.007361884642468472,
|
|
"grad_norm": 87.05043029785156,
|
|
"learning_rate": 1.45873320537428e-07,
|
|
"loss": 1.1286,
|
|
"num_input_tokens_seen": 354816,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.007681966583445362,
|
|
"grad_norm": 77.32754516601562,
|
|
"learning_rate": 1.5227127319257838e-07,
|
|
"loss": 0.8243,
|
|
"num_input_tokens_seen": 369472,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.008002048524422252,
|
|
"grad_norm": 69.6989974975586,
|
|
"learning_rate": 1.586692258477287e-07,
|
|
"loss": 0.9582,
|
|
"num_input_tokens_seen": 384768,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.008322130465399142,
|
|
"grad_norm": 96.46429443359375,
|
|
"learning_rate": 1.6506717850287908e-07,
|
|
"loss": 1.0307,
|
|
"num_input_tokens_seen": 400192,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.008642212406376032,
|
|
"grad_norm": 118.02337646484375,
|
|
"learning_rate": 1.7146513115802943e-07,
|
|
"loss": 0.8953,
|
|
"num_input_tokens_seen": 416640,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.008962294347352922,
|
|
"grad_norm": 65.88743591308594,
|
|
"learning_rate": 1.7786308381317976e-07,
|
|
"loss": 0.8263,
|
|
"num_input_tokens_seen": 432640,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.009282376288329812,
|
|
"grad_norm": 77.22103881835938,
|
|
"learning_rate": 1.8426103646833014e-07,
|
|
"loss": 0.8971,
|
|
"num_input_tokens_seen": 448640,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.009602458229306703,
|
|
"grad_norm": 88.69629669189453,
|
|
"learning_rate": 1.9065898912348046e-07,
|
|
"loss": 0.9544,
|
|
"num_input_tokens_seen": 464448,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.009922540170283593,
|
|
"grad_norm": 86.22632598876953,
|
|
"learning_rate": 1.9705694177863084e-07,
|
|
"loss": 0.8598,
|
|
"num_input_tokens_seen": 479488,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.010242622111260483,
|
|
"grad_norm": 55.39344787597656,
|
|
"learning_rate": 2.034548944337812e-07,
|
|
"loss": 0.7343,
|
|
"num_input_tokens_seen": 495296,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.010562704052237373,
|
|
"grad_norm": 87.78097534179688,
|
|
"learning_rate": 2.0985284708893152e-07,
|
|
"loss": 0.7845,
|
|
"num_input_tokens_seen": 510144,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.010882785993214263,
|
|
"grad_norm": 80.47422790527344,
|
|
"learning_rate": 2.162507997440819e-07,
|
|
"loss": 0.8491,
|
|
"num_input_tokens_seen": 524928,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.011202867934191153,
|
|
"grad_norm": 45.75130081176758,
|
|
"learning_rate": 2.2264875239923222e-07,
|
|
"loss": 0.7122,
|
|
"num_input_tokens_seen": 541504,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.011522949875168043,
|
|
"grad_norm": 81.46015167236328,
|
|
"learning_rate": 2.290467050543826e-07,
|
|
"loss": 0.7354,
|
|
"num_input_tokens_seen": 556096,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.011843031816144933,
|
|
"grad_norm": 77.93597412109375,
|
|
"learning_rate": 2.3544465770953295e-07,
|
|
"loss": 0.734,
|
|
"num_input_tokens_seen": 572736,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.012163113757121823,
|
|
"grad_norm": 73.0274658203125,
|
|
"learning_rate": 2.418426103646833e-07,
|
|
"loss": 0.8565,
|
|
"num_input_tokens_seen": 588352,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.012483195698098713,
|
|
"grad_norm": 56.91474533081055,
|
|
"learning_rate": 2.4824056301983363e-07,
|
|
"loss": 0.9816,
|
|
"num_input_tokens_seen": 603520,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.012803277639075603,
|
|
"grad_norm": 66.9703369140625,
|
|
"learning_rate": 2.54638515674984e-07,
|
|
"loss": 0.8158,
|
|
"num_input_tokens_seen": 619392,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.013123359580052493,
|
|
"grad_norm": 59.1487922668457,
|
|
"learning_rate": 2.6103646833013433e-07,
|
|
"loss": 0.8032,
|
|
"num_input_tokens_seen": 635456,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.013443441521029383,
|
|
"grad_norm": 121.4522705078125,
|
|
"learning_rate": 2.6743442098528466e-07,
|
|
"loss": 0.8716,
|
|
"num_input_tokens_seen": 650880,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.013763523462006273,
|
|
"grad_norm": 50.31541442871094,
|
|
"learning_rate": 2.7383237364043504e-07,
|
|
"loss": 0.8278,
|
|
"num_input_tokens_seen": 666688,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.014083605402983163,
|
|
"grad_norm": 70.05236053466797,
|
|
"learning_rate": 2.802303262955854e-07,
|
|
"loss": 0.7898,
|
|
"num_input_tokens_seen": 682112,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.014403687343960053,
|
|
"grad_norm": 64.9844741821289,
|
|
"learning_rate": 2.866282789507358e-07,
|
|
"loss": 0.8381,
|
|
"num_input_tokens_seen": 697728,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.014723769284936943,
|
|
"grad_norm": 53.501747131347656,
|
|
"learning_rate": 2.9302623160588607e-07,
|
|
"loss": 0.6829,
|
|
"num_input_tokens_seen": 712704,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.015043851225913833,
|
|
"grad_norm": 91.16888427734375,
|
|
"learning_rate": 2.9942418426103644e-07,
|
|
"loss": 0.9619,
|
|
"num_input_tokens_seen": 729408,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.015363933166890723,
|
|
"grad_norm": 89.74860382080078,
|
|
"learning_rate": 3.058221369161868e-07,
|
|
"loss": 0.7854,
|
|
"num_input_tokens_seen": 745344,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.015684015107867613,
|
|
"grad_norm": 56.365665435791016,
|
|
"learning_rate": 3.1222008957133715e-07,
|
|
"loss": 0.6965,
|
|
"num_input_tokens_seen": 762688,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.016004097048844503,
|
|
"grad_norm": 62.77731704711914,
|
|
"learning_rate": 3.186180422264875e-07,
|
|
"loss": 0.7105,
|
|
"num_input_tokens_seen": 779392,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.016324178989821393,
|
|
"grad_norm": 80.97101593017578,
|
|
"learning_rate": 3.2501599488163785e-07,
|
|
"loss": 0.7964,
|
|
"num_input_tokens_seen": 794112,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.016644260930798283,
|
|
"grad_norm": 50.28890609741211,
|
|
"learning_rate": 3.314139475367882e-07,
|
|
"loss": 0.8427,
|
|
"num_input_tokens_seen": 810112,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.016964342871775173,
|
|
"grad_norm": 79.30187225341797,
|
|
"learning_rate": 3.3781190019193855e-07,
|
|
"loss": 0.8614,
|
|
"num_input_tokens_seen": 825472,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.017284424812752063,
|
|
"grad_norm": 69.35704803466797,
|
|
"learning_rate": 3.4420985284708893e-07,
|
|
"loss": 0.9819,
|
|
"num_input_tokens_seen": 840128,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.017604506753728953,
|
|
"grad_norm": 70.34232330322266,
|
|
"learning_rate": 3.5060780550223926e-07,
|
|
"loss": 0.7825,
|
|
"num_input_tokens_seen": 855104,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.017924588694705843,
|
|
"grad_norm": 67.7530517578125,
|
|
"learning_rate": 3.570057581573896e-07,
|
|
"loss": 0.8069,
|
|
"num_input_tokens_seen": 870848,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.018244670635682733,
|
|
"grad_norm": 46.21129608154297,
|
|
"learning_rate": 3.6340371081253996e-07,
|
|
"loss": 0.7403,
|
|
"num_input_tokens_seen": 885760,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.018564752576659623,
|
|
"grad_norm": 44.078643798828125,
|
|
"learning_rate": 3.6980166346769034e-07,
|
|
"loss": 0.7078,
|
|
"num_input_tokens_seen": 900928,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.018884834517636517,
|
|
"grad_norm": 54.419532775878906,
|
|
"learning_rate": 3.7619961612284067e-07,
|
|
"loss": 0.793,
|
|
"num_input_tokens_seen": 915968,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.019204916458613407,
|
|
"grad_norm": 107.00920867919922,
|
|
"learning_rate": 3.8259756877799104e-07,
|
|
"loss": 0.9919,
|
|
"num_input_tokens_seen": 933056,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.019524998399590297,
|
|
"grad_norm": 84.30803680419922,
|
|
"learning_rate": 3.889955214331414e-07,
|
|
"loss": 0.7373,
|
|
"num_input_tokens_seen": 948416,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.019845080340567187,
|
|
"grad_norm": 65.89620971679688,
|
|
"learning_rate": 3.953934740882917e-07,
|
|
"loss": 0.7694,
|
|
"num_input_tokens_seen": 962880,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.020165162281544077,
|
|
"grad_norm": 58.68693923950195,
|
|
"learning_rate": 4.0179142674344207e-07,
|
|
"loss": 0.8088,
|
|
"num_input_tokens_seen": 979904,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.020485244222520967,
|
|
"grad_norm": 64.4815902709961,
|
|
"learning_rate": 4.0818937939859245e-07,
|
|
"loss": 0.8251,
|
|
"num_input_tokens_seen": 995136,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.020805326163497857,
|
|
"grad_norm": 59.8892707824707,
|
|
"learning_rate": 4.145873320537428e-07,
|
|
"loss": 0.7695,
|
|
"num_input_tokens_seen": 1011008,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.021125408104474747,
|
|
"grad_norm": 61.05699157714844,
|
|
"learning_rate": 4.2098528470889315e-07,
|
|
"loss": 0.8335,
|
|
"num_input_tokens_seen": 1025792,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.021445490045451637,
|
|
"grad_norm": 54.53645324707031,
|
|
"learning_rate": 4.273832373640435e-07,
|
|
"loss": 0.6901,
|
|
"num_input_tokens_seen": 1042944,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.021765571986428527,
|
|
"grad_norm": 69.49205017089844,
|
|
"learning_rate": 4.3378119001919386e-07,
|
|
"loss": 0.8267,
|
|
"num_input_tokens_seen": 1058688,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.022085653927405417,
|
|
"grad_norm": 52.010841369628906,
|
|
"learning_rate": 4.401791426743442e-07,
|
|
"loss": 0.7233,
|
|
"num_input_tokens_seen": 1074560,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.022405735868382307,
|
|
"grad_norm": 76.15229034423828,
|
|
"learning_rate": 4.4657709532949456e-07,
|
|
"loss": 0.6991,
|
|
"num_input_tokens_seen": 1089728,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.022725817809359197,
|
|
"grad_norm": 93.6746597290039,
|
|
"learning_rate": 4.5297504798464494e-07,
|
|
"loss": 0.9114,
|
|
"num_input_tokens_seen": 1105024,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.023045899750336087,
|
|
"grad_norm": 51.18860626220703,
|
|
"learning_rate": 4.593730006397952e-07,
|
|
"loss": 0.7824,
|
|
"num_input_tokens_seen": 1121088,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.023365981691312977,
|
|
"grad_norm": 51.50726318359375,
|
|
"learning_rate": 4.657709532949456e-07,
|
|
"loss": 0.7048,
|
|
"num_input_tokens_seen": 1136896,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.023686063632289867,
|
|
"grad_norm": 61.41284942626953,
|
|
"learning_rate": 4.7216890595009597e-07,
|
|
"loss": 0.7082,
|
|
"num_input_tokens_seen": 1153280,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.024006145573266757,
|
|
"grad_norm": 56.15473175048828,
|
|
"learning_rate": 4.785668586052463e-07,
|
|
"loss": 0.8338,
|
|
"num_input_tokens_seen": 1169536,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.024326227514243647,
|
|
"grad_norm": 87.2235107421875,
|
|
"learning_rate": 4.849648112603967e-07,
|
|
"loss": 0.7577,
|
|
"num_input_tokens_seen": 1185088,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.024646309455220537,
|
|
"grad_norm": 35.1290397644043,
|
|
"learning_rate": 4.91362763915547e-07,
|
|
"loss": 0.6664,
|
|
"num_input_tokens_seen": 1200832,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.024966391396197427,
|
|
"grad_norm": 50.34434509277344,
|
|
"learning_rate": 4.977607165706974e-07,
|
|
"loss": 0.6605,
|
|
"num_input_tokens_seen": 1216320,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.025286473337174317,
|
|
"grad_norm": 61.464664459228516,
|
|
"learning_rate": 5.041586692258478e-07,
|
|
"loss": 0.7361,
|
|
"num_input_tokens_seen": 1232832,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.025606555278151207,
|
|
"grad_norm": 49.942779541015625,
|
|
"learning_rate": 5.10556621880998e-07,
|
|
"loss": 0.7037,
|
|
"num_input_tokens_seen": 1248384,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.025926637219128097,
|
|
"grad_norm": 35.994441986083984,
|
|
"learning_rate": 5.169545745361484e-07,
|
|
"loss": 0.6727,
|
|
"num_input_tokens_seen": 1263936,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.026246719160104987,
|
|
"grad_norm": 65.4025650024414,
|
|
"learning_rate": 5.233525271912988e-07,
|
|
"loss": 1.118,
|
|
"num_input_tokens_seen": 1294208,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.026566801101081877,
|
|
"grad_norm": 63.024566650390625,
|
|
"learning_rate": 5.297504798464492e-07,
|
|
"loss": 0.7921,
|
|
"num_input_tokens_seen": 1309120,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.026886883042058767,
|
|
"grad_norm": 56.8184814453125,
|
|
"learning_rate": 5.361484325015994e-07,
|
|
"loss": 0.8592,
|
|
"num_input_tokens_seen": 1324224,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.027206964983035656,
|
|
"grad_norm": 74.34542846679688,
|
|
"learning_rate": 5.425463851567498e-07,
|
|
"loss": 0.6829,
|
|
"num_input_tokens_seen": 1341056,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.027527046924012546,
|
|
"grad_norm": 52.57733154296875,
|
|
"learning_rate": 5.489443378119002e-07,
|
|
"loss": 0.7533,
|
|
"num_input_tokens_seen": 1356544,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.027847128864989436,
|
|
"grad_norm": 62.9859733581543,
|
|
"learning_rate": 5.553422904670505e-07,
|
|
"loss": 0.6696,
|
|
"num_input_tokens_seen": 1371840,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.028167210805966326,
|
|
"grad_norm": 55.78180694580078,
|
|
"learning_rate": 5.61740243122201e-07,
|
|
"loss": 0.6825,
|
|
"num_input_tokens_seen": 1386816,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.028487292746943216,
|
|
"grad_norm": 64.4063720703125,
|
|
"learning_rate": 5.681381957773512e-07,
|
|
"loss": 0.7438,
|
|
"num_input_tokens_seen": 1401792,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.028807374687920106,
|
|
"grad_norm": 66.20137023925781,
|
|
"learning_rate": 5.745361484325015e-07,
|
|
"loss": 0.6214,
|
|
"num_input_tokens_seen": 1416896,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.029127456628896996,
|
|
"grad_norm": 82.4999008178711,
|
|
"learning_rate": 5.80934101087652e-07,
|
|
"loss": 0.7517,
|
|
"num_input_tokens_seen": 1432704,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.029447538569873886,
|
|
"grad_norm": 85.98738861083984,
|
|
"learning_rate": 5.873320537428022e-07,
|
|
"loss": 0.7009,
|
|
"num_input_tokens_seen": 1448384,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.029767620510850776,
|
|
"grad_norm": 60.4025764465332,
|
|
"learning_rate": 5.937300063979526e-07,
|
|
"loss": 0.7179,
|
|
"num_input_tokens_seen": 1464832,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.030087702451827666,
|
|
"grad_norm": 69.6055908203125,
|
|
"learning_rate": 6.00127959053103e-07,
|
|
"loss": 0.6785,
|
|
"num_input_tokens_seen": 1479424,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.030407784392804556,
|
|
"grad_norm": 40.376953125,
|
|
"learning_rate": 6.065259117082533e-07,
|
|
"loss": 0.7292,
|
|
"num_input_tokens_seen": 1494336,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.030727866333781446,
|
|
"grad_norm": 53.5233154296875,
|
|
"learning_rate": 6.129238643634037e-07,
|
|
"loss": 0.6741,
|
|
"num_input_tokens_seen": 1509184,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.031047948274758336,
|
|
"grad_norm": 36.17082214355469,
|
|
"learning_rate": 6.19321817018554e-07,
|
|
"loss": 0.8032,
|
|
"num_input_tokens_seen": 1525504,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.031368030215735226,
|
|
"grad_norm": 39.83842468261719,
|
|
"learning_rate": 6.257197696737044e-07,
|
|
"loss": 0.5911,
|
|
"num_input_tokens_seen": 1541504,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.03168811215671212,
|
|
"grad_norm": 38.20148849487305,
|
|
"learning_rate": 6.321177223288548e-07,
|
|
"loss": 0.6188,
|
|
"num_input_tokens_seen": 1557184,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.032008194097689006,
|
|
"grad_norm": 58.953765869140625,
|
|
"learning_rate": 6.385156749840051e-07,
|
|
"loss": 0.7662,
|
|
"num_input_tokens_seen": 1573440,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.0323282760386659,
|
|
"grad_norm": 34.08373260498047,
|
|
"learning_rate": 6.449136276391554e-07,
|
|
"loss": 0.8712,
|
|
"num_input_tokens_seen": 1588736,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.032648357979642786,
|
|
"grad_norm": 42.26185989379883,
|
|
"learning_rate": 6.513115802943058e-07,
|
|
"loss": 0.6979,
|
|
"num_input_tokens_seen": 1604352,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.03296843992061968,
|
|
"grad_norm": 39.405391693115234,
|
|
"learning_rate": 6.577095329494562e-07,
|
|
"loss": 0.6574,
|
|
"num_input_tokens_seen": 1618816,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.033288521861596566,
|
|
"grad_norm": 68.6015853881836,
|
|
"learning_rate": 6.641074856046065e-07,
|
|
"loss": 0.7462,
|
|
"num_input_tokens_seen": 1635648,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.03360860380257346,
|
|
"grad_norm": 51.983734130859375,
|
|
"learning_rate": 6.705054382597568e-07,
|
|
"loss": 0.719,
|
|
"num_input_tokens_seen": 1651328,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.033928685743550346,
|
|
"grad_norm": 41.2055549621582,
|
|
"learning_rate": 6.769033909149072e-07,
|
|
"loss": 0.7345,
|
|
"num_input_tokens_seen": 1668928,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.03424876768452724,
|
|
"grad_norm": 49.78337478637695,
|
|
"learning_rate": 6.833013435700575e-07,
|
|
"loss": 0.6202,
|
|
"num_input_tokens_seen": 1685504,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.034568849625504126,
|
|
"grad_norm": 56.454078674316406,
|
|
"learning_rate": 6.89699296225208e-07,
|
|
"loss": 0.7053,
|
|
"num_input_tokens_seen": 1701952,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.03488893156648102,
|
|
"grad_norm": 57.62031173706055,
|
|
"learning_rate": 6.960972488803583e-07,
|
|
"loss": 0.7308,
|
|
"num_input_tokens_seen": 1716992,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.035209013507457906,
|
|
"grad_norm": 26.579238891601562,
|
|
"learning_rate": 7.024952015355085e-07,
|
|
"loss": 0.5835,
|
|
"num_input_tokens_seen": 1732160,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.0355290954484348,
|
|
"grad_norm": 62.191402435302734,
|
|
"learning_rate": 7.08893154190659e-07,
|
|
"loss": 0.6553,
|
|
"num_input_tokens_seen": 1748416,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.035849177389411686,
|
|
"grad_norm": 47.643890380859375,
|
|
"learning_rate": 7.152911068458093e-07,
|
|
"loss": 0.7096,
|
|
"num_input_tokens_seen": 1763776,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.03616925933038858,
|
|
"grad_norm": 53.94837188720703,
|
|
"learning_rate": 7.216890595009597e-07,
|
|
"loss": 0.6985,
|
|
"num_input_tokens_seen": 1780160,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.036489341271365466,
|
|
"grad_norm": 49.82310104370117,
|
|
"learning_rate": 7.2808701215611e-07,
|
|
"loss": 0.6057,
|
|
"num_input_tokens_seen": 1795968,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.03680942321234236,
|
|
"grad_norm": 45.038936614990234,
|
|
"learning_rate": 7.344849648112603e-07,
|
|
"loss": 0.6327,
|
|
"num_input_tokens_seen": 1815424,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.037129505153319246,
|
|
"grad_norm": 86.25282287597656,
|
|
"learning_rate": 7.408829174664107e-07,
|
|
"loss": 0.8275,
|
|
"num_input_tokens_seen": 1831936,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.03744958709429614,
|
|
"grad_norm": 33.02293014526367,
|
|
"learning_rate": 7.472808701215611e-07,
|
|
"loss": 0.6155,
|
|
"num_input_tokens_seen": 1847424,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.03776966903527303,
|
|
"grad_norm": 46.377925872802734,
|
|
"learning_rate": 7.536788227767114e-07,
|
|
"loss": 0.7381,
|
|
"num_input_tokens_seen": 1862400,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.03808975097624992,
|
|
"grad_norm": 83.56999969482422,
|
|
"learning_rate": 7.600767754318617e-07,
|
|
"loss": 0.7694,
|
|
"num_input_tokens_seen": 1876928,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.03840983291722681,
|
|
"grad_norm": 52.600372314453125,
|
|
"learning_rate": 7.664747280870121e-07,
|
|
"loss": 0.6363,
|
|
"num_input_tokens_seen": 1892608,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.0387299148582037,
|
|
"grad_norm": 35.62962341308594,
|
|
"learning_rate": 7.728726807421625e-07,
|
|
"loss": 0.7292,
|
|
"num_input_tokens_seen": 1909696,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.03904999679918059,
|
|
"grad_norm": 38.61429214477539,
|
|
"learning_rate": 7.792706333973129e-07,
|
|
"loss": 0.7601,
|
|
"num_input_tokens_seen": 1924864,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.03937007874015748,
|
|
"grad_norm": 35.40009689331055,
|
|
"learning_rate": 7.856685860524632e-07,
|
|
"loss": 0.5592,
|
|
"num_input_tokens_seen": 1939968,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.03969016068113437,
|
|
"grad_norm": 69.34685516357422,
|
|
"learning_rate": 7.920665387076135e-07,
|
|
"loss": 0.7152,
|
|
"num_input_tokens_seen": 1955136,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.04001024262211126,
|
|
"grad_norm": 42.532005310058594,
|
|
"learning_rate": 7.984644913627639e-07,
|
|
"loss": 0.7036,
|
|
"num_input_tokens_seen": 1970880,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.04033032456308815,
|
|
"grad_norm": 61.369667053222656,
|
|
"learning_rate": 8.048624440179143e-07,
|
|
"loss": 0.5794,
|
|
"num_input_tokens_seen": 1986752,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.04065040650406504,
|
|
"grad_norm": 39.555450439453125,
|
|
"learning_rate": 8.112603966730645e-07,
|
|
"loss": 0.5994,
|
|
"num_input_tokens_seen": 2001856,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.04097048844504193,
|
|
"grad_norm": 48.69257354736328,
|
|
"learning_rate": 8.17658349328215e-07,
|
|
"loss": 0.6586,
|
|
"num_input_tokens_seen": 2019968,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.04129057038601882,
|
|
"grad_norm": 57.668907165527344,
|
|
"learning_rate": 8.240563019833653e-07,
|
|
"loss": 0.7047,
|
|
"num_input_tokens_seen": 2035328,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.04161065232699571,
|
|
"grad_norm": 43.12187576293945,
|
|
"learning_rate": 8.304542546385156e-07,
|
|
"loss": 0.6282,
|
|
"num_input_tokens_seen": 2055168,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.0419307342679726,
|
|
"grad_norm": 43.47643280029297,
|
|
"learning_rate": 8.36852207293666e-07,
|
|
"loss": 0.7521,
|
|
"num_input_tokens_seen": 2071808,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.04225081620894949,
|
|
"grad_norm": 44.587730407714844,
|
|
"learning_rate": 8.432501599488163e-07,
|
|
"loss": 0.6527,
|
|
"num_input_tokens_seen": 2087424,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.04257089814992638,
|
|
"grad_norm": 35.20018768310547,
|
|
"learning_rate": 8.496481126039667e-07,
|
|
"loss": 0.7682,
|
|
"num_input_tokens_seen": 2102592,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.04289098009090327,
|
|
"grad_norm": 44.10483169555664,
|
|
"learning_rate": 8.560460652591171e-07,
|
|
"loss": 0.6517,
|
|
"num_input_tokens_seen": 2119488,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.04321106203188016,
|
|
"grad_norm": 37.69010543823242,
|
|
"learning_rate": 8.624440179142674e-07,
|
|
"loss": 0.6454,
|
|
"num_input_tokens_seen": 2136000,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.04353114397285705,
|
|
"grad_norm": 47.20091247558594,
|
|
"learning_rate": 8.688419705694177e-07,
|
|
"loss": 0.7404,
|
|
"num_input_tokens_seen": 2152448,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.04385122591383394,
|
|
"grad_norm": 44.33426284790039,
|
|
"learning_rate": 8.752399232245681e-07,
|
|
"loss": 0.6177,
|
|
"num_input_tokens_seen": 2168000,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.04417130785481083,
|
|
"grad_norm": 42.24176025390625,
|
|
"learning_rate": 8.816378758797185e-07,
|
|
"loss": 0.5953,
|
|
"num_input_tokens_seen": 2183552,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.04449138979578772,
|
|
"grad_norm": 52.65016174316406,
|
|
"learning_rate": 8.880358285348688e-07,
|
|
"loss": 0.7135,
|
|
"num_input_tokens_seen": 2199488,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.04481147173676461,
|
|
"grad_norm": 36.20340347290039,
|
|
"learning_rate": 8.944337811900191e-07,
|
|
"loss": 0.6167,
|
|
"num_input_tokens_seen": 2215296,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.0451315536777415,
|
|
"grad_norm": 50.2882080078125,
|
|
"learning_rate": 9.008317338451695e-07,
|
|
"loss": 0.7051,
|
|
"num_input_tokens_seen": 2230016,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.04545163561871839,
|
|
"grad_norm": 48.945701599121094,
|
|
"learning_rate": 9.072296865003198e-07,
|
|
"loss": 0.6629,
|
|
"num_input_tokens_seen": 2245056,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.04577171755969528,
|
|
"grad_norm": 35.3903923034668,
|
|
"learning_rate": 9.136276391554703e-07,
|
|
"loss": 0.6166,
|
|
"num_input_tokens_seen": 2261248,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.04609179950067217,
|
|
"grad_norm": 57.04933547973633,
|
|
"learning_rate": 9.200255918106205e-07,
|
|
"loss": 0.6516,
|
|
"num_input_tokens_seen": 2278016,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.04641188144164906,
|
|
"grad_norm": 46.2874755859375,
|
|
"learning_rate": 9.264235444657708e-07,
|
|
"loss": 0.5696,
|
|
"num_input_tokens_seen": 2292800,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.04673196338262595,
|
|
"grad_norm": 34.89374542236328,
|
|
"learning_rate": 9.328214971209213e-07,
|
|
"loss": 0.6049,
|
|
"num_input_tokens_seen": 2308224,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.04705204532360284,
|
|
"grad_norm": 39.93567657470703,
|
|
"learning_rate": 9.392194497760716e-07,
|
|
"loss": 0.7005,
|
|
"num_input_tokens_seen": 2325760,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.04737212726457973,
|
|
"grad_norm": 56.7358512878418,
|
|
"learning_rate": 9.456174024312221e-07,
|
|
"loss": 0.6971,
|
|
"num_input_tokens_seen": 2341632,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.04769220920555662,
|
|
"grad_norm": 38.78962326049805,
|
|
"learning_rate": 9.520153550863723e-07,
|
|
"loss": 0.7066,
|
|
"num_input_tokens_seen": 2357504,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.04801229114653351,
|
|
"grad_norm": 42.24749755859375,
|
|
"learning_rate": 9.584133077415226e-07,
|
|
"loss": 0.7294,
|
|
"num_input_tokens_seen": 2372608,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.0483323730875104,
|
|
"grad_norm": 42.99443817138672,
|
|
"learning_rate": 9.64811260396673e-07,
|
|
"loss": 0.587,
|
|
"num_input_tokens_seen": 2388352,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.04865245502848729,
|
|
"grad_norm": 46.318416595458984,
|
|
"learning_rate": 9.712092130518234e-07,
|
|
"loss": 0.6934,
|
|
"num_input_tokens_seen": 2404480,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.04897253696946418,
|
|
"grad_norm": 32.9005126953125,
|
|
"learning_rate": 9.776071657069737e-07,
|
|
"loss": 0.518,
|
|
"num_input_tokens_seen": 2419648,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.04929261891044107,
|
|
"grad_norm": 45.313751220703125,
|
|
"learning_rate": 9.840051183621241e-07,
|
|
"loss": 0.7121,
|
|
"num_input_tokens_seen": 2435584,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.04961270085141796,
|
|
"grad_norm": 57.734039306640625,
|
|
"learning_rate": 9.904030710172743e-07,
|
|
"loss": 0.6265,
|
|
"num_input_tokens_seen": 2451072,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.04993278279239485,
|
|
"grad_norm": 60.701107025146484,
|
|
"learning_rate": 9.968010236724249e-07,
|
|
"loss": 0.764,
|
|
"num_input_tokens_seen": 2467968,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.05006081556878561,
|
|
"eval_loss": 0.6362079381942749,
|
|
"eval_runtime": 49.1703,
|
|
"eval_samples_per_second": 282.406,
|
|
"eval_steps_per_second": 35.306,
|
|
"num_input_tokens_seen": 2474432,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 0.05025286473337175,
|
|
"grad_norm": 52.689231872558594,
|
|
"learning_rate": 1.0031989763275752e-06,
|
|
"loss": 0.669,
|
|
"num_input_tokens_seen": 2484928,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.05057294667434863,
|
|
"grad_norm": 42.69588851928711,
|
|
"learning_rate": 1.0095969289827256e-06,
|
|
"loss": 0.6777,
|
|
"num_input_tokens_seen": 2501504,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.050893028615325527,
|
|
"grad_norm": 32.48566436767578,
|
|
"learning_rate": 1.0159948816378758e-06,
|
|
"loss": 0.5188,
|
|
"num_input_tokens_seen": 2518848,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.05121311055630241,
|
|
"grad_norm": 33.27299880981445,
|
|
"learning_rate": 1.0223928342930262e-06,
|
|
"loss": 0.5482,
|
|
"num_input_tokens_seen": 2535680,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.051533192497279307,
|
|
"grad_norm": 51.12800979614258,
|
|
"learning_rate": 1.0287907869481766e-06,
|
|
"loss": 0.676,
|
|
"num_input_tokens_seen": 2550976,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.05185327443825619,
|
|
"grad_norm": 39.38006591796875,
|
|
"learning_rate": 1.035188739603327e-06,
|
|
"loss": 0.5562,
|
|
"num_input_tokens_seen": 2566656,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.052173356379233086,
|
|
"grad_norm": 49.9570426940918,
|
|
"learning_rate": 1.0415866922584773e-06,
|
|
"loss": 0.6315,
|
|
"num_input_tokens_seen": 2581568,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.05249343832020997,
|
|
"grad_norm": 51.84290313720703,
|
|
"learning_rate": 1.0479846449136277e-06,
|
|
"loss": 0.6426,
|
|
"num_input_tokens_seen": 2596608,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.052813520261186866,
|
|
"grad_norm": 42.30448532104492,
|
|
"learning_rate": 1.0543825975687779e-06,
|
|
"loss": 0.6719,
|
|
"num_input_tokens_seen": 2612032,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.05313360220216375,
|
|
"grad_norm": 54.049774169921875,
|
|
"learning_rate": 1.0607805502239282e-06,
|
|
"loss": 0.7313,
|
|
"num_input_tokens_seen": 2627264,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.053453684143140646,
|
|
"grad_norm": 43.845027923583984,
|
|
"learning_rate": 1.0671785028790788e-06,
|
|
"loss": 0.548,
|
|
"num_input_tokens_seen": 2643264,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.05377376608411753,
|
|
"grad_norm": 43.62913131713867,
|
|
"learning_rate": 1.073576455534229e-06,
|
|
"loss": 0.5474,
|
|
"num_input_tokens_seen": 2659264,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.054093848025094426,
|
|
"grad_norm": 37.99971389770508,
|
|
"learning_rate": 1.0799744081893794e-06,
|
|
"loss": 0.5737,
|
|
"num_input_tokens_seen": 2673856,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.05441392996607131,
|
|
"grad_norm": 35.17848587036133,
|
|
"learning_rate": 1.0863723608445297e-06,
|
|
"loss": 0.4779,
|
|
"num_input_tokens_seen": 2688448,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.054734011907048206,
|
|
"grad_norm": 69.50128173828125,
|
|
"learning_rate": 1.09277031349968e-06,
|
|
"loss": 0.6201,
|
|
"num_input_tokens_seen": 2703872,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.05505409384802509,
|
|
"grad_norm": 49.573143005371094,
|
|
"learning_rate": 1.0991682661548305e-06,
|
|
"loss": 0.6104,
|
|
"num_input_tokens_seen": 2719040,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.055374175789001986,
|
|
"grad_norm": 35.63096618652344,
|
|
"learning_rate": 1.1055662188099809e-06,
|
|
"loss": 0.6205,
|
|
"num_input_tokens_seen": 2735168,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.05569425772997887,
|
|
"grad_norm": 38.10055160522461,
|
|
"learning_rate": 1.111964171465131e-06,
|
|
"loss": 0.5224,
|
|
"num_input_tokens_seen": 2750592,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.056014339670955766,
|
|
"grad_norm": 21.403268814086914,
|
|
"learning_rate": 1.1183621241202814e-06,
|
|
"loss": 0.6572,
|
|
"num_input_tokens_seen": 2767232,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.05633442161193265,
|
|
"grad_norm": 55.04920959472656,
|
|
"learning_rate": 1.1247600767754318e-06,
|
|
"loss": 0.665,
|
|
"num_input_tokens_seen": 2784768,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.056654503552909546,
|
|
"grad_norm": 39.130226135253906,
|
|
"learning_rate": 1.1311580294305822e-06,
|
|
"loss": 0.5809,
|
|
"num_input_tokens_seen": 2799872,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.05697458549388643,
|
|
"grad_norm": 51.871341705322266,
|
|
"learning_rate": 1.1375559820857326e-06,
|
|
"loss": 0.6481,
|
|
"num_input_tokens_seen": 2816000,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.057294667434863326,
|
|
"grad_norm": 46.604705810546875,
|
|
"learning_rate": 1.143953934740883e-06,
|
|
"loss": 0.5859,
|
|
"num_input_tokens_seen": 2831744,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 0.05761474937584021,
|
|
"grad_norm": 56.78334426879883,
|
|
"learning_rate": 1.150351887396033e-06,
|
|
"loss": 0.6183,
|
|
"num_input_tokens_seen": 2847424,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.057934831316817106,
|
|
"grad_norm": 51.35699462890625,
|
|
"learning_rate": 1.1567498400511835e-06,
|
|
"loss": 0.616,
|
|
"num_input_tokens_seen": 2862272,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 0.05825491325779399,
|
|
"grad_norm": 38.57978820800781,
|
|
"learning_rate": 1.163147792706334e-06,
|
|
"loss": 0.4927,
|
|
"num_input_tokens_seen": 2877120,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 0.058574995198770886,
|
|
"grad_norm": 41.00065612792969,
|
|
"learning_rate": 1.1695457453614842e-06,
|
|
"loss": 0.5249,
|
|
"num_input_tokens_seen": 2894592,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 0.05889507713974777,
|
|
"grad_norm": 40.363075256347656,
|
|
"learning_rate": 1.1759436980166346e-06,
|
|
"loss": 0.6159,
|
|
"num_input_tokens_seen": 2909888,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 0.059215159080724666,
|
|
"grad_norm": 49.1600456237793,
|
|
"learning_rate": 1.182341650671785e-06,
|
|
"loss": 0.6195,
|
|
"num_input_tokens_seen": 2925632,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 0.05953524102170155,
|
|
"grad_norm": 47.78977966308594,
|
|
"learning_rate": 1.1887396033269352e-06,
|
|
"loss": 0.6153,
|
|
"num_input_tokens_seen": 2941760,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 0.059855322962678446,
|
|
"grad_norm": 47.449405670166016,
|
|
"learning_rate": 1.1951375559820858e-06,
|
|
"loss": 0.7076,
|
|
"num_input_tokens_seen": 2957376,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 0.06017540490365533,
|
|
"grad_norm": 66.98524475097656,
|
|
"learning_rate": 1.2015355086372361e-06,
|
|
"loss": 0.5704,
|
|
"num_input_tokens_seen": 2972800,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 0.060495486844632226,
|
|
"grad_norm": 48.29072952270508,
|
|
"learning_rate": 1.2079334612923863e-06,
|
|
"loss": 0.7172,
|
|
"num_input_tokens_seen": 2988480,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 0.06081556878560911,
|
|
"grad_norm": 43.3856086730957,
|
|
"learning_rate": 1.2143314139475367e-06,
|
|
"loss": 0.6613,
|
|
"num_input_tokens_seen": 3004480,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 0.061135650726586006,
|
|
"grad_norm": 38.56562423706055,
|
|
"learning_rate": 1.220729366602687e-06,
|
|
"loss": 0.444,
|
|
"num_input_tokens_seen": 3020288,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 0.06145573266756289,
|
|
"grad_norm": 60.62529373168945,
|
|
"learning_rate": 1.2271273192578374e-06,
|
|
"loss": 0.6011,
|
|
"num_input_tokens_seen": 3035968,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 0.061775814608539786,
|
|
"grad_norm": 61.26271438598633,
|
|
"learning_rate": 1.2335252719129878e-06,
|
|
"loss": 0.7411,
|
|
"num_input_tokens_seen": 3051776,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 0.06209589654951667,
|
|
"grad_norm": 52.55011749267578,
|
|
"learning_rate": 1.2399232245681382e-06,
|
|
"loss": 0.5575,
|
|
"num_input_tokens_seen": 3066560,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 0.062415978490493566,
|
|
"grad_norm": 52.49790954589844,
|
|
"learning_rate": 1.2463211772232884e-06,
|
|
"loss": 0.6357,
|
|
"num_input_tokens_seen": 3082496,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 0.06273606043147045,
|
|
"grad_norm": 43.31839370727539,
|
|
"learning_rate": 1.2527191298784387e-06,
|
|
"loss": 0.6233,
|
|
"num_input_tokens_seen": 3097856,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 0.06305614237244735,
|
|
"grad_norm": 25.353742599487305,
|
|
"learning_rate": 1.2591170825335893e-06,
|
|
"loss": 0.5062,
|
|
"num_input_tokens_seen": 3113664,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 0.06337622431342424,
|
|
"grad_norm": 37.9774169921875,
|
|
"learning_rate": 1.2655150351887395e-06,
|
|
"loss": 0.6242,
|
|
"num_input_tokens_seen": 3129792,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 0.06369630625440113,
|
|
"grad_norm": 30.752185821533203,
|
|
"learning_rate": 1.2719129878438899e-06,
|
|
"loss": 0.5901,
|
|
"num_input_tokens_seen": 3145024,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 0.06401638819537801,
|
|
"grad_norm": 41.19409942626953,
|
|
"learning_rate": 1.2783109404990402e-06,
|
|
"loss": 0.7747,
|
|
"num_input_tokens_seen": 3161216,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.0643364701363549,
|
|
"grad_norm": 27.8523006439209,
|
|
"learning_rate": 1.2847088931541904e-06,
|
|
"loss": 0.4118,
|
|
"num_input_tokens_seen": 3176960,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 0.0646565520773318,
|
|
"grad_norm": 39.628929138183594,
|
|
"learning_rate": 1.291106845809341e-06,
|
|
"loss": 0.607,
|
|
"num_input_tokens_seen": 3193088,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 0.0649766340183087,
|
|
"grad_norm": 62.03862762451172,
|
|
"learning_rate": 1.2975047984644914e-06,
|
|
"loss": 0.6808,
|
|
"num_input_tokens_seen": 3210112,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 0.06529671595928557,
|
|
"grad_norm": 41.16059494018555,
|
|
"learning_rate": 1.3039027511196418e-06,
|
|
"loss": 0.5044,
|
|
"num_input_tokens_seen": 3224768,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 0.06561679790026247,
|
|
"grad_norm": 45.047080993652344,
|
|
"learning_rate": 1.310300703774792e-06,
|
|
"loss": 0.6235,
|
|
"num_input_tokens_seen": 3240128,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 0.06593687984123936,
|
|
"grad_norm": 41.879398345947266,
|
|
"learning_rate": 1.3166986564299423e-06,
|
|
"loss": 0.5605,
|
|
"num_input_tokens_seen": 3256576,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 0.06625696178221625,
|
|
"grad_norm": 34.385223388671875,
|
|
"learning_rate": 1.3230966090850929e-06,
|
|
"loss": 0.5942,
|
|
"num_input_tokens_seen": 3272384,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 0.06657704372319313,
|
|
"grad_norm": 38.94369125366211,
|
|
"learning_rate": 1.329494561740243e-06,
|
|
"loss": 0.4108,
|
|
"num_input_tokens_seen": 3288512,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 0.06689712566417003,
|
|
"grad_norm": 40.253990173339844,
|
|
"learning_rate": 1.3358925143953934e-06,
|
|
"loss": 0.4897,
|
|
"num_input_tokens_seen": 3306304,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 0.06721720760514692,
|
|
"grad_norm": 42.53627395629883,
|
|
"learning_rate": 1.3422904670505438e-06,
|
|
"loss": 0.4785,
|
|
"num_input_tokens_seen": 3321344,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 0.06753728954612381,
|
|
"grad_norm": 38.27849197387695,
|
|
"learning_rate": 1.348688419705694e-06,
|
|
"loss": 0.6127,
|
|
"num_input_tokens_seen": 3338560,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 0.06785737148710069,
|
|
"grad_norm": 26.670169830322266,
|
|
"learning_rate": 1.3550863723608446e-06,
|
|
"loss": 0.5135,
|
|
"num_input_tokens_seen": 3353152,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 0.06817745342807759,
|
|
"grad_norm": 46.529396057128906,
|
|
"learning_rate": 1.361484325015995e-06,
|
|
"loss": 0.5401,
|
|
"num_input_tokens_seen": 3369536,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 0.06849753536905448,
|
|
"grad_norm": 45.95737075805664,
|
|
"learning_rate": 1.3678822776711451e-06,
|
|
"loss": 0.6023,
|
|
"num_input_tokens_seen": 3384832,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 0.06881761731003137,
|
|
"grad_norm": 38.86219787597656,
|
|
"learning_rate": 1.3742802303262955e-06,
|
|
"loss": 0.4881,
|
|
"num_input_tokens_seen": 3399424,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 0.06913769925100825,
|
|
"grad_norm": 30.497953414916992,
|
|
"learning_rate": 1.3806781829814459e-06,
|
|
"loss": 0.6565,
|
|
"num_input_tokens_seen": 3416704,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 0.06945778119198515,
|
|
"grad_norm": 59.77437210083008,
|
|
"learning_rate": 1.3870761356365963e-06,
|
|
"loss": 0.5553,
|
|
"num_input_tokens_seen": 3431552,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 0.06977786313296204,
|
|
"grad_norm": 36.94731521606445,
|
|
"learning_rate": 1.3934740882917466e-06,
|
|
"loss": 0.6472,
|
|
"num_input_tokens_seen": 3447488,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 0.07009794507393893,
|
|
"grad_norm": 39.8687744140625,
|
|
"learning_rate": 1.399872040946897e-06,
|
|
"loss": 0.5137,
|
|
"num_input_tokens_seen": 3463424,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 0.07041802701491581,
|
|
"grad_norm": 51.21504211425781,
|
|
"learning_rate": 1.4062699936020472e-06,
|
|
"loss": 0.6527,
|
|
"num_input_tokens_seen": 3479680,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.0707381089558927,
|
|
"grad_norm": 49.46668243408203,
|
|
"learning_rate": 1.4126679462571976e-06,
|
|
"loss": 0.5117,
|
|
"num_input_tokens_seen": 3495552,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 0.0710581908968696,
|
|
"grad_norm": 56.50544357299805,
|
|
"learning_rate": 1.4190658989123481e-06,
|
|
"loss": 0.4748,
|
|
"num_input_tokens_seen": 3510976,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 0.0713782728378465,
|
|
"grad_norm": 49.386070251464844,
|
|
"learning_rate": 1.4254638515674983e-06,
|
|
"loss": 0.6499,
|
|
"num_input_tokens_seen": 3526016,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 0.07169835477882337,
|
|
"grad_norm": 22.4860782623291,
|
|
"learning_rate": 1.4318618042226487e-06,
|
|
"loss": 0.5645,
|
|
"num_input_tokens_seen": 3540544,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 0.07201843671980027,
|
|
"grad_norm": 43.12958908081055,
|
|
"learning_rate": 1.438259756877799e-06,
|
|
"loss": 0.6069,
|
|
"num_input_tokens_seen": 3556416,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 0.07233851866077716,
|
|
"grad_norm": 43.865108489990234,
|
|
"learning_rate": 1.4446577095329492e-06,
|
|
"loss": 0.5077,
|
|
"num_input_tokens_seen": 3572096,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 0.07265860060175405,
|
|
"grad_norm": 41.96502685546875,
|
|
"learning_rate": 1.4510556621880998e-06,
|
|
"loss": 0.4993,
|
|
"num_input_tokens_seen": 3587712,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 0.07297868254273093,
|
|
"grad_norm": 30.780799865722656,
|
|
"learning_rate": 1.4574536148432502e-06,
|
|
"loss": 0.5417,
|
|
"num_input_tokens_seen": 3605056,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 0.07329876448370783,
|
|
"grad_norm": 42.194156646728516,
|
|
"learning_rate": 1.4638515674984004e-06,
|
|
"loss": 0.6805,
|
|
"num_input_tokens_seen": 3621184,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 0.07361884642468472,
|
|
"grad_norm": 25.724376678466797,
|
|
"learning_rate": 1.4702495201535507e-06,
|
|
"loss": 0.5834,
|
|
"num_input_tokens_seen": 3635392,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 0.07393892836566161,
|
|
"grad_norm": 32.53746795654297,
|
|
"learning_rate": 1.4766474728087011e-06,
|
|
"loss": 0.5049,
|
|
"num_input_tokens_seen": 3649984,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 0.07425901030663849,
|
|
"grad_norm": 34.3016471862793,
|
|
"learning_rate": 1.4830454254638515e-06,
|
|
"loss": 0.5276,
|
|
"num_input_tokens_seen": 3665920,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 0.07457909224761539,
|
|
"grad_norm": 32.034515380859375,
|
|
"learning_rate": 1.4894433781190019e-06,
|
|
"loss": 0.4587,
|
|
"num_input_tokens_seen": 3680256,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 0.07489917418859228,
|
|
"grad_norm": 49.901329040527344,
|
|
"learning_rate": 1.4958413307741523e-06,
|
|
"loss": 0.5255,
|
|
"num_input_tokens_seen": 3697536,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 0.07521925612956917,
|
|
"grad_norm": 35.28968048095703,
|
|
"learning_rate": 1.5022392834293024e-06,
|
|
"loss": 0.6111,
|
|
"num_input_tokens_seen": 3713088,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 0.07553933807054607,
|
|
"grad_norm": 56.491756439208984,
|
|
"learning_rate": 1.5086372360844528e-06,
|
|
"loss": 0.6712,
|
|
"num_input_tokens_seen": 3729920,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 0.07585942001152295,
|
|
"grad_norm": 45.67325210571289,
|
|
"learning_rate": 1.5150351887396034e-06,
|
|
"loss": 0.5489,
|
|
"num_input_tokens_seen": 3745664,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 0.07617950195249984,
|
|
"grad_norm": 35.20317840576172,
|
|
"learning_rate": 1.5214331413947536e-06,
|
|
"loss": 0.5258,
|
|
"num_input_tokens_seen": 3760576,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 0.07649958389347673,
|
|
"grad_norm": 29.504152297973633,
|
|
"learning_rate": 1.527831094049904e-06,
|
|
"loss": 0.5085,
|
|
"num_input_tokens_seen": 3776576,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 0.07681966583445363,
|
|
"grad_norm": 43.33934783935547,
|
|
"learning_rate": 1.5342290467050543e-06,
|
|
"loss": 0.5857,
|
|
"num_input_tokens_seen": 3792384,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.0771397477754305,
|
|
"grad_norm": 44.849308013916016,
|
|
"learning_rate": 1.5406269993602045e-06,
|
|
"loss": 0.6438,
|
|
"num_input_tokens_seen": 3806592,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 0.0774598297164074,
|
|
"grad_norm": 52.07255935668945,
|
|
"learning_rate": 1.547024952015355e-06,
|
|
"loss": 0.5775,
|
|
"num_input_tokens_seen": 3822080,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 0.07777991165738429,
|
|
"grad_norm": 37.863677978515625,
|
|
"learning_rate": 1.5534229046705055e-06,
|
|
"loss": 0.5269,
|
|
"num_input_tokens_seen": 3837120,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 0.07809999359836119,
|
|
"grad_norm": 37.92720413208008,
|
|
"learning_rate": 1.5598208573256556e-06,
|
|
"loss": 0.6994,
|
|
"num_input_tokens_seen": 3852864,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 0.07842007553933807,
|
|
"grad_norm": 34.05339431762695,
|
|
"learning_rate": 1.566218809980806e-06,
|
|
"loss": 0.515,
|
|
"num_input_tokens_seen": 3869184,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 0.07874015748031496,
|
|
"grad_norm": 31.917217254638672,
|
|
"learning_rate": 1.5726167626359564e-06,
|
|
"loss": 0.5388,
|
|
"num_input_tokens_seen": 3885248,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 0.07906023942129185,
|
|
"grad_norm": 32.81400680541992,
|
|
"learning_rate": 1.5790147152911068e-06,
|
|
"loss": 0.4306,
|
|
"num_input_tokens_seen": 3900416,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 0.07938032136226875,
|
|
"grad_norm": 38.30088806152344,
|
|
"learning_rate": 1.5854126679462571e-06,
|
|
"loss": 0.5503,
|
|
"num_input_tokens_seen": 3916096,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 0.07970040330324563,
|
|
"grad_norm": 50.25246810913086,
|
|
"learning_rate": 1.5918106206014075e-06,
|
|
"loss": 0.6993,
|
|
"num_input_tokens_seen": 3933312,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 0.08002048524422252,
|
|
"grad_norm": 74.49282836914062,
|
|
"learning_rate": 1.5982085732565577e-06,
|
|
"loss": 0.6197,
|
|
"num_input_tokens_seen": 3949440,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 0.08034056718519941,
|
|
"grad_norm": 36.928924560546875,
|
|
"learning_rate": 1.604606525911708e-06,
|
|
"loss": 0.6799,
|
|
"num_input_tokens_seen": 3964992,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 0.0806606491261763,
|
|
"grad_norm": 56.78390884399414,
|
|
"learning_rate": 1.6110044785668586e-06,
|
|
"loss": 0.7324,
|
|
"num_input_tokens_seen": 3981696,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 0.08098073106715319,
|
|
"grad_norm": 38.05080795288086,
|
|
"learning_rate": 1.617402431222009e-06,
|
|
"loss": 0.6136,
|
|
"num_input_tokens_seen": 3997248,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 0.08130081300813008,
|
|
"grad_norm": 27.51533317565918,
|
|
"learning_rate": 1.6238003838771592e-06,
|
|
"loss": 0.6689,
|
|
"num_input_tokens_seen": 4011648,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 0.08162089494910697,
|
|
"grad_norm": 42.947906494140625,
|
|
"learning_rate": 1.6301983365323096e-06,
|
|
"loss": 0.5254,
|
|
"num_input_tokens_seen": 4028160,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 0.08194097689008387,
|
|
"grad_norm": 47.13071060180664,
|
|
"learning_rate": 1.63659628918746e-06,
|
|
"loss": 0.5398,
|
|
"num_input_tokens_seen": 4043584,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 0.08226105883106075,
|
|
"grad_norm": 47.630218505859375,
|
|
"learning_rate": 1.6429942418426103e-06,
|
|
"loss": 0.7076,
|
|
"num_input_tokens_seen": 4059456,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 0.08258114077203764,
|
|
"grad_norm": 26.62889289855957,
|
|
"learning_rate": 1.6493921944977607e-06,
|
|
"loss": 0.6103,
|
|
"num_input_tokens_seen": 4076096,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 0.08290122271301453,
|
|
"grad_norm": 41.755088806152344,
|
|
"learning_rate": 1.655790147152911e-06,
|
|
"loss": 0.6111,
|
|
"num_input_tokens_seen": 4093568,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 0.08322130465399143,
|
|
"grad_norm": 36.05648422241211,
|
|
"learning_rate": 1.6621880998080612e-06,
|
|
"loss": 0.6676,
|
|
"num_input_tokens_seen": 4108864,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.0835413865949683,
|
|
"grad_norm": 37.495201110839844,
|
|
"learning_rate": 1.6685860524632116e-06,
|
|
"loss": 0.6425,
|
|
"num_input_tokens_seen": 4124224,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 0.0838614685359452,
|
|
"grad_norm": 40.898502349853516,
|
|
"learning_rate": 1.6749840051183622e-06,
|
|
"loss": 0.5516,
|
|
"num_input_tokens_seen": 4139008,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 0.08418155047692209,
|
|
"grad_norm": 24.442567825317383,
|
|
"learning_rate": 1.6813819577735124e-06,
|
|
"loss": 0.5551,
|
|
"num_input_tokens_seen": 4155008,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 0.08450163241789899,
|
|
"grad_norm": 26.63324737548828,
|
|
"learning_rate": 1.6877799104286628e-06,
|
|
"loss": 0.4792,
|
|
"num_input_tokens_seen": 4172544,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 0.08482171435887587,
|
|
"grad_norm": 48.432395935058594,
|
|
"learning_rate": 1.6941778630838131e-06,
|
|
"loss": 0.6306,
|
|
"num_input_tokens_seen": 4188416,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 0.08514179629985276,
|
|
"grad_norm": 19.108352661132812,
|
|
"learning_rate": 1.7005758157389633e-06,
|
|
"loss": 0.5031,
|
|
"num_input_tokens_seen": 4202560,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 0.08546187824082965,
|
|
"grad_norm": 35.99553680419922,
|
|
"learning_rate": 1.706973768394114e-06,
|
|
"loss": 0.5574,
|
|
"num_input_tokens_seen": 4219392,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 0.08578196018180655,
|
|
"grad_norm": 50.857059478759766,
|
|
"learning_rate": 1.7133717210492643e-06,
|
|
"loss": 0.4844,
|
|
"num_input_tokens_seen": 4235328,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 0.08610204212278343,
|
|
"grad_norm": 52.725589752197266,
|
|
"learning_rate": 1.7197696737044144e-06,
|
|
"loss": 0.5778,
|
|
"num_input_tokens_seen": 4250368,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 0.08642212406376032,
|
|
"grad_norm": 27.934179306030273,
|
|
"learning_rate": 1.7261676263595648e-06,
|
|
"loss": 0.4549,
|
|
"num_input_tokens_seen": 4265856,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 0.08674220600473721,
|
|
"grad_norm": 41.11848068237305,
|
|
"learning_rate": 1.7325655790147152e-06,
|
|
"loss": 0.6627,
|
|
"num_input_tokens_seen": 4281792,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 0.0870622879457141,
|
|
"grad_norm": 38.61765670776367,
|
|
"learning_rate": 1.7389635316698656e-06,
|
|
"loss": 0.5873,
|
|
"num_input_tokens_seen": 4297088,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 0.087382369886691,
|
|
"grad_norm": 27.60044288635254,
|
|
"learning_rate": 1.745361484325016e-06,
|
|
"loss": 0.5028,
|
|
"num_input_tokens_seen": 4312192,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 0.08770245182766788,
|
|
"grad_norm": 27.299213409423828,
|
|
"learning_rate": 1.7517594369801663e-06,
|
|
"loss": 0.4819,
|
|
"num_input_tokens_seen": 4326720,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 0.08802253376864477,
|
|
"grad_norm": 58.7935791015625,
|
|
"learning_rate": 1.7581573896353165e-06,
|
|
"loss": 0.7894,
|
|
"num_input_tokens_seen": 4341760,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 0.08834261570962167,
|
|
"grad_norm": 36.60477828979492,
|
|
"learning_rate": 1.7645553422904669e-06,
|
|
"loss": 0.6215,
|
|
"num_input_tokens_seen": 4357760,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 0.08866269765059856,
|
|
"grad_norm": 36.011505126953125,
|
|
"learning_rate": 1.7709532949456175e-06,
|
|
"loss": 0.6267,
|
|
"num_input_tokens_seen": 4373824,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 0.08898277959157544,
|
|
"grad_norm": 32.17240524291992,
|
|
"learning_rate": 1.7773512476007676e-06,
|
|
"loss": 0.4739,
|
|
"num_input_tokens_seen": 4388992,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 0.08930286153255233,
|
|
"grad_norm": 29.726274490356445,
|
|
"learning_rate": 1.783749200255918e-06,
|
|
"loss": 0.5295,
|
|
"num_input_tokens_seen": 4404288,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 0.08962294347352923,
|
|
"grad_norm": 49.96647644042969,
|
|
"learning_rate": 1.7901471529110684e-06,
|
|
"loss": 0.5366,
|
|
"num_input_tokens_seen": 4419840,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.08994302541450612,
|
|
"grad_norm": 55.71930694580078,
|
|
"learning_rate": 1.7965451055662186e-06,
|
|
"loss": 0.5109,
|
|
"num_input_tokens_seen": 4435200,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 0.090263107355483,
|
|
"grad_norm": 54.367244720458984,
|
|
"learning_rate": 1.8029430582213691e-06,
|
|
"loss": 0.6082,
|
|
"num_input_tokens_seen": 4450368,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 0.09058318929645989,
|
|
"grad_norm": 42.54631042480469,
|
|
"learning_rate": 1.8093410108765195e-06,
|
|
"loss": 0.4889,
|
|
"num_input_tokens_seen": 4466048,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 0.09090327123743679,
|
|
"grad_norm": 56.23736572265625,
|
|
"learning_rate": 1.8157389635316697e-06,
|
|
"loss": 0.5985,
|
|
"num_input_tokens_seen": 4481920,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 0.09122335317841368,
|
|
"grad_norm": 34.284244537353516,
|
|
"learning_rate": 1.82213691618682e-06,
|
|
"loss": 0.5671,
|
|
"num_input_tokens_seen": 4498112,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 0.09154343511939056,
|
|
"grad_norm": 32.31144714355469,
|
|
"learning_rate": 1.8285348688419704e-06,
|
|
"loss": 0.4306,
|
|
"num_input_tokens_seen": 4515648,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 0.09186351706036745,
|
|
"grad_norm": 34.67725372314453,
|
|
"learning_rate": 1.8349328214971208e-06,
|
|
"loss": 0.5719,
|
|
"num_input_tokens_seen": 4531840,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 0.09218359900134435,
|
|
"grad_norm": 48.15701675415039,
|
|
"learning_rate": 1.8413307741522712e-06,
|
|
"loss": 0.5478,
|
|
"num_input_tokens_seen": 4547456,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 0.09250368094232124,
|
|
"grad_norm": 40.82353210449219,
|
|
"learning_rate": 1.8477287268074216e-06,
|
|
"loss": 0.557,
|
|
"num_input_tokens_seen": 4563328,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 0.09282376288329812,
|
|
"grad_norm": 28.479816436767578,
|
|
"learning_rate": 1.8541266794625718e-06,
|
|
"loss": 0.5856,
|
|
"num_input_tokens_seen": 4579392,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 0.09314384482427501,
|
|
"grad_norm": 80.24234008789062,
|
|
"learning_rate": 1.8605246321177221e-06,
|
|
"loss": 0.6149,
|
|
"num_input_tokens_seen": 4595584,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 0.0934639267652519,
|
|
"grad_norm": 33.8602294921875,
|
|
"learning_rate": 1.8669225847728727e-06,
|
|
"loss": 0.5711,
|
|
"num_input_tokens_seen": 4610112,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 0.0937840087062288,
|
|
"grad_norm": 53.692935943603516,
|
|
"learning_rate": 1.8733205374280229e-06,
|
|
"loss": 0.6948,
|
|
"num_input_tokens_seen": 4626432,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 0.09410409064720568,
|
|
"grad_norm": 38.61556625366211,
|
|
"learning_rate": 1.8797184900831733e-06,
|
|
"loss": 0.5771,
|
|
"num_input_tokens_seen": 4641792,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 0.09442417258818257,
|
|
"grad_norm": 18.766170501708984,
|
|
"learning_rate": 1.8861164427383236e-06,
|
|
"loss": 0.4046,
|
|
"num_input_tokens_seen": 4656896,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 0.09474425452915947,
|
|
"grad_norm": 47.406803131103516,
|
|
"learning_rate": 1.8925143953934738e-06,
|
|
"loss": 0.605,
|
|
"num_input_tokens_seen": 4673472,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 0.09506433647013636,
|
|
"grad_norm": 26.30023765563965,
|
|
"learning_rate": 1.8989123480486244e-06,
|
|
"loss": 0.426,
|
|
"num_input_tokens_seen": 4688896,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 0.09538441841111324,
|
|
"grad_norm": 43.65274429321289,
|
|
"learning_rate": 1.9053103007037748e-06,
|
|
"loss": 0.6785,
|
|
"num_input_tokens_seen": 4704576,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 0.09570450035209013,
|
|
"grad_norm": 38.707481384277344,
|
|
"learning_rate": 1.911708253358925e-06,
|
|
"loss": 0.6069,
|
|
"num_input_tokens_seen": 4719040,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 0.09602458229306703,
|
|
"grad_norm": 30.288116455078125,
|
|
"learning_rate": 1.9181062060140753e-06,
|
|
"loss": 0.4831,
|
|
"num_input_tokens_seen": 4733696,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.09634466423404392,
|
|
"grad_norm": 39.540462493896484,
|
|
"learning_rate": 1.9245041586692255e-06,
|
|
"loss": 0.6045,
|
|
"num_input_tokens_seen": 4748992,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 0.0966647461750208,
|
|
"grad_norm": 50.21097183227539,
|
|
"learning_rate": 1.930902111324376e-06,
|
|
"loss": 0.6876,
|
|
"num_input_tokens_seen": 4764992,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 0.09698482811599769,
|
|
"grad_norm": 45.92460632324219,
|
|
"learning_rate": 1.9373000639795267e-06,
|
|
"loss": 0.6773,
|
|
"num_input_tokens_seen": 4780352,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 0.09730491005697459,
|
|
"grad_norm": 31.217529296875,
|
|
"learning_rate": 1.943698016634677e-06,
|
|
"loss": 0.5393,
|
|
"num_input_tokens_seen": 4796224,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 0.09762499199795148,
|
|
"grad_norm": 30.454994201660156,
|
|
"learning_rate": 1.950095969289827e-06,
|
|
"loss": 0.5401,
|
|
"num_input_tokens_seen": 4811840,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 0.09794507393892836,
|
|
"grad_norm": 26.992660522460938,
|
|
"learning_rate": 1.9564939219449776e-06,
|
|
"loss": 0.5811,
|
|
"num_input_tokens_seen": 4826432,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 0.09826515587990525,
|
|
"grad_norm": 28.93795394897461,
|
|
"learning_rate": 1.9628918746001278e-06,
|
|
"loss": 0.393,
|
|
"num_input_tokens_seen": 4841920,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 0.09858523782088215,
|
|
"grad_norm": 30.038558959960938,
|
|
"learning_rate": 1.9692898272552783e-06,
|
|
"loss": 0.5971,
|
|
"num_input_tokens_seen": 4857536,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 0.09890531976185904,
|
|
"grad_norm": 56.7470588684082,
|
|
"learning_rate": 1.9756877799104285e-06,
|
|
"loss": 0.6844,
|
|
"num_input_tokens_seen": 4873408,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 0.09922540170283592,
|
|
"grad_norm": 43.21520233154297,
|
|
"learning_rate": 1.9820857325655787e-06,
|
|
"loss": 0.5973,
|
|
"num_input_tokens_seen": 4889536,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 0.09954548364381281,
|
|
"grad_norm": 59.145320892333984,
|
|
"learning_rate": 1.9884836852207293e-06,
|
|
"loss": 0.627,
|
|
"num_input_tokens_seen": 4904448,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 0.0998655655847897,
|
|
"grad_norm": 25.44906997680664,
|
|
"learning_rate": 1.99488163787588e-06,
|
|
"loss": 0.6569,
|
|
"num_input_tokens_seen": 4919616,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 0.10012163113757122,
|
|
"eval_loss": 0.5394634008407593,
|
|
"eval_runtime": 49.1959,
|
|
"eval_samples_per_second": 282.259,
|
|
"eval_steps_per_second": 35.288,
|
|
"num_input_tokens_seen": 4931328,
|
|
"step": 1564
|
|
},
|
|
{
|
|
"epoch": 0.1001856475257666,
|
|
"grad_norm": 30.809673309326172,
|
|
"learning_rate": 1.9999999750297625e-06,
|
|
"loss": 0.516,
|
|
"num_input_tokens_seen": 4934144,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 0.1005057294667435,
|
|
"grad_norm": 31.836828231811523,
|
|
"learning_rate": 1.9999991010715873e-06,
|
|
"loss": 0.523,
|
|
"num_input_tokens_seen": 4950272,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 0.10082581140772037,
|
|
"grad_norm": 43.53628921508789,
|
|
"learning_rate": 1.999996978602793e-06,
|
|
"loss": 0.5346,
|
|
"num_input_tokens_seen": 4965056,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 0.10114589334869727,
|
|
"grad_norm": 33.710304260253906,
|
|
"learning_rate": 1.99999360762603e-06,
|
|
"loss": 0.5489,
|
|
"num_input_tokens_seen": 4980160,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 0.10146597528967416,
|
|
"grad_norm": 36.39333724975586,
|
|
"learning_rate": 1.9999889881455065e-06,
|
|
"loss": 0.453,
|
|
"num_input_tokens_seen": 4996992,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 0.10178605723065105,
|
|
"grad_norm": 35.11768341064453,
|
|
"learning_rate": 1.9999831201669897e-06,
|
|
"loss": 0.5146,
|
|
"num_input_tokens_seen": 5012608,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 0.10210613917162793,
|
|
"grad_norm": 38.27321243286133,
|
|
"learning_rate": 1.9999760036978067e-06,
|
|
"loss": 0.4848,
|
|
"num_input_tokens_seen": 5027840,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 0.10242622111260483,
|
|
"grad_norm": 33.53286361694336,
|
|
"learning_rate": 1.9999676387468417e-06,
|
|
"loss": 0.5746,
|
|
"num_input_tokens_seen": 5042752,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.10274630305358172,
|
|
"grad_norm": 26.00925636291504,
|
|
"learning_rate": 1.999958025324539e-06,
|
|
"loss": 0.5487,
|
|
"num_input_tokens_seen": 5058624,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 0.10306638499455861,
|
|
"grad_norm": 36.0686149597168,
|
|
"learning_rate": 1.999947163442901e-06,
|
|
"loss": 0.6233,
|
|
"num_input_tokens_seen": 5075008,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 0.10338646693553549,
|
|
"grad_norm": 48.770294189453125,
|
|
"learning_rate": 1.9999350531154884e-06,
|
|
"loss": 0.5332,
|
|
"num_input_tokens_seen": 5090880,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 0.10370654887651239,
|
|
"grad_norm": 51.00628662109375,
|
|
"learning_rate": 1.9999216943574223e-06,
|
|
"loss": 0.5713,
|
|
"num_input_tokens_seen": 5106816,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 0.10402663081748928,
|
|
"grad_norm": 22.478723526000977,
|
|
"learning_rate": 1.9999070871853796e-06,
|
|
"loss": 0.4563,
|
|
"num_input_tokens_seen": 5123904,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 0.10434671275846617,
|
|
"grad_norm": 40.58604049682617,
|
|
"learning_rate": 1.9998912316175986e-06,
|
|
"loss": 0.4954,
|
|
"num_input_tokens_seen": 5140160,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 0.10466679469944305,
|
|
"grad_norm": 46.91875457763672,
|
|
"learning_rate": 1.9998741276738752e-06,
|
|
"loss": 0.5159,
|
|
"num_input_tokens_seen": 5156288,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 0.10498687664041995,
|
|
"grad_norm": 38.68816375732422,
|
|
"learning_rate": 1.999855775375563e-06,
|
|
"loss": 0.5823,
|
|
"num_input_tokens_seen": 5171776,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 0.10530695858139684,
|
|
"grad_norm": 40.31874465942383,
|
|
"learning_rate": 1.999836174745576e-06,
|
|
"loss": 0.683,
|
|
"num_input_tokens_seen": 5189504,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 0.10562704052237373,
|
|
"grad_norm": 45.76553726196289,
|
|
"learning_rate": 1.9998153258083853e-06,
|
|
"loss": 0.5783,
|
|
"num_input_tokens_seen": 5205056,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 0.10594712246335061,
|
|
"grad_norm": 37.635047912597656,
|
|
"learning_rate": 1.9997932285900214e-06,
|
|
"loss": 0.586,
|
|
"num_input_tokens_seen": 5222656,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 0.1062672044043275,
|
|
"grad_norm": 53.1572380065918,
|
|
"learning_rate": 1.9997698831180726e-06,
|
|
"loss": 0.6272,
|
|
"num_input_tokens_seen": 5238848,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 0.1065872863453044,
|
|
"grad_norm": 33.89291000366211,
|
|
"learning_rate": 1.999745289421686e-06,
|
|
"loss": 0.5203,
|
|
"num_input_tokens_seen": 5255296,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 0.10690736828628129,
|
|
"grad_norm": 58.435916900634766,
|
|
"learning_rate": 1.9997194475315674e-06,
|
|
"loss": 0.7716,
|
|
"num_input_tokens_seen": 5270336,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 0.10722745022725817,
|
|
"grad_norm": 36.215858459472656,
|
|
"learning_rate": 1.9996923574799808e-06,
|
|
"loss": 0.4842,
|
|
"num_input_tokens_seen": 5286720,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 0.10754753216823507,
|
|
"grad_norm": 32.55356979370117,
|
|
"learning_rate": 1.9996640193007476e-06,
|
|
"loss": 0.6428,
|
|
"num_input_tokens_seen": 5301632,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 0.10786761410921196,
|
|
"grad_norm": 49.88198471069336,
|
|
"learning_rate": 1.9996344330292495e-06,
|
|
"loss": 0.403,
|
|
"num_input_tokens_seen": 5316544,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 0.10818769605018885,
|
|
"grad_norm": 34.35507583618164,
|
|
"learning_rate": 1.9996035987024245e-06,
|
|
"loss": 0.5503,
|
|
"num_input_tokens_seen": 5332544,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 0.10850777799116573,
|
|
"grad_norm": 35.31010437011719,
|
|
"learning_rate": 1.99957151635877e-06,
|
|
"loss": 0.5388,
|
|
"num_input_tokens_seen": 5348096,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 0.10882785993214263,
|
|
"grad_norm": 50.9265022277832,
|
|
"learning_rate": 1.999538186038341e-06,
|
|
"loss": 0.6275,
|
|
"num_input_tokens_seen": 5362368,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.10914794187311952,
|
|
"grad_norm": 34.14656066894531,
|
|
"learning_rate": 1.999503607782751e-06,
|
|
"loss": 0.5426,
|
|
"num_input_tokens_seen": 5378176,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 0.10946802381409641,
|
|
"grad_norm": 42.861480712890625,
|
|
"learning_rate": 1.999467781635171e-06,
|
|
"loss": 0.5163,
|
|
"num_input_tokens_seen": 5394752,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 0.10978810575507329,
|
|
"grad_norm": 31.575403213500977,
|
|
"learning_rate": 1.9994307076403306e-06,
|
|
"loss": 0.6991,
|
|
"num_input_tokens_seen": 5412160,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 0.11010818769605019,
|
|
"grad_norm": 35.84833908081055,
|
|
"learning_rate": 1.999392385844517e-06,
|
|
"loss": 0.5245,
|
|
"num_input_tokens_seen": 5427840,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 0.11042826963702708,
|
|
"grad_norm": 36.32638931274414,
|
|
"learning_rate": 1.9993528162955753e-06,
|
|
"loss": 0.4035,
|
|
"num_input_tokens_seen": 5444224,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 0.11074835157800397,
|
|
"grad_norm": 60.70829391479492,
|
|
"learning_rate": 1.9993119990429095e-06,
|
|
"loss": 0.5767,
|
|
"num_input_tokens_seen": 5459648,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 0.11106843351898085,
|
|
"grad_norm": 47.9375114440918,
|
|
"learning_rate": 1.9992699341374794e-06,
|
|
"loss": 0.7821,
|
|
"num_input_tokens_seen": 5475008,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 0.11138851545995775,
|
|
"grad_norm": 32.798091888427734,
|
|
"learning_rate": 1.9992266216318033e-06,
|
|
"loss": 0.5285,
|
|
"num_input_tokens_seen": 5491456,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 0.11170859740093464,
|
|
"grad_norm": 35.342803955078125,
|
|
"learning_rate": 1.9991820615799583e-06,
|
|
"loss": 0.5674,
|
|
"num_input_tokens_seen": 5507520,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 0.11202867934191153,
|
|
"grad_norm": 49.72675704956055,
|
|
"learning_rate": 1.999136254037578e-06,
|
|
"loss": 0.6917,
|
|
"num_input_tokens_seen": 5523072,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 0.11234876128288843,
|
|
"grad_norm": 37.71804428100586,
|
|
"learning_rate": 1.999089199061853e-06,
|
|
"loss": 0.5094,
|
|
"num_input_tokens_seen": 5538304,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 0.1126688432238653,
|
|
"grad_norm": 35.397056579589844,
|
|
"learning_rate": 1.9990408967115326e-06,
|
|
"loss": 0.4612,
|
|
"num_input_tokens_seen": 5553920,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 0.1129889251648422,
|
|
"grad_norm": 26.074499130249023,
|
|
"learning_rate": 1.998991347046922e-06,
|
|
"loss": 0.4599,
|
|
"num_input_tokens_seen": 5569344,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 0.11330900710581909,
|
|
"grad_norm": 33.73558044433594,
|
|
"learning_rate": 1.9989405501298857e-06,
|
|
"loss": 0.5104,
|
|
"num_input_tokens_seen": 5585856,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 0.11362908904679599,
|
|
"grad_norm": 50.979820251464844,
|
|
"learning_rate": 1.9988885060238436e-06,
|
|
"loss": 0.5755,
|
|
"num_input_tokens_seen": 5603840,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 0.11394917098777287,
|
|
"grad_norm": 25.762378692626953,
|
|
"learning_rate": 1.9988352147937735e-06,
|
|
"loss": 0.5167,
|
|
"num_input_tokens_seen": 5620352,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 0.11426925292874976,
|
|
"grad_norm": 44.76283645629883,
|
|
"learning_rate": 1.99878067650621e-06,
|
|
"loss": 0.5382,
|
|
"num_input_tokens_seen": 5636544,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 0.11458933486972665,
|
|
"grad_norm": 38.91508102416992,
|
|
"learning_rate": 1.998724891229245e-06,
|
|
"loss": 0.5438,
|
|
"num_input_tokens_seen": 5652672,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 0.11490941681070355,
|
|
"grad_norm": 42.947147369384766,
|
|
"learning_rate": 1.998667859032527e-06,
|
|
"loss": 0.4956,
|
|
"num_input_tokens_seen": 5668224,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 0.11522949875168043,
|
|
"grad_norm": 21.492956161499023,
|
|
"learning_rate": 1.9986095799872613e-06,
|
|
"loss": 0.4506,
|
|
"num_input_tokens_seen": 5684480,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.11554958069265732,
|
|
"grad_norm": 65.54640197753906,
|
|
"learning_rate": 1.99855005416621e-06,
|
|
"loss": 0.472,
|
|
"num_input_tokens_seen": 5700864,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 0.11586966263363421,
|
|
"grad_norm": 40.5084114074707,
|
|
"learning_rate": 1.998489281643692e-06,
|
|
"loss": 0.5965,
|
|
"num_input_tokens_seen": 5716224,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 0.1161897445746111,
|
|
"grad_norm": 29.857545852661133,
|
|
"learning_rate": 1.998427262495582e-06,
|
|
"loss": 0.4977,
|
|
"num_input_tokens_seen": 5733056,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 0.11650982651558799,
|
|
"grad_norm": 36.56293487548828,
|
|
"learning_rate": 1.9983639967993124e-06,
|
|
"loss": 0.6683,
|
|
"num_input_tokens_seen": 5749120,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 0.11682990845656488,
|
|
"grad_norm": 30.559627532958984,
|
|
"learning_rate": 1.99829948463387e-06,
|
|
"loss": 0.7297,
|
|
"num_input_tokens_seen": 5763968,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 0.11714999039754177,
|
|
"grad_norm": 31.007530212402344,
|
|
"learning_rate": 1.9982337260798e-06,
|
|
"loss": 0.543,
|
|
"num_input_tokens_seen": 5779520,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 0.11747007233851867,
|
|
"grad_norm": 36.148040771484375,
|
|
"learning_rate": 1.998166721219203e-06,
|
|
"loss": 0.5856,
|
|
"num_input_tokens_seen": 5798848,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 0.11779015427949555,
|
|
"grad_norm": 31.41288185119629,
|
|
"learning_rate": 1.9980984701357338e-06,
|
|
"loss": 0.5155,
|
|
"num_input_tokens_seen": 5813952,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 0.11811023622047244,
|
|
"grad_norm": 36.17179489135742,
|
|
"learning_rate": 1.998028972914606e-06,
|
|
"loss": 0.4362,
|
|
"num_input_tokens_seen": 5830016,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 0.11843031816144933,
|
|
"grad_norm": 38.044334411621094,
|
|
"learning_rate": 1.9979582296425877e-06,
|
|
"loss": 0.5893,
|
|
"num_input_tokens_seen": 5845312,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 0.11875040010242623,
|
|
"grad_norm": 22.015993118286133,
|
|
"learning_rate": 1.9978862404080022e-06,
|
|
"loss": 0.5851,
|
|
"num_input_tokens_seen": 5860672,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 0.1190704820434031,
|
|
"grad_norm": 34.7830696105957,
|
|
"learning_rate": 1.9978130053007295e-06,
|
|
"loss": 0.5376,
|
|
"num_input_tokens_seen": 5875776,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 0.11939056398438,
|
|
"grad_norm": 37.767024993896484,
|
|
"learning_rate": 1.9977385244122034e-06,
|
|
"loss": 0.4319,
|
|
"num_input_tokens_seen": 5891200,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 0.11971064592535689,
|
|
"grad_norm": 35.13771438598633,
|
|
"learning_rate": 1.997662797835415e-06,
|
|
"loss": 0.4821,
|
|
"num_input_tokens_seen": 5907008,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 0.12003072786633379,
|
|
"grad_norm": 34.38051986694336,
|
|
"learning_rate": 1.9975858256649097e-06,
|
|
"loss": 0.4645,
|
|
"num_input_tokens_seen": 5923264,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 0.12035080980731067,
|
|
"grad_norm": 52.10721206665039,
|
|
"learning_rate": 1.997507607996788e-06,
|
|
"loss": 0.4911,
|
|
"num_input_tokens_seen": 5939648,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 0.12067089174828756,
|
|
"grad_norm": 25.434171676635742,
|
|
"learning_rate": 1.997428144928706e-06,
|
|
"loss": 0.4557,
|
|
"num_input_tokens_seen": 5955520,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 0.12099097368926445,
|
|
"grad_norm": 41.22515106201172,
|
|
"learning_rate": 1.9973474365598736e-06,
|
|
"loss": 0.5237,
|
|
"num_input_tokens_seen": 5971072,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 0.12131105563024135,
|
|
"grad_norm": 40.53886413574219,
|
|
"learning_rate": 1.9972654829910568e-06,
|
|
"loss": 0.5787,
|
|
"num_input_tokens_seen": 5987264,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 0.12163113757121823,
|
|
"grad_norm": 53.25776672363281,
|
|
"learning_rate": 1.9971822843245748e-06,
|
|
"loss": 0.6193,
|
|
"num_input_tokens_seen": 6002880,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.12195121951219512,
|
|
"grad_norm": 40.23493957519531,
|
|
"learning_rate": 1.997097840664303e-06,
|
|
"loss": 0.5277,
|
|
"num_input_tokens_seen": 6019520,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 0.12227130145317201,
|
|
"grad_norm": 64.40421295166016,
|
|
"learning_rate": 1.99701215211567e-06,
|
|
"loss": 0.5641,
|
|
"num_input_tokens_seen": 6035904,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 0.1225913833941489,
|
|
"grad_norm": 28.99512481689453,
|
|
"learning_rate": 1.9969252187856587e-06,
|
|
"loss": 0.6009,
|
|
"num_input_tokens_seen": 6050816,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 0.12291146533512579,
|
|
"grad_norm": 26.199125289916992,
|
|
"learning_rate": 1.9968370407828065e-06,
|
|
"loss": 0.4204,
|
|
"num_input_tokens_seen": 6065920,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 0.12323154727610268,
|
|
"grad_norm": 25.35918426513672,
|
|
"learning_rate": 1.996747618217205e-06,
|
|
"loss": 0.5962,
|
|
"num_input_tokens_seen": 6081728,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 0.12355162921707957,
|
|
"grad_norm": 27.474023818969727,
|
|
"learning_rate": 1.9966569512004987e-06,
|
|
"loss": 0.4945,
|
|
"num_input_tokens_seen": 6097472,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 0.12387171115805647,
|
|
"grad_norm": 32.64793395996094,
|
|
"learning_rate": 1.996565039845887e-06,
|
|
"loss": 0.5101,
|
|
"num_input_tokens_seen": 6113152,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 0.12419179309903335,
|
|
"grad_norm": 47.92166519165039,
|
|
"learning_rate": 1.996471884268122e-06,
|
|
"loss": 0.614,
|
|
"num_input_tokens_seen": 6129408,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 0.12451187504001024,
|
|
"grad_norm": 20.421428680419922,
|
|
"learning_rate": 1.9963774845835097e-06,
|
|
"loss": 0.545,
|
|
"num_input_tokens_seen": 6144896,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 0.12483195698098713,
|
|
"grad_norm": 50.85639572143555,
|
|
"learning_rate": 1.996281840909909e-06,
|
|
"loss": 0.5868,
|
|
"num_input_tokens_seen": 6160256,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 0.12515203892196403,
|
|
"grad_norm": 36.815921783447266,
|
|
"learning_rate": 1.9961849533667322e-06,
|
|
"loss": 0.6354,
|
|
"num_input_tokens_seen": 6175104,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 0.1254721208629409,
|
|
"grad_norm": 30.276325225830078,
|
|
"learning_rate": 1.9960868220749447e-06,
|
|
"loss": 0.5185,
|
|
"num_input_tokens_seen": 6190272,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 0.1257922028039178,
|
|
"grad_norm": 39.995033264160156,
|
|
"learning_rate": 1.9959874471570644e-06,
|
|
"loss": 0.5855,
|
|
"num_input_tokens_seen": 6205952,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 0.1261122847448947,
|
|
"grad_norm": 44.66065216064453,
|
|
"learning_rate": 1.9958868287371625e-06,
|
|
"loss": 0.56,
|
|
"num_input_tokens_seen": 6222592,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 0.12643236668587157,
|
|
"grad_norm": 37.10478591918945,
|
|
"learning_rate": 1.9957849669408617e-06,
|
|
"loss": 0.4803,
|
|
"num_input_tokens_seen": 6237696,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 0.12675244862684848,
|
|
"grad_norm": 53.23179244995117,
|
|
"learning_rate": 1.995681861895338e-06,
|
|
"loss": 0.4858,
|
|
"num_input_tokens_seen": 6254080,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 0.12707253056782536,
|
|
"grad_norm": 27.040605545043945,
|
|
"learning_rate": 1.9955775137293187e-06,
|
|
"loss": 0.5741,
|
|
"num_input_tokens_seen": 6270016,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 0.12739261250880227,
|
|
"grad_norm": 50.933433532714844,
|
|
"learning_rate": 1.9954719225730845e-06,
|
|
"loss": 0.6124,
|
|
"num_input_tokens_seen": 6285184,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 0.12771269444977915,
|
|
"grad_norm": 47.980018615722656,
|
|
"learning_rate": 1.9953650885584666e-06,
|
|
"loss": 0.4774,
|
|
"num_input_tokens_seen": 6300992,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 0.12803277639075603,
|
|
"grad_norm": 15.962865829467773,
|
|
"learning_rate": 1.995257011818849e-06,
|
|
"loss": 0.5445,
|
|
"num_input_tokens_seen": 6315392,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.12835285833173293,
|
|
"grad_norm": 38.00539779663086,
|
|
"learning_rate": 1.9951476924891666e-06,
|
|
"loss": 0.4739,
|
|
"num_input_tokens_seen": 6331136,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 0.1286729402727098,
|
|
"grad_norm": 33.02660369873047,
|
|
"learning_rate": 1.9950371307059056e-06,
|
|
"loss": 0.5553,
|
|
"num_input_tokens_seen": 6347584,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 0.1289930222136867,
|
|
"grad_norm": 55.436187744140625,
|
|
"learning_rate": 1.9949253266071036e-06,
|
|
"loss": 0.5728,
|
|
"num_input_tokens_seen": 6362560,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 0.1293131041546636,
|
|
"grad_norm": 28.438800811767578,
|
|
"learning_rate": 1.9948122803323503e-06,
|
|
"loss": 0.5075,
|
|
"num_input_tokens_seen": 6378304,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 0.12963318609564048,
|
|
"grad_norm": 39.819091796875,
|
|
"learning_rate": 1.9946979920227844e-06,
|
|
"loss": 0.5147,
|
|
"num_input_tokens_seen": 6393280,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 0.1299532680366174,
|
|
"grad_norm": 61.551517486572266,
|
|
"learning_rate": 1.994582461821096e-06,
|
|
"loss": 0.5251,
|
|
"num_input_tokens_seen": 6409472,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 0.13027334997759427,
|
|
"grad_norm": 71.56808471679688,
|
|
"learning_rate": 1.9944656898715267e-06,
|
|
"loss": 0.7157,
|
|
"num_input_tokens_seen": 6424960,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 0.13059343191857115,
|
|
"grad_norm": 31.08255386352539,
|
|
"learning_rate": 1.994347676319867e-06,
|
|
"loss": 0.6057,
|
|
"num_input_tokens_seen": 6440000,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 0.13091351385954805,
|
|
"grad_norm": 24.44256019592285,
|
|
"learning_rate": 1.994228421313459e-06,
|
|
"loss": 0.453,
|
|
"num_input_tokens_seen": 6457600,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 0.13123359580052493,
|
|
"grad_norm": 39.8853759765625,
|
|
"learning_rate": 1.994107925001193e-06,
|
|
"loss": 0.5143,
|
|
"num_input_tokens_seen": 6473088,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 0.1315536777415018,
|
|
"grad_norm": 51.965187072753906,
|
|
"learning_rate": 1.9939861875335108e-06,
|
|
"loss": 0.6013,
|
|
"num_input_tokens_seen": 6487680,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 0.13187375968247872,
|
|
"grad_norm": 35.12892532348633,
|
|
"learning_rate": 1.9938632090624025e-06,
|
|
"loss": 0.4831,
|
|
"num_input_tokens_seen": 6503296,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 0.1321938416234556,
|
|
"grad_norm": 14.63175106048584,
|
|
"learning_rate": 1.9937389897414087e-06,
|
|
"loss": 0.5363,
|
|
"num_input_tokens_seen": 6518912,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 0.1325139235644325,
|
|
"grad_norm": 43.0014533996582,
|
|
"learning_rate": 1.993613529725618e-06,
|
|
"loss": 0.5631,
|
|
"num_input_tokens_seen": 6534784,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 0.13283400550540939,
|
|
"grad_norm": 51.7596435546875,
|
|
"learning_rate": 1.99348682917167e-06,
|
|
"loss": 0.5248,
|
|
"num_input_tokens_seen": 6550528,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 0.13315408744638627,
|
|
"grad_norm": 34.12824630737305,
|
|
"learning_rate": 1.99335888823775e-06,
|
|
"loss": 0.5344,
|
|
"num_input_tokens_seen": 6566144,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 0.13347416938736317,
|
|
"grad_norm": 51.184452056884766,
|
|
"learning_rate": 1.993229707083595e-06,
|
|
"loss": 0.5605,
|
|
"num_input_tokens_seen": 6583872,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 0.13379425132834005,
|
|
"grad_norm": 19.79715919494629,
|
|
"learning_rate": 1.993099285870489e-06,
|
|
"loss": 0.4144,
|
|
"num_input_tokens_seen": 6602304,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 0.13411433326931693,
|
|
"grad_norm": 41.58517074584961,
|
|
"learning_rate": 1.992967624761264e-06,
|
|
"loss": 0.4607,
|
|
"num_input_tokens_seen": 6618112,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 0.13443441521029384,
|
|
"grad_norm": 47.04132080078125,
|
|
"learning_rate": 1.9928347239203014e-06,
|
|
"loss": 0.6174,
|
|
"num_input_tokens_seen": 6635584,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.13475449715127072,
|
|
"grad_norm": 34.34235763549805,
|
|
"learning_rate": 1.9927005835135282e-06,
|
|
"loss": 0.5339,
|
|
"num_input_tokens_seen": 6653568,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 0.13507457909224763,
|
|
"grad_norm": 30.708681106567383,
|
|
"learning_rate": 1.9925652037084214e-06,
|
|
"loss": 0.4604,
|
|
"num_input_tokens_seen": 6668864,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 0.1353946610332245,
|
|
"grad_norm": 25.8023738861084,
|
|
"learning_rate": 1.9924285846740037e-06,
|
|
"loss": 0.4852,
|
|
"num_input_tokens_seen": 6684416,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 0.13571474297420139,
|
|
"grad_norm": 49.56015396118164,
|
|
"learning_rate": 1.9922907265808452e-06,
|
|
"loss": 0.5927,
|
|
"num_input_tokens_seen": 6699392,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 0.1360348249151783,
|
|
"grad_norm": 43.24879455566406,
|
|
"learning_rate": 1.9921516296010643e-06,
|
|
"loss": 0.5477,
|
|
"num_input_tokens_seen": 6714560,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 0.13635490685615517,
|
|
"grad_norm": 50.094120025634766,
|
|
"learning_rate": 1.9920112939083246e-06,
|
|
"loss": 0.5584,
|
|
"num_input_tokens_seen": 6729920,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 0.13667498879713205,
|
|
"grad_norm": 27.34825897216797,
|
|
"learning_rate": 1.9918697196778367e-06,
|
|
"loss": 0.5555,
|
|
"num_input_tokens_seen": 6744768,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 0.13699507073810896,
|
|
"grad_norm": 26.479101181030273,
|
|
"learning_rate": 1.9917269070863578e-06,
|
|
"loss": 0.4607,
|
|
"num_input_tokens_seen": 6759680,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 0.13731515267908584,
|
|
"grad_norm": 35.83186340332031,
|
|
"learning_rate": 1.9915828563121915e-06,
|
|
"loss": 0.5094,
|
|
"num_input_tokens_seen": 6775168,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 0.13763523462006275,
|
|
"grad_norm": 43.06388473510742,
|
|
"learning_rate": 1.9914375675351865e-06,
|
|
"loss": 0.5364,
|
|
"num_input_tokens_seen": 6791296,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 0.13795531656103963,
|
|
"grad_norm": 18.07638168334961,
|
|
"learning_rate": 1.991291040936738e-06,
|
|
"loss": 0.43,
|
|
"num_input_tokens_seen": 6808640,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 0.1382753985020165,
|
|
"grad_norm": 41.67695236206055,
|
|
"learning_rate": 1.9911432766997857e-06,
|
|
"loss": 0.6627,
|
|
"num_input_tokens_seen": 6824064,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 0.1385954804429934,
|
|
"grad_norm": 56.66114807128906,
|
|
"learning_rate": 1.990994275008815e-06,
|
|
"loss": 0.4426,
|
|
"num_input_tokens_seen": 6839872,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 0.1389155623839703,
|
|
"grad_norm": 58.23060989379883,
|
|
"learning_rate": 1.9908440360498565e-06,
|
|
"loss": 0.5081,
|
|
"num_input_tokens_seen": 6855744,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 0.1392356443249472,
|
|
"grad_norm": 45.5991096496582,
|
|
"learning_rate": 1.990692560010485e-06,
|
|
"loss": 0.5566,
|
|
"num_input_tokens_seen": 6869632,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 0.13955572626592408,
|
|
"grad_norm": 27.95288848876953,
|
|
"learning_rate": 1.9905398470798206e-06,
|
|
"loss": 0.448,
|
|
"num_input_tokens_seen": 6885696,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 0.13987580820690096,
|
|
"grad_norm": 29.043428421020508,
|
|
"learning_rate": 1.990385897448527e-06,
|
|
"loss": 0.3634,
|
|
"num_input_tokens_seen": 6901504,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 0.14019589014787787,
|
|
"grad_norm": 36.92293167114258,
|
|
"learning_rate": 1.9902307113088114e-06,
|
|
"loss": 0.5822,
|
|
"num_input_tokens_seen": 6916480,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 0.14051597208885475,
|
|
"grad_norm": 49.32163619995117,
|
|
"learning_rate": 1.9900742888544264e-06,
|
|
"loss": 0.4818,
|
|
"num_input_tokens_seen": 6932416,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 0.14083605402983163,
|
|
"grad_norm": 46.43427658081055,
|
|
"learning_rate": 1.989916630280667e-06,
|
|
"loss": 0.534,
|
|
"num_input_tokens_seen": 6948992,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.14115613597080853,
|
|
"grad_norm": 57.13213348388672,
|
|
"learning_rate": 1.989757735784372e-06,
|
|
"loss": 0.4636,
|
|
"num_input_tokens_seen": 6964416,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 0.1414762179117854,
|
|
"grad_norm": 39.992496490478516,
|
|
"learning_rate": 1.989597605563923e-06,
|
|
"loss": 0.4218,
|
|
"num_input_tokens_seen": 6980544,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 0.14179629985276232,
|
|
"grad_norm": 29.56856918334961,
|
|
"learning_rate": 1.9894362398192437e-06,
|
|
"loss": 0.5658,
|
|
"num_input_tokens_seen": 6997440,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 0.1421163817937392,
|
|
"grad_norm": 22.893774032592773,
|
|
"learning_rate": 1.9892736387518023e-06,
|
|
"loss": 0.4163,
|
|
"num_input_tokens_seen": 7012672,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 0.14243646373471608,
|
|
"grad_norm": 58.755828857421875,
|
|
"learning_rate": 1.9891098025646075e-06,
|
|
"loss": 0.4773,
|
|
"num_input_tokens_seen": 7027648,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 0.142756545675693,
|
|
"grad_norm": 28.471839904785156,
|
|
"learning_rate": 1.9889447314622105e-06,
|
|
"loss": 0.5303,
|
|
"num_input_tokens_seen": 7043200,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 0.14307662761666987,
|
|
"grad_norm": 41.83107376098633,
|
|
"learning_rate": 1.9887784256507046e-06,
|
|
"loss": 0.7152,
|
|
"num_input_tokens_seen": 7058688,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 0.14339670955764675,
|
|
"grad_norm": 32.69862365722656,
|
|
"learning_rate": 1.988610885337725e-06,
|
|
"loss": 0.6679,
|
|
"num_input_tokens_seen": 7074048,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 0.14371679149862365,
|
|
"grad_norm": 32.3195686340332,
|
|
"learning_rate": 1.9884421107324476e-06,
|
|
"loss": 0.5261,
|
|
"num_input_tokens_seen": 7089792,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 0.14403687343960053,
|
|
"grad_norm": 39.98912811279297,
|
|
"learning_rate": 1.9882721020455893e-06,
|
|
"loss": 0.4755,
|
|
"num_input_tokens_seen": 7104640,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 0.14435695538057744,
|
|
"grad_norm": 31.75237464904785,
|
|
"learning_rate": 1.988100859489408e-06,
|
|
"loss": 0.5019,
|
|
"num_input_tokens_seen": 7120064,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 0.14467703732155432,
|
|
"grad_norm": 34.01973342895508,
|
|
"learning_rate": 1.9879283832777017e-06,
|
|
"loss": 0.4754,
|
|
"num_input_tokens_seen": 7135232,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 0.1449971192625312,
|
|
"grad_norm": 48.68187713623047,
|
|
"learning_rate": 1.9877546736258096e-06,
|
|
"loss": 0.5075,
|
|
"num_input_tokens_seen": 7149632,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 0.1453172012035081,
|
|
"grad_norm": 38.213932037353516,
|
|
"learning_rate": 1.98757973075061e-06,
|
|
"loss": 0.4107,
|
|
"num_input_tokens_seen": 7164352,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 0.14563728314448499,
|
|
"grad_norm": 35.81578063964844,
|
|
"learning_rate": 1.987403554870521e-06,
|
|
"loss": 0.5188,
|
|
"num_input_tokens_seen": 7179776,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 0.14595736508546187,
|
|
"grad_norm": 36.673587799072266,
|
|
"learning_rate": 1.9872261462055003e-06,
|
|
"loss": 0.4212,
|
|
"num_input_tokens_seen": 7194240,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 0.14627744702643877,
|
|
"grad_norm": 21.095298767089844,
|
|
"learning_rate": 1.987047504977045e-06,
|
|
"loss": 0.4335,
|
|
"num_input_tokens_seen": 7209472,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 0.14659752896741565,
|
|
"grad_norm": 50.36029052734375,
|
|
"learning_rate": 1.9868676314081902e-06,
|
|
"loss": 0.414,
|
|
"num_input_tokens_seen": 7225088,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 0.14691761090839256,
|
|
"grad_norm": 74.21929931640625,
|
|
"learning_rate": 1.9866865257235107e-06,
|
|
"loss": 0.6901,
|
|
"num_input_tokens_seen": 7240704,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 0.14723769284936944,
|
|
"grad_norm": 29.289196014404297,
|
|
"learning_rate": 1.9865041881491188e-06,
|
|
"loss": 0.4177,
|
|
"num_input_tokens_seen": 7256000,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.14755777479034632,
|
|
"grad_norm": 50.457210540771484,
|
|
"learning_rate": 1.9863206189126653e-06,
|
|
"loss": 0.6016,
|
|
"num_input_tokens_seen": 7270336,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 0.14787785673132323,
|
|
"grad_norm": 49.66255187988281,
|
|
"learning_rate": 1.9861358182433382e-06,
|
|
"loss": 0.5612,
|
|
"num_input_tokens_seen": 7285440,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 0.1481979386723001,
|
|
"grad_norm": 37.03299331665039,
|
|
"learning_rate": 1.9859497863718634e-06,
|
|
"loss": 0.4711,
|
|
"num_input_tokens_seen": 7301120,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 0.14851802061327699,
|
|
"grad_norm": 22.66673469543457,
|
|
"learning_rate": 1.985762523530504e-06,
|
|
"loss": 0.5204,
|
|
"num_input_tokens_seen": 7316416,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 0.1488381025542539,
|
|
"grad_norm": 27.409502029418945,
|
|
"learning_rate": 1.98557402995306e-06,
|
|
"loss": 0.5051,
|
|
"num_input_tokens_seen": 7332160,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 0.14915818449523077,
|
|
"grad_norm": 40.534942626953125,
|
|
"learning_rate": 1.985384305874868e-06,
|
|
"loss": 0.7069,
|
|
"num_input_tokens_seen": 7347776,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 0.14947826643620768,
|
|
"grad_norm": 33.9571647644043,
|
|
"learning_rate": 1.9851933515328e-06,
|
|
"loss": 0.5467,
|
|
"num_input_tokens_seen": 7363200,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 0.14979834837718456,
|
|
"grad_norm": 40.87738037109375,
|
|
"learning_rate": 1.985001167165265e-06,
|
|
"loss": 0.4699,
|
|
"num_input_tokens_seen": 7378752,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 0.15011843031816144,
|
|
"grad_norm": 36.36539840698242,
|
|
"learning_rate": 1.984807753012208e-06,
|
|
"loss": 0.5165,
|
|
"num_input_tokens_seen": 7393984,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 0.15018244670635683,
|
|
"eval_loss": 0.5076366662979126,
|
|
"eval_runtime": 49.1845,
|
|
"eval_samples_per_second": 282.325,
|
|
"eval_steps_per_second": 35.296,
|
|
"num_input_tokens_seen": 7397056,
|
|
"step": 2346
|
|
},
|
|
{
|
|
"epoch": 0.15043851225913835,
|
|
"grad_norm": 25.733684539794922,
|
|
"learning_rate": 1.9846131093151086e-06,
|
|
"loss": 0.5902,
|
|
"num_input_tokens_seen": 7408832,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 0.15075859420011523,
|
|
"grad_norm": 16.574737548828125,
|
|
"learning_rate": 1.9844172363169808e-06,
|
|
"loss": 0.4582,
|
|
"num_input_tokens_seen": 7423040,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 0.15107867614109213,
|
|
"grad_norm": 53.731632232666016,
|
|
"learning_rate": 1.9842201342623756e-06,
|
|
"loss": 0.5117,
|
|
"num_input_tokens_seen": 7438464,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 0.151398758082069,
|
|
"grad_norm": 28.75635528564453,
|
|
"learning_rate": 1.9840218033973766e-06,
|
|
"loss": 0.5205,
|
|
"num_input_tokens_seen": 7453824,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 0.1517188400230459,
|
|
"grad_norm": 36.89908981323242,
|
|
"learning_rate": 1.9838222439696027e-06,
|
|
"loss": 0.5717,
|
|
"num_input_tokens_seen": 7469312,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 0.1520389219640228,
|
|
"grad_norm": 53.630462646484375,
|
|
"learning_rate": 1.9836214562282058e-06,
|
|
"loss": 0.7065,
|
|
"num_input_tokens_seen": 7485120,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 0.15235900390499968,
|
|
"grad_norm": 36.9291877746582,
|
|
"learning_rate": 1.9834194404238715e-06,
|
|
"loss": 0.4971,
|
|
"num_input_tokens_seen": 7500416,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 0.15267908584597656,
|
|
"grad_norm": 41.09784698486328,
|
|
"learning_rate": 1.9832161968088193e-06,
|
|
"loss": 0.4125,
|
|
"num_input_tokens_seen": 7516672,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 0.15299916778695347,
|
|
"grad_norm": 53.901432037353516,
|
|
"learning_rate": 1.9830117256368015e-06,
|
|
"loss": 0.4764,
|
|
"num_input_tokens_seen": 7532800,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 0.15331924972793035,
|
|
"grad_norm": 38.6842041015625,
|
|
"learning_rate": 1.982806027163102e-06,
|
|
"loss": 0.4924,
|
|
"num_input_tokens_seen": 7547776,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 0.15363933166890725,
|
|
"grad_norm": 28.168846130371094,
|
|
"learning_rate": 1.9825991016445386e-06,
|
|
"loss": 0.5579,
|
|
"num_input_tokens_seen": 7562496,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.15395941360988413,
|
|
"grad_norm": 41.71428298950195,
|
|
"learning_rate": 1.9823909493394594e-06,
|
|
"loss": 0.5286,
|
|
"num_input_tokens_seen": 7577920,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 0.154279495550861,
|
|
"grad_norm": 41.26945114135742,
|
|
"learning_rate": 1.9821815705077455e-06,
|
|
"loss": 0.5331,
|
|
"num_input_tokens_seen": 7593216,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 0.15459957749183792,
|
|
"grad_norm": 63.113800048828125,
|
|
"learning_rate": 1.9819709654108087e-06,
|
|
"loss": 0.5768,
|
|
"num_input_tokens_seen": 7608192,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 0.1549196594328148,
|
|
"grad_norm": 51.21147537231445,
|
|
"learning_rate": 1.981759134311592e-06,
|
|
"loss": 0.4652,
|
|
"num_input_tokens_seen": 7624448,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 0.15523974137379168,
|
|
"grad_norm": 45.952392578125,
|
|
"learning_rate": 1.981546077474569e-06,
|
|
"loss": 0.4847,
|
|
"num_input_tokens_seen": 7640192,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 0.15555982331476859,
|
|
"grad_norm": 33.45967483520508,
|
|
"learning_rate": 1.981331795165744e-06,
|
|
"loss": 0.5143,
|
|
"num_input_tokens_seen": 7654848,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 0.15587990525574547,
|
|
"grad_norm": 64.05781555175781,
|
|
"learning_rate": 1.9811162876526498e-06,
|
|
"loss": 0.6067,
|
|
"num_input_tokens_seen": 7670848,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 0.15619998719672237,
|
|
"grad_norm": 28.034521102905273,
|
|
"learning_rate": 1.9808995552043515e-06,
|
|
"loss": 0.6387,
|
|
"num_input_tokens_seen": 7686016,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 0.15652006913769925,
|
|
"grad_norm": 33.880714416503906,
|
|
"learning_rate": 1.9806815980914413e-06,
|
|
"loss": 0.5478,
|
|
"num_input_tokens_seen": 7701760,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 0.15684015107867613,
|
|
"grad_norm": 35.8829231262207,
|
|
"learning_rate": 1.9804624165860417e-06,
|
|
"loss": 0.5624,
|
|
"num_input_tokens_seen": 7717760,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 0.15716023301965304,
|
|
"grad_norm": 17.44162368774414,
|
|
"learning_rate": 1.9802420109618028e-06,
|
|
"loss": 0.3852,
|
|
"num_input_tokens_seen": 7733376,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 0.15748031496062992,
|
|
"grad_norm": 15.591707229614258,
|
|
"learning_rate": 1.980020381493904e-06,
|
|
"loss": 0.4984,
|
|
"num_input_tokens_seen": 7750464,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 0.1578003969016068,
|
|
"grad_norm": 38.971927642822266,
|
|
"learning_rate": 1.979797528459052e-06,
|
|
"loss": 0.4942,
|
|
"num_input_tokens_seen": 7768576,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 0.1581204788425837,
|
|
"grad_norm": 44.95268249511719,
|
|
"learning_rate": 1.979573452135482e-06,
|
|
"loss": 0.5334,
|
|
"num_input_tokens_seen": 7784256,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 0.15844056078356059,
|
|
"grad_norm": 33.37703323364258,
|
|
"learning_rate": 1.979348152802955e-06,
|
|
"loss": 0.3186,
|
|
"num_input_tokens_seen": 7799232,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 0.1587606427245375,
|
|
"grad_norm": 48.24396896362305,
|
|
"learning_rate": 1.979121630742761e-06,
|
|
"loss": 0.592,
|
|
"num_input_tokens_seen": 7815040,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 0.15908072466551437,
|
|
"grad_norm": 18.306211471557617,
|
|
"learning_rate": 1.9788938862377146e-06,
|
|
"loss": 0.4479,
|
|
"num_input_tokens_seen": 7830400,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 0.15940080660649125,
|
|
"grad_norm": 30.219003677368164,
|
|
"learning_rate": 1.9786649195721577e-06,
|
|
"loss": 0.4818,
|
|
"num_input_tokens_seen": 7846336,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 0.15972088854746816,
|
|
"grad_norm": 42.44570541381836,
|
|
"learning_rate": 1.978434731031958e-06,
|
|
"loss": 0.6323,
|
|
"num_input_tokens_seen": 7862528,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 0.16004097048844504,
|
|
"grad_norm": 36.15270233154297,
|
|
"learning_rate": 1.9782033209045085e-06,
|
|
"loss": 0.4541,
|
|
"num_input_tokens_seen": 7880000,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.16036105242942192,
|
|
"grad_norm": 18.829133987426758,
|
|
"learning_rate": 1.977970689478727e-06,
|
|
"loss": 0.4053,
|
|
"num_input_tokens_seen": 7895296,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 0.16068113437039883,
|
|
"grad_norm": 54.07673645019531,
|
|
"learning_rate": 1.9777368370450577e-06,
|
|
"loss": 0.5884,
|
|
"num_input_tokens_seen": 7911104,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 0.1610012163113757,
|
|
"grad_norm": 31.81148910522461,
|
|
"learning_rate": 1.9775017638954674e-06,
|
|
"loss": 0.521,
|
|
"num_input_tokens_seen": 7925952,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 0.1613212982523526,
|
|
"grad_norm": 31.94769287109375,
|
|
"learning_rate": 1.9772654703234476e-06,
|
|
"loss": 0.5943,
|
|
"num_input_tokens_seen": 7940928,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 0.1616413801933295,
|
|
"grad_norm": 43.36374282836914,
|
|
"learning_rate": 1.977027956624014e-06,
|
|
"loss": 0.5665,
|
|
"num_input_tokens_seen": 7955200,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 0.16196146213430637,
|
|
"grad_norm": 40.16360855102539,
|
|
"learning_rate": 1.9767892230937046e-06,
|
|
"loss": 0.5819,
|
|
"num_input_tokens_seen": 7970944,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 0.16228154407528328,
|
|
"grad_norm": 57.72364044189453,
|
|
"learning_rate": 1.976549270030581e-06,
|
|
"loss": 0.4311,
|
|
"num_input_tokens_seen": 7985856,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 0.16260162601626016,
|
|
"grad_norm": 38.951045989990234,
|
|
"learning_rate": 1.9763080977342286e-06,
|
|
"loss": 0.4678,
|
|
"num_input_tokens_seen": 8001088,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 0.16292170795723707,
|
|
"grad_norm": 41.949275970458984,
|
|
"learning_rate": 1.9760657065057527e-06,
|
|
"loss": 0.4965,
|
|
"num_input_tokens_seen": 8017856,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 0.16324178989821395,
|
|
"grad_norm": 40.579071044921875,
|
|
"learning_rate": 1.975822096647782e-06,
|
|
"loss": 0.4527,
|
|
"num_input_tokens_seen": 8033792,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 0.16356187183919083,
|
|
"grad_norm": 38.93642807006836,
|
|
"learning_rate": 1.975577268464466e-06,
|
|
"loss": 0.4821,
|
|
"num_input_tokens_seen": 8048256,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 0.16388195378016773,
|
|
"grad_norm": 30.569536209106445,
|
|
"learning_rate": 1.9753312222614765e-06,
|
|
"loss": 0.5626,
|
|
"num_input_tokens_seen": 8063680,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 0.1642020357211446,
|
|
"grad_norm": 53.63691329956055,
|
|
"learning_rate": 1.9750839583460036e-06,
|
|
"loss": 0.4853,
|
|
"num_input_tokens_seen": 8079744,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 0.1645221176621215,
|
|
"grad_norm": 32.5906982421875,
|
|
"learning_rate": 1.9748354770267603e-06,
|
|
"loss": 0.502,
|
|
"num_input_tokens_seen": 8094656,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 0.1648421996030984,
|
|
"grad_norm": 24.61626434326172,
|
|
"learning_rate": 1.9745857786139777e-06,
|
|
"loss": 0.5116,
|
|
"num_input_tokens_seen": 8110528,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 0.16516228154407528,
|
|
"grad_norm": 48.395931243896484,
|
|
"learning_rate": 1.974334863419408e-06,
|
|
"loss": 0.6028,
|
|
"num_input_tokens_seen": 8126720,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 0.1654823634850522,
|
|
"grad_norm": 34.782806396484375,
|
|
"learning_rate": 1.9740827317563212e-06,
|
|
"loss": 0.518,
|
|
"num_input_tokens_seen": 8141312,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 0.16580244542602907,
|
|
"grad_norm": 35.59202575683594,
|
|
"learning_rate": 1.973829383939507e-06,
|
|
"loss": 0.4889,
|
|
"num_input_tokens_seen": 8156736,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 0.16612252736700595,
|
|
"grad_norm": 49.05874252319336,
|
|
"learning_rate": 1.973574820285273e-06,
|
|
"loss": 0.4987,
|
|
"num_input_tokens_seen": 8172480,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 0.16644260930798285,
|
|
"grad_norm": 39.507137298583984,
|
|
"learning_rate": 1.9733190411114443e-06,
|
|
"loss": 0.5702,
|
|
"num_input_tokens_seen": 8188224,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.16676269124895973,
|
|
"grad_norm": 36.02799987792969,
|
|
"learning_rate": 1.9730620467373654e-06,
|
|
"loss": 0.438,
|
|
"num_input_tokens_seen": 8204352,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 0.1670827731899366,
|
|
"grad_norm": 44.20855712890625,
|
|
"learning_rate": 1.9728038374838958e-06,
|
|
"loss": 0.5744,
|
|
"num_input_tokens_seen": 8219328,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 0.16740285513091352,
|
|
"grad_norm": 20.6259822845459,
|
|
"learning_rate": 1.972544413673413e-06,
|
|
"loss": 0.3913,
|
|
"num_input_tokens_seen": 8234560,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 0.1677229370718904,
|
|
"grad_norm": 28.986614227294922,
|
|
"learning_rate": 1.9722837756298108e-06,
|
|
"loss": 0.5779,
|
|
"num_input_tokens_seen": 8249344,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 0.1680430190128673,
|
|
"grad_norm": 53.51920700073242,
|
|
"learning_rate": 1.972021923678499e-06,
|
|
"loss": 0.5548,
|
|
"num_input_tokens_seen": 8265600,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 0.16836310095384419,
|
|
"grad_norm": 27.421762466430664,
|
|
"learning_rate": 1.971758858146403e-06,
|
|
"loss": 0.4861,
|
|
"num_input_tokens_seen": 8280384,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 0.16868318289482107,
|
|
"grad_norm": 41.67002868652344,
|
|
"learning_rate": 1.9714945793619626e-06,
|
|
"loss": 0.4897,
|
|
"num_input_tokens_seen": 8295744,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 0.16900326483579797,
|
|
"grad_norm": 27.60586929321289,
|
|
"learning_rate": 1.971229087655133e-06,
|
|
"loss": 0.5052,
|
|
"num_input_tokens_seen": 8311680,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 0.16932334677677485,
|
|
"grad_norm": 29.15129280090332,
|
|
"learning_rate": 1.9709623833573842e-06,
|
|
"loss": 0.4678,
|
|
"num_input_tokens_seen": 8326592,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 0.16964342871775173,
|
|
"grad_norm": 54.205875396728516,
|
|
"learning_rate": 1.9706944668016994e-06,
|
|
"loss": 0.4588,
|
|
"num_input_tokens_seen": 8341632,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 0.16996351065872864,
|
|
"grad_norm": 38.538326263427734,
|
|
"learning_rate": 1.9704253383225756e-06,
|
|
"loss": 0.4627,
|
|
"num_input_tokens_seen": 8358400,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 0.17028359259970552,
|
|
"grad_norm": 33.1207275390625,
|
|
"learning_rate": 1.970154998256023e-06,
|
|
"loss": 0.4845,
|
|
"num_input_tokens_seen": 8374144,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 0.17060367454068243,
|
|
"grad_norm": 35.72023010253906,
|
|
"learning_rate": 1.9698834469395644e-06,
|
|
"loss": 0.4215,
|
|
"num_input_tokens_seen": 8389440,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 0.1709237564816593,
|
|
"grad_norm": 33.63475036621094,
|
|
"learning_rate": 1.969610684712234e-06,
|
|
"loss": 0.5408,
|
|
"num_input_tokens_seen": 8404672,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 0.17124383842263619,
|
|
"grad_norm": 59.44383239746094,
|
|
"learning_rate": 1.9693367119145794e-06,
|
|
"loss": 0.5508,
|
|
"num_input_tokens_seen": 8420096,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 0.1715639203636131,
|
|
"grad_norm": 42.37469482421875,
|
|
"learning_rate": 1.969061528888659e-06,
|
|
"loss": 0.6684,
|
|
"num_input_tokens_seen": 8436288,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 0.17188400230458997,
|
|
"grad_norm": 23.906444549560547,
|
|
"learning_rate": 1.9687851359780415e-06,
|
|
"loss": 0.5401,
|
|
"num_input_tokens_seen": 8452672,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 0.17220408424556685,
|
|
"grad_norm": 19.489620208740234,
|
|
"learning_rate": 1.968507533527807e-06,
|
|
"loss": 0.4867,
|
|
"num_input_tokens_seen": 8469120,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 0.17252416618654376,
|
|
"grad_norm": 46.37827682495117,
|
|
"learning_rate": 1.9682287218845455e-06,
|
|
"loss": 0.4748,
|
|
"num_input_tokens_seen": 8484736,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 0.17284424812752064,
|
|
"grad_norm": 38.747093200683594,
|
|
"learning_rate": 1.967948701396356e-06,
|
|
"loss": 0.7367,
|
|
"num_input_tokens_seen": 8500480,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.17316433006849755,
|
|
"grad_norm": 28.16217803955078,
|
|
"learning_rate": 1.9676674724128485e-06,
|
|
"loss": 0.3977,
|
|
"num_input_tokens_seen": 8514624,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 0.17348441200947443,
|
|
"grad_norm": 19.507436752319336,
|
|
"learning_rate": 1.9673850352851397e-06,
|
|
"loss": 0.4543,
|
|
"num_input_tokens_seen": 8529664,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 0.1738044939504513,
|
|
"grad_norm": 31.663122177124023,
|
|
"learning_rate": 1.967101390365856e-06,
|
|
"loss": 0.5825,
|
|
"num_input_tokens_seen": 8545280,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 0.1741245758914282,
|
|
"grad_norm": 29.334657669067383,
|
|
"learning_rate": 1.966816538009131e-06,
|
|
"loss": 0.492,
|
|
"num_input_tokens_seen": 8560384,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 0.1744446578324051,
|
|
"grad_norm": 41.919986724853516,
|
|
"learning_rate": 1.966530478570607e-06,
|
|
"loss": 0.5425,
|
|
"num_input_tokens_seen": 8576960,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 0.174764739773382,
|
|
"grad_norm": 31.315555572509766,
|
|
"learning_rate": 1.9662432124074325e-06,
|
|
"loss": 0.4635,
|
|
"num_input_tokens_seen": 8592384,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 0.17508482171435888,
|
|
"grad_norm": 29.594783782958984,
|
|
"learning_rate": 1.965954739878262e-06,
|
|
"loss": 0.4836,
|
|
"num_input_tokens_seen": 8609024,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"epoch": 0.17540490365533576,
|
|
"grad_norm": 46.86975860595703,
|
|
"learning_rate": 1.965665061343257e-06,
|
|
"loss": 0.4283,
|
|
"num_input_tokens_seen": 8624768,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 0.17572498559631267,
|
|
"grad_norm": 25.347562789916992,
|
|
"learning_rate": 1.965374177164085e-06,
|
|
"loss": 0.4646,
|
|
"num_input_tokens_seen": 8640448,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"epoch": 0.17604506753728955,
|
|
"grad_norm": 27.5438232421875,
|
|
"learning_rate": 1.9650820877039182e-06,
|
|
"loss": 0.5427,
|
|
"num_input_tokens_seen": 8655296,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 0.17636514947826643,
|
|
"grad_norm": 69.62262725830078,
|
|
"learning_rate": 1.9647887933274334e-06,
|
|
"loss": 0.4878,
|
|
"num_input_tokens_seen": 8671872,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"epoch": 0.17668523141924333,
|
|
"grad_norm": 21.517606735229492,
|
|
"learning_rate": 1.9644942944008124e-06,
|
|
"loss": 0.4822,
|
|
"num_input_tokens_seen": 8687680,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 0.1770053133602202,
|
|
"grad_norm": 57.37998962402344,
|
|
"learning_rate": 1.96419859129174e-06,
|
|
"loss": 0.5914,
|
|
"num_input_tokens_seen": 8702912,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"epoch": 0.17732539530119712,
|
|
"grad_norm": 25.293439865112305,
|
|
"learning_rate": 1.963901684369406e-06,
|
|
"loss": 0.4702,
|
|
"num_input_tokens_seen": 8718144,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 0.177645477242174,
|
|
"grad_norm": 36.15742874145508,
|
|
"learning_rate": 1.9636035740045013e-06,
|
|
"loss": 0.4989,
|
|
"num_input_tokens_seen": 8732992,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"epoch": 0.17796555918315088,
|
|
"grad_norm": 26.592554092407227,
|
|
"learning_rate": 1.9633042605692207e-06,
|
|
"loss": 0.6024,
|
|
"num_input_tokens_seen": 8749056,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 0.17828564112412779,
|
|
"grad_norm": 22.61241912841797,
|
|
"learning_rate": 1.9630037444372597e-06,
|
|
"loss": 0.4879,
|
|
"num_input_tokens_seen": 8765184,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"epoch": 0.17860572306510467,
|
|
"grad_norm": 43.24379348754883,
|
|
"learning_rate": 1.9627020259838177e-06,
|
|
"loss": 0.4133,
|
|
"num_input_tokens_seen": 8780480,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 0.17892580500608155,
|
|
"grad_norm": 33.002906799316406,
|
|
"learning_rate": 1.9623991055855925e-06,
|
|
"loss": 0.5539,
|
|
"num_input_tokens_seen": 8796352,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"epoch": 0.17924588694705845,
|
|
"grad_norm": 27.26972770690918,
|
|
"learning_rate": 1.962094983620784e-06,
|
|
"loss": 0.443,
|
|
"num_input_tokens_seen": 8810688,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.17956596888803533,
|
|
"grad_norm": 49.42767333984375,
|
|
"learning_rate": 1.9617896604690925e-06,
|
|
"loss": 0.4279,
|
|
"num_input_tokens_seen": 8826304,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"epoch": 0.17988605082901224,
|
|
"grad_norm": 22.84317970275879,
|
|
"learning_rate": 1.961483136511717e-06,
|
|
"loss": 0.4628,
|
|
"num_input_tokens_seen": 8841344,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 0.18020613276998912,
|
|
"grad_norm": 47.95643997192383,
|
|
"learning_rate": 1.9611754121313567e-06,
|
|
"loss": 0.6058,
|
|
"num_input_tokens_seen": 8857664,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"epoch": 0.180526214710966,
|
|
"grad_norm": 52.1284294128418,
|
|
"learning_rate": 1.960866487712209e-06,
|
|
"loss": 0.5762,
|
|
"num_input_tokens_seen": 8873408,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 0.1808462966519429,
|
|
"grad_norm": 31.013389587402344,
|
|
"learning_rate": 1.9605563636399695e-06,
|
|
"loss": 0.425,
|
|
"num_input_tokens_seen": 8889472,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"epoch": 0.18116637859291979,
|
|
"grad_norm": 60.00368118286133,
|
|
"learning_rate": 1.9602450403018315e-06,
|
|
"loss": 0.5908,
|
|
"num_input_tokens_seen": 8904640,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 0.18148646053389667,
|
|
"grad_norm": 35.06608200073242,
|
|
"learning_rate": 1.9599325180864864e-06,
|
|
"loss": 0.4446,
|
|
"num_input_tokens_seen": 8919680,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"epoch": 0.18180654247487357,
|
|
"grad_norm": 31.069002151489258,
|
|
"learning_rate": 1.9596187973841216e-06,
|
|
"loss": 0.4418,
|
|
"num_input_tokens_seen": 8935360,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 0.18212662441585045,
|
|
"grad_norm": 26.10578727722168,
|
|
"learning_rate": 1.959303878586421e-06,
|
|
"loss": 0.4892,
|
|
"num_input_tokens_seen": 8951552,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"epoch": 0.18244670635682736,
|
|
"grad_norm": 42.628684997558594,
|
|
"learning_rate": 1.9589877620865647e-06,
|
|
"loss": 0.5694,
|
|
"num_input_tokens_seen": 8968576,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 0.18276678829780424,
|
|
"grad_norm": 27.467554092407227,
|
|
"learning_rate": 1.9586704482792277e-06,
|
|
"loss": 0.4559,
|
|
"num_input_tokens_seen": 8983744,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"epoch": 0.18308687023878112,
|
|
"grad_norm": 30.344791412353516,
|
|
"learning_rate": 1.95835193756058e-06,
|
|
"loss": 0.4376,
|
|
"num_input_tokens_seen": 8999040,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 0.18340695217975803,
|
|
"grad_norm": 37.68637466430664,
|
|
"learning_rate": 1.9580322303282858e-06,
|
|
"loss": 0.4186,
|
|
"num_input_tokens_seen": 9015872,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"epoch": 0.1837270341207349,
|
|
"grad_norm": 26.828548431396484,
|
|
"learning_rate": 1.9577113269815038e-06,
|
|
"loss": 0.4001,
|
|
"num_input_tokens_seen": 9031744,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 0.18404711606171179,
|
|
"grad_norm": 34.85321807861328,
|
|
"learning_rate": 1.957389227920885e-06,
|
|
"loss": 0.5877,
|
|
"num_input_tokens_seen": 9047872,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"epoch": 0.1843671980026887,
|
|
"grad_norm": 33.741172790527344,
|
|
"learning_rate": 1.957065933548574e-06,
|
|
"loss": 0.5101,
|
|
"num_input_tokens_seen": 9062976,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 0.18468727994366557,
|
|
"grad_norm": 56.83228302001953,
|
|
"learning_rate": 1.956741444268208e-06,
|
|
"loss": 0.5899,
|
|
"num_input_tokens_seen": 9078208,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"epoch": 0.18500736188464248,
|
|
"grad_norm": 30.513900756835938,
|
|
"learning_rate": 1.9564157604849154e-06,
|
|
"loss": 0.4744,
|
|
"num_input_tokens_seen": 9094720,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 0.18532744382561936,
|
|
"grad_norm": 28.41360092163086,
|
|
"learning_rate": 1.9560888826053163e-06,
|
|
"loss": 0.5274,
|
|
"num_input_tokens_seen": 9110336,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"epoch": 0.18564752576659624,
|
|
"grad_norm": 25.244827270507812,
|
|
"learning_rate": 1.9557608110375212e-06,
|
|
"loss": 0.5573,
|
|
"num_input_tokens_seen": 9126912,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.18596760770757315,
|
|
"grad_norm": 26.246530532836914,
|
|
"learning_rate": 1.955431546191132e-06,
|
|
"loss": 0.549,
|
|
"num_input_tokens_seen": 9142400,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"epoch": 0.18628768964855003,
|
|
"grad_norm": 44.32508087158203,
|
|
"learning_rate": 1.95510108847724e-06,
|
|
"loss": 0.5161,
|
|
"num_input_tokens_seen": 9157184,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 0.1866077715895269,
|
|
"grad_norm": 28.210281372070312,
|
|
"learning_rate": 1.954769438308424e-06,
|
|
"loss": 0.5237,
|
|
"num_input_tokens_seen": 9173696,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"epoch": 0.1869278535305038,
|
|
"grad_norm": 36.434974670410156,
|
|
"learning_rate": 1.954436596098754e-06,
|
|
"loss": 0.4992,
|
|
"num_input_tokens_seen": 9190080,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 0.1872479354714807,
|
|
"grad_norm": 59.13997268676758,
|
|
"learning_rate": 1.9541025622637875e-06,
|
|
"loss": 0.5761,
|
|
"num_input_tokens_seen": 9204352,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"epoch": 0.1875680174124576,
|
|
"grad_norm": 50.34525680541992,
|
|
"learning_rate": 1.95376733722057e-06,
|
|
"loss": 0.6098,
|
|
"num_input_tokens_seen": 9219200,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 0.18788809935343448,
|
|
"grad_norm": 33.083404541015625,
|
|
"learning_rate": 1.9534309213876337e-06,
|
|
"loss": 0.4702,
|
|
"num_input_tokens_seen": 9233600,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"epoch": 0.18820818129441136,
|
|
"grad_norm": 40.38674545288086,
|
|
"learning_rate": 1.953093315184997e-06,
|
|
"loss": 0.4343,
|
|
"num_input_tokens_seen": 9249536,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 0.18852826323538827,
|
|
"grad_norm": 39.487579345703125,
|
|
"learning_rate": 1.952754519034166e-06,
|
|
"loss": 0.6391,
|
|
"num_input_tokens_seen": 9264256,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"epoch": 0.18884834517636515,
|
|
"grad_norm": 58.533199310302734,
|
|
"learning_rate": 1.9524145333581313e-06,
|
|
"loss": 0.4487,
|
|
"num_input_tokens_seen": 9279488,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 0.18916842711734205,
|
|
"grad_norm": 26.437389373779297,
|
|
"learning_rate": 1.952073358581369e-06,
|
|
"loss": 0.5122,
|
|
"num_input_tokens_seen": 9294336,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"epoch": 0.18948850905831893,
|
|
"grad_norm": 34.934356689453125,
|
|
"learning_rate": 1.95173099512984e-06,
|
|
"loss": 0.5552,
|
|
"num_input_tokens_seen": 9309376,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 0.1898085909992958,
|
|
"grad_norm": 22.976945877075195,
|
|
"learning_rate": 1.9513874434309894e-06,
|
|
"loss": 0.4579,
|
|
"num_input_tokens_seen": 9324224,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"epoch": 0.19012867294027272,
|
|
"grad_norm": 27.009410858154297,
|
|
"learning_rate": 1.951042703913745e-06,
|
|
"loss": 0.4466,
|
|
"num_input_tokens_seen": 9339136,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 0.1904487548812496,
|
|
"grad_norm": 26.152063369750977,
|
|
"learning_rate": 1.950696777008518e-06,
|
|
"loss": 0.4491,
|
|
"num_input_tokens_seen": 9354688,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"epoch": 0.19076883682222648,
|
|
"grad_norm": 23.096553802490234,
|
|
"learning_rate": 1.9503496631472025e-06,
|
|
"loss": 0.4917,
|
|
"num_input_tokens_seen": 9369664,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 0.19108891876320339,
|
|
"grad_norm": 42.896331787109375,
|
|
"learning_rate": 1.9500013627631746e-06,
|
|
"loss": 0.6324,
|
|
"num_input_tokens_seen": 9384768,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"epoch": 0.19140900070418027,
|
|
"grad_norm": 34.10990524291992,
|
|
"learning_rate": 1.949651876291291e-06,
|
|
"loss": 0.3728,
|
|
"num_input_tokens_seen": 9400320,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 0.19172908264515717,
|
|
"grad_norm": 56.81764221191406,
|
|
"learning_rate": 1.9493012041678894e-06,
|
|
"loss": 0.4739,
|
|
"num_input_tokens_seen": 9415872,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"epoch": 0.19204916458613405,
|
|
"grad_norm": 31.37006187438965,
|
|
"learning_rate": 1.9489493468307883e-06,
|
|
"loss": 0.6013,
|
|
"num_input_tokens_seen": 9432704,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.19236924652711093,
|
|
"grad_norm": 52.02330017089844,
|
|
"learning_rate": 1.948596304719286e-06,
|
|
"loss": 0.5159,
|
|
"num_input_tokens_seen": 9448192,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"epoch": 0.19268932846808784,
|
|
"grad_norm": 44.85215759277344,
|
|
"learning_rate": 1.9482420782741594e-06,
|
|
"loss": 0.4322,
|
|
"num_input_tokens_seen": 9464576,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 0.19300941040906472,
|
|
"grad_norm": 30.883983612060547,
|
|
"learning_rate": 1.9478866679376647e-06,
|
|
"loss": 0.5546,
|
|
"num_input_tokens_seen": 9479936,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"epoch": 0.1933294923500416,
|
|
"grad_norm": 29.6319637298584,
|
|
"learning_rate": 1.9475300741535353e-06,
|
|
"loss": 0.5447,
|
|
"num_input_tokens_seen": 9497280,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 0.1936495742910185,
|
|
"grad_norm": 36.820396423339844,
|
|
"learning_rate": 1.9471722973669833e-06,
|
|
"loss": 0.4568,
|
|
"num_input_tokens_seen": 9514496,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"epoch": 0.19396965623199539,
|
|
"grad_norm": 23.96208953857422,
|
|
"learning_rate": 1.946813338024697e-06,
|
|
"loss": 0.3932,
|
|
"num_input_tokens_seen": 9529536,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 0.1942897381729723,
|
|
"grad_norm": 55.99610137939453,
|
|
"learning_rate": 1.9464531965748414e-06,
|
|
"loss": 0.526,
|
|
"num_input_tokens_seen": 9545472,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"epoch": 0.19460982011394917,
|
|
"grad_norm": 39.6732292175293,
|
|
"learning_rate": 1.9460918734670573e-06,
|
|
"loss": 0.585,
|
|
"num_input_tokens_seen": 9560960,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 0.19492990205492605,
|
|
"grad_norm": 29.82390022277832,
|
|
"learning_rate": 1.945729369152461e-06,
|
|
"loss": 0.5221,
|
|
"num_input_tokens_seen": 9576320,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"epoch": 0.19524998399590296,
|
|
"grad_norm": 44.162254333496094,
|
|
"learning_rate": 1.945365684083643e-06,
|
|
"loss": 0.5632,
|
|
"num_input_tokens_seen": 9592192,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 0.19557006593687984,
|
|
"grad_norm": 52.55691146850586,
|
|
"learning_rate": 1.945000818714668e-06,
|
|
"loss": 0.6164,
|
|
"num_input_tokens_seen": 9608128,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"epoch": 0.19589014787785672,
|
|
"grad_norm": 27.91643714904785,
|
|
"learning_rate": 1.944634773501076e-06,
|
|
"loss": 0.5338,
|
|
"num_input_tokens_seen": 9623872,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 0.19621022981883363,
|
|
"grad_norm": 51.04069900512695,
|
|
"learning_rate": 1.9442675488998783e-06,
|
|
"loss": 0.5496,
|
|
"num_input_tokens_seen": 9639488,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"epoch": 0.1965303117598105,
|
|
"grad_norm": 28.205469131469727,
|
|
"learning_rate": 1.9438991453695587e-06,
|
|
"loss": 0.4913,
|
|
"num_input_tokens_seen": 9655680,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 0.1968503937007874,
|
|
"grad_norm": 36.26915740966797,
|
|
"learning_rate": 1.943529563370073e-06,
|
|
"loss": 0.5489,
|
|
"num_input_tokens_seen": 9670400,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"epoch": 0.1971704756417643,
|
|
"grad_norm": 21.7237606048584,
|
|
"learning_rate": 1.9431588033628495e-06,
|
|
"loss": 0.3868,
|
|
"num_input_tokens_seen": 9685504,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 0.19749055758274117,
|
|
"grad_norm": 44.26191329956055,
|
|
"learning_rate": 1.9427868658107862e-06,
|
|
"loss": 0.635,
|
|
"num_input_tokens_seen": 9701952,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"epoch": 0.19781063952371808,
|
|
"grad_norm": 22.945430755615234,
|
|
"learning_rate": 1.942413751178251e-06,
|
|
"loss": 0.4485,
|
|
"num_input_tokens_seen": 9716928,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 0.19813072146469496,
|
|
"grad_norm": 55.33934783935547,
|
|
"learning_rate": 1.9420394599310826e-06,
|
|
"loss": 0.6516,
|
|
"num_input_tokens_seen": 9732096,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"epoch": 0.19845080340567184,
|
|
"grad_norm": 27.51698112487793,
|
|
"learning_rate": 1.941663992536588e-06,
|
|
"loss": 0.5307,
|
|
"num_input_tokens_seen": 9747648,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.19877088534664875,
|
|
"grad_norm": 14.455513954162598,
|
|
"learning_rate": 1.941287349463542e-06,
|
|
"loss": 0.4371,
|
|
"num_input_tokens_seen": 9763072,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"epoch": 0.19909096728762563,
|
|
"grad_norm": 28.985132217407227,
|
|
"learning_rate": 1.940909531182188e-06,
|
|
"loss": 0.4726,
|
|
"num_input_tokens_seen": 9778176,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 0.19941104922860253,
|
|
"grad_norm": 45.77129364013672,
|
|
"learning_rate": 1.9405305381642375e-06,
|
|
"loss": 0.6129,
|
|
"num_input_tokens_seen": 9793536,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"epoch": 0.1997311311695794,
|
|
"grad_norm": 24.09324836730957,
|
|
"learning_rate": 1.9401503708828665e-06,
|
|
"loss": 0.4986,
|
|
"num_input_tokens_seen": 9808192,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 0.2000512131105563,
|
|
"grad_norm": 32.09850311279297,
|
|
"learning_rate": 1.939769029812719e-06,
|
|
"loss": 0.5774,
|
|
"num_input_tokens_seen": 9823232,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"epoch": 0.20024326227514244,
|
|
"eval_loss": 0.48840755224227905,
|
|
"eval_runtime": 49.2154,
|
|
"eval_samples_per_second": 282.148,
|
|
"eval_steps_per_second": 35.274,
|
|
"num_input_tokens_seen": 9832064,
|
|
"step": 3128
|
|
},
|
|
{
|
|
"epoch": 0.2003712950515332,
|
|
"grad_norm": 42.523658752441406,
|
|
"learning_rate": 1.939386515429904e-06,
|
|
"loss": 0.5893,
|
|
"num_input_tokens_seen": 9839488,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 0.20069137699251008,
|
|
"grad_norm": 20.936914443969727,
|
|
"learning_rate": 1.9390028282119942e-06,
|
|
"loss": 0.421,
|
|
"num_input_tokens_seen": 9856192,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"epoch": 0.201011458933487,
|
|
"grad_norm": 37.659271240234375,
|
|
"learning_rate": 1.938617968638029e-06,
|
|
"loss": 0.5122,
|
|
"num_input_tokens_seen": 9871552,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 0.20133154087446387,
|
|
"grad_norm": 40.56658172607422,
|
|
"learning_rate": 1.938231937188509e-06,
|
|
"loss": 0.5077,
|
|
"num_input_tokens_seen": 9886016,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"epoch": 0.20165162281544075,
|
|
"grad_norm": 43.369693756103516,
|
|
"learning_rate": 1.9378447343453995e-06,
|
|
"loss": 0.6156,
|
|
"num_input_tokens_seen": 9903552,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 0.20197170475641765,
|
|
"grad_norm": 43.882118225097656,
|
|
"learning_rate": 1.9374563605921275e-06,
|
|
"loss": 0.3458,
|
|
"num_input_tokens_seen": 9920320,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"epoch": 0.20229178669739453,
|
|
"grad_norm": 30.69708251953125,
|
|
"learning_rate": 1.937066816413582e-06,
|
|
"loss": 0.5926,
|
|
"num_input_tokens_seen": 9935936,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 0.2026118686383714,
|
|
"grad_norm": 30.447908401489258,
|
|
"learning_rate": 1.9366761022961146e-06,
|
|
"loss": 0.4757,
|
|
"num_input_tokens_seen": 9950912,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"epoch": 0.20293195057934832,
|
|
"grad_norm": 40.40016174316406,
|
|
"learning_rate": 1.9362842187275354e-06,
|
|
"loss": 0.5615,
|
|
"num_input_tokens_seen": 9966080,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 0.2032520325203252,
|
|
"grad_norm": 29.704164505004883,
|
|
"learning_rate": 1.9358911661971155e-06,
|
|
"loss": 0.4789,
|
|
"num_input_tokens_seen": 9982080,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"epoch": 0.2035721144613021,
|
|
"grad_norm": 28.506755828857422,
|
|
"learning_rate": 1.9354969451955864e-06,
|
|
"loss": 0.4647,
|
|
"num_input_tokens_seen": 9996544,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 0.20389219640227899,
|
|
"grad_norm": 27.22804832458496,
|
|
"learning_rate": 1.9351015562151375e-06,
|
|
"loss": 0.5497,
|
|
"num_input_tokens_seen": 10011776,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"epoch": 0.20421227834325587,
|
|
"grad_norm": 25.4746150970459,
|
|
"learning_rate": 1.934704999749416e-06,
|
|
"loss": 0.4331,
|
|
"num_input_tokens_seen": 10027264,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 0.20453236028423277,
|
|
"grad_norm": 23.414485931396484,
|
|
"learning_rate": 1.9343072762935274e-06,
|
|
"loss": 0.4203,
|
|
"num_input_tokens_seen": 10042432,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"epoch": 0.20485244222520965,
|
|
"grad_norm": 28.72736167907715,
|
|
"learning_rate": 1.933908386344035e-06,
|
|
"loss": 0.4135,
|
|
"num_input_tokens_seen": 10057792,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.20517252416618653,
|
|
"grad_norm": 33.854576110839844,
|
|
"learning_rate": 1.9335083303989565e-06,
|
|
"loss": 0.5222,
|
|
"num_input_tokens_seen": 10074752,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"epoch": 0.20549260610716344,
|
|
"grad_norm": 37.276336669921875,
|
|
"learning_rate": 1.9331071089577674e-06,
|
|
"loss": 0.576,
|
|
"num_input_tokens_seen": 10090752,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 0.20581268804814032,
|
|
"grad_norm": 51.40751647949219,
|
|
"learning_rate": 1.9327047225213963e-06,
|
|
"loss": 0.4961,
|
|
"num_input_tokens_seen": 10106240,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"epoch": 0.20613276998911723,
|
|
"grad_norm": 35.04685974121094,
|
|
"learning_rate": 1.9323011715922283e-06,
|
|
"loss": 0.4128,
|
|
"num_input_tokens_seen": 10121856,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 0.2064528519300941,
|
|
"grad_norm": 67.41058349609375,
|
|
"learning_rate": 1.931896456674101e-06,
|
|
"loss": 0.4764,
|
|
"num_input_tokens_seen": 10137408,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"epoch": 0.20677293387107099,
|
|
"grad_norm": 32.64918899536133,
|
|
"learning_rate": 1.931490578272306e-06,
|
|
"loss": 0.4548,
|
|
"num_input_tokens_seen": 10152640,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 0.2070930158120479,
|
|
"grad_norm": 33.72087097167969,
|
|
"learning_rate": 1.9310835368935867e-06,
|
|
"loss": 0.3538,
|
|
"num_input_tokens_seen": 10167936,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"epoch": 0.20741309775302477,
|
|
"grad_norm": 36.13018035888672,
|
|
"learning_rate": 1.93067533304614e-06,
|
|
"loss": 0.4205,
|
|
"num_input_tokens_seen": 10183360,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 0.20773317969400165,
|
|
"grad_norm": 29.964752197265625,
|
|
"learning_rate": 1.9302659672396128e-06,
|
|
"loss": 0.5557,
|
|
"num_input_tokens_seen": 10198208,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"epoch": 0.20805326163497856,
|
|
"grad_norm": 27.227624893188477,
|
|
"learning_rate": 1.9298554399851025e-06,
|
|
"loss": 0.4903,
|
|
"num_input_tokens_seen": 10213568,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 0.20837334357595544,
|
|
"grad_norm": 37.30453109741211,
|
|
"learning_rate": 1.929443751795158e-06,
|
|
"loss": 0.4833,
|
|
"num_input_tokens_seen": 10230080,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"epoch": 0.20869342551693235,
|
|
"grad_norm": 23.320819854736328,
|
|
"learning_rate": 1.929030903183776e-06,
|
|
"loss": 0.4759,
|
|
"num_input_tokens_seen": 10246912,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 0.20901350745790923,
|
|
"grad_norm": 42.66804885864258,
|
|
"learning_rate": 1.9286168946664033e-06,
|
|
"loss": 0.5368,
|
|
"num_input_tokens_seen": 10262464,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"epoch": 0.2093335893988861,
|
|
"grad_norm": 60.002376556396484,
|
|
"learning_rate": 1.9282017267599352e-06,
|
|
"loss": 0.6679,
|
|
"num_input_tokens_seen": 10278016,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 0.209653671339863,
|
|
"grad_norm": 42.901100158691406,
|
|
"learning_rate": 1.9277853999827125e-06,
|
|
"loss": 0.5054,
|
|
"num_input_tokens_seen": 10293824,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"epoch": 0.2099737532808399,
|
|
"grad_norm": 44.74653244018555,
|
|
"learning_rate": 1.9273679148545244e-06,
|
|
"loss": 0.5116,
|
|
"num_input_tokens_seen": 10309568,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 0.21029383522181677,
|
|
"grad_norm": 33.75946044921875,
|
|
"learning_rate": 1.9269492718966062e-06,
|
|
"loss": 0.4229,
|
|
"num_input_tokens_seen": 10325696,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"epoch": 0.21061391716279368,
|
|
"grad_norm": 30.77555274963379,
|
|
"learning_rate": 1.9265294716316384e-06,
|
|
"loss": 0.5261,
|
|
"num_input_tokens_seen": 10342016,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 0.21093399910377056,
|
|
"grad_norm": 29.430330276489258,
|
|
"learning_rate": 1.926108514583747e-06,
|
|
"loss": 0.4688,
|
|
"num_input_tokens_seen": 10357632,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"epoch": 0.21125408104474747,
|
|
"grad_norm": 50.258575439453125,
|
|
"learning_rate": 1.925686401278501e-06,
|
|
"loss": 0.4801,
|
|
"num_input_tokens_seen": 10373056,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.21157416298572435,
|
|
"grad_norm": 61.20192337036133,
|
|
"learning_rate": 1.9252631322429143e-06,
|
|
"loss": 0.6373,
|
|
"num_input_tokens_seen": 10389248,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"epoch": 0.21189424492670123,
|
|
"grad_norm": 23.071653366088867,
|
|
"learning_rate": 1.9248387080054435e-06,
|
|
"loss": 0.439,
|
|
"num_input_tokens_seen": 10404864,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 0.21221432686767813,
|
|
"grad_norm": 18.74202537536621,
|
|
"learning_rate": 1.9244131290959864e-06,
|
|
"loss": 0.4878,
|
|
"num_input_tokens_seen": 10420416,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"epoch": 0.212534408808655,
|
|
"grad_norm": 33.07780075073242,
|
|
"learning_rate": 1.9239863960458845e-06,
|
|
"loss": 0.4244,
|
|
"num_input_tokens_seen": 10435456,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 0.21285449074963192,
|
|
"grad_norm": 31.487497329711914,
|
|
"learning_rate": 1.923558509387918e-06,
|
|
"loss": 0.4881,
|
|
"num_input_tokens_seen": 10451584,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"epoch": 0.2131745726906088,
|
|
"grad_norm": 37.91923904418945,
|
|
"learning_rate": 1.9231294696563086e-06,
|
|
"loss": 0.3745,
|
|
"num_input_tokens_seen": 10467584,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 0.21349465463158568,
|
|
"grad_norm": 34.82919692993164,
|
|
"learning_rate": 1.922699277386718e-06,
|
|
"loss": 0.4146,
|
|
"num_input_tokens_seen": 10483264,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"epoch": 0.21381473657256259,
|
|
"grad_norm": 34.857810974121094,
|
|
"learning_rate": 1.9222679331162454e-06,
|
|
"loss": 0.5865,
|
|
"num_input_tokens_seen": 10498560,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 0.21413481851353947,
|
|
"grad_norm": 37.536800384521484,
|
|
"learning_rate": 1.92183543738343e-06,
|
|
"loss": 0.4515,
|
|
"num_input_tokens_seen": 10514176,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"epoch": 0.21445490045451635,
|
|
"grad_norm": 25.721649169921875,
|
|
"learning_rate": 1.9214017907282475e-06,
|
|
"loss": 0.4363,
|
|
"num_input_tokens_seen": 10529792,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 0.21477498239549325,
|
|
"grad_norm": 37.20597457885742,
|
|
"learning_rate": 1.9209669936921105e-06,
|
|
"loss": 0.4809,
|
|
"num_input_tokens_seen": 10545856,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"epoch": 0.21509506433647013,
|
|
"grad_norm": 43.397335052490234,
|
|
"learning_rate": 1.920531046817869e-06,
|
|
"loss": 0.4092,
|
|
"num_input_tokens_seen": 10562368,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 0.21541514627744704,
|
|
"grad_norm": 44.76917266845703,
|
|
"learning_rate": 1.9200939506498067e-06,
|
|
"loss": 0.6238,
|
|
"num_input_tokens_seen": 10577280,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"epoch": 0.21573522821842392,
|
|
"grad_norm": 30.43206214904785,
|
|
"learning_rate": 1.9196557057336446e-06,
|
|
"loss": 0.5817,
|
|
"num_input_tokens_seen": 10592384,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 0.2160553101594008,
|
|
"grad_norm": 24.50318145751953,
|
|
"learning_rate": 1.9192163126165354e-06,
|
|
"loss": 0.4498,
|
|
"num_input_tokens_seen": 10608704,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"epoch": 0.2163753921003777,
|
|
"grad_norm": 43.877662658691406,
|
|
"learning_rate": 1.9187757718470673e-06,
|
|
"loss": 0.3997,
|
|
"num_input_tokens_seen": 10625280,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 0.21669547404135459,
|
|
"grad_norm": 22.60622215270996,
|
|
"learning_rate": 1.9183340839752606e-06,
|
|
"loss": 0.5339,
|
|
"num_input_tokens_seen": 10641152,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"epoch": 0.21701555598233147,
|
|
"grad_norm": 28.090923309326172,
|
|
"learning_rate": 1.9178912495525672e-06,
|
|
"loss": 0.4193,
|
|
"num_input_tokens_seen": 10657472,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 0.21733563792330837,
|
|
"grad_norm": 24.062137603759766,
|
|
"learning_rate": 1.917447269131872e-06,
|
|
"loss": 0.5054,
|
|
"num_input_tokens_seen": 10673600,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"epoch": 0.21765571986428525,
|
|
"grad_norm": 35.8740119934082,
|
|
"learning_rate": 1.917002143267489e-06,
|
|
"loss": 0.5693,
|
|
"num_input_tokens_seen": 10689344,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.21797580180526216,
|
|
"grad_norm": 29.342618942260742,
|
|
"learning_rate": 1.9165558725151633e-06,
|
|
"loss": 0.4478,
|
|
"num_input_tokens_seen": 10704384,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"epoch": 0.21829588374623904,
|
|
"grad_norm": 56.710784912109375,
|
|
"learning_rate": 1.9161084574320692e-06,
|
|
"loss": 0.5002,
|
|
"num_input_tokens_seen": 10720512,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 0.21861596568721592,
|
|
"grad_norm": 31.751296997070312,
|
|
"learning_rate": 1.91565989857681e-06,
|
|
"loss": 0.4727,
|
|
"num_input_tokens_seen": 10735744,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"epoch": 0.21893604762819283,
|
|
"grad_norm": 31.050350189208984,
|
|
"learning_rate": 1.9152101965094162e-06,
|
|
"loss": 0.4573,
|
|
"num_input_tokens_seen": 10750848,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 0.2192561295691697,
|
|
"grad_norm": 42.99034881591797,
|
|
"learning_rate": 1.9147593517913464e-06,
|
|
"loss": 0.4878,
|
|
"num_input_tokens_seen": 10765632,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"epoch": 0.21957621151014659,
|
|
"grad_norm": 17.069164276123047,
|
|
"learning_rate": 1.914307364985485e-06,
|
|
"loss": 0.3856,
|
|
"num_input_tokens_seen": 10780928,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 0.2198962934511235,
|
|
"grad_norm": 24.95672607421875,
|
|
"learning_rate": 1.913854236656144e-06,
|
|
"loss": 0.4217,
|
|
"num_input_tokens_seen": 10796864,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"epoch": 0.22021637539210037,
|
|
"grad_norm": 39.41409683227539,
|
|
"learning_rate": 1.9133999673690584e-06,
|
|
"loss": 0.4653,
|
|
"num_input_tokens_seen": 10812672,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 0.22053645733307728,
|
|
"grad_norm": 44.56681442260742,
|
|
"learning_rate": 1.9129445576913886e-06,
|
|
"loss": 0.4709,
|
|
"num_input_tokens_seen": 10828544,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"epoch": 0.22085653927405416,
|
|
"grad_norm": 23.38069725036621,
|
|
"learning_rate": 1.91248800819172e-06,
|
|
"loss": 0.5335,
|
|
"num_input_tokens_seen": 10844288,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 0.22117662121503104,
|
|
"grad_norm": 48.04047775268555,
|
|
"learning_rate": 1.912030319440059e-06,
|
|
"loss": 0.5192,
|
|
"num_input_tokens_seen": 10860160,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"epoch": 0.22149670315600795,
|
|
"grad_norm": 36.49208068847656,
|
|
"learning_rate": 1.9115714920078354e-06,
|
|
"loss": 0.6043,
|
|
"num_input_tokens_seen": 10875968,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 0.22181678509698483,
|
|
"grad_norm": 25.53341293334961,
|
|
"learning_rate": 1.9111115264679017e-06,
|
|
"loss": 0.3252,
|
|
"num_input_tokens_seen": 10892096,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"epoch": 0.2221368670379617,
|
|
"grad_norm": 45.4945068359375,
|
|
"learning_rate": 1.910650423394529e-06,
|
|
"loss": 0.4378,
|
|
"num_input_tokens_seen": 10908544,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 0.2224569489789386,
|
|
"grad_norm": 45.49387741088867,
|
|
"learning_rate": 1.910188183363411e-06,
|
|
"loss": 0.4817,
|
|
"num_input_tokens_seen": 10924544,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"epoch": 0.2227770309199155,
|
|
"grad_norm": 50.44002151489258,
|
|
"learning_rate": 1.909724806951659e-06,
|
|
"loss": 0.4441,
|
|
"num_input_tokens_seen": 10941888,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 0.2230971128608924,
|
|
"grad_norm": 50.978633880615234,
|
|
"learning_rate": 1.909260294737804e-06,
|
|
"loss": 0.4669,
|
|
"num_input_tokens_seen": 10958592,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"epoch": 0.22341719480186928,
|
|
"grad_norm": 80.07136535644531,
|
|
"learning_rate": 1.9087946473017953e-06,
|
|
"loss": 0.555,
|
|
"num_input_tokens_seen": 10974208,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 0.22373727674284616,
|
|
"grad_norm": 33.776737213134766,
|
|
"learning_rate": 1.9083278652249992e-06,
|
|
"loss": 0.4304,
|
|
"num_input_tokens_seen": 10988928,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"epoch": 0.22405735868382307,
|
|
"grad_norm": 35.86427307128906,
|
|
"learning_rate": 1.9078599490901983e-06,
|
|
"loss": 0.425,
|
|
"num_input_tokens_seen": 11005952,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.22437744062479995,
|
|
"grad_norm": 51.98170852661133,
|
|
"learning_rate": 1.9073908994815914e-06,
|
|
"loss": 0.3971,
|
|
"num_input_tokens_seen": 11020608,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"epoch": 0.22469752256577685,
|
|
"grad_norm": 46.34355926513672,
|
|
"learning_rate": 1.9069207169847928e-06,
|
|
"loss": 0.4862,
|
|
"num_input_tokens_seen": 11036736,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 0.22501760450675373,
|
|
"grad_norm": 33.5971794128418,
|
|
"learning_rate": 1.9064494021868302e-06,
|
|
"loss": 0.3584,
|
|
"num_input_tokens_seen": 11052480,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"epoch": 0.2253376864477306,
|
|
"grad_norm": 39.64836120605469,
|
|
"learning_rate": 1.9059769556761464e-06,
|
|
"loss": 0.48,
|
|
"num_input_tokens_seen": 11068416,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 0.22565776838870752,
|
|
"grad_norm": 31.865467071533203,
|
|
"learning_rate": 1.9055033780425962e-06,
|
|
"loss": 0.4454,
|
|
"num_input_tokens_seen": 11086400,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"epoch": 0.2259778503296844,
|
|
"grad_norm": 88.44284057617188,
|
|
"learning_rate": 1.9050286698774464e-06,
|
|
"loss": 0.562,
|
|
"num_input_tokens_seen": 11102848,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 0.22629793227066128,
|
|
"grad_norm": 41.320526123046875,
|
|
"learning_rate": 1.904552831773376e-06,
|
|
"loss": 0.5359,
|
|
"num_input_tokens_seen": 11118080,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"epoch": 0.22661801421163819,
|
|
"grad_norm": 24.0659236907959,
|
|
"learning_rate": 1.9040758643244748e-06,
|
|
"loss": 0.4967,
|
|
"num_input_tokens_seen": 11133120,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 0.22693809615261507,
|
|
"grad_norm": 31.473848342895508,
|
|
"learning_rate": 1.903597768126242e-06,
|
|
"loss": 0.4694,
|
|
"num_input_tokens_seen": 11150144,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"epoch": 0.22725817809359197,
|
|
"grad_norm": 58.51475143432617,
|
|
"learning_rate": 1.9031185437755862e-06,
|
|
"loss": 0.4787,
|
|
"num_input_tokens_seen": 11165760,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 0.22757826003456885,
|
|
"grad_norm": 52.226993560791016,
|
|
"learning_rate": 1.9026381918708246e-06,
|
|
"loss": 0.4582,
|
|
"num_input_tokens_seen": 11180096,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"epoch": 0.22789834197554573,
|
|
"grad_norm": 19.623682022094727,
|
|
"learning_rate": 1.9021567130116822e-06,
|
|
"loss": 0.3618,
|
|
"num_input_tokens_seen": 11195584,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 0.22821842391652264,
|
|
"grad_norm": 59.500858306884766,
|
|
"learning_rate": 1.9016741077992916e-06,
|
|
"loss": 0.3909,
|
|
"num_input_tokens_seen": 11210944,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"epoch": 0.22853850585749952,
|
|
"grad_norm": 27.949474334716797,
|
|
"learning_rate": 1.90119037683619e-06,
|
|
"loss": 0.4052,
|
|
"num_input_tokens_seen": 11227392,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 0.2288585877984764,
|
|
"grad_norm": 26.94727325439453,
|
|
"learning_rate": 1.9007055207263223e-06,
|
|
"loss": 0.6492,
|
|
"num_input_tokens_seen": 11244416,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"epoch": 0.2291786697394533,
|
|
"grad_norm": 28.7558536529541,
|
|
"learning_rate": 1.900219540075036e-06,
|
|
"loss": 0.3588,
|
|
"num_input_tokens_seen": 11260672,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 0.22949875168043019,
|
|
"grad_norm": 55.45866775512695,
|
|
"learning_rate": 1.8997324354890845e-06,
|
|
"loss": 0.4749,
|
|
"num_input_tokens_seen": 11277504,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"epoch": 0.2298188336214071,
|
|
"grad_norm": 110.20696258544922,
|
|
"learning_rate": 1.8992442075766233e-06,
|
|
"loss": 0.539,
|
|
"num_input_tokens_seen": 11293184,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 0.23013891556238397,
|
|
"grad_norm": 29.66388702392578,
|
|
"learning_rate": 1.8987548569472105e-06,
|
|
"loss": 0.3191,
|
|
"num_input_tokens_seen": 11308480,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"epoch": 0.23045899750336085,
|
|
"grad_norm": 31.010486602783203,
|
|
"learning_rate": 1.8982643842118064e-06,
|
|
"loss": 0.396,
|
|
"num_input_tokens_seen": 11323840,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.23077907944433776,
|
|
"grad_norm": 63.96700668334961,
|
|
"learning_rate": 1.8977727899827716e-06,
|
|
"loss": 0.5821,
|
|
"num_input_tokens_seen": 11339456,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"epoch": 0.23109916138531464,
|
|
"grad_norm": 50.296600341796875,
|
|
"learning_rate": 1.8972800748738678e-06,
|
|
"loss": 0.6554,
|
|
"num_input_tokens_seen": 11354880,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 0.23141924332629152,
|
|
"grad_norm": 27.36386489868164,
|
|
"learning_rate": 1.896786239500255e-06,
|
|
"loss": 0.5226,
|
|
"num_input_tokens_seen": 11369984,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"epoch": 0.23173932526726843,
|
|
"grad_norm": 51.205718994140625,
|
|
"learning_rate": 1.8962912844784928e-06,
|
|
"loss": 0.429,
|
|
"num_input_tokens_seen": 11384640,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 0.2320594072082453,
|
|
"grad_norm": 53.744346618652344,
|
|
"learning_rate": 1.8957952104265384e-06,
|
|
"loss": 0.4945,
|
|
"num_input_tokens_seen": 11401152,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"epoch": 0.2323794891492222,
|
|
"grad_norm": 32.322486877441406,
|
|
"learning_rate": 1.8952980179637458e-06,
|
|
"loss": 0.4535,
|
|
"num_input_tokens_seen": 11416896,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 0.2326995710901991,
|
|
"grad_norm": 34.96129608154297,
|
|
"learning_rate": 1.8947997077108662e-06,
|
|
"loss": 0.4899,
|
|
"num_input_tokens_seen": 11432832,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"epoch": 0.23301965303117597,
|
|
"grad_norm": 30.439565658569336,
|
|
"learning_rate": 1.894300280290045e-06,
|
|
"loss": 0.4807,
|
|
"num_input_tokens_seen": 11448320,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 0.23333973497215288,
|
|
"grad_norm": 23.5026912689209,
|
|
"learning_rate": 1.8937997363248237e-06,
|
|
"loss": 0.5674,
|
|
"num_input_tokens_seen": 11463488,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"epoch": 0.23365981691312976,
|
|
"grad_norm": 20.100936889648438,
|
|
"learning_rate": 1.8932980764401373e-06,
|
|
"loss": 0.4527,
|
|
"num_input_tokens_seen": 11478592,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 0.23397989885410664,
|
|
"grad_norm": 24.669857025146484,
|
|
"learning_rate": 1.8927953012623141e-06,
|
|
"loss": 0.3564,
|
|
"num_input_tokens_seen": 11494720,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"epoch": 0.23429998079508355,
|
|
"grad_norm": 56.09657287597656,
|
|
"learning_rate": 1.8922914114190744e-06,
|
|
"loss": 0.4846,
|
|
"num_input_tokens_seen": 11511232,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 0.23462006273606043,
|
|
"grad_norm": 31.37401008605957,
|
|
"learning_rate": 1.8917864075395312e-06,
|
|
"loss": 0.5093,
|
|
"num_input_tokens_seen": 11527040,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"epoch": 0.23494014467703733,
|
|
"grad_norm": 18.777942657470703,
|
|
"learning_rate": 1.8912802902541873e-06,
|
|
"loss": 0.4461,
|
|
"num_input_tokens_seen": 11542528,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 0.2352602266180142,
|
|
"grad_norm": 37.04750442504883,
|
|
"learning_rate": 1.8907730601949362e-06,
|
|
"loss": 0.4974,
|
|
"num_input_tokens_seen": 11557696,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"epoch": 0.2355803085589911,
|
|
"grad_norm": 50.14651870727539,
|
|
"learning_rate": 1.8902647179950608e-06,
|
|
"loss": 0.4648,
|
|
"num_input_tokens_seen": 11574848,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 0.235900390499968,
|
|
"grad_norm": 52.763484954833984,
|
|
"learning_rate": 1.889755264289232e-06,
|
|
"loss": 0.5108,
|
|
"num_input_tokens_seen": 11589696,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"epoch": 0.23622047244094488,
|
|
"grad_norm": 20.895673751831055,
|
|
"learning_rate": 1.8892446997135087e-06,
|
|
"loss": 0.384,
|
|
"num_input_tokens_seen": 11606848,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 0.23654055438192176,
|
|
"grad_norm": 31.011825561523438,
|
|
"learning_rate": 1.888733024905337e-06,
|
|
"loss": 0.6707,
|
|
"num_input_tokens_seen": 11623744,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"epoch": 0.23686063632289867,
|
|
"grad_norm": 34.26097106933594,
|
|
"learning_rate": 1.888220240503549e-06,
|
|
"loss": 0.4755,
|
|
"num_input_tokens_seen": 11640256,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.23718071826387555,
|
|
"grad_norm": 32.54058837890625,
|
|
"learning_rate": 1.8877063471483618e-06,
|
|
"loss": 0.412,
|
|
"num_input_tokens_seen": 11655744,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"epoch": 0.23750080020485245,
|
|
"grad_norm": 17.025754928588867,
|
|
"learning_rate": 1.8871913454813772e-06,
|
|
"loss": 0.2935,
|
|
"num_input_tokens_seen": 11671104,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 0.23782088214582933,
|
|
"grad_norm": 29.473085403442383,
|
|
"learning_rate": 1.886675236145581e-06,
|
|
"loss": 0.3898,
|
|
"num_input_tokens_seen": 11686848,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"epoch": 0.2381409640868062,
|
|
"grad_norm": 28.30191421508789,
|
|
"learning_rate": 1.8861580197853422e-06,
|
|
"loss": 0.5018,
|
|
"num_input_tokens_seen": 11701952,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 0.23846104602778312,
|
|
"grad_norm": 41.48347473144531,
|
|
"learning_rate": 1.8856396970464105e-06,
|
|
"loss": 0.4647,
|
|
"num_input_tokens_seen": 11718592,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"epoch": 0.23878112796876,
|
|
"grad_norm": 40.44169235229492,
|
|
"learning_rate": 1.8851202685759189e-06,
|
|
"loss": 0.5143,
|
|
"num_input_tokens_seen": 11734208,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 0.2391012099097369,
|
|
"grad_norm": 11.559971809387207,
|
|
"learning_rate": 1.8845997350223792e-06,
|
|
"loss": 0.407,
|
|
"num_input_tokens_seen": 11748992,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"epoch": 0.23942129185071379,
|
|
"grad_norm": 28.135868072509766,
|
|
"learning_rate": 1.8840780970356842e-06,
|
|
"loss": 0.4217,
|
|
"num_input_tokens_seen": 11764608,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 0.23974137379169067,
|
|
"grad_norm": 29.070838928222656,
|
|
"learning_rate": 1.8835553552671048e-06,
|
|
"loss": 0.4078,
|
|
"num_input_tokens_seen": 11780800,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"epoch": 0.24006145573266757,
|
|
"grad_norm": 30.527294158935547,
|
|
"learning_rate": 1.8830315103692902e-06,
|
|
"loss": 0.4593,
|
|
"num_input_tokens_seen": 11795776,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 0.24038153767364445,
|
|
"grad_norm": 34.47731399536133,
|
|
"learning_rate": 1.8825065629962669e-06,
|
|
"loss": 0.5071,
|
|
"num_input_tokens_seen": 11811776,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"epoch": 0.24070161961462133,
|
|
"grad_norm": 32.23590087890625,
|
|
"learning_rate": 1.881980513803438e-06,
|
|
"loss": 0.4852,
|
|
"num_input_tokens_seen": 11828224,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 0.24102170155559824,
|
|
"grad_norm": 48.78215026855469,
|
|
"learning_rate": 1.881453363447582e-06,
|
|
"loss": 0.5035,
|
|
"num_input_tokens_seen": 11843904,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"epoch": 0.24134178349657512,
|
|
"grad_norm": 57.377567291259766,
|
|
"learning_rate": 1.880925112586852e-06,
|
|
"loss": 0.5574,
|
|
"num_input_tokens_seen": 11859392,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 0.24166186543755203,
|
|
"grad_norm": 48.24585723876953,
|
|
"learning_rate": 1.8803957618807762e-06,
|
|
"loss": 0.4427,
|
|
"num_input_tokens_seen": 11875968,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"epoch": 0.2419819473785289,
|
|
"grad_norm": 72.58015441894531,
|
|
"learning_rate": 1.8798653119902548e-06,
|
|
"loss": 0.4404,
|
|
"num_input_tokens_seen": 11891584,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 0.24230202931950579,
|
|
"grad_norm": 26.939559936523438,
|
|
"learning_rate": 1.8793337635775603e-06,
|
|
"loss": 0.5029,
|
|
"num_input_tokens_seen": 11906944,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"epoch": 0.2426221112604827,
|
|
"grad_norm": 44.384925842285156,
|
|
"learning_rate": 1.8788011173063376e-06,
|
|
"loss": 0.4729,
|
|
"num_input_tokens_seen": 11922368,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 0.24294219320145957,
|
|
"grad_norm": 45.79201126098633,
|
|
"learning_rate": 1.8782673738416018e-06,
|
|
"loss": 0.5181,
|
|
"num_input_tokens_seen": 11938432,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"epoch": 0.24326227514243645,
|
|
"grad_norm": 43.953582763671875,
|
|
"learning_rate": 1.877732533849737e-06,
|
|
"loss": 0.5078,
|
|
"num_input_tokens_seen": 11956608,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.24358235708341336,
|
|
"grad_norm": 25.617721557617188,
|
|
"learning_rate": 1.8771965979984988e-06,
|
|
"loss": 0.4394,
|
|
"num_input_tokens_seen": 11972480,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"epoch": 0.24390243902439024,
|
|
"grad_norm": 21.165599822998047,
|
|
"learning_rate": 1.8766595669570084e-06,
|
|
"loss": 0.3889,
|
|
"num_input_tokens_seen": 11987072,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 0.24422252096536715,
|
|
"grad_norm": 32.8095703125,
|
|
"learning_rate": 1.8761214413957553e-06,
|
|
"loss": 0.4361,
|
|
"num_input_tokens_seen": 12002112,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"epoch": 0.24454260290634403,
|
|
"grad_norm": 23.940019607543945,
|
|
"learning_rate": 1.8755822219865963e-06,
|
|
"loss": 0.3493,
|
|
"num_input_tokens_seen": 12016960,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 0.2448626848473209,
|
|
"grad_norm": 68.5343246459961,
|
|
"learning_rate": 1.875041909402752e-06,
|
|
"loss": 0.4331,
|
|
"num_input_tokens_seen": 12032576,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"epoch": 0.2451827667882978,
|
|
"grad_norm": 25.4498233795166,
|
|
"learning_rate": 1.8745005043188102e-06,
|
|
"loss": 0.3638,
|
|
"num_input_tokens_seen": 12048768,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 0.2455028487292747,
|
|
"grad_norm": 37.17061233520508,
|
|
"learning_rate": 1.8739580074107208e-06,
|
|
"loss": 0.395,
|
|
"num_input_tokens_seen": 12065088,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"epoch": 0.24582293067025157,
|
|
"grad_norm": 38.826255798339844,
|
|
"learning_rate": 1.873414419355798e-06,
|
|
"loss": 0.6844,
|
|
"num_input_tokens_seen": 12080704,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 0.24614301261122848,
|
|
"grad_norm": 40.032527923583984,
|
|
"learning_rate": 1.872869740832717e-06,
|
|
"loss": 0.4292,
|
|
"num_input_tokens_seen": 12096704,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"epoch": 0.24646309455220536,
|
|
"grad_norm": 36.49966049194336,
|
|
"learning_rate": 1.8723239725215165e-06,
|
|
"loss": 0.6103,
|
|
"num_input_tokens_seen": 12111488,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 0.24678317649318227,
|
|
"grad_norm": 22.378215789794922,
|
|
"learning_rate": 1.871777115103594e-06,
|
|
"loss": 0.4206,
|
|
"num_input_tokens_seen": 12128192,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"epoch": 0.24710325843415915,
|
|
"grad_norm": 21.57525634765625,
|
|
"learning_rate": 1.8712291692617074e-06,
|
|
"loss": 0.4786,
|
|
"num_input_tokens_seen": 12143808,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 0.24742334037513602,
|
|
"grad_norm": 32.303707122802734,
|
|
"learning_rate": 1.8706801356799735e-06,
|
|
"loss": 0.4804,
|
|
"num_input_tokens_seen": 12159232,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"epoch": 0.24774342231611293,
|
|
"grad_norm": 26.57257843017578,
|
|
"learning_rate": 1.8701300150438674e-06,
|
|
"loss": 0.4465,
|
|
"num_input_tokens_seen": 12175360,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 0.2480635042570898,
|
|
"grad_norm": 17.5268611907959,
|
|
"learning_rate": 1.869578808040221e-06,
|
|
"loss": 0.4191,
|
|
"num_input_tokens_seen": 12190272,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"epoch": 0.2483835861980667,
|
|
"grad_norm": 48.708431243896484,
|
|
"learning_rate": 1.869026515357223e-06,
|
|
"loss": 0.5149,
|
|
"num_input_tokens_seen": 12208448,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 0.2487036681390436,
|
|
"grad_norm": 38.29990005493164,
|
|
"learning_rate": 1.8684731376844169e-06,
|
|
"loss": 0.6372,
|
|
"num_input_tokens_seen": 12225984,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"epoch": 0.24902375008002048,
|
|
"grad_norm": 33.091251373291016,
|
|
"learning_rate": 1.8679186757127014e-06,
|
|
"loss": 0.4965,
|
|
"num_input_tokens_seen": 12241408,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 0.24934383202099739,
|
|
"grad_norm": 30.313892364501953,
|
|
"learning_rate": 1.8673631301343288e-06,
|
|
"loss": 0.4381,
|
|
"num_input_tokens_seen": 12256064,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"epoch": 0.24966391396197427,
|
|
"grad_norm": 26.932268142700195,
|
|
"learning_rate": 1.8668065016429044e-06,
|
|
"loss": 0.4388,
|
|
"num_input_tokens_seen": 12272832,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.24998399590295114,
|
|
"grad_norm": 22.444902420043945,
|
|
"learning_rate": 1.866248790933385e-06,
|
|
"loss": 0.5257,
|
|
"num_input_tokens_seen": 12289024,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"epoch": 0.25030407784392805,
|
|
"grad_norm": 27.67203140258789,
|
|
"learning_rate": 1.8656899987020795e-06,
|
|
"loss": 0.4226,
|
|
"num_input_tokens_seen": 12304064,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.25030407784392805,
|
|
"eval_loss": 0.4644124507904053,
|
|
"eval_runtime": 49.2047,
|
|
"eval_samples_per_second": 282.209,
|
|
"eval_steps_per_second": 35.281,
|
|
"num_input_tokens_seen": 12304064,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 0.25062415978490493,
|
|
"grad_norm": 31.06105613708496,
|
|
"learning_rate": 1.865130125646646e-06,
|
|
"loss": 0.4605,
|
|
"num_input_tokens_seen": 12320256,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"epoch": 0.2509442417258818,
|
|
"grad_norm": 21.309823989868164,
|
|
"learning_rate": 1.8645691724660933e-06,
|
|
"loss": 0.4394,
|
|
"num_input_tokens_seen": 12335360,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 0.2512643236668587,
|
|
"grad_norm": 24.060503005981445,
|
|
"learning_rate": 1.8640071398607774e-06,
|
|
"loss": 0.4616,
|
|
"num_input_tokens_seen": 12351488,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"epoch": 0.2515844056078356,
|
|
"grad_norm": 58.631771087646484,
|
|
"learning_rate": 1.8634440285324024e-06,
|
|
"loss": 0.6203,
|
|
"num_input_tokens_seen": 12365952,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 0.2519044875488125,
|
|
"grad_norm": 54.601966857910156,
|
|
"learning_rate": 1.8628798391840205e-06,
|
|
"loss": 0.469,
|
|
"num_input_tokens_seen": 12381376,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"epoch": 0.2522245694897894,
|
|
"grad_norm": 57.81584548950195,
|
|
"learning_rate": 1.8623145725200277e-06,
|
|
"loss": 0.4588,
|
|
"num_input_tokens_seen": 12396160,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 0.25254465143076626,
|
|
"grad_norm": 27.153488159179688,
|
|
"learning_rate": 1.8617482292461664e-06,
|
|
"loss": 0.4468,
|
|
"num_input_tokens_seen": 12410944,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"epoch": 0.25286473337174314,
|
|
"grad_norm": 25.399364471435547,
|
|
"learning_rate": 1.861180810069523e-06,
|
|
"loss": 0.4172,
|
|
"num_input_tokens_seen": 12426304,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 0.2531848153127201,
|
|
"grad_norm": 41.58170700073242,
|
|
"learning_rate": 1.8606123156985268e-06,
|
|
"loss": 0.4599,
|
|
"num_input_tokens_seen": 12442432,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"epoch": 0.25350489725369696,
|
|
"grad_norm": 19.8244686126709,
|
|
"learning_rate": 1.8600427468429496e-06,
|
|
"loss": 0.4617,
|
|
"num_input_tokens_seen": 12458368,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 0.25382497919467384,
|
|
"grad_norm": 30.747608184814453,
|
|
"learning_rate": 1.8594721042139052e-06,
|
|
"loss": 0.4302,
|
|
"num_input_tokens_seen": 12474368,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"epoch": 0.2541450611356507,
|
|
"grad_norm": 18.357315063476562,
|
|
"learning_rate": 1.858900388523847e-06,
|
|
"loss": 0.4147,
|
|
"num_input_tokens_seen": 12490176,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 0.2544651430766276,
|
|
"grad_norm": 25.5488224029541,
|
|
"learning_rate": 1.8583276004865694e-06,
|
|
"loss": 0.4639,
|
|
"num_input_tokens_seen": 12507840,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"epoch": 0.25478522501760453,
|
|
"grad_norm": 38.78436279296875,
|
|
"learning_rate": 1.8577537408172046e-06,
|
|
"loss": 0.3452,
|
|
"num_input_tokens_seen": 12523520,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 0.2551053069585814,
|
|
"grad_norm": 32.23760986328125,
|
|
"learning_rate": 1.8571788102322234e-06,
|
|
"loss": 0.5365,
|
|
"num_input_tokens_seen": 12540736,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"epoch": 0.2554253888995583,
|
|
"grad_norm": 34.73612976074219,
|
|
"learning_rate": 1.8566028094494332e-06,
|
|
"loss": 0.4704,
|
|
"num_input_tokens_seen": 12556352,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 0.25574547084053517,
|
|
"grad_norm": 21.44598388671875,
|
|
"learning_rate": 1.8560257391879778e-06,
|
|
"loss": 0.3726,
|
|
"num_input_tokens_seen": 12570688,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"epoch": 0.25606555278151205,
|
|
"grad_norm": 16.398038864135742,
|
|
"learning_rate": 1.855447600168336e-06,
|
|
"loss": 0.4038,
|
|
"num_input_tokens_seen": 12585984,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.25638563472248893,
|
|
"grad_norm": 19.45931053161621,
|
|
"learning_rate": 1.8548683931123215e-06,
|
|
"loss": 0.4665,
|
|
"num_input_tokens_seen": 12601216,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"epoch": 0.25670571666346587,
|
|
"grad_norm": 65.39263916015625,
|
|
"learning_rate": 1.8542881187430807e-06,
|
|
"loss": 0.4408,
|
|
"num_input_tokens_seen": 12618624,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 0.25702579860444275,
|
|
"grad_norm": 24.916526794433594,
|
|
"learning_rate": 1.8537067777850935e-06,
|
|
"loss": 0.5792,
|
|
"num_input_tokens_seen": 12635840,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"epoch": 0.2573458805454196,
|
|
"grad_norm": 21.44871711730957,
|
|
"learning_rate": 1.8531243709641704e-06,
|
|
"loss": 0.3554,
|
|
"num_input_tokens_seen": 12651904,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 0.2576659624863965,
|
|
"grad_norm": 37.30930709838867,
|
|
"learning_rate": 1.8525408990074533e-06,
|
|
"loss": 0.4923,
|
|
"num_input_tokens_seen": 12666944,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"epoch": 0.2579860444273734,
|
|
"grad_norm": 14.11586856842041,
|
|
"learning_rate": 1.851956362643414e-06,
|
|
"loss": 0.4155,
|
|
"num_input_tokens_seen": 12682688,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 0.2583061263683503,
|
|
"grad_norm": 43.13747024536133,
|
|
"learning_rate": 1.851370762601853e-06,
|
|
"loss": 0.5472,
|
|
"num_input_tokens_seen": 12698304,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"epoch": 0.2586262083093272,
|
|
"grad_norm": 41.56428527832031,
|
|
"learning_rate": 1.8507840996138983e-06,
|
|
"loss": 0.4995,
|
|
"num_input_tokens_seen": 12712896,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 0.2589462902503041,
|
|
"grad_norm": 61.59485626220703,
|
|
"learning_rate": 1.8501963744120062e-06,
|
|
"loss": 0.39,
|
|
"num_input_tokens_seen": 12727488,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"epoch": 0.25926637219128096,
|
|
"grad_norm": 34.89384078979492,
|
|
"learning_rate": 1.849607587729958e-06,
|
|
"loss": 0.4037,
|
|
"num_input_tokens_seen": 12742720,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 0.25958645413225784,
|
|
"grad_norm": 26.042404174804688,
|
|
"learning_rate": 1.8490177403028615e-06,
|
|
"loss": 0.3918,
|
|
"num_input_tokens_seen": 12757760,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"epoch": 0.2599065360732348,
|
|
"grad_norm": 39.44220733642578,
|
|
"learning_rate": 1.8484268328671475e-06,
|
|
"loss": 0.4879,
|
|
"num_input_tokens_seen": 12773312,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 0.26022661801421165,
|
|
"grad_norm": 41.2028923034668,
|
|
"learning_rate": 1.847834866160571e-06,
|
|
"loss": 0.553,
|
|
"num_input_tokens_seen": 12790336,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"epoch": 0.26054669995518853,
|
|
"grad_norm": 26.452022552490234,
|
|
"learning_rate": 1.847241840922209e-06,
|
|
"loss": 0.4995,
|
|
"num_input_tokens_seen": 12805632,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 0.2608667818961654,
|
|
"grad_norm": 36.87411117553711,
|
|
"learning_rate": 1.8466477578924616e-06,
|
|
"loss": 0.4861,
|
|
"num_input_tokens_seen": 12821184,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"epoch": 0.2611868638371423,
|
|
"grad_norm": 30.8194522857666,
|
|
"learning_rate": 1.8460526178130472e-06,
|
|
"loss": 0.5037,
|
|
"num_input_tokens_seen": 12836544,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 0.26150694577811917,
|
|
"grad_norm": 37.22843551635742,
|
|
"learning_rate": 1.8454564214270056e-06,
|
|
"loss": 0.4307,
|
|
"num_input_tokens_seen": 12852032,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"epoch": 0.2618270277190961,
|
|
"grad_norm": 46.01398468017578,
|
|
"learning_rate": 1.8448591694786955e-06,
|
|
"loss": 0.446,
|
|
"num_input_tokens_seen": 12867456,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 0.262147109660073,
|
|
"grad_norm": 30.995271682739258,
|
|
"learning_rate": 1.8442608627137925e-06,
|
|
"loss": 0.3206,
|
|
"num_input_tokens_seen": 12885184,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"epoch": 0.26246719160104987,
|
|
"grad_norm": 30.171613693237305,
|
|
"learning_rate": 1.8436615018792897e-06,
|
|
"loss": 0.3815,
|
|
"num_input_tokens_seen": 12900416,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.26278727354202674,
|
|
"grad_norm": 38.23905563354492,
|
|
"learning_rate": 1.8430610877234957e-06,
|
|
"loss": 0.5722,
|
|
"num_input_tokens_seen": 12915648,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"epoch": 0.2631073554830036,
|
|
"grad_norm": 15.184795379638672,
|
|
"learning_rate": 1.8424596209960356e-06,
|
|
"loss": 0.4491,
|
|
"num_input_tokens_seen": 12930368,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 0.26342743742398056,
|
|
"grad_norm": 24.648910522460938,
|
|
"learning_rate": 1.8418571024478466e-06,
|
|
"loss": 0.5253,
|
|
"num_input_tokens_seen": 12945472,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"epoch": 0.26374751936495744,
|
|
"grad_norm": 24.325111389160156,
|
|
"learning_rate": 1.8412535328311812e-06,
|
|
"loss": 0.4884,
|
|
"num_input_tokens_seen": 12961472,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 0.2640676013059343,
|
|
"grad_norm": 67.2924575805664,
|
|
"learning_rate": 1.8406489128996023e-06,
|
|
"loss": 0.5935,
|
|
"num_input_tokens_seen": 12975872,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"epoch": 0.2643876832469112,
|
|
"grad_norm": 33.307865142822266,
|
|
"learning_rate": 1.8400432434079853e-06,
|
|
"loss": 0.5286,
|
|
"num_input_tokens_seen": 12992128,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 0.2647077651878881,
|
|
"grad_norm": 17.04827308654785,
|
|
"learning_rate": 1.8394365251125162e-06,
|
|
"loss": 0.4112,
|
|
"num_input_tokens_seen": 13021184,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"epoch": 0.265027847128865,
|
|
"grad_norm": 31.74374771118164,
|
|
"learning_rate": 1.8388287587706888e-06,
|
|
"loss": 0.4385,
|
|
"num_input_tokens_seen": 13037568,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 0.2653479290698419,
|
|
"grad_norm": 35.290184020996094,
|
|
"learning_rate": 1.8382199451413074e-06,
|
|
"loss": 0.4655,
|
|
"num_input_tokens_seen": 13053440,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"epoch": 0.26566801101081877,
|
|
"grad_norm": 35.621437072753906,
|
|
"learning_rate": 1.837610084984483e-06,
|
|
"loss": 0.5121,
|
|
"num_input_tokens_seen": 13069440,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 0.26598809295179565,
|
|
"grad_norm": 59.76009750366211,
|
|
"learning_rate": 1.8369991790616327e-06,
|
|
"loss": 0.5466,
|
|
"num_input_tokens_seen": 13084224,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"epoch": 0.26630817489277253,
|
|
"grad_norm": 38.1486701965332,
|
|
"learning_rate": 1.8363872281354795e-06,
|
|
"loss": 0.6597,
|
|
"num_input_tokens_seen": 13098688,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 0.26662825683374947,
|
|
"grad_norm": 33.94224166870117,
|
|
"learning_rate": 1.835774232970052e-06,
|
|
"loss": 0.4049,
|
|
"num_input_tokens_seen": 13114112,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"epoch": 0.26694833877472635,
|
|
"grad_norm": 29.897977828979492,
|
|
"learning_rate": 1.8351601943306815e-06,
|
|
"loss": 0.4672,
|
|
"num_input_tokens_seen": 13130240,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 0.2672684207157032,
|
|
"grad_norm": 41.0724983215332,
|
|
"learning_rate": 1.8345451129840025e-06,
|
|
"loss": 0.3994,
|
|
"num_input_tokens_seen": 13145536,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"epoch": 0.2675885026566801,
|
|
"grad_norm": 37.96142578125,
|
|
"learning_rate": 1.8339289896979515e-06,
|
|
"loss": 0.552,
|
|
"num_input_tokens_seen": 13160256,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 0.267908584597657,
|
|
"grad_norm": 37.417449951171875,
|
|
"learning_rate": 1.8333118252417651e-06,
|
|
"loss": 0.5336,
|
|
"num_input_tokens_seen": 13177088,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"epoch": 0.26822866653863386,
|
|
"grad_norm": 32.74960708618164,
|
|
"learning_rate": 1.832693620385981e-06,
|
|
"loss": 0.5098,
|
|
"num_input_tokens_seen": 13192768,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 0.2685487484796108,
|
|
"grad_norm": 27.491313934326172,
|
|
"learning_rate": 1.8320743759024352e-06,
|
|
"loss": 0.5183,
|
|
"num_input_tokens_seen": 13208192,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"epoch": 0.2688688304205877,
|
|
"grad_norm": 38.285240173339844,
|
|
"learning_rate": 1.831454092564261e-06,
|
|
"loss": 0.5242,
|
|
"num_input_tokens_seen": 13223872,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.26918891236156456,
|
|
"grad_norm": 20.660884857177734,
|
|
"learning_rate": 1.8308327711458899e-06,
|
|
"loss": 0.4714,
|
|
"num_input_tokens_seen": 13239104,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"epoch": 0.26950899430254144,
|
|
"grad_norm": 36.68329620361328,
|
|
"learning_rate": 1.830210412423049e-06,
|
|
"loss": 0.3844,
|
|
"num_input_tokens_seen": 13254464,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 0.2698290762435183,
|
|
"grad_norm": 22.882728576660156,
|
|
"learning_rate": 1.8295870171727605e-06,
|
|
"loss": 0.3647,
|
|
"num_input_tokens_seen": 13269824,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"epoch": 0.27014915818449525,
|
|
"grad_norm": 20.831666946411133,
|
|
"learning_rate": 1.8289625861733408e-06,
|
|
"loss": 0.4194,
|
|
"num_input_tokens_seen": 13288448,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 0.27046924012547213,
|
|
"grad_norm": 34.60063171386719,
|
|
"learning_rate": 1.8283371202043991e-06,
|
|
"loss": 0.5194,
|
|
"num_input_tokens_seen": 13304320,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"epoch": 0.270789322066449,
|
|
"grad_norm": 39.810787200927734,
|
|
"learning_rate": 1.827710620046837e-06,
|
|
"loss": 0.5503,
|
|
"num_input_tokens_seen": 13321920,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 0.2711094040074259,
|
|
"grad_norm": 52.01685333251953,
|
|
"learning_rate": 1.8270830864828474e-06,
|
|
"loss": 0.4687,
|
|
"num_input_tokens_seen": 13337280,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"epoch": 0.27142948594840277,
|
|
"grad_norm": 15.508134841918945,
|
|
"learning_rate": 1.8264545202959133e-06,
|
|
"loss": 0.4287,
|
|
"num_input_tokens_seen": 13354112,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 0.2717495678893797,
|
|
"grad_norm": 32.78725814819336,
|
|
"learning_rate": 1.8258249222708067e-06,
|
|
"loss": 0.4321,
|
|
"num_input_tokens_seen": 13369600,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"epoch": 0.2720696498303566,
|
|
"grad_norm": 23.458738327026367,
|
|
"learning_rate": 1.8251942931935886e-06,
|
|
"loss": 0.4464,
|
|
"num_input_tokens_seen": 13385536,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 0.27238973177133347,
|
|
"grad_norm": 31.733396530151367,
|
|
"learning_rate": 1.8245626338516069e-06,
|
|
"loss": 0.3788,
|
|
"num_input_tokens_seen": 13400832,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"epoch": 0.27270981371231034,
|
|
"grad_norm": 35.16189956665039,
|
|
"learning_rate": 1.823929945033495e-06,
|
|
"loss": 0.3397,
|
|
"num_input_tokens_seen": 13416000,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 0.2730298956532872,
|
|
"grad_norm": 31.286619186401367,
|
|
"learning_rate": 1.8232962275291728e-06,
|
|
"loss": 0.5015,
|
|
"num_input_tokens_seen": 13431360,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"epoch": 0.2733499775942641,
|
|
"grad_norm": 45.81655502319336,
|
|
"learning_rate": 1.822661482129844e-06,
|
|
"loss": 0.4342,
|
|
"num_input_tokens_seen": 13446976,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 0.27367005953524104,
|
|
"grad_norm": 21.677684783935547,
|
|
"learning_rate": 1.8220257096279956e-06,
|
|
"loss": 0.3796,
|
|
"num_input_tokens_seen": 13463040,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"epoch": 0.2739901414762179,
|
|
"grad_norm": 35.41159439086914,
|
|
"learning_rate": 1.8213889108173972e-06,
|
|
"loss": 0.6798,
|
|
"num_input_tokens_seen": 13478656,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 0.2743102234171948,
|
|
"grad_norm": 20.70133399963379,
|
|
"learning_rate": 1.8207510864930992e-06,
|
|
"loss": 0.4843,
|
|
"num_input_tokens_seen": 13495296,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"epoch": 0.2746303053581717,
|
|
"grad_norm": 18.472976684570312,
|
|
"learning_rate": 1.8201122374514336e-06,
|
|
"loss": 0.5024,
|
|
"num_input_tokens_seen": 13510912,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 0.27495038729914856,
|
|
"grad_norm": 22.679168701171875,
|
|
"learning_rate": 1.8194723644900099e-06,
|
|
"loss": 0.4465,
|
|
"num_input_tokens_seen": 13525952,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"epoch": 0.2752704692401255,
|
|
"grad_norm": 25.11664390563965,
|
|
"learning_rate": 1.8188314684077173e-06,
|
|
"loss": 0.5334,
|
|
"num_input_tokens_seen": 13546752,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.2755905511811024,
|
|
"grad_norm": 37.698638916015625,
|
|
"learning_rate": 1.8181895500047226e-06,
|
|
"loss": 0.5659,
|
|
"num_input_tokens_seen": 13561728,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"epoch": 0.27591063312207925,
|
|
"grad_norm": 21.342445373535156,
|
|
"learning_rate": 1.817546610082468e-06,
|
|
"loss": 0.4559,
|
|
"num_input_tokens_seen": 13577344,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 0.27623071506305613,
|
|
"grad_norm": 25.98567008972168,
|
|
"learning_rate": 1.816902649443672e-06,
|
|
"loss": 0.4806,
|
|
"num_input_tokens_seen": 13592256,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"epoch": 0.276550797004033,
|
|
"grad_norm": 36.9737548828125,
|
|
"learning_rate": 1.8162576688923262e-06,
|
|
"loss": 0.5351,
|
|
"num_input_tokens_seen": 13608832,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 0.27687087894500995,
|
|
"grad_norm": 25.08713150024414,
|
|
"learning_rate": 1.815611669233697e-06,
|
|
"loss": 0.5544,
|
|
"num_input_tokens_seen": 13624128,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"epoch": 0.2771909608859868,
|
|
"grad_norm": 25.511003494262695,
|
|
"learning_rate": 1.8149646512743222e-06,
|
|
"loss": 0.5301,
|
|
"num_input_tokens_seen": 13640576,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 0.2775110428269637,
|
|
"grad_norm": 22.00773048400879,
|
|
"learning_rate": 1.8143166158220118e-06,
|
|
"loss": 0.4513,
|
|
"num_input_tokens_seen": 13655872,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"epoch": 0.2778311247679406,
|
|
"grad_norm": 41.66020584106445,
|
|
"learning_rate": 1.8136675636858454e-06,
|
|
"loss": 0.6679,
|
|
"num_input_tokens_seen": 13672384,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 0.27815120670891746,
|
|
"grad_norm": 20.195674896240234,
|
|
"learning_rate": 1.8130174956761723e-06,
|
|
"loss": 0.3988,
|
|
"num_input_tokens_seen": 13687296,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"epoch": 0.2784712886498944,
|
|
"grad_norm": 25.734270095825195,
|
|
"learning_rate": 1.81236641260461e-06,
|
|
"loss": 0.5363,
|
|
"num_input_tokens_seen": 13702528,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 0.2787913705908713,
|
|
"grad_norm": 67.11882019042969,
|
|
"learning_rate": 1.811714315284043e-06,
|
|
"loss": 0.5002,
|
|
"num_input_tokens_seen": 13717568,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"epoch": 0.27911145253184816,
|
|
"grad_norm": 19.78514862060547,
|
|
"learning_rate": 1.8110612045286229e-06,
|
|
"loss": 0.4016,
|
|
"num_input_tokens_seen": 13733568,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 0.27943153447282504,
|
|
"grad_norm": 20.73729705810547,
|
|
"learning_rate": 1.8104070811537661e-06,
|
|
"loss": 0.3744,
|
|
"num_input_tokens_seen": 13749312,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"epoch": 0.2797516164138019,
|
|
"grad_norm": 16.582807540893555,
|
|
"learning_rate": 1.8097519459761533e-06,
|
|
"loss": 0.4299,
|
|
"num_input_tokens_seen": 13765952,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 0.2800716983547788,
|
|
"grad_norm": 47.0535888671875,
|
|
"learning_rate": 1.8090957998137283e-06,
|
|
"loss": 0.495,
|
|
"num_input_tokens_seen": 13781440,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"epoch": 0.28039178029575573,
|
|
"grad_norm": 53.1851921081543,
|
|
"learning_rate": 1.8084386434856978e-06,
|
|
"loss": 0.4471,
|
|
"num_input_tokens_seen": 13796864,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 0.2807118622367326,
|
|
"grad_norm": 26.30471420288086,
|
|
"learning_rate": 1.8077804778125283e-06,
|
|
"loss": 0.4915,
|
|
"num_input_tokens_seen": 13812736,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"epoch": 0.2810319441777095,
|
|
"grad_norm": 60.074981689453125,
|
|
"learning_rate": 1.807121303615948e-06,
|
|
"loss": 0.4966,
|
|
"num_input_tokens_seen": 13828288,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 0.28135202611868637,
|
|
"grad_norm": 40.989219665527344,
|
|
"learning_rate": 1.8064611217189434e-06,
|
|
"loss": 0.4125,
|
|
"num_input_tokens_seen": 13845568,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"epoch": 0.28167210805966325,
|
|
"grad_norm": 25.27169418334961,
|
|
"learning_rate": 1.8057999329457596e-06,
|
|
"loss": 0.398,
|
|
"num_input_tokens_seen": 13860608,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.2819921900006402,
|
|
"grad_norm": 39.82872772216797,
|
|
"learning_rate": 1.8051377381218984e-06,
|
|
"loss": 0.5663,
|
|
"num_input_tokens_seen": 13876608,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"epoch": 0.28231227194161707,
|
|
"grad_norm": 34.87173080444336,
|
|
"learning_rate": 1.8044745380741177e-06,
|
|
"loss": 0.5656,
|
|
"num_input_tokens_seen": 13893632,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 0.28263235388259395,
|
|
"grad_norm": 49.1501579284668,
|
|
"learning_rate": 1.8038103336304306e-06,
|
|
"loss": 0.3896,
|
|
"num_input_tokens_seen": 13909312,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"epoch": 0.2829524358235708,
|
|
"grad_norm": 27.521867752075195,
|
|
"learning_rate": 1.8031451256201042e-06,
|
|
"loss": 0.5699,
|
|
"num_input_tokens_seen": 13925824,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 0.2832725177645477,
|
|
"grad_norm": 25.578853607177734,
|
|
"learning_rate": 1.8024789148736589e-06,
|
|
"loss": 0.5385,
|
|
"num_input_tokens_seen": 13942336,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"epoch": 0.28359259970552464,
|
|
"grad_norm": 27.650800704956055,
|
|
"learning_rate": 1.8018117022228655e-06,
|
|
"loss": 0.392,
|
|
"num_input_tokens_seen": 13957760,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 0.2839126816465015,
|
|
"grad_norm": 49.428855895996094,
|
|
"learning_rate": 1.8011434885007479e-06,
|
|
"loss": 0.4997,
|
|
"num_input_tokens_seen": 13972992,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"epoch": 0.2842327635874784,
|
|
"grad_norm": 30.81421661376953,
|
|
"learning_rate": 1.8004742745415787e-06,
|
|
"loss": 0.4308,
|
|
"num_input_tokens_seen": 13988736,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 0.2845528455284553,
|
|
"grad_norm": 23.36966323852539,
|
|
"learning_rate": 1.799804061180879e-06,
|
|
"loss": 0.5427,
|
|
"num_input_tokens_seen": 14003520,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"epoch": 0.28487292746943216,
|
|
"grad_norm": 29.571027755737305,
|
|
"learning_rate": 1.799132849255418e-06,
|
|
"loss": 0.518,
|
|
"num_input_tokens_seen": 14020608,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 0.28519300941040904,
|
|
"grad_norm": 34.7742919921875,
|
|
"learning_rate": 1.798460639603212e-06,
|
|
"loss": 0.4011,
|
|
"num_input_tokens_seen": 14035328,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"epoch": 0.285513091351386,
|
|
"grad_norm": 37.04494094848633,
|
|
"learning_rate": 1.7977874330635224e-06,
|
|
"loss": 0.4805,
|
|
"num_input_tokens_seen": 14050816,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 0.28583317329236285,
|
|
"grad_norm": 18.75509262084961,
|
|
"learning_rate": 1.7971132304768555e-06,
|
|
"loss": 0.3289,
|
|
"num_input_tokens_seen": 14066880,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"epoch": 0.28615325523333973,
|
|
"grad_norm": 24.66355323791504,
|
|
"learning_rate": 1.7964380326849612e-06,
|
|
"loss": 0.4937,
|
|
"num_input_tokens_seen": 14081728,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 0.2864733371743166,
|
|
"grad_norm": 18.791399002075195,
|
|
"learning_rate": 1.795761840530832e-06,
|
|
"loss": 0.4941,
|
|
"num_input_tokens_seen": 14097984,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"epoch": 0.2867934191152935,
|
|
"grad_norm": 27.4366455078125,
|
|
"learning_rate": 1.7950846548587015e-06,
|
|
"loss": 0.4208,
|
|
"num_input_tokens_seen": 14115264,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 0.2871135010562704,
|
|
"grad_norm": 17.53047752380371,
|
|
"learning_rate": 1.7944064765140445e-06,
|
|
"loss": 0.2799,
|
|
"num_input_tokens_seen": 14129472,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"epoch": 0.2874335829972473,
|
|
"grad_norm": 34.00762939453125,
|
|
"learning_rate": 1.7937273063435735e-06,
|
|
"loss": 0.55,
|
|
"num_input_tokens_seen": 14144896,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 0.2877536649382242,
|
|
"grad_norm": 27.387237548828125,
|
|
"learning_rate": 1.7930471451952416e-06,
|
|
"loss": 0.3622,
|
|
"num_input_tokens_seen": 14159744,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"epoch": 0.28807374687920106,
|
|
"grad_norm": 39.22768020629883,
|
|
"learning_rate": 1.7923659939182377e-06,
|
|
"loss": 0.4915,
|
|
"num_input_tokens_seen": 14176384,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.28839382882017794,
|
|
"grad_norm": 39.973106384277344,
|
|
"learning_rate": 1.7916838533629866e-06,
|
|
"loss": 0.5376,
|
|
"num_input_tokens_seen": 14192320,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"epoch": 0.2887139107611549,
|
|
"grad_norm": 27.084346771240234,
|
|
"learning_rate": 1.7910007243811493e-06,
|
|
"loss": 0.397,
|
|
"num_input_tokens_seen": 14208192,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 0.28903399270213176,
|
|
"grad_norm": 51.122711181640625,
|
|
"learning_rate": 1.7903166078256202e-06,
|
|
"loss": 0.5486,
|
|
"num_input_tokens_seen": 14223104,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"epoch": 0.28935407464310864,
|
|
"grad_norm": 49.78089141845703,
|
|
"learning_rate": 1.789631504550527e-06,
|
|
"loss": 0.4153,
|
|
"num_input_tokens_seen": 14238464,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 0.2896741565840855,
|
|
"grad_norm": 32.12791061401367,
|
|
"learning_rate": 1.7889454154112288e-06,
|
|
"loss": 0.384,
|
|
"num_input_tokens_seen": 14254656,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"epoch": 0.2899942385250624,
|
|
"grad_norm": 43.227901458740234,
|
|
"learning_rate": 1.7882583412643167e-06,
|
|
"loss": 0.3983,
|
|
"num_input_tokens_seen": 14268928,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 0.29031432046603933,
|
|
"grad_norm": 31.457603454589844,
|
|
"learning_rate": 1.78757028296761e-06,
|
|
"loss": 0.4326,
|
|
"num_input_tokens_seen": 14285952,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"epoch": 0.2906344024070162,
|
|
"grad_norm": 18.678508758544922,
|
|
"learning_rate": 1.7868812413801582e-06,
|
|
"loss": 0.3522,
|
|
"num_input_tokens_seen": 14301760,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 0.2909544843479931,
|
|
"grad_norm": 53.38247299194336,
|
|
"learning_rate": 1.7861912173622372e-06,
|
|
"loss": 0.4976,
|
|
"num_input_tokens_seen": 14318208,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"epoch": 0.29127456628896997,
|
|
"grad_norm": 41.86543655395508,
|
|
"learning_rate": 1.7855002117753504e-06,
|
|
"loss": 0.4597,
|
|
"num_input_tokens_seen": 14334144,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 0.29159464822994685,
|
|
"grad_norm": 49.806610107421875,
|
|
"learning_rate": 1.7848082254822266e-06,
|
|
"loss": 0.5283,
|
|
"num_input_tokens_seen": 14349120,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"epoch": 0.29191473017092373,
|
|
"grad_norm": 56.75021743774414,
|
|
"learning_rate": 1.7841152593468185e-06,
|
|
"loss": 0.4868,
|
|
"num_input_tokens_seen": 14365376,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 0.29223481211190067,
|
|
"grad_norm": 34.16107940673828,
|
|
"learning_rate": 1.7834213142343026e-06,
|
|
"loss": 0.4582,
|
|
"num_input_tokens_seen": 14381568,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"epoch": 0.29255489405287755,
|
|
"grad_norm": 28.742692947387695,
|
|
"learning_rate": 1.7827263910110777e-06,
|
|
"loss": 0.4626,
|
|
"num_input_tokens_seen": 14397312,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 0.2928749759938544,
|
|
"grad_norm": 34.53966522216797,
|
|
"learning_rate": 1.7820304905447632e-06,
|
|
"loss": 0.4372,
|
|
"num_input_tokens_seen": 14412928,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"epoch": 0.2931950579348313,
|
|
"grad_norm": 47.14699935913086,
|
|
"learning_rate": 1.7813336137041991e-06,
|
|
"loss": 0.446,
|
|
"num_input_tokens_seen": 14427968,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 0.2935151398758082,
|
|
"grad_norm": 37.16606140136719,
|
|
"learning_rate": 1.7806357613594447e-06,
|
|
"loss": 0.3693,
|
|
"num_input_tokens_seen": 14442944,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"epoch": 0.2938352218167851,
|
|
"grad_norm": 19.43882179260254,
|
|
"learning_rate": 1.7799369343817764e-06,
|
|
"loss": 0.4481,
|
|
"num_input_tokens_seen": 14458176,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 0.294155303757762,
|
|
"grad_norm": 24.445056915283203,
|
|
"learning_rate": 1.7792371336436883e-06,
|
|
"loss": 0.3566,
|
|
"num_input_tokens_seen": 14473600,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"epoch": 0.2944753856987389,
|
|
"grad_norm": 28.31954574584961,
|
|
"learning_rate": 1.7785363600188892e-06,
|
|
"loss": 0.6518,
|
|
"num_input_tokens_seen": 14488896,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.29479546763971576,
|
|
"grad_norm": 38.648948669433594,
|
|
"learning_rate": 1.7778346143823038e-06,
|
|
"loss": 0.5881,
|
|
"num_input_tokens_seen": 14502784,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"epoch": 0.29511554958069264,
|
|
"grad_norm": 33.51401138305664,
|
|
"learning_rate": 1.7771318976100696e-06,
|
|
"loss": 0.4293,
|
|
"num_input_tokens_seen": 14520000,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 0.2954356315216696,
|
|
"grad_norm": 28.780546188354492,
|
|
"learning_rate": 1.7764282105795364e-06,
|
|
"loss": 0.3401,
|
|
"num_input_tokens_seen": 14536320,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"epoch": 0.29575571346264645,
|
|
"grad_norm": 47.155277252197266,
|
|
"learning_rate": 1.7757235541692663e-06,
|
|
"loss": 0.4524,
|
|
"num_input_tokens_seen": 14551808,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 0.29607579540362333,
|
|
"grad_norm": 19.841266632080078,
|
|
"learning_rate": 1.7750179292590306e-06,
|
|
"loss": 0.3157,
|
|
"num_input_tokens_seen": 14566976,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"epoch": 0.2963958773446002,
|
|
"grad_norm": 26.28995132446289,
|
|
"learning_rate": 1.7743113367298107e-06,
|
|
"loss": 0.3475,
|
|
"num_input_tokens_seen": 14583104,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 0.2967159592855771,
|
|
"grad_norm": 38.58869552612305,
|
|
"learning_rate": 1.7736037774637955e-06,
|
|
"loss": 0.4454,
|
|
"num_input_tokens_seen": 14598336,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"epoch": 0.29703604122655397,
|
|
"grad_norm": 50.025482177734375,
|
|
"learning_rate": 1.772895252344381e-06,
|
|
"loss": 0.5142,
|
|
"num_input_tokens_seen": 14615232,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 0.2973561231675309,
|
|
"grad_norm": 19.640771865844727,
|
|
"learning_rate": 1.7721857622561692e-06,
|
|
"loss": 0.3932,
|
|
"num_input_tokens_seen": 14630848,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"epoch": 0.2976762051085078,
|
|
"grad_norm": 31.551252365112305,
|
|
"learning_rate": 1.7714753080849664e-06,
|
|
"loss": 0.4601,
|
|
"num_input_tokens_seen": 14647040,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 0.29799628704948466,
|
|
"grad_norm": 22.483062744140625,
|
|
"learning_rate": 1.7707638907177837e-06,
|
|
"loss": 0.4116,
|
|
"num_input_tokens_seen": 14661888,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"epoch": 0.29831636899046154,
|
|
"grad_norm": 143.85166931152344,
|
|
"learning_rate": 1.7700515110428336e-06,
|
|
"loss": 0.7093,
|
|
"num_input_tokens_seen": 14677696,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 0.2986364509314384,
|
|
"grad_norm": 26.837242126464844,
|
|
"learning_rate": 1.7693381699495307e-06,
|
|
"loss": 0.4799,
|
|
"num_input_tokens_seen": 14693184,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"epoch": 0.29895653287241536,
|
|
"grad_norm": 30.247093200683594,
|
|
"learning_rate": 1.7686238683284894e-06,
|
|
"loss": 0.3643,
|
|
"num_input_tokens_seen": 14707904,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 0.29927661481339224,
|
|
"grad_norm": 24.62070083618164,
|
|
"learning_rate": 1.7679086070715237e-06,
|
|
"loss": 0.3608,
|
|
"num_input_tokens_seen": 14724096,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"epoch": 0.2995966967543691,
|
|
"grad_norm": 36.82127380371094,
|
|
"learning_rate": 1.7671923870716459e-06,
|
|
"loss": 0.4544,
|
|
"num_input_tokens_seen": 14738752,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 0.299916778695346,
|
|
"grad_norm": 41.65424346923828,
|
|
"learning_rate": 1.7664752092230652e-06,
|
|
"loss": 0.3486,
|
|
"num_input_tokens_seen": 14753664,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"epoch": 0.3002368606363229,
|
|
"grad_norm": 34.0866813659668,
|
|
"learning_rate": 1.7657570744211863e-06,
|
|
"loss": 0.3784,
|
|
"num_input_tokens_seen": 14769152,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 0.30036489341271366,
|
|
"eval_loss": 0.4629112482070923,
|
|
"eval_runtime": 49.1915,
|
|
"eval_samples_per_second": 282.284,
|
|
"eval_steps_per_second": 35.291,
|
|
"num_input_tokens_seen": 14775488,
|
|
"step": 4692
|
|
},
|
|
{
|
|
"epoch": 0.3005569425772998,
|
|
"grad_norm": 48.05270004272461,
|
|
"learning_rate": 1.765037983562609e-06,
|
|
"loss": 0.5028,
|
|
"num_input_tokens_seen": 14784128,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"epoch": 0.3008770245182767,
|
|
"grad_norm": 49.29054641723633,
|
|
"learning_rate": 1.7643179375451264e-06,
|
|
"loss": 0.4459,
|
|
"num_input_tokens_seen": 14799936,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.30119710645925357,
|
|
"grad_norm": 42.15516662597656,
|
|
"learning_rate": 1.7635969372677252e-06,
|
|
"loss": 0.6083,
|
|
"num_input_tokens_seen": 14814208,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"epoch": 0.30151718840023045,
|
|
"grad_norm": 37.26246643066406,
|
|
"learning_rate": 1.7628749836305818e-06,
|
|
"loss": 0.483,
|
|
"num_input_tokens_seen": 14829504,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 0.30183727034120733,
|
|
"grad_norm": 30.036657333374023,
|
|
"learning_rate": 1.7621520775350645e-06,
|
|
"loss": 0.3949,
|
|
"num_input_tokens_seen": 14843968,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"epoch": 0.30215735228218427,
|
|
"grad_norm": 33.79453659057617,
|
|
"learning_rate": 1.7614282198837293e-06,
|
|
"loss": 0.4567,
|
|
"num_input_tokens_seen": 14859840,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 0.30247743422316115,
|
|
"grad_norm": 39.85743713378906,
|
|
"learning_rate": 1.7607034115803219e-06,
|
|
"loss": 0.473,
|
|
"num_input_tokens_seen": 14875648,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"epoch": 0.302797516164138,
|
|
"grad_norm": 27.397972106933594,
|
|
"learning_rate": 1.7599776535297734e-06,
|
|
"loss": 0.4192,
|
|
"num_input_tokens_seen": 14890560,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 0.3031175981051149,
|
|
"grad_norm": 40.91767501831055,
|
|
"learning_rate": 1.7592509466382012e-06,
|
|
"loss": 0.4702,
|
|
"num_input_tokens_seen": 14906688,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"epoch": 0.3034376800460918,
|
|
"grad_norm": 54.96405029296875,
|
|
"learning_rate": 1.7585232918129076e-06,
|
|
"loss": 0.5561,
|
|
"num_input_tokens_seen": 14922496,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 0.30375776198706866,
|
|
"grad_norm": 36.16265869140625,
|
|
"learning_rate": 1.757794689962378e-06,
|
|
"loss": 0.4601,
|
|
"num_input_tokens_seen": 14938880,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"epoch": 0.3040778439280456,
|
|
"grad_norm": 44.08560562133789,
|
|
"learning_rate": 1.7570651419962807e-06,
|
|
"loss": 0.4968,
|
|
"num_input_tokens_seen": 14954112,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 0.3043979258690225,
|
|
"grad_norm": 42.19171142578125,
|
|
"learning_rate": 1.7563346488254647e-06,
|
|
"loss": 0.448,
|
|
"num_input_tokens_seen": 14969536,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"epoch": 0.30471800780999936,
|
|
"grad_norm": 35.03725051879883,
|
|
"learning_rate": 1.755603211361959e-06,
|
|
"loss": 0.3373,
|
|
"num_input_tokens_seen": 14985728,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 0.30503808975097624,
|
|
"grad_norm": 20.99566078186035,
|
|
"learning_rate": 1.7548708305189722e-06,
|
|
"loss": 0.452,
|
|
"num_input_tokens_seen": 15003904,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"epoch": 0.3053581716919531,
|
|
"grad_norm": 59.016563415527344,
|
|
"learning_rate": 1.7541375072108905e-06,
|
|
"loss": 0.5662,
|
|
"num_input_tokens_seen": 15019328,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 0.30567825363293005,
|
|
"grad_norm": 45.97145462036133,
|
|
"learning_rate": 1.7534032423532766e-06,
|
|
"loss": 0.4597,
|
|
"num_input_tokens_seen": 15033856,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"epoch": 0.30599833557390693,
|
|
"grad_norm": 22.04340362548828,
|
|
"learning_rate": 1.7526680368628685e-06,
|
|
"loss": 0.3603,
|
|
"num_input_tokens_seen": 15051200,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 0.3063184175148838,
|
|
"grad_norm": 32.850303649902344,
|
|
"learning_rate": 1.751931891657579e-06,
|
|
"loss": 0.4471,
|
|
"num_input_tokens_seen": 15066368,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"epoch": 0.3066384994558607,
|
|
"grad_norm": 21.559911727905273,
|
|
"learning_rate": 1.7511948076564943e-06,
|
|
"loss": 0.3494,
|
|
"num_input_tokens_seen": 15081600,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 0.30695858139683757,
|
|
"grad_norm": 30.383432388305664,
|
|
"learning_rate": 1.7504567857798722e-06,
|
|
"loss": 0.5308,
|
|
"num_input_tokens_seen": 15097536,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"epoch": 0.3072786633378145,
|
|
"grad_norm": 37.53936767578125,
|
|
"learning_rate": 1.7497178269491417e-06,
|
|
"loss": 0.5013,
|
|
"num_input_tokens_seen": 15113728,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.3075987452787914,
|
|
"grad_norm": 24.428794860839844,
|
|
"learning_rate": 1.7489779320869014e-06,
|
|
"loss": 0.5561,
|
|
"num_input_tokens_seen": 15130048,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"epoch": 0.30791882721976827,
|
|
"grad_norm": 22.411056518554688,
|
|
"learning_rate": 1.7482371021169193e-06,
|
|
"loss": 0.3673,
|
|
"num_input_tokens_seen": 15145600,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 0.30823890916074514,
|
|
"grad_norm": 44.107322692871094,
|
|
"learning_rate": 1.7474953379641297e-06,
|
|
"loss": 0.3935,
|
|
"num_input_tokens_seen": 15162368,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"epoch": 0.308558991101722,
|
|
"grad_norm": 34.96397018432617,
|
|
"learning_rate": 1.746752640554634e-06,
|
|
"loss": 0.4323,
|
|
"num_input_tokens_seen": 15178368,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 0.3088790730426989,
|
|
"grad_norm": 26.387361526489258,
|
|
"learning_rate": 1.7460090108156988e-06,
|
|
"loss": 0.5467,
|
|
"num_input_tokens_seen": 15193408,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"epoch": 0.30919915498367584,
|
|
"grad_norm": 22.992677688598633,
|
|
"learning_rate": 1.7452644496757548e-06,
|
|
"loss": 0.3081,
|
|
"num_input_tokens_seen": 15208640,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 0.3095192369246527,
|
|
"grad_norm": 44.50247573852539,
|
|
"learning_rate": 1.7445189580643946e-06,
|
|
"loss": 0.4533,
|
|
"num_input_tokens_seen": 15224192,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"epoch": 0.3098393188656296,
|
|
"grad_norm": 28.59990692138672,
|
|
"learning_rate": 1.7437725369123737e-06,
|
|
"loss": 0.5119,
|
|
"num_input_tokens_seen": 15239616,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 0.3101594008066065,
|
|
"grad_norm": 31.960166931152344,
|
|
"learning_rate": 1.7430251871516077e-06,
|
|
"loss": 0.4595,
|
|
"num_input_tokens_seen": 15255680,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"epoch": 0.31047948274758336,
|
|
"grad_norm": 25.40645980834961,
|
|
"learning_rate": 1.7422769097151715e-06,
|
|
"loss": 0.4886,
|
|
"num_input_tokens_seen": 15271232,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 0.3107995646885603,
|
|
"grad_norm": 65.88490295410156,
|
|
"learning_rate": 1.7415277055372982e-06,
|
|
"loss": 0.4938,
|
|
"num_input_tokens_seen": 15287040,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"epoch": 0.31111964662953717,
|
|
"grad_norm": 25.532987594604492,
|
|
"learning_rate": 1.7407775755533778e-06,
|
|
"loss": 0.5025,
|
|
"num_input_tokens_seen": 15304256,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 0.31143972857051405,
|
|
"grad_norm": 18.785158157348633,
|
|
"learning_rate": 1.7400265206999568e-06,
|
|
"loss": 0.3567,
|
|
"num_input_tokens_seen": 15322112,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"epoch": 0.31175981051149093,
|
|
"grad_norm": 69.29310607910156,
|
|
"learning_rate": 1.7392745419147362e-06,
|
|
"loss": 0.5436,
|
|
"num_input_tokens_seen": 15337216,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 0.3120798924524678,
|
|
"grad_norm": 38.31575393676758,
|
|
"learning_rate": 1.7385216401365693e-06,
|
|
"loss": 0.4521,
|
|
"num_input_tokens_seen": 15354048,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"epoch": 0.31239997439344475,
|
|
"grad_norm": 28.862852096557617,
|
|
"learning_rate": 1.7377678163054638e-06,
|
|
"loss": 0.4933,
|
|
"num_input_tokens_seen": 15369344,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 0.3127200563344216,
|
|
"grad_norm": 51.59070587158203,
|
|
"learning_rate": 1.7370130713625775e-06,
|
|
"loss": 0.4949,
|
|
"num_input_tokens_seen": 15385920,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"epoch": 0.3130401382753985,
|
|
"grad_norm": 20.555160522460938,
|
|
"learning_rate": 1.736257406250218e-06,
|
|
"loss": 0.3867,
|
|
"num_input_tokens_seen": 15401536,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 0.3133602202163754,
|
|
"grad_norm": 28.439088821411133,
|
|
"learning_rate": 1.735500821911842e-06,
|
|
"loss": 0.4501,
|
|
"num_input_tokens_seen": 15417152,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"epoch": 0.31368030215735226,
|
|
"grad_norm": 30.494640350341797,
|
|
"learning_rate": 1.7347433192920544e-06,
|
|
"loss": 0.4949,
|
|
"num_input_tokens_seen": 15431872,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.3140003840983292,
|
|
"grad_norm": 19.200109481811523,
|
|
"learning_rate": 1.7339848993366056e-06,
|
|
"loss": 0.4021,
|
|
"num_input_tokens_seen": 15447552,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"epoch": 0.3143204660393061,
|
|
"grad_norm": 32.95127868652344,
|
|
"learning_rate": 1.7332255629923922e-06,
|
|
"loss": 0.4615,
|
|
"num_input_tokens_seen": 15464384,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 0.31464054798028296,
|
|
"grad_norm": 23.275110244750977,
|
|
"learning_rate": 1.732465311207454e-06,
|
|
"loss": 0.4968,
|
|
"num_input_tokens_seen": 15479808,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"epoch": 0.31496062992125984,
|
|
"grad_norm": 47.221412658691406,
|
|
"learning_rate": 1.731704144930975e-06,
|
|
"loss": 0.4973,
|
|
"num_input_tokens_seen": 15496512,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 0.3152807118622367,
|
|
"grad_norm": 39.70328903198242,
|
|
"learning_rate": 1.7309420651132797e-06,
|
|
"loss": 0.4094,
|
|
"num_input_tokens_seen": 15512896,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"epoch": 0.3156007938032136,
|
|
"grad_norm": 32.56901931762695,
|
|
"learning_rate": 1.7301790727058343e-06,
|
|
"loss": 0.3234,
|
|
"num_input_tokens_seen": 15528064,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 0.31592087574419053,
|
|
"grad_norm": 31.572166442871094,
|
|
"learning_rate": 1.7294151686612431e-06,
|
|
"loss": 0.3618,
|
|
"num_input_tokens_seen": 15543424,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"epoch": 0.3162409576851674,
|
|
"grad_norm": 42.15610122680664,
|
|
"learning_rate": 1.7286503539332495e-06,
|
|
"loss": 0.5609,
|
|
"num_input_tokens_seen": 15560192,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 0.3165610396261443,
|
|
"grad_norm": 43.20957946777344,
|
|
"learning_rate": 1.7278846294767337e-06,
|
|
"loss": 0.3968,
|
|
"num_input_tokens_seen": 15576128,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"epoch": 0.31688112156712117,
|
|
"grad_norm": 80.63443756103516,
|
|
"learning_rate": 1.7271179962477118e-06,
|
|
"loss": 0.7032,
|
|
"num_input_tokens_seen": 15592576,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 0.31720120350809805,
|
|
"grad_norm": 50.15550994873047,
|
|
"learning_rate": 1.7263504552033341e-06,
|
|
"loss": 0.4261,
|
|
"num_input_tokens_seen": 15607744,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"epoch": 0.317521285449075,
|
|
"grad_norm": 22.618947982788086,
|
|
"learning_rate": 1.725582007301885e-06,
|
|
"loss": 0.4846,
|
|
"num_input_tokens_seen": 15623360,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 0.31784136739005187,
|
|
"grad_norm": 33.10743713378906,
|
|
"learning_rate": 1.7248126535027806e-06,
|
|
"loss": 0.4213,
|
|
"num_input_tokens_seen": 15638656,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"epoch": 0.31816144933102875,
|
|
"grad_norm": 41.587379455566406,
|
|
"learning_rate": 1.7240423947665678e-06,
|
|
"loss": 0.4632,
|
|
"num_input_tokens_seen": 15654400,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 0.3184815312720056,
|
|
"grad_norm": 27.983142852783203,
|
|
"learning_rate": 1.723271232054924e-06,
|
|
"loss": 0.3822,
|
|
"num_input_tokens_seen": 15670016,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"epoch": 0.3188016132129825,
|
|
"grad_norm": 54.66548538208008,
|
|
"learning_rate": 1.722499166330655e-06,
|
|
"loss": 0.4977,
|
|
"num_input_tokens_seen": 15686208,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 0.31912169515395944,
|
|
"grad_norm": 20.663721084594727,
|
|
"learning_rate": 1.7217261985576936e-06,
|
|
"loss": 0.44,
|
|
"num_input_tokens_seen": 15702592,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"epoch": 0.3194417770949363,
|
|
"grad_norm": 73.22879791259766,
|
|
"learning_rate": 1.7209523297010992e-06,
|
|
"loss": 0.5176,
|
|
"num_input_tokens_seen": 15717696,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 0.3197618590359132,
|
|
"grad_norm": 36.40870666503906,
|
|
"learning_rate": 1.7201775607270564e-06,
|
|
"loss": 0.4644,
|
|
"num_input_tokens_seen": 15733184,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"epoch": 0.3200819409768901,
|
|
"grad_norm": 30.597986221313477,
|
|
"learning_rate": 1.7194018926028733e-06,
|
|
"loss": 0.5267,
|
|
"num_input_tokens_seen": 15749888,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.32040202291786696,
|
|
"grad_norm": 35.71719741821289,
|
|
"learning_rate": 1.7186253262969803e-06,
|
|
"loss": 0.3621,
|
|
"num_input_tokens_seen": 15768384,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"epoch": 0.32072210485884384,
|
|
"grad_norm": 24.331857681274414,
|
|
"learning_rate": 1.7178478627789299e-06,
|
|
"loss": 0.3269,
|
|
"num_input_tokens_seen": 15784448,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 0.3210421867998208,
|
|
"grad_norm": 25.280595779418945,
|
|
"learning_rate": 1.7170695030193944e-06,
|
|
"loss": 0.4088,
|
|
"num_input_tokens_seen": 15800512,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"epoch": 0.32136226874079765,
|
|
"grad_norm": 29.914012908935547,
|
|
"learning_rate": 1.716290247990165e-06,
|
|
"loss": 0.4744,
|
|
"num_input_tokens_seen": 15815680,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 0.32168235068177453,
|
|
"grad_norm": 33.56769561767578,
|
|
"learning_rate": 1.715510098664151e-06,
|
|
"loss": 0.3939,
|
|
"num_input_tokens_seen": 15830528,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"epoch": 0.3220024326227514,
|
|
"grad_norm": 29.231985092163086,
|
|
"learning_rate": 1.7147290560153777e-06,
|
|
"loss": 0.4933,
|
|
"num_input_tokens_seen": 15845568,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 0.3223225145637283,
|
|
"grad_norm": 39.174617767333984,
|
|
"learning_rate": 1.7139471210189862e-06,
|
|
"loss": 0.4531,
|
|
"num_input_tokens_seen": 15861632,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"epoch": 0.3226425965047052,
|
|
"grad_norm": 31.1746826171875,
|
|
"learning_rate": 1.7131642946512312e-06,
|
|
"loss": 0.5187,
|
|
"num_input_tokens_seen": 15877632,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 0.3229626784456821,
|
|
"grad_norm": 19.761302947998047,
|
|
"learning_rate": 1.712380577889481e-06,
|
|
"loss": 0.37,
|
|
"num_input_tokens_seen": 15893184,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"epoch": 0.323282760386659,
|
|
"grad_norm": 34.54355239868164,
|
|
"learning_rate": 1.711595971712215e-06,
|
|
"loss": 0.3955,
|
|
"num_input_tokens_seen": 15908416,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 0.32360284232763586,
|
|
"grad_norm": 25.96015739440918,
|
|
"learning_rate": 1.7108104770990234e-06,
|
|
"loss": 0.4074,
|
|
"num_input_tokens_seen": 15924224,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"epoch": 0.32392292426861274,
|
|
"grad_norm": 22.604724884033203,
|
|
"learning_rate": 1.7100240950306052e-06,
|
|
"loss": 0.2532,
|
|
"num_input_tokens_seen": 15940032,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 0.3242430062095897,
|
|
"grad_norm": 38.15263366699219,
|
|
"learning_rate": 1.7092368264887677e-06,
|
|
"loss": 0.4556,
|
|
"num_input_tokens_seen": 15954944,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"epoch": 0.32456308815056656,
|
|
"grad_norm": 57.1259765625,
|
|
"learning_rate": 1.7084486724564252e-06,
|
|
"loss": 0.4923,
|
|
"num_input_tokens_seen": 15970624,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 0.32488317009154344,
|
|
"grad_norm": 33.16521072387695,
|
|
"learning_rate": 1.707659633917597e-06,
|
|
"loss": 0.418,
|
|
"num_input_tokens_seen": 15986688,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"epoch": 0.3252032520325203,
|
|
"grad_norm": 35.50617980957031,
|
|
"learning_rate": 1.7068697118574064e-06,
|
|
"loss": 0.4172,
|
|
"num_input_tokens_seen": 16002752,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 0.3255233339734972,
|
|
"grad_norm": 23.2056884765625,
|
|
"learning_rate": 1.7060789072620816e-06,
|
|
"loss": 0.4924,
|
|
"num_input_tokens_seen": 16018112,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"epoch": 0.32584341591447413,
|
|
"grad_norm": 23.894432067871094,
|
|
"learning_rate": 1.7052872211189509e-06,
|
|
"loss": 0.411,
|
|
"num_input_tokens_seen": 16033984,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 0.326163497855451,
|
|
"grad_norm": 21.645387649536133,
|
|
"learning_rate": 1.7044946544164431e-06,
|
|
"loss": 0.3263,
|
|
"num_input_tokens_seen": 16049536,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"epoch": 0.3264835797964279,
|
|
"grad_norm": 32.932411193847656,
|
|
"learning_rate": 1.703701208144088e-06,
|
|
"loss": 0.3722,
|
|
"num_input_tokens_seen": 16066304,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.32680366173740477,
|
|
"grad_norm": 42.86146926879883,
|
|
"learning_rate": 1.702906883292512e-06,
|
|
"loss": 0.4627,
|
|
"num_input_tokens_seen": 16081536,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"epoch": 0.32712374367838165,
|
|
"grad_norm": 25.875411987304688,
|
|
"learning_rate": 1.7021116808534393e-06,
|
|
"loss": 0.5501,
|
|
"num_input_tokens_seen": 16096896,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 0.32744382561935853,
|
|
"grad_norm": 47.58795166015625,
|
|
"learning_rate": 1.7013156018196893e-06,
|
|
"loss": 0.4294,
|
|
"num_input_tokens_seen": 16112960,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"epoch": 0.32776390756033547,
|
|
"grad_norm": 34.665802001953125,
|
|
"learning_rate": 1.7005186471851759e-06,
|
|
"loss": 0.4168,
|
|
"num_input_tokens_seen": 16129344,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 0.32808398950131235,
|
|
"grad_norm": 23.344072341918945,
|
|
"learning_rate": 1.6997208179449066e-06,
|
|
"loss": 0.5931,
|
|
"num_input_tokens_seen": 16147776,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"epoch": 0.3284040714422892,
|
|
"grad_norm": 43.283119201660156,
|
|
"learning_rate": 1.6989221150949806e-06,
|
|
"loss": 0.3523,
|
|
"num_input_tokens_seen": 16162880,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 0.3287241533832661,
|
|
"grad_norm": 17.569599151611328,
|
|
"learning_rate": 1.6981225396325873e-06,
|
|
"loss": 0.2737,
|
|
"num_input_tokens_seen": 16179392,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"epoch": 0.329044235324243,
|
|
"grad_norm": 38.69865036010742,
|
|
"learning_rate": 1.6973220925560067e-06,
|
|
"loss": 0.5036,
|
|
"num_input_tokens_seen": 16194560,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 0.3293643172652199,
|
|
"grad_norm": 55.1820182800293,
|
|
"learning_rate": 1.696520774864606e-06,
|
|
"loss": 0.4281,
|
|
"num_input_tokens_seen": 16210112,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"epoch": 0.3296843992061968,
|
|
"grad_norm": 68.6947250366211,
|
|
"learning_rate": 1.69571858755884e-06,
|
|
"loss": 0.4646,
|
|
"num_input_tokens_seen": 16225856,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 0.3300044811471737,
|
|
"grad_norm": 25.549705505371094,
|
|
"learning_rate": 1.6949155316402487e-06,
|
|
"loss": 0.4177,
|
|
"num_input_tokens_seen": 16241536,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"epoch": 0.33032456308815056,
|
|
"grad_norm": 31.668855667114258,
|
|
"learning_rate": 1.6941116081114566e-06,
|
|
"loss": 0.3777,
|
|
"num_input_tokens_seen": 16256384,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 0.33064464502912744,
|
|
"grad_norm": 34.3087158203125,
|
|
"learning_rate": 1.6933068179761722e-06,
|
|
"loss": 0.3937,
|
|
"num_input_tokens_seen": 16271360,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"epoch": 0.3309647269701044,
|
|
"grad_norm": 26.086729049682617,
|
|
"learning_rate": 1.6925011622391857e-06,
|
|
"loss": 0.4118,
|
|
"num_input_tokens_seen": 16286656,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 0.33128480891108125,
|
|
"grad_norm": 18.95518684387207,
|
|
"learning_rate": 1.6916946419063667e-06,
|
|
"loss": 0.4038,
|
|
"num_input_tokens_seen": 16302592,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"epoch": 0.33160489085205813,
|
|
"grad_norm": 25.953067779541016,
|
|
"learning_rate": 1.690887257984666e-06,
|
|
"loss": 0.5252,
|
|
"num_input_tokens_seen": 16318656,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 0.331924972793035,
|
|
"grad_norm": 26.030420303344727,
|
|
"learning_rate": 1.690079011482112e-06,
|
|
"loss": 0.4784,
|
|
"num_input_tokens_seen": 16334016,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"epoch": 0.3322450547340119,
|
|
"grad_norm": 44.0208625793457,
|
|
"learning_rate": 1.6892699034078096e-06,
|
|
"loss": 0.5322,
|
|
"num_input_tokens_seen": 16349888,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 0.33256513667498877,
|
|
"grad_norm": 40.064537048339844,
|
|
"learning_rate": 1.68845993477194e-06,
|
|
"loss": 0.5017,
|
|
"num_input_tokens_seen": 16365056,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"epoch": 0.3328852186159657,
|
|
"grad_norm": 27.49654197692871,
|
|
"learning_rate": 1.6876491065857584e-06,
|
|
"loss": 0.3857,
|
|
"num_input_tokens_seen": 16380032,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.3332053005569426,
|
|
"grad_norm": 31.578556060791016,
|
|
"learning_rate": 1.6868374198615928e-06,
|
|
"loss": 0.6437,
|
|
"num_input_tokens_seen": 16394752,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"epoch": 0.33352538249791946,
|
|
"grad_norm": 19.591115951538086,
|
|
"learning_rate": 1.6860248756128448e-06,
|
|
"loss": 0.4782,
|
|
"num_input_tokens_seen": 16410368,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 0.33384546443889634,
|
|
"grad_norm": 22.99208641052246,
|
|
"learning_rate": 1.6852114748539844e-06,
|
|
"loss": 0.3992,
|
|
"num_input_tokens_seen": 16425088,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"epoch": 0.3341655463798732,
|
|
"grad_norm": 22.972055435180664,
|
|
"learning_rate": 1.6843972186005525e-06,
|
|
"loss": 0.3352,
|
|
"num_input_tokens_seen": 16441152,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 0.33448562832085016,
|
|
"grad_norm": 34.798065185546875,
|
|
"learning_rate": 1.6835821078691577e-06,
|
|
"loss": 0.4641,
|
|
"num_input_tokens_seen": 16458240,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"epoch": 0.33480571026182704,
|
|
"grad_norm": 35.769901275634766,
|
|
"learning_rate": 1.6827661436774746e-06,
|
|
"loss": 0.4142,
|
|
"num_input_tokens_seen": 16474112,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 0.3351257922028039,
|
|
"grad_norm": 43.8751335144043,
|
|
"learning_rate": 1.681949327044245e-06,
|
|
"loss": 0.3955,
|
|
"num_input_tokens_seen": 16490560,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"epoch": 0.3354458741437808,
|
|
"grad_norm": 67.51107025146484,
|
|
"learning_rate": 1.6811316589892734e-06,
|
|
"loss": 0.6757,
|
|
"num_input_tokens_seen": 16505728,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 0.3357659560847577,
|
|
"grad_norm": 21.818950653076172,
|
|
"learning_rate": 1.6803131405334284e-06,
|
|
"loss": 0.4257,
|
|
"num_input_tokens_seen": 16521856,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"epoch": 0.3360860380257346,
|
|
"grad_norm": 30.710657119750977,
|
|
"learning_rate": 1.6794937726986396e-06,
|
|
"loss": 0.4271,
|
|
"num_input_tokens_seen": 16537792,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 0.3364061199667115,
|
|
"grad_norm": 42.02250671386719,
|
|
"learning_rate": 1.6786735565078974e-06,
|
|
"loss": 0.434,
|
|
"num_input_tokens_seen": 16553408,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"epoch": 0.33672620190768837,
|
|
"grad_norm": 28.501094818115234,
|
|
"learning_rate": 1.677852492985251e-06,
|
|
"loss": 0.4297,
|
|
"num_input_tokens_seen": 16570112,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 0.33704628384866525,
|
|
"grad_norm": 56.61883544921875,
|
|
"learning_rate": 1.6770305831558086e-06,
|
|
"loss": 0.4931,
|
|
"num_input_tokens_seen": 16586304,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"epoch": 0.33736636578964213,
|
|
"grad_norm": 15.158733367919922,
|
|
"learning_rate": 1.6762078280457342e-06,
|
|
"loss": 0.3922,
|
|
"num_input_tokens_seen": 16601920,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 0.33768644773061907,
|
|
"grad_norm": 27.923097610473633,
|
|
"learning_rate": 1.6753842286822465e-06,
|
|
"loss": 0.4797,
|
|
"num_input_tokens_seen": 16618240,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"epoch": 0.33800652967159595,
|
|
"grad_norm": 36.38385009765625,
|
|
"learning_rate": 1.6745597860936199e-06,
|
|
"loss": 0.59,
|
|
"num_input_tokens_seen": 16633408,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 0.3383266116125728,
|
|
"grad_norm": 38.012123107910156,
|
|
"learning_rate": 1.6737345013091794e-06,
|
|
"loss": 0.439,
|
|
"num_input_tokens_seen": 16649664,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"epoch": 0.3386466935535497,
|
|
"grad_norm": 39.11860656738281,
|
|
"learning_rate": 1.672908375359304e-06,
|
|
"loss": 0.4602,
|
|
"num_input_tokens_seen": 16664896,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 0.3389667754945266,
|
|
"grad_norm": 56.845096588134766,
|
|
"learning_rate": 1.6720814092754209e-06,
|
|
"loss": 0.5433,
|
|
"num_input_tokens_seen": 16680384,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"epoch": 0.33928685743550346,
|
|
"grad_norm": 20.308507919311523,
|
|
"learning_rate": 1.6712536040900075e-06,
|
|
"loss": 0.3696,
|
|
"num_input_tokens_seen": 16696192,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.3396069393764804,
|
|
"grad_norm": 26.112041473388672,
|
|
"learning_rate": 1.6704249608365878e-06,
|
|
"loss": 0.4752,
|
|
"num_input_tokens_seen": 16727104,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"epoch": 0.3399270213174573,
|
|
"grad_norm": 27.13048553466797,
|
|
"learning_rate": 1.669595480549733e-06,
|
|
"loss": 0.4154,
|
|
"num_input_tokens_seen": 16741696,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 0.34024710325843416,
|
|
"grad_norm": 40.439273834228516,
|
|
"learning_rate": 1.6687651642650587e-06,
|
|
"loss": 0.432,
|
|
"num_input_tokens_seen": 16757120,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"epoch": 0.34056718519941104,
|
|
"grad_norm": 27.309789657592773,
|
|
"learning_rate": 1.6679340130192245e-06,
|
|
"loss": 0.4471,
|
|
"num_input_tokens_seen": 16772416,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 0.3408872671403879,
|
|
"grad_norm": 24.121200561523438,
|
|
"learning_rate": 1.667102027849933e-06,
|
|
"loss": 0.3172,
|
|
"num_input_tokens_seen": 16788352,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"epoch": 0.34120734908136485,
|
|
"grad_norm": 36.701873779296875,
|
|
"learning_rate": 1.6662692097959266e-06,
|
|
"loss": 0.3456,
|
|
"num_input_tokens_seen": 16803648,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 0.34152743102234173,
|
|
"grad_norm": 52.13604736328125,
|
|
"learning_rate": 1.6654355598969894e-06,
|
|
"loss": 0.4708,
|
|
"num_input_tokens_seen": 16818944,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"epoch": 0.3418475129633186,
|
|
"grad_norm": 31.60714340209961,
|
|
"learning_rate": 1.6646010791939423e-06,
|
|
"loss": 0.5078,
|
|
"num_input_tokens_seen": 16833984,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 0.3421675949042955,
|
|
"grad_norm": 30.880844116210938,
|
|
"learning_rate": 1.6637657687286446e-06,
|
|
"loss": 0.5507,
|
|
"num_input_tokens_seen": 16849280,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"epoch": 0.34248767684527237,
|
|
"grad_norm": 29.642696380615234,
|
|
"learning_rate": 1.6629296295439912e-06,
|
|
"loss": 0.3979,
|
|
"num_input_tokens_seen": 16865664,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 0.3428077587862493,
|
|
"grad_norm": 46.237457275390625,
|
|
"learning_rate": 1.6620926626839116e-06,
|
|
"loss": 0.4884,
|
|
"num_input_tokens_seen": 16881536,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"epoch": 0.3431278407272262,
|
|
"grad_norm": 26.425844192504883,
|
|
"learning_rate": 1.661254869193369e-06,
|
|
"loss": 0.4395,
|
|
"num_input_tokens_seen": 16898816,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 0.34344792266820307,
|
|
"grad_norm": 44.35171127319336,
|
|
"learning_rate": 1.6604162501183581e-06,
|
|
"loss": 0.5104,
|
|
"num_input_tokens_seen": 16915136,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"epoch": 0.34376800460917994,
|
|
"grad_norm": 29.71055793762207,
|
|
"learning_rate": 1.6595768065059045e-06,
|
|
"loss": 0.4607,
|
|
"num_input_tokens_seen": 16931200,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 0.3440880865501568,
|
|
"grad_norm": 26.671714782714844,
|
|
"learning_rate": 1.6587365394040641e-06,
|
|
"loss": 0.4652,
|
|
"num_input_tokens_seen": 16946816,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"epoch": 0.3444081684911337,
|
|
"grad_norm": 28.532976150512695,
|
|
"learning_rate": 1.6578954498619195e-06,
|
|
"loss": 0.3893,
|
|
"num_input_tokens_seen": 16962880,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 0.34472825043211064,
|
|
"grad_norm": 31.44209861755371,
|
|
"learning_rate": 1.6570535389295814e-06,
|
|
"loss": 0.4587,
|
|
"num_input_tokens_seen": 16978240,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"epoch": 0.3450483323730875,
|
|
"grad_norm": 22.520421981811523,
|
|
"learning_rate": 1.6562108076581853e-06,
|
|
"loss": 0.3628,
|
|
"num_input_tokens_seen": 16993728,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 0.3453684143140644,
|
|
"grad_norm": 37.299156188964844,
|
|
"learning_rate": 1.6553672570998912e-06,
|
|
"loss": 0.5903,
|
|
"num_input_tokens_seen": 17009728,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"epoch": 0.3456884962550413,
|
|
"grad_norm": 38.635986328125,
|
|
"learning_rate": 1.6545228883078815e-06,
|
|
"loss": 0.4174,
|
|
"num_input_tokens_seen": 17024640,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.34600857819601816,
|
|
"grad_norm": 37.52071762084961,
|
|
"learning_rate": 1.653677702336361e-06,
|
|
"loss": 0.3541,
|
|
"num_input_tokens_seen": 17040512,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"epoch": 0.3463286601369951,
|
|
"grad_norm": 19.03274917602539,
|
|
"learning_rate": 1.6528317002405538e-06,
|
|
"loss": 0.4657,
|
|
"num_input_tokens_seen": 17056064,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 0.34664874207797197,
|
|
"grad_norm": 28.59636878967285,
|
|
"learning_rate": 1.6519848830767043e-06,
|
|
"loss": 0.3692,
|
|
"num_input_tokens_seen": 17072448,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"epoch": 0.34696882401894885,
|
|
"grad_norm": 38.893310546875,
|
|
"learning_rate": 1.6511372519020726e-06,
|
|
"loss": 0.6197,
|
|
"num_input_tokens_seen": 17088320,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 0.34728890595992573,
|
|
"grad_norm": 39.06748962402344,
|
|
"learning_rate": 1.650288807774937e-06,
|
|
"loss": 0.4291,
|
|
"num_input_tokens_seen": 17104448,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"epoch": 0.3476089879009026,
|
|
"grad_norm": 36.80699920654297,
|
|
"learning_rate": 1.6494395517545893e-06,
|
|
"loss": 0.3964,
|
|
"num_input_tokens_seen": 17121856,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 0.34792906984187955,
|
|
"grad_norm": 47.49158477783203,
|
|
"learning_rate": 1.6485894849013362e-06,
|
|
"loss": 0.5052,
|
|
"num_input_tokens_seen": 17136512,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"epoch": 0.3482491517828564,
|
|
"grad_norm": 26.2275333404541,
|
|
"learning_rate": 1.6477386082764961e-06,
|
|
"loss": 0.443,
|
|
"num_input_tokens_seen": 17152640,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 0.3485692337238333,
|
|
"grad_norm": 25.935453414916992,
|
|
"learning_rate": 1.6468869229423983e-06,
|
|
"loss": 0.362,
|
|
"num_input_tokens_seen": 17167680,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"epoch": 0.3488893156648102,
|
|
"grad_norm": 57.09697341918945,
|
|
"learning_rate": 1.6460344299623813e-06,
|
|
"loss": 0.6295,
|
|
"num_input_tokens_seen": 17183296,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 0.34920939760578706,
|
|
"grad_norm": 62.791343688964844,
|
|
"learning_rate": 1.6451811304007939e-06,
|
|
"loss": 0.5424,
|
|
"num_input_tokens_seen": 17198272,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"epoch": 0.349529479546764,
|
|
"grad_norm": 46.02850341796875,
|
|
"learning_rate": 1.6443270253229895e-06,
|
|
"loss": 0.5177,
|
|
"num_input_tokens_seen": 17213376,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 0.3498495614877409,
|
|
"grad_norm": 39.094146728515625,
|
|
"learning_rate": 1.6434721157953288e-06,
|
|
"loss": 0.4657,
|
|
"num_input_tokens_seen": 17229632,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"epoch": 0.35016964342871776,
|
|
"grad_norm": 35.04682540893555,
|
|
"learning_rate": 1.6426164028851765e-06,
|
|
"loss": 0.579,
|
|
"num_input_tokens_seen": 17245696,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 0.3504257089814993,
|
|
"eval_loss": 0.43906036019325256,
|
|
"eval_runtime": 49.1679,
|
|
"eval_samples_per_second": 282.42,
|
|
"eval_steps_per_second": 35.308,
|
|
"num_input_tokens_seen": 17259840,
|
|
"step": 5474
|
|
},
|
|
{
|
|
"epoch": 0.35048972536969464,
|
|
"grad_norm": 26.412445068359375,
|
|
"learning_rate": 1.6417598876609002e-06,
|
|
"loss": 0.3787,
|
|
"num_input_tokens_seen": 17262976,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"epoch": 0.3508098073106715,
|
|
"grad_norm": 36.70389175415039,
|
|
"learning_rate": 1.640902571191869e-06,
|
|
"loss": 0.419,
|
|
"num_input_tokens_seen": 17278336,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 0.3511298892516484,
|
|
"grad_norm": 41.35291290283203,
|
|
"learning_rate": 1.6400444545484524e-06,
|
|
"loss": 0.3535,
|
|
"num_input_tokens_seen": 17293248,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"epoch": 0.35144997119262533,
|
|
"grad_norm": 19.715316772460938,
|
|
"learning_rate": 1.6391855388020193e-06,
|
|
"loss": 0.4275,
|
|
"num_input_tokens_seen": 17309184,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 0.3517700531336022,
|
|
"grad_norm": 32.778873443603516,
|
|
"learning_rate": 1.6383258250249363e-06,
|
|
"loss": 0.4436,
|
|
"num_input_tokens_seen": 17325248,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"epoch": 0.3520901350745791,
|
|
"grad_norm": 19.160093307495117,
|
|
"learning_rate": 1.6374653142905661e-06,
|
|
"loss": 0.4226,
|
|
"num_input_tokens_seen": 17340736,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.35241021701555597,
|
|
"grad_norm": 35.900447845458984,
|
|
"learning_rate": 1.6366040076732662e-06,
|
|
"loss": 0.4188,
|
|
"num_input_tokens_seen": 17355904,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"epoch": 0.35273029895653285,
|
|
"grad_norm": 28.459196090698242,
|
|
"learning_rate": 1.6357419062483882e-06,
|
|
"loss": 0.4712,
|
|
"num_input_tokens_seen": 17371264,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 0.3530503808975098,
|
|
"grad_norm": 24.3746337890625,
|
|
"learning_rate": 1.6348790110922758e-06,
|
|
"loss": 0.4168,
|
|
"num_input_tokens_seen": 17388608,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"epoch": 0.35337046283848667,
|
|
"grad_norm": 30.681352615356445,
|
|
"learning_rate": 1.6340153232822635e-06,
|
|
"loss": 0.4668,
|
|
"num_input_tokens_seen": 17403712,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 0.35369054477946354,
|
|
"grad_norm": 40.181785583496094,
|
|
"learning_rate": 1.633150843896676e-06,
|
|
"loss": 0.4809,
|
|
"num_input_tokens_seen": 17421056,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"epoch": 0.3540106267204404,
|
|
"grad_norm": 58.2733154296875,
|
|
"learning_rate": 1.6322855740148263e-06,
|
|
"loss": 0.5588,
|
|
"num_input_tokens_seen": 17436096,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 0.3543307086614173,
|
|
"grad_norm": 24.002464294433594,
|
|
"learning_rate": 1.6314195147170132e-06,
|
|
"loss": 0.3701,
|
|
"num_input_tokens_seen": 17452480,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"epoch": 0.35465079060239424,
|
|
"grad_norm": 28.335710525512695,
|
|
"learning_rate": 1.6305526670845225e-06,
|
|
"loss": 0.4038,
|
|
"num_input_tokens_seen": 17467776,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 0.3549708725433711,
|
|
"grad_norm": 46.305484771728516,
|
|
"learning_rate": 1.6296850321996232e-06,
|
|
"loss": 0.5081,
|
|
"num_input_tokens_seen": 17482752,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"epoch": 0.355290954484348,
|
|
"grad_norm": 31.239910125732422,
|
|
"learning_rate": 1.6288166111455683e-06,
|
|
"loss": 0.3885,
|
|
"num_input_tokens_seen": 17497792,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 0.3556110364253249,
|
|
"grad_norm": 21.766979217529297,
|
|
"learning_rate": 1.6279474050065906e-06,
|
|
"loss": 0.4774,
|
|
"num_input_tokens_seen": 17513024,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"epoch": 0.35593111836630176,
|
|
"grad_norm": 28.28034210205078,
|
|
"learning_rate": 1.6270774148679054e-06,
|
|
"loss": 0.4143,
|
|
"num_input_tokens_seen": 17529024,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 0.35625120030727864,
|
|
"grad_norm": 15.855846405029297,
|
|
"learning_rate": 1.6262066418157048e-06,
|
|
"loss": 0.3764,
|
|
"num_input_tokens_seen": 17543936,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"epoch": 0.35657128224825557,
|
|
"grad_norm": 52.373390197753906,
|
|
"learning_rate": 1.6253350869371595e-06,
|
|
"loss": 0.5374,
|
|
"num_input_tokens_seen": 17559168,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 0.35689136418923245,
|
|
"grad_norm": 32.6270751953125,
|
|
"learning_rate": 1.6244627513204158e-06,
|
|
"loss": 0.3828,
|
|
"num_input_tokens_seen": 17574912,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"epoch": 0.35721144613020933,
|
|
"grad_norm": 24.754146575927734,
|
|
"learning_rate": 1.6235896360545954e-06,
|
|
"loss": 0.4239,
|
|
"num_input_tokens_seen": 17590272,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 0.3575315280711862,
|
|
"grad_norm": 40.839786529541016,
|
|
"learning_rate": 1.622715742229792e-06,
|
|
"loss": 0.4379,
|
|
"num_input_tokens_seen": 17605952,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"epoch": 0.3578516100121631,
|
|
"grad_norm": 21.1004638671875,
|
|
"learning_rate": 1.6218410709370734e-06,
|
|
"loss": 0.3813,
|
|
"num_input_tokens_seen": 17621120,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 0.35817169195314,
|
|
"grad_norm": 40.48637008666992,
|
|
"learning_rate": 1.6209656232684768e-06,
|
|
"loss": 0.5629,
|
|
"num_input_tokens_seen": 17636096,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"epoch": 0.3584917738941169,
|
|
"grad_norm": 86.99573516845703,
|
|
"learning_rate": 1.620089400317008e-06,
|
|
"loss": 0.4427,
|
|
"num_input_tokens_seen": 17652672,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.3588118558350938,
|
|
"grad_norm": 33.9478645324707,
|
|
"learning_rate": 1.6192124031766425e-06,
|
|
"loss": 0.4875,
|
|
"num_input_tokens_seen": 17668032,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"epoch": 0.35913193777607066,
|
|
"grad_norm": 28.759950637817383,
|
|
"learning_rate": 1.6183346329423213e-06,
|
|
"loss": 0.4474,
|
|
"num_input_tokens_seen": 17683264,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 0.35945201971704754,
|
|
"grad_norm": 49.65534210205078,
|
|
"learning_rate": 1.6174560907099508e-06,
|
|
"loss": 0.3642,
|
|
"num_input_tokens_seen": 17699200,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"epoch": 0.3597721016580245,
|
|
"grad_norm": 21.184310913085938,
|
|
"learning_rate": 1.6165767775764013e-06,
|
|
"loss": 0.3489,
|
|
"num_input_tokens_seen": 17714816,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 0.36009218359900136,
|
|
"grad_norm": 36.253963470458984,
|
|
"learning_rate": 1.6156966946395056e-06,
|
|
"loss": 0.411,
|
|
"num_input_tokens_seen": 17732352,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"epoch": 0.36041226553997824,
|
|
"grad_norm": 52.9035758972168,
|
|
"learning_rate": 1.6148158429980577e-06,
|
|
"loss": 0.5376,
|
|
"num_input_tokens_seen": 17748288,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 0.3607323474809551,
|
|
"grad_norm": 40.94856262207031,
|
|
"learning_rate": 1.6139342237518108e-06,
|
|
"loss": 0.3839,
|
|
"num_input_tokens_seen": 17763520,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"epoch": 0.361052429421932,
|
|
"grad_norm": 33.37528610229492,
|
|
"learning_rate": 1.6130518380014773e-06,
|
|
"loss": 0.428,
|
|
"num_input_tokens_seen": 17779328,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 0.3613725113629089,
|
|
"grad_norm": 38.7974853515625,
|
|
"learning_rate": 1.6121686868487259e-06,
|
|
"loss": 0.4178,
|
|
"num_input_tokens_seen": 17795584,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"epoch": 0.3616925933038858,
|
|
"grad_norm": 16.909976959228516,
|
|
"learning_rate": 1.6112847713961815e-06,
|
|
"loss": 0.44,
|
|
"num_input_tokens_seen": 17810368,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 0.3620126752448627,
|
|
"grad_norm": 27.985116958618164,
|
|
"learning_rate": 1.610400092747423e-06,
|
|
"loss": 0.4283,
|
|
"num_input_tokens_seen": 17826496,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"epoch": 0.36233275718583957,
|
|
"grad_norm": 30.853046417236328,
|
|
"learning_rate": 1.609514652006981e-06,
|
|
"loss": 0.4191,
|
|
"num_input_tokens_seen": 17841344,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 0.36265283912681645,
|
|
"grad_norm": 31.243133544921875,
|
|
"learning_rate": 1.60862845028034e-06,
|
|
"loss": 0.5596,
|
|
"num_input_tokens_seen": 17857408,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"epoch": 0.36297292106779333,
|
|
"grad_norm": 24.529314041137695,
|
|
"learning_rate": 1.6077414886739327e-06,
|
|
"loss": 0.4256,
|
|
"num_input_tokens_seen": 17873280,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 0.36329300300877027,
|
|
"grad_norm": 20.652950286865234,
|
|
"learning_rate": 1.6068537682951412e-06,
|
|
"loss": 0.4936,
|
|
"num_input_tokens_seen": 17888448,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"epoch": 0.36361308494974715,
|
|
"grad_norm": 28.207895278930664,
|
|
"learning_rate": 1.6059652902522947e-06,
|
|
"loss": 0.4402,
|
|
"num_input_tokens_seen": 17904320,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 0.363933166890724,
|
|
"grad_norm": 51.1041145324707,
|
|
"learning_rate": 1.6050760556546683e-06,
|
|
"loss": 0.3667,
|
|
"num_input_tokens_seen": 17919744,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"epoch": 0.3642532488317009,
|
|
"grad_norm": 26.759593963623047,
|
|
"learning_rate": 1.6041860656124823e-06,
|
|
"loss": 0.3814,
|
|
"num_input_tokens_seen": 17934656,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 0.3645733307726778,
|
|
"grad_norm": 39.42972946166992,
|
|
"learning_rate": 1.6032953212368993e-06,
|
|
"loss": 0.5375,
|
|
"num_input_tokens_seen": 17950976,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"epoch": 0.3648934127136547,
|
|
"grad_norm": 22.8485164642334,
|
|
"learning_rate": 1.6024038236400243e-06,
|
|
"loss": 0.4688,
|
|
"num_input_tokens_seen": 17966400,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.3652134946546316,
|
|
"grad_norm": 97.59317016601562,
|
|
"learning_rate": 1.6015115739349027e-06,
|
|
"loss": 0.5649,
|
|
"num_input_tokens_seen": 17983872,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"epoch": 0.3655335765956085,
|
|
"grad_norm": 33.57761764526367,
|
|
"learning_rate": 1.6006185732355183e-06,
|
|
"loss": 0.5461,
|
|
"num_input_tokens_seen": 17999680,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 0.36585365853658536,
|
|
"grad_norm": 21.023252487182617,
|
|
"learning_rate": 1.5997248226567931e-06,
|
|
"loss": 0.3802,
|
|
"num_input_tokens_seen": 18014784,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"epoch": 0.36617374047756224,
|
|
"grad_norm": 22.69112205505371,
|
|
"learning_rate": 1.5988303233145853e-06,
|
|
"loss": 0.4997,
|
|
"num_input_tokens_seen": 18029888,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 0.3664938224185392,
|
|
"grad_norm": 29.783832550048828,
|
|
"learning_rate": 1.597935076325688e-06,
|
|
"loss": 0.3877,
|
|
"num_input_tokens_seen": 18045632,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"epoch": 0.36681390435951605,
|
|
"grad_norm": 41.83056640625,
|
|
"learning_rate": 1.5970390828078272e-06,
|
|
"loss": 0.5839,
|
|
"num_input_tokens_seen": 18060928,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 0.36713398630049293,
|
|
"grad_norm": 16.932323455810547,
|
|
"learning_rate": 1.5961423438796615e-06,
|
|
"loss": 0.4567,
|
|
"num_input_tokens_seen": 18076352,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"epoch": 0.3674540682414698,
|
|
"grad_norm": 43.994022369384766,
|
|
"learning_rate": 1.59524486066078e-06,
|
|
"loss": 0.4411,
|
|
"num_input_tokens_seen": 18092096,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 0.3677741501824467,
|
|
"grad_norm": 29.11937141418457,
|
|
"learning_rate": 1.5943466342717012e-06,
|
|
"loss": 0.5834,
|
|
"num_input_tokens_seen": 18107648,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"epoch": 0.36809423212342357,
|
|
"grad_norm": 26.03652572631836,
|
|
"learning_rate": 1.5934476658338708e-06,
|
|
"loss": 0.4433,
|
|
"num_input_tokens_seen": 18123264,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 0.3684143140644005,
|
|
"grad_norm": 25.282079696655273,
|
|
"learning_rate": 1.5925479564696619e-06,
|
|
"loss": 0.5414,
|
|
"num_input_tokens_seen": 18138368,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"epoch": 0.3687343960053774,
|
|
"grad_norm": 11.744181632995605,
|
|
"learning_rate": 1.5916475073023721e-06,
|
|
"loss": 0.3336,
|
|
"num_input_tokens_seen": 18154432,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 0.36905447794635426,
|
|
"grad_norm": 50.17704391479492,
|
|
"learning_rate": 1.5907463194562226e-06,
|
|
"loss": 0.3355,
|
|
"num_input_tokens_seen": 18171200,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"epoch": 0.36937455988733114,
|
|
"grad_norm": 24.319721221923828,
|
|
"learning_rate": 1.589844394056357e-06,
|
|
"loss": 0.3807,
|
|
"num_input_tokens_seen": 18187008,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 0.369694641828308,
|
|
"grad_norm": 48.6660270690918,
|
|
"learning_rate": 1.5889417322288403e-06,
|
|
"loss": 0.3492,
|
|
"num_input_tokens_seen": 18202944,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"epoch": 0.37001472376928496,
|
|
"grad_norm": 86.95288848876953,
|
|
"learning_rate": 1.5880383351006556e-06,
|
|
"loss": 0.4969,
|
|
"num_input_tokens_seen": 18217984,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 0.37033480571026184,
|
|
"grad_norm": 30.690433502197266,
|
|
"learning_rate": 1.5871342037997055e-06,
|
|
"loss": 0.505,
|
|
"num_input_tokens_seen": 18233984,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"epoch": 0.3706548876512387,
|
|
"grad_norm": 43.78403091430664,
|
|
"learning_rate": 1.5862293394548082e-06,
|
|
"loss": 0.403,
|
|
"num_input_tokens_seen": 18249024,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 0.3709749695922156,
|
|
"grad_norm": 73.22137451171875,
|
|
"learning_rate": 1.5853237431956972e-06,
|
|
"loss": 0.3414,
|
|
"num_input_tokens_seen": 18264256,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"epoch": 0.3712950515331925,
|
|
"grad_norm": 40.81637954711914,
|
|
"learning_rate": 1.5844174161530206e-06,
|
|
"loss": 0.5495,
|
|
"num_input_tokens_seen": 18279936,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.3716151334741694,
|
|
"grad_norm": 24.28744888305664,
|
|
"learning_rate": 1.5835103594583382e-06,
|
|
"loss": 0.4039,
|
|
"num_input_tokens_seen": 18295488,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"epoch": 0.3719352154151463,
|
|
"grad_norm": 25.278915405273438,
|
|
"learning_rate": 1.5826025742441207e-06,
|
|
"loss": 0.5329,
|
|
"num_input_tokens_seen": 18311360,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 0.37225529735612317,
|
|
"grad_norm": 25.298076629638672,
|
|
"learning_rate": 1.5816940616437486e-06,
|
|
"loss": 0.4284,
|
|
"num_input_tokens_seen": 18326592,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"epoch": 0.37257537929710005,
|
|
"grad_norm": 32.25617599487305,
|
|
"learning_rate": 1.5807848227915108e-06,
|
|
"loss": 0.3573,
|
|
"num_input_tokens_seen": 18344000,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 0.37289546123807693,
|
|
"grad_norm": 61.83903503417969,
|
|
"learning_rate": 1.5798748588226028e-06,
|
|
"loss": 0.4787,
|
|
"num_input_tokens_seen": 18359872,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"epoch": 0.3732155431790538,
|
|
"grad_norm": 42.77378463745117,
|
|
"learning_rate": 1.578964170873125e-06,
|
|
"loss": 0.4776,
|
|
"num_input_tokens_seen": 18374400,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 0.37353562512003075,
|
|
"grad_norm": 19.963783264160156,
|
|
"learning_rate": 1.5780527600800816e-06,
|
|
"loss": 0.2927,
|
|
"num_input_tokens_seen": 18390656,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"epoch": 0.3738557070610076,
|
|
"grad_norm": 63.39997100830078,
|
|
"learning_rate": 1.5771406275813808e-06,
|
|
"loss": 0.4476,
|
|
"num_input_tokens_seen": 18406400,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 0.3741757890019845,
|
|
"grad_norm": 51.011985778808594,
|
|
"learning_rate": 1.5762277745158297e-06,
|
|
"loss": 0.5497,
|
|
"num_input_tokens_seen": 18422848,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"epoch": 0.3744958709429614,
|
|
"grad_norm": 70.113525390625,
|
|
"learning_rate": 1.5753142020231365e-06,
|
|
"loss": 0.4932,
|
|
"num_input_tokens_seen": 18438912,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 0.37481595288393826,
|
|
"grad_norm": 43.822303771972656,
|
|
"learning_rate": 1.5743999112439073e-06,
|
|
"loss": 0.525,
|
|
"num_input_tokens_seen": 18455488,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"epoch": 0.3751360348249152,
|
|
"grad_norm": 36.65006637573242,
|
|
"learning_rate": 1.5734849033196446e-06,
|
|
"loss": 0.3954,
|
|
"num_input_tokens_seen": 18470080,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 0.3754561167658921,
|
|
"grad_norm": 42.59208297729492,
|
|
"learning_rate": 1.5725691793927468e-06,
|
|
"loss": 0.4337,
|
|
"num_input_tokens_seen": 18484480,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"epoch": 0.37577619870686896,
|
|
"grad_norm": 23.022443771362305,
|
|
"learning_rate": 1.5716527406065057e-06,
|
|
"loss": 0.46,
|
|
"num_input_tokens_seen": 18501312,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 0.37609628064784584,
|
|
"grad_norm": 25.66585350036621,
|
|
"learning_rate": 1.570735588105106e-06,
|
|
"loss": 0.449,
|
|
"num_input_tokens_seen": 18515968,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"epoch": 0.3764163625888227,
|
|
"grad_norm": 16.919160842895508,
|
|
"learning_rate": 1.5698177230336234e-06,
|
|
"loss": 0.3901,
|
|
"num_input_tokens_seen": 18531200,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 0.37673644452979965,
|
|
"grad_norm": 31.569171905517578,
|
|
"learning_rate": 1.568899146538023e-06,
|
|
"loss": 0.2699,
|
|
"num_input_tokens_seen": 18547712,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"epoch": 0.37705652647077653,
|
|
"grad_norm": 28.067827224731445,
|
|
"learning_rate": 1.5679798597651587e-06,
|
|
"loss": 0.4111,
|
|
"num_input_tokens_seen": 18562752,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 0.3773766084117534,
|
|
"grad_norm": 40.843196868896484,
|
|
"learning_rate": 1.5670598638627706e-06,
|
|
"loss": 0.4265,
|
|
"num_input_tokens_seen": 18578368,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"epoch": 0.3776966903527303,
|
|
"grad_norm": 40.625885009765625,
|
|
"learning_rate": 1.5661391599794847e-06,
|
|
"loss": 0.3882,
|
|
"num_input_tokens_seen": 18593408,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.37801677229370717,
|
|
"grad_norm": 28.81650161743164,
|
|
"learning_rate": 1.56521774926481e-06,
|
|
"loss": 0.4155,
|
|
"num_input_tokens_seen": 18607872,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"epoch": 0.3783368542346841,
|
|
"grad_norm": 25.494752883911133,
|
|
"learning_rate": 1.5642956328691393e-06,
|
|
"loss": 0.359,
|
|
"num_input_tokens_seen": 18624000,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 0.378656936175661,
|
|
"grad_norm": 52.101295471191406,
|
|
"learning_rate": 1.5633728119437451e-06,
|
|
"loss": 0.564,
|
|
"num_input_tokens_seen": 18640704,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"epoch": 0.37897701811663786,
|
|
"grad_norm": 28.191926956176758,
|
|
"learning_rate": 1.5624492876407807e-06,
|
|
"loss": 0.4568,
|
|
"num_input_tokens_seen": 18658368,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 0.37929710005761474,
|
|
"grad_norm": 46.10580825805664,
|
|
"learning_rate": 1.5615250611132766e-06,
|
|
"loss": 0.4087,
|
|
"num_input_tokens_seen": 18675584,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"epoch": 0.3796171819985916,
|
|
"grad_norm": 23.61751365661621,
|
|
"learning_rate": 1.5606001335151405e-06,
|
|
"loss": 0.5669,
|
|
"num_input_tokens_seen": 18691904,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 0.3799372639395685,
|
|
"grad_norm": 33.682106018066406,
|
|
"learning_rate": 1.5596745060011561e-06,
|
|
"loss": 0.3744,
|
|
"num_input_tokens_seen": 18708736,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"epoch": 0.38025734588054544,
|
|
"grad_norm": 35.933292388916016,
|
|
"learning_rate": 1.5587481797269793e-06,
|
|
"loss": 0.3464,
|
|
"num_input_tokens_seen": 18724032,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 0.3805774278215223,
|
|
"grad_norm": 38.045902252197266,
|
|
"learning_rate": 1.5578211558491396e-06,
|
|
"loss": 0.4203,
|
|
"num_input_tokens_seen": 18740352,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"epoch": 0.3808975097624992,
|
|
"grad_norm": 24.26993751525879,
|
|
"learning_rate": 1.5568934355250375e-06,
|
|
"loss": 0.3225,
|
|
"num_input_tokens_seen": 18754560,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 0.3812175917034761,
|
|
"grad_norm": 67.30828094482422,
|
|
"learning_rate": 1.5559650199129423e-06,
|
|
"loss": 0.6491,
|
|
"num_input_tokens_seen": 18769280,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"epoch": 0.38153767364445296,
|
|
"grad_norm": 46.19745635986328,
|
|
"learning_rate": 1.5550359101719921e-06,
|
|
"loss": 0.4012,
|
|
"num_input_tokens_seen": 18784512,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 0.3818577555854299,
|
|
"grad_norm": 62.694427490234375,
|
|
"learning_rate": 1.554106107462191e-06,
|
|
"loss": 0.3561,
|
|
"num_input_tokens_seen": 18800384,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"epoch": 0.38217783752640677,
|
|
"grad_norm": 43.13536834716797,
|
|
"learning_rate": 1.5531756129444092e-06,
|
|
"loss": 0.4248,
|
|
"num_input_tokens_seen": 18815552,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 0.38249791946738365,
|
|
"grad_norm": 23.844327926635742,
|
|
"learning_rate": 1.5522444277803796e-06,
|
|
"loss": 0.3884,
|
|
"num_input_tokens_seen": 18830080,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"epoch": 0.38281800140836053,
|
|
"grad_norm": 30.173629760742188,
|
|
"learning_rate": 1.5513125531326976e-06,
|
|
"loss": 0.4319,
|
|
"num_input_tokens_seen": 18846272,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 0.3831380833493374,
|
|
"grad_norm": 29.421924591064453,
|
|
"learning_rate": 1.5503799901648198e-06,
|
|
"loss": 0.3747,
|
|
"num_input_tokens_seen": 18860928,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"epoch": 0.38345816529031435,
|
|
"grad_norm": 61.6126594543457,
|
|
"learning_rate": 1.5494467400410625e-06,
|
|
"loss": 0.4553,
|
|
"num_input_tokens_seen": 18877120,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 0.3837782472312912,
|
|
"grad_norm": 50.92166519165039,
|
|
"learning_rate": 1.5485128039265986e-06,
|
|
"loss": 0.6017,
|
|
"num_input_tokens_seen": 18892224,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"epoch": 0.3840983291722681,
|
|
"grad_norm": 51.964595794677734,
|
|
"learning_rate": 1.547578182987459e-06,
|
|
"loss": 0.4408,
|
|
"num_input_tokens_seen": 18907008,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.384418411113245,
|
|
"grad_norm": 21.846920013427734,
|
|
"learning_rate": 1.5466428783905286e-06,
|
|
"loss": 0.2736,
|
|
"num_input_tokens_seen": 18922368,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"epoch": 0.38473849305422186,
|
|
"grad_norm": 30.069700241088867,
|
|
"learning_rate": 1.5457068913035463e-06,
|
|
"loss": 0.4288,
|
|
"num_input_tokens_seen": 18937536,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 0.38505857499519874,
|
|
"grad_norm": 40.16860580444336,
|
|
"learning_rate": 1.544770222895103e-06,
|
|
"loss": 0.4784,
|
|
"num_input_tokens_seen": 18954048,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"epoch": 0.3853786569361757,
|
|
"grad_norm": 30.41385269165039,
|
|
"learning_rate": 1.5438328743346398e-06,
|
|
"loss": 0.5188,
|
|
"num_input_tokens_seen": 18969472,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 0.38569873887715256,
|
|
"grad_norm": 22.75130844116211,
|
|
"learning_rate": 1.5428948467924478e-06,
|
|
"loss": 0.4098,
|
|
"num_input_tokens_seen": 18983872,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"epoch": 0.38601882081812944,
|
|
"grad_norm": 20.55361557006836,
|
|
"learning_rate": 1.5419561414396656e-06,
|
|
"loss": 0.3223,
|
|
"num_input_tokens_seen": 18999360,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 0.3863389027591063,
|
|
"grad_norm": 23.010210037231445,
|
|
"learning_rate": 1.541016759448277e-06,
|
|
"loss": 0.4888,
|
|
"num_input_tokens_seen": 19015424,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"epoch": 0.3866589847000832,
|
|
"grad_norm": 30.879016876220703,
|
|
"learning_rate": 1.5400767019911124e-06,
|
|
"loss": 0.3641,
|
|
"num_input_tokens_seen": 19031616,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 0.38697906664106013,
|
|
"grad_norm": 31.967321395874023,
|
|
"learning_rate": 1.539135970241844e-06,
|
|
"loss": 0.4821,
|
|
"num_input_tokens_seen": 19047040,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"epoch": 0.387299148582037,
|
|
"grad_norm": 50.726158142089844,
|
|
"learning_rate": 1.5381945653749866e-06,
|
|
"loss": 0.479,
|
|
"num_input_tokens_seen": 19062848,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 0.3876192305230139,
|
|
"grad_norm": 80.43476867675781,
|
|
"learning_rate": 1.5372524885658952e-06,
|
|
"loss": 0.5564,
|
|
"num_input_tokens_seen": 19078976,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"epoch": 0.38793931246399077,
|
|
"grad_norm": 24.717586517333984,
|
|
"learning_rate": 1.5363097409907638e-06,
|
|
"loss": 0.3676,
|
|
"num_input_tokens_seen": 19093632,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 0.38825939440496765,
|
|
"grad_norm": 22.33540916442871,
|
|
"learning_rate": 1.535366323826624e-06,
|
|
"loss": 0.3605,
|
|
"num_input_tokens_seen": 19109056,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"epoch": 0.3885794763459446,
|
|
"grad_norm": 46.442413330078125,
|
|
"learning_rate": 1.534422238251343e-06,
|
|
"loss": 0.3699,
|
|
"num_input_tokens_seen": 19124544,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 0.38889955828692147,
|
|
"grad_norm": 33.82103729248047,
|
|
"learning_rate": 1.5334774854436223e-06,
|
|
"loss": 0.3834,
|
|
"num_input_tokens_seen": 19140480,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"epoch": 0.38921964022789834,
|
|
"grad_norm": 41.09638214111328,
|
|
"learning_rate": 1.5325320665829975e-06,
|
|
"loss": 0.3776,
|
|
"num_input_tokens_seen": 19156736,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 0.3895397221688752,
|
|
"grad_norm": 31.53407096862793,
|
|
"learning_rate": 1.5315859828498352e-06,
|
|
"loss": 0.4455,
|
|
"num_input_tokens_seen": 19171520,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"epoch": 0.3898598041098521,
|
|
"grad_norm": 31.16860580444336,
|
|
"learning_rate": 1.5306392354253316e-06,
|
|
"loss": 0.4921,
|
|
"num_input_tokens_seen": 19187136,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 0.39017988605082904,
|
|
"grad_norm": 23.219755172729492,
|
|
"learning_rate": 1.5296918254915123e-06,
|
|
"loss": 0.4377,
|
|
"num_input_tokens_seen": 19201856,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"epoch": 0.3904999679918059,
|
|
"grad_norm": 26.253602981567383,
|
|
"learning_rate": 1.5287437542312296e-06,
|
|
"loss": 0.3869,
|
|
"num_input_tokens_seen": 19216704,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.3908200499327828,
|
|
"grad_norm": 61.03850173950195,
|
|
"learning_rate": 1.5277950228281614e-06,
|
|
"loss": 0.5316,
|
|
"num_input_tokens_seen": 19233408,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"epoch": 0.3911401318737597,
|
|
"grad_norm": 26.556734085083008,
|
|
"learning_rate": 1.52684563246681e-06,
|
|
"loss": 0.354,
|
|
"num_input_tokens_seen": 19250048,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 0.39146021381473656,
|
|
"grad_norm": 16.79180335998535,
|
|
"learning_rate": 1.5258955843325015e-06,
|
|
"loss": 0.4243,
|
|
"num_input_tokens_seen": 19266560,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"epoch": 0.39178029575571344,
|
|
"grad_norm": 58.60289764404297,
|
|
"learning_rate": 1.5249448796113804e-06,
|
|
"loss": 0.4885,
|
|
"num_input_tokens_seen": 19281408,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 0.39210037769669037,
|
|
"grad_norm": 47.47416687011719,
|
|
"learning_rate": 1.5239935194904141e-06,
|
|
"loss": 0.4747,
|
|
"num_input_tokens_seen": 19296384,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"epoch": 0.39242045963766725,
|
|
"grad_norm": 24.381053924560547,
|
|
"learning_rate": 1.523041505157386e-06,
|
|
"loss": 0.3702,
|
|
"num_input_tokens_seen": 19312000,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 0.39274054157864413,
|
|
"grad_norm": 25.145042419433594,
|
|
"learning_rate": 1.5220888378008977e-06,
|
|
"loss": 0.3909,
|
|
"num_input_tokens_seen": 19327488,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"epoch": 0.393060623519621,
|
|
"grad_norm": 22.552824020385742,
|
|
"learning_rate": 1.5211355186103654e-06,
|
|
"loss": 0.4661,
|
|
"num_input_tokens_seen": 19342080,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 0.3933807054605979,
|
|
"grad_norm": 50.69114303588867,
|
|
"learning_rate": 1.5201815487760192e-06,
|
|
"loss": 0.4126,
|
|
"num_input_tokens_seen": 19358336,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"epoch": 0.3937007874015748,
|
|
"grad_norm": 92.56407165527344,
|
|
"learning_rate": 1.5192269294889019e-06,
|
|
"loss": 0.508,
|
|
"num_input_tokens_seen": 19373376,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 0.3940208693425517,
|
|
"grad_norm": 28.557926177978516,
|
|
"learning_rate": 1.5182716619408666e-06,
|
|
"loss": 0.4029,
|
|
"num_input_tokens_seen": 19388608,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"epoch": 0.3943409512835286,
|
|
"grad_norm": 31.255754470825195,
|
|
"learning_rate": 1.5173157473245764e-06,
|
|
"loss": 0.5398,
|
|
"num_input_tokens_seen": 19403264,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 0.39466103322450546,
|
|
"grad_norm": 36.93677520751953,
|
|
"learning_rate": 1.5163591868335016e-06,
|
|
"loss": 0.4363,
|
|
"num_input_tokens_seen": 19418816,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"epoch": 0.39498111516548234,
|
|
"grad_norm": 39.658329010009766,
|
|
"learning_rate": 1.515401981661919e-06,
|
|
"loss": 0.5781,
|
|
"num_input_tokens_seen": 19435392,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 0.3953011971064593,
|
|
"grad_norm": 32.506134033203125,
|
|
"learning_rate": 1.514444133004911e-06,
|
|
"loss": 0.4592,
|
|
"num_input_tokens_seen": 19450048,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"epoch": 0.39562127904743616,
|
|
"grad_norm": 30.993446350097656,
|
|
"learning_rate": 1.5134856420583631e-06,
|
|
"loss": 0.4592,
|
|
"num_input_tokens_seen": 19466368,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 0.39594136098841304,
|
|
"grad_norm": 23.403287887573242,
|
|
"learning_rate": 1.5125265100189614e-06,
|
|
"loss": 0.3338,
|
|
"num_input_tokens_seen": 19482624,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"epoch": 0.3962614429293899,
|
|
"grad_norm": 32.384483337402344,
|
|
"learning_rate": 1.5115667380841948e-06,
|
|
"loss": 0.5304,
|
|
"num_input_tokens_seen": 19498048,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 0.3965815248703668,
|
|
"grad_norm": 19.235095977783203,
|
|
"learning_rate": 1.510606327452349e-06,
|
|
"loss": 0.43,
|
|
"num_input_tokens_seen": 19515264,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"epoch": 0.3969016068113437,
|
|
"grad_norm": 34.2067985534668,
|
|
"learning_rate": 1.5096452793225082e-06,
|
|
"loss": 0.4319,
|
|
"num_input_tokens_seen": 19533056,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.3972216887523206,
|
|
"grad_norm": 30.670093536376953,
|
|
"learning_rate": 1.5086835948945522e-06,
|
|
"loss": 0.4003,
|
|
"num_input_tokens_seen": 19548480,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"epoch": 0.3975417706932975,
|
|
"grad_norm": 29.265615463256836,
|
|
"learning_rate": 1.5077212753691556e-06,
|
|
"loss": 0.3271,
|
|
"num_input_tokens_seen": 19563712,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 0.39786185263427437,
|
|
"grad_norm": 35.499732971191406,
|
|
"learning_rate": 1.5067583219477852e-06,
|
|
"loss": 0.4049,
|
|
"num_input_tokens_seen": 19578624,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"epoch": 0.39818193457525125,
|
|
"grad_norm": 30.449113845825195,
|
|
"learning_rate": 1.5057947358327e-06,
|
|
"loss": 0.3916,
|
|
"num_input_tokens_seen": 19593408,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 0.39850201651622813,
|
|
"grad_norm": 37.85767364501953,
|
|
"learning_rate": 1.504830518226948e-06,
|
|
"loss": 0.4907,
|
|
"num_input_tokens_seen": 19609216,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"epoch": 0.39882209845720507,
|
|
"grad_norm": 19.524030685424805,
|
|
"learning_rate": 1.5038656703343672e-06,
|
|
"loss": 0.449,
|
|
"num_input_tokens_seen": 19624896,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 0.39914218039818194,
|
|
"grad_norm": 76.64604949951172,
|
|
"learning_rate": 1.5029001933595805e-06,
|
|
"loss": 0.4925,
|
|
"num_input_tokens_seen": 19640128,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"epoch": 0.3994622623391588,
|
|
"grad_norm": 32.2121696472168,
|
|
"learning_rate": 1.501934088507998e-06,
|
|
"loss": 0.3433,
|
|
"num_input_tokens_seen": 19655680,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 0.3997823442801357,
|
|
"grad_norm": 32.78192901611328,
|
|
"learning_rate": 1.5009673569858126e-06,
|
|
"loss": 0.6227,
|
|
"num_input_tokens_seen": 19672192,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"epoch": 0.4001024262211126,
|
|
"grad_norm": 46.18693542480469,
|
|
"learning_rate": 1.5e-06,
|
|
"loss": 0.5284,
|
|
"num_input_tokens_seen": 19688896,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 0.4004225081620895,
|
|
"grad_norm": 18.203367233276367,
|
|
"learning_rate": 1.4990320187583167e-06,
|
|
"loss": 0.3547,
|
|
"num_input_tokens_seen": 19704128,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"epoch": 0.4004865245502849,
|
|
"eval_loss": 0.42333245277404785,
|
|
"eval_runtime": 49.177,
|
|
"eval_samples_per_second": 282.368,
|
|
"eval_steps_per_second": 35.301,
|
|
"num_input_tokens_seen": 19707456,
|
|
"step": 6256
|
|
},
|
|
{
|
|
"epoch": 0.4007425901030664,
|
|
"grad_norm": 34.608970642089844,
|
|
"learning_rate": 1.4980634144692986e-06,
|
|
"loss": 0.395,
|
|
"num_input_tokens_seen": 19719744,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 0.4010626720440433,
|
|
"grad_norm": 48.07910919189453,
|
|
"learning_rate": 1.4970941883422599e-06,
|
|
"loss": 0.3795,
|
|
"num_input_tokens_seen": 19736128,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"epoch": 0.40138275398502016,
|
|
"grad_norm": 25.680130004882812,
|
|
"learning_rate": 1.4961243415872901e-06,
|
|
"loss": 0.4165,
|
|
"num_input_tokens_seen": 19751296,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 0.40170283592599704,
|
|
"grad_norm": 64.0484848022461,
|
|
"learning_rate": 1.4951538754152551e-06,
|
|
"loss": 0.4057,
|
|
"num_input_tokens_seen": 19765888,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"epoch": 0.402022917866974,
|
|
"grad_norm": 29.654808044433594,
|
|
"learning_rate": 1.4941827910377925e-06,
|
|
"loss": 0.4205,
|
|
"num_input_tokens_seen": 19780864,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 0.40234299980795085,
|
|
"grad_norm": 23.910985946655273,
|
|
"learning_rate": 1.4932110896673131e-06,
|
|
"loss": 0.4014,
|
|
"num_input_tokens_seen": 19796864,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"epoch": 0.40266308174892773,
|
|
"grad_norm": 29.215768814086914,
|
|
"learning_rate": 1.4922387725169973e-06,
|
|
"loss": 0.5395,
|
|
"num_input_tokens_seen": 19811904,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 0.4029831636899046,
|
|
"grad_norm": 33.94330596923828,
|
|
"learning_rate": 1.4912658408007947e-06,
|
|
"loss": 0.4049,
|
|
"num_input_tokens_seen": 19827456,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"epoch": 0.4033032456308815,
|
|
"grad_norm": 33.57729721069336,
|
|
"learning_rate": 1.4902922957334215e-06,
|
|
"loss": 0.4269,
|
|
"num_input_tokens_seen": 19842496,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.40362332757185837,
|
|
"grad_norm": 43.49477005004883,
|
|
"learning_rate": 1.4893181385303608e-06,
|
|
"loss": 0.408,
|
|
"num_input_tokens_seen": 19858240,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"epoch": 0.4039434095128353,
|
|
"grad_norm": 32.84989547729492,
|
|
"learning_rate": 1.4883433704078584e-06,
|
|
"loss": 0.3994,
|
|
"num_input_tokens_seen": 19874368,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 0.4042634914538122,
|
|
"grad_norm": 32.79706954956055,
|
|
"learning_rate": 1.4873679925829246e-06,
|
|
"loss": 0.3874,
|
|
"num_input_tokens_seen": 19891904,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"epoch": 0.40458357339478906,
|
|
"grad_norm": 21.430252075195312,
|
|
"learning_rate": 1.4863920062733298e-06,
|
|
"loss": 0.4077,
|
|
"num_input_tokens_seen": 19907392,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 0.40490365533576594,
|
|
"grad_norm": 46.1721305847168,
|
|
"learning_rate": 1.485415412697604e-06,
|
|
"loss": 0.3779,
|
|
"num_input_tokens_seen": 19922624,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"epoch": 0.4052237372767428,
|
|
"grad_norm": 36.21952438354492,
|
|
"learning_rate": 1.484438213075036e-06,
|
|
"loss": 0.4348,
|
|
"num_input_tokens_seen": 19939328,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 0.40554381921771976,
|
|
"grad_norm": 42.817806243896484,
|
|
"learning_rate": 1.4834604086256713e-06,
|
|
"loss": 0.4465,
|
|
"num_input_tokens_seen": 19955392,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"epoch": 0.40586390115869664,
|
|
"grad_norm": 35.81399154663086,
|
|
"learning_rate": 1.4824820005703097e-06,
|
|
"loss": 0.3818,
|
|
"num_input_tokens_seen": 19971520,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 0.4061839830996735,
|
|
"grad_norm": 22.910531997680664,
|
|
"learning_rate": 1.4815029901305061e-06,
|
|
"loss": 0.46,
|
|
"num_input_tokens_seen": 19988352,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"epoch": 0.4065040650406504,
|
|
"grad_norm": 29.75078010559082,
|
|
"learning_rate": 1.480523378528565e-06,
|
|
"loss": 0.4748,
|
|
"num_input_tokens_seen": 20005184,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 0.4068241469816273,
|
|
"grad_norm": 45.245052337646484,
|
|
"learning_rate": 1.4795431669875441e-06,
|
|
"loss": 0.4064,
|
|
"num_input_tokens_seen": 20020800,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"epoch": 0.4071442289226042,
|
|
"grad_norm": 29.282560348510742,
|
|
"learning_rate": 1.478562356731249e-06,
|
|
"loss": 0.472,
|
|
"num_input_tokens_seen": 20036416,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 0.4074643108635811,
|
|
"grad_norm": 37.65520095825195,
|
|
"learning_rate": 1.4775809489842326e-06,
|
|
"loss": 0.4525,
|
|
"num_input_tokens_seen": 20053184,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"epoch": 0.40778439280455797,
|
|
"grad_norm": 28.39930534362793,
|
|
"learning_rate": 1.4765989449717937e-06,
|
|
"loss": 0.3987,
|
|
"num_input_tokens_seen": 20069888,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 0.40810447474553485,
|
|
"grad_norm": 64.06832885742188,
|
|
"learning_rate": 1.4756163459199763e-06,
|
|
"loss": 0.5504,
|
|
"num_input_tokens_seen": 20085760,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"epoch": 0.40842455668651173,
|
|
"grad_norm": 34.247596740722656,
|
|
"learning_rate": 1.4746331530555665e-06,
|
|
"loss": 0.2742,
|
|
"num_input_tokens_seen": 20101056,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 0.4087446386274886,
|
|
"grad_norm": 41.40673828125,
|
|
"learning_rate": 1.4736493676060923e-06,
|
|
"loss": 0.4133,
|
|
"num_input_tokens_seen": 20116352,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"epoch": 0.40906472056846555,
|
|
"grad_norm": 20.095537185668945,
|
|
"learning_rate": 1.4726649907998216e-06,
|
|
"loss": 0.3642,
|
|
"num_input_tokens_seen": 20131712,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 0.4093848025094424,
|
|
"grad_norm": 30.422456741333008,
|
|
"learning_rate": 1.4716800238657599e-06,
|
|
"loss": 0.3759,
|
|
"num_input_tokens_seen": 20146880,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"epoch": 0.4097048844504193,
|
|
"grad_norm": 16.951066970825195,
|
|
"learning_rate": 1.4706944680336505e-06,
|
|
"loss": 0.2767,
|
|
"num_input_tokens_seen": 20163520,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 0.4100249663913962,
|
|
"grad_norm": 42.80522537231445,
|
|
"learning_rate": 1.469708324533971e-06,
|
|
"loss": 0.4681,
|
|
"num_input_tokens_seen": 20177984,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"epoch": 0.41034504833237306,
|
|
"grad_norm": 16.141464233398438,
|
|
"learning_rate": 1.4687215945979335e-06,
|
|
"loss": 0.3395,
|
|
"num_input_tokens_seen": 20193472,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 0.41066513027335,
|
|
"grad_norm": 42.42402267456055,
|
|
"learning_rate": 1.4677342794574815e-06,
|
|
"loss": 0.4507,
|
|
"num_input_tokens_seen": 20210624,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"epoch": 0.4109852122143269,
|
|
"grad_norm": 58.724464416503906,
|
|
"learning_rate": 1.4667463803452902e-06,
|
|
"loss": 0.4199,
|
|
"num_input_tokens_seen": 20226688,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 0.41130529415530376,
|
|
"grad_norm": 41.05823516845703,
|
|
"learning_rate": 1.4657578984947627e-06,
|
|
"loss": 0.4472,
|
|
"num_input_tokens_seen": 20244608,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"epoch": 0.41162537609628064,
|
|
"grad_norm": 36.066612243652344,
|
|
"learning_rate": 1.4647688351400303e-06,
|
|
"loss": 0.3699,
|
|
"num_input_tokens_seen": 20261184,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 0.4119454580372575,
|
|
"grad_norm": 21.72051239013672,
|
|
"learning_rate": 1.46377919151595e-06,
|
|
"loss": 0.3348,
|
|
"num_input_tokens_seen": 20276736,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"epoch": 0.41226553997823445,
|
|
"grad_norm": 29.512678146362305,
|
|
"learning_rate": 1.462788968858104e-06,
|
|
"loss": 0.4651,
|
|
"num_input_tokens_seen": 20293888,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 0.41258562191921133,
|
|
"grad_norm": 20.376218795776367,
|
|
"learning_rate": 1.4617981684027966e-06,
|
|
"loss": 0.482,
|
|
"num_input_tokens_seen": 20309696,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"epoch": 0.4129057038601882,
|
|
"grad_norm": 19.641904830932617,
|
|
"learning_rate": 1.4608067913870536e-06,
|
|
"loss": 0.4013,
|
|
"num_input_tokens_seen": 20325632,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 0.4132257858011651,
|
|
"grad_norm": 22.1761474609375,
|
|
"learning_rate": 1.4598148390486213e-06,
|
|
"loss": 0.3968,
|
|
"num_input_tokens_seen": 20341888,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"epoch": 0.41354586774214197,
|
|
"grad_norm": 30.586984634399414,
|
|
"learning_rate": 1.4588223126259639e-06,
|
|
"loss": 0.5073,
|
|
"num_input_tokens_seen": 20358656,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 0.4138659496831189,
|
|
"grad_norm": 15.245569229125977,
|
|
"learning_rate": 1.4578292133582615e-06,
|
|
"loss": 0.3245,
|
|
"num_input_tokens_seen": 20372864,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"epoch": 0.4141860316240958,
|
|
"grad_norm": 27.139429092407227,
|
|
"learning_rate": 1.456835542485411e-06,
|
|
"loss": 0.3954,
|
|
"num_input_tokens_seen": 20387840,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 0.41450611356507266,
|
|
"grad_norm": 32.64242172241211,
|
|
"learning_rate": 1.4558413012480215e-06,
|
|
"loss": 0.4092,
|
|
"num_input_tokens_seen": 20404736,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"epoch": 0.41482619550604954,
|
|
"grad_norm": 37.946998596191406,
|
|
"learning_rate": 1.4548464908874156e-06,
|
|
"loss": 0.5673,
|
|
"num_input_tokens_seen": 20422848,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 0.4151462774470264,
|
|
"grad_norm": 31.876144409179688,
|
|
"learning_rate": 1.4538511126456255e-06,
|
|
"loss": 0.3996,
|
|
"num_input_tokens_seen": 20438016,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"epoch": 0.4154663593880033,
|
|
"grad_norm": 54.237831115722656,
|
|
"learning_rate": 1.452855167765392e-06,
|
|
"loss": 0.5913,
|
|
"num_input_tokens_seen": 20454464,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 0.41578644132898024,
|
|
"grad_norm": 24.1745548248291,
|
|
"learning_rate": 1.4518586574901647e-06,
|
|
"loss": 0.4487,
|
|
"num_input_tokens_seen": 20470464,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"epoch": 0.4161065232699571,
|
|
"grad_norm": 27.391712188720703,
|
|
"learning_rate": 1.450861583064098e-06,
|
|
"loss": 0.4617,
|
|
"num_input_tokens_seen": 20485696,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.416426605210934,
|
|
"grad_norm": 24.388179779052734,
|
|
"learning_rate": 1.4498639457320515e-06,
|
|
"loss": 0.3642,
|
|
"num_input_tokens_seen": 20500608,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"epoch": 0.4167466871519109,
|
|
"grad_norm": 34.60757827758789,
|
|
"learning_rate": 1.4488657467395865e-06,
|
|
"loss": 0.4686,
|
|
"num_input_tokens_seen": 20515776,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 0.41706676909288776,
|
|
"grad_norm": 38.96852493286133,
|
|
"learning_rate": 1.4478669873329663e-06,
|
|
"loss": 0.5078,
|
|
"num_input_tokens_seen": 20531456,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"epoch": 0.4173868510338647,
|
|
"grad_norm": 32.143882751464844,
|
|
"learning_rate": 1.4468676687591536e-06,
|
|
"loss": 0.386,
|
|
"num_input_tokens_seen": 20547200,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 0.41770693297484157,
|
|
"grad_norm": 28.233505249023438,
|
|
"learning_rate": 1.4458677922658104e-06,
|
|
"loss": 0.4358,
|
|
"num_input_tokens_seen": 20562560,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"epoch": 0.41802701491581845,
|
|
"grad_norm": 18.132049560546875,
|
|
"learning_rate": 1.444867359101293e-06,
|
|
"loss": 0.2798,
|
|
"num_input_tokens_seen": 20577344,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 0.41834709685679533,
|
|
"grad_norm": 35.095619201660156,
|
|
"learning_rate": 1.4438663705146545e-06,
|
|
"loss": 0.3529,
|
|
"num_input_tokens_seen": 20593088,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"epoch": 0.4186671787977722,
|
|
"grad_norm": 29.12217903137207,
|
|
"learning_rate": 1.442864827755641e-06,
|
|
"loss": 0.3589,
|
|
"num_input_tokens_seen": 20609792,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 0.41898726073874915,
|
|
"grad_norm": 18.185195922851562,
|
|
"learning_rate": 1.4418627320746901e-06,
|
|
"loss": 0.4407,
|
|
"num_input_tokens_seen": 20625280,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"epoch": 0.419307342679726,
|
|
"grad_norm": 31.992891311645508,
|
|
"learning_rate": 1.4408600847229304e-06,
|
|
"loss": 0.3854,
|
|
"num_input_tokens_seen": 20641984,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 0.4196274246207029,
|
|
"grad_norm": 31.761362075805664,
|
|
"learning_rate": 1.4398568869521782e-06,
|
|
"loss": 0.5281,
|
|
"num_input_tokens_seen": 20658240,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"epoch": 0.4199475065616798,
|
|
"grad_norm": 31.19809341430664,
|
|
"learning_rate": 1.4388531400149384e-06,
|
|
"loss": 0.3645,
|
|
"num_input_tokens_seen": 20673408,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 0.42026758850265666,
|
|
"grad_norm": 46.35468673706055,
|
|
"learning_rate": 1.4378488451644007e-06,
|
|
"loss": 0.3866,
|
|
"num_input_tokens_seen": 20688960,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"epoch": 0.42058767044363354,
|
|
"grad_norm": 25.512950897216797,
|
|
"learning_rate": 1.4368440036544386e-06,
|
|
"loss": 0.4049,
|
|
"num_input_tokens_seen": 20704768,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 0.4209077523846105,
|
|
"grad_norm": 43.326324462890625,
|
|
"learning_rate": 1.435838616739609e-06,
|
|
"loss": 0.4199,
|
|
"num_input_tokens_seen": 20719808,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"epoch": 0.42122783432558736,
|
|
"grad_norm": 35.062923431396484,
|
|
"learning_rate": 1.4348326856751493e-06,
|
|
"loss": 0.5392,
|
|
"num_input_tokens_seen": 20735680,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 0.42154791626656424,
|
|
"grad_norm": 27.509485244750977,
|
|
"learning_rate": 1.433826211716976e-06,
|
|
"loss": 0.3422,
|
|
"num_input_tokens_seen": 20750144,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"epoch": 0.4218679982075411,
|
|
"grad_norm": 33.31727981567383,
|
|
"learning_rate": 1.4328191961216835e-06,
|
|
"loss": 0.3966,
|
|
"num_input_tokens_seen": 20766016,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 0.422188080148518,
|
|
"grad_norm": 54.75680923461914,
|
|
"learning_rate": 1.4318116401465427e-06,
|
|
"loss": 0.4812,
|
|
"num_input_tokens_seen": 20782720,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"epoch": 0.42250816208949493,
|
|
"grad_norm": 29.398454666137695,
|
|
"learning_rate": 1.430803545049499e-06,
|
|
"loss": 0.388,
|
|
"num_input_tokens_seen": 20798208,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 0.4228282440304718,
|
|
"grad_norm": 16.673908233642578,
|
|
"learning_rate": 1.4297949120891716e-06,
|
|
"loss": 0.5652,
|
|
"num_input_tokens_seen": 20813056,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"epoch": 0.4231483259714487,
|
|
"grad_norm": 35.20106506347656,
|
|
"learning_rate": 1.4287857425248497e-06,
|
|
"loss": 0.4121,
|
|
"num_input_tokens_seen": 20828800,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 0.42346840791242557,
|
|
"grad_norm": 23.11936378479004,
|
|
"learning_rate": 1.427776037616494e-06,
|
|
"loss": 0.4974,
|
|
"num_input_tokens_seen": 20844736,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"epoch": 0.42378848985340245,
|
|
"grad_norm": 30.515439987182617,
|
|
"learning_rate": 1.4267657986247326e-06,
|
|
"loss": 0.3527,
|
|
"num_input_tokens_seen": 20860672,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 0.4241085717943794,
|
|
"grad_norm": 33.26582717895508,
|
|
"learning_rate": 1.425755026810861e-06,
|
|
"loss": 0.3746,
|
|
"num_input_tokens_seen": 20877184,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"epoch": 0.42442865373535626,
|
|
"grad_norm": 55.12078857421875,
|
|
"learning_rate": 1.4247437234368394e-06,
|
|
"loss": 0.4095,
|
|
"num_input_tokens_seen": 20894208,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 0.42474873567633314,
|
|
"grad_norm": 36.948524475097656,
|
|
"learning_rate": 1.423731889765292e-06,
|
|
"loss": 0.4001,
|
|
"num_input_tokens_seen": 20909696,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"epoch": 0.42506881761731,
|
|
"grad_norm": 15.861876487731934,
|
|
"learning_rate": 1.422719527059505e-06,
|
|
"loss": 0.3504,
|
|
"num_input_tokens_seen": 20926016,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 0.4253888995582869,
|
|
"grad_norm": 20.37615394592285,
|
|
"learning_rate": 1.4217066365834253e-06,
|
|
"loss": 0.3636,
|
|
"num_input_tokens_seen": 20941440,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"epoch": 0.42570898149926384,
|
|
"grad_norm": 34.968894958496094,
|
|
"learning_rate": 1.4206932196016586e-06,
|
|
"loss": 0.4406,
|
|
"num_input_tokens_seen": 20956352,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 0.4260290634402407,
|
|
"grad_norm": 54.73747634887695,
|
|
"learning_rate": 1.4196792773794672e-06,
|
|
"loss": 0.3928,
|
|
"num_input_tokens_seen": 20973056,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"epoch": 0.4263491453812176,
|
|
"grad_norm": 38.245426177978516,
|
|
"learning_rate": 1.418664811182771e-06,
|
|
"loss": 0.438,
|
|
"num_input_tokens_seen": 20989248,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 0.4266692273221945,
|
|
"grad_norm": 41.432498931884766,
|
|
"learning_rate": 1.417649822278142e-06,
|
|
"loss": 0.4836,
|
|
"num_input_tokens_seen": 21004096,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"epoch": 0.42698930926317136,
|
|
"grad_norm": 24.442115783691406,
|
|
"learning_rate": 1.4166343119328064e-06,
|
|
"loss": 0.4722,
|
|
"num_input_tokens_seen": 21020224,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 0.42730939120414824,
|
|
"grad_norm": 30.54802703857422,
|
|
"learning_rate": 1.4156182814146404e-06,
|
|
"loss": 0.4616,
|
|
"num_input_tokens_seen": 21035264,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"epoch": 0.42762947314512517,
|
|
"grad_norm": 19.643733978271484,
|
|
"learning_rate": 1.4146017319921701e-06,
|
|
"loss": 0.3497,
|
|
"num_input_tokens_seen": 21051904,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 0.42794955508610205,
|
|
"grad_norm": 31.077213287353516,
|
|
"learning_rate": 1.4135846649345695e-06,
|
|
"loss": 0.4215,
|
|
"num_input_tokens_seen": 21069504,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"epoch": 0.42826963702707893,
|
|
"grad_norm": 30.736148834228516,
|
|
"learning_rate": 1.4125670815116589e-06,
|
|
"loss": 0.427,
|
|
"num_input_tokens_seen": 21084288,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 0.4285897189680558,
|
|
"grad_norm": 28.045896530151367,
|
|
"learning_rate": 1.4115489829939025e-06,
|
|
"loss": 0.2926,
|
|
"num_input_tokens_seen": 21100544,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"epoch": 0.4289098009090327,
|
|
"grad_norm": 28.585994720458984,
|
|
"learning_rate": 1.4105303706524093e-06,
|
|
"loss": 0.4407,
|
|
"num_input_tokens_seen": 21116608,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 0.4292298828500096,
|
|
"grad_norm": 48.415164947509766,
|
|
"learning_rate": 1.4095112457589276e-06,
|
|
"loss": 0.5926,
|
|
"num_input_tokens_seen": 21131776,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"epoch": 0.4295499647909865,
|
|
"grad_norm": 28.813779830932617,
|
|
"learning_rate": 1.4084916095858477e-06,
|
|
"loss": 0.3962,
|
|
"num_input_tokens_seen": 21146368,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 0.4298700467319634,
|
|
"grad_norm": 30.74667739868164,
|
|
"learning_rate": 1.407471463406197e-06,
|
|
"loss": 0.4951,
|
|
"num_input_tokens_seen": 21162368,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"epoch": 0.43019012867294026,
|
|
"grad_norm": 28.847599029541016,
|
|
"learning_rate": 1.4064508084936399e-06,
|
|
"loss": 0.4329,
|
|
"num_input_tokens_seen": 21179008,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 0.43051021061391714,
|
|
"grad_norm": 30.80069351196289,
|
|
"learning_rate": 1.405429646122476e-06,
|
|
"loss": 0.5761,
|
|
"num_input_tokens_seen": 21196160,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"epoch": 0.4308302925548941,
|
|
"grad_norm": 20.700214385986328,
|
|
"learning_rate": 1.4044079775676392e-06,
|
|
"loss": 0.5175,
|
|
"num_input_tokens_seen": 21212032,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 0.43115037449587096,
|
|
"grad_norm": 22.982175827026367,
|
|
"learning_rate": 1.4033858041046936e-06,
|
|
"loss": 0.3659,
|
|
"num_input_tokens_seen": 21230272,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"epoch": 0.43147045643684784,
|
|
"grad_norm": 23.89682388305664,
|
|
"learning_rate": 1.4023631270098352e-06,
|
|
"loss": 0.3926,
|
|
"num_input_tokens_seen": 21245760,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 0.4317905383778247,
|
|
"grad_norm": 28.520267486572266,
|
|
"learning_rate": 1.4013399475598888e-06,
|
|
"loss": 0.3411,
|
|
"num_input_tokens_seen": 21260992,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"epoch": 0.4321106203188016,
|
|
"grad_norm": 22.041383743286133,
|
|
"learning_rate": 1.4003162670323056e-06,
|
|
"loss": 0.2807,
|
|
"num_input_tokens_seen": 21275136,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"epoch": 0.4324307022597785,
|
|
"grad_norm": 72.86239624023438,
|
|
"learning_rate": 1.3992920867051627e-06,
|
|
"loss": 0.5292,
|
|
"num_input_tokens_seen": 21290560,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"epoch": 0.4327507842007554,
|
|
"grad_norm": 43.2622184753418,
|
|
"learning_rate": 1.3982674078571614e-06,
|
|
"loss": 0.3525,
|
|
"num_input_tokens_seen": 21305536,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 0.4330708661417323,
|
|
"grad_norm": 25.79481315612793,
|
|
"learning_rate": 1.3972422317676252e-06,
|
|
"loss": 0.3785,
|
|
"num_input_tokens_seen": 21320576,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"epoch": 0.43339094808270917,
|
|
"grad_norm": 17.41854476928711,
|
|
"learning_rate": 1.3962165597164985e-06,
|
|
"loss": 0.367,
|
|
"num_input_tokens_seen": 21335680,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"epoch": 0.43371103002368605,
|
|
"grad_norm": 30.709115982055664,
|
|
"learning_rate": 1.395190392984345e-06,
|
|
"loss": 0.3496,
|
|
"num_input_tokens_seen": 21351808,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"epoch": 0.43403111196466293,
|
|
"grad_norm": 26.75821304321289,
|
|
"learning_rate": 1.3941637328523452e-06,
|
|
"loss": 0.4482,
|
|
"num_input_tokens_seen": 21366464,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 0.43435119390563987,
|
|
"grad_norm": 36.62665939331055,
|
|
"learning_rate": 1.3931365806022978e-06,
|
|
"loss": 0.3094,
|
|
"num_input_tokens_seen": 21383296,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"epoch": 0.43467127584661674,
|
|
"grad_norm": 38.72547912597656,
|
|
"learning_rate": 1.3921089375166131e-06,
|
|
"loss": 0.3178,
|
|
"num_input_tokens_seen": 21399616,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"epoch": 0.4349913577875936,
|
|
"grad_norm": 21.409557342529297,
|
|
"learning_rate": 1.391080804878316e-06,
|
|
"loss": 0.4475,
|
|
"num_input_tokens_seen": 21414848,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"epoch": 0.4353114397285705,
|
|
"grad_norm": 60.08560562133789,
|
|
"learning_rate": 1.3900521839710427e-06,
|
|
"loss": 0.3748,
|
|
"num_input_tokens_seen": 21430144,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 0.4356315216695474,
|
|
"grad_norm": 23.838533401489258,
|
|
"learning_rate": 1.3890230760790373e-06,
|
|
"loss": 0.3516,
|
|
"num_input_tokens_seen": 21445248,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"epoch": 0.4359516036105243,
|
|
"grad_norm": 95.79853820800781,
|
|
"learning_rate": 1.3879934824871544e-06,
|
|
"loss": 0.5972,
|
|
"num_input_tokens_seen": 21460544,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"epoch": 0.4362716855515012,
|
|
"grad_norm": 28.7491512298584,
|
|
"learning_rate": 1.3869634044808526e-06,
|
|
"loss": 0.4871,
|
|
"num_input_tokens_seen": 21476224,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"epoch": 0.4365917674924781,
|
|
"grad_norm": 38.301719665527344,
|
|
"learning_rate": 1.3859328433461971e-06,
|
|
"loss": 0.5996,
|
|
"num_input_tokens_seen": 21491712,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 0.43691184943345496,
|
|
"grad_norm": 67.00725555419922,
|
|
"learning_rate": 1.3849018003698553e-06,
|
|
"loss": 0.5784,
|
|
"num_input_tokens_seen": 21508928,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"epoch": 0.43723193137443184,
|
|
"grad_norm": 36.97137451171875,
|
|
"learning_rate": 1.3838702768390964e-06,
|
|
"loss": 0.415,
|
|
"num_input_tokens_seen": 21523648,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"epoch": 0.43755201331540877,
|
|
"grad_norm": 31.718050003051758,
|
|
"learning_rate": 1.38283827404179e-06,
|
|
"loss": 0.4777,
|
|
"num_input_tokens_seen": 21539264,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"epoch": 0.43787209525638565,
|
|
"grad_norm": 50.313236236572266,
|
|
"learning_rate": 1.381805793266403e-06,
|
|
"loss": 0.3776,
|
|
"num_input_tokens_seen": 21555520,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 0.43819217719736253,
|
|
"grad_norm": 35.556846618652344,
|
|
"learning_rate": 1.3807728358020009e-06,
|
|
"loss": 0.4517,
|
|
"num_input_tokens_seen": 21570112,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"epoch": 0.4385122591383394,
|
|
"grad_norm": 45.01139450073242,
|
|
"learning_rate": 1.3797394029382416e-06,
|
|
"loss": 0.3386,
|
|
"num_input_tokens_seen": 21584768,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"epoch": 0.4388323410793163,
|
|
"grad_norm": 22.66309928894043,
|
|
"learning_rate": 1.37870549596538e-06,
|
|
"loss": 0.2963,
|
|
"num_input_tokens_seen": 21599872,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"epoch": 0.43915242302029317,
|
|
"grad_norm": 19.721696853637695,
|
|
"learning_rate": 1.3776711161742595e-06,
|
|
"loss": 0.5262,
|
|
"num_input_tokens_seen": 21615808,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 0.4394725049612701,
|
|
"grad_norm": 27.445302963256836,
|
|
"learning_rate": 1.3766362648563166e-06,
|
|
"loss": 0.4639,
|
|
"num_input_tokens_seen": 21630656,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"epoch": 0.439792586902247,
|
|
"grad_norm": 62.023433685302734,
|
|
"learning_rate": 1.3756009433035744e-06,
|
|
"loss": 0.4073,
|
|
"num_input_tokens_seen": 21646976,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"epoch": 0.44011266884322386,
|
|
"grad_norm": 27.705705642700195,
|
|
"learning_rate": 1.3745651528086447e-06,
|
|
"loss": 0.5615,
|
|
"num_input_tokens_seen": 21665024,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"epoch": 0.44043275078420074,
|
|
"grad_norm": 15.181832313537598,
|
|
"learning_rate": 1.373528894664724e-06,
|
|
"loss": 0.4486,
|
|
"num_input_tokens_seen": 21680128,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 0.4407528327251776,
|
|
"grad_norm": 23.56239128112793,
|
|
"learning_rate": 1.3724921701655924e-06,
|
|
"loss": 0.3509,
|
|
"num_input_tokens_seen": 21695808,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"epoch": 0.44107291466615456,
|
|
"grad_norm": 15.783341407775879,
|
|
"learning_rate": 1.3714549806056125e-06,
|
|
"loss": 0.3155,
|
|
"num_input_tokens_seen": 21711936,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"epoch": 0.44139299660713144,
|
|
"grad_norm": 45.81801986694336,
|
|
"learning_rate": 1.3704173272797283e-06,
|
|
"loss": 0.4241,
|
|
"num_input_tokens_seen": 21727488,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"epoch": 0.4417130785481083,
|
|
"grad_norm": 39.76817321777344,
|
|
"learning_rate": 1.3693792114834619e-06,
|
|
"loss": 0.4386,
|
|
"num_input_tokens_seen": 21745280,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.4420331604890852,
|
|
"grad_norm": 26.435964584350586,
|
|
"learning_rate": 1.3683406345129129e-06,
|
|
"loss": 0.4684,
|
|
"num_input_tokens_seen": 21760000,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"epoch": 0.4423532424300621,
|
|
"grad_norm": 25.047027587890625,
|
|
"learning_rate": 1.3673015976647567e-06,
|
|
"loss": 0.4025,
|
|
"num_input_tokens_seen": 21775232,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"epoch": 0.442673324371039,
|
|
"grad_norm": 36.185760498046875,
|
|
"learning_rate": 1.3662621022362435e-06,
|
|
"loss": 0.3967,
|
|
"num_input_tokens_seen": 21790656,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"epoch": 0.4429934063120159,
|
|
"grad_norm": 53.022464752197266,
|
|
"learning_rate": 1.3652221495251952e-06,
|
|
"loss": 0.4654,
|
|
"num_input_tokens_seen": 21806336,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 0.44331348825299277,
|
|
"grad_norm": 26.99211883544922,
|
|
"learning_rate": 1.3641817408300049e-06,
|
|
"loss": 0.3204,
|
|
"num_input_tokens_seen": 21823744,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"epoch": 0.44363357019396965,
|
|
"grad_norm": 30.070894241333008,
|
|
"learning_rate": 1.3631408774496352e-06,
|
|
"loss": 0.5579,
|
|
"num_input_tokens_seen": 21839104,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"epoch": 0.44395365213494653,
|
|
"grad_norm": 26.091249465942383,
|
|
"learning_rate": 1.3620995606836165e-06,
|
|
"loss": 0.3566,
|
|
"num_input_tokens_seen": 21854528,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"epoch": 0.4442737340759234,
|
|
"grad_norm": 58.88991165161133,
|
|
"learning_rate": 1.3610577918320446e-06,
|
|
"loss": 0.6023,
|
|
"num_input_tokens_seen": 21870592,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 0.44459381601690035,
|
|
"grad_norm": 44.893310546875,
|
|
"learning_rate": 1.3600155721955802e-06,
|
|
"loss": 0.3743,
|
|
"num_input_tokens_seen": 21885696,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"epoch": 0.4449138979578772,
|
|
"grad_norm": 24.15410614013672,
|
|
"learning_rate": 1.3589729030754468e-06,
|
|
"loss": 0.3819,
|
|
"num_input_tokens_seen": 21901248,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"epoch": 0.4452339798988541,
|
|
"grad_norm": 28.12432861328125,
|
|
"learning_rate": 1.3579297857734293e-06,
|
|
"loss": 0.4341,
|
|
"num_input_tokens_seen": 21916352,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"epoch": 0.445554061839831,
|
|
"grad_norm": 17.1772518157959,
|
|
"learning_rate": 1.3568862215918717e-06,
|
|
"loss": 0.3365,
|
|
"num_input_tokens_seen": 21931072,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 0.44587414378080786,
|
|
"grad_norm": 32.58141326904297,
|
|
"learning_rate": 1.3558422118336762e-06,
|
|
"loss": 0.4944,
|
|
"num_input_tokens_seen": 21946752,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"epoch": 0.4461942257217848,
|
|
"grad_norm": 37.54017639160156,
|
|
"learning_rate": 1.354797757802301e-06,
|
|
"loss": 0.4804,
|
|
"num_input_tokens_seen": 21962176,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"epoch": 0.4465143076627617,
|
|
"grad_norm": 17.05492401123047,
|
|
"learning_rate": 1.3537528608017596e-06,
|
|
"loss": 0.392,
|
|
"num_input_tokens_seen": 21978496,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"epoch": 0.44683438960373856,
|
|
"grad_norm": 23.01466941833496,
|
|
"learning_rate": 1.352707522136618e-06,
|
|
"loss": 0.3973,
|
|
"num_input_tokens_seen": 21992576,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 0.44715447154471544,
|
|
"grad_norm": 17.1395206451416,
|
|
"learning_rate": 1.3516617431119934e-06,
|
|
"loss": 0.3998,
|
|
"num_input_tokens_seen": 22008000,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"epoch": 0.4474745534856923,
|
|
"grad_norm": 32.3569450378418,
|
|
"learning_rate": 1.350615525033554e-06,
|
|
"loss": 0.53,
|
|
"num_input_tokens_seen": 22022976,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"epoch": 0.44779463542666925,
|
|
"grad_norm": 25.158411026000977,
|
|
"learning_rate": 1.3495688692075144e-06,
|
|
"loss": 0.4027,
|
|
"num_input_tokens_seen": 22038144,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"epoch": 0.44811471736764613,
|
|
"grad_norm": 31.82624053955078,
|
|
"learning_rate": 1.3485217769406376e-06,
|
|
"loss": 0.3435,
|
|
"num_input_tokens_seen": 22054016,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.448434799308623,
|
|
"grad_norm": 30.720848083496094,
|
|
"learning_rate": 1.3474742495402303e-06,
|
|
"loss": 0.3605,
|
|
"num_input_tokens_seen": 22073920,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"epoch": 0.4487548812495999,
|
|
"grad_norm": 50.93308639526367,
|
|
"learning_rate": 1.3464262883141425e-06,
|
|
"loss": 0.4297,
|
|
"num_input_tokens_seen": 22089728,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"epoch": 0.44907496319057677,
|
|
"grad_norm": 36.83964538574219,
|
|
"learning_rate": 1.3453778945707663e-06,
|
|
"loss": 0.5687,
|
|
"num_input_tokens_seen": 22105344,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"epoch": 0.4493950451315537,
|
|
"grad_norm": 53.62667465209961,
|
|
"learning_rate": 1.3443290696190332e-06,
|
|
"loss": 0.4471,
|
|
"num_input_tokens_seen": 22121792,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 0.4497151270725306,
|
|
"grad_norm": 23.14280128479004,
|
|
"learning_rate": 1.343279814768414e-06,
|
|
"loss": 0.4034,
|
|
"num_input_tokens_seen": 22136128,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"epoch": 0.45003520901350746,
|
|
"grad_norm": 22.742084503173828,
|
|
"learning_rate": 1.3422301313289156e-06,
|
|
"loss": 0.38,
|
|
"num_input_tokens_seen": 22151936,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"epoch": 0.45035529095448434,
|
|
"grad_norm": 21.072940826416016,
|
|
"learning_rate": 1.34118002061108e-06,
|
|
"loss": 0.3794,
|
|
"num_input_tokens_seen": 22168128,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"epoch": 0.4505473401190705,
|
|
"eval_loss": 0.43158382177352905,
|
|
"eval_runtime": 49.1758,
|
|
"eval_samples_per_second": 282.375,
|
|
"eval_steps_per_second": 35.302,
|
|
"num_input_tokens_seen": 22178432,
|
|
"step": 7038
|
|
},
|
|
{
|
|
"epoch": 0.4506753728954612,
|
|
"grad_norm": 38.73175048828125,
|
|
"learning_rate": 1.3401294839259828e-06,
|
|
"loss": 0.4309,
|
|
"num_input_tokens_seen": 22184512,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 0.4509954548364381,
|
|
"grad_norm": 33.52423095703125,
|
|
"learning_rate": 1.3390785225852312e-06,
|
|
"loss": 0.54,
|
|
"num_input_tokens_seen": 22199872,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"epoch": 0.45131553677741504,
|
|
"grad_norm": 19.460634231567383,
|
|
"learning_rate": 1.3380271379009631e-06,
|
|
"loss": 0.4411,
|
|
"num_input_tokens_seen": 22216960,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"epoch": 0.4516356187183919,
|
|
"grad_norm": 19.555931091308594,
|
|
"learning_rate": 1.3369753311858442e-06,
|
|
"loss": 0.2615,
|
|
"num_input_tokens_seen": 22231488,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"epoch": 0.4519557006593688,
|
|
"grad_norm": 28.813966751098633,
|
|
"learning_rate": 1.3359231037530682e-06,
|
|
"loss": 0.4584,
|
|
"num_input_tokens_seen": 22246976,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 0.4522757826003457,
|
|
"grad_norm": 16.085895538330078,
|
|
"learning_rate": 1.3348704569163527e-06,
|
|
"loss": 0.4139,
|
|
"num_input_tokens_seen": 22263680,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"epoch": 0.45259586454132256,
|
|
"grad_norm": 18.5650691986084,
|
|
"learning_rate": 1.33381739198994e-06,
|
|
"loss": 0.3347,
|
|
"num_input_tokens_seen": 22279552,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"epoch": 0.4529159464822995,
|
|
"grad_norm": 19.012405395507812,
|
|
"learning_rate": 1.3327639102885938e-06,
|
|
"loss": 0.4436,
|
|
"num_input_tokens_seen": 22295296,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"epoch": 0.45323602842327637,
|
|
"grad_norm": 34.81302261352539,
|
|
"learning_rate": 1.3317100131275986e-06,
|
|
"loss": 0.3973,
|
|
"num_input_tokens_seen": 22310400,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 0.45355611036425325,
|
|
"grad_norm": 60.76240921020508,
|
|
"learning_rate": 1.3306557018227576e-06,
|
|
"loss": 0.492,
|
|
"num_input_tokens_seen": 22326848,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"epoch": 0.45387619230523013,
|
|
"grad_norm": 30.761585235595703,
|
|
"learning_rate": 1.3296009776903903e-06,
|
|
"loss": 0.47,
|
|
"num_input_tokens_seen": 22342592,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"epoch": 0.454196274246207,
|
|
"grad_norm": 29.366207122802734,
|
|
"learning_rate": 1.3285458420473323e-06,
|
|
"loss": 0.4386,
|
|
"num_input_tokens_seen": 22358912,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"epoch": 0.45451635618718395,
|
|
"grad_norm": 30.328184127807617,
|
|
"learning_rate": 1.3274902962109332e-06,
|
|
"loss": 0.3744,
|
|
"num_input_tokens_seen": 22374528,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 0.4548364381281608,
|
|
"grad_norm": 17.943153381347656,
|
|
"learning_rate": 1.3264343414990539e-06,
|
|
"loss": 0.3686,
|
|
"num_input_tokens_seen": 22389824,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"epoch": 0.4551565200691377,
|
|
"grad_norm": 35.101932525634766,
|
|
"learning_rate": 1.3253779792300663e-06,
|
|
"loss": 0.4148,
|
|
"num_input_tokens_seen": 22405376,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"epoch": 0.4554766020101146,
|
|
"grad_norm": 14.828371047973633,
|
|
"learning_rate": 1.3243212107228518e-06,
|
|
"loss": 0.3551,
|
|
"num_input_tokens_seen": 22420032,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"epoch": 0.45579668395109146,
|
|
"grad_norm": 15.68032169342041,
|
|
"learning_rate": 1.3232640372967974e-06,
|
|
"loss": 0.3909,
|
|
"num_input_tokens_seen": 22434688,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 0.45611676589206834,
|
|
"grad_norm": 51.65379333496094,
|
|
"learning_rate": 1.3222064602717974e-06,
|
|
"loss": 0.4645,
|
|
"num_input_tokens_seen": 22451072,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"epoch": 0.4564368478330453,
|
|
"grad_norm": 30.610668182373047,
|
|
"learning_rate": 1.321148480968248e-06,
|
|
"loss": 0.3488,
|
|
"num_input_tokens_seen": 22466688,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"epoch": 0.45675692977402216,
|
|
"grad_norm": 38.32967758178711,
|
|
"learning_rate": 1.3200901007070495e-06,
|
|
"loss": 0.4609,
|
|
"num_input_tokens_seen": 22482432,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"epoch": 0.45707701171499904,
|
|
"grad_norm": 42.44841003417969,
|
|
"learning_rate": 1.3190313208096022e-06,
|
|
"loss": 0.4616,
|
|
"num_input_tokens_seen": 22496960,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 0.4573970936559759,
|
|
"grad_norm": 62.05764389038086,
|
|
"learning_rate": 1.3179721425978048e-06,
|
|
"loss": 0.3617,
|
|
"num_input_tokens_seen": 22512256,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"epoch": 0.4577171755969528,
|
|
"grad_norm": 27.489582061767578,
|
|
"learning_rate": 1.3169125673940541e-06,
|
|
"loss": 0.4002,
|
|
"num_input_tokens_seen": 22528192,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"epoch": 0.45803725753792973,
|
|
"grad_norm": 23.193330764770508,
|
|
"learning_rate": 1.3158525965212422e-06,
|
|
"loss": 0.4126,
|
|
"num_input_tokens_seen": 22545408,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"epoch": 0.4583573394789066,
|
|
"grad_norm": 44.60530090332031,
|
|
"learning_rate": 1.3147922313027548e-06,
|
|
"loss": 0.5063,
|
|
"num_input_tokens_seen": 22560832,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 0.4586774214198835,
|
|
"grad_norm": 34.29766845703125,
|
|
"learning_rate": 1.3137314730624707e-06,
|
|
"loss": 0.3456,
|
|
"num_input_tokens_seen": 22577728,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"epoch": 0.45899750336086037,
|
|
"grad_norm": 59.20881652832031,
|
|
"learning_rate": 1.3126703231247588e-06,
|
|
"loss": 0.4722,
|
|
"num_input_tokens_seen": 22594112,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"epoch": 0.45931758530183725,
|
|
"grad_norm": 57.1280632019043,
|
|
"learning_rate": 1.3116087828144772e-06,
|
|
"loss": 0.3917,
|
|
"num_input_tokens_seen": 22609728,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"epoch": 0.4596376672428142,
|
|
"grad_norm": 24.825468063354492,
|
|
"learning_rate": 1.310546853456972e-06,
|
|
"loss": 0.4692,
|
|
"num_input_tokens_seen": 22624704,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 0.45995774918379106,
|
|
"grad_norm": 27.96169662475586,
|
|
"learning_rate": 1.3094845363780737e-06,
|
|
"loss": 0.3145,
|
|
"num_input_tokens_seen": 22640448,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"epoch": 0.46027783112476794,
|
|
"grad_norm": 26.550325393676758,
|
|
"learning_rate": 1.3084218329040976e-06,
|
|
"loss": 0.2277,
|
|
"num_input_tokens_seen": 22655680,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"epoch": 0.4605979130657448,
|
|
"grad_norm": 17.48622703552246,
|
|
"learning_rate": 1.3073587443618425e-06,
|
|
"loss": 0.3769,
|
|
"num_input_tokens_seen": 22672128,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"epoch": 0.4609179950067217,
|
|
"grad_norm": 60.23152542114258,
|
|
"learning_rate": 1.3062952720785861e-06,
|
|
"loss": 0.5418,
|
|
"num_input_tokens_seen": 22687104,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.4612380769476986,
|
|
"grad_norm": 48.24466323852539,
|
|
"learning_rate": 1.305231417382086e-06,
|
|
"loss": 0.3724,
|
|
"num_input_tokens_seen": 22702976,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"epoch": 0.4615581588886755,
|
|
"grad_norm": 34.0355224609375,
|
|
"learning_rate": 1.3041671816005777e-06,
|
|
"loss": 0.3522,
|
|
"num_input_tokens_seen": 22718464,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"epoch": 0.4618782408296524,
|
|
"grad_norm": 30.36563491821289,
|
|
"learning_rate": 1.3031025660627718e-06,
|
|
"loss": 0.3783,
|
|
"num_input_tokens_seen": 22734656,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"epoch": 0.4621983227706293,
|
|
"grad_norm": 38.3671989440918,
|
|
"learning_rate": 1.3020375720978534e-06,
|
|
"loss": 0.4376,
|
|
"num_input_tokens_seen": 22750016,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 0.46251840471160616,
|
|
"grad_norm": 32.97966003417969,
|
|
"learning_rate": 1.3009722010354799e-06,
|
|
"loss": 0.3855,
|
|
"num_input_tokens_seen": 22765632,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"epoch": 0.46283848665258304,
|
|
"grad_norm": 39.90695571899414,
|
|
"learning_rate": 1.2999064542057794e-06,
|
|
"loss": 0.4528,
|
|
"num_input_tokens_seen": 22781184,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"epoch": 0.46315856859355997,
|
|
"grad_norm": 31.27988624572754,
|
|
"learning_rate": 1.2988403329393495e-06,
|
|
"loss": 0.4842,
|
|
"num_input_tokens_seen": 22797248,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"epoch": 0.46347865053453685,
|
|
"grad_norm": 29.927885055541992,
|
|
"learning_rate": 1.2977738385672557e-06,
|
|
"loss": 0.4177,
|
|
"num_input_tokens_seen": 22812800,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 0.46379873247551373,
|
|
"grad_norm": 21.404644012451172,
|
|
"learning_rate": 1.2967069724210278e-06,
|
|
"loss": 0.4087,
|
|
"num_input_tokens_seen": 22827200,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"epoch": 0.4641188144164906,
|
|
"grad_norm": 31.973535537719727,
|
|
"learning_rate": 1.2956397358326609e-06,
|
|
"loss": 0.5265,
|
|
"num_input_tokens_seen": 22843264,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"epoch": 0.4644388963574675,
|
|
"grad_norm": 39.217674255371094,
|
|
"learning_rate": 1.294572130134613e-06,
|
|
"loss": 0.3799,
|
|
"num_input_tokens_seen": 22858624,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"epoch": 0.4647589782984444,
|
|
"grad_norm": 36.54713821411133,
|
|
"learning_rate": 1.2935041566598016e-06,
|
|
"loss": 0.5557,
|
|
"num_input_tokens_seen": 22873856,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 0.4650790602394213,
|
|
"grad_norm": 32.417545318603516,
|
|
"learning_rate": 1.2924358167416049e-06,
|
|
"loss": 0.356,
|
|
"num_input_tokens_seen": 22889600,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"epoch": 0.4653991421803982,
|
|
"grad_norm": 24.408979415893555,
|
|
"learning_rate": 1.2913671117138572e-06,
|
|
"loss": 0.4007,
|
|
"num_input_tokens_seen": 22904704,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"epoch": 0.46571922412137506,
|
|
"grad_norm": 22.64531898498535,
|
|
"learning_rate": 1.29029804291085e-06,
|
|
"loss": 0.3471,
|
|
"num_input_tokens_seen": 22920384,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"epoch": 0.46603930606235194,
|
|
"grad_norm": 44.77216339111328,
|
|
"learning_rate": 1.2892286116673269e-06,
|
|
"loss": 0.3475,
|
|
"num_input_tokens_seen": 22937024,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 0.4663593880033289,
|
|
"grad_norm": 26.58623695373535,
|
|
"learning_rate": 1.2881588193184865e-06,
|
|
"loss": 0.4934,
|
|
"num_input_tokens_seen": 22954816,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"epoch": 0.46667946994430576,
|
|
"grad_norm": 22.52194595336914,
|
|
"learning_rate": 1.287088667199977e-06,
|
|
"loss": 0.2918,
|
|
"num_input_tokens_seen": 22969472,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"epoch": 0.46699955188528264,
|
|
"grad_norm": 22.330564498901367,
|
|
"learning_rate": 1.2860181566478956e-06,
|
|
"loss": 0.4681,
|
|
"num_input_tokens_seen": 22984192,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"epoch": 0.4673196338262595,
|
|
"grad_norm": 13.149898529052734,
|
|
"learning_rate": 1.2849472889987874e-06,
|
|
"loss": 0.3868,
|
|
"num_input_tokens_seen": 22999680,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 0.4676397157672364,
|
|
"grad_norm": 27.509746551513672,
|
|
"learning_rate": 1.2838760655896431e-06,
|
|
"loss": 0.3784,
|
|
"num_input_tokens_seen": 23014720,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"epoch": 0.4679597977082133,
|
|
"grad_norm": 35.98652648925781,
|
|
"learning_rate": 1.2828044877578983e-06,
|
|
"loss": 0.4544,
|
|
"num_input_tokens_seen": 23030528,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"epoch": 0.4682798796491902,
|
|
"grad_norm": 26.335607528686523,
|
|
"learning_rate": 1.2817325568414297e-06,
|
|
"loss": 0.5205,
|
|
"num_input_tokens_seen": 23046784,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"epoch": 0.4685999615901671,
|
|
"grad_norm": 26.756956100463867,
|
|
"learning_rate": 1.2806602741785562e-06,
|
|
"loss": 0.3379,
|
|
"num_input_tokens_seen": 23061632,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 0.46892004353114397,
|
|
"grad_norm": 17.465469360351562,
|
|
"learning_rate": 1.2795876411080346e-06,
|
|
"loss": 0.3202,
|
|
"num_input_tokens_seen": 23077888,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"epoch": 0.46924012547212085,
|
|
"grad_norm": 24.94025993347168,
|
|
"learning_rate": 1.278514658969061e-06,
|
|
"loss": 0.3308,
|
|
"num_input_tokens_seen": 23093568,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"epoch": 0.46956020741309773,
|
|
"grad_norm": 29.178998947143555,
|
|
"learning_rate": 1.2774413291012648e-06,
|
|
"loss": 0.5047,
|
|
"num_input_tokens_seen": 23108992,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"epoch": 0.46988028935407467,
|
|
"grad_norm": 25.278213500976562,
|
|
"learning_rate": 1.2763676528447122e-06,
|
|
"loss": 0.4191,
|
|
"num_input_tokens_seen": 23124992,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 0.47020037129505154,
|
|
"grad_norm": 31.44306755065918,
|
|
"learning_rate": 1.2752936315399003e-06,
|
|
"loss": 0.3417,
|
|
"num_input_tokens_seen": 23141888,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"epoch": 0.4705204532360284,
|
|
"grad_norm": 27.29042625427246,
|
|
"learning_rate": 1.2742192665277566e-06,
|
|
"loss": 0.3346,
|
|
"num_input_tokens_seen": 23157888,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"epoch": 0.4708405351770053,
|
|
"grad_norm": 25.130107879638672,
|
|
"learning_rate": 1.2731445591496393e-06,
|
|
"loss": 0.2813,
|
|
"num_input_tokens_seen": 23172864,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"epoch": 0.4711606171179822,
|
|
"grad_norm": 45.540672302246094,
|
|
"learning_rate": 1.2720695107473325e-06,
|
|
"loss": 0.4622,
|
|
"num_input_tokens_seen": 23188352,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 0.4714806990589591,
|
|
"grad_norm": 38.563602447509766,
|
|
"learning_rate": 1.2709941226630475e-06,
|
|
"loss": 0.3897,
|
|
"num_input_tokens_seen": 23204096,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"epoch": 0.471800780999936,
|
|
"grad_norm": 27.982297897338867,
|
|
"learning_rate": 1.2699183962394182e-06,
|
|
"loss": 0.3513,
|
|
"num_input_tokens_seen": 23219072,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"epoch": 0.4721208629409129,
|
|
"grad_norm": 15.643006324768066,
|
|
"learning_rate": 1.2688423328195021e-06,
|
|
"loss": 0.4198,
|
|
"num_input_tokens_seen": 23234560,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"epoch": 0.47244094488188976,
|
|
"grad_norm": 62.19183349609375,
|
|
"learning_rate": 1.267765933746777e-06,
|
|
"loss": 0.3426,
|
|
"num_input_tokens_seen": 23250304,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 0.47276102682286664,
|
|
"grad_norm": 51.6485710144043,
|
|
"learning_rate": 1.2666892003651397e-06,
|
|
"loss": 0.6245,
|
|
"num_input_tokens_seen": 23265664,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"epoch": 0.4730811087638435,
|
|
"grad_norm": 28.73395538330078,
|
|
"learning_rate": 1.2656121340189043e-06,
|
|
"loss": 0.442,
|
|
"num_input_tokens_seen": 23281472,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"epoch": 0.47340119070482045,
|
|
"grad_norm": 28.408031463623047,
|
|
"learning_rate": 1.264534736052801e-06,
|
|
"loss": 0.411,
|
|
"num_input_tokens_seen": 23297024,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"epoch": 0.47372127264579733,
|
|
"grad_norm": 41.88270950317383,
|
|
"learning_rate": 1.2634570078119739e-06,
|
|
"loss": 0.4385,
|
|
"num_input_tokens_seen": 23313344,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 0.4740413545867742,
|
|
"grad_norm": 27.301424026489258,
|
|
"learning_rate": 1.262378950641979e-06,
|
|
"loss": 0.5213,
|
|
"num_input_tokens_seen": 23328512,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"epoch": 0.4743614365277511,
|
|
"grad_norm": 23.59923553466797,
|
|
"learning_rate": 1.2613005658887836e-06,
|
|
"loss": 0.4465,
|
|
"num_input_tokens_seen": 23342400,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"epoch": 0.47468151846872797,
|
|
"grad_norm": 34.58885192871094,
|
|
"learning_rate": 1.2602218548987637e-06,
|
|
"loss": 0.4134,
|
|
"num_input_tokens_seen": 23358400,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"epoch": 0.4750016004097049,
|
|
"grad_norm": 32.09384536743164,
|
|
"learning_rate": 1.2591428190187029e-06,
|
|
"loss": 0.4102,
|
|
"num_input_tokens_seen": 23373376,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 0.4753216823506818,
|
|
"grad_norm": 57.16767501831055,
|
|
"learning_rate": 1.2580634595957898e-06,
|
|
"loss": 0.5013,
|
|
"num_input_tokens_seen": 23390400,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"epoch": 0.47564176429165866,
|
|
"grad_norm": 27.278974533081055,
|
|
"learning_rate": 1.2569837779776172e-06,
|
|
"loss": 0.3705,
|
|
"num_input_tokens_seen": 23406400,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"epoch": 0.47596184623263554,
|
|
"grad_norm": 27.228130340576172,
|
|
"learning_rate": 1.2559037755121804e-06,
|
|
"loss": 0.3131,
|
|
"num_input_tokens_seen": 23421824,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"epoch": 0.4762819281736124,
|
|
"grad_norm": 51.93519592285156,
|
|
"learning_rate": 1.2548234535478754e-06,
|
|
"loss": 0.4512,
|
|
"num_input_tokens_seen": 23438272,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 0.47660201011458936,
|
|
"grad_norm": 17.943632125854492,
|
|
"learning_rate": 1.2537428134334968e-06,
|
|
"loss": 0.4216,
|
|
"num_input_tokens_seen": 23454976,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"epoch": 0.47692209205556624,
|
|
"grad_norm": 98.46037292480469,
|
|
"learning_rate": 1.252661856518236e-06,
|
|
"loss": 0.5189,
|
|
"num_input_tokens_seen": 23471168,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"epoch": 0.4772421739965431,
|
|
"grad_norm": 28.342315673828125,
|
|
"learning_rate": 1.251580584151681e-06,
|
|
"loss": 0.3564,
|
|
"num_input_tokens_seen": 23486720,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"epoch": 0.47756225593752,
|
|
"grad_norm": 21.639692306518555,
|
|
"learning_rate": 1.2504989976838129e-06,
|
|
"loss": 0.3059,
|
|
"num_input_tokens_seen": 23502912,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 0.4778823378784969,
|
|
"grad_norm": 26.391496658325195,
|
|
"learning_rate": 1.2494170984650048e-06,
|
|
"loss": 0.3667,
|
|
"num_input_tokens_seen": 23519552,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"epoch": 0.4782024198194738,
|
|
"grad_norm": 31.00334930419922,
|
|
"learning_rate": 1.248334887846021e-06,
|
|
"loss": 0.4019,
|
|
"num_input_tokens_seen": 23535936,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"epoch": 0.4785225017604507,
|
|
"grad_norm": 29.97296142578125,
|
|
"learning_rate": 1.2472523671780135e-06,
|
|
"loss": 0.4373,
|
|
"num_input_tokens_seen": 23551040,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"epoch": 0.47884258370142757,
|
|
"grad_norm": 35.39260482788086,
|
|
"learning_rate": 1.2461695378125233e-06,
|
|
"loss": 0.3115,
|
|
"num_input_tokens_seen": 23566208,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 0.47916266564240445,
|
|
"grad_norm": 20.799793243408203,
|
|
"learning_rate": 1.245086401101474e-06,
|
|
"loss": 0.4197,
|
|
"num_input_tokens_seen": 23581696,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"epoch": 0.47948274758338133,
|
|
"grad_norm": 69.36449432373047,
|
|
"learning_rate": 1.2440029583971757e-06,
|
|
"loss": 0.4454,
|
|
"num_input_tokens_seen": 23597248,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"epoch": 0.4798028295243582,
|
|
"grad_norm": 16.190322875976562,
|
|
"learning_rate": 1.2429192110523188e-06,
|
|
"loss": 0.4913,
|
|
"num_input_tokens_seen": 23612800,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"epoch": 0.48012291146533514,
|
|
"grad_norm": 28.28662109375,
|
|
"learning_rate": 1.2418351604199746e-06,
|
|
"loss": 0.3338,
|
|
"num_input_tokens_seen": 23629056,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 0.480442993406312,
|
|
"grad_norm": 39.906612396240234,
|
|
"learning_rate": 1.2407508078535934e-06,
|
|
"loss": 0.4447,
|
|
"num_input_tokens_seen": 23644352,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"epoch": 0.4807630753472889,
|
|
"grad_norm": 25.87689208984375,
|
|
"learning_rate": 1.2396661547070017e-06,
|
|
"loss": 0.2785,
|
|
"num_input_tokens_seen": 23661120,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"epoch": 0.4810831572882658,
|
|
"grad_norm": 18.180044174194336,
|
|
"learning_rate": 1.238581202334402e-06,
|
|
"loss": 0.3347,
|
|
"num_input_tokens_seen": 23677632,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"epoch": 0.48140323922924266,
|
|
"grad_norm": 26.29235076904297,
|
|
"learning_rate": 1.2374959520903699e-06,
|
|
"loss": 0.3673,
|
|
"num_input_tokens_seen": 23693952,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 0.4817233211702196,
|
|
"grad_norm": 17.1253662109375,
|
|
"learning_rate": 1.2364104053298531e-06,
|
|
"loss": 0.3341,
|
|
"num_input_tokens_seen": 23708736,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"epoch": 0.4820434031111965,
|
|
"grad_norm": 30.4875431060791,
|
|
"learning_rate": 1.2353245634081692e-06,
|
|
"loss": 0.3913,
|
|
"num_input_tokens_seen": 23724864,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"epoch": 0.48236348505217336,
|
|
"grad_norm": 23.729246139526367,
|
|
"learning_rate": 1.2342384276810053e-06,
|
|
"loss": 0.4148,
|
|
"num_input_tokens_seen": 23740160,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"epoch": 0.48268356699315024,
|
|
"grad_norm": 70.08629608154297,
|
|
"learning_rate": 1.233151999504414e-06,
|
|
"loss": 0.423,
|
|
"num_input_tokens_seen": 23755264,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 0.4830036489341271,
|
|
"grad_norm": 46.91286849975586,
|
|
"learning_rate": 1.232065280234814e-06,
|
|
"loss": 0.3317,
|
|
"num_input_tokens_seen": 23770112,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"epoch": 0.48332373087510405,
|
|
"grad_norm": 24.17731285095215,
|
|
"learning_rate": 1.2309782712289867e-06,
|
|
"loss": 0.4189,
|
|
"num_input_tokens_seen": 23785536,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"epoch": 0.48364381281608093,
|
|
"grad_norm": 50.58120346069336,
|
|
"learning_rate": 1.2298909738440758e-06,
|
|
"loss": 0.4307,
|
|
"num_input_tokens_seen": 23801280,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"epoch": 0.4839638947570578,
|
|
"grad_norm": 39.50659942626953,
|
|
"learning_rate": 1.2288033894375847e-06,
|
|
"loss": 0.371,
|
|
"num_input_tokens_seen": 23816448,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"epoch": 0.4842839766980347,
|
|
"grad_norm": 31.22879409790039,
|
|
"learning_rate": 1.2277155193673755e-06,
|
|
"loss": 0.5539,
|
|
"num_input_tokens_seen": 23832512,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"epoch": 0.48460405863901157,
|
|
"grad_norm": 14.704495429992676,
|
|
"learning_rate": 1.2266273649916668e-06,
|
|
"loss": 0.3968,
|
|
"num_input_tokens_seen": 23848192,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"epoch": 0.48492414057998845,
|
|
"grad_norm": 18.676654815673828,
|
|
"learning_rate": 1.2255389276690318e-06,
|
|
"loss": 0.4249,
|
|
"num_input_tokens_seen": 23863808,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"epoch": 0.4852442225209654,
|
|
"grad_norm": 32.08503341674805,
|
|
"learning_rate": 1.2244502087583978e-06,
|
|
"loss": 0.2927,
|
|
"num_input_tokens_seen": 23880960,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"epoch": 0.48556430446194226,
|
|
"grad_norm": 46.882720947265625,
|
|
"learning_rate": 1.2233612096190426e-06,
|
|
"loss": 0.3969,
|
|
"num_input_tokens_seen": 23896256,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"epoch": 0.48588438640291914,
|
|
"grad_norm": 36.5152473449707,
|
|
"learning_rate": 1.222271931610595e-06,
|
|
"loss": 0.5189,
|
|
"num_input_tokens_seen": 23912832,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"epoch": 0.486204468343896,
|
|
"grad_norm": 26.63950538635254,
|
|
"learning_rate": 1.2211823760930306e-06,
|
|
"loss": 0.4929,
|
|
"num_input_tokens_seen": 23928768,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"epoch": 0.4865245502848729,
|
|
"grad_norm": 18.74747657775879,
|
|
"learning_rate": 1.2200925444266726e-06,
|
|
"loss": 0.4206,
|
|
"num_input_tokens_seen": 23945088,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 0.48684463222584984,
|
|
"grad_norm": 39.23282241821289,
|
|
"learning_rate": 1.219002437972189e-06,
|
|
"loss": 0.5087,
|
|
"num_input_tokens_seen": 23960192,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"epoch": 0.4871647141668267,
|
|
"grad_norm": 31.527008056640625,
|
|
"learning_rate": 1.21791205809059e-06,
|
|
"loss": 0.4208,
|
|
"num_input_tokens_seen": 23977152,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"epoch": 0.4874847961078036,
|
|
"grad_norm": 30.472713470458984,
|
|
"learning_rate": 1.2168214061432283e-06,
|
|
"loss": 0.3611,
|
|
"num_input_tokens_seen": 23992448,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"epoch": 0.4878048780487805,
|
|
"grad_norm": 24.9169864654541,
|
|
"learning_rate": 1.2157304834917947e-06,
|
|
"loss": 0.4276,
|
|
"num_input_tokens_seen": 24008384,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"epoch": 0.48812495998975736,
|
|
"grad_norm": 28.272476196289062,
|
|
"learning_rate": 1.2146392914983202e-06,
|
|
"loss": 0.6241,
|
|
"num_input_tokens_seen": 24025728,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"epoch": 0.4884450419307343,
|
|
"grad_norm": 44.216453552246094,
|
|
"learning_rate": 1.2135478315251694e-06,
|
|
"loss": 0.5169,
|
|
"num_input_tokens_seen": 24040448,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"epoch": 0.48876512387171117,
|
|
"grad_norm": 26.274669647216797,
|
|
"learning_rate": 1.2124561049350442e-06,
|
|
"loss": 0.3428,
|
|
"num_input_tokens_seen": 24055168,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"epoch": 0.48908520581268805,
|
|
"grad_norm": 41.2357292175293,
|
|
"learning_rate": 1.2113641130909772e-06,
|
|
"loss": 0.453,
|
|
"num_input_tokens_seen": 24070016,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"epoch": 0.48940528775366493,
|
|
"grad_norm": 58.80428695678711,
|
|
"learning_rate": 1.2102718573563334e-06,
|
|
"loss": 0.3108,
|
|
"num_input_tokens_seen": 24084800,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"epoch": 0.4897253696946418,
|
|
"grad_norm": 53.14729309082031,
|
|
"learning_rate": 1.2091793390948066e-06,
|
|
"loss": 0.4842,
|
|
"num_input_tokens_seen": 24100416,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"epoch": 0.49004545163561875,
|
|
"grad_norm": 17.676326751708984,
|
|
"learning_rate": 1.2080865596704191e-06,
|
|
"loss": 0.2906,
|
|
"num_input_tokens_seen": 24117120,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"epoch": 0.4903655335765956,
|
|
"grad_norm": 30.914222717285156,
|
|
"learning_rate": 1.2069935204475187e-06,
|
|
"loss": 0.4391,
|
|
"num_input_tokens_seen": 24132224,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"epoch": 0.4906856155175725,
|
|
"grad_norm": 23.044315338134766,
|
|
"learning_rate": 1.2059002227907776e-06,
|
|
"loss": 0.3992,
|
|
"num_input_tokens_seen": 24147712,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"epoch": 0.4910056974585494,
|
|
"grad_norm": 37.006168365478516,
|
|
"learning_rate": 1.2048066680651908e-06,
|
|
"loss": 0.4121,
|
|
"num_input_tokens_seen": 24164288,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"epoch": 0.49132577939952626,
|
|
"grad_norm": 37.811988830566406,
|
|
"learning_rate": 1.2037128576360743e-06,
|
|
"loss": 0.5577,
|
|
"num_input_tokens_seen": 24193728,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"epoch": 0.49164586134050314,
|
|
"grad_norm": 36.05268478393555,
|
|
"learning_rate": 1.2026187928690627e-06,
|
|
"loss": 0.4148,
|
|
"num_input_tokens_seen": 24208832,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"epoch": 0.4919659432814801,
|
|
"grad_norm": 34.80404281616211,
|
|
"learning_rate": 1.2015244751301098e-06,
|
|
"loss": 0.5085,
|
|
"num_input_tokens_seen": 24223424,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"epoch": 0.49228602522245696,
|
|
"grad_norm": 47.47758865356445,
|
|
"learning_rate": 1.2004299057854832e-06,
|
|
"loss": 0.43,
|
|
"num_input_tokens_seen": 24238976,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"epoch": 0.49260610716343384,
|
|
"grad_norm": 22.682682037353516,
|
|
"learning_rate": 1.1993350862017661e-06,
|
|
"loss": 0.3893,
|
|
"num_input_tokens_seen": 24253632,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"epoch": 0.4929261891044107,
|
|
"grad_norm": 35.10201644897461,
|
|
"learning_rate": 1.1982400177458534e-06,
|
|
"loss": 0.3968,
|
|
"num_input_tokens_seen": 24270720,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 0.4932462710453876,
|
|
"grad_norm": 34.98603820800781,
|
|
"learning_rate": 1.197144701784951e-06,
|
|
"loss": 0.4284,
|
|
"num_input_tokens_seen": 24285312,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"epoch": 0.49356635298636453,
|
|
"grad_norm": 32.93339157104492,
|
|
"learning_rate": 1.1960491396865735e-06,
|
|
"loss": 0.3926,
|
|
"num_input_tokens_seen": 24300352,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"epoch": 0.4938864349273414,
|
|
"grad_norm": 27.799358367919922,
|
|
"learning_rate": 1.1949533328185435e-06,
|
|
"loss": 0.3458,
|
|
"num_input_tokens_seen": 24317056,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"epoch": 0.4942065168683183,
|
|
"grad_norm": 25.46038818359375,
|
|
"learning_rate": 1.1938572825489883e-06,
|
|
"loss": 0.3741,
|
|
"num_input_tokens_seen": 24333184,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"epoch": 0.49452659880929517,
|
|
"grad_norm": 29.320058822631836,
|
|
"learning_rate": 1.1927609902463394e-06,
|
|
"loss": 0.409,
|
|
"num_input_tokens_seen": 24348672,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"epoch": 0.49484668075027205,
|
|
"grad_norm": 44.419612884521484,
|
|
"learning_rate": 1.1916644572793314e-06,
|
|
"loss": 0.4346,
|
|
"num_input_tokens_seen": 24363648,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"epoch": 0.495166762691249,
|
|
"grad_norm": 74.09778594970703,
|
|
"learning_rate": 1.190567685016998e-06,
|
|
"loss": 0.4964,
|
|
"num_input_tokens_seen": 24380992,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"epoch": 0.49548684463222586,
|
|
"grad_norm": 27.674976348876953,
|
|
"learning_rate": 1.189470674828672e-06,
|
|
"loss": 0.4107,
|
|
"num_input_tokens_seen": 24395776,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"epoch": 0.49580692657320274,
|
|
"grad_norm": 25.768115997314453,
|
|
"learning_rate": 1.188373428083984e-06,
|
|
"loss": 0.3878,
|
|
"num_input_tokens_seen": 24411584,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"epoch": 0.4961270085141796,
|
|
"grad_norm": 44.345550537109375,
|
|
"learning_rate": 1.1872759461528596e-06,
|
|
"loss": 0.5219,
|
|
"num_input_tokens_seen": 24426560,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"epoch": 0.4964470904551565,
|
|
"grad_norm": 13.35042667388916,
|
|
"learning_rate": 1.1861782304055174e-06,
|
|
"loss": 0.39,
|
|
"num_input_tokens_seen": 24441856,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"epoch": 0.4967671723961334,
|
|
"grad_norm": 18.407421112060547,
|
|
"learning_rate": 1.1850802822124686e-06,
|
|
"loss": 0.3345,
|
|
"num_input_tokens_seen": 24457472,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"epoch": 0.4970872543371103,
|
|
"grad_norm": 57.33185577392578,
|
|
"learning_rate": 1.1839821029445143e-06,
|
|
"loss": 0.5005,
|
|
"num_input_tokens_seen": 24471936,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"epoch": 0.4974073362780872,
|
|
"grad_norm": 35.684871673583984,
|
|
"learning_rate": 1.1828836939727442e-06,
|
|
"loss": 0.3195,
|
|
"num_input_tokens_seen": 24487616,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"epoch": 0.4977274182190641,
|
|
"grad_norm": 39.44476318359375,
|
|
"learning_rate": 1.181785056668535e-06,
|
|
"loss": 0.433,
|
|
"num_input_tokens_seen": 24503936,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"epoch": 0.49804750016004096,
|
|
"grad_norm": 31.5116024017334,
|
|
"learning_rate": 1.180686192403548e-06,
|
|
"loss": 0.4212,
|
|
"num_input_tokens_seen": 24518464,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"epoch": 0.49836758210101784,
|
|
"grad_norm": 69.69412231445312,
|
|
"learning_rate": 1.1795871025497285e-06,
|
|
"loss": 0.3439,
|
|
"num_input_tokens_seen": 24533184,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"epoch": 0.49868766404199477,
|
|
"grad_norm": 33.76158905029297,
|
|
"learning_rate": 1.1784877884793029e-06,
|
|
"loss": 0.4122,
|
|
"num_input_tokens_seen": 24548992,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"epoch": 0.49900774598297165,
|
|
"grad_norm": 32.13736343383789,
|
|
"learning_rate": 1.1773882515647776e-06,
|
|
"loss": 0.3627,
|
|
"num_input_tokens_seen": 24566592,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"epoch": 0.49932782792394853,
|
|
"grad_norm": 26.241132736206055,
|
|
"learning_rate": 1.1762884931789376e-06,
|
|
"loss": 0.4811,
|
|
"num_input_tokens_seen": 24583552,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 0.4996479098649254,
|
|
"grad_norm": 15.578927040100098,
|
|
"learning_rate": 1.1751885146948436e-06,
|
|
"loss": 0.4548,
|
|
"num_input_tokens_seen": 24599552,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"epoch": 0.4999679918059023,
|
|
"grad_norm": 34.21600341796875,
|
|
"learning_rate": 1.1740883174858327e-06,
|
|
"loss": 0.3633,
|
|
"num_input_tokens_seen": 24614912,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"epoch": 0.5002880737468792,
|
|
"grad_norm": 33.92721939086914,
|
|
"learning_rate": 1.1729879029255127e-06,
|
|
"loss": 0.3649,
|
|
"num_input_tokens_seen": 24629696,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"epoch": 0.5006081556878561,
|
|
"grad_norm": 32.14542007446289,
|
|
"learning_rate": 1.171887272387765e-06,
|
|
"loss": 0.3939,
|
|
"num_input_tokens_seen": 24646208,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"epoch": 0.5006081556878561,
|
|
"eval_loss": 0.4134162962436676,
|
|
"eval_runtime": 49.1457,
|
|
"eval_samples_per_second": 282.548,
|
|
"eval_steps_per_second": 35.324,
|
|
"num_input_tokens_seen": 24646208,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"epoch": 0.500928237628833,
|
|
"grad_norm": 79.38529205322266,
|
|
"learning_rate": 1.1707864272467397e-06,
|
|
"loss": 0.4985,
|
|
"num_input_tokens_seen": 24661120,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"epoch": 0.5012483195698099,
|
|
"grad_norm": 39.66872024536133,
|
|
"learning_rate": 1.169685368876855e-06,
|
|
"loss": 0.423,
|
|
"num_input_tokens_seen": 24678336,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"epoch": 0.5015684015107867,
|
|
"grad_norm": 61.929866790771484,
|
|
"learning_rate": 1.1685840986527946e-06,
|
|
"loss": 0.5534,
|
|
"num_input_tokens_seen": 24694336,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"epoch": 0.5018884834517636,
|
|
"grad_norm": 36.273685455322266,
|
|
"learning_rate": 1.1674826179495076e-06,
|
|
"loss": 0.4044,
|
|
"num_input_tokens_seen": 24708608,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"epoch": 0.5022085653927405,
|
|
"grad_norm": 33.48814010620117,
|
|
"learning_rate": 1.1663809281422056e-06,
|
|
"loss": 0.415,
|
|
"num_input_tokens_seen": 24724672,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"epoch": 0.5025286473337174,
|
|
"grad_norm": 42.979496002197266,
|
|
"learning_rate": 1.1652790306063615e-06,
|
|
"loss": 0.4562,
|
|
"num_input_tokens_seen": 24740608,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"epoch": 0.5028487292746944,
|
|
"grad_norm": 37.959041595458984,
|
|
"learning_rate": 1.164176926717707e-06,
|
|
"loss": 0.416,
|
|
"num_input_tokens_seen": 24758528,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"epoch": 0.5031688112156713,
|
|
"grad_norm": 23.2774658203125,
|
|
"learning_rate": 1.1630746178522315e-06,
|
|
"loss": 0.3702,
|
|
"num_input_tokens_seen": 24772992,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"epoch": 0.5034888931566481,
|
|
"grad_norm": 27.682905197143555,
|
|
"learning_rate": 1.1619721053861816e-06,
|
|
"loss": 0.4398,
|
|
"num_input_tokens_seen": 24788160,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"epoch": 0.503808975097625,
|
|
"grad_norm": 19.770153045654297,
|
|
"learning_rate": 1.1608693906960558e-06,
|
|
"loss": 0.4093,
|
|
"num_input_tokens_seen": 24804224,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"epoch": 0.5041290570386019,
|
|
"grad_norm": 30.391685485839844,
|
|
"learning_rate": 1.1597664751586069e-06,
|
|
"loss": 0.4426,
|
|
"num_input_tokens_seen": 24820928,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"epoch": 0.5044491389795788,
|
|
"grad_norm": 49.482810974121094,
|
|
"learning_rate": 1.1586633601508382e-06,
|
|
"loss": 0.3837,
|
|
"num_input_tokens_seen": 24835776,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"epoch": 0.5047692209205557,
|
|
"grad_norm": 46.44161605834961,
|
|
"learning_rate": 1.1575600470500014e-06,
|
|
"loss": 0.3858,
|
|
"num_input_tokens_seen": 24851648,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"epoch": 0.5050893028615325,
|
|
"grad_norm": 59.1083869934082,
|
|
"learning_rate": 1.1564565372335957e-06,
|
|
"loss": 0.42,
|
|
"num_input_tokens_seen": 24866880,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"epoch": 0.5054093848025094,
|
|
"grad_norm": 41.57418441772461,
|
|
"learning_rate": 1.1553528320793663e-06,
|
|
"loss": 0.3162,
|
|
"num_input_tokens_seen": 24881856,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"epoch": 0.5057294667434863,
|
|
"grad_norm": 23.643510818481445,
|
|
"learning_rate": 1.1542489329653022e-06,
|
|
"loss": 0.4364,
|
|
"num_input_tokens_seen": 24898560,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 0.5060495486844632,
|
|
"grad_norm": 25.241592407226562,
|
|
"learning_rate": 1.1531448412696343e-06,
|
|
"loss": 0.3754,
|
|
"num_input_tokens_seen": 24913216,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"epoch": 0.5063696306254402,
|
|
"grad_norm": 21.214923858642578,
|
|
"learning_rate": 1.1520405583708337e-06,
|
|
"loss": 0.4913,
|
|
"num_input_tokens_seen": 24928832,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"epoch": 0.506689712566417,
|
|
"grad_norm": 33.57106018066406,
|
|
"learning_rate": 1.1509360856476109e-06,
|
|
"loss": 0.4917,
|
|
"num_input_tokens_seen": 24944512,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"epoch": 0.5070097945073939,
|
|
"grad_norm": 37.114646911621094,
|
|
"learning_rate": 1.149831424478913e-06,
|
|
"loss": 0.4612,
|
|
"num_input_tokens_seen": 24959744,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"epoch": 0.5073298764483708,
|
|
"grad_norm": 62.12904357910156,
|
|
"learning_rate": 1.1487265762439224e-06,
|
|
"loss": 0.3948,
|
|
"num_input_tokens_seen": 24975488,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"epoch": 0.5076499583893477,
|
|
"grad_norm": 40.3009033203125,
|
|
"learning_rate": 1.1476215423220547e-06,
|
|
"loss": 0.362,
|
|
"num_input_tokens_seen": 24990272,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"epoch": 0.5079700403303246,
|
|
"grad_norm": 39.82942199707031,
|
|
"learning_rate": 1.146516324092959e-06,
|
|
"loss": 0.3761,
|
|
"num_input_tokens_seen": 25006272,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"epoch": 0.5082901222713014,
|
|
"grad_norm": 23.33016014099121,
|
|
"learning_rate": 1.1454109229365117e-06,
|
|
"loss": 0.2954,
|
|
"num_input_tokens_seen": 25022464,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"epoch": 0.5086102042122783,
|
|
"grad_norm": 27.223312377929688,
|
|
"learning_rate": 1.14430534023282e-06,
|
|
"loss": 0.3132,
|
|
"num_input_tokens_seen": 25037376,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"epoch": 0.5089302861532552,
|
|
"grad_norm": 36.93307876586914,
|
|
"learning_rate": 1.1431995773622167e-06,
|
|
"loss": 0.4736,
|
|
"num_input_tokens_seen": 25053440,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"epoch": 0.5092503680942321,
|
|
"grad_norm": 21.982830047607422,
|
|
"learning_rate": 1.1420936357052597e-06,
|
|
"loss": 0.4369,
|
|
"num_input_tokens_seen": 25069120,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"epoch": 0.5095704500352091,
|
|
"grad_norm": 22.12405014038086,
|
|
"learning_rate": 1.1409875166427303e-06,
|
|
"loss": 0.3078,
|
|
"num_input_tokens_seen": 25084224,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"epoch": 0.509890531976186,
|
|
"grad_norm": 37.66783142089844,
|
|
"learning_rate": 1.1398812215556308e-06,
|
|
"loss": 0.4996,
|
|
"num_input_tokens_seen": 25099520,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"epoch": 0.5102106139171628,
|
|
"grad_norm": 28.573827743530273,
|
|
"learning_rate": 1.1387747518251837e-06,
|
|
"loss": 0.362,
|
|
"num_input_tokens_seen": 25115200,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"epoch": 0.5105306958581397,
|
|
"grad_norm": 20.292476654052734,
|
|
"learning_rate": 1.13766810883283e-06,
|
|
"loss": 0.3266,
|
|
"num_input_tokens_seen": 25131520,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"epoch": 0.5108507777991166,
|
|
"grad_norm": 36.63866424560547,
|
|
"learning_rate": 1.1365612939602255e-06,
|
|
"loss": 0.5172,
|
|
"num_input_tokens_seen": 25147776,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"epoch": 0.5111708597400935,
|
|
"grad_norm": 22.338659286499023,
|
|
"learning_rate": 1.1354543085892423e-06,
|
|
"loss": 0.3683,
|
|
"num_input_tokens_seen": 25162816,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"epoch": 0.5114909416810703,
|
|
"grad_norm": 34.683868408203125,
|
|
"learning_rate": 1.1343471541019646e-06,
|
|
"loss": 0.3333,
|
|
"num_input_tokens_seen": 25178752,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"epoch": 0.5118110236220472,
|
|
"grad_norm": 57.14018249511719,
|
|
"learning_rate": 1.1332398318806872e-06,
|
|
"loss": 0.3719,
|
|
"num_input_tokens_seen": 25194048,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"epoch": 0.5121311055630241,
|
|
"grad_norm": 32.1242561340332,
|
|
"learning_rate": 1.1321323433079158e-06,
|
|
"loss": 0.3796,
|
|
"num_input_tokens_seen": 25209216,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 0.512451187504001,
|
|
"grad_norm": 28.248655319213867,
|
|
"learning_rate": 1.1310246897663623e-06,
|
|
"loss": 0.379,
|
|
"num_input_tokens_seen": 25224640,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"epoch": 0.5127712694449779,
|
|
"grad_norm": 19.069774627685547,
|
|
"learning_rate": 1.1299168726389447e-06,
|
|
"loss": 0.408,
|
|
"num_input_tokens_seen": 25239808,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"epoch": 0.5130913513859549,
|
|
"grad_norm": 42.42983627319336,
|
|
"learning_rate": 1.1288088933087868e-06,
|
|
"loss": 0.3354,
|
|
"num_input_tokens_seen": 25257344,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"epoch": 0.5134114333269317,
|
|
"grad_norm": 22.4074764251709,
|
|
"learning_rate": 1.1277007531592127e-06,
|
|
"loss": 0.3365,
|
|
"num_input_tokens_seen": 25272064,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"epoch": 0.5137315152679086,
|
|
"grad_norm": 28.663759231567383,
|
|
"learning_rate": 1.1265924535737492e-06,
|
|
"loss": 0.3619,
|
|
"num_input_tokens_seen": 25287936,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"epoch": 0.5140515972088855,
|
|
"grad_norm": 39.256492614746094,
|
|
"learning_rate": 1.125483995936121e-06,
|
|
"loss": 0.3007,
|
|
"num_input_tokens_seen": 25303232,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"epoch": 0.5143716791498624,
|
|
"grad_norm": 20.142274856567383,
|
|
"learning_rate": 1.1243753816302507e-06,
|
|
"loss": 0.376,
|
|
"num_input_tokens_seen": 25318656,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"epoch": 0.5146917610908393,
|
|
"grad_norm": 46.976951599121094,
|
|
"learning_rate": 1.1232666120402558e-06,
|
|
"loss": 0.417,
|
|
"num_input_tokens_seen": 25333760,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"epoch": 0.5150118430318161,
|
|
"grad_norm": 35.951576232910156,
|
|
"learning_rate": 1.1221576885504487e-06,
|
|
"loss": 0.3827,
|
|
"num_input_tokens_seen": 25349824,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"epoch": 0.515331924972793,
|
|
"grad_norm": 19.6291561126709,
|
|
"learning_rate": 1.121048612545333e-06,
|
|
"loss": 0.4027,
|
|
"num_input_tokens_seen": 25365376,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"epoch": 0.5156520069137699,
|
|
"grad_norm": 44.66822052001953,
|
|
"learning_rate": 1.1199393854096034e-06,
|
|
"loss": 0.4599,
|
|
"num_input_tokens_seen": 25380928,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"epoch": 0.5159720888547468,
|
|
"grad_norm": 79.27295684814453,
|
|
"learning_rate": 1.118830008528143e-06,
|
|
"loss": 0.3487,
|
|
"num_input_tokens_seen": 25396352,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"epoch": 0.5162921707957238,
|
|
"grad_norm": 21.75312042236328,
|
|
"learning_rate": 1.1177204832860212e-06,
|
|
"loss": 0.3159,
|
|
"num_input_tokens_seen": 25411456,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"epoch": 0.5166122527367006,
|
|
"grad_norm": 19.3381290435791,
|
|
"learning_rate": 1.1166108110684947e-06,
|
|
"loss": 0.4322,
|
|
"num_input_tokens_seen": 25428544,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"epoch": 0.5169323346776775,
|
|
"grad_norm": 37.30630111694336,
|
|
"learning_rate": 1.1155009932610003e-06,
|
|
"loss": 0.3988,
|
|
"num_input_tokens_seen": 25443968,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"epoch": 0.5172524166186544,
|
|
"grad_norm": 45.22068786621094,
|
|
"learning_rate": 1.1143910312491605e-06,
|
|
"loss": 0.3273,
|
|
"num_input_tokens_seen": 25458880,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"epoch": 0.5175724985596313,
|
|
"grad_norm": 53.44335174560547,
|
|
"learning_rate": 1.1132809264187748e-06,
|
|
"loss": 0.3196,
|
|
"num_input_tokens_seen": 25474304,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"epoch": 0.5178925805006082,
|
|
"grad_norm": 59.70965576171875,
|
|
"learning_rate": 1.1121706801558226e-06,
|
|
"loss": 0.3884,
|
|
"num_input_tokens_seen": 25489472,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"epoch": 0.518212662441585,
|
|
"grad_norm": 44.1774787902832,
|
|
"learning_rate": 1.111060293846459e-06,
|
|
"loss": 0.3827,
|
|
"num_input_tokens_seen": 25504896,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"epoch": 0.5185327443825619,
|
|
"grad_norm": 79.03081512451172,
|
|
"learning_rate": 1.1099497688770148e-06,
|
|
"loss": 0.4807,
|
|
"num_input_tokens_seen": 25519360,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 0.5188528263235388,
|
|
"grad_norm": 35.3879280090332,
|
|
"learning_rate": 1.1088391066339928e-06,
|
|
"loss": 0.4418,
|
|
"num_input_tokens_seen": 25535680,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"epoch": 0.5191729082645157,
|
|
"grad_norm": 43.35395050048828,
|
|
"learning_rate": 1.1077283085040684e-06,
|
|
"loss": 0.5327,
|
|
"num_input_tokens_seen": 25550592,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"epoch": 0.5194929902054926,
|
|
"grad_norm": 39.26498031616211,
|
|
"learning_rate": 1.1066173758740863e-06,
|
|
"loss": 0.4083,
|
|
"num_input_tokens_seen": 25565696,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"epoch": 0.5198130721464695,
|
|
"grad_norm": 17.995386123657227,
|
|
"learning_rate": 1.105506310131058e-06,
|
|
"loss": 0.3485,
|
|
"num_input_tokens_seen": 25581568,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"epoch": 0.5201331540874464,
|
|
"grad_norm": 56.82388687133789,
|
|
"learning_rate": 1.1043951126621634e-06,
|
|
"loss": 0.466,
|
|
"num_input_tokens_seen": 25597760,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"epoch": 0.5204532360284233,
|
|
"grad_norm": 31.271780014038086,
|
|
"learning_rate": 1.1032837848547445e-06,
|
|
"loss": 0.4111,
|
|
"num_input_tokens_seen": 25615424,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"epoch": 0.5207733179694002,
|
|
"grad_norm": 33.19522476196289,
|
|
"learning_rate": 1.1021723280963074e-06,
|
|
"loss": 0.4094,
|
|
"num_input_tokens_seen": 25630720,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"epoch": 0.5210933999103771,
|
|
"grad_norm": 40.24439239501953,
|
|
"learning_rate": 1.1010607437745194e-06,
|
|
"loss": 0.4886,
|
|
"num_input_tokens_seen": 25649280,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"epoch": 0.5214134818513539,
|
|
"grad_norm": 49.17844009399414,
|
|
"learning_rate": 1.0999490332772057e-06,
|
|
"loss": 0.5002,
|
|
"num_input_tokens_seen": 25664576,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"epoch": 0.5217335637923308,
|
|
"grad_norm": 26.123889923095703,
|
|
"learning_rate": 1.0988371979923507e-06,
|
|
"loss": 0.4193,
|
|
"num_input_tokens_seen": 25680384,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"epoch": 0.5220536457333077,
|
|
"grad_norm": 26.953947067260742,
|
|
"learning_rate": 1.097725239308094e-06,
|
|
"loss": 0.4017,
|
|
"num_input_tokens_seen": 25696128,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"epoch": 0.5223737276742846,
|
|
"grad_norm": 15.423673629760742,
|
|
"learning_rate": 1.0966131586127278e-06,
|
|
"loss": 0.2794,
|
|
"num_input_tokens_seen": 25712768,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"epoch": 0.5226938096152615,
|
|
"grad_norm": 25.20142936706543,
|
|
"learning_rate": 1.0955009572946992e-06,
|
|
"loss": 0.4033,
|
|
"num_input_tokens_seen": 25727616,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"epoch": 0.5230138915562383,
|
|
"grad_norm": 22.9870548248291,
|
|
"learning_rate": 1.094388636742604e-06,
|
|
"loss": 0.4149,
|
|
"num_input_tokens_seen": 25744384,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"epoch": 0.5233339734972153,
|
|
"grad_norm": 31.26616859436035,
|
|
"learning_rate": 1.0932761983451878e-06,
|
|
"loss": 0.3376,
|
|
"num_input_tokens_seen": 25760640,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"epoch": 0.5236540554381922,
|
|
"grad_norm": 32.35393142700195,
|
|
"learning_rate": 1.0921636434913425e-06,
|
|
"loss": 0.3116,
|
|
"num_input_tokens_seen": 25776640,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"epoch": 0.5239741373791691,
|
|
"grad_norm": 26.09176254272461,
|
|
"learning_rate": 1.091050973570106e-06,
|
|
"loss": 0.2977,
|
|
"num_input_tokens_seen": 25791744,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"epoch": 0.524294219320146,
|
|
"grad_norm": 49.68628692626953,
|
|
"learning_rate": 1.08993818997066e-06,
|
|
"loss": 0.5531,
|
|
"num_input_tokens_seen": 25808256,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"epoch": 0.5246143012611229,
|
|
"grad_norm": 36.49836730957031,
|
|
"learning_rate": 1.0888252940823283e-06,
|
|
"loss": 0.4378,
|
|
"num_input_tokens_seen": 25824128,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"epoch": 0.5249343832020997,
|
|
"grad_norm": 39.86119842529297,
|
|
"learning_rate": 1.0877122872945737e-06,
|
|
"loss": 0.4676,
|
|
"num_input_tokens_seen": 25840576,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 0.5252544651430766,
|
|
"grad_norm": 32.07432556152344,
|
|
"learning_rate": 1.0865991709969983e-06,
|
|
"loss": 0.317,
|
|
"num_input_tokens_seen": 25856256,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"epoch": 0.5255745470840535,
|
|
"grad_norm": 20.993459701538086,
|
|
"learning_rate": 1.0854859465793416e-06,
|
|
"loss": 0.4482,
|
|
"num_input_tokens_seen": 25871424,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"epoch": 0.5258946290250304,
|
|
"grad_norm": 33.609657287597656,
|
|
"learning_rate": 1.0843726154314767e-06,
|
|
"loss": 0.4974,
|
|
"num_input_tokens_seen": 25886272,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"epoch": 0.5262147109660072,
|
|
"grad_norm": 30.594623565673828,
|
|
"learning_rate": 1.083259178943411e-06,
|
|
"loss": 0.4376,
|
|
"num_input_tokens_seen": 25901952,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"epoch": 0.5265347929069842,
|
|
"grad_norm": 20.63231086730957,
|
|
"learning_rate": 1.0821456385052822e-06,
|
|
"loss": 0.3694,
|
|
"num_input_tokens_seen": 25917888,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"epoch": 0.5268548748479611,
|
|
"grad_norm": 46.33021545410156,
|
|
"learning_rate": 1.0810319955073598e-06,
|
|
"loss": 0.4199,
|
|
"num_input_tokens_seen": 25933824,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"epoch": 0.527174956788938,
|
|
"grad_norm": 36.321929931640625,
|
|
"learning_rate": 1.0799182513400393e-06,
|
|
"loss": 0.3888,
|
|
"num_input_tokens_seen": 25951360,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"epoch": 0.5274950387299149,
|
|
"grad_norm": 37.35638427734375,
|
|
"learning_rate": 1.0788044073938438e-06,
|
|
"loss": 0.3594,
|
|
"num_input_tokens_seen": 25967232,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"epoch": 0.5278151206708918,
|
|
"grad_norm": 37.84722900390625,
|
|
"learning_rate": 1.0776904650594205e-06,
|
|
"loss": 0.4146,
|
|
"num_input_tokens_seen": 25982592,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"epoch": 0.5281352026118686,
|
|
"grad_norm": 67.66139221191406,
|
|
"learning_rate": 1.0765764257275394e-06,
|
|
"loss": 0.4094,
|
|
"num_input_tokens_seen": 25997824,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"epoch": 0.5284552845528455,
|
|
"grad_norm": 32.80574035644531,
|
|
"learning_rate": 1.0754622907890914e-06,
|
|
"loss": 0.4292,
|
|
"num_input_tokens_seen": 26013632,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"epoch": 0.5287753664938224,
|
|
"grad_norm": 28.530445098876953,
|
|
"learning_rate": 1.0743480616350873e-06,
|
|
"loss": 0.3249,
|
|
"num_input_tokens_seen": 26028800,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"epoch": 0.5290954484347993,
|
|
"grad_norm": 30.938467025756836,
|
|
"learning_rate": 1.0732337396566558e-06,
|
|
"loss": 0.339,
|
|
"num_input_tokens_seen": 26044672,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"epoch": 0.5294155303757762,
|
|
"grad_norm": 20.649280548095703,
|
|
"learning_rate": 1.07211932624504e-06,
|
|
"loss": 0.396,
|
|
"num_input_tokens_seen": 26060544,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"epoch": 0.529735612316753,
|
|
"grad_norm": 18.15691566467285,
|
|
"learning_rate": 1.0710048227915988e-06,
|
|
"loss": 0.3786,
|
|
"num_input_tokens_seen": 26076160,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"epoch": 0.53005569425773,
|
|
"grad_norm": 24.960102081298828,
|
|
"learning_rate": 1.0698902306878024e-06,
|
|
"loss": 0.4186,
|
|
"num_input_tokens_seen": 26092352,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"epoch": 0.5303757761987069,
|
|
"grad_norm": 25.81612205505371,
|
|
"learning_rate": 1.0687755513252325e-06,
|
|
"loss": 0.3024,
|
|
"num_input_tokens_seen": 26107776,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"epoch": 0.5306958581396838,
|
|
"grad_norm": 11.139862060546875,
|
|
"learning_rate": 1.0676607860955794e-06,
|
|
"loss": 0.31,
|
|
"num_input_tokens_seen": 26123712,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"epoch": 0.5310159400806607,
|
|
"grad_norm": 42.41699981689453,
|
|
"learning_rate": 1.0665459363906404e-06,
|
|
"loss": 0.386,
|
|
"num_input_tokens_seen": 26139200,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"epoch": 0.5313360220216375,
|
|
"grad_norm": 23.389768600463867,
|
|
"learning_rate": 1.0654310036023185e-06,
|
|
"loss": 0.4355,
|
|
"num_input_tokens_seen": 26153600,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 0.5316561039626144,
|
|
"grad_norm": 19.833234786987305,
|
|
"learning_rate": 1.0643159891226203e-06,
|
|
"loss": 0.4206,
|
|
"num_input_tokens_seen": 26169600,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"epoch": 0.5319761859035913,
|
|
"grad_norm": 33.841224670410156,
|
|
"learning_rate": 1.0632008943436545e-06,
|
|
"loss": 0.3398,
|
|
"num_input_tokens_seen": 26185536,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"epoch": 0.5322962678445682,
|
|
"grad_norm": 17.150596618652344,
|
|
"learning_rate": 1.0620857206576299e-06,
|
|
"loss": 0.453,
|
|
"num_input_tokens_seen": 26201536,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"epoch": 0.5326163497855451,
|
|
"grad_norm": 14.26513957977295,
|
|
"learning_rate": 1.0609704694568546e-06,
|
|
"loss": 0.2888,
|
|
"num_input_tokens_seen": 26216576,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"epoch": 0.5329364317265219,
|
|
"grad_norm": 23.111820220947266,
|
|
"learning_rate": 1.0598551421337318e-06,
|
|
"loss": 0.2904,
|
|
"num_input_tokens_seen": 26232640,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"epoch": 0.5332565136674989,
|
|
"grad_norm": 20.46584701538086,
|
|
"learning_rate": 1.0587397400807617e-06,
|
|
"loss": 0.5146,
|
|
"num_input_tokens_seen": 26248448,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"epoch": 0.5335765956084758,
|
|
"grad_norm": 36.023284912109375,
|
|
"learning_rate": 1.057624264690536e-06,
|
|
"loss": 0.519,
|
|
"num_input_tokens_seen": 26263872,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"epoch": 0.5338966775494527,
|
|
"grad_norm": 36.1595573425293,
|
|
"learning_rate": 1.0565087173557394e-06,
|
|
"loss": 0.4598,
|
|
"num_input_tokens_seen": 26279872,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"epoch": 0.5342167594904296,
|
|
"grad_norm": 24.1319580078125,
|
|
"learning_rate": 1.055393099469146e-06,
|
|
"loss": 0.3428,
|
|
"num_input_tokens_seen": 26295680,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"epoch": 0.5345368414314065,
|
|
"grad_norm": 34.465797424316406,
|
|
"learning_rate": 1.054277412423617e-06,
|
|
"loss": 0.4057,
|
|
"num_input_tokens_seen": 26311040,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"epoch": 0.5348569233723833,
|
|
"grad_norm": 24.986618041992188,
|
|
"learning_rate": 1.0531616576121017e-06,
|
|
"loss": 0.4603,
|
|
"num_input_tokens_seen": 26326144,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"epoch": 0.5351770053133602,
|
|
"grad_norm": 25.222026824951172,
|
|
"learning_rate": 1.0520458364276325e-06,
|
|
"loss": 0.3347,
|
|
"num_input_tokens_seen": 26341952,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"epoch": 0.5354970872543371,
|
|
"grad_norm": 37.025054931640625,
|
|
"learning_rate": 1.0509299502633256e-06,
|
|
"loss": 0.3565,
|
|
"num_input_tokens_seen": 26356672,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"epoch": 0.535817169195314,
|
|
"grad_norm": 20.434568405151367,
|
|
"learning_rate": 1.0498140005123777e-06,
|
|
"loss": 0.4493,
|
|
"num_input_tokens_seen": 26373056,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"epoch": 0.5361372511362908,
|
|
"grad_norm": 12.433558464050293,
|
|
"learning_rate": 1.0486979885680653e-06,
|
|
"loss": 0.426,
|
|
"num_input_tokens_seen": 26388032,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"epoch": 0.5364573330772677,
|
|
"grad_norm": 54.505035400390625,
|
|
"learning_rate": 1.0475819158237424e-06,
|
|
"loss": 0.4115,
|
|
"num_input_tokens_seen": 26402880,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"epoch": 0.5367774150182447,
|
|
"grad_norm": 22.174421310424805,
|
|
"learning_rate": 1.0464657836728389e-06,
|
|
"loss": 0.4713,
|
|
"num_input_tokens_seen": 26419328,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"epoch": 0.5370974969592216,
|
|
"grad_norm": 33.491397857666016,
|
|
"learning_rate": 1.045349593508859e-06,
|
|
"loss": 0.3981,
|
|
"num_input_tokens_seen": 26434112,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"epoch": 0.5374175789001985,
|
|
"grad_norm": 22.270578384399414,
|
|
"learning_rate": 1.0442333467253788e-06,
|
|
"loss": 0.297,
|
|
"num_input_tokens_seen": 26450688,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"epoch": 0.5377376608411754,
|
|
"grad_norm": 32.83494186401367,
|
|
"learning_rate": 1.0431170447160463e-06,
|
|
"loss": 0.3602,
|
|
"num_input_tokens_seen": 26466368,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 0.5380577427821522,
|
|
"grad_norm": 21.519004821777344,
|
|
"learning_rate": 1.0420006888745767e-06,
|
|
"loss": 0.3495,
|
|
"num_input_tokens_seen": 26482624,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"epoch": 0.5383778247231291,
|
|
"grad_norm": 22.21971321105957,
|
|
"learning_rate": 1.0408842805947543e-06,
|
|
"loss": 0.3668,
|
|
"num_input_tokens_seen": 26499200,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"epoch": 0.538697906664106,
|
|
"grad_norm": 32.608150482177734,
|
|
"learning_rate": 1.0397678212704276e-06,
|
|
"loss": 0.5119,
|
|
"num_input_tokens_seen": 26514048,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"epoch": 0.5390179886050829,
|
|
"grad_norm": 32.62826919555664,
|
|
"learning_rate": 1.038651312295509e-06,
|
|
"loss": 0.4034,
|
|
"num_input_tokens_seen": 26529216,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"epoch": 0.5393380705460598,
|
|
"grad_norm": 24.986495971679688,
|
|
"learning_rate": 1.037534755063973e-06,
|
|
"loss": 0.4192,
|
|
"num_input_tokens_seen": 26545152,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"epoch": 0.5396581524870366,
|
|
"grad_norm": 44.1995964050293,
|
|
"learning_rate": 1.0364181509698548e-06,
|
|
"loss": 0.4147,
|
|
"num_input_tokens_seen": 26560512,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"epoch": 0.5399782344280136,
|
|
"grad_norm": 29.369369506835938,
|
|
"learning_rate": 1.0353015014072476e-06,
|
|
"loss": 0.35,
|
|
"num_input_tokens_seen": 26575488,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"epoch": 0.5402983163689905,
|
|
"grad_norm": 50.47454071044922,
|
|
"learning_rate": 1.0341848077703013e-06,
|
|
"loss": 0.405,
|
|
"num_input_tokens_seen": 26591040,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"epoch": 0.5406183983099674,
|
|
"grad_norm": 26.94370460510254,
|
|
"learning_rate": 1.033068071453221e-06,
|
|
"loss": 0.3229,
|
|
"num_input_tokens_seen": 26606976,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"epoch": 0.5409384802509443,
|
|
"grad_norm": 35.96391677856445,
|
|
"learning_rate": 1.0319512938502653e-06,
|
|
"loss": 0.3623,
|
|
"num_input_tokens_seen": 26623296,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"epoch": 0.5412585621919211,
|
|
"grad_norm": 32.38021469116211,
|
|
"learning_rate": 1.0308344763557444e-06,
|
|
"loss": 0.3123,
|
|
"num_input_tokens_seen": 26638336,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"epoch": 0.541578644132898,
|
|
"grad_norm": 15.892178535461426,
|
|
"learning_rate": 1.0297176203640175e-06,
|
|
"loss": 0.2841,
|
|
"num_input_tokens_seen": 26654400,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"epoch": 0.5418987260738749,
|
|
"grad_norm": 54.3671760559082,
|
|
"learning_rate": 1.0286007272694924e-06,
|
|
"loss": 0.3482,
|
|
"num_input_tokens_seen": 26669568,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"epoch": 0.5422188080148518,
|
|
"grad_norm": 27.727298736572266,
|
|
"learning_rate": 1.0274837984666239e-06,
|
|
"loss": 0.4695,
|
|
"num_input_tokens_seen": 26686016,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"epoch": 0.5425388899558287,
|
|
"grad_norm": 31.10105323791504,
|
|
"learning_rate": 1.02636683534991e-06,
|
|
"loss": 0.4184,
|
|
"num_input_tokens_seen": 26701504,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"epoch": 0.5428589718968055,
|
|
"grad_norm": 62.131317138671875,
|
|
"learning_rate": 1.0252498393138928e-06,
|
|
"loss": 0.5884,
|
|
"num_input_tokens_seen": 26717120,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"epoch": 0.5431790538377824,
|
|
"grad_norm": 70.49308776855469,
|
|
"learning_rate": 1.0241328117531546e-06,
|
|
"loss": 0.4193,
|
|
"num_input_tokens_seen": 26732736,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"epoch": 0.5434991357787594,
|
|
"grad_norm": 30.73244285583496,
|
|
"learning_rate": 1.0230157540623174e-06,
|
|
"loss": 0.4126,
|
|
"num_input_tokens_seen": 26747392,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"epoch": 0.5438192177197363,
|
|
"grad_norm": 22.281478881835938,
|
|
"learning_rate": 1.0218986676360415e-06,
|
|
"loss": 0.4462,
|
|
"num_input_tokens_seen": 26762112,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"epoch": 0.5441392996607132,
|
|
"grad_norm": 22.789291381835938,
|
|
"learning_rate": 1.0207815538690216e-06,
|
|
"loss": 0.3709,
|
|
"num_input_tokens_seen": 26777856,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 0.54445938160169,
|
|
"grad_norm": 51.15581512451172,
|
|
"learning_rate": 1.0196644141559877e-06,
|
|
"loss": 0.3055,
|
|
"num_input_tokens_seen": 26794048,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"epoch": 0.5447794635426669,
|
|
"grad_norm": 42.44687271118164,
|
|
"learning_rate": 1.0185472498917021e-06,
|
|
"loss": 0.3509,
|
|
"num_input_tokens_seen": 26809792,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"epoch": 0.5450995454836438,
|
|
"grad_norm": 53.03976058959961,
|
|
"learning_rate": 1.017430062470957e-06,
|
|
"loss": 0.4421,
|
|
"num_input_tokens_seen": 26825024,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"epoch": 0.5454196274246207,
|
|
"grad_norm": 29.0567569732666,
|
|
"learning_rate": 1.016312853288574e-06,
|
|
"loss": 0.3472,
|
|
"num_input_tokens_seen": 26841536,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"epoch": 0.5457397093655976,
|
|
"grad_norm": 21.90899085998535,
|
|
"learning_rate": 1.0151956237394027e-06,
|
|
"loss": 0.395,
|
|
"num_input_tokens_seen": 26857600,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"epoch": 0.5460597913065744,
|
|
"grad_norm": 27.42255210876465,
|
|
"learning_rate": 1.0140783752183164e-06,
|
|
"loss": 0.3942,
|
|
"num_input_tokens_seen": 26874176,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"epoch": 0.5463798732475513,
|
|
"grad_norm": 26.120128631591797,
|
|
"learning_rate": 1.0129611091202138e-06,
|
|
"loss": 0.4162,
|
|
"num_input_tokens_seen": 26890176,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"epoch": 0.5466999551885282,
|
|
"grad_norm": 25.828702926635742,
|
|
"learning_rate": 1.0118438268400135e-06,
|
|
"loss": 0.2897,
|
|
"num_input_tokens_seen": 26905728,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"epoch": 0.5470200371295052,
|
|
"grad_norm": 46.99468994140625,
|
|
"learning_rate": 1.0107265297726568e-06,
|
|
"loss": 0.4655,
|
|
"num_input_tokens_seen": 26921280,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"epoch": 0.5473401190704821,
|
|
"grad_norm": 34.46550369262695,
|
|
"learning_rate": 1.009609219313102e-06,
|
|
"loss": 0.4065,
|
|
"num_input_tokens_seen": 26936704,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"epoch": 0.547660201011459,
|
|
"grad_norm": 16.38555145263672,
|
|
"learning_rate": 1.0084918968563236e-06,
|
|
"loss": 0.4008,
|
|
"num_input_tokens_seen": 26952448,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"epoch": 0.5479802829524358,
|
|
"grad_norm": 30.922161102294922,
|
|
"learning_rate": 1.0073745637973124e-06,
|
|
"loss": 0.3928,
|
|
"num_input_tokens_seen": 26967680,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"epoch": 0.5483003648934127,
|
|
"grad_norm": 17.125778198242188,
|
|
"learning_rate": 1.0062572215310718e-06,
|
|
"loss": 0.3489,
|
|
"num_input_tokens_seen": 26982400,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"epoch": 0.5486204468343896,
|
|
"grad_norm": 45.65067672729492,
|
|
"learning_rate": 1.0051398714526165e-06,
|
|
"loss": 0.313,
|
|
"num_input_tokens_seen": 26998400,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"epoch": 0.5489405287753665,
|
|
"grad_norm": 45.19715118408203,
|
|
"learning_rate": 1.0040225149569712e-06,
|
|
"loss": 0.3506,
|
|
"num_input_tokens_seen": 27015936,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"epoch": 0.5492606107163434,
|
|
"grad_norm": 36.80413055419922,
|
|
"learning_rate": 1.0029051534391693e-06,
|
|
"loss": 0.3263,
|
|
"num_input_tokens_seen": 27030528,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"epoch": 0.5495806926573202,
|
|
"grad_norm": 21.942888259887695,
|
|
"learning_rate": 1.001787788294249e-06,
|
|
"loss": 0.3621,
|
|
"num_input_tokens_seen": 27046080,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"epoch": 0.5499007745982971,
|
|
"grad_norm": 22.532997131347656,
|
|
"learning_rate": 1.0006704209172537e-06,
|
|
"loss": 0.4206,
|
|
"num_input_tokens_seen": 27061504,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"epoch": 0.5502208565392741,
|
|
"grad_norm": 47.835289001464844,
|
|
"learning_rate": 9.995530527032301e-07,
|
|
"loss": 0.4297,
|
|
"num_input_tokens_seen": 27077056,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"epoch": 0.550540938480251,
|
|
"grad_norm": 27.61309051513672,
|
|
"learning_rate": 9.984356850472257e-07,
|
|
"loss": 0.3382,
|
|
"num_input_tokens_seen": 27095168,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 0.5506689712566417,
|
|
"eval_loss": 0.3985471725463867,
|
|
"eval_runtime": 49.1827,
|
|
"eval_samples_per_second": 282.335,
|
|
"eval_steps_per_second": 35.297,
|
|
"num_input_tokens_seen": 27101056,
|
|
"step": 8602
|
|
},
|
|
{
|
|
"epoch": 0.5508610204212279,
|
|
"grad_norm": 21.579906463623047,
|
|
"learning_rate": 9.97318319344287e-07,
|
|
"loss": 0.3698,
|
|
"num_input_tokens_seen": 27110144,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"epoch": 0.5511811023622047,
|
|
"grad_norm": 25.68075180053711,
|
|
"learning_rate": 9.962009569894577e-07,
|
|
"loss": 0.5311,
|
|
"num_input_tokens_seen": 27124864,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"epoch": 0.5515011843031816,
|
|
"grad_norm": 29.338640213012695,
|
|
"learning_rate": 9.95083599377778e-07,
|
|
"loss": 0.3769,
|
|
"num_input_tokens_seen": 27140160,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"epoch": 0.5518212662441585,
|
|
"grad_norm": 28.375497817993164,
|
|
"learning_rate": 9.939662479042828e-07,
|
|
"loss": 0.374,
|
|
"num_input_tokens_seen": 27155712,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"epoch": 0.5521413481851354,
|
|
"grad_norm": 62.96663284301758,
|
|
"learning_rate": 9.92848903963998e-07,
|
|
"loss": 0.4573,
|
|
"num_input_tokens_seen": 27171520,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"epoch": 0.5524614301261123,
|
|
"grad_norm": 33.80502700805664,
|
|
"learning_rate": 9.9173156895194e-07,
|
|
"loss": 0.4487,
|
|
"num_input_tokens_seen": 27186752,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"epoch": 0.5527815120670891,
|
|
"grad_norm": 25.58247184753418,
|
|
"learning_rate": 9.906142442631154e-07,
|
|
"loss": 0.3823,
|
|
"num_input_tokens_seen": 27201664,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"epoch": 0.553101594008066,
|
|
"grad_norm": 16.91172218322754,
|
|
"learning_rate": 9.894969312925171e-07,
|
|
"loss": 0.3804,
|
|
"num_input_tokens_seen": 27218880,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"epoch": 0.5534216759490429,
|
|
"grad_norm": 51.58964157104492,
|
|
"learning_rate": 9.883796314351234e-07,
|
|
"loss": 0.3448,
|
|
"num_input_tokens_seen": 27235648,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"epoch": 0.5537417578900199,
|
|
"grad_norm": 13.989603996276855,
|
|
"learning_rate": 9.872623460858966e-07,
|
|
"loss": 0.3997,
|
|
"num_input_tokens_seen": 27250880,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"epoch": 0.5540618398309968,
|
|
"grad_norm": 12.994277954101562,
|
|
"learning_rate": 9.861450766397799e-07,
|
|
"loss": 0.3163,
|
|
"num_input_tokens_seen": 27266880,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"epoch": 0.5543819217719737,
|
|
"grad_norm": 24.744857788085938,
|
|
"learning_rate": 9.850278244916976e-07,
|
|
"loss": 0.411,
|
|
"num_input_tokens_seen": 27282816,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"epoch": 0.5547020037129505,
|
|
"grad_norm": 19.459922790527344,
|
|
"learning_rate": 9.839105910365524e-07,
|
|
"loss": 0.4309,
|
|
"num_input_tokens_seen": 27298496,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"epoch": 0.5550220856539274,
|
|
"grad_norm": 38.75739288330078,
|
|
"learning_rate": 9.827933776692235e-07,
|
|
"loss": 0.331,
|
|
"num_input_tokens_seen": 27313856,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"epoch": 0.5553421675949043,
|
|
"grad_norm": 30.53208351135254,
|
|
"learning_rate": 9.81676185784564e-07,
|
|
"loss": 0.34,
|
|
"num_input_tokens_seen": 27328448,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"epoch": 0.5556622495358812,
|
|
"grad_norm": 15.985432624816895,
|
|
"learning_rate": 9.805590167774021e-07,
|
|
"loss": 0.3916,
|
|
"num_input_tokens_seen": 27343872,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"epoch": 0.555982331476858,
|
|
"grad_norm": 75.76026153564453,
|
|
"learning_rate": 9.79441872042536e-07,
|
|
"loss": 0.5632,
|
|
"num_input_tokens_seen": 27358720,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"epoch": 0.5563024134178349,
|
|
"grad_norm": 32.97372817993164,
|
|
"learning_rate": 9.783247529747338e-07,
|
|
"loss": 0.3856,
|
|
"num_input_tokens_seen": 27373312,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"epoch": 0.5566224953588118,
|
|
"grad_norm": 24.052457809448242,
|
|
"learning_rate": 9.772076609687323e-07,
|
|
"loss": 0.3571,
|
|
"num_input_tokens_seen": 27388544,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"epoch": 0.5569425772997888,
|
|
"grad_norm": 35.491371154785156,
|
|
"learning_rate": 9.760905974192334e-07,
|
|
"loss": 0.3259,
|
|
"num_input_tokens_seen": 27405120,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 0.5572626592407657,
|
|
"grad_norm": 22.80748748779297,
|
|
"learning_rate": 9.749735637209044e-07,
|
|
"loss": 0.4078,
|
|
"num_input_tokens_seen": 27420544,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"epoch": 0.5575827411817426,
|
|
"grad_norm": 16.720609664916992,
|
|
"learning_rate": 9.738565612683754e-07,
|
|
"loss": 0.3137,
|
|
"num_input_tokens_seen": 27435456,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"epoch": 0.5579028231227194,
|
|
"grad_norm": 28.667015075683594,
|
|
"learning_rate": 9.727395914562363e-07,
|
|
"loss": 0.3477,
|
|
"num_input_tokens_seen": 27452032,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"epoch": 0.5582229050636963,
|
|
"grad_norm": 25.73943328857422,
|
|
"learning_rate": 9.716226556790372e-07,
|
|
"loss": 0.4159,
|
|
"num_input_tokens_seen": 27467520,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"epoch": 0.5585429870046732,
|
|
"grad_norm": 29.19492530822754,
|
|
"learning_rate": 9.705057553312855e-07,
|
|
"loss": 0.312,
|
|
"num_input_tokens_seen": 27482816,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"epoch": 0.5588630689456501,
|
|
"grad_norm": 24.96323013305664,
|
|
"learning_rate": 9.693888918074452e-07,
|
|
"loss": 0.374,
|
|
"num_input_tokens_seen": 27497600,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"epoch": 0.559183150886627,
|
|
"grad_norm": 40.72119140625,
|
|
"learning_rate": 9.682720665019325e-07,
|
|
"loss": 0.4861,
|
|
"num_input_tokens_seen": 27513344,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"epoch": 0.5595032328276038,
|
|
"grad_norm": 20.98204231262207,
|
|
"learning_rate": 9.671552808091172e-07,
|
|
"loss": 0.4204,
|
|
"num_input_tokens_seen": 27530304,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"epoch": 0.5598233147685807,
|
|
"grad_norm": 21.96649932861328,
|
|
"learning_rate": 9.660385361233195e-07,
|
|
"loss": 0.3409,
|
|
"num_input_tokens_seen": 27545664,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"epoch": 0.5601433967095576,
|
|
"grad_norm": 20.051984786987305,
|
|
"learning_rate": 9.649218338388084e-07,
|
|
"loss": 0.2987,
|
|
"num_input_tokens_seen": 27560704,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"epoch": 0.5604634786505346,
|
|
"grad_norm": 21.695167541503906,
|
|
"learning_rate": 9.638051753497994e-07,
|
|
"loss": 0.4353,
|
|
"num_input_tokens_seen": 27577472,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"epoch": 0.5607835605915115,
|
|
"grad_norm": 20.893781661987305,
|
|
"learning_rate": 9.62688562050454e-07,
|
|
"loss": 0.3597,
|
|
"num_input_tokens_seen": 27592960,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"epoch": 0.5611036425324883,
|
|
"grad_norm": 17.552732467651367,
|
|
"learning_rate": 9.615719953348772e-07,
|
|
"loss": 0.4033,
|
|
"num_input_tokens_seen": 27610304,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"epoch": 0.5614237244734652,
|
|
"grad_norm": 37.0562629699707,
|
|
"learning_rate": 9.604554765971148e-07,
|
|
"loss": 0.5574,
|
|
"num_input_tokens_seen": 27628288,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"epoch": 0.5617438064144421,
|
|
"grad_norm": 20.61250114440918,
|
|
"learning_rate": 9.593390072311549e-07,
|
|
"loss": 0.4069,
|
|
"num_input_tokens_seen": 27643904,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"epoch": 0.562063888355419,
|
|
"grad_norm": 20.135055541992188,
|
|
"learning_rate": 9.582225886309216e-07,
|
|
"loss": 0.3576,
|
|
"num_input_tokens_seen": 27660224,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"epoch": 0.5623839702963959,
|
|
"grad_norm": 18.054454803466797,
|
|
"learning_rate": 9.571062221902767e-07,
|
|
"loss": 0.3015,
|
|
"num_input_tokens_seen": 27675136,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"epoch": 0.5627040522373727,
|
|
"grad_norm": 56.18107223510742,
|
|
"learning_rate": 9.559899093030175e-07,
|
|
"loss": 0.3485,
|
|
"num_input_tokens_seen": 27690176,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"epoch": 0.5630241341783496,
|
|
"grad_norm": 34.77610778808594,
|
|
"learning_rate": 9.54873651362873e-07,
|
|
"loss": 0.3061,
|
|
"num_input_tokens_seen": 27704512,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"epoch": 0.5633442161193265,
|
|
"grad_norm": 49.7370491027832,
|
|
"learning_rate": 9.537574497635043e-07,
|
|
"loss": 0.46,
|
|
"num_input_tokens_seen": 27720448,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 0.5636642980603035,
|
|
"grad_norm": 27.712543487548828,
|
|
"learning_rate": 9.52641305898503e-07,
|
|
"loss": 0.4966,
|
|
"num_input_tokens_seen": 27735808,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"epoch": 0.5639843800012804,
|
|
"grad_norm": 27.37342643737793,
|
|
"learning_rate": 9.515252211613873e-07,
|
|
"loss": 0.3122,
|
|
"num_input_tokens_seen": 27750464,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"epoch": 0.5643044619422573,
|
|
"grad_norm": 35.751590728759766,
|
|
"learning_rate": 9.504091969456021e-07,
|
|
"loss": 0.4586,
|
|
"num_input_tokens_seen": 27764352,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"epoch": 0.5646245438832341,
|
|
"grad_norm": 20.632070541381836,
|
|
"learning_rate": 9.492932346445165e-07,
|
|
"loss": 0.338,
|
|
"num_input_tokens_seen": 27779840,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"epoch": 0.564944625824211,
|
|
"grad_norm": 23.484317779541016,
|
|
"learning_rate": 9.48177335651423e-07,
|
|
"loss": 0.27,
|
|
"num_input_tokens_seen": 27796352,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"epoch": 0.5652647077651879,
|
|
"grad_norm": 33.279884338378906,
|
|
"learning_rate": 9.470615013595346e-07,
|
|
"loss": 0.3325,
|
|
"num_input_tokens_seen": 27810624,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"epoch": 0.5655847897061648,
|
|
"grad_norm": 42.17190933227539,
|
|
"learning_rate": 9.459457331619829e-07,
|
|
"loss": 0.4447,
|
|
"num_input_tokens_seen": 27825152,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"epoch": 0.5659048716471416,
|
|
"grad_norm": 33.045230865478516,
|
|
"learning_rate": 9.448300324518182e-07,
|
|
"loss": 0.4076,
|
|
"num_input_tokens_seen": 27840384,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"epoch": 0.5662249535881185,
|
|
"grad_norm": 32.98795700073242,
|
|
"learning_rate": 9.437144006220058e-07,
|
|
"loss": 0.3017,
|
|
"num_input_tokens_seen": 27856640,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"epoch": 0.5665450355290954,
|
|
"grad_norm": 9.297707557678223,
|
|
"learning_rate": 9.425988390654249e-07,
|
|
"loss": 0.2027,
|
|
"num_input_tokens_seen": 27872768,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"epoch": 0.5668651174700723,
|
|
"grad_norm": 40.04125213623047,
|
|
"learning_rate": 9.414833491748677e-07,
|
|
"loss": 0.4955,
|
|
"num_input_tokens_seen": 27887488,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"epoch": 0.5671851994110493,
|
|
"grad_norm": 45.78459167480469,
|
|
"learning_rate": 9.40367932343036e-07,
|
|
"loss": 0.3024,
|
|
"num_input_tokens_seen": 27902720,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"epoch": 0.5675052813520262,
|
|
"grad_norm": 28.001405715942383,
|
|
"learning_rate": 9.392525899625407e-07,
|
|
"loss": 0.374,
|
|
"num_input_tokens_seen": 27918080,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"epoch": 0.567825363293003,
|
|
"grad_norm": 48.28670120239258,
|
|
"learning_rate": 9.381373234259004e-07,
|
|
"loss": 0.4011,
|
|
"num_input_tokens_seen": 27933760,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"epoch": 0.5681454452339799,
|
|
"grad_norm": 42.333187103271484,
|
|
"learning_rate": 9.370221341255382e-07,
|
|
"loss": 0.375,
|
|
"num_input_tokens_seen": 27948992,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"epoch": 0.5684655271749568,
|
|
"grad_norm": 28.905458450317383,
|
|
"learning_rate": 9.359070234537807e-07,
|
|
"loss": 0.3382,
|
|
"num_input_tokens_seen": 27966848,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"epoch": 0.5687856091159337,
|
|
"grad_norm": 27.128929138183594,
|
|
"learning_rate": 9.34791992802857e-07,
|
|
"loss": 0.3803,
|
|
"num_input_tokens_seen": 27981696,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"epoch": 0.5691056910569106,
|
|
"grad_norm": 29.13878631591797,
|
|
"learning_rate": 9.336770435648963e-07,
|
|
"loss": 0.2607,
|
|
"num_input_tokens_seen": 27997376,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"epoch": 0.5694257729978874,
|
|
"grad_norm": 25.84345054626465,
|
|
"learning_rate": 9.325621771319246e-07,
|
|
"loss": 0.4075,
|
|
"num_input_tokens_seen": 28014016,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"epoch": 0.5697458549388643,
|
|
"grad_norm": 21.55052947998047,
|
|
"learning_rate": 9.314473948958673e-07,
|
|
"loss": 0.4178,
|
|
"num_input_tokens_seen": 28030400,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 0.5700659368798412,
|
|
"grad_norm": 25.94553565979004,
|
|
"learning_rate": 9.303326982485422e-07,
|
|
"loss": 0.3456,
|
|
"num_input_tokens_seen": 28047104,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"epoch": 0.5703860188208181,
|
|
"grad_norm": 49.04792785644531,
|
|
"learning_rate": 9.29218088581661e-07,
|
|
"loss": 0.3546,
|
|
"num_input_tokens_seen": 28063168,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"epoch": 0.5707061007617951,
|
|
"grad_norm": 28.955217361450195,
|
|
"learning_rate": 9.281035672868278e-07,
|
|
"loss": 0.3462,
|
|
"num_input_tokens_seen": 28079104,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"epoch": 0.571026182702772,
|
|
"grad_norm": 27.242048263549805,
|
|
"learning_rate": 9.269891357555348e-07,
|
|
"loss": 0.3912,
|
|
"num_input_tokens_seen": 28094720,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"epoch": 0.5713462646437488,
|
|
"grad_norm": 39.87770462036133,
|
|
"learning_rate": 9.25874795379163e-07,
|
|
"loss": 0.2754,
|
|
"num_input_tokens_seen": 28110848,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"epoch": 0.5716663465847257,
|
|
"grad_norm": 22.331693649291992,
|
|
"learning_rate": 9.247605475489793e-07,
|
|
"loss": 0.4172,
|
|
"num_input_tokens_seen": 28127040,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"epoch": 0.5719864285257026,
|
|
"grad_norm": 33.441993713378906,
|
|
"learning_rate": 9.236463936561358e-07,
|
|
"loss": 0.3062,
|
|
"num_input_tokens_seen": 28143424,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"epoch": 0.5723065104666795,
|
|
"grad_norm": 48.873287200927734,
|
|
"learning_rate": 9.225323350916661e-07,
|
|
"loss": 0.5365,
|
|
"num_input_tokens_seen": 28158528,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"epoch": 0.5726265924076563,
|
|
"grad_norm": 35.569923400878906,
|
|
"learning_rate": 9.214183732464855e-07,
|
|
"loss": 0.3948,
|
|
"num_input_tokens_seen": 28173888,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"epoch": 0.5729466743486332,
|
|
"grad_norm": 20.366697311401367,
|
|
"learning_rate": 9.203045095113886e-07,
|
|
"loss": 0.3671,
|
|
"num_input_tokens_seen": 28191872,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"epoch": 0.5732667562896101,
|
|
"grad_norm": 45.24616622924805,
|
|
"learning_rate": 9.191907452770476e-07,
|
|
"loss": 0.4305,
|
|
"num_input_tokens_seen": 28206912,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"epoch": 0.573586838230587,
|
|
"grad_norm": 29.864273071289062,
|
|
"learning_rate": 9.180770819340095e-07,
|
|
"loss": 0.4233,
|
|
"num_input_tokens_seen": 28222336,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"epoch": 0.573906920171564,
|
|
"grad_norm": 14.063233375549316,
|
|
"learning_rate": 9.169635208726967e-07,
|
|
"loss": 0.376,
|
|
"num_input_tokens_seen": 28238144,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"epoch": 0.5742270021125409,
|
|
"grad_norm": 62.739784240722656,
|
|
"learning_rate": 9.15850063483403e-07,
|
|
"loss": 0.3787,
|
|
"num_input_tokens_seen": 28253376,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"epoch": 0.5745470840535177,
|
|
"grad_norm": 28.41097068786621,
|
|
"learning_rate": 9.147367111562928e-07,
|
|
"loss": 0.3493,
|
|
"num_input_tokens_seen": 28269248,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"epoch": 0.5748671659944946,
|
|
"grad_norm": 35.87826919555664,
|
|
"learning_rate": 9.136234652814005e-07,
|
|
"loss": 0.4094,
|
|
"num_input_tokens_seen": 28285440,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"epoch": 0.5751872479354715,
|
|
"grad_norm": 27.88485336303711,
|
|
"learning_rate": 9.125103272486255e-07,
|
|
"loss": 0.2965,
|
|
"num_input_tokens_seen": 28300736,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"epoch": 0.5755073298764484,
|
|
"grad_norm": 30.880252838134766,
|
|
"learning_rate": 9.11397298447734e-07,
|
|
"loss": 0.361,
|
|
"num_input_tokens_seen": 28315712,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"epoch": 0.5758274118174252,
|
|
"grad_norm": 30.014013290405273,
|
|
"learning_rate": 9.10284380268356e-07,
|
|
"loss": 0.3287,
|
|
"num_input_tokens_seen": 28332032,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"epoch": 0.5761474937584021,
|
|
"grad_norm": 26.396350860595703,
|
|
"learning_rate": 9.091715740999828e-07,
|
|
"loss": 0.4476,
|
|
"num_input_tokens_seen": 28347968,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.576467575699379,
|
|
"grad_norm": 23.355926513671875,
|
|
"learning_rate": 9.080588813319654e-07,
|
|
"loss": 0.3849,
|
|
"num_input_tokens_seen": 28362944,
|
|
"step": 9005
|
|
},
|
|
{
|
|
"epoch": 0.5767876576403559,
|
|
"grad_norm": 42.71702194213867,
|
|
"learning_rate": 9.069463033535143e-07,
|
|
"loss": 0.3032,
|
|
"num_input_tokens_seen": 28378624,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"epoch": 0.5771077395813328,
|
|
"grad_norm": 62.55430603027344,
|
|
"learning_rate": 9.058338415536962e-07,
|
|
"loss": 0.3865,
|
|
"num_input_tokens_seen": 28394048,
|
|
"step": 9015
|
|
},
|
|
{
|
|
"epoch": 0.5774278215223098,
|
|
"grad_norm": 38.583648681640625,
|
|
"learning_rate": 9.04721497321432e-07,
|
|
"loss": 0.3808,
|
|
"num_input_tokens_seen": 28409664,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"epoch": 0.5777479034632866,
|
|
"grad_norm": 31.30422592163086,
|
|
"learning_rate": 9.036092720454977e-07,
|
|
"loss": 0.3744,
|
|
"num_input_tokens_seen": 28424768,
|
|
"step": 9025
|
|
},
|
|
{
|
|
"epoch": 0.5780679854042635,
|
|
"grad_norm": 29.469755172729492,
|
|
"learning_rate": 9.024971671145189e-07,
|
|
"loss": 0.3387,
|
|
"num_input_tokens_seen": 28439424,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"epoch": 0.5783880673452404,
|
|
"grad_norm": 41.49711608886719,
|
|
"learning_rate": 9.013851839169718e-07,
|
|
"loss": 0.4406,
|
|
"num_input_tokens_seen": 28456064,
|
|
"step": 9035
|
|
},
|
|
{
|
|
"epoch": 0.5787081492862173,
|
|
"grad_norm": 42.17570495605469,
|
|
"learning_rate": 9.002733238411801e-07,
|
|
"loss": 0.3388,
|
|
"num_input_tokens_seen": 28472768,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"epoch": 0.5790282312271942,
|
|
"grad_norm": 31.11846160888672,
|
|
"learning_rate": 8.991615882753147e-07,
|
|
"loss": 0.3489,
|
|
"num_input_tokens_seen": 28488704,
|
|
"step": 9045
|
|
},
|
|
{
|
|
"epoch": 0.579348313168171,
|
|
"grad_norm": 55.96306610107422,
|
|
"learning_rate": 8.980499786073904e-07,
|
|
"loss": 0.4431,
|
|
"num_input_tokens_seen": 28503808,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"epoch": 0.5796683951091479,
|
|
"grad_norm": 54.62471008300781,
|
|
"learning_rate": 8.969384962252645e-07,
|
|
"loss": 0.4759,
|
|
"num_input_tokens_seen": 28520320,
|
|
"step": 9055
|
|
},
|
|
{
|
|
"epoch": 0.5799884770501248,
|
|
"grad_norm": 47.783241271972656,
|
|
"learning_rate": 8.958271425166366e-07,
|
|
"loss": 0.4431,
|
|
"num_input_tokens_seen": 28535680,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"epoch": 0.5803085589911017,
|
|
"grad_norm": 22.617599487304688,
|
|
"learning_rate": 8.947159188690442e-07,
|
|
"loss": 0.396,
|
|
"num_input_tokens_seen": 28551488,
|
|
"step": 9065
|
|
},
|
|
{
|
|
"epoch": 0.5806286409320787,
|
|
"grad_norm": 67.4439697265625,
|
|
"learning_rate": 8.93604826669863e-07,
|
|
"loss": 0.4786,
|
|
"num_input_tokens_seen": 28567040,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"epoch": 0.5809487228730555,
|
|
"grad_norm": 26.622365951538086,
|
|
"learning_rate": 8.924938673063052e-07,
|
|
"loss": 0.3986,
|
|
"num_input_tokens_seen": 28581568,
|
|
"step": 9075
|
|
},
|
|
{
|
|
"epoch": 0.5812688048140324,
|
|
"grad_norm": 15.871992111206055,
|
|
"learning_rate": 8.913830421654166e-07,
|
|
"loss": 0.3559,
|
|
"num_input_tokens_seen": 28596992,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"epoch": 0.5815888867550093,
|
|
"grad_norm": 22.36756134033203,
|
|
"learning_rate": 8.902723526340746e-07,
|
|
"loss": 0.4757,
|
|
"num_input_tokens_seen": 28613952,
|
|
"step": 9085
|
|
},
|
|
{
|
|
"epoch": 0.5819089686959862,
|
|
"grad_norm": 26.785381317138672,
|
|
"learning_rate": 8.89161800098989e-07,
|
|
"loss": 0.4202,
|
|
"num_input_tokens_seen": 28628736,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"epoch": 0.5822290506369631,
|
|
"grad_norm": 54.52938461303711,
|
|
"learning_rate": 8.880513859466974e-07,
|
|
"loss": 0.3704,
|
|
"num_input_tokens_seen": 28644928,
|
|
"step": 9095
|
|
},
|
|
{
|
|
"epoch": 0.5825491325779399,
|
|
"grad_norm": 17.885007858276367,
|
|
"learning_rate": 8.869411115635645e-07,
|
|
"loss": 0.278,
|
|
"num_input_tokens_seen": 28661184,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"epoch": 0.5828692145189168,
|
|
"grad_norm": 17.88958740234375,
|
|
"learning_rate": 8.858309783357816e-07,
|
|
"loss": 0.2772,
|
|
"num_input_tokens_seen": 28675776,
|
|
"step": 9105
|
|
},
|
|
{
|
|
"epoch": 0.5831892964598937,
|
|
"grad_norm": 53.37077713012695,
|
|
"learning_rate": 8.847209876493629e-07,
|
|
"loss": 0.4318,
|
|
"num_input_tokens_seen": 28692160,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"epoch": 0.5835093784008706,
|
|
"grad_norm": 30.646394729614258,
|
|
"learning_rate": 8.836111408901441e-07,
|
|
"loss": 0.2576,
|
|
"num_input_tokens_seen": 28707328,
|
|
"step": 9115
|
|
},
|
|
{
|
|
"epoch": 0.5838294603418475,
|
|
"grad_norm": 43.16847610473633,
|
|
"learning_rate": 8.825014394437828e-07,
|
|
"loss": 0.4235,
|
|
"num_input_tokens_seen": 28722624,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"epoch": 0.5841495422828245,
|
|
"grad_norm": 14.40605640411377,
|
|
"learning_rate": 8.813918846957542e-07,
|
|
"loss": 0.3748,
|
|
"num_input_tokens_seen": 28737856,
|
|
"step": 9125
|
|
},
|
|
{
|
|
"epoch": 0.5844696242238013,
|
|
"grad_norm": 20.49512481689453,
|
|
"learning_rate": 8.802824780313499e-07,
|
|
"loss": 0.4501,
|
|
"num_input_tokens_seen": 28752448,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"epoch": 0.5847897061647782,
|
|
"grad_norm": 22.4967098236084,
|
|
"learning_rate": 8.791732208356771e-07,
|
|
"loss": 0.3958,
|
|
"num_input_tokens_seen": 28767616,
|
|
"step": 9135
|
|
},
|
|
{
|
|
"epoch": 0.5851097881057551,
|
|
"grad_norm": 15.978533744812012,
|
|
"learning_rate": 8.780641144936573e-07,
|
|
"loss": 0.4649,
|
|
"num_input_tokens_seen": 28782400,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"epoch": 0.585429870046732,
|
|
"grad_norm": 48.71504211425781,
|
|
"learning_rate": 8.76955160390022e-07,
|
|
"loss": 0.4457,
|
|
"num_input_tokens_seen": 28798336,
|
|
"step": 9145
|
|
},
|
|
{
|
|
"epoch": 0.5857499519877089,
|
|
"grad_norm": 16.611661911010742,
|
|
"learning_rate": 8.758463599093136e-07,
|
|
"loss": 0.2868,
|
|
"num_input_tokens_seen": 28814336,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"epoch": 0.5860700339286857,
|
|
"grad_norm": 39.33195495605469,
|
|
"learning_rate": 8.747377144358825e-07,
|
|
"loss": 0.5273,
|
|
"num_input_tokens_seen": 28830656,
|
|
"step": 9155
|
|
},
|
|
{
|
|
"epoch": 0.5863901158696626,
|
|
"grad_norm": 42.789817810058594,
|
|
"learning_rate": 8.736292253538861e-07,
|
|
"loss": 0.418,
|
|
"num_input_tokens_seen": 28846656,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"epoch": 0.5867101978106395,
|
|
"grad_norm": 33.47774887084961,
|
|
"learning_rate": 8.725208940472851e-07,
|
|
"loss": 0.309,
|
|
"num_input_tokens_seen": 28862848,
|
|
"step": 9165
|
|
},
|
|
{
|
|
"epoch": 0.5870302797516164,
|
|
"grad_norm": 14.912242889404297,
|
|
"learning_rate": 8.714127218998448e-07,
|
|
"loss": 0.4083,
|
|
"num_input_tokens_seen": 28878400,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"epoch": 0.5873503616925934,
|
|
"grad_norm": 67.51158905029297,
|
|
"learning_rate": 8.70304710295131e-07,
|
|
"loss": 0.5084,
|
|
"num_input_tokens_seen": 28893568,
|
|
"step": 9175
|
|
},
|
|
{
|
|
"epoch": 0.5876704436335702,
|
|
"grad_norm": 29.94365692138672,
|
|
"learning_rate": 8.691968606165092e-07,
|
|
"loss": 0.367,
|
|
"num_input_tokens_seen": 28909824,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"epoch": 0.5879905255745471,
|
|
"grad_norm": 30.510108947753906,
|
|
"learning_rate": 8.680891742471429e-07,
|
|
"loss": 0.3078,
|
|
"num_input_tokens_seen": 28925568,
|
|
"step": 9185
|
|
},
|
|
{
|
|
"epoch": 0.588310607515524,
|
|
"grad_norm": 27.14842987060547,
|
|
"learning_rate": 8.669816525699912e-07,
|
|
"loss": 0.3272,
|
|
"num_input_tokens_seen": 28941056,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"epoch": 0.5886306894565009,
|
|
"grad_norm": 36.03899002075195,
|
|
"learning_rate": 8.658742969678079e-07,
|
|
"loss": 0.4143,
|
|
"num_input_tokens_seen": 28955456,
|
|
"step": 9195
|
|
},
|
|
{
|
|
"epoch": 0.5889507713974778,
|
|
"grad_norm": 33.955684661865234,
|
|
"learning_rate": 8.647671088231398e-07,
|
|
"loss": 0.2927,
|
|
"num_input_tokens_seen": 28971136,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"epoch": 0.5892708533384546,
|
|
"grad_norm": 57.654293060302734,
|
|
"learning_rate": 8.636600895183245e-07,
|
|
"loss": 0.4087,
|
|
"num_input_tokens_seen": 28988480,
|
|
"step": 9205
|
|
},
|
|
{
|
|
"epoch": 0.5895909352794315,
|
|
"grad_norm": 45.632225036621094,
|
|
"learning_rate": 8.625532404354877e-07,
|
|
"loss": 0.3669,
|
|
"num_input_tokens_seen": 29004544,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"epoch": 0.5899110172204084,
|
|
"grad_norm": 14.44135570526123,
|
|
"learning_rate": 8.614465629565443e-07,
|
|
"loss": 0.3809,
|
|
"num_input_tokens_seen": 29019328,
|
|
"step": 9215
|
|
},
|
|
{
|
|
"epoch": 0.5902310991613853,
|
|
"grad_norm": 24.873798370361328,
|
|
"learning_rate": 8.603400584631939e-07,
|
|
"loss": 0.3336,
|
|
"num_input_tokens_seen": 29034752,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"epoch": 0.5905511811023622,
|
|
"grad_norm": 34.6170654296875,
|
|
"learning_rate": 8.592337283369198e-07,
|
|
"loss": 0.4422,
|
|
"num_input_tokens_seen": 29050816,
|
|
"step": 9225
|
|
},
|
|
{
|
|
"epoch": 0.5908712630433391,
|
|
"grad_norm": 26.38481903076172,
|
|
"learning_rate": 8.581275739589893e-07,
|
|
"loss": 0.2752,
|
|
"num_input_tokens_seen": 29065920,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"epoch": 0.591191344984316,
|
|
"grad_norm": 36.17750549316406,
|
|
"learning_rate": 8.570215967104481e-07,
|
|
"loss": 0.483,
|
|
"num_input_tokens_seen": 29080960,
|
|
"step": 9235
|
|
},
|
|
{
|
|
"epoch": 0.5915114269252929,
|
|
"grad_norm": 24.824047088623047,
|
|
"learning_rate": 8.559157979721225e-07,
|
|
"loss": 0.4786,
|
|
"num_input_tokens_seen": 29096768,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"epoch": 0.5918315088662698,
|
|
"grad_norm": 35.19805908203125,
|
|
"learning_rate": 8.548101791246145e-07,
|
|
"loss": 0.5513,
|
|
"num_input_tokens_seen": 29112448,
|
|
"step": 9245
|
|
},
|
|
{
|
|
"epoch": 0.5921515908072467,
|
|
"grad_norm": 30.23106575012207,
|
|
"learning_rate": 8.537047415483028e-07,
|
|
"loss": 0.3392,
|
|
"num_input_tokens_seen": 29127808,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"epoch": 0.5924716727482235,
|
|
"grad_norm": 13.602792739868164,
|
|
"learning_rate": 8.525994866233388e-07,
|
|
"loss": 0.2774,
|
|
"num_input_tokens_seen": 29142912,
|
|
"step": 9255
|
|
},
|
|
{
|
|
"epoch": 0.5927917546892004,
|
|
"grad_norm": 45.087398529052734,
|
|
"learning_rate": 8.514944157296464e-07,
|
|
"loss": 0.3847,
|
|
"num_input_tokens_seen": 29159168,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"epoch": 0.5931118366301773,
|
|
"grad_norm": 38.43781280517578,
|
|
"learning_rate": 8.503895302469199e-07,
|
|
"loss": 0.3826,
|
|
"num_input_tokens_seen": 29175488,
|
|
"step": 9265
|
|
},
|
|
{
|
|
"epoch": 0.5934319185711542,
|
|
"grad_norm": 33.70762634277344,
|
|
"learning_rate": 8.492848315546214e-07,
|
|
"loss": 0.4143,
|
|
"num_input_tokens_seen": 29191104,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"epoch": 0.5937520005121311,
|
|
"grad_norm": 17.961454391479492,
|
|
"learning_rate": 8.4818032103198e-07,
|
|
"loss": 0.4172,
|
|
"num_input_tokens_seen": 29206208,
|
|
"step": 9275
|
|
},
|
|
{
|
|
"epoch": 0.5940720824531079,
|
|
"grad_norm": 42.23419189453125,
|
|
"learning_rate": 8.470760000579906e-07,
|
|
"loss": 0.4169,
|
|
"num_input_tokens_seen": 29221312,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"epoch": 0.5943921643940849,
|
|
"grad_norm": 46.78962707519531,
|
|
"learning_rate": 8.459718700114108e-07,
|
|
"loss": 0.4932,
|
|
"num_input_tokens_seen": 29236800,
|
|
"step": 9285
|
|
},
|
|
{
|
|
"epoch": 0.5947122463350618,
|
|
"grad_norm": 26.358369827270508,
|
|
"learning_rate": 8.448679322707595e-07,
|
|
"loss": 0.4521,
|
|
"num_input_tokens_seen": 29252480,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"epoch": 0.5950323282760387,
|
|
"grad_norm": 41.36620330810547,
|
|
"learning_rate": 8.437641882143163e-07,
|
|
"loss": 0.5845,
|
|
"num_input_tokens_seen": 29266944,
|
|
"step": 9295
|
|
},
|
|
{
|
|
"epoch": 0.5953524102170156,
|
|
"grad_norm": 17.812028884887695,
|
|
"learning_rate": 8.426606392201185e-07,
|
|
"loss": 0.319,
|
|
"num_input_tokens_seen": 29282816,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"epoch": 0.5956724921579925,
|
|
"grad_norm": 22.074562072753906,
|
|
"learning_rate": 8.415572866659599e-07,
|
|
"loss": 0.3009,
|
|
"num_input_tokens_seen": 29297984,
|
|
"step": 9305
|
|
},
|
|
{
|
|
"epoch": 0.5959925740989693,
|
|
"grad_norm": 24.042194366455078,
|
|
"learning_rate": 8.404541319293896e-07,
|
|
"loss": 0.376,
|
|
"num_input_tokens_seen": 29313664,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"epoch": 0.5963126560399462,
|
|
"grad_norm": 20.160175323486328,
|
|
"learning_rate": 8.393511763877086e-07,
|
|
"loss": 0.5842,
|
|
"num_input_tokens_seen": 29329472,
|
|
"step": 9315
|
|
},
|
|
{
|
|
"epoch": 0.5966327379809231,
|
|
"grad_norm": 33.067359924316406,
|
|
"learning_rate": 8.3824842141797e-07,
|
|
"loss": 0.4463,
|
|
"num_input_tokens_seen": 29346048,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"epoch": 0.5969528199219,
|
|
"grad_norm": 27.763477325439453,
|
|
"learning_rate": 8.371458683969765e-07,
|
|
"loss": 0.3801,
|
|
"num_input_tokens_seen": 29361664,
|
|
"step": 9325
|
|
},
|
|
{
|
|
"epoch": 0.5972729018628768,
|
|
"grad_norm": 23.89577865600586,
|
|
"learning_rate": 8.360435187012787e-07,
|
|
"loss": 0.3887,
|
|
"num_input_tokens_seen": 29376896,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"epoch": 0.5975929838038538,
|
|
"grad_norm": 36.93418502807617,
|
|
"learning_rate": 8.349413737071725e-07,
|
|
"loss": 0.3767,
|
|
"num_input_tokens_seen": 29392640,
|
|
"step": 9335
|
|
},
|
|
{
|
|
"epoch": 0.5979130657448307,
|
|
"grad_norm": 29.668235778808594,
|
|
"learning_rate": 8.338394347906994e-07,
|
|
"loss": 0.4399,
|
|
"num_input_tokens_seen": 29407808,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"epoch": 0.5982331476858076,
|
|
"grad_norm": 36.61244201660156,
|
|
"learning_rate": 8.327377033276431e-07,
|
|
"loss": 0.2995,
|
|
"num_input_tokens_seen": 29422528,
|
|
"step": 9345
|
|
},
|
|
{
|
|
"epoch": 0.5985532296267845,
|
|
"grad_norm": 25.591800689697266,
|
|
"learning_rate": 8.316361806935279e-07,
|
|
"loss": 0.3481,
|
|
"num_input_tokens_seen": 29438272,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"epoch": 0.5988733115677614,
|
|
"grad_norm": 30.289875030517578,
|
|
"learning_rate": 8.305348682636177e-07,
|
|
"loss": 0.4557,
|
|
"num_input_tokens_seen": 29453376,
|
|
"step": 9355
|
|
},
|
|
{
|
|
"epoch": 0.5991933935087382,
|
|
"grad_norm": 33.169734954833984,
|
|
"learning_rate": 8.294337674129144e-07,
|
|
"loss": 0.4204,
|
|
"num_input_tokens_seen": 29469248,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"epoch": 0.5995134754497151,
|
|
"grad_norm": 35.08827209472656,
|
|
"learning_rate": 8.283328795161554e-07,
|
|
"loss": 0.2783,
|
|
"num_input_tokens_seen": 29485888,
|
|
"step": 9365
|
|
},
|
|
{
|
|
"epoch": 0.599833557390692,
|
|
"grad_norm": 28.095083236694336,
|
|
"learning_rate": 8.272322059478114e-07,
|
|
"loss": 0.3194,
|
|
"num_input_tokens_seen": 29500864,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"epoch": 0.6001536393316689,
|
|
"grad_norm": 18.85226821899414,
|
|
"learning_rate": 8.261317480820871e-07,
|
|
"loss": 0.2312,
|
|
"num_input_tokens_seen": 29516288,
|
|
"step": 9375
|
|
},
|
|
{
|
|
"epoch": 0.6004737212726458,
|
|
"grad_norm": 34.60100173950195,
|
|
"learning_rate": 8.250315072929168e-07,
|
|
"loss": 0.4,
|
|
"num_input_tokens_seen": 29530880,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"epoch": 0.6007297868254273,
|
|
"eval_loss": 0.3916759490966797,
|
|
"eval_runtime": 49.1281,
|
|
"eval_samples_per_second": 282.649,
|
|
"eval_steps_per_second": 35.336,
|
|
"num_input_tokens_seen": 29544576,
|
|
"step": 9384
|
|
},
|
|
{
|
|
"epoch": 0.6007938032136226,
|
|
"grad_norm": 20.751314163208008,
|
|
"learning_rate": 8.239314849539637e-07,
|
|
"loss": 0.3513,
|
|
"num_input_tokens_seen": 29547840,
|
|
"step": 9385
|
|
},
|
|
{
|
|
"epoch": 0.6011138851545996,
|
|
"grad_norm": 31.6501522064209,
|
|
"learning_rate": 8.228316824386193e-07,
|
|
"loss": 0.4204,
|
|
"num_input_tokens_seen": 29564096,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"epoch": 0.6014339670955765,
|
|
"grad_norm": 33.23552322387695,
|
|
"learning_rate": 8.217321011199995e-07,
|
|
"loss": 0.3633,
|
|
"num_input_tokens_seen": 29579520,
|
|
"step": 9395
|
|
},
|
|
{
|
|
"epoch": 0.6017540490365534,
|
|
"grad_norm": 49.13716125488281,
|
|
"learning_rate": 8.206327423709441e-07,
|
|
"loss": 0.4256,
|
|
"num_input_tokens_seen": 29594048,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"epoch": 0.6020741309775303,
|
|
"grad_norm": 23.02613067626953,
|
|
"learning_rate": 8.195336075640163e-07,
|
|
"loss": 0.3871,
|
|
"num_input_tokens_seen": 29610368,
|
|
"step": 9405
|
|
},
|
|
{
|
|
"epoch": 0.6023942129185071,
|
|
"grad_norm": 32.443267822265625,
|
|
"learning_rate": 8.184346980714984e-07,
|
|
"loss": 0.4232,
|
|
"num_input_tokens_seen": 29625792,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"epoch": 0.602714294859484,
|
|
"grad_norm": 40.73899459838867,
|
|
"learning_rate": 8.173360152653914e-07,
|
|
"loss": 0.3399,
|
|
"num_input_tokens_seen": 29642240,
|
|
"step": 9415
|
|
},
|
|
{
|
|
"epoch": 0.6030343768004609,
|
|
"grad_norm": 28.00251007080078,
|
|
"learning_rate": 8.162375605174143e-07,
|
|
"loss": 0.293,
|
|
"num_input_tokens_seen": 29658176,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"epoch": 0.6033544587414378,
|
|
"grad_norm": 26.76416778564453,
|
|
"learning_rate": 8.151393351990005e-07,
|
|
"loss": 0.3118,
|
|
"num_input_tokens_seen": 29675392,
|
|
"step": 9425
|
|
},
|
|
{
|
|
"epoch": 0.6036745406824147,
|
|
"grad_norm": 29.030107498168945,
|
|
"learning_rate": 8.140413406812971e-07,
|
|
"loss": 0.4241,
|
|
"num_input_tokens_seen": 29690048,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"epoch": 0.6039946226233915,
|
|
"grad_norm": 33.374656677246094,
|
|
"learning_rate": 8.129435783351635e-07,
|
|
"loss": 0.3052,
|
|
"num_input_tokens_seen": 29705088,
|
|
"step": 9435
|
|
},
|
|
{
|
|
"epoch": 0.6043147045643685,
|
|
"grad_norm": 29.674457550048828,
|
|
"learning_rate": 8.118460495311685e-07,
|
|
"loss": 0.4482,
|
|
"num_input_tokens_seen": 29720576,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"epoch": 0.6046347865053454,
|
|
"grad_norm": 30.353450775146484,
|
|
"learning_rate": 8.107487556395901e-07,
|
|
"loss": 0.4204,
|
|
"num_input_tokens_seen": 29736896,
|
|
"step": 9445
|
|
},
|
|
{
|
|
"epoch": 0.6049548684463223,
|
|
"grad_norm": 29.06775665283203,
|
|
"learning_rate": 8.096516980304115e-07,
|
|
"loss": 0.3567,
|
|
"num_input_tokens_seen": 29752768,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"epoch": 0.6052749503872992,
|
|
"grad_norm": 50.72957229614258,
|
|
"learning_rate": 8.085548780733238e-07,
|
|
"loss": 0.3355,
|
|
"num_input_tokens_seen": 29768640,
|
|
"step": 9455
|
|
},
|
|
{
|
|
"epoch": 0.605595032328276,
|
|
"grad_norm": 32.87676239013672,
|
|
"learning_rate": 8.074582971377182e-07,
|
|
"loss": 0.338,
|
|
"num_input_tokens_seen": 29786240,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"epoch": 0.6059151142692529,
|
|
"grad_norm": 40.09199142456055,
|
|
"learning_rate": 8.063619565926892e-07,
|
|
"loss": 0.4356,
|
|
"num_input_tokens_seen": 29802176,
|
|
"step": 9465
|
|
},
|
|
{
|
|
"epoch": 0.6062351962102298,
|
|
"grad_norm": 16.3148250579834,
|
|
"learning_rate": 8.052658578070313e-07,
|
|
"loss": 0.3912,
|
|
"num_input_tokens_seen": 29817600,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"epoch": 0.6065552781512067,
|
|
"grad_norm": 13.280025482177734,
|
|
"learning_rate": 8.041700021492362e-07,
|
|
"loss": 0.3313,
|
|
"num_input_tokens_seen": 29832960,
|
|
"step": 9475
|
|
},
|
|
{
|
|
"epoch": 0.6068753600921836,
|
|
"grad_norm": 23.65538215637207,
|
|
"learning_rate": 8.030743909874924e-07,
|
|
"loss": 0.2888,
|
|
"num_input_tokens_seen": 29848448,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"epoch": 0.6071954420331604,
|
|
"grad_norm": 16.695858001708984,
|
|
"learning_rate": 8.019790256896839e-07,
|
|
"loss": 0.3247,
|
|
"num_input_tokens_seen": 29863296,
|
|
"step": 9485
|
|
},
|
|
{
|
|
"epoch": 0.6075155239741373,
|
|
"grad_norm": 45.717647552490234,
|
|
"learning_rate": 8.008839076233871e-07,
|
|
"loss": 0.3806,
|
|
"num_input_tokens_seen": 29880128,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"epoch": 0.6078356059151143,
|
|
"grad_norm": 24.243160247802734,
|
|
"learning_rate": 7.997890381558691e-07,
|
|
"loss": 0.3618,
|
|
"num_input_tokens_seen": 29895296,
|
|
"step": 9495
|
|
},
|
|
{
|
|
"epoch": 0.6081556878560912,
|
|
"grad_norm": 33.516685485839844,
|
|
"learning_rate": 7.986944186540878e-07,
|
|
"loss": 0.4291,
|
|
"num_input_tokens_seen": 29911296,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 0.6084757697970681,
|
|
"grad_norm": 45.87578582763672,
|
|
"learning_rate": 7.976000504846885e-07,
|
|
"loss": 0.4594,
|
|
"num_input_tokens_seen": 29926912,
|
|
"step": 9505
|
|
},
|
|
{
|
|
"epoch": 0.608795851738045,
|
|
"grad_norm": 104.76370239257812,
|
|
"learning_rate": 7.965059350140024e-07,
|
|
"loss": 0.4726,
|
|
"num_input_tokens_seen": 29942272,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"epoch": 0.6091159336790218,
|
|
"grad_norm": 38.258480072021484,
|
|
"learning_rate": 7.954120736080461e-07,
|
|
"loss": 0.4037,
|
|
"num_input_tokens_seen": 29958016,
|
|
"step": 9515
|
|
},
|
|
{
|
|
"epoch": 0.6094360156199987,
|
|
"grad_norm": 24.145002365112305,
|
|
"learning_rate": 7.943184676325178e-07,
|
|
"loss": 0.5797,
|
|
"num_input_tokens_seen": 29974720,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"epoch": 0.6097560975609756,
|
|
"grad_norm": 27.14354133605957,
|
|
"learning_rate": 7.932251184527974e-07,
|
|
"loss": 0.4342,
|
|
"num_input_tokens_seen": 29991680,
|
|
"step": 9525
|
|
},
|
|
{
|
|
"epoch": 0.6100761795019525,
|
|
"grad_norm": 27.287010192871094,
|
|
"learning_rate": 7.921320274339446e-07,
|
|
"loss": 0.2753,
|
|
"num_input_tokens_seen": 30007168,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"epoch": 0.6103962614429294,
|
|
"grad_norm": 39.53981018066406,
|
|
"learning_rate": 7.910391959406966e-07,
|
|
"loss": 0.3337,
|
|
"num_input_tokens_seen": 30022656,
|
|
"step": 9535
|
|
},
|
|
{
|
|
"epoch": 0.6107163433839062,
|
|
"grad_norm": 33.61812210083008,
|
|
"learning_rate": 7.899466253374653e-07,
|
|
"loss": 0.3943,
|
|
"num_input_tokens_seen": 30038144,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"epoch": 0.6110364253248832,
|
|
"grad_norm": 34.27006149291992,
|
|
"learning_rate": 7.88854316988339e-07,
|
|
"loss": 0.3347,
|
|
"num_input_tokens_seen": 30055488,
|
|
"step": 9545
|
|
},
|
|
{
|
|
"epoch": 0.6113565072658601,
|
|
"grad_norm": 39.317073822021484,
|
|
"learning_rate": 7.877622722570771e-07,
|
|
"loss": 0.3016,
|
|
"num_input_tokens_seen": 30071040,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"epoch": 0.611676589206837,
|
|
"grad_norm": 23.81880760192871,
|
|
"learning_rate": 7.866704925071101e-07,
|
|
"loss": 0.4185,
|
|
"num_input_tokens_seen": 30088000,
|
|
"step": 9555
|
|
},
|
|
{
|
|
"epoch": 0.6119966711478139,
|
|
"grad_norm": 24.980806350708008,
|
|
"learning_rate": 7.855789791015377e-07,
|
|
"loss": 0.422,
|
|
"num_input_tokens_seen": 30103040,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"epoch": 0.6123167530887907,
|
|
"grad_norm": 42.49583053588867,
|
|
"learning_rate": 7.844877334031277e-07,
|
|
"loss": 0.3946,
|
|
"num_input_tokens_seen": 30117760,
|
|
"step": 9565
|
|
},
|
|
{
|
|
"epoch": 0.6126368350297676,
|
|
"grad_norm": 32.370361328125,
|
|
"learning_rate": 7.833967567743131e-07,
|
|
"loss": 0.4797,
|
|
"num_input_tokens_seen": 30133888,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"epoch": 0.6129569169707445,
|
|
"grad_norm": 30.043428421020508,
|
|
"learning_rate": 7.823060505771903e-07,
|
|
"loss": 0.3747,
|
|
"num_input_tokens_seen": 30149312,
|
|
"step": 9575
|
|
},
|
|
{
|
|
"epoch": 0.6132769989117214,
|
|
"grad_norm": 39.43803787231445,
|
|
"learning_rate": 7.812156161735199e-07,
|
|
"loss": 0.3944,
|
|
"num_input_tokens_seen": 30163840,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"epoch": 0.6135970808526983,
|
|
"grad_norm": 69.42517852783203,
|
|
"learning_rate": 7.801254549247215e-07,
|
|
"loss": 0.5462,
|
|
"num_input_tokens_seen": 30180544,
|
|
"step": 9585
|
|
},
|
|
{
|
|
"epoch": 0.6139171627936751,
|
|
"grad_norm": 18.023378372192383,
|
|
"learning_rate": 7.790355681918739e-07,
|
|
"loss": 0.3212,
|
|
"num_input_tokens_seen": 30197120,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"epoch": 0.614237244734652,
|
|
"grad_norm": 52.89658737182617,
|
|
"learning_rate": 7.779459573357144e-07,
|
|
"loss": 0.421,
|
|
"num_input_tokens_seen": 30213376,
|
|
"step": 9595
|
|
},
|
|
{
|
|
"epoch": 0.614557326675629,
|
|
"grad_norm": 20.749906539916992,
|
|
"learning_rate": 7.768566237166338e-07,
|
|
"loss": 0.4225,
|
|
"num_input_tokens_seen": 30229120,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"epoch": 0.6148774086166059,
|
|
"grad_norm": 45.14435958862305,
|
|
"learning_rate": 7.757675686946786e-07,
|
|
"loss": 0.5064,
|
|
"num_input_tokens_seen": 30244544,
|
|
"step": 9605
|
|
},
|
|
{
|
|
"epoch": 0.6151974905575828,
|
|
"grad_norm": 31.990671157836914,
|
|
"learning_rate": 7.746787936295468e-07,
|
|
"loss": 0.4207,
|
|
"num_input_tokens_seen": 30260864,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"epoch": 0.6155175724985597,
|
|
"grad_norm": 42.7758674621582,
|
|
"learning_rate": 7.735902998805868e-07,
|
|
"loss": 0.3739,
|
|
"num_input_tokens_seen": 30275456,
|
|
"step": 9615
|
|
},
|
|
{
|
|
"epoch": 0.6158376544395365,
|
|
"grad_norm": 42.92548751831055,
|
|
"learning_rate": 7.725020888067955e-07,
|
|
"loss": 0.4195,
|
|
"num_input_tokens_seen": 30291008,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"epoch": 0.6161577363805134,
|
|
"grad_norm": 18.282148361206055,
|
|
"learning_rate": 7.714141617668176e-07,
|
|
"loss": 0.4814,
|
|
"num_input_tokens_seen": 30306816,
|
|
"step": 9625
|
|
},
|
|
{
|
|
"epoch": 0.6164778183214903,
|
|
"grad_norm": 25.10959815979004,
|
|
"learning_rate": 7.703265201189426e-07,
|
|
"loss": 0.3298,
|
|
"num_input_tokens_seen": 30322240,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"epoch": 0.6167979002624672,
|
|
"grad_norm": 17.638351440429688,
|
|
"learning_rate": 7.692391652211036e-07,
|
|
"loss": 0.3357,
|
|
"num_input_tokens_seen": 30338048,
|
|
"step": 9635
|
|
},
|
|
{
|
|
"epoch": 0.617117982203444,
|
|
"grad_norm": 40.34111404418945,
|
|
"learning_rate": 7.681520984308769e-07,
|
|
"loss": 0.3313,
|
|
"num_input_tokens_seen": 30353984,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"epoch": 0.6174380641444209,
|
|
"grad_norm": 39.976497650146484,
|
|
"learning_rate": 7.670653211054772e-07,
|
|
"loss": 0.4902,
|
|
"num_input_tokens_seen": 30370048,
|
|
"step": 9645
|
|
},
|
|
{
|
|
"epoch": 0.6177581460853978,
|
|
"grad_norm": 35.88365936279297,
|
|
"learning_rate": 7.659788346017591e-07,
|
|
"loss": 0.413,
|
|
"num_input_tokens_seen": 30385344,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"epoch": 0.6180782280263748,
|
|
"grad_norm": 35.93766784667969,
|
|
"learning_rate": 7.648926402762133e-07,
|
|
"loss": 0.3813,
|
|
"num_input_tokens_seen": 30400576,
|
|
"step": 9655
|
|
},
|
|
{
|
|
"epoch": 0.6183983099673517,
|
|
"grad_norm": 38.066795349121094,
|
|
"learning_rate": 7.638067394849671e-07,
|
|
"loss": 0.3867,
|
|
"num_input_tokens_seen": 30415424,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"epoch": 0.6187183919083286,
|
|
"grad_norm": 44.817840576171875,
|
|
"learning_rate": 7.627211335837797e-07,
|
|
"loss": 0.4056,
|
|
"num_input_tokens_seen": 30430592,
|
|
"step": 9665
|
|
},
|
|
{
|
|
"epoch": 0.6190384738493054,
|
|
"grad_norm": 21.97688865661621,
|
|
"learning_rate": 7.616358239280427e-07,
|
|
"loss": 0.4352,
|
|
"num_input_tokens_seen": 30445952,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"epoch": 0.6193585557902823,
|
|
"grad_norm": 30.693403244018555,
|
|
"learning_rate": 7.605508118727787e-07,
|
|
"loss": 0.3274,
|
|
"num_input_tokens_seen": 30461568,
|
|
"step": 9675
|
|
},
|
|
{
|
|
"epoch": 0.6196786377312592,
|
|
"grad_norm": 25.588163375854492,
|
|
"learning_rate": 7.594660987726373e-07,
|
|
"loss": 0.3611,
|
|
"num_input_tokens_seen": 30476672,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"epoch": 0.6199987196722361,
|
|
"grad_norm": 42.19605255126953,
|
|
"learning_rate": 7.583816859818956e-07,
|
|
"loss": 0.4013,
|
|
"num_input_tokens_seen": 30492672,
|
|
"step": 9685
|
|
},
|
|
{
|
|
"epoch": 0.620318801613213,
|
|
"grad_norm": 23.23065948486328,
|
|
"learning_rate": 7.57297574854456e-07,
|
|
"loss": 0.3785,
|
|
"num_input_tokens_seen": 30507712,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"epoch": 0.6206388835541898,
|
|
"grad_norm": 72.38654327392578,
|
|
"learning_rate": 7.56213766743844e-07,
|
|
"loss": 0.4395,
|
|
"num_input_tokens_seen": 30524032,
|
|
"step": 9695
|
|
},
|
|
{
|
|
"epoch": 0.6209589654951667,
|
|
"grad_norm": 16.888713836669922,
|
|
"learning_rate": 7.551302630032064e-07,
|
|
"loss": 0.333,
|
|
"num_input_tokens_seen": 30539776,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"epoch": 0.6212790474361437,
|
|
"grad_norm": 18.87851905822754,
|
|
"learning_rate": 7.540470649853106e-07,
|
|
"loss": 0.3693,
|
|
"num_input_tokens_seen": 30554752,
|
|
"step": 9705
|
|
},
|
|
{
|
|
"epoch": 0.6215991293771206,
|
|
"grad_norm": 25.751543045043945,
|
|
"learning_rate": 7.529641740425419e-07,
|
|
"loss": 0.4034,
|
|
"num_input_tokens_seen": 30571968,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"epoch": 0.6219192113180975,
|
|
"grad_norm": 30.196582794189453,
|
|
"learning_rate": 7.518815915269023e-07,
|
|
"loss": 0.4351,
|
|
"num_input_tokens_seen": 30587264,
|
|
"step": 9715
|
|
},
|
|
{
|
|
"epoch": 0.6222392932590743,
|
|
"grad_norm": 18.58189582824707,
|
|
"learning_rate": 7.507993187900092e-07,
|
|
"loss": 0.3948,
|
|
"num_input_tokens_seen": 30603200,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"epoch": 0.6225593752000512,
|
|
"grad_norm": 29.565282821655273,
|
|
"learning_rate": 7.497173571830926e-07,
|
|
"loss": 0.4253,
|
|
"num_input_tokens_seen": 30617856,
|
|
"step": 9725
|
|
},
|
|
{
|
|
"epoch": 0.6228794571410281,
|
|
"grad_norm": 43.09429168701172,
|
|
"learning_rate": 7.486357080569938e-07,
|
|
"loss": 0.4732,
|
|
"num_input_tokens_seen": 30632448,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"epoch": 0.623199539082005,
|
|
"grad_norm": 23.361135482788086,
|
|
"learning_rate": 7.47554372762165e-07,
|
|
"loss": 0.3747,
|
|
"num_input_tokens_seen": 30647680,
|
|
"step": 9735
|
|
},
|
|
{
|
|
"epoch": 0.6235196210229819,
|
|
"grad_norm": 59.268245697021484,
|
|
"learning_rate": 7.464733526486662e-07,
|
|
"loss": 0.4905,
|
|
"num_input_tokens_seen": 30663616,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"epoch": 0.6238397029639587,
|
|
"grad_norm": 40.878173828125,
|
|
"learning_rate": 7.453926490661628e-07,
|
|
"loss": 0.3424,
|
|
"num_input_tokens_seen": 30682496,
|
|
"step": 9745
|
|
},
|
|
{
|
|
"epoch": 0.6241597849049356,
|
|
"grad_norm": 51.57231521606445,
|
|
"learning_rate": 7.443122633639267e-07,
|
|
"loss": 0.3639,
|
|
"num_input_tokens_seen": 30697664,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"epoch": 0.6244798668459125,
|
|
"grad_norm": 61.236114501953125,
|
|
"learning_rate": 7.432321968908319e-07,
|
|
"loss": 0.3835,
|
|
"num_input_tokens_seen": 30713408,
|
|
"step": 9755
|
|
},
|
|
{
|
|
"epoch": 0.6247999487868895,
|
|
"grad_norm": 22.234743118286133,
|
|
"learning_rate": 7.421524509953543e-07,
|
|
"loss": 0.3173,
|
|
"num_input_tokens_seen": 30730496,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"epoch": 0.6251200307278664,
|
|
"grad_norm": 29.365135192871094,
|
|
"learning_rate": 7.410730270255687e-07,
|
|
"loss": 0.4158,
|
|
"num_input_tokens_seen": 30745664,
|
|
"step": 9765
|
|
},
|
|
{
|
|
"epoch": 0.6254401126688433,
|
|
"grad_norm": 32.928707122802734,
|
|
"learning_rate": 7.399939263291493e-07,
|
|
"loss": 0.3655,
|
|
"num_input_tokens_seen": 30760960,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"epoch": 0.6257601946098201,
|
|
"grad_norm": 33.555416107177734,
|
|
"learning_rate": 7.389151502533657e-07,
|
|
"loss": 0.4854,
|
|
"num_input_tokens_seen": 30775872,
|
|
"step": 9775
|
|
},
|
|
{
|
|
"epoch": 0.626080276550797,
|
|
"grad_norm": 17.007144927978516,
|
|
"learning_rate": 7.378367001450819e-07,
|
|
"loss": 0.3683,
|
|
"num_input_tokens_seen": 30791424,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"epoch": 0.6264003584917739,
|
|
"grad_norm": 55.41214370727539,
|
|
"learning_rate": 7.367585773507567e-07,
|
|
"loss": 0.4317,
|
|
"num_input_tokens_seen": 30807680,
|
|
"step": 9785
|
|
},
|
|
{
|
|
"epoch": 0.6267204404327508,
|
|
"grad_norm": 41.18684387207031,
|
|
"learning_rate": 7.356807832164385e-07,
|
|
"loss": 0.4428,
|
|
"num_input_tokens_seen": 30823680,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"epoch": 0.6270405223737276,
|
|
"grad_norm": 18.37259292602539,
|
|
"learning_rate": 7.346033190877654e-07,
|
|
"loss": 0.4404,
|
|
"num_input_tokens_seen": 30839360,
|
|
"step": 9795
|
|
},
|
|
{
|
|
"epoch": 0.6273606043147045,
|
|
"grad_norm": 36.56877136230469,
|
|
"learning_rate": 7.335261863099651e-07,
|
|
"loss": 0.3596,
|
|
"num_input_tokens_seen": 30854784,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"epoch": 0.6276806862556814,
|
|
"grad_norm": 31.16109275817871,
|
|
"learning_rate": 7.324493862278498e-07,
|
|
"loss": 0.3969,
|
|
"num_input_tokens_seen": 30870592,
|
|
"step": 9805
|
|
},
|
|
{
|
|
"epoch": 0.6280007681966584,
|
|
"grad_norm": 39.3582649230957,
|
|
"learning_rate": 7.313729201858167e-07,
|
|
"loss": 0.4546,
|
|
"num_input_tokens_seen": 30885952,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"epoch": 0.6283208501376353,
|
|
"grad_norm": 21.64111328125,
|
|
"learning_rate": 7.302967895278473e-07,
|
|
"loss": 0.3285,
|
|
"num_input_tokens_seen": 30902080,
|
|
"step": 9815
|
|
},
|
|
{
|
|
"epoch": 0.6286409320786122,
|
|
"grad_norm": 29.953590393066406,
|
|
"learning_rate": 7.292209955975028e-07,
|
|
"loss": 0.4045,
|
|
"num_input_tokens_seen": 30919232,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"epoch": 0.628961014019589,
|
|
"grad_norm": 37.44114685058594,
|
|
"learning_rate": 7.281455397379244e-07,
|
|
"loss": 0.4068,
|
|
"num_input_tokens_seen": 30936448,
|
|
"step": 9825
|
|
},
|
|
{
|
|
"epoch": 0.6292810959605659,
|
|
"grad_norm": 37.291465759277344,
|
|
"learning_rate": 7.270704232918316e-07,
|
|
"loss": 0.3249,
|
|
"num_input_tokens_seen": 30952256,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"epoch": 0.6296011779015428,
|
|
"grad_norm": 53.09471130371094,
|
|
"learning_rate": 7.2599564760152e-07,
|
|
"loss": 0.401,
|
|
"num_input_tokens_seen": 30967360,
|
|
"step": 9835
|
|
},
|
|
{
|
|
"epoch": 0.6299212598425197,
|
|
"grad_norm": 31.090974807739258,
|
|
"learning_rate": 7.249212140088592e-07,
|
|
"loss": 0.3851,
|
|
"num_input_tokens_seen": 30982016,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"epoch": 0.6302413417834966,
|
|
"grad_norm": 19.057065963745117,
|
|
"learning_rate": 7.23847123855293e-07,
|
|
"loss": 0.3347,
|
|
"num_input_tokens_seen": 30998080,
|
|
"step": 9845
|
|
},
|
|
{
|
|
"epoch": 0.6305614237244734,
|
|
"grad_norm": 22.847869873046875,
|
|
"learning_rate": 7.227733784818349e-07,
|
|
"loss": 0.274,
|
|
"num_input_tokens_seen": 31013184,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"epoch": 0.6308815056654503,
|
|
"grad_norm": 10.46581745147705,
|
|
"learning_rate": 7.216999792290683e-07,
|
|
"loss": 0.3758,
|
|
"num_input_tokens_seen": 31028800,
|
|
"step": 9855
|
|
},
|
|
{
|
|
"epoch": 0.6312015876064272,
|
|
"grad_norm": 32.733524322509766,
|
|
"learning_rate": 7.206269274371457e-07,
|
|
"loss": 0.4837,
|
|
"num_input_tokens_seen": 31044736,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"epoch": 0.6315216695474042,
|
|
"grad_norm": 16.939966201782227,
|
|
"learning_rate": 7.195542244457845e-07,
|
|
"loss": 0.3489,
|
|
"num_input_tokens_seen": 31059968,
|
|
"step": 9865
|
|
},
|
|
{
|
|
"epoch": 0.6318417514883811,
|
|
"grad_norm": 21.10120391845703,
|
|
"learning_rate": 7.184818715942666e-07,
|
|
"loss": 0.3215,
|
|
"num_input_tokens_seen": 31074880,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"epoch": 0.6321618334293579,
|
|
"grad_norm": 32.21525573730469,
|
|
"learning_rate": 7.174098702214374e-07,
|
|
"loss": 0.3499,
|
|
"num_input_tokens_seen": 31090432,
|
|
"step": 9875
|
|
},
|
|
{
|
|
"epoch": 0.6324819153703348,
|
|
"grad_norm": 27.200403213500977,
|
|
"learning_rate": 7.163382216657033e-07,
|
|
"loss": 0.372,
|
|
"num_input_tokens_seen": 31107264,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"epoch": 0.6328019973113117,
|
|
"grad_norm": 50.323707580566406,
|
|
"learning_rate": 7.152669272650302e-07,
|
|
"loss": 0.3531,
|
|
"num_input_tokens_seen": 31124096,
|
|
"step": 9885
|
|
},
|
|
{
|
|
"epoch": 0.6331220792522886,
|
|
"grad_norm": 51.96805191040039,
|
|
"learning_rate": 7.141959883569411e-07,
|
|
"loss": 0.3881,
|
|
"num_input_tokens_seen": 31138752,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"epoch": 0.6334421611932655,
|
|
"grad_norm": 28.28093147277832,
|
|
"learning_rate": 7.131254062785165e-07,
|
|
"loss": 0.4624,
|
|
"num_input_tokens_seen": 31154048,
|
|
"step": 9895
|
|
},
|
|
{
|
|
"epoch": 0.6337622431342423,
|
|
"grad_norm": 26.130292892456055,
|
|
"learning_rate": 7.120551823663907e-07,
|
|
"loss": 0.5159,
|
|
"num_input_tokens_seen": 31170304,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"epoch": 0.6340823250752192,
|
|
"grad_norm": 13.736687660217285,
|
|
"learning_rate": 7.109853179567499e-07,
|
|
"loss": 0.2778,
|
|
"num_input_tokens_seen": 31186368,
|
|
"step": 9905
|
|
},
|
|
{
|
|
"epoch": 0.6344024070161961,
|
|
"grad_norm": 22.524595260620117,
|
|
"learning_rate": 7.099158143853337e-07,
|
|
"loss": 0.4266,
|
|
"num_input_tokens_seen": 31201664,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"epoch": 0.634722488957173,
|
|
"grad_norm": 48.70823287963867,
|
|
"learning_rate": 7.088466729874289e-07,
|
|
"loss": 0.396,
|
|
"num_input_tokens_seen": 31217216,
|
|
"step": 9915
|
|
},
|
|
{
|
|
"epoch": 0.63504257089815,
|
|
"grad_norm": 29.29494285583496,
|
|
"learning_rate": 7.077778950978713e-07,
|
|
"loss": 0.3762,
|
|
"num_input_tokens_seen": 31233728,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"epoch": 0.6353626528391269,
|
|
"grad_norm": 17.022003173828125,
|
|
"learning_rate": 7.06709482051043e-07,
|
|
"loss": 0.4657,
|
|
"num_input_tokens_seen": 31249664,
|
|
"step": 9925
|
|
},
|
|
{
|
|
"epoch": 0.6356827347801037,
|
|
"grad_norm": 18.935352325439453,
|
|
"learning_rate": 7.056414351808698e-07,
|
|
"loss": 0.2958,
|
|
"num_input_tokens_seen": 31265408,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"epoch": 0.6360028167210806,
|
|
"grad_norm": 25.703018188476562,
|
|
"learning_rate": 7.045737558208206e-07,
|
|
"loss": 0.3557,
|
|
"num_input_tokens_seen": 31281088,
|
|
"step": 9935
|
|
},
|
|
{
|
|
"epoch": 0.6363228986620575,
|
|
"grad_norm": 28.873281478881836,
|
|
"learning_rate": 7.035064453039064e-07,
|
|
"loss": 0.4025,
|
|
"num_input_tokens_seen": 31296512,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"epoch": 0.6366429806030344,
|
|
"grad_norm": 14.413522720336914,
|
|
"learning_rate": 7.024395049626766e-07,
|
|
"loss": 0.3796,
|
|
"num_input_tokens_seen": 31312000,
|
|
"step": 9945
|
|
},
|
|
{
|
|
"epoch": 0.6369630625440112,
|
|
"grad_norm": 42.59180450439453,
|
|
"learning_rate": 7.013729361292182e-07,
|
|
"loss": 0.3378,
|
|
"num_input_tokens_seen": 31327488,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"epoch": 0.6372831444849881,
|
|
"grad_norm": 37.88176727294922,
|
|
"learning_rate": 7.003067401351554e-07,
|
|
"loss": 0.2992,
|
|
"num_input_tokens_seen": 31343936,
|
|
"step": 9955
|
|
},
|
|
{
|
|
"epoch": 0.637603226425965,
|
|
"grad_norm": 69.40776062011719,
|
|
"learning_rate": 6.992409183116465e-07,
|
|
"loss": 0.3971,
|
|
"num_input_tokens_seen": 31359232,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"epoch": 0.6379233083669419,
|
|
"grad_norm": 18.821264266967773,
|
|
"learning_rate": 6.981754719893826e-07,
|
|
"loss": 0.3715,
|
|
"num_input_tokens_seen": 31375616,
|
|
"step": 9965
|
|
},
|
|
{
|
|
"epoch": 0.6382433903079189,
|
|
"grad_norm": 49.44694137573242,
|
|
"learning_rate": 6.971104024985852e-07,
|
|
"loss": 0.4687,
|
|
"num_input_tokens_seen": 31391680,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"epoch": 0.6385634722488958,
|
|
"grad_norm": 28.005033493041992,
|
|
"learning_rate": 6.960457111690068e-07,
|
|
"loss": 0.3829,
|
|
"num_input_tokens_seen": 31407424,
|
|
"step": 9975
|
|
},
|
|
{
|
|
"epoch": 0.6388835541898726,
|
|
"grad_norm": 18.54348373413086,
|
|
"learning_rate": 6.94981399329927e-07,
|
|
"loss": 0.3854,
|
|
"num_input_tokens_seen": 31422912,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"epoch": 0.6392036361308495,
|
|
"grad_norm": 53.93000030517578,
|
|
"learning_rate": 6.939174683101509e-07,
|
|
"loss": 0.3806,
|
|
"num_input_tokens_seen": 31438912,
|
|
"step": 9985
|
|
},
|
|
{
|
|
"epoch": 0.6395237180718264,
|
|
"grad_norm": 25.696611404418945,
|
|
"learning_rate": 6.9285391943801e-07,
|
|
"loss": 0.2888,
|
|
"num_input_tokens_seen": 31455168,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"epoch": 0.6398438000128033,
|
|
"grad_norm": 32.05419158935547,
|
|
"learning_rate": 6.917907540413569e-07,
|
|
"loss": 0.32,
|
|
"num_input_tokens_seen": 31470592,
|
|
"step": 9995
|
|
},
|
|
{
|
|
"epoch": 0.6401638819537802,
|
|
"grad_norm": 31.22829818725586,
|
|
"learning_rate": 6.907279734475659e-07,
|
|
"loss": 0.3466,
|
|
"num_input_tokens_seen": 31485632,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 0.640483963894757,
|
|
"grad_norm": 47.022422790527344,
|
|
"learning_rate": 6.896655789835317e-07,
|
|
"loss": 0.353,
|
|
"num_input_tokens_seen": 31500352,
|
|
"step": 10005
|
|
},
|
|
{
|
|
"epoch": 0.6408040458357339,
|
|
"grad_norm": 38.05258560180664,
|
|
"learning_rate": 6.886035719756656e-07,
|
|
"loss": 0.365,
|
|
"num_input_tokens_seen": 31516928,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"epoch": 0.6411241277767108,
|
|
"grad_norm": 19.052812576293945,
|
|
"learning_rate": 6.875419537498959e-07,
|
|
"loss": 0.272,
|
|
"num_input_tokens_seen": 31532608,
|
|
"step": 10015
|
|
},
|
|
{
|
|
"epoch": 0.6414442097176877,
|
|
"grad_norm": 54.83672332763672,
|
|
"learning_rate": 6.864807256316658e-07,
|
|
"loss": 0.5903,
|
|
"num_input_tokens_seen": 31548608,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"epoch": 0.6417642916586647,
|
|
"grad_norm": 21.3775691986084,
|
|
"learning_rate": 6.854198889459311e-07,
|
|
"loss": 0.4124,
|
|
"num_input_tokens_seen": 31564224,
|
|
"step": 10025
|
|
},
|
|
{
|
|
"epoch": 0.6420843735996415,
|
|
"grad_norm": 8.760631561279297,
|
|
"learning_rate": 6.84359445017158e-07,
|
|
"loss": 0.2575,
|
|
"num_input_tokens_seen": 31579200,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"epoch": 0.6424044555406184,
|
|
"grad_norm": 46.2386589050293,
|
|
"learning_rate": 6.832993951693244e-07,
|
|
"loss": 0.4146,
|
|
"num_input_tokens_seen": 31594816,
|
|
"step": 10035
|
|
},
|
|
{
|
|
"epoch": 0.6427245374815953,
|
|
"grad_norm": 14.027458190917969,
|
|
"learning_rate": 6.822397407259144e-07,
|
|
"loss": 0.3439,
|
|
"num_input_tokens_seen": 31610432,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"epoch": 0.6430446194225722,
|
|
"grad_norm": 40.68043899536133,
|
|
"learning_rate": 6.811804830099186e-07,
|
|
"loss": 0.3688,
|
|
"num_input_tokens_seen": 31627520,
|
|
"step": 10045
|
|
},
|
|
{
|
|
"epoch": 0.6433647013635491,
|
|
"grad_norm": 46.507389068603516,
|
|
"learning_rate": 6.801216233438336e-07,
|
|
"loss": 0.3446,
|
|
"num_input_tokens_seen": 31644352,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"epoch": 0.6436847833045259,
|
|
"grad_norm": 30.864545822143555,
|
|
"learning_rate": 6.790631630496575e-07,
|
|
"loss": 0.3831,
|
|
"num_input_tokens_seen": 31660160,
|
|
"step": 10055
|
|
},
|
|
{
|
|
"epoch": 0.6440048652455028,
|
|
"grad_norm": 34.4333610534668,
|
|
"learning_rate": 6.780051034488903e-07,
|
|
"loss": 0.4395,
|
|
"num_input_tokens_seen": 31676352,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"epoch": 0.6443249471864797,
|
|
"grad_norm": 80.57857513427734,
|
|
"learning_rate": 6.769474458625323e-07,
|
|
"loss": 0.3439,
|
|
"num_input_tokens_seen": 31692160,
|
|
"step": 10065
|
|
},
|
|
{
|
|
"epoch": 0.6446450291274566,
|
|
"grad_norm": 17.465198516845703,
|
|
"learning_rate": 6.758901916110813e-07,
|
|
"loss": 0.3099,
|
|
"num_input_tokens_seen": 31707712,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"epoch": 0.6449651110684336,
|
|
"grad_norm": 16.61797523498535,
|
|
"learning_rate": 6.748333420145315e-07,
|
|
"loss": 0.3246,
|
|
"num_input_tokens_seen": 31723776,
|
|
"step": 10075
|
|
},
|
|
{
|
|
"epoch": 0.6452851930094105,
|
|
"grad_norm": 21.947399139404297,
|
|
"learning_rate": 6.737768983923718e-07,
|
|
"loss": 0.3972,
|
|
"num_input_tokens_seen": 31740672,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"epoch": 0.6456052749503873,
|
|
"grad_norm": 32.63840103149414,
|
|
"learning_rate": 6.727208620635849e-07,
|
|
"loss": 0.2989,
|
|
"num_input_tokens_seen": 31755648,
|
|
"step": 10085
|
|
},
|
|
{
|
|
"epoch": 0.6459253568913642,
|
|
"grad_norm": 37.15324783325195,
|
|
"learning_rate": 6.716652343466446e-07,
|
|
"loss": 0.4543,
|
|
"num_input_tokens_seen": 31770624,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"epoch": 0.6462454388323411,
|
|
"grad_norm": 40.501869201660156,
|
|
"learning_rate": 6.706100165595139e-07,
|
|
"loss": 0.3094,
|
|
"num_input_tokens_seen": 31786816,
|
|
"step": 10095
|
|
},
|
|
{
|
|
"epoch": 0.646565520773318,
|
|
"grad_norm": 33.34444808959961,
|
|
"learning_rate": 6.695552100196452e-07,
|
|
"loss": 0.396,
|
|
"num_input_tokens_seen": 31801792,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"epoch": 0.6468856027142948,
|
|
"grad_norm": 66.4857177734375,
|
|
"learning_rate": 6.685008160439769e-07,
|
|
"loss": 0.5142,
|
|
"num_input_tokens_seen": 31818944,
|
|
"step": 10105
|
|
},
|
|
{
|
|
"epoch": 0.6472056846552717,
|
|
"grad_norm": 35.997623443603516,
|
|
"learning_rate": 6.674468359489313e-07,
|
|
"loss": 0.4128,
|
|
"num_input_tokens_seen": 31834176,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"epoch": 0.6475257665962486,
|
|
"grad_norm": 34.614864349365234,
|
|
"learning_rate": 6.663932710504163e-07,
|
|
"loss": 0.3496,
|
|
"num_input_tokens_seen": 31850176,
|
|
"step": 10115
|
|
},
|
|
{
|
|
"epoch": 0.6478458485372255,
|
|
"grad_norm": 44.908058166503906,
|
|
"learning_rate": 6.653401226638192e-07,
|
|
"loss": 0.3894,
|
|
"num_input_tokens_seen": 31865600,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"epoch": 0.6481659304782024,
|
|
"grad_norm": 23.878267288208008,
|
|
"learning_rate": 6.64287392104008e-07,
|
|
"loss": 0.3921,
|
|
"num_input_tokens_seen": 31880512,
|
|
"step": 10125
|
|
},
|
|
{
|
|
"epoch": 0.6484860124191794,
|
|
"grad_norm": 23.294103622436523,
|
|
"learning_rate": 6.632350806853299e-07,
|
|
"loss": 0.4388,
|
|
"num_input_tokens_seen": 31896512,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"epoch": 0.6488060943601562,
|
|
"grad_norm": 41.51081848144531,
|
|
"learning_rate": 6.621831897216074e-07,
|
|
"loss": 0.4029,
|
|
"num_input_tokens_seen": 31912768,
|
|
"step": 10135
|
|
},
|
|
{
|
|
"epoch": 0.6491261763011331,
|
|
"grad_norm": 166.3177032470703,
|
|
"learning_rate": 6.611317205261387e-07,
|
|
"loss": 0.4345,
|
|
"num_input_tokens_seen": 31927488,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"epoch": 0.64944625824211,
|
|
"grad_norm": 28.243562698364258,
|
|
"learning_rate": 6.60080674411696e-07,
|
|
"loss": 0.3416,
|
|
"num_input_tokens_seen": 31942784,
|
|
"step": 10145
|
|
},
|
|
{
|
|
"epoch": 0.6497663401830869,
|
|
"grad_norm": 15.446354866027832,
|
|
"learning_rate": 6.590300526905225e-07,
|
|
"loss": 0.3172,
|
|
"num_input_tokens_seen": 31958528,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"epoch": 0.6500864221240638,
|
|
"grad_norm": 35.624691009521484,
|
|
"learning_rate": 6.579798566743313e-07,
|
|
"loss": 0.4676,
|
|
"num_input_tokens_seen": 31974016,
|
|
"step": 10155
|
|
},
|
|
{
|
|
"epoch": 0.6504065040650406,
|
|
"grad_norm": 41.85055160522461,
|
|
"learning_rate": 6.569300876743049e-07,
|
|
"loss": 0.3143,
|
|
"num_input_tokens_seen": 31990720,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"epoch": 0.6507265860060175,
|
|
"grad_norm": 31.454090118408203,
|
|
"learning_rate": 6.558807470010923e-07,
|
|
"loss": 0.3188,
|
|
"num_input_tokens_seen": 32007168,
|
|
"step": 10165
|
|
},
|
|
{
|
|
"epoch": 0.6507906023942129,
|
|
"eval_loss": 0.37842774391174316,
|
|
"eval_runtime": 49.1741,
|
|
"eval_samples_per_second": 282.384,
|
|
"eval_steps_per_second": 35.303,
|
|
"num_input_tokens_seen": 32010176,
|
|
"step": 10166
|
|
},
|
|
{
|
|
"epoch": 0.6510466679469944,
|
|
"grad_norm": 29.121973037719727,
|
|
"learning_rate": 6.548318359648071e-07,
|
|
"loss": 0.3642,
|
|
"num_input_tokens_seen": 32022208,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"epoch": 0.6513667498879713,
|
|
"grad_norm": 41.79141616821289,
|
|
"learning_rate": 6.537833558750279e-07,
|
|
"loss": 0.3967,
|
|
"num_input_tokens_seen": 32037760,
|
|
"step": 10175
|
|
},
|
|
{
|
|
"epoch": 0.6516868318289483,
|
|
"grad_norm": 48.895450592041016,
|
|
"learning_rate": 6.527353080407938e-07,
|
|
"loss": 0.3055,
|
|
"num_input_tokens_seen": 32052800,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"epoch": 0.6520069137699251,
|
|
"grad_norm": 25.074914932250977,
|
|
"learning_rate": 6.516876937706048e-07,
|
|
"loss": 0.3366,
|
|
"num_input_tokens_seen": 32068288,
|
|
"step": 10185
|
|
},
|
|
{
|
|
"epoch": 0.652326995710902,
|
|
"grad_norm": 24.659767150878906,
|
|
"learning_rate": 6.506405143724196e-07,
|
|
"loss": 0.3758,
|
|
"num_input_tokens_seen": 32083200,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"epoch": 0.6526470776518789,
|
|
"grad_norm": 51.243431091308594,
|
|
"learning_rate": 6.495937711536546e-07,
|
|
"loss": 0.4635,
|
|
"num_input_tokens_seen": 32098432,
|
|
"step": 10195
|
|
},
|
|
{
|
|
"epoch": 0.6529671595928558,
|
|
"grad_norm": 37.73176574707031,
|
|
"learning_rate": 6.485474654211803e-07,
|
|
"loss": 0.4226,
|
|
"num_input_tokens_seen": 32114944,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"epoch": 0.6532872415338327,
|
|
"grad_norm": 38.4589958190918,
|
|
"learning_rate": 6.475015984813217e-07,
|
|
"loss": 0.3044,
|
|
"num_input_tokens_seen": 32131520,
|
|
"step": 10205
|
|
},
|
|
{
|
|
"epoch": 0.6536073234748095,
|
|
"grad_norm": 12.53635311126709,
|
|
"learning_rate": 6.464561716398564e-07,
|
|
"loss": 0.3158,
|
|
"num_input_tokens_seen": 32147008,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"epoch": 0.6539274054157864,
|
|
"grad_norm": 31.261737823486328,
|
|
"learning_rate": 6.454111862020122e-07,
|
|
"loss": 0.3734,
|
|
"num_input_tokens_seen": 32162560,
|
|
"step": 10215
|
|
},
|
|
{
|
|
"epoch": 0.6542474873567633,
|
|
"grad_norm": 27.614994049072266,
|
|
"learning_rate": 6.443666434724649e-07,
|
|
"loss": 0.3636,
|
|
"num_input_tokens_seen": 32177024,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"epoch": 0.6545675692977402,
|
|
"grad_norm": 25.943843841552734,
|
|
"learning_rate": 6.43322544755339e-07,
|
|
"loss": 0.5155,
|
|
"num_input_tokens_seen": 32193024,
|
|
"step": 10225
|
|
},
|
|
{
|
|
"epoch": 0.6548876512387171,
|
|
"grad_norm": 29.868574142456055,
|
|
"learning_rate": 6.422788913542038e-07,
|
|
"loss": 0.3365,
|
|
"num_input_tokens_seen": 32208896,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"epoch": 0.655207733179694,
|
|
"grad_norm": 15.805740356445312,
|
|
"learning_rate": 6.412356845720726e-07,
|
|
"loss": 0.3296,
|
|
"num_input_tokens_seen": 32225280,
|
|
"step": 10235
|
|
},
|
|
{
|
|
"epoch": 0.6555278151206709,
|
|
"grad_norm": 16.956401824951172,
|
|
"learning_rate": 6.40192925711402e-07,
|
|
"loss": 0.3605,
|
|
"num_input_tokens_seen": 32240768,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"epoch": 0.6558478970616478,
|
|
"grad_norm": 27.354637145996094,
|
|
"learning_rate": 6.39150616074088e-07,
|
|
"loss": 0.3264,
|
|
"num_input_tokens_seen": 32255872,
|
|
"step": 10245
|
|
},
|
|
{
|
|
"epoch": 0.6561679790026247,
|
|
"grad_norm": 30.604806900024414,
|
|
"learning_rate": 6.381087569614668e-07,
|
|
"loss": 0.4193,
|
|
"num_input_tokens_seen": 32272512,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"epoch": 0.6564880609436016,
|
|
"grad_norm": 13.617461204528809,
|
|
"learning_rate": 6.370673496743116e-07,
|
|
"loss": 0.3828,
|
|
"num_input_tokens_seen": 32286272,
|
|
"step": 10255
|
|
},
|
|
{
|
|
"epoch": 0.6568081428845784,
|
|
"grad_norm": 25.074222564697266,
|
|
"learning_rate": 6.360263955128315e-07,
|
|
"loss": 0.4331,
|
|
"num_input_tokens_seen": 32301952,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"epoch": 0.6571282248255553,
|
|
"grad_norm": 16.422725677490234,
|
|
"learning_rate": 6.349858957766701e-07,
|
|
"loss": 0.3602,
|
|
"num_input_tokens_seen": 32318208,
|
|
"step": 10265
|
|
},
|
|
{
|
|
"epoch": 0.6574483067665322,
|
|
"grad_norm": 24.204320907592773,
|
|
"learning_rate": 6.339458517649036e-07,
|
|
"loss": 0.336,
|
|
"num_input_tokens_seen": 32333504,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"epoch": 0.6577683887075091,
|
|
"grad_norm": 32.412906646728516,
|
|
"learning_rate": 6.329062647760395e-07,
|
|
"loss": 0.3626,
|
|
"num_input_tokens_seen": 32350208,
|
|
"step": 10275
|
|
},
|
|
{
|
|
"epoch": 0.658088470648486,
|
|
"grad_norm": 35.1417121887207,
|
|
"learning_rate": 6.318671361080137e-07,
|
|
"loss": 0.3351,
|
|
"num_input_tokens_seen": 32365376,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"epoch": 0.6584085525894628,
|
|
"grad_norm": 16.784576416015625,
|
|
"learning_rate": 6.308284670581906e-07,
|
|
"loss": 0.3306,
|
|
"num_input_tokens_seen": 32381248,
|
|
"step": 10285
|
|
},
|
|
{
|
|
"epoch": 0.6587286345304398,
|
|
"grad_norm": 27.784454345703125,
|
|
"learning_rate": 6.297902589233612e-07,
|
|
"loss": 0.4558,
|
|
"num_input_tokens_seen": 32395968,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"epoch": 0.6590487164714167,
|
|
"grad_norm": 32.86067581176758,
|
|
"learning_rate": 6.287525129997404e-07,
|
|
"loss": 0.3737,
|
|
"num_input_tokens_seen": 32411456,
|
|
"step": 10295
|
|
},
|
|
{
|
|
"epoch": 0.6593687984123936,
|
|
"grad_norm": 24.458457946777344,
|
|
"learning_rate": 6.277152305829656e-07,
|
|
"loss": 0.3865,
|
|
"num_input_tokens_seen": 32426880,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"epoch": 0.6596888803533705,
|
|
"grad_norm": 30.00528907775879,
|
|
"learning_rate": 6.266784129680968e-07,
|
|
"loss": 0.3281,
|
|
"num_input_tokens_seen": 32442368,
|
|
"step": 10305
|
|
},
|
|
{
|
|
"epoch": 0.6600089622943474,
|
|
"grad_norm": 37.038761138916016,
|
|
"learning_rate": 6.256420614496129e-07,
|
|
"loss": 0.3781,
|
|
"num_input_tokens_seen": 32457920,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"epoch": 0.6603290442353242,
|
|
"grad_norm": 34.41950988769531,
|
|
"learning_rate": 6.246061773214102e-07,
|
|
"loss": 0.4085,
|
|
"num_input_tokens_seen": 32473536,
|
|
"step": 10315
|
|
},
|
|
{
|
|
"epoch": 0.6606491261763011,
|
|
"grad_norm": 33.36504364013672,
|
|
"learning_rate": 6.235707618768032e-07,
|
|
"loss": 0.3956,
|
|
"num_input_tokens_seen": 32490240,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"epoch": 0.660969208117278,
|
|
"grad_norm": 63.69960021972656,
|
|
"learning_rate": 6.225358164085196e-07,
|
|
"loss": 0.3506,
|
|
"num_input_tokens_seen": 32505728,
|
|
"step": 10325
|
|
},
|
|
{
|
|
"epoch": 0.6612892900582549,
|
|
"grad_norm": 47.17720031738281,
|
|
"learning_rate": 6.21501342208701e-07,
|
|
"loss": 0.3521,
|
|
"num_input_tokens_seen": 32520960,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"epoch": 0.6616093719992318,
|
|
"grad_norm": 22.675373077392578,
|
|
"learning_rate": 6.204673405689007e-07,
|
|
"loss": 0.4036,
|
|
"num_input_tokens_seen": 32535872,
|
|
"step": 10335
|
|
},
|
|
{
|
|
"epoch": 0.6619294539402087,
|
|
"grad_norm": 21.25689697265625,
|
|
"learning_rate": 6.194338127800823e-07,
|
|
"loss": 0.3158,
|
|
"num_input_tokens_seen": 32552448,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"epoch": 0.6622495358811856,
|
|
"grad_norm": 35.447052001953125,
|
|
"learning_rate": 6.184007601326165e-07,
|
|
"loss": 0.3866,
|
|
"num_input_tokens_seen": 32567232,
|
|
"step": 10345
|
|
},
|
|
{
|
|
"epoch": 0.6625696178221625,
|
|
"grad_norm": 30.18397331237793,
|
|
"learning_rate": 6.173681839162824e-07,
|
|
"loss": 0.3515,
|
|
"num_input_tokens_seen": 32583360,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"epoch": 0.6628896997631394,
|
|
"grad_norm": 30.015911102294922,
|
|
"learning_rate": 6.163360854202635e-07,
|
|
"loss": 0.3336,
|
|
"num_input_tokens_seen": 32598656,
|
|
"step": 10355
|
|
},
|
|
{
|
|
"epoch": 0.6632097817041163,
|
|
"grad_norm": 19.04948616027832,
|
|
"learning_rate": 6.153044659331461e-07,
|
|
"loss": 0.306,
|
|
"num_input_tokens_seen": 32614144,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"epoch": 0.6635298636450931,
|
|
"grad_norm": 30.63086700439453,
|
|
"learning_rate": 6.142733267429203e-07,
|
|
"loss": 0.3687,
|
|
"num_input_tokens_seen": 32629120,
|
|
"step": 10365
|
|
},
|
|
{
|
|
"epoch": 0.66384994558607,
|
|
"grad_norm": 25.801145553588867,
|
|
"learning_rate": 6.132426691369748e-07,
|
|
"loss": 0.4287,
|
|
"num_input_tokens_seen": 32645952,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"epoch": 0.6641700275270469,
|
|
"grad_norm": 12.77051067352295,
|
|
"learning_rate": 6.122124944020977e-07,
|
|
"loss": 0.3988,
|
|
"num_input_tokens_seen": 32661696,
|
|
"step": 10375
|
|
},
|
|
{
|
|
"epoch": 0.6644901094680238,
|
|
"grad_norm": 23.705963134765625,
|
|
"learning_rate": 6.111828038244749e-07,
|
|
"loss": 0.3753,
|
|
"num_input_tokens_seen": 32677760,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"epoch": 0.6648101914090007,
|
|
"grad_norm": 14.284012794494629,
|
|
"learning_rate": 6.101535986896866e-07,
|
|
"loss": 0.2948,
|
|
"num_input_tokens_seen": 32693568,
|
|
"step": 10385
|
|
},
|
|
{
|
|
"epoch": 0.6651302733499775,
|
|
"grad_norm": 15.676067352294922,
|
|
"learning_rate": 6.091248802827076e-07,
|
|
"loss": 0.2899,
|
|
"num_input_tokens_seen": 32708736,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"epoch": 0.6654503552909545,
|
|
"grad_norm": 20.25788688659668,
|
|
"learning_rate": 6.080966498879048e-07,
|
|
"loss": 0.3218,
|
|
"num_input_tokens_seen": 32725440,
|
|
"step": 10395
|
|
},
|
|
{
|
|
"epoch": 0.6657704372319314,
|
|
"grad_norm": 40.33934020996094,
|
|
"learning_rate": 6.070689087890363e-07,
|
|
"loss": 0.2962,
|
|
"num_input_tokens_seen": 32740608,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"epoch": 0.6660905191729083,
|
|
"grad_norm": 20.199983596801758,
|
|
"learning_rate": 6.060416582692487e-07,
|
|
"loss": 0.3974,
|
|
"num_input_tokens_seen": 32756416,
|
|
"step": 10405
|
|
},
|
|
{
|
|
"epoch": 0.6664106011138852,
|
|
"grad_norm": 28.62371253967285,
|
|
"learning_rate": 6.05014899611076e-07,
|
|
"loss": 0.3358,
|
|
"num_input_tokens_seen": 32771904,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"epoch": 0.666730683054862,
|
|
"grad_norm": 53.33070755004883,
|
|
"learning_rate": 6.039886340964391e-07,
|
|
"loss": 0.3724,
|
|
"num_input_tokens_seen": 32787392,
|
|
"step": 10415
|
|
},
|
|
{
|
|
"epoch": 0.6670507649958389,
|
|
"grad_norm": 19.005868911743164,
|
|
"learning_rate": 6.029628630066423e-07,
|
|
"loss": 0.334,
|
|
"num_input_tokens_seen": 32803136,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"epoch": 0.6673708469368158,
|
|
"grad_norm": 30.18621063232422,
|
|
"learning_rate": 6.019375876223724e-07,
|
|
"loss": 0.4173,
|
|
"num_input_tokens_seen": 32818624,
|
|
"step": 10425
|
|
},
|
|
{
|
|
"epoch": 0.6676909288777927,
|
|
"grad_norm": 26.04371452331543,
|
|
"learning_rate": 6.009128092236982e-07,
|
|
"loss": 0.4672,
|
|
"num_input_tokens_seen": 32833920,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"epoch": 0.6680110108187696,
|
|
"grad_norm": 19.67214584350586,
|
|
"learning_rate": 5.998885290900679e-07,
|
|
"loss": 0.3859,
|
|
"num_input_tokens_seen": 32848512,
|
|
"step": 10435
|
|
},
|
|
{
|
|
"epoch": 0.6683310927597464,
|
|
"grad_norm": 26.48846435546875,
|
|
"learning_rate": 5.988647485003061e-07,
|
|
"loss": 0.3391,
|
|
"num_input_tokens_seen": 32865088,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"epoch": 0.6686511747007234,
|
|
"grad_norm": 61.240257263183594,
|
|
"learning_rate": 5.978414687326164e-07,
|
|
"loss": 0.4559,
|
|
"num_input_tokens_seen": 32882048,
|
|
"step": 10445
|
|
},
|
|
{
|
|
"epoch": 0.6689712566417003,
|
|
"grad_norm": 28.02547836303711,
|
|
"learning_rate": 5.968186910645745e-07,
|
|
"loss": 0.365,
|
|
"num_input_tokens_seen": 32898624,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"epoch": 0.6692913385826772,
|
|
"grad_norm": 33.85887145996094,
|
|
"learning_rate": 5.957964167731305e-07,
|
|
"loss": 0.505,
|
|
"num_input_tokens_seen": 32914176,
|
|
"step": 10455
|
|
},
|
|
{
|
|
"epoch": 0.6696114205236541,
|
|
"grad_norm": 40.76100158691406,
|
|
"learning_rate": 5.947746471346065e-07,
|
|
"loss": 0.4068,
|
|
"num_input_tokens_seen": 32931136,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"epoch": 0.669931502464631,
|
|
"grad_norm": 47.47494888305664,
|
|
"learning_rate": 5.937533834246932e-07,
|
|
"loss": 0.3349,
|
|
"num_input_tokens_seen": 32947648,
|
|
"step": 10465
|
|
},
|
|
{
|
|
"epoch": 0.6702515844056078,
|
|
"grad_norm": 24.66529083251953,
|
|
"learning_rate": 5.927326269184504e-07,
|
|
"loss": 0.3745,
|
|
"num_input_tokens_seen": 32964224,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"epoch": 0.6705716663465847,
|
|
"grad_norm": 45.57734680175781,
|
|
"learning_rate": 5.917123788903049e-07,
|
|
"loss": 0.4498,
|
|
"num_input_tokens_seen": 32982080,
|
|
"step": 10475
|
|
},
|
|
{
|
|
"epoch": 0.6708917482875616,
|
|
"grad_norm": 37.37847137451172,
|
|
"learning_rate": 5.906926406140484e-07,
|
|
"loss": 0.4642,
|
|
"num_input_tokens_seen": 32997440,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"epoch": 0.6712118302285385,
|
|
"grad_norm": 37.47283935546875,
|
|
"learning_rate": 5.896734133628354e-07,
|
|
"loss": 0.4298,
|
|
"num_input_tokens_seen": 33013056,
|
|
"step": 10485
|
|
},
|
|
{
|
|
"epoch": 0.6715319121695154,
|
|
"grad_norm": 24.019437789916992,
|
|
"learning_rate": 5.886546984091838e-07,
|
|
"loss": 0.3736,
|
|
"num_input_tokens_seen": 33028416,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"epoch": 0.6718519941104922,
|
|
"grad_norm": 29.65847396850586,
|
|
"learning_rate": 5.876364970249711e-07,
|
|
"loss": 0.3415,
|
|
"num_input_tokens_seen": 33042880,
|
|
"step": 10495
|
|
},
|
|
{
|
|
"epoch": 0.6721720760514692,
|
|
"grad_norm": 33.120933532714844,
|
|
"learning_rate": 5.866188104814336e-07,
|
|
"loss": 0.2735,
|
|
"num_input_tokens_seen": 33058240,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 0.6724921579924461,
|
|
"grad_norm": 16.05461883544922,
|
|
"learning_rate": 5.856016400491646e-07,
|
|
"loss": 0.3792,
|
|
"num_input_tokens_seen": 33073920,
|
|
"step": 10505
|
|
},
|
|
{
|
|
"epoch": 0.672812239933423,
|
|
"grad_norm": 8.638588905334473,
|
|
"learning_rate": 5.845849869981136e-07,
|
|
"loss": 0.3192,
|
|
"num_input_tokens_seen": 33089344,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"epoch": 0.6731323218743999,
|
|
"grad_norm": 19.994060516357422,
|
|
"learning_rate": 5.835688525975842e-07,
|
|
"loss": 0.3458,
|
|
"num_input_tokens_seen": 33104384,
|
|
"step": 10515
|
|
},
|
|
{
|
|
"epoch": 0.6734524038153767,
|
|
"grad_norm": 22.6699275970459,
|
|
"learning_rate": 5.825532381162311e-07,
|
|
"loss": 0.3931,
|
|
"num_input_tokens_seen": 33120064,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"epoch": 0.6737724857563536,
|
|
"grad_norm": 22.203550338745117,
|
|
"learning_rate": 5.815381448220619e-07,
|
|
"loss": 0.3866,
|
|
"num_input_tokens_seen": 33136128,
|
|
"step": 10525
|
|
},
|
|
{
|
|
"epoch": 0.6740925676973305,
|
|
"grad_norm": 22.488792419433594,
|
|
"learning_rate": 5.805235739824327e-07,
|
|
"loss": 0.3452,
|
|
"num_input_tokens_seen": 33154816,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"epoch": 0.6744126496383074,
|
|
"grad_norm": 37.78539276123047,
|
|
"learning_rate": 5.795095268640458e-07,
|
|
"loss": 0.5023,
|
|
"num_input_tokens_seen": 33169920,
|
|
"step": 10535
|
|
},
|
|
{
|
|
"epoch": 0.6747327315792843,
|
|
"grad_norm": 35.91427230834961,
|
|
"learning_rate": 5.784960047329519e-07,
|
|
"loss": 0.541,
|
|
"num_input_tokens_seen": 33187712,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"epoch": 0.6750528135202611,
|
|
"grad_norm": 14.399126052856445,
|
|
"learning_rate": 5.774830088545452e-07,
|
|
"loss": 0.3866,
|
|
"num_input_tokens_seen": 33202880,
|
|
"step": 10545
|
|
},
|
|
{
|
|
"epoch": 0.6753728954612381,
|
|
"grad_norm": 15.00992202758789,
|
|
"learning_rate": 5.76470540493563e-07,
|
|
"loss": 0.2997,
|
|
"num_input_tokens_seen": 33218944,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"epoch": 0.675692977402215,
|
|
"grad_norm": 27.697614669799805,
|
|
"learning_rate": 5.754586009140836e-07,
|
|
"loss": 0.4652,
|
|
"num_input_tokens_seen": 33234688,
|
|
"step": 10555
|
|
},
|
|
{
|
|
"epoch": 0.6760130593431919,
|
|
"grad_norm": 48.48150634765625,
|
|
"learning_rate": 5.744471913795256e-07,
|
|
"loss": 0.3679,
|
|
"num_input_tokens_seen": 33249920,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"epoch": 0.6763331412841688,
|
|
"grad_norm": 34.35981369018555,
|
|
"learning_rate": 5.734363131526459e-07,
|
|
"loss": 0.3365,
|
|
"num_input_tokens_seen": 33265792,
|
|
"step": 10565
|
|
},
|
|
{
|
|
"epoch": 0.6766532232251457,
|
|
"grad_norm": 37.82774353027344,
|
|
"learning_rate": 5.724259674955377e-07,
|
|
"loss": 0.3742,
|
|
"num_input_tokens_seen": 33280832,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"epoch": 0.6769733051661225,
|
|
"grad_norm": 28.590476989746094,
|
|
"learning_rate": 5.714161556696291e-07,
|
|
"loss": 0.3888,
|
|
"num_input_tokens_seen": 33296576,
|
|
"step": 10575
|
|
},
|
|
{
|
|
"epoch": 0.6772933871070994,
|
|
"grad_norm": 42.37991714477539,
|
|
"learning_rate": 5.704068789356824e-07,
|
|
"loss": 0.3388,
|
|
"num_input_tokens_seen": 33316672,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"epoch": 0.6776134690480763,
|
|
"grad_norm": 28.075489044189453,
|
|
"learning_rate": 5.693981385537912e-07,
|
|
"loss": 0.3496,
|
|
"num_input_tokens_seen": 33331456,
|
|
"step": 10585
|
|
},
|
|
{
|
|
"epoch": 0.6779335509890532,
|
|
"grad_norm": 24.502607345581055,
|
|
"learning_rate": 5.683899357833801e-07,
|
|
"loss": 0.3447,
|
|
"num_input_tokens_seen": 33346752,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"epoch": 0.67825363293003,
|
|
"grad_norm": 34.75849151611328,
|
|
"learning_rate": 5.673822718832015e-07,
|
|
"loss": 0.455,
|
|
"num_input_tokens_seen": 33362688,
|
|
"step": 10595
|
|
},
|
|
{
|
|
"epoch": 0.6785737148710069,
|
|
"grad_norm": 40.48807144165039,
|
|
"learning_rate": 5.663751481113362e-07,
|
|
"loss": 0.3697,
|
|
"num_input_tokens_seen": 33377600,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"epoch": 0.6788937968119839,
|
|
"grad_norm": 24.50969696044922,
|
|
"learning_rate": 5.653685657251896e-07,
|
|
"loss": 0.4282,
|
|
"num_input_tokens_seen": 33393280,
|
|
"step": 10605
|
|
},
|
|
{
|
|
"epoch": 0.6792138787529608,
|
|
"grad_norm": 41.67803192138672,
|
|
"learning_rate": 5.643625259814922e-07,
|
|
"loss": 0.3746,
|
|
"num_input_tokens_seen": 33410112,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"epoch": 0.6795339606939377,
|
|
"grad_norm": 17.74659538269043,
|
|
"learning_rate": 5.633570301362953e-07,
|
|
"loss": 0.3664,
|
|
"num_input_tokens_seen": 33426624,
|
|
"step": 10615
|
|
},
|
|
{
|
|
"epoch": 0.6798540426349146,
|
|
"grad_norm": 37.33218002319336,
|
|
"learning_rate": 5.623520794449739e-07,
|
|
"loss": 0.36,
|
|
"num_input_tokens_seen": 33442240,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"epoch": 0.6801741245758914,
|
|
"grad_norm": 36.54777908325195,
|
|
"learning_rate": 5.613476751622195e-07,
|
|
"loss": 0.4713,
|
|
"num_input_tokens_seen": 33458432,
|
|
"step": 10625
|
|
},
|
|
{
|
|
"epoch": 0.6804942065168683,
|
|
"grad_norm": 26.362565994262695,
|
|
"learning_rate": 5.603438185420426e-07,
|
|
"loss": 0.4368,
|
|
"num_input_tokens_seen": 33473856,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"epoch": 0.6808142884578452,
|
|
"grad_norm": 58.49364471435547,
|
|
"learning_rate": 5.593405108377714e-07,
|
|
"loss": 0.4714,
|
|
"num_input_tokens_seen": 33489216,
|
|
"step": 10635
|
|
},
|
|
{
|
|
"epoch": 0.6811343703988221,
|
|
"grad_norm": 23.39803695678711,
|
|
"learning_rate": 5.583377533020457e-07,
|
|
"loss": 0.4586,
|
|
"num_input_tokens_seen": 33505280,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"epoch": 0.681454452339799,
|
|
"grad_norm": 40.31536865234375,
|
|
"learning_rate": 5.573355471868201e-07,
|
|
"loss": 0.2834,
|
|
"num_input_tokens_seen": 33520512,
|
|
"step": 10645
|
|
},
|
|
{
|
|
"epoch": 0.6817745342807758,
|
|
"grad_norm": 24.481168746948242,
|
|
"learning_rate": 5.563338937433621e-07,
|
|
"loss": 0.3532,
|
|
"num_input_tokens_seen": 33537344,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"epoch": 0.6820946162217527,
|
|
"grad_norm": 15.533621788024902,
|
|
"learning_rate": 5.553327942222472e-07,
|
|
"loss": 0.2438,
|
|
"num_input_tokens_seen": 33552128,
|
|
"step": 10655
|
|
},
|
|
{
|
|
"epoch": 0.6824146981627297,
|
|
"grad_norm": 26.63052749633789,
|
|
"learning_rate": 5.54332249873359e-07,
|
|
"loss": 0.3547,
|
|
"num_input_tokens_seen": 33566784,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"epoch": 0.6827347801037066,
|
|
"grad_norm": 21.95829963684082,
|
|
"learning_rate": 5.533322619458896e-07,
|
|
"loss": 0.3052,
|
|
"num_input_tokens_seen": 33582080,
|
|
"step": 10665
|
|
},
|
|
{
|
|
"epoch": 0.6830548620446835,
|
|
"grad_norm": 45.98701477050781,
|
|
"learning_rate": 5.52332831688336e-07,
|
|
"loss": 0.4079,
|
|
"num_input_tokens_seen": 33596864,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"epoch": 0.6833749439856603,
|
|
"grad_norm": 79.64530944824219,
|
|
"learning_rate": 5.513339603484981e-07,
|
|
"loss": 0.3454,
|
|
"num_input_tokens_seen": 33613056,
|
|
"step": 10675
|
|
},
|
|
{
|
|
"epoch": 0.6836950259266372,
|
|
"grad_norm": 69.9050064086914,
|
|
"learning_rate": 5.503356491734785e-07,
|
|
"loss": 0.5049,
|
|
"num_input_tokens_seen": 33628160,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"epoch": 0.6840151078676141,
|
|
"grad_norm": 18.264413833618164,
|
|
"learning_rate": 5.493378994096806e-07,
|
|
"loss": 0.4346,
|
|
"num_input_tokens_seen": 33645184,
|
|
"step": 10685
|
|
},
|
|
{
|
|
"epoch": 0.684335189808591,
|
|
"grad_norm": 18.561819076538086,
|
|
"learning_rate": 5.483407123028067e-07,
|
|
"loss": 0.3909,
|
|
"num_input_tokens_seen": 33660800,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"epoch": 0.6846552717495679,
|
|
"grad_norm": 38.80720138549805,
|
|
"learning_rate": 5.473440890978566e-07,
|
|
"loss": 0.4766,
|
|
"num_input_tokens_seen": 33676736,
|
|
"step": 10695
|
|
},
|
|
{
|
|
"epoch": 0.6849753536905447,
|
|
"grad_norm": 25.19498634338379,
|
|
"learning_rate": 5.463480310391261e-07,
|
|
"loss": 0.4079,
|
|
"num_input_tokens_seen": 33692928,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 0.6852954356315216,
|
|
"grad_norm": 23.25238800048828,
|
|
"learning_rate": 5.453525393702052e-07,
|
|
"loss": 0.3839,
|
|
"num_input_tokens_seen": 33708352,
|
|
"step": 10705
|
|
},
|
|
{
|
|
"epoch": 0.6856155175724986,
|
|
"grad_norm": 32.19915771484375,
|
|
"learning_rate": 5.443576153339771e-07,
|
|
"loss": 0.3644,
|
|
"num_input_tokens_seen": 33723968,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"epoch": 0.6859355995134755,
|
|
"grad_norm": 46.10927963256836,
|
|
"learning_rate": 5.433632601726159e-07,
|
|
"loss": 0.3272,
|
|
"num_input_tokens_seen": 33739200,
|
|
"step": 10715
|
|
},
|
|
{
|
|
"epoch": 0.6862556814544524,
|
|
"grad_norm": 33.03512191772461,
|
|
"learning_rate": 5.42369475127586e-07,
|
|
"loss": 0.3404,
|
|
"num_input_tokens_seen": 33754944,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"epoch": 0.6865757633954293,
|
|
"grad_norm": 60.13679504394531,
|
|
"learning_rate": 5.413762614396396e-07,
|
|
"loss": 0.4709,
|
|
"num_input_tokens_seen": 33769472,
|
|
"step": 10725
|
|
},
|
|
{
|
|
"epoch": 0.6868958453364061,
|
|
"grad_norm": 33.97296142578125,
|
|
"learning_rate": 5.403836203488157e-07,
|
|
"loss": 0.4262,
|
|
"num_input_tokens_seen": 33784896,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"epoch": 0.687215927277383,
|
|
"grad_norm": 18.200382232666016,
|
|
"learning_rate": 5.393915530944382e-07,
|
|
"loss": 0.3638,
|
|
"num_input_tokens_seen": 33800320,
|
|
"step": 10735
|
|
},
|
|
{
|
|
"epoch": 0.6875360092183599,
|
|
"grad_norm": 24.23163414001465,
|
|
"learning_rate": 5.384000609151145e-07,
|
|
"loss": 0.3765,
|
|
"num_input_tokens_seen": 33816896,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"epoch": 0.6878560911593368,
|
|
"grad_norm": 21.162240982055664,
|
|
"learning_rate": 5.374091450487353e-07,
|
|
"loss": 0.3763,
|
|
"num_input_tokens_seen": 33833344,
|
|
"step": 10745
|
|
},
|
|
{
|
|
"epoch": 0.6881761731003136,
|
|
"grad_norm": 29.74762535095215,
|
|
"learning_rate": 5.364188067324693e-07,
|
|
"loss": 0.3352,
|
|
"num_input_tokens_seen": 33849856,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"epoch": 0.6884962550412905,
|
|
"grad_norm": 13.640717506408691,
|
|
"learning_rate": 5.354290472027659e-07,
|
|
"loss": 0.3441,
|
|
"num_input_tokens_seen": 33865344,
|
|
"step": 10755
|
|
},
|
|
{
|
|
"epoch": 0.6888163369822674,
|
|
"grad_norm": 71.6620864868164,
|
|
"learning_rate": 5.344398676953525e-07,
|
|
"loss": 0.4955,
|
|
"num_input_tokens_seen": 33881792,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"epoch": 0.6891364189232444,
|
|
"grad_norm": 31.854103088378906,
|
|
"learning_rate": 5.334512694452303e-07,
|
|
"loss": 0.4902,
|
|
"num_input_tokens_seen": 33898368,
|
|
"step": 10765
|
|
},
|
|
{
|
|
"epoch": 0.6894565008642213,
|
|
"grad_norm": 22.185178756713867,
|
|
"learning_rate": 5.324632536866755e-07,
|
|
"loss": 0.3489,
|
|
"num_input_tokens_seen": 33914368,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"epoch": 0.6897765828051982,
|
|
"grad_norm": 40.81916046142578,
|
|
"learning_rate": 5.314758216532386e-07,
|
|
"loss": 0.3526,
|
|
"num_input_tokens_seen": 33929728,
|
|
"step": 10775
|
|
},
|
|
{
|
|
"epoch": 0.690096664746175,
|
|
"grad_norm": 20.197229385375977,
|
|
"learning_rate": 5.304889745777396e-07,
|
|
"loss": 0.3743,
|
|
"num_input_tokens_seen": 33944704,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"epoch": 0.6904167466871519,
|
|
"grad_norm": 31.70199203491211,
|
|
"learning_rate": 5.295027136922678e-07,
|
|
"loss": 0.6418,
|
|
"num_input_tokens_seen": 33960128,
|
|
"step": 10785
|
|
},
|
|
{
|
|
"epoch": 0.6907368286281288,
|
|
"grad_norm": 22.89275360107422,
|
|
"learning_rate": 5.285170402281827e-07,
|
|
"loss": 0.4207,
|
|
"num_input_tokens_seen": 33975104,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"epoch": 0.6910569105691057,
|
|
"grad_norm": 33.831241607666016,
|
|
"learning_rate": 5.275319554161087e-07,
|
|
"loss": 0.4588,
|
|
"num_input_tokens_seen": 33990720,
|
|
"step": 10795
|
|
},
|
|
{
|
|
"epoch": 0.6913769925100826,
|
|
"grad_norm": 31.06147575378418,
|
|
"learning_rate": 5.265474604859356e-07,
|
|
"loss": 0.4123,
|
|
"num_input_tokens_seen": 34006272,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 0.6916970744510594,
|
|
"grad_norm": 26.169334411621094,
|
|
"learning_rate": 5.255635566668171e-07,
|
|
"loss": 0.3902,
|
|
"num_input_tokens_seen": 34022400,
|
|
"step": 10805
|
|
},
|
|
{
|
|
"epoch": 0.6920171563920363,
|
|
"grad_norm": 22.71941566467285,
|
|
"learning_rate": 5.245802451871686e-07,
|
|
"loss": 0.3704,
|
|
"num_input_tokens_seen": 34038720,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"epoch": 0.6923372383330133,
|
|
"grad_norm": 23.15312957763672,
|
|
"learning_rate": 5.235975272746663e-07,
|
|
"loss": 0.4316,
|
|
"num_input_tokens_seen": 34053760,
|
|
"step": 10815
|
|
},
|
|
{
|
|
"epoch": 0.6926573202739902,
|
|
"grad_norm": 22.503173828125,
|
|
"learning_rate": 5.226154041562442e-07,
|
|
"loss": 0.3024,
|
|
"num_input_tokens_seen": 34069568,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"epoch": 0.6929774022149671,
|
|
"grad_norm": 23.336326599121094,
|
|
"learning_rate": 5.216338770580953e-07,
|
|
"loss": 0.406,
|
|
"num_input_tokens_seen": 34086912,
|
|
"step": 10825
|
|
},
|
|
{
|
|
"epoch": 0.6932974841559439,
|
|
"grad_norm": 22.208585739135742,
|
|
"learning_rate": 5.206529472056678e-07,
|
|
"loss": 0.3649,
|
|
"num_input_tokens_seen": 34101696,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"epoch": 0.6936175660969208,
|
|
"grad_norm": 15.775872230529785,
|
|
"learning_rate": 5.196726158236637e-07,
|
|
"loss": 0.3168,
|
|
"num_input_tokens_seen": 34115904,
|
|
"step": 10835
|
|
},
|
|
{
|
|
"epoch": 0.6939376480378977,
|
|
"grad_norm": 23.13541603088379,
|
|
"learning_rate": 5.186928841360384e-07,
|
|
"loss": 0.3372,
|
|
"num_input_tokens_seen": 34131328,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"epoch": 0.6942577299788746,
|
|
"grad_norm": 29.86430549621582,
|
|
"learning_rate": 5.177137533659985e-07,
|
|
"loss": 0.4395,
|
|
"num_input_tokens_seen": 34148544,
|
|
"step": 10845
|
|
},
|
|
{
|
|
"epoch": 0.6945778119198515,
|
|
"grad_norm": 20.5509033203125,
|
|
"learning_rate": 5.167352247360002e-07,
|
|
"loss": 0.4564,
|
|
"num_input_tokens_seen": 34163520,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"epoch": 0.6948978938608283,
|
|
"grad_norm": 27.466720581054688,
|
|
"learning_rate": 5.157572994677479e-07,
|
|
"loss": 0.3993,
|
|
"num_input_tokens_seen": 34178368,
|
|
"step": 10855
|
|
},
|
|
{
|
|
"epoch": 0.6952179758018052,
|
|
"grad_norm": 32.89216232299805,
|
|
"learning_rate": 5.147799787821929e-07,
|
|
"loss": 0.4055,
|
|
"num_input_tokens_seen": 34193920,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"epoch": 0.6955380577427821,
|
|
"grad_norm": 37.79446792602539,
|
|
"learning_rate": 5.138032638995315e-07,
|
|
"loss": 0.485,
|
|
"num_input_tokens_seen": 34210176,
|
|
"step": 10865
|
|
},
|
|
{
|
|
"epoch": 0.6958581396837591,
|
|
"grad_norm": 53.44511032104492,
|
|
"learning_rate": 5.128271560392037e-07,
|
|
"loss": 0.3575,
|
|
"num_input_tokens_seen": 34227328,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"epoch": 0.696178221624736,
|
|
"grad_norm": 32.73928451538086,
|
|
"learning_rate": 5.118516564198916e-07,
|
|
"loss": 0.3901,
|
|
"num_input_tokens_seen": 34241984,
|
|
"step": 10875
|
|
},
|
|
{
|
|
"epoch": 0.6964983035657129,
|
|
"grad_norm": 23.722578048706055,
|
|
"learning_rate": 5.108767662595175e-07,
|
|
"loss": 0.3371,
|
|
"num_input_tokens_seen": 34256896,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"epoch": 0.6968183855066897,
|
|
"grad_norm": 20.10529899597168,
|
|
"learning_rate": 5.099024867752446e-07,
|
|
"loss": 0.3824,
|
|
"num_input_tokens_seen": 34273792,
|
|
"step": 10885
|
|
},
|
|
{
|
|
"epoch": 0.6971384674476666,
|
|
"grad_norm": 33.20995330810547,
|
|
"learning_rate": 5.089288191834709e-07,
|
|
"loss": 0.3219,
|
|
"num_input_tokens_seen": 34290752,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"epoch": 0.6974585493886435,
|
|
"grad_norm": 32.262474060058594,
|
|
"learning_rate": 5.079557646998318e-07,
|
|
"loss": 0.3367,
|
|
"num_input_tokens_seen": 34308416,
|
|
"step": 10895
|
|
},
|
|
{
|
|
"epoch": 0.6977786313296204,
|
|
"grad_norm": 13.212915420532227,
|
|
"learning_rate": 5.069833245391981e-07,
|
|
"loss": 0.403,
|
|
"num_input_tokens_seen": 34323776,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 0.6980987132705972,
|
|
"grad_norm": 24.54563331604004,
|
|
"learning_rate": 5.060114999156728e-07,
|
|
"loss": 0.322,
|
|
"num_input_tokens_seen": 34338944,
|
|
"step": 10905
|
|
},
|
|
{
|
|
"epoch": 0.6984187952115741,
|
|
"grad_norm": 37.85472869873047,
|
|
"learning_rate": 5.050402920425895e-07,
|
|
"loss": 0.3462,
|
|
"num_input_tokens_seen": 34354432,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"epoch": 0.698738877152551,
|
|
"grad_norm": 17.395889282226562,
|
|
"learning_rate": 5.040697021325128e-07,
|
|
"loss": 0.2526,
|
|
"num_input_tokens_seen": 34370432,
|
|
"step": 10915
|
|
},
|
|
{
|
|
"epoch": 0.699058959093528,
|
|
"grad_norm": 32.64187240600586,
|
|
"learning_rate": 5.030997313972361e-07,
|
|
"loss": 0.437,
|
|
"num_input_tokens_seen": 34386496,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"epoch": 0.6993790410345049,
|
|
"grad_norm": 19.760494232177734,
|
|
"learning_rate": 5.021303810477795e-07,
|
|
"loss": 0.368,
|
|
"num_input_tokens_seen": 34402560,
|
|
"step": 10925
|
|
},
|
|
{
|
|
"epoch": 0.6996991229754818,
|
|
"grad_norm": 16.46942710876465,
|
|
"learning_rate": 5.011616522943869e-07,
|
|
"loss": 0.2859,
|
|
"num_input_tokens_seen": 34418496,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"epoch": 0.7000192049164586,
|
|
"grad_norm": 50.63234329223633,
|
|
"learning_rate": 5.001935463465289e-07,
|
|
"loss": 0.2731,
|
|
"num_input_tokens_seen": 34434752,
|
|
"step": 10935
|
|
},
|
|
{
|
|
"epoch": 0.7003392868574355,
|
|
"grad_norm": 22.748510360717773,
|
|
"learning_rate": 4.99226064412897e-07,
|
|
"loss": 0.3965,
|
|
"num_input_tokens_seen": 34450176,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"epoch": 0.7006593687984124,
|
|
"grad_norm": 18.267223358154297,
|
|
"learning_rate": 4.982592077014026e-07,
|
|
"loss": 0.4233,
|
|
"num_input_tokens_seen": 34465600,
|
|
"step": 10945
|
|
},
|
|
{
|
|
"epoch": 0.7008514179629985,
|
|
"eval_loss": 0.37222641706466675,
|
|
"eval_runtime": 49.2115,
|
|
"eval_samples_per_second": 282.17,
|
|
"eval_steps_per_second": 35.276,
|
|
"num_input_tokens_seen": 34475136,
|
|
"step": 10948
|
|
},
|
|
{
|
|
"epoch": 0.7009794507393893,
|
|
"grad_norm": 38.11653518676758,
|
|
"learning_rate": 4.97292977419179e-07,
|
|
"loss": 0.3026,
|
|
"num_input_tokens_seen": 34481600,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"epoch": 0.7012995326803662,
|
|
"grad_norm": 19.48086166381836,
|
|
"learning_rate": 4.963273747725755e-07,
|
|
"loss": 0.2954,
|
|
"num_input_tokens_seen": 34498752,
|
|
"step": 10955
|
|
},
|
|
{
|
|
"epoch": 0.701619614621343,
|
|
"grad_norm": 26.763914108276367,
|
|
"learning_rate": 4.953624009671582e-07,
|
|
"loss": 0.4061,
|
|
"num_input_tokens_seen": 34514240,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"epoch": 0.7019396965623199,
|
|
"grad_norm": 44.18442153930664,
|
|
"learning_rate": 4.943980572077086e-07,
|
|
"loss": 0.4161,
|
|
"num_input_tokens_seen": 34528704,
|
|
"step": 10965
|
|
},
|
|
{
|
|
"epoch": 0.7022597785032968,
|
|
"grad_norm": 38.56117630004883,
|
|
"learning_rate": 4.934343446982209e-07,
|
|
"loss": 0.3243,
|
|
"num_input_tokens_seen": 34544704,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"epoch": 0.7025798604442738,
|
|
"grad_norm": 13.776517868041992,
|
|
"learning_rate": 4.924712646419016e-07,
|
|
"loss": 0.3698,
|
|
"num_input_tokens_seen": 34560000,
|
|
"step": 10975
|
|
},
|
|
{
|
|
"epoch": 0.7028999423852507,
|
|
"grad_norm": 70.76254272460938,
|
|
"learning_rate": 4.915088182411674e-07,
|
|
"loss": 0.3211,
|
|
"num_input_tokens_seen": 34575296,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"epoch": 0.7032200243262275,
|
|
"grad_norm": 33.83591842651367,
|
|
"learning_rate": 4.905470066976439e-07,
|
|
"loss": 0.3715,
|
|
"num_input_tokens_seen": 34590528,
|
|
"step": 10985
|
|
},
|
|
{
|
|
"epoch": 0.7035401062672044,
|
|
"grad_norm": 37.384647369384766,
|
|
"learning_rate": 4.895858312121644e-07,
|
|
"loss": 0.4187,
|
|
"num_input_tokens_seen": 34605312,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"epoch": 0.7038601882081813,
|
|
"grad_norm": 24.66256332397461,
|
|
"learning_rate": 4.886252929847674e-07,
|
|
"loss": 0.4337,
|
|
"num_input_tokens_seen": 34620736,
|
|
"step": 10995
|
|
},
|
|
{
|
|
"epoch": 0.7041802701491582,
|
|
"grad_norm": 42.17767333984375,
|
|
"learning_rate": 4.876653932146963e-07,
|
|
"loss": 0.4578,
|
|
"num_input_tokens_seen": 34636736,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 0.7045003520901351,
|
|
"grad_norm": 31.28046417236328,
|
|
"learning_rate": 4.86706133100397e-07,
|
|
"loss": 0.3782,
|
|
"num_input_tokens_seen": 34651776,
|
|
"step": 11005
|
|
},
|
|
{
|
|
"epoch": 0.7048204340311119,
|
|
"grad_norm": 52.68522644042969,
|
|
"learning_rate": 4.857475138395178e-07,
|
|
"loss": 0.2923,
|
|
"num_input_tokens_seen": 34666176,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"epoch": 0.7051405159720888,
|
|
"grad_norm": 15.296350479125977,
|
|
"learning_rate": 4.847895366289054e-07,
|
|
"loss": 0.2529,
|
|
"num_input_tokens_seen": 34682112,
|
|
"step": 11015
|
|
},
|
|
{
|
|
"epoch": 0.7054605979130657,
|
|
"grad_norm": 32.735904693603516,
|
|
"learning_rate": 4.838322026646057e-07,
|
|
"loss": 0.3828,
|
|
"num_input_tokens_seen": 34697024,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"epoch": 0.7057806798540426,
|
|
"grad_norm": 20.01278305053711,
|
|
"learning_rate": 4.82875513141861e-07,
|
|
"loss": 0.3577,
|
|
"num_input_tokens_seen": 34712704,
|
|
"step": 11025
|
|
},
|
|
{
|
|
"epoch": 0.7061007617950196,
|
|
"grad_norm": 29.205598831176758,
|
|
"learning_rate": 4.819194692551106e-07,
|
|
"loss": 0.3791,
|
|
"num_input_tokens_seen": 34728256,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"epoch": 0.7064208437359965,
|
|
"grad_norm": 16.80168914794922,
|
|
"learning_rate": 4.809640721979855e-07,
|
|
"loss": 0.4268,
|
|
"num_input_tokens_seen": 34744512,
|
|
"step": 11035
|
|
},
|
|
{
|
|
"epoch": 0.7067409256769733,
|
|
"grad_norm": 47.780738830566406,
|
|
"learning_rate": 4.8000932316331e-07,
|
|
"loss": 0.4158,
|
|
"num_input_tokens_seen": 34758912,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"epoch": 0.7070610076179502,
|
|
"grad_norm": 29.31734848022461,
|
|
"learning_rate": 4.790552233431002e-07,
|
|
"loss": 0.4037,
|
|
"num_input_tokens_seen": 34774848,
|
|
"step": 11045
|
|
},
|
|
{
|
|
"epoch": 0.7073810895589271,
|
|
"grad_norm": 34.01865005493164,
|
|
"learning_rate": 4.781017739285611e-07,
|
|
"loss": 0.4168,
|
|
"num_input_tokens_seen": 34790016,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"epoch": 0.707701171499904,
|
|
"grad_norm": 13.347481727600098,
|
|
"learning_rate": 4.771489761100842e-07,
|
|
"loss": 0.3453,
|
|
"num_input_tokens_seen": 34804992,
|
|
"step": 11055
|
|
},
|
|
{
|
|
"epoch": 0.7080212534408808,
|
|
"grad_norm": 40.918357849121094,
|
|
"learning_rate": 4.761968310772501e-07,
|
|
"loss": 0.2687,
|
|
"num_input_tokens_seen": 34820288,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"epoch": 0.7083413353818577,
|
|
"grad_norm": 40.15391540527344,
|
|
"learning_rate": 4.7524534001882267e-07,
|
|
"loss": 0.2718,
|
|
"num_input_tokens_seen": 34836096,
|
|
"step": 11065
|
|
},
|
|
{
|
|
"epoch": 0.7086614173228346,
|
|
"grad_norm": 28.034465789794922,
|
|
"learning_rate": 4.7429450412274897e-07,
|
|
"loss": 0.394,
|
|
"num_input_tokens_seen": 34851584,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"epoch": 0.7089814992638115,
|
|
"grad_norm": 23.965686798095703,
|
|
"learning_rate": 4.733443245761596e-07,
|
|
"loss": 0.3458,
|
|
"num_input_tokens_seen": 34868032,
|
|
"step": 11075
|
|
},
|
|
{
|
|
"epoch": 0.7093015812047885,
|
|
"grad_norm": 25.30048179626465,
|
|
"learning_rate": 4.723948025653646e-07,
|
|
"loss": 0.3821,
|
|
"num_input_tokens_seen": 34884032,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"epoch": 0.7096216631457654,
|
|
"grad_norm": 29.63812828063965,
|
|
"learning_rate": 4.714459392758534e-07,
|
|
"loss": 0.3254,
|
|
"num_input_tokens_seen": 34899456,
|
|
"step": 11085
|
|
},
|
|
{
|
|
"epoch": 0.7099417450867422,
|
|
"grad_norm": 51.972572326660156,
|
|
"learning_rate": 4.70497735892293e-07,
|
|
"loss": 0.3735,
|
|
"num_input_tokens_seen": 34915456,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"epoch": 0.7102618270277191,
|
|
"grad_norm": 16.07594871520996,
|
|
"learning_rate": 4.695501935985263e-07,
|
|
"loss": 0.3331,
|
|
"num_input_tokens_seen": 34931328,
|
|
"step": 11095
|
|
},
|
|
{
|
|
"epoch": 0.710581908968696,
|
|
"grad_norm": 34.51850128173828,
|
|
"learning_rate": 4.686033135775711e-07,
|
|
"loss": 0.3999,
|
|
"num_input_tokens_seen": 34946816,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 0.7109019909096729,
|
|
"grad_norm": 25.590112686157227,
|
|
"learning_rate": 4.6765709701161817e-07,
|
|
"loss": 0.3245,
|
|
"num_input_tokens_seen": 34964544,
|
|
"step": 11105
|
|
},
|
|
{
|
|
"epoch": 0.7112220728506498,
|
|
"grad_norm": 66.03004455566406,
|
|
"learning_rate": 4.6671154508203003e-07,
|
|
"loss": 0.3996,
|
|
"num_input_tokens_seen": 34982208,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"epoch": 0.7115421547916266,
|
|
"grad_norm": 42.14921188354492,
|
|
"learning_rate": 4.657666589693393e-07,
|
|
"loss": 0.3439,
|
|
"num_input_tokens_seen": 35000576,
|
|
"step": 11115
|
|
},
|
|
{
|
|
"epoch": 0.7118622367326035,
|
|
"grad_norm": 26.2552433013916,
|
|
"learning_rate": 4.6482243985324753e-07,
|
|
"loss": 0.3145,
|
|
"num_input_tokens_seen": 35014912,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"epoch": 0.7121823186735804,
|
|
"grad_norm": 28.899272918701172,
|
|
"learning_rate": 4.638788889126232e-07,
|
|
"loss": 0.2914,
|
|
"num_input_tokens_seen": 35029632,
|
|
"step": 11125
|
|
},
|
|
{
|
|
"epoch": 0.7125024006145573,
|
|
"grad_norm": 27.084138870239258,
|
|
"learning_rate": 4.6293600732550085e-07,
|
|
"loss": 0.3239,
|
|
"num_input_tokens_seen": 35044992,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"epoch": 0.7128224825555343,
|
|
"grad_norm": 16.42285919189453,
|
|
"learning_rate": 4.619937962690792e-07,
|
|
"loss": 0.4686,
|
|
"num_input_tokens_seen": 35060544,
|
|
"step": 11135
|
|
},
|
|
{
|
|
"epoch": 0.7131425644965111,
|
|
"grad_norm": 57.51594924926758,
|
|
"learning_rate": 4.610522569197197e-07,
|
|
"loss": 0.5105,
|
|
"num_input_tokens_seen": 35075648,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"epoch": 0.713462646437488,
|
|
"grad_norm": 20.691587448120117,
|
|
"learning_rate": 4.6011139045294554e-07,
|
|
"loss": 0.3294,
|
|
"num_input_tokens_seen": 35090880,
|
|
"step": 11145
|
|
},
|
|
{
|
|
"epoch": 0.7137827283784649,
|
|
"grad_norm": 99.84747314453125,
|
|
"learning_rate": 4.59171198043439e-07,
|
|
"loss": 0.3904,
|
|
"num_input_tokens_seen": 35106432,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"epoch": 0.7141028103194418,
|
|
"grad_norm": 28.633445739746094,
|
|
"learning_rate": 4.582316808650424e-07,
|
|
"loss": 0.4349,
|
|
"num_input_tokens_seen": 35121664,
|
|
"step": 11155
|
|
},
|
|
{
|
|
"epoch": 0.7144228922604187,
|
|
"grad_norm": 42.922950744628906,
|
|
"learning_rate": 4.572928400907529e-07,
|
|
"loss": 0.491,
|
|
"num_input_tokens_seen": 35137152,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"epoch": 0.7147429742013955,
|
|
"grad_norm": 38.647911071777344,
|
|
"learning_rate": 4.5635467689272434e-07,
|
|
"loss": 0.3682,
|
|
"num_input_tokens_seen": 35153088,
|
|
"step": 11165
|
|
},
|
|
{
|
|
"epoch": 0.7150630561423724,
|
|
"grad_norm": 22.412986755371094,
|
|
"learning_rate": 4.554171924422655e-07,
|
|
"loss": 0.3654,
|
|
"num_input_tokens_seen": 35168192,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"epoch": 0.7153831380833493,
|
|
"grad_norm": 20.65825653076172,
|
|
"learning_rate": 4.544803879098356e-07,
|
|
"loss": 0.3242,
|
|
"num_input_tokens_seen": 35184192,
|
|
"step": 11175
|
|
},
|
|
{
|
|
"epoch": 0.7157032200243262,
|
|
"grad_norm": 23.79654884338379,
|
|
"learning_rate": 4.535442644650462e-07,
|
|
"loss": 0.3848,
|
|
"num_input_tokens_seen": 35200256,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"epoch": 0.7160233019653032,
|
|
"grad_norm": 24.546035766601562,
|
|
"learning_rate": 4.5260882327665906e-07,
|
|
"loss": 0.4889,
|
|
"num_input_tokens_seen": 35214720,
|
|
"step": 11185
|
|
},
|
|
{
|
|
"epoch": 0.71634338390628,
|
|
"grad_norm": 38.554954528808594,
|
|
"learning_rate": 4.5167406551258347e-07,
|
|
"loss": 0.5077,
|
|
"num_input_tokens_seen": 35230720,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"epoch": 0.7166634658472569,
|
|
"grad_norm": 29.644372940063477,
|
|
"learning_rate": 4.5073999233987445e-07,
|
|
"loss": 0.3948,
|
|
"num_input_tokens_seen": 35246400,
|
|
"step": 11195
|
|
},
|
|
{
|
|
"epoch": 0.7169835477882338,
|
|
"grad_norm": 30.197397232055664,
|
|
"learning_rate": 4.4980660492473434e-07,
|
|
"loss": 0.4854,
|
|
"num_input_tokens_seen": 35262784,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 0.7173036297292107,
|
|
"grad_norm": 14.077301025390625,
|
|
"learning_rate": 4.4887390443250804e-07,
|
|
"loss": 0.2735,
|
|
"num_input_tokens_seen": 35277632,
|
|
"step": 11205
|
|
},
|
|
{
|
|
"epoch": 0.7176237116701876,
|
|
"grad_norm": 18.285058975219727,
|
|
"learning_rate": 4.4794189202768295e-07,
|
|
"loss": 0.2981,
|
|
"num_input_tokens_seen": 35292544,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"epoch": 0.7179437936111644,
|
|
"grad_norm": 32.815086364746094,
|
|
"learning_rate": 4.4701056887388757e-07,
|
|
"loss": 0.3816,
|
|
"num_input_tokens_seen": 35308352,
|
|
"step": 11215
|
|
},
|
|
{
|
|
"epoch": 0.7182638755521413,
|
|
"grad_norm": 31.119327545166016,
|
|
"learning_rate": 4.460799361338897e-07,
|
|
"loss": 0.3307,
|
|
"num_input_tokens_seen": 35323904,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"epoch": 0.7185839574931182,
|
|
"grad_norm": 18.72206687927246,
|
|
"learning_rate": 4.451499949695954e-07,
|
|
"loss": 0.4203,
|
|
"num_input_tokens_seen": 35340224,
|
|
"step": 11225
|
|
},
|
|
{
|
|
"epoch": 0.7189040394340951,
|
|
"grad_norm": 17.376712799072266,
|
|
"learning_rate": 4.44220746542047e-07,
|
|
"loss": 0.375,
|
|
"num_input_tokens_seen": 35355776,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"epoch": 0.719224121375072,
|
|
"grad_norm": 38.786521911621094,
|
|
"learning_rate": 4.432921920114221e-07,
|
|
"loss": 0.474,
|
|
"num_input_tokens_seen": 35371072,
|
|
"step": 11235
|
|
},
|
|
{
|
|
"epoch": 0.719544203316049,
|
|
"grad_norm": 36.586570739746094,
|
|
"learning_rate": 4.4236433253703185e-07,
|
|
"loss": 0.3144,
|
|
"num_input_tokens_seen": 35387520,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"epoch": 0.7198642852570258,
|
|
"grad_norm": 36.61032485961914,
|
|
"learning_rate": 4.4143716927732e-07,
|
|
"loss": 0.4042,
|
|
"num_input_tokens_seen": 35403840,
|
|
"step": 11245
|
|
},
|
|
{
|
|
"epoch": 0.7201843671980027,
|
|
"grad_norm": 26.34575843811035,
|
|
"learning_rate": 4.405107033898604e-07,
|
|
"loss": 0.3767,
|
|
"num_input_tokens_seen": 35420032,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"epoch": 0.7205044491389796,
|
|
"grad_norm": 33.59138107299805,
|
|
"learning_rate": 4.395849360313568e-07,
|
|
"loss": 0.2887,
|
|
"num_input_tokens_seen": 35436032,
|
|
"step": 11255
|
|
},
|
|
{
|
|
"epoch": 0.7208245310799565,
|
|
"grad_norm": 44.58377456665039,
|
|
"learning_rate": 4.386598683576406e-07,
|
|
"loss": 0.3505,
|
|
"num_input_tokens_seen": 35451136,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"epoch": 0.7211446130209334,
|
|
"grad_norm": 17.373126983642578,
|
|
"learning_rate": 4.377355015236696e-07,
|
|
"loss": 0.4744,
|
|
"num_input_tokens_seen": 35466816,
|
|
"step": 11265
|
|
},
|
|
{
|
|
"epoch": 0.7214646949619102,
|
|
"grad_norm": 33.182308197021484,
|
|
"learning_rate": 4.368118366835266e-07,
|
|
"loss": 0.3588,
|
|
"num_input_tokens_seen": 35483456,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"epoch": 0.7217847769028871,
|
|
"grad_norm": 40.823421478271484,
|
|
"learning_rate": 4.358888749904177e-07,
|
|
"loss": 0.4691,
|
|
"num_input_tokens_seen": 35499584,
|
|
"step": 11275
|
|
},
|
|
{
|
|
"epoch": 0.722104858843864,
|
|
"grad_norm": 24.432401657104492,
|
|
"learning_rate": 4.349666175966725e-07,
|
|
"loss": 0.3521,
|
|
"num_input_tokens_seen": 35515328,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"epoch": 0.7224249407848409,
|
|
"grad_norm": 18.420427322387695,
|
|
"learning_rate": 4.340450656537392e-07,
|
|
"loss": 0.4721,
|
|
"num_input_tokens_seen": 35530048,
|
|
"step": 11285
|
|
},
|
|
{
|
|
"epoch": 0.7227450227258178,
|
|
"grad_norm": 31.080825805664062,
|
|
"learning_rate": 4.331242203121861e-07,
|
|
"loss": 0.2995,
|
|
"num_input_tokens_seen": 35545792,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"epoch": 0.7230651046667947,
|
|
"grad_norm": 43.900115966796875,
|
|
"learning_rate": 4.322040827217004e-07,
|
|
"loss": 0.3775,
|
|
"num_input_tokens_seen": 35561344,
|
|
"step": 11295
|
|
},
|
|
{
|
|
"epoch": 0.7233851866077716,
|
|
"grad_norm": 42.54143142700195,
|
|
"learning_rate": 4.312846540310838e-07,
|
|
"loss": 0.4064,
|
|
"num_input_tokens_seen": 35577024,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 0.7237052685487485,
|
|
"grad_norm": 25.552127838134766,
|
|
"learning_rate": 4.3036593538825373e-07,
|
|
"loss": 0.3527,
|
|
"num_input_tokens_seen": 35592192,
|
|
"step": 11305
|
|
},
|
|
{
|
|
"epoch": 0.7240253504897254,
|
|
"grad_norm": 15.031996726989746,
|
|
"learning_rate": 4.2944792794024196e-07,
|
|
"loss": 0.3375,
|
|
"num_input_tokens_seen": 35607872,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"epoch": 0.7243454324307023,
|
|
"grad_norm": 23.10059928894043,
|
|
"learning_rate": 4.285306328331915e-07,
|
|
"loss": 0.3015,
|
|
"num_input_tokens_seen": 35623872,
|
|
"step": 11315
|
|
},
|
|
{
|
|
"epoch": 0.7246655143716791,
|
|
"grad_norm": 27.68567657470703,
|
|
"learning_rate": 4.2761405121235506e-07,
|
|
"loss": 0.3168,
|
|
"num_input_tokens_seen": 35638720,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"epoch": 0.724985596312656,
|
|
"grad_norm": 21.363649368286133,
|
|
"learning_rate": 4.266981842220965e-07,
|
|
"loss": 0.538,
|
|
"num_input_tokens_seen": 35655680,
|
|
"step": 11325
|
|
},
|
|
{
|
|
"epoch": 0.7253056782536329,
|
|
"grad_norm": 25.60169219970703,
|
|
"learning_rate": 4.257830330058864e-07,
|
|
"loss": 0.2663,
|
|
"num_input_tokens_seen": 35671168,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"epoch": 0.7256257601946098,
|
|
"grad_norm": 28.766132354736328,
|
|
"learning_rate": 4.248685987063019e-07,
|
|
"loss": 0.4085,
|
|
"num_input_tokens_seen": 35686848,
|
|
"step": 11335
|
|
},
|
|
{
|
|
"epoch": 0.7259458421355867,
|
|
"grad_norm": 25.852869033813477,
|
|
"learning_rate": 4.2395488246502396e-07,
|
|
"loss": 0.3486,
|
|
"num_input_tokens_seen": 35702720,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"epoch": 0.7262659240765637,
|
|
"grad_norm": 35.1387939453125,
|
|
"learning_rate": 4.2304188542283913e-07,
|
|
"loss": 0.4532,
|
|
"num_input_tokens_seen": 35720640,
|
|
"step": 11345
|
|
},
|
|
{
|
|
"epoch": 0.7265860060175405,
|
|
"grad_norm": 63.59513854980469,
|
|
"learning_rate": 4.221296087196347e-07,
|
|
"loss": 0.3855,
|
|
"num_input_tokens_seen": 35735424,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"epoch": 0.7269060879585174,
|
|
"grad_norm": 22.047700881958008,
|
|
"learning_rate": 4.2121805349439867e-07,
|
|
"loss": 0.46,
|
|
"num_input_tokens_seen": 35751168,
|
|
"step": 11355
|
|
},
|
|
{
|
|
"epoch": 0.7272261698994943,
|
|
"grad_norm": 29.550992965698242,
|
|
"learning_rate": 4.203072208852184e-07,
|
|
"loss": 0.3829,
|
|
"num_input_tokens_seen": 35767168,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"epoch": 0.7275462518404712,
|
|
"grad_norm": 47.271080017089844,
|
|
"learning_rate": 4.193971120292793e-07,
|
|
"loss": 0.447,
|
|
"num_input_tokens_seen": 35782464,
|
|
"step": 11365
|
|
},
|
|
{
|
|
"epoch": 0.727866333781448,
|
|
"grad_norm": 18.53926658630371,
|
|
"learning_rate": 4.184877280628629e-07,
|
|
"loss": 0.4004,
|
|
"num_input_tokens_seen": 35798592,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"epoch": 0.7281864157224249,
|
|
"grad_norm": 35.71843719482422,
|
|
"learning_rate": 4.1757907012134565e-07,
|
|
"loss": 0.3955,
|
|
"num_input_tokens_seen": 35814720,
|
|
"step": 11375
|
|
},
|
|
{
|
|
"epoch": 0.7285064976634018,
|
|
"grad_norm": 32.7597770690918,
|
|
"learning_rate": 4.166711393391978e-07,
|
|
"loss": 0.2807,
|
|
"num_input_tokens_seen": 35830016,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"epoch": 0.7288265796043787,
|
|
"grad_norm": 17.185914993286133,
|
|
"learning_rate": 4.1576393684998146e-07,
|
|
"loss": 0.3365,
|
|
"num_input_tokens_seen": 35845632,
|
|
"step": 11385
|
|
},
|
|
{
|
|
"epoch": 0.7291466615453556,
|
|
"grad_norm": 23.883012771606445,
|
|
"learning_rate": 4.1485746378634966e-07,
|
|
"loss": 0.3505,
|
|
"num_input_tokens_seen": 35861184,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"epoch": 0.7294667434863324,
|
|
"grad_norm": 36.17485046386719,
|
|
"learning_rate": 4.1395172128004473e-07,
|
|
"loss": 0.4186,
|
|
"num_input_tokens_seen": 35876864,
|
|
"step": 11395
|
|
},
|
|
{
|
|
"epoch": 0.7297868254273094,
|
|
"grad_norm": 23.241865158081055,
|
|
"learning_rate": 4.130467104618963e-07,
|
|
"loss": 0.3272,
|
|
"num_input_tokens_seen": 35893568,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 0.7301069073682863,
|
|
"grad_norm": 27.966672897338867,
|
|
"learning_rate": 4.1214243246182223e-07,
|
|
"loss": 0.3336,
|
|
"num_input_tokens_seen": 35909696,
|
|
"step": 11405
|
|
},
|
|
{
|
|
"epoch": 0.7304269893092632,
|
|
"grad_norm": 27.110546112060547,
|
|
"learning_rate": 4.1123888840882306e-07,
|
|
"loss": 0.465,
|
|
"num_input_tokens_seen": 35925120,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"epoch": 0.7307470712502401,
|
|
"grad_norm": 27.762094497680664,
|
|
"learning_rate": 4.1033607943098415e-07,
|
|
"loss": 0.3184,
|
|
"num_input_tokens_seen": 35940800,
|
|
"step": 11415
|
|
},
|
|
{
|
|
"epoch": 0.731067153191217,
|
|
"grad_norm": 13.206759452819824,
|
|
"learning_rate": 4.0943400665547423e-07,
|
|
"loss": 0.3461,
|
|
"num_input_tokens_seen": 35955968,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"epoch": 0.7313872351321938,
|
|
"grad_norm": 45.63411331176758,
|
|
"learning_rate": 4.0853267120854064e-07,
|
|
"loss": 0.3261,
|
|
"num_input_tokens_seen": 35972096,
|
|
"step": 11425
|
|
},
|
|
{
|
|
"epoch": 0.7317073170731707,
|
|
"grad_norm": 22.173538208007812,
|
|
"learning_rate": 4.076320742155117e-07,
|
|
"loss": 0.3358,
|
|
"num_input_tokens_seen": 35986624,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"epoch": 0.7320273990141476,
|
|
"grad_norm": 12.858855247497559,
|
|
"learning_rate": 4.067322168007928e-07,
|
|
"loss": 0.3546,
|
|
"num_input_tokens_seen": 36003008,
|
|
"step": 11435
|
|
},
|
|
{
|
|
"epoch": 0.7323474809551245,
|
|
"grad_norm": 24.93453598022461,
|
|
"learning_rate": 4.0583310008786775e-07,
|
|
"loss": 0.3539,
|
|
"num_input_tokens_seen": 36017152,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"epoch": 0.7326675628961014,
|
|
"grad_norm": 42.7269287109375,
|
|
"learning_rate": 4.049347251992932e-07,
|
|
"loss": 0.2777,
|
|
"num_input_tokens_seen": 36031936,
|
|
"step": 11445
|
|
},
|
|
{
|
|
"epoch": 0.7329876448370783,
|
|
"grad_norm": 27.098237991333008,
|
|
"learning_rate": 4.0403709325670064e-07,
|
|
"loss": 0.3461,
|
|
"num_input_tokens_seen": 36048064,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"epoch": 0.7333077267780552,
|
|
"grad_norm": 55.99066162109375,
|
|
"learning_rate": 4.03140205380795e-07,
|
|
"loss": 0.4433,
|
|
"num_input_tokens_seen": 36064256,
|
|
"step": 11455
|
|
},
|
|
{
|
|
"epoch": 0.7336278087190321,
|
|
"grad_norm": 65.33406066894531,
|
|
"learning_rate": 4.0224406269135115e-07,
|
|
"loss": 0.6545,
|
|
"num_input_tokens_seen": 36079424,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"epoch": 0.733947890660009,
|
|
"grad_norm": 45.920005798339844,
|
|
"learning_rate": 4.0134866630721266e-07,
|
|
"loss": 0.3062,
|
|
"num_input_tokens_seen": 36095424,
|
|
"step": 11465
|
|
},
|
|
{
|
|
"epoch": 0.7342679726009859,
|
|
"grad_norm": 24.08492660522461,
|
|
"learning_rate": 4.0045401734629367e-07,
|
|
"loss": 0.3666,
|
|
"num_input_tokens_seen": 36111360,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"epoch": 0.7345880545419627,
|
|
"grad_norm": 25.49542236328125,
|
|
"learning_rate": 3.9956011692557377e-07,
|
|
"loss": 0.3819,
|
|
"num_input_tokens_seen": 36127232,
|
|
"step": 11475
|
|
},
|
|
{
|
|
"epoch": 0.7349081364829396,
|
|
"grad_norm": 50.120731353759766,
|
|
"learning_rate": 3.986669661610972e-07,
|
|
"loss": 0.3447,
|
|
"num_input_tokens_seen": 36143168,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"epoch": 0.7352282184239165,
|
|
"grad_norm": 34.99326705932617,
|
|
"learning_rate": 3.9777456616797414e-07,
|
|
"loss": 0.329,
|
|
"num_input_tokens_seen": 36158272,
|
|
"step": 11485
|
|
},
|
|
{
|
|
"epoch": 0.7355483003648934,
|
|
"grad_norm": 53.85727310180664,
|
|
"learning_rate": 3.968829180603761e-07,
|
|
"loss": 0.3544,
|
|
"num_input_tokens_seen": 36173056,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"epoch": 0.7358683823058703,
|
|
"grad_norm": 48.56296920776367,
|
|
"learning_rate": 3.9599202295153624e-07,
|
|
"loss": 0.4025,
|
|
"num_input_tokens_seen": 36187904,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"epoch": 0.7361884642468471,
|
|
"grad_norm": 70.58976745605469,
|
|
"learning_rate": 3.951018819537476e-07,
|
|
"loss": 0.3587,
|
|
"num_input_tokens_seen": 36205632,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 0.7365085461878241,
|
|
"grad_norm": 33.671356201171875,
|
|
"learning_rate": 3.942124961783616e-07,
|
|
"loss": 0.3492,
|
|
"num_input_tokens_seen": 36220160,
|
|
"step": 11505
|
|
},
|
|
{
|
|
"epoch": 0.736828628128801,
|
|
"grad_norm": 27.56850242614746,
|
|
"learning_rate": 3.933238667357869e-07,
|
|
"loss": 0.3096,
|
|
"num_input_tokens_seen": 36236416,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"epoch": 0.7371487100697779,
|
|
"grad_norm": 41.809757232666016,
|
|
"learning_rate": 3.924359947354876e-07,
|
|
"loss": 0.3546,
|
|
"num_input_tokens_seen": 36251584,
|
|
"step": 11515
|
|
},
|
|
{
|
|
"epoch": 0.7374687920107548,
|
|
"grad_norm": 13.83644962310791,
|
|
"learning_rate": 3.915488812859826e-07,
|
|
"loss": 0.3261,
|
|
"num_input_tokens_seen": 36265856,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"epoch": 0.7377888739517316,
|
|
"grad_norm": 58.69389724731445,
|
|
"learning_rate": 3.90662527494843e-07,
|
|
"loss": 0.3797,
|
|
"num_input_tokens_seen": 36283904,
|
|
"step": 11525
|
|
},
|
|
{
|
|
"epoch": 0.7381089558927085,
|
|
"grad_norm": 34.365379333496094,
|
|
"learning_rate": 3.8977693446869285e-07,
|
|
"loss": 0.3638,
|
|
"num_input_tokens_seen": 36298432,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"epoch": 0.7384290378336854,
|
|
"grad_norm": 28.92525291442871,
|
|
"learning_rate": 3.8889210331320445e-07,
|
|
"loss": 0.3298,
|
|
"num_input_tokens_seen": 36313728,
|
|
"step": 11535
|
|
},
|
|
{
|
|
"epoch": 0.7387491197746623,
|
|
"grad_norm": 20.923290252685547,
|
|
"learning_rate": 3.8800803513310033e-07,
|
|
"loss": 0.3795,
|
|
"num_input_tokens_seen": 36329088,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"epoch": 0.7390692017156392,
|
|
"grad_norm": 37.18941116333008,
|
|
"learning_rate": 3.8712473103214993e-07,
|
|
"loss": 0.4125,
|
|
"num_input_tokens_seen": 36345024,
|
|
"step": 11545
|
|
},
|
|
{
|
|
"epoch": 0.739389283656616,
|
|
"grad_norm": 21.793880462646484,
|
|
"learning_rate": 3.862421921131688e-07,
|
|
"loss": 0.3077,
|
|
"num_input_tokens_seen": 36361792,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"epoch": 0.739709365597593,
|
|
"grad_norm": 39.02511978149414,
|
|
"learning_rate": 3.85360419478017e-07,
|
|
"loss": 0.2844,
|
|
"num_input_tokens_seen": 36377152,
|
|
"step": 11555
|
|
},
|
|
{
|
|
"epoch": 0.7400294475385699,
|
|
"grad_norm": 22.608049392700195,
|
|
"learning_rate": 3.8447941422759786e-07,
|
|
"loss": 0.346,
|
|
"num_input_tokens_seen": 36394048,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"epoch": 0.7403495294795468,
|
|
"grad_norm": 31.372352600097656,
|
|
"learning_rate": 3.835991774618579e-07,
|
|
"loss": 0.3546,
|
|
"num_input_tokens_seen": 36409152,
|
|
"step": 11565
|
|
},
|
|
{
|
|
"epoch": 0.7406696114205237,
|
|
"grad_norm": 89.33686065673828,
|
|
"learning_rate": 3.827197102797818e-07,
|
|
"loss": 0.3882,
|
|
"num_input_tokens_seen": 36427072,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"epoch": 0.7409896933615006,
|
|
"grad_norm": 60.6555290222168,
|
|
"learning_rate": 3.818410137793947e-07,
|
|
"loss": 0.4667,
|
|
"num_input_tokens_seen": 36444288,
|
|
"step": 11575
|
|
},
|
|
{
|
|
"epoch": 0.7413097753024774,
|
|
"grad_norm": 17.558565139770508,
|
|
"learning_rate": 3.809630890577602e-07,
|
|
"loss": 0.4323,
|
|
"num_input_tokens_seen": 36460096,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"epoch": 0.7416298572434543,
|
|
"grad_norm": 123.81878662109375,
|
|
"learning_rate": 3.800859372109777e-07,
|
|
"loss": 0.3414,
|
|
"num_input_tokens_seen": 36475264,
|
|
"step": 11585
|
|
},
|
|
{
|
|
"epoch": 0.7419499391844312,
|
|
"grad_norm": 19.393999099731445,
|
|
"learning_rate": 3.7920955933418055e-07,
|
|
"loss": 0.3205,
|
|
"num_input_tokens_seen": 36491264,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"epoch": 0.7422700211254081,
|
|
"grad_norm": 45.717002868652344,
|
|
"learning_rate": 3.7833395652153775e-07,
|
|
"loss": 0.3158,
|
|
"num_input_tokens_seen": 36506368,
|
|
"step": 11595
|
|
},
|
|
{
|
|
"epoch": 0.742590103066385,
|
|
"grad_norm": 36.06786346435547,
|
|
"learning_rate": 3.774591298662497e-07,
|
|
"loss": 0.2953,
|
|
"num_input_tokens_seen": 36522432,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 0.7429101850073618,
|
|
"grad_norm": 68.29784393310547,
|
|
"learning_rate": 3.765850804605468e-07,
|
|
"loss": 0.4255,
|
|
"num_input_tokens_seen": 36539008,
|
|
"step": 11605
|
|
},
|
|
{
|
|
"epoch": 0.7432302669483388,
|
|
"grad_norm": 23.243270874023438,
|
|
"learning_rate": 3.7571180939569104e-07,
|
|
"loss": 0.2863,
|
|
"num_input_tokens_seen": 36554240,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"epoch": 0.7435503488893157,
|
|
"grad_norm": 33.98516082763672,
|
|
"learning_rate": 3.748393177619711e-07,
|
|
"loss": 0.308,
|
|
"num_input_tokens_seen": 36569920,
|
|
"step": 11615
|
|
},
|
|
{
|
|
"epoch": 0.7438704308302926,
|
|
"grad_norm": 32.500240325927734,
|
|
"learning_rate": 3.739676066487032e-07,
|
|
"loss": 0.3273,
|
|
"num_input_tokens_seen": 36585792,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"epoch": 0.7441905127712695,
|
|
"grad_norm": 21.474756240844727,
|
|
"learning_rate": 3.730966771442289e-07,
|
|
"loss": 0.2906,
|
|
"num_input_tokens_seen": 36601280,
|
|
"step": 11625
|
|
},
|
|
{
|
|
"epoch": 0.7445105947122463,
|
|
"grad_norm": 34.38766860961914,
|
|
"learning_rate": 3.722265303359137e-07,
|
|
"loss": 0.5193,
|
|
"num_input_tokens_seen": 36617152,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"epoch": 0.7448306766532232,
|
|
"grad_norm": 57.639156341552734,
|
|
"learning_rate": 3.713571673101463e-07,
|
|
"loss": 0.4,
|
|
"num_input_tokens_seen": 36632512,
|
|
"step": 11635
|
|
},
|
|
{
|
|
"epoch": 0.7451507585942001,
|
|
"grad_norm": 13.589853286743164,
|
|
"learning_rate": 3.704885891523366e-07,
|
|
"loss": 0.3338,
|
|
"num_input_tokens_seen": 36647744,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"epoch": 0.745470840535177,
|
|
"grad_norm": 36.2945442199707,
|
|
"learning_rate": 3.696207969469146e-07,
|
|
"loss": 0.3878,
|
|
"num_input_tokens_seen": 36663360,
|
|
"step": 11645
|
|
},
|
|
{
|
|
"epoch": 0.7457909224761539,
|
|
"grad_norm": 28.11053466796875,
|
|
"learning_rate": 3.6875379177732913e-07,
|
|
"loss": 0.3571,
|
|
"num_input_tokens_seen": 36678656,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"epoch": 0.7461110044171307,
|
|
"grad_norm": 71.32048034667969,
|
|
"learning_rate": 3.6788757472604634e-07,
|
|
"loss": 0.4971,
|
|
"num_input_tokens_seen": 36693952,
|
|
"step": 11655
|
|
},
|
|
{
|
|
"epoch": 0.7464310863581076,
|
|
"grad_norm": 34.13432312011719,
|
|
"learning_rate": 3.6702214687454825e-07,
|
|
"loss": 0.3139,
|
|
"num_input_tokens_seen": 36709888,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"epoch": 0.7467511682990846,
|
|
"grad_norm": 26.685256958007812,
|
|
"learning_rate": 3.6615750930333177e-07,
|
|
"loss": 0.3103,
|
|
"num_input_tokens_seen": 36725504,
|
|
"step": 11665
|
|
},
|
|
{
|
|
"epoch": 0.7470712502400615,
|
|
"grad_norm": 11.093647956848145,
|
|
"learning_rate": 3.65293663091907e-07,
|
|
"loss": 0.3055,
|
|
"num_input_tokens_seen": 36741376,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"epoch": 0.7473913321810384,
|
|
"grad_norm": 34.28535461425781,
|
|
"learning_rate": 3.6443060931879623e-07,
|
|
"loss": 0.4277,
|
|
"num_input_tokens_seen": 36756864,
|
|
"step": 11675
|
|
},
|
|
{
|
|
"epoch": 0.7477114141220152,
|
|
"grad_norm": 28.244558334350586,
|
|
"learning_rate": 3.635683490615321e-07,
|
|
"loss": 0.4503,
|
|
"num_input_tokens_seen": 36772608,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"epoch": 0.7480314960629921,
|
|
"grad_norm": 76.50003051757812,
|
|
"learning_rate": 3.6270688339665634e-07,
|
|
"loss": 0.2975,
|
|
"num_input_tokens_seen": 36788352,
|
|
"step": 11685
|
|
},
|
|
{
|
|
"epoch": 0.748351578003969,
|
|
"grad_norm": 38.5800666809082,
|
|
"learning_rate": 3.6184621339972e-07,
|
|
"loss": 0.3444,
|
|
"num_input_tokens_seen": 36804096,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"epoch": 0.7486716599449459,
|
|
"grad_norm": 42.00413131713867,
|
|
"learning_rate": 3.609863401452786e-07,
|
|
"loss": 0.3568,
|
|
"num_input_tokens_seen": 36819776,
|
|
"step": 11695
|
|
},
|
|
{
|
|
"epoch": 0.7489917418859228,
|
|
"grad_norm": 37.22871017456055,
|
|
"learning_rate": 3.6012726470689416e-07,
|
|
"loss": 0.4084,
|
|
"num_input_tokens_seen": 36835072,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 0.7493118238268996,
|
|
"grad_norm": 25.7962646484375,
|
|
"learning_rate": 3.592689881571329e-07,
|
|
"loss": 0.3318,
|
|
"num_input_tokens_seen": 36850816,
|
|
"step": 11705
|
|
},
|
|
{
|
|
"epoch": 0.7496319057678765,
|
|
"grad_norm": 36.8912467956543,
|
|
"learning_rate": 3.5841151156756334e-07,
|
|
"loss": 0.4348,
|
|
"num_input_tokens_seen": 36866368,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"epoch": 0.7499519877088535,
|
|
"grad_norm": 39.81080627441406,
|
|
"learning_rate": 3.575548360087539e-07,
|
|
"loss": 0.3994,
|
|
"num_input_tokens_seen": 36885376,
|
|
"step": 11715
|
|
},
|
|
{
|
|
"epoch": 0.7502720696498304,
|
|
"grad_norm": 17.17061996459961,
|
|
"learning_rate": 3.5669896255027533e-07,
|
|
"loss": 0.3173,
|
|
"num_input_tokens_seen": 36900288,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"epoch": 0.7505921515908073,
|
|
"grad_norm": 16.3179988861084,
|
|
"learning_rate": 3.5584389226069543e-07,
|
|
"loss": 0.4035,
|
|
"num_input_tokens_seen": 36916224,
|
|
"step": 11725
|
|
},
|
|
{
|
|
"epoch": 0.7509122335317842,
|
|
"grad_norm": 18.672239303588867,
|
|
"learning_rate": 3.5498962620757866e-07,
|
|
"loss": 0.2995,
|
|
"num_input_tokens_seen": 36931648,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"epoch": 0.7509122335317842,
|
|
"eval_loss": 0.3647865653038025,
|
|
"eval_runtime": 49.1603,
|
|
"eval_samples_per_second": 282.464,
|
|
"eval_steps_per_second": 35.313,
|
|
"num_input_tokens_seen": 36931648,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"epoch": 0.751232315472761,
|
|
"grad_norm": 94.76543426513672,
|
|
"learning_rate": 3.5413616545748713e-07,
|
|
"loss": 0.4327,
|
|
"num_input_tokens_seen": 36945856,
|
|
"step": 11735
|
|
},
|
|
{
|
|
"epoch": 0.7515523974137379,
|
|
"grad_norm": 24.62285804748535,
|
|
"learning_rate": 3.532835110759763e-07,
|
|
"loss": 0.5026,
|
|
"num_input_tokens_seen": 36961792,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"epoch": 0.7518724793547148,
|
|
"grad_norm": 24.09138298034668,
|
|
"learning_rate": 3.524316641275955e-07,
|
|
"loss": 0.3038,
|
|
"num_input_tokens_seen": 36977152,
|
|
"step": 11745
|
|
},
|
|
{
|
|
"epoch": 0.7521925612956917,
|
|
"grad_norm": 19.018442153930664,
|
|
"learning_rate": 3.5158062567588467e-07,
|
|
"loss": 0.4152,
|
|
"num_input_tokens_seen": 36991936,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"epoch": 0.7525126432366686,
|
|
"grad_norm": 64.08114624023438,
|
|
"learning_rate": 3.5073039678337633e-07,
|
|
"loss": 0.3924,
|
|
"num_input_tokens_seen": 37006784,
|
|
"step": 11755
|
|
},
|
|
{
|
|
"epoch": 0.7528327251776454,
|
|
"grad_norm": 36.50153732299805,
|
|
"learning_rate": 3.498809785115908e-07,
|
|
"loss": 0.348,
|
|
"num_input_tokens_seen": 37022208,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"epoch": 0.7531528071186223,
|
|
"grad_norm": 12.052895545959473,
|
|
"learning_rate": 3.4903237192103697e-07,
|
|
"loss": 0.3504,
|
|
"num_input_tokens_seen": 37039488,
|
|
"step": 11765
|
|
},
|
|
{
|
|
"epoch": 0.7534728890595993,
|
|
"grad_norm": 34.9785270690918,
|
|
"learning_rate": 3.481845780712099e-07,
|
|
"loss": 0.3372,
|
|
"num_input_tokens_seen": 37056064,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"epoch": 0.7537929710005762,
|
|
"grad_norm": 47.886329650878906,
|
|
"learning_rate": 3.4733759802059037e-07,
|
|
"loss": 0.3354,
|
|
"num_input_tokens_seen": 37072256,
|
|
"step": 11775
|
|
},
|
|
{
|
|
"epoch": 0.7541130529415531,
|
|
"grad_norm": 68.2834701538086,
|
|
"learning_rate": 3.4649143282664273e-07,
|
|
"loss": 0.4239,
|
|
"num_input_tokens_seen": 37087360,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"epoch": 0.7544331348825299,
|
|
"grad_norm": 27.56783676147461,
|
|
"learning_rate": 3.456460835458143e-07,
|
|
"loss": 0.2992,
|
|
"num_input_tokens_seen": 37102144,
|
|
"step": 11785
|
|
},
|
|
{
|
|
"epoch": 0.7547532168235068,
|
|
"grad_norm": 42.50265121459961,
|
|
"learning_rate": 3.4480155123353337e-07,
|
|
"loss": 0.3172,
|
|
"num_input_tokens_seen": 37117568,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"epoch": 0.7550732987644837,
|
|
"grad_norm": 35.93981170654297,
|
|
"learning_rate": 3.4395783694420875e-07,
|
|
"loss": 0.4541,
|
|
"num_input_tokens_seen": 37132800,
|
|
"step": 11795
|
|
},
|
|
{
|
|
"epoch": 0.7553933807054606,
|
|
"grad_norm": 20.164365768432617,
|
|
"learning_rate": 3.4311494173122743e-07,
|
|
"loss": 0.4009,
|
|
"num_input_tokens_seen": 37147776,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"epoch": 0.7557134626464375,
|
|
"grad_norm": 26.284648895263672,
|
|
"learning_rate": 3.422728666469541e-07,
|
|
"loss": 0.387,
|
|
"num_input_tokens_seen": 37163904,
|
|
"step": 11805
|
|
},
|
|
{
|
|
"epoch": 0.7560335445874143,
|
|
"grad_norm": 42.91219711303711,
|
|
"learning_rate": 3.41431612742729e-07,
|
|
"loss": 0.4272,
|
|
"num_input_tokens_seen": 37180416,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"epoch": 0.7563536265283912,
|
|
"grad_norm": 23.454986572265625,
|
|
"learning_rate": 3.4059118106886855e-07,
|
|
"loss": 0.4243,
|
|
"num_input_tokens_seen": 37196480,
|
|
"step": 11815
|
|
},
|
|
{
|
|
"epoch": 0.7566737084693682,
|
|
"grad_norm": 74.44619750976562,
|
|
"learning_rate": 3.3975157267466036e-07,
|
|
"loss": 0.5118,
|
|
"num_input_tokens_seen": 37211648,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"epoch": 0.7569937904103451,
|
|
"grad_norm": 25.939687728881836,
|
|
"learning_rate": 3.389127886083656e-07,
|
|
"loss": 0.29,
|
|
"num_input_tokens_seen": 37227072,
|
|
"step": 11825
|
|
},
|
|
{
|
|
"epoch": 0.757313872351322,
|
|
"grad_norm": 24.652931213378906,
|
|
"learning_rate": 3.3807482991721667e-07,
|
|
"loss": 0.3415,
|
|
"num_input_tokens_seen": 37243968,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"epoch": 0.7576339542922989,
|
|
"grad_norm": 20.54140853881836,
|
|
"learning_rate": 3.3723769764741474e-07,
|
|
"loss": 0.3219,
|
|
"num_input_tokens_seen": 37259200,
|
|
"step": 11835
|
|
},
|
|
{
|
|
"epoch": 0.7579540362332757,
|
|
"grad_norm": 15.427878379821777,
|
|
"learning_rate": 3.3640139284412825e-07,
|
|
"loss": 0.2948,
|
|
"num_input_tokens_seen": 37275072,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"epoch": 0.7582741181742526,
|
|
"grad_norm": 42.64249038696289,
|
|
"learning_rate": 3.355659165514948e-07,
|
|
"loss": 0.399,
|
|
"num_input_tokens_seen": 37291392,
|
|
"step": 11845
|
|
},
|
|
{
|
|
"epoch": 0.7585942001152295,
|
|
"grad_norm": 16.320554733276367,
|
|
"learning_rate": 3.347312698126161e-07,
|
|
"loss": 0.2714,
|
|
"num_input_tokens_seen": 37307648,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"epoch": 0.7589142820562064,
|
|
"grad_norm": 13.9678316116333,
|
|
"learning_rate": 3.338974536695578e-07,
|
|
"loss": 0.2191,
|
|
"num_input_tokens_seen": 37323136,
|
|
"step": 11855
|
|
},
|
|
{
|
|
"epoch": 0.7592343639971832,
|
|
"grad_norm": 21.917150497436523,
|
|
"learning_rate": 3.330644691633492e-07,
|
|
"loss": 0.3183,
|
|
"num_input_tokens_seen": 37338496,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"epoch": 0.7595544459381601,
|
|
"grad_norm": 10.6149320602417,
|
|
"learning_rate": 3.322323173339818e-07,
|
|
"loss": 0.2783,
|
|
"num_input_tokens_seen": 37356800,
|
|
"step": 11865
|
|
},
|
|
{
|
|
"epoch": 0.759874527879137,
|
|
"grad_norm": 25.766250610351562,
|
|
"learning_rate": 3.314009992204071e-07,
|
|
"loss": 0.4264,
|
|
"num_input_tokens_seen": 37372800,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"epoch": 0.760194609820114,
|
|
"grad_norm": 66.81485748291016,
|
|
"learning_rate": 3.3057051586053443e-07,
|
|
"loss": 0.3269,
|
|
"num_input_tokens_seen": 37388608,
|
|
"step": 11875
|
|
},
|
|
{
|
|
"epoch": 0.7605146917610909,
|
|
"grad_norm": 35.36101150512695,
|
|
"learning_rate": 3.297408682912329e-07,
|
|
"loss": 0.4584,
|
|
"num_input_tokens_seen": 37405184,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"epoch": 0.7608347737020678,
|
|
"grad_norm": 21.154664993286133,
|
|
"learning_rate": 3.289120575483271e-07,
|
|
"loss": 0.2741,
|
|
"num_input_tokens_seen": 37420096,
|
|
"step": 11885
|
|
},
|
|
{
|
|
"epoch": 0.7611548556430446,
|
|
"grad_norm": 31.978300094604492,
|
|
"learning_rate": 3.280840846665969e-07,
|
|
"loss": 0.4214,
|
|
"num_input_tokens_seen": 37434368,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"epoch": 0.7614749375840215,
|
|
"grad_norm": 29.54779052734375,
|
|
"learning_rate": 3.272569506797761e-07,
|
|
"loss": 0.3005,
|
|
"num_input_tokens_seen": 37449344,
|
|
"step": 11895
|
|
},
|
|
{
|
|
"epoch": 0.7617950195249984,
|
|
"grad_norm": 28.143238067626953,
|
|
"learning_rate": 3.2643065662055136e-07,
|
|
"loss": 0.3314,
|
|
"num_input_tokens_seen": 37464448,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"epoch": 0.7621151014659753,
|
|
"grad_norm": 69.54246520996094,
|
|
"learning_rate": 3.2560520352056033e-07,
|
|
"loss": 0.2837,
|
|
"num_input_tokens_seen": 37481856,
|
|
"step": 11905
|
|
},
|
|
{
|
|
"epoch": 0.7624351834069522,
|
|
"grad_norm": 18.128210067749023,
|
|
"learning_rate": 3.24780592410391e-07,
|
|
"loss": 0.3985,
|
|
"num_input_tokens_seen": 37497856,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"epoch": 0.762755265347929,
|
|
"grad_norm": 39.83074188232422,
|
|
"learning_rate": 3.2395682431957994e-07,
|
|
"loss": 0.4494,
|
|
"num_input_tokens_seen": 37513600,
|
|
"step": 11915
|
|
},
|
|
{
|
|
"epoch": 0.7630753472889059,
|
|
"grad_norm": 32.585750579833984,
|
|
"learning_rate": 3.231339002766115e-07,
|
|
"loss": 0.324,
|
|
"num_input_tokens_seen": 37529408,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"epoch": 0.7633954292298829,
|
|
"grad_norm": 30.76116371154785,
|
|
"learning_rate": 3.2231182130891564e-07,
|
|
"loss": 0.3296,
|
|
"num_input_tokens_seen": 37545984,
|
|
"step": 11925
|
|
},
|
|
{
|
|
"epoch": 0.7637155111708598,
|
|
"grad_norm": 59.110801696777344,
|
|
"learning_rate": 3.214905884428679e-07,
|
|
"loss": 0.3405,
|
|
"num_input_tokens_seen": 37561856,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"epoch": 0.7640355931118367,
|
|
"grad_norm": 29.65723991394043,
|
|
"learning_rate": 3.206702027037868e-07,
|
|
"loss": 0.3253,
|
|
"num_input_tokens_seen": 37578624,
|
|
"step": 11935
|
|
},
|
|
{
|
|
"epoch": 0.7643556750528135,
|
|
"grad_norm": 43.48826599121094,
|
|
"learning_rate": 3.198506651159344e-07,
|
|
"loss": 0.3882,
|
|
"num_input_tokens_seen": 37593920,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"epoch": 0.7646757569937904,
|
|
"grad_norm": 23.43718147277832,
|
|
"learning_rate": 3.190319767025121e-07,
|
|
"loss": 0.38,
|
|
"num_input_tokens_seen": 37609664,
|
|
"step": 11945
|
|
},
|
|
{
|
|
"epoch": 0.7649958389347673,
|
|
"grad_norm": 59.76777267456055,
|
|
"learning_rate": 3.1821413848566213e-07,
|
|
"loss": 0.4989,
|
|
"num_input_tokens_seen": 37626048,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"epoch": 0.7653159208757442,
|
|
"grad_norm": 17.83317756652832,
|
|
"learning_rate": 3.1739715148646564e-07,
|
|
"loss": 0.3798,
|
|
"num_input_tokens_seen": 37641792,
|
|
"step": 11955
|
|
},
|
|
{
|
|
"epoch": 0.7656360028167211,
|
|
"grad_norm": 51.09782409667969,
|
|
"learning_rate": 3.1658101672494043e-07,
|
|
"loss": 0.4583,
|
|
"num_input_tokens_seen": 37656512,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"epoch": 0.7659560847576979,
|
|
"grad_norm": 46.76288604736328,
|
|
"learning_rate": 3.157657352200397e-07,
|
|
"loss": 0.3527,
|
|
"num_input_tokens_seen": 37672000,
|
|
"step": 11965
|
|
},
|
|
{
|
|
"epoch": 0.7662761666986748,
|
|
"grad_norm": 41.273860931396484,
|
|
"learning_rate": 3.149513079896521e-07,
|
|
"loss": 0.3362,
|
|
"num_input_tokens_seen": 37687232,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"epoch": 0.7665962486396517,
|
|
"grad_norm": 19.319063186645508,
|
|
"learning_rate": 3.1413773605060034e-07,
|
|
"loss": 0.3244,
|
|
"num_input_tokens_seen": 37702656,
|
|
"step": 11975
|
|
},
|
|
{
|
|
"epoch": 0.7669163305806287,
|
|
"grad_norm": 65.8237533569336,
|
|
"learning_rate": 3.1332502041863783e-07,
|
|
"loss": 0.4343,
|
|
"num_input_tokens_seen": 37718080,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"epoch": 0.7672364125216056,
|
|
"grad_norm": 29.25933837890625,
|
|
"learning_rate": 3.1251316210844946e-07,
|
|
"loss": 0.3141,
|
|
"num_input_tokens_seen": 37735680,
|
|
"step": 11985
|
|
},
|
|
{
|
|
"epoch": 0.7675564944625825,
|
|
"grad_norm": 51.57158660888672,
|
|
"learning_rate": 3.1170216213365055e-07,
|
|
"loss": 0.2871,
|
|
"num_input_tokens_seen": 37749952,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"epoch": 0.7678765764035593,
|
|
"grad_norm": 34.71276092529297,
|
|
"learning_rate": 3.1089202150678397e-07,
|
|
"loss": 0.4607,
|
|
"num_input_tokens_seen": 37765312,
|
|
"step": 11995
|
|
},
|
|
{
|
|
"epoch": 0.7681966583445362,
|
|
"grad_norm": 49.59117126464844,
|
|
"learning_rate": 3.1008274123931886e-07,
|
|
"loss": 0.4695,
|
|
"num_input_tokens_seen": 37780160,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 0.7685167402855131,
|
|
"grad_norm": 25.49561309814453,
|
|
"learning_rate": 3.092743223416523e-07,
|
|
"loss": 0.2672,
|
|
"num_input_tokens_seen": 37796352,
|
|
"step": 12005
|
|
},
|
|
{
|
|
"epoch": 0.76883682222649,
|
|
"grad_norm": 59.26298522949219,
|
|
"learning_rate": 3.0846676582310413e-07,
|
|
"loss": 0.3499,
|
|
"num_input_tokens_seen": 37812864,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"epoch": 0.7691569041674668,
|
|
"grad_norm": 43.80664825439453,
|
|
"learning_rate": 3.076600726919185e-07,
|
|
"loss": 0.3824,
|
|
"num_input_tokens_seen": 37827840,
|
|
"step": 12015
|
|
},
|
|
{
|
|
"epoch": 0.7694769861084437,
|
|
"grad_norm": 31.392080307006836,
|
|
"learning_rate": 3.0685424395526106e-07,
|
|
"loss": 0.3579,
|
|
"num_input_tokens_seen": 37847040,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"epoch": 0.7697970680494206,
|
|
"grad_norm": 37.12458419799805,
|
|
"learning_rate": 3.060492806192184e-07,
|
|
"loss": 0.2819,
|
|
"num_input_tokens_seen": 37862464,
|
|
"step": 12025
|
|
},
|
|
{
|
|
"epoch": 0.7701171499903975,
|
|
"grad_norm": 36.16139221191406,
|
|
"learning_rate": 3.052451836887968e-07,
|
|
"loss": 0.377,
|
|
"num_input_tokens_seen": 37877760,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"epoch": 0.7704372319313745,
|
|
"grad_norm": 28.426408767700195,
|
|
"learning_rate": 3.044419541679207e-07,
|
|
"loss": 0.2861,
|
|
"num_input_tokens_seen": 37892800,
|
|
"step": 12035
|
|
},
|
|
{
|
|
"epoch": 0.7707573138723514,
|
|
"grad_norm": 62.23591232299805,
|
|
"learning_rate": 3.0363959305943153e-07,
|
|
"loss": 0.4239,
|
|
"num_input_tokens_seen": 37909056,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"epoch": 0.7710773958133282,
|
|
"grad_norm": 42.053489685058594,
|
|
"learning_rate": 3.028381013650867e-07,
|
|
"loss": 0.348,
|
|
"num_input_tokens_seen": 37925376,
|
|
"step": 12045
|
|
},
|
|
{
|
|
"epoch": 0.7713974777543051,
|
|
"grad_norm": 37.59280014038086,
|
|
"learning_rate": 3.0203748008555783e-07,
|
|
"loss": 0.3716,
|
|
"num_input_tokens_seen": 37941632,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"epoch": 0.771717559695282,
|
|
"grad_norm": 41.64907455444336,
|
|
"learning_rate": 3.012377302204301e-07,
|
|
"loss": 0.3805,
|
|
"num_input_tokens_seen": 37957056,
|
|
"step": 12055
|
|
},
|
|
{
|
|
"epoch": 0.7720376416362589,
|
|
"grad_norm": 46.065406799316406,
|
|
"learning_rate": 3.0043885276820046e-07,
|
|
"loss": 0.3916,
|
|
"num_input_tokens_seen": 37973184,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"epoch": 0.7723577235772358,
|
|
"grad_norm": 24.32598304748535,
|
|
"learning_rate": 2.99640848726277e-07,
|
|
"loss": 0.3087,
|
|
"num_input_tokens_seen": 37988288,
|
|
"step": 12065
|
|
},
|
|
{
|
|
"epoch": 0.7726778055182126,
|
|
"grad_norm": 23.855104446411133,
|
|
"learning_rate": 2.9884371909097704e-07,
|
|
"loss": 0.3812,
|
|
"num_input_tokens_seen": 38004224,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"epoch": 0.7729978874591895,
|
|
"grad_norm": 22.65608787536621,
|
|
"learning_rate": 2.9804746485752616e-07,
|
|
"loss": 0.3711,
|
|
"num_input_tokens_seen": 38019456,
|
|
"step": 12075
|
|
},
|
|
{
|
|
"epoch": 0.7733179694001664,
|
|
"grad_norm": 25.479469299316406,
|
|
"learning_rate": 2.972520870200573e-07,
|
|
"loss": 0.4058,
|
|
"num_input_tokens_seen": 38035264,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"epoch": 0.7736380513411434,
|
|
"grad_norm": 31.957597732543945,
|
|
"learning_rate": 2.9645758657160904e-07,
|
|
"loss": 0.4045,
|
|
"num_input_tokens_seen": 38051072,
|
|
"step": 12085
|
|
},
|
|
{
|
|
"epoch": 0.7739581332821203,
|
|
"grad_norm": 16.966663360595703,
|
|
"learning_rate": 2.9566396450412444e-07,
|
|
"loss": 0.3538,
|
|
"num_input_tokens_seen": 38066688,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"epoch": 0.7742782152230971,
|
|
"grad_norm": 22.1097354888916,
|
|
"learning_rate": 2.9487122180844957e-07,
|
|
"loss": 0.3193,
|
|
"num_input_tokens_seen": 38082048,
|
|
"step": 12095
|
|
},
|
|
{
|
|
"epoch": 0.774598297164074,
|
|
"grad_norm": 58.488800048828125,
|
|
"learning_rate": 2.9407935947433406e-07,
|
|
"loss": 0.2996,
|
|
"num_input_tokens_seen": 38097344,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"epoch": 0.7749183791050509,
|
|
"grad_norm": 48.314144134521484,
|
|
"learning_rate": 2.932883784904264e-07,
|
|
"loss": 0.446,
|
|
"num_input_tokens_seen": 38112320,
|
|
"step": 12105
|
|
},
|
|
{
|
|
"epoch": 0.7752384610460278,
|
|
"grad_norm": 19.5347843170166,
|
|
"learning_rate": 2.9249827984427555e-07,
|
|
"loss": 0.2475,
|
|
"num_input_tokens_seen": 38128000,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"epoch": 0.7755585429870047,
|
|
"grad_norm": 29.27086639404297,
|
|
"learning_rate": 2.917090645223297e-07,
|
|
"loss": 0.3015,
|
|
"num_input_tokens_seen": 38143168,
|
|
"step": 12115
|
|
},
|
|
{
|
|
"epoch": 0.7758786249279815,
|
|
"grad_norm": 27.007768630981445,
|
|
"learning_rate": 2.909207335099332e-07,
|
|
"loss": 0.2912,
|
|
"num_input_tokens_seen": 38157824,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"epoch": 0.7761987068689584,
|
|
"grad_norm": 31.75836181640625,
|
|
"learning_rate": 2.9013328779132595e-07,
|
|
"loss": 0.3332,
|
|
"num_input_tokens_seen": 38172864,
|
|
"step": 12125
|
|
},
|
|
{
|
|
"epoch": 0.7765187888099353,
|
|
"grad_norm": 115.25257110595703,
|
|
"learning_rate": 2.893467283496439e-07,
|
|
"loss": 0.425,
|
|
"num_input_tokens_seen": 38187264,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"epoch": 0.7768388707509122,
|
|
"grad_norm": 19.299240112304688,
|
|
"learning_rate": 2.885610561669155e-07,
|
|
"loss": 0.3551,
|
|
"num_input_tokens_seen": 38204288,
|
|
"step": 12135
|
|
},
|
|
{
|
|
"epoch": 0.7771589526918892,
|
|
"grad_norm": 28.34507179260254,
|
|
"learning_rate": 2.8777627222406163e-07,
|
|
"loss": 0.3462,
|
|
"num_input_tokens_seen": 38219264,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"epoch": 0.777479034632866,
|
|
"grad_norm": 40.6217041015625,
|
|
"learning_rate": 2.869923775008943e-07,
|
|
"loss": 0.3863,
|
|
"num_input_tokens_seen": 38234496,
|
|
"step": 12145
|
|
},
|
|
{
|
|
"epoch": 0.7777991165738429,
|
|
"grad_norm": 37.57053756713867,
|
|
"learning_rate": 2.862093729761155e-07,
|
|
"loss": 0.2559,
|
|
"num_input_tokens_seen": 38251072,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"epoch": 0.7781191985148198,
|
|
"grad_norm": 28.284217834472656,
|
|
"learning_rate": 2.854272596273152e-07,
|
|
"loss": 0.4049,
|
|
"num_input_tokens_seen": 38266560,
|
|
"step": 12155
|
|
},
|
|
{
|
|
"epoch": 0.7784392804557967,
|
|
"grad_norm": 43.39320373535156,
|
|
"learning_rate": 2.8464603843097134e-07,
|
|
"loss": 0.3287,
|
|
"num_input_tokens_seen": 38282944,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"epoch": 0.7787593623967736,
|
|
"grad_norm": 32.42449951171875,
|
|
"learning_rate": 2.8386571036244764e-07,
|
|
"loss": 0.3291,
|
|
"num_input_tokens_seen": 38299264,
|
|
"step": 12165
|
|
},
|
|
{
|
|
"epoch": 0.7790794443377504,
|
|
"grad_norm": 51.791812896728516,
|
|
"learning_rate": 2.830862763959929e-07,
|
|
"loss": 0.39,
|
|
"num_input_tokens_seen": 38314368,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"epoch": 0.7793995262787273,
|
|
"grad_norm": 10.4609956741333,
|
|
"learning_rate": 2.8230773750473956e-07,
|
|
"loss": 0.3154,
|
|
"num_input_tokens_seen": 38329664,
|
|
"step": 12175
|
|
},
|
|
{
|
|
"epoch": 0.7797196082197042,
|
|
"grad_norm": 27.046852111816406,
|
|
"learning_rate": 2.8153009466070267e-07,
|
|
"loss": 0.3072,
|
|
"num_input_tokens_seen": 38345408,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"epoch": 0.7800396901606811,
|
|
"grad_norm": 32.581607818603516,
|
|
"learning_rate": 2.807533488347783e-07,
|
|
"loss": 0.2878,
|
|
"num_input_tokens_seen": 38362688,
|
|
"step": 12185
|
|
},
|
|
{
|
|
"epoch": 0.7803597721016581,
|
|
"grad_norm": 23.63336944580078,
|
|
"learning_rate": 2.7997750099674277e-07,
|
|
"loss": 0.2548,
|
|
"num_input_tokens_seen": 38377600,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"epoch": 0.780679854042635,
|
|
"grad_norm": 36.57121276855469,
|
|
"learning_rate": 2.792025521152512e-07,
|
|
"loss": 0.5286,
|
|
"num_input_tokens_seen": 38392640,
|
|
"step": 12195
|
|
},
|
|
{
|
|
"epoch": 0.7809999359836118,
|
|
"grad_norm": 34.91606521606445,
|
|
"learning_rate": 2.784285031578365e-07,
|
|
"loss": 0.4496,
|
|
"num_input_tokens_seen": 38408448,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"epoch": 0.7813200179245887,
|
|
"grad_norm": 26.795875549316406,
|
|
"learning_rate": 2.7765535509090786e-07,
|
|
"loss": 0.3629,
|
|
"num_input_tokens_seen": 38424512,
|
|
"step": 12205
|
|
},
|
|
{
|
|
"epoch": 0.7816400998655656,
|
|
"grad_norm": 29.603397369384766,
|
|
"learning_rate": 2.768831088797495e-07,
|
|
"loss": 0.4739,
|
|
"num_input_tokens_seen": 38439296,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"epoch": 0.7819601818065425,
|
|
"grad_norm": 15.58344554901123,
|
|
"learning_rate": 2.761117654885201e-07,
|
|
"loss": 0.2482,
|
|
"num_input_tokens_seen": 38455424,
|
|
"step": 12215
|
|
},
|
|
{
|
|
"epoch": 0.7822802637475194,
|
|
"grad_norm": 24.491289138793945,
|
|
"learning_rate": 2.7534132588025063e-07,
|
|
"loss": 0.3265,
|
|
"num_input_tokens_seen": 38470976,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"epoch": 0.7826003456884962,
|
|
"grad_norm": 27.425262451171875,
|
|
"learning_rate": 2.7457179101684483e-07,
|
|
"loss": 0.5075,
|
|
"num_input_tokens_seen": 38486016,
|
|
"step": 12225
|
|
},
|
|
{
|
|
"epoch": 0.7829204276294731,
|
|
"grad_norm": 22.376157760620117,
|
|
"learning_rate": 2.7380316185907506e-07,
|
|
"loss": 0.298,
|
|
"num_input_tokens_seen": 38501248,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"epoch": 0.78324050957045,
|
|
"grad_norm": 19.046939849853516,
|
|
"learning_rate": 2.730354393665839e-07,
|
|
"loss": 0.3503,
|
|
"num_input_tokens_seen": 38516992,
|
|
"step": 12235
|
|
},
|
|
{
|
|
"epoch": 0.7835605915114269,
|
|
"grad_norm": 27.88618278503418,
|
|
"learning_rate": 2.7226862449788245e-07,
|
|
"loss": 0.3702,
|
|
"num_input_tokens_seen": 38531456,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"epoch": 0.7838806734524039,
|
|
"grad_norm": 34.346378326416016,
|
|
"learning_rate": 2.715027182103482e-07,
|
|
"loss": 0.3264,
|
|
"num_input_tokens_seen": 38546880,
|
|
"step": 12245
|
|
},
|
|
{
|
|
"epoch": 0.7842007553933807,
|
|
"grad_norm": 20.54593276977539,
|
|
"learning_rate": 2.707377214602232e-07,
|
|
"loss": 0.3039,
|
|
"num_input_tokens_seen": 38562176,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"epoch": 0.7845208373343576,
|
|
"grad_norm": 37.601043701171875,
|
|
"learning_rate": 2.699736352026157e-07,
|
|
"loss": 0.4366,
|
|
"num_input_tokens_seen": 38577472,
|
|
"step": 12255
|
|
},
|
|
{
|
|
"epoch": 0.7848409192753345,
|
|
"grad_norm": 22.17053985595703,
|
|
"learning_rate": 2.6921046039149645e-07,
|
|
"loss": 0.3297,
|
|
"num_input_tokens_seen": 38593088,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"epoch": 0.7851610012163114,
|
|
"grad_norm": 31.56439208984375,
|
|
"learning_rate": 2.6844819797969744e-07,
|
|
"loss": 0.3408,
|
|
"num_input_tokens_seen": 38607936,
|
|
"step": 12265
|
|
},
|
|
{
|
|
"epoch": 0.7854810831572883,
|
|
"grad_norm": 40.473628997802734,
|
|
"learning_rate": 2.6768684891891236e-07,
|
|
"loss": 0.2481,
|
|
"num_input_tokens_seen": 38625024,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"epoch": 0.7858011650982651,
|
|
"grad_norm": 30.89264678955078,
|
|
"learning_rate": 2.6692641415969497e-07,
|
|
"loss": 0.3321,
|
|
"num_input_tokens_seen": 38641792,
|
|
"step": 12275
|
|
},
|
|
{
|
|
"epoch": 0.786121247039242,
|
|
"grad_norm": 47.64722442626953,
|
|
"learning_rate": 2.66166894651457e-07,
|
|
"loss": 0.395,
|
|
"num_input_tokens_seen": 38656896,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"epoch": 0.7864413289802189,
|
|
"grad_norm": 43.44092559814453,
|
|
"learning_rate": 2.654082913424668e-07,
|
|
"loss": 0.3426,
|
|
"num_input_tokens_seen": 38672448,
|
|
"step": 12285
|
|
},
|
|
{
|
|
"epoch": 0.7867614109211958,
|
|
"grad_norm": 27.422563552856445,
|
|
"learning_rate": 2.6465060517985003e-07,
|
|
"loss": 0.3016,
|
|
"num_input_tokens_seen": 38688576,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"epoch": 0.7870814928621728,
|
|
"grad_norm": 44.733848571777344,
|
|
"learning_rate": 2.638938371095867e-07,
|
|
"loss": 0.5123,
|
|
"num_input_tokens_seen": 38704064,
|
|
"step": 12295
|
|
},
|
|
{
|
|
"epoch": 0.7874015748031497,
|
|
"grad_norm": 20.204547882080078,
|
|
"learning_rate": 2.6313798807651065e-07,
|
|
"loss": 0.381,
|
|
"num_input_tokens_seen": 38718976,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"epoch": 0.7877216567441265,
|
|
"grad_norm": 17.738218307495117,
|
|
"learning_rate": 2.6238305902430813e-07,
|
|
"loss": 0.3529,
|
|
"num_input_tokens_seen": 38734272,
|
|
"step": 12305
|
|
},
|
|
{
|
|
"epoch": 0.7880417386851034,
|
|
"grad_norm": 14.163119316101074,
|
|
"learning_rate": 2.61629050895517e-07,
|
|
"loss": 0.307,
|
|
"num_input_tokens_seen": 38749504,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"epoch": 0.7883618206260803,
|
|
"grad_norm": 27.0414981842041,
|
|
"learning_rate": 2.608759646315253e-07,
|
|
"loss": 0.3171,
|
|
"num_input_tokens_seen": 38764352,
|
|
"step": 12315
|
|
},
|
|
{
|
|
"epoch": 0.7886819025670572,
|
|
"grad_norm": 18.21839714050293,
|
|
"learning_rate": 2.6012380117257005e-07,
|
|
"loss": 0.3637,
|
|
"num_input_tokens_seen": 38780096,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"epoch": 0.789001984508034,
|
|
"grad_norm": 33.14684295654297,
|
|
"learning_rate": 2.5937256145773613e-07,
|
|
"loss": 0.3902,
|
|
"num_input_tokens_seen": 38795712,
|
|
"step": 12325
|
|
},
|
|
{
|
|
"epoch": 0.7893220664490109,
|
|
"grad_norm": 39.35667037963867,
|
|
"learning_rate": 2.586222464249551e-07,
|
|
"loss": 0.3264,
|
|
"num_input_tokens_seen": 38811328,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"epoch": 0.7896421483899878,
|
|
"grad_norm": 27.116695404052734,
|
|
"learning_rate": 2.5787285701100413e-07,
|
|
"loss": 0.2022,
|
|
"num_input_tokens_seen": 38826240,
|
|
"step": 12335
|
|
},
|
|
{
|
|
"epoch": 0.7899622303309647,
|
|
"grad_norm": 37.62165832519531,
|
|
"learning_rate": 2.571243941515048e-07,
|
|
"loss": 0.3672,
|
|
"num_input_tokens_seen": 38842624,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"epoch": 0.7902823122719416,
|
|
"grad_norm": 22.701847076416016,
|
|
"learning_rate": 2.563768587809213e-07,
|
|
"loss": 0.2672,
|
|
"num_input_tokens_seen": 38857472,
|
|
"step": 12345
|
|
},
|
|
{
|
|
"epoch": 0.7906023942129186,
|
|
"grad_norm": 60.98664855957031,
|
|
"learning_rate": 2.5563025183256137e-07,
|
|
"loss": 0.4118,
|
|
"num_input_tokens_seen": 38872256,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"epoch": 0.7909224761538954,
|
|
"grad_norm": 38.52484893798828,
|
|
"learning_rate": 2.548845742385717e-07,
|
|
"loss": 0.552,
|
|
"num_input_tokens_seen": 38890048,
|
|
"step": 12355
|
|
},
|
|
{
|
|
"epoch": 0.7912425580948723,
|
|
"grad_norm": 38.10274887084961,
|
|
"learning_rate": 2.541398269299393e-07,
|
|
"loss": 0.2356,
|
|
"num_input_tokens_seen": 38905664,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"epoch": 0.7915626400358492,
|
|
"grad_norm": 12.663208961486816,
|
|
"learning_rate": 2.5339601083649063e-07,
|
|
"loss": 0.2978,
|
|
"num_input_tokens_seen": 38926144,
|
|
"step": 12365
|
|
},
|
|
{
|
|
"epoch": 0.7918827219768261,
|
|
"grad_norm": 34.63762283325195,
|
|
"learning_rate": 2.526531268868889e-07,
|
|
"loss": 0.4751,
|
|
"num_input_tokens_seen": 38942720,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"epoch": 0.792202803917803,
|
|
"grad_norm": 25.847164154052734,
|
|
"learning_rate": 2.5191117600863266e-07,
|
|
"loss": 0.3397,
|
|
"num_input_tokens_seen": 38958144,
|
|
"step": 12375
|
|
},
|
|
{
|
|
"epoch": 0.7925228858587798,
|
|
"grad_norm": 20.030961990356445,
|
|
"learning_rate": 2.511701591280565e-07,
|
|
"loss": 0.2568,
|
|
"num_input_tokens_seen": 38973376,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"epoch": 0.7928429677997567,
|
|
"grad_norm": 37.385189056396484,
|
|
"learning_rate": 2.504300771703295e-07,
|
|
"loss": 0.346,
|
|
"num_input_tokens_seen": 38989504,
|
|
"step": 12385
|
|
},
|
|
{
|
|
"epoch": 0.7931630497407336,
|
|
"grad_norm": 72.55767822265625,
|
|
"learning_rate": 2.496909310594517e-07,
|
|
"loss": 0.3626,
|
|
"num_input_tokens_seen": 39005056,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"epoch": 0.7934831316817105,
|
|
"grad_norm": 40.421688079833984,
|
|
"learning_rate": 2.4895272171825587e-07,
|
|
"loss": 0.4459,
|
|
"num_input_tokens_seen": 39020608,
|
|
"step": 12395
|
|
},
|
|
{
|
|
"epoch": 0.7938032136226874,
|
|
"grad_norm": 32.116249084472656,
|
|
"learning_rate": 2.482154500684055e-07,
|
|
"loss": 0.443,
|
|
"num_input_tokens_seen": 39035712,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"epoch": 0.7941232955636643,
|
|
"grad_norm": 25.23982048034668,
|
|
"learning_rate": 2.4747911703039293e-07,
|
|
"loss": 0.3361,
|
|
"num_input_tokens_seen": 39050880,
|
|
"step": 12405
|
|
},
|
|
{
|
|
"epoch": 0.7944433775046412,
|
|
"grad_norm": 35.13556671142578,
|
|
"learning_rate": 2.467437235235378e-07,
|
|
"loss": 0.3689,
|
|
"num_input_tokens_seen": 39065792,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"epoch": 0.7947634594456181,
|
|
"grad_norm": 31.368885040283203,
|
|
"learning_rate": 2.460092704659883e-07,
|
|
"loss": 0.3418,
|
|
"num_input_tokens_seen": 39080960,
|
|
"step": 12415
|
|
},
|
|
{
|
|
"epoch": 0.795083541386595,
|
|
"grad_norm": 16.847009658813477,
|
|
"learning_rate": 2.452757587747174e-07,
|
|
"loss": 0.2604,
|
|
"num_input_tokens_seen": 39097216,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"epoch": 0.7954036233275719,
|
|
"grad_norm": 23.280132293701172,
|
|
"learning_rate": 2.445431893655232e-07,
|
|
"loss": 0.1771,
|
|
"num_input_tokens_seen": 39113152,
|
|
"step": 12425
|
|
},
|
|
{
|
|
"epoch": 0.7957237052685487,
|
|
"grad_norm": 35.485782623291016,
|
|
"learning_rate": 2.438115631530271e-07,
|
|
"loss": 0.3722,
|
|
"num_input_tokens_seen": 39130176,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"epoch": 0.7960437872095256,
|
|
"grad_norm": 28.096521377563477,
|
|
"learning_rate": 2.4308088105067305e-07,
|
|
"loss": 0.2283,
|
|
"num_input_tokens_seen": 39145792,
|
|
"step": 12435
|
|
},
|
|
{
|
|
"epoch": 0.7963638691505025,
|
|
"grad_norm": 67.06790924072266,
|
|
"learning_rate": 2.423511439707262e-07,
|
|
"loss": 0.4201,
|
|
"num_input_tokens_seen": 39161280,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"epoch": 0.7966839510914794,
|
|
"grad_norm": 23.704147338867188,
|
|
"learning_rate": 2.4162235282427177e-07,
|
|
"loss": 0.2784,
|
|
"num_input_tokens_seen": 39176512,
|
|
"step": 12445
|
|
},
|
|
{
|
|
"epoch": 0.7970040330324563,
|
|
"grad_norm": 42.61015319824219,
|
|
"learning_rate": 2.408945085212144e-07,
|
|
"loss": 0.3621,
|
|
"num_input_tokens_seen": 39191808,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"epoch": 0.7973241149734333,
|
|
"grad_norm": 33.03046417236328,
|
|
"learning_rate": 2.401676119702759e-07,
|
|
"loss": 0.2479,
|
|
"num_input_tokens_seen": 39208640,
|
|
"step": 12455
|
|
},
|
|
{
|
|
"epoch": 0.7976441969144101,
|
|
"grad_norm": 19.37267303466797,
|
|
"learning_rate": 2.394416640789952e-07,
|
|
"loss": 0.3438,
|
|
"num_input_tokens_seen": 39223232,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"epoch": 0.797964278855387,
|
|
"grad_norm": 40.43623352050781,
|
|
"learning_rate": 2.3871666575372696e-07,
|
|
"loss": 0.3098,
|
|
"num_input_tokens_seen": 39238656,
|
|
"step": 12465
|
|
},
|
|
{
|
|
"epoch": 0.7982843607963639,
|
|
"grad_norm": 54.1468505859375,
|
|
"learning_rate": 2.3799261789963964e-07,
|
|
"loss": 0.532,
|
|
"num_input_tokens_seen": 39255872,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"epoch": 0.7986044427373408,
|
|
"grad_norm": 21.15880584716797,
|
|
"learning_rate": 2.3726952142071644e-07,
|
|
"loss": 0.2708,
|
|
"num_input_tokens_seen": 39270784,
|
|
"step": 12475
|
|
},
|
|
{
|
|
"epoch": 0.7989245246783176,
|
|
"grad_norm": 41.602508544921875,
|
|
"learning_rate": 2.365473772197508e-07,
|
|
"loss": 0.3462,
|
|
"num_input_tokens_seen": 39286080,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"epoch": 0.7992446066192945,
|
|
"grad_norm": 33.63953399658203,
|
|
"learning_rate": 2.3582618619834883e-07,
|
|
"loss": 0.356,
|
|
"num_input_tokens_seen": 39301312,
|
|
"step": 12485
|
|
},
|
|
{
|
|
"epoch": 0.7995646885602714,
|
|
"grad_norm": 16.34864616394043,
|
|
"learning_rate": 2.3510594925692528e-07,
|
|
"loss": 0.2216,
|
|
"num_input_tokens_seen": 39316736,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"epoch": 0.7998847705012483,
|
|
"grad_norm": 28.48493194580078,
|
|
"learning_rate": 2.343866672947057e-07,
|
|
"loss": 0.3493,
|
|
"num_input_tokens_seen": 39331264,
|
|
"step": 12495
|
|
},
|
|
{
|
|
"epoch": 0.8002048524422252,
|
|
"grad_norm": 26.142616271972656,
|
|
"learning_rate": 2.336683412097209e-07,
|
|
"loss": 0.2587,
|
|
"num_input_tokens_seen": 39345856,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 0.800524934383202,
|
|
"grad_norm": 23.281526565551758,
|
|
"learning_rate": 2.329509718988095e-07,
|
|
"loss": 0.3645,
|
|
"num_input_tokens_seen": 39361280,
|
|
"step": 12505
|
|
},
|
|
{
|
|
"epoch": 0.800845016324179,
|
|
"grad_norm": 34.25197982788086,
|
|
"learning_rate": 2.3223456025761645e-07,
|
|
"loss": 0.3367,
|
|
"num_input_tokens_seen": 39375872,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"epoch": 0.8009730491005698,
|
|
"eval_loss": 0.36358681321144104,
|
|
"eval_runtime": 49.1621,
|
|
"eval_samples_per_second": 282.453,
|
|
"eval_steps_per_second": 35.312,
|
|
"num_input_tokens_seen": 39382144,
|
|
"step": 12512
|
|
},
|
|
{
|
|
"epoch": 0.8011650982651559,
|
|
"grad_norm": 20.024723052978516,
|
|
"learning_rate": 2.315191071805892e-07,
|
|
"loss": 0.2866,
|
|
"num_input_tokens_seen": 39392320,
|
|
"step": 12515
|
|
},
|
|
{
|
|
"epoch": 0.8014851802061328,
|
|
"grad_norm": 63.86294937133789,
|
|
"learning_rate": 2.3080461356097937e-07,
|
|
"loss": 0.3619,
|
|
"num_input_tokens_seen": 39407680,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"epoch": 0.8018052621471097,
|
|
"grad_norm": 18.46623992919922,
|
|
"learning_rate": 2.30091080290841e-07,
|
|
"loss": 0.288,
|
|
"num_input_tokens_seen": 39424512,
|
|
"step": 12525
|
|
},
|
|
{
|
|
"epoch": 0.8021253440880866,
|
|
"grad_norm": 45.297523498535156,
|
|
"learning_rate": 2.29378508261029e-07,
|
|
"loss": 0.3463,
|
|
"num_input_tokens_seen": 39439296,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"epoch": 0.8024454260290634,
|
|
"grad_norm": 53.35750198364258,
|
|
"learning_rate": 2.2866689836119702e-07,
|
|
"loss": 0.3707,
|
|
"num_input_tokens_seen": 39456576,
|
|
"step": 12535
|
|
},
|
|
{
|
|
"epoch": 0.8027655079700403,
|
|
"grad_norm": 62.54146957397461,
|
|
"learning_rate": 2.2795625147979913e-07,
|
|
"loss": 0.3536,
|
|
"num_input_tokens_seen": 39472512,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"epoch": 0.8030855899110172,
|
|
"grad_norm": 22.177854537963867,
|
|
"learning_rate": 2.2724656850408597e-07,
|
|
"loss": 0.2332,
|
|
"num_input_tokens_seen": 39488192,
|
|
"step": 12545
|
|
},
|
|
{
|
|
"epoch": 0.8034056718519941,
|
|
"grad_norm": 42.50724411010742,
|
|
"learning_rate": 2.2653785032010532e-07,
|
|
"loss": 0.3855,
|
|
"num_input_tokens_seen": 39503552,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"epoch": 0.803725753792971,
|
|
"grad_norm": 38.946964263916016,
|
|
"learning_rate": 2.258300978126999e-07,
|
|
"loss": 0.3363,
|
|
"num_input_tokens_seen": 39519744,
|
|
"step": 12555
|
|
},
|
|
{
|
|
"epoch": 0.804045835733948,
|
|
"grad_norm": 22.364994049072266,
|
|
"learning_rate": 2.2512331186550715e-07,
|
|
"loss": 0.4753,
|
|
"num_input_tokens_seen": 39535232,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"epoch": 0.8043659176749248,
|
|
"grad_norm": 44.744346618652344,
|
|
"learning_rate": 2.244174933609575e-07,
|
|
"loss": 0.3878,
|
|
"num_input_tokens_seen": 39549568,
|
|
"step": 12565
|
|
},
|
|
{
|
|
"epoch": 0.8046859996159017,
|
|
"grad_norm": 27.26950454711914,
|
|
"learning_rate": 2.2371264318027383e-07,
|
|
"loss": 0.2764,
|
|
"num_input_tokens_seen": 39566016,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"epoch": 0.8050060815568786,
|
|
"grad_norm": 31.31670570373535,
|
|
"learning_rate": 2.2300876220346975e-07,
|
|
"loss": 0.2308,
|
|
"num_input_tokens_seen": 39581760,
|
|
"step": 12575
|
|
},
|
|
{
|
|
"epoch": 0.8053261634978555,
|
|
"grad_norm": 39.95564651489258,
|
|
"learning_rate": 2.2230585130934897e-07,
|
|
"loss": 0.2785,
|
|
"num_input_tokens_seen": 39597888,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"epoch": 0.8056462454388323,
|
|
"grad_norm": 23.922866821289062,
|
|
"learning_rate": 2.2160391137550394e-07,
|
|
"loss": 0.4454,
|
|
"num_input_tokens_seen": 39613568,
|
|
"step": 12585
|
|
},
|
|
{
|
|
"epoch": 0.8059663273798092,
|
|
"grad_norm": 60.24818420410156,
|
|
"learning_rate": 2.2090294327831494e-07,
|
|
"loss": 0.4314,
|
|
"num_input_tokens_seen": 39628096,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"epoch": 0.8062864093207861,
|
|
"grad_norm": 40.70429992675781,
|
|
"learning_rate": 2.202029478929488e-07,
|
|
"loss": 0.2695,
|
|
"num_input_tokens_seen": 39642560,
|
|
"step": 12595
|
|
},
|
|
{
|
|
"epoch": 0.806606491261763,
|
|
"grad_norm": 24.328882217407227,
|
|
"learning_rate": 2.195039260933581e-07,
|
|
"loss": 0.2967,
|
|
"num_input_tokens_seen": 39658112,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"epoch": 0.8069265732027399,
|
|
"grad_norm": 33.61399841308594,
|
|
"learning_rate": 2.1880587875227973e-07,
|
|
"loss": 0.2657,
|
|
"num_input_tokens_seen": 39674112,
|
|
"step": 12605
|
|
},
|
|
{
|
|
"epoch": 0.8072466551437167,
|
|
"grad_norm": 27.520858764648438,
|
|
"learning_rate": 2.18108806741234e-07,
|
|
"loss": 0.3313,
|
|
"num_input_tokens_seen": 39690432,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"epoch": 0.8075667370846937,
|
|
"grad_norm": 21.497695922851562,
|
|
"learning_rate": 2.1741271093052315e-07,
|
|
"loss": 0.3512,
|
|
"num_input_tokens_seen": 39705792,
|
|
"step": 12615
|
|
},
|
|
{
|
|
"epoch": 0.8078868190256706,
|
|
"grad_norm": 50.78917694091797,
|
|
"learning_rate": 2.167175921892318e-07,
|
|
"loss": 0.4692,
|
|
"num_input_tokens_seen": 39722048,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"epoch": 0.8082069009666475,
|
|
"grad_norm": 26.748119354248047,
|
|
"learning_rate": 2.1602345138522314e-07,
|
|
"loss": 0.4239,
|
|
"num_input_tokens_seen": 39738304,
|
|
"step": 12625
|
|
},
|
|
{
|
|
"epoch": 0.8085269829076244,
|
|
"grad_norm": 31.953128814697266,
|
|
"learning_rate": 2.1533028938514008e-07,
|
|
"loss": 0.3468,
|
|
"num_input_tokens_seen": 39753728,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"epoch": 0.8088470648486012,
|
|
"grad_norm": 41.40265655517578,
|
|
"learning_rate": 2.1463810705440433e-07,
|
|
"loss": 0.3435,
|
|
"num_input_tokens_seen": 39769600,
|
|
"step": 12635
|
|
},
|
|
{
|
|
"epoch": 0.8091671467895781,
|
|
"grad_norm": 33.059566497802734,
|
|
"learning_rate": 2.139469052572127e-07,
|
|
"loss": 0.3519,
|
|
"num_input_tokens_seen": 39784000,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"epoch": 0.809487228730555,
|
|
"grad_norm": 46.353363037109375,
|
|
"learning_rate": 2.1325668485653891e-07,
|
|
"loss": 0.344,
|
|
"num_input_tokens_seen": 39800320,
|
|
"step": 12645
|
|
},
|
|
{
|
|
"epoch": 0.8098073106715319,
|
|
"grad_norm": 27.811872482299805,
|
|
"learning_rate": 2.1256744671413173e-07,
|
|
"loss": 0.457,
|
|
"num_input_tokens_seen": 39815360,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"epoch": 0.8101273926125088,
|
|
"grad_norm": 31.568683624267578,
|
|
"learning_rate": 2.1187919169051316e-07,
|
|
"loss": 0.3821,
|
|
"num_input_tokens_seen": 39829952,
|
|
"step": 12655
|
|
},
|
|
{
|
|
"epoch": 0.8104474745534856,
|
|
"grad_norm": 33.802940368652344,
|
|
"learning_rate": 2.111919206449767e-07,
|
|
"loss": 0.3528,
|
|
"num_input_tokens_seen": 39845376,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"epoch": 0.8107675564944626,
|
|
"grad_norm": 27.218812942504883,
|
|
"learning_rate": 2.1050563443558922e-07,
|
|
"loss": 0.4858,
|
|
"num_input_tokens_seen": 39861696,
|
|
"step": 12665
|
|
},
|
|
{
|
|
"epoch": 0.8110876384354395,
|
|
"grad_norm": 37.33356475830078,
|
|
"learning_rate": 2.0982033391918697e-07,
|
|
"loss": 0.297,
|
|
"num_input_tokens_seen": 39877440,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"epoch": 0.8114077203764164,
|
|
"grad_norm": 58.22770309448242,
|
|
"learning_rate": 2.0913601995137543e-07,
|
|
"loss": 0.334,
|
|
"num_input_tokens_seen": 39893760,
|
|
"step": 12675
|
|
},
|
|
{
|
|
"epoch": 0.8117278023173933,
|
|
"grad_norm": 15.805877685546875,
|
|
"learning_rate": 2.084526933865287e-07,
|
|
"loss": 0.2943,
|
|
"num_input_tokens_seen": 39909568,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"epoch": 0.8120478842583702,
|
|
"grad_norm": 30.60896873474121,
|
|
"learning_rate": 2.0777035507778817e-07,
|
|
"loss": 0.4543,
|
|
"num_input_tokens_seen": 39923648,
|
|
"step": 12685
|
|
},
|
|
{
|
|
"epoch": 0.812367966199347,
|
|
"grad_norm": 17.86086654663086,
|
|
"learning_rate": 2.0708900587706135e-07,
|
|
"loss": 0.4299,
|
|
"num_input_tokens_seen": 39939008,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"epoch": 0.8126880481403239,
|
|
"grad_norm": 45.35393142700195,
|
|
"learning_rate": 2.0640864663502e-07,
|
|
"loss": 0.3374,
|
|
"num_input_tokens_seen": 39955072,
|
|
"step": 12695
|
|
},
|
|
{
|
|
"epoch": 0.8130081300813008,
|
|
"grad_norm": 31.832155227661133,
|
|
"learning_rate": 2.057292782011013e-07,
|
|
"loss": 0.4545,
|
|
"num_input_tokens_seen": 39970880,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"epoch": 0.8133282120222777,
|
|
"grad_norm": 22.989181518554688,
|
|
"learning_rate": 2.0505090142350468e-07,
|
|
"loss": 0.2967,
|
|
"num_input_tokens_seen": 39986240,
|
|
"step": 12705
|
|
},
|
|
{
|
|
"epoch": 0.8136482939632546,
|
|
"grad_norm": 31.20648765563965,
|
|
"learning_rate": 2.0437351714919127e-07,
|
|
"loss": 0.3427,
|
|
"num_input_tokens_seen": 40001856,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"epoch": 0.8139683759042314,
|
|
"grad_norm": 18.44768714904785,
|
|
"learning_rate": 2.0369712622388336e-07,
|
|
"loss": 0.309,
|
|
"num_input_tokens_seen": 40018112,
|
|
"step": 12715
|
|
},
|
|
{
|
|
"epoch": 0.8142884578452084,
|
|
"grad_norm": 37.2120475769043,
|
|
"learning_rate": 2.0302172949206298e-07,
|
|
"loss": 0.2879,
|
|
"num_input_tokens_seen": 40033664,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"epoch": 0.8146085397861853,
|
|
"grad_norm": 54.152069091796875,
|
|
"learning_rate": 2.0234732779697094e-07,
|
|
"loss": 0.2967,
|
|
"num_input_tokens_seen": 40048768,
|
|
"step": 12725
|
|
},
|
|
{
|
|
"epoch": 0.8149286217271622,
|
|
"grad_norm": 42.13416290283203,
|
|
"learning_rate": 2.016739219806056e-07,
|
|
"loss": 0.3229,
|
|
"num_input_tokens_seen": 40063232,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"epoch": 0.8152487036681391,
|
|
"grad_norm": 19.65249252319336,
|
|
"learning_rate": 2.0100151288372215e-07,
|
|
"loss": 0.3904,
|
|
"num_input_tokens_seen": 40079296,
|
|
"step": 12735
|
|
},
|
|
{
|
|
"epoch": 0.8155687856091159,
|
|
"grad_norm": 59.13142013549805,
|
|
"learning_rate": 2.0033010134583084e-07,
|
|
"loss": 0.5554,
|
|
"num_input_tokens_seen": 40094976,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"epoch": 0.8158888675500928,
|
|
"grad_norm": 32.4484977722168,
|
|
"learning_rate": 1.9965968820519763e-07,
|
|
"loss": 0.3218,
|
|
"num_input_tokens_seen": 40110464,
|
|
"step": 12745
|
|
},
|
|
{
|
|
"epoch": 0.8162089494910697,
|
|
"grad_norm": 48.04807662963867,
|
|
"learning_rate": 1.9899027429884042e-07,
|
|
"loss": 0.3981,
|
|
"num_input_tokens_seen": 40125568,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"epoch": 0.8165290314320466,
|
|
"grad_norm": 37.24668502807617,
|
|
"learning_rate": 1.983218604625305e-07,
|
|
"loss": 0.4142,
|
|
"num_input_tokens_seen": 40141440,
|
|
"step": 12755
|
|
},
|
|
{
|
|
"epoch": 0.8168491133730235,
|
|
"grad_norm": 14.393180847167969,
|
|
"learning_rate": 1.9765444753079096e-07,
|
|
"loss": 0.3275,
|
|
"num_input_tokens_seen": 40156416,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"epoch": 0.8171691953140003,
|
|
"grad_norm": 29.691728591918945,
|
|
"learning_rate": 1.9698803633689408e-07,
|
|
"loss": 0.3998,
|
|
"num_input_tokens_seen": 40172928,
|
|
"step": 12765
|
|
},
|
|
{
|
|
"epoch": 0.8174892772549772,
|
|
"grad_norm": 21.646751403808594,
|
|
"learning_rate": 1.963226277128619e-07,
|
|
"loss": 0.2336,
|
|
"num_input_tokens_seen": 40188096,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"epoch": 0.8178093591959542,
|
|
"grad_norm": 29.038705825805664,
|
|
"learning_rate": 1.956582224894655e-07,
|
|
"loss": 0.3593,
|
|
"num_input_tokens_seen": 40204032,
|
|
"step": 12775
|
|
},
|
|
{
|
|
"epoch": 0.8181294411369311,
|
|
"grad_norm": 46.25074768066406,
|
|
"learning_rate": 1.949948214962227e-07,
|
|
"loss": 0.3646,
|
|
"num_input_tokens_seen": 40218944,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"epoch": 0.818449523077908,
|
|
"grad_norm": 54.344844818115234,
|
|
"learning_rate": 1.943324255613964e-07,
|
|
"loss": 0.3731,
|
|
"num_input_tokens_seen": 40235456,
|
|
"step": 12785
|
|
},
|
|
{
|
|
"epoch": 0.8187696050188848,
|
|
"grad_norm": 24.159887313842773,
|
|
"learning_rate": 1.936710355119967e-07,
|
|
"loss": 0.4505,
|
|
"num_input_tokens_seen": 40250176,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"epoch": 0.8190896869598617,
|
|
"grad_norm": 33.41341018676758,
|
|
"learning_rate": 1.9301065217377655e-07,
|
|
"loss": 0.3157,
|
|
"num_input_tokens_seen": 40265472,
|
|
"step": 12795
|
|
},
|
|
{
|
|
"epoch": 0.8194097689008386,
|
|
"grad_norm": 25.555482864379883,
|
|
"learning_rate": 1.9235127637123249e-07,
|
|
"loss": 0.3992,
|
|
"num_input_tokens_seen": 40281728,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"epoch": 0.8197298508418155,
|
|
"grad_norm": 52.75870132446289,
|
|
"learning_rate": 1.9169290892760225e-07,
|
|
"loss": 0.3282,
|
|
"num_input_tokens_seen": 40296768,
|
|
"step": 12805
|
|
},
|
|
{
|
|
"epoch": 0.8200499327827924,
|
|
"grad_norm": 44.361934661865234,
|
|
"learning_rate": 1.91035550664866e-07,
|
|
"loss": 0.3201,
|
|
"num_input_tokens_seen": 40311488,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"epoch": 0.8203700147237692,
|
|
"grad_norm": 54.147613525390625,
|
|
"learning_rate": 1.903792024037433e-07,
|
|
"loss": 0.314,
|
|
"num_input_tokens_seen": 40327232,
|
|
"step": 12815
|
|
},
|
|
{
|
|
"epoch": 0.8206900966647461,
|
|
"grad_norm": 33.24623489379883,
|
|
"learning_rate": 1.8972386496369185e-07,
|
|
"loss": 0.4472,
|
|
"num_input_tokens_seen": 40344064,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"epoch": 0.8210101786057231,
|
|
"grad_norm": 41.800315856933594,
|
|
"learning_rate": 1.89069539162909e-07,
|
|
"loss": 0.3976,
|
|
"num_input_tokens_seen": 40359040,
|
|
"step": 12825
|
|
},
|
|
{
|
|
"epoch": 0.8213302605467,
|
|
"grad_norm": 19.14189338684082,
|
|
"learning_rate": 1.8841622581832783e-07,
|
|
"loss": 0.4066,
|
|
"num_input_tokens_seen": 40376384,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"epoch": 0.8216503424876769,
|
|
"grad_norm": 28.32308578491211,
|
|
"learning_rate": 1.8776392574561783e-07,
|
|
"loss": 0.5901,
|
|
"num_input_tokens_seen": 40391936,
|
|
"step": 12835
|
|
},
|
|
{
|
|
"epoch": 0.8219704244286538,
|
|
"grad_norm": 23.97947883605957,
|
|
"learning_rate": 1.8711263975918322e-07,
|
|
"loss": 0.4831,
|
|
"num_input_tokens_seen": 40408832,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"epoch": 0.8222905063696306,
|
|
"grad_norm": 35.37938690185547,
|
|
"learning_rate": 1.8646236867216215e-07,
|
|
"loss": 0.4603,
|
|
"num_input_tokens_seen": 40425280,
|
|
"step": 12845
|
|
},
|
|
{
|
|
"epoch": 0.8226105883106075,
|
|
"grad_norm": 34.26011657714844,
|
|
"learning_rate": 1.8581311329642591e-07,
|
|
"loss": 0.338,
|
|
"num_input_tokens_seen": 40440832,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"epoch": 0.8229306702515844,
|
|
"grad_norm": 29.206497192382812,
|
|
"learning_rate": 1.8516487444257723e-07,
|
|
"loss": 0.2651,
|
|
"num_input_tokens_seen": 40458624,
|
|
"step": 12855
|
|
},
|
|
{
|
|
"epoch": 0.8232507521925613,
|
|
"grad_norm": 33.5301399230957,
|
|
"learning_rate": 1.8451765291995004e-07,
|
|
"loss": 0.4093,
|
|
"num_input_tokens_seen": 40474688,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"epoch": 0.8235708341335382,
|
|
"grad_norm": 35.508880615234375,
|
|
"learning_rate": 1.8387144953660806e-07,
|
|
"loss": 0.3554,
|
|
"num_input_tokens_seen": 40490816,
|
|
"step": 12865
|
|
},
|
|
{
|
|
"epoch": 0.823890916074515,
|
|
"grad_norm": 39.21906280517578,
|
|
"learning_rate": 1.832262650993437e-07,
|
|
"loss": 0.4472,
|
|
"num_input_tokens_seen": 40506112,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"epoch": 0.8242109980154919,
|
|
"grad_norm": 20.77424430847168,
|
|
"learning_rate": 1.825821004136774e-07,
|
|
"loss": 0.2954,
|
|
"num_input_tokens_seen": 40521344,
|
|
"step": 12875
|
|
},
|
|
{
|
|
"epoch": 0.8245310799564689,
|
|
"grad_norm": 29.856380462646484,
|
|
"learning_rate": 1.819389562838559e-07,
|
|
"loss": 0.2698,
|
|
"num_input_tokens_seen": 40537024,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"epoch": 0.8248511618974458,
|
|
"grad_norm": 47.23398208618164,
|
|
"learning_rate": 1.8129683351285319e-07,
|
|
"loss": 0.3136,
|
|
"num_input_tokens_seen": 40552640,
|
|
"step": 12885
|
|
},
|
|
{
|
|
"epoch": 0.8251712438384227,
|
|
"grad_norm": 35.031856536865234,
|
|
"learning_rate": 1.8065573290236626e-07,
|
|
"loss": 0.3186,
|
|
"num_input_tokens_seen": 40568000,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"epoch": 0.8254913257793995,
|
|
"grad_norm": 22.70587730407715,
|
|
"learning_rate": 1.8001565525281682e-07,
|
|
"loss": 0.3809,
|
|
"num_input_tokens_seen": 40584960,
|
|
"step": 12895
|
|
},
|
|
{
|
|
"epoch": 0.8258114077203764,
|
|
"grad_norm": 25.041950225830078,
|
|
"learning_rate": 1.793766013633493e-07,
|
|
"loss": 0.3665,
|
|
"num_input_tokens_seen": 40600704,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"epoch": 0.8261314896613533,
|
|
"grad_norm": 27.236404418945312,
|
|
"learning_rate": 1.7873857203183074e-07,
|
|
"loss": 0.3693,
|
|
"num_input_tokens_seen": 40615872,
|
|
"step": 12905
|
|
},
|
|
{
|
|
"epoch": 0.8264515716023302,
|
|
"grad_norm": 54.097450256347656,
|
|
"learning_rate": 1.7810156805484733e-07,
|
|
"loss": 0.4563,
|
|
"num_input_tokens_seen": 40632640,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"epoch": 0.8267716535433071,
|
|
"grad_norm": 25.137113571166992,
|
|
"learning_rate": 1.7746559022770612e-07,
|
|
"loss": 0.2995,
|
|
"num_input_tokens_seen": 40648064,
|
|
"step": 12915
|
|
},
|
|
{
|
|
"epoch": 0.8270917354842839,
|
|
"grad_norm": 29.874134063720703,
|
|
"learning_rate": 1.7683063934443342e-07,
|
|
"loss": 0.3663,
|
|
"num_input_tokens_seen": 40664704,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"epoch": 0.8274118174252608,
|
|
"grad_norm": 40.31401824951172,
|
|
"learning_rate": 1.7619671619777277e-07,
|
|
"loss": 0.4004,
|
|
"num_input_tokens_seen": 40681024,
|
|
"step": 12925
|
|
},
|
|
{
|
|
"epoch": 0.8277318993662378,
|
|
"grad_norm": 31.526283264160156,
|
|
"learning_rate": 1.7556382157918404e-07,
|
|
"loss": 0.4101,
|
|
"num_input_tokens_seen": 40695936,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"epoch": 0.8280519813072147,
|
|
"grad_norm": 27.806535720825195,
|
|
"learning_rate": 1.7493195627884427e-07,
|
|
"loss": 0.3185,
|
|
"num_input_tokens_seen": 40713472,
|
|
"step": 12935
|
|
},
|
|
{
|
|
"epoch": 0.8283720632481916,
|
|
"grad_norm": 42.26551055908203,
|
|
"learning_rate": 1.7430112108564465e-07,
|
|
"loss": 0.3141,
|
|
"num_input_tokens_seen": 40729344,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"epoch": 0.8286921451891684,
|
|
"grad_norm": 35.58454895019531,
|
|
"learning_rate": 1.736713167871896e-07,
|
|
"loss": 0.3861,
|
|
"num_input_tokens_seen": 40745856,
|
|
"step": 12945
|
|
},
|
|
{
|
|
"epoch": 0.8290122271301453,
|
|
"grad_norm": 19.220375061035156,
|
|
"learning_rate": 1.7304254416979803e-07,
|
|
"loss": 0.2993,
|
|
"num_input_tokens_seen": 40761920,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"epoch": 0.8293323090711222,
|
|
"grad_norm": 17.930898666381836,
|
|
"learning_rate": 1.7241480401849963e-07,
|
|
"loss": 0.2488,
|
|
"num_input_tokens_seen": 40776960,
|
|
"step": 12955
|
|
},
|
|
{
|
|
"epoch": 0.8296523910120991,
|
|
"grad_norm": 21.81646156311035,
|
|
"learning_rate": 1.7178809711703524e-07,
|
|
"loss": 0.3455,
|
|
"num_input_tokens_seen": 40792192,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"epoch": 0.829972472953076,
|
|
"grad_norm": 34.8779296875,
|
|
"learning_rate": 1.7116242424785599e-07,
|
|
"loss": 0.3612,
|
|
"num_input_tokens_seen": 40808256,
|
|
"step": 12965
|
|
},
|
|
{
|
|
"epoch": 0.8302925548940528,
|
|
"grad_norm": 40.2933464050293,
|
|
"learning_rate": 1.7053778619212166e-07,
|
|
"loss": 0.4288,
|
|
"num_input_tokens_seen": 40823424,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"epoch": 0.8306126368350297,
|
|
"grad_norm": 39.040504455566406,
|
|
"learning_rate": 1.6991418372970022e-07,
|
|
"loss": 0.4221,
|
|
"num_input_tokens_seen": 40840960,
|
|
"step": 12975
|
|
},
|
|
{
|
|
"epoch": 0.8309327187760066,
|
|
"grad_norm": 26.533519744873047,
|
|
"learning_rate": 1.6929161763916666e-07,
|
|
"loss": 0.3775,
|
|
"num_input_tokens_seen": 40857536,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"epoch": 0.8312528007169836,
|
|
"grad_norm": 25.883270263671875,
|
|
"learning_rate": 1.686700886978021e-07,
|
|
"loss": 0.3597,
|
|
"num_input_tokens_seen": 40874240,
|
|
"step": 12985
|
|
},
|
|
{
|
|
"epoch": 0.8315728826579605,
|
|
"grad_norm": 37.27665710449219,
|
|
"learning_rate": 1.6804959768159266e-07,
|
|
"loss": 0.3573,
|
|
"num_input_tokens_seen": 40888960,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"epoch": 0.8318929645989374,
|
|
"grad_norm": 53.164058685302734,
|
|
"learning_rate": 1.674301453652287e-07,
|
|
"loss": 0.5238,
|
|
"num_input_tokens_seen": 40904512,
|
|
"step": 12995
|
|
},
|
|
{
|
|
"epoch": 0.8322130465399142,
|
|
"grad_norm": 37.5425910949707,
|
|
"learning_rate": 1.6681173252210378e-07,
|
|
"loss": 0.2903,
|
|
"num_input_tokens_seen": 40921856,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 0.8325331284808911,
|
|
"grad_norm": 49.16252517700195,
|
|
"learning_rate": 1.6619435992431342e-07,
|
|
"loss": 0.3741,
|
|
"num_input_tokens_seen": 40938752,
|
|
"step": 13005
|
|
},
|
|
{
|
|
"epoch": 0.832853210421868,
|
|
"grad_norm": 43.46717071533203,
|
|
"learning_rate": 1.6557802834265466e-07,
|
|
"loss": 0.3033,
|
|
"num_input_tokens_seen": 40954048,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"epoch": 0.8331732923628449,
|
|
"grad_norm": 24.154077529907227,
|
|
"learning_rate": 1.649627385466248e-07,
|
|
"loss": 0.3593,
|
|
"num_input_tokens_seen": 40972672,
|
|
"step": 13015
|
|
},
|
|
{
|
|
"epoch": 0.8334933743038218,
|
|
"grad_norm": 19.601119995117188,
|
|
"learning_rate": 1.643484913044202e-07,
|
|
"loss": 0.242,
|
|
"num_input_tokens_seen": 40987648,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"epoch": 0.8338134562447986,
|
|
"grad_norm": 13.510409355163574,
|
|
"learning_rate": 1.6373528738293564e-07,
|
|
"loss": 0.3147,
|
|
"num_input_tokens_seen": 41003328,
|
|
"step": 13025
|
|
},
|
|
{
|
|
"epoch": 0.8341335381857755,
|
|
"grad_norm": 31.341299057006836,
|
|
"learning_rate": 1.6312312754776404e-07,
|
|
"loss": 0.2875,
|
|
"num_input_tokens_seen": 41018624,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"epoch": 0.8344536201267524,
|
|
"grad_norm": 16.611501693725586,
|
|
"learning_rate": 1.6251201256319357e-07,
|
|
"loss": 0.3321,
|
|
"num_input_tokens_seen": 41034624,
|
|
"step": 13035
|
|
},
|
|
{
|
|
"epoch": 0.8347737020677294,
|
|
"grad_norm": 26.413938522338867,
|
|
"learning_rate": 1.619019431922083e-07,
|
|
"loss": 0.3821,
|
|
"num_input_tokens_seen": 41049664,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"epoch": 0.8350937840087063,
|
|
"grad_norm": 33.03317642211914,
|
|
"learning_rate": 1.6129292019648754e-07,
|
|
"loss": 0.3454,
|
|
"num_input_tokens_seen": 41066368,
|
|
"step": 13045
|
|
},
|
|
{
|
|
"epoch": 0.8354138659496831,
|
|
"grad_norm": 25.02870750427246,
|
|
"learning_rate": 1.606849443364038e-07,
|
|
"loss": 0.2916,
|
|
"num_input_tokens_seen": 41082048,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"epoch": 0.83573394789066,
|
|
"grad_norm": 16.02092170715332,
|
|
"learning_rate": 1.6007801637102104e-07,
|
|
"loss": 0.3422,
|
|
"num_input_tokens_seen": 41098048,
|
|
"step": 13055
|
|
},
|
|
{
|
|
"epoch": 0.8360540298316369,
|
|
"grad_norm": 20.10306167602539,
|
|
"learning_rate": 1.594721370580969e-07,
|
|
"loss": 0.3826,
|
|
"num_input_tokens_seen": 41112768,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"epoch": 0.8363741117726138,
|
|
"grad_norm": 20.185379028320312,
|
|
"learning_rate": 1.588673071540788e-07,
|
|
"loss": 0.4512,
|
|
"num_input_tokens_seen": 41127488,
|
|
"step": 13065
|
|
},
|
|
{
|
|
"epoch": 0.8366941937135907,
|
|
"grad_norm": 37.06159591674805,
|
|
"learning_rate": 1.5826352741410332e-07,
|
|
"loss": 0.3295,
|
|
"num_input_tokens_seen": 41142272,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"epoch": 0.8370142756545675,
|
|
"grad_norm": 52.25266647338867,
|
|
"learning_rate": 1.576607985919971e-07,
|
|
"loss": 0.2947,
|
|
"num_input_tokens_seen": 41157952,
|
|
"step": 13075
|
|
},
|
|
{
|
|
"epoch": 0.8373343575955444,
|
|
"grad_norm": 38.03484344482422,
|
|
"learning_rate": 1.57059121440274e-07,
|
|
"loss": 0.3595,
|
|
"num_input_tokens_seen": 41172992,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"epoch": 0.8376544395365213,
|
|
"grad_norm": 47.07827377319336,
|
|
"learning_rate": 1.56458496710135e-07,
|
|
"loss": 0.3642,
|
|
"num_input_tokens_seen": 41187776,
|
|
"step": 13085
|
|
},
|
|
{
|
|
"epoch": 0.8379745214774983,
|
|
"grad_norm": 36.153099060058594,
|
|
"learning_rate": 1.5585892515146716e-07,
|
|
"loss": 0.3461,
|
|
"num_input_tokens_seen": 41204416,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"epoch": 0.8382946034184752,
|
|
"grad_norm": 22.711284637451172,
|
|
"learning_rate": 1.5526040751284253e-07,
|
|
"loss": 0.4195,
|
|
"num_input_tokens_seen": 41220032,
|
|
"step": 13095
|
|
},
|
|
{
|
|
"epoch": 0.838614685359452,
|
|
"grad_norm": 35.58867263793945,
|
|
"learning_rate": 1.546629445415174e-07,
|
|
"loss": 0.3118,
|
|
"num_input_tokens_seen": 41235776,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"epoch": 0.8389347673004289,
|
|
"grad_norm": 41.773040771484375,
|
|
"learning_rate": 1.5406653698343141e-07,
|
|
"loss": 0.3725,
|
|
"num_input_tokens_seen": 41252160,
|
|
"step": 13105
|
|
},
|
|
{
|
|
"epoch": 0.8392548492414058,
|
|
"grad_norm": 33.417354583740234,
|
|
"learning_rate": 1.5347118558320637e-07,
|
|
"loss": 0.3539,
|
|
"num_input_tokens_seen": 41269056,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"epoch": 0.8395749311823827,
|
|
"grad_norm": 24.998620986938477,
|
|
"learning_rate": 1.5287689108414558e-07,
|
|
"loss": 0.3562,
|
|
"num_input_tokens_seen": 41285312,
|
|
"step": 13115
|
|
},
|
|
{
|
|
"epoch": 0.8398950131233596,
|
|
"grad_norm": 39.11224365234375,
|
|
"learning_rate": 1.5228365422823242e-07,
|
|
"loss": 0.3246,
|
|
"num_input_tokens_seen": 41300992,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"epoch": 0.8402150950643364,
|
|
"grad_norm": 28.325523376464844,
|
|
"learning_rate": 1.5169147575613038e-07,
|
|
"loss": 0.2623,
|
|
"num_input_tokens_seen": 41317952,
|
|
"step": 13125
|
|
},
|
|
{
|
|
"epoch": 0.8405351770053133,
|
|
"grad_norm": 12.87824821472168,
|
|
"learning_rate": 1.5110035640718098e-07,
|
|
"loss": 0.2941,
|
|
"num_input_tokens_seen": 41333440,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"epoch": 0.8408552589462902,
|
|
"grad_norm": 31.341796875,
|
|
"learning_rate": 1.5051029691940387e-07,
|
|
"loss": 0.3725,
|
|
"num_input_tokens_seen": 41349312,
|
|
"step": 13135
|
|
},
|
|
{
|
|
"epoch": 0.8411753408872671,
|
|
"grad_norm": 33.42830276489258,
|
|
"learning_rate": 1.4992129802949515e-07,
|
|
"loss": 0.3449,
|
|
"num_input_tokens_seen": 41364288,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"epoch": 0.8414954228282441,
|
|
"grad_norm": 24.27691078186035,
|
|
"learning_rate": 1.4933336047282696e-07,
|
|
"loss": 0.2836,
|
|
"num_input_tokens_seen": 41379904,
|
|
"step": 13145
|
|
},
|
|
{
|
|
"epoch": 0.841815504769221,
|
|
"grad_norm": 34.65740203857422,
|
|
"learning_rate": 1.4874648498344579e-07,
|
|
"loss": 0.3199,
|
|
"num_input_tokens_seen": 41394432,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"epoch": 0.8421355867101978,
|
|
"grad_norm": 53.11001205444336,
|
|
"learning_rate": 1.4816067229407348e-07,
|
|
"loss": 0.3419,
|
|
"num_input_tokens_seen": 41409984,
|
|
"step": 13155
|
|
},
|
|
{
|
|
"epoch": 0.8424556686511747,
|
|
"grad_norm": 18.456310272216797,
|
|
"learning_rate": 1.4757592313610322e-07,
|
|
"loss": 0.3038,
|
|
"num_input_tokens_seen": 41425984,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"epoch": 0.8427757505921516,
|
|
"grad_norm": 17.635456085205078,
|
|
"learning_rate": 1.4699223823960128e-07,
|
|
"loss": 0.3293,
|
|
"num_input_tokens_seen": 41441920,
|
|
"step": 13165
|
|
},
|
|
{
|
|
"epoch": 0.8430958325331285,
|
|
"grad_norm": 38.752742767333984,
|
|
"learning_rate": 1.4640961833330579e-07,
|
|
"loss": 0.3392,
|
|
"num_input_tokens_seen": 41457664,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"epoch": 0.8434159144741054,
|
|
"grad_norm": 16.197906494140625,
|
|
"learning_rate": 1.4582806414462378e-07,
|
|
"loss": 0.2544,
|
|
"num_input_tokens_seen": 41472832,
|
|
"step": 13175
|
|
},
|
|
{
|
|
"epoch": 0.8437359964150822,
|
|
"grad_norm": 24.1660213470459,
|
|
"learning_rate": 1.4524757639963258e-07,
|
|
"loss": 0.3411,
|
|
"num_input_tokens_seen": 41490368,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"epoch": 0.8440560783560591,
|
|
"grad_norm": 44.753700256347656,
|
|
"learning_rate": 1.4466815582307845e-07,
|
|
"loss": 0.4458,
|
|
"num_input_tokens_seen": 41506624,
|
|
"step": 13185
|
|
},
|
|
{
|
|
"epoch": 0.844376160297036,
|
|
"grad_norm": 9.318314552307129,
|
|
"learning_rate": 1.440898031383746e-07,
|
|
"loss": 0.2433,
|
|
"num_input_tokens_seen": 41523264,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"epoch": 0.844696242238013,
|
|
"grad_norm": 42.493797302246094,
|
|
"learning_rate": 1.4351251906760064e-07,
|
|
"loss": 0.3678,
|
|
"num_input_tokens_seen": 41538944,
|
|
"step": 13195
|
|
},
|
|
{
|
|
"epoch": 0.8450163241789899,
|
|
"grad_norm": 40.14229202270508,
|
|
"learning_rate": 1.4293630433150317e-07,
|
|
"loss": 0.3919,
|
|
"num_input_tokens_seen": 41554880,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"epoch": 0.8453364061199667,
|
|
"grad_norm": 47.614463806152344,
|
|
"learning_rate": 1.423611596494927e-07,
|
|
"loss": 0.4473,
|
|
"num_input_tokens_seen": 41569280,
|
|
"step": 13205
|
|
},
|
|
{
|
|
"epoch": 0.8456564880609436,
|
|
"grad_norm": 18.392112731933594,
|
|
"learning_rate": 1.4178708573964438e-07,
|
|
"loss": 0.3541,
|
|
"num_input_tokens_seen": 41584576,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"epoch": 0.8459765700019205,
|
|
"grad_norm": 19.08127212524414,
|
|
"learning_rate": 1.4121408331869566e-07,
|
|
"loss": 0.3483,
|
|
"num_input_tokens_seen": 41600000,
|
|
"step": 13215
|
|
},
|
|
{
|
|
"epoch": 0.8462966519428974,
|
|
"grad_norm": 37.911075592041016,
|
|
"learning_rate": 1.406421531020474e-07,
|
|
"loss": 0.3539,
|
|
"num_input_tokens_seen": 41615040,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"epoch": 0.8466167338838743,
|
|
"grad_norm": 69.3670883178711,
|
|
"learning_rate": 1.4007129580376097e-07,
|
|
"loss": 0.3418,
|
|
"num_input_tokens_seen": 41630208,
|
|
"step": 13225
|
|
},
|
|
{
|
|
"epoch": 0.8469368158248511,
|
|
"grad_norm": 36.10555648803711,
|
|
"learning_rate": 1.3950151213655847e-07,
|
|
"loss": 0.354,
|
|
"num_input_tokens_seen": 41645440,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"epoch": 0.847256897765828,
|
|
"grad_norm": 42.61678695678711,
|
|
"learning_rate": 1.389328028118214e-07,
|
|
"loss": 0.3286,
|
|
"num_input_tokens_seen": 41661184,
|
|
"step": 13235
|
|
},
|
|
{
|
|
"epoch": 0.8475769797068049,
|
|
"grad_norm": 27.363248825073242,
|
|
"learning_rate": 1.3836516853959e-07,
|
|
"loss": 0.3546,
|
|
"num_input_tokens_seen": 41676224,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"epoch": 0.8478970616477818,
|
|
"grad_norm": 18.371397018432617,
|
|
"learning_rate": 1.3779861002856242e-07,
|
|
"loss": 0.3031,
|
|
"num_input_tokens_seen": 41690816,
|
|
"step": 13245
|
|
},
|
|
{
|
|
"epoch": 0.8482171435887588,
|
|
"grad_norm": 17.178085327148438,
|
|
"learning_rate": 1.3723312798609366e-07,
|
|
"loss": 0.3261,
|
|
"num_input_tokens_seen": 41706688,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"epoch": 0.8485372255297357,
|
|
"grad_norm": 26.48369789123535,
|
|
"learning_rate": 1.3666872311819455e-07,
|
|
"loss": 0.3518,
|
|
"num_input_tokens_seen": 41721920,
|
|
"step": 13255
|
|
},
|
|
{
|
|
"epoch": 0.8488573074707125,
|
|
"grad_norm": 21.16022300720215,
|
|
"learning_rate": 1.361053961295312e-07,
|
|
"loss": 0.2742,
|
|
"num_input_tokens_seen": 41738112,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"epoch": 0.8491773894116894,
|
|
"grad_norm": 50.990020751953125,
|
|
"learning_rate": 1.3554314772342412e-07,
|
|
"loss": 0.3463,
|
|
"num_input_tokens_seen": 41753792,
|
|
"step": 13265
|
|
},
|
|
{
|
|
"epoch": 0.8494974713526663,
|
|
"grad_norm": 20.54403305053711,
|
|
"learning_rate": 1.349819786018469e-07,
|
|
"loss": 0.3268,
|
|
"num_input_tokens_seen": 41771328,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"epoch": 0.8498175532936432,
|
|
"grad_norm": 37.34607696533203,
|
|
"learning_rate": 1.3442188946542566e-07,
|
|
"loss": 0.375,
|
|
"num_input_tokens_seen": 41787712,
|
|
"step": 13275
|
|
},
|
|
{
|
|
"epoch": 0.85013763523462,
|
|
"grad_norm": 24.755434036254883,
|
|
"learning_rate": 1.338628810134388e-07,
|
|
"loss": 0.2995,
|
|
"num_input_tokens_seen": 41803072,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"epoch": 0.8504577171755969,
|
|
"grad_norm": 36.77594757080078,
|
|
"learning_rate": 1.3330495394381435e-07,
|
|
"loss": 0.3636,
|
|
"num_input_tokens_seen": 41818688,
|
|
"step": 13285
|
|
},
|
|
{
|
|
"epoch": 0.8507777991165738,
|
|
"grad_norm": 15.947341918945312,
|
|
"learning_rate": 1.3274810895313083e-07,
|
|
"loss": 0.272,
|
|
"num_input_tokens_seen": 41833792,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"epoch": 0.8510338646693554,
|
|
"eval_loss": 0.3570670485496521,
|
|
"eval_runtime": 49.1744,
|
|
"eval_samples_per_second": 282.383,
|
|
"eval_steps_per_second": 35.303,
|
|
"num_input_tokens_seen": 41847872,
|
|
"step": 13294
|
|
},
|
|
{
|
|
"epoch": 0.8510978810575507,
|
|
"grad_norm": 25.20223617553711,
|
|
"learning_rate": 1.321923467366164e-07,
|
|
"loss": 0.3708,
|
|
"num_input_tokens_seen": 41850880,
|
|
"step": 13295
|
|
},
|
|
{
|
|
"epoch": 0.8514179629985277,
|
|
"grad_norm": 14.625531196594238,
|
|
"learning_rate": 1.3163766798814603e-07,
|
|
"loss": 0.1815,
|
|
"num_input_tokens_seen": 41866560,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"epoch": 0.8517380449395046,
|
|
"grad_norm": 49.65571594238281,
|
|
"learning_rate": 1.3108407340024264e-07,
|
|
"loss": 0.2872,
|
|
"num_input_tokens_seen": 41882240,
|
|
"step": 13305
|
|
},
|
|
{
|
|
"epoch": 0.8520581268804814,
|
|
"grad_norm": 37.89714813232422,
|
|
"learning_rate": 1.3053156366407613e-07,
|
|
"loss": 0.332,
|
|
"num_input_tokens_seen": 41898880,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"epoch": 0.8523782088214583,
|
|
"grad_norm": 19.63136100769043,
|
|
"learning_rate": 1.2998013946946119e-07,
|
|
"loss": 0.2398,
|
|
"num_input_tokens_seen": 41915968,
|
|
"step": 13315
|
|
},
|
|
{
|
|
"epoch": 0.8526982907624352,
|
|
"grad_norm": 36.910030364990234,
|
|
"learning_rate": 1.2942980150485706e-07,
|
|
"loss": 0.3556,
|
|
"num_input_tokens_seen": 41930816,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"epoch": 0.8530183727034121,
|
|
"grad_norm": 49.309322357177734,
|
|
"learning_rate": 1.2888055045736723e-07,
|
|
"loss": 0.3098,
|
|
"num_input_tokens_seen": 41947200,
|
|
"step": 13325
|
|
},
|
|
{
|
|
"epoch": 0.853338454644389,
|
|
"grad_norm": 19.818714141845703,
|
|
"learning_rate": 1.283323870127384e-07,
|
|
"loss": 0.3021,
|
|
"num_input_tokens_seen": 41962240,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"epoch": 0.8536585365853658,
|
|
"grad_norm": 28.360517501831055,
|
|
"learning_rate": 1.2778531185535911e-07,
|
|
"loss": 0.3063,
|
|
"num_input_tokens_seen": 41978752,
|
|
"step": 13335
|
|
},
|
|
{
|
|
"epoch": 0.8539786185263427,
|
|
"grad_norm": 19.08763313293457,
|
|
"learning_rate": 1.2723932566825844e-07,
|
|
"loss": 0.324,
|
|
"num_input_tokens_seen": 41994112,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"epoch": 0.8542987004673196,
|
|
"grad_norm": 16.557178497314453,
|
|
"learning_rate": 1.2669442913310723e-07,
|
|
"loss": 0.2986,
|
|
"num_input_tokens_seen": 42010432,
|
|
"step": 13345
|
|
},
|
|
{
|
|
"epoch": 0.8546187824082965,
|
|
"grad_norm": 27.915157318115234,
|
|
"learning_rate": 1.2615062293021506e-07,
|
|
"loss": 0.2722,
|
|
"num_input_tokens_seen": 42025984,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"epoch": 0.8549388643492735,
|
|
"grad_norm": 43.59603500366211,
|
|
"learning_rate": 1.2560790773853025e-07,
|
|
"loss": 0.3185,
|
|
"num_input_tokens_seen": 42040832,
|
|
"step": 13355
|
|
},
|
|
{
|
|
"epoch": 0.8552589462902503,
|
|
"grad_norm": 25.36774253845215,
|
|
"learning_rate": 1.2506628423563915e-07,
|
|
"loss": 0.4035,
|
|
"num_input_tokens_seen": 42057536,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"epoch": 0.8555790282312272,
|
|
"grad_norm": 31.750885009765625,
|
|
"learning_rate": 1.2452575309776493e-07,
|
|
"loss": 0.2863,
|
|
"num_input_tokens_seen": 42073152,
|
|
"step": 13365
|
|
},
|
|
{
|
|
"epoch": 0.8558991101722041,
|
|
"grad_norm": 45.091915130615234,
|
|
"learning_rate": 1.2398631499976732e-07,
|
|
"loss": 0.304,
|
|
"num_input_tokens_seen": 42088512,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"epoch": 0.856219192113181,
|
|
"grad_norm": 22.48138999938965,
|
|
"learning_rate": 1.234479706151409e-07,
|
|
"loss": 0.4208,
|
|
"num_input_tokens_seen": 42103552,
|
|
"step": 13375
|
|
},
|
|
{
|
|
"epoch": 0.8565392740541579,
|
|
"grad_norm": 22.086090087890625,
|
|
"learning_rate": 1.2291072061601503e-07,
|
|
"loss": 0.3608,
|
|
"num_input_tokens_seen": 42119872,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"epoch": 0.8568593559951347,
|
|
"grad_norm": 34.048282623291016,
|
|
"learning_rate": 1.2237456567315264e-07,
|
|
"loss": 0.4351,
|
|
"num_input_tokens_seen": 42136832,
|
|
"step": 13385
|
|
},
|
|
{
|
|
"epoch": 0.8571794379361116,
|
|
"grad_norm": 23.326128005981445,
|
|
"learning_rate": 1.2183950645594944e-07,
|
|
"loss": 0.2975,
|
|
"num_input_tokens_seen": 42152896,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"epoch": 0.8574995198770885,
|
|
"grad_norm": 52.200294494628906,
|
|
"learning_rate": 1.2130554363243318e-07,
|
|
"loss": 0.3421,
|
|
"num_input_tokens_seen": 42168064,
|
|
"step": 13395
|
|
},
|
|
{
|
|
"epoch": 0.8578196018180654,
|
|
"grad_norm": 20.56406593322754,
|
|
"learning_rate": 1.207726778692625e-07,
|
|
"loss": 0.3703,
|
|
"num_input_tokens_seen": 42182784,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"epoch": 0.8581396837590423,
|
|
"grad_norm": 23.129608154296875,
|
|
"learning_rate": 1.2024090983172718e-07,
|
|
"loss": 0.3271,
|
|
"num_input_tokens_seen": 42199744,
|
|
"step": 13405
|
|
},
|
|
{
|
|
"epoch": 0.8584597657000193,
|
|
"grad_norm": 40.9952507019043,
|
|
"learning_rate": 1.1971024018374532e-07,
|
|
"loss": 0.3625,
|
|
"num_input_tokens_seen": 42215040,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"epoch": 0.8587798476409961,
|
|
"grad_norm": 35.23881149291992,
|
|
"learning_rate": 1.1918066958786432e-07,
|
|
"loss": 0.3091,
|
|
"num_input_tokens_seen": 42230144,
|
|
"step": 13415
|
|
},
|
|
{
|
|
"epoch": 0.859099929581973,
|
|
"grad_norm": 59.670223236083984,
|
|
"learning_rate": 1.1865219870525922e-07,
|
|
"loss": 0.3553,
|
|
"num_input_tokens_seen": 42246528,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"epoch": 0.8594200115229499,
|
|
"grad_norm": 20.215394973754883,
|
|
"learning_rate": 1.1812482819573222e-07,
|
|
"loss": 0.4317,
|
|
"num_input_tokens_seen": 42263168,
|
|
"step": 13425
|
|
},
|
|
{
|
|
"epoch": 0.8597400934639268,
|
|
"grad_norm": 32.689353942871094,
|
|
"learning_rate": 1.1759855871771163e-07,
|
|
"loss": 0.3905,
|
|
"num_input_tokens_seen": 42278912,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"epoch": 0.8600601754049036,
|
|
"grad_norm": 45.541587829589844,
|
|
"learning_rate": 1.1707339092825075e-07,
|
|
"loss": 0.3824,
|
|
"num_input_tokens_seen": 42294656,
|
|
"step": 13435
|
|
},
|
|
{
|
|
"epoch": 0.8603802573458805,
|
|
"grad_norm": 45.382381439208984,
|
|
"learning_rate": 1.1654932548302842e-07,
|
|
"loss": 0.3909,
|
|
"num_input_tokens_seen": 42311552,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"epoch": 0.8607003392868574,
|
|
"grad_norm": 48.50038528442383,
|
|
"learning_rate": 1.1602636303634595e-07,
|
|
"loss": 0.3635,
|
|
"num_input_tokens_seen": 42327552,
|
|
"step": 13445
|
|
},
|
|
{
|
|
"epoch": 0.8610204212278343,
|
|
"grad_norm": 18.829587936401367,
|
|
"learning_rate": 1.1550450424112801e-07,
|
|
"loss": 0.3583,
|
|
"num_input_tokens_seen": 42343360,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"epoch": 0.8613405031688112,
|
|
"grad_norm": 22.35457992553711,
|
|
"learning_rate": 1.1498374974892178e-07,
|
|
"loss": 0.3341,
|
|
"num_input_tokens_seen": 42360064,
|
|
"step": 13455
|
|
},
|
|
{
|
|
"epoch": 0.8616605851097882,
|
|
"grad_norm": 23.769941329956055,
|
|
"learning_rate": 1.144641002098955e-07,
|
|
"loss": 0.4371,
|
|
"num_input_tokens_seen": 42374976,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"epoch": 0.861980667050765,
|
|
"grad_norm": 44.195152282714844,
|
|
"learning_rate": 1.1394555627283697e-07,
|
|
"loss": 0.3524,
|
|
"num_input_tokens_seen": 42391616,
|
|
"step": 13465
|
|
},
|
|
{
|
|
"epoch": 0.8623007489917419,
|
|
"grad_norm": 58.780975341796875,
|
|
"learning_rate": 1.134281185851551e-07,
|
|
"loss": 0.3095,
|
|
"num_input_tokens_seen": 42406528,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"epoch": 0.8626208309327188,
|
|
"grad_norm": 29.023456573486328,
|
|
"learning_rate": 1.1291178779287691e-07,
|
|
"loss": 0.288,
|
|
"num_input_tokens_seen": 42424320,
|
|
"step": 13475
|
|
},
|
|
{
|
|
"epoch": 0.8629409128736957,
|
|
"grad_norm": 41.91423034667969,
|
|
"learning_rate": 1.1239656454064683e-07,
|
|
"loss": 0.3654,
|
|
"num_input_tokens_seen": 42440960,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"epoch": 0.8632609948146726,
|
|
"grad_norm": 16.42652130126953,
|
|
"learning_rate": 1.1188244947172776e-07,
|
|
"loss": 0.2474,
|
|
"num_input_tokens_seen": 42456448,
|
|
"step": 13485
|
|
},
|
|
{
|
|
"epoch": 0.8635810767556494,
|
|
"grad_norm": 20.765544891357422,
|
|
"learning_rate": 1.1136944322799812e-07,
|
|
"loss": 0.3165,
|
|
"num_input_tokens_seen": 42472448,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"epoch": 0.8639011586966263,
|
|
"grad_norm": 51.0446662902832,
|
|
"learning_rate": 1.1085754644995227e-07,
|
|
"loss": 0.3147,
|
|
"num_input_tokens_seen": 42487808,
|
|
"step": 13495
|
|
},
|
|
{
|
|
"epoch": 0.8642212406376032,
|
|
"grad_norm": 34.88838195800781,
|
|
"learning_rate": 1.1034675977669938e-07,
|
|
"loss": 0.3516,
|
|
"num_input_tokens_seen": 42503744,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 0.8645413225785801,
|
|
"grad_norm": 50.67732238769531,
|
|
"learning_rate": 1.0983708384596258e-07,
|
|
"loss": 0.5636,
|
|
"num_input_tokens_seen": 42520768,
|
|
"step": 13505
|
|
},
|
|
{
|
|
"epoch": 0.864861404519557,
|
|
"grad_norm": 17.03850555419922,
|
|
"learning_rate": 1.0932851929407827e-07,
|
|
"loss": 0.3664,
|
|
"num_input_tokens_seen": 42537408,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"epoch": 0.8651814864605339,
|
|
"grad_norm": 45.833168029785156,
|
|
"learning_rate": 1.0882106675599534e-07,
|
|
"loss": 0.36,
|
|
"num_input_tokens_seen": 42553728,
|
|
"step": 13515
|
|
},
|
|
{
|
|
"epoch": 0.8655015684015108,
|
|
"grad_norm": 14.135661125183105,
|
|
"learning_rate": 1.0831472686527409e-07,
|
|
"loss": 0.3304,
|
|
"num_input_tokens_seen": 42568896,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"epoch": 0.8658216503424877,
|
|
"grad_norm": 13.662610054016113,
|
|
"learning_rate": 1.0780950025408586e-07,
|
|
"loss": 0.2939,
|
|
"num_input_tokens_seen": 42584000,
|
|
"step": 13525
|
|
},
|
|
{
|
|
"epoch": 0.8661417322834646,
|
|
"grad_norm": 62.21460723876953,
|
|
"learning_rate": 1.0730538755321217e-07,
|
|
"loss": 0.3824,
|
|
"num_input_tokens_seen": 42600192,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"epoch": 0.8664618142244415,
|
|
"grad_norm": 20.335872650146484,
|
|
"learning_rate": 1.0680238939204334e-07,
|
|
"loss": 0.304,
|
|
"num_input_tokens_seen": 42614656,
|
|
"step": 13535
|
|
},
|
|
{
|
|
"epoch": 0.8667818961654183,
|
|
"grad_norm": 42.727237701416016,
|
|
"learning_rate": 1.0630050639857879e-07,
|
|
"loss": 0.3989,
|
|
"num_input_tokens_seen": 42629504,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"epoch": 0.8671019781063952,
|
|
"grad_norm": 20.651216506958008,
|
|
"learning_rate": 1.0579973919942508e-07,
|
|
"loss": 0.3036,
|
|
"num_input_tokens_seen": 42644224,
|
|
"step": 13545
|
|
},
|
|
{
|
|
"epoch": 0.8674220600473721,
|
|
"grad_norm": 21.302921295166016,
|
|
"learning_rate": 1.0530008841979621e-07,
|
|
"loss": 0.2417,
|
|
"num_input_tokens_seen": 42659584,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"epoch": 0.867742141988349,
|
|
"grad_norm": 36.984397888183594,
|
|
"learning_rate": 1.048015546835117e-07,
|
|
"loss": 0.2756,
|
|
"num_input_tokens_seen": 42675776,
|
|
"step": 13555
|
|
},
|
|
{
|
|
"epoch": 0.8680622239293259,
|
|
"grad_norm": 23.602458953857422,
|
|
"learning_rate": 1.0430413861299691e-07,
|
|
"loss": 0.3976,
|
|
"num_input_tokens_seen": 42693184,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"epoch": 0.8683823058703029,
|
|
"grad_norm": 45.383060455322266,
|
|
"learning_rate": 1.0380784082928196e-07,
|
|
"loss": 0.4533,
|
|
"num_input_tokens_seen": 42710784,
|
|
"step": 13565
|
|
},
|
|
{
|
|
"epoch": 0.8687023878112797,
|
|
"grad_norm": 40.113624572753906,
|
|
"learning_rate": 1.0331266195200006e-07,
|
|
"loss": 0.3903,
|
|
"num_input_tokens_seen": 42727040,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"epoch": 0.8690224697522566,
|
|
"grad_norm": 18.091224670410156,
|
|
"learning_rate": 1.0281860259938779e-07,
|
|
"loss": 0.3126,
|
|
"num_input_tokens_seen": 42742208,
|
|
"step": 13575
|
|
},
|
|
{
|
|
"epoch": 0.8693425516932335,
|
|
"grad_norm": 19.732269287109375,
|
|
"learning_rate": 1.0232566338828452e-07,
|
|
"loss": 0.3673,
|
|
"num_input_tokens_seen": 42758464,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"epoch": 0.8696626336342104,
|
|
"grad_norm": 47.176029205322266,
|
|
"learning_rate": 1.018338449341305e-07,
|
|
"loss": 0.4102,
|
|
"num_input_tokens_seen": 42774016,
|
|
"step": 13585
|
|
},
|
|
{
|
|
"epoch": 0.8699827155751872,
|
|
"grad_norm": 19.62028694152832,
|
|
"learning_rate": 1.0134314785096632e-07,
|
|
"loss": 0.3942,
|
|
"num_input_tokens_seen": 42789248,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"epoch": 0.8703027975161641,
|
|
"grad_norm": 17.851299285888672,
|
|
"learning_rate": 1.0085357275143359e-07,
|
|
"loss": 0.342,
|
|
"num_input_tokens_seen": 42804608,
|
|
"step": 13595
|
|
},
|
|
{
|
|
"epoch": 0.870622879457141,
|
|
"grad_norm": 32.63302230834961,
|
|
"learning_rate": 1.0036512024677268e-07,
|
|
"loss": 0.4964,
|
|
"num_input_tokens_seen": 42819584,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"epoch": 0.8709429613981179,
|
|
"grad_norm": 9.898176193237305,
|
|
"learning_rate": 9.98777909468217e-08,
|
|
"loss": 0.2733,
|
|
"num_input_tokens_seen": 42835200,
|
|
"step": 13605
|
|
},
|
|
{
|
|
"epoch": 0.8712630433390948,
|
|
"grad_norm": 48.42760467529297,
|
|
"learning_rate": 9.939158546001736e-08,
|
|
"loss": 0.406,
|
|
"num_input_tokens_seen": 42852672,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"epoch": 0.8715831252800716,
|
|
"grad_norm": 19.67852020263672,
|
|
"learning_rate": 9.890650439339299e-08,
|
|
"loss": 0.3322,
|
|
"num_input_tokens_seen": 42868672,
|
|
"step": 13615
|
|
},
|
|
{
|
|
"epoch": 0.8719032072210486,
|
|
"grad_norm": 55.09160232543945,
|
|
"learning_rate": 9.842254835257791e-08,
|
|
"loss": 0.416,
|
|
"num_input_tokens_seen": 42884096,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"epoch": 0.8722232891620255,
|
|
"grad_norm": 32.343929290771484,
|
|
"learning_rate": 9.793971794179679e-08,
|
|
"loss": 0.3767,
|
|
"num_input_tokens_seen": 42898752,
|
|
"step": 13625
|
|
},
|
|
{
|
|
"epoch": 0.8725433711030024,
|
|
"grad_norm": 27.15031623840332,
|
|
"learning_rate": 9.745801376386931e-08,
|
|
"loss": 0.3417,
|
|
"num_input_tokens_seen": 42914688,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"epoch": 0.8728634530439793,
|
|
"grad_norm": 42.770503997802734,
|
|
"learning_rate": 9.697743642020861e-08,
|
|
"loss": 0.3211,
|
|
"num_input_tokens_seen": 42930688,
|
|
"step": 13635
|
|
},
|
|
{
|
|
"epoch": 0.8731835349849562,
|
|
"grad_norm": 37.78193664550781,
|
|
"learning_rate": 9.649798651082119e-08,
|
|
"loss": 0.3372,
|
|
"num_input_tokens_seen": 42947008,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"epoch": 0.873503616925933,
|
|
"grad_norm": 17.573001861572266,
|
|
"learning_rate": 9.601966463430588e-08,
|
|
"loss": 0.3946,
|
|
"num_input_tokens_seen": 42962816,
|
|
"step": 13645
|
|
},
|
|
{
|
|
"epoch": 0.8738236988669099,
|
|
"grad_norm": 15.034274101257324,
|
|
"learning_rate": 9.554247138785321e-08,
|
|
"loss": 0.3405,
|
|
"num_input_tokens_seen": 42977664,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"epoch": 0.8741437808078868,
|
|
"grad_norm": 74.6231460571289,
|
|
"learning_rate": 9.506640736724447e-08,
|
|
"loss": 0.4684,
|
|
"num_input_tokens_seen": 42993472,
|
|
"step": 13655
|
|
},
|
|
{
|
|
"epoch": 0.8744638627488637,
|
|
"grad_norm": 31.8859920501709,
|
|
"learning_rate": 9.459147316685123e-08,
|
|
"loss": 0.3895,
|
|
"num_input_tokens_seen": 43010688,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"epoch": 0.8747839446898406,
|
|
"grad_norm": 41.20021438598633,
|
|
"learning_rate": 9.41176693796345e-08,
|
|
"loss": 0.3357,
|
|
"num_input_tokens_seen": 43027392,
|
|
"step": 13665
|
|
},
|
|
{
|
|
"epoch": 0.8751040266308175,
|
|
"grad_norm": 39.77818298339844,
|
|
"learning_rate": 9.364499659714364e-08,
|
|
"loss": 0.4172,
|
|
"num_input_tokens_seen": 43043008,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"epoch": 0.8754241085717944,
|
|
"grad_norm": 36.9276123046875,
|
|
"learning_rate": 9.31734554095165e-08,
|
|
"loss": 0.342,
|
|
"num_input_tokens_seen": 43059072,
|
|
"step": 13675
|
|
},
|
|
{
|
|
"epoch": 0.8757441905127713,
|
|
"grad_norm": 35.170780181884766,
|
|
"learning_rate": 9.270304640547744e-08,
|
|
"loss": 0.3481,
|
|
"num_input_tokens_seen": 43074624,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"epoch": 0.8760642724537482,
|
|
"grad_norm": 30.96558380126953,
|
|
"learning_rate": 9.223377017233768e-08,
|
|
"loss": 0.3952,
|
|
"num_input_tokens_seen": 43089536,
|
|
"step": 13685
|
|
},
|
|
{
|
|
"epoch": 0.8763843543947251,
|
|
"grad_norm": 28.36827850341797,
|
|
"learning_rate": 9.176562729599458e-08,
|
|
"loss": 0.3535,
|
|
"num_input_tokens_seen": 43104512,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"epoch": 0.8767044363357019,
|
|
"grad_norm": 49.10908508300781,
|
|
"learning_rate": 9.129861836092944e-08,
|
|
"loss": 0.3463,
|
|
"num_input_tokens_seen": 43120640,
|
|
"step": 13695
|
|
},
|
|
{
|
|
"epoch": 0.8770245182766788,
|
|
"grad_norm": 21.713356018066406,
|
|
"learning_rate": 9.083274395020845e-08,
|
|
"loss": 0.4422,
|
|
"num_input_tokens_seen": 43136384,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"epoch": 0.8773446002176557,
|
|
"grad_norm": 23.583024978637695,
|
|
"learning_rate": 9.036800464548156e-08,
|
|
"loss": 0.4045,
|
|
"num_input_tokens_seen": 43153216,
|
|
"step": 13705
|
|
},
|
|
{
|
|
"epoch": 0.8776646821586326,
|
|
"grad_norm": 22.666852951049805,
|
|
"learning_rate": 8.990440102698138e-08,
|
|
"loss": 0.3473,
|
|
"num_input_tokens_seen": 43167936,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"epoch": 0.8779847640996095,
|
|
"grad_norm": 42.15274429321289,
|
|
"learning_rate": 8.944193367352182e-08,
|
|
"loss": 0.2767,
|
|
"num_input_tokens_seen": 43183872,
|
|
"step": 13715
|
|
},
|
|
{
|
|
"epoch": 0.8783048460405863,
|
|
"grad_norm": 28.620649337768555,
|
|
"learning_rate": 8.898060316249944e-08,
|
|
"loss": 0.4057,
|
|
"num_input_tokens_seen": 43200256,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"epoch": 0.8786249279815633,
|
|
"grad_norm": 46.91181182861328,
|
|
"learning_rate": 8.852041006989064e-08,
|
|
"loss": 0.3563,
|
|
"num_input_tokens_seen": 43217600,
|
|
"step": 13725
|
|
},
|
|
{
|
|
"epoch": 0.8789450099225402,
|
|
"grad_norm": 48.15342712402344,
|
|
"learning_rate": 8.80613549702518e-08,
|
|
"loss": 0.3785,
|
|
"num_input_tokens_seen": 43233344,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"epoch": 0.8792650918635171,
|
|
"grad_norm": 48.054359436035156,
|
|
"learning_rate": 8.760343843671824e-08,
|
|
"loss": 0.5423,
|
|
"num_input_tokens_seen": 43249280,
|
|
"step": 13735
|
|
},
|
|
{
|
|
"epoch": 0.879585173804494,
|
|
"grad_norm": 74.3794937133789,
|
|
"learning_rate": 8.714666104100487e-08,
|
|
"loss": 0.4461,
|
|
"num_input_tokens_seen": 43265024,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"epoch": 0.8799052557454708,
|
|
"grad_norm": 75.1503677368164,
|
|
"learning_rate": 8.66910233534034e-08,
|
|
"loss": 0.3544,
|
|
"num_input_tokens_seen": 43280576,
|
|
"step": 13745
|
|
},
|
|
{
|
|
"epoch": 0.8802253376864477,
|
|
"grad_norm": 32.35490798950195,
|
|
"learning_rate": 8.62365259427823e-08,
|
|
"loss": 0.3156,
|
|
"num_input_tokens_seen": 43296064,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"epoch": 0.8805454196274246,
|
|
"grad_norm": 29.028377532958984,
|
|
"learning_rate": 8.578316937658758e-08,
|
|
"loss": 0.2899,
|
|
"num_input_tokens_seen": 43311552,
|
|
"step": 13755
|
|
},
|
|
{
|
|
"epoch": 0.8808655015684015,
|
|
"grad_norm": 18.780216217041016,
|
|
"learning_rate": 8.533095422083992e-08,
|
|
"loss": 0.3116,
|
|
"num_input_tokens_seen": 43326272,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"epoch": 0.8811855835093784,
|
|
"grad_norm": 26.572908401489258,
|
|
"learning_rate": 8.487988104013533e-08,
|
|
"loss": 0.2906,
|
|
"num_input_tokens_seen": 43342592,
|
|
"step": 13765
|
|
},
|
|
{
|
|
"epoch": 0.8815056654503552,
|
|
"grad_norm": 24.25293731689453,
|
|
"learning_rate": 8.4429950397644e-08,
|
|
"loss": 0.3188,
|
|
"num_input_tokens_seen": 43357888,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"epoch": 0.8818257473913321,
|
|
"grad_norm": 20.96013832092285,
|
|
"learning_rate": 8.398116285510948e-08,
|
|
"loss": 0.2679,
|
|
"num_input_tokens_seen": 43374272,
|
|
"step": 13775
|
|
},
|
|
{
|
|
"epoch": 0.8821458293323091,
|
|
"grad_norm": 47.135711669921875,
|
|
"learning_rate": 8.353351897284844e-08,
|
|
"loss": 0.2698,
|
|
"num_input_tokens_seen": 43393280,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"epoch": 0.882465911273286,
|
|
"grad_norm": 10.159743309020996,
|
|
"learning_rate": 8.308701930974949e-08,
|
|
"loss": 0.4762,
|
|
"num_input_tokens_seen": 43409600,
|
|
"step": 13785
|
|
},
|
|
{
|
|
"epoch": 0.8827859932142629,
|
|
"grad_norm": 27.35509490966797,
|
|
"learning_rate": 8.264166442327269e-08,
|
|
"loss": 0.4038,
|
|
"num_input_tokens_seen": 43424384,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"epoch": 0.8831060751552398,
|
|
"grad_norm": 41.762332916259766,
|
|
"learning_rate": 8.219745486944885e-08,
|
|
"loss": 0.2533,
|
|
"num_input_tokens_seen": 43440128,
|
|
"step": 13795
|
|
},
|
|
{
|
|
"epoch": 0.8834261570962166,
|
|
"grad_norm": 78.77603912353516,
|
|
"learning_rate": 8.175439120287875e-08,
|
|
"loss": 0.4597,
|
|
"num_input_tokens_seen": 43455168,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"epoch": 0.8837462390371935,
|
|
"grad_norm": 49.571353912353516,
|
|
"learning_rate": 8.131247397673269e-08,
|
|
"loss": 0.3494,
|
|
"num_input_tokens_seen": 43472064,
|
|
"step": 13805
|
|
},
|
|
{
|
|
"epoch": 0.8840663209781704,
|
|
"grad_norm": 118.99240112304688,
|
|
"learning_rate": 8.087170374274921e-08,
|
|
"loss": 0.4333,
|
|
"num_input_tokens_seen": 43488000,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"epoch": 0.8843864029191473,
|
|
"grad_norm": 27.12523078918457,
|
|
"learning_rate": 8.043208105123578e-08,
|
|
"loss": 0.2981,
|
|
"num_input_tokens_seen": 43503488,
|
|
"step": 13815
|
|
},
|
|
{
|
|
"epoch": 0.8847064848601242,
|
|
"grad_norm": 42.8975830078125,
|
|
"learning_rate": 7.999360645106579e-08,
|
|
"loss": 0.335,
|
|
"num_input_tokens_seen": 43518336,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"epoch": 0.885026566801101,
|
|
"grad_norm": 17.23529052734375,
|
|
"learning_rate": 7.955628048968011e-08,
|
|
"loss": 0.2651,
|
|
"num_input_tokens_seen": 43532800,
|
|
"step": 13825
|
|
},
|
|
{
|
|
"epoch": 0.885346648742078,
|
|
"grad_norm": 29.590059280395508,
|
|
"learning_rate": 7.912010371308564e-08,
|
|
"loss": 0.2627,
|
|
"num_input_tokens_seen": 43547648,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"epoch": 0.8856667306830549,
|
|
"grad_norm": 27.454540252685547,
|
|
"learning_rate": 7.868507666585422e-08,
|
|
"loss": 0.2935,
|
|
"num_input_tokens_seen": 43562688,
|
|
"step": 13835
|
|
},
|
|
{
|
|
"epoch": 0.8859868126240318,
|
|
"grad_norm": 45.65460968017578,
|
|
"learning_rate": 7.825119989112172e-08,
|
|
"loss": 0.4137,
|
|
"num_input_tokens_seen": 43578176,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"epoch": 0.8863068945650087,
|
|
"grad_norm": 30.539806365966797,
|
|
"learning_rate": 7.78184739305886e-08,
|
|
"loss": 0.2938,
|
|
"num_input_tokens_seen": 43593920,
|
|
"step": 13845
|
|
},
|
|
{
|
|
"epoch": 0.8866269765059855,
|
|
"grad_norm": 20.917694091796875,
|
|
"learning_rate": 7.73868993245187e-08,
|
|
"loss": 0.3491,
|
|
"num_input_tokens_seen": 43610944,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"epoch": 0.8869470584469624,
|
|
"grad_norm": 18.05341911315918,
|
|
"learning_rate": 7.695647661173754e-08,
|
|
"loss": 0.3412,
|
|
"num_input_tokens_seen": 43627008,
|
|
"step": 13855
|
|
},
|
|
{
|
|
"epoch": 0.8872671403879393,
|
|
"grad_norm": 44.19736862182617,
|
|
"learning_rate": 7.652720632963284e-08,
|
|
"loss": 0.3785,
|
|
"num_input_tokens_seen": 43642752,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"epoch": 0.8875872223289162,
|
|
"grad_norm": 49.171730041503906,
|
|
"learning_rate": 7.609908901415396e-08,
|
|
"loss": 0.3396,
|
|
"num_input_tokens_seen": 43658496,
|
|
"step": 13865
|
|
},
|
|
{
|
|
"epoch": 0.8879073042698931,
|
|
"grad_norm": 53.71741485595703,
|
|
"learning_rate": 7.567212519981047e-08,
|
|
"loss": 0.4018,
|
|
"num_input_tokens_seen": 43674304,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"epoch": 0.8882273862108699,
|
|
"grad_norm": 18.578672409057617,
|
|
"learning_rate": 7.524631541967108e-08,
|
|
"loss": 0.3382,
|
|
"num_input_tokens_seen": 43689536,
|
|
"step": 13875
|
|
},
|
|
{
|
|
"epoch": 0.8885474681518468,
|
|
"grad_norm": 72.489501953125,
|
|
"learning_rate": 7.482166020536485e-08,
|
|
"loss": 0.2903,
|
|
"num_input_tokens_seen": 43706496,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"epoch": 0.8888675500928238,
|
|
"grad_norm": 17.48689079284668,
|
|
"learning_rate": 7.439816008707877e-08,
|
|
"loss": 0.3108,
|
|
"num_input_tokens_seen": 43721408,
|
|
"step": 13885
|
|
},
|
|
{
|
|
"epoch": 0.8891876320338007,
|
|
"grad_norm": 17.783830642700195,
|
|
"learning_rate": 7.397581559355748e-08,
|
|
"loss": 0.3216,
|
|
"num_input_tokens_seen": 43737536,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"epoch": 0.8895077139747776,
|
|
"grad_norm": 33.39737319946289,
|
|
"learning_rate": 7.355462725210315e-08,
|
|
"loss": 0.4116,
|
|
"num_input_tokens_seen": 43752640,
|
|
"step": 13895
|
|
},
|
|
{
|
|
"epoch": 0.8898277959157544,
|
|
"grad_norm": 30.600183486938477,
|
|
"learning_rate": 7.313459558857438e-08,
|
|
"loss": 0.4081,
|
|
"num_input_tokens_seen": 43768384,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"epoch": 0.8901478778567313,
|
|
"grad_norm": 26.679346084594727,
|
|
"learning_rate": 7.271572112738566e-08,
|
|
"loss": 0.3108,
|
|
"num_input_tokens_seen": 43784320,
|
|
"step": 13905
|
|
},
|
|
{
|
|
"epoch": 0.8904679597977082,
|
|
"grad_norm": 32.508792877197266,
|
|
"learning_rate": 7.229800439150657e-08,
|
|
"loss": 0.3582,
|
|
"num_input_tokens_seen": 43799232,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"epoch": 0.8907880417386851,
|
|
"grad_norm": 64.69635009765625,
|
|
"learning_rate": 7.188144590246148e-08,
|
|
"loss": 0.3721,
|
|
"num_input_tokens_seen": 43815360,
|
|
"step": 13915
|
|
},
|
|
{
|
|
"epoch": 0.891108123679662,
|
|
"grad_norm": 24.958736419677734,
|
|
"learning_rate": 7.146604618032848e-08,
|
|
"loss": 0.339,
|
|
"num_input_tokens_seen": 43830336,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"epoch": 0.8914282056206388,
|
|
"grad_norm": 36.58753967285156,
|
|
"learning_rate": 7.105180574373904e-08,
|
|
"loss": 0.4065,
|
|
"num_input_tokens_seen": 43846656,
|
|
"step": 13925
|
|
},
|
|
{
|
|
"epoch": 0.8917482875616157,
|
|
"grad_norm": 19.49739646911621,
|
|
"learning_rate": 7.063872510987712e-08,
|
|
"loss": 0.3231,
|
|
"num_input_tokens_seen": 43862720,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"epoch": 0.8920683695025927,
|
|
"grad_norm": 32.121185302734375,
|
|
"learning_rate": 7.022680479447874e-08,
|
|
"loss": 0.3558,
|
|
"num_input_tokens_seen": 43876800,
|
|
"step": 13935
|
|
},
|
|
{
|
|
"epoch": 0.8923884514435696,
|
|
"grad_norm": 22.010385513305664,
|
|
"learning_rate": 6.98160453118316e-08,
|
|
"loss": 0.2952,
|
|
"num_input_tokens_seen": 43892160,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"epoch": 0.8927085333845465,
|
|
"grad_norm": 38.97593688964844,
|
|
"learning_rate": 6.940644717477328e-08,
|
|
"loss": 0.333,
|
|
"num_input_tokens_seen": 43908416,
|
|
"step": 13945
|
|
},
|
|
{
|
|
"epoch": 0.8930286153255234,
|
|
"grad_norm": 31.57818031311035,
|
|
"learning_rate": 6.899801089469204e-08,
|
|
"loss": 0.4213,
|
|
"num_input_tokens_seen": 43923712,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"epoch": 0.8933486972665002,
|
|
"grad_norm": 20.735111236572266,
|
|
"learning_rate": 6.85907369815254e-08,
|
|
"loss": 0.3555,
|
|
"num_input_tokens_seen": 43939520,
|
|
"step": 13955
|
|
},
|
|
{
|
|
"epoch": 0.8936687792074771,
|
|
"grad_norm": 51.4113883972168,
|
|
"learning_rate": 6.81846259437595e-08,
|
|
"loss": 0.3895,
|
|
"num_input_tokens_seen": 43954688,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"epoch": 0.893988861148454,
|
|
"grad_norm": 53.543155670166016,
|
|
"learning_rate": 6.77796782884289e-08,
|
|
"loss": 0.3146,
|
|
"num_input_tokens_seen": 43969600,
|
|
"step": 13965
|
|
},
|
|
{
|
|
"epoch": 0.8943089430894309,
|
|
"grad_norm": 46.502647399902344,
|
|
"learning_rate": 6.737589452111526e-08,
|
|
"loss": 0.3824,
|
|
"num_input_tokens_seen": 43985472,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"epoch": 0.8946290250304078,
|
|
"grad_norm": 39.93029022216797,
|
|
"learning_rate": 6.697327514594786e-08,
|
|
"loss": 0.3916,
|
|
"num_input_tokens_seen": 44000768,
|
|
"step": 13975
|
|
},
|
|
{
|
|
"epoch": 0.8949491069713846,
|
|
"grad_norm": 41.46504592895508,
|
|
"learning_rate": 6.657182066560118e-08,
|
|
"loss": 0.4586,
|
|
"num_input_tokens_seen": 44017088,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"epoch": 0.8952691889123615,
|
|
"grad_norm": 26.99639892578125,
|
|
"learning_rate": 6.617153158129596e-08,
|
|
"loss": 0.37,
|
|
"num_input_tokens_seen": 44031488,
|
|
"step": 13985
|
|
},
|
|
{
|
|
"epoch": 0.8955892708533385,
|
|
"grad_norm": 37.02708435058594,
|
|
"learning_rate": 6.577240839279807e-08,
|
|
"loss": 0.337,
|
|
"num_input_tokens_seen": 44047296,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"epoch": 0.8959093527943154,
|
|
"grad_norm": 31.63517189025879,
|
|
"learning_rate": 6.537445159841748e-08,
|
|
"loss": 0.3143,
|
|
"num_input_tokens_seen": 44063744,
|
|
"step": 13995
|
|
},
|
|
{
|
|
"epoch": 0.8962294347352923,
|
|
"grad_norm": 34.43181610107422,
|
|
"learning_rate": 6.497766169500752e-08,
|
|
"loss": 0.3936,
|
|
"num_input_tokens_seen": 44079168,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 0.8965495166762691,
|
|
"grad_norm": 13.677638053894043,
|
|
"learning_rate": 6.458203917796546e-08,
|
|
"loss": 0.2643,
|
|
"num_input_tokens_seen": 44093824,
|
|
"step": 14005
|
|
},
|
|
{
|
|
"epoch": 0.896869598617246,
|
|
"grad_norm": 19.27773666381836,
|
|
"learning_rate": 6.418758454123041e-08,
|
|
"loss": 0.455,
|
|
"num_input_tokens_seen": 44111296,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"epoch": 0.8971896805582229,
|
|
"grad_norm": 18.031564712524414,
|
|
"learning_rate": 6.379429827728377e-08,
|
|
"loss": 0.3905,
|
|
"num_input_tokens_seen": 44128000,
|
|
"step": 14015
|
|
},
|
|
{
|
|
"epoch": 0.8975097624991998,
|
|
"grad_norm": 17.980560302734375,
|
|
"learning_rate": 6.340218087714799e-08,
|
|
"loss": 0.3833,
|
|
"num_input_tokens_seen": 44143488,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"epoch": 0.8978298444401767,
|
|
"grad_norm": 84.56553649902344,
|
|
"learning_rate": 6.301123283038634e-08,
|
|
"loss": 0.3567,
|
|
"num_input_tokens_seen": 44158976,
|
|
"step": 14025
|
|
},
|
|
{
|
|
"epoch": 0.8981499263811535,
|
|
"grad_norm": 20.843826293945312,
|
|
"learning_rate": 6.262145462510193e-08,
|
|
"loss": 0.319,
|
|
"num_input_tokens_seen": 44175808,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"epoch": 0.8984700083221304,
|
|
"grad_norm": 44.17280578613281,
|
|
"learning_rate": 6.223284674793738e-08,
|
|
"loss": 0.2817,
|
|
"num_input_tokens_seen": 44190336,
|
|
"step": 14035
|
|
},
|
|
{
|
|
"epoch": 0.8987900902631074,
|
|
"grad_norm": 35.57537078857422,
|
|
"learning_rate": 6.184540968407437e-08,
|
|
"loss": 0.3835,
|
|
"num_input_tokens_seen": 44205696,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"epoch": 0.8991101722040843,
|
|
"grad_norm": 26.58342742919922,
|
|
"learning_rate": 6.145914391723239e-08,
|
|
"loss": 0.3546,
|
|
"num_input_tokens_seen": 44222016,
|
|
"step": 14045
|
|
},
|
|
{
|
|
"epoch": 0.8994302541450612,
|
|
"grad_norm": 25.470823287963867,
|
|
"learning_rate": 6.107404992966902e-08,
|
|
"loss": 0.3285,
|
|
"num_input_tokens_seen": 44238592,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"epoch": 0.899750336086038,
|
|
"grad_norm": 23.68887710571289,
|
|
"learning_rate": 6.069012820217856e-08,
|
|
"loss": 0.2517,
|
|
"num_input_tokens_seen": 44254016,
|
|
"step": 14055
|
|
},
|
|
{
|
|
"epoch": 0.9000704180270149,
|
|
"grad_norm": 28.1870059967041,
|
|
"learning_rate": 6.030737921409168e-08,
|
|
"loss": 0.3757,
|
|
"num_input_tokens_seen": 44269376,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"epoch": 0.9003904999679918,
|
|
"grad_norm": 53.616127014160156,
|
|
"learning_rate": 5.992580344327503e-08,
|
|
"loss": 0.4646,
|
|
"num_input_tokens_seen": 44284672,
|
|
"step": 14065
|
|
},
|
|
{
|
|
"epoch": 0.9007105819089687,
|
|
"grad_norm": 33.5253791809082,
|
|
"learning_rate": 5.954540136613051e-08,
|
|
"loss": 0.352,
|
|
"num_input_tokens_seen": 44300224,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"epoch": 0.9010306638499456,
|
|
"grad_norm": 24.468204498291016,
|
|
"learning_rate": 5.916617345759456e-08,
|
|
"loss": 0.3451,
|
|
"num_input_tokens_seen": 44315264,
|
|
"step": 14075
|
|
},
|
|
{
|
|
"epoch": 0.901094680238141,
|
|
"eval_loss": 0.3543796241283417,
|
|
"eval_runtime": 49.176,
|
|
"eval_samples_per_second": 282.373,
|
|
"eval_steps_per_second": 35.302,
|
|
"num_input_tokens_seen": 44318848,
|
|
"step": 14076
|
|
},
|
|
{
|
|
"epoch": 0.9013507457909224,
|
|
"grad_norm": 45.981563568115234,
|
|
"learning_rate": 5.878812019113766e-08,
|
|
"loss": 0.4234,
|
|
"num_input_tokens_seen": 44330176,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"epoch": 0.9016708277318993,
|
|
"grad_norm": 22.737422943115234,
|
|
"learning_rate": 5.84112420387638e-08,
|
|
"loss": 0.2892,
|
|
"num_input_tokens_seen": 44345152,
|
|
"step": 14085
|
|
},
|
|
{
|
|
"epoch": 0.9019909096728762,
|
|
"grad_norm": 31.271459579467773,
|
|
"learning_rate": 5.8035539471009697e-08,
|
|
"loss": 0.3656,
|
|
"num_input_tokens_seen": 44361152,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"epoch": 0.9023109916138532,
|
|
"grad_norm": 33.406707763671875,
|
|
"learning_rate": 5.7661012956944253e-08,
|
|
"loss": 0.4078,
|
|
"num_input_tokens_seen": 44376128,
|
|
"step": 14095
|
|
},
|
|
{
|
|
"epoch": 0.9026310735548301,
|
|
"grad_norm": 17.146968841552734,
|
|
"learning_rate": 5.728766296416876e-08,
|
|
"loss": 0.2842,
|
|
"num_input_tokens_seen": 44392192,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"epoch": 0.902951155495807,
|
|
"grad_norm": 34.22679901123047,
|
|
"learning_rate": 5.6915489958814453e-08,
|
|
"loss": 0.4079,
|
|
"num_input_tokens_seen": 44407680,
|
|
"step": 14105
|
|
},
|
|
{
|
|
"epoch": 0.9032712374367838,
|
|
"grad_norm": 53.51115798950195,
|
|
"learning_rate": 5.654449440554399e-08,
|
|
"loss": 0.4093,
|
|
"num_input_tokens_seen": 44424384,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"epoch": 0.9035913193777607,
|
|
"grad_norm": 21.632587432861328,
|
|
"learning_rate": 5.617467676754972e-08,
|
|
"loss": 0.3752,
|
|
"num_input_tokens_seen": 44439744,
|
|
"step": 14115
|
|
},
|
|
{
|
|
"epoch": 0.9039114013187376,
|
|
"grad_norm": 57.51222610473633,
|
|
"learning_rate": 5.580603750655344e-08,
|
|
"loss": 0.3012,
|
|
"num_input_tokens_seen": 44454272,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"epoch": 0.9042314832597145,
|
|
"grad_norm": 33.1247444152832,
|
|
"learning_rate": 5.543857708280497e-08,
|
|
"loss": 0.3578,
|
|
"num_input_tokens_seen": 44468992,
|
|
"step": 14125
|
|
},
|
|
{
|
|
"epoch": 0.9045515652006914,
|
|
"grad_norm": 41.706947326660156,
|
|
"learning_rate": 5.507229595508367e-08,
|
|
"loss": 0.4819,
|
|
"num_input_tokens_seen": 44484864,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"epoch": 0.9048716471416682,
|
|
"grad_norm": 14.103269577026367,
|
|
"learning_rate": 5.4707194580695504e-08,
|
|
"loss": 0.289,
|
|
"num_input_tokens_seen": 44499968,
|
|
"step": 14135
|
|
},
|
|
{
|
|
"epoch": 0.9051917290826451,
|
|
"grad_norm": 35.217655181884766,
|
|
"learning_rate": 5.4343273415473846e-08,
|
|
"loss": 0.4239,
|
|
"num_input_tokens_seen": 44517952,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"epoch": 0.905511811023622,
|
|
"grad_norm": 24.536203384399414,
|
|
"learning_rate": 5.3980532913778576e-08,
|
|
"loss": 0.3421,
|
|
"num_input_tokens_seen": 44532928,
|
|
"step": 14145
|
|
},
|
|
{
|
|
"epoch": 0.905831892964599,
|
|
"grad_norm": 32.02094650268555,
|
|
"learning_rate": 5.361897352849554e-08,
|
|
"loss": 0.3955,
|
|
"num_input_tokens_seen": 44548288,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"epoch": 0.9061519749055759,
|
|
"grad_norm": 20.607261657714844,
|
|
"learning_rate": 5.325859571103586e-08,
|
|
"loss": 0.3331,
|
|
"num_input_tokens_seen": 44563712,
|
|
"step": 14155
|
|
},
|
|
{
|
|
"epoch": 0.9064720568465527,
|
|
"grad_norm": 21.235889434814453,
|
|
"learning_rate": 5.289939991133508e-08,
|
|
"loss": 0.3333,
|
|
"num_input_tokens_seen": 44579264,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"epoch": 0.9067921387875296,
|
|
"grad_norm": 12.65000057220459,
|
|
"learning_rate": 5.2541386577853895e-08,
|
|
"loss": 0.2384,
|
|
"num_input_tokens_seen": 44594176,
|
|
"step": 14165
|
|
},
|
|
{
|
|
"epoch": 0.9071122207285065,
|
|
"grad_norm": 16.73200225830078,
|
|
"learning_rate": 5.2184556157576e-08,
|
|
"loss": 0.2502,
|
|
"num_input_tokens_seen": 44609664,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"epoch": 0.9074323026694834,
|
|
"grad_norm": 52.27291488647461,
|
|
"learning_rate": 5.1828909096008234e-08,
|
|
"loss": 0.3649,
|
|
"num_input_tokens_seen": 44626944,
|
|
"step": 14175
|
|
},
|
|
{
|
|
"epoch": 0.9077523846104603,
|
|
"grad_norm": 18.205657958984375,
|
|
"learning_rate": 5.14744458371803e-08,
|
|
"loss": 0.2331,
|
|
"num_input_tokens_seen": 44643520,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"epoch": 0.9080724665514371,
|
|
"grad_norm": 87.80847930908203,
|
|
"learning_rate": 5.1121166823643646e-08,
|
|
"loss": 0.5075,
|
|
"num_input_tokens_seen": 44657984,
|
|
"step": 14185
|
|
},
|
|
{
|
|
"epoch": 0.908392548492414,
|
|
"grad_norm": 28.186279296875,
|
|
"learning_rate": 5.076907249647122e-08,
|
|
"loss": 0.376,
|
|
"num_input_tokens_seen": 44673024,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"epoch": 0.9087126304333909,
|
|
"grad_norm": 25.26058578491211,
|
|
"learning_rate": 5.0418163295257055e-08,
|
|
"loss": 0.412,
|
|
"num_input_tokens_seen": 44687424,
|
|
"step": 14195
|
|
},
|
|
{
|
|
"epoch": 0.9090327123743679,
|
|
"grad_norm": 40.44475555419922,
|
|
"learning_rate": 5.006843965811536e-08,
|
|
"loss": 0.2867,
|
|
"num_input_tokens_seen": 44702976,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"epoch": 0.9093527943153448,
|
|
"grad_norm": 46.02883529663086,
|
|
"learning_rate": 4.971990202168008e-08,
|
|
"loss": 0.482,
|
|
"num_input_tokens_seen": 44718144,
|
|
"step": 14205
|
|
},
|
|
{
|
|
"epoch": 0.9096728762563216,
|
|
"grad_norm": 26.443368911743164,
|
|
"learning_rate": 4.9372550821104697e-08,
|
|
"loss": 0.3277,
|
|
"num_input_tokens_seen": 44734912,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"epoch": 0.9099929581972985,
|
|
"grad_norm": 20.41611671447754,
|
|
"learning_rate": 4.902638649006119e-08,
|
|
"loss": 0.311,
|
|
"num_input_tokens_seen": 44749888,
|
|
"step": 14215
|
|
},
|
|
{
|
|
"epoch": 0.9103130401382754,
|
|
"grad_norm": 19.726547241210938,
|
|
"learning_rate": 4.868140946073973e-08,
|
|
"loss": 0.3201,
|
|
"num_input_tokens_seen": 44764544,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"epoch": 0.9106331220792523,
|
|
"grad_norm": 32.19831848144531,
|
|
"learning_rate": 4.833762016384857e-08,
|
|
"loss": 0.2995,
|
|
"num_input_tokens_seen": 44780992,
|
|
"step": 14225
|
|
},
|
|
{
|
|
"epoch": 0.9109532040202292,
|
|
"grad_norm": 50.0634880065918,
|
|
"learning_rate": 4.799501902861214e-08,
|
|
"loss": 0.3879,
|
|
"num_input_tokens_seen": 44796672,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"epoch": 0.911273285961206,
|
|
"grad_norm": 44.15312957763672,
|
|
"learning_rate": 4.765360648277217e-08,
|
|
"loss": 0.4313,
|
|
"num_input_tokens_seen": 44812224,
|
|
"step": 14235
|
|
},
|
|
{
|
|
"epoch": 0.9115933679021829,
|
|
"grad_norm": 38.931339263916016,
|
|
"learning_rate": 4.7313382952586465e-08,
|
|
"loss": 0.4254,
|
|
"num_input_tokens_seen": 44827136,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"epoch": 0.9119134498431598,
|
|
"grad_norm": 16.312923431396484,
|
|
"learning_rate": 4.6974348862828027e-08,
|
|
"loss": 0.3787,
|
|
"num_input_tokens_seen": 44842176,
|
|
"step": 14245
|
|
},
|
|
{
|
|
"epoch": 0.9122335317841367,
|
|
"grad_norm": 47.28225326538086,
|
|
"learning_rate": 4.663650463678448e-08,
|
|
"loss": 0.4211,
|
|
"num_input_tokens_seen": 44858880,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"epoch": 0.9125536137251137,
|
|
"grad_norm": 21.42548942565918,
|
|
"learning_rate": 4.629985069625875e-08,
|
|
"loss": 0.4399,
|
|
"num_input_tokens_seen": 44875328,
|
|
"step": 14255
|
|
},
|
|
{
|
|
"epoch": 0.9128736956660906,
|
|
"grad_norm": 41.41118240356445,
|
|
"learning_rate": 4.596438746156728e-08,
|
|
"loss": 0.3625,
|
|
"num_input_tokens_seen": 44892032,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"epoch": 0.9131937776070674,
|
|
"grad_norm": 35.68510818481445,
|
|
"learning_rate": 4.563011535153949e-08,
|
|
"loss": 0.3618,
|
|
"num_input_tokens_seen": 44907328,
|
|
"step": 14265
|
|
},
|
|
{
|
|
"epoch": 0.9135138595480443,
|
|
"grad_norm": 26.231754302978516,
|
|
"learning_rate": 4.52970347835181e-08,
|
|
"loss": 0.2686,
|
|
"num_input_tokens_seen": 44922560,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"epoch": 0.9138339414890212,
|
|
"grad_norm": 34.4133186340332,
|
|
"learning_rate": 4.496514617335845e-08,
|
|
"loss": 0.3256,
|
|
"num_input_tokens_seen": 44937728,
|
|
"step": 14275
|
|
},
|
|
{
|
|
"epoch": 0.9141540234299981,
|
|
"grad_norm": 42.511531829833984,
|
|
"learning_rate": 4.4634449935427197e-08,
|
|
"loss": 0.3568,
|
|
"num_input_tokens_seen": 44954560,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"epoch": 0.914474105370975,
|
|
"grad_norm": 28.035154342651367,
|
|
"learning_rate": 4.430494648260219e-08,
|
|
"loss": 0.3032,
|
|
"num_input_tokens_seen": 44971520,
|
|
"step": 14285
|
|
},
|
|
{
|
|
"epoch": 0.9147941873119518,
|
|
"grad_norm": 35.39820098876953,
|
|
"learning_rate": 4.397663622627279e-08,
|
|
"loss": 0.4391,
|
|
"num_input_tokens_seen": 44987392,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"epoch": 0.9151142692529287,
|
|
"grad_norm": 25.651020050048828,
|
|
"learning_rate": 4.364951957633789e-08,
|
|
"loss": 0.3116,
|
|
"num_input_tokens_seen": 45002688,
|
|
"step": 14295
|
|
},
|
|
{
|
|
"epoch": 0.9154343511939056,
|
|
"grad_norm": 29.278078079223633,
|
|
"learning_rate": 4.332359694120669e-08,
|
|
"loss": 0.2874,
|
|
"num_input_tokens_seen": 45017792,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"epoch": 0.9157544331348826,
|
|
"grad_norm": 33.1219482421875,
|
|
"learning_rate": 4.299886872779734e-08,
|
|
"loss": 0.3561,
|
|
"num_input_tokens_seen": 45032640,
|
|
"step": 14305
|
|
},
|
|
{
|
|
"epoch": 0.9160745150758595,
|
|
"grad_norm": 29.479825973510742,
|
|
"learning_rate": 4.267533534153678e-08,
|
|
"loss": 0.2945,
|
|
"num_input_tokens_seen": 45048256,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"epoch": 0.9163945970168363,
|
|
"grad_norm": 26.894004821777344,
|
|
"learning_rate": 4.2352997186360316e-08,
|
|
"loss": 0.3251,
|
|
"num_input_tokens_seen": 45064192,
|
|
"step": 14315
|
|
},
|
|
{
|
|
"epoch": 0.9167146789578132,
|
|
"grad_norm": 19.898136138916016,
|
|
"learning_rate": 4.203185466471082e-08,
|
|
"loss": 0.321,
|
|
"num_input_tokens_seen": 45079488,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"epoch": 0.9170347608987901,
|
|
"grad_norm": 20.337265014648438,
|
|
"learning_rate": 4.1711908177538556e-08,
|
|
"loss": 0.3791,
|
|
"num_input_tokens_seen": 45095616,
|
|
"step": 14325
|
|
},
|
|
{
|
|
"epoch": 0.917354842839767,
|
|
"grad_norm": 45.242088317871094,
|
|
"learning_rate": 4.139315812430055e-08,
|
|
"loss": 0.3797,
|
|
"num_input_tokens_seen": 45110592,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"epoch": 0.9176749247807439,
|
|
"grad_norm": 29.204076766967773,
|
|
"learning_rate": 4.1075604902959915e-08,
|
|
"loss": 0.3756,
|
|
"num_input_tokens_seen": 45127168,
|
|
"step": 14335
|
|
},
|
|
{
|
|
"epoch": 0.9179950067217207,
|
|
"grad_norm": 31.663959503173828,
|
|
"learning_rate": 4.07592489099855e-08,
|
|
"loss": 0.3157,
|
|
"num_input_tokens_seen": 45142208,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"epoch": 0.9183150886626976,
|
|
"grad_norm": 38.191898345947266,
|
|
"learning_rate": 4.044409054035147e-08,
|
|
"loss": 0.3917,
|
|
"num_input_tokens_seen": 45157184,
|
|
"step": 14345
|
|
},
|
|
{
|
|
"epoch": 0.9186351706036745,
|
|
"grad_norm": 15.774205207824707,
|
|
"learning_rate": 4.0130130187537195e-08,
|
|
"loss": 0.3891,
|
|
"num_input_tokens_seen": 45174464,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"epoch": 0.9189552525446514,
|
|
"grad_norm": 36.91510772705078,
|
|
"learning_rate": 3.981736824352522e-08,
|
|
"loss": 0.3157,
|
|
"num_input_tokens_seen": 45188992,
|
|
"step": 14355
|
|
},
|
|
{
|
|
"epoch": 0.9192753344856284,
|
|
"grad_norm": 32.23750305175781,
|
|
"learning_rate": 3.950580509880286e-08,
|
|
"loss": 0.4661,
|
|
"num_input_tokens_seen": 45204032,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"epoch": 0.9195954164266052,
|
|
"grad_norm": 46.32685089111328,
|
|
"learning_rate": 3.9195441142360066e-08,
|
|
"loss": 0.4012,
|
|
"num_input_tokens_seen": 45219328,
|
|
"step": 14365
|
|
},
|
|
{
|
|
"epoch": 0.9199154983675821,
|
|
"grad_norm": 23.546079635620117,
|
|
"learning_rate": 3.888627676169043e-08,
|
|
"loss": 0.3271,
|
|
"num_input_tokens_seen": 45235584,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"epoch": 0.920235580308559,
|
|
"grad_norm": 39.16623306274414,
|
|
"learning_rate": 3.857831234278886e-08,
|
|
"loss": 0.3709,
|
|
"num_input_tokens_seen": 45250880,
|
|
"step": 14375
|
|
},
|
|
{
|
|
"epoch": 0.9205556622495359,
|
|
"grad_norm": 31.843650817871094,
|
|
"learning_rate": 3.827154827015255e-08,
|
|
"loss": 0.4085,
|
|
"num_input_tokens_seen": 45266752,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"epoch": 0.9208757441905128,
|
|
"grad_norm": 12.346802711486816,
|
|
"learning_rate": 3.7965984926780383e-08,
|
|
"loss": 0.2914,
|
|
"num_input_tokens_seen": 45282496,
|
|
"step": 14385
|
|
},
|
|
{
|
|
"epoch": 0.9211958261314896,
|
|
"grad_norm": 41.83573532104492,
|
|
"learning_rate": 3.766162269417139e-08,
|
|
"loss": 0.3577,
|
|
"num_input_tokens_seen": 45297024,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"epoch": 0.9215159080724665,
|
|
"grad_norm": 45.033992767333984,
|
|
"learning_rate": 3.73584619523255e-08,
|
|
"loss": 0.3693,
|
|
"num_input_tokens_seen": 45314176,
|
|
"step": 14395
|
|
},
|
|
{
|
|
"epoch": 0.9218359900134434,
|
|
"grad_norm": 21.012765884399414,
|
|
"learning_rate": 3.7056503079742616e-08,
|
|
"loss": 0.3557,
|
|
"num_input_tokens_seen": 45329344,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"epoch": 0.9221560719544203,
|
|
"grad_norm": 29.65179443359375,
|
|
"learning_rate": 3.6755746453421945e-08,
|
|
"loss": 0.3428,
|
|
"num_input_tokens_seen": 45344384,
|
|
"step": 14405
|
|
},
|
|
{
|
|
"epoch": 0.9224761538953972,
|
|
"grad_norm": 13.857353210449219,
|
|
"learning_rate": 3.645619244886145e-08,
|
|
"loss": 0.2869,
|
|
"num_input_tokens_seen": 45360192,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"epoch": 0.9227962358363742,
|
|
"grad_norm": 14.174830436706543,
|
|
"learning_rate": 3.615784144005796e-08,
|
|
"loss": 0.3103,
|
|
"num_input_tokens_seen": 45376000,
|
|
"step": 14415
|
|
},
|
|
{
|
|
"epoch": 0.923116317777351,
|
|
"grad_norm": 30.094505310058594,
|
|
"learning_rate": 3.5860693799506184e-08,
|
|
"loss": 0.4093,
|
|
"num_input_tokens_seen": 45390400,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"epoch": 0.9234363997183279,
|
|
"grad_norm": 29.435256958007812,
|
|
"learning_rate": 3.5564749898198466e-08,
|
|
"loss": 0.4518,
|
|
"num_input_tokens_seen": 45406976,
|
|
"step": 14425
|
|
},
|
|
{
|
|
"epoch": 0.9237564816593048,
|
|
"grad_norm": 33.67948913574219,
|
|
"learning_rate": 3.527001010562425e-08,
|
|
"loss": 0.3481,
|
|
"num_input_tokens_seen": 45422080,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"epoch": 0.9240765636002817,
|
|
"grad_norm": 52.893489837646484,
|
|
"learning_rate": 3.4976474789769504e-08,
|
|
"loss": 0.3429,
|
|
"num_input_tokens_seen": 45439296,
|
|
"step": 14435
|
|
},
|
|
{
|
|
"epoch": 0.9243966455412586,
|
|
"grad_norm": 34.073848724365234,
|
|
"learning_rate": 3.4684144317116636e-08,
|
|
"loss": 0.2983,
|
|
"num_input_tokens_seen": 45454208,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"epoch": 0.9247167274822354,
|
|
"grad_norm": 18.271291732788086,
|
|
"learning_rate": 3.439301905264369e-08,
|
|
"loss": 0.3001,
|
|
"num_input_tokens_seen": 45470400,
|
|
"step": 14445
|
|
},
|
|
{
|
|
"epoch": 0.9250368094232123,
|
|
"grad_norm": 46.16067123413086,
|
|
"learning_rate": 3.410309935982403e-08,
|
|
"loss": 0.3212,
|
|
"num_input_tokens_seen": 45486528,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"epoch": 0.9253568913641892,
|
|
"grad_norm": 17.307554244995117,
|
|
"learning_rate": 3.381438560062555e-08,
|
|
"loss": 0.3429,
|
|
"num_input_tokens_seen": 45501440,
|
|
"step": 14455
|
|
},
|
|
{
|
|
"epoch": 0.9256769733051661,
|
|
"grad_norm": 38.451210021972656,
|
|
"learning_rate": 3.3526878135511025e-08,
|
|
"loss": 0.3181,
|
|
"num_input_tokens_seen": 45517760,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"epoch": 0.9259970552461431,
|
|
"grad_norm": 48.87675094604492,
|
|
"learning_rate": 3.324057732343666e-08,
|
|
"loss": 0.3642,
|
|
"num_input_tokens_seen": 45533056,
|
|
"step": 14465
|
|
},
|
|
{
|
|
"epoch": 0.9263171371871199,
|
|
"grad_norm": 24.82399559020996,
|
|
"learning_rate": 3.295548352185262e-08,
|
|
"loss": 0.4131,
|
|
"num_input_tokens_seen": 45549248,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"epoch": 0.9266372191280968,
|
|
"grad_norm": 36.503944396972656,
|
|
"learning_rate": 3.2671597086701753e-08,
|
|
"loss": 0.3477,
|
|
"num_input_tokens_seen": 45565760,
|
|
"step": 14475
|
|
},
|
|
{
|
|
"epoch": 0.9269573010690737,
|
|
"grad_norm": 23.015771865844727,
|
|
"learning_rate": 3.238891837241964e-08,
|
|
"loss": 0.3246,
|
|
"num_input_tokens_seen": 45581568,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"epoch": 0.9272773830100506,
|
|
"grad_norm": 43.855220794677734,
|
|
"learning_rate": 3.210744773193386e-08,
|
|
"loss": 0.4038,
|
|
"num_input_tokens_seen": 45596928,
|
|
"step": 14485
|
|
},
|
|
{
|
|
"epoch": 0.9275974649510275,
|
|
"grad_norm": 45.25807189941406,
|
|
"learning_rate": 3.182718551666386e-08,
|
|
"loss": 0.2948,
|
|
"num_input_tokens_seen": 45612800,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"epoch": 0.9279175468920043,
|
|
"grad_norm": 79.2214584350586,
|
|
"learning_rate": 3.154813207652063e-08,
|
|
"loss": 0.4114,
|
|
"num_input_tokens_seen": 45627584,
|
|
"step": 14495
|
|
},
|
|
{
|
|
"epoch": 0.9282376288329812,
|
|
"grad_norm": 48.060794830322266,
|
|
"learning_rate": 3.1270287759905143e-08,
|
|
"loss": 0.3379,
|
|
"num_input_tokens_seen": 45643840,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 0.9285577107739581,
|
|
"grad_norm": 15.610395431518555,
|
|
"learning_rate": 3.0993652913709476e-08,
|
|
"loss": 0.2884,
|
|
"num_input_tokens_seen": 45659072,
|
|
"step": 14505
|
|
},
|
|
{
|
|
"epoch": 0.928877792714935,
|
|
"grad_norm": 27.879131317138672,
|
|
"learning_rate": 3.0718227883315796e-08,
|
|
"loss": 0.482,
|
|
"num_input_tokens_seen": 45675328,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"epoch": 0.9291978746559119,
|
|
"grad_norm": 39.35497283935547,
|
|
"learning_rate": 3.044401301259503e-08,
|
|
"loss": 0.368,
|
|
"num_input_tokens_seen": 45690816,
|
|
"step": 14515
|
|
},
|
|
{
|
|
"epoch": 0.9295179565968889,
|
|
"grad_norm": 15.0499267578125,
|
|
"learning_rate": 3.017100864390787e-08,
|
|
"loss": 0.3333,
|
|
"num_input_tokens_seen": 45706432,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"epoch": 0.9298380385378657,
|
|
"grad_norm": 51.364315032958984,
|
|
"learning_rate": 2.9899215118103446e-08,
|
|
"loss": 0.3446,
|
|
"num_input_tokens_seen": 45721920,
|
|
"step": 14525
|
|
},
|
|
{
|
|
"epoch": 0.9301581204788426,
|
|
"grad_norm": 15.155922889709473,
|
|
"learning_rate": 2.9628632774519435e-08,
|
|
"loss": 0.3433,
|
|
"num_input_tokens_seen": 45738048,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"epoch": 0.9304782024198195,
|
|
"grad_norm": 24.992616653442383,
|
|
"learning_rate": 2.9359261950980485e-08,
|
|
"loss": 0.3308,
|
|
"num_input_tokens_seen": 45753856,
|
|
"step": 14535
|
|
},
|
|
{
|
|
"epoch": 0.9307982843607964,
|
|
"grad_norm": 22.78838539123535,
|
|
"learning_rate": 2.90911029837998e-08,
|
|
"loss": 0.3015,
|
|
"num_input_tokens_seen": 45768704,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"epoch": 0.9311183663017732,
|
|
"grad_norm": 28.63710594177246,
|
|
"learning_rate": 2.8824156207776673e-08,
|
|
"loss": 0.2789,
|
|
"num_input_tokens_seen": 45783936,
|
|
"step": 14545
|
|
},
|
|
{
|
|
"epoch": 0.9314384482427501,
|
|
"grad_norm": 115.81269836425781,
|
|
"learning_rate": 2.8558421956197397e-08,
|
|
"loss": 0.4514,
|
|
"num_input_tokens_seen": 45800320,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"epoch": 0.931758530183727,
|
|
"grad_norm": 36.78664779663086,
|
|
"learning_rate": 2.829390056083436e-08,
|
|
"loss": 0.3864,
|
|
"num_input_tokens_seen": 45816512,
|
|
"step": 14555
|
|
},
|
|
{
|
|
"epoch": 0.9320786121247039,
|
|
"grad_norm": 21.332889556884766,
|
|
"learning_rate": 2.8030592351945492e-08,
|
|
"loss": 0.3037,
|
|
"num_input_tokens_seen": 45831936,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"epoch": 0.9323986940656808,
|
|
"grad_norm": 20.547264099121094,
|
|
"learning_rate": 2.776849765827427e-08,
|
|
"loss": 0.2968,
|
|
"num_input_tokens_seen": 45846784,
|
|
"step": 14565
|
|
},
|
|
{
|
|
"epoch": 0.9327187760066578,
|
|
"grad_norm": 39.512290954589844,
|
|
"learning_rate": 2.750761680704905e-08,
|
|
"loss": 0.4282,
|
|
"num_input_tokens_seen": 45862080,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"epoch": 0.9330388579476346,
|
|
"grad_norm": 40.28529357910156,
|
|
"learning_rate": 2.724795012398251e-08,
|
|
"loss": 0.3937,
|
|
"num_input_tokens_seen": 45878528,
|
|
"step": 14575
|
|
},
|
|
{
|
|
"epoch": 0.9333589398886115,
|
|
"grad_norm": 36.721534729003906,
|
|
"learning_rate": 2.6989497933271543e-08,
|
|
"loss": 0.3737,
|
|
"num_input_tokens_seen": 45894016,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"epoch": 0.9336790218295884,
|
|
"grad_norm": 18.749881744384766,
|
|
"learning_rate": 2.673226055759692e-08,
|
|
"loss": 0.3295,
|
|
"num_input_tokens_seen": 45909504,
|
|
"step": 14585
|
|
},
|
|
{
|
|
"epoch": 0.9339991037705653,
|
|
"grad_norm": 31.62596321105957,
|
|
"learning_rate": 2.6476238318122402e-08,
|
|
"loss": 0.338,
|
|
"num_input_tokens_seen": 45925376,
|
|
"step": 14590
|
|
},
|
|
{
|
|
"epoch": 0.9343191857115422,
|
|
"grad_norm": 33.345306396484375,
|
|
"learning_rate": 2.6221431534494742e-08,
|
|
"loss": 0.3956,
|
|
"num_input_tokens_seen": 45940224,
|
|
"step": 14595
|
|
},
|
|
{
|
|
"epoch": 0.934639267652519,
|
|
"grad_norm": 57.66178894042969,
|
|
"learning_rate": 2.5967840524843243e-08,
|
|
"loss": 0.3521,
|
|
"num_input_tokens_seen": 45955072,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"epoch": 0.9349593495934959,
|
|
"grad_norm": 33.97639846801758,
|
|
"learning_rate": 2.5715465605779195e-08,
|
|
"loss": 0.4287,
|
|
"num_input_tokens_seen": 45970240,
|
|
"step": 14605
|
|
},
|
|
{
|
|
"epoch": 0.9352794315344728,
|
|
"grad_norm": 83.71870422363281,
|
|
"learning_rate": 2.5464307092395777e-08,
|
|
"loss": 0.406,
|
|
"num_input_tokens_seen": 45985856,
|
|
"step": 14610
|
|
},
|
|
{
|
|
"epoch": 0.9355995134754497,
|
|
"grad_norm": 20.36864471435547,
|
|
"learning_rate": 2.5214365298267148e-08,
|
|
"loss": 0.3398,
|
|
"num_input_tokens_seen": 46000256,
|
|
"step": 14615
|
|
},
|
|
{
|
|
"epoch": 0.9359195954164266,
|
|
"grad_norm": 26.265127182006836,
|
|
"learning_rate": 2.4965640535448917e-08,
|
|
"loss": 0.32,
|
|
"num_input_tokens_seen": 46015616,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"epoch": 0.9362396773574035,
|
|
"grad_norm": 32.42552185058594,
|
|
"learning_rate": 2.471813311447657e-08,
|
|
"loss": 0.3741,
|
|
"num_input_tokens_seen": 46031040,
|
|
"step": 14625
|
|
},
|
|
{
|
|
"epoch": 0.9365597592983804,
|
|
"grad_norm": 37.86249542236328,
|
|
"learning_rate": 2.4471843344365915e-08,
|
|
"loss": 0.3304,
|
|
"num_input_tokens_seen": 46046016,
|
|
"step": 14630
|
|
},
|
|
{
|
|
"epoch": 0.9368798412393573,
|
|
"grad_norm": 17.967323303222656,
|
|
"learning_rate": 2.42267715326131e-08,
|
|
"loss": 0.2715,
|
|
"num_input_tokens_seen": 46062528,
|
|
"step": 14635
|
|
},
|
|
{
|
|
"epoch": 0.9371999231803342,
|
|
"grad_norm": 31.25685691833496,
|
|
"learning_rate": 2.3982917985192697e-08,
|
|
"loss": 0.3426,
|
|
"num_input_tokens_seen": 46078144,
|
|
"step": 14640
|
|
},
|
|
{
|
|
"epoch": 0.9375200051213111,
|
|
"grad_norm": 53.25637435913086,
|
|
"learning_rate": 2.3740283006558838e-08,
|
|
"loss": 0.3748,
|
|
"num_input_tokens_seen": 46096896,
|
|
"step": 14645
|
|
},
|
|
{
|
|
"epoch": 0.9378400870622879,
|
|
"grad_norm": 47.64904022216797,
|
|
"learning_rate": 2.349886689964431e-08,
|
|
"loss": 0.3715,
|
|
"num_input_tokens_seen": 46111808,
|
|
"step": 14650
|
|
},
|
|
{
|
|
"epoch": 0.9381601690032648,
|
|
"grad_norm": 36.294498443603516,
|
|
"learning_rate": 2.32586699658599e-08,
|
|
"loss": 0.2804,
|
|
"num_input_tokens_seen": 46127936,
|
|
"step": 14655
|
|
},
|
|
{
|
|
"epoch": 0.9384802509442417,
|
|
"grad_norm": 22.60685920715332,
|
|
"learning_rate": 2.3019692505094056e-08,
|
|
"loss": 0.3522,
|
|
"num_input_tokens_seen": 46142848,
|
|
"step": 14660
|
|
},
|
|
{
|
|
"epoch": 0.9388003328852186,
|
|
"grad_norm": 51.22877502441406,
|
|
"learning_rate": 2.2781934815713223e-08,
|
|
"loss": 0.5364,
|
|
"num_input_tokens_seen": 46158848,
|
|
"step": 14665
|
|
},
|
|
{
|
|
"epoch": 0.9391204148261955,
|
|
"grad_norm": 28.425065994262695,
|
|
"learning_rate": 2.254539719456061e-08,
|
|
"loss": 0.3566,
|
|
"num_input_tokens_seen": 46174912,
|
|
"step": 14670
|
|
},
|
|
{
|
|
"epoch": 0.9394404967671725,
|
|
"grad_norm": 19.683509826660156,
|
|
"learning_rate": 2.231007993695633e-08,
|
|
"loss": 0.2587,
|
|
"num_input_tokens_seen": 46189248,
|
|
"step": 14675
|
|
},
|
|
{
|
|
"epoch": 0.9397605787081493,
|
|
"grad_norm": 19.60419273376465,
|
|
"learning_rate": 2.2075983336696357e-08,
|
|
"loss": 0.314,
|
|
"num_input_tokens_seen": 46204928,
|
|
"step": 14680
|
|
},
|
|
{
|
|
"epoch": 0.9400806606491262,
|
|
"grad_norm": 40.57781982421875,
|
|
"learning_rate": 2.1843107686053353e-08,
|
|
"loss": 0.3916,
|
|
"num_input_tokens_seen": 46220160,
|
|
"step": 14685
|
|
},
|
|
{
|
|
"epoch": 0.9404007425901031,
|
|
"grad_norm": 24.233959197998047,
|
|
"learning_rate": 2.1611453275775405e-08,
|
|
"loss": 0.4249,
|
|
"num_input_tokens_seen": 46235584,
|
|
"step": 14690
|
|
},
|
|
{
|
|
"epoch": 0.94072082453108,
|
|
"grad_norm": 20.722745895385742,
|
|
"learning_rate": 2.138102039508538e-08,
|
|
"loss": 0.2691,
|
|
"num_input_tokens_seen": 46251904,
|
|
"step": 14695
|
|
},
|
|
{
|
|
"epoch": 0.9410409064720568,
|
|
"grad_norm": 43.360191345214844,
|
|
"learning_rate": 2.1151809331681703e-08,
|
|
"loss": 0.3948,
|
|
"num_input_tokens_seen": 46268032,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"epoch": 0.9413609884130337,
|
|
"grad_norm": 54.16123962402344,
|
|
"learning_rate": 2.092382037173701e-08,
|
|
"loss": 0.3362,
|
|
"num_input_tokens_seen": 46283392,
|
|
"step": 14705
|
|
},
|
|
{
|
|
"epoch": 0.9416810703540106,
|
|
"grad_norm": 26.91010856628418,
|
|
"learning_rate": 2.0697053799898277e-08,
|
|
"loss": 0.2966,
|
|
"num_input_tokens_seen": 46298752,
|
|
"step": 14710
|
|
},
|
|
{
|
|
"epoch": 0.9420011522949875,
|
|
"grad_norm": 29.30316734313965,
|
|
"learning_rate": 2.0471509899286144e-08,
|
|
"loss": 0.3392,
|
|
"num_input_tokens_seen": 46314624,
|
|
"step": 14715
|
|
},
|
|
{
|
|
"epoch": 0.9423212342359644,
|
|
"grad_norm": 25.85833740234375,
|
|
"learning_rate": 2.0247188951494797e-08,
|
|
"loss": 0.3403,
|
|
"num_input_tokens_seen": 46331712,
|
|
"step": 14720
|
|
},
|
|
{
|
|
"epoch": 0.9426413161769412,
|
|
"grad_norm": 49.84812927246094,
|
|
"learning_rate": 2.0024091236591655e-08,
|
|
"loss": 0.5398,
|
|
"num_input_tokens_seen": 46347200,
|
|
"step": 14725
|
|
},
|
|
{
|
|
"epoch": 0.9429613981179182,
|
|
"grad_norm": 17.558185577392578,
|
|
"learning_rate": 1.98022170331168e-08,
|
|
"loss": 0.3166,
|
|
"num_input_tokens_seen": 46363008,
|
|
"step": 14730
|
|
},
|
|
{
|
|
"epoch": 0.9432814800588951,
|
|
"grad_norm": 32.16617202758789,
|
|
"learning_rate": 1.9581566618082744e-08,
|
|
"loss": 0.3797,
|
|
"num_input_tokens_seen": 46378816,
|
|
"step": 14735
|
|
},
|
|
{
|
|
"epoch": 0.943601561999872,
|
|
"grad_norm": 57.684410095214844,
|
|
"learning_rate": 1.9362140266974025e-08,
|
|
"loss": 0.3915,
|
|
"num_input_tokens_seen": 46395200,
|
|
"step": 14740
|
|
},
|
|
{
|
|
"epoch": 0.9439216439408489,
|
|
"grad_norm": 53.940555572509766,
|
|
"learning_rate": 1.9143938253747383e-08,
|
|
"loss": 0.3198,
|
|
"num_input_tokens_seen": 46411840,
|
|
"step": 14745
|
|
},
|
|
{
|
|
"epoch": 0.9442417258818258,
|
|
"grad_norm": 25.7904109954834,
|
|
"learning_rate": 1.892696085083023e-08,
|
|
"loss": 0.4515,
|
|
"num_input_tokens_seen": 46427776,
|
|
"step": 14750
|
|
},
|
|
{
|
|
"epoch": 0.9445618078228026,
|
|
"grad_norm": 36.919376373291016,
|
|
"learning_rate": 1.8711208329121542e-08,
|
|
"loss": 0.3118,
|
|
"num_input_tokens_seen": 46444736,
|
|
"step": 14755
|
|
},
|
|
{
|
|
"epoch": 0.9448818897637795,
|
|
"grad_norm": 26.23403549194336,
|
|
"learning_rate": 1.849668095799084e-08,
|
|
"loss": 0.3325,
|
|
"num_input_tokens_seen": 46460672,
|
|
"step": 14760
|
|
},
|
|
{
|
|
"epoch": 0.9452019717047564,
|
|
"grad_norm": 24.87689781188965,
|
|
"learning_rate": 1.8283379005278098e-08,
|
|
"loss": 0.3344,
|
|
"num_input_tokens_seen": 46476736,
|
|
"step": 14765
|
|
},
|
|
{
|
|
"epoch": 0.9455220536457333,
|
|
"grad_norm": 13.15492057800293,
|
|
"learning_rate": 1.807130273729329e-08,
|
|
"loss": 0.3231,
|
|
"num_input_tokens_seen": 46492416,
|
|
"step": 14770
|
|
},
|
|
{
|
|
"epoch": 0.9458421355867102,
|
|
"grad_norm": 36.111331939697266,
|
|
"learning_rate": 1.7860452418816173e-08,
|
|
"loss": 0.3349,
|
|
"num_input_tokens_seen": 46507264,
|
|
"step": 14775
|
|
},
|
|
{
|
|
"epoch": 0.946162217527687,
|
|
"grad_norm": 28.380617141723633,
|
|
"learning_rate": 1.7650828313095834e-08,
|
|
"loss": 0.3288,
|
|
"num_input_tokens_seen": 46524224,
|
|
"step": 14780
|
|
},
|
|
{
|
|
"epoch": 0.946482299468664,
|
|
"grad_norm": 14.132955551147461,
|
|
"learning_rate": 1.7442430681850362e-08,
|
|
"loss": 0.3101,
|
|
"num_input_tokens_seen": 46539456,
|
|
"step": 14785
|
|
},
|
|
{
|
|
"epoch": 0.9468023814096409,
|
|
"grad_norm": 38.144737243652344,
|
|
"learning_rate": 1.723525978526652e-08,
|
|
"loss": 0.4302,
|
|
"num_input_tokens_seen": 46555136,
|
|
"step": 14790
|
|
},
|
|
{
|
|
"epoch": 0.9471224633506178,
|
|
"grad_norm": 27.17024040222168,
|
|
"learning_rate": 1.702931588199996e-08,
|
|
"loss": 0.3501,
|
|
"num_input_tokens_seen": 46570432,
|
|
"step": 14795
|
|
},
|
|
{
|
|
"epoch": 0.9474425452915947,
|
|
"grad_norm": 30.944738388061523,
|
|
"learning_rate": 1.6824599229173897e-08,
|
|
"loss": 0.3115,
|
|
"num_input_tokens_seen": 46586304,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"epoch": 0.9477626272325715,
|
|
"grad_norm": 33.253997802734375,
|
|
"learning_rate": 1.662111008237932e-08,
|
|
"loss": 0.2909,
|
|
"num_input_tokens_seen": 46602432,
|
|
"step": 14805
|
|
},
|
|
{
|
|
"epoch": 0.9480827091735484,
|
|
"grad_norm": 33.023921966552734,
|
|
"learning_rate": 1.6418848695675003e-08,
|
|
"loss": 0.3218,
|
|
"num_input_tokens_seen": 46617472,
|
|
"step": 14810
|
|
},
|
|
{
|
|
"epoch": 0.9484027911145253,
|
|
"grad_norm": 35.12213897705078,
|
|
"learning_rate": 1.6217815321586614e-08,
|
|
"loss": 0.372,
|
|
"num_input_tokens_seen": 46632896,
|
|
"step": 14815
|
|
},
|
|
{
|
|
"epoch": 0.9487228730555022,
|
|
"grad_norm": 18.142263412475586,
|
|
"learning_rate": 1.6018010211106602e-08,
|
|
"loss": 0.355,
|
|
"num_input_tokens_seen": 46649408,
|
|
"step": 14820
|
|
},
|
|
{
|
|
"epoch": 0.9490429549964791,
|
|
"grad_norm": 16.464832305908203,
|
|
"learning_rate": 1.58194336136942e-08,
|
|
"loss": 0.2816,
|
|
"num_input_tokens_seen": 46665344,
|
|
"step": 14825
|
|
},
|
|
{
|
|
"epoch": 0.9493630369374559,
|
|
"grad_norm": 36.46229934692383,
|
|
"learning_rate": 1.5622085777274417e-08,
|
|
"loss": 0.4274,
|
|
"num_input_tokens_seen": 46680704,
|
|
"step": 14830
|
|
},
|
|
{
|
|
"epoch": 0.9496831188784329,
|
|
"grad_norm": 39.555789947509766,
|
|
"learning_rate": 1.542596694823839e-08,
|
|
"loss": 0.3333,
|
|
"num_input_tokens_seen": 46695936,
|
|
"step": 14835
|
|
},
|
|
{
|
|
"epoch": 0.9500032008194098,
|
|
"grad_norm": 54.21735382080078,
|
|
"learning_rate": 1.5231077371442914e-08,
|
|
"loss": 0.4259,
|
|
"num_input_tokens_seen": 46711680,
|
|
"step": 14840
|
|
},
|
|
{
|
|
"epoch": 0.9503232827603867,
|
|
"grad_norm": 24.265138626098633,
|
|
"learning_rate": 1.5037417290209685e-08,
|
|
"loss": 0.2888,
|
|
"num_input_tokens_seen": 46727040,
|
|
"step": 14845
|
|
},
|
|
{
|
|
"epoch": 0.9506433647013636,
|
|
"grad_norm": 37.78664779663086,
|
|
"learning_rate": 1.4844986946325743e-08,
|
|
"loss": 0.393,
|
|
"num_input_tokens_seen": 46742720,
|
|
"step": 14850
|
|
},
|
|
{
|
|
"epoch": 0.9509634466423404,
|
|
"grad_norm": 23.887489318847656,
|
|
"learning_rate": 1.4653786580042681e-08,
|
|
"loss": 0.2502,
|
|
"num_input_tokens_seen": 46758336,
|
|
"step": 14855
|
|
},
|
|
{
|
|
"epoch": 0.9511554958069266,
|
|
"eval_loss": 0.3537425398826599,
|
|
"eval_runtime": 49.1421,
|
|
"eval_samples_per_second": 282.568,
|
|
"eval_steps_per_second": 35.326,
|
|
"num_input_tokens_seen": 46767552,
|
|
"step": 14858
|
|
},
|
|
{
|
|
"epoch": 0.9512835285833173,
|
|
"grad_norm": 22.978870391845703,
|
|
"learning_rate": 1.4463816430076215e-08,
|
|
"loss": 0.3108,
|
|
"num_input_tokens_seen": 46773312,
|
|
"step": 14860
|
|
},
|
|
{
|
|
"epoch": 0.9516036105242942,
|
|
"grad_norm": 39.241058349609375,
|
|
"learning_rate": 1.4275076733606395e-08,
|
|
"loss": 0.3685,
|
|
"num_input_tokens_seen": 46787968,
|
|
"step": 14865
|
|
},
|
|
{
|
|
"epoch": 0.9519236924652711,
|
|
"grad_norm": 24.853103637695312,
|
|
"learning_rate": 1.4087567726277061e-08,
|
|
"loss": 0.2913,
|
|
"num_input_tokens_seen": 46803712,
|
|
"step": 14870
|
|
},
|
|
{
|
|
"epoch": 0.952243774406248,
|
|
"grad_norm": 28.337535858154297,
|
|
"learning_rate": 1.390128964219528e-08,
|
|
"loss": 0.2789,
|
|
"num_input_tokens_seen": 46820288,
|
|
"step": 14875
|
|
},
|
|
{
|
|
"epoch": 0.9525638563472248,
|
|
"grad_norm": 45.00613784790039,
|
|
"learning_rate": 1.3716242713931348e-08,
|
|
"loss": 0.3819,
|
|
"num_input_tokens_seen": 46835904,
|
|
"step": 14880
|
|
},
|
|
{
|
|
"epoch": 0.9528839382882017,
|
|
"grad_norm": 27.987937927246094,
|
|
"learning_rate": 1.3532427172518789e-08,
|
|
"loss": 0.3714,
|
|
"num_input_tokens_seen": 46851136,
|
|
"step": 14885
|
|
},
|
|
{
|
|
"epoch": 0.9532040202291787,
|
|
"grad_norm": 34.979331970214844,
|
|
"learning_rate": 1.3349843247453252e-08,
|
|
"loss": 0.3343,
|
|
"num_input_tokens_seen": 46867456,
|
|
"step": 14890
|
|
},
|
|
{
|
|
"epoch": 0.9535241021701556,
|
|
"grad_norm": 26.81144905090332,
|
|
"learning_rate": 1.3168491166692941e-08,
|
|
"loss": 0.2772,
|
|
"num_input_tokens_seen": 46882816,
|
|
"step": 14895
|
|
},
|
|
{
|
|
"epoch": 0.9538441841111325,
|
|
"grad_norm": 40.77924728393555,
|
|
"learning_rate": 1.2988371156658073e-08,
|
|
"loss": 0.4506,
|
|
"num_input_tokens_seen": 46898624,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"epoch": 0.9541642660521094,
|
|
"grad_norm": 28.05156135559082,
|
|
"learning_rate": 1.2809483442230763e-08,
|
|
"loss": 0.282,
|
|
"num_input_tokens_seen": 46914304,
|
|
"step": 14905
|
|
},
|
|
{
|
|
"epoch": 0.9544843479930862,
|
|
"grad_norm": 21.98477554321289,
|
|
"learning_rate": 1.2631828246754128e-08,
|
|
"loss": 0.3705,
|
|
"num_input_tokens_seen": 46930368,
|
|
"step": 14910
|
|
},
|
|
{
|
|
"epoch": 0.9548044299340631,
|
|
"grad_norm": 38.7076301574707,
|
|
"learning_rate": 1.2455405792032969e-08,
|
|
"loss": 0.364,
|
|
"num_input_tokens_seen": 46945792,
|
|
"step": 14915
|
|
},
|
|
{
|
|
"epoch": 0.95512451187504,
|
|
"grad_norm": 32.54359817504883,
|
|
"learning_rate": 1.2280216298332646e-08,
|
|
"loss": 0.342,
|
|
"num_input_tokens_seen": 46962048,
|
|
"step": 14920
|
|
},
|
|
{
|
|
"epoch": 0.9554445938160169,
|
|
"grad_norm": 53.13780212402344,
|
|
"learning_rate": 1.2106259984379642e-08,
|
|
"loss": 0.4603,
|
|
"num_input_tokens_seen": 46976768,
|
|
"step": 14925
|
|
},
|
|
{
|
|
"epoch": 0.9557646757569938,
|
|
"grad_norm": 45.00946807861328,
|
|
"learning_rate": 1.1933537067359889e-08,
|
|
"loss": 0.4141,
|
|
"num_input_tokens_seen": 46991424,
|
|
"step": 14930
|
|
},
|
|
{
|
|
"epoch": 0.9560847576979706,
|
|
"grad_norm": 24.874343872070312,
|
|
"learning_rate": 1.1762047762920446e-08,
|
|
"loss": 0.3607,
|
|
"num_input_tokens_seen": 47006656,
|
|
"step": 14935
|
|
},
|
|
{
|
|
"epoch": 0.9564048396389476,
|
|
"grad_norm": 51.970680236816406,
|
|
"learning_rate": 1.1591792285167602e-08,
|
|
"loss": 0.3576,
|
|
"num_input_tokens_seen": 47021824,
|
|
"step": 14940
|
|
},
|
|
{
|
|
"epoch": 0.9567249215799245,
|
|
"grad_norm": 29.96383285522461,
|
|
"learning_rate": 1.1422770846667206e-08,
|
|
"loss": 0.3907,
|
|
"num_input_tokens_seen": 47037440,
|
|
"step": 14945
|
|
},
|
|
{
|
|
"epoch": 0.9570450035209014,
|
|
"grad_norm": 19.72380256652832,
|
|
"learning_rate": 1.1254983658444572e-08,
|
|
"loss": 0.307,
|
|
"num_input_tokens_seen": 47053760,
|
|
"step": 14950
|
|
},
|
|
{
|
|
"epoch": 0.9573650854618783,
|
|
"grad_norm": 46.794639587402344,
|
|
"learning_rate": 1.1088430929984017e-08,
|
|
"loss": 0.3148,
|
|
"num_input_tokens_seen": 47068928,
|
|
"step": 14955
|
|
},
|
|
{
|
|
"epoch": 0.9576851674028551,
|
|
"grad_norm": 37.3883056640625,
|
|
"learning_rate": 1.0923112869228645e-08,
|
|
"loss": 0.383,
|
|
"num_input_tokens_seen": 47084672,
|
|
"step": 14960
|
|
},
|
|
{
|
|
"epoch": 0.958005249343832,
|
|
"grad_norm": 41.08680725097656,
|
|
"learning_rate": 1.0759029682579801e-08,
|
|
"loss": 0.3613,
|
|
"num_input_tokens_seen": 47101632,
|
|
"step": 14965
|
|
},
|
|
{
|
|
"epoch": 0.9583253312848089,
|
|
"grad_norm": 24.6757755279541,
|
|
"learning_rate": 1.0596181574897389e-08,
|
|
"loss": 0.306,
|
|
"num_input_tokens_seen": 47116480,
|
|
"step": 14970
|
|
},
|
|
{
|
|
"epoch": 0.9586454132257858,
|
|
"grad_norm": 29.715951919555664,
|
|
"learning_rate": 1.0434568749499107e-08,
|
|
"loss": 0.3155,
|
|
"num_input_tokens_seen": 47132992,
|
|
"step": 14975
|
|
},
|
|
{
|
|
"epoch": 0.9589654951667627,
|
|
"grad_norm": 26.07288932800293,
|
|
"learning_rate": 1.027419140816066e-08,
|
|
"loss": 0.3061,
|
|
"num_input_tokens_seen": 47149056,
|
|
"step": 14980
|
|
},
|
|
{
|
|
"epoch": 0.9592855771077395,
|
|
"grad_norm": 23.639156341552734,
|
|
"learning_rate": 1.0115049751114768e-08,
|
|
"loss": 0.2984,
|
|
"num_input_tokens_seen": 47164864,
|
|
"step": 14985
|
|
},
|
|
{
|
|
"epoch": 0.9596056590487164,
|
|
"grad_norm": 18.913105010986328,
|
|
"learning_rate": 9.957143977051941e-09,
|
|
"loss": 0.3481,
|
|
"num_input_tokens_seen": 47180544,
|
|
"step": 14990
|
|
},
|
|
{
|
|
"epoch": 0.9599257409896934,
|
|
"grad_norm": 29.4930362701416,
|
|
"learning_rate": 9.800474283119142e-09,
|
|
"loss": 0.3836,
|
|
"num_input_tokens_seen": 47196608,
|
|
"step": 14995
|
|
},
|
|
{
|
|
"epoch": 0.9602458229306703,
|
|
"grad_norm": 26.606163024902344,
|
|
"learning_rate": 9.645040864920462e-09,
|
|
"loss": 0.3701,
|
|
"num_input_tokens_seen": 47213504,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 0.9605659048716472,
|
|
"grad_norm": 32.366455078125,
|
|
"learning_rate": 9.490843916516334e-09,
|
|
"loss": 0.4056,
|
|
"num_input_tokens_seen": 47228288,
|
|
"step": 15005
|
|
},
|
|
{
|
|
"epoch": 0.960885986812624,
|
|
"grad_norm": 25.494123458862305,
|
|
"learning_rate": 9.337883630423316e-09,
|
|
"loss": 0.4448,
|
|
"num_input_tokens_seen": 47243712,
|
|
"step": 15010
|
|
},
|
|
{
|
|
"epoch": 0.9612060687536009,
|
|
"grad_norm": 50.839359283447266,
|
|
"learning_rate": 9.186160197614423e-09,
|
|
"loss": 0.4909,
|
|
"num_input_tokens_seen": 47259904,
|
|
"step": 15015
|
|
},
|
|
{
|
|
"epoch": 0.9615261506945778,
|
|
"grad_norm": 33.710933685302734,
|
|
"learning_rate": 9.035673807517795e-09,
|
|
"loss": 0.4837,
|
|
"num_input_tokens_seen": 47275072,
|
|
"step": 15020
|
|
},
|
|
{
|
|
"epoch": 0.9618462326355547,
|
|
"grad_norm": 42.61496353149414,
|
|
"learning_rate": 8.886424648017698e-09,
|
|
"loss": 0.27,
|
|
"num_input_tokens_seen": 47290688,
|
|
"step": 15025
|
|
},
|
|
{
|
|
"epoch": 0.9621663145765316,
|
|
"grad_norm": 18.92186737060547,
|
|
"learning_rate": 8.738412905453408e-09,
|
|
"loss": 0.3408,
|
|
"num_input_tokens_seen": 47306496,
|
|
"step": 15030
|
|
},
|
|
{
|
|
"epoch": 0.9624863965175084,
|
|
"grad_norm": 29.760217666625977,
|
|
"learning_rate": 8.591638764619324e-09,
|
|
"loss": 0.3575,
|
|
"num_input_tokens_seen": 47321280,
|
|
"step": 15035
|
|
},
|
|
{
|
|
"epoch": 0.9628064784584853,
|
|
"grad_norm": 45.232330322265625,
|
|
"learning_rate": 8.446102408764643e-09,
|
|
"loss": 0.3623,
|
|
"num_input_tokens_seen": 47337536,
|
|
"step": 15040
|
|
},
|
|
{
|
|
"epoch": 0.9631265603994623,
|
|
"grad_norm": 38.70942687988281,
|
|
"learning_rate": 8.301804019593129e-09,
|
|
"loss": 0.273,
|
|
"num_input_tokens_seen": 47353024,
|
|
"step": 15045
|
|
},
|
|
{
|
|
"epoch": 0.9634466423404392,
|
|
"grad_norm": 31.57654571533203,
|
|
"learning_rate": 8.158743777263333e-09,
|
|
"loss": 0.3535,
|
|
"num_input_tokens_seen": 47369088,
|
|
"step": 15050
|
|
},
|
|
{
|
|
"epoch": 0.9637667242814161,
|
|
"grad_norm": 26.071718215942383,
|
|
"learning_rate": 8.016921860387272e-09,
|
|
"loss": 0.3678,
|
|
"num_input_tokens_seen": 47384320,
|
|
"step": 15055
|
|
},
|
|
{
|
|
"epoch": 0.964086806222393,
|
|
"grad_norm": 28.67797088623047,
|
|
"learning_rate": 7.876338446031416e-09,
|
|
"loss": 0.3908,
|
|
"num_input_tokens_seen": 47400896,
|
|
"step": 15060
|
|
},
|
|
{
|
|
"epoch": 0.9644068881633698,
|
|
"grad_norm": 44.70686340332031,
|
|
"learning_rate": 7.736993709716033e-09,
|
|
"loss": 0.3169,
|
|
"num_input_tokens_seen": 47416896,
|
|
"step": 15065
|
|
},
|
|
{
|
|
"epoch": 0.9647269701043467,
|
|
"grad_norm": 49.233890533447266,
|
|
"learning_rate": 7.59888782541418e-09,
|
|
"loss": 0.4783,
|
|
"num_input_tokens_seen": 47432320,
|
|
"step": 15070
|
|
},
|
|
{
|
|
"epoch": 0.9650470520453236,
|
|
"grad_norm": 16.93093490600586,
|
|
"learning_rate": 7.462020965553151e-09,
|
|
"loss": 0.2656,
|
|
"num_input_tokens_seen": 47448320,
|
|
"step": 15075
|
|
},
|
|
{
|
|
"epoch": 0.9653671339863005,
|
|
"grad_norm": 17.901084899902344,
|
|
"learning_rate": 7.32639330101259e-09,
|
|
"loss": 0.49,
|
|
"num_input_tokens_seen": 47463488,
|
|
"step": 15080
|
|
},
|
|
{
|
|
"epoch": 0.9656872159272774,
|
|
"grad_norm": 51.073936462402344,
|
|
"learning_rate": 7.1920050011252675e-09,
|
|
"loss": 0.3886,
|
|
"num_input_tokens_seen": 47479104,
|
|
"step": 15085
|
|
},
|
|
{
|
|
"epoch": 0.9660072978682542,
|
|
"grad_norm": 37.59046173095703,
|
|
"learning_rate": 7.058856233676525e-09,
|
|
"loss": 0.391,
|
|
"num_input_tokens_seen": 47496448,
|
|
"step": 15090
|
|
},
|
|
{
|
|
"epoch": 0.9663273798092311,
|
|
"grad_norm": 86.30872344970703,
|
|
"learning_rate": 6.926947164904162e-09,
|
|
"loss": 0.3733,
|
|
"num_input_tokens_seen": 47511936,
|
|
"step": 15095
|
|
},
|
|
{
|
|
"epoch": 0.9666474617502081,
|
|
"grad_norm": 26.688161849975586,
|
|
"learning_rate": 6.796277959498331e-09,
|
|
"loss": 0.3984,
|
|
"num_input_tokens_seen": 47528320,
|
|
"step": 15100
|
|
},
|
|
{
|
|
"epoch": 0.966967543691185,
|
|
"grad_norm": 26.294218063354492,
|
|
"learning_rate": 6.666848780600864e-09,
|
|
"loss": 0.2793,
|
|
"num_input_tokens_seen": 47543296,
|
|
"step": 15105
|
|
},
|
|
{
|
|
"epoch": 0.9672876256321619,
|
|
"grad_norm": 10.18204116821289,
|
|
"learning_rate": 6.538659789805834e-09,
|
|
"loss": 0.2751,
|
|
"num_input_tokens_seen": 47558656,
|
|
"step": 15110
|
|
},
|
|
{
|
|
"epoch": 0.9676077075731387,
|
|
"grad_norm": 34.290340423583984,
|
|
"learning_rate": 6.411711147158438e-09,
|
|
"loss": 0.3498,
|
|
"num_input_tokens_seen": 47574720,
|
|
"step": 15115
|
|
},
|
|
{
|
|
"epoch": 0.9679277895141156,
|
|
"grad_norm": 52.94532012939453,
|
|
"learning_rate": 6.286003011155783e-09,
|
|
"loss": 0.3107,
|
|
"num_input_tokens_seen": 47590272,
|
|
"step": 15120
|
|
},
|
|
{
|
|
"epoch": 0.9682478714550925,
|
|
"grad_norm": 32.81538772583008,
|
|
"learning_rate": 6.161535538745877e-09,
|
|
"loss": 0.4098,
|
|
"num_input_tokens_seen": 47605696,
|
|
"step": 15125
|
|
},
|
|
{
|
|
"epoch": 0.9685679533960694,
|
|
"grad_norm": 32.042781829833984,
|
|
"learning_rate": 6.0383088853277475e-09,
|
|
"loss": 0.3975,
|
|
"num_input_tokens_seen": 47621760,
|
|
"step": 15130
|
|
},
|
|
{
|
|
"epoch": 0.9688880353370463,
|
|
"grad_norm": 24.502296447753906,
|
|
"learning_rate": 5.916323204751439e-09,
|
|
"loss": 0.3081,
|
|
"num_input_tokens_seen": 47639296,
|
|
"step": 15135
|
|
},
|
|
{
|
|
"epoch": 0.9692081172780231,
|
|
"grad_norm": 27.488826751708984,
|
|
"learning_rate": 5.795578649317345e-09,
|
|
"loss": 0.2648,
|
|
"num_input_tokens_seen": 47654656,
|
|
"step": 15140
|
|
},
|
|
{
|
|
"epoch": 0.969528199219,
|
|
"grad_norm": 44.00014877319336,
|
|
"learning_rate": 5.676075369776656e-09,
|
|
"loss": 0.3157,
|
|
"num_input_tokens_seen": 47671168,
|
|
"step": 15145
|
|
},
|
|
{
|
|
"epoch": 0.9698482811599769,
|
|
"grad_norm": 23.902742385864258,
|
|
"learning_rate": 5.557813515330468e-09,
|
|
"loss": 0.3348,
|
|
"num_input_tokens_seen": 47686400,
|
|
"step": 15150
|
|
},
|
|
{
|
|
"epoch": 0.9701683631009539,
|
|
"grad_norm": 28.53948211669922,
|
|
"learning_rate": 5.440793233630115e-09,
|
|
"loss": 0.3439,
|
|
"num_input_tokens_seen": 47701760,
|
|
"step": 15155
|
|
},
|
|
{
|
|
"epoch": 0.9704884450419308,
|
|
"grad_norm": 40.30237579345703,
|
|
"learning_rate": 5.325014670776951e-09,
|
|
"loss": 0.3063,
|
|
"num_input_tokens_seen": 47717248,
|
|
"step": 15160
|
|
},
|
|
{
|
|
"epoch": 0.9708085269829076,
|
|
"grad_norm": 60.948604583740234,
|
|
"learning_rate": 5.21047797132157e-09,
|
|
"loss": 0.3599,
|
|
"num_input_tokens_seen": 47734336,
|
|
"step": 15165
|
|
},
|
|
{
|
|
"epoch": 0.9711286089238845,
|
|
"grad_norm": 25.381938934326172,
|
|
"learning_rate": 5.097183278264694e-09,
|
|
"loss": 0.3417,
|
|
"num_input_tokens_seen": 47750464,
|
|
"step": 15170
|
|
},
|
|
{
|
|
"epoch": 0.9714486908648614,
|
|
"grad_norm": 25.686281204223633,
|
|
"learning_rate": 4.985130733055954e-09,
|
|
"loss": 0.4364,
|
|
"num_input_tokens_seen": 47765824,
|
|
"step": 15175
|
|
},
|
|
{
|
|
"epoch": 0.9717687728058383,
|
|
"grad_norm": 27.45149803161621,
|
|
"learning_rate": 4.874320475594107e-09,
|
|
"loss": 0.3893,
|
|
"num_input_tokens_seen": 47781760,
|
|
"step": 15180
|
|
},
|
|
{
|
|
"epoch": 0.9720888547468152,
|
|
"grad_norm": 17.62384605407715,
|
|
"learning_rate": 4.764752644227377e-09,
|
|
"loss": 0.2832,
|
|
"num_input_tokens_seen": 47797312,
|
|
"step": 15185
|
|
},
|
|
{
|
|
"epoch": 0.972408936687792,
|
|
"grad_norm": 29.088834762573242,
|
|
"learning_rate": 4.656427375752336e-09,
|
|
"loss": 0.3392,
|
|
"num_input_tokens_seen": 47813440,
|
|
"step": 15190
|
|
},
|
|
{
|
|
"epoch": 0.9727290186287689,
|
|
"grad_norm": 33.35861587524414,
|
|
"learning_rate": 4.549344805414246e-09,
|
|
"loss": 0.34,
|
|
"num_input_tokens_seen": 47829440,
|
|
"step": 15195
|
|
},
|
|
{
|
|
"epoch": 0.9730491005697458,
|
|
"grad_norm": 32.597530364990234,
|
|
"learning_rate": 4.443505066907049e-09,
|
|
"loss": 0.4139,
|
|
"num_input_tokens_seen": 47844608,
|
|
"step": 15200
|
|
},
|
|
{
|
|
"epoch": 0.9733691825107228,
|
|
"grad_norm": 28.545236587524414,
|
|
"learning_rate": 4.338908292372934e-09,
|
|
"loss": 0.2823,
|
|
"num_input_tokens_seen": 47860160,
|
|
"step": 15205
|
|
},
|
|
{
|
|
"epoch": 0.9736892644516997,
|
|
"grad_norm": 42.930023193359375,
|
|
"learning_rate": 4.235554612402214e-09,
|
|
"loss": 0.3864,
|
|
"num_input_tokens_seen": 47875648,
|
|
"step": 15210
|
|
},
|
|
{
|
|
"epoch": 0.9740093463926766,
|
|
"grad_norm": 48.120704650878906,
|
|
"learning_rate": 4.133444156033006e-09,
|
|
"loss": 0.381,
|
|
"num_input_tokens_seen": 47892736,
|
|
"step": 15215
|
|
},
|
|
{
|
|
"epoch": 0.9743294283336534,
|
|
"grad_norm": 37.2425422668457,
|
|
"learning_rate": 4.032577050751551e-09,
|
|
"loss": 0.3145,
|
|
"num_input_tokens_seen": 47908992,
|
|
"step": 15220
|
|
},
|
|
{
|
|
"epoch": 0.9746495102746303,
|
|
"grad_norm": 23.053668975830078,
|
|
"learning_rate": 3.932953422491669e-09,
|
|
"loss": 0.3428,
|
|
"num_input_tokens_seen": 47924736,
|
|
"step": 15225
|
|
},
|
|
{
|
|
"epoch": 0.9749695922156072,
|
|
"grad_norm": 52.20282745361328,
|
|
"learning_rate": 3.8345733956345326e-09,
|
|
"loss": 0.284,
|
|
"num_input_tokens_seen": 47941056,
|
|
"step": 15230
|
|
},
|
|
{
|
|
"epoch": 0.9752896741565841,
|
|
"grad_norm": 29.915189743041992,
|
|
"learning_rate": 3.737437093008777e-09,
|
|
"loss": 0.3619,
|
|
"num_input_tokens_seen": 47957824,
|
|
"step": 15235
|
|
},
|
|
{
|
|
"epoch": 0.975609756097561,
|
|
"grad_norm": 42.0181770324707,
|
|
"learning_rate": 3.641544635890281e-09,
|
|
"loss": 0.4107,
|
|
"num_input_tokens_seen": 47973056,
|
|
"step": 15240
|
|
},
|
|
{
|
|
"epoch": 0.9759298380385378,
|
|
"grad_norm": 18.199411392211914,
|
|
"learning_rate": 3.546896144001832e-09,
|
|
"loss": 0.3896,
|
|
"num_input_tokens_seen": 47988928,
|
|
"step": 15245
|
|
},
|
|
{
|
|
"epoch": 0.9762499199795147,
|
|
"grad_norm": 47.75886917114258,
|
|
"learning_rate": 3.4534917355132364e-09,
|
|
"loss": 0.3926,
|
|
"num_input_tokens_seen": 48004032,
|
|
"step": 15250
|
|
},
|
|
{
|
|
"epoch": 0.9765700019204916,
|
|
"grad_norm": 35.261905670166016,
|
|
"learning_rate": 3.361331527040878e-09,
|
|
"loss": 0.4376,
|
|
"num_input_tokens_seen": 48020800,
|
|
"step": 15255
|
|
},
|
|
{
|
|
"epoch": 0.9768900838614686,
|
|
"grad_norm": 31.275798797607422,
|
|
"learning_rate": 3.270415633647938e-09,
|
|
"loss": 0.3935,
|
|
"num_input_tokens_seen": 48036800,
|
|
"step": 15260
|
|
},
|
|
{
|
|
"epoch": 0.9772101658024455,
|
|
"grad_norm": 22.784738540649414,
|
|
"learning_rate": 3.180744168843952e-09,
|
|
"loss": 0.2847,
|
|
"num_input_tokens_seen": 48051264,
|
|
"step": 15265
|
|
},
|
|
{
|
|
"epoch": 0.9775302477434223,
|
|
"grad_norm": 27.314804077148438,
|
|
"learning_rate": 3.0923172445849187e-09,
|
|
"loss": 0.2318,
|
|
"num_input_tokens_seen": 48066176,
|
|
"step": 15270
|
|
},
|
|
{
|
|
"epoch": 0.9778503296843992,
|
|
"grad_norm": 34.85258865356445,
|
|
"learning_rate": 3.0051349712727493e-09,
|
|
"loss": 0.3178,
|
|
"num_input_tokens_seen": 48081984,
|
|
"step": 15275
|
|
},
|
|
{
|
|
"epoch": 0.9781704116253761,
|
|
"grad_norm": 27.141429901123047,
|
|
"learning_rate": 2.9191974577555954e-09,
|
|
"loss": 0.4072,
|
|
"num_input_tokens_seen": 48096896,
|
|
"step": 15280
|
|
},
|
|
{
|
|
"epoch": 0.978490493566353,
|
|
"grad_norm": 18.883970260620117,
|
|
"learning_rate": 2.8345048113274096e-09,
|
|
"loss": 0.2334,
|
|
"num_input_tokens_seen": 48112128,
|
|
"step": 15285
|
|
},
|
|
{
|
|
"epoch": 0.9788105755073299,
|
|
"grad_norm": 32.112449645996094,
|
|
"learning_rate": 2.751057137727941e-09,
|
|
"loss": 0.3388,
|
|
"num_input_tokens_seen": 48127616,
|
|
"step": 15290
|
|
},
|
|
{
|
|
"epoch": 0.9791306574483067,
|
|
"grad_norm": 59.22599411010742,
|
|
"learning_rate": 2.66885454114274e-09,
|
|
"loss": 0.384,
|
|
"num_input_tokens_seen": 48142144,
|
|
"step": 15295
|
|
},
|
|
{
|
|
"epoch": 0.9794507393892836,
|
|
"grad_norm": 60.90025329589844,
|
|
"learning_rate": 2.5878971242025983e-09,
|
|
"loss": 0.3776,
|
|
"num_input_tokens_seen": 48158272,
|
|
"step": 15300
|
|
},
|
|
{
|
|
"epoch": 0.9797708213302605,
|
|
"grad_norm": 23.69969940185547,
|
|
"learning_rate": 2.5081849879837746e-09,
|
|
"loss": 0.3239,
|
|
"num_input_tokens_seen": 48173120,
|
|
"step": 15305
|
|
},
|
|
{
|
|
"epoch": 0.9800909032712375,
|
|
"grad_norm": 19.513404846191406,
|
|
"learning_rate": 2.429718232007771e-09,
|
|
"loss": 0.3428,
|
|
"num_input_tokens_seen": 48188672,
|
|
"step": 15310
|
|
},
|
|
{
|
|
"epoch": 0.9804109852122144,
|
|
"grad_norm": 25.234663009643555,
|
|
"learning_rate": 2.3524969542414453e-09,
|
|
"loss": 0.2688,
|
|
"num_input_tokens_seen": 48204480,
|
|
"step": 15315
|
|
},
|
|
{
|
|
"epoch": 0.9807310671531912,
|
|
"grad_norm": 14.73193359375,
|
|
"learning_rate": 2.2765212510963418e-09,
|
|
"loss": 0.3525,
|
|
"num_input_tokens_seen": 48219584,
|
|
"step": 15320
|
|
},
|
|
{
|
|
"epoch": 0.9810511490941681,
|
|
"grad_norm": 33.33141326904297,
|
|
"learning_rate": 2.2017912174289164e-09,
|
|
"loss": 0.2847,
|
|
"num_input_tokens_seen": 48235904,
|
|
"step": 15325
|
|
},
|
|
{
|
|
"epoch": 0.981371231035145,
|
|
"grad_norm": 34.248878479003906,
|
|
"learning_rate": 2.128306946540648e-09,
|
|
"loss": 0.4052,
|
|
"num_input_tokens_seen": 48252992,
|
|
"step": 15330
|
|
},
|
|
{
|
|
"epoch": 0.9816913129761219,
|
|
"grad_norm": 28.99315071105957,
|
|
"learning_rate": 2.0560685301774792e-09,
|
|
"loss": 0.3316,
|
|
"num_input_tokens_seen": 48267840,
|
|
"step": 15335
|
|
},
|
|
{
|
|
"epoch": 0.9820113949170988,
|
|
"grad_norm": 21.494754791259766,
|
|
"learning_rate": 1.985076058529933e-09,
|
|
"loss": 0.3781,
|
|
"num_input_tokens_seen": 48282688,
|
|
"step": 15340
|
|
},
|
|
{
|
|
"epoch": 0.9823314768580756,
|
|
"grad_norm": 38.192710876464844,
|
|
"learning_rate": 1.9153296202328863e-09,
|
|
"loss": 0.4768,
|
|
"num_input_tokens_seen": 48300096,
|
|
"step": 15345
|
|
},
|
|
{
|
|
"epoch": 0.9826515587990525,
|
|
"grad_norm": 32.44169998168945,
|
|
"learning_rate": 1.8468293023656823e-09,
|
|
"loss": 0.3929,
|
|
"num_input_tokens_seen": 48315136,
|
|
"step": 15350
|
|
},
|
|
{
|
|
"epoch": 0.9829716407400294,
|
|
"grad_norm": 17.585954666137695,
|
|
"learning_rate": 1.7795751904515766e-09,
|
|
"loss": 0.4052,
|
|
"num_input_tokens_seen": 48330240,
|
|
"step": 15355
|
|
},
|
|
{
|
|
"epoch": 0.9832917226810063,
|
|
"grad_norm": 56.64820098876953,
|
|
"learning_rate": 1.7135673684584019e-09,
|
|
"loss": 0.318,
|
|
"num_input_tokens_seen": 48345280,
|
|
"step": 15360
|
|
},
|
|
{
|
|
"epoch": 0.9836118046219833,
|
|
"grad_norm": 30.882753372192383,
|
|
"learning_rate": 1.6488059187974579e-09,
|
|
"loss": 0.3972,
|
|
"num_input_tokens_seen": 48361792,
|
|
"step": 15365
|
|
},
|
|
{
|
|
"epoch": 0.9839318865629602,
|
|
"grad_norm": 32.313411712646484,
|
|
"learning_rate": 1.5852909223242894e-09,
|
|
"loss": 0.4099,
|
|
"num_input_tokens_seen": 48377408,
|
|
"step": 15370
|
|
},
|
|
{
|
|
"epoch": 0.984251968503937,
|
|
"grad_norm": 16.098203659057617,
|
|
"learning_rate": 1.5230224583380192e-09,
|
|
"loss": 0.3759,
|
|
"num_input_tokens_seen": 48392896,
|
|
"step": 15375
|
|
},
|
|
{
|
|
"epoch": 0.9845720504449139,
|
|
"grad_norm": 39.47123336791992,
|
|
"learning_rate": 1.4620006045816813e-09,
|
|
"loss": 0.4663,
|
|
"num_input_tokens_seen": 48407552,
|
|
"step": 15380
|
|
},
|
|
{
|
|
"epoch": 0.9848921323858908,
|
|
"grad_norm": 15.717222213745117,
|
|
"learning_rate": 1.4022254372417774e-09,
|
|
"loss": 0.2785,
|
|
"num_input_tokens_seen": 48424320,
|
|
"step": 15385
|
|
},
|
|
{
|
|
"epoch": 0.9852122143268677,
|
|
"grad_norm": 35.01372146606445,
|
|
"learning_rate": 1.3436970309481655e-09,
|
|
"loss": 0.5093,
|
|
"num_input_tokens_seen": 48441984,
|
|
"step": 15390
|
|
},
|
|
{
|
|
"epoch": 0.9855322962678446,
|
|
"grad_norm": 15.031546592712402,
|
|
"learning_rate": 1.2864154587742815e-09,
|
|
"loss": 0.3442,
|
|
"num_input_tokens_seen": 48456832,
|
|
"step": 15395
|
|
},
|
|
{
|
|
"epoch": 0.9858523782088214,
|
|
"grad_norm": 32.367923736572266,
|
|
"learning_rate": 1.2303807922370292e-09,
|
|
"loss": 0.3608,
|
|
"num_input_tokens_seen": 48472512,
|
|
"step": 15400
|
|
},
|
|
{
|
|
"epoch": 0.9861724601497983,
|
|
"grad_norm": 53.186859130859375,
|
|
"learning_rate": 1.1755931012961128e-09,
|
|
"loss": 0.3122,
|
|
"num_input_tokens_seen": 48488832,
|
|
"step": 15405
|
|
},
|
|
{
|
|
"epoch": 0.9864925420907752,
|
|
"grad_norm": 17.48390007019043,
|
|
"learning_rate": 1.122052454354705e-09,
|
|
"loss": 0.3491,
|
|
"num_input_tokens_seen": 48503936,
|
|
"step": 15410
|
|
},
|
|
{
|
|
"epoch": 0.9868126240317522,
|
|
"grad_norm": 20.294185638427734,
|
|
"learning_rate": 1.0697589182590005e-09,
|
|
"loss": 0.4398,
|
|
"num_input_tokens_seen": 48519040,
|
|
"step": 15415
|
|
},
|
|
{
|
|
"epoch": 0.9871327059727291,
|
|
"grad_norm": 28.50274085998535,
|
|
"learning_rate": 1.018712558297996e-09,
|
|
"loss": 0.5967,
|
|
"num_input_tokens_seen": 48535040,
|
|
"step": 15420
|
|
},
|
|
{
|
|
"epoch": 0.9874527879137059,
|
|
"grad_norm": 36.501163482666016,
|
|
"learning_rate": 9.689134382037113e-10,
|
|
"loss": 0.4383,
|
|
"num_input_tokens_seen": 48551808,
|
|
"step": 15425
|
|
},
|
|
{
|
|
"epoch": 0.9877728698546828,
|
|
"grad_norm": 35.623992919921875,
|
|
"learning_rate": 9.203616201508557e-10,
|
|
"loss": 0.3967,
|
|
"num_input_tokens_seen": 48566592,
|
|
"step": 15430
|
|
},
|
|
{
|
|
"epoch": 0.9880929517956597,
|
|
"grad_norm": 46.61222457885742,
|
|
"learning_rate": 8.730571647570517e-10,
|
|
"loss": 0.3159,
|
|
"num_input_tokens_seen": 48582720,
|
|
"step": 15435
|
|
},
|
|
{
|
|
"epoch": 0.9884130337366366,
|
|
"grad_norm": 46.78093338012695,
|
|
"learning_rate": 8.270001310825003e-10,
|
|
"loss": 0.4878,
|
|
"num_input_tokens_seen": 48599104,
|
|
"step": 15440
|
|
},
|
|
{
|
|
"epoch": 0.9887331156776135,
|
|
"grad_norm": 12.824591636657715,
|
|
"learning_rate": 7.821905766297599e-10,
|
|
"loss": 0.3118,
|
|
"num_input_tokens_seen": 48615040,
|
|
"step": 15445
|
|
},
|
|
{
|
|
"epoch": 0.9890531976185903,
|
|
"grad_norm": 28.26544952392578,
|
|
"learning_rate": 7.386285573441897e-10,
|
|
"loss": 0.3926,
|
|
"num_input_tokens_seen": 48630976,
|
|
"step": 15450
|
|
},
|
|
{
|
|
"epoch": 0.9893732795595672,
|
|
"grad_norm": 25.03919792175293,
|
|
"learning_rate": 6.963141276136175e-10,
|
|
"loss": 0.2862,
|
|
"num_input_tokens_seen": 48646080,
|
|
"step": 15455
|
|
},
|
|
{
|
|
"epoch": 0.9896933615005441,
|
|
"grad_norm": 26.057968139648438,
|
|
"learning_rate": 6.552473402678949e-10,
|
|
"loss": 0.2525,
|
|
"num_input_tokens_seen": 48662528,
|
|
"step": 15460
|
|
},
|
|
{
|
|
"epoch": 0.990013443441521,
|
|
"grad_norm": 49.04160690307617,
|
|
"learning_rate": 6.154282465794524e-10,
|
|
"loss": 0.3301,
|
|
"num_input_tokens_seen": 48680000,
|
|
"step": 15465
|
|
},
|
|
{
|
|
"epoch": 0.990333525382498,
|
|
"grad_norm": 30.749189376831055,
|
|
"learning_rate": 5.768568962629672e-10,
|
|
"loss": 0.424,
|
|
"num_input_tokens_seen": 48696256,
|
|
"step": 15470
|
|
},
|
|
{
|
|
"epoch": 0.9906536073234748,
|
|
"grad_norm": 41.51435470581055,
|
|
"learning_rate": 5.395333374751398e-10,
|
|
"loss": 0.3065,
|
|
"num_input_tokens_seen": 48711168,
|
|
"step": 15475
|
|
},
|
|
{
|
|
"epoch": 0.9909736892644517,
|
|
"grad_norm": 45.217079162597656,
|
|
"learning_rate": 5.034576168149174e-10,
|
|
"loss": 0.5309,
|
|
"num_input_tokens_seen": 48726848,
|
|
"step": 15480
|
|
},
|
|
{
|
|
"epoch": 0.9912937712054286,
|
|
"grad_norm": 48.17198181152344,
|
|
"learning_rate": 4.686297793231597e-10,
|
|
"loss": 0.4868,
|
|
"num_input_tokens_seen": 48743232,
|
|
"step": 15485
|
|
},
|
|
{
|
|
"epoch": 0.9916138531464055,
|
|
"grad_norm": 24.643993377685547,
|
|
"learning_rate": 4.350498684829729e-10,
|
|
"loss": 0.456,
|
|
"num_input_tokens_seen": 48758080,
|
|
"step": 15490
|
|
},
|
|
{
|
|
"epoch": 0.9919339350873824,
|
|
"grad_norm": 38.15465545654297,
|
|
"learning_rate": 4.0271792621926483e-10,
|
|
"loss": 0.3105,
|
|
"num_input_tokens_seen": 48773120,
|
|
"step": 15495
|
|
},
|
|
{
|
|
"epoch": 0.9922540170283592,
|
|
"grad_norm": 14.166491508483887,
|
|
"learning_rate": 3.716339928987455e-10,
|
|
"loss": 0.3815,
|
|
"num_input_tokens_seen": 48789056,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 0.9925740989693361,
|
|
"grad_norm": 64.28377532958984,
|
|
"learning_rate": 3.41798107330149e-10,
|
|
"loss": 0.4142,
|
|
"num_input_tokens_seen": 48804288,
|
|
"step": 15505
|
|
},
|
|
{
|
|
"epoch": 0.992894180910313,
|
|
"grad_norm": 34.623619079589844,
|
|
"learning_rate": 3.1321030676390027e-10,
|
|
"loss": 0.3715,
|
|
"num_input_tokens_seen": 48818816,
|
|
"step": 15510
|
|
},
|
|
{
|
|
"epoch": 0.9932142628512899,
|
|
"grad_norm": 22.467647552490234,
|
|
"learning_rate": 2.8587062689222617e-10,
|
|
"loss": 0.2872,
|
|
"num_input_tokens_seen": 48835520,
|
|
"step": 15515
|
|
},
|
|
{
|
|
"epoch": 0.9935343447922668,
|
|
"grad_norm": 30.136613845825195,
|
|
"learning_rate": 2.5977910184904473e-10,
|
|
"loss": 0.3221,
|
|
"num_input_tokens_seen": 48851328,
|
|
"step": 15520
|
|
},
|
|
{
|
|
"epoch": 0.9938544267332438,
|
|
"grad_norm": 32.950374603271484,
|
|
"learning_rate": 2.3493576420985373e-10,
|
|
"loss": 0.3354,
|
|
"num_input_tokens_seen": 48866304,
|
|
"step": 15525
|
|
},
|
|
{
|
|
"epoch": 0.9941745086742206,
|
|
"grad_norm": 15.965251922607422,
|
|
"learning_rate": 2.11340644991842e-10,
|
|
"loss": 0.3174,
|
|
"num_input_tokens_seen": 48882752,
|
|
"step": 15530
|
|
},
|
|
{
|
|
"epoch": 0.9944945906151975,
|
|
"grad_norm": 37.14493942260742,
|
|
"learning_rate": 1.8899377365388936e-10,
|
|
"loss": 0.3041,
|
|
"num_input_tokens_seen": 48898304,
|
|
"step": 15535
|
|
},
|
|
{
|
|
"epoch": 0.9948146725561744,
|
|
"grad_norm": 16.286380767822266,
|
|
"learning_rate": 1.6789517809634447e-10,
|
|
"loss": 0.4202,
|
|
"num_input_tokens_seen": 48914048,
|
|
"step": 15540
|
|
},
|
|
{
|
|
"epoch": 0.9951347544971513,
|
|
"grad_norm": 61.637794494628906,
|
|
"learning_rate": 1.480448846609139e-10,
|
|
"loss": 0.3127,
|
|
"num_input_tokens_seen": 48930176,
|
|
"step": 15545
|
|
},
|
|
{
|
|
"epoch": 0.9954548364381282,
|
|
"grad_norm": 24.89733123779297,
|
|
"learning_rate": 1.294429181311063e-10,
|
|
"loss": 0.3505,
|
|
"num_input_tokens_seen": 48945920,
|
|
"step": 15550
|
|
},
|
|
{
|
|
"epoch": 0.995774918379105,
|
|
"grad_norm": 23.30603790283203,
|
|
"learning_rate": 1.1208930173145503e-10,
|
|
"loss": 0.4079,
|
|
"num_input_tokens_seen": 48960832,
|
|
"step": 15555
|
|
},
|
|
{
|
|
"epoch": 0.9960950003200819,
|
|
"grad_norm": 21.470914840698242,
|
|
"learning_rate": 9.598405712840651e-11,
|
|
"loss": 0.3213,
|
|
"num_input_tokens_seen": 48977280,
|
|
"step": 15560
|
|
},
|
|
{
|
|
"epoch": 0.9964150822610588,
|
|
"grad_norm": 19.718584060668945,
|
|
"learning_rate": 8.1127204429432e-11,
|
|
"loss": 0.347,
|
|
"num_input_tokens_seen": 48992512,
|
|
"step": 15565
|
|
},
|
|
{
|
|
"epoch": 0.9967351642020357,
|
|
"grad_norm": 25.985633850097656,
|
|
"learning_rate": 6.751876218336061e-11,
|
|
"loss": 0.3524,
|
|
"num_input_tokens_seen": 49008128,
|
|
"step": 15570
|
|
},
|
|
{
|
|
"epoch": 0.9970552461430127,
|
|
"grad_norm": 22.135334014892578,
|
|
"learning_rate": 5.515874738071247e-11,
|
|
"loss": 0.3376,
|
|
"num_input_tokens_seen": 49024512,
|
|
"step": 15575
|
|
},
|
|
{
|
|
"epoch": 0.9973753280839895,
|
|
"grad_norm": 44.398292541503906,
|
|
"learning_rate": 4.404717545303249e-11,
|
|
"loss": 0.308,
|
|
"num_input_tokens_seen": 49040128,
|
|
"step": 15580
|
|
},
|
|
{
|
|
"epoch": 0.9976954100249664,
|
|
"grad_norm": 14.405759811401367,
|
|
"learning_rate": 3.418406027322352e-11,
|
|
"loss": 0.3099,
|
|
"num_input_tokens_seen": 49055360,
|
|
"step": 15585
|
|
},
|
|
{
|
|
"epoch": 0.9980154919659433,
|
|
"grad_norm": 33.78312683105469,
|
|
"learning_rate": 2.5569414155546254e-11,
|
|
"loss": 0.3518,
|
|
"num_input_tokens_seen": 49071360,
|
|
"step": 15590
|
|
},
|
|
{
|
|
"epoch": 0.9983355739069202,
|
|
"grad_norm": 50.76702117919922,
|
|
"learning_rate": 1.8203247855397287e-11,
|
|
"loss": 0.2734,
|
|
"num_input_tokens_seen": 49086144,
|
|
"step": 15595
|
|
},
|
|
{
|
|
"epoch": 0.9986556558478971,
|
|
"grad_norm": 33.41775131225586,
|
|
"learning_rate": 1.2085570569642101e-11,
|
|
"loss": 0.395,
|
|
"num_input_tokens_seen": 49101312,
|
|
"step": 15600
|
|
},
|
|
{
|
|
"epoch": 0.9989757377888739,
|
|
"grad_norm": 56.984737396240234,
|
|
"learning_rate": 7.216389936171019e-12,
|
|
"loss": 0.3097,
|
|
"num_input_tokens_seen": 49116672,
|
|
"step": 15605
|
|
},
|
|
{
|
|
"epoch": 0.9992958197298508,
|
|
"grad_norm": 16.939533233642578,
|
|
"learning_rate": 3.5957120342322567e-12,
|
|
"loss": 0.1772,
|
|
"num_input_tokens_seen": 49132288,
|
|
"step": 15610
|
|
},
|
|
{
|
|
"epoch": 0.9996159016708277,
|
|
"grad_norm": 15.791190147399902,
|
|
"learning_rate": 1.2235413842098807e-12,
|
|
"loss": 0.3934,
|
|
"num_input_tokens_seen": 49148096,
|
|
"step": 15615
|
|
},
|
|
{
|
|
"epoch": 0.9999359836118046,
|
|
"grad_norm": 20.582731246948242,
|
|
"learning_rate": 9.98809480678986e-14,
|
|
"loss": 0.2515,
|
|
"num_input_tokens_seen": 49163840,
|
|
"step": 15620
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"num_input_tokens_seen": 49166912,
|
|
"step": 15621,
|
|
"total_flos": 2.8707953551107686e+17,
|
|
"train_loss": 0.44386771268258823,
|
|
"train_runtime": 3548.0201,
|
|
"train_samples_per_second": 35.222,
|
|
"train_steps_per_second": 4.403
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 15621,
|
|
"num_input_tokens_seen": 49166912,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 782,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 2.8707953551107686e+17,
|
|
"train_batch_size": 8,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|