Files
train_record_42_1779354541/trainer_state.json
ModelHub XC b03de33657 初始化项目,由ModelHub XC社区提供模型
Model: rbelanec/train_record_42_1779354541
Source: Original Platform
2026-06-04 15:09:31 +08:00

25208 lines
662 KiB
JSON

{
"best_global_step": 14858,
"best_metric": 0.35565948486328125,
"best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_record_42_1779354541/checkpoint-14858",
"epoch": 1.0,
"eval_steps": 782,
"global_step": 15621,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003200819409768901,
"grad_norm": 664.248046875,
"learning_rate": 5.118362124120281e-09,
"loss": 2.1538,
"num_input_tokens_seen": 15360,
"step": 5
},
{
"epoch": 0.0006401638819537802,
"grad_norm": 454.5251770019531,
"learning_rate": 1.1516314779270634e-08,
"loss": 2.3529,
"num_input_tokens_seen": 31104,
"step": 10
},
{
"epoch": 0.0009602458229306702,
"grad_norm": 552.8055419921875,
"learning_rate": 1.7914267434420987e-08,
"loss": 2.1426,
"num_input_tokens_seen": 46208,
"step": 15
},
{
"epoch": 0.0012803277639075604,
"grad_norm": 382.2819519042969,
"learning_rate": 2.431222008957134e-08,
"loss": 2.7631,
"num_input_tokens_seen": 62464,
"step": 20
},
{
"epoch": 0.0016004097048844504,
"grad_norm": 433.5123291015625,
"learning_rate": 3.071017274472169e-08,
"loss": 2.1363,
"num_input_tokens_seen": 79104,
"step": 25
},
{
"epoch": 0.0019204916458613404,
"grad_norm": 372.2735900878906,
"learning_rate": 3.710812539987204e-08,
"loss": 2.2293,
"num_input_tokens_seen": 94912,
"step": 30
},
{
"epoch": 0.0022405735868382304,
"grad_norm": 534.3236083984375,
"learning_rate": 4.350607805502239e-08,
"loss": 2.3549,
"num_input_tokens_seen": 110784,
"step": 35
},
{
"epoch": 0.002560655527815121,
"grad_norm": 314.8287353515625,
"learning_rate": 4.990403071017274e-08,
"loss": 2.1602,
"num_input_tokens_seen": 125696,
"step": 40
},
{
"epoch": 0.002880737468792011,
"grad_norm": 401.165283203125,
"learning_rate": 5.6301983365323095e-08,
"loss": 2.1132,
"num_input_tokens_seen": 140672,
"step": 45
},
{
"epoch": 0.003200819409768901,
"grad_norm": 396.3443298339844,
"learning_rate": 6.269993602047345e-08,
"loss": 2.0446,
"num_input_tokens_seen": 155456,
"step": 50
},
{
"epoch": 0.003520901350745791,
"grad_norm": 373.389892578125,
"learning_rate": 6.90978886756238e-08,
"loss": 1.9213,
"num_input_tokens_seen": 170816,
"step": 55
},
{
"epoch": 0.003840983291722681,
"grad_norm": 337.19696044921875,
"learning_rate": 7.549584133077414e-08,
"loss": 2.0365,
"num_input_tokens_seen": 185088,
"step": 60
},
{
"epoch": 0.004161065232699571,
"grad_norm": 370.04510498046875,
"learning_rate": 8.18937939859245e-08,
"loss": 1.6514,
"num_input_tokens_seen": 200384,
"step": 65
},
{
"epoch": 0.004481147173676461,
"grad_norm": 271.975830078125,
"learning_rate": 8.829174664107485e-08,
"loss": 1.6765,
"num_input_tokens_seen": 215744,
"step": 70
},
{
"epoch": 0.004801229114653352,
"grad_norm": 173.92369079589844,
"learning_rate": 9.468969929622521e-08,
"loss": 1.6609,
"num_input_tokens_seen": 230400,
"step": 75
},
{
"epoch": 0.005121311055630242,
"grad_norm": 279.897705078125,
"learning_rate": 1.0108765195137556e-07,
"loss": 1.326,
"num_input_tokens_seen": 246592,
"step": 80
},
{
"epoch": 0.005441392996607132,
"grad_norm": 108.02691650390625,
"learning_rate": 1.074856046065259e-07,
"loss": 1.1489,
"num_input_tokens_seen": 262272,
"step": 85
},
{
"epoch": 0.005761474937584022,
"grad_norm": 122.35417175292969,
"learning_rate": 1.1388355726167625e-07,
"loss": 1.0445,
"num_input_tokens_seen": 277760,
"step": 90
},
{
"epoch": 0.006081556878560912,
"grad_norm": 163.84530639648438,
"learning_rate": 1.202815099168266e-07,
"loss": 1.2459,
"num_input_tokens_seen": 292992,
"step": 95
},
{
"epoch": 0.006401638819537802,
"grad_norm": 157.150146484375,
"learning_rate": 1.2667946257197694e-07,
"loss": 1.1182,
"num_input_tokens_seen": 307840,
"step": 100
},
{
"epoch": 0.006721720760514692,
"grad_norm": 111.58399200439453,
"learning_rate": 1.3307741522712732e-07,
"loss": 1.0297,
"num_input_tokens_seen": 323008,
"step": 105
},
{
"epoch": 0.007041802701491582,
"grad_norm": 91.58705139160156,
"learning_rate": 1.3947536788227767e-07,
"loss": 1.051,
"num_input_tokens_seen": 339456,
"step": 110
},
{
"epoch": 0.007361884642468472,
"grad_norm": 85.68628692626953,
"learning_rate": 1.45873320537428e-07,
"loss": 1.1295,
"num_input_tokens_seen": 354816,
"step": 115
},
{
"epoch": 0.007681966583445362,
"grad_norm": 78.04808044433594,
"learning_rate": 1.5227127319257838e-07,
"loss": 0.8247,
"num_input_tokens_seen": 369472,
"step": 120
},
{
"epoch": 0.008002048524422252,
"grad_norm": 71.98005676269531,
"learning_rate": 1.586692258477287e-07,
"loss": 0.9577,
"num_input_tokens_seen": 384768,
"step": 125
},
{
"epoch": 0.008322130465399142,
"grad_norm": 98.72233581542969,
"learning_rate": 1.6506717850287908e-07,
"loss": 1.0328,
"num_input_tokens_seen": 400192,
"step": 130
},
{
"epoch": 0.008642212406376032,
"grad_norm": 121.78581237792969,
"learning_rate": 1.7146513115802943e-07,
"loss": 0.8953,
"num_input_tokens_seen": 416640,
"step": 135
},
{
"epoch": 0.008962294347352922,
"grad_norm": 66.73460388183594,
"learning_rate": 1.7786308381317976e-07,
"loss": 0.8265,
"num_input_tokens_seen": 432640,
"step": 140
},
{
"epoch": 0.009282376288329812,
"grad_norm": 78.51518249511719,
"learning_rate": 1.8426103646833014e-07,
"loss": 0.8983,
"num_input_tokens_seen": 448640,
"step": 145
},
{
"epoch": 0.009602458229306703,
"grad_norm": 89.75164794921875,
"learning_rate": 1.9065898912348046e-07,
"loss": 0.9503,
"num_input_tokens_seen": 464448,
"step": 150
},
{
"epoch": 0.009922540170283593,
"grad_norm": 85.8103256225586,
"learning_rate": 1.9705694177863084e-07,
"loss": 0.858,
"num_input_tokens_seen": 479488,
"step": 155
},
{
"epoch": 0.010242622111260483,
"grad_norm": 56.45402526855469,
"learning_rate": 2.034548944337812e-07,
"loss": 0.7304,
"num_input_tokens_seen": 495296,
"step": 160
},
{
"epoch": 0.010562704052237373,
"grad_norm": 89.73834991455078,
"learning_rate": 2.0985284708893152e-07,
"loss": 0.7848,
"num_input_tokens_seen": 510144,
"step": 165
},
{
"epoch": 0.010882785993214263,
"grad_norm": 81.91443634033203,
"learning_rate": 2.162507997440819e-07,
"loss": 0.8469,
"num_input_tokens_seen": 524928,
"step": 170
},
{
"epoch": 0.011202867934191153,
"grad_norm": 47.02535629272461,
"learning_rate": 2.2264875239923222e-07,
"loss": 0.7184,
"num_input_tokens_seen": 541504,
"step": 175
},
{
"epoch": 0.011522949875168043,
"grad_norm": 83.94172668457031,
"learning_rate": 2.290467050543826e-07,
"loss": 0.7354,
"num_input_tokens_seen": 556096,
"step": 180
},
{
"epoch": 0.011843031816144933,
"grad_norm": 78.48973846435547,
"learning_rate": 2.3544465770953295e-07,
"loss": 0.7351,
"num_input_tokens_seen": 572736,
"step": 185
},
{
"epoch": 0.012163113757121823,
"grad_norm": 74.28348541259766,
"learning_rate": 2.418426103646833e-07,
"loss": 0.853,
"num_input_tokens_seen": 588352,
"step": 190
},
{
"epoch": 0.012483195698098713,
"grad_norm": 56.58919906616211,
"learning_rate": 2.4824056301983363e-07,
"loss": 0.9742,
"num_input_tokens_seen": 603520,
"step": 195
},
{
"epoch": 0.012803277639075603,
"grad_norm": 68.8056640625,
"learning_rate": 2.54638515674984e-07,
"loss": 0.8167,
"num_input_tokens_seen": 619392,
"step": 200
},
{
"epoch": 0.013123359580052493,
"grad_norm": 60.73455047607422,
"learning_rate": 2.6103646833013433e-07,
"loss": 0.7996,
"num_input_tokens_seen": 635456,
"step": 205
},
{
"epoch": 0.013443441521029383,
"grad_norm": 123.12853240966797,
"learning_rate": 2.6743442098528466e-07,
"loss": 0.8732,
"num_input_tokens_seen": 650880,
"step": 210
},
{
"epoch": 0.013763523462006273,
"grad_norm": 52.10979461669922,
"learning_rate": 2.7383237364043504e-07,
"loss": 0.8244,
"num_input_tokens_seen": 666688,
"step": 215
},
{
"epoch": 0.014083605402983163,
"grad_norm": 70.8306884765625,
"learning_rate": 2.802303262955854e-07,
"loss": 0.7909,
"num_input_tokens_seen": 682112,
"step": 220
},
{
"epoch": 0.014403687343960053,
"grad_norm": 66.97418975830078,
"learning_rate": 2.866282789507358e-07,
"loss": 0.8318,
"num_input_tokens_seen": 697728,
"step": 225
},
{
"epoch": 0.014723769284936943,
"grad_norm": 54.565391540527344,
"learning_rate": 2.9302623160588607e-07,
"loss": 0.6828,
"num_input_tokens_seen": 712704,
"step": 230
},
{
"epoch": 0.015043851225913833,
"grad_norm": 93.17967987060547,
"learning_rate": 2.9942418426103644e-07,
"loss": 0.9689,
"num_input_tokens_seen": 729408,
"step": 235
},
{
"epoch": 0.015363933166890723,
"grad_norm": 90.8511962890625,
"learning_rate": 3.058221369161868e-07,
"loss": 0.7854,
"num_input_tokens_seen": 745344,
"step": 240
},
{
"epoch": 0.015684015107867613,
"grad_norm": 57.57070541381836,
"learning_rate": 3.1222008957133715e-07,
"loss": 0.7013,
"num_input_tokens_seen": 762688,
"step": 245
},
{
"epoch": 0.016004097048844503,
"grad_norm": 64.6364974975586,
"learning_rate": 3.186180422264875e-07,
"loss": 0.7128,
"num_input_tokens_seen": 779392,
"step": 250
},
{
"epoch": 0.016324178989821393,
"grad_norm": 82.40945434570312,
"learning_rate": 3.2501599488163785e-07,
"loss": 0.7911,
"num_input_tokens_seen": 794112,
"step": 255
},
{
"epoch": 0.016644260930798283,
"grad_norm": 51.280452728271484,
"learning_rate": 3.314139475367882e-07,
"loss": 0.8429,
"num_input_tokens_seen": 810112,
"step": 260
},
{
"epoch": 0.016964342871775173,
"grad_norm": 80.96479797363281,
"learning_rate": 3.3781190019193855e-07,
"loss": 0.8602,
"num_input_tokens_seen": 825472,
"step": 265
},
{
"epoch": 0.017284424812752063,
"grad_norm": 70.75425720214844,
"learning_rate": 3.4420985284708893e-07,
"loss": 0.9818,
"num_input_tokens_seen": 840128,
"step": 270
},
{
"epoch": 0.017604506753728953,
"grad_norm": 72.48260498046875,
"learning_rate": 3.5060780550223926e-07,
"loss": 0.7846,
"num_input_tokens_seen": 855104,
"step": 275
},
{
"epoch": 0.017924588694705843,
"grad_norm": 69.50082397460938,
"learning_rate": 3.570057581573896e-07,
"loss": 0.8039,
"num_input_tokens_seen": 870848,
"step": 280
},
{
"epoch": 0.018244670635682733,
"grad_norm": 47.575138092041016,
"learning_rate": 3.6340371081253996e-07,
"loss": 0.7489,
"num_input_tokens_seen": 885760,
"step": 285
},
{
"epoch": 0.018564752576659623,
"grad_norm": 45.56133270263672,
"learning_rate": 3.6980166346769034e-07,
"loss": 0.705,
"num_input_tokens_seen": 900928,
"step": 290
},
{
"epoch": 0.018884834517636517,
"grad_norm": 55.36705780029297,
"learning_rate": 3.7619961612284067e-07,
"loss": 0.7869,
"num_input_tokens_seen": 915968,
"step": 295
},
{
"epoch": 0.019204916458613407,
"grad_norm": 110.47761535644531,
"learning_rate": 3.8259756877799104e-07,
"loss": 0.9906,
"num_input_tokens_seen": 933056,
"step": 300
},
{
"epoch": 0.019524998399590297,
"grad_norm": 85.01045227050781,
"learning_rate": 3.889955214331414e-07,
"loss": 0.737,
"num_input_tokens_seen": 948416,
"step": 305
},
{
"epoch": 0.019845080340567187,
"grad_norm": 68.13928985595703,
"learning_rate": 3.953934740882917e-07,
"loss": 0.7708,
"num_input_tokens_seen": 962880,
"step": 310
},
{
"epoch": 0.020165162281544077,
"grad_norm": 60.105281829833984,
"learning_rate": 4.0179142674344207e-07,
"loss": 0.8126,
"num_input_tokens_seen": 979904,
"step": 315
},
{
"epoch": 0.020485244222520967,
"grad_norm": 68.0919189453125,
"learning_rate": 4.0818937939859245e-07,
"loss": 0.8299,
"num_input_tokens_seen": 995136,
"step": 320
},
{
"epoch": 0.020805326163497857,
"grad_norm": 61.429813385009766,
"learning_rate": 4.145873320537428e-07,
"loss": 0.771,
"num_input_tokens_seen": 1011008,
"step": 325
},
{
"epoch": 0.021125408104474747,
"grad_norm": 60.88750076293945,
"learning_rate": 4.2098528470889315e-07,
"loss": 0.831,
"num_input_tokens_seen": 1025792,
"step": 330
},
{
"epoch": 0.021445490045451637,
"grad_norm": 56.13808059692383,
"learning_rate": 4.273832373640435e-07,
"loss": 0.6929,
"num_input_tokens_seen": 1042944,
"step": 335
},
{
"epoch": 0.021765571986428527,
"grad_norm": 71.17967987060547,
"learning_rate": 4.3378119001919386e-07,
"loss": 0.8271,
"num_input_tokens_seen": 1058688,
"step": 340
},
{
"epoch": 0.022085653927405417,
"grad_norm": 53.61168670654297,
"learning_rate": 4.401791426743442e-07,
"loss": 0.7202,
"num_input_tokens_seen": 1074560,
"step": 345
},
{
"epoch": 0.022405735868382307,
"grad_norm": 76.7827377319336,
"learning_rate": 4.4657709532949456e-07,
"loss": 0.6947,
"num_input_tokens_seen": 1089728,
"step": 350
},
{
"epoch": 0.022725817809359197,
"grad_norm": 94.65788269042969,
"learning_rate": 4.5297504798464494e-07,
"loss": 0.9107,
"num_input_tokens_seen": 1105024,
"step": 355
},
{
"epoch": 0.023045899750336087,
"grad_norm": 52.23056411743164,
"learning_rate": 4.593730006397952e-07,
"loss": 0.7831,
"num_input_tokens_seen": 1121088,
"step": 360
},
{
"epoch": 0.023365981691312977,
"grad_norm": 52.77978515625,
"learning_rate": 4.657709532949456e-07,
"loss": 0.7062,
"num_input_tokens_seen": 1136896,
"step": 365
},
{
"epoch": 0.023686063632289867,
"grad_norm": 61.802242279052734,
"learning_rate": 4.7216890595009597e-07,
"loss": 0.7133,
"num_input_tokens_seen": 1153280,
"step": 370
},
{
"epoch": 0.024006145573266757,
"grad_norm": 56.2958869934082,
"learning_rate": 4.785668586052463e-07,
"loss": 0.8307,
"num_input_tokens_seen": 1169536,
"step": 375
},
{
"epoch": 0.024326227514243647,
"grad_norm": 89.10625457763672,
"learning_rate": 4.849648112603967e-07,
"loss": 0.7573,
"num_input_tokens_seen": 1185088,
"step": 380
},
{
"epoch": 0.024646309455220537,
"grad_norm": 36.04088592529297,
"learning_rate": 4.91362763915547e-07,
"loss": 0.6599,
"num_input_tokens_seen": 1200832,
"step": 385
},
{
"epoch": 0.024966391396197427,
"grad_norm": 51.54562759399414,
"learning_rate": 4.977607165706974e-07,
"loss": 0.6614,
"num_input_tokens_seen": 1216320,
"step": 390
},
{
"epoch": 0.025286473337174317,
"grad_norm": 63.85747528076172,
"learning_rate": 5.041586692258478e-07,
"loss": 0.7446,
"num_input_tokens_seen": 1232832,
"step": 395
},
{
"epoch": 0.025606555278151207,
"grad_norm": 51.690006256103516,
"learning_rate": 5.10556621880998e-07,
"loss": 0.7023,
"num_input_tokens_seen": 1248384,
"step": 400
},
{
"epoch": 0.025926637219128097,
"grad_norm": 37.42890167236328,
"learning_rate": 5.169545745361484e-07,
"loss": 0.672,
"num_input_tokens_seen": 1263936,
"step": 405
},
{
"epoch": 0.026246719160104987,
"grad_norm": 67.20600891113281,
"learning_rate": 5.233525271912988e-07,
"loss": 1.1224,
"num_input_tokens_seen": 1294208,
"step": 410
},
{
"epoch": 0.026566801101081877,
"grad_norm": 64.76204681396484,
"learning_rate": 5.297504798464492e-07,
"loss": 0.7874,
"num_input_tokens_seen": 1309120,
"step": 415
},
{
"epoch": 0.026886883042058767,
"grad_norm": 58.28200912475586,
"learning_rate": 5.361484325015994e-07,
"loss": 0.8574,
"num_input_tokens_seen": 1324224,
"step": 420
},
{
"epoch": 0.027206964983035656,
"grad_norm": 76.2400131225586,
"learning_rate": 5.425463851567498e-07,
"loss": 0.683,
"num_input_tokens_seen": 1341056,
"step": 425
},
{
"epoch": 0.027527046924012546,
"grad_norm": 53.95072555541992,
"learning_rate": 5.489443378119002e-07,
"loss": 0.7444,
"num_input_tokens_seen": 1356544,
"step": 430
},
{
"epoch": 0.027847128864989436,
"grad_norm": 65.18901824951172,
"learning_rate": 5.553422904670505e-07,
"loss": 0.6717,
"num_input_tokens_seen": 1371840,
"step": 435
},
{
"epoch": 0.028167210805966326,
"grad_norm": 56.888824462890625,
"learning_rate": 5.61740243122201e-07,
"loss": 0.6805,
"num_input_tokens_seen": 1386816,
"step": 440
},
{
"epoch": 0.028487292746943216,
"grad_norm": 65.56224822998047,
"learning_rate": 5.681381957773512e-07,
"loss": 0.744,
"num_input_tokens_seen": 1401792,
"step": 445
},
{
"epoch": 0.028807374687920106,
"grad_norm": 67.67861938476562,
"learning_rate": 5.745361484325015e-07,
"loss": 0.6219,
"num_input_tokens_seen": 1416896,
"step": 450
},
{
"epoch": 0.029127456628896996,
"grad_norm": 84.42303466796875,
"learning_rate": 5.80934101087652e-07,
"loss": 0.7479,
"num_input_tokens_seen": 1432704,
"step": 455
},
{
"epoch": 0.029447538569873886,
"grad_norm": 87.79871368408203,
"learning_rate": 5.873320537428022e-07,
"loss": 0.7009,
"num_input_tokens_seen": 1448384,
"step": 460
},
{
"epoch": 0.029767620510850776,
"grad_norm": 60.47886276245117,
"learning_rate": 5.937300063979526e-07,
"loss": 0.7189,
"num_input_tokens_seen": 1464832,
"step": 465
},
{
"epoch": 0.030087702451827666,
"grad_norm": 71.14859008789062,
"learning_rate": 6.00127959053103e-07,
"loss": 0.678,
"num_input_tokens_seen": 1479424,
"step": 470
},
{
"epoch": 0.030407784392804556,
"grad_norm": 41.08133316040039,
"learning_rate": 6.065259117082533e-07,
"loss": 0.7233,
"num_input_tokens_seen": 1494336,
"step": 475
},
{
"epoch": 0.030727866333781446,
"grad_norm": 54.67021179199219,
"learning_rate": 6.129238643634037e-07,
"loss": 0.6771,
"num_input_tokens_seen": 1509184,
"step": 480
},
{
"epoch": 0.031047948274758336,
"grad_norm": 37.22821807861328,
"learning_rate": 6.19321817018554e-07,
"loss": 0.8088,
"num_input_tokens_seen": 1525504,
"step": 485
},
{
"epoch": 0.031368030215735226,
"grad_norm": 41.66913604736328,
"learning_rate": 6.257197696737044e-07,
"loss": 0.5954,
"num_input_tokens_seen": 1541504,
"step": 490
},
{
"epoch": 0.03168811215671212,
"grad_norm": 39.45866012573242,
"learning_rate": 6.321177223288548e-07,
"loss": 0.6166,
"num_input_tokens_seen": 1557184,
"step": 495
},
{
"epoch": 0.032008194097689006,
"grad_norm": 60.6429443359375,
"learning_rate": 6.385156749840051e-07,
"loss": 0.7699,
"num_input_tokens_seen": 1573440,
"step": 500
},
{
"epoch": 0.0323282760386659,
"grad_norm": 35.02703857421875,
"learning_rate": 6.449136276391554e-07,
"loss": 0.8718,
"num_input_tokens_seen": 1588736,
"step": 505
},
{
"epoch": 0.032648357979642786,
"grad_norm": 43.51701354980469,
"learning_rate": 6.513115802943058e-07,
"loss": 0.6977,
"num_input_tokens_seen": 1604352,
"step": 510
},
{
"epoch": 0.03296843992061968,
"grad_norm": 41.14889907836914,
"learning_rate": 6.577095329494562e-07,
"loss": 0.6582,
"num_input_tokens_seen": 1618816,
"step": 515
},
{
"epoch": 0.033288521861596566,
"grad_norm": 70.44395446777344,
"learning_rate": 6.641074856046065e-07,
"loss": 0.7409,
"num_input_tokens_seen": 1635648,
"step": 520
},
{
"epoch": 0.03360860380257346,
"grad_norm": 53.1386833190918,
"learning_rate": 6.705054382597568e-07,
"loss": 0.7248,
"num_input_tokens_seen": 1651328,
"step": 525
},
{
"epoch": 0.033928685743550346,
"grad_norm": 42.05000305175781,
"learning_rate": 6.769033909149072e-07,
"loss": 0.7271,
"num_input_tokens_seen": 1668928,
"step": 530
},
{
"epoch": 0.03424876768452724,
"grad_norm": 51.52647018432617,
"learning_rate": 6.833013435700575e-07,
"loss": 0.6188,
"num_input_tokens_seen": 1685504,
"step": 535
},
{
"epoch": 0.034568849625504126,
"grad_norm": 57.56531524658203,
"learning_rate": 6.89699296225208e-07,
"loss": 0.7016,
"num_input_tokens_seen": 1701952,
"step": 540
},
{
"epoch": 0.03488893156648102,
"grad_norm": 58.09773635864258,
"learning_rate": 6.960972488803583e-07,
"loss": 0.7293,
"num_input_tokens_seen": 1716992,
"step": 545
},
{
"epoch": 0.035209013507457906,
"grad_norm": 27.947566986083984,
"learning_rate": 7.024952015355085e-07,
"loss": 0.583,
"num_input_tokens_seen": 1732160,
"step": 550
},
{
"epoch": 0.0355290954484348,
"grad_norm": 66.85079193115234,
"learning_rate": 7.08893154190659e-07,
"loss": 0.656,
"num_input_tokens_seen": 1748416,
"step": 555
},
{
"epoch": 0.035849177389411686,
"grad_norm": 48.763916015625,
"learning_rate": 7.152911068458093e-07,
"loss": 0.7074,
"num_input_tokens_seen": 1763776,
"step": 560
},
{
"epoch": 0.03616925933038858,
"grad_norm": 55.289859771728516,
"learning_rate": 7.216890595009597e-07,
"loss": 0.7046,
"num_input_tokens_seen": 1780160,
"step": 565
},
{
"epoch": 0.036489341271365466,
"grad_norm": 51.82642364501953,
"learning_rate": 7.2808701215611e-07,
"loss": 0.6024,
"num_input_tokens_seen": 1795968,
"step": 570
},
{
"epoch": 0.03680942321234236,
"grad_norm": 46.479549407958984,
"learning_rate": 7.344849648112603e-07,
"loss": 0.6319,
"num_input_tokens_seen": 1815424,
"step": 575
},
{
"epoch": 0.037129505153319246,
"grad_norm": 86.72647857666016,
"learning_rate": 7.408829174664107e-07,
"loss": 0.8256,
"num_input_tokens_seen": 1831936,
"step": 580
},
{
"epoch": 0.03744958709429614,
"grad_norm": 34.57395935058594,
"learning_rate": 7.472808701215611e-07,
"loss": 0.6147,
"num_input_tokens_seen": 1847424,
"step": 585
},
{
"epoch": 0.03776966903527303,
"grad_norm": 47.81095886230469,
"learning_rate": 7.536788227767114e-07,
"loss": 0.7338,
"num_input_tokens_seen": 1862400,
"step": 590
},
{
"epoch": 0.03808975097624992,
"grad_norm": 85.52812194824219,
"learning_rate": 7.600767754318617e-07,
"loss": 0.7698,
"num_input_tokens_seen": 1876928,
"step": 595
},
{
"epoch": 0.03840983291722681,
"grad_norm": 54.25386047363281,
"learning_rate": 7.664747280870121e-07,
"loss": 0.6403,
"num_input_tokens_seen": 1892608,
"step": 600
},
{
"epoch": 0.0387299148582037,
"grad_norm": 37.492774963378906,
"learning_rate": 7.728726807421625e-07,
"loss": 0.7287,
"num_input_tokens_seen": 1909696,
"step": 605
},
{
"epoch": 0.03904999679918059,
"grad_norm": 40.18218231201172,
"learning_rate": 7.792706333973129e-07,
"loss": 0.7582,
"num_input_tokens_seen": 1924864,
"step": 610
},
{
"epoch": 0.03937007874015748,
"grad_norm": 36.508460998535156,
"learning_rate": 7.856685860524632e-07,
"loss": 0.5567,
"num_input_tokens_seen": 1939968,
"step": 615
},
{
"epoch": 0.03969016068113437,
"grad_norm": 70.67202758789062,
"learning_rate": 7.920665387076135e-07,
"loss": 0.711,
"num_input_tokens_seen": 1955136,
"step": 620
},
{
"epoch": 0.04001024262211126,
"grad_norm": 44.07026290893555,
"learning_rate": 7.984644913627639e-07,
"loss": 0.7024,
"num_input_tokens_seen": 1970880,
"step": 625
},
{
"epoch": 0.04033032456308815,
"grad_norm": 62.611148834228516,
"learning_rate": 8.048624440179143e-07,
"loss": 0.583,
"num_input_tokens_seen": 1986752,
"step": 630
},
{
"epoch": 0.04065040650406504,
"grad_norm": 41.27976608276367,
"learning_rate": 8.112603966730645e-07,
"loss": 0.5988,
"num_input_tokens_seen": 2001856,
"step": 635
},
{
"epoch": 0.04097048844504193,
"grad_norm": 51.214908599853516,
"learning_rate": 8.17658349328215e-07,
"loss": 0.6581,
"num_input_tokens_seen": 2019968,
"step": 640
},
{
"epoch": 0.04129057038601882,
"grad_norm": 60.009483337402344,
"learning_rate": 8.240563019833653e-07,
"loss": 0.7118,
"num_input_tokens_seen": 2035328,
"step": 645
},
{
"epoch": 0.04161065232699571,
"grad_norm": 44.75967788696289,
"learning_rate": 8.304542546385156e-07,
"loss": 0.6236,
"num_input_tokens_seen": 2055168,
"step": 650
},
{
"epoch": 0.0419307342679726,
"grad_norm": 44.7182731628418,
"learning_rate": 8.36852207293666e-07,
"loss": 0.7478,
"num_input_tokens_seen": 2071808,
"step": 655
},
{
"epoch": 0.04225081620894949,
"grad_norm": 45.788204193115234,
"learning_rate": 8.432501599488163e-07,
"loss": 0.6519,
"num_input_tokens_seen": 2087424,
"step": 660
},
{
"epoch": 0.04257089814992638,
"grad_norm": 36.59204864501953,
"learning_rate": 8.496481126039667e-07,
"loss": 0.7696,
"num_input_tokens_seen": 2102592,
"step": 665
},
{
"epoch": 0.04289098009090327,
"grad_norm": 45.53342056274414,
"learning_rate": 8.560460652591171e-07,
"loss": 0.6487,
"num_input_tokens_seen": 2119488,
"step": 670
},
{
"epoch": 0.04321106203188016,
"grad_norm": 38.34718704223633,
"learning_rate": 8.624440179142674e-07,
"loss": 0.6395,
"num_input_tokens_seen": 2136000,
"step": 675
},
{
"epoch": 0.04353114397285705,
"grad_norm": 49.36905288696289,
"learning_rate": 8.688419705694177e-07,
"loss": 0.7432,
"num_input_tokens_seen": 2152448,
"step": 680
},
{
"epoch": 0.04385122591383394,
"grad_norm": 45.985836029052734,
"learning_rate": 8.752399232245681e-07,
"loss": 0.6236,
"num_input_tokens_seen": 2168000,
"step": 685
},
{
"epoch": 0.04417130785481083,
"grad_norm": 44.200565338134766,
"learning_rate": 8.816378758797185e-07,
"loss": 0.5919,
"num_input_tokens_seen": 2183552,
"step": 690
},
{
"epoch": 0.04449138979578772,
"grad_norm": 55.06573486328125,
"learning_rate": 8.880358285348688e-07,
"loss": 0.7068,
"num_input_tokens_seen": 2199488,
"step": 695
},
{
"epoch": 0.04481147173676461,
"grad_norm": 37.759578704833984,
"learning_rate": 8.944337811900191e-07,
"loss": 0.6095,
"num_input_tokens_seen": 2215296,
"step": 700
},
{
"epoch": 0.0451315536777415,
"grad_norm": 52.18317794799805,
"learning_rate": 9.008317338451695e-07,
"loss": 0.7106,
"num_input_tokens_seen": 2230016,
"step": 705
},
{
"epoch": 0.04545163561871839,
"grad_norm": 50.49892044067383,
"learning_rate": 9.072296865003198e-07,
"loss": 0.666,
"num_input_tokens_seen": 2245056,
"step": 710
},
{
"epoch": 0.04577171755969528,
"grad_norm": 36.86115646362305,
"learning_rate": 9.136276391554703e-07,
"loss": 0.6173,
"num_input_tokens_seen": 2261248,
"step": 715
},
{
"epoch": 0.04609179950067217,
"grad_norm": 57.64673614501953,
"learning_rate": 9.200255918106205e-07,
"loss": 0.65,
"num_input_tokens_seen": 2278016,
"step": 720
},
{
"epoch": 0.04641188144164906,
"grad_norm": 47.956661224365234,
"learning_rate": 9.264235444657708e-07,
"loss": 0.5715,
"num_input_tokens_seen": 2292800,
"step": 725
},
{
"epoch": 0.04673196338262595,
"grad_norm": 36.23506546020508,
"learning_rate": 9.328214971209213e-07,
"loss": 0.5988,
"num_input_tokens_seen": 2308224,
"step": 730
},
{
"epoch": 0.04705204532360284,
"grad_norm": 41.42891311645508,
"learning_rate": 9.392194497760716e-07,
"loss": 0.7054,
"num_input_tokens_seen": 2325760,
"step": 735
},
{
"epoch": 0.04737212726457973,
"grad_norm": 58.167598724365234,
"learning_rate": 9.456174024312221e-07,
"loss": 0.6911,
"num_input_tokens_seen": 2341632,
"step": 740
},
{
"epoch": 0.04769220920555662,
"grad_norm": 40.3576774597168,
"learning_rate": 9.520153550863723e-07,
"loss": 0.7079,
"num_input_tokens_seen": 2357504,
"step": 745
},
{
"epoch": 0.04801229114653351,
"grad_norm": 43.75523376464844,
"learning_rate": 9.584133077415226e-07,
"loss": 0.7284,
"num_input_tokens_seen": 2372608,
"step": 750
},
{
"epoch": 0.0483323730875104,
"grad_norm": 44.16348648071289,
"learning_rate": 9.64811260396673e-07,
"loss": 0.5945,
"num_input_tokens_seen": 2388352,
"step": 755
},
{
"epoch": 0.04865245502848729,
"grad_norm": 48.235191345214844,
"learning_rate": 9.712092130518234e-07,
"loss": 0.7012,
"num_input_tokens_seen": 2404480,
"step": 760
},
{
"epoch": 0.04897253696946418,
"grad_norm": 34.269805908203125,
"learning_rate": 9.776071657069737e-07,
"loss": 0.5138,
"num_input_tokens_seen": 2419648,
"step": 765
},
{
"epoch": 0.04929261891044107,
"grad_norm": 46.598114013671875,
"learning_rate": 9.840051183621241e-07,
"loss": 0.71,
"num_input_tokens_seen": 2435584,
"step": 770
},
{
"epoch": 0.04961270085141796,
"grad_norm": 59.983123779296875,
"learning_rate": 9.904030710172743e-07,
"loss": 0.6251,
"num_input_tokens_seen": 2451072,
"step": 775
},
{
"epoch": 0.04993278279239485,
"grad_norm": 61.95142364501953,
"learning_rate": 9.968010236724249e-07,
"loss": 0.7605,
"num_input_tokens_seen": 2467968,
"step": 780
},
{
"epoch": 0.05006081556878561,
"eval_loss": 0.6365677118301392,
"eval_runtime": 50.6079,
"eval_samples_per_second": 274.384,
"eval_steps_per_second": 34.303,
"num_input_tokens_seen": 2474432,
"step": 782
},
{
"epoch": 0.05025286473337175,
"grad_norm": 54.79026412963867,
"learning_rate": 1.0031989763275752e-06,
"loss": 0.6742,
"num_input_tokens_seen": 2484928,
"step": 785
},
{
"epoch": 0.05057294667434863,
"grad_norm": 43.971065521240234,
"learning_rate": 1.0095969289827256e-06,
"loss": 0.6802,
"num_input_tokens_seen": 2501504,
"step": 790
},
{
"epoch": 0.050893028615325527,
"grad_norm": 34.02169418334961,
"learning_rate": 1.0159948816378758e-06,
"loss": 0.5171,
"num_input_tokens_seen": 2518848,
"step": 795
},
{
"epoch": 0.05121311055630241,
"grad_norm": 34.22026443481445,
"learning_rate": 1.0223928342930262e-06,
"loss": 0.5474,
"num_input_tokens_seen": 2535680,
"step": 800
},
{
"epoch": 0.051533192497279307,
"grad_norm": 52.957420349121094,
"learning_rate": 1.0287907869481766e-06,
"loss": 0.6751,
"num_input_tokens_seen": 2550976,
"step": 805
},
{
"epoch": 0.05185327443825619,
"grad_norm": 40.167659759521484,
"learning_rate": 1.035188739603327e-06,
"loss": 0.5552,
"num_input_tokens_seen": 2566656,
"step": 810
},
{
"epoch": 0.052173356379233086,
"grad_norm": 51.743473052978516,
"learning_rate": 1.0415866922584773e-06,
"loss": 0.6319,
"num_input_tokens_seen": 2581568,
"step": 815
},
{
"epoch": 0.05249343832020997,
"grad_norm": 53.62697219848633,
"learning_rate": 1.0479846449136277e-06,
"loss": 0.64,
"num_input_tokens_seen": 2596608,
"step": 820
},
{
"epoch": 0.052813520261186866,
"grad_norm": 43.395721435546875,
"learning_rate": 1.0543825975687779e-06,
"loss": 0.672,
"num_input_tokens_seen": 2612032,
"step": 825
},
{
"epoch": 0.05313360220216375,
"grad_norm": 55.59555435180664,
"learning_rate": 1.0607805502239282e-06,
"loss": 0.7367,
"num_input_tokens_seen": 2627264,
"step": 830
},
{
"epoch": 0.053453684143140646,
"grad_norm": 45.545921325683594,
"learning_rate": 1.0671785028790788e-06,
"loss": 0.549,
"num_input_tokens_seen": 2643264,
"step": 835
},
{
"epoch": 0.05377376608411753,
"grad_norm": 44.034141540527344,
"learning_rate": 1.073576455534229e-06,
"loss": 0.542,
"num_input_tokens_seen": 2659264,
"step": 840
},
{
"epoch": 0.054093848025094426,
"grad_norm": 39.247581481933594,
"learning_rate": 1.0799744081893794e-06,
"loss": 0.5724,
"num_input_tokens_seen": 2673856,
"step": 845
},
{
"epoch": 0.05441392996607131,
"grad_norm": 36.05900192260742,
"learning_rate": 1.0863723608445297e-06,
"loss": 0.4715,
"num_input_tokens_seen": 2688448,
"step": 850
},
{
"epoch": 0.054734011907048206,
"grad_norm": 70.99398803710938,
"learning_rate": 1.09277031349968e-06,
"loss": 0.6158,
"num_input_tokens_seen": 2703872,
"step": 855
},
{
"epoch": 0.05505409384802509,
"grad_norm": 51.03853988647461,
"learning_rate": 1.0991682661548305e-06,
"loss": 0.6116,
"num_input_tokens_seen": 2719040,
"step": 860
},
{
"epoch": 0.055374175789001986,
"grad_norm": 37.42866897583008,
"learning_rate": 1.1055662188099809e-06,
"loss": 0.6182,
"num_input_tokens_seen": 2735168,
"step": 865
},
{
"epoch": 0.05569425772997887,
"grad_norm": 39.55155944824219,
"learning_rate": 1.111964171465131e-06,
"loss": 0.5281,
"num_input_tokens_seen": 2750592,
"step": 870
},
{
"epoch": 0.056014339670955766,
"grad_norm": 22.90837860107422,
"learning_rate": 1.1183621241202814e-06,
"loss": 0.6583,
"num_input_tokens_seen": 2767232,
"step": 875
},
{
"epoch": 0.05633442161193265,
"grad_norm": 55.55328369140625,
"learning_rate": 1.1247600767754318e-06,
"loss": 0.6623,
"num_input_tokens_seen": 2784768,
"step": 880
},
{
"epoch": 0.056654503552909546,
"grad_norm": 40.864261627197266,
"learning_rate": 1.1311580294305822e-06,
"loss": 0.5782,
"num_input_tokens_seen": 2799872,
"step": 885
},
{
"epoch": 0.05697458549388643,
"grad_norm": 53.441192626953125,
"learning_rate": 1.1375559820857326e-06,
"loss": 0.6444,
"num_input_tokens_seen": 2816000,
"step": 890
},
{
"epoch": 0.057294667434863326,
"grad_norm": 48.67129898071289,
"learning_rate": 1.143953934740883e-06,
"loss": 0.5844,
"num_input_tokens_seen": 2831744,
"step": 895
},
{
"epoch": 0.05761474937584021,
"grad_norm": 57.840087890625,
"learning_rate": 1.150351887396033e-06,
"loss": 0.6139,
"num_input_tokens_seen": 2847424,
"step": 900
},
{
"epoch": 0.057934831316817106,
"grad_norm": 54.10224151611328,
"learning_rate": 1.1567498400511835e-06,
"loss": 0.6182,
"num_input_tokens_seen": 2862272,
"step": 905
},
{
"epoch": 0.05825491325779399,
"grad_norm": 39.821617126464844,
"learning_rate": 1.163147792706334e-06,
"loss": 0.4962,
"num_input_tokens_seen": 2877120,
"step": 910
},
{
"epoch": 0.058574995198770886,
"grad_norm": 41.681732177734375,
"learning_rate": 1.1695457453614842e-06,
"loss": 0.5176,
"num_input_tokens_seen": 2894592,
"step": 915
},
{
"epoch": 0.05889507713974777,
"grad_norm": 43.29148864746094,
"learning_rate": 1.1759436980166346e-06,
"loss": 0.6171,
"num_input_tokens_seen": 2909888,
"step": 920
},
{
"epoch": 0.059215159080724666,
"grad_norm": 49.849117279052734,
"learning_rate": 1.182341650671785e-06,
"loss": 0.6226,
"num_input_tokens_seen": 2925632,
"step": 925
},
{
"epoch": 0.05953524102170155,
"grad_norm": 50.00711441040039,
"learning_rate": 1.1887396033269352e-06,
"loss": 0.6205,
"num_input_tokens_seen": 2941760,
"step": 930
},
{
"epoch": 0.059855322962678446,
"grad_norm": 48.1435546875,
"learning_rate": 1.1951375559820858e-06,
"loss": 0.7098,
"num_input_tokens_seen": 2957376,
"step": 935
},
{
"epoch": 0.06017540490365533,
"grad_norm": 68.08272552490234,
"learning_rate": 1.2015355086372361e-06,
"loss": 0.5669,
"num_input_tokens_seen": 2972800,
"step": 940
},
{
"epoch": 0.060495486844632226,
"grad_norm": 50.63016891479492,
"learning_rate": 1.2079334612923863e-06,
"loss": 0.7169,
"num_input_tokens_seen": 2988480,
"step": 945
},
{
"epoch": 0.06081556878560911,
"grad_norm": 44.2595100402832,
"learning_rate": 1.2143314139475367e-06,
"loss": 0.6581,
"num_input_tokens_seen": 3004480,
"step": 950
},
{
"epoch": 0.061135650726586006,
"grad_norm": 40.70684051513672,
"learning_rate": 1.220729366602687e-06,
"loss": 0.4512,
"num_input_tokens_seen": 3020288,
"step": 955
},
{
"epoch": 0.06145573266756289,
"grad_norm": 61.91799545288086,
"learning_rate": 1.2271273192578374e-06,
"loss": 0.5964,
"num_input_tokens_seen": 3035968,
"step": 960
},
{
"epoch": 0.061775814608539786,
"grad_norm": 62.57038116455078,
"learning_rate": 1.2335252719129878e-06,
"loss": 0.7436,
"num_input_tokens_seen": 3051776,
"step": 965
},
{
"epoch": 0.06209589654951667,
"grad_norm": 54.44983673095703,
"learning_rate": 1.2399232245681382e-06,
"loss": 0.5567,
"num_input_tokens_seen": 3066560,
"step": 970
},
{
"epoch": 0.062415978490493566,
"grad_norm": 55.24098587036133,
"learning_rate": 1.2463211772232884e-06,
"loss": 0.6348,
"num_input_tokens_seen": 3082496,
"step": 975
},
{
"epoch": 0.06273606043147045,
"grad_norm": 45.73814010620117,
"learning_rate": 1.2527191298784387e-06,
"loss": 0.6289,
"num_input_tokens_seen": 3097856,
"step": 980
},
{
"epoch": 0.06305614237244735,
"grad_norm": 26.932607650756836,
"learning_rate": 1.2591170825335893e-06,
"loss": 0.5081,
"num_input_tokens_seen": 3113664,
"step": 985
},
{
"epoch": 0.06337622431342424,
"grad_norm": 39.2259521484375,
"learning_rate": 1.2655150351887395e-06,
"loss": 0.6276,
"num_input_tokens_seen": 3129792,
"step": 990
},
{
"epoch": 0.06369630625440113,
"grad_norm": 31.826623916625977,
"learning_rate": 1.2719129878438899e-06,
"loss": 0.5917,
"num_input_tokens_seen": 3145024,
"step": 995
},
{
"epoch": 0.06401638819537801,
"grad_norm": 42.885284423828125,
"learning_rate": 1.2783109404990402e-06,
"loss": 0.7761,
"num_input_tokens_seen": 3161216,
"step": 1000
},
{
"epoch": 0.0643364701363549,
"grad_norm": 28.05913543701172,
"learning_rate": 1.2847088931541904e-06,
"loss": 0.4107,
"num_input_tokens_seen": 3176960,
"step": 1005
},
{
"epoch": 0.0646565520773318,
"grad_norm": 41.80731201171875,
"learning_rate": 1.291106845809341e-06,
"loss": 0.6098,
"num_input_tokens_seen": 3193088,
"step": 1010
},
{
"epoch": 0.0649766340183087,
"grad_norm": 64.06974029541016,
"learning_rate": 1.2975047984644914e-06,
"loss": 0.6776,
"num_input_tokens_seen": 3210112,
"step": 1015
},
{
"epoch": 0.06529671595928557,
"grad_norm": 42.754390716552734,
"learning_rate": 1.3039027511196418e-06,
"loss": 0.508,
"num_input_tokens_seen": 3224768,
"step": 1020
},
{
"epoch": 0.06561679790026247,
"grad_norm": 45.79197692871094,
"learning_rate": 1.310300703774792e-06,
"loss": 0.6266,
"num_input_tokens_seen": 3240128,
"step": 1025
},
{
"epoch": 0.06593687984123936,
"grad_norm": 43.44371032714844,
"learning_rate": 1.3166986564299423e-06,
"loss": 0.5607,
"num_input_tokens_seen": 3256576,
"step": 1030
},
{
"epoch": 0.06625696178221625,
"grad_norm": 35.83990478515625,
"learning_rate": 1.3230966090850929e-06,
"loss": 0.5973,
"num_input_tokens_seen": 3272384,
"step": 1035
},
{
"epoch": 0.06657704372319313,
"grad_norm": 39.57344436645508,
"learning_rate": 1.329494561740243e-06,
"loss": 0.4099,
"num_input_tokens_seen": 3288512,
"step": 1040
},
{
"epoch": 0.06689712566417003,
"grad_norm": 42.25546646118164,
"learning_rate": 1.3358925143953934e-06,
"loss": 0.4889,
"num_input_tokens_seen": 3306304,
"step": 1045
},
{
"epoch": 0.06721720760514692,
"grad_norm": 44.445255279541016,
"learning_rate": 1.3422904670505438e-06,
"loss": 0.4795,
"num_input_tokens_seen": 3321344,
"step": 1050
},
{
"epoch": 0.06753728954612381,
"grad_norm": 40.285926818847656,
"learning_rate": 1.348688419705694e-06,
"loss": 0.6207,
"num_input_tokens_seen": 3338560,
"step": 1055
},
{
"epoch": 0.06785737148710069,
"grad_norm": 27.29134750366211,
"learning_rate": 1.3550863723608446e-06,
"loss": 0.5093,
"num_input_tokens_seen": 3353152,
"step": 1060
},
{
"epoch": 0.06817745342807759,
"grad_norm": 47.73579788208008,
"learning_rate": 1.361484325015995e-06,
"loss": 0.5401,
"num_input_tokens_seen": 3369536,
"step": 1065
},
{
"epoch": 0.06849753536905448,
"grad_norm": 46.23472213745117,
"learning_rate": 1.3678822776711451e-06,
"loss": 0.6053,
"num_input_tokens_seen": 3384832,
"step": 1070
},
{
"epoch": 0.06881761731003137,
"grad_norm": 40.9404411315918,
"learning_rate": 1.3742802303262955e-06,
"loss": 0.494,
"num_input_tokens_seen": 3399424,
"step": 1075
},
{
"epoch": 0.06913769925100825,
"grad_norm": 32.21672058105469,
"learning_rate": 1.3806781829814459e-06,
"loss": 0.6625,
"num_input_tokens_seen": 3416704,
"step": 1080
},
{
"epoch": 0.06945778119198515,
"grad_norm": 61.273109436035156,
"learning_rate": 1.3870761356365963e-06,
"loss": 0.5524,
"num_input_tokens_seen": 3431552,
"step": 1085
},
{
"epoch": 0.06977786313296204,
"grad_norm": 38.4173469543457,
"learning_rate": 1.3934740882917466e-06,
"loss": 0.6509,
"num_input_tokens_seen": 3447488,
"step": 1090
},
{
"epoch": 0.07009794507393893,
"grad_norm": 41.265380859375,
"learning_rate": 1.399872040946897e-06,
"loss": 0.5205,
"num_input_tokens_seen": 3463424,
"step": 1095
},
{
"epoch": 0.07041802701491581,
"grad_norm": 52.113468170166016,
"learning_rate": 1.4062699936020472e-06,
"loss": 0.6568,
"num_input_tokens_seen": 3479680,
"step": 1100
},
{
"epoch": 0.0707381089558927,
"grad_norm": 51.376312255859375,
"learning_rate": 1.4126679462571976e-06,
"loss": 0.5165,
"num_input_tokens_seen": 3495552,
"step": 1105
},
{
"epoch": 0.0710581908968696,
"grad_norm": 57.10530471801758,
"learning_rate": 1.4190658989123481e-06,
"loss": 0.4769,
"num_input_tokens_seen": 3510976,
"step": 1110
},
{
"epoch": 0.0713782728378465,
"grad_norm": 50.24375534057617,
"learning_rate": 1.4254638515674983e-06,
"loss": 0.6453,
"num_input_tokens_seen": 3526016,
"step": 1115
},
{
"epoch": 0.07169835477882337,
"grad_norm": 22.63068962097168,
"learning_rate": 1.4318618042226487e-06,
"loss": 0.5647,
"num_input_tokens_seen": 3540544,
"step": 1120
},
{
"epoch": 0.07201843671980027,
"grad_norm": 44.166378021240234,
"learning_rate": 1.438259756877799e-06,
"loss": 0.6074,
"num_input_tokens_seen": 3556416,
"step": 1125
},
{
"epoch": 0.07233851866077716,
"grad_norm": 45.217864990234375,
"learning_rate": 1.4446577095329492e-06,
"loss": 0.5063,
"num_input_tokens_seen": 3572096,
"step": 1130
},
{
"epoch": 0.07265860060175405,
"grad_norm": 43.43882751464844,
"learning_rate": 1.4510556621880998e-06,
"loss": 0.4966,
"num_input_tokens_seen": 3587712,
"step": 1135
},
{
"epoch": 0.07297868254273093,
"grad_norm": 31.869800567626953,
"learning_rate": 1.4574536148432502e-06,
"loss": 0.5503,
"num_input_tokens_seen": 3605056,
"step": 1140
},
{
"epoch": 0.07329876448370783,
"grad_norm": 43.5185661315918,
"learning_rate": 1.4638515674984004e-06,
"loss": 0.6826,
"num_input_tokens_seen": 3621184,
"step": 1145
},
{
"epoch": 0.07361884642468472,
"grad_norm": 27.480981826782227,
"learning_rate": 1.4702495201535507e-06,
"loss": 0.5917,
"num_input_tokens_seen": 3635392,
"step": 1150
},
{
"epoch": 0.07393892836566161,
"grad_norm": 34.03066635131836,
"learning_rate": 1.4766474728087011e-06,
"loss": 0.5064,
"num_input_tokens_seen": 3649984,
"step": 1155
},
{
"epoch": 0.07425901030663849,
"grad_norm": 35.03831100463867,
"learning_rate": 1.4830454254638515e-06,
"loss": 0.5239,
"num_input_tokens_seen": 3665920,
"step": 1160
},
{
"epoch": 0.07457909224761539,
"grad_norm": 33.77798080444336,
"learning_rate": 1.4894433781190019e-06,
"loss": 0.4573,
"num_input_tokens_seen": 3680256,
"step": 1165
},
{
"epoch": 0.07489917418859228,
"grad_norm": 51.53587341308594,
"learning_rate": 1.4958413307741523e-06,
"loss": 0.5284,
"num_input_tokens_seen": 3697536,
"step": 1170
},
{
"epoch": 0.07521925612956917,
"grad_norm": 36.20228576660156,
"learning_rate": 1.5022392834293024e-06,
"loss": 0.6147,
"num_input_tokens_seen": 3713088,
"step": 1175
},
{
"epoch": 0.07553933807054607,
"grad_norm": 58.152000427246094,
"learning_rate": 1.5086372360844528e-06,
"loss": 0.6674,
"num_input_tokens_seen": 3729920,
"step": 1180
},
{
"epoch": 0.07585942001152295,
"grad_norm": 47.89228057861328,
"learning_rate": 1.5150351887396034e-06,
"loss": 0.5478,
"num_input_tokens_seen": 3745664,
"step": 1185
},
{
"epoch": 0.07617950195249984,
"grad_norm": 36.1474609375,
"learning_rate": 1.5214331413947536e-06,
"loss": 0.5246,
"num_input_tokens_seen": 3760576,
"step": 1190
},
{
"epoch": 0.07649958389347673,
"grad_norm": 30.582496643066406,
"learning_rate": 1.527831094049904e-06,
"loss": 0.5073,
"num_input_tokens_seen": 3776576,
"step": 1195
},
{
"epoch": 0.07681966583445363,
"grad_norm": 44.59170150756836,
"learning_rate": 1.5342290467050543e-06,
"loss": 0.5868,
"num_input_tokens_seen": 3792384,
"step": 1200
},
{
"epoch": 0.0771397477754305,
"grad_norm": 46.972347259521484,
"learning_rate": 1.5406269993602045e-06,
"loss": 0.6464,
"num_input_tokens_seen": 3806592,
"step": 1205
},
{
"epoch": 0.0774598297164074,
"grad_norm": 53.56911849975586,
"learning_rate": 1.547024952015355e-06,
"loss": 0.5778,
"num_input_tokens_seen": 3822080,
"step": 1210
},
{
"epoch": 0.07777991165738429,
"grad_norm": 39.08710479736328,
"learning_rate": 1.5534229046705055e-06,
"loss": 0.5346,
"num_input_tokens_seen": 3837120,
"step": 1215
},
{
"epoch": 0.07809999359836119,
"grad_norm": 39.212432861328125,
"learning_rate": 1.5598208573256556e-06,
"loss": 0.6952,
"num_input_tokens_seen": 3852864,
"step": 1220
},
{
"epoch": 0.07842007553933807,
"grad_norm": 34.752010345458984,
"learning_rate": 1.566218809980806e-06,
"loss": 0.5148,
"num_input_tokens_seen": 3869184,
"step": 1225
},
{
"epoch": 0.07874015748031496,
"grad_norm": 33.27532958984375,
"learning_rate": 1.5726167626359564e-06,
"loss": 0.5376,
"num_input_tokens_seen": 3885248,
"step": 1230
},
{
"epoch": 0.07906023942129185,
"grad_norm": 33.97770309448242,
"learning_rate": 1.5790147152911068e-06,
"loss": 0.4261,
"num_input_tokens_seen": 3900416,
"step": 1235
},
{
"epoch": 0.07938032136226875,
"grad_norm": 39.494102478027344,
"learning_rate": 1.5854126679462571e-06,
"loss": 0.553,
"num_input_tokens_seen": 3916096,
"step": 1240
},
{
"epoch": 0.07970040330324563,
"grad_norm": 50.25145721435547,
"learning_rate": 1.5918106206014075e-06,
"loss": 0.6992,
"num_input_tokens_seen": 3933312,
"step": 1245
},
{
"epoch": 0.08002048524422252,
"grad_norm": 77.80860900878906,
"learning_rate": 1.5982085732565577e-06,
"loss": 0.615,
"num_input_tokens_seen": 3949440,
"step": 1250
},
{
"epoch": 0.08034056718519941,
"grad_norm": 38.29566955566406,
"learning_rate": 1.604606525911708e-06,
"loss": 0.681,
"num_input_tokens_seen": 3964992,
"step": 1255
},
{
"epoch": 0.0806606491261763,
"grad_norm": 57.336204528808594,
"learning_rate": 1.6110044785668586e-06,
"loss": 0.733,
"num_input_tokens_seen": 3981696,
"step": 1260
},
{
"epoch": 0.08098073106715319,
"grad_norm": 39.39405059814453,
"learning_rate": 1.617402431222009e-06,
"loss": 0.6123,
"num_input_tokens_seen": 3997248,
"step": 1265
},
{
"epoch": 0.08130081300813008,
"grad_norm": 29.0351505279541,
"learning_rate": 1.6238003838771592e-06,
"loss": 0.6693,
"num_input_tokens_seen": 4011648,
"step": 1270
},
{
"epoch": 0.08162089494910697,
"grad_norm": 44.553062438964844,
"learning_rate": 1.6301983365323096e-06,
"loss": 0.5273,
"num_input_tokens_seen": 4028160,
"step": 1275
},
{
"epoch": 0.08194097689008387,
"grad_norm": 49.18455505371094,
"learning_rate": 1.63659628918746e-06,
"loss": 0.5358,
"num_input_tokens_seen": 4043584,
"step": 1280
},
{
"epoch": 0.08226105883106075,
"grad_norm": 49.502559661865234,
"learning_rate": 1.6429942418426103e-06,
"loss": 0.7184,
"num_input_tokens_seen": 4059456,
"step": 1285
},
{
"epoch": 0.08258114077203764,
"grad_norm": 26.83738899230957,
"learning_rate": 1.6493921944977607e-06,
"loss": 0.6079,
"num_input_tokens_seen": 4076096,
"step": 1290
},
{
"epoch": 0.08290122271301453,
"grad_norm": 43.34971618652344,
"learning_rate": 1.655790147152911e-06,
"loss": 0.6143,
"num_input_tokens_seen": 4093568,
"step": 1295
},
{
"epoch": 0.08322130465399143,
"grad_norm": 37.30617904663086,
"learning_rate": 1.6621880998080612e-06,
"loss": 0.66,
"num_input_tokens_seen": 4108864,
"step": 1300
},
{
"epoch": 0.0835413865949683,
"grad_norm": 38.67463684082031,
"learning_rate": 1.6685860524632116e-06,
"loss": 0.6375,
"num_input_tokens_seen": 4124224,
"step": 1305
},
{
"epoch": 0.0838614685359452,
"grad_norm": 42.833805084228516,
"learning_rate": 1.6749840051183622e-06,
"loss": 0.5497,
"num_input_tokens_seen": 4139008,
"step": 1310
},
{
"epoch": 0.08418155047692209,
"grad_norm": 25.77740478515625,
"learning_rate": 1.6813819577735124e-06,
"loss": 0.5553,
"num_input_tokens_seen": 4155008,
"step": 1315
},
{
"epoch": 0.08450163241789899,
"grad_norm": 28.039791107177734,
"learning_rate": 1.6877799104286628e-06,
"loss": 0.4854,
"num_input_tokens_seen": 4172544,
"step": 1320
},
{
"epoch": 0.08482171435887587,
"grad_norm": 49.88291549682617,
"learning_rate": 1.6941778630838131e-06,
"loss": 0.6347,
"num_input_tokens_seen": 4188416,
"step": 1325
},
{
"epoch": 0.08514179629985276,
"grad_norm": 20.655033111572266,
"learning_rate": 1.7005758157389633e-06,
"loss": 0.4947,
"num_input_tokens_seen": 4202560,
"step": 1330
},
{
"epoch": 0.08546187824082965,
"grad_norm": 36.83515548706055,
"learning_rate": 1.706973768394114e-06,
"loss": 0.5618,
"num_input_tokens_seen": 4219392,
"step": 1335
},
{
"epoch": 0.08578196018180655,
"grad_norm": 52.520286560058594,
"learning_rate": 1.7133717210492643e-06,
"loss": 0.4941,
"num_input_tokens_seen": 4235328,
"step": 1340
},
{
"epoch": 0.08610204212278343,
"grad_norm": 54.2568359375,
"learning_rate": 1.7197696737044144e-06,
"loss": 0.5828,
"num_input_tokens_seen": 4250368,
"step": 1345
},
{
"epoch": 0.08642212406376032,
"grad_norm": 29.071889877319336,
"learning_rate": 1.7261676263595648e-06,
"loss": 0.4594,
"num_input_tokens_seen": 4265856,
"step": 1350
},
{
"epoch": 0.08674220600473721,
"grad_norm": 41.43865203857422,
"learning_rate": 1.7325655790147152e-06,
"loss": 0.6623,
"num_input_tokens_seen": 4281792,
"step": 1355
},
{
"epoch": 0.0870622879457141,
"grad_norm": 39.55424118041992,
"learning_rate": 1.7389635316698656e-06,
"loss": 0.5861,
"num_input_tokens_seen": 4297088,
"step": 1360
},
{
"epoch": 0.087382369886691,
"grad_norm": 29.17723846435547,
"learning_rate": 1.745361484325016e-06,
"loss": 0.5083,
"num_input_tokens_seen": 4312192,
"step": 1365
},
{
"epoch": 0.08770245182766788,
"grad_norm": 29.204944610595703,
"learning_rate": 1.7517594369801663e-06,
"loss": 0.4848,
"num_input_tokens_seen": 4326720,
"step": 1370
},
{
"epoch": 0.08802253376864477,
"grad_norm": 62.19997787475586,
"learning_rate": 1.7581573896353165e-06,
"loss": 0.7944,
"num_input_tokens_seen": 4341760,
"step": 1375
},
{
"epoch": 0.08834261570962167,
"grad_norm": 38.76377868652344,
"learning_rate": 1.7645553422904669e-06,
"loss": 0.6211,
"num_input_tokens_seen": 4357760,
"step": 1380
},
{
"epoch": 0.08866269765059856,
"grad_norm": 37.737586975097656,
"learning_rate": 1.7709532949456175e-06,
"loss": 0.6269,
"num_input_tokens_seen": 4373824,
"step": 1385
},
{
"epoch": 0.08898277959157544,
"grad_norm": 33.710941314697266,
"learning_rate": 1.7773512476007676e-06,
"loss": 0.4764,
"num_input_tokens_seen": 4388992,
"step": 1390
},
{
"epoch": 0.08930286153255233,
"grad_norm": 31.27020835876465,
"learning_rate": 1.783749200255918e-06,
"loss": 0.5266,
"num_input_tokens_seen": 4404288,
"step": 1395
},
{
"epoch": 0.08962294347352923,
"grad_norm": 52.275421142578125,
"learning_rate": 1.7901471529110684e-06,
"loss": 0.5359,
"num_input_tokens_seen": 4419840,
"step": 1400
},
{
"epoch": 0.08994302541450612,
"grad_norm": 57.549407958984375,
"learning_rate": 1.7965451055662186e-06,
"loss": 0.5118,
"num_input_tokens_seen": 4435200,
"step": 1405
},
{
"epoch": 0.090263107355483,
"grad_norm": 55.89923858642578,
"learning_rate": 1.8029430582213691e-06,
"loss": 0.61,
"num_input_tokens_seen": 4450368,
"step": 1410
},
{
"epoch": 0.09058318929645989,
"grad_norm": 44.54636764526367,
"learning_rate": 1.8093410108765195e-06,
"loss": 0.4899,
"num_input_tokens_seen": 4466048,
"step": 1415
},
{
"epoch": 0.09090327123743679,
"grad_norm": 57.33565139770508,
"learning_rate": 1.8157389635316697e-06,
"loss": 0.6001,
"num_input_tokens_seen": 4481920,
"step": 1420
},
{
"epoch": 0.09122335317841368,
"grad_norm": 35.38874053955078,
"learning_rate": 1.82213691618682e-06,
"loss": 0.5666,
"num_input_tokens_seen": 4498112,
"step": 1425
},
{
"epoch": 0.09154343511939056,
"grad_norm": 33.644596099853516,
"learning_rate": 1.8285348688419704e-06,
"loss": 0.4314,
"num_input_tokens_seen": 4515648,
"step": 1430
},
{
"epoch": 0.09186351706036745,
"grad_norm": 37.955787658691406,
"learning_rate": 1.8349328214971208e-06,
"loss": 0.5809,
"num_input_tokens_seen": 4531840,
"step": 1435
},
{
"epoch": 0.09218359900134435,
"grad_norm": 49.74119186401367,
"learning_rate": 1.8413307741522712e-06,
"loss": 0.5463,
"num_input_tokens_seen": 4547456,
"step": 1440
},
{
"epoch": 0.09250368094232124,
"grad_norm": 41.80632019042969,
"learning_rate": 1.8477287268074216e-06,
"loss": 0.5548,
"num_input_tokens_seen": 4563328,
"step": 1445
},
{
"epoch": 0.09282376288329812,
"grad_norm": 29.985979080200195,
"learning_rate": 1.8541266794625718e-06,
"loss": 0.5856,
"num_input_tokens_seen": 4579392,
"step": 1450
},
{
"epoch": 0.09314384482427501,
"grad_norm": 82.66356658935547,
"learning_rate": 1.8605246321177221e-06,
"loss": 0.6159,
"num_input_tokens_seen": 4595584,
"step": 1455
},
{
"epoch": 0.0934639267652519,
"grad_norm": 35.51491165161133,
"learning_rate": 1.8669225847728727e-06,
"loss": 0.5707,
"num_input_tokens_seen": 4610112,
"step": 1460
},
{
"epoch": 0.0937840087062288,
"grad_norm": 56.392459869384766,
"learning_rate": 1.8733205374280229e-06,
"loss": 0.7073,
"num_input_tokens_seen": 4626432,
"step": 1465
},
{
"epoch": 0.09410409064720568,
"grad_norm": 40.24674606323242,
"learning_rate": 1.8797184900831733e-06,
"loss": 0.5799,
"num_input_tokens_seen": 4641792,
"step": 1470
},
{
"epoch": 0.09442417258818257,
"grad_norm": 19.75339698791504,
"learning_rate": 1.8861164427383236e-06,
"loss": 0.4144,
"num_input_tokens_seen": 4656896,
"step": 1475
},
{
"epoch": 0.09474425452915947,
"grad_norm": 44.9459342956543,
"learning_rate": 1.8925143953934738e-06,
"loss": 0.6021,
"num_input_tokens_seen": 4673472,
"step": 1480
},
{
"epoch": 0.09506433647013636,
"grad_norm": 26.943050384521484,
"learning_rate": 1.8989123480486244e-06,
"loss": 0.4252,
"num_input_tokens_seen": 4688896,
"step": 1485
},
{
"epoch": 0.09538441841111324,
"grad_norm": 44.868587493896484,
"learning_rate": 1.9053103007037748e-06,
"loss": 0.6809,
"num_input_tokens_seen": 4704576,
"step": 1490
},
{
"epoch": 0.09570450035209013,
"grad_norm": 39.3725471496582,
"learning_rate": 1.911708253358925e-06,
"loss": 0.6032,
"num_input_tokens_seen": 4719040,
"step": 1495
},
{
"epoch": 0.09602458229306703,
"grad_norm": 30.540884017944336,
"learning_rate": 1.9181062060140753e-06,
"loss": 0.4781,
"num_input_tokens_seen": 4733696,
"step": 1500
},
{
"epoch": 0.09634466423404392,
"grad_norm": 41.38250732421875,
"learning_rate": 1.9245041586692255e-06,
"loss": 0.6128,
"num_input_tokens_seen": 4748992,
"step": 1505
},
{
"epoch": 0.0966647461750208,
"grad_norm": 54.247108459472656,
"learning_rate": 1.930902111324376e-06,
"loss": 0.6961,
"num_input_tokens_seen": 4764992,
"step": 1510
},
{
"epoch": 0.09698482811599769,
"grad_norm": 48.3005256652832,
"learning_rate": 1.9373000639795267e-06,
"loss": 0.6847,
"num_input_tokens_seen": 4780352,
"step": 1515
},
{
"epoch": 0.09730491005697459,
"grad_norm": 32.71445846557617,
"learning_rate": 1.943698016634677e-06,
"loss": 0.5486,
"num_input_tokens_seen": 4796224,
"step": 1520
},
{
"epoch": 0.09762499199795148,
"grad_norm": 31.726274490356445,
"learning_rate": 1.950095969289827e-06,
"loss": 0.5423,
"num_input_tokens_seen": 4811840,
"step": 1525
},
{
"epoch": 0.09794507393892836,
"grad_norm": 27.152061462402344,
"learning_rate": 1.9564939219449776e-06,
"loss": 0.5833,
"num_input_tokens_seen": 4826432,
"step": 1530
},
{
"epoch": 0.09826515587990525,
"grad_norm": 30.344701766967773,
"learning_rate": 1.9628918746001278e-06,
"loss": 0.3899,
"num_input_tokens_seen": 4841920,
"step": 1535
},
{
"epoch": 0.09858523782088215,
"grad_norm": 31.89874267578125,
"learning_rate": 1.9692898272552783e-06,
"loss": 0.5976,
"num_input_tokens_seen": 4857536,
"step": 1540
},
{
"epoch": 0.09890531976185904,
"grad_norm": 59.150508880615234,
"learning_rate": 1.9756877799104285e-06,
"loss": 0.6862,
"num_input_tokens_seen": 4873408,
"step": 1545
},
{
"epoch": 0.09922540170283592,
"grad_norm": 44.73534393310547,
"learning_rate": 1.9820857325655787e-06,
"loss": 0.5992,
"num_input_tokens_seen": 4889536,
"step": 1550
},
{
"epoch": 0.09954548364381281,
"grad_norm": 59.36629104614258,
"learning_rate": 1.9884836852207293e-06,
"loss": 0.6222,
"num_input_tokens_seen": 4904448,
"step": 1555
},
{
"epoch": 0.0998655655847897,
"grad_norm": 27.847911834716797,
"learning_rate": 1.99488163787588e-06,
"loss": 0.6538,
"num_input_tokens_seen": 4919616,
"step": 1560
},
{
"epoch": 0.10012163113757122,
"eval_loss": 0.5419119000434875,
"eval_runtime": 50.629,
"eval_samples_per_second": 274.269,
"eval_steps_per_second": 34.289,
"num_input_tokens_seen": 4931328,
"step": 1564
},
{
"epoch": 0.1001856475257666,
"grad_norm": 31.496103286743164,
"learning_rate": 1.9999999750297625e-06,
"loss": 0.5142,
"num_input_tokens_seen": 4934144,
"step": 1565
},
{
"epoch": 0.1005057294667435,
"grad_norm": 33.73641586303711,
"learning_rate": 1.9999991010715873e-06,
"loss": 0.5243,
"num_input_tokens_seen": 4950272,
"step": 1570
},
{
"epoch": 0.10082581140772037,
"grad_norm": 45.70293045043945,
"learning_rate": 1.999996978602793e-06,
"loss": 0.5354,
"num_input_tokens_seen": 4965056,
"step": 1575
},
{
"epoch": 0.10114589334869727,
"grad_norm": 35.1424560546875,
"learning_rate": 1.99999360762603e-06,
"loss": 0.5617,
"num_input_tokens_seen": 4980160,
"step": 1580
},
{
"epoch": 0.10146597528967416,
"grad_norm": 37.27573776245117,
"learning_rate": 1.9999889881455065e-06,
"loss": 0.4574,
"num_input_tokens_seen": 4996992,
"step": 1585
},
{
"epoch": 0.10178605723065105,
"grad_norm": 36.42082977294922,
"learning_rate": 1.9999831201669897e-06,
"loss": 0.5212,
"num_input_tokens_seen": 5012608,
"step": 1590
},
{
"epoch": 0.10210613917162793,
"grad_norm": 40.335140228271484,
"learning_rate": 1.9999760036978067e-06,
"loss": 0.4917,
"num_input_tokens_seen": 5027840,
"step": 1595
},
{
"epoch": 0.10242622111260483,
"grad_norm": 35.37378692626953,
"learning_rate": 1.9999676387468417e-06,
"loss": 0.5698,
"num_input_tokens_seen": 5042752,
"step": 1600
},
{
"epoch": 0.10274630305358172,
"grad_norm": 28.116477966308594,
"learning_rate": 1.999958025324539e-06,
"loss": 0.5443,
"num_input_tokens_seen": 5058624,
"step": 1605
},
{
"epoch": 0.10306638499455861,
"grad_norm": 37.361881256103516,
"learning_rate": 1.999947163442901e-06,
"loss": 0.6261,
"num_input_tokens_seen": 5075008,
"step": 1610
},
{
"epoch": 0.10338646693553549,
"grad_norm": 51.469268798828125,
"learning_rate": 1.9999350531154884e-06,
"loss": 0.5363,
"num_input_tokens_seen": 5090880,
"step": 1615
},
{
"epoch": 0.10370654887651239,
"grad_norm": 51.33501052856445,
"learning_rate": 1.9999216943574223e-06,
"loss": 0.5701,
"num_input_tokens_seen": 5106816,
"step": 1620
},
{
"epoch": 0.10402663081748928,
"grad_norm": 25.08717155456543,
"learning_rate": 1.9999070871853796e-06,
"loss": 0.463,
"num_input_tokens_seen": 5123904,
"step": 1625
},
{
"epoch": 0.10434671275846617,
"grad_norm": 39.659610748291016,
"learning_rate": 1.9998912316175986e-06,
"loss": 0.4856,
"num_input_tokens_seen": 5140160,
"step": 1630
},
{
"epoch": 0.10466679469944305,
"grad_norm": 45.963687896728516,
"learning_rate": 1.9998741276738752e-06,
"loss": 0.5123,
"num_input_tokens_seen": 5156288,
"step": 1635
},
{
"epoch": 0.10498687664041995,
"grad_norm": 39.845699310302734,
"learning_rate": 1.999855775375563e-06,
"loss": 0.5907,
"num_input_tokens_seen": 5171776,
"step": 1640
},
{
"epoch": 0.10530695858139684,
"grad_norm": 41.036468505859375,
"learning_rate": 1.999836174745576e-06,
"loss": 0.6812,
"num_input_tokens_seen": 5189504,
"step": 1645
},
{
"epoch": 0.10562704052237373,
"grad_norm": 46.018531799316406,
"learning_rate": 1.9998153258083853e-06,
"loss": 0.5825,
"num_input_tokens_seen": 5205056,
"step": 1650
},
{
"epoch": 0.10594712246335061,
"grad_norm": 39.028587341308594,
"learning_rate": 1.9997932285900214e-06,
"loss": 0.5911,
"num_input_tokens_seen": 5222656,
"step": 1655
},
{
"epoch": 0.1062672044043275,
"grad_norm": 54.98348617553711,
"learning_rate": 1.9997698831180726e-06,
"loss": 0.6352,
"num_input_tokens_seen": 5238848,
"step": 1660
},
{
"epoch": 0.1065872863453044,
"grad_norm": 36.60569381713867,
"learning_rate": 1.999745289421686e-06,
"loss": 0.5226,
"num_input_tokens_seen": 5255296,
"step": 1665
},
{
"epoch": 0.10690736828628129,
"grad_norm": 59.835819244384766,
"learning_rate": 1.9997194475315674e-06,
"loss": 0.7595,
"num_input_tokens_seen": 5270336,
"step": 1670
},
{
"epoch": 0.10722745022725817,
"grad_norm": 37.14190673828125,
"learning_rate": 1.9996923574799808e-06,
"loss": 0.4864,
"num_input_tokens_seen": 5286720,
"step": 1675
},
{
"epoch": 0.10754753216823507,
"grad_norm": 34.92512893676758,
"learning_rate": 1.9996640193007476e-06,
"loss": 0.6553,
"num_input_tokens_seen": 5301632,
"step": 1680
},
{
"epoch": 0.10786761410921196,
"grad_norm": 50.379520416259766,
"learning_rate": 1.9996344330292495e-06,
"loss": 0.402,
"num_input_tokens_seen": 5316544,
"step": 1685
},
{
"epoch": 0.10818769605018885,
"grad_norm": 35.4121208190918,
"learning_rate": 1.9996035987024245e-06,
"loss": 0.5449,
"num_input_tokens_seen": 5332544,
"step": 1690
},
{
"epoch": 0.10850777799116573,
"grad_norm": 36.77679443359375,
"learning_rate": 1.99957151635877e-06,
"loss": 0.5498,
"num_input_tokens_seen": 5348096,
"step": 1695
},
{
"epoch": 0.10882785993214263,
"grad_norm": 51.19884490966797,
"learning_rate": 1.999538186038341e-06,
"loss": 0.6298,
"num_input_tokens_seen": 5362368,
"step": 1700
},
{
"epoch": 0.10914794187311952,
"grad_norm": 34.20772171020508,
"learning_rate": 1.999503607782751e-06,
"loss": 0.5357,
"num_input_tokens_seen": 5378176,
"step": 1705
},
{
"epoch": 0.10946802381409641,
"grad_norm": 45.92792510986328,
"learning_rate": 1.999467781635171e-06,
"loss": 0.5219,
"num_input_tokens_seen": 5394752,
"step": 1710
},
{
"epoch": 0.10978810575507329,
"grad_norm": 33.12445068359375,
"learning_rate": 1.9994307076403306e-06,
"loss": 0.7002,
"num_input_tokens_seen": 5412160,
"step": 1715
},
{
"epoch": 0.11010818769605019,
"grad_norm": 37.13945388793945,
"learning_rate": 1.999392385844517e-06,
"loss": 0.5297,
"num_input_tokens_seen": 5427840,
"step": 1720
},
{
"epoch": 0.11042826963702708,
"grad_norm": 37.877384185791016,
"learning_rate": 1.9993528162955753e-06,
"loss": 0.4006,
"num_input_tokens_seen": 5444224,
"step": 1725
},
{
"epoch": 0.11074835157800397,
"grad_norm": 63.48334503173828,
"learning_rate": 1.9993119990429095e-06,
"loss": 0.5775,
"num_input_tokens_seen": 5459648,
"step": 1730
},
{
"epoch": 0.11106843351898085,
"grad_norm": 49.40863800048828,
"learning_rate": 1.9992699341374794e-06,
"loss": 0.7845,
"num_input_tokens_seen": 5475008,
"step": 1735
},
{
"epoch": 0.11138851545995775,
"grad_norm": 33.796592712402344,
"learning_rate": 1.9992266216318033e-06,
"loss": 0.533,
"num_input_tokens_seen": 5491456,
"step": 1740
},
{
"epoch": 0.11170859740093464,
"grad_norm": 37.70038986206055,
"learning_rate": 1.9991820615799583e-06,
"loss": 0.5745,
"num_input_tokens_seen": 5507520,
"step": 1745
},
{
"epoch": 0.11202867934191153,
"grad_norm": 51.507301330566406,
"learning_rate": 1.999136254037578e-06,
"loss": 0.6964,
"num_input_tokens_seen": 5523072,
"step": 1750
},
{
"epoch": 0.11234876128288843,
"grad_norm": 38.705711364746094,
"learning_rate": 1.999089199061853e-06,
"loss": 0.5134,
"num_input_tokens_seen": 5538304,
"step": 1755
},
{
"epoch": 0.1126688432238653,
"grad_norm": 38.11091995239258,
"learning_rate": 1.9990408967115326e-06,
"loss": 0.4639,
"num_input_tokens_seen": 5553920,
"step": 1760
},
{
"epoch": 0.1129889251648422,
"grad_norm": 26.92587661743164,
"learning_rate": 1.998991347046922e-06,
"loss": 0.4624,
"num_input_tokens_seen": 5569344,
"step": 1765
},
{
"epoch": 0.11330900710581909,
"grad_norm": 34.729129791259766,
"learning_rate": 1.9989405501298857e-06,
"loss": 0.5057,
"num_input_tokens_seen": 5585856,
"step": 1770
},
{
"epoch": 0.11362908904679599,
"grad_norm": 51.49436950683594,
"learning_rate": 1.9988885060238436e-06,
"loss": 0.5777,
"num_input_tokens_seen": 5603840,
"step": 1775
},
{
"epoch": 0.11394917098777287,
"grad_norm": 26.645742416381836,
"learning_rate": 1.9988352147937735e-06,
"loss": 0.5185,
"num_input_tokens_seen": 5620352,
"step": 1780
},
{
"epoch": 0.11426925292874976,
"grad_norm": 47.49540328979492,
"learning_rate": 1.99878067650621e-06,
"loss": 0.5548,
"num_input_tokens_seen": 5636544,
"step": 1785
},
{
"epoch": 0.11458933486972665,
"grad_norm": 40.62596130371094,
"learning_rate": 1.998724891229245e-06,
"loss": 0.5416,
"num_input_tokens_seen": 5652672,
"step": 1790
},
{
"epoch": 0.11490941681070355,
"grad_norm": 43.69281005859375,
"learning_rate": 1.998667859032527e-06,
"loss": 0.5025,
"num_input_tokens_seen": 5668224,
"step": 1795
},
{
"epoch": 0.11522949875168043,
"grad_norm": 23.043895721435547,
"learning_rate": 1.9986095799872613e-06,
"loss": 0.4544,
"num_input_tokens_seen": 5684480,
"step": 1800
},
{
"epoch": 0.11554958069265732,
"grad_norm": 58.4548454284668,
"learning_rate": 1.99855005416621e-06,
"loss": 0.4475,
"num_input_tokens_seen": 5700864,
"step": 1805
},
{
"epoch": 0.11586966263363421,
"grad_norm": 43.189369201660156,
"learning_rate": 1.998489281643692e-06,
"loss": 0.6003,
"num_input_tokens_seen": 5716224,
"step": 1810
},
{
"epoch": 0.1161897445746111,
"grad_norm": 29.6669864654541,
"learning_rate": 1.998427262495582e-06,
"loss": 0.4876,
"num_input_tokens_seen": 5733056,
"step": 1815
},
{
"epoch": 0.11650982651558799,
"grad_norm": 37.5609016418457,
"learning_rate": 1.9983639967993124e-06,
"loss": 0.6507,
"num_input_tokens_seen": 5749120,
"step": 1820
},
{
"epoch": 0.11682990845656488,
"grad_norm": 34.187103271484375,
"learning_rate": 1.99829948463387e-06,
"loss": 0.7451,
"num_input_tokens_seen": 5763968,
"step": 1825
},
{
"epoch": 0.11714999039754177,
"grad_norm": 33.58884048461914,
"learning_rate": 1.9982337260798e-06,
"loss": 0.5556,
"num_input_tokens_seen": 5779520,
"step": 1830
},
{
"epoch": 0.11747007233851867,
"grad_norm": 37.880897521972656,
"learning_rate": 1.998166721219203e-06,
"loss": 0.5874,
"num_input_tokens_seen": 5798848,
"step": 1835
},
{
"epoch": 0.11779015427949555,
"grad_norm": 30.32021141052246,
"learning_rate": 1.9980984701357338e-06,
"loss": 0.5069,
"num_input_tokens_seen": 5813952,
"step": 1840
},
{
"epoch": 0.11811023622047244,
"grad_norm": 37.01994705200195,
"learning_rate": 1.998028972914606e-06,
"loss": 0.4306,
"num_input_tokens_seen": 5830016,
"step": 1845
},
{
"epoch": 0.11843031816144933,
"grad_norm": 40.025062561035156,
"learning_rate": 1.9979582296425877e-06,
"loss": 0.5965,
"num_input_tokens_seen": 5845312,
"step": 1850
},
{
"epoch": 0.11875040010242623,
"grad_norm": 22.668283462524414,
"learning_rate": 1.9978862404080022e-06,
"loss": 0.5894,
"num_input_tokens_seen": 5860672,
"step": 1855
},
{
"epoch": 0.1190704820434031,
"grad_norm": 37.38002014160156,
"learning_rate": 1.9978130053007295e-06,
"loss": 0.5369,
"num_input_tokens_seen": 5875776,
"step": 1860
},
{
"epoch": 0.11939056398438,
"grad_norm": 39.782684326171875,
"learning_rate": 1.9977385244122034e-06,
"loss": 0.4361,
"num_input_tokens_seen": 5891200,
"step": 1865
},
{
"epoch": 0.11971064592535689,
"grad_norm": 40.374305725097656,
"learning_rate": 1.997662797835415e-06,
"loss": 0.4922,
"num_input_tokens_seen": 5907008,
"step": 1870
},
{
"epoch": 0.12003072786633379,
"grad_norm": 35.58638000488281,
"learning_rate": 1.9975858256649097e-06,
"loss": 0.4561,
"num_input_tokens_seen": 5923264,
"step": 1875
},
{
"epoch": 0.12035080980731067,
"grad_norm": 52.00196075439453,
"learning_rate": 1.997507607996788e-06,
"loss": 0.4952,
"num_input_tokens_seen": 5939648,
"step": 1880
},
{
"epoch": 0.12067089174828756,
"grad_norm": 27.237184524536133,
"learning_rate": 1.997428144928706e-06,
"loss": 0.4576,
"num_input_tokens_seen": 5955520,
"step": 1885
},
{
"epoch": 0.12099097368926445,
"grad_norm": 43.06745147705078,
"learning_rate": 1.9973474365598736e-06,
"loss": 0.5277,
"num_input_tokens_seen": 5971072,
"step": 1890
},
{
"epoch": 0.12131105563024135,
"grad_norm": 40.0740966796875,
"learning_rate": 1.9972654829910568e-06,
"loss": 0.5794,
"num_input_tokens_seen": 5987264,
"step": 1895
},
{
"epoch": 0.12163113757121823,
"grad_norm": 55.42530059814453,
"learning_rate": 1.9971822843245748e-06,
"loss": 0.6246,
"num_input_tokens_seen": 6002880,
"step": 1900
},
{
"epoch": 0.12195121951219512,
"grad_norm": 41.884239196777344,
"learning_rate": 1.997097840664303e-06,
"loss": 0.5281,
"num_input_tokens_seen": 6019520,
"step": 1905
},
{
"epoch": 0.12227130145317201,
"grad_norm": 63.81690216064453,
"learning_rate": 1.99701215211567e-06,
"loss": 0.5722,
"num_input_tokens_seen": 6035904,
"step": 1910
},
{
"epoch": 0.1225913833941489,
"grad_norm": 33.150779724121094,
"learning_rate": 1.9969252187856587e-06,
"loss": 0.6162,
"num_input_tokens_seen": 6050816,
"step": 1915
},
{
"epoch": 0.12291146533512579,
"grad_norm": 27.57270622253418,
"learning_rate": 1.9968370407828065e-06,
"loss": 0.414,
"num_input_tokens_seen": 6065920,
"step": 1920
},
{
"epoch": 0.12323154727610268,
"grad_norm": 26.693384170532227,
"learning_rate": 1.996747618217205e-06,
"loss": 0.5995,
"num_input_tokens_seen": 6081728,
"step": 1925
},
{
"epoch": 0.12355162921707957,
"grad_norm": 29.05069351196289,
"learning_rate": 1.9966569512004987e-06,
"loss": 0.492,
"num_input_tokens_seen": 6097472,
"step": 1930
},
{
"epoch": 0.12387171115805647,
"grad_norm": 33.252803802490234,
"learning_rate": 1.996565039845887e-06,
"loss": 0.5079,
"num_input_tokens_seen": 6113152,
"step": 1935
},
{
"epoch": 0.12419179309903335,
"grad_norm": 49.879119873046875,
"learning_rate": 1.996471884268122e-06,
"loss": 0.6364,
"num_input_tokens_seen": 6129408,
"step": 1940
},
{
"epoch": 0.12451187504001024,
"grad_norm": 21.359004974365234,
"learning_rate": 1.9963774845835097e-06,
"loss": 0.5506,
"num_input_tokens_seen": 6144896,
"step": 1945
},
{
"epoch": 0.12483195698098713,
"grad_norm": 49.929439544677734,
"learning_rate": 1.996281840909909e-06,
"loss": 0.5895,
"num_input_tokens_seen": 6160256,
"step": 1950
},
{
"epoch": 0.12515203892196403,
"grad_norm": 38.942405700683594,
"learning_rate": 1.9961849533667322e-06,
"loss": 0.6389,
"num_input_tokens_seen": 6175104,
"step": 1955
},
{
"epoch": 0.1254721208629409,
"grad_norm": 31.281375885009766,
"learning_rate": 1.9960868220749447e-06,
"loss": 0.5267,
"num_input_tokens_seen": 6190272,
"step": 1960
},
{
"epoch": 0.1257922028039178,
"grad_norm": 40.56554412841797,
"learning_rate": 1.9959874471570644e-06,
"loss": 0.5836,
"num_input_tokens_seen": 6205952,
"step": 1965
},
{
"epoch": 0.1261122847448947,
"grad_norm": 46.20263671875,
"learning_rate": 1.9958868287371625e-06,
"loss": 0.5619,
"num_input_tokens_seen": 6222592,
"step": 1970
},
{
"epoch": 0.12643236668587157,
"grad_norm": 38.54600524902344,
"learning_rate": 1.9957849669408617e-06,
"loss": 0.4804,
"num_input_tokens_seen": 6237696,
"step": 1975
},
{
"epoch": 0.12675244862684848,
"grad_norm": 54.95522689819336,
"learning_rate": 1.995681861895338e-06,
"loss": 0.4947,
"num_input_tokens_seen": 6254080,
"step": 1980
},
{
"epoch": 0.12707253056782536,
"grad_norm": 28.571189880371094,
"learning_rate": 1.9955775137293187e-06,
"loss": 0.5828,
"num_input_tokens_seen": 6270016,
"step": 1985
},
{
"epoch": 0.12739261250880227,
"grad_norm": 52.56492233276367,
"learning_rate": 1.9954719225730845e-06,
"loss": 0.6161,
"num_input_tokens_seen": 6285184,
"step": 1990
},
{
"epoch": 0.12771269444977915,
"grad_norm": 50.46998596191406,
"learning_rate": 1.9953650885584666e-06,
"loss": 0.4833,
"num_input_tokens_seen": 6300992,
"step": 1995
},
{
"epoch": 0.12803277639075603,
"grad_norm": 17.092538833618164,
"learning_rate": 1.995257011818849e-06,
"loss": 0.5462,
"num_input_tokens_seen": 6315392,
"step": 2000
},
{
"epoch": 0.12835285833173293,
"grad_norm": 37.80610656738281,
"learning_rate": 1.9951476924891666e-06,
"loss": 0.4676,
"num_input_tokens_seen": 6331136,
"step": 2005
},
{
"epoch": 0.1286729402727098,
"grad_norm": 34.72353744506836,
"learning_rate": 1.9950371307059056e-06,
"loss": 0.5551,
"num_input_tokens_seen": 6347584,
"step": 2010
},
{
"epoch": 0.1289930222136867,
"grad_norm": 56.483497619628906,
"learning_rate": 1.9949253266071036e-06,
"loss": 0.5584,
"num_input_tokens_seen": 6362560,
"step": 2015
},
{
"epoch": 0.1293131041546636,
"grad_norm": 30.35417938232422,
"learning_rate": 1.9948122803323503e-06,
"loss": 0.5131,
"num_input_tokens_seen": 6378304,
"step": 2020
},
{
"epoch": 0.12963318609564048,
"grad_norm": 50.93225860595703,
"learning_rate": 1.9946979920227844e-06,
"loss": 0.5125,
"num_input_tokens_seen": 6393280,
"step": 2025
},
{
"epoch": 0.1299532680366174,
"grad_norm": 61.17381286621094,
"learning_rate": 1.994582461821096e-06,
"loss": 0.5188,
"num_input_tokens_seen": 6409472,
"step": 2030
},
{
"epoch": 0.13027334997759427,
"grad_norm": 70.97465515136719,
"learning_rate": 1.9944656898715267e-06,
"loss": 0.7149,
"num_input_tokens_seen": 6424960,
"step": 2035
},
{
"epoch": 0.13059343191857115,
"grad_norm": 33.06205368041992,
"learning_rate": 1.994347676319867e-06,
"loss": 0.6082,
"num_input_tokens_seen": 6440000,
"step": 2040
},
{
"epoch": 0.13091351385954805,
"grad_norm": 26.475330352783203,
"learning_rate": 1.994228421313459e-06,
"loss": 0.4607,
"num_input_tokens_seen": 6457600,
"step": 2045
},
{
"epoch": 0.13123359580052493,
"grad_norm": 41.18611526489258,
"learning_rate": 1.994107925001193e-06,
"loss": 0.5187,
"num_input_tokens_seen": 6473088,
"step": 2050
},
{
"epoch": 0.1315536777415018,
"grad_norm": 50.487796783447266,
"learning_rate": 1.9939861875335108e-06,
"loss": 0.595,
"num_input_tokens_seen": 6487680,
"step": 2055
},
{
"epoch": 0.13187375968247872,
"grad_norm": 37.29991912841797,
"learning_rate": 1.9938632090624025e-06,
"loss": 0.4909,
"num_input_tokens_seen": 6503296,
"step": 2060
},
{
"epoch": 0.1321938416234556,
"grad_norm": 15.12756061553955,
"learning_rate": 1.9937389897414087e-06,
"loss": 0.5368,
"num_input_tokens_seen": 6518912,
"step": 2065
},
{
"epoch": 0.1325139235644325,
"grad_norm": 43.301517486572266,
"learning_rate": 1.993613529725618e-06,
"loss": 0.5642,
"num_input_tokens_seen": 6534784,
"step": 2070
},
{
"epoch": 0.13283400550540939,
"grad_norm": 52.800323486328125,
"learning_rate": 1.99348682917167e-06,
"loss": 0.5303,
"num_input_tokens_seen": 6550528,
"step": 2075
},
{
"epoch": 0.13315408744638627,
"grad_norm": 36.16381072998047,
"learning_rate": 1.99335888823775e-06,
"loss": 0.5475,
"num_input_tokens_seen": 6566144,
"step": 2080
},
{
"epoch": 0.13347416938736317,
"grad_norm": 52.74684143066406,
"learning_rate": 1.993229707083595e-06,
"loss": 0.5654,
"num_input_tokens_seen": 6583872,
"step": 2085
},
{
"epoch": 0.13379425132834005,
"grad_norm": 21.61884307861328,
"learning_rate": 1.993099285870489e-06,
"loss": 0.4165,
"num_input_tokens_seen": 6602304,
"step": 2090
},
{
"epoch": 0.13411433326931693,
"grad_norm": 38.61161804199219,
"learning_rate": 1.992967624761264e-06,
"loss": 0.462,
"num_input_tokens_seen": 6618112,
"step": 2095
},
{
"epoch": 0.13443441521029384,
"grad_norm": 48.21979522705078,
"learning_rate": 1.9928347239203014e-06,
"loss": 0.6239,
"num_input_tokens_seen": 6635584,
"step": 2100
},
{
"epoch": 0.13475449715127072,
"grad_norm": 34.77821731567383,
"learning_rate": 1.9927005835135282e-06,
"loss": 0.5283,
"num_input_tokens_seen": 6653568,
"step": 2105
},
{
"epoch": 0.13507457909224763,
"grad_norm": 30.292238235473633,
"learning_rate": 1.9925652037084214e-06,
"loss": 0.4596,
"num_input_tokens_seen": 6668864,
"step": 2110
},
{
"epoch": 0.1353946610332245,
"grad_norm": 27.13306427001953,
"learning_rate": 1.9924285846740037e-06,
"loss": 0.4838,
"num_input_tokens_seen": 6684416,
"step": 2115
},
{
"epoch": 0.13571474297420139,
"grad_norm": 50.00841522216797,
"learning_rate": 1.9922907265808452e-06,
"loss": 0.5948,
"num_input_tokens_seen": 6699392,
"step": 2120
},
{
"epoch": 0.1360348249151783,
"grad_norm": 45.320167541503906,
"learning_rate": 1.9921516296010643e-06,
"loss": 0.544,
"num_input_tokens_seen": 6714560,
"step": 2125
},
{
"epoch": 0.13635490685615517,
"grad_norm": 50.58386993408203,
"learning_rate": 1.9920112939083246e-06,
"loss": 0.5678,
"num_input_tokens_seen": 6729920,
"step": 2130
},
{
"epoch": 0.13667498879713205,
"grad_norm": 27.673641204833984,
"learning_rate": 1.9918697196778367e-06,
"loss": 0.5607,
"num_input_tokens_seen": 6744768,
"step": 2135
},
{
"epoch": 0.13699507073810896,
"grad_norm": 27.846073150634766,
"learning_rate": 1.9917269070863578e-06,
"loss": 0.4531,
"num_input_tokens_seen": 6759680,
"step": 2140
},
{
"epoch": 0.13731515267908584,
"grad_norm": 36.35385513305664,
"learning_rate": 1.9915828563121915e-06,
"loss": 0.5091,
"num_input_tokens_seen": 6775168,
"step": 2145
},
{
"epoch": 0.13763523462006275,
"grad_norm": 43.63134765625,
"learning_rate": 1.9914375675351865e-06,
"loss": 0.5144,
"num_input_tokens_seen": 6791296,
"step": 2150
},
{
"epoch": 0.13795531656103963,
"grad_norm": 19.44449806213379,
"learning_rate": 1.991291040936738e-06,
"loss": 0.4326,
"num_input_tokens_seen": 6808640,
"step": 2155
},
{
"epoch": 0.1382753985020165,
"grad_norm": 43.09555435180664,
"learning_rate": 1.9911432766997857e-06,
"loss": 0.6764,
"num_input_tokens_seen": 6824064,
"step": 2160
},
{
"epoch": 0.1385954804429934,
"grad_norm": 59.859764099121094,
"learning_rate": 1.990994275008815e-06,
"loss": 0.455,
"num_input_tokens_seen": 6839872,
"step": 2165
},
{
"epoch": 0.1389155623839703,
"grad_norm": 62.95064926147461,
"learning_rate": 1.9908440360498565e-06,
"loss": 0.515,
"num_input_tokens_seen": 6855744,
"step": 2170
},
{
"epoch": 0.1392356443249472,
"grad_norm": 46.814388275146484,
"learning_rate": 1.990692560010485e-06,
"loss": 0.5589,
"num_input_tokens_seen": 6869632,
"step": 2175
},
{
"epoch": 0.13955572626592408,
"grad_norm": 30.18223762512207,
"learning_rate": 1.9905398470798206e-06,
"loss": 0.4574,
"num_input_tokens_seen": 6885696,
"step": 2180
},
{
"epoch": 0.13987580820690096,
"grad_norm": 29.679075241088867,
"learning_rate": 1.990385897448527e-06,
"loss": 0.37,
"num_input_tokens_seen": 6901504,
"step": 2185
},
{
"epoch": 0.14019589014787787,
"grad_norm": 37.57693862915039,
"learning_rate": 1.9902307113088114e-06,
"loss": 0.5817,
"num_input_tokens_seen": 6916480,
"step": 2190
},
{
"epoch": 0.14051597208885475,
"grad_norm": 50.049583435058594,
"learning_rate": 1.9900742888544264e-06,
"loss": 0.4882,
"num_input_tokens_seen": 6932416,
"step": 2195
},
{
"epoch": 0.14083605402983163,
"grad_norm": 46.006839752197266,
"learning_rate": 1.989916630280667e-06,
"loss": 0.5338,
"num_input_tokens_seen": 6948992,
"step": 2200
},
{
"epoch": 0.14115613597080853,
"grad_norm": 55.06525802612305,
"learning_rate": 1.989757735784372e-06,
"loss": 0.464,
"num_input_tokens_seen": 6964416,
"step": 2205
},
{
"epoch": 0.1414762179117854,
"grad_norm": 43.949302673339844,
"learning_rate": 1.989597605563923e-06,
"loss": 0.4246,
"num_input_tokens_seen": 6980544,
"step": 2210
},
{
"epoch": 0.14179629985276232,
"grad_norm": 28.58378791809082,
"learning_rate": 1.9894362398192437e-06,
"loss": 0.5755,
"num_input_tokens_seen": 6997440,
"step": 2215
},
{
"epoch": 0.1421163817937392,
"grad_norm": 24.560964584350586,
"learning_rate": 1.9892736387518023e-06,
"loss": 0.4218,
"num_input_tokens_seen": 7012672,
"step": 2220
},
{
"epoch": 0.14243646373471608,
"grad_norm": 55.798553466796875,
"learning_rate": 1.9891098025646075e-06,
"loss": 0.4798,
"num_input_tokens_seen": 7027648,
"step": 2225
},
{
"epoch": 0.142756545675693,
"grad_norm": 29.567869186401367,
"learning_rate": 1.9889447314622105e-06,
"loss": 0.5266,
"num_input_tokens_seen": 7043200,
"step": 2230
},
{
"epoch": 0.14307662761666987,
"grad_norm": 43.9607048034668,
"learning_rate": 1.9887784256507046e-06,
"loss": 0.7416,
"num_input_tokens_seen": 7058688,
"step": 2235
},
{
"epoch": 0.14339670955764675,
"grad_norm": 33.17695999145508,
"learning_rate": 1.988610885337725e-06,
"loss": 0.6734,
"num_input_tokens_seen": 7074048,
"step": 2240
},
{
"epoch": 0.14371679149862365,
"grad_norm": 32.72926330566406,
"learning_rate": 1.9884421107324476e-06,
"loss": 0.5319,
"num_input_tokens_seen": 7089792,
"step": 2245
},
{
"epoch": 0.14403687343960053,
"grad_norm": 41.187984466552734,
"learning_rate": 1.9882721020455893e-06,
"loss": 0.4753,
"num_input_tokens_seen": 7104640,
"step": 2250
},
{
"epoch": 0.14435695538057744,
"grad_norm": 33.69738006591797,
"learning_rate": 1.988100859489408e-06,
"loss": 0.5137,
"num_input_tokens_seen": 7120064,
"step": 2255
},
{
"epoch": 0.14467703732155432,
"grad_norm": 34.818851470947266,
"learning_rate": 1.9879283832777017e-06,
"loss": 0.4839,
"num_input_tokens_seen": 7135232,
"step": 2260
},
{
"epoch": 0.1449971192625312,
"grad_norm": 52.21475601196289,
"learning_rate": 1.9877546736258096e-06,
"loss": 0.5247,
"num_input_tokens_seen": 7149632,
"step": 2265
},
{
"epoch": 0.1453172012035081,
"grad_norm": 39.324825286865234,
"learning_rate": 1.98757973075061e-06,
"loss": 0.4134,
"num_input_tokens_seen": 7164352,
"step": 2270
},
{
"epoch": 0.14563728314448499,
"grad_norm": 36.88801193237305,
"learning_rate": 1.987403554870521e-06,
"loss": 0.52,
"num_input_tokens_seen": 7179776,
"step": 2275
},
{
"epoch": 0.14595736508546187,
"grad_norm": 38.47246170043945,
"learning_rate": 1.9872261462055003e-06,
"loss": 0.423,
"num_input_tokens_seen": 7194240,
"step": 2280
},
{
"epoch": 0.14627744702643877,
"grad_norm": 23.939250946044922,
"learning_rate": 1.987047504977045e-06,
"loss": 0.4393,
"num_input_tokens_seen": 7209472,
"step": 2285
},
{
"epoch": 0.14659752896741565,
"grad_norm": 50.50169372558594,
"learning_rate": 1.9868676314081902e-06,
"loss": 0.4174,
"num_input_tokens_seen": 7225088,
"step": 2290
},
{
"epoch": 0.14691761090839256,
"grad_norm": 75.09852600097656,
"learning_rate": 1.9866865257235107e-06,
"loss": 0.6811,
"num_input_tokens_seen": 7240704,
"step": 2295
},
{
"epoch": 0.14723769284936944,
"grad_norm": 33.65947723388672,
"learning_rate": 1.9865041881491188e-06,
"loss": 0.4241,
"num_input_tokens_seen": 7256000,
"step": 2300
},
{
"epoch": 0.14755777479034632,
"grad_norm": 50.8767204284668,
"learning_rate": 1.9863206189126653e-06,
"loss": 0.6191,
"num_input_tokens_seen": 7270336,
"step": 2305
},
{
"epoch": 0.14787785673132323,
"grad_norm": 47.13086700439453,
"learning_rate": 1.9861358182433382e-06,
"loss": 0.5735,
"num_input_tokens_seen": 7285440,
"step": 2310
},
{
"epoch": 0.1481979386723001,
"grad_norm": 38.765995025634766,
"learning_rate": 1.9859497863718634e-06,
"loss": 0.4719,
"num_input_tokens_seen": 7301120,
"step": 2315
},
{
"epoch": 0.14851802061327699,
"grad_norm": 23.727924346923828,
"learning_rate": 1.985762523530504e-06,
"loss": 0.5315,
"num_input_tokens_seen": 7316416,
"step": 2320
},
{
"epoch": 0.1488381025542539,
"grad_norm": 28.33704948425293,
"learning_rate": 1.98557402995306e-06,
"loss": 0.4997,
"num_input_tokens_seen": 7332160,
"step": 2325
},
{
"epoch": 0.14915818449523077,
"grad_norm": 41.80880355834961,
"learning_rate": 1.985384305874868e-06,
"loss": 0.7101,
"num_input_tokens_seen": 7347776,
"step": 2330
},
{
"epoch": 0.14947826643620768,
"grad_norm": 37.426422119140625,
"learning_rate": 1.9851933515328e-06,
"loss": 0.5478,
"num_input_tokens_seen": 7363200,
"step": 2335
},
{
"epoch": 0.14979834837718456,
"grad_norm": 44.769901275634766,
"learning_rate": 1.985001167165265e-06,
"loss": 0.475,
"num_input_tokens_seen": 7378752,
"step": 2340
},
{
"epoch": 0.15011843031816144,
"grad_norm": 38.463008880615234,
"learning_rate": 1.984807753012208e-06,
"loss": 0.5239,
"num_input_tokens_seen": 7393984,
"step": 2345
},
{
"epoch": 0.15018244670635683,
"eval_loss": 0.5113906264305115,
"eval_runtime": 50.6224,
"eval_samples_per_second": 274.306,
"eval_steps_per_second": 34.293,
"num_input_tokens_seen": 7397056,
"step": 2346
},
{
"epoch": 0.15043851225913835,
"grad_norm": 27.22393798828125,
"learning_rate": 1.9846131093151086e-06,
"loss": 0.5882,
"num_input_tokens_seen": 7408832,
"step": 2350
},
{
"epoch": 0.15075859420011523,
"grad_norm": 17.012371063232422,
"learning_rate": 1.9844172363169808e-06,
"loss": 0.4612,
"num_input_tokens_seen": 7423040,
"step": 2355
},
{
"epoch": 0.15107867614109213,
"grad_norm": 54.97491455078125,
"learning_rate": 1.9842201342623756e-06,
"loss": 0.5148,
"num_input_tokens_seen": 7438464,
"step": 2360
},
{
"epoch": 0.151398758082069,
"grad_norm": 32.32542037963867,
"learning_rate": 1.9840218033973766e-06,
"loss": 0.5219,
"num_input_tokens_seen": 7453824,
"step": 2365
},
{
"epoch": 0.1517188400230459,
"grad_norm": 39.23529052734375,
"learning_rate": 1.9838222439696027e-06,
"loss": 0.5858,
"num_input_tokens_seen": 7469312,
"step": 2370
},
{
"epoch": 0.1520389219640228,
"grad_norm": 51.202392578125,
"learning_rate": 1.9836214562282058e-06,
"loss": 0.7034,
"num_input_tokens_seen": 7485120,
"step": 2375
},
{
"epoch": 0.15235900390499968,
"grad_norm": 38.26160430908203,
"learning_rate": 1.9834194404238715e-06,
"loss": 0.5189,
"num_input_tokens_seen": 7500416,
"step": 2380
},
{
"epoch": 0.15267908584597656,
"grad_norm": 40.620052337646484,
"learning_rate": 1.9832161968088193e-06,
"loss": 0.4149,
"num_input_tokens_seen": 7516672,
"step": 2385
},
{
"epoch": 0.15299916778695347,
"grad_norm": 54.49562454223633,
"learning_rate": 1.9830117256368015e-06,
"loss": 0.4703,
"num_input_tokens_seen": 7532800,
"step": 2390
},
{
"epoch": 0.15331924972793035,
"grad_norm": 39.00943374633789,
"learning_rate": 1.982806027163102e-06,
"loss": 0.4994,
"num_input_tokens_seen": 7547776,
"step": 2395
},
{
"epoch": 0.15363933166890725,
"grad_norm": 30.828948974609375,
"learning_rate": 1.9825991016445386e-06,
"loss": 0.5718,
"num_input_tokens_seen": 7562496,
"step": 2400
},
{
"epoch": 0.15395941360988413,
"grad_norm": 39.274105072021484,
"learning_rate": 1.9823909493394594e-06,
"loss": 0.5263,
"num_input_tokens_seen": 7577920,
"step": 2405
},
{
"epoch": 0.154279495550861,
"grad_norm": 43.676815032958984,
"learning_rate": 1.9821815705077455e-06,
"loss": 0.5373,
"num_input_tokens_seen": 7593216,
"step": 2410
},
{
"epoch": 0.15459957749183792,
"grad_norm": 65.43962860107422,
"learning_rate": 1.9819709654108087e-06,
"loss": 0.5752,
"num_input_tokens_seen": 7608192,
"step": 2415
},
{
"epoch": 0.1549196594328148,
"grad_norm": 44.96727752685547,
"learning_rate": 1.981759134311592e-06,
"loss": 0.4606,
"num_input_tokens_seen": 7624448,
"step": 2420
},
{
"epoch": 0.15523974137379168,
"grad_norm": 48.34320831298828,
"learning_rate": 1.981546077474569e-06,
"loss": 0.4839,
"num_input_tokens_seen": 7640192,
"step": 2425
},
{
"epoch": 0.15555982331476859,
"grad_norm": 35.434444427490234,
"learning_rate": 1.981331795165744e-06,
"loss": 0.534,
"num_input_tokens_seen": 7654848,
"step": 2430
},
{
"epoch": 0.15587990525574547,
"grad_norm": 64.55530548095703,
"learning_rate": 1.9811162876526498e-06,
"loss": 0.6053,
"num_input_tokens_seen": 7670848,
"step": 2435
},
{
"epoch": 0.15619998719672237,
"grad_norm": 30.858980178833008,
"learning_rate": 1.9808995552043515e-06,
"loss": 0.6575,
"num_input_tokens_seen": 7686016,
"step": 2440
},
{
"epoch": 0.15652006913769925,
"grad_norm": 36.265830993652344,
"learning_rate": 1.9806815980914413e-06,
"loss": 0.5662,
"num_input_tokens_seen": 7701760,
"step": 2445
},
{
"epoch": 0.15684015107867613,
"grad_norm": 35.759735107421875,
"learning_rate": 1.9804624165860417e-06,
"loss": 0.5736,
"num_input_tokens_seen": 7717760,
"step": 2450
},
{
"epoch": 0.15716023301965304,
"grad_norm": 18.90166473388672,
"learning_rate": 1.9802420109618028e-06,
"loss": 0.3894,
"num_input_tokens_seen": 7733376,
"step": 2455
},
{
"epoch": 0.15748031496062992,
"grad_norm": 16.299848556518555,
"learning_rate": 1.980020381493904e-06,
"loss": 0.503,
"num_input_tokens_seen": 7750464,
"step": 2460
},
{
"epoch": 0.1578003969016068,
"grad_norm": 39.69455337524414,
"learning_rate": 1.979797528459052e-06,
"loss": 0.5024,
"num_input_tokens_seen": 7768576,
"step": 2465
},
{
"epoch": 0.1581204788425837,
"grad_norm": 45.433387756347656,
"learning_rate": 1.979573452135482e-06,
"loss": 0.5285,
"num_input_tokens_seen": 7784256,
"step": 2470
},
{
"epoch": 0.15844056078356059,
"grad_norm": 33.314964294433594,
"learning_rate": 1.979348152802955e-06,
"loss": 0.3218,
"num_input_tokens_seen": 7799232,
"step": 2475
},
{
"epoch": 0.1587606427245375,
"grad_norm": 48.998722076416016,
"learning_rate": 1.979121630742761e-06,
"loss": 0.5854,
"num_input_tokens_seen": 7815040,
"step": 2480
},
{
"epoch": 0.15908072466551437,
"grad_norm": 19.081119537353516,
"learning_rate": 1.9788938862377146e-06,
"loss": 0.4547,
"num_input_tokens_seen": 7830400,
"step": 2485
},
{
"epoch": 0.15940080660649125,
"grad_norm": 32.0287971496582,
"learning_rate": 1.9786649195721577e-06,
"loss": 0.4803,
"num_input_tokens_seen": 7846336,
"step": 2490
},
{
"epoch": 0.15972088854746816,
"grad_norm": 41.866336822509766,
"learning_rate": 1.978434731031958e-06,
"loss": 0.6471,
"num_input_tokens_seen": 7862528,
"step": 2495
},
{
"epoch": 0.16004097048844504,
"grad_norm": 39.477691650390625,
"learning_rate": 1.9782033209045085e-06,
"loss": 0.4554,
"num_input_tokens_seen": 7880000,
"step": 2500
},
{
"epoch": 0.16036105242942192,
"grad_norm": 20.426006317138672,
"learning_rate": 1.977970689478727e-06,
"loss": 0.4114,
"num_input_tokens_seen": 7895296,
"step": 2505
},
{
"epoch": 0.16068113437039883,
"grad_norm": 58.49917984008789,
"learning_rate": 1.9777368370450577e-06,
"loss": 0.5963,
"num_input_tokens_seen": 7911104,
"step": 2510
},
{
"epoch": 0.1610012163113757,
"grad_norm": 33.156394958496094,
"learning_rate": 1.9775017638954674e-06,
"loss": 0.5129,
"num_input_tokens_seen": 7925952,
"step": 2515
},
{
"epoch": 0.1613212982523526,
"grad_norm": 33.88132858276367,
"learning_rate": 1.9772654703234476e-06,
"loss": 0.6004,
"num_input_tokens_seen": 7940928,
"step": 2520
},
{
"epoch": 0.1616413801933295,
"grad_norm": 44.398887634277344,
"learning_rate": 1.977027956624014e-06,
"loss": 0.5638,
"num_input_tokens_seen": 7955200,
"step": 2525
},
{
"epoch": 0.16196146213430637,
"grad_norm": 43.607967376708984,
"learning_rate": 1.9767892230937046e-06,
"loss": 0.5759,
"num_input_tokens_seen": 7970944,
"step": 2530
},
{
"epoch": 0.16228154407528328,
"grad_norm": 49.36827087402344,
"learning_rate": 1.976549270030581e-06,
"loss": 0.4305,
"num_input_tokens_seen": 7985856,
"step": 2535
},
{
"epoch": 0.16260162601626016,
"grad_norm": 41.47651290893555,
"learning_rate": 1.9763080977342286e-06,
"loss": 0.4789,
"num_input_tokens_seen": 8001088,
"step": 2540
},
{
"epoch": 0.16292170795723707,
"grad_norm": 47.50954818725586,
"learning_rate": 1.9760657065057527e-06,
"loss": 0.4995,
"num_input_tokens_seen": 8017856,
"step": 2545
},
{
"epoch": 0.16324178989821395,
"grad_norm": 42.19331359863281,
"learning_rate": 1.975822096647782e-06,
"loss": 0.4597,
"num_input_tokens_seen": 8033792,
"step": 2550
},
{
"epoch": 0.16356187183919083,
"grad_norm": 40.176029205322266,
"learning_rate": 1.975577268464466e-06,
"loss": 0.4952,
"num_input_tokens_seen": 8048256,
"step": 2555
},
{
"epoch": 0.16388195378016773,
"grad_norm": 31.933691024780273,
"learning_rate": 1.9753312222614765e-06,
"loss": 0.5653,
"num_input_tokens_seen": 8063680,
"step": 2560
},
{
"epoch": 0.1642020357211446,
"grad_norm": 55.901126861572266,
"learning_rate": 1.9750839583460036e-06,
"loss": 0.4827,
"num_input_tokens_seen": 8079744,
"step": 2565
},
{
"epoch": 0.1645221176621215,
"grad_norm": 34.241172790527344,
"learning_rate": 1.9748354770267603e-06,
"loss": 0.5034,
"num_input_tokens_seen": 8094656,
"step": 2570
},
{
"epoch": 0.1648421996030984,
"grad_norm": 26.009151458740234,
"learning_rate": 1.9745857786139777e-06,
"loss": 0.5117,
"num_input_tokens_seen": 8110528,
"step": 2575
},
{
"epoch": 0.16516228154407528,
"grad_norm": 49.35831832885742,
"learning_rate": 1.974334863419408e-06,
"loss": 0.6109,
"num_input_tokens_seen": 8126720,
"step": 2580
},
{
"epoch": 0.1654823634850522,
"grad_norm": 36.18735885620117,
"learning_rate": 1.9740827317563212e-06,
"loss": 0.5038,
"num_input_tokens_seen": 8141312,
"step": 2585
},
{
"epoch": 0.16580244542602907,
"grad_norm": 36.603824615478516,
"learning_rate": 1.973829383939507e-06,
"loss": 0.485,
"num_input_tokens_seen": 8156736,
"step": 2590
},
{
"epoch": 0.16612252736700595,
"grad_norm": 52.14276885986328,
"learning_rate": 1.973574820285273e-06,
"loss": 0.4978,
"num_input_tokens_seen": 8172480,
"step": 2595
},
{
"epoch": 0.16644260930798285,
"grad_norm": 41.253135681152344,
"learning_rate": 1.9733190411114443e-06,
"loss": 0.581,
"num_input_tokens_seen": 8188224,
"step": 2600
},
{
"epoch": 0.16676269124895973,
"grad_norm": 37.012882232666016,
"learning_rate": 1.9730620467373654e-06,
"loss": 0.4388,
"num_input_tokens_seen": 8204352,
"step": 2605
},
{
"epoch": 0.1670827731899366,
"grad_norm": 45.37546157836914,
"learning_rate": 1.9728038374838958e-06,
"loss": 0.5835,
"num_input_tokens_seen": 8219328,
"step": 2610
},
{
"epoch": 0.16740285513091352,
"grad_norm": 22.488475799560547,
"learning_rate": 1.972544413673413e-06,
"loss": 0.392,
"num_input_tokens_seen": 8234560,
"step": 2615
},
{
"epoch": 0.1677229370718904,
"grad_norm": 31.539594650268555,
"learning_rate": 1.9722837756298108e-06,
"loss": 0.5766,
"num_input_tokens_seen": 8249344,
"step": 2620
},
{
"epoch": 0.1680430190128673,
"grad_norm": 54.007774353027344,
"learning_rate": 1.972021923678499e-06,
"loss": 0.551,
"num_input_tokens_seen": 8265600,
"step": 2625
},
{
"epoch": 0.16836310095384419,
"grad_norm": 28.686309814453125,
"learning_rate": 1.971758858146403e-06,
"loss": 0.4822,
"num_input_tokens_seen": 8280384,
"step": 2630
},
{
"epoch": 0.16868318289482107,
"grad_norm": 44.47602081298828,
"learning_rate": 1.9714945793619626e-06,
"loss": 0.4916,
"num_input_tokens_seen": 8295744,
"step": 2635
},
{
"epoch": 0.16900326483579797,
"grad_norm": 28.91101837158203,
"learning_rate": 1.971229087655133e-06,
"loss": 0.52,
"num_input_tokens_seen": 8311680,
"step": 2640
},
{
"epoch": 0.16932334677677485,
"grad_norm": 31.473901748657227,
"learning_rate": 1.9709623833573842e-06,
"loss": 0.4659,
"num_input_tokens_seen": 8326592,
"step": 2645
},
{
"epoch": 0.16964342871775173,
"grad_norm": 47.58165740966797,
"learning_rate": 1.9706944668016994e-06,
"loss": 0.4454,
"num_input_tokens_seen": 8341632,
"step": 2650
},
{
"epoch": 0.16996351065872864,
"grad_norm": 38.02768325805664,
"learning_rate": 1.9704253383225756e-06,
"loss": 0.4643,
"num_input_tokens_seen": 8358400,
"step": 2655
},
{
"epoch": 0.17028359259970552,
"grad_norm": 34.26385498046875,
"learning_rate": 1.970154998256023e-06,
"loss": 0.4813,
"num_input_tokens_seen": 8374144,
"step": 2660
},
{
"epoch": 0.17060367454068243,
"grad_norm": 36.563358306884766,
"learning_rate": 1.9698834469395644e-06,
"loss": 0.4266,
"num_input_tokens_seen": 8389440,
"step": 2665
},
{
"epoch": 0.1709237564816593,
"grad_norm": 35.873085021972656,
"learning_rate": 1.969610684712234e-06,
"loss": 0.5565,
"num_input_tokens_seen": 8404672,
"step": 2670
},
{
"epoch": 0.17124383842263619,
"grad_norm": 67.11613464355469,
"learning_rate": 1.9693367119145794e-06,
"loss": 0.5696,
"num_input_tokens_seen": 8420096,
"step": 2675
},
{
"epoch": 0.1715639203636131,
"grad_norm": 42.11975860595703,
"learning_rate": 1.969061528888659e-06,
"loss": 0.6647,
"num_input_tokens_seen": 8436288,
"step": 2680
},
{
"epoch": 0.17188400230458997,
"grad_norm": 25.307558059692383,
"learning_rate": 1.9687851359780415e-06,
"loss": 0.549,
"num_input_tokens_seen": 8452672,
"step": 2685
},
{
"epoch": 0.17220408424556685,
"grad_norm": 21.267452239990234,
"learning_rate": 1.968507533527807e-06,
"loss": 0.4875,
"num_input_tokens_seen": 8469120,
"step": 2690
},
{
"epoch": 0.17252416618654376,
"grad_norm": 47.49785232543945,
"learning_rate": 1.9682287218845455e-06,
"loss": 0.4694,
"num_input_tokens_seen": 8484736,
"step": 2695
},
{
"epoch": 0.17284424812752064,
"grad_norm": 39.40044021606445,
"learning_rate": 1.967948701396356e-06,
"loss": 0.7448,
"num_input_tokens_seen": 8500480,
"step": 2700
},
{
"epoch": 0.17316433006849755,
"grad_norm": 29.45541763305664,
"learning_rate": 1.9676674724128485e-06,
"loss": 0.3988,
"num_input_tokens_seen": 8514624,
"step": 2705
},
{
"epoch": 0.17348441200947443,
"grad_norm": 21.1942081451416,
"learning_rate": 1.9673850352851397e-06,
"loss": 0.4666,
"num_input_tokens_seen": 8529664,
"step": 2710
},
{
"epoch": 0.1738044939504513,
"grad_norm": 30.817174911499023,
"learning_rate": 1.967101390365856e-06,
"loss": 0.5852,
"num_input_tokens_seen": 8545280,
"step": 2715
},
{
"epoch": 0.1741245758914282,
"grad_norm": 31.28725814819336,
"learning_rate": 1.966816538009131e-06,
"loss": 0.4975,
"num_input_tokens_seen": 8560384,
"step": 2720
},
{
"epoch": 0.1744446578324051,
"grad_norm": 42.42086410522461,
"learning_rate": 1.966530478570607e-06,
"loss": 0.538,
"num_input_tokens_seen": 8576960,
"step": 2725
},
{
"epoch": 0.174764739773382,
"grad_norm": 33.971405029296875,
"learning_rate": 1.9662432124074325e-06,
"loss": 0.4686,
"num_input_tokens_seen": 8592384,
"step": 2730
},
{
"epoch": 0.17508482171435888,
"grad_norm": 31.316307067871094,
"learning_rate": 1.965954739878262e-06,
"loss": 0.4889,
"num_input_tokens_seen": 8609024,
"step": 2735
},
{
"epoch": 0.17540490365533576,
"grad_norm": 50.005043029785156,
"learning_rate": 1.965665061343257e-06,
"loss": 0.4298,
"num_input_tokens_seen": 8624768,
"step": 2740
},
{
"epoch": 0.17572498559631267,
"grad_norm": 26.911272048950195,
"learning_rate": 1.965374177164085e-06,
"loss": 0.4643,
"num_input_tokens_seen": 8640448,
"step": 2745
},
{
"epoch": 0.17604506753728955,
"grad_norm": 29.490320205688477,
"learning_rate": 1.9650820877039182e-06,
"loss": 0.5569,
"num_input_tokens_seen": 8655296,
"step": 2750
},
{
"epoch": 0.17636514947826643,
"grad_norm": 74.6144790649414,
"learning_rate": 1.9647887933274334e-06,
"loss": 0.4903,
"num_input_tokens_seen": 8671872,
"step": 2755
},
{
"epoch": 0.17668523141924333,
"grad_norm": 23.339736938476562,
"learning_rate": 1.9644942944008124e-06,
"loss": 0.4835,
"num_input_tokens_seen": 8687680,
"step": 2760
},
{
"epoch": 0.1770053133602202,
"grad_norm": 56.54179763793945,
"learning_rate": 1.96419859129174e-06,
"loss": 0.6033,
"num_input_tokens_seen": 8702912,
"step": 2765
},
{
"epoch": 0.17732539530119712,
"grad_norm": 25.558734893798828,
"learning_rate": 1.963901684369406e-06,
"loss": 0.467,
"num_input_tokens_seen": 8718144,
"step": 2770
},
{
"epoch": 0.177645477242174,
"grad_norm": 39.40992736816406,
"learning_rate": 1.9636035740045013e-06,
"loss": 0.5107,
"num_input_tokens_seen": 8732992,
"step": 2775
},
{
"epoch": 0.17796555918315088,
"grad_norm": 25.630998611450195,
"learning_rate": 1.9633042605692207e-06,
"loss": 0.6129,
"num_input_tokens_seen": 8749056,
"step": 2780
},
{
"epoch": 0.17828564112412779,
"grad_norm": 24.820589065551758,
"learning_rate": 1.9630037444372597e-06,
"loss": 0.4943,
"num_input_tokens_seen": 8765184,
"step": 2785
},
{
"epoch": 0.17860572306510467,
"grad_norm": 45.313453674316406,
"learning_rate": 1.9627020259838177e-06,
"loss": 0.4163,
"num_input_tokens_seen": 8780480,
"step": 2790
},
{
"epoch": 0.17892580500608155,
"grad_norm": 33.22193908691406,
"learning_rate": 1.9623991055855925e-06,
"loss": 0.5605,
"num_input_tokens_seen": 8796352,
"step": 2795
},
{
"epoch": 0.17924588694705845,
"grad_norm": 28.097183227539062,
"learning_rate": 1.962094983620784e-06,
"loss": 0.4507,
"num_input_tokens_seen": 8810688,
"step": 2800
},
{
"epoch": 0.17956596888803533,
"grad_norm": 46.64733123779297,
"learning_rate": 1.9617896604690925e-06,
"loss": 0.4204,
"num_input_tokens_seen": 8826304,
"step": 2805
},
{
"epoch": 0.17988605082901224,
"grad_norm": 24.82090187072754,
"learning_rate": 1.961483136511717e-06,
"loss": 0.4545,
"num_input_tokens_seen": 8841344,
"step": 2810
},
{
"epoch": 0.18020613276998912,
"grad_norm": 49.74783706665039,
"learning_rate": 1.9611754121313567e-06,
"loss": 0.6135,
"num_input_tokens_seen": 8857664,
"step": 2815
},
{
"epoch": 0.180526214710966,
"grad_norm": 54.14537048339844,
"learning_rate": 1.960866487712209e-06,
"loss": 0.5854,
"num_input_tokens_seen": 8873408,
"step": 2820
},
{
"epoch": 0.1808462966519429,
"grad_norm": 32.58800506591797,
"learning_rate": 1.9605563636399695e-06,
"loss": 0.4328,
"num_input_tokens_seen": 8889472,
"step": 2825
},
{
"epoch": 0.18116637859291979,
"grad_norm": 62.71939468383789,
"learning_rate": 1.9602450403018315e-06,
"loss": 0.6013,
"num_input_tokens_seen": 8904640,
"step": 2830
},
{
"epoch": 0.18148646053389667,
"grad_norm": 36.499908447265625,
"learning_rate": 1.9599325180864864e-06,
"loss": 0.4548,
"num_input_tokens_seen": 8919680,
"step": 2835
},
{
"epoch": 0.18180654247487357,
"grad_norm": 32.281959533691406,
"learning_rate": 1.9596187973841216e-06,
"loss": 0.446,
"num_input_tokens_seen": 8935360,
"step": 2840
},
{
"epoch": 0.18212662441585045,
"grad_norm": 26.72726058959961,
"learning_rate": 1.959303878586421e-06,
"loss": 0.4871,
"num_input_tokens_seen": 8951552,
"step": 2845
},
{
"epoch": 0.18244670635682736,
"grad_norm": 43.18830871582031,
"learning_rate": 1.9589877620865647e-06,
"loss": 0.585,
"num_input_tokens_seen": 8968576,
"step": 2850
},
{
"epoch": 0.18276678829780424,
"grad_norm": 30.252056121826172,
"learning_rate": 1.9586704482792277e-06,
"loss": 0.4598,
"num_input_tokens_seen": 8983744,
"step": 2855
},
{
"epoch": 0.18308687023878112,
"grad_norm": 31.71357536315918,
"learning_rate": 1.95835193756058e-06,
"loss": 0.4344,
"num_input_tokens_seen": 8999040,
"step": 2860
},
{
"epoch": 0.18340695217975803,
"grad_norm": 40.17634963989258,
"learning_rate": 1.9580322303282858e-06,
"loss": 0.4269,
"num_input_tokens_seen": 9015872,
"step": 2865
},
{
"epoch": 0.1837270341207349,
"grad_norm": 27.72193145751953,
"learning_rate": 1.9577113269815038e-06,
"loss": 0.4106,
"num_input_tokens_seen": 9031744,
"step": 2870
},
{
"epoch": 0.18404711606171179,
"grad_norm": 36.63798141479492,
"learning_rate": 1.957389227920885e-06,
"loss": 0.5936,
"num_input_tokens_seen": 9047872,
"step": 2875
},
{
"epoch": 0.1843671980026887,
"grad_norm": 36.46480178833008,
"learning_rate": 1.957065933548574e-06,
"loss": 0.5225,
"num_input_tokens_seen": 9062976,
"step": 2880
},
{
"epoch": 0.18468727994366557,
"grad_norm": 59.29536819458008,
"learning_rate": 1.956741444268208e-06,
"loss": 0.5881,
"num_input_tokens_seen": 9078208,
"step": 2885
},
{
"epoch": 0.18500736188464248,
"grad_norm": 31.733598709106445,
"learning_rate": 1.9564157604849154e-06,
"loss": 0.4778,
"num_input_tokens_seen": 9094720,
"step": 2890
},
{
"epoch": 0.18532744382561936,
"grad_norm": 30.152931213378906,
"learning_rate": 1.9560888826053163e-06,
"loss": 0.529,
"num_input_tokens_seen": 9110336,
"step": 2895
},
{
"epoch": 0.18564752576659624,
"grad_norm": 25.448486328125,
"learning_rate": 1.9557608110375212e-06,
"loss": 0.5617,
"num_input_tokens_seen": 9126912,
"step": 2900
},
{
"epoch": 0.18596760770757315,
"grad_norm": 27.7618465423584,
"learning_rate": 1.955431546191132e-06,
"loss": 0.5447,
"num_input_tokens_seen": 9142400,
"step": 2905
},
{
"epoch": 0.18628768964855003,
"grad_norm": 44.82647705078125,
"learning_rate": 1.95510108847724e-06,
"loss": 0.5254,
"num_input_tokens_seen": 9157184,
"step": 2910
},
{
"epoch": 0.1866077715895269,
"grad_norm": 29.998842239379883,
"learning_rate": 1.954769438308424e-06,
"loss": 0.526,
"num_input_tokens_seen": 9173696,
"step": 2915
},
{
"epoch": 0.1869278535305038,
"grad_norm": 38.51725387573242,
"learning_rate": 1.954436596098754e-06,
"loss": 0.5085,
"num_input_tokens_seen": 9190080,
"step": 2920
},
{
"epoch": 0.1872479354714807,
"grad_norm": 61.17892837524414,
"learning_rate": 1.9541025622637875e-06,
"loss": 0.5828,
"num_input_tokens_seen": 9204352,
"step": 2925
},
{
"epoch": 0.1875680174124576,
"grad_norm": 51.220340728759766,
"learning_rate": 1.95376733722057e-06,
"loss": 0.6086,
"num_input_tokens_seen": 9219200,
"step": 2930
},
{
"epoch": 0.18788809935343448,
"grad_norm": 38.08414840698242,
"learning_rate": 1.9534309213876337e-06,
"loss": 0.4778,
"num_input_tokens_seen": 9233600,
"step": 2935
},
{
"epoch": 0.18820818129441136,
"grad_norm": 40.778892517089844,
"learning_rate": 1.953093315184997e-06,
"loss": 0.4369,
"num_input_tokens_seen": 9249536,
"step": 2940
},
{
"epoch": 0.18852826323538827,
"grad_norm": 40.999114990234375,
"learning_rate": 1.952754519034166e-06,
"loss": 0.6525,
"num_input_tokens_seen": 9264256,
"step": 2945
},
{
"epoch": 0.18884834517636515,
"grad_norm": 62.22706985473633,
"learning_rate": 1.9524145333581313e-06,
"loss": 0.4542,
"num_input_tokens_seen": 9279488,
"step": 2950
},
{
"epoch": 0.18916842711734205,
"grad_norm": 27.501522064208984,
"learning_rate": 1.952073358581369e-06,
"loss": 0.5187,
"num_input_tokens_seen": 9294336,
"step": 2955
},
{
"epoch": 0.18948850905831893,
"grad_norm": 37.89274215698242,
"learning_rate": 1.95173099512984e-06,
"loss": 0.5615,
"num_input_tokens_seen": 9309376,
"step": 2960
},
{
"epoch": 0.1898085909992958,
"grad_norm": 23.472244262695312,
"learning_rate": 1.9513874434309894e-06,
"loss": 0.4698,
"num_input_tokens_seen": 9324224,
"step": 2965
},
{
"epoch": 0.19012867294027272,
"grad_norm": 28.482378005981445,
"learning_rate": 1.951042703913745e-06,
"loss": 0.4491,
"num_input_tokens_seen": 9339136,
"step": 2970
},
{
"epoch": 0.1904487548812496,
"grad_norm": 26.93058204650879,
"learning_rate": 1.950696777008518e-06,
"loss": 0.4492,
"num_input_tokens_seen": 9354688,
"step": 2975
},
{
"epoch": 0.19076883682222648,
"grad_norm": 24.83283233642578,
"learning_rate": 1.9503496631472025e-06,
"loss": 0.4948,
"num_input_tokens_seen": 9369664,
"step": 2980
},
{
"epoch": 0.19108891876320339,
"grad_norm": 43.00146484375,
"learning_rate": 1.9500013627631746e-06,
"loss": 0.6353,
"num_input_tokens_seen": 9384768,
"step": 2985
},
{
"epoch": 0.19140900070418027,
"grad_norm": 35.03440856933594,
"learning_rate": 1.949651876291291e-06,
"loss": 0.3771,
"num_input_tokens_seen": 9400320,
"step": 2990
},
{
"epoch": 0.19172908264515717,
"grad_norm": 54.04991912841797,
"learning_rate": 1.9493012041678894e-06,
"loss": 0.4872,
"num_input_tokens_seen": 9415872,
"step": 2995
},
{
"epoch": 0.19204916458613405,
"grad_norm": 33.80318832397461,
"learning_rate": 1.9489493468307883e-06,
"loss": 0.5988,
"num_input_tokens_seen": 9432704,
"step": 3000
},
{
"epoch": 0.19236924652711093,
"grad_norm": 49.77751541137695,
"learning_rate": 1.948596304719286e-06,
"loss": 0.5456,
"num_input_tokens_seen": 9448192,
"step": 3005
},
{
"epoch": 0.19268932846808784,
"grad_norm": 44.03824234008789,
"learning_rate": 1.9482420782741594e-06,
"loss": 0.4447,
"num_input_tokens_seen": 9464576,
"step": 3010
},
{
"epoch": 0.19300941040906472,
"grad_norm": 32.3371467590332,
"learning_rate": 1.9478866679376647e-06,
"loss": 0.5591,
"num_input_tokens_seen": 9479936,
"step": 3015
},
{
"epoch": 0.1933294923500416,
"grad_norm": 31.040849685668945,
"learning_rate": 1.9475300741535353e-06,
"loss": 0.5564,
"num_input_tokens_seen": 9497280,
"step": 3020
},
{
"epoch": 0.1936495742910185,
"grad_norm": 39.65032958984375,
"learning_rate": 1.9471722973669833e-06,
"loss": 0.4714,
"num_input_tokens_seen": 9514496,
"step": 3025
},
{
"epoch": 0.19396965623199539,
"grad_norm": 24.577251434326172,
"learning_rate": 1.946813338024697e-06,
"loss": 0.3979,
"num_input_tokens_seen": 9529536,
"step": 3030
},
{
"epoch": 0.1942897381729723,
"grad_norm": 54.89514923095703,
"learning_rate": 1.9464531965748414e-06,
"loss": 0.5342,
"num_input_tokens_seen": 9545472,
"step": 3035
},
{
"epoch": 0.19460982011394917,
"grad_norm": 40.215274810791016,
"learning_rate": 1.9460918734670573e-06,
"loss": 0.5827,
"num_input_tokens_seen": 9560960,
"step": 3040
},
{
"epoch": 0.19492990205492605,
"grad_norm": 31.09059715270996,
"learning_rate": 1.945729369152461e-06,
"loss": 0.543,
"num_input_tokens_seen": 9576320,
"step": 3045
},
{
"epoch": 0.19524998399590296,
"grad_norm": 45.24855422973633,
"learning_rate": 1.945365684083643e-06,
"loss": 0.5533,
"num_input_tokens_seen": 9592192,
"step": 3050
},
{
"epoch": 0.19557006593687984,
"grad_norm": 55.232234954833984,
"learning_rate": 1.945000818714668e-06,
"loss": 0.615,
"num_input_tokens_seen": 9608128,
"step": 3055
},
{
"epoch": 0.19589014787785672,
"grad_norm": 28.884002685546875,
"learning_rate": 1.944634773501076e-06,
"loss": 0.546,
"num_input_tokens_seen": 9623872,
"step": 3060
},
{
"epoch": 0.19621022981883363,
"grad_norm": 53.66278076171875,
"learning_rate": 1.9442675488998783e-06,
"loss": 0.5662,
"num_input_tokens_seen": 9639488,
"step": 3065
},
{
"epoch": 0.1965303117598105,
"grad_norm": 28.837116241455078,
"learning_rate": 1.9438991453695587e-06,
"loss": 0.5017,
"num_input_tokens_seen": 9655680,
"step": 3070
},
{
"epoch": 0.1968503937007874,
"grad_norm": 38.618587493896484,
"learning_rate": 1.943529563370073e-06,
"loss": 0.5648,
"num_input_tokens_seen": 9670400,
"step": 3075
},
{
"epoch": 0.1971704756417643,
"grad_norm": 21.553041458129883,
"learning_rate": 1.9431588033628495e-06,
"loss": 0.3815,
"num_input_tokens_seen": 9685504,
"step": 3080
},
{
"epoch": 0.19749055758274117,
"grad_norm": 45.82009506225586,
"learning_rate": 1.9427868658107862e-06,
"loss": 0.6302,
"num_input_tokens_seen": 9701952,
"step": 3085
},
{
"epoch": 0.19781063952371808,
"grad_norm": 24.57038116455078,
"learning_rate": 1.942413751178251e-06,
"loss": 0.449,
"num_input_tokens_seen": 9716928,
"step": 3090
},
{
"epoch": 0.19813072146469496,
"grad_norm": 59.18320846557617,
"learning_rate": 1.9420394599310826e-06,
"loss": 0.6552,
"num_input_tokens_seen": 9732096,
"step": 3095
},
{
"epoch": 0.19845080340567184,
"grad_norm": 29.033939361572266,
"learning_rate": 1.941663992536588e-06,
"loss": 0.5247,
"num_input_tokens_seen": 9747648,
"step": 3100
},
{
"epoch": 0.19877088534664875,
"grad_norm": 15.162464141845703,
"learning_rate": 1.941287349463542e-06,
"loss": 0.4467,
"num_input_tokens_seen": 9763072,
"step": 3105
},
{
"epoch": 0.19909096728762563,
"grad_norm": 31.735469818115234,
"learning_rate": 1.940909531182188e-06,
"loss": 0.4856,
"num_input_tokens_seen": 9778176,
"step": 3110
},
{
"epoch": 0.19941104922860253,
"grad_norm": 47.100677490234375,
"learning_rate": 1.9405305381642375e-06,
"loss": 0.6168,
"num_input_tokens_seen": 9793536,
"step": 3115
},
{
"epoch": 0.1997311311695794,
"grad_norm": 25.74694061279297,
"learning_rate": 1.9401503708828665e-06,
"loss": 0.5055,
"num_input_tokens_seen": 9808192,
"step": 3120
},
{
"epoch": 0.2000512131105563,
"grad_norm": 32.5270881652832,
"learning_rate": 1.939769029812719e-06,
"loss": 0.5889,
"num_input_tokens_seen": 9823232,
"step": 3125
},
{
"epoch": 0.20024326227514244,
"eval_loss": 0.4917045831680298,
"eval_runtime": 50.5995,
"eval_samples_per_second": 274.43,
"eval_steps_per_second": 34.309,
"num_input_tokens_seen": 9832064,
"step": 3128
},
{
"epoch": 0.2003712950515332,
"grad_norm": 41.536319732666016,
"learning_rate": 1.939386515429904e-06,
"loss": 0.5998,
"num_input_tokens_seen": 9839488,
"step": 3130
},
{
"epoch": 0.20069137699251008,
"grad_norm": 21.79485321044922,
"learning_rate": 1.9390028282119942e-06,
"loss": 0.4234,
"num_input_tokens_seen": 9856192,
"step": 3135
},
{
"epoch": 0.201011458933487,
"grad_norm": 39.57857131958008,
"learning_rate": 1.938617968638029e-06,
"loss": 0.5139,
"num_input_tokens_seen": 9871552,
"step": 3140
},
{
"epoch": 0.20133154087446387,
"grad_norm": 40.204986572265625,
"learning_rate": 1.938231937188509e-06,
"loss": 0.5103,
"num_input_tokens_seen": 9886016,
"step": 3145
},
{
"epoch": 0.20165162281544075,
"grad_norm": 43.581180572509766,
"learning_rate": 1.9378447343453995e-06,
"loss": 0.6257,
"num_input_tokens_seen": 9903552,
"step": 3150
},
{
"epoch": 0.20197170475641765,
"grad_norm": 42.93930435180664,
"learning_rate": 1.9374563605921275e-06,
"loss": 0.3501,
"num_input_tokens_seen": 9920320,
"step": 3155
},
{
"epoch": 0.20229178669739453,
"grad_norm": 31.756664276123047,
"learning_rate": 1.937066816413582e-06,
"loss": 0.5844,
"num_input_tokens_seen": 9935936,
"step": 3160
},
{
"epoch": 0.2026118686383714,
"grad_norm": 27.54891586303711,
"learning_rate": 1.9366761022961146e-06,
"loss": 0.4866,
"num_input_tokens_seen": 9950912,
"step": 3165
},
{
"epoch": 0.20293195057934832,
"grad_norm": 43.409576416015625,
"learning_rate": 1.9362842187275354e-06,
"loss": 0.5726,
"num_input_tokens_seen": 9966080,
"step": 3170
},
{
"epoch": 0.2032520325203252,
"grad_norm": 29.677104949951172,
"learning_rate": 1.9358911661971155e-06,
"loss": 0.4769,
"num_input_tokens_seen": 9982080,
"step": 3175
},
{
"epoch": 0.2035721144613021,
"grad_norm": 31.4278621673584,
"learning_rate": 1.9354969451955864e-06,
"loss": 0.4818,
"num_input_tokens_seen": 9996544,
"step": 3180
},
{
"epoch": 0.20389219640227899,
"grad_norm": 28.86357879638672,
"learning_rate": 1.9351015562151375e-06,
"loss": 0.5595,
"num_input_tokens_seen": 10011776,
"step": 3185
},
{
"epoch": 0.20421227834325587,
"grad_norm": 26.479507446289062,
"learning_rate": 1.934704999749416e-06,
"loss": 0.4337,
"num_input_tokens_seen": 10027264,
"step": 3190
},
{
"epoch": 0.20453236028423277,
"grad_norm": 24.264083862304688,
"learning_rate": 1.9343072762935274e-06,
"loss": 0.4251,
"num_input_tokens_seen": 10042432,
"step": 3195
},
{
"epoch": 0.20485244222520965,
"grad_norm": 29.045461654663086,
"learning_rate": 1.933908386344035e-06,
"loss": 0.4122,
"num_input_tokens_seen": 10057792,
"step": 3200
},
{
"epoch": 0.20517252416618653,
"grad_norm": 35.074466705322266,
"learning_rate": 1.9335083303989565e-06,
"loss": 0.528,
"num_input_tokens_seen": 10074752,
"step": 3205
},
{
"epoch": 0.20549260610716344,
"grad_norm": 39.952335357666016,
"learning_rate": 1.9331071089577674e-06,
"loss": 0.5767,
"num_input_tokens_seen": 10090752,
"step": 3210
},
{
"epoch": 0.20581268804814032,
"grad_norm": 49.97673416137695,
"learning_rate": 1.9327047225213963e-06,
"loss": 0.5028,
"num_input_tokens_seen": 10106240,
"step": 3215
},
{
"epoch": 0.20613276998911723,
"grad_norm": 40.61750411987305,
"learning_rate": 1.9323011715922283e-06,
"loss": 0.4154,
"num_input_tokens_seen": 10121856,
"step": 3220
},
{
"epoch": 0.2064528519300941,
"grad_norm": 70.33148956298828,
"learning_rate": 1.931896456674101e-06,
"loss": 0.4682,
"num_input_tokens_seen": 10137408,
"step": 3225
},
{
"epoch": 0.20677293387107099,
"grad_norm": 35.08588790893555,
"learning_rate": 1.931490578272306e-06,
"loss": 0.4611,
"num_input_tokens_seen": 10152640,
"step": 3230
},
{
"epoch": 0.2070930158120479,
"grad_norm": 35.493282318115234,
"learning_rate": 1.9310835368935867e-06,
"loss": 0.3551,
"num_input_tokens_seen": 10167936,
"step": 3235
},
{
"epoch": 0.20741309775302477,
"grad_norm": 37.591766357421875,
"learning_rate": 1.93067533304614e-06,
"loss": 0.4241,
"num_input_tokens_seen": 10183360,
"step": 3240
},
{
"epoch": 0.20773317969400165,
"grad_norm": 31.755170822143555,
"learning_rate": 1.9302659672396128e-06,
"loss": 0.5624,
"num_input_tokens_seen": 10198208,
"step": 3245
},
{
"epoch": 0.20805326163497856,
"grad_norm": 27.73048210144043,
"learning_rate": 1.9298554399851025e-06,
"loss": 0.4975,
"num_input_tokens_seen": 10213568,
"step": 3250
},
{
"epoch": 0.20837334357595544,
"grad_norm": 38.50849533081055,
"learning_rate": 1.929443751795158e-06,
"loss": 0.4755,
"num_input_tokens_seen": 10230080,
"step": 3255
},
{
"epoch": 0.20869342551693235,
"grad_norm": 24.83016586303711,
"learning_rate": 1.929030903183776e-06,
"loss": 0.4792,
"num_input_tokens_seen": 10246912,
"step": 3260
},
{
"epoch": 0.20901350745790923,
"grad_norm": 40.69696044921875,
"learning_rate": 1.9286168946664033e-06,
"loss": 0.5231,
"num_input_tokens_seen": 10262464,
"step": 3265
},
{
"epoch": 0.2093335893988861,
"grad_norm": 65.39424896240234,
"learning_rate": 1.9282017267599352e-06,
"loss": 0.6606,
"num_input_tokens_seen": 10278016,
"step": 3270
},
{
"epoch": 0.209653671339863,
"grad_norm": 44.02311706542969,
"learning_rate": 1.9277853999827125e-06,
"loss": 0.5055,
"num_input_tokens_seen": 10293824,
"step": 3275
},
{
"epoch": 0.2099737532808399,
"grad_norm": 45.903785705566406,
"learning_rate": 1.9273679148545244e-06,
"loss": 0.5263,
"num_input_tokens_seen": 10309568,
"step": 3280
},
{
"epoch": 0.21029383522181677,
"grad_norm": 35.46440124511719,
"learning_rate": 1.9269492718966062e-06,
"loss": 0.4181,
"num_input_tokens_seen": 10325696,
"step": 3285
},
{
"epoch": 0.21061391716279368,
"grad_norm": 31.961286544799805,
"learning_rate": 1.9265294716316384e-06,
"loss": 0.5398,
"num_input_tokens_seen": 10342016,
"step": 3290
},
{
"epoch": 0.21093399910377056,
"grad_norm": 29.716991424560547,
"learning_rate": 1.926108514583747e-06,
"loss": 0.468,
"num_input_tokens_seen": 10357632,
"step": 3295
},
{
"epoch": 0.21125408104474747,
"grad_norm": 51.53056335449219,
"learning_rate": 1.925686401278501e-06,
"loss": 0.4805,
"num_input_tokens_seen": 10373056,
"step": 3300
},
{
"epoch": 0.21157416298572435,
"grad_norm": 55.39066696166992,
"learning_rate": 1.9252631322429143e-06,
"loss": 0.6377,
"num_input_tokens_seen": 10389248,
"step": 3305
},
{
"epoch": 0.21189424492670123,
"grad_norm": 25.772296905517578,
"learning_rate": 1.9248387080054435e-06,
"loss": 0.4445,
"num_input_tokens_seen": 10404864,
"step": 3310
},
{
"epoch": 0.21221432686767813,
"grad_norm": 19.97397232055664,
"learning_rate": 1.9244131290959864e-06,
"loss": 0.4925,
"num_input_tokens_seen": 10420416,
"step": 3315
},
{
"epoch": 0.212534408808655,
"grad_norm": 32.75675964355469,
"learning_rate": 1.9239863960458845e-06,
"loss": 0.4311,
"num_input_tokens_seen": 10435456,
"step": 3320
},
{
"epoch": 0.21285449074963192,
"grad_norm": 33.047603607177734,
"learning_rate": 1.923558509387918e-06,
"loss": 0.4857,
"num_input_tokens_seen": 10451584,
"step": 3325
},
{
"epoch": 0.2131745726906088,
"grad_norm": 39.12358474731445,
"learning_rate": 1.9231294696563086e-06,
"loss": 0.3719,
"num_input_tokens_seen": 10467584,
"step": 3330
},
{
"epoch": 0.21349465463158568,
"grad_norm": 61.4869384765625,
"learning_rate": 1.922699277386718e-06,
"loss": 0.432,
"num_input_tokens_seen": 10483264,
"step": 3335
},
{
"epoch": 0.21381473657256259,
"grad_norm": 35.242061614990234,
"learning_rate": 1.9222679331162454e-06,
"loss": 0.5869,
"num_input_tokens_seen": 10498560,
"step": 3340
},
{
"epoch": 0.21413481851353947,
"grad_norm": 37.8232536315918,
"learning_rate": 1.92183543738343e-06,
"loss": 0.4431,
"num_input_tokens_seen": 10514176,
"step": 3345
},
{
"epoch": 0.21445490045451635,
"grad_norm": 26.13767433166504,
"learning_rate": 1.9214017907282475e-06,
"loss": 0.4427,
"num_input_tokens_seen": 10529792,
"step": 3350
},
{
"epoch": 0.21477498239549325,
"grad_norm": 38.66913986206055,
"learning_rate": 1.9209669936921105e-06,
"loss": 0.499,
"num_input_tokens_seen": 10545856,
"step": 3355
},
{
"epoch": 0.21509506433647013,
"grad_norm": 45.678104400634766,
"learning_rate": 1.920531046817869e-06,
"loss": 0.4092,
"num_input_tokens_seen": 10562368,
"step": 3360
},
{
"epoch": 0.21541514627744704,
"grad_norm": 47.112674713134766,
"learning_rate": 1.9200939506498067e-06,
"loss": 0.6207,
"num_input_tokens_seen": 10577280,
"step": 3365
},
{
"epoch": 0.21573522821842392,
"grad_norm": 29.5268497467041,
"learning_rate": 1.9196557057336446e-06,
"loss": 0.5719,
"num_input_tokens_seen": 10592384,
"step": 3370
},
{
"epoch": 0.2160553101594008,
"grad_norm": 25.944847106933594,
"learning_rate": 1.9192163126165354e-06,
"loss": 0.4577,
"num_input_tokens_seen": 10608704,
"step": 3375
},
{
"epoch": 0.2163753921003777,
"grad_norm": 44.30316925048828,
"learning_rate": 1.9187757718470673e-06,
"loss": 0.4011,
"num_input_tokens_seen": 10625280,
"step": 3380
},
{
"epoch": 0.21669547404135459,
"grad_norm": 23.519418716430664,
"learning_rate": 1.9183340839752606e-06,
"loss": 0.5336,
"num_input_tokens_seen": 10641152,
"step": 3385
},
{
"epoch": 0.21701555598233147,
"grad_norm": 30.161663055419922,
"learning_rate": 1.9178912495525672e-06,
"loss": 0.4227,
"num_input_tokens_seen": 10657472,
"step": 3390
},
{
"epoch": 0.21733563792330837,
"grad_norm": 24.444168090820312,
"learning_rate": 1.917447269131872e-06,
"loss": 0.4942,
"num_input_tokens_seen": 10673600,
"step": 3395
},
{
"epoch": 0.21765571986428525,
"grad_norm": 37.56548309326172,
"learning_rate": 1.917002143267489e-06,
"loss": 0.5717,
"num_input_tokens_seen": 10689344,
"step": 3400
},
{
"epoch": 0.21797580180526216,
"grad_norm": 27.281709671020508,
"learning_rate": 1.9165558725151633e-06,
"loss": 0.4315,
"num_input_tokens_seen": 10704384,
"step": 3405
},
{
"epoch": 0.21829588374623904,
"grad_norm": 58.759857177734375,
"learning_rate": 1.9161084574320692e-06,
"loss": 0.4937,
"num_input_tokens_seen": 10720512,
"step": 3410
},
{
"epoch": 0.21861596568721592,
"grad_norm": 34.621681213378906,
"learning_rate": 1.91565989857681e-06,
"loss": 0.485,
"num_input_tokens_seen": 10735744,
"step": 3415
},
{
"epoch": 0.21893604762819283,
"grad_norm": 32.12639617919922,
"learning_rate": 1.9152101965094162e-06,
"loss": 0.4665,
"num_input_tokens_seen": 10750848,
"step": 3420
},
{
"epoch": 0.2192561295691697,
"grad_norm": 49.00548553466797,
"learning_rate": 1.9147593517913464e-06,
"loss": 0.4939,
"num_input_tokens_seen": 10765632,
"step": 3425
},
{
"epoch": 0.21957621151014659,
"grad_norm": 18.40258026123047,
"learning_rate": 1.914307364985485e-06,
"loss": 0.3868,
"num_input_tokens_seen": 10780928,
"step": 3430
},
{
"epoch": 0.2198962934511235,
"grad_norm": 26.393571853637695,
"learning_rate": 1.913854236656144e-06,
"loss": 0.4273,
"num_input_tokens_seen": 10796864,
"step": 3435
},
{
"epoch": 0.22021637539210037,
"grad_norm": 42.73613357543945,
"learning_rate": 1.9133999673690584e-06,
"loss": 0.4643,
"num_input_tokens_seen": 10812672,
"step": 3440
},
{
"epoch": 0.22053645733307728,
"grad_norm": 46.20648193359375,
"learning_rate": 1.9129445576913886e-06,
"loss": 0.4782,
"num_input_tokens_seen": 10828544,
"step": 3445
},
{
"epoch": 0.22085653927405416,
"grad_norm": 25.036144256591797,
"learning_rate": 1.91248800819172e-06,
"loss": 0.5318,
"num_input_tokens_seen": 10844288,
"step": 3450
},
{
"epoch": 0.22117662121503104,
"grad_norm": 49.720008850097656,
"learning_rate": 1.912030319440059e-06,
"loss": 0.5306,
"num_input_tokens_seen": 10860160,
"step": 3455
},
{
"epoch": 0.22149670315600795,
"grad_norm": 37.309383392333984,
"learning_rate": 1.9115714920078354e-06,
"loss": 0.6076,
"num_input_tokens_seen": 10875968,
"step": 3460
},
{
"epoch": 0.22181678509698483,
"grad_norm": 27.302000045776367,
"learning_rate": 1.9111115264679017e-06,
"loss": 0.3367,
"num_input_tokens_seen": 10892096,
"step": 3465
},
{
"epoch": 0.2221368670379617,
"grad_norm": 45.5595703125,
"learning_rate": 1.910650423394529e-06,
"loss": 0.4611,
"num_input_tokens_seen": 10908544,
"step": 3470
},
{
"epoch": 0.2224569489789386,
"grad_norm": 47.52442169189453,
"learning_rate": 1.910188183363411e-06,
"loss": 0.4804,
"num_input_tokens_seen": 10924544,
"step": 3475
},
{
"epoch": 0.2227770309199155,
"grad_norm": 50.32191467285156,
"learning_rate": 1.909724806951659e-06,
"loss": 0.4408,
"num_input_tokens_seen": 10941888,
"step": 3480
},
{
"epoch": 0.2230971128608924,
"grad_norm": 49.5562858581543,
"learning_rate": 1.909260294737804e-06,
"loss": 0.4689,
"num_input_tokens_seen": 10958592,
"step": 3485
},
{
"epoch": 0.22341719480186928,
"grad_norm": 82.99776458740234,
"learning_rate": 1.9087946473017953e-06,
"loss": 0.5319,
"num_input_tokens_seen": 10974208,
"step": 3490
},
{
"epoch": 0.22373727674284616,
"grad_norm": 33.70225524902344,
"learning_rate": 1.9083278652249992e-06,
"loss": 0.4363,
"num_input_tokens_seen": 10988928,
"step": 3495
},
{
"epoch": 0.22405735868382307,
"grad_norm": 38.13908386230469,
"learning_rate": 1.9078599490901983e-06,
"loss": 0.4327,
"num_input_tokens_seen": 11005952,
"step": 3500
},
{
"epoch": 0.22437744062479995,
"grad_norm": 96.11404418945312,
"learning_rate": 1.9073908994815914e-06,
"loss": 0.4012,
"num_input_tokens_seen": 11020608,
"step": 3505
},
{
"epoch": 0.22469752256577685,
"grad_norm": 50.33843994140625,
"learning_rate": 1.9069207169847928e-06,
"loss": 0.4999,
"num_input_tokens_seen": 11036736,
"step": 3510
},
{
"epoch": 0.22501760450675373,
"grad_norm": 34.01993179321289,
"learning_rate": 1.9064494021868302e-06,
"loss": 0.3645,
"num_input_tokens_seen": 11052480,
"step": 3515
},
{
"epoch": 0.2253376864477306,
"grad_norm": 38.320194244384766,
"learning_rate": 1.9059769556761464e-06,
"loss": 0.4816,
"num_input_tokens_seen": 11068416,
"step": 3520
},
{
"epoch": 0.22565776838870752,
"grad_norm": 32.70565414428711,
"learning_rate": 1.9055033780425962e-06,
"loss": 0.4443,
"num_input_tokens_seen": 11086400,
"step": 3525
},
{
"epoch": 0.2259778503296844,
"grad_norm": 87.77069854736328,
"learning_rate": 1.9050286698774464e-06,
"loss": 0.5674,
"num_input_tokens_seen": 11102848,
"step": 3530
},
{
"epoch": 0.22629793227066128,
"grad_norm": 41.02049255371094,
"learning_rate": 1.904552831773376e-06,
"loss": 0.5366,
"num_input_tokens_seen": 11118080,
"step": 3535
},
{
"epoch": 0.22661801421163819,
"grad_norm": 25.107044219970703,
"learning_rate": 1.9040758643244748e-06,
"loss": 0.5045,
"num_input_tokens_seen": 11133120,
"step": 3540
},
{
"epoch": 0.22693809615261507,
"grad_norm": 30.775938034057617,
"learning_rate": 1.903597768126242e-06,
"loss": 0.4452,
"num_input_tokens_seen": 11150144,
"step": 3545
},
{
"epoch": 0.22725817809359197,
"grad_norm": 53.267887115478516,
"learning_rate": 1.9031185437755862e-06,
"loss": 0.4862,
"num_input_tokens_seen": 11165760,
"step": 3550
},
{
"epoch": 0.22757826003456885,
"grad_norm": 52.50774383544922,
"learning_rate": 1.9026381918708246e-06,
"loss": 0.4948,
"num_input_tokens_seen": 11180096,
"step": 3555
},
{
"epoch": 0.22789834197554573,
"grad_norm": 20.394287109375,
"learning_rate": 1.9021567130116822e-06,
"loss": 0.3775,
"num_input_tokens_seen": 11195584,
"step": 3560
},
{
"epoch": 0.22821842391652264,
"grad_norm": 48.595298767089844,
"learning_rate": 1.9016741077992916e-06,
"loss": 0.389,
"num_input_tokens_seen": 11210944,
"step": 3565
},
{
"epoch": 0.22853850585749952,
"grad_norm": 27.00090980529785,
"learning_rate": 1.90119037683619e-06,
"loss": 0.4008,
"num_input_tokens_seen": 11227392,
"step": 3570
},
{
"epoch": 0.2288585877984764,
"grad_norm": 31.237030029296875,
"learning_rate": 1.9007055207263223e-06,
"loss": 0.6598,
"num_input_tokens_seen": 11244416,
"step": 3575
},
{
"epoch": 0.2291786697394533,
"grad_norm": 29.584184646606445,
"learning_rate": 1.900219540075036e-06,
"loss": 0.3584,
"num_input_tokens_seen": 11260672,
"step": 3580
},
{
"epoch": 0.22949875168043019,
"grad_norm": 58.92024612426758,
"learning_rate": 1.8997324354890845e-06,
"loss": 0.4823,
"num_input_tokens_seen": 11277504,
"step": 3585
},
{
"epoch": 0.2298188336214071,
"grad_norm": 71.05110931396484,
"learning_rate": 1.8992442075766233e-06,
"loss": 0.5325,
"num_input_tokens_seen": 11293184,
"step": 3590
},
{
"epoch": 0.23013891556238397,
"grad_norm": 31.41910743713379,
"learning_rate": 1.8987548569472105e-06,
"loss": 0.3273,
"num_input_tokens_seen": 11308480,
"step": 3595
},
{
"epoch": 0.23045899750336085,
"grad_norm": 31.32626724243164,
"learning_rate": 1.8982643842118064e-06,
"loss": 0.3958,
"num_input_tokens_seen": 11323840,
"step": 3600
},
{
"epoch": 0.23077907944433776,
"grad_norm": 66.50199127197266,
"learning_rate": 1.8977727899827716e-06,
"loss": 0.5822,
"num_input_tokens_seen": 11339456,
"step": 3605
},
{
"epoch": 0.23109916138531464,
"grad_norm": 49.916748046875,
"learning_rate": 1.8972800748738678e-06,
"loss": 0.6628,
"num_input_tokens_seen": 11354880,
"step": 3610
},
{
"epoch": 0.23141924332629152,
"grad_norm": 28.426061630249023,
"learning_rate": 1.896786239500255e-06,
"loss": 0.5365,
"num_input_tokens_seen": 11369984,
"step": 3615
},
{
"epoch": 0.23173932526726843,
"grad_norm": 51.97602081298828,
"learning_rate": 1.8962912844784928e-06,
"loss": 0.4328,
"num_input_tokens_seen": 11384640,
"step": 3620
},
{
"epoch": 0.2320594072082453,
"grad_norm": 53.62090301513672,
"learning_rate": 1.8957952104265384e-06,
"loss": 0.5017,
"num_input_tokens_seen": 11401152,
"step": 3625
},
{
"epoch": 0.2323794891492222,
"grad_norm": 34.26565170288086,
"learning_rate": 1.8952980179637458e-06,
"loss": 0.4551,
"num_input_tokens_seen": 11416896,
"step": 3630
},
{
"epoch": 0.2326995710901991,
"grad_norm": 36.66518020629883,
"learning_rate": 1.8947997077108662e-06,
"loss": 0.5002,
"num_input_tokens_seen": 11432832,
"step": 3635
},
{
"epoch": 0.23301965303117597,
"grad_norm": 32.46730041503906,
"learning_rate": 1.894300280290045e-06,
"loss": 0.5022,
"num_input_tokens_seen": 11448320,
"step": 3640
},
{
"epoch": 0.23333973497215288,
"grad_norm": 25.59243392944336,
"learning_rate": 1.8937997363248237e-06,
"loss": 0.5691,
"num_input_tokens_seen": 11463488,
"step": 3645
},
{
"epoch": 0.23365981691312976,
"grad_norm": 21.926359176635742,
"learning_rate": 1.8932980764401373e-06,
"loss": 0.4616,
"num_input_tokens_seen": 11478592,
"step": 3650
},
{
"epoch": 0.23397989885410664,
"grad_norm": 26.116849899291992,
"learning_rate": 1.8927953012623141e-06,
"loss": 0.367,
"num_input_tokens_seen": 11494720,
"step": 3655
},
{
"epoch": 0.23429998079508355,
"grad_norm": 56.29279708862305,
"learning_rate": 1.8922914114190744e-06,
"loss": 0.4884,
"num_input_tokens_seen": 11511232,
"step": 3660
},
{
"epoch": 0.23462006273606043,
"grad_norm": 33.87401580810547,
"learning_rate": 1.8917864075395312e-06,
"loss": 0.5212,
"num_input_tokens_seen": 11527040,
"step": 3665
},
{
"epoch": 0.23494014467703733,
"grad_norm": 19.292613983154297,
"learning_rate": 1.8912802902541873e-06,
"loss": 0.4641,
"num_input_tokens_seen": 11542528,
"step": 3670
},
{
"epoch": 0.2352602266180142,
"grad_norm": 40.246734619140625,
"learning_rate": 1.8907730601949362e-06,
"loss": 0.503,
"num_input_tokens_seen": 11557696,
"step": 3675
},
{
"epoch": 0.2355803085589911,
"grad_norm": 50.62693786621094,
"learning_rate": 1.8902647179950608e-06,
"loss": 0.4703,
"num_input_tokens_seen": 11574848,
"step": 3680
},
{
"epoch": 0.235900390499968,
"grad_norm": 45.66773986816406,
"learning_rate": 1.889755264289232e-06,
"loss": 0.5074,
"num_input_tokens_seen": 11589696,
"step": 3685
},
{
"epoch": 0.23622047244094488,
"grad_norm": 20.744386672973633,
"learning_rate": 1.8892446997135087e-06,
"loss": 0.3915,
"num_input_tokens_seen": 11606848,
"step": 3690
},
{
"epoch": 0.23654055438192176,
"grad_norm": 30.993366241455078,
"learning_rate": 1.888733024905337e-06,
"loss": 0.6641,
"num_input_tokens_seen": 11623744,
"step": 3695
},
{
"epoch": 0.23686063632289867,
"grad_norm": 38.157161712646484,
"learning_rate": 1.888220240503549e-06,
"loss": 0.4874,
"num_input_tokens_seen": 11640256,
"step": 3700
},
{
"epoch": 0.23718071826387555,
"grad_norm": 33.60673522949219,
"learning_rate": 1.8877063471483618e-06,
"loss": 0.4096,
"num_input_tokens_seen": 11655744,
"step": 3705
},
{
"epoch": 0.23750080020485245,
"grad_norm": 17.36272621154785,
"learning_rate": 1.8871913454813772e-06,
"loss": 0.2877,
"num_input_tokens_seen": 11671104,
"step": 3710
},
{
"epoch": 0.23782088214582933,
"grad_norm": 31.4836483001709,
"learning_rate": 1.886675236145581e-06,
"loss": 0.3741,
"num_input_tokens_seen": 11686848,
"step": 3715
},
{
"epoch": 0.2381409640868062,
"grad_norm": 29.86385726928711,
"learning_rate": 1.8861580197853422e-06,
"loss": 0.5053,
"num_input_tokens_seen": 11701952,
"step": 3720
},
{
"epoch": 0.23846104602778312,
"grad_norm": 39.52205276489258,
"learning_rate": 1.8856396970464105e-06,
"loss": 0.4637,
"num_input_tokens_seen": 11718592,
"step": 3725
},
{
"epoch": 0.23878112796876,
"grad_norm": 36.30428695678711,
"learning_rate": 1.8851202685759189e-06,
"loss": 0.5129,
"num_input_tokens_seen": 11734208,
"step": 3730
},
{
"epoch": 0.2391012099097369,
"grad_norm": 12.694880485534668,
"learning_rate": 1.8845997350223792e-06,
"loss": 0.4186,
"num_input_tokens_seen": 11748992,
"step": 3735
},
{
"epoch": 0.23942129185071379,
"grad_norm": 29.11704444885254,
"learning_rate": 1.8840780970356842e-06,
"loss": 0.4258,
"num_input_tokens_seen": 11764608,
"step": 3740
},
{
"epoch": 0.23974137379169067,
"grad_norm": 28.726520538330078,
"learning_rate": 1.8835553552671048e-06,
"loss": 0.3919,
"num_input_tokens_seen": 11780800,
"step": 3745
},
{
"epoch": 0.24006145573266757,
"grad_norm": 32.10429382324219,
"learning_rate": 1.8830315103692902e-06,
"loss": 0.4467,
"num_input_tokens_seen": 11795776,
"step": 3750
},
{
"epoch": 0.24038153767364445,
"grad_norm": 36.08988571166992,
"learning_rate": 1.8825065629962669e-06,
"loss": 0.52,
"num_input_tokens_seen": 11811776,
"step": 3755
},
{
"epoch": 0.24070161961462133,
"grad_norm": 34.568302154541016,
"learning_rate": 1.881980513803438e-06,
"loss": 0.4902,
"num_input_tokens_seen": 11828224,
"step": 3760
},
{
"epoch": 0.24102170155559824,
"grad_norm": 49.93181228637695,
"learning_rate": 1.881453363447582e-06,
"loss": 0.5093,
"num_input_tokens_seen": 11843904,
"step": 3765
},
{
"epoch": 0.24134178349657512,
"grad_norm": 58.88014221191406,
"learning_rate": 1.880925112586852e-06,
"loss": 0.5653,
"num_input_tokens_seen": 11859392,
"step": 3770
},
{
"epoch": 0.24166186543755203,
"grad_norm": 48.48269271850586,
"learning_rate": 1.8803957618807762e-06,
"loss": 0.4347,
"num_input_tokens_seen": 11875968,
"step": 3775
},
{
"epoch": 0.2419819473785289,
"grad_norm": 70.41657257080078,
"learning_rate": 1.8798653119902548e-06,
"loss": 0.4527,
"num_input_tokens_seen": 11891584,
"step": 3780
},
{
"epoch": 0.24230202931950579,
"grad_norm": 27.50962257385254,
"learning_rate": 1.8793337635775603e-06,
"loss": 0.4997,
"num_input_tokens_seen": 11906944,
"step": 3785
},
{
"epoch": 0.2426221112604827,
"grad_norm": 47.6135139465332,
"learning_rate": 1.8788011173063376e-06,
"loss": 0.4883,
"num_input_tokens_seen": 11922368,
"step": 3790
},
{
"epoch": 0.24294219320145957,
"grad_norm": 50.76975631713867,
"learning_rate": 1.8782673738416018e-06,
"loss": 0.5193,
"num_input_tokens_seen": 11938432,
"step": 3795
},
{
"epoch": 0.24326227514243645,
"grad_norm": 48.272972106933594,
"learning_rate": 1.877732533849737e-06,
"loss": 0.5232,
"num_input_tokens_seen": 11956608,
"step": 3800
},
{
"epoch": 0.24358235708341336,
"grad_norm": 27.233749389648438,
"learning_rate": 1.8771965979984988e-06,
"loss": 0.4473,
"num_input_tokens_seen": 11972480,
"step": 3805
},
{
"epoch": 0.24390243902439024,
"grad_norm": 23.872587203979492,
"learning_rate": 1.8766595669570084e-06,
"loss": 0.3903,
"num_input_tokens_seen": 11987072,
"step": 3810
},
{
"epoch": 0.24422252096536715,
"grad_norm": 32.66127395629883,
"learning_rate": 1.8761214413957553e-06,
"loss": 0.4257,
"num_input_tokens_seen": 12002112,
"step": 3815
},
{
"epoch": 0.24454260290634403,
"grad_norm": 24.8861026763916,
"learning_rate": 1.8755822219865963e-06,
"loss": 0.3607,
"num_input_tokens_seen": 12016960,
"step": 3820
},
{
"epoch": 0.2448626848473209,
"grad_norm": 68.05755615234375,
"learning_rate": 1.875041909402752e-06,
"loss": 0.4264,
"num_input_tokens_seen": 12032576,
"step": 3825
},
{
"epoch": 0.2451827667882978,
"grad_norm": 25.900711059570312,
"learning_rate": 1.8745005043188102e-06,
"loss": 0.3694,
"num_input_tokens_seen": 12048768,
"step": 3830
},
{
"epoch": 0.2455028487292747,
"grad_norm": 34.87873077392578,
"learning_rate": 1.8739580074107208e-06,
"loss": 0.3828,
"num_input_tokens_seen": 12065088,
"step": 3835
},
{
"epoch": 0.24582293067025157,
"grad_norm": 41.93102264404297,
"learning_rate": 1.873414419355798e-06,
"loss": 0.7066,
"num_input_tokens_seen": 12080704,
"step": 3840
},
{
"epoch": 0.24614301261122848,
"grad_norm": 43.951683044433594,
"learning_rate": 1.872869740832717e-06,
"loss": 0.4319,
"num_input_tokens_seen": 12096704,
"step": 3845
},
{
"epoch": 0.24646309455220536,
"grad_norm": 38.640892028808594,
"learning_rate": 1.8723239725215165e-06,
"loss": 0.5962,
"num_input_tokens_seen": 12111488,
"step": 3850
},
{
"epoch": 0.24678317649318227,
"grad_norm": 22.83368682861328,
"learning_rate": 1.871777115103594e-06,
"loss": 0.4195,
"num_input_tokens_seen": 12128192,
"step": 3855
},
{
"epoch": 0.24710325843415915,
"grad_norm": 21.497661590576172,
"learning_rate": 1.8712291692617074e-06,
"loss": 0.4734,
"num_input_tokens_seen": 12143808,
"step": 3860
},
{
"epoch": 0.24742334037513602,
"grad_norm": 35.97737121582031,
"learning_rate": 1.8706801356799735e-06,
"loss": 0.496,
"num_input_tokens_seen": 12159232,
"step": 3865
},
{
"epoch": 0.24774342231611293,
"grad_norm": 28.39885139465332,
"learning_rate": 1.8701300150438674e-06,
"loss": 0.4515,
"num_input_tokens_seen": 12175360,
"step": 3870
},
{
"epoch": 0.2480635042570898,
"grad_norm": 18.00739288330078,
"learning_rate": 1.869578808040221e-06,
"loss": 0.4208,
"num_input_tokens_seen": 12190272,
"step": 3875
},
{
"epoch": 0.2483835861980667,
"grad_norm": 48.9376106262207,
"learning_rate": 1.869026515357223e-06,
"loss": 0.5226,
"num_input_tokens_seen": 12208448,
"step": 3880
},
{
"epoch": 0.2487036681390436,
"grad_norm": 39.13751220703125,
"learning_rate": 1.8684731376844169e-06,
"loss": 0.6458,
"num_input_tokens_seen": 12225984,
"step": 3885
},
{
"epoch": 0.24902375008002048,
"grad_norm": 35.23835372924805,
"learning_rate": 1.8679186757127014e-06,
"loss": 0.5022,
"num_input_tokens_seen": 12241408,
"step": 3890
},
{
"epoch": 0.24934383202099739,
"grad_norm": 33.123191833496094,
"learning_rate": 1.8673631301343288e-06,
"loss": 0.4355,
"num_input_tokens_seen": 12256064,
"step": 3895
},
{
"epoch": 0.24966391396197427,
"grad_norm": 31.406911849975586,
"learning_rate": 1.8668065016429044e-06,
"loss": 0.4515,
"num_input_tokens_seen": 12272832,
"step": 3900
},
{
"epoch": 0.24998399590295114,
"grad_norm": 23.97220230102539,
"learning_rate": 1.866248790933385e-06,
"loss": 0.5368,
"num_input_tokens_seen": 12289024,
"step": 3905
},
{
"epoch": 0.25030407784392805,
"grad_norm": 27.529621124267578,
"learning_rate": 1.8656899987020795e-06,
"loss": 0.4277,
"num_input_tokens_seen": 12304064,
"step": 3910
},
{
"epoch": 0.25030407784392805,
"eval_loss": 0.46774157881736755,
"eval_runtime": 50.7199,
"eval_samples_per_second": 273.778,
"eval_steps_per_second": 34.227,
"num_input_tokens_seen": 12304064,
"step": 3910
},
{
"epoch": 0.25062415978490493,
"grad_norm": 33.151676177978516,
"learning_rate": 1.865130125646646e-06,
"loss": 0.4665,
"num_input_tokens_seen": 12320256,
"step": 3915
},
{
"epoch": 0.2509442417258818,
"grad_norm": 22.091079711914062,
"learning_rate": 1.8645691724660933e-06,
"loss": 0.4426,
"num_input_tokens_seen": 12335360,
"step": 3920
},
{
"epoch": 0.2512643236668587,
"grad_norm": 25.529563903808594,
"learning_rate": 1.8640071398607774e-06,
"loss": 0.4718,
"num_input_tokens_seen": 12351488,
"step": 3925
},
{
"epoch": 0.2515844056078356,
"grad_norm": 64.41213989257812,
"learning_rate": 1.8634440285324024e-06,
"loss": 0.6284,
"num_input_tokens_seen": 12365952,
"step": 3930
},
{
"epoch": 0.2519044875488125,
"grad_norm": 53.673301696777344,
"learning_rate": 1.8628798391840205e-06,
"loss": 0.4716,
"num_input_tokens_seen": 12381376,
"step": 3935
},
{
"epoch": 0.2522245694897894,
"grad_norm": 60.03094482421875,
"learning_rate": 1.8623145725200277e-06,
"loss": 0.4596,
"num_input_tokens_seen": 12396160,
"step": 3940
},
{
"epoch": 0.25254465143076626,
"grad_norm": 29.19440460205078,
"learning_rate": 1.8617482292461664e-06,
"loss": 0.4591,
"num_input_tokens_seen": 12410944,
"step": 3945
},
{
"epoch": 0.25286473337174314,
"grad_norm": 28.456212997436523,
"learning_rate": 1.861180810069523e-06,
"loss": 0.4216,
"num_input_tokens_seen": 12426304,
"step": 3950
},
{
"epoch": 0.2531848153127201,
"grad_norm": 44.63097381591797,
"learning_rate": 1.8606123156985268e-06,
"loss": 0.4785,
"num_input_tokens_seen": 12442432,
"step": 3955
},
{
"epoch": 0.25350489725369696,
"grad_norm": 21.59270477294922,
"learning_rate": 1.8600427468429496e-06,
"loss": 0.4666,
"num_input_tokens_seen": 12458368,
"step": 3960
},
{
"epoch": 0.25382497919467384,
"grad_norm": 32.624305725097656,
"learning_rate": 1.8594721042139052e-06,
"loss": 0.433,
"num_input_tokens_seen": 12474368,
"step": 3965
},
{
"epoch": 0.2541450611356507,
"grad_norm": 19.357770919799805,
"learning_rate": 1.858900388523847e-06,
"loss": 0.423,
"num_input_tokens_seen": 12490176,
"step": 3970
},
{
"epoch": 0.2544651430766276,
"grad_norm": 27.158327102661133,
"learning_rate": 1.8583276004865694e-06,
"loss": 0.4523,
"num_input_tokens_seen": 12507840,
"step": 3975
},
{
"epoch": 0.25478522501760453,
"grad_norm": 35.72364044189453,
"learning_rate": 1.8577537408172046e-06,
"loss": 0.352,
"num_input_tokens_seen": 12523520,
"step": 3980
},
{
"epoch": 0.2551053069585814,
"grad_norm": 32.988182067871094,
"learning_rate": 1.8571788102322234e-06,
"loss": 0.5448,
"num_input_tokens_seen": 12540736,
"step": 3985
},
{
"epoch": 0.2554253888995583,
"grad_norm": 35.66259002685547,
"learning_rate": 1.8566028094494332e-06,
"loss": 0.4698,
"num_input_tokens_seen": 12556352,
"step": 3990
},
{
"epoch": 0.25574547084053517,
"grad_norm": 21.36469268798828,
"learning_rate": 1.8560257391879778e-06,
"loss": 0.3732,
"num_input_tokens_seen": 12570688,
"step": 3995
},
{
"epoch": 0.25606555278151205,
"grad_norm": 16.891489028930664,
"learning_rate": 1.855447600168336e-06,
"loss": 0.4079,
"num_input_tokens_seen": 12585984,
"step": 4000
},
{
"epoch": 0.25638563472248893,
"grad_norm": 19.356708526611328,
"learning_rate": 1.8548683931123215e-06,
"loss": 0.4732,
"num_input_tokens_seen": 12601216,
"step": 4005
},
{
"epoch": 0.25670571666346587,
"grad_norm": 66.04039764404297,
"learning_rate": 1.8542881187430807e-06,
"loss": 0.4471,
"num_input_tokens_seen": 12618624,
"step": 4010
},
{
"epoch": 0.25702579860444275,
"grad_norm": 27.081804275512695,
"learning_rate": 1.8537067777850935e-06,
"loss": 0.5899,
"num_input_tokens_seen": 12635840,
"step": 4015
},
{
"epoch": 0.2573458805454196,
"grad_norm": 22.54844093322754,
"learning_rate": 1.8531243709641704e-06,
"loss": 0.359,
"num_input_tokens_seen": 12651904,
"step": 4020
},
{
"epoch": 0.2576659624863965,
"grad_norm": 37.674034118652344,
"learning_rate": 1.8525408990074533e-06,
"loss": 0.4977,
"num_input_tokens_seen": 12666944,
"step": 4025
},
{
"epoch": 0.2579860444273734,
"grad_norm": 23.49472999572754,
"learning_rate": 1.851956362643414e-06,
"loss": 0.4184,
"num_input_tokens_seen": 12682688,
"step": 4030
},
{
"epoch": 0.2583061263683503,
"grad_norm": 44.18896484375,
"learning_rate": 1.851370762601853e-06,
"loss": 0.5578,
"num_input_tokens_seen": 12698304,
"step": 4035
},
{
"epoch": 0.2586262083093272,
"grad_norm": 42.4050178527832,
"learning_rate": 1.8507840996138983e-06,
"loss": 0.5083,
"num_input_tokens_seen": 12712896,
"step": 4040
},
{
"epoch": 0.2589462902503041,
"grad_norm": 63.08219909667969,
"learning_rate": 1.8501963744120062e-06,
"loss": 0.3908,
"num_input_tokens_seen": 12727488,
"step": 4045
},
{
"epoch": 0.25926637219128096,
"grad_norm": 35.79430389404297,
"learning_rate": 1.849607587729958e-06,
"loss": 0.408,
"num_input_tokens_seen": 12742720,
"step": 4050
},
{
"epoch": 0.25958645413225784,
"grad_norm": 28.8338623046875,
"learning_rate": 1.8490177403028615e-06,
"loss": 0.3966,
"num_input_tokens_seen": 12757760,
"step": 4055
},
{
"epoch": 0.2599065360732348,
"grad_norm": 44.3582878112793,
"learning_rate": 1.8484268328671475e-06,
"loss": 0.4966,
"num_input_tokens_seen": 12773312,
"step": 4060
},
{
"epoch": 0.26022661801421165,
"grad_norm": 41.44272994995117,
"learning_rate": 1.847834866160571e-06,
"loss": 0.5448,
"num_input_tokens_seen": 12790336,
"step": 4065
},
{
"epoch": 0.26054669995518853,
"grad_norm": 26.90788459777832,
"learning_rate": 1.847241840922209e-06,
"loss": 0.4919,
"num_input_tokens_seen": 12805632,
"step": 4070
},
{
"epoch": 0.2608667818961654,
"grad_norm": 36.868736267089844,
"learning_rate": 1.8466477578924616e-06,
"loss": 0.4875,
"num_input_tokens_seen": 12821184,
"step": 4075
},
{
"epoch": 0.2611868638371423,
"grad_norm": 32.42481994628906,
"learning_rate": 1.8460526178130472e-06,
"loss": 0.5004,
"num_input_tokens_seen": 12836544,
"step": 4080
},
{
"epoch": 0.26150694577811917,
"grad_norm": 38.77480697631836,
"learning_rate": 1.8454564214270056e-06,
"loss": 0.436,
"num_input_tokens_seen": 12852032,
"step": 4085
},
{
"epoch": 0.2618270277190961,
"grad_norm": 47.80220031738281,
"learning_rate": 1.8448591694786955e-06,
"loss": 0.4469,
"num_input_tokens_seen": 12867456,
"step": 4090
},
{
"epoch": 0.262147109660073,
"grad_norm": 34.1256103515625,
"learning_rate": 1.8442608627137925e-06,
"loss": 0.341,
"num_input_tokens_seen": 12885184,
"step": 4095
},
{
"epoch": 0.26246719160104987,
"grad_norm": 31.046709060668945,
"learning_rate": 1.8436615018792897e-06,
"loss": 0.3896,
"num_input_tokens_seen": 12900416,
"step": 4100
},
{
"epoch": 0.26278727354202674,
"grad_norm": 38.95481872558594,
"learning_rate": 1.8430610877234957e-06,
"loss": 0.5792,
"num_input_tokens_seen": 12915648,
"step": 4105
},
{
"epoch": 0.2631073554830036,
"grad_norm": 15.715054512023926,
"learning_rate": 1.8424596209960356e-06,
"loss": 0.4624,
"num_input_tokens_seen": 12930368,
"step": 4110
},
{
"epoch": 0.26342743742398056,
"grad_norm": 25.71843910217285,
"learning_rate": 1.8418571024478466e-06,
"loss": 0.5265,
"num_input_tokens_seen": 12945472,
"step": 4115
},
{
"epoch": 0.26374751936495744,
"grad_norm": 23.532575607299805,
"learning_rate": 1.8412535328311812e-06,
"loss": 0.491,
"num_input_tokens_seen": 12961472,
"step": 4120
},
{
"epoch": 0.2640676013059343,
"grad_norm": 50.43345260620117,
"learning_rate": 1.8406489128996023e-06,
"loss": 0.5816,
"num_input_tokens_seen": 12975872,
"step": 4125
},
{
"epoch": 0.2643876832469112,
"grad_norm": 34.13943862915039,
"learning_rate": 1.8400432434079853e-06,
"loss": 0.529,
"num_input_tokens_seen": 12992128,
"step": 4130
},
{
"epoch": 0.2647077651878881,
"grad_norm": 18.359914779663086,
"learning_rate": 1.8394365251125162e-06,
"loss": 0.4095,
"num_input_tokens_seen": 13021184,
"step": 4135
},
{
"epoch": 0.265027847128865,
"grad_norm": 33.158809661865234,
"learning_rate": 1.8388287587706888e-06,
"loss": 0.4425,
"num_input_tokens_seen": 13037568,
"step": 4140
},
{
"epoch": 0.2653479290698419,
"grad_norm": 35.4780387878418,
"learning_rate": 1.8382199451413074e-06,
"loss": 0.4682,
"num_input_tokens_seen": 13053440,
"step": 4145
},
{
"epoch": 0.26566801101081877,
"grad_norm": 38.03645706176758,
"learning_rate": 1.837610084984483e-06,
"loss": 0.5178,
"num_input_tokens_seen": 13069440,
"step": 4150
},
{
"epoch": 0.26598809295179565,
"grad_norm": 55.84706115722656,
"learning_rate": 1.8369991790616327e-06,
"loss": 0.5487,
"num_input_tokens_seen": 13084224,
"step": 4155
},
{
"epoch": 0.26630817489277253,
"grad_norm": 39.98716735839844,
"learning_rate": 1.8363872281354795e-06,
"loss": 0.6725,
"num_input_tokens_seen": 13098688,
"step": 4160
},
{
"epoch": 0.26662825683374947,
"grad_norm": 33.307315826416016,
"learning_rate": 1.835774232970052e-06,
"loss": 0.4162,
"num_input_tokens_seen": 13114112,
"step": 4165
},
{
"epoch": 0.26694833877472635,
"grad_norm": 31.13365936279297,
"learning_rate": 1.8351601943306815e-06,
"loss": 0.454,
"num_input_tokens_seen": 13130240,
"step": 4170
},
{
"epoch": 0.2672684207157032,
"grad_norm": 41.67189407348633,
"learning_rate": 1.8345451129840025e-06,
"loss": 0.3972,
"num_input_tokens_seen": 13145536,
"step": 4175
},
{
"epoch": 0.2675885026566801,
"grad_norm": 38.381675720214844,
"learning_rate": 1.8339289896979515e-06,
"loss": 0.5506,
"num_input_tokens_seen": 13160256,
"step": 4180
},
{
"epoch": 0.267908584597657,
"grad_norm": 39.27140426635742,
"learning_rate": 1.8333118252417651e-06,
"loss": 0.5525,
"num_input_tokens_seen": 13177088,
"step": 4185
},
{
"epoch": 0.26822866653863386,
"grad_norm": 34.699344635009766,
"learning_rate": 1.832693620385981e-06,
"loss": 0.5154,
"num_input_tokens_seen": 13192768,
"step": 4190
},
{
"epoch": 0.2685487484796108,
"grad_norm": 27.59552764892578,
"learning_rate": 1.8320743759024352e-06,
"loss": 0.5089,
"num_input_tokens_seen": 13208192,
"step": 4195
},
{
"epoch": 0.2688688304205877,
"grad_norm": 42.20448303222656,
"learning_rate": 1.831454092564261e-06,
"loss": 0.5486,
"num_input_tokens_seen": 13223872,
"step": 4200
},
{
"epoch": 0.26918891236156456,
"grad_norm": 21.2120361328125,
"learning_rate": 1.8308327711458899e-06,
"loss": 0.4597,
"num_input_tokens_seen": 13239104,
"step": 4205
},
{
"epoch": 0.26950899430254144,
"grad_norm": 40.029544830322266,
"learning_rate": 1.830210412423049e-06,
"loss": 0.3925,
"num_input_tokens_seen": 13254464,
"step": 4210
},
{
"epoch": 0.2698290762435183,
"grad_norm": 23.788978576660156,
"learning_rate": 1.8295870171727605e-06,
"loss": 0.3617,
"num_input_tokens_seen": 13269824,
"step": 4215
},
{
"epoch": 0.27014915818449525,
"grad_norm": 22.333120346069336,
"learning_rate": 1.8289625861733408e-06,
"loss": 0.4149,
"num_input_tokens_seen": 13288448,
"step": 4220
},
{
"epoch": 0.27046924012547213,
"grad_norm": 35.422637939453125,
"learning_rate": 1.8283371202043991e-06,
"loss": 0.5178,
"num_input_tokens_seen": 13304320,
"step": 4225
},
{
"epoch": 0.270789322066449,
"grad_norm": 40.13027572631836,
"learning_rate": 1.827710620046837e-06,
"loss": 0.5533,
"num_input_tokens_seen": 13321920,
"step": 4230
},
{
"epoch": 0.2711094040074259,
"grad_norm": 55.25458526611328,
"learning_rate": 1.8270830864828474e-06,
"loss": 0.4571,
"num_input_tokens_seen": 13337280,
"step": 4235
},
{
"epoch": 0.27142948594840277,
"grad_norm": 16.795021057128906,
"learning_rate": 1.8264545202959133e-06,
"loss": 0.434,
"num_input_tokens_seen": 13354112,
"step": 4240
},
{
"epoch": 0.2717495678893797,
"grad_norm": 37.604312896728516,
"learning_rate": 1.8258249222708067e-06,
"loss": 0.4362,
"num_input_tokens_seen": 13369600,
"step": 4245
},
{
"epoch": 0.2720696498303566,
"grad_norm": 24.84840202331543,
"learning_rate": 1.8251942931935886e-06,
"loss": 0.4558,
"num_input_tokens_seen": 13385536,
"step": 4250
},
{
"epoch": 0.27238973177133347,
"grad_norm": 33.14160919189453,
"learning_rate": 1.8245626338516069e-06,
"loss": 0.3748,
"num_input_tokens_seen": 13400832,
"step": 4255
},
{
"epoch": 0.27270981371231034,
"grad_norm": 35.8721923828125,
"learning_rate": 1.823929945033495e-06,
"loss": 0.338,
"num_input_tokens_seen": 13416000,
"step": 4260
},
{
"epoch": 0.2730298956532872,
"grad_norm": 32.823890686035156,
"learning_rate": 1.8232962275291728e-06,
"loss": 0.5038,
"num_input_tokens_seen": 13431360,
"step": 4265
},
{
"epoch": 0.2733499775942641,
"grad_norm": 50.578182220458984,
"learning_rate": 1.822661482129844e-06,
"loss": 0.4415,
"num_input_tokens_seen": 13446976,
"step": 4270
},
{
"epoch": 0.27367005953524104,
"grad_norm": 22.204975128173828,
"learning_rate": 1.8220257096279956e-06,
"loss": 0.3688,
"num_input_tokens_seen": 13463040,
"step": 4275
},
{
"epoch": 0.2739901414762179,
"grad_norm": 36.239845275878906,
"learning_rate": 1.8213889108173972e-06,
"loss": 0.6843,
"num_input_tokens_seen": 13478656,
"step": 4280
},
{
"epoch": 0.2743102234171948,
"grad_norm": 24.040267944335938,
"learning_rate": 1.8207510864930992e-06,
"loss": 0.4995,
"num_input_tokens_seen": 13495296,
"step": 4285
},
{
"epoch": 0.2746303053581717,
"grad_norm": 20.18925666809082,
"learning_rate": 1.8201122374514336e-06,
"loss": 0.5081,
"num_input_tokens_seen": 13510912,
"step": 4290
},
{
"epoch": 0.27495038729914856,
"grad_norm": 23.800817489624023,
"learning_rate": 1.8194723644900099e-06,
"loss": 0.4362,
"num_input_tokens_seen": 13525952,
"step": 4295
},
{
"epoch": 0.2752704692401255,
"grad_norm": 26.63628387451172,
"learning_rate": 1.8188314684077173e-06,
"loss": 0.5305,
"num_input_tokens_seen": 13546752,
"step": 4300
},
{
"epoch": 0.2755905511811024,
"grad_norm": 40.90980911254883,
"learning_rate": 1.8181895500047226e-06,
"loss": 0.5643,
"num_input_tokens_seen": 13561728,
"step": 4305
},
{
"epoch": 0.27591063312207925,
"grad_norm": 22.858909606933594,
"learning_rate": 1.817546610082468e-06,
"loss": 0.4562,
"num_input_tokens_seen": 13577344,
"step": 4310
},
{
"epoch": 0.27623071506305613,
"grad_norm": 28.205032348632812,
"learning_rate": 1.816902649443672e-06,
"loss": 0.4862,
"num_input_tokens_seen": 13592256,
"step": 4315
},
{
"epoch": 0.276550797004033,
"grad_norm": 37.81781005859375,
"learning_rate": 1.8162576688923262e-06,
"loss": 0.5403,
"num_input_tokens_seen": 13608832,
"step": 4320
},
{
"epoch": 0.27687087894500995,
"grad_norm": 27.493858337402344,
"learning_rate": 1.815611669233697e-06,
"loss": 0.5508,
"num_input_tokens_seen": 13624128,
"step": 4325
},
{
"epoch": 0.2771909608859868,
"grad_norm": 26.893049240112305,
"learning_rate": 1.8149646512743222e-06,
"loss": 0.5369,
"num_input_tokens_seen": 13640576,
"step": 4330
},
{
"epoch": 0.2775110428269637,
"grad_norm": 24.463943481445312,
"learning_rate": 1.8143166158220118e-06,
"loss": 0.4653,
"num_input_tokens_seen": 13655872,
"step": 4335
},
{
"epoch": 0.2778311247679406,
"grad_norm": 44.26751708984375,
"learning_rate": 1.8136675636858454e-06,
"loss": 0.6598,
"num_input_tokens_seen": 13672384,
"step": 4340
},
{
"epoch": 0.27815120670891746,
"grad_norm": 21.628820419311523,
"learning_rate": 1.8130174956761723e-06,
"loss": 0.408,
"num_input_tokens_seen": 13687296,
"step": 4345
},
{
"epoch": 0.2784712886498944,
"grad_norm": 26.918249130249023,
"learning_rate": 1.81236641260461e-06,
"loss": 0.5366,
"num_input_tokens_seen": 13702528,
"step": 4350
},
{
"epoch": 0.2787913705908713,
"grad_norm": 68.7051773071289,
"learning_rate": 1.811714315284043e-06,
"loss": 0.498,
"num_input_tokens_seen": 13717568,
"step": 4355
},
{
"epoch": 0.27911145253184816,
"grad_norm": 21.975799560546875,
"learning_rate": 1.8110612045286229e-06,
"loss": 0.4252,
"num_input_tokens_seen": 13733568,
"step": 4360
},
{
"epoch": 0.27943153447282504,
"grad_norm": 22.67809295654297,
"learning_rate": 1.8104070811537661e-06,
"loss": 0.3778,
"num_input_tokens_seen": 13749312,
"step": 4365
},
{
"epoch": 0.2797516164138019,
"grad_norm": 18.252212524414062,
"learning_rate": 1.8097519459761533e-06,
"loss": 0.4405,
"num_input_tokens_seen": 13765952,
"step": 4370
},
{
"epoch": 0.2800716983547788,
"grad_norm": 49.828643798828125,
"learning_rate": 1.8090957998137283e-06,
"loss": 0.5056,
"num_input_tokens_seen": 13781440,
"step": 4375
},
{
"epoch": 0.28039178029575573,
"grad_norm": 49.623783111572266,
"learning_rate": 1.8084386434856978e-06,
"loss": 0.4528,
"num_input_tokens_seen": 13796864,
"step": 4380
},
{
"epoch": 0.2807118622367326,
"grad_norm": 27.9454402923584,
"learning_rate": 1.8077804778125283e-06,
"loss": 0.4966,
"num_input_tokens_seen": 13812736,
"step": 4385
},
{
"epoch": 0.2810319441777095,
"grad_norm": 57.37892150878906,
"learning_rate": 1.807121303615948e-06,
"loss": 0.489,
"num_input_tokens_seen": 13828288,
"step": 4390
},
{
"epoch": 0.28135202611868637,
"grad_norm": 46.05356216430664,
"learning_rate": 1.8064611217189434e-06,
"loss": 0.4168,
"num_input_tokens_seen": 13845568,
"step": 4395
},
{
"epoch": 0.28167210805966325,
"grad_norm": 23.337419509887695,
"learning_rate": 1.8057999329457596e-06,
"loss": 0.3902,
"num_input_tokens_seen": 13860608,
"step": 4400
},
{
"epoch": 0.2819921900006402,
"grad_norm": 44.485595703125,
"learning_rate": 1.8051377381218984e-06,
"loss": 0.5584,
"num_input_tokens_seen": 13876608,
"step": 4405
},
{
"epoch": 0.28231227194161707,
"grad_norm": 37.74899673461914,
"learning_rate": 1.8044745380741177e-06,
"loss": 0.5613,
"num_input_tokens_seen": 13893632,
"step": 4410
},
{
"epoch": 0.28263235388259395,
"grad_norm": 53.780860900878906,
"learning_rate": 1.8038103336304306e-06,
"loss": 0.3872,
"num_input_tokens_seen": 13909312,
"step": 4415
},
{
"epoch": 0.2829524358235708,
"grad_norm": 28.769821166992188,
"learning_rate": 1.8031451256201042e-06,
"loss": 0.5718,
"num_input_tokens_seen": 13925824,
"step": 4420
},
{
"epoch": 0.2832725177645477,
"grad_norm": 25.95047950744629,
"learning_rate": 1.8024789148736589e-06,
"loss": 0.5314,
"num_input_tokens_seen": 13942336,
"step": 4425
},
{
"epoch": 0.28359259970552464,
"grad_norm": 31.877147674560547,
"learning_rate": 1.8018117022228655e-06,
"loss": 0.4017,
"num_input_tokens_seen": 13957760,
"step": 4430
},
{
"epoch": 0.2839126816465015,
"grad_norm": 53.280426025390625,
"learning_rate": 1.8011434885007479e-06,
"loss": 0.5044,
"num_input_tokens_seen": 13972992,
"step": 4435
},
{
"epoch": 0.2842327635874784,
"grad_norm": 34.214813232421875,
"learning_rate": 1.8004742745415787e-06,
"loss": 0.4184,
"num_input_tokens_seen": 13988736,
"step": 4440
},
{
"epoch": 0.2845528455284553,
"grad_norm": 21.93120002746582,
"learning_rate": 1.799804061180879e-06,
"loss": 0.5398,
"num_input_tokens_seen": 14003520,
"step": 4445
},
{
"epoch": 0.28487292746943216,
"grad_norm": 30.483198165893555,
"learning_rate": 1.799132849255418e-06,
"loss": 0.5016,
"num_input_tokens_seen": 14020608,
"step": 4450
},
{
"epoch": 0.28519300941040904,
"grad_norm": 36.024600982666016,
"learning_rate": 1.798460639603212e-06,
"loss": 0.4168,
"num_input_tokens_seen": 14035328,
"step": 4455
},
{
"epoch": 0.285513091351386,
"grad_norm": 38.09769058227539,
"learning_rate": 1.7977874330635224e-06,
"loss": 0.4799,
"num_input_tokens_seen": 14050816,
"step": 4460
},
{
"epoch": 0.28583317329236285,
"grad_norm": 20.480865478515625,
"learning_rate": 1.7971132304768555e-06,
"loss": 0.3319,
"num_input_tokens_seen": 14066880,
"step": 4465
},
{
"epoch": 0.28615325523333973,
"grad_norm": 26.70489501953125,
"learning_rate": 1.7964380326849612e-06,
"loss": 0.5081,
"num_input_tokens_seen": 14081728,
"step": 4470
},
{
"epoch": 0.2864733371743166,
"grad_norm": 20.658885955810547,
"learning_rate": 1.795761840530832e-06,
"loss": 0.4885,
"num_input_tokens_seen": 14097984,
"step": 4475
},
{
"epoch": 0.2867934191152935,
"grad_norm": 27.52956771850586,
"learning_rate": 1.7950846548587015e-06,
"loss": 0.4186,
"num_input_tokens_seen": 14115264,
"step": 4480
},
{
"epoch": 0.2871135010562704,
"grad_norm": 18.73761749267578,
"learning_rate": 1.7944064765140445e-06,
"loss": 0.2815,
"num_input_tokens_seen": 14129472,
"step": 4485
},
{
"epoch": 0.2874335829972473,
"grad_norm": 34.10987091064453,
"learning_rate": 1.7937273063435735e-06,
"loss": 0.5567,
"num_input_tokens_seen": 14144896,
"step": 4490
},
{
"epoch": 0.2877536649382242,
"grad_norm": 28.596620559692383,
"learning_rate": 1.7930471451952416e-06,
"loss": 0.3559,
"num_input_tokens_seen": 14159744,
"step": 4495
},
{
"epoch": 0.28807374687920106,
"grad_norm": 41.09931564331055,
"learning_rate": 1.7923659939182377e-06,
"loss": 0.4935,
"num_input_tokens_seen": 14176384,
"step": 4500
},
{
"epoch": 0.28839382882017794,
"grad_norm": 43.514373779296875,
"learning_rate": 1.7916838533629866e-06,
"loss": 0.5441,
"num_input_tokens_seen": 14192320,
"step": 4505
},
{
"epoch": 0.2887139107611549,
"grad_norm": 23.66765594482422,
"learning_rate": 1.7910007243811493e-06,
"loss": 0.3929,
"num_input_tokens_seen": 14208192,
"step": 4510
},
{
"epoch": 0.28903399270213176,
"grad_norm": 57.50717544555664,
"learning_rate": 1.7903166078256202e-06,
"loss": 0.5737,
"num_input_tokens_seen": 14223104,
"step": 4515
},
{
"epoch": 0.28935407464310864,
"grad_norm": 56.785011291503906,
"learning_rate": 1.789631504550527e-06,
"loss": 0.4223,
"num_input_tokens_seen": 14238464,
"step": 4520
},
{
"epoch": 0.2896741565840855,
"grad_norm": 34.91124725341797,
"learning_rate": 1.7889454154112288e-06,
"loss": 0.3912,
"num_input_tokens_seen": 14254656,
"step": 4525
},
{
"epoch": 0.2899942385250624,
"grad_norm": 42.82390594482422,
"learning_rate": 1.7882583412643167e-06,
"loss": 0.3903,
"num_input_tokens_seen": 14268928,
"step": 4530
},
{
"epoch": 0.29031432046603933,
"grad_norm": 33.92388153076172,
"learning_rate": 1.78757028296761e-06,
"loss": 0.4489,
"num_input_tokens_seen": 14285952,
"step": 4535
},
{
"epoch": 0.2906344024070162,
"grad_norm": 20.465452194213867,
"learning_rate": 1.7868812413801582e-06,
"loss": 0.3513,
"num_input_tokens_seen": 14301760,
"step": 4540
},
{
"epoch": 0.2909544843479931,
"grad_norm": 55.36177444458008,
"learning_rate": 1.7861912173622372e-06,
"loss": 0.4985,
"num_input_tokens_seen": 14318208,
"step": 4545
},
{
"epoch": 0.29127456628896997,
"grad_norm": 42.88619613647461,
"learning_rate": 1.7855002117753504e-06,
"loss": 0.4537,
"num_input_tokens_seen": 14334144,
"step": 4550
},
{
"epoch": 0.29159464822994685,
"grad_norm": 47.642051696777344,
"learning_rate": 1.7848082254822266e-06,
"loss": 0.5489,
"num_input_tokens_seen": 14349120,
"step": 4555
},
{
"epoch": 0.29191473017092373,
"grad_norm": 54.71379852294922,
"learning_rate": 1.7841152593468185e-06,
"loss": 0.4957,
"num_input_tokens_seen": 14365376,
"step": 4560
},
{
"epoch": 0.29223481211190067,
"grad_norm": 34.66753005981445,
"learning_rate": 1.7834213142343026e-06,
"loss": 0.4636,
"num_input_tokens_seen": 14381568,
"step": 4565
},
{
"epoch": 0.29255489405287755,
"grad_norm": 31.637672424316406,
"learning_rate": 1.7827263910110777e-06,
"loss": 0.4752,
"num_input_tokens_seen": 14397312,
"step": 4570
},
{
"epoch": 0.2928749759938544,
"grad_norm": 36.772586822509766,
"learning_rate": 1.7820304905447632e-06,
"loss": 0.4631,
"num_input_tokens_seen": 14412928,
"step": 4575
},
{
"epoch": 0.2931950579348313,
"grad_norm": 58.408050537109375,
"learning_rate": 1.7813336137041991e-06,
"loss": 0.4515,
"num_input_tokens_seen": 14427968,
"step": 4580
},
{
"epoch": 0.2935151398758082,
"grad_norm": 38.953765869140625,
"learning_rate": 1.7806357613594447e-06,
"loss": 0.3591,
"num_input_tokens_seen": 14442944,
"step": 4585
},
{
"epoch": 0.2938352218167851,
"grad_norm": 20.911611557006836,
"learning_rate": 1.7799369343817764e-06,
"loss": 0.452,
"num_input_tokens_seen": 14458176,
"step": 4590
},
{
"epoch": 0.294155303757762,
"grad_norm": 25.856048583984375,
"learning_rate": 1.7792371336436883e-06,
"loss": 0.3618,
"num_input_tokens_seen": 14473600,
"step": 4595
},
{
"epoch": 0.2944753856987389,
"grad_norm": 30.00708770751953,
"learning_rate": 1.7785363600188892e-06,
"loss": 0.6561,
"num_input_tokens_seen": 14488896,
"step": 4600
},
{
"epoch": 0.29479546763971576,
"grad_norm": 38.951820373535156,
"learning_rate": 1.7778346143823038e-06,
"loss": 0.5982,
"num_input_tokens_seen": 14502784,
"step": 4605
},
{
"epoch": 0.29511554958069264,
"grad_norm": 34.70473861694336,
"learning_rate": 1.7771318976100696e-06,
"loss": 0.4353,
"num_input_tokens_seen": 14520000,
"step": 4610
},
{
"epoch": 0.2954356315216696,
"grad_norm": 30.836076736450195,
"learning_rate": 1.7764282105795364e-06,
"loss": 0.3531,
"num_input_tokens_seen": 14536320,
"step": 4615
},
{
"epoch": 0.29575571346264645,
"grad_norm": 45.481624603271484,
"learning_rate": 1.7757235541692663e-06,
"loss": 0.4688,
"num_input_tokens_seen": 14551808,
"step": 4620
},
{
"epoch": 0.29607579540362333,
"grad_norm": 21.652406692504883,
"learning_rate": 1.7750179292590306e-06,
"loss": 0.3106,
"num_input_tokens_seen": 14566976,
"step": 4625
},
{
"epoch": 0.2963958773446002,
"grad_norm": 25.99779510498047,
"learning_rate": 1.7743113367298107e-06,
"loss": 0.3511,
"num_input_tokens_seen": 14583104,
"step": 4630
},
{
"epoch": 0.2967159592855771,
"grad_norm": 39.653045654296875,
"learning_rate": 1.7736037774637955e-06,
"loss": 0.4515,
"num_input_tokens_seen": 14598336,
"step": 4635
},
{
"epoch": 0.29703604122655397,
"grad_norm": 51.65949249267578,
"learning_rate": 1.772895252344381e-06,
"loss": 0.5141,
"num_input_tokens_seen": 14615232,
"step": 4640
},
{
"epoch": 0.2973561231675309,
"grad_norm": 20.959184646606445,
"learning_rate": 1.7721857622561692e-06,
"loss": 0.388,
"num_input_tokens_seen": 14630848,
"step": 4645
},
{
"epoch": 0.2976762051085078,
"grad_norm": 33.919654846191406,
"learning_rate": 1.7714753080849664e-06,
"loss": 0.4668,
"num_input_tokens_seen": 14647040,
"step": 4650
},
{
"epoch": 0.29799628704948466,
"grad_norm": 23.6036319732666,
"learning_rate": 1.7707638907177837e-06,
"loss": 0.4196,
"num_input_tokens_seen": 14661888,
"step": 4655
},
{
"epoch": 0.29831636899046154,
"grad_norm": 153.35108947753906,
"learning_rate": 1.7700515110428336e-06,
"loss": 0.7015,
"num_input_tokens_seen": 14677696,
"step": 4660
},
{
"epoch": 0.2986364509314384,
"grad_norm": 27.929115295410156,
"learning_rate": 1.7693381699495307e-06,
"loss": 0.4795,
"num_input_tokens_seen": 14693184,
"step": 4665
},
{
"epoch": 0.29895653287241536,
"grad_norm": 31.637182235717773,
"learning_rate": 1.7686238683284894e-06,
"loss": 0.3712,
"num_input_tokens_seen": 14707904,
"step": 4670
},
{
"epoch": 0.29927661481339224,
"grad_norm": 26.304777145385742,
"learning_rate": 1.7679086070715237e-06,
"loss": 0.3553,
"num_input_tokens_seen": 14724096,
"step": 4675
},
{
"epoch": 0.2995966967543691,
"grad_norm": 39.02534484863281,
"learning_rate": 1.7671923870716459e-06,
"loss": 0.4575,
"num_input_tokens_seen": 14738752,
"step": 4680
},
{
"epoch": 0.299916778695346,
"grad_norm": 42.02716827392578,
"learning_rate": 1.7664752092230652e-06,
"loss": 0.355,
"num_input_tokens_seen": 14753664,
"step": 4685
},
{
"epoch": 0.3002368606363229,
"grad_norm": 33.584014892578125,
"learning_rate": 1.7657570744211863e-06,
"loss": 0.3708,
"num_input_tokens_seen": 14769152,
"step": 4690
},
{
"epoch": 0.30036489341271366,
"eval_loss": 0.46517089009284973,
"eval_runtime": 50.6285,
"eval_samples_per_second": 274.272,
"eval_steps_per_second": 34.289,
"num_input_tokens_seen": 14775488,
"step": 4692
},
{
"epoch": 0.3005569425772998,
"grad_norm": 46.08897399902344,
"learning_rate": 1.765037983562609e-06,
"loss": 0.5088,
"num_input_tokens_seen": 14784128,
"step": 4695
},
{
"epoch": 0.3008770245182767,
"grad_norm": 46.2584228515625,
"learning_rate": 1.7643179375451264e-06,
"loss": 0.4325,
"num_input_tokens_seen": 14799936,
"step": 4700
},
{
"epoch": 0.30119710645925357,
"grad_norm": 42.52885437011719,
"learning_rate": 1.7635969372677252e-06,
"loss": 0.6141,
"num_input_tokens_seen": 14814208,
"step": 4705
},
{
"epoch": 0.30151718840023045,
"grad_norm": 40.1710205078125,
"learning_rate": 1.7628749836305818e-06,
"loss": 0.4862,
"num_input_tokens_seen": 14829504,
"step": 4710
},
{
"epoch": 0.30183727034120733,
"grad_norm": 31.896709442138672,
"learning_rate": 1.7621520775350645e-06,
"loss": 0.4053,
"num_input_tokens_seen": 14843968,
"step": 4715
},
{
"epoch": 0.30215735228218427,
"grad_norm": 34.46201705932617,
"learning_rate": 1.7614282198837293e-06,
"loss": 0.4685,
"num_input_tokens_seen": 14859840,
"step": 4720
},
{
"epoch": 0.30247743422316115,
"grad_norm": 44.51201248168945,
"learning_rate": 1.7607034115803219e-06,
"loss": 0.4873,
"num_input_tokens_seen": 14875648,
"step": 4725
},
{
"epoch": 0.302797516164138,
"grad_norm": 29.218721389770508,
"learning_rate": 1.7599776535297734e-06,
"loss": 0.4244,
"num_input_tokens_seen": 14890560,
"step": 4730
},
{
"epoch": 0.3031175981051149,
"grad_norm": 42.597896575927734,
"learning_rate": 1.7592509466382012e-06,
"loss": 0.478,
"num_input_tokens_seen": 14906688,
"step": 4735
},
{
"epoch": 0.3034376800460918,
"grad_norm": 57.83599853515625,
"learning_rate": 1.7585232918129076e-06,
"loss": 0.5622,
"num_input_tokens_seen": 14922496,
"step": 4740
},
{
"epoch": 0.30375776198706866,
"grad_norm": 38.69477081298828,
"learning_rate": 1.757794689962378e-06,
"loss": 0.4656,
"num_input_tokens_seen": 14938880,
"step": 4745
},
{
"epoch": 0.3040778439280456,
"grad_norm": 46.978797912597656,
"learning_rate": 1.7570651419962807e-06,
"loss": 0.5035,
"num_input_tokens_seen": 14954112,
"step": 4750
},
{
"epoch": 0.3043979258690225,
"grad_norm": 44.47570037841797,
"learning_rate": 1.7563346488254647e-06,
"loss": 0.4471,
"num_input_tokens_seen": 14969536,
"step": 4755
},
{
"epoch": 0.30471800780999936,
"grad_norm": 35.79732894897461,
"learning_rate": 1.755603211361959e-06,
"loss": 0.351,
"num_input_tokens_seen": 14985728,
"step": 4760
},
{
"epoch": 0.30503808975097624,
"grad_norm": 21.590835571289062,
"learning_rate": 1.7548708305189722e-06,
"loss": 0.4522,
"num_input_tokens_seen": 15003904,
"step": 4765
},
{
"epoch": 0.3053581716919531,
"grad_norm": 63.41920852661133,
"learning_rate": 1.7541375072108905e-06,
"loss": 0.5752,
"num_input_tokens_seen": 15019328,
"step": 4770
},
{
"epoch": 0.30567825363293005,
"grad_norm": 48.414974212646484,
"learning_rate": 1.7534032423532766e-06,
"loss": 0.4732,
"num_input_tokens_seen": 15033856,
"step": 4775
},
{
"epoch": 0.30599833557390693,
"grad_norm": 23.076284408569336,
"learning_rate": 1.7526680368628685e-06,
"loss": 0.361,
"num_input_tokens_seen": 15051200,
"step": 4780
},
{
"epoch": 0.3063184175148838,
"grad_norm": 33.26884841918945,
"learning_rate": 1.751931891657579e-06,
"loss": 0.4427,
"num_input_tokens_seen": 15066368,
"step": 4785
},
{
"epoch": 0.3066384994558607,
"grad_norm": 21.855182647705078,
"learning_rate": 1.7511948076564943e-06,
"loss": 0.3568,
"num_input_tokens_seen": 15081600,
"step": 4790
},
{
"epoch": 0.30695858139683757,
"grad_norm": 33.14620590209961,
"learning_rate": 1.7504567857798722e-06,
"loss": 0.5404,
"num_input_tokens_seen": 15097536,
"step": 4795
},
{
"epoch": 0.3072786633378145,
"grad_norm": 37.192012786865234,
"learning_rate": 1.7497178269491417e-06,
"loss": 0.4943,
"num_input_tokens_seen": 15113728,
"step": 4800
},
{
"epoch": 0.3075987452787914,
"grad_norm": 23.352327346801758,
"learning_rate": 1.7489779320869014e-06,
"loss": 0.5532,
"num_input_tokens_seen": 15130048,
"step": 4805
},
{
"epoch": 0.30791882721976827,
"grad_norm": 24.619413375854492,
"learning_rate": 1.7482371021169193e-06,
"loss": 0.3715,
"num_input_tokens_seen": 15145600,
"step": 4810
},
{
"epoch": 0.30823890916074514,
"grad_norm": 45.18055725097656,
"learning_rate": 1.7474953379641297e-06,
"loss": 0.4077,
"num_input_tokens_seen": 15162368,
"step": 4815
},
{
"epoch": 0.308558991101722,
"grad_norm": 35.786495208740234,
"learning_rate": 1.746752640554634e-06,
"loss": 0.438,
"num_input_tokens_seen": 15178368,
"step": 4820
},
{
"epoch": 0.3088790730426989,
"grad_norm": 25.801467895507812,
"learning_rate": 1.7460090108156988e-06,
"loss": 0.5348,
"num_input_tokens_seen": 15193408,
"step": 4825
},
{
"epoch": 0.30919915498367584,
"grad_norm": 25.487167358398438,
"learning_rate": 1.7452644496757548e-06,
"loss": 0.3155,
"num_input_tokens_seen": 15208640,
"step": 4830
},
{
"epoch": 0.3095192369246527,
"grad_norm": 47.79193878173828,
"learning_rate": 1.7445189580643946e-06,
"loss": 0.4557,
"num_input_tokens_seen": 15224192,
"step": 4835
},
{
"epoch": 0.3098393188656296,
"grad_norm": 29.753835678100586,
"learning_rate": 1.7437725369123737e-06,
"loss": 0.5187,
"num_input_tokens_seen": 15239616,
"step": 4840
},
{
"epoch": 0.3101594008066065,
"grad_norm": 33.8818359375,
"learning_rate": 1.7430251871516077e-06,
"loss": 0.4925,
"num_input_tokens_seen": 15255680,
"step": 4845
},
{
"epoch": 0.31047948274758336,
"grad_norm": 29.06200408935547,
"learning_rate": 1.7422769097151715e-06,
"loss": 0.5256,
"num_input_tokens_seen": 15271232,
"step": 4850
},
{
"epoch": 0.3107995646885603,
"grad_norm": 67.46397399902344,
"learning_rate": 1.7415277055372982e-06,
"loss": 0.5038,
"num_input_tokens_seen": 15287040,
"step": 4855
},
{
"epoch": 0.31111964662953717,
"grad_norm": 26.468515396118164,
"learning_rate": 1.7407775755533778e-06,
"loss": 0.5181,
"num_input_tokens_seen": 15304256,
"step": 4860
},
{
"epoch": 0.31143972857051405,
"grad_norm": 19.30422019958496,
"learning_rate": 1.7400265206999568e-06,
"loss": 0.364,
"num_input_tokens_seen": 15322112,
"step": 4865
},
{
"epoch": 0.31175981051149093,
"grad_norm": 68.16838836669922,
"learning_rate": 1.7392745419147362e-06,
"loss": 0.5297,
"num_input_tokens_seen": 15337216,
"step": 4870
},
{
"epoch": 0.3120798924524678,
"grad_norm": 37.93073654174805,
"learning_rate": 1.7385216401365693e-06,
"loss": 0.4478,
"num_input_tokens_seen": 15354048,
"step": 4875
},
{
"epoch": 0.31239997439344475,
"grad_norm": 30.45296287536621,
"learning_rate": 1.7377678163054638e-06,
"loss": 0.4964,
"num_input_tokens_seen": 15369344,
"step": 4880
},
{
"epoch": 0.3127200563344216,
"grad_norm": 47.27909469604492,
"learning_rate": 1.7370130713625775e-06,
"loss": 0.4864,
"num_input_tokens_seen": 15385920,
"step": 4885
},
{
"epoch": 0.3130401382753985,
"grad_norm": 24.398977279663086,
"learning_rate": 1.736257406250218e-06,
"loss": 0.3948,
"num_input_tokens_seen": 15401536,
"step": 4890
},
{
"epoch": 0.3133602202163754,
"grad_norm": 30.26610565185547,
"learning_rate": 1.735500821911842e-06,
"loss": 0.4629,
"num_input_tokens_seen": 15417152,
"step": 4895
},
{
"epoch": 0.31368030215735226,
"grad_norm": 31.936508178710938,
"learning_rate": 1.7347433192920544e-06,
"loss": 0.4961,
"num_input_tokens_seen": 15431872,
"step": 4900
},
{
"epoch": 0.3140003840983292,
"grad_norm": 20.389596939086914,
"learning_rate": 1.7339848993366056e-06,
"loss": 0.4021,
"num_input_tokens_seen": 15447552,
"step": 4905
},
{
"epoch": 0.3143204660393061,
"grad_norm": 32.99045181274414,
"learning_rate": 1.7332255629923922e-06,
"loss": 0.4667,
"num_input_tokens_seen": 15464384,
"step": 4910
},
{
"epoch": 0.31464054798028296,
"grad_norm": 24.761920928955078,
"learning_rate": 1.732465311207454e-06,
"loss": 0.5038,
"num_input_tokens_seen": 15479808,
"step": 4915
},
{
"epoch": 0.31496062992125984,
"grad_norm": 45.01390838623047,
"learning_rate": 1.731704144930975e-06,
"loss": 0.5018,
"num_input_tokens_seen": 15496512,
"step": 4920
},
{
"epoch": 0.3152807118622367,
"grad_norm": 38.63529968261719,
"learning_rate": 1.7309420651132797e-06,
"loss": 0.4137,
"num_input_tokens_seen": 15512896,
"step": 4925
},
{
"epoch": 0.3156007938032136,
"grad_norm": 34.324134826660156,
"learning_rate": 1.7301790727058343e-06,
"loss": 0.3295,
"num_input_tokens_seen": 15528064,
"step": 4930
},
{
"epoch": 0.31592087574419053,
"grad_norm": 34.598453521728516,
"learning_rate": 1.7294151686612431e-06,
"loss": 0.3593,
"num_input_tokens_seen": 15543424,
"step": 4935
},
{
"epoch": 0.3162409576851674,
"grad_norm": 44.17629623413086,
"learning_rate": 1.7286503539332495e-06,
"loss": 0.5778,
"num_input_tokens_seen": 15560192,
"step": 4940
},
{
"epoch": 0.3165610396261443,
"grad_norm": 43.37092208862305,
"learning_rate": 1.7278846294767337e-06,
"loss": 0.3873,
"num_input_tokens_seen": 15576128,
"step": 4945
},
{
"epoch": 0.31688112156712117,
"grad_norm": 79.67957305908203,
"learning_rate": 1.7271179962477118e-06,
"loss": 0.6923,
"num_input_tokens_seen": 15592576,
"step": 4950
},
{
"epoch": 0.31720120350809805,
"grad_norm": 51.76191711425781,
"learning_rate": 1.7263504552033341e-06,
"loss": 0.4372,
"num_input_tokens_seen": 15607744,
"step": 4955
},
{
"epoch": 0.317521285449075,
"grad_norm": 24.416505813598633,
"learning_rate": 1.725582007301885e-06,
"loss": 0.481,
"num_input_tokens_seen": 15623360,
"step": 4960
},
{
"epoch": 0.31784136739005187,
"grad_norm": 37.16778564453125,
"learning_rate": 1.7248126535027806e-06,
"loss": 0.4251,
"num_input_tokens_seen": 15638656,
"step": 4965
},
{
"epoch": 0.31816144933102875,
"grad_norm": 40.46333694458008,
"learning_rate": 1.7240423947665678e-06,
"loss": 0.4569,
"num_input_tokens_seen": 15654400,
"step": 4970
},
{
"epoch": 0.3184815312720056,
"grad_norm": 24.423137664794922,
"learning_rate": 1.723271232054924e-06,
"loss": 0.3867,
"num_input_tokens_seen": 15670016,
"step": 4975
},
{
"epoch": 0.3188016132129825,
"grad_norm": 54.15736389160156,
"learning_rate": 1.722499166330655e-06,
"loss": 0.5265,
"num_input_tokens_seen": 15686208,
"step": 4980
},
{
"epoch": 0.31912169515395944,
"grad_norm": 22.534137725830078,
"learning_rate": 1.7217261985576936e-06,
"loss": 0.443,
"num_input_tokens_seen": 15702592,
"step": 4985
},
{
"epoch": 0.3194417770949363,
"grad_norm": 72.37775421142578,
"learning_rate": 1.7209523297010992e-06,
"loss": 0.5114,
"num_input_tokens_seen": 15717696,
"step": 4990
},
{
"epoch": 0.3197618590359132,
"grad_norm": 37.13178634643555,
"learning_rate": 1.7201775607270564e-06,
"loss": 0.4619,
"num_input_tokens_seen": 15733184,
"step": 4995
},
{
"epoch": 0.3200819409768901,
"grad_norm": 32.471622467041016,
"learning_rate": 1.7194018926028733e-06,
"loss": 0.5318,
"num_input_tokens_seen": 15749888,
"step": 5000
},
{
"epoch": 0.32040202291786696,
"grad_norm": 36.298892974853516,
"learning_rate": 1.7186253262969803e-06,
"loss": 0.3622,
"num_input_tokens_seen": 15768384,
"step": 5005
},
{
"epoch": 0.32072210485884384,
"grad_norm": 28.648771286010742,
"learning_rate": 1.7178478627789299e-06,
"loss": 0.3291,
"num_input_tokens_seen": 15784448,
"step": 5010
},
{
"epoch": 0.3210421867998208,
"grad_norm": 24.88985824584961,
"learning_rate": 1.7170695030193944e-06,
"loss": 0.4122,
"num_input_tokens_seen": 15800512,
"step": 5015
},
{
"epoch": 0.32136226874079765,
"grad_norm": 33.581695556640625,
"learning_rate": 1.716290247990165e-06,
"loss": 0.4778,
"num_input_tokens_seen": 15815680,
"step": 5020
},
{
"epoch": 0.32168235068177453,
"grad_norm": 34.822837829589844,
"learning_rate": 1.715510098664151e-06,
"loss": 0.3896,
"num_input_tokens_seen": 15830528,
"step": 5025
},
{
"epoch": 0.3220024326227514,
"grad_norm": 30.73101043701172,
"learning_rate": 1.7147290560153777e-06,
"loss": 0.5141,
"num_input_tokens_seen": 15845568,
"step": 5030
},
{
"epoch": 0.3223225145637283,
"grad_norm": 36.853206634521484,
"learning_rate": 1.7139471210189862e-06,
"loss": 0.447,
"num_input_tokens_seen": 15861632,
"step": 5035
},
{
"epoch": 0.3226425965047052,
"grad_norm": 34.865318298339844,
"learning_rate": 1.7131642946512312e-06,
"loss": 0.543,
"num_input_tokens_seen": 15877632,
"step": 5040
},
{
"epoch": 0.3229626784456821,
"grad_norm": 21.021453857421875,
"learning_rate": 1.712380577889481e-06,
"loss": 0.3918,
"num_input_tokens_seen": 15893184,
"step": 5045
},
{
"epoch": 0.323282760386659,
"grad_norm": 34.583648681640625,
"learning_rate": 1.711595971712215e-06,
"loss": 0.3963,
"num_input_tokens_seen": 15908416,
"step": 5050
},
{
"epoch": 0.32360284232763586,
"grad_norm": 25.76999855041504,
"learning_rate": 1.7108104770990234e-06,
"loss": 0.4042,
"num_input_tokens_seen": 15924224,
"step": 5055
},
{
"epoch": 0.32392292426861274,
"grad_norm": 30.277738571166992,
"learning_rate": 1.7100240950306052e-06,
"loss": 0.254,
"num_input_tokens_seen": 15940032,
"step": 5060
},
{
"epoch": 0.3242430062095897,
"grad_norm": 39.45482635498047,
"learning_rate": 1.7092368264887677e-06,
"loss": 0.4647,
"num_input_tokens_seen": 15954944,
"step": 5065
},
{
"epoch": 0.32456308815056656,
"grad_norm": 54.28398513793945,
"learning_rate": 1.7084486724564252e-06,
"loss": 0.4846,
"num_input_tokens_seen": 15970624,
"step": 5070
},
{
"epoch": 0.32488317009154344,
"grad_norm": 33.94327163696289,
"learning_rate": 1.707659633917597e-06,
"loss": 0.4092,
"num_input_tokens_seen": 15986688,
"step": 5075
},
{
"epoch": 0.3252032520325203,
"grad_norm": 41.53309631347656,
"learning_rate": 1.7068697118574064e-06,
"loss": 0.4098,
"num_input_tokens_seen": 16002752,
"step": 5080
},
{
"epoch": 0.3255233339734972,
"grad_norm": 25.390241622924805,
"learning_rate": 1.7060789072620816e-06,
"loss": 0.4931,
"num_input_tokens_seen": 16018112,
"step": 5085
},
{
"epoch": 0.32584341591447413,
"grad_norm": 26.28127098083496,
"learning_rate": 1.7052872211189509e-06,
"loss": 0.4288,
"num_input_tokens_seen": 16033984,
"step": 5090
},
{
"epoch": 0.326163497855451,
"grad_norm": 24.156646728515625,
"learning_rate": 1.7044946544164431e-06,
"loss": 0.3304,
"num_input_tokens_seen": 16049536,
"step": 5095
},
{
"epoch": 0.3264835797964279,
"grad_norm": 35.91144943237305,
"learning_rate": 1.703701208144088e-06,
"loss": 0.3713,
"num_input_tokens_seen": 16066304,
"step": 5100
},
{
"epoch": 0.32680366173740477,
"grad_norm": 45.723236083984375,
"learning_rate": 1.702906883292512e-06,
"loss": 0.4829,
"num_input_tokens_seen": 16081536,
"step": 5105
},
{
"epoch": 0.32712374367838165,
"grad_norm": 27.605358123779297,
"learning_rate": 1.7021116808534393e-06,
"loss": 0.5586,
"num_input_tokens_seen": 16096896,
"step": 5110
},
{
"epoch": 0.32744382561935853,
"grad_norm": 46.61053466796875,
"learning_rate": 1.7013156018196893e-06,
"loss": 0.443,
"num_input_tokens_seen": 16112960,
"step": 5115
},
{
"epoch": 0.32776390756033547,
"grad_norm": 35.758201599121094,
"learning_rate": 1.7005186471851759e-06,
"loss": 0.4038,
"num_input_tokens_seen": 16129344,
"step": 5120
},
{
"epoch": 0.32808398950131235,
"grad_norm": 24.61606216430664,
"learning_rate": 1.6997208179449066e-06,
"loss": 0.6052,
"num_input_tokens_seen": 16147776,
"step": 5125
},
{
"epoch": 0.3284040714422892,
"grad_norm": 42.087249755859375,
"learning_rate": 1.6989221150949806e-06,
"loss": 0.3508,
"num_input_tokens_seen": 16162880,
"step": 5130
},
{
"epoch": 0.3287241533832661,
"grad_norm": 18.78113555908203,
"learning_rate": 1.6981225396325873e-06,
"loss": 0.2676,
"num_input_tokens_seen": 16179392,
"step": 5135
},
{
"epoch": 0.329044235324243,
"grad_norm": 38.857826232910156,
"learning_rate": 1.6973220925560067e-06,
"loss": 0.504,
"num_input_tokens_seen": 16194560,
"step": 5140
},
{
"epoch": 0.3293643172652199,
"grad_norm": 51.61846923828125,
"learning_rate": 1.696520774864606e-06,
"loss": 0.4243,
"num_input_tokens_seen": 16210112,
"step": 5145
},
{
"epoch": 0.3296843992061968,
"grad_norm": 55.612979888916016,
"learning_rate": 1.69571858755884e-06,
"loss": 0.464,
"num_input_tokens_seen": 16225856,
"step": 5150
},
{
"epoch": 0.3300044811471737,
"grad_norm": 25.002553939819336,
"learning_rate": 1.6949155316402487e-06,
"loss": 0.4314,
"num_input_tokens_seen": 16241536,
"step": 5155
},
{
"epoch": 0.33032456308815056,
"grad_norm": 35.29892349243164,
"learning_rate": 1.6941116081114566e-06,
"loss": 0.3807,
"num_input_tokens_seen": 16256384,
"step": 5160
},
{
"epoch": 0.33064464502912744,
"grad_norm": 39.39152908325195,
"learning_rate": 1.6933068179761722e-06,
"loss": 0.398,
"num_input_tokens_seen": 16271360,
"step": 5165
},
{
"epoch": 0.3309647269701044,
"grad_norm": 27.22661781311035,
"learning_rate": 1.6925011622391857e-06,
"loss": 0.4122,
"num_input_tokens_seen": 16286656,
"step": 5170
},
{
"epoch": 0.33128480891108125,
"grad_norm": 19.13918685913086,
"learning_rate": 1.6916946419063667e-06,
"loss": 0.4255,
"num_input_tokens_seen": 16302592,
"step": 5175
},
{
"epoch": 0.33160489085205813,
"grad_norm": 27.572052001953125,
"learning_rate": 1.690887257984666e-06,
"loss": 0.5442,
"num_input_tokens_seen": 16318656,
"step": 5180
},
{
"epoch": 0.331924972793035,
"grad_norm": 27.06237030029297,
"learning_rate": 1.690079011482112e-06,
"loss": 0.4755,
"num_input_tokens_seen": 16334016,
"step": 5185
},
{
"epoch": 0.3322450547340119,
"grad_norm": 43.45961380004883,
"learning_rate": 1.6892699034078096e-06,
"loss": 0.5287,
"num_input_tokens_seen": 16349888,
"step": 5190
},
{
"epoch": 0.33256513667498877,
"grad_norm": 41.60029220581055,
"learning_rate": 1.68845993477194e-06,
"loss": 0.503,
"num_input_tokens_seen": 16365056,
"step": 5195
},
{
"epoch": 0.3328852186159657,
"grad_norm": 27.33567237854004,
"learning_rate": 1.6876491065857584e-06,
"loss": 0.3973,
"num_input_tokens_seen": 16380032,
"step": 5200
},
{
"epoch": 0.3332053005569426,
"grad_norm": 32.64432907104492,
"learning_rate": 1.6868374198615928e-06,
"loss": 0.6461,
"num_input_tokens_seen": 16394752,
"step": 5205
},
{
"epoch": 0.33352538249791946,
"grad_norm": 20.55217742919922,
"learning_rate": 1.6860248756128448e-06,
"loss": 0.4714,
"num_input_tokens_seen": 16410368,
"step": 5210
},
{
"epoch": 0.33384546443889634,
"grad_norm": 22.915000915527344,
"learning_rate": 1.6852114748539844e-06,
"loss": 0.4142,
"num_input_tokens_seen": 16425088,
"step": 5215
},
{
"epoch": 0.3341655463798732,
"grad_norm": 24.764463424682617,
"learning_rate": 1.6843972186005525e-06,
"loss": 0.3446,
"num_input_tokens_seen": 16441152,
"step": 5220
},
{
"epoch": 0.33448562832085016,
"grad_norm": 35.099388122558594,
"learning_rate": 1.6835821078691577e-06,
"loss": 0.4705,
"num_input_tokens_seen": 16458240,
"step": 5225
},
{
"epoch": 0.33480571026182704,
"grad_norm": 39.11796188354492,
"learning_rate": 1.6827661436774746e-06,
"loss": 0.4342,
"num_input_tokens_seen": 16474112,
"step": 5230
},
{
"epoch": 0.3351257922028039,
"grad_norm": 44.745906829833984,
"learning_rate": 1.681949327044245e-06,
"loss": 0.3957,
"num_input_tokens_seen": 16490560,
"step": 5235
},
{
"epoch": 0.3354458741437808,
"grad_norm": 70.94788360595703,
"learning_rate": 1.6811316589892734e-06,
"loss": 0.6821,
"num_input_tokens_seen": 16505728,
"step": 5240
},
{
"epoch": 0.3357659560847577,
"grad_norm": 24.731172561645508,
"learning_rate": 1.6803131405334284e-06,
"loss": 0.4364,
"num_input_tokens_seen": 16521856,
"step": 5245
},
{
"epoch": 0.3360860380257346,
"grad_norm": 31.324085235595703,
"learning_rate": 1.6794937726986396e-06,
"loss": 0.4436,
"num_input_tokens_seen": 16537792,
"step": 5250
},
{
"epoch": 0.3364061199667115,
"grad_norm": 42.410850524902344,
"learning_rate": 1.6786735565078974e-06,
"loss": 0.4347,
"num_input_tokens_seen": 16553408,
"step": 5255
},
{
"epoch": 0.33672620190768837,
"grad_norm": 27.741994857788086,
"learning_rate": 1.677852492985251e-06,
"loss": 0.4233,
"num_input_tokens_seen": 16570112,
"step": 5260
},
{
"epoch": 0.33704628384866525,
"grad_norm": 60.623374938964844,
"learning_rate": 1.6770305831558086e-06,
"loss": 0.5003,
"num_input_tokens_seen": 16586304,
"step": 5265
},
{
"epoch": 0.33736636578964213,
"grad_norm": 15.280008316040039,
"learning_rate": 1.6762078280457342e-06,
"loss": 0.3912,
"num_input_tokens_seen": 16601920,
"step": 5270
},
{
"epoch": 0.33768644773061907,
"grad_norm": 29.975696563720703,
"learning_rate": 1.6753842286822465e-06,
"loss": 0.4725,
"num_input_tokens_seen": 16618240,
"step": 5275
},
{
"epoch": 0.33800652967159595,
"grad_norm": 33.5026969909668,
"learning_rate": 1.6745597860936199e-06,
"loss": 0.5845,
"num_input_tokens_seen": 16633408,
"step": 5280
},
{
"epoch": 0.3383266116125728,
"grad_norm": 39.31660842895508,
"learning_rate": 1.6737345013091794e-06,
"loss": 0.4484,
"num_input_tokens_seen": 16649664,
"step": 5285
},
{
"epoch": 0.3386466935535497,
"grad_norm": 37.725284576416016,
"learning_rate": 1.672908375359304e-06,
"loss": 0.4686,
"num_input_tokens_seen": 16664896,
"step": 5290
},
{
"epoch": 0.3389667754945266,
"grad_norm": 49.54829406738281,
"learning_rate": 1.6720814092754209e-06,
"loss": 0.5565,
"num_input_tokens_seen": 16680384,
"step": 5295
},
{
"epoch": 0.33928685743550346,
"grad_norm": 22.089405059814453,
"learning_rate": 1.6712536040900075e-06,
"loss": 0.3785,
"num_input_tokens_seen": 16696192,
"step": 5300
},
{
"epoch": 0.3396069393764804,
"grad_norm": 26.588197708129883,
"learning_rate": 1.6704249608365878e-06,
"loss": 0.4741,
"num_input_tokens_seen": 16727104,
"step": 5305
},
{
"epoch": 0.3399270213174573,
"grad_norm": 29.5950870513916,
"learning_rate": 1.669595480549733e-06,
"loss": 0.4291,
"num_input_tokens_seen": 16741696,
"step": 5310
},
{
"epoch": 0.34024710325843416,
"grad_norm": 36.820213317871094,
"learning_rate": 1.6687651642650587e-06,
"loss": 0.4384,
"num_input_tokens_seen": 16757120,
"step": 5315
},
{
"epoch": 0.34056718519941104,
"grad_norm": 28.92207145690918,
"learning_rate": 1.6679340130192245e-06,
"loss": 0.4572,
"num_input_tokens_seen": 16772416,
"step": 5320
},
{
"epoch": 0.3408872671403879,
"grad_norm": 26.009944915771484,
"learning_rate": 1.667102027849933e-06,
"loss": 0.3287,
"num_input_tokens_seen": 16788352,
"step": 5325
},
{
"epoch": 0.34120734908136485,
"grad_norm": 38.97822952270508,
"learning_rate": 1.6662692097959266e-06,
"loss": 0.3582,
"num_input_tokens_seen": 16803648,
"step": 5330
},
{
"epoch": 0.34152743102234173,
"grad_norm": 49.21427536010742,
"learning_rate": 1.6654355598969894e-06,
"loss": 0.4741,
"num_input_tokens_seen": 16818944,
"step": 5335
},
{
"epoch": 0.3418475129633186,
"grad_norm": 33.456058502197266,
"learning_rate": 1.6646010791939423e-06,
"loss": 0.5007,
"num_input_tokens_seen": 16833984,
"step": 5340
},
{
"epoch": 0.3421675949042955,
"grad_norm": 29.826610565185547,
"learning_rate": 1.6637657687286446e-06,
"loss": 0.5632,
"num_input_tokens_seen": 16849280,
"step": 5345
},
{
"epoch": 0.34248767684527237,
"grad_norm": 30.897554397583008,
"learning_rate": 1.6629296295439912e-06,
"loss": 0.4051,
"num_input_tokens_seen": 16865664,
"step": 5350
},
{
"epoch": 0.3428077587862493,
"grad_norm": 46.125911712646484,
"learning_rate": 1.6620926626839116e-06,
"loss": 0.4945,
"num_input_tokens_seen": 16881536,
"step": 5355
},
{
"epoch": 0.3431278407272262,
"grad_norm": 27.395605087280273,
"learning_rate": 1.661254869193369e-06,
"loss": 0.4456,
"num_input_tokens_seen": 16898816,
"step": 5360
},
{
"epoch": 0.34344792266820307,
"grad_norm": 46.8023796081543,
"learning_rate": 1.6604162501183581e-06,
"loss": 0.5174,
"num_input_tokens_seen": 16915136,
"step": 5365
},
{
"epoch": 0.34376800460917994,
"grad_norm": 31.79302406311035,
"learning_rate": 1.6595768065059045e-06,
"loss": 0.4742,
"num_input_tokens_seen": 16931200,
"step": 5370
},
{
"epoch": 0.3440880865501568,
"grad_norm": 28.743654251098633,
"learning_rate": 1.6587365394040641e-06,
"loss": 0.4691,
"num_input_tokens_seen": 16946816,
"step": 5375
},
{
"epoch": 0.3444081684911337,
"grad_norm": 29.590286254882812,
"learning_rate": 1.6578954498619195e-06,
"loss": 0.3826,
"num_input_tokens_seen": 16962880,
"step": 5380
},
{
"epoch": 0.34472825043211064,
"grad_norm": 32.09335708618164,
"learning_rate": 1.6570535389295814e-06,
"loss": 0.4712,
"num_input_tokens_seen": 16978240,
"step": 5385
},
{
"epoch": 0.3450483323730875,
"grad_norm": 21.793235778808594,
"learning_rate": 1.6562108076581853e-06,
"loss": 0.3684,
"num_input_tokens_seen": 16993728,
"step": 5390
},
{
"epoch": 0.3453684143140644,
"grad_norm": 40.34245681762695,
"learning_rate": 1.6553672570998912e-06,
"loss": 0.5846,
"num_input_tokens_seen": 17009728,
"step": 5395
},
{
"epoch": 0.3456884962550413,
"grad_norm": 37.5211067199707,
"learning_rate": 1.6545228883078815e-06,
"loss": 0.414,
"num_input_tokens_seen": 17024640,
"step": 5400
},
{
"epoch": 0.34600857819601816,
"grad_norm": 41.48558807373047,
"learning_rate": 1.653677702336361e-06,
"loss": 0.36,
"num_input_tokens_seen": 17040512,
"step": 5405
},
{
"epoch": 0.3463286601369951,
"grad_norm": 20.21353530883789,
"learning_rate": 1.6528317002405538e-06,
"loss": 0.4801,
"num_input_tokens_seen": 17056064,
"step": 5410
},
{
"epoch": 0.34664874207797197,
"grad_norm": 31.45917320251465,
"learning_rate": 1.6519848830767043e-06,
"loss": 0.3685,
"num_input_tokens_seen": 17072448,
"step": 5415
},
{
"epoch": 0.34696882401894885,
"grad_norm": 43.27189254760742,
"learning_rate": 1.6511372519020726e-06,
"loss": 0.6228,
"num_input_tokens_seen": 17088320,
"step": 5420
},
{
"epoch": 0.34728890595992573,
"grad_norm": 36.178993225097656,
"learning_rate": 1.650288807774937e-06,
"loss": 0.4376,
"num_input_tokens_seen": 17104448,
"step": 5425
},
{
"epoch": 0.3476089879009026,
"grad_norm": 35.5722770690918,
"learning_rate": 1.6494395517545893e-06,
"loss": 0.3981,
"num_input_tokens_seen": 17121856,
"step": 5430
},
{
"epoch": 0.34792906984187955,
"grad_norm": 48.720191955566406,
"learning_rate": 1.6485894849013362e-06,
"loss": 0.5135,
"num_input_tokens_seen": 17136512,
"step": 5435
},
{
"epoch": 0.3482491517828564,
"grad_norm": 25.877552032470703,
"learning_rate": 1.6477386082764961e-06,
"loss": 0.4487,
"num_input_tokens_seen": 17152640,
"step": 5440
},
{
"epoch": 0.3485692337238333,
"grad_norm": 28.085277557373047,
"learning_rate": 1.6468869229423983e-06,
"loss": 0.3645,
"num_input_tokens_seen": 17167680,
"step": 5445
},
{
"epoch": 0.3488893156648102,
"grad_norm": 59.84387969970703,
"learning_rate": 1.6460344299623813e-06,
"loss": 0.6431,
"num_input_tokens_seen": 17183296,
"step": 5450
},
{
"epoch": 0.34920939760578706,
"grad_norm": 56.25507354736328,
"learning_rate": 1.6451811304007939e-06,
"loss": 0.5412,
"num_input_tokens_seen": 17198272,
"step": 5455
},
{
"epoch": 0.349529479546764,
"grad_norm": 46.85531997680664,
"learning_rate": 1.6443270253229895e-06,
"loss": 0.5194,
"num_input_tokens_seen": 17213376,
"step": 5460
},
{
"epoch": 0.3498495614877409,
"grad_norm": 39.22257995605469,
"learning_rate": 1.6434721157953288e-06,
"loss": 0.4614,
"num_input_tokens_seen": 17229632,
"step": 5465
},
{
"epoch": 0.35016964342871776,
"grad_norm": 34.838531494140625,
"learning_rate": 1.6426164028851765e-06,
"loss": 0.5873,
"num_input_tokens_seen": 17245696,
"step": 5470
},
{
"epoch": 0.3504257089814993,
"eval_loss": 0.44318872690200806,
"eval_runtime": 50.6001,
"eval_samples_per_second": 274.427,
"eval_steps_per_second": 34.308,
"num_input_tokens_seen": 17259840,
"step": 5474
},
{
"epoch": 0.35048972536969464,
"grad_norm": 28.226123809814453,
"learning_rate": 1.6417598876609002e-06,
"loss": 0.3797,
"num_input_tokens_seen": 17262976,
"step": 5475
},
{
"epoch": 0.3508098073106715,
"grad_norm": 37.542503356933594,
"learning_rate": 1.640902571191869e-06,
"loss": 0.4144,
"num_input_tokens_seen": 17278336,
"step": 5480
},
{
"epoch": 0.3511298892516484,
"grad_norm": 40.7253303527832,
"learning_rate": 1.6400444545484524e-06,
"loss": 0.3617,
"num_input_tokens_seen": 17293248,
"step": 5485
},
{
"epoch": 0.35144997119262533,
"grad_norm": 21.151514053344727,
"learning_rate": 1.6391855388020193e-06,
"loss": 0.428,
"num_input_tokens_seen": 17309184,
"step": 5490
},
{
"epoch": 0.3517700531336022,
"grad_norm": 35.13167953491211,
"learning_rate": 1.6383258250249363e-06,
"loss": 0.4654,
"num_input_tokens_seen": 17325248,
"step": 5495
},
{
"epoch": 0.3520901350745791,
"grad_norm": 19.110126495361328,
"learning_rate": 1.6374653142905661e-06,
"loss": 0.4297,
"num_input_tokens_seen": 17340736,
"step": 5500
},
{
"epoch": 0.35241021701555597,
"grad_norm": 35.75419235229492,
"learning_rate": 1.6366040076732662e-06,
"loss": 0.4224,
"num_input_tokens_seen": 17355904,
"step": 5505
},
{
"epoch": 0.35273029895653285,
"grad_norm": 28.760461807250977,
"learning_rate": 1.6357419062483882e-06,
"loss": 0.4675,
"num_input_tokens_seen": 17371264,
"step": 5510
},
{
"epoch": 0.3530503808975098,
"grad_norm": 25.240421295166016,
"learning_rate": 1.6348790110922758e-06,
"loss": 0.4268,
"num_input_tokens_seen": 17388608,
"step": 5515
},
{
"epoch": 0.35337046283848667,
"grad_norm": 28.650354385375977,
"learning_rate": 1.6340153232822635e-06,
"loss": 0.4558,
"num_input_tokens_seen": 17403712,
"step": 5520
},
{
"epoch": 0.35369054477946354,
"grad_norm": 44.04157257080078,
"learning_rate": 1.633150843896676e-06,
"loss": 0.5137,
"num_input_tokens_seen": 17421056,
"step": 5525
},
{
"epoch": 0.3540106267204404,
"grad_norm": 56.921592712402344,
"learning_rate": 1.6322855740148263e-06,
"loss": 0.5658,
"num_input_tokens_seen": 17436096,
"step": 5530
},
{
"epoch": 0.3543307086614173,
"grad_norm": 25.134639739990234,
"learning_rate": 1.6314195147170132e-06,
"loss": 0.3768,
"num_input_tokens_seen": 17452480,
"step": 5535
},
{
"epoch": 0.35465079060239424,
"grad_norm": 29.89691162109375,
"learning_rate": 1.6305526670845225e-06,
"loss": 0.4032,
"num_input_tokens_seen": 17467776,
"step": 5540
},
{
"epoch": 0.3549708725433711,
"grad_norm": 46.79875564575195,
"learning_rate": 1.6296850321996232e-06,
"loss": 0.4877,
"num_input_tokens_seen": 17482752,
"step": 5545
},
{
"epoch": 0.355290954484348,
"grad_norm": 34.09406280517578,
"learning_rate": 1.6288166111455683e-06,
"loss": 0.3843,
"num_input_tokens_seen": 17497792,
"step": 5550
},
{
"epoch": 0.3556110364253249,
"grad_norm": 23.421165466308594,
"learning_rate": 1.6279474050065906e-06,
"loss": 0.4878,
"num_input_tokens_seen": 17513024,
"step": 5555
},
{
"epoch": 0.35593111836630176,
"grad_norm": 27.991254806518555,
"learning_rate": 1.6270774148679054e-06,
"loss": 0.4049,
"num_input_tokens_seen": 17529024,
"step": 5560
},
{
"epoch": 0.35625120030727864,
"grad_norm": 17.627593994140625,
"learning_rate": 1.6262066418157048e-06,
"loss": 0.3788,
"num_input_tokens_seen": 17543936,
"step": 5565
},
{
"epoch": 0.35657128224825557,
"grad_norm": 51.489200592041016,
"learning_rate": 1.6253350869371595e-06,
"loss": 0.5444,
"num_input_tokens_seen": 17559168,
"step": 5570
},
{
"epoch": 0.35689136418923245,
"grad_norm": 35.092872619628906,
"learning_rate": 1.6244627513204158e-06,
"loss": 0.3861,
"num_input_tokens_seen": 17574912,
"step": 5575
},
{
"epoch": 0.35721144613020933,
"grad_norm": 23.55853843688965,
"learning_rate": 1.6235896360545954e-06,
"loss": 0.4319,
"num_input_tokens_seen": 17590272,
"step": 5580
},
{
"epoch": 0.3575315280711862,
"grad_norm": 42.771095275878906,
"learning_rate": 1.622715742229792e-06,
"loss": 0.4466,
"num_input_tokens_seen": 17605952,
"step": 5585
},
{
"epoch": 0.3578516100121631,
"grad_norm": 21.996267318725586,
"learning_rate": 1.6218410709370734e-06,
"loss": 0.3861,
"num_input_tokens_seen": 17621120,
"step": 5590
},
{
"epoch": 0.35817169195314,
"grad_norm": 43.53791046142578,
"learning_rate": 1.6209656232684768e-06,
"loss": 0.5462,
"num_input_tokens_seen": 17636096,
"step": 5595
},
{
"epoch": 0.3584917738941169,
"grad_norm": 88.72663116455078,
"learning_rate": 1.620089400317008e-06,
"loss": 0.4566,
"num_input_tokens_seen": 17652672,
"step": 5600
},
{
"epoch": 0.3588118558350938,
"grad_norm": 35.793540954589844,
"learning_rate": 1.6192124031766425e-06,
"loss": 0.4979,
"num_input_tokens_seen": 17668032,
"step": 5605
},
{
"epoch": 0.35913193777607066,
"grad_norm": 29.273569107055664,
"learning_rate": 1.6183346329423213e-06,
"loss": 0.4507,
"num_input_tokens_seen": 17683264,
"step": 5610
},
{
"epoch": 0.35945201971704754,
"grad_norm": 52.97650146484375,
"learning_rate": 1.6174560907099508e-06,
"loss": 0.3672,
"num_input_tokens_seen": 17699200,
"step": 5615
},
{
"epoch": 0.3597721016580245,
"grad_norm": 23.672475814819336,
"learning_rate": 1.6165767775764013e-06,
"loss": 0.3538,
"num_input_tokens_seen": 17714816,
"step": 5620
},
{
"epoch": 0.36009218359900136,
"grad_norm": 38.058650970458984,
"learning_rate": 1.6156966946395056e-06,
"loss": 0.4157,
"num_input_tokens_seen": 17732352,
"step": 5625
},
{
"epoch": 0.36041226553997824,
"grad_norm": 54.641357421875,
"learning_rate": 1.6148158429980577e-06,
"loss": 0.536,
"num_input_tokens_seen": 17748288,
"step": 5630
},
{
"epoch": 0.3607323474809551,
"grad_norm": 42.360755920410156,
"learning_rate": 1.6139342237518108e-06,
"loss": 0.3758,
"num_input_tokens_seen": 17763520,
"step": 5635
},
{
"epoch": 0.361052429421932,
"grad_norm": 33.51826095581055,
"learning_rate": 1.6130518380014773e-06,
"loss": 0.4256,
"num_input_tokens_seen": 17779328,
"step": 5640
},
{
"epoch": 0.3613725113629089,
"grad_norm": 36.83528137207031,
"learning_rate": 1.6121686868487259e-06,
"loss": 0.4313,
"num_input_tokens_seen": 17795584,
"step": 5645
},
{
"epoch": 0.3616925933038858,
"grad_norm": 17.509504318237305,
"learning_rate": 1.6112847713961815e-06,
"loss": 0.4449,
"num_input_tokens_seen": 17810368,
"step": 5650
},
{
"epoch": 0.3620126752448627,
"grad_norm": 28.996376037597656,
"learning_rate": 1.610400092747423e-06,
"loss": 0.4365,
"num_input_tokens_seen": 17826496,
"step": 5655
},
{
"epoch": 0.36233275718583957,
"grad_norm": 31.747772216796875,
"learning_rate": 1.609514652006981e-06,
"loss": 0.4266,
"num_input_tokens_seen": 17841344,
"step": 5660
},
{
"epoch": 0.36265283912681645,
"grad_norm": 32.14071273803711,
"learning_rate": 1.60862845028034e-06,
"loss": 0.5632,
"num_input_tokens_seen": 17857408,
"step": 5665
},
{
"epoch": 0.36297292106779333,
"grad_norm": 24.347280502319336,
"learning_rate": 1.6077414886739327e-06,
"loss": 0.4209,
"num_input_tokens_seen": 17873280,
"step": 5670
},
{
"epoch": 0.36329300300877027,
"grad_norm": 22.06682586669922,
"learning_rate": 1.6068537682951412e-06,
"loss": 0.5023,
"num_input_tokens_seen": 17888448,
"step": 5675
},
{
"epoch": 0.36361308494974715,
"grad_norm": 28.86912727355957,
"learning_rate": 1.6059652902522947e-06,
"loss": 0.4459,
"num_input_tokens_seen": 17904320,
"step": 5680
},
{
"epoch": 0.363933166890724,
"grad_norm": 50.29701232910156,
"learning_rate": 1.6050760556546683e-06,
"loss": 0.3725,
"num_input_tokens_seen": 17919744,
"step": 5685
},
{
"epoch": 0.3642532488317009,
"grad_norm": 26.409318923950195,
"learning_rate": 1.6041860656124823e-06,
"loss": 0.3823,
"num_input_tokens_seen": 17934656,
"step": 5690
},
{
"epoch": 0.3645733307726778,
"grad_norm": 40.44452667236328,
"learning_rate": 1.6032953212368993e-06,
"loss": 0.5608,
"num_input_tokens_seen": 17950976,
"step": 5695
},
{
"epoch": 0.3648934127136547,
"grad_norm": 24.44096565246582,
"learning_rate": 1.6024038236400243e-06,
"loss": 0.465,
"num_input_tokens_seen": 17966400,
"step": 5700
},
{
"epoch": 0.3652134946546316,
"grad_norm": 122.20569610595703,
"learning_rate": 1.6015115739349027e-06,
"loss": 0.5704,
"num_input_tokens_seen": 17983872,
"step": 5705
},
{
"epoch": 0.3655335765956085,
"grad_norm": 33.78934860229492,
"learning_rate": 1.6006185732355183e-06,
"loss": 0.5358,
"num_input_tokens_seen": 17999680,
"step": 5710
},
{
"epoch": 0.36585365853658536,
"grad_norm": 22.399660110473633,
"learning_rate": 1.5997248226567931e-06,
"loss": 0.3807,
"num_input_tokens_seen": 18014784,
"step": 5715
},
{
"epoch": 0.36617374047756224,
"grad_norm": 25.733877182006836,
"learning_rate": 1.5988303233145853e-06,
"loss": 0.5063,
"num_input_tokens_seen": 18029888,
"step": 5720
},
{
"epoch": 0.3664938224185392,
"grad_norm": 30.036779403686523,
"learning_rate": 1.597935076325688e-06,
"loss": 0.3721,
"num_input_tokens_seen": 18045632,
"step": 5725
},
{
"epoch": 0.36681390435951605,
"grad_norm": 42.842872619628906,
"learning_rate": 1.5970390828078272e-06,
"loss": 0.5996,
"num_input_tokens_seen": 18060928,
"step": 5730
},
{
"epoch": 0.36713398630049293,
"grad_norm": 17.993152618408203,
"learning_rate": 1.5961423438796615e-06,
"loss": 0.4616,
"num_input_tokens_seen": 18076352,
"step": 5735
},
{
"epoch": 0.3674540682414698,
"grad_norm": 42.84749984741211,
"learning_rate": 1.59524486066078e-06,
"loss": 0.45,
"num_input_tokens_seen": 18092096,
"step": 5740
},
{
"epoch": 0.3677741501824467,
"grad_norm": 29.25870132446289,
"learning_rate": 1.5943466342717012e-06,
"loss": 0.5875,
"num_input_tokens_seen": 18107648,
"step": 5745
},
{
"epoch": 0.36809423212342357,
"grad_norm": 28.201173782348633,
"learning_rate": 1.5934476658338708e-06,
"loss": 0.4526,
"num_input_tokens_seen": 18123264,
"step": 5750
},
{
"epoch": 0.3684143140644005,
"grad_norm": 29.33237075805664,
"learning_rate": 1.5925479564696619e-06,
"loss": 0.5482,
"num_input_tokens_seen": 18138368,
"step": 5755
},
{
"epoch": 0.3687343960053774,
"grad_norm": 12.648244857788086,
"learning_rate": 1.5916475073023721e-06,
"loss": 0.3433,
"num_input_tokens_seen": 18154432,
"step": 5760
},
{
"epoch": 0.36905447794635426,
"grad_norm": 35.97001266479492,
"learning_rate": 1.5907463194562226e-06,
"loss": 0.3385,
"num_input_tokens_seen": 18171200,
"step": 5765
},
{
"epoch": 0.36937455988733114,
"grad_norm": 24.797889709472656,
"learning_rate": 1.589844394056357e-06,
"loss": 0.3763,
"num_input_tokens_seen": 18187008,
"step": 5770
},
{
"epoch": 0.369694641828308,
"grad_norm": 50.53974914550781,
"learning_rate": 1.5889417322288403e-06,
"loss": 0.3462,
"num_input_tokens_seen": 18202944,
"step": 5775
},
{
"epoch": 0.37001472376928496,
"grad_norm": 84.17024993896484,
"learning_rate": 1.5880383351006556e-06,
"loss": 0.4963,
"num_input_tokens_seen": 18217984,
"step": 5780
},
{
"epoch": 0.37033480571026184,
"grad_norm": 29.01326560974121,
"learning_rate": 1.5871342037997055e-06,
"loss": 0.5257,
"num_input_tokens_seen": 18233984,
"step": 5785
},
{
"epoch": 0.3706548876512387,
"grad_norm": 52.05530548095703,
"learning_rate": 1.5862293394548082e-06,
"loss": 0.416,
"num_input_tokens_seen": 18249024,
"step": 5790
},
{
"epoch": 0.3709749695922156,
"grad_norm": 72.29715728759766,
"learning_rate": 1.5853237431956972e-06,
"loss": 0.3512,
"num_input_tokens_seen": 18264256,
"step": 5795
},
{
"epoch": 0.3712950515331925,
"grad_norm": 43.78818893432617,
"learning_rate": 1.5844174161530206e-06,
"loss": 0.554,
"num_input_tokens_seen": 18279936,
"step": 5800
},
{
"epoch": 0.3716151334741694,
"grad_norm": 26.14434814453125,
"learning_rate": 1.5835103594583382e-06,
"loss": 0.4147,
"num_input_tokens_seen": 18295488,
"step": 5805
},
{
"epoch": 0.3719352154151463,
"grad_norm": 26.584754943847656,
"learning_rate": 1.5826025742441207e-06,
"loss": 0.5357,
"num_input_tokens_seen": 18311360,
"step": 5810
},
{
"epoch": 0.37225529735612317,
"grad_norm": 28.070344924926758,
"learning_rate": 1.5816940616437486e-06,
"loss": 0.4282,
"num_input_tokens_seen": 18326592,
"step": 5815
},
{
"epoch": 0.37257537929710005,
"grad_norm": 36.15549850463867,
"learning_rate": 1.5807848227915108e-06,
"loss": 0.3564,
"num_input_tokens_seen": 18344000,
"step": 5820
},
{
"epoch": 0.37289546123807693,
"grad_norm": 63.37150955200195,
"learning_rate": 1.5798748588226028e-06,
"loss": 0.4888,
"num_input_tokens_seen": 18359872,
"step": 5825
},
{
"epoch": 0.3732155431790538,
"grad_norm": 36.90925216674805,
"learning_rate": 1.578964170873125e-06,
"loss": 0.472,
"num_input_tokens_seen": 18374400,
"step": 5830
},
{
"epoch": 0.37353562512003075,
"grad_norm": 19.994869232177734,
"learning_rate": 1.5780527600800816e-06,
"loss": 0.2731,
"num_input_tokens_seen": 18390656,
"step": 5835
},
{
"epoch": 0.3738557070610076,
"grad_norm": 66.3774185180664,
"learning_rate": 1.5771406275813808e-06,
"loss": 0.4561,
"num_input_tokens_seen": 18406400,
"step": 5840
},
{
"epoch": 0.3741757890019845,
"grad_norm": 54.15401840209961,
"learning_rate": 1.5762277745158297e-06,
"loss": 0.5531,
"num_input_tokens_seen": 18422848,
"step": 5845
},
{
"epoch": 0.3744958709429614,
"grad_norm": 93.29429626464844,
"learning_rate": 1.5753142020231365e-06,
"loss": 0.5008,
"num_input_tokens_seen": 18438912,
"step": 5850
},
{
"epoch": 0.37481595288393826,
"grad_norm": 40.965824127197266,
"learning_rate": 1.5743999112439073e-06,
"loss": 0.5494,
"num_input_tokens_seen": 18455488,
"step": 5855
},
{
"epoch": 0.3751360348249152,
"grad_norm": 41.4587516784668,
"learning_rate": 1.5734849033196446e-06,
"loss": 0.4015,
"num_input_tokens_seen": 18470080,
"step": 5860
},
{
"epoch": 0.3754561167658921,
"grad_norm": 41.16543197631836,
"learning_rate": 1.5725691793927468e-06,
"loss": 0.4426,
"num_input_tokens_seen": 18484480,
"step": 5865
},
{
"epoch": 0.37577619870686896,
"grad_norm": 24.49448585510254,
"learning_rate": 1.5716527406065057e-06,
"loss": 0.4731,
"num_input_tokens_seen": 18501312,
"step": 5870
},
{
"epoch": 0.37609628064784584,
"grad_norm": 26.67026138305664,
"learning_rate": 1.570735588105106e-06,
"loss": 0.4582,
"num_input_tokens_seen": 18515968,
"step": 5875
},
{
"epoch": 0.3764163625888227,
"grad_norm": 17.646738052368164,
"learning_rate": 1.5698177230336234e-06,
"loss": 0.3808,
"num_input_tokens_seen": 18531200,
"step": 5880
},
{
"epoch": 0.37673644452979965,
"grad_norm": 34.10994338989258,
"learning_rate": 1.568899146538023e-06,
"loss": 0.2686,
"num_input_tokens_seen": 18547712,
"step": 5885
},
{
"epoch": 0.37705652647077653,
"grad_norm": 28.377954483032227,
"learning_rate": 1.5679798597651587e-06,
"loss": 0.4112,
"num_input_tokens_seen": 18562752,
"step": 5890
},
{
"epoch": 0.3773766084117534,
"grad_norm": 42.726253509521484,
"learning_rate": 1.5670598638627706e-06,
"loss": 0.4375,
"num_input_tokens_seen": 18578368,
"step": 5895
},
{
"epoch": 0.3776966903527303,
"grad_norm": 42.27223587036133,
"learning_rate": 1.5661391599794847e-06,
"loss": 0.3833,
"num_input_tokens_seen": 18593408,
"step": 5900
},
{
"epoch": 0.37801677229370717,
"grad_norm": 29.259695053100586,
"learning_rate": 1.56521774926481e-06,
"loss": 0.4148,
"num_input_tokens_seen": 18607872,
"step": 5905
},
{
"epoch": 0.3783368542346841,
"grad_norm": 25.923545837402344,
"learning_rate": 1.5642956328691393e-06,
"loss": 0.359,
"num_input_tokens_seen": 18624000,
"step": 5910
},
{
"epoch": 0.378656936175661,
"grad_norm": 53.90917205810547,
"learning_rate": 1.5633728119437451e-06,
"loss": 0.5591,
"num_input_tokens_seen": 18640704,
"step": 5915
},
{
"epoch": 0.37897701811663786,
"grad_norm": 30.155330657958984,
"learning_rate": 1.5624492876407807e-06,
"loss": 0.472,
"num_input_tokens_seen": 18658368,
"step": 5920
},
{
"epoch": 0.37929710005761474,
"grad_norm": 47.06120681762695,
"learning_rate": 1.5615250611132766e-06,
"loss": 0.411,
"num_input_tokens_seen": 18675584,
"step": 5925
},
{
"epoch": 0.3796171819985916,
"grad_norm": 25.19417381286621,
"learning_rate": 1.5606001335151405e-06,
"loss": 0.5683,
"num_input_tokens_seen": 18691904,
"step": 5930
},
{
"epoch": 0.3799372639395685,
"grad_norm": 36.34967803955078,
"learning_rate": 1.5596745060011561e-06,
"loss": 0.3734,
"num_input_tokens_seen": 18708736,
"step": 5935
},
{
"epoch": 0.38025734588054544,
"grad_norm": 36.44337844848633,
"learning_rate": 1.5587481797269793e-06,
"loss": 0.3492,
"num_input_tokens_seen": 18724032,
"step": 5940
},
{
"epoch": 0.3805774278215223,
"grad_norm": 39.71046447753906,
"learning_rate": 1.5578211558491396e-06,
"loss": 0.4266,
"num_input_tokens_seen": 18740352,
"step": 5945
},
{
"epoch": 0.3808975097624992,
"grad_norm": 26.960233688354492,
"learning_rate": 1.5568934355250375e-06,
"loss": 0.3346,
"num_input_tokens_seen": 18754560,
"step": 5950
},
{
"epoch": 0.3812175917034761,
"grad_norm": 69.33877563476562,
"learning_rate": 1.5559650199129423e-06,
"loss": 0.6693,
"num_input_tokens_seen": 18769280,
"step": 5955
},
{
"epoch": 0.38153767364445296,
"grad_norm": 53.183929443359375,
"learning_rate": 1.5550359101719921e-06,
"loss": 0.4131,
"num_input_tokens_seen": 18784512,
"step": 5960
},
{
"epoch": 0.3818577555854299,
"grad_norm": 62.83601379394531,
"learning_rate": 1.554106107462191e-06,
"loss": 0.3615,
"num_input_tokens_seen": 18800384,
"step": 5965
},
{
"epoch": 0.38217783752640677,
"grad_norm": 42.66127014160156,
"learning_rate": 1.5531756129444092e-06,
"loss": 0.4262,
"num_input_tokens_seen": 18815552,
"step": 5970
},
{
"epoch": 0.38249791946738365,
"grad_norm": 26.47112464904785,
"learning_rate": 1.5522444277803796e-06,
"loss": 0.4191,
"num_input_tokens_seen": 18830080,
"step": 5975
},
{
"epoch": 0.38281800140836053,
"grad_norm": 33.1724967956543,
"learning_rate": 1.5513125531326976e-06,
"loss": 0.4244,
"num_input_tokens_seen": 18846272,
"step": 5980
},
{
"epoch": 0.3831380833493374,
"grad_norm": 28.296476364135742,
"learning_rate": 1.5503799901648198e-06,
"loss": 0.3802,
"num_input_tokens_seen": 18860928,
"step": 5985
},
{
"epoch": 0.38345816529031435,
"grad_norm": 63.02308654785156,
"learning_rate": 1.5494467400410625e-06,
"loss": 0.4461,
"num_input_tokens_seen": 18877120,
"step": 5990
},
{
"epoch": 0.3837782472312912,
"grad_norm": 48.09440612792969,
"learning_rate": 1.5485128039265986e-06,
"loss": 0.6047,
"num_input_tokens_seen": 18892224,
"step": 5995
},
{
"epoch": 0.3840983291722681,
"grad_norm": 52.94186782836914,
"learning_rate": 1.547578182987459e-06,
"loss": 0.445,
"num_input_tokens_seen": 18907008,
"step": 6000
},
{
"epoch": 0.384418411113245,
"grad_norm": 22.386451721191406,
"learning_rate": 1.5466428783905286e-06,
"loss": 0.2856,
"num_input_tokens_seen": 18922368,
"step": 6005
},
{
"epoch": 0.38473849305422186,
"grad_norm": 33.801048278808594,
"learning_rate": 1.5457068913035463e-06,
"loss": 0.4418,
"num_input_tokens_seen": 18937536,
"step": 6010
},
{
"epoch": 0.38505857499519874,
"grad_norm": 40.13835906982422,
"learning_rate": 1.544770222895103e-06,
"loss": 0.5024,
"num_input_tokens_seen": 18954048,
"step": 6015
},
{
"epoch": 0.3853786569361757,
"grad_norm": 30.525466918945312,
"learning_rate": 1.5438328743346398e-06,
"loss": 0.5102,
"num_input_tokens_seen": 18969472,
"step": 6020
},
{
"epoch": 0.38569873887715256,
"grad_norm": 24.051631927490234,
"learning_rate": 1.5428948467924478e-06,
"loss": 0.4192,
"num_input_tokens_seen": 18983872,
"step": 6025
},
{
"epoch": 0.38601882081812944,
"grad_norm": 21.4184513092041,
"learning_rate": 1.5419561414396656e-06,
"loss": 0.3268,
"num_input_tokens_seen": 18999360,
"step": 6030
},
{
"epoch": 0.3863389027591063,
"grad_norm": 24.729501724243164,
"learning_rate": 1.541016759448277e-06,
"loss": 0.4969,
"num_input_tokens_seen": 19015424,
"step": 6035
},
{
"epoch": 0.3866589847000832,
"grad_norm": 32.38375473022461,
"learning_rate": 1.5400767019911124e-06,
"loss": 0.3775,
"num_input_tokens_seen": 19031616,
"step": 6040
},
{
"epoch": 0.38697906664106013,
"grad_norm": 31.431982040405273,
"learning_rate": 1.539135970241844e-06,
"loss": 0.4886,
"num_input_tokens_seen": 19047040,
"step": 6045
},
{
"epoch": 0.387299148582037,
"grad_norm": 53.52016067504883,
"learning_rate": 1.5381945653749866e-06,
"loss": 0.4842,
"num_input_tokens_seen": 19062848,
"step": 6050
},
{
"epoch": 0.3876192305230139,
"grad_norm": 81.80306243896484,
"learning_rate": 1.5372524885658952e-06,
"loss": 0.5516,
"num_input_tokens_seen": 19078976,
"step": 6055
},
{
"epoch": 0.38793931246399077,
"grad_norm": 26.691001892089844,
"learning_rate": 1.5363097409907638e-06,
"loss": 0.3732,
"num_input_tokens_seen": 19093632,
"step": 6060
},
{
"epoch": 0.38825939440496765,
"grad_norm": 24.735050201416016,
"learning_rate": 1.535366323826624e-06,
"loss": 0.3583,
"num_input_tokens_seen": 19109056,
"step": 6065
},
{
"epoch": 0.3885794763459446,
"grad_norm": 48.98762130737305,
"learning_rate": 1.534422238251343e-06,
"loss": 0.3623,
"num_input_tokens_seen": 19124544,
"step": 6070
},
{
"epoch": 0.38889955828692147,
"grad_norm": 35.63262939453125,
"learning_rate": 1.5334774854436223e-06,
"loss": 0.3844,
"num_input_tokens_seen": 19140480,
"step": 6075
},
{
"epoch": 0.38921964022789834,
"grad_norm": 37.14643859863281,
"learning_rate": 1.5325320665829975e-06,
"loss": 0.378,
"num_input_tokens_seen": 19156736,
"step": 6080
},
{
"epoch": 0.3895397221688752,
"grad_norm": 34.23735809326172,
"learning_rate": 1.5315859828498352e-06,
"loss": 0.4624,
"num_input_tokens_seen": 19171520,
"step": 6085
},
{
"epoch": 0.3898598041098521,
"grad_norm": 29.676471710205078,
"learning_rate": 1.5306392354253316e-06,
"loss": 0.5057,
"num_input_tokens_seen": 19187136,
"step": 6090
},
{
"epoch": 0.39017988605082904,
"grad_norm": 23.943275451660156,
"learning_rate": 1.5296918254915123e-06,
"loss": 0.4389,
"num_input_tokens_seen": 19201856,
"step": 6095
},
{
"epoch": 0.3904999679918059,
"grad_norm": 27.726776123046875,
"learning_rate": 1.5287437542312296e-06,
"loss": 0.3827,
"num_input_tokens_seen": 19216704,
"step": 6100
},
{
"epoch": 0.3908200499327828,
"grad_norm": 55.275352478027344,
"learning_rate": 1.5277950228281614e-06,
"loss": 0.5423,
"num_input_tokens_seen": 19233408,
"step": 6105
},
{
"epoch": 0.3911401318737597,
"grad_norm": 27.080801010131836,
"learning_rate": 1.52684563246681e-06,
"loss": 0.3617,
"num_input_tokens_seen": 19250048,
"step": 6110
},
{
"epoch": 0.39146021381473656,
"grad_norm": 18.170150756835938,
"learning_rate": 1.5258955843325015e-06,
"loss": 0.4241,
"num_input_tokens_seen": 19266560,
"step": 6115
},
{
"epoch": 0.39178029575571344,
"grad_norm": 60.83952713012695,
"learning_rate": 1.5249448796113804e-06,
"loss": 0.5018,
"num_input_tokens_seen": 19281408,
"step": 6120
},
{
"epoch": 0.39210037769669037,
"grad_norm": 48.25818634033203,
"learning_rate": 1.5239935194904141e-06,
"loss": 0.4797,
"num_input_tokens_seen": 19296384,
"step": 6125
},
{
"epoch": 0.39242045963766725,
"grad_norm": 24.223154067993164,
"learning_rate": 1.523041505157386e-06,
"loss": 0.3946,
"num_input_tokens_seen": 19312000,
"step": 6130
},
{
"epoch": 0.39274054157864413,
"grad_norm": 27.672351837158203,
"learning_rate": 1.5220888378008977e-06,
"loss": 0.395,
"num_input_tokens_seen": 19327488,
"step": 6135
},
{
"epoch": 0.393060623519621,
"grad_norm": 23.515743255615234,
"learning_rate": 1.5211355186103654e-06,
"loss": 0.4748,
"num_input_tokens_seen": 19342080,
"step": 6140
},
{
"epoch": 0.3933807054605979,
"grad_norm": 54.429962158203125,
"learning_rate": 1.5201815487760192e-06,
"loss": 0.4435,
"num_input_tokens_seen": 19358336,
"step": 6145
},
{
"epoch": 0.3937007874015748,
"grad_norm": 88.29194641113281,
"learning_rate": 1.5192269294889019e-06,
"loss": 0.5032,
"num_input_tokens_seen": 19373376,
"step": 6150
},
{
"epoch": 0.3940208693425517,
"grad_norm": 29.944011688232422,
"learning_rate": 1.5182716619408666e-06,
"loss": 0.4021,
"num_input_tokens_seen": 19388608,
"step": 6155
},
{
"epoch": 0.3943409512835286,
"grad_norm": 32.160797119140625,
"learning_rate": 1.5173157473245764e-06,
"loss": 0.5383,
"num_input_tokens_seen": 19403264,
"step": 6160
},
{
"epoch": 0.39466103322450546,
"grad_norm": 38.58219528198242,
"learning_rate": 1.5163591868335016e-06,
"loss": 0.4397,
"num_input_tokens_seen": 19418816,
"step": 6165
},
{
"epoch": 0.39498111516548234,
"grad_norm": 41.530364990234375,
"learning_rate": 1.515401981661919e-06,
"loss": 0.5856,
"num_input_tokens_seen": 19435392,
"step": 6170
},
{
"epoch": 0.3953011971064593,
"grad_norm": 33.935325622558594,
"learning_rate": 1.514444133004911e-06,
"loss": 0.4567,
"num_input_tokens_seen": 19450048,
"step": 6175
},
{
"epoch": 0.39562127904743616,
"grad_norm": 31.931150436401367,
"learning_rate": 1.5134856420583631e-06,
"loss": 0.465,
"num_input_tokens_seen": 19466368,
"step": 6180
},
{
"epoch": 0.39594136098841304,
"grad_norm": 25.44246482849121,
"learning_rate": 1.5125265100189614e-06,
"loss": 0.34,
"num_input_tokens_seen": 19482624,
"step": 6185
},
{
"epoch": 0.3962614429293899,
"grad_norm": 32.58120346069336,
"learning_rate": 1.5115667380841948e-06,
"loss": 0.5382,
"num_input_tokens_seen": 19498048,
"step": 6190
},
{
"epoch": 0.3965815248703668,
"grad_norm": 18.999216079711914,
"learning_rate": 1.510606327452349e-06,
"loss": 0.4413,
"num_input_tokens_seen": 19515264,
"step": 6195
},
{
"epoch": 0.3969016068113437,
"grad_norm": 35.836219787597656,
"learning_rate": 1.5096452793225082e-06,
"loss": 0.4267,
"num_input_tokens_seen": 19533056,
"step": 6200
},
{
"epoch": 0.3972216887523206,
"grad_norm": 26.90237808227539,
"learning_rate": 1.5086835948945522e-06,
"loss": 0.3994,
"num_input_tokens_seen": 19548480,
"step": 6205
},
{
"epoch": 0.3975417706932975,
"grad_norm": 30.118810653686523,
"learning_rate": 1.5077212753691556e-06,
"loss": 0.3462,
"num_input_tokens_seen": 19563712,
"step": 6210
},
{
"epoch": 0.39786185263427437,
"grad_norm": 36.46988296508789,
"learning_rate": 1.5067583219477852e-06,
"loss": 0.41,
"num_input_tokens_seen": 19578624,
"step": 6215
},
{
"epoch": 0.39818193457525125,
"grad_norm": 31.342973709106445,
"learning_rate": 1.5057947358327e-06,
"loss": 0.3926,
"num_input_tokens_seen": 19593408,
"step": 6220
},
{
"epoch": 0.39850201651622813,
"grad_norm": 37.588436126708984,
"learning_rate": 1.504830518226948e-06,
"loss": 0.5044,
"num_input_tokens_seen": 19609216,
"step": 6225
},
{
"epoch": 0.39882209845720507,
"grad_norm": 29.45639419555664,
"learning_rate": 1.5038656703343672e-06,
"loss": 0.4468,
"num_input_tokens_seen": 19624896,
"step": 6230
},
{
"epoch": 0.39914218039818194,
"grad_norm": 72.1549072265625,
"learning_rate": 1.5029001933595805e-06,
"loss": 0.5125,
"num_input_tokens_seen": 19640128,
"step": 6235
},
{
"epoch": 0.3994622623391588,
"grad_norm": 34.15262985229492,
"learning_rate": 1.501934088507998e-06,
"loss": 0.3482,
"num_input_tokens_seen": 19655680,
"step": 6240
},
{
"epoch": 0.3997823442801357,
"grad_norm": 35.860618591308594,
"learning_rate": 1.5009673569858126e-06,
"loss": 0.6246,
"num_input_tokens_seen": 19672192,
"step": 6245
},
{
"epoch": 0.4001024262211126,
"grad_norm": 46.368167877197266,
"learning_rate": 1.5e-06,
"loss": 0.534,
"num_input_tokens_seen": 19688896,
"step": 6250
},
{
"epoch": 0.4004225081620895,
"grad_norm": 19.773387908935547,
"learning_rate": 1.4990320187583167e-06,
"loss": 0.3556,
"num_input_tokens_seen": 19704128,
"step": 6255
},
{
"epoch": 0.4004865245502849,
"eval_loss": 0.4279458224773407,
"eval_runtime": 50.6211,
"eval_samples_per_second": 274.312,
"eval_steps_per_second": 34.294,
"num_input_tokens_seen": 19707456,
"step": 6256
},
{
"epoch": 0.4007425901030664,
"grad_norm": 32.513755798339844,
"learning_rate": 1.4980634144692986e-06,
"loss": 0.3913,
"num_input_tokens_seen": 19719744,
"step": 6260
},
{
"epoch": 0.4010626720440433,
"grad_norm": 47.36183547973633,
"learning_rate": 1.4970941883422599e-06,
"loss": 0.3734,
"num_input_tokens_seen": 19736128,
"step": 6265
},
{
"epoch": 0.40138275398502016,
"grad_norm": 26.743701934814453,
"learning_rate": 1.4961243415872901e-06,
"loss": 0.4286,
"num_input_tokens_seen": 19751296,
"step": 6270
},
{
"epoch": 0.40170283592599704,
"grad_norm": 62.38422775268555,
"learning_rate": 1.4951538754152551e-06,
"loss": 0.3958,
"num_input_tokens_seen": 19765888,
"step": 6275
},
{
"epoch": 0.402022917866974,
"grad_norm": 31.93796157836914,
"learning_rate": 1.4941827910377925e-06,
"loss": 0.4227,
"num_input_tokens_seen": 19780864,
"step": 6280
},
{
"epoch": 0.40234299980795085,
"grad_norm": 22.445552825927734,
"learning_rate": 1.4932110896673131e-06,
"loss": 0.3978,
"num_input_tokens_seen": 19796864,
"step": 6285
},
{
"epoch": 0.40266308174892773,
"grad_norm": 29.57168960571289,
"learning_rate": 1.4922387725169973e-06,
"loss": 0.5383,
"num_input_tokens_seen": 19811904,
"step": 6290
},
{
"epoch": 0.4029831636899046,
"grad_norm": 32.471187591552734,
"learning_rate": 1.4912658408007947e-06,
"loss": 0.418,
"num_input_tokens_seen": 19827456,
"step": 6295
},
{
"epoch": 0.4033032456308815,
"grad_norm": 33.78974914550781,
"learning_rate": 1.4902922957334215e-06,
"loss": 0.4194,
"num_input_tokens_seen": 19842496,
"step": 6300
},
{
"epoch": 0.40362332757185837,
"grad_norm": 46.43672561645508,
"learning_rate": 1.4893181385303608e-06,
"loss": 0.4186,
"num_input_tokens_seen": 19858240,
"step": 6305
},
{
"epoch": 0.4039434095128353,
"grad_norm": 34.83802032470703,
"learning_rate": 1.4883433704078584e-06,
"loss": 0.4262,
"num_input_tokens_seen": 19874368,
"step": 6310
},
{
"epoch": 0.4042634914538122,
"grad_norm": 34.20176315307617,
"learning_rate": 1.4873679925829246e-06,
"loss": 0.3986,
"num_input_tokens_seen": 19891904,
"step": 6315
},
{
"epoch": 0.40458357339478906,
"grad_norm": 21.880064010620117,
"learning_rate": 1.4863920062733298e-06,
"loss": 0.4157,
"num_input_tokens_seen": 19907392,
"step": 6320
},
{
"epoch": 0.40490365533576594,
"grad_norm": 48.874141693115234,
"learning_rate": 1.485415412697604e-06,
"loss": 0.3822,
"num_input_tokens_seen": 19922624,
"step": 6325
},
{
"epoch": 0.4052237372767428,
"grad_norm": 34.258758544921875,
"learning_rate": 1.484438213075036e-06,
"loss": 0.4286,
"num_input_tokens_seen": 19939328,
"step": 6330
},
{
"epoch": 0.40554381921771976,
"grad_norm": 43.55635452270508,
"learning_rate": 1.4834604086256713e-06,
"loss": 0.4412,
"num_input_tokens_seen": 19955392,
"step": 6335
},
{
"epoch": 0.40586390115869664,
"grad_norm": 39.21355438232422,
"learning_rate": 1.4824820005703097e-06,
"loss": 0.401,
"num_input_tokens_seen": 19971520,
"step": 6340
},
{
"epoch": 0.4061839830996735,
"grad_norm": 24.532764434814453,
"learning_rate": 1.4815029901305061e-06,
"loss": 0.448,
"num_input_tokens_seen": 19988352,
"step": 6345
},
{
"epoch": 0.4065040650406504,
"grad_norm": 28.755565643310547,
"learning_rate": 1.480523378528565e-06,
"loss": 0.4706,
"num_input_tokens_seen": 20005184,
"step": 6350
},
{
"epoch": 0.4068241469816273,
"grad_norm": 43.57781982421875,
"learning_rate": 1.4795431669875441e-06,
"loss": 0.4379,
"num_input_tokens_seen": 20020800,
"step": 6355
},
{
"epoch": 0.4071442289226042,
"grad_norm": 30.764387130737305,
"learning_rate": 1.478562356731249e-06,
"loss": 0.475,
"num_input_tokens_seen": 20036416,
"step": 6360
},
{
"epoch": 0.4074643108635811,
"grad_norm": 42.592384338378906,
"learning_rate": 1.4775809489842326e-06,
"loss": 0.4608,
"num_input_tokens_seen": 20053184,
"step": 6365
},
{
"epoch": 0.40778439280455797,
"grad_norm": 28.14908790588379,
"learning_rate": 1.4765989449717937e-06,
"loss": 0.3944,
"num_input_tokens_seen": 20069888,
"step": 6370
},
{
"epoch": 0.40810447474553485,
"grad_norm": 62.522220611572266,
"learning_rate": 1.4756163459199763e-06,
"loss": 0.534,
"num_input_tokens_seen": 20085760,
"step": 6375
},
{
"epoch": 0.40842455668651173,
"grad_norm": 34.16120910644531,
"learning_rate": 1.4746331530555665e-06,
"loss": 0.2694,
"num_input_tokens_seen": 20101056,
"step": 6380
},
{
"epoch": 0.4087446386274886,
"grad_norm": 39.74694061279297,
"learning_rate": 1.4736493676060923e-06,
"loss": 0.4114,
"num_input_tokens_seen": 20116352,
"step": 6385
},
{
"epoch": 0.40906472056846555,
"grad_norm": 21.04083824157715,
"learning_rate": 1.4726649907998216e-06,
"loss": 0.3752,
"num_input_tokens_seen": 20131712,
"step": 6390
},
{
"epoch": 0.4093848025094424,
"grad_norm": 33.7485466003418,
"learning_rate": 1.4716800238657599e-06,
"loss": 0.3816,
"num_input_tokens_seen": 20146880,
"step": 6395
},
{
"epoch": 0.4097048844504193,
"grad_norm": 19.112497329711914,
"learning_rate": 1.4706944680336505e-06,
"loss": 0.285,
"num_input_tokens_seen": 20163520,
"step": 6400
},
{
"epoch": 0.4100249663913962,
"grad_norm": 43.30415344238281,
"learning_rate": 1.469708324533971e-06,
"loss": 0.4656,
"num_input_tokens_seen": 20177984,
"step": 6405
},
{
"epoch": 0.41034504833237306,
"grad_norm": 18.54940414428711,
"learning_rate": 1.4687215945979335e-06,
"loss": 0.3425,
"num_input_tokens_seen": 20193472,
"step": 6410
},
{
"epoch": 0.41066513027335,
"grad_norm": 41.684871673583984,
"learning_rate": 1.4677342794574815e-06,
"loss": 0.4557,
"num_input_tokens_seen": 20210624,
"step": 6415
},
{
"epoch": 0.4109852122143269,
"grad_norm": 59.87638854980469,
"learning_rate": 1.4667463803452902e-06,
"loss": 0.4171,
"num_input_tokens_seen": 20226688,
"step": 6420
},
{
"epoch": 0.41130529415530376,
"grad_norm": 43.94029998779297,
"learning_rate": 1.4657578984947627e-06,
"loss": 0.4553,
"num_input_tokens_seen": 20244608,
"step": 6425
},
{
"epoch": 0.41162537609628064,
"grad_norm": 34.1412353515625,
"learning_rate": 1.4647688351400303e-06,
"loss": 0.3597,
"num_input_tokens_seen": 20261184,
"step": 6430
},
{
"epoch": 0.4119454580372575,
"grad_norm": 22.3857421875,
"learning_rate": 1.46377919151595e-06,
"loss": 0.3288,
"num_input_tokens_seen": 20276736,
"step": 6435
},
{
"epoch": 0.41226553997823445,
"grad_norm": 33.48893737792969,
"learning_rate": 1.462788968858104e-06,
"loss": 0.47,
"num_input_tokens_seen": 20293888,
"step": 6440
},
{
"epoch": 0.41258562191921133,
"grad_norm": 20.228092193603516,
"learning_rate": 1.4617981684027966e-06,
"loss": 0.4858,
"num_input_tokens_seen": 20309696,
"step": 6445
},
{
"epoch": 0.4129057038601882,
"grad_norm": 19.324357986450195,
"learning_rate": 1.4608067913870536e-06,
"loss": 0.3958,
"num_input_tokens_seen": 20325632,
"step": 6450
},
{
"epoch": 0.4132257858011651,
"grad_norm": 24.772836685180664,
"learning_rate": 1.4598148390486213e-06,
"loss": 0.3994,
"num_input_tokens_seen": 20341888,
"step": 6455
},
{
"epoch": 0.41354586774214197,
"grad_norm": 30.82358741760254,
"learning_rate": 1.4588223126259639e-06,
"loss": 0.5083,
"num_input_tokens_seen": 20358656,
"step": 6460
},
{
"epoch": 0.4138659496831189,
"grad_norm": 15.612091064453125,
"learning_rate": 1.4578292133582615e-06,
"loss": 0.3307,
"num_input_tokens_seen": 20372864,
"step": 6465
},
{
"epoch": 0.4141860316240958,
"grad_norm": 28.77939224243164,
"learning_rate": 1.456835542485411e-06,
"loss": 0.3876,
"num_input_tokens_seen": 20387840,
"step": 6470
},
{
"epoch": 0.41450611356507266,
"grad_norm": 34.10711669921875,
"learning_rate": 1.4558413012480215e-06,
"loss": 0.4136,
"num_input_tokens_seen": 20404736,
"step": 6475
},
{
"epoch": 0.41482619550604954,
"grad_norm": 39.10059356689453,
"learning_rate": 1.4548464908874156e-06,
"loss": 0.5707,
"num_input_tokens_seen": 20422848,
"step": 6480
},
{
"epoch": 0.4151462774470264,
"grad_norm": 31.53403091430664,
"learning_rate": 1.4538511126456255e-06,
"loss": 0.4077,
"num_input_tokens_seen": 20438016,
"step": 6485
},
{
"epoch": 0.4154663593880033,
"grad_norm": 54.74279022216797,
"learning_rate": 1.452855167765392e-06,
"loss": 0.5888,
"num_input_tokens_seen": 20454464,
"step": 6490
},
{
"epoch": 0.41578644132898024,
"grad_norm": 25.2100887298584,
"learning_rate": 1.4518586574901647e-06,
"loss": 0.4553,
"num_input_tokens_seen": 20470464,
"step": 6495
},
{
"epoch": 0.4161065232699571,
"grad_norm": 33.02887725830078,
"learning_rate": 1.450861583064098e-06,
"loss": 0.4639,
"num_input_tokens_seen": 20485696,
"step": 6500
},
{
"epoch": 0.416426605210934,
"grad_norm": 24.64836883544922,
"learning_rate": 1.4498639457320515e-06,
"loss": 0.352,
"num_input_tokens_seen": 20500608,
"step": 6505
},
{
"epoch": 0.4167466871519109,
"grad_norm": 35.120365142822266,
"learning_rate": 1.4488657467395865e-06,
"loss": 0.4715,
"num_input_tokens_seen": 20515776,
"step": 6510
},
{
"epoch": 0.41706676909288776,
"grad_norm": 41.49770736694336,
"learning_rate": 1.4478669873329663e-06,
"loss": 0.5086,
"num_input_tokens_seen": 20531456,
"step": 6515
},
{
"epoch": 0.4173868510338647,
"grad_norm": 32.16014099121094,
"learning_rate": 1.4468676687591536e-06,
"loss": 0.3953,
"num_input_tokens_seen": 20547200,
"step": 6520
},
{
"epoch": 0.41770693297484157,
"grad_norm": 29.224876403808594,
"learning_rate": 1.4458677922658104e-06,
"loss": 0.4326,
"num_input_tokens_seen": 20562560,
"step": 6525
},
{
"epoch": 0.41802701491581845,
"grad_norm": 18.81861686706543,
"learning_rate": 1.444867359101293e-06,
"loss": 0.2884,
"num_input_tokens_seen": 20577344,
"step": 6530
},
{
"epoch": 0.41834709685679533,
"grad_norm": 35.77077865600586,
"learning_rate": 1.4438663705146545e-06,
"loss": 0.3541,
"num_input_tokens_seen": 20593088,
"step": 6535
},
{
"epoch": 0.4186671787977722,
"grad_norm": 28.87957191467285,
"learning_rate": 1.442864827755641e-06,
"loss": 0.3645,
"num_input_tokens_seen": 20609792,
"step": 6540
},
{
"epoch": 0.41898726073874915,
"grad_norm": 20.035581588745117,
"learning_rate": 1.4418627320746901e-06,
"loss": 0.4507,
"num_input_tokens_seen": 20625280,
"step": 6545
},
{
"epoch": 0.419307342679726,
"grad_norm": 32.727542877197266,
"learning_rate": 1.4408600847229304e-06,
"loss": 0.3912,
"num_input_tokens_seen": 20641984,
"step": 6550
},
{
"epoch": 0.4196274246207029,
"grad_norm": 36.26650619506836,
"learning_rate": 1.4398568869521782e-06,
"loss": 0.5483,
"num_input_tokens_seen": 20658240,
"step": 6555
},
{
"epoch": 0.4199475065616798,
"grad_norm": 32.617122650146484,
"learning_rate": 1.4388531400149384e-06,
"loss": 0.3603,
"num_input_tokens_seen": 20673408,
"step": 6560
},
{
"epoch": 0.42026758850265666,
"grad_norm": 44.918235778808594,
"learning_rate": 1.4378488451644007e-06,
"loss": 0.3865,
"num_input_tokens_seen": 20688960,
"step": 6565
},
{
"epoch": 0.42058767044363354,
"grad_norm": 27.99346351623535,
"learning_rate": 1.4368440036544386e-06,
"loss": 0.4216,
"num_input_tokens_seen": 20704768,
"step": 6570
},
{
"epoch": 0.4209077523846105,
"grad_norm": 41.81321716308594,
"learning_rate": 1.435838616739609e-06,
"loss": 0.4157,
"num_input_tokens_seen": 20719808,
"step": 6575
},
{
"epoch": 0.42122783432558736,
"grad_norm": 35.78312683105469,
"learning_rate": 1.4348326856751493e-06,
"loss": 0.5319,
"num_input_tokens_seen": 20735680,
"step": 6580
},
{
"epoch": 0.42154791626656424,
"grad_norm": 27.771835327148438,
"learning_rate": 1.433826211716976e-06,
"loss": 0.3379,
"num_input_tokens_seen": 20750144,
"step": 6585
},
{
"epoch": 0.4218679982075411,
"grad_norm": 30.699609756469727,
"learning_rate": 1.4328191961216835e-06,
"loss": 0.3988,
"num_input_tokens_seen": 20766016,
"step": 6590
},
{
"epoch": 0.422188080148518,
"grad_norm": 52.447792053222656,
"learning_rate": 1.4318116401465427e-06,
"loss": 0.4818,
"num_input_tokens_seen": 20782720,
"step": 6595
},
{
"epoch": 0.42250816208949493,
"grad_norm": 29.143312454223633,
"learning_rate": 1.430803545049499e-06,
"loss": 0.3925,
"num_input_tokens_seen": 20798208,
"step": 6600
},
{
"epoch": 0.4228282440304718,
"grad_norm": 18.094640731811523,
"learning_rate": 1.4297949120891716e-06,
"loss": 0.5891,
"num_input_tokens_seen": 20813056,
"step": 6605
},
{
"epoch": 0.4231483259714487,
"grad_norm": 37.29645919799805,
"learning_rate": 1.4287857425248497e-06,
"loss": 0.4266,
"num_input_tokens_seen": 20828800,
"step": 6610
},
{
"epoch": 0.42346840791242557,
"grad_norm": 24.383594512939453,
"learning_rate": 1.427776037616494e-06,
"loss": 0.4956,
"num_input_tokens_seen": 20844736,
"step": 6615
},
{
"epoch": 0.42378848985340245,
"grad_norm": 32.46372604370117,
"learning_rate": 1.4267657986247326e-06,
"loss": 0.3504,
"num_input_tokens_seen": 20860672,
"step": 6620
},
{
"epoch": 0.4241085717943794,
"grad_norm": 35.29887771606445,
"learning_rate": 1.425755026810861e-06,
"loss": 0.3666,
"num_input_tokens_seen": 20877184,
"step": 6625
},
{
"epoch": 0.42442865373535626,
"grad_norm": 58.47334671020508,
"learning_rate": 1.4247437234368394e-06,
"loss": 0.3965,
"num_input_tokens_seen": 20894208,
"step": 6630
},
{
"epoch": 0.42474873567633314,
"grad_norm": 38.44792175292969,
"learning_rate": 1.423731889765292e-06,
"loss": 0.407,
"num_input_tokens_seen": 20909696,
"step": 6635
},
{
"epoch": 0.42506881761731,
"grad_norm": 16.839555740356445,
"learning_rate": 1.422719527059505e-06,
"loss": 0.3465,
"num_input_tokens_seen": 20926016,
"step": 6640
},
{
"epoch": 0.4253888995582869,
"grad_norm": 21.848594665527344,
"learning_rate": 1.4217066365834253e-06,
"loss": 0.362,
"num_input_tokens_seen": 20941440,
"step": 6645
},
{
"epoch": 0.42570898149926384,
"grad_norm": 35.94594955444336,
"learning_rate": 1.4206932196016586e-06,
"loss": 0.4566,
"num_input_tokens_seen": 20956352,
"step": 6650
},
{
"epoch": 0.4260290634402407,
"grad_norm": 51.25725173950195,
"learning_rate": 1.4196792773794672e-06,
"loss": 0.3947,
"num_input_tokens_seen": 20973056,
"step": 6655
},
{
"epoch": 0.4263491453812176,
"grad_norm": 39.09233474731445,
"learning_rate": 1.418664811182771e-06,
"loss": 0.4406,
"num_input_tokens_seen": 20989248,
"step": 6660
},
{
"epoch": 0.4266692273221945,
"grad_norm": 40.44568634033203,
"learning_rate": 1.417649822278142e-06,
"loss": 0.4946,
"num_input_tokens_seen": 21004096,
"step": 6665
},
{
"epoch": 0.42698930926317136,
"grad_norm": 25.228567123413086,
"learning_rate": 1.4166343119328064e-06,
"loss": 0.489,
"num_input_tokens_seen": 21020224,
"step": 6670
},
{
"epoch": 0.42730939120414824,
"grad_norm": 30.947425842285156,
"learning_rate": 1.4156182814146404e-06,
"loss": 0.466,
"num_input_tokens_seen": 21035264,
"step": 6675
},
{
"epoch": 0.42762947314512517,
"grad_norm": 19.565750122070312,
"learning_rate": 1.4146017319921701e-06,
"loss": 0.354,
"num_input_tokens_seen": 21051904,
"step": 6680
},
{
"epoch": 0.42794955508610205,
"grad_norm": 30.911516189575195,
"learning_rate": 1.4135846649345695e-06,
"loss": 0.4117,
"num_input_tokens_seen": 21069504,
"step": 6685
},
{
"epoch": 0.42826963702707893,
"grad_norm": 30.636323928833008,
"learning_rate": 1.4125670815116589e-06,
"loss": 0.4259,
"num_input_tokens_seen": 21084288,
"step": 6690
},
{
"epoch": 0.4285897189680558,
"grad_norm": 26.054248809814453,
"learning_rate": 1.4115489829939025e-06,
"loss": 0.2933,
"num_input_tokens_seen": 21100544,
"step": 6695
},
{
"epoch": 0.4289098009090327,
"grad_norm": 29.667612075805664,
"learning_rate": 1.4105303706524093e-06,
"loss": 0.4315,
"num_input_tokens_seen": 21116608,
"step": 6700
},
{
"epoch": 0.4292298828500096,
"grad_norm": 49.060630798339844,
"learning_rate": 1.4095112457589276e-06,
"loss": 0.6147,
"num_input_tokens_seen": 21131776,
"step": 6705
},
{
"epoch": 0.4295499647909865,
"grad_norm": 31.449359893798828,
"learning_rate": 1.4084916095858477e-06,
"loss": 0.4185,
"num_input_tokens_seen": 21146368,
"step": 6710
},
{
"epoch": 0.4298700467319634,
"grad_norm": 75.84903717041016,
"learning_rate": 1.407471463406197e-06,
"loss": 0.509,
"num_input_tokens_seen": 21162368,
"step": 6715
},
{
"epoch": 0.43019012867294026,
"grad_norm": 31.45880699157715,
"learning_rate": 1.4064508084936399e-06,
"loss": 0.4404,
"num_input_tokens_seen": 21179008,
"step": 6720
},
{
"epoch": 0.43051021061391714,
"grad_norm": 33.05830383300781,
"learning_rate": 1.405429646122476e-06,
"loss": 0.569,
"num_input_tokens_seen": 21196160,
"step": 6725
},
{
"epoch": 0.4308302925548941,
"grad_norm": 21.873218536376953,
"learning_rate": 1.4044079775676392e-06,
"loss": 0.5342,
"num_input_tokens_seen": 21212032,
"step": 6730
},
{
"epoch": 0.43115037449587096,
"grad_norm": 21.718421936035156,
"learning_rate": 1.4033858041046936e-06,
"loss": 0.3587,
"num_input_tokens_seen": 21230272,
"step": 6735
},
{
"epoch": 0.43147045643684784,
"grad_norm": 25.18842124938965,
"learning_rate": 1.4023631270098352e-06,
"loss": 0.3928,
"num_input_tokens_seen": 21245760,
"step": 6740
},
{
"epoch": 0.4317905383778247,
"grad_norm": 29.12677764892578,
"learning_rate": 1.4013399475598888e-06,
"loss": 0.3446,
"num_input_tokens_seen": 21260992,
"step": 6745
},
{
"epoch": 0.4321106203188016,
"grad_norm": 22.8419189453125,
"learning_rate": 1.4003162670323056e-06,
"loss": 0.2819,
"num_input_tokens_seen": 21275136,
"step": 6750
},
{
"epoch": 0.4324307022597785,
"grad_norm": 73.31881713867188,
"learning_rate": 1.3992920867051627e-06,
"loss": 0.5416,
"num_input_tokens_seen": 21290560,
"step": 6755
},
{
"epoch": 0.4327507842007554,
"grad_norm": 42.87895202636719,
"learning_rate": 1.3982674078571614e-06,
"loss": 0.3552,
"num_input_tokens_seen": 21305536,
"step": 6760
},
{
"epoch": 0.4330708661417323,
"grad_norm": 27.110273361206055,
"learning_rate": 1.3972422317676252e-06,
"loss": 0.3758,
"num_input_tokens_seen": 21320576,
"step": 6765
},
{
"epoch": 0.43339094808270917,
"grad_norm": 18.68414306640625,
"learning_rate": 1.3962165597164985e-06,
"loss": 0.3698,
"num_input_tokens_seen": 21335680,
"step": 6770
},
{
"epoch": 0.43371103002368605,
"grad_norm": 28.053197860717773,
"learning_rate": 1.395190392984345e-06,
"loss": 0.3519,
"num_input_tokens_seen": 21351808,
"step": 6775
},
{
"epoch": 0.43403111196466293,
"grad_norm": 24.955060958862305,
"learning_rate": 1.3941637328523452e-06,
"loss": 0.4522,
"num_input_tokens_seen": 21366464,
"step": 6780
},
{
"epoch": 0.43435119390563987,
"grad_norm": 38.06902313232422,
"learning_rate": 1.3931365806022978e-06,
"loss": 0.3038,
"num_input_tokens_seen": 21383296,
"step": 6785
},
{
"epoch": 0.43467127584661674,
"grad_norm": 37.151485443115234,
"learning_rate": 1.3921089375166131e-06,
"loss": 0.3111,
"num_input_tokens_seen": 21399616,
"step": 6790
},
{
"epoch": 0.4349913577875936,
"grad_norm": 21.17021942138672,
"learning_rate": 1.391080804878316e-06,
"loss": 0.4455,
"num_input_tokens_seen": 21414848,
"step": 6795
},
{
"epoch": 0.4353114397285705,
"grad_norm": 60.63893508911133,
"learning_rate": 1.3900521839710427e-06,
"loss": 0.3804,
"num_input_tokens_seen": 21430144,
"step": 6800
},
{
"epoch": 0.4356315216695474,
"grad_norm": 24.321428298950195,
"learning_rate": 1.3890230760790373e-06,
"loss": 0.3503,
"num_input_tokens_seen": 21445248,
"step": 6805
},
{
"epoch": 0.4359516036105243,
"grad_norm": 102.32804870605469,
"learning_rate": 1.3879934824871544e-06,
"loss": 0.598,
"num_input_tokens_seen": 21460544,
"step": 6810
},
{
"epoch": 0.4362716855515012,
"grad_norm": 30.45060920715332,
"learning_rate": 1.3869634044808526e-06,
"loss": 0.5102,
"num_input_tokens_seen": 21476224,
"step": 6815
},
{
"epoch": 0.4365917674924781,
"grad_norm": 40.25202560424805,
"learning_rate": 1.3859328433461971e-06,
"loss": 0.6093,
"num_input_tokens_seen": 21491712,
"step": 6820
},
{
"epoch": 0.43691184943345496,
"grad_norm": 65.34822082519531,
"learning_rate": 1.3849018003698553e-06,
"loss": 0.5794,
"num_input_tokens_seen": 21508928,
"step": 6825
},
{
"epoch": 0.43723193137443184,
"grad_norm": 38.05977249145508,
"learning_rate": 1.3838702768390964e-06,
"loss": 0.3975,
"num_input_tokens_seen": 21523648,
"step": 6830
},
{
"epoch": 0.43755201331540877,
"grad_norm": 32.886993408203125,
"learning_rate": 1.38283827404179e-06,
"loss": 0.474,
"num_input_tokens_seen": 21539264,
"step": 6835
},
{
"epoch": 0.43787209525638565,
"grad_norm": 48.08152389526367,
"learning_rate": 1.381805793266403e-06,
"loss": 0.3763,
"num_input_tokens_seen": 21555520,
"step": 6840
},
{
"epoch": 0.43819217719736253,
"grad_norm": 34.62569808959961,
"learning_rate": 1.3807728358020009e-06,
"loss": 0.4524,
"num_input_tokens_seen": 21570112,
"step": 6845
},
{
"epoch": 0.4385122591383394,
"grad_norm": 51.81264114379883,
"learning_rate": 1.3797394029382416e-06,
"loss": 0.3372,
"num_input_tokens_seen": 21584768,
"step": 6850
},
{
"epoch": 0.4388323410793163,
"grad_norm": 23.349763870239258,
"learning_rate": 1.37870549596538e-06,
"loss": 0.3008,
"num_input_tokens_seen": 21599872,
"step": 6855
},
{
"epoch": 0.43915242302029317,
"grad_norm": 21.19732093811035,
"learning_rate": 1.3776711161742595e-06,
"loss": 0.5217,
"num_input_tokens_seen": 21615808,
"step": 6860
},
{
"epoch": 0.4394725049612701,
"grad_norm": 28.972196578979492,
"learning_rate": 1.3766362648563166e-06,
"loss": 0.4772,
"num_input_tokens_seen": 21630656,
"step": 6865
},
{
"epoch": 0.439792586902247,
"grad_norm": 65.14180755615234,
"learning_rate": 1.3756009433035744e-06,
"loss": 0.4123,
"num_input_tokens_seen": 21646976,
"step": 6870
},
{
"epoch": 0.44011266884322386,
"grad_norm": 28.106571197509766,
"learning_rate": 1.3745651528086447e-06,
"loss": 0.5783,
"num_input_tokens_seen": 21665024,
"step": 6875
},
{
"epoch": 0.44043275078420074,
"grad_norm": 16.411785125732422,
"learning_rate": 1.373528894664724e-06,
"loss": 0.4489,
"num_input_tokens_seen": 21680128,
"step": 6880
},
{
"epoch": 0.4407528327251776,
"grad_norm": 26.3304386138916,
"learning_rate": 1.3724921701655924e-06,
"loss": 0.3466,
"num_input_tokens_seen": 21695808,
"step": 6885
},
{
"epoch": 0.44107291466615456,
"grad_norm": 15.10905647277832,
"learning_rate": 1.3714549806056125e-06,
"loss": 0.3186,
"num_input_tokens_seen": 21711936,
"step": 6890
},
{
"epoch": 0.44139299660713144,
"grad_norm": 45.09797668457031,
"learning_rate": 1.3704173272797283e-06,
"loss": 0.4162,
"num_input_tokens_seen": 21727488,
"step": 6895
},
{
"epoch": 0.4417130785481083,
"grad_norm": 38.701011657714844,
"learning_rate": 1.3693792114834619e-06,
"loss": 0.4556,
"num_input_tokens_seen": 21745280,
"step": 6900
},
{
"epoch": 0.4420331604890852,
"grad_norm": 26.589208602905273,
"learning_rate": 1.3683406345129129e-06,
"loss": 0.467,
"num_input_tokens_seen": 21760000,
"step": 6905
},
{
"epoch": 0.4423532424300621,
"grad_norm": 25.628881454467773,
"learning_rate": 1.3673015976647567e-06,
"loss": 0.3971,
"num_input_tokens_seen": 21775232,
"step": 6910
},
{
"epoch": 0.442673324371039,
"grad_norm": 37.34769821166992,
"learning_rate": 1.3662621022362435e-06,
"loss": 0.3979,
"num_input_tokens_seen": 21790656,
"step": 6915
},
{
"epoch": 0.4429934063120159,
"grad_norm": 52.09904479980469,
"learning_rate": 1.3652221495251952e-06,
"loss": 0.462,
"num_input_tokens_seen": 21806336,
"step": 6920
},
{
"epoch": 0.44331348825299277,
"grad_norm": 27.45566749572754,
"learning_rate": 1.3641817408300049e-06,
"loss": 0.3242,
"num_input_tokens_seen": 21823744,
"step": 6925
},
{
"epoch": 0.44363357019396965,
"grad_norm": 31.710731506347656,
"learning_rate": 1.3631408774496352e-06,
"loss": 0.559,
"num_input_tokens_seen": 21839104,
"step": 6930
},
{
"epoch": 0.44395365213494653,
"grad_norm": 28.635961532592773,
"learning_rate": 1.3620995606836165e-06,
"loss": 0.3616,
"num_input_tokens_seen": 21854528,
"step": 6935
},
{
"epoch": 0.4442737340759234,
"grad_norm": 58.52728271484375,
"learning_rate": 1.3610577918320446e-06,
"loss": 0.6013,
"num_input_tokens_seen": 21870592,
"step": 6940
},
{
"epoch": 0.44459381601690035,
"grad_norm": 47.7674674987793,
"learning_rate": 1.3600155721955802e-06,
"loss": 0.3823,
"num_input_tokens_seen": 21885696,
"step": 6945
},
{
"epoch": 0.4449138979578772,
"grad_norm": 25.71700096130371,
"learning_rate": 1.3589729030754468e-06,
"loss": 0.4017,
"num_input_tokens_seen": 21901248,
"step": 6950
},
{
"epoch": 0.4452339798988541,
"grad_norm": 30.66419219970703,
"learning_rate": 1.3579297857734293e-06,
"loss": 0.4293,
"num_input_tokens_seen": 21916352,
"step": 6955
},
{
"epoch": 0.445554061839831,
"grad_norm": 18.087440490722656,
"learning_rate": 1.3568862215918717e-06,
"loss": 0.3354,
"num_input_tokens_seen": 21931072,
"step": 6960
},
{
"epoch": 0.44587414378080786,
"grad_norm": 33.805599212646484,
"learning_rate": 1.3558422118336762e-06,
"loss": 0.5014,
"num_input_tokens_seen": 21946752,
"step": 6965
},
{
"epoch": 0.4461942257217848,
"grad_norm": 39.23421096801758,
"learning_rate": 1.354797757802301e-06,
"loss": 0.4669,
"num_input_tokens_seen": 21962176,
"step": 6970
},
{
"epoch": 0.4465143076627617,
"grad_norm": 18.45398712158203,
"learning_rate": 1.3537528608017596e-06,
"loss": 0.3986,
"num_input_tokens_seen": 21978496,
"step": 6975
},
{
"epoch": 0.44683438960373856,
"grad_norm": 24.98866844177246,
"learning_rate": 1.352707522136618e-06,
"loss": 0.3989,
"num_input_tokens_seen": 21992576,
"step": 6980
},
{
"epoch": 0.44715447154471544,
"grad_norm": 18.77157211303711,
"learning_rate": 1.3516617431119934e-06,
"loss": 0.3987,
"num_input_tokens_seen": 22008000,
"step": 6985
},
{
"epoch": 0.4474745534856923,
"grad_norm": 35.73682403564453,
"learning_rate": 1.350615525033554e-06,
"loss": 0.5453,
"num_input_tokens_seen": 22022976,
"step": 6990
},
{
"epoch": 0.44779463542666925,
"grad_norm": 28.581218719482422,
"learning_rate": 1.3495688692075144e-06,
"loss": 0.4055,
"num_input_tokens_seen": 22038144,
"step": 6995
},
{
"epoch": 0.44811471736764613,
"grad_norm": 32.27814483642578,
"learning_rate": 1.3485217769406376e-06,
"loss": 0.35,
"num_input_tokens_seen": 22054016,
"step": 7000
},
{
"epoch": 0.448434799308623,
"grad_norm": 24.271724700927734,
"learning_rate": 1.3474742495402303e-06,
"loss": 0.3627,
"num_input_tokens_seen": 22073920,
"step": 7005
},
{
"epoch": 0.4487548812495999,
"grad_norm": 53.558128356933594,
"learning_rate": 1.3464262883141425e-06,
"loss": 0.4295,
"num_input_tokens_seen": 22089728,
"step": 7010
},
{
"epoch": 0.44907496319057677,
"grad_norm": 36.53765869140625,
"learning_rate": 1.3453778945707663e-06,
"loss": 0.5883,
"num_input_tokens_seen": 22105344,
"step": 7015
},
{
"epoch": 0.4493950451315537,
"grad_norm": 54.6480712890625,
"learning_rate": 1.3443290696190332e-06,
"loss": 0.4596,
"num_input_tokens_seen": 22121792,
"step": 7020
},
{
"epoch": 0.4497151270725306,
"grad_norm": 24.6829776763916,
"learning_rate": 1.343279814768414e-06,
"loss": 0.4175,
"num_input_tokens_seen": 22136128,
"step": 7025
},
{
"epoch": 0.45003520901350746,
"grad_norm": 23.87900733947754,
"learning_rate": 1.3422301313289156e-06,
"loss": 0.3849,
"num_input_tokens_seen": 22151936,
"step": 7030
},
{
"epoch": 0.45035529095448434,
"grad_norm": 21.608688354492188,
"learning_rate": 1.34118002061108e-06,
"loss": 0.3775,
"num_input_tokens_seen": 22168128,
"step": 7035
},
{
"epoch": 0.4505473401190705,
"eval_loss": 0.43633610010147095,
"eval_runtime": 50.6412,
"eval_samples_per_second": 274.203,
"eval_steps_per_second": 34.28,
"num_input_tokens_seen": 22178432,
"step": 7038
},
{
"epoch": 0.4506753728954612,
"grad_norm": 43.05158615112305,
"learning_rate": 1.3401294839259828e-06,
"loss": 0.4432,
"num_input_tokens_seen": 22184512,
"step": 7040
},
{
"epoch": 0.4509954548364381,
"grad_norm": 34.34581756591797,
"learning_rate": 1.3390785225852312e-06,
"loss": 0.5428,
"num_input_tokens_seen": 22199872,
"step": 7045
},
{
"epoch": 0.45131553677741504,
"grad_norm": 20.483835220336914,
"learning_rate": 1.3380271379009631e-06,
"loss": 0.444,
"num_input_tokens_seen": 22216960,
"step": 7050
},
{
"epoch": 0.4516356187183919,
"grad_norm": 20.620986938476562,
"learning_rate": 1.3369753311858442e-06,
"loss": 0.2645,
"num_input_tokens_seen": 22231488,
"step": 7055
},
{
"epoch": 0.4519557006593688,
"grad_norm": 27.00153350830078,
"learning_rate": 1.3359231037530682e-06,
"loss": 0.4597,
"num_input_tokens_seen": 22246976,
"step": 7060
},
{
"epoch": 0.4522757826003457,
"grad_norm": 17.260738372802734,
"learning_rate": 1.3348704569163527e-06,
"loss": 0.4178,
"num_input_tokens_seen": 22263680,
"step": 7065
},
{
"epoch": 0.45259586454132256,
"grad_norm": 19.05507469177246,
"learning_rate": 1.33381739198994e-06,
"loss": 0.3371,
"num_input_tokens_seen": 22279552,
"step": 7070
},
{
"epoch": 0.4529159464822995,
"grad_norm": 20.88991355895996,
"learning_rate": 1.3327639102885938e-06,
"loss": 0.4463,
"num_input_tokens_seen": 22295296,
"step": 7075
},
{
"epoch": 0.45323602842327637,
"grad_norm": 36.431251525878906,
"learning_rate": 1.3317100131275986e-06,
"loss": 0.3979,
"num_input_tokens_seen": 22310400,
"step": 7080
},
{
"epoch": 0.45355611036425325,
"grad_norm": 60.086669921875,
"learning_rate": 1.3306557018227576e-06,
"loss": 0.4852,
"num_input_tokens_seen": 22326848,
"step": 7085
},
{
"epoch": 0.45387619230523013,
"grad_norm": 33.853111267089844,
"learning_rate": 1.3296009776903903e-06,
"loss": 0.4673,
"num_input_tokens_seen": 22342592,
"step": 7090
},
{
"epoch": 0.454196274246207,
"grad_norm": 29.175647735595703,
"learning_rate": 1.3285458420473323e-06,
"loss": 0.4693,
"num_input_tokens_seen": 22358912,
"step": 7095
},
{
"epoch": 0.45451635618718395,
"grad_norm": 30.104026794433594,
"learning_rate": 1.3274902962109332e-06,
"loss": 0.3789,
"num_input_tokens_seen": 22374528,
"step": 7100
},
{
"epoch": 0.4548364381281608,
"grad_norm": 20.507343292236328,
"learning_rate": 1.3264343414990539e-06,
"loss": 0.3752,
"num_input_tokens_seen": 22389824,
"step": 7105
},
{
"epoch": 0.4551565200691377,
"grad_norm": 32.275794982910156,
"learning_rate": 1.3253779792300663e-06,
"loss": 0.4269,
"num_input_tokens_seen": 22405376,
"step": 7110
},
{
"epoch": 0.4554766020101146,
"grad_norm": 17.600027084350586,
"learning_rate": 1.3243212107228518e-06,
"loss": 0.3442,
"num_input_tokens_seen": 22420032,
"step": 7115
},
{
"epoch": 0.45579668395109146,
"grad_norm": 15.571036338806152,
"learning_rate": 1.3232640372967974e-06,
"loss": 0.393,
"num_input_tokens_seen": 22434688,
"step": 7120
},
{
"epoch": 0.45611676589206834,
"grad_norm": 51.120750427246094,
"learning_rate": 1.3222064602717974e-06,
"loss": 0.4691,
"num_input_tokens_seen": 22451072,
"step": 7125
},
{
"epoch": 0.4564368478330453,
"grad_norm": 31.28764533996582,
"learning_rate": 1.321148480968248e-06,
"loss": 0.3578,
"num_input_tokens_seen": 22466688,
"step": 7130
},
{
"epoch": 0.45675692977402216,
"grad_norm": 39.04620361328125,
"learning_rate": 1.3200901007070495e-06,
"loss": 0.4627,
"num_input_tokens_seen": 22482432,
"step": 7135
},
{
"epoch": 0.45707701171499904,
"grad_norm": 45.138580322265625,
"learning_rate": 1.3190313208096022e-06,
"loss": 0.4653,
"num_input_tokens_seen": 22496960,
"step": 7140
},
{
"epoch": 0.4573970936559759,
"grad_norm": 62.76626968383789,
"learning_rate": 1.3179721425978048e-06,
"loss": 0.3506,
"num_input_tokens_seen": 22512256,
"step": 7145
},
{
"epoch": 0.4577171755969528,
"grad_norm": 29.606536865234375,
"learning_rate": 1.3169125673940541e-06,
"loss": 0.3801,
"num_input_tokens_seen": 22528192,
"step": 7150
},
{
"epoch": 0.45803725753792973,
"grad_norm": 23.77508544921875,
"learning_rate": 1.3158525965212422e-06,
"loss": 0.4222,
"num_input_tokens_seen": 22545408,
"step": 7155
},
{
"epoch": 0.4583573394789066,
"grad_norm": 46.141380310058594,
"learning_rate": 1.3147922313027548e-06,
"loss": 0.499,
"num_input_tokens_seen": 22560832,
"step": 7160
},
{
"epoch": 0.4586774214198835,
"grad_norm": 37.23847961425781,
"learning_rate": 1.3137314730624707e-06,
"loss": 0.3566,
"num_input_tokens_seen": 22577728,
"step": 7165
},
{
"epoch": 0.45899750336086037,
"grad_norm": 56.6774787902832,
"learning_rate": 1.3126703231247588e-06,
"loss": 0.4792,
"num_input_tokens_seen": 22594112,
"step": 7170
},
{
"epoch": 0.45931758530183725,
"grad_norm": 57.76353454589844,
"learning_rate": 1.3116087828144772e-06,
"loss": 0.3942,
"num_input_tokens_seen": 22609728,
"step": 7175
},
{
"epoch": 0.4596376672428142,
"grad_norm": 25.027061462402344,
"learning_rate": 1.310546853456972e-06,
"loss": 0.4788,
"num_input_tokens_seen": 22624704,
"step": 7180
},
{
"epoch": 0.45995774918379106,
"grad_norm": 29.121782302856445,
"learning_rate": 1.3094845363780737e-06,
"loss": 0.3133,
"num_input_tokens_seen": 22640448,
"step": 7185
},
{
"epoch": 0.46027783112476794,
"grad_norm": 27.954086303710938,
"learning_rate": 1.3084218329040976e-06,
"loss": 0.2221,
"num_input_tokens_seen": 22655680,
"step": 7190
},
{
"epoch": 0.4605979130657448,
"grad_norm": 19.17045021057129,
"learning_rate": 1.3073587443618425e-06,
"loss": 0.3836,
"num_input_tokens_seen": 22672128,
"step": 7195
},
{
"epoch": 0.4609179950067217,
"grad_norm": 63.576904296875,
"learning_rate": 1.3062952720785861e-06,
"loss": 0.528,
"num_input_tokens_seen": 22687104,
"step": 7200
},
{
"epoch": 0.4612380769476986,
"grad_norm": 57.46402359008789,
"learning_rate": 1.305231417382086e-06,
"loss": 0.3679,
"num_input_tokens_seen": 22702976,
"step": 7205
},
{
"epoch": 0.4615581588886755,
"grad_norm": 33.87038803100586,
"learning_rate": 1.3041671816005777e-06,
"loss": 0.3473,
"num_input_tokens_seen": 22718464,
"step": 7210
},
{
"epoch": 0.4618782408296524,
"grad_norm": 31.65737533569336,
"learning_rate": 1.3031025660627718e-06,
"loss": 0.3735,
"num_input_tokens_seen": 22734656,
"step": 7215
},
{
"epoch": 0.4621983227706293,
"grad_norm": 41.2495002746582,
"learning_rate": 1.3020375720978534e-06,
"loss": 0.4378,
"num_input_tokens_seen": 22750016,
"step": 7220
},
{
"epoch": 0.46251840471160616,
"grad_norm": 34.97142791748047,
"learning_rate": 1.3009722010354799e-06,
"loss": 0.385,
"num_input_tokens_seen": 22765632,
"step": 7225
},
{
"epoch": 0.46283848665258304,
"grad_norm": 39.41366958618164,
"learning_rate": 1.2999064542057794e-06,
"loss": 0.4572,
"num_input_tokens_seen": 22781184,
"step": 7230
},
{
"epoch": 0.46315856859355997,
"grad_norm": 32.739967346191406,
"learning_rate": 1.2988403329393495e-06,
"loss": 0.4955,
"num_input_tokens_seen": 22797248,
"step": 7235
},
{
"epoch": 0.46347865053453685,
"grad_norm": 29.355844497680664,
"learning_rate": 1.2977738385672557e-06,
"loss": 0.4186,
"num_input_tokens_seen": 22812800,
"step": 7240
},
{
"epoch": 0.46379873247551373,
"grad_norm": 23.447111129760742,
"learning_rate": 1.2967069724210278e-06,
"loss": 0.4086,
"num_input_tokens_seen": 22827200,
"step": 7245
},
{
"epoch": 0.4641188144164906,
"grad_norm": 32.68409729003906,
"learning_rate": 1.2956397358326609e-06,
"loss": 0.5472,
"num_input_tokens_seen": 22843264,
"step": 7250
},
{
"epoch": 0.4644388963574675,
"grad_norm": 40.67762756347656,
"learning_rate": 1.294572130134613e-06,
"loss": 0.3845,
"num_input_tokens_seen": 22858624,
"step": 7255
},
{
"epoch": 0.4647589782984444,
"grad_norm": 35.638824462890625,
"learning_rate": 1.2935041566598016e-06,
"loss": 0.5608,
"num_input_tokens_seen": 22873856,
"step": 7260
},
{
"epoch": 0.4650790602394213,
"grad_norm": 35.22060775756836,
"learning_rate": 1.2924358167416049e-06,
"loss": 0.3669,
"num_input_tokens_seen": 22889600,
"step": 7265
},
{
"epoch": 0.4653991421803982,
"grad_norm": 26.555004119873047,
"learning_rate": 1.2913671117138572e-06,
"loss": 0.4085,
"num_input_tokens_seen": 22904704,
"step": 7270
},
{
"epoch": 0.46571922412137506,
"grad_norm": 24.626081466674805,
"learning_rate": 1.29029804291085e-06,
"loss": 0.3516,
"num_input_tokens_seen": 22920384,
"step": 7275
},
{
"epoch": 0.46603930606235194,
"grad_norm": 46.00627899169922,
"learning_rate": 1.2892286116673269e-06,
"loss": 0.3724,
"num_input_tokens_seen": 22937024,
"step": 7280
},
{
"epoch": 0.4663593880033289,
"grad_norm": 27.727231979370117,
"learning_rate": 1.2881588193184865e-06,
"loss": 0.501,
"num_input_tokens_seen": 22954816,
"step": 7285
},
{
"epoch": 0.46667946994430576,
"grad_norm": 25.16861915588379,
"learning_rate": 1.287088667199977e-06,
"loss": 0.2811,
"num_input_tokens_seen": 22969472,
"step": 7290
},
{
"epoch": 0.46699955188528264,
"grad_norm": 25.07572364807129,
"learning_rate": 1.2860181566478956e-06,
"loss": 0.4666,
"num_input_tokens_seen": 22984192,
"step": 7295
},
{
"epoch": 0.4673196338262595,
"grad_norm": 13.781168937683105,
"learning_rate": 1.2849472889987874e-06,
"loss": 0.3772,
"num_input_tokens_seen": 22999680,
"step": 7300
},
{
"epoch": 0.4676397157672364,
"grad_norm": 28.55607032775879,
"learning_rate": 1.2838760655896431e-06,
"loss": 0.3756,
"num_input_tokens_seen": 23014720,
"step": 7305
},
{
"epoch": 0.4679597977082133,
"grad_norm": 37.8064079284668,
"learning_rate": 1.2828044877578983e-06,
"loss": 0.4629,
"num_input_tokens_seen": 23030528,
"step": 7310
},
{
"epoch": 0.4682798796491902,
"grad_norm": 27.748788833618164,
"learning_rate": 1.2817325568414297e-06,
"loss": 0.5176,
"num_input_tokens_seen": 23046784,
"step": 7315
},
{
"epoch": 0.4685999615901671,
"grad_norm": 25.824499130249023,
"learning_rate": 1.2806602741785562e-06,
"loss": 0.3307,
"num_input_tokens_seen": 23061632,
"step": 7320
},
{
"epoch": 0.46892004353114397,
"grad_norm": 18.876747131347656,
"learning_rate": 1.2795876411080346e-06,
"loss": 0.3325,
"num_input_tokens_seen": 23077888,
"step": 7325
},
{
"epoch": 0.46924012547212085,
"grad_norm": 25.548311233520508,
"learning_rate": 1.278514658969061e-06,
"loss": 0.3222,
"num_input_tokens_seen": 23093568,
"step": 7330
},
{
"epoch": 0.46956020741309773,
"grad_norm": 28.950227737426758,
"learning_rate": 1.2774413291012648e-06,
"loss": 0.5175,
"num_input_tokens_seen": 23108992,
"step": 7335
},
{
"epoch": 0.46988028935407467,
"grad_norm": 26.01544761657715,
"learning_rate": 1.2763676528447122e-06,
"loss": 0.4328,
"num_input_tokens_seen": 23124992,
"step": 7340
},
{
"epoch": 0.47020037129505154,
"grad_norm": 30.87302589416504,
"learning_rate": 1.2752936315399003e-06,
"loss": 0.3446,
"num_input_tokens_seen": 23141888,
"step": 7345
},
{
"epoch": 0.4705204532360284,
"grad_norm": 31.088359832763672,
"learning_rate": 1.2742192665277566e-06,
"loss": 0.343,
"num_input_tokens_seen": 23157888,
"step": 7350
},
{
"epoch": 0.4708405351770053,
"grad_norm": 26.50160026550293,
"learning_rate": 1.2731445591496393e-06,
"loss": 0.2838,
"num_input_tokens_seen": 23172864,
"step": 7355
},
{
"epoch": 0.4711606171179822,
"grad_norm": 35.69820022583008,
"learning_rate": 1.2720695107473325e-06,
"loss": 0.456,
"num_input_tokens_seen": 23188352,
"step": 7360
},
{
"epoch": 0.4714806990589591,
"grad_norm": 39.46232986450195,
"learning_rate": 1.2709941226630475e-06,
"loss": 0.3861,
"num_input_tokens_seen": 23204096,
"step": 7365
},
{
"epoch": 0.471800780999936,
"grad_norm": 30.66231346130371,
"learning_rate": 1.2699183962394182e-06,
"loss": 0.3526,
"num_input_tokens_seen": 23219072,
"step": 7370
},
{
"epoch": 0.4721208629409129,
"grad_norm": 16.721717834472656,
"learning_rate": 1.2688423328195021e-06,
"loss": 0.4323,
"num_input_tokens_seen": 23234560,
"step": 7375
},
{
"epoch": 0.47244094488188976,
"grad_norm": 45.48991394042969,
"learning_rate": 1.267765933746777e-06,
"loss": 0.3497,
"num_input_tokens_seen": 23250304,
"step": 7380
},
{
"epoch": 0.47276102682286664,
"grad_norm": 44.07309341430664,
"learning_rate": 1.2666892003651397e-06,
"loss": 0.6383,
"num_input_tokens_seen": 23265664,
"step": 7385
},
{
"epoch": 0.4730811087638435,
"grad_norm": 31.322635650634766,
"learning_rate": 1.2656121340189043e-06,
"loss": 0.453,
"num_input_tokens_seen": 23281472,
"step": 7390
},
{
"epoch": 0.47340119070482045,
"grad_norm": 24.864459991455078,
"learning_rate": 1.264534736052801e-06,
"loss": 0.4142,
"num_input_tokens_seen": 23297024,
"step": 7395
},
{
"epoch": 0.47372127264579733,
"grad_norm": 42.676055908203125,
"learning_rate": 1.2634570078119739e-06,
"loss": 0.4348,
"num_input_tokens_seen": 23313344,
"step": 7400
},
{
"epoch": 0.4740413545867742,
"grad_norm": 29.135011672973633,
"learning_rate": 1.262378950641979e-06,
"loss": 0.535,
"num_input_tokens_seen": 23328512,
"step": 7405
},
{
"epoch": 0.4743614365277511,
"grad_norm": 23.750194549560547,
"learning_rate": 1.2613005658887836e-06,
"loss": 0.444,
"num_input_tokens_seen": 23342400,
"step": 7410
},
{
"epoch": 0.47468151846872797,
"grad_norm": 37.87858200073242,
"learning_rate": 1.2602218548987637e-06,
"loss": 0.4198,
"num_input_tokens_seen": 23358400,
"step": 7415
},
{
"epoch": 0.4750016004097049,
"grad_norm": 33.777381896972656,
"learning_rate": 1.2591428190187029e-06,
"loss": 0.4155,
"num_input_tokens_seen": 23373376,
"step": 7420
},
{
"epoch": 0.4753216823506818,
"grad_norm": 56.56295394897461,
"learning_rate": 1.2580634595957898e-06,
"loss": 0.5093,
"num_input_tokens_seen": 23390400,
"step": 7425
},
{
"epoch": 0.47564176429165866,
"grad_norm": 27.435312271118164,
"learning_rate": 1.2569837779776172e-06,
"loss": 0.3871,
"num_input_tokens_seen": 23406400,
"step": 7430
},
{
"epoch": 0.47596184623263554,
"grad_norm": 30.586233139038086,
"learning_rate": 1.2559037755121804e-06,
"loss": 0.3134,
"num_input_tokens_seen": 23421824,
"step": 7435
},
{
"epoch": 0.4762819281736124,
"grad_norm": 50.65870666503906,
"learning_rate": 1.2548234535478754e-06,
"loss": 0.4599,
"num_input_tokens_seen": 23438272,
"step": 7440
},
{
"epoch": 0.47660201011458936,
"grad_norm": 18.9100341796875,
"learning_rate": 1.2537428134334968e-06,
"loss": 0.4267,
"num_input_tokens_seen": 23454976,
"step": 7445
},
{
"epoch": 0.47692209205556624,
"grad_norm": 89.36604309082031,
"learning_rate": 1.252661856518236e-06,
"loss": 0.5302,
"num_input_tokens_seen": 23471168,
"step": 7450
},
{
"epoch": 0.4772421739965431,
"grad_norm": 30.56731605529785,
"learning_rate": 1.251580584151681e-06,
"loss": 0.3683,
"num_input_tokens_seen": 23486720,
"step": 7455
},
{
"epoch": 0.47756225593752,
"grad_norm": 20.963420867919922,
"learning_rate": 1.2504989976838129e-06,
"loss": 0.309,
"num_input_tokens_seen": 23502912,
"step": 7460
},
{
"epoch": 0.4778823378784969,
"grad_norm": 25.654800415039062,
"learning_rate": 1.2494170984650048e-06,
"loss": 0.3629,
"num_input_tokens_seen": 23519552,
"step": 7465
},
{
"epoch": 0.4782024198194738,
"grad_norm": 27.470243453979492,
"learning_rate": 1.248334887846021e-06,
"loss": 0.4253,
"num_input_tokens_seen": 23535936,
"step": 7470
},
{
"epoch": 0.4785225017604507,
"grad_norm": 33.39189910888672,
"learning_rate": 1.2472523671780135e-06,
"loss": 0.4411,
"num_input_tokens_seen": 23551040,
"step": 7475
},
{
"epoch": 0.47884258370142757,
"grad_norm": 35.72373580932617,
"learning_rate": 1.2461695378125233e-06,
"loss": 0.309,
"num_input_tokens_seen": 23566208,
"step": 7480
},
{
"epoch": 0.47916266564240445,
"grad_norm": 22.527515411376953,
"learning_rate": 1.245086401101474e-06,
"loss": 0.4347,
"num_input_tokens_seen": 23581696,
"step": 7485
},
{
"epoch": 0.47948274758338133,
"grad_norm": 74.2776870727539,
"learning_rate": 1.2440029583971757e-06,
"loss": 0.4439,
"num_input_tokens_seen": 23597248,
"step": 7490
},
{
"epoch": 0.4798028295243582,
"grad_norm": 14.569833755493164,
"learning_rate": 1.2429192110523188e-06,
"loss": 0.502,
"num_input_tokens_seen": 23612800,
"step": 7495
},
{
"epoch": 0.48012291146533514,
"grad_norm": 28.646181106567383,
"learning_rate": 1.2418351604199746e-06,
"loss": 0.3388,
"num_input_tokens_seen": 23629056,
"step": 7500
},
{
"epoch": 0.480442993406312,
"grad_norm": 41.83679962158203,
"learning_rate": 1.2407508078535934e-06,
"loss": 0.4502,
"num_input_tokens_seen": 23644352,
"step": 7505
},
{
"epoch": 0.4807630753472889,
"grad_norm": 29.25275230407715,
"learning_rate": 1.2396661547070017e-06,
"loss": 0.2899,
"num_input_tokens_seen": 23661120,
"step": 7510
},
{
"epoch": 0.4810831572882658,
"grad_norm": 18.579309463500977,
"learning_rate": 1.238581202334402e-06,
"loss": 0.3362,
"num_input_tokens_seen": 23677632,
"step": 7515
},
{
"epoch": 0.48140323922924266,
"grad_norm": 35.11944580078125,
"learning_rate": 1.2374959520903699e-06,
"loss": 0.3676,
"num_input_tokens_seen": 23693952,
"step": 7520
},
{
"epoch": 0.4817233211702196,
"grad_norm": 20.344608306884766,
"learning_rate": 1.2364104053298531e-06,
"loss": 0.3442,
"num_input_tokens_seen": 23708736,
"step": 7525
},
{
"epoch": 0.4820434031111965,
"grad_norm": 30.352449417114258,
"learning_rate": 1.2353245634081692e-06,
"loss": 0.392,
"num_input_tokens_seen": 23724864,
"step": 7530
},
{
"epoch": 0.48236348505217336,
"grad_norm": 23.443605422973633,
"learning_rate": 1.2342384276810053e-06,
"loss": 0.4165,
"num_input_tokens_seen": 23740160,
"step": 7535
},
{
"epoch": 0.48268356699315024,
"grad_norm": 72.33162689208984,
"learning_rate": 1.233151999504414e-06,
"loss": 0.435,
"num_input_tokens_seen": 23755264,
"step": 7540
},
{
"epoch": 0.4830036489341271,
"grad_norm": 53.03398895263672,
"learning_rate": 1.232065280234814e-06,
"loss": 0.3445,
"num_input_tokens_seen": 23770112,
"step": 7545
},
{
"epoch": 0.48332373087510405,
"grad_norm": 26.0847110748291,
"learning_rate": 1.2309782712289867e-06,
"loss": 0.4075,
"num_input_tokens_seen": 23785536,
"step": 7550
},
{
"epoch": 0.48364381281608093,
"grad_norm": 53.779842376708984,
"learning_rate": 1.2298909738440758e-06,
"loss": 0.4257,
"num_input_tokens_seen": 23801280,
"step": 7555
},
{
"epoch": 0.4839638947570578,
"grad_norm": 39.13923263549805,
"learning_rate": 1.2288033894375847e-06,
"loss": 0.3893,
"num_input_tokens_seen": 23816448,
"step": 7560
},
{
"epoch": 0.4842839766980347,
"grad_norm": 32.721370697021484,
"learning_rate": 1.2277155193673755e-06,
"loss": 0.541,
"num_input_tokens_seen": 23832512,
"step": 7565
},
{
"epoch": 0.48460405863901157,
"grad_norm": 15.527641296386719,
"learning_rate": 1.2266273649916668e-06,
"loss": 0.3945,
"num_input_tokens_seen": 23848192,
"step": 7570
},
{
"epoch": 0.48492414057998845,
"grad_norm": 20.2283992767334,
"learning_rate": 1.2255389276690318e-06,
"loss": 0.4394,
"num_input_tokens_seen": 23863808,
"step": 7575
},
{
"epoch": 0.4852442225209654,
"grad_norm": 34.1326904296875,
"learning_rate": 1.2244502087583978e-06,
"loss": 0.3096,
"num_input_tokens_seen": 23880960,
"step": 7580
},
{
"epoch": 0.48556430446194226,
"grad_norm": 40.73923110961914,
"learning_rate": 1.2233612096190426e-06,
"loss": 0.3963,
"num_input_tokens_seen": 23896256,
"step": 7585
},
{
"epoch": 0.48588438640291914,
"grad_norm": 38.855438232421875,
"learning_rate": 1.222271931610595e-06,
"loss": 0.5109,
"num_input_tokens_seen": 23912832,
"step": 7590
},
{
"epoch": 0.486204468343896,
"grad_norm": 25.933034896850586,
"learning_rate": 1.2211823760930306e-06,
"loss": 0.4938,
"num_input_tokens_seen": 23928768,
"step": 7595
},
{
"epoch": 0.4865245502848729,
"grad_norm": 19.541147232055664,
"learning_rate": 1.2200925444266726e-06,
"loss": 0.4297,
"num_input_tokens_seen": 23945088,
"step": 7600
},
{
"epoch": 0.48684463222584984,
"grad_norm": 38.088348388671875,
"learning_rate": 1.219002437972189e-06,
"loss": 0.5101,
"num_input_tokens_seen": 23960192,
"step": 7605
},
{
"epoch": 0.4871647141668267,
"grad_norm": 31.72063446044922,
"learning_rate": 1.21791205809059e-06,
"loss": 0.4324,
"num_input_tokens_seen": 23977152,
"step": 7610
},
{
"epoch": 0.4874847961078036,
"grad_norm": 31.89913558959961,
"learning_rate": 1.2168214061432283e-06,
"loss": 0.3628,
"num_input_tokens_seen": 23992448,
"step": 7615
},
{
"epoch": 0.4878048780487805,
"grad_norm": 25.90325164794922,
"learning_rate": 1.2157304834917947e-06,
"loss": 0.4397,
"num_input_tokens_seen": 24008384,
"step": 7620
},
{
"epoch": 0.48812495998975736,
"grad_norm": 28.418067932128906,
"learning_rate": 1.2146392914983202e-06,
"loss": 0.6103,
"num_input_tokens_seen": 24025728,
"step": 7625
},
{
"epoch": 0.4884450419307343,
"grad_norm": 45.870277404785156,
"learning_rate": 1.2135478315251694e-06,
"loss": 0.51,
"num_input_tokens_seen": 24040448,
"step": 7630
},
{
"epoch": 0.48876512387171117,
"grad_norm": 30.824810028076172,
"learning_rate": 1.2124561049350442e-06,
"loss": 0.36,
"num_input_tokens_seen": 24055168,
"step": 7635
},
{
"epoch": 0.48908520581268805,
"grad_norm": 40.22310256958008,
"learning_rate": 1.2113641130909772e-06,
"loss": 0.4474,
"num_input_tokens_seen": 24070016,
"step": 7640
},
{
"epoch": 0.48940528775366493,
"grad_norm": 51.548828125,
"learning_rate": 1.2102718573563334e-06,
"loss": 0.3074,
"num_input_tokens_seen": 24084800,
"step": 7645
},
{
"epoch": 0.4897253696946418,
"grad_norm": 53.92572021484375,
"learning_rate": 1.2091793390948066e-06,
"loss": 0.4884,
"num_input_tokens_seen": 24100416,
"step": 7650
},
{
"epoch": 0.49004545163561875,
"grad_norm": 18.71347427368164,
"learning_rate": 1.2080865596704191e-06,
"loss": 0.2873,
"num_input_tokens_seen": 24117120,
"step": 7655
},
{
"epoch": 0.4903655335765956,
"grad_norm": 30.77579116821289,
"learning_rate": 1.2069935204475187e-06,
"loss": 0.4317,
"num_input_tokens_seen": 24132224,
"step": 7660
},
{
"epoch": 0.4906856155175725,
"grad_norm": 24.197696685791016,
"learning_rate": 1.2059002227907776e-06,
"loss": 0.4037,
"num_input_tokens_seen": 24147712,
"step": 7665
},
{
"epoch": 0.4910056974585494,
"grad_norm": 39.67091751098633,
"learning_rate": 1.2048066680651908e-06,
"loss": 0.408,
"num_input_tokens_seen": 24164288,
"step": 7670
},
{
"epoch": 0.49132577939952626,
"grad_norm": 39.406349182128906,
"learning_rate": 1.2037128576360743e-06,
"loss": 0.5751,
"num_input_tokens_seen": 24193728,
"step": 7675
},
{
"epoch": 0.49164586134050314,
"grad_norm": 39.66476821899414,
"learning_rate": 1.2026187928690627e-06,
"loss": 0.406,
"num_input_tokens_seen": 24208832,
"step": 7680
},
{
"epoch": 0.4919659432814801,
"grad_norm": 36.91902160644531,
"learning_rate": 1.2015244751301098e-06,
"loss": 0.5004,
"num_input_tokens_seen": 24223424,
"step": 7685
},
{
"epoch": 0.49228602522245696,
"grad_norm": 47.22456741333008,
"learning_rate": 1.2004299057854832e-06,
"loss": 0.444,
"num_input_tokens_seen": 24238976,
"step": 7690
},
{
"epoch": 0.49260610716343384,
"grad_norm": 23.27979850769043,
"learning_rate": 1.1993350862017661e-06,
"loss": 0.3837,
"num_input_tokens_seen": 24253632,
"step": 7695
},
{
"epoch": 0.4929261891044107,
"grad_norm": 37.785526275634766,
"learning_rate": 1.1982400177458534e-06,
"loss": 0.4074,
"num_input_tokens_seen": 24270720,
"step": 7700
},
{
"epoch": 0.4932462710453876,
"grad_norm": 37.083473205566406,
"learning_rate": 1.197144701784951e-06,
"loss": 0.4385,
"num_input_tokens_seen": 24285312,
"step": 7705
},
{
"epoch": 0.49356635298636453,
"grad_norm": 32.1000862121582,
"learning_rate": 1.1960491396865735e-06,
"loss": 0.409,
"num_input_tokens_seen": 24300352,
"step": 7710
},
{
"epoch": 0.4938864349273414,
"grad_norm": 26.52760887145996,
"learning_rate": 1.1949533328185435e-06,
"loss": 0.3518,
"num_input_tokens_seen": 24317056,
"step": 7715
},
{
"epoch": 0.4942065168683183,
"grad_norm": 27.518896102905273,
"learning_rate": 1.1938572825489883e-06,
"loss": 0.3705,
"num_input_tokens_seen": 24333184,
"step": 7720
},
{
"epoch": 0.49452659880929517,
"grad_norm": 32.64544677734375,
"learning_rate": 1.1927609902463394e-06,
"loss": 0.4313,
"num_input_tokens_seen": 24348672,
"step": 7725
},
{
"epoch": 0.49484668075027205,
"grad_norm": 44.3544807434082,
"learning_rate": 1.1916644572793314e-06,
"loss": 0.4342,
"num_input_tokens_seen": 24363648,
"step": 7730
},
{
"epoch": 0.495166762691249,
"grad_norm": 80.76370239257812,
"learning_rate": 1.190567685016998e-06,
"loss": 0.4951,
"num_input_tokens_seen": 24380992,
"step": 7735
},
{
"epoch": 0.49548684463222586,
"grad_norm": 30.18761444091797,
"learning_rate": 1.189470674828672e-06,
"loss": 0.4152,
"num_input_tokens_seen": 24395776,
"step": 7740
},
{
"epoch": 0.49580692657320274,
"grad_norm": 26.904821395874023,
"learning_rate": 1.188373428083984e-06,
"loss": 0.3851,
"num_input_tokens_seen": 24411584,
"step": 7745
},
{
"epoch": 0.4961270085141796,
"grad_norm": 43.27383041381836,
"learning_rate": 1.1872759461528596e-06,
"loss": 0.5355,
"num_input_tokens_seen": 24426560,
"step": 7750
},
{
"epoch": 0.4964470904551565,
"grad_norm": 12.632915496826172,
"learning_rate": 1.1861782304055174e-06,
"loss": 0.4046,
"num_input_tokens_seen": 24441856,
"step": 7755
},
{
"epoch": 0.4967671723961334,
"grad_norm": 19.117013931274414,
"learning_rate": 1.1850802822124686e-06,
"loss": 0.3269,
"num_input_tokens_seen": 24457472,
"step": 7760
},
{
"epoch": 0.4970872543371103,
"grad_norm": 61.428348541259766,
"learning_rate": 1.1839821029445143e-06,
"loss": 0.5104,
"num_input_tokens_seen": 24471936,
"step": 7765
},
{
"epoch": 0.4974073362780872,
"grad_norm": 39.07609939575195,
"learning_rate": 1.1828836939727442e-06,
"loss": 0.3332,
"num_input_tokens_seen": 24487616,
"step": 7770
},
{
"epoch": 0.4977274182190641,
"grad_norm": 38.0487060546875,
"learning_rate": 1.181785056668535e-06,
"loss": 0.4292,
"num_input_tokens_seen": 24503936,
"step": 7775
},
{
"epoch": 0.49804750016004096,
"grad_norm": 31.99479103088379,
"learning_rate": 1.180686192403548e-06,
"loss": 0.429,
"num_input_tokens_seen": 24518464,
"step": 7780
},
{
"epoch": 0.49836758210101784,
"grad_norm": 50.00809097290039,
"learning_rate": 1.1795871025497285e-06,
"loss": 0.3479,
"num_input_tokens_seen": 24533184,
"step": 7785
},
{
"epoch": 0.49868766404199477,
"grad_norm": 40.826236724853516,
"learning_rate": 1.1784877884793029e-06,
"loss": 0.4288,
"num_input_tokens_seen": 24548992,
"step": 7790
},
{
"epoch": 0.49900774598297165,
"grad_norm": 35.559532165527344,
"learning_rate": 1.1773882515647776e-06,
"loss": 0.3681,
"num_input_tokens_seen": 24566592,
"step": 7795
},
{
"epoch": 0.49932782792394853,
"grad_norm": 28.459714889526367,
"learning_rate": 1.1762884931789376e-06,
"loss": 0.4776,
"num_input_tokens_seen": 24583552,
"step": 7800
},
{
"epoch": 0.4996479098649254,
"grad_norm": 15.405625343322754,
"learning_rate": 1.1751885146948436e-06,
"loss": 0.4538,
"num_input_tokens_seen": 24599552,
"step": 7805
},
{
"epoch": 0.4999679918059023,
"grad_norm": 35.40610122680664,
"learning_rate": 1.1740883174858327e-06,
"loss": 0.3799,
"num_input_tokens_seen": 24614912,
"step": 7810
},
{
"epoch": 0.5002880737468792,
"grad_norm": 35.4653434753418,
"learning_rate": 1.1729879029255127e-06,
"loss": 0.3643,
"num_input_tokens_seen": 24629696,
"step": 7815
},
{
"epoch": 0.5006081556878561,
"grad_norm": 33.1492919921875,
"learning_rate": 1.171887272387765e-06,
"loss": 0.3997,
"num_input_tokens_seen": 24646208,
"step": 7820
},
{
"epoch": 0.5006081556878561,
"eval_loss": 0.4178144633769989,
"eval_runtime": 50.7766,
"eval_samples_per_second": 273.473,
"eval_steps_per_second": 34.189,
"num_input_tokens_seen": 24646208,
"step": 7820
},
{
"epoch": 0.500928237628833,
"grad_norm": 91.9991226196289,
"learning_rate": 1.1707864272467397e-06,
"loss": 0.4907,
"num_input_tokens_seen": 24661120,
"step": 7825
},
{
"epoch": 0.5012483195698099,
"grad_norm": 40.02210235595703,
"learning_rate": 1.169685368876855e-06,
"loss": 0.4269,
"num_input_tokens_seen": 24678336,
"step": 7830
},
{
"epoch": 0.5015684015107867,
"grad_norm": 62.80644989013672,
"learning_rate": 1.1685840986527946e-06,
"loss": 0.5471,
"num_input_tokens_seen": 24694336,
"step": 7835
},
{
"epoch": 0.5018884834517636,
"grad_norm": 35.32887649536133,
"learning_rate": 1.1674826179495076e-06,
"loss": 0.3986,
"num_input_tokens_seen": 24708608,
"step": 7840
},
{
"epoch": 0.5022085653927405,
"grad_norm": 35.998695373535156,
"learning_rate": 1.1663809281422056e-06,
"loss": 0.4302,
"num_input_tokens_seen": 24724672,
"step": 7845
},
{
"epoch": 0.5025286473337174,
"grad_norm": 43.341339111328125,
"learning_rate": 1.1652790306063615e-06,
"loss": 0.4556,
"num_input_tokens_seen": 24740608,
"step": 7850
},
{
"epoch": 0.5028487292746944,
"grad_norm": 35.64778137207031,
"learning_rate": 1.164176926717707e-06,
"loss": 0.4065,
"num_input_tokens_seen": 24758528,
"step": 7855
},
{
"epoch": 0.5031688112156713,
"grad_norm": 25.157997131347656,
"learning_rate": 1.1630746178522315e-06,
"loss": 0.3737,
"num_input_tokens_seen": 24772992,
"step": 7860
},
{
"epoch": 0.5034888931566481,
"grad_norm": 29.157825469970703,
"learning_rate": 1.1619721053861816e-06,
"loss": 0.4417,
"num_input_tokens_seen": 24788160,
"step": 7865
},
{
"epoch": 0.503808975097625,
"grad_norm": 22.016447067260742,
"learning_rate": 1.1608693906960558e-06,
"loss": 0.4104,
"num_input_tokens_seen": 24804224,
"step": 7870
},
{
"epoch": 0.5041290570386019,
"grad_norm": 32.85700988769531,
"learning_rate": 1.1597664751586069e-06,
"loss": 0.4523,
"num_input_tokens_seen": 24820928,
"step": 7875
},
{
"epoch": 0.5044491389795788,
"grad_norm": 49.433475494384766,
"learning_rate": 1.1586633601508382e-06,
"loss": 0.3953,
"num_input_tokens_seen": 24835776,
"step": 7880
},
{
"epoch": 0.5047692209205557,
"grad_norm": 48.25388717651367,
"learning_rate": 1.1575600470500014e-06,
"loss": 0.3764,
"num_input_tokens_seen": 24851648,
"step": 7885
},
{
"epoch": 0.5050893028615325,
"grad_norm": 61.41056823730469,
"learning_rate": 1.1564565372335957e-06,
"loss": 0.4222,
"num_input_tokens_seen": 24866880,
"step": 7890
},
{
"epoch": 0.5054093848025094,
"grad_norm": 42.029537200927734,
"learning_rate": 1.1553528320793663e-06,
"loss": 0.3276,
"num_input_tokens_seen": 24881856,
"step": 7895
},
{
"epoch": 0.5057294667434863,
"grad_norm": 24.842376708984375,
"learning_rate": 1.1542489329653022e-06,
"loss": 0.4327,
"num_input_tokens_seen": 24898560,
"step": 7900
},
{
"epoch": 0.5060495486844632,
"grad_norm": 28.775615692138672,
"learning_rate": 1.1531448412696343e-06,
"loss": 0.3841,
"num_input_tokens_seen": 24913216,
"step": 7905
},
{
"epoch": 0.5063696306254402,
"grad_norm": 22.78350067138672,
"learning_rate": 1.1520405583708337e-06,
"loss": 0.5014,
"num_input_tokens_seen": 24928832,
"step": 7910
},
{
"epoch": 0.506689712566417,
"grad_norm": 35.34454345703125,
"learning_rate": 1.1509360856476109e-06,
"loss": 0.4926,
"num_input_tokens_seen": 24944512,
"step": 7915
},
{
"epoch": 0.5070097945073939,
"grad_norm": 40.0589485168457,
"learning_rate": 1.149831424478913e-06,
"loss": 0.4731,
"num_input_tokens_seen": 24959744,
"step": 7920
},
{
"epoch": 0.5073298764483708,
"grad_norm": 44.30174255371094,
"learning_rate": 1.1487265762439224e-06,
"loss": 0.3939,
"num_input_tokens_seen": 24975488,
"step": 7925
},
{
"epoch": 0.5076499583893477,
"grad_norm": 41.936622619628906,
"learning_rate": 1.1476215423220547e-06,
"loss": 0.3612,
"num_input_tokens_seen": 24990272,
"step": 7930
},
{
"epoch": 0.5079700403303246,
"grad_norm": 39.607147216796875,
"learning_rate": 1.146516324092959e-06,
"loss": 0.3724,
"num_input_tokens_seen": 25006272,
"step": 7935
},
{
"epoch": 0.5082901222713014,
"grad_norm": 25.23112678527832,
"learning_rate": 1.1454109229365117e-06,
"loss": 0.2965,
"num_input_tokens_seen": 25022464,
"step": 7940
},
{
"epoch": 0.5086102042122783,
"grad_norm": 28.533090591430664,
"learning_rate": 1.14430534023282e-06,
"loss": 0.3151,
"num_input_tokens_seen": 25037376,
"step": 7945
},
{
"epoch": 0.5089302861532552,
"grad_norm": 44.65055465698242,
"learning_rate": 1.1431995773622167e-06,
"loss": 0.4737,
"num_input_tokens_seen": 25053440,
"step": 7950
},
{
"epoch": 0.5092503680942321,
"grad_norm": 22.451980590820312,
"learning_rate": 1.1420936357052597e-06,
"loss": 0.4343,
"num_input_tokens_seen": 25069120,
"step": 7955
},
{
"epoch": 0.5095704500352091,
"grad_norm": 21.964155197143555,
"learning_rate": 1.1409875166427303e-06,
"loss": 0.3024,
"num_input_tokens_seen": 25084224,
"step": 7960
},
{
"epoch": 0.509890531976186,
"grad_norm": 36.40359115600586,
"learning_rate": 1.1398812215556308e-06,
"loss": 0.5023,
"num_input_tokens_seen": 25099520,
"step": 7965
},
{
"epoch": 0.5102106139171628,
"grad_norm": 28.904006958007812,
"learning_rate": 1.1387747518251837e-06,
"loss": 0.372,
"num_input_tokens_seen": 25115200,
"step": 7970
},
{
"epoch": 0.5105306958581397,
"grad_norm": 21.367204666137695,
"learning_rate": 1.13766810883283e-06,
"loss": 0.3171,
"num_input_tokens_seen": 25131520,
"step": 7975
},
{
"epoch": 0.5108507777991166,
"grad_norm": 39.231876373291016,
"learning_rate": 1.1365612939602255e-06,
"loss": 0.5088,
"num_input_tokens_seen": 25147776,
"step": 7980
},
{
"epoch": 0.5111708597400935,
"grad_norm": 23.78633689880371,
"learning_rate": 1.1354543085892423e-06,
"loss": 0.3884,
"num_input_tokens_seen": 25162816,
"step": 7985
},
{
"epoch": 0.5114909416810703,
"grad_norm": 36.806182861328125,
"learning_rate": 1.1343471541019646e-06,
"loss": 0.3417,
"num_input_tokens_seen": 25178752,
"step": 7990
},
{
"epoch": 0.5118110236220472,
"grad_norm": 61.8624267578125,
"learning_rate": 1.1332398318806872e-06,
"loss": 0.3672,
"num_input_tokens_seen": 25194048,
"step": 7995
},
{
"epoch": 0.5121311055630241,
"grad_norm": 33.29353713989258,
"learning_rate": 1.1321323433079158e-06,
"loss": 0.3787,
"num_input_tokens_seen": 25209216,
"step": 8000
},
{
"epoch": 0.512451187504001,
"grad_norm": 31.924596786499023,
"learning_rate": 1.1310246897663623e-06,
"loss": 0.3897,
"num_input_tokens_seen": 25224640,
"step": 8005
},
{
"epoch": 0.5127712694449779,
"grad_norm": 25.257596969604492,
"learning_rate": 1.1299168726389447e-06,
"loss": 0.4115,
"num_input_tokens_seen": 25239808,
"step": 8010
},
{
"epoch": 0.5130913513859549,
"grad_norm": 50.187477111816406,
"learning_rate": 1.1288088933087868e-06,
"loss": 0.346,
"num_input_tokens_seen": 25257344,
"step": 8015
},
{
"epoch": 0.5134114333269317,
"grad_norm": 27.19639778137207,
"learning_rate": 1.1277007531592127e-06,
"loss": 0.3318,
"num_input_tokens_seen": 25272064,
"step": 8020
},
{
"epoch": 0.5137315152679086,
"grad_norm": 29.604515075683594,
"learning_rate": 1.1265924535737492e-06,
"loss": 0.3698,
"num_input_tokens_seen": 25287936,
"step": 8025
},
{
"epoch": 0.5140515972088855,
"grad_norm": 39.24460983276367,
"learning_rate": 1.125483995936121e-06,
"loss": 0.3019,
"num_input_tokens_seen": 25303232,
"step": 8030
},
{
"epoch": 0.5143716791498624,
"grad_norm": 20.35449981689453,
"learning_rate": 1.1243753816302507e-06,
"loss": 0.3742,
"num_input_tokens_seen": 25318656,
"step": 8035
},
{
"epoch": 0.5146917610908393,
"grad_norm": 43.355865478515625,
"learning_rate": 1.1232666120402558e-06,
"loss": 0.4047,
"num_input_tokens_seen": 25333760,
"step": 8040
},
{
"epoch": 0.5150118430318161,
"grad_norm": 34.452640533447266,
"learning_rate": 1.1221576885504487e-06,
"loss": 0.3819,
"num_input_tokens_seen": 25349824,
"step": 8045
},
{
"epoch": 0.515331924972793,
"grad_norm": 20.325450897216797,
"learning_rate": 1.121048612545333e-06,
"loss": 0.3978,
"num_input_tokens_seen": 25365376,
"step": 8050
},
{
"epoch": 0.5156520069137699,
"grad_norm": 47.98875427246094,
"learning_rate": 1.1199393854096034e-06,
"loss": 0.459,
"num_input_tokens_seen": 25380928,
"step": 8055
},
{
"epoch": 0.5159720888547468,
"grad_norm": 73.50482940673828,
"learning_rate": 1.118830008528143e-06,
"loss": 0.3448,
"num_input_tokens_seen": 25396352,
"step": 8060
},
{
"epoch": 0.5162921707957238,
"grad_norm": 22.99143409729004,
"learning_rate": 1.1177204832860212e-06,
"loss": 0.3084,
"num_input_tokens_seen": 25411456,
"step": 8065
},
{
"epoch": 0.5166122527367006,
"grad_norm": 20.2192325592041,
"learning_rate": 1.1166108110684947e-06,
"loss": 0.4402,
"num_input_tokens_seen": 25428544,
"step": 8070
},
{
"epoch": 0.5169323346776775,
"grad_norm": 38.74795150756836,
"learning_rate": 1.1155009932610003e-06,
"loss": 0.4209,
"num_input_tokens_seen": 25443968,
"step": 8075
},
{
"epoch": 0.5172524166186544,
"grad_norm": 47.510467529296875,
"learning_rate": 1.1143910312491605e-06,
"loss": 0.3319,
"num_input_tokens_seen": 25458880,
"step": 8080
},
{
"epoch": 0.5175724985596313,
"grad_norm": 60.593082427978516,
"learning_rate": 1.1132809264187748e-06,
"loss": 0.3206,
"num_input_tokens_seen": 25474304,
"step": 8085
},
{
"epoch": 0.5178925805006082,
"grad_norm": 58.513587951660156,
"learning_rate": 1.1121706801558226e-06,
"loss": 0.4119,
"num_input_tokens_seen": 25489472,
"step": 8090
},
{
"epoch": 0.518212662441585,
"grad_norm": 42.54377365112305,
"learning_rate": 1.111060293846459e-06,
"loss": 0.3921,
"num_input_tokens_seen": 25504896,
"step": 8095
},
{
"epoch": 0.5185327443825619,
"grad_norm": 77.95099639892578,
"learning_rate": 1.1099497688770148e-06,
"loss": 0.4749,
"num_input_tokens_seen": 25519360,
"step": 8100
},
{
"epoch": 0.5188528263235388,
"grad_norm": 36.677886962890625,
"learning_rate": 1.1088391066339928e-06,
"loss": 0.4449,
"num_input_tokens_seen": 25535680,
"step": 8105
},
{
"epoch": 0.5191729082645157,
"grad_norm": 44.013736724853516,
"learning_rate": 1.1077283085040684e-06,
"loss": 0.5377,
"num_input_tokens_seen": 25550592,
"step": 8110
},
{
"epoch": 0.5194929902054926,
"grad_norm": 40.93517303466797,
"learning_rate": 1.1066173758740863e-06,
"loss": 0.3997,
"num_input_tokens_seen": 25565696,
"step": 8115
},
{
"epoch": 0.5198130721464695,
"grad_norm": 19.403085708618164,
"learning_rate": 1.105506310131058e-06,
"loss": 0.3523,
"num_input_tokens_seen": 25581568,
"step": 8120
},
{
"epoch": 0.5201331540874464,
"grad_norm": 53.75294876098633,
"learning_rate": 1.1043951126621634e-06,
"loss": 0.4599,
"num_input_tokens_seen": 25597760,
"step": 8125
},
{
"epoch": 0.5204532360284233,
"grad_norm": 29.208173751831055,
"learning_rate": 1.1032837848547445e-06,
"loss": 0.4081,
"num_input_tokens_seen": 25615424,
"step": 8130
},
{
"epoch": 0.5207733179694002,
"grad_norm": 33.89809799194336,
"learning_rate": 1.1021723280963074e-06,
"loss": 0.4117,
"num_input_tokens_seen": 25630720,
"step": 8135
},
{
"epoch": 0.5210933999103771,
"grad_norm": 44.60910415649414,
"learning_rate": 1.1010607437745194e-06,
"loss": 0.5029,
"num_input_tokens_seen": 25649280,
"step": 8140
},
{
"epoch": 0.5214134818513539,
"grad_norm": 49.8292236328125,
"learning_rate": 1.0999490332772057e-06,
"loss": 0.5131,
"num_input_tokens_seen": 25664576,
"step": 8145
},
{
"epoch": 0.5217335637923308,
"grad_norm": 27.133338928222656,
"learning_rate": 1.0988371979923507e-06,
"loss": 0.426,
"num_input_tokens_seen": 25680384,
"step": 8150
},
{
"epoch": 0.5220536457333077,
"grad_norm": 26.779043197631836,
"learning_rate": 1.097725239308094e-06,
"loss": 0.4235,
"num_input_tokens_seen": 25696128,
"step": 8155
},
{
"epoch": 0.5223737276742846,
"grad_norm": 15.962846755981445,
"learning_rate": 1.0966131586127278e-06,
"loss": 0.2819,
"num_input_tokens_seen": 25712768,
"step": 8160
},
{
"epoch": 0.5226938096152615,
"grad_norm": 26.21531105041504,
"learning_rate": 1.0955009572946992e-06,
"loss": 0.4086,
"num_input_tokens_seen": 25727616,
"step": 8165
},
{
"epoch": 0.5230138915562383,
"grad_norm": 23.129108428955078,
"learning_rate": 1.094388636742604e-06,
"loss": 0.4159,
"num_input_tokens_seen": 25744384,
"step": 8170
},
{
"epoch": 0.5233339734972153,
"grad_norm": 37.188663482666016,
"learning_rate": 1.0932761983451878e-06,
"loss": 0.3516,
"num_input_tokens_seen": 25760640,
"step": 8175
},
{
"epoch": 0.5236540554381922,
"grad_norm": 32.39508819580078,
"learning_rate": 1.0921636434913425e-06,
"loss": 0.3157,
"num_input_tokens_seen": 25776640,
"step": 8180
},
{
"epoch": 0.5239741373791691,
"grad_norm": 26.821107864379883,
"learning_rate": 1.091050973570106e-06,
"loss": 0.2979,
"num_input_tokens_seen": 25791744,
"step": 8185
},
{
"epoch": 0.524294219320146,
"grad_norm": 60.61221694946289,
"learning_rate": 1.08993818997066e-06,
"loss": 0.5589,
"num_input_tokens_seen": 25808256,
"step": 8190
},
{
"epoch": 0.5246143012611229,
"grad_norm": 35.13313674926758,
"learning_rate": 1.0888252940823283e-06,
"loss": 0.4481,
"num_input_tokens_seen": 25824128,
"step": 8195
},
{
"epoch": 0.5249343832020997,
"grad_norm": 40.965362548828125,
"learning_rate": 1.0877122872945737e-06,
"loss": 0.4767,
"num_input_tokens_seen": 25840576,
"step": 8200
},
{
"epoch": 0.5252544651430766,
"grad_norm": 35.6567497253418,
"learning_rate": 1.0865991709969983e-06,
"loss": 0.3206,
"num_input_tokens_seen": 25856256,
"step": 8205
},
{
"epoch": 0.5255745470840535,
"grad_norm": 21.82451820373535,
"learning_rate": 1.0854859465793416e-06,
"loss": 0.4424,
"num_input_tokens_seen": 25871424,
"step": 8210
},
{
"epoch": 0.5258946290250304,
"grad_norm": 32.83222961425781,
"learning_rate": 1.0843726154314767e-06,
"loss": 0.4916,
"num_input_tokens_seen": 25886272,
"step": 8215
},
{
"epoch": 0.5262147109660072,
"grad_norm": 31.14963722229004,
"learning_rate": 1.083259178943411e-06,
"loss": 0.4302,
"num_input_tokens_seen": 25901952,
"step": 8220
},
{
"epoch": 0.5265347929069842,
"grad_norm": 20.824260711669922,
"learning_rate": 1.0821456385052822e-06,
"loss": 0.3779,
"num_input_tokens_seen": 25917888,
"step": 8225
},
{
"epoch": 0.5268548748479611,
"grad_norm": 45.031227111816406,
"learning_rate": 1.0810319955073598e-06,
"loss": 0.4074,
"num_input_tokens_seen": 25933824,
"step": 8230
},
{
"epoch": 0.527174956788938,
"grad_norm": 38.65109634399414,
"learning_rate": 1.0799182513400393e-06,
"loss": 0.3842,
"num_input_tokens_seen": 25951360,
"step": 8235
},
{
"epoch": 0.5274950387299149,
"grad_norm": 42.26575469970703,
"learning_rate": 1.0788044073938438e-06,
"loss": 0.3524,
"num_input_tokens_seen": 25967232,
"step": 8240
},
{
"epoch": 0.5278151206708918,
"grad_norm": 38.96324157714844,
"learning_rate": 1.0776904650594205e-06,
"loss": 0.4361,
"num_input_tokens_seen": 25982592,
"step": 8245
},
{
"epoch": 0.5281352026118686,
"grad_norm": 59.213871002197266,
"learning_rate": 1.0765764257275394e-06,
"loss": 0.4055,
"num_input_tokens_seen": 25997824,
"step": 8250
},
{
"epoch": 0.5284552845528455,
"grad_norm": 35.566043853759766,
"learning_rate": 1.0754622907890914e-06,
"loss": 0.4559,
"num_input_tokens_seen": 26013632,
"step": 8255
},
{
"epoch": 0.5287753664938224,
"grad_norm": 28.799236297607422,
"learning_rate": 1.0743480616350873e-06,
"loss": 0.3412,
"num_input_tokens_seen": 26028800,
"step": 8260
},
{
"epoch": 0.5290954484347993,
"grad_norm": 31.209489822387695,
"learning_rate": 1.0732337396566558e-06,
"loss": 0.3488,
"num_input_tokens_seen": 26044672,
"step": 8265
},
{
"epoch": 0.5294155303757762,
"grad_norm": 22.62862777709961,
"learning_rate": 1.07211932624504e-06,
"loss": 0.3944,
"num_input_tokens_seen": 26060544,
"step": 8270
},
{
"epoch": 0.529735612316753,
"grad_norm": 17.837751388549805,
"learning_rate": 1.0710048227915988e-06,
"loss": 0.3714,
"num_input_tokens_seen": 26076160,
"step": 8275
},
{
"epoch": 0.53005569425773,
"grad_norm": 28.29045295715332,
"learning_rate": 1.0698902306878024e-06,
"loss": 0.4306,
"num_input_tokens_seen": 26092352,
"step": 8280
},
{
"epoch": 0.5303757761987069,
"grad_norm": 26.03973960876465,
"learning_rate": 1.0687755513252325e-06,
"loss": 0.3033,
"num_input_tokens_seen": 26107776,
"step": 8285
},
{
"epoch": 0.5306958581396838,
"grad_norm": 11.766892433166504,
"learning_rate": 1.0676607860955794e-06,
"loss": 0.3065,
"num_input_tokens_seen": 26123712,
"step": 8290
},
{
"epoch": 0.5310159400806607,
"grad_norm": 44.154823303222656,
"learning_rate": 1.0665459363906404e-06,
"loss": 0.3837,
"num_input_tokens_seen": 26139200,
"step": 8295
},
{
"epoch": 0.5313360220216375,
"grad_norm": 23.409717559814453,
"learning_rate": 1.0654310036023185e-06,
"loss": 0.4238,
"num_input_tokens_seen": 26153600,
"step": 8300
},
{
"epoch": 0.5316561039626144,
"grad_norm": 21.443828582763672,
"learning_rate": 1.0643159891226203e-06,
"loss": 0.4224,
"num_input_tokens_seen": 26169600,
"step": 8305
},
{
"epoch": 0.5319761859035913,
"grad_norm": 32.9205322265625,
"learning_rate": 1.0632008943436545e-06,
"loss": 0.3419,
"num_input_tokens_seen": 26185536,
"step": 8310
},
{
"epoch": 0.5322962678445682,
"grad_norm": 19.891855239868164,
"learning_rate": 1.0620857206576299e-06,
"loss": 0.4642,
"num_input_tokens_seen": 26201536,
"step": 8315
},
{
"epoch": 0.5326163497855451,
"grad_norm": 14.737898826599121,
"learning_rate": 1.0609704694568546e-06,
"loss": 0.2997,
"num_input_tokens_seen": 26216576,
"step": 8320
},
{
"epoch": 0.5329364317265219,
"grad_norm": 24.686262130737305,
"learning_rate": 1.0598551421337318e-06,
"loss": 0.2991,
"num_input_tokens_seen": 26232640,
"step": 8325
},
{
"epoch": 0.5332565136674989,
"grad_norm": 21.631492614746094,
"learning_rate": 1.0587397400807617e-06,
"loss": 0.539,
"num_input_tokens_seen": 26248448,
"step": 8330
},
{
"epoch": 0.5335765956084758,
"grad_norm": 35.107906341552734,
"learning_rate": 1.057624264690536e-06,
"loss": 0.5144,
"num_input_tokens_seen": 26263872,
"step": 8335
},
{
"epoch": 0.5338966775494527,
"grad_norm": 37.214752197265625,
"learning_rate": 1.0565087173557394e-06,
"loss": 0.4616,
"num_input_tokens_seen": 26279872,
"step": 8340
},
{
"epoch": 0.5342167594904296,
"grad_norm": 25.41779136657715,
"learning_rate": 1.055393099469146e-06,
"loss": 0.3469,
"num_input_tokens_seen": 26295680,
"step": 8345
},
{
"epoch": 0.5345368414314065,
"grad_norm": 37.366329193115234,
"learning_rate": 1.054277412423617e-06,
"loss": 0.4155,
"num_input_tokens_seen": 26311040,
"step": 8350
},
{
"epoch": 0.5348569233723833,
"grad_norm": 27.91765594482422,
"learning_rate": 1.0531616576121017e-06,
"loss": 0.47,
"num_input_tokens_seen": 26326144,
"step": 8355
},
{
"epoch": 0.5351770053133602,
"grad_norm": 24.44135856628418,
"learning_rate": 1.0520458364276325e-06,
"loss": 0.336,
"num_input_tokens_seen": 26341952,
"step": 8360
},
{
"epoch": 0.5354970872543371,
"grad_norm": 37.1027717590332,
"learning_rate": 1.0509299502633256e-06,
"loss": 0.3636,
"num_input_tokens_seen": 26356672,
"step": 8365
},
{
"epoch": 0.535817169195314,
"grad_norm": 23.37914276123047,
"learning_rate": 1.0498140005123777e-06,
"loss": 0.4452,
"num_input_tokens_seen": 26373056,
"step": 8370
},
{
"epoch": 0.5361372511362908,
"grad_norm": 13.970296859741211,
"learning_rate": 1.0486979885680653e-06,
"loss": 0.4254,
"num_input_tokens_seen": 26388032,
"step": 8375
},
{
"epoch": 0.5364573330772677,
"grad_norm": 60.650535583496094,
"learning_rate": 1.0475819158237424e-06,
"loss": 0.4324,
"num_input_tokens_seen": 26402880,
"step": 8380
},
{
"epoch": 0.5367774150182447,
"grad_norm": 20.941083908081055,
"learning_rate": 1.0464657836728389e-06,
"loss": 0.481,
"num_input_tokens_seen": 26419328,
"step": 8385
},
{
"epoch": 0.5370974969592216,
"grad_norm": 34.531517028808594,
"learning_rate": 1.045349593508859e-06,
"loss": 0.4169,
"num_input_tokens_seen": 26434112,
"step": 8390
},
{
"epoch": 0.5374175789001985,
"grad_norm": 22.334697723388672,
"learning_rate": 1.0442333467253788e-06,
"loss": 0.2911,
"num_input_tokens_seen": 26450688,
"step": 8395
},
{
"epoch": 0.5377376608411754,
"grad_norm": 37.3916130065918,
"learning_rate": 1.0431170447160463e-06,
"loss": 0.3651,
"num_input_tokens_seen": 26466368,
"step": 8400
},
{
"epoch": 0.5380577427821522,
"grad_norm": 22.045780181884766,
"learning_rate": 1.0420006888745767e-06,
"loss": 0.3663,
"num_input_tokens_seen": 26482624,
"step": 8405
},
{
"epoch": 0.5383778247231291,
"grad_norm": 23.31511688232422,
"learning_rate": 1.0408842805947543e-06,
"loss": 0.3745,
"num_input_tokens_seen": 26499200,
"step": 8410
},
{
"epoch": 0.538697906664106,
"grad_norm": 35.43497848510742,
"learning_rate": 1.0397678212704276e-06,
"loss": 0.5144,
"num_input_tokens_seen": 26514048,
"step": 8415
},
{
"epoch": 0.5390179886050829,
"grad_norm": 33.43364715576172,
"learning_rate": 1.038651312295509e-06,
"loss": 0.4061,
"num_input_tokens_seen": 26529216,
"step": 8420
},
{
"epoch": 0.5393380705460598,
"grad_norm": 26.025461196899414,
"learning_rate": 1.037534755063973e-06,
"loss": 0.4173,
"num_input_tokens_seen": 26545152,
"step": 8425
},
{
"epoch": 0.5396581524870366,
"grad_norm": 46.11745071411133,
"learning_rate": 1.0364181509698548e-06,
"loss": 0.4124,
"num_input_tokens_seen": 26560512,
"step": 8430
},
{
"epoch": 0.5399782344280136,
"grad_norm": 32.11628723144531,
"learning_rate": 1.0353015014072476e-06,
"loss": 0.3606,
"num_input_tokens_seen": 26575488,
"step": 8435
},
{
"epoch": 0.5402983163689905,
"grad_norm": 50.48931884765625,
"learning_rate": 1.0341848077703013e-06,
"loss": 0.4008,
"num_input_tokens_seen": 26591040,
"step": 8440
},
{
"epoch": 0.5406183983099674,
"grad_norm": 26.283470153808594,
"learning_rate": 1.033068071453221e-06,
"loss": 0.3228,
"num_input_tokens_seen": 26606976,
"step": 8445
},
{
"epoch": 0.5409384802509443,
"grad_norm": 35.38628005981445,
"learning_rate": 1.0319512938502653e-06,
"loss": 0.372,
"num_input_tokens_seen": 26623296,
"step": 8450
},
{
"epoch": 0.5412585621919211,
"grad_norm": 33.70118713378906,
"learning_rate": 1.0308344763557444e-06,
"loss": 0.3241,
"num_input_tokens_seen": 26638336,
"step": 8455
},
{
"epoch": 0.541578644132898,
"grad_norm": 18.15200424194336,
"learning_rate": 1.0297176203640175e-06,
"loss": 0.2886,
"num_input_tokens_seen": 26654400,
"step": 8460
},
{
"epoch": 0.5418987260738749,
"grad_norm": 54.90450668334961,
"learning_rate": 1.0286007272694924e-06,
"loss": 0.3553,
"num_input_tokens_seen": 26669568,
"step": 8465
},
{
"epoch": 0.5422188080148518,
"grad_norm": 28.259803771972656,
"learning_rate": 1.0274837984666239e-06,
"loss": 0.4816,
"num_input_tokens_seen": 26686016,
"step": 8470
},
{
"epoch": 0.5425388899558287,
"grad_norm": 31.482337951660156,
"learning_rate": 1.02636683534991e-06,
"loss": 0.4212,
"num_input_tokens_seen": 26701504,
"step": 8475
},
{
"epoch": 0.5428589718968055,
"grad_norm": 61.22187042236328,
"learning_rate": 1.0252498393138928e-06,
"loss": 0.5995,
"num_input_tokens_seen": 26717120,
"step": 8480
},
{
"epoch": 0.5431790538377824,
"grad_norm": 66.0864486694336,
"learning_rate": 1.0241328117531546e-06,
"loss": 0.415,
"num_input_tokens_seen": 26732736,
"step": 8485
},
{
"epoch": 0.5434991357787594,
"grad_norm": 33.275177001953125,
"learning_rate": 1.0230157540623174e-06,
"loss": 0.4128,
"num_input_tokens_seen": 26747392,
"step": 8490
},
{
"epoch": 0.5438192177197363,
"grad_norm": 24.452760696411133,
"learning_rate": 1.0218986676360415e-06,
"loss": 0.4605,
"num_input_tokens_seen": 26762112,
"step": 8495
},
{
"epoch": 0.5441392996607132,
"grad_norm": 22.125686645507812,
"learning_rate": 1.0207815538690216e-06,
"loss": 0.3673,
"num_input_tokens_seen": 26777856,
"step": 8500
},
{
"epoch": 0.54445938160169,
"grad_norm": 49.08604049682617,
"learning_rate": 1.0196644141559877e-06,
"loss": 0.3133,
"num_input_tokens_seen": 26794048,
"step": 8505
},
{
"epoch": 0.5447794635426669,
"grad_norm": 40.573177337646484,
"learning_rate": 1.0185472498917021e-06,
"loss": 0.3397,
"num_input_tokens_seen": 26809792,
"step": 8510
},
{
"epoch": 0.5450995454836438,
"grad_norm": 53.308963775634766,
"learning_rate": 1.017430062470957e-06,
"loss": 0.4261,
"num_input_tokens_seen": 26825024,
"step": 8515
},
{
"epoch": 0.5454196274246207,
"grad_norm": 29.25503921508789,
"learning_rate": 1.016312853288574e-06,
"loss": 0.3494,
"num_input_tokens_seen": 26841536,
"step": 8520
},
{
"epoch": 0.5457397093655976,
"grad_norm": 23.03032684326172,
"learning_rate": 1.0151956237394027e-06,
"loss": 0.3875,
"num_input_tokens_seen": 26857600,
"step": 8525
},
{
"epoch": 0.5460597913065744,
"grad_norm": 30.113536834716797,
"learning_rate": 1.0140783752183164e-06,
"loss": 0.3999,
"num_input_tokens_seen": 26874176,
"step": 8530
},
{
"epoch": 0.5463798732475513,
"grad_norm": 29.10158348083496,
"learning_rate": 1.0129611091202138e-06,
"loss": 0.4338,
"num_input_tokens_seen": 26890176,
"step": 8535
},
{
"epoch": 0.5466999551885282,
"grad_norm": 29.037277221679688,
"learning_rate": 1.0118438268400135e-06,
"loss": 0.2926,
"num_input_tokens_seen": 26905728,
"step": 8540
},
{
"epoch": 0.5470200371295052,
"grad_norm": 48.003170013427734,
"learning_rate": 1.0107265297726568e-06,
"loss": 0.4599,
"num_input_tokens_seen": 26921280,
"step": 8545
},
{
"epoch": 0.5473401190704821,
"grad_norm": 32.988037109375,
"learning_rate": 1.009609219313102e-06,
"loss": 0.4048,
"num_input_tokens_seen": 26936704,
"step": 8550
},
{
"epoch": 0.547660201011459,
"grad_norm": 18.555313110351562,
"learning_rate": 1.0084918968563236e-06,
"loss": 0.3919,
"num_input_tokens_seen": 26952448,
"step": 8555
},
{
"epoch": 0.5479802829524358,
"grad_norm": 31.703615188598633,
"learning_rate": 1.0073745637973124e-06,
"loss": 0.3917,
"num_input_tokens_seen": 26967680,
"step": 8560
},
{
"epoch": 0.5483003648934127,
"grad_norm": 17.730825424194336,
"learning_rate": 1.0062572215310718e-06,
"loss": 0.3606,
"num_input_tokens_seen": 26982400,
"step": 8565
},
{
"epoch": 0.5486204468343896,
"grad_norm": 45.23028564453125,
"learning_rate": 1.0051398714526165e-06,
"loss": 0.3227,
"num_input_tokens_seen": 26998400,
"step": 8570
},
{
"epoch": 0.5489405287753665,
"grad_norm": 54.13836669921875,
"learning_rate": 1.0040225149569712e-06,
"loss": 0.3731,
"num_input_tokens_seen": 27015936,
"step": 8575
},
{
"epoch": 0.5492606107163434,
"grad_norm": 40.3460693359375,
"learning_rate": 1.0029051534391693e-06,
"loss": 0.3339,
"num_input_tokens_seen": 27030528,
"step": 8580
},
{
"epoch": 0.5495806926573202,
"grad_norm": 20.486562728881836,
"learning_rate": 1.001787788294249e-06,
"loss": 0.3793,
"num_input_tokens_seen": 27046080,
"step": 8585
},
{
"epoch": 0.5499007745982971,
"grad_norm": 27.3046932220459,
"learning_rate": 1.0006704209172537e-06,
"loss": 0.4226,
"num_input_tokens_seen": 27061504,
"step": 8590
},
{
"epoch": 0.5502208565392741,
"grad_norm": 53.627471923828125,
"learning_rate": 9.995530527032301e-07,
"loss": 0.4382,
"num_input_tokens_seen": 27077056,
"step": 8595
},
{
"epoch": 0.550540938480251,
"grad_norm": 27.333585739135742,
"learning_rate": 9.984356850472257e-07,
"loss": 0.3435,
"num_input_tokens_seen": 27095168,
"step": 8600
},
{
"epoch": 0.5506689712566417,
"eval_loss": 0.40140706300735474,
"eval_runtime": 50.6758,
"eval_samples_per_second": 274.017,
"eval_steps_per_second": 34.257,
"num_input_tokens_seen": 27101056,
"step": 8602
},
{
"epoch": 0.5508610204212279,
"grad_norm": 15.570174217224121,
"learning_rate": 9.97318319344287e-07,
"loss": 0.3753,
"num_input_tokens_seen": 27110144,
"step": 8605
},
{
"epoch": 0.5511811023622047,
"grad_norm": 27.186506271362305,
"learning_rate": 9.962009569894577e-07,
"loss": 0.5273,
"num_input_tokens_seen": 27124864,
"step": 8610
},
{
"epoch": 0.5515011843031816,
"grad_norm": 32.424312591552734,
"learning_rate": 9.95083599377778e-07,
"loss": 0.3813,
"num_input_tokens_seen": 27140160,
"step": 8615
},
{
"epoch": 0.5518212662441585,
"grad_norm": 29.457183837890625,
"learning_rate": 9.939662479042828e-07,
"loss": 0.3966,
"num_input_tokens_seen": 27155712,
"step": 8620
},
{
"epoch": 0.5521413481851354,
"grad_norm": 62.78722381591797,
"learning_rate": 9.92848903963998e-07,
"loss": 0.4682,
"num_input_tokens_seen": 27171520,
"step": 8625
},
{
"epoch": 0.5524614301261123,
"grad_norm": 38.034725189208984,
"learning_rate": 9.9173156895194e-07,
"loss": 0.441,
"num_input_tokens_seen": 27186752,
"step": 8630
},
{
"epoch": 0.5527815120670891,
"grad_norm": 42.317752838134766,
"learning_rate": 9.906142442631154e-07,
"loss": 0.3889,
"num_input_tokens_seen": 27201664,
"step": 8635
},
{
"epoch": 0.553101594008066,
"grad_norm": 18.538700103759766,
"learning_rate": 9.894969312925171e-07,
"loss": 0.3914,
"num_input_tokens_seen": 27218880,
"step": 8640
},
{
"epoch": 0.5534216759490429,
"grad_norm": 47.84024429321289,
"learning_rate": 9.883796314351234e-07,
"loss": 0.3477,
"num_input_tokens_seen": 27235648,
"step": 8645
},
{
"epoch": 0.5537417578900199,
"grad_norm": 15.358366012573242,
"learning_rate": 9.872623460858966e-07,
"loss": 0.3945,
"num_input_tokens_seen": 27250880,
"step": 8650
},
{
"epoch": 0.5540618398309968,
"grad_norm": 13.712646484375,
"learning_rate": 9.861450766397799e-07,
"loss": 0.3152,
"num_input_tokens_seen": 27266880,
"step": 8655
},
{
"epoch": 0.5543819217719737,
"grad_norm": 25.727859497070312,
"learning_rate": 9.850278244916976e-07,
"loss": 0.4157,
"num_input_tokens_seen": 27282816,
"step": 8660
},
{
"epoch": 0.5547020037129505,
"grad_norm": 19.773151397705078,
"learning_rate": 9.839105910365524e-07,
"loss": 0.4323,
"num_input_tokens_seen": 27298496,
"step": 8665
},
{
"epoch": 0.5550220856539274,
"grad_norm": 41.33201217651367,
"learning_rate": 9.827933776692235e-07,
"loss": 0.3436,
"num_input_tokens_seen": 27313856,
"step": 8670
},
{
"epoch": 0.5553421675949043,
"grad_norm": 30.78877067565918,
"learning_rate": 9.81676185784564e-07,
"loss": 0.3362,
"num_input_tokens_seen": 27328448,
"step": 8675
},
{
"epoch": 0.5556622495358812,
"grad_norm": 17.378814697265625,
"learning_rate": 9.805590167774021e-07,
"loss": 0.4001,
"num_input_tokens_seen": 27343872,
"step": 8680
},
{
"epoch": 0.555982331476858,
"grad_norm": 80.3104476928711,
"learning_rate": 9.79441872042536e-07,
"loss": 0.5593,
"num_input_tokens_seen": 27358720,
"step": 8685
},
{
"epoch": 0.5563024134178349,
"grad_norm": 31.166763305664062,
"learning_rate": 9.783247529747338e-07,
"loss": 0.3818,
"num_input_tokens_seen": 27373312,
"step": 8690
},
{
"epoch": 0.5566224953588118,
"grad_norm": 24.34861946105957,
"learning_rate": 9.772076609687323e-07,
"loss": 0.358,
"num_input_tokens_seen": 27388544,
"step": 8695
},
{
"epoch": 0.5569425772997888,
"grad_norm": 37.79168701171875,
"learning_rate": 9.760905974192334e-07,
"loss": 0.3191,
"num_input_tokens_seen": 27405120,
"step": 8700
},
{
"epoch": 0.5572626592407657,
"grad_norm": 25.076248168945312,
"learning_rate": 9.749735637209044e-07,
"loss": 0.4284,
"num_input_tokens_seen": 27420544,
"step": 8705
},
{
"epoch": 0.5575827411817426,
"grad_norm": 18.509187698364258,
"learning_rate": 9.738565612683754e-07,
"loss": 0.3233,
"num_input_tokens_seen": 27435456,
"step": 8710
},
{
"epoch": 0.5579028231227194,
"grad_norm": 29.372270584106445,
"learning_rate": 9.727395914562363e-07,
"loss": 0.3406,
"num_input_tokens_seen": 27452032,
"step": 8715
},
{
"epoch": 0.5582229050636963,
"grad_norm": 26.19953155517578,
"learning_rate": 9.716226556790372e-07,
"loss": 0.4084,
"num_input_tokens_seen": 27467520,
"step": 8720
},
{
"epoch": 0.5585429870046732,
"grad_norm": 29.907907485961914,
"learning_rate": 9.705057553312855e-07,
"loss": 0.3149,
"num_input_tokens_seen": 27482816,
"step": 8725
},
{
"epoch": 0.5588630689456501,
"grad_norm": 26.37492561340332,
"learning_rate": 9.693888918074452e-07,
"loss": 0.3853,
"num_input_tokens_seen": 27497600,
"step": 8730
},
{
"epoch": 0.559183150886627,
"grad_norm": 41.72505187988281,
"learning_rate": 9.682720665019325e-07,
"loss": 0.4952,
"num_input_tokens_seen": 27513344,
"step": 8735
},
{
"epoch": 0.5595032328276038,
"grad_norm": 21.078269958496094,
"learning_rate": 9.671552808091172e-07,
"loss": 0.4144,
"num_input_tokens_seen": 27530304,
"step": 8740
},
{
"epoch": 0.5598233147685807,
"grad_norm": 24.30893898010254,
"learning_rate": 9.660385361233195e-07,
"loss": 0.3504,
"num_input_tokens_seen": 27545664,
"step": 8745
},
{
"epoch": 0.5601433967095576,
"grad_norm": 21.70425796508789,
"learning_rate": 9.649218338388084e-07,
"loss": 0.3053,
"num_input_tokens_seen": 27560704,
"step": 8750
},
{
"epoch": 0.5604634786505346,
"grad_norm": 23.361452102661133,
"learning_rate": 9.638051753497994e-07,
"loss": 0.4472,
"num_input_tokens_seen": 27577472,
"step": 8755
},
{
"epoch": 0.5607835605915115,
"grad_norm": 21.00771713256836,
"learning_rate": 9.62688562050454e-07,
"loss": 0.3676,
"num_input_tokens_seen": 27592960,
"step": 8760
},
{
"epoch": 0.5611036425324883,
"grad_norm": 16.86574363708496,
"learning_rate": 9.615719953348772e-07,
"loss": 0.4074,
"num_input_tokens_seen": 27610304,
"step": 8765
},
{
"epoch": 0.5614237244734652,
"grad_norm": 36.6751594543457,
"learning_rate": 9.604554765971148e-07,
"loss": 0.568,
"num_input_tokens_seen": 27628288,
"step": 8770
},
{
"epoch": 0.5617438064144421,
"grad_norm": 21.949472427368164,
"learning_rate": 9.593390072311549e-07,
"loss": 0.4119,
"num_input_tokens_seen": 27643904,
"step": 8775
},
{
"epoch": 0.562063888355419,
"grad_norm": 21.52281951904297,
"learning_rate": 9.582225886309216e-07,
"loss": 0.3703,
"num_input_tokens_seen": 27660224,
"step": 8780
},
{
"epoch": 0.5623839702963959,
"grad_norm": 17.920726776123047,
"learning_rate": 9.571062221902767e-07,
"loss": 0.3098,
"num_input_tokens_seen": 27675136,
"step": 8785
},
{
"epoch": 0.5627040522373727,
"grad_norm": 56.251609802246094,
"learning_rate": 9.559899093030175e-07,
"loss": 0.3557,
"num_input_tokens_seen": 27690176,
"step": 8790
},
{
"epoch": 0.5630241341783496,
"grad_norm": 34.147911071777344,
"learning_rate": 9.54873651362873e-07,
"loss": 0.3065,
"num_input_tokens_seen": 27704512,
"step": 8795
},
{
"epoch": 0.5633442161193265,
"grad_norm": 47.091190338134766,
"learning_rate": 9.537574497635043e-07,
"loss": 0.4565,
"num_input_tokens_seen": 27720448,
"step": 8800
},
{
"epoch": 0.5636642980603035,
"grad_norm": 30.534454345703125,
"learning_rate": 9.52641305898503e-07,
"loss": 0.5121,
"num_input_tokens_seen": 27735808,
"step": 8805
},
{
"epoch": 0.5639843800012804,
"grad_norm": 26.536653518676758,
"learning_rate": 9.515252211613873e-07,
"loss": 0.3203,
"num_input_tokens_seen": 27750464,
"step": 8810
},
{
"epoch": 0.5643044619422573,
"grad_norm": 37.218082427978516,
"learning_rate": 9.504091969456021e-07,
"loss": 0.4539,
"num_input_tokens_seen": 27764352,
"step": 8815
},
{
"epoch": 0.5646245438832341,
"grad_norm": 21.591670989990234,
"learning_rate": 9.492932346445165e-07,
"loss": 0.3435,
"num_input_tokens_seen": 27779840,
"step": 8820
},
{
"epoch": 0.564944625824211,
"grad_norm": 23.125856399536133,
"learning_rate": 9.48177335651423e-07,
"loss": 0.2767,
"num_input_tokens_seen": 27796352,
"step": 8825
},
{
"epoch": 0.5652647077651879,
"grad_norm": 34.4669075012207,
"learning_rate": 9.470615013595346e-07,
"loss": 0.343,
"num_input_tokens_seen": 27810624,
"step": 8830
},
{
"epoch": 0.5655847897061648,
"grad_norm": 37.58964538574219,
"learning_rate": 9.459457331619829e-07,
"loss": 0.4395,
"num_input_tokens_seen": 27825152,
"step": 8835
},
{
"epoch": 0.5659048716471416,
"grad_norm": 32.50901794433594,
"learning_rate": 9.448300324518182e-07,
"loss": 0.4142,
"num_input_tokens_seen": 27840384,
"step": 8840
},
{
"epoch": 0.5662249535881185,
"grad_norm": 36.96337127685547,
"learning_rate": 9.437144006220058e-07,
"loss": 0.3014,
"num_input_tokens_seen": 27856640,
"step": 8845
},
{
"epoch": 0.5665450355290954,
"grad_norm": 9.99196720123291,
"learning_rate": 9.425988390654249e-07,
"loss": 0.2097,
"num_input_tokens_seen": 27872768,
"step": 8850
},
{
"epoch": 0.5668651174700723,
"grad_norm": 39.93415069580078,
"learning_rate": 9.414833491748677e-07,
"loss": 0.5239,
"num_input_tokens_seen": 27887488,
"step": 8855
},
{
"epoch": 0.5671851994110493,
"grad_norm": 42.46604919433594,
"learning_rate": 9.40367932343036e-07,
"loss": 0.2943,
"num_input_tokens_seen": 27902720,
"step": 8860
},
{
"epoch": 0.5675052813520262,
"grad_norm": 32.06291961669922,
"learning_rate": 9.392525899625407e-07,
"loss": 0.3817,
"num_input_tokens_seen": 27918080,
"step": 8865
},
{
"epoch": 0.567825363293003,
"grad_norm": 50.3513298034668,
"learning_rate": 9.381373234259004e-07,
"loss": 0.3887,
"num_input_tokens_seen": 27933760,
"step": 8870
},
{
"epoch": 0.5681454452339799,
"grad_norm": 45.42866134643555,
"learning_rate": 9.370221341255382e-07,
"loss": 0.3858,
"num_input_tokens_seen": 27948992,
"step": 8875
},
{
"epoch": 0.5684655271749568,
"grad_norm": 33.09145736694336,
"learning_rate": 9.359070234537807e-07,
"loss": 0.3428,
"num_input_tokens_seen": 27966848,
"step": 8880
},
{
"epoch": 0.5687856091159337,
"grad_norm": 28.965185165405273,
"learning_rate": 9.34791992802857e-07,
"loss": 0.3816,
"num_input_tokens_seen": 27981696,
"step": 8885
},
{
"epoch": 0.5691056910569106,
"grad_norm": 28.548234939575195,
"learning_rate": 9.336770435648963e-07,
"loss": 0.2578,
"num_input_tokens_seen": 27997376,
"step": 8890
},
{
"epoch": 0.5694257729978874,
"grad_norm": 27.79604148864746,
"learning_rate": 9.325621771319246e-07,
"loss": 0.4013,
"num_input_tokens_seen": 28014016,
"step": 8895
},
{
"epoch": 0.5697458549388643,
"grad_norm": 21.713794708251953,
"learning_rate": 9.314473948958673e-07,
"loss": 0.4245,
"num_input_tokens_seen": 28030400,
"step": 8900
},
{
"epoch": 0.5700659368798412,
"grad_norm": 28.179527282714844,
"learning_rate": 9.303326982485422e-07,
"loss": 0.3464,
"num_input_tokens_seen": 28047104,
"step": 8905
},
{
"epoch": 0.5703860188208181,
"grad_norm": 53.25274658203125,
"learning_rate": 9.29218088581661e-07,
"loss": 0.3751,
"num_input_tokens_seen": 28063168,
"step": 8910
},
{
"epoch": 0.5707061007617951,
"grad_norm": 29.214618682861328,
"learning_rate": 9.281035672868278e-07,
"loss": 0.3567,
"num_input_tokens_seen": 28079104,
"step": 8915
},
{
"epoch": 0.571026182702772,
"grad_norm": 28.718603134155273,
"learning_rate": 9.269891357555348e-07,
"loss": 0.4098,
"num_input_tokens_seen": 28094720,
"step": 8920
},
{
"epoch": 0.5713462646437488,
"grad_norm": 38.927711486816406,
"learning_rate": 9.25874795379163e-07,
"loss": 0.2775,
"num_input_tokens_seen": 28110848,
"step": 8925
},
{
"epoch": 0.5716663465847257,
"grad_norm": 23.283519744873047,
"learning_rate": 9.247605475489793e-07,
"loss": 0.4246,
"num_input_tokens_seen": 28127040,
"step": 8930
},
{
"epoch": 0.5719864285257026,
"grad_norm": 33.84523010253906,
"learning_rate": 9.236463936561358e-07,
"loss": 0.3106,
"num_input_tokens_seen": 28143424,
"step": 8935
},
{
"epoch": 0.5723065104666795,
"grad_norm": 48.51530456542969,
"learning_rate": 9.225323350916661e-07,
"loss": 0.5312,
"num_input_tokens_seen": 28158528,
"step": 8940
},
{
"epoch": 0.5726265924076563,
"grad_norm": 38.23236846923828,
"learning_rate": 9.214183732464855e-07,
"loss": 0.3963,
"num_input_tokens_seen": 28173888,
"step": 8945
},
{
"epoch": 0.5729466743486332,
"grad_norm": 21.70241928100586,
"learning_rate": 9.203045095113886e-07,
"loss": 0.3663,
"num_input_tokens_seen": 28191872,
"step": 8950
},
{
"epoch": 0.5732667562896101,
"grad_norm": 45.41887283325195,
"learning_rate": 9.191907452770476e-07,
"loss": 0.4394,
"num_input_tokens_seen": 28206912,
"step": 8955
},
{
"epoch": 0.573586838230587,
"grad_norm": 30.060129165649414,
"learning_rate": 9.180770819340095e-07,
"loss": 0.4103,
"num_input_tokens_seen": 28222336,
"step": 8960
},
{
"epoch": 0.573906920171564,
"grad_norm": 13.660242080688477,
"learning_rate": 9.169635208726967e-07,
"loss": 0.3816,
"num_input_tokens_seen": 28238144,
"step": 8965
},
{
"epoch": 0.5742270021125409,
"grad_norm": 61.72530746459961,
"learning_rate": 9.15850063483403e-07,
"loss": 0.3919,
"num_input_tokens_seen": 28253376,
"step": 8970
},
{
"epoch": 0.5745470840535177,
"grad_norm": 27.641267776489258,
"learning_rate": 9.147367111562928e-07,
"loss": 0.3549,
"num_input_tokens_seen": 28269248,
"step": 8975
},
{
"epoch": 0.5748671659944946,
"grad_norm": 41.6641845703125,
"learning_rate": 9.136234652814005e-07,
"loss": 0.4151,
"num_input_tokens_seen": 28285440,
"step": 8980
},
{
"epoch": 0.5751872479354715,
"grad_norm": 31.049930572509766,
"learning_rate": 9.125103272486255e-07,
"loss": 0.3061,
"num_input_tokens_seen": 28300736,
"step": 8985
},
{
"epoch": 0.5755073298764484,
"grad_norm": 31.115554809570312,
"learning_rate": 9.11397298447734e-07,
"loss": 0.3626,
"num_input_tokens_seen": 28315712,
"step": 8990
},
{
"epoch": 0.5758274118174252,
"grad_norm": 28.032060623168945,
"learning_rate": 9.10284380268356e-07,
"loss": 0.342,
"num_input_tokens_seen": 28332032,
"step": 8995
},
{
"epoch": 0.5761474937584021,
"grad_norm": 27.94725227355957,
"learning_rate": 9.091715740999828e-07,
"loss": 0.4546,
"num_input_tokens_seen": 28347968,
"step": 9000
},
{
"epoch": 0.576467575699379,
"grad_norm": 26.713380813598633,
"learning_rate": 9.080588813319654e-07,
"loss": 0.39,
"num_input_tokens_seen": 28362944,
"step": 9005
},
{
"epoch": 0.5767876576403559,
"grad_norm": 41.13479995727539,
"learning_rate": 9.069463033535143e-07,
"loss": 0.2894,
"num_input_tokens_seen": 28378624,
"step": 9010
},
{
"epoch": 0.5771077395813328,
"grad_norm": 62.73693084716797,
"learning_rate": 9.058338415536962e-07,
"loss": 0.3832,
"num_input_tokens_seen": 28394048,
"step": 9015
},
{
"epoch": 0.5774278215223098,
"grad_norm": 40.832916259765625,
"learning_rate": 9.04721497321432e-07,
"loss": 0.3796,
"num_input_tokens_seen": 28409664,
"step": 9020
},
{
"epoch": 0.5777479034632866,
"grad_norm": 33.98543930053711,
"learning_rate": 9.036092720454977e-07,
"loss": 0.3794,
"num_input_tokens_seen": 28424768,
"step": 9025
},
{
"epoch": 0.5780679854042635,
"grad_norm": 32.285762786865234,
"learning_rate": 9.024971671145189e-07,
"loss": 0.3439,
"num_input_tokens_seen": 28439424,
"step": 9030
},
{
"epoch": 0.5783880673452404,
"grad_norm": 46.4213752746582,
"learning_rate": 9.013851839169718e-07,
"loss": 0.443,
"num_input_tokens_seen": 28456064,
"step": 9035
},
{
"epoch": 0.5787081492862173,
"grad_norm": 41.62101745605469,
"learning_rate": 9.002733238411801e-07,
"loss": 0.3457,
"num_input_tokens_seen": 28472768,
"step": 9040
},
{
"epoch": 0.5790282312271942,
"grad_norm": 31.970539093017578,
"learning_rate": 8.991615882753147e-07,
"loss": 0.3528,
"num_input_tokens_seen": 28488704,
"step": 9045
},
{
"epoch": 0.579348313168171,
"grad_norm": 60.6664924621582,
"learning_rate": 8.980499786073904e-07,
"loss": 0.4516,
"num_input_tokens_seen": 28503808,
"step": 9050
},
{
"epoch": 0.5796683951091479,
"grad_norm": 67.51182556152344,
"learning_rate": 8.969384962252645e-07,
"loss": 0.4616,
"num_input_tokens_seen": 28520320,
"step": 9055
},
{
"epoch": 0.5799884770501248,
"grad_norm": 48.923702239990234,
"learning_rate": 8.958271425166366e-07,
"loss": 0.4395,
"num_input_tokens_seen": 28535680,
"step": 9060
},
{
"epoch": 0.5803085589911017,
"grad_norm": 24.09952735900879,
"learning_rate": 8.947159188690442e-07,
"loss": 0.3943,
"num_input_tokens_seen": 28551488,
"step": 9065
},
{
"epoch": 0.5806286409320787,
"grad_norm": 63.0959587097168,
"learning_rate": 8.93604826669863e-07,
"loss": 0.4633,
"num_input_tokens_seen": 28567040,
"step": 9070
},
{
"epoch": 0.5809487228730555,
"grad_norm": 25.30870246887207,
"learning_rate": 8.924938673063052e-07,
"loss": 0.389,
"num_input_tokens_seen": 28581568,
"step": 9075
},
{
"epoch": 0.5812688048140324,
"grad_norm": 15.70492172241211,
"learning_rate": 8.913830421654166e-07,
"loss": 0.3616,
"num_input_tokens_seen": 28596992,
"step": 9080
},
{
"epoch": 0.5815888867550093,
"grad_norm": 25.204086303710938,
"learning_rate": 8.902723526340746e-07,
"loss": 0.4752,
"num_input_tokens_seen": 28613952,
"step": 9085
},
{
"epoch": 0.5819089686959862,
"grad_norm": 33.69202423095703,
"learning_rate": 8.89161800098989e-07,
"loss": 0.4343,
"num_input_tokens_seen": 28628736,
"step": 9090
},
{
"epoch": 0.5822290506369631,
"grad_norm": 56.151214599609375,
"learning_rate": 8.880513859466974e-07,
"loss": 0.3683,
"num_input_tokens_seen": 28644928,
"step": 9095
},
{
"epoch": 0.5825491325779399,
"grad_norm": 20.137807846069336,
"learning_rate": 8.869411115635645e-07,
"loss": 0.2861,
"num_input_tokens_seen": 28661184,
"step": 9100
},
{
"epoch": 0.5828692145189168,
"grad_norm": 19.313888549804688,
"learning_rate": 8.858309783357816e-07,
"loss": 0.2823,
"num_input_tokens_seen": 28675776,
"step": 9105
},
{
"epoch": 0.5831892964598937,
"grad_norm": 55.6663703918457,
"learning_rate": 8.847209876493629e-07,
"loss": 0.4335,
"num_input_tokens_seen": 28692160,
"step": 9110
},
{
"epoch": 0.5835093784008706,
"grad_norm": 29.397314071655273,
"learning_rate": 8.836111408901441e-07,
"loss": 0.2627,
"num_input_tokens_seen": 28707328,
"step": 9115
},
{
"epoch": 0.5838294603418475,
"grad_norm": 45.07856369018555,
"learning_rate": 8.825014394437828e-07,
"loss": 0.4159,
"num_input_tokens_seen": 28722624,
"step": 9120
},
{
"epoch": 0.5841495422828245,
"grad_norm": 19.385255813598633,
"learning_rate": 8.813918846957542e-07,
"loss": 0.4013,
"num_input_tokens_seen": 28737856,
"step": 9125
},
{
"epoch": 0.5844696242238013,
"grad_norm": 20.24775505065918,
"learning_rate": 8.802824780313499e-07,
"loss": 0.4447,
"num_input_tokens_seen": 28752448,
"step": 9130
},
{
"epoch": 0.5847897061647782,
"grad_norm": 24.05107307434082,
"learning_rate": 8.791732208356771e-07,
"loss": 0.3924,
"num_input_tokens_seen": 28767616,
"step": 9135
},
{
"epoch": 0.5851097881057551,
"grad_norm": 16.49118995666504,
"learning_rate": 8.780641144936573e-07,
"loss": 0.4676,
"num_input_tokens_seen": 28782400,
"step": 9140
},
{
"epoch": 0.585429870046732,
"grad_norm": 51.63336944580078,
"learning_rate": 8.76955160390022e-07,
"loss": 0.446,
"num_input_tokens_seen": 28798336,
"step": 9145
},
{
"epoch": 0.5857499519877089,
"grad_norm": 16.7198543548584,
"learning_rate": 8.758463599093136e-07,
"loss": 0.2893,
"num_input_tokens_seen": 28814336,
"step": 9150
},
{
"epoch": 0.5860700339286857,
"grad_norm": 42.81842041015625,
"learning_rate": 8.747377144358825e-07,
"loss": 0.5245,
"num_input_tokens_seen": 28830656,
"step": 9155
},
{
"epoch": 0.5863901158696626,
"grad_norm": 45.69813919067383,
"learning_rate": 8.736292253538861e-07,
"loss": 0.4169,
"num_input_tokens_seen": 28846656,
"step": 9160
},
{
"epoch": 0.5867101978106395,
"grad_norm": 36.1093635559082,
"learning_rate": 8.725208940472851e-07,
"loss": 0.3115,
"num_input_tokens_seen": 28862848,
"step": 9165
},
{
"epoch": 0.5870302797516164,
"grad_norm": 15.554344177246094,
"learning_rate": 8.714127218998448e-07,
"loss": 0.4071,
"num_input_tokens_seen": 28878400,
"step": 9170
},
{
"epoch": 0.5873503616925934,
"grad_norm": 65.59827423095703,
"learning_rate": 8.70304710295131e-07,
"loss": 0.5141,
"num_input_tokens_seen": 28893568,
"step": 9175
},
{
"epoch": 0.5876704436335702,
"grad_norm": 33.058006286621094,
"learning_rate": 8.691968606165092e-07,
"loss": 0.3766,
"num_input_tokens_seen": 28909824,
"step": 9180
},
{
"epoch": 0.5879905255745471,
"grad_norm": 31.04238510131836,
"learning_rate": 8.680891742471429e-07,
"loss": 0.3189,
"num_input_tokens_seen": 28925568,
"step": 9185
},
{
"epoch": 0.588310607515524,
"grad_norm": 28.480064392089844,
"learning_rate": 8.669816525699912e-07,
"loss": 0.3236,
"num_input_tokens_seen": 28941056,
"step": 9190
},
{
"epoch": 0.5886306894565009,
"grad_norm": 35.62641143798828,
"learning_rate": 8.658742969678079e-07,
"loss": 0.4153,
"num_input_tokens_seen": 28955456,
"step": 9195
},
{
"epoch": 0.5889507713974778,
"grad_norm": 35.563079833984375,
"learning_rate": 8.647671088231398e-07,
"loss": 0.2925,
"num_input_tokens_seen": 28971136,
"step": 9200
},
{
"epoch": 0.5892708533384546,
"grad_norm": 49.758174896240234,
"learning_rate": 8.636600895183245e-07,
"loss": 0.4144,
"num_input_tokens_seen": 28988480,
"step": 9205
},
{
"epoch": 0.5895909352794315,
"grad_norm": 45.750244140625,
"learning_rate": 8.625532404354877e-07,
"loss": 0.3702,
"num_input_tokens_seen": 29004544,
"step": 9210
},
{
"epoch": 0.5899110172204084,
"grad_norm": 15.713849067687988,
"learning_rate": 8.614465629565443e-07,
"loss": 0.3944,
"num_input_tokens_seen": 29019328,
"step": 9215
},
{
"epoch": 0.5902310991613853,
"grad_norm": 27.874608993530273,
"learning_rate": 8.603400584631939e-07,
"loss": 0.3414,
"num_input_tokens_seen": 29034752,
"step": 9220
},
{
"epoch": 0.5905511811023622,
"grad_norm": 35.91742706298828,
"learning_rate": 8.592337283369198e-07,
"loss": 0.4473,
"num_input_tokens_seen": 29050816,
"step": 9225
},
{
"epoch": 0.5908712630433391,
"grad_norm": 28.39652442932129,
"learning_rate": 8.581275739589893e-07,
"loss": 0.2833,
"num_input_tokens_seen": 29065920,
"step": 9230
},
{
"epoch": 0.591191344984316,
"grad_norm": 38.084529876708984,
"learning_rate": 8.570215967104481e-07,
"loss": 0.509,
"num_input_tokens_seen": 29080960,
"step": 9235
},
{
"epoch": 0.5915114269252929,
"grad_norm": 25.937759399414062,
"learning_rate": 8.559157979721225e-07,
"loss": 0.4754,
"num_input_tokens_seen": 29096768,
"step": 9240
},
{
"epoch": 0.5918315088662698,
"grad_norm": 34.819908142089844,
"learning_rate": 8.548101791246145e-07,
"loss": 0.5592,
"num_input_tokens_seen": 29112448,
"step": 9245
},
{
"epoch": 0.5921515908072467,
"grad_norm": 25.236101150512695,
"learning_rate": 8.537047415483028e-07,
"loss": 0.3436,
"num_input_tokens_seen": 29127808,
"step": 9250
},
{
"epoch": 0.5924716727482235,
"grad_norm": 14.861101150512695,
"learning_rate": 8.525994866233388e-07,
"loss": 0.2783,
"num_input_tokens_seen": 29142912,
"step": 9255
},
{
"epoch": 0.5927917546892004,
"grad_norm": 46.569793701171875,
"learning_rate": 8.514944157296464e-07,
"loss": 0.3963,
"num_input_tokens_seen": 29159168,
"step": 9260
},
{
"epoch": 0.5931118366301773,
"grad_norm": 38.154476165771484,
"learning_rate": 8.503895302469199e-07,
"loss": 0.3875,
"num_input_tokens_seen": 29175488,
"step": 9265
},
{
"epoch": 0.5934319185711542,
"grad_norm": 38.24485778808594,
"learning_rate": 8.492848315546214e-07,
"loss": 0.4151,
"num_input_tokens_seen": 29191104,
"step": 9270
},
{
"epoch": 0.5937520005121311,
"grad_norm": 18.280685424804688,
"learning_rate": 8.4818032103198e-07,
"loss": 0.4485,
"num_input_tokens_seen": 29206208,
"step": 9275
},
{
"epoch": 0.5940720824531079,
"grad_norm": 44.53709030151367,
"learning_rate": 8.470760000579906e-07,
"loss": 0.4186,
"num_input_tokens_seen": 29221312,
"step": 9280
},
{
"epoch": 0.5943921643940849,
"grad_norm": 47.62037658691406,
"learning_rate": 8.459718700114108e-07,
"loss": 0.5047,
"num_input_tokens_seen": 29236800,
"step": 9285
},
{
"epoch": 0.5947122463350618,
"grad_norm": 32.94901657104492,
"learning_rate": 8.448679322707595e-07,
"loss": 0.4508,
"num_input_tokens_seen": 29252480,
"step": 9290
},
{
"epoch": 0.5950323282760387,
"grad_norm": 41.171470642089844,
"learning_rate": 8.437641882143163e-07,
"loss": 0.6011,
"num_input_tokens_seen": 29266944,
"step": 9295
},
{
"epoch": 0.5953524102170156,
"grad_norm": 19.1787166595459,
"learning_rate": 8.426606392201185e-07,
"loss": 0.3106,
"num_input_tokens_seen": 29282816,
"step": 9300
},
{
"epoch": 0.5956724921579925,
"grad_norm": 25.21051597595215,
"learning_rate": 8.415572866659599e-07,
"loss": 0.3154,
"num_input_tokens_seen": 29297984,
"step": 9305
},
{
"epoch": 0.5959925740989693,
"grad_norm": 24.09900665283203,
"learning_rate": 8.404541319293896e-07,
"loss": 0.3652,
"num_input_tokens_seen": 29313664,
"step": 9310
},
{
"epoch": 0.5963126560399462,
"grad_norm": 21.775083541870117,
"learning_rate": 8.393511763877086e-07,
"loss": 0.593,
"num_input_tokens_seen": 29329472,
"step": 9315
},
{
"epoch": 0.5966327379809231,
"grad_norm": 33.893714904785156,
"learning_rate": 8.3824842141797e-07,
"loss": 0.4438,
"num_input_tokens_seen": 29346048,
"step": 9320
},
{
"epoch": 0.5969528199219,
"grad_norm": 30.351116180419922,
"learning_rate": 8.371458683969765e-07,
"loss": 0.3806,
"num_input_tokens_seen": 29361664,
"step": 9325
},
{
"epoch": 0.5972729018628768,
"grad_norm": 23.981342315673828,
"learning_rate": 8.360435187012787e-07,
"loss": 0.3848,
"num_input_tokens_seen": 29376896,
"step": 9330
},
{
"epoch": 0.5975929838038538,
"grad_norm": 38.46620559692383,
"learning_rate": 8.349413737071725e-07,
"loss": 0.3866,
"num_input_tokens_seen": 29392640,
"step": 9335
},
{
"epoch": 0.5979130657448307,
"grad_norm": 31.351964950561523,
"learning_rate": 8.338394347906994e-07,
"loss": 0.4486,
"num_input_tokens_seen": 29407808,
"step": 9340
},
{
"epoch": 0.5982331476858076,
"grad_norm": 39.49605178833008,
"learning_rate": 8.327377033276431e-07,
"loss": 0.3114,
"num_input_tokens_seen": 29422528,
"step": 9345
},
{
"epoch": 0.5985532296267845,
"grad_norm": 27.02570343017578,
"learning_rate": 8.316361806935279e-07,
"loss": 0.3484,
"num_input_tokens_seen": 29438272,
"step": 9350
},
{
"epoch": 0.5988733115677614,
"grad_norm": 31.83344078063965,
"learning_rate": 8.305348682636177e-07,
"loss": 0.4397,
"num_input_tokens_seen": 29453376,
"step": 9355
},
{
"epoch": 0.5991933935087382,
"grad_norm": 31.224191665649414,
"learning_rate": 8.294337674129144e-07,
"loss": 0.4149,
"num_input_tokens_seen": 29469248,
"step": 9360
},
{
"epoch": 0.5995134754497151,
"grad_norm": 36.2857780456543,
"learning_rate": 8.283328795161554e-07,
"loss": 0.2745,
"num_input_tokens_seen": 29485888,
"step": 9365
},
{
"epoch": 0.599833557390692,
"grad_norm": 29.26090431213379,
"learning_rate": 8.272322059478114e-07,
"loss": 0.3205,
"num_input_tokens_seen": 29500864,
"step": 9370
},
{
"epoch": 0.6001536393316689,
"grad_norm": 22.406578063964844,
"learning_rate": 8.261317480820871e-07,
"loss": 0.2427,
"num_input_tokens_seen": 29516288,
"step": 9375
},
{
"epoch": 0.6004737212726458,
"grad_norm": 34.06975555419922,
"learning_rate": 8.250315072929168e-07,
"loss": 0.4129,
"num_input_tokens_seen": 29530880,
"step": 9380
},
{
"epoch": 0.6007297868254273,
"eval_loss": 0.39462828636169434,
"eval_runtime": 50.6774,
"eval_samples_per_second": 274.008,
"eval_steps_per_second": 34.256,
"num_input_tokens_seen": 29544576,
"step": 9384
},
{
"epoch": 0.6007938032136226,
"grad_norm": 21.22164535522461,
"learning_rate": 8.239314849539637e-07,
"loss": 0.35,
"num_input_tokens_seen": 29547840,
"step": 9385
},
{
"epoch": 0.6011138851545996,
"grad_norm": 33.94794464111328,
"learning_rate": 8.228316824386193e-07,
"loss": 0.4234,
"num_input_tokens_seen": 29564096,
"step": 9390
},
{
"epoch": 0.6014339670955765,
"grad_norm": 38.5579948425293,
"learning_rate": 8.217321011199995e-07,
"loss": 0.378,
"num_input_tokens_seen": 29579520,
"step": 9395
},
{
"epoch": 0.6017540490365534,
"grad_norm": 48.82097625732422,
"learning_rate": 8.206327423709441e-07,
"loss": 0.433,
"num_input_tokens_seen": 29594048,
"step": 9400
},
{
"epoch": 0.6020741309775303,
"grad_norm": 24.50299644470215,
"learning_rate": 8.195336075640163e-07,
"loss": 0.3913,
"num_input_tokens_seen": 29610368,
"step": 9405
},
{
"epoch": 0.6023942129185071,
"grad_norm": 33.91872787475586,
"learning_rate": 8.184346980714984e-07,
"loss": 0.4248,
"num_input_tokens_seen": 29625792,
"step": 9410
},
{
"epoch": 0.602714294859484,
"grad_norm": 45.42316818237305,
"learning_rate": 8.173360152653914e-07,
"loss": 0.3563,
"num_input_tokens_seen": 29642240,
"step": 9415
},
{
"epoch": 0.6030343768004609,
"grad_norm": 29.160640716552734,
"learning_rate": 8.162375605174143e-07,
"loss": 0.3138,
"num_input_tokens_seen": 29658176,
"step": 9420
},
{
"epoch": 0.6033544587414378,
"grad_norm": 29.974868774414062,
"learning_rate": 8.151393351990005e-07,
"loss": 0.3068,
"num_input_tokens_seen": 29675392,
"step": 9425
},
{
"epoch": 0.6036745406824147,
"grad_norm": 30.860563278198242,
"learning_rate": 8.140413406812971e-07,
"loss": 0.4185,
"num_input_tokens_seen": 29690048,
"step": 9430
},
{
"epoch": 0.6039946226233915,
"grad_norm": 44.825531005859375,
"learning_rate": 8.129435783351635e-07,
"loss": 0.3111,
"num_input_tokens_seen": 29705088,
"step": 9435
},
{
"epoch": 0.6043147045643685,
"grad_norm": 30.896249771118164,
"learning_rate": 8.118460495311685e-07,
"loss": 0.4421,
"num_input_tokens_seen": 29720576,
"step": 9440
},
{
"epoch": 0.6046347865053454,
"grad_norm": 30.919109344482422,
"learning_rate": 8.107487556395901e-07,
"loss": 0.4352,
"num_input_tokens_seen": 29736896,
"step": 9445
},
{
"epoch": 0.6049548684463223,
"grad_norm": 31.737159729003906,
"learning_rate": 8.096516980304115e-07,
"loss": 0.3688,
"num_input_tokens_seen": 29752768,
"step": 9450
},
{
"epoch": 0.6052749503872992,
"grad_norm": 50.259193420410156,
"learning_rate": 8.085548780733238e-07,
"loss": 0.3448,
"num_input_tokens_seen": 29768640,
"step": 9455
},
{
"epoch": 0.605595032328276,
"grad_norm": 30.822101593017578,
"learning_rate": 8.074582971377182e-07,
"loss": 0.3368,
"num_input_tokens_seen": 29786240,
"step": 9460
},
{
"epoch": 0.6059151142692529,
"grad_norm": 40.47896194458008,
"learning_rate": 8.063619565926892e-07,
"loss": 0.4407,
"num_input_tokens_seen": 29802176,
"step": 9465
},
{
"epoch": 0.6062351962102298,
"grad_norm": 17.590438842773438,
"learning_rate": 8.052658578070313e-07,
"loss": 0.3992,
"num_input_tokens_seen": 29817600,
"step": 9470
},
{
"epoch": 0.6065552781512067,
"grad_norm": 14.345329284667969,
"learning_rate": 8.041700021492362e-07,
"loss": 0.3233,
"num_input_tokens_seen": 29832960,
"step": 9475
},
{
"epoch": 0.6068753600921836,
"grad_norm": 21.619794845581055,
"learning_rate": 8.030743909874924e-07,
"loss": 0.2929,
"num_input_tokens_seen": 29848448,
"step": 9480
},
{
"epoch": 0.6071954420331604,
"grad_norm": 19.34469985961914,
"learning_rate": 8.019790256896839e-07,
"loss": 0.3299,
"num_input_tokens_seen": 29863296,
"step": 9485
},
{
"epoch": 0.6075155239741373,
"grad_norm": 49.3680419921875,
"learning_rate": 8.008839076233871e-07,
"loss": 0.3934,
"num_input_tokens_seen": 29880128,
"step": 9490
},
{
"epoch": 0.6078356059151143,
"grad_norm": 24.034257888793945,
"learning_rate": 7.997890381558691e-07,
"loss": 0.3564,
"num_input_tokens_seen": 29895296,
"step": 9495
},
{
"epoch": 0.6081556878560912,
"grad_norm": 33.2259521484375,
"learning_rate": 7.986944186540878e-07,
"loss": 0.434,
"num_input_tokens_seen": 29911296,
"step": 9500
},
{
"epoch": 0.6084757697970681,
"grad_norm": 48.712547302246094,
"learning_rate": 7.976000504846885e-07,
"loss": 0.4603,
"num_input_tokens_seen": 29926912,
"step": 9505
},
{
"epoch": 0.608795851738045,
"grad_norm": 96.85162353515625,
"learning_rate": 7.965059350140024e-07,
"loss": 0.4725,
"num_input_tokens_seen": 29942272,
"step": 9510
},
{
"epoch": 0.6091159336790218,
"grad_norm": 39.1156120300293,
"learning_rate": 7.954120736080461e-07,
"loss": 0.4093,
"num_input_tokens_seen": 29958016,
"step": 9515
},
{
"epoch": 0.6094360156199987,
"grad_norm": 25.699668884277344,
"learning_rate": 7.943184676325178e-07,
"loss": 0.5561,
"num_input_tokens_seen": 29974720,
"step": 9520
},
{
"epoch": 0.6097560975609756,
"grad_norm": 27.526456832885742,
"learning_rate": 7.932251184527974e-07,
"loss": 0.4295,
"num_input_tokens_seen": 29991680,
"step": 9525
},
{
"epoch": 0.6100761795019525,
"grad_norm": 25.670839309692383,
"learning_rate": 7.921320274339446e-07,
"loss": 0.2678,
"num_input_tokens_seen": 30007168,
"step": 9530
},
{
"epoch": 0.6103962614429294,
"grad_norm": 40.132652282714844,
"learning_rate": 7.910391959406966e-07,
"loss": 0.34,
"num_input_tokens_seen": 30022656,
"step": 9535
},
{
"epoch": 0.6107163433839062,
"grad_norm": 35.257442474365234,
"learning_rate": 7.899466253374653e-07,
"loss": 0.3896,
"num_input_tokens_seen": 30038144,
"step": 9540
},
{
"epoch": 0.6110364253248832,
"grad_norm": 33.6865234375,
"learning_rate": 7.88854316988339e-07,
"loss": 0.3321,
"num_input_tokens_seen": 30055488,
"step": 9545
},
{
"epoch": 0.6113565072658601,
"grad_norm": 41.24062728881836,
"learning_rate": 7.877622722570771e-07,
"loss": 0.3085,
"num_input_tokens_seen": 30071040,
"step": 9550
},
{
"epoch": 0.611676589206837,
"grad_norm": 26.09576416015625,
"learning_rate": 7.866704925071101e-07,
"loss": 0.4224,
"num_input_tokens_seen": 30088000,
"step": 9555
},
{
"epoch": 0.6119966711478139,
"grad_norm": 25.814329147338867,
"learning_rate": 7.855789791015377e-07,
"loss": 0.4359,
"num_input_tokens_seen": 30103040,
"step": 9560
},
{
"epoch": 0.6123167530887907,
"grad_norm": 42.739044189453125,
"learning_rate": 7.844877334031277e-07,
"loss": 0.3887,
"num_input_tokens_seen": 30117760,
"step": 9565
},
{
"epoch": 0.6126368350297676,
"grad_norm": 33.584739685058594,
"learning_rate": 7.833967567743131e-07,
"loss": 0.4969,
"num_input_tokens_seen": 30133888,
"step": 9570
},
{
"epoch": 0.6129569169707445,
"grad_norm": 33.699302673339844,
"learning_rate": 7.823060505771903e-07,
"loss": 0.3596,
"num_input_tokens_seen": 30149312,
"step": 9575
},
{
"epoch": 0.6132769989117214,
"grad_norm": 46.7923698425293,
"learning_rate": 7.812156161735199e-07,
"loss": 0.4176,
"num_input_tokens_seen": 30163840,
"step": 9580
},
{
"epoch": 0.6135970808526983,
"grad_norm": 67.17172241210938,
"learning_rate": 7.801254549247215e-07,
"loss": 0.5474,
"num_input_tokens_seen": 30180544,
"step": 9585
},
{
"epoch": 0.6139171627936751,
"grad_norm": 19.53614044189453,
"learning_rate": 7.790355681918739e-07,
"loss": 0.338,
"num_input_tokens_seen": 30197120,
"step": 9590
},
{
"epoch": 0.614237244734652,
"grad_norm": 56.427513122558594,
"learning_rate": 7.779459573357144e-07,
"loss": 0.4222,
"num_input_tokens_seen": 30213376,
"step": 9595
},
{
"epoch": 0.614557326675629,
"grad_norm": 21.21477508544922,
"learning_rate": 7.768566237166338e-07,
"loss": 0.4138,
"num_input_tokens_seen": 30229120,
"step": 9600
},
{
"epoch": 0.6148774086166059,
"grad_norm": 44.509056091308594,
"learning_rate": 7.757675686946786e-07,
"loss": 0.5188,
"num_input_tokens_seen": 30244544,
"step": 9605
},
{
"epoch": 0.6151974905575828,
"grad_norm": 29.196828842163086,
"learning_rate": 7.746787936295468e-07,
"loss": 0.4258,
"num_input_tokens_seen": 30260864,
"step": 9610
},
{
"epoch": 0.6155175724985597,
"grad_norm": 43.83945846557617,
"learning_rate": 7.735902998805868e-07,
"loss": 0.3681,
"num_input_tokens_seen": 30275456,
"step": 9615
},
{
"epoch": 0.6158376544395365,
"grad_norm": 48.068565368652344,
"learning_rate": 7.725020888067955e-07,
"loss": 0.4284,
"num_input_tokens_seen": 30291008,
"step": 9620
},
{
"epoch": 0.6161577363805134,
"grad_norm": 19.066804885864258,
"learning_rate": 7.714141617668176e-07,
"loss": 0.4779,
"num_input_tokens_seen": 30306816,
"step": 9625
},
{
"epoch": 0.6164778183214903,
"grad_norm": 25.80859375,
"learning_rate": 7.703265201189426e-07,
"loss": 0.3342,
"num_input_tokens_seen": 30322240,
"step": 9630
},
{
"epoch": 0.6167979002624672,
"grad_norm": 18.206134796142578,
"learning_rate": 7.692391652211036e-07,
"loss": 0.3333,
"num_input_tokens_seen": 30338048,
"step": 9635
},
{
"epoch": 0.617117982203444,
"grad_norm": 39.0733757019043,
"learning_rate": 7.681520984308769e-07,
"loss": 0.3256,
"num_input_tokens_seen": 30353984,
"step": 9640
},
{
"epoch": 0.6174380641444209,
"grad_norm": 41.74904251098633,
"learning_rate": 7.670653211054772e-07,
"loss": 0.496,
"num_input_tokens_seen": 30370048,
"step": 9645
},
{
"epoch": 0.6177581460853978,
"grad_norm": 36.59706497192383,
"learning_rate": 7.659788346017591e-07,
"loss": 0.4137,
"num_input_tokens_seen": 30385344,
"step": 9650
},
{
"epoch": 0.6180782280263748,
"grad_norm": 45.44746017456055,
"learning_rate": 7.648926402762133e-07,
"loss": 0.3994,
"num_input_tokens_seen": 30400576,
"step": 9655
},
{
"epoch": 0.6183983099673517,
"grad_norm": 39.379695892333984,
"learning_rate": 7.638067394849671e-07,
"loss": 0.3861,
"num_input_tokens_seen": 30415424,
"step": 9660
},
{
"epoch": 0.6187183919083286,
"grad_norm": 45.68638229370117,
"learning_rate": 7.627211335837797e-07,
"loss": 0.3971,
"num_input_tokens_seen": 30430592,
"step": 9665
},
{
"epoch": 0.6190384738493054,
"grad_norm": 22.4736385345459,
"learning_rate": 7.616358239280427e-07,
"loss": 0.4285,
"num_input_tokens_seen": 30445952,
"step": 9670
},
{
"epoch": 0.6193585557902823,
"grad_norm": 30.363630294799805,
"learning_rate": 7.605508118727787e-07,
"loss": 0.3194,
"num_input_tokens_seen": 30461568,
"step": 9675
},
{
"epoch": 0.6196786377312592,
"grad_norm": 26.44789695739746,
"learning_rate": 7.594660987726373e-07,
"loss": 0.3642,
"num_input_tokens_seen": 30476672,
"step": 9680
},
{
"epoch": 0.6199987196722361,
"grad_norm": 48.39008712768555,
"learning_rate": 7.583816859818956e-07,
"loss": 0.3969,
"num_input_tokens_seen": 30492672,
"step": 9685
},
{
"epoch": 0.620318801613213,
"grad_norm": 23.859933853149414,
"learning_rate": 7.57297574854456e-07,
"loss": 0.3783,
"num_input_tokens_seen": 30507712,
"step": 9690
},
{
"epoch": 0.6206388835541898,
"grad_norm": 48.97274398803711,
"learning_rate": 7.56213766743844e-07,
"loss": 0.4477,
"num_input_tokens_seen": 30524032,
"step": 9695
},
{
"epoch": 0.6209589654951667,
"grad_norm": 15.872191429138184,
"learning_rate": 7.551302630032064e-07,
"loss": 0.3281,
"num_input_tokens_seen": 30539776,
"step": 9700
},
{
"epoch": 0.6212790474361437,
"grad_norm": 20.59368324279785,
"learning_rate": 7.540470649853106e-07,
"loss": 0.3758,
"num_input_tokens_seen": 30554752,
"step": 9705
},
{
"epoch": 0.6215991293771206,
"grad_norm": 24.169780731201172,
"learning_rate": 7.529641740425419e-07,
"loss": 0.3955,
"num_input_tokens_seen": 30571968,
"step": 9710
},
{
"epoch": 0.6219192113180975,
"grad_norm": 31.49615478515625,
"learning_rate": 7.518815915269023e-07,
"loss": 0.449,
"num_input_tokens_seen": 30587264,
"step": 9715
},
{
"epoch": 0.6222392932590743,
"grad_norm": 19.680313110351562,
"learning_rate": 7.507993187900092e-07,
"loss": 0.3823,
"num_input_tokens_seen": 30603200,
"step": 9720
},
{
"epoch": 0.6225593752000512,
"grad_norm": 26.631305694580078,
"learning_rate": 7.497173571830926e-07,
"loss": 0.4186,
"num_input_tokens_seen": 30617856,
"step": 9725
},
{
"epoch": 0.6228794571410281,
"grad_norm": 45.39612579345703,
"learning_rate": 7.486357080569938e-07,
"loss": 0.4631,
"num_input_tokens_seen": 30632448,
"step": 9730
},
{
"epoch": 0.623199539082005,
"grad_norm": 23.03763771057129,
"learning_rate": 7.47554372762165e-07,
"loss": 0.3768,
"num_input_tokens_seen": 30647680,
"step": 9735
},
{
"epoch": 0.6235196210229819,
"grad_norm": 55.08168029785156,
"learning_rate": 7.464733526486662e-07,
"loss": 0.4872,
"num_input_tokens_seen": 30663616,
"step": 9740
},
{
"epoch": 0.6238397029639587,
"grad_norm": 44.099178314208984,
"learning_rate": 7.453926490661628e-07,
"loss": 0.3515,
"num_input_tokens_seen": 30682496,
"step": 9745
},
{
"epoch": 0.6241597849049356,
"grad_norm": 45.976837158203125,
"learning_rate": 7.443122633639267e-07,
"loss": 0.3687,
"num_input_tokens_seen": 30697664,
"step": 9750
},
{
"epoch": 0.6244798668459125,
"grad_norm": 68.98104858398438,
"learning_rate": 7.432321968908319e-07,
"loss": 0.3856,
"num_input_tokens_seen": 30713408,
"step": 9755
},
{
"epoch": 0.6247999487868895,
"grad_norm": 24.77080535888672,
"learning_rate": 7.421524509953543e-07,
"loss": 0.3178,
"num_input_tokens_seen": 30730496,
"step": 9760
},
{
"epoch": 0.6251200307278664,
"grad_norm": 30.445371627807617,
"learning_rate": 7.410730270255687e-07,
"loss": 0.4143,
"num_input_tokens_seen": 30745664,
"step": 9765
},
{
"epoch": 0.6254401126688433,
"grad_norm": 35.7066764831543,
"learning_rate": 7.399939263291493e-07,
"loss": 0.3747,
"num_input_tokens_seen": 30760960,
"step": 9770
},
{
"epoch": 0.6257601946098201,
"grad_norm": 36.02008819580078,
"learning_rate": 7.389151502533657e-07,
"loss": 0.479,
"num_input_tokens_seen": 30775872,
"step": 9775
},
{
"epoch": 0.626080276550797,
"grad_norm": 19.047998428344727,
"learning_rate": 7.378367001450819e-07,
"loss": 0.3696,
"num_input_tokens_seen": 30791424,
"step": 9780
},
{
"epoch": 0.6264003584917739,
"grad_norm": 57.05532455444336,
"learning_rate": 7.367585773507567e-07,
"loss": 0.426,
"num_input_tokens_seen": 30807680,
"step": 9785
},
{
"epoch": 0.6267204404327508,
"grad_norm": 42.29533386230469,
"learning_rate": 7.356807832164385e-07,
"loss": 0.4515,
"num_input_tokens_seen": 30823680,
"step": 9790
},
{
"epoch": 0.6270405223737276,
"grad_norm": 19.884836196899414,
"learning_rate": 7.346033190877654e-07,
"loss": 0.4401,
"num_input_tokens_seen": 30839360,
"step": 9795
},
{
"epoch": 0.6273606043147045,
"grad_norm": 32.67311096191406,
"learning_rate": 7.335261863099651e-07,
"loss": 0.3541,
"num_input_tokens_seen": 30854784,
"step": 9800
},
{
"epoch": 0.6276806862556814,
"grad_norm": 33.140811920166016,
"learning_rate": 7.324493862278498e-07,
"loss": 0.4232,
"num_input_tokens_seen": 30870592,
"step": 9805
},
{
"epoch": 0.6280007681966584,
"grad_norm": 41.48550796508789,
"learning_rate": 7.313729201858167e-07,
"loss": 0.4636,
"num_input_tokens_seen": 30885952,
"step": 9810
},
{
"epoch": 0.6283208501376353,
"grad_norm": 23.54149627685547,
"learning_rate": 7.302967895278473e-07,
"loss": 0.3329,
"num_input_tokens_seen": 30902080,
"step": 9815
},
{
"epoch": 0.6286409320786122,
"grad_norm": 30.32986831665039,
"learning_rate": 7.292209955975028e-07,
"loss": 0.4042,
"num_input_tokens_seen": 30919232,
"step": 9820
},
{
"epoch": 0.628961014019589,
"grad_norm": 37.84484100341797,
"learning_rate": 7.281455397379244e-07,
"loss": 0.4078,
"num_input_tokens_seen": 30936448,
"step": 9825
},
{
"epoch": 0.6292810959605659,
"grad_norm": 40.27985763549805,
"learning_rate": 7.270704232918316e-07,
"loss": 0.3225,
"num_input_tokens_seen": 30952256,
"step": 9830
},
{
"epoch": 0.6296011779015428,
"grad_norm": 56.62047576904297,
"learning_rate": 7.2599564760152e-07,
"loss": 0.4216,
"num_input_tokens_seen": 30967360,
"step": 9835
},
{
"epoch": 0.6299212598425197,
"grad_norm": 28.868928909301758,
"learning_rate": 7.249212140088592e-07,
"loss": 0.3852,
"num_input_tokens_seen": 30982016,
"step": 9840
},
{
"epoch": 0.6302413417834966,
"grad_norm": 20.260948181152344,
"learning_rate": 7.23847123855293e-07,
"loss": 0.3347,
"num_input_tokens_seen": 30998080,
"step": 9845
},
{
"epoch": 0.6305614237244734,
"grad_norm": 20.72256088256836,
"learning_rate": 7.227733784818349e-07,
"loss": 0.2805,
"num_input_tokens_seen": 31013184,
"step": 9850
},
{
"epoch": 0.6308815056654503,
"grad_norm": 11.925490379333496,
"learning_rate": 7.216999792290683e-07,
"loss": 0.3804,
"num_input_tokens_seen": 31028800,
"step": 9855
},
{
"epoch": 0.6312015876064272,
"grad_norm": 35.1019287109375,
"learning_rate": 7.206269274371457e-07,
"loss": 0.49,
"num_input_tokens_seen": 31044736,
"step": 9860
},
{
"epoch": 0.6315216695474042,
"grad_norm": 17.91670036315918,
"learning_rate": 7.195542244457845e-07,
"loss": 0.3496,
"num_input_tokens_seen": 31059968,
"step": 9865
},
{
"epoch": 0.6318417514883811,
"grad_norm": 22.282245635986328,
"learning_rate": 7.184818715942666e-07,
"loss": 0.3266,
"num_input_tokens_seen": 31074880,
"step": 9870
},
{
"epoch": 0.6321618334293579,
"grad_norm": 35.720767974853516,
"learning_rate": 7.174098702214374e-07,
"loss": 0.355,
"num_input_tokens_seen": 31090432,
"step": 9875
},
{
"epoch": 0.6324819153703348,
"grad_norm": 28.854347229003906,
"learning_rate": 7.163382216657033e-07,
"loss": 0.37,
"num_input_tokens_seen": 31107264,
"step": 9880
},
{
"epoch": 0.6328019973113117,
"grad_norm": 52.169334411621094,
"learning_rate": 7.152669272650302e-07,
"loss": 0.3444,
"num_input_tokens_seen": 31124096,
"step": 9885
},
{
"epoch": 0.6331220792522886,
"grad_norm": 51.4202766418457,
"learning_rate": 7.141959883569411e-07,
"loss": 0.3869,
"num_input_tokens_seen": 31138752,
"step": 9890
},
{
"epoch": 0.6334421611932655,
"grad_norm": 29.026763916015625,
"learning_rate": 7.131254062785165e-07,
"loss": 0.4701,
"num_input_tokens_seen": 31154048,
"step": 9895
},
{
"epoch": 0.6337622431342423,
"grad_norm": 26.36556625366211,
"learning_rate": 7.120551823663907e-07,
"loss": 0.5118,
"num_input_tokens_seen": 31170304,
"step": 9900
},
{
"epoch": 0.6340823250752192,
"grad_norm": 15.353890419006348,
"learning_rate": 7.109853179567499e-07,
"loss": 0.2817,
"num_input_tokens_seen": 31186368,
"step": 9905
},
{
"epoch": 0.6344024070161961,
"grad_norm": 23.679868698120117,
"learning_rate": 7.099158143853337e-07,
"loss": 0.4235,
"num_input_tokens_seen": 31201664,
"step": 9910
},
{
"epoch": 0.634722488957173,
"grad_norm": 39.42850112915039,
"learning_rate": 7.088466729874289e-07,
"loss": 0.3891,
"num_input_tokens_seen": 31217216,
"step": 9915
},
{
"epoch": 0.63504257089815,
"grad_norm": 30.177127838134766,
"learning_rate": 7.077778950978713e-07,
"loss": 0.3784,
"num_input_tokens_seen": 31233728,
"step": 9920
},
{
"epoch": 0.6353626528391269,
"grad_norm": 19.685361862182617,
"learning_rate": 7.06709482051043e-07,
"loss": 0.4682,
"num_input_tokens_seen": 31249664,
"step": 9925
},
{
"epoch": 0.6356827347801037,
"grad_norm": 21.19015121459961,
"learning_rate": 7.056414351808698e-07,
"loss": 0.3033,
"num_input_tokens_seen": 31265408,
"step": 9930
},
{
"epoch": 0.6360028167210806,
"grad_norm": 26.959909439086914,
"learning_rate": 7.045737558208206e-07,
"loss": 0.3517,
"num_input_tokens_seen": 31281088,
"step": 9935
},
{
"epoch": 0.6363228986620575,
"grad_norm": 30.385330200195312,
"learning_rate": 7.035064453039064e-07,
"loss": 0.4014,
"num_input_tokens_seen": 31296512,
"step": 9940
},
{
"epoch": 0.6366429806030344,
"grad_norm": 14.387809753417969,
"learning_rate": 7.024395049626766e-07,
"loss": 0.3772,
"num_input_tokens_seen": 31312000,
"step": 9945
},
{
"epoch": 0.6369630625440112,
"grad_norm": 43.21665954589844,
"learning_rate": 7.013729361292182e-07,
"loss": 0.3408,
"num_input_tokens_seen": 31327488,
"step": 9950
},
{
"epoch": 0.6372831444849881,
"grad_norm": 40.43202209472656,
"learning_rate": 7.003067401351554e-07,
"loss": 0.3065,
"num_input_tokens_seen": 31343936,
"step": 9955
},
{
"epoch": 0.637603226425965,
"grad_norm": 74.08061218261719,
"learning_rate": 6.992409183116465e-07,
"loss": 0.406,
"num_input_tokens_seen": 31359232,
"step": 9960
},
{
"epoch": 0.6379233083669419,
"grad_norm": 19.582399368286133,
"learning_rate": 6.981754719893826e-07,
"loss": 0.3724,
"num_input_tokens_seen": 31375616,
"step": 9965
},
{
"epoch": 0.6382433903079189,
"grad_norm": 47.04770278930664,
"learning_rate": 6.971104024985852e-07,
"loss": 0.4679,
"num_input_tokens_seen": 31391680,
"step": 9970
},
{
"epoch": 0.6385634722488958,
"grad_norm": 29.400909423828125,
"learning_rate": 6.960457111690068e-07,
"loss": 0.3809,
"num_input_tokens_seen": 31407424,
"step": 9975
},
{
"epoch": 0.6388835541898726,
"grad_norm": 20.47035789489746,
"learning_rate": 6.94981399329927e-07,
"loss": 0.3787,
"num_input_tokens_seen": 31422912,
"step": 9980
},
{
"epoch": 0.6392036361308495,
"grad_norm": 73.91484832763672,
"learning_rate": 6.939174683101509e-07,
"loss": 0.3921,
"num_input_tokens_seen": 31438912,
"step": 9985
},
{
"epoch": 0.6395237180718264,
"grad_norm": 23.82988739013672,
"learning_rate": 6.9285391943801e-07,
"loss": 0.2898,
"num_input_tokens_seen": 31455168,
"step": 9990
},
{
"epoch": 0.6398438000128033,
"grad_norm": 32.45968246459961,
"learning_rate": 6.917907540413569e-07,
"loss": 0.3133,
"num_input_tokens_seen": 31470592,
"step": 9995
},
{
"epoch": 0.6401638819537802,
"grad_norm": 32.134952545166016,
"learning_rate": 6.907279734475659e-07,
"loss": 0.3477,
"num_input_tokens_seen": 31485632,
"step": 10000
},
{
"epoch": 0.640483963894757,
"grad_norm": 35.19672393798828,
"learning_rate": 6.896655789835317e-07,
"loss": 0.3725,
"num_input_tokens_seen": 31500352,
"step": 10005
},
{
"epoch": 0.6408040458357339,
"grad_norm": 39.76215744018555,
"learning_rate": 6.886035719756656e-07,
"loss": 0.3702,
"num_input_tokens_seen": 31516928,
"step": 10010
},
{
"epoch": 0.6411241277767108,
"grad_norm": 20.91424560546875,
"learning_rate": 6.875419537498959e-07,
"loss": 0.279,
"num_input_tokens_seen": 31532608,
"step": 10015
},
{
"epoch": 0.6414442097176877,
"grad_norm": 57.11235809326172,
"learning_rate": 6.864807256316658e-07,
"loss": 0.6005,
"num_input_tokens_seen": 31548608,
"step": 10020
},
{
"epoch": 0.6417642916586647,
"grad_norm": 28.377958297729492,
"learning_rate": 6.854198889459311e-07,
"loss": 0.4117,
"num_input_tokens_seen": 31564224,
"step": 10025
},
{
"epoch": 0.6420843735996415,
"grad_norm": 8.59209156036377,
"learning_rate": 6.84359445017158e-07,
"loss": 0.2567,
"num_input_tokens_seen": 31579200,
"step": 10030
},
{
"epoch": 0.6424044555406184,
"grad_norm": 46.38290023803711,
"learning_rate": 6.832993951693244e-07,
"loss": 0.4257,
"num_input_tokens_seen": 31594816,
"step": 10035
},
{
"epoch": 0.6427245374815953,
"grad_norm": 16.47113800048828,
"learning_rate": 6.822397407259144e-07,
"loss": 0.3547,
"num_input_tokens_seen": 31610432,
"step": 10040
},
{
"epoch": 0.6430446194225722,
"grad_norm": 40.31021499633789,
"learning_rate": 6.811804830099186e-07,
"loss": 0.3794,
"num_input_tokens_seen": 31627520,
"step": 10045
},
{
"epoch": 0.6433647013635491,
"grad_norm": 48.28114318847656,
"learning_rate": 6.801216233438336e-07,
"loss": 0.3557,
"num_input_tokens_seen": 31644352,
"step": 10050
},
{
"epoch": 0.6436847833045259,
"grad_norm": 32.33661651611328,
"learning_rate": 6.790631630496575e-07,
"loss": 0.3919,
"num_input_tokens_seen": 31660160,
"step": 10055
},
{
"epoch": 0.6440048652455028,
"grad_norm": 47.6024169921875,
"learning_rate": 6.780051034488903e-07,
"loss": 0.45,
"num_input_tokens_seen": 31676352,
"step": 10060
},
{
"epoch": 0.6443249471864797,
"grad_norm": 95.15774536132812,
"learning_rate": 6.769474458625323e-07,
"loss": 0.3409,
"num_input_tokens_seen": 31692160,
"step": 10065
},
{
"epoch": 0.6446450291274566,
"grad_norm": 19.220699310302734,
"learning_rate": 6.758901916110813e-07,
"loss": 0.316,
"num_input_tokens_seen": 31707712,
"step": 10070
},
{
"epoch": 0.6449651110684336,
"grad_norm": 16.066856384277344,
"learning_rate": 6.748333420145315e-07,
"loss": 0.3278,
"num_input_tokens_seen": 31723776,
"step": 10075
},
{
"epoch": 0.6452851930094105,
"grad_norm": 22.89158058166504,
"learning_rate": 6.737768983923718e-07,
"loss": 0.4116,
"num_input_tokens_seen": 31740672,
"step": 10080
},
{
"epoch": 0.6456052749503873,
"grad_norm": 35.07290267944336,
"learning_rate": 6.727208620635849e-07,
"loss": 0.2941,
"num_input_tokens_seen": 31755648,
"step": 10085
},
{
"epoch": 0.6459253568913642,
"grad_norm": 32.60226058959961,
"learning_rate": 6.716652343466446e-07,
"loss": 0.4488,
"num_input_tokens_seen": 31770624,
"step": 10090
},
{
"epoch": 0.6462454388323411,
"grad_norm": 39.100215911865234,
"learning_rate": 6.706100165595139e-07,
"loss": 0.3044,
"num_input_tokens_seen": 31786816,
"step": 10095
},
{
"epoch": 0.646565520773318,
"grad_norm": 34.54078674316406,
"learning_rate": 6.695552100196452e-07,
"loss": 0.3924,
"num_input_tokens_seen": 31801792,
"step": 10100
},
{
"epoch": 0.6468856027142948,
"grad_norm": 69.25830841064453,
"learning_rate": 6.685008160439769e-07,
"loss": 0.5025,
"num_input_tokens_seen": 31818944,
"step": 10105
},
{
"epoch": 0.6472056846552717,
"grad_norm": 33.3784294128418,
"learning_rate": 6.674468359489313e-07,
"loss": 0.406,
"num_input_tokens_seen": 31834176,
"step": 10110
},
{
"epoch": 0.6475257665962486,
"grad_norm": 29.71148109436035,
"learning_rate": 6.663932710504163e-07,
"loss": 0.3488,
"num_input_tokens_seen": 31850176,
"step": 10115
},
{
"epoch": 0.6478458485372255,
"grad_norm": 48.27974319458008,
"learning_rate": 6.653401226638192e-07,
"loss": 0.3845,
"num_input_tokens_seen": 31865600,
"step": 10120
},
{
"epoch": 0.6481659304782024,
"grad_norm": 23.962369918823242,
"learning_rate": 6.64287392104008e-07,
"loss": 0.3985,
"num_input_tokens_seen": 31880512,
"step": 10125
},
{
"epoch": 0.6484860124191794,
"grad_norm": 24.32285499572754,
"learning_rate": 6.632350806853299e-07,
"loss": 0.4502,
"num_input_tokens_seen": 31896512,
"step": 10130
},
{
"epoch": 0.6488060943601562,
"grad_norm": 44.430274963378906,
"learning_rate": 6.621831897216074e-07,
"loss": 0.4127,
"num_input_tokens_seen": 31912768,
"step": 10135
},
{
"epoch": 0.6491261763011331,
"grad_norm": 137.93301391601562,
"learning_rate": 6.611317205261387e-07,
"loss": 0.4332,
"num_input_tokens_seen": 31927488,
"step": 10140
},
{
"epoch": 0.64944625824211,
"grad_norm": 28.834609985351562,
"learning_rate": 6.60080674411696e-07,
"loss": 0.3464,
"num_input_tokens_seen": 31942784,
"step": 10145
},
{
"epoch": 0.6497663401830869,
"grad_norm": 15.092977523803711,
"learning_rate": 6.590300526905225e-07,
"loss": 0.3139,
"num_input_tokens_seen": 31958528,
"step": 10150
},
{
"epoch": 0.6500864221240638,
"grad_norm": 38.77704620361328,
"learning_rate": 6.579798566743313e-07,
"loss": 0.4675,
"num_input_tokens_seen": 31974016,
"step": 10155
},
{
"epoch": 0.6504065040650406,
"grad_norm": 41.677734375,
"learning_rate": 6.569300876743049e-07,
"loss": 0.3272,
"num_input_tokens_seen": 31990720,
"step": 10160
},
{
"epoch": 0.6507265860060175,
"grad_norm": 31.978822708129883,
"learning_rate": 6.558807470010923e-07,
"loss": 0.324,
"num_input_tokens_seen": 32007168,
"step": 10165
},
{
"epoch": 0.6507906023942129,
"eval_loss": 0.38159435987472534,
"eval_runtime": 50.6443,
"eval_samples_per_second": 274.187,
"eval_steps_per_second": 34.278,
"num_input_tokens_seen": 32010176,
"step": 10166
},
{
"epoch": 0.6510466679469944,
"grad_norm": 30.454833984375,
"learning_rate": 6.548318359648071e-07,
"loss": 0.355,
"num_input_tokens_seen": 32022208,
"step": 10170
},
{
"epoch": 0.6513667498879713,
"grad_norm": 41.25565719604492,
"learning_rate": 6.537833558750279e-07,
"loss": 0.4036,
"num_input_tokens_seen": 32037760,
"step": 10175
},
{
"epoch": 0.6516868318289483,
"grad_norm": 51.35231018066406,
"learning_rate": 6.527353080407938e-07,
"loss": 0.3108,
"num_input_tokens_seen": 32052800,
"step": 10180
},
{
"epoch": 0.6520069137699251,
"grad_norm": 28.18378448486328,
"learning_rate": 6.516876937706048e-07,
"loss": 0.3491,
"num_input_tokens_seen": 32068288,
"step": 10185
},
{
"epoch": 0.652326995710902,
"grad_norm": 26.677705764770508,
"learning_rate": 6.506405143724196e-07,
"loss": 0.3769,
"num_input_tokens_seen": 32083200,
"step": 10190
},
{
"epoch": 0.6526470776518789,
"grad_norm": 50.78616714477539,
"learning_rate": 6.495937711536546e-07,
"loss": 0.4685,
"num_input_tokens_seen": 32098432,
"step": 10195
},
{
"epoch": 0.6529671595928558,
"grad_norm": 38.68675994873047,
"learning_rate": 6.485474654211803e-07,
"loss": 0.4177,
"num_input_tokens_seen": 32114944,
"step": 10200
},
{
"epoch": 0.6532872415338327,
"grad_norm": 40.70989227294922,
"learning_rate": 6.475015984813217e-07,
"loss": 0.3062,
"num_input_tokens_seen": 32131520,
"step": 10205
},
{
"epoch": 0.6536073234748095,
"grad_norm": 13.664650917053223,
"learning_rate": 6.464561716398564e-07,
"loss": 0.321,
"num_input_tokens_seen": 32147008,
"step": 10210
},
{
"epoch": 0.6539274054157864,
"grad_norm": 33.89069366455078,
"learning_rate": 6.454111862020122e-07,
"loss": 0.3851,
"num_input_tokens_seen": 32162560,
"step": 10215
},
{
"epoch": 0.6542474873567633,
"grad_norm": 28.84914207458496,
"learning_rate": 6.443666434724649e-07,
"loss": 0.3665,
"num_input_tokens_seen": 32177024,
"step": 10220
},
{
"epoch": 0.6545675692977402,
"grad_norm": 25.591217041015625,
"learning_rate": 6.43322544755339e-07,
"loss": 0.542,
"num_input_tokens_seen": 32193024,
"step": 10225
},
{
"epoch": 0.6548876512387171,
"grad_norm": 30.0502986907959,
"learning_rate": 6.422788913542038e-07,
"loss": 0.3447,
"num_input_tokens_seen": 32208896,
"step": 10230
},
{
"epoch": 0.655207733179694,
"grad_norm": 16.857473373413086,
"learning_rate": 6.412356845720726e-07,
"loss": 0.338,
"num_input_tokens_seen": 32225280,
"step": 10235
},
{
"epoch": 0.6555278151206709,
"grad_norm": 17.287302017211914,
"learning_rate": 6.40192925711402e-07,
"loss": 0.3601,
"num_input_tokens_seen": 32240768,
"step": 10240
},
{
"epoch": 0.6558478970616478,
"grad_norm": 27.92411994934082,
"learning_rate": 6.39150616074088e-07,
"loss": 0.3259,
"num_input_tokens_seen": 32255872,
"step": 10245
},
{
"epoch": 0.6561679790026247,
"grad_norm": 30.83510971069336,
"learning_rate": 6.381087569614668e-07,
"loss": 0.4068,
"num_input_tokens_seen": 32272512,
"step": 10250
},
{
"epoch": 0.6564880609436016,
"grad_norm": 14.433576583862305,
"learning_rate": 6.370673496743116e-07,
"loss": 0.3801,
"num_input_tokens_seen": 32286272,
"step": 10255
},
{
"epoch": 0.6568081428845784,
"grad_norm": 24.54606056213379,
"learning_rate": 6.360263955128315e-07,
"loss": 0.4224,
"num_input_tokens_seen": 32301952,
"step": 10260
},
{
"epoch": 0.6571282248255553,
"grad_norm": 18.52509307861328,
"learning_rate": 6.349858957766701e-07,
"loss": 0.3657,
"num_input_tokens_seen": 32318208,
"step": 10265
},
{
"epoch": 0.6574483067665322,
"grad_norm": 23.090232849121094,
"learning_rate": 6.339458517649036e-07,
"loss": 0.3385,
"num_input_tokens_seen": 32333504,
"step": 10270
},
{
"epoch": 0.6577683887075091,
"grad_norm": 33.056419372558594,
"learning_rate": 6.329062647760395e-07,
"loss": 0.3685,
"num_input_tokens_seen": 32350208,
"step": 10275
},
{
"epoch": 0.658088470648486,
"grad_norm": 35.26163864135742,
"learning_rate": 6.318671361080137e-07,
"loss": 0.3259,
"num_input_tokens_seen": 32365376,
"step": 10280
},
{
"epoch": 0.6584085525894628,
"grad_norm": 18.646900177001953,
"learning_rate": 6.308284670581906e-07,
"loss": 0.3411,
"num_input_tokens_seen": 32381248,
"step": 10285
},
{
"epoch": 0.6587286345304398,
"grad_norm": 28.427839279174805,
"learning_rate": 6.297902589233612e-07,
"loss": 0.47,
"num_input_tokens_seen": 32395968,
"step": 10290
},
{
"epoch": 0.6590487164714167,
"grad_norm": 32.31058883666992,
"learning_rate": 6.287525129997404e-07,
"loss": 0.3728,
"num_input_tokens_seen": 32411456,
"step": 10295
},
{
"epoch": 0.6593687984123936,
"grad_norm": 25.885282516479492,
"learning_rate": 6.277152305829656e-07,
"loss": 0.4016,
"num_input_tokens_seen": 32426880,
"step": 10300
},
{
"epoch": 0.6596888803533705,
"grad_norm": 29.73259925842285,
"learning_rate": 6.266784129680968e-07,
"loss": 0.326,
"num_input_tokens_seen": 32442368,
"step": 10305
},
{
"epoch": 0.6600089622943474,
"grad_norm": 39.80248260498047,
"learning_rate": 6.256420614496129e-07,
"loss": 0.3979,
"num_input_tokens_seen": 32457920,
"step": 10310
},
{
"epoch": 0.6603290442353242,
"grad_norm": 36.66291809082031,
"learning_rate": 6.246061773214102e-07,
"loss": 0.4182,
"num_input_tokens_seen": 32473536,
"step": 10315
},
{
"epoch": 0.6606491261763011,
"grad_norm": 38.01105499267578,
"learning_rate": 6.235707618768032e-07,
"loss": 0.4073,
"num_input_tokens_seen": 32490240,
"step": 10320
},
{
"epoch": 0.660969208117278,
"grad_norm": 57.50590515136719,
"learning_rate": 6.225358164085196e-07,
"loss": 0.344,
"num_input_tokens_seen": 32505728,
"step": 10325
},
{
"epoch": 0.6612892900582549,
"grad_norm": 46.767845153808594,
"learning_rate": 6.21501342208701e-07,
"loss": 0.3463,
"num_input_tokens_seen": 32520960,
"step": 10330
},
{
"epoch": 0.6616093719992318,
"grad_norm": 22.598268508911133,
"learning_rate": 6.204673405689007e-07,
"loss": 0.3945,
"num_input_tokens_seen": 32535872,
"step": 10335
},
{
"epoch": 0.6619294539402087,
"grad_norm": 21.846588134765625,
"learning_rate": 6.194338127800823e-07,
"loss": 0.3129,
"num_input_tokens_seen": 32552448,
"step": 10340
},
{
"epoch": 0.6622495358811856,
"grad_norm": 34.799537658691406,
"learning_rate": 6.184007601326165e-07,
"loss": 0.3936,
"num_input_tokens_seen": 32567232,
"step": 10345
},
{
"epoch": 0.6625696178221625,
"grad_norm": 30.42659568786621,
"learning_rate": 6.173681839162824e-07,
"loss": 0.37,
"num_input_tokens_seen": 32583360,
"step": 10350
},
{
"epoch": 0.6628896997631394,
"grad_norm": 31.33951187133789,
"learning_rate": 6.163360854202635e-07,
"loss": 0.3328,
"num_input_tokens_seen": 32598656,
"step": 10355
},
{
"epoch": 0.6632097817041163,
"grad_norm": 19.95844841003418,
"learning_rate": 6.153044659331461e-07,
"loss": 0.3189,
"num_input_tokens_seen": 32614144,
"step": 10360
},
{
"epoch": 0.6635298636450931,
"grad_norm": 32.167152404785156,
"learning_rate": 6.142733267429203e-07,
"loss": 0.3708,
"num_input_tokens_seen": 32629120,
"step": 10365
},
{
"epoch": 0.66384994558607,
"grad_norm": 25.17389678955078,
"learning_rate": 6.132426691369748e-07,
"loss": 0.4218,
"num_input_tokens_seen": 32645952,
"step": 10370
},
{
"epoch": 0.6641700275270469,
"grad_norm": 13.657832145690918,
"learning_rate": 6.122124944020977e-07,
"loss": 0.3955,
"num_input_tokens_seen": 32661696,
"step": 10375
},
{
"epoch": 0.6644901094680238,
"grad_norm": 24.945608139038086,
"learning_rate": 6.111828038244749e-07,
"loss": 0.3779,
"num_input_tokens_seen": 32677760,
"step": 10380
},
{
"epoch": 0.6648101914090007,
"grad_norm": 15.53358268737793,
"learning_rate": 6.101535986896866e-07,
"loss": 0.3063,
"num_input_tokens_seen": 32693568,
"step": 10385
},
{
"epoch": 0.6651302733499775,
"grad_norm": 16.71603775024414,
"learning_rate": 6.091248802827076e-07,
"loss": 0.2929,
"num_input_tokens_seen": 32708736,
"step": 10390
},
{
"epoch": 0.6654503552909545,
"grad_norm": 23.295944213867188,
"learning_rate": 6.080966498879048e-07,
"loss": 0.3258,
"num_input_tokens_seen": 32725440,
"step": 10395
},
{
"epoch": 0.6657704372319314,
"grad_norm": 39.92107009887695,
"learning_rate": 6.070689087890363e-07,
"loss": 0.293,
"num_input_tokens_seen": 32740608,
"step": 10400
},
{
"epoch": 0.6660905191729083,
"grad_norm": 21.400508880615234,
"learning_rate": 6.060416582692487e-07,
"loss": 0.4026,
"num_input_tokens_seen": 32756416,
"step": 10405
},
{
"epoch": 0.6664106011138852,
"grad_norm": 31.731203079223633,
"learning_rate": 6.05014899611076e-07,
"loss": 0.3334,
"num_input_tokens_seen": 32771904,
"step": 10410
},
{
"epoch": 0.666730683054862,
"grad_norm": 53.159175872802734,
"learning_rate": 6.039886340964391e-07,
"loss": 0.3801,
"num_input_tokens_seen": 32787392,
"step": 10415
},
{
"epoch": 0.6670507649958389,
"grad_norm": 21.526613235473633,
"learning_rate": 6.029628630066423e-07,
"loss": 0.3367,
"num_input_tokens_seen": 32803136,
"step": 10420
},
{
"epoch": 0.6673708469368158,
"grad_norm": 33.03938293457031,
"learning_rate": 6.019375876223724e-07,
"loss": 0.4266,
"num_input_tokens_seen": 32818624,
"step": 10425
},
{
"epoch": 0.6676909288777927,
"grad_norm": 28.526151657104492,
"learning_rate": 6.009128092236982e-07,
"loss": 0.4689,
"num_input_tokens_seen": 32833920,
"step": 10430
},
{
"epoch": 0.6680110108187696,
"grad_norm": 19.832090377807617,
"learning_rate": 5.998885290900679e-07,
"loss": 0.3876,
"num_input_tokens_seen": 32848512,
"step": 10435
},
{
"epoch": 0.6683310927597464,
"grad_norm": 27.41183853149414,
"learning_rate": 5.988647485003061e-07,
"loss": 0.3414,
"num_input_tokens_seen": 32865088,
"step": 10440
},
{
"epoch": 0.6686511747007234,
"grad_norm": 63.17194366455078,
"learning_rate": 5.978414687326164e-07,
"loss": 0.4652,
"num_input_tokens_seen": 32882048,
"step": 10445
},
{
"epoch": 0.6689712566417003,
"grad_norm": 34.8876953125,
"learning_rate": 5.968186910645745e-07,
"loss": 0.3775,
"num_input_tokens_seen": 32898624,
"step": 10450
},
{
"epoch": 0.6692913385826772,
"grad_norm": 36.981510162353516,
"learning_rate": 5.957964167731305e-07,
"loss": 0.5049,
"num_input_tokens_seen": 32914176,
"step": 10455
},
{
"epoch": 0.6696114205236541,
"grad_norm": 41.19829559326172,
"learning_rate": 5.947746471346065e-07,
"loss": 0.4117,
"num_input_tokens_seen": 32931136,
"step": 10460
},
{
"epoch": 0.669931502464631,
"grad_norm": 48.431705474853516,
"learning_rate": 5.937533834246932e-07,
"loss": 0.3321,
"num_input_tokens_seen": 32947648,
"step": 10465
},
{
"epoch": 0.6702515844056078,
"grad_norm": 25.455169677734375,
"learning_rate": 5.927326269184504e-07,
"loss": 0.3795,
"num_input_tokens_seen": 32964224,
"step": 10470
},
{
"epoch": 0.6705716663465847,
"grad_norm": 49.933773040771484,
"learning_rate": 5.917123788903049e-07,
"loss": 0.4602,
"num_input_tokens_seen": 32982080,
"step": 10475
},
{
"epoch": 0.6708917482875616,
"grad_norm": 38.94703674316406,
"learning_rate": 5.906926406140484e-07,
"loss": 0.4674,
"num_input_tokens_seen": 32997440,
"step": 10480
},
{
"epoch": 0.6712118302285385,
"grad_norm": 37.308963775634766,
"learning_rate": 5.896734133628354e-07,
"loss": 0.424,
"num_input_tokens_seen": 33013056,
"step": 10485
},
{
"epoch": 0.6715319121695154,
"grad_norm": 24.161361694335938,
"learning_rate": 5.886546984091838e-07,
"loss": 0.3804,
"num_input_tokens_seen": 33028416,
"step": 10490
},
{
"epoch": 0.6718519941104922,
"grad_norm": 31.681415557861328,
"learning_rate": 5.876364970249711e-07,
"loss": 0.3567,
"num_input_tokens_seen": 33042880,
"step": 10495
},
{
"epoch": 0.6721720760514692,
"grad_norm": 34.7954216003418,
"learning_rate": 5.866188104814336e-07,
"loss": 0.2744,
"num_input_tokens_seen": 33058240,
"step": 10500
},
{
"epoch": 0.6724921579924461,
"grad_norm": 19.30687141418457,
"learning_rate": 5.856016400491646e-07,
"loss": 0.3833,
"num_input_tokens_seen": 33073920,
"step": 10505
},
{
"epoch": 0.672812239933423,
"grad_norm": 9.509024620056152,
"learning_rate": 5.845849869981136e-07,
"loss": 0.3158,
"num_input_tokens_seen": 33089344,
"step": 10510
},
{
"epoch": 0.6731323218743999,
"grad_norm": 23.290632247924805,
"learning_rate": 5.835688525975842e-07,
"loss": 0.3608,
"num_input_tokens_seen": 33104384,
"step": 10515
},
{
"epoch": 0.6734524038153767,
"grad_norm": 24.240638732910156,
"learning_rate": 5.825532381162311e-07,
"loss": 0.3926,
"num_input_tokens_seen": 33120064,
"step": 10520
},
{
"epoch": 0.6737724857563536,
"grad_norm": 24.758691787719727,
"learning_rate": 5.815381448220619e-07,
"loss": 0.3889,
"num_input_tokens_seen": 33136128,
"step": 10525
},
{
"epoch": 0.6740925676973305,
"grad_norm": 29.978185653686523,
"learning_rate": 5.805235739824327e-07,
"loss": 0.3599,
"num_input_tokens_seen": 33154816,
"step": 10530
},
{
"epoch": 0.6744126496383074,
"grad_norm": 44.5728759765625,
"learning_rate": 5.795095268640458e-07,
"loss": 0.5053,
"num_input_tokens_seen": 33169920,
"step": 10535
},
{
"epoch": 0.6747327315792843,
"grad_norm": 37.81836700439453,
"learning_rate": 5.784960047329519e-07,
"loss": 0.5436,
"num_input_tokens_seen": 33187712,
"step": 10540
},
{
"epoch": 0.6750528135202611,
"grad_norm": 14.268577575683594,
"learning_rate": 5.774830088545452e-07,
"loss": 0.3931,
"num_input_tokens_seen": 33202880,
"step": 10545
},
{
"epoch": 0.6753728954612381,
"grad_norm": 17.859638214111328,
"learning_rate": 5.76470540493563e-07,
"loss": 0.3059,
"num_input_tokens_seen": 33218944,
"step": 10550
},
{
"epoch": 0.675692977402215,
"grad_norm": 27.35489845275879,
"learning_rate": 5.754586009140836e-07,
"loss": 0.4468,
"num_input_tokens_seen": 33234688,
"step": 10555
},
{
"epoch": 0.6760130593431919,
"grad_norm": 52.45825958251953,
"learning_rate": 5.744471913795256e-07,
"loss": 0.3582,
"num_input_tokens_seen": 33249920,
"step": 10560
},
{
"epoch": 0.6763331412841688,
"grad_norm": 35.08700180053711,
"learning_rate": 5.734363131526459e-07,
"loss": 0.3455,
"num_input_tokens_seen": 33265792,
"step": 10565
},
{
"epoch": 0.6766532232251457,
"grad_norm": 40.150508880615234,
"learning_rate": 5.724259674955377e-07,
"loss": 0.3779,
"num_input_tokens_seen": 33280832,
"step": 10570
},
{
"epoch": 0.6769733051661225,
"grad_norm": 30.927886962890625,
"learning_rate": 5.714161556696291e-07,
"loss": 0.3829,
"num_input_tokens_seen": 33296576,
"step": 10575
},
{
"epoch": 0.6772933871070994,
"grad_norm": 44.968849182128906,
"learning_rate": 5.704068789356824e-07,
"loss": 0.3425,
"num_input_tokens_seen": 33316672,
"step": 10580
},
{
"epoch": 0.6776134690480763,
"grad_norm": 28.765954971313477,
"learning_rate": 5.693981385537912e-07,
"loss": 0.3569,
"num_input_tokens_seen": 33331456,
"step": 10585
},
{
"epoch": 0.6779335509890532,
"grad_norm": 26.096681594848633,
"learning_rate": 5.683899357833801e-07,
"loss": 0.3483,
"num_input_tokens_seen": 33346752,
"step": 10590
},
{
"epoch": 0.67825363293003,
"grad_norm": 31.515544891357422,
"learning_rate": 5.673822718832015e-07,
"loss": 0.4486,
"num_input_tokens_seen": 33362688,
"step": 10595
},
{
"epoch": 0.6785737148710069,
"grad_norm": 40.024139404296875,
"learning_rate": 5.663751481113362e-07,
"loss": 0.3732,
"num_input_tokens_seen": 33377600,
"step": 10600
},
{
"epoch": 0.6788937968119839,
"grad_norm": 26.293109893798828,
"learning_rate": 5.653685657251896e-07,
"loss": 0.4346,
"num_input_tokens_seen": 33393280,
"step": 10605
},
{
"epoch": 0.6792138787529608,
"grad_norm": 42.274269104003906,
"learning_rate": 5.643625259814922e-07,
"loss": 0.378,
"num_input_tokens_seen": 33410112,
"step": 10610
},
{
"epoch": 0.6795339606939377,
"grad_norm": 18.171001434326172,
"learning_rate": 5.633570301362953e-07,
"loss": 0.3557,
"num_input_tokens_seen": 33426624,
"step": 10615
},
{
"epoch": 0.6798540426349146,
"grad_norm": 36.791378021240234,
"learning_rate": 5.623520794449739e-07,
"loss": 0.3642,
"num_input_tokens_seen": 33442240,
"step": 10620
},
{
"epoch": 0.6801741245758914,
"grad_norm": 39.23925018310547,
"learning_rate": 5.613476751622195e-07,
"loss": 0.4764,
"num_input_tokens_seen": 33458432,
"step": 10625
},
{
"epoch": 0.6804942065168683,
"grad_norm": 27.09739112854004,
"learning_rate": 5.603438185420426e-07,
"loss": 0.4373,
"num_input_tokens_seen": 33473856,
"step": 10630
},
{
"epoch": 0.6808142884578452,
"grad_norm": 51.08125305175781,
"learning_rate": 5.593405108377714e-07,
"loss": 0.473,
"num_input_tokens_seen": 33489216,
"step": 10635
},
{
"epoch": 0.6811343703988221,
"grad_norm": 24.226774215698242,
"learning_rate": 5.583377533020457e-07,
"loss": 0.4676,
"num_input_tokens_seen": 33505280,
"step": 10640
},
{
"epoch": 0.681454452339799,
"grad_norm": 38.91953659057617,
"learning_rate": 5.573355471868201e-07,
"loss": 0.2929,
"num_input_tokens_seen": 33520512,
"step": 10645
},
{
"epoch": 0.6817745342807758,
"grad_norm": 25.037273406982422,
"learning_rate": 5.563338937433621e-07,
"loss": 0.3535,
"num_input_tokens_seen": 33537344,
"step": 10650
},
{
"epoch": 0.6820946162217527,
"grad_norm": 17.132577896118164,
"learning_rate": 5.553327942222472e-07,
"loss": 0.2518,
"num_input_tokens_seen": 33552128,
"step": 10655
},
{
"epoch": 0.6824146981627297,
"grad_norm": 27.395191192626953,
"learning_rate": 5.54332249873359e-07,
"loss": 0.3535,
"num_input_tokens_seen": 33566784,
"step": 10660
},
{
"epoch": 0.6827347801037066,
"grad_norm": 21.259674072265625,
"learning_rate": 5.533322619458896e-07,
"loss": 0.2955,
"num_input_tokens_seen": 33582080,
"step": 10665
},
{
"epoch": 0.6830548620446835,
"grad_norm": 45.052799224853516,
"learning_rate": 5.52332831688336e-07,
"loss": 0.4268,
"num_input_tokens_seen": 33596864,
"step": 10670
},
{
"epoch": 0.6833749439856603,
"grad_norm": 72.60736083984375,
"learning_rate": 5.513339603484981e-07,
"loss": 0.3527,
"num_input_tokens_seen": 33613056,
"step": 10675
},
{
"epoch": 0.6836950259266372,
"grad_norm": 67.02263641357422,
"learning_rate": 5.503356491734785e-07,
"loss": 0.4979,
"num_input_tokens_seen": 33628160,
"step": 10680
},
{
"epoch": 0.6840151078676141,
"grad_norm": 19.969636917114258,
"learning_rate": 5.493378994096806e-07,
"loss": 0.4457,
"num_input_tokens_seen": 33645184,
"step": 10685
},
{
"epoch": 0.684335189808591,
"grad_norm": 20.564537048339844,
"learning_rate": 5.483407123028067e-07,
"loss": 0.39,
"num_input_tokens_seen": 33660800,
"step": 10690
},
{
"epoch": 0.6846552717495679,
"grad_norm": 39.593467712402344,
"learning_rate": 5.473440890978566e-07,
"loss": 0.4734,
"num_input_tokens_seen": 33676736,
"step": 10695
},
{
"epoch": 0.6849753536905447,
"grad_norm": 25.922780990600586,
"learning_rate": 5.463480310391261e-07,
"loss": 0.4094,
"num_input_tokens_seen": 33692928,
"step": 10700
},
{
"epoch": 0.6852954356315216,
"grad_norm": 24.099258422851562,
"learning_rate": 5.453525393702052e-07,
"loss": 0.3824,
"num_input_tokens_seen": 33708352,
"step": 10705
},
{
"epoch": 0.6856155175724986,
"grad_norm": 37.71764373779297,
"learning_rate": 5.443576153339771e-07,
"loss": 0.3687,
"num_input_tokens_seen": 33723968,
"step": 10710
},
{
"epoch": 0.6859355995134755,
"grad_norm": 50.91608428955078,
"learning_rate": 5.433632601726159e-07,
"loss": 0.3238,
"num_input_tokens_seen": 33739200,
"step": 10715
},
{
"epoch": 0.6862556814544524,
"grad_norm": 34.224552154541016,
"learning_rate": 5.42369475127586e-07,
"loss": 0.3306,
"num_input_tokens_seen": 33754944,
"step": 10720
},
{
"epoch": 0.6865757633954293,
"grad_norm": 64.74101257324219,
"learning_rate": 5.413762614396396e-07,
"loss": 0.4715,
"num_input_tokens_seen": 33769472,
"step": 10725
},
{
"epoch": 0.6868958453364061,
"grad_norm": 37.43178939819336,
"learning_rate": 5.403836203488157e-07,
"loss": 0.4267,
"num_input_tokens_seen": 33784896,
"step": 10730
},
{
"epoch": 0.687215927277383,
"grad_norm": 20.12882423400879,
"learning_rate": 5.393915530944382e-07,
"loss": 0.3686,
"num_input_tokens_seen": 33800320,
"step": 10735
},
{
"epoch": 0.6875360092183599,
"grad_norm": 26.698909759521484,
"learning_rate": 5.384000609151145e-07,
"loss": 0.3743,
"num_input_tokens_seen": 33816896,
"step": 10740
},
{
"epoch": 0.6878560911593368,
"grad_norm": 22.242435455322266,
"learning_rate": 5.374091450487353e-07,
"loss": 0.3655,
"num_input_tokens_seen": 33833344,
"step": 10745
},
{
"epoch": 0.6881761731003136,
"grad_norm": 31.46050453186035,
"learning_rate": 5.364188067324693e-07,
"loss": 0.3346,
"num_input_tokens_seen": 33849856,
"step": 10750
},
{
"epoch": 0.6884962550412905,
"grad_norm": 16.067712783813477,
"learning_rate": 5.354290472027659e-07,
"loss": 0.3566,
"num_input_tokens_seen": 33865344,
"step": 10755
},
{
"epoch": 0.6888163369822674,
"grad_norm": 67.90633392333984,
"learning_rate": 5.344398676953525e-07,
"loss": 0.4921,
"num_input_tokens_seen": 33881792,
"step": 10760
},
{
"epoch": 0.6891364189232444,
"grad_norm": 33.24453353881836,
"learning_rate": 5.334512694452303e-07,
"loss": 0.4873,
"num_input_tokens_seen": 33898368,
"step": 10765
},
{
"epoch": 0.6894565008642213,
"grad_norm": 22.773826599121094,
"learning_rate": 5.324632536866755e-07,
"loss": 0.345,
"num_input_tokens_seen": 33914368,
"step": 10770
},
{
"epoch": 0.6897765828051982,
"grad_norm": 40.114479064941406,
"learning_rate": 5.314758216532386e-07,
"loss": 0.349,
"num_input_tokens_seen": 33929728,
"step": 10775
},
{
"epoch": 0.690096664746175,
"grad_norm": 21.76487159729004,
"learning_rate": 5.304889745777396e-07,
"loss": 0.3866,
"num_input_tokens_seen": 33944704,
"step": 10780
},
{
"epoch": 0.6904167466871519,
"grad_norm": 33.12062454223633,
"learning_rate": 5.295027136922678e-07,
"loss": 0.6472,
"num_input_tokens_seen": 33960128,
"step": 10785
},
{
"epoch": 0.6907368286281288,
"grad_norm": 23.689847946166992,
"learning_rate": 5.285170402281827e-07,
"loss": 0.4201,
"num_input_tokens_seen": 33975104,
"step": 10790
},
{
"epoch": 0.6910569105691057,
"grad_norm": 36.35055923461914,
"learning_rate": 5.275319554161087e-07,
"loss": 0.459,
"num_input_tokens_seen": 33990720,
"step": 10795
},
{
"epoch": 0.6913769925100826,
"grad_norm": 32.871768951416016,
"learning_rate": 5.265474604859356e-07,
"loss": 0.4207,
"num_input_tokens_seen": 34006272,
"step": 10800
},
{
"epoch": 0.6916970744510594,
"grad_norm": 27.291601181030273,
"learning_rate": 5.255635566668171e-07,
"loss": 0.3828,
"num_input_tokens_seen": 34022400,
"step": 10805
},
{
"epoch": 0.6920171563920363,
"grad_norm": 24.551733016967773,
"learning_rate": 5.245802451871686e-07,
"loss": 0.3811,
"num_input_tokens_seen": 34038720,
"step": 10810
},
{
"epoch": 0.6923372383330133,
"grad_norm": 24.757070541381836,
"learning_rate": 5.235975272746663e-07,
"loss": 0.4381,
"num_input_tokens_seen": 34053760,
"step": 10815
},
{
"epoch": 0.6926573202739902,
"grad_norm": 23.8084716796875,
"learning_rate": 5.226154041562442e-07,
"loss": 0.3033,
"num_input_tokens_seen": 34069568,
"step": 10820
},
{
"epoch": 0.6929774022149671,
"grad_norm": 26.603805541992188,
"learning_rate": 5.216338770580953e-07,
"loss": 0.4078,
"num_input_tokens_seen": 34086912,
"step": 10825
},
{
"epoch": 0.6932974841559439,
"grad_norm": 21.865713119506836,
"learning_rate": 5.206529472056678e-07,
"loss": 0.359,
"num_input_tokens_seen": 34101696,
"step": 10830
},
{
"epoch": 0.6936175660969208,
"grad_norm": 14.517900466918945,
"learning_rate": 5.196726158236637e-07,
"loss": 0.3084,
"num_input_tokens_seen": 34115904,
"step": 10835
},
{
"epoch": 0.6939376480378977,
"grad_norm": 25.503976821899414,
"learning_rate": 5.186928841360384e-07,
"loss": 0.3404,
"num_input_tokens_seen": 34131328,
"step": 10840
},
{
"epoch": 0.6942577299788746,
"grad_norm": 30.737321853637695,
"learning_rate": 5.177137533659985e-07,
"loss": 0.4466,
"num_input_tokens_seen": 34148544,
"step": 10845
},
{
"epoch": 0.6945778119198515,
"grad_norm": 22.222187042236328,
"learning_rate": 5.167352247360002e-07,
"loss": 0.4562,
"num_input_tokens_seen": 34163520,
"step": 10850
},
{
"epoch": 0.6948978938608283,
"grad_norm": 30.551633834838867,
"learning_rate": 5.157572994677479e-07,
"loss": 0.398,
"num_input_tokens_seen": 34178368,
"step": 10855
},
{
"epoch": 0.6952179758018052,
"grad_norm": 34.05165100097656,
"learning_rate": 5.147799787821929e-07,
"loss": 0.4086,
"num_input_tokens_seen": 34193920,
"step": 10860
},
{
"epoch": 0.6955380577427821,
"grad_norm": 39.741065979003906,
"learning_rate": 5.138032638995315e-07,
"loss": 0.4939,
"num_input_tokens_seen": 34210176,
"step": 10865
},
{
"epoch": 0.6958581396837591,
"grad_norm": 55.47737503051758,
"learning_rate": 5.128271560392037e-07,
"loss": 0.3602,
"num_input_tokens_seen": 34227328,
"step": 10870
},
{
"epoch": 0.696178221624736,
"grad_norm": 31.166147232055664,
"learning_rate": 5.118516564198916e-07,
"loss": 0.3959,
"num_input_tokens_seen": 34241984,
"step": 10875
},
{
"epoch": 0.6964983035657129,
"grad_norm": 22.45415496826172,
"learning_rate": 5.108767662595175e-07,
"loss": 0.3339,
"num_input_tokens_seen": 34256896,
"step": 10880
},
{
"epoch": 0.6968183855066897,
"grad_norm": 20.498132705688477,
"learning_rate": 5.099024867752446e-07,
"loss": 0.3904,
"num_input_tokens_seen": 34273792,
"step": 10885
},
{
"epoch": 0.6971384674476666,
"grad_norm": 37.48936462402344,
"learning_rate": 5.089288191834709e-07,
"loss": 0.3381,
"num_input_tokens_seen": 34290752,
"step": 10890
},
{
"epoch": 0.6974585493886435,
"grad_norm": 34.213294982910156,
"learning_rate": 5.079557646998318e-07,
"loss": 0.3422,
"num_input_tokens_seen": 34308416,
"step": 10895
},
{
"epoch": 0.6977786313296204,
"grad_norm": 14.548563003540039,
"learning_rate": 5.069833245391981e-07,
"loss": 0.3981,
"num_input_tokens_seen": 34323776,
"step": 10900
},
{
"epoch": 0.6980987132705972,
"grad_norm": 24.348325729370117,
"learning_rate": 5.060114999156728e-07,
"loss": 0.2941,
"num_input_tokens_seen": 34338944,
"step": 10905
},
{
"epoch": 0.6984187952115741,
"grad_norm": 34.29429626464844,
"learning_rate": 5.050402920425895e-07,
"loss": 0.3407,
"num_input_tokens_seen": 34354432,
"step": 10910
},
{
"epoch": 0.698738877152551,
"grad_norm": 18.496606826782227,
"learning_rate": 5.040697021325128e-07,
"loss": 0.2503,
"num_input_tokens_seen": 34370432,
"step": 10915
},
{
"epoch": 0.699058959093528,
"grad_norm": 33.679046630859375,
"learning_rate": 5.030997313972361e-07,
"loss": 0.4438,
"num_input_tokens_seen": 34386496,
"step": 10920
},
{
"epoch": 0.6993790410345049,
"grad_norm": 22.789405822753906,
"learning_rate": 5.021303810477795e-07,
"loss": 0.3692,
"num_input_tokens_seen": 34402560,
"step": 10925
},
{
"epoch": 0.6996991229754818,
"grad_norm": 17.632328033447266,
"learning_rate": 5.011616522943869e-07,
"loss": 0.2937,
"num_input_tokens_seen": 34418496,
"step": 10930
},
{
"epoch": 0.7000192049164586,
"grad_norm": 48.676658630371094,
"learning_rate": 5.001935463465289e-07,
"loss": 0.2772,
"num_input_tokens_seen": 34434752,
"step": 10935
},
{
"epoch": 0.7003392868574355,
"grad_norm": 25.778039932250977,
"learning_rate": 4.99226064412897e-07,
"loss": 0.3775,
"num_input_tokens_seen": 34450176,
"step": 10940
},
{
"epoch": 0.7006593687984124,
"grad_norm": 19.5657958984375,
"learning_rate": 4.982592077014026e-07,
"loss": 0.4286,
"num_input_tokens_seen": 34465600,
"step": 10945
},
{
"epoch": 0.7008514179629985,
"eval_loss": 0.3744131922721863,
"eval_runtime": 50.689,
"eval_samples_per_second": 273.945,
"eval_steps_per_second": 34.248,
"num_input_tokens_seen": 34475136,
"step": 10948
},
{
"epoch": 0.7009794507393893,
"grad_norm": 34.119754791259766,
"learning_rate": 4.97292977419179e-07,
"loss": 0.2973,
"num_input_tokens_seen": 34481600,
"step": 10950
},
{
"epoch": 0.7012995326803662,
"grad_norm": 21.208982467651367,
"learning_rate": 4.963273747725755e-07,
"loss": 0.2881,
"num_input_tokens_seen": 34498752,
"step": 10955
},
{
"epoch": 0.701619614621343,
"grad_norm": 26.19671630859375,
"learning_rate": 4.953624009671582e-07,
"loss": 0.413,
"num_input_tokens_seen": 34514240,
"step": 10960
},
{
"epoch": 0.7019396965623199,
"grad_norm": 44.88737487792969,
"learning_rate": 4.943980572077086e-07,
"loss": 0.4164,
"num_input_tokens_seen": 34528704,
"step": 10965
},
{
"epoch": 0.7022597785032968,
"grad_norm": 38.300411224365234,
"learning_rate": 4.934343446982209e-07,
"loss": 0.3207,
"num_input_tokens_seen": 34544704,
"step": 10970
},
{
"epoch": 0.7025798604442738,
"grad_norm": 13.832147598266602,
"learning_rate": 4.924712646419016e-07,
"loss": 0.3836,
"num_input_tokens_seen": 34560000,
"step": 10975
},
{
"epoch": 0.7028999423852507,
"grad_norm": 70.15164947509766,
"learning_rate": 4.915088182411674e-07,
"loss": 0.3222,
"num_input_tokens_seen": 34575296,
"step": 10980
},
{
"epoch": 0.7032200243262275,
"grad_norm": 35.194576263427734,
"learning_rate": 4.905470066976439e-07,
"loss": 0.3897,
"num_input_tokens_seen": 34590528,
"step": 10985
},
{
"epoch": 0.7035401062672044,
"grad_norm": 37.24507141113281,
"learning_rate": 4.895858312121644e-07,
"loss": 0.4156,
"num_input_tokens_seen": 34605312,
"step": 10990
},
{
"epoch": 0.7038601882081813,
"grad_norm": 25.404296875,
"learning_rate": 4.886252929847674e-07,
"loss": 0.4342,
"num_input_tokens_seen": 34620736,
"step": 10995
},
{
"epoch": 0.7041802701491582,
"grad_norm": 41.86030578613281,
"learning_rate": 4.876653932146963e-07,
"loss": 0.4627,
"num_input_tokens_seen": 34636736,
"step": 11000
},
{
"epoch": 0.7045003520901351,
"grad_norm": 31.331430435180664,
"learning_rate": 4.86706133100397e-07,
"loss": 0.3895,
"num_input_tokens_seen": 34651776,
"step": 11005
},
{
"epoch": 0.7048204340311119,
"grad_norm": 46.46551513671875,
"learning_rate": 4.857475138395178e-07,
"loss": 0.2889,
"num_input_tokens_seen": 34666176,
"step": 11010
},
{
"epoch": 0.7051405159720888,
"grad_norm": 15.934460639953613,
"learning_rate": 4.847895366289054e-07,
"loss": 0.2493,
"num_input_tokens_seen": 34682112,
"step": 11015
},
{
"epoch": 0.7054605979130657,
"grad_norm": 33.465232849121094,
"learning_rate": 4.838322026646057e-07,
"loss": 0.3825,
"num_input_tokens_seen": 34697024,
"step": 11020
},
{
"epoch": 0.7057806798540426,
"grad_norm": 22.186031341552734,
"learning_rate": 4.82875513141861e-07,
"loss": 0.371,
"num_input_tokens_seen": 34712704,
"step": 11025
},
{
"epoch": 0.7061007617950196,
"grad_norm": 28.25750160217285,
"learning_rate": 4.819194692551106e-07,
"loss": 0.375,
"num_input_tokens_seen": 34728256,
"step": 11030
},
{
"epoch": 0.7064208437359965,
"grad_norm": 17.980060577392578,
"learning_rate": 4.809640721979855e-07,
"loss": 0.435,
"num_input_tokens_seen": 34744512,
"step": 11035
},
{
"epoch": 0.7067409256769733,
"grad_norm": 46.3629264831543,
"learning_rate": 4.8000932316331e-07,
"loss": 0.4181,
"num_input_tokens_seen": 34758912,
"step": 11040
},
{
"epoch": 0.7070610076179502,
"grad_norm": 29.38511085510254,
"learning_rate": 4.790552233431002e-07,
"loss": 0.3914,
"num_input_tokens_seen": 34774848,
"step": 11045
},
{
"epoch": 0.7073810895589271,
"grad_norm": 34.6492805480957,
"learning_rate": 4.781017739285611e-07,
"loss": 0.416,
"num_input_tokens_seen": 34790016,
"step": 11050
},
{
"epoch": 0.707701171499904,
"grad_norm": 14.354450225830078,
"learning_rate": 4.771489761100842e-07,
"loss": 0.3528,
"num_input_tokens_seen": 34804992,
"step": 11055
},
{
"epoch": 0.7080212534408808,
"grad_norm": 39.80025863647461,
"learning_rate": 4.761968310772501e-07,
"loss": 0.2746,
"num_input_tokens_seen": 34820288,
"step": 11060
},
{
"epoch": 0.7083413353818577,
"grad_norm": 40.38865661621094,
"learning_rate": 4.7524534001882267e-07,
"loss": 0.2814,
"num_input_tokens_seen": 34836096,
"step": 11065
},
{
"epoch": 0.7086614173228346,
"grad_norm": 29.79903221130371,
"learning_rate": 4.7429450412274897e-07,
"loss": 0.3875,
"num_input_tokens_seen": 34851584,
"step": 11070
},
{
"epoch": 0.7089814992638115,
"grad_norm": 23.854610443115234,
"learning_rate": 4.733443245761596e-07,
"loss": 0.3542,
"num_input_tokens_seen": 34868032,
"step": 11075
},
{
"epoch": 0.7093015812047885,
"grad_norm": 26.83568572998047,
"learning_rate": 4.723948025653646e-07,
"loss": 0.3826,
"num_input_tokens_seen": 34884032,
"step": 11080
},
{
"epoch": 0.7096216631457654,
"grad_norm": 31.357860565185547,
"learning_rate": 4.714459392758534e-07,
"loss": 0.3252,
"num_input_tokens_seen": 34899456,
"step": 11085
},
{
"epoch": 0.7099417450867422,
"grad_norm": 50.6710205078125,
"learning_rate": 4.70497735892293e-07,
"loss": 0.3772,
"num_input_tokens_seen": 34915456,
"step": 11090
},
{
"epoch": 0.7102618270277191,
"grad_norm": 17.10684585571289,
"learning_rate": 4.695501935985263e-07,
"loss": 0.3408,
"num_input_tokens_seen": 34931328,
"step": 11095
},
{
"epoch": 0.710581908968696,
"grad_norm": 37.1428337097168,
"learning_rate": 4.686033135775711e-07,
"loss": 0.4064,
"num_input_tokens_seen": 34946816,
"step": 11100
},
{
"epoch": 0.7109019909096729,
"grad_norm": 25.48462677001953,
"learning_rate": 4.6765709701161817e-07,
"loss": 0.3274,
"num_input_tokens_seen": 34964544,
"step": 11105
},
{
"epoch": 0.7112220728506498,
"grad_norm": 88.31197357177734,
"learning_rate": 4.6671154508203003e-07,
"loss": 0.3861,
"num_input_tokens_seen": 34982208,
"step": 11110
},
{
"epoch": 0.7115421547916266,
"grad_norm": 39.50213623046875,
"learning_rate": 4.657666589693393e-07,
"loss": 0.3523,
"num_input_tokens_seen": 35000576,
"step": 11115
},
{
"epoch": 0.7118622367326035,
"grad_norm": 26.08376693725586,
"learning_rate": 4.6482243985324753e-07,
"loss": 0.3167,
"num_input_tokens_seen": 35014912,
"step": 11120
},
{
"epoch": 0.7121823186735804,
"grad_norm": 29.463659286499023,
"learning_rate": 4.638788889126232e-07,
"loss": 0.2867,
"num_input_tokens_seen": 35029632,
"step": 11125
},
{
"epoch": 0.7125024006145573,
"grad_norm": 28.548364639282227,
"learning_rate": 4.6293600732550085e-07,
"loss": 0.3423,
"num_input_tokens_seen": 35044992,
"step": 11130
},
{
"epoch": 0.7128224825555343,
"grad_norm": 18.932619094848633,
"learning_rate": 4.619937962690792e-07,
"loss": 0.4721,
"num_input_tokens_seen": 35060544,
"step": 11135
},
{
"epoch": 0.7131425644965111,
"grad_norm": 56.20754623413086,
"learning_rate": 4.610522569197197e-07,
"loss": 0.5205,
"num_input_tokens_seen": 35075648,
"step": 11140
},
{
"epoch": 0.713462646437488,
"grad_norm": 21.114213943481445,
"learning_rate": 4.6011139045294554e-07,
"loss": 0.3271,
"num_input_tokens_seen": 35090880,
"step": 11145
},
{
"epoch": 0.7137827283784649,
"grad_norm": 89.78885650634766,
"learning_rate": 4.59171198043439e-07,
"loss": 0.3935,
"num_input_tokens_seen": 35106432,
"step": 11150
},
{
"epoch": 0.7141028103194418,
"grad_norm": 29.25298309326172,
"learning_rate": 4.582316808650424e-07,
"loss": 0.4446,
"num_input_tokens_seen": 35121664,
"step": 11155
},
{
"epoch": 0.7144228922604187,
"grad_norm": 42.61500549316406,
"learning_rate": 4.572928400907529e-07,
"loss": 0.4704,
"num_input_tokens_seen": 35137152,
"step": 11160
},
{
"epoch": 0.7147429742013955,
"grad_norm": 41.548580169677734,
"learning_rate": 4.5635467689272434e-07,
"loss": 0.3787,
"num_input_tokens_seen": 35153088,
"step": 11165
},
{
"epoch": 0.7150630561423724,
"grad_norm": 23.75127410888672,
"learning_rate": 4.554171924422655e-07,
"loss": 0.3674,
"num_input_tokens_seen": 35168192,
"step": 11170
},
{
"epoch": 0.7153831380833493,
"grad_norm": 23.422161102294922,
"learning_rate": 4.544803879098356e-07,
"loss": 0.3288,
"num_input_tokens_seen": 35184192,
"step": 11175
},
{
"epoch": 0.7157032200243262,
"grad_norm": 24.220752716064453,
"learning_rate": 4.535442644650462e-07,
"loss": 0.3703,
"num_input_tokens_seen": 35200256,
"step": 11180
},
{
"epoch": 0.7160233019653032,
"grad_norm": 23.171953201293945,
"learning_rate": 4.5260882327665906e-07,
"loss": 0.4906,
"num_input_tokens_seen": 35214720,
"step": 11185
},
{
"epoch": 0.71634338390628,
"grad_norm": 38.95206069946289,
"learning_rate": 4.5167406551258347e-07,
"loss": 0.5148,
"num_input_tokens_seen": 35230720,
"step": 11190
},
{
"epoch": 0.7166634658472569,
"grad_norm": 30.46370506286621,
"learning_rate": 4.5073999233987445e-07,
"loss": 0.3863,
"num_input_tokens_seen": 35246400,
"step": 11195
},
{
"epoch": 0.7169835477882338,
"grad_norm": 31.349842071533203,
"learning_rate": 4.4980660492473434e-07,
"loss": 0.47,
"num_input_tokens_seen": 35262784,
"step": 11200
},
{
"epoch": 0.7173036297292107,
"grad_norm": 15.219905853271484,
"learning_rate": 4.4887390443250804e-07,
"loss": 0.2775,
"num_input_tokens_seen": 35277632,
"step": 11205
},
{
"epoch": 0.7176237116701876,
"grad_norm": 18.419071197509766,
"learning_rate": 4.4794189202768295e-07,
"loss": 0.2913,
"num_input_tokens_seen": 35292544,
"step": 11210
},
{
"epoch": 0.7179437936111644,
"grad_norm": 33.719818115234375,
"learning_rate": 4.4701056887388757e-07,
"loss": 0.368,
"num_input_tokens_seen": 35308352,
"step": 11215
},
{
"epoch": 0.7182638755521413,
"grad_norm": 32.227081298828125,
"learning_rate": 4.460799361338897e-07,
"loss": 0.3343,
"num_input_tokens_seen": 35323904,
"step": 11220
},
{
"epoch": 0.7185839574931182,
"grad_norm": 19.966176986694336,
"learning_rate": 4.451499949695954e-07,
"loss": 0.4156,
"num_input_tokens_seen": 35340224,
"step": 11225
},
{
"epoch": 0.7189040394340951,
"grad_norm": 17.299413681030273,
"learning_rate": 4.44220746542047e-07,
"loss": 0.375,
"num_input_tokens_seen": 35355776,
"step": 11230
},
{
"epoch": 0.719224121375072,
"grad_norm": 28.009838104248047,
"learning_rate": 4.432921920114221e-07,
"loss": 0.4772,
"num_input_tokens_seen": 35371072,
"step": 11235
},
{
"epoch": 0.719544203316049,
"grad_norm": 36.488346099853516,
"learning_rate": 4.4236433253703185e-07,
"loss": 0.3169,
"num_input_tokens_seen": 35387520,
"step": 11240
},
{
"epoch": 0.7198642852570258,
"grad_norm": 38.739227294921875,
"learning_rate": 4.4143716927732e-07,
"loss": 0.3928,
"num_input_tokens_seen": 35403840,
"step": 11245
},
{
"epoch": 0.7201843671980027,
"grad_norm": 30.09905433654785,
"learning_rate": 4.405107033898604e-07,
"loss": 0.3873,
"num_input_tokens_seen": 35420032,
"step": 11250
},
{
"epoch": 0.7205044491389796,
"grad_norm": 35.15446472167969,
"learning_rate": 4.395849360313568e-07,
"loss": 0.2845,
"num_input_tokens_seen": 35436032,
"step": 11255
},
{
"epoch": 0.7208245310799565,
"grad_norm": 37.864742279052734,
"learning_rate": 4.386598683576406e-07,
"loss": 0.3583,
"num_input_tokens_seen": 35451136,
"step": 11260
},
{
"epoch": 0.7211446130209334,
"grad_norm": 17.579322814941406,
"learning_rate": 4.377355015236696e-07,
"loss": 0.4711,
"num_input_tokens_seen": 35466816,
"step": 11265
},
{
"epoch": 0.7214646949619102,
"grad_norm": 34.376991271972656,
"learning_rate": 4.368118366835266e-07,
"loss": 0.3555,
"num_input_tokens_seen": 35483456,
"step": 11270
},
{
"epoch": 0.7217847769028871,
"grad_norm": 37.30057144165039,
"learning_rate": 4.358888749904177e-07,
"loss": 0.4612,
"num_input_tokens_seen": 35499584,
"step": 11275
},
{
"epoch": 0.722104858843864,
"grad_norm": 25.58052635192871,
"learning_rate": 4.349666175966725e-07,
"loss": 0.3546,
"num_input_tokens_seen": 35515328,
"step": 11280
},
{
"epoch": 0.7224249407848409,
"grad_norm": 18.5773983001709,
"learning_rate": 4.340450656537392e-07,
"loss": 0.4744,
"num_input_tokens_seen": 35530048,
"step": 11285
},
{
"epoch": 0.7227450227258178,
"grad_norm": 32.79454040527344,
"learning_rate": 4.331242203121861e-07,
"loss": 0.2965,
"num_input_tokens_seen": 35545792,
"step": 11290
},
{
"epoch": 0.7230651046667947,
"grad_norm": 44.36042785644531,
"learning_rate": 4.322040827217004e-07,
"loss": 0.3871,
"num_input_tokens_seen": 35561344,
"step": 11295
},
{
"epoch": 0.7233851866077716,
"grad_norm": 43.496337890625,
"learning_rate": 4.312846540310838e-07,
"loss": 0.405,
"num_input_tokens_seen": 35577024,
"step": 11300
},
{
"epoch": 0.7237052685487485,
"grad_norm": 28.110532760620117,
"learning_rate": 4.3036593538825373e-07,
"loss": 0.3728,
"num_input_tokens_seen": 35592192,
"step": 11305
},
{
"epoch": 0.7240253504897254,
"grad_norm": 15.80125904083252,
"learning_rate": 4.2944792794024196e-07,
"loss": 0.3287,
"num_input_tokens_seen": 35607872,
"step": 11310
},
{
"epoch": 0.7243454324307023,
"grad_norm": 23.26807975769043,
"learning_rate": 4.285306328331915e-07,
"loss": 0.3117,
"num_input_tokens_seen": 35623872,
"step": 11315
},
{
"epoch": 0.7246655143716791,
"grad_norm": 27.19857406616211,
"learning_rate": 4.2761405121235506e-07,
"loss": 0.3067,
"num_input_tokens_seen": 35638720,
"step": 11320
},
{
"epoch": 0.724985596312656,
"grad_norm": 23.31117057800293,
"learning_rate": 4.266981842220965e-07,
"loss": 0.5403,
"num_input_tokens_seen": 35655680,
"step": 11325
},
{
"epoch": 0.7253056782536329,
"grad_norm": 27.773475646972656,
"learning_rate": 4.257830330058864e-07,
"loss": 0.2708,
"num_input_tokens_seen": 35671168,
"step": 11330
},
{
"epoch": 0.7256257601946098,
"grad_norm": 32.48497772216797,
"learning_rate": 4.248685987063019e-07,
"loss": 0.4088,
"num_input_tokens_seen": 35686848,
"step": 11335
},
{
"epoch": 0.7259458421355867,
"grad_norm": 25.382577896118164,
"learning_rate": 4.2395488246502396e-07,
"loss": 0.3478,
"num_input_tokens_seen": 35702720,
"step": 11340
},
{
"epoch": 0.7262659240765637,
"grad_norm": 34.661277770996094,
"learning_rate": 4.2304188542283913e-07,
"loss": 0.4566,
"num_input_tokens_seen": 35720640,
"step": 11345
},
{
"epoch": 0.7265860060175405,
"grad_norm": 61.42772674560547,
"learning_rate": 4.221296087196347e-07,
"loss": 0.3923,
"num_input_tokens_seen": 35735424,
"step": 11350
},
{
"epoch": 0.7269060879585174,
"grad_norm": 23.816184997558594,
"learning_rate": 4.2121805349439867e-07,
"loss": 0.4596,
"num_input_tokens_seen": 35751168,
"step": 11355
},
{
"epoch": 0.7272261698994943,
"grad_norm": 32.27251052856445,
"learning_rate": 4.203072208852184e-07,
"loss": 0.3787,
"num_input_tokens_seen": 35767168,
"step": 11360
},
{
"epoch": 0.7275462518404712,
"grad_norm": 42.87307357788086,
"learning_rate": 4.193971120292793e-07,
"loss": 0.439,
"num_input_tokens_seen": 35782464,
"step": 11365
},
{
"epoch": 0.727866333781448,
"grad_norm": 20.0147705078125,
"learning_rate": 4.184877280628629e-07,
"loss": 0.406,
"num_input_tokens_seen": 35798592,
"step": 11370
},
{
"epoch": 0.7281864157224249,
"grad_norm": 36.623085021972656,
"learning_rate": 4.1757907012134565e-07,
"loss": 0.396,
"num_input_tokens_seen": 35814720,
"step": 11375
},
{
"epoch": 0.7285064976634018,
"grad_norm": 34.45808792114258,
"learning_rate": 4.166711393391978e-07,
"loss": 0.2826,
"num_input_tokens_seen": 35830016,
"step": 11380
},
{
"epoch": 0.7288265796043787,
"grad_norm": 18.86490821838379,
"learning_rate": 4.1576393684998146e-07,
"loss": 0.345,
"num_input_tokens_seen": 35845632,
"step": 11385
},
{
"epoch": 0.7291466615453556,
"grad_norm": 23.06288719177246,
"learning_rate": 4.1485746378634966e-07,
"loss": 0.3556,
"num_input_tokens_seen": 35861184,
"step": 11390
},
{
"epoch": 0.7294667434863324,
"grad_norm": 37.96226501464844,
"learning_rate": 4.1395172128004473e-07,
"loss": 0.4311,
"num_input_tokens_seen": 35876864,
"step": 11395
},
{
"epoch": 0.7297868254273094,
"grad_norm": 25.186145782470703,
"learning_rate": 4.130467104618963e-07,
"loss": 0.3318,
"num_input_tokens_seen": 35893568,
"step": 11400
},
{
"epoch": 0.7301069073682863,
"grad_norm": 30.771581649780273,
"learning_rate": 4.1214243246182223e-07,
"loss": 0.3364,
"num_input_tokens_seen": 35909696,
"step": 11405
},
{
"epoch": 0.7304269893092632,
"grad_norm": 35.04588317871094,
"learning_rate": 4.1123888840882306e-07,
"loss": 0.5046,
"num_input_tokens_seen": 35925120,
"step": 11410
},
{
"epoch": 0.7307470712502401,
"grad_norm": 28.61571502685547,
"learning_rate": 4.1033607943098415e-07,
"loss": 0.3223,
"num_input_tokens_seen": 35940800,
"step": 11415
},
{
"epoch": 0.731067153191217,
"grad_norm": 12.746711730957031,
"learning_rate": 4.0943400665547423e-07,
"loss": 0.3444,
"num_input_tokens_seen": 35955968,
"step": 11420
},
{
"epoch": 0.7313872351321938,
"grad_norm": 43.476295471191406,
"learning_rate": 4.0853267120854064e-07,
"loss": 0.3449,
"num_input_tokens_seen": 35972096,
"step": 11425
},
{
"epoch": 0.7317073170731707,
"grad_norm": 27.513898849487305,
"learning_rate": 4.076320742155117e-07,
"loss": 0.3315,
"num_input_tokens_seen": 35986624,
"step": 11430
},
{
"epoch": 0.7320273990141476,
"grad_norm": 13.824392318725586,
"learning_rate": 4.067322168007928e-07,
"loss": 0.3493,
"num_input_tokens_seen": 36003008,
"step": 11435
},
{
"epoch": 0.7323474809551245,
"grad_norm": 27.101316452026367,
"learning_rate": 4.0583310008786775e-07,
"loss": 0.3597,
"num_input_tokens_seen": 36017152,
"step": 11440
},
{
"epoch": 0.7326675628961014,
"grad_norm": 38.84494400024414,
"learning_rate": 4.049347251992932e-07,
"loss": 0.271,
"num_input_tokens_seen": 36031936,
"step": 11445
},
{
"epoch": 0.7329876448370783,
"grad_norm": 29.943532943725586,
"learning_rate": 4.0403709325670064e-07,
"loss": 0.353,
"num_input_tokens_seen": 36048064,
"step": 11450
},
{
"epoch": 0.7333077267780552,
"grad_norm": 56.9892692565918,
"learning_rate": 4.03140205380795e-07,
"loss": 0.4653,
"num_input_tokens_seen": 36064256,
"step": 11455
},
{
"epoch": 0.7336278087190321,
"grad_norm": 67.21539306640625,
"learning_rate": 4.0224406269135115e-07,
"loss": 0.6715,
"num_input_tokens_seen": 36079424,
"step": 11460
},
{
"epoch": 0.733947890660009,
"grad_norm": 48.787593841552734,
"learning_rate": 4.0134866630721266e-07,
"loss": 0.3111,
"num_input_tokens_seen": 36095424,
"step": 11465
},
{
"epoch": 0.7342679726009859,
"grad_norm": 21.646329879760742,
"learning_rate": 4.0045401734629367e-07,
"loss": 0.3618,
"num_input_tokens_seen": 36111360,
"step": 11470
},
{
"epoch": 0.7345880545419627,
"grad_norm": 26.18387794494629,
"learning_rate": 3.9956011692557377e-07,
"loss": 0.3825,
"num_input_tokens_seen": 36127232,
"step": 11475
},
{
"epoch": 0.7349081364829396,
"grad_norm": 51.78586196899414,
"learning_rate": 3.986669661610972e-07,
"loss": 0.3532,
"num_input_tokens_seen": 36143168,
"step": 11480
},
{
"epoch": 0.7352282184239165,
"grad_norm": 33.300621032714844,
"learning_rate": 3.9777456616797414e-07,
"loss": 0.3323,
"num_input_tokens_seen": 36158272,
"step": 11485
},
{
"epoch": 0.7355483003648934,
"grad_norm": 51.799171447753906,
"learning_rate": 3.968829180603761e-07,
"loss": 0.3731,
"num_input_tokens_seen": 36173056,
"step": 11490
},
{
"epoch": 0.7358683823058703,
"grad_norm": 45.12591552734375,
"learning_rate": 3.9599202295153624e-07,
"loss": 0.3927,
"num_input_tokens_seen": 36187904,
"step": 11495
},
{
"epoch": 0.7361884642468471,
"grad_norm": 74.60730743408203,
"learning_rate": 3.951018819537476e-07,
"loss": 0.3596,
"num_input_tokens_seen": 36205632,
"step": 11500
},
{
"epoch": 0.7365085461878241,
"grad_norm": 36.89905548095703,
"learning_rate": 3.942124961783616e-07,
"loss": 0.3478,
"num_input_tokens_seen": 36220160,
"step": 11505
},
{
"epoch": 0.736828628128801,
"grad_norm": 28.41156005859375,
"learning_rate": 3.933238667357869e-07,
"loss": 0.3164,
"num_input_tokens_seen": 36236416,
"step": 11510
},
{
"epoch": 0.7371487100697779,
"grad_norm": 41.953426361083984,
"learning_rate": 3.924359947354876e-07,
"loss": 0.3449,
"num_input_tokens_seen": 36251584,
"step": 11515
},
{
"epoch": 0.7374687920107548,
"grad_norm": 15.202105522155762,
"learning_rate": 3.915488812859826e-07,
"loss": 0.3289,
"num_input_tokens_seen": 36265856,
"step": 11520
},
{
"epoch": 0.7377888739517316,
"grad_norm": 62.20964050292969,
"learning_rate": 3.90662527494843e-07,
"loss": 0.3927,
"num_input_tokens_seen": 36283904,
"step": 11525
},
{
"epoch": 0.7381089558927085,
"grad_norm": 34.758705139160156,
"learning_rate": 3.8977693446869285e-07,
"loss": 0.3627,
"num_input_tokens_seen": 36298432,
"step": 11530
},
{
"epoch": 0.7384290378336854,
"grad_norm": 26.30589485168457,
"learning_rate": 3.8889210331320445e-07,
"loss": 0.3247,
"num_input_tokens_seen": 36313728,
"step": 11535
},
{
"epoch": 0.7387491197746623,
"grad_norm": 21.73337173461914,
"learning_rate": 3.8800803513310033e-07,
"loss": 0.3595,
"num_input_tokens_seen": 36329088,
"step": 11540
},
{
"epoch": 0.7390692017156392,
"grad_norm": 38.45037841796875,
"learning_rate": 3.8712473103214993e-07,
"loss": 0.4255,
"num_input_tokens_seen": 36345024,
"step": 11545
},
{
"epoch": 0.739389283656616,
"grad_norm": 21.429397583007812,
"learning_rate": 3.862421921131688e-07,
"loss": 0.3089,
"num_input_tokens_seen": 36361792,
"step": 11550
},
{
"epoch": 0.739709365597593,
"grad_norm": 35.84434127807617,
"learning_rate": 3.85360419478017e-07,
"loss": 0.2832,
"num_input_tokens_seen": 36377152,
"step": 11555
},
{
"epoch": 0.7400294475385699,
"grad_norm": 23.184368133544922,
"learning_rate": 3.8447941422759786e-07,
"loss": 0.3552,
"num_input_tokens_seen": 36394048,
"step": 11560
},
{
"epoch": 0.7403495294795468,
"grad_norm": 34.779685974121094,
"learning_rate": 3.835991774618579e-07,
"loss": 0.3684,
"num_input_tokens_seen": 36409152,
"step": 11565
},
{
"epoch": 0.7406696114205237,
"grad_norm": 88.90668487548828,
"learning_rate": 3.827197102797818e-07,
"loss": 0.3859,
"num_input_tokens_seen": 36427072,
"step": 11570
},
{
"epoch": 0.7409896933615006,
"grad_norm": 66.85957336425781,
"learning_rate": 3.818410137793947e-07,
"loss": 0.4771,
"num_input_tokens_seen": 36444288,
"step": 11575
},
{
"epoch": 0.7413097753024774,
"grad_norm": 20.060245513916016,
"learning_rate": 3.809630890577602e-07,
"loss": 0.4402,
"num_input_tokens_seen": 36460096,
"step": 11580
},
{
"epoch": 0.7416298572434543,
"grad_norm": 126.41194152832031,
"learning_rate": 3.800859372109777e-07,
"loss": 0.3388,
"num_input_tokens_seen": 36475264,
"step": 11585
},
{
"epoch": 0.7419499391844312,
"grad_norm": 17.512025833129883,
"learning_rate": 3.7920955933418055e-07,
"loss": 0.325,
"num_input_tokens_seen": 36491264,
"step": 11590
},
{
"epoch": 0.7422700211254081,
"grad_norm": 46.947872161865234,
"learning_rate": 3.7833395652153775e-07,
"loss": 0.3245,
"num_input_tokens_seen": 36506368,
"step": 11595
},
{
"epoch": 0.742590103066385,
"grad_norm": 33.25145721435547,
"learning_rate": 3.774591298662497e-07,
"loss": 0.3117,
"num_input_tokens_seen": 36522432,
"step": 11600
},
{
"epoch": 0.7429101850073618,
"grad_norm": 64.40025329589844,
"learning_rate": 3.765850804605468e-07,
"loss": 0.4221,
"num_input_tokens_seen": 36539008,
"step": 11605
},
{
"epoch": 0.7432302669483388,
"grad_norm": 28.040822982788086,
"learning_rate": 3.7571180939569104e-07,
"loss": 0.2818,
"num_input_tokens_seen": 36554240,
"step": 11610
},
{
"epoch": 0.7435503488893157,
"grad_norm": 36.580570220947266,
"learning_rate": 3.748393177619711e-07,
"loss": 0.3181,
"num_input_tokens_seen": 36569920,
"step": 11615
},
{
"epoch": 0.7438704308302926,
"grad_norm": 33.55031204223633,
"learning_rate": 3.739676066487032e-07,
"loss": 0.3139,
"num_input_tokens_seen": 36585792,
"step": 11620
},
{
"epoch": 0.7441905127712695,
"grad_norm": 23.285324096679688,
"learning_rate": 3.730966771442289e-07,
"loss": 0.2923,
"num_input_tokens_seen": 36601280,
"step": 11625
},
{
"epoch": 0.7445105947122463,
"grad_norm": 33.82448196411133,
"learning_rate": 3.722265303359137e-07,
"loss": 0.5229,
"num_input_tokens_seen": 36617152,
"step": 11630
},
{
"epoch": 0.7448306766532232,
"grad_norm": 60.476661682128906,
"learning_rate": 3.713571673101463e-07,
"loss": 0.4046,
"num_input_tokens_seen": 36632512,
"step": 11635
},
{
"epoch": 0.7451507585942001,
"grad_norm": 15.05838394165039,
"learning_rate": 3.704885891523366e-07,
"loss": 0.344,
"num_input_tokens_seen": 36647744,
"step": 11640
},
{
"epoch": 0.745470840535177,
"grad_norm": 34.84885025024414,
"learning_rate": 3.696207969469146e-07,
"loss": 0.3938,
"num_input_tokens_seen": 36663360,
"step": 11645
},
{
"epoch": 0.7457909224761539,
"grad_norm": 29.558528900146484,
"learning_rate": 3.6875379177732913e-07,
"loss": 0.373,
"num_input_tokens_seen": 36678656,
"step": 11650
},
{
"epoch": 0.7461110044171307,
"grad_norm": 73.68892669677734,
"learning_rate": 3.6788757472604634e-07,
"loss": 0.5096,
"num_input_tokens_seen": 36693952,
"step": 11655
},
{
"epoch": 0.7464310863581076,
"grad_norm": 35.93594741821289,
"learning_rate": 3.6702214687454825e-07,
"loss": 0.3264,
"num_input_tokens_seen": 36709888,
"step": 11660
},
{
"epoch": 0.7467511682990846,
"grad_norm": 28.113248825073242,
"learning_rate": 3.6615750930333177e-07,
"loss": 0.3066,
"num_input_tokens_seen": 36725504,
"step": 11665
},
{
"epoch": 0.7470712502400615,
"grad_norm": 11.288246154785156,
"learning_rate": 3.65293663091907e-07,
"loss": 0.3025,
"num_input_tokens_seen": 36741376,
"step": 11670
},
{
"epoch": 0.7473913321810384,
"grad_norm": 31.954673767089844,
"learning_rate": 3.6443060931879623e-07,
"loss": 0.435,
"num_input_tokens_seen": 36756864,
"step": 11675
},
{
"epoch": 0.7477114141220152,
"grad_norm": 29.685522079467773,
"learning_rate": 3.635683490615321e-07,
"loss": 0.4612,
"num_input_tokens_seen": 36772608,
"step": 11680
},
{
"epoch": 0.7480314960629921,
"grad_norm": 76.63870239257812,
"learning_rate": 3.6270688339665634e-07,
"loss": 0.3057,
"num_input_tokens_seen": 36788352,
"step": 11685
},
{
"epoch": 0.748351578003969,
"grad_norm": 41.03104782104492,
"learning_rate": 3.6184621339972e-07,
"loss": 0.3581,
"num_input_tokens_seen": 36804096,
"step": 11690
},
{
"epoch": 0.7486716599449459,
"grad_norm": 41.0787353515625,
"learning_rate": 3.609863401452786e-07,
"loss": 0.3592,
"num_input_tokens_seen": 36819776,
"step": 11695
},
{
"epoch": 0.7489917418859228,
"grad_norm": 36.56906509399414,
"learning_rate": 3.6012726470689416e-07,
"loss": 0.4102,
"num_input_tokens_seen": 36835072,
"step": 11700
},
{
"epoch": 0.7493118238268996,
"grad_norm": 26.697582244873047,
"learning_rate": 3.592689881571329e-07,
"loss": 0.3346,
"num_input_tokens_seen": 36850816,
"step": 11705
},
{
"epoch": 0.7496319057678765,
"grad_norm": 39.08557891845703,
"learning_rate": 3.5841151156756334e-07,
"loss": 0.4205,
"num_input_tokens_seen": 36866368,
"step": 11710
},
{
"epoch": 0.7499519877088535,
"grad_norm": 40.700199127197266,
"learning_rate": 3.575548360087539e-07,
"loss": 0.4196,
"num_input_tokens_seen": 36885376,
"step": 11715
},
{
"epoch": 0.7502720696498304,
"grad_norm": 18.3825626373291,
"learning_rate": 3.5669896255027533e-07,
"loss": 0.3191,
"num_input_tokens_seen": 36900288,
"step": 11720
},
{
"epoch": 0.7505921515908073,
"grad_norm": 17.36146354675293,
"learning_rate": 3.5584389226069543e-07,
"loss": 0.3892,
"num_input_tokens_seen": 36916224,
"step": 11725
},
{
"epoch": 0.7509122335317842,
"grad_norm": 20.827890396118164,
"learning_rate": 3.5498962620757866e-07,
"loss": 0.3097,
"num_input_tokens_seen": 36931648,
"step": 11730
},
{
"epoch": 0.7509122335317842,
"eval_loss": 0.36731547117233276,
"eval_runtime": 50.5825,
"eval_samples_per_second": 274.522,
"eval_steps_per_second": 34.32,
"num_input_tokens_seen": 36931648,
"step": 11730
},
{
"epoch": 0.751232315472761,
"grad_norm": 83.0255126953125,
"learning_rate": 3.5413616545748713e-07,
"loss": 0.4301,
"num_input_tokens_seen": 36945856,
"step": 11735
},
{
"epoch": 0.7515523974137379,
"grad_norm": 25.14464569091797,
"learning_rate": 3.532835110759763e-07,
"loss": 0.509,
"num_input_tokens_seen": 36961792,
"step": 11740
},
{
"epoch": 0.7518724793547148,
"grad_norm": 25.07977294921875,
"learning_rate": 3.524316641275955e-07,
"loss": 0.3072,
"num_input_tokens_seen": 36977152,
"step": 11745
},
{
"epoch": 0.7521925612956917,
"grad_norm": 20.042966842651367,
"learning_rate": 3.5158062567588467e-07,
"loss": 0.4213,
"num_input_tokens_seen": 36991936,
"step": 11750
},
{
"epoch": 0.7525126432366686,
"grad_norm": 94.77703857421875,
"learning_rate": 3.5073039678337633e-07,
"loss": 0.4065,
"num_input_tokens_seen": 37006784,
"step": 11755
},
{
"epoch": 0.7528327251776454,
"grad_norm": 38.61663055419922,
"learning_rate": 3.498809785115908e-07,
"loss": 0.3394,
"num_input_tokens_seen": 37022208,
"step": 11760
},
{
"epoch": 0.7531528071186223,
"grad_norm": 12.624959945678711,
"learning_rate": 3.4903237192103697e-07,
"loss": 0.3495,
"num_input_tokens_seen": 37039488,
"step": 11765
},
{
"epoch": 0.7534728890595993,
"grad_norm": 40.64104080200195,
"learning_rate": 3.481845780712099e-07,
"loss": 0.3453,
"num_input_tokens_seen": 37056064,
"step": 11770
},
{
"epoch": 0.7537929710005762,
"grad_norm": 36.66989517211914,
"learning_rate": 3.4733759802059037e-07,
"loss": 0.3434,
"num_input_tokens_seen": 37072256,
"step": 11775
},
{
"epoch": 0.7541130529415531,
"grad_norm": 63.55211639404297,
"learning_rate": 3.4649143282664273e-07,
"loss": 0.428,
"num_input_tokens_seen": 37087360,
"step": 11780
},
{
"epoch": 0.7544331348825299,
"grad_norm": 27.92076873779297,
"learning_rate": 3.456460835458143e-07,
"loss": 0.3164,
"num_input_tokens_seen": 37102144,
"step": 11785
},
{
"epoch": 0.7547532168235068,
"grad_norm": 40.95823669433594,
"learning_rate": 3.4480155123353337e-07,
"loss": 0.3131,
"num_input_tokens_seen": 37117568,
"step": 11790
},
{
"epoch": 0.7550732987644837,
"grad_norm": 38.527374267578125,
"learning_rate": 3.4395783694420875e-07,
"loss": 0.4608,
"num_input_tokens_seen": 37132800,
"step": 11795
},
{
"epoch": 0.7553933807054606,
"grad_norm": 22.126087188720703,
"learning_rate": 3.4311494173122743e-07,
"loss": 0.4036,
"num_input_tokens_seen": 37147776,
"step": 11800
},
{
"epoch": 0.7557134626464375,
"grad_norm": 26.484477996826172,
"learning_rate": 3.422728666469541e-07,
"loss": 0.3944,
"num_input_tokens_seen": 37163904,
"step": 11805
},
{
"epoch": 0.7560335445874143,
"grad_norm": 43.04818344116211,
"learning_rate": 3.41431612742729e-07,
"loss": 0.4316,
"num_input_tokens_seen": 37180416,
"step": 11810
},
{
"epoch": 0.7563536265283912,
"grad_norm": 23.99943733215332,
"learning_rate": 3.4059118106886855e-07,
"loss": 0.4235,
"num_input_tokens_seen": 37196480,
"step": 11815
},
{
"epoch": 0.7566737084693682,
"grad_norm": 76.9852066040039,
"learning_rate": 3.3975157267466036e-07,
"loss": 0.5208,
"num_input_tokens_seen": 37211648,
"step": 11820
},
{
"epoch": 0.7569937904103451,
"grad_norm": 28.845922470092773,
"learning_rate": 3.389127886083656e-07,
"loss": 0.2942,
"num_input_tokens_seen": 37227072,
"step": 11825
},
{
"epoch": 0.757313872351322,
"grad_norm": 23.833080291748047,
"learning_rate": 3.3807482991721667e-07,
"loss": 0.3342,
"num_input_tokens_seen": 37243968,
"step": 11830
},
{
"epoch": 0.7576339542922989,
"grad_norm": 19.469358444213867,
"learning_rate": 3.3723769764741474e-07,
"loss": 0.32,
"num_input_tokens_seen": 37259200,
"step": 11835
},
{
"epoch": 0.7579540362332757,
"grad_norm": 17.347503662109375,
"learning_rate": 3.3640139284412825e-07,
"loss": 0.2946,
"num_input_tokens_seen": 37275072,
"step": 11840
},
{
"epoch": 0.7582741181742526,
"grad_norm": 43.63488006591797,
"learning_rate": 3.355659165514948e-07,
"loss": 0.4,
"num_input_tokens_seen": 37291392,
"step": 11845
},
{
"epoch": 0.7585942001152295,
"grad_norm": 18.54771614074707,
"learning_rate": 3.347312698126161e-07,
"loss": 0.2828,
"num_input_tokens_seen": 37307648,
"step": 11850
},
{
"epoch": 0.7589142820562064,
"grad_norm": 14.965799331665039,
"learning_rate": 3.338974536695578e-07,
"loss": 0.2188,
"num_input_tokens_seen": 37323136,
"step": 11855
},
{
"epoch": 0.7592343639971832,
"grad_norm": 22.427433013916016,
"learning_rate": 3.330644691633492e-07,
"loss": 0.3193,
"num_input_tokens_seen": 37338496,
"step": 11860
},
{
"epoch": 0.7595544459381601,
"grad_norm": 11.61136245727539,
"learning_rate": 3.322323173339818e-07,
"loss": 0.2764,
"num_input_tokens_seen": 37356800,
"step": 11865
},
{
"epoch": 0.759874527879137,
"grad_norm": 27.630123138427734,
"learning_rate": 3.314009992204071e-07,
"loss": 0.4461,
"num_input_tokens_seen": 37372800,
"step": 11870
},
{
"epoch": 0.760194609820114,
"grad_norm": 63.511077880859375,
"learning_rate": 3.3057051586053443e-07,
"loss": 0.3172,
"num_input_tokens_seen": 37388608,
"step": 11875
},
{
"epoch": 0.7605146917610909,
"grad_norm": 34.02855682373047,
"learning_rate": 3.297408682912329e-07,
"loss": 0.4503,
"num_input_tokens_seen": 37405184,
"step": 11880
},
{
"epoch": 0.7608347737020678,
"grad_norm": 21.150548934936523,
"learning_rate": 3.289120575483271e-07,
"loss": 0.2743,
"num_input_tokens_seen": 37420096,
"step": 11885
},
{
"epoch": 0.7611548556430446,
"grad_norm": 33.34170150756836,
"learning_rate": 3.280840846665969e-07,
"loss": 0.4177,
"num_input_tokens_seen": 37434368,
"step": 11890
},
{
"epoch": 0.7614749375840215,
"grad_norm": 32.07157897949219,
"learning_rate": 3.272569506797761e-07,
"loss": 0.3019,
"num_input_tokens_seen": 37449344,
"step": 11895
},
{
"epoch": 0.7617950195249984,
"grad_norm": 29.10479736328125,
"learning_rate": 3.2643065662055136e-07,
"loss": 0.3364,
"num_input_tokens_seen": 37464448,
"step": 11900
},
{
"epoch": 0.7621151014659753,
"grad_norm": 68.0009765625,
"learning_rate": 3.2560520352056033e-07,
"loss": 0.2844,
"num_input_tokens_seen": 37481856,
"step": 11905
},
{
"epoch": 0.7624351834069522,
"grad_norm": 19.72753143310547,
"learning_rate": 3.24780592410391e-07,
"loss": 0.3952,
"num_input_tokens_seen": 37497856,
"step": 11910
},
{
"epoch": 0.762755265347929,
"grad_norm": 40.24898147583008,
"learning_rate": 3.2395682431957994e-07,
"loss": 0.4545,
"num_input_tokens_seen": 37513600,
"step": 11915
},
{
"epoch": 0.7630753472889059,
"grad_norm": 36.124610900878906,
"learning_rate": 3.231339002766115e-07,
"loss": 0.3272,
"num_input_tokens_seen": 37529408,
"step": 11920
},
{
"epoch": 0.7633954292298829,
"grad_norm": 30.874311447143555,
"learning_rate": 3.2231182130891564e-07,
"loss": 0.3396,
"num_input_tokens_seen": 37545984,
"step": 11925
},
{
"epoch": 0.7637155111708598,
"grad_norm": 70.74797058105469,
"learning_rate": 3.214905884428679e-07,
"loss": 0.3342,
"num_input_tokens_seen": 37561856,
"step": 11930
},
{
"epoch": 0.7640355931118367,
"grad_norm": 29.5023136138916,
"learning_rate": 3.206702027037868e-07,
"loss": 0.3292,
"num_input_tokens_seen": 37578624,
"step": 11935
},
{
"epoch": 0.7643556750528135,
"grad_norm": 45.15686798095703,
"learning_rate": 3.198506651159344e-07,
"loss": 0.3962,
"num_input_tokens_seen": 37593920,
"step": 11940
},
{
"epoch": 0.7646757569937904,
"grad_norm": 24.18119239807129,
"learning_rate": 3.190319767025121e-07,
"loss": 0.3658,
"num_input_tokens_seen": 37609664,
"step": 11945
},
{
"epoch": 0.7649958389347673,
"grad_norm": 58.468135833740234,
"learning_rate": 3.1821413848566213e-07,
"loss": 0.4959,
"num_input_tokens_seen": 37626048,
"step": 11950
},
{
"epoch": 0.7653159208757442,
"grad_norm": 19.08656883239746,
"learning_rate": 3.1739715148646564e-07,
"loss": 0.3753,
"num_input_tokens_seen": 37641792,
"step": 11955
},
{
"epoch": 0.7656360028167211,
"grad_norm": 54.501651763916016,
"learning_rate": 3.1658101672494043e-07,
"loss": 0.4534,
"num_input_tokens_seen": 37656512,
"step": 11960
},
{
"epoch": 0.7659560847576979,
"grad_norm": 47.72370910644531,
"learning_rate": 3.157657352200397e-07,
"loss": 0.3377,
"num_input_tokens_seen": 37672000,
"step": 11965
},
{
"epoch": 0.7662761666986748,
"grad_norm": 40.14820861816406,
"learning_rate": 3.149513079896521e-07,
"loss": 0.3278,
"num_input_tokens_seen": 37687232,
"step": 11970
},
{
"epoch": 0.7665962486396517,
"grad_norm": 19.0128116607666,
"learning_rate": 3.1413773605060034e-07,
"loss": 0.3237,
"num_input_tokens_seen": 37702656,
"step": 11975
},
{
"epoch": 0.7669163305806287,
"grad_norm": 59.66189956665039,
"learning_rate": 3.1332502041863783e-07,
"loss": 0.4234,
"num_input_tokens_seen": 37718080,
"step": 11980
},
{
"epoch": 0.7672364125216056,
"grad_norm": 28.364665985107422,
"learning_rate": 3.1251316210844946e-07,
"loss": 0.3181,
"num_input_tokens_seen": 37735680,
"step": 11985
},
{
"epoch": 0.7675564944625825,
"grad_norm": 51.28329849243164,
"learning_rate": 3.1170216213365055e-07,
"loss": 0.2871,
"num_input_tokens_seen": 37749952,
"step": 11990
},
{
"epoch": 0.7678765764035593,
"grad_norm": 34.2696647644043,
"learning_rate": 3.1089202150678397e-07,
"loss": 0.4582,
"num_input_tokens_seen": 37765312,
"step": 11995
},
{
"epoch": 0.7681966583445362,
"grad_norm": 50.565311431884766,
"learning_rate": 3.1008274123931886e-07,
"loss": 0.4919,
"num_input_tokens_seen": 37780160,
"step": 12000
},
{
"epoch": 0.7685167402855131,
"grad_norm": 28.518428802490234,
"learning_rate": 3.092743223416523e-07,
"loss": 0.2657,
"num_input_tokens_seen": 37796352,
"step": 12005
},
{
"epoch": 0.76883682222649,
"grad_norm": 60.307430267333984,
"learning_rate": 3.0846676582310413e-07,
"loss": 0.3551,
"num_input_tokens_seen": 37812864,
"step": 12010
},
{
"epoch": 0.7691569041674668,
"grad_norm": 45.83395767211914,
"learning_rate": 3.076600726919185e-07,
"loss": 0.3818,
"num_input_tokens_seen": 37827840,
"step": 12015
},
{
"epoch": 0.7694769861084437,
"grad_norm": 32.84312438964844,
"learning_rate": 3.0685424395526106e-07,
"loss": 0.3599,
"num_input_tokens_seen": 37847040,
"step": 12020
},
{
"epoch": 0.7697970680494206,
"grad_norm": 38.15679168701172,
"learning_rate": 3.060492806192184e-07,
"loss": 0.2875,
"num_input_tokens_seen": 37862464,
"step": 12025
},
{
"epoch": 0.7701171499903975,
"grad_norm": 36.667755126953125,
"learning_rate": 3.052451836887968e-07,
"loss": 0.3826,
"num_input_tokens_seen": 37877760,
"step": 12030
},
{
"epoch": 0.7704372319313745,
"grad_norm": 28.926128387451172,
"learning_rate": 3.044419541679207e-07,
"loss": 0.2867,
"num_input_tokens_seen": 37892800,
"step": 12035
},
{
"epoch": 0.7707573138723514,
"grad_norm": 59.06888961791992,
"learning_rate": 3.0363959305943153e-07,
"loss": 0.4353,
"num_input_tokens_seen": 37909056,
"step": 12040
},
{
"epoch": 0.7710773958133282,
"grad_norm": 42.93339157104492,
"learning_rate": 3.028381013650867e-07,
"loss": 0.3447,
"num_input_tokens_seen": 37925376,
"step": 12045
},
{
"epoch": 0.7713974777543051,
"grad_norm": 39.62418746948242,
"learning_rate": 3.0203748008555783e-07,
"loss": 0.3705,
"num_input_tokens_seen": 37941632,
"step": 12050
},
{
"epoch": 0.771717559695282,
"grad_norm": 38.263912200927734,
"learning_rate": 3.012377302204301e-07,
"loss": 0.374,
"num_input_tokens_seen": 37957056,
"step": 12055
},
{
"epoch": 0.7720376416362589,
"grad_norm": 47.257015228271484,
"learning_rate": 3.0043885276820046e-07,
"loss": 0.3959,
"num_input_tokens_seen": 37973184,
"step": 12060
},
{
"epoch": 0.7723577235772358,
"grad_norm": 25.244918823242188,
"learning_rate": 2.99640848726277e-07,
"loss": 0.3027,
"num_input_tokens_seen": 37988288,
"step": 12065
},
{
"epoch": 0.7726778055182126,
"grad_norm": 25.476991653442383,
"learning_rate": 2.9884371909097704e-07,
"loss": 0.3723,
"num_input_tokens_seen": 38004224,
"step": 12070
},
{
"epoch": 0.7729978874591895,
"grad_norm": 23.3084774017334,
"learning_rate": 2.9804746485752616e-07,
"loss": 0.3721,
"num_input_tokens_seen": 38019456,
"step": 12075
},
{
"epoch": 0.7733179694001664,
"grad_norm": 28.834396362304688,
"learning_rate": 2.972520870200573e-07,
"loss": 0.4237,
"num_input_tokens_seen": 38035264,
"step": 12080
},
{
"epoch": 0.7736380513411434,
"grad_norm": 32.87154006958008,
"learning_rate": 2.9645758657160904e-07,
"loss": 0.4166,
"num_input_tokens_seen": 38051072,
"step": 12085
},
{
"epoch": 0.7739581332821203,
"grad_norm": 16.961706161499023,
"learning_rate": 2.9566396450412444e-07,
"loss": 0.3573,
"num_input_tokens_seen": 38066688,
"step": 12090
},
{
"epoch": 0.7742782152230971,
"grad_norm": 22.967132568359375,
"learning_rate": 2.9487122180844957e-07,
"loss": 0.3237,
"num_input_tokens_seen": 38082048,
"step": 12095
},
{
"epoch": 0.774598297164074,
"grad_norm": 57.26237487792969,
"learning_rate": 2.9407935947433406e-07,
"loss": 0.3143,
"num_input_tokens_seen": 38097344,
"step": 12100
},
{
"epoch": 0.7749183791050509,
"grad_norm": 45.43290328979492,
"learning_rate": 2.932883784904264e-07,
"loss": 0.4448,
"num_input_tokens_seen": 38112320,
"step": 12105
},
{
"epoch": 0.7752384610460278,
"grad_norm": 15.891319274902344,
"learning_rate": 2.9249827984427555e-07,
"loss": 0.244,
"num_input_tokens_seen": 38128000,
"step": 12110
},
{
"epoch": 0.7755585429870047,
"grad_norm": 30.146347045898438,
"learning_rate": 2.917090645223297e-07,
"loss": 0.3049,
"num_input_tokens_seen": 38143168,
"step": 12115
},
{
"epoch": 0.7758786249279815,
"grad_norm": 28.579742431640625,
"learning_rate": 2.909207335099332e-07,
"loss": 0.301,
"num_input_tokens_seen": 38157824,
"step": 12120
},
{
"epoch": 0.7761987068689584,
"grad_norm": 32.7314567565918,
"learning_rate": 2.9013328779132595e-07,
"loss": 0.3329,
"num_input_tokens_seen": 38172864,
"step": 12125
},
{
"epoch": 0.7765187888099353,
"grad_norm": 102.57218170166016,
"learning_rate": 2.893467283496439e-07,
"loss": 0.4221,
"num_input_tokens_seen": 38187264,
"step": 12130
},
{
"epoch": 0.7768388707509122,
"grad_norm": 21.075590133666992,
"learning_rate": 2.885610561669155e-07,
"loss": 0.3534,
"num_input_tokens_seen": 38204288,
"step": 12135
},
{
"epoch": 0.7771589526918892,
"grad_norm": 29.551855087280273,
"learning_rate": 2.8777627222406163e-07,
"loss": 0.3447,
"num_input_tokens_seen": 38219264,
"step": 12140
},
{
"epoch": 0.777479034632866,
"grad_norm": 41.8278923034668,
"learning_rate": 2.869923775008943e-07,
"loss": 0.3845,
"num_input_tokens_seen": 38234496,
"step": 12145
},
{
"epoch": 0.7777991165738429,
"grad_norm": 41.65421676635742,
"learning_rate": 2.862093729761155e-07,
"loss": 0.2729,
"num_input_tokens_seen": 38251072,
"step": 12150
},
{
"epoch": 0.7781191985148198,
"grad_norm": 29.3076114654541,
"learning_rate": 2.854272596273152e-07,
"loss": 0.3971,
"num_input_tokens_seen": 38266560,
"step": 12155
},
{
"epoch": 0.7784392804557967,
"grad_norm": 44.304229736328125,
"learning_rate": 2.8464603843097134e-07,
"loss": 0.331,
"num_input_tokens_seen": 38282944,
"step": 12160
},
{
"epoch": 0.7787593623967736,
"grad_norm": 33.777957916259766,
"learning_rate": 2.8386571036244764e-07,
"loss": 0.3274,
"num_input_tokens_seen": 38299264,
"step": 12165
},
{
"epoch": 0.7790794443377504,
"grad_norm": 51.79270553588867,
"learning_rate": 2.830862763959929e-07,
"loss": 0.3866,
"num_input_tokens_seen": 38314368,
"step": 12170
},
{
"epoch": 0.7793995262787273,
"grad_norm": 11.458423614501953,
"learning_rate": 2.8230773750473956e-07,
"loss": 0.3108,
"num_input_tokens_seen": 38329664,
"step": 12175
},
{
"epoch": 0.7797196082197042,
"grad_norm": 28.3763427734375,
"learning_rate": 2.8153009466070267e-07,
"loss": 0.3067,
"num_input_tokens_seen": 38345408,
"step": 12180
},
{
"epoch": 0.7800396901606811,
"grad_norm": 32.849178314208984,
"learning_rate": 2.807533488347783e-07,
"loss": 0.2959,
"num_input_tokens_seen": 38362688,
"step": 12185
},
{
"epoch": 0.7803597721016581,
"grad_norm": 24.91496467590332,
"learning_rate": 2.7997750099674277e-07,
"loss": 0.2508,
"num_input_tokens_seen": 38377600,
"step": 12190
},
{
"epoch": 0.780679854042635,
"grad_norm": 38.691551208496094,
"learning_rate": 2.792025521152512e-07,
"loss": 0.5263,
"num_input_tokens_seen": 38392640,
"step": 12195
},
{
"epoch": 0.7809999359836118,
"grad_norm": 34.44416427612305,
"learning_rate": 2.784285031578365e-07,
"loss": 0.4457,
"num_input_tokens_seen": 38408448,
"step": 12200
},
{
"epoch": 0.7813200179245887,
"grad_norm": 26.372634887695312,
"learning_rate": 2.7765535509090786e-07,
"loss": 0.3649,
"num_input_tokens_seen": 38424512,
"step": 12205
},
{
"epoch": 0.7816400998655656,
"grad_norm": 31.301618576049805,
"learning_rate": 2.768831088797495e-07,
"loss": 0.4661,
"num_input_tokens_seen": 38439296,
"step": 12210
},
{
"epoch": 0.7819601818065425,
"grad_norm": 16.063852310180664,
"learning_rate": 2.761117654885201e-07,
"loss": 0.247,
"num_input_tokens_seen": 38455424,
"step": 12215
},
{
"epoch": 0.7822802637475194,
"grad_norm": 25.676212310791016,
"learning_rate": 2.7534132588025063e-07,
"loss": 0.3314,
"num_input_tokens_seen": 38470976,
"step": 12220
},
{
"epoch": 0.7826003456884962,
"grad_norm": 28.28862762451172,
"learning_rate": 2.7457179101684483e-07,
"loss": 0.5088,
"num_input_tokens_seen": 38486016,
"step": 12225
},
{
"epoch": 0.7829204276294731,
"grad_norm": 23.850549697875977,
"learning_rate": 2.7380316185907506e-07,
"loss": 0.2958,
"num_input_tokens_seen": 38501248,
"step": 12230
},
{
"epoch": 0.78324050957045,
"grad_norm": 19.70224380493164,
"learning_rate": 2.730354393665839e-07,
"loss": 0.3508,
"num_input_tokens_seen": 38516992,
"step": 12235
},
{
"epoch": 0.7835605915114269,
"grad_norm": 30.95526123046875,
"learning_rate": 2.7226862449788245e-07,
"loss": 0.3871,
"num_input_tokens_seen": 38531456,
"step": 12240
},
{
"epoch": 0.7838806734524039,
"grad_norm": 39.920440673828125,
"learning_rate": 2.715027182103482e-07,
"loss": 0.3283,
"num_input_tokens_seen": 38546880,
"step": 12245
},
{
"epoch": 0.7842007553933807,
"grad_norm": 22.294261932373047,
"learning_rate": 2.707377214602232e-07,
"loss": 0.3104,
"num_input_tokens_seen": 38562176,
"step": 12250
},
{
"epoch": 0.7845208373343576,
"grad_norm": 38.912017822265625,
"learning_rate": 2.699736352026157e-07,
"loss": 0.4304,
"num_input_tokens_seen": 38577472,
"step": 12255
},
{
"epoch": 0.7848409192753345,
"grad_norm": 22.714643478393555,
"learning_rate": 2.6921046039149645e-07,
"loss": 0.3265,
"num_input_tokens_seen": 38593088,
"step": 12260
},
{
"epoch": 0.7851610012163114,
"grad_norm": 32.15650939941406,
"learning_rate": 2.6844819797969744e-07,
"loss": 0.3378,
"num_input_tokens_seen": 38607936,
"step": 12265
},
{
"epoch": 0.7854810831572883,
"grad_norm": 41.70045471191406,
"learning_rate": 2.6768684891891236e-07,
"loss": 0.2504,
"num_input_tokens_seen": 38625024,
"step": 12270
},
{
"epoch": 0.7858011650982651,
"grad_norm": 31.758371353149414,
"learning_rate": 2.6692641415969497e-07,
"loss": 0.3268,
"num_input_tokens_seen": 38641792,
"step": 12275
},
{
"epoch": 0.786121247039242,
"grad_norm": 50.608848571777344,
"learning_rate": 2.66166894651457e-07,
"loss": 0.4112,
"num_input_tokens_seen": 38656896,
"step": 12280
},
{
"epoch": 0.7864413289802189,
"grad_norm": 43.49479675292969,
"learning_rate": 2.654082913424668e-07,
"loss": 0.343,
"num_input_tokens_seen": 38672448,
"step": 12285
},
{
"epoch": 0.7867614109211958,
"grad_norm": 28.721969604492188,
"learning_rate": 2.6465060517985003e-07,
"loss": 0.305,
"num_input_tokens_seen": 38688576,
"step": 12290
},
{
"epoch": 0.7870814928621728,
"grad_norm": 45.75242233276367,
"learning_rate": 2.638938371095867e-07,
"loss": 0.5196,
"num_input_tokens_seen": 38704064,
"step": 12295
},
{
"epoch": 0.7874015748031497,
"grad_norm": 20.558774948120117,
"learning_rate": 2.6313798807651065e-07,
"loss": 0.3756,
"num_input_tokens_seen": 38718976,
"step": 12300
},
{
"epoch": 0.7877216567441265,
"grad_norm": 19.721187591552734,
"learning_rate": 2.6238305902430813e-07,
"loss": 0.3578,
"num_input_tokens_seen": 38734272,
"step": 12305
},
{
"epoch": 0.7880417386851034,
"grad_norm": 14.175429344177246,
"learning_rate": 2.61629050895517e-07,
"loss": 0.3147,
"num_input_tokens_seen": 38749504,
"step": 12310
},
{
"epoch": 0.7883618206260803,
"grad_norm": 29.975229263305664,
"learning_rate": 2.608759646315253e-07,
"loss": 0.3237,
"num_input_tokens_seen": 38764352,
"step": 12315
},
{
"epoch": 0.7886819025670572,
"grad_norm": 21.73525619506836,
"learning_rate": 2.6012380117257005e-07,
"loss": 0.3771,
"num_input_tokens_seen": 38780096,
"step": 12320
},
{
"epoch": 0.789001984508034,
"grad_norm": 28.346630096435547,
"learning_rate": 2.5937256145773613e-07,
"loss": 0.3853,
"num_input_tokens_seen": 38795712,
"step": 12325
},
{
"epoch": 0.7893220664490109,
"grad_norm": 38.748958587646484,
"learning_rate": 2.586222464249551e-07,
"loss": 0.3191,
"num_input_tokens_seen": 38811328,
"step": 12330
},
{
"epoch": 0.7896421483899878,
"grad_norm": 39.744686126708984,
"learning_rate": 2.5787285701100413e-07,
"loss": 0.2067,
"num_input_tokens_seen": 38826240,
"step": 12335
},
{
"epoch": 0.7899622303309647,
"grad_norm": 39.24365234375,
"learning_rate": 2.571243941515048e-07,
"loss": 0.3655,
"num_input_tokens_seen": 38842624,
"step": 12340
},
{
"epoch": 0.7902823122719416,
"grad_norm": 24.49032211303711,
"learning_rate": 2.563768587809213e-07,
"loss": 0.278,
"num_input_tokens_seen": 38857472,
"step": 12345
},
{
"epoch": 0.7906023942129186,
"grad_norm": 60.89975357055664,
"learning_rate": 2.5563025183256137e-07,
"loss": 0.4174,
"num_input_tokens_seen": 38872256,
"step": 12350
},
{
"epoch": 0.7909224761538954,
"grad_norm": 39.74103927612305,
"learning_rate": 2.548845742385717e-07,
"loss": 0.5513,
"num_input_tokens_seen": 38890048,
"step": 12355
},
{
"epoch": 0.7912425580948723,
"grad_norm": 38.84343719482422,
"learning_rate": 2.541398269299393e-07,
"loss": 0.2424,
"num_input_tokens_seen": 38905664,
"step": 12360
},
{
"epoch": 0.7915626400358492,
"grad_norm": 14.47574234008789,
"learning_rate": 2.5339601083649063e-07,
"loss": 0.3106,
"num_input_tokens_seen": 38926144,
"step": 12365
},
{
"epoch": 0.7918827219768261,
"grad_norm": 37.022544860839844,
"learning_rate": 2.526531268868889e-07,
"loss": 0.5144,
"num_input_tokens_seen": 38942720,
"step": 12370
},
{
"epoch": 0.792202803917803,
"grad_norm": 26.23043441772461,
"learning_rate": 2.5191117600863266e-07,
"loss": 0.3388,
"num_input_tokens_seen": 38958144,
"step": 12375
},
{
"epoch": 0.7925228858587798,
"grad_norm": 19.583799362182617,
"learning_rate": 2.511701591280565e-07,
"loss": 0.2559,
"num_input_tokens_seen": 38973376,
"step": 12380
},
{
"epoch": 0.7928429677997567,
"grad_norm": 40.05327224731445,
"learning_rate": 2.504300771703295e-07,
"loss": 0.3501,
"num_input_tokens_seen": 38989504,
"step": 12385
},
{
"epoch": 0.7931630497407336,
"grad_norm": 64.94096374511719,
"learning_rate": 2.496909310594517e-07,
"loss": 0.3819,
"num_input_tokens_seen": 39005056,
"step": 12390
},
{
"epoch": 0.7934831316817105,
"grad_norm": 41.124534606933594,
"learning_rate": 2.4895272171825587e-07,
"loss": 0.4581,
"num_input_tokens_seen": 39020608,
"step": 12395
},
{
"epoch": 0.7938032136226874,
"grad_norm": 34.6364860534668,
"learning_rate": 2.482154500684055e-07,
"loss": 0.4464,
"num_input_tokens_seen": 39035712,
"step": 12400
},
{
"epoch": 0.7941232955636643,
"grad_norm": 29.507856369018555,
"learning_rate": 2.4747911703039293e-07,
"loss": 0.3431,
"num_input_tokens_seen": 39050880,
"step": 12405
},
{
"epoch": 0.7944433775046412,
"grad_norm": 35.2116813659668,
"learning_rate": 2.467437235235378e-07,
"loss": 0.3737,
"num_input_tokens_seen": 39065792,
"step": 12410
},
{
"epoch": 0.7947634594456181,
"grad_norm": 32.397830963134766,
"learning_rate": 2.460092704659883e-07,
"loss": 0.3441,
"num_input_tokens_seen": 39080960,
"step": 12415
},
{
"epoch": 0.795083541386595,
"grad_norm": 16.87535285949707,
"learning_rate": 2.452757587747174e-07,
"loss": 0.2641,
"num_input_tokens_seen": 39097216,
"step": 12420
},
{
"epoch": 0.7954036233275719,
"grad_norm": 24.034717559814453,
"learning_rate": 2.445431893655232e-07,
"loss": 0.182,
"num_input_tokens_seen": 39113152,
"step": 12425
},
{
"epoch": 0.7957237052685487,
"grad_norm": 36.59601593017578,
"learning_rate": 2.438115631530271e-07,
"loss": 0.3652,
"num_input_tokens_seen": 39130176,
"step": 12430
},
{
"epoch": 0.7960437872095256,
"grad_norm": 28.420482635498047,
"learning_rate": 2.4308088105067305e-07,
"loss": 0.2338,
"num_input_tokens_seen": 39145792,
"step": 12435
},
{
"epoch": 0.7963638691505025,
"grad_norm": 68.1366958618164,
"learning_rate": 2.423511439707262e-07,
"loss": 0.4227,
"num_input_tokens_seen": 39161280,
"step": 12440
},
{
"epoch": 0.7966839510914794,
"grad_norm": 24.002521514892578,
"learning_rate": 2.4162235282427177e-07,
"loss": 0.2807,
"num_input_tokens_seen": 39176512,
"step": 12445
},
{
"epoch": 0.7970040330324563,
"grad_norm": 42.29568099975586,
"learning_rate": 2.408945085212144e-07,
"loss": 0.353,
"num_input_tokens_seen": 39191808,
"step": 12450
},
{
"epoch": 0.7973241149734333,
"grad_norm": 36.45928955078125,
"learning_rate": 2.401676119702759e-07,
"loss": 0.2507,
"num_input_tokens_seen": 39208640,
"step": 12455
},
{
"epoch": 0.7976441969144101,
"grad_norm": 20.824121475219727,
"learning_rate": 2.394416640789952e-07,
"loss": 0.3667,
"num_input_tokens_seen": 39223232,
"step": 12460
},
{
"epoch": 0.797964278855387,
"grad_norm": 40.27499008178711,
"learning_rate": 2.3871666575372696e-07,
"loss": 0.3149,
"num_input_tokens_seen": 39238656,
"step": 12465
},
{
"epoch": 0.7982843607963639,
"grad_norm": 55.65762710571289,
"learning_rate": 2.3799261789963964e-07,
"loss": 0.5348,
"num_input_tokens_seen": 39255872,
"step": 12470
},
{
"epoch": 0.7986044427373408,
"grad_norm": 21.946813583374023,
"learning_rate": 2.3726952142071644e-07,
"loss": 0.269,
"num_input_tokens_seen": 39270784,
"step": 12475
},
{
"epoch": 0.7989245246783176,
"grad_norm": 42.15665054321289,
"learning_rate": 2.365473772197508e-07,
"loss": 0.3524,
"num_input_tokens_seen": 39286080,
"step": 12480
},
{
"epoch": 0.7992446066192945,
"grad_norm": 35.08050537109375,
"learning_rate": 2.3582618619834883e-07,
"loss": 0.3557,
"num_input_tokens_seen": 39301312,
"step": 12485
},
{
"epoch": 0.7995646885602714,
"grad_norm": 16.935348510742188,
"learning_rate": 2.3510594925692528e-07,
"loss": 0.2214,
"num_input_tokens_seen": 39316736,
"step": 12490
},
{
"epoch": 0.7998847705012483,
"grad_norm": 32.69172668457031,
"learning_rate": 2.343866672947057e-07,
"loss": 0.3518,
"num_input_tokens_seen": 39331264,
"step": 12495
},
{
"epoch": 0.8002048524422252,
"grad_norm": 34.507137298583984,
"learning_rate": 2.336683412097209e-07,
"loss": 0.2711,
"num_input_tokens_seen": 39345856,
"step": 12500
},
{
"epoch": 0.800524934383202,
"grad_norm": 24.427064895629883,
"learning_rate": 2.329509718988095e-07,
"loss": 0.3662,
"num_input_tokens_seen": 39361280,
"step": 12505
},
{
"epoch": 0.800845016324179,
"grad_norm": 34.9528694152832,
"learning_rate": 2.3223456025761645e-07,
"loss": 0.3395,
"num_input_tokens_seen": 39375872,
"step": 12510
},
{
"epoch": 0.8009730491005698,
"eval_loss": 0.3655269742012024,
"eval_runtime": 50.6164,
"eval_samples_per_second": 274.338,
"eval_steps_per_second": 34.297,
"num_input_tokens_seen": 39382144,
"step": 12512
},
{
"epoch": 0.8011650982651559,
"grad_norm": 20.289682388305664,
"learning_rate": 2.315191071805892e-07,
"loss": 0.3043,
"num_input_tokens_seen": 39392320,
"step": 12515
},
{
"epoch": 0.8014851802061328,
"grad_norm": 68.8118667602539,
"learning_rate": 2.3080461356097937e-07,
"loss": 0.3619,
"num_input_tokens_seen": 39407680,
"step": 12520
},
{
"epoch": 0.8018052621471097,
"grad_norm": 20.164321899414062,
"learning_rate": 2.30091080290841e-07,
"loss": 0.2951,
"num_input_tokens_seen": 39424512,
"step": 12525
},
{
"epoch": 0.8021253440880866,
"grad_norm": 48.468223571777344,
"learning_rate": 2.29378508261029e-07,
"loss": 0.3417,
"num_input_tokens_seen": 39439296,
"step": 12530
},
{
"epoch": 0.8024454260290634,
"grad_norm": 51.00064468383789,
"learning_rate": 2.2866689836119702e-07,
"loss": 0.3672,
"num_input_tokens_seen": 39456576,
"step": 12535
},
{
"epoch": 0.8027655079700403,
"grad_norm": 64.20645141601562,
"learning_rate": 2.2795625147979913e-07,
"loss": 0.3553,
"num_input_tokens_seen": 39472512,
"step": 12540
},
{
"epoch": 0.8030855899110172,
"grad_norm": 22.278350830078125,
"learning_rate": 2.2724656850408597e-07,
"loss": 0.2351,
"num_input_tokens_seen": 39488192,
"step": 12545
},
{
"epoch": 0.8034056718519941,
"grad_norm": 44.79075622558594,
"learning_rate": 2.2653785032010532e-07,
"loss": 0.3808,
"num_input_tokens_seen": 39503552,
"step": 12550
},
{
"epoch": 0.803725753792971,
"grad_norm": 40.77724838256836,
"learning_rate": 2.258300978126999e-07,
"loss": 0.3368,
"num_input_tokens_seen": 39519744,
"step": 12555
},
{
"epoch": 0.804045835733948,
"grad_norm": 23.474609375,
"learning_rate": 2.2512331186550715e-07,
"loss": 0.4903,
"num_input_tokens_seen": 39535232,
"step": 12560
},
{
"epoch": 0.8043659176749248,
"grad_norm": 45.52729415893555,
"learning_rate": 2.244174933609575e-07,
"loss": 0.3867,
"num_input_tokens_seen": 39549568,
"step": 12565
},
{
"epoch": 0.8046859996159017,
"grad_norm": 27.22245216369629,
"learning_rate": 2.2371264318027383e-07,
"loss": 0.2726,
"num_input_tokens_seen": 39566016,
"step": 12570
},
{
"epoch": 0.8050060815568786,
"grad_norm": 28.36591339111328,
"learning_rate": 2.2300876220346975e-07,
"loss": 0.2337,
"num_input_tokens_seen": 39581760,
"step": 12575
},
{
"epoch": 0.8053261634978555,
"grad_norm": 38.8742561340332,
"learning_rate": 2.2230585130934897e-07,
"loss": 0.2888,
"num_input_tokens_seen": 39597888,
"step": 12580
},
{
"epoch": 0.8056462454388323,
"grad_norm": 25.22014045715332,
"learning_rate": 2.2160391137550394e-07,
"loss": 0.4469,
"num_input_tokens_seen": 39613568,
"step": 12585
},
{
"epoch": 0.8059663273798092,
"grad_norm": 59.23908996582031,
"learning_rate": 2.2090294327831494e-07,
"loss": 0.4226,
"num_input_tokens_seen": 39628096,
"step": 12590
},
{
"epoch": 0.8062864093207861,
"grad_norm": 41.97724914550781,
"learning_rate": 2.202029478929488e-07,
"loss": 0.2881,
"num_input_tokens_seen": 39642560,
"step": 12595
},
{
"epoch": 0.806606491261763,
"grad_norm": 18.373050689697266,
"learning_rate": 2.195039260933581e-07,
"loss": 0.2958,
"num_input_tokens_seen": 39658112,
"step": 12600
},
{
"epoch": 0.8069265732027399,
"grad_norm": 31.841543197631836,
"learning_rate": 2.1880587875227973e-07,
"loss": 0.2724,
"num_input_tokens_seen": 39674112,
"step": 12605
},
{
"epoch": 0.8072466551437167,
"grad_norm": 28.09347915649414,
"learning_rate": 2.18108806741234e-07,
"loss": 0.3308,
"num_input_tokens_seen": 39690432,
"step": 12610
},
{
"epoch": 0.8075667370846937,
"grad_norm": 23.086257934570312,
"learning_rate": 2.1741271093052315e-07,
"loss": 0.3547,
"num_input_tokens_seen": 39705792,
"step": 12615
},
{
"epoch": 0.8078868190256706,
"grad_norm": 46.950721740722656,
"learning_rate": 2.167175921892318e-07,
"loss": 0.4658,
"num_input_tokens_seen": 39722048,
"step": 12620
},
{
"epoch": 0.8082069009666475,
"grad_norm": 28.440935134887695,
"learning_rate": 2.1602345138522314e-07,
"loss": 0.4219,
"num_input_tokens_seen": 39738304,
"step": 12625
},
{
"epoch": 0.8085269829076244,
"grad_norm": 31.971548080444336,
"learning_rate": 2.1533028938514008e-07,
"loss": 0.3551,
"num_input_tokens_seen": 39753728,
"step": 12630
},
{
"epoch": 0.8088470648486012,
"grad_norm": 43.10588836669922,
"learning_rate": 2.1463810705440433e-07,
"loss": 0.3441,
"num_input_tokens_seen": 39769600,
"step": 12635
},
{
"epoch": 0.8091671467895781,
"grad_norm": 35.64780044555664,
"learning_rate": 2.139469052572127e-07,
"loss": 0.3571,
"num_input_tokens_seen": 39784000,
"step": 12640
},
{
"epoch": 0.809487228730555,
"grad_norm": 46.72938537597656,
"learning_rate": 2.1325668485653891e-07,
"loss": 0.3587,
"num_input_tokens_seen": 39800320,
"step": 12645
},
{
"epoch": 0.8098073106715319,
"grad_norm": 29.774227142333984,
"learning_rate": 2.1256744671413173e-07,
"loss": 0.4617,
"num_input_tokens_seen": 39815360,
"step": 12650
},
{
"epoch": 0.8101273926125088,
"grad_norm": 32.227561950683594,
"learning_rate": 2.1187919169051316e-07,
"loss": 0.3819,
"num_input_tokens_seen": 39829952,
"step": 12655
},
{
"epoch": 0.8104474745534856,
"grad_norm": 31.437002182006836,
"learning_rate": 2.111919206449767e-07,
"loss": 0.3505,
"num_input_tokens_seen": 39845376,
"step": 12660
},
{
"epoch": 0.8107675564944626,
"grad_norm": 27.419315338134766,
"learning_rate": 2.1050563443558922e-07,
"loss": 0.4955,
"num_input_tokens_seen": 39861696,
"step": 12665
},
{
"epoch": 0.8110876384354395,
"grad_norm": 41.58053970336914,
"learning_rate": 2.0982033391918697e-07,
"loss": 0.3,
"num_input_tokens_seen": 39877440,
"step": 12670
},
{
"epoch": 0.8114077203764164,
"grad_norm": 58.020626068115234,
"learning_rate": 2.0913601995137543e-07,
"loss": 0.3292,
"num_input_tokens_seen": 39893760,
"step": 12675
},
{
"epoch": 0.8117278023173933,
"grad_norm": 15.215536117553711,
"learning_rate": 2.084526933865287e-07,
"loss": 0.2889,
"num_input_tokens_seen": 39909568,
"step": 12680
},
{
"epoch": 0.8120478842583702,
"grad_norm": 30.34135627746582,
"learning_rate": 2.0777035507778817e-07,
"loss": 0.4667,
"num_input_tokens_seen": 39923648,
"step": 12685
},
{
"epoch": 0.812367966199347,
"grad_norm": 18.02565574645996,
"learning_rate": 2.0708900587706135e-07,
"loss": 0.4268,
"num_input_tokens_seen": 39939008,
"step": 12690
},
{
"epoch": 0.8126880481403239,
"grad_norm": 44.681556701660156,
"learning_rate": 2.0640864663502e-07,
"loss": 0.3356,
"num_input_tokens_seen": 39955072,
"step": 12695
},
{
"epoch": 0.8130081300813008,
"grad_norm": 30.91301727294922,
"learning_rate": 2.057292782011013e-07,
"loss": 0.4563,
"num_input_tokens_seen": 39970880,
"step": 12700
},
{
"epoch": 0.8133282120222777,
"grad_norm": 25.404428482055664,
"learning_rate": 2.0505090142350468e-07,
"loss": 0.3045,
"num_input_tokens_seen": 39986240,
"step": 12705
},
{
"epoch": 0.8136482939632546,
"grad_norm": 28.93308448791504,
"learning_rate": 2.0437351714919127e-07,
"loss": 0.3426,
"num_input_tokens_seen": 40001856,
"step": 12710
},
{
"epoch": 0.8139683759042314,
"grad_norm": 20.265243530273438,
"learning_rate": 2.0369712622388336e-07,
"loss": 0.3084,
"num_input_tokens_seen": 40018112,
"step": 12715
},
{
"epoch": 0.8142884578452084,
"grad_norm": 41.139366149902344,
"learning_rate": 2.0302172949206298e-07,
"loss": 0.2869,
"num_input_tokens_seen": 40033664,
"step": 12720
},
{
"epoch": 0.8146085397861853,
"grad_norm": 60.38472366333008,
"learning_rate": 2.0234732779697094e-07,
"loss": 0.3069,
"num_input_tokens_seen": 40048768,
"step": 12725
},
{
"epoch": 0.8149286217271622,
"grad_norm": 42.53269577026367,
"learning_rate": 2.016739219806056e-07,
"loss": 0.3267,
"num_input_tokens_seen": 40063232,
"step": 12730
},
{
"epoch": 0.8152487036681391,
"grad_norm": 20.74918556213379,
"learning_rate": 2.0100151288372215e-07,
"loss": 0.3839,
"num_input_tokens_seen": 40079296,
"step": 12735
},
{
"epoch": 0.8155687856091159,
"grad_norm": 59.69536209106445,
"learning_rate": 2.0033010134583084e-07,
"loss": 0.5609,
"num_input_tokens_seen": 40094976,
"step": 12740
},
{
"epoch": 0.8158888675500928,
"grad_norm": 32.35287857055664,
"learning_rate": 1.9965968820519763e-07,
"loss": 0.314,
"num_input_tokens_seen": 40110464,
"step": 12745
},
{
"epoch": 0.8162089494910697,
"grad_norm": 47.17245864868164,
"learning_rate": 1.9899027429884042e-07,
"loss": 0.4042,
"num_input_tokens_seen": 40125568,
"step": 12750
},
{
"epoch": 0.8165290314320466,
"grad_norm": 38.48648452758789,
"learning_rate": 1.983218604625305e-07,
"loss": 0.4302,
"num_input_tokens_seen": 40141440,
"step": 12755
},
{
"epoch": 0.8168491133730235,
"grad_norm": 16.031692504882812,
"learning_rate": 1.9765444753079096e-07,
"loss": 0.3288,
"num_input_tokens_seen": 40156416,
"step": 12760
},
{
"epoch": 0.8171691953140003,
"grad_norm": 32.27566909790039,
"learning_rate": 1.9698803633689408e-07,
"loss": 0.3985,
"num_input_tokens_seen": 40172928,
"step": 12765
},
{
"epoch": 0.8174892772549772,
"grad_norm": 22.818599700927734,
"learning_rate": 1.963226277128619e-07,
"loss": 0.2404,
"num_input_tokens_seen": 40188096,
"step": 12770
},
{
"epoch": 0.8178093591959542,
"grad_norm": 30.646739959716797,
"learning_rate": 1.956582224894655e-07,
"loss": 0.3559,
"num_input_tokens_seen": 40204032,
"step": 12775
},
{
"epoch": 0.8181294411369311,
"grad_norm": 46.02298355102539,
"learning_rate": 1.949948214962227e-07,
"loss": 0.369,
"num_input_tokens_seen": 40218944,
"step": 12780
},
{
"epoch": 0.818449523077908,
"grad_norm": 50.748172760009766,
"learning_rate": 1.943324255613964e-07,
"loss": 0.358,
"num_input_tokens_seen": 40235456,
"step": 12785
},
{
"epoch": 0.8187696050188848,
"grad_norm": 25.575429916381836,
"learning_rate": 1.936710355119967e-07,
"loss": 0.4564,
"num_input_tokens_seen": 40250176,
"step": 12790
},
{
"epoch": 0.8190896869598617,
"grad_norm": 34.35418701171875,
"learning_rate": 1.9301065217377655e-07,
"loss": 0.3312,
"num_input_tokens_seen": 40265472,
"step": 12795
},
{
"epoch": 0.8194097689008386,
"grad_norm": 24.442747116088867,
"learning_rate": 1.9235127637123249e-07,
"loss": 0.3995,
"num_input_tokens_seen": 40281728,
"step": 12800
},
{
"epoch": 0.8197298508418155,
"grad_norm": 51.47005081176758,
"learning_rate": 1.9169290892760225e-07,
"loss": 0.3221,
"num_input_tokens_seen": 40296768,
"step": 12805
},
{
"epoch": 0.8200499327827924,
"grad_norm": 47.755516052246094,
"learning_rate": 1.91035550664866e-07,
"loss": 0.3295,
"num_input_tokens_seen": 40311488,
"step": 12810
},
{
"epoch": 0.8203700147237692,
"grad_norm": 50.54772186279297,
"learning_rate": 1.903792024037433e-07,
"loss": 0.3238,
"num_input_tokens_seen": 40327232,
"step": 12815
},
{
"epoch": 0.8206900966647461,
"grad_norm": 33.594635009765625,
"learning_rate": 1.8972386496369185e-07,
"loss": 0.4338,
"num_input_tokens_seen": 40344064,
"step": 12820
},
{
"epoch": 0.8210101786057231,
"grad_norm": 40.6557502746582,
"learning_rate": 1.89069539162909e-07,
"loss": 0.3917,
"num_input_tokens_seen": 40359040,
"step": 12825
},
{
"epoch": 0.8213302605467,
"grad_norm": 20.805389404296875,
"learning_rate": 1.8841622581832783e-07,
"loss": 0.4034,
"num_input_tokens_seen": 40376384,
"step": 12830
},
{
"epoch": 0.8216503424876769,
"grad_norm": 27.590456008911133,
"learning_rate": 1.8776392574561783e-07,
"loss": 0.5928,
"num_input_tokens_seen": 40391936,
"step": 12835
},
{
"epoch": 0.8219704244286538,
"grad_norm": 17.456087112426758,
"learning_rate": 1.8711263975918322e-07,
"loss": 0.4702,
"num_input_tokens_seen": 40408832,
"step": 12840
},
{
"epoch": 0.8222905063696306,
"grad_norm": 37.28561019897461,
"learning_rate": 1.8646236867216215e-07,
"loss": 0.4516,
"num_input_tokens_seen": 40425280,
"step": 12845
},
{
"epoch": 0.8226105883106075,
"grad_norm": 36.03346633911133,
"learning_rate": 1.8581311329642591e-07,
"loss": 0.3451,
"num_input_tokens_seen": 40440832,
"step": 12850
},
{
"epoch": 0.8229306702515844,
"grad_norm": 30.598731994628906,
"learning_rate": 1.8516487444257723e-07,
"loss": 0.2711,
"num_input_tokens_seen": 40458624,
"step": 12855
},
{
"epoch": 0.8232507521925613,
"grad_norm": 34.196533203125,
"learning_rate": 1.8451765291995004e-07,
"loss": 0.4068,
"num_input_tokens_seen": 40474688,
"step": 12860
},
{
"epoch": 0.8235708341335382,
"grad_norm": 36.64088439941406,
"learning_rate": 1.8387144953660806e-07,
"loss": 0.3591,
"num_input_tokens_seen": 40490816,
"step": 12865
},
{
"epoch": 0.823890916074515,
"grad_norm": 39.154510498046875,
"learning_rate": 1.832262650993437e-07,
"loss": 0.4492,
"num_input_tokens_seen": 40506112,
"step": 12870
},
{
"epoch": 0.8242109980154919,
"grad_norm": 20.44598388671875,
"learning_rate": 1.825821004136774e-07,
"loss": 0.2973,
"num_input_tokens_seen": 40521344,
"step": 12875
},
{
"epoch": 0.8245310799564689,
"grad_norm": 30.141361236572266,
"learning_rate": 1.819389562838559e-07,
"loss": 0.2799,
"num_input_tokens_seen": 40537024,
"step": 12880
},
{
"epoch": 0.8248511618974458,
"grad_norm": 45.2744026184082,
"learning_rate": 1.8129683351285319e-07,
"loss": 0.3058,
"num_input_tokens_seen": 40552640,
"step": 12885
},
{
"epoch": 0.8251712438384227,
"grad_norm": 35.627498626708984,
"learning_rate": 1.8065573290236626e-07,
"loss": 0.3209,
"num_input_tokens_seen": 40568000,
"step": 12890
},
{
"epoch": 0.8254913257793995,
"grad_norm": 24.41234588623047,
"learning_rate": 1.8001565525281682e-07,
"loss": 0.3806,
"num_input_tokens_seen": 40584960,
"step": 12895
},
{
"epoch": 0.8258114077203764,
"grad_norm": 26.482351303100586,
"learning_rate": 1.793766013633493e-07,
"loss": 0.3707,
"num_input_tokens_seen": 40600704,
"step": 12900
},
{
"epoch": 0.8261314896613533,
"grad_norm": 29.43145751953125,
"learning_rate": 1.7873857203183074e-07,
"loss": 0.3865,
"num_input_tokens_seen": 40615872,
"step": 12905
},
{
"epoch": 0.8264515716023302,
"grad_norm": 53.48032760620117,
"learning_rate": 1.7810156805484733e-07,
"loss": 0.4632,
"num_input_tokens_seen": 40632640,
"step": 12910
},
{
"epoch": 0.8267716535433071,
"grad_norm": 24.980363845825195,
"learning_rate": 1.7746559022770612e-07,
"loss": 0.3007,
"num_input_tokens_seen": 40648064,
"step": 12915
},
{
"epoch": 0.8270917354842839,
"grad_norm": 30.67084503173828,
"learning_rate": 1.7683063934443342e-07,
"loss": 0.3833,
"num_input_tokens_seen": 40664704,
"step": 12920
},
{
"epoch": 0.8274118174252608,
"grad_norm": 40.46763610839844,
"learning_rate": 1.7619671619777277e-07,
"loss": 0.4074,
"num_input_tokens_seen": 40681024,
"step": 12925
},
{
"epoch": 0.8277318993662378,
"grad_norm": 31.0588321685791,
"learning_rate": 1.7556382157918404e-07,
"loss": 0.4121,
"num_input_tokens_seen": 40695936,
"step": 12930
},
{
"epoch": 0.8280519813072147,
"grad_norm": 27.82343292236328,
"learning_rate": 1.7493195627884427e-07,
"loss": 0.3177,
"num_input_tokens_seen": 40713472,
"step": 12935
},
{
"epoch": 0.8283720632481916,
"grad_norm": 41.197757720947266,
"learning_rate": 1.7430112108564465e-07,
"loss": 0.3141,
"num_input_tokens_seen": 40729344,
"step": 12940
},
{
"epoch": 0.8286921451891684,
"grad_norm": 38.33913040161133,
"learning_rate": 1.736713167871896e-07,
"loss": 0.3983,
"num_input_tokens_seen": 40745856,
"step": 12945
},
{
"epoch": 0.8290122271301453,
"grad_norm": 19.74918556213379,
"learning_rate": 1.7304254416979803e-07,
"loss": 0.2973,
"num_input_tokens_seen": 40761920,
"step": 12950
},
{
"epoch": 0.8293323090711222,
"grad_norm": 19.813852310180664,
"learning_rate": 1.7241480401849963e-07,
"loss": 0.263,
"num_input_tokens_seen": 40776960,
"step": 12955
},
{
"epoch": 0.8296523910120991,
"grad_norm": 21.785139083862305,
"learning_rate": 1.7178809711703524e-07,
"loss": 0.3413,
"num_input_tokens_seen": 40792192,
"step": 12960
},
{
"epoch": 0.829972472953076,
"grad_norm": 27.10121726989746,
"learning_rate": 1.7116242424785599e-07,
"loss": 0.36,
"num_input_tokens_seen": 40808256,
"step": 12965
},
{
"epoch": 0.8302925548940528,
"grad_norm": 42.24668884277344,
"learning_rate": 1.7053778619212166e-07,
"loss": 0.4272,
"num_input_tokens_seen": 40823424,
"step": 12970
},
{
"epoch": 0.8306126368350297,
"grad_norm": 39.586917877197266,
"learning_rate": 1.6991418372970022e-07,
"loss": 0.4132,
"num_input_tokens_seen": 40840960,
"step": 12975
},
{
"epoch": 0.8309327187760066,
"grad_norm": 28.66804313659668,
"learning_rate": 1.6929161763916666e-07,
"loss": 0.3849,
"num_input_tokens_seen": 40857536,
"step": 12980
},
{
"epoch": 0.8312528007169836,
"grad_norm": 26.58046531677246,
"learning_rate": 1.686700886978021e-07,
"loss": 0.3582,
"num_input_tokens_seen": 40874240,
"step": 12985
},
{
"epoch": 0.8315728826579605,
"grad_norm": 36.41627883911133,
"learning_rate": 1.6804959768159266e-07,
"loss": 0.3579,
"num_input_tokens_seen": 40888960,
"step": 12990
},
{
"epoch": 0.8318929645989374,
"grad_norm": 53.56745529174805,
"learning_rate": 1.674301453652287e-07,
"loss": 0.5373,
"num_input_tokens_seen": 40904512,
"step": 12995
},
{
"epoch": 0.8322130465399142,
"grad_norm": 42.032283782958984,
"learning_rate": 1.6681173252210378e-07,
"loss": 0.2969,
"num_input_tokens_seen": 40921856,
"step": 13000
},
{
"epoch": 0.8325331284808911,
"grad_norm": 49.92417526245117,
"learning_rate": 1.6619435992431342e-07,
"loss": 0.3801,
"num_input_tokens_seen": 40938752,
"step": 13005
},
{
"epoch": 0.832853210421868,
"grad_norm": 48.38226318359375,
"learning_rate": 1.6557802834265466e-07,
"loss": 0.3026,
"num_input_tokens_seen": 40954048,
"step": 13010
},
{
"epoch": 0.8331732923628449,
"grad_norm": 26.939504623413086,
"learning_rate": 1.649627385466248e-07,
"loss": 0.3634,
"num_input_tokens_seen": 40972672,
"step": 13015
},
{
"epoch": 0.8334933743038218,
"grad_norm": 20.158533096313477,
"learning_rate": 1.643484913044202e-07,
"loss": 0.2467,
"num_input_tokens_seen": 40987648,
"step": 13020
},
{
"epoch": 0.8338134562447986,
"grad_norm": 14.96458911895752,
"learning_rate": 1.6373528738293564e-07,
"loss": 0.3171,
"num_input_tokens_seen": 41003328,
"step": 13025
},
{
"epoch": 0.8341335381857755,
"grad_norm": 33.328121185302734,
"learning_rate": 1.6312312754776404e-07,
"loss": 0.2939,
"num_input_tokens_seen": 41018624,
"step": 13030
},
{
"epoch": 0.8344536201267524,
"grad_norm": 18.032512664794922,
"learning_rate": 1.6251201256319357e-07,
"loss": 0.3318,
"num_input_tokens_seen": 41034624,
"step": 13035
},
{
"epoch": 0.8347737020677294,
"grad_norm": 27.04534339904785,
"learning_rate": 1.619019431922083e-07,
"loss": 0.3699,
"num_input_tokens_seen": 41049664,
"step": 13040
},
{
"epoch": 0.8350937840087063,
"grad_norm": 34.27926254272461,
"learning_rate": 1.6129292019648754e-07,
"loss": 0.3494,
"num_input_tokens_seen": 41066368,
"step": 13045
},
{
"epoch": 0.8354138659496831,
"grad_norm": 26.79369354248047,
"learning_rate": 1.606849443364038e-07,
"loss": 0.2975,
"num_input_tokens_seen": 41082048,
"step": 13050
},
{
"epoch": 0.83573394789066,
"grad_norm": 17.270774841308594,
"learning_rate": 1.6007801637102104e-07,
"loss": 0.3425,
"num_input_tokens_seen": 41098048,
"step": 13055
},
{
"epoch": 0.8360540298316369,
"grad_norm": 20.079627990722656,
"learning_rate": 1.594721370580969e-07,
"loss": 0.3858,
"num_input_tokens_seen": 41112768,
"step": 13060
},
{
"epoch": 0.8363741117726138,
"grad_norm": 21.468425750732422,
"learning_rate": 1.588673071540788e-07,
"loss": 0.4241,
"num_input_tokens_seen": 41127488,
"step": 13065
},
{
"epoch": 0.8366941937135907,
"grad_norm": 35.24139404296875,
"learning_rate": 1.5826352741410332e-07,
"loss": 0.3195,
"num_input_tokens_seen": 41142272,
"step": 13070
},
{
"epoch": 0.8370142756545675,
"grad_norm": 55.96588134765625,
"learning_rate": 1.576607985919971e-07,
"loss": 0.2947,
"num_input_tokens_seen": 41157952,
"step": 13075
},
{
"epoch": 0.8373343575955444,
"grad_norm": 38.237064361572266,
"learning_rate": 1.57059121440274e-07,
"loss": 0.3547,
"num_input_tokens_seen": 41172992,
"step": 13080
},
{
"epoch": 0.8376544395365213,
"grad_norm": 53.96684646606445,
"learning_rate": 1.56458496710135e-07,
"loss": 0.3823,
"num_input_tokens_seen": 41187776,
"step": 13085
},
{
"epoch": 0.8379745214774983,
"grad_norm": 36.738887786865234,
"learning_rate": 1.5585892515146716e-07,
"loss": 0.3403,
"num_input_tokens_seen": 41204416,
"step": 13090
},
{
"epoch": 0.8382946034184752,
"grad_norm": 23.33167839050293,
"learning_rate": 1.5526040751284253e-07,
"loss": 0.4214,
"num_input_tokens_seen": 41220032,
"step": 13095
},
{
"epoch": 0.838614685359452,
"grad_norm": 37.20791244506836,
"learning_rate": 1.546629445415174e-07,
"loss": 0.3168,
"num_input_tokens_seen": 41235776,
"step": 13100
},
{
"epoch": 0.8389347673004289,
"grad_norm": 41.227115631103516,
"learning_rate": 1.5406653698343141e-07,
"loss": 0.3724,
"num_input_tokens_seen": 41252160,
"step": 13105
},
{
"epoch": 0.8392548492414058,
"grad_norm": 35.0400276184082,
"learning_rate": 1.5347118558320637e-07,
"loss": 0.3591,
"num_input_tokens_seen": 41269056,
"step": 13110
},
{
"epoch": 0.8395749311823827,
"grad_norm": 25.96977996826172,
"learning_rate": 1.5287689108414558e-07,
"loss": 0.3632,
"num_input_tokens_seen": 41285312,
"step": 13115
},
{
"epoch": 0.8398950131233596,
"grad_norm": 38.98981857299805,
"learning_rate": 1.5228365422823242e-07,
"loss": 0.3374,
"num_input_tokens_seen": 41300992,
"step": 13120
},
{
"epoch": 0.8402150950643364,
"grad_norm": 29.43157958984375,
"learning_rate": 1.5169147575613038e-07,
"loss": 0.2637,
"num_input_tokens_seen": 41317952,
"step": 13125
},
{
"epoch": 0.8405351770053133,
"grad_norm": 14.81241226196289,
"learning_rate": 1.5110035640718098e-07,
"loss": 0.297,
"num_input_tokens_seen": 41333440,
"step": 13130
},
{
"epoch": 0.8408552589462902,
"grad_norm": 32.161842346191406,
"learning_rate": 1.5051029691940387e-07,
"loss": 0.3665,
"num_input_tokens_seen": 41349312,
"step": 13135
},
{
"epoch": 0.8411753408872671,
"grad_norm": 32.124176025390625,
"learning_rate": 1.4992129802949515e-07,
"loss": 0.356,
"num_input_tokens_seen": 41364288,
"step": 13140
},
{
"epoch": 0.8414954228282441,
"grad_norm": 24.080873489379883,
"learning_rate": 1.4933336047282696e-07,
"loss": 0.2884,
"num_input_tokens_seen": 41379904,
"step": 13145
},
{
"epoch": 0.841815504769221,
"grad_norm": 34.71171951293945,
"learning_rate": 1.4874648498344579e-07,
"loss": 0.3481,
"num_input_tokens_seen": 41394432,
"step": 13150
},
{
"epoch": 0.8421355867101978,
"grad_norm": 57.97336196899414,
"learning_rate": 1.4816067229407348e-07,
"loss": 0.3485,
"num_input_tokens_seen": 41409984,
"step": 13155
},
{
"epoch": 0.8424556686511747,
"grad_norm": 19.453880310058594,
"learning_rate": 1.4757592313610322e-07,
"loss": 0.3051,
"num_input_tokens_seen": 41425984,
"step": 13160
},
{
"epoch": 0.8427757505921516,
"grad_norm": 19.217065811157227,
"learning_rate": 1.4699223823960128e-07,
"loss": 0.3312,
"num_input_tokens_seen": 41441920,
"step": 13165
},
{
"epoch": 0.8430958325331285,
"grad_norm": 39.03798294067383,
"learning_rate": 1.4640961833330579e-07,
"loss": 0.3389,
"num_input_tokens_seen": 41457664,
"step": 13170
},
{
"epoch": 0.8434159144741054,
"grad_norm": 18.345247268676758,
"learning_rate": 1.4582806414462378e-07,
"loss": 0.2518,
"num_input_tokens_seen": 41472832,
"step": 13175
},
{
"epoch": 0.8437359964150822,
"grad_norm": 24.834247589111328,
"learning_rate": 1.4524757639963258e-07,
"loss": 0.33,
"num_input_tokens_seen": 41490368,
"step": 13180
},
{
"epoch": 0.8440560783560591,
"grad_norm": 46.54368209838867,
"learning_rate": 1.4466815582307845e-07,
"loss": 0.4397,
"num_input_tokens_seen": 41506624,
"step": 13185
},
{
"epoch": 0.844376160297036,
"grad_norm": 10.444628715515137,
"learning_rate": 1.440898031383746e-07,
"loss": 0.251,
"num_input_tokens_seen": 41523264,
"step": 13190
},
{
"epoch": 0.844696242238013,
"grad_norm": 41.213134765625,
"learning_rate": 1.4351251906760064e-07,
"loss": 0.3803,
"num_input_tokens_seen": 41538944,
"step": 13195
},
{
"epoch": 0.8450163241789899,
"grad_norm": 39.923397064208984,
"learning_rate": 1.4293630433150317e-07,
"loss": 0.3939,
"num_input_tokens_seen": 41554880,
"step": 13200
},
{
"epoch": 0.8453364061199667,
"grad_norm": 47.86785125732422,
"learning_rate": 1.423611596494927e-07,
"loss": 0.4367,
"num_input_tokens_seen": 41569280,
"step": 13205
},
{
"epoch": 0.8456564880609436,
"grad_norm": 20.079484939575195,
"learning_rate": 1.4178708573964438e-07,
"loss": 0.3546,
"num_input_tokens_seen": 41584576,
"step": 13210
},
{
"epoch": 0.8459765700019205,
"grad_norm": 17.954130172729492,
"learning_rate": 1.4121408331869566e-07,
"loss": 0.3589,
"num_input_tokens_seen": 41600000,
"step": 13215
},
{
"epoch": 0.8462966519428974,
"grad_norm": 38.43409729003906,
"learning_rate": 1.406421531020474e-07,
"loss": 0.3603,
"num_input_tokens_seen": 41615040,
"step": 13220
},
{
"epoch": 0.8466167338838743,
"grad_norm": 87.97344207763672,
"learning_rate": 1.4007129580376097e-07,
"loss": 0.3551,
"num_input_tokens_seen": 41630208,
"step": 13225
},
{
"epoch": 0.8469368158248511,
"grad_norm": 38.874149322509766,
"learning_rate": 1.3950151213655847e-07,
"loss": 0.3672,
"num_input_tokens_seen": 41645440,
"step": 13230
},
{
"epoch": 0.847256897765828,
"grad_norm": 45.345767974853516,
"learning_rate": 1.389328028118214e-07,
"loss": 0.3281,
"num_input_tokens_seen": 41661184,
"step": 13235
},
{
"epoch": 0.8475769797068049,
"grad_norm": 31.71895980834961,
"learning_rate": 1.3836516853959e-07,
"loss": 0.358,
"num_input_tokens_seen": 41676224,
"step": 13240
},
{
"epoch": 0.8478970616477818,
"grad_norm": 20.565414428710938,
"learning_rate": 1.3779861002856242e-07,
"loss": 0.308,
"num_input_tokens_seen": 41690816,
"step": 13245
},
{
"epoch": 0.8482171435887588,
"grad_norm": 18.975496292114258,
"learning_rate": 1.3723312798609366e-07,
"loss": 0.3357,
"num_input_tokens_seen": 41706688,
"step": 13250
},
{
"epoch": 0.8485372255297357,
"grad_norm": 27.02278709411621,
"learning_rate": 1.3666872311819455e-07,
"loss": 0.349,
"num_input_tokens_seen": 41721920,
"step": 13255
},
{
"epoch": 0.8488573074707125,
"grad_norm": 22.8565731048584,
"learning_rate": 1.361053961295312e-07,
"loss": 0.285,
"num_input_tokens_seen": 41738112,
"step": 13260
},
{
"epoch": 0.8491773894116894,
"grad_norm": 48.835845947265625,
"learning_rate": 1.3554314772342412e-07,
"loss": 0.3445,
"num_input_tokens_seen": 41753792,
"step": 13265
},
{
"epoch": 0.8494974713526663,
"grad_norm": 22.680404663085938,
"learning_rate": 1.349819786018469e-07,
"loss": 0.3294,
"num_input_tokens_seen": 41771328,
"step": 13270
},
{
"epoch": 0.8498175532936432,
"grad_norm": 38.834434509277344,
"learning_rate": 1.3442188946542566e-07,
"loss": 0.3734,
"num_input_tokens_seen": 41787712,
"step": 13275
},
{
"epoch": 0.85013763523462,
"grad_norm": 26.3136043548584,
"learning_rate": 1.338628810134388e-07,
"loss": 0.3099,
"num_input_tokens_seen": 41803072,
"step": 13280
},
{
"epoch": 0.8504577171755969,
"grad_norm": 39.59318923950195,
"learning_rate": 1.3330495394381435e-07,
"loss": 0.3624,
"num_input_tokens_seen": 41818688,
"step": 13285
},
{
"epoch": 0.8507777991165738,
"grad_norm": 17.34198760986328,
"learning_rate": 1.3274810895313083e-07,
"loss": 0.2868,
"num_input_tokens_seen": 41833792,
"step": 13290
},
{
"epoch": 0.8510338646693554,
"eval_loss": 0.35909759998321533,
"eval_runtime": 50.7775,
"eval_samples_per_second": 273.468,
"eval_steps_per_second": 34.188,
"num_input_tokens_seen": 41847872,
"step": 13294
},
{
"epoch": 0.8510978810575507,
"grad_norm": 26.860233306884766,
"learning_rate": 1.321923467366164e-07,
"loss": 0.3846,
"num_input_tokens_seen": 41850880,
"step": 13295
},
{
"epoch": 0.8514179629985277,
"grad_norm": 15.31477165222168,
"learning_rate": 1.3163766798814603e-07,
"loss": 0.183,
"num_input_tokens_seen": 41866560,
"step": 13300
},
{
"epoch": 0.8517380449395046,
"grad_norm": 53.74724197387695,
"learning_rate": 1.3108407340024264e-07,
"loss": 0.3041,
"num_input_tokens_seen": 41882240,
"step": 13305
},
{
"epoch": 0.8520581268804814,
"grad_norm": 39.340850830078125,
"learning_rate": 1.3053156366407613e-07,
"loss": 0.3421,
"num_input_tokens_seen": 41898880,
"step": 13310
},
{
"epoch": 0.8523782088214583,
"grad_norm": 19.917110443115234,
"learning_rate": 1.2998013946946119e-07,
"loss": 0.2428,
"num_input_tokens_seen": 41915968,
"step": 13315
},
{
"epoch": 0.8526982907624352,
"grad_norm": 37.523658752441406,
"learning_rate": 1.2942980150485706e-07,
"loss": 0.3499,
"num_input_tokens_seen": 41930816,
"step": 13320
},
{
"epoch": 0.8530183727034121,
"grad_norm": 51.89887237548828,
"learning_rate": 1.2888055045736723e-07,
"loss": 0.3192,
"num_input_tokens_seen": 41947200,
"step": 13325
},
{
"epoch": 0.853338454644389,
"grad_norm": 22.378541946411133,
"learning_rate": 1.283323870127384e-07,
"loss": 0.301,
"num_input_tokens_seen": 41962240,
"step": 13330
},
{
"epoch": 0.8536585365853658,
"grad_norm": 24.78726577758789,
"learning_rate": 1.2778531185535911e-07,
"loss": 0.3015,
"num_input_tokens_seen": 41978752,
"step": 13335
},
{
"epoch": 0.8539786185263427,
"grad_norm": 21.3166446685791,
"learning_rate": 1.2723932566825844e-07,
"loss": 0.3288,
"num_input_tokens_seen": 41994112,
"step": 13340
},
{
"epoch": 0.8542987004673196,
"grad_norm": 16.688308715820312,
"learning_rate": 1.2669442913310723e-07,
"loss": 0.294,
"num_input_tokens_seen": 42010432,
"step": 13345
},
{
"epoch": 0.8546187824082965,
"grad_norm": 28.67268943786621,
"learning_rate": 1.2615062293021506e-07,
"loss": 0.2745,
"num_input_tokens_seen": 42025984,
"step": 13350
},
{
"epoch": 0.8549388643492735,
"grad_norm": 38.47801971435547,
"learning_rate": 1.2560790773853025e-07,
"loss": 0.3147,
"num_input_tokens_seen": 42040832,
"step": 13355
},
{
"epoch": 0.8552589462902503,
"grad_norm": 27.922290802001953,
"learning_rate": 1.2506628423563915e-07,
"loss": 0.4083,
"num_input_tokens_seen": 42057536,
"step": 13360
},
{
"epoch": 0.8555790282312272,
"grad_norm": 31.082223892211914,
"learning_rate": 1.2452575309776493e-07,
"loss": 0.2828,
"num_input_tokens_seen": 42073152,
"step": 13365
},
{
"epoch": 0.8558991101722041,
"grad_norm": 47.76811599731445,
"learning_rate": 1.2398631499976732e-07,
"loss": 0.3032,
"num_input_tokens_seen": 42088512,
"step": 13370
},
{
"epoch": 0.856219192113181,
"grad_norm": 23.319290161132812,
"learning_rate": 1.234479706151409e-07,
"loss": 0.4253,
"num_input_tokens_seen": 42103552,
"step": 13375
},
{
"epoch": 0.8565392740541579,
"grad_norm": 23.617815017700195,
"learning_rate": 1.2291072061601503e-07,
"loss": 0.3577,
"num_input_tokens_seen": 42119872,
"step": 13380
},
{
"epoch": 0.8568593559951347,
"grad_norm": 39.64377975463867,
"learning_rate": 1.2237456567315264e-07,
"loss": 0.4374,
"num_input_tokens_seen": 42136832,
"step": 13385
},
{
"epoch": 0.8571794379361116,
"grad_norm": 24.50478172302246,
"learning_rate": 1.2183950645594944e-07,
"loss": 0.3158,
"num_input_tokens_seen": 42152896,
"step": 13390
},
{
"epoch": 0.8574995198770885,
"grad_norm": 49.653892517089844,
"learning_rate": 1.2130554363243318e-07,
"loss": 0.3555,
"num_input_tokens_seen": 42168064,
"step": 13395
},
{
"epoch": 0.8578196018180654,
"grad_norm": 20.96121597290039,
"learning_rate": 1.207726778692625e-07,
"loss": 0.3738,
"num_input_tokens_seen": 42182784,
"step": 13400
},
{
"epoch": 0.8581396837590423,
"grad_norm": 23.166378021240234,
"learning_rate": 1.2024090983172718e-07,
"loss": 0.3228,
"num_input_tokens_seen": 42199744,
"step": 13405
},
{
"epoch": 0.8584597657000193,
"grad_norm": 40.654266357421875,
"learning_rate": 1.1971024018374532e-07,
"loss": 0.3631,
"num_input_tokens_seen": 42215040,
"step": 13410
},
{
"epoch": 0.8587798476409961,
"grad_norm": 35.028751373291016,
"learning_rate": 1.1918066958786432e-07,
"loss": 0.3079,
"num_input_tokens_seen": 42230144,
"step": 13415
},
{
"epoch": 0.859099929581973,
"grad_norm": 69.72550964355469,
"learning_rate": 1.1865219870525922e-07,
"loss": 0.3677,
"num_input_tokens_seen": 42246528,
"step": 13420
},
{
"epoch": 0.8594200115229499,
"grad_norm": 21.34828758239746,
"learning_rate": 1.1812482819573222e-07,
"loss": 0.4245,
"num_input_tokens_seen": 42263168,
"step": 13425
},
{
"epoch": 0.8597400934639268,
"grad_norm": 32.855438232421875,
"learning_rate": 1.1759855871771163e-07,
"loss": 0.3877,
"num_input_tokens_seen": 42278912,
"step": 13430
},
{
"epoch": 0.8600601754049036,
"grad_norm": 48.298091888427734,
"learning_rate": 1.1707339092825075e-07,
"loss": 0.387,
"num_input_tokens_seen": 42294656,
"step": 13435
},
{
"epoch": 0.8603802573458805,
"grad_norm": 45.29995346069336,
"learning_rate": 1.1654932548302842e-07,
"loss": 0.3927,
"num_input_tokens_seen": 42311552,
"step": 13440
},
{
"epoch": 0.8607003392868574,
"grad_norm": 46.05238723754883,
"learning_rate": 1.1602636303634595e-07,
"loss": 0.365,
"num_input_tokens_seen": 42327552,
"step": 13445
},
{
"epoch": 0.8610204212278343,
"grad_norm": 20.932029724121094,
"learning_rate": 1.1550450424112801e-07,
"loss": 0.3526,
"num_input_tokens_seen": 42343360,
"step": 13450
},
{
"epoch": 0.8613405031688112,
"grad_norm": 23.107404708862305,
"learning_rate": 1.1498374974892178e-07,
"loss": 0.3455,
"num_input_tokens_seen": 42360064,
"step": 13455
},
{
"epoch": 0.8616605851097882,
"grad_norm": 23.93453025817871,
"learning_rate": 1.144641002098955e-07,
"loss": 0.4371,
"num_input_tokens_seen": 42374976,
"step": 13460
},
{
"epoch": 0.861980667050765,
"grad_norm": 45.36137771606445,
"learning_rate": 1.1394555627283697e-07,
"loss": 0.3502,
"num_input_tokens_seen": 42391616,
"step": 13465
},
{
"epoch": 0.8623007489917419,
"grad_norm": 59.06273651123047,
"learning_rate": 1.134281185851551e-07,
"loss": 0.3075,
"num_input_tokens_seen": 42406528,
"step": 13470
},
{
"epoch": 0.8626208309327188,
"grad_norm": 28.857786178588867,
"learning_rate": 1.1291178779287691e-07,
"loss": 0.2948,
"num_input_tokens_seen": 42424320,
"step": 13475
},
{
"epoch": 0.8629409128736957,
"grad_norm": 43.43189239501953,
"learning_rate": 1.1239656454064683e-07,
"loss": 0.3616,
"num_input_tokens_seen": 42440960,
"step": 13480
},
{
"epoch": 0.8632609948146726,
"grad_norm": 17.330026626586914,
"learning_rate": 1.1188244947172776e-07,
"loss": 0.2464,
"num_input_tokens_seen": 42456448,
"step": 13485
},
{
"epoch": 0.8635810767556494,
"grad_norm": 20.37238311767578,
"learning_rate": 1.1136944322799812e-07,
"loss": 0.3201,
"num_input_tokens_seen": 42472448,
"step": 13490
},
{
"epoch": 0.8639011586966263,
"grad_norm": 52.59025192260742,
"learning_rate": 1.1085754644995227e-07,
"loss": 0.3177,
"num_input_tokens_seen": 42487808,
"step": 13495
},
{
"epoch": 0.8642212406376032,
"grad_norm": 36.363216400146484,
"learning_rate": 1.1034675977669938e-07,
"loss": 0.3577,
"num_input_tokens_seen": 42503744,
"step": 13500
},
{
"epoch": 0.8645413225785801,
"grad_norm": 52.50566482543945,
"learning_rate": 1.0983708384596258e-07,
"loss": 0.6111,
"num_input_tokens_seen": 42520768,
"step": 13505
},
{
"epoch": 0.864861404519557,
"grad_norm": 19.887405395507812,
"learning_rate": 1.0932851929407827e-07,
"loss": 0.3703,
"num_input_tokens_seen": 42537408,
"step": 13510
},
{
"epoch": 0.8651814864605339,
"grad_norm": 47.518165588378906,
"learning_rate": 1.0882106675599534e-07,
"loss": 0.3583,
"num_input_tokens_seen": 42553728,
"step": 13515
},
{
"epoch": 0.8655015684015108,
"grad_norm": 14.914018630981445,
"learning_rate": 1.0831472686527409e-07,
"loss": 0.3226,
"num_input_tokens_seen": 42568896,
"step": 13520
},
{
"epoch": 0.8658216503424877,
"grad_norm": 14.661453247070312,
"learning_rate": 1.0780950025408586e-07,
"loss": 0.2985,
"num_input_tokens_seen": 42584000,
"step": 13525
},
{
"epoch": 0.8661417322834646,
"grad_norm": 65.84357452392578,
"learning_rate": 1.0730538755321217e-07,
"loss": 0.3884,
"num_input_tokens_seen": 42600192,
"step": 13530
},
{
"epoch": 0.8664618142244415,
"grad_norm": 21.830345153808594,
"learning_rate": 1.0680238939204334e-07,
"loss": 0.2997,
"num_input_tokens_seen": 42614656,
"step": 13535
},
{
"epoch": 0.8667818961654183,
"grad_norm": 44.84056854248047,
"learning_rate": 1.0630050639857879e-07,
"loss": 0.402,
"num_input_tokens_seen": 42629504,
"step": 13540
},
{
"epoch": 0.8671019781063952,
"grad_norm": 24.932231903076172,
"learning_rate": 1.0579973919942508e-07,
"loss": 0.3165,
"num_input_tokens_seen": 42644224,
"step": 13545
},
{
"epoch": 0.8674220600473721,
"grad_norm": 28.818056106567383,
"learning_rate": 1.0530008841979621e-07,
"loss": 0.2452,
"num_input_tokens_seen": 42659584,
"step": 13550
},
{
"epoch": 0.867742141988349,
"grad_norm": 36.8608512878418,
"learning_rate": 1.048015546835117e-07,
"loss": 0.272,
"num_input_tokens_seen": 42675776,
"step": 13555
},
{
"epoch": 0.8680622239293259,
"grad_norm": 23.82661247253418,
"learning_rate": 1.0430413861299691e-07,
"loss": 0.388,
"num_input_tokens_seen": 42693184,
"step": 13560
},
{
"epoch": 0.8683823058703029,
"grad_norm": 46.35121536254883,
"learning_rate": 1.0380784082928196e-07,
"loss": 0.4564,
"num_input_tokens_seen": 42710784,
"step": 13565
},
{
"epoch": 0.8687023878112797,
"grad_norm": 37.92681884765625,
"learning_rate": 1.0331266195200006e-07,
"loss": 0.3905,
"num_input_tokens_seen": 42727040,
"step": 13570
},
{
"epoch": 0.8690224697522566,
"grad_norm": 20.17000389099121,
"learning_rate": 1.0281860259938779e-07,
"loss": 0.3189,
"num_input_tokens_seen": 42742208,
"step": 13575
},
{
"epoch": 0.8693425516932335,
"grad_norm": 20.845090866088867,
"learning_rate": 1.0232566338828452e-07,
"loss": 0.3634,
"num_input_tokens_seen": 42758464,
"step": 13580
},
{
"epoch": 0.8696626336342104,
"grad_norm": 47.362613677978516,
"learning_rate": 1.018338449341305e-07,
"loss": 0.4021,
"num_input_tokens_seen": 42774016,
"step": 13585
},
{
"epoch": 0.8699827155751872,
"grad_norm": 20.74382972717285,
"learning_rate": 1.0134314785096632e-07,
"loss": 0.3924,
"num_input_tokens_seen": 42789248,
"step": 13590
},
{
"epoch": 0.8703027975161641,
"grad_norm": 17.53841781616211,
"learning_rate": 1.0085357275143359e-07,
"loss": 0.3446,
"num_input_tokens_seen": 42804608,
"step": 13595
},
{
"epoch": 0.870622879457141,
"grad_norm": 34.560428619384766,
"learning_rate": 1.0036512024677268e-07,
"loss": 0.495,
"num_input_tokens_seen": 42819584,
"step": 13600
},
{
"epoch": 0.8709429613981179,
"grad_norm": 11.315316200256348,
"learning_rate": 9.98777909468217e-08,
"loss": 0.2823,
"num_input_tokens_seen": 42835200,
"step": 13605
},
{
"epoch": 0.8712630433390948,
"grad_norm": 48.00859069824219,
"learning_rate": 9.939158546001736e-08,
"loss": 0.4072,
"num_input_tokens_seen": 42852672,
"step": 13610
},
{
"epoch": 0.8715831252800716,
"grad_norm": 20.85955047607422,
"learning_rate": 9.890650439339299e-08,
"loss": 0.3252,
"num_input_tokens_seen": 42868672,
"step": 13615
},
{
"epoch": 0.8719032072210486,
"grad_norm": 58.062744140625,
"learning_rate": 9.842254835257791e-08,
"loss": 0.412,
"num_input_tokens_seen": 42884096,
"step": 13620
},
{
"epoch": 0.8722232891620255,
"grad_norm": 33.488006591796875,
"learning_rate": 9.793971794179679e-08,
"loss": 0.374,
"num_input_tokens_seen": 42898752,
"step": 13625
},
{
"epoch": 0.8725433711030024,
"grad_norm": 28.78290557861328,
"learning_rate": 9.745801376386931e-08,
"loss": 0.3535,
"num_input_tokens_seen": 42914688,
"step": 13630
},
{
"epoch": 0.8728634530439793,
"grad_norm": 47.25395965576172,
"learning_rate": 9.697743642020861e-08,
"loss": 0.3186,
"num_input_tokens_seen": 42930688,
"step": 13635
},
{
"epoch": 0.8731835349849562,
"grad_norm": 37.96920394897461,
"learning_rate": 9.649798651082119e-08,
"loss": 0.3329,
"num_input_tokens_seen": 42947008,
"step": 13640
},
{
"epoch": 0.873503616925933,
"grad_norm": 15.890763282775879,
"learning_rate": 9.601966463430588e-08,
"loss": 0.3973,
"num_input_tokens_seen": 42962816,
"step": 13645
},
{
"epoch": 0.8738236988669099,
"grad_norm": 16.609865188598633,
"learning_rate": 9.554247138785321e-08,
"loss": 0.3428,
"num_input_tokens_seen": 42977664,
"step": 13650
},
{
"epoch": 0.8741437808078868,
"grad_norm": 72.1360092163086,
"learning_rate": 9.506640736724447e-08,
"loss": 0.4653,
"num_input_tokens_seen": 42993472,
"step": 13655
},
{
"epoch": 0.8744638627488637,
"grad_norm": 31.26497459411621,
"learning_rate": 9.459147316685123e-08,
"loss": 0.3973,
"num_input_tokens_seen": 43010688,
"step": 13660
},
{
"epoch": 0.8747839446898406,
"grad_norm": 42.893550872802734,
"learning_rate": 9.41176693796345e-08,
"loss": 0.3411,
"num_input_tokens_seen": 43027392,
"step": 13665
},
{
"epoch": 0.8751040266308175,
"grad_norm": 43.10031509399414,
"learning_rate": 9.364499659714364e-08,
"loss": 0.4175,
"num_input_tokens_seen": 43043008,
"step": 13670
},
{
"epoch": 0.8754241085717944,
"grad_norm": 38.63743591308594,
"learning_rate": 9.31734554095165e-08,
"loss": 0.3438,
"num_input_tokens_seen": 43059072,
"step": 13675
},
{
"epoch": 0.8757441905127713,
"grad_norm": 36.11064529418945,
"learning_rate": 9.270304640547744e-08,
"loss": 0.3456,
"num_input_tokens_seen": 43074624,
"step": 13680
},
{
"epoch": 0.8760642724537482,
"grad_norm": 30.64196014404297,
"learning_rate": 9.223377017233768e-08,
"loss": 0.3922,
"num_input_tokens_seen": 43089536,
"step": 13685
},
{
"epoch": 0.8763843543947251,
"grad_norm": 29.195018768310547,
"learning_rate": 9.176562729599458e-08,
"loss": 0.361,
"num_input_tokens_seen": 43104512,
"step": 13690
},
{
"epoch": 0.8767044363357019,
"grad_norm": 47.08403778076172,
"learning_rate": 9.129861836092944e-08,
"loss": 0.3434,
"num_input_tokens_seen": 43120640,
"step": 13695
},
{
"epoch": 0.8770245182766788,
"grad_norm": 22.02703285217285,
"learning_rate": 9.083274395020845e-08,
"loss": 0.4433,
"num_input_tokens_seen": 43136384,
"step": 13700
},
{
"epoch": 0.8773446002176557,
"grad_norm": 21.463897705078125,
"learning_rate": 9.036800464548156e-08,
"loss": 0.4021,
"num_input_tokens_seen": 43153216,
"step": 13705
},
{
"epoch": 0.8776646821586326,
"grad_norm": 24.22488784790039,
"learning_rate": 8.990440102698138e-08,
"loss": 0.3506,
"num_input_tokens_seen": 43167936,
"step": 13710
},
{
"epoch": 0.8779847640996095,
"grad_norm": 39.060020446777344,
"learning_rate": 8.944193367352182e-08,
"loss": 0.2722,
"num_input_tokens_seen": 43183872,
"step": 13715
},
{
"epoch": 0.8783048460405863,
"grad_norm": 29.692768096923828,
"learning_rate": 8.898060316249944e-08,
"loss": 0.408,
"num_input_tokens_seen": 43200256,
"step": 13720
},
{
"epoch": 0.8786249279815633,
"grad_norm": 49.14101791381836,
"learning_rate": 8.852041006989064e-08,
"loss": 0.3606,
"num_input_tokens_seen": 43217600,
"step": 13725
},
{
"epoch": 0.8789450099225402,
"grad_norm": 49.38915252685547,
"learning_rate": 8.80613549702518e-08,
"loss": 0.3858,
"num_input_tokens_seen": 43233344,
"step": 13730
},
{
"epoch": 0.8792650918635171,
"grad_norm": 49.839324951171875,
"learning_rate": 8.760343843671824e-08,
"loss": 0.5397,
"num_input_tokens_seen": 43249280,
"step": 13735
},
{
"epoch": 0.879585173804494,
"grad_norm": 76.67366790771484,
"learning_rate": 8.714666104100487e-08,
"loss": 0.4595,
"num_input_tokens_seen": 43265024,
"step": 13740
},
{
"epoch": 0.8799052557454708,
"grad_norm": 76.83538055419922,
"learning_rate": 8.66910233534034e-08,
"loss": 0.3597,
"num_input_tokens_seen": 43280576,
"step": 13745
},
{
"epoch": 0.8802253376864477,
"grad_norm": 31.625341415405273,
"learning_rate": 8.62365259427823e-08,
"loss": 0.3074,
"num_input_tokens_seen": 43296064,
"step": 13750
},
{
"epoch": 0.8805454196274246,
"grad_norm": 29.102645874023438,
"learning_rate": 8.578316937658758e-08,
"loss": 0.292,
"num_input_tokens_seen": 43311552,
"step": 13755
},
{
"epoch": 0.8808655015684015,
"grad_norm": 20.883947372436523,
"learning_rate": 8.533095422083992e-08,
"loss": 0.3216,
"num_input_tokens_seen": 43326272,
"step": 13760
},
{
"epoch": 0.8811855835093784,
"grad_norm": 26.02059555053711,
"learning_rate": 8.487988104013533e-08,
"loss": 0.2926,
"num_input_tokens_seen": 43342592,
"step": 13765
},
{
"epoch": 0.8815056654503552,
"grad_norm": 23.774742126464844,
"learning_rate": 8.4429950397644e-08,
"loss": 0.3183,
"num_input_tokens_seen": 43357888,
"step": 13770
},
{
"epoch": 0.8818257473913321,
"grad_norm": 23.344446182250977,
"learning_rate": 8.398116285510948e-08,
"loss": 0.272,
"num_input_tokens_seen": 43374272,
"step": 13775
},
{
"epoch": 0.8821458293323091,
"grad_norm": 47.986976623535156,
"learning_rate": 8.353351897284844e-08,
"loss": 0.2715,
"num_input_tokens_seen": 43393280,
"step": 13780
},
{
"epoch": 0.882465911273286,
"grad_norm": 10.710731506347656,
"learning_rate": 8.308701930974949e-08,
"loss": 0.4713,
"num_input_tokens_seen": 43409600,
"step": 13785
},
{
"epoch": 0.8827859932142629,
"grad_norm": 30.01685333251953,
"learning_rate": 8.264166442327269e-08,
"loss": 0.4144,
"num_input_tokens_seen": 43424384,
"step": 13790
},
{
"epoch": 0.8831060751552398,
"grad_norm": 44.88009262084961,
"learning_rate": 8.219745486944885e-08,
"loss": 0.2591,
"num_input_tokens_seen": 43440128,
"step": 13795
},
{
"epoch": 0.8834261570962166,
"grad_norm": 81.11672973632812,
"learning_rate": 8.175439120287875e-08,
"loss": 0.4706,
"num_input_tokens_seen": 43455168,
"step": 13800
},
{
"epoch": 0.8837462390371935,
"grad_norm": 41.99856185913086,
"learning_rate": 8.131247397673269e-08,
"loss": 0.3454,
"num_input_tokens_seen": 43472064,
"step": 13805
},
{
"epoch": 0.8840663209781704,
"grad_norm": 84.95421600341797,
"learning_rate": 8.087170374274921e-08,
"loss": 0.4261,
"num_input_tokens_seen": 43488000,
"step": 13810
},
{
"epoch": 0.8843864029191473,
"grad_norm": 28.50680923461914,
"learning_rate": 8.043208105123578e-08,
"loss": 0.2942,
"num_input_tokens_seen": 43503488,
"step": 13815
},
{
"epoch": 0.8847064848601242,
"grad_norm": 40.26153564453125,
"learning_rate": 7.999360645106579e-08,
"loss": 0.3418,
"num_input_tokens_seen": 43518336,
"step": 13820
},
{
"epoch": 0.885026566801101,
"grad_norm": 18.43648910522461,
"learning_rate": 7.955628048968011e-08,
"loss": 0.2716,
"num_input_tokens_seen": 43532800,
"step": 13825
},
{
"epoch": 0.885346648742078,
"grad_norm": 31.317567825317383,
"learning_rate": 7.912010371308564e-08,
"loss": 0.2586,
"num_input_tokens_seen": 43547648,
"step": 13830
},
{
"epoch": 0.8856667306830549,
"grad_norm": 27.898588180541992,
"learning_rate": 7.868507666585422e-08,
"loss": 0.2934,
"num_input_tokens_seen": 43562688,
"step": 13835
},
{
"epoch": 0.8859868126240318,
"grad_norm": 42.73057174682617,
"learning_rate": 7.825119989112172e-08,
"loss": 0.4174,
"num_input_tokens_seen": 43578176,
"step": 13840
},
{
"epoch": 0.8863068945650087,
"grad_norm": 33.000614166259766,
"learning_rate": 7.78184739305886e-08,
"loss": 0.2904,
"num_input_tokens_seen": 43593920,
"step": 13845
},
{
"epoch": 0.8866269765059855,
"grad_norm": 22.872941970825195,
"learning_rate": 7.73868993245187e-08,
"loss": 0.3606,
"num_input_tokens_seen": 43610944,
"step": 13850
},
{
"epoch": 0.8869470584469624,
"grad_norm": 19.526351928710938,
"learning_rate": 7.695647661173754e-08,
"loss": 0.3406,
"num_input_tokens_seen": 43627008,
"step": 13855
},
{
"epoch": 0.8872671403879393,
"grad_norm": 46.44065856933594,
"learning_rate": 7.652720632963284e-08,
"loss": 0.3843,
"num_input_tokens_seen": 43642752,
"step": 13860
},
{
"epoch": 0.8875872223289162,
"grad_norm": 50.08705520629883,
"learning_rate": 7.609908901415396e-08,
"loss": 0.3506,
"num_input_tokens_seen": 43658496,
"step": 13865
},
{
"epoch": 0.8879073042698931,
"grad_norm": 52.07186508178711,
"learning_rate": 7.567212519981047e-08,
"loss": 0.3988,
"num_input_tokens_seen": 43674304,
"step": 13870
},
{
"epoch": 0.8882273862108699,
"grad_norm": 19.006641387939453,
"learning_rate": 7.524631541967108e-08,
"loss": 0.3315,
"num_input_tokens_seen": 43689536,
"step": 13875
},
{
"epoch": 0.8885474681518468,
"grad_norm": 76.33783721923828,
"learning_rate": 7.482166020536485e-08,
"loss": 0.2984,
"num_input_tokens_seen": 43706496,
"step": 13880
},
{
"epoch": 0.8888675500928238,
"grad_norm": 18.04372787475586,
"learning_rate": 7.439816008707877e-08,
"loss": 0.3097,
"num_input_tokens_seen": 43721408,
"step": 13885
},
{
"epoch": 0.8891876320338007,
"grad_norm": 19.781387329101562,
"learning_rate": 7.397581559355748e-08,
"loss": 0.3397,
"num_input_tokens_seen": 43737536,
"step": 13890
},
{
"epoch": 0.8895077139747776,
"grad_norm": 31.502649307250977,
"learning_rate": 7.355462725210315e-08,
"loss": 0.4171,
"num_input_tokens_seen": 43752640,
"step": 13895
},
{
"epoch": 0.8898277959157544,
"grad_norm": 27.26972770690918,
"learning_rate": 7.313459558857438e-08,
"loss": 0.4097,
"num_input_tokens_seen": 43768384,
"step": 13900
},
{
"epoch": 0.8901478778567313,
"grad_norm": 28.773365020751953,
"learning_rate": 7.271572112738566e-08,
"loss": 0.3141,
"num_input_tokens_seen": 43784320,
"step": 13905
},
{
"epoch": 0.8904679597977082,
"grad_norm": 35.17366409301758,
"learning_rate": 7.229800439150657e-08,
"loss": 0.3635,
"num_input_tokens_seen": 43799232,
"step": 13910
},
{
"epoch": 0.8907880417386851,
"grad_norm": 64.01599884033203,
"learning_rate": 7.188144590246148e-08,
"loss": 0.3806,
"num_input_tokens_seen": 43815360,
"step": 13915
},
{
"epoch": 0.891108123679662,
"grad_norm": 23.02184295654297,
"learning_rate": 7.146604618032848e-08,
"loss": 0.3317,
"num_input_tokens_seen": 43830336,
"step": 13920
},
{
"epoch": 0.8914282056206388,
"grad_norm": 37.91139221191406,
"learning_rate": 7.105180574373904e-08,
"loss": 0.4062,
"num_input_tokens_seen": 43846656,
"step": 13925
},
{
"epoch": 0.8917482875616157,
"grad_norm": 22.197053909301758,
"learning_rate": 7.063872510987712e-08,
"loss": 0.3279,
"num_input_tokens_seen": 43862720,
"step": 13930
},
{
"epoch": 0.8920683695025927,
"grad_norm": 34.20999526977539,
"learning_rate": 7.022680479447874e-08,
"loss": 0.3541,
"num_input_tokens_seen": 43876800,
"step": 13935
},
{
"epoch": 0.8923884514435696,
"grad_norm": 21.582101821899414,
"learning_rate": 6.98160453118316e-08,
"loss": 0.3046,
"num_input_tokens_seen": 43892160,
"step": 13940
},
{
"epoch": 0.8927085333845465,
"grad_norm": 39.55995178222656,
"learning_rate": 6.940644717477328e-08,
"loss": 0.3444,
"num_input_tokens_seen": 43908416,
"step": 13945
},
{
"epoch": 0.8930286153255234,
"grad_norm": 34.442626953125,
"learning_rate": 6.899801089469204e-08,
"loss": 0.4553,
"num_input_tokens_seen": 43923712,
"step": 13950
},
{
"epoch": 0.8933486972665002,
"grad_norm": 21.26590347290039,
"learning_rate": 6.85907369815254e-08,
"loss": 0.3491,
"num_input_tokens_seen": 43939520,
"step": 13955
},
{
"epoch": 0.8936687792074771,
"grad_norm": 48.566612243652344,
"learning_rate": 6.81846259437595e-08,
"loss": 0.3771,
"num_input_tokens_seen": 43954688,
"step": 13960
},
{
"epoch": 0.893988861148454,
"grad_norm": 53.70441436767578,
"learning_rate": 6.77796782884289e-08,
"loss": 0.3246,
"num_input_tokens_seen": 43969600,
"step": 13965
},
{
"epoch": 0.8943089430894309,
"grad_norm": 57.17893981933594,
"learning_rate": 6.737589452111526e-08,
"loss": 0.3885,
"num_input_tokens_seen": 43985472,
"step": 13970
},
{
"epoch": 0.8946290250304078,
"grad_norm": 38.82191467285156,
"learning_rate": 6.697327514594786e-08,
"loss": 0.4012,
"num_input_tokens_seen": 44000768,
"step": 13975
},
{
"epoch": 0.8949491069713846,
"grad_norm": 41.286277770996094,
"learning_rate": 6.657182066560118e-08,
"loss": 0.4538,
"num_input_tokens_seen": 44017088,
"step": 13980
},
{
"epoch": 0.8952691889123615,
"grad_norm": 26.847368240356445,
"learning_rate": 6.617153158129596e-08,
"loss": 0.3715,
"num_input_tokens_seen": 44031488,
"step": 13985
},
{
"epoch": 0.8955892708533385,
"grad_norm": 40.50960159301758,
"learning_rate": 6.577240839279807e-08,
"loss": 0.3356,
"num_input_tokens_seen": 44047296,
"step": 13990
},
{
"epoch": 0.8959093527943154,
"grad_norm": 33.30146408081055,
"learning_rate": 6.537445159841748e-08,
"loss": 0.3162,
"num_input_tokens_seen": 44063744,
"step": 13995
},
{
"epoch": 0.8962294347352923,
"grad_norm": 34.3074836730957,
"learning_rate": 6.497766169500752e-08,
"loss": 0.3898,
"num_input_tokens_seen": 44079168,
"step": 14000
},
{
"epoch": 0.8965495166762691,
"grad_norm": 14.267343521118164,
"learning_rate": 6.458203917796546e-08,
"loss": 0.2716,
"num_input_tokens_seen": 44093824,
"step": 14005
},
{
"epoch": 0.896869598617246,
"grad_norm": 18.694324493408203,
"learning_rate": 6.418758454123041e-08,
"loss": 0.4511,
"num_input_tokens_seen": 44111296,
"step": 14010
},
{
"epoch": 0.8971896805582229,
"grad_norm": 19.927507400512695,
"learning_rate": 6.379429827728377e-08,
"loss": 0.3912,
"num_input_tokens_seen": 44128000,
"step": 14015
},
{
"epoch": 0.8975097624991998,
"grad_norm": 18.84311866760254,
"learning_rate": 6.340218087714799e-08,
"loss": 0.3795,
"num_input_tokens_seen": 44143488,
"step": 14020
},
{
"epoch": 0.8978298444401767,
"grad_norm": 95.79281616210938,
"learning_rate": 6.301123283038634e-08,
"loss": 0.347,
"num_input_tokens_seen": 44158976,
"step": 14025
},
{
"epoch": 0.8981499263811535,
"grad_norm": 20.625757217407227,
"learning_rate": 6.262145462510193e-08,
"loss": 0.3207,
"num_input_tokens_seen": 44175808,
"step": 14030
},
{
"epoch": 0.8984700083221304,
"grad_norm": 45.679771423339844,
"learning_rate": 6.223284674793738e-08,
"loss": 0.2917,
"num_input_tokens_seen": 44190336,
"step": 14035
},
{
"epoch": 0.8987900902631074,
"grad_norm": 36.569252014160156,
"learning_rate": 6.184540968407437e-08,
"loss": 0.39,
"num_input_tokens_seen": 44205696,
"step": 14040
},
{
"epoch": 0.8991101722040843,
"grad_norm": 27.32700538635254,
"learning_rate": 6.145914391723239e-08,
"loss": 0.3515,
"num_input_tokens_seen": 44222016,
"step": 14045
},
{
"epoch": 0.8994302541450612,
"grad_norm": 30.203399658203125,
"learning_rate": 6.107404992966902e-08,
"loss": 0.327,
"num_input_tokens_seen": 44238592,
"step": 14050
},
{
"epoch": 0.899750336086038,
"grad_norm": 23.242624282836914,
"learning_rate": 6.069012820217856e-08,
"loss": 0.2489,
"num_input_tokens_seen": 44254016,
"step": 14055
},
{
"epoch": 0.9000704180270149,
"grad_norm": 30.345199584960938,
"learning_rate": 6.030737921409168e-08,
"loss": 0.3843,
"num_input_tokens_seen": 44269376,
"step": 14060
},
{
"epoch": 0.9003904999679918,
"grad_norm": 54.68627166748047,
"learning_rate": 5.992580344327503e-08,
"loss": 0.4579,
"num_input_tokens_seen": 44284672,
"step": 14065
},
{
"epoch": 0.9007105819089687,
"grad_norm": 34.02934265136719,
"learning_rate": 5.954540136613051e-08,
"loss": 0.346,
"num_input_tokens_seen": 44300224,
"step": 14070
},
{
"epoch": 0.9010306638499456,
"grad_norm": 24.38910675048828,
"learning_rate": 5.916617345759456e-08,
"loss": 0.3511,
"num_input_tokens_seen": 44315264,
"step": 14075
},
{
"epoch": 0.901094680238141,
"eval_loss": 0.35641103982925415,
"eval_runtime": 50.6103,
"eval_samples_per_second": 274.371,
"eval_steps_per_second": 34.301,
"num_input_tokens_seen": 44318848,
"step": 14076
},
{
"epoch": 0.9013507457909224,
"grad_norm": 41.97049331665039,
"learning_rate": 5.878812019113766e-08,
"loss": 0.4212,
"num_input_tokens_seen": 44330176,
"step": 14080
},
{
"epoch": 0.9016708277318993,
"grad_norm": 26.239030838012695,
"learning_rate": 5.84112420387638e-08,
"loss": 0.3065,
"num_input_tokens_seen": 44345152,
"step": 14085
},
{
"epoch": 0.9019909096728762,
"grad_norm": 32.693172454833984,
"learning_rate": 5.8035539471009697e-08,
"loss": 0.3625,
"num_input_tokens_seen": 44361152,
"step": 14090
},
{
"epoch": 0.9023109916138532,
"grad_norm": 35.59107208251953,
"learning_rate": 5.7661012956944253e-08,
"loss": 0.4095,
"num_input_tokens_seen": 44376128,
"step": 14095
},
{
"epoch": 0.9026310735548301,
"grad_norm": 18.91592788696289,
"learning_rate": 5.728766296416876e-08,
"loss": 0.2917,
"num_input_tokens_seen": 44392192,
"step": 14100
},
{
"epoch": 0.902951155495807,
"grad_norm": 34.914939880371094,
"learning_rate": 5.6915489958814453e-08,
"loss": 0.4205,
"num_input_tokens_seen": 44407680,
"step": 14105
},
{
"epoch": 0.9032712374367838,
"grad_norm": 52.385982513427734,
"learning_rate": 5.654449440554399e-08,
"loss": 0.4106,
"num_input_tokens_seen": 44424384,
"step": 14110
},
{
"epoch": 0.9035913193777607,
"grad_norm": 23.303749084472656,
"learning_rate": 5.617467676754972e-08,
"loss": 0.3803,
"num_input_tokens_seen": 44439744,
"step": 14115
},
{
"epoch": 0.9039114013187376,
"grad_norm": 23.8924560546875,
"learning_rate": 5.580603750655344e-08,
"loss": 0.296,
"num_input_tokens_seen": 44454272,
"step": 14120
},
{
"epoch": 0.9042314832597145,
"grad_norm": 30.840787887573242,
"learning_rate": 5.543857708280497e-08,
"loss": 0.3739,
"num_input_tokens_seen": 44468992,
"step": 14125
},
{
"epoch": 0.9045515652006914,
"grad_norm": 41.89210891723633,
"learning_rate": 5.507229595508367e-08,
"loss": 0.4703,
"num_input_tokens_seen": 44484864,
"step": 14130
},
{
"epoch": 0.9048716471416682,
"grad_norm": 15.071650505065918,
"learning_rate": 5.4707194580695504e-08,
"loss": 0.2887,
"num_input_tokens_seen": 44499968,
"step": 14135
},
{
"epoch": 0.9051917290826451,
"grad_norm": 37.00905990600586,
"learning_rate": 5.4343273415473846e-08,
"loss": 0.4279,
"num_input_tokens_seen": 44517952,
"step": 14140
},
{
"epoch": 0.905511811023622,
"grad_norm": 25.922666549682617,
"learning_rate": 5.3980532913778576e-08,
"loss": 0.3413,
"num_input_tokens_seen": 44532928,
"step": 14145
},
{
"epoch": 0.905831892964599,
"grad_norm": 32.402870178222656,
"learning_rate": 5.361897352849554e-08,
"loss": 0.3928,
"num_input_tokens_seen": 44548288,
"step": 14150
},
{
"epoch": 0.9061519749055759,
"grad_norm": 22.13262176513672,
"learning_rate": 5.325859571103586e-08,
"loss": 0.3204,
"num_input_tokens_seen": 44563712,
"step": 14155
},
{
"epoch": 0.9064720568465527,
"grad_norm": 23.106555938720703,
"learning_rate": 5.289939991133508e-08,
"loss": 0.3376,
"num_input_tokens_seen": 44579264,
"step": 14160
},
{
"epoch": 0.9067921387875296,
"grad_norm": 12.965365409851074,
"learning_rate": 5.2541386577853895e-08,
"loss": 0.2387,
"num_input_tokens_seen": 44594176,
"step": 14165
},
{
"epoch": 0.9071122207285065,
"grad_norm": 21.157468795776367,
"learning_rate": 5.2184556157576e-08,
"loss": 0.2536,
"num_input_tokens_seen": 44609664,
"step": 14170
},
{
"epoch": 0.9074323026694834,
"grad_norm": 53.38215255737305,
"learning_rate": 5.1828909096008234e-08,
"loss": 0.3807,
"num_input_tokens_seen": 44626944,
"step": 14175
},
{
"epoch": 0.9077523846104603,
"grad_norm": 16.712305068969727,
"learning_rate": 5.14744458371803e-08,
"loss": 0.2294,
"num_input_tokens_seen": 44643520,
"step": 14180
},
{
"epoch": 0.9080724665514371,
"grad_norm": 86.15962219238281,
"learning_rate": 5.1121166823643646e-08,
"loss": 0.4922,
"num_input_tokens_seen": 44657984,
"step": 14185
},
{
"epoch": 0.908392548492414,
"grad_norm": 30.85004997253418,
"learning_rate": 5.076907249647122e-08,
"loss": 0.3841,
"num_input_tokens_seen": 44673024,
"step": 14190
},
{
"epoch": 0.9087126304333909,
"grad_norm": 29.058664321899414,
"learning_rate": 5.0418163295257055e-08,
"loss": 0.4111,
"num_input_tokens_seen": 44687424,
"step": 14195
},
{
"epoch": 0.9090327123743679,
"grad_norm": 41.66473388671875,
"learning_rate": 5.006843965811536e-08,
"loss": 0.2901,
"num_input_tokens_seen": 44702976,
"step": 14200
},
{
"epoch": 0.9093527943153448,
"grad_norm": 44.971763610839844,
"learning_rate": 4.971990202168008e-08,
"loss": 0.4813,
"num_input_tokens_seen": 44718144,
"step": 14205
},
{
"epoch": 0.9096728762563216,
"grad_norm": 27.01129913330078,
"learning_rate": 4.9372550821104697e-08,
"loss": 0.3209,
"num_input_tokens_seen": 44734912,
"step": 14210
},
{
"epoch": 0.9099929581972985,
"grad_norm": 24.344900131225586,
"learning_rate": 4.902638649006119e-08,
"loss": 0.3205,
"num_input_tokens_seen": 44749888,
"step": 14215
},
{
"epoch": 0.9103130401382754,
"grad_norm": 22.84157943725586,
"learning_rate": 4.868140946073973e-08,
"loss": 0.3289,
"num_input_tokens_seen": 44764544,
"step": 14220
},
{
"epoch": 0.9106331220792523,
"grad_norm": 33.33592987060547,
"learning_rate": 4.833762016384857e-08,
"loss": 0.3017,
"num_input_tokens_seen": 44780992,
"step": 14225
},
{
"epoch": 0.9109532040202292,
"grad_norm": 54.81636428833008,
"learning_rate": 4.799501902861214e-08,
"loss": 0.3869,
"num_input_tokens_seen": 44796672,
"step": 14230
},
{
"epoch": 0.911273285961206,
"grad_norm": 45.477725982666016,
"learning_rate": 4.765360648277217e-08,
"loss": 0.4287,
"num_input_tokens_seen": 44812224,
"step": 14235
},
{
"epoch": 0.9115933679021829,
"grad_norm": 39.50046920776367,
"learning_rate": 4.7313382952586465e-08,
"loss": 0.4228,
"num_input_tokens_seen": 44827136,
"step": 14240
},
{
"epoch": 0.9119134498431598,
"grad_norm": 17.21167755126953,
"learning_rate": 4.6974348862828027e-08,
"loss": 0.3649,
"num_input_tokens_seen": 44842176,
"step": 14245
},
{
"epoch": 0.9122335317841367,
"grad_norm": 48.12306594848633,
"learning_rate": 4.663650463678448e-08,
"loss": 0.4412,
"num_input_tokens_seen": 44858880,
"step": 14250
},
{
"epoch": 0.9125536137251137,
"grad_norm": 23.15502166748047,
"learning_rate": 4.629985069625875e-08,
"loss": 0.4434,
"num_input_tokens_seen": 44875328,
"step": 14255
},
{
"epoch": 0.9128736956660906,
"grad_norm": 55.39519500732422,
"learning_rate": 4.596438746156728e-08,
"loss": 0.3751,
"num_input_tokens_seen": 44892032,
"step": 14260
},
{
"epoch": 0.9131937776070674,
"grad_norm": 32.531982421875,
"learning_rate": 4.563011535153949e-08,
"loss": 0.36,
"num_input_tokens_seen": 44907328,
"step": 14265
},
{
"epoch": 0.9135138595480443,
"grad_norm": 28.193361282348633,
"learning_rate": 4.52970347835181e-08,
"loss": 0.2689,
"num_input_tokens_seen": 44922560,
"step": 14270
},
{
"epoch": 0.9138339414890212,
"grad_norm": 33.741458892822266,
"learning_rate": 4.496514617335845e-08,
"loss": 0.327,
"num_input_tokens_seen": 44937728,
"step": 14275
},
{
"epoch": 0.9141540234299981,
"grad_norm": 43.2702522277832,
"learning_rate": 4.4634449935427197e-08,
"loss": 0.3603,
"num_input_tokens_seen": 44954560,
"step": 14280
},
{
"epoch": 0.914474105370975,
"grad_norm": 28.025667190551758,
"learning_rate": 4.430494648260219e-08,
"loss": 0.3096,
"num_input_tokens_seen": 44971520,
"step": 14285
},
{
"epoch": 0.9147941873119518,
"grad_norm": 41.917076110839844,
"learning_rate": 4.397663622627279e-08,
"loss": 0.4524,
"num_input_tokens_seen": 44987392,
"step": 14290
},
{
"epoch": 0.9151142692529287,
"grad_norm": 26.760129928588867,
"learning_rate": 4.364951957633789e-08,
"loss": 0.3122,
"num_input_tokens_seen": 45002688,
"step": 14295
},
{
"epoch": 0.9154343511939056,
"grad_norm": 32.50722122192383,
"learning_rate": 4.332359694120669e-08,
"loss": 0.2953,
"num_input_tokens_seen": 45017792,
"step": 14300
},
{
"epoch": 0.9157544331348826,
"grad_norm": 31.373065948486328,
"learning_rate": 4.299886872779734e-08,
"loss": 0.3571,
"num_input_tokens_seen": 45032640,
"step": 14305
},
{
"epoch": 0.9160745150758595,
"grad_norm": 37.13477325439453,
"learning_rate": 4.267533534153678e-08,
"loss": 0.2975,
"num_input_tokens_seen": 45048256,
"step": 14310
},
{
"epoch": 0.9163945970168363,
"grad_norm": 28.1370792388916,
"learning_rate": 4.2352997186360316e-08,
"loss": 0.3218,
"num_input_tokens_seen": 45064192,
"step": 14315
},
{
"epoch": 0.9167146789578132,
"grad_norm": 20.448345184326172,
"learning_rate": 4.203185466471082e-08,
"loss": 0.3243,
"num_input_tokens_seen": 45079488,
"step": 14320
},
{
"epoch": 0.9170347608987901,
"grad_norm": 22.20013999938965,
"learning_rate": 4.1711908177538556e-08,
"loss": 0.3984,
"num_input_tokens_seen": 45095616,
"step": 14325
},
{
"epoch": 0.917354842839767,
"grad_norm": 43.30317306518555,
"learning_rate": 4.139315812430055e-08,
"loss": 0.378,
"num_input_tokens_seen": 45110592,
"step": 14330
},
{
"epoch": 0.9176749247807439,
"grad_norm": 28.17921257019043,
"learning_rate": 4.1075604902959915e-08,
"loss": 0.3863,
"num_input_tokens_seen": 45127168,
"step": 14335
},
{
"epoch": 0.9179950067217207,
"grad_norm": 29.114439010620117,
"learning_rate": 4.07592489099855e-08,
"loss": 0.3137,
"num_input_tokens_seen": 45142208,
"step": 14340
},
{
"epoch": 0.9183150886626976,
"grad_norm": 39.799137115478516,
"learning_rate": 4.044409054035147e-08,
"loss": 0.3934,
"num_input_tokens_seen": 45157184,
"step": 14345
},
{
"epoch": 0.9186351706036745,
"grad_norm": 16.564228057861328,
"learning_rate": 4.0130130187537195e-08,
"loss": 0.3929,
"num_input_tokens_seen": 45174464,
"step": 14350
},
{
"epoch": 0.9189552525446514,
"grad_norm": 37.10162353515625,
"learning_rate": 3.981736824352522e-08,
"loss": 0.3149,
"num_input_tokens_seen": 45188992,
"step": 14355
},
{
"epoch": 0.9192753344856284,
"grad_norm": 32.673851013183594,
"learning_rate": 3.950580509880286e-08,
"loss": 0.4703,
"num_input_tokens_seen": 45204032,
"step": 14360
},
{
"epoch": 0.9195954164266052,
"grad_norm": 42.633636474609375,
"learning_rate": 3.9195441142360066e-08,
"loss": 0.3999,
"num_input_tokens_seen": 45219328,
"step": 14365
},
{
"epoch": 0.9199154983675821,
"grad_norm": 24.69942283630371,
"learning_rate": 3.888627676169043e-08,
"loss": 0.321,
"num_input_tokens_seen": 45235584,
"step": 14370
},
{
"epoch": 0.920235580308559,
"grad_norm": 32.20174026489258,
"learning_rate": 3.857831234278886e-08,
"loss": 0.3666,
"num_input_tokens_seen": 45250880,
"step": 14375
},
{
"epoch": 0.9205556622495359,
"grad_norm": 32.91603469848633,
"learning_rate": 3.827154827015255e-08,
"loss": 0.4145,
"num_input_tokens_seen": 45266752,
"step": 14380
},
{
"epoch": 0.9208757441905128,
"grad_norm": 13.686686515808105,
"learning_rate": 3.7965984926780383e-08,
"loss": 0.285,
"num_input_tokens_seen": 45282496,
"step": 14385
},
{
"epoch": 0.9211958261314896,
"grad_norm": 42.95164108276367,
"learning_rate": 3.766162269417139e-08,
"loss": 0.3521,
"num_input_tokens_seen": 45297024,
"step": 14390
},
{
"epoch": 0.9215159080724665,
"grad_norm": 42.11014938354492,
"learning_rate": 3.73584619523255e-08,
"loss": 0.3723,
"num_input_tokens_seen": 45314176,
"step": 14395
},
{
"epoch": 0.9218359900134434,
"grad_norm": 23.218151092529297,
"learning_rate": 3.7056503079742616e-08,
"loss": 0.352,
"num_input_tokens_seen": 45329344,
"step": 14400
},
{
"epoch": 0.9221560719544203,
"grad_norm": 28.30218505859375,
"learning_rate": 3.6755746453421945e-08,
"loss": 0.3452,
"num_input_tokens_seen": 45344384,
"step": 14405
},
{
"epoch": 0.9224761538953972,
"grad_norm": 15.89623737335205,
"learning_rate": 3.645619244886145e-08,
"loss": 0.2969,
"num_input_tokens_seen": 45360192,
"step": 14410
},
{
"epoch": 0.9227962358363742,
"grad_norm": 14.244527816772461,
"learning_rate": 3.615784144005796e-08,
"loss": 0.3147,
"num_input_tokens_seen": 45376000,
"step": 14415
},
{
"epoch": 0.923116317777351,
"grad_norm": 32.481868743896484,
"learning_rate": 3.5860693799506184e-08,
"loss": 0.4197,
"num_input_tokens_seen": 45390400,
"step": 14420
},
{
"epoch": 0.9234363997183279,
"grad_norm": 30.899581909179688,
"learning_rate": 3.5564749898198466e-08,
"loss": 0.4608,
"num_input_tokens_seen": 45406976,
"step": 14425
},
{
"epoch": 0.9237564816593048,
"grad_norm": 34.5185546875,
"learning_rate": 3.527001010562425e-08,
"loss": 0.3533,
"num_input_tokens_seen": 45422080,
"step": 14430
},
{
"epoch": 0.9240765636002817,
"grad_norm": 56.734092712402344,
"learning_rate": 3.4976474789769504e-08,
"loss": 0.3585,
"num_input_tokens_seen": 45439296,
"step": 14435
},
{
"epoch": 0.9243966455412586,
"grad_norm": 39.2069206237793,
"learning_rate": 3.4684144317116636e-08,
"loss": 0.2994,
"num_input_tokens_seen": 45454208,
"step": 14440
},
{
"epoch": 0.9247167274822354,
"grad_norm": 19.383590698242188,
"learning_rate": 3.439301905264369e-08,
"loss": 0.3015,
"num_input_tokens_seen": 45470400,
"step": 14445
},
{
"epoch": 0.9250368094232123,
"grad_norm": 43.57049560546875,
"learning_rate": 3.410309935982403e-08,
"loss": 0.324,
"num_input_tokens_seen": 45486528,
"step": 14450
},
{
"epoch": 0.9253568913641892,
"grad_norm": 20.486600875854492,
"learning_rate": 3.381438560062555e-08,
"loss": 0.3488,
"num_input_tokens_seen": 45501440,
"step": 14455
},
{
"epoch": 0.9256769733051661,
"grad_norm": 40.25934600830078,
"learning_rate": 3.3526878135511025e-08,
"loss": 0.3167,
"num_input_tokens_seen": 45517760,
"step": 14460
},
{
"epoch": 0.9259970552461431,
"grad_norm": 53.134132385253906,
"learning_rate": 3.324057732343666e-08,
"loss": 0.3751,
"num_input_tokens_seen": 45533056,
"step": 14465
},
{
"epoch": 0.9263171371871199,
"grad_norm": 26.382478713989258,
"learning_rate": 3.295548352185262e-08,
"loss": 0.421,
"num_input_tokens_seen": 45549248,
"step": 14470
},
{
"epoch": 0.9266372191280968,
"grad_norm": 35.308876037597656,
"learning_rate": 3.2671597086701753e-08,
"loss": 0.3503,
"num_input_tokens_seen": 45565760,
"step": 14475
},
{
"epoch": 0.9269573010690737,
"grad_norm": 23.9242000579834,
"learning_rate": 3.238891837241964e-08,
"loss": 0.3294,
"num_input_tokens_seen": 45581568,
"step": 14480
},
{
"epoch": 0.9272773830100506,
"grad_norm": 46.7926139831543,
"learning_rate": 3.210744773193386e-08,
"loss": 0.4179,
"num_input_tokens_seen": 45596928,
"step": 14485
},
{
"epoch": 0.9275974649510275,
"grad_norm": 45.16802978515625,
"learning_rate": 3.182718551666386e-08,
"loss": 0.3016,
"num_input_tokens_seen": 45612800,
"step": 14490
},
{
"epoch": 0.9279175468920043,
"grad_norm": 76.23828125,
"learning_rate": 3.154813207652063e-08,
"loss": 0.415,
"num_input_tokens_seen": 45627584,
"step": 14495
},
{
"epoch": 0.9282376288329812,
"grad_norm": 45.80707931518555,
"learning_rate": 3.1270287759905143e-08,
"loss": 0.3294,
"num_input_tokens_seen": 45643840,
"step": 14500
},
{
"epoch": 0.9285577107739581,
"grad_norm": 16.698345184326172,
"learning_rate": 3.0993652913709476e-08,
"loss": 0.2947,
"num_input_tokens_seen": 45659072,
"step": 14505
},
{
"epoch": 0.928877792714935,
"grad_norm": 24.177282333374023,
"learning_rate": 3.0718227883315796e-08,
"loss": 0.4243,
"num_input_tokens_seen": 45675328,
"step": 14510
},
{
"epoch": 0.9291978746559119,
"grad_norm": 41.06902313232422,
"learning_rate": 3.044401301259503e-08,
"loss": 0.3658,
"num_input_tokens_seen": 45690816,
"step": 14515
},
{
"epoch": 0.9295179565968889,
"grad_norm": 16.021848678588867,
"learning_rate": 3.017100864390787e-08,
"loss": 0.3301,
"num_input_tokens_seen": 45706432,
"step": 14520
},
{
"epoch": 0.9298380385378657,
"grad_norm": 53.745323181152344,
"learning_rate": 2.9899215118103446e-08,
"loss": 0.3406,
"num_input_tokens_seen": 45721920,
"step": 14525
},
{
"epoch": 0.9301581204788426,
"grad_norm": 15.850577354431152,
"learning_rate": 2.9628632774519435e-08,
"loss": 0.3547,
"num_input_tokens_seen": 45738048,
"step": 14530
},
{
"epoch": 0.9304782024198195,
"grad_norm": 25.69484519958496,
"learning_rate": 2.9359261950980485e-08,
"loss": 0.3313,
"num_input_tokens_seen": 45753856,
"step": 14535
},
{
"epoch": 0.9307982843607964,
"grad_norm": 23.73916244506836,
"learning_rate": 2.90911029837998e-08,
"loss": 0.2998,
"num_input_tokens_seen": 45768704,
"step": 14540
},
{
"epoch": 0.9311183663017732,
"grad_norm": 28.18566131591797,
"learning_rate": 2.8824156207776673e-08,
"loss": 0.2851,
"num_input_tokens_seen": 45783936,
"step": 14545
},
{
"epoch": 0.9314384482427501,
"grad_norm": 111.7474136352539,
"learning_rate": 2.8558421956197397e-08,
"loss": 0.4491,
"num_input_tokens_seen": 45800320,
"step": 14550
},
{
"epoch": 0.931758530183727,
"grad_norm": 39.434696197509766,
"learning_rate": 2.829390056083436e-08,
"loss": 0.3872,
"num_input_tokens_seen": 45816512,
"step": 14555
},
{
"epoch": 0.9320786121247039,
"grad_norm": 22.48917579650879,
"learning_rate": 2.8030592351945492e-08,
"loss": 0.3173,
"num_input_tokens_seen": 45831936,
"step": 14560
},
{
"epoch": 0.9323986940656808,
"grad_norm": 20.20699691772461,
"learning_rate": 2.776849765827427e-08,
"loss": 0.2995,
"num_input_tokens_seen": 45846784,
"step": 14565
},
{
"epoch": 0.9327187760066578,
"grad_norm": 42.71226501464844,
"learning_rate": 2.750761680704905e-08,
"loss": 0.4281,
"num_input_tokens_seen": 45862080,
"step": 14570
},
{
"epoch": 0.9330388579476346,
"grad_norm": 40.49309539794922,
"learning_rate": 2.724795012398251e-08,
"loss": 0.3977,
"num_input_tokens_seen": 45878528,
"step": 14575
},
{
"epoch": 0.9333589398886115,
"grad_norm": 37.85169219970703,
"learning_rate": 2.6989497933271543e-08,
"loss": 0.3726,
"num_input_tokens_seen": 45894016,
"step": 14580
},
{
"epoch": 0.9336790218295884,
"grad_norm": 20.322919845581055,
"learning_rate": 2.673226055759692e-08,
"loss": 0.3228,
"num_input_tokens_seen": 45909504,
"step": 14585
},
{
"epoch": 0.9339991037705653,
"grad_norm": 35.590576171875,
"learning_rate": 2.6476238318122402e-08,
"loss": 0.341,
"num_input_tokens_seen": 45925376,
"step": 14590
},
{
"epoch": 0.9343191857115422,
"grad_norm": 34.80650329589844,
"learning_rate": 2.6221431534494742e-08,
"loss": 0.3917,
"num_input_tokens_seen": 45940224,
"step": 14595
},
{
"epoch": 0.934639267652519,
"grad_norm": 58.117610931396484,
"learning_rate": 2.5967840524843243e-08,
"loss": 0.3508,
"num_input_tokens_seen": 45955072,
"step": 14600
},
{
"epoch": 0.9349593495934959,
"grad_norm": 32.32001495361328,
"learning_rate": 2.5715465605779195e-08,
"loss": 0.4243,
"num_input_tokens_seen": 45970240,
"step": 14605
},
{
"epoch": 0.9352794315344728,
"grad_norm": 85.06121063232422,
"learning_rate": 2.5464307092395777e-08,
"loss": 0.4145,
"num_input_tokens_seen": 45985856,
"step": 14610
},
{
"epoch": 0.9355995134754497,
"grad_norm": 22.371501922607422,
"learning_rate": 2.5214365298267148e-08,
"loss": 0.345,
"num_input_tokens_seen": 46000256,
"step": 14615
},
{
"epoch": 0.9359195954164266,
"grad_norm": 26.058761596679688,
"learning_rate": 2.4965640535448917e-08,
"loss": 0.3203,
"num_input_tokens_seen": 46015616,
"step": 14620
},
{
"epoch": 0.9362396773574035,
"grad_norm": 32.65026092529297,
"learning_rate": 2.471813311447657e-08,
"loss": 0.3659,
"num_input_tokens_seen": 46031040,
"step": 14625
},
{
"epoch": 0.9365597592983804,
"grad_norm": 34.6900634765625,
"learning_rate": 2.4471843344365915e-08,
"loss": 0.3221,
"num_input_tokens_seen": 46046016,
"step": 14630
},
{
"epoch": 0.9368798412393573,
"grad_norm": 19.555171966552734,
"learning_rate": 2.42267715326131e-08,
"loss": 0.2701,
"num_input_tokens_seen": 46062528,
"step": 14635
},
{
"epoch": 0.9371999231803342,
"grad_norm": 32.25774383544922,
"learning_rate": 2.3982917985192697e-08,
"loss": 0.3421,
"num_input_tokens_seen": 46078144,
"step": 14640
},
{
"epoch": 0.9375200051213111,
"grad_norm": 52.68037796020508,
"learning_rate": 2.3740283006558838e-08,
"loss": 0.3982,
"num_input_tokens_seen": 46096896,
"step": 14645
},
{
"epoch": 0.9378400870622879,
"grad_norm": 44.6820068359375,
"learning_rate": 2.349886689964431e-08,
"loss": 0.3756,
"num_input_tokens_seen": 46111808,
"step": 14650
},
{
"epoch": 0.9381601690032648,
"grad_norm": 36.16184616088867,
"learning_rate": 2.32586699658599e-08,
"loss": 0.2836,
"num_input_tokens_seen": 46127936,
"step": 14655
},
{
"epoch": 0.9384802509442417,
"grad_norm": 24.07101058959961,
"learning_rate": 2.3019692505094056e-08,
"loss": 0.3551,
"num_input_tokens_seen": 46142848,
"step": 14660
},
{
"epoch": 0.9388003328852186,
"grad_norm": 51.901004791259766,
"learning_rate": 2.2781934815713223e-08,
"loss": 0.5477,
"num_input_tokens_seen": 46158848,
"step": 14665
},
{
"epoch": 0.9391204148261955,
"grad_norm": 26.652942657470703,
"learning_rate": 2.254539719456061e-08,
"loss": 0.3611,
"num_input_tokens_seen": 46174912,
"step": 14670
},
{
"epoch": 0.9394404967671725,
"grad_norm": 19.459861755371094,
"learning_rate": 2.231007993695633e-08,
"loss": 0.2694,
"num_input_tokens_seen": 46189248,
"step": 14675
},
{
"epoch": 0.9397605787081493,
"grad_norm": 20.323457717895508,
"learning_rate": 2.2075983336696357e-08,
"loss": 0.3136,
"num_input_tokens_seen": 46204928,
"step": 14680
},
{
"epoch": 0.9400806606491262,
"grad_norm": 43.52647399902344,
"learning_rate": 2.1843107686053353e-08,
"loss": 0.3964,
"num_input_tokens_seen": 46220160,
"step": 14685
},
{
"epoch": 0.9404007425901031,
"grad_norm": 26.077720642089844,
"learning_rate": 2.1611453275775405e-08,
"loss": 0.4228,
"num_input_tokens_seen": 46235584,
"step": 14690
},
{
"epoch": 0.94072082453108,
"grad_norm": 28.561120986938477,
"learning_rate": 2.138102039508538e-08,
"loss": 0.2719,
"num_input_tokens_seen": 46251904,
"step": 14695
},
{
"epoch": 0.9410409064720568,
"grad_norm": 45.0341796875,
"learning_rate": 2.1151809331681703e-08,
"loss": 0.3995,
"num_input_tokens_seen": 46268032,
"step": 14700
},
{
"epoch": 0.9413609884130337,
"grad_norm": 54.203208923339844,
"learning_rate": 2.092382037173701e-08,
"loss": 0.3371,
"num_input_tokens_seen": 46283392,
"step": 14705
},
{
"epoch": 0.9416810703540106,
"grad_norm": 29.033945083618164,
"learning_rate": 2.0697053799898277e-08,
"loss": 0.3089,
"num_input_tokens_seen": 46298752,
"step": 14710
},
{
"epoch": 0.9420011522949875,
"grad_norm": 32.4161491394043,
"learning_rate": 2.0471509899286144e-08,
"loss": 0.3394,
"num_input_tokens_seen": 46314624,
"step": 14715
},
{
"epoch": 0.9423212342359644,
"grad_norm": 27.046730041503906,
"learning_rate": 2.0247188951494797e-08,
"loss": 0.3517,
"num_input_tokens_seen": 46331712,
"step": 14720
},
{
"epoch": 0.9426413161769412,
"grad_norm": 56.06675720214844,
"learning_rate": 2.0024091236591655e-08,
"loss": 0.5446,
"num_input_tokens_seen": 46347200,
"step": 14725
},
{
"epoch": 0.9429613981179182,
"grad_norm": 18.983240127563477,
"learning_rate": 1.98022170331168e-08,
"loss": 0.3148,
"num_input_tokens_seen": 46363008,
"step": 14730
},
{
"epoch": 0.9432814800588951,
"grad_norm": 31.674177169799805,
"learning_rate": 1.9581566618082744e-08,
"loss": 0.3808,
"num_input_tokens_seen": 46378816,
"step": 14735
},
{
"epoch": 0.943601561999872,
"grad_norm": 54.48043441772461,
"learning_rate": 1.9362140266974025e-08,
"loss": 0.4079,
"num_input_tokens_seen": 46395200,
"step": 14740
},
{
"epoch": 0.9439216439408489,
"grad_norm": 51.17926025390625,
"learning_rate": 1.9143938253747383e-08,
"loss": 0.3223,
"num_input_tokens_seen": 46411840,
"step": 14745
},
{
"epoch": 0.9442417258818258,
"grad_norm": 25.376482009887695,
"learning_rate": 1.892696085083023e-08,
"loss": 0.4503,
"num_input_tokens_seen": 46427776,
"step": 14750
},
{
"epoch": 0.9445618078228026,
"grad_norm": 35.88258361816406,
"learning_rate": 1.8711208329121542e-08,
"loss": 0.3146,
"num_input_tokens_seen": 46444736,
"step": 14755
},
{
"epoch": 0.9448818897637795,
"grad_norm": 25.610300064086914,
"learning_rate": 1.849668095799084e-08,
"loss": 0.3372,
"num_input_tokens_seen": 46460672,
"step": 14760
},
{
"epoch": 0.9452019717047564,
"grad_norm": 31.893014907836914,
"learning_rate": 1.8283379005278098e-08,
"loss": 0.3458,
"num_input_tokens_seen": 46476736,
"step": 14765
},
{
"epoch": 0.9455220536457333,
"grad_norm": 15.131741523742676,
"learning_rate": 1.807130273729329e-08,
"loss": 0.3238,
"num_input_tokens_seen": 46492416,
"step": 14770
},
{
"epoch": 0.9458421355867102,
"grad_norm": 38.48772430419922,
"learning_rate": 1.7860452418816173e-08,
"loss": 0.3331,
"num_input_tokens_seen": 46507264,
"step": 14775
},
{
"epoch": 0.946162217527687,
"grad_norm": 29.807109832763672,
"learning_rate": 1.7650828313095834e-08,
"loss": 0.3365,
"num_input_tokens_seen": 46524224,
"step": 14780
},
{
"epoch": 0.946482299468664,
"grad_norm": 16.179826736450195,
"learning_rate": 1.7442430681850362e-08,
"loss": 0.3172,
"num_input_tokens_seen": 46539456,
"step": 14785
},
{
"epoch": 0.9468023814096409,
"grad_norm": 40.83128356933594,
"learning_rate": 1.723525978526652e-08,
"loss": 0.402,
"num_input_tokens_seen": 46555136,
"step": 14790
},
{
"epoch": 0.9471224633506178,
"grad_norm": 27.168893814086914,
"learning_rate": 1.702931588199996e-08,
"loss": 0.3503,
"num_input_tokens_seen": 46570432,
"step": 14795
},
{
"epoch": 0.9474425452915947,
"grad_norm": 31.79697036743164,
"learning_rate": 1.6824599229173897e-08,
"loss": 0.3141,
"num_input_tokens_seen": 46586304,
"step": 14800
},
{
"epoch": 0.9477626272325715,
"grad_norm": 34.36116409301758,
"learning_rate": 1.662111008237932e-08,
"loss": 0.2946,
"num_input_tokens_seen": 46602432,
"step": 14805
},
{
"epoch": 0.9480827091735484,
"grad_norm": 32.66071701049805,
"learning_rate": 1.6418848695675003e-08,
"loss": 0.3135,
"num_input_tokens_seen": 46617472,
"step": 14810
},
{
"epoch": 0.9484027911145253,
"grad_norm": 36.13750457763672,
"learning_rate": 1.6217815321586614e-08,
"loss": 0.3713,
"num_input_tokens_seen": 46632896,
"step": 14815
},
{
"epoch": 0.9487228730555022,
"grad_norm": 20.115230560302734,
"learning_rate": 1.6018010211106602e-08,
"loss": 0.3516,
"num_input_tokens_seen": 46649408,
"step": 14820
},
{
"epoch": 0.9490429549964791,
"grad_norm": 18.152301788330078,
"learning_rate": 1.58194336136942e-08,
"loss": 0.2899,
"num_input_tokens_seen": 46665344,
"step": 14825
},
{
"epoch": 0.9493630369374559,
"grad_norm": 37.48488235473633,
"learning_rate": 1.5622085777274417e-08,
"loss": 0.4377,
"num_input_tokens_seen": 46680704,
"step": 14830
},
{
"epoch": 0.9496831188784329,
"grad_norm": 39.530426025390625,
"learning_rate": 1.542596694823839e-08,
"loss": 0.3267,
"num_input_tokens_seen": 46695936,
"step": 14835
},
{
"epoch": 0.9500032008194098,
"grad_norm": 56.298133850097656,
"learning_rate": 1.5231077371442914e-08,
"loss": 0.4208,
"num_input_tokens_seen": 46711680,
"step": 14840
},
{
"epoch": 0.9503232827603867,
"grad_norm": 24.305173873901367,
"learning_rate": 1.5037417290209685e-08,
"loss": 0.2846,
"num_input_tokens_seen": 46727040,
"step": 14845
},
{
"epoch": 0.9506433647013636,
"grad_norm": 38.6282844543457,
"learning_rate": 1.4844986946325743e-08,
"loss": 0.3933,
"num_input_tokens_seen": 46742720,
"step": 14850
},
{
"epoch": 0.9509634466423404,
"grad_norm": 26.332656860351562,
"learning_rate": 1.4653786580042681e-08,
"loss": 0.2686,
"num_input_tokens_seen": 46758336,
"step": 14855
},
{
"epoch": 0.9511554958069266,
"eval_loss": 0.35565948486328125,
"eval_runtime": 50.6621,
"eval_samples_per_second": 274.09,
"eval_steps_per_second": 34.266,
"num_input_tokens_seen": 46767552,
"step": 14858
},
{
"epoch": 0.9512835285833173,
"grad_norm": 22.38384246826172,
"learning_rate": 1.4463816430076215e-08,
"loss": 0.2999,
"num_input_tokens_seen": 46773312,
"step": 14860
},
{
"epoch": 0.9516036105242942,
"grad_norm": 38.47566604614258,
"learning_rate": 1.4275076733606395e-08,
"loss": 0.3573,
"num_input_tokens_seen": 46787968,
"step": 14865
},
{
"epoch": 0.9519236924652711,
"grad_norm": 26.21702766418457,
"learning_rate": 1.4087567726277061e-08,
"loss": 0.2955,
"num_input_tokens_seen": 46803712,
"step": 14870
},
{
"epoch": 0.952243774406248,
"grad_norm": 29.341995239257812,
"learning_rate": 1.390128964219528e-08,
"loss": 0.2811,
"num_input_tokens_seen": 46820288,
"step": 14875
},
{
"epoch": 0.9525638563472248,
"grad_norm": 45.728111267089844,
"learning_rate": 1.3716242713931348e-08,
"loss": 0.3966,
"num_input_tokens_seen": 46835904,
"step": 14880
},
{
"epoch": 0.9528839382882017,
"grad_norm": 28.86192512512207,
"learning_rate": 1.3532427172518789e-08,
"loss": 0.3738,
"num_input_tokens_seen": 46851136,
"step": 14885
},
{
"epoch": 0.9532040202291787,
"grad_norm": 37.157073974609375,
"learning_rate": 1.3349843247453252e-08,
"loss": 0.3431,
"num_input_tokens_seen": 46867456,
"step": 14890
},
{
"epoch": 0.9535241021701556,
"grad_norm": 28.760547637939453,
"learning_rate": 1.3168491166692941e-08,
"loss": 0.2796,
"num_input_tokens_seen": 46882816,
"step": 14895
},
{
"epoch": 0.9538441841111325,
"grad_norm": 42.88594436645508,
"learning_rate": 1.2988371156658073e-08,
"loss": 0.4594,
"num_input_tokens_seen": 46898624,
"step": 14900
},
{
"epoch": 0.9541642660521094,
"grad_norm": 27.931549072265625,
"learning_rate": 1.2809483442230763e-08,
"loss": 0.2959,
"num_input_tokens_seen": 46914304,
"step": 14905
},
{
"epoch": 0.9544843479930862,
"grad_norm": 25.078886032104492,
"learning_rate": 1.2631828246754128e-08,
"loss": 0.373,
"num_input_tokens_seen": 46930368,
"step": 14910
},
{
"epoch": 0.9548044299340631,
"grad_norm": 45.43979263305664,
"learning_rate": 1.2455405792032969e-08,
"loss": 0.3678,
"num_input_tokens_seen": 46945792,
"step": 14915
},
{
"epoch": 0.95512451187504,
"grad_norm": 23.990598678588867,
"learning_rate": 1.2280216298332646e-08,
"loss": 0.3474,
"num_input_tokens_seen": 46962048,
"step": 14920
},
{
"epoch": 0.9554445938160169,
"grad_norm": 56.57452392578125,
"learning_rate": 1.2106259984379642e-08,
"loss": 0.4736,
"num_input_tokens_seen": 46976768,
"step": 14925
},
{
"epoch": 0.9557646757569938,
"grad_norm": 45.05714416503906,
"learning_rate": 1.1933537067359889e-08,
"loss": 0.4153,
"num_input_tokens_seen": 46991424,
"step": 14930
},
{
"epoch": 0.9560847576979706,
"grad_norm": 24.859119415283203,
"learning_rate": 1.1762047762920446e-08,
"loss": 0.3603,
"num_input_tokens_seen": 47006656,
"step": 14935
},
{
"epoch": 0.9564048396389476,
"grad_norm": 47.79610061645508,
"learning_rate": 1.1591792285167602e-08,
"loss": 0.3643,
"num_input_tokens_seen": 47021824,
"step": 14940
},
{
"epoch": 0.9567249215799245,
"grad_norm": 31.10300064086914,
"learning_rate": 1.1422770846667206e-08,
"loss": 0.3862,
"num_input_tokens_seen": 47037440,
"step": 14945
},
{
"epoch": 0.9570450035209014,
"grad_norm": 18.239160537719727,
"learning_rate": 1.1254983658444572e-08,
"loss": 0.303,
"num_input_tokens_seen": 47053760,
"step": 14950
},
{
"epoch": 0.9573650854618783,
"grad_norm": 45.93240737915039,
"learning_rate": 1.1088430929984017e-08,
"loss": 0.3218,
"num_input_tokens_seen": 47068928,
"step": 14955
},
{
"epoch": 0.9576851674028551,
"grad_norm": 35.72513961791992,
"learning_rate": 1.0923112869228645e-08,
"loss": 0.3807,
"num_input_tokens_seen": 47084672,
"step": 14960
},
{
"epoch": 0.958005249343832,
"grad_norm": 42.905418395996094,
"learning_rate": 1.0759029682579801e-08,
"loss": 0.3554,
"num_input_tokens_seen": 47101632,
"step": 14965
},
{
"epoch": 0.9583253312848089,
"grad_norm": 25.768041610717773,
"learning_rate": 1.0596181574897389e-08,
"loss": 0.3051,
"num_input_tokens_seen": 47116480,
"step": 14970
},
{
"epoch": 0.9586454132257858,
"grad_norm": 29.31188201904297,
"learning_rate": 1.0434568749499107e-08,
"loss": 0.3227,
"num_input_tokens_seen": 47132992,
"step": 14975
},
{
"epoch": 0.9589654951667627,
"grad_norm": 28.007709503173828,
"learning_rate": 1.027419140816066e-08,
"loss": 0.3077,
"num_input_tokens_seen": 47149056,
"step": 14980
},
{
"epoch": 0.9592855771077395,
"grad_norm": 26.973087310791016,
"learning_rate": 1.0115049751114768e-08,
"loss": 0.3029,
"num_input_tokens_seen": 47164864,
"step": 14985
},
{
"epoch": 0.9596056590487164,
"grad_norm": 19.246578216552734,
"learning_rate": 9.957143977051941e-09,
"loss": 0.3514,
"num_input_tokens_seen": 47180544,
"step": 14990
},
{
"epoch": 0.9599257409896934,
"grad_norm": 34.508419036865234,
"learning_rate": 9.800474283119142e-09,
"loss": 0.3879,
"num_input_tokens_seen": 47196608,
"step": 14995
},
{
"epoch": 0.9602458229306703,
"grad_norm": 26.967897415161133,
"learning_rate": 9.645040864920462e-09,
"loss": 0.3755,
"num_input_tokens_seen": 47213504,
"step": 15000
},
{
"epoch": 0.9605659048716472,
"grad_norm": 33.09022903442383,
"learning_rate": 9.490843916516334e-09,
"loss": 0.4015,
"num_input_tokens_seen": 47228288,
"step": 15005
},
{
"epoch": 0.960885986812624,
"grad_norm": 26.84346580505371,
"learning_rate": 9.337883630423316e-09,
"loss": 0.452,
"num_input_tokens_seen": 47243712,
"step": 15010
},
{
"epoch": 0.9612060687536009,
"grad_norm": 54.509681701660156,
"learning_rate": 9.186160197614423e-09,
"loss": 0.5173,
"num_input_tokens_seen": 47259904,
"step": 15015
},
{
"epoch": 0.9615261506945778,
"grad_norm": 35.004150390625,
"learning_rate": 9.035673807517795e-09,
"loss": 0.4795,
"num_input_tokens_seen": 47275072,
"step": 15020
},
{
"epoch": 0.9618462326355547,
"grad_norm": 44.16777420043945,
"learning_rate": 8.886424648017698e-09,
"loss": 0.2802,
"num_input_tokens_seen": 47290688,
"step": 15025
},
{
"epoch": 0.9621663145765316,
"grad_norm": 21.235763549804688,
"learning_rate": 8.738412905453408e-09,
"loss": 0.34,
"num_input_tokens_seen": 47306496,
"step": 15030
},
{
"epoch": 0.9624863965175084,
"grad_norm": 31.245132446289062,
"learning_rate": 8.591638764619324e-09,
"loss": 0.3524,
"num_input_tokens_seen": 47321280,
"step": 15035
},
{
"epoch": 0.9628064784584853,
"grad_norm": 51.93947219848633,
"learning_rate": 8.446102408764643e-09,
"loss": 0.3707,
"num_input_tokens_seen": 47337536,
"step": 15040
},
{
"epoch": 0.9631265603994623,
"grad_norm": 37.979652404785156,
"learning_rate": 8.301804019593129e-09,
"loss": 0.2796,
"num_input_tokens_seen": 47353024,
"step": 15045
},
{
"epoch": 0.9634466423404392,
"grad_norm": 33.141231536865234,
"learning_rate": 8.158743777263333e-09,
"loss": 0.3505,
"num_input_tokens_seen": 47369088,
"step": 15050
},
{
"epoch": 0.9637667242814161,
"grad_norm": 25.881275177001953,
"learning_rate": 8.016921860387272e-09,
"loss": 0.3566,
"num_input_tokens_seen": 47384320,
"step": 15055
},
{
"epoch": 0.964086806222393,
"grad_norm": 27.325672149658203,
"learning_rate": 7.876338446031416e-09,
"loss": 0.3949,
"num_input_tokens_seen": 47400896,
"step": 15060
},
{
"epoch": 0.9644068881633698,
"grad_norm": 40.98260498046875,
"learning_rate": 7.736993709716033e-09,
"loss": 0.3234,
"num_input_tokens_seen": 47416896,
"step": 15065
},
{
"epoch": 0.9647269701043467,
"grad_norm": 54.15633010864258,
"learning_rate": 7.59888782541418e-09,
"loss": 0.4736,
"num_input_tokens_seen": 47432320,
"step": 15070
},
{
"epoch": 0.9650470520453236,
"grad_norm": 17.9570255279541,
"learning_rate": 7.462020965553151e-09,
"loss": 0.2698,
"num_input_tokens_seen": 47448320,
"step": 15075
},
{
"epoch": 0.9653671339863005,
"grad_norm": 19.267284393310547,
"learning_rate": 7.32639330101259e-09,
"loss": 0.4844,
"num_input_tokens_seen": 47463488,
"step": 15080
},
{
"epoch": 0.9656872159272774,
"grad_norm": 52.61140823364258,
"learning_rate": 7.1920050011252675e-09,
"loss": 0.3884,
"num_input_tokens_seen": 47479104,
"step": 15085
},
{
"epoch": 0.9660072978682542,
"grad_norm": 37.068931579589844,
"learning_rate": 7.058856233676525e-09,
"loss": 0.3994,
"num_input_tokens_seen": 47496448,
"step": 15090
},
{
"epoch": 0.9663273798092311,
"grad_norm": 83.22144317626953,
"learning_rate": 6.926947164904162e-09,
"loss": 0.3758,
"num_input_tokens_seen": 47511936,
"step": 15095
},
{
"epoch": 0.9666474617502081,
"grad_norm": 27.578569412231445,
"learning_rate": 6.796277959498331e-09,
"loss": 0.4048,
"num_input_tokens_seen": 47528320,
"step": 15100
},
{
"epoch": 0.966967543691185,
"grad_norm": 26.990234375,
"learning_rate": 6.666848780600864e-09,
"loss": 0.2726,
"num_input_tokens_seen": 47543296,
"step": 15105
},
{
"epoch": 0.9672876256321619,
"grad_norm": 11.355256080627441,
"learning_rate": 6.538659789805834e-09,
"loss": 0.2706,
"num_input_tokens_seen": 47558656,
"step": 15110
},
{
"epoch": 0.9676077075731387,
"grad_norm": 33.00820541381836,
"learning_rate": 6.411711147158438e-09,
"loss": 0.3739,
"num_input_tokens_seen": 47574720,
"step": 15115
},
{
"epoch": 0.9679277895141156,
"grad_norm": 47.104095458984375,
"learning_rate": 6.286003011155783e-09,
"loss": 0.3126,
"num_input_tokens_seen": 47590272,
"step": 15120
},
{
"epoch": 0.9682478714550925,
"grad_norm": 33.4498405456543,
"learning_rate": 6.161535538745877e-09,
"loss": 0.4041,
"num_input_tokens_seen": 47605696,
"step": 15125
},
{
"epoch": 0.9685679533960694,
"grad_norm": 32.14289093017578,
"learning_rate": 6.0383088853277475e-09,
"loss": 0.3798,
"num_input_tokens_seen": 47621760,
"step": 15130
},
{
"epoch": 0.9688880353370463,
"grad_norm": 31.649654388427734,
"learning_rate": 5.916323204751439e-09,
"loss": 0.3175,
"num_input_tokens_seen": 47639296,
"step": 15135
},
{
"epoch": 0.9692081172780231,
"grad_norm": 28.495525360107422,
"learning_rate": 5.795578649317345e-09,
"loss": 0.2636,
"num_input_tokens_seen": 47654656,
"step": 15140
},
{
"epoch": 0.969528199219,
"grad_norm": 44.42762756347656,
"learning_rate": 5.676075369776656e-09,
"loss": 0.3059,
"num_input_tokens_seen": 47671168,
"step": 15145
},
{
"epoch": 0.9698482811599769,
"grad_norm": 25.46860694885254,
"learning_rate": 5.557813515330468e-09,
"loss": 0.3451,
"num_input_tokens_seen": 47686400,
"step": 15150
},
{
"epoch": 0.9701683631009539,
"grad_norm": 28.098102569580078,
"learning_rate": 5.440793233630115e-09,
"loss": 0.3484,
"num_input_tokens_seen": 47701760,
"step": 15155
},
{
"epoch": 0.9704884450419308,
"grad_norm": 39.411136627197266,
"learning_rate": 5.325014670776951e-09,
"loss": 0.3073,
"num_input_tokens_seen": 47717248,
"step": 15160
},
{
"epoch": 0.9708085269829076,
"grad_norm": 60.595664978027344,
"learning_rate": 5.21047797132157e-09,
"loss": 0.3607,
"num_input_tokens_seen": 47734336,
"step": 15165
},
{
"epoch": 0.9711286089238845,
"grad_norm": 26.46986961364746,
"learning_rate": 5.097183278264694e-09,
"loss": 0.3428,
"num_input_tokens_seen": 47750464,
"step": 15170
},
{
"epoch": 0.9714486908648614,
"grad_norm": 27.248794555664062,
"learning_rate": 4.985130733055954e-09,
"loss": 0.4272,
"num_input_tokens_seen": 47765824,
"step": 15175
},
{
"epoch": 0.9717687728058383,
"grad_norm": 28.56536293029785,
"learning_rate": 4.874320475594107e-09,
"loss": 0.381,
"num_input_tokens_seen": 47781760,
"step": 15180
},
{
"epoch": 0.9720888547468152,
"grad_norm": 17.6987361907959,
"learning_rate": 4.764752644227377e-09,
"loss": 0.292,
"num_input_tokens_seen": 47797312,
"step": 15185
},
{
"epoch": 0.972408936687792,
"grad_norm": 28.92839241027832,
"learning_rate": 4.656427375752336e-09,
"loss": 0.335,
"num_input_tokens_seen": 47813440,
"step": 15190
},
{
"epoch": 0.9727290186287689,
"grad_norm": 31.72882843017578,
"learning_rate": 4.549344805414246e-09,
"loss": 0.343,
"num_input_tokens_seen": 47829440,
"step": 15195
},
{
"epoch": 0.9730491005697458,
"grad_norm": 33.45530700683594,
"learning_rate": 4.443505066907049e-09,
"loss": 0.4009,
"num_input_tokens_seen": 47844608,
"step": 15200
},
{
"epoch": 0.9733691825107228,
"grad_norm": 38.720211029052734,
"learning_rate": 4.338908292372934e-09,
"loss": 0.2898,
"num_input_tokens_seen": 47860160,
"step": 15205
},
{
"epoch": 0.9736892644516997,
"grad_norm": 43.10197448730469,
"learning_rate": 4.235554612402214e-09,
"loss": 0.3906,
"num_input_tokens_seen": 47875648,
"step": 15210
},
{
"epoch": 0.9740093463926766,
"grad_norm": 45.98534393310547,
"learning_rate": 4.133444156033006e-09,
"loss": 0.3799,
"num_input_tokens_seen": 47892736,
"step": 15215
},
{
"epoch": 0.9743294283336534,
"grad_norm": 37.45781326293945,
"learning_rate": 4.032577050751551e-09,
"loss": 0.3319,
"num_input_tokens_seen": 47908992,
"step": 15220
},
{
"epoch": 0.9746495102746303,
"grad_norm": 29.421432495117188,
"learning_rate": 3.932953422491669e-09,
"loss": 0.3489,
"num_input_tokens_seen": 47924736,
"step": 15225
},
{
"epoch": 0.9749695922156072,
"grad_norm": 48.30408477783203,
"learning_rate": 3.8345733956345326e-09,
"loss": 0.2816,
"num_input_tokens_seen": 47941056,
"step": 15230
},
{
"epoch": 0.9752896741565841,
"grad_norm": 30.11746597290039,
"learning_rate": 3.737437093008777e-09,
"loss": 0.3635,
"num_input_tokens_seen": 47957824,
"step": 15235
},
{
"epoch": 0.975609756097561,
"grad_norm": 42.55520248413086,
"learning_rate": 3.641544635890281e-09,
"loss": 0.4132,
"num_input_tokens_seen": 47973056,
"step": 15240
},
{
"epoch": 0.9759298380385378,
"grad_norm": 19.125173568725586,
"learning_rate": 3.546896144001832e-09,
"loss": 0.3959,
"num_input_tokens_seen": 47988928,
"step": 15245
},
{
"epoch": 0.9762499199795147,
"grad_norm": 50.54212951660156,
"learning_rate": 3.4534917355132364e-09,
"loss": 0.3935,
"num_input_tokens_seen": 48004032,
"step": 15250
},
{
"epoch": 0.9765700019204916,
"grad_norm": 33.77085494995117,
"learning_rate": 3.361331527040878e-09,
"loss": 0.4168,
"num_input_tokens_seen": 48020800,
"step": 15255
},
{
"epoch": 0.9768900838614686,
"grad_norm": 32.820064544677734,
"learning_rate": 3.270415633647938e-09,
"loss": 0.3997,
"num_input_tokens_seen": 48036800,
"step": 15260
},
{
"epoch": 0.9772101658024455,
"grad_norm": 23.62579345703125,
"learning_rate": 3.180744168843952e-09,
"loss": 0.2911,
"num_input_tokens_seen": 48051264,
"step": 15265
},
{
"epoch": 0.9775302477434223,
"grad_norm": 27.472536087036133,
"learning_rate": 3.0923172445849187e-09,
"loss": 0.226,
"num_input_tokens_seen": 48066176,
"step": 15270
},
{
"epoch": 0.9778503296843992,
"grad_norm": 36.02907943725586,
"learning_rate": 3.0051349712727493e-09,
"loss": 0.3135,
"num_input_tokens_seen": 48081984,
"step": 15275
},
{
"epoch": 0.9781704116253761,
"grad_norm": 30.320404052734375,
"learning_rate": 2.9191974577555954e-09,
"loss": 0.4143,
"num_input_tokens_seen": 48096896,
"step": 15280
},
{
"epoch": 0.978490493566353,
"grad_norm": 17.627288818359375,
"learning_rate": 2.8345048113274096e-09,
"loss": 0.2341,
"num_input_tokens_seen": 48112128,
"step": 15285
},
{
"epoch": 0.9788105755073299,
"grad_norm": 31.73265838623047,
"learning_rate": 2.751057137727941e-09,
"loss": 0.3353,
"num_input_tokens_seen": 48127616,
"step": 15290
},
{
"epoch": 0.9791306574483067,
"grad_norm": 66.26738739013672,
"learning_rate": 2.66885454114274e-09,
"loss": 0.3961,
"num_input_tokens_seen": 48142144,
"step": 15295
},
{
"epoch": 0.9794507393892836,
"grad_norm": 62.21368408203125,
"learning_rate": 2.5878971242025983e-09,
"loss": 0.3685,
"num_input_tokens_seen": 48158272,
"step": 15300
},
{
"epoch": 0.9797708213302605,
"grad_norm": 26.044448852539062,
"learning_rate": 2.5081849879837746e-09,
"loss": 0.3216,
"num_input_tokens_seen": 48173120,
"step": 15305
},
{
"epoch": 0.9800909032712375,
"grad_norm": 20.764760971069336,
"learning_rate": 2.429718232007771e-09,
"loss": 0.3423,
"num_input_tokens_seen": 48188672,
"step": 15310
},
{
"epoch": 0.9804109852122144,
"grad_norm": 26.268352508544922,
"learning_rate": 2.3524969542414453e-09,
"loss": 0.2693,
"num_input_tokens_seen": 48204480,
"step": 15315
},
{
"epoch": 0.9807310671531912,
"grad_norm": 15.829779624938965,
"learning_rate": 2.2765212510963418e-09,
"loss": 0.3537,
"num_input_tokens_seen": 48219584,
"step": 15320
},
{
"epoch": 0.9810511490941681,
"grad_norm": 32.62509536743164,
"learning_rate": 2.2017912174289164e-09,
"loss": 0.2813,
"num_input_tokens_seen": 48235904,
"step": 15325
},
{
"epoch": 0.981371231035145,
"grad_norm": 31.979473114013672,
"learning_rate": 2.128306946540648e-09,
"loss": 0.3963,
"num_input_tokens_seen": 48252992,
"step": 15330
},
{
"epoch": 0.9816913129761219,
"grad_norm": 31.331867218017578,
"learning_rate": 2.0560685301774792e-09,
"loss": 0.3328,
"num_input_tokens_seen": 48267840,
"step": 15335
},
{
"epoch": 0.9820113949170988,
"grad_norm": 21.032350540161133,
"learning_rate": 1.985076058529933e-09,
"loss": 0.3753,
"num_input_tokens_seen": 48282688,
"step": 15340
},
{
"epoch": 0.9823314768580756,
"grad_norm": 37.26376724243164,
"learning_rate": 1.9153296202328863e-09,
"loss": 0.478,
"num_input_tokens_seen": 48300096,
"step": 15345
},
{
"epoch": 0.9826515587990525,
"grad_norm": 34.11316680908203,
"learning_rate": 1.8468293023656823e-09,
"loss": 0.3943,
"num_input_tokens_seen": 48315136,
"step": 15350
},
{
"epoch": 0.9829716407400294,
"grad_norm": 18.752748489379883,
"learning_rate": 1.7795751904515766e-09,
"loss": 0.4025,
"num_input_tokens_seen": 48330240,
"step": 15355
},
{
"epoch": 0.9832917226810063,
"grad_norm": 58.279869079589844,
"learning_rate": 1.7135673684584019e-09,
"loss": 0.3109,
"num_input_tokens_seen": 48345280,
"step": 15360
},
{
"epoch": 0.9836118046219833,
"grad_norm": 34.251678466796875,
"learning_rate": 1.6488059187974579e-09,
"loss": 0.403,
"num_input_tokens_seen": 48361792,
"step": 15365
},
{
"epoch": 0.9839318865629602,
"grad_norm": 30.479703903198242,
"learning_rate": 1.5852909223242894e-09,
"loss": 0.4034,
"num_input_tokens_seen": 48377408,
"step": 15370
},
{
"epoch": 0.984251968503937,
"grad_norm": 16.166257858276367,
"learning_rate": 1.5230224583380192e-09,
"loss": 0.3679,
"num_input_tokens_seen": 48392896,
"step": 15375
},
{
"epoch": 0.9845720504449139,
"grad_norm": 40.26905822753906,
"learning_rate": 1.4620006045816813e-09,
"loss": 0.4625,
"num_input_tokens_seen": 48407552,
"step": 15380
},
{
"epoch": 0.9848921323858908,
"grad_norm": 16.73556137084961,
"learning_rate": 1.4022254372417774e-09,
"loss": 0.2809,
"num_input_tokens_seen": 48424320,
"step": 15385
},
{
"epoch": 0.9852122143268677,
"grad_norm": 35.438411712646484,
"learning_rate": 1.3436970309481655e-09,
"loss": 0.5055,
"num_input_tokens_seen": 48441984,
"step": 15390
},
{
"epoch": 0.9855322962678446,
"grad_norm": 16.894039154052734,
"learning_rate": 1.2864154587742815e-09,
"loss": 0.333,
"num_input_tokens_seen": 48456832,
"step": 15395
},
{
"epoch": 0.9858523782088214,
"grad_norm": 33.132869720458984,
"learning_rate": 1.2303807922370292e-09,
"loss": 0.3719,
"num_input_tokens_seen": 48472512,
"step": 15400
},
{
"epoch": 0.9861724601497983,
"grad_norm": 62.29145431518555,
"learning_rate": 1.1755931012961128e-09,
"loss": 0.3169,
"num_input_tokens_seen": 48488832,
"step": 15405
},
{
"epoch": 0.9864925420907752,
"grad_norm": 19.01352310180664,
"learning_rate": 1.122052454354705e-09,
"loss": 0.3615,
"num_input_tokens_seen": 48503936,
"step": 15410
},
{
"epoch": 0.9868126240317522,
"grad_norm": 21.924360275268555,
"learning_rate": 1.0697589182590005e-09,
"loss": 0.4383,
"num_input_tokens_seen": 48519040,
"step": 15415
},
{
"epoch": 0.9871327059727291,
"grad_norm": 27.970205307006836,
"learning_rate": 1.018712558297996e-09,
"loss": 0.6,
"num_input_tokens_seen": 48535040,
"step": 15420
},
{
"epoch": 0.9874527879137059,
"grad_norm": 38.23207473754883,
"learning_rate": 9.689134382037113e-10,
"loss": 0.4438,
"num_input_tokens_seen": 48551808,
"step": 15425
},
{
"epoch": 0.9877728698546828,
"grad_norm": 37.27165985107422,
"learning_rate": 9.203616201508557e-10,
"loss": 0.3976,
"num_input_tokens_seen": 48566592,
"step": 15430
},
{
"epoch": 0.9880929517956597,
"grad_norm": 48.10826110839844,
"learning_rate": 8.730571647570517e-10,
"loss": 0.3103,
"num_input_tokens_seen": 48582720,
"step": 15435
},
{
"epoch": 0.9884130337366366,
"grad_norm": 50.18999099731445,
"learning_rate": 8.270001310825003e-10,
"loss": 0.4765,
"num_input_tokens_seen": 48599104,
"step": 15440
},
{
"epoch": 0.9887331156776135,
"grad_norm": 13.126496315002441,
"learning_rate": 7.821905766297599e-10,
"loss": 0.3114,
"num_input_tokens_seen": 48615040,
"step": 15445
},
{
"epoch": 0.9890531976185903,
"grad_norm": 27.947267532348633,
"learning_rate": 7.386285573441897e-10,
"loss": 0.3971,
"num_input_tokens_seen": 48630976,
"step": 15450
},
{
"epoch": 0.9893732795595672,
"grad_norm": 27.045764923095703,
"learning_rate": 6.963141276136175e-10,
"loss": 0.283,
"num_input_tokens_seen": 48646080,
"step": 15455
},
{
"epoch": 0.9896933615005441,
"grad_norm": 19.698862075805664,
"learning_rate": 6.552473402678949e-10,
"loss": 0.2476,
"num_input_tokens_seen": 48662528,
"step": 15460
},
{
"epoch": 0.990013443441521,
"grad_norm": 46.163021087646484,
"learning_rate": 6.154282465794524e-10,
"loss": 0.3244,
"num_input_tokens_seen": 48680000,
"step": 15465
},
{
"epoch": 0.990333525382498,
"grad_norm": 30.393213272094727,
"learning_rate": 5.768568962629672e-10,
"loss": 0.4256,
"num_input_tokens_seen": 48696256,
"step": 15470
},
{
"epoch": 0.9906536073234748,
"grad_norm": 43.717071533203125,
"learning_rate": 5.395333374751398e-10,
"loss": 0.3062,
"num_input_tokens_seen": 48711168,
"step": 15475
},
{
"epoch": 0.9909736892644517,
"grad_norm": 45.56324005126953,
"learning_rate": 5.034576168149174e-10,
"loss": 0.5477,
"num_input_tokens_seen": 48726848,
"step": 15480
},
{
"epoch": 0.9912937712054286,
"grad_norm": 48.832122802734375,
"learning_rate": 4.686297793231597e-10,
"loss": 0.4838,
"num_input_tokens_seen": 48743232,
"step": 15485
},
{
"epoch": 0.9916138531464055,
"grad_norm": 26.313810348510742,
"learning_rate": 4.350498684829729e-10,
"loss": 0.4541,
"num_input_tokens_seen": 48758080,
"step": 15490
},
{
"epoch": 0.9919339350873824,
"grad_norm": 42.293983459472656,
"learning_rate": 4.0271792621926483e-10,
"loss": 0.3123,
"num_input_tokens_seen": 48773120,
"step": 15495
},
{
"epoch": 0.9922540170283592,
"grad_norm": 14.059409141540527,
"learning_rate": 3.716339928987455e-10,
"loss": 0.3749,
"num_input_tokens_seen": 48789056,
"step": 15500
},
{
"epoch": 0.9925740989693361,
"grad_norm": 67.56918334960938,
"learning_rate": 3.41798107330149e-10,
"loss": 0.4189,
"num_input_tokens_seen": 48804288,
"step": 15505
},
{
"epoch": 0.992894180910313,
"grad_norm": 35.841514587402344,
"learning_rate": 3.1321030676390027e-10,
"loss": 0.3683,
"num_input_tokens_seen": 48818816,
"step": 15510
},
{
"epoch": 0.9932142628512899,
"grad_norm": 25.882349014282227,
"learning_rate": 2.8587062689222617e-10,
"loss": 0.291,
"num_input_tokens_seen": 48835520,
"step": 15515
},
{
"epoch": 0.9935343447922668,
"grad_norm": 29.71622085571289,
"learning_rate": 2.5977910184904473e-10,
"loss": 0.3139,
"num_input_tokens_seen": 48851328,
"step": 15520
},
{
"epoch": 0.9938544267332438,
"grad_norm": 35.663578033447266,
"learning_rate": 2.3493576420985373e-10,
"loss": 0.3466,
"num_input_tokens_seen": 48866304,
"step": 15525
},
{
"epoch": 0.9941745086742206,
"grad_norm": 15.850737571716309,
"learning_rate": 2.11340644991842e-10,
"loss": 0.3311,
"num_input_tokens_seen": 48882752,
"step": 15530
},
{
"epoch": 0.9944945906151975,
"grad_norm": 39.47983169555664,
"learning_rate": 1.8899377365388936e-10,
"loss": 0.3046,
"num_input_tokens_seen": 48898304,
"step": 15535
},
{
"epoch": 0.9948146725561744,
"grad_norm": 19.749874114990234,
"learning_rate": 1.6789517809634447e-10,
"loss": 0.4284,
"num_input_tokens_seen": 48914048,
"step": 15540
},
{
"epoch": 0.9951347544971513,
"grad_norm": 66.90821075439453,
"learning_rate": 1.480448846609139e-10,
"loss": 0.3291,
"num_input_tokens_seen": 48930176,
"step": 15545
},
{
"epoch": 0.9954548364381282,
"grad_norm": 25.663530349731445,
"learning_rate": 1.294429181311063e-10,
"loss": 0.3522,
"num_input_tokens_seen": 48945920,
"step": 15550
},
{
"epoch": 0.995774918379105,
"grad_norm": 23.315927505493164,
"learning_rate": 1.1208930173145503e-10,
"loss": 0.4063,
"num_input_tokens_seen": 48960832,
"step": 15555
},
{
"epoch": 0.9960950003200819,
"grad_norm": 21.742311477661133,
"learning_rate": 9.598405712840651e-11,
"loss": 0.3278,
"num_input_tokens_seen": 48977280,
"step": 15560
},
{
"epoch": 0.9964150822610588,
"grad_norm": 21.519027709960938,
"learning_rate": 8.1127204429432e-11,
"loss": 0.3526,
"num_input_tokens_seen": 48992512,
"step": 15565
},
{
"epoch": 0.9967351642020357,
"grad_norm": 32.181678771972656,
"learning_rate": 6.751876218336061e-11,
"loss": 0.351,
"num_input_tokens_seen": 49008128,
"step": 15570
},
{
"epoch": 0.9970552461430127,
"grad_norm": 24.76947593688965,
"learning_rate": 5.515874738071247e-11,
"loss": 0.3451,
"num_input_tokens_seen": 49024512,
"step": 15575
},
{
"epoch": 0.9973753280839895,
"grad_norm": 49.30158615112305,
"learning_rate": 4.404717545303249e-11,
"loss": 0.3131,
"num_input_tokens_seen": 49040128,
"step": 15580
},
{
"epoch": 0.9976954100249664,
"grad_norm": 15.697863578796387,
"learning_rate": 3.418406027322352e-11,
"loss": 0.3111,
"num_input_tokens_seen": 49055360,
"step": 15585
},
{
"epoch": 0.9980154919659433,
"grad_norm": 33.91520309448242,
"learning_rate": 2.5569414155546254e-11,
"loss": 0.3576,
"num_input_tokens_seen": 49071360,
"step": 15590
},
{
"epoch": 0.9983355739069202,
"grad_norm": 49.540733337402344,
"learning_rate": 1.8203247855397287e-11,
"loss": 0.2698,
"num_input_tokens_seen": 49086144,
"step": 15595
},
{
"epoch": 0.9986556558478971,
"grad_norm": 34.655052185058594,
"learning_rate": 1.2085570569642101e-11,
"loss": 0.3915,
"num_input_tokens_seen": 49101312,
"step": 15600
},
{
"epoch": 0.9989757377888739,
"grad_norm": 57.687381744384766,
"learning_rate": 7.216389936171019e-12,
"loss": 0.3151,
"num_input_tokens_seen": 49116672,
"step": 15605
},
{
"epoch": 0.9992958197298508,
"grad_norm": 17.42046546936035,
"learning_rate": 3.5957120342322567e-12,
"loss": 0.1751,
"num_input_tokens_seen": 49132288,
"step": 15610
},
{
"epoch": 0.9996159016708277,
"grad_norm": 17.059843063354492,
"learning_rate": 1.2235413842098807e-12,
"loss": 0.3884,
"num_input_tokens_seen": 49148096,
"step": 15615
},
{
"epoch": 0.9999359836118046,
"grad_norm": 20.215747833251953,
"learning_rate": 9.98809480678986e-14,
"loss": 0.2485,
"num_input_tokens_seen": 49163840,
"step": 15620
},
{
"epoch": 1.0,
"num_input_tokens_seen": 49166912,
"step": 15621,
"total_flos": 2.8707953551107686e+17,
"train_loss": 0.44674425404505724,
"train_runtime": 6032.4024,
"train_samples_per_second": 20.716,
"train_steps_per_second": 2.59
}
],
"logging_steps": 5,
"max_steps": 15621,
"num_input_tokens_seen": 49166912,
"num_train_epochs": 1,
"save_steps": 782,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.8707953551107686e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}