Files
train_boolq_42_1776331558/trainer_state.json

8704 lines
227 KiB
JSON
Raw Normal View History

{
"best_global_step": 1064,
"best_metric": 0.18848362565040588,
"best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_boolq_42_1776331558/checkpoint-1064",
"epoch": 5.0,
"eval_steps": 266,
"global_step": 5305,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00471253534401508,
"grad_norm": 318.0024719238281,
"learning_rate": 3.766478342749529e-08,
"loss": 0.8967,
"num_input_tokens_seen": 10752,
"step": 5
},
{
"epoch": 0.00942507068803016,
"grad_norm": 311.7052001953125,
"learning_rate": 8.474576271186442e-08,
"loss": 0.865,
"num_input_tokens_seen": 20736,
"step": 10
},
{
"epoch": 0.01413760603204524,
"grad_norm": 320.9153747558594,
"learning_rate": 1.3182674199623353e-07,
"loss": 0.8712,
"num_input_tokens_seen": 31232,
"step": 15
},
{
"epoch": 0.01885014137606032,
"grad_norm": 198.89523315429688,
"learning_rate": 1.7890772128060264e-07,
"loss": 0.7056,
"num_input_tokens_seen": 40320,
"step": 20
},
{
"epoch": 0.0235626767200754,
"grad_norm": 119.11978149414062,
"learning_rate": 2.2598870056497177e-07,
"loss": 0.5698,
"num_input_tokens_seen": 51136,
"step": 25
},
{
"epoch": 0.02827521206409048,
"grad_norm": 40.273216247558594,
"learning_rate": 2.730696798493409e-07,
"loss": 0.4948,
"num_input_tokens_seen": 63424,
"step": 30
},
{
"epoch": 0.03298774740810556,
"grad_norm": 48.785831451416016,
"learning_rate": 3.2015065913371e-07,
"loss": 0.3516,
"num_input_tokens_seen": 75328,
"step": 35
},
{
"epoch": 0.03770028275212064,
"grad_norm": 40.042724609375,
"learning_rate": 3.6723163841807916e-07,
"loss": 0.3321,
"num_input_tokens_seen": 86784,
"step": 40
},
{
"epoch": 0.04241281809613572,
"grad_norm": 27.21363639831543,
"learning_rate": 4.1431261770244826e-07,
"loss": 0.3329,
"num_input_tokens_seen": 99136,
"step": 45
},
{
"epoch": 0.0471253534401508,
"grad_norm": 165.71754455566406,
"learning_rate": 4.613935969868174e-07,
"loss": 0.4395,
"num_input_tokens_seen": 108672,
"step": 50
},
{
"epoch": 0.051837888784165884,
"grad_norm": 41.76438903808594,
"learning_rate": 5.084745762711865e-07,
"loss": 0.4549,
"num_input_tokens_seen": 119808,
"step": 55
},
{
"epoch": 0.05655042412818096,
"grad_norm": 67.6883544921875,
"learning_rate": 5.555555555555555e-07,
"loss": 0.3434,
"num_input_tokens_seen": 130048,
"step": 60
},
{
"epoch": 0.061262959472196045,
"grad_norm": 72.87140655517578,
"learning_rate": 6.026365348399247e-07,
"loss": 0.3434,
"num_input_tokens_seen": 142464,
"step": 65
},
{
"epoch": 0.06597549481621112,
"grad_norm": 34.45458221435547,
"learning_rate": 6.497175141242938e-07,
"loss": 0.3516,
"num_input_tokens_seen": 154048,
"step": 70
},
{
"epoch": 0.0706880301602262,
"grad_norm": 33.45477294921875,
"learning_rate": 6.96798493408663e-07,
"loss": 0.3088,
"num_input_tokens_seen": 166720,
"step": 75
},
{
"epoch": 0.07540056550424128,
"grad_norm": 39.73252868652344,
"learning_rate": 7.43879472693032e-07,
"loss": 0.3218,
"num_input_tokens_seen": 179200,
"step": 80
},
{
"epoch": 0.08011310084825636,
"grad_norm": 39.0076789855957,
"learning_rate": 7.909604519774013e-07,
"loss": 0.3962,
"num_input_tokens_seen": 190464,
"step": 85
},
{
"epoch": 0.08482563619227144,
"grad_norm": 23.60104751586914,
"learning_rate": 8.380414312617704e-07,
"loss": 0.3243,
"num_input_tokens_seen": 205376,
"step": 90
},
{
"epoch": 0.08953817153628653,
"grad_norm": 43.6406364440918,
"learning_rate": 8.851224105461394e-07,
"loss": 0.383,
"num_input_tokens_seen": 215424,
"step": 95
},
{
"epoch": 0.0942507068803016,
"grad_norm": 20.507476806640625,
"learning_rate": 9.322033898305086e-07,
"loss": 0.2987,
"num_input_tokens_seen": 226688,
"step": 100
},
{
"epoch": 0.09896324222431668,
"grad_norm": 27.949003219604492,
"learning_rate": 9.792843691148776e-07,
"loss": 0.2859,
"num_input_tokens_seen": 237248,
"step": 105
},
{
"epoch": 0.10367577756833177,
"grad_norm": 44.55869674682617,
"learning_rate": 1.0263653483992468e-06,
"loss": 0.3517,
"num_input_tokens_seen": 254144,
"step": 110
},
{
"epoch": 0.10838831291234684,
"grad_norm": 65.89197540283203,
"learning_rate": 1.073446327683616e-06,
"loss": 0.296,
"num_input_tokens_seen": 265920,
"step": 115
},
{
"epoch": 0.11310084825636192,
"grad_norm": 77.50157165527344,
"learning_rate": 1.120527306967985e-06,
"loss": 0.4375,
"num_input_tokens_seen": 277184,
"step": 120
},
{
"epoch": 0.117813383600377,
"grad_norm": 48.51222229003906,
"learning_rate": 1.167608286252354e-06,
"loss": 0.2998,
"num_input_tokens_seen": 289024,
"step": 125
},
{
"epoch": 0.12252591894439209,
"grad_norm": 36.21791076660156,
"learning_rate": 1.2146892655367234e-06,
"loss": 0.2883,
"num_input_tokens_seen": 299456,
"step": 130
},
{
"epoch": 0.12723845428840716,
"grad_norm": 38.02757263183594,
"learning_rate": 1.2617702448210926e-06,
"loss": 0.3329,
"num_input_tokens_seen": 313728,
"step": 135
},
{
"epoch": 0.13195098963242224,
"grad_norm": 26.75225257873535,
"learning_rate": 1.3088512241054615e-06,
"loss": 0.2533,
"num_input_tokens_seen": 326080,
"step": 140
},
{
"epoch": 0.13666352497643733,
"grad_norm": 31.08643341064453,
"learning_rate": 1.3559322033898307e-06,
"loss": 0.2705,
"num_input_tokens_seen": 338688,
"step": 145
},
{
"epoch": 0.1413760603204524,
"grad_norm": 38.68683624267578,
"learning_rate": 1.4030131826741996e-06,
"loss": 0.3576,
"num_input_tokens_seen": 349632,
"step": 150
},
{
"epoch": 0.1460885956644675,
"grad_norm": 36.93141174316406,
"learning_rate": 1.4500941619585688e-06,
"loss": 0.2256,
"num_input_tokens_seen": 363968,
"step": 155
},
{
"epoch": 0.15080113100848255,
"grad_norm": 47.699241638183594,
"learning_rate": 1.4971751412429381e-06,
"loss": 0.4483,
"num_input_tokens_seen": 375680,
"step": 160
},
{
"epoch": 0.15551366635249764,
"grad_norm": 24.248348236083984,
"learning_rate": 1.544256120527307e-06,
"loss": 0.269,
"num_input_tokens_seen": 386368,
"step": 165
},
{
"epoch": 0.16022620169651272,
"grad_norm": 18.501712799072266,
"learning_rate": 1.5913370998116762e-06,
"loss": 0.3019,
"num_input_tokens_seen": 396992,
"step": 170
},
{
"epoch": 0.1649387370405278,
"grad_norm": 13.451111793518066,
"learning_rate": 1.6384180790960452e-06,
"loss": 0.3423,
"num_input_tokens_seen": 408960,
"step": 175
},
{
"epoch": 0.1696512723845429,
"grad_norm": 24.91837501525879,
"learning_rate": 1.6854990583804145e-06,
"loss": 0.2813,
"num_input_tokens_seen": 419008,
"step": 180
},
{
"epoch": 0.17436380772855797,
"grad_norm": 23.55257225036621,
"learning_rate": 1.7325800376647837e-06,
"loss": 0.2445,
"num_input_tokens_seen": 430144,
"step": 185
},
{
"epoch": 0.17907634307257306,
"grad_norm": 50.514373779296875,
"learning_rate": 1.7796610169491526e-06,
"loss": 0.2385,
"num_input_tokens_seen": 441216,
"step": 190
},
{
"epoch": 0.18378887841658811,
"grad_norm": 63.581478118896484,
"learning_rate": 1.8267419962335218e-06,
"loss": 0.2216,
"num_input_tokens_seen": 451584,
"step": 195
},
{
"epoch": 0.1885014137606032,
"grad_norm": 132.5011444091797,
"learning_rate": 1.873822975517891e-06,
"loss": 0.4569,
"num_input_tokens_seen": 464384,
"step": 200
},
{
"epoch": 0.19321394910461828,
"grad_norm": 17.86366081237793,
"learning_rate": 1.92090395480226e-06,
"loss": 0.4075,
"num_input_tokens_seen": 481792,
"step": 205
},
{
"epoch": 0.19792648444863337,
"grad_norm": 22.02827262878418,
"learning_rate": 1.9679849340866293e-06,
"loss": 0.2703,
"num_input_tokens_seen": 493952,
"step": 210
},
{
"epoch": 0.20263901979264845,
"grad_norm": 22.791790008544922,
"learning_rate": 2.015065913370998e-06,
"loss": 0.2604,
"num_input_tokens_seen": 504832,
"step": 215
},
{
"epoch": 0.20735155513666353,
"grad_norm": 26.63323974609375,
"learning_rate": 2.062146892655367e-06,
"loss": 0.277,
"num_input_tokens_seen": 514368,
"step": 220
},
{
"epoch": 0.21206409048067862,
"grad_norm": 48.75139617919922,
"learning_rate": 2.1092278719397365e-06,
"loss": 0.2405,
"num_input_tokens_seen": 525568,
"step": 225
},
{
"epoch": 0.21677662582469368,
"grad_norm": 29.34063148498535,
"learning_rate": 2.1563088512241055e-06,
"loss": 0.259,
"num_input_tokens_seen": 537664,
"step": 230
},
{
"epoch": 0.22148916116870876,
"grad_norm": 26.771190643310547,
"learning_rate": 2.203389830508475e-06,
"loss": 0.2561,
"num_input_tokens_seen": 547584,
"step": 235
},
{
"epoch": 0.22620169651272384,
"grad_norm": 32.1733512878418,
"learning_rate": 2.2504708097928438e-06,
"loss": 0.3491,
"num_input_tokens_seen": 558144,
"step": 240
},
{
"epoch": 0.23091423185673893,
"grad_norm": 17.223844528198242,
"learning_rate": 2.297551789077213e-06,
"loss": 0.2543,
"num_input_tokens_seen": 569024,
"step": 245
},
{
"epoch": 0.235626767200754,
"grad_norm": 28.303009033203125,
"learning_rate": 2.344632768361582e-06,
"loss": 0.3033,
"num_input_tokens_seen": 580864,
"step": 250
},
{
"epoch": 0.2403393025447691,
"grad_norm": 50.68221664428711,
"learning_rate": 2.391713747645951e-06,
"loss": 0.2747,
"num_input_tokens_seen": 592768,
"step": 255
},
{
"epoch": 0.24505183788878418,
"grad_norm": 22.0999698638916,
"learning_rate": 2.4387947269303204e-06,
"loss": 0.336,
"num_input_tokens_seen": 604032,
"step": 260
},
{
"epoch": 0.24976437323279924,
"grad_norm": 29.03078269958496,
"learning_rate": 2.4858757062146898e-06,
"loss": 0.2277,
"num_input_tokens_seen": 616256,
"step": 265
},
{
"epoch": 0.25070688030160226,
"eval_loss": 0.25048765540122986,
"eval_runtime": 2.7618,
"eval_samples_per_second": 341.444,
"eval_steps_per_second": 42.726,
"num_input_tokens_seen": 618432,
"step": 266
},
{
"epoch": 0.2544769085768143,
"grad_norm": 33.82787322998047,
"learning_rate": 2.5329566854990583e-06,
"loss": 0.2331,
"num_input_tokens_seen": 627072,
"step": 270
},
{
"epoch": 0.25918944392082943,
"grad_norm": 19.519996643066406,
"learning_rate": 2.5800376647834272e-06,
"loss": 0.157,
"num_input_tokens_seen": 638592,
"step": 275
},
{
"epoch": 0.2639019792648445,
"grad_norm": 43.415740966796875,
"learning_rate": 2.627118644067797e-06,
"loss": 0.3209,
"num_input_tokens_seen": 648448,
"step": 280
},
{
"epoch": 0.26861451460885954,
"grad_norm": 33.58452606201172,
"learning_rate": 2.674199623352166e-06,
"loss": 0.2578,
"num_input_tokens_seen": 662784,
"step": 285
},
{
"epoch": 0.27332704995287466,
"grad_norm": 22.619098663330078,
"learning_rate": 2.7212806026365353e-06,
"loss": 0.3557,
"num_input_tokens_seen": 673856,
"step": 290
},
{
"epoch": 0.2780395852968897,
"grad_norm": 19.780139923095703,
"learning_rate": 2.7683615819209043e-06,
"loss": 0.2089,
"num_input_tokens_seen": 683136,
"step": 295
},
{
"epoch": 0.2827521206409048,
"grad_norm": 16.88971519470215,
"learning_rate": 2.8154425612052732e-06,
"loss": 0.2989,
"num_input_tokens_seen": 694784,
"step": 300
},
{
"epoch": 0.2874646559849199,
"grad_norm": 18.517738342285156,
"learning_rate": 2.862523540489642e-06,
"loss": 0.2632,
"num_input_tokens_seen": 706624,
"step": 305
},
{
"epoch": 0.292177191328935,
"grad_norm": 26.673988342285156,
"learning_rate": 2.9096045197740115e-06,
"loss": 0.2979,
"num_input_tokens_seen": 716800,
"step": 310
},
{
"epoch": 0.29688972667295005,
"grad_norm": 40.26359176635742,
"learning_rate": 2.9566854990583805e-06,
"loss": 0.3261,
"num_input_tokens_seen": 728704,
"step": 315
},
{
"epoch": 0.3016022620169651,
"grad_norm": 20.343751907348633,
"learning_rate": 3.00376647834275e-06,
"loss": 0.1851,
"num_input_tokens_seen": 740352,
"step": 320
},
{
"epoch": 0.3063147973609802,
"grad_norm": 30.753236770629883,
"learning_rate": 3.0508474576271192e-06,
"loss": 0.2727,
"num_input_tokens_seen": 751936,
"step": 325
},
{
"epoch": 0.3110273327049953,
"grad_norm": 27.787220001220703,
"learning_rate": 3.097928436911488e-06,
"loss": 0.3077,
"num_input_tokens_seen": 763264,
"step": 330
},
{
"epoch": 0.3157398680490104,
"grad_norm": 13.1635103225708,
"learning_rate": 3.145009416195857e-06,
"loss": 0.3285,
"num_input_tokens_seen": 772992,
"step": 335
},
{
"epoch": 0.32045240339302544,
"grad_norm": 32.643653869628906,
"learning_rate": 3.192090395480226e-06,
"loss": 0.2493,
"num_input_tokens_seen": 787008,
"step": 340
},
{
"epoch": 0.32516493873704055,
"grad_norm": 31.31944465637207,
"learning_rate": 3.2391713747645954e-06,
"loss": 0.233,
"num_input_tokens_seen": 798848,
"step": 345
},
{
"epoch": 0.3298774740810556,
"grad_norm": 69.64946746826172,
"learning_rate": 3.2862523540489644e-06,
"loss": 0.3409,
"num_input_tokens_seen": 811584,
"step": 350
},
{
"epoch": 0.33459000942507067,
"grad_norm": 21.657026290893555,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1946,
"num_input_tokens_seen": 822208,
"step": 355
},
{
"epoch": 0.3393025447690858,
"grad_norm": 48.79526138305664,
"learning_rate": 3.3804143126177023e-06,
"loss": 0.3024,
"num_input_tokens_seen": 833792,
"step": 360
},
{
"epoch": 0.34401508011310084,
"grad_norm": 22.62244415283203,
"learning_rate": 3.427495291902072e-06,
"loss": 0.347,
"num_input_tokens_seen": 845568,
"step": 365
},
{
"epoch": 0.34872761545711595,
"grad_norm": 23.822986602783203,
"learning_rate": 3.474576271186441e-06,
"loss": 0.3787,
"num_input_tokens_seen": 855040,
"step": 370
},
{
"epoch": 0.353440150801131,
"grad_norm": 8.774383544921875,
"learning_rate": 3.5216572504708104e-06,
"loss": 0.3066,
"num_input_tokens_seen": 867712,
"step": 375
},
{
"epoch": 0.3581526861451461,
"grad_norm": 8.585469245910645,
"learning_rate": 3.5687382297551793e-06,
"loss": 0.2843,
"num_input_tokens_seen": 880960,
"step": 380
},
{
"epoch": 0.36286522148916117,
"grad_norm": 16.818485260009766,
"learning_rate": 3.6158192090395483e-06,
"loss": 0.2422,
"num_input_tokens_seen": 894784,
"step": 385
},
{
"epoch": 0.36757775683317623,
"grad_norm": 16.943395614624023,
"learning_rate": 3.662900188323917e-06,
"loss": 0.2664,
"num_input_tokens_seen": 905600,
"step": 390
},
{
"epoch": 0.37229029217719134,
"grad_norm": 26.195528030395508,
"learning_rate": 3.7099811676082866e-06,
"loss": 0.3223,
"num_input_tokens_seen": 915072,
"step": 395
},
{
"epoch": 0.3770028275212064,
"grad_norm": 37.17734146118164,
"learning_rate": 3.7570621468926555e-06,
"loss": 0.2066,
"num_input_tokens_seen": 927552,
"step": 400
},
{
"epoch": 0.3817153628652215,
"grad_norm": 43.06858444213867,
"learning_rate": 3.8041431261770245e-06,
"loss": 0.2964,
"num_input_tokens_seen": 940160,
"step": 405
},
{
"epoch": 0.38642789820923656,
"grad_norm": 31.916301727294922,
"learning_rate": 3.851224105461394e-06,
"loss": 0.1989,
"num_input_tokens_seen": 949760,
"step": 410
},
{
"epoch": 0.3911404335532517,
"grad_norm": 21.781017303466797,
"learning_rate": 3.898305084745763e-06,
"loss": 0.2152,
"num_input_tokens_seen": 960896,
"step": 415
},
{
"epoch": 0.39585296889726673,
"grad_norm": 18.697711944580078,
"learning_rate": 3.945386064030132e-06,
"loss": 0.3237,
"num_input_tokens_seen": 971648,
"step": 420
},
{
"epoch": 0.4005655042412818,
"grad_norm": 24.69769287109375,
"learning_rate": 3.992467043314501e-06,
"loss": 0.2182,
"num_input_tokens_seen": 981504,
"step": 425
},
{
"epoch": 0.4052780395852969,
"grad_norm": 26.988998413085938,
"learning_rate": 4.03954802259887e-06,
"loss": 0.1811,
"num_input_tokens_seen": 993664,
"step": 430
},
{
"epoch": 0.40999057492931196,
"grad_norm": 39.637481689453125,
"learning_rate": 4.08662900188324e-06,
"loss": 0.2151,
"num_input_tokens_seen": 1003008,
"step": 435
},
{
"epoch": 0.41470311027332707,
"grad_norm": 36.60890579223633,
"learning_rate": 4.133709981167609e-06,
"loss": 0.2333,
"num_input_tokens_seen": 1013248,
"step": 440
},
{
"epoch": 0.4194156456173421,
"grad_norm": 21.840532302856445,
"learning_rate": 4.180790960451978e-06,
"loss": 0.2694,
"num_input_tokens_seen": 1023296,
"step": 445
},
{
"epoch": 0.42412818096135724,
"grad_norm": 48.131988525390625,
"learning_rate": 4.2278719397363475e-06,
"loss": 0.3493,
"num_input_tokens_seen": 1033152,
"step": 450
},
{
"epoch": 0.4288407163053723,
"grad_norm": 30.9437198638916,
"learning_rate": 4.2749529190207165e-06,
"loss": 0.2147,
"num_input_tokens_seen": 1045248,
"step": 455
},
{
"epoch": 0.43355325164938735,
"grad_norm": 38.648189544677734,
"learning_rate": 4.322033898305085e-06,
"loss": 0.3071,
"num_input_tokens_seen": 1058944,
"step": 460
},
{
"epoch": 0.43826578699340246,
"grad_norm": 45.260948181152344,
"learning_rate": 4.369114877589454e-06,
"loss": 0.2118,
"num_input_tokens_seen": 1071680,
"step": 465
},
{
"epoch": 0.4429783223374175,
"grad_norm": 30.458763122558594,
"learning_rate": 4.416195856873823e-06,
"loss": 0.2319,
"num_input_tokens_seen": 1083328,
"step": 470
},
{
"epoch": 0.44769085768143263,
"grad_norm": 44.823246002197266,
"learning_rate": 4.463276836158192e-06,
"loss": 0.4004,
"num_input_tokens_seen": 1097600,
"step": 475
},
{
"epoch": 0.4524033930254477,
"grad_norm": 30.685049057006836,
"learning_rate": 4.510357815442561e-06,
"loss": 0.218,
"num_input_tokens_seen": 1109376,
"step": 480
},
{
"epoch": 0.4571159283694628,
"grad_norm": 20.631444931030273,
"learning_rate": 4.55743879472693e-06,
"loss": 0.2579,
"num_input_tokens_seen": 1120448,
"step": 485
},
{
"epoch": 0.46182846371347785,
"grad_norm": 27.939373016357422,
"learning_rate": 4.6045197740113e-06,
"loss": 0.298,
"num_input_tokens_seen": 1131392,
"step": 490
},
{
"epoch": 0.4665409990574929,
"grad_norm": 13.28208065032959,
"learning_rate": 4.651600753295669e-06,
"loss": 0.2203,
"num_input_tokens_seen": 1143552,
"step": 495
},
{
"epoch": 0.471253534401508,
"grad_norm": 55.77337646484375,
"learning_rate": 4.698681732580039e-06,
"loss": 0.1785,
"num_input_tokens_seen": 1153088,
"step": 500
},
{
"epoch": 0.4759660697455231,
"grad_norm": 32.49051284790039,
"learning_rate": 4.745762711864408e-06,
"loss": 0.2125,
"num_input_tokens_seen": 1166080,
"step": 505
},
{
"epoch": 0.4806786050895382,
"grad_norm": 63.475467681884766,
"learning_rate": 4.7928436911487765e-06,
"loss": 0.3882,
"num_input_tokens_seen": 1176896,
"step": 510
},
{
"epoch": 0.48539114043355325,
"grad_norm": 49.208858489990234,
"learning_rate": 4.8399246704331455e-06,
"loss": 0.2743,
"num_input_tokens_seen": 1187392,
"step": 515
},
{
"epoch": 0.49010367577756836,
"grad_norm": 38.7435417175293,
"learning_rate": 4.8870056497175144e-06,
"loss": 0.3309,
"num_input_tokens_seen": 1196672,
"step": 520
},
{
"epoch": 0.4948162111215834,
"grad_norm": 37.79723358154297,
"learning_rate": 4.934086629001883e-06,
"loss": 0.3264,
"num_input_tokens_seen": 1209344,
"step": 525
},
{
"epoch": 0.49952874646559847,
"grad_norm": 14.449431419372559,
"learning_rate": 4.981167608286252e-06,
"loss": 0.2193,
"num_input_tokens_seen": 1221504,
"step": 530
},
{
"epoch": 0.5014137606032045,
"eval_loss": 0.31662699580192566,
"eval_runtime": 3.716,
"eval_samples_per_second": 253.768,
"eval_steps_per_second": 31.755,
"num_input_tokens_seen": 1225408,
"step": 532
},
{
"epoch": 0.5042412818096136,
"grad_norm": 28.057838439941406,
"learning_rate": 4.999995128224159e-06,
"loss": 0.3059,
"num_input_tokens_seen": 1232256,
"step": 535
},
{
"epoch": 0.5089538171536286,
"grad_norm": 24.673755645751953,
"learning_rate": 4.999965356329446e-06,
"loss": 0.2494,
"num_input_tokens_seen": 1242880,
"step": 540
},
{
"epoch": 0.5136663524976437,
"grad_norm": 24.47283172607422,
"learning_rate": 4.99990851940408e-06,
"loss": 0.2812,
"num_input_tokens_seen": 1253248,
"step": 545
},
{
"epoch": 0.5183788878416589,
"grad_norm": 34.05777359008789,
"learning_rate": 4.999824618063384e-06,
"loss": 0.2639,
"num_input_tokens_seen": 1265280,
"step": 550
},
{
"epoch": 0.5230914231856739,
"grad_norm": 23.17024803161621,
"learning_rate": 4.99971365321569e-06,
"loss": 0.3403,
"num_input_tokens_seen": 1275328,
"step": 555
},
{
"epoch": 0.527803958529689,
"grad_norm": 14.374199867248535,
"learning_rate": 4.9995756260623194e-06,
"loss": 0.2942,
"num_input_tokens_seen": 1286272,
"step": 560
},
{
"epoch": 0.532516493873704,
"grad_norm": 23.844751358032227,
"learning_rate": 4.999410538097579e-06,
"loss": 0.2036,
"num_input_tokens_seen": 1298816,
"step": 565
},
{
"epoch": 0.5372290292177191,
"grad_norm": 25.96900177001953,
"learning_rate": 4.999218391108735e-06,
"loss": 0.2656,
"num_input_tokens_seen": 1311680,
"step": 570
},
{
"epoch": 0.5419415645617343,
"grad_norm": 43.49790954589844,
"learning_rate": 4.9989991871760054e-06,
"loss": 0.2828,
"num_input_tokens_seen": 1324672,
"step": 575
},
{
"epoch": 0.5466540999057493,
"grad_norm": 14.262967109680176,
"learning_rate": 4.998752928672525e-06,
"loss": 0.2081,
"num_input_tokens_seen": 1336896,
"step": 580
},
{
"epoch": 0.5513666352497644,
"grad_norm": 3.46243953704834,
"learning_rate": 4.9984796182643285e-06,
"loss": 0.1485,
"num_input_tokens_seen": 1348928,
"step": 585
},
{
"epoch": 0.5560791705937794,
"grad_norm": 34.33613204956055,
"learning_rate": 4.99817925891032e-06,
"loss": 0.6339,
"num_input_tokens_seen": 1362496,
"step": 590
},
{
"epoch": 0.5607917059377945,
"grad_norm": 34.32979965209961,
"learning_rate": 4.997851853862237e-06,
"loss": 0.5079,
"num_input_tokens_seen": 1377152,
"step": 595
},
{
"epoch": 0.5655042412818096,
"grad_norm": 37.79439926147461,
"learning_rate": 4.997497406664621e-06,
"loss": 0.2396,
"num_input_tokens_seen": 1390016,
"step": 600
},
{
"epoch": 0.5702167766258247,
"grad_norm": 17.340059280395508,
"learning_rate": 4.997115921154774e-06,
"loss": 0.2335,
"num_input_tokens_seen": 1401856,
"step": 605
},
{
"epoch": 0.5749293119698398,
"grad_norm": 29.506044387817383,
"learning_rate": 4.9967074014627206e-06,
"loss": 0.1719,
"num_input_tokens_seen": 1412736,
"step": 610
},
{
"epoch": 0.5796418473138548,
"grad_norm": 37.41396713256836,
"learning_rate": 4.996271852011161e-06,
"loss": 0.3201,
"num_input_tokens_seen": 1425280,
"step": 615
},
{
"epoch": 0.58435438265787,
"grad_norm": 80.07007598876953,
"learning_rate": 4.995809277515424e-06,
"loss": 0.1993,
"num_input_tokens_seen": 1436480,
"step": 620
},
{
"epoch": 0.589066918001885,
"grad_norm": 25.41455841064453,
"learning_rate": 4.995319682983417e-06,
"loss": 0.3072,
"num_input_tokens_seen": 1447808,
"step": 625
},
{
"epoch": 0.5937794533459001,
"grad_norm": 25.022064208984375,
"learning_rate": 4.99480307371557e-06,
"loss": 0.3263,
"num_input_tokens_seen": 1460352,
"step": 630
},
{
"epoch": 0.5984919886899152,
"grad_norm": 15.871000289916992,
"learning_rate": 4.9942594553047775e-06,
"loss": 0.2747,
"num_input_tokens_seen": 1472640,
"step": 635
},
{
"epoch": 0.6032045240339302,
"grad_norm": 9.052998542785645,
"learning_rate": 4.993688833636341e-06,
"loss": 0.2984,
"num_input_tokens_seen": 1482688,
"step": 640
},
{
"epoch": 0.6079170593779454,
"grad_norm": 11.553794860839844,
"learning_rate": 4.993091214887904e-06,
"loss": 0.2671,
"num_input_tokens_seen": 1494336,
"step": 645
},
{
"epoch": 0.6126295947219604,
"grad_norm": 11.756524085998535,
"learning_rate": 4.992466605529384e-06,
"loss": 0.1511,
"num_input_tokens_seen": 1504896,
"step": 650
},
{
"epoch": 0.6173421300659755,
"grad_norm": 34.58543395996094,
"learning_rate": 4.991815012322902e-06,
"loss": 0.3427,
"num_input_tokens_seen": 1518592,
"step": 655
},
{
"epoch": 0.6220546654099905,
"grad_norm": 26.234251022338867,
"learning_rate": 4.991136442322713e-06,
"loss": 0.2164,
"num_input_tokens_seen": 1531264,
"step": 660
},
{
"epoch": 0.6267672007540056,
"grad_norm": 22.865320205688477,
"learning_rate": 4.990430902875125e-06,
"loss": 0.2187,
"num_input_tokens_seen": 1541376,
"step": 665
},
{
"epoch": 0.6314797360980208,
"grad_norm": 17.565690994262695,
"learning_rate": 4.989698401618423e-06,
"loss": 0.2911,
"num_input_tokens_seen": 1551424,
"step": 670
},
{
"epoch": 0.6361922714420358,
"grad_norm": 31.6148624420166,
"learning_rate": 4.988938946482786e-06,
"loss": 0.1331,
"num_input_tokens_seen": 1562624,
"step": 675
},
{
"epoch": 0.6409048067860509,
"grad_norm": 18.770896911621094,
"learning_rate": 4.988152545690197e-06,
"loss": 0.2686,
"num_input_tokens_seen": 1574016,
"step": 680
},
{
"epoch": 0.6456173421300659,
"grad_norm": 43.168392181396484,
"learning_rate": 4.987339207754358e-06,
"loss": 0.3307,
"num_input_tokens_seen": 1586688,
"step": 685
},
{
"epoch": 0.6503298774740811,
"grad_norm": 11.960392951965332,
"learning_rate": 4.9864989414806e-06,
"loss": 0.247,
"num_input_tokens_seen": 1596992,
"step": 690
},
{
"epoch": 0.6550424128180962,
"grad_norm": 13.759454727172852,
"learning_rate": 4.985631755965779e-06,
"loss": 0.3232,
"num_input_tokens_seen": 1609920,
"step": 695
},
{
"epoch": 0.6597549481621112,
"grad_norm": 11.490863800048828,
"learning_rate": 4.984737660598187e-06,
"loss": 0.2132,
"num_input_tokens_seen": 1620736,
"step": 700
},
{
"epoch": 0.6644674835061263,
"grad_norm": 39.30780029296875,
"learning_rate": 4.983816665057447e-06,
"loss": 0.2797,
"num_input_tokens_seen": 1632512,
"step": 705
},
{
"epoch": 0.6691800188501413,
"grad_norm": 10.898017883300781,
"learning_rate": 4.982868779314405e-06,
"loss": 0.3142,
"num_input_tokens_seen": 1643264,
"step": 710
},
{
"epoch": 0.6738925541941565,
"grad_norm": 16.296348571777344,
"learning_rate": 4.981894013631026e-06,
"loss": 0.1914,
"num_input_tokens_seen": 1654208,
"step": 715
},
{
"epoch": 0.6786050895381716,
"grad_norm": 23.66090202331543,
"learning_rate": 4.980892378560281e-06,
"loss": 0.1985,
"num_input_tokens_seen": 1664640,
"step": 720
},
{
"epoch": 0.6833176248821866,
"grad_norm": 27.988893508911133,
"learning_rate": 4.979863884946034e-06,
"loss": 0.2831,
"num_input_tokens_seen": 1676864,
"step": 725
},
{
"epoch": 0.6880301602262017,
"grad_norm": 20.06635284423828,
"learning_rate": 4.978808543922925e-06,
"loss": 0.2082,
"num_input_tokens_seen": 1691072,
"step": 730
},
{
"epoch": 0.6927426955702167,
"grad_norm": 19.601367950439453,
"learning_rate": 4.9777263669162465e-06,
"loss": 0.1227,
"num_input_tokens_seen": 1702400,
"step": 735
},
{
"epoch": 0.6974552309142319,
"grad_norm": 6.3521199226379395,
"learning_rate": 4.976617365641822e-06,
"loss": 0.1471,
"num_input_tokens_seen": 1714944,
"step": 740
},
{
"epoch": 0.702167766258247,
"grad_norm": 87.46131134033203,
"learning_rate": 4.97548155210588e-06,
"loss": 0.3082,
"num_input_tokens_seen": 1725376,
"step": 745
},
{
"epoch": 0.706880301602262,
"grad_norm": 80.72607421875,
"learning_rate": 4.974318938604921e-06,
"loss": 0.4432,
"num_input_tokens_seen": 1737152,
"step": 750
},
{
"epoch": 0.7115928369462771,
"grad_norm": 13.310392379760742,
"learning_rate": 4.9731295377255885e-06,
"loss": 0.1969,
"num_input_tokens_seen": 1749120,
"step": 755
},
{
"epoch": 0.7163053722902922,
"grad_norm": 23.76306915283203,
"learning_rate": 4.971913362344529e-06,
"loss": 0.272,
"num_input_tokens_seen": 1760384,
"step": 760
},
{
"epoch": 0.7210179076343073,
"grad_norm": 33.018524169921875,
"learning_rate": 4.970670425628255e-06,
"loss": 0.1454,
"num_input_tokens_seen": 1773632,
"step": 765
},
{
"epoch": 0.7257304429783223,
"grad_norm": 21.914316177368164,
"learning_rate": 4.969400741032999e-06,
"loss": 0.184,
"num_input_tokens_seen": 1787776,
"step": 770
},
{
"epoch": 0.7304429783223374,
"grad_norm": 38.7669792175293,
"learning_rate": 4.968104322304575e-06,
"loss": 0.2148,
"num_input_tokens_seen": 1798336,
"step": 775
},
{
"epoch": 0.7351555136663525,
"grad_norm": 41.227027893066406,
"learning_rate": 4.966781183478223e-06,
"loss": 0.2897,
"num_input_tokens_seen": 1809216,
"step": 780
},
{
"epoch": 0.7398680490103676,
"grad_norm": 26.2169189453125,
"learning_rate": 4.965431338878456e-06,
"loss": 0.2981,
"num_input_tokens_seen": 1822144,
"step": 785
},
{
"epoch": 0.7445805843543827,
"grad_norm": 5.09712553024292,
"learning_rate": 4.9640548031189125e-06,
"loss": 0.2476,
"num_input_tokens_seen": 1833088,
"step": 790
},
{
"epoch": 0.7492931196983977,
"grad_norm": 15.851964950561523,
"learning_rate": 4.962651591102191e-06,
"loss": 0.2554,
"num_input_tokens_seen": 1845056,
"step": 795
},
{
"epoch": 0.7521206409048068,
"eval_loss": 0.2178538739681244,
"eval_runtime": 2.7742,
"eval_samples_per_second": 339.916,
"eval_steps_per_second": 42.535,
"num_input_tokens_seen": 1851072,
"step": 798
},
{
"epoch": 0.7540056550424128,
"grad_norm": 14.348052978515625,
"learning_rate": 4.961221718019695e-06,
"loss": 0.2507,
"num_input_tokens_seen": 1855168,
"step": 800
},
{
"epoch": 0.7587181903864278,
"grad_norm": 20.442550659179688,
"learning_rate": 4.9597651993514585e-06,
"loss": 0.3006,
"num_input_tokens_seen": 1867328,
"step": 805
},
{
"epoch": 0.763430725730443,
"grad_norm": 18.405014038085938,
"learning_rate": 4.9582820508659924e-06,
"loss": 0.1949,
"num_input_tokens_seen": 1882560,
"step": 810
},
{
"epoch": 0.7681432610744581,
"grad_norm": 26.241788864135742,
"learning_rate": 4.956772288620101e-06,
"loss": 0.1866,
"num_input_tokens_seen": 1893376,
"step": 815
},
{
"epoch": 0.7728557964184731,
"grad_norm": 4.750776290893555,
"learning_rate": 4.955235928958716e-06,
"loss": 0.1114,
"num_input_tokens_seen": 1906048,
"step": 820
},
{
"epoch": 0.7775683317624882,
"grad_norm": 22.653051376342773,
"learning_rate": 4.953672988514716e-06,
"loss": 0.2425,
"num_input_tokens_seen": 1917568,
"step": 825
},
{
"epoch": 0.7822808671065034,
"grad_norm": 56.989315032958984,
"learning_rate": 4.95208348420875e-06,
"loss": 0.4121,
"num_input_tokens_seen": 1929216,
"step": 830
},
{
"epoch": 0.7869934024505184,
"grad_norm": 21.19652557373047,
"learning_rate": 4.950467433249046e-06,
"loss": 0.1859,
"num_input_tokens_seen": 1940416,
"step": 835
},
{
"epoch": 0.7917059377945335,
"grad_norm": 17.347103118896484,
"learning_rate": 4.948824853131237e-06,
"loss": 0.2065,
"num_input_tokens_seen": 1949632,
"step": 840
},
{
"epoch": 0.7964184731385485,
"grad_norm": 32.96878433227539,
"learning_rate": 4.94715576163816e-06,
"loss": 0.2102,
"num_input_tokens_seen": 1961920,
"step": 845
},
{
"epoch": 0.8011310084825636,
"grad_norm": 4.76591157913208,
"learning_rate": 4.945460176839671e-06,
"loss": 0.2975,
"num_input_tokens_seen": 1973696,
"step": 850
},
{
"epoch": 0.8058435438265787,
"grad_norm": 17.566848754882812,
"learning_rate": 4.943738117092447e-06,
"loss": 0.294,
"num_input_tokens_seen": 1985280,
"step": 855
},
{
"epoch": 0.8105560791705938,
"grad_norm": 34.71393966674805,
"learning_rate": 4.941989601039785e-06,
"loss": 0.2107,
"num_input_tokens_seen": 1997504,
"step": 860
},
{
"epoch": 0.8152686145146089,
"grad_norm": 15.716105461120605,
"learning_rate": 4.940214647611405e-06,
"loss": 0.2815,
"num_input_tokens_seen": 2009600,
"step": 865
},
{
"epoch": 0.8199811498586239,
"grad_norm": 5.163911819458008,
"learning_rate": 4.9384132760232395e-06,
"loss": 0.1509,
"num_input_tokens_seen": 2020672,
"step": 870
},
{
"epoch": 0.824693685202639,
"grad_norm": 32.56769943237305,
"learning_rate": 4.93658550577723e-06,
"loss": 0.258,
"num_input_tokens_seen": 2033408,
"step": 875
},
{
"epoch": 0.8294062205466541,
"grad_norm": 21.050493240356445,
"learning_rate": 4.9347313566611145e-06,
"loss": 0.2403,
"num_input_tokens_seen": 2043328,
"step": 880
},
{
"epoch": 0.8341187558906692,
"grad_norm": 13.551421165466309,
"learning_rate": 4.9328508487482115e-06,
"loss": 0.2631,
"num_input_tokens_seen": 2054656,
"step": 885
},
{
"epoch": 0.8388312912346843,
"grad_norm": 19.12700080871582,
"learning_rate": 4.930944002397204e-06,
"loss": 0.2302,
"num_input_tokens_seen": 2064128,
"step": 890
},
{
"epoch": 0.8435438265786993,
"grad_norm": 29.187570571899414,
"learning_rate": 4.929010838251923e-06,
"loss": 0.2009,
"num_input_tokens_seen": 2076864,
"step": 895
},
{
"epoch": 0.8482563619227145,
"grad_norm": 20.132150650024414,
"learning_rate": 4.927051377241115e-06,
"loss": 0.1868,
"num_input_tokens_seen": 2087104,
"step": 900
},
{
"epoch": 0.8529688972667295,
"grad_norm": 19.931499481201172,
"learning_rate": 4.9250656405782215e-06,
"loss": 0.3066,
"num_input_tokens_seen": 2097728,
"step": 905
},
{
"epoch": 0.8576814326107446,
"grad_norm": 21.25429916381836,
"learning_rate": 4.9230536497611525e-06,
"loss": 0.1685,
"num_input_tokens_seen": 2107904,
"step": 910
},
{
"epoch": 0.8623939679547596,
"grad_norm": 35.41661834716797,
"learning_rate": 4.921015426572047e-06,
"loss": 0.3358,
"num_input_tokens_seen": 2120192,
"step": 915
},
{
"epoch": 0.8671065032987747,
"grad_norm": 20.501426696777344,
"learning_rate": 4.918950993077039e-06,
"loss": 0.2411,
"num_input_tokens_seen": 2131904,
"step": 920
},
{
"epoch": 0.8718190386427899,
"grad_norm": 30.00881576538086,
"learning_rate": 4.91686037162602e-06,
"loss": 0.3069,
"num_input_tokens_seen": 2144640,
"step": 925
},
{
"epoch": 0.8765315739868049,
"grad_norm": 30.22358512878418,
"learning_rate": 4.9147435848523975e-06,
"loss": 0.1587,
"num_input_tokens_seen": 2154112,
"step": 930
},
{
"epoch": 0.88124410933082,
"grad_norm": 10.572684288024902,
"learning_rate": 4.91260065567285e-06,
"loss": 0.1468,
"num_input_tokens_seen": 2167232,
"step": 935
},
{
"epoch": 0.885956644674835,
"grad_norm": 61.71476745605469,
"learning_rate": 4.910431607287075e-06,
"loss": 0.2699,
"num_input_tokens_seen": 2179264,
"step": 940
},
{
"epoch": 0.8906691800188501,
"grad_norm": 10.816360473632812,
"learning_rate": 4.908236463177544e-06,
"loss": 0.3797,
"num_input_tokens_seen": 2191488,
"step": 945
},
{
"epoch": 0.8953817153628653,
"grad_norm": 39.76873016357422,
"learning_rate": 4.906015247109242e-06,
"loss": 0.1988,
"num_input_tokens_seen": 2201856,
"step": 950
},
{
"epoch": 0.9000942507068803,
"grad_norm": 23.409250259399414,
"learning_rate": 4.903767983129414e-06,
"loss": 0.3161,
"num_input_tokens_seen": 2214464,
"step": 955
},
{
"epoch": 0.9048067860508954,
"grad_norm": 23.47569465637207,
"learning_rate": 4.901494695567306e-06,
"loss": 0.2565,
"num_input_tokens_seen": 2229184,
"step": 960
},
{
"epoch": 0.9095193213949104,
"grad_norm": 12.153125762939453,
"learning_rate": 4.899195409033897e-06,
"loss": 0.2214,
"num_input_tokens_seen": 2239104,
"step": 965
},
{
"epoch": 0.9142318567389256,
"grad_norm": 13.904633522033691,
"learning_rate": 4.896870148421637e-06,
"loss": 0.1992,
"num_input_tokens_seen": 2249152,
"step": 970
},
{
"epoch": 0.9189443920829407,
"grad_norm": 9.68702507019043,
"learning_rate": 4.894518938904175e-06,
"loss": 0.1527,
"num_input_tokens_seen": 2261312,
"step": 975
},
{
"epoch": 0.9236569274269557,
"grad_norm": 35.594173431396484,
"learning_rate": 4.892141805936085e-06,
"loss": 0.1398,
"num_input_tokens_seen": 2275008,
"step": 980
},
{
"epoch": 0.9283694627709708,
"grad_norm": 30.414966583251953,
"learning_rate": 4.889738775252596e-06,
"loss": 0.276,
"num_input_tokens_seen": 2287680,
"step": 985
},
{
"epoch": 0.9330819981149858,
"grad_norm": 36.264251708984375,
"learning_rate": 4.887309872869308e-06,
"loss": 0.2869,
"num_input_tokens_seen": 2299840,
"step": 990
},
{
"epoch": 0.937794533459001,
"grad_norm": 34.444374084472656,
"learning_rate": 4.884855125081912e-06,
"loss": 0.2347,
"num_input_tokens_seen": 2311104,
"step": 995
},
{
"epoch": 0.942507068803016,
"grad_norm": 39.005767822265625,
"learning_rate": 4.882374558465906e-06,
"loss": 0.326,
"num_input_tokens_seen": 2322432,
"step": 1000
},
{
"epoch": 0.9472196041470311,
"grad_norm": 23.73866081237793,
"learning_rate": 4.8798681998763056e-06,
"loss": 0.2946,
"num_input_tokens_seen": 2333120,
"step": 1005
},
{
"epoch": 0.9519321394910462,
"grad_norm": 17.239654541015625,
"learning_rate": 4.877336076447358e-06,
"loss": 0.2846,
"num_input_tokens_seen": 2345472,
"step": 1010
},
{
"epoch": 0.9566446748350612,
"grad_norm": 16.902143478393555,
"learning_rate": 4.87477821559224e-06,
"loss": 0.1988,
"num_input_tokens_seen": 2357568,
"step": 1015
},
{
"epoch": 0.9613572101790764,
"grad_norm": 20.823362350463867,
"learning_rate": 4.87219464500277e-06,
"loss": 0.2295,
"num_input_tokens_seen": 2368064,
"step": 1020
},
{
"epoch": 0.9660697455230914,
"grad_norm": 22.501428604125977,
"learning_rate": 4.869585392649102e-06,
"loss": 0.2166,
"num_input_tokens_seen": 2381184,
"step": 1025
},
{
"epoch": 0.9707822808671065,
"grad_norm": 12.077306747436523,
"learning_rate": 4.866950486779425e-06,
"loss": 0.1964,
"num_input_tokens_seen": 2393408,
"step": 1030
},
{
"epoch": 0.9754948162111216,
"grad_norm": 24.82265281677246,
"learning_rate": 4.864289955919658e-06,
"loss": 0.2603,
"num_input_tokens_seen": 2406720,
"step": 1035
},
{
"epoch": 0.9802073515551367,
"grad_norm": 24.67642593383789,
"learning_rate": 4.8616038288731394e-06,
"loss": 0.3101,
"num_input_tokens_seen": 2420288,
"step": 1040
},
{
"epoch": 0.9849198868991518,
"grad_norm": 7.1168532371521,
"learning_rate": 4.8588921347203175e-06,
"loss": 0.1463,
"num_input_tokens_seen": 2431488,
"step": 1045
},
{
"epoch": 0.9896324222431668,
"grad_norm": 12.157154083251953,
"learning_rate": 4.8561549028184315e-06,
"loss": 0.2497,
"num_input_tokens_seen": 2444032,
"step": 1050
},
{
"epoch": 0.9943449575871819,
"grad_norm": 18.19011688232422,
"learning_rate": 4.8533921628012e-06,
"loss": 0.1574,
"num_input_tokens_seen": 2454912,
"step": 1055
},
{
"epoch": 0.9990574929311969,
"grad_norm": 22.441247940063477,
"learning_rate": 4.850603944578494e-06,
"loss": 0.3676,
"num_input_tokens_seen": 2467584,
"step": 1060
},
{
"epoch": 1.002827521206409,
"eval_loss": 0.18848362565040588,
"eval_runtime": 2.7314,
"eval_samples_per_second": 345.248,
"eval_steps_per_second": 43.202,
"num_input_tokens_seen": 2475808,
"step": 1064
},
{
"epoch": 1.003770028275212,
"grad_norm": 12.251239776611328,
"learning_rate": 4.847790278336017e-06,
"loss": 0.1493,
"num_input_tokens_seen": 2478048,
"step": 1065
},
{
"epoch": 1.0084825636192272,
"grad_norm": 23.926055908203125,
"learning_rate": 4.844951194534975e-06,
"loss": 0.1749,
"num_input_tokens_seen": 2492576,
"step": 1070
},
{
"epoch": 1.0131950989632421,
"grad_norm": 2.156106472015381,
"learning_rate": 4.842086723911751e-06,
"loss": 0.1307,
"num_input_tokens_seen": 2505440,
"step": 1075
},
{
"epoch": 1.0179076343072573,
"grad_norm": 29.823352813720703,
"learning_rate": 4.839196897477569e-06,
"loss": 0.1119,
"num_input_tokens_seen": 2515488,
"step": 1080
},
{
"epoch": 1.0226201696512724,
"grad_norm": 7.730029106140137,
"learning_rate": 4.836281746518159e-06,
"loss": 0.1664,
"num_input_tokens_seen": 2529504,
"step": 1085
},
{
"epoch": 1.0273327049952874,
"grad_norm": 35.77005386352539,
"learning_rate": 4.833341302593417e-06,
"loss": 0.1393,
"num_input_tokens_seen": 2539872,
"step": 1090
},
{
"epoch": 1.0320452403393026,
"grad_norm": 0.4067946672439575,
"learning_rate": 4.830375597537068e-06,
"loss": 0.0376,
"num_input_tokens_seen": 2549856,
"step": 1095
},
{
"epoch": 1.0367577756833177,
"grad_norm": 0.01726607233285904,
"learning_rate": 4.827384663456315e-06,
"loss": 0.1836,
"num_input_tokens_seen": 2559328,
"step": 1100
},
{
"epoch": 1.0414703110273327,
"grad_norm": 201.8726043701172,
"learning_rate": 4.824368532731496e-06,
"loss": 0.369,
"num_input_tokens_seen": 2569440,
"step": 1105
},
{
"epoch": 1.0461828463713478,
"grad_norm": 123.39364624023438,
"learning_rate": 4.821327238015732e-06,
"loss": 0.084,
"num_input_tokens_seen": 2580448,
"step": 1110
},
{
"epoch": 1.0508953817153628,
"grad_norm": 93.78629302978516,
"learning_rate": 4.818260812234572e-06,
"loss": 0.4176,
"num_input_tokens_seen": 2590752,
"step": 1115
},
{
"epoch": 1.055607917059378,
"grad_norm": 54.99407196044922,
"learning_rate": 4.815169288585641e-06,
"loss": 0.0664,
"num_input_tokens_seen": 2600160,
"step": 1120
},
{
"epoch": 1.0603204524033931,
"grad_norm": 33.015010833740234,
"learning_rate": 4.812052700538274e-06,
"loss": 0.1558,
"num_input_tokens_seen": 2611232,
"step": 1125
},
{
"epoch": 1.065032987747408,
"grad_norm": 10.432161331176758,
"learning_rate": 4.808911081833161e-06,
"loss": 0.1476,
"num_input_tokens_seen": 2623712,
"step": 1130
},
{
"epoch": 1.0697455230914232,
"grad_norm": 21.43227767944336,
"learning_rate": 4.805744466481974e-06,
"loss": 0.0875,
"num_input_tokens_seen": 2635936,
"step": 1135
},
{
"epoch": 1.0744580584354382,
"grad_norm": 0.39066728949546814,
"learning_rate": 4.802552888767005e-06,
"loss": 0.1297,
"num_input_tokens_seen": 2645920,
"step": 1140
},
{
"epoch": 1.0791705937794533,
"grad_norm": 79.24580383300781,
"learning_rate": 4.799336383240793e-06,
"loss": 0.2563,
"num_input_tokens_seen": 2660768,
"step": 1145
},
{
"epoch": 1.0838831291234685,
"grad_norm": 90.48008728027344,
"learning_rate": 4.796094984725749e-06,
"loss": 0.1484,
"num_input_tokens_seen": 2671200,
"step": 1150
},
{
"epoch": 1.0885956644674835,
"grad_norm": 0.7040526270866394,
"learning_rate": 4.792828728313778e-06,
"loss": 0.1145,
"num_input_tokens_seen": 2683040,
"step": 1155
},
{
"epoch": 1.0933081998114986,
"grad_norm": 80.87930297851562,
"learning_rate": 4.789537649365904e-06,
"loss": 0.0767,
"num_input_tokens_seen": 2694432,
"step": 1160
},
{
"epoch": 1.0980207351555136,
"grad_norm": 0.26429542899131775,
"learning_rate": 4.78622178351188e-06,
"loss": 0.0079,
"num_input_tokens_seen": 2707168,
"step": 1165
},
{
"epoch": 1.1027332704995287,
"grad_norm": 0.19729509949684143,
"learning_rate": 4.782881166649808e-06,
"loss": 0.1644,
"num_input_tokens_seen": 2717984,
"step": 1170
},
{
"epoch": 1.107445805843544,
"grad_norm": 220.86328125,
"learning_rate": 4.77951583494575e-06,
"loss": 0.2543,
"num_input_tokens_seen": 2730784,
"step": 1175
},
{
"epoch": 1.1121583411875589,
"grad_norm": 21.004072189331055,
"learning_rate": 4.77612582483333e-06,
"loss": 0.4821,
"num_input_tokens_seen": 2740704,
"step": 1180
},
{
"epoch": 1.116870876531574,
"grad_norm": 45.33163833618164,
"learning_rate": 4.772711173013352e-06,
"loss": 0.2498,
"num_input_tokens_seen": 2751968,
"step": 1185
},
{
"epoch": 1.121583411875589,
"grad_norm": 4.449422359466553,
"learning_rate": 4.769271916453387e-06,
"loss": 0.1649,
"num_input_tokens_seen": 2763808,
"step": 1190
},
{
"epoch": 1.1262959472196041,
"grad_norm": 154.34603881835938,
"learning_rate": 4.765808092387385e-06,
"loss": 0.0735,
"num_input_tokens_seen": 2774624,
"step": 1195
},
{
"epoch": 1.1310084825636193,
"grad_norm": 100.58317565917969,
"learning_rate": 4.762319738315269e-06,
"loss": 0.2639,
"num_input_tokens_seen": 2785888,
"step": 1200
},
{
"epoch": 1.1357210179076342,
"grad_norm": 45.887027740478516,
"learning_rate": 4.758806892002526e-06,
"loss": 0.3194,
"num_input_tokens_seen": 2797216,
"step": 1205
},
{
"epoch": 1.1404335532516494,
"grad_norm": 36.13898849487305,
"learning_rate": 4.7552695914798e-06,
"loss": 0.1395,
"num_input_tokens_seen": 2808032,
"step": 1210
},
{
"epoch": 1.1451460885956646,
"grad_norm": 96.65644836425781,
"learning_rate": 4.751707875042481e-06,
"loss": 0.2734,
"num_input_tokens_seen": 2823008,
"step": 1215
},
{
"epoch": 1.1498586239396795,
"grad_norm": 2.167825698852539,
"learning_rate": 4.748121781250288e-06,
"loss": 0.0883,
"num_input_tokens_seen": 2835936,
"step": 1220
},
{
"epoch": 1.1545711592836947,
"grad_norm": 14.599705696105957,
"learning_rate": 4.744511348926855e-06,
"loss": 0.169,
"num_input_tokens_seen": 2847584,
"step": 1225
},
{
"epoch": 1.1592836946277096,
"grad_norm": 68.31897735595703,
"learning_rate": 4.740876617159308e-06,
"loss": 0.1451,
"num_input_tokens_seen": 2857952,
"step": 1230
},
{
"epoch": 1.1639962299717248,
"grad_norm": 77.1812515258789,
"learning_rate": 4.737217625297844e-06,
"loss": 0.2114,
"num_input_tokens_seen": 2868192,
"step": 1235
},
{
"epoch": 1.1687087653157398,
"grad_norm": 6.400179862976074,
"learning_rate": 4.733534412955301e-06,
"loss": 0.1145,
"num_input_tokens_seen": 2879904,
"step": 1240
},
{
"epoch": 1.173421300659755,
"grad_norm": 1.274997353553772,
"learning_rate": 4.729827020006735e-06,
"loss": 0.1768,
"num_input_tokens_seen": 2892384,
"step": 1245
},
{
"epoch": 1.17813383600377,
"grad_norm": 32.56444549560547,
"learning_rate": 4.726095486588983e-06,
"loss": 0.1507,
"num_input_tokens_seen": 2905184,
"step": 1250
},
{
"epoch": 1.182846371347785,
"grad_norm": 7.450242042541504,
"learning_rate": 4.722339853100232e-06,
"loss": 0.0958,
"num_input_tokens_seen": 2916640,
"step": 1255
},
{
"epoch": 1.1875589066918002,
"grad_norm": 4.951867580413818,
"learning_rate": 4.718560160199579e-06,
"loss": 0.1192,
"num_input_tokens_seen": 2927072,
"step": 1260
},
{
"epoch": 1.1922714420358154,
"grad_norm": 50.1746940612793,
"learning_rate": 4.714756448806592e-06,
"loss": 0.2693,
"num_input_tokens_seen": 2937888,
"step": 1265
},
{
"epoch": 1.1969839773798303,
"grad_norm": 0.2431841641664505,
"learning_rate": 4.71092876010087e-06,
"loss": 0.1689,
"num_input_tokens_seen": 2950752,
"step": 1270
},
{
"epoch": 1.2016965127238455,
"grad_norm": 40.15456771850586,
"learning_rate": 4.70707713552159e-06,
"loss": 0.0997,
"num_input_tokens_seen": 2961056,
"step": 1275
},
{
"epoch": 1.2064090480678604,
"grad_norm": 154.8431396484375,
"learning_rate": 4.703201616767067e-06,
"loss": 0.1164,
"num_input_tokens_seen": 2971552,
"step": 1280
},
{
"epoch": 1.2111215834118756,
"grad_norm": 67.9471206665039,
"learning_rate": 4.699302245794293e-06,
"loss": 0.0178,
"num_input_tokens_seen": 2985120,
"step": 1285
},
{
"epoch": 1.2158341187558908,
"grad_norm": 104.93325805664062,
"learning_rate": 4.6953790648184924e-06,
"loss": 0.1821,
"num_input_tokens_seen": 2996128,
"step": 1290
},
{
"epoch": 1.2205466540999057,
"grad_norm": 0.03052549809217453,
"learning_rate": 4.691432116312661e-06,
"loss": 0.0199,
"num_input_tokens_seen": 3007072,
"step": 1295
},
{
"epoch": 1.2252591894439209,
"grad_norm": 0.9742458462715149,
"learning_rate": 4.687461443007101e-06,
"loss": 0.006,
"num_input_tokens_seen": 3018656,
"step": 1300
},
{
"epoch": 1.2299717247879358,
"grad_norm": 0.007309742737561464,
"learning_rate": 4.683467087888967e-06,
"loss": 0.1915,
"num_input_tokens_seen": 3030624,
"step": 1305
},
{
"epoch": 1.234684260131951,
"grad_norm": 0.3931090831756592,
"learning_rate": 4.6794490942017955e-06,
"loss": 0.2276,
"num_input_tokens_seen": 3043040,
"step": 1310
},
{
"epoch": 1.2393967954759662,
"grad_norm": 8.714564323425293,
"learning_rate": 4.6754075054450385e-06,
"loss": 0.0236,
"num_input_tokens_seen": 3057632,
"step": 1315
},
{
"epoch": 1.244109330819981,
"grad_norm": 0.008542931638658047,
"learning_rate": 4.671342365373592e-06,
"loss": 0.1376,
"num_input_tokens_seen": 3069792,
"step": 1320
},
{
"epoch": 1.2488218661639963,
"grad_norm": 52.45071792602539,
"learning_rate": 4.667253717997324e-06,
"loss": 0.2062,
"num_input_tokens_seen": 3080608,
"step": 1325
},
{
"epoch": 1.2535344015080114,
"grad_norm": 10.894562721252441,
"learning_rate": 4.663141607580589e-06,
"loss": 0.165,
"num_input_tokens_seen": 3091552,
"step": 1330
},
{
"epoch": 1.2535344015080114,
"eval_loss": 0.4607957601547241,
"eval_runtime": 2.7224,
"eval_samples_per_second": 346.386,
"eval_steps_per_second": 43.344,
"num_input_tokens_seen": 3091552,
"step": 1330
},
{
"epoch": 1.2582469368520264,
"grad_norm": 121.4914321899414,
"learning_rate": 4.659006078641766e-06,
"loss": 0.222,
"num_input_tokens_seen": 3103712,
"step": 1335
},
{
"epoch": 1.2629594721960415,
"grad_norm": 2.8751637935638428,
"learning_rate": 4.6548471759527634e-06,
"loss": 0.2312,
"num_input_tokens_seen": 3115104,
"step": 1340
},
{
"epoch": 1.2676720075400565,
"grad_norm": 3.6843035221099854,
"learning_rate": 4.6506649445385335e-06,
"loss": 0.011,
"num_input_tokens_seen": 3127648,
"step": 1345
},
{
"epoch": 1.2723845428840717,
"grad_norm": 26.937593460083008,
"learning_rate": 4.646459429676594e-06,
"loss": 0.2732,
"num_input_tokens_seen": 3138208,
"step": 1350
},
{
"epoch": 1.2770970782280866,
"grad_norm": 41.53554916381836,
"learning_rate": 4.642230676896531e-06,
"loss": 0.148,
"num_input_tokens_seen": 3148256,
"step": 1355
},
{
"epoch": 1.2818096135721018,
"grad_norm": 74.98961639404297,
"learning_rate": 4.6379787319795076e-06,
"loss": 0.0901,
"num_input_tokens_seen": 3157856,
"step": 1360
},
{
"epoch": 1.286522148916117,
"grad_norm": 1.2443631887435913,
"learning_rate": 4.6337036409577705e-06,
"loss": 0.24,
"num_input_tokens_seen": 3167136,
"step": 1365
},
{
"epoch": 1.2912346842601319,
"grad_norm": 0.20186370611190796,
"learning_rate": 4.62940545011415e-06,
"loss": 0.0842,
"num_input_tokens_seen": 3181984,
"step": 1370
},
{
"epoch": 1.295947219604147,
"grad_norm": 22.39756965637207,
"learning_rate": 4.625084205981554e-06,
"loss": 0.1368,
"num_input_tokens_seen": 3195744,
"step": 1375
},
{
"epoch": 1.3006597549481622,
"grad_norm": 9.254731178283691,
"learning_rate": 4.620739955342476e-06,
"loss": 0.2497,
"num_input_tokens_seen": 3207776,
"step": 1380
},
{
"epoch": 1.3053722902921772,
"grad_norm": 0.06419213116168976,
"learning_rate": 4.616372745228477e-06,
"loss": 0.0782,
"num_input_tokens_seen": 3219296,
"step": 1385
},
{
"epoch": 1.3100848256361923,
"grad_norm": 56.7759895324707,
"learning_rate": 4.611982622919684e-06,
"loss": 0.3956,
"num_input_tokens_seen": 3230048,
"step": 1390
},
{
"epoch": 1.3147973609802073,
"grad_norm": 68.79596710205078,
"learning_rate": 4.607569635944271e-06,
"loss": 0.1166,
"num_input_tokens_seen": 3239200,
"step": 1395
},
{
"epoch": 1.3195098963242224,
"grad_norm": 27.92612648010254,
"learning_rate": 4.603133832077953e-06,
"loss": 0.2557,
"num_input_tokens_seen": 3255008,
"step": 1400
},
{
"epoch": 1.3242224316682374,
"grad_norm": 13.399755477905273,
"learning_rate": 4.598675259343462e-06,
"loss": 0.2547,
"num_input_tokens_seen": 3267040,
"step": 1405
},
{
"epoch": 1.3289349670122526,
"grad_norm": 25.696258544921875,
"learning_rate": 4.594193966010031e-06,
"loss": 0.2374,
"num_input_tokens_seen": 3276960,
"step": 1410
},
{
"epoch": 1.3336475023562677,
"grad_norm": 29.0289363861084,
"learning_rate": 4.589690000592868e-06,
"loss": 0.0795,
"num_input_tokens_seen": 3287840,
"step": 1415
},
{
"epoch": 1.3383600377002827,
"grad_norm": 30.088584899902344,
"learning_rate": 4.585163411852632e-06,
"loss": 0.2095,
"num_input_tokens_seen": 3300256,
"step": 1420
},
{
"epoch": 1.3430725730442978,
"grad_norm": 3.960421562194824,
"learning_rate": 4.58061424879491e-06,
"loss": 0.3144,
"num_input_tokens_seen": 3311712,
"step": 1425
},
{
"epoch": 1.347785108388313,
"grad_norm": 75.69437408447266,
"learning_rate": 4.576042560669678e-06,
"loss": 0.1113,
"num_input_tokens_seen": 3322144,
"step": 1430
},
{
"epoch": 1.352497643732328,
"grad_norm": 53.89783477783203,
"learning_rate": 4.571448396970773e-06,
"loss": 0.4022,
"num_input_tokens_seen": 3333856,
"step": 1435
},
{
"epoch": 1.3572101790763431,
"grad_norm": 17.59637451171875,
"learning_rate": 4.566831807435359e-06,
"loss": 0.1542,
"num_input_tokens_seen": 3345696,
"step": 1440
},
{
"epoch": 1.3619227144203583,
"grad_norm": 15.906473159790039,
"learning_rate": 4.562192842043381e-06,
"loss": 0.2594,
"num_input_tokens_seen": 3357024,
"step": 1445
},
{
"epoch": 1.3666352497643732,
"grad_norm": 53.453163146972656,
"learning_rate": 4.557531551017034e-06,
"loss": 0.1721,
"num_input_tokens_seen": 3368480,
"step": 1450
},
{
"epoch": 1.3713477851083884,
"grad_norm": 10.427976608276367,
"learning_rate": 4.552847984820208e-06,
"loss": 0.1418,
"num_input_tokens_seen": 3378720,
"step": 1455
},
{
"epoch": 1.3760603204524033,
"grad_norm": 17.01227569580078,
"learning_rate": 4.548142194157951e-06,
"loss": 0.1344,
"num_input_tokens_seen": 3390688,
"step": 1460
},
{
"epoch": 1.3807728557964185,
"grad_norm": 0.41409215331077576,
"learning_rate": 4.54341422997592e-06,
"loss": 0.2518,
"num_input_tokens_seen": 3403488,
"step": 1465
},
{
"epoch": 1.3854853911404335,
"grad_norm": 3.571580410003662,
"learning_rate": 4.538664143459819e-06,
"loss": 0.1194,
"num_input_tokens_seen": 3415648,
"step": 1470
},
{
"epoch": 1.3901979264844486,
"grad_norm": 39.68430709838867,
"learning_rate": 4.5338919860348565e-06,
"loss": 0.1113,
"num_input_tokens_seen": 3427168,
"step": 1475
},
{
"epoch": 1.3949104618284638,
"grad_norm": 0.09742722660303116,
"learning_rate": 4.529097809365184e-06,
"loss": 0.1426,
"num_input_tokens_seen": 3437664,
"step": 1480
},
{
"epoch": 1.3996229971724787,
"grad_norm": 80.09423828125,
"learning_rate": 4.524281665353334e-06,
"loss": 0.3136,
"num_input_tokens_seen": 3450144,
"step": 1485
},
{
"epoch": 1.404335532516494,
"grad_norm": 38.64655303955078,
"learning_rate": 4.519443606139665e-06,
"loss": 0.1617,
"num_input_tokens_seen": 3461280,
"step": 1490
},
{
"epoch": 1.409048067860509,
"grad_norm": 60.909393310546875,
"learning_rate": 4.514583684101792e-06,
"loss": 0.2666,
"num_input_tokens_seen": 3472608,
"step": 1495
},
{
"epoch": 1.413760603204524,
"grad_norm": 89.08367919921875,
"learning_rate": 4.509701951854018e-06,
"loss": 0.105,
"num_input_tokens_seen": 3485024,
"step": 1500
},
{
"epoch": 1.4184731385485392,
"grad_norm": 73.14676666259766,
"learning_rate": 4.504798462246768e-06,
"loss": 0.2341,
"num_input_tokens_seen": 3496096,
"step": 1505
},
{
"epoch": 1.4231856738925541,
"grad_norm": 33.10121154785156,
"learning_rate": 4.499873268366017e-06,
"loss": 0.2829,
"num_input_tokens_seen": 3506848,
"step": 1510
},
{
"epoch": 1.4278982092365693,
"grad_norm": 45.99144744873047,
"learning_rate": 4.494926423532715e-06,
"loss": 0.1819,
"num_input_tokens_seen": 3521568,
"step": 1515
},
{
"epoch": 1.4326107445805842,
"grad_norm": 3.1161906719207764,
"learning_rate": 4.4899579813022046e-06,
"loss": 0.1103,
"num_input_tokens_seen": 3533856,
"step": 1520
},
{
"epoch": 1.4373232799245994,
"grad_norm": 1.9241315126419067,
"learning_rate": 4.484967995463648e-06,
"loss": 0.216,
"num_input_tokens_seen": 3544544,
"step": 1525
},
{
"epoch": 1.4420358152686146,
"grad_norm": 26.153079986572266,
"learning_rate": 4.479956520039443e-06,
"loss": 0.303,
"num_input_tokens_seen": 3554336,
"step": 1530
},
{
"epoch": 1.4467483506126295,
"grad_norm": 8.090953826904297,
"learning_rate": 4.474923609284635e-06,
"loss": 0.0434,
"num_input_tokens_seen": 3564384,
"step": 1535
},
{
"epoch": 1.4514608859566447,
"grad_norm": 0.26238393783569336,
"learning_rate": 4.469869317686332e-06,
"loss": 0.1438,
"num_input_tokens_seen": 3576992,
"step": 1540
},
{
"epoch": 1.4561734213006599,
"grad_norm": 92.67262268066406,
"learning_rate": 4.464793699963116e-06,
"loss": 0.1766,
"num_input_tokens_seen": 3587872,
"step": 1545
},
{
"epoch": 1.4608859566446748,
"grad_norm": 11.002724647521973,
"learning_rate": 4.4596968110644484e-06,
"loss": 0.0997,
"num_input_tokens_seen": 3598560,
"step": 1550
},
{
"epoch": 1.46559849198869,
"grad_norm": 77.25719451904297,
"learning_rate": 4.454578706170075e-06,
"loss": 0.1595,
"num_input_tokens_seen": 3608864,
"step": 1555
},
{
"epoch": 1.4703110273327051,
"grad_norm": 1.6689245700836182,
"learning_rate": 4.44943944068943e-06,
"loss": 0.0274,
"num_input_tokens_seen": 3620960,
"step": 1560
},
{
"epoch": 1.47502356267672,
"grad_norm": 103.46016693115234,
"learning_rate": 4.444279070261035e-06,
"loss": 0.4584,
"num_input_tokens_seen": 3632096,
"step": 1565
},
{
"epoch": 1.479736098020735,
"grad_norm": 57.57553482055664,
"learning_rate": 4.4390976507518994e-06,
"loss": 0.2423,
"num_input_tokens_seen": 3643424,
"step": 1570
},
{
"epoch": 1.4844486333647502,
"grad_norm": 0.6700392961502075,
"learning_rate": 4.433895238256909e-06,
"loss": 0.046,
"num_input_tokens_seen": 3654624,
"step": 1575
},
{
"epoch": 1.4891611687087654,
"grad_norm": 58.0783576965332,
"learning_rate": 4.4286718890982275e-06,
"loss": 0.0609,
"num_input_tokens_seen": 3665504,
"step": 1580
},
{
"epoch": 1.4938737040527803,
"grad_norm": 142.61090087890625,
"learning_rate": 4.423427659824681e-06,
"loss": 0.2488,
"num_input_tokens_seen": 3676448,
"step": 1585
},
{
"epoch": 1.4985862393967955,
"grad_norm": 40.1721305847168,
"learning_rate": 4.418162607211146e-06,
"loss": 0.4721,
"num_input_tokens_seen": 3686432,
"step": 1590
},
{
"epoch": 1.5032987747408106,
"grad_norm": 25.409154891967773,
"learning_rate": 4.412876788257936e-06,
"loss": 0.2207,
"num_input_tokens_seen": 3697312,
"step": 1595
},
{
"epoch": 1.5042412818096136,
"eval_loss": 0.35448023676872253,
"eval_runtime": 2.7456,
"eval_samples_per_second": 343.46,
"eval_steps_per_second": 42.978,
"num_input_tokens_seen": 3699104,
"step": 1596
},
{
"epoch": 1.5080113100848256,
"grad_norm": 44.117496490478516,
"learning_rate": 4.407570260190186e-06,
"loss": 0.2648,
"num_input_tokens_seen": 3707808,
"step": 1600
},
{
"epoch": 1.5127238454288408,
"grad_norm": 26.070695877075195,
"learning_rate": 4.402243080457229e-06,
"loss": 0.3225,
"num_input_tokens_seen": 3719840,
"step": 1605
},
{
"epoch": 1.517436380772856,
"grad_norm": 1.1607394218444824,
"learning_rate": 4.396895306731978e-06,
"loss": 0.2234,
"num_input_tokens_seen": 3731168,
"step": 1610
},
{
"epoch": 1.5221489161168709,
"grad_norm": 103.62728881835938,
"learning_rate": 4.391526996910298e-06,
"loss": 0.2199,
"num_input_tokens_seen": 3744160,
"step": 1615
},
{
"epoch": 1.5268614514608858,
"grad_norm": 31.115297317504883,
"learning_rate": 4.386138209110385e-06,
"loss": 0.1515,
"num_input_tokens_seen": 3754912,
"step": 1620
},
{
"epoch": 1.5315739868049012,
"grad_norm": 1.294524073600769,
"learning_rate": 4.3807290016721265e-06,
"loss": 0.1179,
"num_input_tokens_seen": 3767776,
"step": 1625
},
{
"epoch": 1.5362865221489161,
"grad_norm": 92.95679473876953,
"learning_rate": 4.375299433156483e-06,
"loss": 0.1079,
"num_input_tokens_seen": 3779104,
"step": 1630
},
{
"epoch": 1.540999057492931,
"grad_norm": 72.8927001953125,
"learning_rate": 4.3698495623448424e-06,
"loss": 0.359,
"num_input_tokens_seen": 3789408,
"step": 1635
},
{
"epoch": 1.5457115928369463,
"grad_norm": 31.62137794494629,
"learning_rate": 4.364379448238392e-06,
"loss": 0.1058,
"num_input_tokens_seen": 3799584,
"step": 1640
},
{
"epoch": 1.5504241281809614,
"grad_norm": 80.54794311523438,
"learning_rate": 4.358889150057476e-06,
"loss": 0.3319,
"num_input_tokens_seen": 3813344,
"step": 1645
},
{
"epoch": 1.5551366635249764,
"grad_norm": 91.38248443603516,
"learning_rate": 4.35337872724095e-06,
"loss": 0.1354,
"num_input_tokens_seen": 3823328,
"step": 1650
},
{
"epoch": 1.5598491988689915,
"grad_norm": 86.33023071289062,
"learning_rate": 4.347848239445548e-06,
"loss": 0.1612,
"num_input_tokens_seen": 3835232,
"step": 1655
},
{
"epoch": 1.5645617342130067,
"grad_norm": 24.640047073364258,
"learning_rate": 4.342297746545228e-06,
"loss": 0.2858,
"num_input_tokens_seen": 3846368,
"step": 1660
},
{
"epoch": 1.5692742695570217,
"grad_norm": 0.5544624924659729,
"learning_rate": 4.336727308630527e-06,
"loss": 0.0313,
"num_input_tokens_seen": 3858656,
"step": 1665
},
{
"epoch": 1.5739868049010366,
"grad_norm": 23.30266761779785,
"learning_rate": 4.33113698600791e-06,
"loss": 0.1587,
"num_input_tokens_seen": 3871776,
"step": 1670
},
{
"epoch": 1.578699340245052,
"grad_norm": 0.21707068383693695,
"learning_rate": 4.325526839199115e-06,
"loss": 0.0377,
"num_input_tokens_seen": 3884384,
"step": 1675
},
{
"epoch": 1.583411875589067,
"grad_norm": 97.02978515625,
"learning_rate": 4.319896928940505e-06,
"loss": 0.2741,
"num_input_tokens_seen": 3896224,
"step": 1680
},
{
"epoch": 1.5881244109330819,
"grad_norm": 6.382898807525635,
"learning_rate": 4.3142473161824e-06,
"loss": 0.1037,
"num_input_tokens_seen": 3906528,
"step": 1685
},
{
"epoch": 1.592836946277097,
"grad_norm": 36.04171371459961,
"learning_rate": 4.308578062088426e-06,
"loss": 0.1437,
"num_input_tokens_seen": 3917728,
"step": 1690
},
{
"epoch": 1.5975494816211122,
"grad_norm": 61.61280822753906,
"learning_rate": 4.302889228034846e-06,
"loss": 0.3957,
"num_input_tokens_seen": 3928032,
"step": 1695
},
{
"epoch": 1.6022620169651272,
"grad_norm": 1.8270617723464966,
"learning_rate": 4.297180875609902e-06,
"loss": 0.1641,
"num_input_tokens_seen": 3940384,
"step": 1700
},
{
"epoch": 1.6069745523091423,
"grad_norm": 0.7876982092857361,
"learning_rate": 4.2914530666131436e-06,
"loss": 0.0949,
"num_input_tokens_seen": 3951904,
"step": 1705
},
{
"epoch": 1.6116870876531575,
"grad_norm": 59.75898742675781,
"learning_rate": 4.285705863054759e-06,
"loss": 0.2799,
"num_input_tokens_seen": 3963360,
"step": 1710
},
{
"epoch": 1.6163996229971724,
"grad_norm": 50.44517517089844,
"learning_rate": 4.279939327154909e-06,
"loss": 0.3126,
"num_input_tokens_seen": 3974432,
"step": 1715
},
{
"epoch": 1.6211121583411876,
"grad_norm": 22.407121658325195,
"learning_rate": 4.274153521343047e-06,
"loss": 0.2358,
"num_input_tokens_seen": 3984352,
"step": 1720
},
{
"epoch": 1.6258246936852028,
"grad_norm": 2.445833206176758,
"learning_rate": 4.268348508257243e-06,
"loss": 0.0892,
"num_input_tokens_seen": 3994016,
"step": 1725
},
{
"epoch": 1.6305372290292177,
"grad_norm": 79.69355010986328,
"learning_rate": 4.262524350743512e-06,
"loss": 0.3199,
"num_input_tokens_seen": 4005856,
"step": 1730
},
{
"epoch": 1.6352497643732327,
"grad_norm": 27.91238784790039,
"learning_rate": 4.25668111185513e-06,
"loss": 0.1497,
"num_input_tokens_seen": 4017248,
"step": 1735
},
{
"epoch": 1.6399622997172478,
"grad_norm": 65.74903106689453,
"learning_rate": 4.250818854851948e-06,
"loss": 0.1124,
"num_input_tokens_seen": 4028128,
"step": 1740
},
{
"epoch": 1.644674835061263,
"grad_norm": 16.284719467163086,
"learning_rate": 4.244937643199711e-06,
"loss": 0.1923,
"num_input_tokens_seen": 4044768,
"step": 1745
},
{
"epoch": 1.649387370405278,
"grad_norm": 68.08360290527344,
"learning_rate": 4.239037540569373e-06,
"loss": 0.1026,
"num_input_tokens_seen": 4062432,
"step": 1750
},
{
"epoch": 1.654099905749293,
"grad_norm": 16.83579444885254,
"learning_rate": 4.233118610836401e-06,
"loss": 0.0699,
"num_input_tokens_seen": 4074016,
"step": 1755
},
{
"epoch": 1.6588124410933083,
"grad_norm": 26.799367904663086,
"learning_rate": 4.227180918080089e-06,
"loss": 0.1875,
"num_input_tokens_seen": 4084704,
"step": 1760
},
{
"epoch": 1.6635249764373232,
"grad_norm": 10.665923118591309,
"learning_rate": 4.221224526582863e-06,
"loss": 0.0828,
"num_input_tokens_seen": 4095136,
"step": 1765
},
{
"epoch": 1.6682375117813384,
"grad_norm": 0.24358469247817993,
"learning_rate": 4.215249500829583e-06,
"loss": 0.1379,
"num_input_tokens_seen": 4107744,
"step": 1770
},
{
"epoch": 1.6729500471253536,
"grad_norm": 0.6852381229400635,
"learning_rate": 4.209255905506847e-06,
"loss": 0.2322,
"num_input_tokens_seen": 4118624,
"step": 1775
},
{
"epoch": 1.6776625824693685,
"grad_norm": 0.456554651260376,
"learning_rate": 4.2032438055022925e-06,
"loss": 0.1804,
"num_input_tokens_seen": 4129184,
"step": 1780
},
{
"epoch": 1.6823751178133834,
"grad_norm": 96.7328872680664,
"learning_rate": 4.197213265903889e-06,
"loss": 0.3414,
"num_input_tokens_seen": 4141024,
"step": 1785
},
{
"epoch": 1.6870876531573988,
"grad_norm": 16.629526138305664,
"learning_rate": 4.191164351999236e-06,
"loss": 0.3523,
"num_input_tokens_seen": 4151840,
"step": 1790
},
{
"epoch": 1.6918001885014138,
"grad_norm": 23.59195899963379,
"learning_rate": 4.18509712927486e-06,
"loss": 0.2797,
"num_input_tokens_seen": 4164704,
"step": 1795
},
{
"epoch": 1.6965127238454287,
"grad_norm": 38.683265686035156,
"learning_rate": 4.179011663415494e-06,
"loss": 0.2943,
"num_input_tokens_seen": 4177184,
"step": 1800
},
{
"epoch": 1.701225259189444,
"grad_norm": 20.35943031311035,
"learning_rate": 4.172908020303384e-06,
"loss": 0.0589,
"num_input_tokens_seen": 4188768,
"step": 1805
},
{
"epoch": 1.705937794533459,
"grad_norm": 25.21088218688965,
"learning_rate": 4.166786266017557e-06,
"loss": 0.1865,
"num_input_tokens_seen": 4200480,
"step": 1810
},
{
"epoch": 1.710650329877474,
"grad_norm": 18.756656646728516,
"learning_rate": 4.160646466833121e-06,
"loss": 0.1045,
"num_input_tokens_seen": 4212064,
"step": 1815
},
{
"epoch": 1.7153628652214892,
"grad_norm": 38.346832275390625,
"learning_rate": 4.154488689220536e-06,
"loss": 0.2373,
"num_input_tokens_seen": 4221728,
"step": 1820
},
{
"epoch": 1.7200754005655043,
"grad_norm": 61.90775680541992,
"learning_rate": 4.1483129998449035e-06,
"loss": 0.216,
"num_input_tokens_seen": 4233888,
"step": 1825
},
{
"epoch": 1.7247879359095193,
"grad_norm": 35.818946838378906,
"learning_rate": 4.142119465565238e-06,
"loss": 0.2308,
"num_input_tokens_seen": 4245344,
"step": 1830
},
{
"epoch": 1.7295004712535345,
"grad_norm": 42.63814163208008,
"learning_rate": 4.135908153433748e-06,
"loss": 0.0663,
"num_input_tokens_seen": 4256992,
"step": 1835
},
{
"epoch": 1.7342130065975496,
"grad_norm": 1.1722609996795654,
"learning_rate": 4.129679130695105e-06,
"loss": 0.0795,
"num_input_tokens_seen": 4266784,
"step": 1840
},
{
"epoch": 1.7389255419415646,
"grad_norm": 73.20691680908203,
"learning_rate": 4.123432464785721e-06,
"loss": 0.0953,
"num_input_tokens_seen": 4281504,
"step": 1845
},
{
"epoch": 1.7436380772855795,
"grad_norm": 61.06163024902344,
"learning_rate": 4.117168223333015e-06,
"loss": 0.3657,
"num_input_tokens_seen": 4296032,
"step": 1850
},
{
"epoch": 1.7483506126295947,
"grad_norm": 3.197977304458618,
"learning_rate": 4.1108864741546815e-06,
"loss": 0.0417,
"num_input_tokens_seen": 4309280,
"step": 1855
},
{
"epoch": 1.7530631479736098,
"grad_norm": 0.4998331665992737,
"learning_rate": 4.1045872852579546e-06,
"loss": 0.1138,
"num_input_tokens_seen": 4319648,
"step": 1860
},
{
"epoch": 1.7549481621112157,
"eval_loss": 0.3500010073184967,
"eval_runtime": 2.7501,
"eval_samples_per_second": 342.894,
"eval_steps_per_second": 42.907,
"num_input_tokens_seen": 4324256,
"step": 1862
},
{
"epoch": 1.7577756833176248,
"grad_norm": 108.458740234375,
"learning_rate": 4.098270724838879e-06,
"loss": 0.0767,
"num_input_tokens_seen": 4330144,
"step": 1865
},
{
"epoch": 1.76248821866164,
"grad_norm": 0.2290242463350296,
"learning_rate": 4.091936861281561e-06,
"loss": 0.0415,
"num_input_tokens_seen": 4343712,
"step": 1870
},
{
"epoch": 1.7672007540056551,
"grad_norm": 93.17559814453125,
"learning_rate": 4.085585763157435e-06,
"loss": 0.4214,
"num_input_tokens_seen": 4354144,
"step": 1875
},
{
"epoch": 1.77191328934967,
"grad_norm": 10.659987449645996,
"learning_rate": 4.07921749922452e-06,
"loss": 0.013,
"num_input_tokens_seen": 4364896,
"step": 1880
},
{
"epoch": 1.7766258246936852,
"grad_norm": 0.5930144786834717,
"learning_rate": 4.0728321384266764e-06,
"loss": 0.1879,
"num_input_tokens_seen": 4377120,
"step": 1885
},
{
"epoch": 1.7813383600377004,
"grad_norm": 0.13112248480319977,
"learning_rate": 4.066429749892854e-06,
"loss": 0.1512,
"num_input_tokens_seen": 4388128,
"step": 1890
},
{
"epoch": 1.7860508953817154,
"grad_norm": 31.263877868652344,
"learning_rate": 4.060010402936353e-06,
"loss": 0.1946,
"num_input_tokens_seen": 4402272,
"step": 1895
},
{
"epoch": 1.7907634307257303,
"grad_norm": 66.94145965576172,
"learning_rate": 4.053574167054063e-06,
"loss": 0.0513,
"num_input_tokens_seen": 4412640,
"step": 1900
},
{
"epoch": 1.7954759660697457,
"grad_norm": 30.63470458984375,
"learning_rate": 4.047121111925718e-06,
"loss": 0.2935,
"num_input_tokens_seen": 4424096,
"step": 1905
},
{
"epoch": 1.8001885014137606,
"grad_norm": 64.27619171142578,
"learning_rate": 4.040651307413142e-06,
"loss": 0.1499,
"num_input_tokens_seen": 4434144,
"step": 1910
},
{
"epoch": 1.8049010367577756,
"grad_norm": 88.78367614746094,
"learning_rate": 4.034164823559487e-06,
"loss": 0.1671,
"num_input_tokens_seen": 4446240,
"step": 1915
},
{
"epoch": 1.8096135721017907,
"grad_norm": 47.201698303222656,
"learning_rate": 4.02766173058848e-06,
"loss": 0.183,
"num_input_tokens_seen": 4455712,
"step": 1920
},
{
"epoch": 1.814326107445806,
"grad_norm": 17.526779174804688,
"learning_rate": 4.021142098903662e-06,
"loss": 0.2619,
"num_input_tokens_seen": 4466144,
"step": 1925
},
{
"epoch": 1.8190386427898209,
"grad_norm": 18.032976150512695,
"learning_rate": 4.014605999087623e-06,
"loss": 0.2168,
"num_input_tokens_seen": 4476064,
"step": 1930
},
{
"epoch": 1.823751178133836,
"grad_norm": 4.104875564575195,
"learning_rate": 4.008053501901239e-06,
"loss": 0.1402,
"num_input_tokens_seen": 4487456,
"step": 1935
},
{
"epoch": 1.8284637134778512,
"grad_norm": 28.21024513244629,
"learning_rate": 4.001484678282911e-06,
"loss": 0.2318,
"num_input_tokens_seen": 4498400,
"step": 1940
},
{
"epoch": 1.8331762488218661,
"grad_norm": 36.88951873779297,
"learning_rate": 3.994899599347787e-06,
"loss": 0.1527,
"num_input_tokens_seen": 4511520,
"step": 1945
},
{
"epoch": 1.837888784165881,
"grad_norm": 12.032304763793945,
"learning_rate": 3.9882983363869995e-06,
"loss": 0.151,
"num_input_tokens_seen": 4523232,
"step": 1950
},
{
"epoch": 1.8426013195098965,
"grad_norm": 22.562625885009766,
"learning_rate": 3.981680960866896e-06,
"loss": 0.084,
"num_input_tokens_seen": 4536416,
"step": 1955
},
{
"epoch": 1.8473138548539114,
"grad_norm": 2.119037389755249,
"learning_rate": 3.9750475444282545e-06,
"loss": 0.1193,
"num_input_tokens_seen": 4546528,
"step": 1960
},
{
"epoch": 1.8520263901979264,
"grad_norm": 5.9970574378967285,
"learning_rate": 3.968398158885519e-06,
"loss": 0.0301,
"num_input_tokens_seen": 4559008,
"step": 1965
},
{
"epoch": 1.8567389255419415,
"grad_norm": 53.16204071044922,
"learning_rate": 3.961732876226016e-06,
"loss": 0.1272,
"num_input_tokens_seen": 4569824,
"step": 1970
},
{
"epoch": 1.8614514608859567,
"grad_norm": 34.37496566772461,
"learning_rate": 3.955051768609179e-06,
"loss": 0.0125,
"num_input_tokens_seen": 4581664,
"step": 1975
},
{
"epoch": 1.8661639962299716,
"grad_norm": 5.8095011711120605,
"learning_rate": 3.948354908365762e-06,
"loss": 0.2273,
"num_input_tokens_seen": 4593696,
"step": 1980
},
{
"epoch": 1.8708765315739868,
"grad_norm": 82.38545989990234,
"learning_rate": 3.941642367997062e-06,
"loss": 0.3306,
"num_input_tokens_seen": 4604064,
"step": 1985
},
{
"epoch": 1.875589066918002,
"grad_norm": 13.79807186126709,
"learning_rate": 3.934914220174128e-06,
"loss": 0.2246,
"num_input_tokens_seen": 4613856,
"step": 1990
},
{
"epoch": 1.880301602262017,
"grad_norm": 9.43858528137207,
"learning_rate": 3.9281705377369814e-06,
"loss": 0.262,
"num_input_tokens_seen": 4624480,
"step": 1995
},
{
"epoch": 1.885014137606032,
"grad_norm": 0.6858423352241516,
"learning_rate": 3.921411393693823e-06,
"loss": 0.0359,
"num_input_tokens_seen": 4634720,
"step": 2000
},
{
"epoch": 1.8897266729500473,
"grad_norm": 12.693150520324707,
"learning_rate": 3.9146368612202425e-06,
"loss": 0.1522,
"num_input_tokens_seen": 4644320,
"step": 2005
},
{
"epoch": 1.8944392082940622,
"grad_norm": 0.35528820753097534,
"learning_rate": 3.907847013658429e-06,
"loss": 0.1144,
"num_input_tokens_seen": 4656672,
"step": 2010
},
{
"epoch": 1.8991517436380771,
"grad_norm": 0.7190976142883301,
"learning_rate": 3.901041924516372e-06,
"loss": 0.152,
"num_input_tokens_seen": 4668832,
"step": 2015
},
{
"epoch": 1.9038642789820923,
"grad_norm": 18.8311767578125,
"learning_rate": 3.894221667467074e-06,
"loss": 0.0683,
"num_input_tokens_seen": 4680096,
"step": 2020
},
{
"epoch": 1.9085768143261075,
"grad_norm": 2.0841264724731445,
"learning_rate": 3.887386316347742e-06,
"loss": 0.0966,
"num_input_tokens_seen": 4692320,
"step": 2025
},
{
"epoch": 1.9132893496701224,
"grad_norm": 90.08401489257812,
"learning_rate": 3.880535945158997e-06,
"loss": 0.1503,
"num_input_tokens_seen": 4709344,
"step": 2030
},
{
"epoch": 1.9180018850141376,
"grad_norm": 0.7957233786582947,
"learning_rate": 3.873670628064071e-06,
"loss": 0.0726,
"num_input_tokens_seen": 4721888,
"step": 2035
},
{
"epoch": 1.9227144203581528,
"grad_norm": 115.30460357666016,
"learning_rate": 3.866790439387998e-06,
"loss": 0.117,
"num_input_tokens_seen": 4732384,
"step": 2040
},
{
"epoch": 1.9274269557021677,
"grad_norm": 0.2744818925857544,
"learning_rate": 3.85989545361682e-06,
"loss": 0.2188,
"num_input_tokens_seen": 4743264,
"step": 2045
},
{
"epoch": 1.9321394910461829,
"grad_norm": 0.26964840292930603,
"learning_rate": 3.85298574539677e-06,
"loss": 0.1091,
"num_input_tokens_seen": 4753248,
"step": 2050
},
{
"epoch": 1.936852026390198,
"grad_norm": 151.45645141601562,
"learning_rate": 3.846061389533472e-06,
"loss": 0.0907,
"num_input_tokens_seen": 4764768,
"step": 2055
},
{
"epoch": 1.941564561734213,
"grad_norm": 72.78887939453125,
"learning_rate": 3.839122460991124e-06,
"loss": 0.2683,
"num_input_tokens_seen": 4775456,
"step": 2060
},
{
"epoch": 1.946277097078228,
"grad_norm": 203.98098754882812,
"learning_rate": 3.832169034891695e-06,
"loss": 0.3549,
"num_input_tokens_seen": 4789152,
"step": 2065
},
{
"epoch": 1.9509896324222433,
"grad_norm": 12.131155014038086,
"learning_rate": 3.825201186514103e-06,
"loss": 0.0639,
"num_input_tokens_seen": 4803488,
"step": 2070
},
{
"epoch": 1.9557021677662583,
"grad_norm": 8.148255348205566,
"learning_rate": 3.818218991293406e-06,
"loss": 0.2019,
"num_input_tokens_seen": 4813216,
"step": 2075
},
{
"epoch": 1.9604147031102732,
"grad_norm": 39.3453369140625,
"learning_rate": 3.811222524819983e-06,
"loss": 0.1943,
"num_input_tokens_seen": 4823584,
"step": 2080
},
{
"epoch": 1.9651272384542884,
"grad_norm": 44.195316314697266,
"learning_rate": 3.8042118628387138e-06,
"loss": 0.0531,
"num_input_tokens_seen": 4838624,
"step": 2085
},
{
"epoch": 1.9698397737983036,
"grad_norm": 69.47586059570312,
"learning_rate": 3.7971870812481636e-06,
"loss": 0.0121,
"num_input_tokens_seen": 4851552,
"step": 2090
},
{
"epoch": 1.9745523091423185,
"grad_norm": 34.5429573059082,
"learning_rate": 3.7901482560997577e-06,
"loss": 0.1929,
"num_input_tokens_seen": 4864352,
"step": 2095
},
{
"epoch": 1.9792648444863337,
"grad_norm": 4.417181015014648,
"learning_rate": 3.78309546359696e-06,
"loss": 0.2053,
"num_input_tokens_seen": 4875616,
"step": 2100
},
{
"epoch": 1.9839773798303488,
"grad_norm": 43.39990997314453,
"learning_rate": 3.776028780094446e-06,
"loss": 0.0107,
"num_input_tokens_seen": 4886560,
"step": 2105
},
{
"epoch": 1.9886899151743638,
"grad_norm": 31.191131591796875,
"learning_rate": 3.7689482820972797e-06,
"loss": 0.2379,
"num_input_tokens_seen": 4898592,
"step": 2110
},
{
"epoch": 1.993402450518379,
"grad_norm": 87.375244140625,
"learning_rate": 3.7618540462600792e-06,
"loss": 0.2504,
"num_input_tokens_seen": 4912160,
"step": 2115
},
{
"epoch": 1.998114985862394,
"grad_norm": 16.684934616088867,
"learning_rate": 3.7547461493861948e-06,
"loss": 0.1832,
"num_input_tokens_seen": 4923424,
"step": 2120
},
{
"epoch": 2.002827521206409,
"grad_norm": 0.0688318982720375,
"learning_rate": 3.7476246684268703e-06,
"loss": 0.0762,
"num_input_tokens_seen": 4932416,
"step": 2125
},
{
"epoch": 2.005655042412818,
"eval_loss": 0.33445462584495544,
"eval_runtime": 3.3719,
"eval_samples_per_second": 279.667,
"eval_steps_per_second": 34.996,
"num_input_tokens_seen": 4940992,
"step": 2128
},
{
"epoch": 2.007540056550424,
"grad_norm": 2.835258722305298,
"learning_rate": 3.740489680480415e-06,
"loss": 0.0528,
"num_input_tokens_seen": 4948288,
"step": 2130
},
{
"epoch": 2.0122525918944394,
"grad_norm": 0.02049732208251953,
"learning_rate": 3.733341262791366e-06,
"loss": 0.0067,
"num_input_tokens_seen": 4960512,
"step": 2135
},
{
"epoch": 2.0169651272384543,
"grad_norm": 0.09395653009414673,
"learning_rate": 3.7261794927496535e-06,
"loss": 0.0027,
"num_input_tokens_seen": 4972352,
"step": 2140
},
{
"epoch": 2.0216776625824693,
"grad_norm": 159.60162353515625,
"learning_rate": 3.719004447889762e-06,
"loss": 0.0681,
"num_input_tokens_seen": 4982272,
"step": 2145
},
{
"epoch": 2.0263901979264842,
"grad_norm": 0.5360152721405029,
"learning_rate": 3.7118162058898915e-06,
"loss": 0.1795,
"num_input_tokens_seen": 4993088,
"step": 2150
},
{
"epoch": 2.0311027332704996,
"grad_norm": 0.0288984514772892,
"learning_rate": 3.704614844571117e-06,
"loss": 0.0124,
"num_input_tokens_seen": 5003392,
"step": 2155
},
{
"epoch": 2.0358152686145146,
"grad_norm": 0.07737737149000168,
"learning_rate": 3.6974004418965435e-06,
"loss": 0.0007,
"num_input_tokens_seen": 5014592,
"step": 2160
},
{
"epoch": 2.0405278039585295,
"grad_norm": 81.7595443725586,
"learning_rate": 3.6901730759704674e-06,
"loss": 0.1943,
"num_input_tokens_seen": 5028160,
"step": 2165
},
{
"epoch": 2.045240339302545,
"grad_norm": 0.019410187378525734,
"learning_rate": 3.682932825037523e-06,
"loss": 0.1365,
"num_input_tokens_seen": 5037504,
"step": 2170
},
{
"epoch": 2.04995287464656,
"grad_norm": 15.080971717834473,
"learning_rate": 3.675679767481842e-06,
"loss": 0.0894,
"num_input_tokens_seen": 5052288,
"step": 2175
},
{
"epoch": 2.054665409990575,
"grad_norm": 10.959814071655273,
"learning_rate": 3.6684139818262045e-06,
"loss": 0.1397,
"num_input_tokens_seen": 5064384,
"step": 2180
},
{
"epoch": 2.05937794533459,
"grad_norm": 158.70689392089844,
"learning_rate": 3.6611355467311825e-06,
"loss": 0.0268,
"num_input_tokens_seen": 5074240,
"step": 2185
},
{
"epoch": 2.064090480678605,
"grad_norm": 0.12513531744480133,
"learning_rate": 3.653844540994298e-06,
"loss": 0.0081,
"num_input_tokens_seen": 5085312,
"step": 2190
},
{
"epoch": 2.06880301602262,
"grad_norm": 0.03574146702885628,
"learning_rate": 3.6465410435491603e-06,
"loss": 0.0006,
"num_input_tokens_seen": 5094592,
"step": 2195
},
{
"epoch": 2.0735155513666355,
"grad_norm": 0.017842473462224007,
"learning_rate": 3.6392251334646194e-06,
"loss": 0.0012,
"num_input_tokens_seen": 5108544,
"step": 2200
},
{
"epoch": 2.0782280867106504,
"grad_norm": 0.040509432554244995,
"learning_rate": 3.6318968899439042e-06,
"loss": 0.2164,
"num_input_tokens_seen": 5118976,
"step": 2205
},
{
"epoch": 2.0829406220546653,
"grad_norm": 0.03663352131843567,
"learning_rate": 3.6245563923237692e-06,
"loss": 0.0004,
"num_input_tokens_seen": 5134272,
"step": 2210
},
{
"epoch": 2.0876531573986803,
"grad_norm": 0.11897611618041992,
"learning_rate": 3.617203720073633e-06,
"loss": 0.0463,
"num_input_tokens_seen": 5145408,
"step": 2215
},
{
"epoch": 2.0923656927426957,
"grad_norm": 0.11080852895975113,
"learning_rate": 3.6098389527947164e-06,
"loss": 0.1413,
"num_input_tokens_seen": 5157440,
"step": 2220
},
{
"epoch": 2.0970782280867106,
"grad_norm": 0.09218670427799225,
"learning_rate": 3.6024621702191876e-06,
"loss": 0.0007,
"num_input_tokens_seen": 5170176,
"step": 2225
},
{
"epoch": 2.1017907634307256,
"grad_norm": 1.5784250497817993,
"learning_rate": 3.5950734522092908e-06,
"loss": 0.2877,
"num_input_tokens_seen": 5178944,
"step": 2230
},
{
"epoch": 2.106503298774741,
"grad_norm": 0.22626134753227234,
"learning_rate": 3.587672878756487e-06,
"loss": 0.0007,
"num_input_tokens_seen": 5190272,
"step": 2235
},
{
"epoch": 2.111215834118756,
"grad_norm": 0.011661054566502571,
"learning_rate": 3.5802605299805843e-06,
"loss": 0.0004,
"num_input_tokens_seen": 5202304,
"step": 2240
},
{
"epoch": 2.115928369462771,
"grad_norm": 129.8340301513672,
"learning_rate": 3.5728364861288743e-06,
"loss": 0.1757,
"num_input_tokens_seen": 5215808,
"step": 2245
},
{
"epoch": 2.1206409048067862,
"grad_norm": 0.05797062814235687,
"learning_rate": 3.5654008275752607e-06,
"loss": 0.0003,
"num_input_tokens_seen": 5229056,
"step": 2250
},
{
"epoch": 2.125353440150801,
"grad_norm": 0.6185352206230164,
"learning_rate": 3.557953634819389e-06,
"loss": 0.0007,
"num_input_tokens_seen": 5239616,
"step": 2255
},
{
"epoch": 2.130065975494816,
"grad_norm": 271.65594482421875,
"learning_rate": 3.550494988485777e-06,
"loss": 0.1511,
"num_input_tokens_seen": 5249600,
"step": 2260
},
{
"epoch": 2.1347785108388315,
"grad_norm": 0.7488783001899719,
"learning_rate": 3.5430249693229403e-06,
"loss": 0.2004,
"num_input_tokens_seen": 5261888,
"step": 2265
},
{
"epoch": 2.1394910461828465,
"grad_norm": 0.022314058616757393,
"learning_rate": 3.5355436582025184e-06,
"loss": 0.0272,
"num_input_tokens_seen": 5272768,
"step": 2270
},
{
"epoch": 2.1442035815268614,
"grad_norm": 0.029360728338360786,
"learning_rate": 3.5280511361183995e-06,
"loss": 0.142,
"num_input_tokens_seen": 5283520,
"step": 2275
},
{
"epoch": 2.1489161168708764,
"grad_norm": 0.04351954534649849,
"learning_rate": 3.5205474841858444e-06,
"loss": 0.0003,
"num_input_tokens_seen": 5294336,
"step": 2280
},
{
"epoch": 2.1536286522148917,
"grad_norm": 0.8838725090026855,
"learning_rate": 3.513032783640605e-06,
"loss": 0.0445,
"num_input_tokens_seen": 5304960,
"step": 2285
},
{
"epoch": 2.1583411875589067,
"grad_norm": 0.011690633371472359,
"learning_rate": 3.5055071158380512e-06,
"loss": 0.0002,
"num_input_tokens_seen": 5317184,
"step": 2290
},
{
"epoch": 2.1630537229029216,
"grad_norm": 0.16222970187664032,
"learning_rate": 3.497970562252282e-06,
"loss": 0.0003,
"num_input_tokens_seen": 5329152,
"step": 2295
},
{
"epoch": 2.167766258246937,
"grad_norm": 128.02944946289062,
"learning_rate": 3.4904232044752507e-06,
"loss": 0.232,
"num_input_tokens_seen": 5342016,
"step": 2300
},
{
"epoch": 2.172478793590952,
"grad_norm": 73.5108413696289,
"learning_rate": 3.4828651242158764e-06,
"loss": 0.1157,
"num_input_tokens_seen": 5352768,
"step": 2305
},
{
"epoch": 2.177191328934967,
"grad_norm": 0.029827579855918884,
"learning_rate": 3.4752964032991638e-06,
"loss": 0.1506,
"num_input_tokens_seen": 5364160,
"step": 2310
},
{
"epoch": 2.181903864278982,
"grad_norm": 0.14194637537002563,
"learning_rate": 3.4677171236653133e-06,
"loss": 0.1442,
"num_input_tokens_seen": 5376448,
"step": 2315
},
{
"epoch": 2.1866163996229973,
"grad_norm": 90.07513427734375,
"learning_rate": 3.460127367368836e-06,
"loss": 0.0562,
"num_input_tokens_seen": 5386560,
"step": 2320
},
{
"epoch": 2.191328934967012,
"grad_norm": 0.10351494699716568,
"learning_rate": 3.452527216577665e-06,
"loss": 0.1956,
"num_input_tokens_seen": 5399296,
"step": 2325
},
{
"epoch": 2.196041470311027,
"grad_norm": 0.17435222864151,
"learning_rate": 3.444916753572267e-06,
"loss": 0.1061,
"num_input_tokens_seen": 5410944,
"step": 2330
},
{
"epoch": 2.2007540056550425,
"grad_norm": 0.2240123301744461,
"learning_rate": 3.4372960607447493e-06,
"loss": 0.0012,
"num_input_tokens_seen": 5423168,
"step": 2335
},
{
"epoch": 2.2054665409990575,
"grad_norm": 0.03307259455323219,
"learning_rate": 3.429665220597968e-06,
"loss": 0.0111,
"num_input_tokens_seen": 5436544,
"step": 2340
},
{
"epoch": 2.2101790763430724,
"grad_norm": 0.026153933256864548,
"learning_rate": 3.4220243157446388e-06,
"loss": 0.0934,
"num_input_tokens_seen": 5448512,
"step": 2345
},
{
"epoch": 2.214891611687088,
"grad_norm": 150.3295440673828,
"learning_rate": 3.4143734289064363e-06,
"loss": 0.0139,
"num_input_tokens_seen": 5460032,
"step": 2350
},
{
"epoch": 2.2196041470311028,
"grad_norm": 0.09933875501155853,
"learning_rate": 3.4067126429131035e-06,
"loss": 0.0004,
"num_input_tokens_seen": 5472896,
"step": 2355
},
{
"epoch": 2.2243166823751177,
"grad_norm": 0.017140116542577744,
"learning_rate": 3.3990420407015534e-06,
"loss": 0.0005,
"num_input_tokens_seen": 5482944,
"step": 2360
},
{
"epoch": 2.229029217719133,
"grad_norm": 88.95214080810547,
"learning_rate": 3.3913617053149694e-06,
"loss": 0.0536,
"num_input_tokens_seen": 5494336,
"step": 2365
},
{
"epoch": 2.233741753063148,
"grad_norm": 0.016227245330810547,
"learning_rate": 3.3836717199019087e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5505728,
"step": 2370
},
{
"epoch": 2.238454288407163,
"grad_norm": 0.01039363257586956,
"learning_rate": 3.3759721677154022e-06,
"loss": 0.0861,
"num_input_tokens_seen": 5515328,
"step": 2375
},
{
"epoch": 2.243166823751178,
"grad_norm": 0.014802216552197933,
"learning_rate": 3.3682631321120507e-06,
"loss": 0.0002,
"num_input_tokens_seen": 5525760,
"step": 2380
},
{
"epoch": 2.2478793590951933,
"grad_norm": 0.8376834392547607,
"learning_rate": 3.3605446965511256e-06,
"loss": 0.168,
"num_input_tokens_seen": 5537280,
"step": 2385
},
{
"epoch": 2.2525918944392083,
"grad_norm": 20.87982749938965,
"learning_rate": 3.3528169445936616e-06,
"loss": 0.0898,
"num_input_tokens_seen": 5548928,
"step": 2390
},
{
"epoch": 2.2563619227144205,
"eval_loss": 0.46465176343917847,
"eval_runtime": 2.7461,
"eval_samples_per_second": 343.401,
"eval_steps_per_second": 42.971,
"num_input_tokens_seen": 5558144,
"step": 2394
},
{
"epoch": 2.257304429783223,
"grad_norm": 446.71234130859375,
"learning_rate": 3.3450799599015567e-06,
"loss": 0.1847,
"num_input_tokens_seen": 5559872,
"step": 2395
},
{
"epoch": 2.2620169651272386,
"grad_norm": 0.04414854571223259,
"learning_rate": 3.3373338262366617e-06,
"loss": 0.0234,
"num_input_tokens_seen": 5571264,
"step": 2400
},
{
"epoch": 2.2667295004712535,
"grad_norm": 0.1296156644821167,
"learning_rate": 3.329578627459878e-06,
"loss": 0.0881,
"num_input_tokens_seen": 5581312,
"step": 2405
},
{
"epoch": 2.2714420358152685,
"grad_norm": 0.03757103905081749,
"learning_rate": 3.3218144475302444e-06,
"loss": 0.0004,
"num_input_tokens_seen": 5592384,
"step": 2410
},
{
"epoch": 2.276154571159284,
"grad_norm": 0.04516446590423584,
"learning_rate": 3.314041370504034e-06,
"loss": 0.1036,
"num_input_tokens_seen": 5603456,
"step": 2415
},
{
"epoch": 2.280867106503299,
"grad_norm": 0.09362529218196869,
"learning_rate": 3.30625948053384e-06,
"loss": 0.0579,
"num_input_tokens_seen": 5614464,
"step": 2420
},
{
"epoch": 2.2855796418473138,
"grad_norm": 10.87307071685791,
"learning_rate": 3.2984688618676665e-06,
"loss": 0.089,
"num_input_tokens_seen": 5626112,
"step": 2425
},
{
"epoch": 2.290292177191329,
"grad_norm": 0.38255080580711365,
"learning_rate": 3.2906695988480144e-06,
"loss": 0.0886,
"num_input_tokens_seen": 5637248,
"step": 2430
},
{
"epoch": 2.295004712535344,
"grad_norm": 35.04936599731445,
"learning_rate": 3.2828617759109715e-06,
"loss": 0.0709,
"num_input_tokens_seen": 5647552,
"step": 2435
},
{
"epoch": 2.299717247879359,
"grad_norm": 0.16362737119197845,
"learning_rate": 3.2750454775852956e-06,
"loss": 0.0006,
"num_input_tokens_seen": 5662080,
"step": 2440
},
{
"epoch": 2.304429783223374,
"grad_norm": 0.023827245458960533,
"learning_rate": 3.2672207884915017e-06,
"loss": 0.0005,
"num_input_tokens_seen": 5673856,
"step": 2445
},
{
"epoch": 2.3091423185673894,
"grad_norm": 55.789493560791016,
"learning_rate": 3.2593877933409436e-06,
"loss": 0.107,
"num_input_tokens_seen": 5683904,
"step": 2450
},
{
"epoch": 2.3138548539114043,
"grad_norm": 0.02205372042953968,
"learning_rate": 3.251546576934897e-06,
"loss": 0.0003,
"num_input_tokens_seen": 5694400,
"step": 2455
},
{
"epoch": 2.3185673892554193,
"grad_norm": 65.09204864501953,
"learning_rate": 3.2436972241636443e-06,
"loss": 0.1635,
"num_input_tokens_seen": 5705664,
"step": 2460
},
{
"epoch": 2.3232799245994347,
"grad_norm": 0.022804006934165955,
"learning_rate": 3.2358398200055515e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5718848,
"step": 2465
},
{
"epoch": 2.3279924599434496,
"grad_norm": 0.01745908334851265,
"learning_rate": 3.227974449526152e-06,
"loss": 0.0504,
"num_input_tokens_seen": 5732096,
"step": 2470
},
{
"epoch": 2.3327049952874646,
"grad_norm": 91.0146713256836,
"learning_rate": 3.2201011978772224e-06,
"loss": 0.09,
"num_input_tokens_seen": 5742144,
"step": 2475
},
{
"epoch": 2.3374175306314795,
"grad_norm": 0.06392789632081985,
"learning_rate": 3.2122201502958635e-06,
"loss": 0.0647,
"num_input_tokens_seen": 5754176,
"step": 2480
},
{
"epoch": 2.342130065975495,
"grad_norm": 0.008629159070551395,
"learning_rate": 3.2043313921035747e-06,
"loss": 0.0155,
"num_input_tokens_seen": 5767104,
"step": 2485
},
{
"epoch": 2.34684260131951,
"grad_norm": 113.13795471191406,
"learning_rate": 3.1964350087053323e-06,
"loss": 0.3015,
"num_input_tokens_seen": 5779520,
"step": 2490
},
{
"epoch": 2.3515551366635252,
"grad_norm": 243.08010864257812,
"learning_rate": 3.1885310855886655e-06,
"loss": 0.0284,
"num_input_tokens_seen": 5792640,
"step": 2495
},
{
"epoch": 2.35626767200754,
"grad_norm": 0.029916413128376007,
"learning_rate": 3.1806197083227276e-06,
"loss": 0.0001,
"num_input_tokens_seen": 5805696,
"step": 2500
},
{
"epoch": 2.360980207351555,
"grad_norm": 0.012451832182705402,
"learning_rate": 3.172700962557373e-06,
"loss": 0.168,
"num_input_tokens_seen": 5819840,
"step": 2505
},
{
"epoch": 2.36569274269557,
"grad_norm": 0.06405292451381683,
"learning_rate": 3.1647749340222288e-06,
"loss": 0.1209,
"num_input_tokens_seen": 5830016,
"step": 2510
},
{
"epoch": 2.3704052780395855,
"grad_norm": 31.766504287719727,
"learning_rate": 3.1568417085257653e-06,
"loss": 0.0744,
"num_input_tokens_seen": 5840000,
"step": 2515
},
{
"epoch": 2.3751178133836004,
"grad_norm": 117.67131805419922,
"learning_rate": 3.1489013719543703e-06,
"loss": 0.0681,
"num_input_tokens_seen": 5849920,
"step": 2520
},
{
"epoch": 2.3798303487276153,
"grad_norm": 17.114727020263672,
"learning_rate": 3.140954010271416e-06,
"loss": 0.2567,
"num_input_tokens_seen": 5860480,
"step": 2525
},
{
"epoch": 2.3845428840716307,
"grad_norm": 0.0346570685505867,
"learning_rate": 3.132999709516329e-06,
"loss": 0.0055,
"num_input_tokens_seen": 5873408,
"step": 2530
},
{
"epoch": 2.3892554194156457,
"grad_norm": 3.354789972305298,
"learning_rate": 3.1250385558036606e-06,
"loss": 0.0887,
"num_input_tokens_seen": 5884608,
"step": 2535
},
{
"epoch": 2.3939679547596606,
"grad_norm": 46.3475227355957,
"learning_rate": 3.1170706353221525e-06,
"loss": 0.2362,
"num_input_tokens_seen": 5896064,
"step": 2540
},
{
"epoch": 2.3986804901036756,
"grad_norm": 0.14980660378932953,
"learning_rate": 3.109096034333805e-06,
"loss": 0.0014,
"num_input_tokens_seen": 5907776,
"step": 2545
},
{
"epoch": 2.403393025447691,
"grad_norm": 63.976871490478516,
"learning_rate": 3.1011148391729434e-06,
"loss": 0.0292,
"num_input_tokens_seen": 5919744,
"step": 2550
},
{
"epoch": 2.408105560791706,
"grad_norm": 0.936824381351471,
"learning_rate": 3.0931271362452803e-06,
"loss": 0.18,
"num_input_tokens_seen": 5932224,
"step": 2555
},
{
"epoch": 2.412818096135721,
"grad_norm": 0.04161603003740311,
"learning_rate": 3.085133012026985e-06,
"loss": 0.001,
"num_input_tokens_seen": 5943424,
"step": 2560
},
{
"epoch": 2.4175306314797362,
"grad_norm": 50.99045181274414,
"learning_rate": 3.0771325530637434e-06,
"loss": 0.1243,
"num_input_tokens_seen": 5955904,
"step": 2565
},
{
"epoch": 2.422243166823751,
"grad_norm": 1.0831135511398315,
"learning_rate": 3.0691258459698227e-06,
"loss": 0.0789,
"num_input_tokens_seen": 5967360,
"step": 2570
},
{
"epoch": 2.426955702167766,
"grad_norm": 0.2694717049598694,
"learning_rate": 3.0611129774271318e-06,
"loss": 0.1948,
"num_input_tokens_seen": 5980608,
"step": 2575
},
{
"epoch": 2.4316682375117815,
"grad_norm": 0.017116645351052284,
"learning_rate": 3.0530940341842883e-06,
"loss": 0.0003,
"num_input_tokens_seen": 5993472,
"step": 2580
},
{
"epoch": 2.4363807728557965,
"grad_norm": 0.10097761452198029,
"learning_rate": 3.045069103055672e-06,
"loss": 0.0005,
"num_input_tokens_seen": 6003520,
"step": 2585
},
{
"epoch": 2.4410933081998114,
"grad_norm": 2.4190433025360107,
"learning_rate": 3.037038270920489e-06,
"loss": 0.0118,
"num_input_tokens_seen": 6014720,
"step": 2590
},
{
"epoch": 2.445805843543827,
"grad_norm": 0.22586967051029205,
"learning_rate": 3.0290016247218323e-06,
"loss": 0.0956,
"num_input_tokens_seen": 6032192,
"step": 2595
},
{
"epoch": 2.4505183788878417,
"grad_norm": 0.04388771951198578,
"learning_rate": 3.0209592514657365e-06,
"loss": 0.2412,
"num_input_tokens_seen": 6043328,
"step": 2600
},
{
"epoch": 2.4552309142318567,
"grad_norm": 27.575590133666992,
"learning_rate": 3.012911238220241e-06,
"loss": 0.0061,
"num_input_tokens_seen": 6055424,
"step": 2605
},
{
"epoch": 2.4599434495758716,
"grad_norm": 0.026035049930214882,
"learning_rate": 3.004857672114443e-06,
"loss": 0.2284,
"num_input_tokens_seen": 6065472,
"step": 2610
},
{
"epoch": 2.464655984919887,
"grad_norm": 1.065898060798645,
"learning_rate": 2.996798640337556e-06,
"loss": 0.0007,
"num_input_tokens_seen": 6078016,
"step": 2615
},
{
"epoch": 2.469368520263902,
"grad_norm": 17.635618209838867,
"learning_rate": 2.9887342301379653e-06,
"loss": 0.0974,
"num_input_tokens_seen": 6089472,
"step": 2620
},
{
"epoch": 2.474081055607917,
"grad_norm": 9.892471313476562,
"learning_rate": 2.9806645288222854e-06,
"loss": 0.1484,
"num_input_tokens_seen": 6100992,
"step": 2625
},
{
"epoch": 2.4787935909519323,
"grad_norm": 0.05071339011192322,
"learning_rate": 2.9725896237544115e-06,
"loss": 0.0821,
"num_input_tokens_seen": 6112768,
"step": 2630
},
{
"epoch": 2.4835061262959472,
"grad_norm": 0.19504772126674652,
"learning_rate": 2.9645096023545774e-06,
"loss": 0.0017,
"num_input_tokens_seen": 6122752,
"step": 2635
},
{
"epoch": 2.488218661639962,
"grad_norm": 0.5724993348121643,
"learning_rate": 2.956424552098405e-06,
"loss": 0.05,
"num_input_tokens_seen": 6136256,
"step": 2640
},
{
"epoch": 2.492931196983977,
"grad_norm": 0.17895296216011047,
"learning_rate": 2.94833456051596e-06,
"loss": 0.0714,
"num_input_tokens_seen": 6147264,
"step": 2645
},
{
"epoch": 2.4976437323279925,
"grad_norm": 0.440418004989624,
"learning_rate": 2.9402397151908056e-06,
"loss": 0.0012,
"num_input_tokens_seen": 6161088,
"step": 2650
},
{
"epoch": 2.5023562676720075,
"grad_norm": 0.04092998430132866,
"learning_rate": 2.93214010375905e-06,
"loss": 0.0567,
"num_input_tokens_seen": 6173568,
"step": 2655
},
{
"epoch": 2.507068803016023,
"grad_norm": 0.02762027271091938,
"learning_rate": 2.924035813908402e-06,
"loss": 0.0692,
"num_input_tokens_seen": 6183872,
"step": 2660
},
{
"epoch": 2.507068803016023,
"eval_loss": 0.40977799892425537,
"eval_runtime": 2.7366,
"eval_samples_per_second": 344.588,
"eval_steps_per_second": 43.119,
"num_input_tokens_seen": 6183872,
"step": 2660
},
{
"epoch": 2.511781338360038,
"grad_norm": 0.12844966351985931,
"learning_rate": 2.9159269333772173e-06,
"loss": 0.0693,
"num_input_tokens_seen": 6195648,
"step": 2665
},
{
"epoch": 2.5164938737040528,
"grad_norm": 0.043845776468515396,
"learning_rate": 2.9078135499535535e-06,
"loss": 0.0003,
"num_input_tokens_seen": 6205696,
"step": 2670
},
{
"epoch": 2.5212064090480677,
"grad_norm": 0.1523384004831314,
"learning_rate": 2.8996957514742164e-06,
"loss": 0.0993,
"num_input_tokens_seen": 6219648,
"step": 2675
},
{
"epoch": 2.525918944392083,
"grad_norm": 0.03311268240213394,
"learning_rate": 2.891573625823808e-06,
"loss": 0.0016,
"num_input_tokens_seen": 6233664,
"step": 2680
},
{
"epoch": 2.530631479736098,
"grad_norm": 0.013956856913864613,
"learning_rate": 2.883447260933781e-06,
"loss": 0.0002,
"num_input_tokens_seen": 6246400,
"step": 2685
},
{
"epoch": 2.535344015080113,
"grad_norm": 0.00951316673308611,
"learning_rate": 2.875316744781479e-06,
"loss": 0.0776,
"num_input_tokens_seen": 6256576,
"step": 2690
},
{
"epoch": 2.5400565504241284,
"grad_norm": 18.4660587310791,
"learning_rate": 2.8671821653891903e-06,
"loss": 0.0909,
"num_input_tokens_seen": 6266240,
"step": 2695
},
{
"epoch": 2.5447690857681433,
"grad_norm": 14.168307304382324,
"learning_rate": 2.85904361082319e-06,
"loss": 0.1384,
"num_input_tokens_seen": 6279872,
"step": 2700
},
{
"epoch": 2.5494816211121583,
"grad_norm": 0.031402263790369034,
"learning_rate": 2.8509011691927923e-06,
"loss": 0.0001,
"num_input_tokens_seen": 6290048,
"step": 2705
},
{
"epoch": 2.554194156456173,
"grad_norm": 0.06130323186516762,
"learning_rate": 2.8427549286493906e-06,
"loss": 0.0368,
"num_input_tokens_seen": 6301120,
"step": 2710
},
{
"epoch": 2.5589066918001886,
"grad_norm": 0.13699810206890106,
"learning_rate": 2.8346049773855077e-06,
"loss": 0.1002,
"num_input_tokens_seen": 6312512,
"step": 2715
},
{
"epoch": 2.5636192271442035,
"grad_norm": 0.026166977360844612,
"learning_rate": 2.8264514036338385e-06,
"loss": 0.0002,
"num_input_tokens_seen": 6323776,
"step": 2720
},
{
"epoch": 2.568331762488219,
"grad_norm": 0.05908394604921341,
"learning_rate": 2.818294295666295e-06,
"loss": 0.0003,
"num_input_tokens_seen": 6334208,
"step": 2725
},
{
"epoch": 2.573044297832234,
"grad_norm": 1.3038756847381592,
"learning_rate": 2.8101337417930523e-06,
"loss": 0.0952,
"num_input_tokens_seen": 6345216,
"step": 2730
},
{
"epoch": 2.577756833176249,
"grad_norm": 39.35552215576172,
"learning_rate": 2.8019698303615912e-06,
"loss": 0.2239,
"num_input_tokens_seen": 6354304,
"step": 2735
},
{
"epoch": 2.5824693685202638,
"grad_norm": 0.07452750205993652,
"learning_rate": 2.7938026497557414e-06,
"loss": 0.0628,
"num_input_tokens_seen": 6368192,
"step": 2740
},
{
"epoch": 2.5871819038642787,
"grad_norm": 0.034812554717063904,
"learning_rate": 2.7856322883947253e-06,
"loss": 0.0454,
"num_input_tokens_seen": 6382400,
"step": 2745
},
{
"epoch": 2.591894439208294,
"grad_norm": 0.02569451369345188,
"learning_rate": 2.7774588347322016e-06,
"loss": 0.0836,
"num_input_tokens_seen": 6395584,
"step": 2750
},
{
"epoch": 2.596606974552309,
"grad_norm": 479.6157531738281,
"learning_rate": 2.7692823772553057e-06,
"loss": 0.1468,
"num_input_tokens_seen": 6406720,
"step": 2755
},
{
"epoch": 2.6013195098963244,
"grad_norm": 371.5435791015625,
"learning_rate": 2.7611030044836927e-06,
"loss": 0.1705,
"num_input_tokens_seen": 6418112,
"step": 2760
},
{
"epoch": 2.6060320452403394,
"grad_norm": 58.181087493896484,
"learning_rate": 2.752920804968581e-06,
"loss": 0.0602,
"num_input_tokens_seen": 6431104,
"step": 2765
},
{
"epoch": 2.6107445805843543,
"grad_norm": 3.1500463485717773,
"learning_rate": 2.744735867291789e-06,
"loss": 0.0038,
"num_input_tokens_seen": 6441792,
"step": 2770
},
{
"epoch": 2.6154571159283693,
"grad_norm": 0.12725692987442017,
"learning_rate": 2.736548280064781e-06,
"loss": 0.167,
"num_input_tokens_seen": 6452672,
"step": 2775
},
{
"epoch": 2.6201696512723847,
"grad_norm": 0.05792888626456261,
"learning_rate": 2.728358131927704e-06,
"loss": 0.1083,
"num_input_tokens_seen": 6465600,
"step": 2780
},
{
"epoch": 2.6248821866163996,
"grad_norm": 2.1484336853027344,
"learning_rate": 2.720165511548433e-06,
"loss": 0.0731,
"num_input_tokens_seen": 6477312,
"step": 2785
},
{
"epoch": 2.6295947219604145,
"grad_norm": 13.481400489807129,
"learning_rate": 2.711970507621603e-06,
"loss": 0.179,
"num_input_tokens_seen": 6486592,
"step": 2790
},
{
"epoch": 2.63430725730443,
"grad_norm": 0.2009446918964386,
"learning_rate": 2.7037732088676583e-06,
"loss": 0.0011,
"num_input_tokens_seen": 6497088,
"step": 2795
},
{
"epoch": 2.639019792648445,
"grad_norm": 0.11857722699642181,
"learning_rate": 2.6955737040318853e-06,
"loss": 0.0035,
"num_input_tokens_seen": 6505984,
"step": 2800
},
{
"epoch": 2.64373232799246,
"grad_norm": 0.26333293318748474,
"learning_rate": 2.687372081883454e-06,
"loss": 0.0009,
"num_input_tokens_seen": 6516928,
"step": 2805
},
{
"epoch": 2.6484448633364748,
"grad_norm": 0.025169173255562782,
"learning_rate": 2.6791684312144565e-06,
"loss": 0.0096,
"num_input_tokens_seen": 6527424,
"step": 2810
},
{
"epoch": 2.65315739868049,
"grad_norm": 0.04603775218129158,
"learning_rate": 2.670962840838946e-06,
"loss": 0.0955,
"num_input_tokens_seen": 6538432,
"step": 2815
},
{
"epoch": 2.657869934024505,
"grad_norm": 116.4062271118164,
"learning_rate": 2.6627553995919763e-06,
"loss": 0.0341,
"num_input_tokens_seen": 6551552,
"step": 2820
},
{
"epoch": 2.6625824693685205,
"grad_norm": 0.07408913224935532,
"learning_rate": 2.6545461963286374e-06,
"loss": 0.0005,
"num_input_tokens_seen": 6566208,
"step": 2825
},
{
"epoch": 2.6672950047125354,
"grad_norm": 67.10398864746094,
"learning_rate": 2.646335319923097e-06,
"loss": 0.1887,
"num_input_tokens_seen": 6577472,
"step": 2830
},
{
"epoch": 2.6720075400565504,
"grad_norm": 0.03928399085998535,
"learning_rate": 2.6381228592676343e-06,
"loss": 0.1243,
"num_input_tokens_seen": 6588608,
"step": 2835
},
{
"epoch": 2.6767200754005653,
"grad_norm": 0.28905507922172546,
"learning_rate": 2.629908903271683e-06,
"loss": 0.1048,
"num_input_tokens_seen": 6601088,
"step": 2840
},
{
"epoch": 2.6814326107445807,
"grad_norm": 0.06444710493087769,
"learning_rate": 2.6216935408608617e-06,
"loss": 0.0005,
"num_input_tokens_seen": 6611392,
"step": 2845
},
{
"epoch": 2.6861451460885957,
"grad_norm": 14.772208213806152,
"learning_rate": 2.6134768609760187e-06,
"loss": 0.001,
"num_input_tokens_seen": 6622656,
"step": 2850
},
{
"epoch": 2.6908576814326106,
"grad_norm": 222.56637573242188,
"learning_rate": 2.605258952572263e-06,
"loss": 0.0916,
"num_input_tokens_seen": 6635264,
"step": 2855
},
{
"epoch": 2.695570216776626,
"grad_norm": 38.73012924194336,
"learning_rate": 2.5970399046180043e-06,
"loss": 0.0028,
"num_input_tokens_seen": 6647680,
"step": 2860
},
{
"epoch": 2.700282752120641,
"grad_norm": 0.014610473066568375,
"learning_rate": 2.588819806093991e-06,
"loss": 0.0001,
"num_input_tokens_seen": 6662016,
"step": 2865
},
{
"epoch": 2.704995287464656,
"grad_norm": 0.013276712968945503,
"learning_rate": 2.580598745992342e-06,
"loss": 0.1805,
"num_input_tokens_seen": 6673024,
"step": 2870
},
{
"epoch": 2.709707822808671,
"grad_norm": 0.011643171310424805,
"learning_rate": 2.5723768133155894e-06,
"loss": 0.0001,
"num_input_tokens_seen": 6684416,
"step": 2875
},
{
"epoch": 2.7144203581526862,
"grad_norm": 0.04866794869303703,
"learning_rate": 2.5641540970757105e-06,
"loss": 0.0783,
"num_input_tokens_seen": 6696448,
"step": 2880
},
{
"epoch": 2.719132893496701,
"grad_norm": 0.007833253592252731,
"learning_rate": 2.555930686293165e-06,
"loss": 0.0002,
"num_input_tokens_seen": 6710528,
"step": 2885
},
{
"epoch": 2.7238454288407166,
"grad_norm": 0.03470620512962341,
"learning_rate": 2.547706669995933e-06,
"loss": 0.0004,
"num_input_tokens_seen": 6722176,
"step": 2890
},
{
"epoch": 2.7285579641847315,
"grad_norm": 31.088220596313477,
"learning_rate": 2.53948213721855e-06,
"loss": 0.1775,
"num_input_tokens_seen": 6732416,
"step": 2895
},
{
"epoch": 2.7332704995287465,
"grad_norm": 0.12457949668169022,
"learning_rate": 2.531257177001141e-06,
"loss": 0.1137,
"num_input_tokens_seen": 6745728,
"step": 2900
},
{
"epoch": 2.7379830348727614,
"grad_norm": 11.16380786895752,
"learning_rate": 2.523031878388463e-06,
"loss": 0.0956,
"num_input_tokens_seen": 6756096,
"step": 2905
},
{
"epoch": 2.742695570216777,
"grad_norm": 1.0219601392745972,
"learning_rate": 2.5148063304289306e-06,
"loss": 0.063,
"num_input_tokens_seen": 6766976,
"step": 2910
},
{
"epoch": 2.7474081055607917,
"grad_norm": 0.03135580196976662,
"learning_rate": 2.5065806221736617e-06,
"loss": 0.1039,
"num_input_tokens_seen": 6777792,
"step": 2915
},
{
"epoch": 2.7521206409048067,
"grad_norm": 0.06795566529035568,
"learning_rate": 2.4983548426755104e-06,
"loss": 0.0003,
"num_input_tokens_seen": 6789568,
"step": 2920
},
{
"epoch": 2.756833176248822,
"grad_norm": 0.024466486647725105,
"learning_rate": 2.4901290809880984e-06,
"loss": 0.227,
"num_input_tokens_seen": 6803392,
"step": 2925
},
{
"epoch": 2.757775683317625,
"eval_loss": 0.43027257919311523,
"eval_runtime": 2.7751,
"eval_samples_per_second": 339.806,
"eval_steps_per_second": 42.521,
"num_input_tokens_seen": 6806208,
"step": 2926
},
{
"epoch": 2.761545711592837,
"grad_norm": 0.08801653981208801,
"learning_rate": 2.4819034261648574e-06,
"loss": 0.0645,
"num_input_tokens_seen": 6821760,
"step": 2930
},
{
"epoch": 2.766258246936852,
"grad_norm": 0.02651871182024479,
"learning_rate": 2.4736779672580625e-06,
"loss": 0.2084,
"num_input_tokens_seen": 6834688,
"step": 2935
},
{
"epoch": 2.770970782280867,
"grad_norm": 0.9018564224243164,
"learning_rate": 2.465452793317865e-06,
"loss": 0.0731,
"num_input_tokens_seen": 6846784,
"step": 2940
},
{
"epoch": 2.7756833176248823,
"grad_norm": 0.10618780553340912,
"learning_rate": 2.457227993391333e-06,
"loss": 0.0866,
"num_input_tokens_seen": 6859520,
"step": 2945
},
{
"epoch": 2.7803958529688972,
"grad_norm": 0.10207852721214294,
"learning_rate": 2.4490036565214876e-06,
"loss": 0.0008,
"num_input_tokens_seen": 6871296,
"step": 2950
},
{
"epoch": 2.785108388312912,
"grad_norm": 125.38693237304688,
"learning_rate": 2.440779871746331e-06,
"loss": 0.0151,
"num_input_tokens_seen": 6882496,
"step": 2955
},
{
"epoch": 2.7898209236569276,
"grad_norm": 0.05238529294729233,
"learning_rate": 2.4325567280978937e-06,
"loss": 0.0708,
"num_input_tokens_seen": 6894528,
"step": 2960
},
{
"epoch": 2.7945334590009425,
"grad_norm": 56.63581848144531,
"learning_rate": 2.424334314601263e-06,
"loss": 0.1738,
"num_input_tokens_seen": 6904960,
"step": 2965
},
{
"epoch": 2.7992459943449575,
"grad_norm": 0.020289117470383644,
"learning_rate": 2.416112720273623e-06,
"loss": 0.155,
"num_input_tokens_seen": 6914944,
"step": 2970
},
{
"epoch": 2.8039585296889724,
"grad_norm": 0.2439601868391037,
"learning_rate": 2.4078920341232856e-06,
"loss": 0.0006,
"num_input_tokens_seen": 6926080,
"step": 2975
},
{
"epoch": 2.808671065032988,
"grad_norm": 22.042722702026367,
"learning_rate": 2.3996723451487344e-06,
"loss": 0.0028,
"num_input_tokens_seen": 6936832,
"step": 2980
},
{
"epoch": 2.8133836003770027,
"grad_norm": 0.06483375281095505,
"learning_rate": 2.391453742337657e-06,
"loss": 0.2284,
"num_input_tokens_seen": 6948160,
"step": 2985
},
{
"epoch": 2.818096135721018,
"grad_norm": 0.019181005656719208,
"learning_rate": 2.3832363146659806e-06,
"loss": 0.0003,
"num_input_tokens_seen": 6958848,
"step": 2990
},
{
"epoch": 2.822808671065033,
"grad_norm": 265.2681579589844,
"learning_rate": 2.37502015109691e-06,
"loss": 0.1133,
"num_input_tokens_seen": 6970432,
"step": 2995
},
{
"epoch": 2.827521206409048,
"grad_norm": 0.018625818192958832,
"learning_rate": 2.3668053405799667e-06,
"loss": 0.0691,
"num_input_tokens_seen": 6980480,
"step": 3000
},
{
"epoch": 2.832233741753063,
"grad_norm": 241.9073486328125,
"learning_rate": 2.3585919720500214e-06,
"loss": 0.0368,
"num_input_tokens_seen": 6989760,
"step": 3005
},
{
"epoch": 2.8369462770970784,
"grad_norm": 0.006676084361970425,
"learning_rate": 2.3503801344263347e-06,
"loss": 0.093,
"num_input_tokens_seen": 6999232,
"step": 3010
},
{
"epoch": 2.8416588124410933,
"grad_norm": 97.66014862060547,
"learning_rate": 2.3421699166115946e-06,
"loss": 0.2148,
"num_input_tokens_seen": 7010944,
"step": 3015
},
{
"epoch": 2.8463713477851083,
"grad_norm": 0.0063159572891891,
"learning_rate": 2.3339614074909495e-06,
"loss": 0.1475,
"num_input_tokens_seen": 7021824,
"step": 3020
},
{
"epoch": 2.8510838831291236,
"grad_norm": 0.022584695369005203,
"learning_rate": 2.325754695931054e-06,
"loss": 0.1085,
"num_input_tokens_seen": 7031488,
"step": 3025
},
{
"epoch": 2.8557964184731386,
"grad_norm": 34.398460388183594,
"learning_rate": 2.3175498707790964e-06,
"loss": 0.0536,
"num_input_tokens_seen": 7041088,
"step": 3030
},
{
"epoch": 2.8605089538171535,
"grad_norm": 48.400482177734375,
"learning_rate": 2.3093470208618467e-06,
"loss": 0.1759,
"num_input_tokens_seen": 7051840,
"step": 3035
},
{
"epoch": 2.8652214891611685,
"grad_norm": 0.9638648629188538,
"learning_rate": 2.3011462349846907e-06,
"loss": 0.0005,
"num_input_tokens_seen": 7062848,
"step": 3040
},
{
"epoch": 2.869934024505184,
"grad_norm": 2.5327022075653076,
"learning_rate": 2.292947601930664e-06,
"loss": 0.0006,
"num_input_tokens_seen": 7079296,
"step": 3045
},
{
"epoch": 2.874646559849199,
"grad_norm": 27.3743896484375,
"learning_rate": 2.2847512104595005e-06,
"loss": 0.1614,
"num_input_tokens_seen": 7090752,
"step": 3050
},
{
"epoch": 2.879359095193214,
"grad_norm": 0.02387884445488453,
"learning_rate": 2.2765571493066647e-06,
"loss": 0.0003,
"num_input_tokens_seen": 7102464,
"step": 3055
},
{
"epoch": 2.884071630537229,
"grad_norm": 0.011423971503973007,
"learning_rate": 2.2683655071823925e-06,
"loss": 0.038,
"num_input_tokens_seen": 7117376,
"step": 3060
},
{
"epoch": 2.888784165881244,
"grad_norm": 0.048735495656728745,
"learning_rate": 2.2601763727707295e-06,
"loss": 0.0809,
"num_input_tokens_seen": 7131584,
"step": 3065
},
{
"epoch": 2.893496701225259,
"grad_norm": 0.3432343602180481,
"learning_rate": 2.2519898347285745e-06,
"loss": 0.1831,
"num_input_tokens_seen": 7142720,
"step": 3070
},
{
"epoch": 2.8982092365692744,
"grad_norm": 48.5273323059082,
"learning_rate": 2.2438059816847165e-06,
"loss": 0.1239,
"num_input_tokens_seen": 7155520,
"step": 3075
},
{
"epoch": 2.9029217719132894,
"grad_norm": 0.019747210666537285,
"learning_rate": 2.235624902238879e-06,
"loss": 0.0753,
"num_input_tokens_seen": 7165504,
"step": 3080
},
{
"epoch": 2.9076343072573043,
"grad_norm": 87.9238052368164,
"learning_rate": 2.2274466849607526e-06,
"loss": 0.118,
"num_input_tokens_seen": 7176384,
"step": 3085
},
{
"epoch": 2.9123468426013197,
"grad_norm": 0.024710891768336296,
"learning_rate": 2.219271418389046e-06,
"loss": 0.0012,
"num_input_tokens_seen": 7188288,
"step": 3090
},
{
"epoch": 2.9170593779453347,
"grad_norm": 48.7574577331543,
"learning_rate": 2.2110991910305233e-06,
"loss": 0.1523,
"num_input_tokens_seen": 7199680,
"step": 3095
},
{
"epoch": 2.9217719132893496,
"grad_norm": 24.760589599609375,
"learning_rate": 2.2029300913590413e-06,
"loss": 0.0548,
"num_input_tokens_seen": 7211520,
"step": 3100
},
{
"epoch": 2.9264844486333645,
"grad_norm": 0.13239729404449463,
"learning_rate": 2.1947642078146005e-06,
"loss": 0.0932,
"num_input_tokens_seen": 7221440,
"step": 3105
},
{
"epoch": 2.93119698397738,
"grad_norm": 0.09687553346157074,
"learning_rate": 2.1866016288023815e-06,
"loss": 0.0528,
"num_input_tokens_seen": 7232128,
"step": 3110
},
{
"epoch": 2.935909519321395,
"grad_norm": 23.3001651763916,
"learning_rate": 2.178442442691789e-06,
"loss": 0.1414,
"num_input_tokens_seen": 7241984,
"step": 3115
},
{
"epoch": 2.9406220546654103,
"grad_norm": 0.11602967977523804,
"learning_rate": 2.170286737815495e-06,
"loss": 0.0745,
"num_input_tokens_seen": 7252672,
"step": 3120
},
{
"epoch": 2.945334590009425,
"grad_norm": 0.5076259970664978,
"learning_rate": 2.1621346024684854e-06,
"loss": 0.0453,
"num_input_tokens_seen": 7264064,
"step": 3125
},
{
"epoch": 2.95004712535344,
"grad_norm": 2.5609045028686523,
"learning_rate": 2.1539861249071004e-06,
"loss": 0.0268,
"num_input_tokens_seen": 7275776,
"step": 3130
},
{
"epoch": 2.954759660697455,
"grad_norm": 1.9134023189544678,
"learning_rate": 2.145841393348079e-06,
"loss": 0.0361,
"num_input_tokens_seen": 7287680,
"step": 3135
},
{
"epoch": 2.95947219604147,
"grad_norm": 0.09775304049253464,
"learning_rate": 2.1377004959676086e-06,
"loss": 0.001,
"num_input_tokens_seen": 7300032,
"step": 3140
},
{
"epoch": 2.9641847313854854,
"grad_norm": 1.4209673404693604,
"learning_rate": 2.129563520900364e-06,
"loss": 0.0632,
"num_input_tokens_seen": 7311616,
"step": 3145
},
{
"epoch": 2.9688972667295004,
"grad_norm": 0.10756014287471771,
"learning_rate": 2.1214305562385592e-06,
"loss": 0.1604,
"num_input_tokens_seen": 7321600,
"step": 3150
},
{
"epoch": 2.9736098020735158,
"grad_norm": 0.026215003803372383,
"learning_rate": 2.1133016900309876e-06,
"loss": 0.0003,
"num_input_tokens_seen": 7333376,
"step": 3155
},
{
"epoch": 2.9783223374175307,
"grad_norm": 0.01626676134765148,
"learning_rate": 2.1051770102820755e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7344384,
"step": 3160
},
{
"epoch": 2.9830348727615457,
"grad_norm": 0.2678651511669159,
"learning_rate": 2.0970566049509236e-06,
"loss": 0.0799,
"num_input_tokens_seen": 7355840,
"step": 3165
},
{
"epoch": 2.9877474081055606,
"grad_norm": 0.012307991273701191,
"learning_rate": 2.088940561950359e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7368128,
"step": 3170
},
{
"epoch": 2.992459943449576,
"grad_norm": 0.03959092125296593,
"learning_rate": 2.080828969145979e-06,
"loss": 0.1426,
"num_input_tokens_seen": 7381056,
"step": 3175
},
{
"epoch": 2.997172478793591,
"grad_norm": 16.04501724243164,
"learning_rate": 2.0727219143552034e-06,
"loss": 0.094,
"num_input_tokens_seen": 7393536,
"step": 3180
},
{
"epoch": 3.001885014137606,
"grad_norm": 0.013006187044084072,
"learning_rate": 2.0646194853463255e-06,
"loss": 0.0923,
"num_input_tokens_seen": 7402656,
"step": 3185
},
{
"epoch": 3.0065975494816213,
"grad_norm": 0.05889980494976044,
"learning_rate": 2.056521769837553e-06,
"loss": 0.0004,
"num_input_tokens_seen": 7416480,
"step": 3190
},
{
"epoch": 3.008482563619227,
"eval_loss": 0.3936729431152344,
"eval_runtime": 2.7505,
"eval_samples_per_second": 342.849,
"eval_steps_per_second": 42.902,
"num_input_tokens_seen": 7421856,
"step": 3192
},
{
"epoch": 3.0113100848256362,
"grad_norm": 0.11007480323314667,
"learning_rate": 2.0484288554960707e-06,
"loss": 0.0003,
"num_input_tokens_seen": 7430304,
"step": 3195
},
{
"epoch": 3.016022620169651,
"grad_norm": 0.023877175524830818,
"learning_rate": 2.040340829937082e-06,
"loss": 0.052,
"num_input_tokens_seen": 7441568,
"step": 3200
},
{
"epoch": 3.0207351555136666,
"grad_norm": 0.016011981293559074,
"learning_rate": 2.032257780722865e-06,
"loss": 0.0003,
"num_input_tokens_seen": 7451744,
"step": 3205
},
{
"epoch": 3.0254476908576815,
"grad_norm": 0.02110173925757408,
"learning_rate": 2.0241797953618204e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7463008,
"step": 3210
},
{
"epoch": 3.0301602262016964,
"grad_norm": 0.018056534230709076,
"learning_rate": 2.0161069613075295e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7475424,
"step": 3215
},
{
"epoch": 3.0348727615457114,
"grad_norm": 0.018598072230815887,
"learning_rate": 2.008039365957804e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7486368,
"step": 3220
},
{
"epoch": 3.039585296889727,
"grad_norm": 2.553637981414795,
"learning_rate": 1.9999770966537416e-06,
"loss": 0.0005,
"num_input_tokens_seen": 7497312,
"step": 3225
},
{
"epoch": 3.0442978322337417,
"grad_norm": 81.77398681640625,
"learning_rate": 1.991920240678776e-06,
"loss": 0.0457,
"num_input_tokens_seen": 7507552,
"step": 3230
},
{
"epoch": 3.0490103675777567,
"grad_norm": 0.0028707189485430717,
"learning_rate": 1.983868885257739e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7519008,
"step": 3235
},
{
"epoch": 3.053722902921772,
"grad_norm": 25.919538497924805,
"learning_rate": 1.97582311755591e-06,
"loss": 0.0908,
"num_input_tokens_seen": 7530400,
"step": 3240
},
{
"epoch": 3.058435438265787,
"grad_norm": 0.009589405730366707,
"learning_rate": 1.9677830246780764e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7544096,
"step": 3245
},
{
"epoch": 3.063147973609802,
"grad_norm": 0.04477664828300476,
"learning_rate": 1.9597486936675886e-06,
"loss": 0.0044,
"num_input_tokens_seen": 7554784,
"step": 3250
},
{
"epoch": 3.0678605089538173,
"grad_norm": 0.0675143375992775,
"learning_rate": 1.9517202115054174e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7567392,
"step": 3255
},
{
"epoch": 3.0725730442978323,
"grad_norm": 0.010941299609839916,
"learning_rate": 1.9436976651092143e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7578016,
"step": 3260
},
{
"epoch": 3.0772855796418472,
"grad_norm": 0.00475684879347682,
"learning_rate": 1.9356811413323686e-06,
"loss": 0.0689,
"num_input_tokens_seen": 7589728,
"step": 3265
},
{
"epoch": 3.081998114985862,
"grad_norm": 0.009347557090222836,
"learning_rate": 1.9276707269630664e-06,
"loss": 0.0006,
"num_input_tokens_seen": 7601184,
"step": 3270
},
{
"epoch": 3.0867106503298776,
"grad_norm": 0.02251746505498886,
"learning_rate": 1.9196665087233548e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7612128,
"step": 3275
},
{
"epoch": 3.0914231856738925,
"grad_norm": 0.006476939655840397,
"learning_rate": 1.9116685732681995e-06,
"loss": 0.0004,
"num_input_tokens_seen": 7623776,
"step": 3280
},
{
"epoch": 3.0961357210179075,
"grad_norm": 0.008666305802762508,
"learning_rate": 1.9036770071845467e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7636128,
"step": 3285
},
{
"epoch": 3.100848256361923,
"grad_norm": 0.07765082269906998,
"learning_rate": 1.8956918969903881e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7646432,
"step": 3290
},
{
"epoch": 3.105560791705938,
"grad_norm": 0.006188265047967434,
"learning_rate": 1.887713329133824e-06,
"loss": 0.0,
"num_input_tokens_seen": 7657824,
"step": 3295
},
{
"epoch": 3.1102733270499527,
"grad_norm": 28.873966217041016,
"learning_rate": 1.8797413899921224e-06,
"loss": 0.0829,
"num_input_tokens_seen": 7669920,
"step": 3300
},
{
"epoch": 3.114985862393968,
"grad_norm": 34.75840759277344,
"learning_rate": 1.8717761658707916e-06,
"loss": 0.0054,
"num_input_tokens_seen": 7681952,
"step": 3305
},
{
"epoch": 3.119698397737983,
"grad_norm": 0.005098584573715925,
"learning_rate": 1.86381774300264e-06,
"loss": 0.0,
"num_input_tokens_seen": 7692832,
"step": 3310
},
{
"epoch": 3.124410933081998,
"grad_norm": 0.012214220128953457,
"learning_rate": 1.8558662075468468e-06,
"loss": 0.1029,
"num_input_tokens_seen": 7703072,
"step": 3315
},
{
"epoch": 3.1291234684260134,
"grad_norm": 0.05773286893963814,
"learning_rate": 1.8479216455880225e-06,
"loss": 0.0,
"num_input_tokens_seen": 7714016,
"step": 3320
},
{
"epoch": 3.1338360037700284,
"grad_norm": 0.0058107743971049786,
"learning_rate": 1.8399841431352855e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7726688,
"step": 3325
},
{
"epoch": 3.1385485391140433,
"grad_norm": 0.11002416163682938,
"learning_rate": 1.8320537861213267e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7739680,
"step": 3330
},
{
"epoch": 3.1432610744580582,
"grad_norm": 0.04805764928460121,
"learning_rate": 1.8241306604014761e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7749024,
"step": 3335
},
{
"epoch": 3.1479736098020736,
"grad_norm": 0.001311355852521956,
"learning_rate": 1.816214851752779e-06,
"loss": 0.0008,
"num_input_tokens_seen": 7761568,
"step": 3340
},
{
"epoch": 3.1526861451460886,
"grad_norm": 0.013387499377131462,
"learning_rate": 1.8083064458730651e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7772640,
"step": 3345
},
{
"epoch": 3.1573986804901035,
"grad_norm": 0.017248639836907387,
"learning_rate": 1.8004055283800204e-06,
"loss": 0.0004,
"num_input_tokens_seen": 7784672,
"step": 3350
},
{
"epoch": 3.162111215834119,
"grad_norm": 0.0028156836051493883,
"learning_rate": 1.7925121848102583e-06,
"loss": 0.0,
"num_input_tokens_seen": 7795872,
"step": 3355
},
{
"epoch": 3.166823751178134,
"grad_norm": 0.005071389954537153,
"learning_rate": 1.7846265006183976e-06,
"loss": 0.0,
"num_input_tokens_seen": 7808416,
"step": 3360
},
{
"epoch": 3.171536286522149,
"grad_norm": 0.013269501738250256,
"learning_rate": 1.776748561176137e-06,
"loss": 0.0,
"num_input_tokens_seen": 7820640,
"step": 3365
},
{
"epoch": 3.176248821866164,
"grad_norm": 0.010563570074737072,
"learning_rate": 1.7688784517713247e-06,
"loss": 0.0,
"num_input_tokens_seen": 7831072,
"step": 3370
},
{
"epoch": 3.180961357210179,
"grad_norm": 29.668603897094727,
"learning_rate": 1.761016257607044e-06,
"loss": 0.0969,
"num_input_tokens_seen": 7841888,
"step": 3375
},
{
"epoch": 3.185673892554194,
"grad_norm": 0.07287805527448654,
"learning_rate": 1.7531620638006834e-06,
"loss": 0.0488,
"num_input_tokens_seen": 7852896,
"step": 3380
},
{
"epoch": 3.190386427898209,
"grad_norm": 27.53697395324707,
"learning_rate": 1.7453159553830217e-06,
"loss": 0.0013,
"num_input_tokens_seen": 7868384,
"step": 3385
},
{
"epoch": 3.1950989632422244,
"grad_norm": 0.006890235003083944,
"learning_rate": 1.7374780172973004e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7881312,
"step": 3390
},
{
"epoch": 3.1998114985862394,
"grad_norm": 0.0019902060739696026,
"learning_rate": 1.7296483343983095e-06,
"loss": 0.0564,
"num_input_tokens_seen": 7892128,
"step": 3395
},
{
"epoch": 3.2045240339302543,
"grad_norm": 0.020912524312734604,
"learning_rate": 1.7218269914514668e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7902624,
"step": 3400
},
{
"epoch": 3.2092365692742697,
"grad_norm": 0.007507129572331905,
"learning_rate": 1.714014073131901e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7915168,
"step": 3405
},
{
"epoch": 3.2139491046182846,
"grad_norm": 0.053086619824171066,
"learning_rate": 1.7062096640235327e-06,
"loss": 0.0002,
"num_input_tokens_seen": 7925472,
"step": 3410
},
{
"epoch": 3.2186616399622996,
"grad_norm": 0.1808883100748062,
"learning_rate": 1.6984138486181612e-06,
"loss": 0.0001,
"num_input_tokens_seen": 7940576,
"step": 3415
},
{
"epoch": 3.223374175306315,
"grad_norm": 0.015256045386195183,
"learning_rate": 1.6906267113145514e-06,
"loss": 0.0323,
"num_input_tokens_seen": 7956064,
"step": 3420
},
{
"epoch": 3.22808671065033,
"grad_norm": 0.022556964308023453,
"learning_rate": 1.6828483364175127e-06,
"loss": 0.0,
"num_input_tokens_seen": 7967264,
"step": 3425
},
{
"epoch": 3.232799245994345,
"grad_norm": 1.4760725498199463,
"learning_rate": 1.6750788081369951e-06,
"loss": 0.0003,
"num_input_tokens_seen": 7978144,
"step": 3430
},
{
"epoch": 3.23751178133836,
"grad_norm": 18.443159103393555,
"learning_rate": 1.6673182105871733e-06,
"loss": 0.0443,
"num_input_tokens_seen": 7989152,
"step": 3435
},
{
"epoch": 3.242224316682375,
"grad_norm": 0.005763629917055368,
"learning_rate": 1.659566627785536e-06,
"loss": 0.0,
"num_input_tokens_seen": 8000800,
"step": 3440
},
{
"epoch": 3.24693685202639,
"grad_norm": 0.0024256331380456686,
"learning_rate": 1.651824143651975e-06,
"loss": 0.0004,
"num_input_tokens_seen": 8014816,
"step": 3445
},
{
"epoch": 3.251649387370405,
"grad_norm": 0.003130377735942602,
"learning_rate": 1.644090842007881e-06,
"loss": 0.0,
"num_input_tokens_seen": 8025120,
"step": 3450
},
{
"epoch": 3.2563619227144205,
"grad_norm": 0.02095922827720642,
"learning_rate": 1.6363668065752336e-06,
"loss": 0.0,
"num_input_tokens_seen": 8037344,
"step": 3455
},
{
"epoch": 3.2591894439208295,
"eval_loss": 0.5191035270690918,
"eval_runtime": 2.7496,
"eval_samples_per_second": 342.955,
"eval_steps_per_second": 42.915,
"num_input_tokens_seen": 8043744,
"step": 3458
},
{
"epoch": 3.2610744580584354,
"grad_norm": 0.0036951308138668537,
"learning_rate": 1.6286521209756917e-06,
"loss": 0.0875,
"num_input_tokens_seen": 8048096,
"step": 3460
},
{
"epoch": 3.2657869934024504,
"grad_norm": 0.004653128329664469,
"learning_rate": 1.6209468687296947e-06,
"loss": 0.0,
"num_input_tokens_seen": 8061344,
"step": 3465
},
{
"epoch": 3.2704995287464658,
"grad_norm": 0.004204562399536371,
"learning_rate": 1.613251133255554e-06,
"loss": 0.0,
"num_input_tokens_seen": 8073184,
"step": 3470
},
{
"epoch": 3.2752120640904807,
"grad_norm": 0.005397483240813017,
"learning_rate": 1.6055649978685517e-06,
"loss": 0.0,
"num_input_tokens_seen": 8082976,
"step": 3475
},
{
"epoch": 3.2799245994344957,
"grad_norm": 0.01232027355581522,
"learning_rate": 1.5978885457800348e-06,
"loss": 0.0,
"num_input_tokens_seen": 8094624,
"step": 3480
},
{
"epoch": 3.284637134778511,
"grad_norm": 57.04811096191406,
"learning_rate": 1.59022186009652e-06,
"loss": 0.0843,
"num_input_tokens_seen": 8104928,
"step": 3485
},
{
"epoch": 3.289349670122526,
"grad_norm": 0.009285739623010159,
"learning_rate": 1.5825650238187918e-06,
"loss": 0.0,
"num_input_tokens_seen": 8116896,
"step": 3490
},
{
"epoch": 3.294062205466541,
"grad_norm": 0.009216835722327232,
"learning_rate": 1.5749181198410014e-06,
"loss": 0.0875,
"num_input_tokens_seen": 8127968,
"step": 3495
},
{
"epoch": 3.298774740810556,
"grad_norm": 0.00624003866687417,
"learning_rate": 1.5672812309497722e-06,
"loss": 0.0326,
"num_input_tokens_seen": 8139936,
"step": 3500
},
{
"epoch": 3.3034872761545713,
"grad_norm": 0.014653928577899933,
"learning_rate": 1.5596544398233028e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8151392,
"step": 3505
},
{
"epoch": 3.308199811498586,
"grad_norm": 0.00760252121835947,
"learning_rate": 1.5520378290304723e-06,
"loss": 0.0,
"num_input_tokens_seen": 8165280,
"step": 3510
},
{
"epoch": 3.312912346842601,
"grad_norm": 0.037401266396045685,
"learning_rate": 1.544431481029944e-06,
"loss": 0.0,
"num_input_tokens_seen": 8177696,
"step": 3515
},
{
"epoch": 3.3176248821866166,
"grad_norm": 0.004210233688354492,
"learning_rate": 1.5368354781692764e-06,
"loss": 0.0,
"num_input_tokens_seen": 8189280,
"step": 3520
},
{
"epoch": 3.3223374175306315,
"grad_norm": 0.08442122489213943,
"learning_rate": 1.5292499026840292e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8202784,
"step": 3525
},
{
"epoch": 3.3270499528746464,
"grad_norm": 0.0204016100615263,
"learning_rate": 1.5216748366968743e-06,
"loss": 0.1032,
"num_input_tokens_seen": 8216032,
"step": 3530
},
{
"epoch": 3.331762488218662,
"grad_norm": 0.002625198569148779,
"learning_rate": 1.5141103622167042e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8228320,
"step": 3535
},
{
"epoch": 3.336475023562677,
"grad_norm": 0.00422442564740777,
"learning_rate": 1.5065565611377472e-06,
"loss": 0.0487,
"num_input_tokens_seen": 8240416,
"step": 3540
},
{
"epoch": 3.3411875589066917,
"grad_norm": 0.001890690764412284,
"learning_rate": 1.4990135152386814e-06,
"loss": 0.0,
"num_input_tokens_seen": 8252640,
"step": 3545
},
{
"epoch": 3.345900094250707,
"grad_norm": 0.04529090225696564,
"learning_rate": 1.4914813061817434e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8261984,
"step": 3550
},
{
"epoch": 3.350612629594722,
"grad_norm": 0.004377726465463638,
"learning_rate": 1.4839600155118525e-06,
"loss": 0.0036,
"num_input_tokens_seen": 8273568,
"step": 3555
},
{
"epoch": 3.355325164938737,
"grad_norm": 0.003906300291419029,
"learning_rate": 1.4764497246557214e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8285472,
"step": 3560
},
{
"epoch": 3.360037700282752,
"grad_norm": 5.829066753387451,
"learning_rate": 1.4689505149209788e-06,
"loss": 0.0008,
"num_input_tokens_seen": 8294816,
"step": 3565
},
{
"epoch": 3.3647502356267673,
"grad_norm": 0.004101856611669064,
"learning_rate": 1.4614624674952843e-06,
"loss": 0.0,
"num_input_tokens_seen": 8305504,
"step": 3570
},
{
"epoch": 3.3694627709707823,
"grad_norm": 0.002960493555292487,
"learning_rate": 1.4539856634454558e-06,
"loss": 0.0518,
"num_input_tokens_seen": 8316320,
"step": 3575
},
{
"epoch": 3.3741753063147972,
"grad_norm": 0.004375193268060684,
"learning_rate": 1.4465201837165876e-06,
"loss": 0.0384,
"num_input_tokens_seen": 8327200,
"step": 3580
},
{
"epoch": 3.3788878416588126,
"grad_norm": 0.0019584076944738626,
"learning_rate": 1.4390661091311742e-06,
"loss": 0.0,
"num_input_tokens_seen": 8339488,
"step": 3585
},
{
"epoch": 3.3836003770028276,
"grad_norm": 0.12523356080055237,
"learning_rate": 1.4316235203882373e-06,
"loss": 0.0642,
"num_input_tokens_seen": 8353120,
"step": 3590
},
{
"epoch": 3.3883129123468425,
"grad_norm": 0.002279081614688039,
"learning_rate": 1.4241924980624485e-06,
"loss": 0.0,
"num_input_tokens_seen": 8364768,
"step": 3595
},
{
"epoch": 3.3930254476908575,
"grad_norm": 7.863749027252197,
"learning_rate": 1.4167731226032656e-06,
"loss": 0.0029,
"num_input_tokens_seen": 8376480,
"step": 3600
},
{
"epoch": 3.397737983034873,
"grad_norm": 0.0011470771860331297,
"learning_rate": 1.4093654743340462e-06,
"loss": 0.0122,
"num_input_tokens_seen": 8386784,
"step": 3605
},
{
"epoch": 3.402450518378888,
"grad_norm": 0.001517343451268971,
"learning_rate": 1.4019696334511962e-06,
"loss": 0.0,
"num_input_tokens_seen": 8397984,
"step": 3610
},
{
"epoch": 3.4071630537229027,
"grad_norm": 0.005868109408766031,
"learning_rate": 1.3945856800232874e-06,
"loss": 0.0,
"num_input_tokens_seen": 8408544,
"step": 3615
},
{
"epoch": 3.411875589066918,
"grad_norm": 0.0029909429140388966,
"learning_rate": 1.3872136939902004e-06,
"loss": 0.0,
"num_input_tokens_seen": 8419552,
"step": 3620
},
{
"epoch": 3.416588124410933,
"grad_norm": 0.4216376841068268,
"learning_rate": 1.379853755162249e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8429664,
"step": 3625
},
{
"epoch": 3.421300659754948,
"grad_norm": 0.0006833241204731166,
"learning_rate": 1.3725059432193278e-06,
"loss": 0.0,
"num_input_tokens_seen": 8441376,
"step": 3630
},
{
"epoch": 3.4260131950989634,
"grad_norm": 0.0014978962717577815,
"learning_rate": 1.3651703377100406e-06,
"loss": 0.0,
"num_input_tokens_seen": 8452896,
"step": 3635
},
{
"epoch": 3.4307257304429783,
"grad_norm": 0.0020364606752991676,
"learning_rate": 1.3578470180508432e-06,
"loss": 0.0,
"num_input_tokens_seen": 8463328,
"step": 3640
},
{
"epoch": 3.4354382657869933,
"grad_norm": 0.003118757624179125,
"learning_rate": 1.3505360635251813e-06,
"loss": 0.0,
"num_input_tokens_seen": 8475808,
"step": 3645
},
{
"epoch": 3.4401508011310087,
"grad_norm": 0.002336150733754039,
"learning_rate": 1.3432375532826374e-06,
"loss": 0.0122,
"num_input_tokens_seen": 8487456,
"step": 3650
},
{
"epoch": 3.4448633364750236,
"grad_norm": 0.04974250867962837,
"learning_rate": 1.3359515663380668e-06,
"loss": 0.0,
"num_input_tokens_seen": 8503712,
"step": 3655
},
{
"epoch": 3.4495758718190386,
"grad_norm": 0.069328673183918,
"learning_rate": 1.3286781815707465e-06,
"loss": 0.2188,
"num_input_tokens_seen": 8514848,
"step": 3660
},
{
"epoch": 3.4542884071630535,
"grad_norm": 0.0022926589008420706,
"learning_rate": 1.3214174777235192e-06,
"loss": 0.0985,
"num_input_tokens_seen": 8524960,
"step": 3665
},
{
"epoch": 3.459000942507069,
"grad_norm": 0.03560859337449074,
"learning_rate": 1.3141695334019453e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8535520,
"step": 3670
},
{
"epoch": 3.463713477851084,
"grad_norm": 0.013610146008431911,
"learning_rate": 1.3069344270734452e-06,
"loss": 0.0023,
"num_input_tokens_seen": 8544864,
"step": 3675
},
{
"epoch": 3.468426013195099,
"grad_norm": 0.0055809905752539635,
"learning_rate": 1.2997122370664538e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8556960,
"step": 3680
},
{
"epoch": 3.473138548539114,
"grad_norm": 0.011014264076948166,
"learning_rate": 1.2925030415695727e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8567968,
"step": 3685
},
{
"epoch": 3.477851083883129,
"grad_norm": 30.655986785888672,
"learning_rate": 1.285306918630722e-06,
"loss": 0.0595,
"num_input_tokens_seen": 8581920,
"step": 3690
},
{
"epoch": 3.482563619227144,
"grad_norm": 0.047711387276649475,
"learning_rate": 1.2781239461562966e-06,
"loss": 0.0442,
"num_input_tokens_seen": 8594720,
"step": 3695
},
{
"epoch": 3.4872761545711595,
"grad_norm": 0.00917474739253521,
"learning_rate": 1.2709542019103211e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8606560,
"step": 3700
},
{
"epoch": 3.4919886899151744,
"grad_norm": 0.016317633911967278,
"learning_rate": 1.2637977635136123e-06,
"loss": 0.0017,
"num_input_tokens_seen": 8618208,
"step": 3705
},
{
"epoch": 3.4967012252591894,
"grad_norm": 0.028710726648569107,
"learning_rate": 1.2566547084429326e-06,
"loss": 0.0089,
"num_input_tokens_seen": 8631584,
"step": 3710
},
{
"epoch": 3.5014137606032048,
"grad_norm": 0.0347968190908432,
"learning_rate": 1.2495251140301553e-06,
"loss": 0.0338,
"num_input_tokens_seen": 8642912,
"step": 3715
},
{
"epoch": 3.5061262959472197,
"grad_norm": 0.01212374772876501,
"learning_rate": 1.2424090574614262e-06,
"loss": 0.0002,
"num_input_tokens_seen": 8652384,
"step": 3720
},
{
"epoch": 3.5098963242224315,
"eval_loss": 0.4635506868362427,
"eval_runtime": 2.7946,
"eval_samples_per_second": 337.431,
"eval_steps_per_second": 42.224,
"num_input_tokens_seen": 8660768,
"step": 3724
},
{
"epoch": 3.5108388312912346,
"grad_norm": 0.008465130813419819,
"learning_rate": 1.2353066157763305e-06,
"loss": 0.0008,
"num_input_tokens_seen": 8662624,
"step": 3725
},
{
"epoch": 3.5155513666352496,
"grad_norm": 0.02242710441350937,
"learning_rate": 1.2282178658670514e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8672864,
"step": 3730
},
{
"epoch": 3.520263901979265,
"grad_norm": 0.004590487107634544,
"learning_rate": 1.221142884477548e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8684448,
"step": 3735
},
{
"epoch": 3.52497643732328,
"grad_norm": 0.0026052999310195446,
"learning_rate": 1.2140817482027155e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8698336,
"step": 3740
},
{
"epoch": 3.529688972667295,
"grad_norm": 0.002777635119855404,
"learning_rate": 1.207034533487564e-06,
"loss": 0.0,
"num_input_tokens_seen": 8711072,
"step": 3745
},
{
"epoch": 3.5344015080113103,
"grad_norm": 0.004557525273412466,
"learning_rate": 1.2000013166263803e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8723872,
"step": 3750
},
{
"epoch": 3.539114043355325,
"grad_norm": 10.647988319396973,
"learning_rate": 1.1929821737619132e-06,
"loss": 0.0013,
"num_input_tokens_seen": 8735776,
"step": 3755
},
{
"epoch": 3.54382657869934,
"grad_norm": 0.0055831498466432095,
"learning_rate": 1.1859771808845417e-06,
"loss": 0.0,
"num_input_tokens_seen": 8752736,
"step": 3760
},
{
"epoch": 3.548539114043355,
"grad_norm": 0.007252705283463001,
"learning_rate": 1.1789864138314577e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8766688,
"step": 3765
},
{
"epoch": 3.5532516493873705,
"grad_norm": 0.031091537326574326,
"learning_rate": 1.1720099482858364e-06,
"loss": 0.0,
"num_input_tokens_seen": 8781536,
"step": 3770
},
{
"epoch": 3.5579641847313854,
"grad_norm": 0.0167153999209404,
"learning_rate": 1.1650478597760284e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8792224,
"step": 3775
},
{
"epoch": 3.562676720075401,
"grad_norm": 0.002952584996819496,
"learning_rate": 1.158100223674733e-06,
"loss": 0.0704,
"num_input_tokens_seen": 8803168,
"step": 3780
},
{
"epoch": 3.5673892554194158,
"grad_norm": 0.003907319158315659,
"learning_rate": 1.1511671151981861e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8813536,
"step": 3785
},
{
"epoch": 3.5721017907634307,
"grad_norm": 0.004613762255758047,
"learning_rate": 1.1442486094053445e-06,
"loss": 0.0,
"num_input_tokens_seen": 8823840,
"step": 3790
},
{
"epoch": 3.5768143261074457,
"grad_norm": 0.008234544657170773,
"learning_rate": 1.1373447811970762e-06,
"loss": 0.0,
"num_input_tokens_seen": 8836576,
"step": 3795
},
{
"epoch": 3.581526861451461,
"grad_norm": 0.004143028054386377,
"learning_rate": 1.130455705315345e-06,
"loss": 0.0,
"num_input_tokens_seen": 8849824,
"step": 3800
},
{
"epoch": 3.586239396795476,
"grad_norm": 0.010429292917251587,
"learning_rate": 1.1235814563424046e-06,
"loss": 0.1829,
"num_input_tokens_seen": 8860448,
"step": 3805
},
{
"epoch": 3.590951932139491,
"grad_norm": 0.005767806898802519,
"learning_rate": 1.1167221086999897e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8871776,
"step": 3810
},
{
"epoch": 3.5956644674835063,
"grad_norm": 4.553272724151611,
"learning_rate": 1.10987773664851e-06,
"loss": 0.0006,
"num_input_tokens_seen": 8885728,
"step": 3815
},
{
"epoch": 3.6003770028275213,
"grad_norm": 0.002544153481721878,
"learning_rate": 1.1030484142862511e-06,
"loss": 0.0,
"num_input_tokens_seen": 8895904,
"step": 3820
},
{
"epoch": 3.605089538171536,
"grad_norm": 0.5232880711555481,
"learning_rate": 1.0962342155485613e-06,
"loss": 0.0006,
"num_input_tokens_seen": 8907808,
"step": 3825
},
{
"epoch": 3.609802073515551,
"grad_norm": 0.0035159303806722164,
"learning_rate": 1.0894352142070652e-06,
"loss": 0.0,
"num_input_tokens_seen": 8918432,
"step": 3830
},
{
"epoch": 3.6145146088595665,
"grad_norm": 21.874401092529297,
"learning_rate": 1.0826514838688533e-06,
"loss": 0.072,
"num_input_tokens_seen": 8929248,
"step": 3835
},
{
"epoch": 3.6192271442035815,
"grad_norm": 0.03307318687438965,
"learning_rate": 1.075883097975691e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8940384,
"step": 3840
},
{
"epoch": 3.623939679547597,
"grad_norm": 0.0033383166883140802,
"learning_rate": 1.0691301298032218e-06,
"loss": 0.0,
"num_input_tokens_seen": 8950816,
"step": 3845
},
{
"epoch": 3.628652214891612,
"grad_norm": 0.004945589695125818,
"learning_rate": 1.0623926524601771e-06,
"loss": 0.0001,
"num_input_tokens_seen": 8963296,
"step": 3850
},
{
"epoch": 3.6333647502356268,
"grad_norm": 0.014219646342098713,
"learning_rate": 1.0556707388875786e-06,
"loss": 0.0,
"num_input_tokens_seen": 8974624,
"step": 3855
},
{
"epoch": 3.6380772855796417,
"grad_norm": 13.508224487304688,
"learning_rate": 1.048964461857954e-06,
"loss": 0.0596,
"num_input_tokens_seen": 8985952,
"step": 3860
},
{
"epoch": 3.6427898209236567,
"grad_norm": 0.022754942998290062,
"learning_rate": 1.0422738939745453e-06,
"loss": 0.0002,
"num_input_tokens_seen": 8996064,
"step": 3865
},
{
"epoch": 3.647502356267672,
"grad_norm": 0.023511681705713272,
"learning_rate": 1.035599107670529e-06,
"loss": 0.0002,
"num_input_tokens_seen": 9006368,
"step": 3870
},
{
"epoch": 3.652214891611687,
"grad_norm": 0.01913359761238098,
"learning_rate": 1.0289401752082214e-06,
"loss": 0.0001,
"num_input_tokens_seen": 9018272,
"step": 3875
},
{
"epoch": 3.6569274269557024,
"grad_norm": 0.03516068682074547,
"learning_rate": 1.0222971686783089e-06,
"loss": 0.1112,
"num_input_tokens_seen": 9029472,
"step": 3880
},
{
"epoch": 3.6616399622997173,
"grad_norm": 0.10616306960582733,
"learning_rate": 1.0156701599990562e-06,
"loss": 0.0001,
"num_input_tokens_seen": 9041824,
"step": 3885
},
{
"epoch": 3.6663524976437323,
"grad_norm": 0.007794396486133337,
"learning_rate": 1.0090592209155373e-06,
"loss": 0.0381,
"num_input_tokens_seen": 9054752,
"step": 3890
},
{
"epoch": 3.6710650329877472,
"grad_norm": 2.6590635776519775,
"learning_rate": 1.0024644229988484e-06,
"loss": 0.002,
"num_input_tokens_seen": 9064928,
"step": 3895
},
{
"epoch": 3.6757775683317626,
"grad_norm": 0.02115059085190296,
"learning_rate": 9.95885837645344e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9076256,
"step": 3900
},
{
"epoch": 3.6804901036757776,
"grad_norm": 0.035742953419685364,
"learning_rate": 9.893235360758565e-07,
"loss": 0.0954,
"num_input_tokens_seen": 9086624,
"step": 3905
},
{
"epoch": 3.6852026390197925,
"grad_norm": 0.007046286016702652,
"learning_rate": 9.827775893349273e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9097824,
"step": 3910
},
{
"epoch": 3.689915174363808,
"grad_norm": 379.1387939453125,
"learning_rate": 9.762480682900374e-07,
"loss": 0.0323,
"num_input_tokens_seen": 9107296,
"step": 3915
},
{
"epoch": 3.694627709707823,
"grad_norm": 18.87242889404297,
"learning_rate": 9.697350436308428e-07,
"loss": 0.0039,
"num_input_tokens_seen": 9119008,
"step": 3920
},
{
"epoch": 3.699340245051838,
"grad_norm": 0.0049491687677800655,
"learning_rate": 9.63238585868405e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9131296,
"step": 3925
},
{
"epoch": 3.7040527803958527,
"grad_norm": 0.01330646499991417,
"learning_rate": 9.567587653344295e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9141664,
"step": 3930
},
{
"epoch": 3.708765315739868,
"grad_norm": 0.01977146603167057,
"learning_rate": 9.502956521805054e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9151328,
"step": 3935
},
{
"epoch": 3.713477851083883,
"grad_norm": 0.08915020525455475,
"learning_rate": 9.438493163773433e-07,
"loss": 0.0002,
"num_input_tokens_seen": 9164192,
"step": 3940
},
{
"epoch": 3.7181903864278985,
"grad_norm": 0.004182067699730396,
"learning_rate": 9.374198277140237e-07,
"loss": 0.0003,
"num_input_tokens_seen": 9176544,
"step": 3945
},
{
"epoch": 3.7229029217719134,
"grad_norm": 0.028023963794112206,
"learning_rate": 9.310072557972305e-07,
"loss": 0.0162,
"num_input_tokens_seen": 9188512,
"step": 3950
},
{
"epoch": 3.7276154571159283,
"grad_norm": 0.003678038949146867,
"learning_rate": 9.246116700505109e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9200992,
"step": 3955
},
{
"epoch": 3.7323279924599433,
"grad_norm": 0.004443558864295483,
"learning_rate": 9.18233139713513e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9211168,
"step": 3960
},
{
"epoch": 3.7370405278039587,
"grad_norm": 0.012858624570071697,
"learning_rate": 9.118717338412414e-07,
"loss": 0.0,
"num_input_tokens_seen": 9223456,
"step": 3965
},
{
"epoch": 3.7417530631479736,
"grad_norm": 0.037973713129758835,
"learning_rate": 9.055275213033077e-07,
"loss": 0.0002,
"num_input_tokens_seen": 9233632,
"step": 3970
},
{
"epoch": 3.7464655984919886,
"grad_norm": 49.70805358886719,
"learning_rate": 8.992005707831877e-07,
"loss": 0.0751,
"num_input_tokens_seen": 9243296,
"step": 3975
},
{
"epoch": 3.751178133836004,
"grad_norm": 0.0022637660149484873,
"learning_rate": 8.928909507774741e-07,
"loss": 0.0002,
"num_input_tokens_seen": 9259424,
"step": 3980
},
{
"epoch": 3.755890669180019,
"grad_norm": 0.009885657578706741,
"learning_rate": 8.86598729595137e-07,
"loss": 0.0,
"num_input_tokens_seen": 9271840,
"step": 3985
},
{
"epoch": 3.760603204524034,
"grad_norm": 0.010455531068146229,
"learning_rate": 8.80323975356783e-07,
"loss": 0.0,
"num_input_tokens_seen": 9286304,
"step": 3990
},
{
"epoch": 3.760603204524034,
"eval_loss": 0.5201095938682556,
"eval_runtime": 2.7914,
"eval_samples_per_second": 337.823,
"eval_steps_per_second": 42.273,
"num_input_tokens_seen": 9286304,
"step": 3990
},
{
"epoch": 3.765315739868049,
"grad_norm": 3.8269801139831543,
"learning_rate": 8.740667559939217e-07,
"loss": 0.0004,
"num_input_tokens_seen": 9297056,
"step": 3995
},
{
"epoch": 3.770028275212064,
"grad_norm": 0.010600591078400612,
"learning_rate": 8.678271392482243e-07,
"loss": 0.0,
"num_input_tokens_seen": 9307872,
"step": 4000
},
{
"epoch": 3.774740810556079,
"grad_norm": 0.00742421904578805,
"learning_rate": 8.616051926707941e-07,
"loss": 0.0,
"num_input_tokens_seen": 9318816,
"step": 4005
},
{
"epoch": 3.7794533459000945,
"grad_norm": 0.005168728996068239,
"learning_rate": 8.554009836214345e-07,
"loss": 0.0308,
"num_input_tokens_seen": 9331232,
"step": 4010
},
{
"epoch": 3.7841658812441095,
"grad_norm": 0.002414106857031584,
"learning_rate": 8.49214579267921e-07,
"loss": 0.0657,
"num_input_tokens_seen": 9342112,
"step": 4015
},
{
"epoch": 3.7888784165881244,
"grad_norm": 0.05582628399133682,
"learning_rate": 8.430460465852683e-07,
"loss": 0.0,
"num_input_tokens_seen": 9355872,
"step": 4020
},
{
"epoch": 3.7935909519321394,
"grad_norm": 0.0030152511317282915,
"learning_rate": 8.368954523550146e-07,
"loss": 0.0,
"num_input_tokens_seen": 9367008,
"step": 4025
},
{
"epoch": 3.7983034872761543,
"grad_norm": 0.008660019375383854,
"learning_rate": 8.307628631644904e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9380000,
"step": 4030
},
{
"epoch": 3.8030160226201697,
"grad_norm": 0.012062069959938526,
"learning_rate": 8.246483454061016e-07,
"loss": 0.0,
"num_input_tokens_seen": 9390368,
"step": 4035
},
{
"epoch": 3.8077285579641846,
"grad_norm": 0.1445380598306656,
"learning_rate": 8.185519652766091e-07,
"loss": 0.0829,
"num_input_tokens_seen": 9401952,
"step": 4040
},
{
"epoch": 3.8124410933082,
"grad_norm": 0.017705656588077545,
"learning_rate": 8.124737887764148e-07,
"loss": 0.0,
"num_input_tokens_seen": 9413536,
"step": 4045
},
{
"epoch": 3.817153628652215,
"grad_norm": 13.497117042541504,
"learning_rate": 8.064138817088429e-07,
"loss": 0.09,
"num_input_tokens_seen": 9424864,
"step": 4050
},
{
"epoch": 3.82186616399623,
"grad_norm": 0.0018770827446132898,
"learning_rate": 8.003723096794314e-07,
"loss": 0.0,
"num_input_tokens_seen": 9437152,
"step": 4055
},
{
"epoch": 3.826578699340245,
"grad_norm": 0.005844899918884039,
"learning_rate": 7.94349138095219e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9448032,
"step": 4060
},
{
"epoch": 3.8312912346842602,
"grad_norm": 0.07622718065977097,
"learning_rate": 7.883444321640383e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9459424,
"step": 4065
},
{
"epoch": 3.836003770028275,
"grad_norm": 0.00536764832213521,
"learning_rate": 7.82358256893812e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9469536,
"step": 4070
},
{
"epoch": 3.84071630537229,
"grad_norm": 0.0012705104891210794,
"learning_rate": 7.763906770918428e-07,
"loss": 0.0,
"num_input_tokens_seen": 9482976,
"step": 4075
},
{
"epoch": 3.8454288407163055,
"grad_norm": 0.008968825452029705,
"learning_rate": 7.704417573641196e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9492704,
"step": 4080
},
{
"epoch": 3.8501413760603205,
"grad_norm": 0.00470514502376318,
"learning_rate": 7.645115621146116e-07,
"loss": 0.0,
"num_input_tokens_seen": 9504864,
"step": 4085
},
{
"epoch": 3.8548539114043354,
"grad_norm": 0.04696199670433998,
"learning_rate": 7.586001555445773e-07,
"loss": 0.1079,
"num_input_tokens_seen": 9515424,
"step": 4090
},
{
"epoch": 3.8595664467483504,
"grad_norm": 0.0076852161437273026,
"learning_rate": 7.527076016518603e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9525792,
"step": 4095
},
{
"epoch": 3.8642789820923658,
"grad_norm": 0.04222777113318443,
"learning_rate": 7.468339642302077e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9536416,
"step": 4100
},
{
"epoch": 3.8689915174363807,
"grad_norm": 0.024708032608032227,
"learning_rate": 7.409793068685709e-07,
"loss": 0.0722,
"num_input_tokens_seen": 9550880,
"step": 4105
},
{
"epoch": 3.873704052780396,
"grad_norm": 0.007110555190593004,
"learning_rate": 7.351436929504203e-07,
"loss": 0.0,
"num_input_tokens_seen": 9564768,
"step": 4110
},
{
"epoch": 3.878416588124411,
"grad_norm": 0.013784021139144897,
"learning_rate": 7.293271856530585e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9575776,
"step": 4115
},
{
"epoch": 3.883129123468426,
"grad_norm": 0.0009743549744598567,
"learning_rate": 7.235298479469391e-07,
"loss": 0.0323,
"num_input_tokens_seen": 9588192,
"step": 4120
},
{
"epoch": 3.887841658812441,
"grad_norm": 0.006835806183516979,
"learning_rate": 7.177517425949801e-07,
"loss": 0.0,
"num_input_tokens_seen": 9598432,
"step": 4125
},
{
"epoch": 3.8925541941564563,
"grad_norm": 0.07676160335540771,
"learning_rate": 7.119929321518876e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9613920,
"step": 4130
},
{
"epoch": 3.8972667295004713,
"grad_norm": 0.0023403996601700783,
"learning_rate": 7.062534789634772e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9624864,
"step": 4135
},
{
"epoch": 3.901979264844486,
"grad_norm": 0.002742171287536621,
"learning_rate": 7.005334451660034e-07,
"loss": 0.0004,
"num_input_tokens_seen": 9635232,
"step": 4140
},
{
"epoch": 3.9066918001885016,
"grad_norm": 0.012737921439111233,
"learning_rate": 6.948328926854767e-07,
"loss": 0.0,
"num_input_tokens_seen": 9648544,
"step": 4145
},
{
"epoch": 3.9114043355325165,
"grad_norm": 0.05681464448571205,
"learning_rate": 6.891518832370059e-07,
"loss": 0.0074,
"num_input_tokens_seen": 9659424,
"step": 4150
},
{
"epoch": 3.9161168708765315,
"grad_norm": 0.034557852894067764,
"learning_rate": 6.834904783241198e-07,
"loss": 0.0,
"num_input_tokens_seen": 9669920,
"step": 4155
},
{
"epoch": 3.9208294062205464,
"grad_norm": 0.608920693397522,
"learning_rate": 6.778487392381089e-07,
"loss": 0.0002,
"num_input_tokens_seen": 9681376,
"step": 4160
},
{
"epoch": 3.925541941564562,
"grad_norm": 0.0012145772343501449,
"learning_rate": 6.722267270573529e-07,
"loss": 0.0,
"num_input_tokens_seen": 9691552,
"step": 4165
},
{
"epoch": 3.9302544769085768,
"grad_norm": 0.017469940707087517,
"learning_rate": 6.666245026466708e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9704288,
"step": 4170
},
{
"epoch": 3.934967012252592,
"grad_norm": 0.0016305310418829322,
"learning_rate": 6.61042126656652e-07,
"loss": 0.0595,
"num_input_tokens_seen": 9713952,
"step": 4175
},
{
"epoch": 3.939679547596607,
"grad_norm": 26.813234329223633,
"learning_rate": 6.554796595230051e-07,
"loss": 0.0642,
"num_input_tokens_seen": 9724576,
"step": 4180
},
{
"epoch": 3.944392082940622,
"grad_norm": 0.0023128024768084288,
"learning_rate": 6.499371614659019e-07,
"loss": 0.0002,
"num_input_tokens_seen": 9735392,
"step": 4185
},
{
"epoch": 3.949104618284637,
"grad_norm": 0.00788215734064579,
"learning_rate": 6.444146924893252e-07,
"loss": 0.0766,
"num_input_tokens_seen": 9745888,
"step": 4190
},
{
"epoch": 3.9538171536286524,
"grad_norm": 0.0028776442632079124,
"learning_rate": 6.389123123804217e-07,
"loss": 0.111,
"num_input_tokens_seen": 9755104,
"step": 4195
},
{
"epoch": 3.9585296889726673,
"grad_norm": 0.0523090660572052,
"learning_rate": 6.334300807088509e-07,
"loss": 0.0003,
"num_input_tokens_seen": 9766944,
"step": 4200
},
{
"epoch": 3.9632422243166823,
"grad_norm": 0.008871479891240597,
"learning_rate": 6.279680568261423e-07,
"loss": 0.0782,
"num_input_tokens_seen": 9778336,
"step": 4205
},
{
"epoch": 3.9679547596606977,
"grad_norm": 0.22362910211086273,
"learning_rate": 6.225262998650525e-07,
"loss": 0.0004,
"num_input_tokens_seen": 9789088,
"step": 4210
},
{
"epoch": 3.9726672950047126,
"grad_norm": 1.362607479095459,
"learning_rate": 6.171048687389273e-07,
"loss": 0.0003,
"num_input_tokens_seen": 9799392,
"step": 4215
},
{
"epoch": 3.9773798303487276,
"grad_norm": 0.0025137532502412796,
"learning_rate": 6.117038221410568e-07,
"loss": 0.0,
"num_input_tokens_seen": 9811360,
"step": 4220
},
{
"epoch": 3.9820923656927425,
"grad_norm": 0.11764784902334213,
"learning_rate": 6.063232185440507e-07,
"loss": 0.1016,
"num_input_tokens_seen": 9824160,
"step": 4225
},
{
"epoch": 3.986804901036758,
"grad_norm": 0.011414180509746075,
"learning_rate": 6.009631161991958e-07,
"loss": 0.0007,
"num_input_tokens_seen": 9834784,
"step": 4230
},
{
"epoch": 3.991517436380773,
"grad_norm": 0.004349572584033012,
"learning_rate": 5.956235731358298e-07,
"loss": 0.0,
"num_input_tokens_seen": 9845920,
"step": 4235
},
{
"epoch": 3.9962299717247878,
"grad_norm": 0.0038933674804866314,
"learning_rate": 5.903046471607121e-07,
"loss": 0.0,
"num_input_tokens_seen": 9858208,
"step": 4240
},
{
"epoch": 4.000942507068803,
"grad_norm": 0.014259828254580498,
"learning_rate": 5.850063958573993e-07,
"loss": 0.032,
"num_input_tokens_seen": 9868192,
"step": 4245
},
{
"epoch": 4.005655042412818,
"grad_norm": 0.002472294494509697,
"learning_rate": 5.797288765856196e-07,
"loss": 0.0,
"num_input_tokens_seen": 9882784,
"step": 4250
},
{
"epoch": 4.010367577756833,
"grad_norm": 0.17401158809661865,
"learning_rate": 5.74472146480653e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9892448,
"step": 4255
},
{
"epoch": 4.011310084825636,
"eval_loss": 0.5145591497421265,
"eval_runtime": 2.7443,
"eval_samples_per_second": 343.621,
"eval_steps_per_second": 42.998,
"num_input_tokens_seen": 9894624,
"step": 4256
},
{
"epoch": 4.015080113100848,
"grad_norm": 0.0075249760411679745,
"learning_rate": 5.692362624527117e-07,
"loss": 0.0,
"num_input_tokens_seen": 9905376,
"step": 4260
},
{
"epoch": 4.019792648444863,
"grad_norm": 0.07316409796476364,
"learning_rate": 5.640212811863277e-07,
"loss": 0.0,
"num_input_tokens_seen": 9915616,
"step": 4265
},
{
"epoch": 4.024505183788879,
"grad_norm": 0.007110144477337599,
"learning_rate": 5.588272591397337e-07,
"loss": 0.0,
"num_input_tokens_seen": 9928288,
"step": 4270
},
{
"epoch": 4.029217719132894,
"grad_norm": 0.002868575043976307,
"learning_rate": 5.536542525442554e-07,
"loss": 0.0001,
"num_input_tokens_seen": 9939232,
"step": 4275
},
{
"epoch": 4.033930254476909,
"grad_norm": 0.004913229029625654,
"learning_rate": 5.485023174037005e-07,
"loss": 0.0,
"num_input_tokens_seen": 9950688,
"step": 4280
},
{
"epoch": 4.038642789820924,
"grad_norm": 0.0007955088512971997,
"learning_rate": 5.433715094937575e-07,
"loss": 0.0,
"num_input_tokens_seen": 9961824,
"step": 4285
},
{
"epoch": 4.043355325164939,
"grad_norm": 0.002959765959531069,
"learning_rate": 5.382618843613827e-07,
"loss": 0.0,
"num_input_tokens_seen": 9974560,
"step": 4290
},
{
"epoch": 4.0480678605089535,
"grad_norm": 0.0037353690713644028,
"learning_rate": 5.331734973242089e-07,
"loss": 0.0,
"num_input_tokens_seen": 9987040,
"step": 4295
},
{
"epoch": 4.0527803958529685,
"grad_norm": 0.001677420805208385,
"learning_rate": 5.28106403469939e-07,
"loss": 0.0,
"num_input_tokens_seen": 10002400,
"step": 4300
},
{
"epoch": 4.057492931196984,
"grad_norm": 0.024959493428468704,
"learning_rate": 5.23060657655754e-07,
"loss": 0.0,
"num_input_tokens_seen": 10012448,
"step": 4305
},
{
"epoch": 4.062205466540999,
"grad_norm": 0.002381374826654792,
"learning_rate": 5.180363145077164e-07,
"loss": 0.0001,
"num_input_tokens_seen": 10023392,
"step": 4310
},
{
"epoch": 4.066918001885014,
"grad_norm": 0.0020008094143122435,
"learning_rate": 5.130334284201799e-07,
"loss": 0.0002,
"num_input_tokens_seen": 10034528,
"step": 4315
},
{
"epoch": 4.071630537229029,
"grad_norm": 0.0024472337681800127,
"learning_rate": 5.080520535552028e-07,
"loss": 0.0,
"num_input_tokens_seen": 10045024,
"step": 4320
},
{
"epoch": 4.076343072573044,
"grad_norm": 0.05074908956885338,
"learning_rate": 5.030922438419569e-07,
"loss": 0.0,
"num_input_tokens_seen": 10055328,
"step": 4325
},
{
"epoch": 4.081055607917059,
"grad_norm": 0.008851269260048866,
"learning_rate": 4.981540529761473e-07,
"loss": 0.0,
"num_input_tokens_seen": 10065184,
"step": 4330
},
{
"epoch": 4.085768143261075,
"grad_norm": 0.009758401662111282,
"learning_rate": 4.932375344194285e-07,
"loss": 0.0,
"num_input_tokens_seen": 10077088,
"step": 4335
},
{
"epoch": 4.09048067860509,
"grad_norm": 0.005179739557206631,
"learning_rate": 4.88342741398831e-07,
"loss": 0.0,
"num_input_tokens_seen": 10087840,
"step": 4340
},
{
"epoch": 4.095193213949105,
"grad_norm": 0.020409001037478447,
"learning_rate": 4.83469726906175e-07,
"loss": 0.0,
"num_input_tokens_seen": 10098656,
"step": 4345
},
{
"epoch": 4.09990574929312,
"grad_norm": 0.0022380428854376078,
"learning_rate": 4.786185436975085e-07,
"loss": 0.0,
"num_input_tokens_seen": 10111456,
"step": 4350
},
{
"epoch": 4.104618284637135,
"grad_norm": 0.0027253238949924707,
"learning_rate": 4.7378924429252735e-07,
"loss": 0.0,
"num_input_tokens_seen": 10122912,
"step": 4355
},
{
"epoch": 4.10933081998115,
"grad_norm": 0.22917087376117706,
"learning_rate": 4.689818809740118e-07,
"loss": 0.0003,
"num_input_tokens_seen": 10135072,
"step": 4360
},
{
"epoch": 4.1140433553251645,
"grad_norm": 0.04845559597015381,
"learning_rate": 4.641965057872552e-07,
"loss": 0.0001,
"num_input_tokens_seen": 10145760,
"step": 4365
},
{
"epoch": 4.11875589066918,
"grad_norm": 0.004377515520900488,
"learning_rate": 4.594331705395078e-07,
"loss": 0.0001,
"num_input_tokens_seen": 10156000,
"step": 4370
},
{
"epoch": 4.123468426013195,
"grad_norm": 0.002658847952261567,
"learning_rate": 4.5469192679940905e-07,
"loss": 0.0,
"num_input_tokens_seen": 10168736,
"step": 4375
},
{
"epoch": 4.12818096135721,
"grad_norm": 0.022708803415298462,
"learning_rate": 4.4997282589643363e-07,
"loss": 0.0,
"num_input_tokens_seen": 10181408,
"step": 4380
},
{
"epoch": 4.132893496701225,
"grad_norm": 0.004133255686610937,
"learning_rate": 4.4527591892033263e-07,
"loss": 0.0,
"num_input_tokens_seen": 10191904,
"step": 4385
},
{
"epoch": 4.13760603204524,
"grad_norm": 0.0023588186595588923,
"learning_rate": 4.406012567205847e-07,
"loss": 0.0,
"num_input_tokens_seen": 10202080,
"step": 4390
},
{
"epoch": 4.142318567389255,
"grad_norm": 0.0017039207741618156,
"learning_rate": 4.359488899058409e-07,
"loss": 0.0,
"num_input_tokens_seen": 10212064,
"step": 4395
},
{
"epoch": 4.147031102733271,
"grad_norm": 0.004048160742968321,
"learning_rate": 4.313188688433792e-07,
"loss": 0.0,
"num_input_tokens_seen": 10223136,
"step": 4400
},
{
"epoch": 4.151743638077286,
"grad_norm": 0.0019121092045679688,
"learning_rate": 4.2671124365855853e-07,
"loss": 0.0,
"num_input_tokens_seen": 10238432,
"step": 4405
},
{
"epoch": 4.156456173421301,
"grad_norm": 0.004715082701295614,
"learning_rate": 4.2212606423427867e-07,
"loss": 0.0252,
"num_input_tokens_seen": 10250784,
"step": 4410
},
{
"epoch": 4.161168708765316,
"grad_norm": 0.04374701902270317,
"learning_rate": 4.175633802104337e-07,
"loss": 0.0,
"num_input_tokens_seen": 10265440,
"step": 4415
},
{
"epoch": 4.165881244109331,
"grad_norm": 0.014298969879746437,
"learning_rate": 4.1302324098338315e-07,
"loss": 0.0,
"num_input_tokens_seen": 10276704,
"step": 4420
},
{
"epoch": 4.170593779453346,
"grad_norm": 0.004060305189341307,
"learning_rate": 4.0850569570541036e-07,
"loss": 0.0,
"num_input_tokens_seen": 10286496,
"step": 4425
},
{
"epoch": 4.175306314797361,
"grad_norm": 0.0036463961005210876,
"learning_rate": 4.0401079328419384e-07,
"loss": 0.0,
"num_input_tokens_seen": 10297376,
"step": 4430
},
{
"epoch": 4.180018850141376,
"grad_norm": 0.007656124886125326,
"learning_rate": 3.995385823822767e-07,
"loss": 0.0,
"num_input_tokens_seen": 10306976,
"step": 4435
},
{
"epoch": 4.184731385485391,
"grad_norm": 0.009232879616320133,
"learning_rate": 3.9508911141653896e-07,
"loss": 0.0,
"num_input_tokens_seen": 10318880,
"step": 4440
},
{
"epoch": 4.189443920829406,
"grad_norm": 0.5042584538459778,
"learning_rate": 3.906624285576771e-07,
"loss": 0.0001,
"num_input_tokens_seen": 10330784,
"step": 4445
},
{
"epoch": 4.194156456173421,
"grad_norm": 0.009172679856419563,
"learning_rate": 3.862585817296771e-07,
"loss": 0.0,
"num_input_tokens_seen": 10341088,
"step": 4450
},
{
"epoch": 4.198868991517436,
"grad_norm": 0.00793259497731924,
"learning_rate": 3.8187761860929956e-07,
"loss": 0.0,
"num_input_tokens_seen": 10352096,
"step": 4455
},
{
"epoch": 4.203581526861451,
"grad_norm": 0.010999282822012901,
"learning_rate": 3.775195866255618e-07,
"loss": 0.0,
"num_input_tokens_seen": 10364448,
"step": 4460
},
{
"epoch": 4.208294062205466,
"grad_norm": 0.0006450503133237362,
"learning_rate": 3.731845329592268e-07,
"loss": 0.0,
"num_input_tokens_seen": 10376928,
"step": 4465
},
{
"epoch": 4.213006597549482,
"grad_norm": 0.0008582579903304577,
"learning_rate": 3.6887250454228666e-07,
"loss": 0.0,
"num_input_tokens_seen": 10389216,
"step": 4470
},
{
"epoch": 4.217719132893497,
"grad_norm": 0.002186185447499156,
"learning_rate": 3.6458354805746304e-07,
"loss": 0.0,
"num_input_tokens_seen": 10406944,
"step": 4475
},
{
"epoch": 4.222431668237512,
"grad_norm": 0.0032066500280052423,
"learning_rate": 3.603177099376931e-07,
"loss": 0.0,
"num_input_tokens_seen": 10417760,
"step": 4480
},
{
"epoch": 4.227144203581527,
"grad_norm": 0.0038139999378472567,
"learning_rate": 3.5607503636563484e-07,
"loss": 0.0,
"num_input_tokens_seen": 10429216,
"step": 4485
},
{
"epoch": 4.231856738925542,
"grad_norm": 0.004827831871807575,
"learning_rate": 3.5185557327315797e-07,
"loss": 0.0,
"num_input_tokens_seen": 10442784,
"step": 4490
},
{
"epoch": 4.236569274269557,
"grad_norm": 0.0007723537273705006,
"learning_rate": 3.47659366340857e-07,
"loss": 0.0,
"num_input_tokens_seen": 10454496,
"step": 4495
},
{
"epoch": 4.2412818096135725,
"grad_norm": 0.00883992575109005,
"learning_rate": 3.43486460997548e-07,
"loss": 0.0,
"num_input_tokens_seen": 10466464,
"step": 4500
},
{
"epoch": 4.245994344957587,
"grad_norm": 0.009929073974490166,
"learning_rate": 3.393369024197826e-07,
"loss": 0.0,
"num_input_tokens_seen": 10476768,
"step": 4505
},
{
"epoch": 4.250706880301602,
"grad_norm": 0.003360740141943097,
"learning_rate": 3.352107355313536e-07,
"loss": 0.0,
"num_input_tokens_seen": 10487392,
"step": 4510
},
{
"epoch": 4.255419415645617,
"grad_norm": 0.004852129612118006,
"learning_rate": 3.311080050028148e-07,
"loss": 0.0,
"num_input_tokens_seen": 10498144,
"step": 4515
},
{
"epoch": 4.260131950989632,
"grad_norm": 0.0017156790709123015,
"learning_rate": 3.2702875525099235e-07,
"loss": 0.0782,
"num_input_tokens_seen": 10507808,
"step": 4520
},
{
"epoch": 4.262016965127239,
"eval_loss": 0.5548250675201416,
"eval_runtime": 2.7767,
"eval_samples_per_second": 339.618,
"eval_steps_per_second": 42.497,
"num_input_tokens_seen": 10512416,
"step": 4522
},
{
"epoch": 4.264844486333647,
"grad_norm": 0.014055570587515831,
"learning_rate": 3.2297303043850564e-07,
"loss": 0.0,
"num_input_tokens_seen": 10517408,
"step": 4525
},
{
"epoch": 4.269557021677663,
"grad_norm": 0.002077508484944701,
"learning_rate": 3.189408744732897e-07,
"loss": 0.0,
"num_input_tokens_seen": 10528416,
"step": 4530
},
{
"epoch": 4.274269557021678,
"grad_norm": 0.036649417132139206,
"learning_rate": 3.149323310081201e-07,
"loss": 0.0,
"num_input_tokens_seen": 10541216,
"step": 4535
},
{
"epoch": 4.278982092365693,
"grad_norm": 0.00107799272518605,
"learning_rate": 3.1094744344013855e-07,
"loss": 0.0,
"num_input_tokens_seen": 10554016,
"step": 4540
},
{
"epoch": 4.283694627709708,
"grad_norm": 0.011105876415967941,
"learning_rate": 3.069862549103841e-07,
"loss": 0.0,
"num_input_tokens_seen": 10563552,
"step": 4545
},
{
"epoch": 4.288407163053723,
"grad_norm": 0.005242721643298864,
"learning_rate": 3.030488083033273e-07,
"loss": 0.0,
"num_input_tokens_seen": 10576288,
"step": 4550
},
{
"epoch": 4.293119698397738,
"grad_norm": 0.013468295335769653,
"learning_rate": 2.991351462464037e-07,
"loss": 0.0,
"num_input_tokens_seen": 10586784,
"step": 4555
},
{
"epoch": 4.297832233741753,
"grad_norm": 0.005555103067308664,
"learning_rate": 2.9524531110955406e-07,
"loss": 0.0,
"num_input_tokens_seen": 10597792,
"step": 4560
},
{
"epoch": 4.3025447690857686,
"grad_norm": 0.009637890383601189,
"learning_rate": 2.913793450047639e-07,
"loss": 0.0,
"num_input_tokens_seen": 10610720,
"step": 4565
},
{
"epoch": 4.3072573044297835,
"grad_norm": 0.002298228908330202,
"learning_rate": 2.875372897856113e-07,
"loss": 0.0,
"num_input_tokens_seen": 10622176,
"step": 4570
},
{
"epoch": 4.311969839773798,
"grad_norm": 0.025600271299481392,
"learning_rate": 2.837191870468084e-07,
"loss": 0.0,
"num_input_tokens_seen": 10632864,
"step": 4575
},
{
"epoch": 4.316682375117813,
"grad_norm": 0.000993955647572875,
"learning_rate": 2.7992507812375557e-07,
"loss": 0.0039,
"num_input_tokens_seen": 10642784,
"step": 4580
},
{
"epoch": 4.321394910461828,
"grad_norm": 0.01237708143889904,
"learning_rate": 2.76155004092091e-07,
"loss": 0.0153,
"num_input_tokens_seen": 10652896,
"step": 4585
},
{
"epoch": 4.326107445805843,
"grad_norm": 0.002820044755935669,
"learning_rate": 2.7240900576724904e-07,
"loss": 0.1078,
"num_input_tokens_seen": 10665248,
"step": 4590
},
{
"epoch": 4.330819981149858,
"grad_norm": 0.0086582712829113,
"learning_rate": 2.686871237040151e-07,
"loss": 0.0001,
"num_input_tokens_seen": 10676384,
"step": 4595
},
{
"epoch": 4.335532516493874,
"grad_norm": 0.0017868748400360346,
"learning_rate": 2.6498939819608827e-07,
"loss": 0.0,
"num_input_tokens_seen": 10688352,
"step": 4600
},
{
"epoch": 4.340245051837889,
"grad_norm": 0.026829157024621964,
"learning_rate": 2.613158692756443e-07,
"loss": 0.0,
"num_input_tokens_seen": 10698080,
"step": 4605
},
{
"epoch": 4.344957587181904,
"grad_norm": 0.004218968562781811,
"learning_rate": 2.576665767129055e-07,
"loss": 0.0,
"num_input_tokens_seen": 10710816,
"step": 4610
},
{
"epoch": 4.349670122525919,
"grad_norm": 0.0005848800064995885,
"learning_rate": 2.5404156001570257e-07,
"loss": 0.0,
"num_input_tokens_seen": 10722592,
"step": 4615
},
{
"epoch": 4.354382657869934,
"grad_norm": 0.0013897489989176393,
"learning_rate": 2.5044085842905686e-07,
"loss": 0.0,
"num_input_tokens_seen": 10734752,
"step": 4620
},
{
"epoch": 4.359095193213949,
"grad_norm": 0.003906297497451305,
"learning_rate": 2.4686451093474673e-07,
"loss": 0.0001,
"num_input_tokens_seen": 10746464,
"step": 4625
},
{
"epoch": 4.363807728557964,
"grad_norm": 0.010411670431494713,
"learning_rate": 2.433125562508917e-07,
"loss": 0.0,
"num_input_tokens_seen": 10757472,
"step": 4630
},
{
"epoch": 4.36852026390198,
"grad_norm": 24.992229461669922,
"learning_rate": 2.3978503283152847e-07,
"loss": 0.1078,
"num_input_tokens_seen": 10769056,
"step": 4635
},
{
"epoch": 4.3732327992459945,
"grad_norm": 0.007977725006639957,
"learning_rate": 2.3628197886619852e-07,
"loss": 0.0,
"num_input_tokens_seen": 10780384,
"step": 4640
},
{
"epoch": 4.3779453345900095,
"grad_norm": 0.0031011472456157207,
"learning_rate": 2.3280343227953305e-07,
"loss": 0.0,
"num_input_tokens_seen": 10792928,
"step": 4645
},
{
"epoch": 4.382657869934024,
"grad_norm": 0.0017057248624041677,
"learning_rate": 2.293494307308411e-07,
"loss": 0.0,
"num_input_tokens_seen": 10803808,
"step": 4650
},
{
"epoch": 4.387370405278039,
"grad_norm": 0.002497282810509205,
"learning_rate": 2.2592001161370392e-07,
"loss": 0.0,
"num_input_tokens_seen": 10814496,
"step": 4655
},
{
"epoch": 4.392082940622054,
"grad_norm": 0.005025547929108143,
"learning_rate": 2.2251521205557042e-07,
"loss": 0.0,
"num_input_tokens_seen": 10827168,
"step": 4660
},
{
"epoch": 4.39679547596607,
"grad_norm": 0.014085683971643448,
"learning_rate": 2.1913506891735242e-07,
"loss": 0.0,
"num_input_tokens_seen": 10839392,
"step": 4665
},
{
"epoch": 4.401508011310085,
"grad_norm": 0.0015956445131450891,
"learning_rate": 2.1577961879302807e-07,
"loss": 0.0,
"num_input_tokens_seen": 10851744,
"step": 4670
},
{
"epoch": 4.4062205466541,
"grad_norm": 0.0027405947912484407,
"learning_rate": 2.124488980092454e-07,
"loss": 0.0,
"num_input_tokens_seen": 10864608,
"step": 4675
},
{
"epoch": 4.410933081998115,
"grad_norm": 0.0018958933651447296,
"learning_rate": 2.0914294262492723e-07,
"loss": 0.0,
"num_input_tokens_seen": 10877856,
"step": 4680
},
{
"epoch": 4.41564561734213,
"grad_norm": 0.0035127715673297644,
"learning_rate": 2.0586178843088473e-07,
"loss": 0.0044,
"num_input_tokens_seen": 10891616,
"step": 4685
},
{
"epoch": 4.420358152686145,
"grad_norm": 0.003918149974197149,
"learning_rate": 2.026054709494235e-07,
"loss": 0.0,
"num_input_tokens_seen": 10901024,
"step": 4690
},
{
"epoch": 4.425070688030161,
"grad_norm": 0.004163654521107674,
"learning_rate": 1.9937402543396683e-07,
"loss": 0.0,
"num_input_tokens_seen": 10910560,
"step": 4695
},
{
"epoch": 4.429783223374176,
"grad_norm": 0.0017460703384131193,
"learning_rate": 1.961674868686675e-07,
"loss": 0.0,
"num_input_tokens_seen": 10921824,
"step": 4700
},
{
"epoch": 4.434495758718191,
"grad_norm": 0.0011986172758042812,
"learning_rate": 1.929858899680323e-07,
"loss": 0.0,
"num_input_tokens_seen": 10934944,
"step": 4705
},
{
"epoch": 4.4392082940622055,
"grad_norm": 0.0017532928613945842,
"learning_rate": 1.8982926917654575e-07,
"loss": 0.0922,
"num_input_tokens_seen": 10946400,
"step": 4710
},
{
"epoch": 4.4439208294062205,
"grad_norm": 0.0029414647724479437,
"learning_rate": 1.8669765866829724e-07,
"loss": 0.0,
"num_input_tokens_seen": 10958112,
"step": 4715
},
{
"epoch": 4.448633364750235,
"grad_norm": 0.0008323266520164907,
"learning_rate": 1.835910923466097e-07,
"loss": 0.0,
"num_input_tokens_seen": 10970528,
"step": 4720
},
{
"epoch": 4.45334590009425,
"grad_norm": 0.002249309793114662,
"learning_rate": 1.805096038436749e-07,
"loss": 0.0,
"num_input_tokens_seen": 10982048,
"step": 4725
},
{
"epoch": 4.458058435438266,
"grad_norm": 0.001825433922931552,
"learning_rate": 1.774532265201867e-07,
"loss": 0.0,
"num_input_tokens_seen": 10994848,
"step": 4730
},
{
"epoch": 4.462770970782281,
"grad_norm": 0.04776029288768768,
"learning_rate": 1.7442199346498294e-07,
"loss": 0.0001,
"num_input_tokens_seen": 11004896,
"step": 4735
},
{
"epoch": 4.467483506126296,
"grad_norm": 0.01915101520717144,
"learning_rate": 1.7141593749468361e-07,
"loss": 0.0,
"num_input_tokens_seen": 11017056,
"step": 4740
},
{
"epoch": 4.472196041470311,
"grad_norm": 0.0076557123102247715,
"learning_rate": 1.6843509115333917e-07,
"loss": 0.0,
"num_input_tokens_seen": 11026912,
"step": 4745
},
{
"epoch": 4.476908576814326,
"grad_norm": 0.010826838202774525,
"learning_rate": 1.6547948671207515e-07,
"loss": 0.0,
"num_input_tokens_seen": 11038176,
"step": 4750
},
{
"epoch": 4.481621112158341,
"grad_norm": 0.0004848201060667634,
"learning_rate": 1.6254915616874645e-07,
"loss": 0.0,
"num_input_tokens_seen": 11047648,
"step": 4755
},
{
"epoch": 4.486333647502356,
"grad_norm": 0.004097946919500828,
"learning_rate": 1.5964413124758492e-07,
"loss": 0.0441,
"num_input_tokens_seen": 11056864,
"step": 4760
},
{
"epoch": 4.491046182846372,
"grad_norm": 0.0032770747784525156,
"learning_rate": 1.5676444339886327e-07,
"loss": 0.0,
"num_input_tokens_seen": 11067744,
"step": 4765
},
{
"epoch": 4.495758718190387,
"grad_norm": 0.0030054976232349873,
"learning_rate": 1.5391012379854937e-07,
"loss": 0.0,
"num_input_tokens_seen": 11077920,
"step": 4770
},
{
"epoch": 4.500471253534402,
"grad_norm": 0.0032186508178710938,
"learning_rate": 1.5108120334797e-07,
"loss": 0.0,
"num_input_tokens_seen": 11088864,
"step": 4775
},
{
"epoch": 4.5051837888784165,
"grad_norm": 0.006772062741219997,
"learning_rate": 1.4827771267347662e-07,
"loss": 0.0,
"num_input_tokens_seen": 11098336,
"step": 4780
},
{
"epoch": 4.5098963242224315,
"grad_norm": 0.003302761586382985,
"learning_rate": 1.4549968212611538e-07,
"loss": 0.0,
"num_input_tokens_seen": 11107680,
"step": 4785
},
{
"epoch": 4.512723845428841,
"eval_loss": 0.5418137311935425,
"eval_runtime": 2.7286,
"eval_samples_per_second": 345.594,
"eval_steps_per_second": 43.245,
"num_input_tokens_seen": 11115040,
"step": 4788
},
{
"epoch": 4.514608859566446,
"grad_norm": 0.009535914286971092,
"learning_rate": 1.4274714178129534e-07,
"loss": 0.0,
"num_input_tokens_seen": 11120480,
"step": 4790
},
{
"epoch": 4.519321394910461,
"grad_norm": 0.0017795681487768888,
"learning_rate": 1.4002012143846472e-07,
"loss": 0.0,
"num_input_tokens_seen": 11132320,
"step": 4795
},
{
"epoch": 4.524033930254477,
"grad_norm": 0.004151622299104929,
"learning_rate": 1.3731865062078853e-07,
"loss": 0.0006,
"num_input_tokens_seen": 11148960,
"step": 4800
},
{
"epoch": 4.528746465598492,
"grad_norm": 0.009143157862126827,
"learning_rate": 1.3464275857482778e-07,
"loss": 0.0,
"num_input_tokens_seen": 11159968,
"step": 4805
},
{
"epoch": 4.533459000942507,
"grad_norm": 0.00789305754005909,
"learning_rate": 1.3199247427022528e-07,
"loss": 0.122,
"num_input_tokens_seen": 11170848,
"step": 4810
},
{
"epoch": 4.538171536286522,
"grad_norm": 0.0016827658982947469,
"learning_rate": 1.293678263993872e-07,
"loss": 0.0,
"num_input_tokens_seen": 11184288,
"step": 4815
},
{
"epoch": 4.542884071630537,
"grad_norm": 0.004033006262034178,
"learning_rate": 1.2676884337717882e-07,
"loss": 0.0,
"num_input_tokens_seen": 11197856,
"step": 4820
},
{
"epoch": 4.547596606974552,
"grad_norm": 0.0016169328009709716,
"learning_rate": 1.241955533406114e-07,
"loss": 0.0,
"num_input_tokens_seen": 11209696,
"step": 4825
},
{
"epoch": 4.552309142318568,
"grad_norm": 0.0008929000468924642,
"learning_rate": 1.2164798414854073e-07,
"loss": 0.0,
"num_input_tokens_seen": 11220064,
"step": 4830
},
{
"epoch": 4.557021677662583,
"grad_norm": 0.002499083988368511,
"learning_rate": 1.1912616338136396e-07,
"loss": 0.0,
"num_input_tokens_seen": 11230304,
"step": 4835
},
{
"epoch": 4.561734213006598,
"grad_norm": 0.0028031610418111086,
"learning_rate": 1.1663011834072257e-07,
"loss": 0.0,
"num_input_tokens_seen": 11240096,
"step": 4840
},
{
"epoch": 4.566446748350613,
"grad_norm": 0.002892805030569434,
"learning_rate": 1.1415987604920492e-07,
"loss": 0.0,
"num_input_tokens_seen": 11251104,
"step": 4845
},
{
"epoch": 4.5711592836946275,
"grad_norm": 0.002147044287994504,
"learning_rate": 1.11715463250055e-07,
"loss": 0.0,
"num_input_tokens_seen": 11261088,
"step": 4850
},
{
"epoch": 4.5758718190386425,
"grad_norm": 69.84650421142578,
"learning_rate": 1.0929690640688218e-07,
"loss": 0.0072,
"num_input_tokens_seen": 11273312,
"step": 4855
},
{
"epoch": 4.580584354382658,
"grad_norm": 0.00039674167055636644,
"learning_rate": 1.0690423170337554e-07,
"loss": 0.0003,
"num_input_tokens_seen": 11284896,
"step": 4860
},
{
"epoch": 4.585296889726673,
"grad_norm": 0.002157399896532297,
"learning_rate": 1.0453746504302003e-07,
"loss": 0.0,
"num_input_tokens_seen": 11294560,
"step": 4865
},
{
"epoch": 4.590009425070688,
"grad_norm": 0.012543817050755024,
"learning_rate": 1.021966320488152e-07,
"loss": 0.0813,
"num_input_tokens_seen": 11308128,
"step": 4870
},
{
"epoch": 4.594721960414703,
"grad_norm": 0.004514993634074926,
"learning_rate": 9.988175806299877e-08,
"loss": 0.0,
"num_input_tokens_seen": 11321056,
"step": 4875
},
{
"epoch": 4.599434495758718,
"grad_norm": 0.06353511661291122,
"learning_rate": 9.759286814677305e-08,
"loss": 0.0,
"num_input_tokens_seen": 11334496,
"step": 4880
},
{
"epoch": 4.604147031102733,
"grad_norm": 0.00232234806753695,
"learning_rate": 9.532998708003061e-08,
"loss": 0.0,
"num_input_tokens_seen": 11346208,
"step": 4885
},
{
"epoch": 4.608859566446748,
"grad_norm": 0.0016301957657560706,
"learning_rate": 9.309313936108983e-08,
"loss": 0.0,
"num_input_tokens_seen": 11358112,
"step": 4890
},
{
"epoch": 4.613572101790764,
"grad_norm": 0.0005986247560940683,
"learning_rate": 9.088234920642703e-08,
"loss": 0.0,
"num_input_tokens_seen": 11368096,
"step": 4895
},
{
"epoch": 4.618284637134779,
"grad_norm": 0.004071133676916361,
"learning_rate": 8.869764055041501e-08,
"loss": 0.0,
"num_input_tokens_seen": 11378976,
"step": 4900
},
{
"epoch": 4.622997172478794,
"grad_norm": 0.0028420898597687483,
"learning_rate": 8.653903704506389e-08,
"loss": 0.0,
"num_input_tokens_seen": 11390688,
"step": 4905
},
{
"epoch": 4.627709707822809,
"grad_norm": 0.011091876775026321,
"learning_rate": 8.440656205976644e-08,
"loss": 0.0,
"num_input_tokens_seen": 11401440,
"step": 4910
},
{
"epoch": 4.632422243166824,
"grad_norm": 0.001331353560090065,
"learning_rate": 8.230023868104231e-08,
"loss": 0.0,
"num_input_tokens_seen": 11412448,
"step": 4915
},
{
"epoch": 4.6371347785108386,
"grad_norm": 0.01889793761074543,
"learning_rate": 8.022008971229039e-08,
"loss": 0.0,
"num_input_tokens_seen": 11422496,
"step": 4920
},
{
"epoch": 4.6418473138548535,
"grad_norm": 0.002929375506937504,
"learning_rate": 7.816613767354098e-08,
"loss": 0.0,
"num_input_tokens_seen": 11433632,
"step": 4925
},
{
"epoch": 4.646559849198869,
"grad_norm": 0.002046855865046382,
"learning_rate": 7.613840480121176e-08,
"loss": 0.0,
"num_input_tokens_seen": 11446112,
"step": 4930
},
{
"epoch": 4.651272384542884,
"grad_norm": 0.0020440209191292524,
"learning_rate": 7.41369130478689e-08,
"loss": 0.0,
"num_input_tokens_seen": 11459552,
"step": 4935
},
{
"epoch": 4.655984919886899,
"grad_norm": 0.01186713483184576,
"learning_rate": 7.216168408198554e-08,
"loss": 0.0,
"num_input_tokens_seen": 11469984,
"step": 4940
},
{
"epoch": 4.660697455230914,
"grad_norm": 0.005305349826812744,
"learning_rate": 7.021273928771221e-08,
"loss": 0.0,
"num_input_tokens_seen": 11481888,
"step": 4945
},
{
"epoch": 4.665409990574929,
"grad_norm": 0.0028958169277757406,
"learning_rate": 6.829009976464102e-08,
"loss": 0.0579,
"num_input_tokens_seen": 11494944,
"step": 4950
},
{
"epoch": 4.670122525918944,
"grad_norm": 0.0005974304513074458,
"learning_rate": 6.639378632757986e-08,
"loss": 0.0,
"num_input_tokens_seen": 11505184,
"step": 4955
},
{
"epoch": 4.674835061262959,
"grad_norm": 0.0025155956391245127,
"learning_rate": 6.452381950632469e-08,
"loss": 0.0,
"num_input_tokens_seen": 11517856,
"step": 4960
},
{
"epoch": 4.679547596606975,
"grad_norm": 0.0031720330007374287,
"learning_rate": 6.268021954544095e-08,
"loss": 0.0,
"num_input_tokens_seen": 11530016,
"step": 4965
},
{
"epoch": 4.68426013195099,
"grad_norm": 0.004775336477905512,
"learning_rate": 6.08630064040408e-08,
"loss": 0.0,
"num_input_tokens_seen": 11545376,
"step": 4970
},
{
"epoch": 4.688972667295005,
"grad_norm": 0.001557852141559124,
"learning_rate": 5.9072199755567936e-08,
"loss": 0.0,
"num_input_tokens_seen": 11556448,
"step": 4975
},
{
"epoch": 4.69368520263902,
"grad_norm": 0.0037680910900235176,
"learning_rate": 5.730781898758614e-08,
"loss": 0.0,
"num_input_tokens_seen": 11566304,
"step": 4980
},
{
"epoch": 4.698397737983035,
"grad_norm": 0.0020881840027868748,
"learning_rate": 5.556988320156831e-08,
"loss": 0.0,
"num_input_tokens_seen": 11577056,
"step": 4985
},
{
"epoch": 4.7031102733270505,
"grad_norm": 0.0009613709407858551,
"learning_rate": 5.3858411212689146e-08,
"loss": 0.0,
"num_input_tokens_seen": 11589536,
"step": 4990
},
{
"epoch": 4.707822808671065,
"grad_norm": 0.11669395118951797,
"learning_rate": 5.2173421549621685e-08,
"loss": 0.0001,
"num_input_tokens_seen": 11599648,
"step": 4995
},
{
"epoch": 4.71253534401508,
"grad_norm": 0.0027235562447458506,
"learning_rate": 5.051493245433775e-08,
"loss": 0.0,
"num_input_tokens_seen": 11610272,
"step": 5000
},
{
"epoch": 4.717247879359095,
"grad_norm": 0.06230725720524788,
"learning_rate": 4.888296188190977e-08,
"loss": 0.0,
"num_input_tokens_seen": 11620768,
"step": 5005
},
{
"epoch": 4.72196041470311,
"grad_norm": 0.0018570433603599668,
"learning_rate": 4.727752750031511e-08,
"loss": 0.0,
"num_input_tokens_seen": 11632608,
"step": 5010
},
{
"epoch": 4.726672950047125,
"grad_norm": 0.01290284376591444,
"learning_rate": 4.5698646690247874e-08,
"loss": 0.0,
"num_input_tokens_seen": 11644896,
"step": 5015
},
{
"epoch": 4.73138548539114,
"grad_norm": 0.005762244574725628,
"learning_rate": 4.414633654492767e-08,
"loss": 0.0,
"num_input_tokens_seen": 11661344,
"step": 5020
},
{
"epoch": 4.736098020735156,
"grad_norm": 0.0015709196450188756,
"learning_rate": 4.2620613869915894e-08,
"loss": 0.0,
"num_input_tokens_seen": 11672288,
"step": 5025
},
{
"epoch": 4.740810556079171,
"grad_norm": 0.0015942390309646726,
"learning_rate": 4.112149518293362e-08,
"loss": 0.0,
"num_input_tokens_seen": 11684960,
"step": 5030
},
{
"epoch": 4.745523091423186,
"grad_norm": 0.013269704766571522,
"learning_rate": 3.9648996713683715e-08,
"loss": 0.0,
"num_input_tokens_seen": 11696160,
"step": 5035
},
{
"epoch": 4.750235626767201,
"grad_norm": 0.004723825957626104,
"learning_rate": 3.8203134403672905e-08,
"loss": 0.0,
"num_input_tokens_seen": 11705952,
"step": 5040
},
{
"epoch": 4.754948162111216,
"grad_norm": 0.006132619455456734,
"learning_rate": 3.678392390604163e-08,
"loss": 0.0,
"num_input_tokens_seen": 11716192,
"step": 5045
},
{
"epoch": 4.759660697455231,
"grad_norm": 0.003462289460003376,
"learning_rate": 3.539138058539282e-08,
"loss": 0.0,
"num_input_tokens_seen": 11728160,
"step": 5050
},
{
"epoch": 4.763430725730443,
"eval_loss": 0.54215008020401,
"eval_runtime": 2.8215,
"eval_samples_per_second": 334.225,
"eval_steps_per_second": 41.822,
"num_input_tokens_seen": 11736672,
"step": 5054
},
{
"epoch": 4.764373232799246,
"grad_norm": 0.0019681937992572784,
"learning_rate": 3.4025519517626174e-08,
"loss": 0.0,
"num_input_tokens_seen": 11738720,
"step": 5055
},
{
"epoch": 4.7690857681432615,
"grad_norm": 0.0026169854681938887,
"learning_rate": 3.268635548977633e-08,
"loss": 0.0,
"num_input_tokens_seen": 11750176,
"step": 5060
},
{
"epoch": 4.773798303487276,
"grad_norm": 0.0066421544179320335,
"learning_rate": 3.137390299984888e-08,
"loss": 0.0,
"num_input_tokens_seen": 11761312,
"step": 5065
},
{
"epoch": 4.778510838831291,
"grad_norm": 0.0028810338117182255,
"learning_rate": 3.0088176256668765e-08,
"loss": 0.0,
"num_input_tokens_seen": 11773728,
"step": 5070
},
{
"epoch": 4.783223374175306,
"grad_norm": 0.001111470046453178,
"learning_rate": 2.8829189179721552e-08,
"loss": 0.0,
"num_input_tokens_seen": 11784672,
"step": 5075
},
{
"epoch": 4.787935909519321,
"grad_norm": 0.009747396223247051,
"learning_rate": 2.759695539900603e-08,
"loss": 0.0,
"num_input_tokens_seen": 11796512,
"step": 5080
},
{
"epoch": 4.792648444863336,
"grad_norm": 0.0038332142867147923,
"learning_rate": 2.639148825488491e-08,
"loss": 0.0,
"num_input_tokens_seen": 11810464,
"step": 5085
},
{
"epoch": 4.797360980207351,
"grad_norm": 0.003033042885363102,
"learning_rate": 2.5212800797941582e-08,
"loss": 0.0,
"num_input_tokens_seen": 11820768,
"step": 5090
},
{
"epoch": 4.802073515551367,
"grad_norm": 0.007614613976329565,
"learning_rate": 2.406090578883691e-08,
"loss": 0.0,
"num_input_tokens_seen": 11831776,
"step": 5095
},
{
"epoch": 4.806786050895382,
"grad_norm": 0.0031820612493902445,
"learning_rate": 2.2935815698174045e-08,
"loss": 0.0,
"num_input_tokens_seen": 11843296,
"step": 5100
},
{
"epoch": 4.811498586239397,
"grad_norm": 0.00583045044913888,
"learning_rate": 2.1837542706359958e-08,
"loss": 0.0,
"num_input_tokens_seen": 11860000,
"step": 5105
},
{
"epoch": 4.816211121583412,
"grad_norm": 0.016117779538035393,
"learning_rate": 2.0766098703477178e-08,
"loss": 0.0,
"num_input_tokens_seen": 11872160,
"step": 5110
},
{
"epoch": 4.820923656927427,
"grad_norm": 0.006356321275234222,
"learning_rate": 1.9721495289152237e-08,
"loss": 0.0,
"num_input_tokens_seen": 11883168,
"step": 5115
},
{
"epoch": 4.825636192271442,
"grad_norm": 0.0020537914242595434,
"learning_rate": 1.8703743772430783e-08,
"loss": 0.0,
"num_input_tokens_seen": 11895584,
"step": 5120
},
{
"epoch": 4.830348727615457,
"grad_norm": 0.006692873314023018,
"learning_rate": 1.7712855171655996e-08,
"loss": 0.0,
"num_input_tokens_seen": 11906784,
"step": 5125
},
{
"epoch": 4.8350612629594725,
"grad_norm": 0.0012435702374204993,
"learning_rate": 1.6748840214348972e-08,
"loss": 0.0,
"num_input_tokens_seen": 11917600,
"step": 5130
},
{
"epoch": 4.839773798303487,
"grad_norm": 0.00899266917258501,
"learning_rate": 1.5811709337091862e-08,
"loss": 0.0,
"num_input_tokens_seen": 11929632,
"step": 5135
},
{
"epoch": 4.844486333647502,
"grad_norm": 0.0016645839205011725,
"learning_rate": 1.4901472685415475e-08,
"loss": 0.0,
"num_input_tokens_seen": 11938720,
"step": 5140
},
{
"epoch": 4.849198868991517,
"grad_norm": 0.00441192090511322,
"learning_rate": 1.4018140113689904e-08,
"loss": 0.0072,
"num_input_tokens_seen": 11951648,
"step": 5145
},
{
"epoch": 4.853911404335532,
"grad_norm": 0.001967532094568014,
"learning_rate": 1.3161721185016852e-08,
"loss": 0.0,
"num_input_tokens_seen": 11962336,
"step": 5150
},
{
"epoch": 4.858623939679548,
"grad_norm": 0.005055475980043411,
"learning_rate": 1.2332225171126366e-08,
"loss": 0.0,
"num_input_tokens_seen": 11975904,
"step": 5155
},
{
"epoch": 4.863336475023563,
"grad_norm": 0.014400842599570751,
"learning_rate": 1.152966105227693e-08,
"loss": 0.0,
"num_input_tokens_seen": 11986208,
"step": 5160
},
{
"epoch": 4.868049010367578,
"grad_norm": 0.0008923859568312764,
"learning_rate": 1.0754037517158312e-08,
"loss": 0.0,
"num_input_tokens_seen": 11999520,
"step": 5165
},
{
"epoch": 4.872761545711593,
"grad_norm": 0.0033309967257082462,
"learning_rate": 1.0005362962796362e-08,
"loss": 0.0,
"num_input_tokens_seen": 12011424,
"step": 5170
},
{
"epoch": 4.877474081055608,
"grad_norm": 0.005915912799537182,
"learning_rate": 9.283645494463368e-09,
"loss": 0.0,
"num_input_tokens_seen": 12024864,
"step": 5175
},
{
"epoch": 4.882186616399623,
"grad_norm": 0.0010997394565492868,
"learning_rate": 8.588892925590064e-09,
"loss": 0.0,
"num_input_tokens_seen": 12035936,
"step": 5180
},
{
"epoch": 4.886899151743638,
"grad_norm": 0.005926317069679499,
"learning_rate": 7.92111277768015e-09,
"loss": 0.0,
"num_input_tokens_seen": 12045920,
"step": 5185
},
{
"epoch": 4.891611687087654,
"grad_norm": 0.011491699144244194,
"learning_rate": 7.280312280230073e-09,
"loss": 0.0,
"num_input_tokens_seen": 12058144,
"step": 5190
},
{
"epoch": 4.8963242224316685,
"grad_norm": 0.0012660275679081678,
"learning_rate": 6.666498370650198e-09,
"loss": 0.0,
"num_input_tokens_seen": 12069792,
"step": 5195
},
{
"epoch": 4.9010367577756835,
"grad_norm": 0.03111676499247551,
"learning_rate": 6.079677694189046e-09,
"loss": 0.0,
"num_input_tokens_seen": 12080864,
"step": 5200
},
{
"epoch": 4.905749293119698,
"grad_norm": 0.0013736054534092546,
"learning_rate": 5.5198566038627835e-09,
"loss": 0.0,
"num_input_tokens_seen": 12092256,
"step": 5205
},
{
"epoch": 4.910461828463713,
"grad_norm": 0.008520975708961487,
"learning_rate": 4.987041160385287e-09,
"loss": 0.0,
"num_input_tokens_seen": 12106784,
"step": 5210
},
{
"epoch": 4.915174363807728,
"grad_norm": 0.038982491940259933,
"learning_rate": 4.481237132103189e-09,
"loss": 0.0003,
"num_input_tokens_seen": 12117088,
"step": 5215
},
{
"epoch": 4.919886899151743,
"grad_norm": 0.0053964899852871895,
"learning_rate": 4.002449994932878e-09,
"loss": 0.0,
"num_input_tokens_seen": 12128736,
"step": 5220
},
{
"epoch": 4.924599434495759,
"grad_norm": 0.0056511214934289455,
"learning_rate": 3.550684932301374e-09,
"loss": 0.0,
"num_input_tokens_seen": 12145376,
"step": 5225
},
{
"epoch": 4.929311969839774,
"grad_norm": 0.004293152131140232,
"learning_rate": 3.1259468350910982e-09,
"loss": 0.0,
"num_input_tokens_seen": 12156320,
"step": 5230
},
{
"epoch": 4.934024505183789,
"grad_norm": 0.002154400572180748,
"learning_rate": 2.7282403015849167e-09,
"loss": 0.0,
"num_input_tokens_seen": 12167968,
"step": 5235
},
{
"epoch": 4.938737040527804,
"grad_norm": 0.0020166414324194193,
"learning_rate": 2.3575696374189548e-09,
"loss": 0.0,
"num_input_tokens_seen": 12179744,
"step": 5240
},
{
"epoch": 4.943449575871819,
"grad_norm": 0.0013167713768780231,
"learning_rate": 2.013938855533748e-09,
"loss": 0.0001,
"num_input_tokens_seen": 12192288,
"step": 5245
},
{
"epoch": 4.948162111215834,
"grad_norm": 0.00897101778537035,
"learning_rate": 1.6973516761317755e-09,
"loss": 0.0,
"num_input_tokens_seen": 12203360,
"step": 5250
},
{
"epoch": 4.952874646559849,
"grad_norm": 0.7944397330284119,
"learning_rate": 1.407811526637215e-09,
"loss": 0.0007,
"num_input_tokens_seen": 12215392,
"step": 5255
},
{
"epoch": 4.957587181903865,
"grad_norm": 0.004428845830261707,
"learning_rate": 1.145321541659028e-09,
"loss": 0.0,
"num_input_tokens_seen": 12227680,
"step": 5260
},
{
"epoch": 4.9622997172478795,
"grad_norm": 0.005368073936551809,
"learning_rate": 9.098845629559871e-10,
"loss": 0.0,
"num_input_tokens_seen": 12242336,
"step": 5265
},
{
"epoch": 4.9670122525918945,
"grad_norm": 0.055548761039972305,
"learning_rate": 7.015031394072557e-10,
"loss": 0.0,
"num_input_tokens_seen": 12251936,
"step": 5270
},
{
"epoch": 4.971724787935909,
"grad_norm": 0.006647925358265638,
"learning_rate": 5.201795269837995e-10,
"loss": 0.0,
"num_input_tokens_seen": 12262432,
"step": 5275
},
{
"epoch": 4.976437323279924,
"grad_norm": 0.0016112312441691756,
"learning_rate": 3.6591568872451634e-10,
"loss": 0.0,
"num_input_tokens_seen": 12275872,
"step": 5280
},
{
"epoch": 4.981149858623939,
"grad_norm": 0.003358457935974002,
"learning_rate": 2.387132947151427e-10,
"loss": 0.0,
"num_input_tokens_seen": 12288928,
"step": 5285
},
{
"epoch": 4.985862393967954,
"grad_norm": 0.011470218189060688,
"learning_rate": 1.3857372206882436e-10,
"loss": 0.0969,
"num_input_tokens_seen": 12300832,
"step": 5290
},
{
"epoch": 4.99057492931197,
"grad_norm": 0.018770242109894753,
"learning_rate": 6.549805491307127e-11,
"loss": 0.0,
"num_input_tokens_seen": 12312352,
"step": 5295
},
{
"epoch": 4.995287464655985,
"grad_norm": 0.002519431058317423,
"learning_rate": 1.948708437726765e-11,
"loss": 0.0,
"num_input_tokens_seen": 12322528,
"step": 5300
},
{
"epoch": 5.0,
"grad_norm": 0.0025235991925001144,
"learning_rate": 5.413085829575338e-13,
"loss": 0.0,
"num_input_tokens_seen": 12333600,
"step": 5305
},
{
"epoch": 5.0,
"num_input_tokens_seen": 12333600,
"step": 5305,
"total_flos": 7.20143693217792e+16,
"train_loss": 0.11108403178044919,
"train_runtime": 1575.5047,
"train_samples_per_second": 26.925,
"train_steps_per_second": 3.367
}
],
"logging_steps": 5,
"max_steps": 5305,
"num_input_tokens_seen": 12333600,
"num_train_epochs": 5,
"save_steps": 266,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.20143693217792e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}