Files
train_sst2_42_1779354537/trainer_state.json
ModelHub XC effa9b2bb0 初始化项目,由ModelHub XC社区提供模型
Model: rbelanec/train_sst2_42_1779354537
Source: Original Platform
2026-06-10 23:53:51 +08:00

12336 lines
323 KiB
JSON

{
"best_global_step": 5306,
"best_metric": 0.09084735810756683,
"best_model_checkpoint": "saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_sst2_42_1779354537/checkpoint-5306",
"epoch": 1.0,
"eval_steps": 379,
"global_step": 7577,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006598917777484492,
"grad_norm": 442.1559753417969,
"learning_rate": 1.0554089709762531e-08,
"loss": 1.413,
"num_input_tokens_seen": 2240,
"step": 5
},
{
"epoch": 0.0013197835554968984,
"grad_norm": 437.0361328125,
"learning_rate": 2.3746701846965696e-08,
"loss": 1.5134,
"num_input_tokens_seen": 4672,
"step": 10
},
{
"epoch": 0.0019796753332453477,
"grad_norm": 441.0426330566406,
"learning_rate": 3.6939313984168866e-08,
"loss": 1.3995,
"num_input_tokens_seen": 7040,
"step": 15
},
{
"epoch": 0.002639567110993797,
"grad_norm": 481.140380859375,
"learning_rate": 5.013192612137203e-08,
"loss": 1.422,
"num_input_tokens_seen": 9600,
"step": 20
},
{
"epoch": 0.0032994588887422464,
"grad_norm": 432.22509765625,
"learning_rate": 6.33245382585752e-08,
"loss": 1.2976,
"num_input_tokens_seen": 12160,
"step": 25
},
{
"epoch": 0.0039593506664906955,
"grad_norm": 370.72210693359375,
"learning_rate": 7.651715039577835e-08,
"loss": 1.3834,
"num_input_tokens_seen": 14528,
"step": 30
},
{
"epoch": 0.004619242444239145,
"grad_norm": 356.2071533203125,
"learning_rate": 8.970976253298153e-08,
"loss": 1.1343,
"num_input_tokens_seen": 16768,
"step": 35
},
{
"epoch": 0.005279134221987594,
"grad_norm": 318.7434997558594,
"learning_rate": 1.0290237467018468e-07,
"loss": 1.1513,
"num_input_tokens_seen": 19264,
"step": 40
},
{
"epoch": 0.005939025999736044,
"grad_norm": 264.9164733886719,
"learning_rate": 1.1609498680738786e-07,
"loss": 0.8287,
"num_input_tokens_seen": 21632,
"step": 45
},
{
"epoch": 0.006598917777484493,
"grad_norm": 186.7604217529297,
"learning_rate": 1.29287598944591e-07,
"loss": 0.7425,
"num_input_tokens_seen": 24000,
"step": 50
},
{
"epoch": 0.007258809555232942,
"grad_norm": 162.783203125,
"learning_rate": 1.424802110817942e-07,
"loss": 0.7064,
"num_input_tokens_seen": 26496,
"step": 55
},
{
"epoch": 0.007918701332981391,
"grad_norm": 45.531829833984375,
"learning_rate": 1.5567282321899736e-07,
"loss": 0.3853,
"num_input_tokens_seen": 29120,
"step": 60
},
{
"epoch": 0.008578593110729841,
"grad_norm": 28.7725887298584,
"learning_rate": 1.688654353562005e-07,
"loss": 0.3076,
"num_input_tokens_seen": 31744,
"step": 65
},
{
"epoch": 0.00923848488847829,
"grad_norm": 51.50053024291992,
"learning_rate": 1.820580474934037e-07,
"loss": 0.2971,
"num_input_tokens_seen": 34176,
"step": 70
},
{
"epoch": 0.009898376666226739,
"grad_norm": 48.889060974121094,
"learning_rate": 1.9525065963060686e-07,
"loss": 0.3004,
"num_input_tokens_seen": 36864,
"step": 75
},
{
"epoch": 0.010558268443975187,
"grad_norm": 26.879812240600586,
"learning_rate": 2.0844327176781002e-07,
"loss": 0.2532,
"num_input_tokens_seen": 39424,
"step": 80
},
{
"epoch": 0.011218160221723637,
"grad_norm": 41.192012786865234,
"learning_rate": 2.2163588390501316e-07,
"loss": 0.2616,
"num_input_tokens_seen": 42112,
"step": 85
},
{
"epoch": 0.011878051999472087,
"grad_norm": 29.90664291381836,
"learning_rate": 2.3482849604221635e-07,
"loss": 0.2528,
"num_input_tokens_seen": 44544,
"step": 90
},
{
"epoch": 0.012537943777220536,
"grad_norm": 42.19837951660156,
"learning_rate": 2.480211081794195e-07,
"loss": 0.1904,
"num_input_tokens_seen": 47104,
"step": 95
},
{
"epoch": 0.013197835554968985,
"grad_norm": 23.84099769592285,
"learning_rate": 2.612137203166227e-07,
"loss": 0.1653,
"num_input_tokens_seen": 49664,
"step": 100
},
{
"epoch": 0.013857727332717434,
"grad_norm": 22.509811401367188,
"learning_rate": 2.744063324538258e-07,
"loss": 0.137,
"num_input_tokens_seen": 52352,
"step": 105
},
{
"epoch": 0.014517619110465884,
"grad_norm": 86.81343078613281,
"learning_rate": 2.8759894459102903e-07,
"loss": 0.102,
"num_input_tokens_seen": 54720,
"step": 110
},
{
"epoch": 0.015177510888214334,
"grad_norm": 43.25410461425781,
"learning_rate": 3.007915567282322e-07,
"loss": 0.1477,
"num_input_tokens_seen": 57152,
"step": 115
},
{
"epoch": 0.015837402665962782,
"grad_norm": 98.41868591308594,
"learning_rate": 3.139841688654353e-07,
"loss": 0.2005,
"num_input_tokens_seen": 59776,
"step": 120
},
{
"epoch": 0.01649729444371123,
"grad_norm": 23.211767196655273,
"learning_rate": 3.271767810026385e-07,
"loss": 0.1417,
"num_input_tokens_seen": 62464,
"step": 125
},
{
"epoch": 0.017157186221459682,
"grad_norm": 80.11978912353516,
"learning_rate": 3.403693931398417e-07,
"loss": 0.1226,
"num_input_tokens_seen": 65088,
"step": 130
},
{
"epoch": 0.01781707799920813,
"grad_norm": 74.83419036865234,
"learning_rate": 3.5356200527704485e-07,
"loss": 0.2123,
"num_input_tokens_seen": 67776,
"step": 135
},
{
"epoch": 0.01847696977695658,
"grad_norm": 67.47618865966797,
"learning_rate": 3.66754617414248e-07,
"loss": 0.2606,
"num_input_tokens_seen": 70400,
"step": 140
},
{
"epoch": 0.01913686155470503,
"grad_norm": 51.34063720703125,
"learning_rate": 3.7994722955145113e-07,
"loss": 0.1463,
"num_input_tokens_seen": 72704,
"step": 145
},
{
"epoch": 0.019796753332453478,
"grad_norm": 63.031131744384766,
"learning_rate": 3.9313984168865435e-07,
"loss": 0.346,
"num_input_tokens_seen": 75136,
"step": 150
},
{
"epoch": 0.020456645110201926,
"grad_norm": 25.44994354248047,
"learning_rate": 4.063324538258575e-07,
"loss": 0.0609,
"num_input_tokens_seen": 77632,
"step": 155
},
{
"epoch": 0.021116536887950375,
"grad_norm": 72.61922454833984,
"learning_rate": 4.195250659630606e-07,
"loss": 0.2492,
"num_input_tokens_seen": 80320,
"step": 160
},
{
"epoch": 0.021776428665698826,
"grad_norm": 107.4610824584961,
"learning_rate": 4.3271767810026384e-07,
"loss": 0.1542,
"num_input_tokens_seen": 82752,
"step": 165
},
{
"epoch": 0.022436320443447275,
"grad_norm": 148.21913146972656,
"learning_rate": 4.45910290237467e-07,
"loss": 0.3095,
"num_input_tokens_seen": 85248,
"step": 170
},
{
"epoch": 0.023096212221195723,
"grad_norm": 166.9718475341797,
"learning_rate": 4.5910290237467017e-07,
"loss": 0.2917,
"num_input_tokens_seen": 87872,
"step": 175
},
{
"epoch": 0.023756103998944175,
"grad_norm": 49.3593864440918,
"learning_rate": 4.7229551451187333e-07,
"loss": 0.1369,
"num_input_tokens_seen": 90368,
"step": 180
},
{
"epoch": 0.024415995776692623,
"grad_norm": 39.84811019897461,
"learning_rate": 4.854881266490765e-07,
"loss": 0.0624,
"num_input_tokens_seen": 92928,
"step": 185
},
{
"epoch": 0.02507588755444107,
"grad_norm": 61.82024383544922,
"learning_rate": 4.986807387862796e-07,
"loss": 0.1617,
"num_input_tokens_seen": 95296,
"step": 190
},
{
"epoch": 0.02573577933218952,
"grad_norm": 56.04179763793945,
"learning_rate": 5.118733509234829e-07,
"loss": 0.1049,
"num_input_tokens_seen": 97984,
"step": 195
},
{
"epoch": 0.02639567110993797,
"grad_norm": 102.43315124511719,
"learning_rate": 5.250659630606859e-07,
"loss": 0.1992,
"num_input_tokens_seen": 100352,
"step": 200
},
{
"epoch": 0.02705556288768642,
"grad_norm": 17.690580368041992,
"learning_rate": 5.382585751978892e-07,
"loss": 0.1434,
"num_input_tokens_seen": 102464,
"step": 205
},
{
"epoch": 0.027715454665434867,
"grad_norm": 14.600909233093262,
"learning_rate": 5.514511873350924e-07,
"loss": 0.1237,
"num_input_tokens_seen": 105088,
"step": 210
},
{
"epoch": 0.02837534644318332,
"grad_norm": 16.153587341308594,
"learning_rate": 5.646437994722954e-07,
"loss": 0.3024,
"num_input_tokens_seen": 107648,
"step": 215
},
{
"epoch": 0.029035238220931767,
"grad_norm": 108.86598205566406,
"learning_rate": 5.778364116094987e-07,
"loss": 0.2383,
"num_input_tokens_seen": 110144,
"step": 220
},
{
"epoch": 0.029695129998680216,
"grad_norm": 30.326353073120117,
"learning_rate": 5.910290237467019e-07,
"loss": 0.1367,
"num_input_tokens_seen": 112576,
"step": 225
},
{
"epoch": 0.030355021776428667,
"grad_norm": 56.80265808105469,
"learning_rate": 6.042216358839049e-07,
"loss": 0.2351,
"num_input_tokens_seen": 115264,
"step": 230
},
{
"epoch": 0.031014913554177116,
"grad_norm": 17.4112606048584,
"learning_rate": 6.174142480211082e-07,
"loss": 0.0839,
"num_input_tokens_seen": 117888,
"step": 235
},
{
"epoch": 0.031674805331925564,
"grad_norm": 34.93491744995117,
"learning_rate": 6.306068601583114e-07,
"loss": 0.0189,
"num_input_tokens_seen": 120320,
"step": 240
},
{
"epoch": 0.032334697109674015,
"grad_norm": 89.36637115478516,
"learning_rate": 6.437994722955144e-07,
"loss": 0.1832,
"num_input_tokens_seen": 122688,
"step": 245
},
{
"epoch": 0.03299458888742246,
"grad_norm": 80.80409240722656,
"learning_rate": 6.569920844327177e-07,
"loss": 0.2718,
"num_input_tokens_seen": 125248,
"step": 250
},
{
"epoch": 0.03365448066517091,
"grad_norm": 89.6211166381836,
"learning_rate": 6.701846965699208e-07,
"loss": 0.2037,
"num_input_tokens_seen": 127680,
"step": 255
},
{
"epoch": 0.034314372442919364,
"grad_norm": 17.70661735534668,
"learning_rate": 6.833773087071239e-07,
"loss": 0.0833,
"num_input_tokens_seen": 130496,
"step": 260
},
{
"epoch": 0.03497426422066781,
"grad_norm": 31.946949005126953,
"learning_rate": 6.965699208443272e-07,
"loss": 0.19,
"num_input_tokens_seen": 132992,
"step": 265
},
{
"epoch": 0.03563415599841626,
"grad_norm": 58.67094802856445,
"learning_rate": 7.097625329815303e-07,
"loss": 0.1429,
"num_input_tokens_seen": 135040,
"step": 270
},
{
"epoch": 0.03629404777616471,
"grad_norm": 67.81129455566406,
"learning_rate": 7.229551451187335e-07,
"loss": 0.2488,
"num_input_tokens_seen": 137600,
"step": 275
},
{
"epoch": 0.03695393955391316,
"grad_norm": 61.778663635253906,
"learning_rate": 7.361477572559367e-07,
"loss": 0.1125,
"num_input_tokens_seen": 139904,
"step": 280
},
{
"epoch": 0.03761383133166161,
"grad_norm": 15.433332443237305,
"learning_rate": 7.493403693931398e-07,
"loss": 0.1465,
"num_input_tokens_seen": 142016,
"step": 285
},
{
"epoch": 0.03827372310941006,
"grad_norm": 8.442534446716309,
"learning_rate": 7.62532981530343e-07,
"loss": 0.0092,
"num_input_tokens_seen": 144576,
"step": 290
},
{
"epoch": 0.038933614887158505,
"grad_norm": 112.9028091430664,
"learning_rate": 7.757255936675461e-07,
"loss": 0.0894,
"num_input_tokens_seen": 146880,
"step": 295
},
{
"epoch": 0.039593506664906956,
"grad_norm": 95.21463775634766,
"learning_rate": 7.889182058047493e-07,
"loss": 0.3917,
"num_input_tokens_seen": 149184,
"step": 300
},
{
"epoch": 0.0402533984426554,
"grad_norm": 0.18586313724517822,
"learning_rate": 8.021108179419525e-07,
"loss": 0.2216,
"num_input_tokens_seen": 151296,
"step": 305
},
{
"epoch": 0.04091329022040385,
"grad_norm": 79.36148071289062,
"learning_rate": 8.153034300791555e-07,
"loss": 0.2024,
"num_input_tokens_seen": 153664,
"step": 310
},
{
"epoch": 0.041573181998152305,
"grad_norm": 55.61602020263672,
"learning_rate": 8.284960422163588e-07,
"loss": 0.2624,
"num_input_tokens_seen": 156032,
"step": 315
},
{
"epoch": 0.04223307377590075,
"grad_norm": 0.2602592408657074,
"learning_rate": 8.41688654353562e-07,
"loss": 0.2276,
"num_input_tokens_seen": 158528,
"step": 320
},
{
"epoch": 0.0428929655536492,
"grad_norm": 18.09028434753418,
"learning_rate": 8.54881266490765e-07,
"loss": 0.1227,
"num_input_tokens_seen": 160704,
"step": 325
},
{
"epoch": 0.04355285733139765,
"grad_norm": 9.590105056762695,
"learning_rate": 8.680738786279683e-07,
"loss": 0.1889,
"num_input_tokens_seen": 163072,
"step": 330
},
{
"epoch": 0.0442127491091461,
"grad_norm": 17.7542724609375,
"learning_rate": 8.812664907651715e-07,
"loss": 0.1251,
"num_input_tokens_seen": 165568,
"step": 335
},
{
"epoch": 0.04487264088689455,
"grad_norm": 0.6858607530593872,
"learning_rate": 8.944591029023745e-07,
"loss": 0.2737,
"num_input_tokens_seen": 167936,
"step": 340
},
{
"epoch": 0.045532532664643,
"grad_norm": 168.79275512695312,
"learning_rate": 9.076517150395778e-07,
"loss": 0.0704,
"num_input_tokens_seen": 170176,
"step": 345
},
{
"epoch": 0.046192424442391446,
"grad_norm": 137.41461181640625,
"learning_rate": 9.20844327176781e-07,
"loss": 0.1521,
"num_input_tokens_seen": 172352,
"step": 350
},
{
"epoch": 0.0468523162201399,
"grad_norm": 42.20560073852539,
"learning_rate": 9.340369393139841e-07,
"loss": 0.2593,
"num_input_tokens_seen": 175168,
"step": 355
},
{
"epoch": 0.04751220799788835,
"grad_norm": 1.3534170389175415,
"learning_rate": 9.472295514511873e-07,
"loss": 0.1364,
"num_input_tokens_seen": 177856,
"step": 360
},
{
"epoch": 0.048172099775636794,
"grad_norm": 38.15507125854492,
"learning_rate": 9.604221635883904e-07,
"loss": 0.3046,
"num_input_tokens_seen": 180160,
"step": 365
},
{
"epoch": 0.048831991553385246,
"grad_norm": 143.81930541992188,
"learning_rate": 9.736147757255936e-07,
"loss": 0.1873,
"num_input_tokens_seen": 182784,
"step": 370
},
{
"epoch": 0.0494918833311337,
"grad_norm": 0.975854754447937,
"learning_rate": 9.86807387862797e-07,
"loss": 0.0584,
"num_input_tokens_seen": 185024,
"step": 375
},
{
"epoch": 0.05001979675333245,
"eval_loss": 0.17531457543373108,
"eval_runtime": 7.4576,
"eval_samples_per_second": 903.11,
"eval_steps_per_second": 112.905,
"num_input_tokens_seen": 187072,
"step": 379
},
{
"epoch": 0.05015177510888214,
"grad_norm": 59.641075134277344,
"learning_rate": 1e-06,
"loss": 0.1671,
"num_input_tokens_seen": 187712,
"step": 380
},
{
"epoch": 0.050811666886630594,
"grad_norm": 49.6882209777832,
"learning_rate": 1.0131926121372032e-06,
"loss": 0.2583,
"num_input_tokens_seen": 190400,
"step": 385
},
{
"epoch": 0.05147155866437904,
"grad_norm": 53.95302963256836,
"learning_rate": 1.0263852242744063e-06,
"loss": 0.1182,
"num_input_tokens_seen": 193280,
"step": 390
},
{
"epoch": 0.05213145044212749,
"grad_norm": 30.752676010131836,
"learning_rate": 1.0395778364116096e-06,
"loss": 0.1517,
"num_input_tokens_seen": 195584,
"step": 395
},
{
"epoch": 0.05279134221987594,
"grad_norm": 14.899801254272461,
"learning_rate": 1.0527704485488126e-06,
"loss": 0.0928,
"num_input_tokens_seen": 198208,
"step": 400
},
{
"epoch": 0.05345123399762439,
"grad_norm": 85.27983093261719,
"learning_rate": 1.0659630606860157e-06,
"loss": 0.2377,
"num_input_tokens_seen": 200704,
"step": 405
},
{
"epoch": 0.05411112577537284,
"grad_norm": 81.29493713378906,
"learning_rate": 1.079155672823219e-06,
"loss": 0.2115,
"num_input_tokens_seen": 203136,
"step": 410
},
{
"epoch": 0.05477101755312129,
"grad_norm": 64.4339370727539,
"learning_rate": 1.0923482849604222e-06,
"loss": 0.1501,
"num_input_tokens_seen": 205504,
"step": 415
},
{
"epoch": 0.055430909330869735,
"grad_norm": 108.44839477539062,
"learning_rate": 1.1055408970976253e-06,
"loss": 0.2442,
"num_input_tokens_seen": 208000,
"step": 420
},
{
"epoch": 0.056090801108618187,
"grad_norm": 229.58607482910156,
"learning_rate": 1.1187335092348285e-06,
"loss": 0.0951,
"num_input_tokens_seen": 210560,
"step": 425
},
{
"epoch": 0.05675069288636664,
"grad_norm": 34.30244827270508,
"learning_rate": 1.1319261213720316e-06,
"loss": 0.1749,
"num_input_tokens_seen": 213056,
"step": 430
},
{
"epoch": 0.05741058466411508,
"grad_norm": 48.8880500793457,
"learning_rate": 1.1451187335092347e-06,
"loss": 0.1071,
"num_input_tokens_seen": 215296,
"step": 435
},
{
"epoch": 0.058070476441863535,
"grad_norm": 0.0803152322769165,
"learning_rate": 1.158311345646438e-06,
"loss": 0.0089,
"num_input_tokens_seen": 217472,
"step": 440
},
{
"epoch": 0.058730368219611986,
"grad_norm": 173.73971557617188,
"learning_rate": 1.1715039577836412e-06,
"loss": 0.0408,
"num_input_tokens_seen": 219968,
"step": 445
},
{
"epoch": 0.05939025999736043,
"grad_norm": 0.06401122361421585,
"learning_rate": 1.1846965699208443e-06,
"loss": 0.0381,
"num_input_tokens_seen": 222592,
"step": 450
},
{
"epoch": 0.06005015177510888,
"grad_norm": 118.57979583740234,
"learning_rate": 1.1978891820580475e-06,
"loss": 0.5112,
"num_input_tokens_seen": 224768,
"step": 455
},
{
"epoch": 0.060710043552857335,
"grad_norm": 3.2848386764526367,
"learning_rate": 1.2110817941952508e-06,
"loss": 0.2367,
"num_input_tokens_seen": 227264,
"step": 460
},
{
"epoch": 0.06136993533060578,
"grad_norm": 39.09904861450195,
"learning_rate": 1.2242744063324536e-06,
"loss": 0.2476,
"num_input_tokens_seen": 229760,
"step": 465
},
{
"epoch": 0.06202982710835423,
"grad_norm": 75.33295440673828,
"learning_rate": 1.237467018469657e-06,
"loss": 0.1846,
"num_input_tokens_seen": 232000,
"step": 470
},
{
"epoch": 0.06268971888610268,
"grad_norm": 1.241437315940857,
"learning_rate": 1.2506596306068602e-06,
"loss": 0.1636,
"num_input_tokens_seen": 234176,
"step": 475
},
{
"epoch": 0.06334961066385113,
"grad_norm": 0.22346267104148865,
"learning_rate": 1.2638522427440632e-06,
"loss": 0.1056,
"num_input_tokens_seen": 236736,
"step": 480
},
{
"epoch": 0.06400950244159957,
"grad_norm": 0.07724567502737045,
"learning_rate": 1.2770448548812665e-06,
"loss": 0.0032,
"num_input_tokens_seen": 239104,
"step": 485
},
{
"epoch": 0.06466939421934803,
"grad_norm": 162.94638061523438,
"learning_rate": 1.2902374670184698e-06,
"loss": 0.4157,
"num_input_tokens_seen": 241600,
"step": 490
},
{
"epoch": 0.06532928599709648,
"grad_norm": 143.5639190673828,
"learning_rate": 1.3034300791556726e-06,
"loss": 0.3641,
"num_input_tokens_seen": 244160,
"step": 495
},
{
"epoch": 0.06598917777484492,
"grad_norm": 0.8768963813781738,
"learning_rate": 1.316622691292876e-06,
"loss": 0.175,
"num_input_tokens_seen": 246464,
"step": 500
},
{
"epoch": 0.06664906955259338,
"grad_norm": 1.546777606010437,
"learning_rate": 1.3298153034300792e-06,
"loss": 0.0893,
"num_input_tokens_seen": 248960,
"step": 505
},
{
"epoch": 0.06730896133034182,
"grad_norm": 82.6065902709961,
"learning_rate": 1.3430079155672822e-06,
"loss": 0.1824,
"num_input_tokens_seen": 251392,
"step": 510
},
{
"epoch": 0.06796885310809027,
"grad_norm": 29.751699447631836,
"learning_rate": 1.3562005277044855e-06,
"loss": 0.2085,
"num_input_tokens_seen": 253888,
"step": 515
},
{
"epoch": 0.06862874488583873,
"grad_norm": 26.04964828491211,
"learning_rate": 1.3693931398416888e-06,
"loss": 0.1502,
"num_input_tokens_seen": 256384,
"step": 520
},
{
"epoch": 0.06928863666358717,
"grad_norm": 60.13207244873047,
"learning_rate": 1.3825857519788916e-06,
"loss": 0.1721,
"num_input_tokens_seen": 258496,
"step": 525
},
{
"epoch": 0.06994852844133562,
"grad_norm": 0.15003204345703125,
"learning_rate": 1.3957783641160949e-06,
"loss": 0.0959,
"num_input_tokens_seen": 260864,
"step": 530
},
{
"epoch": 0.07060842021908408,
"grad_norm": 19.173173904418945,
"learning_rate": 1.4089709762532982e-06,
"loss": 0.0484,
"num_input_tokens_seen": 263360,
"step": 535
},
{
"epoch": 0.07126831199683252,
"grad_norm": 233.016357421875,
"learning_rate": 1.4221635883905012e-06,
"loss": 0.306,
"num_input_tokens_seen": 266112,
"step": 540
},
{
"epoch": 0.07192820377458096,
"grad_norm": 39.65471267700195,
"learning_rate": 1.4353562005277045e-06,
"loss": 0.2425,
"num_input_tokens_seen": 268544,
"step": 545
},
{
"epoch": 0.07258809555232942,
"grad_norm": 0.22139348089694977,
"learning_rate": 1.4485488126649078e-06,
"loss": 0.4216,
"num_input_tokens_seen": 270912,
"step": 550
},
{
"epoch": 0.07324798733007787,
"grad_norm": 0.2836262285709381,
"learning_rate": 1.4617414248021108e-06,
"loss": 0.0023,
"num_input_tokens_seen": 273664,
"step": 555
},
{
"epoch": 0.07390787910782631,
"grad_norm": 91.80794525146484,
"learning_rate": 1.4749340369393139e-06,
"loss": 0.1737,
"num_input_tokens_seen": 276160,
"step": 560
},
{
"epoch": 0.07456777088557477,
"grad_norm": 0.3804304897785187,
"learning_rate": 1.4881266490765171e-06,
"loss": 0.0057,
"num_input_tokens_seen": 278784,
"step": 565
},
{
"epoch": 0.07522766266332322,
"grad_norm": 25.789026260375977,
"learning_rate": 1.5013192612137202e-06,
"loss": 0.2339,
"num_input_tokens_seen": 281152,
"step": 570
},
{
"epoch": 0.07588755444107166,
"grad_norm": 56.50286865234375,
"learning_rate": 1.5145118733509235e-06,
"loss": 0.2223,
"num_input_tokens_seen": 283456,
"step": 575
},
{
"epoch": 0.07654744621882012,
"grad_norm": 3.2999558448791504,
"learning_rate": 1.5277044854881265e-06,
"loss": 0.0743,
"num_input_tokens_seen": 286016,
"step": 580
},
{
"epoch": 0.07720733799656856,
"grad_norm": 0.23049865663051605,
"learning_rate": 1.5408970976253298e-06,
"loss": 0.1493,
"num_input_tokens_seen": 288448,
"step": 585
},
{
"epoch": 0.07786722977431701,
"grad_norm": 0.022031376138329506,
"learning_rate": 1.5540897097625329e-06,
"loss": 0.1879,
"num_input_tokens_seen": 290816,
"step": 590
},
{
"epoch": 0.07852712155206547,
"grad_norm": 71.99144744873047,
"learning_rate": 1.567282321899736e-06,
"loss": 0.2187,
"num_input_tokens_seen": 293504,
"step": 595
},
{
"epoch": 0.07918701332981391,
"grad_norm": 0.02517612837255001,
"learning_rate": 1.5804749340369392e-06,
"loss": 0.1335,
"num_input_tokens_seen": 295744,
"step": 600
},
{
"epoch": 0.07984690510756236,
"grad_norm": 0.04881107434630394,
"learning_rate": 1.5936675461741425e-06,
"loss": 0.0816,
"num_input_tokens_seen": 298112,
"step": 605
},
{
"epoch": 0.0805067968853108,
"grad_norm": 197.50973510742188,
"learning_rate": 1.6068601583113455e-06,
"loss": 0.1134,
"num_input_tokens_seen": 300608,
"step": 610
},
{
"epoch": 0.08116668866305926,
"grad_norm": 0.06382615864276886,
"learning_rate": 1.6200527704485488e-06,
"loss": 0.0047,
"num_input_tokens_seen": 303360,
"step": 615
},
{
"epoch": 0.0818265804408077,
"grad_norm": 9.255777359008789,
"learning_rate": 1.633245382585752e-06,
"loss": 0.0712,
"num_input_tokens_seen": 305920,
"step": 620
},
{
"epoch": 0.08248647221855615,
"grad_norm": 11.119955062866211,
"learning_rate": 1.646437994722955e-06,
"loss": 0.0795,
"num_input_tokens_seen": 308416,
"step": 625
},
{
"epoch": 0.08314636399630461,
"grad_norm": 0.05398223549127579,
"learning_rate": 1.6596306068601582e-06,
"loss": 0.1324,
"num_input_tokens_seen": 310848,
"step": 630
},
{
"epoch": 0.08380625577405305,
"grad_norm": 55.00618362426758,
"learning_rate": 1.6728232189973614e-06,
"loss": 0.2123,
"num_input_tokens_seen": 313408,
"step": 635
},
{
"epoch": 0.0844661475518015,
"grad_norm": 111.69770050048828,
"learning_rate": 1.6860158311345645e-06,
"loss": 0.1099,
"num_input_tokens_seen": 315904,
"step": 640
},
{
"epoch": 0.08512603932954996,
"grad_norm": 48.804962158203125,
"learning_rate": 1.6992084432717678e-06,
"loss": 0.2301,
"num_input_tokens_seen": 318080,
"step": 645
},
{
"epoch": 0.0857859311072984,
"grad_norm": 6.783302307128906,
"learning_rate": 1.712401055408971e-06,
"loss": 0.0621,
"num_input_tokens_seen": 320256,
"step": 650
},
{
"epoch": 0.08644582288504685,
"grad_norm": 0.6253184676170349,
"learning_rate": 1.7255936675461739e-06,
"loss": 0.0199,
"num_input_tokens_seen": 322496,
"step": 655
},
{
"epoch": 0.0871057146627953,
"grad_norm": 257.97125244140625,
"learning_rate": 1.7387862796833772e-06,
"loss": 0.1723,
"num_input_tokens_seen": 325120,
"step": 660
},
{
"epoch": 0.08776560644054375,
"grad_norm": 29.855276107788086,
"learning_rate": 1.7519788918205804e-06,
"loss": 0.0485,
"num_input_tokens_seen": 327296,
"step": 665
},
{
"epoch": 0.0884254982182922,
"grad_norm": 42.55568313598633,
"learning_rate": 1.7651715039577835e-06,
"loss": 0.4327,
"num_input_tokens_seen": 329664,
"step": 670
},
{
"epoch": 0.08908538999604065,
"grad_norm": 23.53718376159668,
"learning_rate": 1.7783641160949868e-06,
"loss": 0.0918,
"num_input_tokens_seen": 332416,
"step": 675
},
{
"epoch": 0.0897452817737891,
"grad_norm": 0.3492559790611267,
"learning_rate": 1.79155672823219e-06,
"loss": 0.0255,
"num_input_tokens_seen": 334976,
"step": 680
},
{
"epoch": 0.09040517355153754,
"grad_norm": 0.0223238542675972,
"learning_rate": 1.8047493403693929e-06,
"loss": 0.0856,
"num_input_tokens_seen": 337472,
"step": 685
},
{
"epoch": 0.091065065329286,
"grad_norm": 8.461647033691406,
"learning_rate": 1.8179419525065961e-06,
"loss": 0.1861,
"num_input_tokens_seen": 339904,
"step": 690
},
{
"epoch": 0.09172495710703445,
"grad_norm": 131.66806030273438,
"learning_rate": 1.8311345646437994e-06,
"loss": 0.1639,
"num_input_tokens_seen": 342272,
"step": 695
},
{
"epoch": 0.09238484888478289,
"grad_norm": 0.0658893883228302,
"learning_rate": 1.8443271767810025e-06,
"loss": 0.1908,
"num_input_tokens_seen": 344640,
"step": 700
},
{
"epoch": 0.09304474066253135,
"grad_norm": 0.02002323418855667,
"learning_rate": 1.8575197889182057e-06,
"loss": 0.0427,
"num_input_tokens_seen": 347072,
"step": 705
},
{
"epoch": 0.0937046324402798,
"grad_norm": 0.023283669725060463,
"learning_rate": 1.870712401055409e-06,
"loss": 0.194,
"num_input_tokens_seen": 349696,
"step": 710
},
{
"epoch": 0.09436452421802824,
"grad_norm": 12.705941200256348,
"learning_rate": 1.883905013192612e-06,
"loss": 0.0821,
"num_input_tokens_seen": 352256,
"step": 715
},
{
"epoch": 0.0950244159957767,
"grad_norm": 0.16988928616046906,
"learning_rate": 1.8970976253298151e-06,
"loss": 0.1312,
"num_input_tokens_seen": 355008,
"step": 720
},
{
"epoch": 0.09568430777352514,
"grad_norm": 82.4719009399414,
"learning_rate": 1.9102902374670186e-06,
"loss": 0.2885,
"num_input_tokens_seen": 357376,
"step": 725
},
{
"epoch": 0.09634419955127359,
"grad_norm": 6.462850093841553,
"learning_rate": 1.9234828496042215e-06,
"loss": 0.1712,
"num_input_tokens_seen": 359680,
"step": 730
},
{
"epoch": 0.09700409132902205,
"grad_norm": 2.923388957977295,
"learning_rate": 1.9366754617414247e-06,
"loss": 0.1537,
"num_input_tokens_seen": 362176,
"step": 735
},
{
"epoch": 0.09766398310677049,
"grad_norm": 104.2777328491211,
"learning_rate": 1.949868073878628e-06,
"loss": 0.2028,
"num_input_tokens_seen": 365056,
"step": 740
},
{
"epoch": 0.09832387488451894,
"grad_norm": 2.3851282596588135,
"learning_rate": 1.963060686015831e-06,
"loss": 0.2106,
"num_input_tokens_seen": 367488,
"step": 745
},
{
"epoch": 0.0989837666622674,
"grad_norm": 1.8862087726593018,
"learning_rate": 1.976253298153034e-06,
"loss": 0.2852,
"num_input_tokens_seen": 369792,
"step": 750
},
{
"epoch": 0.09964365844001584,
"grad_norm": 102.61363220214844,
"learning_rate": 1.9894459102902374e-06,
"loss": 0.1154,
"num_input_tokens_seen": 372160,
"step": 755
},
{
"epoch": 0.1000395935066649,
"eval_loss": 0.129482701420784,
"eval_runtime": 7.7189,
"eval_samples_per_second": 872.532,
"eval_steps_per_second": 109.083,
"num_input_tokens_seen": 373504,
"step": 758
},
{
"epoch": 0.10030355021776428,
"grad_norm": 4.566295146942139,
"learning_rate": 1.9999998938723955e-06,
"loss": 0.0874,
"num_input_tokens_seen": 374272,
"step": 760
},
{
"epoch": 0.10096344199551274,
"grad_norm": 25.750286102294922,
"learning_rate": 1.9999961794086063e-06,
"loss": 0.0774,
"num_input_tokens_seen": 376704,
"step": 765
},
{
"epoch": 0.10162333377326119,
"grad_norm": 149.0970458984375,
"learning_rate": 1.999987158587122e-06,
"loss": 0.2165,
"num_input_tokens_seen": 379136,
"step": 770
},
{
"epoch": 0.10228322555100963,
"grad_norm": 47.778255462646484,
"learning_rate": 1.9999728314558114e-06,
"loss": 0.1505,
"num_input_tokens_seen": 381568,
"step": 775
},
{
"epoch": 0.10294311732875808,
"grad_norm": 0.1281862109899521,
"learning_rate": 1.9999531980906988e-06,
"loss": 0.2297,
"num_input_tokens_seen": 384128,
"step": 780
},
{
"epoch": 0.10360300910650654,
"grad_norm": 105.48400115966797,
"learning_rate": 1.999928258595967e-06,
"loss": 0.0893,
"num_input_tokens_seen": 386304,
"step": 785
},
{
"epoch": 0.10426290088425498,
"grad_norm": 16.267196655273438,
"learning_rate": 1.9998980131039534e-06,
"loss": 0.2538,
"num_input_tokens_seen": 388864,
"step": 790
},
{
"epoch": 0.10492279266200343,
"grad_norm": 34.18339920043945,
"learning_rate": 1.999862461775153e-06,
"loss": 0.0914,
"num_input_tokens_seen": 391104,
"step": 795
},
{
"epoch": 0.10558268443975188,
"grad_norm": 14.670069694519043,
"learning_rate": 1.999821604798214e-06,
"loss": 0.1431,
"num_input_tokens_seen": 393856,
"step": 800
},
{
"epoch": 0.10624257621750033,
"grad_norm": 32.27194595336914,
"learning_rate": 1.999775442389939e-06,
"loss": 0.3214,
"num_input_tokens_seen": 396352,
"step": 805
},
{
"epoch": 0.10690246799524877,
"grad_norm": 1.3998618125915527,
"learning_rate": 1.9997239747952843e-06,
"loss": 0.1422,
"num_input_tokens_seen": 398592,
"step": 810
},
{
"epoch": 0.10756235977299723,
"grad_norm": 177.2610321044922,
"learning_rate": 1.9996672022873546e-06,
"loss": 0.0609,
"num_input_tokens_seen": 401088,
"step": 815
},
{
"epoch": 0.10822225155074568,
"grad_norm": 182.47579956054688,
"learning_rate": 1.9996051251674073e-06,
"loss": 0.0726,
"num_input_tokens_seen": 403456,
"step": 820
},
{
"epoch": 0.10888214332849412,
"grad_norm": 31.635814666748047,
"learning_rate": 1.999537743764847e-06,
"loss": 0.1602,
"num_input_tokens_seen": 405696,
"step": 825
},
{
"epoch": 0.10954203510624258,
"grad_norm": 20.697343826293945,
"learning_rate": 1.999465058437225e-06,
"loss": 0.4649,
"num_input_tokens_seen": 408128,
"step": 830
},
{
"epoch": 0.11020192688399102,
"grad_norm": 0.6629725694656372,
"learning_rate": 1.9993870695702364e-06,
"loss": 0.0112,
"num_input_tokens_seen": 411008,
"step": 835
},
{
"epoch": 0.11086181866173947,
"grad_norm": 247.9231719970703,
"learning_rate": 1.9993037775777206e-06,
"loss": 0.3035,
"num_input_tokens_seen": 413312,
"step": 840
},
{
"epoch": 0.11152171043948793,
"grad_norm": 0.9605908989906311,
"learning_rate": 1.999215182901656e-06,
"loss": 0.1141,
"num_input_tokens_seen": 415616,
"step": 845
},
{
"epoch": 0.11218160221723637,
"grad_norm": 75.42913055419922,
"learning_rate": 1.9991212860121587e-06,
"loss": 0.1391,
"num_input_tokens_seen": 418368,
"step": 850
},
{
"epoch": 0.11284149399498482,
"grad_norm": 0.684021532535553,
"learning_rate": 1.999022087407482e-06,
"loss": 0.0502,
"num_input_tokens_seen": 420864,
"step": 855
},
{
"epoch": 0.11350138577273328,
"grad_norm": 61.68302536010742,
"learning_rate": 1.998917587614011e-06,
"loss": 0.3102,
"num_input_tokens_seen": 423040,
"step": 860
},
{
"epoch": 0.11416127755048172,
"grad_norm": 26.822439193725586,
"learning_rate": 1.9988077871862615e-06,
"loss": 0.3563,
"num_input_tokens_seen": 425344,
"step": 865
},
{
"epoch": 0.11482116932823017,
"grad_norm": 1.1649112701416016,
"learning_rate": 1.9986926867068752e-06,
"loss": 0.0052,
"num_input_tokens_seen": 427968,
"step": 870
},
{
"epoch": 0.11548106110597862,
"grad_norm": 0.3206559121608734,
"learning_rate": 1.998572286786619e-06,
"loss": 0.2265,
"num_input_tokens_seen": 430592,
"step": 875
},
{
"epoch": 0.11614095288372707,
"grad_norm": 51.00387954711914,
"learning_rate": 1.9984465880643807e-06,
"loss": 0.295,
"num_input_tokens_seen": 433152,
"step": 880
},
{
"epoch": 0.11680084466147551,
"grad_norm": 94.77568817138672,
"learning_rate": 1.998315591207165e-06,
"loss": 0.0961,
"num_input_tokens_seen": 435456,
"step": 885
},
{
"epoch": 0.11746073643922397,
"grad_norm": 14.036933898925781,
"learning_rate": 1.9981792969100912e-06,
"loss": 0.1703,
"num_input_tokens_seen": 438080,
"step": 890
},
{
"epoch": 0.11812062821697242,
"grad_norm": 8.309388160705566,
"learning_rate": 1.9980377058963875e-06,
"loss": 0.2036,
"num_input_tokens_seen": 440640,
"step": 895
},
{
"epoch": 0.11878051999472086,
"grad_norm": 2.4462878704071045,
"learning_rate": 1.99789081891739e-06,
"loss": 0.0225,
"num_input_tokens_seen": 443008,
"step": 900
},
{
"epoch": 0.11944041177246932,
"grad_norm": 77.5080337524414,
"learning_rate": 1.997738636752536e-06,
"loss": 0.2203,
"num_input_tokens_seen": 445312,
"step": 905
},
{
"epoch": 0.12010030355021777,
"grad_norm": 0.06471412628889084,
"learning_rate": 1.9975811602093624e-06,
"loss": 0.0016,
"num_input_tokens_seen": 447680,
"step": 910
},
{
"epoch": 0.12076019532796621,
"grad_norm": 123.2421875,
"learning_rate": 1.9974183901234984e-06,
"loss": 0.3289,
"num_input_tokens_seen": 450368,
"step": 915
},
{
"epoch": 0.12142008710571467,
"grad_norm": 5.337376594543457,
"learning_rate": 1.997250327358664e-06,
"loss": 0.3008,
"num_input_tokens_seen": 452800,
"step": 920
},
{
"epoch": 0.12207997888346311,
"grad_norm": 15.385493278503418,
"learning_rate": 1.997076972806664e-06,
"loss": 0.1352,
"num_input_tokens_seen": 455744,
"step": 925
},
{
"epoch": 0.12273987066121156,
"grad_norm": 60.07589340209961,
"learning_rate": 1.9968983273873827e-06,
"loss": 0.2869,
"num_input_tokens_seen": 458176,
"step": 930
},
{
"epoch": 0.12339976243896002,
"grad_norm": 1.007535457611084,
"learning_rate": 1.99671439204878e-06,
"loss": 0.1694,
"num_input_tokens_seen": 460480,
"step": 935
},
{
"epoch": 0.12405965421670846,
"grad_norm": 0.9021179676055908,
"learning_rate": 1.9965251677668873e-06,
"loss": 0.1448,
"num_input_tokens_seen": 462656,
"step": 940
},
{
"epoch": 0.1247195459944569,
"grad_norm": 130.83981323242188,
"learning_rate": 1.9963306555458e-06,
"loss": 0.2976,
"num_input_tokens_seen": 465344,
"step": 945
},
{
"epoch": 0.12537943777220537,
"grad_norm": 55.70017623901367,
"learning_rate": 1.9961308564176723e-06,
"loss": 0.3325,
"num_input_tokens_seen": 467712,
"step": 950
},
{
"epoch": 0.1260393295499538,
"grad_norm": 21.9194278717041,
"learning_rate": 1.9959257714427147e-06,
"loss": 0.1471,
"num_input_tokens_seen": 470080,
"step": 955
},
{
"epoch": 0.12669922132770225,
"grad_norm": 9.642888069152832,
"learning_rate": 1.995715401709186e-06,
"loss": 0.1476,
"num_input_tokens_seen": 472512,
"step": 960
},
{
"epoch": 0.1273591131054507,
"grad_norm": 22.075639724731445,
"learning_rate": 1.995499748333387e-06,
"loss": 0.1639,
"num_input_tokens_seen": 474752,
"step": 965
},
{
"epoch": 0.12801900488319914,
"grad_norm": 34.80426025390625,
"learning_rate": 1.9952788124596555e-06,
"loss": 0.1539,
"num_input_tokens_seen": 477440,
"step": 970
},
{
"epoch": 0.12867889666094762,
"grad_norm": 16.518495559692383,
"learning_rate": 1.9950525952603617e-06,
"loss": 0.1507,
"num_input_tokens_seen": 480000,
"step": 975
},
{
"epoch": 0.12933878843869606,
"grad_norm": 63.373817443847656,
"learning_rate": 1.994821097935899e-06,
"loss": 0.1434,
"num_input_tokens_seen": 482368,
"step": 980
},
{
"epoch": 0.1299986802164445,
"grad_norm": 18.813962936401367,
"learning_rate": 1.9945843217146804e-06,
"loss": 0.0706,
"num_input_tokens_seen": 484544,
"step": 985
},
{
"epoch": 0.13065857199419295,
"grad_norm": 0.16079047322273254,
"learning_rate": 1.9943422678531293e-06,
"loss": 0.1142,
"num_input_tokens_seen": 486720,
"step": 990
},
{
"epoch": 0.1313184637719414,
"grad_norm": 0.1510768085718155,
"learning_rate": 1.994094937635675e-06,
"loss": 0.0692,
"num_input_tokens_seen": 489344,
"step": 995
},
{
"epoch": 0.13197835554968984,
"grad_norm": 19.409828186035156,
"learning_rate": 1.9938423323747457e-06,
"loss": 0.0421,
"num_input_tokens_seen": 491776,
"step": 1000
},
{
"epoch": 0.1326382473274383,
"grad_norm": 170.2490997314453,
"learning_rate": 1.99358445341076e-06,
"loss": 0.1827,
"num_input_tokens_seen": 493952,
"step": 1005
},
{
"epoch": 0.13329813910518676,
"grad_norm": 15.425086975097656,
"learning_rate": 1.993321302112121e-06,
"loss": 0.2152,
"num_input_tokens_seen": 496320,
"step": 1010
},
{
"epoch": 0.1339580308829352,
"grad_norm": 117.909423828125,
"learning_rate": 1.993052879875209e-06,
"loss": 0.0299,
"num_input_tokens_seen": 498496,
"step": 1015
},
{
"epoch": 0.13461792266068365,
"grad_norm": 47.48206329345703,
"learning_rate": 1.992779188124374e-06,
"loss": 0.1351,
"num_input_tokens_seen": 501056,
"step": 1020
},
{
"epoch": 0.1352778144384321,
"grad_norm": 1.8825244903564453,
"learning_rate": 1.992500228311928e-06,
"loss": 0.0501,
"num_input_tokens_seen": 503296,
"step": 1025
},
{
"epoch": 0.13593770621618054,
"grad_norm": 12.106839179992676,
"learning_rate": 1.9922160019181372e-06,
"loss": 0.3259,
"num_input_tokens_seen": 505856,
"step": 1030
},
{
"epoch": 0.13659759799392898,
"grad_norm": 0.0899241715669632,
"learning_rate": 1.9919265104512138e-06,
"loss": 0.1532,
"num_input_tokens_seen": 508416,
"step": 1035
},
{
"epoch": 0.13725748977167745,
"grad_norm": 2.1223573684692383,
"learning_rate": 1.9916317554473094e-06,
"loss": 0.2708,
"num_input_tokens_seen": 511040,
"step": 1040
},
{
"epoch": 0.1379173815494259,
"grad_norm": 57.11883544921875,
"learning_rate": 1.9913317384705052e-06,
"loss": 0.188,
"num_input_tokens_seen": 513216,
"step": 1045
},
{
"epoch": 0.13857727332717434,
"grad_norm": 12.335477828979492,
"learning_rate": 1.991026461112805e-06,
"loss": 0.1146,
"num_input_tokens_seen": 515456,
"step": 1050
},
{
"epoch": 0.1392371651049228,
"grad_norm": 0.4140935242176056,
"learning_rate": 1.9907159249941257e-06,
"loss": 0.1353,
"num_input_tokens_seen": 517824,
"step": 1055
},
{
"epoch": 0.13989705688267123,
"grad_norm": 101.3670425415039,
"learning_rate": 1.990400131762289e-06,
"loss": 0.112,
"num_input_tokens_seen": 520320,
"step": 1060
},
{
"epoch": 0.14055694866041968,
"grad_norm": 0.5620743632316589,
"learning_rate": 1.9900790830930134e-06,
"loss": 0.0702,
"num_input_tokens_seen": 522752,
"step": 1065
},
{
"epoch": 0.14121684043816815,
"grad_norm": 2.9457738399505615,
"learning_rate": 1.9897527806899047e-06,
"loss": 0.1085,
"num_input_tokens_seen": 525376,
"step": 1070
},
{
"epoch": 0.1418767322159166,
"grad_norm": 628.5321655273438,
"learning_rate": 1.9894212262844465e-06,
"loss": 0.2922,
"num_input_tokens_seen": 527808,
"step": 1075
},
{
"epoch": 0.14253662399366504,
"grad_norm": 0.10223134607076645,
"learning_rate": 1.989084421635992e-06,
"loss": 0.1607,
"num_input_tokens_seen": 530304,
"step": 1080
},
{
"epoch": 0.14319651577141349,
"grad_norm": 9.022106170654297,
"learning_rate": 1.988742368531754e-06,
"loss": 0.2576,
"num_input_tokens_seen": 532480,
"step": 1085
},
{
"epoch": 0.14385640754916193,
"grad_norm": 3.4002270698547363,
"learning_rate": 1.9883950687867947e-06,
"loss": 0.0676,
"num_input_tokens_seen": 535168,
"step": 1090
},
{
"epoch": 0.14451629932691037,
"grad_norm": 19.492107391357422,
"learning_rate": 1.9880425242440187e-06,
"loss": 0.1067,
"num_input_tokens_seen": 537600,
"step": 1095
},
{
"epoch": 0.14517619110465885,
"grad_norm": 77.36679077148438,
"learning_rate": 1.9876847367741607e-06,
"loss": 0.1435,
"num_input_tokens_seen": 540096,
"step": 1100
},
{
"epoch": 0.1458360828824073,
"grad_norm": 0.1341482549905777,
"learning_rate": 1.987321708275776e-06,
"loss": 0.1568,
"num_input_tokens_seen": 542592,
"step": 1105
},
{
"epoch": 0.14649597466015574,
"grad_norm": 0.15566033124923706,
"learning_rate": 1.986953440675231e-06,
"loss": 0.0017,
"num_input_tokens_seen": 544960,
"step": 1110
},
{
"epoch": 0.14715586643790418,
"grad_norm": 0.3482903242111206,
"learning_rate": 1.9865799359266925e-06,
"loss": 0.0812,
"num_input_tokens_seen": 547136,
"step": 1115
},
{
"epoch": 0.14781575821565263,
"grad_norm": 0.914465606212616,
"learning_rate": 1.986201196012118e-06,
"loss": 0.0878,
"num_input_tokens_seen": 549440,
"step": 1120
},
{
"epoch": 0.14847564999340107,
"grad_norm": 57.043827056884766,
"learning_rate": 1.985817222941245e-06,
"loss": 0.2476,
"num_input_tokens_seen": 552064,
"step": 1125
},
{
"epoch": 0.14913554177114954,
"grad_norm": 10.63588809967041,
"learning_rate": 1.9854280187515794e-06,
"loss": 0.082,
"num_input_tokens_seen": 554432,
"step": 1130
},
{
"epoch": 0.149795433548898,
"grad_norm": 0.7898812890052795,
"learning_rate": 1.985033585508386e-06,
"loss": 0.0745,
"num_input_tokens_seen": 556800,
"step": 1135
},
{
"epoch": 0.15005939025999737,
"eval_loss": 0.19488762319087982,
"eval_runtime": 7.5589,
"eval_samples_per_second": 890.998,
"eval_steps_per_second": 111.391,
"num_input_tokens_seen": 557824,
"step": 1137
},
{
"epoch": 0.15045532532664643,
"grad_norm": 250.70848083496094,
"learning_rate": 1.9846339253046766e-06,
"loss": 0.5451,
"num_input_tokens_seen": 559296,
"step": 1140
},
{
"epoch": 0.15111521710439488,
"grad_norm": 27.038022994995117,
"learning_rate": 1.984229040261199e-06,
"loss": 0.1735,
"num_input_tokens_seen": 562112,
"step": 1145
},
{
"epoch": 0.15177510888214332,
"grad_norm": 0.19544407725334167,
"learning_rate": 1.9838189325264263e-06,
"loss": 0.2349,
"num_input_tokens_seen": 564288,
"step": 1150
},
{
"epoch": 0.15243500065989177,
"grad_norm": 199.0702667236328,
"learning_rate": 1.983403604276546e-06,
"loss": 0.0845,
"num_input_tokens_seen": 566848,
"step": 1155
},
{
"epoch": 0.15309489243764024,
"grad_norm": 44.44175720214844,
"learning_rate": 1.9829830577154457e-06,
"loss": 0.394,
"num_input_tokens_seen": 569152,
"step": 1160
},
{
"epoch": 0.15375478421538868,
"grad_norm": 19.46393585205078,
"learning_rate": 1.982557295074705e-06,
"loss": 0.0604,
"num_input_tokens_seen": 571456,
"step": 1165
},
{
"epoch": 0.15441467599313713,
"grad_norm": 0.14685490727424622,
"learning_rate": 1.982126318613581e-06,
"loss": 0.1545,
"num_input_tokens_seen": 573824,
"step": 1170
},
{
"epoch": 0.15507456777088557,
"grad_norm": 0.41161906719207764,
"learning_rate": 1.9816901306189977e-06,
"loss": 0.0016,
"num_input_tokens_seen": 576128,
"step": 1175
},
{
"epoch": 0.15573445954863402,
"grad_norm": 2.800428867340088,
"learning_rate": 1.9812487334055342e-06,
"loss": 0.139,
"num_input_tokens_seen": 578432,
"step": 1180
},
{
"epoch": 0.15639435132638246,
"grad_norm": 0.6683080196380615,
"learning_rate": 1.98080212931541e-06,
"loss": 0.1618,
"num_input_tokens_seen": 580736,
"step": 1185
},
{
"epoch": 0.15705424310413094,
"grad_norm": 0.13595707714557648,
"learning_rate": 1.980350320718476e-06,
"loss": 0.0846,
"num_input_tokens_seen": 583040,
"step": 1190
},
{
"epoch": 0.15771413488187938,
"grad_norm": 0.4379376471042633,
"learning_rate": 1.9798933100121985e-06,
"loss": 0.0073,
"num_input_tokens_seen": 585344,
"step": 1195
},
{
"epoch": 0.15837402665962783,
"grad_norm": 4.045234203338623,
"learning_rate": 1.97943109962165e-06,
"loss": 0.0793,
"num_input_tokens_seen": 587904,
"step": 1200
},
{
"epoch": 0.15903391843737627,
"grad_norm": 0.751695990562439,
"learning_rate": 1.978963691999493e-06,
"loss": 0.1511,
"num_input_tokens_seen": 590208,
"step": 1205
},
{
"epoch": 0.15969381021512472,
"grad_norm": 21.781272888183594,
"learning_rate": 1.978491089625969e-06,
"loss": 0.0853,
"num_input_tokens_seen": 592512,
"step": 1210
},
{
"epoch": 0.16035370199287316,
"grad_norm": 0.15117277204990387,
"learning_rate": 1.9780132950088854e-06,
"loss": 0.1785,
"num_input_tokens_seen": 595072,
"step": 1215
},
{
"epoch": 0.1610135937706216,
"grad_norm": 23.114465713500977,
"learning_rate": 1.9775303106836e-06,
"loss": 0.2842,
"num_input_tokens_seen": 597632,
"step": 1220
},
{
"epoch": 0.16167348554837008,
"grad_norm": 0.19639664888381958,
"learning_rate": 1.977042139213011e-06,
"loss": 0.0847,
"num_input_tokens_seen": 600192,
"step": 1225
},
{
"epoch": 0.16233337732611852,
"grad_norm": 0.22633503377437592,
"learning_rate": 1.9765487831875404e-06,
"loss": 0.0931,
"num_input_tokens_seen": 602304,
"step": 1230
},
{
"epoch": 0.16299326910386697,
"grad_norm": 0.8158997297286987,
"learning_rate": 1.9760502452251217e-06,
"loss": 0.1418,
"num_input_tokens_seen": 604608,
"step": 1235
},
{
"epoch": 0.1636531608816154,
"grad_norm": 9.417763710021973,
"learning_rate": 1.975546527971186e-06,
"loss": 0.1102,
"num_input_tokens_seen": 606976,
"step": 1240
},
{
"epoch": 0.16431305265936386,
"grad_norm": 0.3996043801307678,
"learning_rate": 1.9750376340986472e-06,
"loss": 0.0447,
"num_input_tokens_seen": 609600,
"step": 1245
},
{
"epoch": 0.1649729444371123,
"grad_norm": 17.319820404052734,
"learning_rate": 1.974523566307889e-06,
"loss": 0.1681,
"num_input_tokens_seen": 611840,
"step": 1250
},
{
"epoch": 0.16563283621486077,
"grad_norm": 17.101892471313477,
"learning_rate": 1.9740043273267487e-06,
"loss": 0.1085,
"num_input_tokens_seen": 614528,
"step": 1255
},
{
"epoch": 0.16629272799260922,
"grad_norm": 0.14512968063354492,
"learning_rate": 1.973479919910505e-06,
"loss": 0.0217,
"num_input_tokens_seen": 617024,
"step": 1260
},
{
"epoch": 0.16695261977035766,
"grad_norm": 26.45575523376465,
"learning_rate": 1.972950346841862e-06,
"loss": 0.1141,
"num_input_tokens_seen": 619392,
"step": 1265
},
{
"epoch": 0.1676125115481061,
"grad_norm": 46.64674758911133,
"learning_rate": 1.972415610930934e-06,
"loss": 0.0049,
"num_input_tokens_seen": 621888,
"step": 1270
},
{
"epoch": 0.16827240332585455,
"grad_norm": 48.495487213134766,
"learning_rate": 1.9718757150152324e-06,
"loss": 0.2469,
"num_input_tokens_seen": 624192,
"step": 1275
},
{
"epoch": 0.168932295103603,
"grad_norm": 0.485227108001709,
"learning_rate": 1.9713306619596488e-06,
"loss": 0.0511,
"num_input_tokens_seen": 626624,
"step": 1280
},
{
"epoch": 0.16959218688135147,
"grad_norm": 34.0601692199707,
"learning_rate": 1.9707804546564407e-06,
"loss": 0.0686,
"num_input_tokens_seen": 628928,
"step": 1285
},
{
"epoch": 0.17025207865909991,
"grad_norm": 2.3748066425323486,
"learning_rate": 1.9702250960252164e-06,
"loss": 0.0234,
"num_input_tokens_seen": 631616,
"step": 1290
},
{
"epoch": 0.17091197043684836,
"grad_norm": 0.013682112097740173,
"learning_rate": 1.969664589012918e-06,
"loss": 0.0015,
"num_input_tokens_seen": 634112,
"step": 1295
},
{
"epoch": 0.1715718622145968,
"grad_norm": 314.3330078125,
"learning_rate": 1.9690989365938077e-06,
"loss": 0.3855,
"num_input_tokens_seen": 636416,
"step": 1300
},
{
"epoch": 0.17223175399234525,
"grad_norm": 0.05270430073142052,
"learning_rate": 1.9685281417694513e-06,
"loss": 0.0051,
"num_input_tokens_seen": 638848,
"step": 1305
},
{
"epoch": 0.1728916457700937,
"grad_norm": 0.23324760794639587,
"learning_rate": 1.967952207568702e-06,
"loss": 0.1125,
"num_input_tokens_seen": 641216,
"step": 1310
},
{
"epoch": 0.17355153754784217,
"grad_norm": 0.26865366101264954,
"learning_rate": 1.967371137047685e-06,
"loss": 0.0011,
"num_input_tokens_seen": 644032,
"step": 1315
},
{
"epoch": 0.1742114293255906,
"grad_norm": 0.24145404994487762,
"learning_rate": 1.966784933289778e-06,
"loss": 0.1494,
"num_input_tokens_seen": 646528,
"step": 1320
},
{
"epoch": 0.17487132110333906,
"grad_norm": 0.08738990128040314,
"learning_rate": 1.9661935994056014e-06,
"loss": 0.1951,
"num_input_tokens_seen": 649088,
"step": 1325
},
{
"epoch": 0.1755312128810875,
"grad_norm": 0.9014714956283569,
"learning_rate": 1.965597138532996e-06,
"loss": 0.0093,
"num_input_tokens_seen": 651520,
"step": 1330
},
{
"epoch": 0.17619110465883595,
"grad_norm": 0.6617699265480042,
"learning_rate": 1.964995553837009e-06,
"loss": 0.0409,
"num_input_tokens_seen": 654016,
"step": 1335
},
{
"epoch": 0.1768509964365844,
"grad_norm": 0.015406480059027672,
"learning_rate": 1.964388848509875e-06,
"loss": 0.1143,
"num_input_tokens_seen": 656320,
"step": 1340
},
{
"epoch": 0.17751088821433286,
"grad_norm": 0.025758925825357437,
"learning_rate": 1.9637770257710026e-06,
"loss": 0.1683,
"num_input_tokens_seen": 658880,
"step": 1345
},
{
"epoch": 0.1781707799920813,
"grad_norm": 0.08237680792808533,
"learning_rate": 1.9631600888669545e-06,
"loss": 0.0205,
"num_input_tokens_seen": 661184,
"step": 1350
},
{
"epoch": 0.17883067176982975,
"grad_norm": 0.07278398424386978,
"learning_rate": 1.962538041071431e-06,
"loss": 0.0664,
"num_input_tokens_seen": 663680,
"step": 1355
},
{
"epoch": 0.1794905635475782,
"grad_norm": 25.769346237182617,
"learning_rate": 1.961910885685253e-06,
"loss": 0.0688,
"num_input_tokens_seen": 666048,
"step": 1360
},
{
"epoch": 0.18015045532532664,
"grad_norm": 164.78553771972656,
"learning_rate": 1.9612786260363436e-06,
"loss": 0.2636,
"num_input_tokens_seen": 668480,
"step": 1365
},
{
"epoch": 0.1808103471030751,
"grad_norm": 0.17773790657520294,
"learning_rate": 1.9606412654797116e-06,
"loss": 0.1108,
"num_input_tokens_seen": 671488,
"step": 1370
},
{
"epoch": 0.18147023888082353,
"grad_norm": 66.89860534667969,
"learning_rate": 1.9599988073974332e-06,
"loss": 0.1088,
"num_input_tokens_seen": 673920,
"step": 1375
},
{
"epoch": 0.182130130658572,
"grad_norm": 187.47903442382812,
"learning_rate": 1.959351255198634e-06,
"loss": 0.1413,
"num_input_tokens_seen": 676416,
"step": 1380
},
{
"epoch": 0.18279002243632045,
"grad_norm": 2.0588765144348145,
"learning_rate": 1.9586986123194704e-06,
"loss": 0.0008,
"num_input_tokens_seen": 679040,
"step": 1385
},
{
"epoch": 0.1834499142140689,
"grad_norm": 0.09783805161714554,
"learning_rate": 1.958040882223112e-06,
"loss": 0.1041,
"num_input_tokens_seen": 681920,
"step": 1390
},
{
"epoch": 0.18410980599181734,
"grad_norm": 0.06469712406396866,
"learning_rate": 1.9573780683997235e-06,
"loss": 0.04,
"num_input_tokens_seen": 684416,
"step": 1395
},
{
"epoch": 0.18476969776956578,
"grad_norm": 222.29971313476562,
"learning_rate": 1.956710174366445e-06,
"loss": 0.3574,
"num_input_tokens_seen": 686976,
"step": 1400
},
{
"epoch": 0.18542958954731423,
"grad_norm": 0.0895252674818039,
"learning_rate": 1.9560372036673764e-06,
"loss": 0.2731,
"num_input_tokens_seen": 689408,
"step": 1405
},
{
"epoch": 0.1860894813250627,
"grad_norm": 0.06232970580458641,
"learning_rate": 1.955359159873553e-06,
"loss": 0.0238,
"num_input_tokens_seen": 691712,
"step": 1410
},
{
"epoch": 0.18674937310281114,
"grad_norm": 0.0344870463013649,
"learning_rate": 1.954676046582932e-06,
"loss": 0.1341,
"num_input_tokens_seen": 694080,
"step": 1415
},
{
"epoch": 0.1874092648805596,
"grad_norm": 38.71489334106445,
"learning_rate": 1.9539878674203706e-06,
"loss": 0.2135,
"num_input_tokens_seen": 696640,
"step": 1420
},
{
"epoch": 0.18806915665830803,
"grad_norm": 0.06116657704114914,
"learning_rate": 1.9532946260376076e-06,
"loss": 0.0011,
"num_input_tokens_seen": 699136,
"step": 1425
},
{
"epoch": 0.18872904843605648,
"grad_norm": 53.41019821166992,
"learning_rate": 1.952596326113244e-06,
"loss": 0.3109,
"num_input_tokens_seen": 701696,
"step": 1430
},
{
"epoch": 0.18938894021380492,
"grad_norm": 1.2848087549209595,
"learning_rate": 1.9518929713527226e-06,
"loss": 0.1812,
"num_input_tokens_seen": 704384,
"step": 1435
},
{
"epoch": 0.1900488319915534,
"grad_norm": 2.0177323818206787,
"learning_rate": 1.9511845654883097e-06,
"loss": 0.0066,
"num_input_tokens_seen": 706560,
"step": 1440
},
{
"epoch": 0.19070872376930184,
"grad_norm": 0.4617765247821808,
"learning_rate": 1.9504711122790754e-06,
"loss": 0.0755,
"num_input_tokens_seen": 709248,
"step": 1445
},
{
"epoch": 0.19136861554705029,
"grad_norm": 45.93152618408203,
"learning_rate": 1.949752615510871e-06,
"loss": 0.2258,
"num_input_tokens_seen": 711296,
"step": 1450
},
{
"epoch": 0.19202850732479873,
"grad_norm": 0.20753158628940582,
"learning_rate": 1.949029078996313e-06,
"loss": 0.0457,
"num_input_tokens_seen": 713728,
"step": 1455
},
{
"epoch": 0.19268839910254718,
"grad_norm": 13.657062530517578,
"learning_rate": 1.9483005065747584e-06,
"loss": 0.1224,
"num_input_tokens_seen": 716224,
"step": 1460
},
{
"epoch": 0.19334829088029562,
"grad_norm": 13.369616508483887,
"learning_rate": 1.947566902112289e-06,
"loss": 0.3816,
"num_input_tokens_seen": 718528,
"step": 1465
},
{
"epoch": 0.1940081826580441,
"grad_norm": 60.77271270751953,
"learning_rate": 1.9468282695016863e-06,
"loss": 0.1841,
"num_input_tokens_seen": 720960,
"step": 1470
},
{
"epoch": 0.19466807443579254,
"grad_norm": 1.3714667558670044,
"learning_rate": 1.946084612662415e-06,
"loss": 0.1318,
"num_input_tokens_seen": 723200,
"step": 1475
},
{
"epoch": 0.19532796621354098,
"grad_norm": 114.1025619506836,
"learning_rate": 1.9453359355405987e-06,
"loss": 0.1708,
"num_input_tokens_seen": 725888,
"step": 1480
},
{
"epoch": 0.19598785799128943,
"grad_norm": 0.23408390581607819,
"learning_rate": 1.944582242109002e-06,
"loss": 0.0194,
"num_input_tokens_seen": 728256,
"step": 1485
},
{
"epoch": 0.19664774976903787,
"grad_norm": 0.22887404263019562,
"learning_rate": 1.943823536367006e-06,
"loss": 0.1454,
"num_input_tokens_seen": 730688,
"step": 1490
},
{
"epoch": 0.19730764154678632,
"grad_norm": 2.37292742729187,
"learning_rate": 1.9430598223405913e-06,
"loss": 0.1624,
"num_input_tokens_seen": 732992,
"step": 1495
},
{
"epoch": 0.1979675333245348,
"grad_norm": 0.2745613157749176,
"learning_rate": 1.9422911040823125e-06,
"loss": 0.1476,
"num_input_tokens_seen": 735424,
"step": 1500
},
{
"epoch": 0.19862742510228323,
"grad_norm": 132.48385620117188,
"learning_rate": 1.941517385671279e-06,
"loss": 0.3263,
"num_input_tokens_seen": 737664,
"step": 1505
},
{
"epoch": 0.19928731688003168,
"grad_norm": 0.1534176468849182,
"learning_rate": 1.940738671213134e-06,
"loss": 0.0942,
"num_input_tokens_seen": 740096,
"step": 1510
},
{
"epoch": 0.19994720865778012,
"grad_norm": 46.99830627441406,
"learning_rate": 1.93995496484003e-06,
"loss": 0.1712,
"num_input_tokens_seen": 742912,
"step": 1515
},
{
"epoch": 0.2000791870133298,
"eval_loss": 0.1068890318274498,
"eval_runtime": 7.6888,
"eval_samples_per_second": 875.951,
"eval_steps_per_second": 109.51,
"num_input_tokens_seen": 743424,
"step": 1516
},
{
"epoch": 0.20060710043552857,
"grad_norm": 160.11495971679688,
"learning_rate": 1.9391662707106092e-06,
"loss": 0.1021,
"num_input_tokens_seen": 745536,
"step": 1520
},
{
"epoch": 0.201266992213277,
"grad_norm": 0.16469451785087585,
"learning_rate": 1.9383725930099814e-06,
"loss": 0.0031,
"num_input_tokens_seen": 747968,
"step": 1525
},
{
"epoch": 0.20192688399102549,
"grad_norm": 0.772555947303772,
"learning_rate": 1.9375739359497e-06,
"loss": 0.1222,
"num_input_tokens_seen": 750464,
"step": 1530
},
{
"epoch": 0.20258677576877393,
"grad_norm": 0.41962626576423645,
"learning_rate": 1.936770303767741e-06,
"loss": 0.2416,
"num_input_tokens_seen": 752896,
"step": 1535
},
{
"epoch": 0.20324666754652237,
"grad_norm": 11.837217330932617,
"learning_rate": 1.9359617007284815e-06,
"loss": 0.1974,
"num_input_tokens_seen": 755648,
"step": 1540
},
{
"epoch": 0.20390655932427082,
"grad_norm": 9.827956199645996,
"learning_rate": 1.9351481311226738e-06,
"loss": 0.2312,
"num_input_tokens_seen": 758144,
"step": 1545
},
{
"epoch": 0.20456645110201926,
"grad_norm": 0.8918312788009644,
"learning_rate": 1.934329599267426e-06,
"loss": 0.1313,
"num_input_tokens_seen": 760704,
"step": 1550
},
{
"epoch": 0.2052263428797677,
"grad_norm": 43.78156280517578,
"learning_rate": 1.933506109506178e-06,
"loss": 0.0468,
"num_input_tokens_seen": 763136,
"step": 1555
},
{
"epoch": 0.20588623465751615,
"grad_norm": 1.698026418685913,
"learning_rate": 1.9326776662086765e-06,
"loss": 0.1132,
"num_input_tokens_seen": 766016,
"step": 1560
},
{
"epoch": 0.20654612643526463,
"grad_norm": 59.669952392578125,
"learning_rate": 1.9318442737709565e-06,
"loss": 0.3367,
"num_input_tokens_seen": 768512,
"step": 1565
},
{
"epoch": 0.20720601821301307,
"grad_norm": 0.267106831073761,
"learning_rate": 1.9310059366153116e-06,
"loss": 0.2047,
"num_input_tokens_seen": 770816,
"step": 1570
},
{
"epoch": 0.20786590999076152,
"grad_norm": 0.7591071724891663,
"learning_rate": 1.930162659190277e-06,
"loss": 0.2302,
"num_input_tokens_seen": 773312,
"step": 1575
},
{
"epoch": 0.20852580176850996,
"grad_norm": 1.2925443649291992,
"learning_rate": 1.9293144459706007e-06,
"loss": 0.0029,
"num_input_tokens_seen": 775680,
"step": 1580
},
{
"epoch": 0.2091856935462584,
"grad_norm": 17.853742599487305,
"learning_rate": 1.928461301457223e-06,
"loss": 0.1877,
"num_input_tokens_seen": 778048,
"step": 1585
},
{
"epoch": 0.20984558532400685,
"grad_norm": 0.08952134847640991,
"learning_rate": 1.92760323017725e-06,
"loss": 0.3027,
"num_input_tokens_seen": 780672,
"step": 1590
},
{
"epoch": 0.21050547710175532,
"grad_norm": 0.1787254512310028,
"learning_rate": 1.9267402366839338e-06,
"loss": 0.216,
"num_input_tokens_seen": 783360,
"step": 1595
},
{
"epoch": 0.21116536887950377,
"grad_norm": 9.013484954833984,
"learning_rate": 1.9258723255566433e-06,
"loss": 0.1268,
"num_input_tokens_seen": 785856,
"step": 1600
},
{
"epoch": 0.2118252606572522,
"grad_norm": 1.6822223663330078,
"learning_rate": 1.924999501400843e-06,
"loss": 0.1832,
"num_input_tokens_seen": 788480,
"step": 1605
},
{
"epoch": 0.21248515243500066,
"grad_norm": 0.5006535649299622,
"learning_rate": 1.924121768848068e-06,
"loss": 0.1511,
"num_input_tokens_seen": 791040,
"step": 1610
},
{
"epoch": 0.2131450442127491,
"grad_norm": 0.24185070395469666,
"learning_rate": 1.923239132555899e-06,
"loss": 0.1088,
"num_input_tokens_seen": 793600,
"step": 1615
},
{
"epoch": 0.21380493599049755,
"grad_norm": 1.2802025079727173,
"learning_rate": 1.9223515972079378e-06,
"loss": 0.1302,
"num_input_tokens_seen": 795968,
"step": 1620
},
{
"epoch": 0.21446482776824602,
"grad_norm": 6.617660999298096,
"learning_rate": 1.9214591675137813e-06,
"loss": 0.049,
"num_input_tokens_seen": 798272,
"step": 1625
},
{
"epoch": 0.21512471954599446,
"grad_norm": 111.40785217285156,
"learning_rate": 1.9205618482090003e-06,
"loss": 0.144,
"num_input_tokens_seen": 801024,
"step": 1630
},
{
"epoch": 0.2157846113237429,
"grad_norm": 22.670175552368164,
"learning_rate": 1.91965964405511e-06,
"loss": 0.1374,
"num_input_tokens_seen": 803584,
"step": 1635
},
{
"epoch": 0.21644450310149135,
"grad_norm": 56.14551544189453,
"learning_rate": 1.9187525598395457e-06,
"loss": 0.0117,
"num_input_tokens_seen": 805952,
"step": 1640
},
{
"epoch": 0.2171043948792398,
"grad_norm": 35.73996353149414,
"learning_rate": 1.9178406003756396e-06,
"loss": 0.1249,
"num_input_tokens_seen": 808512,
"step": 1645
},
{
"epoch": 0.21776428665698824,
"grad_norm": 68.0622787475586,
"learning_rate": 1.9169237705025936e-06,
"loss": 0.0819,
"num_input_tokens_seen": 811136,
"step": 1650
},
{
"epoch": 0.21842417843473672,
"grad_norm": 0.02525966428220272,
"learning_rate": 1.9160020750854533e-06,
"loss": 0.0183,
"num_input_tokens_seen": 813376,
"step": 1655
},
{
"epoch": 0.21908407021248516,
"grad_norm": 180.0795135498047,
"learning_rate": 1.915075519015083e-06,
"loss": 0.199,
"num_input_tokens_seen": 815872,
"step": 1660
},
{
"epoch": 0.2197439619902336,
"grad_norm": 1.1811161041259766,
"learning_rate": 1.914144107208139e-06,
"loss": 0.0725,
"num_input_tokens_seen": 818240,
"step": 1665
},
{
"epoch": 0.22040385376798205,
"grad_norm": 0.16843393445014954,
"learning_rate": 1.913207844607045e-06,
"loss": 0.0539,
"num_input_tokens_seen": 820736,
"step": 1670
},
{
"epoch": 0.2210637455457305,
"grad_norm": 21.720033645629883,
"learning_rate": 1.912266736179964e-06,
"loss": 0.2528,
"num_input_tokens_seen": 823616,
"step": 1675
},
{
"epoch": 0.22172363732347894,
"grad_norm": 8.987836837768555,
"learning_rate": 1.9113207869207727e-06,
"loss": 0.1707,
"num_input_tokens_seen": 826112,
"step": 1680
},
{
"epoch": 0.2223835291012274,
"grad_norm": 0.8188716769218445,
"learning_rate": 1.9103700018490365e-06,
"loss": 0.1356,
"num_input_tokens_seen": 828672,
"step": 1685
},
{
"epoch": 0.22304342087897586,
"grad_norm": 1.7912400960922241,
"learning_rate": 1.9094143860099787e-06,
"loss": 0.1711,
"num_input_tokens_seen": 831296,
"step": 1690
},
{
"epoch": 0.2237033126567243,
"grad_norm": 108.46529388427734,
"learning_rate": 1.9084539444744594e-06,
"loss": 0.0895,
"num_input_tokens_seen": 833856,
"step": 1695
},
{
"epoch": 0.22436320443447275,
"grad_norm": 0.1527111977338791,
"learning_rate": 1.907488682338944e-06,
"loss": 0.1324,
"num_input_tokens_seen": 836480,
"step": 1700
},
{
"epoch": 0.2250230962122212,
"grad_norm": 63.81155776977539,
"learning_rate": 1.9065186047254782e-06,
"loss": 0.0553,
"num_input_tokens_seen": 838976,
"step": 1705
},
{
"epoch": 0.22568298798996964,
"grad_norm": 0.6900471448898315,
"learning_rate": 1.9055437167816604e-06,
"loss": 0.2205,
"num_input_tokens_seen": 841728,
"step": 1710
},
{
"epoch": 0.22634287976771808,
"grad_norm": 0.05360851809382439,
"learning_rate": 1.9045640236806149e-06,
"loss": 0.0143,
"num_input_tokens_seen": 843968,
"step": 1715
},
{
"epoch": 0.22700277154546655,
"grad_norm": 0.35860204696655273,
"learning_rate": 1.903579530620963e-06,
"loss": 0.3401,
"num_input_tokens_seen": 846464,
"step": 1720
},
{
"epoch": 0.227662663323215,
"grad_norm": 0.1349165141582489,
"learning_rate": 1.9025902428267975e-06,
"loss": 0.0967,
"num_input_tokens_seen": 849088,
"step": 1725
},
{
"epoch": 0.22832255510096344,
"grad_norm": 0.09187756478786469,
"learning_rate": 1.901596165547653e-06,
"loss": 0.2082,
"num_input_tokens_seen": 851712,
"step": 1730
},
{
"epoch": 0.2289824468787119,
"grad_norm": 0.5252279043197632,
"learning_rate": 1.9005973040584796e-06,
"loss": 0.102,
"num_input_tokens_seen": 854208,
"step": 1735
},
{
"epoch": 0.22964233865646033,
"grad_norm": 0.3394613564014435,
"learning_rate": 1.8995936636596138e-06,
"loss": 0.088,
"num_input_tokens_seen": 856576,
"step": 1740
},
{
"epoch": 0.23030223043420878,
"grad_norm": 0.196676105260849,
"learning_rate": 1.8985852496767504e-06,
"loss": 0.1348,
"num_input_tokens_seen": 859008,
"step": 1745
},
{
"epoch": 0.23096212221195725,
"grad_norm": 83.27044677734375,
"learning_rate": 1.897572067460916e-06,
"loss": 0.1643,
"num_input_tokens_seen": 861440,
"step": 1750
},
{
"epoch": 0.2316220139897057,
"grad_norm": 1.6085481643676758,
"learning_rate": 1.8965541223884377e-06,
"loss": 0.0848,
"num_input_tokens_seen": 863936,
"step": 1755
},
{
"epoch": 0.23228190576745414,
"grad_norm": 23.993480682373047,
"learning_rate": 1.8955314198609171e-06,
"loss": 0.1238,
"num_input_tokens_seen": 866176,
"step": 1760
},
{
"epoch": 0.23294179754520258,
"grad_norm": 0.1617557555437088,
"learning_rate": 1.8945039653052005e-06,
"loss": 0.0977,
"num_input_tokens_seen": 868480,
"step": 1765
},
{
"epoch": 0.23360168932295103,
"grad_norm": 0.15750542283058167,
"learning_rate": 1.8934717641733498e-06,
"loss": 0.0877,
"num_input_tokens_seen": 870976,
"step": 1770
},
{
"epoch": 0.23426158110069947,
"grad_norm": 38.80494689941406,
"learning_rate": 1.8924348219426143e-06,
"loss": 0.2471,
"num_input_tokens_seen": 873088,
"step": 1775
},
{
"epoch": 0.23492147287844795,
"grad_norm": 11.684532165527344,
"learning_rate": 1.8913931441154016e-06,
"loss": 0.2694,
"num_input_tokens_seen": 875520,
"step": 1780
},
{
"epoch": 0.2355813646561964,
"grad_norm": 32.83953857421875,
"learning_rate": 1.8903467362192482e-06,
"loss": 0.0401,
"num_input_tokens_seen": 877632,
"step": 1785
},
{
"epoch": 0.23624125643394484,
"grad_norm": 97.62303161621094,
"learning_rate": 1.8892956038067895e-06,
"loss": 0.0696,
"num_input_tokens_seen": 880000,
"step": 1790
},
{
"epoch": 0.23690114821169328,
"grad_norm": 97.3688735961914,
"learning_rate": 1.8882397524557317e-06,
"loss": 0.0238,
"num_input_tokens_seen": 882176,
"step": 1795
},
{
"epoch": 0.23756103998944172,
"grad_norm": 0.09657946974039078,
"learning_rate": 1.8871791877688208e-06,
"loss": 0.0642,
"num_input_tokens_seen": 884800,
"step": 1800
},
{
"epoch": 0.23822093176719017,
"grad_norm": 0.5457859635353088,
"learning_rate": 1.8861139153738143e-06,
"loss": 0.0068,
"num_input_tokens_seen": 887104,
"step": 1805
},
{
"epoch": 0.23888082354493864,
"grad_norm": 2.4806833267211914,
"learning_rate": 1.8850439409234498e-06,
"loss": 0.0012,
"num_input_tokens_seen": 889408,
"step": 1810
},
{
"epoch": 0.2395407153226871,
"grad_norm": 156.25328063964844,
"learning_rate": 1.8839692700954161e-06,
"loss": 0.1943,
"num_input_tokens_seen": 891648,
"step": 1815
},
{
"epoch": 0.24020060710043553,
"grad_norm": 184.66175842285156,
"learning_rate": 1.8828899085923234e-06,
"loss": 0.3211,
"num_input_tokens_seen": 894208,
"step": 1820
},
{
"epoch": 0.24086049887818398,
"grad_norm": 0.037798941135406494,
"learning_rate": 1.881805862141671e-06,
"loss": 0.2085,
"num_input_tokens_seen": 896704,
"step": 1825
},
{
"epoch": 0.24152039065593242,
"grad_norm": 0.04176515340805054,
"learning_rate": 1.8807171364958196e-06,
"loss": 0.082,
"num_input_tokens_seen": 899264,
"step": 1830
},
{
"epoch": 0.24218028243368087,
"grad_norm": 0.2008335441350937,
"learning_rate": 1.879623737431959e-06,
"loss": 0.0206,
"num_input_tokens_seen": 901760,
"step": 1835
},
{
"epoch": 0.24284017421142934,
"grad_norm": 62.211387634277344,
"learning_rate": 1.8785256707520778e-06,
"loss": 0.3077,
"num_input_tokens_seen": 903872,
"step": 1840
},
{
"epoch": 0.24350006598917778,
"grad_norm": 0.0855240523815155,
"learning_rate": 1.8774229422829325e-06,
"loss": 0.0012,
"num_input_tokens_seen": 906368,
"step": 1845
},
{
"epoch": 0.24415995776692623,
"grad_norm": 0.062163472175598145,
"learning_rate": 1.8763155578760181e-06,
"loss": 0.0491,
"num_input_tokens_seen": 908864,
"step": 1850
},
{
"epoch": 0.24481984954467467,
"grad_norm": 0.138889878988266,
"learning_rate": 1.8752035234075336e-06,
"loss": 0.0892,
"num_input_tokens_seen": 911040,
"step": 1855
},
{
"epoch": 0.24547974132242312,
"grad_norm": 21.301368713378906,
"learning_rate": 1.8740868447783554e-06,
"loss": 0.1932,
"num_input_tokens_seen": 913408,
"step": 1860
},
{
"epoch": 0.24613963310017156,
"grad_norm": 77.18330383300781,
"learning_rate": 1.8729655279140012e-06,
"loss": 0.2285,
"num_input_tokens_seen": 915968,
"step": 1865
},
{
"epoch": 0.24679952487792003,
"grad_norm": 28.638566970825195,
"learning_rate": 1.8718395787646029e-06,
"loss": 0.1745,
"num_input_tokens_seen": 918528,
"step": 1870
},
{
"epoch": 0.24745941665566848,
"grad_norm": 0.12477682530879974,
"learning_rate": 1.870709003304872e-06,
"loss": 0.0009,
"num_input_tokens_seen": 921152,
"step": 1875
},
{
"epoch": 0.24811930843341692,
"grad_norm": 0.03698311001062393,
"learning_rate": 1.8695738075340693e-06,
"loss": 0.0005,
"num_input_tokens_seen": 923520,
"step": 1880
},
{
"epoch": 0.24877920021116537,
"grad_norm": 0.9180229902267456,
"learning_rate": 1.8684339974759723e-06,
"loss": 0.1696,
"num_input_tokens_seen": 925888,
"step": 1885
},
{
"epoch": 0.2494390919889138,
"grad_norm": 14.323315620422363,
"learning_rate": 1.8672895791788445e-06,
"loss": 0.0881,
"num_input_tokens_seen": 928704,
"step": 1890
},
{
"epoch": 0.2500989837666623,
"grad_norm": 118.73922729492188,
"learning_rate": 1.8661405587154017e-06,
"loss": 0.2865,
"num_input_tokens_seen": 930944,
"step": 1895
},
{
"epoch": 0.2500989837666623,
"eval_loss": 0.12773367762565613,
"eval_runtime": 7.6378,
"eval_samples_per_second": 881.797,
"eval_steps_per_second": 110.241,
"num_input_tokens_seen": 930944,
"step": 1895
},
{
"epoch": 0.25075887554441073,
"grad_norm": 12.755705833435059,
"learning_rate": 1.8649869421827808e-06,
"loss": 0.2389,
"num_input_tokens_seen": 933376,
"step": 1900
},
{
"epoch": 0.2514187673221592,
"grad_norm": 0.946739137172699,
"learning_rate": 1.863828735702507e-06,
"loss": 0.0517,
"num_input_tokens_seen": 936000,
"step": 1905
},
{
"epoch": 0.2520786590999076,
"grad_norm": 19.37730598449707,
"learning_rate": 1.862665945420462e-06,
"loss": 0.0611,
"num_input_tokens_seen": 938432,
"step": 1910
},
{
"epoch": 0.25273855087765607,
"grad_norm": 4.929696559906006,
"learning_rate": 1.8614985775068498e-06,
"loss": 0.0838,
"num_input_tokens_seen": 941312,
"step": 1915
},
{
"epoch": 0.2533984426554045,
"grad_norm": 59.3293342590332,
"learning_rate": 1.860326638156167e-06,
"loss": 0.0099,
"num_input_tokens_seen": 943488,
"step": 1920
},
{
"epoch": 0.25405833443315295,
"grad_norm": 32.63521194458008,
"learning_rate": 1.8591501335871653e-06,
"loss": 0.1064,
"num_input_tokens_seen": 945856,
"step": 1925
},
{
"epoch": 0.2547182262109014,
"grad_norm": 163.6297149658203,
"learning_rate": 1.857969070042824e-06,
"loss": 0.2861,
"num_input_tokens_seen": 948352,
"step": 1930
},
{
"epoch": 0.25537811798864984,
"grad_norm": 0.6843308210372925,
"learning_rate": 1.8567834537903116e-06,
"loss": 0.0541,
"num_input_tokens_seen": 950976,
"step": 1935
},
{
"epoch": 0.2560380097663983,
"grad_norm": 15.151936531066895,
"learning_rate": 1.8555932911209565e-06,
"loss": 0.1499,
"num_input_tokens_seen": 953216,
"step": 1940
},
{
"epoch": 0.25669790154414673,
"grad_norm": 1.8535500764846802,
"learning_rate": 1.8543985883502119e-06,
"loss": 0.0338,
"num_input_tokens_seen": 955648,
"step": 1945
},
{
"epoch": 0.25735779332189523,
"grad_norm": 0.5665189623832703,
"learning_rate": 1.8531993518176216e-06,
"loss": 0.0462,
"num_input_tokens_seen": 957888,
"step": 1950
},
{
"epoch": 0.2580176850996437,
"grad_norm": 91.90030670166016,
"learning_rate": 1.8519955878867889e-06,
"loss": 0.1275,
"num_input_tokens_seen": 960128,
"step": 1955
},
{
"epoch": 0.2586775768773921,
"grad_norm": 0.0542287677526474,
"learning_rate": 1.8507873029453392e-06,
"loss": 0.1778,
"num_input_tokens_seen": 962496,
"step": 1960
},
{
"epoch": 0.25933746865514057,
"grad_norm": 9.215625762939453,
"learning_rate": 1.8495745034048896e-06,
"loss": 0.2342,
"num_input_tokens_seen": 965120,
"step": 1965
},
{
"epoch": 0.259997360432889,
"grad_norm": 0.16024070978164673,
"learning_rate": 1.8483571957010127e-06,
"loss": 0.0074,
"num_input_tokens_seen": 967616,
"step": 1970
},
{
"epoch": 0.26065725221063746,
"grad_norm": 0.10168848931789398,
"learning_rate": 1.8471353862932035e-06,
"loss": 0.0688,
"num_input_tokens_seen": 970240,
"step": 1975
},
{
"epoch": 0.2613171439883859,
"grad_norm": 71.9769515991211,
"learning_rate": 1.8459090816648444e-06,
"loss": 0.1752,
"num_input_tokens_seen": 972544,
"step": 1980
},
{
"epoch": 0.26197703576613435,
"grad_norm": 0.1407454013824463,
"learning_rate": 1.8446782883231713e-06,
"loss": 0.2913,
"num_input_tokens_seen": 974912,
"step": 1985
},
{
"epoch": 0.2626369275438828,
"grad_norm": 17.948266983032227,
"learning_rate": 1.8434430127992387e-06,
"loss": 0.3162,
"num_input_tokens_seen": 977088,
"step": 1990
},
{
"epoch": 0.26329681932163124,
"grad_norm": 32.353912353515625,
"learning_rate": 1.8422032616478857e-06,
"loss": 0.1709,
"num_input_tokens_seen": 979648,
"step": 1995
},
{
"epoch": 0.2639567110993797,
"grad_norm": 1.2442036867141724,
"learning_rate": 1.8409590414477001e-06,
"loss": 0.1159,
"num_input_tokens_seen": 982336,
"step": 2000
},
{
"epoch": 0.2646166028771281,
"grad_norm": 3.403188705444336,
"learning_rate": 1.839710358800985e-06,
"loss": 0.0056,
"num_input_tokens_seen": 984768,
"step": 2005
},
{
"epoch": 0.2652764946548766,
"grad_norm": 5.241207599639893,
"learning_rate": 1.8384572203337224e-06,
"loss": 0.0349,
"num_input_tokens_seen": 987136,
"step": 2010
},
{
"epoch": 0.26593638643262507,
"grad_norm": 0.26890337467193604,
"learning_rate": 1.837199632695538e-06,
"loss": 0.1309,
"num_input_tokens_seen": 989824,
"step": 2015
},
{
"epoch": 0.2665962782103735,
"grad_norm": 53.87063217163086,
"learning_rate": 1.8359376025596682e-06,
"loss": 0.3374,
"num_input_tokens_seen": 992064,
"step": 2020
},
{
"epoch": 0.26725616998812196,
"grad_norm": 17.814453125,
"learning_rate": 1.8346711366229215e-06,
"loss": 0.1366,
"num_input_tokens_seen": 994368,
"step": 2025
},
{
"epoch": 0.2679160617658704,
"grad_norm": 18.101577758789062,
"learning_rate": 1.8334002416056442e-06,
"loss": 0.215,
"num_input_tokens_seen": 996864,
"step": 2030
},
{
"epoch": 0.26857595354361885,
"grad_norm": 0.25549983978271484,
"learning_rate": 1.8321249242516865e-06,
"loss": 0.2084,
"num_input_tokens_seen": 999360,
"step": 2035
},
{
"epoch": 0.2692358453213673,
"grad_norm": 0.35009151697158813,
"learning_rate": 1.8308451913283638e-06,
"loss": 0.0868,
"num_input_tokens_seen": 1001920,
"step": 2040
},
{
"epoch": 0.26989573709911574,
"grad_norm": 0.3472491502761841,
"learning_rate": 1.8295610496264229e-06,
"loss": 0.0602,
"num_input_tokens_seen": 1004224,
"step": 2045
},
{
"epoch": 0.2705556288768642,
"grad_norm": 0.34922727942466736,
"learning_rate": 1.828272505960005e-06,
"loss": 0.0027,
"num_input_tokens_seen": 1006528,
"step": 2050
},
{
"epoch": 0.27121552065461263,
"grad_norm": 0.13754220306873322,
"learning_rate": 1.8269795671666098e-06,
"loss": 0.1856,
"num_input_tokens_seen": 1008896,
"step": 2055
},
{
"epoch": 0.2718754124323611,
"grad_norm": 2.371704339981079,
"learning_rate": 1.8256822401070591e-06,
"loss": 0.1347,
"num_input_tokens_seen": 1011648,
"step": 2060
},
{
"epoch": 0.2725353042101095,
"grad_norm": 125.92493438720703,
"learning_rate": 1.8243805316654611e-06,
"loss": 0.0254,
"num_input_tokens_seen": 1014208,
"step": 2065
},
{
"epoch": 0.27319519598785796,
"grad_norm": 6.873655796051025,
"learning_rate": 1.823074448749172e-06,
"loss": 0.2187,
"num_input_tokens_seen": 1016640,
"step": 2070
},
{
"epoch": 0.27385508776560646,
"grad_norm": 0.0956018716096878,
"learning_rate": 1.8217639982887623e-06,
"loss": 0.0403,
"num_input_tokens_seen": 1019328,
"step": 2075
},
{
"epoch": 0.2745149795433549,
"grad_norm": 0.05648243427276611,
"learning_rate": 1.8204491872379769e-06,
"loss": 0.0603,
"num_input_tokens_seen": 1021696,
"step": 2080
},
{
"epoch": 0.27517487132110335,
"grad_norm": 148.0998077392578,
"learning_rate": 1.8191300225737e-06,
"loss": 0.0996,
"num_input_tokens_seen": 1024256,
"step": 2085
},
{
"epoch": 0.2758347630988518,
"grad_norm": 23.68135643005371,
"learning_rate": 1.8178065112959184e-06,
"loss": 0.2261,
"num_input_tokens_seen": 1026560,
"step": 2090
},
{
"epoch": 0.27649465487660024,
"grad_norm": 23.90264129638672,
"learning_rate": 1.8164786604276832e-06,
"loss": 0.3078,
"num_input_tokens_seen": 1029184,
"step": 2095
},
{
"epoch": 0.2771545466543487,
"grad_norm": 1.5052696466445923,
"learning_rate": 1.8151464770150727e-06,
"loss": 0.1119,
"num_input_tokens_seen": 1031744,
"step": 2100
},
{
"epoch": 0.27781443843209713,
"grad_norm": 0.3358094096183777,
"learning_rate": 1.8138099681271558e-06,
"loss": 0.2357,
"num_input_tokens_seen": 1034048,
"step": 2105
},
{
"epoch": 0.2784743302098456,
"grad_norm": 0.09559042006731033,
"learning_rate": 1.8124691408559536e-06,
"loss": 0.1489,
"num_input_tokens_seen": 1036544,
"step": 2110
},
{
"epoch": 0.279134221987594,
"grad_norm": 0.12438903003931046,
"learning_rate": 1.8111240023164023e-06,
"loss": 0.1057,
"num_input_tokens_seen": 1038848,
"step": 2115
},
{
"epoch": 0.27979411376534247,
"grad_norm": 0.17464138567447662,
"learning_rate": 1.809774559646316e-06,
"loss": 0.0049,
"num_input_tokens_seen": 1041152,
"step": 2120
},
{
"epoch": 0.2804540055430909,
"grad_norm": 15.5991792678833,
"learning_rate": 1.8084208200063469e-06,
"loss": 0.1192,
"num_input_tokens_seen": 1043968,
"step": 2125
},
{
"epoch": 0.28111389732083936,
"grad_norm": 16.062332153320312,
"learning_rate": 1.8070627905799496e-06,
"loss": 0.2678,
"num_input_tokens_seen": 1046272,
"step": 2130
},
{
"epoch": 0.28177378909858786,
"grad_norm": 38.3685302734375,
"learning_rate": 1.8057004785733413e-06,
"loss": 0.0892,
"num_input_tokens_seen": 1048448,
"step": 2135
},
{
"epoch": 0.2824336808763363,
"grad_norm": 1.2147469520568848,
"learning_rate": 1.8043338912154647e-06,
"loss": 0.171,
"num_input_tokens_seen": 1051072,
"step": 2140
},
{
"epoch": 0.28309357265408475,
"grad_norm": 2.57453989982605,
"learning_rate": 1.8029630357579486e-06,
"loss": 0.0537,
"num_input_tokens_seen": 1053312,
"step": 2145
},
{
"epoch": 0.2837534644318332,
"grad_norm": 0.08693689852952957,
"learning_rate": 1.8015879194750702e-06,
"loss": 0.0727,
"num_input_tokens_seen": 1055680,
"step": 2150
},
{
"epoch": 0.28441335620958164,
"grad_norm": 0.2852920889854431,
"learning_rate": 1.8002085496637165e-06,
"loss": 0.1279,
"num_input_tokens_seen": 1057984,
"step": 2155
},
{
"epoch": 0.2850732479873301,
"grad_norm": 0.3056880831718445,
"learning_rate": 1.7988249336433448e-06,
"loss": 0.1195,
"num_input_tokens_seen": 1060736,
"step": 2160
},
{
"epoch": 0.2857331397650785,
"grad_norm": 40.0366096496582,
"learning_rate": 1.7974370787559447e-06,
"loss": 0.1191,
"num_input_tokens_seen": 1063424,
"step": 2165
},
{
"epoch": 0.28639303154282697,
"grad_norm": 0.06313258409500122,
"learning_rate": 1.796044992365999e-06,
"loss": 0.0407,
"num_input_tokens_seen": 1065728,
"step": 2170
},
{
"epoch": 0.2870529233205754,
"grad_norm": 0.0497964546084404,
"learning_rate": 1.794648681860444e-06,
"loss": 0.0343,
"num_input_tokens_seen": 1068160,
"step": 2175
},
{
"epoch": 0.28771281509832386,
"grad_norm": 0.049737598747015,
"learning_rate": 1.7932481546486312e-06,
"loss": 0.2582,
"num_input_tokens_seen": 1070592,
"step": 2180
},
{
"epoch": 0.2883727068760723,
"grad_norm": 15.30761432647705,
"learning_rate": 1.791843418162287e-06,
"loss": 0.161,
"num_input_tokens_seen": 1073280,
"step": 2185
},
{
"epoch": 0.28903259865382075,
"grad_norm": 2.0795376300811768,
"learning_rate": 1.7904344798554748e-06,
"loss": 0.0127,
"num_input_tokens_seen": 1075584,
"step": 2190
},
{
"epoch": 0.28969249043156925,
"grad_norm": 2.7730255126953125,
"learning_rate": 1.789021347204553e-06,
"loss": 0.0962,
"num_input_tokens_seen": 1078016,
"step": 2195
},
{
"epoch": 0.2903523822093177,
"grad_norm": 42.26045608520508,
"learning_rate": 1.7876040277081381e-06,
"loss": 0.1665,
"num_input_tokens_seen": 1080512,
"step": 2200
},
{
"epoch": 0.29101227398706614,
"grad_norm": 49.438175201416016,
"learning_rate": 1.7861825288870632e-06,
"loss": 0.1979,
"num_input_tokens_seen": 1082752,
"step": 2205
},
{
"epoch": 0.2916721657648146,
"grad_norm": 19.054122924804688,
"learning_rate": 1.7847568582843376e-06,
"loss": 0.3436,
"num_input_tokens_seen": 1085184,
"step": 2210
},
{
"epoch": 0.29233205754256303,
"grad_norm": 13.935524940490723,
"learning_rate": 1.7833270234651088e-06,
"loss": 0.1458,
"num_input_tokens_seen": 1087360,
"step": 2215
},
{
"epoch": 0.2929919493203115,
"grad_norm": 12.456001281738281,
"learning_rate": 1.781893032016621e-06,
"loss": 0.0619,
"num_input_tokens_seen": 1089984,
"step": 2220
},
{
"epoch": 0.2936518410980599,
"grad_norm": 0.7875421643257141,
"learning_rate": 1.7804548915481746e-06,
"loss": 0.0185,
"num_input_tokens_seen": 1092608,
"step": 2225
},
{
"epoch": 0.29431173287580836,
"grad_norm": 0.38268232345581055,
"learning_rate": 1.7790126096910865e-06,
"loss": 0.1235,
"num_input_tokens_seen": 1095040,
"step": 2230
},
{
"epoch": 0.2949716246535568,
"grad_norm": 8.145883560180664,
"learning_rate": 1.7775661940986492e-06,
"loss": 0.064,
"num_input_tokens_seen": 1097728,
"step": 2235
},
{
"epoch": 0.29563151643130525,
"grad_norm": 4.164917469024658,
"learning_rate": 1.776115652446091e-06,
"loss": 0.2202,
"num_input_tokens_seen": 1100096,
"step": 2240
},
{
"epoch": 0.2962914082090537,
"grad_norm": 51.62826919555664,
"learning_rate": 1.7746609924305336e-06,
"loss": 0.1252,
"num_input_tokens_seen": 1102400,
"step": 2245
},
{
"epoch": 0.29695129998680214,
"grad_norm": 0.07670289278030396,
"learning_rate": 1.7732022217709534e-06,
"loss": 0.1016,
"num_input_tokens_seen": 1104960,
"step": 2250
},
{
"epoch": 0.2976111917645506,
"grad_norm": 9.224966049194336,
"learning_rate": 1.7717393482081384e-06,
"loss": 0.0905,
"num_input_tokens_seen": 1107520,
"step": 2255
},
{
"epoch": 0.2982710835422991,
"grad_norm": 26.702306747436523,
"learning_rate": 1.7702723795046492e-06,
"loss": 0.1454,
"num_input_tokens_seen": 1109952,
"step": 2260
},
{
"epoch": 0.29893097532004753,
"grad_norm": 0.32974258065223694,
"learning_rate": 1.7688013234447757e-06,
"loss": 0.0226,
"num_input_tokens_seen": 1112128,
"step": 2265
},
{
"epoch": 0.299590867097796,
"grad_norm": 0.09137266874313354,
"learning_rate": 1.7673261878344973e-06,
"loss": 0.1225,
"num_input_tokens_seen": 1114688,
"step": 2270
},
{
"epoch": 0.30011878051999474,
"eval_loss": 0.10979828983545303,
"eval_runtime": 7.5343,
"eval_samples_per_second": 893.91,
"eval_steps_per_second": 111.755,
"num_input_tokens_seen": 1116800,
"step": 2274
},
{
"epoch": 0.3002507588755444,
"grad_norm": 0.13611248135566711,
"learning_rate": 1.7658469805014414e-06,
"loss": 0.1963,
"num_input_tokens_seen": 1117248,
"step": 2275
},
{
"epoch": 0.30091065065329287,
"grad_norm": 11.078180313110352,
"learning_rate": 1.7643637092948415e-06,
"loss": 0.1312,
"num_input_tokens_seen": 1119808,
"step": 2280
},
{
"epoch": 0.3015705424310413,
"grad_norm": 13.604643821716309,
"learning_rate": 1.7628763820854948e-06,
"loss": 0.2181,
"num_input_tokens_seen": 1122112,
"step": 2285
},
{
"epoch": 0.30223043420878976,
"grad_norm": 0.21895840764045715,
"learning_rate": 1.7613850067657216e-06,
"loss": 0.0905,
"num_input_tokens_seen": 1124544,
"step": 2290
},
{
"epoch": 0.3028903259865382,
"grad_norm": 0.23491473495960236,
"learning_rate": 1.7598895912493232e-06,
"loss": 0.0688,
"num_input_tokens_seen": 1127104,
"step": 2295
},
{
"epoch": 0.30355021776428665,
"grad_norm": 3.027573823928833,
"learning_rate": 1.7583901434715397e-06,
"loss": 0.0735,
"num_input_tokens_seen": 1129536,
"step": 2300
},
{
"epoch": 0.3042101095420351,
"grad_norm": 0.24895241856575012,
"learning_rate": 1.7568866713890074e-06,
"loss": 0.0694,
"num_input_tokens_seen": 1131840,
"step": 2305
},
{
"epoch": 0.30487000131978353,
"grad_norm": 0.47908198833465576,
"learning_rate": 1.7553791829797175e-06,
"loss": 0.1669,
"num_input_tokens_seen": 1134336,
"step": 2310
},
{
"epoch": 0.305529893097532,
"grad_norm": 8.449284553527832,
"learning_rate": 1.7538676862429737e-06,
"loss": 0.2863,
"num_input_tokens_seen": 1136640,
"step": 2315
},
{
"epoch": 0.3061897848752805,
"grad_norm": 20.579036712646484,
"learning_rate": 1.7523521891993486e-06,
"loss": 0.1177,
"num_input_tokens_seen": 1139136,
"step": 2320
},
{
"epoch": 0.3068496766530289,
"grad_norm": 0.1681635081768036,
"learning_rate": 1.7508326998906422e-06,
"loss": 0.0919,
"num_input_tokens_seen": 1141568,
"step": 2325
},
{
"epoch": 0.30750956843077737,
"grad_norm": 0.1681900918483734,
"learning_rate": 1.7493092263798394e-06,
"loss": 0.004,
"num_input_tokens_seen": 1143936,
"step": 2330
},
{
"epoch": 0.3081694602085258,
"grad_norm": 96.35763549804688,
"learning_rate": 1.7477817767510664e-06,
"loss": 0.037,
"num_input_tokens_seen": 1146624,
"step": 2335
},
{
"epoch": 0.30882935198627426,
"grad_norm": 0.5398063063621521,
"learning_rate": 1.7462503591095484e-06,
"loss": 0.0055,
"num_input_tokens_seen": 1149120,
"step": 2340
},
{
"epoch": 0.3094892437640227,
"grad_norm": 0.031823668628931046,
"learning_rate": 1.7447149815815659e-06,
"loss": 0.0421,
"num_input_tokens_seen": 1151488,
"step": 2345
},
{
"epoch": 0.31014913554177115,
"grad_norm": 0.048917245119810104,
"learning_rate": 1.7431756523144126e-06,
"loss": 0.1083,
"num_input_tokens_seen": 1153600,
"step": 2350
},
{
"epoch": 0.3108090273195196,
"grad_norm": 0.025062644854187965,
"learning_rate": 1.7416323794763512e-06,
"loss": 0.0665,
"num_input_tokens_seen": 1156224,
"step": 2355
},
{
"epoch": 0.31146891909726804,
"grad_norm": 0.01710972562432289,
"learning_rate": 1.7400851712565707e-06,
"loss": 0.2148,
"num_input_tokens_seen": 1158656,
"step": 2360
},
{
"epoch": 0.3121288108750165,
"grad_norm": 0.0433725044131279,
"learning_rate": 1.7385340358651432e-06,
"loss": 0.2065,
"num_input_tokens_seen": 1161408,
"step": 2365
},
{
"epoch": 0.3127887026527649,
"grad_norm": 119.09558868408203,
"learning_rate": 1.736978981532979e-06,
"loss": 0.0283,
"num_input_tokens_seen": 1163904,
"step": 2370
},
{
"epoch": 0.31344859443051337,
"grad_norm": 0.16062359511852264,
"learning_rate": 1.7354200165117838e-06,
"loss": 0.2238,
"num_input_tokens_seen": 1166208,
"step": 2375
},
{
"epoch": 0.3141084862082619,
"grad_norm": 226.05735778808594,
"learning_rate": 1.733857149074016e-06,
"loss": 0.2442,
"num_input_tokens_seen": 1168512,
"step": 2380
},
{
"epoch": 0.3147683779860103,
"grad_norm": 0.14660975337028503,
"learning_rate": 1.7322903875128402e-06,
"loss": 0.1859,
"num_input_tokens_seen": 1171072,
"step": 2385
},
{
"epoch": 0.31542826976375876,
"grad_norm": 0.15770386159420013,
"learning_rate": 1.7307197401420858e-06,
"loss": 0.0042,
"num_input_tokens_seen": 1173312,
"step": 2390
},
{
"epoch": 0.3160881615415072,
"grad_norm": 0.41774991154670715,
"learning_rate": 1.7291452152962018e-06,
"loss": 0.0649,
"num_input_tokens_seen": 1175744,
"step": 2395
},
{
"epoch": 0.31674805331925565,
"grad_norm": 0.3258494734764099,
"learning_rate": 1.7275668213302116e-06,
"loss": 0.1831,
"num_input_tokens_seen": 1178112,
"step": 2400
},
{
"epoch": 0.3174079450970041,
"grad_norm": 57.20950698852539,
"learning_rate": 1.72598456661967e-06,
"loss": 0.0443,
"num_input_tokens_seen": 1180352,
"step": 2405
},
{
"epoch": 0.31806783687475254,
"grad_norm": 0.16907945275306702,
"learning_rate": 1.7243984595606191e-06,
"loss": 0.1393,
"num_input_tokens_seen": 1182528,
"step": 2410
},
{
"epoch": 0.318727728652501,
"grad_norm": 0.5881856679916382,
"learning_rate": 1.722808508569542e-06,
"loss": 0.0891,
"num_input_tokens_seen": 1185280,
"step": 2415
},
{
"epoch": 0.31938762043024943,
"grad_norm": 14.51777458190918,
"learning_rate": 1.72121472208332e-06,
"loss": 0.0768,
"num_input_tokens_seen": 1188032,
"step": 2420
},
{
"epoch": 0.3200475122079979,
"grad_norm": 83.25537872314453,
"learning_rate": 1.7196171085591864e-06,
"loss": 0.2321,
"num_input_tokens_seen": 1190464,
"step": 2425
},
{
"epoch": 0.3207074039857463,
"grad_norm": 222.58181762695312,
"learning_rate": 1.7180156764746824e-06,
"loss": 0.2085,
"num_input_tokens_seen": 1192960,
"step": 2430
},
{
"epoch": 0.32136729576349476,
"grad_norm": 0.1936943382024765,
"learning_rate": 1.7164104343276113e-06,
"loss": 0.0272,
"num_input_tokens_seen": 1195072,
"step": 2435
},
{
"epoch": 0.3220271875412432,
"grad_norm": 0.019381973892450333,
"learning_rate": 1.714801390635996e-06,
"loss": 0.0063,
"num_input_tokens_seen": 1197376,
"step": 2440
},
{
"epoch": 0.3226870793189917,
"grad_norm": 0.04609265550971031,
"learning_rate": 1.7131885539380297e-06,
"loss": 0.038,
"num_input_tokens_seen": 1199936,
"step": 2445
},
{
"epoch": 0.32334697109674015,
"grad_norm": 31.58684539794922,
"learning_rate": 1.7115719327920335e-06,
"loss": 0.1487,
"num_input_tokens_seen": 1202368,
"step": 2450
},
{
"epoch": 0.3240068628744886,
"grad_norm": 0.015219883061945438,
"learning_rate": 1.70995153577641e-06,
"loss": 0.0011,
"num_input_tokens_seen": 1204800,
"step": 2455
},
{
"epoch": 0.32466675465223704,
"grad_norm": 27.071508407592773,
"learning_rate": 1.7083273714895991e-06,
"loss": 0.0641,
"num_input_tokens_seen": 1207552,
"step": 2460
},
{
"epoch": 0.3253266464299855,
"grad_norm": 0.03585462644696236,
"learning_rate": 1.7066994485500298e-06,
"loss": 0.2123,
"num_input_tokens_seen": 1209856,
"step": 2465
},
{
"epoch": 0.32598653820773393,
"grad_norm": 0.20441071689128876,
"learning_rate": 1.7050677755960762e-06,
"loss": 0.0982,
"num_input_tokens_seen": 1212352,
"step": 2470
},
{
"epoch": 0.3266464299854824,
"grad_norm": 99.99305725097656,
"learning_rate": 1.7034323612860124e-06,
"loss": 0.1048,
"num_input_tokens_seen": 1214912,
"step": 2475
},
{
"epoch": 0.3273063217632308,
"grad_norm": 0.028969811275601387,
"learning_rate": 1.7017932142979645e-06,
"loss": 0.0354,
"num_input_tokens_seen": 1217088,
"step": 2480
},
{
"epoch": 0.32796621354097927,
"grad_norm": 4.167091369628906,
"learning_rate": 1.700150343329866e-06,
"loss": 0.2006,
"num_input_tokens_seen": 1219584,
"step": 2485
},
{
"epoch": 0.3286261053187277,
"grad_norm": 30.39582633972168,
"learning_rate": 1.6985037570994113e-06,
"loss": 0.1335,
"num_input_tokens_seen": 1222336,
"step": 2490
},
{
"epoch": 0.32928599709647616,
"grad_norm": 0.7080073952674866,
"learning_rate": 1.6968534643440088e-06,
"loss": 0.0688,
"num_input_tokens_seen": 1224832,
"step": 2495
},
{
"epoch": 0.3299458888742246,
"grad_norm": 36.44700241088867,
"learning_rate": 1.6951994738207364e-06,
"loss": 0.1821,
"num_input_tokens_seen": 1227392,
"step": 2500
},
{
"epoch": 0.3306057806519731,
"grad_norm": 12.277377128601074,
"learning_rate": 1.6935417943062928e-06,
"loss": 0.2034,
"num_input_tokens_seen": 1229952,
"step": 2505
},
{
"epoch": 0.33126567242972155,
"grad_norm": 0.4052470028400421,
"learning_rate": 1.6918804345969516e-06,
"loss": 0.0106,
"num_input_tokens_seen": 1232640,
"step": 2510
},
{
"epoch": 0.33192556420747,
"grad_norm": 26.028976440429688,
"learning_rate": 1.6902154035085156e-06,
"loss": 0.0161,
"num_input_tokens_seen": 1235200,
"step": 2515
},
{
"epoch": 0.33258545598521844,
"grad_norm": 0.1962713748216629,
"learning_rate": 1.688546709876269e-06,
"loss": 0.0893,
"num_input_tokens_seen": 1237632,
"step": 2520
},
{
"epoch": 0.3332453477629669,
"grad_norm": 1.0130256414413452,
"learning_rate": 1.6868743625549314e-06,
"loss": 0.0905,
"num_input_tokens_seen": 1239936,
"step": 2525
},
{
"epoch": 0.3339052395407153,
"grad_norm": 0.6345663666725159,
"learning_rate": 1.6851983704186092e-06,
"loss": 0.0392,
"num_input_tokens_seen": 1242304,
"step": 2530
},
{
"epoch": 0.33456513131846377,
"grad_norm": 0.025052571669220924,
"learning_rate": 1.6835187423607503e-06,
"loss": 0.0036,
"num_input_tokens_seen": 1244736,
"step": 2535
},
{
"epoch": 0.3352250230962122,
"grad_norm": 0.6511954665184021,
"learning_rate": 1.681835487294096e-06,
"loss": 0.2003,
"num_input_tokens_seen": 1247488,
"step": 2540
},
{
"epoch": 0.33588491487396066,
"grad_norm": 0.022017456591129303,
"learning_rate": 1.6801486141506342e-06,
"loss": 0.2557,
"num_input_tokens_seen": 1250048,
"step": 2545
},
{
"epoch": 0.3365448066517091,
"grad_norm": 160.31907653808594,
"learning_rate": 1.6784581318815514e-06,
"loss": 0.3749,
"num_input_tokens_seen": 1252928,
"step": 2550
},
{
"epoch": 0.33720469842945755,
"grad_norm": 0.035775672644376755,
"learning_rate": 1.6767640494571849e-06,
"loss": 0.146,
"num_input_tokens_seen": 1255488,
"step": 2555
},
{
"epoch": 0.337864590207206,
"grad_norm": 25.89168357849121,
"learning_rate": 1.6750663758669767e-06,
"loss": 0.3346,
"num_input_tokens_seen": 1257984,
"step": 2560
},
{
"epoch": 0.3385244819849545,
"grad_norm": 0.10507479310035706,
"learning_rate": 1.6733651201194245e-06,
"loss": 0.1044,
"num_input_tokens_seen": 1260416,
"step": 2565
},
{
"epoch": 0.33918437376270294,
"grad_norm": 36.37501907348633,
"learning_rate": 1.6716602912420342e-06,
"loss": 0.0797,
"num_input_tokens_seen": 1263168,
"step": 2570
},
{
"epoch": 0.3398442655404514,
"grad_norm": 0.687891960144043,
"learning_rate": 1.6699518982812726e-06,
"loss": 0.1608,
"num_input_tokens_seen": 1265600,
"step": 2575
},
{
"epoch": 0.34050415731819983,
"grad_norm": 0.11499731987714767,
"learning_rate": 1.6682399503025183e-06,
"loss": 0.0033,
"num_input_tokens_seen": 1268032,
"step": 2580
},
{
"epoch": 0.3411640490959483,
"grad_norm": 42.86396408081055,
"learning_rate": 1.666524456390014e-06,
"loss": 0.1571,
"num_input_tokens_seen": 1270336,
"step": 2585
},
{
"epoch": 0.3418239408736967,
"grad_norm": 30.411161422729492,
"learning_rate": 1.664805425646819e-06,
"loss": 0.0566,
"num_input_tokens_seen": 1273088,
"step": 2590
},
{
"epoch": 0.34248383265144516,
"grad_norm": 0.1486613005399704,
"learning_rate": 1.6630828671947606e-06,
"loss": 0.2203,
"num_input_tokens_seen": 1275456,
"step": 2595
},
{
"epoch": 0.3431437244291936,
"grad_norm": 0.21017670631408691,
"learning_rate": 1.6613567901743842e-06,
"loss": 0.0365,
"num_input_tokens_seen": 1277888,
"step": 2600
},
{
"epoch": 0.34380361620694205,
"grad_norm": 0.2567872107028961,
"learning_rate": 1.6596272037449075e-06,
"loss": 0.0013,
"num_input_tokens_seen": 1280384,
"step": 2605
},
{
"epoch": 0.3444635079846905,
"grad_norm": 35.565086364746094,
"learning_rate": 1.6578941170841696e-06,
"loss": 0.064,
"num_input_tokens_seen": 1282944,
"step": 2610
},
{
"epoch": 0.34512339976243894,
"grad_norm": 0.281055212020874,
"learning_rate": 1.6561575393885833e-06,
"loss": 0.0664,
"num_input_tokens_seen": 1285184,
"step": 2615
},
{
"epoch": 0.3457832915401874,
"grad_norm": 0.0915956199169159,
"learning_rate": 1.6544174798730864e-06,
"loss": 0.1976,
"num_input_tokens_seen": 1287808,
"step": 2620
},
{
"epoch": 0.34644318331793583,
"grad_norm": 0.18124467134475708,
"learning_rate": 1.6526739477710923e-06,
"loss": 0.1552,
"num_input_tokens_seen": 1290432,
"step": 2625
},
{
"epoch": 0.34710307509568433,
"grad_norm": 0.16865764558315277,
"learning_rate": 1.650926952334441e-06,
"loss": 0.2257,
"num_input_tokens_seen": 1292736,
"step": 2630
},
{
"epoch": 0.3477629668734328,
"grad_norm": 0.4664243757724762,
"learning_rate": 1.6491765028333516e-06,
"loss": 0.2674,
"num_input_tokens_seen": 1295104,
"step": 2635
},
{
"epoch": 0.3484228586511812,
"grad_norm": 0.6427319645881653,
"learning_rate": 1.6474226085563693e-06,
"loss": 0.0204,
"num_input_tokens_seen": 1297600,
"step": 2640
},
{
"epoch": 0.34908275042892967,
"grad_norm": 0.10163812339305878,
"learning_rate": 1.6456652788103215e-06,
"loss": 0.0496,
"num_input_tokens_seen": 1300224,
"step": 2645
},
{
"epoch": 0.3497426422066781,
"grad_norm": 2.703385829925537,
"learning_rate": 1.6439045229202631e-06,
"loss": 0.1152,
"num_input_tokens_seen": 1302528,
"step": 2650
},
{
"epoch": 0.3501385772733272,
"eval_loss": 0.12348020076751709,
"eval_runtime": 7.625,
"eval_samples_per_second": 883.275,
"eval_steps_per_second": 110.426,
"num_input_tokens_seen": 1303872,
"step": 2653
},
{
"epoch": 0.35040253398442656,
"grad_norm": 0.24891549348831177,
"learning_rate": 1.6421403502294307e-06,
"loss": 0.159,
"num_input_tokens_seen": 1305024,
"step": 2655
},
{
"epoch": 0.351062425762175,
"grad_norm": 0.2551489472389221,
"learning_rate": 1.6403727700991915e-06,
"loss": 0.1813,
"num_input_tokens_seen": 1307392,
"step": 2660
},
{
"epoch": 0.35172231753992345,
"grad_norm": 0.29944464564323425,
"learning_rate": 1.6386017919089933e-06,
"loss": 0.1581,
"num_input_tokens_seen": 1310016,
"step": 2665
},
{
"epoch": 0.3523822093176719,
"grad_norm": 0.0917045846581459,
"learning_rate": 1.636827425056316e-06,
"loss": 0.0066,
"num_input_tokens_seen": 1312576,
"step": 2670
},
{
"epoch": 0.35304210109542034,
"grad_norm": 0.10297297686338425,
"learning_rate": 1.635049678956621e-06,
"loss": 0.1432,
"num_input_tokens_seen": 1315072,
"step": 2675
},
{
"epoch": 0.3537019928731688,
"grad_norm": 32.947994232177734,
"learning_rate": 1.633268563043301e-06,
"loss": 0.1222,
"num_input_tokens_seen": 1317504,
"step": 2680
},
{
"epoch": 0.3543618846509172,
"grad_norm": 0.27496451139450073,
"learning_rate": 1.63148408676763e-06,
"loss": 0.0023,
"num_input_tokens_seen": 1319680,
"step": 2685
},
{
"epoch": 0.3550217764286657,
"grad_norm": 0.06333144754171371,
"learning_rate": 1.6296962595987141e-06,
"loss": 0.0014,
"num_input_tokens_seen": 1322240,
"step": 2690
},
{
"epoch": 0.35568166820641417,
"grad_norm": 1.364142894744873,
"learning_rate": 1.6279050910234392e-06,
"loss": 0.1142,
"num_input_tokens_seen": 1324736,
"step": 2695
},
{
"epoch": 0.3563415599841626,
"grad_norm": 0.07366377115249634,
"learning_rate": 1.626110590546423e-06,
"loss": 0.0407,
"num_input_tokens_seen": 1327104,
"step": 2700
},
{
"epoch": 0.35700145176191106,
"grad_norm": 134.90122985839844,
"learning_rate": 1.6243127676899635e-06,
"loss": 0.248,
"num_input_tokens_seen": 1329920,
"step": 2705
},
{
"epoch": 0.3576613435396595,
"grad_norm": 0.04035777971148491,
"learning_rate": 1.6225116319939884e-06,
"loss": 0.2153,
"num_input_tokens_seen": 1332352,
"step": 2710
},
{
"epoch": 0.35832123531740795,
"grad_norm": 75.87095642089844,
"learning_rate": 1.6207071930160044e-06,
"loss": 0.1084,
"num_input_tokens_seen": 1335040,
"step": 2715
},
{
"epoch": 0.3589811270951564,
"grad_norm": 0.1767151951789856,
"learning_rate": 1.6188994603310468e-06,
"loss": 0.0054,
"num_input_tokens_seen": 1337472,
"step": 2720
},
{
"epoch": 0.35964101887290484,
"grad_norm": 3.7952630519866943,
"learning_rate": 1.617088443531628e-06,
"loss": 0.1694,
"num_input_tokens_seen": 1339712,
"step": 2725
},
{
"epoch": 0.3603009106506533,
"grad_norm": 0.17187942564487457,
"learning_rate": 1.6152741522276882e-06,
"loss": 0.0016,
"num_input_tokens_seen": 1342144,
"step": 2730
},
{
"epoch": 0.36096080242840173,
"grad_norm": 0.7987899780273438,
"learning_rate": 1.6134565960465425e-06,
"loss": 0.108,
"num_input_tokens_seen": 1344512,
"step": 2735
},
{
"epoch": 0.3616206942061502,
"grad_norm": 0.12640990316867828,
"learning_rate": 1.6116357846328312e-06,
"loss": 0.242,
"num_input_tokens_seen": 1346880,
"step": 2740
},
{
"epoch": 0.3622805859838986,
"grad_norm": 0.04579659551382065,
"learning_rate": 1.609811727648468e-06,
"loss": 0.1324,
"num_input_tokens_seen": 1349056,
"step": 2745
},
{
"epoch": 0.36294047776164706,
"grad_norm": 0.21617701649665833,
"learning_rate": 1.6079844347725882e-06,
"loss": 0.0724,
"num_input_tokens_seen": 1351488,
"step": 2750
},
{
"epoch": 0.36360036953939556,
"grad_norm": 0.17689555883407593,
"learning_rate": 1.6061539157014987e-06,
"loss": 0.0532,
"num_input_tokens_seen": 1353920,
"step": 2755
},
{
"epoch": 0.364260261317144,
"grad_norm": 0.18878047168254852,
"learning_rate": 1.6043201801486257e-06,
"loss": 0.2916,
"num_input_tokens_seen": 1356352,
"step": 2760
},
{
"epoch": 0.36492015309489245,
"grad_norm": 1.1614915132522583,
"learning_rate": 1.6024832378444628e-06,
"loss": 0.2542,
"num_input_tokens_seen": 1359104,
"step": 2765
},
{
"epoch": 0.3655800448726409,
"grad_norm": 26.108612060546875,
"learning_rate": 1.6006430985365204e-06,
"loss": 0.2718,
"num_input_tokens_seen": 1361536,
"step": 2770
},
{
"epoch": 0.36623993665038934,
"grad_norm": 113.12171936035156,
"learning_rate": 1.5987997719892735e-06,
"loss": 0.2648,
"num_input_tokens_seen": 1364160,
"step": 2775
},
{
"epoch": 0.3668998284281378,
"grad_norm": 0.58730149269104,
"learning_rate": 1.5969532679841088e-06,
"loss": 0.0465,
"num_input_tokens_seen": 1366656,
"step": 2780
},
{
"epoch": 0.36755972020588623,
"grad_norm": 32.10945510864258,
"learning_rate": 1.5951035963192752e-06,
"loss": 0.0486,
"num_input_tokens_seen": 1369216,
"step": 2785
},
{
"epoch": 0.3682196119836347,
"grad_norm": 1.1091487407684326,
"learning_rate": 1.593250766809829e-06,
"loss": 0.2435,
"num_input_tokens_seen": 1371712,
"step": 2790
},
{
"epoch": 0.3688795037613831,
"grad_norm": 61.50751495361328,
"learning_rate": 1.5913947892875842e-06,
"loss": 0.1572,
"num_input_tokens_seen": 1374080,
"step": 2795
},
{
"epoch": 0.36953939553913157,
"grad_norm": 0.4279724657535553,
"learning_rate": 1.589535673601059e-06,
"loss": 0.1055,
"num_input_tokens_seen": 1377024,
"step": 2800
},
{
"epoch": 0.37019928731688,
"grad_norm": 42.588748931884766,
"learning_rate": 1.587673429615424e-06,
"loss": 0.0806,
"num_input_tokens_seen": 1379392,
"step": 2805
},
{
"epoch": 0.37085917909462845,
"grad_norm": 0.18637718260288239,
"learning_rate": 1.5858080672124495e-06,
"loss": 0.1468,
"num_input_tokens_seen": 1381760,
"step": 2810
},
{
"epoch": 0.37151907087237696,
"grad_norm": 0.43665367364883423,
"learning_rate": 1.5839395962904536e-06,
"loss": 0.0923,
"num_input_tokens_seen": 1384128,
"step": 2815
},
{
"epoch": 0.3721789626501254,
"grad_norm": 0.0831814855337143,
"learning_rate": 1.5820680267642494e-06,
"loss": 0.0594,
"num_input_tokens_seen": 1386496,
"step": 2820
},
{
"epoch": 0.37283885442787384,
"grad_norm": 0.25996115803718567,
"learning_rate": 1.5801933685650917e-06,
"loss": 0.0668,
"num_input_tokens_seen": 1388736,
"step": 2825
},
{
"epoch": 0.3734987462056223,
"grad_norm": 2.1776347160339355,
"learning_rate": 1.5783156316406259e-06,
"loss": 0.002,
"num_input_tokens_seen": 1391040,
"step": 2830
},
{
"epoch": 0.37415863798337073,
"grad_norm": 66.52011108398438,
"learning_rate": 1.5764348259548334e-06,
"loss": 0.218,
"num_input_tokens_seen": 1393344,
"step": 2835
},
{
"epoch": 0.3748185297611192,
"grad_norm": 234.61207580566406,
"learning_rate": 1.5745509614879806e-06,
"loss": 0.056,
"num_input_tokens_seen": 1395648,
"step": 2840
},
{
"epoch": 0.3754784215388676,
"grad_norm": 0.03497995808720589,
"learning_rate": 1.572664048236564e-06,
"loss": 0.2865,
"num_input_tokens_seen": 1398272,
"step": 2845
},
{
"epoch": 0.37613831331661607,
"grad_norm": 0.07777401059865952,
"learning_rate": 1.570774096213259e-06,
"loss": 0.0507,
"num_input_tokens_seen": 1400576,
"step": 2850
},
{
"epoch": 0.3767982050943645,
"grad_norm": 0.07564707100391388,
"learning_rate": 1.5688811154468649e-06,
"loss": 0.0513,
"num_input_tokens_seen": 1403136,
"step": 2855
},
{
"epoch": 0.37745809687211296,
"grad_norm": 0.08237399160861969,
"learning_rate": 1.5669851159822532e-06,
"loss": 0.1228,
"num_input_tokens_seen": 1405504,
"step": 2860
},
{
"epoch": 0.3781179886498614,
"grad_norm": 42.22079086303711,
"learning_rate": 1.5650861078803137e-06,
"loss": 0.1389,
"num_input_tokens_seen": 1407808,
"step": 2865
},
{
"epoch": 0.37877788042760985,
"grad_norm": 6.883021831512451,
"learning_rate": 1.5631841012179013e-06,
"loss": 0.0692,
"num_input_tokens_seen": 1410304,
"step": 2870
},
{
"epoch": 0.37943777220535835,
"grad_norm": 0.3424462676048279,
"learning_rate": 1.5612791060877818e-06,
"loss": 0.004,
"num_input_tokens_seen": 1412736,
"step": 2875
},
{
"epoch": 0.3800976639831068,
"grad_norm": 75.88460540771484,
"learning_rate": 1.5593711325985801e-06,
"loss": 0.0961,
"num_input_tokens_seen": 1415488,
"step": 2880
},
{
"epoch": 0.38075755576085524,
"grad_norm": 0.043806418776512146,
"learning_rate": 1.5574601908747245e-06,
"loss": 0.21,
"num_input_tokens_seen": 1417856,
"step": 2885
},
{
"epoch": 0.3814174475386037,
"grad_norm": 0.06361314654350281,
"learning_rate": 1.5555462910563936e-06,
"loss": 0.0664,
"num_input_tokens_seen": 1420096,
"step": 2890
},
{
"epoch": 0.3820773393163521,
"grad_norm": 25.98211097717285,
"learning_rate": 1.5536294432994636e-06,
"loss": 0.2344,
"num_input_tokens_seen": 1422656,
"step": 2895
},
{
"epoch": 0.38273723109410057,
"grad_norm": 92.6849594116211,
"learning_rate": 1.5517096577754528e-06,
"loss": 0.0884,
"num_input_tokens_seen": 1425152,
"step": 2900
},
{
"epoch": 0.383397122871849,
"grad_norm": 0.08511543273925781,
"learning_rate": 1.5497869446714695e-06,
"loss": 0.0623,
"num_input_tokens_seen": 1427840,
"step": 2905
},
{
"epoch": 0.38405701464959746,
"grad_norm": 0.13399949669837952,
"learning_rate": 1.5478613141901558e-06,
"loss": 0.0019,
"num_input_tokens_seen": 1430144,
"step": 2910
},
{
"epoch": 0.3847169064273459,
"grad_norm": 0.18390312790870667,
"learning_rate": 1.5459327765496348e-06,
"loss": 0.1492,
"num_input_tokens_seen": 1432448,
"step": 2915
},
{
"epoch": 0.38537679820509435,
"grad_norm": 1.747375726699829,
"learning_rate": 1.5440013419834563e-06,
"loss": 0.0071,
"num_input_tokens_seen": 1434752,
"step": 2920
},
{
"epoch": 0.3860366899828428,
"grad_norm": 0.4480796158313751,
"learning_rate": 1.5420670207405419e-06,
"loss": 0.0011,
"num_input_tokens_seen": 1437184,
"step": 2925
},
{
"epoch": 0.38669658176059124,
"grad_norm": 7.325652122497559,
"learning_rate": 1.5401298230851314e-06,
"loss": 0.1098,
"num_input_tokens_seen": 1440000,
"step": 2930
},
{
"epoch": 0.3873564735383397,
"grad_norm": 5.879019737243652,
"learning_rate": 1.5381897592967275e-06,
"loss": 0.0072,
"num_input_tokens_seen": 1442624,
"step": 2935
},
{
"epoch": 0.3880163653160882,
"grad_norm": 0.20650486648082733,
"learning_rate": 1.5362468396700426e-06,
"loss": 0.0702,
"num_input_tokens_seen": 1445184,
"step": 2940
},
{
"epoch": 0.38867625709383663,
"grad_norm": 22.289382934570312,
"learning_rate": 1.5343010745149418e-06,
"loss": 0.322,
"num_input_tokens_seen": 1447616,
"step": 2945
},
{
"epoch": 0.3893361488715851,
"grad_norm": 0.035571977496147156,
"learning_rate": 1.532352474156391e-06,
"loss": 0.0715,
"num_input_tokens_seen": 1450176,
"step": 2950
},
{
"epoch": 0.3899960406493335,
"grad_norm": 3.2316651344299316,
"learning_rate": 1.5304010489343995e-06,
"loss": 0.4706,
"num_input_tokens_seen": 1452672,
"step": 2955
},
{
"epoch": 0.39065593242708196,
"grad_norm": 0.06907609850168228,
"learning_rate": 1.528446809203968e-06,
"loss": 0.2238,
"num_input_tokens_seen": 1455232,
"step": 2960
},
{
"epoch": 0.3913158242048304,
"grad_norm": 82.65614318847656,
"learning_rate": 1.526489765335031e-06,
"loss": 0.1729,
"num_input_tokens_seen": 1457792,
"step": 2965
},
{
"epoch": 0.39197571598257885,
"grad_norm": 0.3325257897377014,
"learning_rate": 1.5245299277124026e-06,
"loss": 0.1528,
"num_input_tokens_seen": 1460160,
"step": 2970
},
{
"epoch": 0.3926356077603273,
"grad_norm": 0.9707848429679871,
"learning_rate": 1.5225673067357218e-06,
"loss": 0.1434,
"num_input_tokens_seen": 1462400,
"step": 2975
},
{
"epoch": 0.39329549953807574,
"grad_norm": 22.089210510253906,
"learning_rate": 1.5206019128193981e-06,
"loss": 0.1209,
"num_input_tokens_seen": 1465088,
"step": 2980
},
{
"epoch": 0.3939553913158242,
"grad_norm": 1.0957697629928589,
"learning_rate": 1.5186337563925538e-06,
"loss": 0.1168,
"num_input_tokens_seen": 1467456,
"step": 2985
},
{
"epoch": 0.39461528309357263,
"grad_norm": 0.22268956899642944,
"learning_rate": 1.516662847898971e-06,
"loss": 0.0016,
"num_input_tokens_seen": 1470016,
"step": 2990
},
{
"epoch": 0.3952751748713211,
"grad_norm": 0.2794409990310669,
"learning_rate": 1.5146891977970349e-06,
"loss": 0.1024,
"num_input_tokens_seen": 1472448,
"step": 2995
},
{
"epoch": 0.3959350666490696,
"grad_norm": 55.23267364501953,
"learning_rate": 1.5127128165596794e-06,
"loss": 0.1009,
"num_input_tokens_seen": 1475072,
"step": 3000
},
{
"epoch": 0.396594958426818,
"grad_norm": 0.32357192039489746,
"learning_rate": 1.51073371467433e-06,
"loss": 0.0499,
"num_input_tokens_seen": 1477440,
"step": 3005
},
{
"epoch": 0.39725485020456647,
"grad_norm": 2.3438990116119385,
"learning_rate": 1.5087519026428498e-06,
"loss": 0.0043,
"num_input_tokens_seen": 1479872,
"step": 3010
},
{
"epoch": 0.3979147419823149,
"grad_norm": 214.1775665283203,
"learning_rate": 1.5067673909814818e-06,
"loss": 0.1242,
"num_input_tokens_seen": 1481920,
"step": 3015
},
{
"epoch": 0.39857463376006336,
"grad_norm": 0.06694573163986206,
"learning_rate": 1.5047801902207953e-06,
"loss": 0.1901,
"num_input_tokens_seen": 1484992,
"step": 3020
},
{
"epoch": 0.3992345255378118,
"grad_norm": 37.85984802246094,
"learning_rate": 1.5027903109056288e-06,
"loss": 0.1508,
"num_input_tokens_seen": 1487232,
"step": 3025
},
{
"epoch": 0.39989441731556025,
"grad_norm": 22.730335235595703,
"learning_rate": 1.5007977635950336e-06,
"loss": 0.1615,
"num_input_tokens_seen": 1489728,
"step": 3030
},
{
"epoch": 0.4001583740266596,
"eval_loss": 0.13228875398635864,
"eval_runtime": 7.7073,
"eval_samples_per_second": 873.842,
"eval_steps_per_second": 109.246,
"num_input_tokens_seen": 1490688,
"step": 3032
},
{
"epoch": 0.4005543090933087,
"grad_norm": 96.17181396484375,
"learning_rate": 1.498802558862219e-06,
"loss": 0.154,
"num_input_tokens_seen": 1491968,
"step": 3035
},
{
"epoch": 0.40121420087105714,
"grad_norm": 0.3932342231273651,
"learning_rate": 1.496804707294496e-06,
"loss": 0.1078,
"num_input_tokens_seen": 1494336,
"step": 3040
},
{
"epoch": 0.4018740926488056,
"grad_norm": 0.33634519577026367,
"learning_rate": 1.4948042194932195e-06,
"loss": 0.0599,
"num_input_tokens_seen": 1497472,
"step": 3045
},
{
"epoch": 0.402533984426554,
"grad_norm": 0.19691598415374756,
"learning_rate": 1.4928011060737341e-06,
"loss": 0.0399,
"num_input_tokens_seen": 1499968,
"step": 3050
},
{
"epoch": 0.40319387620430247,
"grad_norm": 0.058707304298877716,
"learning_rate": 1.4907953776653171e-06,
"loss": 0.0741,
"num_input_tokens_seen": 1502336,
"step": 3055
},
{
"epoch": 0.40385376798205097,
"grad_norm": 17.177833557128906,
"learning_rate": 1.4887870449111206e-06,
"loss": 0.1581,
"num_input_tokens_seen": 1504576,
"step": 3060
},
{
"epoch": 0.4045136597597994,
"grad_norm": 0.7955127954483032,
"learning_rate": 1.486776118468118e-06,
"loss": 0.1605,
"num_input_tokens_seen": 1507136,
"step": 3065
},
{
"epoch": 0.40517355153754786,
"grad_norm": 0.5847259163856506,
"learning_rate": 1.4847626090070451e-06,
"loss": 0.0716,
"num_input_tokens_seen": 1509696,
"step": 3070
},
{
"epoch": 0.4058334433152963,
"grad_norm": 0.25745320320129395,
"learning_rate": 1.4827465272123439e-06,
"loss": 0.299,
"num_input_tokens_seen": 1512192,
"step": 3075
},
{
"epoch": 0.40649333509304475,
"grad_norm": 0.3554550111293793,
"learning_rate": 1.4807278837821063e-06,
"loss": 0.0453,
"num_input_tokens_seen": 1514752,
"step": 3080
},
{
"epoch": 0.4071532268707932,
"grad_norm": 12.156785011291504,
"learning_rate": 1.4787066894280178e-06,
"loss": 0.2992,
"num_input_tokens_seen": 1517440,
"step": 3085
},
{
"epoch": 0.40781311864854164,
"grad_norm": 0.10129724442958832,
"learning_rate": 1.476682954875299e-06,
"loss": 0.0637,
"num_input_tokens_seen": 1519744,
"step": 3090
},
{
"epoch": 0.4084730104262901,
"grad_norm": 84.23600769042969,
"learning_rate": 1.4746566908626506e-06,
"loss": 0.0773,
"num_input_tokens_seen": 1522176,
"step": 3095
},
{
"epoch": 0.40913290220403853,
"grad_norm": 1.9050307273864746,
"learning_rate": 1.4726279081421956e-06,
"loss": 0.0516,
"num_input_tokens_seen": 1524352,
"step": 3100
},
{
"epoch": 0.409792793981787,
"grad_norm": 35.056800842285156,
"learning_rate": 1.4705966174794216e-06,
"loss": 0.2317,
"num_input_tokens_seen": 1526976,
"step": 3105
},
{
"epoch": 0.4104526857595354,
"grad_norm": 0.22622281312942505,
"learning_rate": 1.4685628296531248e-06,
"loss": 0.1563,
"num_input_tokens_seen": 1529152,
"step": 3110
},
{
"epoch": 0.41111257753728386,
"grad_norm": 1.48894202709198,
"learning_rate": 1.466526555455352e-06,
"loss": 0.051,
"num_input_tokens_seen": 1531648,
"step": 3115
},
{
"epoch": 0.4117724693150323,
"grad_norm": 0.444116473197937,
"learning_rate": 1.4644878056913432e-06,
"loss": 0.0057,
"num_input_tokens_seen": 1533952,
"step": 3120
},
{
"epoch": 0.4124323610927808,
"grad_norm": 48.74332046508789,
"learning_rate": 1.4624465911794764e-06,
"loss": 0.1887,
"num_input_tokens_seen": 1536640,
"step": 3125
},
{
"epoch": 0.41309225287052925,
"grad_norm": 0.06482608616352081,
"learning_rate": 1.4604029227512062e-06,
"loss": 0.0053,
"num_input_tokens_seen": 1539200,
"step": 3130
},
{
"epoch": 0.4137521446482777,
"grad_norm": 81.11097717285156,
"learning_rate": 1.4583568112510108e-06,
"loss": 0.1908,
"num_input_tokens_seen": 1541632,
"step": 3135
},
{
"epoch": 0.41441203642602614,
"grad_norm": 12.146714210510254,
"learning_rate": 1.4563082675363302e-06,
"loss": 0.0965,
"num_input_tokens_seen": 1544128,
"step": 3140
},
{
"epoch": 0.4150719282037746,
"grad_norm": 0.2594153583049774,
"learning_rate": 1.4542573024775122e-06,
"loss": 0.0228,
"num_input_tokens_seen": 1546368,
"step": 3145
},
{
"epoch": 0.41573181998152303,
"grad_norm": 4.159293174743652,
"learning_rate": 1.4522039269577521e-06,
"loss": 0.2984,
"num_input_tokens_seen": 1548736,
"step": 3150
},
{
"epoch": 0.4163917117592715,
"grad_norm": 0.10340887308120728,
"learning_rate": 1.4501481518730372e-06,
"loss": 0.2461,
"num_input_tokens_seen": 1551168,
"step": 3155
},
{
"epoch": 0.4170516035370199,
"grad_norm": 0.2676301598548889,
"learning_rate": 1.4480899881320868e-06,
"loss": 0.0719,
"num_input_tokens_seen": 1553664,
"step": 3160
},
{
"epoch": 0.41771149531476837,
"grad_norm": 25.496265411376953,
"learning_rate": 1.4460294466562956e-06,
"loss": 0.1771,
"num_input_tokens_seen": 1555968,
"step": 3165
},
{
"epoch": 0.4183713870925168,
"grad_norm": 0.47720712423324585,
"learning_rate": 1.4439665383796756e-06,
"loss": 0.0399,
"num_input_tokens_seen": 1558208,
"step": 3170
},
{
"epoch": 0.41903127887026526,
"grad_norm": 2.1485588550567627,
"learning_rate": 1.4419012742487972e-06,
"loss": 0.0054,
"num_input_tokens_seen": 1560640,
"step": 3175
},
{
"epoch": 0.4196911706480137,
"grad_norm": 5.430055618286133,
"learning_rate": 1.4398336652227335e-06,
"loss": 0.095,
"num_input_tokens_seen": 1563328,
"step": 3180
},
{
"epoch": 0.4203510624257622,
"grad_norm": 0.05566899850964546,
"learning_rate": 1.4377637222729986e-06,
"loss": 0.1201,
"num_input_tokens_seen": 1565696,
"step": 3185
},
{
"epoch": 0.42101095420351065,
"grad_norm": 0.08947694301605225,
"learning_rate": 1.435691456383493e-06,
"loss": 0.1675,
"num_input_tokens_seen": 1568640,
"step": 3190
},
{
"epoch": 0.4216708459812591,
"grad_norm": 2.342318058013916,
"learning_rate": 1.433616878550442e-06,
"loss": 0.1212,
"num_input_tokens_seen": 1571328,
"step": 3195
},
{
"epoch": 0.42233073775900754,
"grad_norm": 18.465282440185547,
"learning_rate": 1.4315399997823403e-06,
"loss": 0.3175,
"num_input_tokens_seen": 1574016,
"step": 3200
},
{
"epoch": 0.422990629536756,
"grad_norm": 12.997380256652832,
"learning_rate": 1.429460831099891e-06,
"loss": 0.2534,
"num_input_tokens_seen": 1576384,
"step": 3205
},
{
"epoch": 0.4236505213145044,
"grad_norm": 0.08205987513065338,
"learning_rate": 1.4273793835359492e-06,
"loss": 0.2136,
"num_input_tokens_seen": 1579200,
"step": 3210
},
{
"epoch": 0.42431041309225287,
"grad_norm": 66.97320556640625,
"learning_rate": 1.4252956681354631e-06,
"loss": 0.0964,
"num_input_tokens_seen": 1581632,
"step": 3215
},
{
"epoch": 0.4249703048700013,
"grad_norm": 0.7273184657096863,
"learning_rate": 1.4232096959554135e-06,
"loss": 0.0035,
"num_input_tokens_seen": 1584064,
"step": 3220
},
{
"epoch": 0.42563019664774976,
"grad_norm": 65.00259399414062,
"learning_rate": 1.4211214780647572e-06,
"loss": 0.0297,
"num_input_tokens_seen": 1586752,
"step": 3225
},
{
"epoch": 0.4262900884254982,
"grad_norm": 9.714056968688965,
"learning_rate": 1.4190310255443676e-06,
"loss": 0.0918,
"num_input_tokens_seen": 1589248,
"step": 3230
},
{
"epoch": 0.42694998020324665,
"grad_norm": 0.03953593969345093,
"learning_rate": 1.4169383494869764e-06,
"loss": 0.0286,
"num_input_tokens_seen": 1591552,
"step": 3235
},
{
"epoch": 0.4276098719809951,
"grad_norm": 117.95477294921875,
"learning_rate": 1.414843460997113e-06,
"loss": 0.0616,
"num_input_tokens_seen": 1594048,
"step": 3240
},
{
"epoch": 0.4282697637587436,
"grad_norm": 17.138263702392578,
"learning_rate": 1.4127463711910483e-06,
"loss": 0.1517,
"num_input_tokens_seen": 1596544,
"step": 3245
},
{
"epoch": 0.42892965553649204,
"grad_norm": 5.194220542907715,
"learning_rate": 1.410647091196733e-06,
"loss": 0.1214,
"num_input_tokens_seen": 1599104,
"step": 3250
},
{
"epoch": 0.4295895473142405,
"grad_norm": 0.02321782521903515,
"learning_rate": 1.4085456321537402e-06,
"loss": 0.124,
"num_input_tokens_seen": 1601344,
"step": 3255
},
{
"epoch": 0.43024943909198893,
"grad_norm": 10.903656005859375,
"learning_rate": 1.4064420052132056e-06,
"loss": 0.1022,
"num_input_tokens_seen": 1603968,
"step": 3260
},
{
"epoch": 0.4309093308697374,
"grad_norm": 75.95123291015625,
"learning_rate": 1.4043362215377696e-06,
"loss": 0.078,
"num_input_tokens_seen": 1606400,
"step": 3265
},
{
"epoch": 0.4315692226474858,
"grad_norm": 0.12190647423267365,
"learning_rate": 1.4022282923015158e-06,
"loss": 0.1095,
"num_input_tokens_seen": 1608960,
"step": 3270
},
{
"epoch": 0.43222911442523426,
"grad_norm": 0.8287085294723511,
"learning_rate": 1.4001182286899136e-06,
"loss": 0.0042,
"num_input_tokens_seen": 1611456,
"step": 3275
},
{
"epoch": 0.4328890062029827,
"grad_norm": 0.0886739045381546,
"learning_rate": 1.398006041899758e-06,
"loss": 0.0458,
"num_input_tokens_seen": 1613952,
"step": 3280
},
{
"epoch": 0.43354889798073115,
"grad_norm": 27.18416404724121,
"learning_rate": 1.3958917431391102e-06,
"loss": 0.1192,
"num_input_tokens_seen": 1616320,
"step": 3285
},
{
"epoch": 0.4342087897584796,
"grad_norm": 0.13577166199684143,
"learning_rate": 1.3937753436272388e-06,
"loss": 0.1763,
"num_input_tokens_seen": 1619136,
"step": 3290
},
{
"epoch": 0.43486868153622804,
"grad_norm": 431.9822082519531,
"learning_rate": 1.3916568545945597e-06,
"loss": 0.0483,
"num_input_tokens_seen": 1621632,
"step": 3295
},
{
"epoch": 0.4355285733139765,
"grad_norm": 0.2625204920768738,
"learning_rate": 1.3895362872825764e-06,
"loss": 0.1352,
"num_input_tokens_seen": 1624064,
"step": 3300
},
{
"epoch": 0.43618846509172493,
"grad_norm": 0.5975183844566345,
"learning_rate": 1.3874136529438205e-06,
"loss": 0.1454,
"num_input_tokens_seen": 1626496,
"step": 3305
},
{
"epoch": 0.43684835686947343,
"grad_norm": 9.573996543884277,
"learning_rate": 1.3852889628417918e-06,
"loss": 0.0691,
"num_input_tokens_seen": 1628800,
"step": 3310
},
{
"epoch": 0.4375082486472219,
"grad_norm": 2.738884925842285,
"learning_rate": 1.3831622282508994e-06,
"loss": 0.0967,
"num_input_tokens_seen": 1631232,
"step": 3315
},
{
"epoch": 0.4381681404249703,
"grad_norm": 0.1655990183353424,
"learning_rate": 1.3810334604564007e-06,
"loss": 0.0018,
"num_input_tokens_seen": 1633728,
"step": 3320
},
{
"epoch": 0.43882803220271877,
"grad_norm": 0.21200844645500183,
"learning_rate": 1.3789026707543423e-06,
"loss": 0.0695,
"num_input_tokens_seen": 1636224,
"step": 3325
},
{
"epoch": 0.4394879239804672,
"grad_norm": 0.12617841362953186,
"learning_rate": 1.3767698704514998e-06,
"loss": 0.0631,
"num_input_tokens_seen": 1638272,
"step": 3330
},
{
"epoch": 0.44014781575821565,
"grad_norm": 0.025392625480890274,
"learning_rate": 1.3746350708653175e-06,
"loss": 0.1898,
"num_input_tokens_seen": 1640512,
"step": 3335
},
{
"epoch": 0.4408077075359641,
"grad_norm": 51.78602981567383,
"learning_rate": 1.3724982833238495e-06,
"loss": 0.1903,
"num_input_tokens_seen": 1642944,
"step": 3340
},
{
"epoch": 0.44146759931371254,
"grad_norm": 0.11096933484077454,
"learning_rate": 1.370359519165697e-06,
"loss": 0.0559,
"num_input_tokens_seen": 1645376,
"step": 3345
},
{
"epoch": 0.442127491091461,
"grad_norm": 259.23699951171875,
"learning_rate": 1.368218789739952e-06,
"loss": 0.0108,
"num_input_tokens_seen": 1647936,
"step": 3350
},
{
"epoch": 0.44278738286920943,
"grad_norm": 0.37444016337394714,
"learning_rate": 1.3660761064061337e-06,
"loss": 0.065,
"num_input_tokens_seen": 1650496,
"step": 3355
},
{
"epoch": 0.4434472746469579,
"grad_norm": 0.05476607382297516,
"learning_rate": 1.3639314805341297e-06,
"loss": 0.0935,
"num_input_tokens_seen": 1652992,
"step": 3360
},
{
"epoch": 0.4441071664247063,
"grad_norm": 0.11798688024282455,
"learning_rate": 1.3617849235041355e-06,
"loss": 0.0665,
"num_input_tokens_seen": 1655488,
"step": 3365
},
{
"epoch": 0.4447670582024548,
"grad_norm": 0.04145582392811775,
"learning_rate": 1.3596364467065938e-06,
"loss": 0.1599,
"num_input_tokens_seen": 1657984,
"step": 3370
},
{
"epoch": 0.44542694998020327,
"grad_norm": 90.30973052978516,
"learning_rate": 1.3574860615421346e-06,
"loss": 0.229,
"num_input_tokens_seen": 1660736,
"step": 3375
},
{
"epoch": 0.4460868417579517,
"grad_norm": 12.61612319946289,
"learning_rate": 1.3553337794215147e-06,
"loss": 0.192,
"num_input_tokens_seen": 1663104,
"step": 3380
},
{
"epoch": 0.44674673353570016,
"grad_norm": 75.10413360595703,
"learning_rate": 1.3531796117655565e-06,
"loss": 0.0766,
"num_input_tokens_seen": 1665344,
"step": 3385
},
{
"epoch": 0.4474066253134486,
"grad_norm": 30.948253631591797,
"learning_rate": 1.3510235700050873e-06,
"loss": 0.1651,
"num_input_tokens_seen": 1668096,
"step": 3390
},
{
"epoch": 0.44806651709119705,
"grad_norm": 22.553556442260742,
"learning_rate": 1.3488656655808801e-06,
"loss": 0.0679,
"num_input_tokens_seen": 1670272,
"step": 3395
},
{
"epoch": 0.4487264088689455,
"grad_norm": 1.1050207614898682,
"learning_rate": 1.3467059099435912e-06,
"loss": 0.0905,
"num_input_tokens_seen": 1672448,
"step": 3400
},
{
"epoch": 0.44938630064669394,
"grad_norm": 0.16898778080940247,
"learning_rate": 1.3445443145537002e-06,
"loss": 0.0608,
"num_input_tokens_seen": 1675200,
"step": 3405
},
{
"epoch": 0.4500461924244424,
"grad_norm": 1.0715267658233643,
"learning_rate": 1.3423808908814494e-06,
"loss": 0.0698,
"num_input_tokens_seen": 1677696,
"step": 3410
},
{
"epoch": 0.45017817077999206,
"eval_loss": 0.1182408258318901,
"eval_runtime": 7.6199,
"eval_samples_per_second": 883.874,
"eval_steps_per_second": 110.501,
"num_input_tokens_seen": 1678208,
"step": 3411
},
{
"epoch": 0.4507060842021908,
"grad_norm": 14.29131031036377,
"learning_rate": 1.3402156504067826e-06,
"loss": 0.0969,
"num_input_tokens_seen": 1680256,
"step": 3415
},
{
"epoch": 0.45136597597993927,
"grad_norm": 0.1442999541759491,
"learning_rate": 1.338048604619284e-06,
"loss": 0.1191,
"num_input_tokens_seen": 1682624,
"step": 3420
},
{
"epoch": 0.4520258677576877,
"grad_norm": 33.37054443359375,
"learning_rate": 1.3358797650181178e-06,
"loss": 0.0365,
"num_input_tokens_seen": 1685056,
"step": 3425
},
{
"epoch": 0.45268575953543616,
"grad_norm": 132.64529418945312,
"learning_rate": 1.3337091431119662e-06,
"loss": 0.1349,
"num_input_tokens_seen": 1687168,
"step": 3430
},
{
"epoch": 0.45334565131318466,
"grad_norm": 168.06629943847656,
"learning_rate": 1.3315367504189698e-06,
"loss": 0.3197,
"num_input_tokens_seen": 1689216,
"step": 3435
},
{
"epoch": 0.4540055430909331,
"grad_norm": 86.57543182373047,
"learning_rate": 1.3293625984666656e-06,
"loss": 0.0946,
"num_input_tokens_seen": 1691776,
"step": 3440
},
{
"epoch": 0.45466543486868155,
"grad_norm": 0.10748296976089478,
"learning_rate": 1.3271866987919254e-06,
"loss": 0.0012,
"num_input_tokens_seen": 1694336,
"step": 3445
},
{
"epoch": 0.45532532664643,
"grad_norm": 0.3375436067581177,
"learning_rate": 1.325009062940895e-06,
"loss": 0.2113,
"num_input_tokens_seen": 1696640,
"step": 3450
},
{
"epoch": 0.45598521842417844,
"grad_norm": 15.320273399353027,
"learning_rate": 1.3228297024689336e-06,
"loss": 0.0765,
"num_input_tokens_seen": 1698880,
"step": 3455
},
{
"epoch": 0.4566451102019269,
"grad_norm": 23.91095733642578,
"learning_rate": 1.3206486289405519e-06,
"loss": 0.1025,
"num_input_tokens_seen": 1701312,
"step": 3460
},
{
"epoch": 0.45730500197967533,
"grad_norm": 44.923030853271484,
"learning_rate": 1.3184658539293496e-06,
"loss": 0.1407,
"num_input_tokens_seen": 1703808,
"step": 3465
},
{
"epoch": 0.4579648937574238,
"grad_norm": 65.6329116821289,
"learning_rate": 1.3162813890179564e-06,
"loss": 0.125,
"num_input_tokens_seen": 1706304,
"step": 3470
},
{
"epoch": 0.4586247855351722,
"grad_norm": 12.479512214660645,
"learning_rate": 1.314095245797969e-06,
"loss": 0.3138,
"num_input_tokens_seen": 1708736,
"step": 3475
},
{
"epoch": 0.45928467731292066,
"grad_norm": 0.6768988370895386,
"learning_rate": 1.3119074358698891e-06,
"loss": 0.1379,
"num_input_tokens_seen": 1711232,
"step": 3480
},
{
"epoch": 0.4599445690906691,
"grad_norm": 0.6303845047950745,
"learning_rate": 1.3097179708430634e-06,
"loss": 0.0039,
"num_input_tokens_seen": 1713600,
"step": 3485
},
{
"epoch": 0.46060446086841755,
"grad_norm": 0.1511518806219101,
"learning_rate": 1.3075268623356214e-06,
"loss": 0.2013,
"num_input_tokens_seen": 1716224,
"step": 3490
},
{
"epoch": 0.46126435264616605,
"grad_norm": 34.9669189453125,
"learning_rate": 1.305334121974412e-06,
"loss": 0.1515,
"num_input_tokens_seen": 1718720,
"step": 3495
},
{
"epoch": 0.4619242444239145,
"grad_norm": 46.562442779541016,
"learning_rate": 1.3031397613949448e-06,
"loss": 0.1062,
"num_input_tokens_seen": 1721280,
"step": 3500
},
{
"epoch": 0.46258413620166294,
"grad_norm": 93.35523986816406,
"learning_rate": 1.3009437922413266e-06,
"loss": 0.0727,
"num_input_tokens_seen": 1723712,
"step": 3505
},
{
"epoch": 0.4632440279794114,
"grad_norm": 87.05264282226562,
"learning_rate": 1.2987462261661994e-06,
"loss": 0.0932,
"num_input_tokens_seen": 1725952,
"step": 3510
},
{
"epoch": 0.46390391975715983,
"grad_norm": 58.2432975769043,
"learning_rate": 1.2965470748306798e-06,
"loss": 0.0048,
"num_input_tokens_seen": 1728512,
"step": 3515
},
{
"epoch": 0.4645638115349083,
"grad_norm": 9.179746627807617,
"learning_rate": 1.2943463499042957e-06,
"loss": 0.094,
"num_input_tokens_seen": 1731008,
"step": 3520
},
{
"epoch": 0.4652237033126567,
"grad_norm": 0.5701031684875488,
"learning_rate": 1.2921440630649257e-06,
"loss": 0.1567,
"num_input_tokens_seen": 1733696,
"step": 3525
},
{
"epoch": 0.46588359509040517,
"grad_norm": 245.243408203125,
"learning_rate": 1.2899402259987355e-06,
"loss": 0.0778,
"num_input_tokens_seen": 1736256,
"step": 3530
},
{
"epoch": 0.4665434868681536,
"grad_norm": 0.34011900424957275,
"learning_rate": 1.287734850400118e-06,
"loss": 0.2758,
"num_input_tokens_seen": 1738944,
"step": 3535
},
{
"epoch": 0.46720337864590206,
"grad_norm": 19.37761116027832,
"learning_rate": 1.2855279479716297e-06,
"loss": 0.1846,
"num_input_tokens_seen": 1741568,
"step": 3540
},
{
"epoch": 0.4678632704236505,
"grad_norm": 0.1848049759864807,
"learning_rate": 1.283319530423929e-06,
"loss": 0.0017,
"num_input_tokens_seen": 1743808,
"step": 3545
},
{
"epoch": 0.46852316220139895,
"grad_norm": 0.10032381117343903,
"learning_rate": 1.2811096094757144e-06,
"loss": 0.0026,
"num_input_tokens_seen": 1746176,
"step": 3550
},
{
"epoch": 0.46918305397914745,
"grad_norm": 0.09643909335136414,
"learning_rate": 1.2788981968536612e-06,
"loss": 0.1779,
"num_input_tokens_seen": 1748608,
"step": 3555
},
{
"epoch": 0.4698429457568959,
"grad_norm": 0.24367760121822357,
"learning_rate": 1.2766853042923607e-06,
"loss": 0.1046,
"num_input_tokens_seen": 1751040,
"step": 3560
},
{
"epoch": 0.47050283753464434,
"grad_norm": 1.557897686958313,
"learning_rate": 1.2744709435342573e-06,
"loss": 0.0626,
"num_input_tokens_seen": 1753280,
"step": 3565
},
{
"epoch": 0.4711627293123928,
"grad_norm": 13.281846046447754,
"learning_rate": 1.2722551263295864e-06,
"loss": 0.2856,
"num_input_tokens_seen": 1755712,
"step": 3570
},
{
"epoch": 0.4718226210901412,
"grad_norm": 53.76845169067383,
"learning_rate": 1.2700378644363114e-06,
"loss": 0.1173,
"num_input_tokens_seen": 1757952,
"step": 3575
},
{
"epoch": 0.47248251286788967,
"grad_norm": 23.442663192749023,
"learning_rate": 1.2678191696200621e-06,
"loss": 0.0951,
"num_input_tokens_seen": 1760384,
"step": 3580
},
{
"epoch": 0.4731424046456381,
"grad_norm": 0.13637100160121918,
"learning_rate": 1.2655990536540717e-06,
"loss": 0.0029,
"num_input_tokens_seen": 1762944,
"step": 3585
},
{
"epoch": 0.47380229642338656,
"grad_norm": 36.00935363769531,
"learning_rate": 1.2633775283191144e-06,
"loss": 0.275,
"num_input_tokens_seen": 1765504,
"step": 3590
},
{
"epoch": 0.474462188201135,
"grad_norm": 0.4418662190437317,
"learning_rate": 1.2611546054034436e-06,
"loss": 0.0527,
"num_input_tokens_seen": 1768128,
"step": 3595
},
{
"epoch": 0.47512207997888345,
"grad_norm": 0.2341255098581314,
"learning_rate": 1.2589302967027285e-06,
"loss": 0.1554,
"num_input_tokens_seen": 1770624,
"step": 3600
},
{
"epoch": 0.4757819717566319,
"grad_norm": 23.149660110473633,
"learning_rate": 1.2567046140199914e-06,
"loss": 0.2221,
"num_input_tokens_seen": 1773248,
"step": 3605
},
{
"epoch": 0.47644186353438034,
"grad_norm": 1.1026215553283691,
"learning_rate": 1.2544775691655463e-06,
"loss": 0.0267,
"num_input_tokens_seen": 1775488,
"step": 3610
},
{
"epoch": 0.4771017553121288,
"grad_norm": 0.24849441647529602,
"learning_rate": 1.2522491739569346e-06,
"loss": 0.1329,
"num_input_tokens_seen": 1777792,
"step": 3615
},
{
"epoch": 0.4777616470898773,
"grad_norm": 1.301603078842163,
"learning_rate": 1.250019440218864e-06,
"loss": 0.0942,
"num_input_tokens_seen": 1780352,
"step": 3620
},
{
"epoch": 0.47842153886762573,
"grad_norm": 0.6911696195602417,
"learning_rate": 1.247788379783144e-06,
"loss": 0.1692,
"num_input_tokens_seen": 1783168,
"step": 3625
},
{
"epoch": 0.4790814306453742,
"grad_norm": 97.18595123291016,
"learning_rate": 1.2455560044886248e-06,
"loss": 0.0503,
"num_input_tokens_seen": 1785920,
"step": 3630
},
{
"epoch": 0.4797413224231226,
"grad_norm": 0.041064053773880005,
"learning_rate": 1.2433223261811337e-06,
"loss": 0.1104,
"num_input_tokens_seen": 1788416,
"step": 3635
},
{
"epoch": 0.48040121420087106,
"grad_norm": 0.06536306440830231,
"learning_rate": 1.2410873567134115e-06,
"loss": 0.0317,
"num_input_tokens_seen": 1790848,
"step": 3640
},
{
"epoch": 0.4810611059786195,
"grad_norm": 2.3887031078338623,
"learning_rate": 1.238851107945051e-06,
"loss": 0.0394,
"num_input_tokens_seen": 1793280,
"step": 3645
},
{
"epoch": 0.48172099775636795,
"grad_norm": 0.03385510668158531,
"learning_rate": 1.2366135917424341e-06,
"loss": 0.1043,
"num_input_tokens_seen": 1795648,
"step": 3650
},
{
"epoch": 0.4823808895341164,
"grad_norm": 23.26211929321289,
"learning_rate": 1.2343748199786665e-06,
"loss": 0.183,
"num_input_tokens_seen": 1797952,
"step": 3655
},
{
"epoch": 0.48304078131186484,
"grad_norm": 0.2056346982717514,
"learning_rate": 1.2321348045335182e-06,
"loss": 0.0865,
"num_input_tokens_seen": 1800192,
"step": 3660
},
{
"epoch": 0.4837006730896133,
"grad_norm": 0.4568115174770355,
"learning_rate": 1.2298935572933575e-06,
"loss": 0.1479,
"num_input_tokens_seen": 1802560,
"step": 3665
},
{
"epoch": 0.48436056486736173,
"grad_norm": 23.873966217041016,
"learning_rate": 1.2276510901510892e-06,
"loss": 0.1646,
"num_input_tokens_seen": 1805056,
"step": 3670
},
{
"epoch": 0.4850204566451102,
"grad_norm": 2.0380196571350098,
"learning_rate": 1.2254074150060915e-06,
"loss": 0.1443,
"num_input_tokens_seen": 1807744,
"step": 3675
},
{
"epoch": 0.4856803484228587,
"grad_norm": 56.635318756103516,
"learning_rate": 1.2231625437641535e-06,
"loss": 0.0999,
"num_input_tokens_seen": 1810368,
"step": 3680
},
{
"epoch": 0.4863402402006071,
"grad_norm": 0.2982792258262634,
"learning_rate": 1.2209164883374096e-06,
"loss": 0.0791,
"num_input_tokens_seen": 1813056,
"step": 3685
},
{
"epoch": 0.48700013197835557,
"grad_norm": 0.19904585182666779,
"learning_rate": 1.2186692606442793e-06,
"loss": 0.2265,
"num_input_tokens_seen": 1815360,
"step": 3690
},
{
"epoch": 0.487660023756104,
"grad_norm": 144.61109924316406,
"learning_rate": 1.216420872609402e-06,
"loss": 0.1958,
"num_input_tokens_seen": 1817920,
"step": 3695
},
{
"epoch": 0.48831991553385246,
"grad_norm": 12.121625900268555,
"learning_rate": 1.2141713361635739e-06,
"loss": 0.0936,
"num_input_tokens_seen": 1820288,
"step": 3700
},
{
"epoch": 0.4889798073116009,
"grad_norm": 0.04935774579644203,
"learning_rate": 1.2119206632436864e-06,
"loss": 0.157,
"num_input_tokens_seen": 1822656,
"step": 3705
},
{
"epoch": 0.48963969908934935,
"grad_norm": 0.5263445973396301,
"learning_rate": 1.209668865792661e-06,
"loss": 0.116,
"num_input_tokens_seen": 1824832,
"step": 3710
},
{
"epoch": 0.4902995908670978,
"grad_norm": 35.05288314819336,
"learning_rate": 1.207415955759385e-06,
"loss": 0.0906,
"num_input_tokens_seen": 1827200,
"step": 3715
},
{
"epoch": 0.49095948264484623,
"grad_norm": 10.884110450744629,
"learning_rate": 1.2051619450986514e-06,
"loss": 0.1443,
"num_input_tokens_seen": 1829632,
"step": 3720
},
{
"epoch": 0.4916193744225947,
"grad_norm": 1.7360846996307373,
"learning_rate": 1.2029068457710923e-06,
"loss": 0.076,
"num_input_tokens_seen": 1832192,
"step": 3725
},
{
"epoch": 0.4922792662003431,
"grad_norm": 3.593554973602295,
"learning_rate": 1.200650669743117e-06,
"loss": 0.1089,
"num_input_tokens_seen": 1834752,
"step": 3730
},
{
"epoch": 0.49293915797809157,
"grad_norm": 24.667346954345703,
"learning_rate": 1.1983934289868488e-06,
"loss": 0.0533,
"num_input_tokens_seen": 1837248,
"step": 3735
},
{
"epoch": 0.49359904975584007,
"grad_norm": 40.43445587158203,
"learning_rate": 1.1961351354800595e-06,
"loss": 0.2063,
"num_input_tokens_seen": 1839680,
"step": 3740
},
{
"epoch": 0.4942589415335885,
"grad_norm": 0.25334975123405457,
"learning_rate": 1.193875801206109e-06,
"loss": 0.1478,
"num_input_tokens_seen": 1842304,
"step": 3745
},
{
"epoch": 0.49491883331133696,
"grad_norm": 0.46043312549591064,
"learning_rate": 1.1916154381538786e-06,
"loss": 0.0398,
"num_input_tokens_seen": 1844480,
"step": 3750
},
{
"epoch": 0.4955787250890854,
"grad_norm": 0.318348228931427,
"learning_rate": 1.1893540583177083e-06,
"loss": 0.1799,
"num_input_tokens_seen": 1846912,
"step": 3755
},
{
"epoch": 0.49623861686683385,
"grad_norm": 13.051739692687988,
"learning_rate": 1.187091673697335e-06,
"loss": 0.0861,
"num_input_tokens_seen": 1849024,
"step": 3760
},
{
"epoch": 0.4968985086445823,
"grad_norm": 0.8000279068946838,
"learning_rate": 1.184828296297826e-06,
"loss": 0.0693,
"num_input_tokens_seen": 1851712,
"step": 3765
},
{
"epoch": 0.49755840042233074,
"grad_norm": 26.590360641479492,
"learning_rate": 1.182563938129518e-06,
"loss": 0.074,
"num_input_tokens_seen": 1854208,
"step": 3770
},
{
"epoch": 0.4982182922000792,
"grad_norm": 0.07655533403158188,
"learning_rate": 1.1802986112079507e-06,
"loss": 0.0972,
"num_input_tokens_seen": 1856704,
"step": 3775
},
{
"epoch": 0.4988781839778276,
"grad_norm": 2.7111520767211914,
"learning_rate": 1.1780323275538056e-06,
"loss": 0.0812,
"num_input_tokens_seen": 1858944,
"step": 3780
},
{
"epoch": 0.49953807575557607,
"grad_norm": 2.1287126541137695,
"learning_rate": 1.1757650991928393e-06,
"loss": 0.2014,
"num_input_tokens_seen": 1861696,
"step": 3785
},
{
"epoch": 0.5001979675333246,
"grad_norm": 0.28718459606170654,
"learning_rate": 1.1734969381558235e-06,
"loss": 0.3465,
"num_input_tokens_seen": 1864128,
"step": 3790
},
{
"epoch": 0.5001979675333246,
"eval_loss": 0.13253989815711975,
"eval_runtime": 7.6606,
"eval_samples_per_second": 879.171,
"eval_steps_per_second": 109.913,
"num_input_tokens_seen": 1864128,
"step": 3790
},
{
"epoch": 0.500857859311073,
"grad_norm": 0.05410047248005867,
"learning_rate": 1.1712278564784774e-06,
"loss": 0.0012,
"num_input_tokens_seen": 1866432,
"step": 3795
},
{
"epoch": 0.5015177510888215,
"grad_norm": 50.43254089355469,
"learning_rate": 1.1689578662014064e-06,
"loss": 0.071,
"num_input_tokens_seen": 1868736,
"step": 3800
},
{
"epoch": 0.5021776428665699,
"grad_norm": 10.290699005126953,
"learning_rate": 1.1666869793700362e-06,
"loss": 0.2416,
"num_input_tokens_seen": 1871360,
"step": 3805
},
{
"epoch": 0.5028375346443184,
"grad_norm": 0.025802727788686752,
"learning_rate": 1.1644152080345515e-06,
"loss": 0.0019,
"num_input_tokens_seen": 1873536,
"step": 3810
},
{
"epoch": 0.5034974264220667,
"grad_norm": 32.99125289916992,
"learning_rate": 1.1621425642498289e-06,
"loss": 0.2788,
"num_input_tokens_seen": 1875904,
"step": 3815
},
{
"epoch": 0.5041573181998152,
"grad_norm": 88.0829849243164,
"learning_rate": 1.1598690600753759e-06,
"loss": 0.2056,
"num_input_tokens_seen": 1878464,
"step": 3820
},
{
"epoch": 0.5048172099775636,
"grad_norm": 69.9671630859375,
"learning_rate": 1.1575947075752644e-06,
"loss": 0.2253,
"num_input_tokens_seen": 1880640,
"step": 3825
},
{
"epoch": 0.5054771017553121,
"grad_norm": 16.678607940673828,
"learning_rate": 1.1553195188180691e-06,
"loss": 0.1243,
"num_input_tokens_seen": 1882944,
"step": 3830
},
{
"epoch": 0.5061369935330606,
"grad_norm": 0.3082711398601532,
"learning_rate": 1.1530435058768008e-06,
"loss": 0.0629,
"num_input_tokens_seen": 1885248,
"step": 3835
},
{
"epoch": 0.506796885310809,
"grad_norm": 16.876184463500977,
"learning_rate": 1.150766680828845e-06,
"loss": 0.0576,
"num_input_tokens_seen": 1887872,
"step": 3840
},
{
"epoch": 0.5074567770885575,
"grad_norm": 11.138367652893066,
"learning_rate": 1.1484890557558955e-06,
"loss": 0.004,
"num_input_tokens_seen": 1890560,
"step": 3845
},
{
"epoch": 0.5081166688663059,
"grad_norm": 11.504974365234375,
"learning_rate": 1.146210642743892e-06,
"loss": 0.0781,
"num_input_tokens_seen": 1893056,
"step": 3850
},
{
"epoch": 0.5087765606440544,
"grad_norm": 0.10916353017091751,
"learning_rate": 1.1439314538829554e-06,
"loss": 0.0498,
"num_input_tokens_seen": 1895360,
"step": 3855
},
{
"epoch": 0.5094364524218028,
"grad_norm": 0.09748303145170212,
"learning_rate": 1.141651501267323e-06,
"loss": 0.0617,
"num_input_tokens_seen": 1897664,
"step": 3860
},
{
"epoch": 0.5100963441995513,
"grad_norm": 126.38017272949219,
"learning_rate": 1.1393707969952847e-06,
"loss": 0.1711,
"num_input_tokens_seen": 1900288,
"step": 3865
},
{
"epoch": 0.5107562359772997,
"grad_norm": 139.21932983398438,
"learning_rate": 1.13708935316912e-06,
"loss": 0.1191,
"num_input_tokens_seen": 1903040,
"step": 3870
},
{
"epoch": 0.5114161277550482,
"grad_norm": 2.1678948402404785,
"learning_rate": 1.134807181895032e-06,
"loss": 0.0025,
"num_input_tokens_seen": 1905472,
"step": 3875
},
{
"epoch": 0.5120760195327966,
"grad_norm": 75.74095916748047,
"learning_rate": 1.132524295283084e-06,
"loss": 0.1253,
"num_input_tokens_seen": 1907712,
"step": 3880
},
{
"epoch": 0.5127359113105451,
"grad_norm": 0.061001695692539215,
"learning_rate": 1.1302407054471355e-06,
"loss": 0.0096,
"num_input_tokens_seen": 1910080,
"step": 3885
},
{
"epoch": 0.5133958030882935,
"grad_norm": 64.87725067138672,
"learning_rate": 1.1279564245047767e-06,
"loss": 0.2717,
"num_input_tokens_seen": 1912512,
"step": 3890
},
{
"epoch": 0.514055694866042,
"grad_norm": 0.10021132230758667,
"learning_rate": 1.1256714645772662e-06,
"loss": 0.0696,
"num_input_tokens_seen": 1914752,
"step": 3895
},
{
"epoch": 0.5147155866437905,
"grad_norm": 0.13533316552639008,
"learning_rate": 1.1233858377894647e-06,
"loss": 0.0073,
"num_input_tokens_seen": 1917120,
"step": 3900
},
{
"epoch": 0.5153754784215389,
"grad_norm": 72.85858917236328,
"learning_rate": 1.1210995562697722e-06,
"loss": 0.0094,
"num_input_tokens_seen": 1919232,
"step": 3905
},
{
"epoch": 0.5160353701992874,
"grad_norm": 20.26717758178711,
"learning_rate": 1.1188126321500621e-06,
"loss": 0.0061,
"num_input_tokens_seen": 1921856,
"step": 3910
},
{
"epoch": 0.5166952619770357,
"grad_norm": 105.2000732421875,
"learning_rate": 1.1165250775656188e-06,
"loss": 0.1091,
"num_input_tokens_seen": 1924224,
"step": 3915
},
{
"epoch": 0.5173551537547842,
"grad_norm": 0.042006537318229675,
"learning_rate": 1.1142369046550708e-06,
"loss": 0.0258,
"num_input_tokens_seen": 1926464,
"step": 3920
},
{
"epoch": 0.5180150455325326,
"grad_norm": 0.038001008331775665,
"learning_rate": 1.1119481255603289e-06,
"loss": 0.253,
"num_input_tokens_seen": 1928896,
"step": 3925
},
{
"epoch": 0.5186749373102811,
"grad_norm": 3.7172634601593018,
"learning_rate": 1.1096587524265197e-06,
"loss": 0.0598,
"num_input_tokens_seen": 1931200,
"step": 3930
},
{
"epoch": 0.5193348290880295,
"grad_norm": 0.01534217782318592,
"learning_rate": 1.107368797401923e-06,
"loss": 0.1918,
"num_input_tokens_seen": 1933632,
"step": 3935
},
{
"epoch": 0.519994720865778,
"grad_norm": 0.24886855483055115,
"learning_rate": 1.1050782726379054e-06,
"loss": 0.0022,
"num_input_tokens_seen": 1935872,
"step": 3940
},
{
"epoch": 0.5206546126435264,
"grad_norm": 0.19433605670928955,
"learning_rate": 1.1027871902888566e-06,
"loss": 0.104,
"num_input_tokens_seen": 1938048,
"step": 3945
},
{
"epoch": 0.5213145044212749,
"grad_norm": 46.62074661254883,
"learning_rate": 1.1004955625121257e-06,
"loss": 0.059,
"num_input_tokens_seen": 1940608,
"step": 3950
},
{
"epoch": 0.5219743961990233,
"grad_norm": 4.506015777587891,
"learning_rate": 1.0982034014679561e-06,
"loss": 0.2127,
"num_input_tokens_seen": 1943040,
"step": 3955
},
{
"epoch": 0.5226342879767718,
"grad_norm": 1.7702916860580444,
"learning_rate": 1.0959107193194206e-06,
"loss": 0.279,
"num_input_tokens_seen": 1945664,
"step": 3960
},
{
"epoch": 0.5232941797545203,
"grad_norm": 0.04471131041646004,
"learning_rate": 1.0936175282323575e-06,
"loss": 0.0022,
"num_input_tokens_seen": 1948032,
"step": 3965
},
{
"epoch": 0.5239540715322687,
"grad_norm": 96.3348617553711,
"learning_rate": 1.091323840375305e-06,
"loss": 0.0235,
"num_input_tokens_seen": 1950208,
"step": 3970
},
{
"epoch": 0.5246139633100172,
"grad_norm": 46.00945281982422,
"learning_rate": 1.0890296679194378e-06,
"loss": 0.2217,
"num_input_tokens_seen": 1952896,
"step": 3975
},
{
"epoch": 0.5252738550877656,
"grad_norm": 0.07118234783411026,
"learning_rate": 1.086735023038502e-06,
"loss": 0.0466,
"num_input_tokens_seen": 1955200,
"step": 3980
},
{
"epoch": 0.5259337468655141,
"grad_norm": 0.24527551233768463,
"learning_rate": 1.0844399179087512e-06,
"loss": 0.0765,
"num_input_tokens_seen": 1957376,
"step": 3985
},
{
"epoch": 0.5265936386432625,
"grad_norm": 0.29095086455345154,
"learning_rate": 1.0821443647088802e-06,
"loss": 0.2646,
"num_input_tokens_seen": 1960064,
"step": 3990
},
{
"epoch": 0.527253530421011,
"grad_norm": 0.09518618881702423,
"learning_rate": 1.0798483756199623e-06,
"loss": 0.1166,
"num_input_tokens_seen": 1962624,
"step": 3995
},
{
"epoch": 0.5279134221987594,
"grad_norm": 0.0448361411690712,
"learning_rate": 1.0775519628253833e-06,
"loss": 0.0901,
"num_input_tokens_seen": 1965056,
"step": 4000
},
{
"epoch": 0.5285733139765079,
"grad_norm": 0.339200496673584,
"learning_rate": 1.0752551385107772e-06,
"loss": 0.1363,
"num_input_tokens_seen": 1967424,
"step": 4005
},
{
"epoch": 0.5292332057542563,
"grad_norm": 12.845752716064453,
"learning_rate": 1.0729579148639621e-06,
"loss": 0.1608,
"num_input_tokens_seen": 1969856,
"step": 4010
},
{
"epoch": 0.5298930975320048,
"grad_norm": 0.18415102362632751,
"learning_rate": 1.0706603040748747e-06,
"loss": 0.0527,
"num_input_tokens_seen": 1972544,
"step": 4015
},
{
"epoch": 0.5305529893097533,
"grad_norm": 0.05650022253394127,
"learning_rate": 1.0683623183355071e-06,
"loss": 0.0851,
"num_input_tokens_seen": 1974912,
"step": 4020
},
{
"epoch": 0.5312128810875016,
"grad_norm": 13.724897384643555,
"learning_rate": 1.0660639698398392e-06,
"loss": 0.0918,
"num_input_tokens_seen": 1977216,
"step": 4025
},
{
"epoch": 0.5318727728652501,
"grad_norm": 4.000504970550537,
"learning_rate": 1.0637652707837773e-06,
"loss": 0.069,
"num_input_tokens_seen": 1979648,
"step": 4030
},
{
"epoch": 0.5325326646429985,
"grad_norm": 63.9135627746582,
"learning_rate": 1.0614662333650876e-06,
"loss": 0.0788,
"num_input_tokens_seen": 1981888,
"step": 4035
},
{
"epoch": 0.533192556420747,
"grad_norm": 15.316259384155273,
"learning_rate": 1.0591668697833311e-06,
"loss": 0.199,
"num_input_tokens_seen": 1984448,
"step": 4040
},
{
"epoch": 0.5338524481984954,
"grad_norm": 31.211254119873047,
"learning_rate": 1.0568671922398005e-06,
"loss": 0.1948,
"num_input_tokens_seen": 1987072,
"step": 4045
},
{
"epoch": 0.5345123399762439,
"grad_norm": 0.47070229053497314,
"learning_rate": 1.054567212937454e-06,
"loss": 0.1732,
"num_input_tokens_seen": 1989632,
"step": 4050
},
{
"epoch": 0.5351722317539923,
"grad_norm": 0.44888266921043396,
"learning_rate": 1.0522669440808508e-06,
"loss": 0.0482,
"num_input_tokens_seen": 1992192,
"step": 4055
},
{
"epoch": 0.5358321235317408,
"grad_norm": 1.2094718217849731,
"learning_rate": 1.0499663978760871e-06,
"loss": 0.2351,
"num_input_tokens_seen": 1994624,
"step": 4060
},
{
"epoch": 0.5364920153094892,
"grad_norm": 9.957518577575684,
"learning_rate": 1.0476655865307308e-06,
"loss": 0.0567,
"num_input_tokens_seen": 1997056,
"step": 4065
},
{
"epoch": 0.5371519070872377,
"grad_norm": 0.34155920147895813,
"learning_rate": 1.0453645222537556e-06,
"loss": 0.0665,
"num_input_tokens_seen": 1999360,
"step": 4070
},
{
"epoch": 0.5378117988649861,
"grad_norm": 111.1448974609375,
"learning_rate": 1.0430632172554796e-06,
"loss": 0.0719,
"num_input_tokens_seen": 2001856,
"step": 4075
},
{
"epoch": 0.5384716906427346,
"grad_norm": 36.95001220703125,
"learning_rate": 1.0407616837474963e-06,
"loss": 0.1029,
"num_input_tokens_seen": 2004288,
"step": 4080
},
{
"epoch": 0.5391315824204831,
"grad_norm": 1.12558114528656,
"learning_rate": 1.038459933942612e-06,
"loss": 0.0145,
"num_input_tokens_seen": 2006976,
"step": 4085
},
{
"epoch": 0.5397914741982315,
"grad_norm": 11.313764572143555,
"learning_rate": 1.036157980054782e-06,
"loss": 0.0129,
"num_input_tokens_seen": 2009280,
"step": 4090
},
{
"epoch": 0.54045136597598,
"grad_norm": 168.36546325683594,
"learning_rate": 1.0338558342990431e-06,
"loss": 0.0985,
"num_input_tokens_seen": 2011776,
"step": 4095
},
{
"epoch": 0.5411112577537284,
"grad_norm": 0.4781351089477539,
"learning_rate": 1.0315535088914508e-06,
"loss": 0.2285,
"num_input_tokens_seen": 2014336,
"step": 4100
},
{
"epoch": 0.5417711495314769,
"grad_norm": 33.78492736816406,
"learning_rate": 1.0292510160490146e-06,
"loss": 0.1558,
"num_input_tokens_seen": 2017152,
"step": 4105
},
{
"epoch": 0.5424310413092253,
"grad_norm": 17.072744369506836,
"learning_rate": 1.0269483679896308e-06,
"loss": 0.1097,
"num_input_tokens_seen": 2019520,
"step": 4110
},
{
"epoch": 0.5430909330869738,
"grad_norm": 32.48529052734375,
"learning_rate": 1.0246455769320211e-06,
"loss": 0.164,
"num_input_tokens_seen": 2021632,
"step": 4115
},
{
"epoch": 0.5437508248647221,
"grad_norm": 1.9809108972549438,
"learning_rate": 1.0223426550956647e-06,
"loss": 0.1157,
"num_input_tokens_seen": 2023744,
"step": 4120
},
{
"epoch": 0.5444107166424706,
"grad_norm": 2.384786367416382,
"learning_rate": 1.0200396147007354e-06,
"loss": 0.06,
"num_input_tokens_seen": 2026048,
"step": 4125
},
{
"epoch": 0.545070608420219,
"grad_norm": 0.03345398232340813,
"learning_rate": 1.0177364679680367e-06,
"loss": 0.1203,
"num_input_tokens_seen": 2028352,
"step": 4130
},
{
"epoch": 0.5457305001979675,
"grad_norm": 0.08933035284280777,
"learning_rate": 1.015433227118935e-06,
"loss": 0.0494,
"num_input_tokens_seen": 2030848,
"step": 4135
},
{
"epoch": 0.5463903919757159,
"grad_norm": 0.14635981619358063,
"learning_rate": 1.0131299043752967e-06,
"loss": 0.1369,
"num_input_tokens_seen": 2033344,
"step": 4140
},
{
"epoch": 0.5470502837534644,
"grad_norm": 0.22250190377235413,
"learning_rate": 1.0108265119594233e-06,
"loss": 0.0777,
"num_input_tokens_seen": 2035584,
"step": 4145
},
{
"epoch": 0.5477101755312129,
"grad_norm": 13.305469512939453,
"learning_rate": 1.0085230620939853e-06,
"loss": 0.0407,
"num_input_tokens_seen": 2038272,
"step": 4150
},
{
"epoch": 0.5483700673089613,
"grad_norm": 11.508169174194336,
"learning_rate": 1.0062195670019583e-06,
"loss": 0.0956,
"num_input_tokens_seen": 2040768,
"step": 4155
},
{
"epoch": 0.5490299590867098,
"grad_norm": 114.46903991699219,
"learning_rate": 1.0039160389065582e-06,
"loss": 0.1461,
"num_input_tokens_seen": 2043072,
"step": 4160
},
{
"epoch": 0.5496898508644582,
"grad_norm": 9.968348503112793,
"learning_rate": 1.0016124900311755e-06,
"loss": 0.1538,
"num_input_tokens_seen": 2045824,
"step": 4165
},
{
"epoch": 0.550217764286657,
"eval_loss": 0.0976191833615303,
"eval_runtime": 7.5976,
"eval_samples_per_second": 886.459,
"eval_steps_per_second": 110.824,
"num_input_tokens_seen": 2047552,
"step": 4169
},
{
"epoch": 0.5503497426422067,
"grad_norm": 24.443077087402344,
"learning_rate": 9.99308932599311e-07,
"loss": 0.233,
"num_input_tokens_seen": 2048064,
"step": 4170
},
{
"epoch": 0.5510096344199551,
"grad_norm": 0.5319744944572449,
"learning_rate": 9.970053788345112e-07,
"loss": 0.0557,
"num_input_tokens_seen": 2050432,
"step": 4175
},
{
"epoch": 0.5516695261977036,
"grad_norm": 0.8921132683753967,
"learning_rate": 9.947018409603036e-07,
"loss": 0.0547,
"num_input_tokens_seen": 2052928,
"step": 4180
},
{
"epoch": 0.552329417975452,
"grad_norm": 0.3344038724899292,
"learning_rate": 9.923983312001304e-07,
"loss": 0.0658,
"num_input_tokens_seen": 2055424,
"step": 4185
},
{
"epoch": 0.5529893097532005,
"grad_norm": 0.5421162843704224,
"learning_rate": 9.900948617772846e-07,
"loss": 0.1874,
"num_input_tokens_seen": 2057536,
"step": 4190
},
{
"epoch": 0.5536492015309489,
"grad_norm": 43.32229995727539,
"learning_rate": 9.877914449148462e-07,
"loss": 0.1518,
"num_input_tokens_seen": 2059840,
"step": 4195
},
{
"epoch": 0.5543090933086974,
"grad_norm": 87.34823608398438,
"learning_rate": 9.854880928356157e-07,
"loss": 0.2201,
"num_input_tokens_seen": 2062656,
"step": 4200
},
{
"epoch": 0.5549689850864459,
"grad_norm": 0.3885681629180908,
"learning_rate": 9.831848177620493e-07,
"loss": 0.22,
"num_input_tokens_seen": 2064960,
"step": 4205
},
{
"epoch": 0.5556288768641943,
"grad_norm": 18.198888778686523,
"learning_rate": 9.808816319161961e-07,
"loss": 0.2685,
"num_input_tokens_seen": 2067008,
"step": 4210
},
{
"epoch": 0.5562887686419428,
"grad_norm": 0.18500889837741852,
"learning_rate": 9.785785475196298e-07,
"loss": 0.0021,
"num_input_tokens_seen": 2069696,
"step": 4215
},
{
"epoch": 0.5569486604196912,
"grad_norm": 1.4052083492279053,
"learning_rate": 9.76275576793387e-07,
"loss": 0.0054,
"num_input_tokens_seen": 2072320,
"step": 4220
},
{
"epoch": 0.5576085521974397,
"grad_norm": 1.9056949615478516,
"learning_rate": 9.739727319579007e-07,
"loss": 0.0023,
"num_input_tokens_seen": 2074752,
"step": 4225
},
{
"epoch": 0.558268443975188,
"grad_norm": 1.0958954095840454,
"learning_rate": 9.716700252329361e-07,
"loss": 0.0678,
"num_input_tokens_seen": 2077440,
"step": 4230
},
{
"epoch": 0.5589283357529365,
"grad_norm": 20.575729370117188,
"learning_rate": 9.693674688375254e-07,
"loss": 0.2046,
"num_input_tokens_seen": 2080000,
"step": 4235
},
{
"epoch": 0.5595882275306849,
"grad_norm": 0.2594149708747864,
"learning_rate": 9.67065074989903e-07,
"loss": 0.1257,
"num_input_tokens_seen": 2082560,
"step": 4240
},
{
"epoch": 0.5602481193084334,
"grad_norm": 36.21245193481445,
"learning_rate": 9.647628559074415e-07,
"loss": 0.0827,
"num_input_tokens_seen": 2084864,
"step": 4245
},
{
"epoch": 0.5609080110861818,
"grad_norm": 0.03890296071767807,
"learning_rate": 9.62460823806585e-07,
"loss": 0.1167,
"num_input_tokens_seen": 2087424,
"step": 4250
},
{
"epoch": 0.5615679028639303,
"grad_norm": 4.345874786376953,
"learning_rate": 9.601589909027857e-07,
"loss": 0.2136,
"num_input_tokens_seen": 2090048,
"step": 4255
},
{
"epoch": 0.5622277946416787,
"grad_norm": 0.06426483392715454,
"learning_rate": 9.578573694104394e-07,
"loss": 0.0795,
"num_input_tokens_seen": 2092416,
"step": 4260
},
{
"epoch": 0.5628876864194272,
"grad_norm": 5.784552097320557,
"learning_rate": 9.555559715428199e-07,
"loss": 0.0455,
"num_input_tokens_seen": 2094656,
"step": 4265
},
{
"epoch": 0.5635475781971757,
"grad_norm": 0.20891836285591125,
"learning_rate": 9.532548095120134e-07,
"loss": 0.0031,
"num_input_tokens_seen": 2097024,
"step": 4270
},
{
"epoch": 0.5642074699749241,
"grad_norm": 0.08341825008392334,
"learning_rate": 9.509538955288564e-07,
"loss": 0.0884,
"num_input_tokens_seen": 2099392,
"step": 4275
},
{
"epoch": 0.5648673617526726,
"grad_norm": 0.749411940574646,
"learning_rate": 9.486532418028672e-07,
"loss": 0.0815,
"num_input_tokens_seen": 2102016,
"step": 4280
},
{
"epoch": 0.565527253530421,
"grad_norm": 25.93520164489746,
"learning_rate": 9.463528605421844e-07,
"loss": 0.117,
"num_input_tokens_seen": 2104320,
"step": 4285
},
{
"epoch": 0.5661871453081695,
"grad_norm": 45.35911178588867,
"learning_rate": 9.440527639535004e-07,
"loss": 0.0795,
"num_input_tokens_seen": 2107136,
"step": 4290
},
{
"epoch": 0.5668470370859179,
"grad_norm": 0.20163391530513763,
"learning_rate": 9.417529642419971e-07,
"loss": 0.0935,
"num_input_tokens_seen": 2109888,
"step": 4295
},
{
"epoch": 0.5675069288636664,
"grad_norm": 24.672039031982422,
"learning_rate": 9.394534736112815e-07,
"loss": 0.1225,
"num_input_tokens_seen": 2112192,
"step": 4300
},
{
"epoch": 0.5681668206414148,
"grad_norm": 0.07875992357730865,
"learning_rate": 9.371543042633192e-07,
"loss": 0.1277,
"num_input_tokens_seen": 2114752,
"step": 4305
},
{
"epoch": 0.5688267124191633,
"grad_norm": 0.11948826909065247,
"learning_rate": 9.348554683983722e-07,
"loss": 0.1616,
"num_input_tokens_seen": 2117184,
"step": 4310
},
{
"epoch": 0.5694866041969117,
"grad_norm": 0.17669005692005157,
"learning_rate": 9.325569782149323e-07,
"loss": 0.0485,
"num_input_tokens_seen": 2119552,
"step": 4315
},
{
"epoch": 0.5701464959746602,
"grad_norm": 18.713947296142578,
"learning_rate": 9.302588459096574e-07,
"loss": 0.0897,
"num_input_tokens_seen": 2121920,
"step": 4320
},
{
"epoch": 0.5708063877524086,
"grad_norm": 8.844649314880371,
"learning_rate": 9.279610836773064e-07,
"loss": 0.1948,
"num_input_tokens_seen": 2124096,
"step": 4325
},
{
"epoch": 0.571466279530157,
"grad_norm": 62.913169860839844,
"learning_rate": 9.256637037106735e-07,
"loss": 0.0979,
"num_input_tokens_seen": 2126528,
"step": 4330
},
{
"epoch": 0.5721261713079056,
"grad_norm": 35.835323333740234,
"learning_rate": 9.233667182005259e-07,
"loss": 0.0585,
"num_input_tokens_seen": 2128576,
"step": 4335
},
{
"epoch": 0.5727860630856539,
"grad_norm": 236.8058319091797,
"learning_rate": 9.210701393355361e-07,
"loss": 0.1142,
"num_input_tokens_seen": 2130688,
"step": 4340
},
{
"epoch": 0.5734459548634024,
"grad_norm": 0.6673513650894165,
"learning_rate": 9.187739793022198e-07,
"loss": 0.1147,
"num_input_tokens_seen": 2133312,
"step": 4345
},
{
"epoch": 0.5741058466411508,
"grad_norm": 0.05369502305984497,
"learning_rate": 9.164782502848702e-07,
"loss": 0.0315,
"num_input_tokens_seen": 2135680,
"step": 4350
},
{
"epoch": 0.5747657384188993,
"grad_norm": 0.035501688718795776,
"learning_rate": 9.141829644654936e-07,
"loss": 0.2153,
"num_input_tokens_seen": 2138112,
"step": 4355
},
{
"epoch": 0.5754256301966477,
"grad_norm": 7.459763526916504,
"learning_rate": 9.118881340237432e-07,
"loss": 0.3872,
"num_input_tokens_seen": 2140352,
"step": 4360
},
{
"epoch": 0.5760855219743962,
"grad_norm": 0.08102209866046906,
"learning_rate": 9.095937711368573e-07,
"loss": 0.0637,
"num_input_tokens_seen": 2143040,
"step": 4365
},
{
"epoch": 0.5767454137521446,
"grad_norm": 0.06749647855758667,
"learning_rate": 9.072998879795923e-07,
"loss": 0.1285,
"num_input_tokens_seen": 2145280,
"step": 4370
},
{
"epoch": 0.5774053055298931,
"grad_norm": 51.86709976196289,
"learning_rate": 9.050064967241596e-07,
"loss": 0.0807,
"num_input_tokens_seen": 2147904,
"step": 4375
},
{
"epoch": 0.5780651973076415,
"grad_norm": 0.10375242680311203,
"learning_rate": 9.027136095401598e-07,
"loss": 0.0728,
"num_input_tokens_seen": 2150400,
"step": 4380
},
{
"epoch": 0.57872508908539,
"grad_norm": 0.2877858281135559,
"learning_rate": 9.004212385945187e-07,
"loss": 0.1274,
"num_input_tokens_seen": 2153088,
"step": 4385
},
{
"epoch": 0.5793849808631385,
"grad_norm": 0.05926657095551491,
"learning_rate": 8.981293960514233e-07,
"loss": 0.0495,
"num_input_tokens_seen": 2155776,
"step": 4390
},
{
"epoch": 0.5800448726408869,
"grad_norm": 1.292005181312561,
"learning_rate": 8.958380940722564e-07,
"loss": 0.1366,
"num_input_tokens_seen": 2158400,
"step": 4395
},
{
"epoch": 0.5807047644186354,
"grad_norm": 0.3705070912837982,
"learning_rate": 8.935473448155326e-07,
"loss": 0.0731,
"num_input_tokens_seen": 2160704,
"step": 4400
},
{
"epoch": 0.5813646561963838,
"grad_norm": 26.712739944458008,
"learning_rate": 8.912571604368324e-07,
"loss": 0.0423,
"num_input_tokens_seen": 2163200,
"step": 4405
},
{
"epoch": 0.5820245479741323,
"grad_norm": 68.38367462158203,
"learning_rate": 8.889675530887404e-07,
"loss": 0.1252,
"num_input_tokens_seen": 2165376,
"step": 4410
},
{
"epoch": 0.5826844397518807,
"grad_norm": 0.06487785279750824,
"learning_rate": 8.866785349207786e-07,
"loss": 0.131,
"num_input_tokens_seen": 2167808,
"step": 4415
},
{
"epoch": 0.5833443315296292,
"grad_norm": 15.265974044799805,
"learning_rate": 8.843901180793423e-07,
"loss": 0.1223,
"num_input_tokens_seen": 2170112,
"step": 4420
},
{
"epoch": 0.5840042233073776,
"grad_norm": 1.6116943359375,
"learning_rate": 8.821023147076362e-07,
"loss": 0.001,
"num_input_tokens_seen": 2172480,
"step": 4425
},
{
"epoch": 0.5846641150851261,
"grad_norm": 4.275770664215088,
"learning_rate": 8.798151369456098e-07,
"loss": 0.0822,
"num_input_tokens_seen": 2175104,
"step": 4430
},
{
"epoch": 0.5853240068628744,
"grad_norm": 12.192449569702148,
"learning_rate": 8.775285969298931e-07,
"loss": 0.0803,
"num_input_tokens_seen": 2177280,
"step": 4435
},
{
"epoch": 0.585983898640623,
"grad_norm": 0.0718933716416359,
"learning_rate": 8.752427067937312e-07,
"loss": 0.0628,
"num_input_tokens_seen": 2179776,
"step": 4440
},
{
"epoch": 0.5866437904183713,
"grad_norm": 0.020002318546175957,
"learning_rate": 8.729574786669214e-07,
"loss": 0.0845,
"num_input_tokens_seen": 2182400,
"step": 4445
},
{
"epoch": 0.5873036821961198,
"grad_norm": 0.39394357800483704,
"learning_rate": 8.706729246757477e-07,
"loss": 0.06,
"num_input_tokens_seen": 2185088,
"step": 4450
},
{
"epoch": 0.5879635739738683,
"grad_norm": 1.8858518600463867,
"learning_rate": 8.683890569429173e-07,
"loss": 0.0725,
"num_input_tokens_seen": 2187776,
"step": 4455
},
{
"epoch": 0.5886234657516167,
"grad_norm": 0.07854912430047989,
"learning_rate": 8.661058875874956e-07,
"loss": 0.0027,
"num_input_tokens_seen": 2190016,
"step": 4460
},
{
"epoch": 0.5892833575293652,
"grad_norm": 0.09435324370861053,
"learning_rate": 8.638234287248423e-07,
"loss": 0.0013,
"num_input_tokens_seen": 2192320,
"step": 4465
},
{
"epoch": 0.5899432493071136,
"grad_norm": 44.07099533081055,
"learning_rate": 8.615416924665464e-07,
"loss": 0.0578,
"num_input_tokens_seen": 2194752,
"step": 4470
},
{
"epoch": 0.5906031410848621,
"grad_norm": 0.29922375082969666,
"learning_rate": 8.592606909203629e-07,
"loss": 0.0962,
"num_input_tokens_seen": 2197056,
"step": 4475
},
{
"epoch": 0.5912630328626105,
"grad_norm": 0.052084218710660934,
"learning_rate": 8.569804361901485e-07,
"loss": 0.0401,
"num_input_tokens_seen": 2199296,
"step": 4480
},
{
"epoch": 0.591922924640359,
"grad_norm": 59.697113037109375,
"learning_rate": 8.547009403757963e-07,
"loss": 0.4233,
"num_input_tokens_seen": 2201664,
"step": 4485
},
{
"epoch": 0.5925828164181074,
"grad_norm": 16.623720169067383,
"learning_rate": 8.524222155731731e-07,
"loss": 0.1601,
"num_input_tokens_seen": 2204288,
"step": 4490
},
{
"epoch": 0.5932427081958559,
"grad_norm": 82.14921569824219,
"learning_rate": 8.501442738740538e-07,
"loss": 0.1259,
"num_input_tokens_seen": 2206528,
"step": 4495
},
{
"epoch": 0.5939025999736043,
"grad_norm": 0.7616731524467468,
"learning_rate": 8.47867127366058e-07,
"loss": 0.0636,
"num_input_tokens_seen": 2209024,
"step": 4500
},
{
"epoch": 0.5945624917513528,
"grad_norm": 0.1041426807641983,
"learning_rate": 8.455907881325858e-07,
"loss": 0.0027,
"num_input_tokens_seen": 2211584,
"step": 4505
},
{
"epoch": 0.5952223835291012,
"grad_norm": 1.8390711545944214,
"learning_rate": 8.433152682527533e-07,
"loss": 0.1052,
"num_input_tokens_seen": 2213952,
"step": 4510
},
{
"epoch": 0.5958822753068497,
"grad_norm": 0.08113599568605423,
"learning_rate": 8.410405798013298e-07,
"loss": 0.0747,
"num_input_tokens_seen": 2216192,
"step": 4515
},
{
"epoch": 0.5965421670845982,
"grad_norm": 16.143348693847656,
"learning_rate": 8.387667348486712e-07,
"loss": 0.0035,
"num_input_tokens_seen": 2218688,
"step": 4520
},
{
"epoch": 0.5972020588623466,
"grad_norm": 135.14500427246094,
"learning_rate": 8.364937454606585e-07,
"loss": 0.1296,
"num_input_tokens_seen": 2220928,
"step": 4525
},
{
"epoch": 0.5978619506400951,
"grad_norm": 12.444659233093262,
"learning_rate": 8.342216236986329e-07,
"loss": 0.0014,
"num_input_tokens_seen": 2223360,
"step": 4530
},
{
"epoch": 0.5985218424178435,
"grad_norm": 0.052838534116744995,
"learning_rate": 8.319503816193305e-07,
"loss": 0.1463,
"num_input_tokens_seen": 2225792,
"step": 4535
},
{
"epoch": 0.599181734195592,
"grad_norm": 29.65154457092285,
"learning_rate": 8.296800312748206e-07,
"loss": 0.1496,
"num_input_tokens_seen": 2228288,
"step": 4540
},
{
"epoch": 0.5998416259733403,
"grad_norm": 1.4917051792144775,
"learning_rate": 8.274105847124404e-07,
"loss": 0.1911,
"num_input_tokens_seen": 2230848,
"step": 4545
},
{
"epoch": 0.6002375610399895,
"eval_loss": 0.11496574431657791,
"eval_runtime": 7.6571,
"eval_samples_per_second": 879.582,
"eval_steps_per_second": 109.964,
"num_input_tokens_seen": 2232448,
"step": 4548
},
{
"epoch": 0.6005015177510888,
"grad_norm": 35.08987808227539,
"learning_rate": 8.251420539747311e-07,
"loss": 0.1187,
"num_input_tokens_seen": 2233472,
"step": 4550
},
{
"epoch": 0.6011614095288372,
"grad_norm": 0.22071610391139984,
"learning_rate": 8.228744510993742e-07,
"loss": 0.1799,
"num_input_tokens_seen": 2236096,
"step": 4555
},
{
"epoch": 0.6018213013065857,
"grad_norm": 0.21558649837970734,
"learning_rate": 8.206077881191274e-07,
"loss": 0.0908,
"num_input_tokens_seen": 2238720,
"step": 4560
},
{
"epoch": 0.6024811930843341,
"grad_norm": 24.909807205200195,
"learning_rate": 8.183420770617614e-07,
"loss": 0.1394,
"num_input_tokens_seen": 2241216,
"step": 4565
},
{
"epoch": 0.6031410848620826,
"grad_norm": 2.2823469638824463,
"learning_rate": 8.160773299499955e-07,
"loss": 0.0631,
"num_input_tokens_seen": 2243648,
"step": 4570
},
{
"epoch": 0.6038009766398311,
"grad_norm": 1.838703989982605,
"learning_rate": 8.138135588014339e-07,
"loss": 0.0464,
"num_input_tokens_seen": 2246080,
"step": 4575
},
{
"epoch": 0.6044608684175795,
"grad_norm": 22.0809268951416,
"learning_rate": 8.115507756285017e-07,
"loss": 0.0632,
"num_input_tokens_seen": 2248256,
"step": 4580
},
{
"epoch": 0.605120760195328,
"grad_norm": 0.09841513633728027,
"learning_rate": 8.092889924383819e-07,
"loss": 0.1037,
"num_input_tokens_seen": 2250688,
"step": 4585
},
{
"epoch": 0.6057806519730764,
"grad_norm": 3.10756516456604,
"learning_rate": 8.070282212329508e-07,
"loss": 0.0775,
"num_input_tokens_seen": 2253120,
"step": 4590
},
{
"epoch": 0.6064405437508249,
"grad_norm": 170.31297302246094,
"learning_rate": 8.047684740087156e-07,
"loss": 0.22,
"num_input_tokens_seen": 2255360,
"step": 4595
},
{
"epoch": 0.6071004355285733,
"grad_norm": 46.11749267578125,
"learning_rate": 8.025097627567481e-07,
"loss": 0.1834,
"num_input_tokens_seen": 2257728,
"step": 4600
},
{
"epoch": 0.6077603273063218,
"grad_norm": 0.045084141194820404,
"learning_rate": 8.002520994626247e-07,
"loss": 0.0712,
"num_input_tokens_seen": 2260224,
"step": 4605
},
{
"epoch": 0.6084202190840702,
"grad_norm": 0.1346772313117981,
"learning_rate": 7.979954961063596e-07,
"loss": 0.0733,
"num_input_tokens_seen": 2262912,
"step": 4610
},
{
"epoch": 0.6090801108618187,
"grad_norm": 18.890954971313477,
"learning_rate": 7.957399646623436e-07,
"loss": 0.3433,
"num_input_tokens_seen": 2265152,
"step": 4615
},
{
"epoch": 0.6097400026395671,
"grad_norm": 0.26090413331985474,
"learning_rate": 7.934855170992788e-07,
"loss": 0.042,
"num_input_tokens_seen": 2267968,
"step": 4620
},
{
"epoch": 0.6103998944173156,
"grad_norm": 0.09057987481355667,
"learning_rate": 7.912321653801161e-07,
"loss": 0.0468,
"num_input_tokens_seen": 2270336,
"step": 4625
},
{
"epoch": 0.611059786195064,
"grad_norm": 19.550853729248047,
"learning_rate": 7.889799214619919e-07,
"loss": 0.1865,
"num_input_tokens_seen": 2273024,
"step": 4630
},
{
"epoch": 0.6117196779728125,
"grad_norm": 0.048422493040561676,
"learning_rate": 7.867287972961629e-07,
"loss": 0.0821,
"num_input_tokens_seen": 2275264,
"step": 4635
},
{
"epoch": 0.612379569750561,
"grad_norm": 0.2724073529243469,
"learning_rate": 7.844788048279453e-07,
"loss": 0.0704,
"num_input_tokens_seen": 2277888,
"step": 4640
},
{
"epoch": 0.6130394615283093,
"grad_norm": 0.041433185338974,
"learning_rate": 7.822299559966494e-07,
"loss": 0.0007,
"num_input_tokens_seen": 2280320,
"step": 4645
},
{
"epoch": 0.6136993533060578,
"grad_norm": 0.03420973941683769,
"learning_rate": 7.799822627355171e-07,
"loss": 0.0591,
"num_input_tokens_seen": 2282560,
"step": 4650
},
{
"epoch": 0.6143592450838062,
"grad_norm": 0.13017447292804718,
"learning_rate": 7.77735736971659e-07,
"loss": 0.0842,
"num_input_tokens_seen": 2284864,
"step": 4655
},
{
"epoch": 0.6150191368615547,
"grad_norm": 0.07092246413230896,
"learning_rate": 7.754903906259889e-07,
"loss": 0.1524,
"num_input_tokens_seen": 2287168,
"step": 4660
},
{
"epoch": 0.6156790286393031,
"grad_norm": 166.20501708984375,
"learning_rate": 7.732462356131637e-07,
"loss": 0.059,
"num_input_tokens_seen": 2289600,
"step": 4665
},
{
"epoch": 0.6163389204170516,
"grad_norm": 0.29940545558929443,
"learning_rate": 7.710032838415179e-07,
"loss": 0.0896,
"num_input_tokens_seen": 2292160,
"step": 4670
},
{
"epoch": 0.6169988121948,
"grad_norm": 0.06602998822927475,
"learning_rate": 7.687615472130016e-07,
"loss": 0.155,
"num_input_tokens_seen": 2294912,
"step": 4675
},
{
"epoch": 0.6176587039725485,
"grad_norm": 0.13479486107826233,
"learning_rate": 7.665210376231165e-07,
"loss": 0.1138,
"num_input_tokens_seen": 2297024,
"step": 4680
},
{
"epoch": 0.6183185957502969,
"grad_norm": 7.841771125793457,
"learning_rate": 7.642817669608536e-07,
"loss": 0.1342,
"num_input_tokens_seen": 2299456,
"step": 4685
},
{
"epoch": 0.6189784875280454,
"grad_norm": 0.10149969160556793,
"learning_rate": 7.62043747108629e-07,
"loss": 0.0194,
"num_input_tokens_seen": 2301568,
"step": 4690
},
{
"epoch": 0.6196383793057938,
"grad_norm": 122.03047943115234,
"learning_rate": 7.598069899422221e-07,
"loss": 0.1988,
"num_input_tokens_seen": 2303936,
"step": 4695
},
{
"epoch": 0.6202982710835423,
"grad_norm": 8.390487670898438,
"learning_rate": 7.575715073307119e-07,
"loss": 0.2107,
"num_input_tokens_seen": 2306176,
"step": 4700
},
{
"epoch": 0.6209581628612908,
"grad_norm": 0.41683492064476013,
"learning_rate": 7.55337311136414e-07,
"loss": 0.0995,
"num_input_tokens_seen": 2308736,
"step": 4705
},
{
"epoch": 0.6216180546390392,
"grad_norm": 14.42542839050293,
"learning_rate": 7.531044132148183e-07,
"loss": 0.1775,
"num_input_tokens_seen": 2311104,
"step": 4710
},
{
"epoch": 0.6222779464167877,
"grad_norm": 16.42903709411621,
"learning_rate": 7.508728254145245e-07,
"loss": 0.0493,
"num_input_tokens_seen": 2313536,
"step": 4715
},
{
"epoch": 0.6229378381945361,
"grad_norm": 26.883657455444336,
"learning_rate": 7.486425595771817e-07,
"loss": 0.117,
"num_input_tokens_seen": 2316032,
"step": 4720
},
{
"epoch": 0.6235977299722846,
"grad_norm": 8.713482856750488,
"learning_rate": 7.464136275374223e-07,
"loss": 0.1853,
"num_input_tokens_seen": 2318656,
"step": 4725
},
{
"epoch": 0.624257621750033,
"grad_norm": 10.561690330505371,
"learning_rate": 7.441860411228029e-07,
"loss": 0.1311,
"num_input_tokens_seen": 2321216,
"step": 4730
},
{
"epoch": 0.6249175135277815,
"grad_norm": 39.474449157714844,
"learning_rate": 7.419598121537387e-07,
"loss": 0.1273,
"num_input_tokens_seen": 2323648,
"step": 4735
},
{
"epoch": 0.6255774053055299,
"grad_norm": 18.26643943786621,
"learning_rate": 7.397349524434424e-07,
"loss": 0.1446,
"num_input_tokens_seen": 2326080,
"step": 4740
},
{
"epoch": 0.6262372970832784,
"grad_norm": 8.37359619140625,
"learning_rate": 7.375114737978605e-07,
"loss": 0.0544,
"num_input_tokens_seen": 2328512,
"step": 4745
},
{
"epoch": 0.6268971888610267,
"grad_norm": 4.634432315826416,
"learning_rate": 7.352893880156106e-07,
"loss": 0.1048,
"num_input_tokens_seen": 2331008,
"step": 4750
},
{
"epoch": 0.6275570806387752,
"grad_norm": 1.4395649433135986,
"learning_rate": 7.330687068879202e-07,
"loss": 0.0516,
"num_input_tokens_seen": 2333376,
"step": 4755
},
{
"epoch": 0.6282169724165237,
"grad_norm": 135.29498291015625,
"learning_rate": 7.308494421985626e-07,
"loss": 0.1411,
"num_input_tokens_seen": 2335872,
"step": 4760
},
{
"epoch": 0.6288768641942721,
"grad_norm": 0.25262773036956787,
"learning_rate": 7.286316057237951e-07,
"loss": 0.0029,
"num_input_tokens_seen": 2338432,
"step": 4765
},
{
"epoch": 0.6295367559720206,
"grad_norm": 20.409406661987305,
"learning_rate": 7.264152092322963e-07,
"loss": 0.1567,
"num_input_tokens_seen": 2340928,
"step": 4770
},
{
"epoch": 0.630196647749769,
"grad_norm": 0.3037130832672119,
"learning_rate": 7.242002644851035e-07,
"loss": 0.0441,
"num_input_tokens_seen": 2343680,
"step": 4775
},
{
"epoch": 0.6308565395275175,
"grad_norm": 0.19966571033000946,
"learning_rate": 7.219867832355508e-07,
"loss": 0.0673,
"num_input_tokens_seen": 2346240,
"step": 4780
},
{
"epoch": 0.6315164313052659,
"grad_norm": 0.16028675436973572,
"learning_rate": 7.197747772292071e-07,
"loss": 0.0718,
"num_input_tokens_seen": 2348544,
"step": 4785
},
{
"epoch": 0.6321763230830144,
"grad_norm": 0.05191419646143913,
"learning_rate": 7.17564258203811e-07,
"loss": 0.2532,
"num_input_tokens_seen": 2350976,
"step": 4790
},
{
"epoch": 0.6328362148607628,
"grad_norm": 21.26822280883789,
"learning_rate": 7.153552378892128e-07,
"loss": 0.1214,
"num_input_tokens_seen": 2353216,
"step": 4795
},
{
"epoch": 0.6334961066385113,
"grad_norm": 0.49603065848350525,
"learning_rate": 7.131477280073091e-07,
"loss": 0.1191,
"num_input_tokens_seen": 2355584,
"step": 4800
},
{
"epoch": 0.6341559984162597,
"grad_norm": 0.12939685583114624,
"learning_rate": 7.109417402719813e-07,
"loss": 0.1127,
"num_input_tokens_seen": 2358144,
"step": 4805
},
{
"epoch": 0.6348158901940082,
"grad_norm": 14.447181701660156,
"learning_rate": 7.087372863890346e-07,
"loss": 0.0543,
"num_input_tokens_seen": 2360896,
"step": 4810
},
{
"epoch": 0.6354757819717566,
"grad_norm": 25.439424514770508,
"learning_rate": 7.065343780561344e-07,
"loss": 0.2546,
"num_input_tokens_seen": 2363264,
"step": 4815
},
{
"epoch": 0.6361356737495051,
"grad_norm": 10.288759231567383,
"learning_rate": 7.043330269627448e-07,
"loss": 0.0676,
"num_input_tokens_seen": 2365632,
"step": 4820
},
{
"epoch": 0.6367955655272536,
"grad_norm": 0.07639932632446289,
"learning_rate": 7.021332447900671e-07,
"loss": 0.0018,
"num_input_tokens_seen": 2368000,
"step": 4825
},
{
"epoch": 0.637455457305002,
"grad_norm": 63.019187927246094,
"learning_rate": 6.999350432109766e-07,
"loss": 0.1462,
"num_input_tokens_seen": 2370560,
"step": 4830
},
{
"epoch": 0.6381153490827505,
"grad_norm": 0.08439631760120392,
"learning_rate": 6.977384338899617e-07,
"loss": 0.001,
"num_input_tokens_seen": 2373120,
"step": 4835
},
{
"epoch": 0.6387752408604989,
"grad_norm": 0.06181376054883003,
"learning_rate": 6.955434284830619e-07,
"loss": 0.0052,
"num_input_tokens_seen": 2375872,
"step": 4840
},
{
"epoch": 0.6394351326382474,
"grad_norm": 0.05570792779326439,
"learning_rate": 6.933500386378056e-07,
"loss": 0.2037,
"num_input_tokens_seen": 2378432,
"step": 4845
},
{
"epoch": 0.6400950244159958,
"grad_norm": 50.27269744873047,
"learning_rate": 6.911582759931482e-07,
"loss": 0.1581,
"num_input_tokens_seen": 2380800,
"step": 4850
},
{
"epoch": 0.6407549161937443,
"grad_norm": 0.039350103586912155,
"learning_rate": 6.889681521794109e-07,
"loss": 0.2158,
"num_input_tokens_seen": 2383744,
"step": 4855
},
{
"epoch": 0.6414148079714926,
"grad_norm": 11.155346870422363,
"learning_rate": 6.867796788182181e-07,
"loss": 0.0894,
"num_input_tokens_seen": 2386112,
"step": 4860
},
{
"epoch": 0.6420746997492411,
"grad_norm": 8.643911361694336,
"learning_rate": 6.845928675224366e-07,
"loss": 0.1499,
"num_input_tokens_seen": 2388736,
"step": 4865
},
{
"epoch": 0.6427345915269895,
"grad_norm": 0.24349497258663177,
"learning_rate": 6.82407729896114e-07,
"loss": 0.0662,
"num_input_tokens_seen": 2391104,
"step": 4870
},
{
"epoch": 0.643394483304738,
"grad_norm": 15.820056915283203,
"learning_rate": 6.802242775344163e-07,
"loss": 0.0747,
"num_input_tokens_seen": 2393728,
"step": 4875
},
{
"epoch": 0.6440543750824864,
"grad_norm": 0.12001825124025345,
"learning_rate": 6.780425220235674e-07,
"loss": 0.1309,
"num_input_tokens_seen": 2396480,
"step": 4880
},
{
"epoch": 0.6447142668602349,
"grad_norm": 0.08038333803415298,
"learning_rate": 6.758624749407859e-07,
"loss": 0.008,
"num_input_tokens_seen": 2399104,
"step": 4885
},
{
"epoch": 0.6453741586379834,
"grad_norm": 15.686113357543945,
"learning_rate": 6.736841478542264e-07,
"loss": 0.0813,
"num_input_tokens_seen": 2401664,
"step": 4890
},
{
"epoch": 0.6460340504157318,
"grad_norm": 0.3630061447620392,
"learning_rate": 6.715075523229151e-07,
"loss": 0.0084,
"num_input_tokens_seen": 2404160,
"step": 4895
},
{
"epoch": 0.6466939421934803,
"grad_norm": 29.911376953125,
"learning_rate": 6.693326998966909e-07,
"loss": 0.129,
"num_input_tokens_seen": 2406592,
"step": 4900
},
{
"epoch": 0.6473538339712287,
"grad_norm": 0.05508751794695854,
"learning_rate": 6.671596021161431e-07,
"loss": 0.0684,
"num_input_tokens_seen": 2409088,
"step": 4905
},
{
"epoch": 0.6480137257489772,
"grad_norm": 0.06392798572778702,
"learning_rate": 6.649882705125494e-07,
"loss": 0.0965,
"num_input_tokens_seen": 2411584,
"step": 4910
},
{
"epoch": 0.6486736175267256,
"grad_norm": 0.36957481503486633,
"learning_rate": 6.628187166078163e-07,
"loss": 0.4483,
"num_input_tokens_seen": 2414400,
"step": 4915
},
{
"epoch": 0.6493335093044741,
"grad_norm": 18.36041259765625,
"learning_rate": 6.606509519144166e-07,
"loss": 0.0583,
"num_input_tokens_seen": 2416640,
"step": 4920
},
{
"epoch": 0.6499934010822225,
"grad_norm": 61.96574783325195,
"learning_rate": 6.584849879353289e-07,
"loss": 0.1499,
"num_input_tokens_seen": 2419136,
"step": 4925
},
{
"epoch": 0.6502573577933219,
"eval_loss": 0.09844312816858292,
"eval_runtime": 7.5167,
"eval_samples_per_second": 896.002,
"eval_steps_per_second": 112.017,
"num_input_tokens_seen": 2420096,
"step": 4927
},
{
"epoch": 0.650653292859971,
"grad_norm": 0.3677075207233429,
"learning_rate": 6.563208361639772e-07,
"loss": 0.0307,
"num_input_tokens_seen": 2421440,
"step": 4930
},
{
"epoch": 0.6513131846377194,
"grad_norm": 0.08293258398771286,
"learning_rate": 6.541585080841687e-07,
"loss": 0.0015,
"num_input_tokens_seen": 2424000,
"step": 4935
},
{
"epoch": 0.6519730764154679,
"grad_norm": 88.83380126953125,
"learning_rate": 6.519980151700332e-07,
"loss": 0.0999,
"num_input_tokens_seen": 2426240,
"step": 4940
},
{
"epoch": 0.6526329681932164,
"grad_norm": 0.1625138372182846,
"learning_rate": 6.498393688859629e-07,
"loss": 0.0789,
"num_input_tokens_seen": 2428864,
"step": 4945
},
{
"epoch": 0.6532928599709648,
"grad_norm": 0.05900685489177704,
"learning_rate": 6.47682580686551e-07,
"loss": 0.0011,
"num_input_tokens_seen": 2431296,
"step": 4950
},
{
"epoch": 0.6539527517487133,
"grad_norm": 0.054225701838731766,
"learning_rate": 6.455276620165307e-07,
"loss": 0.002,
"num_input_tokens_seen": 2433984,
"step": 4955
},
{
"epoch": 0.6546126435264616,
"grad_norm": 0.02803809382021427,
"learning_rate": 6.433746243107152e-07,
"loss": 0.4195,
"num_input_tokens_seen": 2436224,
"step": 4960
},
{
"epoch": 0.6552725353042101,
"grad_norm": 0.09517721086740494,
"learning_rate": 6.412234789939359e-07,
"loss": 0.229,
"num_input_tokens_seen": 2438720,
"step": 4965
},
{
"epoch": 0.6559324270819585,
"grad_norm": 0.13722281157970428,
"learning_rate": 6.390742374809832e-07,
"loss": 0.0818,
"num_input_tokens_seen": 2440960,
"step": 4970
},
{
"epoch": 0.656592318859707,
"grad_norm": 0.6646612286567688,
"learning_rate": 6.369269111765454e-07,
"loss": 0.0417,
"num_input_tokens_seen": 2443328,
"step": 4975
},
{
"epoch": 0.6572522106374554,
"grad_norm": 0.05688225477933884,
"learning_rate": 6.347815114751465e-07,
"loss": 0.1413,
"num_input_tokens_seen": 2445952,
"step": 4980
},
{
"epoch": 0.6579121024152039,
"grad_norm": 0.027482135221362114,
"learning_rate": 6.326380497610886e-07,
"loss": 0.1102,
"num_input_tokens_seen": 2448576,
"step": 4985
},
{
"epoch": 0.6585719941929523,
"grad_norm": 52.316715240478516,
"learning_rate": 6.304965374083899e-07,
"loss": 0.323,
"num_input_tokens_seen": 2451136,
"step": 4990
},
{
"epoch": 0.6592318859707008,
"grad_norm": 0.18591034412384033,
"learning_rate": 6.283569857807245e-07,
"loss": 0.0022,
"num_input_tokens_seen": 2453632,
"step": 4995
},
{
"epoch": 0.6598917777484492,
"grad_norm": 0.1707799881696701,
"learning_rate": 6.262194062313615e-07,
"loss": 0.0082,
"num_input_tokens_seen": 2456192,
"step": 5000
},
{
"epoch": 0.6605516695261977,
"grad_norm": 0.05098792165517807,
"learning_rate": 6.240838101031063e-07,
"loss": 0.0012,
"num_input_tokens_seen": 2458624,
"step": 5005
},
{
"epoch": 0.6612115613039462,
"grad_norm": 0.10480757057666779,
"learning_rate": 6.21950208728239e-07,
"loss": 0.134,
"num_input_tokens_seen": 2460928,
"step": 5010
},
{
"epoch": 0.6618714530816946,
"grad_norm": 0.10895920544862747,
"learning_rate": 6.198186134284554e-07,
"loss": 0.1085,
"num_input_tokens_seen": 2463552,
"step": 5015
},
{
"epoch": 0.6625313448594431,
"grad_norm": 25.51168441772461,
"learning_rate": 6.176890355148049e-07,
"loss": 0.0561,
"num_input_tokens_seen": 2465856,
"step": 5020
},
{
"epoch": 0.6631912366371915,
"grad_norm": 3.873609781265259,
"learning_rate": 6.155614862876335e-07,
"loss": 0.0902,
"num_input_tokens_seen": 2468288,
"step": 5025
},
{
"epoch": 0.66385112841494,
"grad_norm": 75.29798889160156,
"learning_rate": 6.134359770365214e-07,
"loss": 0.1482,
"num_input_tokens_seen": 2470912,
"step": 5030
},
{
"epoch": 0.6645110201926884,
"grad_norm": 0.2568621039390564,
"learning_rate": 6.11312519040224e-07,
"loss": 0.109,
"num_input_tokens_seen": 2473536,
"step": 5035
},
{
"epoch": 0.6651709119704369,
"grad_norm": 0.05576321855187416,
"learning_rate": 6.091911235666125e-07,
"loss": 0.0013,
"num_input_tokens_seen": 2476032,
"step": 5040
},
{
"epoch": 0.6658308037481853,
"grad_norm": 0.13206513226032257,
"learning_rate": 6.070718018726124e-07,
"loss": 0.1091,
"num_input_tokens_seen": 2478208,
"step": 5045
},
{
"epoch": 0.6664906955259338,
"grad_norm": 0.10654900968074799,
"learning_rate": 6.049545652041459e-07,
"loss": 0.1482,
"num_input_tokens_seen": 2480512,
"step": 5050
},
{
"epoch": 0.6671505873036822,
"grad_norm": 0.07339984178543091,
"learning_rate": 6.028394247960709e-07,
"loss": 0.1775,
"num_input_tokens_seen": 2483008,
"step": 5055
},
{
"epoch": 0.6678104790814307,
"grad_norm": 0.04593325033783913,
"learning_rate": 6.007263918721221e-07,
"loss": 0.1572,
"num_input_tokens_seen": 2485376,
"step": 5060
},
{
"epoch": 0.668470370859179,
"grad_norm": 0.19269201159477234,
"learning_rate": 5.986154776448507e-07,
"loss": 0.0559,
"num_input_tokens_seen": 2488064,
"step": 5065
},
{
"epoch": 0.6691302626369275,
"grad_norm": 13.757147789001465,
"learning_rate": 5.965066933155656e-07,
"loss": 0.0578,
"num_input_tokens_seen": 2490624,
"step": 5070
},
{
"epoch": 0.669790154414676,
"grad_norm": 20.430967330932617,
"learning_rate": 5.944000500742735e-07,
"loss": 0.2826,
"num_input_tokens_seen": 2493248,
"step": 5075
},
{
"epoch": 0.6704500461924244,
"grad_norm": 51.80553436279297,
"learning_rate": 5.922955590996195e-07,
"loss": 0.201,
"num_input_tokens_seen": 2495744,
"step": 5080
},
{
"epoch": 0.6711099379701729,
"grad_norm": 0.12118737399578094,
"learning_rate": 5.901932315588281e-07,
"loss": 0.0019,
"num_input_tokens_seen": 2498176,
"step": 5085
},
{
"epoch": 0.6717698297479213,
"grad_norm": 20.142244338989258,
"learning_rate": 5.880930786076441e-07,
"loss": 0.1805,
"num_input_tokens_seen": 2500416,
"step": 5090
},
{
"epoch": 0.6724297215256698,
"grad_norm": 0.4407406747341156,
"learning_rate": 5.859951113902728e-07,
"loss": 0.06,
"num_input_tokens_seen": 2502848,
"step": 5095
},
{
"epoch": 0.6730896133034182,
"grad_norm": 32.401554107666016,
"learning_rate": 5.83899341039321e-07,
"loss": 0.1099,
"num_input_tokens_seen": 2505152,
"step": 5100
},
{
"epoch": 0.6737495050811667,
"grad_norm": 34.423946380615234,
"learning_rate": 5.818057786757386e-07,
"loss": 0.1247,
"num_input_tokens_seen": 2507648,
"step": 5105
},
{
"epoch": 0.6744093968589151,
"grad_norm": 0.2243095338344574,
"learning_rate": 5.797144354087588e-07,
"loss": 0.0989,
"num_input_tokens_seen": 2510144,
"step": 5110
},
{
"epoch": 0.6750692886366636,
"grad_norm": 0.06958218663930893,
"learning_rate": 5.77625322335839e-07,
"loss": 0.076,
"num_input_tokens_seen": 2513024,
"step": 5115
},
{
"epoch": 0.675729180414412,
"grad_norm": 0.2868078649044037,
"learning_rate": 5.755384505426032e-07,
"loss": 0.0721,
"num_input_tokens_seen": 2515072,
"step": 5120
},
{
"epoch": 0.6763890721921605,
"grad_norm": 0.19552133977413177,
"learning_rate": 5.734538311027819e-07,
"loss": 0.0018,
"num_input_tokens_seen": 2517376,
"step": 5125
},
{
"epoch": 0.677048963969909,
"grad_norm": 0.6387649178504944,
"learning_rate": 5.713714750781533e-07,
"loss": 0.0036,
"num_input_tokens_seen": 2520064,
"step": 5130
},
{
"epoch": 0.6777088557476574,
"grad_norm": 0.640417218208313,
"learning_rate": 5.692913935184862e-07,
"loss": 0.0685,
"num_input_tokens_seen": 2522688,
"step": 5135
},
{
"epoch": 0.6783687475254059,
"grad_norm": 0.32035917043685913,
"learning_rate": 5.672135974614794e-07,
"loss": 0.0071,
"num_input_tokens_seen": 2525184,
"step": 5140
},
{
"epoch": 0.6790286393031543,
"grad_norm": 0.08546182513237,
"learning_rate": 5.651380979327034e-07,
"loss": 0.0014,
"num_input_tokens_seen": 2527552,
"step": 5145
},
{
"epoch": 0.6796885310809028,
"grad_norm": 1.3679804801940918,
"learning_rate": 5.630649059455444e-07,
"loss": 0.0442,
"num_input_tokens_seen": 2530240,
"step": 5150
},
{
"epoch": 0.6803484228586512,
"grad_norm": 0.5069653391838074,
"learning_rate": 5.609940325011413e-07,
"loss": 0.0023,
"num_input_tokens_seen": 2532480,
"step": 5155
},
{
"epoch": 0.6810083146363997,
"grad_norm": 0.1547362059354782,
"learning_rate": 5.589254885883325e-07,
"loss": 0.0007,
"num_input_tokens_seen": 2534912,
"step": 5160
},
{
"epoch": 0.681668206414148,
"grad_norm": 0.09271689504384995,
"learning_rate": 5.568592851835936e-07,
"loss": 0.0598,
"num_input_tokens_seen": 2537408,
"step": 5165
},
{
"epoch": 0.6823280981918965,
"grad_norm": 0.12092125415802002,
"learning_rate": 5.547954332509805e-07,
"loss": 0.3023,
"num_input_tokens_seen": 2539776,
"step": 5170
},
{
"epoch": 0.6829879899696449,
"grad_norm": 0.06238294392824173,
"learning_rate": 5.527339437420717e-07,
"loss": 0.0009,
"num_input_tokens_seen": 2542208,
"step": 5175
},
{
"epoch": 0.6836478817473934,
"grad_norm": 109.36412811279297,
"learning_rate": 5.506748275959094e-07,
"loss": 0.1061,
"num_input_tokens_seen": 2544704,
"step": 5180
},
{
"epoch": 0.6843077735251418,
"grad_norm": 0.061365850269794464,
"learning_rate": 5.48618095738943e-07,
"loss": 0.0535,
"num_input_tokens_seen": 2547072,
"step": 5185
},
{
"epoch": 0.6849676653028903,
"grad_norm": 0.15806028246879578,
"learning_rate": 5.465637590849681e-07,
"loss": 0.1301,
"num_input_tokens_seen": 2549440,
"step": 5190
},
{
"epoch": 0.6856275570806388,
"grad_norm": 21.357271194458008,
"learning_rate": 5.445118285350723e-07,
"loss": 0.2169,
"num_input_tokens_seen": 2552128,
"step": 5195
},
{
"epoch": 0.6862874488583872,
"grad_norm": 0.09460903704166412,
"learning_rate": 5.424623149775745e-07,
"loss": 0.0681,
"num_input_tokens_seen": 2554368,
"step": 5200
},
{
"epoch": 0.6869473406361357,
"grad_norm": 0.0203552208840847,
"learning_rate": 5.404152292879676e-07,
"loss": 0.1175,
"num_input_tokens_seen": 2556928,
"step": 5205
},
{
"epoch": 0.6876072324138841,
"grad_norm": 16.716796875,
"learning_rate": 5.38370582328863e-07,
"loss": 0.1624,
"num_input_tokens_seen": 2559360,
"step": 5210
},
{
"epoch": 0.6882671241916326,
"grad_norm": 0.22735337913036346,
"learning_rate": 5.363283849499293e-07,
"loss": 0.1578,
"num_input_tokens_seen": 2561856,
"step": 5215
},
{
"epoch": 0.688927015969381,
"grad_norm": 33.698936462402344,
"learning_rate": 5.342886479878387e-07,
"loss": 0.1794,
"num_input_tokens_seen": 2564352,
"step": 5220
},
{
"epoch": 0.6895869077471295,
"grad_norm": 0.5594123601913452,
"learning_rate": 5.32251382266206e-07,
"loss": 0.0437,
"num_input_tokens_seen": 2566784,
"step": 5225
},
{
"epoch": 0.6902467995248779,
"grad_norm": 0.27059707045555115,
"learning_rate": 5.302165985955327e-07,
"loss": 0.0593,
"num_input_tokens_seen": 2569152,
"step": 5230
},
{
"epoch": 0.6909066913026264,
"grad_norm": 0.09355846047401428,
"learning_rate": 5.281843077731511e-07,
"loss": 0.067,
"num_input_tokens_seen": 2571520,
"step": 5235
},
{
"epoch": 0.6915665830803748,
"grad_norm": 121.53573608398438,
"learning_rate": 5.26154520583163e-07,
"loss": 0.141,
"num_input_tokens_seen": 2574080,
"step": 5240
},
{
"epoch": 0.6922264748581233,
"grad_norm": 0.16486892104148865,
"learning_rate": 5.241272477963877e-07,
"loss": 0.0595,
"num_input_tokens_seen": 2576320,
"step": 5245
},
{
"epoch": 0.6928863666358717,
"grad_norm": 1.9759496450424194,
"learning_rate": 5.221025001703e-07,
"loss": 0.0576,
"num_input_tokens_seen": 2578752,
"step": 5250
},
{
"epoch": 0.6935462584136202,
"grad_norm": 17.89307403564453,
"learning_rate": 5.200802884489768e-07,
"loss": 0.1368,
"num_input_tokens_seen": 2581184,
"step": 5255
},
{
"epoch": 0.6942061501913687,
"grad_norm": 0.03805484250187874,
"learning_rate": 5.180606233630374e-07,
"loss": 0.1654,
"num_input_tokens_seen": 2583872,
"step": 5260
},
{
"epoch": 0.694866041969117,
"grad_norm": 0.12207946926355362,
"learning_rate": 5.160435156295879e-07,
"loss": 0.1912,
"num_input_tokens_seen": 2586304,
"step": 5265
},
{
"epoch": 0.6955259337468656,
"grad_norm": 0.035935211926698685,
"learning_rate": 5.14028975952165e-07,
"loss": 0.0201,
"num_input_tokens_seen": 2589056,
"step": 5270
},
{
"epoch": 0.6961858255246139,
"grad_norm": 9.020354270935059,
"learning_rate": 5.120170150206768e-07,
"loss": 0.14,
"num_input_tokens_seen": 2591488,
"step": 5275
},
{
"epoch": 0.6968457173023624,
"grad_norm": 18.322715759277344,
"learning_rate": 5.100076435113496e-07,
"loss": 0.0542,
"num_input_tokens_seen": 2593792,
"step": 5280
},
{
"epoch": 0.6975056090801108,
"grad_norm": 55.9955940246582,
"learning_rate": 5.080008720866673e-07,
"loss": 0.1538,
"num_input_tokens_seen": 2595968,
"step": 5285
},
{
"epoch": 0.6981655008578593,
"grad_norm": 11.932297706604004,
"learning_rate": 5.059967113953173e-07,
"loss": 0.2123,
"num_input_tokens_seen": 2598144,
"step": 5290
},
{
"epoch": 0.6988253926356077,
"grad_norm": 0.08165155351161957,
"learning_rate": 5.039951720721349e-07,
"loss": 0.0838,
"num_input_tokens_seen": 2600448,
"step": 5295
},
{
"epoch": 0.6994852844133562,
"grad_norm": 0.32456350326538086,
"learning_rate": 5.019962647380429e-07,
"loss": 0.0167,
"num_input_tokens_seen": 2602944,
"step": 5300
},
{
"epoch": 0.7001451761911046,
"grad_norm": 20.51830291748047,
"learning_rate": 5.000000000000002e-07,
"loss": 0.2014,
"num_input_tokens_seen": 2605120,
"step": 5305
},
{
"epoch": 0.7002771545466544,
"eval_loss": 0.09084735810756683,
"eval_runtime": 7.6666,
"eval_samples_per_second": 878.487,
"eval_steps_per_second": 109.827,
"num_input_tokens_seen": 2605504,
"step": 5306
},
{
"epoch": 0.7008050679688531,
"grad_norm": 1.9377256631851196,
"learning_rate": 4.980063884509414e-07,
"loss": 0.0377,
"num_input_tokens_seen": 2607296,
"step": 5310
},
{
"epoch": 0.7014649597466015,
"grad_norm": 0.11374177783727646,
"learning_rate": 4.960154406697229e-07,
"loss": 0.0463,
"num_input_tokens_seen": 2609728,
"step": 5315
},
{
"epoch": 0.70212485152435,
"grad_norm": 11.871938705444336,
"learning_rate": 4.940271672210667e-07,
"loss": 0.2924,
"num_input_tokens_seen": 2612224,
"step": 5320
},
{
"epoch": 0.7027847433020985,
"grad_norm": 0.26750093698501587,
"learning_rate": 4.920415786555025e-07,
"loss": 0.0513,
"num_input_tokens_seen": 2614720,
"step": 5325
},
{
"epoch": 0.7034446350798469,
"grad_norm": 0.12440818548202515,
"learning_rate": 4.900586855093144e-07,
"loss": 0.3194,
"num_input_tokens_seen": 2617344,
"step": 5330
},
{
"epoch": 0.7041045268575954,
"grad_norm": 23.306577682495117,
"learning_rate": 4.880784983044827e-07,
"loss": 0.1166,
"num_input_tokens_seen": 2619584,
"step": 5335
},
{
"epoch": 0.7047644186353438,
"grad_norm": 0.1234973892569542,
"learning_rate": 4.861010275486284e-07,
"loss": 0.0176,
"num_input_tokens_seen": 2621888,
"step": 5340
},
{
"epoch": 0.7054243104130923,
"grad_norm": 0.14019837975502014,
"learning_rate": 4.8412628373496e-07,
"loss": 0.0731,
"num_input_tokens_seen": 2624512,
"step": 5345
},
{
"epoch": 0.7060842021908407,
"grad_norm": 0.18232476711273193,
"learning_rate": 4.821542773422136e-07,
"loss": 0.0024,
"num_input_tokens_seen": 2627008,
"step": 5350
},
{
"epoch": 0.7067440939685892,
"grad_norm": 0.28430455923080444,
"learning_rate": 4.801850188346012e-07,
"loss": 0.0019,
"num_input_tokens_seen": 2629440,
"step": 5355
},
{
"epoch": 0.7074039857463376,
"grad_norm": 0.19436050951480865,
"learning_rate": 4.782185186617523e-07,
"loss": 0.1034,
"num_input_tokens_seen": 2631872,
"step": 5360
},
{
"epoch": 0.7080638775240861,
"grad_norm": 0.2109547257423401,
"learning_rate": 4.762547872586603e-07,
"loss": 0.0814,
"num_input_tokens_seen": 2634560,
"step": 5365
},
{
"epoch": 0.7087237693018344,
"grad_norm": 0.2513101100921631,
"learning_rate": 4.7429383504562605e-07,
"loss": 0.1396,
"num_input_tokens_seen": 2637120,
"step": 5370
},
{
"epoch": 0.709383661079583,
"grad_norm": 0.30243685841560364,
"learning_rate": 4.723356724282029e-07,
"loss": 0.0019,
"num_input_tokens_seen": 2639552,
"step": 5375
},
{
"epoch": 0.7100435528573315,
"grad_norm": 24.248998641967773,
"learning_rate": 4.703803097971426e-07,
"loss": 0.1315,
"num_input_tokens_seen": 2641984,
"step": 5380
},
{
"epoch": 0.7107034446350798,
"grad_norm": 8.986465454101562,
"learning_rate": 4.6842775752833763e-07,
"loss": 0.0708,
"num_input_tokens_seen": 2644352,
"step": 5385
},
{
"epoch": 0.7113633364128283,
"grad_norm": 0.1666085124015808,
"learning_rate": 4.664780259827689e-07,
"loss": 0.02,
"num_input_tokens_seen": 2647040,
"step": 5390
},
{
"epoch": 0.7120232281905767,
"grad_norm": 0.05778901278972626,
"learning_rate": 4.6453112550644857e-07,
"loss": 0.0013,
"num_input_tokens_seen": 2649472,
"step": 5395
},
{
"epoch": 0.7126831199683252,
"grad_norm": 0.1988663524389267,
"learning_rate": 4.625870664303663e-07,
"loss": 0.0411,
"num_input_tokens_seen": 2651840,
"step": 5400
},
{
"epoch": 0.7133430117460736,
"grad_norm": 0.19517682492733002,
"learning_rate": 4.6064585907043486e-07,
"loss": 0.0056,
"num_input_tokens_seen": 2654464,
"step": 5405
},
{
"epoch": 0.7140029035238221,
"grad_norm": 0.02337566576898098,
"learning_rate": 4.587075137274334e-07,
"loss": 0.0537,
"num_input_tokens_seen": 2656576,
"step": 5410
},
{
"epoch": 0.7146627953015705,
"grad_norm": 1.0309412479400635,
"learning_rate": 4.5677204068695597e-07,
"loss": 0.0546,
"num_input_tokens_seen": 2659008,
"step": 5415
},
{
"epoch": 0.715322687079319,
"grad_norm": 0.022054580971598625,
"learning_rate": 4.5483945021935356e-07,
"loss": 0.0401,
"num_input_tokens_seen": 2661632,
"step": 5420
},
{
"epoch": 0.7159825788570674,
"grad_norm": 0.02314288541674614,
"learning_rate": 4.5290975257968155e-07,
"loss": 0.0963,
"num_input_tokens_seen": 2664192,
"step": 5425
},
{
"epoch": 0.7166424706348159,
"grad_norm": 22.84745216369629,
"learning_rate": 4.509829580076452e-07,
"loss": 0.1819,
"num_input_tokens_seen": 2666624,
"step": 5430
},
{
"epoch": 0.7173023624125643,
"grad_norm": 0.063370481133461,
"learning_rate": 4.490590767275442e-07,
"loss": 0.1842,
"num_input_tokens_seen": 2669120,
"step": 5435
},
{
"epoch": 0.7179622541903128,
"grad_norm": 0.49410998821258545,
"learning_rate": 4.4713811894822064e-07,
"loss": 0.102,
"num_input_tokens_seen": 2671552,
"step": 5440
},
{
"epoch": 0.7186221459680613,
"grad_norm": 0.3350347578525543,
"learning_rate": 4.4522009486300204e-07,
"loss": 0.071,
"num_input_tokens_seen": 2674240,
"step": 5445
},
{
"epoch": 0.7192820377458097,
"grad_norm": 0.07053118199110031,
"learning_rate": 4.43305014649649e-07,
"loss": 0.1247,
"num_input_tokens_seen": 2676544,
"step": 5450
},
{
"epoch": 0.7199419295235582,
"grad_norm": 0.14452704787254333,
"learning_rate": 4.4139288847030155e-07,
"loss": 0.0005,
"num_input_tokens_seen": 2678912,
"step": 5455
},
{
"epoch": 0.7206018213013066,
"grad_norm": 0.08119305223226547,
"learning_rate": 4.394837264714233e-07,
"loss": 0.0554,
"num_input_tokens_seen": 2681344,
"step": 5460
},
{
"epoch": 0.7212617130790551,
"grad_norm": 0.15848740935325623,
"learning_rate": 4.3757753878375005e-07,
"loss": 0.0013,
"num_input_tokens_seen": 2683776,
"step": 5465
},
{
"epoch": 0.7219216048568035,
"grad_norm": 0.031311068683862686,
"learning_rate": 4.3567433552223375e-07,
"loss": 0.0567,
"num_input_tokens_seen": 2686016,
"step": 5470
},
{
"epoch": 0.722581496634552,
"grad_norm": 0.839226484298706,
"learning_rate": 4.3377412678599e-07,
"loss": 0.1963,
"num_input_tokens_seen": 2688128,
"step": 5475
},
{
"epoch": 0.7232413884123003,
"grad_norm": 0.17472581565380096,
"learning_rate": 4.318769226582454e-07,
"loss": 0.1399,
"num_input_tokens_seen": 2690368,
"step": 5480
},
{
"epoch": 0.7239012801900488,
"grad_norm": 0.12172765284776688,
"learning_rate": 4.299827332062811e-07,
"loss": 0.0348,
"num_input_tokens_seen": 2692992,
"step": 5485
},
{
"epoch": 0.7245611719677972,
"grad_norm": 75.4613037109375,
"learning_rate": 4.2809156848138363e-07,
"loss": 0.0968,
"num_input_tokens_seen": 2695424,
"step": 5490
},
{
"epoch": 0.7252210637455457,
"grad_norm": 71.69564056396484,
"learning_rate": 4.2620343851878616e-07,
"loss": 0.1639,
"num_input_tokens_seen": 2697856,
"step": 5495
},
{
"epoch": 0.7258809555232941,
"grad_norm": 0.060778968036174774,
"learning_rate": 4.2431835333762123e-07,
"loss": 0.0446,
"num_input_tokens_seen": 2700608,
"step": 5500
},
{
"epoch": 0.7265408473010426,
"grad_norm": 0.6623153686523438,
"learning_rate": 4.224363229408628e-07,
"loss": 0.0005,
"num_input_tokens_seen": 2703104,
"step": 5505
},
{
"epoch": 0.7272007390787911,
"grad_norm": 0.34537097811698914,
"learning_rate": 4.205573573152753e-07,
"loss": 0.1834,
"num_input_tokens_seen": 2705344,
"step": 5510
},
{
"epoch": 0.7278606308565395,
"grad_norm": 0.14280956983566284,
"learning_rate": 4.18681466431361e-07,
"loss": 0.0728,
"num_input_tokens_seen": 2707520,
"step": 5515
},
{
"epoch": 0.728520522634288,
"grad_norm": 1.0312310457229614,
"learning_rate": 4.168086602433055e-07,
"loss": 0.105,
"num_input_tokens_seen": 2709888,
"step": 5520
},
{
"epoch": 0.7291804144120364,
"grad_norm": 0.46186262369155884,
"learning_rate": 4.1493894868892676e-07,
"loss": 0.1888,
"num_input_tokens_seen": 2712192,
"step": 5525
},
{
"epoch": 0.7298403061897849,
"grad_norm": 0.5339822769165039,
"learning_rate": 4.1307234168962093e-07,
"loss": 0.0838,
"num_input_tokens_seen": 2714368,
"step": 5530
},
{
"epoch": 0.7305001979675333,
"grad_norm": 0.03589556738734245,
"learning_rate": 4.112088491503095e-07,
"loss": 0.0014,
"num_input_tokens_seen": 2716608,
"step": 5535
},
{
"epoch": 0.7311600897452818,
"grad_norm": 0.08683586120605469,
"learning_rate": 4.0934848095938937e-07,
"loss": 0.001,
"num_input_tokens_seen": 2718656,
"step": 5540
},
{
"epoch": 0.7318199815230302,
"grad_norm": 0.04359391704201698,
"learning_rate": 4.074912469886763e-07,
"loss": 0.098,
"num_input_tokens_seen": 2721152,
"step": 5545
},
{
"epoch": 0.7324798733007787,
"grad_norm": 0.47505855560302734,
"learning_rate": 4.0563715709335657e-07,
"loss": 0.0009,
"num_input_tokens_seen": 2723264,
"step": 5550
},
{
"epoch": 0.7331397650785271,
"grad_norm": 40.96418380737305,
"learning_rate": 4.037862211119315e-07,
"loss": 0.2022,
"num_input_tokens_seen": 2725568,
"step": 5555
},
{
"epoch": 0.7337996568562756,
"grad_norm": 0.221147358417511,
"learning_rate": 4.0193844886616715e-07,
"loss": 0.0389,
"num_input_tokens_seen": 2728192,
"step": 5560
},
{
"epoch": 0.7344595486340241,
"grad_norm": 28.70302391052246,
"learning_rate": 4.0009385016104137e-07,
"loss": 0.1632,
"num_input_tokens_seen": 2731072,
"step": 5565
},
{
"epoch": 0.7351194404117725,
"grad_norm": 0.036642443388700485,
"learning_rate": 3.9825243478469164e-07,
"loss": 0.1455,
"num_input_tokens_seen": 2733440,
"step": 5570
},
{
"epoch": 0.735779332189521,
"grad_norm": 8.163640022277832,
"learning_rate": 3.9641421250836484e-07,
"loss": 0.1211,
"num_input_tokens_seen": 2736064,
"step": 5575
},
{
"epoch": 0.7364392239672694,
"grad_norm": 36.30949401855469,
"learning_rate": 3.945791930863622e-07,
"loss": 0.0356,
"num_input_tokens_seen": 2738496,
"step": 5580
},
{
"epoch": 0.7370991157450179,
"grad_norm": 0.6677089333534241,
"learning_rate": 3.9274738625599137e-07,
"loss": 0.002,
"num_input_tokens_seen": 2740800,
"step": 5585
},
{
"epoch": 0.7377590075227662,
"grad_norm": 0.42139413952827454,
"learning_rate": 3.909188017375112e-07,
"loss": 0.0746,
"num_input_tokens_seen": 2743104,
"step": 5590
},
{
"epoch": 0.7384188993005147,
"grad_norm": 0.15833111107349396,
"learning_rate": 3.890934492340819e-07,
"loss": 0.1553,
"num_input_tokens_seen": 2745344,
"step": 5595
},
{
"epoch": 0.7390787910782631,
"grad_norm": 19.225963592529297,
"learning_rate": 3.872713384317147e-07,
"loss": 0.062,
"num_input_tokens_seen": 2747520,
"step": 5600
},
{
"epoch": 0.7397386828560116,
"grad_norm": 0.0576261468231678,
"learning_rate": 3.8545247899921776e-07,
"loss": 0.1382,
"num_input_tokens_seen": 2750016,
"step": 5605
},
{
"epoch": 0.74039857463376,
"grad_norm": 0.09810295701026917,
"learning_rate": 3.8363688058814614e-07,
"loss": 0.1139,
"num_input_tokens_seen": 2752704,
"step": 5610
},
{
"epoch": 0.7410584664115085,
"grad_norm": 37.168209075927734,
"learning_rate": 3.818245528327526e-07,
"loss": 0.1544,
"num_input_tokens_seen": 2755328,
"step": 5615
},
{
"epoch": 0.7417183581892569,
"grad_norm": 12.660454750061035,
"learning_rate": 3.8001550534993164e-07,
"loss": 0.0911,
"num_input_tokens_seen": 2757632,
"step": 5620
},
{
"epoch": 0.7423782499670054,
"grad_norm": 0.21876884996891022,
"learning_rate": 3.7820974773917413e-07,
"loss": 0.0665,
"num_input_tokens_seen": 2760192,
"step": 5625
},
{
"epoch": 0.7430381417447539,
"grad_norm": 0.09194961190223694,
"learning_rate": 3.764072895825117e-07,
"loss": 0.001,
"num_input_tokens_seen": 2762816,
"step": 5630
},
{
"epoch": 0.7436980335225023,
"grad_norm": 11.263919830322266,
"learning_rate": 3.7460814044446934e-07,
"loss": 0.0625,
"num_input_tokens_seen": 2765120,
"step": 5635
},
{
"epoch": 0.7443579253002508,
"grad_norm": 0.1615023910999298,
"learning_rate": 3.72812309872012e-07,
"loss": 0.0989,
"num_input_tokens_seen": 2767808,
"step": 5640
},
{
"epoch": 0.7450178170779992,
"grad_norm": 3.175365447998047,
"learning_rate": 3.71019807394495e-07,
"loss": 0.0518,
"num_input_tokens_seen": 2770176,
"step": 5645
},
{
"epoch": 0.7456777088557477,
"grad_norm": 0.27845051884651184,
"learning_rate": 3.6923064252361505e-07,
"loss": 0.0983,
"num_input_tokens_seen": 2772672,
"step": 5650
},
{
"epoch": 0.7463376006334961,
"grad_norm": 0.15922772884368896,
"learning_rate": 3.674448247533561e-07,
"loss": 0.1089,
"num_input_tokens_seen": 2775104,
"step": 5655
},
{
"epoch": 0.7469974924112446,
"grad_norm": 0.11377550661563873,
"learning_rate": 3.656623635599432e-07,
"loss": 0.2327,
"num_input_tokens_seen": 2777792,
"step": 5660
},
{
"epoch": 0.747657384188993,
"grad_norm": 55.64900207519531,
"learning_rate": 3.6388326840178865e-07,
"loss": 0.1313,
"num_input_tokens_seen": 2780416,
"step": 5665
},
{
"epoch": 0.7483172759667415,
"grad_norm": 0.06009421497583389,
"learning_rate": 3.621075487194435e-07,
"loss": 0.0056,
"num_input_tokens_seen": 2783232,
"step": 5670
},
{
"epoch": 0.7489771677444899,
"grad_norm": 11.293038368225098,
"learning_rate": 3.603352139355483e-07,
"loss": 0.1054,
"num_input_tokens_seen": 2785664,
"step": 5675
},
{
"epoch": 0.7496370595222384,
"grad_norm": 8.784896850585938,
"learning_rate": 3.58566273454781e-07,
"loss": 0.0984,
"num_input_tokens_seen": 2788224,
"step": 5680
},
{
"epoch": 0.7502969512999867,
"grad_norm": 0.07352028787136078,
"learning_rate": 3.5680073666380817e-07,
"loss": 0.0014,
"num_input_tokens_seen": 2790656,
"step": 5685
},
{
"epoch": 0.7502969512999867,
"eval_loss": 0.0956902727484703,
"eval_runtime": 7.594,
"eval_samples_per_second": 886.88,
"eval_steps_per_second": 110.876,
"num_input_tokens_seen": 2790656,
"step": 5685
},
{
"epoch": 0.7509568430777352,
"grad_norm": 0.028005223721265793,
"learning_rate": 3.5503861293123514e-07,
"loss": 0.1594,
"num_input_tokens_seen": 2792960,
"step": 5690
},
{
"epoch": 0.7516167348554837,
"grad_norm": 0.16075754165649414,
"learning_rate": 3.532799116075571e-07,
"loss": 0.0789,
"num_input_tokens_seen": 2795648,
"step": 5695
},
{
"epoch": 0.7522766266332321,
"grad_norm": 53.65882873535156,
"learning_rate": 3.5152464202510777e-07,
"loss": 0.098,
"num_input_tokens_seen": 2797696,
"step": 5700
},
{
"epoch": 0.7529365184109806,
"grad_norm": 15.231353759765625,
"learning_rate": 3.4977281349801056e-07,
"loss": 0.1334,
"num_input_tokens_seen": 2800192,
"step": 5705
},
{
"epoch": 0.753596410188729,
"grad_norm": 0.14780941605567932,
"learning_rate": 3.4802443532213056e-07,
"loss": 0.0133,
"num_input_tokens_seen": 2802560,
"step": 5710
},
{
"epoch": 0.7542563019664775,
"grad_norm": 0.030415428802371025,
"learning_rate": 3.4627951677502233e-07,
"loss": 0.2453,
"num_input_tokens_seen": 2804992,
"step": 5715
},
{
"epoch": 0.7549161937442259,
"grad_norm": 0.889872133731842,
"learning_rate": 3.4453806711588397e-07,
"loss": 0.0492,
"num_input_tokens_seen": 2807296,
"step": 5720
},
{
"epoch": 0.7555760855219744,
"grad_norm": 0.08172111958265305,
"learning_rate": 3.428000955855054e-07,
"loss": 0.0303,
"num_input_tokens_seen": 2809984,
"step": 5725
},
{
"epoch": 0.7562359772997228,
"grad_norm": 0.09841586649417877,
"learning_rate": 3.4106561140621983e-07,
"loss": 0.0023,
"num_input_tokens_seen": 2812736,
"step": 5730
},
{
"epoch": 0.7568958690774713,
"grad_norm": 0.37726613879203796,
"learning_rate": 3.393346237818567e-07,
"loss": 0.1465,
"num_input_tokens_seen": 2815040,
"step": 5735
},
{
"epoch": 0.7575557608552197,
"grad_norm": 0.7347794771194458,
"learning_rate": 3.3760714189769015e-07,
"loss": 0.1114,
"num_input_tokens_seen": 2817344,
"step": 5740
},
{
"epoch": 0.7582156526329682,
"grad_norm": 0.10844270884990692,
"learning_rate": 3.3588317492039266e-07,
"loss": 0.0357,
"num_input_tokens_seen": 2819648,
"step": 5745
},
{
"epoch": 0.7588755444107167,
"grad_norm": 46.741573333740234,
"learning_rate": 3.341627319979834e-07,
"loss": 0.1254,
"num_input_tokens_seen": 2822464,
"step": 5750
},
{
"epoch": 0.7595354361884651,
"grad_norm": 150.2995147705078,
"learning_rate": 3.324458222597839e-07,
"loss": 0.1943,
"num_input_tokens_seen": 2824896,
"step": 5755
},
{
"epoch": 0.7601953279662136,
"grad_norm": 0.15239302814006805,
"learning_rate": 3.307324548163657e-07,
"loss": 0.0749,
"num_input_tokens_seen": 2827648,
"step": 5760
},
{
"epoch": 0.760855219743962,
"grad_norm": 0.6753157377243042,
"learning_rate": 3.2902263875950374e-07,
"loss": 0.114,
"num_input_tokens_seen": 2830336,
"step": 5765
},
{
"epoch": 0.7615151115217105,
"grad_norm": 16.683015823364258,
"learning_rate": 3.2731638316212894e-07,
"loss": 0.0462,
"num_input_tokens_seen": 2832640,
"step": 5770
},
{
"epoch": 0.7621750032994589,
"grad_norm": 1.5054552555084229,
"learning_rate": 3.256136970782782e-07,
"loss": 0.049,
"num_input_tokens_seen": 2834880,
"step": 5775
},
{
"epoch": 0.7628348950772074,
"grad_norm": 0.033258408308029175,
"learning_rate": 3.23914589543047e-07,
"loss": 0.1447,
"num_input_tokens_seen": 2837440,
"step": 5780
},
{
"epoch": 0.7634947868549558,
"grad_norm": 103.95304107666016,
"learning_rate": 3.2221906957254276e-07,
"loss": 0.0424,
"num_input_tokens_seen": 2839808,
"step": 5785
},
{
"epoch": 0.7641546786327043,
"grad_norm": 5.235893726348877,
"learning_rate": 3.205271461638346e-07,
"loss": 0.1412,
"num_input_tokens_seen": 2842432,
"step": 5790
},
{
"epoch": 0.7648145704104526,
"grad_norm": 0.035734184086322784,
"learning_rate": 3.188388282949085e-07,
"loss": 0.1313,
"num_input_tokens_seen": 2845120,
"step": 5795
},
{
"epoch": 0.7654744621882011,
"grad_norm": 61.18632125854492,
"learning_rate": 3.171541249246166e-07,
"loss": 0.1633,
"num_input_tokens_seen": 2848000,
"step": 5800
},
{
"epoch": 0.7661343539659495,
"grad_norm": 14.463330268859863,
"learning_rate": 3.154730449926316e-07,
"loss": 0.161,
"num_input_tokens_seen": 2850624,
"step": 5805
},
{
"epoch": 0.766794245743698,
"grad_norm": 0.18341617286205292,
"learning_rate": 3.137955974194e-07,
"loss": 0.121,
"num_input_tokens_seen": 2852992,
"step": 5810
},
{
"epoch": 0.7674541375214465,
"grad_norm": 28.731979370117188,
"learning_rate": 3.1212179110609125e-07,
"loss": 0.1251,
"num_input_tokens_seen": 2855424,
"step": 5815
},
{
"epoch": 0.7681140292991949,
"grad_norm": 0.0925399586558342,
"learning_rate": 3.104516349345553e-07,
"loss": 0.137,
"num_input_tokens_seen": 2857984,
"step": 5820
},
{
"epoch": 0.7687739210769434,
"grad_norm": 0.09687471389770508,
"learning_rate": 3.0878513776727144e-07,
"loss": 0.0643,
"num_input_tokens_seen": 2860672,
"step": 5825
},
{
"epoch": 0.7694338128546918,
"grad_norm": 10.534875869750977,
"learning_rate": 3.0712230844730414e-07,
"loss": 0.1726,
"num_input_tokens_seen": 2863040,
"step": 5830
},
{
"epoch": 0.7700937046324403,
"grad_norm": 0.4192121624946594,
"learning_rate": 3.054631557982539e-07,
"loss": 0.0704,
"num_input_tokens_seen": 2865856,
"step": 5835
},
{
"epoch": 0.7707535964101887,
"grad_norm": 0.11545547842979431,
"learning_rate": 3.0380768862421156e-07,
"loss": 0.1005,
"num_input_tokens_seen": 2868096,
"step": 5840
},
{
"epoch": 0.7714134881879372,
"grad_norm": 0.13741333782672882,
"learning_rate": 3.0215591570971234e-07,
"loss": 0.0013,
"num_input_tokens_seen": 2870784,
"step": 5845
},
{
"epoch": 0.7720733799656856,
"grad_norm": 55.587005615234375,
"learning_rate": 3.005078458196868e-07,
"loss": 0.0712,
"num_input_tokens_seen": 2873216,
"step": 5850
},
{
"epoch": 0.7727332717434341,
"grad_norm": 0.19076700508594513,
"learning_rate": 2.988634876994175e-07,
"loss": 0.0011,
"num_input_tokens_seen": 2875776,
"step": 5855
},
{
"epoch": 0.7733931635211825,
"grad_norm": 0.12881390750408173,
"learning_rate": 2.972228500744898e-07,
"loss": 0.0336,
"num_input_tokens_seen": 2878336,
"step": 5860
},
{
"epoch": 0.774053055298931,
"grad_norm": 22.819622039794922,
"learning_rate": 2.955859416507467e-07,
"loss": 0.1431,
"num_input_tokens_seen": 2880896,
"step": 5865
},
{
"epoch": 0.7747129470766794,
"grad_norm": 0.040956467390060425,
"learning_rate": 2.9395277111424357e-07,
"loss": 0.0684,
"num_input_tokens_seen": 2883648,
"step": 5870
},
{
"epoch": 0.7753728388544279,
"grad_norm": 0.0455995537340641,
"learning_rate": 2.9232334713120035e-07,
"loss": 0.0016,
"num_input_tokens_seen": 2885952,
"step": 5875
},
{
"epoch": 0.7760327306321764,
"grad_norm": 0.3208160996437073,
"learning_rate": 2.9069767834795655e-07,
"loss": 0.0614,
"num_input_tokens_seen": 2888576,
"step": 5880
},
{
"epoch": 0.7766926224099248,
"grad_norm": 3.3780038356781006,
"learning_rate": 2.8907577339092483e-07,
"loss": 0.1331,
"num_input_tokens_seen": 2891136,
"step": 5885
},
{
"epoch": 0.7773525141876733,
"grad_norm": 0.030515162274241447,
"learning_rate": 2.8745764086654654e-07,
"loss": 0.0711,
"num_input_tokens_seen": 2893696,
"step": 5890
},
{
"epoch": 0.7780124059654216,
"grad_norm": 37.801578521728516,
"learning_rate": 2.8584328936124424e-07,
"loss": 0.0499,
"num_input_tokens_seen": 2896512,
"step": 5895
},
{
"epoch": 0.7786722977431701,
"grad_norm": 13.73175048828125,
"learning_rate": 2.8423272744137674e-07,
"loss": 0.1805,
"num_input_tokens_seen": 2899008,
"step": 5900
},
{
"epoch": 0.7793321895209185,
"grad_norm": 1.2914345264434814,
"learning_rate": 2.82625963653195e-07,
"loss": 0.0781,
"num_input_tokens_seen": 2901376,
"step": 5905
},
{
"epoch": 0.779992081298667,
"grad_norm": 8.905738830566406,
"learning_rate": 2.810230065227944e-07,
"loss": 0.1989,
"num_input_tokens_seen": 2903872,
"step": 5910
},
{
"epoch": 0.7806519730764154,
"grad_norm": 0.1153329461812973,
"learning_rate": 2.7942386455607203e-07,
"loss": 0.0016,
"num_input_tokens_seen": 2906240,
"step": 5915
},
{
"epoch": 0.7813118648541639,
"grad_norm": 0.40870046615600586,
"learning_rate": 2.77828546238679e-07,
"loss": 0.0695,
"num_input_tokens_seen": 2908736,
"step": 5920
},
{
"epoch": 0.7819717566319123,
"grad_norm": 50.935813903808594,
"learning_rate": 2.762370600359774e-07,
"loss": 0.1347,
"num_input_tokens_seen": 2911104,
"step": 5925
},
{
"epoch": 0.7826316484096608,
"grad_norm": 0.06911960244178772,
"learning_rate": 2.7464941439299484e-07,
"loss": 0.0614,
"num_input_tokens_seen": 2913472,
"step": 5930
},
{
"epoch": 0.7832915401874093,
"grad_norm": 12.452083587646484,
"learning_rate": 2.7306561773437887e-07,
"loss": 0.1583,
"num_input_tokens_seen": 2915840,
"step": 5935
},
{
"epoch": 0.7839514319651577,
"grad_norm": 0.09292475879192352,
"learning_rate": 2.714856784643533e-07,
"loss": 0.0047,
"num_input_tokens_seen": 2918144,
"step": 5940
},
{
"epoch": 0.7846113237429062,
"grad_norm": 0.06648958474397659,
"learning_rate": 2.6990960496667313e-07,
"loss": 0.1479,
"num_input_tokens_seen": 2920768,
"step": 5945
},
{
"epoch": 0.7852712155206546,
"grad_norm": 0.07045161724090576,
"learning_rate": 2.6833740560457976e-07,
"loss": 0.067,
"num_input_tokens_seen": 2923136,
"step": 5950
},
{
"epoch": 0.7859311072984031,
"grad_norm": 0.12214231491088867,
"learning_rate": 2.6676908872075757e-07,
"loss": 0.0702,
"num_input_tokens_seen": 2925568,
"step": 5955
},
{
"epoch": 0.7865909990761515,
"grad_norm": 0.0641525536775589,
"learning_rate": 2.6520466263728836e-07,
"loss": 0.0576,
"num_input_tokens_seen": 2928064,
"step": 5960
},
{
"epoch": 0.7872508908539,
"grad_norm": 38.231407165527344,
"learning_rate": 2.636441356556087e-07,
"loss": 0.2178,
"num_input_tokens_seen": 2930368,
"step": 5965
},
{
"epoch": 0.7879107826316484,
"grad_norm": 13.16163158416748,
"learning_rate": 2.620875160564645e-07,
"loss": 0.1005,
"num_input_tokens_seen": 2932928,
"step": 5970
},
{
"epoch": 0.7885706744093969,
"grad_norm": 208.31663513183594,
"learning_rate": 2.6053481209986715e-07,
"loss": 0.418,
"num_input_tokens_seen": 2935360,
"step": 5975
},
{
"epoch": 0.7892305661871453,
"grad_norm": 0.08345562219619751,
"learning_rate": 2.5898603202505155e-07,
"loss": 0.059,
"num_input_tokens_seen": 2937920,
"step": 5980
},
{
"epoch": 0.7898904579648938,
"grad_norm": 0.3885025382041931,
"learning_rate": 2.5744118405042923e-07,
"loss": 0.0502,
"num_input_tokens_seen": 2940224,
"step": 5985
},
{
"epoch": 0.7905503497426422,
"grad_norm": 0.12759974598884583,
"learning_rate": 2.559002763735485e-07,
"loss": 0.0017,
"num_input_tokens_seen": 2942848,
"step": 5990
},
{
"epoch": 0.7912102415203907,
"grad_norm": 0.2042687088251114,
"learning_rate": 2.543633171710472e-07,
"loss": 0.0591,
"num_input_tokens_seen": 2945344,
"step": 5995
},
{
"epoch": 0.7918701332981392,
"grad_norm": 2.7166707515716553,
"learning_rate": 2.5283031459861205e-07,
"loss": 0.0162,
"num_input_tokens_seen": 2947840,
"step": 6000
},
{
"epoch": 0.7925300250758875,
"grad_norm": 0.08386794477701187,
"learning_rate": 2.5130127679093396e-07,
"loss": 0.0344,
"num_input_tokens_seen": 2950144,
"step": 6005
},
{
"epoch": 0.793189916853636,
"grad_norm": 59.384368896484375,
"learning_rate": 2.497762118616652e-07,
"loss": 0.0428,
"num_input_tokens_seen": 2952384,
"step": 6010
},
{
"epoch": 0.7938498086313844,
"grad_norm": 0.061096593737602234,
"learning_rate": 2.4825512790337745e-07,
"loss": 0.0788,
"num_input_tokens_seen": 2955136,
"step": 6015
},
{
"epoch": 0.7945097004091329,
"grad_norm": 38.1906623840332,
"learning_rate": 2.467380329875163e-07,
"loss": 0.0441,
"num_input_tokens_seen": 2957824,
"step": 6020
},
{
"epoch": 0.7951695921868813,
"grad_norm": 0.729996919631958,
"learning_rate": 2.452249351643615e-07,
"loss": 0.0038,
"num_input_tokens_seen": 2960256,
"step": 6025
},
{
"epoch": 0.7958294839646298,
"grad_norm": 0.31032249331474304,
"learning_rate": 2.437158424629817e-07,
"loss": 0.0672,
"num_input_tokens_seen": 2962944,
"step": 6030
},
{
"epoch": 0.7964893757423782,
"grad_norm": 0.5417336821556091,
"learning_rate": 2.422107628911929e-07,
"loss": 0.2047,
"num_input_tokens_seen": 2965504,
"step": 6035
},
{
"epoch": 0.7971492675201267,
"grad_norm": 0.07609419524669647,
"learning_rate": 2.4070970443551673e-07,
"loss": 0.2335,
"num_input_tokens_seen": 2967744,
"step": 6040
},
{
"epoch": 0.7978091592978751,
"grad_norm": 0.5796427726745605,
"learning_rate": 2.392126750611362e-07,
"loss": 0.0017,
"num_input_tokens_seen": 2970240,
"step": 6045
},
{
"epoch": 0.7984690510756236,
"grad_norm": 19.56314468383789,
"learning_rate": 2.3771968271185538e-07,
"loss": 0.1777,
"num_input_tokens_seen": 2972928,
"step": 6050
},
{
"epoch": 0.799128942853372,
"grad_norm": 0.015146835707128048,
"learning_rate": 2.3623073531005579e-07,
"loss": 0.1485,
"num_input_tokens_seen": 2975168,
"step": 6055
},
{
"epoch": 0.7997888346311205,
"grad_norm": 0.11885405331850052,
"learning_rate": 2.3474584075665493e-07,
"loss": 0.1294,
"num_input_tokens_seen": 2977408,
"step": 6060
},
{
"epoch": 0.8003167480533192,
"eval_loss": 0.0954766720533371,
"eval_runtime": 7.5442,
"eval_samples_per_second": 892.739,
"eval_steps_per_second": 111.609,
"num_input_tokens_seen": 2979456,
"step": 6064
},
{
"epoch": 0.800448726408869,
"grad_norm": 0.06911212205886841,
"learning_rate": 2.3326500693106533e-07,
"loss": 0.0013,
"num_input_tokens_seen": 2979968,
"step": 6065
},
{
"epoch": 0.8011086181866174,
"grad_norm": 82.93882751464844,
"learning_rate": 2.3178824169114975e-07,
"loss": 0.209,
"num_input_tokens_seen": 2982528,
"step": 6070
},
{
"epoch": 0.8017685099643659,
"grad_norm": 0.18306070566177368,
"learning_rate": 2.303155528731837e-07,
"loss": 0.0494,
"num_input_tokens_seen": 2984832,
"step": 6075
},
{
"epoch": 0.8024284017421143,
"grad_norm": 1.8438490629196167,
"learning_rate": 2.2884694829181016e-07,
"loss": 0.0014,
"num_input_tokens_seen": 2987328,
"step": 6080
},
{
"epoch": 0.8030882935198628,
"grad_norm": 0.09328246116638184,
"learning_rate": 2.273824357400005e-07,
"loss": 0.0083,
"num_input_tokens_seen": 2989760,
"step": 6085
},
{
"epoch": 0.8037481852976112,
"grad_norm": 0.17672888934612274,
"learning_rate": 2.2592202298901174e-07,
"loss": 0.0188,
"num_input_tokens_seen": 2992320,
"step": 6090
},
{
"epoch": 0.8044080770753597,
"grad_norm": 0.12940169870853424,
"learning_rate": 2.2446571778834555e-07,
"loss": 0.0014,
"num_input_tokens_seen": 2995136,
"step": 6095
},
{
"epoch": 0.805067968853108,
"grad_norm": 0.075173020362854,
"learning_rate": 2.2301352786570827e-07,
"loss": 0.0009,
"num_input_tokens_seen": 2998016,
"step": 6100
},
{
"epoch": 0.8057278606308566,
"grad_norm": 0.03360762447118759,
"learning_rate": 2.215654609269685e-07,
"loss": 0.1425,
"num_input_tokens_seen": 3000832,
"step": 6105
},
{
"epoch": 0.8063877524086049,
"grad_norm": 15.617521286010742,
"learning_rate": 2.201215246561161e-07,
"loss": 0.1461,
"num_input_tokens_seen": 3003584,
"step": 6110
},
{
"epoch": 0.8070476441863534,
"grad_norm": 62.255313873291016,
"learning_rate": 2.1868172671522357e-07,
"loss": 0.0738,
"num_input_tokens_seen": 3006464,
"step": 6115
},
{
"epoch": 0.8077075359641019,
"grad_norm": 0.16907618939876556,
"learning_rate": 2.1724607474440216e-07,
"loss": 0.0824,
"num_input_tokens_seen": 3008896,
"step": 6120
},
{
"epoch": 0.8083674277418503,
"grad_norm": 21.065229415893555,
"learning_rate": 2.158145763617646e-07,
"loss": 0.1463,
"num_input_tokens_seen": 3011392,
"step": 6125
},
{
"epoch": 0.8090273195195988,
"grad_norm": 0.23459585011005402,
"learning_rate": 2.1438723916338198e-07,
"loss": 0.2764,
"num_input_tokens_seen": 3014016,
"step": 6130
},
{
"epoch": 0.8096872112973472,
"grad_norm": 0.41196635365486145,
"learning_rate": 2.1296407072324495e-07,
"loss": 0.1715,
"num_input_tokens_seen": 3016576,
"step": 6135
},
{
"epoch": 0.8103471030750957,
"grad_norm": 163.61370849609375,
"learning_rate": 2.1154507859322336e-07,
"loss": 0.0432,
"num_input_tokens_seen": 3019008,
"step": 6140
},
{
"epoch": 0.8110069948528441,
"grad_norm": 36.33530044555664,
"learning_rate": 2.101302703030252e-07,
"loss": 0.1229,
"num_input_tokens_seen": 3021504,
"step": 6145
},
{
"epoch": 0.8116668866305926,
"grad_norm": 10.142012596130371,
"learning_rate": 2.0871965336015885e-07,
"loss": 0.0575,
"num_input_tokens_seen": 3023552,
"step": 6150
},
{
"epoch": 0.812326778408341,
"grad_norm": 0.6284022331237793,
"learning_rate": 2.0731323524989031e-07,
"loss": 0.0704,
"num_input_tokens_seen": 3025856,
"step": 6155
},
{
"epoch": 0.8129866701860895,
"grad_norm": 1.1452326774597168,
"learning_rate": 2.0591102343520616e-07,
"loss": 0.2049,
"num_input_tokens_seen": 3028096,
"step": 6160
},
{
"epoch": 0.8136465619638379,
"grad_norm": 44.43306350708008,
"learning_rate": 2.0451302535677206e-07,
"loss": 0.159,
"num_input_tokens_seen": 3030528,
"step": 6165
},
{
"epoch": 0.8143064537415864,
"grad_norm": 113.5491943359375,
"learning_rate": 2.0311924843289396e-07,
"loss": 0.227,
"num_input_tokens_seen": 3033088,
"step": 6170
},
{
"epoch": 0.8149663455193348,
"grad_norm": 0.11563657224178314,
"learning_rate": 2.017297000594794e-07,
"loss": 0.0642,
"num_input_tokens_seen": 3035200,
"step": 6175
},
{
"epoch": 0.8156262372970833,
"grad_norm": 0.15644113719463348,
"learning_rate": 2.0034438760999696e-07,
"loss": 0.0604,
"num_input_tokens_seen": 3037696,
"step": 6180
},
{
"epoch": 0.8162861290748318,
"grad_norm": 14.904664039611816,
"learning_rate": 1.9896331843543856e-07,
"loss": 0.1423,
"num_input_tokens_seen": 3040128,
"step": 6185
},
{
"epoch": 0.8169460208525802,
"grad_norm": 0.2976359724998474,
"learning_rate": 1.975864998642789e-07,
"loss": 0.1184,
"num_input_tokens_seen": 3042560,
"step": 6190
},
{
"epoch": 0.8176059126303287,
"grad_norm": 37.44635772705078,
"learning_rate": 1.9621393920243767e-07,
"loss": 0.2826,
"num_input_tokens_seen": 3044800,
"step": 6195
},
{
"epoch": 0.8182658044080771,
"grad_norm": 105.21582794189453,
"learning_rate": 1.9484564373324074e-07,
"loss": 0.1028,
"num_input_tokens_seen": 3047040,
"step": 6200
},
{
"epoch": 0.8189256961858256,
"grad_norm": 0.048264820128679276,
"learning_rate": 1.934816207173805e-07,
"loss": 0.0495,
"num_input_tokens_seen": 3049600,
"step": 6205
},
{
"epoch": 0.819585587963574,
"grad_norm": 0.17185015976428986,
"learning_rate": 1.9212187739287943e-07,
"loss": 0.158,
"num_input_tokens_seen": 3052416,
"step": 6210
},
{
"epoch": 0.8202454797413224,
"grad_norm": 0.28126591444015503,
"learning_rate": 1.907664209750488e-07,
"loss": 0.0135,
"num_input_tokens_seen": 3055040,
"step": 6215
},
{
"epoch": 0.8209053715190708,
"grad_norm": 0.4188820719718933,
"learning_rate": 1.8941525865645336e-07,
"loss": 0.0446,
"num_input_tokens_seen": 3057856,
"step": 6220
},
{
"epoch": 0.8215652632968193,
"grad_norm": 49.17670440673828,
"learning_rate": 1.8806839760687076e-07,
"loss": 0.2045,
"num_input_tokens_seen": 3060160,
"step": 6225
},
{
"epoch": 0.8222251550745677,
"grad_norm": 0.09683864563703537,
"learning_rate": 1.867258449732545e-07,
"loss": 0.1205,
"num_input_tokens_seen": 3062592,
"step": 6230
},
{
"epoch": 0.8228850468523162,
"grad_norm": 15.184978485107422,
"learning_rate": 1.8538760787969676e-07,
"loss": 0.0502,
"num_input_tokens_seen": 3065088,
"step": 6235
},
{
"epoch": 0.8235449386300646,
"grad_norm": 1.2835743427276611,
"learning_rate": 1.8405369342738907e-07,
"loss": 0.0019,
"num_input_tokens_seen": 3067712,
"step": 6240
},
{
"epoch": 0.8242048304078131,
"grad_norm": 122.01753997802734,
"learning_rate": 1.8272410869458598e-07,
"loss": 0.0876,
"num_input_tokens_seen": 3070144,
"step": 6245
},
{
"epoch": 0.8248647221855616,
"grad_norm": 0.3181722164154053,
"learning_rate": 1.8139886073656653e-07,
"loss": 0.2369,
"num_input_tokens_seen": 3072448,
"step": 6250
},
{
"epoch": 0.82552461396331,
"grad_norm": 0.28298959136009216,
"learning_rate": 1.800779565855971e-07,
"loss": 0.2066,
"num_input_tokens_seen": 3075072,
"step": 6255
},
{
"epoch": 0.8261845057410585,
"grad_norm": 0.08039449155330658,
"learning_rate": 1.7876140325089463e-07,
"loss": 0.0029,
"num_input_tokens_seen": 3077376,
"step": 6260
},
{
"epoch": 0.8268443975188069,
"grad_norm": 0.7492879629135132,
"learning_rate": 1.774492077185883e-07,
"loss": 0.1344,
"num_input_tokens_seen": 3079808,
"step": 6265
},
{
"epoch": 0.8275042892965554,
"grad_norm": 0.0355597622692585,
"learning_rate": 1.7614137695168408e-07,
"loss": 0.0009,
"num_input_tokens_seen": 3082560,
"step": 6270
},
{
"epoch": 0.8281641810743038,
"grad_norm": 0.3212199807167053,
"learning_rate": 1.748379178900261e-07,
"loss": 0.0705,
"num_input_tokens_seen": 3084608,
"step": 6275
},
{
"epoch": 0.8288240728520523,
"grad_norm": 17.22373390197754,
"learning_rate": 1.7353883745026055e-07,
"loss": 0.228,
"num_input_tokens_seen": 3087104,
"step": 6280
},
{
"epoch": 0.8294839646298007,
"grad_norm": 1.737422227859497,
"learning_rate": 1.722441425257999e-07,
"loss": 0.1102,
"num_input_tokens_seen": 3089408,
"step": 6285
},
{
"epoch": 0.8301438564075492,
"grad_norm": 0.23113372921943665,
"learning_rate": 1.7095383998678402e-07,
"loss": 0.0552,
"num_input_tokens_seen": 3091776,
"step": 6290
},
{
"epoch": 0.8308037481852976,
"grad_norm": 0.0835813581943512,
"learning_rate": 1.6966793668004653e-07,
"loss": 0.1083,
"num_input_tokens_seen": 3094208,
"step": 6295
},
{
"epoch": 0.8314636399630461,
"grad_norm": 0.1578727513551712,
"learning_rate": 1.6838643942907625e-07,
"loss": 0.0801,
"num_input_tokens_seen": 3096768,
"step": 6300
},
{
"epoch": 0.8321235317407946,
"grad_norm": 72.27742004394531,
"learning_rate": 1.671093550339815e-07,
"loss": 0.06,
"num_input_tokens_seen": 3099456,
"step": 6305
},
{
"epoch": 0.832783423518543,
"grad_norm": 0.853486180305481,
"learning_rate": 1.6583669027145542e-07,
"loss": 0.0046,
"num_input_tokens_seen": 3102208,
"step": 6310
},
{
"epoch": 0.8334433152962915,
"grad_norm": 0.05901863053441048,
"learning_rate": 1.6456845189473767e-07,
"loss": 0.0014,
"num_input_tokens_seen": 3104896,
"step": 6315
},
{
"epoch": 0.8341032070740398,
"grad_norm": 0.11249249428510666,
"learning_rate": 1.6330464663358123e-07,
"loss": 0.1178,
"num_input_tokens_seen": 3107520,
"step": 6320
},
{
"epoch": 0.8347630988517883,
"grad_norm": 0.057853005826473236,
"learning_rate": 1.6204528119421346e-07,
"loss": 0.0014,
"num_input_tokens_seen": 3110144,
"step": 6325
},
{
"epoch": 0.8354229906295367,
"grad_norm": 0.1531873196363449,
"learning_rate": 1.607903622593042e-07,
"loss": 0.0501,
"num_input_tokens_seen": 3112768,
"step": 6330
},
{
"epoch": 0.8360828824072852,
"grad_norm": 0.04447514936327934,
"learning_rate": 1.5953989648792743e-07,
"loss": 0.0007,
"num_input_tokens_seen": 3115328,
"step": 6335
},
{
"epoch": 0.8367427741850336,
"grad_norm": 0.14438007771968842,
"learning_rate": 1.5829389051552678e-07,
"loss": 0.0323,
"num_input_tokens_seen": 3117888,
"step": 6340
},
{
"epoch": 0.8374026659627821,
"grad_norm": 101.97913360595703,
"learning_rate": 1.5705235095388136e-07,
"loss": 0.038,
"num_input_tokens_seen": 3120384,
"step": 6345
},
{
"epoch": 0.8380625577405305,
"grad_norm": 0.08870197832584381,
"learning_rate": 1.5581528439106907e-07,
"loss": 0.0436,
"num_input_tokens_seen": 3123008,
"step": 6350
},
{
"epoch": 0.838722449518279,
"grad_norm": 0.36987948417663574,
"learning_rate": 1.5458269739143292e-07,
"loss": 0.0796,
"num_input_tokens_seen": 3125504,
"step": 6355
},
{
"epoch": 0.8393823412960274,
"grad_norm": 2.8769209384918213,
"learning_rate": 1.5335459649554538e-07,
"loss": 0.0025,
"num_input_tokens_seen": 3127744,
"step": 6360
},
{
"epoch": 0.8400422330737759,
"grad_norm": 0.1269061416387558,
"learning_rate": 1.5213098822017357e-07,
"loss": 0.1043,
"num_input_tokens_seen": 3130048,
"step": 6365
},
{
"epoch": 0.8407021248515244,
"grad_norm": 0.26083889603614807,
"learning_rate": 1.50911879058246e-07,
"loss": 0.0469,
"num_input_tokens_seen": 3132480,
"step": 6370
},
{
"epoch": 0.8413620166292728,
"grad_norm": 107.53121948242188,
"learning_rate": 1.4969727547881628e-07,
"loss": 0.1012,
"num_input_tokens_seen": 3135104,
"step": 6375
},
{
"epoch": 0.8420219084070213,
"grad_norm": 14.834338188171387,
"learning_rate": 1.4848718392703052e-07,
"loss": 0.1743,
"num_input_tokens_seen": 3137344,
"step": 6380
},
{
"epoch": 0.8426818001847697,
"grad_norm": 16.3470401763916,
"learning_rate": 1.472816108240915e-07,
"loss": 0.1728,
"num_input_tokens_seen": 3140096,
"step": 6385
},
{
"epoch": 0.8433416919625182,
"grad_norm": 0.3881288170814514,
"learning_rate": 1.46080562567226e-07,
"loss": 0.0782,
"num_input_tokens_seen": 3142400,
"step": 6390
},
{
"epoch": 0.8440015837402666,
"grad_norm": 0.1693449318408966,
"learning_rate": 1.4488404552964993e-07,
"loss": 0.0657,
"num_input_tokens_seen": 3144512,
"step": 6395
},
{
"epoch": 0.8446614755180151,
"grad_norm": 68.23955535888672,
"learning_rate": 1.4369206606053463e-07,
"loss": 0.0303,
"num_input_tokens_seen": 3146944,
"step": 6400
},
{
"epoch": 0.8453213672957635,
"grad_norm": 0.20148129761219025,
"learning_rate": 1.425046304849742e-07,
"loss": 0.0816,
"num_input_tokens_seen": 3149376,
"step": 6405
},
{
"epoch": 0.845981259073512,
"grad_norm": 0.5425065755844116,
"learning_rate": 1.4132174510395024e-07,
"loss": 0.1094,
"num_input_tokens_seen": 3151744,
"step": 6410
},
{
"epoch": 0.8466411508512603,
"grad_norm": 0.209197536110878,
"learning_rate": 1.4014341619430003e-07,
"loss": 0.0082,
"num_input_tokens_seen": 3154112,
"step": 6415
},
{
"epoch": 0.8473010426290088,
"grad_norm": 0.13178904354572296,
"learning_rate": 1.3896965000868188e-07,
"loss": 0.0082,
"num_input_tokens_seen": 3156480,
"step": 6420
},
{
"epoch": 0.8479609344067572,
"grad_norm": 33.16168975830078,
"learning_rate": 1.3780045277554276e-07,
"loss": 0.138,
"num_input_tokens_seen": 3158784,
"step": 6425
},
{
"epoch": 0.8486208261845057,
"grad_norm": 0.1764160841703415,
"learning_rate": 1.3663583069908535e-07,
"loss": 0.1674,
"num_input_tokens_seen": 3161152,
"step": 6430
},
{
"epoch": 0.8492807179622542,
"grad_norm": 0.02807113528251648,
"learning_rate": 1.3547578995923447e-07,
"loss": 0.0385,
"num_input_tokens_seen": 3163776,
"step": 6435
},
{
"epoch": 0.8499406097400026,
"grad_norm": 34.52450180053711,
"learning_rate": 1.3432033671160458e-07,
"loss": 0.1202,
"num_input_tokens_seen": 3166272,
"step": 6440
},
{
"epoch": 0.8503365448066517,
"eval_loss": 0.09701072424650192,
"eval_runtime": 7.7873,
"eval_samples_per_second": 864.874,
"eval_steps_per_second": 108.125,
"num_input_tokens_seen": 3167488,
"step": 6443
},
{
"epoch": 0.8506005015177511,
"grad_norm": 1.1372543573379517,
"learning_rate": 1.3316947708746762e-07,
"loss": 0.0653,
"num_input_tokens_seen": 3168640,
"step": 6445
},
{
"epoch": 0.8512603932954995,
"grad_norm": 0.052345190197229385,
"learning_rate": 1.3202321719371967e-07,
"loss": 0.1256,
"num_input_tokens_seen": 3171008,
"step": 6450
},
{
"epoch": 0.851920285073248,
"grad_norm": 0.021850943565368652,
"learning_rate": 1.3088156311284893e-07,
"loss": 0.1099,
"num_input_tokens_seen": 3173312,
"step": 6455
},
{
"epoch": 0.8525801768509964,
"grad_norm": 0.08195928484201431,
"learning_rate": 1.2974452090290322e-07,
"loss": 0.2267,
"num_input_tokens_seen": 3175808,
"step": 6460
},
{
"epoch": 0.8532400686287449,
"grad_norm": 0.05846588686108589,
"learning_rate": 1.2861209659745865e-07,
"loss": 0.0888,
"num_input_tokens_seen": 3178048,
"step": 6465
},
{
"epoch": 0.8538999604064933,
"grad_norm": 0.08579205721616745,
"learning_rate": 1.2748429620558654e-07,
"loss": 0.0148,
"num_input_tokens_seen": 3180544,
"step": 6470
},
{
"epoch": 0.8545598521842418,
"grad_norm": 9.61294937133789,
"learning_rate": 1.2636112571182167e-07,
"loss": 0.1561,
"num_input_tokens_seen": 3183040,
"step": 6475
},
{
"epoch": 0.8552197439619902,
"grad_norm": 0.1357005089521408,
"learning_rate": 1.2524259107613178e-07,
"loss": 0.1766,
"num_input_tokens_seen": 3185664,
"step": 6480
},
{
"epoch": 0.8558796357397387,
"grad_norm": 15.378425598144531,
"learning_rate": 1.2412869823388382e-07,
"loss": 0.146,
"num_input_tokens_seen": 3188672,
"step": 6485
},
{
"epoch": 0.8565395275174872,
"grad_norm": 0.08732222765684128,
"learning_rate": 1.2301945309581486e-07,
"loss": 0.0385,
"num_input_tokens_seen": 3191168,
"step": 6490
},
{
"epoch": 0.8571994192952356,
"grad_norm": 1.1724306344985962,
"learning_rate": 1.2191486154799846e-07,
"loss": 0.0822,
"num_input_tokens_seen": 3193664,
"step": 6495
},
{
"epoch": 0.8578593110729841,
"grad_norm": 0.05099056288599968,
"learning_rate": 1.208149294518147e-07,
"loss": 0.001,
"num_input_tokens_seen": 3196224,
"step": 6500
},
{
"epoch": 0.8585192028507325,
"grad_norm": 0.06140226498246193,
"learning_rate": 1.1971966264391954e-07,
"loss": 0.1988,
"num_input_tokens_seen": 3198784,
"step": 6505
},
{
"epoch": 0.859179094628481,
"grad_norm": 0.07323023676872253,
"learning_rate": 1.1862906693621233e-07,
"loss": 0.1104,
"num_input_tokens_seen": 3201472,
"step": 6510
},
{
"epoch": 0.8598389864062294,
"grad_norm": 0.11436515301465988,
"learning_rate": 1.1754314811580623e-07,
"loss": 0.1169,
"num_input_tokens_seen": 3203584,
"step": 6515
},
{
"epoch": 0.8604988781839779,
"grad_norm": 0.09268505871295929,
"learning_rate": 1.1646191194499655e-07,
"loss": 0.0712,
"num_input_tokens_seen": 3205888,
"step": 6520
},
{
"epoch": 0.8611587699617262,
"grad_norm": 1.127016544342041,
"learning_rate": 1.1538536416123168e-07,
"loss": 0.1908,
"num_input_tokens_seen": 3208000,
"step": 6525
},
{
"epoch": 0.8618186617394747,
"grad_norm": 0.36033815145492554,
"learning_rate": 1.1431351047708072e-07,
"loss": 0.0208,
"num_input_tokens_seen": 3210240,
"step": 6530
},
{
"epoch": 0.8624785535172231,
"grad_norm": 46.454463958740234,
"learning_rate": 1.1324635658020432e-07,
"loss": 0.1363,
"num_input_tokens_seen": 3212672,
"step": 6535
},
{
"epoch": 0.8631384452949716,
"grad_norm": 9.648067474365234,
"learning_rate": 1.1218390813332479e-07,
"loss": 0.1361,
"num_input_tokens_seen": 3215360,
"step": 6540
},
{
"epoch": 0.86379833707272,
"grad_norm": 0.07266692072153091,
"learning_rate": 1.1112617077419472e-07,
"loss": 0.1234,
"num_input_tokens_seen": 3218112,
"step": 6545
},
{
"epoch": 0.8644582288504685,
"grad_norm": 0.30243945121765137,
"learning_rate": 1.1007315011556884e-07,
"loss": 0.0346,
"num_input_tokens_seen": 3220288,
"step": 6550
},
{
"epoch": 0.865118120628217,
"grad_norm": 0.1822911947965622,
"learning_rate": 1.0902485174517251e-07,
"loss": 0.0015,
"num_input_tokens_seen": 3222976,
"step": 6555
},
{
"epoch": 0.8657780124059654,
"grad_norm": 0.06488798558712006,
"learning_rate": 1.0798128122567285e-07,
"loss": 0.0725,
"num_input_tokens_seen": 3225472,
"step": 6560
},
{
"epoch": 0.8664379041837139,
"grad_norm": 0.4437669813632965,
"learning_rate": 1.0694244409464992e-07,
"loss": 0.1631,
"num_input_tokens_seen": 3228096,
"step": 6565
},
{
"epoch": 0.8670977959614623,
"grad_norm": 0.030641254037618637,
"learning_rate": 1.0590834586456577e-07,
"loss": 0.1158,
"num_input_tokens_seen": 3230720,
"step": 6570
},
{
"epoch": 0.8677576877392108,
"grad_norm": 149.95916748046875,
"learning_rate": 1.0487899202273708e-07,
"loss": 0.1239,
"num_input_tokens_seen": 3233088,
"step": 6575
},
{
"epoch": 0.8684175795169592,
"grad_norm": 26.74003791809082,
"learning_rate": 1.0385438803130364e-07,
"loss": 0.1255,
"num_input_tokens_seen": 3235712,
"step": 6580
},
{
"epoch": 0.8690774712947077,
"grad_norm": 0.10586915165185928,
"learning_rate": 1.0283453932720199e-07,
"loss": 0.1423,
"num_input_tokens_seen": 3238528,
"step": 6585
},
{
"epoch": 0.8697373630724561,
"grad_norm": 0.10404416173696518,
"learning_rate": 1.0181945132213476e-07,
"loss": 0.0738,
"num_input_tokens_seen": 3240896,
"step": 6590
},
{
"epoch": 0.8703972548502046,
"grad_norm": 0.10048986971378326,
"learning_rate": 1.0080912940254227e-07,
"loss": 0.0016,
"num_input_tokens_seen": 3243392,
"step": 6595
},
{
"epoch": 0.871057146627953,
"grad_norm": 0.277065247297287,
"learning_rate": 9.980357892957492e-08,
"loss": 0.0041,
"num_input_tokens_seen": 3245824,
"step": 6600
},
{
"epoch": 0.8717170384057015,
"grad_norm": 7.954074859619141,
"learning_rate": 9.880280523906337e-08,
"loss": 0.0031,
"num_input_tokens_seen": 3248128,
"step": 6605
},
{
"epoch": 0.8723769301834499,
"grad_norm": 13.635552406311035,
"learning_rate": 9.780681364149091e-08,
"loss": 0.1351,
"num_input_tokens_seen": 3250624,
"step": 6610
},
{
"epoch": 0.8730368219611984,
"grad_norm": 0.3135831356048584,
"learning_rate": 9.681560942196587e-08,
"loss": 0.1127,
"num_input_tokens_seen": 3253312,
"step": 6615
},
{
"epoch": 0.8736967137389469,
"grad_norm": 0.04287222400307655,
"learning_rate": 9.582919784019194e-08,
"loss": 0.1168,
"num_input_tokens_seen": 3255488,
"step": 6620
},
{
"epoch": 0.8743566055166953,
"grad_norm": 0.04105079546570778,
"learning_rate": 9.484758413044236e-08,
"loss": 0.0668,
"num_input_tokens_seen": 3257664,
"step": 6625
},
{
"epoch": 0.8750164972944438,
"grad_norm": 0.12905164062976837,
"learning_rate": 9.387077350153017e-08,
"loss": 0.0542,
"num_input_tokens_seen": 3260160,
"step": 6630
},
{
"epoch": 0.8756763890721921,
"grad_norm": 13.714322090148926,
"learning_rate": 9.289877113678168e-08,
"loss": 0.0616,
"num_input_tokens_seen": 3262528,
"step": 6635
},
{
"epoch": 0.8763362808499406,
"grad_norm": 0.020841121673583984,
"learning_rate": 9.19315821940092e-08,
"loss": 0.0576,
"num_input_tokens_seen": 3265024,
"step": 6640
},
{
"epoch": 0.876996172627689,
"grad_norm": 0.17767778038978577,
"learning_rate": 9.096921180548234e-08,
"loss": 0.1659,
"num_input_tokens_seen": 3267456,
"step": 6645
},
{
"epoch": 0.8776560644054375,
"grad_norm": 0.2553451657295227,
"learning_rate": 9.001166507790259e-08,
"loss": 0.0915,
"num_input_tokens_seen": 3270208,
"step": 6650
},
{
"epoch": 0.8783159561831859,
"grad_norm": 12.365303993225098,
"learning_rate": 8.905894709237427e-08,
"loss": 0.1045,
"num_input_tokens_seen": 3272960,
"step": 6655
},
{
"epoch": 0.8789758479609344,
"grad_norm": 0.07707412540912628,
"learning_rate": 8.811106290437975e-08,
"loss": 0.0736,
"num_input_tokens_seen": 3275136,
"step": 6660
},
{
"epoch": 0.8796357397386828,
"grad_norm": 169.89840698242188,
"learning_rate": 8.716801754375036e-08,
"loss": 0.1122,
"num_input_tokens_seen": 3277696,
"step": 6665
},
{
"epoch": 0.8802956315164313,
"grad_norm": 12.09985065460205,
"learning_rate": 8.62298160146413e-08,
"loss": 0.1268,
"num_input_tokens_seen": 3280064,
"step": 6670
},
{
"epoch": 0.8809555232941798,
"grad_norm": 0.17147305607795715,
"learning_rate": 8.529646329550466e-08,
"loss": 0.002,
"num_input_tokens_seen": 3282304,
"step": 6675
},
{
"epoch": 0.8816154150719282,
"grad_norm": 0.1868370920419693,
"learning_rate": 8.436796433906235e-08,
"loss": 0.0268,
"num_input_tokens_seen": 3284736,
"step": 6680
},
{
"epoch": 0.8822753068496767,
"grad_norm": 16.801742553710938,
"learning_rate": 8.344432407228141e-08,
"loss": 0.0431,
"num_input_tokens_seen": 3287168,
"step": 6685
},
{
"epoch": 0.8829351986274251,
"grad_norm": 1.1702982187271118,
"learning_rate": 8.252554739634577e-08,
"loss": 0.0486,
"num_input_tokens_seen": 3289600,
"step": 6690
},
{
"epoch": 0.8835950904051736,
"grad_norm": 0.043459370732307434,
"learning_rate": 8.16116391866316e-08,
"loss": 0.0731,
"num_input_tokens_seen": 3292160,
"step": 6695
},
{
"epoch": 0.884254982182922,
"grad_norm": 17.922903060913086,
"learning_rate": 8.070260429268172e-08,
"loss": 0.1312,
"num_input_tokens_seen": 3294592,
"step": 6700
},
{
"epoch": 0.8849148739606705,
"grad_norm": 129.87510681152344,
"learning_rate": 7.979844753817855e-08,
"loss": 0.0078,
"num_input_tokens_seen": 3296960,
"step": 6705
},
{
"epoch": 0.8855747657384189,
"grad_norm": 29.814653396606445,
"learning_rate": 7.889917372091982e-08,
"loss": 0.0772,
"num_input_tokens_seen": 3299200,
"step": 6710
},
{
"epoch": 0.8862346575161674,
"grad_norm": 8.637088775634766,
"learning_rate": 7.800478761279183e-08,
"loss": 0.2034,
"num_input_tokens_seen": 3301568,
"step": 6715
},
{
"epoch": 0.8868945492939158,
"grad_norm": 11.602696418762207,
"learning_rate": 7.711529395974592e-08,
"loss": 0.1794,
"num_input_tokens_seen": 3304064,
"step": 6720
},
{
"epoch": 0.8875544410716643,
"grad_norm": 0.04998312518000603,
"learning_rate": 7.623069748177135e-08,
"loss": 0.1778,
"num_input_tokens_seen": 3306432,
"step": 6725
},
{
"epoch": 0.8882143328494126,
"grad_norm": 0.4295664131641388,
"learning_rate": 7.535100287287111e-08,
"loss": 0.1002,
"num_input_tokens_seen": 3308736,
"step": 6730
},
{
"epoch": 0.8888742246271611,
"grad_norm": 0.11967656761407852,
"learning_rate": 7.447621480103783e-08,
"loss": 0.0022,
"num_input_tokens_seen": 3311168,
"step": 6735
},
{
"epoch": 0.8895341164049096,
"grad_norm": 17.428560256958008,
"learning_rate": 7.360633790822713e-08,
"loss": 0.2822,
"num_input_tokens_seen": 3313664,
"step": 6740
},
{
"epoch": 0.890194008182658,
"grad_norm": 0.2180459350347519,
"learning_rate": 7.274137681033498e-08,
"loss": 0.022,
"num_input_tokens_seen": 3316224,
"step": 6745
},
{
"epoch": 0.8908538999604065,
"grad_norm": 0.13484865427017212,
"learning_rate": 7.188133609717184e-08,
"loss": 0.0855,
"num_input_tokens_seen": 3318464,
"step": 6750
},
{
"epoch": 0.8915137917381549,
"grad_norm": 0.0493309311568737,
"learning_rate": 7.102622033243843e-08,
"loss": 0.0011,
"num_input_tokens_seen": 3320896,
"step": 6755
},
{
"epoch": 0.8921736835159034,
"grad_norm": 0.22488893568515778,
"learning_rate": 7.017603405370276e-08,
"loss": 0.1368,
"num_input_tokens_seen": 3323648,
"step": 6760
},
{
"epoch": 0.8928335752936518,
"grad_norm": 0.15953336656093597,
"learning_rate": 6.933078177237429e-08,
"loss": 0.1476,
"num_input_tokens_seen": 3326208,
"step": 6765
},
{
"epoch": 0.8934934670714003,
"grad_norm": 0.4283379912376404,
"learning_rate": 6.849046797368108e-08,
"loss": 0.0651,
"num_input_tokens_seen": 3328576,
"step": 6770
},
{
"epoch": 0.8941533588491487,
"grad_norm": 28.798320770263672,
"learning_rate": 6.765509711664574e-08,
"loss": 0.003,
"num_input_tokens_seen": 3331520,
"step": 6775
},
{
"epoch": 0.8948132506268972,
"grad_norm": 0.33185452222824097,
"learning_rate": 6.682467363406174e-08,
"loss": 0.0235,
"num_input_tokens_seen": 3334336,
"step": 6780
},
{
"epoch": 0.8954731424046456,
"grad_norm": 0.24480366706848145,
"learning_rate": 6.59992019324701e-08,
"loss": 0.0671,
"num_input_tokens_seen": 3336896,
"step": 6785
},
{
"epoch": 0.8961330341823941,
"grad_norm": 9.714395523071289,
"learning_rate": 6.517868639213553e-08,
"loss": 0.1574,
"num_input_tokens_seen": 3339328,
"step": 6790
},
{
"epoch": 0.8967929259601425,
"grad_norm": 0.48568111658096313,
"learning_rate": 6.436313136702387e-08,
"loss": 0.0331,
"num_input_tokens_seen": 3341760,
"step": 6795
},
{
"epoch": 0.897452817737891,
"grad_norm": 0.3631482720375061,
"learning_rate": 6.355254118477815e-08,
"loss": 0.0527,
"num_input_tokens_seen": 3344448,
"step": 6800
},
{
"epoch": 0.8981127095156395,
"grad_norm": 0.10991880297660828,
"learning_rate": 6.274692014669602e-08,
"loss": 0.0009,
"num_input_tokens_seen": 3347008,
"step": 6805
},
{
"epoch": 0.8987726012933879,
"grad_norm": 0.15773239731788635,
"learning_rate": 6.194627252770768e-08,
"loss": 0.0008,
"num_input_tokens_seen": 3349824,
"step": 6810
},
{
"epoch": 0.8994324930711364,
"grad_norm": 0.0758163183927536,
"learning_rate": 6.115060257635174e-08,
"loss": 0.0687,
"num_input_tokens_seen": 3352320,
"step": 6815
},
{
"epoch": 0.9000923848488848,
"grad_norm": 0.21164242923259735,
"learning_rate": 6.035991451475375e-08,
"loss": 0.0013,
"num_input_tokens_seen": 3354688,
"step": 6820
},
{
"epoch": 0.9003563415599841,
"eval_loss": 0.09568765014410019,
"eval_runtime": 7.581,
"eval_samples_per_second": 888.409,
"eval_steps_per_second": 111.068,
"num_input_tokens_seen": 3355520,
"step": 6822
},
{
"epoch": 0.9007522766266333,
"grad_norm": 0.030890563502907753,
"learning_rate": 5.9574212538603505e-08,
"loss": 0.0891,
"num_input_tokens_seen": 3357056,
"step": 6825
},
{
"epoch": 0.9014121684043817,
"grad_norm": 0.39177563786506653,
"learning_rate": 5.879350081713252e-08,
"loss": 0.0683,
"num_input_tokens_seen": 3359488,
"step": 6830
},
{
"epoch": 0.9020720601821302,
"grad_norm": 0.23050019145011902,
"learning_rate": 5.8017783493092386e-08,
"loss": 0.2249,
"num_input_tokens_seen": 3361920,
"step": 6835
},
{
"epoch": 0.9027319519598785,
"grad_norm": 0.1468856930732727,
"learning_rate": 5.7247064682732104e-08,
"loss": 0.0018,
"num_input_tokens_seen": 3364416,
"step": 6840
},
{
"epoch": 0.903391843737627,
"grad_norm": 0.22081144154071808,
"learning_rate": 5.6481348475777566e-08,
"loss": 0.0617,
"num_input_tokens_seen": 3366912,
"step": 6845
},
{
"epoch": 0.9040517355153754,
"grad_norm": 0.021701961755752563,
"learning_rate": 5.5720638935407796e-08,
"loss": 0.0014,
"num_input_tokens_seen": 3369088,
"step": 6850
},
{
"epoch": 0.9047116272931239,
"grad_norm": 0.013656373135745525,
"learning_rate": 5.49649400982356e-08,
"loss": 0.1392,
"num_input_tokens_seen": 3371520,
"step": 6855
},
{
"epoch": 0.9053715190708723,
"grad_norm": 0.04417372867465019,
"learning_rate": 5.421425597428442e-08,
"loss": 0.0007,
"num_input_tokens_seen": 3374080,
"step": 6860
},
{
"epoch": 0.9060314108486208,
"grad_norm": 126.34750366210938,
"learning_rate": 5.346859054696784e-08,
"loss": 0.0786,
"num_input_tokens_seen": 3376640,
"step": 6865
},
{
"epoch": 0.9066913026263693,
"grad_norm": 0.02389339543879032,
"learning_rate": 5.2727947773068773e-08,
"loss": 0.0794,
"num_input_tokens_seen": 3379072,
"step": 6870
},
{
"epoch": 0.9073511944041177,
"grad_norm": 0.42352914810180664,
"learning_rate": 5.199233158271732e-08,
"loss": 0.0732,
"num_input_tokens_seen": 3381696,
"step": 6875
},
{
"epoch": 0.9080110861818662,
"grad_norm": 11.932153701782227,
"learning_rate": 5.126174587937149e-08,
"loss": 0.2058,
"num_input_tokens_seen": 3384064,
"step": 6880
},
{
"epoch": 0.9086709779596146,
"grad_norm": 0.0787430927157402,
"learning_rate": 5.053619453979485e-08,
"loss": 0.0036,
"num_input_tokens_seen": 3386304,
"step": 6885
},
{
"epoch": 0.9093308697373631,
"grad_norm": 0.03228071704506874,
"learning_rate": 4.9815681414037025e-08,
"loss": 0.1486,
"num_input_tokens_seen": 3388800,
"step": 6890
},
{
"epoch": 0.9099907615151115,
"grad_norm": 0.38972869515419006,
"learning_rate": 4.910021032541334e-08,
"loss": 0.0886,
"num_input_tokens_seen": 3391232,
"step": 6895
},
{
"epoch": 0.91065065329286,
"grad_norm": 29.313077926635742,
"learning_rate": 4.838978507048319e-08,
"loss": 0.0815,
"num_input_tokens_seen": 3393664,
"step": 6900
},
{
"epoch": 0.9113105450706084,
"grad_norm": 2.1044397354125977,
"learning_rate": 4.768440941903207e-08,
"loss": 0.0055,
"num_input_tokens_seen": 3395968,
"step": 6905
},
{
"epoch": 0.9119704368483569,
"grad_norm": 0.08754704892635345,
"learning_rate": 4.698408711404944e-08,
"loss": 0.0122,
"num_input_tokens_seen": 3398272,
"step": 6910
},
{
"epoch": 0.9126303286261053,
"grad_norm": 19.10022735595703,
"learning_rate": 4.628882187171046e-08,
"loss": 0.0763,
"num_input_tokens_seen": 3400960,
"step": 6915
},
{
"epoch": 0.9132902204038538,
"grad_norm": 20.788782119750977,
"learning_rate": 4.559861738135506e-08,
"loss": 0.155,
"num_input_tokens_seen": 3403520,
"step": 6920
},
{
"epoch": 0.9139501121816023,
"grad_norm": 1.3679563999176025,
"learning_rate": 4.491347730546913e-08,
"loss": 0.229,
"num_input_tokens_seen": 3405952,
"step": 6925
},
{
"epoch": 0.9146100039593507,
"grad_norm": 18.096542358398438,
"learning_rate": 4.423340527966512e-08,
"loss": 0.128,
"num_input_tokens_seen": 3408320,
"step": 6930
},
{
"epoch": 0.9152698957370992,
"grad_norm": 0.17555084824562073,
"learning_rate": 4.355840491266205e-08,
"loss": 0.0052,
"num_input_tokens_seen": 3410880,
"step": 6935
},
{
"epoch": 0.9159297875148475,
"grad_norm": 0.056320879608392715,
"learning_rate": 4.288847978626686e-08,
"loss": 0.0576,
"num_input_tokens_seen": 3413440,
"step": 6940
},
{
"epoch": 0.916589679292596,
"grad_norm": 26.998863220214844,
"learning_rate": 4.222363345535585e-08,
"loss": 0.1275,
"num_input_tokens_seen": 3416000,
"step": 6945
},
{
"epoch": 0.9172495710703444,
"grad_norm": 12.58722972869873,
"learning_rate": 4.1563869447854505e-08,
"loss": 0.1253,
"num_input_tokens_seen": 3418240,
"step": 6950
},
{
"epoch": 0.9179094628480929,
"grad_norm": 0.30387794971466064,
"learning_rate": 4.090919126472048e-08,
"loss": 0.1407,
"num_input_tokens_seen": 3420672,
"step": 6955
},
{
"epoch": 0.9185693546258413,
"grad_norm": 65.01815795898438,
"learning_rate": 4.025960237992332e-08,
"loss": 0.0538,
"num_input_tokens_seen": 3422912,
"step": 6960
},
{
"epoch": 0.9192292464035898,
"grad_norm": 7.150808334350586,
"learning_rate": 3.961510624042741e-08,
"loss": 0.0027,
"num_input_tokens_seen": 3425408,
"step": 6965
},
{
"epoch": 0.9198891381813382,
"grad_norm": 14.636774063110352,
"learning_rate": 3.8975706266172636e-08,
"loss": 0.1111,
"num_input_tokens_seen": 3427776,
"step": 6970
},
{
"epoch": 0.9205490299590867,
"grad_norm": 41.149513244628906,
"learning_rate": 3.834140585005696e-08,
"loss": 0.0538,
"num_input_tokens_seen": 3430336,
"step": 6975
},
{
"epoch": 0.9212089217368351,
"grad_norm": 37.5268669128418,
"learning_rate": 3.771220835791844e-08,
"loss": 0.2688,
"num_input_tokens_seen": 3432896,
"step": 6980
},
{
"epoch": 0.9218688135145836,
"grad_norm": 0.18734599649906158,
"learning_rate": 3.708811712851634e-08,
"loss": 0.0703,
"num_input_tokens_seen": 3435136,
"step": 6985
},
{
"epoch": 0.9225287052923321,
"grad_norm": 0.09961698204278946,
"learning_rate": 3.6469135473514936e-08,
"loss": 0.0604,
"num_input_tokens_seen": 3437824,
"step": 6990
},
{
"epoch": 0.9231885970700805,
"grad_norm": 0.04659373685717583,
"learning_rate": 3.5855266677464744e-08,
"loss": 0.0066,
"num_input_tokens_seen": 3440320,
"step": 6995
},
{
"epoch": 0.923848488847829,
"grad_norm": 0.21239009499549866,
"learning_rate": 3.524651399778555e-08,
"loss": 0.0499,
"num_input_tokens_seen": 3442880,
"step": 7000
},
{
"epoch": 0.9245083806255774,
"grad_norm": 0.08486049622297287,
"learning_rate": 3.4642880664749296e-08,
"loss": 0.0009,
"num_input_tokens_seen": 3445120,
"step": 7005
},
{
"epoch": 0.9251682724033259,
"grad_norm": 0.2830374538898468,
"learning_rate": 3.404436988146242e-08,
"loss": 0.1758,
"num_input_tokens_seen": 3447424,
"step": 7010
},
{
"epoch": 0.9258281641810743,
"grad_norm": 0.012739721685647964,
"learning_rate": 3.345098482384956e-08,
"loss": 0.0461,
"num_input_tokens_seen": 3449920,
"step": 7015
},
{
"epoch": 0.9264880559588228,
"grad_norm": 0.5981858968734741,
"learning_rate": 3.2862728640636105e-08,
"loss": 0.0499,
"num_input_tokens_seen": 3452416,
"step": 7020
},
{
"epoch": 0.9271479477365712,
"grad_norm": 16.553138732910156,
"learning_rate": 3.227960445333155e-08,
"loss": 0.1119,
"num_input_tokens_seen": 3454912,
"step": 7025
},
{
"epoch": 0.9278078395143197,
"grad_norm": 0.03474080190062523,
"learning_rate": 3.1701615356213295e-08,
"loss": 0.0654,
"num_input_tokens_seen": 3457472,
"step": 7030
},
{
"epoch": 0.928467731292068,
"grad_norm": 0.11611025035381317,
"learning_rate": 3.112876441630985e-08,
"loss": 0.0654,
"num_input_tokens_seen": 3459712,
"step": 7035
},
{
"epoch": 0.9291276230698166,
"grad_norm": 0.19927047193050385,
"learning_rate": 3.05610546733851e-08,
"loss": 0.0532,
"num_input_tokens_seen": 3462144,
"step": 7040
},
{
"epoch": 0.9297875148475649,
"grad_norm": 13.10682201385498,
"learning_rate": 2.99984891399212e-08,
"loss": 0.2881,
"num_input_tokens_seen": 3464512,
"step": 7045
},
{
"epoch": 0.9304474066253134,
"grad_norm": 0.17246191203594208,
"learning_rate": 2.9441070801103808e-08,
"loss": 0.0061,
"num_input_tokens_seen": 3466880,
"step": 7050
},
{
"epoch": 0.931107298403062,
"grad_norm": 0.28195682168006897,
"learning_rate": 2.8888802614805085e-08,
"loss": 0.1035,
"num_input_tokens_seen": 3469248,
"step": 7055
},
{
"epoch": 0.9317671901808103,
"grad_norm": 41.38626480102539,
"learning_rate": 2.8341687511568734e-08,
"loss": 0.2707,
"num_input_tokens_seen": 3471616,
"step": 7060
},
{
"epoch": 0.9324270819585588,
"grad_norm": 0.20374363660812378,
"learning_rate": 2.7799728394594547e-08,
"loss": 0.0773,
"num_input_tokens_seen": 3474240,
"step": 7065
},
{
"epoch": 0.9330869737363072,
"grad_norm": 0.10206926614046097,
"learning_rate": 2.7262928139722198e-08,
"loss": 0.0759,
"num_input_tokens_seen": 3476800,
"step": 7070
},
{
"epoch": 0.9337468655140557,
"grad_norm": 0.04854326695203781,
"learning_rate": 2.673128959541693e-08,
"loss": 0.0879,
"num_input_tokens_seen": 3479488,
"step": 7075
},
{
"epoch": 0.9344067572918041,
"grad_norm": 0.021472515538334846,
"learning_rate": 2.620481558275367e-08,
"loss": 0.0007,
"num_input_tokens_seen": 3482176,
"step": 7080
},
{
"epoch": 0.9350666490695526,
"grad_norm": 69.08782958984375,
"learning_rate": 2.5683508895402382e-08,
"loss": 0.0318,
"num_input_tokens_seen": 3484800,
"step": 7085
},
{
"epoch": 0.935726540847301,
"grad_norm": 0.1581341028213501,
"learning_rate": 2.5167372299613853e-08,
"loss": 0.1076,
"num_input_tokens_seen": 3487488,
"step": 7090
},
{
"epoch": 0.9363864326250495,
"grad_norm": 11.627638816833496,
"learning_rate": 2.4656408534203365e-08,
"loss": 0.238,
"num_input_tokens_seen": 3489728,
"step": 7095
},
{
"epoch": 0.9370463244027979,
"grad_norm": 0.025092612951993942,
"learning_rate": 2.4150620310538273e-08,
"loss": 0.2424,
"num_input_tokens_seen": 3491904,
"step": 7100
},
{
"epoch": 0.9377062161805464,
"grad_norm": 12.157607078552246,
"learning_rate": 2.3650010312521673e-08,
"loss": 0.0751,
"num_input_tokens_seen": 3494592,
"step": 7105
},
{
"epoch": 0.9383661079582949,
"grad_norm": 0.0817142128944397,
"learning_rate": 2.3154581196579648e-08,
"loss": 0.1782,
"num_input_tokens_seen": 3497088,
"step": 7110
},
{
"epoch": 0.9390259997360433,
"grad_norm": 0.06925242394208908,
"learning_rate": 2.2664335591646377e-08,
"loss": 0.0552,
"num_input_tokens_seen": 3499520,
"step": 7115
},
{
"epoch": 0.9396858915137918,
"grad_norm": 0.029523100703954697,
"learning_rate": 2.2179276099150158e-08,
"loss": 0.1962,
"num_input_tokens_seen": 3502208,
"step": 7120
},
{
"epoch": 0.9403457832915402,
"grad_norm": 121.08486938476562,
"learning_rate": 2.1699405293000182e-08,
"loss": 0.1811,
"num_input_tokens_seen": 3504640,
"step": 7125
},
{
"epoch": 0.9410056750692887,
"grad_norm": 0.1253107488155365,
"learning_rate": 2.1224725719572235e-08,
"loss": 0.0653,
"num_input_tokens_seen": 3506944,
"step": 7130
},
{
"epoch": 0.9416655668470371,
"grad_norm": 46.052162170410156,
"learning_rate": 2.0755239897695453e-08,
"loss": 0.1533,
"num_input_tokens_seen": 3509376,
"step": 7135
},
{
"epoch": 0.9423254586247856,
"grad_norm": 0.4726586639881134,
"learning_rate": 2.0290950318639256e-08,
"loss": 0.1645,
"num_input_tokens_seen": 3511680,
"step": 7140
},
{
"epoch": 0.942985350402534,
"grad_norm": 3.1492843627929688,
"learning_rate": 1.983185944609944e-08,
"loss": 0.0611,
"num_input_tokens_seen": 3514112,
"step": 7145
},
{
"epoch": 0.9436452421802825,
"grad_norm": 0.20620296895503998,
"learning_rate": 1.9377969716185994e-08,
"loss": 0.0665,
"num_input_tokens_seen": 3516480,
"step": 7150
},
{
"epoch": 0.9443051339580308,
"grad_norm": 0.07421538978815079,
"learning_rate": 1.8929283537408968e-08,
"loss": 0.1162,
"num_input_tokens_seen": 3518720,
"step": 7155
},
{
"epoch": 0.9449650257357793,
"grad_norm": 0.12716051936149597,
"learning_rate": 1.848580329066718e-08,
"loss": 0.0086,
"num_input_tokens_seen": 3521216,
"step": 7160
},
{
"epoch": 0.9456249175135277,
"grad_norm": 18.4110164642334,
"learning_rate": 1.804753132923431e-08,
"loss": 0.3859,
"num_input_tokens_seen": 3523776,
"step": 7165
},
{
"epoch": 0.9462848092912762,
"grad_norm": 0.3863736093044281,
"learning_rate": 1.7614469978746827e-08,
"loss": 0.0012,
"num_input_tokens_seen": 3526272,
"step": 7170
},
{
"epoch": 0.9469447010690247,
"grad_norm": 67.91719818115234,
"learning_rate": 1.7186621537192304e-08,
"loss": 0.0324,
"num_input_tokens_seen": 3528576,
"step": 7175
},
{
"epoch": 0.9476045928467731,
"grad_norm": 0.12099135667085648,
"learning_rate": 1.6763988274896003e-08,
"loss": 0.0012,
"num_input_tokens_seen": 3531136,
"step": 7180
},
{
"epoch": 0.9482644846245216,
"grad_norm": 14.467368125915527,
"learning_rate": 1.6346572434509876e-08,
"loss": 0.1503,
"num_input_tokens_seen": 3533696,
"step": 7185
},
{
"epoch": 0.94892437640227,
"grad_norm": 0.32109934091567993,
"learning_rate": 1.5934376231000248e-08,
"loss": 0.1569,
"num_input_tokens_seen": 3536064,
"step": 7190
},
{
"epoch": 0.9495842681800185,
"grad_norm": 0.11669757217168808,
"learning_rate": 1.55274018516357e-08,
"loss": 0.0044,
"num_input_tokens_seen": 3538432,
"step": 7195
},
{
"epoch": 0.9502441599577669,
"grad_norm": 0.06187443807721138,
"learning_rate": 1.512565145597633e-08,
"loss": 0.05,
"num_input_tokens_seen": 3541120,
"step": 7200
},
{
"epoch": 0.9503761383133166,
"eval_loss": 0.09555233269929886,
"eval_runtime": 7.635,
"eval_samples_per_second": 882.126,
"eval_steps_per_second": 110.282,
"num_input_tokens_seen": 3541632,
"step": 7201
},
{
"epoch": 0.9509040517355154,
"grad_norm": 72.59228515625,
"learning_rate": 1.47291271758615e-08,
"loss": 0.0498,
"num_input_tokens_seen": 3543680,
"step": 7205
},
{
"epoch": 0.9515639435132638,
"grad_norm": 107.97425079345703,
"learning_rate": 1.4337831115398991e-08,
"loss": 0.1477,
"num_input_tokens_seen": 3545984,
"step": 7210
},
{
"epoch": 0.9522238352910123,
"grad_norm": 17.91878318786621,
"learning_rate": 1.3951765350953548e-08,
"loss": 0.1276,
"num_input_tokens_seen": 3548544,
"step": 7215
},
{
"epoch": 0.9528837270687607,
"grad_norm": 0.03271764516830444,
"learning_rate": 1.3570931931136009e-08,
"loss": 0.1596,
"num_input_tokens_seen": 3551040,
"step": 7220
},
{
"epoch": 0.9535436188465092,
"grad_norm": 0.11517995595932007,
"learning_rate": 1.3195332876792532e-08,
"loss": 0.0839,
"num_input_tokens_seen": 3553536,
"step": 7225
},
{
"epoch": 0.9542035106242576,
"grad_norm": 0.08386459946632385,
"learning_rate": 1.2824970180993488e-08,
"loss": 0.1149,
"num_input_tokens_seen": 3555712,
"step": 7230
},
{
"epoch": 0.9548634024020061,
"grad_norm": 0.11899381130933762,
"learning_rate": 1.2459845809023484e-08,
"loss": 0.1233,
"num_input_tokens_seen": 3558080,
"step": 7235
},
{
"epoch": 0.9555232941797546,
"grad_norm": 25.366180419921875,
"learning_rate": 1.2099961698370353e-08,
"loss": 0.3036,
"num_input_tokens_seen": 3560640,
"step": 7240
},
{
"epoch": 0.956183185957503,
"grad_norm": 11.879151344299316,
"learning_rate": 1.1745319758715288e-08,
"loss": 0.0906,
"num_input_tokens_seen": 3563392,
"step": 7245
},
{
"epoch": 0.9568430777352515,
"grad_norm": 15.595983505249023,
"learning_rate": 1.1395921871922509e-08,
"loss": 0.1414,
"num_input_tokens_seen": 3565824,
"step": 7250
},
{
"epoch": 0.9575029695129998,
"grad_norm": 0.1044035479426384,
"learning_rate": 1.105176989202905e-08,
"loss": 0.0009,
"num_input_tokens_seen": 3568256,
"step": 7255
},
{
"epoch": 0.9581628612907483,
"grad_norm": 194.96958923339844,
"learning_rate": 1.0712865645235659e-08,
"loss": 0.0157,
"num_input_tokens_seen": 3570752,
"step": 7260
},
{
"epoch": 0.9588227530684967,
"grad_norm": 0.13970717787742615,
"learning_rate": 1.0379210929896131e-08,
"loss": 0.0805,
"num_input_tokens_seen": 3572928,
"step": 7265
},
{
"epoch": 0.9594826448462452,
"grad_norm": 26.312641143798828,
"learning_rate": 1.0050807516508553e-08,
"loss": 0.2674,
"num_input_tokens_seen": 3575296,
"step": 7270
},
{
"epoch": 0.9601425366239936,
"grad_norm": 0.14852392673492432,
"learning_rate": 9.727657147705737e-09,
"loss": 0.0011,
"num_input_tokens_seen": 3577664,
"step": 7275
},
{
"epoch": 0.9608024284017421,
"grad_norm": 0.4168717563152313,
"learning_rate": 9.409761538245575e-09,
"loss": 0.1992,
"num_input_tokens_seen": 3580160,
"step": 7280
},
{
"epoch": 0.9614623201794905,
"grad_norm": 0.20540349185466766,
"learning_rate": 9.097122375002264e-09,
"loss": 0.0761,
"num_input_tokens_seen": 3582464,
"step": 7285
},
{
"epoch": 0.962122211957239,
"grad_norm": 0.28160926699638367,
"learning_rate": 8.789741316957312e-09,
"loss": 0.1308,
"num_input_tokens_seen": 3584896,
"step": 7290
},
{
"epoch": 0.9627821037349875,
"grad_norm": 35.051509857177734,
"learning_rate": 8.487619995190986e-09,
"loss": 0.005,
"num_input_tokens_seen": 3587584,
"step": 7295
},
{
"epoch": 0.9634419955127359,
"grad_norm": 19.22887420654297,
"learning_rate": 8.19076001287311e-09,
"loss": 0.1393,
"num_input_tokens_seen": 3590144,
"step": 7300
},
{
"epoch": 0.9641018872904844,
"grad_norm": 0.0941128209233284,
"learning_rate": 7.899162945254945e-09,
"loss": 0.0012,
"num_input_tokens_seen": 3592832,
"step": 7305
},
{
"epoch": 0.9647617790682328,
"grad_norm": 35.726531982421875,
"learning_rate": 7.612830339660758e-09,
"loss": 0.0509,
"num_input_tokens_seen": 3595456,
"step": 7310
},
{
"epoch": 0.9654216708459813,
"grad_norm": 20.33081817626953,
"learning_rate": 7.3317637154796105e-09,
"loss": 0.1043,
"num_input_tokens_seen": 3597888,
"step": 7315
},
{
"epoch": 0.9660815626237297,
"grad_norm": 19.57093048095703,
"learning_rate": 7.0559645641572465e-09,
"loss": 0.0687,
"num_input_tokens_seen": 3600384,
"step": 7320
},
{
"epoch": 0.9667414544014782,
"grad_norm": 19.5794677734375,
"learning_rate": 6.785434349188102e-09,
"loss": 0.1628,
"num_input_tokens_seen": 3602880,
"step": 7325
},
{
"epoch": 0.9674013461792266,
"grad_norm": 0.20137454569339752,
"learning_rate": 6.520174506107867e-09,
"loss": 0.0423,
"num_input_tokens_seen": 3605248,
"step": 7330
},
{
"epoch": 0.9680612379569751,
"grad_norm": 0.1589362919330597,
"learning_rate": 6.260186442485494e-09,
"loss": 0.0011,
"num_input_tokens_seen": 3607808,
"step": 7335
},
{
"epoch": 0.9687211297347235,
"grad_norm": 6.267104148864746,
"learning_rate": 6.005471537915863e-09,
"loss": 0.1108,
"num_input_tokens_seen": 3610112,
"step": 7340
},
{
"epoch": 0.969381021512472,
"grad_norm": 6.449219226837158,
"learning_rate": 5.756031144012685e-09,
"loss": 0.0454,
"num_input_tokens_seen": 3612352,
"step": 7345
},
{
"epoch": 0.9700409132902204,
"grad_norm": 17.85236930847168,
"learning_rate": 5.511866584400837e-09,
"loss": 0.1715,
"num_input_tokens_seen": 3614848,
"step": 7350
},
{
"epoch": 0.9707008050679689,
"grad_norm": 0.49963685870170593,
"learning_rate": 5.2729791547097e-09,
"loss": 0.0017,
"num_input_tokens_seen": 3617408,
"step": 7355
},
{
"epoch": 0.9713606968457174,
"grad_norm": 20.02973175048828,
"learning_rate": 5.039370122566389e-09,
"loss": 0.0783,
"num_input_tokens_seen": 3619968,
"step": 7360
},
{
"epoch": 0.9720205886234657,
"grad_norm": 0.46782761812210083,
"learning_rate": 4.811040727588755e-09,
"loss": 0.0965,
"num_input_tokens_seen": 3622016,
"step": 7365
},
{
"epoch": 0.9726804804012142,
"grad_norm": 14.681106567382812,
"learning_rate": 4.58799218137873e-09,
"loss": 0.1156,
"num_input_tokens_seen": 3624192,
"step": 7370
},
{
"epoch": 0.9733403721789626,
"grad_norm": 0.10944530367851257,
"learning_rate": 4.370225667516325e-09,
"loss": 0.0009,
"num_input_tokens_seen": 3626624,
"step": 7375
},
{
"epoch": 0.9740002639567111,
"grad_norm": 10.72696304321289,
"learning_rate": 4.157742341552861e-09,
"loss": 0.1827,
"num_input_tokens_seen": 3628928,
"step": 7380
},
{
"epoch": 0.9746601557344595,
"grad_norm": 0.05703306198120117,
"learning_rate": 3.950543331005307e-09,
"loss": 0.0786,
"num_input_tokens_seen": 3631552,
"step": 7385
},
{
"epoch": 0.975320047512208,
"grad_norm": 0.22080153226852417,
"learning_rate": 3.748629735349839e-09,
"loss": 0.0009,
"num_input_tokens_seen": 3633984,
"step": 7390
},
{
"epoch": 0.9759799392899564,
"grad_norm": 0.05749522149562836,
"learning_rate": 3.552002626016293e-09,
"loss": 0.1332,
"num_input_tokens_seen": 3636224,
"step": 7395
},
{
"epoch": 0.9766398310677049,
"grad_norm": 0.09394296258687973,
"learning_rate": 3.3606630463824947e-09,
"loss": 0.2453,
"num_input_tokens_seen": 3638656,
"step": 7400
},
{
"epoch": 0.9772997228454533,
"grad_norm": 0.21530580520629883,
"learning_rate": 3.174612011768607e-09,
"loss": 0.0011,
"num_input_tokens_seen": 3641408,
"step": 7405
},
{
"epoch": 0.9779596146232018,
"grad_norm": 11.811311721801758,
"learning_rate": 2.9938505094316834e-09,
"loss": 0.1615,
"num_input_tokens_seen": 3643840,
"step": 7410
},
{
"epoch": 0.9786195064009502,
"grad_norm": 0.10265132784843445,
"learning_rate": 2.8183794985605637e-09,
"loss": 0.0006,
"num_input_tokens_seen": 3646336,
"step": 7415
},
{
"epoch": 0.9792793981786987,
"grad_norm": 0.0780910775065422,
"learning_rate": 2.6481999102707654e-09,
"loss": 0.0664,
"num_input_tokens_seen": 3648960,
"step": 7420
},
{
"epoch": 0.9799392899564472,
"grad_norm": 0.4066920280456543,
"learning_rate": 2.4833126475994894e-09,
"loss": 0.0011,
"num_input_tokens_seen": 3651200,
"step": 7425
},
{
"epoch": 0.9805991817341956,
"grad_norm": 7.960684776306152,
"learning_rate": 2.3237185855008443e-09,
"loss": 0.0056,
"num_input_tokens_seen": 3653504,
"step": 7430
},
{
"epoch": 0.9812590735119441,
"grad_norm": 62.566429138183594,
"learning_rate": 2.1694185708414083e-09,
"loss": 0.2456,
"num_input_tokens_seen": 3656064,
"step": 7435
},
{
"epoch": 0.9819189652896925,
"grad_norm": 156.15281677246094,
"learning_rate": 2.0204134223952284e-09,
"loss": 0.2749,
"num_input_tokens_seen": 3658112,
"step": 7440
},
{
"epoch": 0.982578857067441,
"grad_norm": 16.413667678833008,
"learning_rate": 1.87670393083994e-09,
"loss": 0.1727,
"num_input_tokens_seen": 3660928,
"step": 7445
},
{
"epoch": 0.9832387488451894,
"grad_norm": 23.082048416137695,
"learning_rate": 1.7382908587525447e-09,
"loss": 0.0298,
"num_input_tokens_seen": 3663232,
"step": 7450
},
{
"epoch": 0.9838986406229379,
"grad_norm": 0.05155172944068909,
"learning_rate": 1.6051749406049697e-09,
"loss": 0.0013,
"num_input_tokens_seen": 3665600,
"step": 7455
},
{
"epoch": 0.9845585324006862,
"grad_norm": 0.047289684414863586,
"learning_rate": 1.4773568827607386e-09,
"loss": 0.0008,
"num_input_tokens_seen": 3668096,
"step": 7460
},
{
"epoch": 0.9852184241784347,
"grad_norm": 0.13699422776699066,
"learning_rate": 1.354837363470529e-09,
"loss": 0.0016,
"num_input_tokens_seen": 3670656,
"step": 7465
},
{
"epoch": 0.9858783159561831,
"grad_norm": 117.30753326416016,
"learning_rate": 1.23761703286962e-09,
"loss": 0.1466,
"num_input_tokens_seen": 3673024,
"step": 7470
},
{
"epoch": 0.9865382077339316,
"grad_norm": 0.04031828045845032,
"learning_rate": 1.1256965129730068e-09,
"loss": 0.0012,
"num_input_tokens_seen": 3675712,
"step": 7475
},
{
"epoch": 0.9871980995116801,
"grad_norm": 0.14629097282886505,
"learning_rate": 1.0190763976734018e-09,
"loss": 0.1029,
"num_input_tokens_seen": 3678080,
"step": 7480
},
{
"epoch": 0.9878579912894285,
"grad_norm": 59.77119064331055,
"learning_rate": 9.177572527375721e-10,
"loss": 0.1536,
"num_input_tokens_seen": 3680448,
"step": 7485
},
{
"epoch": 0.988517883067177,
"grad_norm": 9.770659446716309,
"learning_rate": 8.217396158030076e-10,
"loss": 0.0019,
"num_input_tokens_seen": 3682752,
"step": 7490
},
{
"epoch": 0.9891777748449254,
"grad_norm": 77.23456573486328,
"learning_rate": 7.310239963755904e-10,
"loss": 0.1517,
"num_input_tokens_seen": 3685376,
"step": 7495
},
{
"epoch": 0.9898376666226739,
"grad_norm": 0.24194952845573425,
"learning_rate": 6.456108758268186e-10,
"loss": 0.0016,
"num_input_tokens_seen": 3687744,
"step": 7500
},
{
"epoch": 0.9904975584004223,
"grad_norm": 49.376041412353516,
"learning_rate": 5.655007073909202e-10,
"loss": 0.1517,
"num_input_tokens_seen": 3690240,
"step": 7505
},
{
"epoch": 0.9911574501781708,
"grad_norm": 0.04725305363535881,
"learning_rate": 4.906939161627432e-10,
"loss": 0.0507,
"num_input_tokens_seen": 3692736,
"step": 7510
},
{
"epoch": 0.9918173419559192,
"grad_norm": 14.56191635131836,
"learning_rate": 4.2119089909542495e-10,
"loss": 0.201,
"num_input_tokens_seen": 3695360,
"step": 7515
},
{
"epoch": 0.9924772337336677,
"grad_norm": 0.656075119972229,
"learning_rate": 3.569920249981706e-10,
"loss": 0.0593,
"num_input_tokens_seen": 3697856,
"step": 7520
},
{
"epoch": 0.9931371255114161,
"grad_norm": 1.89599609375,
"learning_rate": 2.980976345344777e-10,
"loss": 0.027,
"num_input_tokens_seen": 3700224,
"step": 7525
},
{
"epoch": 0.9937970172891646,
"grad_norm": 0.08695349097251892,
"learning_rate": 2.445080402202482e-10,
"loss": 0.0772,
"num_input_tokens_seen": 3702592,
"step": 7530
},
{
"epoch": 0.994456909066913,
"grad_norm": 0.4214935898780823,
"learning_rate": 1.962235264222345e-10,
"loss": 0.1564,
"num_input_tokens_seen": 3704896,
"step": 7535
},
{
"epoch": 0.9951168008446615,
"grad_norm": 4.679486274719238,
"learning_rate": 1.5324434935615195e-10,
"loss": 0.0446,
"num_input_tokens_seen": 3707264,
"step": 7540
},
{
"epoch": 0.99577669262241,
"grad_norm": 11.486593246459961,
"learning_rate": 1.1557073708579057e-10,
"loss": 0.1154,
"num_input_tokens_seen": 3709824,
"step": 7545
},
{
"epoch": 0.9964365844001584,
"grad_norm": 1.4587411880493164,
"learning_rate": 8.320288952168297e-11,
"loss": 0.1286,
"num_input_tokens_seen": 3712192,
"step": 7550
},
{
"epoch": 0.9970964761779069,
"grad_norm": 97.42974090576172,
"learning_rate": 5.614097841988297e-11,
"loss": 0.0832,
"num_input_tokens_seen": 3714880,
"step": 7555
},
{
"epoch": 0.9977563679556553,
"grad_norm": 0.4481765627861023,
"learning_rate": 3.43851473808554e-11,
"loss": 0.0311,
"num_input_tokens_seen": 3717184,
"step": 7560
},
{
"epoch": 0.9984162597334038,
"grad_norm": 0.09104231745004654,
"learning_rate": 1.7935511849587192e-11,
"loss": 0.0522,
"num_input_tokens_seen": 3719424,
"step": 7565
},
{
"epoch": 0.9990761515111521,
"grad_norm": 0.0702981948852539,
"learning_rate": 6.792159113921947e-12,
"loss": 0.1208,
"num_input_tokens_seen": 3721920,
"step": 7570
},
{
"epoch": 0.9997360432889006,
"grad_norm": 39.189273834228516,
"learning_rate": 9.55148304560005e-13,
"loss": 0.1169,
"num_input_tokens_seen": 3724288,
"step": 7575
},
{
"epoch": 1.0,
"num_input_tokens_seen": 3725120,
"step": 7577,
"total_flos": 2.175051626840064e+16,
"train_loss": 0.12470523826255549,
"train_runtime": 1215.5483,
"train_samples_per_second": 49.866,
"train_steps_per_second": 6.233
}
],
"logging_steps": 5,
"max_steps": 7577,
"num_input_tokens_seen": 3725120,
"num_train_epochs": 1,
"save_steps": 379,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.175051626840064e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}