{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1713,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008764241893076249,
"grad_norm": 5.367548942565918,
"learning_rate": 4.651162790697675e-07,
"loss": 1.491,
"step": 5
},
{
"epoch": 0.017528483786152498,
"grad_norm": 3.861912488937378,
"learning_rate": 1.0465116279069768e-06,
"loss": 1.4607,
"step": 10
},
{
"epoch": 0.026292725679228746,
"grad_norm": 2.542222738265991,
"learning_rate": 1.6279069767441862e-06,
"loss": 1.4704,
"step": 15
},
{
"epoch": 0.035056967572304996,
"grad_norm": 2.1328587532043457,
"learning_rate": 2.2093023255813954e-06,
"loss": 1.4085,
"step": 20
},
{
"epoch": 0.04382120946538125,
"grad_norm": 1.5579408407211304,
"learning_rate": 2.790697674418605e-06,
"loss": 1.3603,
"step": 25
},
{
"epoch": 0.05258545135845749,
"grad_norm": 1.6026604175567627,
"learning_rate": 3.372093023255814e-06,
"loss": 1.3568,
"step": 30
},
{
"epoch": 0.06134969325153374,
"grad_norm": 1.5183006525039673,
"learning_rate": 3.953488372093024e-06,
"loss": 1.3702,
"step": 35
},
{
"epoch": 0.07011393514460999,
"grad_norm": 1.416035532951355,
"learning_rate": 4.5348837209302326e-06,
"loss": 1.3288,
"step": 40
},
{
"epoch": 0.07887817703768624,
"grad_norm": 1.4895626306533813,
"learning_rate": 5.116279069767442e-06,
"loss": 1.3292,
"step": 45
},
{
"epoch": 0.0876424189307625,
"grad_norm": 1.3430354595184326,
"learning_rate": 5.697674418604652e-06,
"loss": 1.3227,
"step": 50
},
{
"epoch": 0.09640666082383874,
"grad_norm": 1.4117517471313477,
"learning_rate": 6.279069767441861e-06,
"loss": 1.2902,
"step": 55
},
{
"epoch": 0.10517090271691498,
"grad_norm": 1.3359665870666504,
"learning_rate": 6.86046511627907e-06,
"loss": 1.3327,
"step": 60
},
{
"epoch": 0.11393514460999124,
"grad_norm": 1.4718199968338013,
"learning_rate": 7.44186046511628e-06,
"loss": 1.2973,
"step": 65
},
{
"epoch": 0.12269938650306748,
"grad_norm": 1.2470380067825317,
"learning_rate": 8.023255813953488e-06,
"loss": 1.2706,
"step": 70
},
{
"epoch": 0.13146362839614373,
"grad_norm": 1.324803352355957,
"learning_rate": 8.604651162790698e-06,
"loss": 1.2178,
"step": 75
},
{
"epoch": 0.14022787028921999,
"grad_norm": 1.3574628829956055,
"learning_rate": 9.186046511627908e-06,
"loss": 1.2316,
"step": 80
},
{
"epoch": 0.14899211218229624,
"grad_norm": 1.3636841773986816,
"learning_rate": 9.767441860465117e-06,
"loss": 1.283,
"step": 85
},
{
"epoch": 0.15775635407537247,
"grad_norm": 1.7021708488464355,
"learning_rate": 1.0348837209302327e-05,
"loss": 1.2635,
"step": 90
},
{
"epoch": 0.16652059596844873,
"grad_norm": 1.243608832359314,
"learning_rate": 1.0930232558139535e-05,
"loss": 1.2079,
"step": 95
},
{
"epoch": 0.175284837861525,
"grad_norm": 1.8144162893295288,
"learning_rate": 1.1511627906976746e-05,
"loss": 1.2186,
"step": 100
},
{
"epoch": 0.18404907975460122,
"grad_norm": 1.1823457479476929,
"learning_rate": 1.2093023255813954e-05,
"loss": 1.2103,
"step": 105
},
{
"epoch": 0.19281332164767748,
"grad_norm": 1.198132872581482,
"learning_rate": 1.2674418604651164e-05,
"loss": 1.2044,
"step": 110
},
{
"epoch": 0.20157756354075373,
"grad_norm": 11.093875885009766,
"learning_rate": 1.3255813953488372e-05,
"loss": 1.1683,
"step": 115
},
{
"epoch": 0.21034180543382996,
"grad_norm": 1.0984971523284912,
"learning_rate": 1.3837209302325583e-05,
"loss": 1.2289,
"step": 120
},
{
"epoch": 0.21910604732690622,
"grad_norm": 1.2427825927734375,
"learning_rate": 1.441860465116279e-05,
"loss": 1.1449,
"step": 125
},
{
"epoch": 0.22787028921998248,
"grad_norm": 1.2608261108398438,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.1524,
"step": 130
},
{
"epoch": 0.2366345311130587,
"grad_norm": 1.114823818206787,
"learning_rate": 1.558139534883721e-05,
"loss": 1.1769,
"step": 135
},
{
"epoch": 0.24539877300613497,
"grad_norm": 1.1239306926727295,
"learning_rate": 1.616279069767442e-05,
"loss": 1.1516,
"step": 140
},
{
"epoch": 0.2541630148992112,
"grad_norm": 1.1203042268753052,
"learning_rate": 1.674418604651163e-05,
"loss": 1.1243,
"step": 145
},
{
"epoch": 0.26292725679228746,
"grad_norm": 1.0693674087524414,
"learning_rate": 1.7325581395348837e-05,
"loss": 1.1574,
"step": 150
},
{
"epoch": 0.27169149868536374,
"grad_norm": 1.1013996601104736,
"learning_rate": 1.790697674418605e-05,
"loss": 1.1621,
"step": 155
},
{
"epoch": 0.28045574057843997,
"grad_norm": 1.1914992332458496,
"learning_rate": 1.8488372093023256e-05,
"loss": 1.1468,
"step": 160
},
{
"epoch": 0.2892199824715162,
"grad_norm": 1.1144826412200928,
"learning_rate": 1.9069767441860468e-05,
"loss": 1.153,
"step": 165
},
{
"epoch": 0.2979842243645925,
"grad_norm": 1.1576107740402222,
"learning_rate": 1.9651162790697676e-05,
"loss": 1.151,
"step": 170
},
{
"epoch": 0.3067484662576687,
"grad_norm": 1.037223219871521,
"learning_rate": 1.999991687649223e-05,
"loss": 1.1386,
"step": 175
},
{
"epoch": 0.31551270815074495,
"grad_norm": 1.1823593378067017,
"learning_rate": 1.999898175290004e-05,
"loss": 1.1368,
"step": 180
},
{
"epoch": 0.32427695004382123,
"grad_norm": 1.0528627634048462,
"learning_rate": 1.9997007698817558e-05,
"loss": 1.183,
"step": 185
},
{
"epoch": 0.33304119193689746,
"grad_norm": 1.1595643758773804,
"learning_rate": 1.9993994919356167e-05,
"loss": 1.1687,
"step": 190
},
{
"epoch": 0.3418054338299737,
"grad_norm": 1.0715525150299072,
"learning_rate": 1.9989943727554597e-05,
"loss": 1.1648,
"step": 195
},
{
"epoch": 0.35056967572305,
"grad_norm": 1.0216760635375977,
"learning_rate": 1.9984854544346367e-05,
"loss": 1.1587,
"step": 200
},
{
"epoch": 0.3593339176161262,
"grad_norm": 1.0617367029190063,
"learning_rate": 1.9978727898516087e-05,
"loss": 1.145,
"step": 205
},
{
"epoch": 0.36809815950920244,
"grad_norm": 1.0882047414779663,
"learning_rate": 1.997156442664449e-05,
"loss": 1.1652,
"step": 210
},
{
"epoch": 0.3768624014022787,
"grad_norm": 1.154795527458191,
"learning_rate": 1.9963364873042298e-05,
"loss": 1.135,
"step": 215
},
{
"epoch": 0.38562664329535495,
"grad_norm": 1.0904324054718018,
"learning_rate": 1.9954130089672893e-05,
"loss": 1.1262,
"step": 220
},
{
"epoch": 0.3943908851884312,
"grad_norm": 1.0021111965179443,
"learning_rate": 1.994386103606377e-05,
"loss": 1.1422,
"step": 225
},
{
"epoch": 0.40315512708150747,
"grad_norm": 1.0234806537628174,
"learning_rate": 1.9932558779206873e-05,
"loss": 1.1315,
"step": 230
},
{
"epoch": 0.4119193689745837,
"grad_norm": 1.10641610622406,
"learning_rate": 1.9920224493447702e-05,
"loss": 1.1824,
"step": 235
},
{
"epoch": 0.42068361086765993,
"grad_norm": 1.0238338708877563,
"learning_rate": 1.9906859460363307e-05,
"loss": 1.1442,
"step": 240
},
{
"epoch": 0.4294478527607362,
"grad_norm": 0.990487813949585,
"learning_rate": 1.989246506862913e-05,
"loss": 1.1276,
"step": 245
},
{
"epoch": 0.43821209465381245,
"grad_norm": 20.3802490234375,
"learning_rate": 1.9877042813874712e-05,
"loss": 1.1744,
"step": 250
},
{
"epoch": 0.4469763365468887,
"grad_norm": 1.1273601055145264,
"learning_rate": 1.9860594298528283e-05,
"loss": 1.1774,
"step": 255
},
{
"epoch": 0.45574057843996496,
"grad_norm": 1.014085292816162,
"learning_rate": 1.984312123165028e-05,
"loss": 1.162,
"step": 260
},
{
"epoch": 0.4645048203330412,
"grad_norm": 1.0875205993652344,
"learning_rate": 1.982462542875576e-05,
"loss": 1.1485,
"step": 265
},
{
"epoch": 0.4732690622261174,
"grad_norm": 1.0361530780792236,
"learning_rate": 1.9805108811625774e-05,
"loss": 1.1422,
"step": 270
},
{
"epoch": 0.4820333041191937,
"grad_norm": 1.0539902448654175,
"learning_rate": 1.9784573408107657e-05,
"loss": 1.0915,
"step": 275
},
{
"epoch": 0.49079754601226994,
"grad_norm": 1.05149245262146,
"learning_rate": 1.976302135190436e-05,
"loss": 1.1372,
"step": 280
},
{
"epoch": 0.49956178790534617,
"grad_norm": 1.0928102731704712,
"learning_rate": 1.9740454882352733e-05,
"loss": 1.1239,
"step": 285
},
{
"epoch": 0.5083260297984225,
"grad_norm": 1.0785322189331055,
"learning_rate": 1.971687634419086e-05,
"loss": 1.1429,
"step": 290
},
{
"epoch": 0.5170902716914987,
"grad_norm": 1.020357370376587,
"learning_rate": 1.9692288187314423e-05,
"loss": 1.1195,
"step": 295
},
{
"epoch": 0.5258545135845749,
"grad_norm": 0.9896298050880432,
"learning_rate": 1.9666692966522144e-05,
"loss": 1.1217,
"step": 300
},
{
"epoch": 0.5346187554776511,
"grad_norm": 0.9637587070465088,
"learning_rate": 1.9640093341250356e-05,
"loss": 1.1082,
"step": 305
},
{
"epoch": 0.5433829973707275,
"grad_norm": 1.2339686155319214,
"learning_rate": 1.961249207529665e-05,
"loss": 1.1459,
"step": 310
},
{
"epoch": 0.5521472392638037,
"grad_norm": 1.0626837015151978,
"learning_rate": 1.9583892036532726e-05,
"loss": 1.1257,
"step": 315
},
{
"epoch": 0.5609114811568799,
"grad_norm": 1.0179359912872314,
"learning_rate": 1.9554296196606395e-05,
"loss": 1.1111,
"step": 320
},
{
"epoch": 0.5696757230499562,
"grad_norm": 1.0226428508758545,
"learning_rate": 1.9523707630632834e-05,
"loss": 1.1673,
"step": 325
},
{
"epoch": 0.5784399649430324,
"grad_norm": 1.0737133026123047,
"learning_rate": 1.9492129516875055e-05,
"loss": 1.1325,
"step": 330
},
{
"epoch": 0.5872042068361086,
"grad_norm": 1.0531032085418701,
"learning_rate": 1.9459565136413667e-05,
"loss": 1.1478,
"step": 335
},
{
"epoch": 0.595968448729185,
"grad_norm": 1.0400668382644653,
"learning_rate": 1.942601787280598e-05,
"loss": 1.1403,
"step": 340
},
{
"epoch": 0.6047326906222612,
"grad_norm": 0.9359525442123413,
"learning_rate": 1.9391491211734426e-05,
"loss": 1.1298,
"step": 345
},
{
"epoch": 0.6134969325153374,
"grad_norm": 3.9531524181365967,
"learning_rate": 1.935598874064438e-05,
"loss": 1.1923,
"step": 350
},
{
"epoch": 0.6222611744084137,
"grad_norm": 1.0364443063735962,
"learning_rate": 1.9319514148371436e-05,
"loss": 1.1096,
"step": 355
},
{
"epoch": 0.6310254163014899,
"grad_norm": 1.0656158924102783,
"learning_rate": 1.9282071224758092e-05,
"loss": 1.1282,
"step": 360
},
{
"epoch": 0.6397896581945661,
"grad_norm": 1.0614289045333862,
"learning_rate": 1.9243663860259992e-05,
"loss": 1.1137,
"step": 365
},
{
"epoch": 0.6485539000876425,
"grad_norm": 1.002898931503296,
"learning_rate": 1.9204296045541686e-05,
"loss": 1.1091,
"step": 370
},
{
"epoch": 0.6573181419807187,
"grad_norm": 1.0451066493988037,
"learning_rate": 1.916397187106199e-05,
"loss": 1.0919,
"step": 375
},
{
"epoch": 0.6660823838737949,
"grad_norm": 1.192143201828003,
"learning_rate": 1.9122695526648968e-05,
"loss": 1.1581,
"step": 380
},
{
"epoch": 0.6748466257668712,
"grad_norm": 1.0061026811599731,
"learning_rate": 1.90804713010646e-05,
"loss": 1.116,
"step": 385
},
{
"epoch": 0.6836108676599474,
"grad_norm": 2.3462023735046387,
"learning_rate": 1.9037303581559143e-05,
"loss": 1.1323,
"step": 390
},
{
"epoch": 0.6923751095530236,
"grad_norm": 0.9700145125389099,
"learning_rate": 1.899319685341532e-05,
"loss": 1.1075,
"step": 395
},
{
"epoch": 0.7011393514461,
"grad_norm": 0.9761490821838379,
"learning_rate": 1.8948155699482243e-05,
"loss": 1.1291,
"step": 400
},
{
"epoch": 0.7099035933391762,
"grad_norm": 1.0112907886505127,
"learning_rate": 1.8902184799699265e-05,
"loss": 1.1087,
"step": 405
},
{
"epoch": 0.7186678352322524,
"grad_norm": 0.9741994738578796,
"learning_rate": 1.885528893060969e-05,
"loss": 1.1181,
"step": 410
},
{
"epoch": 0.7274320771253286,
"grad_norm": 0.9536153078079224,
"learning_rate": 1.8807472964864516e-05,
"loss": 1.114,
"step": 415
},
{
"epoch": 0.7361963190184049,
"grad_norm": 0.9664406180381775,
"learning_rate": 1.8758741870716093e-05,
"loss": 1.1474,
"step": 420
},
{
"epoch": 0.7449605609114811,
"grad_norm": 0.999437689781189,
"learning_rate": 1.8709100711501957e-05,
"loss": 1.1067,
"step": 425
},
{
"epoch": 0.7537248028045574,
"grad_norm": 1.0034998655319214,
"learning_rate": 1.865855464511869e-05,
"loss": 1.1409,
"step": 430
},
{
"epoch": 0.7624890446976337,
"grad_norm": 1.0300318002700806,
"learning_rate": 1.8607108923486025e-05,
"loss": 1.1289,
"step": 435
},
{
"epoch": 0.7712532865907099,
"grad_norm": 0.9994638562202454,
"learning_rate": 1.8554768892001137e-05,
"loss": 1.1093,
"step": 440
},
{
"epoch": 0.7800175284837861,
"grad_norm": 0.9193391799926758,
"learning_rate": 1.8501539988983234e-05,
"loss": 1.1377,
"step": 445
},
{
"epoch": 0.7887817703768624,
"grad_norm": 1.0165811777114868,
"learning_rate": 1.844742774510851e-05,
"loss": 1.1204,
"step": 450
},
{
"epoch": 0.7975460122699386,
"grad_norm": 0.9755986928939819,
"learning_rate": 1.8392437782835475e-05,
"loss": 1.0935,
"step": 455
},
{
"epoch": 0.8063102541630149,
"grad_norm": 0.977584183216095,
"learning_rate": 1.8336575815820764e-05,
"loss": 1.1064,
"step": 460
},
{
"epoch": 0.8150744960560912,
"grad_norm": 0.9432125687599182,
"learning_rate": 1.8279847648325478e-05,
"loss": 1.099,
"step": 465
},
{
"epoch": 0.8238387379491674,
"grad_norm": 1.0756127834320068,
"learning_rate": 1.822225917461208e-05,
"loss": 1.0926,
"step": 470
},
{
"epoch": 0.8326029798422436,
"grad_norm": 1.0018426179885864,
"learning_rate": 1.8163816378331983e-05,
"loss": 1.1292,
"step": 475
},
{
"epoch": 0.8413672217353199,
"grad_norm": 1.0097193717956543,
"learning_rate": 1.81045253319038e-05,
"loss": 1.0738,
"step": 480
},
{
"epoch": 0.8501314636283961,
"grad_norm": 0.9783721566200256,
"learning_rate": 1.8044392195882428e-05,
"loss": 1.1059,
"step": 485
},
{
"epoch": 0.8588957055214724,
"grad_norm": 0.9834737181663513,
"learning_rate": 1.7983423218318918e-05,
"loss": 1.1063,
"step": 490
},
{
"epoch": 0.8676599474145487,
"grad_norm": 1.0263484716415405,
"learning_rate": 1.7921624734111292e-05,
"loss": 1.1325,
"step": 495
},
{
"epoch": 0.8764241893076249,
"grad_norm": 0.9454247951507568,
"learning_rate": 1.7859003164346334e-05,
"loss": 1.0937,
"step": 500
},
{
"epoch": 0.8851884312007011,
"grad_norm": 1.006463646888733,
"learning_rate": 1.779556501563239e-05,
"loss": 1.0511,
"step": 505
},
{
"epoch": 0.8939526730937774,
"grad_norm": 6.430685043334961,
"learning_rate": 1.773131687942333e-05,
"loss": 1.0899,
"step": 510
},
{
"epoch": 0.9027169149868537,
"grad_norm": 1.3062087297439575,
"learning_rate": 1.7666265431333654e-05,
"loss": 1.1047,
"step": 515
},
{
"epoch": 0.9114811568799299,
"grad_norm": 1.0522316694259644,
"learning_rate": 1.76004174304449e-05,
"loss": 1.1009,
"step": 520
},
{
"epoch": 0.9202453987730062,
"grad_norm": 0.9894193410873413,
"learning_rate": 1.7533779718603315e-05,
"loss": 1.0761,
"step": 525
},
{
"epoch": 0.9290096406660824,
"grad_norm": 1.0116757154464722,
"learning_rate": 1.7466359219708987e-05,
"loss": 1.1305,
"step": 530
},
{
"epoch": 0.9377738825591586,
"grad_norm": 0.962745726108551,
"learning_rate": 1.739816293899642e-05,
"loss": 1.0758,
"step": 535
},
{
"epoch": 0.9465381244522348,
"grad_norm": 0.9733975529670715,
"learning_rate": 1.7329197962306666e-05,
"loss": 1.0752,
"step": 540
},
{
"epoch": 0.9553023663453112,
"grad_norm": 1.026983618736267,
"learning_rate": 1.7259471455351072e-05,
"loss": 1.0576,
"step": 545
},
{
"epoch": 0.9640666082383874,
"grad_norm": 0.9675541520118713,
"learning_rate": 1.718899066296675e-05,
"loss": 1.0759,
"step": 550
},
{
"epoch": 0.9728308501314636,
"grad_norm": 0.9842016100883484,
"learning_rate": 1.71177629083638e-05,
"loss": 1.0704,
"step": 555
},
{
"epoch": 0.9815950920245399,
"grad_norm": 0.9556295871734619,
"learning_rate": 1.7045795592364413e-05,
"loss": 1.1343,
"step": 560
},
{
"epoch": 0.9903593339176161,
"grad_norm": 1.033588171005249,
"learning_rate": 1.6973096192633884e-05,
"loss": 1.0947,
"step": 565
},
{
"epoch": 0.9991235758106923,
"grad_norm": 1.971771240234375,
"learning_rate": 1.6899672262903675e-05,
"loss": 1.1293,
"step": 570
},
{
"epoch": 1.0,
"eval_loss": 1.1597641706466675,
"eval_runtime": 199.4031,
"eval_samples_per_second": 9.162,
"eval_steps_per_second": 2.292,
"step": 571
},
{
"epoch": 1.007011393514461,
"grad_norm": 1.0958155393600464,
"learning_rate": 1.6825531432186545e-05,
"loss": 1.0193,
"step": 575
},
{
"epoch": 1.0157756354075373,
"grad_norm": 1.108912467956543,
"learning_rate": 1.6750681403983847e-05,
"loss": 0.9767,
"step": 580
},
{
"epoch": 1.0245398773006136,
"grad_norm": 1.0128231048583984,
"learning_rate": 1.6675129955485154e-05,
"loss": 0.9534,
"step": 585
},
{
"epoch": 1.0333041191936898,
"grad_norm": 0.9520951509475708,
"learning_rate": 1.659888493676013e-05,
"loss": 0.9388,
"step": 590
},
{
"epoch": 1.042068361086766,
"grad_norm": 0.98039710521698,
"learning_rate": 1.652195426994292e-05,
"loss": 0.97,
"step": 595
},
{
"epoch": 1.0508326029798423,
"grad_norm": 1.0683668851852417,
"learning_rate": 1.6444345948408985e-05,
"loss": 0.9539,
"step": 600
},
{
"epoch": 1.0595968448729185,
"grad_norm": 1.0525304079055786,
"learning_rate": 1.636606803594457e-05,
"loss": 0.9534,
"step": 605
},
{
"epoch": 1.0683610867659947,
"grad_norm": 0.999165415763855,
"learning_rate": 1.628712866590885e-05,
"loss": 0.9634,
"step": 610
},
{
"epoch": 1.077125328659071,
"grad_norm": 0.9724875688552856,
"learning_rate": 1.6207536040388844e-05,
"loss": 0.9559,
"step": 615
},
{
"epoch": 1.0858895705521472,
"grad_norm": 0.9775224924087524,
"learning_rate": 1.612729842934718e-05,
"loss": 0.9771,
"step": 620
},
{
"epoch": 1.0946538124452234,
"grad_norm": 1.2882146835327148,
"learning_rate": 1.604642416976283e-05,
"loss": 0.9027,
"step": 625
},
{
"epoch": 1.1034180543382996,
"grad_norm": 1.0088601112365723,
"learning_rate": 1.596492166476485e-05,
"loss": 0.9494,
"step": 630
},
{
"epoch": 1.112182296231376,
"grad_norm": 1.0667091608047485,
"learning_rate": 1.588279938275929e-05,
"loss": 0.9493,
"step": 635
},
{
"epoch": 1.1209465381244523,
"grad_norm": 1.0000181198120117,
"learning_rate": 1.580006585654927e-05,
"loss": 0.9609,
"step": 640
},
{
"epoch": 1.1297107800175286,
"grad_norm": 0.9999110102653503,
"learning_rate": 1.5716729682448392e-05,
"loss": 1.0068,
"step": 645
},
{
"epoch": 1.1384750219106048,
"grad_norm": 1.0657200813293457,
"learning_rate": 1.563279951938758e-05,
"loss": 0.9676,
"step": 650
},
{
"epoch": 1.147239263803681,
"grad_norm": 1.029891848564148,
"learning_rate": 1.5548284088015354e-05,
"loss": 0.9623,
"step": 655
},
{
"epoch": 1.1560035056967572,
"grad_norm": 1.015758752822876,
"learning_rate": 1.546319216979174e-05,
"loss": 0.9897,
"step": 660
},
{
"epoch": 1.1647677475898335,
"grad_norm": 0.9652720093727112,
"learning_rate": 1.537753260607584e-05,
"loss": 0.9607,
"step": 665
},
{
"epoch": 1.1735319894829097,
"grad_norm": 1.0845791101455688,
"learning_rate": 1.5291314297207177e-05,
"loss": 0.9783,
"step": 670
},
{
"epoch": 1.182296231375986,
"grad_norm": 1.0521138906478882,
"learning_rate": 1.520454620158093e-05,
"loss": 0.9836,
"step": 675
},
{
"epoch": 1.1910604732690622,
"grad_norm": 0.9807194471359253,
"learning_rate": 1.5117237334717117e-05,
"loss": 0.9443,
"step": 680
},
{
"epoch": 1.1998247151621384,
"grad_norm": 1.0408189296722412,
"learning_rate": 1.5029396768323847e-05,
"loss": 0.9755,
"step": 685
},
{
"epoch": 1.2085889570552146,
"grad_norm": 1.0140528678894043,
"learning_rate": 1.4941033629354735e-05,
"loss": 0.942,
"step": 690
},
{
"epoch": 1.2173531989482909,
"grad_norm": 1.028287649154663,
"learning_rate": 1.4852157099060595e-05,
"loss": 0.9362,
"step": 695
},
{
"epoch": 1.2261174408413673,
"grad_norm": 0.9807888269424438,
"learning_rate": 1.4762776412035455e-05,
"loss": 0.9752,
"step": 700
},
{
"epoch": 1.2348816827344435,
"grad_norm": 1.0794785022735596,
"learning_rate": 1.4672900855257056e-05,
"loss": 0.9508,
"step": 705
},
{
"epoch": 1.2436459246275198,
"grad_norm": 1.0464166402816772,
"learning_rate": 1.4582539767121904e-05,
"loss": 0.9519,
"step": 710
},
{
"epoch": 1.252410166520596,
"grad_norm": 0.9949556589126587,
"learning_rate": 1.449170253647498e-05,
"loss": 0.9188,
"step": 715
},
{
"epoch": 1.2611744084136722,
"grad_norm": 0.9590442180633545,
"learning_rate": 1.4400398601634189e-05,
"loss": 0.9686,
"step": 720
},
{
"epoch": 1.2699386503067485,
"grad_norm": 1.0098439455032349,
"learning_rate": 1.4308637449409705e-05,
"loss": 0.9848,
"step": 725
},
{
"epoch": 1.2787028921998247,
"grad_norm": 1.026219367980957,
"learning_rate": 1.4216428614118245e-05,
"loss": 0.9595,
"step": 730
},
{
"epoch": 1.287467134092901,
"grad_norm": 1.0277692079544067,
"learning_rate": 1.4123781676592418e-05,
"loss": 0.9773,
"step": 735
},
{
"epoch": 1.2962313759859772,
"grad_norm": 1.0222140550613403,
"learning_rate": 1.4030706263185248e-05,
"loss": 0.9399,
"step": 740
},
{
"epoch": 1.3049956178790534,
"grad_norm": 0.9892441630363464,
"learning_rate": 1.3937212044769957e-05,
"loss": 0.985,
"step": 745
},
{
"epoch": 1.3137598597721296,
"grad_norm": 1.0329406261444092,
"learning_rate": 1.384330873573513e-05,
"loss": 0.9369,
"step": 750
},
{
"epoch": 1.322524101665206,
"grad_norm": 0.9816661477088928,
"learning_rate": 1.3749006092975347e-05,
"loss": 0.9457,
"step": 755
},
{
"epoch": 1.331288343558282,
"grad_norm": 1.0054512023925781,
"learning_rate": 1.3654313914877414e-05,
"loss": 0.9087,
"step": 760
},
{
"epoch": 1.3400525854513585,
"grad_norm": 17.338027954101562,
"learning_rate": 1.3559242040302274e-05,
"loss": 0.9808,
"step": 765
},
{
"epoch": 1.3488168273444348,
"grad_norm": 1.0706207752227783,
"learning_rate": 1.3463800347562705e-05,
"loss": 0.9679,
"step": 770
},
{
"epoch": 1.357581069237511,
"grad_norm": 1.040747046470642,
"learning_rate": 1.3367998753396944e-05,
"loss": 0.9974,
"step": 775
},
{
"epoch": 1.3663453111305872,
"grad_norm": 0.9935981631278992,
"learning_rate": 1.3271847211938286e-05,
"loss": 0.9428,
"step": 780
},
{
"epoch": 1.3751095530236634,
"grad_norm": 1.0025993585586548,
"learning_rate": 1.317535571368082e-05,
"loss": 0.9462,
"step": 785
},
{
"epoch": 1.3838737949167397,
"grad_norm": 0.9988533854484558,
"learning_rate": 1.3078534284441382e-05,
"loss": 0.9734,
"step": 790
},
{
"epoch": 1.392638036809816,
"grad_norm": 1.0070812702178955,
"learning_rate": 1.2981392984317835e-05,
"loss": 0.9622,
"step": 795
},
{
"epoch": 1.4014022787028921,
"grad_norm": 1.0259467363357544,
"learning_rate": 1.2883941906643786e-05,
"loss": 0.9671,
"step": 800
},
{
"epoch": 1.4101665205959684,
"grad_norm": 1.0248597860336304,
"learning_rate": 1.2786191176939848e-05,
"loss": 0.9402,
"step": 805
},
{
"epoch": 1.4189307624890448,
"grad_norm": 1.008159875869751,
"learning_rate": 1.2688150951861582e-05,
"loss": 1.0111,
"step": 810
},
{
"epoch": 1.4276950043821208,
"grad_norm": 1.024697184562683,
"learning_rate": 1.2589831418144156e-05,
"loss": 0.9354,
"step": 815
},
{
"epoch": 1.4364592462751973,
"grad_norm": 0.9872326254844666,
"learning_rate": 1.2491242791543922e-05,
"loss": 0.9424,
"step": 820
},
{
"epoch": 1.4452234881682735,
"grad_norm": 1.0979632139205933,
"learning_rate": 1.2392395315776964e-05,
"loss": 0.9594,
"step": 825
},
{
"epoch": 1.4539877300613497,
"grad_norm": 0.9879066944122314,
"learning_rate": 1.2293299261454726e-05,
"loss": 0.9762,
"step": 830
},
{
"epoch": 1.462751971954426,
"grad_norm": 1.278245210647583,
"learning_rate": 1.2193964925016872e-05,
"loss": 0.9458,
"step": 835
},
{
"epoch": 1.4715162138475022,
"grad_norm": 1.4075230360031128,
"learning_rate": 1.2094402627661447e-05,
"loss": 0.9496,
"step": 840
},
{
"epoch": 1.4802804557405784,
"grad_norm": 1.059368371963501,
"learning_rate": 1.1994622714272448e-05,
"loss": 0.965,
"step": 845
},
{
"epoch": 1.4890446976336547,
"grad_norm": 0.9740917086601257,
"learning_rate": 1.1894635552344976e-05,
"loss": 0.939,
"step": 850
},
{
"epoch": 1.4978089395267309,
"grad_norm": 0.9713614583015442,
"learning_rate": 1.1794451530908011e-05,
"loss": 0.9256,
"step": 855
},
{
"epoch": 1.5065731814198071,
"grad_norm": 1.023720145225525,
"learning_rate": 1.1694081059444947e-05,
"loss": 0.9548,
"step": 860
},
{
"epoch": 1.5153374233128836,
"grad_norm": 1.1291546821594238,
"learning_rate": 1.159353456681201e-05,
"loss": 0.9512,
"step": 865
},
{
"epoch": 1.5241016652059596,
"grad_norm": 0.9696962833404541,
"learning_rate": 1.1492822500154668e-05,
"loss": 0.9715,
"step": 870
},
{
"epoch": 1.532865907099036,
"grad_norm": 1.0114858150482178,
"learning_rate": 1.1391955323822126e-05,
"loss": 0.9355,
"step": 875
},
{
"epoch": 1.541630148992112,
"grad_norm": 1.0963616371154785,
"learning_rate": 1.1290943518280058e-05,
"loss": 0.9779,
"step": 880
},
{
"epoch": 1.5503943908851885,
"grad_norm": 0.9969412684440613,
"learning_rate": 1.118979757902162e-05,
"loss": 0.9589,
"step": 885
},
{
"epoch": 1.5591586327782647,
"grad_norm": 1.022300362586975,
"learning_rate": 1.1088528015476965e-05,
"loss": 0.9656,
"step": 890
},
{
"epoch": 1.567922874671341,
"grad_norm": 0.974607527256012,
"learning_rate": 1.098714534992125e-05,
"loss": 0.9622,
"step": 895
},
{
"epoch": 1.5766871165644172,
"grad_norm": 1.0116885900497437,
"learning_rate": 1.088566011638134e-05,
"loss": 0.9343,
"step": 900
},
{
"epoch": 1.5854513584574934,
"grad_norm": 1.0116138458251953,
"learning_rate": 1.0784082859541291e-05,
"loss": 0.9383,
"step": 905
},
{
"epoch": 1.5942156003505696,
"grad_norm": 1.2108194828033447,
"learning_rate": 1.0682424133646712e-05,
"loss": 0.9171,
"step": 910
},
{
"epoch": 1.6029798422436459,
"grad_norm": 1.0214340686798096,
"learning_rate": 1.0580694501408138e-05,
"loss": 0.9675,
"step": 915
},
{
"epoch": 1.6117440841367223,
"grad_norm": 1.0362666845321655,
"learning_rate": 1.0478904532903535e-05,
"loss": 1.0028,
"step": 920
},
{
"epoch": 1.6205083260297983,
"grad_norm": 0.9954794049263,
"learning_rate": 1.0377064804480025e-05,
"loss": 0.9624,
"step": 925
},
{
"epoch": 1.6292725679228748,
"grad_norm": 1.0109649896621704,
"learning_rate": 1.0275185897654972e-05,
"loss": 0.9501,
"step": 930
},
{
"epoch": 1.6380368098159508,
"grad_norm": 1.0172914266586304,
"learning_rate": 1.0173278398016502e-05,
"loss": 0.9354,
"step": 935
},
{
"epoch": 1.6468010517090272,
"grad_norm": 0.9905017614364624,
"learning_rate": 1.0071352894123654e-05,
"loss": 0.9758,
"step": 940
},
{
"epoch": 1.6555652936021035,
"grad_norm": 0.9832938313484192,
"learning_rate": 9.969419976406166e-06,
"loss": 0.9737,
"step": 945
},
{
"epoch": 1.6643295354951797,
"grad_norm": 0.9569029808044434,
"learning_rate": 9.867490236064109e-06,
"loss": 0.9212,
"step": 950
},
{
"epoch": 1.673093777388256,
"grad_norm": 1.0192569494247437,
"learning_rate": 9.765574263967397e-06,
"loss": 0.9472,
"step": 955
},
{
"epoch": 1.6818580192813322,
"grad_norm": 0.9713300466537476,
"learning_rate": 9.663682649555389e-06,
"loss": 0.9644,
"step": 960
},
{
"epoch": 1.6906222611744084,
"grad_norm": 0.9462825655937195,
"learning_rate": 9.56182597973658e-06,
"loss": 0.9576,
"step": 965
},
{
"epoch": 1.6993865030674846,
"grad_norm": 0.9868680834770203,
"learning_rate": 9.460014837788605e-06,
"loss": 0.9667,
"step": 970
},
{
"epoch": 1.708150744960561,
"grad_norm": 1.0376570224761963,
"learning_rate": 9.358259802258582e-06,
"loss": 0.9452,
"step": 975
},
{
"epoch": 1.716914986853637,
"grad_norm": 1.0066869258880615,
"learning_rate": 9.256571445863972e-06,
"loss": 0.9534,
"step": 980
},
{
"epoch": 1.7256792287467135,
"grad_norm": 1.0090934038162231,
"learning_rate": 9.154960334394027e-06,
"loss": 0.955,
"step": 985
},
{
"epoch": 1.7344434706397895,
"grad_norm": 0.9518396854400635,
"learning_rate": 9.053437025611974e-06,
"loss": 0.9342,
"step": 990
},
{
"epoch": 1.743207712532866,
"grad_norm": 0.9992371797561646,
"learning_rate": 8.952012068158027e-06,
"loss": 0.9722,
"step": 995
},
{
"epoch": 1.751971954425942,
"grad_norm": 0.9554047584533691,
"learning_rate": 8.850696000453327e-06,
"loss": 0.9357,
"step": 1000
},
{
"epoch": 1.7607361963190185,
"grad_norm": 0.9989141225814819,
"learning_rate": 8.749499349604992e-06,
"loss": 0.9821,
"step": 1005
},
{
"epoch": 1.7695004382120947,
"grad_norm": 0.9504846334457397,
"learning_rate": 8.64843263031228e-06,
"loss": 0.9537,
"step": 1010
},
{
"epoch": 1.778264680105171,
"grad_norm": 0.9558340907096863,
"learning_rate": 8.547506343774097e-06,
"loss": 0.9289,
"step": 1015
},
{
"epoch": 1.7870289219982471,
"grad_norm": 1.0170910358428955,
"learning_rate": 8.446730976597877e-06,
"loss": 0.9501,
"step": 1020
},
{
"epoch": 1.7957931638913234,
"grad_norm": 0.9939414262771606,
"learning_rate": 8.346116999709975e-06,
"loss": 0.9472,
"step": 1025
},
{
"epoch": 1.8045574057843996,
"grad_norm": 0.9810356497764587,
"learning_rate": 8.245674867267724e-06,
"loss": 0.9491,
"step": 1030
},
{
"epoch": 1.8133216476774758,
"grad_norm": 0.9643825888633728,
"learning_rate": 8.145415015573183e-06,
"loss": 0.947,
"step": 1035
},
{
"epoch": 1.8220858895705523,
"grad_norm": 0.9195330739021301,
"learning_rate": 8.045347861988789e-06,
"loss": 0.876,
"step": 1040
},
{
"epoch": 1.8308501314636283,
"grad_norm": 0.9632524847984314,
"learning_rate": 7.945483803854937e-06,
"loss": 0.9173,
"step": 1045
},
{
"epoch": 1.8396143733567047,
"grad_norm": 0.9642296433448792,
"learning_rate": 7.845833217409677e-06,
"loss": 0.9233,
"step": 1050
},
{
"epoch": 1.8483786152497808,
"grad_norm": 0.9421396851539612,
"learning_rate": 7.746406456710564e-06,
"loss": 0.9187,
"step": 1055
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.9888685345649719,
"learning_rate": 7.64721385255886e-06,
"loss": 0.9289,
"step": 1060
},
{
"epoch": 1.8659070990359334,
"grad_norm": 0.9585088491439819,
"learning_rate": 7.548265711426105e-06,
"loss": 0.9291,
"step": 1065
},
{
"epoch": 1.8746713409290097,
"grad_norm": 0.9842194318771362,
"learning_rate": 7.449572314383237e-06,
"loss": 0.9521,
"step": 1070
},
{
"epoch": 1.883435582822086,
"grad_norm": 0.9899460077285767,
"learning_rate": 7.351143916032375e-06,
"loss": 0.9238,
"step": 1075
},
{
"epoch": 1.8921998247151621,
"grad_norm": 1.044392466545105,
"learning_rate": 7.252990743441293e-06,
"loss": 0.9354,
"step": 1080
},
{
"epoch": 1.9009640666082384,
"grad_norm": 0.9790138006210327,
"learning_rate": 7.155122995080826e-06,
"loss": 0.9527,
"step": 1085
},
{
"epoch": 1.9097283085013146,
"grad_norm": 1.002880334854126,
"learning_rate": 7.0575508397651885e-06,
"loss": 0.9471,
"step": 1090
},
{
"epoch": 1.918492550394391,
"grad_norm": 0.9698459506034851,
"learning_rate": 6.960284415595407e-06,
"loss": 0.9402,
"step": 1095
},
{
"epoch": 1.927256792287467,
"grad_norm": 0.9467353224754333,
"learning_rate": 6.863333828905929e-06,
"loss": 0.9409,
"step": 1100
},
{
"epoch": 1.9360210341805435,
"grad_norm": 0.965829074382782,
"learning_rate": 6.766709153214541e-06,
"loss": 0.9425,
"step": 1105
},
{
"epoch": 1.9447852760736195,
"grad_norm": 0.9571474194526672,
"learning_rate": 6.670420428175706e-06,
"loss": 0.9405,
"step": 1110
},
{
"epoch": 1.953549517966696,
"grad_norm": 0.9341493248939514,
"learning_rate": 6.574477658537375e-06,
"loss": 0.9145,
"step": 1115
},
{
"epoch": 1.962313759859772,
"grad_norm": 0.9990600943565369,
"learning_rate": 6.4788908131014995e-06,
"loss": 0.952,
"step": 1120
},
{
"epoch": 1.9710780017528484,
"grad_norm": 0.917290210723877,
"learning_rate": 6.383669823688191e-06,
"loss": 0.951,
"step": 1125
},
{
"epoch": 1.9798422436459246,
"grad_norm": 0.9599776864051819,
"learning_rate": 6.288824584103815e-06,
"loss": 0.936,
"step": 1130
},
{
"epoch": 1.9886064855390009,
"grad_norm": 0.9636255502700806,
"learning_rate": 6.194364949112952e-06,
"loss": 0.9582,
"step": 1135
},
{
"epoch": 1.997370727432077,
"grad_norm": 1.1487308740615845,
"learning_rate": 6.100300733414473e-06,
"loss": 0.9276,
"step": 1140
},
{
"epoch": 2.0,
"eval_loss": 1.151129961013794,
"eval_runtime": 199.4284,
"eval_samples_per_second": 9.161,
"eval_steps_per_second": 2.292,
"step": 1142
},
{
"epoch": 2.005258545135846,
"grad_norm": 1.1605374813079834,
"learning_rate": 6.006641710621746e-06,
"loss": 0.8479,
"step": 1145
},
{
"epoch": 2.014022787028922,
"grad_norm": 1.0491231679916382,
"learning_rate": 5.913397612247121e-06,
"loss": 0.8032,
"step": 1150
},
{
"epoch": 2.0227870289219982,
"grad_norm": 1.0855581760406494,
"learning_rate": 5.82057812669081e-06,
"loss": 0.8839,
"step": 1155
},
{
"epoch": 2.0315512708150747,
"grad_norm": 0.9942172169685364,
"learning_rate": 5.728192898234195e-06,
"loss": 0.7986,
"step": 1160
},
{
"epoch": 2.0403155127081507,
"grad_norm": 1.0435779094696045,
"learning_rate": 5.636251526037784e-06,
"loss": 0.8263,
"step": 1165
},
{
"epoch": 2.049079754601227,
"grad_norm": 1.0303524732589722,
"learning_rate": 5.544763563143794e-06,
"loss": 0.8188,
"step": 1170
},
{
"epoch": 2.057843996494303,
"grad_norm": 0.9739100933074951,
"learning_rate": 5.453738515483586e-06,
"loss": 0.8488,
"step": 1175
},
{
"epoch": 2.0666082383873796,
"grad_norm": 1.021791696548462,
"learning_rate": 5.363185840889935e-06,
"loss": 0.8666,
"step": 1180
},
{
"epoch": 2.0753724802804556,
"grad_norm": 0.9683573842048645,
"learning_rate": 5.273114948114346e-06,
"loss": 0.8276,
"step": 1185
},
{
"epoch": 2.084136722173532,
"grad_norm": 1.0052560567855835,
"learning_rate": 5.1835351958494515e-06,
"loss": 0.8089,
"step": 1190
},
{
"epoch": 2.092900964066608,
"grad_norm": 0.9584820866584778,
"learning_rate": 5.094455891756587e-06,
"loss": 0.8276,
"step": 1195
},
{
"epoch": 2.1016652059596845,
"grad_norm": 0.9803566932678223,
"learning_rate": 5.0058862914987204e-06,
"loss": 0.8256,
"step": 1200
},
{
"epoch": 2.1104294478527605,
"grad_norm": 0.9923965334892273,
"learning_rate": 4.917835597778731e-06,
"loss": 0.8241,
"step": 1205
},
{
"epoch": 2.119193689745837,
"grad_norm": 1.022495985031128,
"learning_rate": 4.830312959383238e-06,
"loss": 0.8074,
"step": 1210
},
{
"epoch": 2.127957931638913,
"grad_norm": 0.9760512709617615,
"learning_rate": 4.743327470231982e-06,
"loss": 0.8058,
"step": 1215
},
{
"epoch": 2.1367221735319895,
"grad_norm": 0.9603386521339417,
"learning_rate": 4.656888168432962e-06,
"loss": 0.8133,
"step": 1220
},
{
"epoch": 2.145486415425066,
"grad_norm": 1.034776210784912,
"learning_rate": 4.571004035343315e-06,
"loss": 0.818,
"step": 1225
},
{
"epoch": 2.154250657318142,
"grad_norm": 0.9763988256454468,
"learning_rate": 4.485683994636144e-06,
"loss": 0.8165,
"step": 1230
},
{
"epoch": 2.1630148992112184,
"grad_norm": 0.9729757905006409,
"learning_rate": 4.400936911373308e-06,
"loss": 0.808,
"step": 1235
},
{
"epoch": 2.1717791411042944,
"grad_norm": 1.0068873167037964,
"learning_rate": 4.316771591084297e-06,
"loss": 0.8038,
"step": 1240
},
{
"epoch": 2.180543382997371,
"grad_norm": 0.9344819188117981,
"learning_rate": 4.2331967788513295e-06,
"loss": 0.8335,
"step": 1245
},
{
"epoch": 2.189307624890447,
"grad_norm": 1.0315194129943848,
"learning_rate": 4.150221158400683e-06,
"loss": 0.8154,
"step": 1250
},
{
"epoch": 2.1980718667835233,
"grad_norm": 0.9959366321563721,
"learning_rate": 4.067853351200446e-06,
"loss": 0.8317,
"step": 1255
},
{
"epoch": 2.2068361086765993,
"grad_norm": 1.0919640064239502,
"learning_rate": 3.986101915564695e-06,
"loss": 0.8236,
"step": 1260
},
{
"epoch": 2.2156003505696757,
"grad_norm": 0.9548513293266296,
"learning_rate": 3.904975345764262e-06,
"loss": 0.849,
"step": 1265
},
{
"epoch": 2.224364592462752,
"grad_norm": 0.9864785075187683,
"learning_rate": 3.824482071144164e-06,
"loss": 0.8259,
"step": 1270
},
{
"epoch": 2.233128834355828,
"grad_norm": 1.014013648033142,
"learning_rate": 3.7446304552477387e-06,
"loss": 0.7696,
"step": 1275
},
{
"epoch": 2.2418930762489047,
"grad_norm": 0.95964115858078,
"learning_rate": 3.665428794947663e-06,
"loss": 0.7758,
"step": 1280
},
{
"epoch": 2.2506573181419807,
"grad_norm": 0.9974411725997925,
"learning_rate": 3.5868853195838582e-06,
"loss": 0.8512,
"step": 1285
},
{
"epoch": 2.259421560035057,
"grad_norm": 0.990260124206543,
"learning_rate": 3.509008190108453e-06,
"loss": 0.8096,
"step": 1290
},
{
"epoch": 2.268185801928133,
"grad_norm": 0.982060968875885,
"learning_rate": 3.431805498237808e-06,
"loss": 0.8259,
"step": 1295
},
{
"epoch": 2.2769500438212096,
"grad_norm": 0.9737572073936462,
"learning_rate": 3.355285265611784e-06,
"loss": 0.8368,
"step": 1300
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.9657383561134338,
"learning_rate": 3.2794554429602377e-06,
"loss": 0.8129,
"step": 1305
},
{
"epoch": 2.294478527607362,
"grad_norm": 0.9947619438171387,
"learning_rate": 3.204323909276924e-06,
"loss": 0.8034,
"step": 1310
},
{
"epoch": 2.303242769500438,
"grad_norm": 1.0247445106506348,
"learning_rate": 3.1298984710008483e-06,
"loss": 0.8267,
"step": 1315
},
{
"epoch": 2.3120070113935145,
"grad_norm": 0.9986540079116821,
"learning_rate": 3.056186861205136e-06,
"loss": 0.8233,
"step": 1320
},
{
"epoch": 2.3207712532865905,
"grad_norm": 0.9882351160049438,
"learning_rate": 2.983196738793547e-06,
"loss": 0.8097,
"step": 1325
},
{
"epoch": 2.329535495179667,
"grad_norm": 0.9737289547920227,
"learning_rate": 2.910935687704671e-06,
"loss": 0.8285,
"step": 1330
},
{
"epoch": 2.3382997370727434,
"grad_norm": 0.9512819647789001,
"learning_rate": 2.8394112161239606e-06,
"loss": 0.7998,
"step": 1335
},
{
"epoch": 2.3470639789658194,
"grad_norm": 0.980267345905304,
"learning_rate": 2.7686307557035684e-06,
"loss": 0.8364,
"step": 1340
},
{
"epoch": 2.355828220858896,
"grad_norm": 0.9798904061317444,
"learning_rate": 2.698601660790191e-06,
"loss": 0.8288,
"step": 1345
},
{
"epoch": 2.364592462751972,
"grad_norm": 0.9910169839859009,
"learning_rate": 2.629331207660931e-06,
"loss": 0.8182,
"step": 1350
},
{
"epoch": 2.3733567046450483,
"grad_norm": 1.0095982551574707,
"learning_rate": 2.560826593767244e-06,
"loss": 0.8651,
"step": 1355
},
{
"epoch": 2.3821209465381243,
"grad_norm": 1.0415823459625244,
"learning_rate": 2.4930949369871205e-06,
"loss": 0.7934,
"step": 1360
},
{
"epoch": 2.390885188431201,
"grad_norm": 0.9959484934806824,
"learning_rate": 2.426143274885493e-06,
"loss": 0.8131,
"step": 1365
},
{
"epoch": 2.399649430324277,
"grad_norm": 0.9777078628540039,
"learning_rate": 2.359978563983022e-06,
"loss": 0.8125,
"step": 1370
},
{
"epoch": 2.4084136722173533,
"grad_norm": 1.0206762552261353,
"learning_rate": 2.294607679033283e-06,
"loss": 0.7912,
"step": 1375
},
{
"epoch": 2.4171779141104293,
"grad_norm": 0.9738752245903015,
"learning_rate": 2.230037412308452e-06,
"loss": 0.8411,
"step": 1380
},
{
"epoch": 2.4259421560035057,
"grad_norm": 0.9954826831817627,
"learning_rate": 2.166274472893567e-06,
"loss": 0.8052,
"step": 1385
},
{
"epoch": 2.4347063978965817,
"grad_norm": 0.9861373901367188,
"learning_rate": 2.1033254859894224e-06,
"loss": 0.8041,
"step": 1390
},
{
"epoch": 2.443470639789658,
"grad_norm": 0.9600276947021484,
"learning_rate": 2.041196992224206e-06,
"loss": 0.8326,
"step": 1395
},
{
"epoch": 2.4522348816827346,
"grad_norm": 1.127557396888733,
"learning_rate": 1.9798954469738762e-06,
"loss": 0.8355,
"step": 1400
},
{
"epoch": 2.4609991235758106,
"grad_norm": 0.9988298416137695,
"learning_rate": 1.9194272196914533e-06,
"loss": 0.8473,
"step": 1405
},
{
"epoch": 2.469763365468887,
"grad_norm": 0.972212553024292,
"learning_rate": 1.8597985932451856e-06,
"loss": 0.816,
"step": 1410
},
{
"epoch": 2.478527607361963,
"grad_norm": 0.9716165065765381,
"learning_rate": 1.8010157632657544e-06,
"loss": 0.8157,
"step": 1415
},
{
"epoch": 2.4872918492550395,
"grad_norm": 0.9722920656204224,
"learning_rate": 1.7430848375025178e-06,
"loss": 0.8238,
"step": 1420
},
{
"epoch": 2.4960560911481156,
"grad_norm": 1.0044946670532227,
"learning_rate": 1.686011835188891e-06,
"loss": 0.8473,
"step": 1425
},
{
"epoch": 2.504820333041192,
"grad_norm": 0.9682095050811768,
"learning_rate": 1.6298026864169336e-06,
"loss": 0.8132,
"step": 1430
},
{
"epoch": 2.513584574934268,
"grad_norm": 0.9928619861602783,
"learning_rate": 1.5744632315211815e-06,
"loss": 0.837,
"step": 1435
},
{
"epoch": 2.5223488168273445,
"grad_norm": 0.9613544344902039,
"learning_rate": 1.5199992204718295e-06,
"loss": 0.8209,
"step": 1440
},
{
"epoch": 2.531113058720421,
"grad_norm": 1.0032097101211548,
"learning_rate": 1.466416312277269e-06,
"loss": 0.8303,
"step": 1445
},
{
"epoch": 2.539877300613497,
"grad_norm": 0.9630649089813232,
"learning_rate": 1.4137200743961189e-06,
"loss": 0.825,
"step": 1450
},
{
"epoch": 2.548641542506573,
"grad_norm": 0.9702491164207458,
"learning_rate": 1.3619159821587236e-06,
"loss": 0.8148,
"step": 1455
},
{
"epoch": 2.5574057843996494,
"grad_norm": 0.9509206414222717,
"learning_rate": 1.3110094181982657e-06,
"loss": 0.7695,
"step": 1460
},
{
"epoch": 2.566170026292726,
"grad_norm": 0.9589338302612305,
"learning_rate": 1.261005671891482e-06,
"loss": 0.8532,
"step": 1465
},
{
"epoch": 2.574934268185802,
"grad_norm": 0.9704285264015198,
"learning_rate": 1.2119099388090715e-06,
"loss": 0.797,
"step": 1470
},
{
"epoch": 2.5836985100788783,
"grad_norm": 1.0093833208084106,
"learning_rate": 1.1637273201758747e-06,
"loss": 0.8233,
"step": 1475
},
{
"epoch": 2.5924627519719543,
"grad_norm": 0.9612089991569519,
"learning_rate": 1.1164628223408169e-06,
"loss": 0.8489,
"step": 1480
},
{
"epoch": 2.6012269938650308,
"grad_norm": 0.9347235560417175,
"learning_rate": 1.0701213562567491e-06,
"loss": 0.7855,
"step": 1485
},
{
"epoch": 2.6099912357581068,
"grad_norm": 1.00240957736969,
"learning_rate": 1.0247077369701653e-06,
"loss": 0.8322,
"step": 1490
},
{
"epoch": 2.618755477651183,
"grad_norm": 0.99866783618927,
"learning_rate": 9.802266831209206e-07,
"loss": 0.8133,
"step": 1495
},
{
"epoch": 2.6275197195442592,
"grad_norm": 1.0041725635528564,
"learning_rate": 9.36682816451926e-07,
"loss": 0.8715,
"step": 1500
},
{
"epoch": 2.6362839614373357,
"grad_norm": 0.9615875482559204,
"learning_rate": 8.940806613289499e-07,
"loss": 0.8075,
"step": 1505
},
{
"epoch": 2.645048203330412,
"grad_norm": 0.9449265003204346,
"learning_rate": 8.524246442705153e-07,
"loss": 0.7974,
"step": 1510
},
{
"epoch": 2.653812445223488,
"grad_norm": 0.9578828811645508,
"learning_rate": 8.117190934879593e-07,
"loss": 0.8175,
"step": 1515
},
{
"epoch": 2.662576687116564,
"grad_norm": 0.9990285038948059,
"learning_rate": 7.719682384357308e-07,
"loss": 0.8147,
"step": 1520
},
{
"epoch": 2.6713409290096406,
"grad_norm": 0.9652912616729736,
"learning_rate": 7.33176209371923e-07,
"loss": 0.8429,
"step": 1525
},
{
"epoch": 2.680105170902717,
"grad_norm": 0.9373207092285156,
"learning_rate": 6.953470369291349e-07,
"loss": 0.825,
"step": 1530
},
{
"epoch": 2.688869412795793,
"grad_norm": 0.9682218432426453,
"learning_rate": 6.5848465169566e-07,
"loss": 0.7916,
"step": 1535
},
{
"epoch": 2.6976336546888695,
"grad_norm": 0.995035707950592,
"learning_rate": 6.225928838071016e-07,
"loss": 0.829,
"step": 1540
},
{
"epoch": 2.7063978965819455,
"grad_norm": 0.9676108956336975,
"learning_rate": 5.876754625483904e-07,
"loss": 0.8497,
"step": 1545
},
{
"epoch": 2.715162138475022,
"grad_norm": 0.9674281477928162,
"learning_rate": 5.537360159663107e-07,
"loss": 0.8126,
"step": 1550
},
{
"epoch": 2.7239263803680984,
"grad_norm": 0.9768509864807129,
"learning_rate": 5.207780704925314e-07,
"loss": 0.8432,
"step": 1555
},
{
"epoch": 2.7326906222611744,
"grad_norm": 0.9932735562324524,
"learning_rate": 4.888050505771869e-07,
"loss": 0.8293,
"step": 1560
},
{
"epoch": 2.7414548641542504,
"grad_norm": 0.9800174832344055,
"learning_rate": 4.5782027833307983e-07,
"loss": 0.7843,
"step": 1565
},
{
"epoch": 2.750219106047327,
"grad_norm": 0.9393450021743774,
"learning_rate": 4.2782697319048603e-07,
"loss": 0.8016,
"step": 1570
},
{
"epoch": 2.7589833479404033,
"grad_norm": 0.9714465737342834,
"learning_rate": 3.9882825156265846e-07,
"loss": 0.8264,
"step": 1575
},
{
"epoch": 2.7677475898334793,
"grad_norm": 0.975568950176239,
"learning_rate": 3.708271265220087e-07,
"loss": 0.802,
"step": 1580
},
{
"epoch": 2.776511831726556,
"grad_norm": 0.9788158535957336,
"learning_rate": 3.4382650748704173e-07,
"loss": 0.8374,
"step": 1585
},
{
"epoch": 2.785276073619632,
"grad_norm": 0.9417116641998291,
"learning_rate": 3.178291999200633e-07,
"loss": 0.8181,
"step": 1590
},
{
"epoch": 2.7940403155127083,
"grad_norm": 0.9802819490432739,
"learning_rate": 2.928379050356722e-07,
"loss": 0.8208,
"step": 1595
},
{
"epoch": 2.8028045574057843,
"grad_norm": 0.9727985858917236,
"learning_rate": 2.6885521952010105e-07,
"loss": 0.7862,
"step": 1600
},
{
"epoch": 2.8115687992988607,
"grad_norm": 0.9225666522979736,
"learning_rate": 2.458836352614069e-07,
"loss": 0.7791,
"step": 1605
},
{
"epoch": 2.8203330411919367,
"grad_norm": 1.038718342781067,
"learning_rate": 2.2392553909055813e-07,
"loss": 0.8164,
"step": 1610
},
{
"epoch": 2.829097283085013,
"grad_norm": 0.945773184299469,
"learning_rate": 2.029832125334319e-07,
"loss": 0.8277,
"step": 1615
},
{
"epoch": 2.8378615249780896,
"grad_norm": 0.9560094475746155,
"learning_rate": 1.8305883157375804e-07,
"loss": 0.7974,
"step": 1620
},
{
"epoch": 2.8466257668711656,
"grad_norm": 0.9896951913833618,
"learning_rate": 1.6415446642702337e-07,
"loss": 0.8084,
"step": 1625
},
{
"epoch": 2.8553900087642416,
"grad_norm": 0.9845879077911377,
"learning_rate": 1.4627208132536818e-07,
"loss": 0.8216,
"step": 1630
},
{
"epoch": 2.864154250657318,
"grad_norm": 0.9730380177497864,
"learning_rate": 1.2941353431350058e-07,
"loss": 0.7997,
"step": 1635
},
{
"epoch": 2.8729184925503946,
"grad_norm": 0.9810739159584045,
"learning_rate": 1.1358057705563641e-07,
"loss": 0.8212,
"step": 1640
},
{
"epoch": 2.8816827344434706,
"grad_norm": 0.9314019083976746,
"learning_rate": 9.877485465349057e-08,
"loss": 0.7794,
"step": 1645
},
{
"epoch": 2.890446976336547,
"grad_norm": 0.9651502966880798,
"learning_rate": 8.499790547535025e-08,
"loss": 0.8138,
"step": 1650
},
{
"epoch": 2.899211218229623,
"grad_norm": 0.966038167476654,
"learning_rate": 7.225116099623287e-08,
"loss": 0.8212,
"step": 1655
},
{
"epoch": 2.9079754601226995,
"grad_norm": 0.9493021965026855,
"learning_rate": 6.053594564914611e-08,
"loss": 0.832,
"step": 1660
},
{
"epoch": 2.9167397020157755,
"grad_norm": 0.9688047766685486,
"learning_rate": 4.985347668747809e-08,
"loss": 0.8239,
"step": 1665
},
{
"epoch": 2.925503943908852,
"grad_norm": 0.9778699278831482,
"learning_rate": 4.020486405852286e-08,
"loss": 0.7976,
"step": 1670
},
{
"epoch": 2.934268185801928,
"grad_norm": 0.9479379653930664,
"learning_rate": 3.15911102881461e-08,
"loss": 0.8375,
"step": 1675
},
{
"epoch": 2.9430324276950044,
"grad_norm": 1.0030702352523804,
"learning_rate": 2.4013110376623906e-08,
"loss": 0.8225,
"step": 1680
},
{
"epoch": 2.951796669588081,
"grad_norm": 1.0119658708572388,
"learning_rate": 1.747165170564724e-08,
"loss": 0.8276,
"step": 1685
},
{
"epoch": 2.960560911481157,
"grad_norm": 0.9981706738471985,
"learning_rate": 1.1967413956510687e-08,
"loss": 0.8661,
"step": 1690
},
{
"epoch": 2.969325153374233,
"grad_norm": 0.9298052787780762,
"learning_rate": 7.500969039491156e-09,
"loss": 0.8439,
"step": 1695
},
{
"epoch": 2.9780893952673093,
"grad_norm": 0.9936395287513733,
"learning_rate": 4.072781034425432e-09,
"loss": 0.8221,
"step": 1700
},
{
"epoch": 2.9868536371603858,
"grad_norm": 1.0040860176086426,
"learning_rate": 1.6832061424865155e-09,
"loss": 0.818,
"step": 1705
},
{
"epoch": 2.9956178790534618,
"grad_norm": 0.9950876235961914,
"learning_rate": 3.324926491787839e-10,
"loss": 0.8279,
"step": 1710
},
{
"epoch": 3.0,
"eval_loss": 1.1766911745071411,
"eval_runtime": 199.3045,
"eval_samples_per_second": 9.167,
"eval_steps_per_second": 2.293,
"step": 1713
},
{
"epoch": 3.0,
"step": 1713,
"total_flos": 90953314467840.0,
"train_loss": 0.9790556504583052,
"train_runtime": 12705.6497,
"train_samples_per_second": 8.618,
"train_steps_per_second": 0.135
}
],
"logging_steps": 5,
"max_steps": 1713,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 90953314467840.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}