{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1713, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008764241893076249, "grad_norm": 5.367548942565918, "learning_rate": 4.651162790697675e-07, "loss": 1.491, "step": 5 }, { "epoch": 0.017528483786152498, "grad_norm": 3.861912488937378, "learning_rate": 1.0465116279069768e-06, "loss": 1.4607, "step": 10 }, { "epoch": 0.026292725679228746, "grad_norm": 2.542222738265991, "learning_rate": 1.6279069767441862e-06, "loss": 1.4704, "step": 15 }, { "epoch": 0.035056967572304996, "grad_norm": 2.1328587532043457, "learning_rate": 2.2093023255813954e-06, "loss": 1.4085, "step": 20 }, { "epoch": 0.04382120946538125, "grad_norm": 1.5579408407211304, "learning_rate": 2.790697674418605e-06, "loss": 1.3603, "step": 25 }, { "epoch": 0.05258545135845749, "grad_norm": 1.6026604175567627, "learning_rate": 3.372093023255814e-06, "loss": 1.3568, "step": 30 }, { "epoch": 0.06134969325153374, "grad_norm": 1.5183006525039673, "learning_rate": 3.953488372093024e-06, "loss": 1.3702, "step": 35 }, { "epoch": 0.07011393514460999, "grad_norm": 1.416035532951355, "learning_rate": 4.5348837209302326e-06, "loss": 1.3288, "step": 40 }, { "epoch": 0.07887817703768624, "grad_norm": 1.4895626306533813, "learning_rate": 5.116279069767442e-06, "loss": 1.3292, "step": 45 }, { "epoch": 0.0876424189307625, "grad_norm": 1.3430354595184326, "learning_rate": 5.697674418604652e-06, "loss": 1.3227, "step": 50 }, { "epoch": 0.09640666082383874, "grad_norm": 1.4117517471313477, "learning_rate": 6.279069767441861e-06, "loss": 1.2902, "step": 55 }, { "epoch": 0.10517090271691498, "grad_norm": 1.3359665870666504, "learning_rate": 6.86046511627907e-06, "loss": 1.3327, "step": 60 }, { "epoch": 0.11393514460999124, "grad_norm": 1.4718199968338013, "learning_rate": 7.44186046511628e-06, "loss": 1.2973, "step": 65 }, { "epoch": 0.12269938650306748, "grad_norm": 1.2470380067825317, "learning_rate": 8.023255813953488e-06, "loss": 1.2706, "step": 70 }, { "epoch": 0.13146362839614373, "grad_norm": 1.324803352355957, "learning_rate": 8.604651162790698e-06, "loss": 1.2178, "step": 75 }, { "epoch": 0.14022787028921999, "grad_norm": 1.3574628829956055, "learning_rate": 9.186046511627908e-06, "loss": 1.2316, "step": 80 }, { "epoch": 0.14899211218229624, "grad_norm": 1.3636841773986816, "learning_rate": 9.767441860465117e-06, "loss": 1.283, "step": 85 }, { "epoch": 0.15775635407537247, "grad_norm": 1.7021708488464355, "learning_rate": 1.0348837209302327e-05, "loss": 1.2635, "step": 90 }, { "epoch": 0.16652059596844873, "grad_norm": 1.243608832359314, "learning_rate": 1.0930232558139535e-05, "loss": 1.2079, "step": 95 }, { "epoch": 0.175284837861525, "grad_norm": 1.8144162893295288, "learning_rate": 1.1511627906976746e-05, "loss": 1.2186, "step": 100 }, { "epoch": 0.18404907975460122, "grad_norm": 1.1823457479476929, "learning_rate": 1.2093023255813954e-05, "loss": 1.2103, "step": 105 }, { "epoch": 0.19281332164767748, "grad_norm": 1.198132872581482, "learning_rate": 1.2674418604651164e-05, "loss": 1.2044, "step": 110 }, { "epoch": 0.20157756354075373, "grad_norm": 11.093875885009766, "learning_rate": 1.3255813953488372e-05, "loss": 1.1683, "step": 115 }, { "epoch": 0.21034180543382996, "grad_norm": 1.0984971523284912, "learning_rate": 1.3837209302325583e-05, "loss": 1.2289, "step": 120 }, { "epoch": 0.21910604732690622, "grad_norm": 1.2427825927734375, "learning_rate": 1.441860465116279e-05, "loss": 1.1449, "step": 125 }, { "epoch": 0.22787028921998248, "grad_norm": 1.2608261108398438, "learning_rate": 1.5000000000000002e-05, "loss": 1.1524, "step": 130 }, { "epoch": 0.2366345311130587, "grad_norm": 1.114823818206787, "learning_rate": 1.558139534883721e-05, "loss": 1.1769, "step": 135 }, { "epoch": 0.24539877300613497, "grad_norm": 1.1239306926727295, "learning_rate": 1.616279069767442e-05, "loss": 1.1516, "step": 140 }, { "epoch": 0.2541630148992112, "grad_norm": 1.1203042268753052, "learning_rate": 1.674418604651163e-05, "loss": 1.1243, "step": 145 }, { "epoch": 0.26292725679228746, "grad_norm": 1.0693674087524414, "learning_rate": 1.7325581395348837e-05, "loss": 1.1574, "step": 150 }, { "epoch": 0.27169149868536374, "grad_norm": 1.1013996601104736, "learning_rate": 1.790697674418605e-05, "loss": 1.1621, "step": 155 }, { "epoch": 0.28045574057843997, "grad_norm": 1.1914992332458496, "learning_rate": 1.8488372093023256e-05, "loss": 1.1468, "step": 160 }, { "epoch": 0.2892199824715162, "grad_norm": 1.1144826412200928, "learning_rate": 1.9069767441860468e-05, "loss": 1.153, "step": 165 }, { "epoch": 0.2979842243645925, "grad_norm": 1.1576107740402222, "learning_rate": 1.9651162790697676e-05, "loss": 1.151, "step": 170 }, { "epoch": 0.3067484662576687, "grad_norm": 1.037223219871521, "learning_rate": 1.999991687649223e-05, "loss": 1.1386, "step": 175 }, { "epoch": 0.31551270815074495, "grad_norm": 1.1823593378067017, "learning_rate": 1.999898175290004e-05, "loss": 1.1368, "step": 180 }, { "epoch": 0.32427695004382123, "grad_norm": 1.0528627634048462, "learning_rate": 1.9997007698817558e-05, "loss": 1.183, "step": 185 }, { "epoch": 0.33304119193689746, "grad_norm": 1.1595643758773804, "learning_rate": 1.9993994919356167e-05, "loss": 1.1687, "step": 190 }, { "epoch": 0.3418054338299737, "grad_norm": 1.0715525150299072, "learning_rate": 1.9989943727554597e-05, "loss": 1.1648, "step": 195 }, { "epoch": 0.35056967572305, "grad_norm": 1.0216760635375977, "learning_rate": 1.9984854544346367e-05, "loss": 1.1587, "step": 200 }, { "epoch": 0.3593339176161262, "grad_norm": 1.0617367029190063, "learning_rate": 1.9978727898516087e-05, "loss": 1.145, "step": 205 }, { "epoch": 0.36809815950920244, "grad_norm": 1.0882047414779663, "learning_rate": 1.997156442664449e-05, "loss": 1.1652, "step": 210 }, { "epoch": 0.3768624014022787, "grad_norm": 1.154795527458191, "learning_rate": 1.9963364873042298e-05, "loss": 1.135, "step": 215 }, { "epoch": 0.38562664329535495, "grad_norm": 1.0904324054718018, "learning_rate": 1.9954130089672893e-05, "loss": 1.1262, "step": 220 }, { "epoch": 0.3943908851884312, "grad_norm": 1.0021111965179443, "learning_rate": 1.994386103606377e-05, "loss": 1.1422, "step": 225 }, { "epoch": 0.40315512708150747, "grad_norm": 1.0234806537628174, "learning_rate": 1.9932558779206873e-05, "loss": 1.1315, "step": 230 }, { "epoch": 0.4119193689745837, "grad_norm": 1.10641610622406, "learning_rate": 1.9920224493447702e-05, "loss": 1.1824, "step": 235 }, { "epoch": 0.42068361086765993, "grad_norm": 1.0238338708877563, "learning_rate": 1.9906859460363307e-05, "loss": 1.1442, "step": 240 }, { "epoch": 0.4294478527607362, "grad_norm": 0.990487813949585, "learning_rate": 1.989246506862913e-05, "loss": 1.1276, "step": 245 }, { "epoch": 0.43821209465381245, "grad_norm": 20.3802490234375, "learning_rate": 1.9877042813874712e-05, "loss": 1.1744, "step": 250 }, { "epoch": 0.4469763365468887, "grad_norm": 1.1273601055145264, "learning_rate": 1.9860594298528283e-05, "loss": 1.1774, "step": 255 }, { "epoch": 0.45574057843996496, "grad_norm": 1.014085292816162, "learning_rate": 1.984312123165028e-05, "loss": 1.162, "step": 260 }, { "epoch": 0.4645048203330412, "grad_norm": 1.0875205993652344, "learning_rate": 1.982462542875576e-05, "loss": 1.1485, "step": 265 }, { "epoch": 0.4732690622261174, "grad_norm": 1.0361530780792236, "learning_rate": 1.9805108811625774e-05, "loss": 1.1422, "step": 270 }, { "epoch": 0.4820333041191937, "grad_norm": 1.0539902448654175, "learning_rate": 1.9784573408107657e-05, "loss": 1.0915, "step": 275 }, { "epoch": 0.49079754601226994, "grad_norm": 1.05149245262146, "learning_rate": 1.976302135190436e-05, "loss": 1.1372, "step": 280 }, { "epoch": 0.49956178790534617, "grad_norm": 1.0928102731704712, "learning_rate": 1.9740454882352733e-05, "loss": 1.1239, "step": 285 }, { "epoch": 0.5083260297984225, "grad_norm": 1.0785322189331055, "learning_rate": 1.971687634419086e-05, "loss": 1.1429, "step": 290 }, { "epoch": 0.5170902716914987, "grad_norm": 1.020357370376587, "learning_rate": 1.9692288187314423e-05, "loss": 1.1195, "step": 295 }, { "epoch": 0.5258545135845749, "grad_norm": 0.9896298050880432, "learning_rate": 1.9666692966522144e-05, "loss": 1.1217, "step": 300 }, { "epoch": 0.5346187554776511, "grad_norm": 0.9637587070465088, "learning_rate": 1.9640093341250356e-05, "loss": 1.1082, "step": 305 }, { "epoch": 0.5433829973707275, "grad_norm": 1.2339686155319214, "learning_rate": 1.961249207529665e-05, "loss": 1.1459, "step": 310 }, { "epoch": 0.5521472392638037, "grad_norm": 1.0626837015151978, "learning_rate": 1.9583892036532726e-05, "loss": 1.1257, "step": 315 }, { "epoch": 0.5609114811568799, "grad_norm": 1.0179359912872314, "learning_rate": 1.9554296196606395e-05, "loss": 1.1111, "step": 320 }, { "epoch": 0.5696757230499562, "grad_norm": 1.0226428508758545, "learning_rate": 1.9523707630632834e-05, "loss": 1.1673, "step": 325 }, { "epoch": 0.5784399649430324, "grad_norm": 1.0737133026123047, "learning_rate": 1.9492129516875055e-05, "loss": 1.1325, "step": 330 }, { "epoch": 0.5872042068361086, "grad_norm": 1.0531032085418701, "learning_rate": 1.9459565136413667e-05, "loss": 1.1478, "step": 335 }, { "epoch": 0.595968448729185, "grad_norm": 1.0400668382644653, "learning_rate": 1.942601787280598e-05, "loss": 1.1403, "step": 340 }, { "epoch": 0.6047326906222612, "grad_norm": 0.9359525442123413, "learning_rate": 1.9391491211734426e-05, "loss": 1.1298, "step": 345 }, { "epoch": 0.6134969325153374, "grad_norm": 3.9531524181365967, "learning_rate": 1.935598874064438e-05, "loss": 1.1923, "step": 350 }, { "epoch": 0.6222611744084137, "grad_norm": 1.0364443063735962, "learning_rate": 1.9319514148371436e-05, "loss": 1.1096, "step": 355 }, { "epoch": 0.6310254163014899, "grad_norm": 1.0656158924102783, "learning_rate": 1.9282071224758092e-05, "loss": 1.1282, "step": 360 }, { "epoch": 0.6397896581945661, "grad_norm": 1.0614289045333862, "learning_rate": 1.9243663860259992e-05, "loss": 1.1137, "step": 365 }, { "epoch": 0.6485539000876425, "grad_norm": 1.002898931503296, "learning_rate": 1.9204296045541686e-05, "loss": 1.1091, "step": 370 }, { "epoch": 0.6573181419807187, "grad_norm": 1.0451066493988037, "learning_rate": 1.916397187106199e-05, "loss": 1.0919, "step": 375 }, { "epoch": 0.6660823838737949, "grad_norm": 1.192143201828003, "learning_rate": 1.9122695526648968e-05, "loss": 1.1581, "step": 380 }, { "epoch": 0.6748466257668712, "grad_norm": 1.0061026811599731, "learning_rate": 1.90804713010646e-05, "loss": 1.116, "step": 385 }, { "epoch": 0.6836108676599474, "grad_norm": 2.3462023735046387, "learning_rate": 1.9037303581559143e-05, "loss": 1.1323, "step": 390 }, { "epoch": 0.6923751095530236, "grad_norm": 0.9700145125389099, "learning_rate": 1.899319685341532e-05, "loss": 1.1075, "step": 395 }, { "epoch": 0.7011393514461, "grad_norm": 0.9761490821838379, "learning_rate": 1.8948155699482243e-05, "loss": 1.1291, "step": 400 }, { "epoch": 0.7099035933391762, "grad_norm": 1.0112907886505127, "learning_rate": 1.8902184799699265e-05, "loss": 1.1087, "step": 405 }, { "epoch": 0.7186678352322524, "grad_norm": 0.9741994738578796, "learning_rate": 1.885528893060969e-05, "loss": 1.1181, "step": 410 }, { "epoch": 0.7274320771253286, "grad_norm": 0.9536153078079224, "learning_rate": 1.8807472964864516e-05, "loss": 1.114, "step": 415 }, { "epoch": 0.7361963190184049, "grad_norm": 0.9664406180381775, "learning_rate": 1.8758741870716093e-05, "loss": 1.1474, "step": 420 }, { "epoch": 0.7449605609114811, "grad_norm": 0.999437689781189, "learning_rate": 1.8709100711501957e-05, "loss": 1.1067, "step": 425 }, { "epoch": 0.7537248028045574, "grad_norm": 1.0034998655319214, "learning_rate": 1.865855464511869e-05, "loss": 1.1409, "step": 430 }, { "epoch": 0.7624890446976337, "grad_norm": 1.0300318002700806, "learning_rate": 1.8607108923486025e-05, "loss": 1.1289, "step": 435 }, { "epoch": 0.7712532865907099, "grad_norm": 0.9994638562202454, "learning_rate": 1.8554768892001137e-05, "loss": 1.1093, "step": 440 }, { "epoch": 0.7800175284837861, "grad_norm": 0.9193391799926758, "learning_rate": 1.8501539988983234e-05, "loss": 1.1377, "step": 445 }, { "epoch": 0.7887817703768624, "grad_norm": 1.0165811777114868, "learning_rate": 1.844742774510851e-05, "loss": 1.1204, "step": 450 }, { "epoch": 0.7975460122699386, "grad_norm": 0.9755986928939819, "learning_rate": 1.8392437782835475e-05, "loss": 1.0935, "step": 455 }, { "epoch": 0.8063102541630149, "grad_norm": 0.977584183216095, "learning_rate": 1.8336575815820764e-05, "loss": 1.1064, "step": 460 }, { "epoch": 0.8150744960560912, "grad_norm": 0.9432125687599182, "learning_rate": 1.8279847648325478e-05, "loss": 1.099, "step": 465 }, { "epoch": 0.8238387379491674, "grad_norm": 1.0756127834320068, "learning_rate": 1.822225917461208e-05, "loss": 1.0926, "step": 470 }, { "epoch": 0.8326029798422436, "grad_norm": 1.0018426179885864, "learning_rate": 1.8163816378331983e-05, "loss": 1.1292, "step": 475 }, { "epoch": 0.8413672217353199, "grad_norm": 1.0097193717956543, "learning_rate": 1.81045253319038e-05, "loss": 1.0738, "step": 480 }, { "epoch": 0.8501314636283961, "grad_norm": 0.9783721566200256, "learning_rate": 1.8044392195882428e-05, "loss": 1.1059, "step": 485 }, { "epoch": 0.8588957055214724, "grad_norm": 0.9834737181663513, "learning_rate": 1.7983423218318918e-05, "loss": 1.1063, "step": 490 }, { "epoch": 0.8676599474145487, "grad_norm": 1.0263484716415405, "learning_rate": 1.7921624734111292e-05, "loss": 1.1325, "step": 495 }, { "epoch": 0.8764241893076249, "grad_norm": 0.9454247951507568, "learning_rate": 1.7859003164346334e-05, "loss": 1.0937, "step": 500 }, { "epoch": 0.8851884312007011, "grad_norm": 1.006463646888733, "learning_rate": 1.779556501563239e-05, "loss": 1.0511, "step": 505 }, { "epoch": 0.8939526730937774, "grad_norm": 6.430685043334961, "learning_rate": 1.773131687942333e-05, "loss": 1.0899, "step": 510 }, { "epoch": 0.9027169149868537, "grad_norm": 1.3062087297439575, "learning_rate": 1.7666265431333654e-05, "loss": 1.1047, "step": 515 }, { "epoch": 0.9114811568799299, "grad_norm": 1.0522316694259644, "learning_rate": 1.76004174304449e-05, "loss": 1.1009, "step": 520 }, { "epoch": 0.9202453987730062, "grad_norm": 0.9894193410873413, "learning_rate": 1.7533779718603315e-05, "loss": 1.0761, "step": 525 }, { "epoch": 0.9290096406660824, "grad_norm": 1.0116757154464722, "learning_rate": 1.7466359219708987e-05, "loss": 1.1305, "step": 530 }, { "epoch": 0.9377738825591586, "grad_norm": 0.962745726108551, "learning_rate": 1.739816293899642e-05, "loss": 1.0758, "step": 535 }, { "epoch": 0.9465381244522348, "grad_norm": 0.9733975529670715, "learning_rate": 1.7329197962306666e-05, "loss": 1.0752, "step": 540 }, { "epoch": 0.9553023663453112, "grad_norm": 1.026983618736267, "learning_rate": 1.7259471455351072e-05, "loss": 1.0576, "step": 545 }, { "epoch": 0.9640666082383874, "grad_norm": 0.9675541520118713, "learning_rate": 1.718899066296675e-05, "loss": 1.0759, "step": 550 }, { "epoch": 0.9728308501314636, "grad_norm": 0.9842016100883484, "learning_rate": 1.71177629083638e-05, "loss": 1.0704, "step": 555 }, { "epoch": 0.9815950920245399, "grad_norm": 0.9556295871734619, "learning_rate": 1.7045795592364413e-05, "loss": 1.1343, "step": 560 }, { "epoch": 0.9903593339176161, "grad_norm": 1.033588171005249, "learning_rate": 1.6973096192633884e-05, "loss": 1.0947, "step": 565 }, { "epoch": 0.9991235758106923, "grad_norm": 1.971771240234375, "learning_rate": 1.6899672262903675e-05, "loss": 1.1293, "step": 570 }, { "epoch": 1.0, "eval_loss": 1.1597641706466675, "eval_runtime": 199.4031, "eval_samples_per_second": 9.162, "eval_steps_per_second": 2.292, "step": 571 }, { "epoch": 1.007011393514461, "grad_norm": 1.0958155393600464, "learning_rate": 1.6825531432186545e-05, "loss": 1.0193, "step": 575 }, { "epoch": 1.0157756354075373, "grad_norm": 1.108912467956543, "learning_rate": 1.6750681403983847e-05, "loss": 0.9767, "step": 580 }, { "epoch": 1.0245398773006136, "grad_norm": 1.0128231048583984, "learning_rate": 1.6675129955485154e-05, "loss": 0.9534, "step": 585 }, { "epoch": 1.0333041191936898, "grad_norm": 0.9520951509475708, "learning_rate": 1.659888493676013e-05, "loss": 0.9388, "step": 590 }, { "epoch": 1.042068361086766, "grad_norm": 0.98039710521698, "learning_rate": 1.652195426994292e-05, "loss": 0.97, "step": 595 }, { "epoch": 1.0508326029798423, "grad_norm": 1.0683668851852417, "learning_rate": 1.6444345948408985e-05, "loss": 0.9539, "step": 600 }, { "epoch": 1.0595968448729185, "grad_norm": 1.0525304079055786, "learning_rate": 1.636606803594457e-05, "loss": 0.9534, "step": 605 }, { "epoch": 1.0683610867659947, "grad_norm": 0.999165415763855, "learning_rate": 1.628712866590885e-05, "loss": 0.9634, "step": 610 }, { "epoch": 1.077125328659071, "grad_norm": 0.9724875688552856, "learning_rate": 1.6207536040388844e-05, "loss": 0.9559, "step": 615 }, { "epoch": 1.0858895705521472, "grad_norm": 0.9775224924087524, "learning_rate": 1.612729842934718e-05, "loss": 0.9771, "step": 620 }, { "epoch": 1.0946538124452234, "grad_norm": 1.2882146835327148, "learning_rate": 1.604642416976283e-05, "loss": 0.9027, "step": 625 }, { "epoch": 1.1034180543382996, "grad_norm": 1.0088601112365723, "learning_rate": 1.596492166476485e-05, "loss": 0.9494, "step": 630 }, { "epoch": 1.112182296231376, "grad_norm": 1.0667091608047485, "learning_rate": 1.588279938275929e-05, "loss": 0.9493, "step": 635 }, { "epoch": 1.1209465381244523, "grad_norm": 1.0000181198120117, "learning_rate": 1.580006585654927e-05, "loss": 0.9609, "step": 640 }, { "epoch": 1.1297107800175286, "grad_norm": 0.9999110102653503, "learning_rate": 1.5716729682448392e-05, "loss": 1.0068, "step": 645 }, { "epoch": 1.1384750219106048, "grad_norm": 1.0657200813293457, "learning_rate": 1.563279951938758e-05, "loss": 0.9676, "step": 650 }, { "epoch": 1.147239263803681, "grad_norm": 1.029891848564148, "learning_rate": 1.5548284088015354e-05, "loss": 0.9623, "step": 655 }, { "epoch": 1.1560035056967572, "grad_norm": 1.015758752822876, "learning_rate": 1.546319216979174e-05, "loss": 0.9897, "step": 660 }, { "epoch": 1.1647677475898335, "grad_norm": 0.9652720093727112, "learning_rate": 1.537753260607584e-05, "loss": 0.9607, "step": 665 }, { "epoch": 1.1735319894829097, "grad_norm": 1.0845791101455688, "learning_rate": 1.5291314297207177e-05, "loss": 0.9783, "step": 670 }, { "epoch": 1.182296231375986, "grad_norm": 1.0521138906478882, "learning_rate": 1.520454620158093e-05, "loss": 0.9836, "step": 675 }, { "epoch": 1.1910604732690622, "grad_norm": 0.9807194471359253, "learning_rate": 1.5117237334717117e-05, "loss": 0.9443, "step": 680 }, { "epoch": 1.1998247151621384, "grad_norm": 1.0408189296722412, "learning_rate": 1.5029396768323847e-05, "loss": 0.9755, "step": 685 }, { "epoch": 1.2085889570552146, "grad_norm": 1.0140528678894043, "learning_rate": 1.4941033629354735e-05, "loss": 0.942, "step": 690 }, { "epoch": 1.2173531989482909, "grad_norm": 1.028287649154663, "learning_rate": 1.4852157099060595e-05, "loss": 0.9362, "step": 695 }, { "epoch": 1.2261174408413673, "grad_norm": 0.9807888269424438, "learning_rate": 1.4762776412035455e-05, "loss": 0.9752, "step": 700 }, { "epoch": 1.2348816827344435, "grad_norm": 1.0794785022735596, "learning_rate": 1.4672900855257056e-05, "loss": 0.9508, "step": 705 }, { "epoch": 1.2436459246275198, "grad_norm": 1.0464166402816772, "learning_rate": 1.4582539767121904e-05, "loss": 0.9519, "step": 710 }, { "epoch": 1.252410166520596, "grad_norm": 0.9949556589126587, "learning_rate": 1.449170253647498e-05, "loss": 0.9188, "step": 715 }, { "epoch": 1.2611744084136722, "grad_norm": 0.9590442180633545, "learning_rate": 1.4400398601634189e-05, "loss": 0.9686, "step": 720 }, { "epoch": 1.2699386503067485, "grad_norm": 1.0098439455032349, "learning_rate": 1.4308637449409705e-05, "loss": 0.9848, "step": 725 }, { "epoch": 1.2787028921998247, "grad_norm": 1.026219367980957, "learning_rate": 1.4216428614118245e-05, "loss": 0.9595, "step": 730 }, { "epoch": 1.287467134092901, "grad_norm": 1.0277692079544067, "learning_rate": 1.4123781676592418e-05, "loss": 0.9773, "step": 735 }, { "epoch": 1.2962313759859772, "grad_norm": 1.0222140550613403, "learning_rate": 1.4030706263185248e-05, "loss": 0.9399, "step": 740 }, { "epoch": 1.3049956178790534, "grad_norm": 0.9892441630363464, "learning_rate": 1.3937212044769957e-05, "loss": 0.985, "step": 745 }, { "epoch": 1.3137598597721296, "grad_norm": 1.0329406261444092, "learning_rate": 1.384330873573513e-05, "loss": 0.9369, "step": 750 }, { "epoch": 1.322524101665206, "grad_norm": 0.9816661477088928, "learning_rate": 1.3749006092975347e-05, "loss": 0.9457, "step": 755 }, { "epoch": 1.331288343558282, "grad_norm": 1.0054512023925781, "learning_rate": 1.3654313914877414e-05, "loss": 0.9087, "step": 760 }, { "epoch": 1.3400525854513585, "grad_norm": 17.338027954101562, "learning_rate": 1.3559242040302274e-05, "loss": 0.9808, "step": 765 }, { "epoch": 1.3488168273444348, "grad_norm": 1.0706207752227783, "learning_rate": 1.3463800347562705e-05, "loss": 0.9679, "step": 770 }, { "epoch": 1.357581069237511, "grad_norm": 1.040747046470642, "learning_rate": 1.3367998753396944e-05, "loss": 0.9974, "step": 775 }, { "epoch": 1.3663453111305872, "grad_norm": 0.9935981631278992, "learning_rate": 1.3271847211938286e-05, "loss": 0.9428, "step": 780 }, { "epoch": 1.3751095530236634, "grad_norm": 1.0025993585586548, "learning_rate": 1.317535571368082e-05, "loss": 0.9462, "step": 785 }, { "epoch": 1.3838737949167397, "grad_norm": 0.9988533854484558, "learning_rate": 1.3078534284441382e-05, "loss": 0.9734, "step": 790 }, { "epoch": 1.392638036809816, "grad_norm": 1.0070812702178955, "learning_rate": 1.2981392984317835e-05, "loss": 0.9622, "step": 795 }, { "epoch": 1.4014022787028921, "grad_norm": 1.0259467363357544, "learning_rate": 1.2883941906643786e-05, "loss": 0.9671, "step": 800 }, { "epoch": 1.4101665205959684, "grad_norm": 1.0248597860336304, "learning_rate": 1.2786191176939848e-05, "loss": 0.9402, "step": 805 }, { "epoch": 1.4189307624890448, "grad_norm": 1.008159875869751, "learning_rate": 1.2688150951861582e-05, "loss": 1.0111, "step": 810 }, { "epoch": 1.4276950043821208, "grad_norm": 1.024697184562683, "learning_rate": 1.2589831418144156e-05, "loss": 0.9354, "step": 815 }, { "epoch": 1.4364592462751973, "grad_norm": 0.9872326254844666, "learning_rate": 1.2491242791543922e-05, "loss": 0.9424, "step": 820 }, { "epoch": 1.4452234881682735, "grad_norm": 1.0979632139205933, "learning_rate": 1.2392395315776964e-05, "loss": 0.9594, "step": 825 }, { "epoch": 1.4539877300613497, "grad_norm": 0.9879066944122314, "learning_rate": 1.2293299261454726e-05, "loss": 0.9762, "step": 830 }, { "epoch": 1.462751971954426, "grad_norm": 1.278245210647583, "learning_rate": 1.2193964925016872e-05, "loss": 0.9458, "step": 835 }, { "epoch": 1.4715162138475022, "grad_norm": 1.4075230360031128, "learning_rate": 1.2094402627661447e-05, "loss": 0.9496, "step": 840 }, { "epoch": 1.4802804557405784, "grad_norm": 1.059368371963501, "learning_rate": 1.1994622714272448e-05, "loss": 0.965, "step": 845 }, { "epoch": 1.4890446976336547, "grad_norm": 0.9740917086601257, "learning_rate": 1.1894635552344976e-05, "loss": 0.939, "step": 850 }, { "epoch": 1.4978089395267309, "grad_norm": 0.9713614583015442, "learning_rate": 1.1794451530908011e-05, "loss": 0.9256, "step": 855 }, { "epoch": 1.5065731814198071, "grad_norm": 1.023720145225525, "learning_rate": 1.1694081059444947e-05, "loss": 0.9548, "step": 860 }, { "epoch": 1.5153374233128836, "grad_norm": 1.1291546821594238, "learning_rate": 1.159353456681201e-05, "loss": 0.9512, "step": 865 }, { "epoch": 1.5241016652059596, "grad_norm": 0.9696962833404541, "learning_rate": 1.1492822500154668e-05, "loss": 0.9715, "step": 870 }, { "epoch": 1.532865907099036, "grad_norm": 1.0114858150482178, "learning_rate": 1.1391955323822126e-05, "loss": 0.9355, "step": 875 }, { "epoch": 1.541630148992112, "grad_norm": 1.0963616371154785, "learning_rate": 1.1290943518280058e-05, "loss": 0.9779, "step": 880 }, { "epoch": 1.5503943908851885, "grad_norm": 0.9969412684440613, "learning_rate": 1.118979757902162e-05, "loss": 0.9589, "step": 885 }, { "epoch": 1.5591586327782647, "grad_norm": 1.022300362586975, "learning_rate": 1.1088528015476965e-05, "loss": 0.9656, "step": 890 }, { "epoch": 1.567922874671341, "grad_norm": 0.974607527256012, "learning_rate": 1.098714534992125e-05, "loss": 0.9622, "step": 895 }, { "epoch": 1.5766871165644172, "grad_norm": 1.0116885900497437, "learning_rate": 1.088566011638134e-05, "loss": 0.9343, "step": 900 }, { "epoch": 1.5854513584574934, "grad_norm": 1.0116138458251953, "learning_rate": 1.0784082859541291e-05, "loss": 0.9383, "step": 905 }, { "epoch": 1.5942156003505696, "grad_norm": 1.2108194828033447, "learning_rate": 1.0682424133646712e-05, "loss": 0.9171, "step": 910 }, { "epoch": 1.6029798422436459, "grad_norm": 1.0214340686798096, "learning_rate": 1.0580694501408138e-05, "loss": 0.9675, "step": 915 }, { "epoch": 1.6117440841367223, "grad_norm": 1.0362666845321655, "learning_rate": 1.0478904532903535e-05, "loss": 1.0028, "step": 920 }, { "epoch": 1.6205083260297983, "grad_norm": 0.9954794049263, "learning_rate": 1.0377064804480025e-05, "loss": 0.9624, "step": 925 }, { "epoch": 1.6292725679228748, "grad_norm": 1.0109649896621704, "learning_rate": 1.0275185897654972e-05, "loss": 0.9501, "step": 930 }, { "epoch": 1.6380368098159508, "grad_norm": 1.0172914266586304, "learning_rate": 1.0173278398016502e-05, "loss": 0.9354, "step": 935 }, { "epoch": 1.6468010517090272, "grad_norm": 0.9905017614364624, "learning_rate": 1.0071352894123654e-05, "loss": 0.9758, "step": 940 }, { "epoch": 1.6555652936021035, "grad_norm": 0.9832938313484192, "learning_rate": 9.969419976406166e-06, "loss": 0.9737, "step": 945 }, { "epoch": 1.6643295354951797, "grad_norm": 0.9569029808044434, "learning_rate": 9.867490236064109e-06, "loss": 0.9212, "step": 950 }, { "epoch": 1.673093777388256, "grad_norm": 1.0192569494247437, "learning_rate": 9.765574263967397e-06, "loss": 0.9472, "step": 955 }, { "epoch": 1.6818580192813322, "grad_norm": 0.9713300466537476, "learning_rate": 9.663682649555389e-06, "loss": 0.9644, "step": 960 }, { "epoch": 1.6906222611744084, "grad_norm": 0.9462825655937195, "learning_rate": 9.56182597973658e-06, "loss": 0.9576, "step": 965 }, { "epoch": 1.6993865030674846, "grad_norm": 0.9868680834770203, "learning_rate": 9.460014837788605e-06, "loss": 0.9667, "step": 970 }, { "epoch": 1.708150744960561, "grad_norm": 1.0376570224761963, "learning_rate": 9.358259802258582e-06, "loss": 0.9452, "step": 975 }, { "epoch": 1.716914986853637, "grad_norm": 1.0066869258880615, "learning_rate": 9.256571445863972e-06, "loss": 0.9534, "step": 980 }, { "epoch": 1.7256792287467135, "grad_norm": 1.0090934038162231, "learning_rate": 9.154960334394027e-06, "loss": 0.955, "step": 985 }, { "epoch": 1.7344434706397895, "grad_norm": 0.9518396854400635, "learning_rate": 9.053437025611974e-06, "loss": 0.9342, "step": 990 }, { "epoch": 1.743207712532866, "grad_norm": 0.9992371797561646, "learning_rate": 8.952012068158027e-06, "loss": 0.9722, "step": 995 }, { "epoch": 1.751971954425942, "grad_norm": 0.9554047584533691, "learning_rate": 8.850696000453327e-06, "loss": 0.9357, "step": 1000 }, { "epoch": 1.7607361963190185, "grad_norm": 0.9989141225814819, "learning_rate": 8.749499349604992e-06, "loss": 0.9821, "step": 1005 }, { "epoch": 1.7695004382120947, "grad_norm": 0.9504846334457397, "learning_rate": 8.64843263031228e-06, "loss": 0.9537, "step": 1010 }, { "epoch": 1.778264680105171, "grad_norm": 0.9558340907096863, "learning_rate": 8.547506343774097e-06, "loss": 0.9289, "step": 1015 }, { "epoch": 1.7870289219982471, "grad_norm": 1.0170910358428955, "learning_rate": 8.446730976597877e-06, "loss": 0.9501, "step": 1020 }, { "epoch": 1.7957931638913234, "grad_norm": 0.9939414262771606, "learning_rate": 8.346116999709975e-06, "loss": 0.9472, "step": 1025 }, { "epoch": 1.8045574057843996, "grad_norm": 0.9810356497764587, "learning_rate": 8.245674867267724e-06, "loss": 0.9491, "step": 1030 }, { "epoch": 1.8133216476774758, "grad_norm": 0.9643825888633728, "learning_rate": 8.145415015573183e-06, "loss": 0.947, "step": 1035 }, { "epoch": 1.8220858895705523, "grad_norm": 0.9195330739021301, "learning_rate": 8.045347861988789e-06, "loss": 0.876, "step": 1040 }, { "epoch": 1.8308501314636283, "grad_norm": 0.9632524847984314, "learning_rate": 7.945483803854937e-06, "loss": 0.9173, "step": 1045 }, { "epoch": 1.8396143733567047, "grad_norm": 0.9642296433448792, "learning_rate": 7.845833217409677e-06, "loss": 0.9233, "step": 1050 }, { "epoch": 1.8483786152497808, "grad_norm": 0.9421396851539612, "learning_rate": 7.746406456710564e-06, "loss": 0.9187, "step": 1055 }, { "epoch": 1.8571428571428572, "grad_norm": 0.9888685345649719, "learning_rate": 7.64721385255886e-06, "loss": 0.9289, "step": 1060 }, { "epoch": 1.8659070990359334, "grad_norm": 0.9585088491439819, "learning_rate": 7.548265711426105e-06, "loss": 0.9291, "step": 1065 }, { "epoch": 1.8746713409290097, "grad_norm": 0.9842194318771362, "learning_rate": 7.449572314383237e-06, "loss": 0.9521, "step": 1070 }, { "epoch": 1.883435582822086, "grad_norm": 0.9899460077285767, "learning_rate": 7.351143916032375e-06, "loss": 0.9238, "step": 1075 }, { "epoch": 1.8921998247151621, "grad_norm": 1.044392466545105, "learning_rate": 7.252990743441293e-06, "loss": 0.9354, "step": 1080 }, { "epoch": 1.9009640666082384, "grad_norm": 0.9790138006210327, "learning_rate": 7.155122995080826e-06, "loss": 0.9527, "step": 1085 }, { "epoch": 1.9097283085013146, "grad_norm": 1.002880334854126, "learning_rate": 7.0575508397651885e-06, "loss": 0.9471, "step": 1090 }, { "epoch": 1.918492550394391, "grad_norm": 0.9698459506034851, "learning_rate": 6.960284415595407e-06, "loss": 0.9402, "step": 1095 }, { "epoch": 1.927256792287467, "grad_norm": 0.9467353224754333, "learning_rate": 6.863333828905929e-06, "loss": 0.9409, "step": 1100 }, { "epoch": 1.9360210341805435, "grad_norm": 0.965829074382782, "learning_rate": 6.766709153214541e-06, "loss": 0.9425, "step": 1105 }, { "epoch": 1.9447852760736195, "grad_norm": 0.9571474194526672, "learning_rate": 6.670420428175706e-06, "loss": 0.9405, "step": 1110 }, { "epoch": 1.953549517966696, "grad_norm": 0.9341493248939514, "learning_rate": 6.574477658537375e-06, "loss": 0.9145, "step": 1115 }, { "epoch": 1.962313759859772, "grad_norm": 0.9990600943565369, "learning_rate": 6.4788908131014995e-06, "loss": 0.952, "step": 1120 }, { "epoch": 1.9710780017528484, "grad_norm": 0.917290210723877, "learning_rate": 6.383669823688191e-06, "loss": 0.951, "step": 1125 }, { "epoch": 1.9798422436459246, "grad_norm": 0.9599776864051819, "learning_rate": 6.288824584103815e-06, "loss": 0.936, "step": 1130 }, { "epoch": 1.9886064855390009, "grad_norm": 0.9636255502700806, "learning_rate": 6.194364949112952e-06, "loss": 0.9582, "step": 1135 }, { "epoch": 1.997370727432077, "grad_norm": 1.1487308740615845, "learning_rate": 6.100300733414473e-06, "loss": 0.9276, "step": 1140 }, { "epoch": 2.0, "eval_loss": 1.151129961013794, "eval_runtime": 199.4284, "eval_samples_per_second": 9.161, "eval_steps_per_second": 2.292, "step": 1142 }, { "epoch": 2.005258545135846, "grad_norm": 1.1605374813079834, "learning_rate": 6.006641710621746e-06, "loss": 0.8479, "step": 1145 }, { "epoch": 2.014022787028922, "grad_norm": 1.0491231679916382, "learning_rate": 5.913397612247121e-06, "loss": 0.8032, "step": 1150 }, { "epoch": 2.0227870289219982, "grad_norm": 1.0855581760406494, "learning_rate": 5.82057812669081e-06, "loss": 0.8839, "step": 1155 }, { "epoch": 2.0315512708150747, "grad_norm": 0.9942172169685364, "learning_rate": 5.728192898234195e-06, "loss": 0.7986, "step": 1160 }, { "epoch": 2.0403155127081507, "grad_norm": 1.0435779094696045, "learning_rate": 5.636251526037784e-06, "loss": 0.8263, "step": 1165 }, { "epoch": 2.049079754601227, "grad_norm": 1.0303524732589722, "learning_rate": 5.544763563143794e-06, "loss": 0.8188, "step": 1170 }, { "epoch": 2.057843996494303, "grad_norm": 0.9739100933074951, "learning_rate": 5.453738515483586e-06, "loss": 0.8488, "step": 1175 }, { "epoch": 2.0666082383873796, "grad_norm": 1.021791696548462, "learning_rate": 5.363185840889935e-06, "loss": 0.8666, "step": 1180 }, { "epoch": 2.0753724802804556, "grad_norm": 0.9683573842048645, "learning_rate": 5.273114948114346e-06, "loss": 0.8276, "step": 1185 }, { "epoch": 2.084136722173532, "grad_norm": 1.0052560567855835, "learning_rate": 5.1835351958494515e-06, "loss": 0.8089, "step": 1190 }, { "epoch": 2.092900964066608, "grad_norm": 0.9584820866584778, "learning_rate": 5.094455891756587e-06, "loss": 0.8276, "step": 1195 }, { "epoch": 2.1016652059596845, "grad_norm": 0.9803566932678223, "learning_rate": 5.0058862914987204e-06, "loss": 0.8256, "step": 1200 }, { "epoch": 2.1104294478527605, "grad_norm": 0.9923965334892273, "learning_rate": 4.917835597778731e-06, "loss": 0.8241, "step": 1205 }, { "epoch": 2.119193689745837, "grad_norm": 1.022495985031128, "learning_rate": 4.830312959383238e-06, "loss": 0.8074, "step": 1210 }, { "epoch": 2.127957931638913, "grad_norm": 0.9760512709617615, "learning_rate": 4.743327470231982e-06, "loss": 0.8058, "step": 1215 }, { "epoch": 2.1367221735319895, "grad_norm": 0.9603386521339417, "learning_rate": 4.656888168432962e-06, "loss": 0.8133, "step": 1220 }, { "epoch": 2.145486415425066, "grad_norm": 1.034776210784912, "learning_rate": 4.571004035343315e-06, "loss": 0.818, "step": 1225 }, { "epoch": 2.154250657318142, "grad_norm": 0.9763988256454468, "learning_rate": 4.485683994636144e-06, "loss": 0.8165, "step": 1230 }, { "epoch": 2.1630148992112184, "grad_norm": 0.9729757905006409, "learning_rate": 4.400936911373308e-06, "loss": 0.808, "step": 1235 }, { "epoch": 2.1717791411042944, "grad_norm": 1.0068873167037964, "learning_rate": 4.316771591084297e-06, "loss": 0.8038, "step": 1240 }, { "epoch": 2.180543382997371, "grad_norm": 0.9344819188117981, "learning_rate": 4.2331967788513295e-06, "loss": 0.8335, "step": 1245 }, { "epoch": 2.189307624890447, "grad_norm": 1.0315194129943848, "learning_rate": 4.150221158400683e-06, "loss": 0.8154, "step": 1250 }, { "epoch": 2.1980718667835233, "grad_norm": 0.9959366321563721, "learning_rate": 4.067853351200446e-06, "loss": 0.8317, "step": 1255 }, { "epoch": 2.2068361086765993, "grad_norm": 1.0919640064239502, "learning_rate": 3.986101915564695e-06, "loss": 0.8236, "step": 1260 }, { "epoch": 2.2156003505696757, "grad_norm": 0.9548513293266296, "learning_rate": 3.904975345764262e-06, "loss": 0.849, "step": 1265 }, { "epoch": 2.224364592462752, "grad_norm": 0.9864785075187683, "learning_rate": 3.824482071144164e-06, "loss": 0.8259, "step": 1270 }, { "epoch": 2.233128834355828, "grad_norm": 1.014013648033142, "learning_rate": 3.7446304552477387e-06, "loss": 0.7696, "step": 1275 }, { "epoch": 2.2418930762489047, "grad_norm": 0.95964115858078, "learning_rate": 3.665428794947663e-06, "loss": 0.7758, "step": 1280 }, { "epoch": 2.2506573181419807, "grad_norm": 0.9974411725997925, "learning_rate": 3.5868853195838582e-06, "loss": 0.8512, "step": 1285 }, { "epoch": 2.259421560035057, "grad_norm": 0.990260124206543, "learning_rate": 3.509008190108453e-06, "loss": 0.8096, "step": 1290 }, { "epoch": 2.268185801928133, "grad_norm": 0.982060968875885, "learning_rate": 3.431805498237808e-06, "loss": 0.8259, "step": 1295 }, { "epoch": 2.2769500438212096, "grad_norm": 0.9737572073936462, "learning_rate": 3.355285265611784e-06, "loss": 0.8368, "step": 1300 }, { "epoch": 2.2857142857142856, "grad_norm": 0.9657383561134338, "learning_rate": 3.2794554429602377e-06, "loss": 0.8129, "step": 1305 }, { "epoch": 2.294478527607362, "grad_norm": 0.9947619438171387, "learning_rate": 3.204323909276924e-06, "loss": 0.8034, "step": 1310 }, { "epoch": 2.303242769500438, "grad_norm": 1.0247445106506348, "learning_rate": 3.1298984710008483e-06, "loss": 0.8267, "step": 1315 }, { "epoch": 2.3120070113935145, "grad_norm": 0.9986540079116821, "learning_rate": 3.056186861205136e-06, "loss": 0.8233, "step": 1320 }, { "epoch": 2.3207712532865905, "grad_norm": 0.9882351160049438, "learning_rate": 2.983196738793547e-06, "loss": 0.8097, "step": 1325 }, { "epoch": 2.329535495179667, "grad_norm": 0.9737289547920227, "learning_rate": 2.910935687704671e-06, "loss": 0.8285, "step": 1330 }, { "epoch": 2.3382997370727434, "grad_norm": 0.9512819647789001, "learning_rate": 2.8394112161239606e-06, "loss": 0.7998, "step": 1335 }, { "epoch": 2.3470639789658194, "grad_norm": 0.980267345905304, "learning_rate": 2.7686307557035684e-06, "loss": 0.8364, "step": 1340 }, { "epoch": 2.355828220858896, "grad_norm": 0.9798904061317444, "learning_rate": 2.698601660790191e-06, "loss": 0.8288, "step": 1345 }, { "epoch": 2.364592462751972, "grad_norm": 0.9910169839859009, "learning_rate": 2.629331207660931e-06, "loss": 0.8182, "step": 1350 }, { "epoch": 2.3733567046450483, "grad_norm": 1.0095982551574707, "learning_rate": 2.560826593767244e-06, "loss": 0.8651, "step": 1355 }, { "epoch": 2.3821209465381243, "grad_norm": 1.0415823459625244, "learning_rate": 2.4930949369871205e-06, "loss": 0.7934, "step": 1360 }, { "epoch": 2.390885188431201, "grad_norm": 0.9959484934806824, "learning_rate": 2.426143274885493e-06, "loss": 0.8131, "step": 1365 }, { "epoch": 2.399649430324277, "grad_norm": 0.9777078628540039, "learning_rate": 2.359978563983022e-06, "loss": 0.8125, "step": 1370 }, { "epoch": 2.4084136722173533, "grad_norm": 1.0206762552261353, "learning_rate": 2.294607679033283e-06, "loss": 0.7912, "step": 1375 }, { "epoch": 2.4171779141104293, "grad_norm": 0.9738752245903015, "learning_rate": 2.230037412308452e-06, "loss": 0.8411, "step": 1380 }, { "epoch": 2.4259421560035057, "grad_norm": 0.9954826831817627, "learning_rate": 2.166274472893567e-06, "loss": 0.8052, "step": 1385 }, { "epoch": 2.4347063978965817, "grad_norm": 0.9861373901367188, "learning_rate": 2.1033254859894224e-06, "loss": 0.8041, "step": 1390 }, { "epoch": 2.443470639789658, "grad_norm": 0.9600276947021484, "learning_rate": 2.041196992224206e-06, "loss": 0.8326, "step": 1395 }, { "epoch": 2.4522348816827346, "grad_norm": 1.127557396888733, "learning_rate": 1.9798954469738762e-06, "loss": 0.8355, "step": 1400 }, { "epoch": 2.4609991235758106, "grad_norm": 0.9988298416137695, "learning_rate": 1.9194272196914533e-06, "loss": 0.8473, "step": 1405 }, { "epoch": 2.469763365468887, "grad_norm": 0.972212553024292, "learning_rate": 1.8597985932451856e-06, "loss": 0.816, "step": 1410 }, { "epoch": 2.478527607361963, "grad_norm": 0.9716165065765381, "learning_rate": 1.8010157632657544e-06, "loss": 0.8157, "step": 1415 }, { "epoch": 2.4872918492550395, "grad_norm": 0.9722920656204224, "learning_rate": 1.7430848375025178e-06, "loss": 0.8238, "step": 1420 }, { "epoch": 2.4960560911481156, "grad_norm": 1.0044946670532227, "learning_rate": 1.686011835188891e-06, "loss": 0.8473, "step": 1425 }, { "epoch": 2.504820333041192, "grad_norm": 0.9682095050811768, "learning_rate": 1.6298026864169336e-06, "loss": 0.8132, "step": 1430 }, { "epoch": 2.513584574934268, "grad_norm": 0.9928619861602783, "learning_rate": 1.5744632315211815e-06, "loss": 0.837, "step": 1435 }, { "epoch": 2.5223488168273445, "grad_norm": 0.9613544344902039, "learning_rate": 1.5199992204718295e-06, "loss": 0.8209, "step": 1440 }, { "epoch": 2.531113058720421, "grad_norm": 1.0032097101211548, "learning_rate": 1.466416312277269e-06, "loss": 0.8303, "step": 1445 }, { "epoch": 2.539877300613497, "grad_norm": 0.9630649089813232, "learning_rate": 1.4137200743961189e-06, "loss": 0.825, "step": 1450 }, { "epoch": 2.548641542506573, "grad_norm": 0.9702491164207458, "learning_rate": 1.3619159821587236e-06, "loss": 0.8148, "step": 1455 }, { "epoch": 2.5574057843996494, "grad_norm": 0.9509206414222717, "learning_rate": 1.3110094181982657e-06, "loss": 0.7695, "step": 1460 }, { "epoch": 2.566170026292726, "grad_norm": 0.9589338302612305, "learning_rate": 1.261005671891482e-06, "loss": 0.8532, "step": 1465 }, { "epoch": 2.574934268185802, "grad_norm": 0.9704285264015198, "learning_rate": 1.2119099388090715e-06, "loss": 0.797, "step": 1470 }, { "epoch": 2.5836985100788783, "grad_norm": 1.0093833208084106, "learning_rate": 1.1637273201758747e-06, "loss": 0.8233, "step": 1475 }, { "epoch": 2.5924627519719543, "grad_norm": 0.9612089991569519, "learning_rate": 1.1164628223408169e-06, "loss": 0.8489, "step": 1480 }, { "epoch": 2.6012269938650308, "grad_norm": 0.9347235560417175, "learning_rate": 1.0701213562567491e-06, "loss": 0.7855, "step": 1485 }, { "epoch": 2.6099912357581068, "grad_norm": 1.00240957736969, "learning_rate": 1.0247077369701653e-06, "loss": 0.8322, "step": 1490 }, { "epoch": 2.618755477651183, "grad_norm": 0.99866783618927, "learning_rate": 9.802266831209206e-07, "loss": 0.8133, "step": 1495 }, { "epoch": 2.6275197195442592, "grad_norm": 1.0041725635528564, "learning_rate": 9.36682816451926e-07, "loss": 0.8715, "step": 1500 }, { "epoch": 2.6362839614373357, "grad_norm": 0.9615875482559204, "learning_rate": 8.940806613289499e-07, "loss": 0.8075, "step": 1505 }, { "epoch": 2.645048203330412, "grad_norm": 0.9449265003204346, "learning_rate": 8.524246442705153e-07, "loss": 0.7974, "step": 1510 }, { "epoch": 2.653812445223488, "grad_norm": 0.9578828811645508, "learning_rate": 8.117190934879593e-07, "loss": 0.8175, "step": 1515 }, { "epoch": 2.662576687116564, "grad_norm": 0.9990285038948059, "learning_rate": 7.719682384357308e-07, "loss": 0.8147, "step": 1520 }, { "epoch": 2.6713409290096406, "grad_norm": 0.9652912616729736, "learning_rate": 7.33176209371923e-07, "loss": 0.8429, "step": 1525 }, { "epoch": 2.680105170902717, "grad_norm": 0.9373207092285156, "learning_rate": 6.953470369291349e-07, "loss": 0.825, "step": 1530 }, { "epoch": 2.688869412795793, "grad_norm": 0.9682218432426453, "learning_rate": 6.5848465169566e-07, "loss": 0.7916, "step": 1535 }, { "epoch": 2.6976336546888695, "grad_norm": 0.995035707950592, "learning_rate": 6.225928838071016e-07, "loss": 0.829, "step": 1540 }, { "epoch": 2.7063978965819455, "grad_norm": 0.9676108956336975, "learning_rate": 5.876754625483904e-07, "loss": 0.8497, "step": 1545 }, { "epoch": 2.715162138475022, "grad_norm": 0.9674281477928162, "learning_rate": 5.537360159663107e-07, "loss": 0.8126, "step": 1550 }, { "epoch": 2.7239263803680984, "grad_norm": 0.9768509864807129, "learning_rate": 5.207780704925314e-07, "loss": 0.8432, "step": 1555 }, { "epoch": 2.7326906222611744, "grad_norm": 0.9932735562324524, "learning_rate": 4.888050505771869e-07, "loss": 0.8293, "step": 1560 }, { "epoch": 2.7414548641542504, "grad_norm": 0.9800174832344055, "learning_rate": 4.5782027833307983e-07, "loss": 0.7843, "step": 1565 }, { "epoch": 2.750219106047327, "grad_norm": 0.9393450021743774, "learning_rate": 4.2782697319048603e-07, "loss": 0.8016, "step": 1570 }, { "epoch": 2.7589833479404033, "grad_norm": 0.9714465737342834, "learning_rate": 3.9882825156265846e-07, "loss": 0.8264, "step": 1575 }, { "epoch": 2.7677475898334793, "grad_norm": 0.975568950176239, "learning_rate": 3.708271265220087e-07, "loss": 0.802, "step": 1580 }, { "epoch": 2.776511831726556, "grad_norm": 0.9788158535957336, "learning_rate": 3.4382650748704173e-07, "loss": 0.8374, "step": 1585 }, { "epoch": 2.785276073619632, "grad_norm": 0.9417116641998291, "learning_rate": 3.178291999200633e-07, "loss": 0.8181, "step": 1590 }, { "epoch": 2.7940403155127083, "grad_norm": 0.9802819490432739, "learning_rate": 2.928379050356722e-07, "loss": 0.8208, "step": 1595 }, { "epoch": 2.8028045574057843, "grad_norm": 0.9727985858917236, "learning_rate": 2.6885521952010105e-07, "loss": 0.7862, "step": 1600 }, { "epoch": 2.8115687992988607, "grad_norm": 0.9225666522979736, "learning_rate": 2.458836352614069e-07, "loss": 0.7791, "step": 1605 }, { "epoch": 2.8203330411919367, "grad_norm": 1.038718342781067, "learning_rate": 2.2392553909055813e-07, "loss": 0.8164, "step": 1610 }, { "epoch": 2.829097283085013, "grad_norm": 0.945773184299469, "learning_rate": 2.029832125334319e-07, "loss": 0.8277, "step": 1615 }, { "epoch": 2.8378615249780896, "grad_norm": 0.9560094475746155, "learning_rate": 1.8305883157375804e-07, "loss": 0.7974, "step": 1620 }, { "epoch": 2.8466257668711656, "grad_norm": 0.9896951913833618, "learning_rate": 1.6415446642702337e-07, "loss": 0.8084, "step": 1625 }, { "epoch": 2.8553900087642416, "grad_norm": 0.9845879077911377, "learning_rate": 1.4627208132536818e-07, "loss": 0.8216, "step": 1630 }, { "epoch": 2.864154250657318, "grad_norm": 0.9730380177497864, "learning_rate": 1.2941353431350058e-07, "loss": 0.7997, "step": 1635 }, { "epoch": 2.8729184925503946, "grad_norm": 0.9810739159584045, "learning_rate": 1.1358057705563641e-07, "loss": 0.8212, "step": 1640 }, { "epoch": 2.8816827344434706, "grad_norm": 0.9314019083976746, "learning_rate": 9.877485465349057e-08, "loss": 0.7794, "step": 1645 }, { "epoch": 2.890446976336547, "grad_norm": 0.9651502966880798, "learning_rate": 8.499790547535025e-08, "loss": 0.8138, "step": 1650 }, { "epoch": 2.899211218229623, "grad_norm": 0.966038167476654, "learning_rate": 7.225116099623287e-08, "loss": 0.8212, "step": 1655 }, { "epoch": 2.9079754601226995, "grad_norm": 0.9493021965026855, "learning_rate": 6.053594564914611e-08, "loss": 0.832, "step": 1660 }, { "epoch": 2.9167397020157755, "grad_norm": 0.9688047766685486, "learning_rate": 4.985347668747809e-08, "loss": 0.8239, "step": 1665 }, { "epoch": 2.925503943908852, "grad_norm": 0.9778699278831482, "learning_rate": 4.020486405852286e-08, "loss": 0.7976, "step": 1670 }, { "epoch": 2.934268185801928, "grad_norm": 0.9479379653930664, "learning_rate": 3.15911102881461e-08, "loss": 0.8375, "step": 1675 }, { "epoch": 2.9430324276950044, "grad_norm": 1.0030702352523804, "learning_rate": 2.4013110376623906e-08, "loss": 0.8225, "step": 1680 }, { "epoch": 2.951796669588081, "grad_norm": 1.0119658708572388, "learning_rate": 1.747165170564724e-08, "loss": 0.8276, "step": 1685 }, { "epoch": 2.960560911481157, "grad_norm": 0.9981706738471985, "learning_rate": 1.1967413956510687e-08, "loss": 0.8661, "step": 1690 }, { "epoch": 2.969325153374233, "grad_norm": 0.9298052787780762, "learning_rate": 7.500969039491156e-09, "loss": 0.8439, "step": 1695 }, { "epoch": 2.9780893952673093, "grad_norm": 0.9936395287513733, "learning_rate": 4.072781034425432e-09, "loss": 0.8221, "step": 1700 }, { "epoch": 2.9868536371603858, "grad_norm": 1.0040860176086426, "learning_rate": 1.6832061424865155e-09, "loss": 0.818, "step": 1705 }, { "epoch": 2.9956178790534618, "grad_norm": 0.9950876235961914, "learning_rate": 3.324926491787839e-10, "loss": 0.8279, "step": 1710 }, { "epoch": 3.0, "eval_loss": 1.1766911745071411, "eval_runtime": 199.3045, "eval_samples_per_second": 9.167, "eval_steps_per_second": 2.293, "step": 1713 }, { "epoch": 3.0, "step": 1713, "total_flos": 90953314467840.0, "train_loss": 0.9790556504583052, "train_runtime": 12705.6497, "train_samples_per_second": 8.618, "train_steps_per_second": 0.135 } ], "logging_steps": 5, "max_steps": 1713, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 90953314467840.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }