{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994655264564404, "eval_steps": 200, "global_step": 935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010689470871191875, "grad_norm": 4.714404593717477e+18, "learning_rate": 2.1276595744680852e-07, "loss": 1.5727, "step": 1 }, { "epoch": 0.005344735435595938, "grad_norm": 3523.978271484375, "learning_rate": 1.0638297872340427e-06, "loss": 1.5802, "step": 5 }, { "epoch": 0.010689470871191877, "grad_norm": 36.699684143066406, "learning_rate": 2.1276595744680853e-06, "loss": 1.539, "step": 10 }, { "epoch": 0.016034206306787813, "grad_norm": 6.539373874664307, "learning_rate": 3.191489361702128e-06, "loss": 1.4339, "step": 15 }, { "epoch": 0.021378941742383754, "grad_norm": 2.321072816848755, "learning_rate": 4.255319148936171e-06, "loss": 1.3441, "step": 20 }, { "epoch": 0.02672367717797969, "grad_norm": 2.6276676654815674, "learning_rate": 5.319148936170213e-06, "loss": 1.2829, "step": 25 }, { "epoch": 0.032068412613575625, "grad_norm": 1.8791221380233765, "learning_rate": 6.382978723404256e-06, "loss": 1.2579, "step": 30 }, { "epoch": 0.03741314804917156, "grad_norm": 1.898274540901184, "learning_rate": 7.446808510638298e-06, "loss": 1.2338, "step": 35 }, { "epoch": 0.04275788348476751, "grad_norm": 1.6828669309616089, "learning_rate": 8.510638297872341e-06, "loss": 1.2116, "step": 40 }, { "epoch": 0.048102618920363445, "grad_norm": 1.6606314182281494, "learning_rate": 9.574468085106385e-06, "loss": 1.1804, "step": 45 }, { "epoch": 0.05344735435595938, "grad_norm": 1.5807420015335083, "learning_rate": 1.0638297872340426e-05, "loss": 1.191, "step": 50 }, { "epoch": 0.05879208979155532, "grad_norm": 1.5150554180145264, "learning_rate": 1.170212765957447e-05, "loss": 1.1707, "step": 55 }, { "epoch": 0.06413682522715125, "grad_norm": 1.198437213897705, "learning_rate": 1.2765957446808513e-05, "loss": 1.1583, "step": 60 }, { "epoch": 0.06948156066274719, "grad_norm": 1.2076998949050903, "learning_rate": 1.3829787234042556e-05, "loss": 1.1604, "step": 65 }, { "epoch": 0.07482629609834313, "grad_norm": 1.4316256046295166, "learning_rate": 1.4893617021276596e-05, "loss": 1.1664, "step": 70 }, { "epoch": 0.08017103153393906, "grad_norm": 1.2154533863067627, "learning_rate": 1.595744680851064e-05, "loss": 1.1603, "step": 75 }, { "epoch": 0.08551576696953501, "grad_norm": 1.6808208227157593, "learning_rate": 1.7021276595744682e-05, "loss": 1.1415, "step": 80 }, { "epoch": 0.09086050240513095, "grad_norm": 1.1716338396072388, "learning_rate": 1.8085106382978724e-05, "loss": 1.1511, "step": 85 }, { "epoch": 0.09620523784072689, "grad_norm": 1.4733761548995972, "learning_rate": 1.914893617021277e-05, "loss": 1.1462, "step": 90 }, { "epoch": 0.10154997327632283, "grad_norm": 1.0243571996688843, "learning_rate": 1.9999930228629612e-05, "loss": 1.1484, "step": 95 }, { "epoch": 0.10689470871191876, "grad_norm": 1.1259580850601196, "learning_rate": 1.999748833289337e-05, "loss": 1.1425, "step": 100 }, { "epoch": 0.1122394441475147, "grad_norm": 1.4838624000549316, "learning_rate": 1.999155884218539e-05, "loss": 1.161, "step": 105 }, { "epoch": 0.11758417958311064, "grad_norm": 1.2224469184875488, "learning_rate": 1.9982143824991402e-05, "loss": 1.1318, "step": 110 }, { "epoch": 0.12292891501870658, "grad_norm": 0.9379732608795166, "learning_rate": 1.9969246565713005e-05, "loss": 1.1533, "step": 115 }, { "epoch": 0.1282736504543025, "grad_norm": 1.209952473640442, "learning_rate": 1.99528715635219e-05, "loss": 1.1494, "step": 120 }, { "epoch": 0.13361838588989844, "grad_norm": 1.0505174398422241, "learning_rate": 1.9933024530790377e-05, "loss": 1.142, "step": 125 }, { "epoch": 0.13896312132549438, "grad_norm": 1.0835527181625366, "learning_rate": 1.990971239109856e-05, "loss": 1.1377, "step": 130 }, { "epoch": 0.14430785676109031, "grad_norm": 1.0426019430160522, "learning_rate": 1.9882943276819153e-05, "loss": 1.1378, "step": 135 }, { "epoch": 0.14965259219668625, "grad_norm": 1.105392336845398, "learning_rate": 1.9852726526280467e-05, "loss": 1.1298, "step": 140 }, { "epoch": 0.1549973276322822, "grad_norm": 1.0859570503234863, "learning_rate": 1.981907268050878e-05, "loss": 1.144, "step": 145 }, { "epoch": 0.16034206306787813, "grad_norm": 1.0400607585906982, "learning_rate": 1.9781993479551124e-05, "loss": 1.1431, "step": 150 }, { "epoch": 0.16568679850347406, "grad_norm": 0.8823645710945129, "learning_rate": 1.9741501858379828e-05, "loss": 1.1449, "step": 155 }, { "epoch": 0.17103153393907003, "grad_norm": 0.9304960370063782, "learning_rate": 1.969761194238015e-05, "loss": 1.1289, "step": 160 }, { "epoch": 0.17637626937466597, "grad_norm": 0.9176273345947266, "learning_rate": 1.9650339042422707e-05, "loss": 1.1303, "step": 165 }, { "epoch": 0.1817210048102619, "grad_norm": 0.8516846299171448, "learning_rate": 1.9599699649522318e-05, "loss": 1.1169, "step": 170 }, { "epoch": 0.18706574024585784, "grad_norm": 0.9081844687461853, "learning_rate": 1.9545711429085138e-05, "loss": 1.1263, "step": 175 }, { "epoch": 0.19241047568145378, "grad_norm": 0.8308460712432861, "learning_rate": 1.948839321474617e-05, "loss": 1.1327, "step": 180 }, { "epoch": 0.19775521111704972, "grad_norm": 0.7651630640029907, "learning_rate": 1.942776500179918e-05, "loss": 1.1364, "step": 185 }, { "epoch": 0.20309994655264565, "grad_norm": 0.8433904647827148, "learning_rate": 1.9363847940221396e-05, "loss": 1.1305, "step": 190 }, { "epoch": 0.2084446819882416, "grad_norm": 0.7462895512580872, "learning_rate": 1.929666432729541e-05, "loss": 1.1378, "step": 195 }, { "epoch": 0.21378941742383753, "grad_norm": 0.7528314590454102, "learning_rate": 1.9226237599830834e-05, "loss": 1.1457, "step": 200 }, { "epoch": 0.21378941742383753, "eval_loss": 1.1342483758926392, "eval_runtime": 298.4179, "eval_samples_per_second": 44.387, "eval_steps_per_second": 5.549, "step": 200 }, { "epoch": 0.21913415285943347, "grad_norm": 0.8370968103408813, "learning_rate": 1.9152592325988428e-05, "loss": 1.1102, "step": 205 }, { "epoch": 0.2244788882950294, "grad_norm": 0.8102027177810669, "learning_rate": 1.9075754196709574e-05, "loss": 1.1246, "step": 210 }, { "epoch": 0.22982362373062534, "grad_norm": 0.7332049012184143, "learning_rate": 1.8995750016754066e-05, "loss": 1.1459, "step": 215 }, { "epoch": 0.23516835916622128, "grad_norm": 0.8379830718040466, "learning_rate": 1.8912607695349348e-05, "loss": 1.1129, "step": 220 }, { "epoch": 0.24051309460181722, "grad_norm": 0.8001225590705872, "learning_rate": 1.882635623645446e-05, "loss": 1.1253, "step": 225 }, { "epoch": 0.24585783003741316, "grad_norm": 0.8222916722297668, "learning_rate": 1.873702572864208e-05, "loss": 1.1273, "step": 230 }, { "epoch": 0.25120256547300907, "grad_norm": 0.7916209697723389, "learning_rate": 1.8644647334602225e-05, "loss": 1.1032, "step": 235 }, { "epoch": 0.256547300908605, "grad_norm": 0.8102867603302002, "learning_rate": 1.8549253280271232e-05, "loss": 1.1098, "step": 240 }, { "epoch": 0.26189203634420094, "grad_norm": 0.7470313906669617, "learning_rate": 1.8450876843589837e-05, "loss": 1.1207, "step": 245 }, { "epoch": 0.2672367717797969, "grad_norm": 0.8269715309143066, "learning_rate": 1.834955234289425e-05, "loss": 1.1254, "step": 250 }, { "epoch": 0.2725815072153928, "grad_norm": 0.8336250185966492, "learning_rate": 1.824531512494432e-05, "loss": 1.1306, "step": 255 }, { "epoch": 0.27792624265098875, "grad_norm": 0.7656748294830322, "learning_rate": 1.81382015525929e-05, "loss": 1.1106, "step": 260 }, { "epoch": 0.2832709780865847, "grad_norm": 0.7885479927062988, "learning_rate": 1.8028248992100783e-05, "loss": 1.1133, "step": 265 }, { "epoch": 0.28861571352218063, "grad_norm": 0.8161848783493042, "learning_rate": 1.7915495800101594e-05, "loss": 1.114, "step": 270 }, { "epoch": 0.29396044895777657, "grad_norm": 0.7918674945831299, "learning_rate": 1.7799981310221172e-05, "loss": 1.1255, "step": 275 }, { "epoch": 0.2993051843933725, "grad_norm": 0.7479060292243958, "learning_rate": 1.7681745819356163e-05, "loss": 1.111, "step": 280 }, { "epoch": 0.30464991982896844, "grad_norm": 0.7473365664482117, "learning_rate": 1.756083057361657e-05, "loss": 1.1083, "step": 285 }, { "epoch": 0.3099946552645644, "grad_norm": 0.7499710917472839, "learning_rate": 1.743727775393713e-05, "loss": 1.1198, "step": 290 }, { "epoch": 0.3153393907001603, "grad_norm": 0.7550699710845947, "learning_rate": 1.7311130461362658e-05, "loss": 1.103, "step": 295 }, { "epoch": 0.32068412613575625, "grad_norm": 0.7409452795982361, "learning_rate": 1.7182432702012363e-05, "loss": 1.111, "step": 300 }, { "epoch": 0.3260288615713522, "grad_norm": 0.7733418941497803, "learning_rate": 1.7051229371728418e-05, "loss": 1.128, "step": 305 }, { "epoch": 0.33137359700694813, "grad_norm": 0.7811718583106995, "learning_rate": 1.6917566240414197e-05, "loss": 1.1172, "step": 310 }, { "epoch": 0.3367183324425441, "grad_norm": 0.7376381158828735, "learning_rate": 1.678148993606757e-05, "loss": 1.11, "step": 315 }, { "epoch": 0.34206306787814006, "grad_norm": 0.8394958972930908, "learning_rate": 1.6643047928514862e-05, "loss": 1.1133, "step": 320 }, { "epoch": 0.347407803313736, "grad_norm": 0.7389253377914429, "learning_rate": 1.6502288512851124e-05, "loss": 1.1056, "step": 325 }, { "epoch": 0.35275253874933193, "grad_norm": 0.811557948589325, "learning_rate": 1.635926079259257e-05, "loss": 1.1121, "step": 330 }, { "epoch": 0.35809727418492787, "grad_norm": 0.718371570110321, "learning_rate": 1.6214014662546897e-05, "loss": 1.1188, "step": 335 }, { "epoch": 0.3634420096205238, "grad_norm": 0.8406382203102112, "learning_rate": 1.606660079140769e-05, "loss": 1.094, "step": 340 }, { "epoch": 0.36878674505611975, "grad_norm": 0.8330205678939819, "learning_rate": 1.5917070604078736e-05, "loss": 1.1225, "step": 345 }, { "epoch": 0.3741314804917157, "grad_norm": 0.8457123637199402, "learning_rate": 1.576547626373464e-05, "loss": 1.1178, "step": 350 }, { "epoch": 0.3794762159273116, "grad_norm": 0.7380478382110596, "learning_rate": 1.5611870653623826e-05, "loss": 1.0822, "step": 355 }, { "epoch": 0.38482095136290756, "grad_norm": 0.8306733965873718, "learning_rate": 1.5456307358620372e-05, "loss": 1.1232, "step": 360 }, { "epoch": 0.3901656867985035, "grad_norm": 1.030051589012146, "learning_rate": 1.5298840646531093e-05, "loss": 1.0938, "step": 365 }, { "epoch": 0.39551042223409943, "grad_norm": 0.7108510732650757, "learning_rate": 1.5139525449164358e-05, "loss": 1.096, "step": 370 }, { "epoch": 0.40085515766969537, "grad_norm": 0.6886569261550903, "learning_rate": 1.49784173431673e-05, "loss": 1.1025, "step": 375 }, { "epoch": 0.4061998931052913, "grad_norm": 0.7875552177429199, "learning_rate": 1.4815572530638046e-05, "loss": 1.1171, "step": 380 }, { "epoch": 0.41154462854088725, "grad_norm": 0.7455640435218811, "learning_rate": 1.4651047819519804e-05, "loss": 1.1102, "step": 385 }, { "epoch": 0.4168893639764832, "grad_norm": 0.815104067325592, "learning_rate": 1.4484900603783544e-05, "loss": 1.1004, "step": 390 }, { "epoch": 0.4222340994120791, "grad_norm": 0.7801194190979004, "learning_rate": 1.4317188843406304e-05, "loss": 1.1172, "step": 395 }, { "epoch": 0.42757883484767506, "grad_norm": 0.6944659352302551, "learning_rate": 1.4147971044152002e-05, "loss": 1.0927, "step": 400 }, { "epoch": 0.42757883484767506, "eval_loss": 1.1091774702072144, "eval_runtime": 299.2382, "eval_samples_per_second": 44.266, "eval_steps_per_second": 5.534, "step": 400 }, { "epoch": 0.432923570283271, "grad_norm": 0.7244230508804321, "learning_rate": 1.3977306237161877e-05, "loss": 1.1015, "step": 405 }, { "epoch": 0.43826830571886694, "grad_norm": 0.7366450428962708, "learning_rate": 1.3805253958361641e-05, "loss": 1.1064, "step": 410 }, { "epoch": 0.4436130411544629, "grad_norm": 0.728798508644104, "learning_rate": 1.3631874227692549e-05, "loss": 1.0934, "step": 415 }, { "epoch": 0.4489577765900588, "grad_norm": 0.6889216303825378, "learning_rate": 1.3457227528173613e-05, "loss": 1.0692, "step": 420 }, { "epoch": 0.45430251202565475, "grad_norm": 0.8071165084838867, "learning_rate": 1.3281374784802263e-05, "loss": 1.1183, "step": 425 }, { "epoch": 0.4596472474612507, "grad_norm": 0.7213979363441467, "learning_rate": 1.3104377343300868e-05, "loss": 1.0848, "step": 430 }, { "epoch": 0.4649919828968466, "grad_norm": 0.7587602138519287, "learning_rate": 1.292629694871642e-05, "loss": 1.0936, "step": 435 }, { "epoch": 0.47033671833244256, "grad_norm": 0.7268496155738831, "learning_rate": 1.2747195723880976e-05, "loss": 1.1016, "step": 440 }, { "epoch": 0.4756814537680385, "grad_norm": 0.7437026500701904, "learning_rate": 1.2567136147740294e-05, "loss": 1.0934, "step": 445 }, { "epoch": 0.48102618920363444, "grad_norm": 0.7416156530380249, "learning_rate": 1.2386181033558205e-05, "loss": 1.0981, "step": 450 }, { "epoch": 0.4863709246392304, "grad_norm": 2.2510459423065186, "learning_rate": 1.2204393507004404e-05, "loss": 1.0839, "step": 455 }, { "epoch": 0.4917156600748263, "grad_norm": 0.7769641280174255, "learning_rate": 1.2021836984133255e-05, "loss": 1.1056, "step": 460 }, { "epoch": 0.49706039551042225, "grad_norm": 0.7865785956382751, "learning_rate": 1.1838575149261256e-05, "loss": 1.0846, "step": 465 }, { "epoch": 0.5024051309460181, "grad_norm": 0.7650848031044006, "learning_rate": 1.165467193275097e-05, "loss": 1.0914, "step": 470 }, { "epoch": 0.5077498663816141, "grad_norm": 0.7357913851737976, "learning_rate": 1.1470191488709086e-05, "loss": 1.0999, "step": 475 }, { "epoch": 0.51309460181721, "grad_norm": 0.7361023426055908, "learning_rate": 1.1285198172606466e-05, "loss": 1.0888, "step": 480 }, { "epoch": 0.518439337252806, "grad_norm": 0.6836273074150085, "learning_rate": 1.1099756518827895e-05, "loss": 1.0858, "step": 485 }, { "epoch": 0.5237840726884019, "grad_norm": 0.7213249802589417, "learning_rate": 1.0913931218159482e-05, "loss": 1.0799, "step": 490 }, { "epoch": 0.5291288081239979, "grad_norm": 0.6905862092971802, "learning_rate": 1.072778709522143e-05, "loss": 1.0845, "step": 495 }, { "epoch": 0.5344735435595938, "grad_norm": 0.6546228528022766, "learning_rate": 1.0541389085854177e-05, "loss": 1.072, "step": 500 }, { "epoch": 0.5398182789951897, "grad_norm": 0.7062543630599976, "learning_rate": 1.0354802214465715e-05, "loss": 1.0642, "step": 505 }, { "epoch": 0.5451630144307856, "grad_norm": 0.6921901106834412, "learning_rate": 1.0168091571348003e-05, "loss": 1.0773, "step": 510 }, { "epoch": 0.5505077498663816, "grad_norm": 0.696793258190155, "learning_rate": 9.981322289970407e-06, "loss": 1.085, "step": 515 }, { "epoch": 0.5558524853019775, "grad_norm": 0.6655827164649963, "learning_rate": 9.794559524258089e-06, "loss": 1.1033, "step": 520 }, { "epoch": 0.5611972207375735, "grad_norm": 0.7247300148010254, "learning_rate": 9.607868425863235e-06, "loss": 1.0884, "step": 525 }, { "epoch": 0.5665419561731694, "grad_norm": 0.704136312007904, "learning_rate": 9.421314121437093e-06, "loss": 1.0921, "step": 530 }, { "epoch": 0.5718866916087654, "grad_norm": 0.6913681626319885, "learning_rate": 9.234961689910735e-06, "loss": 1.092, "step": 535 }, { "epoch": 0.5772314270443613, "grad_norm": 0.7085949182510376, "learning_rate": 9.04887613979244e-06, "loss": 1.0779, "step": 540 }, { "epoch": 0.5825761624799572, "grad_norm": 0.6966550350189209, "learning_rate": 8.863122386489704e-06, "loss": 1.0858, "step": 545 }, { "epoch": 0.5879208979155531, "grad_norm": 0.6893490552902222, "learning_rate": 8.677765229663634e-06, "loss": 1.074, "step": 550 }, { "epoch": 0.5932656333511491, "grad_norm": 0.6713391542434692, "learning_rate": 8.492869330623813e-06, "loss": 1.0883, "step": 555 }, { "epoch": 0.598610368786745, "grad_norm": 0.6851401329040527, "learning_rate": 8.308499189771375e-06, "loss": 1.0786, "step": 560 }, { "epoch": 0.603955104222341, "grad_norm": 0.6715465188026428, "learning_rate": 8.124719124098218e-06, "loss": 1.0586, "step": 565 }, { "epoch": 0.6092998396579369, "grad_norm": 0.7100517153739929, "learning_rate": 7.941593244750232e-06, "loss": 1.0852, "step": 570 }, { "epoch": 0.6146445750935329, "grad_norm": 0.6566494703292847, "learning_rate": 7.759185434662281e-06, "loss": 1.0728, "step": 575 }, { "epoch": 0.6199893105291288, "grad_norm": 0.6481814980506897, "learning_rate": 7.57755932627284e-06, "loss": 1.0824, "step": 580 }, { "epoch": 0.6253340459647247, "grad_norm": 0.6776189208030701, "learning_rate": 7.396778279326006e-06, "loss": 1.083, "step": 585 }, { "epoch": 0.6306787814003206, "grad_norm": 0.666384756565094, "learning_rate": 7.216905358768622e-06, "loss": 1.0879, "step": 590 }, { "epoch": 0.6360235168359166, "grad_norm": 0.6301067471504211, "learning_rate": 7.038003312750263e-06, "loss": 1.0773, "step": 595 }, { "epoch": 0.6413682522715125, "grad_norm": 0.664985716342926, "learning_rate": 6.860134550733727e-06, "loss": 1.0811, "step": 600 }, { "epoch": 0.6413682522715125, "eval_loss": 1.0846972465515137, "eval_runtime": 306.2721, "eval_samples_per_second": 43.249, "eval_steps_per_second": 5.407, "step": 600 }, { "epoch": 0.6467129877071085, "grad_norm": 0.6260780096054077, "learning_rate": 6.68336112172366e-06, "loss": 1.0902, "step": 605 }, { "epoch": 0.6520577231427044, "grad_norm": 0.6706629395484924, "learning_rate": 6.5077446926209475e-06, "loss": 1.0781, "step": 610 }, { "epoch": 0.6574024585783004, "grad_norm": 0.6518301963806152, "learning_rate": 6.333346526710398e-06, "loss": 1.0538, "step": 615 }, { "epoch": 0.6627471940138963, "grad_norm": 0.6535404920578003, "learning_rate": 6.1602274622892175e-06, "loss": 1.0582, "step": 620 }, { "epoch": 0.6680919294494923, "grad_norm": 0.681788980960846, "learning_rate": 5.988447891443744e-06, "loss": 1.0796, "step": 625 }, { "epoch": 0.6734366648850882, "grad_norm": 0.6322731971740723, "learning_rate": 5.818067738981851e-06, "loss": 1.0557, "step": 630 }, { "epoch": 0.6787814003206841, "grad_norm": 0.6635384559631348, "learning_rate": 5.649146441528341e-06, "loss": 1.0889, "step": 635 }, { "epoch": 0.6841261357562801, "grad_norm": 0.6500101089477539, "learning_rate": 5.48174292679065e-06, "loss": 1.0699, "step": 640 }, { "epoch": 0.689470871191876, "grad_norm": 0.6220327615737915, "learning_rate": 5.3159155930021e-06, "loss": 1.0885, "step": 645 }, { "epoch": 0.694815606627472, "grad_norm": 0.6437745094299316, "learning_rate": 5.151722288549828e-06, "loss": 1.0731, "step": 650 }, { "epoch": 0.7001603420630679, "grad_norm": 0.607314944267273, "learning_rate": 4.989220291794549e-06, "loss": 1.0514, "step": 655 }, { "epoch": 0.7055050774986639, "grad_norm": 0.6423488259315491, "learning_rate": 4.82846629108917e-06, "loss": 1.0769, "step": 660 }, { "epoch": 0.7108498129342598, "grad_norm": 0.6608098149299622, "learning_rate": 4.66951636500322e-06, "loss": 1.077, "step": 665 }, { "epoch": 0.7161945483698557, "grad_norm": 0.6718878149986267, "learning_rate": 4.512425962759992e-06, "loss": 1.0588, "step": 670 }, { "epoch": 0.7215392838054516, "grad_norm": 0.6484812498092651, "learning_rate": 4.357249884893252e-06, "loss": 1.0599, "step": 675 }, { "epoch": 0.7268840192410476, "grad_norm": 0.6472474336624146, "learning_rate": 4.204042264130227e-06, "loss": 1.0646, "step": 680 }, { "epoch": 0.7322287546766435, "grad_norm": 0.6388882398605347, "learning_rate": 4.052856546507565e-06, "loss": 1.0855, "step": 685 }, { "epoch": 0.7375734901122395, "grad_norm": 0.6360565423965454, "learning_rate": 3.9037454727268375e-06, "loss": 1.0778, "step": 690 }, { "epoch": 0.7429182255478354, "grad_norm": 0.6330989003181458, "learning_rate": 3.7567610597560854e-06, "loss": 1.0813, "step": 695 }, { "epoch": 0.7482629609834314, "grad_norm": 0.6103405952453613, "learning_rate": 3.611954582683861e-06, "loss": 1.0548, "step": 700 }, { "epoch": 0.7536076964190273, "grad_norm": 0.6149457097053528, "learning_rate": 3.469376556832069e-06, "loss": 1.0617, "step": 705 }, { "epoch": 0.7589524318546232, "grad_norm": 0.6596876382827759, "learning_rate": 3.3290767201338247e-06, "loss": 1.0632, "step": 710 }, { "epoch": 0.7642971672902191, "grad_norm": 0.6746628284454346, "learning_rate": 3.1911040157825256e-06, "loss": 1.0642, "step": 715 }, { "epoch": 0.7696419027258151, "grad_norm": 0.6416762471199036, "learning_rate": 3.055506575158168e-06, "loss": 1.0641, "step": 720 }, { "epoch": 0.774986638161411, "grad_norm": 0.6470797657966614, "learning_rate": 2.922331701036848e-06, "loss": 1.0592, "step": 725 }, { "epoch": 0.780331373597007, "grad_norm": 0.6156336069107056, "learning_rate": 2.791625851089317e-06, "loss": 1.0917, "step": 730 }, { "epoch": 0.7856761090326029, "grad_norm": 0.6239079833030701, "learning_rate": 2.663434621674367e-06, "loss": 1.071, "step": 735 }, { "epoch": 0.7910208444681989, "grad_norm": 0.6546154618263245, "learning_rate": 2.537802731932674e-06, "loss": 1.0616, "step": 740 }, { "epoch": 0.7963655799037948, "grad_norm": 0.6523792147636414, "learning_rate": 2.4147740081866423e-06, "loss": 1.0687, "step": 745 }, { "epoch": 0.8017103153393907, "grad_norm": 0.6275489330291748, "learning_rate": 2.294391368651735e-06, "loss": 1.0502, "step": 750 }, { "epoch": 0.8070550507749866, "grad_norm": 0.6088879704475403, "learning_rate": 2.176696808464559e-06, "loss": 1.0632, "step": 755 }, { "epoch": 0.8123997862105826, "grad_norm": 0.6577759981155396, "learning_rate": 2.0617313850330067e-06, "loss": 1.0601, "step": 760 }, { "epoch": 0.8177445216461785, "grad_norm": 0.6168049573898315, "learning_rate": 1.949535203713474e-06, "loss": 1.0583, "step": 765 }, { "epoch": 0.8230892570817745, "grad_norm": 0.6288221478462219, "learning_rate": 1.8401474038202338e-06, "loss": 1.061, "step": 770 }, { "epoch": 0.8284339925173704, "grad_norm": 0.6249226927757263, "learning_rate": 1.7336061449717967e-06, "loss": 1.0555, "step": 775 }, { "epoch": 0.8337787279529664, "grad_norm": 0.6102626323699951, "learning_rate": 1.6299485937790505e-06, "loss": 1.0724, "step": 780 }, { "epoch": 0.8391234633885623, "grad_norm": 0.623283326625824, "learning_rate": 1.5292109108797726e-06, "loss": 1.0696, "step": 785 }, { "epoch": 0.8444681988241582, "grad_norm": 0.6192260384559631, "learning_rate": 1.4314282383241097e-06, "loss": 1.0768, "step": 790 }, { "epoch": 0.8498129342597541, "grad_norm": 0.6554280519485474, "learning_rate": 1.3366346873153703e-06, "loss": 1.0525, "step": 795 }, { "epoch": 0.8551576696953501, "grad_norm": 0.5920155048370361, "learning_rate": 1.2448633263104415e-06, "loss": 1.0531, "step": 800 }, { "epoch": 0.8551576696953501, "eval_loss": 1.0703972578048706, "eval_runtime": 298.1755, "eval_samples_per_second": 44.424, "eval_steps_per_second": 5.554, "step": 800 }, { "epoch": 0.860502405130946, "grad_norm": 0.6562113761901855, "learning_rate": 1.1561461694839304e-06, "loss": 1.0479, "step": 805 }, { "epoch": 0.865847140566542, "grad_norm": 0.5837886333465576, "learning_rate": 1.070514165560138e-06, "loss": 1.045, "step": 810 }, { "epoch": 0.8711918760021379, "grad_norm": 0.6308085918426514, "learning_rate": 9.879971870166628e-07, "loss": 1.0544, "step": 815 }, { "epoch": 0.8765366114377339, "grad_norm": 0.6370830535888672, "learning_rate": 9.086240196634899e-07, "loss": 1.0663, "step": 820 }, { "epoch": 0.8818813468733298, "grad_norm": 0.6129716634750366, "learning_rate": 8.324223526011321e-07, "loss": 1.059, "step": 825 }, { "epoch": 0.8872260823089257, "grad_norm": 0.615529477596283, "learning_rate": 7.594187685613763e-07, "loss": 1.06, "step": 830 }, { "epoch": 0.8925708177445216, "grad_norm": 0.6301218867301941, "learning_rate": 6.896387346339683e-07, "loss": 1.0409, "step": 835 }, { "epoch": 0.8979155531801176, "grad_norm": 0.6093020439147949, "learning_rate": 6.231065933824975e-07, "loss": 1.0489, "step": 840 }, { "epoch": 0.9032602886157135, "grad_norm": 0.6007770299911499, "learning_rate": 5.598455543525571e-07, "loss": 1.0489, "step": 845 }, { "epoch": 0.9086050240513095, "grad_norm": 0.6080732941627502, "learning_rate": 4.998776859751619e-07, "loss": 1.067, "step": 850 }, { "epoch": 0.9139497594869054, "grad_norm": 0.6055812835693359, "learning_rate": 4.4322390786824986e-07, "loss": 1.0577, "step": 855 }, { "epoch": 0.9192944949225014, "grad_norm": 0.608756959438324, "learning_rate": 3.8990398353891954e-07, "loss": 1.0603, "step": 860 }, { "epoch": 0.9246392303580973, "grad_norm": 0.5943930149078369, "learning_rate": 3.3993651348899537e-07, "loss": 1.0735, "step": 865 }, { "epoch": 0.9299839657936932, "grad_norm": 0.6033700108528137, "learning_rate": 2.9333892872629664e-07, "loss": 1.0772, "step": 870 }, { "epoch": 0.9353287012292891, "grad_norm": 0.6112857460975647, "learning_rate": 2.501274846838797e-07, "loss": 1.0584, "step": 875 }, { "epoch": 0.9406734366648851, "grad_norm": 0.6191092729568481, "learning_rate": 2.1031725554937378e-07, "loss": 1.0667, "step": 880 }, { "epoch": 0.946018172100481, "grad_norm": 0.5986559391021729, "learning_rate": 1.739221290063986e-07, "loss": 1.0548, "step": 885 }, { "epoch": 0.951362907536077, "grad_norm": 0.6083486676216125, "learning_rate": 1.4095480138988204e-07, "loss": 1.0605, "step": 890 }, { "epoch": 0.9567076429716729, "grad_norm": 0.6409095525741577, "learning_rate": 1.1142677325698514e-07, "loss": 1.0561, "step": 895 }, { "epoch": 0.9620523784072689, "grad_norm": 0.6235837340354919, "learning_rate": 8.534834537516246e-08, "loss": 1.0667, "step": 900 }, { "epoch": 0.9673971138428648, "grad_norm": 0.606450617313385, "learning_rate": 6.272861512876871e-08, "loss": 1.0558, "step": 905 }, { "epoch": 0.9727418492784607, "grad_norm": 0.6099078059196472, "learning_rate": 4.357547334546408e-08, "loss": 1.0739, "step": 910 }, { "epoch": 0.9780865847140566, "grad_norm": 0.6281602382659912, "learning_rate": 2.7895601543520557e-08, "loss": 1.0675, "step": 915 }, { "epoch": 0.9834313201496526, "grad_norm": 0.6282256245613098, "learning_rate": 1.56944696009953e-08, "loss": 1.0526, "step": 920 }, { "epoch": 0.9887760555852485, "grad_norm": 0.5956407785415649, "learning_rate": 6.976333847578121e-09, "loss": 1.0485, "step": 925 }, { "epoch": 0.9941207910208445, "grad_norm": 0.6351339221000671, "learning_rate": 1.7442355797825383e-09, "loss": 1.0796, "step": 930 }, { "epoch": 0.9994655264564404, "grad_norm": 0.580534040927887, "learning_rate": 0.0, "loss": 1.0454, "step": 935 }, { "epoch": 0.9994655264564404, "step": 935, "total_flos": 2.759237889794507e+18, "train_loss": 1.1061673424460672, "train_runtime": 11435.0508, "train_samples_per_second": 10.471, "train_steps_per_second": 0.082 } ], "logging_steps": 5, "max_steps": 935, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.759237889794507e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }