{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50.0, "global_step": 1185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025324469768914213, "grad_norm": 6.541903357166615, "learning_rate": 1.6666666666666668e-07, "loss": 1.6022449731826782, "step": 1 }, { "epoch": 0.012662234884457106, "grad_norm": 6.812644002932686, "learning_rate": 8.333333333333333e-07, "loss": 1.5844556093215942, "step": 5 }, { "epoch": 0.025324469768914212, "grad_norm": 5.463309643291768, "learning_rate": 1.6666666666666667e-06, "loss": 1.5758234024047852, "step": 10 }, { "epoch": 0.03798670465337132, "grad_norm": 3.5947186062437324, "learning_rate": 2.5e-06, "loss": 1.5148856163024902, "step": 15 }, { "epoch": 0.050648939537828425, "grad_norm": 2.022183241828244, "learning_rate": 3.3333333333333333e-06, "loss": 1.4563226699829102, "step": 20 }, { "epoch": 0.06331117442228554, "grad_norm": 2.568206156447999, "learning_rate": 4.166666666666667e-06, "loss": 1.4071508407592774, "step": 25 }, { "epoch": 0.07597340930674264, "grad_norm": 1.3172558883152181, "learning_rate": 5e-06, "loss": 1.361931037902832, "step": 30 }, { "epoch": 0.08863564419119975, "grad_norm": 1.289611427468001, "learning_rate": 5.833333333333334e-06, "loss": 1.3146369934082032, "step": 35 }, { "epoch": 0.10129787907565685, "grad_norm": 1.0096819520657572, "learning_rate": 6.666666666666667e-06, "loss": 1.2890718460083008, "step": 40 }, { "epoch": 0.11396011396011396, "grad_norm": 0.9312368002064222, "learning_rate": 7.500000000000001e-06, "loss": 1.262472152709961, "step": 45 }, { "epoch": 0.12662234884457108, "grad_norm": 0.9372677018897868, "learning_rate": 8.333333333333334e-06, "loss": 1.2449541091918945, "step": 50 }, { "epoch": 0.13928458372902816, "grad_norm": 1.156336182386965, "learning_rate": 9.166666666666666e-06, "loss": 1.23805570602417, "step": 55 }, { "epoch": 0.15194681861348527, "grad_norm": 1.2881982404625736, "learning_rate": 1e-05, "loss": 1.2132294654846192, "step": 60 }, { "epoch": 0.1646090534979424, "grad_norm": 1.0254220781070695, "learning_rate": 9.999512620046523e-06, "loss": 1.220973587036133, "step": 65 }, { "epoch": 0.1772712883823995, "grad_norm": 0.9489950684909466, "learning_rate": 9.998050575201772e-06, "loss": 1.2019853591918945, "step": 70 }, { "epoch": 0.1899335232668566, "grad_norm": 0.9316575478826806, "learning_rate": 9.995614150494293e-06, "loss": 1.2073640823364258, "step": 75 }, { "epoch": 0.2025957581513137, "grad_norm": 1.0042469325935621, "learning_rate": 9.992203820909906e-06, "loss": 1.1844447135925293, "step": 80 }, { "epoch": 0.2152579930357708, "grad_norm": 0.9710381702713043, "learning_rate": 9.987820251299121e-06, "loss": 1.1868626594543457, "step": 85 }, { "epoch": 0.22792022792022792, "grad_norm": 0.9015713419278082, "learning_rate": 9.982464296247523e-06, "loss": 1.16792631149292, "step": 90 }, { "epoch": 0.24058246280468504, "grad_norm": 0.9242205484551671, "learning_rate": 9.976136999909156e-06, "loss": 1.1806648254394532, "step": 95 }, { "epoch": 0.25324469768914215, "grad_norm": 0.8421714973436404, "learning_rate": 9.968839595802982e-06, "loss": 1.1688653945922851, "step": 100 }, { "epoch": 0.26590693257359926, "grad_norm": 0.9053511703432988, "learning_rate": 9.960573506572391e-06, "loss": 1.1603254318237304, "step": 105 }, { "epoch": 0.2785691674580563, "grad_norm": 0.8815755237366663, "learning_rate": 9.951340343707852e-06, "loss": 1.1436431884765625, "step": 110 }, { "epoch": 0.29123140234251343, "grad_norm": 0.9133167544949871, "learning_rate": 9.941141907232766e-06, "loss": 1.1711238861083983, "step": 115 }, { "epoch": 0.30389363722697055, "grad_norm": 0.9280708661664501, "learning_rate": 9.929980185352525e-06, "loss": 1.1607641220092773, "step": 120 }, { "epoch": 0.31655587211142766, "grad_norm": 0.8789051540869617, "learning_rate": 9.91785735406693e-06, "loss": 1.1655372619628905, "step": 125 }, { "epoch": 0.3292181069958848, "grad_norm": 0.9387606971380588, "learning_rate": 9.904775776745959e-06, "loss": 1.1415754318237306, "step": 130 }, { "epoch": 0.3418803418803419, "grad_norm": 0.8962535961715238, "learning_rate": 9.890738003669029e-06, "loss": 1.141004753112793, "step": 135 }, { "epoch": 0.354542576764799, "grad_norm": 0.8628618510513137, "learning_rate": 9.875746771527817e-06, "loss": 1.1703954696655274, "step": 140 }, { "epoch": 0.3672048116492561, "grad_norm": 0.901181222022341, "learning_rate": 9.859805002892733e-06, "loss": 1.1528019905090332, "step": 145 }, { "epoch": 0.3798670465337132, "grad_norm": 0.8630918009712893, "learning_rate": 9.842915805643156e-06, "loss": 1.1367189407348632, "step": 150 }, { "epoch": 0.3925292814181703, "grad_norm": 0.8702195806012554, "learning_rate": 9.825082472361558e-06, "loss": 1.1533798217773437, "step": 155 }, { "epoch": 0.4051915163026274, "grad_norm": 0.8708614692916694, "learning_rate": 9.806308479691595e-06, "loss": 1.158640480041504, "step": 160 }, { "epoch": 0.4178537511870845, "grad_norm": 0.8793848769376316, "learning_rate": 9.786597487660336e-06, "loss": 1.1480545043945312, "step": 165 }, { "epoch": 0.4305159860715416, "grad_norm": 0.8658001687150836, "learning_rate": 9.765953338964736e-06, "loss": 1.1336278915405273, "step": 170 }, { "epoch": 0.44317822095599874, "grad_norm": 0.8569493052828222, "learning_rate": 9.744380058222483e-06, "loss": 1.1366922378540039, "step": 175 }, { "epoch": 0.45584045584045585, "grad_norm": 0.8658238768368638, "learning_rate": 9.721881851187406e-06, "loss": 1.1221330642700196, "step": 180 }, { "epoch": 0.46850269072491296, "grad_norm": 0.8315025062463812, "learning_rate": 9.698463103929542e-06, "loss": 1.137201690673828, "step": 185 }, { "epoch": 0.4811649256093701, "grad_norm": 0.8646733066379476, "learning_rate": 9.674128381980073e-06, "loss": 1.1246437072753905, "step": 190 }, { "epoch": 0.49382716049382713, "grad_norm": 0.9329613102085004, "learning_rate": 9.648882429441258e-06, "loss": 1.1196226119995116, "step": 195 }, { "epoch": 0.5064893953782843, "grad_norm": 0.8893896484251661, "learning_rate": 9.622730168061568e-06, "loss": 1.1334550857543946, "step": 200 }, { "epoch": 0.5191516302627414, "grad_norm": 0.912333387639604, "learning_rate": 9.595676696276173e-06, "loss": 1.1253994941711425, "step": 205 }, { "epoch": 0.5318138651471985, "grad_norm": 0.982396926968246, "learning_rate": 9.567727288213005e-06, "loss": 1.1222535133361817, "step": 210 }, { "epoch": 0.5444761000316556, "grad_norm": 0.885141191565451, "learning_rate": 9.538887392664544e-06, "loss": 1.1143704414367677, "step": 215 }, { "epoch": 0.5571383349161126, "grad_norm": 0.840306231211871, "learning_rate": 9.50916263202557e-06, "loss": 1.1145578384399415, "step": 220 }, { "epoch": 0.5698005698005698, "grad_norm": 0.873418768799577, "learning_rate": 9.478558801197065e-06, "loss": 1.1184951782226562, "step": 225 }, { "epoch": 0.5824628046850269, "grad_norm": 0.8644731775393019, "learning_rate": 9.44708186645649e-06, "loss": 1.1118096351623534, "step": 230 }, { "epoch": 0.595125039569484, "grad_norm": 0.8383543019686877, "learning_rate": 9.414737964294636e-06, "loss": 1.1120855331420898, "step": 235 }, { "epoch": 0.6077872744539411, "grad_norm": 0.8339381439594867, "learning_rate": 9.381533400219319e-06, "loss": 1.0976166725158691, "step": 240 }, { "epoch": 0.6204495093383983, "grad_norm": 1.2527861157729694, "learning_rate": 9.347474647526095e-06, "loss": 1.1195283889770509, "step": 245 }, { "epoch": 0.6331117442228553, "grad_norm": 0.896892265554505, "learning_rate": 9.312568346036288e-06, "loss": 1.1280832290649414, "step": 250 }, { "epoch": 0.6457739791073125, "grad_norm": 0.863217024084411, "learning_rate": 9.276821300802535e-06, "loss": 1.1169985771179198, "step": 255 }, { "epoch": 0.6584362139917695, "grad_norm": 0.8613522109819245, "learning_rate": 9.24024048078213e-06, "loss": 1.110457420349121, "step": 260 }, { "epoch": 0.6710984488762266, "grad_norm": 0.8269568651408957, "learning_rate": 9.202833017478421e-06, "loss": 1.1079233169555665, "step": 265 }, { "epoch": 0.6837606837606838, "grad_norm": 0.9106153459573166, "learning_rate": 9.164606203550498e-06, "loss": 1.115132713317871, "step": 270 }, { "epoch": 0.6964229186451408, "grad_norm": 0.8475270076896408, "learning_rate": 9.125567491391476e-06, "loss": 1.114927101135254, "step": 275 }, { "epoch": 0.709085153529598, "grad_norm": 0.8419303390301319, "learning_rate": 9.085724491675642e-06, "loss": 1.1053291320800782, "step": 280 }, { "epoch": 0.7217473884140551, "grad_norm": 0.8793202963091465, "learning_rate": 9.045084971874738e-06, "loss": 1.1043977737426758, "step": 285 }, { "epoch": 0.7344096232985122, "grad_norm": 0.8844837887961337, "learning_rate": 9.003656854743667e-06, "loss": 1.0930152893066407, "step": 290 }, { "epoch": 0.7470718581829693, "grad_norm": 0.8243068254935355, "learning_rate": 8.961448216775955e-06, "loss": 1.1083423614501953, "step": 295 }, { "epoch": 0.7597340930674265, "grad_norm": 0.8231480643363308, "learning_rate": 8.9184672866292e-06, "loss": 1.093316650390625, "step": 300 }, { "epoch": 0.7723963279518835, "grad_norm": 0.856487105562053, "learning_rate": 8.874722443520898e-06, "loss": 1.0935728073120117, "step": 305 }, { "epoch": 0.7850585628363406, "grad_norm": 0.9069069891533103, "learning_rate": 8.83022221559489e-06, "loss": 1.085923957824707, "step": 310 }, { "epoch": 0.7977207977207977, "grad_norm": 0.8415924258567633, "learning_rate": 8.784975278258783e-06, "loss": 1.1055352210998535, "step": 315 }, { "epoch": 0.8103830326052548, "grad_norm": 0.8547842295217172, "learning_rate": 8.73899045249266e-06, "loss": 1.1053098678588866, "step": 320 }, { "epoch": 0.823045267489712, "grad_norm": 0.9042040663099864, "learning_rate": 8.692276703129421e-06, "loss": 1.100543212890625, "step": 325 }, { "epoch": 0.835707502374169, "grad_norm": 0.840156677605529, "learning_rate": 8.644843137107058e-06, "loss": 1.1007650375366211, "step": 330 }, { "epoch": 0.8483697372586262, "grad_norm": 0.8554168041829401, "learning_rate": 8.596699001693257e-06, "loss": 1.095210647583008, "step": 335 }, { "epoch": 0.8610319721430832, "grad_norm": 0.8378136162576828, "learning_rate": 8.547853682682605e-06, "loss": 1.0945035934448242, "step": 340 }, { "epoch": 0.8736942070275404, "grad_norm": 0.8300982370825878, "learning_rate": 8.498316702566828e-06, "loss": 1.0824993133544922, "step": 345 }, { "epoch": 0.8863564419119975, "grad_norm": 0.8879949006435145, "learning_rate": 8.44809771867835e-06, "loss": 1.0910042762756347, "step": 350 }, { "epoch": 0.8990186767964545, "grad_norm": 0.8363110809635331, "learning_rate": 8.397206521307584e-06, "loss": 1.085635280609131, "step": 355 }, { "epoch": 0.9116809116809117, "grad_norm": 0.8250978511317656, "learning_rate": 8.345653031794292e-06, "loss": 1.0832603454589844, "step": 360 }, { "epoch": 0.9243431465653688, "grad_norm": 0.8250625494950978, "learning_rate": 8.293447300593402e-06, "loss": 1.0881545066833496, "step": 365 }, { "epoch": 0.9370053814498259, "grad_norm": 0.9637417812174898, "learning_rate": 8.240599505315656e-06, "loss": 1.077590274810791, "step": 370 }, { "epoch": 0.949667616334283, "grad_norm": 0.938188486575515, "learning_rate": 8.18711994874345e-06, "loss": 1.0923616409301757, "step": 375 }, { "epoch": 0.9623298512187402, "grad_norm": 0.829053167214024, "learning_rate": 8.133019056822303e-06, "loss": 1.0790325164794923, "step": 380 }, { "epoch": 0.9749920861031972, "grad_norm": 0.8296874845053457, "learning_rate": 8.078307376628292e-06, "loss": 1.0690267562866211, "step": 385 }, { "epoch": 0.9876543209876543, "grad_norm": 0.8248755231512207, "learning_rate": 8.022995574311876e-06, "loss": 1.0922147750854492, "step": 390 }, { "epoch": 1.0, "grad_norm": 0.9123714875418006, "learning_rate": 7.967094433018508e-06, "loss": 1.0716293334960938, "step": 395 }, { "epoch": 1.0126622348844572, "grad_norm": 0.8825626316822892, "learning_rate": 7.910614850786448e-06, "loss": 0.9421855926513671, "step": 400 }, { "epoch": 1.0253244697689141, "grad_norm": 0.981129259243819, "learning_rate": 7.85356783842216e-06, "loss": 0.9680027008056641, "step": 405 }, { "epoch": 1.0379867046533713, "grad_norm": 0.9490494582638624, "learning_rate": 7.795964517353734e-06, "loss": 0.9392026901245117, "step": 410 }, { "epoch": 1.0506489395378285, "grad_norm": 1.0436527309713077, "learning_rate": 7.737816117462752e-06, "loss": 0.9481110572814941, "step": 415 }, { "epoch": 1.0633111744222856, "grad_norm": 0.9193717140597131, "learning_rate": 7.679133974894984e-06, "loss": 0.9479268074035645, "step": 420 }, { "epoch": 1.0759734093067426, "grad_norm": 0.9176846478769476, "learning_rate": 7.619929529850397e-06, "loss": 0.9510162353515625, "step": 425 }, { "epoch": 1.0886356441911997, "grad_norm": 0.9263690784461404, "learning_rate": 7.560214324352858e-06, "loss": 0.9560428619384765, "step": 430 }, { "epoch": 1.101297879075657, "grad_norm": 0.8985018721390384, "learning_rate": 7.500000000000001e-06, "loss": 0.9549171447753906, "step": 435 }, { "epoch": 1.1139601139601139, "grad_norm": 0.8383045792822509, "learning_rate": 7.4392982956936644e-06, "loss": 0.9572299957275391, "step": 440 }, { "epoch": 1.126622348844571, "grad_norm": 0.8693402459631241, "learning_rate": 7.378121045351378e-06, "loss": 0.9538370132446289, "step": 445 }, { "epoch": 1.1392845837290282, "grad_norm": 0.8465948151936904, "learning_rate": 7.31648017559931e-06, "loss": 0.9445423126220703, "step": 450 }, { "epoch": 1.1519468186134854, "grad_norm": 0.8993258971886791, "learning_rate": 7.254387703447154e-06, "loss": 0.9402847290039062, "step": 455 }, { "epoch": 1.1646090534979423, "grad_norm": 0.8973654441260622, "learning_rate": 7.191855733945388e-06, "loss": 0.9458431243896485, "step": 460 }, { "epoch": 1.1772712883823995, "grad_norm": 0.8975789539843146, "learning_rate": 7.128896457825364e-06, "loss": 0.9456979751586914, "step": 465 }, { "epoch": 1.1899335232668566, "grad_norm": 0.9025883974896288, "learning_rate": 7.06552214912271e-06, "loss": 0.958702278137207, "step": 470 }, { "epoch": 1.2025957581513138, "grad_norm": 0.8943619241590697, "learning_rate": 7.0017451627844765e-06, "loss": 0.9409778594970704, "step": 475 }, { "epoch": 1.2152579930357708, "grad_norm": 0.8987697465779751, "learning_rate": 6.9375779322605154e-06, "loss": 0.952575397491455, "step": 480 }, { "epoch": 1.227920227920228, "grad_norm": 0.8957262384243423, "learning_rate": 6.873032967079562e-06, "loss": 0.9412460327148438, "step": 485 }, { "epoch": 1.240582462804685, "grad_norm": 0.9191287064439484, "learning_rate": 6.808122850410461e-06, "loss": 0.9442897796630859, "step": 490 }, { "epoch": 1.253244697689142, "grad_norm": 0.9120111224616239, "learning_rate": 6.7428602366090764e-06, "loss": 0.9721967697143554, "step": 495 }, { "epoch": 1.2659069325735992, "grad_norm": 0.9297557344562997, "learning_rate": 6.677257848751276e-06, "loss": 0.9427990913391113, "step": 500 }, { "epoch": 1.2785691674580564, "grad_norm": 0.9256360350131605, "learning_rate": 6.611328476152557e-06, "loss": 0.9448193550109864, "step": 505 }, { "epoch": 1.2912314023425133, "grad_norm": 0.9178166712574457, "learning_rate": 6.545084971874738e-06, "loss": 0.9285225868225098, "step": 510 }, { "epoch": 1.3038936372269705, "grad_norm": 0.8824737418151191, "learning_rate": 6.4785402502202345e-06, "loss": 0.9465466499328613, "step": 515 }, { "epoch": 1.3165558721114277, "grad_norm": 0.8714305178817582, "learning_rate": 6.411707284214384e-06, "loss": 0.9558137893676758, "step": 520 }, { "epoch": 1.3292181069958848, "grad_norm": 1.6420471551581535, "learning_rate": 6.344599103076329e-06, "loss": 0.9441043853759765, "step": 525 }, { "epoch": 1.341880341880342, "grad_norm": 0.8940534993249484, "learning_rate": 6.277228789678953e-06, "loss": 0.9406339645385742, "step": 530 }, { "epoch": 1.354542576764799, "grad_norm": 0.8657105103377609, "learning_rate": 6.209609477998339e-06, "loss": 0.9400988578796386, "step": 535 }, { "epoch": 1.3672048116492561, "grad_norm": 0.8795303497602281, "learning_rate": 6.141754350553279e-06, "loss": 0.9375904083251954, "step": 540 }, { "epoch": 1.3798670465337133, "grad_norm": 0.8778881000839949, "learning_rate": 6.073676635835317e-06, "loss": 0.9534420013427735, "step": 545 }, { "epoch": 1.3925292814181702, "grad_norm": 0.8609329406866304, "learning_rate": 6.005389605729824e-06, "loss": 0.9435734748840332, "step": 550 }, { "epoch": 1.4051915163026274, "grad_norm": 0.901450340070586, "learning_rate": 5.936906572928625e-06, "loss": 0.9454706192016602, "step": 555 }, { "epoch": 1.4178537511870846, "grad_norm": 0.9056724009579911, "learning_rate": 5.8682408883346535e-06, "loss": 0.9358626365661621, "step": 560 }, { "epoch": 1.4305159860715415, "grad_norm": 0.8767791922734569, "learning_rate": 5.799405938459175e-06, "loss": 0.9384665489196777, "step": 565 }, { "epoch": 1.4431782209559987, "grad_norm": 0.9226108292554362, "learning_rate": 5.730415142812059e-06, "loss": 0.9389400482177734, "step": 570 }, { "epoch": 1.4558404558404558, "grad_norm": 0.8635227126945888, "learning_rate": 5.661281951285613e-06, "loss": 0.9539518356323242, "step": 575 }, { "epoch": 1.468502690724913, "grad_norm": 0.8840260265705664, "learning_rate": 5.592019841532507e-06, "loss": 0.9480253219604492, "step": 580 }, { "epoch": 1.4811649256093702, "grad_norm": 0.9151680057009149, "learning_rate": 5.522642316338268e-06, "loss": 0.9404661178588867, "step": 585 }, { "epoch": 1.4938271604938271, "grad_norm": 0.9450262697016882, "learning_rate": 5.453162900988902e-06, "loss": 0.9321787834167481, "step": 590 }, { "epoch": 1.5064893953782843, "grad_norm": 0.8402436559360018, "learning_rate": 5.383595140634093e-06, "loss": 0.9440553665161133, "step": 595 }, { "epoch": 1.5191516302627415, "grad_norm": 0.8778976142471068, "learning_rate": 5.3139525976465675e-06, "loss": 0.9511254310607911, "step": 600 }, { "epoch": 1.5318138651471984, "grad_norm": 0.8781843644707367, "learning_rate": 5.244248848978067e-06, "loss": 0.9387626647949219, "step": 605 }, { "epoch": 1.5444761000316556, "grad_norm": 0.8642449781808372, "learning_rate": 5.174497483512506e-06, "loss": 0.956205177307129, "step": 610 }, { "epoch": 1.5571383349161128, "grad_norm": 0.8846802147972775, "learning_rate": 5.1047120994167855e-06, "loss": 0.9363911628723145, "step": 615 }, { "epoch": 1.5698005698005697, "grad_norm": 0.8739137758439613, "learning_rate": 5.034906301489808e-06, "loss": 0.9367790222167969, "step": 620 }, { "epoch": 1.5824628046850269, "grad_norm": 0.8953494651595788, "learning_rate": 4.965093698510192e-06, "loss": 0.9425483703613281, "step": 625 }, { "epoch": 1.595125039569484, "grad_norm": 0.8615421639128288, "learning_rate": 4.895287900583216e-06, "loss": 0.9341062545776367, "step": 630 }, { "epoch": 1.607787274453941, "grad_norm": 0.8353360832306662, "learning_rate": 4.825502516487497e-06, "loss": 0.949849796295166, "step": 635 }, { "epoch": 1.6204495093383984, "grad_norm": 0.8563998366304418, "learning_rate": 4.755751151021934e-06, "loss": 0.9409940719604493, "step": 640 }, { "epoch": 1.6331117442228553, "grad_norm": 0.9360183967885729, "learning_rate": 4.686047402353433e-06, "loss": 0.939891242980957, "step": 645 }, { "epoch": 1.6457739791073125, "grad_norm": 0.8806457976411894, "learning_rate": 4.6164048593659076e-06, "loss": 0.952726173400879, "step": 650 }, { "epoch": 1.6584362139917697, "grad_norm": 0.8871650293826654, "learning_rate": 4.546837099011101e-06, "loss": 0.9440122604370117, "step": 655 }, { "epoch": 1.6710984488762266, "grad_norm": 0.8543495337665787, "learning_rate": 4.477357683661734e-06, "loss": 0.9277559280395508, "step": 660 }, { "epoch": 1.6837606837606838, "grad_norm": 0.8754310619944701, "learning_rate": 4.4079801584674955e-06, "loss": 0.9328133583068847, "step": 665 }, { "epoch": 1.696422918645141, "grad_norm": 0.846881206379322, "learning_rate": 4.3387180487143875e-06, "loss": 0.9440486907958985, "step": 670 }, { "epoch": 1.709085153529598, "grad_norm": 0.8123484252146217, "learning_rate": 4.269584857187942e-06, "loss": 0.9334369659423828, "step": 675 }, { "epoch": 1.721747388414055, "grad_norm": 0.8860941606484654, "learning_rate": 4.200594061540827e-06, "loss": 0.9386373519897461, "step": 680 }, { "epoch": 1.7344096232985122, "grad_norm": 0.8710977899292981, "learning_rate": 4.131759111665349e-06, "loss": 0.9379000663757324, "step": 685 }, { "epoch": 1.7470718581829692, "grad_norm": 0.8989668390644706, "learning_rate": 4.063093427071376e-06, "loss": 0.9351366043090821, "step": 690 }, { "epoch": 1.7597340930674266, "grad_norm": 0.8426262295188102, "learning_rate": 3.994610394270178e-06, "loss": 0.9458501815795899, "step": 695 }, { "epoch": 1.7723963279518835, "grad_norm": 0.8490556601435445, "learning_rate": 3.926323364164684e-06, "loss": 0.9344646453857421, "step": 700 }, { "epoch": 1.7850585628363405, "grad_norm": 0.857013306646358, "learning_rate": 3.8582456494467214e-06, "loss": 0.9324585914611816, "step": 705 }, { "epoch": 1.7977207977207978, "grad_norm": 0.8442075171060656, "learning_rate": 3.790390522001662e-06, "loss": 0.9345897674560547, "step": 710 }, { "epoch": 1.8103830326052548, "grad_norm": 0.8635838902214552, "learning_rate": 3.7227712103210485e-06, "loss": 0.9480118751525879, "step": 715 }, { "epoch": 1.823045267489712, "grad_norm": 0.8701785787205291, "learning_rate": 3.655400896923672e-06, "loss": 0.9411863327026367, "step": 720 }, { "epoch": 1.8357075023741691, "grad_norm": 0.9278897279843371, "learning_rate": 3.5882927157856175e-06, "loss": 0.9384016036987305, "step": 725 }, { "epoch": 1.848369737258626, "grad_norm": 0.8675201640437896, "learning_rate": 3.521459749779769e-06, "loss": 0.9388191223144531, "step": 730 }, { "epoch": 1.8610319721430832, "grad_norm": 0.9047480946293855, "learning_rate": 3.4549150281252635e-06, "loss": 0.943515396118164, "step": 735 }, { "epoch": 1.8736942070275404, "grad_norm": 0.9100256206799584, "learning_rate": 3.3886715238474454e-06, "loss": 0.9317167282104493, "step": 740 }, { "epoch": 1.8863564419119974, "grad_norm": 0.9121240599713055, "learning_rate": 3.322742151248726e-06, "loss": 0.9298182487487793, "step": 745 }, { "epoch": 1.8990186767964545, "grad_norm": 0.8360632961222116, "learning_rate": 3.2571397633909252e-06, "loss": 0.9383123397827149, "step": 750 }, { "epoch": 1.9116809116809117, "grad_norm": 0.8449980062948027, "learning_rate": 3.1918771495895395e-06, "loss": 0.9380681991577149, "step": 755 }, { "epoch": 1.9243431465653686, "grad_norm": 0.8358057866853585, "learning_rate": 3.12696703292044e-06, "loss": 0.9311031341552735, "step": 760 }, { "epoch": 1.937005381449826, "grad_norm": 0.8261214369483678, "learning_rate": 3.0624220677394854e-06, "loss": 0.9335260391235352, "step": 765 }, { "epoch": 1.949667616334283, "grad_norm": 0.8746978630306859, "learning_rate": 2.9982548372155264e-06, "loss": 0.9282594680786133, "step": 770 }, { "epoch": 1.9623298512187402, "grad_norm": 0.8914685495920053, "learning_rate": 2.934477850877292e-06, "loss": 0.9267834663391114, "step": 775 }, { "epoch": 1.9749920861031973, "grad_norm": 0.8730909900000534, "learning_rate": 2.871103542174637e-06, "loss": 0.9400104522705078, "step": 780 }, { "epoch": 1.9876543209876543, "grad_norm": 0.9195388866817068, "learning_rate": 2.8081442660546126e-06, "loss": 0.9355339050292969, "step": 785 }, { "epoch": 2.0, "grad_norm": 0.8941990040688051, "learning_rate": 2.7456122965528475e-06, "loss": 0.9464699745178222, "step": 790 }, { "epoch": 2.012662234884457, "grad_norm": 0.9551299167570609, "learning_rate": 2.683519824400693e-06, "loss": 0.8369241714477539, "step": 795 }, { "epoch": 2.0253244697689143, "grad_norm": 0.9503417747763285, "learning_rate": 2.6218789546486235e-06, "loss": 0.8305461883544922, "step": 800 }, { "epoch": 2.0379867046533713, "grad_norm": 0.9428708677587196, "learning_rate": 2.560701704306336e-06, "loss": 0.8380617141723633, "step": 805 }, { "epoch": 2.0506489395378282, "grad_norm": 0.9141118129164282, "learning_rate": 2.5000000000000015e-06, "loss": 0.8350756645202637, "step": 810 }, { "epoch": 2.0633111744222856, "grad_norm": 0.8925703133521584, "learning_rate": 2.4397856756471435e-06, "loss": 0.8253829956054688, "step": 815 }, { "epoch": 2.0759734093067426, "grad_norm": 0.9010434641718755, "learning_rate": 2.380070470149605e-06, "loss": 0.8296566009521484, "step": 820 }, { "epoch": 2.0886356441912, "grad_norm": 0.9108639104627096, "learning_rate": 2.320866025105016e-06, "loss": 0.8311027526855469, "step": 825 }, { "epoch": 2.101297879075657, "grad_norm": 0.8776111588691332, "learning_rate": 2.2621838825372496e-06, "loss": 0.8341006278991699, "step": 830 }, { "epoch": 2.113960113960114, "grad_norm": 0.9815553215946904, "learning_rate": 2.204035482646267e-06, "loss": 0.8500799179077149, "step": 835 }, { "epoch": 2.1266223488445712, "grad_norm": 0.9142581836968815, "learning_rate": 2.146432161577842e-06, "loss": 0.8405605316162109, "step": 840 }, { "epoch": 2.139284583729028, "grad_norm": 1.145797436281295, "learning_rate": 2.0893851492135536e-06, "loss": 0.8333783149719238, "step": 845 }, { "epoch": 2.151946818613485, "grad_norm": 1.0132915822668673, "learning_rate": 2.0329055669814936e-06, "loss": 0.8394683837890625, "step": 850 }, { "epoch": 2.1646090534979425, "grad_norm": 1.054834919380244, "learning_rate": 1.977004425688126e-06, "loss": 0.8199810028076172, "step": 855 }, { "epoch": 2.1772712883823995, "grad_norm": 0.8659196804614238, "learning_rate": 1.9216926233717087e-06, "loss": 0.8200090408325196, "step": 860 }, { "epoch": 2.1899335232668564, "grad_norm": 0.8936160842705467, "learning_rate": 1.8669809431776991e-06, "loss": 0.819823932647705, "step": 865 }, { "epoch": 2.202595758151314, "grad_norm": 0.8936829197489651, "learning_rate": 1.8128800512565514e-06, "loss": 0.8329672813415527, "step": 870 }, { "epoch": 2.2152579930357708, "grad_norm": 0.865124596235692, "learning_rate": 1.7594004946843458e-06, "loss": 0.830903434753418, "step": 875 }, { "epoch": 2.2279202279202277, "grad_norm": 1.0004908923945968, "learning_rate": 1.7065526994065973e-06, "loss": 0.8222661972045898, "step": 880 }, { "epoch": 2.240582462804685, "grad_norm": 0.9974019476784539, "learning_rate": 1.6543469682057105e-06, "loss": 0.8375696182250977, "step": 885 }, { "epoch": 2.253244697689142, "grad_norm": 0.9489943802555122, "learning_rate": 1.6027934786924187e-06, "loss": 0.8297539710998535, "step": 890 }, { "epoch": 2.2659069325735994, "grad_norm": 0.8526558052313017, "learning_rate": 1.551902281321651e-06, "loss": 0.8450464248657227, "step": 895 }, { "epoch": 2.2785691674580564, "grad_norm": 0.9095006101158244, "learning_rate": 1.5016832974331725e-06, "loss": 0.8434087753295898, "step": 900 }, { "epoch": 2.2912314023425133, "grad_norm": 0.8941646461803728, "learning_rate": 1.4521463173173966e-06, "loss": 0.8199748992919922, "step": 905 }, { "epoch": 2.3038936372269707, "grad_norm": 1.0151629908802393, "learning_rate": 1.4033009983067454e-06, "loss": 0.8257926940917969, "step": 910 }, { "epoch": 2.3165558721114277, "grad_norm": 0.8688789281578927, "learning_rate": 1.3551568628929434e-06, "loss": 0.8222599029541016, "step": 915 }, { "epoch": 2.3292181069958846, "grad_norm": 0.9256160248355862, "learning_rate": 1.3077232968705805e-06, "loss": 0.8179254531860352, "step": 920 }, { "epoch": 2.341880341880342, "grad_norm": 0.8992368646832662, "learning_rate": 1.2610095475073415e-06, "loss": 0.8351934432983399, "step": 925 }, { "epoch": 2.354542576764799, "grad_norm": 0.8950940935118609, "learning_rate": 1.2150247217412186e-06, "loss": 0.8317380905151367, "step": 930 }, { "epoch": 2.3672048116492563, "grad_norm": 0.8832980241323902, "learning_rate": 1.1697777844051105e-06, "loss": 0.8315254211425781, "step": 935 }, { "epoch": 2.3798670465337133, "grad_norm": 0.9169712891318174, "learning_rate": 1.1252775564791023e-06, "loss": 0.8270421981811523, "step": 940 }, { "epoch": 2.3925292814181702, "grad_norm": 0.8820889479067028, "learning_rate": 1.0815327133708015e-06, "loss": 0.8373619079589844, "step": 945 }, { "epoch": 2.4051915163026276, "grad_norm": 0.8633797543130399, "learning_rate": 1.0385517832240472e-06, "loss": 0.822084617614746, "step": 950 }, { "epoch": 2.4178537511870846, "grad_norm": 0.883673978191503, "learning_rate": 9.963431452563331e-07, "loss": 0.8369743347167968, "step": 955 }, { "epoch": 2.4305159860715415, "grad_norm": 0.8729883350584554, "learning_rate": 9.549150281252633e-07, "loss": 0.8232148170471192, "step": 960 }, { "epoch": 2.443178220955999, "grad_norm": 0.9022268286398805, "learning_rate": 9.142755083243577e-07, "loss": 0.8386312484741211, "step": 965 }, { "epoch": 2.455840455840456, "grad_norm": 0.886835830092829, "learning_rate": 8.744325086085248e-07, "loss": 0.8283025741577148, "step": 970 }, { "epoch": 2.468502690724913, "grad_norm": 0.8893818705075751, "learning_rate": 8.353937964495029e-07, "loss": 0.8303911209106445, "step": 975 }, { "epoch": 2.48116492560937, "grad_norm": 0.922324297214932, "learning_rate": 7.971669825215789e-07, "loss": 0.836126708984375, "step": 980 }, { "epoch": 2.493827160493827, "grad_norm": 0.904255523759268, "learning_rate": 7.597595192178702e-07, "loss": 0.8196451187133789, "step": 985 }, { "epoch": 2.506489395378284, "grad_norm": 0.8636934543130641, "learning_rate": 7.23178699197467e-07, "loss": 0.8335494995117188, "step": 990 }, { "epoch": 2.5191516302627415, "grad_norm": 0.897039019918768, "learning_rate": 6.874316539637127e-07, "loss": 0.8088079452514648, "step": 995 }, { "epoch": 2.5318138651471984, "grad_norm": 0.8864019874531589, "learning_rate": 6.52525352473905e-07, "loss": 0.8233877182006836, "step": 1000 }, { "epoch": 2.5444761000316554, "grad_norm": 0.9088661368290617, "learning_rate": 6.184665997806832e-07, "loss": 0.8182021141052246, "step": 1005 }, { "epoch": 2.5571383349161128, "grad_norm": 0.9072624944819723, "learning_rate": 5.852620357053651e-07, "loss": 0.835714054107666, "step": 1010 }, { "epoch": 2.5698005698005697, "grad_norm": 0.8731742132357438, "learning_rate": 5.529181335435124e-07, "loss": 0.8283638000488281, "step": 1015 }, { "epoch": 2.5824628046850266, "grad_norm": 0.8761704900525855, "learning_rate": 5.214411988029355e-07, "loss": 0.828251838684082, "step": 1020 }, { "epoch": 2.595125039569484, "grad_norm": 0.8629998756845685, "learning_rate": 4.908373679744316e-07, "loss": 0.8239392280578614, "step": 1025 }, { "epoch": 2.607787274453941, "grad_norm": 0.9025713028894049, "learning_rate": 4.6111260733545714e-07, "loss": 0.8368805885314942, "step": 1030 }, { "epoch": 2.6204495093383984, "grad_norm": 0.8791508721949534, "learning_rate": 4.322727117869951e-07, "loss": 0.8214786529541016, "step": 1035 }, { "epoch": 2.6331117442228553, "grad_norm": 0.8747271828916487, "learning_rate": 4.043233037238281e-07, "loss": 0.8331809997558594, "step": 1040 }, { "epoch": 2.6457739791073127, "grad_norm": 0.9189023842289675, "learning_rate": 3.772698319384349e-07, "loss": 0.8299878120422364, "step": 1045 }, { "epoch": 2.6584362139917697, "grad_norm": 0.9012554611673713, "learning_rate": 3.511175705587433e-07, "loss": 0.8398582458496093, "step": 1050 }, { "epoch": 2.6710984488762266, "grad_norm": 0.8857980961838654, "learning_rate": 3.258716180199278e-07, "loss": 0.818387794494629, "step": 1055 }, { "epoch": 2.683760683760684, "grad_norm": 0.8627721513188427, "learning_rate": 3.015368960704584e-07, "loss": 0.8408231735229492, "step": 1060 }, { "epoch": 2.696422918645141, "grad_norm": 1.0338964144567573, "learning_rate": 2.7811814881259503e-07, "loss": 0.8292581558227539, "step": 1065 }, { "epoch": 2.709085153529598, "grad_norm": 0.8858535803633547, "learning_rate": 2.556199417775174e-07, "loss": 0.8229169845581055, "step": 1070 }, { "epoch": 2.7217473884140553, "grad_norm": 0.8487724578022646, "learning_rate": 2.3404666103526542e-07, "loss": 0.8243260383605957, "step": 1075 }, { "epoch": 2.7344096232985122, "grad_norm": 0.9142856277892702, "learning_rate": 2.134025123396638e-07, "loss": 0.8411771774291992, "step": 1080 }, { "epoch": 2.747071858182969, "grad_norm": 0.9044836277079623, "learning_rate": 1.9369152030840553e-07, "loss": 0.8182785034179687, "step": 1085 }, { "epoch": 2.7597340930674266, "grad_norm": 0.8753811947010942, "learning_rate": 1.7491752763844294e-07, "loss": 0.8330059051513672, "step": 1090 }, { "epoch": 2.7723963279518835, "grad_norm": 0.9127704993164228, "learning_rate": 1.5708419435684463e-07, "loss": 0.8270849227905274, "step": 1095 }, { "epoch": 2.7850585628363405, "grad_norm": 0.8885180442416001, "learning_rate": 1.4019499710726913e-07, "loss": 0.8345333099365234, "step": 1100 }, { "epoch": 2.797720797720798, "grad_norm": 0.9121582122101339, "learning_rate": 1.2425322847218368e-07, "loss": 0.8229399681091308, "step": 1105 }, { "epoch": 2.810383032605255, "grad_norm": 0.9127878209504691, "learning_rate": 1.0926199633097156e-07, "loss": 0.82467041015625, "step": 1110 }, { "epoch": 2.8230452674897117, "grad_norm": 1.744776544604879, "learning_rate": 9.522422325404234e-08, "loss": 0.8274450302124023, "step": 1115 }, { "epoch": 2.835707502374169, "grad_norm": 0.8982447409078338, "learning_rate": 8.214264593307097e-08, "loss": 0.8290293693542481, "step": 1120 }, { "epoch": 2.848369737258626, "grad_norm": 0.8866891213730514, "learning_rate": 7.001981464747565e-08, "loss": 0.8212656021118164, "step": 1125 }, { "epoch": 2.861031972143083, "grad_norm": 0.9114969683085143, "learning_rate": 5.8858092767236084e-08, "loss": 0.8231026649475097, "step": 1130 }, { "epoch": 2.8736942070275404, "grad_norm": 0.8832450189167574, "learning_rate": 4.865965629214819e-08, "loss": 0.830931282043457, "step": 1135 }, { "epoch": 2.8863564419119974, "grad_norm": 0.8487481576996411, "learning_rate": 3.9426493427611177e-08, "loss": 0.8255987167358398, "step": 1140 }, { "epoch": 2.8990186767964543, "grad_norm": 0.879731012515658, "learning_rate": 3.1160404197018155e-08, "loss": 0.8359064102172852, "step": 1145 }, { "epoch": 2.9116809116809117, "grad_norm": 0.9058610366933287, "learning_rate": 2.386300009084408e-08, "loss": 0.8246042251586914, "step": 1150 }, { "epoch": 2.9243431465653686, "grad_norm": 0.898984738606933, "learning_rate": 1.753570375247815e-08, "loss": 0.8313743591308593, "step": 1155 }, { "epoch": 2.937005381449826, "grad_norm": 0.8885810042812119, "learning_rate": 1.2179748700879013e-08, "loss": 0.829072380065918, "step": 1160 }, { "epoch": 2.949667616334283, "grad_norm": 0.903649383117039, "learning_rate": 7.796179090094891e-09, "loss": 0.8449357986450196, "step": 1165 }, { "epoch": 2.9623298512187404, "grad_norm": 0.9753427783203841, "learning_rate": 4.385849505708084e-09, "loss": 0.8176769256591797, "step": 1170 }, { "epoch": 2.9749920861031973, "grad_norm": 0.8748130538451198, "learning_rate": 1.9494247982282386e-09, "loss": 0.8217670440673828, "step": 1175 }, { "epoch": 2.9876543209876543, "grad_norm": 0.9286601621176515, "learning_rate": 4.87379953478806e-10, "loss": 0.8410984992980957, "step": 1180 }, { "epoch": 3.0, "grad_norm": 0.9389103063767028, "learning_rate": 0.0, "loss": 0.8439925193786622, "step": 1185 } ], "logging_steps": 5, "max_steps": 1185, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9001299950934426e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }