{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1114, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017969451931716084, "grad_norm": 1.0288746356964111, "learning_rate": 4.959605026929982e-06, "loss": 0.5949527740478515, "step": 10 }, { "epoch": 0.03593890386343217, "grad_norm": 0.6101402044296265, "learning_rate": 4.9147217235188516e-06, "loss": 0.41325950622558594, "step": 20 }, { "epoch": 0.05390835579514825, "grad_norm": 0.5305581092834473, "learning_rate": 4.86983842010772e-06, "loss": 0.3709533929824829, "step": 30 }, { "epoch": 0.07187780772686433, "grad_norm": 0.5169686675071716, "learning_rate": 4.8249551166965895e-06, "loss": 0.3509422540664673, "step": 40 }, { "epoch": 0.08984725965858041, "grad_norm": 0.5222465991973877, "learning_rate": 4.780071813285458e-06, "loss": 0.3454415321350098, "step": 50 }, { "epoch": 0.1078167115902965, "grad_norm": 0.4856426417827606, "learning_rate": 4.7351885098743274e-06, "loss": 0.33249969482421876, "step": 60 }, { "epoch": 0.12578616352201258, "grad_norm": 0.5357626676559448, "learning_rate": 4.690305206463196e-06, "loss": 0.3292850971221924, "step": 70 }, { "epoch": 0.14375561545372867, "grad_norm": 0.46837398409843445, "learning_rate": 4.6454219030520645e-06, "loss": 0.3244313716888428, "step": 80 }, { "epoch": 0.16172506738544473, "grad_norm": 0.48174625635147095, "learning_rate": 4.600538599640934e-06, "loss": 0.3236015558242798, "step": 90 }, { "epoch": 0.17969451931716082, "grad_norm": 0.5259532332420349, "learning_rate": 4.5556552962298025e-06, "loss": 0.3194127559661865, "step": 100 }, { "epoch": 0.1976639712488769, "grad_norm": 0.5332797765731812, "learning_rate": 4.510771992818672e-06, "loss": 0.31786675453186036, "step": 110 }, { "epoch": 0.215633423180593, "grad_norm": 0.4909115433692932, "learning_rate": 4.465888689407541e-06, "loss": 0.3128951072692871, "step": 120 }, { "epoch": 0.23360287511230907, "grad_norm": 0.4780581295490265, "learning_rate": 4.42100538599641e-06, "loss": 0.31437077522277834, "step": 130 }, { "epoch": 0.25157232704402516, "grad_norm": 0.5149464011192322, "learning_rate": 4.376122082585278e-06, "loss": 0.30969116687774656, "step": 140 }, { "epoch": 0.2695417789757412, "grad_norm": 0.4815337657928467, "learning_rate": 4.331238779174148e-06, "loss": 0.31036303043365476, "step": 150 }, { "epoch": 0.28751123090745734, "grad_norm": 0.4882141053676605, "learning_rate": 4.286355475763016e-06, "loss": 0.30779433250427246, "step": 160 }, { "epoch": 0.3054806828391734, "grad_norm": 0.47035127878189087, "learning_rate": 4.241472172351886e-06, "loss": 0.3062736511230469, "step": 170 }, { "epoch": 0.32345013477088946, "grad_norm": 0.47444701194763184, "learning_rate": 4.196588868940754e-06, "loss": 0.30041847229003904, "step": 180 }, { "epoch": 0.3414195867026056, "grad_norm": 0.4834694564342499, "learning_rate": 4.151705565529624e-06, "loss": 0.29730544090270994, "step": 190 }, { "epoch": 0.35938903863432164, "grad_norm": 0.508245587348938, "learning_rate": 4.106822262118492e-06, "loss": 0.30029687881469724, "step": 200 }, { "epoch": 0.37735849056603776, "grad_norm": 0.48643767833709717, "learning_rate": 4.061938958707361e-06, "loss": 0.29685449600219727, "step": 210 }, { "epoch": 0.3953279424977538, "grad_norm": 0.4577917456626892, "learning_rate": 4.01705565529623e-06, "loss": 0.2990954160690308, "step": 220 }, { "epoch": 0.4132973944294699, "grad_norm": 0.5375077724456787, "learning_rate": 3.9721723518850995e-06, "loss": 0.30233011245727537, "step": 230 }, { "epoch": 0.431266846361186, "grad_norm": 0.4925467371940613, "learning_rate": 3.927289048473968e-06, "loss": 0.2941945314407349, "step": 240 }, { "epoch": 0.44923629829290207, "grad_norm": 0.5110061168670654, "learning_rate": 3.882405745062837e-06, "loss": 0.3003401279449463, "step": 250 }, { "epoch": 0.46720575022461813, "grad_norm": 0.44966429471969604, "learning_rate": 3.837522441651706e-06, "loss": 0.2935019016265869, "step": 260 }, { "epoch": 0.48517520215633425, "grad_norm": 0.49473223090171814, "learning_rate": 3.792639138240575e-06, "loss": 0.2941242218017578, "step": 270 }, { "epoch": 0.5031446540880503, "grad_norm": 0.4826172888278961, "learning_rate": 3.7477558348294435e-06, "loss": 0.2936396598815918, "step": 280 }, { "epoch": 0.5211141060197664, "grad_norm": 0.5087786316871643, "learning_rate": 3.702872531418313e-06, "loss": 0.28728442192077636, "step": 290 }, { "epoch": 0.5390835579514824, "grad_norm": 0.45754265785217285, "learning_rate": 3.6579892280071814e-06, "loss": 0.29381372928619387, "step": 300 }, { "epoch": 0.5570530098831986, "grad_norm": 0.47864410281181335, "learning_rate": 3.6131059245960504e-06, "loss": 0.28871979713439944, "step": 310 }, { "epoch": 0.5750224618149147, "grad_norm": 0.446613073348999, "learning_rate": 3.5682226211849198e-06, "loss": 0.2919660806655884, "step": 320 }, { "epoch": 0.5929919137466307, "grad_norm": 0.5203211903572083, "learning_rate": 3.5233393177737883e-06, "loss": 0.2949108600616455, "step": 330 }, { "epoch": 0.6109613656783468, "grad_norm": 0.5251737236976624, "learning_rate": 3.4784560143626573e-06, "loss": 0.28940815925598146, "step": 340 }, { "epoch": 0.6289308176100629, "grad_norm": 0.4626797139644623, "learning_rate": 3.4335727109515267e-06, "loss": 0.2877013683319092, "step": 350 }, { "epoch": 0.6469002695417789, "grad_norm": 0.5425576567649841, "learning_rate": 3.3886894075403952e-06, "loss": 0.28816981315612794, "step": 360 }, { "epoch": 0.6648697214734951, "grad_norm": 0.5507893562316895, "learning_rate": 3.343806104129264e-06, "loss": 0.28344998359680174, "step": 370 }, { "epoch": 0.6828391734052112, "grad_norm": 0.45895373821258545, "learning_rate": 3.2989228007181327e-06, "loss": 0.28453927040100097, "step": 380 }, { "epoch": 0.7008086253369272, "grad_norm": 0.47491055727005005, "learning_rate": 3.254039497307002e-06, "loss": 0.27885701656341555, "step": 390 }, { "epoch": 0.7187780772686433, "grad_norm": 0.4567403793334961, "learning_rate": 3.209156193895871e-06, "loss": 0.2846828937530518, "step": 400 }, { "epoch": 0.7367475292003594, "grad_norm": 0.506420910358429, "learning_rate": 3.1642728904847396e-06, "loss": 0.2884047269821167, "step": 410 }, { "epoch": 0.7547169811320755, "grad_norm": 0.4960302710533142, "learning_rate": 3.119389587073609e-06, "loss": 0.28609886169433596, "step": 420 }, { "epoch": 0.7726864330637916, "grad_norm": 0.44618239998817444, "learning_rate": 3.074506283662478e-06, "loss": 0.2808084487915039, "step": 430 }, { "epoch": 0.7906558849955077, "grad_norm": 0.45904698967933655, "learning_rate": 3.0296229802513465e-06, "loss": 0.28656601905822754, "step": 440 }, { "epoch": 0.8086253369272237, "grad_norm": 0.5420985817909241, "learning_rate": 2.984739676840216e-06, "loss": 0.2885767936706543, "step": 450 }, { "epoch": 0.8265947888589398, "grad_norm": 0.49061647057533264, "learning_rate": 2.939856373429085e-06, "loss": 0.28384861946105955, "step": 460 }, { "epoch": 0.8445642407906558, "grad_norm": 0.5167312026023865, "learning_rate": 2.8949730700179535e-06, "loss": 0.28023710250854494, "step": 470 }, { "epoch": 0.862533692722372, "grad_norm": 0.46029844880104065, "learning_rate": 2.8500897666068224e-06, "loss": 0.280789852142334, "step": 480 }, { "epoch": 0.8805031446540881, "grad_norm": 0.44982901215553284, "learning_rate": 2.8052064631956914e-06, "loss": 0.27998642921447753, "step": 490 }, { "epoch": 0.8984725965858041, "grad_norm": 0.4832385182380676, "learning_rate": 2.7603231597845604e-06, "loss": 0.2860716819763184, "step": 500 }, { "epoch": 0.9164420485175202, "grad_norm": 0.5139860510826111, "learning_rate": 2.715439856373429e-06, "loss": 0.2779590845108032, "step": 510 }, { "epoch": 0.9344115004492363, "grad_norm": 0.4550414979457855, "learning_rate": 2.6705565529622983e-06, "loss": 0.2789080381393433, "step": 520 }, { "epoch": 0.9523809523809523, "grad_norm": 0.4613369107246399, "learning_rate": 2.6256732495511673e-06, "loss": 0.28540740013122556, "step": 530 }, { "epoch": 0.9703504043126685, "grad_norm": 0.45095086097717285, "learning_rate": 2.580789946140036e-06, "loss": 0.276381254196167, "step": 540 }, { "epoch": 0.9883198562443846, "grad_norm": 0.48203322291374207, "learning_rate": 2.535906642728905e-06, "loss": 0.2832359790802002, "step": 550 }, { "epoch": 1.005390835579515, "grad_norm": 0.4708728492259979, "learning_rate": 2.491023339317774e-06, "loss": 0.2769860029220581, "step": 560 }, { "epoch": 1.0233602875112309, "grad_norm": 0.4912715554237366, "learning_rate": 2.4461400359066427e-06, "loss": 0.2540097713470459, "step": 570 }, { "epoch": 1.041329739442947, "grad_norm": 0.48824694752693176, "learning_rate": 2.4012567324955117e-06, "loss": 0.2609401226043701, "step": 580 }, { "epoch": 1.059299191374663, "grad_norm": 0.4870210289955139, "learning_rate": 2.356373429084381e-06, "loss": 0.25025138854980467, "step": 590 }, { "epoch": 1.0772686433063792, "grad_norm": 0.5163658261299133, "learning_rate": 2.3114901256732496e-06, "loss": 0.25263664722442625, "step": 600 }, { "epoch": 1.0952380952380953, "grad_norm": 0.5006254315376282, "learning_rate": 2.2666068222621186e-06, "loss": 0.25940570831298826, "step": 610 }, { "epoch": 1.1132075471698113, "grad_norm": 0.511043131351471, "learning_rate": 2.2217235188509876e-06, "loss": 0.2521126508712769, "step": 620 }, { "epoch": 1.1311769991015275, "grad_norm": 0.49282217025756836, "learning_rate": 2.1768402154398565e-06, "loss": 0.25096635818481444, "step": 630 }, { "epoch": 1.1491464510332434, "grad_norm": 0.5031591653823853, "learning_rate": 2.1319569120287255e-06, "loss": 0.2539719581604004, "step": 640 }, { "epoch": 1.1671159029649596, "grad_norm": 0.5004000067710876, "learning_rate": 2.0870736086175945e-06, "loss": 0.2542546510696411, "step": 650 }, { "epoch": 1.1850853548966755, "grad_norm": 0.47906896471977234, "learning_rate": 2.0421903052064634e-06, "loss": 0.2506051778793335, "step": 660 }, { "epoch": 1.2030548068283917, "grad_norm": 0.5111077427864075, "learning_rate": 1.9973070017953324e-06, "loss": 0.2501336336135864, "step": 670 }, { "epoch": 1.221024258760108, "grad_norm": 0.46470290422439575, "learning_rate": 1.9524236983842014e-06, "loss": 0.2527280330657959, "step": 680 }, { "epoch": 1.2389937106918238, "grad_norm": 0.49279844760894775, "learning_rate": 1.9075403949730703e-06, "loss": 0.25149285793304443, "step": 690 }, { "epoch": 1.25696316262354, "grad_norm": 0.48192131519317627, "learning_rate": 1.862657091561939e-06, "loss": 0.2485593795776367, "step": 700 }, { "epoch": 1.2749326145552562, "grad_norm": 0.49026069045066833, "learning_rate": 1.817773788150808e-06, "loss": 0.24832606315612793, "step": 710 }, { "epoch": 1.2929020664869721, "grad_norm": 0.46640709042549133, "learning_rate": 1.7728904847396768e-06, "loss": 0.2521926164627075, "step": 720 }, { "epoch": 1.310871518418688, "grad_norm": 0.5054717063903809, "learning_rate": 1.728007181328546e-06, "loss": 0.25048768520355225, "step": 730 }, { "epoch": 1.3288409703504043, "grad_norm": 0.4634091258049011, "learning_rate": 1.683123877917415e-06, "loss": 0.24852404594421387, "step": 740 }, { "epoch": 1.3468104222821204, "grad_norm": 0.4614594578742981, "learning_rate": 1.6382405745062837e-06, "loss": 0.2514226198196411, "step": 750 }, { "epoch": 1.3647798742138364, "grad_norm": 0.5008041262626648, "learning_rate": 1.593357271095153e-06, "loss": 0.2507458686828613, "step": 760 }, { "epoch": 1.3827493261455526, "grad_norm": 0.47305938601493835, "learning_rate": 1.5484739676840217e-06, "loss": 0.2497103691101074, "step": 770 }, { "epoch": 1.4007187780772687, "grad_norm": 0.5139908194541931, "learning_rate": 1.5035906642728906e-06, "loss": 0.24853968620300293, "step": 780 }, { "epoch": 1.4186882300089847, "grad_norm": 0.4631156027317047, "learning_rate": 1.4587073608617596e-06, "loss": 0.248740816116333, "step": 790 }, { "epoch": 1.4366576819407009, "grad_norm": 0.47681012749671936, "learning_rate": 1.4138240574506283e-06, "loss": 0.2534752368927002, "step": 800 }, { "epoch": 1.4546271338724168, "grad_norm": 0.4538913667201996, "learning_rate": 1.3689407540394975e-06, "loss": 0.24337444305419922, "step": 810 }, { "epoch": 1.472596585804133, "grad_norm": 0.48104986548423767, "learning_rate": 1.3240574506283663e-06, "loss": 0.2502609729766846, "step": 820 }, { "epoch": 1.490566037735849, "grad_norm": 0.4610423147678375, "learning_rate": 1.2791741472172353e-06, "loss": 0.24652738571166993, "step": 830 }, { "epoch": 1.5085354896675651, "grad_norm": 0.4587244689464569, "learning_rate": 1.2342908438061042e-06, "loss": 0.25362207889556887, "step": 840 }, { "epoch": 1.5265049415992813, "grad_norm": 0.4708814322948456, "learning_rate": 1.1894075403949732e-06, "loss": 0.24814538955688475, "step": 850 }, { "epoch": 1.5444743935309972, "grad_norm": 0.4898167550563812, "learning_rate": 1.144524236983842e-06, "loss": 0.2517171621322632, "step": 860 }, { "epoch": 1.5624438454627134, "grad_norm": 0.5054773688316345, "learning_rate": 1.0996409335727111e-06, "loss": 0.25095720291137696, "step": 870 }, { "epoch": 1.5804132973944296, "grad_norm": 0.5150067806243896, "learning_rate": 1.05475763016158e-06, "loss": 0.25122294425964353, "step": 880 }, { "epoch": 1.5983827493261455, "grad_norm": 0.44859108328819275, "learning_rate": 1.0098743267504488e-06, "loss": 0.24518187046051027, "step": 890 }, { "epoch": 1.6163522012578615, "grad_norm": 0.4460717737674713, "learning_rate": 9.649910233393178e-07, "loss": 0.25164237022399905, "step": 900 }, { "epoch": 1.6343216531895777, "grad_norm": 0.4884060323238373, "learning_rate": 9.201077199281867e-07, "loss": 0.2497255325317383, "step": 910 }, { "epoch": 1.6522911051212938, "grad_norm": 0.4527634084224701, "learning_rate": 8.752244165170558e-07, "loss": 0.2494762897491455, "step": 920 }, { "epoch": 1.6702605570530098, "grad_norm": 0.47182497382164, "learning_rate": 8.303411131059247e-07, "loss": 0.24840357303619384, "step": 930 }, { "epoch": 1.688230008984726, "grad_norm": 0.4759376347064972, "learning_rate": 7.854578096947936e-07, "loss": 0.25184221267700196, "step": 940 }, { "epoch": 1.7061994609164421, "grad_norm": 0.495343416929245, "learning_rate": 7.405745062836626e-07, "loss": 0.25055861473083496, "step": 950 }, { "epoch": 1.724168912848158, "grad_norm": 0.5005154609680176, "learning_rate": 6.956912028725314e-07, "loss": 0.25119876861572266, "step": 960 }, { "epoch": 1.742138364779874, "grad_norm": 0.47676777839660645, "learning_rate": 6.508078994614005e-07, "loss": 0.2516517162322998, "step": 970 }, { "epoch": 1.7601078167115904, "grad_norm": 0.4394581913948059, "learning_rate": 6.059245960502694e-07, "loss": 0.250733470916748, "step": 980 }, { "epoch": 1.7780772686433064, "grad_norm": 0.4702657163143158, "learning_rate": 5.610412926391383e-07, "loss": 0.2478208065032959, "step": 990 }, { "epoch": 1.7960467205750223, "grad_norm": 0.4843612611293793, "learning_rate": 5.161579892280072e-07, "loss": 0.24935145378112794, "step": 1000 }, { "epoch": 1.8140161725067385, "grad_norm": 0.4673105776309967, "learning_rate": 4.7127468581687615e-07, "loss": 0.24984090328216552, "step": 1010 }, { "epoch": 1.8319856244384547, "grad_norm": 0.4820215404033661, "learning_rate": 4.2639138240574507e-07, "loss": 0.24917204380035402, "step": 1020 }, { "epoch": 1.8499550763701706, "grad_norm": 0.45277148485183716, "learning_rate": 3.815080789946141e-07, "loss": 0.24739840030670165, "step": 1030 }, { "epoch": 1.8679245283018868, "grad_norm": 0.48467275500297546, "learning_rate": 3.3662477558348295e-07, "loss": 0.24694859981536865, "step": 1040 }, { "epoch": 1.885893980233603, "grad_norm": 0.46758314967155457, "learning_rate": 2.917414721723519e-07, "loss": 0.24703009128570558, "step": 1050 }, { "epoch": 1.903863432165319, "grad_norm": 0.4634384512901306, "learning_rate": 2.4685816876122083e-07, "loss": 0.2495879650115967, "step": 1060 }, { "epoch": 1.921832884097035, "grad_norm": 0.4621906578540802, "learning_rate": 2.0197486535008978e-07, "loss": 0.25526316165924073, "step": 1070 }, { "epoch": 1.939802336028751, "grad_norm": 0.46646031737327576, "learning_rate": 1.5709156193895872e-07, "loss": 0.24767594337463378, "step": 1080 }, { "epoch": 1.9577717879604672, "grad_norm": 0.4569203555583954, "learning_rate": 1.1220825852782766e-07, "loss": 0.24955098628997802, "step": 1090 }, { "epoch": 1.9757412398921832, "grad_norm": 0.47747698426246643, "learning_rate": 6.732495511669659e-08, "loss": 0.2489546775817871, "step": 1100 }, { "epoch": 1.9937106918238994, "grad_norm": 0.47046294808387756, "learning_rate": 2.2441651705565532e-08, "loss": 0.24430301189422607, "step": 1110 }, { "epoch": 2.0, "step": 1114, "total_flos": 1.4534558685629252e+19, "train_loss": 0.27833450065266935, "train_runtime": 6971.6374, "train_samples_per_second": 20.435, "train_steps_per_second": 0.16 } ], "logging_steps": 10, "max_steps": 1114, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4534558685629252e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }