{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 18053, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005539245554755442, "grad_norm": 1.546007513999939, "learning_rate": 4.972580734503961e-05, "loss": 1.2806, "step": 100 }, { "epoch": 0.011078491109510884, "grad_norm": 1.6162946224212646, "learning_rate": 4.945161469007921e-05, "loss": 1.2654, "step": 200 }, { "epoch": 0.016617736664266327, "grad_norm": 2.059379816055298, "learning_rate": 4.917465241234144e-05, "loss": 1.2231, "step": 300 }, { "epoch": 0.022156982219021768, "grad_norm": 1.6531622409820557, "learning_rate": 4.890045975738105e-05, "loss": 1.2161, "step": 400 }, { "epoch": 0.027696227773777213, "grad_norm": 1.57131028175354, "learning_rate": 4.8623497479643274e-05, "loss": 1.1963, "step": 500 }, { "epoch": 0.033235473328532654, "grad_norm": 1.4883884191513062, "learning_rate": 4.83465352019055e-05, "loss": 1.1716, "step": 600 }, { "epoch": 0.038774718883288095, "grad_norm": 1.4190359115600586, "learning_rate": 4.806957292416773e-05, "loss": 1.1969, "step": 700 }, { "epoch": 0.044313964438043536, "grad_norm": 1.6759884357452393, "learning_rate": 4.779261064642996e-05, "loss": 1.179, "step": 800 }, { "epoch": 0.04985320999279898, "grad_norm": 1.4575251340866089, "learning_rate": 4.7515648368692186e-05, "loss": 1.1543, "step": 900 }, { "epoch": 0.055392455547554426, "grad_norm": 1.4753471612930298, "learning_rate": 4.723868609095441e-05, "loss": 1.1502, "step": 1000 }, { "epoch": 0.06093170110230987, "grad_norm": 1.660173773765564, "learning_rate": 4.696172381321664e-05, "loss": 1.1231, "step": 1100 }, { "epoch": 0.06647094665706531, "grad_norm": 1.5432391166687012, "learning_rate": 4.668476153547887e-05, "loss": 1.1036, "step": 1200 }, { "epoch": 0.07201019221182076, "grad_norm": 1.3922276496887207, "learning_rate": 4.64077992577411e-05, "loss": 1.1286, "step": 1300 }, { "epoch": 0.07754943776657619, "grad_norm": 1.361965298652649, "learning_rate": 4.613083698000333e-05, "loss": 1.1039, "step": 1400 }, { "epoch": 0.08308868332133164, "grad_norm": 1.3993804454803467, "learning_rate": 4.585387470226555e-05, "loss": 1.1166, "step": 1500 }, { "epoch": 0.08862792887608707, "grad_norm": 1.5207942724227905, "learning_rate": 4.557691242452778e-05, "loss": 1.1079, "step": 1600 }, { "epoch": 0.09416717443084252, "grad_norm": 1.6368706226348877, "learning_rate": 4.529995014679001e-05, "loss": 1.1103, "step": 1700 }, { "epoch": 0.09970641998559795, "grad_norm": 1.4837383031845093, "learning_rate": 4.5022987869052236e-05, "loss": 1.1003, "step": 1800 }, { "epoch": 0.1052456655403534, "grad_norm": 1.483654499053955, "learning_rate": 4.474602559131447e-05, "loss": 1.0686, "step": 1900 }, { "epoch": 0.11078491109510885, "grad_norm": 1.776803731918335, "learning_rate": 4.446906331357669e-05, "loss": 1.0892, "step": 2000 }, { "epoch": 0.11632415664986429, "grad_norm": 1.6472382545471191, "learning_rate": 4.419210103583892e-05, "loss": 1.0892, "step": 2100 }, { "epoch": 0.12186340220461973, "grad_norm": 1.4846845865249634, "learning_rate": 4.391513875810115e-05, "loss": 1.0421, "step": 2200 }, { "epoch": 0.12740264775937518, "grad_norm": 1.4608880281448364, "learning_rate": 4.3638176480363375e-05, "loss": 1.0578, "step": 2300 }, { "epoch": 0.13294189331413062, "grad_norm": 1.515207052230835, "learning_rate": 4.336121420262561e-05, "loss": 1.0652, "step": 2400 }, { "epoch": 0.13848113886888605, "grad_norm": 1.5072288513183594, "learning_rate": 4.308425192488783e-05, "loss": 1.0855, "step": 2500 }, { "epoch": 0.1440203844236415, "grad_norm": 1.4887125492095947, "learning_rate": 4.281005926992744e-05, "loss": 1.0739, "step": 2600 }, { "epoch": 0.14955962997839695, "grad_norm": 1.3779743909835815, "learning_rate": 4.2533096992189665e-05, "loss": 1.0558, "step": 2700 }, { "epoch": 0.15509887553315238, "grad_norm": 1.4754050970077515, "learning_rate": 4.22561347144519e-05, "loss": 1.0458, "step": 2800 }, { "epoch": 0.16063812108790781, "grad_norm": 1.3826239109039307, "learning_rate": 4.197917243671412e-05, "loss": 1.0416, "step": 2900 }, { "epoch": 0.16617736664266328, "grad_norm": 1.3614164590835571, "learning_rate": 4.170221015897635e-05, "loss": 1.0405, "step": 3000 }, { "epoch": 0.1717166121974187, "grad_norm": 1.8164361715316772, "learning_rate": 4.142524788123858e-05, "loss": 1.0464, "step": 3100 }, { "epoch": 0.17725585775217415, "grad_norm": 1.497776746749878, "learning_rate": 4.11482856035008e-05, "loss": 1.0242, "step": 3200 }, { "epoch": 0.1827951033069296, "grad_norm": 1.5668854713439941, "learning_rate": 4.0871323325763036e-05, "loss": 1.0444, "step": 3300 }, { "epoch": 0.18833434886168504, "grad_norm": 1.3012648820877075, "learning_rate": 4.0594361048025256e-05, "loss": 1.019, "step": 3400 }, { "epoch": 0.19387359441644048, "grad_norm": 1.4110270738601685, "learning_rate": 4.031739877028749e-05, "loss": 1.031, "step": 3500 }, { "epoch": 0.1994128399711959, "grad_norm": 1.7231452465057373, "learning_rate": 4.0040436492549715e-05, "loss": 1.0458, "step": 3600 }, { "epoch": 0.20495208552595137, "grad_norm": 1.5503065586090088, "learning_rate": 3.976347421481195e-05, "loss": 1.0141, "step": 3700 }, { "epoch": 0.2104913310807068, "grad_norm": 1.3918309211730957, "learning_rate": 3.9486511937074175e-05, "loss": 1.0169, "step": 3800 }, { "epoch": 0.21603057663546224, "grad_norm": 1.4343814849853516, "learning_rate": 3.9209549659336394e-05, "loss": 1.0247, "step": 3900 }, { "epoch": 0.2215698221902177, "grad_norm": 1.4719725847244263, "learning_rate": 3.893258738159863e-05, "loss": 1.0043, "step": 4000 }, { "epoch": 0.22710906774497314, "grad_norm": 1.3622781038284302, "learning_rate": 3.8655625103860854e-05, "loss": 1.0282, "step": 4100 }, { "epoch": 0.23264831329972857, "grad_norm": 1.5316309928894043, "learning_rate": 3.837866282612309e-05, "loss": 1.0034, "step": 4200 }, { "epoch": 0.23818755885448403, "grad_norm": 1.3682796955108643, "learning_rate": 3.8104470171162684e-05, "loss": 1.0456, "step": 4300 }, { "epoch": 0.24372680440923947, "grad_norm": 1.4921820163726807, "learning_rate": 3.782750789342492e-05, "loss": 1.0166, "step": 4400 }, { "epoch": 0.2492660499639949, "grad_norm": 1.4957038164138794, "learning_rate": 3.7550545615687143e-05, "loss": 1.0079, "step": 4500 }, { "epoch": 0.25480529551875036, "grad_norm": 1.5265921354293823, "learning_rate": 3.7273583337949377e-05, "loss": 1.0297, "step": 4600 }, { "epoch": 0.2603445410735058, "grad_norm": 1.4102026224136353, "learning_rate": 3.69966210602116e-05, "loss": 0.978, "step": 4700 }, { "epoch": 0.26588378662826123, "grad_norm": 1.3901424407958984, "learning_rate": 3.671965878247383e-05, "loss": 1.0043, "step": 4800 }, { "epoch": 0.27142303218301667, "grad_norm": 1.4173741340637207, "learning_rate": 3.6442696504736056e-05, "loss": 0.9725, "step": 4900 }, { "epoch": 0.2769622777377721, "grad_norm": 1.2784532308578491, "learning_rate": 3.616573422699828e-05, "loss": 0.9876, "step": 5000 }, { "epoch": 0.28250152329252753, "grad_norm": 1.4648102521896362, "learning_rate": 3.5888771949260515e-05, "loss": 0.9715, "step": 5100 }, { "epoch": 0.288040768847283, "grad_norm": 1.410418152809143, "learning_rate": 3.561180967152274e-05, "loss": 1.0037, "step": 5200 }, { "epoch": 0.29358001440203846, "grad_norm": 1.6049284934997559, "learning_rate": 3.533484739378497e-05, "loss": 0.9674, "step": 5300 }, { "epoch": 0.2991192599567939, "grad_norm": 1.3856281042099, "learning_rate": 3.5057885116047194e-05, "loss": 0.9927, "step": 5400 }, { "epoch": 0.3046585055115493, "grad_norm": 1.485903024673462, "learning_rate": 3.478092283830943e-05, "loss": 0.9421, "step": 5500 }, { "epoch": 0.31019775106630476, "grad_norm": 1.2787377834320068, "learning_rate": 3.4503960560571654e-05, "loss": 0.9697, "step": 5600 }, { "epoch": 0.3157369966210602, "grad_norm": 1.4596630334854126, "learning_rate": 3.422699828283388e-05, "loss": 0.9743, "step": 5700 }, { "epoch": 0.32127624217581563, "grad_norm": 1.6178170442581177, "learning_rate": 3.3950036005096106e-05, "loss": 0.979, "step": 5800 }, { "epoch": 0.3268154877305711, "grad_norm": 1.5128576755523682, "learning_rate": 3.367307372735833e-05, "loss": 0.9507, "step": 5900 }, { "epoch": 0.33235473328532655, "grad_norm": 1.4769525527954102, "learning_rate": 3.3396111449620566e-05, "loss": 0.9651, "step": 6000 }, { "epoch": 0.337893978840082, "grad_norm": 1.3777639865875244, "learning_rate": 3.311914917188279e-05, "loss": 0.9747, "step": 6100 }, { "epoch": 0.3434332243948374, "grad_norm": 1.492494821548462, "learning_rate": 3.284218689414502e-05, "loss": 0.9858, "step": 6200 }, { "epoch": 0.34897246994959286, "grad_norm": 1.282101035118103, "learning_rate": 3.2565224616407245e-05, "loss": 0.9761, "step": 6300 }, { "epoch": 0.3545117155043483, "grad_norm": 1.3632028102874756, "learning_rate": 3.228826233866947e-05, "loss": 0.958, "step": 6400 }, { "epoch": 0.3600509610591037, "grad_norm": 1.4041866064071655, "learning_rate": 3.2011300060931704e-05, "loss": 0.9772, "step": 6500 }, { "epoch": 0.3655902066138592, "grad_norm": 1.3332486152648926, "learning_rate": 3.173433778319393e-05, "loss": 0.9669, "step": 6600 }, { "epoch": 0.37112945216861465, "grad_norm": 1.3557853698730469, "learning_rate": 3.145737550545616e-05, "loss": 0.9431, "step": 6700 }, { "epoch": 0.3766686977233701, "grad_norm": 1.311341404914856, "learning_rate": 3.118041322771838e-05, "loss": 0.9675, "step": 6800 }, { "epoch": 0.3822079432781255, "grad_norm": 1.3420363664627075, "learning_rate": 3.0903450949980616e-05, "loss": 0.9443, "step": 6900 }, { "epoch": 0.38774718883288095, "grad_norm": 1.5872620344161987, "learning_rate": 3.062648867224284e-05, "loss": 0.9534, "step": 7000 }, { "epoch": 0.3932864343876364, "grad_norm": 1.5827562808990479, "learning_rate": 3.035229601728245e-05, "loss": 0.9618, "step": 7100 }, { "epoch": 0.3988256799423918, "grad_norm": 1.4099847078323364, "learning_rate": 3.0075333739544676e-05, "loss": 0.94, "step": 7200 }, { "epoch": 0.4043649254971473, "grad_norm": 1.6116340160369873, "learning_rate": 2.97983714618069e-05, "loss": 0.9461, "step": 7300 }, { "epoch": 0.40990417105190274, "grad_norm": 1.3645659685134888, "learning_rate": 2.9521409184069133e-05, "loss": 0.9427, "step": 7400 }, { "epoch": 0.4154434166066582, "grad_norm": 1.7124513387680054, "learning_rate": 2.924444690633136e-05, "loss": 0.9431, "step": 7500 }, { "epoch": 0.4209826621614136, "grad_norm": 1.47300124168396, "learning_rate": 2.896748462859359e-05, "loss": 0.94, "step": 7600 }, { "epoch": 0.42652190771616905, "grad_norm": 1.4425369501113892, "learning_rate": 2.8690522350855815e-05, "loss": 0.9439, "step": 7700 }, { "epoch": 0.4320611532709245, "grad_norm": 1.1862937211990356, "learning_rate": 2.8413560073118045e-05, "loss": 0.9429, "step": 7800 }, { "epoch": 0.4376003988256799, "grad_norm": 1.3416625261306763, "learning_rate": 2.813659779538027e-05, "loss": 0.9228, "step": 7900 }, { "epoch": 0.4431396443804354, "grad_norm": 1.4095027446746826, "learning_rate": 2.7859635517642497e-05, "loss": 0.9477, "step": 8000 }, { "epoch": 0.44867888993519084, "grad_norm": 1.3794485330581665, "learning_rate": 2.7582673239904727e-05, "loss": 0.9234, "step": 8100 }, { "epoch": 0.4542181354899463, "grad_norm": 1.638173222541809, "learning_rate": 2.7305710962166953e-05, "loss": 0.9239, "step": 8200 }, { "epoch": 0.4597573810447017, "grad_norm": 1.4151724576950073, "learning_rate": 2.7028748684429183e-05, "loss": 0.9488, "step": 8300 }, { "epoch": 0.46529662659945714, "grad_norm": 1.4664607048034668, "learning_rate": 2.675178640669141e-05, "loss": 0.9281, "step": 8400 }, { "epoch": 0.4708358721542126, "grad_norm": 1.4272499084472656, "learning_rate": 2.647482412895364e-05, "loss": 0.9252, "step": 8500 }, { "epoch": 0.47637511770896807, "grad_norm": 1.380058765411377, "learning_rate": 2.6197861851215866e-05, "loss": 0.9067, "step": 8600 }, { "epoch": 0.4819143632637235, "grad_norm": 1.385686993598938, "learning_rate": 2.5920899573478092e-05, "loss": 0.921, "step": 8700 }, { "epoch": 0.48745360881847893, "grad_norm": 1.4072484970092773, "learning_rate": 2.5643937295740322e-05, "loss": 0.9134, "step": 8800 }, { "epoch": 0.49299285437323437, "grad_norm": 1.5359854698181152, "learning_rate": 2.5366975018002548e-05, "loss": 0.9355, "step": 8900 }, { "epoch": 0.4985320999279898, "grad_norm": 1.3707598447799683, "learning_rate": 2.5090012740264778e-05, "loss": 0.932, "step": 9000 }, { "epoch": 0.5040713454827452, "grad_norm": 1.4937622547149658, "learning_rate": 2.4815820085304385e-05, "loss": 0.929, "step": 9100 }, { "epoch": 0.5096105910375007, "grad_norm": 1.4973763227462769, "learning_rate": 2.4538857807566608e-05, "loss": 0.9313, "step": 9200 }, { "epoch": 0.5151498365922561, "grad_norm": 1.3731300830841064, "learning_rate": 2.4261895529828838e-05, "loss": 0.8977, "step": 9300 }, { "epoch": 0.5206890821470116, "grad_norm": 1.5620144605636597, "learning_rate": 2.3984933252091064e-05, "loss": 0.9255, "step": 9400 }, { "epoch": 0.526228327701767, "grad_norm": 1.4226034879684448, "learning_rate": 2.3707970974353294e-05, "loss": 0.8994, "step": 9500 }, { "epoch": 0.5317675732565225, "grad_norm": 1.3993937969207764, "learning_rate": 2.3431008696615524e-05, "loss": 0.9212, "step": 9600 }, { "epoch": 0.537306818811278, "grad_norm": 1.4125133752822876, "learning_rate": 2.315404641887775e-05, "loss": 0.9072, "step": 9700 }, { "epoch": 0.5428460643660333, "grad_norm": 1.2466013431549072, "learning_rate": 2.287708414113998e-05, "loss": 0.9155, "step": 9800 }, { "epoch": 0.5483853099207888, "grad_norm": 1.319427251815796, "learning_rate": 2.2600121863402206e-05, "loss": 0.8968, "step": 9900 }, { "epoch": 0.5539245554755442, "grad_norm": 1.5401791334152222, "learning_rate": 2.2323159585664432e-05, "loss": 0.9146, "step": 10000 }, { "epoch": 0.5594638010302997, "grad_norm": 1.2928884029388428, "learning_rate": 2.2046197307926662e-05, "loss": 0.8996, "step": 10100 }, { "epoch": 0.5650030465850551, "grad_norm": 1.5264612436294556, "learning_rate": 2.176923503018889e-05, "loss": 0.8889, "step": 10200 }, { "epoch": 0.5705422921398106, "grad_norm": 1.5760724544525146, "learning_rate": 2.1492272752451118e-05, "loss": 0.8978, "step": 10300 }, { "epoch": 0.576081537694566, "grad_norm": 1.414896845817566, "learning_rate": 2.1215310474713345e-05, "loss": 0.8989, "step": 10400 }, { "epoch": 0.5816207832493214, "grad_norm": 1.422662377357483, "learning_rate": 2.0938348196975574e-05, "loss": 0.8695, "step": 10500 }, { "epoch": 0.5871600288040769, "grad_norm": 1.6162989139556885, "learning_rate": 2.06613859192378e-05, "loss": 0.9017, "step": 10600 }, { "epoch": 0.5926992743588323, "grad_norm": 1.537391185760498, "learning_rate": 2.0384423641500027e-05, "loss": 0.8946, "step": 10700 }, { "epoch": 0.5982385199135878, "grad_norm": 1.3381567001342773, "learning_rate": 2.0107461363762257e-05, "loss": 0.8901, "step": 10800 }, { "epoch": 0.6037777654683432, "grad_norm": 1.3765337467193604, "learning_rate": 1.9830499086024483e-05, "loss": 0.8872, "step": 10900 }, { "epoch": 0.6093170110230987, "grad_norm": 1.5019334554672241, "learning_rate": 1.9553536808286713e-05, "loss": 0.8688, "step": 11000 }, { "epoch": 0.6148562565778541, "grad_norm": 1.5896552801132202, "learning_rate": 1.9276574530548943e-05, "loss": 0.8863, "step": 11100 }, { "epoch": 0.6203955021326095, "grad_norm": 1.481461524963379, "learning_rate": 1.899961225281117e-05, "loss": 0.8735, "step": 11200 }, { "epoch": 0.625934747687365, "grad_norm": 1.5073565244674683, "learning_rate": 1.8722649975073395e-05, "loss": 0.8929, "step": 11300 }, { "epoch": 0.6314739932421204, "grad_norm": 1.3693833351135254, "learning_rate": 1.844568769733562e-05, "loss": 0.8478, "step": 11400 }, { "epoch": 0.6370132387968759, "grad_norm": 1.481180191040039, "learning_rate": 1.816872541959785e-05, "loss": 0.8784, "step": 11500 }, { "epoch": 0.6425524843516313, "grad_norm": 1.5158451795578003, "learning_rate": 1.789176314186008e-05, "loss": 0.8697, "step": 11600 }, { "epoch": 0.6480917299063867, "grad_norm": 1.4979525804519653, "learning_rate": 1.7614800864122307e-05, "loss": 0.9006, "step": 11700 }, { "epoch": 0.6536309754611422, "grad_norm": 1.3702558279037476, "learning_rate": 1.7337838586384537e-05, "loss": 0.8745, "step": 11800 }, { "epoch": 0.6591702210158976, "grad_norm": 1.298034429550171, "learning_rate": 1.7060876308646763e-05, "loss": 0.8719, "step": 11900 }, { "epoch": 0.6647094665706531, "grad_norm": 1.3086830377578735, "learning_rate": 1.678668365368637e-05, "loss": 0.8478, "step": 12000 }, { "epoch": 0.6702487121254085, "grad_norm": 1.4746365547180176, "learning_rate": 1.6509721375948597e-05, "loss": 0.8843, "step": 12100 }, { "epoch": 0.675787957680164, "grad_norm": 1.503086805343628, "learning_rate": 1.6232759098210827e-05, "loss": 0.8948, "step": 12200 }, { "epoch": 0.6813272032349194, "grad_norm": 1.4060204029083252, "learning_rate": 1.595579682047305e-05, "loss": 0.8695, "step": 12300 }, { "epoch": 0.6868664487896748, "grad_norm": 1.5291565656661987, "learning_rate": 1.568160416551266e-05, "loss": 0.9056, "step": 12400 }, { "epoch": 0.6924056943444303, "grad_norm": 1.3221312761306763, "learning_rate": 1.5404641887774887e-05, "loss": 0.8651, "step": 12500 }, { "epoch": 0.6979449398991857, "grad_norm": 1.5272188186645508, "learning_rate": 1.5127679610037113e-05, "loss": 0.8725, "step": 12600 }, { "epoch": 0.7034841854539412, "grad_norm": 1.4335006475448608, "learning_rate": 1.4850717332299341e-05, "loss": 0.8663, "step": 12700 }, { "epoch": 0.7090234310086966, "grad_norm": 1.321509838104248, "learning_rate": 1.457375505456157e-05, "loss": 0.8565, "step": 12800 }, { "epoch": 0.7145626765634521, "grad_norm": 1.3843624591827393, "learning_rate": 1.4296792776823797e-05, "loss": 0.8614, "step": 12900 }, { "epoch": 0.7201019221182074, "grad_norm": 1.3091601133346558, "learning_rate": 1.4019830499086025e-05, "loss": 0.8538, "step": 13000 }, { "epoch": 0.7256411676729629, "grad_norm": 1.5815715789794922, "learning_rate": 1.3742868221348253e-05, "loss": 0.8624, "step": 13100 }, { "epoch": 0.7311804132277184, "grad_norm": 1.6078437566757202, "learning_rate": 1.346590594361048e-05, "loss": 0.8405, "step": 13200 }, { "epoch": 0.7367196587824738, "grad_norm": 1.446047306060791, "learning_rate": 1.3188943665872708e-05, "loss": 0.8445, "step": 13300 }, { "epoch": 0.7422589043372293, "grad_norm": 1.207653284072876, "learning_rate": 1.2911981388134936e-05, "loss": 0.8431, "step": 13400 }, { "epoch": 0.7477981498919847, "grad_norm": 1.3394980430603027, "learning_rate": 1.2635019110397164e-05, "loss": 0.868, "step": 13500 }, { "epoch": 0.7533373954467402, "grad_norm": 1.549267053604126, "learning_rate": 1.2358056832659392e-05, "loss": 0.8604, "step": 13600 }, { "epoch": 0.7588766410014955, "grad_norm": 1.3387057781219482, "learning_rate": 1.2081094554921622e-05, "loss": 0.8563, "step": 13700 }, { "epoch": 0.764415886556251, "grad_norm": 1.3845041990280151, "learning_rate": 1.1804132277183848e-05, "loss": 0.8365, "step": 13800 }, { "epoch": 0.7699551321110065, "grad_norm": 1.4750843048095703, "learning_rate": 1.1527169999446076e-05, "loss": 0.8322, "step": 13900 }, { "epoch": 0.7754943776657619, "grad_norm": 1.4781407117843628, "learning_rate": 1.1250207721708304e-05, "loss": 0.8696, "step": 14000 }, { "epoch": 0.7810336232205174, "grad_norm": 1.5403274297714233, "learning_rate": 1.0973245443970532e-05, "loss": 0.8381, "step": 14100 }, { "epoch": 0.7865728687752728, "grad_norm": 1.4403986930847168, "learning_rate": 1.069628316623276e-05, "loss": 0.8341, "step": 14200 }, { "epoch": 0.7921121143300283, "grad_norm": 1.509552001953125, "learning_rate": 1.0419320888494988e-05, "loss": 0.8689, "step": 14300 }, { "epoch": 0.7976513598847836, "grad_norm": 1.567030668258667, "learning_rate": 1.0142358610757214e-05, "loss": 0.8496, "step": 14400 }, { "epoch": 0.8031906054395391, "grad_norm": 1.4610817432403564, "learning_rate": 9.865396333019443e-06, "loss": 0.8694, "step": 14500 }, { "epoch": 0.8087298509942946, "grad_norm": 1.4665472507476807, "learning_rate": 9.58843405528167e-06, "loss": 0.843, "step": 14600 }, { "epoch": 0.81426909654905, "grad_norm": 1.45100998878479, "learning_rate": 9.3114717775439e-06, "loss": 0.8529, "step": 14700 }, { "epoch": 0.8198083421038055, "grad_norm": 1.4459706544876099, "learning_rate": 9.034509499806127e-06, "loss": 0.8251, "step": 14800 }, { "epoch": 0.8253475876585609, "grad_norm": 1.408592700958252, "learning_rate": 8.757547222068355e-06, "loss": 0.858, "step": 14900 }, { "epoch": 0.8308868332133164, "grad_norm": 1.4576612710952759, "learning_rate": 8.480584944330583e-06, "loss": 0.8294, "step": 15000 }, { "epoch": 0.8364260787680717, "grad_norm": 1.4453418254852295, "learning_rate": 8.203622666592809e-06, "loss": 0.8421, "step": 15100 }, { "epoch": 0.8419653243228272, "grad_norm": 1.4345848560333252, "learning_rate": 7.926660388855039e-06, "loss": 0.8338, "step": 15200 }, { "epoch": 0.8475045698775827, "grad_norm": 1.4117823839187622, "learning_rate": 7.649698111117267e-06, "loss": 0.8494, "step": 15300 }, { "epoch": 0.8530438154323381, "grad_norm": 1.7380582094192505, "learning_rate": 7.372735833379495e-06, "loss": 0.831, "step": 15400 }, { "epoch": 0.8585830609870936, "grad_norm": 1.372897744178772, "learning_rate": 7.095773555641721e-06, "loss": 0.8201, "step": 15500 }, { "epoch": 0.864122306541849, "grad_norm": 1.482770562171936, "learning_rate": 6.81881127790395e-06, "loss": 0.8221, "step": 15600 }, { "epoch": 0.8696615520966045, "grad_norm": 1.394877314567566, "learning_rate": 6.541849000166178e-06, "loss": 0.8441, "step": 15700 }, { "epoch": 0.8752007976513598, "grad_norm": 1.4818041324615479, "learning_rate": 6.264886722428405e-06, "loss": 0.843, "step": 15800 }, { "epoch": 0.8807400432061153, "grad_norm": 1.8270115852355957, "learning_rate": 5.9879244446906334e-06, "loss": 0.851, "step": 15900 }, { "epoch": 0.8862792887608708, "grad_norm": 1.4031203985214233, "learning_rate": 5.7109621669528615e-06, "loss": 0.8515, "step": 16000 }, { "epoch": 0.8918185343156262, "grad_norm": 1.5000931024551392, "learning_rate": 5.4339998892150895e-06, "loss": 0.819, "step": 16100 }, { "epoch": 0.8973577798703817, "grad_norm": 1.4219608306884766, "learning_rate": 5.1570376114773176e-06, "loss": 0.8528, "step": 16200 }, { "epoch": 0.9028970254251371, "grad_norm": 1.6713409423828125, "learning_rate": 4.880075333739545e-06, "loss": 0.8253, "step": 16300 }, { "epoch": 0.9084362709798925, "grad_norm": 1.642972707748413, "learning_rate": 4.603113056001773e-06, "loss": 0.8199, "step": 16400 }, { "epoch": 0.913975516534648, "grad_norm": 1.4362276792526245, "learning_rate": 4.328920401041378e-06, "loss": 0.8186, "step": 16500 }, { "epoch": 0.9195147620894034, "grad_norm": 1.387813925743103, "learning_rate": 4.051958123303606e-06, "loss": 0.8447, "step": 16600 }, { "epoch": 0.9250540076441589, "grad_norm": 1.7197731733322144, "learning_rate": 3.774995845565834e-06, "loss": 0.8416, "step": 16700 }, { "epoch": 0.9305932531989143, "grad_norm": 1.4148505926132202, "learning_rate": 3.498033567828062e-06, "loss": 0.8388, "step": 16800 }, { "epoch": 0.9361324987536698, "grad_norm": 1.5057902336120605, "learning_rate": 3.2210712900902897e-06, "loss": 0.8141, "step": 16900 }, { "epoch": 0.9416717443084252, "grad_norm": 1.6533297300338745, "learning_rate": 2.9441090123525177e-06, "loss": 0.8131, "step": 17000 }, { "epoch": 0.9472109898631806, "grad_norm": 1.550384759902954, "learning_rate": 2.6671467346147458e-06, "loss": 0.8474, "step": 17100 }, { "epoch": 0.9527502354179361, "grad_norm": 1.3930208683013916, "learning_rate": 2.3901844568769734e-06, "loss": 0.8252, "step": 17200 }, { "epoch": 0.9582894809726915, "grad_norm": 1.534490704536438, "learning_rate": 2.1132221791392014e-06, "loss": 0.8333, "step": 17300 }, { "epoch": 0.963828726527447, "grad_norm": 1.496036171913147, "learning_rate": 1.836259901401429e-06, "loss": 0.8452, "step": 17400 }, { "epoch": 0.9693679720822024, "grad_norm": 1.4429802894592285, "learning_rate": 1.559297623663657e-06, "loss": 0.8368, "step": 17500 }, { "epoch": 0.9749072176369579, "grad_norm": 1.4180331230163574, "learning_rate": 1.282335345925885e-06, "loss": 0.8139, "step": 17600 }, { "epoch": 0.9804464631917132, "grad_norm": 1.423274278640747, "learning_rate": 1.005373068188113e-06, "loss": 0.809, "step": 17700 }, { "epoch": 0.9859857087464687, "grad_norm": 1.4893913269042969, "learning_rate": 7.284107904503407e-07, "loss": 0.8271, "step": 17800 }, { "epoch": 0.9915249543012242, "grad_norm": 1.5484572649002075, "learning_rate": 4.5144851271256857e-07, "loss": 0.8409, "step": 17900 }, { "epoch": 0.9970641998559796, "grad_norm": 1.423147439956665, "learning_rate": 1.7448623497479645e-07, "loss": 0.8285, "step": 18000 } ], "logging_steps": 100, "max_steps": 18053, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 18053, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.47361101474431e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }