{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 38780, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07735946364105209, "grad_norm": 1.558766484260559, "learning_rate": 7.726098191214471e-05, "loss": 7.285, "step": 300 }, { "epoch": 0.15471892728210418, "grad_norm": 0.9385259747505188, "learning_rate": 9.944781600812648e-05, "loss": 4.7972, "step": 600 }, { "epoch": 0.23207839092315627, "grad_norm": 0.9864380359649658, "learning_rate": 9.866642356679604e-05, "loss": 4.5563, "step": 900 }, { "epoch": 0.30943785456420836, "grad_norm": 0.971341073513031, "learning_rate": 9.788503112546558e-05, "loss": 4.4545, "step": 1200 }, { "epoch": 0.3867973182052604, "grad_norm": 0.8991674184799194, "learning_rate": 9.710363868413513e-05, "loss": 4.3616, "step": 1500 }, { "epoch": 0.46415678184631254, "grad_norm": 0.9576361775398254, "learning_rate": 9.632224624280469e-05, "loss": 4.2695, "step": 1800 }, { "epoch": 0.5415162454873647, "grad_norm": 0.9536678791046143, "learning_rate": 9.554085380147423e-05, "loss": 4.2254, "step": 2100 }, { "epoch": 0.6188757091284167, "grad_norm": 0.892548680305481, "learning_rate": 9.475946136014378e-05, "loss": 4.1759, "step": 2400 }, { "epoch": 0.6962351727694688, "grad_norm": 0.9262155294418335, "learning_rate": 9.397806891881334e-05, "loss": 4.1328, "step": 2700 }, { "epoch": 0.7735946364105208, "grad_norm": 1.055438756942749, "learning_rate": 9.319667647748288e-05, "loss": 4.0732, "step": 3000 }, { "epoch": 0.850954100051573, "grad_norm": 1.0588972568511963, "learning_rate": 9.241528403615243e-05, "loss": 4.0574, "step": 3300 }, { "epoch": 0.9283135636926251, "grad_norm": 1.344167947769165, "learning_rate": 9.163389159482197e-05, "loss": 4.0472, "step": 3600 }, { "epoch": 1.0056730273336771, "grad_norm": 0.9573405981063843, "learning_rate": 9.085249915349152e-05, "loss": 3.972, "step": 3900 }, { "epoch": 1.0830324909747293, "grad_norm": 1.0597045421600342, "learning_rate": 9.007110671216108e-05, "loss": 3.8933, "step": 4200 }, { "epoch": 1.1603919546157813, "grad_norm": 1.1895560026168823, "learning_rate": 8.928971427083062e-05, "loss": 3.8657, "step": 4500 }, { "epoch": 1.2377514182568334, "grad_norm": 1.1971007585525513, "learning_rate": 8.850832182950017e-05, "loss": 3.8486, "step": 4800 }, { "epoch": 1.3151108818978856, "grad_norm": 1.2342840433120728, "learning_rate": 8.772692938816972e-05, "loss": 3.8417, "step": 5100 }, { "epoch": 1.3924703455389376, "grad_norm": 1.213428258895874, "learning_rate": 8.694553694683926e-05, "loss": 3.8048, "step": 5400 }, { "epoch": 1.4698298091799897, "grad_norm": 1.191662073135376, "learning_rate": 8.616414450550882e-05, "loss": 3.7818, "step": 5700 }, { "epoch": 1.5471892728210417, "grad_norm": 1.3016968965530396, "learning_rate": 8.538275206417838e-05, "loss": 3.7365, "step": 6000 }, { "epoch": 1.6245487364620939, "grad_norm": 1.179246187210083, "learning_rate": 8.460135962284792e-05, "loss": 3.7605, "step": 6300 }, { "epoch": 1.701908200103146, "grad_norm": 1.2382755279541016, "learning_rate": 8.381996718151747e-05, "loss": 3.6887, "step": 6600 }, { "epoch": 1.7792676637441982, "grad_norm": 1.209956169128418, "learning_rate": 8.303857474018702e-05, "loss": 3.7204, "step": 6900 }, { "epoch": 1.8566271273852502, "grad_norm": 1.119544267654419, "learning_rate": 8.225718229885656e-05, "loss": 3.6682, "step": 7200 }, { "epoch": 1.933986591026302, "grad_norm": 1.2890771627426147, "learning_rate": 8.147578985752612e-05, "loss": 3.6434, "step": 7500 }, { "epoch": 2.0113460546673543, "grad_norm": 1.2189580202102661, "learning_rate": 8.069439741619567e-05, "loss": 3.6477, "step": 7800 }, { "epoch": 2.0887055183084065, "grad_norm": 1.4200156927108765, "learning_rate": 7.991300497486521e-05, "loss": 3.5883, "step": 8100 }, { "epoch": 2.1660649819494586, "grad_norm": 1.1501333713531494, "learning_rate": 7.913161253353476e-05, "loss": 3.6023, "step": 8400 }, { "epoch": 2.2434244455905104, "grad_norm": 1.321439504623413, "learning_rate": 7.83502200922043e-05, "loss": 3.5765, "step": 8700 }, { "epoch": 2.3207839092315625, "grad_norm": 1.4532771110534668, "learning_rate": 7.756882765087386e-05, "loss": 3.5858, "step": 9000 }, { "epoch": 2.3981433728726147, "grad_norm": 1.2922136783599854, "learning_rate": 7.67874352095434e-05, "loss": 3.5483, "step": 9300 }, { "epoch": 2.475502836513667, "grad_norm": 1.459169864654541, "learning_rate": 7.600604276821297e-05, "loss": 3.551, "step": 9600 }, { "epoch": 2.552862300154719, "grad_norm": 1.3106615543365479, "learning_rate": 7.522465032688251e-05, "loss": 3.5216, "step": 9900 }, { "epoch": 2.630221763795771, "grad_norm": 1.4244039058685303, "learning_rate": 7.444325788555206e-05, "loss": 3.5251, "step": 10200 }, { "epoch": 2.707581227436823, "grad_norm": 1.3957465887069702, "learning_rate": 7.366186544422162e-05, "loss": 3.5245, "step": 10500 }, { "epoch": 2.784940691077875, "grad_norm": 1.4246965646743774, "learning_rate": 7.288047300289116e-05, "loss": 3.4938, "step": 10800 }, { "epoch": 2.8623001547189273, "grad_norm": 1.3009408712387085, "learning_rate": 7.209908056156071e-05, "loss": 3.4972, "step": 11100 }, { "epoch": 2.9396596183599795, "grad_norm": 1.2788194417953491, "learning_rate": 7.131768812023025e-05, "loss": 3.4835, "step": 11400 }, { "epoch": 3.0170190820010316, "grad_norm": 1.415262222290039, "learning_rate": 7.05362956788998e-05, "loss": 3.4686, "step": 11700 }, { "epoch": 3.0943785456420834, "grad_norm": 1.3552271127700806, "learning_rate": 6.975490323756934e-05, "loss": 3.4434, "step": 12000 }, { "epoch": 3.1717380092831355, "grad_norm": 1.2953003644943237, "learning_rate": 6.89735107962389e-05, "loss": 3.406, "step": 12300 }, { "epoch": 3.2490974729241877, "grad_norm": 1.2616957426071167, "learning_rate": 6.819211835490845e-05, "loss": 3.4219, "step": 12600 }, { "epoch": 3.32645693656524, "grad_norm": 1.3086093664169312, "learning_rate": 6.7410725913578e-05, "loss": 3.4327, "step": 12900 }, { "epoch": 3.403816400206292, "grad_norm": 1.5225331783294678, "learning_rate": 6.662933347224755e-05, "loss": 3.3881, "step": 13200 }, { "epoch": 3.4811758638473442, "grad_norm": 1.3017733097076416, "learning_rate": 6.58479410309171e-05, "loss": 3.4253, "step": 13500 }, { "epoch": 3.558535327488396, "grad_norm": 1.4945634603500366, "learning_rate": 6.506654858958666e-05, "loss": 3.3739, "step": 13800 }, { "epoch": 3.635894791129448, "grad_norm": 1.3506596088409424, "learning_rate": 6.42851561482562e-05, "loss": 3.3795, "step": 14100 }, { "epoch": 3.7132542547705003, "grad_norm": 1.3715941905975342, "learning_rate": 6.350376370692575e-05, "loss": 3.3621, "step": 14400 }, { "epoch": 3.7906137184115525, "grad_norm": 1.4353686571121216, "learning_rate": 6.27223712655953e-05, "loss": 3.3625, "step": 14700 }, { "epoch": 3.867973182052604, "grad_norm": 1.4907252788543701, "learning_rate": 6.194097882426484e-05, "loss": 3.3694, "step": 15000 }, { "epoch": 3.9453326456936564, "grad_norm": 1.3906782865524292, "learning_rate": 6.11595863829344e-05, "loss": 3.3778, "step": 15300 }, { "epoch": 4.0226921093347086, "grad_norm": 1.4113860130310059, "learning_rate": 6.0378193941603944e-05, "loss": 3.3418, "step": 15600 }, { "epoch": 4.100051572975761, "grad_norm": 1.371813416481018, "learning_rate": 5.959680150027349e-05, "loss": 3.2831, "step": 15900 }, { "epoch": 4.177411036616813, "grad_norm": 1.433017611503601, "learning_rate": 5.881540905894304e-05, "loss": 3.2937, "step": 16200 }, { "epoch": 4.254770500257865, "grad_norm": 1.454952597618103, "learning_rate": 5.803401661761259e-05, "loss": 3.2925, "step": 16500 }, { "epoch": 4.332129963898917, "grad_norm": 1.4268256425857544, "learning_rate": 5.725262417628213e-05, "loss": 3.3171, "step": 16800 }, { "epoch": 4.409489427539969, "grad_norm": 1.4231845140457153, "learning_rate": 5.647123173495169e-05, "loss": 3.3303, "step": 17100 }, { "epoch": 4.486848891181021, "grad_norm": 1.358296275138855, "learning_rate": 5.568983929362124e-05, "loss": 3.3273, "step": 17400 }, { "epoch": 4.564208354822073, "grad_norm": 1.4314409494400024, "learning_rate": 5.490844685229078e-05, "loss": 3.3027, "step": 17700 }, { "epoch": 4.641567818463125, "grad_norm": 1.447662353515625, "learning_rate": 5.4127054410960335e-05, "loss": 3.2779, "step": 18000 }, { "epoch": 4.718927282104177, "grad_norm": 1.498307466506958, "learning_rate": 5.334566196962988e-05, "loss": 3.2733, "step": 18300 }, { "epoch": 4.796286745745229, "grad_norm": 1.3249318599700928, "learning_rate": 5.256426952829944e-05, "loss": 3.2159, "step": 18600 }, { "epoch": 4.873646209386282, "grad_norm": 1.7372560501098633, "learning_rate": 5.1782877086968985e-05, "loss": 3.2769, "step": 18900 }, { "epoch": 4.951005673027334, "grad_norm": 1.475892186164856, "learning_rate": 5.100148464563853e-05, "loss": 3.2724, "step": 19200 }, { "epoch": 5.028365136668386, "grad_norm": 1.5225096940994263, "learning_rate": 5.0220092204308076e-05, "loss": 3.2132, "step": 19500 }, { "epoch": 5.105724600309438, "grad_norm": 1.5637928247451782, "learning_rate": 4.943869976297763e-05, "loss": 3.2313, "step": 19800 }, { "epoch": 5.18308406395049, "grad_norm": 1.5115944147109985, "learning_rate": 4.865730732164718e-05, "loss": 3.2163, "step": 20100 }, { "epoch": 5.260443527591542, "grad_norm": 1.4446969032287598, "learning_rate": 4.7875914880316726e-05, "loss": 3.1997, "step": 20400 }, { "epoch": 5.337802991232594, "grad_norm": 1.4487448930740356, "learning_rate": 4.709452243898628e-05, "loss": 3.2236, "step": 20700 }, { "epoch": 5.415162454873646, "grad_norm": 1.5380080938339233, "learning_rate": 4.6313129997655824e-05, "loss": 3.2076, "step": 21000 }, { "epoch": 5.492521918514698, "grad_norm": 1.4626458883285522, "learning_rate": 4.5531737556325376e-05, "loss": 3.2204, "step": 21300 }, { "epoch": 5.56988138215575, "grad_norm": 1.6070873737335205, "learning_rate": 4.475034511499492e-05, "loss": 3.182, "step": 21600 }, { "epoch": 5.647240845796802, "grad_norm": 1.5365498065948486, "learning_rate": 4.3968952673664474e-05, "loss": 3.1846, "step": 21900 }, { "epoch": 5.724600309437855, "grad_norm": 1.6350524425506592, "learning_rate": 4.3187560232334026e-05, "loss": 3.2233, "step": 22200 }, { "epoch": 5.801959773078907, "grad_norm": 1.5178848505020142, "learning_rate": 4.240616779100357e-05, "loss": 3.2046, "step": 22500 }, { "epoch": 5.879319236719959, "grad_norm": 1.5043169260025024, "learning_rate": 4.162477534967312e-05, "loss": 3.16, "step": 22800 }, { "epoch": 5.956678700361011, "grad_norm": 1.371469259262085, "learning_rate": 4.084338290834267e-05, "loss": 3.2134, "step": 23100 }, { "epoch": 6.034038164002063, "grad_norm": 1.660897970199585, "learning_rate": 4.0061990467012215e-05, "loss": 3.1417, "step": 23400 }, { "epoch": 6.111397627643115, "grad_norm": 1.6934055089950562, "learning_rate": 3.928059802568177e-05, "loss": 3.1623, "step": 23700 }, { "epoch": 6.188757091284167, "grad_norm": 1.6035997867584229, "learning_rate": 3.849920558435132e-05, "loss": 3.1501, "step": 24000 }, { "epoch": 6.266116554925219, "grad_norm": 1.618349313735962, "learning_rate": 3.7717813143020865e-05, "loss": 3.1305, "step": 24300 }, { "epoch": 6.343476018566271, "grad_norm": 1.519572377204895, "learning_rate": 3.693642070169042e-05, "loss": 3.1529, "step": 24600 }, { "epoch": 6.420835482207323, "grad_norm": 1.5830146074295044, "learning_rate": 3.615502826035996e-05, "loss": 3.1571, "step": 24900 }, { "epoch": 6.498194945848375, "grad_norm": 1.6157386302947998, "learning_rate": 3.537363581902951e-05, "loss": 3.1564, "step": 25200 }, { "epoch": 6.575554409489428, "grad_norm": 1.5344434976577759, "learning_rate": 3.459224337769906e-05, "loss": 3.1638, "step": 25500 }, { "epoch": 6.65291387313048, "grad_norm": 1.6386032104492188, "learning_rate": 3.381085093636861e-05, "loss": 3.0942, "step": 25800 }, { "epoch": 6.730273336771532, "grad_norm": 1.5561423301696777, "learning_rate": 3.302945849503816e-05, "loss": 3.121, "step": 26100 }, { "epoch": 6.807632800412584, "grad_norm": 1.6447923183441162, "learning_rate": 3.224806605370771e-05, "loss": 3.1108, "step": 26400 }, { "epoch": 6.884992264053636, "grad_norm": 1.6027878522872925, "learning_rate": 3.1466673612377256e-05, "loss": 3.1331, "step": 26700 }, { "epoch": 6.9623517276946885, "grad_norm": 1.6786209344863892, "learning_rate": 3.068528117104681e-05, "loss": 3.1515, "step": 27000 }, { "epoch": 7.03971119133574, "grad_norm": 1.725610613822937, "learning_rate": 2.9903888729716357e-05, "loss": 3.1025, "step": 27300 }, { "epoch": 7.117070654976792, "grad_norm": 1.6194796562194824, "learning_rate": 2.9122496288385903e-05, "loss": 3.0819, "step": 27600 }, { "epoch": 7.194430118617844, "grad_norm": 1.7126758098602295, "learning_rate": 2.8341103847055455e-05, "loss": 3.1056, "step": 27900 }, { "epoch": 7.271789582258896, "grad_norm": 1.610686182975769, "learning_rate": 2.7559711405725004e-05, "loss": 3.0932, "step": 28200 }, { "epoch": 7.349149045899948, "grad_norm": 1.6700507402420044, "learning_rate": 2.677831896439455e-05, "loss": 3.0938, "step": 28500 }, { "epoch": 7.426508509541001, "grad_norm": 1.5000895261764526, "learning_rate": 2.59969265230641e-05, "loss": 3.0911, "step": 28800 }, { "epoch": 7.503867973182053, "grad_norm": 1.6568007469177246, "learning_rate": 2.521553408173365e-05, "loss": 3.0938, "step": 29100 }, { "epoch": 7.581227436823105, "grad_norm": 1.7494336366653442, "learning_rate": 2.44341416404032e-05, "loss": 3.0711, "step": 29400 }, { "epoch": 7.658586900464157, "grad_norm": 1.7158912420272827, "learning_rate": 2.3652749199072748e-05, "loss": 3.072, "step": 29700 }, { "epoch": 7.735946364105208, "grad_norm": 1.7721878290176392, "learning_rate": 2.2871356757742297e-05, "loss": 3.1069, "step": 30000 }, { "epoch": 7.813305827746261, "grad_norm": 1.5379910469055176, "learning_rate": 2.2089964316411846e-05, "loss": 3.0882, "step": 30300 }, { "epoch": 7.890665291387313, "grad_norm": 1.6254152059555054, "learning_rate": 2.1308571875081395e-05, "loss": 3.0506, "step": 30600 }, { "epoch": 7.968024755028365, "grad_norm": 1.6591140031814575, "learning_rate": 2.0527179433750944e-05, "loss": 3.0912, "step": 30900 }, { "epoch": 8.045384218669417, "grad_norm": 1.4908177852630615, "learning_rate": 1.9745786992420496e-05, "loss": 3.0567, "step": 31200 }, { "epoch": 8.12274368231047, "grad_norm": 1.6893351078033447, "learning_rate": 1.896439455109004e-05, "loss": 3.0538, "step": 31500 }, { "epoch": 8.200103145951521, "grad_norm": 1.6335694789886475, "learning_rate": 1.818300210975959e-05, "loss": 3.0596, "step": 31800 }, { "epoch": 8.277462609592574, "grad_norm": 1.814844012260437, "learning_rate": 1.7401609668429143e-05, "loss": 3.0789, "step": 32100 }, { "epoch": 8.354822073233626, "grad_norm": 1.666052222251892, "learning_rate": 1.662021722709869e-05, "loss": 3.0435, "step": 32400 }, { "epoch": 8.432181536874678, "grad_norm": 1.8534607887268066, "learning_rate": 1.5838824785768237e-05, "loss": 3.0542, "step": 32700 }, { "epoch": 8.50954100051573, "grad_norm": 1.8089135885238647, "learning_rate": 1.5057432344437788e-05, "loss": 3.0435, "step": 33000 }, { "epoch": 8.586900464156782, "grad_norm": 1.5717253684997559, "learning_rate": 1.4276039903107338e-05, "loss": 3.0323, "step": 33300 }, { "epoch": 8.664259927797834, "grad_norm": 1.681136131286621, "learning_rate": 1.3494647461776889e-05, "loss": 3.0528, "step": 33600 }, { "epoch": 8.741619391438887, "grad_norm": 1.700218915939331, "learning_rate": 1.2713255020446434e-05, "loss": 3.0454, "step": 33900 }, { "epoch": 8.818978855079939, "grad_norm": 1.8672676086425781, "learning_rate": 1.1931862579115985e-05, "loss": 3.0757, "step": 34200 }, { "epoch": 8.896338318720991, "grad_norm": 1.7094194889068604, "learning_rate": 1.1150470137785534e-05, "loss": 3.0514, "step": 34500 }, { "epoch": 8.973697782362041, "grad_norm": 1.7016539573669434, "learning_rate": 1.0369077696455083e-05, "loss": 3.022, "step": 34800 }, { "epoch": 9.051057246003094, "grad_norm": 1.7859755754470825, "learning_rate": 9.587685255124633e-06, "loss": 3.0189, "step": 35100 }, { "epoch": 9.128416709644146, "grad_norm": 1.6786860227584839, "learning_rate": 8.80629281379418e-06, "loss": 3.0062, "step": 35400 }, { "epoch": 9.205776173285198, "grad_norm": 1.7441751956939697, "learning_rate": 8.024900372463731e-06, "loss": 3.0036, "step": 35700 }, { "epoch": 9.28313563692625, "grad_norm": 1.6931071281433105, "learning_rate": 7.243507931133279e-06, "loss": 3.0179, "step": 36000 }, { "epoch": 9.360495100567302, "grad_norm": 1.5787148475646973, "learning_rate": 6.46211548980283e-06, "loss": 3.045, "step": 36300 }, { "epoch": 9.437854564208354, "grad_norm": 1.8229496479034424, "learning_rate": 5.680723048472378e-06, "loss": 3.0025, "step": 36600 }, { "epoch": 9.515214027849407, "grad_norm": 1.8122637271881104, "learning_rate": 4.899330607141927e-06, "loss": 3.0239, "step": 36900 }, { "epoch": 9.592573491490459, "grad_norm": 1.5085257291793823, "learning_rate": 4.117938165811476e-06, "loss": 3.0226, "step": 37200 }, { "epoch": 9.669932955131511, "grad_norm": 1.8228789567947388, "learning_rate": 3.336545724481025e-06, "loss": 3.0286, "step": 37500 }, { "epoch": 9.747292418772563, "grad_norm": 1.5136455297470093, "learning_rate": 2.5551532831505747e-06, "loss": 3.0184, "step": 37800 }, { "epoch": 9.824651882413615, "grad_norm": 1.7498648166656494, "learning_rate": 1.7737608418201238e-06, "loss": 3.002, "step": 38100 }, { "epoch": 9.902011346054667, "grad_norm": 1.625130534172058, "learning_rate": 9.923684004896727e-07, "loss": 3.0423, "step": 38400 }, { "epoch": 9.97937080969572, "grad_norm": 1.782974362373352, "learning_rate": 2.1097595915922174e-07, "loss": 3.023, "step": 38700 } ], "logging_steps": 300, "max_steps": 38780, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.20181986164736e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }