{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.969088098918084, "eval_steps": 500, "global_step": 38700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07727975270479134, "grad_norm": 0.7823086380958557, "learning_rate": 7.70618556701031e-05, "loss": 8.8779, "step": 300 }, { "epoch": 0.1545595054095827, "grad_norm": 0.9648156762123108, "learning_rate": 9.945097835137386e-05, "loss": 7.1769, "step": 600 }, { "epoch": 0.23183925811437403, "grad_norm": 1.250138521194458, "learning_rate": 9.867037885095755e-05, "loss": 6.8295, "step": 900 }, { "epoch": 0.3091190108191654, "grad_norm": 1.0978425741195679, "learning_rate": 9.788977935054123e-05, "loss": 6.6388, "step": 1200 }, { "epoch": 0.38639876352395675, "grad_norm": 0.9411094188690186, "learning_rate": 9.71091798501249e-05, "loss": 6.5125, "step": 1500 }, { "epoch": 0.46367851622874806, "grad_norm": 0.9909970760345459, "learning_rate": 9.632858034970859e-05, "loss": 6.4121, "step": 1800 }, { "epoch": 0.5409582689335394, "grad_norm": 1.0140202045440674, "learning_rate": 9.554798084929226e-05, "loss": 6.3059, "step": 2100 }, { "epoch": 0.6182380216383307, "grad_norm": 1.0913721323013306, "learning_rate": 9.476738134887594e-05, "loss": 6.2234, "step": 2400 }, { "epoch": 0.6955177743431221, "grad_norm": 1.2915701866149902, "learning_rate": 9.398678184845962e-05, "loss": 6.1446, "step": 2700 }, { "epoch": 0.7727975270479135, "grad_norm": 1.2342078685760498, "learning_rate": 9.32061823480433e-05, "loss": 6.0817, "step": 3000 }, { "epoch": 0.8500772797527048, "grad_norm": 1.219164252281189, "learning_rate": 9.242558284762698e-05, "loss": 6.0084, "step": 3300 }, { "epoch": 0.9273570324574961, "grad_norm": 1.2726490497589111, "learning_rate": 9.164498334721067e-05, "loss": 5.9507, "step": 3600 }, { "epoch": 1.0046367851622875, "grad_norm": 1.4035961627960205, "learning_rate": 9.086438384679435e-05, "loss": 5.8927, "step": 3900 }, { "epoch": 1.0819165378670788, "grad_norm": 1.2813678979873657, "learning_rate": 9.008378434637802e-05, "loss": 5.8083, "step": 4200 }, { "epoch": 1.1591962905718702, "grad_norm": 1.3927570581436157, "learning_rate": 8.93031848459617e-05, "loss": 5.7591, "step": 4500 }, { "epoch": 1.2364760432766615, "grad_norm": 1.3866077661514282, "learning_rate": 8.852258534554538e-05, "loss": 5.7344, "step": 4800 }, { "epoch": 1.3137557959814528, "grad_norm": 1.3302923440933228, "learning_rate": 8.774198584512905e-05, "loss": 5.6947, "step": 5100 }, { "epoch": 1.3910355486862442, "grad_norm": 1.3887799978256226, "learning_rate": 8.696138634471274e-05, "loss": 5.66, "step": 5400 }, { "epoch": 1.4683153013910355, "grad_norm": 1.4702309370040894, "learning_rate": 8.618078684429643e-05, "loss": 5.6274, "step": 5700 }, { "epoch": 1.545595054095827, "grad_norm": 1.5133150815963745, "learning_rate": 8.54001873438801e-05, "loss": 5.6, "step": 6000 }, { "epoch": 1.6228748068006182, "grad_norm": 1.4330689907073975, "learning_rate": 8.461958784346379e-05, "loss": 5.5547, "step": 6300 }, { "epoch": 1.7001545595054095, "grad_norm": 1.4223123788833618, "learning_rate": 8.383898834304746e-05, "loss": 5.5333, "step": 6600 }, { "epoch": 1.7774343122102008, "grad_norm": 1.4957046508789062, "learning_rate": 8.305838884263114e-05, "loss": 5.5098, "step": 6900 }, { "epoch": 1.8547140649149922, "grad_norm": 1.486960768699646, "learning_rate": 8.227778934221483e-05, "loss": 5.4852, "step": 7200 }, { "epoch": 1.9319938176197837, "grad_norm": 1.4700745344161987, "learning_rate": 8.14971898417985e-05, "loss": 5.4687, "step": 7500 }, { "epoch": 2.009273570324575, "grad_norm": 1.5500389337539673, "learning_rate": 8.071659034138219e-05, "loss": 5.4279, "step": 7800 }, { "epoch": 2.0865533230293662, "grad_norm": 1.603614091873169, "learning_rate": 7.993599084096587e-05, "loss": 5.3462, "step": 8100 }, { "epoch": 2.1638330757341575, "grad_norm": 1.4903242588043213, "learning_rate": 7.915539134054955e-05, "loss": 5.341, "step": 8400 }, { "epoch": 2.2411128284389488, "grad_norm": 1.534023404121399, "learning_rate": 7.837479184013322e-05, "loss": 5.3166, "step": 8700 }, { "epoch": 2.3183925811437405, "grad_norm": 1.64164400100708, "learning_rate": 7.759419233971691e-05, "loss": 5.3076, "step": 9000 }, { "epoch": 2.3956723338485317, "grad_norm": 1.6406952142715454, "learning_rate": 7.681359283930058e-05, "loss": 5.2879, "step": 9300 }, { "epoch": 2.472952086553323, "grad_norm": 1.5534299612045288, "learning_rate": 7.603299333888426e-05, "loss": 5.2685, "step": 9600 }, { "epoch": 2.5502318392581143, "grad_norm": 1.6569850444793701, "learning_rate": 7.525239383846795e-05, "loss": 5.2539, "step": 9900 }, { "epoch": 2.6275115919629055, "grad_norm": 1.6819453239440918, "learning_rate": 7.447179433805162e-05, "loss": 5.2531, "step": 10200 }, { "epoch": 2.704791344667697, "grad_norm": 1.5832303762435913, "learning_rate": 7.369119483763531e-05, "loss": 5.2187, "step": 10500 }, { "epoch": 2.7820710973724885, "grad_norm": 1.6432205438613892, "learning_rate": 7.2910595337219e-05, "loss": 5.2298, "step": 10800 }, { "epoch": 2.8593508500772797, "grad_norm": 1.604533314704895, "learning_rate": 7.212999583680267e-05, "loss": 5.2024, "step": 11100 }, { "epoch": 2.936630602782071, "grad_norm": 1.5682111978530884, "learning_rate": 7.134939633638634e-05, "loss": 5.1857, "step": 11400 }, { "epoch": 3.0139103554868623, "grad_norm": 1.7060903310775757, "learning_rate": 7.056879683597003e-05, "loss": 5.1547, "step": 11700 }, { "epoch": 3.091190108191654, "grad_norm": 1.7026489973068237, "learning_rate": 6.97881973355537e-05, "loss": 5.0767, "step": 12000 }, { "epoch": 3.1684698608964452, "grad_norm": 1.7393746376037598, "learning_rate": 6.900759783513739e-05, "loss": 5.0796, "step": 12300 }, { "epoch": 3.2457496136012365, "grad_norm": 1.7103971242904663, "learning_rate": 6.822699833472106e-05, "loss": 5.0745, "step": 12600 }, { "epoch": 3.3230293663060277, "grad_norm": 1.6574651002883911, "learning_rate": 6.744639883430475e-05, "loss": 5.0685, "step": 12900 }, { "epoch": 3.400309119010819, "grad_norm": 1.7125356197357178, "learning_rate": 6.666579933388844e-05, "loss": 5.0585, "step": 13200 }, { "epoch": 3.4775888717156107, "grad_norm": 1.6284891366958618, "learning_rate": 6.588519983347211e-05, "loss": 5.0563, "step": 13500 }, { "epoch": 3.554868624420402, "grad_norm": 1.8488448858261108, "learning_rate": 6.510460033305579e-05, "loss": 5.0448, "step": 13800 }, { "epoch": 3.6321483771251932, "grad_norm": 1.7912287712097168, "learning_rate": 6.432400083263948e-05, "loss": 5.0311, "step": 14100 }, { "epoch": 3.7094281298299845, "grad_norm": 1.7133835554122925, "learning_rate": 6.354340133222315e-05, "loss": 5.0296, "step": 14400 }, { "epoch": 3.7867078825347757, "grad_norm": 1.691361904144287, "learning_rate": 6.276280183180682e-05, "loss": 5.0148, "step": 14700 }, { "epoch": 3.8639876352395675, "grad_norm": 1.7667992115020752, "learning_rate": 6.198220233139051e-05, "loss": 5.0047, "step": 15000 }, { "epoch": 3.9412673879443587, "grad_norm": 1.6980960369110107, "learning_rate": 6.12016028309742e-05, "loss": 4.9993, "step": 15300 }, { "epoch": 4.01854714064915, "grad_norm": 1.7540613412857056, "learning_rate": 6.0421003330557865e-05, "loss": 4.9747, "step": 15600 }, { "epoch": 4.095826893353942, "grad_norm": 1.7740099430084229, "learning_rate": 5.964040383014155e-05, "loss": 4.9056, "step": 15900 }, { "epoch": 4.1731066460587325, "grad_norm": 1.792930245399475, "learning_rate": 5.885980432972523e-05, "loss": 4.8974, "step": 16200 }, { "epoch": 4.250386398763524, "grad_norm": 1.9168815612792969, "learning_rate": 5.807920482930891e-05, "loss": 4.8905, "step": 16500 }, { "epoch": 4.327666151468315, "grad_norm": 1.8642754554748535, "learning_rate": 5.7298605328892594e-05, "loss": 4.8985, "step": 16800 }, { "epoch": 4.404945904173107, "grad_norm": 1.864404320716858, "learning_rate": 5.6518005828476275e-05, "loss": 4.898, "step": 17100 }, { "epoch": 4.4822256568778975, "grad_norm": 1.7236896753311157, "learning_rate": 5.573740632805995e-05, "loss": 4.8897, "step": 17400 }, { "epoch": 4.559505409582689, "grad_norm": 1.785808801651001, "learning_rate": 5.4956806827643636e-05, "loss": 4.8785, "step": 17700 }, { "epoch": 4.636785162287481, "grad_norm": 1.7861013412475586, "learning_rate": 5.417620732722731e-05, "loss": 4.8745, "step": 18000 }, { "epoch": 4.714064914992272, "grad_norm": 1.796295166015625, "learning_rate": 5.339560782681099e-05, "loss": 4.8789, "step": 18300 }, { "epoch": 4.7913446676970635, "grad_norm": 2.020608901977539, "learning_rate": 5.261500832639468e-05, "loss": 4.8753, "step": 18600 }, { "epoch": 4.868624420401854, "grad_norm": 1.8208060264587402, "learning_rate": 5.183440882597835e-05, "loss": 4.8755, "step": 18900 }, { "epoch": 4.945904173106646, "grad_norm": 2.2022621631622314, "learning_rate": 5.105380932556203e-05, "loss": 4.8692, "step": 19200 }, { "epoch": 5.023183925811438, "grad_norm": 1.8397164344787598, "learning_rate": 5.027320982514572e-05, "loss": 4.8313, "step": 19500 }, { "epoch": 5.1004636785162285, "grad_norm": 1.9096678495407104, "learning_rate": 4.9492610324729395e-05, "loss": 4.7779, "step": 19800 }, { "epoch": 5.17774343122102, "grad_norm": 1.9197343587875366, "learning_rate": 4.8712010824313075e-05, "loss": 4.786, "step": 20100 }, { "epoch": 5.255023183925811, "grad_norm": 1.9042634963989258, "learning_rate": 4.7931411323896756e-05, "loss": 4.7738, "step": 20400 }, { "epoch": 5.332302936630603, "grad_norm": 1.9367069005966187, "learning_rate": 4.7150811823480437e-05, "loss": 4.7644, "step": 20700 }, { "epoch": 5.409582689335394, "grad_norm": 1.8912991285324097, "learning_rate": 4.637021232306412e-05, "loss": 4.7681, "step": 21000 }, { "epoch": 5.486862442040185, "grad_norm": 1.906936764717102, "learning_rate": 4.55896128226478e-05, "loss": 4.7725, "step": 21300 }, { "epoch": 5.564142194744977, "grad_norm": 1.9264295101165771, "learning_rate": 4.480901332223147e-05, "loss": 4.78, "step": 21600 }, { "epoch": 5.641421947449768, "grad_norm": 1.919461727142334, "learning_rate": 4.402841382181516e-05, "loss": 4.7594, "step": 21900 }, { "epoch": 5.7187017001545595, "grad_norm": 2.101870059967041, "learning_rate": 4.324781432139884e-05, "loss": 4.7544, "step": 22200 }, { "epoch": 5.795981452859351, "grad_norm": 1.9173074960708618, "learning_rate": 4.2467214820982514e-05, "loss": 4.7367, "step": 22500 }, { "epoch": 5.873261205564142, "grad_norm": 1.9772671461105347, "learning_rate": 4.1686615320566195e-05, "loss": 4.7492, "step": 22800 }, { "epoch": 5.950540958268934, "grad_norm": 2.0612528324127197, "learning_rate": 4.090601582014988e-05, "loss": 4.7568, "step": 23100 }, { "epoch": 6.0278207109737245, "grad_norm": 1.9577621221542358, "learning_rate": 4.0125416319733556e-05, "loss": 4.7241, "step": 23400 }, { "epoch": 6.105100463678516, "grad_norm": 2.115077257156372, "learning_rate": 3.934481681931724e-05, "loss": 4.6697, "step": 23700 }, { "epoch": 6.182380216383308, "grad_norm": 1.9163872003555298, "learning_rate": 3.856421731890092e-05, "loss": 4.6675, "step": 24000 }, { "epoch": 6.259659969088099, "grad_norm": 2.0956552028656006, "learning_rate": 3.77836178184846e-05, "loss": 4.6686, "step": 24300 }, { "epoch": 6.3369397217928904, "grad_norm": 2.0884017944335938, "learning_rate": 3.700301831806828e-05, "loss": 4.6678, "step": 24600 }, { "epoch": 6.414219474497681, "grad_norm": 2.037775993347168, "learning_rate": 3.622241881765196e-05, "loss": 4.6747, "step": 24900 }, { "epoch": 6.491499227202473, "grad_norm": 1.9964643716812134, "learning_rate": 3.544181931723563e-05, "loss": 4.6882, "step": 25200 }, { "epoch": 6.568778979907265, "grad_norm": 1.9362142086029053, "learning_rate": 3.466121981681932e-05, "loss": 4.6888, "step": 25500 }, { "epoch": 6.6460587326120555, "grad_norm": 2.003726005554199, "learning_rate": 3.3880620316403e-05, "loss": 4.6601, "step": 25800 }, { "epoch": 6.723338485316847, "grad_norm": 2.019465208053589, "learning_rate": 3.310002081598668e-05, "loss": 4.6749, "step": 26100 }, { "epoch": 6.800618238021638, "grad_norm": 2.1456525325775146, "learning_rate": 3.2319421315570356e-05, "loss": 4.6648, "step": 26400 }, { "epoch": 6.87789799072643, "grad_norm": 2.0015764236450195, "learning_rate": 3.1538821815154043e-05, "loss": 4.6735, "step": 26700 }, { "epoch": 6.955177743431221, "grad_norm": 1.9557108879089355, "learning_rate": 3.0758222314737724e-05, "loss": 4.6652, "step": 27000 }, { "epoch": 7.032457496136012, "grad_norm": 2.0976319313049316, "learning_rate": 2.9977622814321398e-05, "loss": 4.6445, "step": 27300 }, { "epoch": 7.109737248840804, "grad_norm": 2.0086190700531006, "learning_rate": 2.9197023313905082e-05, "loss": 4.5983, "step": 27600 }, { "epoch": 7.187017001545595, "grad_norm": 1.9664223194122314, "learning_rate": 2.8416423813488763e-05, "loss": 4.6066, "step": 27900 }, { "epoch": 7.2642967542503865, "grad_norm": 1.9912647008895874, "learning_rate": 2.763582431307244e-05, "loss": 4.5991, "step": 28200 }, { "epoch": 7.341576506955178, "grad_norm": 2.009106397628784, "learning_rate": 2.685522481265612e-05, "loss": 4.5851, "step": 28500 }, { "epoch": 7.418856259659969, "grad_norm": 2.167221784591675, "learning_rate": 2.6074625312239805e-05, "loss": 4.6036, "step": 28800 }, { "epoch": 7.496136012364761, "grad_norm": 2.011990785598755, "learning_rate": 2.529402581182348e-05, "loss": 4.5995, "step": 29100 }, { "epoch": 7.5734157650695515, "grad_norm": 2.0887067317962646, "learning_rate": 2.4513426311407163e-05, "loss": 4.6041, "step": 29400 }, { "epoch": 7.650695517774343, "grad_norm": 2.0609374046325684, "learning_rate": 2.373282681099084e-05, "loss": 4.6159, "step": 29700 }, { "epoch": 7.727975270479135, "grad_norm": 2.158097743988037, "learning_rate": 2.2952227310574524e-05, "loss": 4.5919, "step": 30000 }, { "epoch": 7.805255023183926, "grad_norm": 2.091304302215576, "learning_rate": 2.21716278101582e-05, "loss": 4.5995, "step": 30300 }, { "epoch": 7.882534775888717, "grad_norm": 2.134960651397705, "learning_rate": 2.1391028309741886e-05, "loss": 4.6093, "step": 30600 }, { "epoch": 7.959814528593508, "grad_norm": 2.0260465145111084, "learning_rate": 2.0610428809325563e-05, "loss": 4.598, "step": 30900 }, { "epoch": 8.0370942812983, "grad_norm": 2.1602635383605957, "learning_rate": 1.9829829308909244e-05, "loss": 4.5441, "step": 31200 }, { "epoch": 8.114374034003092, "grad_norm": 2.1607472896575928, "learning_rate": 1.9049229808492924e-05, "loss": 4.53, "step": 31500 }, { "epoch": 8.191653786707883, "grad_norm": 2.1066789627075195, "learning_rate": 1.8268630308076605e-05, "loss": 4.5494, "step": 31800 }, { "epoch": 8.268933539412673, "grad_norm": 2.367823600769043, "learning_rate": 1.7488030807660282e-05, "loss": 4.5495, "step": 32100 }, { "epoch": 8.346213292117465, "grad_norm": 2.0811874866485596, "learning_rate": 1.6707431307243966e-05, "loss": 4.5578, "step": 32400 }, { "epoch": 8.423493044822257, "grad_norm": 2.1922411918640137, "learning_rate": 1.5926831806827644e-05, "loss": 4.5516, "step": 32700 }, { "epoch": 8.500772797527048, "grad_norm": 1.9728052616119385, "learning_rate": 1.5146232306411326e-05, "loss": 4.5536, "step": 33000 }, { "epoch": 8.578052550231838, "grad_norm": 2.1590144634246826, "learning_rate": 1.4365632805995005e-05, "loss": 4.5389, "step": 33300 }, { "epoch": 8.65533230293663, "grad_norm": 2.170055389404297, "learning_rate": 1.3585033305578684e-05, "loss": 4.5369, "step": 33600 }, { "epoch": 8.732612055641422, "grad_norm": 2.144286632537842, "learning_rate": 1.2804433805162366e-05, "loss": 4.5571, "step": 33900 }, { "epoch": 8.809891808346213, "grad_norm": 2.1359333992004395, "learning_rate": 1.2023834304746045e-05, "loss": 4.5517, "step": 34200 }, { "epoch": 8.887171561051005, "grad_norm": 2.076216220855713, "learning_rate": 1.1243234804329726e-05, "loss": 4.5474, "step": 34500 }, { "epoch": 8.964451313755795, "grad_norm": 2.1602680683135986, "learning_rate": 1.0462635303913405e-05, "loss": 4.5278, "step": 34800 }, { "epoch": 9.041731066460587, "grad_norm": 2.1748087406158447, "learning_rate": 9.682035803497086e-06, "loss": 4.5316, "step": 35100 }, { "epoch": 9.119010819165378, "grad_norm": 2.262298107147217, "learning_rate": 8.901436303080766e-06, "loss": 4.5028, "step": 35400 }, { "epoch": 9.19629057187017, "grad_norm": 2.1434860229492188, "learning_rate": 8.120836802664447e-06, "loss": 4.4978, "step": 35700 }, { "epoch": 9.273570324574962, "grad_norm": 2.154770851135254, "learning_rate": 7.340237302248126e-06, "loss": 4.5075, "step": 36000 }, { "epoch": 9.350850077279752, "grad_norm": 2.1110429763793945, "learning_rate": 6.559637801831807e-06, "loss": 4.5127, "step": 36300 }, { "epoch": 9.428129829984544, "grad_norm": 2.2476682662963867, "learning_rate": 5.7790383014154874e-06, "loss": 4.4908, "step": 36600 }, { "epoch": 9.505409582689335, "grad_norm": 2.1349868774414062, "learning_rate": 4.998438800999167e-06, "loss": 4.5032, "step": 36900 }, { "epoch": 9.582689335394127, "grad_norm": 2.139244556427002, "learning_rate": 4.217839300582848e-06, "loss": 4.4968, "step": 37200 }, { "epoch": 9.659969088098919, "grad_norm": 2.3121719360351562, "learning_rate": 3.437239800166528e-06, "loss": 4.5214, "step": 37500 }, { "epoch": 9.737248840803709, "grad_norm": 2.07022762298584, "learning_rate": 2.656640299750208e-06, "loss": 4.5042, "step": 37800 }, { "epoch": 9.8145285935085, "grad_norm": 2.230458974838257, "learning_rate": 1.8760407993338883e-06, "loss": 4.5125, "step": 38100 }, { "epoch": 9.891808346213292, "grad_norm": 2.098798990249634, "learning_rate": 1.0954412989175688e-06, "loss": 4.5148, "step": 38400 }, { "epoch": 9.969088098918084, "grad_norm": 2.1367547512054443, "learning_rate": 3.14841798501249e-07, "loss": 4.5073, "step": 38700 } ], "logging_steps": 300, "max_steps": 38820, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.199135204573184e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }