{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 331, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015105740181268883, "grad_norm": 65.32524871826172, "learning_rate": 2.3529411764705885e-06, "loss": 1.7699, "step": 5 }, { "epoch": 0.030211480362537766, "grad_norm": 1.1226208209991455, "learning_rate": 5.294117647058824e-06, "loss": 0.3128, "step": 10 }, { "epoch": 0.045317220543806644, "grad_norm": 1.0650861263275146, "learning_rate": 8.23529411764706e-06, "loss": 0.0691, "step": 15 }, { "epoch": 0.06042296072507553, "grad_norm": 0.5207144021987915, "learning_rate": 1.1176470588235295e-05, "loss": 0.0675, "step": 20 }, { "epoch": 0.0755287009063444, "grad_norm": 0.518444299697876, "learning_rate": 1.4117647058823532e-05, "loss": 0.0669, "step": 25 }, { "epoch": 0.09063444108761329, "grad_norm": 0.43817996978759766, "learning_rate": 1.7058823529411767e-05, "loss": 0.0658, "step": 30 }, { "epoch": 0.10574018126888217, "grad_norm": 1.0586720705032349, "learning_rate": 2e-05, "loss": 0.0648, "step": 35 }, { "epoch": 0.12084592145015106, "grad_norm": 1.3109872341156006, "learning_rate": 1.9986017152454497e-05, "loss": 0.0657, "step": 40 }, { "epoch": 0.13595166163141995, "grad_norm": 0.48269444704055786, "learning_rate": 1.9944107713823068e-05, "loss": 0.0609, "step": 45 }, { "epoch": 0.1510574018126888, "grad_norm": 0.9956803917884827, "learning_rate": 1.9874388886763944e-05, "loss": 0.0712, "step": 50 }, { "epoch": 0.1661631419939577, "grad_norm": 0.8316070437431335, "learning_rate": 1.9777055644823087e-05, "loss": 0.0669, "step": 55 }, { "epoch": 0.18126888217522658, "grad_norm": 0.09602731466293335, "learning_rate": 1.9652380187177128e-05, "loss": 0.0623, "step": 60 }, { "epoch": 0.19637462235649547, "grad_norm": 0.23348145186901093, "learning_rate": 1.9500711177409456e-05, "loss": 0.0561, "step": 65 }, { "epoch": 0.21148036253776434, "grad_norm": 0.1733577698469162, "learning_rate": 1.932247276844826e-05, "loss": 0.0614, "step": 70 }, { "epoch": 0.22658610271903323, "grad_norm": 0.21881505846977234, "learning_rate": 1.9118163416393392e-05, "loss": 0.0567, "step": 75 }, { "epoch": 0.24169184290030213, "grad_norm": 0.3463956117630005, "learning_rate": 1.8888354486549238e-05, "loss": 0.0598, "step": 80 }, { "epoch": 0.256797583081571, "grad_norm": 0.11425940692424774, "learning_rate": 1.863368865556191e-05, "loss": 0.0564, "step": 85 }, { "epoch": 0.2719033232628399, "grad_norm": 0.15162110328674316, "learning_rate": 1.8354878114129368e-05, "loss": 0.056, "step": 90 }, { "epoch": 0.28700906344410876, "grad_norm": 0.1482110172510147, "learning_rate": 1.8052702575310588e-05, "loss": 0.0493, "step": 95 }, { "epoch": 0.3021148036253776, "grad_norm": 0.36696767807006836, "learning_rate": 1.772800709400383e-05, "loss": 0.048, "step": 100 }, { "epoch": 0.31722054380664655, "grad_norm": 0.25465500354766846, "learning_rate": 1.7381699703691866e-05, "loss": 0.0537, "step": 105 }, { "epoch": 0.3323262839879154, "grad_norm": 0.38290655612945557, "learning_rate": 1.7014748877063212e-05, "loss": 0.0537, "step": 110 }, { "epoch": 0.3474320241691843, "grad_norm": 0.3702133297920227, "learning_rate": 1.6628180817610963e-05, "loss": 0.051, "step": 115 }, { "epoch": 0.36253776435045315, "grad_norm": 0.1437833160161972, "learning_rate": 1.6223076589783368e-05, "loss": 0.0473, "step": 120 }, { "epoch": 0.3776435045317221, "grad_norm": 0.15008145570755005, "learning_rate": 1.5800569095711983e-05, "loss": 0.0509, "step": 125 }, { "epoch": 0.39274924471299094, "grad_norm": 0.15623906254768372, "learning_rate": 1.5361839906972095e-05, "loss": 0.0577, "step": 130 }, { "epoch": 0.4078549848942598, "grad_norm": 0.10994814336299896, "learning_rate": 1.4908115960235683e-05, "loss": 0.0513, "step": 135 }, { "epoch": 0.4229607250755287, "grad_norm": 0.15456193685531616, "learning_rate": 1.4440666126057743e-05, "loss": 0.0572, "step": 140 }, { "epoch": 0.4380664652567976, "grad_norm": 0.16252553462982178, "learning_rate": 1.396079766039157e-05, "loss": 0.0538, "step": 145 }, { "epoch": 0.45317220543806647, "grad_norm": 0.4133623540401459, "learning_rate": 1.3469852548756626e-05, "loss": 0.0512, "step": 150 }, { "epoch": 0.46827794561933533, "grad_norm": 0.1537550389766693, "learning_rate": 1.296920375328275e-05, "loss": 0.0494, "step": 155 }, { "epoch": 0.48338368580060426, "grad_norm": 0.5091661214828491, "learning_rate": 1.2460251373126136e-05, "loss": 0.0513, "step": 160 }, { "epoch": 0.4984894259818731, "grad_norm": 0.18158994615077972, "learning_rate": 1.194441872899471e-05, "loss": 0.046, "step": 165 }, { "epoch": 0.513595166163142, "grad_norm": 0.39592429995536804, "learning_rate": 1.1423148382732854e-05, "loss": 0.05, "step": 170 }, { "epoch": 0.5287009063444109, "grad_norm": 0.22556884586811066, "learning_rate": 1.0897898103096917e-05, "loss": 0.0485, "step": 175 }, { "epoch": 0.5438066465256798, "grad_norm": 0.35634785890579224, "learning_rate": 1.0370136789003582e-05, "loss": 0.0458, "step": 180 }, { "epoch": 0.5589123867069486, "grad_norm": 0.5002294778823853, "learning_rate": 9.841340361651921e-06, "loss": 0.0471, "step": 185 }, { "epoch": 0.5740181268882175, "grad_norm": 0.1892608106136322, "learning_rate": 9.312987637007191e-06, "loss": 0.0397, "step": 190 }, { "epoch": 0.5891238670694864, "grad_norm": 0.2997090816497803, "learning_rate": 8.786556190189183e-06, "loss": 0.0432, "step": 195 }, { "epoch": 0.6042296072507553, "grad_norm": 0.10828253626823425, "learning_rate": 8.263518223330698e-06, "loss": 0.0456, "step": 200 }, { "epoch": 0.6193353474320241, "grad_norm": 0.11140269786119461, "learning_rate": 7.745336448461958e-06, "loss": 0.0464, "step": 205 }, { "epoch": 0.6344410876132931, "grad_norm": 0.21382257342338562, "learning_rate": 7.233459996934731e-06, "loss": 0.0436, "step": 210 }, { "epoch": 0.649546827794562, "grad_norm": 0.22236904501914978, "learning_rate": 6.729320366825785e-06, "loss": 0.0395, "step": 215 }, { "epoch": 0.6646525679758308, "grad_norm": 0.18689888715744019, "learning_rate": 6.234327419653013e-06, "loss": 0.0434, "step": 220 }, { "epoch": 0.6797583081570997, "grad_norm": 0.11527472734451294, "learning_rate": 5.749865437599703e-06, "loss": 0.0448, "step": 225 }, { "epoch": 0.6948640483383686, "grad_norm": 0.12977807223796844, "learning_rate": 5.277289252273175e-06, "loss": 0.0443, "step": 230 }, { "epoch": 0.7099697885196374, "grad_norm": 0.28918567299842834, "learning_rate": 4.817920455824045e-06, "loss": 0.042, "step": 235 }, { "epoch": 0.7250755287009063, "grad_norm": 0.1708402782678604, "learning_rate": 4.373043705021899e-06, "loss": 0.036, "step": 240 }, { "epoch": 0.7401812688821753, "grad_norm": 0.2525939345359802, "learning_rate": 3.943903128623336e-06, "loss": 0.0392, "step": 245 }, { "epoch": 0.7552870090634441, "grad_norm": 0.3220805525779724, "learning_rate": 3.5316988480794255e-06, "loss": 0.036, "step": 250 }, { "epoch": 0.770392749244713, "grad_norm": 0.3768947422504425, "learning_rate": 3.1375836213126653e-06, "loss": 0.0373, "step": 255 }, { "epoch": 0.7854984894259819, "grad_norm": 0.15737439692020416, "learning_rate": 2.7626596189492983e-06, "loss": 0.0363, "step": 260 }, { "epoch": 0.8006042296072508, "grad_norm": 0.24124516546726227, "learning_rate": 2.4079753420225694e-06, "loss": 0.0345, "step": 265 }, { "epoch": 0.8157099697885196, "grad_norm": 0.2828380763530731, "learning_rate": 2.0745226897666858e-06, "loss": 0.0427, "step": 270 }, { "epoch": 0.8308157099697885, "grad_norm": 0.4477688670158386, "learning_rate": 1.7632341857016733e-06, "loss": 0.0382, "step": 275 }, { "epoch": 0.8459214501510574, "grad_norm": 0.35590699315071106, "learning_rate": 1.4749803697665366e-06, "loss": 0.0423, "step": 280 }, { "epoch": 0.8610271903323263, "grad_norm": 0.5138667821884155, "learning_rate": 1.2105673637938054e-06, "loss": 0.0411, "step": 285 }, { "epoch": 0.8761329305135952, "grad_norm": 0.2003919929265976, "learning_rate": 9.707346171337895e-07, "loss": 0.0366, "step": 290 }, { "epoch": 0.8912386706948641, "grad_norm": 0.4709911346435547, "learning_rate": 7.561528387330797e-07, "loss": 0.036, "step": 295 }, { "epoch": 0.9063444108761329, "grad_norm": 0.14394928514957428, "learning_rate": 5.674221214503639e-07, "loss": 0.0324, "step": 300 }, { "epoch": 0.9214501510574018, "grad_norm": 0.2846868932247162, "learning_rate": 4.0507026385502747e-07, "loss": 0.036, "step": 305 }, { "epoch": 0.9365558912386707, "grad_norm": 0.2303285300731659, "learning_rate": 2.6955129420176193e-07, "loss": 0.0389, "step": 310 }, { "epoch": 0.9516616314199395, "grad_norm": 0.27734264731407166, "learning_rate": 1.612442007090076e-07, "loss": 0.038, "step": 315 }, { "epoch": 0.9667673716012085, "grad_norm": 0.17142164707183838, "learning_rate": 8.04518716920466e-08, "loss": 0.0294, "step": 320 }, { "epoch": 0.9818731117824774, "grad_norm": 0.14783106744289398, "learning_rate": 2.7400248514776184e-08, "loss": 0.0394, "step": 325 }, { "epoch": 0.9969788519637462, "grad_norm": 0.2399517148733139, "learning_rate": 2.237693728981416e-09, "loss": 0.0394, "step": 330 }, { "epoch": 1.0, "step": 331, "total_flos": 3.732036479342346e+17, "train_loss": 0.07862781884086817, "train_runtime": 1853.8902, "train_samples_per_second": 11.415, "train_steps_per_second": 0.179 } ], "logging_steps": 5, "max_steps": 331, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.732036479342346e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }