{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 287, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017452006980802792, "grad_norm": 18.758426666259766, "learning_rate": 2.7586206896551725e-06, "loss": 1.2108, "step": 5 }, { "epoch": 0.034904013961605584, "grad_norm": 0.6289834976196289, "learning_rate": 6.206896551724138e-06, "loss": 0.2604, "step": 10 }, { "epoch": 0.05235602094240838, "grad_norm": 0.7779368758201599, "learning_rate": 9.655172413793105e-06, "loss": 0.0645, "step": 15 }, { "epoch": 0.06980802792321117, "grad_norm": 0.7711329460144043, "learning_rate": 1.310344827586207e-05, "loss": 0.0643, "step": 20 }, { "epoch": 0.08726003490401396, "grad_norm": 1.1758290529251099, "learning_rate": 1.6551724137931037e-05, "loss": 0.0682, "step": 25 }, { "epoch": 0.10471204188481675, "grad_norm": 0.20803742110729218, "learning_rate": 2e-05, "loss": 0.0648, "step": 30 }, { "epoch": 0.12216404886561955, "grad_norm": 0.3231872320175171, "learning_rate": 1.998147167378645e-05, "loss": 0.0638, "step": 35 }, { "epoch": 0.13961605584642234, "grad_norm": 0.20903366804122925, "learning_rate": 1.9925955354920265e-05, "loss": 0.0631, "step": 40 }, { "epoch": 0.15706806282722513, "grad_norm": 0.1551412045955658, "learning_rate": 1.983365676829466e-05, "loss": 0.0637, "step": 45 }, { "epoch": 0.17452006980802792, "grad_norm": 0.10300405323505402, "learning_rate": 1.9704917941574053e-05, "loss": 0.0633, "step": 50 }, { "epoch": 0.19197207678883071, "grad_norm": 0.05463937669992447, "learning_rate": 1.954021593775401e-05, "loss": 0.0634, "step": 55 }, { "epoch": 0.2094240837696335, "grad_norm": 0.05097668617963791, "learning_rate": 1.9340161087325483e-05, "loss": 0.0637, "step": 60 }, { "epoch": 0.2268760907504363, "grad_norm": 0.025731965899467468, "learning_rate": 1.9105494726594344e-05, "loss": 0.0634, "step": 65 }, { "epoch": 0.2443280977312391, "grad_norm": 0.5567801594734192, "learning_rate": 1.8837086450537195e-05, "loss": 0.0643, "step": 70 }, { "epoch": 0.2617801047120419, "grad_norm": 0.07695559412240982, "learning_rate": 1.8535930890373467e-05, "loss": 0.0633, "step": 75 }, { "epoch": 0.2792321116928447, "grad_norm": 0.13339029252529144, "learning_rate": 1.820314402779511e-05, "loss": 0.0633, "step": 80 }, { "epoch": 0.29668411867364747, "grad_norm": 0.10536781698465347, "learning_rate": 1.7839959059512016e-05, "loss": 0.0638, "step": 85 }, { "epoch": 0.31413612565445026, "grad_norm": 0.12401806563138962, "learning_rate": 1.744772182743782e-05, "loss": 0.0633, "step": 90 }, { "epoch": 0.33158813263525305, "grad_norm": 0.1011064425110817, "learning_rate": 1.7027885831450318e-05, "loss": 0.0629, "step": 95 }, { "epoch": 0.34904013961605584, "grad_norm": 0.13563387095928192, "learning_rate": 1.658200684320748e-05, "loss": 0.0632, "step": 100 }, { "epoch": 0.36649214659685864, "grad_norm": 0.26744481921195984, "learning_rate": 1.6111737140978495e-05, "loss": 0.0633, "step": 105 }, { "epoch": 0.38394415357766143, "grad_norm": 0.6496581435203552, "learning_rate": 1.5618819386853607e-05, "loss": 0.0638, "step": 110 }, { "epoch": 0.4013961605584642, "grad_norm": 0.2886026203632355, "learning_rate": 1.5105080169021792e-05, "loss": 0.0637, "step": 115 }, { "epoch": 0.418848167539267, "grad_norm": 0.07766488194465637, "learning_rate": 1.4572423233046386e-05, "loss": 0.064, "step": 120 }, { "epoch": 0.4363001745200698, "grad_norm": 0.152951180934906, "learning_rate": 1.4022822427221325e-05, "loss": 0.0637, "step": 125 }, { "epoch": 0.4537521815008726, "grad_norm": 0.4545815587043762, "learning_rate": 1.3458314388150115e-05, "loss": 0.0631, "step": 130 }, { "epoch": 0.4712041884816754, "grad_norm": 0.13478335738182068, "learning_rate": 1.2880990993652379e-05, "loss": 0.0627, "step": 135 }, { "epoch": 0.4886561954624782, "grad_norm": 0.45286211371421814, "learning_rate": 1.2292991610964902e-05, "loss": 0.0637, "step": 140 }, { "epoch": 0.506108202443281, "grad_norm": 0.44334903359413147, "learning_rate": 1.1696495168962848e-05, "loss": 0.0639, "step": 145 }, { "epoch": 0.5235602094240838, "grad_norm": 0.6045412421226501, "learning_rate": 1.1093712083778748e-05, "loss": 0.0644, "step": 150 }, { "epoch": 0.5410122164048866, "grad_norm": 0.5224294066429138, "learning_rate": 1.0486876067740253e-05, "loss": 0.0655, "step": 155 }, { "epoch": 0.5584642233856894, "grad_norm": 0.37020203471183777, "learning_rate": 9.878235851980027e-06, "loss": 0.0639, "step": 160 }, { "epoch": 0.5759162303664922, "grad_norm": 0.01445784978568554, "learning_rate": 9.270046853390924e-06, "loss": 0.0636, "step": 165 }, { "epoch": 0.5933682373472949, "grad_norm": 0.5739990472793579, "learning_rate": 8.664562816806022e-06, "loss": 0.0644, "step": 170 }, { "epoch": 0.6108202443280978, "grad_norm": 0.21191075444221497, "learning_rate": 8.064027463374702e-06, "loss": 0.0629, "step": 175 }, { "epoch": 0.6282722513089005, "grad_norm": 0.3500339686870575, "learning_rate": 7.470666176083193e-06, "loss": 0.0645, "step": 180 }, { "epoch": 0.6457242582897034, "grad_norm": 0.31313106417655945, "learning_rate": 6.886677753230184e-06, "loss": 0.0623, "step": 185 }, { "epoch": 0.6631762652705061, "grad_norm": 0.3150012791156769, "learning_rate": 6.314226260416383e-06, "loss": 0.0624, "step": 190 }, { "epoch": 0.680628272251309, "grad_norm": 0.19164550304412842, "learning_rate": 5.755433011241851e-06, "loss": 0.0621, "step": 195 }, { "epoch": 0.6980802792321117, "grad_norm": 0.448416143655777, "learning_rate": 5.212368706427913e-06, "loss": 0.0638, "step": 200 }, { "epoch": 0.7155322862129145, "grad_norm": 0.0443989560008049, "learning_rate": 4.687045760493468e-06, "loss": 0.0614, "step": 205 }, { "epoch": 0.7329842931937173, "grad_norm": 0.32341665029525757, "learning_rate": 4.181410844420473e-06, "loss": 0.0623, "step": 210 }, { "epoch": 0.7504363001745201, "grad_norm": 0.2636391222476959, "learning_rate": 3.6973376719429134e-06, "loss": 0.0604, "step": 215 }, { "epoch": 0.7678883071553229, "grad_norm": 0.27186042070388794, "learning_rate": 3.236620056190972e-06, "loss": 0.0601, "step": 220 }, { "epoch": 0.7853403141361257, "grad_norm": 0.5704047679901123, "learning_rate": 2.8009652624200436e-06, "loss": 0.0613, "step": 225 }, { "epoch": 0.8027923211169284, "grad_norm": 0.4298834204673767, "learning_rate": 2.3919876814572197e-06, "loss": 0.0592, "step": 230 }, { "epoch": 0.8202443280977313, "grad_norm": 0.08873734623193741, "learning_rate": 2.0112028473093294e-06, "loss": 0.0595, "step": 235 }, { "epoch": 0.837696335078534, "grad_norm": 0.39123955368995667, "learning_rate": 1.660021821101222e-06, "loss": 0.0567, "step": 240 }, { "epoch": 0.8551483420593369, "grad_norm": 0.16050003468990326, "learning_rate": 1.339745962155613e-06, "loss": 0.0571, "step": 245 }, { "epoch": 0.8726003490401396, "grad_norm": 0.12748093903064728, "learning_rate": 1.051562105591082e-06, "loss": 0.0607, "step": 250 }, { "epoch": 0.8900523560209425, "grad_norm": 0.1128767654299736, "learning_rate": 7.965381643084069e-07, "loss": 0.0582, "step": 255 }, { "epoch": 0.9075043630017452, "grad_norm": 0.5375702381134033, "learning_rate": 5.756191716628556e-07, "loss": 0.0621, "step": 260 }, { "epoch": 0.924956369982548, "grad_norm": 0.272128164768219, "learning_rate": 3.8962377948693395e-07, "loss": 0.0579, "step": 265 }, { "epoch": 0.9424083769633508, "grad_norm": 0.12358862906694412, "learning_rate": 2.392412244407294e-07, "loss": 0.058, "step": 270 }, { "epoch": 0.9598603839441536, "grad_norm": 0.13405446708202362, "learning_rate": 1.2502877393158587e-07, "loss": 0.0592, "step": 275 }, { "epoch": 0.9773123909249564, "grad_norm": 0.12268463522195816, "learning_rate": 4.740966106764222e-08, "loss": 0.0565, "step": 280 }, { "epoch": 0.9947643979057592, "grad_norm": 0.6271886825561523, "learning_rate": 6.671516297606095e-09, "loss": 0.0593, "step": 285 }, { "epoch": 1.0, "step": 287, "total_flos": 3.259472961077248e+17, "train_loss": 0.08579332347738618, "train_runtime": 1625.0751, "train_samples_per_second": 11.273, "train_steps_per_second": 0.177 } ], "logging_steps": 5, "max_steps": 287, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.259472961077248e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }