{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 99999999, "global_step": 5337, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001873909326212478, "grad_norm": 245.6398468017578, "learning_rate": 5e-05, "loss": 175.7636, "step": 1 }, { "epoch": 0.0001873909326212478, "eval_accuracy": 0.015452299171252725, "eval_loss": 10.732580184936523, "eval_runtime": 219.7551, "eval_samples_per_second": 35.922, "eval_steps_per_second": 4.491, "step": 1 }, { "epoch": 0.0003747818652424956, "grad_norm": 113.21331024169922, "learning_rate": 4.9990631440884397e-05, "loss": 161.0339, "step": 2 }, { "epoch": 0.0003747818652424956, "eval_accuracy": 0.015452299171252725, "eval_loss": 10.66187572479248, "eval_runtime": 137.4523, "eval_samples_per_second": 57.431, "eval_steps_per_second": 7.181, "step": 2 }, { "epoch": 0.0007495637304849911, "grad_norm": 65.50191497802734, "learning_rate": 4.997189432265318e-05, "loss": 156.102, "step": 4 }, { "epoch": 0.0007495637304849911, "eval_accuracy": 0.015464063058397669, "eval_loss": 10.59908676147461, "eval_runtime": 188.7788, "eval_samples_per_second": 41.816, "eval_steps_per_second": 5.228, "step": 4 }, { "epoch": 0.0014991274609699823, "grad_norm": 38.306678771972656, "learning_rate": 4.993442008619075e-05, "loss": 152.2921, "step": 8 }, { "epoch": 0.0014991274609699823, "eval_accuracy": 0.015459481333930691, "eval_loss": 10.50749397277832, "eval_runtime": 140.9589, "eval_samples_per_second": 56.002, "eval_steps_per_second": 7.002, "step": 8 }, { "epoch": 0.0029982549219399646, "grad_norm": 33.0531005859375, "learning_rate": 4.985947161326589e-05, "loss": 148.7237, "step": 16 }, { "epoch": 0.0029982549219399646, "eval_accuracy": 0.018715353804478252, "eval_loss": 10.448219299316406, "eval_runtime": 215.51, "eval_samples_per_second": 36.629, "eval_steps_per_second": 4.58, "step": 16 }, { "epoch": 0.005996509843879929, "grad_norm": 33.14303207397461, "learning_rate": 4.970957466741615e-05, "loss": 141.2197, "step": 32 }, { "epoch": 0.005996509843879929, "eval_accuracy": 0.02386372614066984, "eval_loss": 10.312174797058105, "eval_runtime": 201.4605, "eval_samples_per_second": 39.184, "eval_steps_per_second": 4.899, "step": 32 }, { "epoch": 0.011993019687759858, "grad_norm": 17.63327407836914, "learning_rate": 4.94097807757167e-05, "loss": 128.0666, "step": 64 }, { "epoch": 0.011993019687759858, "eval_accuracy": 0.0342046782626398, "eval_loss": 9.543290138244629, "eval_runtime": 210.7504, "eval_samples_per_second": 37.457, "eval_steps_per_second": 4.683, "step": 64 }, { "epoch": 0.023986039375519717, "grad_norm": 22.085046768188477, "learning_rate": 4.881019299231778e-05, "loss": 118.9934, "step": 128 }, { "epoch": 0.023986039375519717, "eval_accuracy": 0.03304265387350131, "eval_loss": 7.98660945892334, "eval_runtime": 227.9954, "eval_samples_per_second": 34.624, "eval_steps_per_second": 4.329, "step": 128 }, { "epoch": 0.04797207875103943, "grad_norm": 17.06017303466797, "learning_rate": 4.761101742551995e-05, "loss": 102.9587, "step": 256 }, { "epoch": 0.04797207875103943, "eval_accuracy": 0.055030721081703045, "eval_loss": 5.9021315574646, "eval_runtime": 240.335, "eval_samples_per_second": 32.846, "eval_steps_per_second": 4.107, "step": 256 }, { "epoch": 0.09594415750207887, "grad_norm": 68.73705291748047, "learning_rate": 4.52126662919243e-05, "loss": 93.4498, "step": 512 }, { "epoch": 0.09594415750207887, "eval_accuracy": 0.18273613650666046, "eval_loss": 4.6600799560546875, "eval_runtime": 236.8236, "eval_samples_per_second": 33.333, "eval_steps_per_second": 4.168, "step": 512 }, { "epoch": 0.19188831500415773, "grad_norm": 16.876415252685547, "learning_rate": 4.0415964024733e-05, "loss": 74.4021, "step": 1024 }, { "epoch": 0.19188831500415773, "eval_accuracy": 0.1382732248232383, "eval_loss": 4.865310192108154, "eval_runtime": 268.5345, "eval_samples_per_second": 29.397, "eval_steps_per_second": 3.676, "step": 1024 }, { "epoch": 0.38377663000831547, "grad_norm": 7.590776443481445, "learning_rate": 3.082255949035038e-05, "loss": 63.2125, "step": 2048 }, { "epoch": 0.38377663000831547, "eval_accuracy": 0.22398527805247487, "eval_loss": 4.008070468902588, "eval_runtime": 233.7103, "eval_samples_per_second": 33.777, "eval_steps_per_second": 4.223, "step": 2048 }, { "epoch": 0.7675532600166309, "grad_norm": 9.217267990112305, "learning_rate": 1.163575042158516e-05, "loss": 46.9282, "step": 4096 }, { "epoch": 0.7675532600166309, "eval_accuracy": 0.31996237537399874, "eval_loss": 3.480340003967285, "eval_runtime": 277.8889, "eval_samples_per_second": 28.407, "eval_steps_per_second": 3.552, "step": 4096 } ], "logging_steps": 99999999, "max_steps": 5337, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 99999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.56955225587712e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }