{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 99999999, "global_step": 4114, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000243116756822464, "grad_norm": 245.63973999023438, "learning_rate": 5e-05, "loss": 175.7636, "step": 1 }, { "epoch": 0.000243116756822464, "eval_accuracy": 0.03065178878801227, "eval_loss": 10.476305961608887, "eval_runtime": 122.5372, "eval_samples_per_second": 51.397, "eval_steps_per_second": 6.431, "step": 1 }, { "epoch": 0.000486233513644928, "grad_norm": 113.21363067626953, "learning_rate": 4.998784637822071e-05, "loss": 161.0339, "step": 2 }, { "epoch": 0.000486233513644928, "eval_accuracy": 0.03065178878801227, "eval_loss": 10.297815322875977, "eval_runtime": 120.5427, "eval_samples_per_second": 52.247, "eval_steps_per_second": 6.537, "step": 2 }, { "epoch": 0.000972467027289856, "grad_norm": 65.4986801147461, "learning_rate": 4.996353913466213e-05, "loss": 156.102, "step": 4 }, { "epoch": 0.000972467027289856, "eval_accuracy": 0.03066870675635363, "eval_loss": 10.12234878540039, "eval_runtime": 134.3118, "eval_samples_per_second": 46.891, "eval_steps_per_second": 5.867, "step": 4 }, { "epoch": 0.001944934054579712, "grad_norm": 38.307884216308594, "learning_rate": 4.991492464754497e-05, "loss": 152.2924, "step": 8 }, { "epoch": 0.001944934054579712, "eval_accuracy": 0.030665912963416524, "eval_loss": 10.031266212463379, "eval_runtime": 123.2321, "eval_samples_per_second": 51.107, "eval_steps_per_second": 6.394, "step": 8 }, { "epoch": 0.003889868109159424, "grad_norm": 33.05560302734375, "learning_rate": 4.981769567331065e-05, "loss": 148.7256, "step": 16 }, { "epoch": 0.003889868109159424, "eval_accuracy": 0.03471380850784451, "eval_loss": 9.771815299987793, "eval_runtime": 121.9704, "eval_samples_per_second": 51.635, "eval_steps_per_second": 6.461, "step": 16 }, { "epoch": 0.007779736218318848, "grad_norm": 33.15456771850586, "learning_rate": 4.9623237724842005e-05, "loss": 141.2284, "step": 32 }, { "epoch": 0.007779736218318848, "eval_accuracy": 0.04097904438002165, "eval_loss": 9.279535293579102, "eval_runtime": 124.7723, "eval_samples_per_second": 50.476, "eval_steps_per_second": 6.316, "step": 32 }, { "epoch": 0.015559472436637696, "grad_norm": 20.02243423461914, "learning_rate": 4.9234321827904715e-05, "loss": 127.002, "step": 64 }, { "epoch": 0.015559472436637696, "eval_accuracy": 0.06789103090028115, "eval_loss": 8.393986701965332, "eval_runtime": 137.4464, "eval_samples_per_second": 45.821, "eval_steps_per_second": 5.733, "step": 64 }, { "epoch": 0.031118944873275392, "grad_norm": 28.861949920654297, "learning_rate": 4.8456490034030144e-05, "loss": 116.9628, "step": 128 }, { "epoch": 0.031118944873275392, "eval_accuracy": 0.099492243654753, "eval_loss": 7.541510105133057, "eval_runtime": 137.5865, "eval_samples_per_second": 45.775, "eval_steps_per_second": 5.727, "step": 128 }, { "epoch": 0.062237889746550784, "grad_norm": 32.826515197753906, "learning_rate": 4.6900826446280993e-05, "loss": 103.7502, "step": 256 }, { "epoch": 0.062237889746550784, "eval_accuracy": 0.11699364908781108, "eval_loss": 6.44133186340332, "eval_runtime": 127.4576, "eval_samples_per_second": 49.413, "eval_steps_per_second": 6.182, "step": 256 }, { "epoch": 0.12447577949310157, "grad_norm": 24.268686294555664, "learning_rate": 4.37894992707827e-05, "loss": 99.1234, "step": 512 }, { "epoch": 0.12447577949310157, "eval_accuracy": 0.16933101386435265, "eval_loss": 5.398009300231934, "eval_runtime": 135.7602, "eval_samples_per_second": 46.391, "eval_steps_per_second": 5.804, "step": 512 }, { "epoch": 0.24895155898620314, "grad_norm": 8.205699920654297, "learning_rate": 3.75668449197861e-05, "loss": 84.3663, "step": 1024 }, { "epoch": 0.24895155898620314, "eval_accuracy": 0.21648232289603334, "eval_loss": 4.741060733795166, "eval_runtime": 116.5089, "eval_samples_per_second": 54.056, "eval_steps_per_second": 6.763, "step": 1024 }, { "epoch": 0.4979031179724063, "grad_norm": 10.254305839538574, "learning_rate": 2.51215362177929e-05, "loss": 69.5639, "step": 2048 }, { "epoch": 0.4979031179724063, "eval_accuracy": 0.20927154332536482, "eval_loss": 4.547865390777588, "eval_runtime": 135.1009, "eval_samples_per_second": 46.617, "eval_steps_per_second": 5.833, "step": 2048 }, { "epoch": 0.9958062359448125, "grad_norm": 20.215749740600586, "learning_rate": 2.3091881380651435e-07, "loss": 56.836, "step": 4096 }, { "epoch": 0.9958062359448125, "eval_accuracy": 0.3042501040687869, "eval_loss": 4.02394437789917, "eval_runtime": 126.7254, "eval_samples_per_second": 49.698, "eval_steps_per_second": 6.218, "step": 4096 }, { "epoch": 1.0, "step": 4114, "total_flos": 2.75136851607552e+17, "train_loss": 69.67335734276963, "train_runtime": 23183.8018, "train_samples_per_second": 22.71, "train_steps_per_second": 0.177 } ], "logging_steps": 99999999, "max_steps": 4114, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 99999999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.75136851607552e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }