{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.56, "eval_steps": 0, "global_step": 16000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 9.120098114013672, "learning_rate": 5.2906666666666675e-06, "loss": 0.772038330078125, "step": 500 }, { "epoch": 0.16, "grad_norm": 16.881511688232422, "learning_rate": 1.0624e-05, "loss": 0.37112783813476563, "step": 1000 }, { "epoch": 0.24, "grad_norm": 9.139080047607422, "learning_rate": 1.5957333333333334e-05, "loss": 0.36207400512695315, "step": 1500 }, { "epoch": 0.32, "grad_norm": 3.8323566913604736, "learning_rate": 1.9856592592592595e-05, "loss": 0.34735809326171874, "step": 2000 }, { "epoch": 0.4, "grad_norm": 4.640134811401367, "learning_rate": 1.9265185185185186e-05, "loss": 0.329142578125, "step": 2500 }, { "epoch": 0.48, "grad_norm": 7.571471214294434, "learning_rate": 1.8672592592592594e-05, "loss": 0.3232558288574219, "step": 3000 }, { "epoch": 0.56, "grad_norm": 5.5788984298706055, "learning_rate": 1.8080000000000003e-05, "loss": 0.33851583862304685, "step": 3500 }, { "epoch": 0.64, "grad_norm": 4.823075771331787, "learning_rate": 1.748740740740741e-05, "loss": 0.3265293884277344, "step": 4000 }, { "epoch": 0.72, "grad_norm": 5.9052886962890625, "learning_rate": 1.6897185185185187e-05, "loss": 0.3228313903808594, "step": 4500 }, { "epoch": 0.8, "grad_norm": 2.530646562576294, "learning_rate": 1.6304592592592593e-05, "loss": 0.3212389831542969, "step": 5000 }, { "epoch": 0.88, "grad_norm": 2.003970146179199, "learning_rate": 1.5712e-05, "loss": 0.3108310546875, "step": 5500 }, { "epoch": 0.96, "grad_norm": 2.259843111038208, "learning_rate": 1.511940740740741e-05, "loss": 0.33287890625, "step": 6000 }, { "epoch": 1.04, "grad_norm": 1.1002967357635498, "learning_rate": 1.4526814814814815e-05, "loss": 0.3126535949707031, "step": 6500 }, { "epoch": 1.12, "grad_norm": 2.696305751800537, "learning_rate": 1.3934222222222222e-05, "loss": 0.3110744323730469, "step": 7000 }, { "epoch": 1.2, "grad_norm": 3.0759758949279785, "learning_rate": 1.3341629629629631e-05, "loss": 0.307027099609375, "step": 7500 }, { "epoch": 1.28, "grad_norm": 1.2770161628723145, "learning_rate": 1.2749037037037038e-05, "loss": 0.31455657958984373, "step": 8000 }, { "epoch": 1.3599999999999999, "grad_norm": 2.4967329502105713, "learning_rate": 1.2156444444444447e-05, "loss": 0.31456546020507814, "step": 8500 }, { "epoch": 1.44, "grad_norm": 5.275321006774902, "learning_rate": 1.1565037037037039e-05, "loss": 0.3131004943847656, "step": 9000 }, { "epoch": 1.52, "grad_norm": 2.145164966583252, "learning_rate": 1.0972444444444446e-05, "loss": 0.30567279052734375, "step": 9500 }, { "epoch": 1.6, "grad_norm": 2.0739190578460693, "learning_rate": 1.0379851851851853e-05, "loss": 0.28998117065429685, "step": 10000 }, { "epoch": 1.6800000000000002, "grad_norm": 3.1562881469726562, "learning_rate": 9.78725925925926e-06, "loss": 0.29778146362304686, "step": 10500 }, { "epoch": 1.76, "grad_norm": 3.498109817504883, "learning_rate": 9.194666666666667e-06, "loss": 0.2988756103515625, "step": 11000 }, { "epoch": 1.8399999999999999, "grad_norm": 3.3291115760803223, "learning_rate": 8.602074074074076e-06, "loss": 0.2985075988769531, "step": 11500 }, { "epoch": 1.92, "grad_norm": 1.631378173828125, "learning_rate": 8.009481481481483e-06, "loss": 0.2991393737792969, "step": 12000 }, { "epoch": 2.0, "grad_norm": 1.3230953216552734, "learning_rate": 7.416888888888889e-06, "loss": 0.30236148071289065, "step": 12500 }, { "epoch": 2.08, "grad_norm": 2.339695930480957, "learning_rate": 6.825481481481482e-06, "loss": 0.295860595703125, "step": 13000 }, { "epoch": 2.16, "grad_norm": 1.0685478448867798, "learning_rate": 6.234074074074075e-06, "loss": 0.2980207824707031, "step": 13500 }, { "epoch": 2.24, "grad_norm": 0.947058379650116, "learning_rate": 5.6414814814814825e-06, "loss": 0.29257803344726563, "step": 14000 }, { "epoch": 2.32, "grad_norm": 2.2130205631256104, "learning_rate": 5.0488888888888895e-06, "loss": 0.2826576843261719, "step": 14500 }, { "epoch": 2.4, "grad_norm": 6.699328422546387, "learning_rate": 4.4562962962962965e-06, "loss": 0.30620053100585937, "step": 15000 }, { "epoch": 2.48, "grad_norm": 0.5939074158668518, "learning_rate": 3.863703703703704e-06, "loss": 0.3044532470703125, "step": 15500 }, { "epoch": 2.56, "grad_norm": 2.3366057872772217, "learning_rate": 3.2711111111111117e-06, "loss": 0.28407180786132813, "step": 16000 } ], "logging_steps": 500, "max_steps": 18750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }