{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009615384615384616, "grad_norm": Infinity, "learning_rate": 0.0, "loss": 3.4921, "step": 1 }, { "epoch": 0.04807692307692308, "grad_norm": Infinity, "learning_rate": 7.272727272727273e-06, "loss": 3.4372, "step": 5 }, { "epoch": 0.09615384615384616, "grad_norm": 1.3386897115800666e+19, "learning_rate": 1.6363636363636366e-05, "loss": 3.4356, "step": 10 }, { "epoch": 0.14423076923076922, "grad_norm": 49.75270462036133, "learning_rate": 1.994869323391895e-05, "loss": 3.1194, "step": 15 }, { "epoch": 0.19230769230769232, "grad_norm": 81.14952087402344, "learning_rate": 1.963705643889941e-05, "loss": 2.6138, "step": 20 }, { "epoch": 0.2403846153846154, "grad_norm": 11.374646186828613, "learning_rate": 1.9051145072503216e-05, "loss": 2.3041, "step": 25 }, { "epoch": 0.28846153846153844, "grad_norm": 7.008129119873047, "learning_rate": 1.8207634412072765e-05, "loss": 2.0668, "step": 30 }, { "epoch": 0.33653846153846156, "grad_norm": 16.298776626586914, "learning_rate": 1.7130531116312202e-05, "loss": 1.8889, "step": 35 }, { "epoch": 0.38461538461538464, "grad_norm": 5.356802940368652, "learning_rate": 1.5850489985953076e-05, "loss": 1.83, "step": 40 }, { "epoch": 0.4326923076923077, "grad_norm": 7.75869607925415, "learning_rate": 1.4403941515576344e-05, "loss": 1.7812, "step": 45 }, { "epoch": 0.4807692307692308, "grad_norm": 4.185631275177002, "learning_rate": 1.283205506682304e-05, "loss": 1.75, "step": 50 }, { "epoch": 0.5288461538461539, "grad_norm": 2.844418525695801, "learning_rate": 1.1179567171508463e-05, "loss": 1.6893, "step": 55 }, { "epoch": 0.5769230769230769, "grad_norm": 2.267258405685425, "learning_rate": 9.493508311612874e-06, "loss": 1.6535, "step": 60 }, { "epoch": 0.625, "grad_norm": 1.7939544916152954, "learning_rate": 7.821864412511485e-06, "loss": 1.6214, "step": 65 }, { "epoch": 0.6730769230769231, "grad_norm": 1.9716706275939941, "learning_rate": 6.21221114389424e-06, "loss": 1.6081, "step": 70 }, { "epoch": 0.7211538461538461, "grad_norm": 1.7848585844039917, "learning_rate": 4.710359896730379e-06, "loss": 1.5761, "step": 75 }, { "epoch": 0.7692307692307693, "grad_norm": 1.6977120637893677, "learning_rate": 3.3590539723276083e-06, "loss": 1.5688, "step": 80 }, { "epoch": 0.8173076923076923, "grad_norm": 1.863008737564087, "learning_rate": 2.196752090479083e-06, "loss": 1.5659, "step": 85 }, { "epoch": 0.8653846153846154, "grad_norm": 1.4840096235275269, "learning_rate": 1.2565338385541792e-06, "loss": 1.5474, "step": 90 }, { "epoch": 0.9134615384615384, "grad_norm": 1.3736684322357178, "learning_rate": 5.651582129001987e-07, "loss": 1.5542, "step": 95 }, { "epoch": 0.9615384615384616, "grad_norm": 1.390762448310852, "learning_rate": 1.4230204685196202e-07, "loss": 1.5418, "step": 100 }, { "epoch": 0.9615384615384616, "eval_loss": 1.5660021305084229, "eval_runtime": 1.8376, "eval_samples_per_second": 405.967, "eval_steps_per_second": 3.265, "step": 100 }, { "epoch": 1.0, "step": 104, "total_flos": 3.836373525279539e+16, "train_loss": 1.9909558915174925, "train_runtime": 382.237, "train_samples_per_second": 34.549, "train_steps_per_second": 0.272 } ], "logging_steps": 5, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.836373525279539e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }