{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4, "grad_norm": 13.665339392120242, "learning_rate": 3.6000000000000003e-06, "loss": 0.7972, "step": 10 }, { "epoch": 0.8, "grad_norm": 1.1326594871506976, "learning_rate": 7.600000000000001e-06, "loss": 0.5693, "step": 20 }, { "epoch": 1.2, "grad_norm": 1.229628721329319, "learning_rate": 9.992203820909906e-06, "loss": 0.4321, "step": 30 }, { "epoch": 1.6, "grad_norm": 23.98083749455022, "learning_rate": 9.904775776745959e-06, "loss": 0.3428, "step": 40 }, { "epoch": 2.0, "grad_norm": 1.4779282929051816, "learning_rate": 9.721881851187406e-06, "loss": 0.297, "step": 50 }, { "epoch": 2.4, "grad_norm": 0.7030928394182192, "learning_rate": 9.44708186645649e-06, "loss": 0.2048, "step": 60 }, { "epoch": 2.8, "grad_norm": 0.36208494729096075, "learning_rate": 9.085724491675642e-06, "loss": 0.1956, "step": 70 }, { "epoch": 3.2, "grad_norm": 15.852415619527582, "learning_rate": 8.644843137107058e-06, "loss": 0.1364, "step": 80 }, { "epoch": 3.6, "grad_norm": 0.4440039793906722, "learning_rate": 8.133019056822303e-06, "loss": 0.1061, "step": 90 }, { "epoch": 4.0, "grad_norm": 0.42486560707680776, "learning_rate": 7.560214324352858e-06, "loss": 0.0929, "step": 100 }, { "epoch": 4.4, "grad_norm": 0.44641303384271735, "learning_rate": 6.9375779322605154e-06, "loss": 0.0502, "step": 110 }, { "epoch": 4.8, "grad_norm": 0.24553081143144395, "learning_rate": 6.277228789678953e-06, "loss": 0.0508, "step": 120 }, { "epoch": 5.2, "grad_norm": 0.21311247261529426, "learning_rate": 5.592019841532507e-06, "loss": 0.0367, "step": 130 }, { "epoch": 5.6, "grad_norm": 0.2652581124769846, "learning_rate": 4.895287900583216e-06, "loss": 0.027, "step": 140 }, { "epoch": 6.0, "grad_norm": 0.22255970725351767, "learning_rate": 4.200594061540827e-06, "loss": 0.0246, "step": 150 }, { "epoch": 6.4, "grad_norm": 0.20260633596438954, "learning_rate": 3.521459749779769e-06, "loss": 0.0139, "step": 160 }, { "epoch": 6.8, "grad_norm": 0.225354022287482, "learning_rate": 2.871103542174637e-06, "loss": 0.0122, "step": 170 }, { "epoch": 7.2, "grad_norm": 44.88434252382171, "learning_rate": 2.2621838825372496e-06, "loss": 0.0091, "step": 180 }, { "epoch": 7.6, "grad_norm": 0.15078014988540728, "learning_rate": 1.7065526994065973e-06, "loss": 0.0069, "step": 190 }, { "epoch": 8.0, "grad_norm": 0.11684871135125384, "learning_rate": 1.2150247217412186e-06, "loss": 0.0064, "step": 200 }, { "epoch": 8.4, "grad_norm": 0.10691723109581759, "learning_rate": 7.971669825215789e-07, "loss": 0.0045, "step": 210 }, { "epoch": 8.8, "grad_norm": 0.0895459643872266, "learning_rate": 4.6111260733545714e-07, "loss": 0.0043, "step": 220 }, { "epoch": 9.2, "grad_norm": 0.10141341473243017, "learning_rate": 2.134025123396638e-07, "loss": 0.0043, "step": 230 }, { "epoch": 9.6, "grad_norm": 0.08426717702147542, "learning_rate": 5.8858092767236084e-08, "loss": 0.0037, "step": 240 }, { "epoch": 10.0, "grad_norm": 0.07136839003077843, "learning_rate": 4.87379953478806e-10, "loss": 0.0038, "step": 250 }, { "epoch": 10.0, "step": 250, "total_flos": 502552426708992.0, "train_loss": 0.13730106168985368, "train_runtime": 20764.2529, "train_samples_per_second": 0.384, "train_steps_per_second": 0.012 } ], "logging_steps": 10, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 502552426708992.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }