{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 18.75, "eval_steps": 500, "global_step": 1350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6944444444444444, "grad_norm": 2.09375, "learning_rate": 9.94575799721836e-06, "loss": 1.9929, "mean_token_accuracy": 0.5180467286705971, "num_tokens": 565403.0, "step": 50 }, { "epoch": 1.3888888888888888, "grad_norm": 2.015625, "learning_rate": 9.876216968011127e-06, "loss": 1.7844, "mean_token_accuracy": 0.5439706787467002, "num_tokens": 1137207.0, "step": 100 }, { "epoch": 2.0833333333333335, "grad_norm": 2.15625, "learning_rate": 9.806675938803894e-06, "loss": 1.7471, "mean_token_accuracy": 0.5503649765253067, "num_tokens": 1691787.0, "step": 150 }, { "epoch": 2.7777777777777777, "grad_norm": 2.125, "learning_rate": 9.737134909596663e-06, "loss": 1.6946, "mean_token_accuracy": 0.558834604024887, "num_tokens": 2259114.0, "step": 200 }, { "epoch": 3.4722222222222223, "grad_norm": 2.09375, "learning_rate": 9.667593880389431e-06, "loss": 1.6698, "mean_token_accuracy": 0.5630534660816192, "num_tokens": 2824656.0, "step": 250 }, { "epoch": 4.166666666666667, "grad_norm": 2.15625, "learning_rate": 9.598052851182198e-06, "loss": 1.6426, "mean_token_accuracy": 0.5657697267830372, "num_tokens": 3383725.0, "step": 300 }, { "epoch": 4.861111111111111, "grad_norm": 2.203125, "learning_rate": 9.528511821974965e-06, "loss": 1.6163, "mean_token_accuracy": 0.572148412913084, "num_tokens": 3951935.0, "step": 350 }, { "epoch": 5.555555555555555, "grad_norm": 2.4375, "learning_rate": 9.458970792767734e-06, "loss": 1.5929, "mean_token_accuracy": 0.5758887875080109, "num_tokens": 4517240.0, "step": 400 }, { "epoch": 6.25, "grad_norm": 2.21875, "learning_rate": 9.389429763560501e-06, "loss": 1.5578, "mean_token_accuracy": 0.5819848603010178, "num_tokens": 5081608.0, "step": 450 }, { "epoch": 6.944444444444445, "grad_norm": 2.21875, "learning_rate": 9.31988873435327e-06, "loss": 1.5426, "mean_token_accuracy": 0.5854928362369537, "num_tokens": 5642941.0, "step": 500 }, { "epoch": 7.638888888888889, "grad_norm": 2.3125, "learning_rate": 9.250347705146037e-06, "loss": 1.5061, "mean_token_accuracy": 0.5927784067392349, "num_tokens": 6209257.0, "step": 550 }, { "epoch": 8.333333333333334, "grad_norm": 2.109375, "learning_rate": 9.180806675938806e-06, "loss": 1.5025, "mean_token_accuracy": 0.5923233330249786, "num_tokens": 6774284.0, "step": 600 }, { "epoch": 9.027777777777779, "grad_norm": 2.328125, "learning_rate": 9.111265646731573e-06, "loss": 1.477, "mean_token_accuracy": 0.597908786535263, "num_tokens": 7336223.0, "step": 650 }, { "epoch": 9.722222222222221, "grad_norm": 2.53125, "learning_rate": 9.04172461752434e-06, "loss": 1.4452, "mean_token_accuracy": 0.6044489535689354, "num_tokens": 7894148.0, "step": 700 }, { "epoch": 10.416666666666666, "grad_norm": 2.5625, "learning_rate": 8.972183588317108e-06, "loss": 1.4268, "mean_token_accuracy": 0.6078107115626336, "num_tokens": 8463546.0, "step": 750 }, { "epoch": 11.11111111111111, "grad_norm": 2.609375, "learning_rate": 8.902642559109875e-06, "loss": 1.4126, "mean_token_accuracy": 0.6102430355548859, "num_tokens": 9025580.0, "step": 800 }, { "epoch": 11.805555555555555, "grad_norm": 2.609375, "learning_rate": 8.833101529902644e-06, "loss": 1.3873, "mean_token_accuracy": 0.6162240096926689, "num_tokens": 9591962.0, "step": 850 }, { "epoch": 12.5, "grad_norm": 2.90625, "learning_rate": 8.763560500695411e-06, "loss": 1.3538, "mean_token_accuracy": 0.6232619461417198, "num_tokens": 10156358.0, "step": 900 }, { "epoch": 13.194444444444445, "grad_norm": 2.59375, "learning_rate": 8.694019471488178e-06, "loss": 1.3373, "mean_token_accuracy": 0.6272177976369858, "num_tokens": 10723149.0, "step": 950 }, { "epoch": 13.88888888888889, "grad_norm": 2.96875, "learning_rate": 8.624478442280947e-06, "loss": 1.3169, "mean_token_accuracy": 0.6308149287104606, "num_tokens": 11281835.0, "step": 1000 }, { "epoch": 14.583333333333334, "grad_norm": 3.109375, "learning_rate": 8.554937413073714e-06, "loss": 1.2865, "mean_token_accuracy": 0.6381412792205811, "num_tokens": 11846572.0, "step": 1050 }, { "epoch": 15.277777777777779, "grad_norm": 3.28125, "learning_rate": 8.485396383866483e-06, "loss": 1.261, "mean_token_accuracy": 0.6445417484641075, "num_tokens": 12410336.0, "step": 1100 }, { "epoch": 15.972222222222221, "grad_norm": 3.21875, "learning_rate": 8.41585535465925e-06, "loss": 1.245, "mean_token_accuracy": 0.6479020461440086, "num_tokens": 12973627.0, "step": 1150 }, { "epoch": 16.666666666666668, "grad_norm": 3.125, "learning_rate": 8.346314325452017e-06, "loss": 1.2052, "mean_token_accuracy": 0.6577865305542946, "num_tokens": 13538332.0, "step": 1200 }, { "epoch": 17.36111111111111, "grad_norm": 3.34375, "learning_rate": 8.276773296244786e-06, "loss": 1.1764, "mean_token_accuracy": 0.6647191798686981, "num_tokens": 14102319.0, "step": 1250 }, { "epoch": 18.055555555555557, "grad_norm": 3.5, "learning_rate": 8.207232267037553e-06, "loss": 1.1584, "mean_token_accuracy": 0.6698976960778237, "num_tokens": 14671757.0, "step": 1300 }, { "epoch": 18.75, "grad_norm": 4.1875, "learning_rate": 8.137691237830321e-06, "loss": 1.1171, "mean_token_accuracy": 0.6794905725121498, "num_tokens": 15232777.0, "step": 1350 } ], "logging_steps": 50, "max_steps": 7200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.561784415232942e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }