{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032258064516129032, "grad_norm": 58.05511474609375, "learning_rate": 0.0, "loss": 1.9, "step": 1 }, { "epoch": 0.016129032258064516, "grad_norm": 21.915355682373047, "learning_rate": 2.580645161290323e-06, "loss": 1.7687, "step": 5 }, { "epoch": 0.03225806451612903, "grad_norm": 8.215951919555664, "learning_rate": 5.806451612903226e-06, "loss": 1.5337, "step": 10 }, { "epoch": 0.04838709677419355, "grad_norm": 4.512474060058594, "learning_rate": 9.03225806451613e-06, "loss": 1.4014, "step": 15 }, { "epoch": 0.06451612903225806, "grad_norm": 5.174832820892334, "learning_rate": 1.2258064516129034e-05, "loss": 1.3709, "step": 20 }, { "epoch": 0.08064516129032258, "grad_norm": 4.159119129180908, "learning_rate": 1.5483870967741936e-05, "loss": 1.3652, "step": 25 }, { "epoch": 0.0967741935483871, "grad_norm": 4.116876602172852, "learning_rate": 1.870967741935484e-05, "loss": 1.3885, "step": 30 }, { "epoch": 0.11290322580645161, "grad_norm": 3.840414047241211, "learning_rate": 1.999429490929718e-05, "loss": 1.3982, "step": 35 }, { "epoch": 0.12903225806451613, "grad_norm": 4.314554691314697, "learning_rate": 1.9959454037227215e-05, "loss": 1.3744, "step": 40 }, { "epoch": 0.14516129032258066, "grad_norm": 4.36101770401001, "learning_rate": 1.989305206325792e-05, "loss": 1.394, "step": 45 }, { "epoch": 0.16129032258064516, "grad_norm": 3.858530282974243, "learning_rate": 1.9795299412524948e-05, "loss": 1.3652, "step": 50 }, { "epoch": 0.1774193548387097, "grad_norm": 4.053957462310791, "learning_rate": 1.9666505859174462e-05, "loss": 1.3825, "step": 55 }, { "epoch": 0.1935483870967742, "grad_norm": 3.8405709266662598, "learning_rate": 1.9507079544701583e-05, "loss": 1.3449, "step": 60 }, { "epoch": 0.20967741935483872, "grad_norm": 3.8573291301727295, "learning_rate": 1.9317525684566686e-05, "loss": 1.3268, "step": 65 }, { "epoch": 0.22580645161290322, "grad_norm": 4.115865230560303, "learning_rate": 1.9098444967188308e-05, "loss": 1.3096, "step": 70 }, { "epoch": 0.24193548387096775, "grad_norm": 3.9826207160949707, "learning_rate": 1.8850531650386154e-05, "loss": 1.3243, "step": 75 }, { "epoch": 0.25806451612903225, "grad_norm": 8.022098541259766, "learning_rate": 1.857457136130651e-05, "loss": 1.309, "step": 80 }, { "epoch": 0.27419354838709675, "grad_norm": 3.8160018920898438, "learning_rate": 1.827143860680199e-05, "loss": 1.278, "step": 85 }, { "epoch": 0.2903225806451613, "grad_norm": 4.026176452636719, "learning_rate": 1.7942094002155122e-05, "loss": 1.2788, "step": 90 }, { "epoch": 0.3064516129032258, "grad_norm": 4.071281909942627, "learning_rate": 1.758758122692791e-05, "loss": 1.2412, "step": 95 }, { "epoch": 0.3225806451612903, "grad_norm": 3.6679961681365967, "learning_rate": 1.7209023717584013e-05, "loss": 1.1916, "step": 100 }, { "epoch": 0.3225806451612903, "eval_loss": 1.2297844886779785, "eval_runtime": 5.4527, "eval_samples_per_second": 196.965, "eval_steps_per_second": 6.235, "step": 100 }, { "epoch": 0.3387096774193548, "grad_norm": 3.7028727531433105, "learning_rate": 1.6807621107364613e-05, "loss": 1.2004, "step": 105 }, { "epoch": 0.3548387096774194, "grad_norm": 3.7159862518310547, "learning_rate": 1.6384645424699835e-05, "loss": 1.1933, "step": 110 }, { "epoch": 0.3709677419354839, "grad_norm": 3.5471580028533936, "learning_rate": 1.594143706220273e-05, "loss": 1.1913, "step": 115 }, { "epoch": 0.3870967741935484, "grad_norm": 4.138647079467773, "learning_rate": 1.5479400529019987e-05, "loss": 1.1872, "step": 120 }, { "epoch": 0.4032258064516129, "grad_norm": 3.7750704288482666, "learning_rate": 1.5000000000000002e-05, "loss": 1.1359, "step": 125 }, { "epoch": 0.41935483870967744, "grad_norm": 3.429034948348999, "learning_rate": 1.4504754675782731e-05, "loss": 1.1518, "step": 130 }, { "epoch": 0.43548387096774194, "grad_norm": 3.416344404220581, "learning_rate": 1.3995233968515105e-05, "loss": 1.1405, "step": 135 }, { "epoch": 0.45161290322580644, "grad_norm": 4.399573802947998, "learning_rate": 1.3473052528448203e-05, "loss": 1.1137, "step": 140 }, { "epoch": 0.46774193548387094, "grad_norm": 3.7525525093078613, "learning_rate": 1.2939865127176771e-05, "loss": 1.1235, "step": 145 }, { "epoch": 0.4838709677419355, "grad_norm": 3.8501410484313965, "learning_rate": 1.2397361413735785e-05, "loss": 1.0858, "step": 150 }, { "epoch": 0.5, "grad_norm": 3.5585899353027344, "learning_rate": 1.1847260560171895e-05, "loss": 1.0692, "step": 155 }, { "epoch": 0.5161290322580645, "grad_norm": 3.620709180831909, "learning_rate": 1.1291305813557616e-05, "loss": 1.0313, "step": 160 }, { "epoch": 0.532258064516129, "grad_norm": 3.7223265171051025, "learning_rate": 1.0731258971712762e-05, "loss": 1.0268, "step": 165 }, { "epoch": 0.5483870967741935, "grad_norm": 3.67724871635437, "learning_rate": 1.0168894800139311e-05, "loss": 1.0123, "step": 170 }, { "epoch": 0.5645161290322581, "grad_norm": 4.000744342803955, "learning_rate": 9.605995407862248e-06, "loss": 0.9905, "step": 175 }, { "epoch": 0.5806451612903226, "grad_norm": 3.4585959911346436, "learning_rate": 9.04434459999902e-06, "loss": 0.9682, "step": 180 }, { "epoch": 0.5967741935483871, "grad_norm": 3.573272466659546, "learning_rate": 8.485722224954237e-06, "loss": 0.9364, "step": 185 }, { "epoch": 0.6129032258064516, "grad_norm": 3.517512083053589, "learning_rate": 7.93189853415293e-06, "loss": 0.9559, "step": 190 }, { "epoch": 0.6290322580645161, "grad_norm": 3.629652500152588, "learning_rate": 7.384628572186334e-06, "loss": 0.9376, "step": 195 }, { "epoch": 0.6451612903225806, "grad_norm": 3.6871094703674316, "learning_rate": 6.845646615147445e-06, "loss": 0.9281, "step": 200 }, { "epoch": 0.6451612903225806, "eval_loss": 0.9366754293441772, "eval_runtime": 5.3925, "eval_samples_per_second": 199.164, "eval_steps_per_second": 6.305, "step": 200 }, { "epoch": 0.6612903225806451, "grad_norm": 3.7159454822540283, "learning_rate": 6.31666067478113e-06, "loss": 0.9359, "step": 205 }, { "epoch": 0.6774193548387096, "grad_norm": 3.5741474628448486, "learning_rate": 5.799347085864851e-06, "loss": 0.8788, "step": 210 }, { "epoch": 0.6935483870967742, "grad_norm": 3.8699448108673096, "learning_rate": 5.295345193972445e-06, "loss": 0.8994, "step": 215 }, { "epoch": 0.7096774193548387, "grad_norm": 3.681175470352173, "learning_rate": 4.8062521604551245e-06, "loss": 0.8894, "step": 220 }, { "epoch": 0.7258064516129032, "grad_norm": 4.077960968017578, "learning_rate": 4.333617901102592e-06, "loss": 0.8478, "step": 225 }, { "epoch": 0.7419354838709677, "grad_norm": 4.131643772125244, "learning_rate": 3.878940174523371e-06, "loss": 0.8409, "step": 230 }, { "epoch": 0.7580645161290323, "grad_norm": 3.835108995437622, "learning_rate": 3.4436598358091577e-06, "loss": 0.878, "step": 235 }, { "epoch": 0.7741935483870968, "grad_norm": 3.9890787601470947, "learning_rate": 3.0291562705240107e-06, "loss": 0.8321, "step": 240 }, { "epoch": 0.7903225806451613, "grad_norm": 3.728030204772949, "learning_rate": 2.6367430234880286e-06, "loss": 0.8269, "step": 245 }, { "epoch": 0.8064516129032258, "grad_norm": 3.8121144771575928, "learning_rate": 2.2676636362076075e-06, "loss": 0.7872, "step": 250 }, { "epoch": 0.8225806451612904, "grad_norm": 3.7709388732910156, "learning_rate": 1.9230877061433505e-06, "loss": 0.8187, "step": 255 }, { "epoch": 0.8387096774193549, "grad_norm": 3.6049516201019287, "learning_rate": 1.60410718030361e-06, "loss": 0.8195, "step": 260 }, { "epoch": 0.8548387096774194, "grad_norm": 3.6554863452911377, "learning_rate": 1.3117328949091634e-06, "loss": 0.8091, "step": 265 }, { "epoch": 0.8709677419354839, "grad_norm": 3.8024368286132812, "learning_rate": 1.0468913720946084e-06, "loss": 0.7855, "step": 270 }, { "epoch": 0.8870967741935484, "grad_norm": 3.79064679145813, "learning_rate": 8.10421883797694e-07, "loss": 0.7959, "step": 275 }, { "epoch": 0.9032258064516129, "grad_norm": 3.7366783618927, "learning_rate": 6.030737921409169e-07, "loss": 0.7824, "step": 280 }, { "epoch": 0.9193548387096774, "grad_norm": 3.792200803756714, "learning_rate": 4.2550417473364524e-07, "loss": 0.7653, "step": 285 }, { "epoch": 0.9354838709677419, "grad_norm": 3.850405216217041, "learning_rate": 2.7827574242009434e-07, "loss": 0.759, "step": 290 }, { "epoch": 0.9516129032258065, "grad_norm": 3.8457884788513184, "learning_rate": 1.6185505607171027e-07, "loss": 0.7866, "step": 295 }, { "epoch": 0.967741935483871, "grad_norm": 3.824265241622925, "learning_rate": 7.661104807487607e-08, "loss": 0.7683, "step": 300 }, { "epoch": 0.967741935483871, "eval_loss": 0.7902069091796875, "eval_runtime": 5.4529, "eval_samples_per_second": 196.96, "eval_steps_per_second": 6.235, "step": 300 }, { "epoch": 0.9838709677419355, "grad_norm": 3.8102200031280518, "learning_rate": 2.2813853199292745e-08, "loss": 0.778, "step": 305 }, { "epoch": 1.0, "grad_norm": 3.923637628555298, "learning_rate": 6.339525519594159e-10, "loss": 0.7865, "step": 310 }, { "epoch": 1.0, "step": 310, "total_flos": 1.0839391709062758e+17, "train_loss": 1.0778046692571333, "train_runtime": 974.833, "train_samples_per_second": 20.339, "train_steps_per_second": 0.318 } ], "logging_steps": 5, "max_steps": 310, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0839391709062758e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }