{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020151133501259445, "grad_norm": 200.0, "learning_rate": 0.0, "loss": 11.0118, "step": 1 }, { "epoch": 0.04030226700251889, "grad_norm": 191.0, "learning_rate": 2.0000000000000003e-06, "loss": 11.6099, "step": 2 }, { "epoch": 0.060453400503778336, "grad_norm": 193.0, "learning_rate": 4.000000000000001e-06, "loss": 11.119, "step": 3 }, { "epoch": 0.08060453400503778, "grad_norm": 187.0, "learning_rate": 6e-06, "loss": 10.1865, "step": 4 }, { "epoch": 0.10075566750629723, "grad_norm": 159.0, "learning_rate": 8.000000000000001e-06, "loss": 8.6306, "step": 5 }, { "epoch": 0.12090680100755667, "grad_norm": 130.0, "learning_rate": 1e-05, "loss": 6.7238, "step": 6 }, { "epoch": 0.14105793450881612, "grad_norm": 99.5, "learning_rate": 9.777777777777779e-06, "loss": 4.568, "step": 7 }, { "epoch": 0.16120906801007556, "grad_norm": 62.25, "learning_rate": 9.555555555555556e-06, "loss": 3.7907, "step": 8 }, { "epoch": 0.181360201511335, "grad_norm": 38.5, "learning_rate": 9.333333333333334e-06, "loss": 2.9669, "step": 9 }, { "epoch": 0.20151133501259447, "grad_norm": 36.0, "learning_rate": 9.111111111111112e-06, "loss": 3.1071, "step": 10 }, { "epoch": 0.20151133501259447, "eval_loss": 2.249746799468994, "eval_model_preparation_time": 0.0215, "eval_runtime": 2.9615, "eval_samples_per_second": 32.078, "eval_steps_per_second": 16.208, "step": 10 }, { "epoch": 0.2216624685138539, "grad_norm": 29.125, "learning_rate": 8.888888888888888e-06, "loss": 2.2238, "step": 11 }, { "epoch": 0.24181360201511334, "grad_norm": 27.125, "learning_rate": 8.666666666666668e-06, "loss": 1.978, "step": 12 }, { "epoch": 0.2619647355163728, "grad_norm": 21.625, "learning_rate": 8.444444444444446e-06, "loss": 1.7079, "step": 13 }, { "epoch": 0.28211586901763225, "grad_norm": 22.5, "learning_rate": 8.222222222222222e-06, "loss": 1.7878, "step": 14 }, { "epoch": 0.3022670025188917, "grad_norm": 15.8125, "learning_rate": 8.000000000000001e-06, "loss": 1.1862, "step": 15 }, { "epoch": 0.3224181360201511, "grad_norm": 13.9375, "learning_rate": 7.77777777777778e-06, "loss": 1.2518, "step": 16 }, { "epoch": 0.3425692695214106, "grad_norm": 17.25, "learning_rate": 7.555555555555556e-06, "loss": 1.107, "step": 17 }, { "epoch": 0.36272040302267, "grad_norm": 13.5625, "learning_rate": 7.333333333333333e-06, "loss": 0.9791, "step": 18 }, { "epoch": 0.38287153652392947, "grad_norm": 14.875, "learning_rate": 7.111111111111112e-06, "loss": 1.1575, "step": 19 }, { "epoch": 0.40302267002518893, "grad_norm": 14.9375, "learning_rate": 6.88888888888889e-06, "loss": 0.818, "step": 20 }, { "epoch": 0.40302267002518893, "eval_loss": 0.7283493876457214, "eval_model_preparation_time": 0.0215, "eval_runtime": 3.0654, "eval_samples_per_second": 30.991, "eval_steps_per_second": 15.659, "step": 20 }, { "epoch": 0.42317380352644834, "grad_norm": 13.75, "learning_rate": 6.666666666666667e-06, "loss": 0.785, "step": 21 }, { "epoch": 0.4433249370277078, "grad_norm": 14.9375, "learning_rate": 6.444444444444445e-06, "loss": 0.6213, "step": 22 }, { "epoch": 0.4634760705289673, "grad_norm": 18.375, "learning_rate": 6.222222222222223e-06, "loss": 0.6943, "step": 23 }, { "epoch": 0.4836272040302267, "grad_norm": 14.3125, "learning_rate": 6e-06, "loss": 0.5944, "step": 24 }, { "epoch": 0.5037783375314862, "grad_norm": 10.6875, "learning_rate": 5.777777777777778e-06, "loss": 0.5768, "step": 25 }, { "epoch": 0.5239294710327456, "grad_norm": 14.8125, "learning_rate": 5.555555555555557e-06, "loss": 0.6205, "step": 26 }, { "epoch": 0.5440806045340051, "grad_norm": 10.25, "learning_rate": 5.333333333333334e-06, "loss": 0.5576, "step": 27 }, { "epoch": 0.5642317380352645, "grad_norm": 8.6875, "learning_rate": 5.1111111111111115e-06, "loss": 0.585, "step": 28 }, { "epoch": 0.5843828715365239, "grad_norm": 9.3125, "learning_rate": 4.888888888888889e-06, "loss": 0.4842, "step": 29 }, { "epoch": 0.6045340050377834, "grad_norm": 9.625, "learning_rate": 4.666666666666667e-06, "loss": 0.5162, "step": 30 }, { "epoch": 0.6045340050377834, "eval_loss": 0.5592977404594421, "eval_model_preparation_time": 0.0215, "eval_runtime": 3.3756, "eval_samples_per_second": 28.143, "eval_steps_per_second": 14.22, "step": 30 }, { "epoch": 0.6246851385390428, "grad_norm": 6.9375, "learning_rate": 4.444444444444444e-06, "loss": 0.4891, "step": 31 }, { "epoch": 0.6448362720403022, "grad_norm": 13.5, "learning_rate": 4.222222222222223e-06, "loss": 0.4537, "step": 32 }, { "epoch": 0.6649874055415617, "grad_norm": 12.5625, "learning_rate": 4.000000000000001e-06, "loss": 0.5064, "step": 33 }, { "epoch": 0.6851385390428212, "grad_norm": 14.375, "learning_rate": 3.777777777777778e-06, "loss": 0.5586, "step": 34 }, { "epoch": 0.7052896725440806, "grad_norm": 7.9375, "learning_rate": 3.555555555555556e-06, "loss": 0.4467, "step": 35 }, { "epoch": 0.72544080604534, "grad_norm": 11.3125, "learning_rate": 3.3333333333333333e-06, "loss": 0.4768, "step": 36 }, { "epoch": 0.7455919395465995, "grad_norm": 9.0, "learning_rate": 3.1111111111111116e-06, "loss": 0.5006, "step": 37 }, { "epoch": 0.7657430730478589, "grad_norm": 9.6875, "learning_rate": 2.888888888888889e-06, "loss": 0.4835, "step": 38 }, { "epoch": 0.7858942065491183, "grad_norm": 6.5625, "learning_rate": 2.666666666666667e-06, "loss": 0.4814, "step": 39 }, { "epoch": 0.8060453400503779, "grad_norm": 6.375, "learning_rate": 2.4444444444444447e-06, "loss": 0.3869, "step": 40 }, { "epoch": 0.8060453400503779, "eval_loss": 0.5029374957084656, "eval_model_preparation_time": 0.0215, "eval_runtime": 2.8351, "eval_samples_per_second": 33.508, "eval_steps_per_second": 16.931, "step": 40 }, { "epoch": 0.8261964735516373, "grad_norm": 8.3125, "learning_rate": 2.222222222222222e-06, "loss": 0.5224, "step": 41 }, { "epoch": 0.8463476070528967, "grad_norm": 7.625, "learning_rate": 2.0000000000000003e-06, "loss": 0.4716, "step": 42 }, { "epoch": 0.8664987405541562, "grad_norm": 5.6875, "learning_rate": 1.777777777777778e-06, "loss": 0.3948, "step": 43 }, { "epoch": 0.8866498740554156, "grad_norm": 7.5625, "learning_rate": 1.5555555555555558e-06, "loss": 0.4883, "step": 44 }, { "epoch": 0.906801007556675, "grad_norm": 7.375, "learning_rate": 1.3333333333333334e-06, "loss": 0.4354, "step": 45 }, { "epoch": 0.9269521410579346, "grad_norm": 8.375, "learning_rate": 1.111111111111111e-06, "loss": 0.4541, "step": 46 }, { "epoch": 0.947103274559194, "grad_norm": 7.5, "learning_rate": 8.88888888888889e-07, "loss": 0.4197, "step": 47 }, { "epoch": 0.9672544080604534, "grad_norm": 6.0, "learning_rate": 6.666666666666667e-07, "loss": 0.4558, "step": 48 }, { "epoch": 0.9874055415617129, "grad_norm": 6.09375, "learning_rate": 4.444444444444445e-07, "loss": 0.4534, "step": 49 }, { "epoch": 1.0, "grad_norm": 7.65625, "learning_rate": 2.2222222222222224e-07, "loss": 0.4926, "step": 50 }, { "epoch": 1.0, "eval_loss": 0.49504080414772034, "eval_model_preparation_time": 0.0215, "eval_runtime": 3.2667, "eval_samples_per_second": 29.082, "eval_steps_per_second": 14.694, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1033611294953472.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }