{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 10, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020151133501259445, "grad_norm": 286.0, "learning_rate": 0.0, "loss": 6.3323, "step": 1 }, { "epoch": 0.04030226700251889, "grad_norm": 294.0, "learning_rate": 2.0000000000000003e-06, "loss": 6.9278, "step": 2 }, { "epoch": 0.060453400503778336, "grad_norm": 258.0, "learning_rate": 4.000000000000001e-06, "loss": 6.5878, "step": 3 }, { "epoch": 0.08060453400503778, "grad_norm": 236.0, "learning_rate": 6e-06, "loss": 4.9359, "step": 4 }, { "epoch": 0.10075566750629723, "grad_norm": 98.5, "learning_rate": 8.000000000000001e-06, "loss": 3.6571, "step": 5 }, { "epoch": 0.12090680100755667, "grad_norm": 71.5, "learning_rate": 1e-05, "loss": 2.9318, "step": 6 }, { "epoch": 0.14105793450881612, "grad_norm": 40.25, "learning_rate": 9.777777777777779e-06, "loss": 2.16, "step": 7 }, { "epoch": 0.16120906801007556, "grad_norm": 40.75, "learning_rate": 9.555555555555556e-06, "loss": 2.028, "step": 8 }, { "epoch": 0.181360201511335, "grad_norm": 39.75, "learning_rate": 9.333333333333334e-06, "loss": 1.6548, "step": 9 }, { "epoch": 0.20151133501259447, "grad_norm": 35.75, "learning_rate": 9.111111111111112e-06, "loss": 2.1349, "step": 10 }, { "epoch": 0.20151133501259447, "eval_loss": 1.6403069496154785, "eval_model_preparation_time": 0.0199, "eval_runtime": 2.5723, "eval_samples_per_second": 36.931, "eval_steps_per_second": 18.66, "step": 10 }, { "epoch": 0.2216624685138539, "grad_norm": 50.0, "learning_rate": 8.888888888888888e-06, "loss": 1.2909, "step": 11 }, { "epoch": 0.24181360201511334, "grad_norm": 30.0, "learning_rate": 8.666666666666668e-06, "loss": 1.2002, "step": 12 }, { "epoch": 0.2619647355163728, "grad_norm": 25.25, "learning_rate": 8.444444444444446e-06, "loss": 1.1529, "step": 13 }, { "epoch": 0.28211586901763225, "grad_norm": 28.5, "learning_rate": 8.222222222222222e-06, "loss": 1.436, "step": 14 }, { "epoch": 0.3022670025188917, "grad_norm": 21.5, "learning_rate": 8.000000000000001e-06, "loss": 1.1747, "step": 15 }, { "epoch": 0.3224181360201511, "grad_norm": 25.25, "learning_rate": 7.77777777777778e-06, "loss": 1.2486, "step": 16 }, { "epoch": 0.3425692695214106, "grad_norm": 29.125, "learning_rate": 7.555555555555556e-06, "loss": 1.3699, "step": 17 }, { "epoch": 0.36272040302267, "grad_norm": 28.625, "learning_rate": 7.333333333333333e-06, "loss": 1.0368, "step": 18 }, { "epoch": 0.38287153652392947, "grad_norm": 31.25, "learning_rate": 7.111111111111112e-06, "loss": 1.7601, "step": 19 }, { "epoch": 0.40302267002518893, "grad_norm": 27.125, "learning_rate": 6.88888888888889e-06, "loss": 1.0943, "step": 20 }, { "epoch": 0.40302267002518893, "eval_loss": 1.2779872417449951, "eval_model_preparation_time": 0.0199, "eval_runtime": 2.5465, "eval_samples_per_second": 37.305, "eval_steps_per_second": 18.849, "step": 20 }, { "epoch": 0.42317380352644834, "grad_norm": 29.0, "learning_rate": 6.666666666666667e-06, "loss": 1.1963, "step": 21 }, { "epoch": 0.4433249370277078, "grad_norm": 20.625, "learning_rate": 6.444444444444445e-06, "loss": 0.8011, "step": 22 }, { "epoch": 0.4634760705289673, "grad_norm": 26.625, "learning_rate": 6.222222222222223e-06, "loss": 0.9265, "step": 23 }, { "epoch": 0.4836272040302267, "grad_norm": 26.375, "learning_rate": 6e-06, "loss": 0.9816, "step": 24 }, { "epoch": 0.5037783375314862, "grad_norm": 27.25, "learning_rate": 5.777777777777778e-06, "loss": 1.1287, "step": 25 }, { "epoch": 0.5239294710327456, "grad_norm": 22.75, "learning_rate": 5.555555555555557e-06, "loss": 1.031, "step": 26 }, { "epoch": 0.5440806045340051, "grad_norm": 28.25, "learning_rate": 5.333333333333334e-06, "loss": 1.184, "step": 27 }, { "epoch": 0.5642317380352645, "grad_norm": 20.5, "learning_rate": 5.1111111111111115e-06, "loss": 0.9292, "step": 28 }, { "epoch": 0.5843828715365239, "grad_norm": 27.75, "learning_rate": 4.888888888888889e-06, "loss": 0.9952, "step": 29 }, { "epoch": 0.6045340050377834, "grad_norm": 21.125, "learning_rate": 4.666666666666667e-06, "loss": 0.886, "step": 30 }, { "epoch": 0.6045340050377834, "eval_loss": 1.2436152696609497, "eval_model_preparation_time": 0.0199, "eval_runtime": 2.7811, "eval_samples_per_second": 34.159, "eval_steps_per_second": 17.259, "step": 30 }, { "epoch": 0.6246851385390428, "grad_norm": 22.625, "learning_rate": 4.444444444444444e-06, "loss": 0.9749, "step": 31 }, { "epoch": 0.6448362720403022, "grad_norm": 17.0, "learning_rate": 4.222222222222223e-06, "loss": 0.8473, "step": 32 }, { "epoch": 0.6649874055415617, "grad_norm": 25.125, "learning_rate": 4.000000000000001e-06, "loss": 0.928, "step": 33 }, { "epoch": 0.6851385390428212, "grad_norm": 38.25, "learning_rate": 3.777777777777778e-06, "loss": 1.1706, "step": 34 }, { "epoch": 0.7052896725440806, "grad_norm": 15.75, "learning_rate": 3.555555555555556e-06, "loss": 0.6605, "step": 35 }, { "epoch": 0.72544080604534, "grad_norm": 22.875, "learning_rate": 3.3333333333333333e-06, "loss": 0.9448, "step": 36 }, { "epoch": 0.7455919395465995, "grad_norm": 16.25, "learning_rate": 3.1111111111111116e-06, "loss": 0.8088, "step": 37 }, { "epoch": 0.7657430730478589, "grad_norm": 15.5625, "learning_rate": 2.888888888888889e-06, "loss": 0.7572, "step": 38 }, { "epoch": 0.7858942065491183, "grad_norm": 28.75, "learning_rate": 2.666666666666667e-06, "loss": 0.9594, "step": 39 }, { "epoch": 0.8060453400503779, "grad_norm": 20.375, "learning_rate": 2.4444444444444447e-06, "loss": 0.7057, "step": 40 }, { "epoch": 0.8060453400503779, "eval_loss": 1.194969892501831, "eval_model_preparation_time": 0.0199, "eval_runtime": 2.4247, "eval_samples_per_second": 39.181, "eval_steps_per_second": 19.797, "step": 40 }, { "epoch": 0.8261964735516373, "grad_norm": 20.375, "learning_rate": 2.222222222222222e-06, "loss": 1.032, "step": 41 }, { "epoch": 0.8463476070528967, "grad_norm": 24.0, "learning_rate": 2.0000000000000003e-06, "loss": 0.9994, "step": 42 }, { "epoch": 0.8664987405541562, "grad_norm": 25.625, "learning_rate": 1.777777777777778e-06, "loss": 0.7317, "step": 43 }, { "epoch": 0.8866498740554156, "grad_norm": 21.5, "learning_rate": 1.5555555555555558e-06, "loss": 1.0102, "step": 44 }, { "epoch": 0.906801007556675, "grad_norm": 23.125, "learning_rate": 1.3333333333333334e-06, "loss": 0.9164, "step": 45 }, { "epoch": 0.9269521410579346, "grad_norm": 18.375, "learning_rate": 1.111111111111111e-06, "loss": 0.8684, "step": 46 }, { "epoch": 0.947103274559194, "grad_norm": 16.75, "learning_rate": 8.88888888888889e-07, "loss": 0.8164, "step": 47 }, { "epoch": 0.9672544080604534, "grad_norm": 29.0, "learning_rate": 6.666666666666667e-07, "loss": 1.2547, "step": 48 }, { "epoch": 0.9874055415617129, "grad_norm": 19.5, "learning_rate": 4.444444444444445e-07, "loss": 0.7327, "step": 49 }, { "epoch": 1.0, "grad_norm": 26.875, "learning_rate": 2.2222222222222224e-07, "loss": 1.0369, "step": 50 }, { "epoch": 1.0, "eval_loss": 1.154162883758545, "eval_model_preparation_time": 0.0199, "eval_runtime": 2.5588, "eval_samples_per_second": 37.127, "eval_steps_per_second": 18.759, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1842168632426496.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }