{ "best_global_step": 2030, "best_metric": 3.28729248046875, "best_model_checkpoint": "/home/p318482/syntactic-bootstrapping/model_trained/cds_sh1_fr_30/checkpoint-2030", "epoch": 16.0, "eval_steps": 500, "global_step": 2320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006896551724137931, "grad_norm": 34.13589859008789, "learning_rate": 0.0, "loss": 9.8164, "step": 1 }, { "epoch": 1.0, "grad_norm": 1.4214282035827637, "learning_rate": 2.88e-05, "loss": 6.0017, "step": 145 }, { "epoch": 1.0, "eval_loss": 4.516501426696777, "eval_runtime": 4.3416, "eval_samples_per_second": 810.983, "eval_steps_per_second": 3.225, "step": 145 }, { "epoch": 2.0, "grad_norm": 1.7844622135162354, "learning_rate": 5.7799999999999995e-05, "loss": 4.1747, "step": 290 }, { "epoch": 2.0, "eval_loss": 4.028092384338379, "eval_runtime": 4.4671, "eval_samples_per_second": 788.2, "eval_steps_per_second": 3.134, "step": 290 }, { "epoch": 3.0, "grad_norm": 1.2072691917419434, "learning_rate": 8.680000000000001e-05, "loss": 3.8777, "step": 435 }, { "epoch": 3.0, "eval_loss": 3.7704145908355713, "eval_runtime": 4.3729, "eval_samples_per_second": 805.181, "eval_steps_per_second": 3.202, "step": 435 }, { "epoch": 4.0, "grad_norm": 1.2117047309875488, "learning_rate": 9.668067226890756e-05, "loss": 3.6723, "step": 580 }, { "epoch": 4.0, "eval_loss": 3.6042661666870117, "eval_runtime": 4.4715, "eval_samples_per_second": 787.434, "eval_steps_per_second": 3.131, "step": 580 }, { "epoch": 5.0, "grad_norm": 1.2041577100753784, "learning_rate": 9.058823529411765e-05, "loss": 3.5366, "step": 725 }, { "epoch": 5.0, "eval_loss": 3.5113701820373535, "eval_runtime": 4.511, "eval_samples_per_second": 780.542, "eval_steps_per_second": 3.104, "step": 725 }, { "epoch": 6.0, "grad_norm": 1.1716428995132446, "learning_rate": 8.449579831932774e-05, "loss": 3.4452, "step": 870 }, { "epoch": 6.0, "eval_loss": 3.4479944705963135, "eval_runtime": 4.4139, "eval_samples_per_second": 797.713, "eval_steps_per_second": 3.172, "step": 870 }, { "epoch": 7.0, "grad_norm": 1.1970024108886719, "learning_rate": 7.840336134453782e-05, "loss": 3.3743, "step": 1015 }, { "epoch": 7.0, "eval_loss": 3.4008681774139404, "eval_runtime": 4.385, "eval_samples_per_second": 802.963, "eval_steps_per_second": 3.193, "step": 1015 }, { "epoch": 8.0, "grad_norm": 1.2204298973083496, "learning_rate": 7.23109243697479e-05, "loss": 3.3153, "step": 1160 }, { "epoch": 8.0, "eval_loss": 3.366744041442871, "eval_runtime": 4.437, "eval_samples_per_second": 793.546, "eval_steps_per_second": 3.155, "step": 1160 }, { "epoch": 9.0, "grad_norm": 1.3044636249542236, "learning_rate": 6.621848739495798e-05, "loss": 3.2657, "step": 1305 }, { "epoch": 9.0, "eval_loss": 3.3413963317871094, "eval_runtime": 4.694, "eval_samples_per_second": 750.11, "eval_steps_per_second": 2.983, "step": 1305 }, { "epoch": 10.0, "grad_norm": 1.4050198793411255, "learning_rate": 6.012605042016807e-05, "loss": 3.2193, "step": 1450 }, { "epoch": 10.0, "eval_loss": 3.3189010620117188, "eval_runtime": 4.3858, "eval_samples_per_second": 802.816, "eval_steps_per_second": 3.192, "step": 1450 }, { "epoch": 11.0, "grad_norm": 1.4973615407943726, "learning_rate": 5.403361344537815e-05, "loss": 3.1773, "step": 1595 }, { "epoch": 11.0, "eval_loss": 3.30843448638916, "eval_runtime": 4.4298, "eval_samples_per_second": 794.852, "eval_steps_per_second": 3.16, "step": 1595 }, { "epoch": 12.0, "grad_norm": 1.5111000537872314, "learning_rate": 4.794117647058824e-05, "loss": 3.1366, "step": 1740 }, { "epoch": 12.0, "eval_loss": 3.2951903343200684, "eval_runtime": 4.3718, "eval_samples_per_second": 805.398, "eval_steps_per_second": 3.202, "step": 1740 }, { "epoch": 13.0, "grad_norm": 1.6258167028427124, "learning_rate": 4.184873949579832e-05, "loss": 3.0985, "step": 1885 }, { "epoch": 13.0, "eval_loss": 3.2935917377471924, "eval_runtime": 4.4103, "eval_samples_per_second": 798.356, "eval_steps_per_second": 3.174, "step": 1885 }, { "epoch": 14.0, "grad_norm": 1.7234424352645874, "learning_rate": 3.575630252100841e-05, "loss": 3.061, "step": 2030 }, { "epoch": 14.0, "eval_loss": 3.28729248046875, "eval_runtime": 4.3993, "eval_samples_per_second": 800.353, "eval_steps_per_second": 3.182, "step": 2030 }, { "epoch": 15.0, "grad_norm": 1.8610221147537231, "learning_rate": 2.966386554621849e-05, "loss": 3.0252, "step": 2175 }, { "epoch": 15.0, "eval_loss": 3.2955870628356934, "eval_runtime": 4.5452, "eval_samples_per_second": 774.661, "eval_steps_per_second": 3.08, "step": 2175 }, { "epoch": 16.0, "grad_norm": 1.9218178987503052, "learning_rate": 2.357142857142857e-05, "loss": 2.9897, "step": 2320 }, { "epoch": 16.0, "eval_loss": 3.2937674522399902, "eval_runtime": 4.4093, "eval_samples_per_second": 798.538, "eval_steps_per_second": 3.175, "step": 2320 } ], "logging_steps": 500, "max_steps": 2900, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.8599104135168e+16, "train_batch_size": 256, "trial_name": null, "trial_params": null }