{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 7.022058486938477, "learning_rate": 3.3333333333333333e-06, "loss": 1.2871, "step": 5 }, { "epoch": 0.05, "grad_norm": 2.1450326442718506, "learning_rate": 4.997050398198977e-06, "loss": 1.0775, "step": 10 }, { "epoch": 0.075, "grad_norm": 1.9242591857910156, "learning_rate": 4.979050253066064e-06, "loss": 0.9553, "step": 15 }, { "epoch": 0.1, "grad_norm": 1.4348323345184326, "learning_rate": 4.944806430988927e-06, "loss": 0.8635, "step": 20 }, { "epoch": 0.125, "grad_norm": 1.0105319023132324, "learning_rate": 4.894543310469968e-06, "loss": 0.8612, "step": 25 }, { "epoch": 0.15, "grad_norm": 0.6607258319854736, "learning_rate": 4.828590234527107e-06, "loss": 0.8138, "step": 30 }, { "epoch": 0.175, "grad_norm": 0.4061996638774872, "learning_rate": 4.747379352713489e-06, "loss": 0.7887, "step": 35 }, { "epoch": 0.2, "grad_norm": 0.2776339054107666, "learning_rate": 4.651442789509813e-06, "loss": 0.7501, "step": 40 }, { "epoch": 0.225, "grad_norm": 0.25159376859664917, "learning_rate": 4.541409157643027e-06, "loss": 0.7603, "step": 45 }, { "epoch": 0.25, "grad_norm": 0.22889962792396545, "learning_rate": 4.417999439177465e-06, "loss": 0.7377, "step": 50 }, { "epoch": 0.275, "grad_norm": 0.25352779030799866, "learning_rate": 4.282022261367074e-06, "loss": 0.7541, "step": 55 }, { "epoch": 0.3, "grad_norm": 0.2564803957939148, "learning_rate": 4.134368598223132e-06, "loss": 0.7207, "step": 60 }, { "epoch": 0.325, "grad_norm": 0.21964561939239502, "learning_rate": 3.976005932514807e-06, "loss": 0.7318, "step": 65 }, { "epoch": 0.35, "grad_norm": 0.1952921450138092, "learning_rate": 3.807971916455325e-06, "loss": 0.7166, "step": 70 }, { "epoch": 0.375, "grad_norm": 0.21189448237419128, "learning_rate": 3.631367572611348e-06, "loss": 0.7351, "step": 75 }, { "epoch": 0.4, "grad_norm": 0.20247572660446167, "learning_rate": 3.4473500795857674e-06, "loss": 0.7192, "step": 80 }, { "epoch": 0.425, "grad_norm": 0.1986052691936493, "learning_rate": 3.257125189744877e-06, "loss": 0.7018, "step": 85 }, { "epoch": 0.45, "grad_norm": 0.20034098625183105, "learning_rate": 3.061939328671824e-06, "loss": 0.6939, "step": 90 }, { "epoch": 0.475, "grad_norm": 0.1981743723154068, "learning_rate": 2.8630714281137263e-06, "loss": 0.7129, "step": 95 }, { "epoch": 0.5, "grad_norm": 0.2070968598127365, "learning_rate": 2.6618245459360896e-06, "loss": 0.6993, "step": 100 }, { "epoch": 0.525, "grad_norm": 0.19754789769649506, "learning_rate": 2.4595173279937464e-06, "loss": 0.7106, "step": 105 }, { "epoch": 0.55, "grad_norm": 0.18637321889400482, "learning_rate": 2.25747536786338e-06, "loss": 0.6975, "step": 110 }, { "epoch": 0.575, "grad_norm": 0.1984306424856186, "learning_rate": 2.0570225210519433e-06, "loss": 0.7025, "step": 115 }, { "epoch": 0.6, "grad_norm": 0.19408036768436432, "learning_rate": 1.8594722305935691e-06, "loss": 0.6748, "step": 120 }, { "epoch": 0.625, "grad_norm": 0.19682057201862335, "learning_rate": 1.6661189208729492e-06, "loss": 0.7103, "step": 125 }, { "epoch": 0.65, "grad_norm": 0.2067941576242447, "learning_rate": 1.4782295160661103e-06, "loss": 0.6787, "step": 130 }, { "epoch": 0.675, "grad_norm": 0.19579337537288666, "learning_rate": 1.2970351387729875e-06, "loss": 0.6844, "step": 135 }, { "epoch": 0.7, "grad_norm": 0.19357725977897644, "learning_rate": 1.1237230432354912e-06, "loss": 0.6872, "step": 140 }, { "epoch": 0.725, "grad_norm": 0.19475950300693512, "learning_rate": 9.594288359976817e-07, "loss": 0.6843, "step": 145 }, { "epoch": 0.75, "grad_norm": 0.19206686317920685, "learning_rate": 8.052290349812419e-07, "loss": 0.6983, "step": 150 }, { "epoch": 0.775, "grad_norm": 0.18994426727294922, "learning_rate": 6.621340157319998e-07, "loss": 0.6907, "step": 155 }, { "epoch": 0.8, "grad_norm": 0.19607287645339966, "learning_rate": 5.310813910563645e-07, "loss": 0.6765, "step": 160 }, { "epoch": 0.825, "grad_norm": 0.18892605602741241, "learning_rate": 4.129298674268226e-07, "loss": 0.682, "step": 165 }, { "epoch": 0.85, "grad_norm": 0.20095601677894592, "learning_rate": 3.08453618411631e-07, "loss": 0.6918, "step": 170 }, { "epoch": 0.875, "grad_norm": 0.1872701197862625, "learning_rate": 2.1833721199614992e-07, "loss": 0.6782, "step": 175 }, { "epoch": 0.9, "grad_norm": 0.1944805085659027, "learning_rate": 1.4317112503391433e-07, "loss": 0.6997, "step": 180 }, { "epoch": 0.925, "grad_norm": 0.19366849958896637, "learning_rate": 8.344787421847216e-08, "loss": 0.6741, "step": 185 }, { "epoch": 0.95, "grad_norm": 0.18197080492973328, "learning_rate": 3.955878892731441e-08, "loss": 0.685, "step": 190 }, { "epoch": 0.975, "grad_norm": 0.18788814544677734, "learning_rate": 1.1791447083465136e-08, "loss": 0.6769, "step": 195 }, { "epoch": 1.0, "grad_norm": 0.2002282291650772, "learning_rate": 3.277908359194948e-10, "loss": 0.693, "step": 200 }, { "epoch": 1.0, "step": 200, "total_flos": 4.0836188273115136e+18, "train_loss": 0.746429123878479, "train_runtime": 2870.5475, "train_samples_per_second": 1.114, "train_steps_per_second": 0.07 } ], "logging_steps": 5, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.0836188273115136e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }