{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 11.879703521728516, "learning_rate": 1.6000000000000003e-05, "loss": 2.3428, "step": 25 }, { "epoch": 0.1, "grad_norm": 5.421560287475586, "learning_rate": 1.998107236150145e-05, "loss": 0.7368, "step": 50 }, { "epoch": 0.15, "grad_norm": 2.313567876815796, "learning_rate": 1.989863301061654e-05, "loss": 0.5067, "step": 75 }, { "epoch": 0.2, "grad_norm": 2.5316965579986572, "learning_rate": 1.9751334064160708e-05, "loss": 0.4279, "step": 100 }, { "epoch": 0.25, "grad_norm": 2.5976717472076416, "learning_rate": 1.9540140680664915e-05, "loss": 0.419, "step": 125 }, { "epoch": 0.3, "grad_norm": 1.39866304397583, "learning_rate": 1.9266436679230866e-05, "loss": 0.4207, "step": 150 }, { "epoch": 0.35, "grad_norm": 1.8767157793045044, "learning_rate": 1.8932015472223692e-05, "loss": 0.4169, "step": 175 }, { "epoch": 0.4, "grad_norm": 2.209038734436035, "learning_rate": 1.8539068314154355e-05, "loss": 0.4185, "step": 200 }, { "epoch": 0.45, "grad_norm": 2.1389389038085938, "learning_rate": 1.8090169943749477e-05, "loss": 0.4197, "step": 225 }, { "epoch": 0.5, "grad_norm": 1.957894206047058, "learning_rate": 1.758826171328727e-05, "loss": 0.42, "step": 250 }, { "epoch": 0.55, "grad_norm": 1.7354559898376465, "learning_rate": 1.7036632315742464e-05, "loss": 0.4209, "step": 275 }, { "epoch": 0.6, "grad_norm": 1.4375723600387573, "learning_rate": 1.6438896236023374e-05, "loss": 0.4168, "step": 300 }, { "epoch": 0.65, "grad_norm": 1.7558571100234985, "learning_rate": 1.57989700674967e-05, "loss": 0.4155, "step": 325 }, { "epoch": 0.7, "grad_norm": 1.6743788719177246, "learning_rate": 1.512104684898319e-05, "loss": 0.4171, "step": 350 }, { "epoch": 0.75, "grad_norm": 1.1145944595336914, "learning_rate": 1.4409568590377918e-05, "loss": 0.4143, "step": 375 }, { "epoch": 0.8, "grad_norm": 2.5990407466888428, "learning_rate": 1.3669197166917723e-05, "loss": 0.4144, "step": 400 }, { "epoch": 0.85, "grad_norm": 2.7390551567077637, "learning_rate": 1.2904783772807534e-05, "loss": 0.4152, "step": 425 }, { "epoch": 0.9, "grad_norm": 1.1414133310317993, "learning_rate": 1.2121337134357121e-05, "loss": 0.4161, "step": 450 }, { "epoch": 0.95, "grad_norm": 1.2553099393844604, "learning_rate": 1.1323990690907734e-05, "loss": 0.4154, "step": 475 }, { "epoch": 1.0, "grad_norm": 1.7206685543060303, "learning_rate": 1.0517968958591705e-05, "loss": 0.4142, "step": 500 }, { "epoch": 1.05, "grad_norm": 1.1427602767944336, "learning_rate": 9.708553297322407e-06, "loss": 0.4138, "step": 525 }, { "epoch": 1.1, "grad_norm": 1.4284802675247192, "learning_rate": 8.901047305322172e-06, "loss": 0.4142, "step": 550 }, { "epoch": 1.15, "grad_norm": 1.6178677082061768, "learning_rate": 8.100742067936432e-06, "loss": 0.4138, "step": 575 }, { "epoch": 1.2, "grad_norm": 1.2968213558197021, "learning_rate": 7.312881488436928e-06, "loss": 0.4125, "step": 600 }, { "epoch": 1.25, "grad_norm": 1.2272377014160156, "learning_rate": 6.542627927979772e-06, "loss": 0.4108, "step": 625 }, { "epoch": 1.3, "grad_norm": 1.5362012386322021, "learning_rate": 5.795028379858355e-06, "loss": 0.4112, "step": 650 }, { "epoch": 1.35, "grad_norm": 1.4780094623565674, "learning_rate": 5.074981399690219e-06, "loss": 0.413, "step": 675 }, { "epoch": 1.4, "grad_norm": 2.2684173583984375, "learning_rate": 4.3872050082238535e-06, "loss": 0.4136, "step": 700 }, { "epoch": 1.45, "grad_norm": 1.7533961534500122, "learning_rate": 3.736205777078381e-06, "loss": 0.4115, "step": 725 }, { "epoch": 1.5, "grad_norm": 2.0084352493286133, "learning_rate": 3.126249299978086e-06, "loss": 0.4125, "step": 750 }, { "epoch": 1.55, "grad_norm": 1.6931928396224976, "learning_rate": 2.5613322429654573e-06, "loss": 0.4122, "step": 775 }, { "epoch": 1.6, "grad_norm": 1.7414414882659912, "learning_rate": 2.0451561567303378e-06, "loss": 0.412, "step": 800 }, { "epoch": 1.65, "grad_norm": 1.7203795909881592, "learning_rate": 1.5811032226467304e-06, "loss": 0.4123, "step": 825 }, { "epoch": 1.7, "grad_norm": 2.252686023712158, "learning_rate": 1.1722140914384162e-06, "loss": 0.4106, "step": 850 }, { "epoch": 1.75, "grad_norm": 1.8625959157943726, "learning_rate": 8.211679596828481e-07, "loss": 0.4118, "step": 875 }, { "epoch": 1.8, "grad_norm": 1.6265811920166016, "learning_rate": 5.30265014699628e-07, "loss": 0.4125, "step": 900 }, { "epoch": 1.85, "grad_norm": 1.9654788970947266, "learning_rate": 3.0141136285129825e-07, "loss": 0.4115, "step": 925 }, { "epoch": 1.9, "grad_norm": 1.8853169679641724, "learning_rate": 1.361065400119399e-07, "loss": 0.4122, "step": 950 }, { "epoch": 1.95, "grad_norm": 1.8307050466537476, "learning_rate": 3.543368603973529e-08, "loss": 0.4112, "step": 975 }, { "epoch": 2.0, "grad_norm": 1.647140622138977, "learning_rate": 5.244763404133046e-11, "loss": 0.4103, "step": 1000 }, { "epoch": 2.0, "step": 1000, "total_flos": 7576988418048000.0, "train_loss": 0.4733153915405273, "train_runtime": 560.9841, "train_samples_per_second": 28.521, "train_steps_per_second": 1.783 } ], "logging_steps": 25, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7576988418048000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }