{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 269, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037174721189591076, "grad_norm": 14.262325286865234, "learning_rate": 0.0, "loss": 3.044, "step": 1 }, { "epoch": 0.01858736059479554, "grad_norm": 13.0241060256958, "learning_rate": 2.962962962962963e-06, "loss": 3.0801, "step": 5 }, { "epoch": 0.03717472118959108, "grad_norm": 6.920816898345947, "learning_rate": 6.666666666666667e-06, "loss": 2.8515, "step": 10 }, { "epoch": 0.055762081784386616, "grad_norm": 4.249760627746582, "learning_rate": 1.037037037037037e-05, "loss": 2.469, "step": 15 }, { "epoch": 0.07434944237918216, "grad_norm": 5.996034145355225, "learning_rate": 1.4074074074074075e-05, "loss": 2.2633, "step": 20 }, { "epoch": 0.09293680297397769, "grad_norm": 8.92640495300293, "learning_rate": 1.7777777777777777e-05, "loss": 2.152, "step": 25 }, { "epoch": 0.11152416356877323, "grad_norm": 8.087935447692871, "learning_rate": 1.9996629653035128e-05, "loss": 2.0381, "step": 30 }, { "epoch": 0.13011152416356878, "grad_norm": 6.582911968231201, "learning_rate": 1.995873933559535e-05, "loss": 1.9322, "step": 35 }, { "epoch": 0.14869888475836432, "grad_norm": 5.658934593200684, "learning_rate": 1.9878905881817254e-05, "loss": 1.8054, "step": 40 }, { "epoch": 0.16728624535315986, "grad_norm": 3.6228199005126953, "learning_rate": 1.975746552556772e-05, "loss": 1.7393, "step": 45 }, { "epoch": 0.18587360594795538, "grad_norm": 2.397761106491089, "learning_rate": 1.9594929736144978e-05, "loss": 1.6738, "step": 50 }, { "epoch": 0.20446096654275092, "grad_norm": 2.159792423248291, "learning_rate": 1.939198306412775e-05, "loss": 1.6584, "step": 55 }, { "epoch": 0.22304832713754646, "grad_norm": 2.4025373458862305, "learning_rate": 1.9149480258259535e-05, "loss": 1.6347, "step": 60 }, { "epoch": 0.241635687732342, "grad_norm": 2.0866482257843018, "learning_rate": 1.886844266551068e-05, "loss": 1.598, "step": 65 }, { "epoch": 0.26022304832713755, "grad_norm": 1.7645834684371948, "learning_rate": 1.8550053929480202e-05, "loss": 1.5923, "step": 70 }, { "epoch": 0.2788104089219331, "grad_norm": 1.642554759979248, "learning_rate": 1.8195655005254274e-05, "loss": 1.5452, "step": 75 }, { "epoch": 0.29739776951672864, "grad_norm": 1.5671523809432983, "learning_rate": 1.780673851171728e-05, "loss": 1.5275, "step": 80 }, { "epoch": 0.3159851301115242, "grad_norm": 1.6036931276321411, "learning_rate": 1.7384942445101772e-05, "loss": 1.5358, "step": 85 }, { "epoch": 0.3345724907063197, "grad_norm": 1.540089726448059, "learning_rate": 1.6932043280253892e-05, "loss": 1.5234, "step": 90 }, { "epoch": 0.35315985130111527, "grad_norm": 1.1673122644424438, "learning_rate": 1.644994848866964e-05, "loss": 1.5211, "step": 95 }, { "epoch": 0.37174721189591076, "grad_norm": 1.3127870559692383, "learning_rate": 1.5940688504813664e-05, "loss": 1.5038, "step": 100 }, { "epoch": 0.37174721189591076, "eval_loss": 1.5038524866104126, "eval_runtime": 5.4348, "eval_samples_per_second": 171.303, "eval_steps_per_second": 5.52, "step": 100 }, { "epoch": 0.3903345724907063, "grad_norm": 1.1808836460113525, "learning_rate": 1.5406408174555978e-05, "loss": 1.4911, "step": 105 }, { "epoch": 0.40892193308550184, "grad_norm": 8.379034042358398, "learning_rate": 1.4849357721743169e-05, "loss": 1.4598, "step": 110 }, { "epoch": 0.4275092936802974, "grad_norm": 1.090346097946167, "learning_rate": 1.4271883270950073e-05, "loss": 1.4944, "step": 115 }, { "epoch": 0.44609665427509293, "grad_norm": 1.1020578145980835, "learning_rate": 1.3676416966327201e-05, "loss": 1.4808, "step": 120 }, { "epoch": 0.4646840148698885, "grad_norm": 1.1537312269210815, "learning_rate": 1.3065466728160253e-05, "loss": 1.4493, "step": 125 }, { "epoch": 0.483271375464684, "grad_norm": 0.9695990085601807, "learning_rate": 1.2441605690283915e-05, "loss": 1.456, "step": 130 }, { "epoch": 0.5018587360594795, "grad_norm": 1.1512304544448853, "learning_rate": 1.1807461362836382e-05, "loss": 1.4493, "step": 135 }, { "epoch": 0.5204460966542751, "grad_norm": 1.1557819843292236, "learning_rate": 1.1165704565997593e-05, "loss": 1.4528, "step": 140 }, { "epoch": 0.5390334572490706, "grad_norm": 0.9956693053245544, "learning_rate": 1.0519038181319e-05, "loss": 1.4129, "step": 145 }, { "epoch": 0.5576208178438662, "grad_norm": 0.9141682982444763, "learning_rate": 9.870185768020694e-06, "loss": 1.4179, "step": 150 }, { "epoch": 0.5762081784386617, "grad_norm": 1.0787816047668457, "learning_rate": 9.221880092200601e-06, "loss": 1.4139, "step": 155 }, { "epoch": 0.5947955390334573, "grad_norm": 1.0250240564346313, "learning_rate": 8.576851617267151e-06, "loss": 1.3976, "step": 160 }, { "epoch": 0.6133828996282528, "grad_norm": 0.9707216024398804, "learning_rate": 7.93781700407012e-06, "loss": 1.4043, "step": 165 }, { "epoch": 0.6319702602230484, "grad_norm": 1.020119309425354, "learning_rate": 7.307467669163655e-06, "loss": 1.4153, "step": 170 }, { "epoch": 0.6505576208178439, "grad_norm": 1.0523854494094849, "learning_rate": 6.688458449390438e-06, "loss": 1.3896, "step": 175 }, { "epoch": 0.6691449814126395, "grad_norm": 1.0011796951293945, "learning_rate": 6.083396420528298e-06, "loss": 1.3938, "step": 180 }, { "epoch": 0.6877323420074349, "grad_norm": 0.9738273024559021, "learning_rate": 5.494829917091733e-06, "loss": 1.371, "step": 185 }, { "epoch": 0.7063197026022305, "grad_norm": 0.9486119151115417, "learning_rate": 4.925237799533445e-06, "loss": 1.3906, "step": 190 }, { "epoch": 0.724907063197026, "grad_norm": 0.9746046662330627, "learning_rate": 4.377019014049223e-06, "loss": 1.3798, "step": 195 }, { "epoch": 0.7434944237918215, "grad_norm": 0.9329619407653809, "learning_rate": 3.852482488956992e-06, "loss": 1.3595, "step": 200 }, { "epoch": 0.7434944237918215, "eval_loss": 1.3681989908218384, "eval_runtime": 5.3831, "eval_samples_per_second": 172.95, "eval_steps_per_second": 5.573, "step": 200 }, { "epoch": 0.7620817843866171, "grad_norm": 1.0122321844100952, "learning_rate": 3.3538374102033865e-06, "loss": 1.3626, "step": 205 }, { "epoch": 0.7806691449814126, "grad_norm": 0.9424676895141602, "learning_rate": 2.8831839169543998e-06, "loss": 1.3451, "step": 210 }, { "epoch": 0.7992565055762082, "grad_norm": 0.9907885193824768, "learning_rate": 2.4425042564574186e-06, "loss": 1.371, "step": 215 }, { "epoch": 0.8178438661710037, "grad_norm": 0.9360305666923523, "learning_rate": 2.03365443542764e-06, "loss": 1.348, "step": 220 }, { "epoch": 0.8364312267657993, "grad_norm": 1.073083519935608, "learning_rate": 1.6583564031206357e-06, "loss": 1.3557, "step": 225 }, { "epoch": 0.8550185873605948, "grad_norm": 0.9857882857322693, "learning_rate": 1.3181907990135624e-06, "loss": 1.3387, "step": 230 }, { "epoch": 0.8736059479553904, "grad_norm": 0.9631243348121643, "learning_rate": 1.0145902956395449e-06, "loss": 1.3554, "step": 235 }, { "epoch": 0.8921933085501859, "grad_norm": 1.0025112628936768, "learning_rate": 7.488335646131628e-07, "loss": 1.3396, "step": 240 }, { "epoch": 0.9107806691449815, "grad_norm": 0.9768189191818237, "learning_rate": 5.22039891260262e-07, "loss": 1.3361, "step": 245 }, { "epoch": 0.929368029739777, "grad_norm": 0.9644155502319336, "learning_rate": 3.3516446053363015e-07, "loss": 1.3138, "step": 250 }, { "epoch": 0.9479553903345725, "grad_norm": 0.9707642197608948, "learning_rate": 1.889943340687961e-07, "loss": 1.3315, "step": 255 }, { "epoch": 0.966542750929368, "grad_norm": 1.0248664617538452, "learning_rate": 8.41451353233369e-08, "loss": 1.3724, "step": 260 }, { "epoch": 0.9851301115241635, "grad_norm": 0.9727205634117126, "learning_rate": 2.10584567608918e-08, "loss": 1.3296, "step": 265 }, { "epoch": 1.0, "step": 269, "total_flos": 1.000686074462208e+17, "train_loss": 1.5741833559199336, "train_runtime": 402.4792, "train_samples_per_second": 42.723, "train_steps_per_second": 0.668 } ], "logging_steps": 5, "max_steps": 269, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.000686074462208e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }