{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07590132827324478, "grad_norm": 22.0, "learning_rate": 9e-08, "loss": 1.0496648788452148, "step": 10 }, { "epoch": 0.15180265654648956, "grad_norm": 21.5, "learning_rate": 1.8999999999999998e-07, "loss": 1.0672600746154786, "step": 20 }, { "epoch": 0.22770398481973433, "grad_norm": 23.375, "learning_rate": 2.9e-07, "loss": 1.074104118347168, "step": 30 }, { "epoch": 0.3036053130929791, "grad_norm": 19.0, "learning_rate": 3.8999999999999997e-07, "loss": 1.0784456253051757, "step": 40 }, { "epoch": 0.3795066413662239, "grad_norm": 21.25, "learning_rate": 4.9e-07, "loss": 1.0631229400634765, "step": 50 }, { "epoch": 0.45540796963946867, "grad_norm": 18.625, "learning_rate": 5.9e-07, "loss": 1.0021364212036132, "step": 60 }, { "epoch": 0.5313092979127134, "grad_norm": 19.625, "learning_rate": 6.9e-07, "loss": 1.0368730545043945, "step": 70 }, { "epoch": 0.6072106261859582, "grad_norm": 13.1875, "learning_rate": 7.9e-07, "loss": 0.9585152626037597, "step": 80 }, { "epoch": 0.683111954459203, "grad_norm": 10.4375, "learning_rate": 8.9e-07, "loss": 0.8780872344970703, "step": 90 }, { "epoch": 0.7590132827324478, "grad_norm": 6.84375, "learning_rate": 9.9e-07, "loss": 0.8695860862731933, "step": 100 }, { "epoch": 0.8349146110056926, "grad_norm": 3.3125, "learning_rate": 9.977206495402552e-07, "loss": 0.8098324775695801, "step": 110 }, { "epoch": 0.9108159392789373, "grad_norm": 2.96875, "learning_rate": 9.898680903107666e-07, "loss": 0.8349854469299316, "step": 120 }, { "epoch": 0.9867172675521821, "grad_norm": 2.9375, "learning_rate": 9.76502534086636e-07, "loss": 0.8502012252807617, "step": 130 }, { "epoch": 1.060721062618596, "grad_norm": 2.78125, "learning_rate": 9.577743974243872e-07, "loss": 0.7735446929931641, "step": 140 }, { "epoch": 1.1366223908918407, "grad_norm": 3.359375, "learning_rate": 9.338944475962236e-07, "loss": 0.8373490333557129, "step": 150 }, { "epoch": 1.2125237191650853, "grad_norm": 2.59375, "learning_rate": 9.051314306058933e-07, "loss": 0.8185565948486329, "step": 160 }, { "epoch": 1.2884250474383303, "grad_norm": 2.9375, "learning_rate": 8.718090467093653e-07, "loss": 0.8056498527526855, "step": 170 }, { "epoch": 1.364326375711575, "grad_norm": 2.59375, "learning_rate": 8.343023074779368e-07, "loss": 0.8409146308898926, "step": 180 }, { "epoch": 1.4402277039848197, "grad_norm": 3.390625, "learning_rate": 7.930333154015465e-07, "loss": 0.7972420215606689, "step": 190 }, { "epoch": 1.5161290322580645, "grad_norm": 2.546875, "learning_rate": 7.484665135288213e-07, "loss": 0.8233073234558106, "step": 200 }, { "epoch": 1.5920303605313093, "grad_norm": 2.640625, "learning_rate": 7.011034586046176e-07, "loss": 0.8327888488769531, "step": 210 }, { "epoch": 1.6679316888045541, "grad_norm": 2.65625, "learning_rate": 6.514771765283942e-07, "loss": 0.7899296283721924, "step": 220 }, { "epoch": 1.7438330170777987, "grad_norm": 2.984375, "learning_rate": 6.001461636573396e-07, "loss": 0.7956095695495605, "step": 230 }, { "epoch": 1.8197343453510437, "grad_norm": 3.625, "learning_rate": 5.47688101463849e-07, "loss": 0.8178939819335938, "step": 240 }, { "epoch": 1.8956356736242883, "grad_norm": 2.828125, "learning_rate": 4.946933552828719e-07, "loss": 0.8414199829101563, "step": 250 }, { "epoch": 1.9715370018975333, "grad_norm": 2.375, "learning_rate": 4.417583303145147e-07, "loss": 0.797624921798706, "step": 260 }, { "epoch": 2.0455407969639468, "grad_norm": 2.890625, "learning_rate": 3.894787596537351e-07, "loss": 0.8394969940185547, "step": 270 }, { "epoch": 2.121442125237192, "grad_norm": 2.375, "learning_rate": 3.384429998839375e-07, "loss": 0.799235200881958, "step": 280 }, { "epoch": 2.1973434535104364, "grad_norm": 2.46875, "learning_rate": 2.8922540968615283e-07, "loss": 0.8156853675842285, "step": 290 }, { "epoch": 2.2732447817836814, "grad_norm": 2.421875, "learning_rate": 2.423798859812275e-07, "loss": 0.8007305145263672, "step": 300 }, { "epoch": 2.349146110056926, "grad_norm": 2.5, "learning_rate": 1.9843363034955795e-07, "loss": 0.8174427032470704, "step": 310 }, { "epoch": 2.4250474383301706, "grad_norm": 2.203125, "learning_rate": 1.5788121588135972e-07, "loss": 0.7784779071807861, "step": 320 }, { "epoch": 2.5009487666034156, "grad_norm": 2.703125, "learning_rate": 1.211790212293986e-07, "loss": 0.8244176864624023, "step": 330 }, { "epoch": 2.5768500948766606, "grad_norm": 2.828125, "learning_rate": 8.874009450359426e-08, "loss": 0.831035041809082, "step": 340 }, { "epoch": 2.652751423149905, "grad_norm": 2.34375, "learning_rate": 6.092950480945897e-08, "loss": 0.7749196052551269, "step": 350 }, { "epoch": 2.72865275142315, "grad_norm": 2.328125, "learning_rate": 3.806023374435663e-08, "loss": 0.8048405647277832, "step": 360 }, { "epoch": 2.804554079696395, "grad_norm": 2.625, "learning_rate": 2.0389653088865033e-08, "loss": 0.8310153961181641, "step": 370 }, { "epoch": 2.8804554079696394, "grad_norm": 2.78125, "learning_rate": 8.11662833345822e-09, "loss": 0.8102119445800782, "step": 380 }, { "epoch": 2.956356736242884, "grad_norm": 2.453125, "learning_rate": 1.3792806375464427e-09, "loss": 0.7998002052307129, "step": 390 } ], "logging_steps": 10, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.087240815119319e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }