{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998003992015968, "eval_steps": 100, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003992015968063872, "grad_norm": 71.5754165649414, "learning_rate": 0.0, "loss": 2.0095, "step": 1 }, { "epoch": 0.01996007984031936, "grad_norm": 25.808263778686523, "learning_rate": 3.2000000000000003e-06, "loss": 1.8226, "step": 5 }, { "epoch": 0.03992015968063872, "grad_norm": 10.55435562133789, "learning_rate": 7.2000000000000005e-06, "loss": 1.5338, "step": 10 }, { "epoch": 0.059880239520958084, "grad_norm": 13.148735046386719, "learning_rate": 1.1200000000000001e-05, "loss": 1.4367, "step": 15 }, { "epoch": 0.07984031936127745, "grad_norm": 6.636435031890869, "learning_rate": 1.5200000000000002e-05, "loss": 1.444, "step": 20 }, { "epoch": 0.0998003992015968, "grad_norm": 4.353740215301514, "learning_rate": 1.9200000000000003e-05, "loss": 1.4407, "step": 25 }, { "epoch": 0.11976047904191617, "grad_norm": 3.963563919067383, "learning_rate": 1.9984407641819812e-05, "loss": 1.4644, "step": 30 }, { "epoch": 0.13972055888223553, "grad_norm": 4.042232036590576, "learning_rate": 1.9921147013144782e-05, "loss": 1.4582, "step": 35 }, { "epoch": 0.1596806387225549, "grad_norm": 3.6070656776428223, "learning_rate": 1.9809551553491918e-05, "loss": 1.461, "step": 40 }, { "epoch": 0.17964071856287425, "grad_norm": 3.843057632446289, "learning_rate": 1.9650164944723116e-05, "loss": 1.4496, "step": 45 }, { "epoch": 0.1996007984031936, "grad_norm": 3.784003734588623, "learning_rate": 1.944376370237481e-05, "loss": 1.4632, "step": 50 }, { "epoch": 0.21956087824351297, "grad_norm": 3.471970319747925, "learning_rate": 1.9191353392552346e-05, "loss": 1.4363, "step": 55 }, { "epoch": 0.23952095808383234, "grad_norm": 3.609161615371704, "learning_rate": 1.889416373291298e-05, "loss": 1.4209, "step": 60 }, { "epoch": 0.25948103792415167, "grad_norm": 3.706693649291992, "learning_rate": 1.855364260160507e-05, "loss": 1.3991, "step": 65 }, { "epoch": 0.27944111776447106, "grad_norm": 3.828991174697876, "learning_rate": 1.8171448983351284e-05, "loss": 1.4168, "step": 70 }, { "epoch": 0.2994011976047904, "grad_norm": 3.53777813911438, "learning_rate": 1.7749444887041797e-05, "loss": 1.4197, "step": 75 }, { "epoch": 0.3193612774451098, "grad_norm": 3.46360182762146, "learning_rate": 1.7289686274214116e-05, "loss": 1.4041, "step": 80 }, { "epoch": 0.3393213572854291, "grad_norm": 3.3420891761779785, "learning_rate": 1.6794413042615168e-05, "loss": 1.361, "step": 85 }, { "epoch": 0.3592814371257485, "grad_norm": 3.3036203384399414, "learning_rate": 1.6266038113644605e-05, "loss": 1.3671, "step": 90 }, { "epoch": 0.37924151696606784, "grad_norm": 3.4878897666931152, "learning_rate": 1.570713567684432e-05, "loss": 1.346, "step": 95 }, { "epoch": 0.3992015968063872, "grad_norm": 4.090396404266357, "learning_rate": 1.5120428648705716e-05, "loss": 1.3645, "step": 100 }, { "epoch": 0.3992015968063872, "eval_loss": 1.3725436925888062, "eval_runtime": 4.6422, "eval_samples_per_second": 194.519, "eval_steps_per_second": 6.247, "step": 100 }, { "epoch": 0.41916167664670656, "grad_norm": 3.2958004474639893, "learning_rate": 1.4508775406894308e-05, "loss": 1.3203, "step": 105 }, { "epoch": 0.43912175648702595, "grad_norm": 3.205641746520996, "learning_rate": 1.3875155864521031e-05, "loss": 1.3251, "step": 110 }, { "epoch": 0.4590818363273453, "grad_norm": 3.419351100921631, "learning_rate": 1.3222656952305113e-05, "loss": 1.3093, "step": 115 }, { "epoch": 0.47904191616766467, "grad_norm": 3.5063862800598145, "learning_rate": 1.2554457579357906e-05, "loss": 1.297, "step": 120 }, { "epoch": 0.499001996007984, "grad_norm": 3.2938807010650635, "learning_rate": 1.187381314585725e-05, "loss": 1.2889, "step": 125 }, { "epoch": 0.5189620758483033, "grad_norm": 3.2896780967712402, "learning_rate": 1.1184039683065014e-05, "loss": 1.2707, "step": 130 }, { "epoch": 0.5389221556886228, "grad_norm": 3.1759278774261475, "learning_rate": 1.0488497697956134e-05, "loss": 1.2518, "step": 135 }, { "epoch": 0.5588822355289421, "grad_norm": 3.616849422454834, "learning_rate": 9.790575801166432e-06, "loss": 1.2737, "step": 140 }, { "epoch": 0.5788423153692615, "grad_norm": 3.459834098815918, "learning_rate": 9.093674198022201e-06, "loss": 1.2496, "step": 145 }, { "epoch": 0.5988023952095808, "grad_norm": 3.072103261947632, "learning_rate": 8.401188123081653e-06, "loss": 1.2129, "step": 150 }, { "epoch": 0.6187624750499002, "grad_norm": 3.2528676986694336, "learning_rate": 7.716491298893443e-06, "loss": 1.2096, "step": 155 }, { "epoch": 0.6387225548902196, "grad_norm": 3.041900157928467, "learning_rate": 7.042919499559538e-06, "loss": 1.2171, "step": 160 }, { "epoch": 0.6586826347305389, "grad_norm": 3.830709457397461, "learning_rate": 6.383754299179079e-06, "loss": 1.2038, "step": 165 }, { "epoch": 0.6786427145708582, "grad_norm": 3.1818060874938965, "learning_rate": 5.742207084349274e-06, "loss": 1.1999, "step": 170 }, { "epoch": 0.6986027944111777, "grad_norm": 3.237358331680298, "learning_rate": 5.121403408612672e-06, "loss": 1.1821, "step": 175 }, { "epoch": 0.718562874251497, "grad_norm": 3.207139015197754, "learning_rate": 4.524367765074499e-06, "loss": 1.1617, "step": 180 }, { "epoch": 0.7385229540918163, "grad_norm": 3.0992743968963623, "learning_rate": 3.954008851376252e-06, "loss": 1.1629, "step": 185 }, { "epoch": 0.7584830339321357, "grad_norm": 3.1126255989074707, "learning_rate": 3.4131053988131947e-06, "loss": 1.1688, "step": 190 }, { "epoch": 0.7784431137724551, "grad_norm": 3.3172667026519775, "learning_rate": 2.9042926346347932e-06, "loss": 1.1507, "step": 195 }, { "epoch": 0.7984031936127745, "grad_norm": 3.125807762145996, "learning_rate": 2.4300494434824373e-06, "loss": 1.1459, "step": 200 }, { "epoch": 0.7984031936127745, "eval_loss": 1.1677805185317993, "eval_runtime": 4.6292, "eval_samples_per_second": 195.067, "eval_steps_per_second": 6.265, "step": 200 }, { "epoch": 0.8183632734530938, "grad_norm": 3.1806719303131104, "learning_rate": 1.9926862905126663e-06, "loss": 1.1508, "step": 205 }, { "epoch": 0.8383233532934131, "grad_norm": 3.2433359622955322, "learning_rate": 1.5943339650431578e-06, "loss": 1.1156, "step": 210 }, { "epoch": 0.8582834331337326, "grad_norm": 3.1037845611572266, "learning_rate": 1.2369331995613664e-06, "loss": 1.1278, "step": 215 }, { "epoch": 0.8782435129740519, "grad_norm": 3.121793270111084, "learning_rate": 9.222252146709143e-07, "loss": 1.1291, "step": 220 }, { "epoch": 0.8982035928143712, "grad_norm": 3.311478614807129, "learning_rate": 6.517432360398556e-07, "loss": 1.1606, "step": 225 }, { "epoch": 0.9181636726546906, "grad_norm": 3.1572906970977783, "learning_rate": 4.268050246793276e-07, "loss": 1.1376, "step": 230 }, { "epoch": 0.93812375249501, "grad_norm": 3.125819683074951, "learning_rate": 2.4850645694436736e-07, "loss": 1.1042, "step": 235 }, { "epoch": 0.9580838323353293, "grad_norm": 3.240495443344116, "learning_rate": 1.1771618553447217e-07, "loss": 1.1349, "step": 240 }, { "epoch": 0.9780439121756487, "grad_norm": 3.0710411071777344, "learning_rate": 3.50714075049563e-08, "loss": 1.1139, "step": 245 }, { "epoch": 0.998003992015968, "grad_norm": 3.2199409008026123, "learning_rate": 9.74759906957612e-10, "loss": 1.1324, "step": 250 }, { "epoch": 0.998003992015968, "step": 250, "total_flos": 8.741444925364634e+16, "train_loss": 1.2971151485443115, "train_runtime": 891.5874, "train_samples_per_second": 17.966, "train_steps_per_second": 0.28 } ], "logging_steps": 5, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.741444925364634e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }