{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 4.101644677095025, "learning_rate": 6.666666666666667e-06, "loss": 0.4313, "loss_nan_ranks": 0, "loss_rank_avg": 0.13728275895118713, "step": 5, "valid_targets_mean": 14623.7, "valid_targets_min": 8472 }, { "epoch": 0.30303030303030304, "grad_norm": 1.4454499538073753, "learning_rate": 1.5000000000000002e-05, "loss": 0.3895, "loss_nan_ranks": 0, "loss_rank_avg": 0.12719786167144775, "step": 10, "valid_targets_mean": 13794.9, "valid_targets_min": 8012 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5038065535865701, "learning_rate": 2.3333333333333336e-05, "loss": 0.3418, "loss_nan_ranks": 0, "loss_rank_avg": 0.10844413936138153, "step": 15, "valid_targets_mean": 13961.8, "valid_targets_min": 7669 }, { "epoch": 0.6060606060606061, "grad_norm": 0.41739277046604123, "learning_rate": 3.1666666666666666e-05, "loss": 0.3153, "loss_nan_ranks": 0, "loss_rank_avg": 0.10475271940231323, "step": 20, "valid_targets_mean": 13416.8, "valid_targets_min": 8687 }, { "epoch": 0.7575757575757576, "grad_norm": 0.33255764844488456, "learning_rate": 4e-05, "loss": 0.2946, "loss_nan_ranks": 0, "loss_rank_avg": 0.0998668372631073, "step": 25, "valid_targets_mean": 14083.4, "valid_targets_min": 6271 }, { "epoch": 0.9090909090909091, "grad_norm": 0.24004137696165997, "learning_rate": 3.994244399375679e-05, "loss": 0.2711, "loss_nan_ranks": 0, "loss_rank_avg": 0.09284907579421997, "step": 30, "valid_targets_mean": 14132.5, "valid_targets_min": 5776 }, { "epoch": 1.0606060606060606, "grad_norm": 0.1967596430894074, "learning_rate": 3.977010724441261e-05, "loss": 0.2525, "loss_nan_ranks": 0, "loss_rank_avg": 0.08178113400936127, "step": 35, "valid_targets_mean": 13705.5, "valid_targets_min": 6252 }, { "epoch": 1.2121212121212122, "grad_norm": 0.1734645093560045, "learning_rate": 3.9483981653469586e-05, "loss": 0.2416, "loss_nan_ranks": 0, "loss_rank_avg": 0.07989491522312164, "step": 40, "valid_targets_mean": 13126.1, "valid_targets_min": 3159 }, { "epoch": 1.3636363636363638, "grad_norm": 0.14595289141638512, "learning_rate": 3.908571404555758e-05, "loss": 0.2317, "loss_nan_ranks": 0, "loss_rank_avg": 0.07932181656360626, "step": 45, "valid_targets_mean": 14597.3, "valid_targets_min": 6419 }, { "epoch": 1.5151515151515151, "grad_norm": 0.12986441353280917, "learning_rate": 3.8577596689969346e-05, "loss": 0.2282, "loss_nan_ranks": 0, "loss_rank_avg": 0.07134190946817398, "step": 50, "valid_targets_mean": 14610.6, "valid_targets_min": 9878 }, { "epoch": 1.6666666666666665, "grad_norm": 0.12387431430113442, "learning_rate": 3.7962554107273926e-05, "loss": 0.2185, "loss_nan_ranks": 0, "loss_rank_avg": 0.07062166929244995, "step": 55, "valid_targets_mean": 14205.5, "valid_targets_min": 8994 }, { "epoch": 1.8181818181818183, "grad_norm": 0.12267549383494643, "learning_rate": 3.724412623694427e-05, "loss": 0.2129, "loss_nan_ranks": 0, "loss_rank_avg": 0.0706438273191452, "step": 60, "valid_targets_mean": 14081.0, "valid_targets_min": 7128 }, { "epoch": 1.9696969696969697, "grad_norm": 0.13132589101740186, "learning_rate": 3.642644806287938e-05, "loss": 0.2075, "loss_nan_ranks": 0, "loss_rank_avg": 0.06862840801477432, "step": 65, "valid_targets_mean": 13509.2, "valid_targets_min": 6997 }, { "epoch": 2.121212121212121, "grad_norm": 0.12677199044727938, "learning_rate": 3.55142258140884e-05, "loss": 0.206, "loss_nan_ranks": 0, "loss_rank_avg": 0.06504399329423904, "step": 70, "valid_targets_mean": 13193.5, "valid_targets_min": 3186 }, { "epoch": 2.2727272727272725, "grad_norm": 0.12762593759862736, "learning_rate": 3.451270987751598e-05, "loss": 0.206, "loss_nan_ranks": 0, "loss_rank_avg": 0.07134096324443817, "step": 75, "valid_targets_mean": 14511.4, "valid_targets_min": 5311 }, { "epoch": 2.4242424242424243, "grad_norm": 0.13192592609992332, "learning_rate": 3.342766457891194e-05, "loss": 0.1969, "loss_nan_ranks": 0, "loss_rank_avg": 0.07061142474412918, "step": 80, "valid_targets_mean": 14961.1, "valid_targets_min": 9318 }, { "epoch": 2.5757575757575757, "grad_norm": 0.13179397065972617, "learning_rate": 3.226533500567433e-05, "loss": 0.1944, "loss_nan_ranks": 0, "loss_rank_avg": 0.06371868401765823, "step": 85, "valid_targets_mean": 14331.9, "valid_targets_min": 8629 }, { "epoch": 2.7272727272727275, "grad_norm": 0.1328210848350246, "learning_rate": 3.1032411062620544e-05, "loss": 0.1965, "loss_nan_ranks": 0, "loss_rank_avg": 0.06617560237646103, "step": 90, "valid_targets_mean": 14391.3, "valid_targets_min": 9017 }, { "epoch": 2.878787878787879, "grad_norm": 0.12863302635004717, "learning_rate": 2.973598896756697e-05, "loss": 0.192, "loss_nan_ranks": 0, "loss_rank_avg": 0.054877690970897675, "step": 95, "valid_targets_mean": 14151.3, "valid_targets_min": 6828 }, { "epoch": 3.0303030303030303, "grad_norm": 0.12536418590701268, "learning_rate": 2.8383530408333285e-05, "loss": 0.1911, "loss_nan_ranks": 0, "loss_rank_avg": 0.06761929392814636, "step": 100, "valid_targets_mean": 13563.6, "valid_targets_min": 5230 }, { "epoch": 3.1818181818181817, "grad_norm": 0.1333475753151719, "learning_rate": 2.6982819596247373e-05, "loss": 0.1887, "loss_nan_ranks": 0, "loss_rank_avg": 0.06502775102853775, "step": 105, "valid_targets_mean": 13841.5, "valid_targets_min": 7737 }, { "epoch": 3.3333333333333335, "grad_norm": 0.132217993667279, "learning_rate": 2.554191846333378e-05, "loss": 0.1847, "loss_nan_ranks": 0, "loss_rank_avg": 0.06282747536897659, "step": 110, "valid_targets_mean": 13982.2, "valid_targets_min": 6906 }, { "epoch": 3.484848484848485, "grad_norm": 0.1436057374223123, "learning_rate": 2.4069120261052682e-05, "loss": 0.1857, "loss_nan_ranks": 0, "loss_rank_avg": 0.06083241477608681, "step": 115, "valid_targets_mean": 14595.4, "valid_targets_min": 7001 }, { "epoch": 3.6363636363636362, "grad_norm": 0.12244006139108565, "learning_rate": 2.2572901827656626e-05, "loss": 0.1867, "loss_nan_ranks": 0, "loss_rank_avg": 0.06318728625774384, "step": 120, "valid_targets_mean": 14134.8, "valid_targets_min": 4539 }, { "epoch": 3.787878787878788, "grad_norm": 0.11988143398274238, "learning_rate": 2.1061874798894992e-05, "loss": 0.1845, "loss_nan_ranks": 0, "loss_rank_avg": 0.056565746665000916, "step": 125, "valid_targets_mean": 13645.1, "valid_targets_min": 6953 }, { "epoch": 3.9393939393939394, "grad_norm": 0.11973237571725541, "learning_rate": 1.9544736042877886e-05, "loss": 0.1823, "loss_nan_ranks": 0, "loss_rank_avg": 0.06245823949575424, "step": 130, "valid_targets_mean": 14213.2, "valid_targets_min": 7309 }, { "epoch": 4.090909090909091, "grad_norm": 0.12095919425002426, "learning_rate": 1.8030217604376628e-05, "loss": 0.1826, "loss_nan_ranks": 0, "loss_rank_avg": 0.06537604331970215, "step": 135, "valid_targets_mean": 14312.2, "valid_targets_min": 4618 }, { "epoch": 4.242424242424242, "grad_norm": 0.13130755282071463, "learning_rate": 1.6527036446661396e-05, "loss": 0.1834, "loss_nan_ranks": 0, "loss_rank_avg": 0.06200536713004112, "step": 140, "valid_targets_mean": 13995.9, "valid_targets_min": 3159 }, { "epoch": 4.393939393939394, "grad_norm": 0.12849185934902171, "learning_rate": 1.5043844280142005e-05, "loss": 0.1806, "loss_nan_ranks": 0, "loss_rank_avg": 0.06337574869394302, "step": 145, "valid_targets_mean": 14415.2, "valid_targets_min": 7395 }, { "epoch": 4.545454545454545, "grad_norm": 0.11717509615507843, "learning_rate": 1.358917776657806e-05, "loss": 0.1778, "loss_nan_ranks": 0, "loss_rank_avg": 0.05878318101167679, "step": 150, "valid_targets_mean": 13608.7, "valid_targets_min": 4237 }, { "epoch": 4.696969696969697, "grad_norm": 0.11445364177176875, "learning_rate": 1.2171409385463218e-05, "loss": 0.1763, "loss_nan_ranks": 0, "loss_rank_avg": 0.05898250639438629, "step": 155, "valid_targets_mean": 13554.9, "valid_targets_min": 6402 }, { "epoch": 4.848484848484849, "grad_norm": 0.12374376070955681, "learning_rate": 1.0798699245376959e-05, "loss": 0.1777, "loss_nan_ranks": 0, "loss_rank_avg": 0.0593249574303627, "step": 160, "valid_targets_mean": 14594.9, "valid_targets_min": 7248 }, { "epoch": 5.0, "grad_norm": 0.12668564763351842, "learning_rate": 9.478948117658577e-06, "loss": 0.1781, "loss_nan_ranks": 0, "loss_rank_avg": 0.058884840458631516, "step": 165, "valid_targets_mean": 13341.7, "valid_targets_min": 4728 }, { "epoch": 5.151515151515151, "grad_norm": 0.1277080409655682, "learning_rate": 8.219751962722726e-06, "loss": 0.1799, "loss_nan_ranks": 0, "loss_rank_avg": 0.059690214693546295, "step": 170, "valid_targets_mean": 14140.9, "valid_targets_min": 5241 }, { "epoch": 5.303030303030303, "grad_norm": 0.12129503849947158, "learning_rate": 7.028358210744881e-06, "loss": 0.1719, "loss_nan_ranks": 0, "loss_rank_avg": 0.05800726264715195, "step": 175, "valid_targets_mean": 13437.8, "valid_targets_min": 3456 }, { "epoch": 5.454545454545454, "grad_norm": 0.12508479858740726, "learning_rate": 5.911624048347757e-06, "loss": 0.1778, "loss_nan_ranks": 0, "loss_rank_avg": 0.0543239489197731, "step": 180, "valid_targets_mean": 13450.3, "valid_targets_min": 8527 }, { "epoch": 5.606060606060606, "grad_norm": 0.11850280030919257, "learning_rate": 4.875976951373633e-06, "loss": 0.1766, "loss_nan_ranks": 0, "loss_rank_avg": 0.05465681105852127, "step": 185, "valid_targets_mean": 14527.5, "valid_targets_min": 6291 }, { "epoch": 5.757575757575758, "grad_norm": 0.1259060690244162, "learning_rate": 3.927377690900436e-06, "loss": 0.1751, "loss_nan_ranks": 0, "loss_rank_avg": 0.06268830597400665, "step": 190, "valid_targets_mean": 15079.5, "valid_targets_min": 5311 }, { "epoch": 5.909090909090909, "grad_norm": 0.14573628407483133, "learning_rate": 3.071286025423983e-06, "loss": 0.1757, "loss_nan_ranks": 0, "loss_rank_avg": 0.056300319731235504, "step": 195, "valid_targets_mean": 13432.3, "valid_targets_min": 4391 }, { "epoch": 6.0606060606060606, "grad_norm": 0.1202999097496865, "learning_rate": 2.312629276668554e-06, "loss": 0.1741, "loss_nan_ranks": 0, "loss_rank_avg": 0.05936381220817566, "step": 200, "valid_targets_mean": 14715.9, "valid_targets_min": 8912 }, { "epoch": 6.212121212121212, "grad_norm": 0.1180038599779071, "learning_rate": 1.6557739698909436e-06, "loss": 0.176, "loss_nan_ranks": 0, "loss_rank_avg": 0.058680810034275055, "step": 205, "valid_targets_mean": 13863.8, "valid_targets_min": 5335 }, { "epoch": 6.363636363636363, "grad_norm": 0.11530000691449392, "learning_rate": 1.1045007019049182e-06, "loss": 0.1749, "loss_nan_ranks": 0, "loss_rank_avg": 0.05926910787820816, "step": 210, "valid_targets_mean": 13876.2, "valid_targets_min": 4510 }, { "epoch": 6.515151515151516, "grad_norm": 0.10947627448010318, "learning_rate": 6.619823814758786e-07, "loss": 0.172, "loss_nan_ranks": 0, "loss_rank_avg": 0.04738888889551163, "step": 215, "valid_targets_mean": 13260.7, "valid_targets_min": 2975 }, { "epoch": 6.666666666666667, "grad_norm": 0.11639605353220038, "learning_rate": 3.307659673251595e-07, "loss": 0.1755, "loss_nan_ranks": 0, "loss_rank_avg": 0.05408981069922447, "step": 220, "valid_targets_mean": 13885.9, "valid_targets_min": 4618 }, { "epoch": 6.818181818181818, "grad_norm": 0.11887366873653407, "learning_rate": 1.1275780885282806e-07, "loss": 0.1759, "loss_nan_ranks": 0, "loss_rank_avg": 0.060421451926231384, "step": 225, "valid_targets_mean": 13998.0, "valid_targets_min": 7669 }, { "epoch": 6.96969696969697, "grad_norm": 0.11790553258946189, "learning_rate": 9.212673951897177e-09, "loss": 0.1756, "loss_nan_ranks": 0, "loss_rank_avg": 0.061030447483062744, "step": 230, "valid_targets_mean": 14301.2, "valid_targets_min": 7992 }, { "epoch": 7.0, "step": 231, "total_flos": 1.7948424939556045e+18, "train_loss": 0.0, "train_runtime": 4.778, "train_samples_per_second": 4629.543, "train_steps_per_second": 48.346 } ], "logging_steps": 5, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7948424939556045e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }