{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 6.057079066807802, "learning_rate": 6.666666666666667e-06, "loss": 0.3733, "loss_nan_ranks": 0, "loss_rank_avg": 0.12278221547603607, "step": 5, "valid_targets_mean": 3397.3, "valid_targets_min": 1727 }, { "epoch": 0.30303030303030304, "grad_norm": 1.9357682815309185, "learning_rate": 1.5000000000000002e-05, "loss": 0.312, "loss_nan_ranks": 0, "loss_rank_avg": 0.1016702950000763, "step": 10, "valid_targets_mean": 3230.0, "valid_targets_min": 1680 }, { "epoch": 0.45454545454545453, "grad_norm": 0.8312941460348809, "learning_rate": 2.3333333333333336e-05, "loss": 0.2681, "loss_nan_ranks": 0, "loss_rank_avg": 0.09491170942783356, "step": 15, "valid_targets_mean": 2857.5, "valid_targets_min": 1839 }, { "epoch": 0.6060606060606061, "grad_norm": 0.4158059347121491, "learning_rate": 3.1666666666666666e-05, "loss": 0.2136, "loss_nan_ranks": 0, "loss_rank_avg": 0.05754208192229271, "step": 20, "valid_targets_mean": 3214.3, "valid_targets_min": 1884 }, { "epoch": 0.7575757575757576, "grad_norm": 0.31278109144655264, "learning_rate": 4e-05, "loss": 0.1873, "loss_nan_ranks": 0, "loss_rank_avg": 0.06500861793756485, "step": 25, "valid_targets_mean": 4166.3, "valid_targets_min": 1993 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2571670345543004, "learning_rate": 3.994244399375679e-05, "loss": 0.1678, "loss_nan_ranks": 0, "loss_rank_avg": 0.05598926544189453, "step": 30, "valid_targets_mean": 3023.3, "valid_targets_min": 1964 }, { "epoch": 1.0606060606060606, "grad_norm": 0.18211562027887052, "learning_rate": 3.977010724441261e-05, "loss": 0.154, "loss_nan_ranks": 0, "loss_rank_avg": 0.04845510050654411, "step": 35, "valid_targets_mean": 4795.8, "valid_targets_min": 1751 }, { "epoch": 1.2121212121212122, "grad_norm": 0.1847599593737035, "learning_rate": 3.9483981653469586e-05, "loss": 0.1478, "loss_nan_ranks": 0, "loss_rank_avg": 0.044909439980983734, "step": 40, "valid_targets_mean": 3843.7, "valid_targets_min": 1952 }, { "epoch": 1.3636363636363638, "grad_norm": 0.2095868038344813, "learning_rate": 3.908571404555758e-05, "loss": 0.1413, "loss_nan_ranks": 0, "loss_rank_avg": 0.05491482838988304, "step": 45, "valid_targets_mean": 4408.9, "valid_targets_min": 1981 }, { "epoch": 1.5151515151515151, "grad_norm": 0.1638929758946134, "learning_rate": 3.8577596689969346e-05, "loss": 0.1279, "loss_nan_ranks": 0, "loss_rank_avg": 0.04170622676610947, "step": 50, "valid_targets_mean": 3393.7, "valid_targets_min": 1884 }, { "epoch": 1.6666666666666665, "grad_norm": 0.17488311510988763, "learning_rate": 3.7962554107273926e-05, "loss": 0.1353, "loss_nan_ranks": 0, "loss_rank_avg": 0.04213090240955353, "step": 55, "valid_targets_mean": 4166.2, "valid_targets_min": 1919 }, { "epoch": 1.8181818181818183, "grad_norm": 0.16614643886860392, "learning_rate": 3.724412623694427e-05, "loss": 0.1222, "loss_nan_ranks": 0, "loss_rank_avg": 0.04267009347677231, "step": 60, "valid_targets_mean": 3267.3, "valid_targets_min": 1836 }, { "epoch": 1.9696969696969697, "grad_norm": 0.18257612037274068, "learning_rate": 3.642644806287938e-05, "loss": 0.1232, "loss_nan_ranks": 0, "loss_rank_avg": 0.04186457395553589, "step": 65, "valid_targets_mean": 3968.7, "valid_targets_min": 2080 }, { "epoch": 2.121212121212121, "grad_norm": 0.19648836279548257, "learning_rate": 3.55142258140884e-05, "loss": 0.1158, "loss_nan_ranks": 0, "loss_rank_avg": 0.04170043393969536, "step": 70, "valid_targets_mean": 3565.9, "valid_targets_min": 1709 }, { "epoch": 2.2727272727272725, "grad_norm": 0.161248915765299, "learning_rate": 3.451270987751598e-05, "loss": 0.1118, "loss_nan_ranks": 0, "loss_rank_avg": 0.033035580068826675, "step": 75, "valid_targets_mean": 4527.8, "valid_targets_min": 1786 }, { "epoch": 2.4242424242424243, "grad_norm": 0.17273322419326265, "learning_rate": 3.342766457891194e-05, "loss": 0.1109, "loss_nan_ranks": 0, "loss_rank_avg": 0.04147561639547348, "step": 80, "valid_targets_mean": 4303.3, "valid_targets_min": 1802 }, { "epoch": 2.5757575757575757, "grad_norm": 0.21495077559714412, "learning_rate": 3.226533500567433e-05, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.03459729626774788, "step": 85, "valid_targets_mean": 5005.4, "valid_targets_min": 1959 }, { "epoch": 2.7272727272727275, "grad_norm": 0.17330277599242455, "learning_rate": 3.1032411062620544e-05, "loss": 0.1065, "loss_nan_ranks": 0, "loss_rank_avg": 0.03450329601764679, "step": 90, "valid_targets_mean": 3855.8, "valid_targets_min": 2000 }, { "epoch": 2.878787878787879, "grad_norm": 0.2058052432782279, "learning_rate": 2.973598896756697e-05, "loss": 0.1002, "loss_nan_ranks": 0, "loss_rank_avg": 0.034510307013988495, "step": 95, "valid_targets_mean": 2783.3, "valid_targets_min": 1517 }, { "epoch": 3.0303030303030303, "grad_norm": 0.1831716887148403, "learning_rate": 2.8383530408333285e-05, "loss": 0.1045, "loss_nan_ranks": 0, "loss_rank_avg": 0.034129731357097626, "step": 100, "valid_targets_mean": 4927.6, "valid_targets_min": 1839 }, { "epoch": 3.1818181818181817, "grad_norm": 0.2213504054767785, "learning_rate": 2.6982819596247373e-05, "loss": 0.0913, "loss_nan_ranks": 0, "loss_rank_avg": 0.03140562027692795, "step": 105, "valid_targets_mean": 4125.8, "valid_targets_min": 1952 }, { "epoch": 3.3333333333333335, "grad_norm": 0.2357403108609295, "learning_rate": 2.554191846333378e-05, "loss": 0.0977, "loss_nan_ranks": 0, "loss_rank_avg": 0.04095553234219551, "step": 110, "valid_targets_mean": 3955.2, "valid_targets_min": 1884 }, { "epoch": 3.484848484848485, "grad_norm": 0.21189842608787948, "learning_rate": 2.4069120261052682e-05, "loss": 0.0944, "loss_nan_ranks": 0, "loss_rank_avg": 0.03247535228729248, "step": 115, "valid_targets_mean": 4148.2, "valid_targets_min": 1833 }, { "epoch": 3.6363636363636362, "grad_norm": 0.17986746292389105, "learning_rate": 2.2572901827656626e-05, "loss": 0.0941, "loss_nan_ranks": 0, "loss_rank_avg": 0.028431307524442673, "step": 120, "valid_targets_mean": 3585.2, "valid_targets_min": 1838 }, { "epoch": 3.787878787878788, "grad_norm": 0.16969823562432254, "learning_rate": 2.1061874798894992e-05, "loss": 0.0887, "loss_nan_ranks": 0, "loss_rank_avg": 0.029099609702825546, "step": 125, "valid_targets_mean": 4505.8, "valid_targets_min": 1839 }, { "epoch": 3.9393939393939394, "grad_norm": 0.1984359751489041, "learning_rate": 1.9544736042877886e-05, "loss": 0.0949, "loss_nan_ranks": 0, "loss_rank_avg": 0.0295043233782053, "step": 130, "valid_targets_mean": 3523.4, "valid_targets_min": 1771 }, { "epoch": 4.090909090909091, "grad_norm": 0.20675052605469652, "learning_rate": 1.8030217604376628e-05, "loss": 0.0865, "loss_nan_ranks": 0, "loss_rank_avg": 0.02581365406513214, "step": 135, "valid_targets_mean": 3747.0, "valid_targets_min": 1891 }, { "epoch": 4.242424242424242, "grad_norm": 0.20602993352135518, "learning_rate": 1.6527036446661396e-05, "loss": 0.0831, "loss_nan_ranks": 0, "loss_rank_avg": 0.026312127709388733, "step": 140, "valid_targets_mean": 3792.3, "valid_targets_min": 1601 }, { "epoch": 4.393939393939394, "grad_norm": 0.35322379499269224, "learning_rate": 1.5043844280142005e-05, "loss": 0.0811, "loss_nan_ranks": 0, "loss_rank_avg": 0.022907646372914314, "step": 145, "valid_targets_mean": 3479.7, "valid_targets_min": 1764 }, { "epoch": 4.545454545454545, "grad_norm": 0.20717971378191377, "learning_rate": 1.358917776657806e-05, "loss": 0.0825, "loss_nan_ranks": 0, "loss_rank_avg": 0.02863912098109722, "step": 150, "valid_targets_mean": 4172.6, "valid_targets_min": 1907 }, { "epoch": 4.696969696969697, "grad_norm": 0.21192392671392013, "learning_rate": 1.2171409385463218e-05, "loss": 0.0834, "loss_nan_ranks": 0, "loss_rank_avg": 0.027893800288438797, "step": 155, "valid_targets_mean": 5243.5, "valid_targets_min": 2145 }, { "epoch": 4.848484848484849, "grad_norm": 0.183311922132141, "learning_rate": 1.0798699245376959e-05, "loss": 0.0802, "loss_nan_ranks": 0, "loss_rank_avg": 0.02220100164413452, "step": 160, "valid_targets_mean": 4365.3, "valid_targets_min": 1802 }, { "epoch": 5.0, "grad_norm": 0.2352242705791683, "learning_rate": 9.478948117658577e-06, "loss": 0.0866, "loss_nan_ranks": 0, "loss_rank_avg": 0.0274435393512249, "step": 165, "valid_targets_mean": 2765.8, "valid_targets_min": 1918 }, { "epoch": 5.151515151515151, "grad_norm": 0.22402682073481225, "learning_rate": 8.219751962722726e-06, "loss": 0.0809, "loss_nan_ranks": 0, "loss_rank_avg": 0.025124292820692062, "step": 170, "valid_targets_mean": 2667.7, "valid_targets_min": 1680 }, { "epoch": 5.303030303030303, "grad_norm": 0.2253066902214136, "learning_rate": 7.028358210744881e-06, "loss": 0.0749, "loss_nan_ranks": 0, "loss_rank_avg": 0.02643812820315361, "step": 175, "valid_targets_mean": 3633.1, "valid_targets_min": 2001 }, { "epoch": 5.454545454545454, "grad_norm": 0.2122452461863688, "learning_rate": 5.911624048347757e-06, "loss": 0.077, "loss_nan_ranks": 0, "loss_rank_avg": 0.025532612577080727, "step": 180, "valid_targets_mean": 3961.9, "valid_targets_min": 1904 }, { "epoch": 5.606060606060606, "grad_norm": 0.21672480103767722, "learning_rate": 4.875976951373633e-06, "loss": 0.0761, "loss_nan_ranks": 0, "loss_rank_avg": 0.02824552357196808, "step": 185, "valid_targets_mean": 3794.2, "valid_targets_min": 1709 }, { "epoch": 5.757575757575758, "grad_norm": 0.21336791110807513, "learning_rate": 3.927377690900436e-06, "loss": 0.0786, "loss_nan_ranks": 0, "loss_rank_avg": 0.023748915642499924, "step": 190, "valid_targets_mean": 2816.2, "valid_targets_min": 1931 }, { "epoch": 5.909090909090909, "grad_norm": 0.19971132204134792, "learning_rate": 3.071286025423983e-06, "loss": 0.0754, "loss_nan_ranks": 0, "loss_rank_avg": 0.021909723058342934, "step": 195, "valid_targets_mean": 3034.3, "valid_targets_min": 1709 }, { "epoch": 6.0606060606060606, "grad_norm": 0.19967467647532736, "learning_rate": 2.312629276668554e-06, "loss": 0.0761, "loss_nan_ranks": 0, "loss_rank_avg": 0.02368747815489769, "step": 200, "valid_targets_mean": 3495.7, "valid_targets_min": 1891 }, { "epoch": 6.212121212121212, "grad_norm": 0.21684418439150877, "learning_rate": 1.6557739698909436e-06, "loss": 0.0757, "loss_nan_ranks": 0, "loss_rank_avg": 0.025541555136442184, "step": 205, "valid_targets_mean": 4154.6, "valid_targets_min": 1868 }, { "epoch": 6.363636363636363, "grad_norm": 0.24983119975734241, "learning_rate": 1.1045007019049182e-06, "loss": 0.0749, "loss_nan_ranks": 0, "loss_rank_avg": 0.02706868201494217, "step": 210, "valid_targets_mean": 3688.0, "valid_targets_min": 2086 }, { "epoch": 6.515151515151516, "grad_norm": 0.21233551956379038, "learning_rate": 6.619823814758786e-07, "loss": 0.0746, "loss_nan_ranks": 0, "loss_rank_avg": 0.023870835080742836, "step": 215, "valid_targets_mean": 4682.3, "valid_targets_min": 1914 }, { "epoch": 6.666666666666667, "grad_norm": 0.19011857432739673, "learning_rate": 3.307659673251595e-07, "loss": 0.0733, "loss_nan_ranks": 0, "loss_rank_avg": 0.027834340929985046, "step": 220, "valid_targets_mean": 4573.5, "valid_targets_min": 2055 }, { "epoch": 6.818181818181818, "grad_norm": 0.2113606209898585, "learning_rate": 1.1275780885282806e-07, "loss": 0.0763, "loss_nan_ranks": 0, "loss_rank_avg": 0.021668311208486557, "step": 225, "valid_targets_mean": 3098.8, "valid_targets_min": 1838 }, { "epoch": 6.96969696969697, "grad_norm": 0.23339033147330146, "learning_rate": 9.212673951897177e-09, "loss": 0.0708, "loss_nan_ranks": 0, "loss_rank_avg": 0.023356806486845016, "step": 230, "valid_targets_mean": 3134.9, "valid_targets_min": 1601 }, { "epoch": 7.0, "step": 231, "total_flos": 2.9757843152725606e+17, "train_loss": 0.0, "train_runtime": 1.1973, "train_samples_per_second": 18474.619, "train_steps_per_second": 192.931 } ], "logging_steps": 5, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9757843152725606e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }