{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 4.114635968171536, "learning_rate": 6.666666666666667e-06, "loss": 0.4313, "loss_nan_ranks": 0, "loss_rank_avg": 0.13727591931819916, "step": 5, "valid_targets_mean": 14623.7, "valid_targets_min": 8472 }, { "epoch": 0.30303030303030304, "grad_norm": 1.4536543569838813, "learning_rate": 1.5000000000000002e-05, "loss": 0.3895, "loss_nan_ranks": 0, "loss_rank_avg": 0.12720003724098206, "step": 10, "valid_targets_mean": 13794.9, "valid_targets_min": 8012 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5050980157229342, "learning_rate": 2.3333333333333336e-05, "loss": 0.3418, "loss_nan_ranks": 0, "loss_rank_avg": 0.10843911021947861, "step": 15, "valid_targets_mean": 13961.8, "valid_targets_min": 7669 }, { "epoch": 0.6060606060606061, "grad_norm": 0.42246792506810416, "learning_rate": 3.1666666666666666e-05, "loss": 0.3153, "loss_nan_ranks": 0, "loss_rank_avg": 0.10476820170879364, "step": 20, "valid_targets_mean": 13416.8, "valid_targets_min": 8687 }, { "epoch": 0.7575757575757576, "grad_norm": 0.32959630386266386, "learning_rate": 4e-05, "loss": 0.2947, "loss_nan_ranks": 0, "loss_rank_avg": 0.09988001734018326, "step": 25, "valid_targets_mean": 14083.4, "valid_targets_min": 6271 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2350625305435842, "learning_rate": 3.994244399375679e-05, "loss": 0.271, "loss_nan_ranks": 0, "loss_rank_avg": 0.09274592995643616, "step": 30, "valid_targets_mean": 14132.5, "valid_targets_min": 5776 }, { "epoch": 1.0606060606060606, "grad_norm": 0.19138295760535673, "learning_rate": 3.977010724441261e-05, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.08168983459472656, "step": 35, "valid_targets_mean": 13705.5, "valid_targets_min": 6252 }, { "epoch": 1.2121212121212122, "grad_norm": 0.16875241063378532, "learning_rate": 3.9483981653469586e-05, "loss": 0.2413, "loss_nan_ranks": 0, "loss_rank_avg": 0.07980272173881531, "step": 40, "valid_targets_mean": 13126.1, "valid_targets_min": 3159 }, { "epoch": 1.3636363636363638, "grad_norm": 0.14569427919381664, "learning_rate": 3.908571404555758e-05, "loss": 0.2315, "loss_nan_ranks": 0, "loss_rank_avg": 0.0792369619011879, "step": 45, "valid_targets_mean": 14597.3, "valid_targets_min": 6419 }, { "epoch": 1.5151515151515151, "grad_norm": 0.12740714547686136, "learning_rate": 3.8577596689969346e-05, "loss": 0.228, "loss_nan_ranks": 0, "loss_rank_avg": 0.07131356000900269, "step": 50, "valid_targets_mean": 14610.6, "valid_targets_min": 9878 }, { "epoch": 1.6666666666666665, "grad_norm": 0.13219914885031364, "learning_rate": 3.7962554107273926e-05, "loss": 0.2184, "loss_nan_ranks": 0, "loss_rank_avg": 0.07061326503753662, "step": 55, "valid_targets_mean": 14205.5, "valid_targets_min": 8994 }, { "epoch": 1.8181818181818183, "grad_norm": 0.12537992825560595, "learning_rate": 3.724412623694427e-05, "loss": 0.2129, "loss_nan_ranks": 0, "loss_rank_avg": 0.0706273764371872, "step": 60, "valid_targets_mean": 14081.0, "valid_targets_min": 7128 }, { "epoch": 1.9696969696969697, "grad_norm": 0.1317216509879008, "learning_rate": 3.642644806287938e-05, "loss": 0.2075, "loss_nan_ranks": 0, "loss_rank_avg": 0.06866224110126495, "step": 65, "valid_targets_mean": 13509.2, "valid_targets_min": 6997 }, { "epoch": 2.121212121212121, "grad_norm": 0.13400162499874246, "learning_rate": 3.55142258140884e-05, "loss": 0.2062, "loss_nan_ranks": 0, "loss_rank_avg": 0.06507541239261627, "step": 70, "valid_targets_mean": 13193.5, "valid_targets_min": 3186 }, { "epoch": 2.2727272727272725, "grad_norm": 0.14105797671143652, "learning_rate": 3.451270987751598e-05, "loss": 0.2062, "loss_nan_ranks": 0, "loss_rank_avg": 0.07145757973194122, "step": 75, "valid_targets_mean": 14511.4, "valid_targets_min": 5311 }, { "epoch": 2.4242424242424243, "grad_norm": 0.1342111928769276, "learning_rate": 3.342766457891194e-05, "loss": 0.1972, "loss_nan_ranks": 0, "loss_rank_avg": 0.07077564299106598, "step": 80, "valid_targets_mean": 14961.1, "valid_targets_min": 9318 }, { "epoch": 2.5757575757575757, "grad_norm": 0.12636739349250312, "learning_rate": 3.226533500567433e-05, "loss": 0.1948, "loss_nan_ranks": 0, "loss_rank_avg": 0.0638246163725853, "step": 85, "valid_targets_mean": 14331.9, "valid_targets_min": 8629 }, { "epoch": 2.7272727272727275, "grad_norm": 0.1376354010348529, "learning_rate": 3.1032411062620544e-05, "loss": 0.1969, "loss_nan_ranks": 0, "loss_rank_avg": 0.06627653539180756, "step": 90, "valid_targets_mean": 14391.3, "valid_targets_min": 9017 }, { "epoch": 2.878787878787879, "grad_norm": 0.12491286274381265, "learning_rate": 2.973598896756697e-05, "loss": 0.1923, "loss_nan_ranks": 0, "loss_rank_avg": 0.05493774637579918, "step": 95, "valid_targets_mean": 14151.3, "valid_targets_min": 6828 }, { "epoch": 3.0303030303030303, "grad_norm": 0.13478052499964224, "learning_rate": 2.8383530408333285e-05, "loss": 0.1914, "loss_nan_ranks": 0, "loss_rank_avg": 0.06786017119884491, "step": 100, "valid_targets_mean": 13563.6, "valid_targets_min": 5230 }, { "epoch": 3.1818181818181817, "grad_norm": 0.13423564287671794, "learning_rate": 2.6982819596247373e-05, "loss": 0.1892, "loss_nan_ranks": 0, "loss_rank_avg": 0.06517742574214935, "step": 105, "valid_targets_mean": 13841.5, "valid_targets_min": 7737 }, { "epoch": 3.3333333333333335, "grad_norm": 0.12892135797363977, "learning_rate": 2.554191846333378e-05, "loss": 0.1852, "loss_nan_ranks": 0, "loss_rank_avg": 0.06293950974941254, "step": 110, "valid_targets_mean": 13982.2, "valid_targets_min": 6906 }, { "epoch": 3.484848484848485, "grad_norm": 0.13050995936742432, "learning_rate": 2.4069120261052682e-05, "loss": 0.1861, "loss_nan_ranks": 0, "loss_rank_avg": 0.060917340219020844, "step": 115, "valid_targets_mean": 14595.4, "valid_targets_min": 7001 }, { "epoch": 3.6363636363636362, "grad_norm": 0.12603585687316865, "learning_rate": 2.2572901827656626e-05, "loss": 0.1872, "loss_nan_ranks": 0, "loss_rank_avg": 0.06335198879241943, "step": 120, "valid_targets_mean": 14134.8, "valid_targets_min": 4539 }, { "epoch": 3.787878787878788, "grad_norm": 0.12458026081937879, "learning_rate": 2.1061874798894992e-05, "loss": 0.1849, "loss_nan_ranks": 0, "loss_rank_avg": 0.0566466748714447, "step": 125, "valid_targets_mean": 13645.1, "valid_targets_min": 6953 }, { "epoch": 3.9393939393939394, "grad_norm": 0.1237313936998869, "learning_rate": 1.9544736042877886e-05, "loss": 0.1827, "loss_nan_ranks": 0, "loss_rank_avg": 0.06262873858213425, "step": 130, "valid_targets_mean": 14213.2, "valid_targets_min": 7309 }, { "epoch": 4.090909090909091, "grad_norm": 0.12687157253526618, "learning_rate": 1.8030217604376628e-05, "loss": 0.183, "loss_nan_ranks": 0, "loss_rank_avg": 0.06545929610729218, "step": 135, "valid_targets_mean": 14312.2, "valid_targets_min": 4618 }, { "epoch": 4.242424242424242, "grad_norm": 0.13286557514006167, "learning_rate": 1.6527036446661396e-05, "loss": 0.1838, "loss_nan_ranks": 0, "loss_rank_avg": 0.0621417798101902, "step": 140, "valid_targets_mean": 13995.9, "valid_targets_min": 3159 }, { "epoch": 4.393939393939394, "grad_norm": 0.13911478169619562, "learning_rate": 1.5043844280142005e-05, "loss": 0.1811, "loss_nan_ranks": 0, "loss_rank_avg": 0.06362389773130417, "step": 145, "valid_targets_mean": 14415.2, "valid_targets_min": 7395 }, { "epoch": 4.545454545454545, "grad_norm": 0.12748545316526233, "learning_rate": 1.358917776657806e-05, "loss": 0.1782, "loss_nan_ranks": 0, "loss_rank_avg": 0.05887322872877121, "step": 150, "valid_targets_mean": 13608.7, "valid_targets_min": 4237 }, { "epoch": 4.696969696969697, "grad_norm": 0.12065267038653112, "learning_rate": 1.2171409385463218e-05, "loss": 0.1767, "loss_nan_ranks": 0, "loss_rank_avg": 0.05912807583808899, "step": 155, "valid_targets_mean": 13554.9, "valid_targets_min": 6402 }, { "epoch": 4.848484848484849, "grad_norm": 0.1242188167578349, "learning_rate": 1.0798699245376959e-05, "loss": 0.1781, "loss_nan_ranks": 0, "loss_rank_avg": 0.05947456508874893, "step": 160, "valid_targets_mean": 14594.9, "valid_targets_min": 7248 }, { "epoch": 5.0, "grad_norm": 0.14434343975971592, "learning_rate": 9.478948117658577e-06, "loss": 0.1786, "loss_nan_ranks": 0, "loss_rank_avg": 0.059081535786390305, "step": 165, "valid_targets_mean": 13341.7, "valid_targets_min": 4728 }, { "epoch": 5.151515151515151, "grad_norm": 0.13210237534940392, "learning_rate": 8.219751962722726e-06, "loss": 0.1803, "loss_nan_ranks": 0, "loss_rank_avg": 0.059808894991874695, "step": 170, "valid_targets_mean": 14140.9, "valid_targets_min": 5241 }, { "epoch": 5.303030303030303, "grad_norm": 0.11760754357701442, "learning_rate": 7.028358210744881e-06, "loss": 0.1723, "loss_nan_ranks": 0, "loss_rank_avg": 0.05813150107860565, "step": 175, "valid_targets_mean": 13437.8, "valid_targets_min": 3456 }, { "epoch": 5.454545454545454, "grad_norm": 0.23292822424304802, "learning_rate": 5.911624048347757e-06, "loss": 0.1782, "loss_nan_ranks": 0, "loss_rank_avg": 0.054474152624607086, "step": 180, "valid_targets_mean": 13450.3, "valid_targets_min": 8527 }, { "epoch": 5.606060606060606, "grad_norm": 0.11629211221337708, "learning_rate": 4.875976951373633e-06, "loss": 0.177, "loss_nan_ranks": 0, "loss_rank_avg": 0.05472658574581146, "step": 185, "valid_targets_mean": 14527.5, "valid_targets_min": 6291 }, { "epoch": 5.757575757575758, "grad_norm": 0.12053316478201255, "learning_rate": 3.927377690900436e-06, "loss": 0.1755, "loss_nan_ranks": 0, "loss_rank_avg": 0.06293582916259766, "step": 190, "valid_targets_mean": 15079.5, "valid_targets_min": 5311 }, { "epoch": 5.909090909090909, "grad_norm": 0.14093615075318672, "learning_rate": 3.071286025423983e-06, "loss": 0.1762, "loss_nan_ranks": 0, "loss_rank_avg": 0.0564623698592186, "step": 195, "valid_targets_mean": 13432.3, "valid_targets_min": 4391 }, { "epoch": 6.0606060606060606, "grad_norm": 0.11794615480974098, "learning_rate": 2.312629276668554e-06, "loss": 0.1746, "loss_nan_ranks": 0, "loss_rank_avg": 0.059501372277736664, "step": 200, "valid_targets_mean": 14715.9, "valid_targets_min": 8912 }, { "epoch": 6.212121212121212, "grad_norm": 0.12194911582381218, "learning_rate": 1.6557739698909436e-06, "loss": 0.1764, "loss_nan_ranks": 0, "loss_rank_avg": 0.0586993545293808, "step": 205, "valid_targets_mean": 13863.8, "valid_targets_min": 5335 }, { "epoch": 6.363636363636363, "grad_norm": 0.12137779369589421, "learning_rate": 1.1045007019049182e-06, "loss": 0.1753, "loss_nan_ranks": 0, "loss_rank_avg": 0.05951303243637085, "step": 210, "valid_targets_mean": 13876.2, "valid_targets_min": 4510 }, { "epoch": 6.515151515151516, "grad_norm": 0.11115005771790042, "learning_rate": 6.619823814758786e-07, "loss": 0.1725, "loss_nan_ranks": 0, "loss_rank_avg": 0.0475098192691803, "step": 215, "valid_targets_mean": 13260.7, "valid_targets_min": 2975 }, { "epoch": 6.666666666666667, "grad_norm": 0.11465648239113958, "learning_rate": 3.307659673251595e-07, "loss": 0.1759, "loss_nan_ranks": 0, "loss_rank_avg": 0.05422068014740944, "step": 220, "valid_targets_mean": 13885.9, "valid_targets_min": 4618 }, { "epoch": 6.818181818181818, "grad_norm": 0.12646402426260042, "learning_rate": 1.1275780885282806e-07, "loss": 0.1764, "loss_nan_ranks": 0, "loss_rank_avg": 0.06054677814245224, "step": 225, "valid_targets_mean": 13998.0, "valid_targets_min": 7669 }, { "epoch": 6.96969696969697, "grad_norm": 0.11699617410563377, "learning_rate": 9.212673951897177e-09, "loss": 0.176, "loss_nan_ranks": 0, "loss_rank_avg": 0.061230286955833435, "step": 230, "valid_targets_mean": 14301.2, "valid_targets_min": 7992 }, { "epoch": 7.0, "step": 231, "total_flos": 1.7948424939556045e+18, "train_loss": 0.0, "train_runtime": 1.1533, "train_samples_per_second": 19179.804, "train_steps_per_second": 200.295 } ], "logging_steps": 5, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7948424939556045e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }