{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 231, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15151515151515152, "grad_norm": 3.5902750493970834, "learning_rate": 6.666666666666667e-06, "loss": 0.367, "loss_nan_ranks": 0, "loss_rank_avg": 0.11483248323202133, "step": 5, "valid_targets_mean": 6069.3, "valid_targets_min": 1102 }, { "epoch": 0.30303030303030304, "grad_norm": 1.523293973634436, "learning_rate": 1.5000000000000002e-05, "loss": 0.3295, "loss_nan_ranks": 0, "loss_rank_avg": 0.09902024269104004, "step": 10, "valid_targets_mean": 5740.2, "valid_targets_min": 975 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5204685681128319, "learning_rate": 2.3333333333333336e-05, "loss": 0.2893, "loss_nan_ranks": 0, "loss_rank_avg": 0.09341852366924286, "step": 15, "valid_targets_mean": 6076.2, "valid_targets_min": 826 }, { "epoch": 0.6060606060606061, "grad_norm": 0.42120403433587034, "learning_rate": 3.1666666666666666e-05, "loss": 0.2586, "loss_nan_ranks": 0, "loss_rank_avg": 0.09784922748804092, "step": 20, "valid_targets_mean": 6676.1, "valid_targets_min": 1629 }, { "epoch": 0.7575757575757576, "grad_norm": 0.29716404096054466, "learning_rate": 4e-05, "loss": 0.2338, "loss_nan_ranks": 0, "loss_rank_avg": 0.07702332735061646, "step": 25, "valid_targets_mean": 5522.2, "valid_targets_min": 1847 }, { "epoch": 0.9090909090909091, "grad_norm": 0.2074554716015637, "learning_rate": 3.994244399375679e-05, "loss": 0.2068, "loss_nan_ranks": 0, "loss_rank_avg": 0.08520618081092834, "step": 30, "valid_targets_mean": 7733.2, "valid_targets_min": 1249 }, { "epoch": 1.0606060606060606, "grad_norm": 0.19383308521357237, "learning_rate": 3.977010724441261e-05, "loss": 0.1976, "loss_nan_ranks": 0, "loss_rank_avg": 0.05378055199980736, "step": 35, "valid_targets_mean": 4624.0, "valid_targets_min": 355 }, { "epoch": 1.2121212121212122, "grad_norm": 0.16410603240518978, "learning_rate": 3.9483981653469586e-05, "loss": 0.1883, "loss_nan_ranks": 0, "loss_rank_avg": 0.06615366041660309, "step": 40, "valid_targets_mean": 6550.9, "valid_targets_min": 1255 }, { "epoch": 1.3636363636363638, "grad_norm": 0.15449427864484538, "learning_rate": 3.908571404555758e-05, "loss": 0.1793, "loss_nan_ranks": 0, "loss_rank_avg": 0.05769656226038933, "step": 45, "valid_targets_mean": 6499.9, "valid_targets_min": 1751 }, { "epoch": 1.5151515151515151, "grad_norm": 0.14398778497253703, "learning_rate": 3.8577596689969346e-05, "loss": 0.18, "loss_nan_ranks": 0, "loss_rank_avg": 0.056848011910915375, "step": 50, "valid_targets_mean": 7237.5, "valid_targets_min": 372 }, { "epoch": 1.6666666666666665, "grad_norm": 0.13956070879698368, "learning_rate": 3.7962554107273926e-05, "loss": 0.1726, "loss_nan_ranks": 0, "loss_rank_avg": 0.06066435948014259, "step": 55, "valid_targets_mean": 6926.7, "valid_targets_min": 1357 }, { "epoch": 1.8181818181818183, "grad_norm": 0.1443322809922013, "learning_rate": 3.724412623694427e-05, "loss": 0.173, "loss_nan_ranks": 0, "loss_rank_avg": 0.05192989856004715, "step": 60, "valid_targets_mean": 5459.8, "valid_targets_min": 433 }, { "epoch": 1.9696969696969697, "grad_norm": 0.15138224844404377, "learning_rate": 3.642644806287938e-05, "loss": 0.178, "loss_nan_ranks": 0, "loss_rank_avg": 0.0544576533138752, "step": 65, "valid_targets_mean": 6176.4, "valid_targets_min": 1173 }, { "epoch": 2.121212121212121, "grad_norm": 0.14788211295786652, "learning_rate": 3.55142258140884e-05, "loss": 0.1706, "loss_nan_ranks": 0, "loss_rank_avg": 0.050428979098796844, "step": 70, "valid_targets_mean": 6172.1, "valid_targets_min": 1078 }, { "epoch": 2.2727272727272725, "grad_norm": 0.1489025346241851, "learning_rate": 3.451270987751598e-05, "loss": 0.1622, "loss_nan_ranks": 0, "loss_rank_avg": 0.0515875369310379, "step": 75, "valid_targets_mean": 5412.4, "valid_targets_min": 1256 }, { "epoch": 2.4242424242424243, "grad_norm": 0.15974193131852008, "learning_rate": 3.342766457891194e-05, "loss": 0.1596, "loss_nan_ranks": 0, "loss_rank_avg": 0.052027106285095215, "step": 80, "valid_targets_mean": 5327.2, "valid_targets_min": 1029 }, { "epoch": 2.5757575757575757, "grad_norm": 0.14890130460372, "learning_rate": 3.226533500567433e-05, "loss": 0.1565, "loss_nan_ranks": 0, "loss_rank_avg": 0.05260023474693298, "step": 85, "valid_targets_mean": 6336.1, "valid_targets_min": 826 }, { "epoch": 2.7272727272727275, "grad_norm": 0.15497597762513154, "learning_rate": 3.1032411062620544e-05, "loss": 0.1604, "loss_nan_ranks": 0, "loss_rank_avg": 0.0602923147380352, "step": 90, "valid_targets_mean": 5714.4, "valid_targets_min": 1409 }, { "epoch": 2.878787878787879, "grad_norm": 0.14743414796367607, "learning_rate": 2.973598896756697e-05, "loss": 0.162, "loss_nan_ranks": 0, "loss_rank_avg": 0.04700557142496109, "step": 95, "valid_targets_mean": 5250.4, "valid_targets_min": 978 }, { "epoch": 3.0303030303030303, "grad_norm": 0.14875208431462822, "learning_rate": 2.8383530408333285e-05, "loss": 0.1527, "loss_nan_ranks": 0, "loss_rank_avg": 0.050968416035175323, "step": 100, "valid_targets_mean": 6015.5, "valid_targets_min": 1215 }, { "epoch": 3.1818181818181817, "grad_norm": 0.14720059201307634, "learning_rate": 2.6982819596247373e-05, "loss": 0.1494, "loss_nan_ranks": 0, "loss_rank_avg": 0.048501770943403244, "step": 105, "valid_targets_mean": 6575.0, "valid_targets_min": 1490 }, { "epoch": 3.3333333333333335, "grad_norm": 0.14554091849093376, "learning_rate": 2.554191846333378e-05, "loss": 0.1517, "loss_nan_ranks": 0, "loss_rank_avg": 0.04890170693397522, "step": 110, "valid_targets_mean": 6384.5, "valid_targets_min": 1102 }, { "epoch": 3.484848484848485, "grad_norm": 0.14816303968679037, "learning_rate": 2.4069120261052682e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.05088581144809723, "step": 115, "valid_targets_mean": 6863.3, "valid_targets_min": 2033 }, { "epoch": 3.6363636363636362, "grad_norm": 0.18861192867251467, "learning_rate": 2.2572901827656626e-05, "loss": 0.1504, "loss_nan_ranks": 0, "loss_rank_avg": 0.051915183663368225, "step": 120, "valid_targets_mean": 5318.8, "valid_targets_min": 909 }, { "epoch": 3.787878787878788, "grad_norm": 0.17975884303700998, "learning_rate": 2.1061874798894992e-05, "loss": 0.1471, "loss_nan_ranks": 0, "loss_rank_avg": 0.04352159798145294, "step": 125, "valid_targets_mean": 5009.9, "valid_targets_min": 372 }, { "epoch": 3.9393939393939394, "grad_norm": 0.14992809475322363, "learning_rate": 1.9544736042877886e-05, "loss": 0.1483, "loss_nan_ranks": 0, "loss_rank_avg": 0.06831197440624237, "step": 130, "valid_targets_mean": 6830.1, "valid_targets_min": 947 }, { "epoch": 4.090909090909091, "grad_norm": 0.14832791011346455, "learning_rate": 1.8030217604376628e-05, "loss": 0.1394, "loss_nan_ranks": 0, "loss_rank_avg": 0.04658203199505806, "step": 135, "valid_targets_mean": 5643.2, "valid_targets_min": 355 }, { "epoch": 4.242424242424242, "grad_norm": 0.17266841542411188, "learning_rate": 1.6527036446661396e-05, "loss": 0.1412, "loss_nan_ranks": 0, "loss_rank_avg": 0.046023882925510406, "step": 140, "valid_targets_mean": 6153.0, "valid_targets_min": 510 }, { "epoch": 4.393939393939394, "grad_norm": 0.15432680905719748, "learning_rate": 1.5043844280142005e-05, "loss": 0.1369, "loss_nan_ranks": 0, "loss_rank_avg": 0.04154321551322937, "step": 145, "valid_targets_mean": 6060.3, "valid_targets_min": 563 }, { "epoch": 4.545454545454545, "grad_norm": 0.15852075197915375, "learning_rate": 1.358917776657806e-05, "loss": 0.1429, "loss_nan_ranks": 0, "loss_rank_avg": 0.045568957924842834, "step": 150, "valid_targets_mean": 6214.1, "valid_targets_min": 932 }, { "epoch": 4.696969696969697, "grad_norm": 0.15361105601774092, "learning_rate": 1.2171409385463218e-05, "loss": 0.1379, "loss_nan_ranks": 0, "loss_rank_avg": 0.046185821294784546, "step": 155, "valid_targets_mean": 6136.9, "valid_targets_min": 1255 }, { "epoch": 4.848484848484849, "grad_norm": 0.14910920681280979, "learning_rate": 1.0798699245376959e-05, "loss": 0.147, "loss_nan_ranks": 0, "loss_rank_avg": 0.053926583379507065, "step": 160, "valid_targets_mean": 7605.3, "valid_targets_min": 1808 }, { "epoch": 5.0, "grad_norm": 0.15956391595199854, "learning_rate": 9.478948117658577e-06, "loss": 0.1418, "loss_nan_ranks": 0, "loss_rank_avg": 0.04224388301372528, "step": 165, "valid_targets_mean": 4956.4, "valid_targets_min": 1186 }, { "epoch": 5.151515151515151, "grad_norm": 0.16166354923486542, "learning_rate": 8.219751962722726e-06, "loss": 0.1385, "loss_nan_ranks": 0, "loss_rank_avg": 0.04815865680575371, "step": 170, "valid_targets_mean": 6351.8, "valid_targets_min": 1414 }, { "epoch": 5.303030303030303, "grad_norm": 0.15713749630090504, "learning_rate": 7.028358210744881e-06, "loss": 0.1335, "loss_nan_ranks": 0, "loss_rank_avg": 0.04832237958908081, "step": 175, "valid_targets_mean": 6324.0, "valid_targets_min": 537 }, { "epoch": 5.454545454545454, "grad_norm": 0.16623362872731065, "learning_rate": 5.911624048347757e-06, "loss": 0.1368, "loss_nan_ranks": 0, "loss_rank_avg": 0.04412321001291275, "step": 180, "valid_targets_mean": 5107.0, "valid_targets_min": 975 }, { "epoch": 5.606060606060606, "grad_norm": 0.14675183366084796, "learning_rate": 4.875976951373633e-06, "loss": 0.1307, "loss_nan_ranks": 0, "loss_rank_avg": 0.03708821162581444, "step": 185, "valid_targets_mean": 6408.9, "valid_targets_min": 1162 }, { "epoch": 5.757575757575758, "grad_norm": 0.16118513545891452, "learning_rate": 3.927377690900436e-06, "loss": 0.1369, "loss_nan_ranks": 0, "loss_rank_avg": 0.04341595247387886, "step": 190, "valid_targets_mean": 6496.9, "valid_targets_min": 1027 }, { "epoch": 5.909090909090909, "grad_norm": 0.18904653293771853, "learning_rate": 3.071286025423983e-06, "loss": 0.1389, "loss_nan_ranks": 0, "loss_rank_avg": 0.059906527400016785, "step": 195, "valid_targets_mean": 6184.4, "valid_targets_min": 978 }, { "epoch": 6.0606060606060606, "grad_norm": 0.16058499044576713, "learning_rate": 2.312629276668554e-06, "loss": 0.1477, "loss_nan_ranks": 0, "loss_rank_avg": 0.0467909500002861, "step": 200, "valid_targets_mean": 5470.1, "valid_targets_min": 1107 }, { "epoch": 6.212121212121212, "grad_norm": 0.16458313236617705, "learning_rate": 1.6557739698909436e-06, "loss": 0.1363, "loss_nan_ranks": 0, "loss_rank_avg": 0.039594992995262146, "step": 205, "valid_targets_mean": 6872.1, "valid_targets_min": 1222 }, { "epoch": 6.363636363636363, "grad_norm": 0.1520783842215095, "learning_rate": 1.1045007019049182e-06, "loss": 0.1381, "loss_nan_ranks": 0, "loss_rank_avg": 0.042891182005405426, "step": 210, "valid_targets_mean": 5591.5, "valid_targets_min": 1134 }, { "epoch": 6.515151515151516, "grad_norm": 0.17583323971560555, "learning_rate": 6.619823814758786e-07, "loss": 0.1336, "loss_nan_ranks": 0, "loss_rank_avg": 0.04007503390312195, "step": 215, "valid_targets_mean": 5927.2, "valid_targets_min": 1202 }, { "epoch": 6.666666666666667, "grad_norm": 0.15107528231407247, "learning_rate": 3.307659673251595e-07, "loss": 0.1378, "loss_nan_ranks": 0, "loss_rank_avg": 0.05574905872344971, "step": 220, "valid_targets_mean": 7710.5, "valid_targets_min": 975 }, { "epoch": 6.818181818181818, "grad_norm": 0.1525570611116513, "learning_rate": 1.1275780885282806e-07, "loss": 0.1311, "loss_nan_ranks": 0, "loss_rank_avg": 0.036159858107566833, "step": 225, "valid_targets_mean": 6852.6, "valid_targets_min": 474 }, { "epoch": 6.96969696969697, "grad_norm": 0.1772473530958802, "learning_rate": 9.212673951897177e-09, "loss": 0.1337, "loss_nan_ranks": 0, "loss_rank_avg": 0.048351939767599106, "step": 230, "valid_targets_mean": 7166.3, "valid_targets_min": 1121 }, { "epoch": 7.0, "step": 231, "total_flos": 6.648213778192138e+17, "train_loss": 0.0, "train_runtime": 0.8229, "train_samples_per_second": 26879.554, "train_steps_per_second": 280.704 } ], "logging_steps": 5, "max_steps": 231, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.648213778192138e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }