{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 287, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12195121951219512, "grad_norm": 11.268886282590001, "learning_rate": 5.517241379310345e-06, "loss": 0.7129, "loss_nan_ranks": 0, "loss_rank_avg": 0.08619208633899689, "step": 5, "valid_targets_mean": 1809.8, "valid_targets_min": 413 }, { "epoch": 0.24390243902439024, "grad_norm": 6.618387158921114, "learning_rate": 1.2413793103448277e-05, "loss": 0.5949, "loss_nan_ranks": 0, "loss_rank_avg": 0.15227359533309937, "step": 10, "valid_targets_mean": 1454.2, "valid_targets_min": 1048 }, { "epoch": 0.36585365853658536, "grad_norm": 1.1697598603814097, "learning_rate": 1.931034482758621e-05, "loss": 0.3616, "loss_nan_ranks": 0, "loss_rank_avg": 0.03705377131700516, "step": 15, "valid_targets_mean": 1068.8, "valid_targets_min": 603 }, { "epoch": 0.4878048780487805, "grad_norm": 1.0542992580910668, "learning_rate": 2.620689655172414e-05, "loss": 0.3145, "loss_nan_ranks": 0, "loss_rank_avg": 0.05863872170448303, "step": 20, "valid_targets_mean": 1012.8, "valid_targets_min": 786 }, { "epoch": 0.6097560975609756, "grad_norm": 0.8906250398426856, "learning_rate": 3.310344827586207e-05, "loss": 0.3165, "loss_nan_ranks": 0, "loss_rank_avg": 0.1044623851776123, "step": 25, "valid_targets_mean": 3050.0, "valid_targets_min": 1178 }, { "epoch": 0.7317073170731707, "grad_norm": 0.7873917512747328, "learning_rate": 4e-05, "loss": 0.2592, "loss_nan_ranks": 0, "loss_rank_avg": 0.07892066985368729, "step": 30, "valid_targets_mean": 2248.5, "valid_targets_min": 1487 }, { "epoch": 0.8536585365853658, "grad_norm": 0.7365758820610112, "learning_rate": 3.99629433475729e-05, "loss": 0.2519, "loss_nan_ranks": 0, "loss_rank_avg": 0.05776943266391754, "step": 35, "valid_targets_mean": 2209.0, "valid_targets_min": 853 }, { "epoch": 0.975609756097561, "grad_norm": 0.9982728088095962, "learning_rate": 3.985191070984053e-05, "loss": 0.2633, "loss_nan_ranks": 0, "loss_rank_avg": 0.08143679797649384, "step": 40, "valid_targets_mean": 1733.5, "valid_targets_min": 542 }, { "epoch": 1.0975609756097562, "grad_norm": 0.6049830734772758, "learning_rate": 3.966731353658932e-05, "loss": 0.2297, "loss_nan_ranks": 0, "loss_rank_avg": 0.05846400931477547, "step": 45, "valid_targets_mean": 2682.8, "valid_targets_min": 1287 }, { "epoch": 1.2195121951219512, "grad_norm": 0.6718448910202545, "learning_rate": 3.940983588314811e-05, "loss": 0.2282, "loss_nan_ranks": 0, "loss_rank_avg": 0.05738037824630737, "step": 50, "valid_targets_mean": 2266.2, "valid_targets_min": 917 }, { "epoch": 1.3414634146341464, "grad_norm": 0.7687467547629783, "learning_rate": 3.908043187550802e-05, "loss": 0.2176, "loss_nan_ranks": 0, "loss_rank_avg": 0.034274592995643616, "step": 55, "valid_targets_mean": 1044.0, "valid_targets_min": 912 }, { "epoch": 1.4634146341463414, "grad_norm": 0.4409997756684539, "learning_rate": 3.868032217465097e-05, "loss": 0.2294, "loss_nan_ranks": 0, "loss_rank_avg": 0.06288576126098633, "step": 60, "valid_targets_mean": 7464.2, "valid_targets_min": 4233 }, { "epoch": 1.5853658536585367, "grad_norm": 0.6669183491635846, "learning_rate": 3.821098945318869e-05, "loss": 0.2184, "loss_nan_ranks": 0, "loss_rank_avg": 0.041596993803977966, "step": 65, "valid_targets_mean": 2462.2, "valid_targets_min": 603 }, { "epoch": 1.7073170731707317, "grad_norm": 0.833951005600001, "learning_rate": 3.767417290107439e-05, "loss": 0.2229, "loss_nan_ranks": 0, "loss_rank_avg": 0.0828239768743515, "step": 70, "valid_targets_mean": 1676.0, "valid_targets_min": 960 }, { "epoch": 1.8292682926829267, "grad_norm": 0.3827765345675253, "learning_rate": 3.7071861780746934e-05, "loss": 0.1918, "loss_nan_ranks": 0, "loss_rank_avg": 0.03231481462717056, "step": 75, "valid_targets_mean": 6333.8, "valid_targets_min": 1529 }, { "epoch": 1.951219512195122, "grad_norm": 0.405420209692415, "learning_rate": 3.640628805559022e-05, "loss": 0.1834, "loss_nan_ranks": 0, "loss_rank_avg": 0.021378157660365105, "step": 80, "valid_targets_mean": 2585.5, "valid_targets_min": 894 }, { "epoch": 2.073170731707317, "grad_norm": 0.6706457928261834, "learning_rate": 3.567991811902403e-05, "loss": 0.1949, "loss_nan_ranks": 0, "loss_rank_avg": 0.029131846502423286, "step": 85, "valid_targets_mean": 1901.5, "valid_targets_min": 758 }, { "epoch": 2.1951219512195124, "grad_norm": 0.6506505238615833, "learning_rate": 3.489544365487564e-05, "loss": 0.1904, "loss_nan_ranks": 0, "loss_rank_avg": 0.06596972048282623, "step": 90, "valid_targets_mean": 2361.2, "valid_targets_min": 421 }, { "epoch": 2.317073170731707, "grad_norm": 0.5666611455881608, "learning_rate": 3.4055771662900637e-05, "loss": 0.1738, "loss_nan_ranks": 0, "loss_rank_avg": 0.032204728573560715, "step": 95, "valid_targets_mean": 1985.5, "valid_targets_min": 813 }, { "epoch": 2.4390243902439024, "grad_norm": 0.5528709212555227, "learning_rate": 3.316401368641496e-05, "loss": 0.1743, "loss_nan_ranks": 0, "loss_rank_avg": 0.03457804396748543, "step": 100, "valid_targets_mean": 3545.0, "valid_targets_min": 423 }, { "epoch": 2.5609756097560976, "grad_norm": 0.8867699611106878, "learning_rate": 3.222347428195699e-05, "loss": 0.182, "loss_nan_ranks": 0, "loss_rank_avg": 0.06031285971403122, "step": 105, "valid_targets_mean": 1640.8, "valid_targets_min": 938 }, { "epoch": 2.682926829268293, "grad_norm": 0.5690766882507081, "learning_rate": 3.1237638773707214e-05, "loss": 0.1587, "loss_nan_ranks": 0, "loss_rank_avg": 0.04014682024717331, "step": 110, "valid_targets_mean": 2029.8, "valid_targets_min": 960 }, { "epoch": 2.8048780487804876, "grad_norm": 0.7008728096682685, "learning_rate": 3.0210160338043583e-05, "loss": 0.1708, "loss_nan_ranks": 0, "loss_rank_avg": 0.07026761770248413, "step": 115, "valid_targets_mean": 2580.8, "valid_targets_min": 786 }, { "epoch": 2.926829268292683, "grad_norm": 0.7716450116227951, "learning_rate": 2.9144846466092773e-05, "loss": 0.1624, "loss_nan_ranks": 0, "loss_rank_avg": 0.06721051037311554, "step": 120, "valid_targets_mean": 2570.2, "valid_targets_min": 1543 }, { "epoch": 3.048780487804878, "grad_norm": 1.5503052769053867, "learning_rate": 2.804564485444265e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.0261751189827919, "step": 125, "valid_targets_mean": 1452.5, "valid_targets_min": 917 }, { "epoch": 3.1707317073170733, "grad_norm": 0.6734417093831508, "learning_rate": 2.691662877630023e-05, "loss": 0.1451, "loss_nan_ranks": 0, "loss_rank_avg": 0.028553606942296028, "step": 130, "valid_targets_mean": 2922.2, "valid_targets_min": 1260 }, { "epoch": 3.292682926829268, "grad_norm": 1.456490794322615, "learning_rate": 2.5761981987304757e-05, "loss": 0.1455, "loss_nan_ranks": 0, "loss_rank_avg": 0.03396756574511528, "step": 135, "valid_targets_mean": 1340.2, "valid_targets_min": 796 }, { "epoch": 3.4146341463414633, "grad_norm": 0.9411604050978699, "learning_rate": 2.4585983221929803e-05, "loss": 0.1411, "loss_nan_ranks": 0, "loss_rank_avg": 0.037783801555633545, "step": 140, "valid_targets_mean": 2368.0, "valid_targets_min": 797 }, { "epoch": 3.5365853658536586, "grad_norm": 0.9034332983731574, "learning_rate": 2.3392990337925696e-05, "loss": 0.1504, "loss_nan_ranks": 0, "loss_rank_avg": 0.04916565865278244, "step": 145, "valid_targets_mean": 1793.0, "valid_targets_min": 648 }, { "epoch": 3.658536585365854, "grad_norm": 0.9084489941265945, "learning_rate": 2.2187424167557496e-05, "loss": 0.1577, "loss_nan_ranks": 0, "loss_rank_avg": 0.04747513309121132, "step": 150, "valid_targets_mean": 2595.5, "valid_targets_min": 870 }, { "epoch": 3.7804878048780486, "grad_norm": 0.8065019341177397, "learning_rate": 2.0973752135480505e-05, "loss": 0.1459, "loss_nan_ranks": 0, "loss_rank_avg": 0.030843544751405716, "step": 155, "valid_targets_mean": 1864.2, "valid_targets_min": 1047 }, { "epoch": 3.902439024390244, "grad_norm": 0.5952164537741085, "learning_rate": 1.9756471703960053e-05, "loss": 0.1474, "loss_nan_ranks": 0, "loss_rank_avg": 0.020152652636170387, "step": 160, "valid_targets_mean": 1157.0, "valid_targets_min": 537 }, { "epoch": 4.024390243902439, "grad_norm": 0.5913764265658609, "learning_rate": 1.8540093706781848e-05, "loss": 0.1105, "loss_nan_ranks": 0, "loss_rank_avg": 0.023658908903598785, "step": 165, "valid_targets_mean": 1657.0, "valid_targets_min": 1129 }, { "epoch": 4.146341463414634, "grad_norm": 0.8822354625847237, "learning_rate": 1.7329125633612044e-05, "loss": 0.1177, "loss_nan_ranks": 0, "loss_rank_avg": 0.03471684455871582, "step": 170, "valid_targets_mean": 1434.2, "valid_targets_min": 1080 }, { "epoch": 4.2682926829268295, "grad_norm": 1.1958146779982144, "learning_rate": 1.6128054926749403e-05, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.03604736179113388, "step": 175, "valid_targets_mean": 2452.0, "valid_targets_min": 883 }, { "epoch": 4.390243902439025, "grad_norm": 0.660365938604491, "learning_rate": 1.4941332352166385e-05, "loss": 0.1253, "loss_nan_ranks": 0, "loss_rank_avg": 0.022628050297498703, "step": 180, "valid_targets_mean": 2193.5, "valid_targets_min": 832 }, { "epoch": 4.512195121951219, "grad_norm": 0.6469748368984156, "learning_rate": 1.3773355506460369e-05, "loss": 0.1212, "loss_nan_ranks": 0, "loss_rank_avg": 0.04104577377438545, "step": 185, "valid_targets_mean": 2404.2, "valid_targets_min": 491 }, { "epoch": 4.634146341463414, "grad_norm": 0.5688289937825557, "learning_rate": 1.2628452520832766e-05, "loss": 0.1289, "loss_nan_ranks": 0, "loss_rank_avg": 0.026066523045301437, "step": 190, "valid_targets_mean": 1617.0, "valid_targets_min": 648 }, { "epoch": 4.7560975609756095, "grad_norm": 0.8846513600102238, "learning_rate": 1.1510866022483702e-05, "loss": 0.112, "loss_nan_ranks": 0, "loss_rank_avg": 0.03839657083153725, "step": 195, "valid_targets_mean": 2619.5, "valid_targets_min": 832 }, { "epoch": 4.878048780487805, "grad_norm": 0.8205515547497473, "learning_rate": 1.0424737412855825e-05, "loss": 0.1243, "loss_nan_ranks": 0, "loss_rank_avg": 0.05243876576423645, "step": 200, "valid_targets_mean": 1985.8, "valid_targets_min": 1428 }, { "epoch": 5.0, "grad_norm": 0.8961051415857917, "learning_rate": 9.374091520986936e-06, "loss": 0.144, "loss_nan_ranks": 0, "loss_rank_avg": 0.026954276487231255, "step": 205, "valid_targets_mean": 2568.2, "valid_targets_min": 795 }, { "epoch": 5.121951219512195, "grad_norm": 0.6162209862161441, "learning_rate": 8.362821688840947e-06, "loss": 0.1142, "loss_nan_ranks": 0, "loss_rank_avg": 0.024338502436876297, "step": 210, "valid_targets_mean": 1392.8, "valid_targets_min": 1031 }, { "epoch": 5.2439024390243905, "grad_norm": 0.8925372695642547, "learning_rate": 7.394675343885827e-06, "loss": 0.1041, "loss_nan_ranks": 0, "loss_rank_avg": 0.03295106813311577, "step": 215, "valid_targets_mean": 3177.8, "valid_targets_min": 1031 }, { "epoch": 5.365853658536586, "grad_norm": 0.6073755624637226, "learning_rate": 6.473240112381944e-06, "loss": 0.1122, "loss_nan_ranks": 0, "loss_rank_avg": 0.02173219993710518, "step": 220, "valid_targets_mean": 3121.0, "valid_targets_min": 1748 }, { "epoch": 5.487804878048781, "grad_norm": 1.040705900656647, "learning_rate": 5.601930524840087e-06, "loss": 0.1248, "loss_nan_ranks": 0, "loss_rank_avg": 0.025791862979531288, "step": 225, "valid_targets_mean": 2800.5, "valid_targets_min": 929 }, { "epoch": 5.609756097560975, "grad_norm": 0.8950373265293078, "learning_rate": 4.7839753629144395e-06, "loss": 0.1106, "loss_nan_ranks": 0, "loss_rank_avg": 0.014685861766338348, "step": 230, "valid_targets_mean": 945.0, "valid_targets_min": 421 }, { "epoch": 5.7317073170731705, "grad_norm": 0.7649064738807579, "learning_rate": 4.022405694618659e-06, "loss": 0.108, "loss_nan_ranks": 0, "loss_rank_avg": 0.02761884592473507, "step": 235, "valid_targets_mean": 1485.0, "valid_targets_min": 331 }, { "epoch": 5.853658536585366, "grad_norm": 0.8719679552921337, "learning_rate": 3.320043642202444e-06, "loss": 0.1033, "loss_nan_ranks": 0, "loss_rank_avg": 0.03598159924149513, "step": 240, "valid_targets_mean": 3070.0, "valid_targets_min": 1260 }, { "epoch": 5.975609756097561, "grad_norm": 0.6242438144503197, "learning_rate": 2.679491924311226e-06, "loss": 0.1044, "loss_nan_ranks": 0, "loss_rank_avg": 0.025556661188602448, "step": 245, "valid_targets_mean": 2245.0, "valid_targets_min": 1069 }, { "epoch": 6.097560975609756, "grad_norm": 0.8732700455256523, "learning_rate": 2.103124211182164e-06, "loss": 0.1087, "loss_nan_ranks": 0, "loss_rank_avg": 0.03347954526543617, "step": 250, "valid_targets_mean": 1517.0, "valid_targets_min": 606 }, { "epoch": 6.219512195121951, "grad_norm": 1.0111215639307505, "learning_rate": 1.5930763286168138e-06, "loss": 0.1049, "loss_nan_ranks": 0, "loss_rank_avg": 0.026082392781972885, "step": 255, "valid_targets_mean": 1642.8, "valid_targets_min": 960 }, { "epoch": 6.341463414634147, "grad_norm": 0.7577281317860388, "learning_rate": 1.1512383433257112e-06, "loss": 0.0969, "loss_nan_ranks": 0, "loss_rank_avg": 0.01752658188343048, "step": 260, "valid_targets_mean": 2109.5, "valid_targets_min": 999 }, { "epoch": 6.463414634146342, "grad_norm": 0.6730869279262718, "learning_rate": 7.792475589738679e-07, "loss": 0.1071, "loss_nan_ranks": 0, "loss_rank_avg": 0.020278798416256905, "step": 265, "valid_targets_mean": 1482.5, "valid_targets_min": 859 }, { "epoch": 6.585365853658536, "grad_norm": 0.7854617549186186, "learning_rate": 4.784824488814588e-07, "loss": 0.1053, "loss_nan_ranks": 0, "loss_rank_avg": 0.022827567532658577, "step": 270, "valid_targets_mean": 2941.8, "valid_targets_min": 695 }, { "epoch": 6.7073170731707314, "grad_norm": 0.5254164831244615, "learning_rate": 2.5005754786317173e-07, "loss": 0.1065, "loss_nan_ranks": 0, "loss_rank_avg": 0.022403722628951073, "step": 275, "valid_targets_mean": 2718.0, "valid_targets_min": 1325 }, { "epoch": 6.829268292682927, "grad_norm": 0.6678257554250038, "learning_rate": 9.481932213528444e-08, "loss": 0.0987, "loss_nan_ranks": 0, "loss_rank_avg": 0.027537822723388672, "step": 280, "valid_targets_mean": 3525.0, "valid_targets_min": 1519 }, { "epoch": 6.951219512195122, "grad_norm": 0.8096613625255807, "learning_rate": 1.334303259521219e-08, "loss": 0.1098, "loss_nan_ranks": 0, "loss_rank_avg": 0.03351534903049469, "step": 285, "valid_targets_mean": 1808.5, "valid_targets_min": 1244 }, { "epoch": 7.0, "loss_nan_ranks": 0, "loss_rank_avg": 0.016517292708158493, "step": 287, "total_flos": 9.154523001624986e+16, "train_loss": 0.17878121568558522, "train_runtime": 15474.591, "train_samples_per_second": 0.297, "train_steps_per_second": 0.019, "valid_targets_mean": 880.2, "valid_targets_min": 562 } ], "logging_steps": 5, "max_steps": 287, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.154523001624986e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }