{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 588, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05952380952380952, "grad_norm": 26.475136303319733, "learning_rate": 2.7118644067796613e-06, "loss": 0.8757, "loss_nan_ranks": 0, "loss_rank_avg": 0.28182709217071533, "step": 5, "valid_targets_mean": 5259.6, "valid_targets_min": 241 }, { "epoch": 0.11904761904761904, "grad_norm": 10.752364491117543, "learning_rate": 6.1016949152542385e-06, "loss": 0.7723, "loss_nan_ranks": 0, "loss_rank_avg": 0.23036649823188782, "step": 10, "valid_targets_mean": 5471.6, "valid_targets_min": 308 }, { "epoch": 0.17857142857142858, "grad_norm": 2.0975582533943844, "learning_rate": 9.491525423728815e-06, "loss": 0.5988, "loss_nan_ranks": 0, "loss_rank_avg": 0.1777261197566986, "step": 15, "valid_targets_mean": 5108.2, "valid_targets_min": 437 }, { "epoch": 0.23809523809523808, "grad_norm": 1.4703223243635486, "learning_rate": 1.288135593220339e-05, "loss": 0.548, "loss_nan_ranks": 0, "loss_rank_avg": 0.1800306737422943, "step": 20, "valid_targets_mean": 5685.6, "valid_targets_min": 102 }, { "epoch": 0.2976190476190476, "grad_norm": 0.8534483461368364, "learning_rate": 1.6271186440677967e-05, "loss": 0.5087, "loss_nan_ranks": 0, "loss_rank_avg": 0.14695829153060913, "step": 25, "valid_targets_mean": 4944.7, "valid_targets_min": 2418 }, { "epoch": 0.35714285714285715, "grad_norm": 0.582534993586709, "learning_rate": 1.9661016949152545e-05, "loss": 0.4839, "loss_nan_ranks": 0, "loss_rank_avg": 0.16712947189807892, "step": 30, "valid_targets_mean": 6107.1, "valid_targets_min": 2487 }, { "epoch": 0.4166666666666667, "grad_norm": 0.49806474628335173, "learning_rate": 2.3050847457627122e-05, "loss": 0.4524, "loss_nan_ranks": 0, "loss_rank_avg": 0.15718674659729004, "step": 35, "valid_targets_mean": 6053.5, "valid_targets_min": 439 }, { "epoch": 0.47619047619047616, "grad_norm": 0.39977189555606873, "learning_rate": 2.6440677966101696e-05, "loss": 0.4284, "loss_nan_ranks": 0, "loss_rank_avg": 0.13855797052383423, "step": 40, "valid_targets_mean": 5599.5, "valid_targets_min": 2254 }, { "epoch": 0.5357142857142857, "grad_norm": 0.35501868807640347, "learning_rate": 2.9830508474576274e-05, "loss": 0.4083, "loss_nan_ranks": 0, "loss_rank_avg": 0.1349203884601593, "step": 45, "valid_targets_mean": 6183.1, "valid_targets_min": 231 }, { "epoch": 0.5952380952380952, "grad_norm": 0.3120105713672939, "learning_rate": 3.322033898305085e-05, "loss": 0.3911, "loss_nan_ranks": 0, "loss_rank_avg": 0.14311061799526215, "step": 50, "valid_targets_mean": 5855.2, "valid_targets_min": 2956 }, { "epoch": 0.6547619047619048, "grad_norm": 0.2727710443190103, "learning_rate": 3.6610169491525426e-05, "loss": 0.3768, "loss_nan_ranks": 0, "loss_rank_avg": 0.12037570774555206, "step": 55, "valid_targets_mean": 5236.8, "valid_targets_min": 354 }, { "epoch": 0.7142857142857143, "grad_norm": 0.2757726260661749, "learning_rate": 4e-05, "loss": 0.3573, "loss_nan_ranks": 0, "loss_rank_avg": 0.11726392805576324, "step": 60, "valid_targets_mean": 5693.2, "valid_targets_min": 229 }, { "epoch": 0.7738095238095238, "grad_norm": 0.2786546982169116, "learning_rate": 3.9991183494162245e-05, "loss": 0.3501, "loss_nan_ranks": 0, "loss_rank_avg": 0.11834583431482315, "step": 65, "valid_targets_mean": 6120.8, "valid_targets_min": 2344 }, { "epoch": 0.8333333333333334, "grad_norm": 0.23636721617724882, "learning_rate": 3.996474174972647e-05, "loss": 0.3417, "loss_nan_ranks": 0, "loss_rank_avg": 0.11998672783374786, "step": 70, "valid_targets_mean": 5844.7, "valid_targets_min": 2484 }, { "epoch": 0.8928571428571429, "grad_norm": 0.24309351059305612, "learning_rate": 3.9920698079072125e-05, "loss": 0.3344, "loss_nan_ranks": 0, "loss_rank_avg": 0.10884048044681549, "step": 75, "valid_targets_mean": 5633.0, "valid_targets_min": 494 }, { "epoch": 0.9523809523809523, "grad_norm": 0.2472743814168574, "learning_rate": 3.9859091313327124e-05, "loss": 0.3289, "loss_nan_ranks": 0, "loss_rank_avg": 0.09308268129825592, "step": 80, "valid_targets_mean": 5145.1, "valid_targets_min": 2288 }, { "epoch": 1.0119047619047619, "grad_norm": 0.2635411800539745, "learning_rate": 3.977997576813247e-05, "loss": 0.3198, "loss_nan_ranks": 0, "loss_rank_avg": 0.11220236867666245, "step": 85, "valid_targets_mean": 5678.7, "valid_targets_min": 2563 }, { "epoch": 1.0714285714285714, "grad_norm": 0.2880444080728388, "learning_rate": 3.968342119575477e-05, "loss": 0.3123, "loss_nan_ranks": 0, "loss_rank_avg": 0.10120871663093567, "step": 90, "valid_targets_mean": 5122.5, "valid_targets_min": 2218 }, { "epoch": 1.130952380952381, "grad_norm": 0.2710755850501796, "learning_rate": 3.956951272358911e-05, "loss": 0.3128, "loss_nan_ranks": 0, "loss_rank_avg": 0.10858584940433502, "step": 95, "valid_targets_mean": 5793.9, "valid_targets_min": 2072 }, { "epoch": 1.1904761904761905, "grad_norm": 0.2559535005305801, "learning_rate": 3.943835077910648e-05, "loss": 0.3078, "loss_nan_ranks": 0, "loss_rank_avg": 0.08397121727466583, "step": 100, "valid_targets_mean": 4776.5, "valid_targets_min": 376 }, { "epoch": 1.25, "grad_norm": 0.2362436236599847, "learning_rate": 3.9290051001311815e-05, "loss": 0.3036, "loss_nan_ranks": 0, "loss_rank_avg": 0.10915631055831909, "step": 105, "valid_targets_mean": 6117.2, "valid_targets_min": 1862 }, { "epoch": 1.3095238095238095, "grad_norm": 0.2852961689327156, "learning_rate": 3.912474413879077e-05, "loss": 0.3055, "loss_nan_ranks": 0, "loss_rank_avg": 0.09754365682601929, "step": 110, "valid_targets_mean": 5148.4, "valid_targets_min": 436 }, { "epoch": 1.369047619047619, "grad_norm": 0.2794753115317547, "learning_rate": 3.894257593443519e-05, "loss": 0.2956, "loss_nan_ranks": 0, "loss_rank_avg": 0.10125661641359329, "step": 115, "valid_targets_mean": 6039.0, "valid_targets_min": 494 }, { "epoch": 1.4285714285714286, "grad_norm": 0.26532864715105864, "learning_rate": 3.874370699694878e-05, "loss": 0.298, "loss_nan_ranks": 0, "loss_rank_avg": 0.10564189404249191, "step": 120, "valid_targets_mean": 6079.8, "valid_targets_min": 471 }, { "epoch": 1.4880952380952381, "grad_norm": 0.29837003528349804, "learning_rate": 3.8528312659246395e-05, "loss": 0.2976, "loss_nan_ranks": 0, "loss_rank_avg": 0.09849759191274643, "step": 125, "valid_targets_mean": 5342.6, "valid_targets_min": 1368 }, { "epoch": 1.5476190476190477, "grad_norm": 0.23646296987288887, "learning_rate": 3.8296582823871596e-05, "loss": 0.2975, "loss_nan_ranks": 0, "loss_rank_avg": 0.09054168313741684, "step": 130, "valid_targets_mean": 4966.0, "valid_targets_min": 220 }, { "epoch": 1.6071428571428572, "grad_norm": 0.2618136463189114, "learning_rate": 3.804872179556902e-05, "loss": 0.2944, "loss_nan_ranks": 0, "loss_rank_avg": 0.09321890771389008, "step": 135, "valid_targets_mean": 4762.0, "valid_targets_min": 165 }, { "epoch": 1.6666666666666665, "grad_norm": 0.2708942100931431, "learning_rate": 3.778494810115896e-05, "loss": 0.2938, "loss_nan_ranks": 0, "loss_rank_avg": 0.1079484075307846, "step": 140, "valid_targets_mean": 5604.2, "valid_targets_min": 451 }, { "epoch": 1.7261904761904763, "grad_norm": 0.2789099088534343, "learning_rate": 3.750549429687309e-05, "loss": 0.2933, "loss_nan_ranks": 0, "loss_rank_avg": 0.08427592366933823, "step": 145, "valid_targets_mean": 5146.2, "valid_targets_min": 427 }, { "epoch": 1.7857142857142856, "grad_norm": 0.2678992582042396, "learning_rate": 3.721060676332109e-05, "loss": 0.2962, "loss_nan_ranks": 0, "loss_rank_avg": 0.10124705731868744, "step": 150, "valid_targets_mean": 5797.0, "valid_targets_min": 389 }, { "epoch": 1.8452380952380953, "grad_norm": 0.2684299288852733, "learning_rate": 3.6900545488269066e-05, "loss": 0.2894, "loss_nan_ranks": 0, "loss_rank_avg": 0.0845484584569931, "step": 155, "valid_targets_mean": 5063.4, "valid_targets_min": 445 }, { "epoch": 1.9047619047619047, "grad_norm": 0.2833413534409504, "learning_rate": 3.657558383742117e-05, "loss": 0.2919, "loss_nan_ranks": 0, "loss_rank_avg": 0.09527384489774704, "step": 160, "valid_targets_mean": 5275.2, "valid_targets_min": 308 }, { "epoch": 1.9642857142857144, "grad_norm": 0.28986568017083997, "learning_rate": 3.6236008313406594e-05, "loss": 0.2871, "loss_nan_ranks": 0, "loss_rank_avg": 0.10269998013973236, "step": 165, "valid_targets_mean": 5881.0, "valid_targets_min": 386 }, { "epoch": 2.0238095238095237, "grad_norm": 0.27082985663787124, "learning_rate": 3.58821183031843e-05, "loss": 0.2813, "loss_nan_ranks": 0, "loss_rank_avg": 0.09580333530902863, "step": 170, "valid_targets_mean": 5845.4, "valid_targets_min": 1920 }, { "epoch": 2.0833333333333335, "grad_norm": 0.27912438131905437, "learning_rate": 3.55142258140884e-05, "loss": 0.2816, "loss_nan_ranks": 0, "loss_rank_avg": 0.0991968959569931, "step": 175, "valid_targets_mean": 6037.7, "valid_targets_min": 2878 }, { "epoch": 2.142857142857143, "grad_norm": 0.2495296583091803, "learning_rate": 3.513265519874668e-05, "loss": 0.2779, "loss_nan_ranks": 0, "loss_rank_avg": 0.0991060882806778, "step": 180, "valid_targets_mean": 5959.1, "valid_targets_min": 470 }, { "epoch": 2.2023809523809526, "grad_norm": 0.24797503017191885, "learning_rate": 3.473774286911489e-05, "loss": 0.2784, "loss_nan_ranks": 0, "loss_rank_avg": 0.10953380167484283, "step": 185, "valid_targets_mean": 6153.2, "valid_targets_min": 3118 }, { "epoch": 2.261904761904762, "grad_norm": 0.2635797062915291, "learning_rate": 3.432983699987901e-05, "loss": 0.2785, "loss_nan_ranks": 0, "loss_rank_avg": 0.07885843515396118, "step": 190, "valid_targets_mean": 4745.1, "valid_targets_min": 335 }, { "epoch": 2.3214285714285716, "grad_norm": 0.27743273802507495, "learning_rate": 3.390929722148677e-05, "loss": 0.2786, "loss_nan_ranks": 0, "loss_rank_avg": 0.08541278541088104, "step": 195, "valid_targets_mean": 5393.5, "valid_targets_min": 2005 }, { "epoch": 2.380952380952381, "grad_norm": 0.27104625586587555, "learning_rate": 3.3476494303079285e-05, "loss": 0.2778, "loss_nan_ranks": 0, "loss_rank_avg": 0.08726505935192108, "step": 200, "valid_targets_mean": 5373.0, "valid_targets_min": 257 }, { "epoch": 2.4404761904761907, "grad_norm": 0.25680629085820356, "learning_rate": 3.303180982560224e-05, "loss": 0.2746, "loss_nan_ranks": 0, "loss_rank_avg": 0.08836235105991364, "step": 205, "valid_targets_mean": 5117.2, "valid_targets_min": 361 }, { "epoch": 2.5, "grad_norm": 0.2722466936160436, "learning_rate": 3.2575635845384787e-05, "loss": 0.2746, "loss_nan_ranks": 0, "loss_rank_avg": 0.09811330586671829, "step": 210, "valid_targets_mean": 5944.3, "valid_targets_min": 2473 }, { "epoch": 2.5595238095238093, "grad_norm": 0.2751403433864084, "learning_rate": 3.21083745484829e-05, "loss": 0.2707, "loss_nan_ranks": 0, "loss_rank_avg": 0.07897627353668213, "step": 215, "valid_targets_mean": 4240.3, "valid_targets_min": 222 }, { "epoch": 2.619047619047619, "grad_norm": 0.2862317925607407, "learning_rate": 3.1630437896091756e-05, "loss": 0.2792, "loss_nan_ranks": 0, "loss_rank_avg": 0.08746589720249176, "step": 220, "valid_targets_mean": 5690.9, "valid_targets_min": 440 }, { "epoch": 2.678571428571429, "grad_norm": 0.2813165779096547, "learning_rate": 3.114224726133996e-05, "loss": 0.273, "loss_nan_ranks": 0, "loss_rank_avg": 0.0828898623585701, "step": 225, "valid_targets_mean": 4970.9, "valid_targets_min": 1760 }, { "epoch": 2.738095238095238, "grad_norm": 0.27056484197253816, "learning_rate": 3.0644233057785615e-05, "loss": 0.2706, "loss_nan_ranks": 0, "loss_rank_avg": 0.08945001661777496, "step": 230, "valid_targets_mean": 5406.1, "valid_targets_min": 426 }, { "epoch": 2.7976190476190474, "grad_norm": 0.2685779777737954, "learning_rate": 3.0136834359942032e-05, "loss": 0.2684, "loss_nan_ranks": 0, "loss_rank_avg": 0.08828261494636536, "step": 235, "valid_targets_mean": 5503.7, "valid_targets_min": 368 }, { "epoch": 2.857142857142857, "grad_norm": 0.2732406218897489, "learning_rate": 2.9620498516167356e-05, "loss": 0.2717, "loss_nan_ranks": 0, "loss_rank_avg": 0.07858332991600037, "step": 240, "valid_targets_mean": 5290.2, "valid_targets_min": 2127 }, { "epoch": 2.9166666666666665, "grad_norm": 0.2399472243045982, "learning_rate": 2.9095680754259687e-05, "loss": 0.2677, "loss_nan_ranks": 0, "loss_rank_avg": 0.08259420096874237, "step": 245, "valid_targets_mean": 4813.6, "valid_targets_min": 409 }, { "epoch": 2.9761904761904763, "grad_norm": 0.24882382594549618, "learning_rate": 2.8562843780105182e-05, "loss": 0.2702, "loss_nan_ranks": 0, "loss_rank_avg": 0.10204588621854782, "step": 250, "valid_targets_mean": 6258.4, "valid_targets_min": 1978 }, { "epoch": 3.0357142857142856, "grad_norm": 0.24889920345299785, "learning_rate": 2.8022457369733165e-05, "loss": 0.2662, "loss_nan_ranks": 0, "loss_rank_avg": 0.08854080736637115, "step": 255, "valid_targets_mean": 6058.7, "valid_targets_min": 437 }, { "epoch": 3.0952380952380953, "grad_norm": 0.26589186213113003, "learning_rate": 2.7474997955137803e-05, "loss": 0.2625, "loss_nan_ranks": 0, "loss_rank_avg": 0.07796312123537064, "step": 260, "valid_targets_mean": 5013.2, "valid_targets_min": 416 }, { "epoch": 3.1547619047619047, "grad_norm": 0.2892003595844532, "learning_rate": 2.6920948204231573e-05, "loss": 0.2658, "loss_nan_ranks": 0, "loss_rank_avg": 0.0770723819732666, "step": 265, "valid_targets_mean": 4645.8, "valid_targets_min": 424 }, { "epoch": 3.2142857142857144, "grad_norm": 0.29019028159438703, "learning_rate": 2.636079659530079e-05, "loss": 0.265, "loss_nan_ranks": 0, "loss_rank_avg": 0.08780994266271591, "step": 270, "valid_targets_mean": 5075.2, "valid_targets_min": 311 }, { "epoch": 3.2738095238095237, "grad_norm": 0.25707211946258385, "learning_rate": 2.5795036986338477e-05, "loss": 0.2656, "loss_nan_ranks": 0, "loss_rank_avg": 0.07921002805233002, "step": 275, "valid_targets_mean": 4983.1, "valid_targets_min": 259 }, { "epoch": 3.3333333333333335, "grad_norm": 0.2393833152778856, "learning_rate": 2.522416817963416e-05, "loss": 0.2615, "loss_nan_ranks": 0, "loss_rank_avg": 0.08294697105884552, "step": 280, "valid_targets_mean": 5771.5, "valid_targets_min": 3118 }, { "epoch": 3.392857142857143, "grad_norm": 0.25514689276071223, "learning_rate": 2.464869348200452e-05, "loss": 0.2592, "loss_nan_ranks": 0, "loss_rank_avg": 0.09682682156562805, "step": 285, "valid_targets_mean": 5856.5, "valid_targets_min": 487 }, { "epoch": 3.4523809523809526, "grad_norm": 0.25666137663935856, "learning_rate": 2.4069120261052682e-05, "loss": 0.2658, "loss_nan_ranks": 0, "loss_rank_avg": 0.10615084320306778, "step": 290, "valid_targets_mean": 6388.9, "valid_targets_min": 2484 }, { "epoch": 3.511904761904762, "grad_norm": 0.24250690776748496, "learning_rate": 2.3485959497847223e-05, "loss": 0.2583, "loss_nan_ranks": 0, "loss_rank_avg": 0.07441714406013489, "step": 295, "valid_targets_mean": 5149.7, "valid_targets_min": 396 }, { "epoch": 3.571428571428571, "grad_norm": 0.2605704598646807, "learning_rate": 2.2899725336415468e-05, "loss": 0.2619, "loss_nan_ranks": 0, "loss_rank_avg": 0.08115138858556747, "step": 300, "valid_targets_mean": 5677.9, "valid_targets_min": 2348 }, { "epoch": 3.630952380952381, "grad_norm": 0.24661982746609362, "learning_rate": 2.2310934630448076e-05, "loss": 0.2598, "loss_nan_ranks": 0, "loss_rank_avg": 0.08594545722007751, "step": 305, "valid_targets_mean": 5679.9, "valid_targets_min": 1757 }, { "epoch": 3.6904761904761907, "grad_norm": 0.25915778890419666, "learning_rate": 2.1720106487614678e-05, "loss": 0.2564, "loss_nan_ranks": 0, "loss_rank_avg": 0.09154875576496124, "step": 310, "valid_targets_mean": 5870.5, "valid_targets_min": 2366 }, { "epoch": 3.75, "grad_norm": 0.2517589506891814, "learning_rate": 2.112776181189232e-05, "loss": 0.2622, "loss_nan_ranks": 0, "loss_rank_avg": 0.08415160328149796, "step": 315, "valid_targets_mean": 5251.9, "valid_targets_min": 401 }, { "epoch": 3.8095238095238093, "grad_norm": 0.2700239209259606, "learning_rate": 2.0534422844310144e-05, "loss": 0.2644, "loss_nan_ranks": 0, "loss_rank_avg": 0.07980450242757797, "step": 320, "valid_targets_mean": 4789.0, "valid_targets_min": 321 }, { "epoch": 3.869047619047619, "grad_norm": 0.2549269120465629, "learning_rate": 1.9940612702515292e-05, "loss": 0.2574, "loss_nan_ranks": 0, "loss_rank_avg": 0.1039179265499115, "step": 325, "valid_targets_mean": 6290.8, "valid_targets_min": 515 }, { "epoch": 3.928571428571429, "grad_norm": 0.28891001950049783, "learning_rate": 1.934685491956595e-05, "loss": 0.2585, "loss_nan_ranks": 0, "loss_rank_avg": 0.08675411343574524, "step": 330, "valid_targets_mean": 5501.4, "valid_targets_min": 414 }, { "epoch": 3.988095238095238, "grad_norm": 0.25893042450210946, "learning_rate": 1.8753672982358055e-05, "loss": 0.2617, "loss_nan_ranks": 0, "loss_rank_avg": 0.08645622432231903, "step": 335, "valid_targets_mean": 5373.1, "valid_targets_min": 2838 }, { "epoch": 4.0476190476190474, "grad_norm": 0.25042985945546514, "learning_rate": 1.8161589870092842e-05, "loss": 0.2585, "loss_nan_ranks": 0, "loss_rank_avg": 0.08585283160209656, "step": 340, "valid_targets_mean": 5608.5, "valid_targets_min": 421 }, { "epoch": 4.107142857142857, "grad_norm": 0.26603784455504254, "learning_rate": 1.7571127593191877e-05, "loss": 0.252, "loss_nan_ranks": 0, "loss_rank_avg": 0.08927032351493835, "step": 345, "valid_targets_mean": 5348.3, "valid_targets_min": 435 }, { "epoch": 4.166666666666667, "grad_norm": 0.2808689664261119, "learning_rate": 1.6982806733066303e-05, "loss": 0.2544, "loss_nan_ranks": 0, "loss_rank_avg": 0.08572648465633392, "step": 350, "valid_targets_mean": 5795.5, "valid_targets_min": 1835 }, { "epoch": 4.226190476190476, "grad_norm": 0.2584899797868476, "learning_rate": 1.639714598314588e-05, "loss": 0.2524, "loss_nan_ranks": 0, "loss_rank_avg": 0.07797005027532578, "step": 355, "valid_targets_mean": 4981.5, "valid_targets_min": 264 }, { "epoch": 4.285714285714286, "grad_norm": 0.2485126801540494, "learning_rate": 1.5814661691572673e-05, "loss": 0.2534, "loss_nan_ranks": 0, "loss_rank_avg": 0.08520189672708511, "step": 360, "valid_targets_mean": 6290.2, "valid_targets_min": 1974 }, { "epoch": 4.345238095238095, "grad_norm": 0.2513271250286924, "learning_rate": 1.5235867405962397e-05, "loss": 0.2562, "loss_nan_ranks": 0, "loss_rank_avg": 0.09782940149307251, "step": 365, "valid_targets_mean": 6345.8, "valid_targets_min": 335 }, { "epoch": 4.404761904761905, "grad_norm": 0.250961969245666, "learning_rate": 1.4661273420634836e-05, "loss": 0.2557, "loss_nan_ranks": 0, "loss_rank_avg": 0.09486376494169235, "step": 370, "valid_targets_mean": 5938.4, "valid_targets_min": 2327 }, { "epoch": 4.464285714285714, "grad_norm": 0.24731218596658003, "learning_rate": 1.4091386326712599e-05, "loss": 0.2534, "loss_nan_ranks": 0, "loss_rank_avg": 0.0861523300409317, "step": 375, "valid_targets_mean": 5634.8, "valid_targets_min": 349 }, { "epoch": 4.523809523809524, "grad_norm": 0.24844990717404133, "learning_rate": 1.3526708565484726e-05, "loss": 0.2555, "loss_nan_ranks": 0, "loss_rank_avg": 0.08229143917560577, "step": 380, "valid_targets_mean": 5422.2, "valid_targets_min": 1875 }, { "epoch": 4.583333333333333, "grad_norm": 0.2333334277350641, "learning_rate": 1.2967737985429041e-05, "loss": 0.2506, "loss_nan_ranks": 0, "loss_rank_avg": 0.08357102423906326, "step": 385, "valid_targets_mean": 5235.4, "valid_targets_min": 1760 }, { "epoch": 4.642857142857143, "grad_norm": 0.25215688549131215, "learning_rate": 1.2414967403283776e-05, "loss": 0.2491, "loss_nan_ranks": 0, "loss_rank_avg": 0.08776518702507019, "step": 390, "valid_targets_mean": 5544.2, "valid_targets_min": 464 }, { "epoch": 4.7023809523809526, "grad_norm": 0.2293231506390837, "learning_rate": 1.1868884169555353e-05, "loss": 0.2525, "loss_nan_ranks": 0, "loss_rank_avg": 0.07636909186840057, "step": 395, "valid_targets_mean": 5233.3, "valid_targets_min": 308 }, { "epoch": 4.761904761904762, "grad_norm": 0.2664616026061766, "learning_rate": 1.1329969738845605e-05, "loss": 0.258, "loss_nan_ranks": 0, "loss_rank_avg": 0.08857310563325882, "step": 400, "valid_targets_mean": 5488.7, "valid_targets_min": 2368 }, { "epoch": 4.821428571428571, "grad_norm": 0.22965867799638506, "learning_rate": 1.0798699245376959e-05, "loss": 0.2578, "loss_nan_ranks": 0, "loss_rank_avg": 0.09623508155345917, "step": 405, "valid_targets_mean": 6153.3, "valid_targets_min": 2642 }, { "epoch": 4.880952380952381, "grad_norm": 0.23714434789026348, "learning_rate": 1.0275541084090127e-05, "loss": 0.2492, "loss_nan_ranks": 0, "loss_rank_avg": 0.09711508452892303, "step": 410, "valid_targets_mean": 6321.8, "valid_targets_min": 2856 }, { "epoch": 4.940476190476191, "grad_norm": 0.2335193268265453, "learning_rate": 9.760956497683412e-06, "loss": 0.2555, "loss_nan_ranks": 0, "loss_rank_avg": 0.08174271881580353, "step": 415, "valid_targets_mean": 5300.7, "valid_targets_min": 222 }, { "epoch": 5.0, "grad_norm": 0.24720488171742358, "learning_rate": 9.255399169957823e-06, "loss": 0.2522, "loss_nan_ranks": 0, "loss_rank_avg": 0.09777113795280457, "step": 420, "valid_targets_mean": 5800.2, "valid_targets_min": 2100 }, { "epoch": 5.059523809523809, "grad_norm": 0.24642098972509158, "learning_rate": 8.759314825826486e-06, "loss": 0.2524, "loss_nan_ranks": 0, "loss_rank_avg": 0.08720912039279938, "step": 425, "valid_targets_mean": 5300.3, "valid_targets_min": 1875 }, { "epoch": 5.119047619047619, "grad_norm": 0.2571302114424818, "learning_rate": 8.273140838341003e-06, "loss": 0.2535, "loss_nan_ranks": 0, "loss_rank_avg": 0.07905574142932892, "step": 430, "valid_targets_mean": 5588.5, "valid_targets_min": 355 }, { "epoch": 5.178571428571429, "grad_norm": 0.242878252474619, "learning_rate": 7.797305843081255e-06, "loss": 0.2555, "loss_nan_ranks": 0, "loss_rank_avg": 0.07647094130516052, "step": 435, "valid_targets_mean": 5138.4, "valid_targets_min": 449 }, { "epoch": 5.238095238095238, "grad_norm": 0.23526010582884302, "learning_rate": 7.332229360248597e-06, "loss": 0.2501, "loss_nan_ranks": 0, "loss_rank_avg": 0.07705177366733551, "step": 440, "valid_targets_mean": 5300.1, "valid_targets_min": 316 }, { "epoch": 5.2976190476190474, "grad_norm": 0.25015849653867017, "learning_rate": 6.87832142479562e-06, "loss": 0.2473, "loss_nan_ranks": 0, "loss_rank_avg": 0.07398971170186996, "step": 445, "valid_targets_mean": 4832.6, "valid_targets_min": 386 }, { "epoch": 5.357142857142857, "grad_norm": 0.2378763042427905, "learning_rate": 6.4359822249185934e-06, "loss": 0.2481, "loss_nan_ranks": 0, "loss_rank_avg": 0.08934387564659119, "step": 450, "valid_targets_mean": 5822.2, "valid_targets_min": 1976 }, { "epoch": 5.416666666666667, "grad_norm": 0.2205842931707241, "learning_rate": 6.005601749231318e-06, "loss": 0.2495, "loss_nan_ranks": 0, "loss_rank_avg": 0.08240607380867004, "step": 455, "valid_targets_mean": 5815.5, "valid_targets_min": 2105 }, { "epoch": 5.476190476190476, "grad_norm": 0.23001451637384798, "learning_rate": 5.587559442931429e-06, "loss": 0.2478, "loss_nan_ranks": 0, "loss_rank_avg": 0.09403956681489944, "step": 460, "valid_targets_mean": 5927.9, "valid_targets_min": 1898 }, { "epoch": 5.535714285714286, "grad_norm": 0.2457826876030719, "learning_rate": 5.18222387326232e-06, "loss": 0.2451, "loss_nan_ranks": 0, "loss_rank_avg": 0.07478703558444977, "step": 465, "valid_targets_mean": 5302.4, "valid_targets_min": 2446 }, { "epoch": 5.595238095238095, "grad_norm": 0.23812933696457267, "learning_rate": 4.7899524045656186e-06, "loss": 0.2456, "loss_nan_ranks": 0, "loss_rank_avg": 0.07499450445175171, "step": 470, "valid_targets_mean": 5108.1, "valid_targets_min": 429 }, { "epoch": 5.654761904761905, "grad_norm": 0.2369372815448469, "learning_rate": 4.411090883210684e-06, "loss": 0.2469, "loss_nan_ranks": 0, "loss_rank_avg": 0.07033229619264603, "step": 475, "valid_targets_mean": 5013.0, "valid_targets_min": 576 }, { "epoch": 5.714285714285714, "grad_norm": 0.23807774857438638, "learning_rate": 4.0459733326790055e-06, "loss": 0.2463, "loss_nan_ranks": 0, "loss_rank_avg": 0.08270242065191269, "step": 480, "valid_targets_mean": 5699.3, "valid_targets_min": 457 }, { "epoch": 5.773809523809524, "grad_norm": 0.24524035384617784, "learning_rate": 3.6949216590721506e-06, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.08760789036750793, "step": 485, "valid_targets_mean": 5810.1, "valid_targets_min": 2306 }, { "epoch": 5.833333333333333, "grad_norm": 0.251686432685376, "learning_rate": 3.3582453673030923e-06, "loss": 0.2447, "loss_nan_ranks": 0, "loss_rank_avg": 0.08097212016582489, "step": 490, "valid_targets_mean": 4964.3, "valid_targets_min": 319 }, { "epoch": 5.892857142857143, "grad_norm": 0.2235799505428903, "learning_rate": 3.0362412882210156e-06, "loss": 0.2522, "loss_nan_ranks": 0, "loss_rank_avg": 0.08248492330312729, "step": 495, "valid_targets_mean": 5602.2, "valid_targets_min": 2422 }, { "epoch": 5.9523809523809526, "grad_norm": 0.23565923364897154, "learning_rate": 2.72919331691021e-06, "loss": 0.2464, "loss_nan_ranks": 0, "loss_rank_avg": 0.07904832065105438, "step": 500, "valid_targets_mean": 5078.7, "valid_targets_min": 426 }, { "epoch": 6.011904761904762, "grad_norm": 0.23856362236717238, "learning_rate": 2.437372162393834e-06, "loss": 0.2533, "loss_nan_ranks": 0, "loss_rank_avg": 0.09080533683300018, "step": 505, "valid_targets_mean": 5782.2, "valid_targets_min": 376 }, { "epoch": 6.071428571428571, "grad_norm": 0.23251648798736518, "learning_rate": 2.16103510896313e-06, "loss": 0.2501, "loss_nan_ranks": 0, "loss_rank_avg": 0.08888229727745056, "step": 510, "valid_targets_mean": 6173.2, "valid_targets_min": 1828 }, { "epoch": 6.130952380952381, "grad_norm": 0.2277877587137689, "learning_rate": 1.9004257893425726e-06, "loss": 0.2506, "loss_nan_ranks": 0, "loss_rank_avg": 0.08594680577516556, "step": 515, "valid_targets_mean": 5924.8, "valid_targets_min": 311 }, { "epoch": 6.190476190476191, "grad_norm": 0.234192139822292, "learning_rate": 1.6557739698909436e-06, "loss": 0.2413, "loss_nan_ranks": 0, "loss_rank_avg": 0.0754304900765419, "step": 520, "valid_targets_mean": 4934.6, "valid_targets_min": 290 }, { "epoch": 6.25, "grad_norm": 0.22373772258265975, "learning_rate": 1.4272953480276774e-06, "loss": 0.2479, "loss_nan_ranks": 0, "loss_rank_avg": 0.08419759571552277, "step": 525, "valid_targets_mean": 6025.2, "valid_targets_min": 1811 }, { "epoch": 6.309523809523809, "grad_norm": 0.23039419023997973, "learning_rate": 1.215191362063124e-06, "loss": 0.2469, "loss_nan_ranks": 0, "loss_rank_avg": 0.07240813970565796, "step": 530, "valid_targets_mean": 4975.7, "valid_targets_min": 269 }, { "epoch": 6.369047619047619, "grad_norm": 0.22205982567688556, "learning_rate": 1.0196490136003322e-06, "loss": 0.2454, "loss_nan_ranks": 0, "loss_rank_avg": 0.07454642653465271, "step": 535, "valid_targets_mean": 5364.7, "valid_targets_min": 367 }, { "epoch": 6.428571428571429, "grad_norm": 0.25426169519831565, "learning_rate": 8.408407026649778e-07, "loss": 0.2505, "loss_nan_ranks": 0, "loss_rank_avg": 0.0795418918132782, "step": 540, "valid_targets_mean": 5291.6, "valid_targets_min": 264 }, { "epoch": 6.488095238095238, "grad_norm": 0.22890244722184025, "learning_rate": 6.789240757087823e-07, "loss": 0.2462, "loss_nan_ranks": 0, "loss_rank_avg": 0.08053529262542725, "step": 545, "valid_targets_mean": 5164.7, "valid_targets_min": 434 }, { "epoch": 6.5476190476190474, "grad_norm": 0.2258429317576449, "learning_rate": 5.340418866204177e-07, "loss": 0.2434, "loss_nan_ranks": 0, "loss_rank_avg": 0.08423105627298355, "step": 550, "valid_targets_mean": 5882.1, "valid_targets_min": 1985 }, { "epoch": 6.607142857142857, "grad_norm": 0.2308750096320961, "learning_rate": 4.063218708664751e-07, "loss": 0.2513, "loss_nan_ranks": 0, "loss_rank_avg": 0.09404976665973663, "step": 555, "valid_targets_mean": 6039.2, "valid_targets_min": 2171 }, { "epoch": 6.666666666666667, "grad_norm": 0.30684656384369974, "learning_rate": 2.9587663287340864e-07, "loss": 0.2463, "loss_nan_ranks": 0, "loss_rank_avg": 0.07616351544857025, "step": 560, "valid_targets_mean": 5435.8, "valid_targets_min": 509 }, { "epoch": 6.726190476190476, "grad_norm": 0.24369553807123656, "learning_rate": 2.0280354674976576e-07, "loss": 0.2457, "loss_nan_ranks": 0, "loss_rank_avg": 0.08108322322368622, "step": 565, "valid_targets_mean": 5453.7, "valid_targets_min": 308 }, { "epoch": 6.785714285714286, "grad_norm": 0.22143496142468144, "learning_rate": 1.2718467043626448e-07, "loss": 0.2486, "loss_nan_ranks": 0, "loss_rank_avg": 0.08642816543579102, "step": 570, "valid_targets_mean": 5529.3, "valid_targets_min": 168 }, { "epoch": 6.845238095238095, "grad_norm": 0.22289015044632118, "learning_rate": 6.90866733593465e-08, "loss": 0.2464, "loss_nan_ranks": 0, "loss_rank_avg": 0.07365033030509949, "step": 575, "valid_targets_mean": 5419.3, "valid_targets_min": 327 }, { "epoch": 6.904761904761905, "grad_norm": 0.24055593132796596, "learning_rate": 2.856077765205356e-08, "loss": 0.2489, "loss_nan_ranks": 0, "loss_rank_avg": 0.09108748286962509, "step": 580, "valid_targets_mean": 5626.8, "valid_targets_min": 1730 }, { "epoch": 6.964285714285714, "grad_norm": 0.23554247965669858, "learning_rate": 5.642712993993993e-09, "loss": 0.2481, "loss_nan_ranks": 0, "loss_rank_avg": 0.0795075073838234, "step": 585, "valid_targets_mean": 5410.0, "valid_targets_min": 423 }, { "epoch": 7.0, "loss_nan_ranks": 0, "loss_rank_avg": 0.08444557338953018, "step": 588, "total_flos": 2.4214382862254735e+18, "train_loss": 0.2928499853124424, "train_runtime": 13980.8507, "train_samples_per_second": 4.035, "train_steps_per_second": 0.042, "valid_targets_mean": 5498.0, "valid_targets_min": 368 } ], "logging_steps": 5, "max_steps": 588, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4214382862254735e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }