Files
Kimi-2.5-swesmith-r2egym-so…/trainer_state.json

1335 lines
37 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 588,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05952380952380952,
"grad_norm": 26.475136303319733,
"learning_rate": 2.7118644067796613e-06,
"loss": 0.8757,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.28182709217071533,
"step": 5,
"valid_targets_mean": 5259.6,
"valid_targets_min": 241
},
{
"epoch": 0.11904761904761904,
"grad_norm": 10.752364491117543,
"learning_rate": 6.1016949152542385e-06,
"loss": 0.7723,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.23036649823188782,
"step": 10,
"valid_targets_mean": 5471.6,
"valid_targets_min": 308
},
{
"epoch": 0.17857142857142858,
"grad_norm": 2.0975582533943844,
"learning_rate": 9.491525423728815e-06,
"loss": 0.5988,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1777261197566986,
"step": 15,
"valid_targets_mean": 5108.2,
"valid_targets_min": 437
},
{
"epoch": 0.23809523809523808,
"grad_norm": 1.4703223243635486,
"learning_rate": 1.288135593220339e-05,
"loss": 0.548,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1800306737422943,
"step": 20,
"valid_targets_mean": 5685.6,
"valid_targets_min": 102
},
{
"epoch": 0.2976190476190476,
"grad_norm": 0.8534483461368364,
"learning_rate": 1.6271186440677967e-05,
"loss": 0.5087,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14695829153060913,
"step": 25,
"valid_targets_mean": 4944.7,
"valid_targets_min": 2418
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.582534993586709,
"learning_rate": 1.9661016949152545e-05,
"loss": 0.4839,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.16712947189807892,
"step": 30,
"valid_targets_mean": 6107.1,
"valid_targets_min": 2487
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.49806474628335173,
"learning_rate": 2.3050847457627122e-05,
"loss": 0.4524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.15718674659729004,
"step": 35,
"valid_targets_mean": 6053.5,
"valid_targets_min": 439
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.39977189555606873,
"learning_rate": 2.6440677966101696e-05,
"loss": 0.4284,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.13855797052383423,
"step": 40,
"valid_targets_mean": 5599.5,
"valid_targets_min": 2254
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.35501868807640347,
"learning_rate": 2.9830508474576274e-05,
"loss": 0.4083,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1349203884601593,
"step": 45,
"valid_targets_mean": 6183.1,
"valid_targets_min": 231
},
{
"epoch": 0.5952380952380952,
"grad_norm": 0.3120105713672939,
"learning_rate": 3.322033898305085e-05,
"loss": 0.3911,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.14311061799526215,
"step": 50,
"valid_targets_mean": 5855.2,
"valid_targets_min": 2956
},
{
"epoch": 0.6547619047619048,
"grad_norm": 0.2727710443190103,
"learning_rate": 3.6610169491525426e-05,
"loss": 0.3768,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.12037570774555206,
"step": 55,
"valid_targets_mean": 5236.8,
"valid_targets_min": 354
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.2757726260661749,
"learning_rate": 4e-05,
"loss": 0.3573,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11726392805576324,
"step": 60,
"valid_targets_mean": 5693.2,
"valid_targets_min": 229
},
{
"epoch": 0.7738095238095238,
"grad_norm": 0.2786546982169116,
"learning_rate": 3.9991183494162245e-05,
"loss": 0.3501,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11834583431482315,
"step": 65,
"valid_targets_mean": 6120.8,
"valid_targets_min": 2344
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.23636721617724882,
"learning_rate": 3.996474174972647e-05,
"loss": 0.3417,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11998672783374786,
"step": 70,
"valid_targets_mean": 5844.7,
"valid_targets_min": 2484
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.24309351059305612,
"learning_rate": 3.9920698079072125e-05,
"loss": 0.3344,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10884048044681549,
"step": 75,
"valid_targets_mean": 5633.0,
"valid_targets_min": 494
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.2472743814168574,
"learning_rate": 3.9859091313327124e-05,
"loss": 0.3289,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09308268129825592,
"step": 80,
"valid_targets_mean": 5145.1,
"valid_targets_min": 2288
},
{
"epoch": 1.0119047619047619,
"grad_norm": 0.2635411800539745,
"learning_rate": 3.977997576813247e-05,
"loss": 0.3198,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.11220236867666245,
"step": 85,
"valid_targets_mean": 5678.7,
"valid_targets_min": 2563
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.2880444080728388,
"learning_rate": 3.968342119575477e-05,
"loss": 0.3123,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10120871663093567,
"step": 90,
"valid_targets_mean": 5122.5,
"valid_targets_min": 2218
},
{
"epoch": 1.130952380952381,
"grad_norm": 0.2710755850501796,
"learning_rate": 3.956951272358911e-05,
"loss": 0.3128,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10858584940433502,
"step": 95,
"valid_targets_mean": 5793.9,
"valid_targets_min": 2072
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.2559535005305801,
"learning_rate": 3.943835077910648e-05,
"loss": 0.3078,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08397121727466583,
"step": 100,
"valid_targets_mean": 4776.5,
"valid_targets_min": 376
},
{
"epoch": 1.25,
"grad_norm": 0.2362436236599847,
"learning_rate": 3.9290051001311815e-05,
"loss": 0.3036,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10915631055831909,
"step": 105,
"valid_targets_mean": 6117.2,
"valid_targets_min": 1862
},
{
"epoch": 1.3095238095238095,
"grad_norm": 0.2852961689327156,
"learning_rate": 3.912474413879077e-05,
"loss": 0.3055,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09754365682601929,
"step": 110,
"valid_targets_mean": 5148.4,
"valid_targets_min": 436
},
{
"epoch": 1.369047619047619,
"grad_norm": 0.2794753115317547,
"learning_rate": 3.894257593443519e-05,
"loss": 0.2956,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10125661641359329,
"step": 115,
"valid_targets_mean": 6039.0,
"valid_targets_min": 494
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.26532864715105864,
"learning_rate": 3.874370699694878e-05,
"loss": 0.298,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10564189404249191,
"step": 120,
"valid_targets_mean": 6079.8,
"valid_targets_min": 471
},
{
"epoch": 1.4880952380952381,
"grad_norm": 0.29837003528349804,
"learning_rate": 3.8528312659246395e-05,
"loss": 0.2976,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09849759191274643,
"step": 125,
"valid_targets_mean": 5342.6,
"valid_targets_min": 1368
},
{
"epoch": 1.5476190476190477,
"grad_norm": 0.23646296987288887,
"learning_rate": 3.8296582823871596e-05,
"loss": 0.2975,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09054168313741684,
"step": 130,
"valid_targets_mean": 4966.0,
"valid_targets_min": 220
},
{
"epoch": 1.6071428571428572,
"grad_norm": 0.2618136463189114,
"learning_rate": 3.804872179556902e-05,
"loss": 0.2944,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09321890771389008,
"step": 135,
"valid_targets_mean": 4762.0,
"valid_targets_min": 165
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.2708942100931431,
"learning_rate": 3.778494810115896e-05,
"loss": 0.2938,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1079484075307846,
"step": 140,
"valid_targets_mean": 5604.2,
"valid_targets_min": 451
},
{
"epoch": 1.7261904761904763,
"grad_norm": 0.2789099088534343,
"learning_rate": 3.750549429687309e-05,
"loss": 0.2933,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08427592366933823,
"step": 145,
"valid_targets_mean": 5146.2,
"valid_targets_min": 427
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.2678992582042396,
"learning_rate": 3.721060676332109e-05,
"loss": 0.2962,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10124705731868744,
"step": 150,
"valid_targets_mean": 5797.0,
"valid_targets_min": 389
},
{
"epoch": 1.8452380952380953,
"grad_norm": 0.2684299288852733,
"learning_rate": 3.6900545488269066e-05,
"loss": 0.2894,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0845484584569931,
"step": 155,
"valid_targets_mean": 5063.4,
"valid_targets_min": 445
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.2833413534409504,
"learning_rate": 3.657558383742117e-05,
"loss": 0.2919,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09527384489774704,
"step": 160,
"valid_targets_mean": 5275.2,
"valid_targets_min": 308
},
{
"epoch": 1.9642857142857144,
"grad_norm": 0.28986568017083997,
"learning_rate": 3.6236008313406594e-05,
"loss": 0.2871,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10269998013973236,
"step": 165,
"valid_targets_mean": 5881.0,
"valid_targets_min": 386
},
{
"epoch": 2.0238095238095237,
"grad_norm": 0.27082985663787124,
"learning_rate": 3.58821183031843e-05,
"loss": 0.2813,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09580333530902863,
"step": 170,
"valid_targets_mean": 5845.4,
"valid_targets_min": 1920
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.27912438131905437,
"learning_rate": 3.55142258140884e-05,
"loss": 0.2816,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0991968959569931,
"step": 175,
"valid_targets_mean": 6037.7,
"valid_targets_min": 2878
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.2495296583091803,
"learning_rate": 3.513265519874668e-05,
"loss": 0.2779,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0991060882806778,
"step": 180,
"valid_targets_mean": 5959.1,
"valid_targets_min": 470
},
{
"epoch": 2.2023809523809526,
"grad_norm": 0.24797503017191885,
"learning_rate": 3.473774286911489e-05,
"loss": 0.2784,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10953380167484283,
"step": 185,
"valid_targets_mean": 6153.2,
"valid_targets_min": 3118
},
{
"epoch": 2.261904761904762,
"grad_norm": 0.2635797062915291,
"learning_rate": 3.432983699987901e-05,
"loss": 0.2785,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07885843515396118,
"step": 190,
"valid_targets_mean": 4745.1,
"valid_targets_min": 335
},
{
"epoch": 2.3214285714285716,
"grad_norm": 0.27743273802507495,
"learning_rate": 3.390929722148677e-05,
"loss": 0.2786,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08541278541088104,
"step": 195,
"valid_targets_mean": 5393.5,
"valid_targets_min": 2005
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.27104625586587555,
"learning_rate": 3.3476494303079285e-05,
"loss": 0.2778,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08726505935192108,
"step": 200,
"valid_targets_mean": 5373.0,
"valid_targets_min": 257
},
{
"epoch": 2.4404761904761907,
"grad_norm": 0.25680629085820356,
"learning_rate": 3.303180982560224e-05,
"loss": 0.2746,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08836235105991364,
"step": 205,
"valid_targets_mean": 5117.2,
"valid_targets_min": 361
},
{
"epoch": 2.5,
"grad_norm": 0.2722466936160436,
"learning_rate": 3.2575635845384787e-05,
"loss": 0.2746,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09811330586671829,
"step": 210,
"valid_targets_mean": 5944.3,
"valid_targets_min": 2473
},
{
"epoch": 2.5595238095238093,
"grad_norm": 0.2751403433864084,
"learning_rate": 3.21083745484829e-05,
"loss": 0.2707,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07897627353668213,
"step": 215,
"valid_targets_mean": 4240.3,
"valid_targets_min": 222
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.2862317925607407,
"learning_rate": 3.1630437896091756e-05,
"loss": 0.2792,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08746589720249176,
"step": 220,
"valid_targets_mean": 5690.9,
"valid_targets_min": 440
},
{
"epoch": 2.678571428571429,
"grad_norm": 0.2813165779096547,
"learning_rate": 3.114224726133996e-05,
"loss": 0.273,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0828898623585701,
"step": 225,
"valid_targets_mean": 4970.9,
"valid_targets_min": 1760
},
{
"epoch": 2.738095238095238,
"grad_norm": 0.27056484197253816,
"learning_rate": 3.0644233057785615e-05,
"loss": 0.2706,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08945001661777496,
"step": 230,
"valid_targets_mean": 5406.1,
"valid_targets_min": 426
},
{
"epoch": 2.7976190476190474,
"grad_norm": 0.2685779777737954,
"learning_rate": 3.0136834359942032e-05,
"loss": 0.2684,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08828261494636536,
"step": 235,
"valid_targets_mean": 5503.7,
"valid_targets_min": 368
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.2732406218897489,
"learning_rate": 2.9620498516167356e-05,
"loss": 0.2717,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07858332991600037,
"step": 240,
"valid_targets_mean": 5290.2,
"valid_targets_min": 2127
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.2399472243045982,
"learning_rate": 2.9095680754259687e-05,
"loss": 0.2677,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08259420096874237,
"step": 245,
"valid_targets_mean": 4813.6,
"valid_targets_min": 409
},
{
"epoch": 2.9761904761904763,
"grad_norm": 0.24882382594549618,
"learning_rate": 2.8562843780105182e-05,
"loss": 0.2702,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10204588621854782,
"step": 250,
"valid_targets_mean": 6258.4,
"valid_targets_min": 1978
},
{
"epoch": 3.0357142857142856,
"grad_norm": 0.24889920345299785,
"learning_rate": 2.8022457369733165e-05,
"loss": 0.2662,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08854080736637115,
"step": 255,
"valid_targets_mean": 6058.7,
"valid_targets_min": 437
},
{
"epoch": 3.0952380952380953,
"grad_norm": 0.26589186213113003,
"learning_rate": 2.7474997955137803e-05,
"loss": 0.2625,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07796312123537064,
"step": 260,
"valid_targets_mean": 5013.2,
"valid_targets_min": 416
},
{
"epoch": 3.1547619047619047,
"grad_norm": 0.2892003595844532,
"learning_rate": 2.6920948204231573e-05,
"loss": 0.2658,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0770723819732666,
"step": 265,
"valid_targets_mean": 4645.8,
"valid_targets_min": 424
},
{
"epoch": 3.2142857142857144,
"grad_norm": 0.29019028159438703,
"learning_rate": 2.636079659530079e-05,
"loss": 0.265,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08780994266271591,
"step": 270,
"valid_targets_mean": 5075.2,
"valid_targets_min": 311
},
{
"epoch": 3.2738095238095237,
"grad_norm": 0.25707211946258385,
"learning_rate": 2.5795036986338477e-05,
"loss": 0.2656,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07921002805233002,
"step": 275,
"valid_targets_mean": 4983.1,
"valid_targets_min": 259
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.2393833152778856,
"learning_rate": 2.522416817963416e-05,
"loss": 0.2615,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08294697105884552,
"step": 280,
"valid_targets_mean": 5771.5,
"valid_targets_min": 3118
},
{
"epoch": 3.392857142857143,
"grad_norm": 0.25514689276071223,
"learning_rate": 2.464869348200452e-05,
"loss": 0.2592,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09682682156562805,
"step": 285,
"valid_targets_mean": 5856.5,
"valid_targets_min": 487
},
{
"epoch": 3.4523809523809526,
"grad_norm": 0.25666137663935856,
"learning_rate": 2.4069120261052682e-05,
"loss": 0.2658,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.10615084320306778,
"step": 290,
"valid_targets_mean": 6388.9,
"valid_targets_min": 2484
},
{
"epoch": 3.511904761904762,
"grad_norm": 0.24250690776748496,
"learning_rate": 2.3485959497847223e-05,
"loss": 0.2583,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07441714406013489,
"step": 295,
"valid_targets_mean": 5149.7,
"valid_targets_min": 396
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.2605704598646807,
"learning_rate": 2.2899725336415468e-05,
"loss": 0.2619,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08115138858556747,
"step": 300,
"valid_targets_mean": 5677.9,
"valid_targets_min": 2348
},
{
"epoch": 3.630952380952381,
"grad_norm": 0.24661982746609362,
"learning_rate": 2.2310934630448076e-05,
"loss": 0.2598,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08594545722007751,
"step": 305,
"valid_targets_mean": 5679.9,
"valid_targets_min": 1757
},
{
"epoch": 3.6904761904761907,
"grad_norm": 0.25915778890419666,
"learning_rate": 2.1720106487614678e-05,
"loss": 0.2564,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09154875576496124,
"step": 310,
"valid_targets_mean": 5870.5,
"valid_targets_min": 2366
},
{
"epoch": 3.75,
"grad_norm": 0.2517589506891814,
"learning_rate": 2.112776181189232e-05,
"loss": 0.2622,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08415160328149796,
"step": 315,
"valid_targets_mean": 5251.9,
"valid_targets_min": 401
},
{
"epoch": 3.8095238095238093,
"grad_norm": 0.2700239209259606,
"learning_rate": 2.0534422844310144e-05,
"loss": 0.2644,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07980450242757797,
"step": 320,
"valid_targets_mean": 4789.0,
"valid_targets_min": 321
},
{
"epoch": 3.869047619047619,
"grad_norm": 0.2549269120465629,
"learning_rate": 1.9940612702515292e-05,
"loss": 0.2574,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.1039179265499115,
"step": 325,
"valid_targets_mean": 6290.8,
"valid_targets_min": 515
},
{
"epoch": 3.928571428571429,
"grad_norm": 0.28891001950049783,
"learning_rate": 1.934685491956595e-05,
"loss": 0.2585,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08675411343574524,
"step": 330,
"valid_targets_mean": 5501.4,
"valid_targets_min": 414
},
{
"epoch": 3.988095238095238,
"grad_norm": 0.25893042450210946,
"learning_rate": 1.8753672982358055e-05,
"loss": 0.2617,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08645622432231903,
"step": 335,
"valid_targets_mean": 5373.1,
"valid_targets_min": 2838
},
{
"epoch": 4.0476190476190474,
"grad_norm": 0.25042985945546514,
"learning_rate": 1.8161589870092842e-05,
"loss": 0.2585,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08585283160209656,
"step": 340,
"valid_targets_mean": 5608.5,
"valid_targets_min": 421
},
{
"epoch": 4.107142857142857,
"grad_norm": 0.26603784455504254,
"learning_rate": 1.7571127593191877e-05,
"loss": 0.252,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08927032351493835,
"step": 345,
"valid_targets_mean": 5348.3,
"valid_targets_min": 435
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.2808689664261119,
"learning_rate": 1.6982806733066303e-05,
"loss": 0.2544,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08572648465633392,
"step": 350,
"valid_targets_mean": 5795.5,
"valid_targets_min": 1835
},
{
"epoch": 4.226190476190476,
"grad_norm": 0.2584899797868476,
"learning_rate": 1.639714598314588e-05,
"loss": 0.2524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07797005027532578,
"step": 355,
"valid_targets_mean": 4981.5,
"valid_targets_min": 264
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.2485126801540494,
"learning_rate": 1.5814661691572673e-05,
"loss": 0.2534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08520189672708511,
"step": 360,
"valid_targets_mean": 6290.2,
"valid_targets_min": 1974
},
{
"epoch": 4.345238095238095,
"grad_norm": 0.2513271250286924,
"learning_rate": 1.5235867405962397e-05,
"loss": 0.2562,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09782940149307251,
"step": 365,
"valid_targets_mean": 6345.8,
"valid_targets_min": 335
},
{
"epoch": 4.404761904761905,
"grad_norm": 0.250961969245666,
"learning_rate": 1.4661273420634836e-05,
"loss": 0.2557,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09486376494169235,
"step": 370,
"valid_targets_mean": 5938.4,
"valid_targets_min": 2327
},
{
"epoch": 4.464285714285714,
"grad_norm": 0.24731218596658003,
"learning_rate": 1.4091386326712599e-05,
"loss": 0.2534,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0861523300409317,
"step": 375,
"valid_targets_mean": 5634.8,
"valid_targets_min": 349
},
{
"epoch": 4.523809523809524,
"grad_norm": 0.24844990717404133,
"learning_rate": 1.3526708565484726e-05,
"loss": 0.2555,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08229143917560577,
"step": 380,
"valid_targets_mean": 5422.2,
"valid_targets_min": 1875
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.2333334277350641,
"learning_rate": 1.2967737985429041e-05,
"loss": 0.2506,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08357102423906326,
"step": 385,
"valid_targets_mean": 5235.4,
"valid_targets_min": 1760
},
{
"epoch": 4.642857142857143,
"grad_norm": 0.25215688549131215,
"learning_rate": 1.2414967403283776e-05,
"loss": 0.2491,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08776518702507019,
"step": 390,
"valid_targets_mean": 5544.2,
"valid_targets_min": 464
},
{
"epoch": 4.7023809523809526,
"grad_norm": 0.2293231506390837,
"learning_rate": 1.1868884169555353e-05,
"loss": 0.2525,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07636909186840057,
"step": 395,
"valid_targets_mean": 5233.3,
"valid_targets_min": 308
},
{
"epoch": 4.761904761904762,
"grad_norm": 0.2664616026061766,
"learning_rate": 1.1329969738845605e-05,
"loss": 0.258,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08857310563325882,
"step": 400,
"valid_targets_mean": 5488.7,
"valid_targets_min": 2368
},
{
"epoch": 4.821428571428571,
"grad_norm": 0.22965867799638506,
"learning_rate": 1.0798699245376959e-05,
"loss": 0.2578,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09623508155345917,
"step": 405,
"valid_targets_mean": 6153.3,
"valid_targets_min": 2642
},
{
"epoch": 4.880952380952381,
"grad_norm": 0.23714434789026348,
"learning_rate": 1.0275541084090127e-05,
"loss": 0.2492,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09711508452892303,
"step": 410,
"valid_targets_mean": 6321.8,
"valid_targets_min": 2856
},
{
"epoch": 4.940476190476191,
"grad_norm": 0.2335193268265453,
"learning_rate": 9.760956497683412e-06,
"loss": 0.2555,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08174271881580353,
"step": 415,
"valid_targets_mean": 5300.7,
"valid_targets_min": 222
},
{
"epoch": 5.0,
"grad_norm": 0.24720488171742358,
"learning_rate": 9.255399169957823e-06,
"loss": 0.2522,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09777113795280457,
"step": 420,
"valid_targets_mean": 5800.2,
"valid_targets_min": 2100
},
{
"epoch": 5.059523809523809,
"grad_norm": 0.24642098972509158,
"learning_rate": 8.759314825826486e-06,
"loss": 0.2524,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08720912039279938,
"step": 425,
"valid_targets_mean": 5300.3,
"valid_targets_min": 1875
},
{
"epoch": 5.119047619047619,
"grad_norm": 0.2571302114424818,
"learning_rate": 8.273140838341003e-06,
"loss": 0.2535,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07905574142932892,
"step": 430,
"valid_targets_mean": 5588.5,
"valid_targets_min": 355
},
{
"epoch": 5.178571428571429,
"grad_norm": 0.242878252474619,
"learning_rate": 7.797305843081255e-06,
"loss": 0.2555,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07647094130516052,
"step": 435,
"valid_targets_mean": 5138.4,
"valid_targets_min": 449
},
{
"epoch": 5.238095238095238,
"grad_norm": 0.23526010582884302,
"learning_rate": 7.332229360248597e-06,
"loss": 0.2501,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07705177366733551,
"step": 440,
"valid_targets_mean": 5300.1,
"valid_targets_min": 316
},
{
"epoch": 5.2976190476190474,
"grad_norm": 0.25015849653867017,
"learning_rate": 6.87832142479562e-06,
"loss": 0.2473,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07398971170186996,
"step": 445,
"valid_targets_mean": 4832.6,
"valid_targets_min": 386
},
{
"epoch": 5.357142857142857,
"grad_norm": 0.2378763042427905,
"learning_rate": 6.4359822249185934e-06,
"loss": 0.2481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08934387564659119,
"step": 450,
"valid_targets_mean": 5822.2,
"valid_targets_min": 1976
},
{
"epoch": 5.416666666666667,
"grad_norm": 0.2205842931707241,
"learning_rate": 6.005601749231318e-06,
"loss": 0.2495,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08240607380867004,
"step": 455,
"valid_targets_mean": 5815.5,
"valid_targets_min": 2105
},
{
"epoch": 5.476190476190476,
"grad_norm": 0.23001451637384798,
"learning_rate": 5.587559442931429e-06,
"loss": 0.2478,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09403956681489944,
"step": 460,
"valid_targets_mean": 5927.9,
"valid_targets_min": 1898
},
{
"epoch": 5.535714285714286,
"grad_norm": 0.2457826876030719,
"learning_rate": 5.18222387326232e-06,
"loss": 0.2451,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07478703558444977,
"step": 465,
"valid_targets_mean": 5302.4,
"valid_targets_min": 2446
},
{
"epoch": 5.595238095238095,
"grad_norm": 0.23812933696457267,
"learning_rate": 4.7899524045656186e-06,
"loss": 0.2456,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07499450445175171,
"step": 470,
"valid_targets_mean": 5108.1,
"valid_targets_min": 429
},
{
"epoch": 5.654761904761905,
"grad_norm": 0.2369372815448469,
"learning_rate": 4.411090883210684e-06,
"loss": 0.2469,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07033229619264603,
"step": 475,
"valid_targets_mean": 5013.0,
"valid_targets_min": 576
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.23807774857438638,
"learning_rate": 4.0459733326790055e-06,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08270242065191269,
"step": 480,
"valid_targets_mean": 5699.3,
"valid_targets_min": 457
},
{
"epoch": 5.773809523809524,
"grad_norm": 0.24524035384617784,
"learning_rate": 3.6949216590721506e-06,
"loss": 0.2523,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08760789036750793,
"step": 485,
"valid_targets_mean": 5810.1,
"valid_targets_min": 2306
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.251686432685376,
"learning_rate": 3.3582453673030923e-06,
"loss": 0.2447,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08097212016582489,
"step": 490,
"valid_targets_mean": 4964.3,
"valid_targets_min": 319
},
{
"epoch": 5.892857142857143,
"grad_norm": 0.2235799505428903,
"learning_rate": 3.0362412882210156e-06,
"loss": 0.2522,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08248492330312729,
"step": 495,
"valid_targets_mean": 5602.2,
"valid_targets_min": 2422
},
{
"epoch": 5.9523809523809526,
"grad_norm": 0.23565923364897154,
"learning_rate": 2.72919331691021e-06,
"loss": 0.2464,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07904832065105438,
"step": 500,
"valid_targets_mean": 5078.7,
"valid_targets_min": 426
},
{
"epoch": 6.011904761904762,
"grad_norm": 0.23856362236717238,
"learning_rate": 2.437372162393834e-06,
"loss": 0.2533,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09080533683300018,
"step": 505,
"valid_targets_mean": 5782.2,
"valid_targets_min": 376
},
{
"epoch": 6.071428571428571,
"grad_norm": 0.23251648798736518,
"learning_rate": 2.16103510896313e-06,
"loss": 0.2501,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08888229727745056,
"step": 510,
"valid_targets_mean": 6173.2,
"valid_targets_min": 1828
},
{
"epoch": 6.130952380952381,
"grad_norm": 0.2277877587137689,
"learning_rate": 1.9004257893425726e-06,
"loss": 0.2506,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08594680577516556,
"step": 515,
"valid_targets_mean": 5924.8,
"valid_targets_min": 311
},
{
"epoch": 6.190476190476191,
"grad_norm": 0.234192139822292,
"learning_rate": 1.6557739698909436e-06,
"loss": 0.2413,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0754304900765419,
"step": 520,
"valid_targets_mean": 4934.6,
"valid_targets_min": 290
},
{
"epoch": 6.25,
"grad_norm": 0.22373772258265975,
"learning_rate": 1.4272953480276774e-06,
"loss": 0.2479,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08419759571552277,
"step": 525,
"valid_targets_mean": 6025.2,
"valid_targets_min": 1811
},
{
"epoch": 6.309523809523809,
"grad_norm": 0.23039419023997973,
"learning_rate": 1.215191362063124e-06,
"loss": 0.2469,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07240813970565796,
"step": 530,
"valid_targets_mean": 4975.7,
"valid_targets_min": 269
},
{
"epoch": 6.369047619047619,
"grad_norm": 0.22205982567688556,
"learning_rate": 1.0196490136003322e-06,
"loss": 0.2454,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07454642653465271,
"step": 535,
"valid_targets_mean": 5364.7,
"valid_targets_min": 367
},
{
"epoch": 6.428571428571429,
"grad_norm": 0.25426169519831565,
"learning_rate": 8.408407026649778e-07,
"loss": 0.2505,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0795418918132782,
"step": 540,
"valid_targets_mean": 5291.6,
"valid_targets_min": 264
},
{
"epoch": 6.488095238095238,
"grad_norm": 0.22890244722184025,
"learning_rate": 6.789240757087823e-07,
"loss": 0.2462,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08053529262542725,
"step": 545,
"valid_targets_mean": 5164.7,
"valid_targets_min": 434
},
{
"epoch": 6.5476190476190474,
"grad_norm": 0.2258429317576449,
"learning_rate": 5.340418866204177e-07,
"loss": 0.2434,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08423105627298355,
"step": 550,
"valid_targets_mean": 5882.1,
"valid_targets_min": 1985
},
{
"epoch": 6.607142857142857,
"grad_norm": 0.2308750096320961,
"learning_rate": 4.063218708664751e-07,
"loss": 0.2513,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09404976665973663,
"step": 555,
"valid_targets_mean": 6039.2,
"valid_targets_min": 2171
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.30684656384369974,
"learning_rate": 2.9587663287340864e-07,
"loss": 0.2463,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07616351544857025,
"step": 560,
"valid_targets_mean": 5435.8,
"valid_targets_min": 509
},
{
"epoch": 6.726190476190476,
"grad_norm": 0.24369553807123656,
"learning_rate": 2.0280354674976576e-07,
"loss": 0.2457,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08108322322368622,
"step": 565,
"valid_targets_mean": 5453.7,
"valid_targets_min": 308
},
{
"epoch": 6.785714285714286,
"grad_norm": 0.22143496142468144,
"learning_rate": 1.2718467043626448e-07,
"loss": 0.2486,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08642816543579102,
"step": 570,
"valid_targets_mean": 5529.3,
"valid_targets_min": 168
},
{
"epoch": 6.845238095238095,
"grad_norm": 0.22289015044632118,
"learning_rate": 6.90866733593465e-08,
"loss": 0.2464,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.07365033030509949,
"step": 575,
"valid_targets_mean": 5419.3,
"valid_targets_min": 327
},
{
"epoch": 6.904761904761905,
"grad_norm": 0.24055593132796596,
"learning_rate": 2.856077765205356e-08,
"loss": 0.2489,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.09108748286962509,
"step": 580,
"valid_targets_mean": 5626.8,
"valid_targets_min": 1730
},
{
"epoch": 6.964285714285714,
"grad_norm": 0.23554247965669858,
"learning_rate": 5.642712993993993e-09,
"loss": 0.2481,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.0795075073838234,
"step": 585,
"valid_targets_mean": 5410.0,
"valid_targets_min": 423
},
{
"epoch": 7.0,
"loss_nan_ranks": 0,
"loss_rank_avg": 0.08444557338953018,
"step": 588,
"total_flos": 2.4214382862254735e+18,
"train_loss": 0.2928499853124424,
"train_runtime": 13980.8507,
"train_samples_per_second": 4.035,
"train_steps_per_second": 0.042,
"valid_targets_mean": 5498.0,
"valid_targets_min": 368
}
],
"logging_steps": 5,
"max_steps": 588,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4214382862254735e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}