605 lines
17 KiB
JSON
605 lines
17 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 7.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 259,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.13513513513513514,
|
||
|
|
"grad_norm": 5.259959182216627,
|
||
|
|
"learning_rate": 6.153846153846155e-06,
|
||
|
|
"loss": 0.3116,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.09571799635887146,
|
||
|
|
"step": 5,
|
||
|
|
"valid_targets_mean": 6433.7,
|
||
|
|
"valid_targets_min": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2702702702702703,
|
||
|
|
"grad_norm": 1.3423309952029923,
|
||
|
|
"learning_rate": 1.3846153846153847e-05,
|
||
|
|
"loss": 0.2587,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.07112158834934235,
|
||
|
|
"step": 10,
|
||
|
|
"valid_targets_mean": 7327.8,
|
||
|
|
"valid_targets_min": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40540540540540543,
|
||
|
|
"grad_norm": 0.503106602146357,
|
||
|
|
"learning_rate": 2.153846153846154e-05,
|
||
|
|
"loss": 0.2116,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.06387270987033844,
|
||
|
|
"step": 15,
|
||
|
|
"valid_targets_mean": 7716.1,
|
||
|
|
"valid_targets_min": 701
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5405405405405406,
|
||
|
|
"grad_norm": 0.43730375431686763,
|
||
|
|
"learning_rate": 2.923076923076923e-05,
|
||
|
|
"loss": 0.188,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.057055432349443436,
|
||
|
|
"step": 20,
|
||
|
|
"valid_targets_mean": 6165.1,
|
||
|
|
"valid_targets_min": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6756756756756757,
|
||
|
|
"grad_norm": 0.23457297766166457,
|
||
|
|
"learning_rate": 3.692307692307693e-05,
|
||
|
|
"loss": 0.1511,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.04511473327875137,
|
||
|
|
"step": 25,
|
||
|
|
"valid_targets_mean": 6698.5,
|
||
|
|
"valid_targets_min": 579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8108108108108109,
|
||
|
|
"grad_norm": 0.22580079812560477,
|
||
|
|
"learning_rate": 3.998364045590232e-05,
|
||
|
|
"loss": 0.1382,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.04277999699115753,
|
||
|
|
"step": 30,
|
||
|
|
"valid_targets_mean": 5174.9,
|
||
|
|
"valid_targets_min": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9459459459459459,
|
||
|
|
"grad_norm": 0.15302847392142027,
|
||
|
|
"learning_rate": 3.988376236895231e-05,
|
||
|
|
"loss": 0.1277,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.03875000774860382,
|
||
|
|
"step": 35,
|
||
|
|
"valid_targets_mean": 6631.1,
|
||
|
|
"valid_targets_min": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0810810810810811,
|
||
|
|
"grad_norm": 0.14846277384715484,
|
||
|
|
"learning_rate": 3.969354804762473e-05,
|
||
|
|
"loss": 0.1171,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.035136736929416656,
|
||
|
|
"step": 40,
|
||
|
|
"valid_targets_mean": 5226.1,
|
||
|
|
"valid_targets_min": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2162162162162162,
|
||
|
|
"grad_norm": 0.17362666626644624,
|
||
|
|
"learning_rate": 3.9413861676735034e-05,
|
||
|
|
"loss": 0.1152,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.035750940442085266,
|
||
|
|
"step": 45,
|
||
|
|
"valid_targets_mean": 6524.1,
|
||
|
|
"valid_targets_min": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3513513513513513,
|
||
|
|
"grad_norm": 0.13178939301683926,
|
||
|
|
"learning_rate": 3.9045973931977495e-05,
|
||
|
|
"loss": 0.1102,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.036801472306251526,
|
||
|
|
"step": 50,
|
||
|
|
"valid_targets_mean": 6943.8,
|
||
|
|
"valid_targets_min": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4864864864864864,
|
||
|
|
"grad_norm": 0.18185896796674594,
|
||
|
|
"learning_rate": 3.8591556206970594e-05,
|
||
|
|
"loss": 0.106,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.03439167141914368,
|
||
|
|
"step": 55,
|
||
|
|
"valid_targets_mean": 5706.4,
|
||
|
|
"valid_targets_min": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6216216216216215,
|
||
|
|
"grad_norm": 0.1385453505062919,
|
||
|
|
"learning_rate": 3.805267301975424e-05,
|
||
|
|
"loss": 0.1059,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.035251468420028687,
|
||
|
|
"step": 60,
|
||
|
|
"valid_targets_mean": 6418.2,
|
||
|
|
"valid_targets_min": 1363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7567567567567568,
|
||
|
|
"grad_norm": 0.13076540307986736,
|
||
|
|
"learning_rate": 3.743177263323758e-05,
|
||
|
|
"loss": 0.1042,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.03142703324556351,
|
||
|
|
"step": 65,
|
||
|
|
"valid_targets_mean": 5221.7,
|
||
|
|
"valid_targets_min": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8918918918918919,
|
||
|
|
"grad_norm": 0.13340506373409342,
|
||
|
|
"learning_rate": 3.673167593221097e-05,
|
||
|
|
"loss": 0.0995,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.034315336495637894,
|
||
|
|
"step": 70,
|
||
|
|
"valid_targets_mean": 6568.5,
|
||
|
|
"valid_targets_min": 723
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.027027027027027,
|
||
|
|
"grad_norm": 0.1410479342684041,
|
||
|
|
"learning_rate": 3.5955563607456025e-05,
|
||
|
|
"loss": 0.0975,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.030078843235969543,
|
||
|
|
"step": 75,
|
||
|
|
"valid_targets_mean": 5617.2,
|
||
|
|
"valid_targets_min": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1621621621621623,
|
||
|
|
"grad_norm": 0.14495862504665655,
|
||
|
|
"learning_rate": 3.510696170517927e-05,
|
||
|
|
"loss": 0.0949,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.028750576078891754,
|
||
|
|
"step": 80,
|
||
|
|
"valid_targets_mean": 5150.5,
|
||
|
|
"valid_targets_min": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2972972972972974,
|
||
|
|
"grad_norm": 0.14212400520351012,
|
||
|
|
"learning_rate": 3.418972560742133e-05,
|
||
|
|
"loss": 0.0905,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0317218042910099,
|
||
|
|
"step": 85,
|
||
|
|
"valid_targets_mean": 6628.6,
|
||
|
|
"valid_targets_min": 883
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4324324324324325,
|
||
|
|
"grad_norm": 0.1337772062885725,
|
||
|
|
"learning_rate": 3.3208022516222195e-05,
|
||
|
|
"loss": 0.091,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.0257179643958807,
|
||
|
|
"step": 90,
|
||
|
|
"valid_targets_mean": 6095.6,
|
||
|
|
"valid_targets_min": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5675675675675675,
|
||
|
|
"grad_norm": 0.17276086488015027,
|
||
|
|
"learning_rate": 3.2166312521120775e-05,
|
||
|
|
"loss": 0.0876,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.030876662582159042,
|
||
|
|
"step": 95,
|
||
|
|
"valid_targets_mean": 6510.9,
|
||
|
|
"valid_targets_min": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7027027027027026,
|
||
|
|
"grad_norm": 0.1480157679597937,
|
||
|
|
"learning_rate": 3.106932833600314e-05,
|
||
|
|
"loss": 0.0865,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02585354819893837,
|
||
|
|
"step": 100,
|
||
|
|
"valid_targets_mean": 6586.1,
|
||
|
|
"valid_targets_min": 1657
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8378378378378377,
|
||
|
|
"grad_norm": 0.15909100754186584,
|
||
|
|
"learning_rate": 2.9922053797359406e-05,
|
||
|
|
"loss": 0.0901,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.027233216911554337,
|
||
|
|
"step": 105,
|
||
|
|
"valid_targets_mean": 5279.3,
|
||
|
|
"valid_targets_min": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.972972972972973,
|
||
|
|
"grad_norm": 0.18306212162162813,
|
||
|
|
"learning_rate": 2.8729701221636294e-05,
|
||
|
|
"loss": 0.0865,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02899256721138954,
|
||
|
|
"step": 110,
|
||
|
|
"valid_targets_mean": 5776.7,
|
||
|
|
"valid_targets_min": 1181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.108108108108108,
|
||
|
|
"grad_norm": 0.15654440385954815,
|
||
|
|
"learning_rate": 2.74976877245558e-05,
|
||
|
|
"loss": 0.0843,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.026833169162273407,
|
||
|
|
"step": 115,
|
||
|
|
"valid_targets_mean": 5595.8,
|
||
|
|
"valid_targets_min": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2432432432432434,
|
||
|
|
"grad_norm": 0.13190386927690592,
|
||
|
|
"learning_rate": 2.6231610609986442e-05,
|
||
|
|
"loss": 0.0776,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.024204321205615997,
|
||
|
|
"step": 120,
|
||
|
|
"valid_targets_mean": 8543.6,
|
||
|
|
"valid_targets_min": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3783783783783785,
|
||
|
|
"grad_norm": 0.15764713783801354,
|
||
|
|
"learning_rate": 2.493722194018082e-05,
|
||
|
|
"loss": 0.0813,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.024981103837490082,
|
||
|
|
"step": 125,
|
||
|
|
"valid_targets_mean": 5946.0,
|
||
|
|
"valid_targets_min": 1137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5135135135135136,
|
||
|
|
"grad_norm": 0.1529541859974241,
|
||
|
|
"learning_rate": 2.362040240291227e-05,
|
||
|
|
"loss": 0.0815,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.026058465242385864,
|
||
|
|
"step": 130,
|
||
|
|
"valid_targets_mean": 6072.2,
|
||
|
|
"valid_targets_min": 802
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6486486486486487,
|
||
|
|
"grad_norm": 0.33060538437630244,
|
||
|
|
"learning_rate": 2.228713459423804e-05,
|
||
|
|
"loss": 0.0778,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02479572780430317,
|
||
|
|
"step": 135,
|
||
|
|
"valid_targets_mean": 5208.4,
|
||
|
|
"valid_targets_min": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7837837837837838,
|
||
|
|
"grad_norm": 0.15640399728656484,
|
||
|
|
"learning_rate": 2.094347583827102e-05,
|
||
|
|
"loss": 0.0817,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.026677442714571953,
|
||
|
|
"step": 140,
|
||
|
|
"valid_targets_mean": 5517.4,
|
||
|
|
"valid_targets_min": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.918918918918919,
|
||
|
|
"grad_norm": 0.1517001071529229,
|
||
|
|
"learning_rate": 1.9595530667445775e-05,
|
||
|
|
"loss": 0.076,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.026775188744068146,
|
||
|
|
"step": 145,
|
||
|
|
"valid_targets_mean": 6216.9,
|
||
|
|
"valid_targets_min": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.054054054054054,
|
||
|
|
"grad_norm": 0.1520783489774219,
|
||
|
|
"learning_rate": 1.824942308830696e-05,
|
||
|
|
"loss": 0.0787,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02473868615925312,
|
||
|
|
"step": 150,
|
||
|
|
"valid_targets_mean": 5801.8,
|
||
|
|
"valid_targets_min": 692
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.1891891891891895,
|
||
|
|
"grad_norm": 0.14931092491442138,
|
||
|
|
"learning_rate": 1.691126875882263e-05,
|
||
|
|
"loss": 0.0752,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.021454155445098877,
|
||
|
|
"step": 155,
|
||
|
|
"valid_targets_mean": 6150.3,
|
||
|
|
"valid_targets_min": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.324324324324325,
|
||
|
|
"grad_norm": 0.15680952631799258,
|
||
|
|
"learning_rate": 1.5587147203626934e-05,
|
||
|
|
"loss": 0.0712,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.022702787071466446,
|
||
|
|
"step": 160,
|
||
|
|
"valid_targets_mean": 6739.9,
|
||
|
|
"valid_targets_min": 1051
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.45945945945946,
|
||
|
|
"grad_norm": 0.15760466083796523,
|
||
|
|
"learning_rate": 1.4283074193424379e-05,
|
||
|
|
"loss": 0.0713,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.022617951035499573,
|
||
|
|
"step": 165,
|
||
|
|
"valid_targets_mean": 6288.2,
|
||
|
|
"valid_targets_min": 579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.594594594594595,
|
||
|
|
"grad_norm": 0.16238993129940618,
|
||
|
|
"learning_rate": 1.3004974414041987e-05,
|
||
|
|
"loss": 0.0738,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02532375603914261,
|
||
|
|
"step": 170,
|
||
|
|
"valid_targets_mean": 5912.8,
|
||
|
|
"valid_targets_min": 1757
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.72972972972973,
|
||
|
|
"grad_norm": 0.1636253567113291,
|
||
|
|
"learning_rate": 1.1758654549299735e-05,
|
||
|
|
"loss": 0.0711,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.022857841104269028,
|
||
|
|
"step": 175,
|
||
|
|
"valid_targets_mean": 5464.2,
|
||
|
|
"valid_targets_min": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.864864864864865,
|
||
|
|
"grad_norm": 0.18989576780948067,
|
||
|
|
"learning_rate": 1.0549776899989686e-05,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.021705035120248795,
|
||
|
|
"step": 180,
|
||
|
|
"valid_targets_mean": 7371.7,
|
||
|
|
"valid_targets_min": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.17912554773879225,
|
||
|
|
"learning_rate": 9.3838336588184e-06,
|
||
|
|
"loss": 0.074,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.029517250135540962,
|
||
|
|
"step": 185,
|
||
|
|
"valid_targets_mean": 5847.1,
|
||
|
|
"valid_targets_min": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.135135135135135,
|
||
|
|
"grad_norm": 0.17463959150551875,
|
||
|
|
"learning_rate": 8.266121958187246e-06,
|
||
|
|
"loss": 0.0709,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.024629643186926842,
|
||
|
|
"step": 190,
|
||
|
|
"valid_targets_mean": 7010.6,
|
||
|
|
"valid_targets_min": 1067
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.27027027027027,
|
||
|
|
"grad_norm": 0.16012384004377175,
|
||
|
|
"learning_rate": 7.201719804173797e-06,
|
||
|
|
"loss": 0.0683,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02399550750851631,
|
||
|
|
"step": 195,
|
||
|
|
"valid_targets_mean": 6869.6,
|
||
|
|
"valid_targets_min": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.405405405405405,
|
||
|
|
"grad_norm": 0.27301642675588644,
|
||
|
|
"learning_rate": 6.1954630060516005e-06,
|
||
|
|
"loss": 0.0676,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.023653771728277206,
|
||
|
|
"step": 200,
|
||
|
|
"valid_targets_mean": 4831.9,
|
||
|
|
"valid_targets_min": 1161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.54054054054054,
|
||
|
|
"grad_norm": 0.1781041058075561,
|
||
|
|
"learning_rate": 5.2519232061624255e-06,
|
||
|
|
"loss": 0.0698,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02264336310327053,
|
||
|
|
"step": 205,
|
||
|
|
"valid_targets_mean": 6068.0,
|
||
|
|
"valid_targets_min": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.675675675675675,
|
||
|
|
"grad_norm": 0.1623201456410524,
|
||
|
|
"learning_rate": 4.375387109955953e-06,
|
||
|
|
"loss": 0.0693,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.022714542225003242,
|
||
|
|
"step": 210,
|
||
|
|
"valid_targets_mean": 6206.7,
|
||
|
|
"valid_targets_min": 1846
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.8108108108108105,
|
||
|
|
"grad_norm": 0.18485993941649267,
|
||
|
|
"learning_rate": 3.569837010559505e-06,
|
||
|
|
"loss": 0.0661,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.023270253092050552,
|
||
|
|
"step": 215,
|
||
|
|
"valid_targets_mean": 7176.0,
|
||
|
|
"valid_targets_min": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.945945945945946,
|
||
|
|
"grad_norm": 0.18961469137543394,
|
||
|
|
"learning_rate": 2.838932696358798e-06,
|
||
|
|
"loss": 0.0695,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.024968810379505157,
|
||
|
|
"step": 220,
|
||
|
|
"valid_targets_mean": 6053.5,
|
||
|
|
"valid_targets_min": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.081081081081081,
|
||
|
|
"grad_norm": 0.15015876207979317,
|
||
|
|
"learning_rate": 2.1859948237874517e-06,
|
||
|
|
"loss": 0.0681,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01862494647502899,
|
||
|
|
"step": 225,
|
||
|
|
"valid_targets_mean": 5860.2,
|
||
|
|
"valid_targets_min": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.216216216216216,
|
||
|
|
"grad_norm": 0.15711014903589288,
|
||
|
|
"learning_rate": 1.6139898308664093e-06,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02179352566599846,
|
||
|
|
"step": 230,
|
||
|
|
"valid_targets_mean": 6635.5,
|
||
|
|
"valid_targets_min": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.351351351351352,
|
||
|
|
"grad_norm": 0.19367059367410314,
|
||
|
|
"learning_rate": 1.1255164600341816e-06,
|
||
|
|
"loss": 0.0674,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.022510964423418045,
|
||
|
|
"step": 235,
|
||
|
|
"valid_targets_mean": 5707.8,
|
||
|
|
"valid_targets_min": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.486486486486487,
|
||
|
|
"grad_norm": 0.17844773460429106,
|
||
|
|
"learning_rate": 7.227939514977422e-07,
|
||
|
|
"loss": 0.0666,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.023919889703392982,
|
||
|
|
"step": 240,
|
||
|
|
"valid_targets_mean": 5842.8,
|
||
|
|
"valid_targets_min": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.621621621621622,
|
||
|
|
"grad_norm": 0.16782709344204977,
|
||
|
|
"learning_rate": 4.0765196074406433e-07,
|
||
|
|
"loss": 0.0697,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.01854727789759636,
|
||
|
|
"step": 245,
|
||
|
|
"valid_targets_mean": 5001.1,
|
||
|
|
"valid_targets_min": 1381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.756756756756757,
|
||
|
|
"grad_norm": 0.16683190263277645,
|
||
|
|
"learning_rate": 1.8152224601943435e-07,
|
||
|
|
"loss": 0.0666,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.02349766716361046,
|
||
|
|
"step": 250,
|
||
|
|
"valid_targets_mean": 7224.0,
|
||
|
|
"valid_targets_min": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.891891891891892,
|
||
|
|
"grad_norm": 0.165105262824618,
|
||
|
|
"learning_rate": 4.5432163541960785e-08,
|
||
|
|
"loss": 0.0668,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.026979651302099228,
|
||
|
|
"step": 255,
|
||
|
|
"valid_targets_mean": 7160.0,
|
||
|
|
"valid_targets_min": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 7.0,
|
||
|
|
"step": 259,
|
||
|
|
"total_flos": 1.36492518344124e+18,
|
||
|
|
"train_loss": 0.0,
|
||
|
|
"train_runtime": 1.313,
|
||
|
|
"train_samples_per_second": 18873.051,
|
||
|
|
"train_steps_per_second": 197.261
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 259,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 7,
|
||
|
|
"save_steps": 300,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 1.36492518344124e+18,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|