1287 lines
36 KiB
JSON
1287 lines
36 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 7.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 567,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.06172839506172839,
|
||
|
|
"grad_norm": 10.429516590233682,
|
||
|
|
"learning_rate": 2.8070175438596493e-06,
|
||
|
|
"loss": 0.8393,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2808097004890442,
|
||
|
|
"step": 5,
|
||
|
|
"valid_targets_mean": 9607.8,
|
||
|
|
"valid_targets_min": 3068
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12345679012345678,
|
||
|
|
"grad_norm": 5.065259502819224,
|
||
|
|
"learning_rate": 6.31578947368421e-06,
|
||
|
|
"loss": 0.7853,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.2494523823261261,
|
||
|
|
"step": 10,
|
||
|
|
"valid_targets_mean": 9841.2,
|
||
|
|
"valid_targets_min": 5719
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18518518518518517,
|
||
|
|
"grad_norm": 1.6595151332203468,
|
||
|
|
"learning_rate": 9.824561403508772e-06,
|
||
|
|
"loss": 0.7006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.23369759321212769,
|
||
|
|
"step": 15,
|
||
|
|
"valid_targets_mean": 10128.8,
|
||
|
|
"valid_targets_min": 7342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24691358024691357,
|
||
|
|
"grad_norm": 1.2957924076936072,
|
||
|
|
"learning_rate": 1.3333333333333333e-05,
|
||
|
|
"loss": 0.6693,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.21334174275398254,
|
||
|
|
"step": 20,
|
||
|
|
"valid_targets_mean": 9463.6,
|
||
|
|
"valid_targets_min": 4593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30864197530864196,
|
||
|
|
"grad_norm": 0.7601632497112111,
|
||
|
|
"learning_rate": 1.6842105263157896e-05,
|
||
|
|
"loss": 0.6305,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.20799314975738525,
|
||
|
|
"step": 25,
|
||
|
|
"valid_targets_mean": 11056.0,
|
||
|
|
"valid_targets_min": 7081
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37037037037037035,
|
||
|
|
"grad_norm": 0.5556757386274306,
|
||
|
|
"learning_rate": 2.035087719298246e-05,
|
||
|
|
"loss": 0.6038,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19161568582057953,
|
||
|
|
"step": 30,
|
||
|
|
"valid_targets_mean": 9825.4,
|
||
|
|
"valid_targets_min": 6376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43209876543209874,
|
||
|
|
"grad_norm": 0.5153662825148531,
|
||
|
|
"learning_rate": 2.385964912280702e-05,
|
||
|
|
"loss": 0.5804,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.19309845566749573,
|
||
|
|
"step": 35,
|
||
|
|
"valid_targets_mean": 10027.2,
|
||
|
|
"valid_targets_min": 5491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49382716049382713,
|
||
|
|
"grad_norm": 0.4106317408119749,
|
||
|
|
"learning_rate": 2.7368421052631583e-05,
|
||
|
|
"loss": 0.5557,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1776849925518036,
|
||
|
|
"step": 40,
|
||
|
|
"valid_targets_mean": 9618.7,
|
||
|
|
"valid_targets_min": 3252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5555555555555556,
|
||
|
|
"grad_norm": 0.35411307929247426,
|
||
|
|
"learning_rate": 3.087719298245614e-05,
|
||
|
|
"loss": 0.5349,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.17469243705272675,
|
||
|
|
"step": 45,
|
||
|
|
"valid_targets_mean": 10166.2,
|
||
|
|
"valid_targets_min": 6091
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6172839506172839,
|
||
|
|
"grad_norm": 0.3028156061226061,
|
||
|
|
"learning_rate": 3.43859649122807e-05,
|
||
|
|
"loss": 0.5171,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.16911481320858002,
|
||
|
|
"step": 50,
|
||
|
|
"valid_targets_mean": 10112.2,
|
||
|
|
"valid_targets_min": 1278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6790123456790124,
|
||
|
|
"grad_norm": 0.24266635032746242,
|
||
|
|
"learning_rate": 3.789473684210526e-05,
|
||
|
|
"loss": 0.5033,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15108071267604828,
|
||
|
|
"step": 55,
|
||
|
|
"valid_targets_mean": 9476.6,
|
||
|
|
"valid_targets_min": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7407407407407407,
|
||
|
|
"grad_norm": 0.225540686496095,
|
||
|
|
"learning_rate": 3.999848220229662e-05,
|
||
|
|
"loss": 0.4919,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1633068025112152,
|
||
|
|
"step": 60,
|
||
|
|
"valid_targets_mean": 10242.9,
|
||
|
|
"valid_targets_min": 6762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8024691358024691,
|
||
|
|
"grad_norm": 0.233196689176046,
|
||
|
|
"learning_rate": 3.998140962368987e-05,
|
||
|
|
"loss": 0.479,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.15809178352355957,
|
||
|
|
"step": 65,
|
||
|
|
"valid_targets_mean": 10012.9,
|
||
|
|
"valid_targets_min": 5638
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8641975308641975,
|
||
|
|
"grad_norm": 0.22642340209998182,
|
||
|
|
"learning_rate": 3.994538346771576e-05,
|
||
|
|
"loss": 0.4669,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.145830899477005,
|
||
|
|
"step": 70,
|
||
|
|
"valid_targets_mean": 10058.6,
|
||
|
|
"valid_targets_min": 7540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9259259259259259,
|
||
|
|
"grad_norm": 0.21957679869413865,
|
||
|
|
"learning_rate": 3.989043790736547e-05,
|
||
|
|
"loss": 0.4607,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14295609295368195,
|
||
|
|
"step": 75,
|
||
|
|
"valid_targets_mean": 9880.9,
|
||
|
|
"valid_targets_min": 4567
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9876543209876543,
|
||
|
|
"grad_norm": 0.259082441417068,
|
||
|
|
"learning_rate": 3.9816625061831206e-05,
|
||
|
|
"loss": 0.4539,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14566341042518616,
|
||
|
|
"step": 80,
|
||
|
|
"valid_targets_mean": 10036.9,
|
||
|
|
"valid_targets_min": 6974
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0493827160493827,
|
||
|
|
"grad_norm": 0.29381252653535816,
|
||
|
|
"learning_rate": 3.972401494706805e-05,
|
||
|
|
"loss": 0.4489,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14456677436828613,
|
||
|
|
"step": 85,
|
||
|
|
"valid_targets_mean": 10007.5,
|
||
|
|
"valid_targets_min": 6170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1111111111111112,
|
||
|
|
"grad_norm": 0.2514362103454548,
|
||
|
|
"learning_rate": 3.9612695409379555e-05,
|
||
|
|
"loss": 0.4473,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.156326562166214,
|
||
|
|
"step": 90,
|
||
|
|
"valid_targets_mean": 10273.3,
|
||
|
|
"valid_targets_min": 6712
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1728395061728394,
|
||
|
|
"grad_norm": 0.24751778888652187,
|
||
|
|
"learning_rate": 3.948277204209021e-05,
|
||
|
|
"loss": 0.4372,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14379754662513733,
|
||
|
|
"step": 95,
|
||
|
|
"valid_targets_mean": 10190.5,
|
||
|
|
"valid_targets_min": 5246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2345679012345678,
|
||
|
|
"grad_norm": 0.2494831866139906,
|
||
|
|
"learning_rate": 3.933436808538375e-05,
|
||
|
|
"loss": 0.4398,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13491873443126678,
|
||
|
|
"step": 100,
|
||
|
|
"valid_targets_mean": 9260.9,
|
||
|
|
"valid_targets_min": 5003
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2962962962962963,
|
||
|
|
"grad_norm": 0.24159264180274795,
|
||
|
|
"learning_rate": 3.916762430940245e-05,
|
||
|
|
"loss": 0.4271,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1485365480184555,
|
||
|
|
"step": 105,
|
||
|
|
"valid_targets_mean": 10453.7,
|
||
|
|
"valid_targets_min": 6676
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3580246913580247,
|
||
|
|
"grad_norm": 0.240571750078312,
|
||
|
|
"learning_rate": 3.898269888071803e-05,
|
||
|
|
"loss": 0.4294,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1464928835630417,
|
||
|
|
"step": 110,
|
||
|
|
"valid_targets_mean": 10115.0,
|
||
|
|
"valid_targets_min": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4197530864197532,
|
||
|
|
"grad_norm": 0.23171615291838898,
|
||
|
|
"learning_rate": 3.877976721230114e-05,
|
||
|
|
"loss": 0.4238,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13552333414554596,
|
||
|
|
"step": 115,
|
||
|
|
"valid_targets_mean": 10116.0,
|
||
|
|
"valid_targets_min": 4165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4814814814814814,
|
||
|
|
"grad_norm": 0.23123951273615284,
|
||
|
|
"learning_rate": 3.85590217971315e-05,
|
||
|
|
"loss": 0.4247,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1396927833557129,
|
||
|
|
"step": 120,
|
||
|
|
"valid_targets_mean": 9472.6,
|
||
|
|
"valid_targets_min": 3369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5432098765432098,
|
||
|
|
"grad_norm": 0.27876028230350475,
|
||
|
|
"learning_rate": 3.832067202560668e-05,
|
||
|
|
"loss": 0.4182,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14294642210006714,
|
||
|
|
"step": 125,
|
||
|
|
"valid_targets_mean": 9702.7,
|
||
|
|
"valid_targets_min": 3563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6049382716049383,
|
||
|
|
"grad_norm": 0.24112690266489265,
|
||
|
|
"learning_rate": 3.806494398692258e-05,
|
||
|
|
"loss": 0.4223,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1513240933418274,
|
||
|
|
"step": 130,
|
||
|
|
"valid_targets_mean": 10773.1,
|
||
|
|
"valid_targets_min": 6916
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6666666666666665,
|
||
|
|
"grad_norm": 0.28682215602360966,
|
||
|
|
"learning_rate": 3.77920802546142e-05,
|
||
|
|
"loss": 0.4225,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13537049293518066,
|
||
|
|
"step": 135,
|
||
|
|
"valid_targets_mean": 8778.2,
|
||
|
|
"valid_targets_min": 4773
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7283950617283952,
|
||
|
|
"grad_norm": 0.22939970462745563,
|
||
|
|
"learning_rate": 3.750233965645985e-05,
|
||
|
|
"loss": 0.4182,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1321515440940857,
|
||
|
|
"step": 140,
|
||
|
|
"valid_targets_mean": 9729.9,
|
||
|
|
"valid_targets_min": 4328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7901234567901234,
|
||
|
|
"grad_norm": 0.2532856415818664,
|
||
|
|
"learning_rate": 3.719599702896745e-05,
|
||
|
|
"loss": 0.4163,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13605017960071564,
|
||
|
|
"step": 145,
|
||
|
|
"valid_targets_mean": 10582.2,
|
||
|
|
"valid_targets_min": 7105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8518518518518519,
|
||
|
|
"grad_norm": 0.21865623660828168,
|
||
|
|
"learning_rate": 3.687334295667533e-05,
|
||
|
|
"loss": 0.4135,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1345449984073639,
|
||
|
|
"step": 150,
|
||
|
|
"valid_targets_mean": 9758.0,
|
||
|
|
"valid_targets_min": 3245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9135802469135803,
|
||
|
|
"grad_norm": 0.22032076579466348,
|
||
|
|
"learning_rate": 3.653468349651527e-05,
|
||
|
|
"loss": 0.4082,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1432110071182251,
|
||
|
|
"step": 155,
|
||
|
|
"valid_targets_mean": 10140.4,
|
||
|
|
"valid_targets_min": 3954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9753086419753085,
|
||
|
|
"grad_norm": 0.2300084517459775,
|
||
|
|
"learning_rate": 3.6180339887498953e-05,
|
||
|
|
"loss": 0.4116,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.14358800649642944,
|
||
|
|
"step": 160,
|
||
|
|
"valid_targets_mean": 10277.0,
|
||
|
|
"valid_targets_min": 5517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.037037037037037,
|
||
|
|
"grad_norm": 0.26725994957529775,
|
||
|
|
"learning_rate": 3.581064824600327e-05,
|
||
|
|
"loss": 0.4073,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12686286866664886,
|
||
|
|
"step": 165,
|
||
|
|
"valid_targets_mean": 9887.2,
|
||
|
|
"valid_targets_min": 6671
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0987654320987654,
|
||
|
|
"grad_norm": 0.2839301734419817,
|
||
|
|
"learning_rate": 3.542595924694362e-05,
|
||
|
|
"loss": 0.4006,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12411850690841675,
|
||
|
|
"step": 170,
|
||
|
|
"valid_targets_mean": 10430.6,
|
||
|
|
"valid_targets_min": 4962
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1604938271604937,
|
||
|
|
"grad_norm": 0.23886149599663145,
|
||
|
|
"learning_rate": 3.502663779113747e-05,
|
||
|
|
"loss": 0.4057,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1273406594991684,
|
||
|
|
"step": 175,
|
||
|
|
"valid_targets_mean": 9780.4,
|
||
|
|
"valid_targets_min": 6883
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2222222222222223,
|
||
|
|
"grad_norm": 0.2508286434733983,
|
||
|
|
"learning_rate": 3.4613062659173865e-05,
|
||
|
|
"loss": 0.4005,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13528917729854584,
|
||
|
|
"step": 180,
|
||
|
|
"valid_targets_mean": 10432.4,
|
||
|
|
"valid_targets_min": 2672
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2839506172839505,
|
||
|
|
"grad_norm": 0.28154810246127465,
|
||
|
|
"learning_rate": 3.418562615211707e-05,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13502192497253418,
|
||
|
|
"step": 185,
|
||
|
|
"valid_targets_mean": 9436.0,
|
||
|
|
"valid_targets_min": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3456790123456788,
|
||
|
|
"grad_norm": 0.25037886356869404,
|
||
|
|
"learning_rate": 3.374473371938526e-05,
|
||
|
|
"loss": 0.4055,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13342218101024628,
|
||
|
|
"step": 190,
|
||
|
|
"valid_targets_mean": 10217.9,
|
||
|
|
"valid_targets_min": 5884
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4074074074074074,
|
||
|
|
"grad_norm": 0.22241283286592672,
|
||
|
|
"learning_rate": 3.329080357415716e-05,
|
||
|
|
"loss": 0.4011,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13278549909591675,
|
||
|
|
"step": 195,
|
||
|
|
"valid_targets_mean": 10201.8,
|
||
|
|
"valid_targets_min": 5485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4691358024691357,
|
||
|
|
"grad_norm": 0.25914481987156485,
|
||
|
|
"learning_rate": 3.282426629667157e-05,
|
||
|
|
"loss": 0.4,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13765950500965118,
|
||
|
|
"step": 200,
|
||
|
|
"valid_targets_mean": 9836.8,
|
||
|
|
"valid_targets_min": 1763
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5308641975308643,
|
||
|
|
"grad_norm": 0.2678736062298823,
|
||
|
|
"learning_rate": 3.234556442579586e-05,
|
||
|
|
"loss": 0.3963,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1374882459640503,
|
||
|
|
"step": 205,
|
||
|
|
"valid_targets_mean": 9808.9,
|
||
|
|
"valid_targets_min": 5540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5925925925925926,
|
||
|
|
"grad_norm": 0.22739006549004376,
|
||
|
|
"learning_rate": 3.18551520392511e-05,
|
||
|
|
"loss": 0.3911,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11978818476200104,
|
||
|
|
"step": 210,
|
||
|
|
"valid_targets_mean": 9023.9,
|
||
|
|
"valid_targets_min": 5865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6543209876543212,
|
||
|
|
"grad_norm": 0.2480434833866747,
|
||
|
|
"learning_rate": 3.1353494322891806e-05,
|
||
|
|
"loss": 0.3967,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13371124863624573,
|
||
|
|
"step": 215,
|
||
|
|
"valid_targets_mean": 10042.4,
|
||
|
|
"valid_targets_min": 5992
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7160493827160495,
|
||
|
|
"grad_norm": 0.2415014429826874,
|
||
|
|
"learning_rate": 3.084106712944899e-05,
|
||
|
|
"loss": 0.3924,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11986161768436432,
|
||
|
|
"step": 220,
|
||
|
|
"valid_targets_mean": 8822.5,
|
||
|
|
"valid_targets_min": 5914
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7777777777777777,
|
||
|
|
"grad_norm": 0.22748383065140926,
|
||
|
|
"learning_rate": 3.0318356527155024e-05,
|
||
|
|
"loss": 0.3959,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13959044218063354,
|
||
|
|
"step": 225,
|
||
|
|
"valid_targets_mean": 11017.7,
|
||
|
|
"valid_targets_min": 7406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8395061728395063,
|
||
|
|
"grad_norm": 0.22579559038440425,
|
||
|
|
"learning_rate": 2.9785858338678474e-05,
|
||
|
|
"loss": 0.3958,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1356910616159439,
|
||
|
|
"step": 230,
|
||
|
|
"valid_targets_mean": 10416.5,
|
||
|
|
"valid_targets_min": 5935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9012345679012346,
|
||
|
|
"grad_norm": 0.259395872875029,
|
||
|
|
"learning_rate": 2.924407767080627e-05,
|
||
|
|
"loss": 0.3952,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13255149126052856,
|
||
|
|
"step": 235,
|
||
|
|
"valid_targets_mean": 10616.9,
|
||
|
|
"valid_targets_min": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.962962962962963,
|
||
|
|
"grad_norm": 0.2505789103446424,
|
||
|
|
"learning_rate": 2.8693528435319304e-05,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13311100006103516,
|
||
|
|
"step": 240,
|
||
|
|
"valid_targets_mean": 10259.8,
|
||
|
|
"valid_targets_min": 5469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0246913580246915,
|
||
|
|
"grad_norm": 0.25985101425137663,
|
||
|
|
"learning_rate": 2.813473286151601e-05,
|
||
|
|
"loss": 0.3877,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1392827033996582,
|
||
|
|
"step": 245,
|
||
|
|
"valid_targets_mean": 10576.5,
|
||
|
|
"valid_targets_min": 6382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0864197530864197,
|
||
|
|
"grad_norm": 0.2523285023282264,
|
||
|
|
"learning_rate": 2.756822100084621e-05,
|
||
|
|
"loss": 0.3882,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13310596346855164,
|
||
|
|
"step": 250,
|
||
|
|
"valid_targets_mean": 10070.9,
|
||
|
|
"valid_targets_min": 5036
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.148148148148148,
|
||
|
|
"grad_norm": 0.24591905608450526,
|
||
|
|
"learning_rate": 2.6994530224125225e-05,
|
||
|
|
"loss": 0.3876,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12935864925384521,
|
||
|
|
"step": 255,
|
||
|
|
"valid_targets_mean": 9908.3,
|
||
|
|
"valid_targets_min": 5504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2098765432098766,
|
||
|
|
"grad_norm": 0.23137220587321322,
|
||
|
|
"learning_rate": 2.6414204711805106e-05,
|
||
|
|
"loss": 0.3827,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12253047525882721,
|
||
|
|
"step": 260,
|
||
|
|
"valid_targets_mean": 9940.8,
|
||
|
|
"valid_targets_min": 5580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.271604938271605,
|
||
|
|
"grad_norm": 0.22538191153928455,
|
||
|
|
"learning_rate": 2.5827794937786497e-05,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12234168499708176,
|
||
|
|
"step": 265,
|
||
|
|
"valid_targets_mean": 9346.2,
|
||
|
|
"valid_targets_min": 2359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3333333333333335,
|
||
|
|
"grad_norm": 0.23311402995020197,
|
||
|
|
"learning_rate": 2.523585714726081e-05,
|
||
|
|
"loss": 0.3879,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1369541585445404,
|
||
|
|
"step": 270,
|
||
|
|
"valid_targets_mean": 10217.0,
|
||
|
|
"valid_targets_min": 5332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3950617283950617,
|
||
|
|
"grad_norm": 0.2514619172750129,
|
||
|
|
"learning_rate": 2.4638952829077964e-05,
|
||
|
|
"loss": 0.3823,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1313236802816391,
|
||
|
|
"step": 275,
|
||
|
|
"valid_targets_mean": 10161.1,
|
||
|
|
"valid_targets_min": 5918
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.45679012345679,
|
||
|
|
"grad_norm": 0.21958361450520822,
|
||
|
|
"learning_rate": 2.4037648183140205e-05,
|
||
|
|
"loss": 0.3813,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12724569439888,
|
||
|
|
"step": 280,
|
||
|
|
"valid_targets_mean": 10332.4,
|
||
|
|
"valid_targets_min": 6695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5185185185185186,
|
||
|
|
"grad_norm": 0.2059815034095156,
|
||
|
|
"learning_rate": 2.3432513583327198e-05,
|
||
|
|
"loss": 0.3789,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12159313261508942,
|
||
|
|
"step": 285,
|
||
|
|
"valid_targets_mean": 9543.0,
|
||
|
|
"valid_targets_min": 5540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.580246913580247,
|
||
|
|
"grad_norm": 0.2172755088091282,
|
||
|
|
"learning_rate": 2.282412303646183e-05,
|
||
|
|
"loss": 0.3824,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13618609309196472,
|
||
|
|
"step": 290,
|
||
|
|
"valid_targets_mean": 10702.6,
|
||
|
|
"valid_targets_min": 5676
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6419753086419755,
|
||
|
|
"grad_norm": 0.23724659970939752,
|
||
|
|
"learning_rate": 2.2213053637830016e-05,
|
||
|
|
"loss": 0.3827,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13978168368339539,
|
||
|
|
"step": 295,
|
||
|
|
"valid_targets_mean": 10434.6,
|
||
|
|
"valid_targets_min": 6350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7037037037037037,
|
||
|
|
"grad_norm": 0.21912152242726737,
|
||
|
|
"learning_rate": 2.1599885023770833e-05,
|
||
|
|
"loss": 0.3791,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1349799931049347,
|
||
|
|
"step": 300,
|
||
|
|
"valid_targets_mean": 10565.1,
|
||
|
|
"valid_targets_min": 6091
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.765432098765432,
|
||
|
|
"grad_norm": 0.23294883680524386,
|
||
|
|
"learning_rate": 2.098519882185634e-05,
|
||
|
|
"loss": 0.3835,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12474770843982697,
|
||
|
|
"step": 305,
|
||
|
|
"valid_targets_mean": 9782.5,
|
||
|
|
"valid_targets_min": 6639
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8271604938271606,
|
||
|
|
"grad_norm": 0.210413727788084,
|
||
|
|
"learning_rate": 2.03695780991826e-05,
|
||
|
|
"loss": 0.3793,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12241166830062866,
|
||
|
|
"step": 310,
|
||
|
|
"valid_targets_mean": 9305.9,
|
||
|
|
"valid_targets_min": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.888888888888889,
|
||
|
|
"grad_norm": 0.2336083067719978,
|
||
|
|
"learning_rate": 1.9753606809295234e-05,
|
||
|
|
"loss": 0.3817,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1337265968322754,
|
||
|
|
"step": 315,
|
||
|
|
"valid_targets_mean": 10231.2,
|
||
|
|
"valid_targets_min": 5605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.950617283950617,
|
||
|
|
"grad_norm": 0.22993388632584225,
|
||
|
|
"learning_rate": 1.9137869238274095e-05,
|
||
|
|
"loss": 0.3867,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12258920818567276,
|
||
|
|
"step": 320,
|
||
|
|
"valid_targets_mean": 9413.1,
|
||
|
|
"valid_targets_min": 3903
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.012345679012346,
|
||
|
|
"grad_norm": 0.23931040818992602,
|
||
|
|
"learning_rate": 1.8522949450502522e-05,
|
||
|
|
"loss": 0.385,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11768842488527298,
|
||
|
|
"step": 325,
|
||
|
|
"valid_targets_mean": 10013.8,
|
||
|
|
"valid_targets_min": 6307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.074074074074074,
|
||
|
|
"grad_norm": 0.23366624804084687,
|
||
|
|
"learning_rate": 1.7909430734646936e-05,
|
||
|
|
"loss": 0.3755,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12187112867832184,
|
||
|
|
"step": 330,
|
||
|
|
"valid_targets_mean": 9138.8,
|
||
|
|
"valid_targets_min": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.135802469135802,
|
||
|
|
"grad_norm": 0.2386159155105413,
|
||
|
|
"learning_rate": 1.7297895050372147e-05,
|
||
|
|
"loss": 0.3783,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11817651987075806,
|
||
|
|
"step": 335,
|
||
|
|
"valid_targets_mean": 9925.3,
|
||
|
|
"valid_targets_min": 5315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.197530864197531,
|
||
|
|
"grad_norm": 0.24141329497025246,
|
||
|
|
"learning_rate": 1.66889224763174e-05,
|
||
|
|
"loss": 0.3786,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12400516867637634,
|
||
|
|
"step": 340,
|
||
|
|
"valid_targets_mean": 10173.1,
|
||
|
|
"valid_targets_min": 6521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2592592592592595,
|
||
|
|
"grad_norm": 0.21961371398861715,
|
||
|
|
"learning_rate": 1.6083090659856665e-05,
|
||
|
|
"loss": 0.3769,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12499363720417023,
|
||
|
|
"step": 345,
|
||
|
|
"valid_targets_mean": 9886.8,
|
||
|
|
"valid_targets_min": 3903
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.320987654320987,
|
||
|
|
"grad_norm": 0.20691809860193006,
|
||
|
|
"learning_rate": 1.5480974269165053e-05,
|
||
|
|
"loss": 0.3765,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12656159698963165,
|
||
|
|
"step": 350,
|
||
|
|
"valid_targets_mean": 10278.9,
|
||
|
|
"valid_targets_min": 5947
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.382716049382716,
|
||
|
|
"grad_norm": 0.2080579551581286,
|
||
|
|
"learning_rate": 1.4883144448111288e-05,
|
||
|
|
"loss": 0.3756,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12579810619354248,
|
||
|
|
"step": 355,
|
||
|
|
"valid_targets_mean": 9957.8,
|
||
|
|
"valid_targets_min": 6975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.444444444444445,
|
||
|
|
"grad_norm": 0.22870148650025382,
|
||
|
|
"learning_rate": 1.4290168274493161e-05,
|
||
|
|
"loss": 0.3743,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12021129578351974,
|
||
|
|
"step": 360,
|
||
|
|
"valid_targets_mean": 9520.9,
|
||
|
|
"valid_targets_min": 4051
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.506172839506172,
|
||
|
|
"grad_norm": 0.21233421060501143,
|
||
|
|
"learning_rate": 1.3702608222129845e-05,
|
||
|
|
"loss": 0.3785,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13181990385055542,
|
||
|
|
"step": 365,
|
||
|
|
"valid_targets_mean": 9865.2,
|
||
|
|
"valid_targets_min": 2359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.567901234567901,
|
||
|
|
"grad_norm": 0.21174790278305963,
|
||
|
|
"learning_rate": 1.3121021627321438e-05,
|
||
|
|
"loss": 0.3762,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11894433945417404,
|
||
|
|
"step": 370,
|
||
|
|
"valid_targets_mean": 9994.8,
|
||
|
|
"valid_targets_min": 6103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.62962962962963,
|
||
|
|
"grad_norm": 0.22880042834392986,
|
||
|
|
"learning_rate": 1.254596016018172e-05,
|
||
|
|
"loss": 0.3718,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11400240659713745,
|
||
|
|
"step": 375,
|
||
|
|
"valid_targets_mean": 9684.1,
|
||
|
|
"valid_targets_min": 1809
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.6913580246913575,
|
||
|
|
"grad_norm": 0.20714503394784406,
|
||
|
|
"learning_rate": 1.1977969301345627e-05,
|
||
|
|
"loss": 0.3741,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12407234311103821,
|
||
|
|
"step": 380,
|
||
|
|
"valid_targets_mean": 10558.5,
|
||
|
|
"valid_targets_min": 3563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.753086419753086,
|
||
|
|
"grad_norm": 0.2245999439376979,
|
||
|
|
"learning_rate": 1.1417587824547822e-05,
|
||
|
|
"loss": 0.3756,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11711939424276352,
|
||
|
|
"step": 385,
|
||
|
|
"valid_targets_mean": 10091.3,
|
||
|
|
"valid_targets_min": 5831
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.814814814814815,
|
||
|
|
"grad_norm": 0.19667919626855399,
|
||
|
|
"learning_rate": 1.086534728556319e-05,
|
||
|
|
"loss": 0.3701,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1273461878299713,
|
||
|
|
"step": 390,
|
||
|
|
"valid_targets_mean": 9572.9,
|
||
|
|
"valid_targets_min": 4151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.8765432098765435,
|
||
|
|
"grad_norm": 0.21765308723193594,
|
||
|
|
"learning_rate": 1.032177151799397e-05,
|
||
|
|
"loss": 0.3723,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12290293723344803,
|
||
|
|
"step": 395,
|
||
|
|
"valid_targets_mean": 10243.7,
|
||
|
|
"valid_targets_min": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.938271604938271,
|
||
|
|
"grad_norm": 0.2135466709612721,
|
||
|
|
"learning_rate": 9.787376136381866e-06,
|
||
|
|
"loss": 0.3801,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12904667854309082,
|
||
|
|
"step": 400,
|
||
|
|
"valid_targets_mean": 10656.8,
|
||
|
|
"valid_targets_min": 5102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.2361103567960002,
|
||
|
|
"learning_rate": 9.262668047116399e-06,
|
||
|
|
"loss": 0.3742,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11932142823934555,
|
||
|
|
"step": 405,
|
||
|
|
"valid_targets_mean": 9562.3,
|
||
|
|
"valid_targets_min": 5152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.061728395061729,
|
||
|
|
"grad_norm": 0.19675209800473384,
|
||
|
|
"learning_rate": 8.748144967603538e-06,
|
||
|
|
"loss": 0.3748,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12642104923725128,
|
||
|
|
"step": 410,
|
||
|
|
"valid_targets_mean": 9635.7,
|
||
|
|
"valid_targets_min": 3304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.1234567901234565,
|
||
|
|
"grad_norm": 0.19935243749779888,
|
||
|
|
"learning_rate": 8.24429495415054e-06,
|
||
|
|
"loss": 0.3701,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1311061680316925,
|
||
|
|
"step": 415,
|
||
|
|
"valid_targets_mean": 10193.9,
|
||
|
|
"valid_targets_min": 4731
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.185185185185185,
|
||
|
|
"grad_norm": 0.20563411967663517,
|
||
|
|
"learning_rate": 7.751595939015005e-06,
|
||
|
|
"loss": 0.3648,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12725985050201416,
|
||
|
|
"step": 420,
|
||
|
|
"valid_targets_mean": 9884.4,
|
||
|
|
"valid_targets_min": 6098
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.246913580246914,
|
||
|
|
"grad_norm": 0.23745531664425062,
|
||
|
|
"learning_rate": 7.270515277057178e-06,
|
||
|
|
"loss": 0.3704,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12347559630870819,
|
||
|
|
"step": 425,
|
||
|
|
"valid_targets_mean": 10205.5,
|
||
|
|
"valid_targets_min": 4462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.308641975308642,
|
||
|
|
"grad_norm": 0.1974181070193832,
|
||
|
|
"learning_rate": 6.801509302425553e-06,
|
||
|
|
"loss": 0.3728,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13114777207374573,
|
||
|
|
"step": 430,
|
||
|
|
"valid_targets_mean": 10419.2,
|
||
|
|
"valid_targets_min": 6888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.37037037037037,
|
||
|
|
"grad_norm": 0.20324970189648778,
|
||
|
|
"learning_rate": 6.3450228956962915e-06,
|
||
|
|
"loss": 0.3697,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12115702033042908,
|
||
|
|
"step": 435,
|
||
|
|
"valid_targets_mean": 9735.0,
|
||
|
|
"valid_targets_min": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.432098765432099,
|
||
|
|
"grad_norm": 0.21598912192747013,
|
||
|
|
"learning_rate": 5.90148906187706e-06,
|
||
|
|
"loss": 0.3716,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1321217566728592,
|
||
|
|
"step": 440,
|
||
|
|
"valid_targets_mean": 10511.7,
|
||
|
|
"valid_targets_min": 6367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.493827160493828,
|
||
|
|
"grad_norm": 0.18869463173340542,
|
||
|
|
"learning_rate": 5.471328519675521e-06,
|
||
|
|
"loss": 0.3706,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11856191605329514,
|
||
|
|
"step": 445,
|
||
|
|
"valid_targets_mean": 10337.3,
|
||
|
|
"valid_targets_min": 6856
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.555555555555555,
|
||
|
|
"grad_norm": 0.3057251506093963,
|
||
|
|
"learning_rate": 5.054949302422178e-06,
|
||
|
|
"loss": 0.374,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1269569993019104,
|
||
|
|
"step": 450,
|
||
|
|
"valid_targets_mean": 10418.6,
|
||
|
|
"valid_targets_min": 7562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.617283950617284,
|
||
|
|
"grad_norm": 0.18558587270619328,
|
||
|
|
"learning_rate": 4.65274637102606e-06,
|
||
|
|
"loss": 0.37,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12654009461402893,
|
||
|
|
"step": 455,
|
||
|
|
"valid_targets_mean": 10451.2,
|
||
|
|
"valid_targets_min": 6890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.679012345679013,
|
||
|
|
"grad_norm": 0.18925832500358447,
|
||
|
|
"learning_rate": 4.265101239330336e-06,
|
||
|
|
"loss": 0.3692,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11512520909309387,
|
||
|
|
"step": 460,
|
||
|
|
"valid_targets_mean": 9696.8,
|
||
|
|
"valid_targets_min": 6270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.7407407407407405,
|
||
|
|
"grad_norm": 0.2123661843964898,
|
||
|
|
"learning_rate": 3.892381612223348e-06,
|
||
|
|
"loss": 0.3731,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12849582731723785,
|
||
|
|
"step": 465,
|
||
|
|
"valid_targets_mean": 9772.2,
|
||
|
|
"valid_targets_min": 5044
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.802469135802469,
|
||
|
|
"grad_norm": 0.19388821506347134,
|
||
|
|
"learning_rate": 3.534941036848258e-06,
|
||
|
|
"loss": 0.3697,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11666221916675568,
|
||
|
|
"step": 470,
|
||
|
|
"valid_targets_mean": 9419.0,
|
||
|
|
"valid_targets_min": 5724
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.864197530864198,
|
||
|
|
"grad_norm": 0.19304850153428105,
|
||
|
|
"learning_rate": 3.193118567242148e-06,
|
||
|
|
"loss": 0.3717,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12014692276716232,
|
||
|
|
"step": 475,
|
||
|
|
"valid_targets_mean": 10304.2,
|
||
|
|
"valid_targets_min": 6152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.925925925925926,
|
||
|
|
"grad_norm": 0.20024207191873417,
|
||
|
|
"learning_rate": 2.8672384427227484e-06,
|
||
|
|
"loss": 0.3728,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.13074594736099243,
|
||
|
|
"step": 480,
|
||
|
|
"valid_targets_mean": 10167.5,
|
||
|
|
"valid_targets_min": 4046
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.987654320987654,
|
||
|
|
"grad_norm": 0.18822272097106288,
|
||
|
|
"learning_rate": 2.5576097803277833e-06,
|
||
|
|
"loss": 0.3705,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12458598613739014,
|
||
|
|
"step": 485,
|
||
|
|
"valid_targets_mean": 9746.9,
|
||
|
|
"valid_targets_min": 5329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.049382716049383,
|
||
|
|
"grad_norm": 0.1920896017408439,
|
||
|
|
"learning_rate": 2.264526281598762e-06,
|
||
|
|
"loss": 0.3731,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11540110409259796,
|
||
|
|
"step": 490,
|
||
|
|
"valid_targets_mean": 9151.7,
|
||
|
|
"valid_targets_min": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.111111111111111,
|
||
|
|
"grad_norm": 0.1963272832970611,
|
||
|
|
"learning_rate": 1.988265953987254e-06,
|
||
|
|
"loss": 0.3719,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1279304027557373,
|
||
|
|
"step": 495,
|
||
|
|
"valid_targets_mean": 10241.9,
|
||
|
|
"valid_targets_min": 6053
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.172839506172839,
|
||
|
|
"grad_norm": 0.19192168324701758,
|
||
|
|
"learning_rate": 1.7290908471479805e-06,
|
||
|
|
"loss": 0.3697,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11920693516731262,
|
||
|
|
"step": 500,
|
||
|
|
"valid_targets_mean": 9656.2,
|
||
|
|
"valid_targets_min": 3898
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.234567901234568,
|
||
|
|
"grad_norm": 0.1838675547975408,
|
||
|
|
"learning_rate": 1.487246804368876e-06,
|
||
|
|
"loss": 0.3665,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1233929991722107,
|
||
|
|
"step": 505,
|
||
|
|
"valid_targets_mean": 10345.6,
|
||
|
|
"valid_targets_min": 5701
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.296296296296296,
|
||
|
|
"grad_norm": 0.18797572906177723,
|
||
|
|
"learning_rate": 1.2629632293737903e-06,
|
||
|
|
"loss": 0.3707,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12090076506137848,
|
||
|
|
"step": 510,
|
||
|
|
"valid_targets_mean": 9881.3,
|
||
|
|
"valid_targets_min": 6218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.3580246913580245,
|
||
|
|
"grad_norm": 0.20133771310875587,
|
||
|
|
"learning_rate": 1.0564528687191954e-06,
|
||
|
|
"loss": 0.3745,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12166153639554977,
|
||
|
|
"step": 515,
|
||
|
|
"valid_targets_mean": 9838.0,
|
||
|
|
"valid_targets_min": 5384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.419753086419753,
|
||
|
|
"grad_norm": 0.19437281332257697,
|
||
|
|
"learning_rate": 8.679116099911855e-07,
|
||
|
|
"loss": 0.3682,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12425738573074341,
|
||
|
|
"step": 520,
|
||
|
|
"valid_targets_mean": 10094.1,
|
||
|
|
"valid_targets_min": 5805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.481481481481482,
|
||
|
|
"grad_norm": 0.20313423058358762,
|
||
|
|
"learning_rate": 6.975182959942195e-07,
|
||
|
|
"loss": 0.3647,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12808644771575928,
|
||
|
|
"step": 525,
|
||
|
|
"valid_targets_mean": 10183.5,
|
||
|
|
"valid_targets_min": 6429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.54320987654321,
|
||
|
|
"grad_norm": 0.19806939309627028,
|
||
|
|
"learning_rate": 5.454345551079044e-07,
|
||
|
|
"loss": 0.371,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11769580841064453,
|
||
|
|
"step": 530,
|
||
|
|
"valid_targets_mean": 10095.5,
|
||
|
|
"valid_targets_min": 6403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.604938271604938,
|
||
|
|
"grad_norm": 0.19162098246941137,
|
||
|
|
"learning_rate": 4.118046479726823e-07,
|
||
|
|
"loss": 0.3693,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12202047556638718,
|
||
|
|
"step": 535,
|
||
|
|
"valid_targets_mean": 10323.8,
|
||
|
|
"valid_targets_min": 6837
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.666666666666667,
|
||
|
|
"grad_norm": 0.18761681557750176,
|
||
|
|
"learning_rate": 2.9675533064986937e-07,
|
||
|
|
"loss": 0.3699,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1217864602804184,
|
||
|
|
"step": 540,
|
||
|
|
"valid_targets_mean": 9897.5,
|
||
|
|
"valid_targets_min": 5402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.728395061728395,
|
||
|
|
"grad_norm": 0.19040954143447755,
|
||
|
|
"learning_rate": 2.0039573438586091e-07,
|
||
|
|
"loss": 0.3682,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1232626885175705,
|
||
|
|
"step": 545,
|
||
|
|
"valid_targets_mean": 10269.8,
|
||
|
|
"valid_targets_min": 6322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.790123456790123,
|
||
|
|
"grad_norm": 0.1840751374024284,
|
||
|
|
"learning_rate": 1.2281726209452782e-07,
|
||
|
|
"loss": 0.3704,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.1274663656949997,
|
||
|
|
"step": 550,
|
||
|
|
"valid_targets_mean": 10375.0,
|
||
|
|
"valid_targets_min": 5210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.851851851851852,
|
||
|
|
"grad_norm": 0.1884444000769013,
|
||
|
|
"learning_rate": 6.409350165601957e-08,
|
||
|
|
"loss": 0.3645,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.11835850775241852,
|
||
|
|
"step": 555,
|
||
|
|
"valid_targets_mean": 9355.1,
|
||
|
|
"valid_targets_min": 5793
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.91358024691358,
|
||
|
|
"grad_norm": 0.18733829002548932,
|
||
|
|
"learning_rate": 2.4280156114202537e-08,
|
||
|
|
"loss": 0.3741,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.12829899787902832,
|
||
|
|
"step": 560,
|
||
|
|
"valid_targets_mean": 9559.5,
|
||
|
|
"valid_targets_min": 4937
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 6.9753086419753085,
|
||
|
|
"grad_norm": 0.1898364388549884,
|
||
|
|
"learning_rate": 3.414990838945809e-09,
|
||
|
|
"loss": 0.3617,
|
||
|
|
"loss_nan_ranks": 0,
|
||
|
|
"loss_rank_avg": 0.122779980301857,
|
||
|
|
"step": 565,
|
||
|
|
"valid_targets_mean": 10042.1,
|
||
|
|
"valid_targets_min": 5310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 7.0,
|
||
|
|
"step": 567,
|
||
|
|
"total_flos": 2.4619618330916946e+18,
|
||
|
|
"train_loss": 0.0,
|
||
|
|
"train_runtime": 6.5836,
|
||
|
|
"train_samples_per_second": 8267.793,
|
||
|
|
"train_steps_per_second": 86.123
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 567,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 7,
|
||
|
|
"save_steps": 300,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 2.4619618330916946e+18,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|