{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.8,
  "eval_steps": 500,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004,
      "grad_norm": 7.180798530578613,
      "learning_rate": 0.0,
      "loss": 1.4284,
      "step": 1
    },
    {
      "epoch": 0.008,
      "grad_norm": 7.071687698364258,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.2964,
      "step": 2
    },
    {
      "epoch": 0.012,
      "grad_norm": 8.010491371154785,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.5582,
      "step": 3
    },
    {
      "epoch": 0.016,
      "grad_norm": 6.5002946853637695,
      "learning_rate": 3e-06,
      "loss": 1.3382,
      "step": 4
    },
    {
      "epoch": 0.02,
      "grad_norm": 5.291814804077148,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.249,
      "step": 5
    },
    {
      "epoch": 0.024,
      "grad_norm": 5.405974864959717,
      "learning_rate": 5e-06,
      "loss": 1.4124,
      "step": 6
    },
    {
      "epoch": 0.028,
      "grad_norm": 3.7647454738616943,
      "learning_rate": 6e-06,
      "loss": 1.2449,
      "step": 7
    },
    {
      "epoch": 0.032,
      "grad_norm": 4.103897571563721,
      "learning_rate": 7e-06,
      "loss": 1.1926,
      "step": 8
    },
    {
      "epoch": 0.036,
      "grad_norm": 4.49144983291626,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.3012,
      "step": 9
    },
    {
      "epoch": 0.04,
      "grad_norm": 4.010332107543945,
      "learning_rate": 9e-06,
      "loss": 1.1781,
      "step": 10
    },
    {
      "epoch": 0.044,
      "grad_norm": 3.8404905796051025,
      "learning_rate": 1e-05,
      "loss": 1.2206,
      "step": 11
    },
    {
      "epoch": 0.048,
      "grad_norm": 2.9078028202056885,
      "learning_rate": 9.991935483870968e-06,
      "loss": 1.0996,
      "step": 12
    },
    {
      "epoch": 0.052,
      "grad_norm": 3.6060760021209717,
      "learning_rate": 9.983870967741936e-06,
      "loss": 1.4656,
      "step": 13
    },
    {
      "epoch": 0.056,
      "grad_norm": 3.05355167388916,
      "learning_rate": 9.975806451612904e-06,
      "loss": 1.2291,
      "step": 14
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.198852062225342,
      "learning_rate": 9.967741935483871e-06,
      "loss": 1.2603,
      "step": 15
    },
    {
      "epoch": 0.064,
      "grad_norm": 3.001765012741089,
      "learning_rate": 9.959677419354839e-06,
      "loss": 1.2455,
      "step": 16
    },
    {
      "epoch": 0.068,
      "grad_norm": 2.4708402156829834,
      "learning_rate": 9.951612903225807e-06,
      "loss": 1.3416,
      "step": 17
    },
    {
      "epoch": 0.072,
      "grad_norm": 2.4723620414733887,
      "learning_rate": 9.943548387096776e-06,
      "loss": 1.3001,
      "step": 18
    },
    {
      "epoch": 0.076,
      "grad_norm": 2.3858444690704346,
      "learning_rate": 9.935483870967742e-06,
      "loss": 1.3496,
      "step": 19
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.5942535400390625,
      "learning_rate": 9.927419354838711e-06,
      "loss": 1.1709,
      "step": 20
    },
    {
      "epoch": 0.084,
      "grad_norm": 2.6939024925231934,
      "learning_rate": 9.919354838709679e-06,
      "loss": 1.3068,
      "step": 21
    },
    {
      "epoch": 0.088,
      "grad_norm": 2.307511568069458,
      "learning_rate": 9.911290322580645e-06,
      "loss": 1.0283,
      "step": 22
    },
    {
      "epoch": 0.092,
      "grad_norm": 2.2905514240264893,
      "learning_rate": 9.903225806451614e-06,
      "loss": 1.3321,
      "step": 23
    },
    {
      "epoch": 0.096,
      "grad_norm": 2.527710437774658,
      "learning_rate": 9.895161290322582e-06,
      "loss": 1.4547,
      "step": 24
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.5127201080322266,
      "learning_rate": 9.88709677419355e-06,
      "loss": 1.4558,
      "step": 25
    },
    {
      "epoch": 0.104,
      "grad_norm": 2.472888708114624,
      "learning_rate": 9.879032258064517e-06,
      "loss": 1.2011,
      "step": 26
    },
    {
      "epoch": 0.108,
      "grad_norm": 2.0180959701538086,
      "learning_rate": 9.870967741935485e-06,
      "loss": 1.1346,
      "step": 27
    },
    {
      "epoch": 0.112,
      "grad_norm": 2.4276342391967773,
      "learning_rate": 9.862903225806453e-06,
      "loss": 1.4617,
      "step": 28
    },
    {
      "epoch": 0.116,
      "grad_norm": 2.316664218902588,
      "learning_rate": 9.85483870967742e-06,
      "loss": 1.2723,
      "step": 29
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.4735896587371826,
      "learning_rate": 9.846774193548388e-06,
      "loss": 1.1424,
      "step": 30
    },
    {
      "epoch": 0.124,
      "grad_norm": 1.9205927848815918,
      "learning_rate": 9.838709677419356e-06,
      "loss": 1.035,
      "step": 31
    },
    {
      "epoch": 0.128,
      "grad_norm": 2.980786085128784,
      "learning_rate": 9.830645161290323e-06,
      "loss": 1.0171,
      "step": 32
    },
    {
      "epoch": 0.132,
      "grad_norm": 2.645132303237915,
      "learning_rate": 9.822580645161291e-06,
      "loss": 1.5191,
      "step": 33
    },
    {
      "epoch": 0.136,
      "grad_norm": 2.342191696166992,
      "learning_rate": 9.814516129032259e-06,
      "loss": 1.5577,
      "step": 34
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.1698765754699707,
      "learning_rate": 9.806451612903226e-06,
      "loss": 1.3242,
      "step": 35
    },
    {
      "epoch": 0.144,
      "grad_norm": 2.1578097343444824,
      "learning_rate": 9.798387096774194e-06,
      "loss": 1.297,
      "step": 36
    },
    {
      "epoch": 0.148,
      "grad_norm": 2.3967559337615967,
      "learning_rate": 9.790322580645162e-06,
      "loss": 1.4494,
      "step": 37
    },
    {
      "epoch": 0.152,
      "grad_norm": 2.436760902404785,
      "learning_rate": 9.782258064516131e-06,
      "loss": 1.6418,
      "step": 38
    },
    {
      "epoch": 0.156,
      "grad_norm": 2.287909507751465,
      "learning_rate": 9.774193548387097e-06,
      "loss": 1.3089,
      "step": 39
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.4291112422943115,
      "learning_rate": 9.766129032258065e-06,
      "loss": 1.1915,
      "step": 40
    },
    {
      "epoch": 0.164,
      "grad_norm": 2.0664143562316895,
      "learning_rate": 9.758064516129034e-06,
      "loss": 1.3664,
      "step": 41
    },
    {
      "epoch": 0.168,
      "grad_norm": 2.0575056076049805,
      "learning_rate": 9.75e-06,
      "loss": 1.0187,
      "step": 42
    },
    {
      "epoch": 0.172,
      "grad_norm": 2.311537742614746,
      "learning_rate": 9.74193548387097e-06,
      "loss": 1.3643,
      "step": 43
    },
    {
      "epoch": 0.176,
      "grad_norm": 2.620941400527954,
      "learning_rate": 9.733870967741937e-06,
      "loss": 1.4038,
      "step": 44
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.0609350204467773,
      "learning_rate": 9.725806451612903e-06,
      "loss": 1.0915,
      "step": 45
    },
    {
      "epoch": 0.184,
      "grad_norm": 2.421088218688965,
      "learning_rate": 9.717741935483872e-06,
      "loss": 1.1979,
      "step": 46
    },
    {
      "epoch": 0.188,
      "grad_norm": 2.348494291305542,
      "learning_rate": 9.70967741935484e-06,
      "loss": 1.2259,
      "step": 47
    },
    {
      "epoch": 0.192,
      "grad_norm": 2.0419907569885254,
      "learning_rate": 9.701612903225807e-06,
      "loss": 1.3981,
      "step": 48
    },
    {
      "epoch": 0.196,
      "grad_norm": 2.134453058242798,
      "learning_rate": 9.693548387096775e-06,
      "loss": 1.1282,
      "step": 49
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.2903339862823486,
      "learning_rate": 9.685483870967743e-06,
      "loss": 1.0865,
      "step": 50
    },
    {
      "epoch": 0.204,
      "grad_norm": 1.951794981956482,
      "learning_rate": 9.67741935483871e-06,
      "loss": 1.2592,
      "step": 51
    },
    {
      "epoch": 0.208,
      "grad_norm": 1.9836851358413696,
      "learning_rate": 9.669354838709678e-06,
      "loss": 0.9517,
      "step": 52
    },
    {
      "epoch": 0.212,
      "grad_norm": 1.9208632707595825,
      "learning_rate": 9.661290322580646e-06,
      "loss": 1.2101,
      "step": 53
    },
    {
      "epoch": 0.216,
      "grad_norm": 2.3081140518188477,
      "learning_rate": 9.653225806451613e-06,
      "loss": 1.4485,
      "step": 54
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.1603920459747314,
      "learning_rate": 9.645161290322581e-06,
      "loss": 1.2366,
      "step": 55
    },
    {
      "epoch": 0.224,
      "grad_norm": 2.097900152206421,
      "learning_rate": 9.637096774193549e-06,
      "loss": 1.2874,
      "step": 56
    },
    {
      "epoch": 0.228,
      "grad_norm": 2.3777639865875244,
      "learning_rate": 9.629032258064516e-06,
      "loss": 1.7109,
      "step": 57
    },
    {
      "epoch": 0.232,
      "grad_norm": 2.0834310054779053,
      "learning_rate": 9.620967741935484e-06,
      "loss": 1.0803,
      "step": 58
    },
    {
      "epoch": 0.236,
      "grad_norm": 2.084362030029297,
      "learning_rate": 9.612903225806453e-06,
      "loss": 1.2013,
      "step": 59
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.0952343940734863,
      "learning_rate": 9.60483870967742e-06,
      "loss": 1.0039,
      "step": 60
    },
    {
      "epoch": 0.244,
      "grad_norm": 2.1214962005615234,
      "learning_rate": 9.596774193548389e-06,
      "loss": 1.2694,
      "step": 61
    },
    {
      "epoch": 0.248,
      "grad_norm": 2.306159734725952,
      "learning_rate": 9.588709677419356e-06,
      "loss": 1.325,
      "step": 62
    },
    {
      "epoch": 0.252,
      "grad_norm": 1.8500971794128418,
      "learning_rate": 9.580645161290322e-06,
      "loss": 1.0988,
      "step": 63
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.684373140335083,
      "learning_rate": 9.572580645161292e-06,
      "loss": 1.6293,
      "step": 64
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.1704890727996826,
      "learning_rate": 9.56451612903226e-06,
      "loss": 1.4308,
      "step": 65
    },
    {
      "epoch": 0.264,
      "grad_norm": 2.252781629562378,
      "learning_rate": 9.556451612903227e-06,
      "loss": 1.3501,
      "step": 66
    },
    {
      "epoch": 0.268,
      "grad_norm": 1.9889864921569824,
      "learning_rate": 9.548387096774195e-06,
      "loss": 1.4615,
      "step": 67
    },
    {
      "epoch": 0.272,
      "grad_norm": 2.2946317195892334,
      "learning_rate": 9.540322580645162e-06,
      "loss": 1.3951,
      "step": 68
    },
    {
      "epoch": 0.276,
      "grad_norm": 2.1460459232330322,
      "learning_rate": 9.53225806451613e-06,
      "loss": 1.1989,
      "step": 69
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.1480677127838135,
      "learning_rate": 9.524193548387098e-06,
      "loss": 1.3122,
      "step": 70
    },
    {
      "epoch": 0.284,
      "grad_norm": 1.9918060302734375,
      "learning_rate": 9.516129032258065e-06,
      "loss": 0.8457,
      "step": 71
    },
    {
      "epoch": 0.288,
      "grad_norm": 2.288792371749878,
      "learning_rate": 9.508064516129033e-06,
      "loss": 0.9798,
      "step": 72
    },
    {
      "epoch": 0.292,
      "grad_norm": 1.8729608058929443,
      "learning_rate": 9.5e-06,
      "loss": 1.0014,
      "step": 73
    },
    {
      "epoch": 0.296,
      "grad_norm": 2.185215473175049,
      "learning_rate": 9.491935483870968e-06,
      "loss": 1.1505,
      "step": 74
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.0389175415039062,
      "learning_rate": 9.483870967741936e-06,
      "loss": 1.1004,
      "step": 75
    },
    {
      "epoch": 0.304,
      "grad_norm": 1.9789763689041138,
      "learning_rate": 9.475806451612905e-06,
      "loss": 1.2499,
      "step": 76
    },
    {
      "epoch": 0.308,
      "grad_norm": 2.25372052192688,
      "learning_rate": 9.467741935483871e-06,
      "loss": 1.2954,
      "step": 77
    },
    {
      "epoch": 0.312,
      "grad_norm": 2.0553982257843018,
      "learning_rate": 9.459677419354839e-06,
      "loss": 1.1647,
      "step": 78
    },
    {
      "epoch": 0.316,
      "grad_norm": 2.0189993381500244,
      "learning_rate": 9.451612903225808e-06,
      "loss": 1.263,
      "step": 79
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.3836658000946045,
      "learning_rate": 9.443548387096774e-06,
      "loss": 1.2895,
      "step": 80
    },
    {
      "epoch": 0.324,
      "grad_norm": 2.480907678604126,
      "learning_rate": 9.435483870967743e-06,
      "loss": 1.4312,
      "step": 81
    },
    {
      "epoch": 0.328,
      "grad_norm": 2.28787899017334,
      "learning_rate": 9.427419354838711e-06,
      "loss": 1.2719,
      "step": 82
    },
    {
      "epoch": 0.332,
      "grad_norm": 2.060723066329956,
      "learning_rate": 9.419354838709677e-06,
      "loss": 1.2143,
      "step": 83
    },
    {
      "epoch": 0.336,
      "grad_norm": 1.9831432104110718,
      "learning_rate": 9.411290322580646e-06,
      "loss": 1.1807,
      "step": 84
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.140202283859253,
      "learning_rate": 9.403225806451614e-06,
      "loss": 1.4506,
      "step": 85
    },
    {
      "epoch": 0.344,
      "grad_norm": 1.962363600730896,
      "learning_rate": 9.395161290322582e-06,
      "loss": 1.0856,
      "step": 86
    },
    {
      "epoch": 0.348,
      "grad_norm": 2.1660656929016113,
      "learning_rate": 9.38709677419355e-06,
      "loss": 1.4112,
      "step": 87
    },
    {
      "epoch": 0.352,
      "grad_norm": 2.1091177463531494,
      "learning_rate": 9.379032258064517e-06,
      "loss": 1.429,
      "step": 88
    },
    {
      "epoch": 0.356,
      "grad_norm": 2.195801019668579,
      "learning_rate": 9.370967741935485e-06,
      "loss": 0.9717,
      "step": 89
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.0965685844421387,
      "learning_rate": 9.362903225806452e-06,
      "loss": 1.0021,
      "step": 90
    },
    {
      "epoch": 0.364,
      "grad_norm": 2.0085933208465576,
      "learning_rate": 9.35483870967742e-06,
      "loss": 1.0799,
      "step": 91
    },
    {
      "epoch": 0.368,
      "grad_norm": 2.3196582794189453,
      "learning_rate": 9.346774193548388e-06,
      "loss": 1.4787,
      "step": 92
    },
    {
      "epoch": 0.372,
      "grad_norm": 1.9600956439971924,
      "learning_rate": 9.338709677419355e-06,
      "loss": 1.0858,
      "step": 93
    },
    {
      "epoch": 0.376,
      "grad_norm": 1.9554862976074219,
      "learning_rate": 9.330645161290323e-06,
      "loss": 1.1764,
      "step": 94
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.0512537956237793,
      "learning_rate": 9.32258064516129e-06,
      "loss": 1.4589,
      "step": 95
    },
    {
      "epoch": 0.384,
      "grad_norm": 2.0751116275787354,
      "learning_rate": 9.314516129032258e-06,
      "loss": 0.9747,
      "step": 96
    },
    {
      "epoch": 0.388,
      "grad_norm": 2.3022332191467285,
      "learning_rate": 9.306451612903226e-06,
      "loss": 1.1592,
      "step": 97
    },
    {
      "epoch": 0.392,
      "grad_norm": 1.7814546823501587,
      "learning_rate": 9.298387096774194e-06,
      "loss": 0.8474,
      "step": 98
    },
    {
      "epoch": 0.396,
      "grad_norm": 2.174898862838745,
      "learning_rate": 9.290322580645163e-06,
      "loss": 1.2139,
      "step": 99
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.0458245277404785,
      "learning_rate": 9.28225806451613e-06,
      "loss": 1.427,
      "step": 100
    },
    {
      "epoch": 0.404,
      "grad_norm": 1.9763966798782349,
      "learning_rate": 9.274193548387097e-06,
      "loss": 1.3954,
      "step": 101
    },
    {
      "epoch": 0.408,
      "grad_norm": 2.1670455932617188,
      "learning_rate": 9.266129032258066e-06,
      "loss": 0.9883,
      "step": 102
    },
    {
      "epoch": 0.412,
      "grad_norm": 1.9239592552185059,
      "learning_rate": 9.258064516129034e-06,
      "loss": 1.0499,
      "step": 103
    },
    {
      "epoch": 0.416,
      "grad_norm": 2.060406446456909,
      "learning_rate": 9.250000000000001e-06,
      "loss": 1.2469,
      "step": 104
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.253674030303955,
      "learning_rate": 9.241935483870969e-06,
      "loss": 1.5057,
      "step": 105
    },
    {
      "epoch": 0.424,
      "grad_norm": 2.1826705932617188,
      "learning_rate": 9.233870967741937e-06,
      "loss": 1.4926,
      "step": 106
    },
    {
      "epoch": 0.428,
      "grad_norm": 2.0552401542663574,
      "learning_rate": 9.225806451612904e-06,
      "loss": 1.4567,
      "step": 107
    },
    {
      "epoch": 0.432,
      "grad_norm": 2.0697274208068848,
      "learning_rate": 9.217741935483872e-06,
      "loss": 1.4088,
      "step": 108
    },
    {
      "epoch": 0.436,
      "grad_norm": 1.7860722541809082,
      "learning_rate": 9.20967741935484e-06,
      "loss": 0.9367,
      "step": 109
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.535959243774414,
      "learning_rate": 9.201612903225807e-06,
      "loss": 0.9932,
      "step": 110
    },
    {
      "epoch": 0.444,
      "grad_norm": 2.0369575023651123,
      "learning_rate": 9.193548387096775e-06,
      "loss": 1.2099,
      "step": 111
    },
    {
      "epoch": 0.448,
      "grad_norm": 1.94306480884552,
      "learning_rate": 9.185483870967742e-06,
      "loss": 1.0501,
      "step": 112
    },
    {
      "epoch": 0.452,
      "grad_norm": 2.0562283992767334,
      "learning_rate": 9.17741935483871e-06,
      "loss": 1.3824,
      "step": 113
    },
    {
      "epoch": 0.456,
      "grad_norm": 1.9372371435165405,
      "learning_rate": 9.16935483870968e-06,
      "loss": 1.0479,
      "step": 114
    },
    {
      "epoch": 0.46,
      "grad_norm": 2.2749247550964355,
      "learning_rate": 9.161290322580645e-06,
      "loss": 1.3404,
      "step": 115
    },
    {
      "epoch": 0.464,
      "grad_norm": 2.251894950866699,
      "learning_rate": 9.153225806451613e-06,
      "loss": 1.5167,
      "step": 116
    },
    {
      "epoch": 0.468,
      "grad_norm": 2.301532030105591,
      "learning_rate": 9.145161290322582e-06,
      "loss": 1.003,
      "step": 117
    },
    {
      "epoch": 0.472,
      "grad_norm": 2.2831201553344727,
      "learning_rate": 9.137096774193548e-06,
      "loss": 1.304,
      "step": 118
    },
    {
      "epoch": 0.476,
      "grad_norm": 2.0962982177734375,
      "learning_rate": 9.129032258064518e-06,
      "loss": 1.1256,
      "step": 119
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.9318597316741943,
      "learning_rate": 9.120967741935485e-06,
      "loss": 0.8632,
      "step": 120
    },
    {
      "epoch": 0.484,
      "grad_norm": 2.1837871074676514,
      "learning_rate": 9.112903225806451e-06,
      "loss": 1.481,
      "step": 121
    },
    {
      "epoch": 0.488,
      "grad_norm": 2.128391742706299,
      "learning_rate": 9.10483870967742e-06,
      "loss": 1.0906,
      "step": 122
    },
    {
      "epoch": 0.492,
      "grad_norm": 1.8772141933441162,
      "learning_rate": 9.096774193548388e-06,
      "loss": 1.0969,
      "step": 123
    },
    {
      "epoch": 0.496,
      "grad_norm": 2.020388603210449,
      "learning_rate": 9.088709677419354e-06,
      "loss": 1.1063,
      "step": 124
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.527641773223877,
      "learning_rate": 9.080645161290324e-06,
      "loss": 1.1885,
      "step": 125
    },
    {
      "epoch": 0.504,
      "grad_norm": 2.061718702316284,
      "learning_rate": 9.072580645161291e-06,
      "loss": 1.2897,
      "step": 126
    },
    {
      "epoch": 0.508,
      "grad_norm": 2.0752320289611816,
      "learning_rate": 9.064516129032259e-06,
      "loss": 1.1053,
      "step": 127
    },
    {
      "epoch": 0.512,
      "grad_norm": 2.0922017097473145,
      "learning_rate": 9.056451612903227e-06,
      "loss": 1.1852,
      "step": 128
    },
    {
      "epoch": 0.516,
      "grad_norm": 2.303179979324341,
      "learning_rate": 9.048387096774194e-06,
      "loss": 1.2175,
      "step": 129
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.902031660079956,
      "learning_rate": 9.040322580645162e-06,
      "loss": 0.9485,
      "step": 130
    },
    {
      "epoch": 0.524,
      "grad_norm": 2.122629165649414,
      "learning_rate": 9.03225806451613e-06,
      "loss": 1.1103,
      "step": 131
    },
    {
      "epoch": 0.528,
      "grad_norm": 2.079432725906372,
      "learning_rate": 9.024193548387097e-06,
      "loss": 1.3427,
      "step": 132
    },
    {
      "epoch": 0.532,
      "grad_norm": 2.4613113403320312,
      "learning_rate": 9.016129032258065e-06,
      "loss": 1.3931,
      "step": 133
    },
    {
      "epoch": 0.536,
      "grad_norm": 2.04841685295105,
      "learning_rate": 9.008064516129033e-06,
      "loss": 1.4206,
      "step": 134
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.021791934967041,
      "learning_rate": 9e-06,
      "loss": 1.2454,
      "step": 135
    },
    {
      "epoch": 0.544,
      "grad_norm": 2.0248231887817383,
      "learning_rate": 8.991935483870968e-06,
      "loss": 1.1466,
      "step": 136
    },
    {
      "epoch": 0.548,
      "grad_norm": 2.7430648803710938,
      "learning_rate": 8.983870967741937e-06,
      "loss": 1.1914,
      "step": 137
    },
    {
      "epoch": 0.552,
      "grad_norm": 2.1386618614196777,
      "learning_rate": 8.975806451612903e-06,
      "loss": 1.1773,
      "step": 138
    },
    {
      "epoch": 0.556,
      "grad_norm": 1.9300769567489624,
      "learning_rate": 8.967741935483871e-06,
      "loss": 1.0833,
      "step": 139
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.113100290298462,
      "learning_rate": 8.95967741935484e-06,
      "loss": 1.33,
      "step": 140
    },
    {
      "epoch": 0.564,
      "grad_norm": 1.8950531482696533,
      "learning_rate": 8.951612903225806e-06,
      "loss": 0.961,
      "step": 141
    },
    {
      "epoch": 0.568,
      "grad_norm": 2.254307985305786,
      "learning_rate": 8.943548387096776e-06,
      "loss": 1.0956,
      "step": 142
    },
    {
      "epoch": 0.572,
      "grad_norm": 2.296546697616577,
      "learning_rate": 8.935483870967743e-06,
      "loss": 0.9973,
      "step": 143
    },
    {
      "epoch": 0.576,
      "grad_norm": 2.0698964595794678,
      "learning_rate": 8.92741935483871e-06,
      "loss": 1.1144,
      "step": 144
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.9619230031967163,
      "learning_rate": 8.919354838709678e-06,
      "loss": 0.9931,
      "step": 145
    },
    {
      "epoch": 0.584,
      "grad_norm": 1.7675387859344482,
      "learning_rate": 8.911290322580646e-06,
      "loss": 0.9522,
      "step": 146
    },
    {
      "epoch": 0.588,
      "grad_norm": 1.7958853244781494,
      "learning_rate": 8.903225806451614e-06,
      "loss": 0.9351,
      "step": 147
    },
    {
      "epoch": 0.592,
      "grad_norm": 2.1336379051208496,
      "learning_rate": 8.895161290322581e-06,
      "loss": 1.3449,
      "step": 148
    },
    {
      "epoch": 0.596,
      "grad_norm": 1.9022713899612427,
      "learning_rate": 8.887096774193549e-06,
      "loss": 1.2237,
      "step": 149
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.8845552206039429,
      "learning_rate": 8.879032258064517e-06,
      "loss": 1.0805,
      "step": 150
    },
    {
      "epoch": 0.604,
      "grad_norm": 1.8935421705245972,
      "learning_rate": 8.870967741935484e-06,
      "loss": 0.9688,
      "step": 151
    },
    {
      "epoch": 0.608,
      "grad_norm": 2.0399153232574463,
      "learning_rate": 8.862903225806452e-06,
      "loss": 0.826,
      "step": 152
    },
    {
      "epoch": 0.612,
      "grad_norm": 2.116387128829956,
      "learning_rate": 8.85483870967742e-06,
      "loss": 1.0629,
      "step": 153
    },
    {
      "epoch": 0.616,
      "grad_norm": 2.244560480117798,
      "learning_rate": 8.846774193548387e-06,
      "loss": 1.2835,
      "step": 154
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.0261640548706055,
      "learning_rate": 8.838709677419357e-06,
      "loss": 0.8826,
      "step": 155
    },
    {
      "epoch": 0.624,
      "grad_norm": 2.364264726638794,
      "learning_rate": 8.830645161290323e-06,
      "loss": 1.0676,
      "step": 156
    },
    {
      "epoch": 0.628,
      "grad_norm": 2.4923110008239746,
      "learning_rate": 8.82258064516129e-06,
      "loss": 1.5691,
      "step": 157
    },
    {
      "epoch": 0.632,
      "grad_norm": 1.8004636764526367,
      "learning_rate": 8.81451612903226e-06,
      "loss": 0.8545,
      "step": 158
    },
    {
      "epoch": 0.636,
      "grad_norm": 2.0030720233917236,
      "learning_rate": 8.806451612903226e-06,
      "loss": 1.1769,
      "step": 159
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.9849048852920532,
      "learning_rate": 8.798387096774195e-06,
      "loss": 1.0444,
      "step": 160
    },
    {
      "epoch": 0.644,
      "grad_norm": 1.9952068328857422,
      "learning_rate": 8.790322580645163e-06,
      "loss": 1.2118,
      "step": 161
    },
    {
      "epoch": 0.648,
      "grad_norm": 2.0582053661346436,
      "learning_rate": 8.782258064516129e-06,
      "loss": 1.4597,
      "step": 162
    },
    {
      "epoch": 0.652,
      "grad_norm": 1.863604187965393,
      "learning_rate": 8.774193548387098e-06,
      "loss": 1.3389,
      "step": 163
    },
    {
      "epoch": 0.656,
      "grad_norm": 1.4981471300125122,
      "learning_rate": 8.766129032258066e-06,
      "loss": 0.7606,
      "step": 164
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.0630156993865967,
      "learning_rate": 8.758064516129033e-06,
      "loss": 0.778,
      "step": 165
    },
    {
      "epoch": 0.664,
      "grad_norm": 2.324496030807495,
      "learning_rate": 8.750000000000001e-06,
      "loss": 1.713,
      "step": 166
    },
    {
      "epoch": 0.668,
      "grad_norm": 1.8909348249435425,
      "learning_rate": 8.741935483870969e-06,
      "loss": 1.1013,
      "step": 167
    },
    {
      "epoch": 0.672,
      "grad_norm": 2.0191423892974854,
      "learning_rate": 8.733870967741936e-06,
      "loss": 1.4583,
      "step": 168
    },
    {
      "epoch": 0.676,
      "grad_norm": 2.192335605621338,
      "learning_rate": 8.725806451612904e-06,
      "loss": 1.0042,
      "step": 169
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.9449669122695923,
      "learning_rate": 8.717741935483872e-06,
      "loss": 1.2174,
      "step": 170
    },
    {
      "epoch": 0.684,
      "grad_norm": 2.0404837131500244,
      "learning_rate": 8.70967741935484e-06,
      "loss": 1.4411,
      "step": 171
    },
    {
      "epoch": 0.688,
      "grad_norm": 2.162858486175537,
      "learning_rate": 8.701612903225807e-06,
      "loss": 1.593,
      "step": 172
    },
    {
      "epoch": 0.692,
      "grad_norm": 2.00299072265625,
      "learning_rate": 8.693548387096775e-06,
      "loss": 1.0078,
      "step": 173
    },
    {
      "epoch": 0.696,
      "grad_norm": 1.987657070159912,
      "learning_rate": 8.685483870967742e-06,
      "loss": 1.2282,
      "step": 174
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.8024024963378906,
      "learning_rate": 8.677419354838712e-06,
      "loss": 0.9503,
      "step": 175
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.9623843431472778,
      "learning_rate": 8.669354838709677e-06,
      "loss": 1.1567,
      "step": 176
    },
    {
      "epoch": 0.708,
      "grad_norm": 1.9090282917022705,
      "learning_rate": 8.661290322580645e-06,
      "loss": 0.9131,
      "step": 177
    },
    {
      "epoch": 0.712,
      "grad_norm": 1.9411958456039429,
      "learning_rate": 8.653225806451614e-06,
      "loss": 1.0084,
      "step": 178
    },
    {
      "epoch": 0.716,
      "grad_norm": 1.9970715045928955,
      "learning_rate": 8.64516129032258e-06,
      "loss": 1.0465,
      "step": 179
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.0260093212127686,
      "learning_rate": 8.63709677419355e-06,
      "loss": 1.1494,
      "step": 180
    },
    {
      "epoch": 0.724,
      "grad_norm": 2.1397087574005127,
      "learning_rate": 8.629032258064517e-06,
      "loss": 1.14,
      "step": 181
    },
    {
      "epoch": 0.728,
      "grad_norm": 2.3102598190307617,
      "learning_rate": 8.620967741935483e-06,
      "loss": 1.0327,
      "step": 182
    },
    {
      "epoch": 0.732,
      "grad_norm": 1.8367112874984741,
      "learning_rate": 8.612903225806453e-06,
      "loss": 1.0707,
      "step": 183
    },
    {
      "epoch": 0.736,
      "grad_norm": 1.896563172340393,
      "learning_rate": 8.60483870967742e-06,
      "loss": 1.1511,
      "step": 184
    },
    {
      "epoch": 0.74,
      "grad_norm": 2.064681053161621,
      "learning_rate": 8.596774193548388e-06,
      "loss": 1.3135,
      "step": 185
    },
    {
      "epoch": 0.744,
      "grad_norm": 2.307497024536133,
      "learning_rate": 8.588709677419356e-06,
      "loss": 1.0588,
      "step": 186
    },
    {
      "epoch": 0.748,
      "grad_norm": 1.8731898069381714,
      "learning_rate": 8.580645161290323e-06,
      "loss": 1.0906,
      "step": 187
    },
    {
      "epoch": 0.752,
      "grad_norm": 1.9699805974960327,
      "learning_rate": 8.572580645161291e-06,
      "loss": 1.1101,
      "step": 188
    },
    {
      "epoch": 0.756,
      "grad_norm": 1.9156608581542969,
      "learning_rate": 8.564516129032259e-06,
      "loss": 0.9281,
      "step": 189
    },
    {
      "epoch": 0.76,
      "grad_norm": 2.0008058547973633,
      "learning_rate": 8.556451612903226e-06,
      "loss": 1.2469,
      "step": 190
    },
    {
      "epoch": 0.764,
      "grad_norm": 2.0837509632110596,
      "learning_rate": 8.548387096774194e-06,
      "loss": 1.1126,
      "step": 191
    },
    {
      "epoch": 0.768,
      "grad_norm": 2.146651029586792,
      "learning_rate": 8.540322580645162e-06,
      "loss": 0.9677,
      "step": 192
    },
    {
      "epoch": 0.772,
      "grad_norm": 1.7224905490875244,
      "learning_rate": 8.53225806451613e-06,
      "loss": 0.8243,
      "step": 193
    },
    {
      "epoch": 0.776,
      "grad_norm": 2.0242717266082764,
      "learning_rate": 8.524193548387097e-06,
      "loss": 0.9788,
      "step": 194
    },
    {
      "epoch": 0.78,
      "grad_norm": 2.278810739517212,
      "learning_rate": 8.516129032258065e-06,
      "loss": 1.7079,
      "step": 195
    },
    {
      "epoch": 0.784,
      "grad_norm": 2.4467318058013916,
      "learning_rate": 8.508064516129034e-06,
      "loss": 1.1674,
      "step": 196
    },
    {
      "epoch": 0.788,
      "grad_norm": 2.0715184211730957,
      "learning_rate": 8.5e-06,
      "loss": 1.1899,
      "step": 197
    },
    {
      "epoch": 0.792,
      "grad_norm": 2.1980104446411133,
      "learning_rate": 8.49193548387097e-06,
      "loss": 1.3366,
      "step": 198
    },
    {
      "epoch": 0.796,
      "grad_norm": 1.8685814142227173,
      "learning_rate": 8.483870967741937e-06,
      "loss": 0.9196,
      "step": 199
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.1090056896209717,
      "learning_rate": 8.475806451612903e-06,
      "loss": 1.1189,
      "step": 200
    },
    {
      "epoch": 0.804,
      "grad_norm": 1.9667922258377075,
      "learning_rate": 8.467741935483872e-06,
      "loss": 0.9924,
      "step": 201
    },
    {
      "epoch": 0.808,
      "grad_norm": 2.147761106491089,
      "learning_rate": 8.45967741935484e-06,
      "loss": 1.1266,
      "step": 202
    },
    {
      "epoch": 0.812,
      "grad_norm": 1.9285451173782349,
      "learning_rate": 8.451612903225808e-06,
      "loss": 1.0698,
      "step": 203
    },
    {
      "epoch": 0.816,
      "grad_norm": 1.980491280555725,
      "learning_rate": 8.443548387096775e-06,
      "loss": 1.3168,
      "step": 204
    },
    {
      "epoch": 0.82,
      "grad_norm": 2.2818551063537598,
      "learning_rate": 8.435483870967743e-06,
      "loss": 1.0842,
      "step": 205
    },
    {
      "epoch": 0.824,
      "grad_norm": 1.848507285118103,
      "learning_rate": 8.42741935483871e-06,
      "loss": 1.1319,
      "step": 206
    },
    {
      "epoch": 0.828,
      "grad_norm": 2.248605489730835,
      "learning_rate": 8.419354838709678e-06,
      "loss": 1.3158,
      "step": 207
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.948811650276184,
      "learning_rate": 8.411290322580646e-06,
      "loss": 1.0943,
      "step": 208
    },
    {
      "epoch": 0.836,
      "grad_norm": 1.8275185823440552,
      "learning_rate": 8.403225806451613e-06,
      "loss": 1.1819,
      "step": 209
    },
    {
      "epoch": 0.84,
      "grad_norm": 2.0042636394500732,
      "learning_rate": 8.395161290322581e-06,
      "loss": 1.0532,
      "step": 210
    },
    {
      "epoch": 0.844,
      "grad_norm": 1.974337100982666,
      "learning_rate": 8.387096774193549e-06,
      "loss": 1.2762,
      "step": 211
    },
    {
      "epoch": 0.848,
      "grad_norm": 1.8739511966705322,
      "learning_rate": 8.379032258064516e-06,
      "loss": 0.9527,
      "step": 212
    },
    {
      "epoch": 0.852,
      "grad_norm": 2.174652338027954,
      "learning_rate": 8.370967741935484e-06,
      "loss": 1.3532,
      "step": 213
    },
    {
      "epoch": 0.856,
      "grad_norm": 2.062572956085205,
      "learning_rate": 8.362903225806452e-06,
      "loss": 0.9614,
      "step": 214
    },
    {
      "epoch": 0.86,
      "grad_norm": 1.943835973739624,
      "learning_rate": 8.35483870967742e-06,
      "loss": 1.2107,
      "step": 215
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.712361454963684,
      "learning_rate": 8.346774193548389e-06,
      "loss": 0.9988,
      "step": 216
    },
    {
      "epoch": 0.868,
      "grad_norm": 1.9365915060043335,
      "learning_rate": 8.338709677419355e-06,
      "loss": 1.3026,
      "step": 217
    },
    {
      "epoch": 0.872,
      "grad_norm": 1.9738059043884277,
      "learning_rate": 8.330645161290322e-06,
      "loss": 1.5525,
      "step": 218
    },
    {
      "epoch": 0.876,
      "grad_norm": 1.974971890449524,
      "learning_rate": 8.322580645161292e-06,
      "loss": 1.0492,
      "step": 219
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.9549391269683838,
      "learning_rate": 8.314516129032258e-06,
      "loss": 1.0535,
      "step": 220
    },
    {
      "epoch": 0.884,
      "grad_norm": 1.8761863708496094,
      "learning_rate": 8.306451612903227e-06,
      "loss": 1.3403,
      "step": 221
    },
    {
      "epoch": 0.888,
      "grad_norm": 1.9009919166564941,
      "learning_rate": 8.298387096774195e-06,
      "loss": 1.0114,
      "step": 222
    },
    {
      "epoch": 0.892,
      "grad_norm": 1.8550859689712524,
      "learning_rate": 8.29032258064516e-06,
      "loss": 1.1675,
      "step": 223
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.727038860321045,
      "learning_rate": 8.28225806451613e-06,
      "loss": 0.9587,
      "step": 224
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.8912358283996582,
      "learning_rate": 8.274193548387098e-06,
      "loss": 1.101,
      "step": 225
    },
    {
      "epoch": 0.904,
      "grad_norm": 1.8172194957733154,
      "learning_rate": 8.266129032258065e-06,
      "loss": 1.0145,
      "step": 226
    },
    {
      "epoch": 0.908,
      "grad_norm": 2.0112392902374268,
      "learning_rate": 8.258064516129033e-06,
      "loss": 1.1499,
      "step": 227
    },
    {
      "epoch": 0.912,
      "grad_norm": 2.0185554027557373,
      "learning_rate": 8.25e-06,
      "loss": 1.5163,
      "step": 228
    },
    {
      "epoch": 0.916,
      "grad_norm": 1.9389221668243408,
      "learning_rate": 8.241935483870968e-06,
      "loss": 1.0711,
      "step": 229
    },
    {
      "epoch": 0.92,
      "grad_norm": 2.015944004058838,
      "learning_rate": 8.233870967741936e-06,
      "loss": 1.1014,
      "step": 230
    },
    {
      "epoch": 0.924,
      "grad_norm": 1.83272385597229,
      "learning_rate": 8.225806451612904e-06,
      "loss": 0.8464,
      "step": 231
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.9649468660354614,
      "learning_rate": 8.217741935483871e-06,
      "loss": 1.1034,
      "step": 232
    },
    {
      "epoch": 0.932,
      "grad_norm": 2.1457502841949463,
      "learning_rate": 8.209677419354839e-06,
      "loss": 1.3939,
      "step": 233
    },
    {
      "epoch": 0.936,
      "grad_norm": 1.770036220550537,
      "learning_rate": 8.201612903225807e-06,
      "loss": 0.9748,
      "step": 234
    },
    {
      "epoch": 0.94,
      "grad_norm": 1.947691559791565,
      "learning_rate": 8.193548387096774e-06,
      "loss": 1.014,
      "step": 235
    },
    {
      "epoch": 0.944,
      "grad_norm": 2.1747374534606934,
      "learning_rate": 8.185483870967744e-06,
      "loss": 1.2469,
      "step": 236
    },
    {
      "epoch": 0.948,
      "grad_norm": 2.02323579788208,
      "learning_rate": 8.177419354838711e-06,
      "loss": 1.2203,
      "step": 237
    },
    {
      "epoch": 0.952,
      "grad_norm": 1.9631866216659546,
      "learning_rate": 8.169354838709677e-06,
      "loss": 1.0833,
      "step": 238
    },
    {
      "epoch": 0.956,
      "grad_norm": 2.2246932983398438,
      "learning_rate": 8.161290322580647e-06,
      "loss": 1.4365,
      "step": 239
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.8288980722427368,
      "learning_rate": 8.153225806451614e-06,
      "loss": 0.9002,
      "step": 240
    },
    {
      "epoch": 0.964,
      "grad_norm": 2.1047258377075195,
      "learning_rate": 8.145161290322582e-06,
      "loss": 1.0669,
      "step": 241
    },
    {
      "epoch": 0.968,
      "grad_norm": 2.1168065071105957,
      "learning_rate": 8.13709677419355e-06,
      "loss": 1.1497,
      "step": 242
    },
    {
      "epoch": 0.972,
      "grad_norm": 2.0137243270874023,
      "learning_rate": 8.129032258064517e-06,
      "loss": 0.9738,
      "step": 243
    },
    {
      "epoch": 0.976,
      "grad_norm": 1.6851848363876343,
      "learning_rate": 8.120967741935485e-06,
      "loss": 0.8467,
      "step": 244
    },
    {
      "epoch": 0.98,
      "grad_norm": 1.9099082946777344,
      "learning_rate": 8.112903225806452e-06,
      "loss": 1.1605,
      "step": 245
    },
    {
      "epoch": 0.984,
      "grad_norm": 2.069336414337158,
      "learning_rate": 8.10483870967742e-06,
      "loss": 0.9732,
      "step": 246
    },
    {
      "epoch": 0.988,
      "grad_norm": 2.0151474475860596,
      "learning_rate": 8.096774193548388e-06,
      "loss": 1.0807,
      "step": 247
    },
    {
      "epoch": 0.992,
      "grad_norm": 1.981397271156311,
      "learning_rate": 8.088709677419355e-06,
      "loss": 1.2777,
      "step": 248
    },
    {
      "epoch": 0.996,
      "grad_norm": 1.761157751083374,
      "learning_rate": 8.080645161290323e-06,
      "loss": 0.8838,
      "step": 249
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.0894131660461426,
      "learning_rate": 8.07258064516129e-06,
      "loss": 1.2467,
      "step": 250
    },
    {
      "epoch": 1.004,
      "grad_norm": 2.042200803756714,
      "learning_rate": 8.064516129032258e-06,
      "loss": 1.226,
      "step": 251
    },
    {
      "epoch": 1.008,
      "grad_norm": 1.893397569656372,
      "learning_rate": 8.056451612903226e-06,
      "loss": 0.9192,
      "step": 252
    },
    {
      "epoch": 1.012,
      "grad_norm": 1.9246549606323242,
      "learning_rate": 8.048387096774194e-06,
      "loss": 0.9392,
      "step": 253
    },
    {
      "epoch": 1.016,
      "grad_norm": 1.8478291034698486,
      "learning_rate": 8.040322580645163e-06,
      "loss": 1.285,
      "step": 254
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.9581583738327026,
      "learning_rate": 8.032258064516129e-06,
      "loss": 1.1652,
      "step": 255
    },
    {
      "epoch": 1.024,
      "grad_norm": 1.6915805339813232,
      "learning_rate": 8.024193548387097e-06,
      "loss": 0.7934,
      "step": 256
    },
    {
      "epoch": 1.028,
      "grad_norm": 1.7810734510421753,
      "learning_rate": 8.016129032258066e-06,
      "loss": 1.1249,
      "step": 257
    },
    {
      "epoch": 1.032,
      "grad_norm": 1.6987296342849731,
      "learning_rate": 8.008064516129032e-06,
      "loss": 0.8559,
      "step": 258
    },
    {
      "epoch": 1.036,
      "grad_norm": 1.769662857055664,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.0913,
      "step": 259
    },
    {
      "epoch": 1.04,
      "grad_norm": 1.8633121252059937,
      "learning_rate": 7.991935483870969e-06,
      "loss": 0.8986,
      "step": 260
    },
    {
      "epoch": 1.044,
      "grad_norm": 2.1259944438934326,
      "learning_rate": 7.983870967741935e-06,
      "loss": 1.0982,
      "step": 261
    },
    {
      "epoch": 1.048,
      "grad_norm": 1.9265049695968628,
      "learning_rate": 7.975806451612904e-06,
      "loss": 1.0264,
      "step": 262
    },
    {
      "epoch": 1.052,
      "grad_norm": 1.9437800645828247,
      "learning_rate": 7.967741935483872e-06,
      "loss": 1.2013,
      "step": 263
    },
    {
      "epoch": 1.056,
      "grad_norm": 1.837809681892395,
      "learning_rate": 7.95967741935484e-06,
      "loss": 1.3034,
      "step": 264
    },
    {
      "epoch": 1.06,
      "grad_norm": 1.9760485887527466,
      "learning_rate": 7.951612903225807e-06,
      "loss": 1.2059,
      "step": 265
    },
    {
      "epoch": 1.064,
      "grad_norm": 1.8548455238342285,
      "learning_rate": 7.943548387096775e-06,
      "loss": 0.807,
      "step": 266
    },
    {
      "epoch": 1.068,
      "grad_norm": 1.6947693824768066,
      "learning_rate": 7.935483870967743e-06,
      "loss": 0.8937,
      "step": 267
    },
    {
      "epoch": 1.072,
      "grad_norm": 1.9188228845596313,
      "learning_rate": 7.92741935483871e-06,
      "loss": 1.1405,
      "step": 268
    },
    {
      "epoch": 1.076,
      "grad_norm": 1.9461287260055542,
      "learning_rate": 7.919354838709678e-06,
      "loss": 1.2617,
      "step": 269
    },
    {
      "epoch": 1.08,
      "grad_norm": 1.8000714778900146,
      "learning_rate": 7.911290322580646e-06,
      "loss": 1.2302,
      "step": 270
    },
    {
      "epoch": 1.084,
      "grad_norm": 1.9787355661392212,
      "learning_rate": 7.903225806451613e-06,
      "loss": 1.2263,
      "step": 271
    },
    {
      "epoch": 1.088,
      "grad_norm": 1.7489923238754272,
      "learning_rate": 7.895161290322581e-06,
      "loss": 1.0783,
      "step": 272
    },
    {
      "epoch": 1.092,
      "grad_norm": 1.8225479125976562,
      "learning_rate": 7.887096774193549e-06,
      "loss": 0.9199,
      "step": 273
    },
    {
      "epoch": 1.096,
      "grad_norm": 1.7303674221038818,
      "learning_rate": 7.879032258064518e-06,
      "loss": 0.9423,
      "step": 274
    },
    {
      "epoch": 1.1,
      "grad_norm": 1.911805510520935,
      "learning_rate": 7.870967741935484e-06,
      "loss": 0.9044,
      "step": 275
    },
    {
      "epoch": 1.104,
      "grad_norm": 1.943996548652649,
      "learning_rate": 7.862903225806451e-06,
      "loss": 0.9731,
      "step": 276
    },
    {
      "epoch": 1.108,
      "grad_norm": 2.116546154022217,
      "learning_rate": 7.85483870967742e-06,
      "loss": 1.3928,
      "step": 277
    },
    {
      "epoch": 1.112,
      "grad_norm": 1.8358001708984375,
      "learning_rate": 7.846774193548388e-06,
      "loss": 1.0155,
      "step": 278
    },
    {
      "epoch": 1.116,
      "grad_norm": 2.039339780807495,
      "learning_rate": 7.838709677419354e-06,
      "loss": 0.9361,
      "step": 279
    },
    {
      "epoch": 1.12,
      "grad_norm": 1.9571505784988403,
      "learning_rate": 7.830645161290324e-06,
      "loss": 0.7762,
      "step": 280
    },
    {
      "epoch": 1.124,
      "grad_norm": 2.0101685523986816,
      "learning_rate": 7.822580645161291e-06,
      "loss": 1.1705,
      "step": 281
    },
    {
      "epoch": 1.1280000000000001,
      "grad_norm": 1.9054312705993652,
      "learning_rate": 7.814516129032259e-06,
      "loss": 0.9305,
      "step": 282
    },
    {
      "epoch": 1.1320000000000001,
      "grad_norm": 1.6266229152679443,
      "learning_rate": 7.806451612903227e-06,
      "loss": 0.7991,
      "step": 283
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 1.9757091999053955,
      "learning_rate": 7.798387096774194e-06,
      "loss": 0.9544,
      "step": 284
    },
    {
      "epoch": 1.1400000000000001,
      "grad_norm": 1.9758026599884033,
      "learning_rate": 7.790322580645162e-06,
      "loss": 1.2189,
      "step": 285
    },
    {
      "epoch": 1.144,
      "grad_norm": 1.8889524936676025,
      "learning_rate": 7.78225806451613e-06,
      "loss": 0.8184,
      "step": 286
    },
    {
      "epoch": 1.148,
      "grad_norm": 1.9242223501205444,
      "learning_rate": 7.774193548387097e-06,
      "loss": 1.1267,
      "step": 287
    },
    {
      "epoch": 1.152,
      "grad_norm": 1.8673722743988037,
      "learning_rate": 7.766129032258065e-06,
      "loss": 1.0659,
      "step": 288
    },
    {
      "epoch": 1.156,
      "grad_norm": 2.0750553607940674,
      "learning_rate": 7.758064516129033e-06,
      "loss": 0.912,
      "step": 289
    },
    {
      "epoch": 1.16,
      "grad_norm": 1.7121838331222534,
      "learning_rate": 7.75e-06,
      "loss": 0.7481,
      "step": 290
    },
    {
      "epoch": 1.164,
      "grad_norm": 1.895327091217041,
      "learning_rate": 7.741935483870968e-06,
      "loss": 1.0651,
      "step": 291
    },
    {
      "epoch": 1.168,
      "grad_norm": 1.9888561964035034,
      "learning_rate": 7.733870967741937e-06,
      "loss": 1.0139,
      "step": 292
    },
    {
      "epoch": 1.172,
      "grad_norm": 2.134798049926758,
      "learning_rate": 7.725806451612903e-06,
      "loss": 1.116,
      "step": 293
    },
    {
      "epoch": 1.176,
      "grad_norm": 2.063161611557007,
      "learning_rate": 7.717741935483871e-06,
      "loss": 1.1218,
      "step": 294
    },
    {
      "epoch": 1.18,
      "grad_norm": 1.748466968536377,
      "learning_rate": 7.70967741935484e-06,
      "loss": 0.9331,
      "step": 295
    },
    {
      "epoch": 1.184,
      "grad_norm": 1.7615900039672852,
      "learning_rate": 7.701612903225806e-06,
      "loss": 0.9822,
      "step": 296
    },
    {
      "epoch": 1.188,
      "grad_norm": 1.980162501335144,
      "learning_rate": 7.693548387096776e-06,
      "loss": 1.1527,
      "step": 297
    },
    {
      "epoch": 1.192,
      "grad_norm": 1.8901320695877075,
      "learning_rate": 7.685483870967743e-06,
      "loss": 1.0305,
      "step": 298
    },
    {
      "epoch": 1.196,
      "grad_norm": 2.2169246673583984,
      "learning_rate": 7.67741935483871e-06,
      "loss": 1.2761,
      "step": 299
    },
    {
      "epoch": 1.2,
      "grad_norm": 2.0211877822875977,
      "learning_rate": 7.669354838709679e-06,
      "loss": 1.1125,
      "step": 300
    },
    {
      "epoch": 1.204,
      "grad_norm": 1.9308112859725952,
      "learning_rate": 7.661290322580646e-06,
      "loss": 1.1144,
      "step": 301
    },
    {
      "epoch": 1.208,
      "grad_norm": 1.9019742012023926,
      "learning_rate": 7.653225806451614e-06,
      "loss": 1.0119,
      "step": 302
    },
    {
      "epoch": 1.212,
      "grad_norm": 2.1698246002197266,
      "learning_rate": 7.645161290322581e-06,
      "loss": 1.0064,
      "step": 303
    },
    {
      "epoch": 1.216,
      "grad_norm": 2.0672056674957275,
      "learning_rate": 7.63709677419355e-06,
      "loss": 1.3564,
      "step": 304
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.9018466472625732,
      "learning_rate": 7.629032258064517e-06,
      "loss": 1.1786,
      "step": 305
    },
    {
      "epoch": 1.224,
      "grad_norm": 2.1040728092193604,
      "learning_rate": 7.6209677419354845e-06,
      "loss": 1.1445,
      "step": 306
    },
    {
      "epoch": 1.228,
      "grad_norm": 2.0901339054107666,
      "learning_rate": 7.612903225806451e-06,
      "loss": 0.9766,
      "step": 307
    },
    {
      "epoch": 1.232,
      "grad_norm": 2.0483977794647217,
      "learning_rate": 7.60483870967742e-06,
      "loss": 0.8798,
      "step": 308
    },
    {
      "epoch": 1.236,
      "grad_norm": 1.9911725521087646,
      "learning_rate": 7.5967741935483875e-06,
      "loss": 1.1939,
      "step": 309
    },
    {
      "epoch": 1.24,
      "grad_norm": 1.9746226072311401,
      "learning_rate": 7.588709677419356e-06,
      "loss": 1.2349,
      "step": 310
    },
    {
      "epoch": 1.244,
      "grad_norm": 1.916457176208496,
      "learning_rate": 7.580645161290323e-06,
      "loss": 0.8643,
      "step": 311
    },
    {
      "epoch": 1.248,
      "grad_norm": 1.877477765083313,
      "learning_rate": 7.5725806451612904e-06,
      "loss": 1.0597,
      "step": 312
    },
    {
      "epoch": 1.252,
      "grad_norm": 1.820234775543213,
      "learning_rate": 7.564516129032259e-06,
      "loss": 0.8479,
      "step": 313
    },
    {
      "epoch": 1.256,
      "grad_norm": 2.0701229572296143,
      "learning_rate": 7.556451612903226e-06,
      "loss": 0.87,
      "step": 314
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.964243769645691,
      "learning_rate": 7.548387096774194e-06,
      "loss": 0.9032,
      "step": 315
    },
    {
      "epoch": 1.264,
      "grad_norm": 1.9548320770263672,
      "learning_rate": 7.540322580645162e-06,
      "loss": 1.1532,
      "step": 316
    },
    {
      "epoch": 1.268,
      "grad_norm": 1.8334695100784302,
      "learning_rate": 7.5322580645161296e-06,
      "loss": 1.005,
      "step": 317
    },
    {
      "epoch": 1.272,
      "grad_norm": 1.7740626335144043,
      "learning_rate": 7.524193548387097e-06,
      "loss": 0.6827,
      "step": 318
    },
    {
      "epoch": 1.276,
      "grad_norm": 1.8981671333312988,
      "learning_rate": 7.516129032258065e-06,
      "loss": 1.0224,
      "step": 319
    },
    {
      "epoch": 1.28,
      "grad_norm": 1.9381623268127441,
      "learning_rate": 7.508064516129033e-06,
      "loss": 1.1752,
      "step": 320
    },
    {
      "epoch": 1.284,
      "grad_norm": 1.8517154455184937,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.17,
      "step": 321
    },
    {
      "epoch": 1.288,
      "grad_norm": 1.7893662452697754,
      "learning_rate": 7.491935483870968e-06,
      "loss": 1.0283,
      "step": 322
    },
    {
      "epoch": 1.292,
      "grad_norm": 2.0215885639190674,
      "learning_rate": 7.483870967741936e-06,
      "loss": 1.2029,
      "step": 323
    },
    {
      "epoch": 1.296,
      "grad_norm": 1.9150351285934448,
      "learning_rate": 7.475806451612904e-06,
      "loss": 0.888,
      "step": 324
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.7969810962677002,
      "learning_rate": 7.467741935483872e-06,
      "loss": 0.9533,
      "step": 325
    },
    {
      "epoch": 1.304,
      "grad_norm": 1.9353724718093872,
      "learning_rate": 7.459677419354839e-06,
      "loss": 0.8941,
      "step": 326
    },
    {
      "epoch": 1.308,
      "grad_norm": 1.9007946252822876,
      "learning_rate": 7.451612903225807e-06,
      "loss": 0.88,
      "step": 327
    },
    {
      "epoch": 1.312,
      "grad_norm": 1.9816163778305054,
      "learning_rate": 7.4435483870967755e-06,
      "loss": 0.9307,
      "step": 328
    },
    {
      "epoch": 1.316,
      "grad_norm": 1.7767106294631958,
      "learning_rate": 7.435483870967742e-06,
      "loss": 0.8723,
      "step": 329
    },
    {
      "epoch": 1.32,
      "grad_norm": 2.6831021308898926,
      "learning_rate": 7.427419354838711e-06,
      "loss": 0.9343,
      "step": 330
    },
    {
      "epoch": 1.324,
      "grad_norm": 2.1091558933258057,
      "learning_rate": 7.4193548387096784e-06,
      "loss": 1.0657,
      "step": 331
    },
    {
      "epoch": 1.328,
      "grad_norm": 1.727107048034668,
      "learning_rate": 7.411290322580645e-06,
      "loss": 0.7765,
      "step": 332
    },
    {
      "epoch": 1.332,
      "grad_norm": 1.861849069595337,
      "learning_rate": 7.403225806451614e-06,
      "loss": 0.8598,
      "step": 333
    },
    {
      "epoch": 1.336,
      "grad_norm": 1.8695261478424072,
      "learning_rate": 7.395161290322581e-06,
      "loss": 0.8469,
      "step": 334
    },
    {
      "epoch": 1.34,
      "grad_norm": 1.9783046245574951,
      "learning_rate": 7.38709677419355e-06,
      "loss": 1.1789,
      "step": 335
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 2.044293165206909,
      "learning_rate": 7.379032258064517e-06,
      "loss": 1.1059,
      "step": 336
    },
    {
      "epoch": 1.3479999999999999,
      "grad_norm": 2.19840145111084,
      "learning_rate": 7.370967741935484e-06,
      "loss": 1.1622,
      "step": 337
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 2.1134932041168213,
      "learning_rate": 7.362903225806453e-06,
      "loss": 1.3109,
      "step": 338
    },
    {
      "epoch": 1.3559999999999999,
      "grad_norm": 2.0898921489715576,
      "learning_rate": 7.35483870967742e-06,
      "loss": 1.1852,
      "step": 339
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 2.190387010574341,
      "learning_rate": 7.346774193548387e-06,
      "loss": 1.2016,
      "step": 340
    },
    {
      "epoch": 1.3639999999999999,
      "grad_norm": 1.914016604423523,
      "learning_rate": 7.338709677419356e-06,
      "loss": 1.0764,
      "step": 341
    },
    {
      "epoch": 1.3679999999999999,
      "grad_norm": 1.908305287361145,
      "learning_rate": 7.330645161290323e-06,
      "loss": 0.9787,
      "step": 342
    },
    {
      "epoch": 1.3719999999999999,
      "grad_norm": 2.0489273071289062,
      "learning_rate": 7.322580645161291e-06,
      "loss": 1.37,
      "step": 343
    },
    {
      "epoch": 1.376,
      "grad_norm": 1.885549783706665,
      "learning_rate": 7.314516129032259e-06,
      "loss": 0.8716,
      "step": 344
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.9682085514068604,
      "learning_rate": 7.306451612903226e-06,
      "loss": 1.0618,
      "step": 345
    },
    {
      "epoch": 1.384,
      "grad_norm": 1.9825321435928345,
      "learning_rate": 7.298387096774194e-06,
      "loss": 0.9762,
      "step": 346
    },
    {
      "epoch": 1.388,
      "grad_norm": 1.710522174835205,
      "learning_rate": 7.290322580645162e-06,
      "loss": 0.8213,
      "step": 347
    },
    {
      "epoch": 1.392,
      "grad_norm": 1.8329979181289673,
      "learning_rate": 7.28225806451613e-06,
      "loss": 0.8985,
      "step": 348
    },
    {
      "epoch": 1.396,
      "grad_norm": 1.7996221780776978,
      "learning_rate": 7.274193548387097e-06,
      "loss": 0.8497,
      "step": 349
    },
    {
      "epoch": 1.4,
      "grad_norm": 2.049039363861084,
      "learning_rate": 7.266129032258065e-06,
      "loss": 1.0881,
      "step": 350
    },
    {
      "epoch": 1.404,
      "grad_norm": 1.8585624694824219,
      "learning_rate": 7.258064516129033e-06,
      "loss": 0.7929,
      "step": 351
    },
    {
      "epoch": 1.408,
      "grad_norm": 1.9661332368850708,
      "learning_rate": 7.25e-06,
      "loss": 1.1871,
      "step": 352
    },
    {
      "epoch": 1.412,
      "grad_norm": 2.0757462978363037,
      "learning_rate": 7.2419354838709685e-06,
      "loss": 1.2421,
      "step": 353
    },
    {
      "epoch": 1.416,
      "grad_norm": 1.9401495456695557,
      "learning_rate": 7.233870967741936e-06,
      "loss": 1.158,
      "step": 354
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.9338098764419556,
      "learning_rate": 7.225806451612903e-06,
      "loss": 1.0955,
      "step": 355
    },
    {
      "epoch": 1.424,
      "grad_norm": 1.8598445653915405,
      "learning_rate": 7.2177419354838715e-06,
      "loss": 0.9375,
      "step": 356
    },
    {
      "epoch": 1.428,
      "grad_norm": 2.073228359222412,
      "learning_rate": 7.209677419354839e-06,
      "loss": 0.99,
      "step": 357
    },
    {
      "epoch": 1.432,
      "grad_norm": 1.7986657619476318,
      "learning_rate": 7.201612903225808e-06,
      "loss": 0.9106,
      "step": 358
    },
    {
      "epoch": 1.436,
      "grad_norm": 1.9713162183761597,
      "learning_rate": 7.1935483870967745e-06,
      "loss": 0.7723,
      "step": 359
    },
    {
      "epoch": 1.44,
      "grad_norm": 1.9422978162765503,
      "learning_rate": 7.185483870967742e-06,
      "loss": 0.8295,
      "step": 360
    },
    {
      "epoch": 1.444,
      "grad_norm": 1.93070650100708,
      "learning_rate": 7.177419354838711e-06,
      "loss": 0.9505,
      "step": 361
    },
    {
      "epoch": 1.448,
      "grad_norm": 1.8391205072402954,
      "learning_rate": 7.1693548387096774e-06,
      "loss": 1.0174,
      "step": 362
    },
    {
      "epoch": 1.452,
      "grad_norm": 2.107405185699463,
      "learning_rate": 7.161290322580646e-06,
      "loss": 0.9669,
      "step": 363
    },
    {
      "epoch": 1.456,
      "grad_norm": 1.9300522804260254,
      "learning_rate": 7.153225806451614e-06,
      "loss": 0.855,
      "step": 364
    },
    {
      "epoch": 1.46,
      "grad_norm": 2.127452850341797,
      "learning_rate": 7.145161290322581e-06,
      "loss": 0.9638,
      "step": 365
    },
    {
      "epoch": 1.464,
      "grad_norm": 2.0022165775299072,
      "learning_rate": 7.137096774193549e-06,
      "loss": 1.3054,
      "step": 366
    },
    {
      "epoch": 1.468,
      "grad_norm": 2.113560199737549,
      "learning_rate": 7.1290322580645166e-06,
      "loss": 1.1549,
      "step": 367
    },
    {
      "epoch": 1.472,
      "grad_norm": 1.8650003671646118,
      "learning_rate": 7.120967741935484e-06,
      "loss": 1.1583,
      "step": 368
    },
    {
      "epoch": 1.476,
      "grad_norm": 2.0937418937683105,
      "learning_rate": 7.112903225806453e-06,
      "loss": 1.0378,
      "step": 369
    },
    {
      "epoch": 1.48,
      "grad_norm": 1.8015251159667969,
      "learning_rate": 7.1048387096774195e-06,
      "loss": 1.0748,
      "step": 370
    },
    {
      "epoch": 1.484,
      "grad_norm": 1.937143087387085,
      "learning_rate": 7.096774193548388e-06,
      "loss": 1.2231,
      "step": 371
    },
    {
      "epoch": 1.488,
      "grad_norm": 1.769061803817749,
      "learning_rate": 7.088709677419356e-06,
      "loss": 0.9692,
      "step": 372
    },
    {
      "epoch": 1.492,
      "grad_norm": 2.050584316253662,
      "learning_rate": 7.0806451612903225e-06,
      "loss": 1.2983,
      "step": 373
    },
    {
      "epoch": 1.496,
      "grad_norm": 1.7325676679611206,
      "learning_rate": 7.072580645161291e-06,
      "loss": 0.8723,
      "step": 374
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.8762125968933105,
      "learning_rate": 7.064516129032259e-06,
      "loss": 0.9408,
      "step": 375
    },
    {
      "epoch": 1.504,
      "grad_norm": 1.9299713373184204,
      "learning_rate": 7.056451612903227e-06,
      "loss": 1.0404,
      "step": 376
    },
    {
      "epoch": 1.508,
      "grad_norm": 2.071428060531616,
      "learning_rate": 7.048387096774194e-06,
      "loss": 1.2014,
      "step": 377
    },
    {
      "epoch": 1.512,
      "grad_norm": 2.0758373737335205,
      "learning_rate": 7.040322580645162e-06,
      "loss": 1.1974,
      "step": 378
    },
    {
      "epoch": 1.516,
      "grad_norm": 2.0668601989746094,
      "learning_rate": 7.03225806451613e-06,
      "loss": 0.9456,
      "step": 379
    },
    {
      "epoch": 1.52,
      "grad_norm": 1.9508721828460693,
      "learning_rate": 7.024193548387097e-06,
      "loss": 1.0519,
      "step": 380
    },
    {
      "epoch": 1.524,
      "grad_norm": 1.8952667713165283,
      "learning_rate": 7.0161290322580654e-06,
      "loss": 0.8057,
      "step": 381
    },
    {
      "epoch": 1.528,
      "grad_norm": 2.4796831607818604,
      "learning_rate": 7.008064516129033e-06,
      "loss": 0.8467,
      "step": 382
    },
    {
      "epoch": 1.532,
      "grad_norm": 2.1569836139678955,
      "learning_rate": 7e-06,
      "loss": 1.0599,
      "step": 383
    },
    {
      "epoch": 1.536,
      "grad_norm": 1.8130536079406738,
      "learning_rate": 6.991935483870968e-06,
      "loss": 1.0255,
      "step": 384
    },
    {
      "epoch": 1.54,
      "grad_norm": 2.090113639831543,
      "learning_rate": 6.983870967741936e-06,
      "loss": 0.9335,
      "step": 385
    },
    {
      "epoch": 1.544,
      "grad_norm": 1.9055856466293335,
      "learning_rate": 6.9758064516129046e-06,
      "loss": 1.0086,
      "step": 386
    },
    {
      "epoch": 1.548,
      "grad_norm": 2.1749179363250732,
      "learning_rate": 6.967741935483871e-06,
      "loss": 0.9453,
      "step": 387
    },
    {
      "epoch": 1.552,
      "grad_norm": 1.8622307777404785,
      "learning_rate": 6.959677419354839e-06,
      "loss": 1.0079,
      "step": 388
    },
    {
      "epoch": 1.556,
      "grad_norm": 1.8603782653808594,
      "learning_rate": 6.9516129032258075e-06,
      "loss": 0.8453,
      "step": 389
    },
    {
      "epoch": 1.56,
      "grad_norm": 2.0890860557556152,
      "learning_rate": 6.943548387096774e-06,
      "loss": 1.0772,
      "step": 390
    },
    {
      "epoch": 1.564,
      "grad_norm": 1.6885607242584229,
      "learning_rate": 6.935483870967743e-06,
      "loss": 0.8288,
      "step": 391
    },
    {
      "epoch": 1.568,
      "grad_norm": 1.810863733291626,
      "learning_rate": 6.9274193548387105e-06,
      "loss": 0.931,
      "step": 392
    },
    {
      "epoch": 1.572,
      "grad_norm": 2.0588154792785645,
      "learning_rate": 6.919354838709677e-06,
      "loss": 0.9439,
      "step": 393
    },
    {
      "epoch": 1.576,
      "grad_norm": 1.9780665636062622,
      "learning_rate": 6.911290322580646e-06,
      "loss": 0.8887,
      "step": 394
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.7643142938613892,
      "learning_rate": 6.9032258064516135e-06,
      "loss": 0.8651,
      "step": 395
    },
    {
      "epoch": 1.584,
      "grad_norm": 1.984982967376709,
      "learning_rate": 6.895161290322582e-06,
      "loss": 0.976,
      "step": 396
    },
    {
      "epoch": 1.588,
      "grad_norm": 1.7668675184249878,
      "learning_rate": 6.887096774193549e-06,
      "loss": 0.8342,
      "step": 397
    },
    {
      "epoch": 1.592,
      "grad_norm": 2.10591983795166,
      "learning_rate": 6.879032258064516e-06,
      "loss": 1.1061,
      "step": 398
    },
    {
      "epoch": 1.596,
      "grad_norm": 2.0348215103149414,
      "learning_rate": 6.870967741935485e-06,
      "loss": 0.9554,
      "step": 399
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.9119504690170288,
      "learning_rate": 6.862903225806452e-06,
      "loss": 1.005,
      "step": 400
    },
    {
      "epoch": 1.604,
      "grad_norm": 2.0619728565216064,
      "learning_rate": 6.854838709677419e-06,
      "loss": 1.1878,
      "step": 401
    },
    {
      "epoch": 1.608,
      "grad_norm": 1.6374893188476562,
      "learning_rate": 6.846774193548388e-06,
      "loss": 0.6809,
      "step": 402
    },
    {
      "epoch": 1.612,
      "grad_norm": 1.9424902200698853,
      "learning_rate": 6.838709677419355e-06,
      "loss": 1.0557,
      "step": 403
    },
    {
      "epoch": 1.616,
      "grad_norm": 2.002797842025757,
      "learning_rate": 6.830645161290323e-06,
      "loss": 0.861,
      "step": 404
    },
    {
|
"epoch": 1.62,
|
|
"grad_norm": 1.8212698698043823,
|
|
"learning_rate": 6.822580645161291e-06,
|
|
"loss": 0.845,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 1.624,
|
|
"grad_norm": 2.012608528137207,
|
|
"learning_rate": 6.8145161290322585e-06,
|
|
"loss": 1.0074,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 1.6280000000000001,
|
|
"grad_norm": 1.9290971755981445,
|
|
"learning_rate": 6.806451612903226e-06,
|
|
"loss": 0.9082,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 1.6320000000000001,
|
|
"grad_norm": 1.8718587160110474,
|
|
"learning_rate": 6.798387096774194e-06,
|
|
"loss": 0.927,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 1.6360000000000001,
|
|
"grad_norm": 1.7788649797439575,
|
|
"learning_rate": 6.790322580645162e-06,
|
|
"loss": 0.8388,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 1.6400000000000001,
|
|
"grad_norm": 1.988103985786438,
|
|
"learning_rate": 6.78225806451613e-06,
|
|
"loss": 0.8716,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.6440000000000001,
|
|
"grad_norm": 2.083486318588257,
|
|
"learning_rate": 6.774193548387097e-06,
|
|
"loss": 1.3621,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 1.6480000000000001,
|
|
"grad_norm": 1.8370341062545776,
|
|
"learning_rate": 6.766129032258065e-06,
|
|
"loss": 0.8775,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 1.6520000000000001,
|
|
"grad_norm": 2.1198623180389404,
|
|
"learning_rate": 6.758064516129033e-06,
|
|
"loss": 1.1957,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 1.6560000000000001,
|
|
"grad_norm": 1.887670636177063,
|
|
"learning_rate": 6.750000000000001e-06,
|
|
"loss": 0.8128,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 1.6600000000000001,
|
|
"grad_norm": 1.776105284690857,
|
|
"learning_rate": 6.741935483870968e-06,
|
|
"loss": 1.0328,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 1.6640000000000001,
|
|
"grad_norm": 2.0482587814331055,
|
|
"learning_rate": 6.733870967741936e-06,
|
|
"loss": 1.1532,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 1.6680000000000001,
|
|
"grad_norm": 1.9450839757919312,
|
|
"learning_rate": 6.725806451612904e-06,
|
|
"loss": 0.8195,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 1.6720000000000002,
|
|
"grad_norm": 1.8145220279693604,
|
|
"learning_rate": 6.717741935483871e-06,
|
|
"loss": 0.9485,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 1.6760000000000002,
|
|
"grad_norm": 1.8135958909988403,
|
|
"learning_rate": 6.70967741935484e-06,
|
|
"loss": 0.7613,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 1.6800000000000002,
|
|
"grad_norm": 1.711436152458191,
|
|
"learning_rate": 6.701612903225807e-06,
|
|
"loss": 0.8422,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.6840000000000002,
|
|
"grad_norm": 1.8717050552368164,
|
|
"learning_rate": 6.693548387096774e-06,
|
|
"loss": 0.9559,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 1.688,
|
|
"grad_norm": 2.074445962905884,
|
|
"learning_rate": 6.685483870967743e-06,
|
|
"loss": 1.0544,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 1.692,
|
|
"grad_norm": 2.0965824127197266,
|
|
"learning_rate": 6.67741935483871e-06,
|
|
"loss": 1.0758,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 1.696,
|
|
"grad_norm": 1.8185127973556519,
|
|
"learning_rate": 6.669354838709679e-06,
|
|
"loss": 0.8732,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 1.7,
|
|
"grad_norm": 1.8362635374069214,
|
|
"learning_rate": 6.661290322580646e-06,
|
|
"loss": 0.7973,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 1.704,
|
|
"grad_norm": 1.9787343740463257,
|
|
"learning_rate": 6.653225806451613e-06,
|
|
"loss": 1.4597,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 1.708,
|
|
"grad_norm": 1.9990483522415161,
|
|
"learning_rate": 6.645161290322582e-06,
|
|
"loss": 1.0183,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 1.712,
|
|
"grad_norm": 2.142268419265747,
|
|
"learning_rate": 6.637096774193549e-06,
|
|
"loss": 1.2538,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 1.716,
|
|
"grad_norm": 1.8665509223937988,
|
|
"learning_rate": 6.629032258064517e-06,
|
|
"loss": 0.9783,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 1.72,
|
|
"grad_norm": 2.159270763397217,
|
|
"learning_rate": 6.620967741935485e-06,
|
|
"loss": 0.8965,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 1.724,
|
|
"grad_norm": 2.037621021270752,
|
|
"learning_rate": 6.612903225806452e-06,
|
|
"loss": 0.8806,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 1.728,
|
|
"grad_norm": 1.7615220546722412,
|
|
"learning_rate": 6.60483870967742e-06,
|
|
"loss": 0.8228,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 1.732,
|
|
"grad_norm": 2.1290621757507324,
|
|
"learning_rate": 6.596774193548388e-06,
|
|
"loss": 0.7951,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 1.736,
|
|
"grad_norm": 1.9679354429244995,
|
|
"learning_rate": 6.5887096774193545e-06,
|
|
"loss": 0.8681,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 1.74,
|
|
"grad_norm": 1.9964251518249512,
|
|
"learning_rate": 6.580645161290323e-06,
|
|
"loss": 0.8912,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 1.744,
|
|
"grad_norm": 1.8060578107833862,
|
|
"learning_rate": 6.572580645161291e-06,
|
|
"loss": 0.8951,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 1.748,
|
|
"grad_norm": 1.6865990161895752,
|
|
"learning_rate": 6.564516129032259e-06,
|
|
"loss": 0.7064,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 1.752,
|
|
"grad_norm": 1.6390618085861206,
|
|
"learning_rate": 6.556451612903226e-06,
|
|
"loss": 0.658,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 1.756,
|
|
"grad_norm": 1.9682906866073608,
|
|
"learning_rate": 6.548387096774194e-06,
|
|
"loss": 1.3055,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 1.76,
|
|
"grad_norm": 2.312551498413086,
|
|
"learning_rate": 6.540322580645162e-06,
|
|
"loss": 1.2867,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.764,
|
|
"grad_norm": 2.086641550064087,
|
|
"learning_rate": 6.532258064516129e-06,
|
|
"loss": 1.0897,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 1.768,
|
|
"grad_norm": 2.1024136543273926,
|
|
"learning_rate": 6.5241935483870975e-06,
|
|
"loss": 0.9324,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 1.772,
|
|
"grad_norm": 1.8934117555618286,
|
|
"learning_rate": 6.516129032258065e-06,
|
|
"loss": 0.8183,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 1.776,
|
|
"grad_norm": 2.1037709712982178,
|
|
"learning_rate": 6.508064516129032e-06,
|
|
"loss": 1.3476,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 1.78,
|
|
"grad_norm": 1.903442144393921,
|
|
"learning_rate": 6.5000000000000004e-06,
|
|
"loss": 0.795,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 1.784,
|
|
"grad_norm": 1.7604291439056396,
|
|
"learning_rate": 6.491935483870968e-06,
|
|
"loss": 0.8118,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 1.788,
|
|
"grad_norm": 1.8407769203186035,
|
|
"learning_rate": 6.483870967741937e-06,
|
|
"loss": 0.8989,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 1.792,
|
|
"grad_norm": 1.908974528312683,
|
|
"learning_rate": 6.475806451612903e-06,
|
|
"loss": 0.9908,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 1.796,
|
|
"grad_norm": 1.6634641885757446,
|
|
"learning_rate": 6.467741935483871e-06,
|
|
"loss": 0.7673,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 1.8,
|
|
"grad_norm": 2.033625602722168,
|
|
"learning_rate": 6.4596774193548396e-06,
|
|
"loss": 1.1297,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 1.804,
|
|
"grad_norm": 1.7993324995040894,
|
|
"learning_rate": 6.451612903225806e-06,
|
|
"loss": 0.8454,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 1.808,
|
|
"grad_norm": 1.9716646671295166,
|
|
"learning_rate": 6.443548387096775e-06,
|
|
"loss": 1.064,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 1.812,
|
|
"grad_norm": 1.8964923620224,
|
|
"learning_rate": 6.4354838709677425e-06,
|
|
"loss": 1.0819,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 1.8159999999999998,
|
|
"grad_norm": 1.8927526473999023,
|
|
"learning_rate": 6.42741935483871e-06,
|
|
"loss": 0.9889,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 1.8199999999999998,
|
|
"grad_norm": 2.130237579345703,
|
|
"learning_rate": 6.419354838709678e-06,
|
|
"loss": 0.8271,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 1.8239999999999998,
|
|
"grad_norm": 2.2165160179138184,
|
|
"learning_rate": 6.4112903225806455e-06,
|
|
"loss": 1.1792,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 1.8279999999999998,
|
|
"grad_norm": 2.3909194469451904,
|
|
"learning_rate": 6.403225806451614e-06,
|
|
"loss": 1.1584,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 1.8319999999999999,
|
|
"grad_norm": 1.752228856086731,
|
|
"learning_rate": 6.395161290322582e-06,
|
|
"loss": 0.9875,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 1.8359999999999999,
|
|
"grad_norm": 1.9226669073104858,
|
|
"learning_rate": 6.3870967741935485e-06,
|
|
"loss": 1.0947,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 1.8399999999999999,
|
|
"grad_norm": 1.792618751525879,
|
|
"learning_rate": 6.379032258064517e-06,
|
|
"loss": 0.82,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.8439999999999999,
|
|
"grad_norm": 1.9961439371109009,
|
|
"learning_rate": 6.370967741935485e-06,
|
|
"loss": 0.8837,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 1.8479999999999999,
|
|
"grad_norm": 1.900079369544983,
|
|
"learning_rate": 6.3629032258064514e-06,
|
|
"loss": 0.8935,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 1.8519999999999999,
|
|
"grad_norm": 1.9829587936401367,
|
|
"learning_rate": 6.35483870967742e-06,
|
|
"loss": 0.9662,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 1.8559999999999999,
|
|
"grad_norm": 2.161857843399048,
|
|
"learning_rate": 6.346774193548388e-06,
|
|
"loss": 1.0381,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 1.8599999999999999,
|
|
"grad_norm": 2.085419178009033,
|
|
"learning_rate": 6.338709677419356e-06,
|
|
"loss": 0.8401,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 1.8639999999999999,
|
|
"grad_norm": 1.981245756149292,
|
|
"learning_rate": 6.330645161290323e-06,
|
|
"loss": 1.2207,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 1.8679999999999999,
|
|
"grad_norm": 2.1605920791625977,
|
|
"learning_rate": 6.3225806451612906e-06,
|
|
"loss": 1.1644,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 1.8719999999999999,
|
|
"grad_norm": 1.9331471920013428,
|
|
"learning_rate": 6.314516129032259e-06,
|
|
"loss": 1.1664,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 1.876,
|
|
"grad_norm": 1.9107038974761963,
|
|
"learning_rate": 6.306451612903226e-06,
|
|
"loss": 0.9665,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 1.88,
|
|
"grad_norm": 1.829116940498352,
|
|
"learning_rate": 6.298387096774194e-06,
|
|
"loss": 0.6706,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 1.884,
|
|
"grad_norm": 1.9585076570510864,
|
|
"learning_rate": 6.290322580645162e-06,
|
|
"loss": 0.9477,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 1.888,
|
|
"grad_norm": 1.8251086473464966,
|
|
"learning_rate": 6.282258064516129e-06,
|
|
"loss": 1.0448,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 1.892,
|
|
"grad_norm": 1.955357551574707,
|
|
"learning_rate": 6.274193548387097e-06,
|
|
"loss": 0.8644,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 1.896,
|
|
"grad_norm": 1.7243505716323853,
|
|
"learning_rate": 6.266129032258065e-06,
|
|
"loss": 0.9741,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 1.9,
|
|
"grad_norm": 1.7721489667892456,
|
|
"learning_rate": 6.2580645161290335e-06,
|
|
"loss": 1.0634,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 1.904,
|
|
"grad_norm": 2.113557815551758,
|
|
"learning_rate": 6.25e-06,
|
|
"loss": 1.1883,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 1.908,
|
|
"grad_norm": 2.1622400283813477,
|
|
"learning_rate": 6.241935483870968e-06,
|
|
"loss": 1.0856,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 1.912,
|
|
"grad_norm": 1.9250800609588623,
|
|
"learning_rate": 6.2338709677419365e-06,
|
|
"loss": 0.9533,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 1.916,
|
|
"grad_norm": 2.027179002761841,
|
|
"learning_rate": 6.225806451612903e-06,
|
|
"loss": 1.0671,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 1.92,
|
|
"grad_norm": 2.1702287197113037,
|
|
"learning_rate": 6.217741935483872e-06,
|
|
"loss": 0.9522,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 1.924,
|
|
"grad_norm": 1.8383008241653442,
|
|
"learning_rate": 6.209677419354839e-06,
|
|
"loss": 0.9636,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 1.928,
|
|
"grad_norm": 1.9702820777893066,
|
|
"learning_rate": 6.201612903225806e-06,
|
|
"loss": 0.9917,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 1.932,
|
|
"grad_norm": 1.850930094718933,
|
|
"learning_rate": 6.193548387096775e-06,
|
|
"loss": 1.208,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 1.936,
|
|
"grad_norm": 1.9581983089447021,
|
|
"learning_rate": 6.185483870967742e-06,
|
|
"loss": 0.9504,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 1.94,
|
|
"grad_norm": 2.117535352706909,
|
|
"learning_rate": 6.177419354838711e-06,
|
|
"loss": 0.8385,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 1.944,
|
|
"grad_norm": 2.0964229106903076,
|
|
"learning_rate": 6.169354838709678e-06,
|
|
"loss": 1.1359,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 1.948,
|
|
"grad_norm": 1.9881926774978638,
|
|
"learning_rate": 6.161290322580645e-06,
|
|
"loss": 1.1685,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 1.952,
|
|
"grad_norm": 1.8612600564956665,
|
|
"learning_rate": 6.153225806451614e-06,
|
|
"loss": 1.0151,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 1.956,
|
|
"grad_norm": 1.9227721691131592,
|
|
"learning_rate": 6.145161290322581e-06,
|
|
"loss": 0.9583,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 1.96,
|
|
"grad_norm": 1.8418318033218384,
|
|
"learning_rate": 6.137096774193549e-06,
|
|
"loss": 0.9942,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 1.964,
|
|
"grad_norm": 2.0368409156799316,
|
|
"learning_rate": 6.129032258064517e-06,
|
|
"loss": 1.1448,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 1.968,
|
|
"grad_norm": 1.862594723701477,
|
|
"learning_rate": 6.120967741935484e-06,
|
|
"loss": 1.0607,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 1.972,
|
|
"grad_norm": 2.114074468612671,
|
|
"learning_rate": 6.112903225806452e-06,
|
|
"loss": 1.1504,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 1.976,
|
|
"grad_norm": 1.8414007425308228,
|
|
"learning_rate": 6.10483870967742e-06,
|
|
"loss": 0.9802,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 1.98,
|
|
"grad_norm": 2.0259172916412354,
|
|
"learning_rate": 6.0967741935483874e-06,
|
|
"loss": 1.0735,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 1.984,
|
|
"grad_norm": 1.812017560005188,
|
|
"learning_rate": 6.088709677419355e-06,
|
|
"loss": 0.9636,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 1.988,
|
|
"grad_norm": 1.7785333395004272,
|
|
"learning_rate": 6.080645161290323e-06,
|
|
"loss": 0.8428,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 1.992,
|
|
"grad_norm": 1.907761812210083,
|
|
"learning_rate": 6.072580645161291e-06,
|
|
"loss": 0.9633,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 1.996,
|
|
"grad_norm": 2.5004708766937256,
|
|
"learning_rate": 6.064516129032259e-06,
|
|
"loss": 1.4453,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 1.9316829442977905,
|
|
"learning_rate": 6.056451612903226e-06,
|
|
"loss": 0.9325,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.004,
|
|
"grad_norm": 1.7854198217391968,
|
|
"learning_rate": 6.048387096774194e-06,
|
|
"loss": 0.7874,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 2.008,
|
|
"grad_norm": 2.056880235671997,
|
|
"learning_rate": 6.040322580645162e-06,
|
|
"loss": 0.8134,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 2.012,
|
|
"grad_norm": 1.676098108291626,
|
|
"learning_rate": 6.0322580645161295e-06,
|
|
"loss": 0.5839,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 2.016,
|
|
"grad_norm": 2.050013780593872,
|
|
"learning_rate": 6.024193548387097e-06,
|
|
"loss": 0.8682,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 2.02,
|
|
"grad_norm": 1.8117666244506836,
|
|
"learning_rate": 6.016129032258065e-06,
|
|
"loss": 1.0753,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 2.024,
|
|
"grad_norm": 1.8990579843521118,
|
|
"learning_rate": 6.008064516129033e-06,
|
|
"loss": 0.897,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 2.028,
|
|
"grad_norm": 1.809313416481018,
|
|
"learning_rate": 6e-06,
|
|
"loss": 0.8301,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 2.032,
|
|
"grad_norm": 1.8551738262176514,
|
|
"learning_rate": 5.991935483870969e-06,
|
|
"loss": 0.8942,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 2.036,
|
|
"grad_norm": 1.9050978422164917,
|
|
"learning_rate": 5.983870967741936e-06,
|
|
"loss": 0.8722,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 2.04,
|
|
"grad_norm": 2.03041934967041,
|
|
"learning_rate": 5.975806451612903e-06,
|
|
"loss": 0.9807,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 2.044,
|
|
"grad_norm": 2.041501998901367,
|
|
"learning_rate": 5.967741935483872e-06,
|
|
"loss": 1.1012,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 2.048,
|
|
"grad_norm": 1.8471384048461914,
|
|
"learning_rate": 5.959677419354839e-06,
|
|
"loss": 0.7939,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 2.052,
|
|
"grad_norm": 2.0582258701324463,
|
|
"learning_rate": 5.951612903225808e-06,
|
|
"loss": 1.0515,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 2.056,
|
|
"grad_norm": 2.1479275226593018,
|
|
"learning_rate": 5.943548387096775e-06,
|
|
"loss": 1.0338,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 2.06,
|
|
"grad_norm": 2.1146697998046875,
|
|
"learning_rate": 5.935483870967742e-06,
|
|
"loss": 0.8127,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 2.064,
|
|
"grad_norm": 2.1258368492126465,
|
|
"learning_rate": 5.927419354838711e-06,
|
|
"loss": 1.0184,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 2.068,
|
|
"grad_norm": 1.7795977592468262,
|
|
"learning_rate": 5.9193548387096776e-06,
|
|
"loss": 1.0998,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 2.072,
|
|
"grad_norm": 2.0097572803497314,
|
|
"learning_rate": 5.911290322580646e-06,
|
|
"loss": 0.8023,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 2.076,
|
|
"grad_norm": 1.9408986568450928,
|
|
"learning_rate": 5.903225806451614e-06,
|
|
"loss": 0.799,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 2.08,
|
|
"grad_norm": 1.8501501083374023,
|
|
"learning_rate": 5.8951612903225805e-06,
|
|
"loss": 0.6468,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.084,
|
|
"grad_norm": 1.9509570598602295,
|
|
"learning_rate": 5.887096774193549e-06,
|
|
"loss": 0.8961,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 2.088,
|
|
"grad_norm": 1.845482587814331,
|
|
"learning_rate": 5.879032258064517e-06,
|
|
"loss": 0.8763,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 2.092,
|
|
"grad_norm": 1.9946309328079224,
|
|
"learning_rate": 5.8709677419354835e-06,
|
|
"loss": 0.9319,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 2.096,
|
|
"grad_norm": 2.023287296295166,
|
|
"learning_rate": 5.862903225806452e-06,
|
|
"loss": 0.9106,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 2.1,
|
|
"grad_norm": 1.8573815822601318,
|
|
"learning_rate": 5.85483870967742e-06,
|
|
"loss": 0.8133,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 2.104,
|
|
"grad_norm": 1.9675424098968506,
|
|
"learning_rate": 5.846774193548388e-06,
|
|
"loss": 0.8816,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 2.108,
|
|
"grad_norm": 2.4199588298797607,
|
|
"learning_rate": 5.838709677419355e-06,
|
|
"loss": 1.0439,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 2.112,
|
|
"grad_norm": 1.9206243753433228,
|
|
"learning_rate": 5.830645161290323e-06,
|
|
"loss": 1.0431,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 2.116,
|
|
"grad_norm": 2.0801823139190674,
|
|
"learning_rate": 5.822580645161291e-06,
|
|
"loss": 0.8906,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 2.12,
|
|
"grad_norm": 2.001232385635376,
|
|
"learning_rate": 5.814516129032258e-06,
|
|
"loss": 0.7845,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 2.124,
|
|
"grad_norm": 1.8495033979415894,
|
|
"learning_rate": 5.806451612903226e-06,
|
|
"loss": 0.8729,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 2.128,
|
|
"grad_norm": 2.0924770832061768,
|
|
"learning_rate": 5.798387096774194e-06,
|
|
"loss": 0.7579,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 2.132,
|
|
"grad_norm": 2.0251944065093994,
|
|
"learning_rate": 5.790322580645161e-06,
|
|
"loss": 0.8986,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 2.136,
|
|
"grad_norm": 2.0584230422973633,
|
|
"learning_rate": 5.782258064516129e-06,
|
|
"loss": 0.9409,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 2.14,
|
|
"grad_norm": 2.1257565021514893,
|
|
"learning_rate": 5.774193548387097e-06,
|
|
"loss": 1.0291,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 2.144,
|
|
"grad_norm": 2.18398118019104,
|
|
"learning_rate": 5.7661290322580655e-06,
|
|
"loss": 1.0488,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 2.148,
|
|
"grad_norm": 1.9207417964935303,
|
|
"learning_rate": 5.758064516129032e-06,
|
|
"loss": 0.7732,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 2.152,
|
|
"grad_norm": 1.9380196332931519,
|
|
"learning_rate": 5.75e-06,
|
|
"loss": 0.874,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 2.156,
|
|
"grad_norm": 1.9792423248291016,
|
|
"learning_rate": 5.7419354838709685e-06,
|
|
"loss": 0.6038,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 2.16,
|
|
"grad_norm": 2.2578721046447754,
|
|
"learning_rate": 5.733870967741936e-06,
|
|
"loss": 1.0358,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.164,
|
|
"grad_norm": 2.0607879161834717,
|
|
"learning_rate": 5.725806451612904e-06,
|
|
"loss": 1.0594,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 2.168,
|
|
"grad_norm": 2.0992348194122314,
|
|
"learning_rate": 5.7177419354838715e-06,
|
|
"loss": 1.1117,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 2.172,
|
|
"grad_norm": 1.9730112552642822,
|
|
"learning_rate": 5.709677419354839e-06,
|
|
"loss": 0.9292,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 2.176,
|
|
"grad_norm": 1.9976557493209839,
|
|
"learning_rate": 5.701612903225807e-06,
|
|
"loss": 0.8501,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 2.18,
|
|
"grad_norm": 2.0966856479644775,
|
|
"learning_rate": 5.6935483870967744e-06,
|
|
"loss": 1.0269,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 2.184,
|
|
"grad_norm": 1.7158764600753784,
|
|
"learning_rate": 5.685483870967743e-06,
|
|
"loss": 0.5562,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 2.188,
|
|
"grad_norm": 2.0837273597717285,
|
|
"learning_rate": 5.677419354838711e-06,
|
|
"loss": 1.1216,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 2.192,
|
|
"grad_norm": 2.0325253009796143,
|
|
"learning_rate": 5.669354838709677e-06,
|
|
"loss": 1.1302,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 2.196,
|
|
"grad_norm": 2.179825782775879,
|
|
"learning_rate": 5.661290322580646e-06,
|
|
"loss": 0.8691,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 2.2,
|
|
"grad_norm": 2.073207378387451,
|
|
"learning_rate": 5.6532258064516136e-06,
|
|
"loss": 0.8533,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 2.204,
|
|
"grad_norm": 1.89866304397583,
|
|
"learning_rate": 5.645161290322582e-06,
|
|
"loss": 0.6803,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 2.208,
|
|
"grad_norm": 2.177705764770508,
|
|
"learning_rate": 5.637096774193549e-06,
|
|
"loss": 0.9427,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 2.212,
|
|
"grad_norm": 1.882057547569275,
|
|
"learning_rate": 5.6290322580645165e-06,
|
|
"loss": 0.8539,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 2.216,
|
|
"grad_norm": 2.101376533508301,
|
|
"learning_rate": 5.620967741935485e-06,
|
|
"loss": 1.0185,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 2.22,
|
|
"grad_norm": 1.9109890460968018,
|
|
"learning_rate": 5.612903225806452e-06,
|
|
"loss": 0.8022,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 2.224,
|
|
"grad_norm": 2.090585708618164,
|
|
"learning_rate": 5.6048387096774195e-06,
|
|
"loss": 1.149,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 2.228,
|
|
"grad_norm": 2.002155065536499,
|
|
"learning_rate": 5.596774193548388e-06,
|
|
"loss": 0.8258,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 2.232,
|
|
"grad_norm": 2.0641942024230957,
|
|
"learning_rate": 5.588709677419355e-06,
|
|
"loss": 0.7931,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 2.2359999999999998,
|
|
"grad_norm": 1.8409854173660278,
|
|
"learning_rate": 5.580645161290323e-06,
|
|
"loss": 0.5935,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 2.24,
|
|
"grad_norm": 1.7998849153518677,
|
|
"learning_rate": 5.572580645161291e-06,
|
|
"loss": 0.7823,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.2439999999999998,
|
|
"grad_norm": 1.995986819267273,
|
|
"learning_rate": 5.564516129032258e-06,
|
|
"loss": 0.8117,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 2.248,
|
|
"grad_norm": 2.903075695037842,
|
|
"learning_rate": 5.556451612903226e-06,
|
|
"loss": 0.8912,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 2.252,
|
|
"grad_norm": 1.9949880838394165,
|
|
"learning_rate": 5.548387096774194e-06,
|
|
"loss": 0.9567,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 2.2560000000000002,
|
|
"grad_norm": 2.518598794937134,
|
|
"learning_rate": 5.5403225806451624e-06,
|
|
"loss": 1.1984,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 2.26,
|
|
"grad_norm": 1.9082916975021362,
|
|
"learning_rate": 5.532258064516129e-06,
|
|
"loss": 0.996,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 2.2640000000000002,
|
|
"grad_norm": 2.1674211025238037,
|
|
"learning_rate": 5.524193548387097e-06,
|
|
"loss": 0.9543,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 2.268,
|
|
"grad_norm": 2.0850236415863037,
|
|
"learning_rate": 5.516129032258065e-06,
|
|
"loss": 0.8417,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 2.2720000000000002,
|
|
"grad_norm": 2.213880777359009,
|
|
"learning_rate": 5.508064516129032e-06,
|
|
"loss": 1.115,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 2.276,
|
|
"grad_norm": 1.9851431846618652,
|
|
"learning_rate": 5.500000000000001e-06,
|
|
"loss": 0.9302,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 2.2800000000000002,
|
|
"grad_norm": 2.029381513595581,
|
|
"learning_rate": 5.491935483870968e-06,
|
|
"loss": 0.7905,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 2.284,
|
|
"grad_norm": 2.088749885559082,
|
|
"learning_rate": 5.483870967741935e-06,
|
|
"loss": 0.951,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 2.288,
|
|
"grad_norm": 1.7397806644439697,
|
|
"learning_rate": 5.475806451612904e-06,
|
|
"loss": 0.705,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 2.292,
|
|
"grad_norm": 2.039262294769287,
|
|
"learning_rate": 5.467741935483871e-06,
|
|
"loss": 0.852,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 2.296,
|
|
"grad_norm": 1.953506350517273,
|
|
"learning_rate": 5.45967741935484e-06,
|
|
"loss": 0.8474,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 2.3,
|
|
"grad_norm": 1.8871941566467285,
|
|
"learning_rate": 5.451612903225807e-06,
|
|
"loss": 0.745,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 2.304,
|
|
"grad_norm": 1.7728246450424194,
|
|
"learning_rate": 5.443548387096774e-06,
|
|
"loss": 0.6626,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 2.308,
|
|
"grad_norm": 1.984948754310608,
|
|
"learning_rate": 5.435483870967743e-06,
|
|
"loss": 0.8932,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 2.312,
|
|
"grad_norm": 1.9669218063354492,
|
|
"learning_rate": 5.42741935483871e-06,
|
|
"loss": 0.933,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 2.316,
|
|
"grad_norm": 2.025562286376953,
|
|
"learning_rate": 5.419354838709678e-06,
|
|
"loss": 0.825,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 2.32,
|
|
"grad_norm": 2.116050958633423,
|
|
"learning_rate": 5.411290322580646e-06,
|
|
"loss": 1.0288,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.324,
|
|
"grad_norm": 2.0228822231292725,
|
|
"learning_rate": 5.4032258064516126e-06,
|
|
"loss": 0.9068,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 2.328,
|
|
"grad_norm": 1.9385783672332764,
|
|
"learning_rate": 5.395161290322581e-06,
|
|
"loss": 0.9609,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 2.332,
|
|
"grad_norm": 2.0579097270965576,
|
|
"learning_rate": 5.387096774193549e-06,
|
|
"loss": 1.0191,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 2.336,
|
|
"grad_norm": 1.9625400304794312,
|
|
"learning_rate": 5.379032258064517e-06,
|
|
"loss": 0.872,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 2.34,
|
|
"grad_norm": 2.110098361968994,
|
|
"learning_rate": 5.370967741935484e-06,
|
|
"loss": 1.1903,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 2.344,
|
|
"grad_norm": 2.013211250305176,
|
|
"learning_rate": 5.362903225806452e-06,
|
|
"loss": 0.71,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 2.348,
|
|
"grad_norm": 1.926029920578003,
|
|
"learning_rate": 5.35483870967742e-06,
|
|
"loss": 0.785,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 2.352,
|
|
"grad_norm": 1.944735050201416,
|
|
"learning_rate": 5.346774193548388e-06,
|
|
"loss": 0.9547,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 2.356,
|
|
"grad_norm": 2.068157911300659,
|
|
"learning_rate": 5.338709677419355e-06,
|
|
"loss": 1.0035,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 2.36,
|
|
"grad_norm": 1.9762252569198608,
|
|
"learning_rate": 5.330645161290323e-06,
|
|
"loss": 0.9913,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 2.364,
|
|
"grad_norm": 2.0391647815704346,
|
|
"learning_rate": 5.322580645161291e-06,
|
|
"loss": 1.1314,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 2.368,
|
|
"grad_norm": 2.0903215408325195,
|
|
"learning_rate": 5.3145161290322585e-06,
|
|
"loss": 0.9021,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 2.372,
|
|
"grad_norm": 2.012575149536133,
|
|
"learning_rate": 5.306451612903226e-06,
|
|
"loss": 0.8438,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 2.376,
|
|
"grad_norm": 3.0318682193756104,
|
|
"learning_rate": 5.298387096774194e-06,
|
|
"loss": 0.9652,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 2.38,
|
|
"grad_norm": 2.0645689964294434,
|
|
"learning_rate": 5.290322580645162e-06,
|
|
"loss": 0.931,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 2.384,
|
|
"grad_norm": 2.010096549987793,
|
|
"learning_rate": 5.282258064516129e-06,
|
|
"loss": 0.8592,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 2.388,
|
|
"grad_norm": 2.2321157455444336,
|
|
"learning_rate": 5.274193548387098e-06,
|
|
"loss": 0.8352,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 2.392,
|
|
"grad_norm": 2.0409233570098877,
|
|
"learning_rate": 5.266129032258065e-06,
|
|
"loss": 0.9383,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 2.396,
|
|
"grad_norm": 1.9163116216659546,
|
|
"learning_rate": 5.258064516129032e-06,
|
|
"loss": 0.6861,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 2.4,
|
|
"grad_norm": 1.9450281858444214,
|
|
"learning_rate": 5.2500000000000006e-06,
|
|
"loss": 0.748,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.404,
|
|
"grad_norm": 1.9742106199264526,
|
|
"learning_rate": 5.241935483870968e-06,
|
|
"loss": 0.862,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 2.408,
|
|
"grad_norm": 2.3256115913391113,
|
|
"learning_rate": 5.233870967741937e-06,
|
|
"loss": 0.8754,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 2.412,
|
|
"grad_norm": 1.8236236572265625,
|
|
"learning_rate": 5.2258064516129035e-06,
|
|
"loss": 0.7874,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 2.416,
|
|
"grad_norm": 2.0002152919769287,
|
|
"learning_rate": 5.217741935483871e-06,
|
|
"loss": 0.9769,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 2.42,
|
|
"grad_norm": 1.9513111114501953,
|
|
"learning_rate": 5.20967741935484e-06,
|
|
"loss": 0.8151,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 2.424,
|
|
"grad_norm": 1.953925371170044,
|
|
"learning_rate": 5.2016129032258065e-06,
|
|
"loss": 0.8143,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 2.428,
|
|
"grad_norm": 2.4850172996520996,
|
|
"learning_rate": 5.193548387096775e-06,
|
|
"loss": 1.0706,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 2.432,
|
|
"grad_norm": 1.8947373628616333,
|
|
"learning_rate": 5.185483870967743e-06,
|
|
"loss": 0.7941,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 2.436,
|
|
"grad_norm": 2.0892701148986816,
|
|
"learning_rate": 5.1774193548387095e-06,
|
|
"loss": 0.9921,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 2.44,
|
|
"grad_norm": 2.635343074798584,
|
|
"learning_rate": 5.169354838709678e-06,
|
|
"loss": 0.9391,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 2.444,
|
|
"grad_norm": 2.0520694255828857,
|
|
"learning_rate": 5.161290322580646e-06,
|
|
"loss": 0.8278,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 2.448,
|
|
"grad_norm": 2.127861499786377,
|
|
"learning_rate": 5.153225806451614e-06,
|
|
"loss": 1.1514,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 2.452,
|
|
"grad_norm": 2.0440480709075928,
|
|
"learning_rate": 5.145161290322581e-06,
|
|
"loss": 0.7209,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 2.456,
|
|
"grad_norm": 1.8232911825180054,
|
|
"learning_rate": 5.1370967741935486e-06,
|
|
"loss": 0.6083,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 2.46,
|
|
"grad_norm": 2.0450599193573,
|
|
"learning_rate": 5.129032258064517e-06,
|
|
"loss": 0.824,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 2.464,
|
|
"grad_norm": 2.1209301948547363,
|
|
"learning_rate": 5.120967741935484e-06,
|
|
"loss": 1.1781,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 2.468,
|
|
"grad_norm": 2.2119786739349365,
|
|
"learning_rate": 5.1129032258064515e-06,
|
|
"loss": 1.1847,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 2.472,
|
|
"grad_norm": 2.3321080207824707,
|
|
"learning_rate": 5.10483870967742e-06,
|
|
"loss": 0.8121,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 2.476,
|
|
"grad_norm": 2.008375644683838,
|
|
"learning_rate": 5.096774193548387e-06,
|
|
"loss": 0.8741,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 2.48,
|
|
"grad_norm": 2.2514259815216064,
|
|
"learning_rate": 5.088709677419355e-06,
|
|
"loss": 1.1202,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.484,
|
|
"grad_norm": 1.8712433576583862,
|
|
"learning_rate": 5.080645161290323e-06,
|
|
"loss": 1.002,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 2.488,
|
|
"grad_norm": 2.0136568546295166,
|
|
"learning_rate": 5.07258064516129e-06,
|
|
"loss": 0.9409,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 2.492,
|
|
"grad_norm": 2.0185978412628174,
|
|
"learning_rate": 5.064516129032258e-06,
|
|
"loss": 0.7706,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 2.496,
|
|
"grad_norm": 2.5203325748443604,
|
|
"learning_rate": 5.056451612903226e-06,
|
|
"loss": 0.8937,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 2.5,
|
|
"grad_norm": 1.8861210346221924,
|
|
"learning_rate": 5.0483870967741945e-06,
|
|
"loss": 0.7429,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 2.504,
|
|
"grad_norm": 2.0991008281707764,
|
|
"learning_rate": 5.040322580645161e-06,
|
|
"loss": 1.1311,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 2.508,
|
|
"grad_norm": 1.919195532798767,
|
|
"learning_rate": 5.032258064516129e-06,
|
|
"loss": 0.8449,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 2.512,
|
|
"grad_norm": 2.0639350414276123,
|
|
"learning_rate": 5.0241935483870974e-06,
|
|
"loss": 0.7829,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 2.516,
|
|
"grad_norm": 2.0387368202209473,
|
|
"learning_rate": 5.016129032258065e-06,
|
|
"loss": 0.7948,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 2.52,
|
|
"grad_norm": 2.121511459350586,
|
|
"learning_rate": 5.008064516129033e-06,
|
|
"loss": 0.8836,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 2.524,
|
|
"grad_norm": 2.033421277999878,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.8941,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 2.528,
|
|
"grad_norm": 1.9068338871002197,
|
|
"learning_rate": 4.991935483870968e-06,
|
|
"loss": 0.7699,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 2.532,
|
|
"grad_norm": 2.0993237495422363,
|
|
"learning_rate": 4.983870967741936e-06,
|
|
"loss": 0.9509,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 2.536,
|
|
"grad_norm": 1.8794467449188232,
|
|
"learning_rate": 4.975806451612903e-06,
|
|
"loss": 1.0481,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 2.54,
|
|
"grad_norm": 2.022815704345703,
|
|
"learning_rate": 4.967741935483871e-06,
|
|
"loss": 0.8893,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 2.544,
|
|
"grad_norm": 2.1620354652404785,
|
|
"learning_rate": 4.9596774193548395e-06,
|
|
"loss": 0.8449,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 2.548,
|
|
"grad_norm": 2.199138641357422,
|
|
"learning_rate": 4.951612903225807e-06,
|
|
"loss": 1.099,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 2.552,
|
|
"grad_norm": 1.8625229597091675,
|
|
"learning_rate": 4.943548387096775e-06,
|
|
"loss": 0.6892,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 2.556,
|
|
"grad_norm": 1.9562162160873413,
|
|
"learning_rate": 4.9354838709677425e-06,
|
|
"loss": 0.832,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 2.56,
|
|
"grad_norm": 2.2847657203674316,
|
|
"learning_rate": 4.92741935483871e-06,
|
|
"loss": 0.985,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.564,
|
|
"grad_norm": 1.908446192741394,
|
|
"learning_rate": 4.919354838709678e-06,
|
|
"loss": 0.7687,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 2.568,
|
|
"grad_norm": 2.076167106628418,
|
|
"learning_rate": 4.9112903225806455e-06,
|
|
"loss": 0.7979,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 2.572,
|
|
"grad_norm": 1.9638773202896118,
|
|
"learning_rate": 4.903225806451613e-06,
|
|
"loss": 0.7731,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 2.576,
|
|
"grad_norm": 1.938830018043518,
|
|
"learning_rate": 4.895161290322581e-06,
|
|
"loss": 0.6483,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 2.58,
|
|
"grad_norm": 1.9963139295578003,
|
|
"learning_rate": 4.8870967741935484e-06,
|
|
"loss": 0.7376,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 2.584,
|
|
"grad_norm": 2.0620100498199463,
|
|
"learning_rate": 4.879032258064517e-06,
|
|
"loss": 0.7429,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 2.588,
|
|
"grad_norm": 2.256847858428955,
|
|
"learning_rate": 4.870967741935485e-06,
|
|
"loss": 1.0364,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 2.592,
|
|
"grad_norm": 1.9625539779663086,
|
|
"learning_rate": 4.862903225806451e-06,
|
|
"loss": 0.7753,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 2.596,
|
|
"grad_norm": 2.0241689682006836,
|
|
"learning_rate": 4.85483870967742e-06,
|
|
"loss": 0.7314,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 2.6,
|
|
"grad_norm": 2.127464771270752,
|
|
"learning_rate": 4.8467741935483876e-06,
|
|
"loss": 0.9116,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 2.604,
|
|
"grad_norm": 2.0825483798980713,
|
|
"learning_rate": 4.838709677419355e-06,
|
|
"loss": 0.892,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 2.608,
|
|
"grad_norm": 1.9793882369995117,
|
|
"learning_rate": 4.830645161290323e-06,
|
|
"loss": 0.8634,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 2.612,
|
|
"grad_norm": 2.1449151039123535,
|
|
"learning_rate": 4.8225806451612905e-06,
|
|
"loss": 0.8948,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 2.616,
|
|
"grad_norm": 2.0711538791656494,
|
|
"learning_rate": 4.814516129032258e-06,
|
|
"loss": 0.9662,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 2.62,
|
|
"grad_norm": 2.1446480751037598,
|
|
"learning_rate": 4.806451612903227e-06,
|
|
"loss": 1.1214,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 2.624,
|
|
"grad_norm": 2.095132350921631,
|
|
"learning_rate": 4.798387096774194e-06,
|
|
"loss": 0.9048,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 2.628,
|
|
"grad_norm": 2.2719180583953857,
|
|
"learning_rate": 4.790322580645161e-06,
|
|
"loss": 0.9439,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 2.632,
|
|
"grad_norm": 2.134521961212158,
|
|
"learning_rate": 4.78225806451613e-06,
|
|
"loss": 0.9083,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 2.636,
|
|
"grad_norm": 2.250610589981079,
|
|
"learning_rate": 4.774193548387097e-06,
|
|
"loss": 1.0183,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 2.64,
|
|
"grad_norm": 2.1581926345825195,
|
|
"learning_rate": 4.766129032258065e-06,
|
|
"loss": 1.0784,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.644,
|
|
"grad_norm": 1.8543059825897217,
|
|
"learning_rate": 4.758064516129033e-06,
|
|
"loss": 0.7357,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 2.648,
|
|
"grad_norm": 2.0010993480682373,
|
|
"learning_rate": 4.75e-06,
|
|
"loss": 0.8364,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 2.652,
|
|
"grad_norm": 2.1048102378845215,
|
|
"learning_rate": 4.741935483870968e-06,
|
|
"loss": 0.945,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 2.656,
|
|
"grad_norm": 1.93734872341156,
|
|
"learning_rate": 4.7338709677419356e-06,
|
|
"loss": 0.9438,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 2.66,
|
|
"grad_norm": 1.8726097345352173,
|
|
"learning_rate": 4.725806451612904e-06,
|
|
"loss": 0.7453,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 2.664,
|
|
"grad_norm": 1.912177324295044,
|
|
"learning_rate": 4.717741935483872e-06,
|
|
"loss": 0.8832,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 2.668,
|
|
"grad_norm": 2.0042824745178223,
|
|
"learning_rate": 4.7096774193548385e-06,
|
|
"loss": 0.9554,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 2.672,
|
|
"grad_norm": 2.0271189212799072,
|
|
"learning_rate": 4.701612903225807e-06,
|
|
"loss": 0.8369,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 2.676,
|
|
"grad_norm": 1.935368299484253,
|
|
"learning_rate": 4.693548387096775e-06,
|
|
"loss": 0.839,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 2.68,
|
|
"grad_norm": 1.7850247621536255,
|
|
"learning_rate": 4.685483870967742e-06,
|
|
"loss": 0.6585,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 2.684,
|
|
"grad_norm": 2.017695665359497,
|
|
"learning_rate": 4.67741935483871e-06,
|
|
"loss": 0.983,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 2.6879999999999997,
|
|
"grad_norm": 2.1200356483459473,
|
|
"learning_rate": 4.669354838709678e-06,
|
|
"loss": 1.0346,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 2.692,
|
|
"grad_norm": 1.8420408964157104,
|
|
"learning_rate": 4.661290322580645e-06,
|
|
"loss": 0.6386,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 2.6959999999999997,
|
|
"grad_norm": 2.109278678894043,
|
|
"learning_rate": 4.653225806451613e-06,
|
|
"loss": 0.8037,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 2.7,
|
|
"grad_norm": 2.1483943462371826,
|
|
"learning_rate": 4.6451612903225815e-06,
|
|
"loss": 0.9849,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 2.7039999999999997,
|
|
"grad_norm": 2.1622354984283447,
|
|
"learning_rate": 4.637096774193548e-06,
|
|
"loss": 0.9104,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 2.708,
|
|
"grad_norm": 2.178973913192749,
|
|
"learning_rate": 4.629032258064517e-06,
|
|
"loss": 0.9923,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 2.7119999999999997,
|
|
"grad_norm": 2.0353808403015137,
|
|
"learning_rate": 4.6209677419354844e-06,
|
|
"loss": 0.9072,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 2.716,
|
|
"grad_norm": 2.2480483055114746,
|
|
"learning_rate": 4.612903225806452e-06,
|
|
"loss": 1.1416,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 2.7199999999999998,
|
|
"grad_norm": 2.0765092372894287,
|
|
"learning_rate": 4.60483870967742e-06,
|
|
"loss": 0.9671,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 2.724,
|
|
"grad_norm": 2.0080316066741943,
|
|
"learning_rate": 4.596774193548387e-06,
|
|
"loss": 1.0271,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 2.7279999999999998,
|
|
"grad_norm": 2.1100165843963623,
|
|
"learning_rate": 4.588709677419355e-06,
|
|
"loss": 0.8433,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 2.732,
|
|
"grad_norm": 1.883931279182434,
|
|
"learning_rate": 4.580645161290323e-06,
|
|
"loss": 0.7499,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 2.7359999999999998,
|
|
"grad_norm": 2.1434648036956787,
|
|
"learning_rate": 4.572580645161291e-06,
|
|
"loss": 0.8798,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 2.74,
|
|
"grad_norm": 2.163625717163086,
|
|
"learning_rate": 4.564516129032259e-06,
|
|
"loss": 1.0695,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 2.7439999999999998,
|
|
"grad_norm": 2.0244061946868896,
|
|
"learning_rate": 4.556451612903226e-06,
|
|
"loss": 1.1869,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 2.748,
|
|
"grad_norm": 1.9727342128753662,
|
|
"learning_rate": 4.548387096774194e-06,
|
|
"loss": 0.8893,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 2.752,
|
|
"grad_norm": 2.0887742042541504,
|
|
"learning_rate": 4.540322580645162e-06,
|
|
"loss": 1.0081,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 2.7560000000000002,
|
|
"grad_norm": 1.9039736986160278,
|
|
"learning_rate": 4.5322580645161295e-06,
|
|
"loss": 0.7422,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 2.76,
|
|
"grad_norm": 1.9305413961410522,
|
|
"learning_rate": 4.524193548387097e-06,
|
|
"loss": 0.946,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 2.7640000000000002,
|
|
"grad_norm": 2.099938154220581,
|
|
"learning_rate": 4.516129032258065e-06,
|
|
"loss": 0.8641,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 2.768,
|
|
"grad_norm": 2.146622657775879,
|
|
"learning_rate": 4.5080645161290325e-06,
|
|
"loss": 0.9026,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 2.7720000000000002,
|
|
"grad_norm": 2.158890724182129,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 0.8535,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 2.776,
|
|
"grad_norm": 1.977545976638794,
|
|
"learning_rate": 4.491935483870969e-06,
|
|
"loss": 0.8315,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 2.7800000000000002,
|
|
"grad_norm": 2.205862522125244,
|
|
"learning_rate": 4.4838709677419354e-06,
|
|
"loss": 0.8047,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 2.784,
|
|
"grad_norm": 2.0016543865203857,
|
|
"learning_rate": 4.475806451612903e-06,
|
|
"loss": 0.7461,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 2.7880000000000003,
|
|
"grad_norm": 2.0579326152801514,
|
|
"learning_rate": 4.467741935483872e-06,
|
|
"loss": 0.9703,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 2.792,
|
|
"grad_norm": 2.0946247577667236,
|
|
"learning_rate": 4.459677419354839e-06,
|
|
"loss": 0.9027,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 2.7960000000000003,
|
|
"grad_norm": 2.0501272678375244,
|
|
"learning_rate": 4.451612903225807e-06,
|
|
"loss": 0.8489,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 2.8,
|
|
"grad_norm": 2.2338743209838867,
|
|
"learning_rate": 4.4435483870967745e-06,
|
|
"loss": 1.0747,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 2.8040000000000003,
|
|
"grad_norm": 1.8594715595245361,
|
|
"learning_rate": 4.435483870967742e-06,
|
|
"loss": 0.7034,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 2.808,
|
|
"grad_norm": 2.1036953926086426,
|
|
"learning_rate": 4.42741935483871e-06,
|
|
"loss": 1.1334,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 2.8120000000000003,
|
|
"grad_norm": 2.05660343170166,
|
|
"learning_rate": 4.419354838709678e-06,
|
|
"loss": 0.9696,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 2.816,
|
|
"grad_norm": 1.8654855489730835,
|
|
"learning_rate": 4.411290322580645e-06,
|
|
"loss": 0.7639,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 2.82,
|
|
"grad_norm": 2.0342979431152344,
|
|
"learning_rate": 4.403225806451613e-06,
|
|
"loss": 0.7359,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 2.824,
|
|
"grad_norm": 2.1098721027374268,
|
|
"learning_rate": 4.395161290322581e-06,
|
|
"loss": 0.9963,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 2.828,
|
|
"grad_norm": 2.033388614654541,
|
|
"learning_rate": 4.387096774193549e-06,
|
|
"loss": 1.0258,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 2.832,
|
|
"grad_norm": 2.041229248046875,
|
|
"learning_rate": 4.379032258064517e-06,
|
|
"loss": 0.8926,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 2.836,
|
|
"grad_norm": 2.04064679145813,
|
|
"learning_rate": 4.370967741935484e-06,
|
|
"loss": 0.9707,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 2.84,
|
|
"grad_norm": 2.09248685836792,
|
|
"learning_rate": 4.362903225806452e-06,
|
|
"loss": 0.9311,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 2.844,
|
|
"grad_norm": 1.9359509944915771,
|
|
"learning_rate": 4.35483870967742e-06,
|
|
"loss": 0.7429,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 2.848,
|
|
"grad_norm": 1.8163801431655884,
|
|
"learning_rate": 4.346774193548387e-06,
|
|
"loss": 0.7459,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 2.852,
|
|
"grad_norm": 2.0014097690582275,
|
|
"learning_rate": 4.338709677419356e-06,
|
|
"loss": 0.7952,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 2.856,
|
|
"grad_norm": 1.8234827518463135,
|
|
"learning_rate": 4.3306451612903226e-06,
|
|
"loss": 0.7551,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 2.86,
|
|
"grad_norm": 1.8983055353164673,
|
|
"learning_rate": 4.32258064516129e-06,
|
|
"loss": 0.8233,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 2.864,
|
|
"grad_norm": 2.007643222808838,
|
|
"learning_rate": 4.314516129032259e-06,
|
|
"loss": 0.8136,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 2.868,
|
|
"grad_norm": 2.030146598815918,
|
|
"learning_rate": 4.306451612903226e-06,
|
|
"loss": 0.9544,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 2.872,
|
|
"grad_norm": 1.9575273990631104,
|
|
"learning_rate": 4.298387096774194e-06,
|
|
"loss": 0.8132,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 2.876,
|
|
"grad_norm": 2.100543260574341,
|
|
"learning_rate": 4.290322580645162e-06,
|
|
"loss": 0.8235,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 2.88,
|
|
"grad_norm": 2.001739501953125,
|
|
"learning_rate": 4.282258064516129e-06,
|
|
"loss": 0.8988,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 2.884,
|
|
"grad_norm": 1.9895164966583252,
|
|
"learning_rate": 4.274193548387097e-06,
|
|
"loss": 0.8197,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 2.888,
|
|
"grad_norm": 2.17423677444458,
|
|
"learning_rate": 4.266129032258065e-06,
|
|
"loss": 1.0093,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 2.892,
|
|
"grad_norm": 2.095899820327759,
|
|
"learning_rate": 4.258064516129032e-06,
|
|
"loss": 0.8459,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 2.896,
|
|
"grad_norm": 1.9749155044555664,
|
|
"learning_rate": 4.25e-06,
|
|
"loss": 0.728,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 2.9,
|
|
"grad_norm": 2.1287543773651123,
|
|
"learning_rate": 4.2419354838709685e-06,
|
|
"loss": 0.893,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 2.904,
|
|
"grad_norm": 2.2005615234375,
|
|
"learning_rate": 4.233870967741936e-06,
|
|
"loss": 0.9092,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 2.908,
|
|
"grad_norm": 1.9065170288085938,
|
|
"learning_rate": 4.225806451612904e-06,
|
|
"loss": 0.7934,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 2.912,
|
|
"grad_norm": 2.1782727241516113,
|
|
"learning_rate": 4.2177419354838714e-06,
|
|
"loss": 0.9975,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 2.916,
|
|
"grad_norm": 1.943291425704956,
|
|
"learning_rate": 4.209677419354839e-06,
|
|
"loss": 0.6421,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 2.92,
|
|
"grad_norm": 2.0664920806884766,
|
|
"learning_rate": 4.201612903225807e-06,
|
|
"loss": 0.9832,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 2.924,
|
|
"grad_norm": 2.025261163711548,
|
|
"learning_rate": 4.193548387096774e-06,
|
|
"loss": 0.8664,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 2.928,
|
|
"grad_norm": 2.236361503601074,
|
|
"learning_rate": 4.185483870967742e-06,
|
|
"loss": 0.9429,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 2.932,
|
|
"grad_norm": 2.07635235786438,
|
|
"learning_rate": 4.17741935483871e-06,
|
|
"loss": 0.8334,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 2.936,
|
|
"grad_norm": 2.529341697692871,
|
|
"learning_rate": 4.169354838709677e-06,
|
|
"loss": 1.3918,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 2.94,
|
|
"grad_norm": 2.07926869392395,
|
|
"learning_rate": 4.161290322580646e-06,
|
|
"loss": 0.8891,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 2.944,
|
|
"grad_norm": 2.050360679626465,
|
|
"learning_rate": 4.1532258064516135e-06,
|
|
"loss": 0.6847,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 2.948,
|
|
"grad_norm": 1.9370849132537842,
|
|
"learning_rate": 4.14516129032258e-06,
|
|
"loss": 0.9969,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 2.952,
|
|
"grad_norm": 2.014514923095703,
|
|
"learning_rate": 4.137096774193549e-06,
|
|
"loss": 0.9307,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 2.956,
|
|
"grad_norm": 1.977842092514038,
|
|
"learning_rate": 4.1290322580645165e-06,
|
|
"loss": 0.7776,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 2.96,
|
|
"grad_norm": 2.2410061359405518,
|
|
"learning_rate": 4.120967741935484e-06,
|
|
"loss": 0.7748,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 2.964,
|
|
"grad_norm": 2.128809928894043,
|
|
"learning_rate": 4.112903225806452e-06,
|
|
"loss": 0.9865,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 2.968,
|
|
"grad_norm": 2.3587141036987305,
|
|
"learning_rate": 4.1048387096774195e-06,
|
|
"loss": 1.0742,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 2.972,
|
|
"grad_norm": 1.8111485242843628,
|
|
"learning_rate": 4.096774193548387e-06,
|
|
"loss": 0.7442,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 2.976,
|
|
"grad_norm": 1.935282588005066,
|
|
"learning_rate": 4.088709677419356e-06,
|
|
"loss": 0.785,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 2.98,
|
|
"grad_norm": 2.1059017181396484,
|
|
"learning_rate": 4.080645161290323e-06,
|
|
"loss": 0.9952,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 2.984,
|
|
"grad_norm": 1.992270588874817,
|
|
"learning_rate": 4.072580645161291e-06,
|
|
"loss": 1.0289,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 2.988,
|
|
"grad_norm": 2.0285916328430176,
|
|
"learning_rate": 4.064516129032259e-06,
|
|
"loss": 0.9602,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 2.992,
|
|
"grad_norm": 2.2532975673675537,
|
|
"learning_rate": 4.056451612903226e-06,
|
|
"loss": 0.8776,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 2.996,
|
|
"grad_norm": 2.032003879547119,
|
|
"learning_rate": 4.048387096774194e-06,
|
|
"loss": 0.8821,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 2.123439311981201,
|
|
"learning_rate": 4.0403225806451615e-06,
|
|
"loss": 1.1439,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 3.004,
|
|
"grad_norm": 1.7719855308532715,
|
|
"learning_rate": 4.032258064516129e-06,
|
|
"loss": 0.7289,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 3.008,
|
|
"grad_norm": 2.1318604946136475,
|
|
"learning_rate": 4.024193548387097e-06,
|
|
"loss": 0.8148,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 3.012,
|
|
"grad_norm": 1.8814866542816162,
|
|
"learning_rate": 4.0161290322580645e-06,
|
|
"loss": 0.6389,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 3.016,
|
|
"grad_norm": 1.8504319190979004,
|
|
"learning_rate": 4.008064516129033e-06,
|
|
"loss": 0.7059,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 3.02,
|
|
"grad_norm": 1.9470055103302002,
|
|
"learning_rate": 4.000000000000001e-06,
|
|
"loss": 0.8993,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 3.024,
|
|
"grad_norm": 2.0780255794525146,
|
|
"learning_rate": 3.9919354838709675e-06,
|
|
"loss": 0.9513,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 3.028,
|
|
"grad_norm": 1.9300411939620972,
|
|
"learning_rate": 3.983870967741936e-06,
|
|
"loss": 0.9704,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 3.032,
|
|
"grad_norm": 2.2161500453948975,
|
|
"learning_rate": 3.975806451612904e-06,
|
|
"loss": 0.9227,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 3.036,
|
|
"grad_norm": 1.9605953693389893,
|
|
"learning_rate": 3.967741935483871e-06,
|
|
"loss": 0.7131,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 3.04,
|
|
"grad_norm": 2.048816442489624,
|
|
"learning_rate": 3.959677419354839e-06,
|
|
"loss": 0.81,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 3.044,
|
|
"grad_norm": 2.2346277236938477,
|
|
"learning_rate": 3.951612903225807e-06,
|
|
"loss": 1.105,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 3.048,
|
|
"grad_norm": 2.1492414474487305,
|
|
"learning_rate": 3.943548387096774e-06,
|
|
"loss": 0.7878,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 3.052,
|
|
"grad_norm": 2.0384480953216553,
|
|
"learning_rate": 3.935483870967742e-06,
|
|
"loss": 0.9071,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 3.056,
|
|
"grad_norm": 1.9908150434494019,
|
|
"learning_rate": 3.92741935483871e-06,
|
|
"loss": 0.7479,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 3.06,
|
|
"grad_norm": 2.333285093307495,
|
|
"learning_rate": 3.919354838709677e-06,
|
|
"loss": 0.872,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 3.064,
|
|
"grad_norm": 2.2055699825286865,
|
|
"learning_rate": 3.911290322580646e-06,
|
|
"loss": 1.0272,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 3.068,
|
|
"grad_norm": 2.220351219177246,
|
|
"learning_rate": 3.903225806451613e-06,
|
|
"loss": 0.805,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 3.072,
|
|
"grad_norm": 2.3372137546539307,
|
|
"learning_rate": 3.895161290322581e-06,
|
|
"loss": 0.9424,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 3.076,
|
|
"grad_norm": 1.9927171468734741,
|
|
"learning_rate": 3.887096774193549e-06,
|
|
"loss": 0.8291,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 3.08,
|
|
"grad_norm": 2.0067288875579834,
|
|
"learning_rate": 3.879032258064516e-06,
|
|
"loss": 0.7085,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 3.084,
|
|
"grad_norm": 2.0726656913757324,
|
|
"learning_rate": 3.870967741935484e-06,
|
|
"loss": 0.7554,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 3.088,
|
|
"grad_norm": 2.2010180950164795,
|
|
"learning_rate": 3.862903225806452e-06,
|
|
"loss": 0.8685,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 3.092,
|
|
"grad_norm": 1.9578466415405273,
|
|
"learning_rate": 3.85483870967742e-06,
|
|
"loss": 0.7926,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 3.096,
|
|
"grad_norm": 2.350506067276001,
|
|
"learning_rate": 3.846774193548388e-06,
|
|
"loss": 0.8615,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 3.1,
|
|
"grad_norm": 2.1344101428985596,
|
|
"learning_rate": 3.838709677419355e-06,
|
|
"loss": 0.7115,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 3.104,
|
|
"grad_norm": 2.144338846206665,
|
|
"learning_rate": 3.830645161290323e-06,
|
|
"loss": 0.8837,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 3.108,
|
|
"grad_norm": 2.0415239334106445,
|
|
"learning_rate": 3.822580645161291e-06,
|
|
"loss": 0.7773,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 3.112,
|
|
"grad_norm": 2.1297361850738525,
|
|
"learning_rate": 3.8145161290322584e-06,
|
|
"loss": 0.7109,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 3.116,
|
|
"grad_norm": 2.2194135189056396,
|
|
"learning_rate": 3.8064516129032257e-06,
|
|
"loss": 1.0434,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 3.12,
|
|
"grad_norm": 2.218269109725952,
|
|
"learning_rate": 3.7983870967741937e-06,
|
|
"loss": 1.0466,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 3.124,
|
|
"grad_norm": 2.140355110168457,
|
|
"learning_rate": 3.7903225806451614e-06,
|
|
"loss": 0.9349,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 3.128,
|
|
"grad_norm": 2.2219574451446533,
|
|
"learning_rate": 3.7822580645161295e-06,
|
|
"loss": 0.9302,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 3.132,
|
|
"grad_norm": 2.2847862243652344,
|
|
"learning_rate": 3.774193548387097e-06,
|
|
"loss": 0.8846,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 3.136,
|
|
"grad_norm": 2.7046589851379395,
|
|
"learning_rate": 3.7661290322580648e-06,
|
|
"loss": 1.1264,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 3.14,
|
|
"grad_norm": 2.2613577842712402,
|
|
"learning_rate": 3.7580645161290324e-06,
|
|
"loss": 0.8623,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 3.144,
|
|
"grad_norm": 2.1418726444244385,
|
|
"learning_rate": 3.7500000000000005e-06,
|
|
"loss": 0.708,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 3.148,
|
|
"grad_norm": 2.306887149810791,
|
|
"learning_rate": 3.741935483870968e-06,
|
|
"loss": 0.8445,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 3.152,
|
|
"grad_norm": 2.23420786857605,
|
|
"learning_rate": 3.733870967741936e-06,
|
|
"loss": 0.7293,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 3.156,
|
|
"grad_norm": 2.2788777351379395,
|
|
"learning_rate": 3.7258064516129035e-06,
|
|
"loss": 0.8153,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 3.16,
|
|
"grad_norm": 1.8298105001449585,
|
|
"learning_rate": 3.717741935483871e-06,
|
|
"loss": 0.5402,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 3.164,
|
|
"grad_norm": 2.0371906757354736,
|
|
"learning_rate": 3.7096774193548392e-06,
|
|
"loss": 0.8256,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 3.168,
|
|
"grad_norm": 2.2342934608459473,
|
|
"learning_rate": 3.701612903225807e-06,
|
|
"loss": 0.7587,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 3.172,
|
|
"grad_norm": 1.977276086807251,
|
|
"learning_rate": 3.693548387096775e-06,
|
|
"loss": 0.6206,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 3.176,
|
|
"grad_norm": 2.0874223709106445,
|
|
"learning_rate": 3.685483870967742e-06,
|
|
"loss": 0.7768,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 3.18,
|
|
"grad_norm": 2.3658487796783447,
|
|
"learning_rate": 3.67741935483871e-06,
|
|
"loss": 1.0724,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 3.184,
|
|
"grad_norm": 2.1156909465789795,
|
|
"learning_rate": 3.669354838709678e-06,
|
|
"loss": 0.7234,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 3.188,
|
|
"grad_norm": 2.087329387664795,
|
|
"learning_rate": 3.6612903225806456e-06,
|
|
"loss": 0.7502,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 3.192,
|
|
"grad_norm": 2.002694606781006,
|
|
"learning_rate": 3.653225806451613e-06,
|
|
"loss": 0.6678,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 3.196,
|
|
"grad_norm": 2.2412350177764893,
|
|
"learning_rate": 3.645161290322581e-06,
|
|
"loss": 0.8916,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 3.2,
|
|
"grad_norm": 2.1156818866729736,
|
|
"learning_rate": 3.6370967741935485e-06,
|
|
"loss": 0.8201,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 3.204,
|
|
"grad_norm": 1.9646267890930176,
|
|
"learning_rate": 3.6290322580645166e-06,
|
|
"loss": 0.6466,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 3.208,
|
|
"grad_norm": 2.306908369064331,
|
|
"learning_rate": 3.6209677419354843e-06,
|
|
"loss": 0.8978,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 3.212,
|
|
"grad_norm": 2.0926380157470703,
|
|
"learning_rate": 3.6129032258064515e-06,
|
|
"loss": 0.8653,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 3.216,
|
|
"grad_norm": 1.9711487293243408,
|
|
"learning_rate": 3.6048387096774196e-06,
|
|
"loss": 0.6135,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 3.22,
|
|
"grad_norm": 1.9564828872680664,
|
|
"learning_rate": 3.5967741935483872e-06,
|
|
"loss": 0.5758,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 3.224,
|
|
"grad_norm": 2.0477702617645264,
|
|
"learning_rate": 3.5887096774193553e-06,
|
|
"loss": 0.6318,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 3.228,
|
|
"grad_norm": 2.380937337875366,
|
|
"learning_rate": 3.580645161290323e-06,
|
|
"loss": 0.7827,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 3.232,
|
|
"grad_norm": 2.2055320739746094,
|
|
"learning_rate": 3.5725806451612906e-06,
|
|
"loss": 1.0348,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 3.2359999999999998,
|
|
"grad_norm": 2.1511037349700928,
|
|
"learning_rate": 3.5645161290322583e-06,
|
|
"loss": 0.7221,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 3.24,
|
|
"grad_norm": 2.1401073932647705,
|
|
"learning_rate": 3.5564516129032264e-06,
|
|
"loss": 0.7153,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 3.2439999999999998,
|
|
"grad_norm": 2.51979660987854,
|
|
"learning_rate": 3.548387096774194e-06,
|
|
"loss": 0.6526,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 3.248,
|
|
"grad_norm": 2.209287643432617,
|
|
"learning_rate": 3.5403225806451612e-06,
|
|
"loss": 0.9964,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 3.252,
|
|
"grad_norm": 1.9091753959655762,
|
|
"learning_rate": 3.5322580645161293e-06,
|
|
"loss": 0.6887,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 3.2560000000000002,
|
|
"grad_norm": 2.0265955924987793,
|
|
"learning_rate": 3.524193548387097e-06,
|
|
"loss": 0.8468,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 3.26,
|
|
"grad_norm": 2.014711618423462,
|
|
"learning_rate": 3.516129032258065e-06,
|
|
"loss": 0.8604,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 3.2640000000000002,
|
|
"grad_norm": 1.97525155544281,
|
|
"learning_rate": 3.5080645161290327e-06,
|
|
"loss": 0.6563,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 3.268,
|
|
"grad_norm": 2.1765522956848145,
|
|
"learning_rate": 3.5e-06,
|
|
"loss": 0.9257,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 3.2720000000000002,
|
|
"grad_norm": 2.2417075634002686,
|
|
"learning_rate": 3.491935483870968e-06,
|
|
"loss": 0.8021,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 3.276,
|
|
"grad_norm": 2.251518487930298,
|
|
"learning_rate": 3.4838709677419357e-06,
|
|
"loss": 0.7227,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 3.2800000000000002,
|
|
"grad_norm": 2.335770606994629,
|
|
"learning_rate": 3.4758064516129038e-06,
|
|
"loss": 0.8255,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 3.284,
|
|
"grad_norm": 2.1436550617218018,
|
|
"learning_rate": 3.4677419354838714e-06,
|
|
"loss": 0.9514,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 3.288,
|
|
"grad_norm": 2.09932804107666,
|
|
"learning_rate": 3.4596774193548386e-06,
|
|
"loss": 0.5868,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 3.292,
|
|
"grad_norm": 2.198049306869507,
|
|
"learning_rate": 3.4516129032258067e-06,
|
|
"loss": 0.8944,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 3.296,
|
|
"grad_norm": 2.0434253215789795,
|
|
"learning_rate": 3.4435483870967744e-06,
|
|
"loss": 0.8541,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 3.3,
|
|
"grad_norm": 2.1087992191314697,
|
|
"learning_rate": 3.4354838709677425e-06,
|
|
"loss": 0.8743,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 3.304,
|
|
"grad_norm": 2.2953779697418213,
|
|
"learning_rate": 3.4274193548387097e-06,
|
|
"loss": 0.8574,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 3.308,
|
|
"grad_norm": 2.283322811126709,
|
|
"learning_rate": 3.4193548387096773e-06,
|
|
"loss": 0.8048,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 3.312,
|
|
"grad_norm": 2.1171464920043945,
|
|
"learning_rate": 3.4112903225806454e-06,
|
|
"loss": 0.8447,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 3.316,
|
|
"grad_norm": 2.184858560562134,
|
|
"learning_rate": 3.403225806451613e-06,
|
|
"loss": 0.8631,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 3.32,
|
|
"grad_norm": 2.0850114822387695,
|
|
"learning_rate": 3.395161290322581e-06,
|
|
"loss": 0.7252,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 3.324,
|
|
"grad_norm": 2.0298373699188232,
|
|
"learning_rate": 3.3870967741935484e-06,
|
|
"loss": 0.609,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 3.328,
|
|
"grad_norm": 1.981493353843689,
|
|
"learning_rate": 3.3790322580645165e-06,
|
|
"loss": 0.6752,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 3.332,
|
|
"grad_norm": 1.9498553276062012,
|
|
"learning_rate": 3.370967741935484e-06,
|
|
"loss": 0.7262,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 3.336,
|
|
"grad_norm": 2.145020008087158,
|
|
"learning_rate": 3.362903225806452e-06,
|
|
"loss": 0.757,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 3.34,
|
|
"grad_norm": 2.355727434158325,
|
|
"learning_rate": 3.35483870967742e-06,
|
|
"loss": 0.982,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 3.344,
|
|
"grad_norm": 2.1901140213012695,
|
|
"learning_rate": 3.346774193548387e-06,
|
|
"loss": 1.0534,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 3.348,
|
|
"grad_norm": 2.0761780738830566,
|
|
"learning_rate": 3.338709677419355e-06,
|
|
"loss": 0.668,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 3.352,
|
|
"grad_norm": 1.9439619779586792,
|
|
"learning_rate": 3.330645161290323e-06,
|
|
"loss": 0.6646,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 3.356,
|
|
"grad_norm": 2.3940210342407227,
|
|
"learning_rate": 3.322580645161291e-06,
|
|
"loss": 0.8633,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 3.36,
|
|
"grad_norm": 2.194489002227783,
|
|
"learning_rate": 3.3145161290322586e-06,
|
|
"loss": 0.7908,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 3.364,
|
|
"grad_norm": 2.1915009021759033,
|
|
"learning_rate": 3.306451612903226e-06,
|
|
"loss": 0.7874,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 3.368,
|
|
"grad_norm": 2.230459451675415,
|
|
"learning_rate": 3.298387096774194e-06,
|
|
"loss": 0.8457,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 3.372,
|
|
"grad_norm": 1.936184048652649,
|
|
"learning_rate": 3.2903225806451615e-06,
|
|
"loss": 0.5494,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 3.376,
|
|
"grad_norm": 1.977290391921997,
|
|
"learning_rate": 3.2822580645161296e-06,
|
|
"loss": 0.5436,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 3.38,
|
|
"grad_norm": 1.9874720573425293,
|
|
"learning_rate": 3.274193548387097e-06,
|
|
"loss": 0.6058,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 3.384,
|
|
"grad_norm": 2.178884983062744,
|
|
"learning_rate": 3.2661290322580645e-06,
|
|
"loss": 0.786,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 3.388,
|
|
"grad_norm": 2.1946234703063965,
|
|
"learning_rate": 3.2580645161290326e-06,
|
|
"loss": 0.8966,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 3.392,
|
|
"grad_norm": 2.4029574394226074,
|
|
"learning_rate": 3.2500000000000002e-06,
|
|
"loss": 0.8739,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 3.396,
|
|
"grad_norm": 2.372209072113037,
|
|
"learning_rate": 3.2419354838709683e-06,
|
|
"loss": 0.8783,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 3.4,
|
|
"grad_norm": 2.1974384784698486,
|
|
"learning_rate": 3.2338709677419355e-06,
|
|
"loss": 0.7697,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 3.404,
|
|
"grad_norm": 2.1393015384674072,
|
|
"learning_rate": 3.225806451612903e-06,
|
|
"loss": 0.7115,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 3.408,
|
|
"grad_norm": 2.1827282905578613,
|
|
"learning_rate": 3.2177419354838713e-06,
|
|
"loss": 0.6953,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 3.412,
|
|
"grad_norm": 2.194448709487915,
|
|
"learning_rate": 3.209677419354839e-06,
|
|
"loss": 0.7056,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 3.416,
|
|
"grad_norm": 2.071216106414795,
|
|
"learning_rate": 3.201612903225807e-06,
|
|
"loss": 0.6491,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 3.42,
|
|
"grad_norm": 2.0821046829223633,
|
|
"learning_rate": 3.1935483870967742e-06,
|
|
"loss": 0.7056,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 3.424,
|
|
"grad_norm": 2.3360350131988525,
|
|
"learning_rate": 3.1854838709677423e-06,
|
|
"loss": 0.8335,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 3.428,
|
|
"grad_norm": 2.363644599914551,
|
|
"learning_rate": 3.17741935483871e-06,
|
|
"loss": 0.9493,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 3.432,
|
|
"grad_norm": 2.0370776653289795,
|
|
"learning_rate": 3.169354838709678e-06,
|
|
"loss": 0.6045,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 3.436,
|
|
"grad_norm": 1.980400562286377,
|
|
"learning_rate": 3.1612903225806453e-06,
|
|
"loss": 0.7546,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 3.44,
|
|
"grad_norm": 2.0285377502441406,
|
|
"learning_rate": 3.153225806451613e-06,
|
|
"loss": 0.7595,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 3.444,
|
|
"grad_norm": 2.0669169425964355,
|
|
"learning_rate": 3.145161290322581e-06,
|
|
"loss": 0.7445,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 3.448,
|
|
"grad_norm": 2.302081346511841,
|
|
"learning_rate": 3.1370967741935487e-06,
|
|
"loss": 0.8515,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 3.452,
|
|
"grad_norm": 2.183147668838501,
|
|
"learning_rate": 3.1290322580645167e-06,
|
|
"loss": 0.839,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 3.456,
|
|
"grad_norm": 2.367276191711426,
|
|
"learning_rate": 3.120967741935484e-06,
|
|
"loss": 0.8589,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 3.46,
|
|
"grad_norm": 2.2339820861816406,
|
|
"learning_rate": 3.1129032258064516e-06,
|
|
"loss": 0.756,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 3.464,
|
|
"grad_norm": 2.2325429916381836,
|
|
"learning_rate": 3.1048387096774197e-06,
|
|
"loss": 0.7331,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 3.468,
|
|
"grad_norm": 1.9906346797943115,
|
|
"learning_rate": 3.0967741935483874e-06,
|
|
"loss": 0.6628,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 3.472,
|
|
"grad_norm": 1.958089828491211,
|
|
"learning_rate": 3.0887096774193554e-06,
|
|
"loss": 0.5923,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 3.476,
|
|
"grad_norm": 2.2247402667999268,
|
|
"learning_rate": 3.0806451612903227e-06,
|
|
"loss": 0.8411,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 3.48,
|
|
"grad_norm": 2.3002967834472656,
|
|
"learning_rate": 3.0725806451612903e-06,
|
|
"loss": 0.9978,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 3.484,
|
|
"grad_norm": 2.2606053352355957,
|
|
"learning_rate": 3.0645161290322584e-06,
|
|
"loss": 0.9531,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 3.488,
|
|
"grad_norm": 2.662911891937256,
|
|
"learning_rate": 3.056451612903226e-06,
|
|
"loss": 1.014,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 3.492,
|
|
"grad_norm": 2.019650936126709,
|
|
"learning_rate": 3.0483870967741937e-06,
|
|
"loss": 0.6297,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 3.496,
|
|
"grad_norm": 2.097616195678711,
|
|
"learning_rate": 3.0403225806451614e-06,
|
|
"loss": 0.7289,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 3.5,
|
|
"grad_norm": 2.0663158893585205,
|
|
"learning_rate": 3.0322580645161295e-06,
|
|
"loss": 0.602,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 3.504,
|
|
"grad_norm": 2.182816505432129,
|
|
"learning_rate": 3.024193548387097e-06,
|
|
"loss": 0.851,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 3.508,
|
|
"grad_norm": 2.0033254623413086,
|
|
"learning_rate": 3.0161290322580648e-06,
|
|
"loss": 0.538,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 3.512,
|
|
"grad_norm": 2.444193124771118,
|
|
"learning_rate": 3.0080645161290324e-06,
|
|
"loss": 0.8755,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 3.516,
|
|
"grad_norm": 2.156662940979004,
|
|
"learning_rate": 3e-06,
|
|
"loss": 0.866,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 3.52,
|
|
"grad_norm": 2.0604827404022217,
|
|
"learning_rate": 2.991935483870968e-06,
|
|
"loss": 0.7635,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 3.524,
|
|
"grad_norm": 2.1980183124542236,
|
|
"learning_rate": 2.983870967741936e-06,
|
|
"loss": 0.9103,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 3.528,
|
|
"grad_norm": 2.5254287719726562,
|
|
"learning_rate": 2.975806451612904e-06,
|
|
"loss": 1.0833,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 3.532,
|
|
"grad_norm": 2.141465902328491,
|
|
"learning_rate": 2.967741935483871e-06,
|
|
"loss": 0.667,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 3.536,
|
|
"grad_norm": 2.2055859565734863,
|
|
"learning_rate": 2.9596774193548388e-06,
|
|
"loss": 0.6874,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 3.54,
|
|
"grad_norm": 2.136110544204712,
|
|
"learning_rate": 2.951612903225807e-06,
|
|
"loss": 0.7897,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 3.544,
|
|
"grad_norm": 2.2698121070861816,
|
|
"learning_rate": 2.9435483870967745e-06,
|
|
"loss": 0.7304,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 3.548,
|
|
"grad_norm": 2.3062589168548584,
|
|
"learning_rate": 2.9354838709677417e-06,
|
|
"loss": 1.0791,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 3.552,
|
|
"grad_norm": 2.4121716022491455,
|
|
"learning_rate": 2.92741935483871e-06,
|
|
"loss": 0.9852,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 3.556,
|
|
"grad_norm": 2.3910701274871826,
|
|
"learning_rate": 2.9193548387096775e-06,
|
|
"loss": 0.9276,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 3.56,
|
|
"grad_norm": 2.2537155151367188,
|
|
"learning_rate": 2.9112903225806456e-06,
|
|
"loss": 0.9438,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 3.564,
|
|
"grad_norm": 2.2412519454956055,
|
|
"learning_rate": 2.903225806451613e-06,
|
|
"loss": 0.6798,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 3.568,
|
|
"grad_norm": 2.003661632537842,
|
|
"learning_rate": 2.8951612903225804e-06,
|
|
"loss": 0.6765,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 3.572,
|
|
"grad_norm": 2.2614223957061768,
|
|
"learning_rate": 2.8870967741935485e-06,
|
|
"loss": 0.871,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 3.576,
|
|
"grad_norm": 2.2518208026885986,
|
|
"learning_rate": 2.879032258064516e-06,
|
|
"loss": 0.829,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 3.58,
|
|
"grad_norm": 2.1910247802734375,
|
|
"learning_rate": 2.8709677419354843e-06,
|
|
"loss": 0.7672,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 3.584,
|
|
"grad_norm": 2.079641580581665,
|
|
"learning_rate": 2.862903225806452e-06,
|
|
"loss": 0.8007,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 3.588,
|
|
"grad_norm": 2.3112521171569824,
|
|
"learning_rate": 2.8548387096774196e-06,
|
|
"loss": 0.8943,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 3.592,
|
|
"grad_norm": 2.2150983810424805,
|
|
"learning_rate": 2.8467741935483872e-06,
|
|
"loss": 0.8935,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 3.596,
|
|
"grad_norm": 2.2401678562164307,
|
|
"learning_rate": 2.8387096774193553e-06,
|
|
"loss": 0.834,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 3.6,
|
|
"grad_norm": 2.1043715476989746,
|
|
"learning_rate": 2.830645161290323e-06,
|
|
"loss": 0.7918,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 3.604,
|
|
"grad_norm": 2.1325125694274902,
|
|
"learning_rate": 2.822580645161291e-06,
|
|
"loss": 0.7057,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 3.608,
|
|
"grad_norm": 2.4012649059295654,
|
|
"learning_rate": 2.8145161290322583e-06,
|
|
"loss": 0.856,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 3.612,
|
|
"grad_norm": 2.29689884185791,
|
|
"learning_rate": 2.806451612903226e-06,
|
|
"loss": 0.6157,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 3.616,
|
|
"grad_norm": 2.371718168258667,
|
|
"learning_rate": 2.798387096774194e-06,
|
|
"loss": 0.7604,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 3.62,
|
|
"grad_norm": 2.0902979373931885,
|
|
"learning_rate": 2.7903225806451617e-06,
|
|
"loss": 0.639,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 3.624,
|
|
"grad_norm": 2.0452613830566406,
|
|
"learning_rate": 2.782258064516129e-06,
|
|
"loss": 0.6567,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 3.628,
|
|
"grad_norm": 2.4750518798828125,
|
|
"learning_rate": 2.774193548387097e-06,
|
|
"loss": 0.9825,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 3.632,
|
|
"grad_norm": 2.1596009731292725,
|
|
"learning_rate": 2.7661290322580646e-06,
|
|
"loss": 0.7421,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 3.636,
|
|
"grad_norm": 2.1303629875183105,
|
|
"learning_rate": 2.7580645161290327e-06,
|
|
"loss": 0.7502,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 3.64,
|
|
"grad_norm": 2.1187257766723633,
|
|
"learning_rate": 2.7500000000000004e-06,
|
|
"loss": 0.7001,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 3.644,
|
|
"grad_norm": 2.2745378017425537,
|
|
"learning_rate": 2.7419354838709676e-06,
|
|
"loss": 1.0053,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 3.648,
|
|
"grad_norm": 2.392577886581421,
|
|
"learning_rate": 2.7338709677419357e-06,
|
|
"loss": 0.842,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 3.652,
|
|
"grad_norm": 2.352764844894409,
|
|
"learning_rate": 2.7258064516129033e-06,
|
|
"loss": 0.8263,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 3.656,
|
|
"grad_norm": 2.2857632637023926,
|
|
"learning_rate": 2.7177419354838714e-06,
|
|
"loss": 0.9502,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 3.66,
|
|
"grad_norm": 2.1214029788970947,
|
|
"learning_rate": 2.709677419354839e-06,
|
|
"loss": 0.7304,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 3.664,
|
|
"grad_norm": 2.1727020740509033,
|
|
"learning_rate": 2.7016129032258063e-06,
|
|
"loss": 0.7385,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 3.668,
|
|
"grad_norm": 2.2126569747924805,
|
|
"learning_rate": 2.6935483870967744e-06,
|
|
"loss": 0.8303,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 3.672,
|
|
"grad_norm": 2.3064305782318115,
|
|
"learning_rate": 2.685483870967742e-06,
|
|
"loss": 0.9337,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 3.676,
|
|
"grad_norm": 2.20000958442688,
|
|
"learning_rate": 2.67741935483871e-06,
|
|
"loss": 0.8639,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 3.68,
|
|
"grad_norm": 2.3370859622955322,
|
|
"learning_rate": 2.6693548387096773e-06,
|
|
"loss": 0.9072,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 3.684,
|
|
"grad_norm": 2.3687403202056885,
|
|
"learning_rate": 2.6612903225806454e-06,
|
|
"loss": 0.9368,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 3.6879999999999997,
|
|
"grad_norm": 2.11373233795166,
|
|
"learning_rate": 2.653225806451613e-06,
|
|
"loss": 0.6405,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 3.692,
|
|
"grad_norm": 2.117852210998535,
|
|
"learning_rate": 2.645161290322581e-06,
|
|
"loss": 0.9027,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 3.6959999999999997,
|
|
"grad_norm": 2.4241926670074463,
|
|
"learning_rate": 2.637096774193549e-06,
|
|
"loss": 0.9222,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 3.7,
|
|
"grad_norm": 2.1808900833129883,
|
|
"learning_rate": 2.629032258064516e-06,
|
|
"loss": 0.6647,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 3.7039999999999997,
|
|
"grad_norm": 2.2304883003234863,
|
|
"learning_rate": 2.620967741935484e-06,
|
|
"loss": 0.8719,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 3.708,
|
|
"grad_norm": 2.160715341567993,
|
|
"learning_rate": 2.6129032258064518e-06,
|
|
"loss": 0.838,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 3.7119999999999997,
|
|
"grad_norm": 2.529524803161621,
|
|
"learning_rate": 2.60483870967742e-06,
|
|
"loss": 1.0631,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 3.716,
|
|
"grad_norm": 2.2674691677093506,
|
|
"learning_rate": 2.5967741935483875e-06,
|
|
"loss": 0.704,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 3.7199999999999998,
|
|
"grad_norm": 2.225947380065918,
|
|
"learning_rate": 2.5887096774193547e-06,
|
|
"loss": 0.6462,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 3.724,
|
|
"grad_norm": 2.0101845264434814,
|
|
"learning_rate": 2.580645161290323e-06,
|
|
"loss": 0.6034,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 3.7279999999999998,
|
|
"grad_norm": 2.166468858718872,
|
|
"learning_rate": 2.5725806451612905e-06,
|
|
"loss": 0.7711,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 3.732,
|
|
"grad_norm": 2.0121777057647705,
|
|
"learning_rate": 2.5645161290322585e-06,
|
|
"loss": 0.7086,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 3.7359999999999998,
|
|
"grad_norm": 2.2757959365844727,
|
|
"learning_rate": 2.5564516129032258e-06,
|
|
"loss": 0.8797,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 3.74,
|
|
"grad_norm": 2.2584173679351807,
|
|
"learning_rate": 2.5483870967741934e-06,
|
|
"loss": 0.8662,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 3.7439999999999998,
|
|
"grad_norm": 2.009277105331421,
|
|
"learning_rate": 2.5403225806451615e-06,
|
|
"loss": 0.5832,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 3.748,
|
|
"grad_norm": 2.0112719535827637,
|
|
"learning_rate": 2.532258064516129e-06,
|
|
"loss": 0.7917,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 3.752,
|
|
"grad_norm": 2.2253365516662598,
|
|
"learning_rate": 2.5241935483870972e-06,
|
|
"loss": 0.8194,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 3.7560000000000002,
|
|
"grad_norm": 2.218528985977173,
|
|
"learning_rate": 2.5161290322580645e-06,
|
|
"loss": 0.6909,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 3.76,
|
|
"grad_norm": 2.150059938430786,
|
|
"learning_rate": 2.5080645161290325e-06,
|
|
"loss": 0.7104,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 3.7640000000000002,
|
|
"grad_norm": 2.401381254196167,
|
|
"learning_rate": 2.5e-06,
|
|
"loss": 0.9667,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 3.768,
|
|
"grad_norm": 2.2755489349365234,
|
|
"learning_rate": 2.491935483870968e-06,
|
|
"loss": 0.7664,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 3.7720000000000002,
|
|
"grad_norm": 2.206324577331543,
|
|
"learning_rate": 2.4838709677419355e-06,
|
|
"loss": 1.0714,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 3.776,
|
|
"grad_norm": 2.0583205223083496,
|
|
"learning_rate": 2.4758064516129036e-06,
|
|
"loss": 0.8074,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 3.7800000000000002,
|
|
"grad_norm": 2.0630557537078857,
|
|
"learning_rate": 2.4677419354838712e-06,
|
|
"loss": 0.7139,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 3.784,
|
|
"grad_norm": 1.9153733253479004,
|
|
"learning_rate": 2.459677419354839e-06,
|
|
"loss": 0.6087,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 3.7880000000000003,
|
|
"grad_norm": 2.1446893215179443,
|
|
"learning_rate": 2.4516129032258066e-06,
|
|
"loss": 0.651,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 3.792,
|
|
"grad_norm": 2.325532913208008,
|
|
"learning_rate": 2.4435483870967742e-06,
|
|
"loss": 0.9991,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 3.7960000000000003,
|
|
"grad_norm": 2.2939658164978027,
|
|
"learning_rate": 2.4354838709677423e-06,
|
|
"loss": 0.9107,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 3.8,
|
|
"grad_norm": 1.944509506225586,
|
|
"learning_rate": 2.42741935483871e-06,
|
|
"loss": 0.677,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 3.8040000000000003,
|
|
"grad_norm": 2.1296355724334717,
|
|
"learning_rate": 2.4193548387096776e-06,
|
|
"loss": 0.6326,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 3.808,
|
|
"grad_norm": 2.449873447418213,
|
|
"learning_rate": 2.4112903225806453e-06,
|
|
"loss": 1.0493,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 3.8120000000000003,
|
|
"grad_norm": 2.2936856746673584,
|
|
"learning_rate": 2.4032258064516133e-06,
|
|
"loss": 0.751,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 3.816,
|
|
"grad_norm": 2.239792823791504,
|
|
"learning_rate": 2.3951612903225806e-06,
|
|
"loss": 0.8381,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 3.82,
|
|
"grad_norm": 2.564072608947754,
|
|
"learning_rate": 2.3870967741935486e-06,
|
|
"loss": 0.9443,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 3.824,
|
|
"grad_norm": 2.301388740539551,
|
|
"learning_rate": 2.3790322580645163e-06,
|
|
"loss": 0.9068,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 3.828,
|
|
"grad_norm": 2.4470112323760986,
|
|
"learning_rate": 2.370967741935484e-06,
|
|
"loss": 0.834,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 3.832,
|
|
"grad_norm": 2.2462210655212402,
|
|
"learning_rate": 2.362903225806452e-06,
|
|
"loss": 0.8344,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 3.836,
|
|
"grad_norm": 2.14129900932312,
|
|
"learning_rate": 2.3548387096774193e-06,
|
|
"loss": 0.7379,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 3.84,
|
|
"grad_norm": 2.263746976852417,
|
|
"learning_rate": 2.3467741935483873e-06,
|
|
"loss": 0.8391,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 3.844,
|
|
"grad_norm": 2.0509262084960938,
|
|
"learning_rate": 2.338709677419355e-06,
|
|
"loss": 0.7171,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 3.848,
|
|
"grad_norm": 2.1760873794555664,
|
|
"learning_rate": 2.3306451612903227e-06,
|
|
"loss": 0.8904,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 3.852,
|
|
"grad_norm": 2.5136802196502686,
|
|
"learning_rate": 2.3225806451612907e-06,
|
|
"loss": 0.9318,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 3.856,
|
|
"grad_norm": 2.2124810218811035,
|
|
"learning_rate": 2.3145161290322584e-06,
|
|
"loss": 0.8245,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 3.86,
|
|
"grad_norm": 2.1728949546813965,
|
|
"learning_rate": 2.306451612903226e-06,
|
|
"loss": 0.8638,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 3.864,
|
|
"grad_norm": 2.495039939880371,
|
|
"learning_rate": 2.2983870967741937e-06,
|
|
"loss": 0.9993,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 3.868,
|
|
"grad_norm": 2.2015881538391113,
|
|
"learning_rate": 2.2903225806451614e-06,
|
|
"loss": 0.7732,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 3.872,
|
|
"grad_norm": 2.163741111755371,
|
|
"learning_rate": 2.2822580645161294e-06,
|
|
"loss": 0.759,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 3.876,
|
|
"grad_norm": 2.3122260570526123,
|
|
"learning_rate": 2.274193548387097e-06,
|
|
"loss": 1.0268,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 3.88,
|
|
"grad_norm": 2.0754685401916504,
|
|
"learning_rate": 2.2661290322580647e-06,
|
|
"loss": 0.624,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 3.884,
|
|
"grad_norm": 2.128565549850464,
|
|
"learning_rate": 2.2580645161290324e-06,
|
|
"loss": 0.7533,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 3.888,
|
|
"grad_norm": 2.1690149307250977,
|
|
"learning_rate": 2.25e-06,
|
|
"loss": 0.6677,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 3.892,
|
|
"grad_norm": 2.17118239402771,
|
|
"learning_rate": 2.2419354838709677e-06,
|
|
"loss": 0.7237,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 3.896,
|
|
"grad_norm": 2.3318235874176025,
|
|
"learning_rate": 2.233870967741936e-06,
|
|
"loss": 0.7337,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 3.9,
|
|
"grad_norm": 2.1360106468200684,
|
|
"learning_rate": 2.2258064516129034e-06,
|
|
"loss": 0.8524,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 3.904,
|
|
"grad_norm": 2.201362371444702,
|
|
"learning_rate": 2.217741935483871e-06,
|
|
"loss": 0.8545,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 3.908,
|
|
"grad_norm": 2.2761240005493164,
|
|
"learning_rate": 2.209677419354839e-06,
|
|
"loss": 0.8508,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 3.912,
|
|
"grad_norm": 1.9397066831588745,
|
|
"learning_rate": 2.2016129032258064e-06,
|
|
"loss": 0.5631,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 3.916,
|
|
"grad_norm": 2.3536376953125,
|
|
"learning_rate": 2.1935483870967745e-06,
|
|
"loss": 0.9373,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 3.92,
|
|
"grad_norm": 2.205050230026245,
|
|
"learning_rate": 2.185483870967742e-06,
|
|
"loss": 0.7274,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 3.924,
|
|
"grad_norm": 2.0966451168060303,
|
|
"learning_rate": 2.17741935483871e-06,
|
|
"loss": 0.5711,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 3.928,
|
|
"grad_norm": 2.1973869800567627,
|
|
"learning_rate": 2.169354838709678e-06,
|
|
"loss": 0.8734,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 3.932,
|
|
"grad_norm": 2.3555471897125244,
|
|
"learning_rate": 2.161290322580645e-06,
|
|
"loss": 0.9951,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 3.936,
|
|
"grad_norm": 2.1399526596069336,
|
|
"learning_rate": 2.153225806451613e-06,
|
|
"loss": 0.8126,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 3.94,
|
|
"grad_norm": 2.199490785598755,
|
|
"learning_rate": 2.145161290322581e-06,
|
|
"loss": 0.9233,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 3.944,
|
|
"grad_norm": 2.1005666255950928,
|
|
"learning_rate": 2.1370967741935485e-06,
|
|
"loss": 0.6775,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 3.948,
|
|
"grad_norm": 2.1148951053619385,
|
|
"learning_rate": 2.129032258064516e-06,
|
|
"loss": 0.621,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 3.952,
|
|
"grad_norm": 2.326742649078369,
|
|
"learning_rate": 2.1209677419354842e-06,
|
|
"loss": 0.7486,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 3.956,
|
|
"grad_norm": 2.4045190811157227,
|
|
"learning_rate": 2.112903225806452e-06,
|
|
"loss": 0.8569,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 3.96,
|
|
"grad_norm": 2.2377429008483887,
|
|
"learning_rate": 2.1048387096774195e-06,
|
|
"loss": 0.8598,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 3.964,
|
|
"grad_norm": 2.4663689136505127,
|
|
"learning_rate": 2.096774193548387e-06,
|
|
"loss": 0.8422,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 3.968,
|
|
"grad_norm": 2.2024405002593994,
|
|
"learning_rate": 2.088709677419355e-06,
|
|
"loss": 0.7979,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 3.972,
|
|
"grad_norm": 2.1740357875823975,
|
|
"learning_rate": 2.080645161290323e-06,
|
|
"loss": 0.6501,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 3.976,
|
|
"grad_norm": 2.392676830291748,
|
|
"learning_rate": 2.07258064516129e-06,
|
|
"loss": 0.9717,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 3.98,
|
|
"grad_norm": 2.417933225631714,
|
|
"learning_rate": 2.0645161290322582e-06,
|
|
"loss": 0.9765,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 3.984,
|
|
"grad_norm": 2.3684704303741455,
|
|
"learning_rate": 2.056451612903226e-06,
|
|
"loss": 0.7818,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 3.988,
|
|
"grad_norm": 2.265115737915039,
|
|
"learning_rate": 2.0483870967741936e-06,
|
|
"loss": 0.9738,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 3.992,
|
|
"grad_norm": 2.0875535011291504,
|
|
"learning_rate": 2.0403225806451616e-06,
|
|
"loss": 0.6718,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 3.996,
|
|
"grad_norm": 2.160306453704834,
|
|
"learning_rate": 2.0322580645161293e-06,
|
|
"loss": 0.647,
|
|
"step": 999
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 2.2271740436553955,
|
|
"learning_rate": 2.024193548387097e-06,
|
|
"loss": 0.926,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 4.004,
|
|
"grad_norm": 2.406463623046875,
|
|
"learning_rate": 2.0161290322580646e-06,
|
|
"loss": 1.0294,
|
|
"step": 1001
|
|
},
|
|
{
|
|
"epoch": 4.008,
|
|
"grad_norm": 2.06563401222229,
|
|
"learning_rate": 2.0080645161290323e-06,
|
|
"loss": 0.7166,
|
|
"step": 1002
|
|
},
|
|
{
|
|
"epoch": 4.012,
|
|
"grad_norm": 2.054182529449463,
|
|
"learning_rate": 2.0000000000000003e-06,
|
|
"loss": 0.7667,
|
|
"step": 1003
|
|
},
|
|
{
|
|
"epoch": 4.016,
|
|
"grad_norm": 2.4627397060394287,
|
|
"learning_rate": 1.991935483870968e-06,
|
|
"loss": 0.6641,
|
|
"step": 1004
|
|
},
|
|
{
|
|
"epoch": 4.02,
|
|
"grad_norm": 2.3262436389923096,
|
|
"learning_rate": 1.9838709677419356e-06,
|
|
"loss": 0.9825,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 4.024,
|
|
"grad_norm": 2.2590932846069336,
|
|
"learning_rate": 1.9758064516129033e-06,
|
|
"loss": 0.7796,
|
|
"step": 1006
|
|
},
|
|
{
|
|
"epoch": 4.028,
|
|
"grad_norm": 1.988510012626648,
|
|
"learning_rate": 1.967741935483871e-06,
|
|
"loss": 0.7545,
|
|
"step": 1007
|
|
},
|
|
{
|
|
"epoch": 4.032,
|
|
"grad_norm": 2.131132125854492,
|
|
"learning_rate": 1.9596774193548386e-06,
|
|
"loss": 0.7807,
|
|
"step": 1008
|
|
},
|
|
{
|
|
"epoch": 4.036,
|
|
"grad_norm": 2.2927346229553223,
|
|
"learning_rate": 1.9516129032258067e-06,
|
|
"loss": 0.9807,
|
|
"step": 1009
|
|
},
|
|
{
|
|
"epoch": 4.04,
|
|
"grad_norm": 2.2677903175354004,
|
|
"learning_rate": 1.9435483870967743e-06,
|
|
"loss": 0.6969,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 4.044,
|
|
"grad_norm": 2.0686841011047363,
|
|
"learning_rate": 1.935483870967742e-06,
|
|
"loss": 0.5789,
|
|
"step": 1011
|
|
},
|
|
{
|
|
"epoch": 4.048,
|
|
"grad_norm": 2.2940640449523926,
|
|
"learning_rate": 1.92741935483871e-06,
|
|
"loss": 0.6927,
|
|
"step": 1012
|
|
},
|
|
{
|
|
"epoch": 4.052,
|
|
"grad_norm": 2.3742549419403076,
|
|
"learning_rate": 1.9193548387096773e-06,
|
|
"loss": 0.7842,
|
|
"step": 1013
|
|
},
|
|
{
|
|
"epoch": 4.056,
|
|
"grad_norm": 2.1507906913757324,
|
|
"learning_rate": 1.9112903225806454e-06,
|
|
"loss": 0.777,
|
|
"step": 1014
|
|
},
|
|
{
|
|
"epoch": 4.06,
|
|
"grad_norm": 2.125037670135498,
|
|
"learning_rate": 1.9032258064516128e-06,
|
|
"loss": 0.715,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 4.064,
|
|
"grad_norm": 2.556920289993286,
|
|
"learning_rate": 1.8951612903225807e-06,
|
|
"loss": 0.9831,
|
|
"step": 1016
|
|
},
|
|
{
|
|
"epoch": 4.068,
|
|
"grad_norm": 2.2890443801879883,
|
|
"learning_rate": 1.8870967741935486e-06,
|
|
"loss": 0.8791,
|
|
"step": 1017
|
|
},
|
|
{
|
|
"epoch": 4.072,
|
|
"grad_norm": 2.3837790489196777,
|
|
"learning_rate": 1.8790322580645162e-06,
|
|
"loss": 0.7557,
|
|
"step": 1018
|
|
},
|
|
{
|
|
"epoch": 4.076,
|
|
"grad_norm": 2.2448434829711914,
|
|
"learning_rate": 1.870967741935484e-06,
|
|
"loss": 0.6021,
|
|
"step": 1019
|
|
},
|
|
{
|
|
"epoch": 4.08,
|
|
"grad_norm": 2.1938180923461914,
|
|
"learning_rate": 1.8629032258064517e-06,
|
|
"loss": 0.5916,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 4.084,
|
|
"grad_norm": 2.184678792953491,
|
|
"learning_rate": 1.8548387096774196e-06,
|
|
"loss": 0.6774,
|
|
"step": 1021
|
|
},
|
|
{
|
|
"epoch": 4.088,
|
|
"grad_norm": 2.091639518737793,
|
|
"learning_rate": 1.8467741935483875e-06,
|
|
"loss": 0.5811,
|
|
"step": 1022
|
|
},
|
|
{
|
|
"epoch": 4.092,
|
|
"grad_norm": 2.2382893562316895,
|
|
"learning_rate": 1.838709677419355e-06,
|
|
"loss": 0.7811,
|
|
"step": 1023
|
|
},
|
|
{
|
|
"epoch": 4.096,
|
|
"grad_norm": 2.4816744327545166,
|
|
"learning_rate": 1.8306451612903228e-06,
|
|
"loss": 0.7529,
|
|
"step": 1024
|
|
},
|
|
{
|
|
"epoch": 4.1,
|
|
"grad_norm": 2.398247718811035,
|
|
"learning_rate": 1.8225806451612904e-06,
|
|
"loss": 0.7077,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 4.104,
|
|
"grad_norm": 2.3608181476593018,
|
|
"learning_rate": 1.8145161290322583e-06,
|
|
"loss": 0.6565,
|
|
"step": 1026
|
|
},
|
|
{
|
|
"epoch": 4.108,
|
|
"grad_norm": 2.1144907474517822,
|
|
"learning_rate": 1.8064516129032258e-06,
|
|
"loss": 0.5724,
|
|
"step": 1027
|
|
},
|
|
{
|
|
"epoch": 4.112,
|
|
"grad_norm": 2.3261606693267822,
|
|
"learning_rate": 1.7983870967741936e-06,
|
|
"loss": 0.6656,
|
|
"step": 1028
|
|
},
|
|
{
|
|
"epoch": 4.116,
|
|
"grad_norm": 2.0781667232513428,
|
|
"learning_rate": 1.7903225806451615e-06,
|
|
"loss": 0.7156,
|
|
"step": 1029
|
|
},
|
|
{
|
|
"epoch": 4.12,
|
|
"grad_norm": 2.208777666091919,
|
|
"learning_rate": 1.7822580645161291e-06,
|
|
"loss": 0.7395,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 4.124,
|
|
"grad_norm": 2.323026180267334,
|
|
"learning_rate": 1.774193548387097e-06,
|
|
"loss": 0.6843,
|
|
"step": 1031
|
|
},
|
|
{
|
|
"epoch": 4.128,
|
|
"grad_norm": 2.2553014755249023,
|
|
"learning_rate": 1.7661290322580647e-06,
|
|
"loss": 0.7674,
|
|
"step": 1032
|
|
},
|
|
{
|
|
"epoch": 4.132,
|
|
"grad_norm": 2.1738624572753906,
|
|
"learning_rate": 1.7580645161290325e-06,
|
|
"loss": 0.7363,
|
|
"step": 1033
|
|
},
|
|
{
|
|
"epoch": 4.136,
|
|
"grad_norm": 2.306398630142212,
|
|
"learning_rate": 1.75e-06,
|
|
"loss": 0.8493,
|
|
"step": 1034
|
|
},
|
|
{
|
|
"epoch": 4.14,
|
|
"grad_norm": 2.269899845123291,
|
|
"learning_rate": 1.7419354838709678e-06,
|
|
"loss": 0.7735,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 4.144,
|
|
"grad_norm": 2.2679085731506348,
|
|
"learning_rate": 1.7338709677419357e-06,
|
|
"loss": 0.6658,
|
|
"step": 1036
|
|
},
|
|
{
|
|
"epoch": 4.148,
|
|
"grad_norm": 2.300226926803589,
|
|
"learning_rate": 1.7258064516129034e-06,
|
|
"loss": 0.7193,
|
|
"step": 1037
|
|
},
|
|
{
|
|
"epoch": 4.152,
|
|
"grad_norm": 2.3566718101501465,
|
|
"learning_rate": 1.7177419354838712e-06,
|
|
"loss": 0.786,
|
|
"step": 1038
|
|
},
|
|
{
|
|
"epoch": 4.156,
|
|
"grad_norm": 2.0547354221343994,
|
|
"learning_rate": 1.7096774193548387e-06,
|
|
"loss": 0.5577,
|
|
"step": 1039
|
|
},
|
|
{
|
|
"epoch": 4.16,
|
|
"grad_norm": 2.0804097652435303,
|
|
"learning_rate": 1.7016129032258065e-06,
|
|
"loss": 0.5129,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 4.164,
|
|
"grad_norm": 2.2978291511535645,
|
|
"learning_rate": 1.6935483870967742e-06,
|
|
"loss": 0.7825,
|
|
"step": 1041
|
|
},
|
|
{
|
|
"epoch": 4.168,
|
|
"grad_norm": 2.329310894012451,
|
|
"learning_rate": 1.685483870967742e-06,
|
|
"loss": 0.8602,
|
|
"step": 1042
|
|
},
|
|
{
|
|
"epoch": 4.172,
|
|
"grad_norm": 2.0429697036743164,
|
|
"learning_rate": 1.67741935483871e-06,
|
|
"loss": 0.6153,
|
|
"step": 1043
|
|
},
|
|
{
|
|
"epoch": 4.176,
|
|
"grad_norm": 2.251570463180542,
|
|
"learning_rate": 1.6693548387096776e-06,
|
|
"loss": 0.7544,
|
|
"step": 1044
|
|
},
|
|
{
|
|
"epoch": 4.18,
|
|
"grad_norm": 2.3504586219787598,
|
|
"learning_rate": 1.6612903225806455e-06,
|
|
"loss": 0.7068,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 4.184,
|
|
"grad_norm": 2.4925265312194824,
|
|
"learning_rate": 1.653225806451613e-06,
|
|
"loss": 0.8326,
|
|
"step": 1046
|
|
},
|
|
{
|
|
"epoch": 4.188,
|
|
"grad_norm": 2.126401424407959,
|
|
"learning_rate": 1.6451612903225808e-06,
|
|
"loss": 0.6162,
|
|
"step": 1047
|
|
},
|
|
{
|
|
"epoch": 4.192,
|
|
"grad_norm": 2.3744008541107178,
|
|
"learning_rate": 1.6370967741935484e-06,
|
|
"loss": 0.841,
|
|
"step": 1048
|
|
},
|
|
{
|
|
"epoch": 4.196,
|
|
"grad_norm": 2.235731840133667,
|
|
"learning_rate": 1.6290322580645163e-06,
|
|
"loss": 0.7612,
|
|
"step": 1049
|
|
},
|
|
{
|
|
"epoch": 4.2,
|
|
"grad_norm": 2.3301475048065186,
|
|
"learning_rate": 1.6209677419354842e-06,
|
|
"loss": 0.7748,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 4.204,
|
|
"grad_norm": 2.1939594745635986,
|
|
"learning_rate": 1.6129032258064516e-06,
|
|
"loss": 0.6976,
|
|
"step": 1051
|
|
},
|
|
{
|
|
"epoch": 4.208,
|
|
"grad_norm": 2.33828067779541,
|
|
"learning_rate": 1.6048387096774195e-06,
|
|
"loss": 0.8672,
|
|
"step": 1052
|
|
},
|
|
{
|
|
"epoch": 4.212,
|
|
"grad_norm": 2.3657424449920654,
|
|
"learning_rate": 1.5967741935483871e-06,
|
|
"loss": 0.7897,
|
|
"step": 1053
|
|
},
|
|
{
|
|
"epoch": 4.216,
|
|
"grad_norm": 2.33683705329895,
|
|
"learning_rate": 1.588709677419355e-06,
|
|
"loss": 0.8152,
|
|
"step": 1054
|
|
},
|
|
{
|
|
"epoch": 4.22,
|
|
"grad_norm": 2.3173093795776367,
|
|
"learning_rate": 1.5806451612903226e-06,
|
|
"loss": 0.7319,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 4.224,
|
|
"grad_norm": 2.067448139190674,
|
|
"learning_rate": 1.5725806451612905e-06,
|
|
"loss": 0.6357,
|
|
"step": 1056
|
|
},
|
|
{
|
|
"epoch": 4.228,
|
|
"grad_norm": 2.410370349884033,
|
|
"learning_rate": 1.5645161290322584e-06,
|
|
"loss": 0.6698,
|
|
"step": 1057
|
|
},
|
|
{
|
|
"epoch": 4.232,
|
|
"grad_norm": 2.0703933238983154,
|
|
"learning_rate": 1.5564516129032258e-06,
|
|
"loss": 0.5286,
|
|
"step": 1058
|
|
},
|
|
{
|
|
"epoch": 4.236,
|
|
"grad_norm": 2.5206849575042725,
|
|
"learning_rate": 1.5483870967741937e-06,
|
|
"loss": 0.9473,
|
|
"step": 1059
|
|
},
|
|
{
|
|
"epoch": 4.24,
|
|
"grad_norm": 2.193002939224243,
|
|
"learning_rate": 1.5403225806451613e-06,
|
|
"loss": 0.7604,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 4.244,
|
|
"grad_norm": 2.514152765274048,
|
|
"learning_rate": 1.5322580645161292e-06,
|
|
"loss": 0.8542,
|
|
"step": 1061
|
|
},
|
|
{
|
|
"epoch": 4.248,
|
|
"grad_norm": 2.1836607456207275,
|
|
"learning_rate": 1.5241935483870969e-06,
|
|
"loss": 0.6228,
|
|
"step": 1062
|
|
},
|
|
{
|
|
"epoch": 4.252,
|
|
"grad_norm": 2.289808511734009,
|
|
"learning_rate": 1.5161290322580647e-06,
|
|
"loss": 0.8151,
|
|
"step": 1063
|
|
},
|
|
{
|
|
"epoch": 4.256,
|
|
"grad_norm": 2.335754632949829,
|
|
"learning_rate": 1.5080645161290324e-06,
|
|
"loss": 0.7623,
|
|
"step": 1064
|
|
},
|
|
{
|
|
"epoch": 4.26,
|
|
"grad_norm": 2.4347617626190186,
|
|
"learning_rate": 1.5e-06,
|
|
"loss": 0.834,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 4.264,
|
|
"grad_norm": 2.4381332397460938,
|
|
"learning_rate": 1.491935483870968e-06,
|
|
"loss": 0.7373,
|
|
"step": 1066
|
|
},
|
|
{
|
|
"epoch": 4.268,
|
|
"grad_norm": 2.4642021656036377,
|
|
"learning_rate": 1.4838709677419356e-06,
|
|
"loss": 0.904,
|
|
"step": 1067
|
|
},
|
|
{
|
|
"epoch": 4.272,
|
|
"grad_norm": 2.3428714275360107,
|
|
"learning_rate": 1.4758064516129034e-06,
|
|
"loss": 0.67,
|
|
"step": 1068
|
|
},
|
|
{
|
|
"epoch": 4.276,
|
|
"grad_norm": 2.1284022331237793,
|
|
"learning_rate": 1.4677419354838709e-06,
|
|
"loss": 0.71,
|
|
"step": 1069
|
|
},
|
|
{
|
|
"epoch": 4.28,
|
|
"grad_norm": 2.2314751148223877,
|
|
"learning_rate": 1.4596774193548387e-06,
|
|
"loss": 0.5407,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 4.284,
|
|
"grad_norm": 2.235234022140503,
|
|
"learning_rate": 1.4516129032258066e-06,
|
|
"loss": 0.6042,
|
|
"step": 1071
|
|
},
|
|
{
|
|
"epoch": 4.288,
|
|
"grad_norm": 2.1681883335113525,
|
|
"learning_rate": 1.4435483870967743e-06,
|
|
"loss": 0.5758,
|
|
"step": 1072
|
|
},
|
|
{
|
|
"epoch": 4.292,
|
|
"grad_norm": 2.2013158798217773,
|
|
"learning_rate": 1.4354838709677421e-06,
|
|
"loss": 0.6864,
|
|
"step": 1073
|
|
},
|
|
{
|
|
"epoch": 4.296,
|
|
"grad_norm": 2.1779909133911133,
|
|
"learning_rate": 1.4274193548387098e-06,
|
|
"loss": 0.8579,
|
|
"step": 1074
|
|
},
|
|
{
|
|
"epoch": 4.3,
|
|
"grad_norm": 2.1225690841674805,
|
|
"learning_rate": 1.4193548387096776e-06,
|
|
"loss": 0.5255,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 4.304,
|
|
"grad_norm": 2.294562339782715,
|
|
"learning_rate": 1.4112903225806455e-06,
|
|
"loss": 0.6786,
|
|
"step": 1076
|
|
},
|
|
{
|
|
"epoch": 4.308,
|
|
"grad_norm": 2.2220890522003174,
|
|
"learning_rate": 1.403225806451613e-06,
|
|
"loss": 0.6386,
|
|
"step": 1077
|
|
},
|
|
{
|
|
"epoch": 4.312,
|
|
"grad_norm": 2.431384801864624,
|
|
"learning_rate": 1.3951612903225808e-06,
|
|
"loss": 0.6489,
|
|
"step": 1078
|
|
},
|
|
{
|
|
"epoch": 4.316,
|
|
"grad_norm": 2.2231733798980713,
|
|
"learning_rate": 1.3870967741935485e-06,
|
|
"loss": 0.6481,
|
|
"step": 1079
|
|
},
|
|
{
|
|
"epoch": 4.32,
|
|
"grad_norm": 2.2959814071655273,
|
|
"learning_rate": 1.3790322580645163e-06,
|
|
"loss": 0.8887,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 4.324,
|
|
"grad_norm": 2.6281895637512207,
|
|
"learning_rate": 1.3709677419354838e-06,
|
|
"loss": 0.8555,
|
|
"step": 1081
|
|
},
|
|
{
|
|
"epoch": 4.328,
|
|
"grad_norm": 2.2837817668914795,
|
|
"learning_rate": 1.3629032258064517e-06,
|
|
"loss": 0.7436,
|
|
"step": 1082
|
|
},
|
|
{
|
|
"epoch": 4.332,
|
|
"grad_norm": 2.278343439102173,
|
|
"learning_rate": 1.3548387096774195e-06,
|
|
"loss": 0.8137,
|
|
"step": 1083
|
|
},
|
|
{
|
|
"epoch": 4.336,
|
|
"grad_norm": 2.593653678894043,
|
|
"learning_rate": 1.3467741935483872e-06,
|
|
"loss": 0.9537,
|
|
"step": 1084
|
|
},
|
|
{
|
|
"epoch": 4.34,
|
|
"grad_norm": 2.2890312671661377,
|
|
"learning_rate": 1.338709677419355e-06,
|
|
"loss": 0.7581,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 4.344,
|
|
"grad_norm": 2.2410354614257812,
|
|
"learning_rate": 1.3306451612903227e-06,
|
|
"loss": 0.6705,
|
|
"step": 1086
|
|
},
|
|
{
|
|
"epoch": 4.348,
|
|
"grad_norm": 2.34249210357666,
|
|
"learning_rate": 1.3225806451612906e-06,
|
|
"loss": 0.6918,
|
|
"step": 1087
|
|
},
|
|
{
|
|
"epoch": 4.352,
|
|
"grad_norm": 2.224848508834839,
|
|
"learning_rate": 1.314516129032258e-06,
|
|
"loss": 0.661,
|
|
"step": 1088
|
|
},
|
|
{
|
|
"epoch": 4.356,
|
|
"grad_norm": 2.231247901916504,
|
|
"learning_rate": 1.3064516129032259e-06,
|
|
"loss": 0.6758,
|
|
"step": 1089
|
|
},
|
|
{
|
|
"epoch": 4.36,
|
|
"grad_norm": 2.2124154567718506,
|
|
"learning_rate": 1.2983870967741937e-06,
|
|
"loss": 0.691,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 4.364,
|
|
"grad_norm": 2.3312814235687256,
|
|
"learning_rate": 1.2903225806451614e-06,
|
|
"loss": 0.8702,
|
|
"step": 1091
|
|
},
|
|
{
|
|
"epoch": 4.368,
|
|
"grad_norm": 2.3130455017089844,
|
|
"learning_rate": 1.2822580645161293e-06,
|
|
"loss": 0.7612,
|
|
"step": 1092
|
|
},
|
|
{
|
|
"epoch": 4.372,
|
|
"grad_norm": 2.3148789405822754,
|
|
"learning_rate": 1.2741935483870967e-06,
|
|
"loss": 0.8986,
|
|
"step": 1093
|
|
},
|
|
{
|
|
"epoch": 4.376,
|
|
"grad_norm": 2.438842296600342,
|
|
"learning_rate": 1.2661290322580646e-06,
|
|
"loss": 1.0276,
|
|
"step": 1094
|
|
},
|
|
{
|
|
"epoch": 4.38,
|
|
"grad_norm": 2.1867611408233643,
|
|
"learning_rate": 1.2580645161290322e-06,
|
|
"loss": 0.6475,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 4.384,
|
|
"grad_norm": 2.132303476333618,
|
|
"learning_rate": 1.25e-06,
|
|
"loss": 0.7559,
|
|
"step": 1096
|
|
},
|
|
{
|
|
"epoch": 4.388,
|
|
"grad_norm": 2.495635509490967,
|
|
"learning_rate": 1.2419354838709678e-06,
|
|
"loss": 0.983,
|
|
"step": 1097
|
|
},
|
|
{
|
|
"epoch": 4.392,
|
|
"grad_norm": 2.5002808570861816,
|
|
"learning_rate": 1.2338709677419356e-06,
|
|
"loss": 0.933,
|
|
"step": 1098
|
|
},
|
|
{
|
|
"epoch": 4.396,
|
|
"grad_norm": 2.1121408939361572,
|
|
"learning_rate": 1.2258064516129033e-06,
|
|
"loss": 0.6312,
|
|
"step": 1099
|
|
},
|
|
{
|
|
"epoch": 4.4,
|
|
"grad_norm": 2.2128350734710693,
|
|
"learning_rate": 1.2177419354838711e-06,
|
|
"loss": 0.657,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 4.404,
|
|
"grad_norm": 2.482390880584717,
|
|
"learning_rate": 1.2096774193548388e-06,
|
|
"loss": 0.8101,
|
|
"step": 1101
|
|
},
|
|
{
|
|
"epoch": 4.408,
|
|
"grad_norm": 2.305540084838867,
|
|
"learning_rate": 1.2016129032258067e-06,
|
|
"loss": 0.7443,
|
|
"step": 1102
|
|
},
|
|
{
|
|
"epoch": 4.412,
|
|
"grad_norm": 2.619077682495117,
|
|
"learning_rate": 1.1935483870967743e-06,
|
|
"loss": 1.055,
|
|
"step": 1103
|
|
},
|
|
{
|
|
"epoch": 4.416,
|
|
"grad_norm": 2.0542643070220947,
|
|
"learning_rate": 1.185483870967742e-06,
|
|
"loss": 0.5384,
|
|
"step": 1104
|
|
},
|
|
{
|
|
"epoch": 4.42,
|
|
"grad_norm": 2.1693155765533447,
|
|
"learning_rate": 1.1774193548387096e-06,
|
|
"loss": 0.6142,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 4.424,
|
|
"grad_norm": 2.2738637924194336,
|
|
"learning_rate": 1.1693548387096775e-06,
|
|
"loss": 0.7541,
|
|
"step": 1106
|
|
},
|
|
{
|
|
"epoch": 4.428,
|
|
"grad_norm": 2.372004270553589,
|
|
"learning_rate": 1.1612903225806454e-06,
|
|
"loss": 0.815,
|
|
"step": 1107
|
|
},
|
|
{
|
|
"epoch": 4.432,
|
|
"grad_norm": 2.3350911140441895,
|
|
"learning_rate": 1.153225806451613e-06,
|
|
"loss": 0.9225,
|
|
"step": 1108
|
|
},
|
|
{
|
|
"epoch": 4.436,
|
|
"grad_norm": 2.01047682762146,
|
|
"learning_rate": 1.1451612903225807e-06,
|
|
"loss": 0.4809,
|
|
"step": 1109
|
|
},
|
|
{
|
|
"epoch": 4.44,
|
|
"grad_norm": 2.1741702556610107,
|
|
"learning_rate": 1.1370967741935485e-06,
|
|
"loss": 0.6214,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 4.444,
|
|
"grad_norm": 2.320693254470825,
|
|
"learning_rate": 1.1290322580645162e-06,
|
|
"loss": 0.8867,
|
|
"step": 1111
|
|
},
|
|
{
|
|
"epoch": 4.448,
|
|
"grad_norm": 2.3759713172912598,
|
|
"learning_rate": 1.1209677419354839e-06,
|
|
"loss": 0.7674,
|
|
"step": 1112
|
|
},
|
|
{
|
|
"epoch": 4.452,
|
|
"grad_norm": 2.2036936283111572,
|
|
"learning_rate": 1.1129032258064517e-06,
|
|
"loss": 0.6323,
|
|
"step": 1113
|
|
},
|
|
{
|
|
"epoch": 4.456,
|
|
"grad_norm": 2.4528908729553223,
|
|
"learning_rate": 1.1048387096774196e-06,
|
|
"loss": 0.8555,
|
|
"step": 1114
|
|
},
|
|
{
|
|
"epoch": 4.46,
|
|
"grad_norm": 2.1635193824768066,
|
|
"learning_rate": 1.0967741935483872e-06,
|
|
"loss": 0.7599,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 4.464,
|
|
"grad_norm": 2.2776708602905273,
|
|
"learning_rate": 1.088709677419355e-06,
|
|
"loss": 0.7247,
|
|
"step": 1116
|
|
},
|
|
{
|
|
"epoch": 4.468,
|
|
"grad_norm": 2.4202232360839844,
|
|
"learning_rate": 1.0806451612903226e-06,
|
|
"loss": 0.7726,
|
|
"step": 1117
|
|
},
|
|
{
|
|
"epoch": 4.4719999999999995,
|
|
"grad_norm": 2.226058006286621,
|
|
"learning_rate": 1.0725806451612904e-06,
|
|
"loss": 0.7561,
|
|
"step": 1118
|
|
},
|
|
{
|
|
"epoch": 4.476,
|
|
"grad_norm": 2.371166706085205,
|
|
"learning_rate": 1.064516129032258e-06,
|
|
"loss": 0.8106,
|
|
"step": 1119
|
|
},
|
|
{
|
|
"epoch": 4.48,
|
|
"grad_norm": 2.545086145401001,
|
|
"learning_rate": 1.056451612903226e-06,
|
|
"loss": 0.6927,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 4.484,
|
|
"grad_norm": 2.384808301925659,
|
|
"learning_rate": 1.0483870967741936e-06,
|
|
"loss": 0.8472,
|
|
"step": 1121
|
|
},
|
|
{
|
|
"epoch": 4.4879999999999995,
|
|
"grad_norm": 2.267472267150879,
|
|
"learning_rate": 1.0403225806451615e-06,
|
|
"loss": 0.6731,
|
|
"step": 1122
|
|
},
|
|
{
|
|
"epoch": 4.492,
|
|
"grad_norm": 2.5474860668182373,
|
|
"learning_rate": 1.0322580645161291e-06,
|
|
"loss": 1.0592,
|
|
"step": 1123
|
|
},
|
|
{
|
|
"epoch": 4.496,
|
|
"grad_norm": 2.423821210861206,
|
|
"learning_rate": 1.0241935483870968e-06,
|
|
"loss": 0.7909,
|
|
"step": 1124
|
|
},
|
|
{
|
|
"epoch": 4.5,
|
|
"grad_norm": 2.09659743309021,
|
|
"learning_rate": 1.0161290322580646e-06,
|
|
"loss": 0.6294,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 4.504,
|
|
"grad_norm": 2.1957030296325684,
|
|
"learning_rate": 1.0080645161290323e-06,
|
|
"loss": 0.5339,
|
|
"step": 1126
|
|
},
|
|
{
|
|
"epoch": 4.508,
|
|
"grad_norm": 2.3419601917266846,
|
|
"learning_rate": 1.0000000000000002e-06,
|
|
"loss": 0.815,
|
|
"step": 1127
|
|
},
|
|
{
|
|
"epoch": 4.5120000000000005,
|
|
"grad_norm": 2.3392527103424072,
|
|
"learning_rate": 9.919354838709678e-07,
|
|
"loss": 0.7399,
|
|
"step": 1128
|
|
},
|
|
{
|
|
"epoch": 4.516,
|
|
"grad_norm": 2.591635227203369,
|
|
"learning_rate": 9.838709677419355e-07,
|
|
"loss": 0.8111,
|
|
"step": 1129
|
|
},
|
|
{
|
|
"epoch": 4.52,
|
|
"grad_norm": 2.3863515853881836,
|
|
"learning_rate": 9.758064516129033e-07,
|
|
"loss": 0.7086,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 4.524,
|
|
"grad_norm": 2.4174227714538574,
|
|
"learning_rate": 9.67741935483871e-07,
|
|
"loss": 0.7625,
|
|
"step": 1131
|
|
},
|
|
{
|
|
"epoch": 4.5280000000000005,
|
|
"grad_norm": 2.3676483631134033,
|
|
"learning_rate": 9.596774193548387e-07,
|
|
"loss": 0.7432,
|
|
"step": 1132
|
|
},
|
|
{
|
|
"epoch": 4.532,
|
|
"grad_norm": 2.2280187606811523,
|
|
"learning_rate": 9.516129032258064e-07,
|
|
"loss": 0.6567,
|
|
"step": 1133
|
|
},
|
|
{
|
|
"epoch": 4.536,
|
|
"grad_norm": 2.323503255844116,
|
|
"learning_rate": 9.435483870967743e-07,
|
|
"loss": 0.7162,
|
|
"step": 1134
|
|
},
|
|
{
|
|
"epoch": 4.54,
|
|
"grad_norm": 2.330273151397705,
|
|
"learning_rate": 9.35483870967742e-07,
|
|
"loss": 0.7492,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 4.5440000000000005,
|
|
"grad_norm": 2.3072502613067627,
|
|
"learning_rate": 9.274193548387098e-07,
|
|
"loss": 0.7717,
|
|
"step": 1136
|
|
},
|
|
{
|
|
"epoch": 4.548,
|
|
"grad_norm": 2.1893625259399414,
|
|
"learning_rate": 9.193548387096775e-07,
|
|
"loss": 0.7577,
|
|
"step": 1137
|
|
},
|
|
{
|
|
"epoch": 4.552,
|
|
"grad_norm": 2.299312114715576,
|
|
"learning_rate": 9.112903225806452e-07,
|
|
"loss": 0.6489,
|
|
"step": 1138
|
|
},
|
|
{
|
|
"epoch": 4.556,
|
|
"grad_norm": 2.2672600746154785,
|
|
"learning_rate": 9.032258064516129e-07,
|
|
"loss": 0.7582,
|
|
"step": 1139
|
|
},
|
|
{
|
|
"epoch": 4.5600000000000005,
|
|
"grad_norm": 2.35774302482605,
|
|
"learning_rate": 8.951612903225807e-07,
|
|
"loss": 0.6944,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 4.564,
|
|
"grad_norm": 2.4566967487335205,
|
|
"learning_rate": 8.870967741935485e-07,
|
|
"loss": 0.7874,
|
|
"step": 1141
|
|
},
|
|
{
|
|
"epoch": 4.568,
|
|
"grad_norm": 2.346771478652954,
|
|
"learning_rate": 8.790322580645163e-07,
|
|
"loss": 0.7592,
|
|
"step": 1142
|
|
},
|
|
{
|
|
"epoch": 4.572,
|
|
"grad_norm": 2.4383773803710938,
|
|
"learning_rate": 8.709677419354839e-07,
|
|
"loss": 0.7337,
|
|
"step": 1143
|
|
},
|
|
{
|
|
"epoch": 4.576,
|
|
"grad_norm": 2.19571852684021,
|
|
"learning_rate": 8.629032258064517e-07,
|
|
"loss": 0.6448,
|
|
"step": 1144
|
|
},
|
|
{
|
|
"epoch": 4.58,
|
|
"grad_norm": 2.2884631156921387,
|
|
"learning_rate": 8.548387096774193e-07,
|
|
"loss": 0.5722,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 4.584,
|
|
"grad_norm": 2.3079285621643066,
|
|
"learning_rate": 8.467741935483871e-07,
|
|
"loss": 0.7114,
|
|
"step": 1146
|
|
},
|
|
{
|
|
"epoch": 4.588,
|
|
"grad_norm": 2.3879995346069336,
|
|
"learning_rate": 8.38709677419355e-07,
|
|
"loss": 0.6822,
|
|
"step": 1147
|
|
},
|
|
{
|
|
"epoch": 4.592,
|
|
"grad_norm": 2.3414344787597656,
|
|
"learning_rate": 8.306451612903227e-07,
|
|
"loss": 0.8221,
|
|
"step": 1148
|
|
},
|
|
{
|
|
"epoch": 4.596,
|
|
"grad_norm": 2.401705026626587,
|
|
"learning_rate": 8.225806451612904e-07,
|
|
"loss": 0.8003,
|
|
"step": 1149
|
|
},
|
|
{
|
|
"epoch": 4.6,
|
|
"grad_norm": 2.3082618713378906,
|
|
"learning_rate": 8.145161290322581e-07,
|
|
"loss": 0.7632,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 4.604,
|
|
"grad_norm": 2.3123667240142822,
|
|
"learning_rate": 8.064516129032258e-07,
|
|
"loss": 0.6528,
|
|
"step": 1151
|
|
},
|
|
{
|
|
"epoch": 4.608,
|
|
"grad_norm": 2.3781580924987793,
|
|
"learning_rate": 7.983870967741936e-07,
|
|
"loss": 0.788,
|
|
"step": 1152
|
|
},
|
|
{
|
|
"epoch": 4.612,
|
|
"grad_norm": 2.3994123935699463,
|
|
"learning_rate": 7.903225806451613e-07,
|
|
"loss": 0.8394,
|
|
"step": 1153
|
|
},
|
|
{
|
|
"epoch": 4.616,
|
|
"grad_norm": 2.3815858364105225,
|
|
"learning_rate": 7.822580645161292e-07,
|
|
"loss": 1.0368,
|
|
"step": 1154
|
|
},
|
|
{
|
|
"epoch": 4.62,
|
|
"grad_norm": 1.8045047521591187,
|
|
"learning_rate": 7.741935483870968e-07,
|
|
"loss": 0.4503,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 4.624,
|
|
"grad_norm": 2.581266403198242,
|
|
"learning_rate": 7.661290322580646e-07,
|
|
"loss": 0.7589,
|
|
"step": 1156
|
|
},
|
|
{
|
|
"epoch": 4.628,
|
|
"grad_norm": 2.2751896381378174,
|
|
"learning_rate": 7.580645161290324e-07,
|
|
"loss": 0.6624,
|
|
"step": 1157
|
|
},
|
|
{
|
|
"epoch": 4.632,
|
|
"grad_norm": 2.2207417488098145,
|
|
"learning_rate": 7.5e-07,
|
|
"loss": 0.7476,
|
|
"step": 1158
|
|
},
|
|
{
|
|
"epoch": 4.636,
|
|
"grad_norm": 2.173737049102783,
|
|
"learning_rate": 7.419354838709678e-07,
|
|
"loss": 0.5195,
|
|
"step": 1159
|
|
},
|
|
{
|
|
"epoch": 4.64,
|
|
"grad_norm": 2.327514886856079,
|
|
"learning_rate": 7.338709677419354e-07,
|
|
"loss": 0.7557,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 4.644,
|
|
"grad_norm": 2.36411190032959,
|
|
"learning_rate": 7.258064516129033e-07,
|
|
"loss": 0.8697,
|
|
"step": 1161
|
|
},
|
|
{
|
|
"epoch": 4.648,
|
|
"grad_norm": 2.3163628578186035,
|
|
"learning_rate": 7.177419354838711e-07,
|
|
"loss": 0.742,
|
|
"step": 1162
|
|
},
|
|
{
|
|
"epoch": 4.652,
|
|
"grad_norm": 2.373682975769043,
|
|
"learning_rate": 7.096774193548388e-07,
|
|
"loss": 0.7396,
|
|
"step": 1163
|
|
},
|
|
{
|
|
"epoch": 4.656,
|
|
"grad_norm": 2.7890610694885254,
|
|
"learning_rate": 7.016129032258065e-07,
|
|
"loss": 0.9487,
|
|
"step": 1164
|
|
},
|
|
{
|
|
"epoch": 4.66,
|
|
"grad_norm": 1.9923303127288818,
|
|
"learning_rate": 6.935483870967742e-07,
|
|
"loss": 0.5078,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 4.664,
|
|
"grad_norm": 2.2962071895599365,
|
|
"learning_rate": 6.854838709677419e-07,
|
|
"loss": 0.6853,
|
|
"step": 1166
|
|
},
|
|
{
|
|
"epoch": 4.668,
|
|
"grad_norm": 2.216494083404541,
|
|
"learning_rate": 6.774193548387098e-07,
|
|
"loss": 0.5454,
|
|
"step": 1167
|
|
},
|
|
{
|
|
"epoch": 4.672,
|
|
"grad_norm": 2.3010871410369873,
|
|
"learning_rate": 6.693548387096775e-07,
|
|
"loss": 0.6566,
|
|
"step": 1168
|
|
},
|
|
{
|
|
"epoch": 4.676,
|
|
"grad_norm": 2.5437018871307373,
|
|
"learning_rate": 6.612903225806453e-07,
|
|
"loss": 1.0156,
|
|
"step": 1169
|
|
},
|
|
{
|
|
"epoch": 4.68,
|
|
"grad_norm": 2.4474453926086426,
|
|
"learning_rate": 6.532258064516129e-07,
|
|
"loss": 0.7806,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 4.684,
|
|
"grad_norm": 2.4709243774414062,
|
|
"learning_rate": 6.451612903225807e-07,
|
|
"loss": 0.7458,
|
|
"step": 1171
|
|
},
|
|
{
|
|
"epoch": 4.688,
|
|
"grad_norm": 2.0869922637939453,
|
|
"learning_rate": 6.370967741935484e-07,
|
|
"loss": 0.6425,
|
|
"step": 1172
|
|
},
|
|
{
|
|
"epoch": 4.692,
|
|
"grad_norm": 2.2414233684539795,
|
|
"learning_rate": 6.290322580645161e-07,
|
|
"loss": 0.7052,
|
|
"step": 1173
|
|
},
|
|
{
|
|
"epoch": 4.696,
|
|
"grad_norm": 2.1230409145355225,
|
|
"learning_rate": 6.209677419354839e-07,
|
|
"loss": 0.5901,
|
|
"step": 1174
|
|
},
|
|
{
|
|
"epoch": 4.7,
|
|
"grad_norm": 2.0766208171844482,
|
|
"learning_rate": 6.129032258064516e-07,
|
|
"loss": 0.627,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 4.704,
|
|
"grad_norm": 2.3263955116271973,
|
|
"learning_rate": 6.048387096774194e-07,
|
|
"loss": 0.6084,
|
|
"step": 1176
|
|
},
|
|
{
|
|
"epoch": 4.708,
|
|
"grad_norm": 2.28118634223938,
|
|
"learning_rate": 5.967741935483872e-07,
|
|
"loss": 0.7532,
|
|
"step": 1177
|
|
},
|
|
{
|
|
"epoch": 4.712,
|
|
"grad_norm": 2.4390621185302734,
|
|
"learning_rate": 5.887096774193548e-07,
|
|
"loss": 0.8747,
|
|
"step": 1178
|
|
},
|
|
{
|
|
"epoch": 4.716,
|
|
"grad_norm": 2.4335639476776123,
|
|
"learning_rate": 5.806451612903227e-07,
|
|
"loss": 0.7191,
|
|
"step": 1179
|
|
},
|
|
{
|
|
"epoch": 4.72,
|
|
"grad_norm": 2.284865617752075,
|
|
"learning_rate": 5.725806451612903e-07,
|
|
"loss": 0.6934,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 4.724,
|
|
"grad_norm": 2.382296085357666,
|
|
"learning_rate": 5.645161290322581e-07,
|
|
"loss": 0.8553,
|
|
"step": 1181
|
|
},
|
|
{
|
|
"epoch": 4.728,
|
|
"grad_norm": 2.481611967086792,
|
|
"learning_rate": 5.564516129032259e-07,
|
|
"loss": 0.7187,
|
|
"step": 1182
|
|
},
|
|
{
|
|
"epoch": 4.732,
|
|
"grad_norm": 2.524700880050659,
|
|
"learning_rate": 5.483870967741936e-07,
|
|
"loss": 0.8057,
|
|
"step": 1183
|
|
},
|
|
{
|
|
"epoch": 4.736,
|
|
"grad_norm": 2.0938427448272705,
|
|
"learning_rate": 5.403225806451613e-07,
|
|
"loss": 0.5949,
|
|
"step": 1184
|
|
},
|
|
{
|
|
"epoch": 4.74,
|
|
"grad_norm": 2.2317137718200684,
|
|
"learning_rate": 5.32258064516129e-07,
|
|
"loss": 0.5911,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 4.744,
|
|
"grad_norm": 2.4001145362854004,
|
|
"learning_rate": 5.241935483870968e-07,
|
|
"loss": 0.6954,
|
|
"step": 1186
|
|
},
|
|
{
|
|
"epoch": 4.748,
|
|
"grad_norm": 2.1287758350372314,
|
|
"learning_rate": 5.161290322580646e-07,
|
|
"loss": 0.6147,
|
|
"step": 1187
|
|
},
|
|
{
|
|
"epoch": 4.752,
|
|
"grad_norm": 2.6044185161590576,
|
|
"learning_rate": 5.080645161290323e-07,
|
|
"loss": 0.8149,
|
|
"step": 1188
|
|
},
|
|
{
|
|
"epoch": 4.756,
|
|
"grad_norm": 2.4292356967926025,
|
|
"learning_rate": 5.000000000000001e-07,
|
|
"loss": 0.9372,
|
|
"step": 1189
|
|
},
|
|
{
|
|
"epoch": 4.76,
|
|
"grad_norm": 2.4027163982391357,
|
|
"learning_rate": 4.919354838709677e-07,
|
|
"loss": 0.8758,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 4.764,
|
|
"grad_norm": 2.3395657539367676,
|
|
"learning_rate": 4.838709677419355e-07,
|
|
"loss": 0.8588,
|
|
"step": 1191
|
|
},
|
|
{
|
|
"epoch": 4.768,
|
|
"grad_norm": 2.1531972885131836,
|
|
"learning_rate": 4.758064516129032e-07,
|
|
"loss": 0.6956,
|
|
"step": 1192
|
|
},
|
|
{
|
|
"epoch": 4.772,
|
|
"grad_norm": 2.2789173126220703,
|
|
"learning_rate": 4.67741935483871e-07,
|
|
"loss": 0.594,
|
|
"step": 1193
|
|
},
|
|
{
|
|
"epoch": 4.776,
|
|
"grad_norm": 2.3256616592407227,
|
|
"learning_rate": 4.5967741935483873e-07,
|
|
"loss": 0.6754,
|
|
"step": 1194
|
|
},
|
|
{
|
|
"epoch": 4.78,
|
|
"grad_norm": 2.6827950477600098,
|
|
"learning_rate": 4.5161290322580644e-07,
|
|
"loss": 1.123,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 4.784,
|
|
"grad_norm": 2.393773078918457,
|
|
"learning_rate": 4.4354838709677425e-07,
|
|
"loss": 0.8604,
|
|
"step": 1196
|
|
},
|
|
{
|
|
"epoch": 4.788,
|
|
"grad_norm": 2.275068521499634,
|
|
"learning_rate": 4.3548387096774196e-07,
|
|
"loss": 0.6247,
|
|
"step": 1197
|
|
},
|
|
{
|
|
"epoch": 4.792,
|
|
"grad_norm": 2.2951745986938477,
|
|
"learning_rate": 4.2741935483870967e-07,
|
|
"loss": 0.7261,
|
|
"step": 1198
|
|
},
|
|
{
|
|
"epoch": 4.796,
|
|
"grad_norm": 2.144277811050415,
|
|
"learning_rate": 4.193548387096775e-07,
|
|
"loss": 0.7152,
|
|
"step": 1199
|
|
},
|
|
{
|
|
"epoch": 4.8,
|
|
"grad_norm": 2.2701258659362793,
|
|
"learning_rate": 4.112903225806452e-07,
|
|
"loss": 0.7496,
|
|
"step": 1200
|
|
}
|
|
],
  "logging_steps": 1,
  "max_steps": 1250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.056150075029094e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}