7027 lines
153 KiB
JSON
7027 lines
153 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 9.0,
|
|
"eval_steps": 500,
|
|
"global_step": 999,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.009009009009009009,
|
|
"grad_norm": 1.7692377372839305,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3442,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.018018018018018018,
|
|
"grad_norm": 1.655774168281545,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.374,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.02702702702702703,
|
|
"grad_norm": 1.5838611317965265,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3763,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.036036036036036036,
|
|
"grad_norm": 1.4722490643600856,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3667,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.04504504504504504,
|
|
"grad_norm": 0.967982129724269,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3378,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.05405405405405406,
|
|
"grad_norm": 0.9655512366546067,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.325,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.06306306306306306,
|
|
"grad_norm": 0.7980444967597017,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3418,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.07207207207207207,
|
|
"grad_norm": 0.5186921114651042,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3508,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.08108108108108109,
|
|
"grad_norm": 0.5518215648538942,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.348,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.09009009009009009,
|
|
"grad_norm": 0.6622509236535837,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3539,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0990990990990991,
|
|
"grad_norm": 0.6537261351376887,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3364,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.10810810810810811,
|
|
"grad_norm": 0.6557224204301801,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3529,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.11711711711711711,
|
|
"grad_norm": 0.6677743317643713,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3233,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.12612612612612611,
|
|
"grad_norm": 0.5771734482767436,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3433,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.13513513513513514,
|
|
"grad_norm": 0.5194262746227281,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2968,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.14414414414414414,
|
|
"grad_norm": 0.5871866323370637,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3177,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.15315315315315314,
|
|
"grad_norm": 0.6823752349157315,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3217,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.16216216216216217,
|
|
"grad_norm": 0.5298328303770766,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.333,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.17117117117117117,
|
|
"grad_norm": 0.45672917289622006,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2998,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.18018018018018017,
|
|
"grad_norm": 0.4892453526407057,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3301,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.1891891891891892,
|
|
"grad_norm": 0.4157035225188495,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3097,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.1981981981981982,
|
|
"grad_norm": 0.42144355038756004,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.336,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.2072072072072072,
|
|
"grad_norm": 0.40449172267977285,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3203,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.21621621621621623,
|
|
"grad_norm": 0.3817504264369776,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3282,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.22522522522522523,
|
|
"grad_norm": 0.37458931065383283,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3427,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.23423423423423423,
|
|
"grad_norm": 0.43415654347436194,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3361,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.24324324324324326,
|
|
"grad_norm": 0.34734907350951355,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3081,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.25225225225225223,
|
|
"grad_norm": 0.3446691978222806,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3104,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.26126126126126126,
|
|
"grad_norm": 0.3219457244434707,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3154,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.2702702702702703,
|
|
"grad_norm": 0.35333024684448033,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3238,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.27927927927927926,
|
|
"grad_norm": 0.38018940900412435,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3479,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.2882882882882883,
|
|
"grad_norm": 0.4160537077429581,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3225,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.2972972972972973,
|
|
"grad_norm": 0.4001899610048794,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3378,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.3063063063063063,
|
|
"grad_norm": 0.3966450451230361,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3136,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.3153153153153153,
|
|
"grad_norm": 0.35442342787868963,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3272,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.32432432432432434,
|
|
"grad_norm": 0.31417075347024526,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.34,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.3333333333333333,
|
|
"grad_norm": 0.31460209634883374,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3068,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.34234234234234234,
|
|
"grad_norm": 0.27638346613404846,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3355,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.35135135135135137,
|
|
"grad_norm": 0.31966876717000925,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3176,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.36036036036036034,
|
|
"grad_norm": 0.2841478766107157,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3031,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.36936936936936937,
|
|
"grad_norm": 0.30103555060540843,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3149,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.3783783783783784,
|
|
"grad_norm": 0.3245469606074819,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3147,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.38738738738738737,
|
|
"grad_norm": 0.32434042643762057,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3305,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.3963963963963964,
|
|
"grad_norm": 0.27778799916309627,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3384,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.40540540540540543,
|
|
"grad_norm": 0.2745056010877783,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3156,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.4144144144144144,
|
|
"grad_norm": 0.29410832050755714,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3112,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.42342342342342343,
|
|
"grad_norm": 0.39421096404575884,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3504,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.43243243243243246,
|
|
"grad_norm": 0.30297987367745016,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3133,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.44144144144144143,
|
|
"grad_norm": 0.312599049596589,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3298,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.45045045045045046,
|
|
"grad_norm": 0.28890139188869196,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3016,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.4594594594594595,
|
|
"grad_norm": 0.27234641580243496,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3349,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.46846846846846846,
|
|
"grad_norm": 0.30882782510454476,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3307,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.4774774774774775,
|
|
"grad_norm": 0.2657310651267706,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3246,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.4864864864864865,
|
|
"grad_norm": 0.2876695765716273,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.336,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.4954954954954955,
|
|
"grad_norm": 0.29656571676225046,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3428,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.5045045045045045,
|
|
"grad_norm": 0.25789947550982967,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3035,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.5135135135135135,
|
|
"grad_norm": 0.3359664317488606,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3221,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.5225225225225225,
|
|
"grad_norm": 0.26901646941539337,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3061,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.5315315315315315,
|
|
"grad_norm": 0.26500112714488566,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3203,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.5405405405405406,
|
|
"grad_norm": 0.2614586643859284,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3068,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.5495495495495496,
|
|
"grad_norm": 0.28994271054547277,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3568,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.5585585585585585,
|
|
"grad_norm": 0.3372155822417667,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3559,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.5675675675675675,
|
|
"grad_norm": 0.30224128387648297,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3069,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.5765765765765766,
|
|
"grad_norm": 0.3130672270163632,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3513,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.5855855855855856,
|
|
"grad_norm": 0.3065414445284105,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3194,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.5945945945945946,
|
|
"grad_norm": 0.29075353592758474,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.364,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.6036036036036037,
|
|
"grad_norm": 0.28085597006714386,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3626,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.6126126126126126,
|
|
"grad_norm": 0.30828909246343983,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3278,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.6216216216216216,
|
|
"grad_norm": 0.30901462421223835,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3338,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.6306306306306306,
|
|
"grad_norm": 0.3316361212286006,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3444,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.6396396396396397,
|
|
"grad_norm": 0.26217545165384337,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3226,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.6486486486486487,
|
|
"grad_norm": 0.2563886493400457,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3409,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.6576576576576577,
|
|
"grad_norm": 0.2962337946705661,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3196,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 0.2542256020612804,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3281,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.6756756756756757,
|
|
"grad_norm": 0.32938420923589096,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3185,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.6846846846846847,
|
|
"grad_norm": 0.33155654515742616,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3153,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.6936936936936937,
|
|
"grad_norm": 0.25618184255532905,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3109,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.7027027027027027,
|
|
"grad_norm": 0.27091352477500336,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3371,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.7117117117117117,
|
|
"grad_norm": 0.270383658268609,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3325,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.7207207207207207,
|
|
"grad_norm": 0.2525642887483178,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3288,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.7297297297297297,
|
|
"grad_norm": 0.3027811916633369,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3472,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.7387387387387387,
|
|
"grad_norm": 0.3506129591935139,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3383,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.7477477477477478,
|
|
"grad_norm": 0.32710280320818547,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3095,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.7567567567567568,
|
|
"grad_norm": 0.28423266582526613,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2909,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.7657657657657657,
|
|
"grad_norm": 0.30514786428468144,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3181,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.7747747747747747,
|
|
"grad_norm": 0.3219013280475637,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3421,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.7837837837837838,
|
|
"grad_norm": 0.30765019613171724,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.339,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.7927927927927928,
|
|
"grad_norm": 0.31363666903509363,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.32,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.8018018018018018,
|
|
"grad_norm": 0.2802553985535834,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3698,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.8108108108108109,
|
|
"grad_norm": 0.29928509597489333,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3465,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.8198198198198198,
|
|
"grad_norm": 0.30368274583450106,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.321,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.8288288288288288,
|
|
"grad_norm": 0.28901190479096217,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3034,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.8378378378378378,
|
|
"grad_norm": 0.27035852334114224,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3379,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.8468468468468469,
|
|
"grad_norm": 0.2757989755002078,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.335,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.8558558558558559,
|
|
"grad_norm": 0.30063030136785046,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3299,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.8648648648648649,
|
|
"grad_norm": 0.3436429105109027,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2906,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.8738738738738738,
|
|
"grad_norm": 0.2995609579715489,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.317,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.8828828828828829,
|
|
"grad_norm": 0.2860865100969785,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3281,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.8918918918918919,
|
|
"grad_norm": 0.29202094172851817,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3225,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.9009009009009009,
|
|
"grad_norm": 0.2931365896073913,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3022,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.9099099099099099,
|
|
"grad_norm": 0.30610410355543166,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3287,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.918918918918919,
|
|
"grad_norm": 0.27050744175601266,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3004,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.9279279279279279,
|
|
"grad_norm": 0.2530762314307683,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3153,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.9369369369369369,
|
|
"grad_norm": 0.2939696187606388,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3039,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.9459459459459459,
|
|
"grad_norm": 0.269725936200039,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3028,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.954954954954955,
|
|
"grad_norm": 0.32481323273559976,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3328,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.963963963963964,
|
|
"grad_norm": 0.3297388133110706,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3303,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.972972972972973,
|
|
"grad_norm": 0.3137683488542705,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3317,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.9819819819819819,
|
|
"grad_norm": 0.2724212797943338,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3231,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.990990990990991,
|
|
"grad_norm": 0.26974252035068974,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3052,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.27546705234954955,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2872,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 1.009009009009009,
|
|
"grad_norm": 0.26532136740094475,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2831,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 1.018018018018018,
|
|
"grad_norm": 0.2847617719081207,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3127,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 1.027027027027027,
|
|
"grad_norm": 0.25187489870567525,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3179,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 1.0360360360360361,
|
|
"grad_norm": 0.2470210561590589,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2888,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 1.045045045045045,
|
|
"grad_norm": 0.2908873792372198,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3172,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 1.054054054054054,
|
|
"grad_norm": 0.2545755890884819,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3044,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 1.063063063063063,
|
|
"grad_norm": 0.2720375854552878,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3171,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 1.072072072072072,
|
|
"grad_norm": 0.2804009954248822,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2903,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 1.0810810810810811,
|
|
"grad_norm": 0.2584639986814767,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2786,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 1.09009009009009,
|
|
"grad_norm": 0.2523704924311713,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3009,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 1.0990990990990992,
|
|
"grad_norm": 0.27166739983138516,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3144,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 1.1081081081081081,
|
|
"grad_norm": 0.291934322919287,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3199,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 1.117117117117117,
|
|
"grad_norm": 0.2869424658137007,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2768,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 1.1261261261261262,
|
|
"grad_norm": 0.35542461802439873,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2972,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.135135135135135,
|
|
"grad_norm": 0.25765779721736737,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2715,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 1.1441441441441442,
|
|
"grad_norm": 0.2850720419420103,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2861,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 1.1531531531531531,
|
|
"grad_norm": 0.2869267701696132,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2797,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 1.1621621621621623,
|
|
"grad_norm": 0.27437916265446266,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.284,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 1.1711711711711712,
|
|
"grad_norm": 0.26640743341471523,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.282,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 1.1801801801801801,
|
|
"grad_norm": 0.2600732173679119,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2919,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 1.1891891891891893,
|
|
"grad_norm": 0.2665092682109021,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3143,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 1.1981981981981982,
|
|
"grad_norm": 0.24683974895824953,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2987,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 1.2072072072072073,
|
|
"grad_norm": 0.2908036694917544,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3158,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 1.2162162162162162,
|
|
"grad_norm": 0.2945953899064198,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3152,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 1.2252252252252251,
|
|
"grad_norm": 0.2616231868963709,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3152,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 1.2342342342342343,
|
|
"grad_norm": 0.27650089751312973,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3029,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 1.2432432432432432,
|
|
"grad_norm": 0.2631481660529609,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3084,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 1.2522522522522523,
|
|
"grad_norm": 0.28830473220819297,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3251,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 1.2612612612612613,
|
|
"grad_norm": 0.3062303048487267,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3093,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 1.2702702702702702,
|
|
"grad_norm": 0.3066815320224598,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2881,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 1.2792792792792793,
|
|
"grad_norm": 0.29129920876550947,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.282,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 1.2882882882882882,
|
|
"grad_norm": 0.2895564905632834,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3076,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 1.2972972972972974,
|
|
"grad_norm": 0.25687914463290057,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.283,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 1.3063063063063063,
|
|
"grad_norm": 0.2543976032045274,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2987,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 1.3153153153153152,
|
|
"grad_norm": 0.27423309545031693,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2981,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 1.3243243243243243,
|
|
"grad_norm": 0.3127504643012831,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3091,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 1.3333333333333333,
|
|
"grad_norm": 0.2738777266583336,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2577,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 1.3423423423423424,
|
|
"grad_norm": 0.2669333852747903,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2855,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 1.3513513513513513,
|
|
"grad_norm": 0.26761386479699967,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3019,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 1.3603603603603602,
|
|
"grad_norm": 0.25789802423884284,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3107,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 1.3693693693693694,
|
|
"grad_norm": 0.27940713368034126,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2843,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 1.3783783783783785,
|
|
"grad_norm": 0.277366156708692,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2799,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 1.3873873873873874,
|
|
"grad_norm": 0.2607843059312788,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.292,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 1.3963963963963963,
|
|
"grad_norm": 0.2649281612489507,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3134,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 1.4054054054054055,
|
|
"grad_norm": 0.27271972468771527,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2972,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 1.4144144144144144,
|
|
"grad_norm": 0.26207901754212165,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2822,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 1.4234234234234235,
|
|
"grad_norm": 0.2641717963089793,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2971,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 1.4324324324324325,
|
|
"grad_norm": 0.2579842614638958,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3024,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 1.4414414414414414,
|
|
"grad_norm": 0.2870255938899811,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2885,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 1.4504504504504505,
|
|
"grad_norm": 0.2777224839264993,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2892,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 1.4594594594594594,
|
|
"grad_norm": 0.27625106290913043,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2805,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 1.4684684684684686,
|
|
"grad_norm": 0.2700016737510603,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2992,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 1.4774774774774775,
|
|
"grad_norm": 0.25372514988722056,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2972,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 1.4864864864864864,
|
|
"grad_norm": 0.28782834487825465,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3018,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 1.4954954954954955,
|
|
"grad_norm": 0.27036226357763354,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2968,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 1.5045045045045045,
|
|
"grad_norm": 0.24997568182394178,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2482,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 1.5135135135135136,
|
|
"grad_norm": 0.28025540658752757,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3314,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 1.5225225225225225,
|
|
"grad_norm": 0.25563343479526396,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3163,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 1.5315315315315314,
|
|
"grad_norm": 0.3556162754506623,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2925,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 1.5405405405405406,
|
|
"grad_norm": 0.27599016482238853,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2838,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 1.5495495495495497,
|
|
"grad_norm": 0.272343971725021,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3088,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 1.5585585585585586,
|
|
"grad_norm": 0.28693003610171597,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2921,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 1.5675675675675675,
|
|
"grad_norm": 0.2955327518594707,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2777,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 1.5765765765765765,
|
|
"grad_norm": 0.27961760151449894,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2838,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 1.5855855855855856,
|
|
"grad_norm": 0.24665431850909808,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2781,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 1.5945945945945947,
|
|
"grad_norm": 0.26426261640553667,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2816,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 1.6036036036036037,
|
|
"grad_norm": 0.2711333903704824,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3142,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 1.6126126126126126,
|
|
"grad_norm": 0.2722379287245898,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2816,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 1.6216216216216215,
|
|
"grad_norm": 0.3012330875667607,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3263,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 1.6306306306306306,
|
|
"grad_norm": 0.2669108739090265,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2952,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 1.6396396396396398,
|
|
"grad_norm": 0.2748579289599078,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2823,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 1.6486486486486487,
|
|
"grad_norm": 0.29837425745633833,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3038,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 1.6576576576576576,
|
|
"grad_norm": 0.3305979404285009,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3017,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 1.6666666666666665,
|
|
"grad_norm": 0.26365462645864157,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2952,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 1.6756756756756757,
|
|
"grad_norm": 0.27117354048602127,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2713,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 1.6846846846846848,
|
|
"grad_norm": 0.2618109082938301,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3259,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 1.6936936936936937,
|
|
"grad_norm": 0.24890174240606217,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2864,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 1.7027027027027026,
|
|
"grad_norm": 0.28948368345439884,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3242,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 1.7117117117117115,
|
|
"grad_norm": 0.2659473815766033,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2928,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 1.7207207207207207,
|
|
"grad_norm": 0.26435921312812555,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2747,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 1.7297297297297298,
|
|
"grad_norm": 0.2834566804404197,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3373,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 1.7387387387387387,
|
|
"grad_norm": 0.26226690378932954,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2713,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 1.7477477477477477,
|
|
"grad_norm": 0.2574908549961044,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2949,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 1.7567567567567568,
|
|
"grad_norm": 0.2670216430713444,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3105,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 1.7657657657657657,
|
|
"grad_norm": 0.2644549565961117,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2974,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 1.7747747747747749,
|
|
"grad_norm": 0.2754975911578592,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3019,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 1.7837837837837838,
|
|
"grad_norm": 0.28446592391114817,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3148,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 1.7927927927927927,
|
|
"grad_norm": 0.28893386362511947,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3189,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 1.8018018018018018,
|
|
"grad_norm": 0.2869246676669029,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3015,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 1.810810810810811,
|
|
"grad_norm": 0.2847178633594474,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2628,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 1.8198198198198199,
|
|
"grad_norm": 0.2946725850660284,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2768,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 1.8288288288288288,
|
|
"grad_norm": 0.29608299277278,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3171,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 1.8378378378378377,
|
|
"grad_norm": 0.28628382246998885,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3096,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 1.8468468468468469,
|
|
"grad_norm": 0.2660371973699119,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2685,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 1.855855855855856,
|
|
"grad_norm": 0.2514264016055165,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2622,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 1.864864864864865,
|
|
"grad_norm": 0.2675623714158383,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3324,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 1.8738738738738738,
|
|
"grad_norm": 0.2817065371989752,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2926,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 1.8828828828828827,
|
|
"grad_norm": 0.24376840027264843,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2695,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 1.8918918918918919,
|
|
"grad_norm": 0.2679237524654036,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2606,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 1.900900900900901,
|
|
"grad_norm": 0.2593077892544588,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2781,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 1.90990990990991,
|
|
"grad_norm": 0.2555343741606999,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2616,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 1.9189189189189189,
|
|
"grad_norm": 0.27065363914180135,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.309,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 1.9279279279279278,
|
|
"grad_norm": 0.29950662348843465,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2953,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 1.936936936936937,
|
|
"grad_norm": 0.30392398016557,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3302,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 1.945945945945946,
|
|
"grad_norm": 0.2688781676455933,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2946,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 1.954954954954955,
|
|
"grad_norm": 0.27334249580678227,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3169,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 1.9639639639639639,
|
|
"grad_norm": 0.2637661232011851,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2923,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 1.972972972972973,
|
|
"grad_norm": 0.24845919128888916,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2956,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 1.981981981981982,
|
|
"grad_norm": 0.2677476120892863,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2725,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 1.990990990990991,
|
|
"grad_norm": 0.27245457118100547,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.304,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.2632364290696338,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2759,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 2.009009009009009,
|
|
"grad_norm": 0.29524131111947416,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2467,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 2.018018018018018,
|
|
"grad_norm": 0.26959444826517864,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2509,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 2.027027027027027,
|
|
"grad_norm": 0.24776989679141162,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2647,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 2.036036036036036,
|
|
"grad_norm": 0.24922491602278132,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2734,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 2.045045045045045,
|
|
"grad_norm": 0.2637011140567836,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2538,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 2.054054054054054,
|
|
"grad_norm": 0.24677968833597697,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2569,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 2.063063063063063,
|
|
"grad_norm": 0.25749179244984177,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.266,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 2.0720720720720722,
|
|
"grad_norm": 0.2704364348984915,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2645,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 2.081081081081081,
|
|
"grad_norm": 0.2848341811917101,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.258,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 2.09009009009009,
|
|
"grad_norm": 0.2539455237645273,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2648,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 2.099099099099099,
|
|
"grad_norm": 0.2534894136461773,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2611,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 2.108108108108108,
|
|
"grad_norm": 0.2666435185167066,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2671,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 2.1171171171171173,
|
|
"grad_norm": 0.275032039682747,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2807,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 2.126126126126126,
|
|
"grad_norm": 0.24537895004936466,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2777,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 2.135135135135135,
|
|
"grad_norm": 0.29459998669694115,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2782,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 2.144144144144144,
|
|
"grad_norm": 0.2727554788191977,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2687,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 2.153153153153153,
|
|
"grad_norm": 0.30880501847599995,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2878,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 2.1621621621621623,
|
|
"grad_norm": 0.2886633976684916,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.267,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 2.171171171171171,
|
|
"grad_norm": 0.2597628174067978,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2648,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 2.18018018018018,
|
|
"grad_norm": 0.2534324931692372,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2981,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 2.189189189189189,
|
|
"grad_norm": 0.2563993838591747,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2487,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 2.1981981981981984,
|
|
"grad_norm": 0.2852726219398302,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2543,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 2.2072072072072073,
|
|
"grad_norm": 0.30478195170068134,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2562,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 2.2162162162162162,
|
|
"grad_norm": 0.24772685929517294,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2869,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 2.225225225225225,
|
|
"grad_norm": 0.26428977786941277,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2709,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 2.234234234234234,
|
|
"grad_norm": 0.2447098843485426,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2241,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 2.2432432432432434,
|
|
"grad_norm": 0.2841804786817898,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2398,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 2.2522522522522523,
|
|
"grad_norm": 0.2837413945636495,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2755,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 2.2612612612612613,
|
|
"grad_norm": 0.27688677145182117,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2581,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 2.27027027027027,
|
|
"grad_norm": 0.2524013812037196,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2447,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 2.279279279279279,
|
|
"grad_norm": 0.25708866849265744,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2629,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 2.2882882882882885,
|
|
"grad_norm": 0.31089756790372536,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.262,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 2.2972972972972974,
|
|
"grad_norm": 0.2580437334513352,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2672,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 2.3063063063063063,
|
|
"grad_norm": 0.25589033140205797,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2854,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 2.315315315315315,
|
|
"grad_norm": 0.2851188761111017,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2847,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 2.3243243243243246,
|
|
"grad_norm": 0.2742352435214708,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2863,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 2.3333333333333335,
|
|
"grad_norm": 0.25574343614682743,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2744,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 2.3423423423423424,
|
|
"grad_norm": 0.2704501372387818,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2356,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 2.3513513513513513,
|
|
"grad_norm": 0.2694883625074875,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2845,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 2.3603603603603602,
|
|
"grad_norm": 0.2749897171746042,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2745,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 2.3693693693693696,
|
|
"grad_norm": 0.33678826387641014,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3088,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 2.3783783783783785,
|
|
"grad_norm": 0.2773165283746789,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2946,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 2.3873873873873874,
|
|
"grad_norm": 0.31677913584086903,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3179,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 2.3963963963963963,
|
|
"grad_norm": 0.2563051452749462,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2625,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 2.4054054054054053,
|
|
"grad_norm": 0.321688693489085,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2671,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 2.4144144144144146,
|
|
"grad_norm": 0.26634437339972133,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2613,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 2.4234234234234235,
|
|
"grad_norm": 0.27171211584580457,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2628,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 2.4324324324324325,
|
|
"grad_norm": 0.2555430715005437,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2687,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 2.4414414414414414,
|
|
"grad_norm": 0.24255848197003171,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2941,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 2.4504504504504503,
|
|
"grad_norm": 0.29538238957980967,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2777,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 2.4594594594594597,
|
|
"grad_norm": 0.2876545631402078,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2764,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 2.4684684684684686,
|
|
"grad_norm": 0.2773762933353327,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2834,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 2.4774774774774775,
|
|
"grad_norm": 0.25275190194965114,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2625,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 2.4864864864864864,
|
|
"grad_norm": 0.30548139692249815,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.264,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 2.4954954954954953,
|
|
"grad_norm": 0.2857116539220258,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2663,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 2.5045045045045047,
|
|
"grad_norm": 0.27127459034653845,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.289,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 2.5135135135135136,
|
|
"grad_norm": 0.29403524162565264,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2665,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 2.5225225225225225,
|
|
"grad_norm": 0.2982604039719257,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2635,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 2.5315315315315314,
|
|
"grad_norm": 0.25776587175299304,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2592,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 2.5405405405405403,
|
|
"grad_norm": 0.2646598986862087,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2579,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 2.5495495495495497,
|
|
"grad_norm": 0.24717949544087905,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2544,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 2.5585585585585586,
|
|
"grad_norm": 0.2657887766041429,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2656,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 2.5675675675675675,
|
|
"grad_norm": 0.27748457946008864,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2675,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 2.5765765765765765,
|
|
"grad_norm": 0.25089374600320746,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2816,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 2.5855855855855854,
|
|
"grad_norm": 0.28897866413916584,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2589,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 2.5945945945945947,
|
|
"grad_norm": 0.26235423487495346,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.271,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 2.6036036036036037,
|
|
"grad_norm": 0.29773828111895406,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2884,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 2.6126126126126126,
|
|
"grad_norm": 0.2732062490555635,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.297,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 2.6216216216216215,
|
|
"grad_norm": 0.28269145506341653,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2794,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 2.6306306306306304,
|
|
"grad_norm": 0.2592351362804753,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2653,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 2.6396396396396398,
|
|
"grad_norm": 0.27363184791488976,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2659,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 2.6486486486486487,
|
|
"grad_norm": 0.2687283362268144,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2881,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 2.6576576576576576,
|
|
"grad_norm": 0.2669999794761192,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2658,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 2.6666666666666665,
|
|
"grad_norm": 0.2584313873251436,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2597,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 2.6756756756756754,
|
|
"grad_norm": 0.2870412914872632,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.273,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 2.684684684684685,
|
|
"grad_norm": 0.2565405158611234,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2471,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 2.6936936936936937,
|
|
"grad_norm": 0.2718920473228364,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2556,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 2.7027027027027026,
|
|
"grad_norm": 0.2732398668856954,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2729,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 2.7117117117117115,
|
|
"grad_norm": 0.25213076888264274,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2659,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 2.7207207207207205,
|
|
"grad_norm": 0.25342262780535696,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2791,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 2.72972972972973,
|
|
"grad_norm": 0.2929513672092119,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2714,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 2.7387387387387387,
|
|
"grad_norm": 0.27482309634629043,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2646,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 2.7477477477477477,
|
|
"grad_norm": 0.26495695016553,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2655,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 2.756756756756757,
|
|
"grad_norm": 0.2751450071843517,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.282,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 2.7657657657657655,
|
|
"grad_norm": 0.2492074837362159,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2728,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 2.774774774774775,
|
|
"grad_norm": 0.24588259514355568,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2506,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 2.7837837837837838,
|
|
"grad_norm": 0.290865691950273,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3019,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 2.7927927927927927,
|
|
"grad_norm": 0.24649105252907824,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3028,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 2.801801801801802,
|
|
"grad_norm": 0.24865219694730992,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2647,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 2.810810810810811,
|
|
"grad_norm": 0.2641273618850612,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2743,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 2.81981981981982,
|
|
"grad_norm": 0.27036448999028867,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2483,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 2.828828828828829,
|
|
"grad_norm": 0.277820288498933,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2478,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 2.8378378378378377,
|
|
"grad_norm": 0.25834412495274456,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2395,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 2.846846846846847,
|
|
"grad_norm": 0.25827263911198917,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.222,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 2.855855855855856,
|
|
"grad_norm": 0.28475747286608616,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2747,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 2.864864864864865,
|
|
"grad_norm": 0.25037323222188695,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2689,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 2.873873873873874,
|
|
"grad_norm": 0.2652972773806203,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2477,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 2.8828828828828827,
|
|
"grad_norm": 0.26279014403702605,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2734,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 2.891891891891892,
|
|
"grad_norm": 0.2854042712916503,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2679,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 2.900900900900901,
|
|
"grad_norm": 0.26077805779165003,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2608,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 2.90990990990991,
|
|
"grad_norm": 0.255112093170312,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2695,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 2.918918918918919,
|
|
"grad_norm": 0.26211588620202336,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2424,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 2.9279279279279278,
|
|
"grad_norm": 0.2685084403266774,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.235,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 2.936936936936937,
|
|
"grad_norm": 0.27269803144536753,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2759,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 2.945945945945946,
|
|
"grad_norm": 0.26751393672770624,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2564,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 2.954954954954955,
|
|
"grad_norm": 0.2665543902683488,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2763,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 2.963963963963964,
|
|
"grad_norm": 0.28496550173938856,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2762,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 2.972972972972973,
|
|
"grad_norm": 0.2567341688659859,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2756,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 2.981981981981982,
|
|
"grad_norm": 0.2584671428651672,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.3001,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 2.990990990990991,
|
|
"grad_norm": 0.2804525292556161,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2785,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.24187503112431247,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2454,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 3.009009009009009,
|
|
"grad_norm": 0.26503328616806615,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2332,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 3.018018018018018,
|
|
"grad_norm": 0.2631846355641096,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2658,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 3.027027027027027,
|
|
"grad_norm": 0.2786137851508687,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2519,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 3.036036036036036,
|
|
"grad_norm": 0.2755722489630619,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.206,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 3.045045045045045,
|
|
"grad_norm": 0.2530795628224832,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2029,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 3.054054054054054,
|
|
"grad_norm": 0.25959049991529565,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2371,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 3.063063063063063,
|
|
"grad_norm": 0.2916294807412774,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2556,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 3.0720720720720722,
|
|
"grad_norm": 0.2790615318391773,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2198,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 3.081081081081081,
|
|
"grad_norm": 0.3203392671142568,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2693,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 3.09009009009009,
|
|
"grad_norm": 0.2576637679316666,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2304,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 3.099099099099099,
|
|
"grad_norm": 0.24928248944605377,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2291,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 3.108108108108108,
|
|
"grad_norm": 0.26793696602953165,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2445,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 3.1171171171171173,
|
|
"grad_norm": 0.2971915014155351,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2416,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 3.126126126126126,
|
|
"grad_norm": 0.27752473751722373,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.219,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 3.135135135135135,
|
|
"grad_norm": 0.2794534994824032,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2297,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 3.144144144144144,
|
|
"grad_norm": 0.25926948541703204,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2411,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 3.153153153153153,
|
|
"grad_norm": 0.24722918054941584,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2419,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 3.1621621621621623,
|
|
"grad_norm": 0.26203389917376085,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.234,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 3.171171171171171,
|
|
"grad_norm": 0.2472074514309984,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2363,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 3.18018018018018,
|
|
"grad_norm": 0.2945063702553609,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2435,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 3.189189189189189,
|
|
"grad_norm": 0.27925373635853185,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2746,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 3.1981981981981984,
|
|
"grad_norm": 0.24996405356591392,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2357,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 3.2072072072072073,
|
|
"grad_norm": 0.2556106250304069,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2499,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 3.2162162162162162,
|
|
"grad_norm": 0.26114830248277804,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2435,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 3.225225225225225,
|
|
"grad_norm": 0.2469470177899144,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2391,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 3.234234234234234,
|
|
"grad_norm": 0.2641345310685226,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1981,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 3.2432432432432434,
|
|
"grad_norm": 0.2630942786949833,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2098,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 3.2522522522522523,
|
|
"grad_norm": 0.24708329710543495,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2228,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 3.2612612612612613,
|
|
"grad_norm": 0.25000693689900794,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.242,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 3.27027027027027,
|
|
"grad_norm": 0.2554644897448756,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2558,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 3.279279279279279,
|
|
"grad_norm": 0.25264038317978293,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2009,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 3.2882882882882885,
|
|
"grad_norm": 0.2743512388274681,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2331,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 3.2972972972972974,
|
|
"grad_norm": 0.2728238972210015,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.238,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 3.3063063063063063,
|
|
"grad_norm": 0.2602352997656632,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2595,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 3.315315315315315,
|
|
"grad_norm": 0.27036311534944873,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2093,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 3.3243243243243246,
|
|
"grad_norm": 0.264625202176752,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2579,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 3.3333333333333335,
|
|
"grad_norm": 0.259895631515348,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2375,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 3.3423423423423424,
|
|
"grad_norm": 0.2563353260712296,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2364,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 3.3513513513513513,
|
|
"grad_norm": 0.28822107627305354,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2493,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 3.3603603603603602,
|
|
"grad_norm": 0.25680447088580227,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2501,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 3.3693693693693696,
|
|
"grad_norm": 0.27784185650966475,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.245,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 3.3783783783783785,
|
|
"grad_norm": 0.2627541742958857,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2387,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 3.3873873873873874,
|
|
"grad_norm": 0.24193274859474298,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2375,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 3.3963963963963963,
|
|
"grad_norm": 0.258378796876473,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2281,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 3.4054054054054053,
|
|
"grad_norm": 0.2749899330352957,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.24,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 3.4144144144144146,
|
|
"grad_norm": 0.25777164751813997,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2524,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 3.4234234234234235,
|
|
"grad_norm": 0.2805168544005753,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2415,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 3.4324324324324325,
|
|
"grad_norm": 0.25842839628916536,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2433,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 3.4414414414414414,
|
|
"grad_norm": 0.26639980982056893,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2403,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 3.4504504504504503,
|
|
"grad_norm": 0.3060982219805088,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2236,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 3.4594594594594597,
|
|
"grad_norm": 0.26146902459280136,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2504,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 3.4684684684684686,
|
|
"grad_norm": 0.25380491317438975,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2464,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 3.4774774774774775,
|
|
"grad_norm": 0.27324232509875496,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2404,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 3.4864864864864864,
|
|
"grad_norm": 0.2651723560610241,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.226,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 3.4954954954954953,
|
|
"grad_norm": 0.2689389917124243,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2751,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 3.5045045045045047,
|
|
"grad_norm": 0.2643418768447757,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2298,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 3.5135135135135136,
|
|
"grad_norm": 0.24935046689303417,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2505,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 3.5225225225225225,
|
|
"grad_norm": 0.2508765789856499,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2478,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 3.5315315315315314,
|
|
"grad_norm": 0.26705709850776205,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2225,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 3.5405405405405403,
|
|
"grad_norm": 0.2573422869010653,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2122,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 3.5495495495495497,
|
|
"grad_norm": 0.2770154762726231,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2802,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 3.5585585585585586,
|
|
"grad_norm": 0.26710684568846427,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2275,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 3.5675675675675675,
|
|
"grad_norm": 0.2527476600992376,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2445,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 3.5765765765765765,
|
|
"grad_norm": 0.2521141774058005,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2342,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 3.5855855855855854,
|
|
"grad_norm": 0.2689995200221707,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2518,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 3.5945945945945947,
|
|
"grad_norm": 0.25908754443823273,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2386,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 3.6036036036036037,
|
|
"grad_norm": 0.273518168337783,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2641,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 3.6126126126126126,
|
|
"grad_norm": 0.26669639385445737,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2359,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 3.6216216216216215,
|
|
"grad_norm": 0.2560702170541,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2337,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 3.6306306306306304,
|
|
"grad_norm": 0.2461177958525498,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2401,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 3.6396396396396398,
|
|
"grad_norm": 0.2648097200804019,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2144,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 3.6486486486486487,
|
|
"grad_norm": 0.2646834290329095,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2493,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 3.6576576576576576,
|
|
"grad_norm": 0.2796973639180676,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2467,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 3.6666666666666665,
|
|
"grad_norm": 0.25308085485220105,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.227,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 3.6756756756756754,
|
|
"grad_norm": 0.2587012503429008,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2285,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 3.684684684684685,
|
|
"grad_norm": 0.2958300777778266,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2547,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 3.6936936936936937,
|
|
"grad_norm": 0.25334395158267925,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2693,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 3.7027027027027026,
|
|
"grad_norm": 0.29019457155713096,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2518,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 3.7117117117117115,
|
|
"grad_norm": 0.2473020184393372,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2344,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 3.7207207207207205,
|
|
"grad_norm": 0.270453761649425,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2275,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 3.72972972972973,
|
|
"grad_norm": 0.2602131546551776,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2428,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 3.7387387387387387,
|
|
"grad_norm": 0.29110180180417683,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2301,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 3.7477477477477477,
|
|
"grad_norm": 0.25367703106621997,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2377,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 3.756756756756757,
|
|
"grad_norm": 0.257299738969486,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2137,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 3.7657657657657655,
|
|
"grad_norm": 0.257656312443973,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2567,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 3.774774774774775,
|
|
"grad_norm": 0.2808325095308855,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2571,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 3.7837837837837838,
|
|
"grad_norm": 0.2657618644204265,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2382,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 3.7927927927927927,
|
|
"grad_norm": 0.27556658674977147,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2748,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 3.801801801801802,
|
|
"grad_norm": 0.2783118243091199,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2349,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 3.810810810810811,
|
|
"grad_norm": 0.27683880390148435,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2545,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 3.81981981981982,
|
|
"grad_norm": 0.24903071725050696,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2436,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 3.828828828828829,
|
|
"grad_norm": 0.27140890180707533,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2103,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 3.8378378378378377,
|
|
"grad_norm": 0.25999693913444694,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2703,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 3.846846846846847,
|
|
"grad_norm": 0.28165585165926776,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2395,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 3.855855855855856,
|
|
"grad_norm": 0.26800670806664434,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2855,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 3.864864864864865,
|
|
"grad_norm": 0.26752171553410126,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2525,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 3.873873873873874,
|
|
"grad_norm": 0.2550812423474624,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2357,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 3.8828828828828827,
|
|
"grad_norm": 0.25341757674985854,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2752,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 3.891891891891892,
|
|
"grad_norm": 0.2714456590973952,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2399,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 3.900900900900901,
|
|
"grad_norm": 0.2832850264958553,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.232,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 3.90990990990991,
|
|
"grad_norm": 0.2560994537050628,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2457,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 3.918918918918919,
|
|
"grad_norm": 0.2624403245035626,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2782,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 3.9279279279279278,
|
|
"grad_norm": 0.2645012258501843,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2432,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 3.936936936936937,
|
|
"grad_norm": 0.26607477226554654,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2202,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 3.945945945945946,
|
|
"grad_norm": 0.2731452758231204,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2689,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 3.954954954954955,
|
|
"grad_norm": 0.2964590337329977,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.229,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 3.963963963963964,
|
|
"grad_norm": 0.2787999534447745,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2527,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 3.972972972972973,
|
|
"grad_norm": 0.24055312465968123,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2231,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 3.981981981981982,
|
|
"grad_norm": 0.2757745274177008,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2437,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 3.990990990990991,
|
|
"grad_norm": 0.26536706718909975,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2335,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 0.2390963333912312,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2237,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 4.009009009009009,
|
|
"grad_norm": 0.2720207934109716,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2316,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 4.018018018018018,
|
|
"grad_norm": 0.2673459557274162,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2205,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 4.027027027027027,
|
|
"grad_norm": 0.24447403903164172,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2033,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 4.036036036036036,
|
|
"grad_norm": 0.29354577394627634,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2059,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 4.045045045045045,
|
|
"grad_norm": 0.28252004790921936,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2222,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 4.054054054054054,
|
|
"grad_norm": 0.279624558559084,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2485,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 4.063063063063063,
|
|
"grad_norm": 0.2742544682456035,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2153,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 4.072072072072072,
|
|
"grad_norm": 0.26315979594288036,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1967,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 4.081081081081081,
|
|
"grad_norm": 0.25548950244986113,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2108,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 4.09009009009009,
|
|
"grad_norm": 0.260763131351132,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.21,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 4.099099099099099,
|
|
"grad_norm": 0.2705243300559351,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1926,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 4.108108108108108,
|
|
"grad_norm": 0.2572296275587587,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2182,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 4.117117117117117,
|
|
"grad_norm": 0.32370825872086306,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1912,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 4.126126126126126,
|
|
"grad_norm": 0.24556795850306926,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2071,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 4.135135135135135,
|
|
"grad_norm": 0.23389148126428516,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.234,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 4.1441441441441444,
|
|
"grad_norm": 0.2428236778448457,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2119,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 4.153153153153153,
|
|
"grad_norm": 0.31106881930176683,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2124,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 4.162162162162162,
|
|
"grad_norm": 0.27122185214756195,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2226,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 4.171171171171171,
|
|
"grad_norm": 0.2996732981773459,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2299,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 4.18018018018018,
|
|
"grad_norm": 0.27023462008753,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2557,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 4.1891891891891895,
|
|
"grad_norm": 0.25842796305339033,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2325,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 4.198198198198198,
|
|
"grad_norm": 0.2437169161762717,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1797,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 4.207207207207207,
|
|
"grad_norm": 0.26780073229070595,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2067,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 4.216216216216216,
|
|
"grad_norm": 0.2670888124294135,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2205,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 4.225225225225225,
|
|
"grad_norm": 0.25879921020859936,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2085,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 4.2342342342342345,
|
|
"grad_norm": 0.26317981293875226,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2336,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 4.243243243243243,
|
|
"grad_norm": 0.23931715866089387,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2124,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 4.252252252252252,
|
|
"grad_norm": 0.2691126922298142,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2065,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 4.261261261261261,
|
|
"grad_norm": 0.23991879914940956,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2045,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 4.27027027027027,
|
|
"grad_norm": 0.2548563923839949,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1836,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 4.2792792792792795,
|
|
"grad_norm": 0.24697361737276458,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1792,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 4.288288288288288,
|
|
"grad_norm": 0.2829022630675641,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2098,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 4.297297297297297,
|
|
"grad_norm": 0.2620700761228271,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2102,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 4.306306306306306,
|
|
"grad_norm": 0.2628063026021744,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2025,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 4.315315315315315,
|
|
"grad_norm": 0.2863724297024661,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.237,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 4.324324324324325,
|
|
"grad_norm": 0.25990971129318524,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2116,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 4.333333333333333,
|
|
"grad_norm": 0.2606038664504591,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2092,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 4.342342342342342,
|
|
"grad_norm": 0.253863280864317,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.164,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 4.351351351351352,
|
|
"grad_norm": 0.24650022322727727,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1852,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 4.36036036036036,
|
|
"grad_norm": 0.25369962373757826,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2183,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 4.36936936936937,
|
|
"grad_norm": 0.28375278856958064,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2052,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 4.378378378378378,
|
|
"grad_norm": 0.24267201305207473,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2367,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 4.387387387387387,
|
|
"grad_norm": 0.25205690964559024,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2212,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 4.396396396396397,
|
|
"grad_norm": 0.25716800876310375,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1906,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 4.405405405405405,
|
|
"grad_norm": 0.23704968081876604,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2075,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 4.414414414414415,
|
|
"grad_norm": 0.3201956523912786,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.198,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 4.423423423423423,
|
|
"grad_norm": 0.26301398337918436,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1878,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 4.4324324324324325,
|
|
"grad_norm": 0.27402028797210554,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2104,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 4.441441441441442,
|
|
"grad_norm": 0.33955450203665727,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2205,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 4.45045045045045,
|
|
"grad_norm": 0.26220441610422024,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.218,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 4.45945945945946,
|
|
"grad_norm": 0.281656218031479,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2235,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 4.468468468468468,
|
|
"grad_norm": 0.29159551817654267,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2178,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 4.4774774774774775,
|
|
"grad_norm": 0.2623117148967965,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2178,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 4.486486486486487,
|
|
"grad_norm": 0.2531840506893455,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2086,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 4.495495495495495,
|
|
"grad_norm": 0.25528977769788064,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2186,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 4.504504504504505,
|
|
"grad_norm": 0.2679628655435481,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2237,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 4.513513513513513,
|
|
"grad_norm": 0.263719988749634,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.194,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 4.5225225225225225,
|
|
"grad_norm": 0.273138889734998,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2086,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 4.531531531531532,
|
|
"grad_norm": 0.292878429342998,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2154,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 4.54054054054054,
|
|
"grad_norm": 0.27619815070018144,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2049,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 4.54954954954955,
|
|
"grad_norm": 0.27527630799114594,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2213,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 4.558558558558558,
|
|
"grad_norm": 0.26879133234631997,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2238,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 4.5675675675675675,
|
|
"grad_norm": 0.272548643979066,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2114,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 4.576576576576577,
|
|
"grad_norm": 0.27819059711468064,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2157,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 4.585585585585585,
|
|
"grad_norm": 0.27618387944584083,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2048,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 4.594594594594595,
|
|
"grad_norm": 0.2549425189875316,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2174,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 4.603603603603604,
|
|
"grad_norm": 0.2645903835474375,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2216,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 4.612612612612613,
|
|
"grad_norm": 0.2640684028376376,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2182,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 4.621621621621622,
|
|
"grad_norm": 0.26051198776980117,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1808,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 4.63063063063063,
|
|
"grad_norm": 0.2931023356142575,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1941,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 4.63963963963964,
|
|
"grad_norm": 0.25284181276914397,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2362,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 4.648648648648649,
|
|
"grad_norm": 0.2590084071736973,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2417,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 4.657657657657658,
|
|
"grad_norm": 0.30404451969520124,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2186,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 4.666666666666667,
|
|
"grad_norm": 0.2673580882682002,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2224,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 4.675675675675675,
|
|
"grad_norm": 0.2636588657441614,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2324,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 4.684684684684685,
|
|
"grad_norm": 0.2876900527962799,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2246,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 4.693693693693694,
|
|
"grad_norm": 0.33566773437219993,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2148,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 4.702702702702703,
|
|
"grad_norm": 0.25837694824532986,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2435,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 4.711711711711712,
|
|
"grad_norm": 0.2618996341262811,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2504,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 4.7207207207207205,
|
|
"grad_norm": 0.2916721768764094,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1969,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 4.72972972972973,
|
|
"grad_norm": 0.2695124629228616,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2242,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 4.738738738738739,
|
|
"grad_norm": 0.25003767055634085,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2396,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 4.747747747747748,
|
|
"grad_norm": 0.26273587385726827,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2389,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 4.756756756756757,
|
|
"grad_norm": 0.2633999928270432,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1999,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 4.7657657657657655,
|
|
"grad_norm": 0.2657486138733691,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1973,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 4.774774774774775,
|
|
"grad_norm": 0.2615424263109113,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2172,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 4.783783783783784,
|
|
"grad_norm": 0.2725460425087256,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2316,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 4.792792792792793,
|
|
"grad_norm": 0.29663406158664646,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.215,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 4.801801801801802,
|
|
"grad_norm": 0.2680114226198382,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1913,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 4.8108108108108105,
|
|
"grad_norm": 0.2717779322025023,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2134,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 4.81981981981982,
|
|
"grad_norm": 0.2461871817136421,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2242,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 4.828828828828829,
|
|
"grad_norm": 0.23898230675599963,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1906,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 4.837837837837838,
|
|
"grad_norm": 0.24493103786606743,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2157,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 4.846846846846847,
|
|
"grad_norm": 0.2513533399485069,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2251,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 4.8558558558558556,
|
|
"grad_norm": 0.25335345934289205,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1981,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 4.864864864864865,
|
|
"grad_norm": 0.24569861483369518,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1923,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 4.873873873873874,
|
|
"grad_norm": 0.3107988513160903,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2022,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 4.882882882882883,
|
|
"grad_norm": 0.2440474159047901,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2102,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 4.891891891891892,
|
|
"grad_norm": 0.269910179414699,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2023,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 4.900900900900901,
|
|
"grad_norm": 0.28697178165278897,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2026,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 4.90990990990991,
|
|
"grad_norm": 0.27623354559228885,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2477,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 4.918918918918919,
|
|
"grad_norm": 0.2598034982021407,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.202,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 4.927927927927928,
|
|
"grad_norm": 0.2982050262473221,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1987,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 4.936936936936937,
|
|
"grad_norm": 0.2506438136769937,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2332,
|
|
"step": 548
|
|
},
|
|
{
|
|
"epoch": 4.945945945945946,
|
|
"grad_norm": 0.2619541945846186,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1929,
|
|
"step": 549
|
|
},
|
|
{
|
|
"epoch": 4.954954954954955,
|
|
"grad_norm": 0.263321542176826,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.199,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 4.963963963963964,
|
|
"grad_norm": 0.2601674233941214,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2515,
|
|
"step": 551
|
|
},
|
|
{
|
|
"epoch": 4.972972972972973,
|
|
"grad_norm": 0.29934162295077227,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2177,
|
|
"step": 552
|
|
},
|
|
{
|
|
"epoch": 4.981981981981982,
|
|
"grad_norm": 0.24535555883333707,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2038,
|
|
"step": 553
|
|
},
|
|
{
|
|
"epoch": 4.990990990990991,
|
|
"grad_norm": 0.2743717244598402,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1904,
|
|
"step": 554
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 0.2346279054988279,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2318,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 5.009009009009009,
|
|
"grad_norm": 0.3139762280953865,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2065,
|
|
"step": 556
|
|
},
|
|
{
|
|
"epoch": 5.018018018018018,
|
|
"grad_norm": 0.2318927222535076,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2063,
|
|
"step": 557
|
|
},
|
|
{
|
|
"epoch": 5.027027027027027,
|
|
"grad_norm": 0.25650614267529076,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2042,
|
|
"step": 558
|
|
},
|
|
{
|
|
"epoch": 5.036036036036036,
|
|
"grad_norm": 0.25768317605269925,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2057,
|
|
"step": 559
|
|
},
|
|
{
|
|
"epoch": 5.045045045045045,
|
|
"grad_norm": 0.29060238578973707,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1944,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 5.054054054054054,
|
|
"grad_norm": 0.28407299845741896,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1718,
|
|
"step": 561
|
|
},
|
|
{
|
|
"epoch": 5.063063063063063,
|
|
"grad_norm": 0.29213793767158686,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1878,
|
|
"step": 562
|
|
},
|
|
{
|
|
"epoch": 5.072072072072072,
|
|
"grad_norm": 0.26810675570875164,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2125,
|
|
"step": 563
|
|
},
|
|
{
|
|
"epoch": 5.081081081081081,
|
|
"grad_norm": 0.2692377641775085,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1846,
|
|
"step": 564
|
|
},
|
|
{
|
|
"epoch": 5.09009009009009,
|
|
"grad_norm": 0.405649877673358,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1837,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 5.099099099099099,
|
|
"grad_norm": 0.26726682072971775,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2062,
|
|
"step": 566
|
|
},
|
|
{
|
|
"epoch": 5.108108108108108,
|
|
"grad_norm": 0.2940841675590565,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2165,
|
|
"step": 567
|
|
},
|
|
{
|
|
"epoch": 5.117117117117117,
|
|
"grad_norm": 0.3398159316706572,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1926,
|
|
"step": 568
|
|
},
|
|
{
|
|
"epoch": 5.126126126126126,
|
|
"grad_norm": 0.2826251512922728,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1848,
|
|
"step": 569
|
|
},
|
|
{
|
|
"epoch": 5.135135135135135,
|
|
"grad_norm": 0.25092563468699364,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.193,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 5.1441441441441444,
|
|
"grad_norm": 0.25159248777659954,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1723,
|
|
"step": 571
|
|
},
|
|
{
|
|
"epoch": 5.153153153153153,
|
|
"grad_norm": 0.2681017671845892,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1634,
|
|
"step": 572
|
|
},
|
|
{
|
|
"epoch": 5.162162162162162,
|
|
"grad_norm": 0.2733469299319058,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1702,
|
|
"step": 573
|
|
},
|
|
{
|
|
"epoch": 5.171171171171171,
|
|
"grad_norm": 0.2643697126326926,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2108,
|
|
"step": 574
|
|
},
|
|
{
|
|
"epoch": 5.18018018018018,
|
|
"grad_norm": 0.2929652382664824,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2104,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 5.1891891891891895,
|
|
"grad_norm": 0.30518478646977765,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2049,
|
|
"step": 576
|
|
},
|
|
{
|
|
"epoch": 5.198198198198198,
|
|
"grad_norm": 0.29565787595285775,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1701,
|
|
"step": 577
|
|
},
|
|
{
|
|
"epoch": 5.207207207207207,
|
|
"grad_norm": 0.24799846379048632,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1849,
|
|
"step": 578
|
|
},
|
|
{
|
|
"epoch": 5.216216216216216,
|
|
"grad_norm": 0.26812878158143444,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1939,
|
|
"step": 579
|
|
},
|
|
{
|
|
"epoch": 5.225225225225225,
|
|
"grad_norm": 0.2832327785366025,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1748,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 5.2342342342342345,
|
|
"grad_norm": 0.24530353488882148,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1672,
|
|
"step": 581
|
|
},
|
|
{
|
|
"epoch": 5.243243243243243,
|
|
"grad_norm": 0.267893260322143,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1982,
|
|
"step": 582
|
|
},
|
|
{
|
|
"epoch": 5.252252252252252,
|
|
"grad_norm": 0.28205728775241223,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1402,
|
|
"step": 583
|
|
},
|
|
{
|
|
"epoch": 5.261261261261261,
|
|
"grad_norm": 0.2616195565718879,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1669,
|
|
"step": 584
|
|
},
|
|
{
|
|
"epoch": 5.27027027027027,
|
|
"grad_norm": 0.2623448971573745,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2006,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 5.2792792792792795,
|
|
"grad_norm": 0.24193944254287217,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1737,
|
|
"step": 586
|
|
},
|
|
{
|
|
"epoch": 5.288288288288288,
|
|
"grad_norm": 0.27208641316196014,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1837,
|
|
"step": 587
|
|
},
|
|
{
|
|
"epoch": 5.297297297297297,
|
|
"grad_norm": 0.25067910651417047,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1934,
|
|
"step": 588
|
|
},
|
|
{
|
|
"epoch": 5.306306306306306,
|
|
"grad_norm": 0.25385900871383876,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1689,
|
|
"step": 589
|
|
},
|
|
{
|
|
"epoch": 5.315315315315315,
|
|
"grad_norm": 0.32902079040677734,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1534,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 5.324324324324325,
|
|
"grad_norm": 0.2529027343155485,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2102,
|
|
"step": 591
|
|
},
|
|
{
|
|
"epoch": 5.333333333333333,
|
|
"grad_norm": 0.28906659508958055,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.181,
|
|
"step": 592
|
|
},
|
|
{
|
|
"epoch": 5.342342342342342,
|
|
"grad_norm": 0.282108480924088,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2128,
|
|
"step": 593
|
|
},
|
|
{
|
|
"epoch": 5.351351351351352,
|
|
"grad_norm": 0.2604161116106256,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1844,
|
|
"step": 594
|
|
},
|
|
{
|
|
"epoch": 5.36036036036036,
|
|
"grad_norm": 0.2789492989923241,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1683,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 5.36936936936937,
|
|
"grad_norm": 0.2559431308271593,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2137,
|
|
"step": 596
|
|
},
|
|
{
|
|
"epoch": 5.378378378378378,
|
|
"grad_norm": 0.30088029917481107,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1892,
|
|
"step": 597
|
|
},
|
|
{
|
|
"epoch": 5.387387387387387,
|
|
"grad_norm": 0.26253812275245714,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1765,
|
|
"step": 598
|
|
},
|
|
{
|
|
"epoch": 5.396396396396397,
|
|
"grad_norm": 0.26495943964336816,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1843,
|
|
"step": 599
|
|
},
|
|
{
|
|
"epoch": 5.405405405405405,
|
|
"grad_norm": 0.25894821975432253,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1881,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 5.414414414414415,
|
|
"grad_norm": 0.24931805970093998,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1878,
|
|
"step": 601
|
|
},
|
|
{
|
|
"epoch": 5.423423423423423,
|
|
"grad_norm": 0.23455479372929255,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1906,
|
|
"step": 602
|
|
},
|
|
{
|
|
"epoch": 5.4324324324324325,
|
|
"grad_norm": 0.25467507848802673,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1717,
|
|
"step": 603
|
|
},
|
|
{
|
|
"epoch": 5.441441441441442,
|
|
"grad_norm": 0.33202611172740315,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1688,
|
|
"step": 604
|
|
},
|
|
{
|
|
"epoch": 5.45045045045045,
|
|
"grad_norm": 0.29109320447844156,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2029,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 5.45945945945946,
|
|
"grad_norm": 0.24981105367499418,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1941,
|
|
"step": 606
|
|
},
|
|
{
|
|
"epoch": 5.468468468468468,
|
|
"grad_norm": 0.24367601204155379,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1671,
|
|
"step": 607
|
|
},
|
|
{
|
|
"epoch": 5.4774774774774775,
|
|
"grad_norm": 0.2932072155115799,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1705,
|
|
"step": 608
|
|
},
|
|
{
|
|
"epoch": 5.486486486486487,
|
|
"grad_norm": 0.2882005482228378,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2065,
|
|
"step": 609
|
|
},
|
|
{
|
|
"epoch": 5.495495495495495,
|
|
"grad_norm": 0.25719022643699463,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2111,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 5.504504504504505,
|
|
"grad_norm": 0.2611846325545377,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2016,
|
|
"step": 611
|
|
},
|
|
{
|
|
"epoch": 5.513513513513513,
|
|
"grad_norm": 0.23251839540489064,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1711,
|
|
"step": 612
|
|
},
|
|
{
|
|
"epoch": 5.5225225225225225,
|
|
"grad_norm": 0.3956644880260737,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1794,
|
|
"step": 613
|
|
},
|
|
{
|
|
"epoch": 5.531531531531532,
|
|
"grad_norm": 0.27250839467887433,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1676,
|
|
"step": 614
|
|
},
|
|
{
|
|
"epoch": 5.54054054054054,
|
|
"grad_norm": 0.2638663157341973,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.189,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 5.54954954954955,
|
|
"grad_norm": 0.2635087015420886,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1894,
|
|
"step": 616
|
|
},
|
|
{
|
|
"epoch": 5.558558558558558,
|
|
"grad_norm": 0.25884441144311887,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1625,
|
|
"step": 617
|
|
},
|
|
{
|
|
"epoch": 5.5675675675675675,
|
|
"grad_norm": 0.33989481367732455,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1632,
|
|
"step": 618
|
|
},
|
|
{
|
|
"epoch": 5.576576576576577,
|
|
"grad_norm": 0.33951958077722966,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1956,
|
|
"step": 619
|
|
},
|
|
{
|
|
"epoch": 5.585585585585585,
|
|
"grad_norm": 0.2547652235180218,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2291,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 5.594594594594595,
|
|
"grad_norm": 0.24750295719042112,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1726,
|
|
"step": 621
|
|
},
|
|
{
|
|
"epoch": 5.603603603603604,
|
|
"grad_norm": 0.2508541551180729,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1776,
|
|
"step": 622
|
|
},
|
|
{
|
|
"epoch": 5.612612612612613,
|
|
"grad_norm": 0.2506039467248062,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2185,
|
|
"step": 623
|
|
},
|
|
{
|
|
"epoch": 5.621621621621622,
|
|
"grad_norm": 0.26927980609500457,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2051,
|
|
"step": 624
|
|
},
|
|
{
|
|
"epoch": 5.63063063063063,
|
|
"grad_norm": 0.2902598041361342,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1836,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 5.63963963963964,
|
|
"grad_norm": 0.27400647533943007,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.187,
|
|
"step": 626
|
|
},
|
|
{
|
|
"epoch": 5.648648648648649,
|
|
"grad_norm": 0.29199710457207273,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1969,
|
|
"step": 627
|
|
},
|
|
{
|
|
"epoch": 5.657657657657658,
|
|
"grad_norm": 0.3025760209241755,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1745,
|
|
"step": 628
|
|
},
|
|
{
|
|
"epoch": 5.666666666666667,
|
|
"grad_norm": 0.2863200552497931,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1763,
|
|
"step": 629
|
|
},
|
|
{
|
|
"epoch": 5.675675675675675,
|
|
"grad_norm": 0.3046187504171871,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1765,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 5.684684684684685,
|
|
"grad_norm": 0.2594010152562734,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1922,
|
|
"step": 631
|
|
},
|
|
{
|
|
"epoch": 5.693693693693694,
|
|
"grad_norm": 0.25276964471192975,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2011,
|
|
"step": 632
|
|
},
|
|
{
|
|
"epoch": 5.702702702702703,
|
|
"grad_norm": 0.2457717647956263,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1878,
|
|
"step": 633
|
|
},
|
|
{
|
|
"epoch": 5.711711711711712,
|
|
"grad_norm": 0.27348692878164155,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1992,
|
|
"step": 634
|
|
},
|
|
{
|
|
"epoch": 5.7207207207207205,
|
|
"grad_norm": 0.2599835122727351,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1649,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 5.72972972972973,
|
|
"grad_norm": 0.2712466634459408,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2089,
|
|
"step": 636
|
|
},
|
|
{
|
|
"epoch": 5.738738738738739,
|
|
"grad_norm": 0.2732874374632016,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1613,
|
|
"step": 637
|
|
},
|
|
{
|
|
"epoch": 5.747747747747748,
|
|
"grad_norm": 0.3335330997010001,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2057,
|
|
"step": 638
|
|
},
|
|
{
|
|
"epoch": 5.756756756756757,
|
|
"grad_norm": 0.29795878105581997,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1619,
|
|
"step": 639
|
|
},
|
|
{
|
|
"epoch": 5.7657657657657655,
|
|
"grad_norm": 0.27969811406236256,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.212,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 5.774774774774775,
|
|
"grad_norm": 0.26108347760571876,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1898,
|
|
"step": 641
|
|
},
|
|
{
|
|
"epoch": 5.783783783783784,
|
|
"grad_norm": 0.2954357533804664,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1845,
|
|
"step": 642
|
|
},
|
|
{
|
|
"epoch": 5.792792792792793,
|
|
"grad_norm": 0.2897237531310712,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1778,
|
|
"step": 643
|
|
},
|
|
{
|
|
"epoch": 5.801801801801802,
|
|
"grad_norm": 0.2862916112801224,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1616,
|
|
"step": 644
|
|
},
|
|
{
|
|
"epoch": 5.8108108108108105,
|
|
"grad_norm": 0.2655242503575118,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2125,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 5.81981981981982,
|
|
"grad_norm": 0.29888735697965757,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.17,
|
|
"step": 646
|
|
},
|
|
{
|
|
"epoch": 5.828828828828829,
|
|
"grad_norm": 0.3878759366680586,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1747,
|
|
"step": 647
|
|
},
|
|
{
|
|
"epoch": 5.837837837837838,
|
|
"grad_norm": 0.2780142798396489,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.208,
|
|
"step": 648
|
|
},
|
|
{
|
|
"epoch": 5.846846846846847,
|
|
"grad_norm": 0.32189822402929597,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1903,
|
|
"step": 649
|
|
},
|
|
{
|
|
"epoch": 5.8558558558558556,
|
|
"grad_norm": 0.26666263423680436,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1753,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 5.864864864864865,
|
|
"grad_norm": 0.26508147192359016,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1956,
|
|
"step": 651
|
|
},
|
|
{
|
|
"epoch": 5.873873873873874,
|
|
"grad_norm": 0.2800648058751269,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1796,
|
|
"step": 652
|
|
},
|
|
{
|
|
"epoch": 5.882882882882883,
|
|
"grad_norm": 0.25602425755319697,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2028,
|
|
"step": 653
|
|
},
|
|
{
|
|
"epoch": 5.891891891891892,
|
|
"grad_norm": 0.296930917045288,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1924,
|
|
"step": 654
|
|
},
|
|
{
|
|
"epoch": 5.900900900900901,
|
|
"grad_norm": 0.3452776489094155,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1964,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 5.90990990990991,
|
|
"grad_norm": 0.2910698717731606,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1751,
|
|
"step": 656
|
|
},
|
|
{
|
|
"epoch": 5.918918918918919,
|
|
"grad_norm": 0.2591517828645954,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.187,
|
|
"step": 657
|
|
},
|
|
{
|
|
"epoch": 5.927927927927928,
|
|
"grad_norm": 0.32146446439072224,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1945,
|
|
"step": 658
|
|
},
|
|
{
|
|
"epoch": 5.936936936936937,
|
|
"grad_norm": 0.29057113691944186,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1796,
|
|
"step": 659
|
|
},
|
|
{
|
|
"epoch": 5.945945945945946,
|
|
"grad_norm": 0.2673772774533524,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1871,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 5.954954954954955,
|
|
"grad_norm": 0.25292557260096377,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1748,
|
|
"step": 661
|
|
},
|
|
{
|
|
"epoch": 5.963963963963964,
|
|
"grad_norm": 0.24315435369686791,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1838,
|
|
"step": 662
|
|
},
|
|
{
|
|
"epoch": 5.972972972972973,
|
|
"grad_norm": 0.30275438050027514,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1633,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 5.981981981981982,
|
|
"grad_norm": 0.28436057893273076,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1876,
|
|
"step": 664
|
|
},
|
|
{
|
|
"epoch": 5.990990990990991,
|
|
"grad_norm": 0.28562922979220184,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2009,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 6.0,
|
|
"grad_norm": 0.2653424601600143,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1905,
|
|
"step": 666
|
|
},
|
|
{
|
|
"epoch": 6.009009009009009,
|
|
"grad_norm": 0.2860376096075966,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1957,
|
|
"step": 667
|
|
},
|
|
{
|
|
"epoch": 6.018018018018018,
|
|
"grad_norm": 0.25196665164096865,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1563,
|
|
"step": 668
|
|
},
|
|
{
|
|
"epoch": 6.027027027027027,
|
|
"grad_norm": 0.24029344524647256,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1879,
|
|
"step": 669
|
|
},
|
|
{
|
|
"epoch": 6.036036036036036,
|
|
"grad_norm": 0.2620085799429486,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1811,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 6.045045045045045,
|
|
"grad_norm": 0.27308115959180734,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.155,
|
|
"step": 671
|
|
},
|
|
{
|
|
"epoch": 6.054054054054054,
|
|
"grad_norm": 0.26803737868546207,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1537,
|
|
"step": 672
|
|
},
|
|
{
|
|
"epoch": 6.063063063063063,
|
|
"grad_norm": 0.30441930072274076,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1902,
|
|
"step": 673
|
|
},
|
|
{
|
|
"epoch": 6.072072072072072,
|
|
"grad_norm": 0.2465984202629159,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1807,
|
|
"step": 674
|
|
},
|
|
{
|
|
"epoch": 6.081081081081081,
|
|
"grad_norm": 0.2674335217467193,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1487,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 6.09009009009009,
|
|
"grad_norm": 0.2905204800351543,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1567,
|
|
"step": 676
|
|
},
|
|
{
|
|
"epoch": 6.099099099099099,
|
|
"grad_norm": 0.2954597077236978,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1535,
|
|
"step": 677
|
|
},
|
|
{
|
|
"epoch": 6.108108108108108,
|
|
"grad_norm": 0.3045298345267351,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1689,
|
|
"step": 678
|
|
},
|
|
{
|
|
"epoch": 6.117117117117117,
|
|
"grad_norm": 0.2740781768489349,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1797,
|
|
"step": 679
|
|
},
|
|
{
|
|
"epoch": 6.126126126126126,
|
|
"grad_norm": 0.39475528450021763,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1617,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 6.135135135135135,
|
|
"grad_norm": 0.6235225287605396,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1238,
|
|
"step": 681
|
|
},
|
|
{
|
|
"epoch": 6.1441441441441444,
|
|
"grad_norm": 0.26845753887421847,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1511,
|
|
"step": 682
|
|
},
|
|
{
|
|
"epoch": 6.153153153153153,
|
|
"grad_norm": 0.3602960092750115,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1678,
|
|
"step": 683
|
|
},
|
|
{
|
|
"epoch": 6.162162162162162,
|
|
"grad_norm": 0.33224893659794336,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1412,
|
|
"step": 684
|
|
},
|
|
{
|
|
"epoch": 6.171171171171171,
|
|
"grad_norm": 0.24094527332728147,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1695,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 6.18018018018018,
|
|
"grad_norm": 0.341428905288911,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1839,
|
|
"step": 686
|
|
},
|
|
{
|
|
"epoch": 6.1891891891891895,
|
|
"grad_norm": 0.2956801407312396,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1692,
|
|
"step": 687
|
|
},
|
|
{
|
|
"epoch": 6.198198198198198,
|
|
"grad_norm": 0.3054866867274709,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1546,
|
|
"step": 688
|
|
},
|
|
{
|
|
"epoch": 6.207207207207207,
|
|
"grad_norm": 0.23806841375933424,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1554,
|
|
"step": 689
|
|
},
|
|
{
|
|
"epoch": 6.216216216216216,
|
|
"grad_norm": 0.24481339250784975,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1953,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 6.225225225225225,
|
|
"grad_norm": 0.3014128409778474,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1674,
|
|
"step": 691
|
|
},
|
|
{
|
|
"epoch": 6.2342342342342345,
|
|
"grad_norm": 0.2708989701315342,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1479,
|
|
"step": 692
|
|
},
|
|
{
|
|
"epoch": 6.243243243243243,
|
|
"grad_norm": 0.31024609108715306,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1553,
|
|
"step": 693
|
|
},
|
|
{
|
|
"epoch": 6.252252252252252,
|
|
"grad_norm": 0.29134393470015496,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1437,
|
|
"step": 694
|
|
},
|
|
{
|
|
"epoch": 6.261261261261261,
|
|
"grad_norm": 0.2793592485054197,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1684,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 6.27027027027027,
|
|
"grad_norm": 0.30498815020407055,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1968,
|
|
"step": 696
|
|
},
|
|
{
|
|
"epoch": 6.2792792792792795,
|
|
"grad_norm": 0.2652672098205942,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1565,
|
|
"step": 697
|
|
},
|
|
{
|
|
"epoch": 6.288288288288288,
|
|
"grad_norm": 0.3018458330908521,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1726,
|
|
"step": 698
|
|
},
|
|
{
|
|
"epoch": 6.297297297297297,
|
|
"grad_norm": 0.2592172426217306,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1656,
|
|
"step": 699
|
|
},
|
|
{
|
|
"epoch": 6.306306306306306,
|
|
"grad_norm": 0.2565291008895072,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1981,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 6.315315315315315,
|
|
"grad_norm": 0.2980025331247,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1656,
|
|
"step": 701
|
|
},
|
|
{
|
|
"epoch": 6.324324324324325,
|
|
"grad_norm": 0.26039013888986284,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1656,
|
|
"step": 702
|
|
},
|
|
{
|
|
"epoch": 6.333333333333333,
|
|
"grad_norm": 0.2568935923552546,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1451,
|
|
"step": 703
|
|
},
|
|
{
|
|
"epoch": 6.342342342342342,
|
|
"grad_norm": 0.29222987592831656,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.161,
|
|
"step": 704
|
|
},
|
|
{
|
|
"epoch": 6.351351351351352,
|
|
"grad_norm": 0.2622511449178775,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1553,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 6.36036036036036,
|
|
"grad_norm": 0.2703894332854895,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1803,
|
|
"step": 706
|
|
},
|
|
{
|
|
"epoch": 6.36936936936937,
|
|
"grad_norm": 0.2516505913848481,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.16,
|
|
"step": 707
|
|
},
|
|
{
|
|
"epoch": 6.378378378378378,
|
|
"grad_norm": 0.26750256687760715,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1461,
|
|
"step": 708
|
|
},
|
|
{
|
|
"epoch": 6.387387387387387,
|
|
"grad_norm": 0.2539871109081379,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1488,
|
|
"step": 709
|
|
},
|
|
{
|
|
"epoch": 6.396396396396397,
|
|
"grad_norm": 0.2769403607227516,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1924,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 6.405405405405405,
|
|
"grad_norm": 0.2946720991492928,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1503,
|
|
"step": 711
|
|
},
|
|
{
|
|
"epoch": 6.414414414414415,
|
|
"grad_norm": 0.24458166835948247,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1855,
|
|
"step": 712
|
|
},
|
|
{
|
|
"epoch": 6.423423423423423,
|
|
"grad_norm": 0.2840232732624716,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1643,
|
|
"step": 713
|
|
},
|
|
{
|
|
"epoch": 6.4324324324324325,
|
|
"grad_norm": 0.31757015141649597,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1622,
|
|
"step": 714
|
|
},
|
|
{
|
|
"epoch": 6.441441441441442,
|
|
"grad_norm": 0.28847324036631117,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1365,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 6.45045045045045,
|
|
"grad_norm": 0.24694988398848988,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1594,
|
|
"step": 716
|
|
},
|
|
{
|
|
"epoch": 6.45945945945946,
|
|
"grad_norm": 0.29307213864672693,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1411,
|
|
"step": 717
|
|
},
|
|
{
|
|
"epoch": 6.468468468468468,
|
|
"grad_norm": 0.30163977699200506,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1649,
|
|
"step": 718
|
|
},
|
|
{
|
|
"epoch": 6.4774774774774775,
|
|
"grad_norm": 0.2854457863377953,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1918,
|
|
"step": 719
|
|
},
|
|
{
|
|
"epoch": 6.486486486486487,
|
|
"grad_norm": 0.27342932900047295,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.176,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 6.495495495495495,
|
|
"grad_norm": 0.28175783115173536,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1422,
|
|
"step": 721
|
|
},
|
|
{
|
|
"epoch": 6.504504504504505,
|
|
"grad_norm": 0.2840989876099184,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1633,
|
|
"step": 722
|
|
},
|
|
{
|
|
"epoch": 6.513513513513513,
|
|
"grad_norm": 0.2867793910350591,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1796,
|
|
"step": 723
|
|
},
|
|
{
|
|
"epoch": 6.5225225225225225,
|
|
"grad_norm": 0.28428808863989385,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1529,
|
|
"step": 724
|
|
},
|
|
{
|
|
"epoch": 6.531531531531532,
|
|
"grad_norm": 0.29279024558574074,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1392,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 6.54054054054054,
|
|
"grad_norm": 0.23321656651720726,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1472,
|
|
"step": 726
|
|
},
|
|
{
|
|
"epoch": 6.54954954954955,
|
|
"grad_norm": 0.27521328727823563,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1577,
|
|
"step": 727
|
|
},
|
|
{
|
|
"epoch": 6.558558558558558,
|
|
"grad_norm": 0.32541783429708115,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1782,
|
|
"step": 728
|
|
},
|
|
{
|
|
"epoch": 6.5675675675675675,
|
|
"grad_norm": 0.4716962818206086,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1247,
|
|
"step": 729
|
|
},
|
|
{
|
|
"epoch": 6.576576576576577,
|
|
"grad_norm": 0.26285903206886113,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1744,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 6.585585585585585,
|
|
"grad_norm": 0.30168633716148247,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1422,
|
|
"step": 731
|
|
},
|
|
{
|
|
"epoch": 6.594594594594595,
|
|
"grad_norm": 0.2745027764141301,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1672,
|
|
"step": 732
|
|
},
|
|
{
|
|
"epoch": 6.603603603603604,
|
|
"grad_norm": 0.35380479132918236,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1663,
|
|
"step": 733
|
|
},
|
|
{
|
|
"epoch": 6.612612612612613,
|
|
"grad_norm": 0.27601944134435535,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1695,
|
|
"step": 734
|
|
},
|
|
{
|
|
"epoch": 6.621621621621622,
|
|
"grad_norm": 0.2528980448808799,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2017,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 6.63063063063063,
|
|
"grad_norm": 0.30767286206094524,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1261,
|
|
"step": 736
|
|
},
|
|
{
|
|
"epoch": 6.63963963963964,
|
|
"grad_norm": 0.2602137688236013,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1636,
|
|
"step": 737
|
|
},
|
|
{
|
|
"epoch": 6.648648648648649,
|
|
"grad_norm": 0.23824598012350529,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1584,
|
|
"step": 738
|
|
},
|
|
{
|
|
"epoch": 6.657657657657658,
|
|
"grad_norm": 0.29134756631872455,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1835,
|
|
"step": 739
|
|
},
|
|
{
|
|
"epoch": 6.666666666666667,
|
|
"grad_norm": 0.25765774787058354,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1603,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 6.675675675675675,
|
|
"grad_norm": 0.2600078403016356,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1826,
|
|
"step": 741
|
|
},
|
|
{
|
|
"epoch": 6.684684684684685,
|
|
"grad_norm": 0.2617835836231004,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1464,
|
|
"step": 742
|
|
},
|
|
{
|
|
"epoch": 6.693693693693694,
|
|
"grad_norm": 0.32078684057749896,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1252,
|
|
"step": 743
|
|
},
|
|
{
|
|
"epoch": 6.702702702702703,
|
|
"grad_norm": 0.29351670808174113,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1548,
|
|
"step": 744
|
|
},
|
|
{
|
|
"epoch": 6.711711711711712,
|
|
"grad_norm": 0.30854251276850175,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.137,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 6.7207207207207205,
|
|
"grad_norm": 0.26688862536435537,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.124,
|
|
"step": 746
|
|
},
|
|
{
|
|
"epoch": 6.72972972972973,
|
|
"grad_norm": 0.44923760580414157,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1736,
|
|
"step": 747
|
|
},
|
|
{
|
|
"epoch": 6.738738738738739,
|
|
"grad_norm": 0.39218610199513526,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1418,
|
|
"step": 748
|
|
},
|
|
{
|
|
"epoch": 6.747747747747748,
|
|
"grad_norm": 0.2664334967715308,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1517,
|
|
"step": 749
|
|
},
|
|
{
|
|
"epoch": 6.756756756756757,
|
|
"grad_norm": 0.28834080697901254,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1422,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 6.7657657657657655,
|
|
"grad_norm": 0.29170783307220777,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1466,
|
|
"step": 751
|
|
},
|
|
{
|
|
"epoch": 6.774774774774775,
|
|
"grad_norm": 0.2802071333171322,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.2085,
|
|
"step": 752
|
|
},
|
|
{
|
|
"epoch": 6.783783783783784,
|
|
"grad_norm": 0.27926590125257916,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1923,
|
|
"step": 753
|
|
},
|
|
{
|
|
"epoch": 6.792792792792793,
|
|
"grad_norm": 0.26518681255237936,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1766,
|
|
"step": 754
|
|
},
|
|
{
|
|
"epoch": 6.801801801801802,
|
|
"grad_norm": 0.347354240325402,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1825,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 6.8108108108108105,
|
|
"grad_norm": 0.2783190286987182,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.169,
|
|
"step": 756
|
|
},
|
|
{
|
|
"epoch": 6.81981981981982,
|
|
"grad_norm": 0.25398065438526435,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1586,
|
|
"step": 757
|
|
},
|
|
{
|
|
"epoch": 6.828828828828829,
|
|
"grad_norm": 0.2677100544625917,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1422,
|
|
"step": 758
|
|
},
|
|
{
|
|
"epoch": 6.837837837837838,
|
|
"grad_norm": 0.25767020061357093,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1562,
|
|
"step": 759
|
|
},
|
|
{
|
|
"epoch": 6.846846846846847,
|
|
"grad_norm": 0.2741476785712207,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1672,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 6.8558558558558556,
|
|
"grad_norm": 0.3040893781704407,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1571,
|
|
"step": 761
|
|
},
|
|
{
|
|
"epoch": 6.864864864864865,
|
|
"grad_norm": 0.3190900071250576,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1424,
|
|
"step": 762
|
|
},
|
|
{
|
|
"epoch": 6.873873873873874,
|
|
"grad_norm": 0.29513021972878545,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1553,
|
|
"step": 763
|
|
},
|
|
{
|
|
"epoch": 6.882882882882883,
|
|
"grad_norm": 0.24584895713037455,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1648,
|
|
"step": 764
|
|
},
|
|
{
|
|
"epoch": 6.891891891891892,
|
|
"grad_norm": 0.29616482928194166,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1628,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 6.900900900900901,
|
|
"grad_norm": 0.2698717209619555,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1486,
|
|
"step": 766
|
|
},
|
|
{
|
|
"epoch": 6.90990990990991,
|
|
"grad_norm": 0.2842949721560408,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1533,
|
|
"step": 767
|
|
},
|
|
{
|
|
"epoch": 6.918918918918919,
|
|
"grad_norm": 0.2548658270502879,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1828,
|
|
"step": 768
|
|
},
|
|
{
|
|
"epoch": 6.927927927927928,
|
|
"grad_norm": 0.29650593537088255,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1609,
|
|
"step": 769
|
|
},
|
|
{
|
|
"epoch": 6.936936936936937,
|
|
"grad_norm": 0.28258003962728084,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1593,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 6.945945945945946,
|
|
"grad_norm": 0.2635611766361993,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1854,
|
|
"step": 771
|
|
},
|
|
{
|
|
"epoch": 6.954954954954955,
|
|
"grad_norm": 0.30424832142174796,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.146,
|
|
"step": 772
|
|
},
|
|
{
|
|
"epoch": 6.963963963963964,
|
|
"grad_norm": 0.3060917294529799,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1733,
|
|
"step": 773
|
|
},
|
|
{
|
|
"epoch": 6.972972972972973,
|
|
"grad_norm": 0.32441537080653826,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1537,
|
|
"step": 774
|
|
},
|
|
{
|
|
"epoch": 6.981981981981982,
|
|
"grad_norm": 0.2765707057627649,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1693,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 6.990990990990991,
|
|
"grad_norm": 0.2603233373640257,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1796,
|
|
"step": 776
|
|
},
|
|
{
|
|
"epoch": 7.0,
|
|
"grad_norm": 0.2781503743653767,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.184,
|
|
"step": 777
|
|
},
|
|
{
|
|
"epoch": 7.009009009009009,
|
|
"grad_norm": 0.28846916522682,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1327,
|
|
"step": 778
|
|
},
|
|
{
|
|
"epoch": 7.018018018018018,
|
|
"grad_norm": 0.2909211896726909,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1436,
|
|
"step": 779
|
|
},
|
|
{
|
|
"epoch": 7.027027027027027,
|
|
"grad_norm": 0.29059846677673873,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1264,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 7.036036036036036,
|
|
"grad_norm": 0.2589502571701869,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1433,
|
|
"step": 781
|
|
},
|
|
{
|
|
"epoch": 7.045045045045045,
|
|
"grad_norm": 0.30299652950475636,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1139,
|
|
"step": 782
|
|
},
|
|
{
|
|
"epoch": 7.054054054054054,
|
|
"grad_norm": 0.3100291804047275,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1592,
|
|
"step": 783
|
|
},
|
|
{
|
|
"epoch": 7.063063063063063,
|
|
"grad_norm": 0.2975691545809747,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.157,
|
|
"step": 784
|
|
},
|
|
{
|
|
"epoch": 7.072072072072072,
|
|
"grad_norm": 0.3068091793086541,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1232,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 7.081081081081081,
|
|
"grad_norm": 0.3089806282089307,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1221,
|
|
"step": 786
|
|
},
|
|
{
|
|
"epoch": 7.09009009009009,
|
|
"grad_norm": 0.2847038649852651,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1281,
|
|
"step": 787
|
|
},
|
|
{
|
|
"epoch": 7.099099099099099,
|
|
"grad_norm": 0.2615997547703096,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1434,
|
|
"step": 788
|
|
},
|
|
{
|
|
"epoch": 7.108108108108108,
|
|
"grad_norm": 0.2807779495824356,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1245,
|
|
"step": 789
|
|
},
|
|
{
|
|
"epoch": 7.117117117117117,
|
|
"grad_norm": 0.32154910892821653,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1398,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 7.126126126126126,
|
|
"grad_norm": 0.2909515213375792,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1617,
|
|
"step": 791
|
|
},
|
|
{
|
|
"epoch": 7.135135135135135,
|
|
"grad_norm": 0.3077346518188213,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1798,
|
|
"step": 792
|
|
},
|
|
{
|
|
"epoch": 7.1441441441441444,
|
|
"grad_norm": 0.2950118643236569,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1189,
|
|
"step": 793
|
|
},
|
|
{
|
|
"epoch": 7.153153153153153,
|
|
"grad_norm": 0.3358905154061822,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1145,
|
|
"step": 794
|
|
},
|
|
{
|
|
"epoch": 7.162162162162162,
|
|
"grad_norm": 0.2784628731056912,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1413,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 7.171171171171171,
|
|
"grad_norm": 0.332241278578818,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.141,
|
|
"step": 796
|
|
},
|
|
{
|
|
"epoch": 7.18018018018018,
|
|
"grad_norm": 0.3319493746279513,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1265,
|
|
"step": 797
|
|
},
|
|
{
|
|
"epoch": 7.1891891891891895,
|
|
"grad_norm": 0.2918711891065202,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1848,
|
|
"step": 798
|
|
},
|
|
{
|
|
"epoch": 7.198198198198198,
|
|
"grad_norm": 0.294971929143932,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1102,
|
|
"step": 799
|
|
},
|
|
{
|
|
"epoch": 7.207207207207207,
|
|
"grad_norm": 0.28582274936569596,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1364,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 7.216216216216216,
|
|
"grad_norm": 0.2659870012399625,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1402,
|
|
"step": 801
|
|
},
|
|
{
|
|
"epoch": 7.225225225225225,
|
|
"grad_norm": 0.27334027202909716,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.16,
|
|
"step": 802
|
|
},
|
|
{
|
|
"epoch": 7.2342342342342345,
|
|
"grad_norm": 0.29814746528630565,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1313,
|
|
"step": 803
|
|
},
|
|
{
|
|
"epoch": 7.243243243243243,
|
|
"grad_norm": 0.30947580830786586,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.16,
|
|
"step": 804
|
|
},
|
|
{
|
|
"epoch": 7.252252252252252,
|
|
"grad_norm": 0.38586634200713993,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1162,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 7.261261261261261,
|
|
"grad_norm": 0.24887533724067495,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1632,
|
|
"step": 806
|
|
},
|
|
{
|
|
"epoch": 7.27027027027027,
|
|
"grad_norm": 0.26959801719398596,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.152,
|
|
"step": 807
|
|
},
|
|
{
|
|
"epoch": 7.2792792792792795,
|
|
"grad_norm": 0.29035577733855455,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1283,
|
|
"step": 808
|
|
},
|
|
{
|
|
"epoch": 7.288288288288288,
|
|
"grad_norm": 0.28290473947861045,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1411,
|
|
"step": 809
|
|
},
|
|
{
|
|
"epoch": 7.297297297297297,
|
|
"grad_norm": 0.32523778027288563,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1198,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 7.306306306306306,
|
|
"grad_norm": 0.27833761872914975,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1555,
|
|
"step": 811
|
|
},
|
|
{
|
|
"epoch": 7.315315315315315,
|
|
"grad_norm": 0.3367892879965876,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1314,
|
|
"step": 812
|
|
},
|
|
{
|
|
"epoch": 7.324324324324325,
|
|
"grad_norm": 0.277313010335673,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1364,
|
|
"step": 813
|
|
},
|
|
{
|
|
"epoch": 7.333333333333333,
|
|
"grad_norm": 0.27375407683101277,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1331,
|
|
"step": 814
|
|
},
|
|
{
|
|
"epoch": 7.342342342342342,
|
|
"grad_norm": 0.26697608186333877,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1458,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 7.351351351351352,
|
|
"grad_norm": 0.2974176115893814,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1394,
|
|
"step": 816
|
|
},
|
|
{
|
|
"epoch": 7.36036036036036,
|
|
"grad_norm": 0.2764189750660692,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1421,
|
|
"step": 817
|
|
},
|
|
{
|
|
"epoch": 7.36936936936937,
|
|
"grad_norm": 0.26103798171790754,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1277,
|
|
"step": 818
|
|
},
|
|
{
|
|
"epoch": 7.378378378378378,
|
|
"grad_norm": 0.2702444951963583,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1248,
|
|
"step": 819
|
|
},
|
|
{
|
|
"epoch": 7.387387387387387,
|
|
"grad_norm": 0.3091293494651806,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.107,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 7.396396396396397,
|
|
"grad_norm": 0.27618979276963385,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1286,
|
|
"step": 821
|
|
},
|
|
{
|
|
"epoch": 7.405405405405405,
|
|
"grad_norm": 0.2850325224623505,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1524,
|
|
"step": 822
|
|
},
|
|
{
|
|
"epoch": 7.414414414414415,
|
|
"grad_norm": 0.2699678297509792,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1504,
|
|
"step": 823
|
|
},
|
|
{
|
|
"epoch": 7.423423423423423,
|
|
"grad_norm": 0.2684653976073876,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1472,
|
|
"step": 824
|
|
},
|
|
{
|
|
"epoch": 7.4324324324324325,
|
|
"grad_norm": 0.27562604313146905,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1647,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 7.441441441441442,
|
|
"grad_norm": 0.32271326740564915,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1373,
|
|
"step": 826
|
|
},
|
|
{
|
|
"epoch": 7.45045045045045,
|
|
"grad_norm": 0.3017797868640937,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1177,
|
|
"step": 827
|
|
},
|
|
{
|
|
"epoch": 7.45945945945946,
|
|
"grad_norm": 0.3746544705141892,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1266,
|
|
"step": 828
|
|
},
|
|
{
|
|
"epoch": 7.468468468468468,
|
|
"grad_norm": 0.36162531031963435,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1723,
|
|
"step": 829
|
|
},
|
|
{
|
|
"epoch": 7.4774774774774775,
|
|
"grad_norm": 0.30931420713180063,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1497,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 7.486486486486487,
|
|
"grad_norm": 0.27294359139491653,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1409,
|
|
"step": 831
|
|
},
|
|
{
|
|
"epoch": 7.495495495495495,
|
|
"grad_norm": 0.2643297412103037,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1339,
|
|
"step": 832
|
|
},
|
|
{
|
|
"epoch": 7.504504504504505,
|
|
"grad_norm": 0.27159435674152455,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1432,
|
|
"step": 833
|
|
},
|
|
{
|
|
"epoch": 7.513513513513513,
|
|
"grad_norm": 0.29349995019825675,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1431,
|
|
"step": 834
|
|
},
|
|
{
|
|
"epoch": 7.5225225225225225,
|
|
"grad_norm": 0.31813922083817525,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1237,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 7.531531531531532,
|
|
"grad_norm": 0.25431234598026253,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1231,
|
|
"step": 836
|
|
},
|
|
{
|
|
"epoch": 7.54054054054054,
|
|
"grad_norm": 0.26549876685780915,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1079,
|
|
"step": 837
|
|
},
|
|
{
|
|
"epoch": 7.54954954954955,
|
|
"grad_norm": 0.2843904679866454,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1456,
|
|
"step": 838
|
|
},
|
|
{
|
|
"epoch": 7.558558558558558,
|
|
"grad_norm": 0.2764008229289936,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1294,
|
|
"step": 839
|
|
},
|
|
{
|
|
"epoch": 7.5675675675675675,
|
|
"grad_norm": 0.27550196745644295,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1169,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 7.576576576576577,
|
|
"grad_norm": 0.2657966726978357,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1283,
|
|
"step": 841
|
|
},
|
|
{
|
|
"epoch": 7.585585585585585,
|
|
"grad_norm": 0.279726882287188,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1151,
|
|
"step": 842
|
|
},
|
|
{
|
|
"epoch": 7.594594594594595,
|
|
"grad_norm": 0.27489297214494474,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1515,
|
|
"step": 843
|
|
},
|
|
{
|
|
"epoch": 7.603603603603604,
|
|
"grad_norm": 0.277774516155618,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1503,
|
|
"step": 844
|
|
},
|
|
{
|
|
"epoch": 7.612612612612613,
|
|
"grad_norm": 0.27480641761427765,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1075,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 7.621621621621622,
|
|
"grad_norm": 0.2846350822817088,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1514,
|
|
"step": 846
|
|
},
|
|
{
|
|
"epoch": 7.63063063063063,
|
|
"grad_norm": 0.28181647241406504,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1314,
|
|
"step": 847
|
|
},
|
|
{
|
|
"epoch": 7.63963963963964,
|
|
"grad_norm": 0.3260999375199513,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.138,
|
|
"step": 848
|
|
},
|
|
{
|
|
"epoch": 7.648648648648649,
|
|
"grad_norm": 0.26191958014959504,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1258,
|
|
"step": 849
|
|
},
|
|
{
|
|
"epoch": 7.657657657657658,
|
|
"grad_norm": 0.3229999390728544,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.15,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 7.666666666666667,
|
|
"grad_norm": 0.26451999606394416,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1495,
|
|
"step": 851
|
|
},
|
|
{
|
|
"epoch": 7.675675675675675,
|
|
"grad_norm": 0.26990766634628194,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.136,
|
|
"step": 852
|
|
},
|
|
{
|
|
"epoch": 7.684684684684685,
|
|
"grad_norm": 0.3024735158151713,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1506,
|
|
"step": 853
|
|
},
|
|
{
|
|
"epoch": 7.693693693693694,
|
|
"grad_norm": 0.2924088710387093,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1327,
|
|
"step": 854
|
|
},
|
|
{
|
|
"epoch": 7.702702702702703,
|
|
"grad_norm": 0.3127786707020021,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.139,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 7.711711711711712,
|
|
"grad_norm": 0.284180680266691,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.144,
|
|
"step": 856
|
|
},
|
|
{
|
|
"epoch": 7.7207207207207205,
|
|
"grad_norm": 0.28516328758030296,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1428,
|
|
"step": 857
|
|
},
|
|
{
|
|
"epoch": 7.72972972972973,
|
|
"grad_norm": 0.2862117655255194,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1579,
|
|
"step": 858
|
|
},
|
|
{
|
|
"epoch": 7.738738738738739,
|
|
"grad_norm": 0.3006220321307271,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1603,
|
|
"step": 859
|
|
},
|
|
{
|
|
"epoch": 7.747747747747748,
|
|
"grad_norm": 0.29806627084091325,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.138,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 7.756756756756757,
|
|
"grad_norm": 0.2947193709313226,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1523,
|
|
"step": 861
|
|
},
|
|
{
|
|
"epoch": 7.7657657657657655,
|
|
"grad_norm": 0.27653789730185635,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0974,
|
|
"step": 862
|
|
},
|
|
{
|
|
"epoch": 7.774774774774775,
|
|
"grad_norm": 0.3359884991528938,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1498,
|
|
"step": 863
|
|
},
|
|
{
|
|
"epoch": 7.783783783783784,
|
|
"grad_norm": 0.3548503598800355,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1437,
|
|
"step": 864
|
|
},
|
|
{
|
|
"epoch": 7.792792792792793,
|
|
"grad_norm": 0.2847477764044701,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1393,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 7.801801801801802,
|
|
"grad_norm": 0.3065450470724209,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1575,
|
|
"step": 866
|
|
},
|
|
{
|
|
"epoch": 7.8108108108108105,
|
|
"grad_norm": 0.3012473965068844,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1032,
|
|
"step": 867
|
|
},
|
|
{
|
|
"epoch": 7.81981981981982,
|
|
"grad_norm": 0.3010781272384048,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1475,
|
|
"step": 868
|
|
},
|
|
{
|
|
"epoch": 7.828828828828829,
|
|
"grad_norm": 0.2802542506009257,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1493,
|
|
"step": 869
|
|
},
|
|
{
|
|
"epoch": 7.837837837837838,
|
|
"grad_norm": 0.257772116445583,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1712,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 7.846846846846847,
|
|
"grad_norm": 0.2619565806462764,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1511,
|
|
"step": 871
|
|
},
|
|
{
|
|
"epoch": 7.8558558558558556,
|
|
"grad_norm": 0.24721830116730928,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.11,
|
|
"step": 872
|
|
},
|
|
{
|
|
"epoch": 7.864864864864865,
|
|
"grad_norm": 0.2608797590307874,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1434,
|
|
"step": 873
|
|
},
|
|
{
|
|
"epoch": 7.873873873873874,
|
|
"grad_norm": 0.3098206461383644,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1244,
|
|
"step": 874
|
|
},
|
|
{
|
|
"epoch": 7.882882882882883,
|
|
"grad_norm": 0.2543474293117471,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1484,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 7.891891891891892,
|
|
"grad_norm": 0.275008000690447,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1866,
|
|
"step": 876
|
|
},
|
|
{
|
|
"epoch": 7.900900900900901,
|
|
"grad_norm": 0.2574884831625064,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1163,
|
|
"step": 877
|
|
},
|
|
{
|
|
"epoch": 7.90990990990991,
|
|
"grad_norm": 0.2735186605433472,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.181,
|
|
"step": 878
|
|
},
|
|
{
|
|
"epoch": 7.918918918918919,
|
|
"grad_norm": 0.274445320782787,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1501,
|
|
"step": 879
|
|
},
|
|
{
|
|
"epoch": 7.927927927927928,
|
|
"grad_norm": 0.32592488504503725,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.149,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 7.936936936936937,
|
|
"grad_norm": 0.2733852933643102,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1434,
|
|
"step": 881
|
|
},
|
|
{
|
|
"epoch": 7.945945945945946,
|
|
"grad_norm": 0.2801042118029067,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1348,
|
|
"step": 882
|
|
},
|
|
{
|
|
"epoch": 7.954954954954955,
|
|
"grad_norm": 0.27111497067523815,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1597,
|
|
"step": 883
|
|
},
|
|
{
|
|
"epoch": 7.963963963963964,
|
|
"grad_norm": 0.27209799660825884,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1521,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 7.972972972972973,
|
|
"grad_norm": 0.3930775839000158,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1559,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 7.981981981981982,
|
|
"grad_norm": 0.2678257713350625,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1541,
|
|
"step": 886
|
|
},
|
|
{
|
|
"epoch": 7.990990990990991,
|
|
"grad_norm": 0.2557726554508295,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1293,
|
|
"step": 887
|
|
},
|
|
{
|
|
"epoch": 8.0,
|
|
"grad_norm": 0.30361837110340506,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.118,
|
|
"step": 888
|
|
},
|
|
{
|
|
"epoch": 8.00900900900901,
|
|
"grad_norm": 0.3724300858125284,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.119,
|
|
"step": 889
|
|
},
|
|
{
|
|
"epoch": 8.018018018018019,
|
|
"grad_norm": 0.278602815740515,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1553,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 8.027027027027026,
|
|
"grad_norm": 0.2757271522278008,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1182,
|
|
"step": 891
|
|
},
|
|
{
|
|
"epoch": 8.036036036036036,
|
|
"grad_norm": 0.2929517079994954,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1158,
|
|
"step": 892
|
|
},
|
|
{
|
|
"epoch": 8.045045045045045,
|
|
"grad_norm": 0.36372793530477965,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1134,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 8.054054054054054,
|
|
"grad_norm": 0.2817436945034752,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0886,
|
|
"step": 894
|
|
},
|
|
{
|
|
"epoch": 8.063063063063064,
|
|
"grad_norm": 0.26617606652724307,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1196,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 8.072072072072071,
|
|
"grad_norm": 0.28867983451275775,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1194,
|
|
"step": 896
|
|
},
|
|
{
|
|
"epoch": 8.08108108108108,
|
|
"grad_norm": 0.32961696028266857,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1137,
|
|
"step": 897
|
|
},
|
|
{
|
|
"epoch": 8.09009009009009,
|
|
"grad_norm": 0.3261806017299068,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1164,
|
|
"step": 898
|
|
},
|
|
{
|
|
"epoch": 8.0990990990991,
|
|
"grad_norm": 0.3047004644766596,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1126,
|
|
"step": 899
|
|
},
|
|
{
|
|
"epoch": 8.108108108108109,
|
|
"grad_norm": 0.33711404617474894,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1321,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 8.117117117117116,
|
|
"grad_norm": 0.28781948503164106,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0933,
|
|
"step": 901
|
|
},
|
|
{
|
|
"epoch": 8.126126126126126,
|
|
"grad_norm": 0.28954810553500565,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1232,
|
|
"step": 902
|
|
},
|
|
{
|
|
"epoch": 8.135135135135135,
|
|
"grad_norm": 0.2934734159929239,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1197,
|
|
"step": 903
|
|
},
|
|
{
|
|
"epoch": 8.144144144144144,
|
|
"grad_norm": 0.2851801626338276,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1371,
|
|
"step": 904
|
|
},
|
|
{
|
|
"epoch": 8.153153153153154,
|
|
"grad_norm": 0.27920553933946246,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1166,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 8.162162162162161,
|
|
"grad_norm": 0.31343774042661315,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1351,
|
|
"step": 906
|
|
},
|
|
{
|
|
"epoch": 8.17117117117117,
|
|
"grad_norm": 0.28444182940880375,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1102,
|
|
"step": 907
|
|
},
|
|
{
|
|
"epoch": 8.18018018018018,
|
|
"grad_norm": 0.27773525186278575,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1051,
|
|
"step": 908
|
|
},
|
|
{
|
|
"epoch": 8.18918918918919,
|
|
"grad_norm": 0.2822074526087139,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1147,
|
|
"step": 909
|
|
},
|
|
{
|
|
"epoch": 8.198198198198199,
|
|
"grad_norm": 0.3445008346134781,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1627,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 8.207207207207206,
|
|
"grad_norm": 0.2948133646560585,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1243,
|
|
"step": 911
|
|
},
|
|
{
|
|
"epoch": 8.216216216216216,
|
|
"grad_norm": 0.3548137336190423,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1034,
|
|
"step": 912
|
|
},
|
|
{
|
|
"epoch": 8.225225225225225,
|
|
"grad_norm": 0.24835233982448732,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1083,
|
|
"step": 913
|
|
},
|
|
{
|
|
"epoch": 8.234234234234235,
|
|
"grad_norm": 0.28608021315126475,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0924,
|
|
"step": 914
|
|
},
|
|
{
|
|
"epoch": 8.243243243243244,
|
|
"grad_norm": 0.2489920981375781,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0732,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 8.252252252252251,
|
|
"grad_norm": 0.32648292494227393,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.101,
|
|
"step": 916
|
|
},
|
|
{
|
|
"epoch": 8.26126126126126,
|
|
"grad_norm": 0.26393238433265215,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.129,
|
|
"step": 917
|
|
},
|
|
{
|
|
"epoch": 8.27027027027027,
|
|
"grad_norm": 0.33000321897677853,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1092,
|
|
"step": 918
|
|
},
|
|
{
|
|
"epoch": 8.27927927927928,
|
|
"grad_norm": 0.36101227626943727,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1164,
|
|
"step": 919
|
|
},
|
|
{
|
|
"epoch": 8.288288288288289,
|
|
"grad_norm": 0.31480014766124803,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1042,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 8.297297297297296,
|
|
"grad_norm": 0.3411446684131361,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1165,
|
|
"step": 921
|
|
},
|
|
{
|
|
"epoch": 8.306306306306306,
|
|
"grad_norm": 0.332814645717202,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1078,
|
|
"step": 922
|
|
},
|
|
{
|
|
"epoch": 8.315315315315315,
|
|
"grad_norm": 0.5298958645427678,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.129,
|
|
"step": 923
|
|
},
|
|
{
|
|
"epoch": 8.324324324324325,
|
|
"grad_norm": 0.359036677437228,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1297,
|
|
"step": 924
|
|
},
|
|
{
|
|
"epoch": 8.333333333333334,
|
|
"grad_norm": 0.3038540419282529,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0996,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 8.342342342342342,
|
|
"grad_norm": 0.2836788711371018,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1064,
|
|
"step": 926
|
|
},
|
|
{
|
|
"epoch": 8.35135135135135,
|
|
"grad_norm": 0.2840741884034298,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1068,
|
|
"step": 927
|
|
},
|
|
{
|
|
"epoch": 8.36036036036036,
|
|
"grad_norm": 0.3606217340289843,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1161,
|
|
"step": 928
|
|
},
|
|
{
|
|
"epoch": 8.36936936936937,
|
|
"grad_norm": 0.33639738597690616,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1172,
|
|
"step": 929
|
|
},
|
|
{
|
|
"epoch": 8.378378378378379,
|
|
"grad_norm": 0.34030156462417316,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0924,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 8.387387387387387,
|
|
"grad_norm": 0.2833799915899549,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1433,
|
|
"step": 931
|
|
},
|
|
{
|
|
"epoch": 8.396396396396396,
|
|
"grad_norm": 0.29693654573168704,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1192,
|
|
"step": 932
|
|
},
|
|
{
|
|
"epoch": 8.405405405405405,
|
|
"grad_norm": 0.2834266663051093,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1289,
|
|
"step": 933
|
|
},
|
|
{
|
|
"epoch": 8.414414414414415,
|
|
"grad_norm": 0.29926603005274255,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1238,
|
|
"step": 934
|
|
},
|
|
{
|
|
"epoch": 8.423423423423424,
|
|
"grad_norm": 0.27131987409072494,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1137,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 8.432432432432432,
|
|
"grad_norm": 0.27245579434724637,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0845,
|
|
"step": 936
|
|
},
|
|
{
|
|
"epoch": 8.441441441441441,
|
|
"grad_norm": 0.29790096192104826,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1455,
|
|
"step": 937
|
|
},
|
|
{
|
|
"epoch": 8.45045045045045,
|
|
"grad_norm": 0.30112523923077406,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1243,
|
|
"step": 938
|
|
},
|
|
{
|
|
"epoch": 8.45945945945946,
|
|
"grad_norm": 0.2975979835875494,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1124,
|
|
"step": 939
|
|
},
|
|
{
|
|
"epoch": 8.468468468468469,
|
|
"grad_norm": 0.325062595930168,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.125,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 8.477477477477478,
|
|
"grad_norm": 0.316039949527555,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1423,
|
|
"step": 941
|
|
},
|
|
{
|
|
"epoch": 8.486486486486486,
|
|
"grad_norm": 0.38300001583678706,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1402,
|
|
"step": 942
|
|
},
|
|
{
|
|
"epoch": 8.495495495495495,
|
|
"grad_norm": 0.368069280096857,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1394,
|
|
"step": 943
|
|
},
|
|
{
|
|
"epoch": 8.504504504504505,
|
|
"grad_norm": 0.2836701503469006,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1212,
|
|
"step": 944
|
|
},
|
|
{
|
|
"epoch": 8.513513513513514,
|
|
"grad_norm": 0.28208846723398523,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1176,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 8.522522522522522,
|
|
"grad_norm": 0.3223580674636738,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1085,
|
|
"step": 946
|
|
},
|
|
{
|
|
"epoch": 8.531531531531531,
|
|
"grad_norm": 0.2975838495736659,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1388,
|
|
"step": 947
|
|
},
|
|
{
|
|
"epoch": 8.54054054054054,
|
|
"grad_norm": 0.2926384544838666,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1446,
|
|
"step": 948
|
|
},
|
|
{
|
|
"epoch": 8.54954954954955,
|
|
"grad_norm": 0.253328197509007,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1106,
|
|
"step": 949
|
|
},
|
|
{
|
|
"epoch": 8.558558558558559,
|
|
"grad_norm": 0.29078679203113755,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.147,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 8.567567567567568,
|
|
"grad_norm": 0.3242982227120515,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.114,
|
|
"step": 951
|
|
},
|
|
{
|
|
"epoch": 8.576576576576576,
|
|
"grad_norm": 0.31916745252108,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1467,
|
|
"step": 952
|
|
},
|
|
{
|
|
"epoch": 8.585585585585585,
|
|
"grad_norm": 0.30713744803388243,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1165,
|
|
"step": 953
|
|
},
|
|
{
|
|
"epoch": 8.594594594594595,
|
|
"grad_norm": 0.31006975320296604,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1293,
|
|
"step": 954
|
|
},
|
|
{
|
|
"epoch": 8.603603603603604,
|
|
"grad_norm": 0.2929902011566849,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1134,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 8.612612612612612,
|
|
"grad_norm": 0.26235543384430693,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1053,
|
|
"step": 956
|
|
},
|
|
{
|
|
"epoch": 8.621621621621621,
|
|
"grad_norm": 0.310009086101237,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1353,
|
|
"step": 957
|
|
},
|
|
{
|
|
"epoch": 8.63063063063063,
|
|
"grad_norm": 0.4012614816736551,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1248,
|
|
"step": 958
|
|
},
|
|
{
|
|
"epoch": 8.63963963963964,
|
|
"grad_norm": 0.2922213629865694,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1229,
|
|
"step": 959
|
|
},
|
|
{
|
|
"epoch": 8.64864864864865,
|
|
"grad_norm": 0.28311443461185315,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1309,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 8.657657657657658,
|
|
"grad_norm": 0.3882325365638355,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.134,
|
|
"step": 961
|
|
},
|
|
{
|
|
"epoch": 8.666666666666666,
|
|
"grad_norm": 0.3162063136020091,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1166,
|
|
"step": 962
|
|
},
|
|
{
|
|
"epoch": 8.675675675675675,
|
|
"grad_norm": 0.3408953528226783,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1289,
|
|
"step": 963
|
|
},
|
|
{
|
|
"epoch": 8.684684684684685,
|
|
"grad_norm": 0.2558993641119161,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0915,
|
|
"step": 964
|
|
},
|
|
{
|
|
"epoch": 8.693693693693694,
|
|
"grad_norm": 0.3240415851794722,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1283,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 8.702702702702704,
|
|
"grad_norm": 0.2802378425767391,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1462,
|
|
"step": 966
|
|
},
|
|
{
|
|
"epoch": 8.711711711711711,
|
|
"grad_norm": 0.32417830620793386,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1332,
|
|
"step": 967
|
|
},
|
|
{
|
|
"epoch": 8.72072072072072,
|
|
"grad_norm": 0.27199769909957633,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.111,
|
|
"step": 968
|
|
},
|
|
{
|
|
"epoch": 8.72972972972973,
|
|
"grad_norm": 0.3143617324505027,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1215,
|
|
"step": 969
|
|
},
|
|
{
|
|
"epoch": 8.73873873873874,
|
|
"grad_norm": 0.3294346785496534,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1054,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 8.747747747747749,
|
|
"grad_norm": 0.30155526964094,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1222,
|
|
"step": 971
|
|
},
|
|
{
|
|
"epoch": 8.756756756756756,
|
|
"grad_norm": 0.3117950940446377,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.114,
|
|
"step": 972
|
|
},
|
|
{
|
|
"epoch": 8.765765765765765,
|
|
"grad_norm": 0.3392300144846105,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1163,
|
|
"step": 973
|
|
},
|
|
{
|
|
"epoch": 8.774774774774775,
|
|
"grad_norm": 0.2977388674828091,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1386,
|
|
"step": 974
|
|
},
|
|
{
|
|
"epoch": 8.783783783783784,
|
|
"grad_norm": 0.3088070527312149,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1622,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 8.792792792792794,
|
|
"grad_norm": 0.28460989793331587,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1321,
|
|
"step": 976
|
|
},
|
|
{
|
|
"epoch": 8.801801801801801,
|
|
"grad_norm": 0.301114864385651,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1084,
|
|
"step": 977
|
|
},
|
|
{
|
|
"epoch": 8.81081081081081,
|
|
"grad_norm": 0.3080454311504172,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0958,
|
|
"step": 978
|
|
},
|
|
{
|
|
"epoch": 8.81981981981982,
|
|
"grad_norm": 0.35110800631668737,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1272,
|
|
"step": 979
|
|
},
|
|
{
|
|
"epoch": 8.82882882882883,
|
|
"grad_norm": 0.27956217271886274,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1327,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 8.837837837837839,
|
|
"grad_norm": 0.32361648737642695,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1214,
|
|
"step": 981
|
|
},
|
|
{
|
|
"epoch": 8.846846846846846,
|
|
"grad_norm": 0.33228900753392643,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0843,
|
|
"step": 982
|
|
},
|
|
{
|
|
"epoch": 8.855855855855856,
|
|
"grad_norm": 0.32323655451004957,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.117,
|
|
"step": 983
|
|
},
|
|
{
|
|
"epoch": 8.864864864864865,
|
|
"grad_norm": 0.3411596228576446,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1137,
|
|
"step": 984
|
|
},
|
|
{
|
|
"epoch": 8.873873873873874,
|
|
"grad_norm": 0.33618399554078643,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1232,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 8.882882882882884,
|
|
"grad_norm": 0.31438715504842607,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1188,
|
|
"step": 986
|
|
},
|
|
{
|
|
"epoch": 8.891891891891891,
|
|
"grad_norm": 0.29235691269480235,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0907,
|
|
"step": 987
|
|
},
|
|
{
|
|
"epoch": 8.9009009009009,
|
|
"grad_norm": 0.31569994309412647,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.139,
|
|
"step": 988
|
|
},
|
|
{
|
|
"epoch": 8.90990990990991,
|
|
"grad_norm": 0.3488819032640533,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0925,
|
|
"step": 989
|
|
},
|
|
{
|
|
"epoch": 8.91891891891892,
|
|
"grad_norm": 0.3287782461836467,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1144,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 8.927927927927929,
|
|
"grad_norm": 0.342018883415981,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1554,
|
|
"step": 991
|
|
},
|
|
{
|
|
"epoch": 8.936936936936936,
|
|
"grad_norm": 0.30922097124521764,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0773,
|
|
"step": 992
|
|
},
|
|
{
|
|
"epoch": 8.945945945945946,
|
|
"grad_norm": 0.3170605884389048,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0961,
|
|
"step": 993
|
|
},
|
|
{
|
|
"epoch": 8.954954954954955,
|
|
"grad_norm": 0.2894394712507756,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1175,
|
|
"step": 994
|
|
},
|
|
{
|
|
"epoch": 8.963963963963964,
|
|
"grad_norm": 0.3507612251253632,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.0898,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 8.972972972972974,
|
|
"grad_norm": 0.2916461407037756,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1162,
|
|
"step": 996
|
|
},
|
|
{
|
|
"epoch": 8.981981981981981,
|
|
"grad_norm": 0.3148022366204299,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1263,
|
|
"step": 997
|
|
},
|
|
{
|
|
"epoch": 8.99099099099099,
|
|
"grad_norm": 0.2917019835417808,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1088,
|
|
"step": 998
|
|
},
|
|
{
|
|
"epoch": 9.0,
|
|
"grad_norm": 0.27462543957892144,
|
|
"learning_rate": 5e-06,
|
|
"loss": 0.1158,
|
|
"step": 999
|
|
}
|
|
],
|
|
"logging_steps": 1.0,
|
|
"max_steps": 11100,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 100,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1278638161920000.0,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|