5217 lines
126 KiB
JSON
5217 lines
126 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 1479,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.004058853373921867,
|
||
|
|
"grad_norm": 1.6877160845613697,
|
||
|
|
"learning_rate": 6.756756756756757e-08,
|
||
|
|
"loss": 0.3778,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008117706747843734,
|
||
|
|
"grad_norm": 1.7651022477491063,
|
||
|
|
"learning_rate": 2.0270270270270273e-07,
|
||
|
|
"loss": 0.3539,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0121765601217656,
|
||
|
|
"grad_norm": 1.7346867297246225,
|
||
|
|
"learning_rate": 3.378378378378379e-07,
|
||
|
|
"loss": 0.3621,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.016235413495687467,
|
||
|
|
"grad_norm": 1.572133422531505,
|
||
|
|
"learning_rate": 4.7297297297297305e-07,
|
||
|
|
"loss": 0.3345,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.020294266869609334,
|
||
|
|
"grad_norm": 1.619733499528023,
|
||
|
|
"learning_rate": 6.081081081081082e-07,
|
||
|
|
"loss": 0.3408,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0243531202435312,
|
||
|
|
"grad_norm": 1.658160676313538,
|
||
|
|
"learning_rate": 7.432432432432434e-07,
|
||
|
|
"loss": 0.351,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.028411973617453068,
|
||
|
|
"grad_norm": 1.5596628408438766,
|
||
|
|
"learning_rate": 8.783783783783785e-07,
|
||
|
|
"loss": 0.3319,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.032470826991374935,
|
||
|
|
"grad_norm": 1.582923343783565,
|
||
|
|
"learning_rate": 1.0135135135135136e-06,
|
||
|
|
"loss": 0.3631,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0365296803652968,
|
||
|
|
"grad_norm": 1.6210099160307392,
|
||
|
|
"learning_rate": 1.148648648648649e-06,
|
||
|
|
"loss": 0.3369,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04058853373921867,
|
||
|
|
"grad_norm": 1.4490925925277196,
|
||
|
|
"learning_rate": 1.2837837837837838e-06,
|
||
|
|
"loss": 0.3401,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.044647387113140535,
|
||
|
|
"grad_norm": 1.5981937222734808,
|
||
|
|
"learning_rate": 1.418918918918919e-06,
|
||
|
|
"loss": 0.4064,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0487062404870624,
|
||
|
|
"grad_norm": 1.4745472664304582,
|
||
|
|
"learning_rate": 1.5540540540540541e-06,
|
||
|
|
"loss": 0.3511,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05276509386098427,
|
||
|
|
"grad_norm": 1.5094086215074392,
|
||
|
|
"learning_rate": 1.6891891891891894e-06,
|
||
|
|
"loss": 0.3303,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.056823947234906136,
|
||
|
|
"grad_norm": 1.585436570063055,
|
||
|
|
"learning_rate": 1.8243243243243245e-06,
|
||
|
|
"loss": 0.3458,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.060882800608828,
|
||
|
|
"grad_norm": 1.738359897683998,
|
||
|
|
"learning_rate": 1.9594594594594595e-06,
|
||
|
|
"loss": 0.3401,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06494165398274987,
|
||
|
|
"grad_norm": 1.5432250105335408,
|
||
|
|
"learning_rate": 2.0945945945945946e-06,
|
||
|
|
"loss": 0.341,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06900050735667174,
|
||
|
|
"grad_norm": 1.4879837682144732,
|
||
|
|
"learning_rate": 2.22972972972973e-06,
|
||
|
|
"loss": 0.333,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0730593607305936,
|
||
|
|
"grad_norm": 1.5249793712374056,
|
||
|
|
"learning_rate": 2.364864864864865e-06,
|
||
|
|
"loss": 0.3389,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07711821410451547,
|
||
|
|
"grad_norm": 1.5591368574163726,
|
||
|
|
"learning_rate": 2.5e-06,
|
||
|
|
"loss": 0.3422,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08117706747843734,
|
||
|
|
"grad_norm": 1.6074325994660499,
|
||
|
|
"learning_rate": 2.6351351351351353e-06,
|
||
|
|
"loss": 0.348,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0852359208523592,
|
||
|
|
"grad_norm": 1.517968202412236,
|
||
|
|
"learning_rate": 2.7702702702702703e-06,
|
||
|
|
"loss": 0.3376,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08929477422628107,
|
||
|
|
"grad_norm": 1.5371668709250539,
|
||
|
|
"learning_rate": 2.9054054054054054e-06,
|
||
|
|
"loss": 0.3556,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09335362760020294,
|
||
|
|
"grad_norm": 1.4812012090460671,
|
||
|
|
"learning_rate": 3.040540540540541e-06,
|
||
|
|
"loss": 0.3524,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0974124809741248,
|
||
|
|
"grad_norm": 1.5363097735419804,
|
||
|
|
"learning_rate": 3.1756756756756755e-06,
|
||
|
|
"loss": 0.3279,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10147133434804667,
|
||
|
|
"grad_norm": 1.4293275584526721,
|
||
|
|
"learning_rate": 3.310810810810811e-06,
|
||
|
|
"loss": 0.3063,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10553018772196854,
|
||
|
|
"grad_norm": 1.5566072184509325,
|
||
|
|
"learning_rate": 3.445945945945946e-06,
|
||
|
|
"loss": 0.3453,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1095890410958904,
|
||
|
|
"grad_norm": 1.411331335157791,
|
||
|
|
"learning_rate": 3.5810810810810816e-06,
|
||
|
|
"loss": 0.3151,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11364789446981227,
|
||
|
|
"grad_norm": 1.559975757164133,
|
||
|
|
"learning_rate": 3.7162162162162162e-06,
|
||
|
|
"loss": 0.3267,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11770674784373414,
|
||
|
|
"grad_norm": 1.3575124310454543,
|
||
|
|
"learning_rate": 3.851351351351352e-06,
|
||
|
|
"loss": 0.3426,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.121765601217656,
|
||
|
|
"grad_norm": 1.566074257576769,
|
||
|
|
"learning_rate": 3.986486486486487e-06,
|
||
|
|
"loss": 0.3431,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12582445459157787,
|
||
|
|
"grad_norm": 1.4663548045652957,
|
||
|
|
"learning_rate": 4.121621621621622e-06,
|
||
|
|
"loss": 0.3328,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12988330796549974,
|
||
|
|
"grad_norm": 1.4786594077137154,
|
||
|
|
"learning_rate": 4.256756756756757e-06,
|
||
|
|
"loss": 0.318,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1339421613394216,
|
||
|
|
"grad_norm": 1.4943486857884478,
|
||
|
|
"learning_rate": 4.391891891891892e-06,
|
||
|
|
"loss": 0.3457,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13800101471334347,
|
||
|
|
"grad_norm": 1.4843598946184238,
|
||
|
|
"learning_rate": 4.527027027027027e-06,
|
||
|
|
"loss": 0.329,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14205986808726534,
|
||
|
|
"grad_norm": 1.4234589903170214,
|
||
|
|
"learning_rate": 4.6621621621621625e-06,
|
||
|
|
"loss": 0.3346,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1461187214611872,
|
||
|
|
"grad_norm": 1.363410197784669,
|
||
|
|
"learning_rate": 4.797297297297297e-06,
|
||
|
|
"loss": 0.33,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15017757483510907,
|
||
|
|
"grad_norm": 1.3795399226185014,
|
||
|
|
"learning_rate": 4.932432432432433e-06,
|
||
|
|
"loss": 0.3111,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15423642820903094,
|
||
|
|
"grad_norm": 1.4630782525423722,
|
||
|
|
"learning_rate": 5.067567567567568e-06,
|
||
|
|
"loss": 0.3164,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1582952815829528,
|
||
|
|
"grad_norm": 1.535896727076301,
|
||
|
|
"learning_rate": 5.202702702702704e-06,
|
||
|
|
"loss": 0.3462,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16235413495687467,
|
||
|
|
"grad_norm": 1.3384695460866296,
|
||
|
|
"learning_rate": 5.337837837837838e-06,
|
||
|
|
"loss": 0.328,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16641298833079654,
|
||
|
|
"grad_norm": 1.564060215758407,
|
||
|
|
"learning_rate": 5.472972972972973e-06,
|
||
|
|
"loss": 0.3356,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1704718417047184,
|
||
|
|
"grad_norm": 1.5875357299856645,
|
||
|
|
"learning_rate": 5.608108108108109e-06,
|
||
|
|
"loss": 0.3317,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17453069507864027,
|
||
|
|
"grad_norm": 1.3150986310423165,
|
||
|
|
"learning_rate": 5.743243243243244e-06,
|
||
|
|
"loss": 0.3197,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17858954845256214,
|
||
|
|
"grad_norm": 1.4780350536596663,
|
||
|
|
"learning_rate": 5.8783783783783786e-06,
|
||
|
|
"loss": 0.3328,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.182648401826484,
|
||
|
|
"grad_norm": 1.3565098348860962,
|
||
|
|
"learning_rate": 6.013513513513514e-06,
|
||
|
|
"loss": 0.3081,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18670725520040587,
|
||
|
|
"grad_norm": 1.5123567866175038,
|
||
|
|
"learning_rate": 6.1486486486486495e-06,
|
||
|
|
"loss": 0.3276,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19076610857432774,
|
||
|
|
"grad_norm": 1.4884022484673987,
|
||
|
|
"learning_rate": 6.283783783783784e-06,
|
||
|
|
"loss": 0.3523,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1948249619482496,
|
||
|
|
"grad_norm": 1.601046629607006,
|
||
|
|
"learning_rate": 6.41891891891892e-06,
|
||
|
|
"loss": 0.3612,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19888381532217148,
|
||
|
|
"grad_norm": 1.479212704609007,
|
||
|
|
"learning_rate": 6.554054054054054e-06,
|
||
|
|
"loss": 0.3106,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20294266869609334,
|
||
|
|
"grad_norm": 1.4220008074976263,
|
||
|
|
"learning_rate": 6.689189189189191e-06,
|
||
|
|
"loss": 0.3441,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2070015220700152,
|
||
|
|
"grad_norm": 1.3514096746438489,
|
||
|
|
"learning_rate": 6.824324324324325e-06,
|
||
|
|
"loss": 0.334,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21106037544393708,
|
||
|
|
"grad_norm": 1.4495742654024877,
|
||
|
|
"learning_rate": 6.95945945945946e-06,
|
||
|
|
"loss": 0.3365,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21511922881785894,
|
||
|
|
"grad_norm": 1.4742675513138555,
|
||
|
|
"learning_rate": 7.0945945945945946e-06,
|
||
|
|
"loss": 0.3306,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2191780821917808,
|
||
|
|
"grad_norm": 1.5180836151687533,
|
||
|
|
"learning_rate": 7.229729729729731e-06,
|
||
|
|
"loss": 0.3435,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22323693556570268,
|
||
|
|
"grad_norm": 1.54712083151063,
|
||
|
|
"learning_rate": 7.3648648648648655e-06,
|
||
|
|
"loss": 0.3698,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22729578893962454,
|
||
|
|
"grad_norm": 1.5589085128625726,
|
||
|
|
"learning_rate": 7.500000000000001e-06,
|
||
|
|
"loss": 0.3209,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2313546423135464,
|
||
|
|
"grad_norm": 1.5013917554759992,
|
||
|
|
"learning_rate": 7.635135135135135e-06,
|
||
|
|
"loss": 0.3488,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23541349568746828,
|
||
|
|
"grad_norm": 1.4511795332039656,
|
||
|
|
"learning_rate": 7.77027027027027e-06,
|
||
|
|
"loss": 0.3506,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23947234906139014,
|
||
|
|
"grad_norm": 1.4069378206541412,
|
||
|
|
"learning_rate": 7.905405405405406e-06,
|
||
|
|
"loss": 0.3481,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.243531202435312,
|
||
|
|
"grad_norm": 1.582455691561815,
|
||
|
|
"learning_rate": 8.040540540540541e-06,
|
||
|
|
"loss": 0.3603,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24759005580923388,
|
||
|
|
"grad_norm": 1.434757897777904,
|
||
|
|
"learning_rate": 8.175675675675677e-06,
|
||
|
|
"loss": 0.3351,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25164890918315574,
|
||
|
|
"grad_norm": 1.4655011777161737,
|
||
|
|
"learning_rate": 8.31081081081081e-06,
|
||
|
|
"loss": 0.3579,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2557077625570776,
|
||
|
|
"grad_norm": 1.2773884939900029,
|
||
|
|
"learning_rate": 8.445945945945948e-06,
|
||
|
|
"loss": 0.3344,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2597666159309995,
|
||
|
|
"grad_norm": 1.566232859360701,
|
||
|
|
"learning_rate": 8.581081081081082e-06,
|
||
|
|
"loss": 0.3671,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26382546930492135,
|
||
|
|
"grad_norm": 1.600278240352609,
|
||
|
|
"learning_rate": 8.716216216216217e-06,
|
||
|
|
"loss": 0.3668,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2678843226788432,
|
||
|
|
"grad_norm": 1.4285015615854608,
|
||
|
|
"learning_rate": 8.851351351351351e-06,
|
||
|
|
"loss": 0.3701,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2719431760527651,
|
||
|
|
"grad_norm": 1.567165427429299,
|
||
|
|
"learning_rate": 8.986486486486488e-06,
|
||
|
|
"loss": 0.3718,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27600202942668695,
|
||
|
|
"grad_norm": 1.4657335810014254,
|
||
|
|
"learning_rate": 9.121621621621622e-06,
|
||
|
|
"loss": 0.3576,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2800608828006088,
|
||
|
|
"grad_norm": 1.5291620734959124,
|
||
|
|
"learning_rate": 9.256756756756757e-06,
|
||
|
|
"loss": 0.3838,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2841197361745307,
|
||
|
|
"grad_norm": 1.4553153344151037,
|
||
|
|
"learning_rate": 9.391891891891893e-06,
|
||
|
|
"loss": 0.3782,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28817858954845255,
|
||
|
|
"grad_norm": 1.406160463172771,
|
||
|
|
"learning_rate": 9.527027027027028e-06,
|
||
|
|
"loss": 0.3666,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2922374429223744,
|
||
|
|
"grad_norm": 1.5917073052853832,
|
||
|
|
"learning_rate": 9.662162162162164e-06,
|
||
|
|
"loss": 0.392,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2962962962962963,
|
||
|
|
"grad_norm": 1.511564904546133,
|
||
|
|
"learning_rate": 9.797297297297298e-06,
|
||
|
|
"loss": 0.3852,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30035514967021815,
|
||
|
|
"grad_norm": 1.4461988102226726,
|
||
|
|
"learning_rate": 9.932432432432433e-06,
|
||
|
|
"loss": 0.3947,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30441400304414,
|
||
|
|
"grad_norm": 1.5436908100507405,
|
||
|
|
"learning_rate": 9.999986072170506e-06,
|
||
|
|
"loss": 0.3778,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3084728564180619,
|
||
|
|
"grad_norm": 1.413238952587016,
|
||
|
|
"learning_rate": 9.99987465000011e-06,
|
||
|
|
"loss": 0.3634,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31253170979198375,
|
||
|
|
"grad_norm": 1.1989032000359305,
|
||
|
|
"learning_rate": 9.999651808142305e-06,
|
||
|
|
"loss": 0.3629,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3165905631659056,
|
||
|
|
"grad_norm": 1.2868027887610292,
|
||
|
|
"learning_rate": 9.999317551563011e-06,
|
||
|
|
"loss": 0.3674,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3206494165398275,
|
||
|
|
"grad_norm": 1.3871216563915811,
|
||
|
|
"learning_rate": 9.998871887710965e-06,
|
||
|
|
"loss": 0.3844,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32470826991374935,
|
||
|
|
"grad_norm": 1.532088928705796,
|
||
|
|
"learning_rate": 9.998314826517564e-06,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3287671232876712,
|
||
|
|
"grad_norm": 1.5418249912864774,
|
||
|
|
"learning_rate": 9.997646380396633e-06,
|
||
|
|
"loss": 0.3934,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3328259766615931,
|
||
|
|
"grad_norm": 1.4957436906571129,
|
||
|
|
"learning_rate": 9.996866564244158e-06,
|
||
|
|
"loss": 0.3958,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33688483003551495,
|
||
|
|
"grad_norm": 1.4005270008180681,
|
||
|
|
"learning_rate": 9.995975395437952e-06,
|
||
|
|
"loss": 0.3697,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3409436834094368,
|
||
|
|
"grad_norm": 1.3477487401644073,
|
||
|
|
"learning_rate": 9.994972893837259e-06,
|
||
|
|
"loss": 0.382,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3450025367833587,
|
||
|
|
"grad_norm": 1.4090292300223908,
|
||
|
|
"learning_rate": 9.993859081782322e-06,
|
||
|
|
"loss": 0.3989,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34906139015728055,
|
||
|
|
"grad_norm": 1.2787360296779213,
|
||
|
|
"learning_rate": 9.992633984093886e-06,
|
||
|
|
"loss": 0.3746,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3531202435312024,
|
||
|
|
"grad_norm": 1.410889902419896,
|
||
|
|
"learning_rate": 9.991297628072632e-06,
|
||
|
|
"loss": 0.3965,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3571790969051243,
|
||
|
|
"grad_norm": 1.5306391383373583,
|
||
|
|
"learning_rate": 9.98985004349858e-06,
|
||
|
|
"loss": 0.418,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36123795027904615,
|
||
|
|
"grad_norm": 1.412306543323218,
|
||
|
|
"learning_rate": 9.988291262630425e-06,
|
||
|
|
"loss": 0.3954,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.365296803652968,
|
||
|
|
"grad_norm": 1.376967456626685,
|
||
|
|
"learning_rate": 9.986621320204813e-06,
|
||
|
|
"loss": 0.3944,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3693556570268899,
|
||
|
|
"grad_norm": 1.325650682628611,
|
||
|
|
"learning_rate": 9.984840253435569e-06,
|
||
|
|
"loss": 0.396,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37341451040081175,
|
||
|
|
"grad_norm": 1.248201726411196,
|
||
|
|
"learning_rate": 9.982948102012866e-06,
|
||
|
|
"loss": 0.3783,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3774733637747336,
|
||
|
|
"grad_norm": 1.3492739121359127,
|
||
|
|
"learning_rate": 9.98094490810235e-06,
|
||
|
|
"loss": 0.4078,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3815322171486555,
|
||
|
|
"grad_norm": 1.4387342464186235,
|
||
|
|
"learning_rate": 9.978830716344185e-06,
|
||
|
|
"loss": 0.3892,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38559107052257735,
|
||
|
|
"grad_norm": 1.2509510227667138,
|
||
|
|
"learning_rate": 9.976605573852071e-06,
|
||
|
|
"loss": 0.3696,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3896499238964992,
|
||
|
|
"grad_norm": 1.3154238912694323,
|
||
|
|
"learning_rate": 9.974269530212185e-06,
|
||
|
|
"loss": 0.405,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3937087772704211,
|
||
|
|
"grad_norm": 1.5250960877423454,
|
||
|
|
"learning_rate": 9.971822637482085e-06,
|
||
|
|
"loss": 0.4135,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39776763064434295,
|
||
|
|
"grad_norm": 1.3908097291583248,
|
||
|
|
"learning_rate": 9.969264950189539e-06,
|
||
|
|
"loss": 0.4006,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4018264840182648,
|
||
|
|
"grad_norm": 1.4790280124932251,
|
||
|
|
"learning_rate": 9.966596525331324e-06,
|
||
|
|
"loss": 0.4188,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4058853373921867,
|
||
|
|
"grad_norm": 1.4233060958120591,
|
||
|
|
"learning_rate": 9.96381742237194e-06,
|
||
|
|
"loss": 0.4042,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40994419076610855,
|
||
|
|
"grad_norm": 1.3044712016817912,
|
||
|
|
"learning_rate": 9.960927703242298e-06,
|
||
|
|
"loss": 0.3956,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4140030441400304,
|
||
|
|
"grad_norm": 1.2296559721817601,
|
||
|
|
"learning_rate": 9.957927432338332e-06,
|
||
|
|
"loss": 0.3813,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4180618975139523,
|
||
|
|
"grad_norm": 1.247811043333453,
|
||
|
|
"learning_rate": 9.954816676519569e-06,
|
||
|
|
"loss": 0.3846,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42212075088787415,
|
||
|
|
"grad_norm": 1.4552540186289,
|
||
|
|
"learning_rate": 9.951595505107633e-06,
|
||
|
|
"loss": 0.3826,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.426179604261796,
|
||
|
|
"grad_norm": 1.3877999146640874,
|
||
|
|
"learning_rate": 9.948263989884708e-06,
|
||
|
|
"loss": 0.4118,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4302384576357179,
|
||
|
|
"grad_norm": 1.3634788475367725,
|
||
|
|
"learning_rate": 9.944822205091929e-06,
|
||
|
|
"loss": 0.3974,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43429731100963975,
|
||
|
|
"grad_norm": 1.2479391778044153,
|
||
|
|
"learning_rate": 9.94127022742774e-06,
|
||
|
|
"loss": 0.3784,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4383561643835616,
|
||
|
|
"grad_norm": 1.2120926044150644,
|
||
|
|
"learning_rate": 9.937608136046171e-06,
|
||
|
|
"loss": 0.3857,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4424150177574835,
|
||
|
|
"grad_norm": 1.363599562180868,
|
||
|
|
"learning_rate": 9.933836012555083e-06,
|
||
|
|
"loss": 0.4089,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44647387113140535,
|
||
|
|
"grad_norm": 1.169807478788221,
|
||
|
|
"learning_rate": 9.929953941014349e-06,
|
||
|
|
"loss": 0.3649,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4505327245053272,
|
||
|
|
"grad_norm": 1.3307716150293825,
|
||
|
|
"learning_rate": 9.925962007933975e-06,
|
||
|
|
"loss": 0.4093,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4545915778792491,
|
||
|
|
"grad_norm": 1.2092559857310445,
|
||
|
|
"learning_rate": 9.921860302272184e-06,
|
||
|
|
"loss": 0.3959,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45865043125317095,
|
||
|
|
"grad_norm": 1.40047176469619,
|
||
|
|
"learning_rate": 9.917648915433413e-06,
|
||
|
|
"loss": 0.4271,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4627092846270928,
|
||
|
|
"grad_norm": 1.2607295390446736,
|
||
|
|
"learning_rate": 9.9133279412663e-06,
|
||
|
|
"loss": 0.3963,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4667681380010147,
|
||
|
|
"grad_norm": 1.2887307875981555,
|
||
|
|
"learning_rate": 9.908897476061576e-06,
|
||
|
|
"loss": 0.4128,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47082699137493655,
|
||
|
|
"grad_norm": 1.3538240839163793,
|
||
|
|
"learning_rate": 9.904357618549925e-06,
|
||
|
|
"loss": 0.4032,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4748858447488584,
|
||
|
|
"grad_norm": 1.2106198378461424,
|
||
|
|
"learning_rate": 9.899708469899786e-06,
|
||
|
|
"loss": 0.402,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4789446981227803,
|
||
|
|
"grad_norm": 1.214556237570029,
|
||
|
|
"learning_rate": 9.894950133715094e-06,
|
||
|
|
"loss": 0.4079,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48300355149670215,
|
||
|
|
"grad_norm": 1.3315305311943295,
|
||
|
|
"learning_rate": 9.89008271603297e-06,
|
||
|
|
"loss": 0.3908,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.487062404870624,
|
||
|
|
"grad_norm": 1.2949635168468638,
|
||
|
|
"learning_rate": 9.885106325321371e-06,
|
||
|
|
"loss": 0.418,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4911212582445459,
|
||
|
|
"grad_norm": 1.3622823844909377,
|
||
|
|
"learning_rate": 9.880021072476651e-06,
|
||
|
|
"loss": 0.4136,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49518011161846776,
|
||
|
|
"grad_norm": 1.298645600673932,
|
||
|
|
"learning_rate": 9.874827070821112e-06,
|
||
|
|
"loss": 0.4037,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4992389649923896,
|
||
|
|
"grad_norm": 1.1869885168664382,
|
||
|
|
"learning_rate": 9.869524436100458e-06,
|
||
|
|
"loss": 0.3723,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5032978183663115,
|
||
|
|
"grad_norm": 1.1960510107751574,
|
||
|
|
"learning_rate": 9.864113286481237e-06,
|
||
|
|
"loss": 0.3665,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5073566717402334,
|
||
|
|
"grad_norm": 1.4192185941613773,
|
||
|
|
"learning_rate": 9.85859374254819e-06,
|
||
|
|
"loss": 0.4243,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5114155251141552,
|
||
|
|
"grad_norm": 1.2602840725571196,
|
||
|
|
"learning_rate": 9.852965927301573e-06,
|
||
|
|
"loss": 0.3945,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5154743784880771,
|
||
|
|
"grad_norm": 1.3076466906647164,
|
||
|
|
"learning_rate": 9.847229966154415e-06,
|
||
|
|
"loss": 0.4303,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.519533231861999,
|
||
|
|
"grad_norm": 1.2994672624028094,
|
||
|
|
"learning_rate": 9.841385986929716e-06,
|
||
|
|
"loss": 0.4223,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5235920852359208,
|
||
|
|
"grad_norm": 1.2718506455560323,
|
||
|
|
"learning_rate": 9.835434119857612e-06,
|
||
|
|
"loss": 0.4124,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5276509386098427,
|
||
|
|
"grad_norm": 1.233515929409816,
|
||
|
|
"learning_rate": 9.829374497572461e-06,
|
||
|
|
"loss": 0.4156,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5317097919837646,
|
||
|
|
"grad_norm": 1.192750940555937,
|
||
|
|
"learning_rate": 9.823207255109891e-06,
|
||
|
|
"loss": 0.3865,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5357686453576864,
|
||
|
|
"grad_norm": 1.2146025999616998,
|
||
|
|
"learning_rate": 9.816932529903795e-06,
|
||
|
|
"loss": 0.381,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5398274987316083,
|
||
|
|
"grad_norm": 1.254427394175359,
|
||
|
|
"learning_rate": 9.810550461783261e-06,
|
||
|
|
"loss": 0.4209,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5438863521055302,
|
||
|
|
"grad_norm": 1.2007483465859314,
|
||
|
|
"learning_rate": 9.804061192969465e-06,
|
||
|
|
"loss": 0.3935,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.547945205479452,
|
||
|
|
"grad_norm": 1.1962317702272547,
|
||
|
|
"learning_rate": 9.797464868072489e-06,
|
||
|
|
"loss": 0.4055,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5520040588533739,
|
||
|
|
"grad_norm": 1.2388775998656307,
|
||
|
|
"learning_rate": 9.790761634088108e-06,
|
||
|
|
"loss": 0.4016,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5560629122272958,
|
||
|
|
"grad_norm": 1.1565724419815788,
|
||
|
|
"learning_rate": 9.78395164039452e-06,
|
||
|
|
"loss": 0.4066,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5601217656012176,
|
||
|
|
"grad_norm": 1.287795256739133,
|
||
|
|
"learning_rate": 9.777035038749002e-06,
|
||
|
|
"loss": 0.4072,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5641806189751395,
|
||
|
|
"grad_norm": 1.1990919314621633,
|
||
|
|
"learning_rate": 9.77001198328453e-06,
|
||
|
|
"loss": 0.385,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5682394723490614,
|
||
|
|
"grad_norm": 1.1155810438092542,
|
||
|
|
"learning_rate": 9.762882630506366e-06,
|
||
|
|
"loss": 0.4138,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5722983257229832,
|
||
|
|
"grad_norm": 1.270188734252511,
|
||
|
|
"learning_rate": 9.75564713928854e-06,
|
||
|
|
"loss": 0.4108,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5763571790969051,
|
||
|
|
"grad_norm": 1.2854062183745893,
|
||
|
|
"learning_rate": 9.748305670870326e-06,
|
||
|
|
"loss": 0.4105,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.580416032470827,
|
||
|
|
"grad_norm": 1.2822199195202089,
|
||
|
|
"learning_rate": 9.740858388852652e-06,
|
||
|
|
"loss": 0.4187,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5844748858447488,
|
||
|
|
"grad_norm": 1.1789452806981648,
|
||
|
|
"learning_rate": 9.733305459194444e-06,
|
||
|
|
"loss": 0.4026,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5885337392186707,
|
||
|
|
"grad_norm": 1.2063791823863752,
|
||
|
|
"learning_rate": 9.725647050208936e-06,
|
||
|
|
"loss": 0.4194,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5925925925925926,
|
||
|
|
"grad_norm": 1.1212621894773256,
|
||
|
|
"learning_rate": 9.717883332559911e-06,
|
||
|
|
"loss": 0.4043,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5966514459665144,
|
||
|
|
"grad_norm": 1.236354295472038,
|
||
|
|
"learning_rate": 9.710014479257906e-06,
|
||
|
|
"loss": 0.4279,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6007102993404363,
|
||
|
|
"grad_norm": 1.230960872966148,
|
||
|
|
"learning_rate": 9.702040665656353e-06,
|
||
|
|
"loss": 0.417,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6047691527143582,
|
||
|
|
"grad_norm": 1.302936449552778,
|
||
|
|
"learning_rate": 9.693962069447669e-06,
|
||
|
|
"loss": 0.4399,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.60882800608828,
|
||
|
|
"grad_norm": 1.1296630845707911,
|
||
|
|
"learning_rate": 9.685778870659301e-06,
|
||
|
|
"loss": 0.4024,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6128868594622019,
|
||
|
|
"grad_norm": 1.1746225535864936,
|
||
|
|
"learning_rate": 9.677491251649711e-06,
|
||
|
|
"loss": 0.3912,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6169457128361238,
|
||
|
|
"grad_norm": 1.241320530846212,
|
||
|
|
"learning_rate": 9.669099397104314e-06,
|
||
|
|
"loss": 0.4174,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6210045662100456,
|
||
|
|
"grad_norm": 1.2219507615770004,
|
||
|
|
"learning_rate": 9.660603494031358e-06,
|
||
|
|
"loss": 0.3918,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6250634195839675,
|
||
|
|
"grad_norm": 1.1589702070871013,
|
||
|
|
"learning_rate": 9.652003731757763e-06,
|
||
|
|
"loss": 0.4157,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6291222729578894,
|
||
|
|
"grad_norm": 1.1220293339629992,
|
||
|
|
"learning_rate": 9.643300301924902e-06,
|
||
|
|
"loss": 0.4015,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6331811263318112,
|
||
|
|
"grad_norm": 1.2563582002979947,
|
||
|
|
"learning_rate": 9.634493398484319e-06,
|
||
|
|
"loss": 0.4128,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6372399797057331,
|
||
|
|
"grad_norm": 1.1888367524986483,
|
||
|
|
"learning_rate": 9.625583217693419e-06,
|
||
|
|
"loss": 0.3874,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.641298833079655,
|
||
|
|
"grad_norm": 1.1925360068598152,
|
||
|
|
"learning_rate": 9.616569958111097e-06,
|
||
|
|
"loss": 0.4219,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6453576864535768,
|
||
|
|
"grad_norm": 1.2776062593085378,
|
||
|
|
"learning_rate": 9.607453820593297e-06,
|
||
|
|
"loss": 0.4138,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6494165398274987,
|
||
|
|
"grad_norm": 1.157480079096016,
|
||
|
|
"learning_rate": 9.598235008288551e-06,
|
||
|
|
"loss": 0.4075,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6534753932014206,
|
||
|
|
"grad_norm": 1.2352282756489477,
|
||
|
|
"learning_rate": 9.58891372663345e-06,
|
||
|
|
"loss": 0.4111,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6575342465753424,
|
||
|
|
"grad_norm": 1.2837461432435215,
|
||
|
|
"learning_rate": 9.579490183348052e-06,
|
||
|
|
"loss": 0.4358,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6615930999492643,
|
||
|
|
"grad_norm": 1.172789813592292,
|
||
|
|
"learning_rate": 9.56996458843128e-06,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6656519533231862,
|
||
|
|
"grad_norm": 1.194020795966964,
|
||
|
|
"learning_rate": 9.56033715415621e-06,
|
||
|
|
"loss": 0.4075,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.669710806697108,
|
||
|
|
"grad_norm": 1.0964374769088712,
|
||
|
|
"learning_rate": 9.550608095065367e-06,
|
||
|
|
"loss": 0.4071,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6737696600710299,
|
||
|
|
"grad_norm": 1.089373021702181,
|
||
|
|
"learning_rate": 9.540777627965933e-06,
|
||
|
|
"loss": 0.3957,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6778285134449518,
|
||
|
|
"grad_norm": 1.1992667011972529,
|
||
|
|
"learning_rate": 9.53084597192491e-06,
|
||
|
|
"loss": 0.4158,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6818873668188736,
|
||
|
|
"grad_norm": 1.2172349749770106,
|
||
|
|
"learning_rate": 9.520813348264252e-06,
|
||
|
|
"loss": 0.4277,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6859462201927955,
|
||
|
|
"grad_norm": 1.2574902319962946,
|
||
|
|
"learning_rate": 9.510679980555922e-06,
|
||
|
|
"loss": 0.3995,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6900050735667174,
|
||
|
|
"grad_norm": 1.131615777672815,
|
||
|
|
"learning_rate": 9.500446094616911e-06,
|
||
|
|
"loss": 0.4005,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6940639269406392,
|
||
|
|
"grad_norm": 1.246895890559624,
|
||
|
|
"learning_rate": 9.490111918504213e-06,
|
||
|
|
"loss": 0.4169,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6981227803145611,
|
||
|
|
"grad_norm": 1.181624286199365,
|
||
|
|
"learning_rate": 9.479677682509737e-06,
|
||
|
|
"loss": 0.3986,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.702181633688483,
|
||
|
|
"grad_norm": 1.132690259540531,
|
||
|
|
"learning_rate": 9.469143619155172e-06,
|
||
|
|
"loss": 0.3923,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7062404870624048,
|
||
|
|
"grad_norm": 1.047890655047983,
|
||
|
|
"learning_rate": 9.458509963186815e-06,
|
||
|
|
"loss": 0.4043,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7102993404363267,
|
||
|
|
"grad_norm": 1.147246283887197,
|
||
|
|
"learning_rate": 9.44777695157033e-06,
|
||
|
|
"loss": 0.4066,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7143581938102486,
|
||
|
|
"grad_norm": 1.215824000969317,
|
||
|
|
"learning_rate": 9.436944823485475e-06,
|
||
|
|
"loss": 0.4146,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7184170471841704,
|
||
|
|
"grad_norm": 1.1437849010452408,
|
||
|
|
"learning_rate": 9.426013820320764e-06,
|
||
|
|
"loss": 0.4206,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7224759005580923,
|
||
|
|
"grad_norm": 1.1210034262809383,
|
||
|
|
"learning_rate": 9.414984185668097e-06,
|
||
|
|
"loss": 0.3991,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7265347539320142,
|
||
|
|
"grad_norm": 1.3628388125490938,
|
||
|
|
"learning_rate": 9.403856165317322e-06,
|
||
|
|
"loss": 0.4359,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.730593607305936,
|
||
|
|
"grad_norm": 1.236630161906545,
|
||
|
|
"learning_rate": 9.392630007250769e-06,
|
||
|
|
"loss": 0.4415,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7346524606798579,
|
||
|
|
"grad_norm": 1.1213292521942286,
|
||
|
|
"learning_rate": 9.381305961637713e-06,
|
||
|
|
"loss": 0.4219,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7387113140537798,
|
||
|
|
"grad_norm": 1.209577588106072,
|
||
|
|
"learning_rate": 9.369884280828806e-06,
|
||
|
|
"loss": 0.4308,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7427701674277016,
|
||
|
|
"grad_norm": 1.3143877988319919,
|
||
|
|
"learning_rate": 9.358365219350448e-06,
|
||
|
|
"loss": 0.4376,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7468290208016235,
|
||
|
|
"grad_norm": 1.1794072785475278,
|
||
|
|
"learning_rate": 9.346749033899121e-06,
|
||
|
|
"loss": 0.4331,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7508878741755454,
|
||
|
|
"grad_norm": 1.1813808165518036,
|
||
|
|
"learning_rate": 9.335035983335667e-06,
|
||
|
|
"loss": 0.3992,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7549467275494672,
|
||
|
|
"grad_norm": 1.1568780935799914,
|
||
|
|
"learning_rate": 9.323226328679512e-06,
|
||
|
|
"loss": 0.4044,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7590055809233891,
|
||
|
|
"grad_norm": 1.0660018515522698,
|
||
|
|
"learning_rate": 9.311320333102864e-06,
|
||
|
|
"loss": 0.3954,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.763064434297311,
|
||
|
|
"grad_norm": 1.108920689047685,
|
||
|
|
"learning_rate": 9.299318261924834e-06,
|
||
|
|
"loss": 0.3998,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7671232876712328,
|
||
|
|
"grad_norm": 1.136789158664533,
|
||
|
|
"learning_rate": 9.287220382605532e-06,
|
||
|
|
"loss": 0.4042,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7711821410451547,
|
||
|
|
"grad_norm": 1.2496770566654822,
|
||
|
|
"learning_rate": 9.275026964740101e-06,
|
||
|
|
"loss": 0.4067,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7752409944190766,
|
||
|
|
"grad_norm": 1.1996061958131852,
|
||
|
|
"learning_rate": 9.262738280052715e-06,
|
||
|
|
"loss": 0.4183,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7792998477929984,
|
||
|
|
"grad_norm": 1.2149866273575285,
|
||
|
|
"learning_rate": 9.250354602390523e-06,
|
||
|
|
"loss": 0.4409,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7833587011669203,
|
||
|
|
"grad_norm": 1.1177682668450932,
|
||
|
|
"learning_rate": 9.237876207717538e-06,
|
||
|
|
"loss": 0.4029,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7874175545408422,
|
||
|
|
"grad_norm": 1.1677530585024758,
|
||
|
|
"learning_rate": 9.225303374108503e-06,
|
||
|
|
"loss": 0.4178,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.791476407914764,
|
||
|
|
"grad_norm": 1.3678602117567324,
|
||
|
|
"learning_rate": 9.212636381742676e-06,
|
||
|
|
"loss": 0.4197,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7955352612886859,
|
||
|
|
"grad_norm": 1.189997795752436,
|
||
|
|
"learning_rate": 9.199875512897602e-06,
|
||
|
|
"loss": 0.4173,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7995941146626078,
|
||
|
|
"grad_norm": 1.186213329132832,
|
||
|
|
"learning_rate": 9.187021051942814e-06,
|
||
|
|
"loss": 0.4145,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8036529680365296,
|
||
|
|
"grad_norm": 1.203493805658719,
|
||
|
|
"learning_rate": 9.174073285333498e-06,
|
||
|
|
"loss": 0.4181,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8077118214104515,
|
||
|
|
"grad_norm": 1.175802247814532,
|
||
|
|
"learning_rate": 9.161032501604106e-06,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8117706747843734,
|
||
|
|
"grad_norm": 1.212190555266731,
|
||
|
|
"learning_rate": 9.147898991361936e-06,
|
||
|
|
"loss": 0.4076,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8158295281582952,
|
||
|
|
"grad_norm": 1.191973289112244,
|
||
|
|
"learning_rate": 9.134673047280644e-06,
|
||
|
|
"loss": 0.4233,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8198883815322171,
|
||
|
|
"grad_norm": 1.2279490044480763,
|
||
|
|
"learning_rate": 9.121354964093732e-06,
|
||
|
|
"loss": 0.4127,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.823947234906139,
|
||
|
|
"grad_norm": 1.151451234197627,
|
||
|
|
"learning_rate": 9.107945038587974e-06,
|
||
|
|
"loss": 0.4226,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8280060882800608,
|
||
|
|
"grad_norm": 1.119728494545527,
|
||
|
|
"learning_rate": 9.094443569596802e-06,
|
||
|
|
"loss": 0.4033,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8320649416539827,
|
||
|
|
"grad_norm": 1.13012343405543,
|
||
|
|
"learning_rate": 9.08085085799365e-06,
|
||
|
|
"loss": 0.4088,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8361237950279046,
|
||
|
|
"grad_norm": 1.159098363094475,
|
||
|
|
"learning_rate": 9.067167206685248e-06,
|
||
|
|
"loss": 0.4124,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8401826484018264,
|
||
|
|
"grad_norm": 1.2347110694455659,
|
||
|
|
"learning_rate": 9.05339292060487e-06,
|
||
|
|
"loss": 0.434,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8442415017757483,
|
||
|
|
"grad_norm": 1.2402415983547357,
|
||
|
|
"learning_rate": 9.039528306705543e-06,
|
||
|
|
"loss": 0.425,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8483003551496702,
|
||
|
|
"grad_norm": 1.1683847145500172,
|
||
|
|
"learning_rate": 9.025573673953201e-06,
|
||
|
|
"loss": 0.4423,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.852359208523592,
|
||
|
|
"grad_norm": 0.9967973069250277,
|
||
|
|
"learning_rate": 9.011529333319804e-06,
|
||
|
|
"loss": 0.3987,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8564180618975139,
|
||
|
|
"grad_norm": 1.0663571132874041,
|
||
|
|
"learning_rate": 8.997395597776404e-06,
|
||
|
|
"loss": 0.3908,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8604769152714358,
|
||
|
|
"grad_norm": 1.1585376506062766,
|
||
|
|
"learning_rate": 8.98317278228618e-06,
|
||
|
|
"loss": 0.4055,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8645357686453576,
|
||
|
|
"grad_norm": 1.3272038292907982,
|
||
|
|
"learning_rate": 8.96886120379741e-06,
|
||
|
|
"loss": 0.4241,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8685946220192795,
|
||
|
|
"grad_norm": 1.1134457404863736,
|
||
|
|
"learning_rate": 8.954461181236406e-06,
|
||
|
|
"loss": 0.4343,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8726534753932014,
|
||
|
|
"grad_norm": 1.152191927893708,
|
||
|
|
"learning_rate": 8.939973035500418e-06,
|
||
|
|
"loss": 0.4012,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8767123287671232,
|
||
|
|
"grad_norm": 1.157272959329367,
|
||
|
|
"learning_rate": 8.925397089450473e-06,
|
||
|
|
"loss": 0.4116,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8807711821410451,
|
||
|
|
"grad_norm": 1.1617646168179858,
|
||
|
|
"learning_rate": 8.910733667904186e-06,
|
||
|
|
"loss": 0.4128,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.884830035514967,
|
||
|
|
"grad_norm": 1.2116957635700267,
|
||
|
|
"learning_rate": 8.895983097628515e-06,
|
||
|
|
"loss": 0.4332,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8888888888888888,
|
||
|
|
"grad_norm": 1.1950006191376203,
|
||
|
|
"learning_rate": 8.88114570733249e-06,
|
||
|
|
"loss": 0.4005,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8929477422628107,
|
||
|
|
"grad_norm": 1.1470604791719932,
|
||
|
|
"learning_rate": 8.866221827659876e-06,
|
||
|
|
"loss": 0.4233,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8970065956367326,
|
||
|
|
"grad_norm": 1.2230557240685258,
|
||
|
|
"learning_rate": 8.851211791181813e-06,
|
||
|
|
"loss": 0.4133,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9010654490106544,
|
||
|
|
"grad_norm": 1.2680481957203948,
|
||
|
|
"learning_rate": 8.8361159323894e-06,
|
||
|
|
"loss": 0.447,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9051243023845763,
|
||
|
|
"grad_norm": 1.095625464598396,
|
||
|
|
"learning_rate": 8.820934587686247e-06,
|
||
|
|
"loss": 0.3884,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9091831557584982,
|
||
|
|
"grad_norm": 1.1796975421785947,
|
||
|
|
"learning_rate": 8.805668095380969e-06,
|
||
|
|
"loss": 0.4139,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.91324200913242,
|
||
|
|
"grad_norm": 1.2337117620848044,
|
||
|
|
"learning_rate": 8.790316795679654e-06,
|
||
|
|
"loss": 0.4258,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9173008625063419,
|
||
|
|
"grad_norm": 1.060326310057752,
|
||
|
|
"learning_rate": 8.774881030678284e-06,
|
||
|
|
"loss": 0.4039,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9213597158802638,
|
||
|
|
"grad_norm": 1.0818729217545202,
|
||
|
|
"learning_rate": 8.759361144355103e-06,
|
||
|
|
"loss": 0.4186,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9254185692541856,
|
||
|
|
"grad_norm": 1.2188647624805096,
|
||
|
|
"learning_rate": 8.74375748256296e-06,
|
||
|
|
"loss": 0.43,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9294774226281075,
|
||
|
|
"grad_norm": 1.1517012313266344,
|
||
|
|
"learning_rate": 8.728070393021595e-06,
|
||
|
|
"loss": 0.3952,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9335362760020294,
|
||
|
|
"grad_norm": 1.146308857460623,
|
||
|
|
"learning_rate": 8.712300225309894e-06,
|
||
|
|
"loss": 0.419,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9375951293759512,
|
||
|
|
"grad_norm": 1.1733285567505642,
|
||
|
|
"learning_rate": 8.6964473308581e-06,
|
||
|
|
"loss": 0.4295,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9416539827498731,
|
||
|
|
"grad_norm": 1.2133986892308575,
|
||
|
|
"learning_rate": 8.680512062939976e-06,
|
||
|
|
"loss": 0.3994,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.945712836123795,
|
||
|
|
"grad_norm": 1.2860321510839698,
|
||
|
|
"learning_rate": 8.664494776664942e-06,
|
||
|
|
"loss": 0.4305,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9497716894977168,
|
||
|
|
"grad_norm": 1.222015638603744,
|
||
|
|
"learning_rate": 8.64839582897015e-06,
|
||
|
|
"loss": 0.4247,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9538305428716387,
|
||
|
|
"grad_norm": 1.1303294072263912,
|
||
|
|
"learning_rate": 8.63221557861254e-06,
|
||
|
|
"loss": 0.414,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9578893962455606,
|
||
|
|
"grad_norm": 1.1883694044651687,
|
||
|
|
"learning_rate": 8.615954386160836e-06,
|
||
|
|
"loss": 0.3944,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9619482496194824,
|
||
|
|
"grad_norm": 1.0686371570768038,
|
||
|
|
"learning_rate": 8.599612613987522e-06,
|
||
|
|
"loss": 0.4138,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9660071029934043,
|
||
|
|
"grad_norm": 1.1520582178885161,
|
||
|
|
"learning_rate": 8.583190626260754e-06,
|
||
|
|
"loss": 0.408,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9700659563673262,
|
||
|
|
"grad_norm": 1.2111448095961146,
|
||
|
|
"learning_rate": 8.566688788936254e-06,
|
||
|
|
"loss": 0.4326,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.974124809741248,
|
||
|
|
"grad_norm": 1.0960113471423047,
|
||
|
|
"learning_rate": 8.550107469749159e-06,
|
||
|
|
"loss": 0.4095,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9781836631151699,
|
||
|
|
"grad_norm": 1.2031487005930193,
|
||
|
|
"learning_rate": 8.533447038205805e-06,
|
||
|
|
"loss": 0.4019,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9822425164890918,
|
||
|
|
"grad_norm": 1.0541006609815473,
|
||
|
|
"learning_rate": 8.516707865575515e-06,
|
||
|
|
"loss": 0.4301,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9863013698630136,
|
||
|
|
"grad_norm": 1.1625544941021624,
|
||
|
|
"learning_rate": 8.499890324882323e-06,
|
||
|
|
"loss": 0.3998,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9903602232369355,
|
||
|
|
"grad_norm": 1.3066175946513412,
|
||
|
|
"learning_rate": 8.482994790896645e-06,
|
||
|
|
"loss": 0.4422,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9944190766108574,
|
||
|
|
"grad_norm": 1.0701571633478897,
|
||
|
|
"learning_rate": 8.466021640126946e-06,
|
||
|
|
"loss": 0.4122,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9984779299847792,
|
||
|
|
"grad_norm": 1.167166516481942,
|
||
|
|
"learning_rate": 8.448971250811337e-06,
|
||
|
|
"loss": 0.4137,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.002029426686961,
|
||
|
|
"grad_norm": 1.2941849102817737,
|
||
|
|
"learning_rate": 8.431844002909153e-06,
|
||
|
|
"loss": 0.3068,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0060882800608828,
|
||
|
|
"grad_norm": 1.111084517678234,
|
||
|
|
"learning_rate": 8.414640278092485e-06,
|
||
|
|
"loss": 0.2196,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0101471334348047,
|
||
|
|
"grad_norm": 1.0562648719597976,
|
||
|
|
"learning_rate": 8.397360459737673e-06,
|
||
|
|
"loss": 0.214,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0142059868087265,
|
||
|
|
"grad_norm": 1.1997363186960575,
|
||
|
|
"learning_rate": 8.38000493291676e-06,
|
||
|
|
"loss": 0.1968,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0182648401826484,
|
||
|
|
"grad_norm": 1.173481743208335,
|
||
|
|
"learning_rate": 8.362574084388921e-06,
|
||
|
|
"loss": 0.2037,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0223236935565703,
|
||
|
|
"grad_norm": 1.0170814335662817,
|
||
|
|
"learning_rate": 8.34506830259183e-06,
|
||
|
|
"loss": 0.1732,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0263825469304921,
|
||
|
|
"grad_norm": 0.997455021719729,
|
||
|
|
"learning_rate": 8.327487977633013e-06,
|
||
|
|
"loss": 0.198,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.030441400304414,
|
||
|
|
"grad_norm": 1.0596243435147559,
|
||
|
|
"learning_rate": 8.309833501281159e-06,
|
||
|
|
"loss": 0.1968,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0345002536783359,
|
||
|
|
"grad_norm": 1.1329929736584996,
|
||
|
|
"learning_rate": 8.292105266957372e-06,
|
||
|
|
"loss": 0.2058,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0385591070522577,
|
||
|
|
"grad_norm": 1.0617871718782863,
|
||
|
|
"learning_rate": 8.274303669726427e-06,
|
||
|
|
"loss": 0.1837,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0426179604261796,
|
||
|
|
"grad_norm": 1.0158373026810432,
|
||
|
|
"learning_rate": 8.256429106287944e-06,
|
||
|
|
"loss": 0.1937,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0466768138001015,
|
||
|
|
"grad_norm": 1.1124406488041407,
|
||
|
|
"learning_rate": 8.238481974967567e-06,
|
||
|
|
"loss": 0.2044,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0507356671740233,
|
||
|
|
"grad_norm": 1.0954828147640017,
|
||
|
|
"learning_rate": 8.220462675708075e-06,
|
||
|
|
"loss": 0.2025,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0547945205479452,
|
||
|
|
"grad_norm": 1.0243819070320326,
|
||
|
|
"learning_rate": 8.202371610060471e-06,
|
||
|
|
"loss": 0.1944,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.058853373921867,
|
||
|
|
"grad_norm": 1.0760742610687821,
|
||
|
|
"learning_rate": 8.184209181175038e-06,
|
||
|
|
"loss": 0.1949,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.062912227295789,
|
||
|
|
"grad_norm": 1.020386464750481,
|
||
|
|
"learning_rate": 8.165975793792355e-06,
|
||
|
|
"loss": 0.1923,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0669710806697108,
|
||
|
|
"grad_norm": 1.1029549076667262,
|
||
|
|
"learning_rate": 8.14767185423427e-06,
|
||
|
|
"loss": 0.18,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0710299340436327,
|
||
|
|
"grad_norm": 1.1869108297914424,
|
||
|
|
"learning_rate": 8.129297770394855e-06,
|
||
|
|
"loss": 0.199,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0750887874175545,
|
||
|
|
"grad_norm": 1.042008597384453,
|
||
|
|
"learning_rate": 8.11085395173131e-06,
|
||
|
|
"loss": 0.1758,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0791476407914764,
|
||
|
|
"grad_norm": 1.0824678674361556,
|
||
|
|
"learning_rate": 8.092340809254844e-06,
|
||
|
|
"loss": 0.183,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0832064941653983,
|
||
|
|
"grad_norm": 1.1733425605990007,
|
||
|
|
"learning_rate": 8.073758755521506e-06,
|
||
|
|
"loss": 0.2001,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0872653475393201,
|
||
|
|
"grad_norm": 0.989241357527303,
|
||
|
|
"learning_rate": 8.055108204623001e-06,
|
||
|
|
"loss": 0.1854,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.091324200913242,
|
||
|
|
"grad_norm": 1.105028315498873,
|
||
|
|
"learning_rate": 8.03638957217746e-06,
|
||
|
|
"loss": 0.1887,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0953830542871639,
|
||
|
|
"grad_norm": 1.1215793382714723,
|
||
|
|
"learning_rate": 8.017603275320176e-06,
|
||
|
|
"loss": 0.206,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0994419076610857,
|
||
|
|
"grad_norm": 1.0430299772389053,
|
||
|
|
"learning_rate": 7.998749732694308e-06,
|
||
|
|
"loss": 0.1852,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1035007610350076,
|
||
|
|
"grad_norm": 1.0539243906524998,
|
||
|
|
"learning_rate": 7.979829364441555e-06,
|
||
|
|
"loss": 0.1792,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1075596144089295,
|
||
|
|
"grad_norm": 1.056850242292317,
|
||
|
|
"learning_rate": 7.960842592192792e-06,
|
||
|
|
"loss": 0.1914,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1116184677828513,
|
||
|
|
"grad_norm": 1.0273529138082944,
|
||
|
|
"learning_rate": 7.94178983905867e-06,
|
||
|
|
"loss": 0.1947,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1156773211567732,
|
||
|
|
"grad_norm": 1.0677655866471754,
|
||
|
|
"learning_rate": 7.922671529620192e-06,
|
||
|
|
"loss": 0.1901,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.119736174530695,
|
||
|
|
"grad_norm": 1.012213849308213,
|
||
|
|
"learning_rate": 7.903488089919253e-06,
|
||
|
|
"loss": 0.1732,
|
||
|
|
"step": 552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.123795027904617,
|
||
|
|
"grad_norm": 1.1676973953753516,
|
||
|
|
"learning_rate": 7.88423994744914e-06,
|
||
|
|
"loss": 0.2106,
|
||
|
|
"step": 554
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1278538812785388,
|
||
|
|
"grad_norm": 1.0599154280202072,
|
||
|
|
"learning_rate": 7.864927531145012e-06,
|
||
|
|
"loss": 0.1868,
|
||
|
|
"step": 556
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1319127346524607,
|
||
|
|
"grad_norm": 1.0897459769656754,
|
||
|
|
"learning_rate": 7.845551271374333e-06,
|
||
|
|
"loss": 0.1814,
|
||
|
|
"step": 558
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1359715880263825,
|
||
|
|
"grad_norm": 1.055142014684741,
|
||
|
|
"learning_rate": 7.82611159992729e-06,
|
||
|
|
"loss": 0.1851,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1400304414003044,
|
||
|
|
"grad_norm": 1.1116965314079303,
|
||
|
|
"learning_rate": 7.80660895000717e-06,
|
||
|
|
"loss": 0.196,
|
||
|
|
"step": 562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1440892947742263,
|
||
|
|
"grad_norm": 1.039510707609459,
|
||
|
|
"learning_rate": 7.787043756220698e-06,
|
||
|
|
"loss": 0.1721,
|
||
|
|
"step": 564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1481481481481481,
|
||
|
|
"grad_norm": 1.128546678780832,
|
||
|
|
"learning_rate": 7.767416454568358e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"step": 566
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.15220700152207,
|
||
|
|
"grad_norm": 1.1259620179696028,
|
||
|
|
"learning_rate": 7.747727482434679e-06,
|
||
|
|
"loss": 0.2007,
|
||
|
|
"step": 568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1562658548959919,
|
||
|
|
"grad_norm": 1.1097229809191436,
|
||
|
|
"learning_rate": 7.727977278578484e-06,
|
||
|
|
"loss": 0.1881,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1603247082699137,
|
||
|
|
"grad_norm": 1.0616464097857343,
|
||
|
|
"learning_rate": 7.708166283123118e-06,
|
||
|
|
"loss": 0.1945,
|
||
|
|
"step": 572
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1643835616438356,
|
||
|
|
"grad_norm": 1.073617478066992,
|
||
|
|
"learning_rate": 7.68829493754663e-06,
|
||
|
|
"loss": 0.1858,
|
||
|
|
"step": 574
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1684424150177575,
|
||
|
|
"grad_norm": 1.0887445235919726,
|
||
|
|
"learning_rate": 7.668363684671947e-06,
|
||
|
|
"loss": 0.1857,
|
||
|
|
"step": 576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1725012683916793,
|
||
|
|
"grad_norm": 1.0401775398806878,
|
||
|
|
"learning_rate": 7.648372968656995e-06,
|
||
|
|
"loss": 0.1786,
|
||
|
|
"step": 578
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1765601217656012,
|
||
|
|
"grad_norm": 1.072786873168531,
|
||
|
|
"learning_rate": 7.628323234984806e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.180618975139523,
|
||
|
|
"grad_norm": 1.183804677665548,
|
||
|
|
"learning_rate": 7.608214930453597e-06,
|
||
|
|
"loss": 0.2032,
|
||
|
|
"step": 582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.184677828513445,
|
||
|
|
"grad_norm": 1.1546921624510742,
|
||
|
|
"learning_rate": 7.588048503166801e-06,
|
||
|
|
"loss": 0.1933,
|
||
|
|
"step": 584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1887366818873668,
|
||
|
|
"grad_norm": 1.0646260835850125,
|
||
|
|
"learning_rate": 7.5678244025230894e-06,
|
||
|
|
"loss": 0.1842,
|
||
|
|
"step": 586
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1927955352612887,
|
||
|
|
"grad_norm": 0.9351171981377732,
|
||
|
|
"learning_rate": 7.547543079206355e-06,
|
||
|
|
"loss": 0.1711,
|
||
|
|
"step": 588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1968543886352105,
|
||
|
|
"grad_norm": 1.1893988642652746,
|
||
|
|
"learning_rate": 7.5272049851756716e-06,
|
||
|
|
"loss": 0.2027,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2009132420091324,
|
||
|
|
"grad_norm": 1.0632981222064524,
|
||
|
|
"learning_rate": 7.506810573655215e-06,
|
||
|
|
"loss": 0.1852,
|
||
|
|
"step": 592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2049720953830543,
|
||
|
|
"grad_norm": 1.0836002498537225,
|
||
|
|
"learning_rate": 7.486360299124169e-06,
|
||
|
|
"loss": 0.1887,
|
||
|
|
"step": 594
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2090309487569761,
|
||
|
|
"grad_norm": 1.0213871056780877,
|
||
|
|
"learning_rate": 7.4658546173066005e-06,
|
||
|
|
"loss": 0.1826,
|
||
|
|
"step": 596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.213089802130898,
|
||
|
|
"grad_norm": 0.9528373737399318,
|
||
|
|
"learning_rate": 7.445293985161296e-06,
|
||
|
|
"loss": 0.1722,
|
||
|
|
"step": 598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2171486555048199,
|
||
|
|
"grad_norm": 0.9267772302163672,
|
||
|
|
"learning_rate": 7.424678860871584e-06,
|
||
|
|
"loss": 0.1754,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2212075088787417,
|
||
|
|
"grad_norm": 1.0580239859843474,
|
||
|
|
"learning_rate": 7.404009703835121e-06,
|
||
|
|
"loss": 0.1828,
|
||
|
|
"step": 602
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2252663622526636,
|
||
|
|
"grad_norm": 1.1609412830600723,
|
||
|
|
"learning_rate": 7.383286974653659e-06,
|
||
|
|
"loss": 0.2043,
|
||
|
|
"step": 604
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2293252156265855,
|
||
|
|
"grad_norm": 1.2441637701281891,
|
||
|
|
"learning_rate": 7.362511135122779e-06,
|
||
|
|
"loss": 0.2,
|
||
|
|
"step": 606
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2333840690005073,
|
||
|
|
"grad_norm": 1.0712250042029285,
|
||
|
|
"learning_rate": 7.341682648221591e-06,
|
||
|
|
"loss": 0.1823,
|
||
|
|
"step": 608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2374429223744292,
|
||
|
|
"grad_norm": 0.9995902931065666,
|
||
|
|
"learning_rate": 7.320801978102434e-06,
|
||
|
|
"loss": 0.1826,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.241501775748351,
|
||
|
|
"grad_norm": 1.1066959207293212,
|
||
|
|
"learning_rate": 7.299869590080524e-06,
|
||
|
|
"loss": 0.1916,
|
||
|
|
"step": 612
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.245560629122273,
|
||
|
|
"grad_norm": 1.102809387398261,
|
||
|
|
"learning_rate": 7.278885950623578e-06,
|
||
|
|
"loss": 0.2034,
|
||
|
|
"step": 614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2496194824961948,
|
||
|
|
"grad_norm": 1.2015908572580698,
|
||
|
|
"learning_rate": 7.257851527341429e-06,
|
||
|
|
"loss": 0.2007,
|
||
|
|
"step": 616
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2536783358701167,
|
||
|
|
"grad_norm": 1.0215395009781163,
|
||
|
|
"learning_rate": 7.236766788975603e-06,
|
||
|
|
"loss": 0.1926,
|
||
|
|
"step": 618
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2577371892440385,
|
||
|
|
"grad_norm": 0.9684806459895816,
|
||
|
|
"learning_rate": 7.215632205388872e-06,
|
||
|
|
"loss": 0.1738,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2617960426179604,
|
||
|
|
"grad_norm": 1.1014687188825973,
|
||
|
|
"learning_rate": 7.19444824755478e-06,
|
||
|
|
"loss": 0.1895,
|
||
|
|
"step": 622
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2658548959918823,
|
||
|
|
"grad_norm": 1.0685183294149176,
|
||
|
|
"learning_rate": 7.173215387547155e-06,
|
||
|
|
"loss": 0.1798,
|
||
|
|
"step": 624
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2699137493658041,
|
||
|
|
"grad_norm": 1.100155763152369,
|
||
|
|
"learning_rate": 7.151934098529583e-06,
|
||
|
|
"loss": 0.1876,
|
||
|
|
"step": 626
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.273972602739726,
|
||
|
|
"grad_norm": 1.1041782673624663,
|
||
|
|
"learning_rate": 7.130604854744871e-06,
|
||
|
|
"loss": 0.1959,
|
||
|
|
"step": 628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2780314561136479,
|
||
|
|
"grad_norm": 1.1670239082453848,
|
||
|
|
"learning_rate": 7.109228131504465e-06,
|
||
|
|
"loss": 0.2055,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2820903094875697,
|
||
|
|
"grad_norm": 1.0829456391928898,
|
||
|
|
"learning_rate": 7.087804405177876e-06,
|
||
|
|
"loss": 0.1866,
|
||
|
|
"step": 632
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2861491628614916,
|
||
|
|
"grad_norm": 1.0004546162300938,
|
||
|
|
"learning_rate": 7.066334153182049e-06,
|
||
|
|
"loss": 0.1805,
|
||
|
|
"step": 634
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2902080162354135,
|
||
|
|
"grad_norm": 1.032451555529876,
|
||
|
|
"learning_rate": 7.044817853970732e-06,
|
||
|
|
"loss": 0.1866,
|
||
|
|
"step": 636
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2942668696093353,
|
||
|
|
"grad_norm": 1.0864837314090992,
|
||
|
|
"learning_rate": 7.023255987023813e-06,
|
||
|
|
"loss": 0.182,
|
||
|
|
"step": 638
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2983257229832572,
|
||
|
|
"grad_norm": 1.0459756892568486,
|
||
|
|
"learning_rate": 7.001649032836631e-06,
|
||
|
|
"loss": 0.1863,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.302384576357179,
|
||
|
|
"grad_norm": 1.0601944062820794,
|
||
|
|
"learning_rate": 6.9799974729092765e-06,
|
||
|
|
"loss": 0.1732,
|
||
|
|
"step": 642
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.306443429731101,
|
||
|
|
"grad_norm": 1.018697234233559,
|
||
|
|
"learning_rate": 6.958301789735853e-06,
|
||
|
|
"loss": 0.1763,
|
||
|
|
"step": 644
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3105022831050228,
|
||
|
|
"grad_norm": 1.2251116288030364,
|
||
|
|
"learning_rate": 6.936562466793724e-06,
|
||
|
|
"loss": 0.21,
|
||
|
|
"step": 646
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3145611364789447,
|
||
|
|
"grad_norm": 1.0662349795156443,
|
||
|
|
"learning_rate": 6.914779988532755e-06,
|
||
|
|
"loss": 0.1889,
|
||
|
|
"step": 648
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3186199898528665,
|
||
|
|
"grad_norm": 1.1235644124407285,
|
||
|
|
"learning_rate": 6.892954840364493e-06,
|
||
|
|
"loss": 0.2028,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3226788432267884,
|
||
|
|
"grad_norm": 1.074570291741179,
|
||
|
|
"learning_rate": 6.871087508651373e-06,
|
||
|
|
"loss": 0.1884,
|
||
|
|
"step": 652
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3267376966007103,
|
||
|
|
"grad_norm": 1.0531061994655868,
|
||
|
|
"learning_rate": 6.8491784806958616e-06,
|
||
|
|
"loss": 0.2021,
|
||
|
|
"step": 654
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3307965499746321,
|
||
|
|
"grad_norm": 1.1819152398440131,
|
||
|
|
"learning_rate": 6.827228244729609e-06,
|
||
|
|
"loss": 0.1932,
|
||
|
|
"step": 656
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.334855403348554,
|
||
|
|
"grad_norm": 1.045483939181271,
|
||
|
|
"learning_rate": 6.805237289902565e-06,
|
||
|
|
"loss": 0.1965,
|
||
|
|
"step": 658
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3389142567224759,
|
||
|
|
"grad_norm": 1.1758637342179898,
|
||
|
|
"learning_rate": 6.783206106272076e-06,
|
||
|
|
"loss": 0.198,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3429731100963977,
|
||
|
|
"grad_norm": 1.0914064475419278,
|
||
|
|
"learning_rate": 6.761135184791969e-06,
|
||
|
|
"loss": 0.1846,
|
||
|
|
"step": 662
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3470319634703196,
|
||
|
|
"grad_norm": 1.0211966623620905,
|
||
|
|
"learning_rate": 6.7390250173016104e-06,
|
||
|
|
"loss": 0.181,
|
||
|
|
"step": 664
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3510908168442415,
|
||
|
|
"grad_norm": 1.193565468654654,
|
||
|
|
"learning_rate": 6.716876096514944e-06,
|
||
|
|
"loss": 0.2095,
|
||
|
|
"step": 666
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3551496702181633,
|
||
|
|
"grad_norm": 1.1271722792849745,
|
||
|
|
"learning_rate": 6.694688916009505e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"step": 668
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3592085235920852,
|
||
|
|
"grad_norm": 1.1098782394361217,
|
||
|
|
"learning_rate": 6.672463970215436e-06,
|
||
|
|
"loss": 0.1961,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.363267376966007,
|
||
|
|
"grad_norm": 1.1374613038031431,
|
||
|
|
"learning_rate": 6.650201754404455e-06,
|
||
|
|
"loss": 0.1836,
|
||
|
|
"step": 672
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.367326230339929,
|
||
|
|
"grad_norm": 1.0341949368176346,
|
||
|
|
"learning_rate": 6.627902764678824e-06,
|
||
|
|
"loss": 0.1881,
|
||
|
|
"step": 674
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3713850837138508,
|
||
|
|
"grad_norm": 1.109962989096539,
|
||
|
|
"learning_rate": 6.605567497960295e-06,
|
||
|
|
"loss": 0.1803,
|
||
|
|
"step": 676
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3754439370877727,
|
||
|
|
"grad_norm": 1.107735146712493,
|
||
|
|
"learning_rate": 6.583196451979031e-06,
|
||
|
|
"loss": 0.1917,
|
||
|
|
"step": 678
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3795027904616946,
|
||
|
|
"grad_norm": 1.1579886280607274,
|
||
|
|
"learning_rate": 6.560790125262524e-06,
|
||
|
|
"loss": 0.1979,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3835616438356164,
|
||
|
|
"grad_norm": 0.878075470155148,
|
||
|
|
"learning_rate": 6.538349017124472e-06,
|
||
|
|
"loss": 0.1631,
|
||
|
|
"step": 682
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3876204972095383,
|
||
|
|
"grad_norm": 1.062317827656781,
|
||
|
|
"learning_rate": 6.515873627653663e-06,
|
||
|
|
"loss": 0.1808,
|
||
|
|
"step": 684
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3916793505834602,
|
||
|
|
"grad_norm": 1.0327124357250628,
|
||
|
|
"learning_rate": 6.493364457702831e-06,
|
||
|
|
"loss": 0.1799,
|
||
|
|
"step": 686
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.395738203957382,
|
||
|
|
"grad_norm": 1.132131135132688,
|
||
|
|
"learning_rate": 6.470822008877482e-06,
|
||
|
|
"loss": 0.1822,
|
||
|
|
"step": 688
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3997970573313039,
|
||
|
|
"grad_norm": 1.0462034604519863,
|
||
|
|
"learning_rate": 6.448246783524734e-06,
|
||
|
|
"loss": 0.1919,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4038559107052258,
|
||
|
|
"grad_norm": 1.1044742895931243,
|
||
|
|
"learning_rate": 6.42563928472211e-06,
|
||
|
|
"loss": 0.1851,
|
||
|
|
"step": 692
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4079147640791476,
|
||
|
|
"grad_norm": 1.1674244148460076,
|
||
|
|
"learning_rate": 6.403000016266326e-06,
|
||
|
|
"loss": 0.1866,
|
||
|
|
"step": 694
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4119736174530695,
|
||
|
|
"grad_norm": 1.1296402785524131,
|
||
|
|
"learning_rate": 6.380329482662078e-06,
|
||
|
|
"loss": 0.2035,
|
||
|
|
"step": 696
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4160324708269914,
|
||
|
|
"grad_norm": 1.0524131174312268,
|
||
|
|
"learning_rate": 6.35762818911078e-06,
|
||
|
|
"loss": 0.1717,
|
||
|
|
"step": 698
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4200913242009132,
|
||
|
|
"grad_norm": 1.1191774371111942,
|
||
|
|
"learning_rate": 6.334896641499324e-06,
|
||
|
|
"loss": 0.178,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.424150177574835,
|
||
|
|
"grad_norm": 1.047752132697504,
|
||
|
|
"learning_rate": 6.312135346388793e-06,
|
||
|
|
"loss": 0.1881,
|
||
|
|
"step": 702
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.428209030948757,
|
||
|
|
"grad_norm": 1.1743382728694667,
|
||
|
|
"learning_rate": 6.289344811003184e-06,
|
||
|
|
"loss": 0.2033,
|
||
|
|
"step": 704
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4322678843226788,
|
||
|
|
"grad_norm": 1.1804585608436726,
|
||
|
|
"learning_rate": 6.2665255432180916e-06,
|
||
|
|
"loss": 0.1931,
|
||
|
|
"step": 706
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4363267376966007,
|
||
|
|
"grad_norm": 1.0677096903056138,
|
||
|
|
"learning_rate": 6.2436780515494035e-06,
|
||
|
|
"loss": 0.1837,
|
||
|
|
"step": 708
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4403855910705226,
|
||
|
|
"grad_norm": 1.2099132702699213,
|
||
|
|
"learning_rate": 6.2208028451419575e-06,
|
||
|
|
"loss": 0.2112,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4444444444444444,
|
||
|
|
"grad_norm": 1.1837036949871973,
|
||
|
|
"learning_rate": 6.197900433758205e-06,
|
||
|
|
"loss": 0.2021,
|
||
|
|
"step": 712
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4485032978183663,
|
||
|
|
"grad_norm": 1.167969784888959,
|
||
|
|
"learning_rate": 6.174971327766842e-06,
|
||
|
|
"loss": 0.1958,
|
||
|
|
"step": 714
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4525621511922882,
|
||
|
|
"grad_norm": 1.0584127834879178,
|
||
|
|
"learning_rate": 6.1520160381314465e-06,
|
||
|
|
"loss": 0.1854,
|
||
|
|
"step": 716
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.45662100456621,
|
||
|
|
"grad_norm": 1.0674665549424147,
|
||
|
|
"learning_rate": 6.129035076399077e-06,
|
||
|
|
"loss": 0.1896,
|
||
|
|
"step": 718
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4606798579401319,
|
||
|
|
"grad_norm": 1.1061278306008033,
|
||
|
|
"learning_rate": 6.106028954688892e-06,
|
||
|
|
"loss": 0.1903,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4647387113140538,
|
||
|
|
"grad_norm": 1.0435415673333681,
|
||
|
|
"learning_rate": 6.082998185680718e-06,
|
||
|
|
"loss": 0.1872,
|
||
|
|
"step": 722
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4687975646879756,
|
||
|
|
"grad_norm": 1.058891200904623,
|
||
|
|
"learning_rate": 6.059943282603642e-06,
|
||
|
|
"loss": 0.1983,
|
||
|
|
"step": 724
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4728564180618975,
|
||
|
|
"grad_norm": 1.0795684120382831,
|
||
|
|
"learning_rate": 6.03686475922456e-06,
|
||
|
|
"loss": 0.178,
|
||
|
|
"step": 726
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4769152714358194,
|
||
|
|
"grad_norm": 1.1384310108536333,
|
||
|
|
"learning_rate": 6.013763129836739e-06,
|
||
|
|
"loss": 0.1874,
|
||
|
|
"step": 728
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4809741248097412,
|
||
|
|
"grad_norm": 1.1039777651990725,
|
||
|
|
"learning_rate": 5.990638909248352e-06,
|
||
|
|
"loss": 0.1941,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.485032978183663,
|
||
|
|
"grad_norm": 1.0762677800080636,
|
||
|
|
"learning_rate": 5.967492612770999e-06,
|
||
|
|
"loss": 0.1869,
|
||
|
|
"step": 732
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.489091831557585,
|
||
|
|
"grad_norm": 1.0660512751481621,
|
||
|
|
"learning_rate": 5.944324756208238e-06,
|
||
|
|
"loss": 0.1807,
|
||
|
|
"step": 734
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4931506849315068,
|
||
|
|
"grad_norm": 1.0935996132707635,
|
||
|
|
"learning_rate": 5.92113585584408e-06,
|
||
|
|
"loss": 0.1945,
|
||
|
|
"step": 736
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4972095383054287,
|
||
|
|
"grad_norm": 1.1460959395776262,
|
||
|
|
"learning_rate": 5.897926428431485e-06,
|
||
|
|
"loss": 0.193,
|
||
|
|
"step": 738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5012683916793506,
|
||
|
|
"grad_norm": 1.1965912612959004,
|
||
|
|
"learning_rate": 5.87469699118085e-06,
|
||
|
|
"loss": 0.1941,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5053272450532724,
|
||
|
|
"grad_norm": 1.1926255825530645,
|
||
|
|
"learning_rate": 5.851448061748477e-06,
|
||
|
|
"loss": 0.1954,
|
||
|
|
"step": 742
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5093860984271943,
|
||
|
|
"grad_norm": 1.0050356960035571,
|
||
|
|
"learning_rate": 5.828180158225047e-06,
|
||
|
|
"loss": 0.1812,
|
||
|
|
"step": 744
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5134449518011162,
|
||
|
|
"grad_norm": 1.0259967073861873,
|
||
|
|
"learning_rate": 5.804893799124068e-06,
|
||
|
|
"loss": 0.1892,
|
||
|
|
"step": 746
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.517503805175038,
|
||
|
|
"grad_norm": 1.1502053032831951,
|
||
|
|
"learning_rate": 5.7815895033703164e-06,
|
||
|
|
"loss": 0.1965,
|
||
|
|
"step": 748
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.52156265854896,
|
||
|
|
"grad_norm": 1.2448018070646114,
|
||
|
|
"learning_rate": 5.758267790288282e-06,
|
||
|
|
"loss": 0.2082,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5256215119228818,
|
||
|
|
"grad_norm": 1.0310538568503425,
|
||
|
|
"learning_rate": 5.734929179590593e-06,
|
||
|
|
"loss": 0.1801,
|
||
|
|
"step": 752
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5296803652968036,
|
||
|
|
"grad_norm": 1.0835597986581949,
|
||
|
|
"learning_rate": 5.711574191366427e-06,
|
||
|
|
"loss": 0.1807,
|
||
|
|
"step": 754
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5337392186707255,
|
||
|
|
"grad_norm": 1.1039579795978836,
|
||
|
|
"learning_rate": 5.6882033460699294e-06,
|
||
|
|
"loss": 0.1934,
|
||
|
|
"step": 756
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5377980720446474,
|
||
|
|
"grad_norm": 1.1564614770433477,
|
||
|
|
"learning_rate": 5.664817164508614e-06,
|
||
|
|
"loss": 0.183,
|
||
|
|
"step": 758
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5418569254185692,
|
||
|
|
"grad_norm": 1.1933306807050015,
|
||
|
|
"learning_rate": 5.641416167831752e-06,
|
||
|
|
"loss": 0.1983,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.545915778792491,
|
||
|
|
"grad_norm": 1.1642784370439,
|
||
|
|
"learning_rate": 5.618000877518767e-06,
|
||
|
|
"loss": 0.205,
|
||
|
|
"step": 762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.549974632166413,
|
||
|
|
"grad_norm": 1.096750111637783,
|
||
|
|
"learning_rate": 5.594571815367602e-06,
|
||
|
|
"loss": 0.1871,
|
||
|
|
"step": 764
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5540334855403348,
|
||
|
|
"grad_norm": 0.8896315598965203,
|
||
|
|
"learning_rate": 5.5711295034831034e-06,
|
||
|
|
"loss": 0.1588,
|
||
|
|
"step": 766
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5580923389142567,
|
||
|
|
"grad_norm": 1.021696941589894,
|
||
|
|
"learning_rate": 5.547674464265384e-06,
|
||
|
|
"loss": 0.1885,
|
||
|
|
"step": 768
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5621511922881786,
|
||
|
|
"grad_norm": 1.0760610238279678,
|
||
|
|
"learning_rate": 5.524207220398169e-06,
|
||
|
|
"loss": 0.1844,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5662100456621004,
|
||
|
|
"grad_norm": 1.0146299892564568,
|
||
|
|
"learning_rate": 5.500728294837168e-06,
|
||
|
|
"loss": 0.1717,
|
||
|
|
"step": 772
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5702688990360223,
|
||
|
|
"grad_norm": 1.1203690278420046,
|
||
|
|
"learning_rate": 5.477238210798406e-06,
|
||
|
|
"loss": 0.1816,
|
||
|
|
"step": 774
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5743277524099442,
|
||
|
|
"grad_norm": 1.213922753663776,
|
||
|
|
"learning_rate": 5.453737491746572e-06,
|
||
|
|
"loss": 0.1956,
|
||
|
|
"step": 776
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.578386605783866,
|
||
|
|
"grad_norm": 1.0789723847536306,
|
||
|
|
"learning_rate": 5.430226661383348e-06,
|
||
|
|
"loss": 0.1831,
|
||
|
|
"step": 778
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.582445459157788,
|
||
|
|
"grad_norm": 1.0165965114615476,
|
||
|
|
"learning_rate": 5.406706243635742e-06,
|
||
|
|
"loss": 0.1859,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5865043125317098,
|
||
|
|
"grad_norm": 0.9244907929973665,
|
||
|
|
"learning_rate": 5.383176762644416e-06,
|
||
|
|
"loss": 0.1799,
|
||
|
|
"step": 782
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5905631659056316,
|
||
|
|
"grad_norm": 1.015459762165936,
|
||
|
|
"learning_rate": 5.359638742751994e-06,
|
||
|
|
"loss": 0.1859,
|
||
|
|
"step": 784
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5946220192795535,
|
||
|
|
"grad_norm": 1.119032836602815,
|
||
|
|
"learning_rate": 5.3360927084913925e-06,
|
||
|
|
"loss": 0.1949,
|
||
|
|
"step": 786
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5986808726534754,
|
||
|
|
"grad_norm": 0.9834799242339374,
|
||
|
|
"learning_rate": 5.312539184574123e-06,
|
||
|
|
"loss": 0.1795,
|
||
|
|
"step": 788
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6027397260273972,
|
||
|
|
"grad_norm": 1.1257597437225455,
|
||
|
|
"learning_rate": 5.288978695878596e-06,
|
||
|
|
"loss": 0.1842,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.606798579401319,
|
||
|
|
"grad_norm": 1.1490293546501014,
|
||
|
|
"learning_rate": 5.265411767438432e-06,
|
||
|
|
"loss": 0.1892,
|
||
|
|
"step": 792
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.610857432775241,
|
||
|
|
"grad_norm": 1.096272375625098,
|
||
|
|
"learning_rate": 5.241838924430757e-06,
|
||
|
|
"loss": 0.1857,
|
||
|
|
"step": 794
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6149162861491628,
|
||
|
|
"grad_norm": 0.9881695704441573,
|
||
|
|
"learning_rate": 5.2182606921645e-06,
|
||
|
|
"loss": 0.1839,
|
||
|
|
"step": 796
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6189751395230847,
|
||
|
|
"grad_norm": 1.0055701217382587,
|
||
|
|
"learning_rate": 5.194677596068689e-06,
|
||
|
|
"loss": 0.1974,
|
||
|
|
"step": 798
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6230339928970066,
|
||
|
|
"grad_norm": 1.0445320687597668,
|
||
|
|
"learning_rate": 5.171090161680736e-06,
|
||
|
|
"loss": 0.186,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6270928462709284,
|
||
|
|
"grad_norm": 1.010532137401968,
|
||
|
|
"learning_rate": 5.1474989146347355e-06,
|
||
|
|
"loss": 0.1818,
|
||
|
|
"step": 802
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6311516996448503,
|
||
|
|
"grad_norm": 1.059728950180328,
|
||
|
|
"learning_rate": 5.1239043806497365e-06,
|
||
|
|
"loss": 0.1878,
|
||
|
|
"step": 804
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6352105530187722,
|
||
|
|
"grad_norm": 1.02492281938429,
|
||
|
|
"learning_rate": 5.100307085518046e-06,
|
||
|
|
"loss": 0.1792,
|
||
|
|
"step": 806
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.639269406392694,
|
||
|
|
"grad_norm": 1.1092364608711534,
|
||
|
|
"learning_rate": 5.076707555093491e-06,
|
||
|
|
"loss": 0.1816,
|
||
|
|
"step": 808
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.643328259766616,
|
||
|
|
"grad_norm": 0.9816745515421457,
|
||
|
|
"learning_rate": 5.053106315279721e-06,
|
||
|
|
"loss": 0.2025,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6473871131405378,
|
||
|
|
"grad_norm": 1.0000356438781097,
|
||
|
|
"learning_rate": 5.029503892018472e-06,
|
||
|
|
"loss": 0.1669,
|
||
|
|
"step": 812
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6514459665144596,
|
||
|
|
"grad_norm": 1.0450858430582273,
|
||
|
|
"learning_rate": 5.005900811277856e-06,
|
||
|
|
"loss": 0.1802,
|
||
|
|
"step": 814
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6555048198883815,
|
||
|
|
"grad_norm": 0.9390205529074375,
|
||
|
|
"learning_rate": 4.982297599040633e-06,
|
||
|
|
"loss": 0.1636,
|
||
|
|
"step": 816
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6595636732623034,
|
||
|
|
"grad_norm": 1.154620857603639,
|
||
|
|
"learning_rate": 4.958694781292496e-06,
|
||
|
|
"loss": 0.1923,
|
||
|
|
"step": 818
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6636225266362252,
|
||
|
|
"grad_norm": 1.1872938869192748,
|
||
|
|
"learning_rate": 4.935092884010347e-06,
|
||
|
|
"loss": 0.1873,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.667681380010147,
|
||
|
|
"grad_norm": 1.0599792451386443,
|
||
|
|
"learning_rate": 4.911492433150573e-06,
|
||
|
|
"loss": 0.1809,
|
||
|
|
"step": 822
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.671740233384069,
|
||
|
|
"grad_norm": 1.1108344863089323,
|
||
|
|
"learning_rate": 4.887893954637335e-06,
|
||
|
|
"loss": 0.1864,
|
||
|
|
"step": 824
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6757990867579908,
|
||
|
|
"grad_norm": 1.058121443964045,
|
||
|
|
"learning_rate": 4.86429797435083e-06,
|
||
|
|
"loss": 0.1766,
|
||
|
|
"step": 826
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6798579401319127,
|
||
|
|
"grad_norm": 1.1323101168080565,
|
||
|
|
"learning_rate": 4.840705018115595e-06,
|
||
|
|
"loss": 0.1808,
|
||
|
|
"step": 828
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6839167935058346,
|
||
|
|
"grad_norm": 1.128025551519256,
|
||
|
|
"learning_rate": 4.8171156116887725e-06,
|
||
|
|
"loss": 0.1757,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6879756468797564,
|
||
|
|
"grad_norm": 1.0934679444423028,
|
||
|
|
"learning_rate": 4.7935302807483965e-06,
|
||
|
|
"loss": 0.1924,
|
||
|
|
"step": 832
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6920345002536783,
|
||
|
|
"grad_norm": 1.0760501447523048,
|
||
|
|
"learning_rate": 4.769949550881687e-06,
|
||
|
|
"loss": 0.1902,
|
||
|
|
"step": 834
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6960933536276002,
|
||
|
|
"grad_norm": 1.0550473463955812,
|
||
|
|
"learning_rate": 4.746373947573325e-06,
|
||
|
|
"loss": 0.1787,
|
||
|
|
"step": 836
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.700152207001522,
|
||
|
|
"grad_norm": 1.2275660092618677,
|
||
|
|
"learning_rate": 4.722803996193753e-06,
|
||
|
|
"loss": 0.197,
|
||
|
|
"step": 838
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.704211060375444,
|
||
|
|
"grad_norm": 1.0505064696078903,
|
||
|
|
"learning_rate": 4.699240221987461e-06,
|
||
|
|
"loss": 0.1819,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7082699137493658,
|
||
|
|
"grad_norm": 1.1523046961319277,
|
||
|
|
"learning_rate": 4.6756831500612846e-06,
|
||
|
|
"loss": 0.1888,
|
||
|
|
"step": 842
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7123287671232876,
|
||
|
|
"grad_norm": 0.9989431343495883,
|
||
|
|
"learning_rate": 4.652133305372705e-06,
|
||
|
|
"loss": 0.1727,
|
||
|
|
"step": 844
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7163876204972095,
|
||
|
|
"grad_norm": 1.0740718139978316,
|
||
|
|
"learning_rate": 4.628591212718144e-06,
|
||
|
|
"loss": 0.1756,
|
||
|
|
"step": 846
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7204464738711314,
|
||
|
|
"grad_norm": 1.1041425692480016,
|
||
|
|
"learning_rate": 4.605057396721275e-06,
|
||
|
|
"loss": 0.1741,
|
||
|
|
"step": 848
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7245053272450532,
|
||
|
|
"grad_norm": 1.212666044737014,
|
||
|
|
"learning_rate": 4.58153238182133e-06,
|
||
|
|
"loss": 0.1841,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.728564180618975,
|
||
|
|
"grad_norm": 1.0783964750466963,
|
||
|
|
"learning_rate": 4.558016692261412e-06,
|
||
|
|
"loss": 0.1698,
|
||
|
|
"step": 852
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.732623033992897,
|
||
|
|
"grad_norm": 1.0683744846668402,
|
||
|
|
"learning_rate": 4.534510852076817e-06,
|
||
|
|
"loss": 0.1886,
|
||
|
|
"step": 854
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7366818873668188,
|
||
|
|
"grad_norm": 1.1011269960255068,
|
||
|
|
"learning_rate": 4.511015385083345e-06,
|
||
|
|
"loss": 0.1945,
|
||
|
|
"step": 856
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7407407407407407,
|
||
|
|
"grad_norm": 0.9991671395240459,
|
||
|
|
"learning_rate": 4.487530814865646e-06,
|
||
|
|
"loss": 0.1824,
|
||
|
|
"step": 858
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7447995941146626,
|
||
|
|
"grad_norm": 1.0566808884558716,
|
||
|
|
"learning_rate": 4.464057664765532e-06,
|
||
|
|
"loss": 0.1823,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7488584474885844,
|
||
|
|
"grad_norm": 1.0940203228626781,
|
||
|
|
"learning_rate": 4.440596457870327e-06,
|
||
|
|
"loss": 0.1834,
|
||
|
|
"step": 862
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7529173008625063,
|
||
|
|
"grad_norm": 1.0234332049105062,
|
||
|
|
"learning_rate": 4.417147717001205e-06,
|
||
|
|
"loss": 0.1746,
|
||
|
|
"step": 864
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7569761542364282,
|
||
|
|
"grad_norm": 0.9623367599486023,
|
||
|
|
"learning_rate": 4.393711964701541e-06,
|
||
|
|
"loss": 0.1682,
|
||
|
|
"step": 866
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.76103500761035,
|
||
|
|
"grad_norm": 1.0516978200243972,
|
||
|
|
"learning_rate": 4.37028972322527e-06,
|
||
|
|
"loss": 0.1786,
|
||
|
|
"step": 868
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.765093860984272,
|
||
|
|
"grad_norm": 1.1391069464012384,
|
||
|
|
"learning_rate": 4.346881514525236e-06,
|
||
|
|
"loss": 0.1791,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7691527143581938,
|
||
|
|
"grad_norm": 0.971499677774941,
|
||
|
|
"learning_rate": 4.323487860241582e-06,
|
||
|
|
"loss": 0.1672,
|
||
|
|
"step": 872
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7732115677321156,
|
||
|
|
"grad_norm": 1.1577835890912351,
|
||
|
|
"learning_rate": 4.3001092816901055e-06,
|
||
|
|
"loss": 0.1854,
|
||
|
|
"step": 874
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7772704211060375,
|
||
|
|
"grad_norm": 1.1217645675230743,
|
||
|
|
"learning_rate": 4.2767462998506485e-06,
|
||
|
|
"loss": 0.1823,
|
||
|
|
"step": 876
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7813292744799594,
|
||
|
|
"grad_norm": 1.1190282031824559,
|
||
|
|
"learning_rate": 4.253399435355492e-06,
|
||
|
|
"loss": 0.1895,
|
||
|
|
"step": 878
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7853881278538812,
|
||
|
|
"grad_norm": 1.0134907069750374,
|
||
|
|
"learning_rate": 4.230069208477745e-06,
|
||
|
|
"loss": 0.175,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.789446981227803,
|
||
|
|
"grad_norm": 1.1494619938574746,
|
||
|
|
"learning_rate": 4.206756139119762e-06,
|
||
|
|
"loss": 0.1953,
|
||
|
|
"step": 882
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.793505834601725,
|
||
|
|
"grad_norm": 0.9248356141218419,
|
||
|
|
"learning_rate": 4.183460746801546e-06,
|
||
|
|
"loss": 0.1702,
|
||
|
|
"step": 884
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7975646879756468,
|
||
|
|
"grad_norm": 1.0725930962377248,
|
||
|
|
"learning_rate": 4.160183550649176e-06,
|
||
|
|
"loss": 0.1778,
|
||
|
|
"step": 886
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8016235413495687,
|
||
|
|
"grad_norm": 1.0788894008577279,
|
||
|
|
"learning_rate": 4.136925069383243e-06,
|
||
|
|
"loss": 0.1917,
|
||
|
|
"step": 888
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8056823947234906,
|
||
|
|
"grad_norm": 1.0122516476461982,
|
||
|
|
"learning_rate": 4.113685821307282e-06,
|
||
|
|
"loss": 0.1898,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8097412480974124,
|
||
|
|
"grad_norm": 1.027424121449119,
|
||
|
|
"learning_rate": 4.090466324296228e-06,
|
||
|
|
"loss": 0.1822,
|
||
|
|
"step": 892
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8138001014713343,
|
||
|
|
"grad_norm": 1.1269393961404834,
|
||
|
|
"learning_rate": 4.067267095784871e-06,
|
||
|
|
"loss": 0.1841,
|
||
|
|
"step": 894
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8178589548452562,
|
||
|
|
"grad_norm": 1.0052196803723334,
|
||
|
|
"learning_rate": 4.044088652756332e-06,
|
||
|
|
"loss": 0.1629,
|
||
|
|
"step": 896
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.821917808219178,
|
||
|
|
"grad_norm": 1.079578577258494,
|
||
|
|
"learning_rate": 4.020931511730533e-06,
|
||
|
|
"loss": 0.1774,
|
||
|
|
"step": 898
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8259766615931,
|
||
|
|
"grad_norm": 0.9587436391914074,
|
||
|
|
"learning_rate": 3.997796188752695e-06,
|
||
|
|
"loss": 0.1733,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8300355149670218,
|
||
|
|
"grad_norm": 0.9992614549374934,
|
||
|
|
"learning_rate": 3.974683199381836e-06,
|
||
|
|
"loss": 0.1685,
|
||
|
|
"step": 902
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8340943683409436,
|
||
|
|
"grad_norm": 0.9418897276184947,
|
||
|
|
"learning_rate": 3.951593058679276e-06,
|
||
|
|
"loss": 0.1672,
|
||
|
|
"step": 904
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8381532217148655,
|
||
|
|
"grad_norm": 1.1397268358795776,
|
||
|
|
"learning_rate": 3.928526281197169e-06,
|
||
|
|
"loss": 0.1749,
|
||
|
|
"step": 906
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8422120750887874,
|
||
|
|
"grad_norm": 1.0440206163216095,
|
||
|
|
"learning_rate": 3.905483380967027e-06,
|
||
|
|
"loss": 0.1722,
|
||
|
|
"step": 908
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8462709284627092,
|
||
|
|
"grad_norm": 1.048561547401053,
|
||
|
|
"learning_rate": 3.882464871488273e-06,
|
||
|
|
"loss": 0.1693,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.850329781836631,
|
||
|
|
"grad_norm": 1.0284223905418497,
|
||
|
|
"learning_rate": 3.859471265716791e-06,
|
||
|
|
"loss": 0.1691,
|
||
|
|
"step": 912
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.854388635210553,
|
||
|
|
"grad_norm": 1.004974372673609,
|
||
|
|
"learning_rate": 3.836503076053501e-06,
|
||
|
|
"loss": 0.1751,
|
||
|
|
"step": 914
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8584474885844748,
|
||
|
|
"grad_norm": 1.1435852033856233,
|
||
|
|
"learning_rate": 3.8135608143329404e-06,
|
||
|
|
"loss": 0.1809,
|
||
|
|
"step": 916
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8625063419583967,
|
||
|
|
"grad_norm": 0.9996592509283232,
|
||
|
|
"learning_rate": 3.7906449918118493e-06,
|
||
|
|
"loss": 0.1696,
|
||
|
|
"step": 918
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8665651953323186,
|
||
|
|
"grad_norm": 1.0632454270482863,
|
||
|
|
"learning_rate": 3.7677561191577873e-06,
|
||
|
|
"loss": 0.17,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8706240487062404,
|
||
|
|
"grad_norm": 1.026248873958979,
|
||
|
|
"learning_rate": 3.7448947064377496e-06,
|
||
|
|
"loss": 0.1768,
|
||
|
|
"step": 922
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8746829020801623,
|
||
|
|
"grad_norm": 1.0006033609765281,
|
||
|
|
"learning_rate": 3.722061263106797e-06,
|
||
|
|
"loss": 0.1712,
|
||
|
|
"step": 924
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8787417554540842,
|
||
|
|
"grad_norm": 1.0394166532597735,
|
||
|
|
"learning_rate": 3.699256297996714e-06,
|
||
|
|
"loss": 0.1802,
|
||
|
|
"step": 926
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.882800608828006,
|
||
|
|
"grad_norm": 1.1087386752635604,
|
||
|
|
"learning_rate": 3.6764803193046538e-06,
|
||
|
|
"loss": 0.1787,
|
||
|
|
"step": 928
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.886859462201928,
|
||
|
|
"grad_norm": 1.1539792806225302,
|
||
|
|
"learning_rate": 3.6537338345818273e-06,
|
||
|
|
"loss": 0.177,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8909183155758498,
|
||
|
|
"grad_norm": 1.1358496184900775,
|
||
|
|
"learning_rate": 3.6310173507221884e-06,
|
||
|
|
"loss": 0.1784,
|
||
|
|
"step": 932
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8949771689497716,
|
||
|
|
"grad_norm": 0.9648197271266891,
|
||
|
|
"learning_rate": 3.6083313739511316e-06,
|
||
|
|
"loss": 0.1613,
|
||
|
|
"step": 934
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8990360223236935,
|
||
|
|
"grad_norm": 1.069217067686545,
|
||
|
|
"learning_rate": 3.5856764098142207e-06,
|
||
|
|
"loss": 0.1722,
|
||
|
|
"step": 936
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9030948756976154,
|
||
|
|
"grad_norm": 0.9827567009351711,
|
||
|
|
"learning_rate": 3.563052963165915e-06,
|
||
|
|
"loss": 0.1619,
|
||
|
|
"step": 938
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9071537290715372,
|
||
|
|
"grad_norm": 1.0416626747952469,
|
||
|
|
"learning_rate": 3.5404615381583264e-06,
|
||
|
|
"loss": 0.1786,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.911212582445459,
|
||
|
|
"grad_norm": 0.9796952362181767,
|
||
|
|
"learning_rate": 3.5179026382299752e-06,
|
||
|
|
"loss": 0.1635,
|
||
|
|
"step": 942
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.915271435819381,
|
||
|
|
"grad_norm": 1.0913636067798673,
|
||
|
|
"learning_rate": 3.4953767660945825e-06,
|
||
|
|
"loss": 0.1849,
|
||
|
|
"step": 944
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9193302891933028,
|
||
|
|
"grad_norm": 1.054155532699976,
|
||
|
|
"learning_rate": 3.472884423729861e-06,
|
||
|
|
"loss": 0.1824,
|
||
|
|
"step": 946
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9233891425672247,
|
||
|
|
"grad_norm": 1.1299730992487989,
|
||
|
|
"learning_rate": 3.4504261123663243e-06,
|
||
|
|
"loss": 0.1741,
|
||
|
|
"step": 948
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9274479959411466,
|
||
|
|
"grad_norm": 0.9544541000662791,
|
||
|
|
"learning_rate": 3.4280023324761287e-06,
|
||
|
|
"loss": 0.1622,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9315068493150684,
|
||
|
|
"grad_norm": 1.0633189960260987,
|
||
|
|
"learning_rate": 3.4056135837619077e-06,
|
||
|
|
"loss": 0.1714,
|
||
|
|
"step": 952
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9355657026889903,
|
||
|
|
"grad_norm": 0.9744187925381573,
|
||
|
|
"learning_rate": 3.3832603651456486e-06,
|
||
|
|
"loss": 0.1704,
|
||
|
|
"step": 954
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9396245560629122,
|
||
|
|
"grad_norm": 1.0876850028674756,
|
||
|
|
"learning_rate": 3.360943174757564e-06,
|
||
|
|
"loss": 0.1835,
|
||
|
|
"step": 956
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.943683409436834,
|
||
|
|
"grad_norm": 0.9933052886092801,
|
||
|
|
"learning_rate": 3.3386625099249957e-06,
|
||
|
|
"loss": 0.1722,
|
||
|
|
"step": 958
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.947742262810756,
|
||
|
|
"grad_norm": 1.0507655110716982,
|
||
|
|
"learning_rate": 3.3164188671613382e-06,
|
||
|
|
"loss": 0.1799,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9518011161846778,
|
||
|
|
"grad_norm": 0.9635760320459535,
|
||
|
|
"learning_rate": 3.29421274215496e-06,
|
||
|
|
"loss": 0.1665,
|
||
|
|
"step": 962
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9558599695585996,
|
||
|
|
"grad_norm": 1.0098097588789372,
|
||
|
|
"learning_rate": 3.2720446297581696e-06,
|
||
|
|
"loss": 0.1756,
|
||
|
|
"step": 964
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9599188229325215,
|
||
|
|
"grad_norm": 0.9487863077375068,
|
||
|
|
"learning_rate": 3.2499150239761813e-06,
|
||
|
|
"loss": 0.1674,
|
||
|
|
"step": 966
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9639776763064434,
|
||
|
|
"grad_norm": 1.0886147748414823,
|
||
|
|
"learning_rate": 3.2278244179561107e-06,
|
||
|
|
"loss": 0.176,
|
||
|
|
"step": 968
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9680365296803652,
|
||
|
|
"grad_norm": 1.0933879192223048,
|
||
|
|
"learning_rate": 3.205773303975982e-06,
|
||
|
|
"loss": 0.1649,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.972095383054287,
|
||
|
|
"grad_norm": 1.032499198840103,
|
||
|
|
"learning_rate": 3.1837621734337607e-06,
|
||
|
|
"loss": 0.1712,
|
||
|
|
"step": 972
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.976154236428209,
|
||
|
|
"grad_norm": 1.0255453322259884,
|
||
|
|
"learning_rate": 3.1617915168363994e-06,
|
||
|
|
"loss": 0.1835,
|
||
|
|
"step": 974
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9802130898021308,
|
||
|
|
"grad_norm": 0.9986750671875287,
|
||
|
|
"learning_rate": 3.1398618237889124e-06,
|
||
|
|
"loss": 0.1685,
|
||
|
|
"step": 976
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9842719431760527,
|
||
|
|
"grad_norm": 1.018414001751852,
|
||
|
|
"learning_rate": 3.11797358298346e-06,
|
||
|
|
"loss": 0.1707,
|
||
|
|
"step": 978
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9883307965499746,
|
||
|
|
"grad_norm": 1.0770634669309533,
|
||
|
|
"learning_rate": 3.096127282188458e-06,
|
||
|
|
"loss": 0.1687,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9923896499238964,
|
||
|
|
"grad_norm": 1.1276389976988863,
|
||
|
|
"learning_rate": 3.074323408237716e-06,
|
||
|
|
"loss": 0.1788,
|
||
|
|
"step": 982
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9964485032978183,
|
||
|
|
"grad_norm": 1.1430608051198146,
|
||
|
|
"learning_rate": 3.0525624470195746e-06,
|
||
|
|
"loss": 0.1878,
|
||
|
|
"step": 984
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 1.1495443627078172,
|
||
|
|
"learning_rate": 3.0308448834660953e-06,
|
||
|
|
"loss": 0.1664,
|
||
|
|
"step": 986
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.004058853373922,
|
||
|
|
"grad_norm": 0.7595072680880963,
|
||
|
|
"learning_rate": 3.009171201542235e-06,
|
||
|
|
"loss": 0.073,
|
||
|
|
"step": 988
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0081177067478437,
|
||
|
|
"grad_norm": 0.6553749481487482,
|
||
|
|
"learning_rate": 2.987541884235078e-06,
|
||
|
|
"loss": 0.0666,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0121765601217656,
|
||
|
|
"grad_norm": 0.6758730321723564,
|
||
|
|
"learning_rate": 2.965957413543063e-06,
|
||
|
|
"loss": 0.068,
|
||
|
|
"step": 992
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0162354134956875,
|
||
|
|
"grad_norm": 0.7705357959194599,
|
||
|
|
"learning_rate": 2.944418270465243e-06,
|
||
|
|
"loss": 0.0722,
|
||
|
|
"step": 994
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0202942668696093,
|
||
|
|
"grad_norm": 0.7447986269637112,
|
||
|
|
"learning_rate": 2.9229249349905686e-06,
|
||
|
|
"loss": 0.0636,
|
||
|
|
"step": 996
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.024353120243531,
|
||
|
|
"grad_norm": 0.7070522483364615,
|
||
|
|
"learning_rate": 2.9014778860871916e-06,
|
||
|
|
"loss": 0.056,
|
||
|
|
"step": 998
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.028411973617453,
|
||
|
|
"grad_norm": 0.8137822848077799,
|
||
|
|
"learning_rate": 2.880077601691793e-06,
|
||
|
|
"loss": 0.0711,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.032470826991375,
|
||
|
|
"grad_norm": 0.6930740331498185,
|
||
|
|
"learning_rate": 2.8587245586989265e-06,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 1002
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.036529680365297,
|
||
|
|
"grad_norm": 0.6609838916960153,
|
||
|
|
"learning_rate": 2.8374192329503934e-06,
|
||
|
|
"loss": 0.0604,
|
||
|
|
"step": 1004
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0405885337392187,
|
||
|
|
"grad_norm": 0.6985284878939515,
|
||
|
|
"learning_rate": 2.8161620992246497e-06,
|
||
|
|
"loss": 0.0616,
|
||
|
|
"step": 1006
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0446473871131405,
|
||
|
|
"grad_norm": 0.697512001428481,
|
||
|
|
"learning_rate": 2.7949536312262048e-06,
|
||
|
|
"loss": 0.0649,
|
||
|
|
"step": 1008
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0487062404870624,
|
||
|
|
"grad_norm": 0.6960549223161825,
|
||
|
|
"learning_rate": 2.7737943015750862e-06,
|
||
|
|
"loss": 0.0699,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0527650938609843,
|
||
|
|
"grad_norm": 0.7147137635432833,
|
||
|
|
"learning_rate": 2.752684581796292e-06,
|
||
|
|
"loss": 0.0626,
|
||
|
|
"step": 1012
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.056823947234906,
|
||
|
|
"grad_norm": 0.6478212242747857,
|
||
|
|
"learning_rate": 2.7316249423092923e-06,
|
||
|
|
"loss": 0.0594,
|
||
|
|
"step": 1014
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.060882800608828,
|
||
|
|
"grad_norm": 0.7050061564241327,
|
||
|
|
"learning_rate": 2.7106158524175396e-06,
|
||
|
|
"loss": 0.0646,
|
||
|
|
"step": 1016
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.06494165398275,
|
||
|
|
"grad_norm": 0.5836812360722653,
|
||
|
|
"learning_rate": 2.689657780298019e-06,
|
||
|
|
"loss": 0.0552,
|
||
|
|
"step": 1018
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0690005073566717,
|
||
|
|
"grad_norm": 0.6963206280767881,
|
||
|
|
"learning_rate": 2.6687511929908093e-06,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0730593607305936,
|
||
|
|
"grad_norm": 0.6223916195319845,
|
||
|
|
"learning_rate": 2.6478965563886745e-06,
|
||
|
|
"loss": 0.0567,
|
||
|
|
"step": 1022
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0771182141045155,
|
||
|
|
"grad_norm": 0.7658362298283596,
|
||
|
|
"learning_rate": 2.627094335226682e-06,
|
||
|
|
"loss": 0.059,
|
||
|
|
"step": 1024
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0811770674784373,
|
||
|
|
"grad_norm": 0.6242034540359834,
|
||
|
|
"learning_rate": 2.6063449930718487e-06,
|
||
|
|
"loss": 0.0566,
|
||
|
|
"step": 1026
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.085235920852359,
|
||
|
|
"grad_norm": 0.6057344953235689,
|
||
|
|
"learning_rate": 2.5856489923128136e-06,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"step": 1028
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.089294774226281,
|
||
|
|
"grad_norm": 0.6209117592060627,
|
||
|
|
"learning_rate": 2.5650067941495236e-06,
|
||
|
|
"loss": 0.0543,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.093353627600203,
|
||
|
|
"grad_norm": 0.626145712835931,
|
||
|
|
"learning_rate": 2.5444188585829634e-06,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"step": 1032
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.097412480974125,
|
||
|
|
"grad_norm": 0.7019968254783542,
|
||
|
|
"learning_rate": 2.523885644404906e-06,
|
||
|
|
"loss": 0.0629,
|
||
|
|
"step": 1034
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1014713343480467,
|
||
|
|
"grad_norm": 0.672737803900476,
|
||
|
|
"learning_rate": 2.5034076091876813e-06,
|
||
|
|
"loss": 0.0599,
|
||
|
|
"step": 1036
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1055301877219685,
|
||
|
|
"grad_norm": 0.7666550066371539,
|
||
|
|
"learning_rate": 2.48298520927399e-06,
|
||
|
|
"loss": 0.0685,
|
||
|
|
"step": 1038
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1095890410958904,
|
||
|
|
"grad_norm": 0.6186035266270663,
|
||
|
|
"learning_rate": 2.4626188997667224e-06,
|
||
|
|
"loss": 0.0528,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1136478944698123,
|
||
|
|
"grad_norm": 0.6362086867459513,
|
||
|
|
"learning_rate": 2.4423091345188244e-06,
|
||
|
|
"loss": 0.0609,
|
||
|
|
"step": 1042
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.117706747843734,
|
||
|
|
"grad_norm": 0.7043829575983082,
|
||
|
|
"learning_rate": 2.4220563661231793e-06,
|
||
|
|
"loss": 0.0607,
|
||
|
|
"step": 1044
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.121765601217656,
|
||
|
|
"grad_norm": 0.6342780093035242,
|
||
|
|
"learning_rate": 2.4018610459025317e-06,
|
||
|
|
"loss": 0.0614,
|
||
|
|
"step": 1046
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.125824454591578,
|
||
|
|
"grad_norm": 0.6371937965298768,
|
||
|
|
"learning_rate": 2.381723623899412e-06,
|
||
|
|
"loss": 0.0576,
|
||
|
|
"step": 1048
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1298833079654997,
|
||
|
|
"grad_norm": 0.7261372327288902,
|
||
|
|
"learning_rate": 2.361644548866127e-06,
|
||
|
|
"loss": 0.0612,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1339421613394216,
|
||
|
|
"grad_norm": 0.6921022790180155,
|
||
|
|
"learning_rate": 2.341624268254747e-06,
|
||
|
|
"loss": 0.0637,
|
||
|
|
"step": 1052
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1380010147133435,
|
||
|
|
"grad_norm": 0.7180722302503428,
|
||
|
|
"learning_rate": 2.3216632282071345e-06,
|
||
|
|
"loss": 0.0653,
|
||
|
|
"step": 1054
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1420598680872653,
|
||
|
|
"grad_norm": 0.6048159926460217,
|
||
|
|
"learning_rate": 2.3017618735450142e-06,
|
||
|
|
"loss": 0.055,
|
||
|
|
"step": 1056
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.146118721461187,
|
||
|
|
"grad_norm": 0.6876882976918033,
|
||
|
|
"learning_rate": 2.2819206477600462e-06,
|
||
|
|
"loss": 0.0593,
|
||
|
|
"step": 1058
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.150177574835109,
|
||
|
|
"grad_norm": 0.6830162087163217,
|
||
|
|
"learning_rate": 2.2621399930039493e-06,
|
||
|
|
"loss": 0.0576,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.154236428209031,
|
||
|
|
"grad_norm": 0.572916709401609,
|
||
|
|
"learning_rate": 2.2424203500786473e-06,
|
||
|
|
"loss": 0.0565,
|
||
|
|
"step": 1062
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.158295281582953,
|
||
|
|
"grad_norm": 0.6064104088259805,
|
||
|
|
"learning_rate": 2.2227621584264505e-06,
|
||
|
|
"loss": 0.0609,
|
||
|
|
"step": 1064
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1623541349568747,
|
||
|
|
"grad_norm": 0.5345159650560205,
|
||
|
|
"learning_rate": 2.203165856120251e-06,
|
||
|
|
"loss": 0.0486,
|
||
|
|
"step": 1066
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1664129883307965,
|
||
|
|
"grad_norm": 0.708241954697193,
|
||
|
|
"learning_rate": 2.183631879853776e-06,
|
||
|
|
"loss": 0.0592,
|
||
|
|
"step": 1068
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1704718417047184,
|
||
|
|
"grad_norm": 0.6237247767204619,
|
||
|
|
"learning_rate": 2.164160664931843e-06,
|
||
|
|
"loss": 0.0564,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1745306950786403,
|
||
|
|
"grad_norm": 0.6913274477398279,
|
||
|
|
"learning_rate": 2.1447526452606658e-06,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"step": 1072
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.178589548452562,
|
||
|
|
"grad_norm": 0.615699302647019,
|
||
|
|
"learning_rate": 2.125408253338183e-06,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1074
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.182648401826484,
|
||
|
|
"grad_norm": 0.7114790842641555,
|
||
|
|
"learning_rate": 2.106127920244423e-06,
|
||
|
|
"loss": 0.056,
|
||
|
|
"step": 1076
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.186707255200406,
|
||
|
|
"grad_norm": 0.6498843232992042,
|
||
|
|
"learning_rate": 2.086912075631896e-06,
|
||
|
|
"loss": 0.0579,
|
||
|
|
"step": 1078
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1907661085743277,
|
||
|
|
"grad_norm": 0.6342534301664807,
|
||
|
|
"learning_rate": 2.067761147716017e-06,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1948249619482496,
|
||
|
|
"grad_norm": 0.7027558078351507,
|
||
|
|
"learning_rate": 2.0486755632655643e-06,
|
||
|
|
"loss": 0.0593,
|
||
|
|
"step": 1082
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1988838153221715,
|
||
|
|
"grad_norm": 0.6558346648533067,
|
||
|
|
"learning_rate": 2.029655747593169e-06,
|
||
|
|
"loss": 0.0605,
|
||
|
|
"step": 1084
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2029426686960933,
|
||
|
|
"grad_norm": 0.6450513139758751,
|
||
|
|
"learning_rate": 2.010702124545845e-06,
|
||
|
|
"loss": 0.0598,
|
||
|
|
"step": 1086
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.207001522070015,
|
||
|
|
"grad_norm": 0.7322704077213636,
|
||
|
|
"learning_rate": 1.9918151164955303e-06,
|
||
|
|
"loss": 0.0617,
|
||
|
|
"step": 1088
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.211060375443937,
|
||
|
|
"grad_norm": 0.7103114005030767,
|
||
|
|
"learning_rate": 1.9729951443296823e-06,
|
||
|
|
"loss": 0.0564,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.215119228817859,
|
||
|
|
"grad_norm": 0.6912085691591306,
|
||
|
|
"learning_rate": 1.9542426274418975e-06,
|
||
|
|
"loss": 0.0628,
|
||
|
|
"step": 1092
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.219178082191781,
|
||
|
|
"grad_norm": 0.7492868491245555,
|
||
|
|
"learning_rate": 1.9355579837225673e-06,
|
||
|
|
"loss": 0.0601,
|
||
|
|
"step": 1094
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2232369355657027,
|
||
|
|
"grad_norm": 0.6846296844726598,
|
||
|
|
"learning_rate": 1.916941629549565e-06,
|
||
|
|
"loss": 0.0562,
|
||
|
|
"step": 1096
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2272957889396245,
|
||
|
|
"grad_norm": 0.6860703433669731,
|
||
|
|
"learning_rate": 1.8983939797789624e-06,
|
||
|
|
"loss": 0.0604,
|
||
|
|
"step": 1098
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2313546423135464,
|
||
|
|
"grad_norm": 0.6583737331854461,
|
||
|
|
"learning_rate": 1.8799154477357883e-06,
|
||
|
|
"loss": 0.057,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2354134956874683,
|
||
|
|
"grad_norm": 0.6015963283689161,
|
||
|
|
"learning_rate": 1.8615064452048181e-06,
|
||
|
|
"loss": 0.0529,
|
||
|
|
"step": 1102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.23947234906139,
|
||
|
|
"grad_norm": 0.6596224589385736,
|
||
|
|
"learning_rate": 1.8431673824214013e-06,
|
||
|
|
"loss": 0.0607,
|
||
|
|
"step": 1104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.243531202435312,
|
||
|
|
"grad_norm": 0.6295377331681089,
|
||
|
|
"learning_rate": 1.8248986680623077e-06,
|
||
|
|
"loss": 0.0524,
|
||
|
|
"step": 1106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.247590055809234,
|
||
|
|
"grad_norm": 0.7623992953499044,
|
||
|
|
"learning_rate": 1.8067007092366368e-06,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 1108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2516489091831557,
|
||
|
|
"grad_norm": 0.6747010480441555,
|
||
|
|
"learning_rate": 1.7885739114767292e-06,
|
||
|
|
"loss": 0.0575,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2557077625570776,
|
||
|
|
"grad_norm": 0.6640676472618579,
|
||
|
|
"learning_rate": 1.770518678729139e-06,
|
||
|
|
"loss": 0.0532,
|
||
|
|
"step": 1112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2597666159309995,
|
||
|
|
"grad_norm": 0.6588950267013456,
|
||
|
|
"learning_rate": 1.752535413345634e-06,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2638254693049213,
|
||
|
|
"grad_norm": 0.6957155982625279,
|
||
|
|
"learning_rate": 1.734624516074221e-06,
|
||
|
|
"loss": 0.0591,
|
||
|
|
"step": 1116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.267884322678843,
|
||
|
|
"grad_norm": 0.7568659886745189,
|
||
|
|
"learning_rate": 1.716786386050221e-06,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 1118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.271943176052765,
|
||
|
|
"grad_norm": 0.7149883674184384,
|
||
|
|
"learning_rate": 1.6990214207873723e-06,
|
||
|
|
"loss": 0.0603,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.276002029426687,
|
||
|
|
"grad_norm": 0.6375454124392296,
|
||
|
|
"learning_rate": 1.681330016168977e-06,
|
||
|
|
"loss": 0.0583,
|
||
|
|
"step": 1122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.280060882800609,
|
||
|
|
"grad_norm": 0.7001731734101665,
|
||
|
|
"learning_rate": 1.6637125664390747e-06,
|
||
|
|
"loss": 0.06,
|
||
|
|
"step": 1124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2841197361745307,
|
||
|
|
"grad_norm": 0.5744052821303995,
|
||
|
|
"learning_rate": 1.6461694641936544e-06,
|
||
|
|
"loss": 0.0532,
|
||
|
|
"step": 1126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2881785895484525,
|
||
|
|
"grad_norm": 0.6379817597651213,
|
||
|
|
"learning_rate": 1.6287011003719105e-06,
|
||
|
|
"loss": 0.0581,
|
||
|
|
"step": 1128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2922374429223744,
|
||
|
|
"grad_norm": 0.6936132311539229,
|
||
|
|
"learning_rate": 1.61130786424753e-06,
|
||
|
|
"loss": 0.0578,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2962962962962963,
|
||
|
|
"grad_norm": 0.7643680282316857,
|
||
|
|
"learning_rate": 1.5939901434200145e-06,
|
||
|
|
"loss": 0.0587,
|
||
|
|
"step": 1132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.300355149670218,
|
||
|
|
"grad_norm": 0.676227187971244,
|
||
|
|
"learning_rate": 1.5767483238060498e-06,
|
||
|
|
"loss": 0.0568,
|
||
|
|
"step": 1134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.30441400304414,
|
||
|
|
"grad_norm": 0.6485083388679013,
|
||
|
|
"learning_rate": 1.5595827896308968e-06,
|
||
|
|
"loss": 0.0615,
|
||
|
|
"step": 1136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.308472856418062,
|
||
|
|
"grad_norm": 0.6513490211287265,
|
||
|
|
"learning_rate": 1.5424939234198377e-06,
|
||
|
|
"loss": 0.0558,
|
||
|
|
"step": 1138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3125317097919837,
|
||
|
|
"grad_norm": 0.6470779888157924,
|
||
|
|
"learning_rate": 1.5254821059896452e-06,
|
||
|
|
"loss": 0.0569,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3165905631659056,
|
||
|
|
"grad_norm": 0.6690972634285901,
|
||
|
|
"learning_rate": 1.5085477164400975e-06,
|
||
|
|
"loss": 0.0564,
|
||
|
|
"step": 1142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3206494165398275,
|
||
|
|
"grad_norm": 0.6192010726163617,
|
||
|
|
"learning_rate": 1.4916911321455362e-06,
|
||
|
|
"loss": 0.0566,
|
||
|
|
"step": 1144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3247082699137493,
|
||
|
|
"grad_norm": 0.6718507373057219,
|
||
|
|
"learning_rate": 1.4749127287464483e-06,
|
||
|
|
"loss": 0.0566,
|
||
|
|
"step": 1146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.328767123287671,
|
||
|
|
"grad_norm": 0.6291964856953526,
|
||
|
|
"learning_rate": 1.458212880141099e-06,
|
||
|
|
"loss": 0.0568,
|
||
|
|
"step": 1148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.332825976661593,
|
||
|
|
"grad_norm": 0.6090771051076672,
|
||
|
|
"learning_rate": 1.4415919584771999e-06,
|
||
|
|
"loss": 0.0547,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.336884830035515,
|
||
|
|
"grad_norm": 0.6315155634950337,
|
||
|
|
"learning_rate": 1.425050334143616e-06,
|
||
|
|
"loss": 0.0586,
|
||
|
|
"step": 1152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.340943683409437,
|
||
|
|
"grad_norm": 0.682944731395333,
|
||
|
|
"learning_rate": 1.408588375762114e-06,
|
||
|
|
"loss": 0.0575,
|
||
|
|
"step": 1154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3450025367833587,
|
||
|
|
"grad_norm": 0.6828351505916127,
|
||
|
|
"learning_rate": 1.39220645017914e-06,
|
||
|
|
"loss": 0.0575,
|
||
|
|
"step": 1156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3490613901572805,
|
||
|
|
"grad_norm": 0.6343438475682116,
|
||
|
|
"learning_rate": 1.3759049224576516e-06,
|
||
|
|
"loss": 0.054,
|
||
|
|
"step": 1158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3531202435312024,
|
||
|
|
"grad_norm": 0.6608391141452298,
|
||
|
|
"learning_rate": 1.3596841558689788e-06,
|
||
|
|
"loss": 0.0611,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3571790969051243,
|
||
|
|
"grad_norm": 0.6847038775101427,
|
||
|
|
"learning_rate": 1.3435445118847362e-06,
|
||
|
|
"loss": 0.0597,
|
||
|
|
"step": 1162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.361237950279046,
|
||
|
|
"grad_norm": 0.6758006038700237,
|
||
|
|
"learning_rate": 1.3274863501687546e-06,
|
||
|
|
"loss": 0.0582,
|
||
|
|
"step": 1164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.365296803652968,
|
||
|
|
"grad_norm": 0.679273586750369,
|
||
|
|
"learning_rate": 1.3115100285690795e-06,
|
||
|
|
"loss": 0.0586,
|
||
|
|
"step": 1166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.36935565702689,
|
||
|
|
"grad_norm": 0.6455382817232485,
|
||
|
|
"learning_rate": 1.2956159031099874e-06,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3734145104008117,
|
||
|
|
"grad_norm": 0.6494368906651962,
|
||
|
|
"learning_rate": 1.2798043279840544e-06,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3774733637747336,
|
||
|
|
"grad_norm": 0.6652797010014586,
|
||
|
|
"learning_rate": 1.2640756555442684e-06,
|
||
|
|
"loss": 0.0585,
|
||
|
|
"step": 1172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3815322171486555,
|
||
|
|
"grad_norm": 0.7085143214543068,
|
||
|
|
"learning_rate": 1.248430236296168e-06,
|
||
|
|
"loss": 0.0531,
|
||
|
|
"step": 1174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3855910705225774,
|
||
|
|
"grad_norm": 0.7017172691247079,
|
||
|
|
"learning_rate": 1.2328684188900392e-06,
|
||
|
|
"loss": 0.0562,
|
||
|
|
"step": 1176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.389649923896499,
|
||
|
|
"grad_norm": 0.5911653357727898,
|
||
|
|
"learning_rate": 1.2173905501131395e-06,
|
||
|
|
"loss": 0.0555,
|
||
|
|
"step": 1178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.393708777270421,
|
||
|
|
"grad_norm": 0.7052853605271917,
|
||
|
|
"learning_rate": 1.2019969748819783e-06,
|
||
|
|
"loss": 0.0633,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.397767630644343,
|
||
|
|
"grad_norm": 0.5468703307014926,
|
||
|
|
"learning_rate": 1.186688036234625e-06,
|
||
|
|
"loss": 0.0512,
|
||
|
|
"step": 1182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.401826484018265,
|
||
|
|
"grad_norm": 0.6298551223639655,
|
||
|
|
"learning_rate": 1.1714640753230628e-06,
|
||
|
|
"loss": 0.0523,
|
||
|
|
"step": 1184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4058853373921867,
|
||
|
|
"grad_norm": 0.7357211211763364,
|
||
|
|
"learning_rate": 1.1563254314055893e-06,
|
||
|
|
"loss": 0.0553,
|
||
|
|
"step": 1186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4099441907661086,
|
||
|
|
"grad_norm": 0.6645237994069922,
|
||
|
|
"learning_rate": 1.1412724418392562e-06,
|
||
|
|
"loss": 0.0544,
|
||
|
|
"step": 1188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4140030441400304,
|
||
|
|
"grad_norm": 0.661565716961166,
|
||
|
|
"learning_rate": 1.126305442072354e-06,
|
||
|
|
"loss": 0.055,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4180618975139523,
|
||
|
|
"grad_norm": 0.6206331269413049,
|
||
|
|
"learning_rate": 1.1114247656369305e-06,
|
||
|
|
"loss": 0.0545,
|
||
|
|
"step": 1192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.422120750887874,
|
||
|
|
"grad_norm": 0.7180840889505126,
|
||
|
|
"learning_rate": 1.0966307441413598e-06,
|
||
|
|
"loss": 0.0581,
|
||
|
|
"step": 1194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.426179604261796,
|
||
|
|
"grad_norm": 0.7354347742714406,
|
||
|
|
"learning_rate": 1.0819237072629606e-06,
|
||
|
|
"loss": 0.0597,
|
||
|
|
"step": 1196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.430238457635718,
|
||
|
|
"grad_norm": 0.664963118331295,
|
||
|
|
"learning_rate": 1.0673039827406373e-06,
|
||
|
|
"loss": 0.0592,
|
||
|
|
"step": 1198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4342973110096398,
|
||
|
|
"grad_norm": 0.6596757154643482,
|
||
|
|
"learning_rate": 1.0527718963675871e-06,
|
||
|
|
"loss": 0.0543,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4383561643835616,
|
||
|
|
"grad_norm": 0.614862575729698,
|
||
|
|
"learning_rate": 1.0383277719840318e-06,
|
||
|
|
"loss": 0.051,
|
||
|
|
"step": 1202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4424150177574835,
|
||
|
|
"grad_norm": 0.6239737484592334,
|
||
|
|
"learning_rate": 1.0239719314700052e-06,
|
||
|
|
"loss": 0.0569,
|
||
|
|
"step": 1204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4464738711314054,
|
||
|
|
"grad_norm": 0.7008527413286773,
|
||
|
|
"learning_rate": 1.0097046947381805e-06,
|
||
|
|
"loss": 0.0622,
|
||
|
|
"step": 1206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.450532724505327,
|
||
|
|
"grad_norm": 0.646447221626618,
|
||
|
|
"learning_rate": 9.955263797267379e-07,
|
||
|
|
"loss": 0.0593,
|
||
|
|
"step": 1208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.454591577879249,
|
||
|
|
"grad_norm": 0.6736248842428098,
|
||
|
|
"learning_rate": 9.814373023922851e-07,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.458650431253171,
|
||
|
|
"grad_norm": 0.7520656749859748,
|
||
|
|
"learning_rate": 9.674377767028142e-07,
|
||
|
|
"loss": 0.0595,
|
||
|
|
"step": 1212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.462709284627093,
|
||
|
|
"grad_norm": 0.6256286530852058,
|
||
|
|
"learning_rate": 9.53528114630699e-07,
|
||
|
|
"loss": 0.0539,
|
||
|
|
"step": 1214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4667681380010147,
|
||
|
|
"grad_norm": 0.7163476466314366,
|
||
|
|
"learning_rate": 9.397086261457511e-07,
|
||
|
|
"loss": 0.0587,
|
||
|
|
"step": 1216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4708269913749366,
|
||
|
|
"grad_norm": 0.6810504627251797,
|
||
|
|
"learning_rate": 9.259796192083071e-07,
|
||
|
|
"loss": 0.0576,
|
||
|
|
"step": 1218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4748858447488584,
|
||
|
|
"grad_norm": 0.6288859512257164,
|
||
|
|
"learning_rate": 9.123413997623714e-07,
|
||
|
|
"loss": 0.0543,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4789446981227803,
|
||
|
|
"grad_norm": 0.6740418425171263,
|
||
|
|
"learning_rate": 8.987942717287923e-07,
|
||
|
|
"loss": 0.0578,
|
||
|
|
"step": 1222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.483003551496702,
|
||
|
|
"grad_norm": 0.6224042862768536,
|
||
|
|
"learning_rate": 8.853385369984901e-07,
|
||
|
|
"loss": 0.0537,
|
||
|
|
"step": 1224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.487062404870624,
|
||
|
|
"grad_norm": 0.6485173083194978,
|
||
|
|
"learning_rate": 8.719744954257375e-07,
|
||
|
|
"loss": 0.056,
|
||
|
|
"step": 1226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.491121258244546,
|
||
|
|
"grad_norm": 0.6582025110825541,
|
||
|
|
"learning_rate": 8.587024448214637e-07,
|
||
|
|
"loss": 0.0541,
|
||
|
|
"step": 1228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4951801116184678,
|
||
|
|
"grad_norm": 0.7255418838785723,
|
||
|
|
"learning_rate": 8.455226809466327e-07,
|
||
|
|
"loss": 0.0592,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4992389649923896,
|
||
|
|
"grad_norm": 0.6507900300493289,
|
||
|
|
"learning_rate": 8.324354975056403e-07,
|
||
|
|
"loss": 0.0539,
|
||
|
|
"step": 1232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5032978183663115,
|
||
|
|
"grad_norm": 0.6681824734246471,
|
||
|
|
"learning_rate": 8.19441186139776e-07,
|
||
|
|
"loss": 0.0591,
|
||
|
|
"step": 1234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5073566717402334,
|
||
|
|
"grad_norm": 0.6936354997773724,
|
||
|
|
"learning_rate": 8.065400364207194e-07,
|
||
|
|
"loss": 0.0584,
|
||
|
|
"step": 1236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5114155251141552,
|
||
|
|
"grad_norm": 0.6718221656136698,
|
||
|
|
"learning_rate": 7.937323358440935e-07,
|
||
|
|
"loss": 0.0543,
|
||
|
|
"step": 1238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.515474378488077,
|
||
|
|
"grad_norm": 0.6632718887777156,
|
||
|
|
"learning_rate": 7.810183698230539e-07,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.519533231861999,
|
||
|
|
"grad_norm": 0.6239004674471934,
|
||
|
|
"learning_rate": 7.683984216819262e-07,
|
||
|
|
"loss": 0.0545,
|
||
|
|
"step": 1242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.523592085235921,
|
||
|
|
"grad_norm": 0.5944403471674328,
|
||
|
|
"learning_rate": 7.55872772649896e-07,
|
||
|
|
"loss": 0.0535,
|
||
|
|
"step": 1244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5276509386098427,
|
||
|
|
"grad_norm": 0.6165723170085607,
|
||
|
|
"learning_rate": 7.434417018547396e-07,
|
||
|
|
"loss": 0.0514,
|
||
|
|
"step": 1246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5317097919837646,
|
||
|
|
"grad_norm": 0.7183547419188132,
|
||
|
|
"learning_rate": 7.311054863166095e-07,
|
||
|
|
"loss": 0.0588,
|
||
|
|
"step": 1248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5357686453576864,
|
||
|
|
"grad_norm": 0.7242691023134634,
|
||
|
|
"learning_rate": 7.188644009418517e-07,
|
||
|
|
"loss": 0.0603,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5398274987316083,
|
||
|
|
"grad_norm": 0.5791350405500479,
|
||
|
|
"learning_rate": 7.067187185168862e-07,
|
||
|
|
"loss": 0.0531,
|
||
|
|
"step": 1252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.54388635210553,
|
||
|
|
"grad_norm": 0.6776524885992443,
|
||
|
|
"learning_rate": 6.946687097021249e-07,
|
||
|
|
"loss": 0.0544,
|
||
|
|
"step": 1254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.547945205479452,
|
||
|
|
"grad_norm": 0.6068439005721586,
|
||
|
|
"learning_rate": 6.827146430259446e-07,
|
||
|
|
"loss": 0.0504,
|
||
|
|
"step": 1256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.552004058853374,
|
||
|
|
"grad_norm": 0.6517780554148217,
|
||
|
|
"learning_rate": 6.70856784878699e-07,
|
||
|
|
"loss": 0.0576,
|
||
|
|
"step": 1258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5560629122272958,
|
||
|
|
"grad_norm": 0.6551482566155284,
|
||
|
|
"learning_rate": 6.590953995067812e-07,
|
||
|
|
"loss": 0.0585,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5601217656012176,
|
||
|
|
"grad_norm": 0.7195939705815774,
|
||
|
|
"learning_rate": 6.474307490067383e-07,
|
||
|
|
"loss": 0.0591,
|
||
|
|
"step": 1262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5641806189751395,
|
||
|
|
"grad_norm": 0.7410151414847665,
|
||
|
|
"learning_rate": 6.358630933194282e-07,
|
||
|
|
"loss": 0.0618,
|
||
|
|
"step": 1264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5682394723490614,
|
||
|
|
"grad_norm": 0.6972961543769066,
|
||
|
|
"learning_rate": 6.24392690224232e-07,
|
||
|
|
"loss": 0.0607,
|
||
|
|
"step": 1266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5722983257229832,
|
||
|
|
"grad_norm": 0.6627967555045137,
|
||
|
|
"learning_rate": 6.130197953333017e-07,
|
||
|
|
"loss": 0.0602,
|
||
|
|
"step": 1268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.576357179096905,
|
||
|
|
"grad_norm": 0.6185680861600283,
|
||
|
|
"learning_rate": 6.017446620858708e-07,
|
||
|
|
"loss": 0.0565,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.580416032470827,
|
||
|
|
"grad_norm": 0.6250339920749016,
|
||
|
|
"learning_rate": 5.905675417426027e-07,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.584474885844749,
|
||
|
|
"grad_norm": 0.6417143051901513,
|
||
|
|
"learning_rate": 5.794886833799923e-07,
|
||
|
|
"loss": 0.0514,
|
||
|
|
"step": 1274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5885337392186707,
|
||
|
|
"grad_norm": 0.5742514445618982,
|
||
|
|
"learning_rate": 5.685083338848152e-07,
|
||
|
|
"loss": 0.0509,
|
||
|
|
"step": 1276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5925925925925926,
|
||
|
|
"grad_norm": 0.6842845774267343,
|
||
|
|
"learning_rate": 5.576267379486294e-07,
|
||
|
|
"loss": 0.0608,
|
||
|
|
"step": 1278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5966514459665144,
|
||
|
|
"grad_norm": 0.7682701087480387,
|
||
|
|
"learning_rate": 5.468441380623169e-07,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6007102993404363,
|
||
|
|
"grad_norm": 0.6379342121635503,
|
||
|
|
"learning_rate": 5.361607745106817e-07,
|
||
|
|
"loss": 0.0534,
|
||
|
|
"step": 1282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.604769152714358,
|
||
|
|
"grad_norm": 0.6746813956871355,
|
||
|
|
"learning_rate": 5.255768853671011e-07,
|
||
|
|
"loss": 0.0568,
|
||
|
|
"step": 1284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.60882800608828,
|
||
|
|
"grad_norm": 0.5662682410250746,
|
||
|
|
"learning_rate": 5.150927064882089e-07,
|
||
|
|
"loss": 0.0488,
|
||
|
|
"step": 1286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.612886859462202,
|
||
|
|
"grad_norm": 0.7495492021339842,
|
||
|
|
"learning_rate": 5.047084715086515e-07,
|
||
|
|
"loss": 0.0627,
|
||
|
|
"step": 1288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6169457128361238,
|
||
|
|
"grad_norm": 0.5821244808744749,
|
||
|
|
"learning_rate": 4.944244118358721e-07,
|
||
|
|
"loss": 0.0496,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6210045662100456,
|
||
|
|
"grad_norm": 0.6716113774136223,
|
||
|
|
"learning_rate": 4.842407566449591e-07,
|
||
|
|
"loss": 0.0527,
|
||
|
|
"step": 1292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6250634195839675,
|
||
|
|
"grad_norm": 0.6675478536309039,
|
||
|
|
"learning_rate": 4.741577328735364e-07,
|
||
|
|
"loss": 0.0562,
|
||
|
|
"step": 1294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6291222729578894,
|
||
|
|
"grad_norm": 0.6215223340536244,
|
||
|
|
"learning_rate": 4.641755652167107e-07,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"step": 1296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6331811263318112,
|
||
|
|
"grad_norm": 0.6998801247830552,
|
||
|
|
"learning_rate": 4.5429447612205635e-07,
|
||
|
|
"loss": 0.0559,
|
||
|
|
"step": 1298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.637239979705733,
|
||
|
|
"grad_norm": 0.7251216921883125,
|
||
|
|
"learning_rate": 4.445146857846672e-07,
|
||
|
|
"loss": 0.0505,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.641298833079655,
|
||
|
|
"grad_norm": 0.674778502737252,
|
||
|
|
"learning_rate": 4.3483641214224325e-07,
|
||
|
|
"loss": 0.0536,
|
||
|
|
"step": 1302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.645357686453577,
|
||
|
|
"grad_norm": 0.603226222531606,
|
||
|
|
"learning_rate": 4.2525987087023433e-07,
|
||
|
|
"loss": 0.0492,
|
||
|
|
"step": 1304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6494165398274987,
|
||
|
|
"grad_norm": 0.7870111250427357,
|
||
|
|
"learning_rate": 4.1578527537703973e-07,
|
||
|
|
"loss": 0.061,
|
||
|
|
"step": 1306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6534753932014206,
|
||
|
|
"grad_norm": 0.6451056158523719,
|
||
|
|
"learning_rate": 4.064128367992459e-07,
|
||
|
|
"loss": 0.0556,
|
||
|
|
"step": 1308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6575342465753424,
|
||
|
|
"grad_norm": 0.7137232587300326,
|
||
|
|
"learning_rate": 3.971427639969233e-07,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6615930999492643,
|
||
|
|
"grad_norm": 0.6325976622831373,
|
||
|
|
"learning_rate": 3.879752635489736e-07,
|
||
|
|
"loss": 0.0525,
|
||
|
|
"step": 1312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.665651953323186,
|
||
|
|
"grad_norm": 0.670332593338025,
|
||
|
|
"learning_rate": 3.7891053974852597e-07,
|
||
|
|
"loss": 0.0524,
|
||
|
|
"step": 1314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.669710806697108,
|
||
|
|
"grad_norm": 0.6627524615579112,
|
||
|
|
"learning_rate": 3.6994879459838375e-07,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"step": 1316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.67376966007103,
|
||
|
|
"grad_norm": 0.6143115984862325,
|
||
|
|
"learning_rate": 3.6109022780652147e-07,
|
||
|
|
"loss": 0.0569,
|
||
|
|
"step": 1318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6778285134449518,
|
||
|
|
"grad_norm": 0.645959952937934,
|
||
|
|
"learning_rate": 3.5233503678163696e-07,
|
||
|
|
"loss": 0.0571,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6818873668188736,
|
||
|
|
"grad_norm": 0.6681068621470589,
|
||
|
|
"learning_rate": 3.4368341662875004e-07,
|
||
|
|
"loss": 0.0535,
|
||
|
|
"step": 1322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6859462201927955,
|
||
|
|
"grad_norm": 0.6686418775510267,
|
||
|
|
"learning_rate": 3.3513556014485805e-07,
|
||
|
|
"loss": 0.0615,
|
||
|
|
"step": 1324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6900050735667174,
|
||
|
|
"grad_norm": 0.7424511108861405,
|
||
|
|
"learning_rate": 3.26691657814634e-07,
|
||
|
|
"loss": 0.0592,
|
||
|
|
"step": 1326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6940639269406392,
|
||
|
|
"grad_norm": 0.659620559037728,
|
||
|
|
"learning_rate": 3.183518978061895e-07,
|
||
|
|
"loss": 0.0555,
|
||
|
|
"step": 1328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.698122780314561,
|
||
|
|
"grad_norm": 0.6744968743539785,
|
||
|
|
"learning_rate": 3.101164659668732e-07,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.702181633688483,
|
||
|
|
"grad_norm": 0.6611821221163022,
|
||
|
|
"learning_rate": 3.0198554581913343e-07,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.706240487062405,
|
||
|
|
"grad_norm": 0.6422102786010568,
|
||
|
|
"learning_rate": 2.9395931855643043e-07,
|
||
|
|
"loss": 0.0529,
|
||
|
|
"step": 1334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7102993404363267,
|
||
|
|
"grad_norm": 0.5990751150942317,
|
||
|
|
"learning_rate": 2.860379630391935e-07,
|
||
|
|
"loss": 0.0522,
|
||
|
|
"step": 1336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7143581938102486,
|
||
|
|
"grad_norm": 0.6397947037965095,
|
||
|
|
"learning_rate": 2.7822165579084013e-07,
|
||
|
|
"loss": 0.0516,
|
||
|
|
"step": 1338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7184170471841704,
|
||
|
|
"grad_norm": 0.6033731180075299,
|
||
|
|
"learning_rate": 2.705105709938388e-07,
|
||
|
|
"loss": 0.0522,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7224759005580923,
|
||
|
|
"grad_norm": 0.5850767244585305,
|
||
|
|
"learning_rate": 2.629048804858275e-07,
|
||
|
|
"loss": 0.0571,
|
||
|
|
"step": 1342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.726534753932014,
|
||
|
|
"grad_norm": 0.6793421437999266,
|
||
|
|
"learning_rate": 2.5540475375578967e-07,
|
||
|
|
"loss": 0.0579,
|
||
|
|
"step": 1344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.730593607305936,
|
||
|
|
"grad_norm": 0.6089471972028935,
|
||
|
|
"learning_rate": 2.4801035794026987e-07,
|
||
|
|
"loss": 0.0537,
|
||
|
|
"step": 1346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.734652460679858,
|
||
|
|
"grad_norm": 0.5978644646078147,
|
||
|
|
"learning_rate": 2.407218578196524e-07,
|
||
|
|
"loss": 0.0521,
|
||
|
|
"step": 1348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7387113140537798,
|
||
|
|
"grad_norm": 0.6521169806760104,
|
||
|
|
"learning_rate": 2.3353941581449048e-07,
|
||
|
|
"loss": 0.0584,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7427701674277016,
|
||
|
|
"grad_norm": 0.6427655880180726,
|
||
|
|
"learning_rate": 2.2646319198188495e-07,
|
||
|
|
"loss": 0.0531,
|
||
|
|
"step": 1352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7468290208016235,
|
||
|
|
"grad_norm": 0.7400104362172217,
|
||
|
|
"learning_rate": 2.1949334401192013e-07,
|
||
|
|
"loss": 0.0597,
|
||
|
|
"step": 1354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7508878741755454,
|
||
|
|
"grad_norm": 0.7047967548664266,
|
||
|
|
"learning_rate": 2.1263002722414383e-07,
|
||
|
|
"loss": 0.0593,
|
||
|
|
"step": 1356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7549467275494672,
|
||
|
|
"grad_norm": 0.6950676115493591,
|
||
|
|
"learning_rate": 2.0587339456411503e-07,
|
||
|
|
"loss": 0.0558,
|
||
|
|
"step": 1358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.759005580923389,
|
||
|
|
"grad_norm": 0.6247096449549893,
|
||
|
|
"learning_rate": 1.9922359659998724e-07,
|
||
|
|
"loss": 0.0535,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.763064434297311,
|
||
|
|
"grad_norm": 0.6324280760328311,
|
||
|
|
"learning_rate": 1.9268078151915724e-07,
|
||
|
|
"loss": 0.0572,
|
||
|
|
"step": 1362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.767123287671233,
|
||
|
|
"grad_norm": 0.6887667157732265,
|
||
|
|
"learning_rate": 1.8624509512496336e-07,
|
||
|
|
"loss": 0.0567,
|
||
|
|
"step": 1364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7711821410451547,
|
||
|
|
"grad_norm": 0.6432677564664775,
|
||
|
|
"learning_rate": 1.799166808334335e-07,
|
||
|
|
"loss": 0.0561,
|
||
|
|
"step": 1366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7752409944190766,
|
||
|
|
"grad_norm": 0.6374676188470956,
|
||
|
|
"learning_rate": 1.7369567967009226e-07,
|
||
|
|
"loss": 0.052,
|
||
|
|
"step": 1368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7792998477929984,
|
||
|
|
"grad_norm": 0.6411778692609862,
|
||
|
|
"learning_rate": 1.6758223026681507e-07,
|
||
|
|
"loss": 0.056,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7833587011669203,
|
||
|
|
"grad_norm": 0.5631134199833117,
|
||
|
|
"learning_rate": 1.615764688587429e-07,
|
||
|
|
"loss": 0.0508,
|
||
|
|
"step": 1372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.787417554540842,
|
||
|
|
"grad_norm": 0.6819140357771947,
|
||
|
|
"learning_rate": 1.5567852928124237e-07,
|
||
|
|
"loss": 0.0571,
|
||
|
|
"step": 1374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.791476407914764,
|
||
|
|
"grad_norm": 0.5799311609819854,
|
||
|
|
"learning_rate": 1.4988854296692557e-07,
|
||
|
|
"loss": 0.0503,
|
||
|
|
"step": 1376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.795535261288686,
|
||
|
|
"grad_norm": 0.6901834332410363,
|
||
|
|
"learning_rate": 1.442066389427199e-07,
|
||
|
|
"loss": 0.0599,
|
||
|
|
"step": 1378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7995941146626078,
|
||
|
|
"grad_norm": 0.6242349895835719,
|
||
|
|
"learning_rate": 1.386329438269929e-07,
|
||
|
|
"loss": 0.0563,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8036529680365296,
|
||
|
|
"grad_norm": 0.6747442124346058,
|
||
|
|
"learning_rate": 1.3316758182673307e-07,
|
||
|
|
"loss": 0.0559,
|
||
|
|
"step": 1382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8077118214104515,
|
||
|
|
"grad_norm": 0.6806732183113077,
|
||
|
|
"learning_rate": 1.2781067473477905e-07,
|
||
|
|
"loss": 0.0553,
|
||
|
|
"step": 1384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8117706747843734,
|
||
|
|
"grad_norm": 0.5453489129772455,
|
||
|
|
"learning_rate": 1.225623419271055e-07,
|
||
|
|
"loss": 0.0492,
|
||
|
|
"step": 1386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8158295281582952,
|
||
|
|
"grad_norm": 0.6628204710761366,
|
||
|
|
"learning_rate": 1.1742270036016523e-07,
|
||
|
|
"loss": 0.0542,
|
||
|
|
"step": 1388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.819888381532217,
|
||
|
|
"grad_norm": 0.6122682091369429,
|
||
|
|
"learning_rate": 1.1239186456828033e-07,
|
||
|
|
"loss": 0.0551,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.823947234906139,
|
||
|
|
"grad_norm": 0.6936962374041317,
|
||
|
|
"learning_rate": 1.0746994666109234e-07,
|
||
|
|
"loss": 0.0573,
|
||
|
|
"step": 1392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.828006088280061,
|
||
|
|
"grad_norm": 0.5643200467907136,
|
||
|
|
"learning_rate": 1.0265705632106216e-07,
|
||
|
|
"loss": 0.0546,
|
||
|
|
"step": 1394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8320649416539827,
|
||
|
|
"grad_norm": 0.6353165412067484,
|
||
|
|
"learning_rate": 9.795330080102527e-08,
|
||
|
|
"loss": 0.0541,
|
||
|
|
"step": 1396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8361237950279046,
|
||
|
|
"grad_norm": 0.6131605918166376,
|
||
|
|
"learning_rate": 9.335878492180373e-08,
|
||
|
|
"loss": 0.0519,
|
||
|
|
"step": 1398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8401826484018264,
|
||
|
|
"grad_norm": 0.6838700561564602,
|
||
|
|
"learning_rate": 8.887361106986848e-08,
|
||
|
|
"loss": 0.0557,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8442415017757483,
|
||
|
|
"grad_norm": 0.6903440686599182,
|
||
|
|
"learning_rate": 8.44978791950607e-08,
|
||
|
|
"loss": 0.0594,
|
||
|
|
"step": 1402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.84830035514967,
|
||
|
|
"grad_norm": 0.6727563667189779,
|
||
|
|
"learning_rate": 8.023168680835913e-08,
|
||
|
|
"loss": 0.0599,
|
||
|
|
"step": 1404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.852359208523592,
|
||
|
|
"grad_norm": 0.6714864747461081,
|
||
|
|
"learning_rate": 7.60751289797118e-08,
|
||
|
|
"loss": 0.0596,
|
||
|
|
"step": 1406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.856418061897514,
|
||
|
|
"grad_norm": 0.6100055956786066,
|
||
|
|
"learning_rate": 7.202829833591496e-08,
|
||
|
|
"loss": 0.056,
|
||
|
|
"step": 1408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8604769152714358,
|
||
|
|
"grad_norm": 0.6561132306996892,
|
||
|
|
"learning_rate": 6.809128505855189e-08,
|
||
|
|
"loss": 0.0556,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8645357686453576,
|
||
|
|
"grad_norm": 0.6332585194044941,
|
||
|
|
"learning_rate": 6.426417688197961e-08,
|
||
|
|
"loss": 0.0532,
|
||
|
|
"step": 1412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8685946220192795,
|
||
|
|
"grad_norm": 0.6170375673674818,
|
||
|
|
"learning_rate": 6.054705909137426e-08,
|
||
|
|
"loss": 0.0496,
|
||
|
|
"step": 1414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8726534753932014,
|
||
|
|
"grad_norm": 0.60220768047444,
|
||
|
|
"learning_rate": 5.6940014520834865e-08,
|
||
|
|
"loss": 0.0539,
|
||
|
|
"step": 1416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8767123287671232,
|
||
|
|
"grad_norm": 0.5816521708453392,
|
||
|
|
"learning_rate": 5.344312355153036e-08,
|
||
|
|
"loss": 0.0523,
|
||
|
|
"step": 1418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.880771182141045,
|
||
|
|
"grad_norm": 0.6233216693560418,
|
||
|
|
"learning_rate": 5.005646410991549e-08,
|
||
|
|
"loss": 0.0547,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.884830035514967,
|
||
|
|
"grad_norm": 0.6419014793947228,
|
||
|
|
"learning_rate": 4.678011166598884e-08,
|
||
|
|
"loss": 0.0619,
|
||
|
|
"step": 1422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.888888888888889,
|
||
|
|
"grad_norm": 0.6187502232959146,
|
||
|
|
"learning_rate": 4.3614139231614725e-08,
|
||
|
|
"loss": 0.0506,
|
||
|
|
"step": 1424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8929477422628107,
|
||
|
|
"grad_norm": 0.6573679267165627,
|
||
|
|
"learning_rate": 4.0558617358892326e-08,
|
||
|
|
"loss": 0.054,
|
||
|
|
"step": 1426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8970065956367326,
|
||
|
|
"grad_norm": 0.7332565562788992,
|
||
|
|
"learning_rate": 3.7613614138587995e-08,
|
||
|
|
"loss": 0.0587,
|
||
|
|
"step": 1428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9010654490106544,
|
||
|
|
"grad_norm": 0.6415121216139319,
|
||
|
|
"learning_rate": 3.477919519861428e-08,
|
||
|
|
"loss": 0.0537,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9051243023845763,
|
||
|
|
"grad_norm": 0.7104972559533141,
|
||
|
|
"learning_rate": 3.205542370256997e-08,
|
||
|
|
"loss": 0.0549,
|
||
|
|
"step": 1432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.909183155758498,
|
||
|
|
"grad_norm": 0.6659234362952994,
|
||
|
|
"learning_rate": 2.944236034832959e-08,
|
||
|
|
"loss": 0.059,
|
||
|
|
"step": 1434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.91324200913242,
|
||
|
|
"grad_norm": 0.6504771749865272,
|
||
|
|
"learning_rate": 2.6940063366693303e-08,
|
||
|
|
"loss": 0.0545,
|
||
|
|
"step": 1436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.917300862506342,
|
||
|
|
"grad_norm": 0.6035759484889454,
|
||
|
|
"learning_rate": 2.4548588520089123e-08,
|
||
|
|
"loss": 0.0544,
|
||
|
|
"step": 1438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9213597158802638,
|
||
|
|
"grad_norm": 0.6114721671996689,
|
||
|
|
"learning_rate": 2.2267989101328878e-08,
|
||
|
|
"loss": 0.0531,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9254185692541856,
|
||
|
|
"grad_norm": 0.6776141271918054,
|
||
|
|
"learning_rate": 2.0098315932421952e-08,
|
||
|
|
"loss": 0.0548,
|
||
|
|
"step": 1442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9294774226281075,
|
||
|
|
"grad_norm": 0.5785157075548263,
|
||
|
|
"learning_rate": 1.803961736344062e-08,
|
||
|
|
"loss": 0.0489,
|
||
|
|
"step": 1444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9335362760020294,
|
||
|
|
"grad_norm": 0.6257664598177896,
|
||
|
|
"learning_rate": 1.6091939271446478e-08,
|
||
|
|
"loss": 0.0541,
|
||
|
|
"step": 1446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9375951293759512,
|
||
|
|
"grad_norm": 0.6581827205937377,
|
||
|
|
"learning_rate": 1.4255325059463477e-08,
|
||
|
|
"loss": 0.057,
|
||
|
|
"step": 1448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.941653982749873,
|
||
|
|
"grad_norm": 0.5842892930844981,
|
||
|
|
"learning_rate": 1.252981565551481e-08,
|
||
|
|
"loss": 0.0524,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.945712836123795,
|
||
|
|
"grad_norm": 0.660064548851826,
|
||
|
|
"learning_rate": 1.0915449511708088e-08,
|
||
|
|
"loss": 0.0546,
|
||
|
|
"step": 1452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.949771689497717,
|
||
|
|
"grad_norm": 0.6419857691002046,
|
||
|
|
"learning_rate": 9.412262603378797e-09,
|
||
|
|
"loss": 0.0544,
|
||
|
|
"step": 1454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9538305428716387,
|
||
|
|
"grad_norm": 0.5794278961094809,
|
||
|
|
"learning_rate": 8.020288428289836e-09,
|
||
|
|
"loss": 0.0532,
|
||
|
|
"step": 1456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9578893962455606,
|
||
|
|
"grad_norm": 0.7121133884438979,
|
||
|
|
"learning_rate": 6.739558005884883e-09,
|
||
|
|
"loss": 0.0577,
|
||
|
|
"step": 1458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9619482496194824,
|
||
|
|
"grad_norm": 0.6610137998761876,
|
||
|
|
"learning_rate": 5.570099876595625e-09,
|
||
|
|
"loss": 0.0582,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9660071029934043,
|
||
|
|
"grad_norm": 0.5782156107576852,
|
||
|
|
"learning_rate": 4.511940101207812e-09,
|
||
|
|
"loss": 0.0517,
|
||
|
|
"step": 1462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.970065956367326,
|
||
|
|
"grad_norm": 0.6497750292771783,
|
||
|
|
"learning_rate": 3.565102260278397e-09,
|
||
|
|
"loss": 0.0566,
|
||
|
|
"step": 1464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.974124809741248,
|
||
|
|
"grad_norm": 0.6075980392613102,
|
||
|
|
"learning_rate": 2.72960745361206e-09,
|
||
|
|
"loss": 0.0521,
|
||
|
|
"step": 1466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.97818366311517,
|
||
|
|
"grad_norm": 0.6593396919810925,
|
||
|
|
"learning_rate": 2.0054742997893674e-09,
|
||
|
|
"loss": 0.0561,
|
||
|
|
"step": 1468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9822425164890918,
|
||
|
|
"grad_norm": 0.6349479858496213,
|
||
|
|
"learning_rate": 1.392718935752102e-09,
|
||
|
|
"loss": 0.0527,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9863013698630136,
|
||
|
|
"grad_norm": 0.6624916331368633,
|
||
|
|
"learning_rate": 8.913550164463269e-10,
|
||
|
|
"loss": 0.053,
|
||
|
|
"step": 1472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9903602232369355,
|
||
|
|
"grad_norm": 0.6290549360832741,
|
||
|
|
"learning_rate": 5.013937145131875e-10,
|
||
|
|
"loss": 0.0563,
|
||
|
|
"step": 1474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9944190766108574,
|
||
|
|
"grad_norm": 0.625615525565498,
|
||
|
|
"learning_rate": 2.2284372004410804e-10,
|
||
|
|
"loss": 0.0562,
|
||
|
|
"step": 1476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9984779299847792,
|
||
|
|
"grad_norm": 0.5809460613562152,
|
||
|
|
"learning_rate": 5.5711240385392106e-11,
|
||
|
|
"loss": 0.0501,
|
||
|
|
"step": 1478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0,
|
||
|
|
"step": 1479,
|
||
|
|
"total_flos": 4127658346151936.0,
|
||
|
|
"train_loss": 0.21015015432889433,
|
||
|
|
"train_runtime": 144374.0796,
|
||
|
|
"train_samples_per_second": 1.311,
|
||
|
|
"train_steps_per_second": 0.01
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 2,
|
||
|
|
"max_steps": 1479,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 3,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 4127658346151936.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|