6400 lines
157 KiB
JSON
6400 lines
157 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.9998897828722584,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 4536,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0011021712774165104,
|
||
|
|
"grad_norm": 59.11821880494146,
|
||
|
|
"learning_rate": 3.303964757709251e-06,
|
||
|
|
"loss": 3.825,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.002204342554833021,
|
||
|
|
"grad_norm": 42.17582597601867,
|
||
|
|
"learning_rate": 6.607929515418502e-06,
|
||
|
|
"loss": 3.6223,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0033065138322495315,
|
||
|
|
"grad_norm": 18.347454692288814,
|
||
|
|
"learning_rate": 9.911894273127752e-06,
|
||
|
|
"loss": 2.8332,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.004408685109666042,
|
||
|
|
"grad_norm": 7.929104229119195,
|
||
|
|
"learning_rate": 1.3215859030837005e-05,
|
||
|
|
"loss": 2.0196,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.005510856387082552,
|
||
|
|
"grad_norm": 3.3860390518924492,
|
||
|
|
"learning_rate": 1.6519823788546254e-05,
|
||
|
|
"loss": 1.6062,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.006613027664499063,
|
||
|
|
"grad_norm": 1.5094502156188854,
|
||
|
|
"learning_rate": 1.9823788546255504e-05,
|
||
|
|
"loss": 1.3491,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.007715198941915574,
|
||
|
|
"grad_norm": 0.8243795908744608,
|
||
|
|
"learning_rate": 2.3127753303964757e-05,
|
||
|
|
"loss": 1.1719,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008817370219332083,
|
||
|
|
"grad_norm": 0.47607081050989597,
|
||
|
|
"learning_rate": 2.643171806167401e-05,
|
||
|
|
"loss": 1.1158,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009919541496748594,
|
||
|
|
"grad_norm": 0.34776556828145655,
|
||
|
|
"learning_rate": 2.9735682819383256e-05,
|
||
|
|
"loss": 1.0865,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.011021712774165105,
|
||
|
|
"grad_norm": 0.34304699002193145,
|
||
|
|
"learning_rate": 3.303964757709251e-05,
|
||
|
|
"loss": 1.0369,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012123884051581615,
|
||
|
|
"grad_norm": 0.2531395849305058,
|
||
|
|
"learning_rate": 3.634361233480176e-05,
|
||
|
|
"loss": 1.0383,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.013226055328998126,
|
||
|
|
"grad_norm": 0.21165915652546113,
|
||
|
|
"learning_rate": 3.964757709251101e-05,
|
||
|
|
"loss": 1.021,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014328226606414637,
|
||
|
|
"grad_norm": 0.1944258505202001,
|
||
|
|
"learning_rate": 4.295154185022026e-05,
|
||
|
|
"loss": 0.9806,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.015430397883831147,
|
||
|
|
"grad_norm": 0.2225104161521765,
|
||
|
|
"learning_rate": 4.625550660792951e-05,
|
||
|
|
"loss": 0.9731,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.016532569161247658,
|
||
|
|
"grad_norm": 0.18722320005454055,
|
||
|
|
"learning_rate": 4.9559471365638766e-05,
|
||
|
|
"loss": 0.9756,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.017634740438664167,
|
||
|
|
"grad_norm": 0.18788262680578474,
|
||
|
|
"learning_rate": 5.286343612334802e-05,
|
||
|
|
"loss": 0.9827,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01873691171608068,
|
||
|
|
"grad_norm": 0.15447512609368866,
|
||
|
|
"learning_rate": 5.6167400881057265e-05,
|
||
|
|
"loss": 0.9681,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.019839082993497188,
|
||
|
|
"grad_norm": 0.16941375745095524,
|
||
|
|
"learning_rate": 5.947136563876651e-05,
|
||
|
|
"loss": 0.9765,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0209412542709137,
|
||
|
|
"grad_norm": 0.14612889382972916,
|
||
|
|
"learning_rate": 6.277533039647576e-05,
|
||
|
|
"loss": 0.9359,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02204342554833021,
|
||
|
|
"grad_norm": 0.1405739682413469,
|
||
|
|
"learning_rate": 6.607929515418502e-05,
|
||
|
|
"loss": 0.9543,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.023145596825746722,
|
||
|
|
"grad_norm": 0.130605536696256,
|
||
|
|
"learning_rate": 6.938325991189426e-05,
|
||
|
|
"loss": 0.9136,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02424776810316323,
|
||
|
|
"grad_norm": 0.11300041363364274,
|
||
|
|
"learning_rate": 7.268722466960352e-05,
|
||
|
|
"loss": 0.9478,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.025349939380579743,
|
||
|
|
"grad_norm": 0.09643276514367859,
|
||
|
|
"learning_rate": 7.599118942731278e-05,
|
||
|
|
"loss": 0.9125,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.026452110657996252,
|
||
|
|
"grad_norm": 0.10011972487944946,
|
||
|
|
"learning_rate": 7.929515418502201e-05,
|
||
|
|
"loss": 0.9309,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.027554281935412765,
|
||
|
|
"grad_norm": 0.08587719618641608,
|
||
|
|
"learning_rate": 8.259911894273126e-05,
|
||
|
|
"loss": 0.9023,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.028656453212829273,
|
||
|
|
"grad_norm": 0.08726681655122204,
|
||
|
|
"learning_rate": 8.590308370044052e-05,
|
||
|
|
"loss": 0.9058,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.029758624490245786,
|
||
|
|
"grad_norm": 0.09342883329751345,
|
||
|
|
"learning_rate": 8.920704845814977e-05,
|
||
|
|
"loss": 0.9054,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.030860795767662295,
|
||
|
|
"grad_norm": 0.08738078317709104,
|
||
|
|
"learning_rate": 9.251101321585903e-05,
|
||
|
|
"loss": 0.8833,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.031962967045078804,
|
||
|
|
"grad_norm": 0.07421598879157193,
|
||
|
|
"learning_rate": 9.581497797356827e-05,
|
||
|
|
"loss": 0.9187,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.033065138322495316,
|
||
|
|
"grad_norm": 0.05948715305053877,
|
||
|
|
"learning_rate": 9.911894273127753e-05,
|
||
|
|
"loss": 0.8747,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03416730959991183,
|
||
|
|
"grad_norm": 0.07139242065324014,
|
||
|
|
"learning_rate": 0.00010242290748898678,
|
||
|
|
"loss": 0.8821,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.035269480877328334,
|
||
|
|
"grad_norm": 0.0688597307361162,
|
||
|
|
"learning_rate": 0.00010572687224669604,
|
||
|
|
"loss": 0.8818,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.036371652154744846,
|
||
|
|
"grad_norm": 0.06917882261732754,
|
||
|
|
"learning_rate": 0.00010903083700440527,
|
||
|
|
"loss": 0.9223,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03747382343216136,
|
||
|
|
"grad_norm": 0.06477308185348316,
|
||
|
|
"learning_rate": 0.00011233480176211453,
|
||
|
|
"loss": 0.8979,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03857599470957787,
|
||
|
|
"grad_norm": 0.07148480842885613,
|
||
|
|
"learning_rate": 0.00011563876651982378,
|
||
|
|
"loss": 0.8864,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.039678165986994376,
|
||
|
|
"grad_norm": 0.06963902310697093,
|
||
|
|
"learning_rate": 0.00011894273127753302,
|
||
|
|
"loss": 0.8924,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04078033726441089,
|
||
|
|
"grad_norm": 0.06681188993008794,
|
||
|
|
"learning_rate": 0.00012224669603524228,
|
||
|
|
"loss": 0.8853,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0418825085418274,
|
||
|
|
"grad_norm": 0.07882523551134729,
|
||
|
|
"learning_rate": 0.00012555066079295151,
|
||
|
|
"loss": 0.8752,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.042984679819243914,
|
||
|
|
"grad_norm": 0.07046808160734085,
|
||
|
|
"learning_rate": 0.00012885462555066077,
|
||
|
|
"loss": 0.9005,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04408685109666042,
|
||
|
|
"grad_norm": 0.08831018054166795,
|
||
|
|
"learning_rate": 0.00013215859030837003,
|
||
|
|
"loss": 0.8779,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04518902237407693,
|
||
|
|
"grad_norm": 0.06786610627531549,
|
||
|
|
"learning_rate": 0.0001354625550660793,
|
||
|
|
"loss": 0.8865,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.046291193651493444,
|
||
|
|
"grad_norm": 0.06898944984160912,
|
||
|
|
"learning_rate": 0.00013876651982378853,
|
||
|
|
"loss": 0.8951,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04739336492890995,
|
||
|
|
"grad_norm": 0.07193213519196924,
|
||
|
|
"learning_rate": 0.00014207048458149779,
|
||
|
|
"loss": 0.91,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04849553620632646,
|
||
|
|
"grad_norm": 0.06470248286974109,
|
||
|
|
"learning_rate": 0.00014537444933920705,
|
||
|
|
"loss": 0.8713,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.049597707483742974,
|
||
|
|
"grad_norm": 0.07558920025422085,
|
||
|
|
"learning_rate": 0.0001486784140969163,
|
||
|
|
"loss": 0.9003,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.050699878761159486,
|
||
|
|
"grad_norm": 0.08225650056399321,
|
||
|
|
"learning_rate": 0.00015198237885462556,
|
||
|
|
"loss": 0.8744,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05180205003857599,
|
||
|
|
"grad_norm": 0.08830347321776405,
|
||
|
|
"learning_rate": 0.0001552863436123348,
|
||
|
|
"loss": 0.8543,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.052904221315992504,
|
||
|
|
"grad_norm": 0.06812983519818898,
|
||
|
|
"learning_rate": 0.00015859030837004403,
|
||
|
|
"loss": 0.9006,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05400639259340902,
|
||
|
|
"grad_norm": 0.08404581821873025,
|
||
|
|
"learning_rate": 0.0001618942731277533,
|
||
|
|
"loss": 0.8915,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05510856387082553,
|
||
|
|
"grad_norm": 0.07050034877556227,
|
||
|
|
"learning_rate": 0.00016519823788546252,
|
||
|
|
"loss": 0.8737,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.056210735148242034,
|
||
|
|
"grad_norm": 0.060716071571555404,
|
||
|
|
"learning_rate": 0.0001685022026431718,
|
||
|
|
"loss": 0.8551,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05731290642565855,
|
||
|
|
"grad_norm": 0.06881772285936742,
|
||
|
|
"learning_rate": 0.00017180616740088104,
|
||
|
|
"loss": 0.8895,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05841507770307506,
|
||
|
|
"grad_norm": 0.0616288543822739,
|
||
|
|
"learning_rate": 0.0001751101321585903,
|
||
|
|
"loss": 0.8761,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05951724898049157,
|
||
|
|
"grad_norm": 0.06164986983886088,
|
||
|
|
"learning_rate": 0.00017841409691629953,
|
||
|
|
"loss": 0.8948,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06061942025790808,
|
||
|
|
"grad_norm": 0.08556452067304546,
|
||
|
|
"learning_rate": 0.00018171806167400882,
|
||
|
|
"loss": 0.8965,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06172159153532459,
|
||
|
|
"grad_norm": 0.06708897999556158,
|
||
|
|
"learning_rate": 0.00018502202643171805,
|
||
|
|
"loss": 0.8717,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0628237628127411,
|
||
|
|
"grad_norm": 0.06626854890594584,
|
||
|
|
"learning_rate": 0.00018832599118942728,
|
||
|
|
"loss": 0.8967,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06392593409015761,
|
||
|
|
"grad_norm": 0.06798823718932381,
|
||
|
|
"learning_rate": 0.00019162995594713654,
|
||
|
|
"loss": 0.879,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06502810536757413,
|
||
|
|
"grad_norm": 0.07021508305766244,
|
||
|
|
"learning_rate": 0.0001949339207048458,
|
||
|
|
"loss": 0.8761,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06613027664499063,
|
||
|
|
"grad_norm": 0.06347368995094158,
|
||
|
|
"learning_rate": 0.00019823788546255506,
|
||
|
|
"loss": 0.853,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06723244792240714,
|
||
|
|
"grad_norm": 0.07136142919506049,
|
||
|
|
"learning_rate": 0.0002015418502202643,
|
||
|
|
"loss": 0.8771,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06833461919982366,
|
||
|
|
"grad_norm": 0.06846198864454035,
|
||
|
|
"learning_rate": 0.00020484581497797356,
|
||
|
|
"loss": 0.8903,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06943679047724016,
|
||
|
|
"grad_norm": 0.06137623690084353,
|
||
|
|
"learning_rate": 0.0002081497797356828,
|
||
|
|
"loss": 0.853,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07053896175465667,
|
||
|
|
"grad_norm": 0.07068069693448537,
|
||
|
|
"learning_rate": 0.00021145374449339208,
|
||
|
|
"loss": 0.8868,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07164113303207319,
|
||
|
|
"grad_norm": 0.0633263499263589,
|
||
|
|
"learning_rate": 0.0002147577092511013,
|
||
|
|
"loss": 0.8847,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07274330430948969,
|
||
|
|
"grad_norm": 0.06653681784940939,
|
||
|
|
"learning_rate": 0.00021806167400881054,
|
||
|
|
"loss": 0.8725,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07384547558690621,
|
||
|
|
"grad_norm": 0.06583964059153263,
|
||
|
|
"learning_rate": 0.0002213656387665198,
|
||
|
|
"loss": 0.878,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07494764686432272,
|
||
|
|
"grad_norm": 0.08061748319197447,
|
||
|
|
"learning_rate": 0.00022466960352422906,
|
||
|
|
"loss": 0.8984,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07604981814173922,
|
||
|
|
"grad_norm": 0.0731004519013094,
|
||
|
|
"learning_rate": 0.00022797356828193832,
|
||
|
|
"loss": 0.8629,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07715198941915574,
|
||
|
|
"grad_norm": 0.06044943906403856,
|
||
|
|
"learning_rate": 0.00023127753303964755,
|
||
|
|
"loss": 0.8783,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07825416069657225,
|
||
|
|
"grad_norm": 0.06920900396817772,
|
||
|
|
"learning_rate": 0.0002345814977973568,
|
||
|
|
"loss": 0.8882,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07935633197398875,
|
||
|
|
"grad_norm": 0.06396348587422171,
|
||
|
|
"learning_rate": 0.00023788546255506604,
|
||
|
|
"loss": 0.8557,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08045850325140527,
|
||
|
|
"grad_norm": 0.06844620445410649,
|
||
|
|
"learning_rate": 0.00024118942731277533,
|
||
|
|
"loss": 0.8973,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08156067452882178,
|
||
|
|
"grad_norm": 0.06653654136399571,
|
||
|
|
"learning_rate": 0.00024449339207048456,
|
||
|
|
"loss": 0.8916,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08266284580623828,
|
||
|
|
"grad_norm": 0.06216710353519921,
|
||
|
|
"learning_rate": 0.0002477973568281938,
|
||
|
|
"loss": 0.8499,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0837650170836548,
|
||
|
|
"grad_norm": 0.05880885627082627,
|
||
|
|
"learning_rate": 0.00025110132158590303,
|
||
|
|
"loss": 0.9042,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08486718836107131,
|
||
|
|
"grad_norm": 0.06754842514493627,
|
||
|
|
"learning_rate": 0.0002544052863436123,
|
||
|
|
"loss": 0.8641,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08596935963848783,
|
||
|
|
"grad_norm": 0.06140819068848091,
|
||
|
|
"learning_rate": 0.00025770925110132155,
|
||
|
|
"loss": 0.8943,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08707153091590433,
|
||
|
|
"grad_norm": 0.061754148705009615,
|
||
|
|
"learning_rate": 0.00026101321585903083,
|
||
|
|
"loss": 0.8718,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08817370219332084,
|
||
|
|
"grad_norm": 0.06597271175452663,
|
||
|
|
"learning_rate": 0.00026431718061674007,
|
||
|
|
"loss": 0.8791,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08927587347073736,
|
||
|
|
"grad_norm": 0.058815224093820354,
|
||
|
|
"learning_rate": 0.00026762114537444935,
|
||
|
|
"loss": 0.8822,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09037804474815386,
|
||
|
|
"grad_norm": 0.06335803406426932,
|
||
|
|
"learning_rate": 0.0002709251101321586,
|
||
|
|
"loss": 0.8555,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09148021602557037,
|
||
|
|
"grad_norm": 0.061177086710910725,
|
||
|
|
"learning_rate": 0.0002742290748898678,
|
||
|
|
"loss": 0.8999,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09258238730298689,
|
||
|
|
"grad_norm": 0.0640511083565074,
|
||
|
|
"learning_rate": 0.00027753303964757705,
|
||
|
|
"loss": 0.8907,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0936845585804034,
|
||
|
|
"grad_norm": 0.06233966383032991,
|
||
|
|
"learning_rate": 0.0002808370044052863,
|
||
|
|
"loss": 0.8794,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0947867298578199,
|
||
|
|
"grad_norm": 0.057022002708166875,
|
||
|
|
"learning_rate": 0.00028414096916299557,
|
||
|
|
"loss": 0.9005,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09588890113523642,
|
||
|
|
"grad_norm": 0.0617487231342341,
|
||
|
|
"learning_rate": 0.0002874449339207048,
|
||
|
|
"loss": 0.8574,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09699107241265292,
|
||
|
|
"grad_norm": 0.07666912755096209,
|
||
|
|
"learning_rate": 0.0002907488986784141,
|
||
|
|
"loss": 0.89,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09809324369006944,
|
||
|
|
"grad_norm": 0.06908006068504674,
|
||
|
|
"learning_rate": 0.0002940528634361233,
|
||
|
|
"loss": 0.9058,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09919541496748595,
|
||
|
|
"grad_norm": 0.06466123609412561,
|
||
|
|
"learning_rate": 0.0002973568281938326,
|
||
|
|
"loss": 0.8852,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10029758624490245,
|
||
|
|
"grad_norm": 0.05620325485859758,
|
||
|
|
"learning_rate": 0.0002999999555762735,
|
||
|
|
"loss": 0.882,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10139975752231897,
|
||
|
|
"grad_norm": 0.05608554381774001,
|
||
|
|
"learning_rate": 0.0002999984007486092,
|
||
|
|
"loss": 0.8602,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10250192879973548,
|
||
|
|
"grad_norm": 0.06300983111668536,
|
||
|
|
"learning_rate": 0.0002999946247609333,
|
||
|
|
"loss": 0.8939,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10360410007715198,
|
||
|
|
"grad_norm": 0.05359836181219753,
|
||
|
|
"learning_rate": 0.00029998862766916014,
|
||
|
|
"loss": 0.8719,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1047062713545685,
|
||
|
|
"grad_norm": 0.05972412283076983,
|
||
|
|
"learning_rate": 0.0002999804095620941,
|
||
|
|
"loss": 0.8567,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10580844263198501,
|
||
|
|
"grad_norm": 0.055736449030028,
|
||
|
|
"learning_rate": 0.00029996997056142786,
|
||
|
|
"loss": 0.8928,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10691061390940153,
|
||
|
|
"grad_norm": 0.05535452244152051,
|
||
|
|
"learning_rate": 0.0002999573108217412,
|
||
|
|
"loss": 0.8815,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10801278518681803,
|
||
|
|
"grad_norm": 0.05571951712028587,
|
||
|
|
"learning_rate": 0.00029994243053049795,
|
||
|
|
"loss": 0.8273,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10911495646423454,
|
||
|
|
"grad_norm": 0.0572367057417111,
|
||
|
|
"learning_rate": 0.000299925329908044,
|
||
|
|
"loss": 0.8891,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11021712774165106,
|
||
|
|
"grad_norm": 0.05206161611783487,
|
||
|
|
"learning_rate": 0.00029990600920760355,
|
||
|
|
"loss": 0.8467,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11131929901906756,
|
||
|
|
"grad_norm": 0.057618220952806956,
|
||
|
|
"learning_rate": 0.0002998844687152753,
|
||
|
|
"loss": 0.84,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11242147029648407,
|
||
|
|
"grad_norm": 0.0731502466891883,
|
||
|
|
"learning_rate": 0.0002998607087500286,
|
||
|
|
"loss": 0.8899,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11352364157390059,
|
||
|
|
"grad_norm": 0.06531011144403108,
|
||
|
|
"learning_rate": 0.00029983472966369835,
|
||
|
|
"loss": 0.8805,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1146258128513171,
|
||
|
|
"grad_norm": 0.05719733051934535,
|
||
|
|
"learning_rate": 0.0002998065318409801,
|
||
|
|
"loss": 0.8998,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1157279841287336,
|
||
|
|
"grad_norm": 0.04786336091320019,
|
||
|
|
"learning_rate": 0.0002997761156994242,
|
||
|
|
"loss": 0.8454,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11683015540615012,
|
||
|
|
"grad_norm": 0.061209707796165434,
|
||
|
|
"learning_rate": 0.00029974348168942944,
|
||
|
|
"loss": 0.894,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11793232668356662,
|
||
|
|
"grad_norm": 0.05567071955981122,
|
||
|
|
"learning_rate": 0.0002997086302942368,
|
||
|
|
"loss": 0.8791,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11903449796098314,
|
||
|
|
"grad_norm": 0.051378774159055826,
|
||
|
|
"learning_rate": 0.00029967156202992184,
|
||
|
|
"loss": 0.8908,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12013666923839965,
|
||
|
|
"grad_norm": 0.06416620508611666,
|
||
|
|
"learning_rate": 0.0002996322774453875,
|
||
|
|
"loss": 0.912,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12123884051581615,
|
||
|
|
"grad_norm": 0.049933851892099514,
|
||
|
|
"learning_rate": 0.0002995907771223556,
|
||
|
|
"loss": 0.8819,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12234101179323267,
|
||
|
|
"grad_norm": 0.052571228924698295,
|
||
|
|
"learning_rate": 0.00029954706167535834,
|
||
|
|
"loss": 0.8926,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12344318307064918,
|
||
|
|
"grad_norm": 0.05946512503425042,
|
||
|
|
"learning_rate": 0.0002995011317517294,
|
||
|
|
"loss": 0.8767,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12454535434806568,
|
||
|
|
"grad_norm": 0.05379133265550323,
|
||
|
|
"learning_rate": 0.0002994529880315941,
|
||
|
|
"loss": 0.8541,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1256475256254822,
|
||
|
|
"grad_norm": 0.04996259410414281,
|
||
|
|
"learning_rate": 0.00029940263122785936,
|
||
|
|
"loss": 0.8975,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12674969690289872,
|
||
|
|
"grad_norm": 0.06293201636932731,
|
||
|
|
"learning_rate": 0.0002993500620862033,
|
||
|
|
"loss": 0.8538,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12785186818031521,
|
||
|
|
"grad_norm": 0.0549834121731086,
|
||
|
|
"learning_rate": 0.000299295281385064,
|
||
|
|
"loss": 0.8766,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12895403945773173,
|
||
|
|
"grad_norm": 0.06031562586347478,
|
||
|
|
"learning_rate": 0.00029923828993562814,
|
||
|
|
"loss": 0.8519,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13005621073514825,
|
||
|
|
"grad_norm": 0.05792440007594611,
|
||
|
|
"learning_rate": 0.00029917908858181897,
|
||
|
|
"loss": 0.8295,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13115838201256474,
|
||
|
|
"grad_norm": 0.22881705102396294,
|
||
|
|
"learning_rate": 0.00029911767820028364,
|
||
|
|
"loss": 0.8934,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13226055328998126,
|
||
|
|
"grad_norm": 0.2022359302630765,
|
||
|
|
"learning_rate": 0.0002990540597003804,
|
||
|
|
"loss": 0.9332,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13336272456739778,
|
||
|
|
"grad_norm": 0.07299327465806,
|
||
|
|
"learning_rate": 0.0002989882340241651,
|
||
|
|
"loss": 0.8848,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13446489584481428,
|
||
|
|
"grad_norm": 0.06761831265802426,
|
||
|
|
"learning_rate": 0.0002989202021463772,
|
||
|
|
"loss": 0.8613,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1355670671222308,
|
||
|
|
"grad_norm": 0.08020708226646049,
|
||
|
|
"learning_rate": 0.0002988499650744254,
|
||
|
|
"loss": 0.8961,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1366692383996473,
|
||
|
|
"grad_norm": 0.08560660775941882,
|
||
|
|
"learning_rate": 0.0002987775238483725,
|
||
|
|
"loss": 0.9122,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1377714096770638,
|
||
|
|
"grad_norm": 0.05636244041478988,
|
||
|
|
"learning_rate": 0.0002987028795409204,
|
||
|
|
"loss": 0.8427,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13887358095448032,
|
||
|
|
"grad_norm": 1.0715539508022556,
|
||
|
|
"learning_rate": 0.0002986260332573939,
|
||
|
|
"loss": 0.8535,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13997575223189684,
|
||
|
|
"grad_norm": 0.07028650609498803,
|
||
|
|
"learning_rate": 0.0002985469861357243,
|
||
|
|
"loss": 0.8843,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14107792350931334,
|
||
|
|
"grad_norm": 0.10912168092090672,
|
||
|
|
"learning_rate": 0.0002984657393464329,
|
||
|
|
"loss": 0.8802,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14218009478672985,
|
||
|
|
"grad_norm": 0.07320669082767357,
|
||
|
|
"learning_rate": 0.0002983822940926133,
|
||
|
|
"loss": 0.8534,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14328226606414637,
|
||
|
|
"grad_norm": 0.049879806960918614,
|
||
|
|
"learning_rate": 0.0002982966516099137,
|
||
|
|
"loss": 0.8661,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14438443734156287,
|
||
|
|
"grad_norm": 0.05068348497813321,
|
||
|
|
"learning_rate": 0.00029820881316651866,
|
||
|
|
"loss": 0.881,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14548660861897939,
|
||
|
|
"grad_norm": 0.05280972060027596,
|
||
|
|
"learning_rate": 0.00029811878006313046,
|
||
|
|
"loss": 0.8552,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1465887798963959,
|
||
|
|
"grad_norm": 0.060948440074130784,
|
||
|
|
"learning_rate": 0.00029802655363294934,
|
||
|
|
"loss": 0.8694,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14769095117381242,
|
||
|
|
"grad_norm": 0.051871602193463616,
|
||
|
|
"learning_rate": 0.0002979321352416543,
|
||
|
|
"loss": 0.8482,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14879312245122892,
|
||
|
|
"grad_norm": 0.05140620390257059,
|
||
|
|
"learning_rate": 0.0002978355262873826,
|
||
|
|
"loss": 0.871,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14989529372864543,
|
||
|
|
"grad_norm": 0.053923668745928,
|
||
|
|
"learning_rate": 0.00029773672820070915,
|
||
|
|
"loss": 0.8617,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15099746500606195,
|
||
|
|
"grad_norm": 0.05491254789252112,
|
||
|
|
"learning_rate": 0.0002976357424446253,
|
||
|
|
"loss": 0.8688,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15209963628347845,
|
||
|
|
"grad_norm": 0.057725313291247395,
|
||
|
|
"learning_rate": 0.00029753257051451707,
|
||
|
|
"loss": 0.8725,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15320180756089496,
|
||
|
|
"grad_norm": 0.06175295050381468,
|
||
|
|
"learning_rate": 0.0002974272139381433,
|
||
|
|
"loss": 0.8721,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15430397883831148,
|
||
|
|
"grad_norm": 0.05416095170725182,
|
||
|
|
"learning_rate": 0.00029731967427561266,
|
||
|
|
"loss": 0.8477,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15540615011572798,
|
||
|
|
"grad_norm": 0.05008825843504415,
|
||
|
|
"learning_rate": 0.00029720995311936077,
|
||
|
|
"loss": 0.8539,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1565083213931445,
|
||
|
|
"grad_norm": 0.048098359842914856,
|
||
|
|
"learning_rate": 0.0002970980520941266,
|
||
|
|
"loss": 0.8391,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15761049267056101,
|
||
|
|
"grad_norm": 0.05792884649127905,
|
||
|
|
"learning_rate": 0.00029698397285692833,
|
||
|
|
"loss": 0.836,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1587126639479775,
|
||
|
|
"grad_norm": 0.04553225276243662,
|
||
|
|
"learning_rate": 0.000296867717097039,
|
||
|
|
"loss": 0.8407,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15981483522539403,
|
||
|
|
"grad_norm": 0.04857966778373228,
|
||
|
|
"learning_rate": 0.0002967492865359611,
|
||
|
|
"loss": 0.843,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16091700650281054,
|
||
|
|
"grad_norm": 0.05464944380446163,
|
||
|
|
"learning_rate": 0.00029662868292740165,
|
||
|
|
"loss": 0.85,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16201917778022704,
|
||
|
|
"grad_norm": 0.045834951820991766,
|
||
|
|
"learning_rate": 0.00029650590805724574,
|
||
|
|
"loss": 0.8661,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16312134905764356,
|
||
|
|
"grad_norm": 0.053929904093643635,
|
||
|
|
"learning_rate": 0.0002963809637435303,
|
||
|
|
"loss": 0.9115,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16422352033506007,
|
||
|
|
"grad_norm": 0.04871618742526512,
|
||
|
|
"learning_rate": 0.00029625385183641706,
|
||
|
|
"loss": 0.845,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16532569161247657,
|
||
|
|
"grad_norm": 0.05247129804792461,
|
||
|
|
"learning_rate": 0.00029612457421816546,
|
||
|
|
"loss": 0.8772,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16642786288989309,
|
||
|
|
"grad_norm": 0.04850518585643222,
|
||
|
|
"learning_rate": 0.0002959931328031043,
|
||
|
|
"loss": 0.8687,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1675300341673096,
|
||
|
|
"grad_norm": 0.05475798664220526,
|
||
|
|
"learning_rate": 0.00029585952953760386,
|
||
|
|
"loss": 0.8666,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16863220544472612,
|
||
|
|
"grad_norm": 0.04884060269417904,
|
||
|
|
"learning_rate": 0.00029572376640004674,
|
||
|
|
"loss": 0.8681,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16973437672214262,
|
||
|
|
"grad_norm": 0.04948889650089674,
|
||
|
|
"learning_rate": 0.00029558584540079864,
|
||
|
|
"loss": 0.8822,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17083654799955914,
|
||
|
|
"grad_norm": 0.044346461043723126,
|
||
|
|
"learning_rate": 0.0002954457685821789,
|
||
|
|
"loss": 0.8656,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17193871927697565,
|
||
|
|
"grad_norm": 0.05856011647955299,
|
||
|
|
"learning_rate": 0.0002953035380184296,
|
||
|
|
"loss": 0.8487,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17304089055439215,
|
||
|
|
"grad_norm": 0.05004499788895783,
|
||
|
|
"learning_rate": 0.0002951591558156856,
|
||
|
|
"loss": 0.8219,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17414306183180867,
|
||
|
|
"grad_norm": 0.04373207037056602,
|
||
|
|
"learning_rate": 0.0002950126241119429,
|
||
|
|
"loss": 0.8712,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17524523310922518,
|
||
|
|
"grad_norm": 0.04536155490811825,
|
||
|
|
"learning_rate": 0.0002948639450770269,
|
||
|
|
"loss": 0.8616,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17634740438664168,
|
||
|
|
"grad_norm": 0.04345801787038758,
|
||
|
|
"learning_rate": 0.0002947131209125607,
|
||
|
|
"loss": 0.859,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1774495756640582,
|
||
|
|
"grad_norm": 0.045860871445007584,
|
||
|
|
"learning_rate": 0.0002945601538519321,
|
||
|
|
"loss": 0.8497,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17855174694147471,
|
||
|
|
"grad_norm": 0.04997067347593218,
|
||
|
|
"learning_rate": 0.0002944050461602607,
|
||
|
|
"loss": 0.8428,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1796539182188912,
|
||
|
|
"grad_norm": 0.04961878609503156,
|
||
|
|
"learning_rate": 0.00029424780013436434,
|
||
|
|
"loss": 0.8582,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18075608949630773,
|
||
|
|
"grad_norm": 0.047930042373250895,
|
||
|
|
"learning_rate": 0.0002940884181027251,
|
||
|
|
"loss": 0.8523,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18185826077372425,
|
||
|
|
"grad_norm": 0.05532903520929939,
|
||
|
|
"learning_rate": 0.0002939269024254547,
|
||
|
|
"loss": 0.8544,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18296043205114074,
|
||
|
|
"grad_norm": 0.051023957267831634,
|
||
|
|
"learning_rate": 0.0002937632554942598,
|
||
|
|
"loss": 0.8419,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18406260332855726,
|
||
|
|
"grad_norm": 0.0453229581659907,
|
||
|
|
"learning_rate": 0.0002935974797324064,
|
||
|
|
"loss": 0.8335,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18516477460597378,
|
||
|
|
"grad_norm": 0.04606389790907126,
|
||
|
|
"learning_rate": 0.0002934295775946839,
|
||
|
|
"loss": 0.8368,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18626694588339027,
|
||
|
|
"grad_norm": 0.045473867263415464,
|
||
|
|
"learning_rate": 0.00029325955156736885,
|
||
|
|
"loss": 0.8304,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1873691171608068,
|
||
|
|
"grad_norm": 0.044416090960029395,
|
||
|
|
"learning_rate": 0.0002930874041681883,
|
||
|
|
"loss": 0.8526,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1884712884382233,
|
||
|
|
"grad_norm": 0.05975247770022884,
|
||
|
|
"learning_rate": 0.0002929131379462821,
|
||
|
|
"loss": 0.8442,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1895734597156398,
|
||
|
|
"grad_norm": 0.048173050625599255,
|
||
|
|
"learning_rate": 0.00029273675548216563,
|
||
|
|
"loss": 0.8725,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19067563099305632,
|
||
|
|
"grad_norm": 0.04762107623243284,
|
||
|
|
"learning_rate": 0.0002925582593876912,
|
||
|
|
"loss": 0.8666,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19177780227047284,
|
||
|
|
"grad_norm": 0.05283512920588099,
|
||
|
|
"learning_rate": 0.0002923776523060095,
|
||
|
|
"loss": 0.8584,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19287997354788936,
|
||
|
|
"grad_norm": 0.05051732507674948,
|
||
|
|
"learning_rate": 0.0002921949369115307,
|
||
|
|
"loss": 0.8685,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19398214482530585,
|
||
|
|
"grad_norm": 0.04737129568146193,
|
||
|
|
"learning_rate": 0.00029201011590988444,
|
||
|
|
"loss": 0.8108,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19508431610272237,
|
||
|
|
"grad_norm": 0.052972441854251665,
|
||
|
|
"learning_rate": 0.00029182319203788,
|
||
|
|
"loss": 0.8554,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19618648738013889,
|
||
|
|
"grad_norm": 0.045056064701286494,
|
||
|
|
"learning_rate": 0.0002916341680634657,
|
||
|
|
"loss": 0.8271,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19728865865755538,
|
||
|
|
"grad_norm": 0.04699409375298448,
|
||
|
|
"learning_rate": 0.00029144304678568807,
|
||
|
|
"loss": 0.836,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1983908299349719,
|
||
|
|
"grad_norm": 0.048175359720427025,
|
||
|
|
"learning_rate": 0.00029124983103465026,
|
||
|
|
"loss": 0.8541,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19949300121238842,
|
||
|
|
"grad_norm": 0.04727829823889592,
|
||
|
|
"learning_rate": 0.00029105452367147,
|
||
|
|
"loss": 0.8502,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2005951724898049,
|
||
|
|
"grad_norm": 0.05148206426329491,
|
||
|
|
"learning_rate": 0.0002908571275882376,
|
||
|
|
"loss": 0.8453,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20169734376722143,
|
||
|
|
"grad_norm": 0.0514916225658676,
|
||
|
|
"learning_rate": 0.00029065764570797276,
|
||
|
|
"loss": 0.8609,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20279951504463795,
|
||
|
|
"grad_norm": 0.04883020506266138,
|
||
|
|
"learning_rate": 0.0002904560809845814,
|
||
|
|
"loss": 0.8461,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20390168632205444,
|
||
|
|
"grad_norm": 0.04568888763813157,
|
||
|
|
"learning_rate": 0.00029025243640281223,
|
||
|
|
"loss": 0.8827,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20500385759947096,
|
||
|
|
"grad_norm": 0.05356399559573866,
|
||
|
|
"learning_rate": 0.0002900467149782118,
|
||
|
|
"loss": 0.8606,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20610602887688748,
|
||
|
|
"grad_norm": 0.04851687061223907,
|
||
|
|
"learning_rate": 0.0002898389197570808,
|
||
|
|
"loss": 0.8586,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20720820015430397,
|
||
|
|
"grad_norm": 0.04669683817318346,
|
||
|
|
"learning_rate": 0.00028962905381642827,
|
||
|
|
"loss": 0.834,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2083103714317205,
|
||
|
|
"grad_norm": 0.04842521203522015,
|
||
|
|
"learning_rate": 0.0002894171202639262,
|
||
|
|
"loss": 0.8352,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.209412542709137,
|
||
|
|
"grad_norm": 0.0440579152345845,
|
||
|
|
"learning_rate": 0.0002892031222378635,
|
||
|
|
"loss": 0.8324,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2105147139865535,
|
||
|
|
"grad_norm": 0.04627079587508343,
|
||
|
|
"learning_rate": 0.0002889870629070998,
|
||
|
|
"loss": 0.8253,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21161688526397002,
|
||
|
|
"grad_norm": 0.049721876458835836,
|
||
|
|
"learning_rate": 0.0002887689454710182,
|
||
|
|
"loss": 0.8322,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21271905654138654,
|
||
|
|
"grad_norm": 0.04459805370567298,
|
||
|
|
"learning_rate": 0.0002885487731594779,
|
||
|
|
"loss": 0.8522,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21382122781880306,
|
||
|
|
"grad_norm": 0.04335466084634585,
|
||
|
|
"learning_rate": 0.0002883265492327666,
|
||
|
|
"loss": 0.8385,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21492339909621955,
|
||
|
|
"grad_norm": 0.04968671178179274,
|
||
|
|
"learning_rate": 0.000288102276981552,
|
||
|
|
"loss": 0.8293,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21602557037363607,
|
||
|
|
"grad_norm": 0.04449544356929534,
|
||
|
|
"learning_rate": 0.00028787595972683326,
|
||
|
|
"loss": 0.8444,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21712774165105259,
|
||
|
|
"grad_norm": 0.043972214438083426,
|
||
|
|
"learning_rate": 0.0002876476008198917,
|
||
|
|
"loss": 0.8337,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21822991292846908,
|
||
|
|
"grad_norm": 0.04311951621843307,
|
||
|
|
"learning_rate": 0.00028741720364224113,
|
||
|
|
"loss": 0.851,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2193320842058856,
|
||
|
|
"grad_norm": 0.045060530710752784,
|
||
|
|
"learning_rate": 0.000287184771605578,
|
||
|
|
"loss": 0.8404,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22043425548330212,
|
||
|
|
"grad_norm": 0.045664652704444204,
|
||
|
|
"learning_rate": 0.0002869503081517305,
|
||
|
|
"loss": 0.8181,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2215364267607186,
|
||
|
|
"grad_norm": 0.048272419665718394,
|
||
|
|
"learning_rate": 0.0002867138167526081,
|
||
|
|
"loss": 0.851,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22263859803813513,
|
||
|
|
"grad_norm": 0.047040645763608745,
|
||
|
|
"learning_rate": 0.0002864753009101497,
|
||
|
|
"loss": 0.8187,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22374076931555165,
|
||
|
|
"grad_norm": 0.0428558339017769,
|
||
|
|
"learning_rate": 0.00028623476415627185,
|
||
|
|
"loss": 0.8425,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22484294059296814,
|
||
|
|
"grad_norm": 0.04197047231886558,
|
||
|
|
"learning_rate": 0.0002859922100528168,
|
||
|
|
"loss": 0.8565,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22594511187038466,
|
||
|
|
"grad_norm": 0.044195983390617165,
|
||
|
|
"learning_rate": 0.0002857476421914993,
|
||
|
|
"loss": 0.8265,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22704728314780118,
|
||
|
|
"grad_norm": 0.04490807738250268,
|
||
|
|
"learning_rate": 0.0002855010641938536,
|
||
|
|
"loss": 0.8273,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22814945442521767,
|
||
|
|
"grad_norm": 0.045467911300816795,
|
||
|
|
"learning_rate": 0.00028525247971118,
|
||
|
|
"loss": 0.8448,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2292516257026342,
|
||
|
|
"grad_norm": 0.06605089099746904,
|
||
|
|
"learning_rate": 0.0002850018924244903,
|
||
|
|
"loss": 0.8452,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2303537969800507,
|
||
|
|
"grad_norm": 0.04096505035492289,
|
||
|
|
"learning_rate": 0.00028474930604445404,
|
||
|
|
"loss": 0.8205,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2314559682574672,
|
||
|
|
"grad_norm": 0.04304934507293174,
|
||
|
|
"learning_rate": 0.0002844947243113427,
|
||
|
|
"loss": 0.8488,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23255813953488372,
|
||
|
|
"grad_norm": 0.04230208217283045,
|
||
|
|
"learning_rate": 0.000284238150994975,
|
||
|
|
"loss": 0.8376,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23366031081230024,
|
||
|
|
"grad_norm": 0.04708086600574921,
|
||
|
|
"learning_rate": 0.00028397958989466064,
|
||
|
|
"loss": 0.8231,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23476248208971673,
|
||
|
|
"grad_norm": 0.046838021251746416,
|
||
|
|
"learning_rate": 0.00028371904483914437,
|
||
|
|
"loss": 0.8284,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23586465336713325,
|
||
|
|
"grad_norm": 0.05160315046793239,
|
||
|
|
"learning_rate": 0.00028345651968654897,
|
||
|
|
"loss": 0.8489,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23696682464454977,
|
||
|
|
"grad_norm": 0.04831039086387138,
|
||
|
|
"learning_rate": 0.0002831920183243184,
|
||
|
|
"loss": 0.8611,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2380689959219663,
|
||
|
|
"grad_norm": 0.05287417954296867,
|
||
|
|
"learning_rate": 0.00028292554466916004,
|
||
|
|
"loss": 0.8323,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23917116719938278,
|
||
|
|
"grad_norm": 0.04693015235603077,
|
||
|
|
"learning_rate": 0.00028265710266698685,
|
||
|
|
"loss": 0.8632,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2402733384767993,
|
||
|
|
"grad_norm": 0.041988241282154073,
|
||
|
|
"learning_rate": 0.00028238669629285885,
|
||
|
|
"loss": 0.8068,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24137550975421582,
|
||
|
|
"grad_norm": 0.04810130529903119,
|
||
|
|
"learning_rate": 0.0002821143295509241,
|
||
|
|
"loss": 0.8193,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2424776810316323,
|
||
|
|
"grad_norm": 0.04870507456288239,
|
||
|
|
"learning_rate": 0.0002818400064743599,
|
||
|
|
"loss": 0.8726,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24357985230904883,
|
||
|
|
"grad_norm": 0.04918127091170888,
|
||
|
|
"learning_rate": 0.00028156373112531234,
|
||
|
|
"loss": 0.8501,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24468202358646535,
|
||
|
|
"grad_norm": 0.04501871928173725,
|
||
|
|
"learning_rate": 0.0002812855075948369,
|
||
|
|
"loss": 0.8623,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24578419486388184,
|
||
|
|
"grad_norm": 0.04700064721205868,
|
||
|
|
"learning_rate": 0.00028100534000283727,
|
||
|
|
"loss": 0.8334,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24688636614129836,
|
||
|
|
"grad_norm": 0.043956386695142506,
|
||
|
|
"learning_rate": 0.0002807232324980048,
|
||
|
|
"loss": 0.8729,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24798853741871488,
|
||
|
|
"grad_norm": 0.044369171455137094,
|
||
|
|
"learning_rate": 0.00028043918925775666,
|
||
|
|
"loss": 0.8198,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24909070869613137,
|
||
|
|
"grad_norm": 0.04924934475039299,
|
||
|
|
"learning_rate": 0.00028015321448817435,
|
||
|
|
"loss": 0.8425,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2501928799735479,
|
||
|
|
"grad_norm": 0.052400033371239746,
|
||
|
|
"learning_rate": 0.0002798653124239411,
|
||
|
|
"loss": 0.8627,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2512950512509644,
|
||
|
|
"grad_norm": 0.04354647857372681,
|
||
|
|
"learning_rate": 0.0002795754873282794,
|
||
|
|
"loss": 0.8052,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2523972225283809,
|
||
|
|
"grad_norm": 0.050485861813274184,
|
||
|
|
"learning_rate": 0.0002792837434928878,
|
||
|
|
"loss": 0.8437,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25349939380579745,
|
||
|
|
"grad_norm": 0.04827775784152042,
|
||
|
|
"learning_rate": 0.00027899008523787726,
|
||
|
|
"loss": 0.8595,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2546015650832139,
|
||
|
|
"grad_norm": 0.04298338268947314,
|
||
|
|
"learning_rate": 0.0002786945169117073,
|
||
|
|
"loss": 0.8306,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25570373636063043,
|
||
|
|
"grad_norm": 0.04907197256608099,
|
||
|
|
"learning_rate": 0.0002783970428911216,
|
||
|
|
"loss": 0.8305,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25680590763804695,
|
||
|
|
"grad_norm": 0.049467752786233936,
|
||
|
|
"learning_rate": 0.000278097667581083,
|
||
|
|
"loss": 0.848,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25790807891546347,
|
||
|
|
"grad_norm": 0.04656416214482384,
|
||
|
|
"learning_rate": 0.0002777963954147087,
|
||
|
|
"loss": 0.8165,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25901025019288,
|
||
|
|
"grad_norm": 0.051521287489935036,
|
||
|
|
"learning_rate": 0.0002774932308532041,
|
||
|
|
"loss": 0.8362,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2601124214702965,
|
||
|
|
"grad_norm": 0.053549824085333694,
|
||
|
|
"learning_rate": 0.00027718817838579706,
|
||
|
|
"loss": 0.8267,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26121459274771297,
|
||
|
|
"grad_norm": 0.054270101506079374,
|
||
|
|
"learning_rate": 0.0002768812425296714,
|
||
|
|
"loss": 0.8119,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2623167640251295,
|
||
|
|
"grad_norm": 0.05333532704116092,
|
||
|
|
"learning_rate": 0.00027657242782989987,
|
||
|
|
"loss": 0.8099,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.263418935302546,
|
||
|
|
"grad_norm": 0.046490067153489543,
|
||
|
|
"learning_rate": 0.00027626173885937703,
|
||
|
|
"loss": 0.806,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26452110657996253,
|
||
|
|
"grad_norm": 0.04891723101243592,
|
||
|
|
"learning_rate": 0.0002759491802187513,
|
||
|
|
"loss": 0.8336,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26562327785737905,
|
||
|
|
"grad_norm": 0.052736333806997764,
|
||
|
|
"learning_rate": 0.00027563475653635713,
|
||
|
|
"loss": 0.8471,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26672544913479557,
|
||
|
|
"grad_norm": 0.04347575493484375,
|
||
|
|
"learning_rate": 0.00027531847246814613,
|
||
|
|
"loss": 0.8388,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26782762041221203,
|
||
|
|
"grad_norm": 0.044934221766930585,
|
||
|
|
"learning_rate": 0.00027500033269761855,
|
||
|
|
"loss": 0.8382,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26892979168962855,
|
||
|
|
"grad_norm": 0.04472911222526219,
|
||
|
|
"learning_rate": 0.0002746803419357534,
|
||
|
|
"loss": 0.823,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27003196296704507,
|
||
|
|
"grad_norm": 0.042907862680138985,
|
||
|
|
"learning_rate": 0.0002743585049209391,
|
||
|
|
"loss": 0.8217,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2711341342444616,
|
||
|
|
"grad_norm": 0.0430952497583985,
|
||
|
|
"learning_rate": 0.00027403482641890324,
|
||
|
|
"loss": 0.8148,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2722363055218781,
|
||
|
|
"grad_norm": 0.04625178973487922,
|
||
|
|
"learning_rate": 0.0002737093112226418,
|
||
|
|
"loss": 0.8633,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2733384767992946,
|
||
|
|
"grad_norm": 0.05941599713124254,
|
||
|
|
"learning_rate": 0.00027338196415234857,
|
||
|
|
"loss": 0.8307,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27444064807671115,
|
||
|
|
"grad_norm": 0.055472061651380494,
|
||
|
|
"learning_rate": 0.0002730527900553432,
|
||
|
|
"loss": 0.8527,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2755428193541276,
|
||
|
|
"grad_norm": 0.04678339387739163,
|
||
|
|
"learning_rate": 0.00027272179380600006,
|
||
|
|
"loss": 0.849,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27664499063154413,
|
||
|
|
"grad_norm": 0.04692059237171032,
|
||
|
|
"learning_rate": 0.0002723889803056756,
|
||
|
|
"loss": 0.8706,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27774716190896065,
|
||
|
|
"grad_norm": 0.04559570312510178,
|
||
|
|
"learning_rate": 0.00027205435448263593,
|
||
|
|
"loss": 0.8418,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27884933318637717,
|
||
|
|
"grad_norm": 0.0419292041399672,
|
||
|
|
"learning_rate": 0.0002717179212919838,
|
||
|
|
"loss": 0.8583,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2799515044637937,
|
||
|
|
"grad_norm": 0.04430167866474891,
|
||
|
|
"learning_rate": 0.00027137968571558553,
|
||
|
|
"loss": 0.8333,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2810536757412102,
|
||
|
|
"grad_norm": 0.0456691993264945,
|
||
|
|
"learning_rate": 0.00027103965276199647,
|
||
|
|
"loss": 0.8447,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28215584701862667,
|
||
|
|
"grad_norm": 0.04665005990926171,
|
||
|
|
"learning_rate": 0.0002706978274663879,
|
||
|
|
"loss": 0.7695,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2832580182960432,
|
||
|
|
"grad_norm": 0.043712884590464574,
|
||
|
|
"learning_rate": 0.0002703542148904715,
|
||
|
|
"loss": 0.8267,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2843601895734597,
|
||
|
|
"grad_norm": 0.04652550786503771,
|
||
|
|
"learning_rate": 0.00027000882012242496,
|
||
|
|
"loss": 0.8437,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28546236085087623,
|
||
|
|
"grad_norm": 0.045612398832845166,
|
||
|
|
"learning_rate": 0.00026966164827681643,
|
||
|
|
"loss": 0.8138,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28656453212829275,
|
||
|
|
"grad_norm": 0.04839127808608333,
|
||
|
|
"learning_rate": 0.00026931270449452897,
|
||
|
|
"loss": 0.8372,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28766670340570927,
|
||
|
|
"grad_norm": 0.0445074064874431,
|
||
|
|
"learning_rate": 0.000268961993942684,
|
||
|
|
"loss": 0.8214,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28876887468312573,
|
||
|
|
"grad_norm": 0.0402466463654265,
|
||
|
|
"learning_rate": 0.0002686095218145654,
|
||
|
|
"loss": 0.8086,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28987104596054225,
|
||
|
|
"grad_norm": 0.05239340243182599,
|
||
|
|
"learning_rate": 0.000268255293329542,
|
||
|
|
"loss": 0.8368,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29097321723795877,
|
||
|
|
"grad_norm": 0.04600417253087546,
|
||
|
|
"learning_rate": 0.0002678993137329908,
|
||
|
|
"loss": 0.8081,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2920753885153753,
|
||
|
|
"grad_norm": 0.04275984799543839,
|
||
|
|
"learning_rate": 0.0002675415882962189,
|
||
|
|
"loss": 0.8257,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2931775597927918,
|
||
|
|
"grad_norm": 0.044570824361962434,
|
||
|
|
"learning_rate": 0.0002671821223163858,
|
||
|
|
"loss": 0.8208,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29427973107020833,
|
||
|
|
"grad_norm": 0.045119447979777474,
|
||
|
|
"learning_rate": 0.0002668209211164244,
|
||
|
|
"loss": 0.8488,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29538190234762485,
|
||
|
|
"grad_norm": 0.04036708835023624,
|
||
|
|
"learning_rate": 0.00026645799004496306,
|
||
|
|
"loss": 0.8512,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2964840736250413,
|
||
|
|
"grad_norm": 0.041845363942691824,
|
||
|
|
"learning_rate": 0.0002660933344762455,
|
||
|
|
"loss": 0.8228,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29758624490245783,
|
||
|
|
"grad_norm": 0.052565125884796726,
|
||
|
|
"learning_rate": 0.0002657269598100518,
|
||
|
|
"loss": 0.833,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29868841617987435,
|
||
|
|
"grad_norm": 0.05263005472080342,
|
||
|
|
"learning_rate": 0.0002653588714716181,
|
||
|
|
"loss": 0.8482,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29979058745729087,
|
||
|
|
"grad_norm": 0.04489153452747211,
|
||
|
|
"learning_rate": 0.00026498907491155665,
|
||
|
|
"loss": 0.7975,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3008927587347074,
|
||
|
|
"grad_norm": 0.0477109850798695,
|
||
|
|
"learning_rate": 0.0002646175756057745,
|
||
|
|
"loss": 0.8168,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3019949300121239,
|
||
|
|
"grad_norm": 0.04476257759581266,
|
||
|
|
"learning_rate": 0.00026424437905539315,
|
||
|
|
"loss": 0.8062,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30309710128954037,
|
||
|
|
"grad_norm": 0.04415673060651443,
|
||
|
|
"learning_rate": 0.00026386949078666653,
|
||
|
|
"loss": 0.8352,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3041992725669569,
|
||
|
|
"grad_norm": 0.04044906513322972,
|
||
|
|
"learning_rate": 0.0002634929163508993,
|
||
|
|
"loss": 0.8299,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3053014438443734,
|
||
|
|
"grad_norm": 0.047407594480881006,
|
||
|
|
"learning_rate": 0.0002631146613243648,
|
||
|
|
"loss": 0.8509,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30640361512178993,
|
||
|
|
"grad_norm": 0.04212908230670688,
|
||
|
|
"learning_rate": 0.00026273473130822235,
|
||
|
|
"loss": 0.8348,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30750578639920645,
|
||
|
|
"grad_norm": 0.04777049716354304,
|
||
|
|
"learning_rate": 0.0002623531319284343,
|
||
|
|
"loss": 0.8477,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30860795767662297,
|
||
|
|
"grad_norm": 0.048704830358582564,
|
||
|
|
"learning_rate": 0.00026196986883568284,
|
||
|
|
"loss": 0.8514,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30971012895403943,
|
||
|
|
"grad_norm": 0.05238527702204111,
|
||
|
|
"learning_rate": 0.00026158494770528614,
|
||
|
|
"loss": 0.82,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31081230023145595,
|
||
|
|
"grad_norm": 0.05352166328603628,
|
||
|
|
"learning_rate": 0.0002611983742371144,
|
||
|
|
"loss": 0.8293,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31191447150887247,
|
||
|
|
"grad_norm": 0.04537796100997172,
|
||
|
|
"learning_rate": 0.0002608101541555056,
|
||
|
|
"loss": 0.8001,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.313016642786289,
|
||
|
|
"grad_norm": 0.04950116503834519,
|
||
|
|
"learning_rate": 0.0002604202932091805,
|
||
|
|
"loss": 0.8406,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3141188140637055,
|
||
|
|
"grad_norm": 0.042514906932797684,
|
||
|
|
"learning_rate": 0.0002600287971711576,
|
||
|
|
"loss": 0.8467,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31522098534112203,
|
||
|
|
"grad_norm": 0.03957295732321364,
|
||
|
|
"learning_rate": 0.0002596356718386676,
|
||
|
|
"loss": 0.8457,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31632315661853855,
|
||
|
|
"grad_norm": 0.04155394927306569,
|
||
|
|
"learning_rate": 0.0002592409230330677,
|
||
|
|
"loss": 0.8087,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.317425327895955,
|
||
|
|
"grad_norm": 0.04562885194483694,
|
||
|
|
"learning_rate": 0.0002588445565997554,
|
||
|
|
"loss": 0.8394,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31852749917337153,
|
||
|
|
"grad_norm": 0.04641875664850794,
|
||
|
|
"learning_rate": 0.0002584465784080817,
|
||
|
|
"loss": 0.8407,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.31962967045078805,
|
||
|
|
"grad_norm": 0.0430053369111926,
|
||
|
|
"learning_rate": 0.0002580469943512644,
|
||
|
|
"loss": 0.8494,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32073184172820457,
|
||
|
|
"grad_norm": 0.04973211543509925,
|
||
|
|
"learning_rate": 0.0002576458103463007,
|
||
|
|
"loss": 0.798,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3218340130056211,
|
||
|
|
"grad_norm": 0.04129288667946417,
|
||
|
|
"learning_rate": 0.00025724303233387987,
|
||
|
|
"loss": 0.8446,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3229361842830376,
|
||
|
|
"grad_norm": 0.04473055100882414,
|
||
|
|
"learning_rate": 0.00025683866627829486,
|
||
|
|
"loss": 0.8455,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32403835556045407,
|
||
|
|
"grad_norm": 0.046468538153449125,
|
||
|
|
"learning_rate": 0.00025643271816735416,
|
||
|
|
"loss": 0.8194,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3251405268378706,
|
||
|
|
"grad_norm": 0.039997009480838126,
|
||
|
|
"learning_rate": 0.0002560251940122935,
|
||
|
|
"loss": 0.8198,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3262426981152871,
|
||
|
|
"grad_norm": 0.045094913214168995,
|
||
|
|
"learning_rate": 0.000255616099847686,
|
||
|
|
"loss": 0.8099,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32734486939270363,
|
||
|
|
"grad_norm": 0.04423782452900588,
|
||
|
|
"learning_rate": 0.0002552054417313538,
|
||
|
|
"loss": 0.8205,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32844704067012015,
|
||
|
|
"grad_norm": 0.044248198901321145,
|
||
|
|
"learning_rate": 0.0002547932257442775,
|
||
|
|
"loss": 0.8115,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32954921194753667,
|
||
|
|
"grad_norm": 0.04981089857434275,
|
||
|
|
"learning_rate": 0.00025437945799050674,
|
||
|
|
"loss": 0.8398,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33065138322495313,
|
||
|
|
"grad_norm": 0.04545569085042973,
|
||
|
|
"learning_rate": 0.00025396414459706926,
|
||
|
|
"loss": 0.8086,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33175355450236965,
|
||
|
|
"grad_norm": 0.0428291316212456,
|
||
|
|
"learning_rate": 0.00025354729171388077,
|
||
|
|
"loss": 0.813,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33285572577978617,
|
||
|
|
"grad_norm": 0.04223393291828146,
|
||
|
|
"learning_rate": 0.0002531289055136535,
|
||
|
|
"loss": 0.8322,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3339578970572027,
|
||
|
|
"grad_norm": 0.04755472767341248,
|
||
|
|
"learning_rate": 0.0002527089921918047,
|
||
|
|
"loss": 0.8496,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3350600683346192,
|
||
|
|
"grad_norm": 0.0440763985224576,
|
||
|
|
"learning_rate": 0.00025228755796636524,
|
||
|
|
"loss": 0.8317,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33616223961203573,
|
||
|
|
"grad_norm": 0.04739432394265274,
|
||
|
|
"learning_rate": 0.00025186460907788733,
|
||
|
|
"loss": 0.8291,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33726441088945225,
|
||
|
|
"grad_norm": 0.04365113581228294,
|
||
|
|
"learning_rate": 0.0002514401517893521,
|
||
|
|
"loss": 0.8314,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3383665821668687,
|
||
|
|
"grad_norm": 0.05295299465042367,
|
||
|
|
"learning_rate": 0.0002510141923860769,
|
||
|
|
"loss": 0.8386,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33946875344428523,
|
||
|
|
"grad_norm": 0.04480906078196552,
|
||
|
|
"learning_rate": 0.0002505867371756224,
|
||
|
|
"loss": 0.8087,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34057092472170175,
|
||
|
|
"grad_norm": 0.04166659673582314,
|
||
|
|
"learning_rate": 0.0002501577924876987,
|
||
|
|
"loss": 0.8336,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34167309599911827,
|
||
|
|
"grad_norm": 0.04392236356125392,
|
||
|
|
"learning_rate": 0.0002497273646740723,
|
||
|
|
"loss": 0.8221,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3427752672765348,
|
||
|
|
"grad_norm": 0.037583836271015914,
|
||
|
|
"learning_rate": 0.0002492954601084713,
|
||
|
|
"loss": 0.8347,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3438774385539513,
|
||
|
|
"grad_norm": 0.04245959731615741,
|
||
|
|
"learning_rate": 0.00024886208518649173,
|
||
|
|
"loss": 0.8341,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3449796098313678,
|
||
|
|
"grad_norm": 0.04299442216812149,
|
||
|
|
"learning_rate": 0.00024842724632550216,
|
||
|
|
"loss": 0.8143,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3460817811087843,
|
||
|
|
"grad_norm": 0.044416364158721855,
|
||
|
|
"learning_rate": 0.00024799094996454926,
|
||
|
|
"loss": 0.817,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3471839523862008,
|
||
|
|
"grad_norm": 0.04080370665097008,
|
||
|
|
"learning_rate": 0.0002475532025642621,
|
||
|
|
"loss": 0.8404,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34828612366361733,
|
||
|
|
"grad_norm": 0.043404652747125856,
|
||
|
|
"learning_rate": 0.0002471140106067565,
|
||
|
|
"loss": 0.8056,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34938829494103385,
|
||
|
|
"grad_norm": 0.04702543231191137,
|
||
|
|
"learning_rate": 0.0002466733805955394,
|
||
|
|
"loss": 0.8364,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35049046621845037,
|
||
|
|
"grad_norm": 0.04379810092412971,
|
||
|
|
"learning_rate": 0.000246231319055412,
|
||
|
|
"loss": 0.7982,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35159263749586683,
|
||
|
|
"grad_norm": 0.04861448921339962,
|
||
|
|
"learning_rate": 0.0002457878325323735,
|
||
|
|
"loss": 0.8108,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35269480877328335,
|
||
|
|
"grad_norm": 0.05366919048317618,
|
||
|
|
"learning_rate": 0.00024534292759352414,
|
||
|
|
"loss": 0.8406,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35379698005069987,
|
||
|
|
"grad_norm": 0.04637591531118319,
|
||
|
|
"learning_rate": 0.000244896610826968,
|
||
|
|
"loss": 0.7878,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3548991513281164,
|
||
|
|
"grad_norm": 0.0451952378709979,
|
||
|
|
"learning_rate": 0.00024444888884171505,
|
||
|
|
"loss": 0.8073,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3560013226055329,
|
||
|
|
"grad_norm": 0.04229445731850634,
|
||
|
|
"learning_rate": 0.00024399976826758392,
|
||
|
|
"loss": 0.8133,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35710349388294943,
|
||
|
|
"grad_norm": 0.04303409290387014,
|
||
|
|
"learning_rate": 0.00024354925575510315,
|
||
|
|
"loss": 0.7969,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3582056651603659,
|
||
|
|
"grad_norm": 0.04070575320795186,
|
||
|
|
"learning_rate": 0.00024309735797541318,
|
||
|
|
"loss": 0.8192,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3593078364377824,
|
||
|
|
"grad_norm": 0.04197565777082017,
|
||
|
|
"learning_rate": 0.0002426440816201671,
|
||
|
|
"loss": 0.8239,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36041000771519893,
|
||
|
|
"grad_norm": 0.044014973357833005,
|
||
|
|
"learning_rate": 0.00024218943340143182,
|
||
|
|
"loss": 0.8334,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36151217899261545,
|
||
|
|
"grad_norm": 0.04571887208233434,
|
||
|
|
"learning_rate": 0.00024173342005158894,
|
||
|
|
"loss": 0.8432,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36261435027003197,
|
||
|
|
"grad_norm": 0.04094752957919555,
|
||
|
|
"learning_rate": 0.00024127604832323445,
|
||
|
|
"loss": 0.7932,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3637165215474485,
|
||
|
|
"grad_norm": 0.04013508208109058,
|
||
|
|
"learning_rate": 0.0002408173249890792,
|
||
|
|
"loss": 0.8034,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.364818692824865,
|
||
|
|
"grad_norm": 0.040311829670245755,
|
||
|
|
"learning_rate": 0.00024035725684184845,
|
||
|
|
"loss": 0.7866,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3659208641022815,
|
||
|
|
"grad_norm": 0.041899754058127306,
|
||
|
|
"learning_rate": 0.00023989585069418134,
|
||
|
|
"loss": 0.7872,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.367023035379698,
|
||
|
|
"grad_norm": 0.043805964643653376,
|
||
|
|
"learning_rate": 0.0002394331133785299,
|
||
|
|
"loss": 0.8146,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3681252066571145,
|
||
|
|
"grad_norm": 0.04065431765935861,
|
||
|
|
"learning_rate": 0.000238969051747058,
|
||
|
|
"loss": 0.8394,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36922737793453103,
|
||
|
|
"grad_norm": 0.04291000131859918,
|
||
|
|
"learning_rate": 0.00023850367267153985,
|
||
|
|
"loss": 0.8414,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37032954921194755,
|
||
|
|
"grad_norm": 0.04361025776347538,
|
||
|
|
"learning_rate": 0.00023803698304325824,
|
||
|
|
"loss": 0.7826,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37143172048936407,
|
||
|
|
"grad_norm": 0.04765203149725648,
|
||
|
|
"learning_rate": 0.00023756898977290235,
|
||
|
|
"loss": 0.8113,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37253389176678053,
|
||
|
|
"grad_norm": 0.04008616793311488,
|
||
|
|
"learning_rate": 0.00023709969979046576,
|
||
|
|
"loss": 0.8291,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37363606304419705,
|
||
|
|
"grad_norm": 0.04271593974112689,
|
||
|
|
"learning_rate": 0.00023662912004514345,
|
||
|
|
"loss": 0.82,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3747382343216136,
|
||
|
|
"grad_norm": 0.04110249253005406,
|
||
|
|
"learning_rate": 0.00023615725750522913,
|
||
|
|
"loss": 0.8305,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3758404055990301,
|
||
|
|
"grad_norm": 0.0403637537968849,
|
||
|
|
"learning_rate": 0.00023568411915801205,
|
||
|
|
"loss": 0.8177,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3769425768764466,
|
||
|
|
"grad_norm": 0.044289241987404186,
|
||
|
|
"learning_rate": 0.00023520971200967334,
|
||
|
|
"loss": 0.8215,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37804474815386313,
|
||
|
|
"grad_norm": 0.04129576160875138,
|
||
|
|
"learning_rate": 0.00023473404308518256,
|
||
|
|
"loss": 0.8337,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3791469194312796,
|
||
|
|
"grad_norm": 0.04173364684891999,
|
||
|
|
"learning_rate": 0.00023425711942819333,
|
||
|
|
"loss": 0.8067,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3802490907086961,
|
||
|
|
"grad_norm": 0.04440013453397569,
|
||
|
|
"learning_rate": 0.00023377894810093944,
|
||
|
|
"loss": 0.8396,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38135126198611263,
|
||
|
|
"grad_norm": 0.04364153613366134,
|
||
|
|
"learning_rate": 0.00023329953618412985,
|
||
|
|
"loss": 0.8126,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38245343326352915,
|
||
|
|
"grad_norm": 0.03880573318839816,
|
||
|
|
"learning_rate": 0.0002328188907768441,
|
||
|
|
"loss": 0.7861,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38355560454094567,
|
||
|
|
"grad_norm": 0.044630935282676455,
|
||
|
|
"learning_rate": 0.00023233701899642712,
|
||
|
|
"loss": 0.8041,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3846577758183622,
|
||
|
|
"grad_norm": 0.045036425102858116,
|
||
|
|
"learning_rate": 0.0002318539279783839,
|
||
|
|
"loss": 0.8389,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3857599470957787,
|
||
|
|
"grad_norm": 0.044227375694242045,
|
||
|
|
"learning_rate": 0.0002313696248762737,
|
||
|
|
"loss": 0.8024,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3868621183731952,
|
||
|
|
"grad_norm": 0.04607651991018229,
|
||
|
|
"learning_rate": 0.00023088411686160415,
|
||
|
|
"loss": 0.8087,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3879642896506117,
|
||
|
|
"grad_norm": 0.04430605414125412,
|
||
|
|
"learning_rate": 0.00023039741112372528,
|
||
|
|
"loss": 0.8279,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3890664609280282,
|
||
|
|
"grad_norm": 0.047410669524145625,
|
||
|
|
"learning_rate": 0.00022990951486972258,
|
||
|
|
"loss": 0.8104,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39016863220544473,
|
||
|
|
"grad_norm": 0.041525912031087554,
|
||
|
|
"learning_rate": 0.0002294204353243109,
|
||
|
|
"loss": 0.7937,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39127080348286125,
|
||
|
|
"grad_norm": 0.03803908868674592,
|
||
|
|
"learning_rate": 0.00022893017972972686,
|
||
|
|
"loss": 0.8099,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39237297476027777,
|
||
|
|
"grad_norm": 0.04663481529200365,
|
||
|
|
"learning_rate": 0.00022843875534562204,
|
||
|
|
"loss": 0.7985,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39347514603769423,
|
||
|
|
"grad_norm": 0.04209155232991889,
|
||
|
|
"learning_rate": 0.0002279461694489553,
|
||
|
|
"loss": 0.7984,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39457731731511075,
|
||
|
|
"grad_norm": 0.04273613495319002,
|
||
|
|
"learning_rate": 0.00022745242933388507,
|
||
|
|
"loss": 0.7856,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3956794885925273,
|
||
|
|
"grad_norm": 0.04007855374098208,
|
||
|
|
"learning_rate": 0.00022695754231166125,
|
||
|
|
"loss": 0.798,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3967816598699438,
|
||
|
|
"grad_norm": 0.03874216976594234,
|
||
|
|
"learning_rate": 0.0002264615157105171,
|
||
|
|
"loss": 0.8303,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3978838311473603,
|
||
|
|
"grad_norm": 0.042291279075714026,
|
||
|
|
"learning_rate": 0.00022596435687556067,
|
||
|
|
"loss": 0.8284,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39898600242477683,
|
||
|
|
"grad_norm": 0.04134434249625502,
|
||
|
|
"learning_rate": 0.00022546607316866583,
|
||
|
|
"loss": 0.8143,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4000881737021933,
|
||
|
|
"grad_norm": 0.0426391584789927,
|
||
|
|
"learning_rate": 0.00022496667196836358,
|
||
|
|
"loss": 0.8291,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4011903449796098,
|
||
|
|
"grad_norm": 0.03943254080642002,
|
||
|
|
"learning_rate": 0.0002244661606697326,
|
||
|
|
"loss": 0.8093,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40229251625702633,
|
||
|
|
"grad_norm": 0.04164893383401447,
|
||
|
|
"learning_rate": 0.00022396454668428982,
|
||
|
|
"loss": 0.8135,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40339468753444285,
|
||
|
|
"grad_norm": 0.04110460672662909,
|
||
|
|
"learning_rate": 0.00022346183743988056,
|
||
|
|
"loss": 0.8083,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40449685881185937,
|
||
|
|
"grad_norm": 0.04143399066869764,
|
||
|
|
"learning_rate": 0.00022295804038056867,
|
||
|
|
"loss": 0.798,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4055990300892759,
|
||
|
|
"grad_norm": 0.04010672194926782,
|
||
|
|
"learning_rate": 0.0002224531629665263,
|
||
|
|
"loss": 0.8132,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4067012013666924,
|
||
|
|
"grad_norm": 0.040718372885152405,
|
||
|
|
"learning_rate": 0.00022194721267392324,
|
||
|
|
"loss": 0.8237,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4078033726441089,
|
||
|
|
"grad_norm": 0.044019144470429636,
|
||
|
|
"learning_rate": 0.0002214401969948164,
|
||
|
|
"loss": 0.7955,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4089055439215254,
|
||
|
|
"grad_norm": 0.0449280577770788,
|
||
|
|
"learning_rate": 0.00022093212343703893,
|
||
|
|
"loss": 0.7929,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4100077151989419,
|
||
|
|
"grad_norm": 0.050365477985191316,
|
||
|
|
"learning_rate": 0.00022042299952408872,
|
||
|
|
"loss": 0.8389,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41110988647635843,
|
||
|
|
"grad_norm": 0.04032290135169436,
|
||
|
|
"learning_rate": 0.00021991283279501744,
|
||
|
|
"loss": 0.796,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41221205775377495,
|
||
|
|
"grad_norm": 0.03859944961488867,
|
||
|
|
"learning_rate": 0.0002194016308043185,
|
||
|
|
"loss": 0.7977,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41331422903119147,
|
||
|
|
"grad_norm": 0.04159446854663543,
|
||
|
|
"learning_rate": 0.00021888940112181542,
|
||
|
|
"loss": 0.826,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41441640030860794,
|
||
|
|
"grad_norm": 0.04359969871410925,
|
||
|
|
"learning_rate": 0.0002183761513325496,
|
||
|
|
"loss": 0.8251,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41551857158602445,
|
||
|
|
"grad_norm": 0.04343781370305786,
|
||
|
|
"learning_rate": 0.0002178618890366682,
|
||
|
|
"loss": 0.7984,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.416620742863441,
|
||
|
|
"grad_norm": 0.04412546972897982,
|
||
|
|
"learning_rate": 0.00021734662184931137,
|
||
|
|
"loss": 0.8275,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4177229141408575,
|
||
|
|
"grad_norm": 0.046901376458696624,
|
||
|
|
"learning_rate": 0.00021683035740049952,
|
||
|
|
"loss": 0.8286,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.418825085418274,
|
||
|
|
"grad_norm": 0.04296065580749498,
|
||
|
|
"learning_rate": 0.00021631310333502062,
|
||
|
|
"loss": 0.8245,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41992725669569053,
|
||
|
|
"grad_norm": 0.042120916788582694,
|
||
|
|
"learning_rate": 0.00021579486731231653,
|
||
|
|
"loss": 0.7803,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.421029427973107,
|
||
|
|
"grad_norm": 0.040883347305042415,
|
||
|
|
"learning_rate": 0.00021527565700637003,
|
||
|
|
"loss": 0.8347,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4221315992505235,
|
||
|
|
"grad_norm": 0.04111643166609028,
|
||
|
|
"learning_rate": 0.0002147554801055908,
|
||
|
|
"loss": 0.7808,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42323377052794003,
|
||
|
|
"grad_norm": 0.042728251907987964,
|
||
|
|
"learning_rate": 0.0002142343443127018,
|
||
|
|
"loss": 0.8306,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42433594180535655,
|
||
|
|
"grad_norm": 0.042082803091619436,
|
||
|
|
"learning_rate": 0.0002137122573446254,
|
||
|
|
"loss": 0.8057,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4254381130827731,
|
||
|
|
"grad_norm": 0.04346598671784763,
|
||
|
|
"learning_rate": 0.00021318922693236845,
|
||
|
|
"loss": 0.812,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4265402843601896,
|
||
|
|
"grad_norm": 0.04311043189883461,
|
||
|
|
"learning_rate": 0.00021266526082090858,
|
||
|
|
"loss": 0.7732,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4276424556376061,
|
||
|
|
"grad_norm": 0.03914987981413652,
|
||
|
|
"learning_rate": 0.00021214036676907888,
|
||
|
|
"loss": 0.7875,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4287446269150226,
|
||
|
|
"grad_norm": 0.03762591331444608,
|
||
|
|
"learning_rate": 0.00021161455254945354,
|
||
|
|
"loss": 0.8256,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4298467981924391,
|
||
|
|
"grad_norm": 0.04123350564815786,
|
||
|
|
"learning_rate": 0.00021108782594823227,
|
||
|
|
"loss": 0.8177,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4309489694698556,
|
||
|
|
"grad_norm": 0.0396041073736449,
|
||
|
|
"learning_rate": 0.00021056019476512532,
|
||
|
|
"loss": 0.8145,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43205114074727213,
|
||
|
|
"grad_norm": 0.0380641273178196,
|
||
|
|
"learning_rate": 0.00021003166681323794,
|
||
|
|
"loss": 0.7952,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43315331202468865,
|
||
|
|
"grad_norm": 0.038313042170990505,
|
||
|
|
"learning_rate": 0.00020950224991895456,
|
||
|
|
"loss": 0.7872,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43425548330210517,
|
||
|
|
"grad_norm": 0.04604786797820384,
|
||
|
|
"learning_rate": 0.00020897195192182299,
|
||
|
|
"loss": 0.8094,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43535765457952164,
|
||
|
|
"grad_norm": 0.04044192743630967,
|
||
|
|
"learning_rate": 0.00020844078067443835,
|
||
|
|
"loss": 0.8141,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43645982585693816,
|
||
|
|
"grad_norm": 0.04177226121543752,
|
||
|
|
"learning_rate": 0.00020790874404232667,
|
||
|
|
"loss": 0.8181,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4375619971343547,
|
||
|
|
"grad_norm": 0.04439891759072682,
|
||
|
|
"learning_rate": 0.00020737584990382862,
|
||
|
|
"loss": 0.7925,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4386641684117712,
|
||
|
|
"grad_norm": 0.04270949977662141,
|
||
|
|
"learning_rate": 0.0002068421061499826,
|
||
|
|
"loss": 0.7786,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4397663396891877,
|
||
|
|
"grad_norm": 0.040991624121934515,
|
||
|
|
"learning_rate": 0.0002063075206844082,
|
||
|
|
"loss": 0.8308,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44086851096660423,
|
||
|
|
"grad_norm": 0.041275862470859626,
|
||
|
|
"learning_rate": 0.00020577210142318876,
|
||
|
|
"loss": 0.8342,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4419706822440207,
|
||
|
|
"grad_norm": 0.040817642657457055,
|
||
|
|
"learning_rate": 0.00020523585629475457,
|
||
|
|
"loss": 0.8274,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4430728535214372,
|
||
|
|
"grad_norm": 0.04417509610771959,
|
||
|
|
"learning_rate": 0.00020469879323976517,
|
||
|
|
"loss": 0.8176,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44417502479885373,
|
||
|
|
"grad_norm": 0.04251521072112325,
|
||
|
|
"learning_rate": 0.00020416092021099193,
|
||
|
|
"loss": 0.8049,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44527719607627025,
|
||
|
|
"grad_norm": 0.042166821439132904,
|
||
|
|
"learning_rate": 0.00020362224517320014,
|
||
|
|
"loss": 0.8014,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4463793673536868,
|
||
|
|
"grad_norm": 0.041857744422797494,
|
||
|
|
"learning_rate": 0.0002030827761030312,
|
||
|
|
"loss": 0.7916,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4474815386311033,
|
||
|
|
"grad_norm": 0.0411849481904813,
|
||
|
|
"learning_rate": 0.00020254252098888447,
|
||
|
|
"loss": 0.7706,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44858370990851976,
|
||
|
|
"grad_norm": 0.04092873755686162,
|
||
|
|
"learning_rate": 0.00020200148783079892,
|
||
|
|
"loss": 0.7896,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4496858811859363,
|
||
|
|
"grad_norm": 0.04336026242745058,
|
||
|
|
"learning_rate": 0.0002014596846403348,
|
||
|
|
"loss": 0.7672,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4507880524633528,
|
||
|
|
"grad_norm": 0.03910209178598118,
|
||
|
|
"learning_rate": 0.0002009171194404548,
|
||
|
|
"loss": 0.7752,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4518902237407693,
|
||
|
|
"grad_norm": 0.04605371401305054,
|
||
|
|
"learning_rate": 0.00020037380026540543,
|
||
|
|
"loss": 0.8172,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45299239501818583,
|
||
|
|
"grad_norm": 0.04099809926171493,
|
||
|
|
"learning_rate": 0.000199829735160598,
|
||
|
|
"loss": 0.7939,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45409456629560235,
|
||
|
|
"grad_norm": 0.04031696273456619,
|
||
|
|
"learning_rate": 0.0001992849321824894,
|
||
|
|
"loss": 0.7852,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4551967375730189,
|
||
|
|
"grad_norm": 0.038355940898486575,
|
||
|
|
"learning_rate": 0.0001987393993984629,
|
||
|
|
"loss": 0.7772,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45629890885043534,
|
||
|
|
"grad_norm": 0.04164706481380678,
|
||
|
|
"learning_rate": 0.00019819314488670866,
|
||
|
|
"loss": 0.8031,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45740108012785186,
|
||
|
|
"grad_norm": 0.042502765015544626,
|
||
|
|
"learning_rate": 0.00019764617673610413,
|
||
|
|
"loss": 0.8199,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4585032514052684,
|
||
|
|
"grad_norm": 0.0394129908093004,
|
||
|
|
"learning_rate": 0.0001970985030460942,
|
||
|
|
"loss": 0.7861,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4596054226826849,
|
||
|
|
"grad_norm": 0.03849198049749736,
|
||
|
|
"learning_rate": 0.00019655013192657135,
|
||
|
|
"loss": 0.79,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4607075939601014,
|
||
|
|
"grad_norm": 0.044138018952375395,
|
||
|
|
"learning_rate": 0.0001960010714977555,
|
||
|
|
"loss": 0.7813,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46180976523751793,
|
||
|
|
"grad_norm": 0.04225359100682865,
|
||
|
|
"learning_rate": 0.00019545132989007375,
|
||
|
|
"loss": 0.7865,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4629119365149344,
|
||
|
|
"grad_norm": 0.041786418031829003,
|
||
|
|
"learning_rate": 0.00019490091524404016,
|
||
|
|
"loss": 0.7911,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4640141077923509,
|
||
|
|
"grad_norm": 0.04528452410421116,
|
||
|
|
"learning_rate": 0.00019434983571013485,
|
||
|
|
"loss": 0.7932,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46511627906976744,
|
||
|
|
"grad_norm": 0.03898208334364371,
|
||
|
|
"learning_rate": 0.00019379809944868376,
|
||
|
|
"loss": 0.8061,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46621845034718395,
|
||
|
|
"grad_norm": 0.039265674569634,
|
||
|
|
"learning_rate": 0.00019324571462973737,
|
||
|
|
"loss": 0.7707,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4673206216246005,
|
||
|
|
"grad_norm": 0.040002387303416556,
|
||
|
|
"learning_rate": 0.00019269268943295013,
|
||
|
|
"loss": 0.7777,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.468422792902017,
|
||
|
|
"grad_norm": 0.04208726677362012,
|
||
|
|
"learning_rate": 0.00019213903204745895,
|
||
|
|
"loss": 0.7979,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46952496417943346,
|
||
|
|
"grad_norm": 0.03816406171273399,
|
||
|
|
"learning_rate": 0.0001915847506717622,
|
||
|
|
"loss": 0.806,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47062713545685,
|
||
|
|
"grad_norm": 0.04211319505603739,
|
||
|
|
"learning_rate": 0.00019102985351359832,
|
||
|
|
"loss": 0.7887,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4717293067342665,
|
||
|
|
"grad_norm": 0.041883426079474324,
|
||
|
|
"learning_rate": 0.00019047434878982403,
|
||
|
|
"loss": 0.7814,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.472831478011683,
|
||
|
|
"grad_norm": 0.037960810503627374,
|
||
|
|
"learning_rate": 0.00018991824472629293,
|
||
|
|
"loss": 0.7698,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47393364928909953,
|
||
|
|
"grad_norm": 0.03832451540546507,
|
||
|
|
"learning_rate": 0.0001893615495577335,
|
||
|
|
"loss": 0.7953,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47503582056651605,
|
||
|
|
"grad_norm": 0.0399683526421958,
|
||
|
|
"learning_rate": 0.0001888042715276273,
|
||
|
|
"loss": 0.7875,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4761379918439326,
|
||
|
|
"grad_norm": 0.03995007716254075,
|
||
|
|
"learning_rate": 0.00018824641888808683,
|
||
|
|
"loss": 0.7958,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47724016312134904,
|
||
|
|
"grad_norm": 0.04082501729363916,
|
||
|
|
"learning_rate": 0.0001876879998997333,
|
||
|
|
"loss": 0.8004,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47834233439876556,
|
||
|
|
"grad_norm": 0.03753976072129883,
|
||
|
|
"learning_rate": 0.00018712902283157438,
|
||
|
|
"loss": 0.7862,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4794445056761821,
|
||
|
|
"grad_norm": 0.039247708791779135,
|
||
|
|
"learning_rate": 0.00018656949596088177,
|
||
|
|
"loss": 0.7846,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4805466769535986,
|
||
|
|
"grad_norm": 0.040194874528480064,
|
||
|
|
"learning_rate": 0.00018600942757306853,
|
||
|
|
"loss": 0.7948,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4816488482310151,
|
||
|
|
"grad_norm": 0.042684703351327506,
|
||
|
|
"learning_rate": 0.00018544882596156643,
|
||
|
|
"loss": 0.8328,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48275101950843163,
|
||
|
|
"grad_norm": 0.04256364237578435,
|
||
|
|
"learning_rate": 0.0001848876994277032,
|
||
|
|
"loss": 0.8036,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4838531907858481,
|
||
|
|
"grad_norm": 0.04084201277383364,
|
||
|
|
"learning_rate": 0.0001843260562805796,
|
||
|
|
"loss": 0.7838,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4849553620632646,
|
||
|
|
"grad_norm": 0.03951302995836591,
|
||
|
|
"learning_rate": 0.0001837639048369462,
|
||
|
|
"loss": 0.7729,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48605753334068114,
|
||
|
|
"grad_norm": 0.0440116634757643,
|
||
|
|
"learning_rate": 0.00018320125342108058,
|
||
|
|
"loss": 0.8097,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48715970461809766,
|
||
|
|
"grad_norm": 0.03880553291311738,
|
||
|
|
"learning_rate": 0.0001826381103646636,
|
||
|
|
"loss": 0.7858,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4882618758955142,
|
||
|
|
"grad_norm": 0.04347096499790543,
|
||
|
|
"learning_rate": 0.00018207448400665656,
|
||
|
|
"loss": 0.7931,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4893640471729307,
|
||
|
|
"grad_norm": 0.04137171191530775,
|
||
|
|
"learning_rate": 0.0001815103826931772,
|
||
|
|
"loss": 0.7904,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49046621845034716,
|
||
|
|
"grad_norm": 0.03924451193006248,
|
||
|
|
"learning_rate": 0.00018094581477737652,
|
||
|
|
"loss": 0.7892,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4915683897277637,
|
||
|
|
"grad_norm": 0.035326553011083374,
|
||
|
|
"learning_rate": 0.00018038078861931482,
|
||
|
|
"loss": 0.7699,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4926705610051802,
|
||
|
|
"grad_norm": 0.037295368892913544,
|
||
|
|
"learning_rate": 0.00017981531258583794,
|
||
|
|
"loss": 0.7688,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4937727322825967,
|
||
|
|
"grad_norm": 0.04102492750892726,
|
||
|
|
"learning_rate": 0.00017924939505045364,
|
||
|
|
"loss": 0.7959,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49487490356001324,
|
||
|
|
"grad_norm": 0.038911715276106644,
|
||
|
|
"learning_rate": 0.0001786830443932071,
|
||
|
|
"loss": 0.8129,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49597707483742975,
|
||
|
|
"grad_norm": 0.0401123359070641,
|
||
|
|
"learning_rate": 0.00017811626900055748,
|
||
|
|
"loss": 0.8031,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4970792461148463,
|
||
|
|
"grad_norm": 0.044018814366690515,
|
||
|
|
"learning_rate": 0.00017754907726525302,
|
||
|
|
"loss": 0.7963,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49818141739226274,
|
||
|
|
"grad_norm": 0.03902942910566316,
|
||
|
|
"learning_rate": 0.00017698147758620736,
|
||
|
|
"loss": 0.7607,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49928358866967926,
|
||
|
|
"grad_norm": 0.04968656280514321,
|
||
|
|
"learning_rate": 0.0001764134783683748,
|
||
|
|
"loss": 0.8039,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5003857599470958,
|
||
|
|
"grad_norm": 0.04093320446137599,
|
||
|
|
"learning_rate": 0.00017584508802262602,
|
||
|
|
"loss": 0.8126,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5014879312245123,
|
||
|
|
"grad_norm": 0.04617686206504717,
|
||
|
|
"learning_rate": 0.00017527631496562352,
|
||
|
|
"loss": 0.8063,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5025901025019288,
|
||
|
|
"grad_norm": 0.038529497279601546,
|
||
|
|
"learning_rate": 0.0001747071676196968,
|
||
|
|
"loss": 0.7816,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5036922737793453,
|
||
|
|
"grad_norm": 0.03888023182945524,
|
||
|
|
"learning_rate": 0.000174137654412718,
|
||
|
|
"loss": 0.8077,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5047944450567619,
|
||
|
|
"grad_norm": 0.04001737897478531,
|
||
|
|
"learning_rate": 0.00017356778377797664,
|
||
|
|
"loss": 0.8262,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5058966163341784,
|
||
|
|
"grad_norm": 0.039323331641178884,
|
||
|
|
"learning_rate": 0.00017299756415405524,
|
||
|
|
"loss": 0.795,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5069987876115949,
|
||
|
|
"grad_norm": 0.04467088321827032,
|
||
|
|
"learning_rate": 0.00017242700398470393,
|
||
|
|
"loss": 0.7939,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5081009588890113,
|
||
|
|
"grad_norm": 0.043810507147911466,
|
||
|
|
"learning_rate": 0.00017185611171871573,
|
||
|
|
"loss": 0.7669,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5092031301664278,
|
||
|
|
"grad_norm": 0.040939929874561506,
|
||
|
|
"learning_rate": 0.0001712848958098012,
|
||
|
|
"loss": 0.8017,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5103053014438443,
|
||
|
|
"grad_norm": 0.03898184972185426,
|
||
|
|
"learning_rate": 0.00017071336471646348,
|
||
|
|
"loss": 0.8045,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5114074727212609,
|
||
|
|
"grad_norm": 0.038962684331486475,
|
||
|
|
"learning_rate": 0.0001701415269018728,
|
||
|
|
"loss": 0.8071,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5125096439986774,
|
||
|
|
"grad_norm": 0.04491474953288176,
|
||
|
|
"learning_rate": 0.0001695693908337414,
|
||
|
|
"loss": 0.7909,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5136118152760939,
|
||
|
|
"grad_norm": 0.04437277653745269,
|
||
|
|
"learning_rate": 0.00016899696498419794,
|
||
|
|
"loss": 0.7973,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5147139865535104,
|
||
|
|
"grad_norm": 0.04158284754618555,
|
||
|
|
"learning_rate": 0.00016842425782966224,
|
||
|
|
"loss": 0.7778,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5158161578309269,
|
||
|
|
"grad_norm": 0.04116323767907288,
|
||
|
|
"learning_rate": 0.00016785127785071949,
|
||
|
|
"loss": 0.8043,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5169183291083435,
|
||
|
|
"grad_norm": 0.034684359802969134,
|
||
|
|
"learning_rate": 0.000167278033531995,
|
||
|
|
"loss": 0.79,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.51802050038576,
|
||
|
|
"grad_norm": 0.04229406791148304,
|
||
|
|
"learning_rate": 0.0001667045333620283,
|
||
|
|
"loss": 0.7795,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5191226716631765,
|
||
|
|
"grad_norm": 0.03764426814798344,
|
||
|
|
"learning_rate": 0.00016613078583314756,
|
||
|
|
"loss": 0.7781,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.520224842940593,
|
||
|
|
"grad_norm": 0.03973158416131846,
|
||
|
|
"learning_rate": 0.00016555679944134382,
|
||
|
|
"loss": 0.7873,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5213270142180095,
|
||
|
|
"grad_norm": 0.039706342704437673,
|
||
|
|
"learning_rate": 0.00016498258268614514,
|
||
|
|
"loss": 0.761,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5224291854954259,
|
||
|
|
"grad_norm": 0.04246187330898323,
|
||
|
|
"learning_rate": 0.00016440814407049092,
|
||
|
|
"loss": 0.7904,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5235313567728425,
|
||
|
|
"grad_norm": 0.037315547695955284,
|
||
|
|
"learning_rate": 0.00016383349210060555,
|
||
|
|
"loss": 0.7916,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.524633528050259,
|
||
|
|
"grad_norm": 0.03680043140841983,
|
||
|
|
"learning_rate": 0.000163258635285873,
|
||
|
|
"loss": 0.7839,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5257356993276755,
|
||
|
|
"grad_norm": 0.04240483271155321,
|
||
|
|
"learning_rate": 0.00016268358213871058,
|
||
|
|
"loss": 0.7717,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.526837870605092,
|
||
|
|
"grad_norm": 0.038360421064488774,
|
||
|
|
"learning_rate": 0.0001621083411744427,
|
||
|
|
"loss": 0.8082,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5279400418825085,
|
||
|
|
"grad_norm": 0.03923682854690225,
|
||
|
|
"learning_rate": 0.00016153292091117505,
|
||
|
|
"loss": 0.7675,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5290422131599251,
|
||
|
|
"grad_norm": 0.04266472133785625,
|
||
|
|
"learning_rate": 0.00016095732986966824,
|
||
|
|
"loss": 0.7826,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5301443844373416,
|
||
|
|
"grad_norm": 0.04032119205943179,
|
||
|
|
"learning_rate": 0.00016038157657321202,
|
||
|
|
"loss": 0.7694,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5312465557147581,
|
||
|
|
"grad_norm": 0.038493359106615645,
|
||
|
|
"learning_rate": 0.0001598056695474984,
|
||
|
|
"loss": 0.7851,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5323487269921746,
|
||
|
|
"grad_norm": 0.04788764713041358,
|
||
|
|
"learning_rate": 0.00015922961732049617,
|
||
|
|
"loss": 0.8041,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5334508982695911,
|
||
|
|
"grad_norm": 0.03867840390035932,
|
||
|
|
"learning_rate": 0.000158653428422324,
|
||
|
|
"loss": 0.763,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5345530695470077,
|
||
|
|
"grad_norm": 0.03926183742629788,
|
||
|
|
"learning_rate": 0.00015807711138512458,
|
||
|
|
"loss": 0.774,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5356552408244241,
|
||
|
|
"grad_norm": 0.04275754859474233,
|
||
|
|
"learning_rate": 0.00015750067474293774,
|
||
|
|
"loss": 0.8008,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5367574121018406,
|
||
|
|
"grad_norm": 0.041396745192273696,
|
||
|
|
"learning_rate": 0.00015692412703157478,
|
||
|
|
"loss": 0.7899,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5378595833792571,
|
||
|
|
"grad_norm": 0.03771777455755809,
|
||
|
|
"learning_rate": 0.00015634747678849146,
|
||
|
|
"loss": 0.7662,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5389617546566736,
|
||
|
|
"grad_norm": 0.037105475754751184,
|
||
|
|
"learning_rate": 0.00015577073255266185,
|
||
|
|
"loss": 0.7963,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5400639259340901,
|
||
|
|
"grad_norm": 0.042466734201556076,
|
||
|
|
"learning_rate": 0.00015519390286445201,
|
||
|
|
"loss": 0.7795,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5411660972115067,
|
||
|
|
"grad_norm": 0.03800624488466557,
|
||
|
|
"learning_rate": 0.00015461699626549314,
|
||
|
|
"loss": 0.7789,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5422682684889232,
|
||
|
|
"grad_norm": 0.03799071186124082,
|
||
|
|
"learning_rate": 0.00015404002129855557,
|
||
|
|
"loss": 0.7621,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5433704397663397,
|
||
|
|
"grad_norm": 0.04246315994111129,
|
||
|
|
"learning_rate": 0.00015346298650742177,
|
||
|
|
"loss": 0.7898,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5444726110437562,
|
||
|
|
"grad_norm": 0.0383853594452228,
|
||
|
|
"learning_rate": 0.00015288590043676027,
|
||
|
|
"loss": 0.7838,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5455747823211727,
|
||
|
|
"grad_norm": 0.037342426062281935,
|
||
|
|
"learning_rate": 0.00015230877163199878,
|
||
|
|
"loss": 0.7746,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5466769535985893,
|
||
|
|
"grad_norm": 0.03967766879530587,
|
||
|
|
"learning_rate": 0.000151731608639198,
|
||
|
|
"loss": 0.7807,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5477791248760058,
|
||
|
|
"grad_norm": 0.038046687905520335,
|
||
|
|
"learning_rate": 0.0001511544200049247,
|
||
|
|
"loss": 0.7624,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5488812961534223,
|
||
|
|
"grad_norm": 0.038282722756821576,
|
||
|
|
"learning_rate": 0.00015057721427612548,
|
||
|
|
"loss": 0.7781,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5499834674308387,
|
||
|
|
"grad_norm": 0.04204297605214361,
|
||
|
|
"learning_rate": 0.00015,
|
||
|
|
"loss": 0.7889,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5510856387082552,
|
||
|
|
"grad_norm": 0.04253941444925998,
|
||
|
|
"learning_rate": 0.00014942278572387452,
|
||
|
|
"loss": 0.7874,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5521878099856717,
|
||
|
|
"grad_norm": 0.04099337109892425,
|
||
|
|
"learning_rate": 0.00014884557999507528,
|
||
|
|
"loss": 0.7932,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5532899812630883,
|
||
|
|
"grad_norm": 0.043225237652168194,
|
||
|
|
"learning_rate": 0.00014826839136080204,
|
||
|
|
"loss": 0.8035,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5543921525405048,
|
||
|
|
"grad_norm": 0.04237211794633771,
|
||
|
|
"learning_rate": 0.00014769122836800122,
|
||
|
|
"loss": 0.782,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5554943238179213,
|
||
|
|
"grad_norm": 0.0390643188084349,
|
||
|
|
"learning_rate": 0.00014711409956323976,
|
||
|
|
"loss": 0.8021,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5565964950953378,
|
||
|
|
"grad_norm": 0.038912412857210685,
|
||
|
|
"learning_rate": 0.00014653701349257823,
|
||
|
|
"loss": 0.7713,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5576986663727543,
|
||
|
|
"grad_norm": 0.04021618253944335,
|
||
|
|
"learning_rate": 0.00014595997870144443,
|
||
|
|
"loss": 0.7711,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5588008376501709,
|
||
|
|
"grad_norm": 0.04054714580080947,
|
||
|
|
"learning_rate": 0.00014538300373450683,
|
||
|
|
"loss": 0.7959,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5599030089275874,
|
||
|
|
"grad_norm": 0.0378078133538945,
|
||
|
|
"learning_rate": 0.00014480609713554796,
|
||
|
|
"loss": 0.7533,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5610051802050039,
|
||
|
|
"grad_norm": 0.03566763348348747,
|
||
|
|
"learning_rate": 0.0001442292674473381,
|
||
|
|
"loss": 0.7842,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5621073514824204,
|
||
|
|
"grad_norm": 0.04162087831710151,
|
||
|
|
"learning_rate": 0.0001436525232115086,
|
||
|
|
"loss": 0.7765,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5632095227598369,
|
||
|
|
"grad_norm": 0.039192296915950345,
|
||
|
|
"learning_rate": 0.00014307587296842524,
|
||
|
|
"loss": 0.7761,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5643116940372533,
|
||
|
|
"grad_norm": 0.040771917142651264,
|
||
|
|
"learning_rate": 0.00014249932525706223,
|
||
|
|
"loss": 0.7637,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5654138653146699,
|
||
|
|
"grad_norm": 0.0404557988061511,
|
||
|
|
"learning_rate": 0.00014192288861487545,
|
||
|
|
"loss": 0.7809,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5665160365920864,
|
||
|
|
"grad_norm": 0.0380287543914902,
|
||
|
|
"learning_rate": 0.00014134657157767593,
|
||
|
|
"loss": 0.7744,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5676182078695029,
|
||
|
|
"grad_norm": 0.037545959627603626,
|
||
|
|
"learning_rate": 0.00014077038267950383,
|
||
|
|
"loss": 0.7705,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5687203791469194,
|
||
|
|
"grad_norm": 0.04121660123612755,
|
||
|
|
"learning_rate": 0.00014019433045250158,
|
||
|
|
"loss": 0.7969,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5698225504243359,
|
||
|
|
"grad_norm": 0.03543171857215221,
|
||
|
|
"learning_rate": 0.00013961842342678798,
|
||
|
|
"loss": 0.7706,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5709247217017525,
|
||
|
|
"grad_norm": 0.03970445149727504,
|
||
|
|
"learning_rate": 0.0001390426701303317,
|
||
|
|
"loss": 0.7813,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.572026892979169,
|
||
|
|
"grad_norm": 0.04263582518492721,
|
||
|
|
"learning_rate": 0.00013846707908882498,
|
||
|
|
"loss": 0.8044,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5731290642565855,
|
||
|
|
"grad_norm": 0.03919599782168947,
|
||
|
|
"learning_rate": 0.0001378916588255573,
|
||
|
|
"loss": 0.7709,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.574231235534002,
|
||
|
|
"grad_norm": 0.04085513710063447,
|
||
|
|
"learning_rate": 0.0001373164178612894,
|
||
|
|
"loss": 0.7916,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5753334068114185,
|
||
|
|
"grad_norm": 0.03947664604661718,
|
||
|
|
"learning_rate": 0.0001367413647141269,
|
||
|
|
"loss": 0.7829,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.576435578088835,
|
||
|
|
"grad_norm": 0.03819848642986916,
|
||
|
|
"learning_rate": 0.00013616650789939443,
|
||
|
|
"loss": 0.7736,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5775377493662515,
|
||
|
|
"grad_norm": 0.03928261920892333,
|
||
|
|
"learning_rate": 0.0001355918559295091,
|
||
|
|
"loss": 0.7934,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.578639920643668,
|
||
|
|
"grad_norm": 0.04465492341767027,
|
||
|
|
"learning_rate": 0.00013501741731385483,
|
||
|
|
"loss": 0.7872,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5797420919210845,
|
||
|
|
"grad_norm": 0.04518050772542813,
|
||
|
|
"learning_rate": 0.00013444320055865618,
|
||
|
|
"loss": 0.7978,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.580844263198501,
|
||
|
|
"grad_norm": 0.03823568510951906,
|
||
|
|
"learning_rate": 0.00013386921416685239,
|
||
|
|
"loss": 0.8026,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5819464344759175,
|
||
|
|
"grad_norm": 0.03860337235855855,
|
||
|
|
"learning_rate": 0.0001332954666379717,
|
||
|
|
"loss": 0.7819,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5830486057533341,
|
||
|
|
"grad_norm": 0.040848904672585555,
|
||
|
|
"learning_rate": 0.00013272196646800497,
|
||
|
|
"loss": 0.7718,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5841507770307506,
|
||
|
|
"grad_norm": 0.039991424568808075,
|
||
|
|
"learning_rate": 0.0001321487221492805,
|
||
|
|
"loss": 0.7737,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5852529483081671,
|
||
|
|
"grad_norm": 0.04406998907502384,
|
||
|
|
"learning_rate": 0.00013157574217033773,
|
||
|
|
"loss": 0.7804,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5863551195855836,
|
||
|
|
"grad_norm": 0.042736167667461564,
|
||
|
|
"learning_rate": 0.00013100303501580206,
|
||
|
|
"loss": 0.7864,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5874572908630001,
|
||
|
|
"grad_norm": 0.039658625402537326,
|
||
|
|
"learning_rate": 0.0001304306091662586,
|
||
|
|
"loss": 0.7879,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5885594621404167,
|
||
|
|
"grad_norm": 0.03732667476656254,
|
||
|
|
"learning_rate": 0.0001298584730981272,
|
||
|
|
"loss": 0.7958,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5896616334178332,
|
||
|
|
"grad_norm": 0.0385663925190591,
|
||
|
|
"learning_rate": 0.00012928663528353652,
|
||
|
|
"loss": 0.7532,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5907638046952497,
|
||
|
|
"grad_norm": 0.039100737225537294,
|
||
|
|
"learning_rate": 0.00012871510419019876,
|
||
|
|
"loss": 0.8146,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5918659759726661,
|
||
|
|
"grad_norm": 0.04322733978868932,
|
||
|
|
"learning_rate": 0.0001281438882812843,
|
||
|
|
"loss": 0.7844,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5929681472500826,
|
||
|
|
"grad_norm": 0.03855540614705994,
|
||
|
|
"learning_rate": 0.00012757299601529604,
|
||
|
|
"loss": 0.7444,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5940703185274991,
|
||
|
|
"grad_norm": 0.03976869418403505,
|
||
|
|
"learning_rate": 0.00012700243584594479,
|
||
|
|
"loss": 0.7706,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5951724898049157,
|
||
|
|
"grad_norm": 0.038039179428976305,
|
||
|
|
"learning_rate": 0.00012643221622202336,
|
||
|
|
"loss": 0.7497,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5962746610823322,
|
||
|
|
"grad_norm": 0.03951382972148692,
|
||
|
|
"learning_rate": 0.00012586234558728207,
|
||
|
|
"loss": 0.7571,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5973768323597487,
|
||
|
|
"grad_norm": 0.04180256816699712,
|
||
|
|
"learning_rate": 0.0001252928323803032,
|
||
|
|
"loss": 0.7538,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5984790036371652,
|
||
|
|
"grad_norm": 0.037309356744730904,
|
||
|
|
"learning_rate": 0.00012472368503437648,
|
||
|
|
"loss": 0.7924,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5995811749145817,
|
||
|
|
"grad_norm": 0.04216451657557382,
|
||
|
|
"learning_rate": 0.00012415491197737395,
|
||
|
|
"loss": 0.7816,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6006833461919983,
|
||
|
|
"grad_norm": 0.039059339374739786,
|
||
|
|
"learning_rate": 0.00012358652163162523,
|
||
|
|
"loss": 0.7394,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6017855174694148,
|
||
|
|
"grad_norm": 0.03934478577799933,
|
||
|
|
"learning_rate": 0.00012301852241379267,
|
||
|
|
"loss": 0.7903,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6028876887468313,
|
||
|
|
"grad_norm": 0.03677529471356991,
|
||
|
|
"learning_rate": 0.00012245092273474695,
|
||
|
|
"loss": 0.7688,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6039898600242478,
|
||
|
|
"grad_norm": 0.04379989553642847,
|
||
|
|
"learning_rate": 0.00012188373099944252,
|
||
|
|
"loss": 0.7791,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6050920313016642,
|
||
|
|
"grad_norm": 0.03853225124689197,
|
||
|
|
"learning_rate": 0.00012131695560679285,
|
||
|
|
"loss": 0.7842,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6061942025790807,
|
||
|
|
"grad_norm": 0.04243715932465795,
|
||
|
|
"learning_rate": 0.0001207506049495464,
|
||
|
|
"loss": 0.7633,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6072963738564973,
|
||
|
|
"grad_norm": 0.042350895868304664,
|
||
|
|
"learning_rate": 0.00012018468741416206,
|
||
|
|
"loss": 0.7992,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6083985451339138,
|
||
|
|
"grad_norm": 0.042511366861454146,
|
||
|
|
"learning_rate": 0.00011961921138068517,
|
||
|
|
"loss": 0.7628,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6095007164113303,
|
||
|
|
"grad_norm": 0.03945967376445848,
|
||
|
|
"learning_rate": 0.00011905418522262343,
|
||
|
|
"loss": 0.7798,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6106028876887468,
|
||
|
|
"grad_norm": 0.04134467702346202,
|
||
|
|
"learning_rate": 0.00011848961730682276,
|
||
|
|
"loss": 0.7736,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6117050589661633,
|
||
|
|
"grad_norm": 0.04321593442131728,
|
||
|
|
"learning_rate": 0.00011792551599334342,
|
||
|
|
"loss": 0.7729,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6128072302435799,
|
||
|
|
"grad_norm": 0.03753481531786513,
|
||
|
|
"learning_rate": 0.00011736188963533636,
|
||
|
|
"loss": 0.7868,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6139094015209964,
|
||
|
|
"grad_norm": 0.046677240052044086,
|
||
|
|
"learning_rate": 0.0001167987465789194,
|
||
|
|
"loss": 0.788,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6150115727984129,
|
||
|
|
"grad_norm": 0.04004956558986007,
|
||
|
|
"learning_rate": 0.00011623609516305375,
|
||
|
|
"loss": 0.7669,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6161137440758294,
|
||
|
|
"grad_norm": 0.03877661195736084,
|
||
|
|
"learning_rate": 0.0001156739437194204,
|
||
|
|
"loss": 0.7403,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6172159153532459,
|
||
|
|
"grad_norm": 0.037935876666486346,
|
||
|
|
"learning_rate": 0.00011511230057229678,
|
||
|
|
"loss": 0.7373,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6183180866306625,
|
||
|
|
"grad_norm": 0.038133591224345446,
|
||
|
|
"learning_rate": 0.00011455117403843358,
|
||
|
|
"loss": 0.7626,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6194202579080789,
|
||
|
|
"grad_norm": 0.040488281712004456,
|
||
|
|
"learning_rate": 0.00011399057242693143,
|
||
|
|
"loss": 0.7748,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6205224291854954,
|
||
|
|
"grad_norm": 0.045588727331748555,
|
||
|
|
"learning_rate": 0.00011343050403911823,
|
||
|
|
"loss": 0.7566,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6216246004629119,
|
||
|
|
"grad_norm": 0.04322927430800435,
|
||
|
|
"learning_rate": 0.0001128709771684256,
|
||
|
|
"loss": 0.7405,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6227267717403284,
|
||
|
|
"grad_norm": 0.03941622010511477,
|
||
|
|
"learning_rate": 0.00011231200010026668,
|
||
|
|
"loss": 0.7699,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6238289430177449,
|
||
|
|
"grad_norm": 0.040341244924510265,
|
||
|
|
"learning_rate": 0.00011175358111191316,
|
||
|
|
"loss": 0.7546,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6249311142951615,
|
||
|
|
"grad_norm": 0.0370716397793875,
|
||
|
|
"learning_rate": 0.00011119572847237272,
|
||
|
|
"loss": 0.7524,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.626033285572578,
|
||
|
|
"grad_norm": 0.04088692753580051,
|
||
|
|
"learning_rate": 0.00011063845044226649,
|
||
|
|
"loss": 0.7737,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6271354568499945,
|
||
|
|
"grad_norm": 0.04149954061127777,
|
||
|
|
"learning_rate": 0.00011008175527370708,
|
||
|
|
"loss": 0.7635,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.628237628127411,
|
||
|
|
"grad_norm": 0.04067030099074359,
|
||
|
|
"learning_rate": 0.00010952565121017595,
|
||
|
|
"loss": 0.7781,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6293397994048275,
|
||
|
|
"grad_norm": 0.037995610342879724,
|
||
|
|
"learning_rate": 0.00010897014648640164,
|
||
|
|
"loss": 0.7536,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6304419706822441,
|
||
|
|
"grad_norm": 0.038638564159584964,
|
||
|
|
"learning_rate": 0.0001084152493282378,
|
||
|
|
"loss": 0.7582,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6315441419596606,
|
||
|
|
"grad_norm": 0.03865577853003443,
|
||
|
|
"learning_rate": 0.00010786096795254105,
|
||
|
|
"loss": 0.743,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6326463132370771,
|
||
|
|
"grad_norm": 0.037742667105078676,
|
||
|
|
"learning_rate": 0.00010730731056704987,
|
||
|
|
"loss": 0.7738,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6337484845144935,
|
||
|
|
"grad_norm": 0.04087500905979875,
|
||
|
|
"learning_rate": 0.0001067542853702626,
|
||
|
|
"loss": 0.7454,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.63485065579191,
|
||
|
|
"grad_norm": 0.03681788140604627,
|
||
|
|
"learning_rate": 0.00010620190055131628,
|
||
|
|
"loss": 0.7513,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6359528270693265,
|
||
|
|
"grad_norm": 0.039530646965778786,
|
||
|
|
"learning_rate": 0.00010565016428986515,
|
||
|
|
"loss": 0.7863,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6370549983467431,
|
||
|
|
"grad_norm": 0.03637232048859645,
|
||
|
|
"learning_rate": 0.00010509908475595984,
|
||
|
|
"loss": 0.7871,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6381571696241596,
|
||
|
|
"grad_norm": 0.037532035474368244,
|
||
|
|
"learning_rate": 0.0001045486701099262,
|
||
|
|
"loss": 0.7868,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6392593409015761,
|
||
|
|
"grad_norm": 0.04139117865057032,
|
||
|
|
"learning_rate": 0.0001039989285022445,
|
||
|
|
"loss": 0.7544,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6403615121789926,
|
||
|
|
"grad_norm": 0.04037843881677972,
|
||
|
|
"learning_rate": 0.00010344986807342866,
|
||
|
|
"loss": 0.7612,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6414636834564091,
|
||
|
|
"grad_norm": 0.039565719386194985,
|
||
|
|
"learning_rate": 0.00010290149695390581,
|
||
|
|
"loss": 0.7616,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6425658547338257,
|
||
|
|
"grad_norm": 0.038310169105771584,
|
||
|
|
"learning_rate": 0.00010235382326389586,
|
||
|
|
"loss": 0.7576,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6436680260112422,
|
||
|
|
"grad_norm": 0.03936468865097429,
|
||
|
|
"learning_rate": 0.00010180685511329131,
|
||
|
|
"loss": 0.7702,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6447701972886587,
|
||
|
|
"grad_norm": 0.0398256871335756,
|
||
|
|
"learning_rate": 0.00010126060060153713,
|
||
|
|
"loss": 0.7822,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6458723685660752,
|
||
|
|
"grad_norm": 0.0413034275637383,
|
||
|
|
"learning_rate": 0.00010071506781751063,
|
||
|
|
"loss": 0.7542,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6469745398434916,
|
||
|
|
"grad_norm": 0.03871254029791003,
|
||
|
|
"learning_rate": 0.000100170264839402,
|
||
|
|
"loss": 0.7335,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6480767111209081,
|
||
|
|
"grad_norm": 0.036091901012243334,
|
||
|
|
"learning_rate": 9.962619973459453e-05,
|
||
|
|
"loss": 0.7748,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6491788823983247,
|
||
|
|
"grad_norm": 0.04138951063703848,
|
||
|
|
"learning_rate": 9.90828805595452e-05,
|
||
|
|
"loss": 0.7524,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6502810536757412,
|
||
|
|
"grad_norm": 0.03858848182433528,
|
||
|
|
"learning_rate": 9.854031535966521e-05,
|
||
|
|
"loss": 0.7517,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6513832249531577,
|
||
|
|
"grad_norm": 0.03602604340055366,
|
||
|
|
"learning_rate": 9.799851216920107e-05,
|
||
|
|
"loss": 0.7337,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6524853962305742,
|
||
|
|
"grad_norm": 0.03579532104123597,
|
||
|
|
"learning_rate": 9.745747901111552e-05,
|
||
|
|
"loss": 0.7623,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6535875675079907,
|
||
|
|
"grad_norm": 0.0381744125557122,
|
||
|
|
"learning_rate": 9.691722389696879e-05,
|
||
|
|
"loss": 0.7683,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6546897387854073,
|
||
|
|
"grad_norm": 0.03608442329560764,
|
||
|
|
"learning_rate": 9.637775482679988e-05,
|
||
|
|
"loss": 0.7732,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6557919100628238,
|
||
|
|
"grad_norm": 0.03784439256503652,
|
||
|
|
"learning_rate": 9.583907978900807e-05,
|
||
|
|
"loss": 0.7739,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6568940813402403,
|
||
|
|
"grad_norm": 0.03945771278463694,
|
||
|
|
"learning_rate": 9.530120676023482e-05,
|
||
|
|
"loss": 0.7442,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6579962526176568,
|
||
|
|
"grad_norm": 0.03667458666034089,
|
||
|
|
"learning_rate": 9.476414370524538e-05,
|
||
|
|
"loss": 0.7456,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6590984238950733,
|
||
|
|
"grad_norm": 0.03940939789339881,
|
||
|
|
"learning_rate": 9.422789857681124e-05,
|
||
|
|
"loss": 0.7438,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6602005951724899,
|
||
|
|
"grad_norm": 0.03803960926078932,
|
||
|
|
"learning_rate": 9.36924793155918e-05,
|
||
|
|
"loss": 0.778,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6613027664499063,
|
||
|
|
"grad_norm": 0.035231525831095054,
|
||
|
|
"learning_rate": 9.315789385001738e-05,
|
||
|
|
"loss": 0.7647,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6624049377273228,
|
||
|
|
"grad_norm": 0.03745699399316068,
|
||
|
|
"learning_rate": 9.262415009617139e-05,
|
||
|
|
"loss": 0.7684,
|
||
|
|
"step": 3005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6635071090047393,
|
||
|
|
"grad_norm": 0.03586306355233504,
|
||
|
|
"learning_rate": 9.209125595767336e-05,
|
||
|
|
"loss": 0.7458,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6646092802821558,
|
||
|
|
"grad_norm": 0.036839355886467764,
|
||
|
|
"learning_rate": 9.15592193255617e-05,
|
||
|
|
"loss": 0.7706,
|
||
|
|
"step": 3015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6657114515595723,
|
||
|
|
"grad_norm": 0.03906285720782755,
|
||
|
|
"learning_rate": 9.102804807817699e-05,
|
||
|
|
"loss": 0.781,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6668136228369889,
|
||
|
|
"grad_norm": 0.03869457663028768,
|
||
|
|
"learning_rate": 9.049775008104542e-05,
|
||
|
|
"loss": 0.7486,
|
||
|
|
"step": 3025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6679157941144054,
|
||
|
|
"grad_norm": 0.0393845820755994,
|
||
|
|
"learning_rate": 8.996833318676204e-05,
|
||
|
|
"loss": 0.7473,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6690179653918219,
|
||
|
|
"grad_norm": 0.03941498428248022,
|
||
|
|
"learning_rate": 8.943980523487469e-05,
|
||
|
|
"loss": 0.7846,
|
||
|
|
"step": 3035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6701201366692384,
|
||
|
|
"grad_norm": 0.0420251439088419,
|
||
|
|
"learning_rate": 8.891217405176774e-05,
|
||
|
|
"loss": 0.7608,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6712223079466549,
|
||
|
|
"grad_norm": 0.035649123267955884,
|
||
|
|
"learning_rate": 8.838544745054645e-05,
|
||
|
|
"loss": 0.7719,
|
||
|
|
"step": 3045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6723244792240715,
|
||
|
|
"grad_norm": 0.03877525844159031,
|
||
|
|
"learning_rate": 8.785963323092108e-05,
|
||
|
|
"loss": 0.7582,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.673426650501488,
|
||
|
|
"grad_norm": 0.037091463531239946,
|
||
|
|
"learning_rate": 8.733473917909144e-05,
|
||
|
|
"loss": 0.7411,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6745288217789045,
|
||
|
|
"grad_norm": 0.036271594287721816,
|
||
|
|
"learning_rate": 8.68107730676315e-05,
|
||
|
|
"loss": 0.7849,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6756309930563209,
|
||
|
|
"grad_norm": 0.03541699632121048,
|
||
|
|
"learning_rate": 8.628774265537462e-05,
|
||
|
|
"loss": 0.7514,
|
||
|
|
"step": 3065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6767331643337374,
|
||
|
|
"grad_norm": 0.0362990364448018,
|
||
|
|
"learning_rate": 8.576565568729813e-05,
|
||
|
|
"loss": 0.7474,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6778353356111539,
|
||
|
|
"grad_norm": 0.03921802869726997,
|
||
|
|
"learning_rate": 8.524451989440918e-05,
|
||
|
|
"loss": 0.754,
|
||
|
|
"step": 3075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6789375068885705,
|
||
|
|
"grad_norm": 0.037337016378149755,
|
||
|
|
"learning_rate": 8.472434299362998e-05,
|
||
|
|
"loss": 0.75,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.680039678165987,
|
||
|
|
"grad_norm": 0.03891038958554527,
|
||
|
|
"learning_rate": 8.420513268768347e-05,
|
||
|
|
"loss": 0.7859,
|
||
|
|
"step": 3085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6811418494434035,
|
||
|
|
"grad_norm": 0.035894066909538044,
|
||
|
|
"learning_rate": 8.368689666497938e-05,
|
||
|
|
"loss": 0.7329,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.68224402072082,
|
||
|
|
"grad_norm": 0.039979769321902066,
|
||
|
|
"learning_rate": 8.31696425995004e-05,
|
||
|
|
"loss": 0.7503,
|
||
|
|
"step": 3095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6833461919982365,
|
||
|
|
"grad_norm": 0.03936168369937806,
|
||
|
|
"learning_rate": 8.26533781506887e-05,
|
||
|
|
"loss": 0.7726,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6844483632756531,
|
||
|
|
"grad_norm": 0.04019560215837221,
|
||
|
|
"learning_rate": 8.21381109633318e-05,
|
||
|
|
"loss": 0.7432,
|
||
|
|
"step": 3105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6855505345530696,
|
||
|
|
"grad_norm": 0.039269909994462844,
|
||
|
|
"learning_rate": 8.162384866745036e-05,
|
||
|
|
"loss": 0.7538,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6866527058304861,
|
||
|
|
"grad_norm": 0.03904037612780349,
|
||
|
|
"learning_rate": 8.111059887818459e-05,
|
||
|
|
"loss": 0.744,
|
||
|
|
"step": 3115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6877548771079026,
|
||
|
|
"grad_norm": 0.03885534367589714,
|
||
|
|
"learning_rate": 8.059836919568152e-05,
|
||
|
|
"loss": 0.7328,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.688857048385319,
|
||
|
|
"grad_norm": 0.03858452994442185,
|
||
|
|
"learning_rate": 8.008716720498253e-05,
|
||
|
|
"loss": 0.7701,
|
||
|
|
"step": 3125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6899592196627355,
|
||
|
|
"grad_norm": 0.03519383464662896,
|
||
|
|
"learning_rate": 7.957700047591121e-05,
|
||
|
|
"loss": 0.7451,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6910613909401521,
|
||
|
|
"grad_norm": 0.0380259068615794,
|
||
|
|
"learning_rate": 7.906787656296107e-05,
|
||
|
|
"loss": 0.7556,
|
||
|
|
"step": 3135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6921635622175686,
|
||
|
|
"grad_norm": 0.038944765250869484,
|
||
|
|
"learning_rate": 7.855980300518354e-05,
|
||
|
|
"loss": 0.7389,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6932657334949851,
|
||
|
|
"grad_norm": 0.04470561682300718,
|
||
|
|
"learning_rate": 7.805278732607678e-05,
|
||
|
|
"loss": 0.7568,
|
||
|
|
"step": 3145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6943679047724016,
|
||
|
|
"grad_norm": 0.04297680477477516,
|
||
|
|
"learning_rate": 7.754683703347372e-05,
|
||
|
|
"loss": 0.7626,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6954700760498181,
|
||
|
|
"grad_norm": 0.034697565387832634,
|
||
|
|
"learning_rate": 7.704195961943129e-05,
|
||
|
|
"loss": 0.7721,
|
||
|
|
"step": 3155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6965722473272347,
|
||
|
|
"grad_norm": 0.04072535239631796,
|
||
|
|
"learning_rate": 7.653816256011941e-05,
|
||
|
|
"loss": 0.7757,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6976744186046512,
|
||
|
|
"grad_norm": 0.035271346752025576,
|
||
|
|
"learning_rate": 7.603545331571018e-05,
|
||
|
|
"loss": 0.7629,
|
||
|
|
"step": 3165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6987765898820677,
|
||
|
|
"grad_norm": 0.037357663884374157,
|
||
|
|
"learning_rate": 7.553383933026741e-05,
|
||
|
|
"loss": 0.7549,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6998787611594842,
|
||
|
|
"grad_norm": 0.03637211308219045,
|
||
|
|
"learning_rate": 7.503332803163641e-05,
|
||
|
|
"loss": 0.7529,
|
||
|
|
"step": 3175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7009809324369007,
|
||
|
|
"grad_norm": 0.039519222558093065,
|
||
|
|
"learning_rate": 7.453392683133415e-05,
|
||
|
|
"loss": 0.7879,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7020831037143173,
|
||
|
|
"grad_norm": 0.03323807896911395,
|
||
|
|
"learning_rate": 7.403564312443932e-05,
|
||
|
|
"loss": 0.7189,
|
||
|
|
"step": 3185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7031852749917337,
|
||
|
|
"grad_norm": 0.03896304992954701,
|
||
|
|
"learning_rate": 7.353848428948288e-05,
|
||
|
|
"loss": 0.732,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7042874462691502,
|
||
|
|
"grad_norm": 0.03533752667617695,
|
||
|
|
"learning_rate": 7.304245768833872e-05,
|
||
|
|
"loss": 0.7499,
|
||
|
|
"step": 3195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7053896175465667,
|
||
|
|
"grad_norm": 0.04289239640414403,
|
||
|
|
"learning_rate": 7.25475706661149e-05,
|
||
|
|
"loss": 0.7518,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7064917888239832,
|
||
|
|
"grad_norm": 0.036700606939269846,
|
||
|
|
"learning_rate": 7.20538305510447e-05,
|
||
|
|
"loss": 0.7444,
|
||
|
|
"step": 3205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7075939601013997,
|
||
|
|
"grad_norm": 0.039301039880959406,
|
||
|
|
"learning_rate": 7.156124465437799e-05,
|
||
|
|
"loss": 0.7647,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7086961313788163,
|
||
|
|
"grad_norm": 0.039132429979466775,
|
||
|
|
"learning_rate": 7.106982027027314e-05,
|
||
|
|
"loss": 0.7464,
|
||
|
|
"step": 3215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7097983026562328,
|
||
|
|
"grad_norm": 0.03668166024260441,
|
||
|
|
"learning_rate": 7.057956467568913e-05,
|
||
|
|
"loss": 0.768,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7109004739336493,
|
||
|
|
"grad_norm": 0.03763349214718496,
|
||
|
|
"learning_rate": 7.009048513027738e-05,
|
||
|
|
"loss": 0.7627,
|
||
|
|
"step": 3225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7120026452110658,
|
||
|
|
"grad_norm": 0.03910255645252377,
|
||
|
|
"learning_rate": 6.960258887627474e-05,
|
||
|
|
"loss": 0.7393,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7131048164884823,
|
||
|
|
"grad_norm": 0.03847225495364757,
|
||
|
|
"learning_rate": 6.911588313839579e-05,
|
||
|
|
"loss": 0.758,
|
||
|
|
"step": 3235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7142069877658989,
|
||
|
|
"grad_norm": 0.038410315616110316,
|
||
|
|
"learning_rate": 6.86303751237263e-05,
|
||
|
|
"loss": 0.7385,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7153091590433154,
|
||
|
|
"grad_norm": 0.038761774380026405,
|
||
|
|
"learning_rate": 6.814607202161606e-05,
|
||
|
|
"loss": 0.7382,
|
||
|
|
"step": 3245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7164113303207318,
|
||
|
|
"grad_norm": 0.03723197930881741,
|
||
|
|
"learning_rate": 6.766298100357281e-05,
|
||
|
|
"loss": 0.7359,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7175135015981483,
|
||
|
|
"grad_norm": 0.040413685922780995,
|
||
|
|
"learning_rate": 6.718110922315593e-05,
|
||
|
|
"loss": 0.7342,
|
||
|
|
"step": 3255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7186156728755648,
|
||
|
|
"grad_norm": 0.03978308906273803,
|
||
|
|
"learning_rate": 6.670046381587016e-05,
|
||
|
|
"loss": 0.7645,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7197178441529813,
|
||
|
|
"grad_norm": 0.03897559054869522,
|
||
|
|
"learning_rate": 6.622105189906052e-05,
|
||
|
|
"loss": 0.7455,
|
||
|
|
"step": 3265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7208200154303979,
|
||
|
|
"grad_norm": 0.03815718519490893,
|
||
|
|
"learning_rate": 6.574288057180663e-05,
|
||
|
|
"loss": 0.7615,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7219221867078144,
|
||
|
|
"grad_norm": 0.03856332628344952,
|
||
|
|
"learning_rate": 6.526595691481746e-05,
|
||
|
|
"loss": 0.7599,
|
||
|
|
"step": 3275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7230243579852309,
|
||
|
|
"grad_norm": 0.037326383736852486,
|
||
|
|
"learning_rate": 6.479028799032664e-05,
|
||
|
|
"loss": 0.7727,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7241265292626474,
|
||
|
|
"grad_norm": 0.03759649917895476,
|
||
|
|
"learning_rate": 6.431588084198791e-05,
|
||
|
|
"loss": 0.733,
|
||
|
|
"step": 3285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7252287005400639,
|
||
|
|
"grad_norm": 0.0351188278300472,
|
||
|
|
"learning_rate": 6.384274249477086e-05,
|
||
|
|
"loss": 0.7603,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7263308718174805,
|
||
|
|
"grad_norm": 0.0387440053943191,
|
||
|
|
"learning_rate": 6.337087995485658e-05,
|
||
|
|
"loss": 0.7401,
|
||
|
|
"step": 3295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.727433043094897,
|
||
|
|
"grad_norm": 0.03680120173341686,
|
||
|
|
"learning_rate": 6.290030020953423e-05,
|
||
|
|
"loss": 0.7811,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7285352143723135,
|
||
|
|
"grad_norm": 0.037694080907078036,
|
||
|
|
"learning_rate": 6.243101022709761e-05,
|
||
|
|
"loss": 0.7279,
|
||
|
|
"step": 3305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.72963738564973,
|
||
|
|
"grad_norm": 0.04067783323042442,
|
||
|
|
"learning_rate": 6.196301695674176e-05,
|
||
|
|
"loss": 0.7827,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7307395569271464,
|
||
|
|
"grad_norm": 0.038537648221183,
|
||
|
|
"learning_rate": 6.14963273284601e-05,
|
||
|
|
"loss": 0.7586,
|
||
|
|
"step": 3315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.731841728204563,
|
||
|
|
"grad_norm": 0.038447173395684923,
|
||
|
|
"learning_rate": 6.1030948252941985e-05,
|
||
|
|
"loss": 0.7599,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7329438994819795,
|
||
|
|
"grad_norm": 0.037364125258692316,
|
||
|
|
"learning_rate": 6.056688662147012e-05,
|
||
|
|
"loss": 0.7546,
|
||
|
|
"step": 3325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.734046070759396,
|
||
|
|
"grad_norm": 0.03886268408011641,
|
||
|
|
"learning_rate": 6.010414930581866e-05,
|
||
|
|
"loss": 0.7451,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7351482420368125,
|
||
|
|
"grad_norm": 0.03769869242431956,
|
||
|
|
"learning_rate": 5.96427431581515e-05,
|
||
|
|
"loss": 0.768,
|
||
|
|
"step": 3335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.736250413314229,
|
||
|
|
"grad_norm": 0.037020275271875513,
|
||
|
|
"learning_rate": 5.918267501092078e-05,
|
||
|
|
"loss": 0.7392,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7373525845916455,
|
||
|
|
"grad_norm": 0.03729781683672499,
|
||
|
|
"learning_rate": 5.872395167676555e-05,
|
||
|
|
"loss": 0.7541,
|
||
|
|
"step": 3345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7384547558690621,
|
||
|
|
"grad_norm": 0.03769698709111463,
|
||
|
|
"learning_rate": 5.826657994841104e-05,
|
||
|
|
"loss": 0.7464,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7395569271464786,
|
||
|
|
"grad_norm": 0.035749490646957455,
|
||
|
|
"learning_rate": 5.78105665985681e-05,
|
||
|
|
"loss": 0.783,
|
||
|
|
"step": 3355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7406590984238951,
|
||
|
|
"grad_norm": 0.03815712387980432,
|
||
|
|
"learning_rate": 5.7355918379832925e-05,
|
||
|
|
"loss": 0.7415,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7417612697013116,
|
||
|
|
"grad_norm": 0.03756149653556473,
|
||
|
|
"learning_rate": 5.690264202458685e-05,
|
||
|
|
"loss": 0.7754,
|
||
|
|
"step": 3365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7428634409787281,
|
||
|
|
"grad_norm": 0.03958578609528177,
|
||
|
|
"learning_rate": 5.64507442448968e-05,
|
||
|
|
"loss": 0.7835,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7439656122561447,
|
||
|
|
"grad_norm": 0.038064835951232556,
|
||
|
|
"learning_rate": 5.6000231732416045e-05,
|
||
|
|
"loss": 0.7938,
|
||
|
|
"step": 3375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7450677835335611,
|
||
|
|
"grad_norm": 0.0371943932393074,
|
||
|
|
"learning_rate": 5.555111115828492e-05,
|
||
|
|
"loss": 0.7406,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7461699548109776,
|
||
|
|
"grad_norm": 0.03776336387841464,
|
||
|
|
"learning_rate": 5.510338917303204e-05,
|
||
|
|
"loss": 0.7459,
|
||
|
|
"step": 3385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7472721260883941,
|
||
|
|
"grad_norm": 0.04142505105864296,
|
||
|
|
"learning_rate": 5.4657072406475816e-05,
|
||
|
|
"loss": 0.7419,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7483742973658106,
|
||
|
|
"grad_norm": 0.03664875530168412,
|
||
|
|
"learning_rate": 5.421216746762651e-05,
|
||
|
|
"loss": 0.7701,
|
||
|
|
"step": 3395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7494764686432271,
|
||
|
|
"grad_norm": 0.03963080933205579,
|
||
|
|
"learning_rate": 5.3768680944588006e-05,
|
||
|
|
"loss": 0.7449,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7505786399206437,
|
||
|
|
"grad_norm": 0.03875301352382599,
|
||
|
|
"learning_rate": 5.3326619404460594e-05,
|
||
|
|
"loss": 0.7512,
|
||
|
|
"step": 3405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7516808111980602,
|
||
|
|
"grad_norm": 0.03812435605779621,
|
||
|
|
"learning_rate": 5.2885989393243446e-05,
|
||
|
|
"loss": 0.7524,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7527829824754767,
|
||
|
|
"grad_norm": 0.036201552008782494,
|
||
|
|
"learning_rate": 5.244679743573793e-05,
|
||
|
|
"loss": 0.7313,
|
||
|
|
"step": 3415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7538851537528932,
|
||
|
|
"grad_norm": 0.03508489489495532,
|
||
|
|
"learning_rate": 5.200905003545072e-05,
|
||
|
|
"loss": 0.7143,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7549873250303097,
|
||
|
|
"grad_norm": 0.03873819352052367,
|
||
|
|
"learning_rate": 5.1572753674497784e-05,
|
||
|
|
"loss": 0.7262,
|
||
|
|
"step": 3425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7560894963077263,
|
||
|
|
"grad_norm": 0.03878661096421005,
|
||
|
|
"learning_rate": 5.11379148135083e-05,
|
||
|
|
"loss": 0.7388,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7571916675851428,
|
||
|
|
"grad_norm": 0.03624350442263521,
|
||
|
|
"learning_rate": 5.070453989152865e-05,
|
||
|
|
"loss": 0.7516,
|
||
|
|
"step": 3435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7582938388625592,
|
||
|
|
"grad_norm": 0.03462784231516503,
|
||
|
|
"learning_rate": 5.0272635325927666e-05,
|
||
|
|
"loss": 0.735,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7593960101399757,
|
||
|
|
"grad_norm": 0.03546841031831082,
|
||
|
|
"learning_rate": 4.9842207512301255e-05,
|
||
|
|
"loss": 0.7688,
|
||
|
|
"step": 3445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7604981814173922,
|
||
|
|
"grad_norm": 0.03765881036086525,
|
||
|
|
"learning_rate": 4.941326282437765e-05,
|
||
|
|
"loss": 0.7584,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7616003526948087,
|
||
|
|
"grad_norm": 0.04070540653962422,
|
||
|
|
"learning_rate": 4.8985807613923084e-05,
|
||
|
|
"loss": 0.7658,
|
||
|
|
"step": 3455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7627025239722253,
|
||
|
|
"grad_norm": 0.041025307893189714,
|
||
|
|
"learning_rate": 4.855984821064789e-05,
|
||
|
|
"loss": 0.753,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7638046952496418,
|
||
|
|
"grad_norm": 0.03747182722869465,
|
||
|
|
"learning_rate": 4.8135390922112687e-05,
|
||
|
|
"loss": 0.7481,
|
||
|
|
"step": 3465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7649068665270583,
|
||
|
|
"grad_norm": 0.03475376097595749,
|
||
|
|
"learning_rate": 4.771244203363478e-05,
|
||
|
|
"loss": 0.7322,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7660090378044748,
|
||
|
|
"grad_norm": 0.03620242697594977,
|
||
|
|
"learning_rate": 4.72910078081953e-05,
|
||
|
|
"loss": 0.7289,
|
||
|
|
"step": 3475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7671112090818913,
|
||
|
|
"grad_norm": 0.039201952070474604,
|
||
|
|
"learning_rate": 4.687109448634647e-05,
|
||
|
|
"loss": 0.7663,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7682133803593079,
|
||
|
|
"grad_norm": 0.038508731501384584,
|
||
|
|
"learning_rate": 4.6452708286119176e-05,
|
||
|
|
"loss": 0.7554,
|
||
|
|
"step": 3485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7693155516367244,
|
||
|
|
"grad_norm": 0.03899698694328063,
|
||
|
|
"learning_rate": 4.603585540293071e-05,
|
||
|
|
"loss": 0.7736,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7704177229141409,
|
||
|
|
"grad_norm": 0.0368565333958254,
|
||
|
|
"learning_rate": 4.5620542009493304e-05,
|
||
|
|
"loss": 0.7516,
|
||
|
|
"step": 3495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7715198941915574,
|
||
|
|
"grad_norm": 0.035388497953352936,
|
||
|
|
"learning_rate": 4.5206774255722504e-05,
|
||
|
|
"loss": 0.7484,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7726220654689738,
|
||
|
|
"grad_norm": 0.03538316494242759,
|
||
|
|
"learning_rate": 4.4794558268646194e-05,
|
||
|
|
"loss": 0.7581,
|
||
|
|
"step": 3505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7737242367463903,
|
||
|
|
"grad_norm": 0.037362884464824934,
|
||
|
|
"learning_rate": 4.4383900152313926e-05,
|
||
|
|
"loss": 0.7459,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7748264080238069,
|
||
|
|
"grad_norm": 0.036038446641414534,
|
||
|
|
"learning_rate": 4.397480598770652e-05,
|
||
|
|
"loss": 0.7606,
|
||
|
|
"step": 3515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7759285793012234,
|
||
|
|
"grad_norm": 0.0402761096628342,
|
||
|
|
"learning_rate": 4.3567281832645815e-05,
|
||
|
|
"loss": 0.7813,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7770307505786399,
|
||
|
|
"grad_norm": 0.03506614642106647,
|
||
|
|
"learning_rate": 4.3161333721705146e-05,
|
||
|
|
"loss": 0.7303,
|
||
|
|
"step": 3525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7781329218560564,
|
||
|
|
"grad_norm": 0.03710237528152325,
|
||
|
|
"learning_rate": 4.275696766612007e-05,
|
||
|
|
"loss": 0.7658,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.779235093133473,
|
||
|
|
"grad_norm": 0.039207961974188736,
|
||
|
|
"learning_rate": 4.2354189653699234e-05,
|
||
|
|
"loss": 0.7686,
|
||
|
|
"step": 3535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7803372644108895,
|
||
|
|
"grad_norm": 0.0400226786429818,
|
||
|
|
"learning_rate": 4.1953005648735606e-05,
|
||
|
|
"loss": 0.7365,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.781439435688306,
|
||
|
|
"grad_norm": 0.038069210231566904,
|
||
|
|
"learning_rate": 4.1553421591918264e-05,
|
||
|
|
"loss": 0.7612,
|
||
|
|
"step": 3545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7825416069657225,
|
||
|
|
"grad_norm": 0.036731650972072025,
|
||
|
|
"learning_rate": 4.115544340024456e-05,
|
||
|
|
"loss": 0.7276,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.783643778243139,
|
||
|
|
"grad_norm": 0.03761683304943094,
|
||
|
|
"learning_rate": 4.075907696693224e-05,
|
||
|
|
"loss": 0.7397,
|
||
|
|
"step": 3555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7847459495205555,
|
||
|
|
"grad_norm": 0.039130062081128986,
|
||
|
|
"learning_rate": 4.036432816133241e-05,
|
||
|
|
"loss": 0.7412,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.785848120797972,
|
||
|
|
"grad_norm": 0.03725082169003722,
|
||
|
|
"learning_rate": 3.99712028288424e-05,
|
||
|
|
"loss": 0.7378,
|
||
|
|
"step": 3565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7869502920753885,
|
||
|
|
"grad_norm": 0.03581598167403878,
|
||
|
|
"learning_rate": 3.957970679081948e-05,
|
||
|
|
"loss": 0.7377,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.788052463352805,
|
||
|
|
"grad_norm": 0.036766846530443355,
|
||
|
|
"learning_rate": 3.918984584449435e-05,
|
||
|
|
"loss": 0.7606,
|
||
|
|
"step": 3575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7891546346302215,
|
||
|
|
"grad_norm": 0.03708226420234272,
|
||
|
|
"learning_rate": 3.880162576288557e-05,
|
||
|
|
"loss": 0.763,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.790256805907638,
|
||
|
|
"grad_norm": 0.035646641087147025,
|
||
|
|
"learning_rate": 3.841505229471386e-05,
|
||
|
|
"loss": 0.7472,
|
||
|
|
"step": 3585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7913589771850545,
|
||
|
|
"grad_norm": 0.03623704020179618,
|
||
|
|
"learning_rate": 3.803013116431716e-05,
|
||
|
|
"loss": 0.7371,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7924611484624711,
|
||
|
|
"grad_norm": 0.03685938607213482,
|
||
|
|
"learning_rate": 3.764686807156565e-05,
|
||
|
|
"loss": 0.7636,
|
||
|
|
"step": 3595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7935633197398876,
|
||
|
|
"grad_norm": 0.03581401378985991,
|
||
|
|
"learning_rate": 3.72652686917776e-05,
|
||
|
|
"loss": 0.7436,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7946654910173041,
|
||
|
|
"grad_norm": 0.03576091311918202,
|
||
|
|
"learning_rate": 3.6885338675635215e-05,
|
||
|
|
"loss": 0.741,
|
||
|
|
"step": 3605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7957676622947206,
|
||
|
|
"grad_norm": 0.03938541436587627,
|
||
|
|
"learning_rate": 3.65070836491007e-05,
|
||
|
|
"loss": 0.7511,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7968698335721371,
|
||
|
|
"grad_norm": 0.03831985268675037,
|
||
|
|
"learning_rate": 3.613050921333345e-05,
|
||
|
|
"loss": 0.7581,
|
||
|
|
"step": 3615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7979720048495537,
|
||
|
|
"grad_norm": 0.036183263282557804,
|
||
|
|
"learning_rate": 3.575562094460682e-05,
|
||
|
|
"loss": 0.7519,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7990741761269702,
|
||
|
|
"grad_norm": 0.039441336486759127,
|
||
|
|
"learning_rate": 3.5382424394225506e-05,
|
||
|
|
"loss": 0.7566,
|
||
|
|
"step": 3625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8001763474043866,
|
||
|
|
"grad_norm": 0.03791319055471918,
|
||
|
|
"learning_rate": 3.501092508844339e-05,
|
||
|
|
"loss": 0.7483,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8012785186818031,
|
||
|
|
"grad_norm": 0.034917608244421146,
|
||
|
|
"learning_rate": 3.464112852838184e-05,
|
||
|
|
"loss": 0.7434,
|
||
|
|
"step": 3635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8023806899592196,
|
||
|
|
"grad_norm": 0.03606780306247915,
|
||
|
|
"learning_rate": 3.427304018994821e-05,
|
||
|
|
"loss": 0.7478,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8034828612366361,
|
||
|
|
"grad_norm": 0.03680798923538717,
|
||
|
|
"learning_rate": 3.3906665523754504e-05,
|
||
|
|
"loss": 0.7496,
|
||
|
|
"step": 3645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8045850325140527,
|
||
|
|
"grad_norm": 0.03865353271265747,
|
||
|
|
"learning_rate": 3.354200995503692e-05,
|
||
|
|
"loss": 0.7397,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8056872037914692,
|
||
|
|
"grad_norm": 0.03635669491983459,
|
||
|
|
"learning_rate": 3.3179078883575536e-05,
|
||
|
|
"loss": 0.7718,
|
||
|
|
"step": 3655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8067893750688857,
|
||
|
|
"grad_norm": 0.0363260735871896,
|
||
|
|
"learning_rate": 3.2817877683614244e-05,
|
||
|
|
"loss": 0.7209,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8078915463463022,
|
||
|
|
"grad_norm": 0.03739024777521627,
|
||
|
|
"learning_rate": 3.245841170378106e-05,
|
||
|
|
"loss": 0.7276,
|
||
|
|
"step": 3665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8089937176237187,
|
||
|
|
"grad_norm": 0.04056944004711678,
|
||
|
|
"learning_rate": 3.21006862670092e-05,
|
||
|
|
"loss": 0.7427,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8100958889011353,
|
||
|
|
"grad_norm": 0.038893468288220996,
|
||
|
|
"learning_rate": 3.174470667045801e-05,
|
||
|
|
"loss": 0.7337,
|
||
|
|
"step": 3675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8111980601785518,
|
||
|
|
"grad_norm": 0.037502075980572286,
|
||
|
|
"learning_rate": 3.139047818543462e-05,
|
||
|
|
"loss": 0.7536,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8123002314559683,
|
||
|
|
"grad_norm": 0.03650106907146565,
|
||
|
|
"learning_rate": 3.103800605731598e-05,
|
||
|
|
"loss": 0.7533,
|
||
|
|
"step": 3685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8134024027333848,
|
||
|
|
"grad_norm": 0.03819525064272321,
|
||
|
|
"learning_rate": 3.068729550547105e-05,
|
||
|
|
"loss": 0.7681,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8145045740108012,
|
||
|
|
"grad_norm": 0.03711022658424072,
|
||
|
|
"learning_rate": 3.033835172318355e-05,
|
||
|
|
"loss": 0.7449,
|
||
|
|
"step": 3695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8156067452882177,
|
||
|
|
"grad_norm": 0.03418347940782499,
|
||
|
|
"learning_rate": 2.9991179877575032e-05,
|
||
|
|
"loss": 0.7393,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8167089165656343,
|
||
|
|
"grad_norm": 0.03626371876045612,
|
||
|
|
"learning_rate": 2.964578510952847e-05,
|
||
|
|
"loss": 0.7371,
|
||
|
|
"step": 3705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8178110878430508,
|
||
|
|
"grad_norm": 0.03878297701004356,
|
||
|
|
"learning_rate": 2.9302172533612077e-05,
|
||
|
|
"loss": 0.747,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8189132591204673,
|
||
|
|
"grad_norm": 0.038112632260334955,
|
||
|
|
"learning_rate": 2.8960347238003488e-05,
|
||
|
|
"loss": 0.7579,
|
||
|
|
"step": 3715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8200154303978838,
|
||
|
|
"grad_norm": 0.03926882455039221,
|
||
|
|
"learning_rate": 2.8620314284414486e-05,
|
||
|
|
"loss": 0.7529,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8211176016753003,
|
||
|
|
"grad_norm": 0.03648970703950733,
|
||
|
|
"learning_rate": 2.8282078708016163e-05,
|
||
|
|
"loss": 0.7473,
|
||
|
|
"step": 3725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8222197729527169,
|
||
|
|
"grad_norm": 0.03630414782533231,
|
||
|
|
"learning_rate": 2.7945645517364064e-05,
|
||
|
|
"loss": 0.7355,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8233219442301334,
|
||
|
|
"grad_norm": 0.036468505132406466,
|
||
|
|
"learning_rate": 2.7611019694324415e-05,
|
||
|
|
"loss": 0.7101,
|
||
|
|
"step": 3735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8244241155075499,
|
||
|
|
"grad_norm": 0.037520748694235606,
|
||
|
|
"learning_rate": 2.727820619399992e-05,
|
||
|
|
"loss": 0.7431,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8255262867849664,
|
||
|
|
"grad_norm": 0.03483967252567691,
|
||
|
|
"learning_rate": 2.6947209944656784e-05,
|
||
|
|
"loss": 0.7008,
|
||
|
|
"step": 3745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8266284580623829,
|
||
|
|
"grad_norm": 0.03683830824123343,
|
||
|
|
"learning_rate": 2.661803584765143e-05,
|
||
|
|
"loss": 0.7397,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8277306293397994,
|
||
|
|
"grad_norm": 0.037699100377305624,
|
||
|
|
"learning_rate": 2.6290688777358164e-05,
|
||
|
|
"loss": 0.7663,
|
||
|
|
"step": 3755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8288328006172159,
|
||
|
|
"grad_norm": 0.04181925311652994,
|
||
|
|
"learning_rate": 2.5965173581096748e-05,
|
||
|
|
"loss": 0.7553,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8299349718946324,
|
||
|
|
"grad_norm": 0.03779442944677197,
|
||
|
|
"learning_rate": 2.564149507906089e-05,
|
||
|
|
"loss": 0.7589,
|
||
|
|
"step": 3765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8310371431720489,
|
||
|
|
"grad_norm": 0.03577568203273454,
|
||
|
|
"learning_rate": 2.5319658064246595e-05,
|
||
|
|
"loss": 0.7446,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8321393144494654,
|
||
|
|
"grad_norm": 0.03931081842755508,
|
||
|
|
"learning_rate": 2.4999667302381404e-05,
|
||
|
|
"loss": 0.751,
|
||
|
|
"step": 3775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.833241485726882,
|
||
|
|
"grad_norm": 0.03745824041736122,
|
||
|
|
"learning_rate": 2.4681527531853835e-05,
|
||
|
|
"loss": 0.7123,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8343436570042985,
|
||
|
|
"grad_norm": 0.03527193664557951,
|
||
|
|
"learning_rate": 2.436524346364286e-05,
|
||
|
|
"loss": 0.7025,
|
||
|
|
"step": 3785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.835445828281715,
|
||
|
|
"grad_norm": 0.03606947650450607,
|
||
|
|
"learning_rate": 2.4050819781248647e-05,
|
||
|
|
"loss": 0.7206,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8365479995591315,
|
||
|
|
"grad_norm": 0.035888622608410504,
|
||
|
|
"learning_rate": 2.373826114062296e-05,
|
||
|
|
"loss": 0.7537,
|
||
|
|
"step": 3795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.837650170836548,
|
||
|
|
"grad_norm": 0.036850404718823324,
|
||
|
|
"learning_rate": 2.3427572170100112e-05,
|
||
|
|
"loss": 0.7638,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8387523421139645,
|
||
|
|
"grad_norm": 0.037724339996222885,
|
||
|
|
"learning_rate": 2.311875747032858e-05,
|
||
|
|
"loss": 0.7557,
|
||
|
|
"step": 3805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8398545133913811,
|
||
|
|
"grad_norm": 0.03518696843854021,
|
||
|
|
"learning_rate": 2.2811821614202897e-05,
|
||
|
|
"loss": 0.7602,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8409566846687976,
|
||
|
|
"grad_norm": 0.03634301372317722,
|
||
|
|
"learning_rate": 2.2506769146795893e-05,
|
||
|
|
"loss": 0.7427,
|
||
|
|
"step": 3815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.842058855946214,
|
||
|
|
"grad_norm": 0.03565812651945666,
|
||
|
|
"learning_rate": 2.2203604585291303e-05,
|
||
|
|
"loss": 0.7336,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8431610272236305,
|
||
|
|
"grad_norm": 0.03768119047665601,
|
||
|
|
"learning_rate": 2.1902332418916956e-05,
|
||
|
|
"loss": 0.7661,
|
||
|
|
"step": 3825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.844263198501047,
|
||
|
|
"grad_norm": 0.035791641873146804,
|
||
|
|
"learning_rate": 2.1602957108878434e-05,
|
||
|
|
"loss": 0.7589,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8453653697784635,
|
||
|
|
"grad_norm": 0.03888367446787956,
|
||
|
|
"learning_rate": 2.130548308829267e-05,
|
||
|
|
"loss": 0.7395,
|
||
|
|
"step": 3835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8464675410558801,
|
||
|
|
"grad_norm": 0.03780624024053704,
|
||
|
|
"learning_rate": 2.1009914762122694e-05,
|
||
|
|
"loss": 0.7324,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8475697123332966,
|
||
|
|
"grad_norm": 0.03701658755711745,
|
||
|
|
"learning_rate": 2.071625650711217e-05,
|
||
|
|
"loss": 0.7261,
|
||
|
|
"step": 3845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8486718836107131,
|
||
|
|
"grad_norm": 0.03349203678484499,
|
||
|
|
"learning_rate": 2.0424512671720566e-05,
|
||
|
|
"loss": 0.7285,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8497740548881296,
|
||
|
|
"grad_norm": 0.035807885382574296,
|
||
|
|
"learning_rate": 2.0134687576058878e-05,
|
||
|
|
"loss": 0.7513,
|
||
|
|
"step": 3855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8508762261655461,
|
||
|
|
"grad_norm": 0.03540300293328915,
|
||
|
|
"learning_rate": 1.9846785511825618e-05,
|
||
|
|
"loss": 0.7506,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8519783974429627,
|
||
|
|
"grad_norm": 0.036548562423081375,
|
||
|
|
"learning_rate": 1.9560810742243298e-05,
|
||
|
|
"loss": 0.7486,
|
||
|
|
"step": 3865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8530805687203792,
|
||
|
|
"grad_norm": 0.03688805027139149,
|
||
|
|
"learning_rate": 1.9276767501995206e-05,
|
||
|
|
"loss": 0.756,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8541827399977957,
|
||
|
|
"grad_norm": 0.03511921343987327,
|
||
|
|
"learning_rate": 1.8994659997162687e-05,
|
||
|
|
"loss": 0.7188,
|
||
|
|
"step": 3875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8552849112752122,
|
||
|
|
"grad_norm": 0.03593480104178096,
|
||
|
|
"learning_rate": 1.8714492405163072e-05,
|
||
|
|
"loss": 0.7241,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8563870825526286,
|
||
|
|
"grad_norm": 0.03912290065785617,
|
||
|
|
"learning_rate": 1.843626887468764e-05,
|
||
|
|
"loss": 0.735,
|
||
|
|
"step": 3885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8574892538300452,
|
||
|
|
"grad_norm": 0.036540249139284824,
|
||
|
|
"learning_rate": 1.8159993525640115e-05,
|
||
|
|
"loss": 0.7629,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8585914251074617,
|
||
|
|
"grad_norm": 0.03586116680339337,
|
||
|
|
"learning_rate": 1.788567044907585e-05,
|
||
|
|
"loss": 0.728,
|
||
|
|
"step": 3895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8596935963848782,
|
||
|
|
"grad_norm": 0.04108515819021104,
|
||
|
|
"learning_rate": 1.7613303707141164e-05,
|
||
|
|
"loss": 0.7544,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8607957676622947,
|
||
|
|
"grad_norm": 0.04154327787956279,
|
||
|
|
"learning_rate": 1.7342897333013112e-05,
|
||
|
|
"loss": 0.715,
|
||
|
|
"step": 3905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8618979389397112,
|
||
|
|
"grad_norm": 0.0375147580423421,
|
||
|
|
"learning_rate": 1.7074455330839943e-05,
|
||
|
|
"loss": 0.7325,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8630001102171277,
|
||
|
|
"grad_norm": 0.03505267883405521,
|
||
|
|
"learning_rate": 1.6807981675681587e-05,
|
||
|
|
"loss": 0.7463,
|
||
|
|
"step": 3915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8641022814945443,
|
||
|
|
"grad_norm": 0.035319705305492576,
|
||
|
|
"learning_rate": 1.654348031345104e-05,
|
||
|
|
"loss": 0.7225,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8652044527719608,
|
||
|
|
"grad_norm": 0.038996518100504834,
|
||
|
|
"learning_rate": 1.6280955160855628e-05,
|
||
|
|
"loss": 0.7537,
|
||
|
|
"step": 3925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8663066240493773,
|
||
|
|
"grad_norm": 0.03826307362758949,
|
||
|
|
"learning_rate": 1.602041010533934e-05,
|
||
|
|
"loss": 0.7287,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8674087953267938,
|
||
|
|
"grad_norm": 0.03711543681709775,
|
||
|
|
"learning_rate": 1.5761849005024985e-05,
|
||
|
|
"loss": 0.7709,
|
||
|
|
"step": 3935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8685109666042103,
|
||
|
|
"grad_norm": 0.03886269645051025,
|
||
|
|
"learning_rate": 1.5505275688657275e-05,
|
||
|
|
"loss": 0.733,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8696131378816268,
|
||
|
|
"grad_norm": 0.039368459771957895,
|
||
|
|
"learning_rate": 1.5250693955545929e-05,
|
||
|
|
"loss": 0.7377,
|
||
|
|
"step": 3945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8707153091590433,
|
||
|
|
"grad_norm": 0.03542501994719629,
|
||
|
|
"learning_rate": 1.4998107575509633e-05,
|
||
|
|
"loss": 0.7509,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8718174804364598,
|
||
|
|
"grad_norm": 0.03662399917022349,
|
||
|
|
"learning_rate": 1.4747520288820014e-05,
|
||
|
|
"loss": 0.7221,
|
||
|
|
"step": 3955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8729196517138763,
|
||
|
|
"grad_norm": 0.03657298741205027,
|
||
|
|
"learning_rate": 1.449893580614636e-05,
|
||
|
|
"loss": 0.7497,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8740218229912928,
|
||
|
|
"grad_norm": 0.03781150680434851,
|
||
|
|
"learning_rate": 1.425235780850067e-05,
|
||
|
|
"loss": 0.7582,
|
||
|
|
"step": 3965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8751239942687093,
|
||
|
|
"grad_norm": 0.04032655125637331,
|
||
|
|
"learning_rate": 1.4007789947183168e-05,
|
||
|
|
"loss": 0.7447,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8762261655461259,
|
||
|
|
"grad_norm": 0.03589883470863497,
|
||
|
|
"learning_rate": 1.3765235843728129e-05,
|
||
|
|
"loss": 0.7276,
|
||
|
|
"step": 3975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8773283368235424,
|
||
|
|
"grad_norm": 0.03717658884234944,
|
||
|
|
"learning_rate": 1.3524699089850328e-05,
|
||
|
|
"loss": 0.7401,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8784305081009589,
|
||
|
|
"grad_norm": 0.03497351776047112,
|
||
|
|
"learning_rate": 1.3286183247391868e-05,
|
||
|
|
"loss": 0.7392,
|
||
|
|
"step": 3985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8795326793783754,
|
||
|
|
"grad_norm": 0.03680345383865433,
|
||
|
|
"learning_rate": 1.3049691848269461e-05,
|
||
|
|
"loss": 0.7397,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.880634850655792,
|
||
|
|
"grad_norm": 0.037067904261301174,
|
||
|
|
"learning_rate": 1.2815228394421995e-05,
|
||
|
|
"loss": 0.7543,
|
||
|
|
"step": 3995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8817370219332085,
|
||
|
|
"grad_norm": 0.03635951971560963,
|
||
|
|
"learning_rate": 1.2582796357758829e-05,
|
||
|
|
"loss": 0.7268,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.882839193210625,
|
||
|
|
"grad_norm": 0.037149225593830666,
|
||
|
|
"learning_rate": 1.2352399180108286e-05,
|
||
|
|
"loss": 0.7447,
|
||
|
|
"step": 4005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8839413644880414,
|
||
|
|
"grad_norm": 0.034848783140386315,
|
||
|
|
"learning_rate": 1.2124040273166691e-05,
|
||
|
|
"loss": 0.7311,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8850435357654579,
|
||
|
|
"grad_norm": 0.03985554439292569,
|
||
|
|
"learning_rate": 1.1897723018447946e-05,
|
||
|
|
"loss": 0.7288,
|
||
|
|
"step": 4015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8861457070428744,
|
||
|
|
"grad_norm": 0.035706356323576875,
|
||
|
|
"learning_rate": 1.1673450767233388e-05,
|
||
|
|
"loss": 0.7326,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.887247878320291,
|
||
|
|
"grad_norm": 0.036740845098738074,
|
||
|
|
"learning_rate": 1.1451226840522077e-05,
|
||
|
|
"loss": 0.7496,
|
||
|
|
"step": 4025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8883500495977075,
|
||
|
|
"grad_norm": 0.037031961816322526,
|
||
|
|
"learning_rate": 1.1231054528981765e-05,
|
||
|
|
"loss": 0.7524,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.889452220875124,
|
||
|
|
"grad_norm": 0.03756810225346114,
|
||
|
|
"learning_rate": 1.1012937092900126e-05,
|
||
|
|
"loss": 0.7312,
|
||
|
|
"step": 4035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8905543921525405,
|
||
|
|
"grad_norm": 0.037472375253416255,
|
||
|
|
"learning_rate": 1.0796877762136458e-05,
|
||
|
|
"loss": 0.7544,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.891656563429957,
|
||
|
|
"grad_norm": 0.0359317417796455,
|
||
|
|
"learning_rate": 1.0582879736073819e-05,
|
||
|
|
"loss": 0.7354,
|
||
|
|
"step": 4045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8927587347073735,
|
||
|
|
"grad_norm": 0.03749781446172768,
|
||
|
|
"learning_rate": 1.03709461835717e-05,
|
||
|
|
"loss": 0.7546,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8938609059847901,
|
||
|
|
"grad_norm": 0.03733411968021827,
|
||
|
|
"learning_rate": 1.0161080242919129e-05,
|
||
|
|
"loss": 0.7259,
|
||
|
|
"step": 4055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8949630772622066,
|
||
|
|
"grad_norm": 0.03761288676599098,
|
||
|
|
"learning_rate": 9.953285021788143e-06,
|
||
|
|
"loss": 0.7489,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8960652485396231,
|
||
|
|
"grad_norm": 0.03412569507517913,
|
||
|
|
"learning_rate": 9.747563597187791e-06,
|
||
|
|
"loss": 0.7286,
|
||
|
|
"step": 4065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8971674198170395,
|
||
|
|
"grad_norm": 0.03525939402552114,
|
||
|
|
"learning_rate": 9.543919015418516e-06,
|
||
|
|
"loss": 0.7513,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.898269591094456,
|
||
|
|
"grad_norm": 0.03720593815884494,
|
||
|
|
"learning_rate": 9.342354292027215e-06,
|
||
|
|
"loss": 0.7474,
|
||
|
|
"step": 4075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8993717623718726,
|
||
|
|
"grad_norm": 0.0375416497695239,
|
||
|
|
"learning_rate": 9.142872411762354e-06,
|
||
|
|
"loss": 0.7685,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9004739336492891,
|
||
|
|
"grad_norm": 0.03906878754343892,
|
||
|
|
"learning_rate": 8.945476328529949e-06,
|
||
|
|
"loss": 0.732,
|
||
|
|
"step": 4085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9015761049267056,
|
||
|
|
"grad_norm": 0.035056491589514995,
|
||
|
|
"learning_rate": 8.750168965349713e-06,
|
||
|
|
"loss": 0.7436,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9026782762041221,
|
||
|
|
"grad_norm": 0.03470624619912502,
|
||
|
|
"learning_rate": 8.556953214311896e-06,
|
||
|
|
"loss": 0.6928,
|
||
|
|
"step": 4095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9037804474815386,
|
||
|
|
"grad_norm": 0.03557139688126229,
|
||
|
|
"learning_rate": 8.365831936534289e-06,
|
||
|
|
"loss": 0.7236,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9048826187589551,
|
||
|
|
"grad_norm": 0.03860114842216224,
|
||
|
|
"learning_rate": 8.17680796212003e-06,
|
||
|
|
"loss": 0.7367,
|
||
|
|
"step": 4105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9059847900363717,
|
||
|
|
"grad_norm": 0.0358454686914662,
|
||
|
|
"learning_rate": 7.989884090115579e-06,
|
||
|
|
"loss": 0.7393,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9070869613137882,
|
||
|
|
"grad_norm": 0.036131468273917416,
|
||
|
|
"learning_rate": 7.80506308846927e-06,
|
||
|
|
"loss": 0.7187,
|
||
|
|
"step": 4115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9081891325912047,
|
||
|
|
"grad_norm": 0.03431907826116676,
|
||
|
|
"learning_rate": 7.622347693990438e-06,
|
||
|
|
"loss": 0.7368,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9092913038686212,
|
||
|
|
"grad_norm": 0.036378482006826245,
|
||
|
|
"learning_rate": 7.4417406123088e-06,
|
||
|
|
"loss": 0.7123,
|
||
|
|
"step": 4125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9103934751460377,
|
||
|
|
"grad_norm": 0.03610068662140505,
|
||
|
|
"learning_rate": 7.263244517834365e-06,
|
||
|
|
"loss": 0.7298,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9114956464234542,
|
||
|
|
"grad_norm": 0.035675737631172474,
|
||
|
|
"learning_rate": 7.086862053717867e-06,
|
||
|
|
"loss": 0.7329,
|
||
|
|
"step": 4135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9125978177008707,
|
||
|
|
"grad_norm": 0.03488018397228746,
|
||
|
|
"learning_rate": 6.91259583181169e-06,
|
||
|
|
"loss": 0.7459,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9136999889782872,
|
||
|
|
"grad_norm": 0.037711098080699654,
|
||
|
|
"learning_rate": 6.740448432631118e-06,
|
||
|
|
"loss": 0.7456,
|
||
|
|
"step": 4145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9148021602557037,
|
||
|
|
"grad_norm": 0.03683158716361964,
|
||
|
|
"learning_rate": 6.570422405316117e-06,
|
||
|
|
"loss": 0.7477,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9159043315331202,
|
||
|
|
"grad_norm": 0.03722178270525642,
|
||
|
|
"learning_rate": 6.4025202675935635e-06,
|
||
|
|
"loss": 0.7668,
|
||
|
|
"step": 4155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9170065028105367,
|
||
|
|
"grad_norm": 0.03548538866880862,
|
||
|
|
"learning_rate": 6.236744505740126e-06,
|
||
|
|
"loss": 0.7612,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9181086740879533,
|
||
|
|
"grad_norm": 0.03573889132178891,
|
||
|
|
"learning_rate": 6.073097574545244e-06,
|
||
|
|
"loss": 0.7374,
|
||
|
|
"step": 4165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9192108453653698,
|
||
|
|
"grad_norm": 0.03490127726620679,
|
||
|
|
"learning_rate": 5.91158189727487e-06,
|
||
|
|
"loss": 0.7233,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9203130166427863,
|
||
|
|
"grad_norm": 0.04015606965771483,
|
||
|
|
"learning_rate": 5.752199865635604e-06,
|
||
|
|
"loss": 0.7356,
|
||
|
|
"step": 4175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9214151879202028,
|
||
|
|
"grad_norm": 0.038889913565105266,
|
||
|
|
"learning_rate": 5.594953839739252e-06,
|
||
|
|
"loss": 0.7571,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9225173591976193,
|
||
|
|
"grad_norm": 0.03632902640398831,
|
||
|
|
"learning_rate": 5.439846148067856e-06,
|
||
|
|
"loss": 0.7478,
|
||
|
|
"step": 4185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9236195304750359,
|
||
|
|
"grad_norm": 0.036911826600897674,
|
||
|
|
"learning_rate": 5.2868790874392495e-06,
|
||
|
|
"loss": 0.7351,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9247217017524524,
|
||
|
|
"grad_norm": 0.035394959022726914,
|
||
|
|
"learning_rate": 5.13605492297306e-06,
|
||
|
|
"loss": 0.7513,
|
||
|
|
"step": 4195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9258238730298688,
|
||
|
|
"grad_norm": 0.038584292984150205,
|
||
|
|
"learning_rate": 4.98737588805711e-06,
|
||
|
|
"loss": 0.7503,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9269260443072853,
|
||
|
|
"grad_norm": 0.03443755510774623,
|
||
|
|
"learning_rate": 4.840844184314368e-06,
|
||
|
|
"loss": 0.7467,
|
||
|
|
"step": 4205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9280282155847018,
|
||
|
|
"grad_norm": 0.03507393739539605,
|
||
|
|
"learning_rate": 4.696461981570371e-06,
|
||
|
|
"loss": 0.7479,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9291303868621184,
|
||
|
|
"grad_norm": 0.03572284290016147,
|
||
|
|
"learning_rate": 4.554231417821147e-06,
|
||
|
|
"loss": 0.7438,
|
||
|
|
"step": 4215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9302325581395349,
|
||
|
|
"grad_norm": 0.03939452564404578,
|
||
|
|
"learning_rate": 4.414154599201314e-06,
|
||
|
|
"loss": 0.7528,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9313347294169514,
|
||
|
|
"grad_norm": 0.0343058423932778,
|
||
|
|
"learning_rate": 4.2762335999532494e-06,
|
||
|
|
"loss": 0.7123,
|
||
|
|
"step": 4225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9324369006943679,
|
||
|
|
"grad_norm": 0.03688347815350498,
|
||
|
|
"learning_rate": 4.140470462396101e-06,
|
||
|
|
"loss": 0.7363,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9335390719717844,
|
||
|
|
"grad_norm": 0.03669486942706104,
|
||
|
|
"learning_rate": 4.006867196895641e-06,
|
||
|
|
"loss": 0.7285,
|
||
|
|
"step": 4235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.934641243249201,
|
||
|
|
"grad_norm": 0.035278173584095886,
|
||
|
|
"learning_rate": 3.8754257818345125e-06,
|
||
|
|
"loss": 0.7273,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9357434145266175,
|
||
|
|
"grad_norm": 0.03838555636403366,
|
||
|
|
"learning_rate": 3.7461481635828793e-06,
|
||
|
|
"loss": 0.7406,
|
||
|
|
"step": 4245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.936845585804034,
|
||
|
|
"grad_norm": 0.03449587188356417,
|
||
|
|
"learning_rate": 3.619036256469704e-06,
|
||
|
|
"loss": 0.719,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9379477570814505,
|
||
|
|
"grad_norm": 0.03617056410014076,
|
||
|
|
"learning_rate": 3.4940919427542345e-06,
|
||
|
|
"loss": 0.7074,
|
||
|
|
"step": 4255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9390499283588669,
|
||
|
|
"grad_norm": 0.03720545719113871,
|
||
|
|
"learning_rate": 3.371317072598312e-06,
|
||
|
|
"loss": 0.761,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9401520996362834,
|
||
|
|
"grad_norm": 0.03733982842334115,
|
||
|
|
"learning_rate": 3.2507134640388566e-06,
|
||
|
|
"loss": 0.7373,
|
||
|
|
"step": 4265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9412542709137,
|
||
|
|
"grad_norm": 0.03832577468879599,
|
||
|
|
"learning_rate": 3.132282902961025e-06,
|
||
|
|
"loss": 0.7744,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9423564421911165,
|
||
|
|
"grad_norm": 0.03561487376514044,
|
||
|
|
"learning_rate": 3.016027143071631e-06,
|
||
|
|
"loss": 0.7367,
|
||
|
|
"step": 4275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.943458613468533,
|
||
|
|
"grad_norm": 0.037260538854841714,
|
||
|
|
"learning_rate": 2.9019479058733974e-06,
|
||
|
|
"loss": 0.7412,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9445607847459495,
|
||
|
|
"grad_norm": 0.033914247331261665,
|
||
|
|
"learning_rate": 2.7900468806392128e-06,
|
||
|
|
"loss": 0.7191,
|
||
|
|
"step": 4285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.945662956023366,
|
||
|
|
"grad_norm": 0.036359448459938014,
|
||
|
|
"learning_rate": 2.6803257243873165e-06,
|
||
|
|
"loss": 0.7236,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9467651273007825,
|
||
|
|
"grad_norm": 0.036671268303379516,
|
||
|
|
"learning_rate": 2.572786061856652e-06,
|
||
|
|
"loss": 0.706,
|
||
|
|
"step": 4295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9478672985781991,
|
||
|
|
"grad_norm": 0.03688898129821375,
|
||
|
|
"learning_rate": 2.467429485482869e-06,
|
||
|
|
"loss": 0.7719,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9489694698556156,
|
||
|
|
"grad_norm": 0.035848288240590206,
|
||
|
|
"learning_rate": 2.3642575553746933e-06,
|
||
|
|
"loss": 0.7375,
|
||
|
|
"step": 4305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9500716411330321,
|
||
|
|
"grad_norm": 0.03649010626732304,
|
||
|
|
"learning_rate": 2.2632717992908278e-06,
|
||
|
|
"loss": 0.7492,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9511738124104486,
|
||
|
|
"grad_norm": 0.035622089704062276,
|
||
|
|
"learning_rate": 2.164473712617387e-06,
|
||
|
|
"loss": 0.7277,
|
||
|
|
"step": 4315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9522759836878651,
|
||
|
|
"grad_norm": 0.034793596935363234,
|
||
|
|
"learning_rate": 2.0678647583456995e-06,
|
||
|
|
"loss": 0.7167,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9533781549652816,
|
||
|
|
"grad_norm": 0.038314433912779305,
|
||
|
|
"learning_rate": 1.973446367050674e-06,
|
||
|
|
"loss": 0.694,
|
||
|
|
"step": 4325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9544803262426981,
|
||
|
|
"grad_norm": 0.032175274076676946,
|
||
|
|
"learning_rate": 1.8812199368695325e-06,
|
||
|
|
"loss": 0.7399,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9555824975201146,
|
||
|
|
"grad_norm": 0.03747625530620859,
|
||
|
|
"learning_rate": 1.7911868334812618e-06,
|
||
|
|
"loss": 0.7543,
|
||
|
|
"step": 4335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9566846687975311,
|
||
|
|
"grad_norm": 0.03469398206914549,
|
||
|
|
"learning_rate": 1.7033483900862953e-06,
|
||
|
|
"loss": 0.719,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9577868400749476,
|
||
|
|
"grad_norm": 0.03716371251632767,
|
||
|
|
"learning_rate": 1.617705907386696e-06,
|
||
|
|
"loss": 0.7242,
|
||
|
|
"step": 4345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9588890113523642,
|
||
|
|
"grad_norm": 0.033096194137156504,
|
||
|
|
"learning_rate": 1.5342606535670877e-06,
|
||
|
|
"loss": 0.7395,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9599911826297807,
|
||
|
|
"grad_norm": 0.038402200718579416,
|
||
|
|
"learning_rate": 1.4530138642756872e-06,
|
||
|
|
"loss": 0.75,
|
||
|
|
"step": 4355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9610933539071972,
|
||
|
|
"grad_norm": 0.03629339352134901,
|
||
|
|
"learning_rate": 1.3739667426061196e-06,
|
||
|
|
"loss": 0.7121,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9621955251846137,
|
||
|
|
"grad_norm": 0.03480704310265325,
|
||
|
|
"learning_rate": 1.2971204590795813e-06,
|
||
|
|
"loss": 0.7347,
|
||
|
|
"step": 4365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9632976964620302,
|
||
|
|
"grad_norm": 0.03454475792553819,
|
||
|
|
"learning_rate": 1.2224761516274883e-06,
|
||
|
|
"loss": 0.7322,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9643998677394467,
|
||
|
|
"grad_norm": 0.0348941999387763,
|
||
|
|
"learning_rate": 1.1500349255746055e-06,
|
||
|
|
"loss": 0.744,
|
||
|
|
"step": 4375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9655020390168633,
|
||
|
|
"grad_norm": 0.03661845220058871,
|
||
|
|
"learning_rate": 1.0797978536227602e-06,
|
||
|
|
"loss": 0.7429,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9666042102942798,
|
||
|
|
"grad_norm": 0.03670130845399715,
|
||
|
|
"learning_rate": 1.011765975834855e-06,
|
||
|
|
"loss": 0.7508,
|
||
|
|
"step": 4385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9677063815716962,
|
||
|
|
"grad_norm": 0.037482737750151054,
|
||
|
|
"learning_rate": 9.459402996195797e-07,
|
||
|
|
"loss": 0.7199,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9688085528491127,
|
||
|
|
"grad_norm": 0.03609308663664764,
|
||
|
|
"learning_rate": 8.823217997163401e-07,
|
||
|
|
"loss": 0.7202,
|
||
|
|
"step": 4395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9699107241265292,
|
||
|
|
"grad_norm": 0.03562801152195268,
|
||
|
|
"learning_rate": 8.209114181810029e-07,
|
||
|
|
"loss": 0.7519,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9710128954039458,
|
||
|
|
"grad_norm": 0.037875623718305995,
|
||
|
|
"learning_rate": 7.617100643718066e-07,
|
||
|
|
"loss": 0.736,
|
||
|
|
"step": 4405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9721150666813623,
|
||
|
|
"grad_norm": 0.0343579172461373,
|
||
|
|
"learning_rate": 7.04718614935973e-07,
|
||
|
|
"loss": 0.7319,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9732172379587788,
|
||
|
|
"grad_norm": 0.0351522271270778,
|
||
|
|
"learning_rate": 6.499379137966831e-07,
|
||
|
|
"loss": 0.7158,
|
||
|
|
"step": 4415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9743194092361953,
|
||
|
|
"grad_norm": 0.03685325990487051,
|
||
|
|
"learning_rate": 5.973687721405884e-07,
|
||
|
|
"loss": 0.7343,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9754215805136118,
|
||
|
|
"grad_norm": 0.03353546299147592,
|
||
|
|
"learning_rate": 5.470119684058527e-07,
|
||
|
|
"loss": 0.73,
|
||
|
|
"step": 4425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9765237517910283,
|
||
|
|
"grad_norm": 0.03850593917914308,
|
||
|
|
"learning_rate": 4.988682482705286e-07,
|
||
|
|
"loss": 0.756,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9776259230684449,
|
||
|
|
"grad_norm": 0.040410693859280186,
|
||
|
|
"learning_rate": 4.5293832464159965e-07,
|
||
|
|
"loss": 0.7463,
|
||
|
|
"step": 4435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9787280943458614,
|
||
|
|
"grad_norm": 0.03585489720397234,
|
||
|
|
"learning_rate": 4.0922287764438843e-07,
|
||
|
|
"loss": 0.7227,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9798302656232779,
|
||
|
|
"grad_norm": 0.03643983778724927,
|
||
|
|
"learning_rate": 3.677225546124818e-07,
|
||
|
|
"loss": 0.7297,
|
||
|
|
"step": 4445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9809324369006943,
|
||
|
|
"grad_norm": 0.03527743845105325,
|
||
|
|
"learning_rate": 3.2843797007812147e-07,
|
||
|
|
"loss": 0.7343,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9820346081781108,
|
||
|
|
"grad_norm": 0.03420044354382039,
|
||
|
|
"learning_rate": 2.913697057632114e-07,
|
||
|
|
"loss": 0.7211,
|
||
|
|
"step": 4455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9831367794555274,
|
||
|
|
"grad_norm": 0.03541898225117583,
|
||
|
|
"learning_rate": 2.565183105705415e-07,
|
||
|
|
"loss": 0.7596,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9842389507329439,
|
||
|
|
"grad_norm": 0.03803822863693149,
|
||
|
|
"learning_rate": 2.23884300575794e-07,
|
||
|
|
"loss": 0.7484,
|
||
|
|
"step": 4465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9853411220103604,
|
||
|
|
"grad_norm": 0.0339643099555736,
|
||
|
|
"learning_rate": 1.9346815901984947e-07,
|
||
|
|
"loss": 0.7259,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9864432932877769,
|
||
|
|
"grad_norm": 0.035078257970822355,
|
||
|
|
"learning_rate": 1.6527033630162613e-07,
|
||
|
|
"loss": 0.7306,
|
||
|
|
"step": 4475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9875454645651934,
|
||
|
|
"grad_norm": 0.0345533584530425,
|
||
|
|
"learning_rate": 1.392912499714016e-07,
|
||
|
|
"loss": 0.734,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.98864763584261,
|
||
|
|
"grad_norm": 0.03944350230563913,
|
||
|
|
"learning_rate": 1.1553128472468476e-07,
|
||
|
|
"loss": 0.764,
|
||
|
|
"step": 4485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9897498071200265,
|
||
|
|
"grad_norm": 0.0357818169230016,
|
||
|
|
"learning_rate": 9.39907923964367e-08,
|
||
|
|
"loss": 0.7235,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.990851978397443,
|
||
|
|
"grad_norm": 0.03566232250402245,
|
||
|
|
"learning_rate": 7.467009195594176e-08,
|
||
|
|
"loss": 0.7433,
|
||
|
|
"step": 4495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9919541496748595,
|
||
|
|
"grad_norm": 0.03513694284161295,
|
||
|
|
"learning_rate": 5.7569469502011247e-08,
|
||
|
|
"loss": 0.7419,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.993056320952276,
|
||
|
|
"grad_norm": 0.033604401356817075,
|
||
|
|
"learning_rate": 4.2689178258820125e-08,
|
||
|
|
"loss": 0.7302,
|
||
|
|
"step": 4505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9941584922296925,
|
||
|
|
"grad_norm": 0.038757934920082,
|
||
|
|
"learning_rate": 3.0029438572110045e-08,
|
||
|
|
"loss": 0.7509,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.995260663507109,
|
||
|
|
"grad_norm": 0.03542769692423219,
|
||
|
|
"learning_rate": 1.959043790590864e-08,
|
||
|
|
"loss": 0.7425,
|
||
|
|
"step": 4515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9963628347845255,
|
||
|
|
"grad_norm": 0.037185347711832795,
|
||
|
|
"learning_rate": 1.137233083983169e-08,
|
||
|
|
"loss": 0.7187,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.997465006061942,
|
||
|
|
"grad_norm": 0.03758234390906879,
|
||
|
|
"learning_rate": 5.375239066685022e-09,
|
||
|
|
"loss": 0.7592,
|
||
|
|
"step": 4525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9985671773393585,
|
||
|
|
"grad_norm": 0.036473312127062285,
|
||
|
|
"learning_rate": 1.5992513907658878e-09,
|
||
|
|
"loss": 0.7272,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.999669348616775,
|
||
|
|
"grad_norm": 0.03634038177846733,
|
||
|
|
"learning_rate": 4.442372649737791e-11,
|
||
|
|
"loss": 0.7268,
|
||
|
|
"step": 4535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9998897828722584,
|
||
|
|
"eval_loss": 1.1339110136032104,
|
||
|
|
"eval_runtime": 1020.4828,
|
||
|
|
"eval_samples_per_second": 187.325,
|
||
|
|
"eval_steps_per_second": 5.854,
|
||
|
|
"step": 4536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9998897828722584,
|
||
|
|
"step": 4536,
|
||
|
|
"total_flos": 693442503278592.0,
|
||
|
|
"train_loss": 0.8111531063651491,
|
||
|
|
"train_runtime": 19545.8709,
|
||
|
|
"train_samples_per_second": 29.708,
|
||
|
|
"train_steps_per_second": 0.232
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 4536,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": false,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 693442503278592.0,
|
||
|
|
"train_batch_size": 4,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|