4562 lines
110 KiB
JSON
4562 lines
110 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"eval_steps": 50,
|
||
|
|
"global_step": 633,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.004739336492890996,
|
||
|
|
"grad_norm": 33.21675090695969,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 1.6607,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.009478672985781991,
|
||
|
|
"grad_norm": 145.79448143612095,
|
||
|
|
"learning_rate": 4.739336492890996e-08,
|
||
|
|
"loss": 6.9341,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014218009478672985,
|
||
|
|
"grad_norm": 76.12710929983453,
|
||
|
|
"learning_rate": 9.478672985781992e-08,
|
||
|
|
"loss": 4.0894,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.018957345971563982,
|
||
|
|
"grad_norm": 120.34557987524876,
|
||
|
|
"learning_rate": 1.4218009478672986e-07,
|
||
|
|
"loss": 4.8714,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.023696682464454975,
|
||
|
|
"grad_norm": 45.93915597012246,
|
||
|
|
"learning_rate": 1.8957345971563984e-07,
|
||
|
|
"loss": 3.0785,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02843601895734597,
|
||
|
|
"grad_norm": 7.588158469972687,
|
||
|
|
"learning_rate": 2.3696682464454978e-07,
|
||
|
|
"loss": 1.1787,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03317535545023697,
|
||
|
|
"grad_norm": 123.2126299473189,
|
||
|
|
"learning_rate": 2.843601895734597e-07,
|
||
|
|
"loss": 4.9773,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.037914691943127965,
|
||
|
|
"grad_norm": 34.85405544555481,
|
||
|
|
"learning_rate": 3.317535545023697e-07,
|
||
|
|
"loss": 1.9134,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04265402843601896,
|
||
|
|
"grad_norm": 6.387158096213077,
|
||
|
|
"learning_rate": 3.791469194312797e-07,
|
||
|
|
"loss": 1.078,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04739336492890995,
|
||
|
|
"grad_norm": 90.5056475587616,
|
||
|
|
"learning_rate": 4.265402843601896e-07,
|
||
|
|
"loss": 3.3833,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.052132701421800945,
|
||
|
|
"grad_norm": 19.847643429642027,
|
||
|
|
"learning_rate": 4.7393364928909956e-07,
|
||
|
|
"loss": 1.6906,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05687203791469194,
|
||
|
|
"grad_norm": 69.85258553912271,
|
||
|
|
"learning_rate": 5.213270142180095e-07,
|
||
|
|
"loss": 3.443,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.061611374407582936,
|
||
|
|
"grad_norm": 66.07803912705128,
|
||
|
|
"learning_rate": 5.687203791469194e-07,
|
||
|
|
"loss": 2.998,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06635071090047394,
|
||
|
|
"grad_norm": 60.825322694407426,
|
||
|
|
"learning_rate": 6.161137440758294e-07,
|
||
|
|
"loss": 2.9808,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07109004739336493,
|
||
|
|
"grad_norm": 12.100377024176268,
|
||
|
|
"learning_rate": 6.635071090047394e-07,
|
||
|
|
"loss": 1.1544,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07582938388625593,
|
||
|
|
"grad_norm": 67.6920658849621,
|
||
|
|
"learning_rate": 7.109004739336493e-07,
|
||
|
|
"loss": 2.9571,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08056872037914692,
|
||
|
|
"grad_norm": 18.392508410355724,
|
||
|
|
"learning_rate": 7.582938388625594e-07,
|
||
|
|
"loss": 1.3217,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08530805687203792,
|
||
|
|
"grad_norm": 70.96881552186444,
|
||
|
|
"learning_rate": 8.056872037914692e-07,
|
||
|
|
"loss": 3.4988,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09004739336492891,
|
||
|
|
"grad_norm": 11.624035689102646,
|
||
|
|
"learning_rate": 8.530805687203792e-07,
|
||
|
|
"loss": 1.3954,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0947867298578199,
|
||
|
|
"grad_norm": 63.05933859985993,
|
||
|
|
"learning_rate": 9.004739336492892e-07,
|
||
|
|
"loss": 2.6776,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0995260663507109,
|
||
|
|
"grad_norm": 8.32831094452385,
|
||
|
|
"learning_rate": 9.478672985781991e-07,
|
||
|
|
"loss": 1.0306,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10426540284360189,
|
||
|
|
"grad_norm": 45.10580181897829,
|
||
|
|
"learning_rate": 9.95260663507109e-07,
|
||
|
|
"loss": 2.3985,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10900473933649289,
|
||
|
|
"grad_norm": 15.893169103733232,
|
||
|
|
"learning_rate": 1.042654028436019e-06,
|
||
|
|
"loss": 1.2971,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11374407582938388,
|
||
|
|
"grad_norm": 29.8234296502016,
|
||
|
|
"learning_rate": 1.090047393364929e-06,
|
||
|
|
"loss": 1.919,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11848341232227488,
|
||
|
|
"grad_norm": 82.09962631491388,
|
||
|
|
"learning_rate": 1.1374407582938388e-06,
|
||
|
|
"loss": 3.6626,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12322274881516587,
|
||
|
|
"grad_norm": 40.04271756686343,
|
||
|
|
"learning_rate": 1.184834123222749e-06,
|
||
|
|
"loss": 2.308,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12796208530805686,
|
||
|
|
"grad_norm": 27.449517033776978,
|
||
|
|
"learning_rate": 1.2322274881516587e-06,
|
||
|
|
"loss": 1.8155,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13270142180094788,
|
||
|
|
"grad_norm": 10.639809456341752,
|
||
|
|
"learning_rate": 1.2796208530805687e-06,
|
||
|
|
"loss": 1.0393,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13744075829383887,
|
||
|
|
"grad_norm": 54.85981944259095,
|
||
|
|
"learning_rate": 1.3270142180094788e-06,
|
||
|
|
"loss": 2.7517,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14218009478672985,
|
||
|
|
"grad_norm": 10.614676945303964,
|
||
|
|
"learning_rate": 1.3744075829383887e-06,
|
||
|
|
"loss": 1.1101,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14691943127962084,
|
||
|
|
"grad_norm": 24.53181043088714,
|
||
|
|
"learning_rate": 1.4218009478672987e-06,
|
||
|
|
"loss": 1.7597,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15165876777251186,
|
||
|
|
"grad_norm": 21.276538744853784,
|
||
|
|
"learning_rate": 1.4691943127962086e-06,
|
||
|
|
"loss": 1.8453,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15639810426540285,
|
||
|
|
"grad_norm": 37.367412651323995,
|
||
|
|
"learning_rate": 1.5165876777251187e-06,
|
||
|
|
"loss": 1.9205,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16113744075829384,
|
||
|
|
"grad_norm": 4.908149316461357,
|
||
|
|
"learning_rate": 1.5639810426540287e-06,
|
||
|
|
"loss": 1.0521,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16587677725118483,
|
||
|
|
"grad_norm": 18.58884607239678,
|
||
|
|
"learning_rate": 1.6113744075829384e-06,
|
||
|
|
"loss": 1.2307,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17061611374407584,
|
||
|
|
"grad_norm": 4.500814297819521,
|
||
|
|
"learning_rate": 1.6587677725118483e-06,
|
||
|
|
"loss": 0.9168,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17535545023696683,
|
||
|
|
"grad_norm": 21.66083636003178,
|
||
|
|
"learning_rate": 1.7061611374407585e-06,
|
||
|
|
"loss": 1.5528,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18009478672985782,
|
||
|
|
"grad_norm": 18.173576160275513,
|
||
|
|
"learning_rate": 1.7535545023696684e-06,
|
||
|
|
"loss": 1.4227,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1848341232227488,
|
||
|
|
"grad_norm": 3.6059374841290643,
|
||
|
|
"learning_rate": 1.8009478672985784e-06,
|
||
|
|
"loss": 0.9369,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1895734597156398,
|
||
|
|
"grad_norm": 13.61011186508229,
|
||
|
|
"learning_rate": 1.8483412322274883e-06,
|
||
|
|
"loss": 1.3158,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1943127962085308,
|
||
|
|
"grad_norm": 13.70693894260049,
|
||
|
|
"learning_rate": 1.8957345971563982e-06,
|
||
|
|
"loss": 1.3908,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1990521327014218,
|
||
|
|
"grad_norm": 6.763803508410375,
|
||
|
|
"learning_rate": 1.943127962085308e-06,
|
||
|
|
"loss": 1.1349,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2037914691943128,
|
||
|
|
"grad_norm": 5.206847724219033,
|
||
|
|
"learning_rate": 1.990521327014218e-06,
|
||
|
|
"loss": 0.8841,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20853080568720378,
|
||
|
|
"grad_norm": 4.113335406268759,
|
||
|
|
"learning_rate": 2.037914691943128e-06,
|
||
|
|
"loss": 0.8342,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2132701421800948,
|
||
|
|
"grad_norm": 13.973887785697313,
|
||
|
|
"learning_rate": 2.085308056872038e-06,
|
||
|
|
"loss": 1.5232,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21800947867298578,
|
||
|
|
"grad_norm": 24.178882765868533,
|
||
|
|
"learning_rate": 2.1327014218009483e-06,
|
||
|
|
"loss": 1.7558,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22274881516587677,
|
||
|
|
"grad_norm": 15.258943952112597,
|
||
|
|
"learning_rate": 2.180094786729858e-06,
|
||
|
|
"loss": 1.4047,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22748815165876776,
|
||
|
|
"grad_norm": 6.028846809467111,
|
||
|
|
"learning_rate": 2.2274881516587678e-06,
|
||
|
|
"loss": 0.9313,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23222748815165878,
|
||
|
|
"grad_norm": 13.30832695052435,
|
||
|
|
"learning_rate": 2.2748815165876777e-06,
|
||
|
|
"loss": 1.3048,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23696682464454977,
|
||
|
|
"grad_norm": 32.03283861833234,
|
||
|
|
"learning_rate": 2.322274881516588e-06,
|
||
|
|
"loss": 1.9909,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23696682464454977,
|
||
|
|
"eval_loss": 1.5357390642166138,
|
||
|
|
"eval_runtime": 7.7918,
|
||
|
|
"eval_samples_per_second": 24.128,
|
||
|
|
"eval_steps_per_second": 6.032,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24170616113744076,
|
||
|
|
"grad_norm": 3.288468627282892,
|
||
|
|
"learning_rate": 2.369668246445498e-06,
|
||
|
|
"loss": 0.7391,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24644549763033174,
|
||
|
|
"grad_norm": 43.315163503233876,
|
||
|
|
"learning_rate": 2.417061611374408e-06,
|
||
|
|
"loss": 2.074,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25118483412322273,
|
||
|
|
"grad_norm": 18.160736628539848,
|
||
|
|
"learning_rate": 2.4644549763033174e-06,
|
||
|
|
"loss": 1.2847,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2559241706161137,
|
||
|
|
"grad_norm": 12.634993975676384,
|
||
|
|
"learning_rate": 2.5118483412322274e-06,
|
||
|
|
"loss": 1.18,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26066350710900477,
|
||
|
|
"grad_norm": 15.065309167670087,
|
||
|
|
"learning_rate": 2.5592417061611373e-06,
|
||
|
|
"loss": 1.3315,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26540284360189575,
|
||
|
|
"grad_norm": 3.1570174369908455,
|
||
|
|
"learning_rate": 2.606635071090048e-06,
|
||
|
|
"loss": 0.7762,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27014218009478674,
|
||
|
|
"grad_norm": 29.459791307849596,
|
||
|
|
"learning_rate": 2.6540284360189576e-06,
|
||
|
|
"loss": 1.6226,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27488151658767773,
|
||
|
|
"grad_norm": 21.723079688552776,
|
||
|
|
"learning_rate": 2.7014218009478675e-06,
|
||
|
|
"loss": 1.3176,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2796208530805687,
|
||
|
|
"grad_norm": 5.836364892449694,
|
||
|
|
"learning_rate": 2.7488151658767775e-06,
|
||
|
|
"loss": 0.7024,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2843601895734597,
|
||
|
|
"grad_norm": 18.693320830865055,
|
||
|
|
"learning_rate": 2.7962085308056874e-06,
|
||
|
|
"loss": 1.2722,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2890995260663507,
|
||
|
|
"grad_norm": 15.436571327706798,
|
||
|
|
"learning_rate": 2.8436018957345973e-06,
|
||
|
|
"loss": 1.3138,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2938388625592417,
|
||
|
|
"grad_norm": 5.930903722307386,
|
||
|
|
"learning_rate": 2.8909952606635073e-06,
|
||
|
|
"loss": 0.8624,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2985781990521327,
|
||
|
|
"grad_norm": 28.660964672923605,
|
||
|
|
"learning_rate": 2.938388625592417e-06,
|
||
|
|
"loss": 1.4304,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3033175355450237,
|
||
|
|
"grad_norm": 7.170512177516791,
|
||
|
|
"learning_rate": 2.985781990521327e-06,
|
||
|
|
"loss": 0.7581,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3080568720379147,
|
||
|
|
"grad_norm": 24.92537688193459,
|
||
|
|
"learning_rate": 3.0331753554502375e-06,
|
||
|
|
"loss": 1.5542,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3127962085308057,
|
||
|
|
"grad_norm": 6.670020974801537,
|
||
|
|
"learning_rate": 3.0805687203791474e-06,
|
||
|
|
"loss": 0.7192,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3175355450236967,
|
||
|
|
"grad_norm": 18.058205723760572,
|
||
|
|
"learning_rate": 3.1279620853080574e-06,
|
||
|
|
"loss": 1.2775,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3222748815165877,
|
||
|
|
"grad_norm": 2.7688353603669373,
|
||
|
|
"learning_rate": 3.1753554502369673e-06,
|
||
|
|
"loss": 0.7382,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32701421800947866,
|
||
|
|
"grad_norm": 25.79053830589144,
|
||
|
|
"learning_rate": 3.222748815165877e-06,
|
||
|
|
"loss": 1.4285,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33175355450236965,
|
||
|
|
"grad_norm": 27.17162166449944,
|
||
|
|
"learning_rate": 3.2701421800947867e-06,
|
||
|
|
"loss": 1.3835,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33649289099526064,
|
||
|
|
"grad_norm": 2.7709621071894515,
|
||
|
|
"learning_rate": 3.3175355450236967e-06,
|
||
|
|
"loss": 0.8456,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3412322274881517,
|
||
|
|
"grad_norm": 16.104863041856177,
|
||
|
|
"learning_rate": 3.3649289099526066e-06,
|
||
|
|
"loss": 1.0264,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3459715639810427,
|
||
|
|
"grad_norm": 7.87118824845877,
|
||
|
|
"learning_rate": 3.412322274881517e-06,
|
||
|
|
"loss": 0.9177,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35071090047393366,
|
||
|
|
"grad_norm": 14.424951696237573,
|
||
|
|
"learning_rate": 3.459715639810427e-06,
|
||
|
|
"loss": 1.3698,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35545023696682465,
|
||
|
|
"grad_norm": 26.356301557715284,
|
||
|
|
"learning_rate": 3.507109004739337e-06,
|
||
|
|
"loss": 1.4675,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36018957345971564,
|
||
|
|
"grad_norm": 30.488132509935415,
|
||
|
|
"learning_rate": 3.5545023696682468e-06,
|
||
|
|
"loss": 1.4732,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36492890995260663,
|
||
|
|
"grad_norm": 7.144103257314455,
|
||
|
|
"learning_rate": 3.6018957345971567e-06,
|
||
|
|
"loss": 0.9201,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3696682464454976,
|
||
|
|
"grad_norm": 12.335182641269517,
|
||
|
|
"learning_rate": 3.6492890995260666e-06,
|
||
|
|
"loss": 1.0759,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3744075829383886,
|
||
|
|
"grad_norm": 9.761423382810872,
|
||
|
|
"learning_rate": 3.6966824644549766e-06,
|
||
|
|
"loss": 1.1193,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3791469194312796,
|
||
|
|
"grad_norm": 6.228851618657622,
|
||
|
|
"learning_rate": 3.7440758293838865e-06,
|
||
|
|
"loss": 0.8188,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38388625592417064,
|
||
|
|
"grad_norm": 7.476211349226989,
|
||
|
|
"learning_rate": 3.7914691943127964e-06,
|
||
|
|
"loss": 1.0185,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3886255924170616,
|
||
|
|
"grad_norm": 6.008754086378737,
|
||
|
|
"learning_rate": 3.838862559241707e-06,
|
||
|
|
"loss": 0.9504,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3933649289099526,
|
||
|
|
"grad_norm": 39.784238511336135,
|
||
|
|
"learning_rate": 3.886255924170616e-06,
|
||
|
|
"loss": 1.508,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3981042654028436,
|
||
|
|
"grad_norm": 16.093115365998983,
|
||
|
|
"learning_rate": 3.933649289099527e-06,
|
||
|
|
"loss": 0.9674,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4028436018957346,
|
||
|
|
"grad_norm": 2.831228098225237,
|
||
|
|
"learning_rate": 3.981042654028436e-06,
|
||
|
|
"loss": 0.8576,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4075829383886256,
|
||
|
|
"grad_norm": 2.6654563752530755,
|
||
|
|
"learning_rate": 4.0284360189573465e-06,
|
||
|
|
"loss": 0.881,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41232227488151657,
|
||
|
|
"grad_norm": 24.963850396340955,
|
||
|
|
"learning_rate": 4.075829383886256e-06,
|
||
|
|
"loss": 1.381,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41706161137440756,
|
||
|
|
"grad_norm": 18.32888630488476,
|
||
|
|
"learning_rate": 4.123222748815166e-06,
|
||
|
|
"loss": 0.8679,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4218009478672986,
|
||
|
|
"grad_norm": 28.159604020225608,
|
||
|
|
"learning_rate": 4.170616113744076e-06,
|
||
|
|
"loss": 1.4164,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4265402843601896,
|
||
|
|
"grad_norm": 18.38643187927111,
|
||
|
|
"learning_rate": 4.218009478672986e-06,
|
||
|
|
"loss": 0.8316,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4312796208530806,
|
||
|
|
"grad_norm": 18.0776549889313,
|
||
|
|
"learning_rate": 4.265402843601897e-06,
|
||
|
|
"loss": 1.1651,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43601895734597157,
|
||
|
|
"grad_norm": 8.31511353691506,
|
||
|
|
"learning_rate": 4.312796208530806e-06,
|
||
|
|
"loss": 1.027,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44075829383886256,
|
||
|
|
"grad_norm": 2.984359322587433,
|
||
|
|
"learning_rate": 4.360189573459716e-06,
|
||
|
|
"loss": 0.7204,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44549763033175355,
|
||
|
|
"grad_norm": 6.1438951745676516,
|
||
|
|
"learning_rate": 4.407582938388626e-06,
|
||
|
|
"loss": 0.6249,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45023696682464454,
|
||
|
|
"grad_norm": 6.253599995679127,
|
||
|
|
"learning_rate": 4.4549763033175355e-06,
|
||
|
|
"loss": 0.8433,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4549763033175355,
|
||
|
|
"grad_norm": 16.121167837366702,
|
||
|
|
"learning_rate": 4.502369668246446e-06,
|
||
|
|
"loss": 1.2633,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4597156398104265,
|
||
|
|
"grad_norm": 23.92401887282444,
|
||
|
|
"learning_rate": 4.549763033175355e-06,
|
||
|
|
"loss": 1.1481,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46445497630331756,
|
||
|
|
"grad_norm": 7.54919968485265,
|
||
|
|
"learning_rate": 4.597156398104266e-06,
|
||
|
|
"loss": 0.8537,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46919431279620855,
|
||
|
|
"grad_norm": 16.4663797881457,
|
||
|
|
"learning_rate": 4.644549763033176e-06,
|
||
|
|
"loss": 1.0375,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47393364928909953,
|
||
|
|
"grad_norm": 3.2371645854636832,
|
||
|
|
"learning_rate": 4.691943127962086e-06,
|
||
|
|
"loss": 0.6856,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47393364928909953,
|
||
|
|
"eval_loss": 1.0985443592071533,
|
||
|
|
"eval_runtime": 7.9946,
|
||
|
|
"eval_samples_per_second": 23.516,
|
||
|
|
"eval_steps_per_second": 5.879,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4786729857819905,
|
||
|
|
"grad_norm": 3.8478671547588474,
|
||
|
|
"learning_rate": 4.739336492890996e-06,
|
||
|
|
"loss": 0.7252,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4834123222748815,
|
||
|
|
"grad_norm": 17.790211396697263,
|
||
|
|
"learning_rate": 4.7867298578199055e-06,
|
||
|
|
"loss": 1.2033,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4881516587677725,
|
||
|
|
"grad_norm": 6.598774872646845,
|
||
|
|
"learning_rate": 4.834123222748816e-06,
|
||
|
|
"loss": 0.7996,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4928909952606635,
|
||
|
|
"grad_norm": 17.90748957259168,
|
||
|
|
"learning_rate": 4.881516587677725e-06,
|
||
|
|
"loss": 0.6942,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4976303317535545,
|
||
|
|
"grad_norm": 6.276702585855472,
|
||
|
|
"learning_rate": 4.928909952606635e-06,
|
||
|
|
"loss": 0.881,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5023696682464455,
|
||
|
|
"grad_norm": 2.641182574106109,
|
||
|
|
"learning_rate": 4.976303317535545e-06,
|
||
|
|
"loss": 0.8275,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5071090047393365,
|
||
|
|
"grad_norm": 3.1740261509578676,
|
||
|
|
"learning_rate": 5.023696682464455e-06,
|
||
|
|
"loss": 0.64,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5118483412322274,
|
||
|
|
"grad_norm": 15.982354340368344,
|
||
|
|
"learning_rate": 5.071090047393366e-06,
|
||
|
|
"loss": 0.8089,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5165876777251185,
|
||
|
|
"grad_norm": 13.62346496564989,
|
||
|
|
"learning_rate": 5.118483412322275e-06,
|
||
|
|
"loss": 1.0223,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5213270142180095,
|
||
|
|
"grad_norm": 18.85115778380913,
|
||
|
|
"learning_rate": 5.165876777251185e-06,
|
||
|
|
"loss": 0.9443,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5260663507109005,
|
||
|
|
"grad_norm": 17.357644955205703,
|
||
|
|
"learning_rate": 5.213270142180096e-06,
|
||
|
|
"loss": 0.7135,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5308056872037915,
|
||
|
|
"grad_norm": 18.435671376617172,
|
||
|
|
"learning_rate": 5.260663507109005e-06,
|
||
|
|
"loss": 0.9026,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5355450236966824,
|
||
|
|
"grad_norm": 16.296720003324083,
|
||
|
|
"learning_rate": 5.308056872037915e-06,
|
||
|
|
"loss": 0.813,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5402843601895735,
|
||
|
|
"grad_norm": 17.387322006549645,
|
||
|
|
"learning_rate": 5.355450236966825e-06,
|
||
|
|
"loss": 0.8703,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5450236966824644,
|
||
|
|
"grad_norm": 17.84888433467405,
|
||
|
|
"learning_rate": 5.402843601895735e-06,
|
||
|
|
"loss": 0.9083,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5497630331753555,
|
||
|
|
"grad_norm": 15.567515178037798,
|
||
|
|
"learning_rate": 5.4502369668246446e-06,
|
||
|
|
"loss": 0.8269,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5545023696682464,
|
||
|
|
"grad_norm": 32.4120434440015,
|
||
|
|
"learning_rate": 5.497630331753555e-06,
|
||
|
|
"loss": 1.0408,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5592417061611374,
|
||
|
|
"grad_norm": 34.75683609822539,
|
||
|
|
"learning_rate": 5.5450236966824644e-06,
|
||
|
|
"loss": 1.169,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5639810426540285,
|
||
|
|
"grad_norm": 16.267510034467378,
|
||
|
|
"learning_rate": 5.592417061611375e-06,
|
||
|
|
"loss": 0.5434,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5687203791469194,
|
||
|
|
"grad_norm": 10.050816033748056,
|
||
|
|
"learning_rate": 5.639810426540285e-06,
|
||
|
|
"loss": 0.9619,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5734597156398105,
|
||
|
|
"grad_norm": 18.015087265001927,
|
||
|
|
"learning_rate": 5.687203791469195e-06,
|
||
|
|
"loss": 0.9081,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5781990521327014,
|
||
|
|
"grad_norm": 3.0788046399571103,
|
||
|
|
"learning_rate": 5.734597156398105e-06,
|
||
|
|
"loss": 0.8138,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5829383886255924,
|
||
|
|
"grad_norm": 40.81995319269455,
|
||
|
|
"learning_rate": 5.7819905213270145e-06,
|
||
|
|
"loss": 1.0204,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5876777251184834,
|
||
|
|
"grad_norm": 3.382154801748216,
|
||
|
|
"learning_rate": 5.829383886255925e-06,
|
||
|
|
"loss": 0.6919,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5924170616113744,
|
||
|
|
"grad_norm": 3.7193644674392594,
|
||
|
|
"learning_rate": 5.876777251184834e-06,
|
||
|
|
"loss": 0.7168,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5971563981042654,
|
||
|
|
"grad_norm": 2.952313965201417,
|
||
|
|
"learning_rate": 5.924170616113745e-06,
|
||
|
|
"loss": 0.771,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6018957345971564,
|
||
|
|
"grad_norm": 13.003492397640734,
|
||
|
|
"learning_rate": 5.971563981042654e-06,
|
||
|
|
"loss": 0.8174,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6066350710900474,
|
||
|
|
"grad_norm": 14.589344292080268,
|
||
|
|
"learning_rate": 6.018957345971565e-06,
|
||
|
|
"loss": 0.7877,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6113744075829384,
|
||
|
|
"grad_norm": 11.638671492972737,
|
||
|
|
"learning_rate": 6.066350710900475e-06,
|
||
|
|
"loss": 0.7807,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6161137440758294,
|
||
|
|
"grad_norm": 5.48286947202299,
|
||
|
|
"learning_rate": 6.1137440758293845e-06,
|
||
|
|
"loss": 0.8134,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6208530805687204,
|
||
|
|
"grad_norm": 16.323797258539084,
|
||
|
|
"learning_rate": 6.161137440758295e-06,
|
||
|
|
"loss": 0.5628,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6255924170616114,
|
||
|
|
"grad_norm": 14.23866523935885,
|
||
|
|
"learning_rate": 6.208530805687204e-06,
|
||
|
|
"loss": 1.0402,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6303317535545023,
|
||
|
|
"grad_norm": 17.516776965037845,
|
||
|
|
"learning_rate": 6.255924170616115e-06,
|
||
|
|
"loss": 0.5668,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6350710900473934,
|
||
|
|
"grad_norm": 3.203974060674095,
|
||
|
|
"learning_rate": 6.303317535545023e-06,
|
||
|
|
"loss": 0.7752,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6398104265402843,
|
||
|
|
"grad_norm": 15.925776501328883,
|
||
|
|
"learning_rate": 6.350710900473935e-06,
|
||
|
|
"loss": 0.4179,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6445497630331753,
|
||
|
|
"grad_norm": 13.118953573289614,
|
||
|
|
"learning_rate": 6.398104265402843e-06,
|
||
|
|
"loss": 0.818,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6492890995260664,
|
||
|
|
"grad_norm": 17.67906655380422,
|
||
|
|
"learning_rate": 6.445497630331754e-06,
|
||
|
|
"loss": 1.0207,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6540284360189573,
|
||
|
|
"grad_norm": 4.771268317889247,
|
||
|
|
"learning_rate": 6.492890995260665e-06,
|
||
|
|
"loss": 0.7833,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6587677725118484,
|
||
|
|
"grad_norm": 13.46170978768313,
|
||
|
|
"learning_rate": 6.5402843601895735e-06,
|
||
|
|
"loss": 0.8421,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6635071090047393,
|
||
|
|
"grad_norm": 13.374555494546588,
|
||
|
|
"learning_rate": 6.587677725118484e-06,
|
||
|
|
"loss": 0.7757,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6682464454976303,
|
||
|
|
"grad_norm": 5.6967734677721635,
|
||
|
|
"learning_rate": 6.635071090047393e-06,
|
||
|
|
"loss": 0.7167,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6729857819905213,
|
||
|
|
"grad_norm": 19.051040456172952,
|
||
|
|
"learning_rate": 6.682464454976304e-06,
|
||
|
|
"loss": 0.6418,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6777251184834123,
|
||
|
|
"grad_norm": 4.437733313814945,
|
||
|
|
"learning_rate": 6.729857819905213e-06,
|
||
|
|
"loss": 0.8421,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6824644549763034,
|
||
|
|
"grad_norm": 3.3824942666879303,
|
||
|
|
"learning_rate": 6.777251184834124e-06,
|
||
|
|
"loss": 0.6347,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6872037914691943,
|
||
|
|
"grad_norm": 16.134029693349987,
|
||
|
|
"learning_rate": 6.824644549763034e-06,
|
||
|
|
"loss": 0.7729,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6919431279620853,
|
||
|
|
"grad_norm": 9.522896679230966,
|
||
|
|
"learning_rate": 6.8720379146919435e-06,
|
||
|
|
"loss": 0.5691,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6966824644549763,
|
||
|
|
"grad_norm": 11.26439032421294,
|
||
|
|
"learning_rate": 6.919431279620854e-06,
|
||
|
|
"loss": 0.6833,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7014218009478673,
|
||
|
|
"grad_norm": 2.8946533391937144,
|
||
|
|
"learning_rate": 6.966824644549763e-06,
|
||
|
|
"loss": 0.8339,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7061611374407583,
|
||
|
|
"grad_norm": 12.567294143726862,
|
||
|
|
"learning_rate": 7.014218009478674e-06,
|
||
|
|
"loss": 0.7937,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7109004739336493,
|
||
|
|
"grad_norm": 4.118378272456221,
|
||
|
|
"learning_rate": 7.061611374407583e-06,
|
||
|
|
"loss": 0.9209,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7109004739336493,
|
||
|
|
"eval_loss": 0.5114782452583313,
|
||
|
|
"eval_runtime": 7.6835,
|
||
|
|
"eval_samples_per_second": 24.468,
|
||
|
|
"eval_steps_per_second": 6.117,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7156398104265402,
|
||
|
|
"grad_norm": 13.081643447337786,
|
||
|
|
"learning_rate": 7.1090047393364935e-06,
|
||
|
|
"loss": 0.5085,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7203791469194313,
|
||
|
|
"grad_norm": 7.131592313344539,
|
||
|
|
"learning_rate": 7.156398104265403e-06,
|
||
|
|
"loss": 0.5927,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7251184834123223,
|
||
|
|
"grad_norm": 11.868469385411386,
|
||
|
|
"learning_rate": 7.203791469194313e-06,
|
||
|
|
"loss": 0.4432,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7298578199052133,
|
||
|
|
"grad_norm": 13.89380031996673,
|
||
|
|
"learning_rate": 7.251184834123224e-06,
|
||
|
|
"loss": 0.526,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7345971563981043,
|
||
|
|
"grad_norm": 3.0245649047418084,
|
||
|
|
"learning_rate": 7.298578199052133e-06,
|
||
|
|
"loss": 0.6342,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7393364928909952,
|
||
|
|
"grad_norm": 8.99445909152358,
|
||
|
|
"learning_rate": 7.345971563981044e-06,
|
||
|
|
"loss": 0.48,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7440758293838863,
|
||
|
|
"grad_norm": 4.835328254993896,
|
||
|
|
"learning_rate": 7.393364928909953e-06,
|
||
|
|
"loss": 0.7006,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7488151658767772,
|
||
|
|
"grad_norm": 2.56401277703409,
|
||
|
|
"learning_rate": 7.4407582938388635e-06,
|
||
|
|
"loss": 0.565,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7535545023696683,
|
||
|
|
"grad_norm": 8.408290523647263,
|
||
|
|
"learning_rate": 7.488151658767773e-06,
|
||
|
|
"loss": 0.5788,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7582938388625592,
|
||
|
|
"grad_norm": 2.448536630140397,
|
||
|
|
"learning_rate": 7.535545023696683e-06,
|
||
|
|
"loss": 0.7817,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7630331753554502,
|
||
|
|
"grad_norm": 3.2418181238906127,
|
||
|
|
"learning_rate": 7.582938388625593e-06,
|
||
|
|
"loss": 0.2056,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7677725118483413,
|
||
|
|
"grad_norm": 2.216383090131846,
|
||
|
|
"learning_rate": 7.630331753554503e-06,
|
||
|
|
"loss": 0.43,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7725118483412322,
|
||
|
|
"grad_norm": 2.7269338690903986,
|
||
|
|
"learning_rate": 7.677725118483414e-06,
|
||
|
|
"loss": 0.5986,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7772511848341233,
|
||
|
|
"grad_norm": 3.703015103620324,
|
||
|
|
"learning_rate": 7.725118483412322e-06,
|
||
|
|
"loss": 0.2139,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7819905213270142,
|
||
|
|
"grad_norm": 11.509890344708763,
|
||
|
|
"learning_rate": 7.772511848341233e-06,
|
||
|
|
"loss": 0.3832,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7867298578199052,
|
||
|
|
"grad_norm": 13.221389377721215,
|
||
|
|
"learning_rate": 7.819905213270143e-06,
|
||
|
|
"loss": 0.6961,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7914691943127962,
|
||
|
|
"grad_norm": 8.601955890360907,
|
||
|
|
"learning_rate": 7.867298578199053e-06,
|
||
|
|
"loss": 0.7069,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7962085308056872,
|
||
|
|
"grad_norm": 6.464688976078771,
|
||
|
|
"learning_rate": 7.914691943127962e-06,
|
||
|
|
"loss": 0.2874,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8009478672985783,
|
||
|
|
"grad_norm": 4.818205368328611,
|
||
|
|
"learning_rate": 7.962085308056872e-06,
|
||
|
|
"loss": 0.2536,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8056872037914692,
|
||
|
|
"grad_norm": 6.818061320004181,
|
||
|
|
"learning_rate": 8.009478672985783e-06,
|
||
|
|
"loss": 0.8167,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8104265402843602,
|
||
|
|
"grad_norm": 6.814053715737355,
|
||
|
|
"learning_rate": 8.056872037914693e-06,
|
||
|
|
"loss": 0.4523,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8151658767772512,
|
||
|
|
"grad_norm": 3.7622792940282554,
|
||
|
|
"learning_rate": 8.104265402843603e-06,
|
||
|
|
"loss": 0.437,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8199052132701422,
|
||
|
|
"grad_norm": 17.887658231522614,
|
||
|
|
"learning_rate": 8.151658767772512e-06,
|
||
|
|
"loss": 0.5533,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8246445497630331,
|
||
|
|
"grad_norm": 4.154002223531854,
|
||
|
|
"learning_rate": 8.199052132701422e-06,
|
||
|
|
"loss": 0.767,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8293838862559242,
|
||
|
|
"grad_norm": 2.6690075960806445,
|
||
|
|
"learning_rate": 8.246445497630333e-06,
|
||
|
|
"loss": 0.4886,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8341232227488151,
|
||
|
|
"grad_norm": 11.187655316079512,
|
||
|
|
"learning_rate": 8.293838862559243e-06,
|
||
|
|
"loss": 0.5197,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8388625592417062,
|
||
|
|
"grad_norm": 2.3362587429831687,
|
||
|
|
"learning_rate": 8.341232227488152e-06,
|
||
|
|
"loss": 0.8314,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8436018957345972,
|
||
|
|
"grad_norm": 2.8252680205720817,
|
||
|
|
"learning_rate": 8.388625592417062e-06,
|
||
|
|
"loss": 0.4624,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8483412322274881,
|
||
|
|
"grad_norm": 4.528305551354439,
|
||
|
|
"learning_rate": 8.436018957345973e-06,
|
||
|
|
"loss": 0.2562,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8530805687203792,
|
||
|
|
"grad_norm": 4.546641068403436,
|
||
|
|
"learning_rate": 8.483412322274883e-06,
|
||
|
|
"loss": 0.2464,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8578199052132701,
|
||
|
|
"grad_norm": 3.6038044992663334,
|
||
|
|
"learning_rate": 8.530805687203793e-06,
|
||
|
|
"loss": 0.4069,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8625592417061612,
|
||
|
|
"grad_norm": 1.9811719048702106,
|
||
|
|
"learning_rate": 8.578199052132702e-06,
|
||
|
|
"loss": 0.5778,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8672985781990521,
|
||
|
|
"grad_norm": 4.384545934120455,
|
||
|
|
"learning_rate": 8.625592417061612e-06,
|
||
|
|
"loss": 0.7424,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8720379146919431,
|
||
|
|
"grad_norm": 2.224370506740259,
|
||
|
|
"learning_rate": 8.672985781990521e-06,
|
||
|
|
"loss": 0.5563,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8767772511848341,
|
||
|
|
"grad_norm": 2.2438244553088804,
|
||
|
|
"learning_rate": 8.720379146919431e-06,
|
||
|
|
"loss": 0.5606,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8815165876777251,
|
||
|
|
"grad_norm": 2.4153286166112657,
|
||
|
|
"learning_rate": 8.767772511848342e-06,
|
||
|
|
"loss": 0.7797,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8862559241706162,
|
||
|
|
"grad_norm": 4.313982187372051,
|
||
|
|
"learning_rate": 8.815165876777252e-06,
|
||
|
|
"loss": 0.83,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8909952606635071,
|
||
|
|
"grad_norm": 5.2744040798784635,
|
||
|
|
"learning_rate": 8.862559241706162e-06,
|
||
|
|
"loss": 0.5133,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8957345971563981,
|
||
|
|
"grad_norm": 4.35905212043424,
|
||
|
|
"learning_rate": 8.909952606635071e-06,
|
||
|
|
"loss": 0.4265,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9004739336492891,
|
||
|
|
"grad_norm": 6.916403849349734,
|
||
|
|
"learning_rate": 8.957345971563981e-06,
|
||
|
|
"loss": 0.5007,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9052132701421801,
|
||
|
|
"grad_norm": 4.0093681115073325,
|
||
|
|
"learning_rate": 9.004739336492892e-06,
|
||
|
|
"loss": 0.9012,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.909952606635071,
|
||
|
|
"grad_norm": 3.3859156359807496,
|
||
|
|
"learning_rate": 9.052132701421802e-06,
|
||
|
|
"loss": 0.4262,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9146919431279621,
|
||
|
|
"grad_norm": 4.488130094949602,
|
||
|
|
"learning_rate": 9.09952606635071e-06,
|
||
|
|
"loss": 0.5656,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.919431279620853,
|
||
|
|
"grad_norm": 7.17629211137066,
|
||
|
|
"learning_rate": 9.146919431279621e-06,
|
||
|
|
"loss": 0.4695,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9241706161137441,
|
||
|
|
"grad_norm": 2.7309294256882928,
|
||
|
|
"learning_rate": 9.194312796208532e-06,
|
||
|
|
"loss": 0.8346,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9289099526066351,
|
||
|
|
"grad_norm": 5.085916739731668,
|
||
|
|
"learning_rate": 9.241706161137442e-06,
|
||
|
|
"loss": 0.5857,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.933649289099526,
|
||
|
|
"grad_norm": 23.811646540122965,
|
||
|
|
"learning_rate": 9.289099526066352e-06,
|
||
|
|
"loss": 0.7435,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9383886255924171,
|
||
|
|
"grad_norm": 2.4009497142615572,
|
||
|
|
"learning_rate": 9.336492890995261e-06,
|
||
|
|
"loss": 0.5708,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.943127962085308,
|
||
|
|
"grad_norm": 6.581250829580497,
|
||
|
|
"learning_rate": 9.383886255924171e-06,
|
||
|
|
"loss": 0.4402,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9478672985781991,
|
||
|
|
"grad_norm": 2.4308077426776142,
|
||
|
|
"learning_rate": 9.431279620853082e-06,
|
||
|
|
"loss": 0.4197,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9478672985781991,
|
||
|
|
"eval_loss": 0.4012674391269684,
|
||
|
|
"eval_runtime": 7.7289,
|
||
|
|
"eval_samples_per_second": 24.324,
|
||
|
|
"eval_steps_per_second": 6.081,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.95260663507109,
|
||
|
|
"grad_norm": 3.5198078059324027,
|
||
|
|
"learning_rate": 9.478672985781992e-06,
|
||
|
|
"loss": 0.5321,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.957345971563981,
|
||
|
|
"grad_norm": 6.593784858432653,
|
||
|
|
"learning_rate": 9.5260663507109e-06,
|
||
|
|
"loss": 0.5953,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9620853080568721,
|
||
|
|
"grad_norm": 4.212951711248403,
|
||
|
|
"learning_rate": 9.573459715639811e-06,
|
||
|
|
"loss": 0.6857,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.966824644549763,
|
||
|
|
"grad_norm": 4.910386484070207,
|
||
|
|
"learning_rate": 9.620853080568721e-06,
|
||
|
|
"loss": 0.6271,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9715639810426541,
|
||
|
|
"grad_norm": 6.155738225633911,
|
||
|
|
"learning_rate": 9.668246445497632e-06,
|
||
|
|
"loss": 0.601,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.976303317535545,
|
||
|
|
"grad_norm": 3.268371754216749,
|
||
|
|
"learning_rate": 9.715639810426542e-06,
|
||
|
|
"loss": 0.5909,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.981042654028436,
|
||
|
|
"grad_norm": 2.2280729563438784,
|
||
|
|
"learning_rate": 9.76303317535545e-06,
|
||
|
|
"loss": 0.4416,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.985781990521327,
|
||
|
|
"grad_norm": 10.634221783829398,
|
||
|
|
"learning_rate": 9.810426540284361e-06,
|
||
|
|
"loss": 0.446,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.990521327014218,
|
||
|
|
"grad_norm": 1.9752733492895744,
|
||
|
|
"learning_rate": 9.85781990521327e-06,
|
||
|
|
"loss": 0.5698,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.995260663507109,
|
||
|
|
"grad_norm": 4.058938063363919,
|
||
|
|
"learning_rate": 9.905213270142182e-06,
|
||
|
|
"loss": 0.8303,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 2.3966848282479023,
|
||
|
|
"learning_rate": 9.95260663507109e-06,
|
||
|
|
"loss": 0.582,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.004739336492891,
|
||
|
|
"grad_norm": 3.218631141884875,
|
||
|
|
"learning_rate": 1e-05,
|
||
|
|
"loss": 0.2923,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.009478672985782,
|
||
|
|
"grad_norm": 2.5824515879316516,
|
||
|
|
"learning_rate": 9.999993157895144e-06,
|
||
|
|
"loss": 0.4631,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.014218009478673,
|
||
|
|
"grad_norm": 3.6831706895158742,
|
||
|
|
"learning_rate": 9.9999726315993e-06,
|
||
|
|
"loss": 0.1689,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.018957345971564,
|
||
|
|
"grad_norm": 3.3891043448121563,
|
||
|
|
"learning_rate": 9.999938421168647e-06,
|
||
|
|
"loss": 0.5278,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0236966824644549,
|
||
|
|
"grad_norm": 2.356450269389873,
|
||
|
|
"learning_rate": 9.999890526696813e-06,
|
||
|
|
"loss": 0.4907,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.028436018957346,
|
||
|
|
"grad_norm": 2.693113370180676,
|
||
|
|
"learning_rate": 9.999828948314876e-06,
|
||
|
|
"loss": 0.6182,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.033175355450237,
|
||
|
|
"grad_norm": 10.485075200420136,
|
||
|
|
"learning_rate": 9.999753686191369e-06,
|
||
|
|
"loss": 0.1554,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.037914691943128,
|
||
|
|
"grad_norm": 13.319934547783834,
|
||
|
|
"learning_rate": 9.99966474053227e-06,
|
||
|
|
"loss": 0.4698,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.042654028436019,
|
||
|
|
"grad_norm": 2.8867025126626813,
|
||
|
|
"learning_rate": 9.999562111581011e-06,
|
||
|
|
"loss": 0.3821,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.04739336492891,
|
||
|
|
"grad_norm": 10.927946668852911,
|
||
|
|
"learning_rate": 9.99944579961847e-06,
|
||
|
|
"loss": 0.7159,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.052132701421801,
|
||
|
|
"grad_norm": 4.262645462720551,
|
||
|
|
"learning_rate": 9.999315804962974e-06,
|
||
|
|
"loss": 0.7124,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0568720379146919,
|
||
|
|
"grad_norm": 26.424366500171537,
|
||
|
|
"learning_rate": 9.999172127970301e-06,
|
||
|
|
"loss": 0.5217,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.061611374407583,
|
||
|
|
"grad_norm": 3.1119784365221377,
|
||
|
|
"learning_rate": 9.99901476903367e-06,
|
||
|
|
"loss": 0.5714,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.066350710900474,
|
||
|
|
"grad_norm": 5.335575516099637,
|
||
|
|
"learning_rate": 9.998843728583747e-06,
|
||
|
|
"loss": 0.6435,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0710900473933649,
|
||
|
|
"grad_norm": 2.6631526273812023,
|
||
|
|
"learning_rate": 9.998659007088642e-06,
|
||
|
|
"loss": 0.7229,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0758293838862558,
|
||
|
|
"grad_norm": 3.82136179301926,
|
||
|
|
"learning_rate": 9.998460605053911e-06,
|
||
|
|
"loss": 0.7317,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.080568720379147,
|
||
|
|
"grad_norm": 4.481488279741978,
|
||
|
|
"learning_rate": 9.998248523022548e-06,
|
||
|
|
"loss": 0.7125,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.085308056872038,
|
||
|
|
"grad_norm": 38.77713323075712,
|
||
|
|
"learning_rate": 9.998022761574989e-06,
|
||
|
|
"loss": 0.4422,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0900473933649288,
|
||
|
|
"grad_norm": 14.05824520952655,
|
||
|
|
"learning_rate": 9.997783321329104e-06,
|
||
|
|
"loss": 0.3723,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.09478672985782,
|
||
|
|
"grad_norm": 5.808006574074876,
|
||
|
|
"learning_rate": 9.997530202940206e-06,
|
||
|
|
"loss": 0.503,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.099526066350711,
|
||
|
|
"grad_norm": 3.7946635632876635,
|
||
|
|
"learning_rate": 9.997263407101038e-06,
|
||
|
|
"loss": 0.4076,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1042654028436019,
|
||
|
|
"grad_norm": 3.804597829921628,
|
||
|
|
"learning_rate": 9.996982934541781e-06,
|
||
|
|
"loss": 0.6137,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1090047393364928,
|
||
|
|
"grad_norm": 3.086388488116426,
|
||
|
|
"learning_rate": 9.996688786030042e-06,
|
||
|
|
"loss": 0.523,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.113744075829384,
|
||
|
|
"grad_norm": 2.593227456525801,
|
||
|
|
"learning_rate": 9.996380962370859e-06,
|
||
|
|
"loss": 0.7126,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1184834123222749,
|
||
|
|
"grad_norm": 3.0824248661799794,
|
||
|
|
"learning_rate": 9.9960594644067e-06,
|
||
|
|
"loss": 0.5933,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1232227488151658,
|
||
|
|
"grad_norm": 2.401542304407477,
|
||
|
|
"learning_rate": 9.995724293017449e-06,
|
||
|
|
"loss": 0.5244,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1279620853080567,
|
||
|
|
"grad_norm": 17.032209335154683,
|
||
|
|
"learning_rate": 9.995375449120419e-06,
|
||
|
|
"loss": 0.3041,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.132701421800948,
|
||
|
|
"grad_norm": 14.316641226011127,
|
||
|
|
"learning_rate": 9.995012933670341e-06,
|
||
|
|
"loss": 0.3489,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1374407582938388,
|
||
|
|
"grad_norm": 2.4244669733868593,
|
||
|
|
"learning_rate": 9.994636747659363e-06,
|
||
|
|
"loss": 0.5447,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1421800947867298,
|
||
|
|
"grad_norm": 2.8284944731557715,
|
||
|
|
"learning_rate": 9.994246892117046e-06,
|
||
|
|
"loss": 0.359,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.146919431279621,
|
||
|
|
"grad_norm": 4.092835540014769,
|
||
|
|
"learning_rate": 9.993843368110363e-06,
|
||
|
|
"loss": 0.5189,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1516587677725119,
|
||
|
|
"grad_norm": 7.344430749456483,
|
||
|
|
"learning_rate": 9.993426176743695e-06,
|
||
|
|
"loss": 0.6276,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1563981042654028,
|
||
|
|
"grad_norm": 2.4913875975185813,
|
||
|
|
"learning_rate": 9.992995319158832e-06,
|
||
|
|
"loss": 0.4981,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.161137440758294,
|
||
|
|
"grad_norm": 3.8542842696245603,
|
||
|
|
"learning_rate": 9.992550796534957e-06,
|
||
|
|
"loss": 0.5826,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1658767772511849,
|
||
|
|
"grad_norm": 1.5510273419539844,
|
||
|
|
"learning_rate": 9.992092610088664e-06,
|
||
|
|
"loss": 0.1907,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1706161137440758,
|
||
|
|
"grad_norm": 3.266454326874095,
|
||
|
|
"learning_rate": 9.991620761073932e-06,
|
||
|
|
"loss": 0.5856,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1753554502369667,
|
||
|
|
"grad_norm": 2.634621688542773,
|
||
|
|
"learning_rate": 9.991135250782143e-06,
|
||
|
|
"loss": 0.6905,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.180094786729858,
|
||
|
|
"grad_norm": 2.598153013578805,
|
||
|
|
"learning_rate": 9.990636080542056e-06,
|
||
|
|
"loss": 0.619,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1848341232227488,
|
||
|
|
"grad_norm": 2.1773544624317007,
|
||
|
|
"learning_rate": 9.990123251719826e-06,
|
||
|
|
"loss": 0.3558,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1848341232227488,
|
||
|
|
"eval_loss": 0.33629781007766724,
|
||
|
|
"eval_runtime": 7.6017,
|
||
|
|
"eval_samples_per_second": 24.731,
|
||
|
|
"eval_steps_per_second": 6.183,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1895734597156398,
|
||
|
|
"grad_norm": 3.988497582009886,
|
||
|
|
"learning_rate": 9.989596765718981e-06,
|
||
|
|
"loss": 0.4084,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1943127962085307,
|
||
|
|
"grad_norm": 2.4657243130513744,
|
||
|
|
"learning_rate": 9.989056623980431e-06,
|
||
|
|
"loss": 0.5131,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1990521327014219,
|
||
|
|
"grad_norm": 3.535641712422993,
|
||
|
|
"learning_rate": 9.988502827982458e-06,
|
||
|
|
"loss": 0.4954,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2037914691943128,
|
||
|
|
"grad_norm": 1.85755118873854,
|
||
|
|
"learning_rate": 9.987935379240715e-06,
|
||
|
|
"loss": 0.2961,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2085308056872037,
|
||
|
|
"grad_norm": 2.5139993461113845,
|
||
|
|
"learning_rate": 9.98735427930822e-06,
|
||
|
|
"loss": 0.4925,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2132701421800949,
|
||
|
|
"grad_norm": 1.924588366089442,
|
||
|
|
"learning_rate": 9.98675952977535e-06,
|
||
|
|
"loss": 0.5136,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2180094786729858,
|
||
|
|
"grad_norm": 1.7640940730996135,
|
||
|
|
"learning_rate": 9.986151132269843e-06,
|
||
|
|
"loss": 0.3154,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2227488151658767,
|
||
|
|
"grad_norm": 3.2916885300512413,
|
||
|
|
"learning_rate": 9.985529088456783e-06,
|
||
|
|
"loss": 0.5185,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2274881516587677,
|
||
|
|
"grad_norm": 2.9386914924460767,
|
||
|
|
"learning_rate": 9.984893400038608e-06,
|
||
|
|
"loss": 0.6502,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2322274881516588,
|
||
|
|
"grad_norm": 1.8843342232826004,
|
||
|
|
"learning_rate": 9.9842440687551e-06,
|
||
|
|
"loss": 0.4734,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2369668246445498,
|
||
|
|
"grad_norm": 2.7122087340956003,
|
||
|
|
"learning_rate": 9.98358109638337e-06,
|
||
|
|
"loss": 0.6829,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2417061611374407,
|
||
|
|
"grad_norm": 2.2550679323996023,
|
||
|
|
"learning_rate": 9.98290448473787e-06,
|
||
|
|
"loss": 0.4525,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2464454976303316,
|
||
|
|
"grad_norm": 1.9647842800496953,
|
||
|
|
"learning_rate": 9.982214235670383e-06,
|
||
|
|
"loss": 0.2775,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2511848341232228,
|
||
|
|
"grad_norm": 2.9680154031551345,
|
||
|
|
"learning_rate": 9.981510351070008e-06,
|
||
|
|
"loss": 0.3757,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2559241706161137,
|
||
|
|
"grad_norm": 2.957357955761695,
|
||
|
|
"learning_rate": 9.980792832863166e-06,
|
||
|
|
"loss": 0.4655,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2606635071090047,
|
||
|
|
"grad_norm": 2.6409737245596316,
|
||
|
|
"learning_rate": 9.980061683013594e-06,
|
||
|
|
"loss": 0.7199,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2654028436018958,
|
||
|
|
"grad_norm": 2.872009184076046,
|
||
|
|
"learning_rate": 9.979316903522328e-06,
|
||
|
|
"loss": 0.3885,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2701421800947867,
|
||
|
|
"grad_norm": 12.283797018397255,
|
||
|
|
"learning_rate": 9.978558496427718e-06,
|
||
|
|
"loss": 0.448,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2748815165876777,
|
||
|
|
"grad_norm": 5.246080493210068,
|
||
|
|
"learning_rate": 9.977786463805399e-06,
|
||
|
|
"loss": 0.6272,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2796208530805688,
|
||
|
|
"grad_norm": 6.854106097690291,
|
||
|
|
"learning_rate": 9.977000807768306e-06,
|
||
|
|
"loss": 0.2441,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2843601895734598,
|
||
|
|
"grad_norm": 2.4002121073215914,
|
||
|
|
"learning_rate": 9.976201530466656e-06,
|
||
|
|
"loss": 0.5504,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2890995260663507,
|
||
|
|
"grad_norm": 2.250753376506315,
|
||
|
|
"learning_rate": 9.97538863408794e-06,
|
||
|
|
"loss": 0.5053,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2938388625592416,
|
||
|
|
"grad_norm": 2.4676051862347754,
|
||
|
|
"learning_rate": 9.97456212085693e-06,
|
||
|
|
"loss": 0.6538,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2985781990521326,
|
||
|
|
"grad_norm": 1.7931457643079958,
|
||
|
|
"learning_rate": 9.973721993035664e-06,
|
||
|
|
"loss": 0.3308,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3033175355450237,
|
||
|
|
"grad_norm": 8.793350718644787,
|
||
|
|
"learning_rate": 9.972868252923433e-06,
|
||
|
|
"loss": 0.4205,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3080568720379147,
|
||
|
|
"grad_norm": 2.3111808159253497,
|
||
|
|
"learning_rate": 9.972000902856795e-06,
|
||
|
|
"loss": 0.35,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3127962085308056,
|
||
|
|
"grad_norm": 1.9770114541725161,
|
||
|
|
"learning_rate": 9.971119945209548e-06,
|
||
|
|
"loss": 0.181,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3175355450236967,
|
||
|
|
"grad_norm": 3.676033223865303,
|
||
|
|
"learning_rate": 9.970225382392733e-06,
|
||
|
|
"loss": 0.4626,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3222748815165877,
|
||
|
|
"grad_norm": 3.0913472681704914,
|
||
|
|
"learning_rate": 9.969317216854627e-06,
|
||
|
|
"loss": 0.5468,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3270142180094786,
|
||
|
|
"grad_norm": 1.6758883133404263,
|
||
|
|
"learning_rate": 9.968395451080736e-06,
|
||
|
|
"loss": 0.3027,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3317535545023698,
|
||
|
|
"grad_norm": 1.6906328567970788,
|
||
|
|
"learning_rate": 9.967460087593786e-06,
|
||
|
|
"loss": 0.2599,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3364928909952607,
|
||
|
|
"grad_norm": 2.519747919443685,
|
||
|
|
"learning_rate": 9.966511128953723e-06,
|
||
|
|
"loss": 0.3654,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3412322274881516,
|
||
|
|
"grad_norm": 3.0994348743265645,
|
||
|
|
"learning_rate": 9.965548577757691e-06,
|
||
|
|
"loss": 0.5109,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3459715639810428,
|
||
|
|
"grad_norm": 5.19769356707834,
|
||
|
|
"learning_rate": 9.964572436640046e-06,
|
||
|
|
"loss": 0.6598,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3507109004739337,
|
||
|
|
"grad_norm": 2.2151807054914103,
|
||
|
|
"learning_rate": 9.963582708272328e-06,
|
||
|
|
"loss": 0.3225,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3554502369668247,
|
||
|
|
"grad_norm": 4.321591266645481,
|
||
|
|
"learning_rate": 9.96257939536327e-06,
|
||
|
|
"loss": 0.2298,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3601895734597156,
|
||
|
|
"grad_norm": 2.5426417173798423,
|
||
|
|
"learning_rate": 9.961562500658779e-06,
|
||
|
|
"loss": 0.209,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3649289099526065,
|
||
|
|
"grad_norm": 2.641414755534017,
|
||
|
|
"learning_rate": 9.960532026941934e-06,
|
||
|
|
"loss": 0.6695,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3696682464454977,
|
||
|
|
"grad_norm": 2.350576800978289,
|
||
|
|
"learning_rate": 9.959487977032982e-06,
|
||
|
|
"loss": 0.1766,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3744075829383886,
|
||
|
|
"grad_norm": 1.5070760436712913,
|
||
|
|
"learning_rate": 9.958430353789321e-06,
|
||
|
|
"loss": 0.2852,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3791469194312795,
|
||
|
|
"grad_norm": 2.693828234820816,
|
||
|
|
"learning_rate": 9.957359160105497e-06,
|
||
|
|
"loss": 0.7203,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3838862559241707,
|
||
|
|
"grad_norm": 6.638947415556458,
|
||
|
|
"learning_rate": 9.956274398913201e-06,
|
||
|
|
"loss": 0.5427,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3886255924170616,
|
||
|
|
"grad_norm": 2.234860448425406,
|
||
|
|
"learning_rate": 9.95517607318125e-06,
|
||
|
|
"loss": 0.6673,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3933649289099526,
|
||
|
|
"grad_norm": 2.127742473833712,
|
||
|
|
"learning_rate": 9.954064185915589e-06,
|
||
|
|
"loss": 0.3178,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3981042654028437,
|
||
|
|
"grad_norm": 2.2598415378345327,
|
||
|
|
"learning_rate": 9.952938740159278e-06,
|
||
|
|
"loss": 0.6143,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4028436018957346,
|
||
|
|
"grad_norm": 3.5494883982443195,
|
||
|
|
"learning_rate": 9.951799738992484e-06,
|
||
|
|
"loss": 0.6594,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4075829383886256,
|
||
|
|
"grad_norm": 3.9241929941872313,
|
||
|
|
"learning_rate": 9.950647185532473e-06,
|
||
|
|
"loss": 0.5619,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4123222748815165,
|
||
|
|
"grad_norm": 8.034389257470865,
|
||
|
|
"learning_rate": 9.949481082933602e-06,
|
||
|
|
"loss": 0.4057,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4170616113744074,
|
||
|
|
"grad_norm": 5.55376279395039,
|
||
|
|
"learning_rate": 9.948301434387308e-06,
|
||
|
|
"loss": 0.6668,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4218009478672986,
|
||
|
|
"grad_norm": 8.382791791140692,
|
||
|
|
"learning_rate": 9.947108243122107e-06,
|
||
|
|
"loss": 0.6512,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4218009478672986,
|
||
|
|
"eval_loss": 0.3448152244091034,
|
||
|
|
"eval_runtime": 7.5747,
|
||
|
|
"eval_samples_per_second": 24.819,
|
||
|
|
"eval_steps_per_second": 6.205,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4265402843601895,
|
||
|
|
"grad_norm": 2.3895201954800327,
|
||
|
|
"learning_rate": 9.94590151240357e-06,
|
||
|
|
"loss": 0.3551,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4312796208530805,
|
||
|
|
"grad_norm": 21.81484589554373,
|
||
|
|
"learning_rate": 9.944681245534329e-06,
|
||
|
|
"loss": 0.2085,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4360189573459716,
|
||
|
|
"grad_norm": 2.4289020232068603,
|
||
|
|
"learning_rate": 9.943447445854065e-06,
|
||
|
|
"loss": 0.6601,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4407582938388626,
|
||
|
|
"grad_norm": 6.537329090959136,
|
||
|
|
"learning_rate": 9.942200116739488e-06,
|
||
|
|
"loss": 0.4185,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4454976303317535,
|
||
|
|
"grad_norm": 1.9272545240932075,
|
||
|
|
"learning_rate": 9.940939261604344e-06,
|
||
|
|
"loss": 0.3802,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4502369668246446,
|
||
|
|
"grad_norm": 2.0804031032401604,
|
||
|
|
"learning_rate": 9.939664883899394e-06,
|
||
|
|
"loss": 0.479,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4549763033175356,
|
||
|
|
"grad_norm": 1.5623893795033323,
|
||
|
|
"learning_rate": 9.938376987112406e-06,
|
||
|
|
"loss": 0.3465,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4597156398104265,
|
||
|
|
"grad_norm": 2.4971685926961196,
|
||
|
|
"learning_rate": 9.937075574768152e-06,
|
||
|
|
"loss": 0.5371,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4644549763033177,
|
||
|
|
"grad_norm": 1.9465256216683717,
|
||
|
|
"learning_rate": 9.93576065042839e-06,
|
||
|
|
"loss": 0.492,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4691943127962086,
|
||
|
|
"grad_norm": 2.05113471327495,
|
||
|
|
"learning_rate": 9.934432217691862e-06,
|
||
|
|
"loss": 0.5045,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4739336492890995,
|
||
|
|
"grad_norm": 1.9196555117253529,
|
||
|
|
"learning_rate": 9.93309028019428e-06,
|
||
|
|
"loss": 0.3704,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4786729857819905,
|
||
|
|
"grad_norm": 3.682379874191357,
|
||
|
|
"learning_rate": 9.931734841608311e-06,
|
||
|
|
"loss": 0.4535,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4834123222748814,
|
||
|
|
"grad_norm": 3.3519197093172863,
|
||
|
|
"learning_rate": 9.930365905643578e-06,
|
||
|
|
"loss": 0.5185,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4881516587677726,
|
||
|
|
"grad_norm": 2.881253847151378,
|
||
|
|
"learning_rate": 9.928983476046643e-06,
|
||
|
|
"loss": 0.5396,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4928909952606635,
|
||
|
|
"grad_norm": 1.8237235275534953,
|
||
|
|
"learning_rate": 9.927587556600997e-06,
|
||
|
|
"loss": 0.1842,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4976303317535544,
|
||
|
|
"grad_norm": 4.184421241513608,
|
||
|
|
"learning_rate": 9.926178151127049e-06,
|
||
|
|
"loss": 0.5325,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5023696682464456,
|
||
|
|
"grad_norm": 3.9338291238727985,
|
||
|
|
"learning_rate": 9.924755263482121e-06,
|
||
|
|
"loss": 0.2008,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5071090047393365,
|
||
|
|
"grad_norm": 5.51176109511911,
|
||
|
|
"learning_rate": 9.92331889756043e-06,
|
||
|
|
"loss": 0.4532,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5118483412322274,
|
||
|
|
"grad_norm": 2.748622530097229,
|
||
|
|
"learning_rate": 9.921869057293086e-06,
|
||
|
|
"loss": 0.6899,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5165876777251186,
|
||
|
|
"grad_norm": 2.336940895127292,
|
||
|
|
"learning_rate": 9.920405746648067e-06,
|
||
|
|
"loss": 0.4099,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5213270142180095,
|
||
|
|
"grad_norm": 4.063807492448013,
|
||
|
|
"learning_rate": 9.918928969630228e-06,
|
||
|
|
"loss": 0.4569,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5260663507109005,
|
||
|
|
"grad_norm": 2.438274381368647,
|
||
|
|
"learning_rate": 9.917438730281273e-06,
|
||
|
|
"loss": 0.1749,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5308056872037916,
|
||
|
|
"grad_norm": 2.5255975967536015,
|
||
|
|
"learning_rate": 9.91593503267975e-06,
|
||
|
|
"loss": 0.6794,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5355450236966823,
|
||
|
|
"grad_norm": 2.435653867656926,
|
||
|
|
"learning_rate": 9.914417880941043e-06,
|
||
|
|
"loss": 0.6476,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5402843601895735,
|
||
|
|
"grad_norm": 2.3714317752012914,
|
||
|
|
"learning_rate": 9.912887279217356e-06,
|
||
|
|
"loss": 0.4351,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5450236966824644,
|
||
|
|
"grad_norm": 2.3675026644803006,
|
||
|
|
"learning_rate": 9.911343231697703e-06,
|
||
|
|
"loss": 0.2025,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5497630331753554,
|
||
|
|
"grad_norm": 2.6934469311195643,
|
||
|
|
"learning_rate": 9.9097857426079e-06,
|
||
|
|
"loss": 0.4875,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5545023696682465,
|
||
|
|
"grad_norm": 2.382299005054798,
|
||
|
|
"learning_rate": 9.908214816210548e-06,
|
||
|
|
"loss": 0.6983,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5592417061611374,
|
||
|
|
"grad_norm": 2.5391906448996613,
|
||
|
|
"learning_rate": 9.906630456805024e-06,
|
||
|
|
"loss": 0.4902,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5639810426540284,
|
||
|
|
"grad_norm": 5.074284156439468,
|
||
|
|
"learning_rate": 9.905032668727467e-06,
|
||
|
|
"loss": 0.3692,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5687203791469195,
|
||
|
|
"grad_norm": 2.2664917617556255,
|
||
|
|
"learning_rate": 9.903421456350776e-06,
|
||
|
|
"loss": 0.5135,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5734597156398105,
|
||
|
|
"grad_norm": 2.19259108575062,
|
||
|
|
"learning_rate": 9.90179682408458e-06,
|
||
|
|
"loss": 0.4938,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5781990521327014,
|
||
|
|
"grad_norm": 2.5361885097234933,
|
||
|
|
"learning_rate": 9.90015877637524e-06,
|
||
|
|
"loss": 0.5438,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5829383886255926,
|
||
|
|
"grad_norm": 2.178138486962567,
|
||
|
|
"learning_rate": 9.898507317705837e-06,
|
||
|
|
"loss": 0.6188,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5876777251184833,
|
||
|
|
"grad_norm": 1.812913729132945,
|
||
|
|
"learning_rate": 9.896842452596151e-06,
|
||
|
|
"loss": 0.3508,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5924170616113744,
|
||
|
|
"grad_norm": 1.7087086271485783,
|
||
|
|
"learning_rate": 9.895164185602655e-06,
|
||
|
|
"loss": 0.3084,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5971563981042654,
|
||
|
|
"grad_norm": 2.7388427108508053,
|
||
|
|
"learning_rate": 9.893472521318499e-06,
|
||
|
|
"loss": 0.1332,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6018957345971563,
|
||
|
|
"grad_norm": 1.8791204965113866,
|
||
|
|
"learning_rate": 9.891767464373503e-06,
|
||
|
|
"loss": 0.4661,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6066350710900474,
|
||
|
|
"grad_norm": 2.068312034507751,
|
||
|
|
"learning_rate": 9.890049019434135e-06,
|
||
|
|
"loss": 0.5085,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6113744075829384,
|
||
|
|
"grad_norm": 1.799668228890366,
|
||
|
|
"learning_rate": 9.888317191203513e-06,
|
||
|
|
"loss": 0.3711,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6161137440758293,
|
||
|
|
"grad_norm": 1.9724625694968003,
|
||
|
|
"learning_rate": 9.886571984421371e-06,
|
||
|
|
"loss": 0.3308,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6208530805687205,
|
||
|
|
"grad_norm": 0.5548688306542162,
|
||
|
|
"learning_rate": 9.884813403864067e-06,
|
||
|
|
"loss": 0.0027,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6255924170616114,
|
||
|
|
"grad_norm": 1.1667921040825704,
|
||
|
|
"learning_rate": 9.883041454344558e-06,
|
||
|
|
"loss": 0.1846,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6303317535545023,
|
||
|
|
"grad_norm": 2.504222470295781,
|
||
|
|
"learning_rate": 9.881256140712389e-06,
|
||
|
|
"loss": 0.4055,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6350710900473935,
|
||
|
|
"grad_norm": 3.2053743663342202,
|
||
|
|
"learning_rate": 9.879457467853683e-06,
|
||
|
|
"loss": 0.3911,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6398104265402842,
|
||
|
|
"grad_norm": 4.070213617774694,
|
||
|
|
"learning_rate": 9.877645440691122e-06,
|
||
|
|
"loss": 0.3496,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6445497630331753,
|
||
|
|
"grad_norm": 4.238271512703958,
|
||
|
|
"learning_rate": 9.875820064183936e-06,
|
||
|
|
"loss": 0.7347,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6492890995260665,
|
||
|
|
"grad_norm": 5.243679722478133,
|
||
|
|
"learning_rate": 9.873981343327895e-06,
|
||
|
|
"loss": 0.4416,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6540284360189572,
|
||
|
|
"grad_norm": 2.057271614968161,
|
||
|
|
"learning_rate": 9.872129283155287e-06,
|
||
|
|
"loss": 0.4228,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6587677725118484,
|
||
|
|
"grad_norm": 1.9694558162236104,
|
||
|
|
"learning_rate": 9.870263888734905e-06,
|
||
|
|
"loss": 0.4931,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6587677725118484,
|
||
|
|
"eval_loss": 0.2978443503379822,
|
||
|
|
"eval_runtime": 7.6174,
|
||
|
|
"eval_samples_per_second": 24.68,
|
||
|
|
"eval_steps_per_second": 6.17,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6635071090047393,
|
||
|
|
"grad_norm": 3.1554779761803013,
|
||
|
|
"learning_rate": 9.868385165172042e-06,
|
||
|
|
"loss": 0.6458,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6682464454976302,
|
||
|
|
"grad_norm": 6.676783870329319,
|
||
|
|
"learning_rate": 9.866493117608468e-06,
|
||
|
|
"loss": 0.1516,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6729857819905214,
|
||
|
|
"grad_norm": 2.7462701878820575,
|
||
|
|
"learning_rate": 9.864587751222416e-06,
|
||
|
|
"loss": 0.2574,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6777251184834123,
|
||
|
|
"grad_norm": 3.8157454645498454,
|
||
|
|
"learning_rate": 9.862669071228572e-06,
|
||
|
|
"loss": 0.4856,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6824644549763033,
|
||
|
|
"grad_norm": 1.3090823665483136,
|
||
|
|
"learning_rate": 9.860737082878062e-06,
|
||
|
|
"loss": 0.323,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6872037914691944,
|
||
|
|
"grad_norm": 10.494439088857515,
|
||
|
|
"learning_rate": 9.858791791458431e-06,
|
||
|
|
"loss": 0.1988,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6919431279620853,
|
||
|
|
"grad_norm": 2.0191995262815783,
|
||
|
|
"learning_rate": 9.856833202293637e-06,
|
||
|
|
"loss": 0.3119,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6966824644549763,
|
||
|
|
"grad_norm": 3.5323445127912927,
|
||
|
|
"learning_rate": 9.854861320744024e-06,
|
||
|
|
"loss": 0.1471,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7014218009478674,
|
||
|
|
"grad_norm": 4.056087735458044,
|
||
|
|
"learning_rate": 9.852876152206325e-06,
|
||
|
|
"loss": 0.4143,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7061611374407581,
|
||
|
|
"grad_norm": 1.1699046556227357,
|
||
|
|
"learning_rate": 9.85087770211363e-06,
|
||
|
|
"loss": 0.1744,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7109004739336493,
|
||
|
|
"grad_norm": 7.948034395540595,
|
||
|
|
"learning_rate": 9.84886597593538e-06,
|
||
|
|
"loss": 0.583,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7156398104265402,
|
||
|
|
"grad_norm": 1.8387572219605024,
|
||
|
|
"learning_rate": 9.846840979177354e-06,
|
||
|
|
"loss": 0.3403,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7203791469194312,
|
||
|
|
"grad_norm": 2.0300481281041978,
|
||
|
|
"learning_rate": 9.844802717381649e-06,
|
||
|
|
"loss": 0.5911,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7251184834123223,
|
||
|
|
"grad_norm": 2.442593688401152,
|
||
|
|
"learning_rate": 9.842751196126663e-06,
|
||
|
|
"loss": 0.3407,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7298578199052133,
|
||
|
|
"grad_norm": 2.357867457832997,
|
||
|
|
"learning_rate": 9.840686421027085e-06,
|
||
|
|
"loss": 0.6408,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7345971563981042,
|
||
|
|
"grad_norm": 1.4156381529281636,
|
||
|
|
"learning_rate": 9.83860839773388e-06,
|
||
|
|
"loss": 0.3078,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7393364928909953,
|
||
|
|
"grad_norm": 1.4761961207478902,
|
||
|
|
"learning_rate": 9.836517131934267e-06,
|
||
|
|
"loss": 0.3368,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7440758293838863,
|
||
|
|
"grad_norm": 2.233809665949993,
|
||
|
|
"learning_rate": 9.834412629351712e-06,
|
||
|
|
"loss": 0.59,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7488151658767772,
|
||
|
|
"grad_norm": 4.153971174320609,
|
||
|
|
"learning_rate": 9.832294895745906e-06,
|
||
|
|
"loss": 0.6378,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7535545023696684,
|
||
|
|
"grad_norm": 1.8346871985935636,
|
||
|
|
"learning_rate": 9.830163936912752e-06,
|
||
|
|
"loss": 0.2077,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.758293838862559,
|
||
|
|
"grad_norm": 2.185943824706028,
|
||
|
|
"learning_rate": 9.828019758684343e-06,
|
||
|
|
"loss": 0.632,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7630331753554502,
|
||
|
|
"grad_norm": 3.585282208139966,
|
||
|
|
"learning_rate": 9.82586236692896e-06,
|
||
|
|
"loss": 0.414,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7677725118483414,
|
||
|
|
"grad_norm": 2.9683580597033417,
|
||
|
|
"learning_rate": 9.823691767551042e-06,
|
||
|
|
"loss": 0.5511,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.772511848341232,
|
||
|
|
"grad_norm": 1.8760918872517949,
|
||
|
|
"learning_rate": 9.821507966491178e-06,
|
||
|
|
"loss": 0.3139,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7772511848341233,
|
||
|
|
"grad_norm": 2.521809140652447,
|
||
|
|
"learning_rate": 9.819310969726083e-06,
|
||
|
|
"loss": 0.3167,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7819905213270142,
|
||
|
|
"grad_norm": 1.6335080451938315,
|
||
|
|
"learning_rate": 9.817100783268591e-06,
|
||
|
|
"loss": 0.3058,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7867298578199051,
|
||
|
|
"grad_norm": 1.8046828342953851,
|
||
|
|
"learning_rate": 9.814877413167635e-06,
|
||
|
|
"loss": 0.3768,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7914691943127963,
|
||
|
|
"grad_norm": 1.8830999035554479,
|
||
|
|
"learning_rate": 9.812640865508228e-06,
|
||
|
|
"loss": 0.4693,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7962085308056872,
|
||
|
|
"grad_norm": 2.320257885040441,
|
||
|
|
"learning_rate": 9.810391146411445e-06,
|
||
|
|
"loss": 0.6267,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8009478672985781,
|
||
|
|
"grad_norm": 10.80073855446241,
|
||
|
|
"learning_rate": 9.808128262034411e-06,
|
||
|
|
"loss": 0.1824,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8056872037914693,
|
||
|
|
"grad_norm": 8.414676884797586,
|
||
|
|
"learning_rate": 9.805852218570285e-06,
|
||
|
|
"loss": 0.31,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8104265402843602,
|
||
|
|
"grad_norm": 4.325341013078249,
|
||
|
|
"learning_rate": 9.803563022248238e-06,
|
||
|
|
"loss": 0.5953,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8151658767772512,
|
||
|
|
"grad_norm": 2.0110115883060855,
|
||
|
|
"learning_rate": 9.801260679333435e-06,
|
||
|
|
"loss": 0.5959,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8199052132701423,
|
||
|
|
"grad_norm": 6.122877463440686,
|
||
|
|
"learning_rate": 9.79894519612703e-06,
|
||
|
|
"loss": 0.3405,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.824644549763033,
|
||
|
|
"grad_norm": 3.650442506848263,
|
||
|
|
"learning_rate": 9.796616578966133e-06,
|
||
|
|
"loss": 0.3861,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8293838862559242,
|
||
|
|
"grad_norm": 1.5157626892732818,
|
||
|
|
"learning_rate": 9.794274834223797e-06,
|
||
|
|
"loss": 0.2846,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8341232227488151,
|
||
|
|
"grad_norm": 3.1507961365544745,
|
||
|
|
"learning_rate": 9.791919968309014e-06,
|
||
|
|
"loss": 0.3294,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.838862559241706,
|
||
|
|
"grad_norm": 1.7306556573405594,
|
||
|
|
"learning_rate": 9.789551987666676e-06,
|
||
|
|
"loss": 0.3562,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8436018957345972,
|
||
|
|
"grad_norm": 5.941824507077249,
|
||
|
|
"learning_rate": 9.787170898777571e-06,
|
||
|
|
"loss": 0.3274,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8483412322274881,
|
||
|
|
"grad_norm": 3.030732093438433,
|
||
|
|
"learning_rate": 9.784776708158363e-06,
|
||
|
|
"loss": 0.4133,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.853080568720379,
|
||
|
|
"grad_norm": 1.3937150172782848,
|
||
|
|
"learning_rate": 9.782369422361576e-06,
|
||
|
|
"loss": 0.1756,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8578199052132702,
|
||
|
|
"grad_norm": 12.927627074325668,
|
||
|
|
"learning_rate": 9.779949047975568e-06,
|
||
|
|
"loss": 0.361,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8625592417061612,
|
||
|
|
"grad_norm": 4.493223615263428,
|
||
|
|
"learning_rate": 9.777515591624523e-06,
|
||
|
|
"loss": 0.4096,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.867298578199052,
|
||
|
|
"grad_norm": 3.148554545451083,
|
||
|
|
"learning_rate": 9.775069059968426e-06,
|
||
|
|
"loss": 0.3309,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8720379146919433,
|
||
|
|
"grad_norm": 2.3641866669229046,
|
||
|
|
"learning_rate": 9.772609459703046e-06,
|
||
|
|
"loss": 0.3819,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.876777251184834,
|
||
|
|
"grad_norm": 2.2830547145033,
|
||
|
|
"learning_rate": 9.770136797559921e-06,
|
||
|
|
"loss": 0.4664,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8815165876777251,
|
||
|
|
"grad_norm": 1.1790434142717119,
|
||
|
|
"learning_rate": 9.767651080306337e-06,
|
||
|
|
"loss": 0.1274,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8862559241706163,
|
||
|
|
"grad_norm": 3.845230310093734,
|
||
|
|
"learning_rate": 9.76515231474531e-06,
|
||
|
|
"loss": 0.1594,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.890995260663507,
|
||
|
|
"grad_norm": 1.8509410525795489,
|
||
|
|
"learning_rate": 9.762640507715563e-06,
|
||
|
|
"loss": 0.4798,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8957345971563981,
|
||
|
|
"grad_norm": 2.781434794248597,
|
||
|
|
"learning_rate": 9.760115666091518e-06,
|
||
|
|
"loss": 0.1246,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8957345971563981,
|
||
|
|
"eval_loss": 0.2955733835697174,
|
||
|
|
"eval_runtime": 7.4936,
|
||
|
|
"eval_samples_per_second": 25.088,
|
||
|
|
"eval_steps_per_second": 6.272,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.900473933649289,
|
||
|
|
"grad_norm": 2.0822001446365808,
|
||
|
|
"learning_rate": 9.757577796783268e-06,
|
||
|
|
"loss": 0.1796,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.90521327014218,
|
||
|
|
"grad_norm": 1.7683889575687497,
|
||
|
|
"learning_rate": 9.755026906736558e-06,
|
||
|
|
"loss": 0.3831,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9099526066350712,
|
||
|
|
"grad_norm": 2.0251576776654665,
|
||
|
|
"learning_rate": 9.752463002932771e-06,
|
||
|
|
"loss": 0.3582,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.914691943127962,
|
||
|
|
"grad_norm": 1.550976752082876,
|
||
|
|
"learning_rate": 9.749886092388907e-06,
|
||
|
|
"loss": 0.2987,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.919431279620853,
|
||
|
|
"grad_norm": 2.2015043868744852,
|
||
|
|
"learning_rate": 9.747296182157562e-06,
|
||
|
|
"loss": 0.3003,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9241706161137442,
|
||
|
|
"grad_norm": 1.3089765916841185,
|
||
|
|
"learning_rate": 9.744693279326915e-06,
|
||
|
|
"loss": 0.2871,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9289099526066351,
|
||
|
|
"grad_norm": 3.3397852609601415,
|
||
|
|
"learning_rate": 9.742077391020695e-06,
|
||
|
|
"loss": 0.5265,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.933649289099526,
|
||
|
|
"grad_norm": 1.946719052936926,
|
||
|
|
"learning_rate": 9.739448524398176e-06,
|
||
|
|
"loss": 0.5086,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9383886255924172,
|
||
|
|
"grad_norm": 1.5684813908714574,
|
||
|
|
"learning_rate": 9.73680668665415e-06,
|
||
|
|
"loss": 0.2019,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.943127962085308,
|
||
|
|
"grad_norm": 1.7688479660064684,
|
||
|
|
"learning_rate": 9.73415188501891e-06,
|
||
|
|
"loss": 0.4819,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.947867298578199,
|
||
|
|
"grad_norm": 1.4921551637020574,
|
||
|
|
"learning_rate": 9.731484126758231e-06,
|
||
|
|
"loss": 0.3432,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.95260663507109,
|
||
|
|
"grad_norm": 1.6904871673634219,
|
||
|
|
"learning_rate": 9.72880341917334e-06,
|
||
|
|
"loss": 0.3338,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.957345971563981,
|
||
|
|
"grad_norm": 2.231311267323003,
|
||
|
|
"learning_rate": 9.726109769600915e-06,
|
||
|
|
"loss": 0.3727,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.962085308056872,
|
||
|
|
"grad_norm": 2.064721764431227,
|
||
|
|
"learning_rate": 9.72340318541305e-06,
|
||
|
|
"loss": 0.5217,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.966824644549763,
|
||
|
|
"grad_norm": 1.8824517839226762,
|
||
|
|
"learning_rate": 9.720683674017232e-06,
|
||
|
|
"loss": 0.5255,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.971563981042654,
|
||
|
|
"grad_norm": 2.741168518277324,
|
||
|
|
"learning_rate": 9.717951242856338e-06,
|
||
|
|
"loss": 0.574,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9763033175355451,
|
||
|
|
"grad_norm": 0.36867385458926955,
|
||
|
|
"learning_rate": 9.7152058994086e-06,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.981042654028436,
|
||
|
|
"grad_norm": 3.7413445968502455,
|
||
|
|
"learning_rate": 9.712447651187589e-06,
|
||
|
|
"loss": 0.6644,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.985781990521327,
|
||
|
|
"grad_norm": 2.036352574421493,
|
||
|
|
"learning_rate": 9.709676505742194e-06,
|
||
|
|
"loss": 0.508,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9905213270142181,
|
||
|
|
"grad_norm": 2.332710921849896,
|
||
|
|
"learning_rate": 9.706892470656601e-06,
|
||
|
|
"loss": 0.672,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9952606635071088,
|
||
|
|
"grad_norm": 1.1977387854956147,
|
||
|
|
"learning_rate": 9.704095553550277e-06,
|
||
|
|
"loss": 0.3395,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 1.306644738004841,
|
||
|
|
"learning_rate": 9.701285762077938e-06,
|
||
|
|
"loss": 0.2778,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.004739336492891,
|
||
|
|
"grad_norm": 2.6627265449160573,
|
||
|
|
"learning_rate": 9.698463103929542e-06,
|
||
|
|
"loss": 0.4528,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.009478672985782,
|
||
|
|
"grad_norm": 2.099358840952704,
|
||
|
|
"learning_rate": 9.695627586830258e-06,
|
||
|
|
"loss": 0.3879,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.014218009478673,
|
||
|
|
"grad_norm": 5.2879020211730055,
|
||
|
|
"learning_rate": 9.692779218540449e-06,
|
||
|
|
"loss": 0.14,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.018957345971564,
|
||
|
|
"grad_norm": 1.5018740536470672,
|
||
|
|
"learning_rate": 9.689918006855645e-06,
|
||
|
|
"loss": 0.1269,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.023696682464455,
|
||
|
|
"grad_norm": 1.4094211658721043,
|
||
|
|
"learning_rate": 9.687043959606535e-06,
|
||
|
|
"loss": 0.2423,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.028436018957346,
|
||
|
|
"grad_norm": 2.159506476596885,
|
||
|
|
"learning_rate": 9.684157084658929e-06,
|
||
|
|
"loss": 0.2815,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0331753554502368,
|
||
|
|
"grad_norm": 0.8885269405665877,
|
||
|
|
"learning_rate": 9.681257389913747e-06,
|
||
|
|
"loss": 0.1421,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.037914691943128,
|
||
|
|
"grad_norm": 2.0385950670605197,
|
||
|
|
"learning_rate": 9.678344883306997e-06,
|
||
|
|
"loss": 0.3958,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.042654028436019,
|
||
|
|
"grad_norm": 6.205987948434508,
|
||
|
|
"learning_rate": 9.675419572809748e-06,
|
||
|
|
"loss": 0.271,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0473933649289098,
|
||
|
|
"grad_norm": 2.7807782405533352,
|
||
|
|
"learning_rate": 9.672481466428114e-06,
|
||
|
|
"loss": 0.4354,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.052132701421801,
|
||
|
|
"grad_norm": 1.5112315738532895,
|
||
|
|
"learning_rate": 9.669530572203228e-06,
|
||
|
|
"loss": 0.2338,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.056872037914692,
|
||
|
|
"grad_norm": 2.403007718370798,
|
||
|
|
"learning_rate": 9.666566898211219e-06,
|
||
|
|
"loss": 0.1161,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.061611374407583,
|
||
|
|
"grad_norm": 1.342375032653582,
|
||
|
|
"learning_rate": 9.663590452563193e-06,
|
||
|
|
"loss": 0.1454,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.066350710900474,
|
||
|
|
"grad_norm": 3.4913666949654543,
|
||
|
|
"learning_rate": 9.660601243405214e-06,
|
||
|
|
"loss": 0.2455,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.071090047393365,
|
||
|
|
"grad_norm": 2.7479972548361586,
|
||
|
|
"learning_rate": 9.657599278918278e-06,
|
||
|
|
"loss": 0.2492,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.075829383886256,
|
||
|
|
"grad_norm": 1.3732106527900387,
|
||
|
|
"learning_rate": 9.654584567318279e-06,
|
||
|
|
"loss": 0.1943,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.080568720379147,
|
||
|
|
"grad_norm": 1.814725144260375,
|
||
|
|
"learning_rate": 9.651557116856015e-06,
|
||
|
|
"loss": 0.256,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.085308056872038,
|
||
|
|
"grad_norm": 2.2069597546240107,
|
||
|
|
"learning_rate": 9.648516935817133e-06,
|
||
|
|
"loss": 0.3961,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.090047393364929,
|
||
|
|
"grad_norm": 1.5420571195505541,
|
||
|
|
"learning_rate": 9.64546403252213e-06,
|
||
|
|
"loss": 0.251,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.09478672985782,
|
||
|
|
"grad_norm": 1.5476167518379802,
|
||
|
|
"learning_rate": 9.642398415326321e-06,
|
||
|
|
"loss": 0.2603,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0995260663507107,
|
||
|
|
"grad_norm": 2.034199340028726,
|
||
|
|
"learning_rate": 9.639320092619814e-06,
|
||
|
|
"loss": 0.1695,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.104265402843602,
|
||
|
|
"grad_norm": 3.7349542663033612,
|
||
|
|
"learning_rate": 9.636229072827495e-06,
|
||
|
|
"loss": 0.128,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.109004739336493,
|
||
|
|
"grad_norm": 2.821904163178345,
|
||
|
|
"learning_rate": 9.633125364408993e-06,
|
||
|
|
"loss": 0.2114,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1137440758293837,
|
||
|
|
"grad_norm": 2.7924773405486736,
|
||
|
|
"learning_rate": 9.630008975858667e-06,
|
||
|
|
"loss": 0.4738,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.118483412322275,
|
||
|
|
"grad_norm": 3.9989024730276475,
|
||
|
|
"learning_rate": 9.626879915705583e-06,
|
||
|
|
"loss": 0.1967,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.123222748815166,
|
||
|
|
"grad_norm": 2.8673915004345267,
|
||
|
|
"learning_rate": 9.62373819251348e-06,
|
||
|
|
"loss": 0.4892,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1279620853080567,
|
||
|
|
"grad_norm": 3.946682223252352,
|
||
|
|
"learning_rate": 9.620583814880763e-06,
|
||
|
|
"loss": 0.4804,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.132701421800948,
|
||
|
|
"grad_norm": 1.5786772938702343,
|
||
|
|
"learning_rate": 9.617416791440461e-06,
|
||
|
|
"loss": 0.1804,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.132701421800948,
|
||
|
|
"eval_loss": 0.2992797791957855,
|
||
|
|
"eval_runtime": 7.7669,
|
||
|
|
"eval_samples_per_second": 24.205,
|
||
|
|
"eval_steps_per_second": 6.051,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.137440758293839,
|
||
|
|
"grad_norm": 2.165145174031428,
|
||
|
|
"learning_rate": 9.61423713086022e-06,
|
||
|
|
"loss": 0.4228,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1421800947867298,
|
||
|
|
"grad_norm": 1.866067205365788,
|
||
|
|
"learning_rate": 9.611044841842264e-06,
|
||
|
|
"loss": 0.1926,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.146919431279621,
|
||
|
|
"grad_norm": 2.346074490279633,
|
||
|
|
"learning_rate": 9.607839933123387e-06,
|
||
|
|
"loss": 0.4009,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1516587677725116,
|
||
|
|
"grad_norm": 4.0592215780138945,
|
||
|
|
"learning_rate": 9.604622413474916e-06,
|
||
|
|
"loss": 0.3062,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.156398104265403,
|
||
|
|
"grad_norm": 1.5094788846683966,
|
||
|
|
"learning_rate": 9.601392291702693e-06,
|
||
|
|
"loss": 0.253,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.161137440758294,
|
||
|
|
"grad_norm": 2.214809533583907,
|
||
|
|
"learning_rate": 9.598149576647053e-06,
|
||
|
|
"loss": 0.3733,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1658767772511847,
|
||
|
|
"grad_norm": 1.8852412240790106,
|
||
|
|
"learning_rate": 9.594894277182793e-06,
|
||
|
|
"loss": 0.2866,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.170616113744076,
|
||
|
|
"grad_norm": 1.7526684425452752,
|
||
|
|
"learning_rate": 9.591626402219154e-06,
|
||
|
|
"loss": 0.3004,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.175355450236967,
|
||
|
|
"grad_norm": 2.9894508978330783,
|
||
|
|
"learning_rate": 9.588345960699792e-06,
|
||
|
|
"loss": 0.2337,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1800947867298577,
|
||
|
|
"grad_norm": 1.9990589843914348,
|
||
|
|
"learning_rate": 9.585052961602759e-06,
|
||
|
|
"loss": 0.3459,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.184834123222749,
|
||
|
|
"grad_norm": 2.35306488074333,
|
||
|
|
"learning_rate": 9.581747413940472e-06,
|
||
|
|
"loss": 0.1304,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.18957345971564,
|
||
|
|
"grad_norm": 1.777896718380068,
|
||
|
|
"learning_rate": 9.57842932675969e-06,
|
||
|
|
"loss": 0.3115,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1943127962085307,
|
||
|
|
"grad_norm": 1.7487858526836815,
|
||
|
|
"learning_rate": 9.575098709141496e-06,
|
||
|
|
"loss": 0.2286,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.199052132701422,
|
||
|
|
"grad_norm": 2.3258340527853703,
|
||
|
|
"learning_rate": 9.571755570201266e-06,
|
||
|
|
"loss": 0.3659,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2037914691943126,
|
||
|
|
"grad_norm": 1.465155899360022,
|
||
|
|
"learning_rate": 9.56839991908864e-06,
|
||
|
|
"loss": 0.1962,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2085308056872037,
|
||
|
|
"grad_norm": 1.4928089369177426,
|
||
|
|
"learning_rate": 9.565031764987502e-06,
|
||
|
|
"loss": 0.1699,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.213270142180095,
|
||
|
|
"grad_norm": 1.785576743274789,
|
||
|
|
"learning_rate": 9.561651117115962e-06,
|
||
|
|
"loss": 0.2761,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2180094786729856,
|
||
|
|
"grad_norm": 2.2531836622374337,
|
||
|
|
"learning_rate": 9.558257984726319e-06,
|
||
|
|
"loss": 0.3987,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2227488151658767,
|
||
|
|
"grad_norm": 2.315364186256386,
|
||
|
|
"learning_rate": 9.554852377105036e-06,
|
||
|
|
"loss": 0.3683,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.227488151658768,
|
||
|
|
"grad_norm": 1.6764194598679394,
|
||
|
|
"learning_rate": 9.551434303572725e-06,
|
||
|
|
"loss": 0.1712,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2322274881516586,
|
||
|
|
"grad_norm": 1.7843409934441365,
|
||
|
|
"learning_rate": 9.548003773484115e-06,
|
||
|
|
"loss": 0.3081,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2369668246445498,
|
||
|
|
"grad_norm": 2.3510258117985505,
|
||
|
|
"learning_rate": 9.544560796228022e-06,
|
||
|
|
"loss": 0.175,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.241706161137441,
|
||
|
|
"grad_norm": 2.014710977925563,
|
||
|
|
"learning_rate": 9.54110538122733e-06,
|
||
|
|
"loss": 0.332,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2464454976303316,
|
||
|
|
"grad_norm": 1.500227249636288,
|
||
|
|
"learning_rate": 9.537637537938966e-06,
|
||
|
|
"loss": 0.1706,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.251184834123223,
|
||
|
|
"grad_norm": 5.326923875102078,
|
||
|
|
"learning_rate": 9.534157275853869e-06,
|
||
|
|
"loss": 0.3117,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2559241706161135,
|
||
|
|
"grad_norm": 2.395469195860342,
|
||
|
|
"learning_rate": 9.530664604496964e-06,
|
||
|
|
"loss": 0.4078,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2606635071090047,
|
||
|
|
"grad_norm": 1.783834474018257,
|
||
|
|
"learning_rate": 9.527159533427142e-06,
|
||
|
|
"loss": 0.3021,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.265402843601896,
|
||
|
|
"grad_norm": 2.424152235039787,
|
||
|
|
"learning_rate": 9.52364207223723e-06,
|
||
|
|
"loss": 0.2725,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.270142180094787,
|
||
|
|
"grad_norm": 1.8235299600725425,
|
||
|
|
"learning_rate": 9.520112230553959e-06,
|
||
|
|
"loss": 0.2643,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2748815165876777,
|
||
|
|
"grad_norm": 2.051524555671329,
|
||
|
|
"learning_rate": 9.51657001803795e-06,
|
||
|
|
"loss": 0.0735,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.279620853080569,
|
||
|
|
"grad_norm": 2.3488000246022755,
|
||
|
|
"learning_rate": 9.513015444383682e-06,
|
||
|
|
"loss": 0.4467,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2843601895734595,
|
||
|
|
"grad_norm": 2.1217146626056556,
|
||
|
|
"learning_rate": 9.509448519319455e-06,
|
||
|
|
"loss": 0.364,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2890995260663507,
|
||
|
|
"grad_norm": 1.6891403485666334,
|
||
|
|
"learning_rate": 9.505869252607385e-06,
|
||
|
|
"loss": 0.2422,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.293838862559242,
|
||
|
|
"grad_norm": 1.8691421689797754,
|
||
|
|
"learning_rate": 9.502277654043355e-06,
|
||
|
|
"loss": 0.1678,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2985781990521326,
|
||
|
|
"grad_norm": 2.844637407365249,
|
||
|
|
"learning_rate": 9.498673733457007e-06,
|
||
|
|
"loss": 0.2028,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3033175355450237,
|
||
|
|
"grad_norm": 4.691622791467041,
|
||
|
|
"learning_rate": 9.495057500711698e-06,
|
||
|
|
"loss": 0.3995,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.308056872037915,
|
||
|
|
"grad_norm": 4.110449766395664,
|
||
|
|
"learning_rate": 9.491428965704486e-06,
|
||
|
|
"loss": 0.2655,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3127962085308056,
|
||
|
|
"grad_norm": 3.181980318070533,
|
||
|
|
"learning_rate": 9.487788138366098e-06,
|
||
|
|
"loss": 0.2057,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3175355450236967,
|
||
|
|
"grad_norm": 1.8751228151087935,
|
||
|
|
"learning_rate": 9.484135028660905e-06,
|
||
|
|
"loss": 0.3028,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.322274881516588,
|
||
|
|
"grad_norm": 1.5661514970100012,
|
||
|
|
"learning_rate": 9.480469646586888e-06,
|
||
|
|
"loss": 0.1934,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3270142180094786,
|
||
|
|
"grad_norm": 2.1943914786986927,
|
||
|
|
"learning_rate": 9.476792002175621e-06,
|
||
|
|
"loss": 0.2269,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3317535545023698,
|
||
|
|
"grad_norm": 2.264533107109475,
|
||
|
|
"learning_rate": 9.473102105492234e-06,
|
||
|
|
"loss": 0.4183,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3364928909952605,
|
||
|
|
"grad_norm": 2.7234498956935007,
|
||
|
|
"learning_rate": 9.469399966635392e-06,
|
||
|
|
"loss": 0.2831,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3412322274881516,
|
||
|
|
"grad_norm": 1.8580160075814238,
|
||
|
|
"learning_rate": 9.465685595737263e-06,
|
||
|
|
"loss": 0.2907,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.345971563981043,
|
||
|
|
"grad_norm": 1.7716495700178916,
|
||
|
|
"learning_rate": 9.461959002963492e-06,
|
||
|
|
"loss": 0.3222,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3507109004739335,
|
||
|
|
"grad_norm": 2.1649991541967655,
|
||
|
|
"learning_rate": 9.458220198513178e-06,
|
||
|
|
"loss": 0.3767,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3554502369668247,
|
||
|
|
"grad_norm": 2.2141707421354204,
|
||
|
|
"learning_rate": 9.454469192618834e-06,
|
||
|
|
"loss": 0.4129,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.360189573459716,
|
||
|
|
"grad_norm": 1.1394313965651783,
|
||
|
|
"learning_rate": 9.45070599554637e-06,
|
||
|
|
"loss": 0.0941,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3649289099526065,
|
||
|
|
"grad_norm": 1.5645808841620317,
|
||
|
|
"learning_rate": 9.446930617595066e-06,
|
||
|
|
"loss": 0.2175,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3696682464454977,
|
||
|
|
"grad_norm": 0.9848357485987885,
|
||
|
|
"learning_rate": 9.443143069097531e-06,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3696682464454977,
|
||
|
|
"eval_loss": 0.27932682633399963,
|
||
|
|
"eval_runtime": 7.8489,
|
||
|
|
"eval_samples_per_second": 23.952,
|
||
|
|
"eval_steps_per_second": 5.988,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.374407582938389,
|
||
|
|
"grad_norm": 1.463106965895241,
|
||
|
|
"learning_rate": 9.439343360419689e-06,
|
||
|
|
"loss": 0.1835,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3791469194312795,
|
||
|
|
"grad_norm": 0.9453677984452025,
|
||
|
|
"learning_rate": 9.43553150196074e-06,
|
||
|
|
"loss": 0.0948,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3838862559241707,
|
||
|
|
"grad_norm": 2.137434931225527,
|
||
|
|
"learning_rate": 9.431707504153138e-06,
|
||
|
|
"loss": 0.3164,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3886255924170614,
|
||
|
|
"grad_norm": 2.641718579279774,
|
||
|
|
"learning_rate": 9.427871377462561e-06,
|
||
|
|
"loss": 0.3065,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3933649289099526,
|
||
|
|
"grad_norm": 3.1991815401861565,
|
||
|
|
"learning_rate": 9.424023132387883e-06,
|
||
|
|
"loss": 0.2931,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3981042654028437,
|
||
|
|
"grad_norm": 2.0386672451223515,
|
||
|
|
"learning_rate": 9.420162779461142e-06,
|
||
|
|
"loss": 0.2671,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4028436018957344,
|
||
|
|
"grad_norm": 2.5967954379583396,
|
||
|
|
"learning_rate": 9.416290329247513e-06,
|
||
|
|
"loss": 0.0696,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4075829383886256,
|
||
|
|
"grad_norm": 1.6357351836128409,
|
||
|
|
"learning_rate": 9.412405792345278e-06,
|
||
|
|
"loss": 0.2182,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4123222748815167,
|
||
|
|
"grad_norm": 1.0931361872228085,
|
||
|
|
"learning_rate": 9.408509179385806e-06,
|
||
|
|
"loss": 0.0861,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4170616113744074,
|
||
|
|
"grad_norm": 1.8664597671910188,
|
||
|
|
"learning_rate": 9.404600501033505e-06,
|
||
|
|
"loss": 0.1282,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4218009478672986,
|
||
|
|
"grad_norm": 1.4441234779633076,
|
||
|
|
"learning_rate": 9.400679767985814e-06,
|
||
|
|
"loss": 0.2087,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4265402843601898,
|
||
|
|
"grad_norm": 1.435779267833689,
|
||
|
|
"learning_rate": 9.39674699097316e-06,
|
||
|
|
"loss": 0.1959,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4312796208530805,
|
||
|
|
"grad_norm": 1.843464890515756,
|
||
|
|
"learning_rate": 9.392802180758926e-06,
|
||
|
|
"loss": 0.2495,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4360189573459716,
|
||
|
|
"grad_norm": 1.5674475560557937,
|
||
|
|
"learning_rate": 9.38884534813944e-06,
|
||
|
|
"loss": 0.1966,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4407582938388623,
|
||
|
|
"grad_norm": 1.787304736128972,
|
||
|
|
"learning_rate": 9.384876503943929e-06,
|
||
|
|
"loss": 0.2913,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4454976303317535,
|
||
|
|
"grad_norm": 2.297410213230877,
|
||
|
|
"learning_rate": 9.380895659034486e-06,
|
||
|
|
"loss": 0.2654,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4502369668246446,
|
||
|
|
"grad_norm": 2.282985726906157,
|
||
|
|
"learning_rate": 9.376902824306058e-06,
|
||
|
|
"loss": 0.3368,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4549763033175354,
|
||
|
|
"grad_norm": 1.8687545300392534,
|
||
|
|
"learning_rate": 9.3728980106864e-06,
|
||
|
|
"loss": 0.244,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4597156398104265,
|
||
|
|
"grad_norm": 1.3880182815684456,
|
||
|
|
"learning_rate": 9.368881229136057e-06,
|
||
|
|
"loss": 0.1899,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4644549763033177,
|
||
|
|
"grad_norm": 1.696370706635656,
|
||
|
|
"learning_rate": 9.364852490648327e-06,
|
||
|
|
"loss": 0.2128,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4691943127962084,
|
||
|
|
"grad_norm": 2.292613729117954,
|
||
|
|
"learning_rate": 9.360811806249224e-06,
|
||
|
|
"loss": 0.1888,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4739336492890995,
|
||
|
|
"grad_norm": 9.78708000296614,
|
||
|
|
"learning_rate": 9.356759186997466e-06,
|
||
|
|
"loss": 0.3178,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4786729857819907,
|
||
|
|
"grad_norm": 2.0045275397991293,
|
||
|
|
"learning_rate": 9.352694643984433e-06,
|
||
|
|
"loss": 0.3639,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4834123222748814,
|
||
|
|
"grad_norm": 3.742887030355059,
|
||
|
|
"learning_rate": 9.348618188334135e-06,
|
||
|
|
"loss": 0.2554,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4881516587677726,
|
||
|
|
"grad_norm": 1.94272441768136,
|
||
|
|
"learning_rate": 9.344529831203187e-06,
|
||
|
|
"loss": 0.3038,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4928909952606633,
|
||
|
|
"grad_norm": 5.088624688688455,
|
||
|
|
"learning_rate": 9.340429583780774e-06,
|
||
|
|
"loss": 0.2156,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4976303317535544,
|
||
|
|
"grad_norm": 2.6529254956977706,
|
||
|
|
"learning_rate": 9.33631745728863e-06,
|
||
|
|
"loss": 0.2925,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5023696682464456,
|
||
|
|
"grad_norm": 0.8557408245301991,
|
||
|
|
"learning_rate": 9.33219346298099e-06,
|
||
|
|
"loss": 0.1333,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5071090047393367,
|
||
|
|
"grad_norm": 2.068141298388673,
|
||
|
|
"learning_rate": 9.32805761214458e-06,
|
||
|
|
"loss": 0.4115,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5118483412322274,
|
||
|
|
"grad_norm": 2.144402530171497,
|
||
|
|
"learning_rate": 9.323909916098566e-06,
|
||
|
|
"loss": 0.3903,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5165876777251186,
|
||
|
|
"grad_norm": 2.0004594631335935,
|
||
|
|
"learning_rate": 9.319750386194537e-06,
|
||
|
|
"loss": 0.1363,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5213270142180093,
|
||
|
|
"grad_norm": 4.035239471522499,
|
||
|
|
"learning_rate": 9.315579033816471e-06,
|
||
|
|
"loss": 0.2661,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5260663507109005,
|
||
|
|
"grad_norm": 1.534201372487289,
|
||
|
|
"learning_rate": 9.311395870380699e-06,
|
||
|
|
"loss": 0.254,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5308056872037916,
|
||
|
|
"grad_norm": 2.080407165105763,
|
||
|
|
"learning_rate": 9.307200907335875e-06,
|
||
|
|
"loss": 0.3705,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5355450236966823,
|
||
|
|
"grad_norm": 1.502966463119228,
|
||
|
|
"learning_rate": 9.302994156162957e-06,
|
||
|
|
"loss": 0.1576,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5402843601895735,
|
||
|
|
"grad_norm": 2.210241514895105,
|
||
|
|
"learning_rate": 9.29877562837515e-06,
|
||
|
|
"loss": 0.2451,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.545023696682464,
|
||
|
|
"grad_norm": 8.781880069783684,
|
||
|
|
"learning_rate": 9.294545335517904e-06,
|
||
|
|
"loss": 0.2259,
|
||
|
|
"step": 537
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5497630331753554,
|
||
|
|
"grad_norm": 1.2217030042276933,
|
||
|
|
"learning_rate": 9.290303289168859e-06,
|
||
|
|
"loss": 0.1483,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5545023696682465,
|
||
|
|
"grad_norm": 2.3742534672201043,
|
||
|
|
"learning_rate": 9.286049500937826e-06,
|
||
|
|
"loss": 0.3232,
|
||
|
|
"step": 539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5592417061611377,
|
||
|
|
"grad_norm": 2.4723750071994988,
|
||
|
|
"learning_rate": 9.28178398246675e-06,
|
||
|
|
"loss": 0.2007,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5639810426540284,
|
||
|
|
"grad_norm": 2.0869954207453425,
|
||
|
|
"learning_rate": 9.277506745429684e-06,
|
||
|
|
"loss": 0.3059,
|
||
|
|
"step": 541
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5687203791469195,
|
||
|
|
"grad_norm": 0.9781642431980271,
|
||
|
|
"learning_rate": 9.273217801532744e-06,
|
||
|
|
"loss": 0.0759,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5734597156398102,
|
||
|
|
"grad_norm": 1.7701299344624615,
|
||
|
|
"learning_rate": 9.268917162514098e-06,
|
||
|
|
"loss": 0.1652,
|
||
|
|
"step": 543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5781990521327014,
|
||
|
|
"grad_norm": 1.3356330447131743,
|
||
|
|
"learning_rate": 9.26460484014391e-06,
|
||
|
|
"loss": 0.1213,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5829383886255926,
|
||
|
|
"grad_norm": 2.4644062333165313,
|
||
|
|
"learning_rate": 9.260280846224328e-06,
|
||
|
|
"loss": 0.4874,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5876777251184833,
|
||
|
|
"grad_norm": 2.062997126984779,
|
||
|
|
"learning_rate": 9.25594519258944e-06,
|
||
|
|
"loss": 0.3558,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5924170616113744,
|
||
|
|
"grad_norm": 1.5375755186462028,
|
||
|
|
"learning_rate": 9.251597891105242e-06,
|
||
|
|
"loss": 0.2059,
|
||
|
|
"step": 547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.597156398104265,
|
||
|
|
"grad_norm": 1.1624701008662408,
|
||
|
|
"learning_rate": 9.247238953669612e-06,
|
||
|
|
"loss": 0.0804,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6018957345971563,
|
||
|
|
"grad_norm": 0.09001542056443658,
|
||
|
|
"learning_rate": 9.242868392212277e-06,
|
||
|
|
"loss": 0.0009,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6066350710900474,
|
||
|
|
"grad_norm": 1.1436516887675665,
|
||
|
|
"learning_rate": 9.238486218694767e-06,
|
||
|
|
"loss": 0.1028,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6066350710900474,
|
||
|
|
"eval_loss": 0.26912641525268555,
|
||
|
|
"eval_runtime": 7.6787,
|
||
|
|
"eval_samples_per_second": 24.483,
|
||
|
|
"eval_steps_per_second": 6.121,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6113744075829386,
|
||
|
|
"grad_norm": 2.133961184910742,
|
||
|
|
"learning_rate": 9.234092445110401e-06,
|
||
|
|
"loss": 0.3103,
|
||
|
|
"step": 551
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6161137440758293,
|
||
|
|
"grad_norm": 2.072404781682465,
|
||
|
|
"learning_rate": 9.229687083484242e-06,
|
||
|
|
"loss": 0.4379,
|
||
|
|
"step": 552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6208530805687205,
|
||
|
|
"grad_norm": 2.2020800312518833,
|
||
|
|
"learning_rate": 9.225270145873069e-06,
|
||
|
|
"loss": 0.5363,
|
||
|
|
"step": 553
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.625592417061611,
|
||
|
|
"grad_norm": 2.0457818451797647,
|
||
|
|
"learning_rate": 9.220841644365343e-06,
|
||
|
|
"loss": 0.361,
|
||
|
|
"step": 554
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6303317535545023,
|
||
|
|
"grad_norm": 11.521273969858171,
|
||
|
|
"learning_rate": 9.216401591081173e-06,
|
||
|
|
"loss": 0.2696,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6350710900473935,
|
||
|
|
"grad_norm": 1.5840576742667678,
|
||
|
|
"learning_rate": 9.21194999817228e-06,
|
||
|
|
"loss": 0.2055,
|
||
|
|
"step": 556
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.639810426540284,
|
||
|
|
"grad_norm": 2.169655448274581,
|
||
|
|
"learning_rate": 9.207486877821971e-06,
|
||
|
|
"loss": 0.404,
|
||
|
|
"step": 557
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6445497630331753,
|
||
|
|
"grad_norm": 2.0410487060700704,
|
||
|
|
"learning_rate": 9.203012242245103e-06,
|
||
|
|
"loss": 0.3766,
|
||
|
|
"step": 558
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6492890995260665,
|
||
|
|
"grad_norm": 2.726533008767557,
|
||
|
|
"learning_rate": 9.198526103688045e-06,
|
||
|
|
"loss": 0.2197,
|
||
|
|
"step": 559
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.654028436018957,
|
||
|
|
"grad_norm": 2.163218749938081,
|
||
|
|
"learning_rate": 9.194028474428651e-06,
|
||
|
|
"loss": 0.3141,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6587677725118484,
|
||
|
|
"grad_norm": 7.123794048200211,
|
||
|
|
"learning_rate": 9.189519366776218e-06,
|
||
|
|
"loss": 0.2282,
|
||
|
|
"step": 561
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6635071090047395,
|
||
|
|
"grad_norm": 4.431231299759084,
|
||
|
|
"learning_rate": 9.184998793071465e-06,
|
||
|
|
"loss": 0.2181,
|
||
|
|
"step": 562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6682464454976302,
|
||
|
|
"grad_norm": 1.749031335601768,
|
||
|
|
"learning_rate": 9.180466765686485e-06,
|
||
|
|
"loss": 0.328,
|
||
|
|
"step": 563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6729857819905214,
|
||
|
|
"grad_norm": 3.7503874156697377,
|
||
|
|
"learning_rate": 9.17592329702472e-06,
|
||
|
|
"loss": 0.201,
|
||
|
|
"step": 564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.677725118483412,
|
||
|
|
"grad_norm": 2.1106719929114606,
|
||
|
|
"learning_rate": 9.171368399520925e-06,
|
||
|
|
"loss": 0.3745,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6824644549763033,
|
||
|
|
"grad_norm": 2.2680399854307867,
|
||
|
|
"learning_rate": 9.16680208564114e-06,
|
||
|
|
"loss": 0.4675,
|
||
|
|
"step": 566
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6872037914691944,
|
||
|
|
"grad_norm": 3.0991891546864716,
|
||
|
|
"learning_rate": 9.162224367882639e-06,
|
||
|
|
"loss": 0.2777,
|
||
|
|
"step": 567
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6919431279620856,
|
||
|
|
"grad_norm": 3.4744764688143057,
|
||
|
|
"learning_rate": 9.157635258773915e-06,
|
||
|
|
"loss": 0.3598,
|
||
|
|
"step": 568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6966824644549763,
|
||
|
|
"grad_norm": 3.322846604693465,
|
||
|
|
"learning_rate": 9.15303477087463e-06,
|
||
|
|
"loss": 0.34,
|
||
|
|
"step": 569
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7014218009478674,
|
||
|
|
"grad_norm": 1.4990976721598193,
|
||
|
|
"learning_rate": 9.148422916775596e-06,
|
||
|
|
"loss": 0.1349,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.706161137440758,
|
||
|
|
"grad_norm": 1.457005855011899,
|
||
|
|
"learning_rate": 9.143799709098729e-06,
|
||
|
|
"loss": 0.1827,
|
||
|
|
"step": 571
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7109004739336493,
|
||
|
|
"grad_norm": 3.0061726721844053,
|
||
|
|
"learning_rate": 9.139165160497017e-06,
|
||
|
|
"loss": 0.3803,
|
||
|
|
"step": 572
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7156398104265405,
|
||
|
|
"grad_norm": 1.2933432816063761,
|
||
|
|
"learning_rate": 9.134519283654484e-06,
|
||
|
|
"loss": 0.163,
|
||
|
|
"step": 573
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.720379146919431,
|
||
|
|
"grad_norm": 2.04160773743751,
|
||
|
|
"learning_rate": 9.129862091286165e-06,
|
||
|
|
"loss": 0.3507,
|
||
|
|
"step": 574
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7251184834123223,
|
||
|
|
"grad_norm": 1.7075125314751585,
|
||
|
|
"learning_rate": 9.125193596138057e-06,
|
||
|
|
"loss": 0.2775,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.729857819905213,
|
||
|
|
"grad_norm": 2.0702195565525376,
|
||
|
|
"learning_rate": 9.120513810987095e-06,
|
||
|
|
"loss": 0.4498,
|
||
|
|
"step": 576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.734597156398104,
|
||
|
|
"grad_norm": 2.1490772482386538,
|
||
|
|
"learning_rate": 9.115822748641109e-06,
|
||
|
|
"loss": 0.4318,
|
||
|
|
"step": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7393364928909953,
|
||
|
|
"grad_norm": 1.6911398901399057,
|
||
|
|
"learning_rate": 9.111120421938796e-06,
|
||
|
|
"loss": 0.2106,
|
||
|
|
"step": 578
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7440758293838865,
|
||
|
|
"grad_norm": 1.4174417301169415,
|
||
|
|
"learning_rate": 9.106406843749683e-06,
|
||
|
|
"loss": 0.203,
|
||
|
|
"step": 579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.748815165876777,
|
||
|
|
"grad_norm": 1.6631284624013194,
|
||
|
|
"learning_rate": 9.101682026974086e-06,
|
||
|
|
"loss": 0.2728,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7535545023696684,
|
||
|
|
"grad_norm": 3.139731671637309,
|
||
|
|
"learning_rate": 9.096945984543082e-06,
|
||
|
|
"loss": 0.3951,
|
||
|
|
"step": 581
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.758293838862559,
|
||
|
|
"grad_norm": 1.8737860217419084,
|
||
|
|
"learning_rate": 9.09219872941847e-06,
|
||
|
|
"loss": 0.2615,
|
||
|
|
"step": 582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7630331753554502,
|
||
|
|
"grad_norm": 2.714671958141015,
|
||
|
|
"learning_rate": 9.08744027459274e-06,
|
||
|
|
"loss": 0.2639,
|
||
|
|
"step": 583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7677725118483414,
|
||
|
|
"grad_norm": 2.1847837467103957,
|
||
|
|
"learning_rate": 9.082670633089028e-06,
|
||
|
|
"loss": 0.4095,
|
||
|
|
"step": 584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.772511848341232,
|
||
|
|
"grad_norm": 1.3923844550111264,
|
||
|
|
"learning_rate": 9.077889817961089e-06,
|
||
|
|
"loss": 0.191,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7772511848341233,
|
||
|
|
"grad_norm": 1.5983034252351458,
|
||
|
|
"learning_rate": 9.07309784229326e-06,
|
||
|
|
"loss": 0.2239,
|
||
|
|
"step": 586
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.781990521327014,
|
||
|
|
"grad_norm": 1.44827964004181,
|
||
|
|
"learning_rate": 9.068294719200422e-06,
|
||
|
|
"loss": 0.1246,
|
||
|
|
"step": 587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.786729857819905,
|
||
|
|
"grad_norm": 1.6251016973740264,
|
||
|
|
"learning_rate": 9.063480461827958e-06,
|
||
|
|
"loss": 0.3035,
|
||
|
|
"step": 588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7914691943127963,
|
||
|
|
"grad_norm": 2.52281647071451,
|
||
|
|
"learning_rate": 9.058655083351736e-06,
|
||
|
|
"loss": 0.2279,
|
||
|
|
"step": 589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7962085308056874,
|
||
|
|
"grad_norm": 2.0494578495916147,
|
||
|
|
"learning_rate": 9.053818596978051e-06,
|
||
|
|
"loss": 0.1287,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.800947867298578,
|
||
|
|
"grad_norm": 1.3704845634766936,
|
||
|
|
"learning_rate": 9.0489710159436e-06,
|
||
|
|
"loss": 0.1626,
|
||
|
|
"step": 591
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8056872037914693,
|
||
|
|
"grad_norm": 2.793908843701991,
|
||
|
|
"learning_rate": 9.044112353515451e-06,
|
||
|
|
"loss": 0.1489,
|
||
|
|
"step": 592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.81042654028436,
|
||
|
|
"grad_norm": 1.2485798376541588,
|
||
|
|
"learning_rate": 9.039242622990991e-06,
|
||
|
|
"loss": 0.0234,
|
||
|
|
"step": 593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.815165876777251,
|
||
|
|
"grad_norm": 1.450764852010275,
|
||
|
|
"learning_rate": 9.034361837697905e-06,
|
||
|
|
"loss": 0.1147,
|
||
|
|
"step": 594
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8199052132701423,
|
||
|
|
"grad_norm": 2.0316327906965688,
|
||
|
|
"learning_rate": 9.029470010994129e-06,
|
||
|
|
"loss": 0.3353,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.824644549763033,
|
||
|
|
"grad_norm": 1.9981459231385812,
|
||
|
|
"learning_rate": 9.02456715626782e-06,
|
||
|
|
"loss": 0.2966,
|
||
|
|
"step": 596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.829383886255924,
|
||
|
|
"grad_norm": 3.7732602269541897,
|
||
|
|
"learning_rate": 9.01965328693732e-06,
|
||
|
|
"loss": 0.1657,
|
||
|
|
"step": 597
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.834123222748815,
|
||
|
|
"grad_norm": 1.3140375816715553,
|
||
|
|
"learning_rate": 9.014728416451108e-06,
|
||
|
|
"loss": 0.1341,
|
||
|
|
"step": 598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.838862559241706,
|
||
|
|
"grad_norm": 4.055146109244563,
|
||
|
|
"learning_rate": 9.009792558287777e-06,
|
||
|
|
"loss": 0.0918,
|
||
|
|
"step": 599
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.843601895734597,
|
||
|
|
"grad_norm": 1.5987581638887849,
|
||
|
|
"learning_rate": 9.004845725955993e-06,
|
||
|
|
"loss": 0.1849,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.843601895734597,
|
||
|
|
"eval_loss": 0.2646949291229248,
|
||
|
|
"eval_runtime": 7.7562,
|
||
|
|
"eval_samples_per_second": 24.239,
|
||
|
|
"eval_steps_per_second": 6.06,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8483412322274884,
|
||
|
|
"grad_norm": 1.4311870136077258,
|
||
|
|
"learning_rate": 8.999887932994451e-06,
|
||
|
|
"loss": 0.2196,
|
||
|
|
"step": 601
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.853080568720379,
|
||
|
|
"grad_norm": 1.5536357323703605,
|
||
|
|
"learning_rate": 8.994919192971849e-06,
|
||
|
|
"loss": 0.1919,
|
||
|
|
"step": 602
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8578199052132702,
|
||
|
|
"grad_norm": 2.701719571128564,
|
||
|
|
"learning_rate": 8.989939519486843e-06,
|
||
|
|
"loss": 0.26,
|
||
|
|
"step": 603
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.862559241706161,
|
||
|
|
"grad_norm": 1.636624610366843,
|
||
|
|
"learning_rate": 8.984948926168014e-06,
|
||
|
|
"loss": 0.2494,
|
||
|
|
"step": 604
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.867298578199052,
|
||
|
|
"grad_norm": 2.1995481470676457,
|
||
|
|
"learning_rate": 8.97994742667382e-06,
|
||
|
|
"loss": 0.3229,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8720379146919433,
|
||
|
|
"grad_norm": 3.077386203425302,
|
||
|
|
"learning_rate": 8.974935034692584e-06,
|
||
|
|
"loss": 0.2151,
|
||
|
|
"step": 606
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.876777251184834,
|
||
|
|
"grad_norm": 2.5094488229151795,
|
||
|
|
"learning_rate": 8.969911763942422e-06,
|
||
|
|
"loss": 0.0976,
|
||
|
|
"step": 607
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.881516587677725,
|
||
|
|
"grad_norm": 2.0153082763397583,
|
||
|
|
"learning_rate": 8.96487762817124e-06,
|
||
|
|
"loss": 0.3267,
|
||
|
|
"step": 608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8862559241706163,
|
||
|
|
"grad_norm": 2.504820362595467,
|
||
|
|
"learning_rate": 8.959832641156668e-06,
|
||
|
|
"loss": 0.4973,
|
||
|
|
"step": 609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.890995260663507,
|
||
|
|
"grad_norm": 1.3840947253341358,
|
||
|
|
"learning_rate": 8.954776816706034e-06,
|
||
|
|
"loss": 0.2252,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.895734597156398,
|
||
|
|
"grad_norm": 1.1475318182666634,
|
||
|
|
"learning_rate": 8.949710168656338e-06,
|
||
|
|
"loss": 0.1105,
|
||
|
|
"step": 611
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9004739336492893,
|
||
|
|
"grad_norm": 1.9729878568274046,
|
||
|
|
"learning_rate": 8.94463271087419e-06,
|
||
|
|
"loss": 0.2582,
|
||
|
|
"step": 612
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.90521327014218,
|
||
|
|
"grad_norm": 1.8025160811381695,
|
||
|
|
"learning_rate": 8.939544457255792e-06,
|
||
|
|
"loss": 0.3355,
|
||
|
|
"step": 613
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.909952606635071,
|
||
|
|
"grad_norm": 1.9099688440590183,
|
||
|
|
"learning_rate": 8.934445421726888e-06,
|
||
|
|
"loss": 0.3046,
|
||
|
|
"step": 614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.914691943127962,
|
||
|
|
"grad_norm": 1.6522115356989961,
|
||
|
|
"learning_rate": 8.929335618242733e-06,
|
||
|
|
"loss": 0.2582,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.919431279620853,
|
||
|
|
"grad_norm": 2.298386865763964,
|
||
|
|
"learning_rate": 8.924215060788052e-06,
|
||
|
|
"loss": 0.4088,
|
||
|
|
"step": 616
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.924170616113744,
|
||
|
|
"grad_norm": 2.147731530462001,
|
||
|
|
"learning_rate": 8.919083763377001e-06,
|
||
|
|
"loss": 0.2494,
|
||
|
|
"step": 617
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9289099526066353,
|
||
|
|
"grad_norm": 1.5422635054909362,
|
||
|
|
"learning_rate": 8.91394174005313e-06,
|
||
|
|
"loss": 0.2203,
|
||
|
|
"step": 618
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.933649289099526,
|
||
|
|
"grad_norm": 1.700833802485036,
|
||
|
|
"learning_rate": 8.908789004889344e-06,
|
||
|
|
"loss": 0.3243,
|
||
|
|
"step": 619
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.938388625592417,
|
||
|
|
"grad_norm": 2.2151123515867686,
|
||
|
|
"learning_rate": 8.903625571987863e-06,
|
||
|
|
"loss": 0.2661,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.943127962085308,
|
||
|
|
"grad_norm": 1.595637385058635,
|
||
|
|
"learning_rate": 8.89845145548019e-06,
|
||
|
|
"loss": 0.2488,
|
||
|
|
"step": 621
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.947867298578199,
|
||
|
|
"grad_norm": 1.479935313492178,
|
||
|
|
"learning_rate": 8.893266669527063e-06,
|
||
|
|
"loss": 0.2238,
|
||
|
|
"step": 622
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9526066350710902,
|
||
|
|
"grad_norm": 2.2751814656976803,
|
||
|
|
"learning_rate": 8.888071228318422e-06,
|
||
|
|
"loss": 0.5095,
|
||
|
|
"step": 623
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.957345971563981,
|
||
|
|
"grad_norm": 2.4969401553056625,
|
||
|
|
"learning_rate": 8.882865146073365e-06,
|
||
|
|
"loss": 0.1634,
|
||
|
|
"step": 624
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.962085308056872,
|
||
|
|
"grad_norm": 1.532477373318212,
|
||
|
|
"learning_rate": 8.877648437040121e-06,
|
||
|
|
"loss": 0.2584,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.966824644549763,
|
||
|
|
"grad_norm": 2.5398952366709056,
|
||
|
|
"learning_rate": 8.872421115495996e-06,
|
||
|
|
"loss": 0.5054,
|
||
|
|
"step": 626
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.971563981042654,
|
||
|
|
"grad_norm": 2.7892159953371167,
|
||
|
|
"learning_rate": 8.867183195747343e-06,
|
||
|
|
"loss": 0.3134,
|
||
|
|
"step": 627
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.976303317535545,
|
||
|
|
"grad_norm": 1.4358314980311842,
|
||
|
|
"learning_rate": 8.861934692129519e-06,
|
||
|
|
"loss": 0.1268,
|
||
|
|
"step": 628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9810426540284363,
|
||
|
|
"grad_norm": 3.110877657855391,
|
||
|
|
"learning_rate": 8.85667561900685e-06,
|
||
|
|
"loss": 0.2085,
|
||
|
|
"step": 629
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.985781990521327,
|
||
|
|
"grad_norm": 1.4709345915042766,
|
||
|
|
"learning_rate": 8.851405990772588e-06,
|
||
|
|
"loss": 0.104,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.990521327014218,
|
||
|
|
"grad_norm": 1.608173580874095,
|
||
|
|
"learning_rate": 8.846125821848874e-06,
|
||
|
|
"loss": 0.2228,
|
||
|
|
"step": 631
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.995260663507109,
|
||
|
|
"grad_norm": 1.7819696416704576,
|
||
|
|
"learning_rate": 8.840835126686694e-06,
|
||
|
|
"loss": 0.1161,
|
||
|
|
"step": 632
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 2.3936724791423845,
|
||
|
|
"learning_rate": 8.835533919765844e-06,
|
||
|
|
"loss": 0.2689,
|
||
|
|
"step": 633
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1.0,
|
||
|
|
"max_steps": 2110,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 500,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 18303531614208.0,
|
||
|
|
"train_batch_size": 2,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|