1575 lines
37 KiB
JSON
1575 lines
37 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 5.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 220,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.022727272727272728,
|
||
|
|
"grad_norm": 26.87245830061202,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 1.0691,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.045454545454545456,
|
||
|
|
"grad_norm": 24.119318241718858,
|
||
|
|
"learning_rate": 3.3333333333333333e-06,
|
||
|
|
"loss": 0.989,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06818181818181818,
|
||
|
|
"grad_norm": 24.494901097246274,
|
||
|
|
"learning_rate": 6.666666666666667e-06,
|
||
|
|
"loss": 0.9827,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09090909090909091,
|
||
|
|
"grad_norm": 8.720154396981934,
|
||
|
|
"learning_rate": 1e-05,
|
||
|
|
"loss": 0.6482,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11363636363636363,
|
||
|
|
"grad_norm": 4.966678220163427,
|
||
|
|
"learning_rate": 9.999476022424688e-06,
|
||
|
|
"loss": 0.5896,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13636363636363635,
|
||
|
|
"grad_norm": 2.206240528335694,
|
||
|
|
"learning_rate": 9.997904199519748e-06,
|
||
|
|
"loss": 0.392,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1590909090909091,
|
||
|
|
"grad_norm": 3.9617287120048257,
|
||
|
|
"learning_rate": 9.995284860725162e-06,
|
||
|
|
"loss": 0.4349,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18181818181818182,
|
||
|
|
"grad_norm": 2.7218886368671966,
|
||
|
|
"learning_rate": 9.991618555030848e-06,
|
||
|
|
"loss": 0.3502,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20454545454545456,
|
||
|
|
"grad_norm": 1.3224686041620617,
|
||
|
|
"learning_rate": 9.986906050861595e-06,
|
||
|
|
"loss": 0.342,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22727272727272727,
|
||
|
|
"grad_norm": 1.4655470659379455,
|
||
|
|
"learning_rate": 9.981148335916e-06,
|
||
|
|
"loss": 0.3311,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25,
|
||
|
|
"grad_norm": 2.099411259913016,
|
||
|
|
"learning_rate": 9.974346616959476e-06,
|
||
|
|
"loss": 0.3786,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2727272727272727,
|
||
|
|
"grad_norm": 1.5150343483501494,
|
||
|
|
"learning_rate": 9.966502319571303e-06,
|
||
|
|
"loss": 0.2944,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29545454545454547,
|
||
|
|
"grad_norm": 1.8955486495667653,
|
||
|
|
"learning_rate": 9.95761708784585e-06,
|
||
|
|
"loss": 0.3586,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3181818181818182,
|
||
|
|
"grad_norm": 1.0770593717657149,
|
||
|
|
"learning_rate": 9.94769278404799e-06,
|
||
|
|
"loss": 0.3189,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3409090909090909,
|
||
|
|
"grad_norm": 1.8091317213751315,
|
||
|
|
"learning_rate": 9.936731488222776e-06,
|
||
|
|
"loss": 0.3416,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36363636363636365,
|
||
|
|
"grad_norm": 2.29274261320313,
|
||
|
|
"learning_rate": 9.924735497759497e-06,
|
||
|
|
"loss": 0.3399,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38636363636363635,
|
||
|
|
"grad_norm": 1.269837989850999,
|
||
|
|
"learning_rate": 9.911707326910145e-06,
|
||
|
|
"loss": 0.3543,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4090909090909091,
|
||
|
|
"grad_norm": 1.5013795426439296,
|
||
|
|
"learning_rate": 9.897649706262474e-06,
|
||
|
|
"loss": 0.2903,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4318181818181818,
|
||
|
|
"grad_norm": 1.4088618552574337,
|
||
|
|
"learning_rate": 9.882565582167673e-06,
|
||
|
|
"loss": 0.2845,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45454545454545453,
|
||
|
|
"grad_norm": 1.5165819472750817,
|
||
|
|
"learning_rate": 9.866458116122852e-06,
|
||
|
|
"loss": 0.316,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4772727272727273,
|
||
|
|
"grad_norm": 1.6134481868835353,
|
||
|
|
"learning_rate": 9.849330684108409e-06,
|
||
|
|
"loss": 0.2928,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5,
|
||
|
|
"grad_norm": 1.1143653341061437,
|
||
|
|
"learning_rate": 9.831186875880467e-06,
|
||
|
|
"loss": 0.276,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5227272727272727,
|
||
|
|
"grad_norm": 0.9830408617009574,
|
||
|
|
"learning_rate": 9.812030494218484e-06,
|
||
|
|
"loss": 0.313,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5454545454545454,
|
||
|
|
"grad_norm": 1.3736364481102779,
|
||
|
|
"learning_rate": 9.79186555412822e-06,
|
||
|
|
"loss": 0.3023,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5681818181818182,
|
||
|
|
"grad_norm": 1.338556634218699,
|
||
|
|
"learning_rate": 9.770696282000245e-06,
|
||
|
|
"loss": 0.3273,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5909090909090909,
|
||
|
|
"grad_norm": 1.34887345898166,
|
||
|
|
"learning_rate": 9.748527114724111e-06,
|
||
|
|
"loss": 0.3059,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6136363636363636,
|
||
|
|
"grad_norm": 1.1465526284754688,
|
||
|
|
"learning_rate": 9.725362698758425e-06,
|
||
|
|
"loss": 0.254,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6363636363636364,
|
||
|
|
"grad_norm": 1.1328089241339367,
|
||
|
|
"learning_rate": 9.701207889156989e-06,
|
||
|
|
"loss": 0.2727,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6590909090909091,
|
||
|
|
"grad_norm": 1.4785097164649903,
|
||
|
|
"learning_rate": 9.676067748551232e-06,
|
||
|
|
"loss": 0.314,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6818181818181818,
|
||
|
|
"grad_norm": 1.2861366584159655,
|
||
|
|
"learning_rate": 9.64994754608912e-06,
|
||
|
|
"loss": 0.3216,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7045454545454546,
|
||
|
|
"grad_norm": 1.254630631559985,
|
||
|
|
"learning_rate": 9.622852756330797e-06,
|
||
|
|
"loss": 0.2671,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7272727272727273,
|
||
|
|
"grad_norm": 1.4601173539398735,
|
||
|
|
"learning_rate": 9.594789058101154e-06,
|
||
|
|
"loss": 0.283,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.75,
|
||
|
|
"grad_norm": 0.9800010703607837,
|
||
|
|
"learning_rate": 9.565762333299616e-06,
|
||
|
|
"loss": 0.2176,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7727272727272727,
|
||
|
|
"grad_norm": 1.585779573547555,
|
||
|
|
"learning_rate": 9.535778665667334e-06,
|
||
|
|
"loss": 0.3186,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7954545454545454,
|
||
|
|
"grad_norm": 1.3270309012768746,
|
||
|
|
"learning_rate": 9.504844339512096e-06,
|
||
|
|
"loss": 0.334,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8181818181818182,
|
||
|
|
"grad_norm": 1.2326173009325117,
|
||
|
|
"learning_rate": 9.472965838391187e-06,
|
||
|
|
"loss": 0.2808,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8409090909090909,
|
||
|
|
"grad_norm": 1.1558051437795536,
|
||
|
|
"learning_rate": 9.44014984375249e-06,
|
||
|
|
"loss": 0.2117,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8636363636363636,
|
||
|
|
"grad_norm": 1.0782911468120715,
|
||
|
|
"learning_rate": 9.406403233534134e-06,
|
||
|
|
"loss": 0.2824,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8863636363636364,
|
||
|
|
"grad_norm": 1.5406724902243696,
|
||
|
|
"learning_rate": 9.371733080722911e-06,
|
||
|
|
"loss": 0.2335,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9090909090909091,
|
||
|
|
"grad_norm": 1.2044242055409695,
|
||
|
|
"learning_rate": 9.33614665187187e-06,
|
||
|
|
"loss": 0.2499,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9318181818181818,
|
||
|
|
"grad_norm": 1.2616965501514557,
|
||
|
|
"learning_rate": 9.299651405577286e-06,
|
||
|
|
"loss": 0.2438,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9545454545454546,
|
||
|
|
"grad_norm": 1.1136761921157818,
|
||
|
|
"learning_rate": 9.262254990915427e-06,
|
||
|
|
"loss": 0.2785,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9772727272727273,
|
||
|
|
"grad_norm": 0.9966948364040108,
|
||
|
|
"learning_rate": 9.223965245839367e-06,
|
||
|
|
"loss": 0.2597,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 1.3791645613025802,
|
||
|
|
"learning_rate": 9.184790195536217e-06,
|
||
|
|
"loss": 0.2679,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0227272727272727,
|
||
|
|
"grad_norm": 1.179410268749222,
|
||
|
|
"learning_rate": 9.144738050745129e-06,
|
||
|
|
"loss": 0.181,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0454545454545454,
|
||
|
|
"grad_norm": 1.1815878438627367,
|
||
|
|
"learning_rate": 9.103817206036383e-06,
|
||
|
|
"loss": 0.1863,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0681818181818181,
|
||
|
|
"grad_norm": 0.8101413228519797,
|
||
|
|
"learning_rate": 9.062036238051978e-06,
|
||
|
|
"loss": 0.1843,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0909090909090908,
|
||
|
|
"grad_norm": 0.9532028129997955,
|
||
|
|
"learning_rate": 9.019403903708036e-06,
|
||
|
|
"loss": 0.1732,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1136363636363635,
|
||
|
|
"grad_norm": 0.992565251308887,
|
||
|
|
"learning_rate": 8.975929138359423e-06,
|
||
|
|
"loss": 0.2059,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1363636363636362,
|
||
|
|
"grad_norm": 0.9117404975458566,
|
||
|
|
"learning_rate": 8.931621053926998e-06,
|
||
|
|
"loss": 0.2237,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1590909090909092,
|
||
|
|
"grad_norm": 0.8003178422788053,
|
||
|
|
"learning_rate": 8.886488936987817e-06,
|
||
|
|
"loss": 0.1334,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.1818181818181819,
|
||
|
|
"grad_norm": 1.2363455579654716,
|
||
|
|
"learning_rate": 8.840542246828763e-06,
|
||
|
|
"loss": 0.2168,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2045454545454546,
|
||
|
|
"grad_norm": 1.2347708733857203,
|
||
|
|
"learning_rate": 8.793790613463956e-06,
|
||
|
|
"loss": 0.175,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2272727272727273,
|
||
|
|
"grad_norm": 1.2303702228676998,
|
||
|
|
"learning_rate": 8.746243835616392e-06,
|
||
|
|
"loss": 0.1787,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.25,
|
||
|
|
"grad_norm": 1.2497191463046406,
|
||
|
|
"learning_rate": 8.697911878664222e-06,
|
||
|
|
"loss": 0.1739,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2727272727272727,
|
||
|
|
"grad_norm": 1.3951861645180035,
|
||
|
|
"learning_rate": 8.648804872552092e-06,
|
||
|
|
"loss": 0.1847,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.2954545454545454,
|
||
|
|
"grad_norm": 1.245991460551998,
|
||
|
|
"learning_rate": 8.598933109667995e-06,
|
||
|
|
"loss": 0.1351,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3181818181818181,
|
||
|
|
"grad_norm": 1.32907622391414,
|
||
|
|
"learning_rate": 8.548307042686093e-06,
|
||
|
|
"loss": 0.1546,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3409090909090908,
|
||
|
|
"grad_norm": 1.4968562879002865,
|
||
|
|
"learning_rate": 8.496937282375912e-06,
|
||
|
|
"loss": 0.2356,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3636363636363638,
|
||
|
|
"grad_norm": 0.9737096273924404,
|
||
|
|
"learning_rate": 8.444834595378434e-06,
|
||
|
|
"loss": 0.1335,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.3863636363636362,
|
||
|
|
"grad_norm": 1.3589415450025601,
|
||
|
|
"learning_rate": 8.3920099019495e-06,
|
||
|
|
"loss": 0.1363,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4090909090909092,
|
||
|
|
"grad_norm": 0.8664968714166548,
|
||
|
|
"learning_rate": 8.33847427367102e-06,
|
||
|
|
"loss": 0.1056,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4318181818181819,
|
||
|
|
"grad_norm": 1.0430422759251574,
|
||
|
|
"learning_rate": 8.284238931130476e-06,
|
||
|
|
"loss": 0.1827,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4545454545454546,
|
||
|
|
"grad_norm": 1.0086301864952136,
|
||
|
|
"learning_rate": 8.229315241569177e-06,
|
||
|
|
"loss": 0.1398,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.4772727272727273,
|
||
|
|
"grad_norm": 0.9279203416268156,
|
||
|
|
"learning_rate": 8.173714716499801e-06,
|
||
|
|
"loss": 0.157,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5,
|
||
|
|
"grad_norm": 1.1440300758673703,
|
||
|
|
"learning_rate": 8.117449009293668e-06,
|
||
|
|
"loss": 0.1685,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5227272727272727,
|
||
|
|
"grad_norm": 1.1751439805537514,
|
||
|
|
"learning_rate": 8.060529912738316e-06,
|
||
|
|
"loss": 0.1572,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5454545454545454,
|
||
|
|
"grad_norm": 1.0578357890566388,
|
||
|
|
"learning_rate": 8.002969356565822e-06,
|
||
|
|
"loss": 0.1598,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5681818181818183,
|
||
|
|
"grad_norm": 1.3193928833299897,
|
||
|
|
"learning_rate": 7.94477940495245e-06,
|
||
|
|
"loss": 0.1854,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.5909090909090908,
|
||
|
|
"grad_norm": 1.2978218132135766,
|
||
|
|
"learning_rate": 7.885972253990104e-06,
|
||
|
|
"loss": 0.1743,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6136363636363638,
|
||
|
|
"grad_norm": 1.0105258814245202,
|
||
|
|
"learning_rate": 7.826560229130132e-06,
|
||
|
|
"loss": 0.1987,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6363636363636362,
|
||
|
|
"grad_norm": 1.004604979048799,
|
||
|
|
"learning_rate": 7.766555782600023e-06,
|
||
|
|
"loss": 0.1795,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6590909090909092,
|
||
|
|
"grad_norm": 1.1179470989774414,
|
||
|
|
"learning_rate": 7.70597149079354e-06,
|
||
|
|
"loss": 0.1815,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.6818181818181817,
|
||
|
|
"grad_norm": 1.166448144503895,
|
||
|
|
"learning_rate": 7.644820051634813e-06,
|
||
|
|
"loss": 0.1617,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7045454545454546,
|
||
|
|
"grad_norm": 0.9473819093100403,
|
||
|
|
"learning_rate": 7.5831142819169664e-06,
|
||
|
|
"loss": 0.1282,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7272727272727273,
|
||
|
|
"grad_norm": 1.3707253122758942,
|
||
|
|
"learning_rate": 7.520867114615844e-06,
|
||
|
|
"loss": 0.1843,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.75,
|
||
|
|
"grad_norm": 0.8753193774986169,
|
||
|
|
"learning_rate": 7.458091596179359e-06,
|
||
|
|
"loss": 0.1205,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7727272727272727,
|
||
|
|
"grad_norm": 0.7554237843642733,
|
||
|
|
"learning_rate": 7.394800883793087e-06,
|
||
|
|
"loss": 0.0983,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.7954545454545454,
|
||
|
|
"grad_norm": 1.1973348363016902,
|
||
|
|
"learning_rate": 7.331008242622637e-06,
|
||
|
|
"loss": 0.1848,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8181818181818183,
|
||
|
|
"grad_norm": 1.1860354581221395,
|
||
|
|
"learning_rate": 7.266727043033386e-06,
|
||
|
|
"loss": 0.1527,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8409090909090908,
|
||
|
|
"grad_norm": 1.4128677081083227,
|
||
|
|
"learning_rate": 7.201970757788172e-06,
|
||
|
|
"loss": 0.1602,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8636363636363638,
|
||
|
|
"grad_norm": 1.296872182127621,
|
||
|
|
"learning_rate": 7.136752959223527e-06,
|
||
|
|
"loss": 0.2184,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.8863636363636362,
|
||
|
|
"grad_norm": 1.4836645345593107,
|
||
|
|
"learning_rate": 7.071087316405037e-06,
|
||
|
|
"loss": 0.2896,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9090909090909092,
|
||
|
|
"grad_norm": 1.1976778490053432,
|
||
|
|
"learning_rate": 7.00498759226242e-06,
|
||
|
|
"loss": 0.1659,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9318181818181817,
|
||
|
|
"grad_norm": 0.9975231348395474,
|
||
|
|
"learning_rate": 6.938467640704953e-06,
|
||
|
|
"loss": 0.1535,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9545454545454546,
|
||
|
|
"grad_norm": 0.9648415033602733,
|
||
|
|
"learning_rate": 6.871541403717808e-06,
|
||
|
|
"loss": 0.1753,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.9772727272727273,
|
||
|
|
"grad_norm": 1.2019101492420445,
|
||
|
|
"learning_rate": 6.8042229084399325e-06,
|
||
|
|
"loss": 0.1562,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 0.9443355724586839,
|
||
|
|
"learning_rate": 6.736526264224101e-06,
|
||
|
|
"loss": 0.1196,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.022727272727273,
|
||
|
|
"grad_norm": 1.000147993017032,
|
||
|
|
"learning_rate": 6.668465659679714e-06,
|
||
|
|
"loss": 0.1105,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0454545454545454,
|
||
|
|
"grad_norm": 0.9077637361309605,
|
||
|
|
"learning_rate": 6.600055359698984e-06,
|
||
|
|
"loss": 0.1359,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.0681818181818183,
|
||
|
|
"grad_norm": 0.8899974700309011,
|
||
|
|
"learning_rate": 6.531309702467159e-06,
|
||
|
|
"loss": 0.1051,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.090909090909091,
|
||
|
|
"grad_norm": 0.7405665057218146,
|
||
|
|
"learning_rate": 6.462243096457352e-06,
|
||
|
|
"loss": 0.0949,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1136363636363638,
|
||
|
|
"grad_norm": 0.9232739478624769,
|
||
|
|
"learning_rate": 6.392870017410665e-06,
|
||
|
|
"loss": 0.0869,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1363636363636362,
|
||
|
|
"grad_norm": 0.907192376363368,
|
||
|
|
"learning_rate": 6.323205005302199e-06,
|
||
|
|
"loss": 0.085,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.159090909090909,
|
||
|
|
"grad_norm": 0.9510815353153362,
|
||
|
|
"learning_rate": 6.2532626612936035e-06,
|
||
|
|
"loss": 0.1041,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.1818181818181817,
|
||
|
|
"grad_norm": 1.0694010726357495,
|
||
|
|
"learning_rate": 6.18305764467281e-06,
|
||
|
|
"loss": 0.0933,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2045454545454546,
|
||
|
|
"grad_norm": 0.9096286210344772,
|
||
|
|
"learning_rate": 6.112604669781572e-06,
|
||
|
|
"loss": 0.0672,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.227272727272727,
|
||
|
|
"grad_norm": 1.433917507625707,
|
||
|
|
"learning_rate": 6.041918502931473e-06,
|
||
|
|
"loss": 0.0879,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.25,
|
||
|
|
"grad_norm": 0.9531885718869322,
|
||
|
|
"learning_rate": 5.971013959309038e-06,
|
||
|
|
"loss": 0.0596,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2727272727272725,
|
||
|
|
"grad_norm": 1.1923953651383123,
|
||
|
|
"learning_rate": 5.8999058998706046e-06,
|
||
|
|
"loss": 0.0788,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.2954545454545454,
|
||
|
|
"grad_norm": 0.9554826618737247,
|
||
|
|
"learning_rate": 5.828609228227603e-06,
|
||
|
|
"loss": 0.073,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3181818181818183,
|
||
|
|
"grad_norm": 1.0620618620218882,
|
||
|
|
"learning_rate": 5.757138887522884e-06,
|
||
|
|
"loss": 0.0852,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.340909090909091,
|
||
|
|
"grad_norm": 1.1580458870328374,
|
||
|
|
"learning_rate": 5.685509857298781e-06,
|
||
|
|
"loss": 0.1011,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3636363636363638,
|
||
|
|
"grad_norm": 1.5883148285084483,
|
||
|
|
"learning_rate": 5.613737150357528e-06,
|
||
|
|
"loss": 0.0791,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.3863636363636362,
|
||
|
|
"grad_norm": 1.2367508056402248,
|
||
|
|
"learning_rate": 5.541835809614704e-06,
|
||
|
|
"loss": 0.0654,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.409090909090909,
|
||
|
|
"grad_norm": 2.2791052878714653,
|
||
|
|
"learning_rate": 5.469820904946383e-06,
|
||
|
|
"loss": 0.087,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4318181818181817,
|
||
|
|
"grad_norm": 1.2009177571989036,
|
||
|
|
"learning_rate": 5.397707530030621e-06,
|
||
|
|
"loss": 0.0754,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4545454545454546,
|
||
|
|
"grad_norm": 1.24186865246545,
|
||
|
|
"learning_rate": 5.325510799183953e-06,
|
||
|
|
"loss": 0.0676,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.4772727272727275,
|
||
|
|
"grad_norm": 1.3626254215524685,
|
||
|
|
"learning_rate": 5.253245844193564e-06,
|
||
|
|
"loss": 0.0897,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5,
|
||
|
|
"grad_norm": 1.2267940513161908,
|
||
|
|
"learning_rate": 5.180927811145818e-06,
|
||
|
|
"loss": 0.081,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5227272727272725,
|
||
|
|
"grad_norm": 1.0280554800159314,
|
||
|
|
"learning_rate": 5.108571857251754e-06,
|
||
|
|
"loss": 0.0998,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5454545454545454,
|
||
|
|
"grad_norm": 1.198523585670272,
|
||
|
|
"learning_rate": 5.036193147670286e-06,
|
||
|
|
"loss": 0.0943,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.5681818181818183,
|
||
|
|
"grad_norm": 1.0299128746931727,
|
||
|
|
"learning_rate": 4.963806852329715e-06,
|
||
|
|
"loss": 0.0867,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.590909090909091,
|
||
|
|
"grad_norm": 1.0781930889668705,
|
||
|
|
"learning_rate": 4.891428142748247e-06,
|
||
|
|
"loss": 0.0935,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6136363636363638,
|
||
|
|
"grad_norm": 1.3870007299043179,
|
||
|
|
"learning_rate": 4.819072188854183e-06,
|
||
|
|
"loss": 0.1038,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6363636363636362,
|
||
|
|
"grad_norm": 1.2763920896506822,
|
||
|
|
"learning_rate": 4.746754155806437e-06,
|
||
|
|
"loss": 0.1066,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.659090909090909,
|
||
|
|
"grad_norm": 1.1553721597437976,
|
||
|
|
"learning_rate": 4.674489200816051e-06,
|
||
|
|
"loss": 0.0727,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.6818181818181817,
|
||
|
|
"grad_norm": 1.2201292764615486,
|
||
|
|
"learning_rate": 4.602292469969381e-06,
|
||
|
|
"loss": 0.1029,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7045454545454546,
|
||
|
|
"grad_norm": 1.0182353763504708,
|
||
|
|
"learning_rate": 4.5301790950536175e-06,
|
||
|
|
"loss": 0.081,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7272727272727275,
|
||
|
|
"grad_norm": 0.8638534963225072,
|
||
|
|
"learning_rate": 4.458164190385297e-06,
|
||
|
|
"loss": 0.0743,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.75,
|
||
|
|
"grad_norm": 1.179392830465865,
|
||
|
|
"learning_rate": 4.386262849642474e-06,
|
||
|
|
"loss": 0.1008,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7727272727272725,
|
||
|
|
"grad_norm": 0.8679825281753464,
|
||
|
|
"learning_rate": 4.31449014270122e-06,
|
||
|
|
"loss": 0.0493,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.7954545454545454,
|
||
|
|
"grad_norm": 1.1253980579124658,
|
||
|
|
"learning_rate": 4.2428611124771184e-06,
|
||
|
|
"loss": 0.0848,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8181818181818183,
|
||
|
|
"grad_norm": 1.0356630565413245,
|
||
|
|
"learning_rate": 4.171390771772399e-06,
|
||
|
|
"loss": 0.068,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.840909090909091,
|
||
|
|
"grad_norm": 1.3477554175880992,
|
||
|
|
"learning_rate": 4.100094100129396e-06,
|
||
|
|
"loss": 0.1043,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8636363636363638,
|
||
|
|
"grad_norm": 0.97866201549117,
|
||
|
|
"learning_rate": 4.028986040690963e-06,
|
||
|
|
"loss": 0.0847,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.8863636363636362,
|
||
|
|
"grad_norm": 1.1349756610536412,
|
||
|
|
"learning_rate": 3.958081497068528e-06,
|
||
|
|
"loss": 0.0792,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.909090909090909,
|
||
|
|
"grad_norm": 0.9146117017482877,
|
||
|
|
"learning_rate": 3.887395330218429e-06,
|
||
|
|
"loss": 0.0611,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9318181818181817,
|
||
|
|
"grad_norm": 1.0807764635079844,
|
||
|
|
"learning_rate": 3.816942355327191e-06,
|
||
|
|
"loss": 0.0904,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9545454545454546,
|
||
|
|
"grad_norm": 1.1271897510030453,
|
||
|
|
"learning_rate": 3.7467373387063973e-06,
|
||
|
|
"loss": 0.0769,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 2.9772727272727275,
|
||
|
|
"grad_norm": 1.079134591854598,
|
||
|
|
"learning_rate": 3.6767949946978026e-06,
|
||
|
|
"loss": 0.0936,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 1.0766678958231195,
|
||
|
|
"learning_rate": 3.607129982589337e-06,
|
||
|
|
"loss": 0.0836,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.022727272727273,
|
||
|
|
"grad_norm": 0.6602194945121888,
|
||
|
|
"learning_rate": 3.5377569035426494e-06,
|
||
|
|
"loss": 0.0432,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0454545454545454,
|
||
|
|
"grad_norm": 0.9207788970106073,
|
||
|
|
"learning_rate": 3.468690297532843e-06,
|
||
|
|
"loss": 0.0614,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0681818181818183,
|
||
|
|
"grad_norm": 0.7792864367119448,
|
||
|
|
"learning_rate": 3.3999446403010156e-06,
|
||
|
|
"loss": 0.044,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.090909090909091,
|
||
|
|
"grad_norm": 0.7824193339459661,
|
||
|
|
"learning_rate": 3.331534340320287e-06,
|
||
|
|
"loss": 0.0299,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1136363636363638,
|
||
|
|
"grad_norm": 1.035803750265937,
|
||
|
|
"learning_rate": 3.2634737357758994e-06,
|
||
|
|
"loss": 0.0481,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1363636363636362,
|
||
|
|
"grad_norm": 0.7826621695998633,
|
||
|
|
"learning_rate": 3.1957770915600696e-06,
|
||
|
|
"loss": 0.0388,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.159090909090909,
|
||
|
|
"grad_norm": 0.6958054164558033,
|
||
|
|
"learning_rate": 3.1284585962821957e-06,
|
||
|
|
"loss": 0.0351,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.1818181818181817,
|
||
|
|
"grad_norm": 0.9734916299688532,
|
||
|
|
"learning_rate": 3.0615323592950495e-06,
|
||
|
|
"loss": 0.0458,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2045454545454546,
|
||
|
|
"grad_norm": 0.9750452432170936,
|
||
|
|
"learning_rate": 2.995012407737581e-06,
|
||
|
|
"loss": 0.044,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.227272727272727,
|
||
|
|
"grad_norm": 1.1459789012585446,
|
||
|
|
"learning_rate": 2.9289126835949657e-06,
|
||
|
|
"loss": 0.0663,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.25,
|
||
|
|
"grad_norm": 1.1249770693101788,
|
||
|
|
"learning_rate": 2.8632470407764746e-06,
|
||
|
|
"loss": 0.0431,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2727272727272725,
|
||
|
|
"grad_norm": 0.9864994227169254,
|
||
|
|
"learning_rate": 2.7980292422118282e-06,
|
||
|
|
"loss": 0.0606,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.2954545454545454,
|
||
|
|
"grad_norm": 1.0092348933533415,
|
||
|
|
"learning_rate": 2.733272956966615e-06,
|
||
|
|
"loss": 0.0538,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3181818181818183,
|
||
|
|
"grad_norm": 1.39535927779365,
|
||
|
|
"learning_rate": 2.6689917573773615e-06,
|
||
|
|
"loss": 0.0531,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.340909090909091,
|
||
|
|
"grad_norm": 0.9511300286527435,
|
||
|
|
"learning_rate": 2.605199116206912e-06,
|
||
|
|
"loss": 0.0382,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3636363636363638,
|
||
|
|
"grad_norm": 1.2541730166057663,
|
||
|
|
"learning_rate": 2.5419084038206422e-06,
|
||
|
|
"loss": 0.0419,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.3863636363636362,
|
||
|
|
"grad_norm": 1.0095638491761618,
|
||
|
|
"learning_rate": 2.4791328853841577e-06,
|
||
|
|
"loss": 0.0434,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.409090909090909,
|
||
|
|
"grad_norm": 0.8262732933318356,
|
||
|
|
"learning_rate": 2.416885718083035e-06,
|
||
|
|
"loss": 0.0322,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4318181818181817,
|
||
|
|
"grad_norm": 0.8137374498919325,
|
||
|
|
"learning_rate": 2.3551799483651894e-06,
|
||
|
|
"loss": 0.0308,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4545454545454546,
|
||
|
|
"grad_norm": 0.8629006626767369,
|
||
|
|
"learning_rate": 2.294028509206461e-06,
|
||
|
|
"loss": 0.0459,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.4772727272727275,
|
||
|
|
"grad_norm": 0.75511924638048,
|
||
|
|
"learning_rate": 2.2334442173999794e-06,
|
||
|
|
"loss": 0.0304,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5,
|
||
|
|
"grad_norm": 0.765294235454733,
|
||
|
|
"learning_rate": 2.17343977086987e-06,
|
||
|
|
"loss": 0.0436,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5227272727272725,
|
||
|
|
"grad_norm": 0.8507628894917487,
|
||
|
|
"learning_rate": 2.114027746009897e-06,
|
||
|
|
"loss": 0.0277,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5454545454545454,
|
||
|
|
"grad_norm": 1.1371497063801275,
|
||
|
|
"learning_rate": 2.055220595047551e-06,
|
||
|
|
"loss": 0.0463,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.5681818181818183,
|
||
|
|
"grad_norm": 1.0468638172133997,
|
||
|
|
"learning_rate": 1.9970306434341806e-06,
|
||
|
|
"loss": 0.0354,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.590909090909091,
|
||
|
|
"grad_norm": 0.8300360380363072,
|
||
|
|
"learning_rate": 1.9394700872616856e-06,
|
||
|
|
"loss": 0.0377,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6136363636363638,
|
||
|
|
"grad_norm": 1.2431358466370912,
|
||
|
|
"learning_rate": 1.8825509907063328e-06,
|
||
|
|
"loss": 0.0407,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6363636363636362,
|
||
|
|
"grad_norm": 1.0116784265871086,
|
||
|
|
"learning_rate": 1.826285283500201e-06,
|
||
|
|
"loss": 0.0506,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.659090909090909,
|
||
|
|
"grad_norm": 0.9351206425735656,
|
||
|
|
"learning_rate": 1.770684758430824e-06,
|
||
|
|
"loss": 0.0383,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.6818181818181817,
|
||
|
|
"grad_norm": 0.7608293362619839,
|
||
|
|
"learning_rate": 1.7157610688695248e-06,
|
||
|
|
"loss": 0.0251,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7045454545454546,
|
||
|
|
"grad_norm": 1.0486186821729864,
|
||
|
|
"learning_rate": 1.6615257263289809e-06,
|
||
|
|
"loss": 0.0354,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7272727272727275,
|
||
|
|
"grad_norm": 0.8511206701235292,
|
||
|
|
"learning_rate": 1.607990098050501e-06,
|
||
|
|
"loss": 0.0375,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.75,
|
||
|
|
"grad_norm": 0.9639257216156043,
|
||
|
|
"learning_rate": 1.555165404621567e-06,
|
||
|
|
"loss": 0.0406,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7727272727272725,
|
||
|
|
"grad_norm": 0.9665035321570787,
|
||
|
|
"learning_rate": 1.5030627176240903e-06,
|
||
|
|
"loss": 0.0386,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.7954545454545454,
|
||
|
|
"grad_norm": 0.8935090470273086,
|
||
|
|
"learning_rate": 1.45169295731391e-06,
|
||
|
|
"loss": 0.0351,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8181818181818183,
|
||
|
|
"grad_norm": 1.1337183807356483,
|
||
|
|
"learning_rate": 1.4010668903320068e-06,
|
||
|
|
"loss": 0.0267,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.840909090909091,
|
||
|
|
"grad_norm": 1.0191173211814406,
|
||
|
|
"learning_rate": 1.3511951274479096e-06,
|
||
|
|
"loss": 0.03,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8636363636363638,
|
||
|
|
"grad_norm": 0.8969739545223467,
|
||
|
|
"learning_rate": 1.3020881213357783e-06,
|
||
|
|
"loss": 0.0433,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.8863636363636362,
|
||
|
|
"grad_norm": 0.8374168117834856,
|
||
|
|
"learning_rate": 1.2537561643836087e-06,
|
||
|
|
"loss": 0.0272,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.909090909090909,
|
||
|
|
"grad_norm": 1.0969039495955495,
|
||
|
|
"learning_rate": 1.2062093865360458e-06,
|
||
|
|
"loss": 0.052,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9318181818181817,
|
||
|
|
"grad_norm": 0.7906472210562279,
|
||
|
|
"learning_rate": 1.1594577531712392e-06,
|
||
|
|
"loss": 0.0492,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9545454545454546,
|
||
|
|
"grad_norm": 0.8214507384737139,
|
||
|
|
"learning_rate": 1.1135110630121837e-06,
|
||
|
|
"loss": 0.036,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.9772727272727275,
|
||
|
|
"grad_norm": 0.8916597776872088,
|
||
|
|
"learning_rate": 1.0683789460730037e-06,
|
||
|
|
"loss": 0.0358,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0,
|
||
|
|
"grad_norm": 0.9910806202679314,
|
||
|
|
"learning_rate": 1.0240708616405788e-06,
|
||
|
|
"loss": 0.0471,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.0227272727272725,
|
||
|
|
"grad_norm": 0.8017333339630908,
|
||
|
|
"learning_rate": 9.80596096291967e-07,
|
||
|
|
"loss": 0.0421,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.045454545454546,
|
||
|
|
"grad_norm": 0.6088709030309449,
|
||
|
|
"learning_rate": 9.379637619480236e-07,
|
||
|
|
"loss": 0.0197,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.068181818181818,
|
||
|
|
"grad_norm": 0.5056506758915681,
|
||
|
|
"learning_rate": 8.961827939636198e-07,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.090909090909091,
|
||
|
|
"grad_norm": 0.7756562125854171,
|
||
|
|
"learning_rate": 8.552619492548736e-07,
|
||
|
|
"loss": 0.0268,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.113636363636363,
|
||
|
|
"grad_norm": 0.5669855251712181,
|
||
|
|
"learning_rate": 8.15209804463783e-07,
|
||
|
|
"loss": 0.0202,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.136363636363637,
|
||
|
|
"grad_norm": 0.6473780755422711,
|
||
|
|
"learning_rate": 7.760347541606339e-07,
|
||
|
|
"loss": 0.0307,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.159090909090909,
|
||
|
|
"grad_norm": 0.5361573992028992,
|
||
|
|
"learning_rate": 7.377450090845733e-07,
|
||
|
|
"loss": 0.0215,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.181818181818182,
|
||
|
|
"grad_norm": 0.6191327927742057,
|
||
|
|
"learning_rate": 7.003485944227162e-07,
|
||
|
|
"loss": 0.0297,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.204545454545454,
|
||
|
|
"grad_norm": 0.7740351242770065,
|
||
|
|
"learning_rate": 6.638533481281323e-07,
|
||
|
|
"loss": 0.0277,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2272727272727275,
|
||
|
|
"grad_norm": 0.8036694531204502,
|
||
|
|
"learning_rate": 6.282669192770896e-07,
|
||
|
|
"loss": 0.0337,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.25,
|
||
|
|
"grad_norm": 0.5569549110731221,
|
||
|
|
"learning_rate": 5.935967664658682e-07,
|
||
|
|
"loss": 0.023,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.2727272727272725,
|
||
|
|
"grad_norm": 0.43268894985216455,
|
||
|
|
"learning_rate": 5.598501562475111e-07,
|
||
|
|
"loss": 0.015,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.295454545454546,
|
||
|
|
"grad_norm": 0.5798546765255864,
|
||
|
|
"learning_rate": 5.270341616088153e-07,
|
||
|
|
"loss": 0.0161,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.318181818181818,
|
||
|
|
"grad_norm": 0.667306809133945,
|
||
|
|
"learning_rate": 4.951556604879049e-07,
|
||
|
|
"loss": 0.0207,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.340909090909091,
|
||
|
|
"grad_norm": 0.5559259388326303,
|
||
|
|
"learning_rate": 4.6422133433266513e-07,
|
||
|
|
"loss": 0.0154,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.363636363636363,
|
||
|
|
"grad_norm": 0.48805637239261085,
|
||
|
|
"learning_rate": 4.342376667003845e-07,
|
||
|
|
"loss": 0.0133,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.386363636363637,
|
||
|
|
"grad_norm": 0.633760717360346,
|
||
|
|
"learning_rate": 4.05210941898847e-07,
|
||
|
|
"loss": 0.0166,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.409090909090909,
|
||
|
|
"grad_norm": 0.4941494223196582,
|
||
|
|
"learning_rate": 3.771472436692053e-07,
|
||
|
|
"loss": 0.0136,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.431818181818182,
|
||
|
|
"grad_norm": 0.694276868602463,
|
||
|
|
"learning_rate": 3.500524539108807e-07,
|
||
|
|
"loss": 0.0218,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.454545454545454,
|
||
|
|
"grad_norm": 0.588819670846474,
|
||
|
|
"learning_rate": 3.239322514487686e-07,
|
||
|
|
"loss": 0.0176,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.4772727272727275,
|
||
|
|
"grad_norm": 0.5265383444966927,
|
||
|
|
"learning_rate": 2.9879211084301194e-07,
|
||
|
|
"loss": 0.0151,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5,
|
||
|
|
"grad_norm": 0.44878162692299606,
|
||
|
|
"learning_rate": 2.7463730124157706e-07,
|
||
|
|
"loss": 0.014,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.5227272727272725,
|
||
|
|
"grad_norm": 0.6569401749445752,
|
||
|
|
"learning_rate": 2.5147288527588964e-07,
|
||
|
|
"loss": 0.0164,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.545454545454545,
|
||
|
|
"grad_norm": 1.2770158126939868,
|
||
|
|
"learning_rate": 2.2930371799975593e-07,
|
||
|
|
"loss": 0.0218,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.568181818181818,
|
||
|
|
"grad_norm": 0.5855843139833002,
|
||
|
|
"learning_rate": 2.0813444587178156e-07,
|
||
|
|
"loss": 0.0168,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.590909090909091,
|
||
|
|
"grad_norm": 0.6376332689792783,
|
||
|
|
"learning_rate": 1.8796950578151785e-07,
|
||
|
|
"loss": 0.0152,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.613636363636363,
|
||
|
|
"grad_norm": 0.6329781274553996,
|
||
|
|
"learning_rate": 1.6881312411953288e-07,
|
||
|
|
"loss": 0.0111,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.636363636363637,
|
||
|
|
"grad_norm": 0.9765159855301974,
|
||
|
|
"learning_rate": 1.5066931589159118e-07,
|
||
|
|
"loss": 0.0278,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.659090909090909,
|
||
|
|
"grad_norm": 0.6973740006107351,
|
||
|
|
"learning_rate": 1.3354188387715017e-07,
|
||
|
|
"loss": 0.022,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.681818181818182,
|
||
|
|
"grad_norm": 0.5499873946022198,
|
||
|
|
"learning_rate": 1.174344178323289e-07,
|
||
|
|
"loss": 0.0155,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.704545454545455,
|
||
|
|
"grad_norm": 0.7453329709429668,
|
||
|
|
"learning_rate": 1.0235029373752758e-07,
|
||
|
|
"loss": 0.0274,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7272727272727275,
|
||
|
|
"grad_norm": 0.4712400999164701,
|
||
|
|
"learning_rate": 8.829267308985535e-08,
|
||
|
|
"loss": 0.0128,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.75,
|
||
|
|
"grad_norm": 0.791960861976085,
|
||
|
|
"learning_rate": 7.526450224050407e-08,
|
||
|
|
"loss": 0.0206,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.7727272727272725,
|
||
|
|
"grad_norm": 0.8100989026594698,
|
||
|
|
"learning_rate": 6.326851177722304e-08,
|
||
|
|
"loss": 0.026,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.795454545454545,
|
||
|
|
"grad_norm": 0.5698845908406204,
|
||
|
|
"learning_rate": 5.230721595201049e-08,
|
||
|
|
"loss": 0.0115,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.818181818181818,
|
||
|
|
"grad_norm": 0.7463531903553184,
|
||
|
|
"learning_rate": 4.2382912154150244e-08,
|
||
|
|
"loss": 0.0178,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.840909090909091,
|
||
|
|
"grad_norm": 0.7813627200692648,
|
||
|
|
"learning_rate": 3.3497680428697943e-08,
|
||
|
|
"loss": 0.0163,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.863636363636363,
|
||
|
|
"grad_norm": 0.66982841564412,
|
||
|
|
"learning_rate": 2.5653383040524228e-08,
|
||
|
|
"loss": 0.0186,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.886363636363637,
|
||
|
|
"grad_norm": 0.4637052800834321,
|
||
|
|
"learning_rate": 1.8851664083999742e-08,
|
||
|
|
"loss": 0.013,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.909090909090909,
|
||
|
|
"grad_norm": 0.43304208586446546,
|
||
|
|
"learning_rate": 1.3093949138406892e-08,
|
||
|
|
"loss": 0.0081,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.931818181818182,
|
||
|
|
"grad_norm": 0.8973836308599885,
|
||
|
|
"learning_rate": 8.381444969151608e-09,
|
||
|
|
"loss": 0.0276,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.954545454545455,
|
||
|
|
"grad_norm": 0.6757448220546857,
|
||
|
|
"learning_rate": 4.7151392748379095e-09,
|
||
|
|
"loss": 0.0185,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 4.9772727272727275,
|
||
|
|
"grad_norm": 0.6378204095313933,
|
||
|
|
"learning_rate": 2.0958004802529297e-09,
|
||
|
|
"loss": 0.0239,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 5.0,
|
||
|
|
"grad_norm": 0.7520281673571331,
|
||
|
|
"learning_rate": 5.239775753129728e-10,
|
||
|
|
"loss": 0.0294,
|
||
|
|
"step": 220
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 220,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 5,
|
||
|
|
"save_steps": 75,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 14632660697088.0,
|
||
|
|
"train_batch_size": 2,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|