1311 lines
29 KiB
JSON
1311 lines
29 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 0.01987918725974739,
|
|
"eval_steps": 500,
|
|
"global_step": 181,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.00010982976386600769,
|
|
"grad_norm": 18.411802291870117,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.3251,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.00021965952773201537,
|
|
"grad_norm": 14.977940559387207,
|
|
"learning_rate": 5.2631578947368416e-08,
|
|
"loss": 0.3606,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.00032948929159802305,
|
|
"grad_norm": 14.84136962890625,
|
|
"learning_rate": 1.0526315789473683e-07,
|
|
"loss": 0.2777,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.00043931905546403075,
|
|
"grad_norm": 15.656201362609863,
|
|
"learning_rate": 1.5789473684210525e-07,
|
|
"loss": 0.293,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.0005491488193300384,
|
|
"grad_norm": 11.50175666809082,
|
|
"learning_rate": 2.1052631578947366e-07,
|
|
"loss": 0.271,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.0006589785831960461,
|
|
"grad_norm": 13.30196762084961,
|
|
"learning_rate": 2.631578947368421e-07,
|
|
"loss": 0.3014,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.0007688083470620538,
|
|
"grad_norm": 18.187772750854492,
|
|
"learning_rate": 3.157894736842105e-07,
|
|
"loss": 0.2838,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.0008786381109280615,
|
|
"grad_norm": 12.342240333557129,
|
|
"learning_rate": 3.684210526315789e-07,
|
|
"loss": 0.2961,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.0009884678747940692,
|
|
"grad_norm": 12.453307151794434,
|
|
"learning_rate": 4.2105263157894733e-07,
|
|
"loss": 0.2727,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.001098297638660077,
|
|
"grad_norm": 13.236316680908203,
|
|
"learning_rate": 4.7368421052631574e-07,
|
|
"loss": 0.2777,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0012081274025260845,
|
|
"grad_norm": 9.186564445495605,
|
|
"learning_rate": 5.263157894736842e-07,
|
|
"loss": 0.2362,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.0013179571663920922,
|
|
"grad_norm": 14.68601131439209,
|
|
"learning_rate": 5.789473684210526e-07,
|
|
"loss": 0.3246,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.0014277869302581,
|
|
"grad_norm": 12.118836402893066,
|
|
"learning_rate": 6.31578947368421e-07,
|
|
"loss": 0.2334,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.0015376166941241077,
|
|
"grad_norm": 9.764254570007324,
|
|
"learning_rate": 6.842105263157895e-07,
|
|
"loss": 0.2382,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.0016474464579901153,
|
|
"grad_norm": 7.129339694976807,
|
|
"learning_rate": 7.368421052631578e-07,
|
|
"loss": 0.2399,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.001757276221856123,
|
|
"grad_norm": 6.999264717102051,
|
|
"learning_rate": 7.894736842105263e-07,
|
|
"loss": 0.1915,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.0018671059857221306,
|
|
"grad_norm": 6.912661075592041,
|
|
"learning_rate": 8.421052631578947e-07,
|
|
"loss": 0.1996,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.0019769357495881385,
|
|
"grad_norm": 6.077408313751221,
|
|
"learning_rate": 8.947368421052631e-07,
|
|
"loss": 0.1915,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.002086765513454146,
|
|
"grad_norm": 4.185582160949707,
|
|
"learning_rate": 9.473684210526315e-07,
|
|
"loss": 0.176,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.002196595277320154,
|
|
"grad_norm": 2.6936287879943848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1309,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.0023064250411861617,
|
|
"grad_norm": 3.5675835609436035,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1515,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.002416254805052169,
|
|
"grad_norm": 2.7305240631103516,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.132,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.002526084568918177,
|
|
"grad_norm": 2.0667977333068848,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.107,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.0026359143327841844,
|
|
"grad_norm": 2.3355114459991455,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1252,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.0027457440966501922,
|
|
"grad_norm": 2.30021333694458,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1262,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.0028555738605162,
|
|
"grad_norm": 2.639949083328247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1611,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.0029654036243822075,
|
|
"grad_norm": 2.6157100200653076,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1133,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.0030752333882482154,
|
|
"grad_norm": 2.25714373588562,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1175,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.003185063152114223,
|
|
"grad_norm": 2.413874864578247,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1335,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.0032948929159802307,
|
|
"grad_norm": 2.9752049446105957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1549,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.003404722679846238,
|
|
"grad_norm": 2.191883087158203,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1072,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.003514552443712246,
|
|
"grad_norm": 2.182326078414917,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1322,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.003624382207578254,
|
|
"grad_norm": 2.0499420166015625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1252,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.0037342119714442613,
|
|
"grad_norm": 2.898209571838379,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1564,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.003844041735310269,
|
|
"grad_norm": 2.6807024478912354,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1215,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.003953871499176277,
|
|
"grad_norm": 2.6628167629241943,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.125,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.004063701263042284,
|
|
"grad_norm": 2.3685622215270996,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1423,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.004173531026908292,
|
|
"grad_norm": 2.0525412559509277,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1321,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.0042833607907743,
|
|
"grad_norm": 2.064305305480957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1139,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.004393190554640308,
|
|
"grad_norm": 2.255208969116211,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1075,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.004503020318506315,
|
|
"grad_norm": 2.1674306392669678,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1231,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.004612850082372323,
|
|
"grad_norm": 2.1255414485931396,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.149,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.004722679846238331,
|
|
"grad_norm": 1.9542911052703857,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1406,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.004832509610104338,
|
|
"grad_norm": 2.3152379989624023,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.122,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.004942339373970346,
|
|
"grad_norm": 2.2259297370910645,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1273,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.005052169137836354,
|
|
"grad_norm": 2.1162829399108887,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1071,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.005161998901702361,
|
|
"grad_norm": 2.2870123386383057,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1222,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.005271828665568369,
|
|
"grad_norm": 2.1178293228149414,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1128,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.005381658429434377,
|
|
"grad_norm": 2.811563730239868,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.113,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.0054914881933003845,
|
|
"grad_norm": 1.9846251010894775,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1083,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.005601317957166392,
|
|
"grad_norm": 4.522186279296875,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1283,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.0057111477210324,
|
|
"grad_norm": 1.9514762163162231,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.117,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.005820977484898408,
|
|
"grad_norm": 1.644494652748108,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1097,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.005930807248764415,
|
|
"grad_norm": 2.26704740524292,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0996,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.0060406370126304225,
|
|
"grad_norm": 1.8278131484985352,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1175,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.006150466776496431,
|
|
"grad_norm": 2.9004828929901123,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.118,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.006260296540362438,
|
|
"grad_norm": 2.1837573051452637,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1052,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.006370126304228446,
|
|
"grad_norm": 2.207904577255249,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1057,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.006479956068094454,
|
|
"grad_norm": 1.937738060951233,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1377,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.006589785831960461,
|
|
"grad_norm": 1.8323129415512085,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0962,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.006699615595826469,
|
|
"grad_norm": 1.9134773015975952,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0991,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.006809445359692476,
|
|
"grad_norm": 2.54829478263855,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1199,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.0069192751235584845,
|
|
"grad_norm": 2.0665552616119385,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1242,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.007029104887424492,
|
|
"grad_norm": 1.9088149070739746,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1187,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.007138934651290499,
|
|
"grad_norm": 1.6347726583480835,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1087,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.007248764415156508,
|
|
"grad_norm": 1.8846579790115356,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.115,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.007358594179022515,
|
|
"grad_norm": 2.108840227127075,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1296,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.0074684239428885225,
|
|
"grad_norm": 2.0860307216644287,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1088,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.007578253706754531,
|
|
"grad_norm": 2.0085508823394775,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1208,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.007688083470620538,
|
|
"grad_norm": 1.8251607418060303,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1112,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.007797913234486546,
|
|
"grad_norm": 1.9405887126922607,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1014,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.007907742998352554,
|
|
"grad_norm": 2.248020887374878,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.107,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.008017572762218561,
|
|
"grad_norm": 1.9570263624191284,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1071,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.008127402526084569,
|
|
"grad_norm": 2.1239218711853027,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.133,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.008237232289950576,
|
|
"grad_norm": 1.7767431735992432,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.102,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.008347062053816584,
|
|
"grad_norm": 2.4217135906219482,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1004,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.008456891817682591,
|
|
"grad_norm": 1.89665687084198,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1241,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.0085667215815486,
|
|
"grad_norm": 1.431766152381897,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0849,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.008676551345414608,
|
|
"grad_norm": 2.6968767642974854,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1129,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.008786381109280615,
|
|
"grad_norm": 1.8317389488220215,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1033,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.008896210873146623,
|
|
"grad_norm": 2.0611159801483154,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1075,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.00900604063701263,
|
|
"grad_norm": 2.4678268432617188,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0914,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.009115870400878637,
|
|
"grad_norm": 2.097642183303833,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1033,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.009225700164744647,
|
|
"grad_norm": 1.9599785804748535,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1063,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.009335529928610654,
|
|
"grad_norm": 1.938198447227478,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0968,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.009445359692476661,
|
|
"grad_norm": 2.0601954460144043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1077,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.009555189456342669,
|
|
"grad_norm": 1.9235936403274536,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1218,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.009665019220208676,
|
|
"grad_norm": 1.6672967672348022,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0982,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.009774848984074684,
|
|
"grad_norm": 1.9302681684494019,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1124,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.009884678747940691,
|
|
"grad_norm": 2.2827959060668945,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1178,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.0099945085118067,
|
|
"grad_norm": 1.8714021444320679,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0984,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.010104338275672708,
|
|
"grad_norm": 2.1995835304260254,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1319,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.010214168039538715,
|
|
"grad_norm": 1.92769455909729,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1026,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.010323997803404723,
|
|
"grad_norm": 1.9699769020080566,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1189,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.01043382756727073,
|
|
"grad_norm": 2.7029881477355957,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1282,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.010543657331136738,
|
|
"grad_norm": 1.6077944040298462,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1096,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.010653487095002745,
|
|
"grad_norm": 2.0745413303375244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1154,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.010763316858868754,
|
|
"grad_norm": 1.8612251281738281,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1014,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.010873146622734762,
|
|
"grad_norm": 1.8795632123947144,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1029,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.010982976386600769,
|
|
"grad_norm": 1.8857154846191406,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1014,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.011092806150466776,
|
|
"grad_norm": 1.87457275390625,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1047,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.011202635914332784,
|
|
"grad_norm": 2.02274489402771,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.117,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.011312465678198791,
|
|
"grad_norm": 2.1100752353668213,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1116,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.0114222954420648,
|
|
"grad_norm": 2.1528773307800293,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1147,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.011532125205930808,
|
|
"grad_norm": 5.85520076751709,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1089,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.011641954969796815,
|
|
"grad_norm": 2.007204532623291,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1244,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.011751784733662823,
|
|
"grad_norm": 1.9761431217193604,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.104,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.01186161449752883,
|
|
"grad_norm": 1.6352622509002686,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1098,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.011971444261394838,
|
|
"grad_norm": 1.898520588874817,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1122,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.012081274025260845,
|
|
"grad_norm": 1.6044663190841675,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1044,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.012191103789126854,
|
|
"grad_norm": 1.7996292114257812,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1107,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.012300933552992862,
|
|
"grad_norm": 1.949839472770691,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1345,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.012410763316858869,
|
|
"grad_norm": 1.7750391960144043,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1023,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.012520593080724876,
|
|
"grad_norm": 1.8512459993362427,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0921,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.012630422844590884,
|
|
"grad_norm": 1.8420369625091553,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0949,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.012740252608456891,
|
|
"grad_norm": 1.885312795639038,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1118,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.012850082372322899,
|
|
"grad_norm": 2.293736457824707,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1227,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.012959912136188908,
|
|
"grad_norm": 1.9772549867630005,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1212,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.013069741900054915,
|
|
"grad_norm": 2.280238151550293,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1192,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.013179571663920923,
|
|
"grad_norm": 1.9888858795166016,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1027,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.01328940142778693,
|
|
"grad_norm": 1.5730584859848022,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0913,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.013399231191652938,
|
|
"grad_norm": 1.7493692636489868,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.097,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.013509060955518945,
|
|
"grad_norm": 2.1915624141693115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1217,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.013618890719384952,
|
|
"grad_norm": 2.0627121925354004,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1138,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.013728720483250962,
|
|
"grad_norm": 1.8700608015060425,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0957,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.013838550247116969,
|
|
"grad_norm": 4.867977142333984,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1143,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.013948380010982976,
|
|
"grad_norm": 1.6484566926956177,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0883,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.014058209774848984,
|
|
"grad_norm": 1.7756706476211548,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0992,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.014168039538714991,
|
|
"grad_norm": 23.05768394470215,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0779,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.014277869302580999,
|
|
"grad_norm": 1.8049583435058594,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1014,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.014387699066447008,
|
|
"grad_norm": 1.9289432764053345,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0958,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.014497528830313015,
|
|
"grad_norm": 1.9363269805908203,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1306,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.014607358594179023,
|
|
"grad_norm": 1.7127413749694824,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1015,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.01471718835804503,
|
|
"grad_norm": 2.036144733428955,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1002,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.014827018121911038,
|
|
"grad_norm": 2.464301347732544,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1119,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.014936847885777045,
|
|
"grad_norm": 1.9304972887039185,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1148,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.015046677649643053,
|
|
"grad_norm": 1.6702697277069092,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1113,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.015156507413509062,
|
|
"grad_norm": 1.804840326309204,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0956,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.015266337177375069,
|
|
"grad_norm": 1.7338131666183472,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.107,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.015376166941241077,
|
|
"grad_norm": 1.8909966945648193,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.093,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.015485996705107084,
|
|
"grad_norm": 1.957493543624878,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0968,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.015595826468973091,
|
|
"grad_norm": 2.450575590133667,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1133,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.0157056562328391,
|
|
"grad_norm": 2.1446874141693115,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1388,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.015815485996705108,
|
|
"grad_norm": 2.1485435962677,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1315,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.015925315760571115,
|
|
"grad_norm": 2.152599573135376,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1326,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.016035145524437123,
|
|
"grad_norm": 1.9794915914535522,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1277,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.01614497528830313,
|
|
"grad_norm": 1.7658240795135498,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1031,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.016254805052169138,
|
|
"grad_norm": 1.8528263568878174,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0952,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.016364634816035145,
|
|
"grad_norm": 1.5647423267364502,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1088,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.016474464579901153,
|
|
"grad_norm": 2.419384717941284,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0967,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.01658429434376716,
|
|
"grad_norm": 1.5838412046432495,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0839,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.016694124107633167,
|
|
"grad_norm": 2.1404707431793213,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1142,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.016803953871499175,
|
|
"grad_norm": 1.6462748050689697,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0947,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.016913783635365182,
|
|
"grad_norm": 2.057058811187744,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1123,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.017023613399231193,
|
|
"grad_norm": 1.7520567178726196,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1008,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.0171334431630972,
|
|
"grad_norm": 2.039196014404297,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1322,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.017243272926963208,
|
|
"grad_norm": 1.8729941844940186,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.102,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.017353102690829215,
|
|
"grad_norm": 1.9198112487792969,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1141,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.017462932454695223,
|
|
"grad_norm": 1.690664529800415,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0938,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.01757276221856123,
|
|
"grad_norm": 2.284759044647217,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1171,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.017682591982427238,
|
|
"grad_norm": 1.7743721008300781,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0976,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.017792421746293245,
|
|
"grad_norm": 2.1249804496765137,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0904,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.017902251510159253,
|
|
"grad_norm": 3.607625722885132,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1277,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.01801208127402526,
|
|
"grad_norm": 1.950108289718628,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1016,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.018121911037891267,
|
|
"grad_norm": 1.6242471933364868,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0919,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.018231740801757275,
|
|
"grad_norm": 2.4311513900756836,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1112,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.018341570565623282,
|
|
"grad_norm": 1.5507546663284302,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0981,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.018451400329489293,
|
|
"grad_norm": 1.9630911350250244,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1087,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.0185612300933553,
|
|
"grad_norm": 1.6163691282272339,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.092,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.018671059857221308,
|
|
"grad_norm": 1.647873878479004,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0867,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.018780889621087316,
|
|
"grad_norm": 2.0003228187561035,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1166,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.018890719384953323,
|
|
"grad_norm": 2.019808053970337,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1139,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.01900054914881933,
|
|
"grad_norm": 1.6541454792022705,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0857,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.019110378912685338,
|
|
"grad_norm": 2.194434642791748,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0919,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.019220208676551345,
|
|
"grad_norm": 2.8449411392211914,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.113,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.019330038440417353,
|
|
"grad_norm": 2.208855152130127,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0964,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.01943986820428336,
|
|
"grad_norm": 1.9803837537765503,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0968,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.019549697968149368,
|
|
"grad_norm": 1.8835409879684448,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1059,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.019659527732015375,
|
|
"grad_norm": 2.523775339126587,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.1217,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.019769357495881382,
|
|
"grad_norm": 2.160933494567871,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0974,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.01987918725974739,
|
|
"grad_norm": 1.6890003681182861,
|
|
"learning_rate": 1e-06,
|
|
"loss": 0.0856,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.01987918725974739,
|
|
"step": 181,
|
|
"total_flos": 0.0,
|
|
"train_loss": 0.05325274675755211,
|
|
"train_runtime": 2257.3022,
|
|
"train_samples_per_second": 1.283,
|
|
"train_steps_per_second": 0.08
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 181,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 91,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 0.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|