Files
Llama-3.1-8B-Instruct_SDFT_…/trainer_state.json

1311 lines
29 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.01987918725974739,
"eval_steps": 500,
"global_step": 181,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00010982976386600769,
"grad_norm": 18.411802291870117,
"learning_rate": 0.0,
"loss": 0.3251,
"step": 1
},
{
"epoch": 0.00021965952773201537,
"grad_norm": 14.977940559387207,
"learning_rate": 5.2631578947368416e-08,
"loss": 0.3606,
"step": 2
},
{
"epoch": 0.00032948929159802305,
"grad_norm": 14.84136962890625,
"learning_rate": 1.0526315789473683e-07,
"loss": 0.2777,
"step": 3
},
{
"epoch": 0.00043931905546403075,
"grad_norm": 15.656201362609863,
"learning_rate": 1.5789473684210525e-07,
"loss": 0.293,
"step": 4
},
{
"epoch": 0.0005491488193300384,
"grad_norm": 11.50175666809082,
"learning_rate": 2.1052631578947366e-07,
"loss": 0.271,
"step": 5
},
{
"epoch": 0.0006589785831960461,
"grad_norm": 13.30196762084961,
"learning_rate": 2.631578947368421e-07,
"loss": 0.3014,
"step": 6
},
{
"epoch": 0.0007688083470620538,
"grad_norm": 18.187772750854492,
"learning_rate": 3.157894736842105e-07,
"loss": 0.2838,
"step": 7
},
{
"epoch": 0.0008786381109280615,
"grad_norm": 12.342240333557129,
"learning_rate": 3.684210526315789e-07,
"loss": 0.2961,
"step": 8
},
{
"epoch": 0.0009884678747940692,
"grad_norm": 12.453307151794434,
"learning_rate": 4.2105263157894733e-07,
"loss": 0.2727,
"step": 9
},
{
"epoch": 0.001098297638660077,
"grad_norm": 13.236316680908203,
"learning_rate": 4.7368421052631574e-07,
"loss": 0.2777,
"step": 10
},
{
"epoch": 0.0012081274025260845,
"grad_norm": 9.186564445495605,
"learning_rate": 5.263157894736842e-07,
"loss": 0.2362,
"step": 11
},
{
"epoch": 0.0013179571663920922,
"grad_norm": 14.68601131439209,
"learning_rate": 5.789473684210526e-07,
"loss": 0.3246,
"step": 12
},
{
"epoch": 0.0014277869302581,
"grad_norm": 12.118836402893066,
"learning_rate": 6.31578947368421e-07,
"loss": 0.2334,
"step": 13
},
{
"epoch": 0.0015376166941241077,
"grad_norm": 9.764254570007324,
"learning_rate": 6.842105263157895e-07,
"loss": 0.2382,
"step": 14
},
{
"epoch": 0.0016474464579901153,
"grad_norm": 7.129339694976807,
"learning_rate": 7.368421052631578e-07,
"loss": 0.2399,
"step": 15
},
{
"epoch": 0.001757276221856123,
"grad_norm": 6.999264717102051,
"learning_rate": 7.894736842105263e-07,
"loss": 0.1915,
"step": 16
},
{
"epoch": 0.0018671059857221306,
"grad_norm": 6.912661075592041,
"learning_rate": 8.421052631578947e-07,
"loss": 0.1996,
"step": 17
},
{
"epoch": 0.0019769357495881385,
"grad_norm": 6.077408313751221,
"learning_rate": 8.947368421052631e-07,
"loss": 0.1915,
"step": 18
},
{
"epoch": 0.002086765513454146,
"grad_norm": 4.185582160949707,
"learning_rate": 9.473684210526315e-07,
"loss": 0.176,
"step": 19
},
{
"epoch": 0.002196595277320154,
"grad_norm": 2.6936287879943848,
"learning_rate": 1e-06,
"loss": 0.1309,
"step": 20
},
{
"epoch": 0.0023064250411861617,
"grad_norm": 3.5675835609436035,
"learning_rate": 1e-06,
"loss": 0.1515,
"step": 21
},
{
"epoch": 0.002416254805052169,
"grad_norm": 2.7305240631103516,
"learning_rate": 1e-06,
"loss": 0.132,
"step": 22
},
{
"epoch": 0.002526084568918177,
"grad_norm": 2.0667977333068848,
"learning_rate": 1e-06,
"loss": 0.107,
"step": 23
},
{
"epoch": 0.0026359143327841844,
"grad_norm": 2.3355114459991455,
"learning_rate": 1e-06,
"loss": 0.1252,
"step": 24
},
{
"epoch": 0.0027457440966501922,
"grad_norm": 2.30021333694458,
"learning_rate": 1e-06,
"loss": 0.1262,
"step": 25
},
{
"epoch": 0.0028555738605162,
"grad_norm": 2.639949083328247,
"learning_rate": 1e-06,
"loss": 0.1611,
"step": 26
},
{
"epoch": 0.0029654036243822075,
"grad_norm": 2.6157100200653076,
"learning_rate": 1e-06,
"loss": 0.1133,
"step": 27
},
{
"epoch": 0.0030752333882482154,
"grad_norm": 2.25714373588562,
"learning_rate": 1e-06,
"loss": 0.1175,
"step": 28
},
{
"epoch": 0.003185063152114223,
"grad_norm": 2.413874864578247,
"learning_rate": 1e-06,
"loss": 0.1335,
"step": 29
},
{
"epoch": 0.0032948929159802307,
"grad_norm": 2.9752049446105957,
"learning_rate": 1e-06,
"loss": 0.1549,
"step": 30
},
{
"epoch": 0.003404722679846238,
"grad_norm": 2.191883087158203,
"learning_rate": 1e-06,
"loss": 0.1072,
"step": 31
},
{
"epoch": 0.003514552443712246,
"grad_norm": 2.182326078414917,
"learning_rate": 1e-06,
"loss": 0.1322,
"step": 32
},
{
"epoch": 0.003624382207578254,
"grad_norm": 2.0499420166015625,
"learning_rate": 1e-06,
"loss": 0.1252,
"step": 33
},
{
"epoch": 0.0037342119714442613,
"grad_norm": 2.898209571838379,
"learning_rate": 1e-06,
"loss": 0.1564,
"step": 34
},
{
"epoch": 0.003844041735310269,
"grad_norm": 2.6807024478912354,
"learning_rate": 1e-06,
"loss": 0.1215,
"step": 35
},
{
"epoch": 0.003953871499176277,
"grad_norm": 2.6628167629241943,
"learning_rate": 1e-06,
"loss": 0.125,
"step": 36
},
{
"epoch": 0.004063701263042284,
"grad_norm": 2.3685622215270996,
"learning_rate": 1e-06,
"loss": 0.1423,
"step": 37
},
{
"epoch": 0.004173531026908292,
"grad_norm": 2.0525412559509277,
"learning_rate": 1e-06,
"loss": 0.1321,
"step": 38
},
{
"epoch": 0.0042833607907743,
"grad_norm": 2.064305305480957,
"learning_rate": 1e-06,
"loss": 0.1139,
"step": 39
},
{
"epoch": 0.004393190554640308,
"grad_norm": 2.255208969116211,
"learning_rate": 1e-06,
"loss": 0.1075,
"step": 40
},
{
"epoch": 0.004503020318506315,
"grad_norm": 2.1674306392669678,
"learning_rate": 1e-06,
"loss": 0.1231,
"step": 41
},
{
"epoch": 0.004612850082372323,
"grad_norm": 2.1255414485931396,
"learning_rate": 1e-06,
"loss": 0.149,
"step": 42
},
{
"epoch": 0.004722679846238331,
"grad_norm": 1.9542911052703857,
"learning_rate": 1e-06,
"loss": 0.1406,
"step": 43
},
{
"epoch": 0.004832509610104338,
"grad_norm": 2.3152379989624023,
"learning_rate": 1e-06,
"loss": 0.122,
"step": 44
},
{
"epoch": 0.004942339373970346,
"grad_norm": 2.2259297370910645,
"learning_rate": 1e-06,
"loss": 0.1273,
"step": 45
},
{
"epoch": 0.005052169137836354,
"grad_norm": 2.1162829399108887,
"learning_rate": 1e-06,
"loss": 0.1071,
"step": 46
},
{
"epoch": 0.005161998901702361,
"grad_norm": 2.2870123386383057,
"learning_rate": 1e-06,
"loss": 0.1222,
"step": 47
},
{
"epoch": 0.005271828665568369,
"grad_norm": 2.1178293228149414,
"learning_rate": 1e-06,
"loss": 0.1128,
"step": 48
},
{
"epoch": 0.005381658429434377,
"grad_norm": 2.811563730239868,
"learning_rate": 1e-06,
"loss": 0.113,
"step": 49
},
{
"epoch": 0.0054914881933003845,
"grad_norm": 1.9846251010894775,
"learning_rate": 1e-06,
"loss": 0.1083,
"step": 50
},
{
"epoch": 0.005601317957166392,
"grad_norm": 4.522186279296875,
"learning_rate": 1e-06,
"loss": 0.1283,
"step": 51
},
{
"epoch": 0.0057111477210324,
"grad_norm": 1.9514762163162231,
"learning_rate": 1e-06,
"loss": 0.117,
"step": 52
},
{
"epoch": 0.005820977484898408,
"grad_norm": 1.644494652748108,
"learning_rate": 1e-06,
"loss": 0.1097,
"step": 53
},
{
"epoch": 0.005930807248764415,
"grad_norm": 2.26704740524292,
"learning_rate": 1e-06,
"loss": 0.0996,
"step": 54
},
{
"epoch": 0.0060406370126304225,
"grad_norm": 1.8278131484985352,
"learning_rate": 1e-06,
"loss": 0.1175,
"step": 55
},
{
"epoch": 0.006150466776496431,
"grad_norm": 2.9004828929901123,
"learning_rate": 1e-06,
"loss": 0.118,
"step": 56
},
{
"epoch": 0.006260296540362438,
"grad_norm": 2.1837573051452637,
"learning_rate": 1e-06,
"loss": 0.1052,
"step": 57
},
{
"epoch": 0.006370126304228446,
"grad_norm": 2.207904577255249,
"learning_rate": 1e-06,
"loss": 0.1057,
"step": 58
},
{
"epoch": 0.006479956068094454,
"grad_norm": 1.937738060951233,
"learning_rate": 1e-06,
"loss": 0.1377,
"step": 59
},
{
"epoch": 0.006589785831960461,
"grad_norm": 1.8323129415512085,
"learning_rate": 1e-06,
"loss": 0.0962,
"step": 60
},
{
"epoch": 0.006699615595826469,
"grad_norm": 1.9134773015975952,
"learning_rate": 1e-06,
"loss": 0.0991,
"step": 61
},
{
"epoch": 0.006809445359692476,
"grad_norm": 2.54829478263855,
"learning_rate": 1e-06,
"loss": 0.1199,
"step": 62
},
{
"epoch": 0.0069192751235584845,
"grad_norm": 2.0665552616119385,
"learning_rate": 1e-06,
"loss": 0.1242,
"step": 63
},
{
"epoch": 0.007029104887424492,
"grad_norm": 1.9088149070739746,
"learning_rate": 1e-06,
"loss": 0.1187,
"step": 64
},
{
"epoch": 0.007138934651290499,
"grad_norm": 1.6347726583480835,
"learning_rate": 1e-06,
"loss": 0.1087,
"step": 65
},
{
"epoch": 0.007248764415156508,
"grad_norm": 1.8846579790115356,
"learning_rate": 1e-06,
"loss": 0.115,
"step": 66
},
{
"epoch": 0.007358594179022515,
"grad_norm": 2.108840227127075,
"learning_rate": 1e-06,
"loss": 0.1296,
"step": 67
},
{
"epoch": 0.0074684239428885225,
"grad_norm": 2.0860307216644287,
"learning_rate": 1e-06,
"loss": 0.1088,
"step": 68
},
{
"epoch": 0.007578253706754531,
"grad_norm": 2.0085508823394775,
"learning_rate": 1e-06,
"loss": 0.1208,
"step": 69
},
{
"epoch": 0.007688083470620538,
"grad_norm": 1.8251607418060303,
"learning_rate": 1e-06,
"loss": 0.1112,
"step": 70
},
{
"epoch": 0.007797913234486546,
"grad_norm": 1.9405887126922607,
"learning_rate": 1e-06,
"loss": 0.1014,
"step": 71
},
{
"epoch": 0.007907742998352554,
"grad_norm": 2.248020887374878,
"learning_rate": 1e-06,
"loss": 0.107,
"step": 72
},
{
"epoch": 0.008017572762218561,
"grad_norm": 1.9570263624191284,
"learning_rate": 1e-06,
"loss": 0.1071,
"step": 73
},
{
"epoch": 0.008127402526084569,
"grad_norm": 2.1239218711853027,
"learning_rate": 1e-06,
"loss": 0.133,
"step": 74
},
{
"epoch": 0.008237232289950576,
"grad_norm": 1.7767431735992432,
"learning_rate": 1e-06,
"loss": 0.102,
"step": 75
},
{
"epoch": 0.008347062053816584,
"grad_norm": 2.4217135906219482,
"learning_rate": 1e-06,
"loss": 0.1004,
"step": 76
},
{
"epoch": 0.008456891817682591,
"grad_norm": 1.89665687084198,
"learning_rate": 1e-06,
"loss": 0.1241,
"step": 77
},
{
"epoch": 0.0085667215815486,
"grad_norm": 1.431766152381897,
"learning_rate": 1e-06,
"loss": 0.0849,
"step": 78
},
{
"epoch": 0.008676551345414608,
"grad_norm": 2.6968767642974854,
"learning_rate": 1e-06,
"loss": 0.1129,
"step": 79
},
{
"epoch": 0.008786381109280615,
"grad_norm": 1.8317389488220215,
"learning_rate": 1e-06,
"loss": 0.1033,
"step": 80
},
{
"epoch": 0.008896210873146623,
"grad_norm": 2.0611159801483154,
"learning_rate": 1e-06,
"loss": 0.1075,
"step": 81
},
{
"epoch": 0.00900604063701263,
"grad_norm": 2.4678268432617188,
"learning_rate": 1e-06,
"loss": 0.0914,
"step": 82
},
{
"epoch": 0.009115870400878637,
"grad_norm": 2.097642183303833,
"learning_rate": 1e-06,
"loss": 0.1033,
"step": 83
},
{
"epoch": 0.009225700164744647,
"grad_norm": 1.9599785804748535,
"learning_rate": 1e-06,
"loss": 0.1063,
"step": 84
},
{
"epoch": 0.009335529928610654,
"grad_norm": 1.938198447227478,
"learning_rate": 1e-06,
"loss": 0.0968,
"step": 85
},
{
"epoch": 0.009445359692476661,
"grad_norm": 2.0601954460144043,
"learning_rate": 1e-06,
"loss": 0.1077,
"step": 86
},
{
"epoch": 0.009555189456342669,
"grad_norm": 1.9235936403274536,
"learning_rate": 1e-06,
"loss": 0.1218,
"step": 87
},
{
"epoch": 0.009665019220208676,
"grad_norm": 1.6672967672348022,
"learning_rate": 1e-06,
"loss": 0.0982,
"step": 88
},
{
"epoch": 0.009774848984074684,
"grad_norm": 1.9302681684494019,
"learning_rate": 1e-06,
"loss": 0.1124,
"step": 89
},
{
"epoch": 0.009884678747940691,
"grad_norm": 2.2827959060668945,
"learning_rate": 1e-06,
"loss": 0.1178,
"step": 90
},
{
"epoch": 0.0099945085118067,
"grad_norm": 1.8714021444320679,
"learning_rate": 1e-06,
"loss": 0.0984,
"step": 91
},
{
"epoch": 0.010104338275672708,
"grad_norm": 2.1995835304260254,
"learning_rate": 1e-06,
"loss": 0.1319,
"step": 92
},
{
"epoch": 0.010214168039538715,
"grad_norm": 1.92769455909729,
"learning_rate": 1e-06,
"loss": 0.1026,
"step": 93
},
{
"epoch": 0.010323997803404723,
"grad_norm": 1.9699769020080566,
"learning_rate": 1e-06,
"loss": 0.1189,
"step": 94
},
{
"epoch": 0.01043382756727073,
"grad_norm": 2.7029881477355957,
"learning_rate": 1e-06,
"loss": 0.1282,
"step": 95
},
{
"epoch": 0.010543657331136738,
"grad_norm": 1.6077944040298462,
"learning_rate": 1e-06,
"loss": 0.1096,
"step": 96
},
{
"epoch": 0.010653487095002745,
"grad_norm": 2.0745413303375244,
"learning_rate": 1e-06,
"loss": 0.1154,
"step": 97
},
{
"epoch": 0.010763316858868754,
"grad_norm": 1.8612251281738281,
"learning_rate": 1e-06,
"loss": 0.1014,
"step": 98
},
{
"epoch": 0.010873146622734762,
"grad_norm": 1.8795632123947144,
"learning_rate": 1e-06,
"loss": 0.1029,
"step": 99
},
{
"epoch": 0.010982976386600769,
"grad_norm": 1.8857154846191406,
"learning_rate": 1e-06,
"loss": 0.1014,
"step": 100
},
{
"epoch": 0.011092806150466776,
"grad_norm": 1.87457275390625,
"learning_rate": 1e-06,
"loss": 0.1047,
"step": 101
},
{
"epoch": 0.011202635914332784,
"grad_norm": 2.02274489402771,
"learning_rate": 1e-06,
"loss": 0.117,
"step": 102
},
{
"epoch": 0.011312465678198791,
"grad_norm": 2.1100752353668213,
"learning_rate": 1e-06,
"loss": 0.1116,
"step": 103
},
{
"epoch": 0.0114222954420648,
"grad_norm": 2.1528773307800293,
"learning_rate": 1e-06,
"loss": 0.1147,
"step": 104
},
{
"epoch": 0.011532125205930808,
"grad_norm": 5.85520076751709,
"learning_rate": 1e-06,
"loss": 0.1089,
"step": 105
},
{
"epoch": 0.011641954969796815,
"grad_norm": 2.007204532623291,
"learning_rate": 1e-06,
"loss": 0.1244,
"step": 106
},
{
"epoch": 0.011751784733662823,
"grad_norm": 1.9761431217193604,
"learning_rate": 1e-06,
"loss": 0.104,
"step": 107
},
{
"epoch": 0.01186161449752883,
"grad_norm": 1.6352622509002686,
"learning_rate": 1e-06,
"loss": 0.1098,
"step": 108
},
{
"epoch": 0.011971444261394838,
"grad_norm": 1.898520588874817,
"learning_rate": 1e-06,
"loss": 0.1122,
"step": 109
},
{
"epoch": 0.012081274025260845,
"grad_norm": 1.6044663190841675,
"learning_rate": 1e-06,
"loss": 0.1044,
"step": 110
},
{
"epoch": 0.012191103789126854,
"grad_norm": 1.7996292114257812,
"learning_rate": 1e-06,
"loss": 0.1107,
"step": 111
},
{
"epoch": 0.012300933552992862,
"grad_norm": 1.949839472770691,
"learning_rate": 1e-06,
"loss": 0.1345,
"step": 112
},
{
"epoch": 0.012410763316858869,
"grad_norm": 1.7750391960144043,
"learning_rate": 1e-06,
"loss": 0.1023,
"step": 113
},
{
"epoch": 0.012520593080724876,
"grad_norm": 1.8512459993362427,
"learning_rate": 1e-06,
"loss": 0.0921,
"step": 114
},
{
"epoch": 0.012630422844590884,
"grad_norm": 1.8420369625091553,
"learning_rate": 1e-06,
"loss": 0.0949,
"step": 115
},
{
"epoch": 0.012740252608456891,
"grad_norm": 1.885312795639038,
"learning_rate": 1e-06,
"loss": 0.1118,
"step": 116
},
{
"epoch": 0.012850082372322899,
"grad_norm": 2.293736457824707,
"learning_rate": 1e-06,
"loss": 0.1227,
"step": 117
},
{
"epoch": 0.012959912136188908,
"grad_norm": 1.9772549867630005,
"learning_rate": 1e-06,
"loss": 0.1212,
"step": 118
},
{
"epoch": 0.013069741900054915,
"grad_norm": 2.280238151550293,
"learning_rate": 1e-06,
"loss": 0.1192,
"step": 119
},
{
"epoch": 0.013179571663920923,
"grad_norm": 1.9888858795166016,
"learning_rate": 1e-06,
"loss": 0.1027,
"step": 120
},
{
"epoch": 0.01328940142778693,
"grad_norm": 1.5730584859848022,
"learning_rate": 1e-06,
"loss": 0.0913,
"step": 121
},
{
"epoch": 0.013399231191652938,
"grad_norm": 1.7493692636489868,
"learning_rate": 1e-06,
"loss": 0.097,
"step": 122
},
{
"epoch": 0.013509060955518945,
"grad_norm": 2.1915624141693115,
"learning_rate": 1e-06,
"loss": 0.1217,
"step": 123
},
{
"epoch": 0.013618890719384952,
"grad_norm": 2.0627121925354004,
"learning_rate": 1e-06,
"loss": 0.1138,
"step": 124
},
{
"epoch": 0.013728720483250962,
"grad_norm": 1.8700608015060425,
"learning_rate": 1e-06,
"loss": 0.0957,
"step": 125
},
{
"epoch": 0.013838550247116969,
"grad_norm": 4.867977142333984,
"learning_rate": 1e-06,
"loss": 0.1143,
"step": 126
},
{
"epoch": 0.013948380010982976,
"grad_norm": 1.6484566926956177,
"learning_rate": 1e-06,
"loss": 0.0883,
"step": 127
},
{
"epoch": 0.014058209774848984,
"grad_norm": 1.7756706476211548,
"learning_rate": 1e-06,
"loss": 0.0992,
"step": 128
},
{
"epoch": 0.014168039538714991,
"grad_norm": 23.05768394470215,
"learning_rate": 1e-06,
"loss": 0.0779,
"step": 129
},
{
"epoch": 0.014277869302580999,
"grad_norm": 1.8049583435058594,
"learning_rate": 1e-06,
"loss": 0.1014,
"step": 130
},
{
"epoch": 0.014387699066447008,
"grad_norm": 1.9289432764053345,
"learning_rate": 1e-06,
"loss": 0.0958,
"step": 131
},
{
"epoch": 0.014497528830313015,
"grad_norm": 1.9363269805908203,
"learning_rate": 1e-06,
"loss": 0.1306,
"step": 132
},
{
"epoch": 0.014607358594179023,
"grad_norm": 1.7127413749694824,
"learning_rate": 1e-06,
"loss": 0.1015,
"step": 133
},
{
"epoch": 0.01471718835804503,
"grad_norm": 2.036144733428955,
"learning_rate": 1e-06,
"loss": 0.1002,
"step": 134
},
{
"epoch": 0.014827018121911038,
"grad_norm": 2.464301347732544,
"learning_rate": 1e-06,
"loss": 0.1119,
"step": 135
},
{
"epoch": 0.014936847885777045,
"grad_norm": 1.9304972887039185,
"learning_rate": 1e-06,
"loss": 0.1148,
"step": 136
},
{
"epoch": 0.015046677649643053,
"grad_norm": 1.6702697277069092,
"learning_rate": 1e-06,
"loss": 0.1113,
"step": 137
},
{
"epoch": 0.015156507413509062,
"grad_norm": 1.804840326309204,
"learning_rate": 1e-06,
"loss": 0.0956,
"step": 138
},
{
"epoch": 0.015266337177375069,
"grad_norm": 1.7338131666183472,
"learning_rate": 1e-06,
"loss": 0.107,
"step": 139
},
{
"epoch": 0.015376166941241077,
"grad_norm": 1.8909966945648193,
"learning_rate": 1e-06,
"loss": 0.093,
"step": 140
},
{
"epoch": 0.015485996705107084,
"grad_norm": 1.957493543624878,
"learning_rate": 1e-06,
"loss": 0.0968,
"step": 141
},
{
"epoch": 0.015595826468973091,
"grad_norm": 2.450575590133667,
"learning_rate": 1e-06,
"loss": 0.1133,
"step": 142
},
{
"epoch": 0.0157056562328391,
"grad_norm": 2.1446874141693115,
"learning_rate": 1e-06,
"loss": 0.1388,
"step": 143
},
{
"epoch": 0.015815485996705108,
"grad_norm": 2.1485435962677,
"learning_rate": 1e-06,
"loss": 0.1315,
"step": 144
},
{
"epoch": 0.015925315760571115,
"grad_norm": 2.152599573135376,
"learning_rate": 1e-06,
"loss": 0.1326,
"step": 145
},
{
"epoch": 0.016035145524437123,
"grad_norm": 1.9794915914535522,
"learning_rate": 1e-06,
"loss": 0.1277,
"step": 146
},
{
"epoch": 0.01614497528830313,
"grad_norm": 1.7658240795135498,
"learning_rate": 1e-06,
"loss": 0.1031,
"step": 147
},
{
"epoch": 0.016254805052169138,
"grad_norm": 1.8528263568878174,
"learning_rate": 1e-06,
"loss": 0.0952,
"step": 148
},
{
"epoch": 0.016364634816035145,
"grad_norm": 1.5647423267364502,
"learning_rate": 1e-06,
"loss": 0.1088,
"step": 149
},
{
"epoch": 0.016474464579901153,
"grad_norm": 2.419384717941284,
"learning_rate": 1e-06,
"loss": 0.0967,
"step": 150
},
{
"epoch": 0.01658429434376716,
"grad_norm": 1.5838412046432495,
"learning_rate": 1e-06,
"loss": 0.0839,
"step": 151
},
{
"epoch": 0.016694124107633167,
"grad_norm": 2.1404707431793213,
"learning_rate": 1e-06,
"loss": 0.1142,
"step": 152
},
{
"epoch": 0.016803953871499175,
"grad_norm": 1.6462748050689697,
"learning_rate": 1e-06,
"loss": 0.0947,
"step": 153
},
{
"epoch": 0.016913783635365182,
"grad_norm": 2.057058811187744,
"learning_rate": 1e-06,
"loss": 0.1123,
"step": 154
},
{
"epoch": 0.017023613399231193,
"grad_norm": 1.7520567178726196,
"learning_rate": 1e-06,
"loss": 0.1008,
"step": 155
},
{
"epoch": 0.0171334431630972,
"grad_norm": 2.039196014404297,
"learning_rate": 1e-06,
"loss": 0.1322,
"step": 156
},
{
"epoch": 0.017243272926963208,
"grad_norm": 1.8729941844940186,
"learning_rate": 1e-06,
"loss": 0.102,
"step": 157
},
{
"epoch": 0.017353102690829215,
"grad_norm": 1.9198112487792969,
"learning_rate": 1e-06,
"loss": 0.1141,
"step": 158
},
{
"epoch": 0.017462932454695223,
"grad_norm": 1.690664529800415,
"learning_rate": 1e-06,
"loss": 0.0938,
"step": 159
},
{
"epoch": 0.01757276221856123,
"grad_norm": 2.284759044647217,
"learning_rate": 1e-06,
"loss": 0.1171,
"step": 160
},
{
"epoch": 0.017682591982427238,
"grad_norm": 1.7743721008300781,
"learning_rate": 1e-06,
"loss": 0.0976,
"step": 161
},
{
"epoch": 0.017792421746293245,
"grad_norm": 2.1249804496765137,
"learning_rate": 1e-06,
"loss": 0.0904,
"step": 162
},
{
"epoch": 0.017902251510159253,
"grad_norm": 3.607625722885132,
"learning_rate": 1e-06,
"loss": 0.1277,
"step": 163
},
{
"epoch": 0.01801208127402526,
"grad_norm": 1.950108289718628,
"learning_rate": 1e-06,
"loss": 0.1016,
"step": 164
},
{
"epoch": 0.018121911037891267,
"grad_norm": 1.6242471933364868,
"learning_rate": 1e-06,
"loss": 0.0919,
"step": 165
},
{
"epoch": 0.018231740801757275,
"grad_norm": 2.4311513900756836,
"learning_rate": 1e-06,
"loss": 0.1112,
"step": 166
},
{
"epoch": 0.018341570565623282,
"grad_norm": 1.5507546663284302,
"learning_rate": 1e-06,
"loss": 0.0981,
"step": 167
},
{
"epoch": 0.018451400329489293,
"grad_norm": 1.9630911350250244,
"learning_rate": 1e-06,
"loss": 0.1087,
"step": 168
},
{
"epoch": 0.0185612300933553,
"grad_norm": 1.6163691282272339,
"learning_rate": 1e-06,
"loss": 0.092,
"step": 169
},
{
"epoch": 0.018671059857221308,
"grad_norm": 1.647873878479004,
"learning_rate": 1e-06,
"loss": 0.0867,
"step": 170
},
{
"epoch": 0.018780889621087316,
"grad_norm": 2.0003228187561035,
"learning_rate": 1e-06,
"loss": 0.1166,
"step": 171
},
{
"epoch": 0.018890719384953323,
"grad_norm": 2.019808053970337,
"learning_rate": 1e-06,
"loss": 0.1139,
"step": 172
},
{
"epoch": 0.01900054914881933,
"grad_norm": 1.6541454792022705,
"learning_rate": 1e-06,
"loss": 0.0857,
"step": 173
},
{
"epoch": 0.019110378912685338,
"grad_norm": 2.194434642791748,
"learning_rate": 1e-06,
"loss": 0.0919,
"step": 174
},
{
"epoch": 0.019220208676551345,
"grad_norm": 2.8449411392211914,
"learning_rate": 1e-06,
"loss": 0.113,
"step": 175
},
{
"epoch": 0.019330038440417353,
"grad_norm": 2.208855152130127,
"learning_rate": 1e-06,
"loss": 0.0964,
"step": 176
},
{
"epoch": 0.01943986820428336,
"grad_norm": 1.9803837537765503,
"learning_rate": 1e-06,
"loss": 0.0968,
"step": 177
},
{
"epoch": 0.019549697968149368,
"grad_norm": 1.8835409879684448,
"learning_rate": 1e-06,
"loss": 0.1059,
"step": 178
},
{
"epoch": 0.019659527732015375,
"grad_norm": 2.523775339126587,
"learning_rate": 1e-06,
"loss": 0.1217,
"step": 179
},
{
"epoch": 0.019769357495881382,
"grad_norm": 2.160933494567871,
"learning_rate": 1e-06,
"loss": 0.0974,
"step": 180
},
{
"epoch": 0.01987918725974739,
"grad_norm": 1.6890003681182861,
"learning_rate": 1e-06,
"loss": 0.0856,
"step": 181
},
{
"epoch": 0.01987918725974739,
"step": 181,
"total_flos": 0.0,
"train_loss": 0.05325274675755211,
"train_runtime": 2257.3022,
"train_samples_per_second": 1.283,
"train_steps_per_second": 0.08
}
],
"logging_steps": 1,
"max_steps": 181,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 91,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}