Files
qwen3-8b-sft-datamix-350/trainer_state.json

1575 lines
37 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022727272727272728,
"grad_norm": 26.87245830061202,
"learning_rate": 0.0,
"loss": 1.0691,
"step": 1
},
{
"epoch": 0.045454545454545456,
"grad_norm": 24.119318241718858,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.989,
"step": 2
},
{
"epoch": 0.06818181818181818,
"grad_norm": 24.494901097246274,
"learning_rate": 6.666666666666667e-06,
"loss": 0.9827,
"step": 3
},
{
"epoch": 0.09090909090909091,
"grad_norm": 8.720154396981934,
"learning_rate": 1e-05,
"loss": 0.6482,
"step": 4
},
{
"epoch": 0.11363636363636363,
"grad_norm": 4.966678220163427,
"learning_rate": 9.999476022424688e-06,
"loss": 0.5896,
"step": 5
},
{
"epoch": 0.13636363636363635,
"grad_norm": 2.206240528335694,
"learning_rate": 9.997904199519748e-06,
"loss": 0.392,
"step": 6
},
{
"epoch": 0.1590909090909091,
"grad_norm": 3.9617287120048257,
"learning_rate": 9.995284860725162e-06,
"loss": 0.4349,
"step": 7
},
{
"epoch": 0.18181818181818182,
"grad_norm": 2.7218886368671966,
"learning_rate": 9.991618555030848e-06,
"loss": 0.3502,
"step": 8
},
{
"epoch": 0.20454545454545456,
"grad_norm": 1.3224686041620617,
"learning_rate": 9.986906050861595e-06,
"loss": 0.342,
"step": 9
},
{
"epoch": 0.22727272727272727,
"grad_norm": 1.4655470659379455,
"learning_rate": 9.981148335916e-06,
"loss": 0.3311,
"step": 10
},
{
"epoch": 0.25,
"grad_norm": 2.099411259913016,
"learning_rate": 9.974346616959476e-06,
"loss": 0.3786,
"step": 11
},
{
"epoch": 0.2727272727272727,
"grad_norm": 1.5150343483501494,
"learning_rate": 9.966502319571303e-06,
"loss": 0.2944,
"step": 12
},
{
"epoch": 0.29545454545454547,
"grad_norm": 1.8955486495667653,
"learning_rate": 9.95761708784585e-06,
"loss": 0.3586,
"step": 13
},
{
"epoch": 0.3181818181818182,
"grad_norm": 1.0770593717657149,
"learning_rate": 9.94769278404799e-06,
"loss": 0.3189,
"step": 14
},
{
"epoch": 0.3409090909090909,
"grad_norm": 1.8091317213751315,
"learning_rate": 9.936731488222776e-06,
"loss": 0.3416,
"step": 15
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.29274261320313,
"learning_rate": 9.924735497759497e-06,
"loss": 0.3399,
"step": 16
},
{
"epoch": 0.38636363636363635,
"grad_norm": 1.269837989850999,
"learning_rate": 9.911707326910145e-06,
"loss": 0.3543,
"step": 17
},
{
"epoch": 0.4090909090909091,
"grad_norm": 1.5013795426439296,
"learning_rate": 9.897649706262474e-06,
"loss": 0.2903,
"step": 18
},
{
"epoch": 0.4318181818181818,
"grad_norm": 1.4088618552574337,
"learning_rate": 9.882565582167673e-06,
"loss": 0.2845,
"step": 19
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.5165819472750817,
"learning_rate": 9.866458116122852e-06,
"loss": 0.316,
"step": 20
},
{
"epoch": 0.4772727272727273,
"grad_norm": 1.6134481868835353,
"learning_rate": 9.849330684108409e-06,
"loss": 0.2928,
"step": 21
},
{
"epoch": 0.5,
"grad_norm": 1.1143653341061437,
"learning_rate": 9.831186875880467e-06,
"loss": 0.276,
"step": 22
},
{
"epoch": 0.5227272727272727,
"grad_norm": 0.9830408617009574,
"learning_rate": 9.812030494218484e-06,
"loss": 0.313,
"step": 23
},
{
"epoch": 0.5454545454545454,
"grad_norm": 1.3736364481102779,
"learning_rate": 9.79186555412822e-06,
"loss": 0.3023,
"step": 24
},
{
"epoch": 0.5681818181818182,
"grad_norm": 1.338556634218699,
"learning_rate": 9.770696282000245e-06,
"loss": 0.3273,
"step": 25
},
{
"epoch": 0.5909090909090909,
"grad_norm": 1.34887345898166,
"learning_rate": 9.748527114724111e-06,
"loss": 0.3059,
"step": 26
},
{
"epoch": 0.6136363636363636,
"grad_norm": 1.1465526284754688,
"learning_rate": 9.725362698758425e-06,
"loss": 0.254,
"step": 27
},
{
"epoch": 0.6363636363636364,
"grad_norm": 1.1328089241339367,
"learning_rate": 9.701207889156989e-06,
"loss": 0.2727,
"step": 28
},
{
"epoch": 0.6590909090909091,
"grad_norm": 1.4785097164649903,
"learning_rate": 9.676067748551232e-06,
"loss": 0.314,
"step": 29
},
{
"epoch": 0.6818181818181818,
"grad_norm": 1.2861366584159655,
"learning_rate": 9.64994754608912e-06,
"loss": 0.3216,
"step": 30
},
{
"epoch": 0.7045454545454546,
"grad_norm": 1.254630631559985,
"learning_rate": 9.622852756330797e-06,
"loss": 0.2671,
"step": 31
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.4601173539398735,
"learning_rate": 9.594789058101154e-06,
"loss": 0.283,
"step": 32
},
{
"epoch": 0.75,
"grad_norm": 0.9800010703607837,
"learning_rate": 9.565762333299616e-06,
"loss": 0.2176,
"step": 33
},
{
"epoch": 0.7727272727272727,
"grad_norm": 1.585779573547555,
"learning_rate": 9.535778665667334e-06,
"loss": 0.3186,
"step": 34
},
{
"epoch": 0.7954545454545454,
"grad_norm": 1.3270309012768746,
"learning_rate": 9.504844339512096e-06,
"loss": 0.334,
"step": 35
},
{
"epoch": 0.8181818181818182,
"grad_norm": 1.2326173009325117,
"learning_rate": 9.472965838391187e-06,
"loss": 0.2808,
"step": 36
},
{
"epoch": 0.8409090909090909,
"grad_norm": 1.1558051437795536,
"learning_rate": 9.44014984375249e-06,
"loss": 0.2117,
"step": 37
},
{
"epoch": 0.8636363636363636,
"grad_norm": 1.0782911468120715,
"learning_rate": 9.406403233534134e-06,
"loss": 0.2824,
"step": 38
},
{
"epoch": 0.8863636363636364,
"grad_norm": 1.5406724902243696,
"learning_rate": 9.371733080722911e-06,
"loss": 0.2335,
"step": 39
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.2044242055409695,
"learning_rate": 9.33614665187187e-06,
"loss": 0.2499,
"step": 40
},
{
"epoch": 0.9318181818181818,
"grad_norm": 1.2616965501514557,
"learning_rate": 9.299651405577286e-06,
"loss": 0.2438,
"step": 41
},
{
"epoch": 0.9545454545454546,
"grad_norm": 1.1136761921157818,
"learning_rate": 9.262254990915427e-06,
"loss": 0.2785,
"step": 42
},
{
"epoch": 0.9772727272727273,
"grad_norm": 0.9966948364040108,
"learning_rate": 9.223965245839367e-06,
"loss": 0.2597,
"step": 43
},
{
"epoch": 1.0,
"grad_norm": 1.3791645613025802,
"learning_rate": 9.184790195536217e-06,
"loss": 0.2679,
"step": 44
},
{
"epoch": 1.0227272727272727,
"grad_norm": 1.179410268749222,
"learning_rate": 9.144738050745129e-06,
"loss": 0.181,
"step": 45
},
{
"epoch": 1.0454545454545454,
"grad_norm": 1.1815878438627367,
"learning_rate": 9.103817206036383e-06,
"loss": 0.1863,
"step": 46
},
{
"epoch": 1.0681818181818181,
"grad_norm": 0.8101413228519797,
"learning_rate": 9.062036238051978e-06,
"loss": 0.1843,
"step": 47
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.9532028129997955,
"learning_rate": 9.019403903708036e-06,
"loss": 0.1732,
"step": 48
},
{
"epoch": 1.1136363636363635,
"grad_norm": 0.992565251308887,
"learning_rate": 8.975929138359423e-06,
"loss": 0.2059,
"step": 49
},
{
"epoch": 1.1363636363636362,
"grad_norm": 0.9117404975458566,
"learning_rate": 8.931621053926998e-06,
"loss": 0.2237,
"step": 50
},
{
"epoch": 1.1590909090909092,
"grad_norm": 0.8003178422788053,
"learning_rate": 8.886488936987817e-06,
"loss": 0.1334,
"step": 51
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.2363455579654716,
"learning_rate": 8.840542246828763e-06,
"loss": 0.2168,
"step": 52
},
{
"epoch": 1.2045454545454546,
"grad_norm": 1.2347708733857203,
"learning_rate": 8.793790613463956e-06,
"loss": 0.175,
"step": 53
},
{
"epoch": 1.2272727272727273,
"grad_norm": 1.2303702228676998,
"learning_rate": 8.746243835616392e-06,
"loss": 0.1787,
"step": 54
},
{
"epoch": 1.25,
"grad_norm": 1.2497191463046406,
"learning_rate": 8.697911878664222e-06,
"loss": 0.1739,
"step": 55
},
{
"epoch": 1.2727272727272727,
"grad_norm": 1.3951861645180035,
"learning_rate": 8.648804872552092e-06,
"loss": 0.1847,
"step": 56
},
{
"epoch": 1.2954545454545454,
"grad_norm": 1.245991460551998,
"learning_rate": 8.598933109667995e-06,
"loss": 0.1351,
"step": 57
},
{
"epoch": 1.3181818181818181,
"grad_norm": 1.32907622391414,
"learning_rate": 8.548307042686093e-06,
"loss": 0.1546,
"step": 58
},
{
"epoch": 1.3409090909090908,
"grad_norm": 1.4968562879002865,
"learning_rate": 8.496937282375912e-06,
"loss": 0.2356,
"step": 59
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.9737096273924404,
"learning_rate": 8.444834595378434e-06,
"loss": 0.1335,
"step": 60
},
{
"epoch": 1.3863636363636362,
"grad_norm": 1.3589415450025601,
"learning_rate": 8.3920099019495e-06,
"loss": 0.1363,
"step": 61
},
{
"epoch": 1.4090909090909092,
"grad_norm": 0.8664968714166548,
"learning_rate": 8.33847427367102e-06,
"loss": 0.1056,
"step": 62
},
{
"epoch": 1.4318181818181819,
"grad_norm": 1.0430422759251574,
"learning_rate": 8.284238931130476e-06,
"loss": 0.1827,
"step": 63
},
{
"epoch": 1.4545454545454546,
"grad_norm": 1.0086301864952136,
"learning_rate": 8.229315241569177e-06,
"loss": 0.1398,
"step": 64
},
{
"epoch": 1.4772727272727273,
"grad_norm": 0.9279203416268156,
"learning_rate": 8.173714716499801e-06,
"loss": 0.157,
"step": 65
},
{
"epoch": 1.5,
"grad_norm": 1.1440300758673703,
"learning_rate": 8.117449009293668e-06,
"loss": 0.1685,
"step": 66
},
{
"epoch": 1.5227272727272727,
"grad_norm": 1.1751439805537514,
"learning_rate": 8.060529912738316e-06,
"loss": 0.1572,
"step": 67
},
{
"epoch": 1.5454545454545454,
"grad_norm": 1.0578357890566388,
"learning_rate": 8.002969356565822e-06,
"loss": 0.1598,
"step": 68
},
{
"epoch": 1.5681818181818183,
"grad_norm": 1.3193928833299897,
"learning_rate": 7.94477940495245e-06,
"loss": 0.1854,
"step": 69
},
{
"epoch": 1.5909090909090908,
"grad_norm": 1.2978218132135766,
"learning_rate": 7.885972253990104e-06,
"loss": 0.1743,
"step": 70
},
{
"epoch": 1.6136363636363638,
"grad_norm": 1.0105258814245202,
"learning_rate": 7.826560229130132e-06,
"loss": 0.1987,
"step": 71
},
{
"epoch": 1.6363636363636362,
"grad_norm": 1.004604979048799,
"learning_rate": 7.766555782600023e-06,
"loss": 0.1795,
"step": 72
},
{
"epoch": 1.6590909090909092,
"grad_norm": 1.1179470989774414,
"learning_rate": 7.70597149079354e-06,
"loss": 0.1815,
"step": 73
},
{
"epoch": 1.6818181818181817,
"grad_norm": 1.166448144503895,
"learning_rate": 7.644820051634813e-06,
"loss": 0.1617,
"step": 74
},
{
"epoch": 1.7045454545454546,
"grad_norm": 0.9473819093100403,
"learning_rate": 7.5831142819169664e-06,
"loss": 0.1282,
"step": 75
},
{
"epoch": 1.7272727272727273,
"grad_norm": 1.3707253122758942,
"learning_rate": 7.520867114615844e-06,
"loss": 0.1843,
"step": 76
},
{
"epoch": 1.75,
"grad_norm": 0.8753193774986169,
"learning_rate": 7.458091596179359e-06,
"loss": 0.1205,
"step": 77
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.7554237843642733,
"learning_rate": 7.394800883793087e-06,
"loss": 0.0983,
"step": 78
},
{
"epoch": 1.7954545454545454,
"grad_norm": 1.1973348363016902,
"learning_rate": 7.331008242622637e-06,
"loss": 0.1848,
"step": 79
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.1860354581221395,
"learning_rate": 7.266727043033386e-06,
"loss": 0.1527,
"step": 80
},
{
"epoch": 1.8409090909090908,
"grad_norm": 1.4128677081083227,
"learning_rate": 7.201970757788172e-06,
"loss": 0.1602,
"step": 81
},
{
"epoch": 1.8636363636363638,
"grad_norm": 1.296872182127621,
"learning_rate": 7.136752959223527e-06,
"loss": 0.2184,
"step": 82
},
{
"epoch": 1.8863636363636362,
"grad_norm": 1.4836645345593107,
"learning_rate": 7.071087316405037e-06,
"loss": 0.2896,
"step": 83
},
{
"epoch": 1.9090909090909092,
"grad_norm": 1.1976778490053432,
"learning_rate": 7.00498759226242e-06,
"loss": 0.1659,
"step": 84
},
{
"epoch": 1.9318181818181817,
"grad_norm": 0.9975231348395474,
"learning_rate": 6.938467640704953e-06,
"loss": 0.1535,
"step": 85
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.9648415033602733,
"learning_rate": 6.871541403717808e-06,
"loss": 0.1753,
"step": 86
},
{
"epoch": 1.9772727272727273,
"grad_norm": 1.2019101492420445,
"learning_rate": 6.8042229084399325e-06,
"loss": 0.1562,
"step": 87
},
{
"epoch": 2.0,
"grad_norm": 0.9443355724586839,
"learning_rate": 6.736526264224101e-06,
"loss": 0.1196,
"step": 88
},
{
"epoch": 2.022727272727273,
"grad_norm": 1.000147993017032,
"learning_rate": 6.668465659679714e-06,
"loss": 0.1105,
"step": 89
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.9077637361309605,
"learning_rate": 6.600055359698984e-06,
"loss": 0.1359,
"step": 90
},
{
"epoch": 2.0681818181818183,
"grad_norm": 0.8899974700309011,
"learning_rate": 6.531309702467159e-06,
"loss": 0.1051,
"step": 91
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.7405665057218146,
"learning_rate": 6.462243096457352e-06,
"loss": 0.0949,
"step": 92
},
{
"epoch": 2.1136363636363638,
"grad_norm": 0.9232739478624769,
"learning_rate": 6.392870017410665e-06,
"loss": 0.0869,
"step": 93
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.907192376363368,
"learning_rate": 6.323205005302199e-06,
"loss": 0.085,
"step": 94
},
{
"epoch": 2.159090909090909,
"grad_norm": 0.9510815353153362,
"learning_rate": 6.2532626612936035e-06,
"loss": 0.1041,
"step": 95
},
{
"epoch": 2.1818181818181817,
"grad_norm": 1.0694010726357495,
"learning_rate": 6.18305764467281e-06,
"loss": 0.0933,
"step": 96
},
{
"epoch": 2.2045454545454546,
"grad_norm": 0.9096286210344772,
"learning_rate": 6.112604669781572e-06,
"loss": 0.0672,
"step": 97
},
{
"epoch": 2.227272727272727,
"grad_norm": 1.433917507625707,
"learning_rate": 6.041918502931473e-06,
"loss": 0.0879,
"step": 98
},
{
"epoch": 2.25,
"grad_norm": 0.9531885718869322,
"learning_rate": 5.971013959309038e-06,
"loss": 0.0596,
"step": 99
},
{
"epoch": 2.2727272727272725,
"grad_norm": 1.1923953651383123,
"learning_rate": 5.8999058998706046e-06,
"loss": 0.0788,
"step": 100
},
{
"epoch": 2.2954545454545454,
"grad_norm": 0.9554826618737247,
"learning_rate": 5.828609228227603e-06,
"loss": 0.073,
"step": 101
},
{
"epoch": 2.3181818181818183,
"grad_norm": 1.0620618620218882,
"learning_rate": 5.757138887522884e-06,
"loss": 0.0852,
"step": 102
},
{
"epoch": 2.340909090909091,
"grad_norm": 1.1580458870328374,
"learning_rate": 5.685509857298781e-06,
"loss": 0.1011,
"step": 103
},
{
"epoch": 2.3636363636363638,
"grad_norm": 1.5883148285084483,
"learning_rate": 5.613737150357528e-06,
"loss": 0.0791,
"step": 104
},
{
"epoch": 2.3863636363636362,
"grad_norm": 1.2367508056402248,
"learning_rate": 5.541835809614704e-06,
"loss": 0.0654,
"step": 105
},
{
"epoch": 2.409090909090909,
"grad_norm": 2.2791052878714653,
"learning_rate": 5.469820904946383e-06,
"loss": 0.087,
"step": 106
},
{
"epoch": 2.4318181818181817,
"grad_norm": 1.2009177571989036,
"learning_rate": 5.397707530030621e-06,
"loss": 0.0754,
"step": 107
},
{
"epoch": 2.4545454545454546,
"grad_norm": 1.24186865246545,
"learning_rate": 5.325510799183953e-06,
"loss": 0.0676,
"step": 108
},
{
"epoch": 2.4772727272727275,
"grad_norm": 1.3626254215524685,
"learning_rate": 5.253245844193564e-06,
"loss": 0.0897,
"step": 109
},
{
"epoch": 2.5,
"grad_norm": 1.2267940513161908,
"learning_rate": 5.180927811145818e-06,
"loss": 0.081,
"step": 110
},
{
"epoch": 2.5227272727272725,
"grad_norm": 1.0280554800159314,
"learning_rate": 5.108571857251754e-06,
"loss": 0.0998,
"step": 111
},
{
"epoch": 2.5454545454545454,
"grad_norm": 1.198523585670272,
"learning_rate": 5.036193147670286e-06,
"loss": 0.0943,
"step": 112
},
{
"epoch": 2.5681818181818183,
"grad_norm": 1.0299128746931727,
"learning_rate": 4.963806852329715e-06,
"loss": 0.0867,
"step": 113
},
{
"epoch": 2.590909090909091,
"grad_norm": 1.0781930889668705,
"learning_rate": 4.891428142748247e-06,
"loss": 0.0935,
"step": 114
},
{
"epoch": 2.6136363636363638,
"grad_norm": 1.3870007299043179,
"learning_rate": 4.819072188854183e-06,
"loss": 0.1038,
"step": 115
},
{
"epoch": 2.6363636363636362,
"grad_norm": 1.2763920896506822,
"learning_rate": 4.746754155806437e-06,
"loss": 0.1066,
"step": 116
},
{
"epoch": 2.659090909090909,
"grad_norm": 1.1553721597437976,
"learning_rate": 4.674489200816051e-06,
"loss": 0.0727,
"step": 117
},
{
"epoch": 2.6818181818181817,
"grad_norm": 1.2201292764615486,
"learning_rate": 4.602292469969381e-06,
"loss": 0.1029,
"step": 118
},
{
"epoch": 2.7045454545454546,
"grad_norm": 1.0182353763504708,
"learning_rate": 4.5301790950536175e-06,
"loss": 0.081,
"step": 119
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.8638534963225072,
"learning_rate": 4.458164190385297e-06,
"loss": 0.0743,
"step": 120
},
{
"epoch": 2.75,
"grad_norm": 1.179392830465865,
"learning_rate": 4.386262849642474e-06,
"loss": 0.1008,
"step": 121
},
{
"epoch": 2.7727272727272725,
"grad_norm": 0.8679825281753464,
"learning_rate": 4.31449014270122e-06,
"loss": 0.0493,
"step": 122
},
{
"epoch": 2.7954545454545454,
"grad_norm": 1.1253980579124658,
"learning_rate": 4.2428611124771184e-06,
"loss": 0.0848,
"step": 123
},
{
"epoch": 2.8181818181818183,
"grad_norm": 1.0356630565413245,
"learning_rate": 4.171390771772399e-06,
"loss": 0.068,
"step": 124
},
{
"epoch": 2.840909090909091,
"grad_norm": 1.3477554175880992,
"learning_rate": 4.100094100129396e-06,
"loss": 0.1043,
"step": 125
},
{
"epoch": 2.8636363636363638,
"grad_norm": 0.97866201549117,
"learning_rate": 4.028986040690963e-06,
"loss": 0.0847,
"step": 126
},
{
"epoch": 2.8863636363636362,
"grad_norm": 1.1349756610536412,
"learning_rate": 3.958081497068528e-06,
"loss": 0.0792,
"step": 127
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.9146117017482877,
"learning_rate": 3.887395330218429e-06,
"loss": 0.0611,
"step": 128
},
{
"epoch": 2.9318181818181817,
"grad_norm": 1.0807764635079844,
"learning_rate": 3.816942355327191e-06,
"loss": 0.0904,
"step": 129
},
{
"epoch": 2.9545454545454546,
"grad_norm": 1.1271897510030453,
"learning_rate": 3.7467373387063973e-06,
"loss": 0.0769,
"step": 130
},
{
"epoch": 2.9772727272727275,
"grad_norm": 1.079134591854598,
"learning_rate": 3.6767949946978026e-06,
"loss": 0.0936,
"step": 131
},
{
"epoch": 3.0,
"grad_norm": 1.0766678958231195,
"learning_rate": 3.607129982589337e-06,
"loss": 0.0836,
"step": 132
},
{
"epoch": 3.022727272727273,
"grad_norm": 0.6602194945121888,
"learning_rate": 3.5377569035426494e-06,
"loss": 0.0432,
"step": 133
},
{
"epoch": 3.0454545454545454,
"grad_norm": 0.9207788970106073,
"learning_rate": 3.468690297532843e-06,
"loss": 0.0614,
"step": 134
},
{
"epoch": 3.0681818181818183,
"grad_norm": 0.7792864367119448,
"learning_rate": 3.3999446403010156e-06,
"loss": 0.044,
"step": 135
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.7824193339459661,
"learning_rate": 3.331534340320287e-06,
"loss": 0.0299,
"step": 136
},
{
"epoch": 3.1136363636363638,
"grad_norm": 1.035803750265937,
"learning_rate": 3.2634737357758994e-06,
"loss": 0.0481,
"step": 137
},
{
"epoch": 3.1363636363636362,
"grad_norm": 0.7826621695998633,
"learning_rate": 3.1957770915600696e-06,
"loss": 0.0388,
"step": 138
},
{
"epoch": 3.159090909090909,
"grad_norm": 0.6958054164558033,
"learning_rate": 3.1284585962821957e-06,
"loss": 0.0351,
"step": 139
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.9734916299688532,
"learning_rate": 3.0615323592950495e-06,
"loss": 0.0458,
"step": 140
},
{
"epoch": 3.2045454545454546,
"grad_norm": 0.9750452432170936,
"learning_rate": 2.995012407737581e-06,
"loss": 0.044,
"step": 141
},
{
"epoch": 3.227272727272727,
"grad_norm": 1.1459789012585446,
"learning_rate": 2.9289126835949657e-06,
"loss": 0.0663,
"step": 142
},
{
"epoch": 3.25,
"grad_norm": 1.1249770693101788,
"learning_rate": 2.8632470407764746e-06,
"loss": 0.0431,
"step": 143
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.9864994227169254,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.0606,
"step": 144
},
{
"epoch": 3.2954545454545454,
"grad_norm": 1.0092348933533415,
"learning_rate": 2.733272956966615e-06,
"loss": 0.0538,
"step": 145
},
{
"epoch": 3.3181818181818183,
"grad_norm": 1.39535927779365,
"learning_rate": 2.6689917573773615e-06,
"loss": 0.0531,
"step": 146
},
{
"epoch": 3.340909090909091,
"grad_norm": 0.9511300286527435,
"learning_rate": 2.605199116206912e-06,
"loss": 0.0382,
"step": 147
},
{
"epoch": 3.3636363636363638,
"grad_norm": 1.2541730166057663,
"learning_rate": 2.5419084038206422e-06,
"loss": 0.0419,
"step": 148
},
{
"epoch": 3.3863636363636362,
"grad_norm": 1.0095638491761618,
"learning_rate": 2.4791328853841577e-06,
"loss": 0.0434,
"step": 149
},
{
"epoch": 3.409090909090909,
"grad_norm": 0.8262732933318356,
"learning_rate": 2.416885718083035e-06,
"loss": 0.0322,
"step": 150
},
{
"epoch": 3.4318181818181817,
"grad_norm": 0.8137374498919325,
"learning_rate": 2.3551799483651894e-06,
"loss": 0.0308,
"step": 151
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.8629006626767369,
"learning_rate": 2.294028509206461e-06,
"loss": 0.0459,
"step": 152
},
{
"epoch": 3.4772727272727275,
"grad_norm": 0.75511924638048,
"learning_rate": 2.2334442173999794e-06,
"loss": 0.0304,
"step": 153
},
{
"epoch": 3.5,
"grad_norm": 0.765294235454733,
"learning_rate": 2.17343977086987e-06,
"loss": 0.0436,
"step": 154
},
{
"epoch": 3.5227272727272725,
"grad_norm": 0.8507628894917487,
"learning_rate": 2.114027746009897e-06,
"loss": 0.0277,
"step": 155
},
{
"epoch": 3.5454545454545454,
"grad_norm": 1.1371497063801275,
"learning_rate": 2.055220595047551e-06,
"loss": 0.0463,
"step": 156
},
{
"epoch": 3.5681818181818183,
"grad_norm": 1.0468638172133997,
"learning_rate": 1.9970306434341806e-06,
"loss": 0.0354,
"step": 157
},
{
"epoch": 3.590909090909091,
"grad_norm": 0.8300360380363072,
"learning_rate": 1.9394700872616856e-06,
"loss": 0.0377,
"step": 158
},
{
"epoch": 3.6136363636363638,
"grad_norm": 1.2431358466370912,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.0407,
"step": 159
},
{
"epoch": 3.6363636363636362,
"grad_norm": 1.0116784265871086,
"learning_rate": 1.826285283500201e-06,
"loss": 0.0506,
"step": 160
},
{
"epoch": 3.659090909090909,
"grad_norm": 0.9351206425735656,
"learning_rate": 1.770684758430824e-06,
"loss": 0.0383,
"step": 161
},
{
"epoch": 3.6818181818181817,
"grad_norm": 0.7608293362619839,
"learning_rate": 1.7157610688695248e-06,
"loss": 0.0251,
"step": 162
},
{
"epoch": 3.7045454545454546,
"grad_norm": 1.0486186821729864,
"learning_rate": 1.6615257263289809e-06,
"loss": 0.0354,
"step": 163
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.8511206701235292,
"learning_rate": 1.607990098050501e-06,
"loss": 0.0375,
"step": 164
},
{
"epoch": 3.75,
"grad_norm": 0.9639257216156043,
"learning_rate": 1.555165404621567e-06,
"loss": 0.0406,
"step": 165
},
{
"epoch": 3.7727272727272725,
"grad_norm": 0.9665035321570787,
"learning_rate": 1.5030627176240903e-06,
"loss": 0.0386,
"step": 166
},
{
"epoch": 3.7954545454545454,
"grad_norm": 0.8935090470273086,
"learning_rate": 1.45169295731391e-06,
"loss": 0.0351,
"step": 167
},
{
"epoch": 3.8181818181818183,
"grad_norm": 1.1337183807356483,
"learning_rate": 1.4010668903320068e-06,
"loss": 0.0267,
"step": 168
},
{
"epoch": 3.840909090909091,
"grad_norm": 1.0191173211814406,
"learning_rate": 1.3511951274479096e-06,
"loss": 0.03,
"step": 169
},
{
"epoch": 3.8636363636363638,
"grad_norm": 0.8969739545223467,
"learning_rate": 1.3020881213357783e-06,
"loss": 0.0433,
"step": 170
},
{
"epoch": 3.8863636363636362,
"grad_norm": 0.8374168117834856,
"learning_rate": 1.2537561643836087e-06,
"loss": 0.0272,
"step": 171
},
{
"epoch": 3.909090909090909,
"grad_norm": 1.0969039495955495,
"learning_rate": 1.2062093865360458e-06,
"loss": 0.052,
"step": 172
},
{
"epoch": 3.9318181818181817,
"grad_norm": 0.7906472210562279,
"learning_rate": 1.1594577531712392e-06,
"loss": 0.0492,
"step": 173
},
{
"epoch": 3.9545454545454546,
"grad_norm": 0.8214507384737139,
"learning_rate": 1.1135110630121837e-06,
"loss": 0.036,
"step": 174
},
{
"epoch": 3.9772727272727275,
"grad_norm": 0.8916597776872088,
"learning_rate": 1.0683789460730037e-06,
"loss": 0.0358,
"step": 175
},
{
"epoch": 4.0,
"grad_norm": 0.9910806202679314,
"learning_rate": 1.0240708616405788e-06,
"loss": 0.0471,
"step": 176
},
{
"epoch": 4.0227272727272725,
"grad_norm": 0.8017333339630908,
"learning_rate": 9.80596096291967e-07,
"loss": 0.0421,
"step": 177
},
{
"epoch": 4.045454545454546,
"grad_norm": 0.6088709030309449,
"learning_rate": 9.379637619480236e-07,
"loss": 0.0197,
"step": 178
},
{
"epoch": 4.068181818181818,
"grad_norm": 0.5056506758915681,
"learning_rate": 8.961827939636198e-07,
"loss": 0.0161,
"step": 179
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.7756562125854171,
"learning_rate": 8.552619492548736e-07,
"loss": 0.0268,
"step": 180
},
{
"epoch": 4.113636363636363,
"grad_norm": 0.5669855251712181,
"learning_rate": 8.15209804463783e-07,
"loss": 0.0202,
"step": 181
},
{
"epoch": 4.136363636363637,
"grad_norm": 0.6473780755422711,
"learning_rate": 7.760347541606339e-07,
"loss": 0.0307,
"step": 182
},
{
"epoch": 4.159090909090909,
"grad_norm": 0.5361573992028992,
"learning_rate": 7.377450090845733e-07,
"loss": 0.0215,
"step": 183
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.6191327927742057,
"learning_rate": 7.003485944227162e-07,
"loss": 0.0297,
"step": 184
},
{
"epoch": 4.204545454545454,
"grad_norm": 0.7740351242770065,
"learning_rate": 6.638533481281323e-07,
"loss": 0.0277,
"step": 185
},
{
"epoch": 4.2272727272727275,
"grad_norm": 0.8036694531204502,
"learning_rate": 6.282669192770896e-07,
"loss": 0.0337,
"step": 186
},
{
"epoch": 4.25,
"grad_norm": 0.5569549110731221,
"learning_rate": 5.935967664658682e-07,
"loss": 0.023,
"step": 187
},
{
"epoch": 4.2727272727272725,
"grad_norm": 0.43268894985216455,
"learning_rate": 5.598501562475111e-07,
"loss": 0.015,
"step": 188
},
{
"epoch": 4.295454545454546,
"grad_norm": 0.5798546765255864,
"learning_rate": 5.270341616088153e-07,
"loss": 0.0161,
"step": 189
},
{
"epoch": 4.318181818181818,
"grad_norm": 0.667306809133945,
"learning_rate": 4.951556604879049e-07,
"loss": 0.0207,
"step": 190
},
{
"epoch": 4.340909090909091,
"grad_norm": 0.5559259388326303,
"learning_rate": 4.6422133433266513e-07,
"loss": 0.0154,
"step": 191
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.48805637239261085,
"learning_rate": 4.342376667003845e-07,
"loss": 0.0133,
"step": 192
},
{
"epoch": 4.386363636363637,
"grad_norm": 0.633760717360346,
"learning_rate": 4.05210941898847e-07,
"loss": 0.0166,
"step": 193
},
{
"epoch": 4.409090909090909,
"grad_norm": 0.4941494223196582,
"learning_rate": 3.771472436692053e-07,
"loss": 0.0136,
"step": 194
},
{
"epoch": 4.431818181818182,
"grad_norm": 0.694276868602463,
"learning_rate": 3.500524539108807e-07,
"loss": 0.0218,
"step": 195
},
{
"epoch": 4.454545454545454,
"grad_norm": 0.588819670846474,
"learning_rate": 3.239322514487686e-07,
"loss": 0.0176,
"step": 196
},
{
"epoch": 4.4772727272727275,
"grad_norm": 0.5265383444966927,
"learning_rate": 2.9879211084301194e-07,
"loss": 0.0151,
"step": 197
},
{
"epoch": 4.5,
"grad_norm": 0.44878162692299606,
"learning_rate": 2.7463730124157706e-07,
"loss": 0.014,
"step": 198
},
{
"epoch": 4.5227272727272725,
"grad_norm": 0.6569401749445752,
"learning_rate": 2.5147288527588964e-07,
"loss": 0.0164,
"step": 199
},
{
"epoch": 4.545454545454545,
"grad_norm": 1.2770158126939868,
"learning_rate": 2.2930371799975593e-07,
"loss": 0.0218,
"step": 200
},
{
"epoch": 4.568181818181818,
"grad_norm": 0.5855843139833002,
"learning_rate": 2.0813444587178156e-07,
"loss": 0.0168,
"step": 201
},
{
"epoch": 4.590909090909091,
"grad_norm": 0.6376332689792783,
"learning_rate": 1.8796950578151785e-07,
"loss": 0.0152,
"step": 202
},
{
"epoch": 4.613636363636363,
"grad_norm": 0.6329781274553996,
"learning_rate": 1.6881312411953288e-07,
"loss": 0.0111,
"step": 203
},
{
"epoch": 4.636363636363637,
"grad_norm": 0.9765159855301974,
"learning_rate": 1.5066931589159118e-07,
"loss": 0.0278,
"step": 204
},
{
"epoch": 4.659090909090909,
"grad_norm": 0.6973740006107351,
"learning_rate": 1.3354188387715017e-07,
"loss": 0.022,
"step": 205
},
{
"epoch": 4.681818181818182,
"grad_norm": 0.5499873946022198,
"learning_rate": 1.174344178323289e-07,
"loss": 0.0155,
"step": 206
},
{
"epoch": 4.704545454545455,
"grad_norm": 0.7453329709429668,
"learning_rate": 1.0235029373752758e-07,
"loss": 0.0274,
"step": 207
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.4712400999164701,
"learning_rate": 8.829267308985535e-08,
"loss": 0.0128,
"step": 208
},
{
"epoch": 4.75,
"grad_norm": 0.791960861976085,
"learning_rate": 7.526450224050407e-08,
"loss": 0.0206,
"step": 209
},
{
"epoch": 4.7727272727272725,
"grad_norm": 0.8100989026594698,
"learning_rate": 6.326851177722304e-08,
"loss": 0.026,
"step": 210
},
{
"epoch": 4.795454545454545,
"grad_norm": 0.5698845908406204,
"learning_rate": 5.230721595201049e-08,
"loss": 0.0115,
"step": 211
},
{
"epoch": 4.818181818181818,
"grad_norm": 0.7463531903553184,
"learning_rate": 4.2382912154150244e-08,
"loss": 0.0178,
"step": 212
},
{
"epoch": 4.840909090909091,
"grad_norm": 0.7813627200692648,
"learning_rate": 3.3497680428697943e-08,
"loss": 0.0163,
"step": 213
},
{
"epoch": 4.863636363636363,
"grad_norm": 0.66982841564412,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.0186,
"step": 214
},
{
"epoch": 4.886363636363637,
"grad_norm": 0.4637052800834321,
"learning_rate": 1.8851664083999742e-08,
"loss": 0.013,
"step": 215
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.43304208586446546,
"learning_rate": 1.3093949138406892e-08,
"loss": 0.0081,
"step": 216
},
{
"epoch": 4.931818181818182,
"grad_norm": 0.8973836308599885,
"learning_rate": 8.381444969151608e-09,
"loss": 0.0276,
"step": 217
},
{
"epoch": 4.954545454545455,
"grad_norm": 0.6757448220546857,
"learning_rate": 4.7151392748379095e-09,
"loss": 0.0185,
"step": 218
},
{
"epoch": 4.9772727272727275,
"grad_norm": 0.6378204095313933,
"learning_rate": 2.0958004802529297e-09,
"loss": 0.0239,
"step": 219
},
{
"epoch": 5.0,
"grad_norm": 0.7520281673571331,
"learning_rate": 5.239775753129728e-10,
"loss": 0.0294,
"step": 220
}
],
"logging_steps": 1,
"max_steps": 220,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 75,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 14632660697088.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}