{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 220, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022727272727272728, "grad_norm": 26.87245830061202, "learning_rate": 0.0, "loss": 1.0691, "step": 1 }, { "epoch": 0.045454545454545456, "grad_norm": 24.119318241718858, "learning_rate": 3.3333333333333333e-06, "loss": 0.989, "step": 2 }, { "epoch": 0.06818181818181818, "grad_norm": 24.494901097246274, "learning_rate": 6.666666666666667e-06, "loss": 0.9827, "step": 3 }, { "epoch": 0.09090909090909091, "grad_norm": 8.720154396981934, "learning_rate": 1e-05, "loss": 0.6482, "step": 4 }, { "epoch": 0.11363636363636363, "grad_norm": 4.966678220163427, "learning_rate": 9.999476022424688e-06, "loss": 0.5896, "step": 5 }, { "epoch": 0.13636363636363635, "grad_norm": 2.206240528335694, "learning_rate": 9.997904199519748e-06, "loss": 0.392, "step": 6 }, { "epoch": 0.1590909090909091, "grad_norm": 3.9617287120048257, "learning_rate": 9.995284860725162e-06, "loss": 0.4349, "step": 7 }, { "epoch": 0.18181818181818182, "grad_norm": 2.7218886368671966, "learning_rate": 9.991618555030848e-06, "loss": 0.3502, "step": 8 }, { "epoch": 0.20454545454545456, "grad_norm": 1.3224686041620617, "learning_rate": 9.986906050861595e-06, "loss": 0.342, "step": 9 }, { "epoch": 0.22727272727272727, "grad_norm": 1.4655470659379455, "learning_rate": 9.981148335916e-06, "loss": 0.3311, "step": 10 }, { "epoch": 0.25, "grad_norm": 2.099411259913016, "learning_rate": 9.974346616959476e-06, "loss": 0.3786, "step": 11 }, { "epoch": 0.2727272727272727, "grad_norm": 1.5150343483501494, "learning_rate": 9.966502319571303e-06, "loss": 0.2944, "step": 12 }, { "epoch": 0.29545454545454547, "grad_norm": 1.8955486495667653, "learning_rate": 9.95761708784585e-06, "loss": 0.3586, "step": 13 }, { "epoch": 0.3181818181818182, "grad_norm": 1.0770593717657149, "learning_rate": 9.94769278404799e-06, "loss": 0.3189, "step": 14 }, { "epoch": 0.3409090909090909, "grad_norm": 1.8091317213751315, "learning_rate": 9.936731488222776e-06, "loss": 0.3416, "step": 15 }, { "epoch": 0.36363636363636365, "grad_norm": 2.29274261320313, "learning_rate": 9.924735497759497e-06, "loss": 0.3399, "step": 16 }, { "epoch": 0.38636363636363635, "grad_norm": 1.269837989850999, "learning_rate": 9.911707326910145e-06, "loss": 0.3543, "step": 17 }, { "epoch": 0.4090909090909091, "grad_norm": 1.5013795426439296, "learning_rate": 9.897649706262474e-06, "loss": 0.2903, "step": 18 }, { "epoch": 0.4318181818181818, "grad_norm": 1.4088618552574337, "learning_rate": 9.882565582167673e-06, "loss": 0.2845, "step": 19 }, { "epoch": 0.45454545454545453, "grad_norm": 1.5165819472750817, "learning_rate": 9.866458116122852e-06, "loss": 0.316, "step": 20 }, { "epoch": 0.4772727272727273, "grad_norm": 1.6134481868835353, "learning_rate": 9.849330684108409e-06, "loss": 0.2928, "step": 21 }, { "epoch": 0.5, "grad_norm": 1.1143653341061437, "learning_rate": 9.831186875880467e-06, "loss": 0.276, "step": 22 }, { "epoch": 0.5227272727272727, "grad_norm": 0.9830408617009574, "learning_rate": 9.812030494218484e-06, "loss": 0.313, "step": 23 }, { "epoch": 0.5454545454545454, "grad_norm": 1.3736364481102779, "learning_rate": 9.79186555412822e-06, "loss": 0.3023, "step": 24 }, { "epoch": 0.5681818181818182, "grad_norm": 1.338556634218699, "learning_rate": 9.770696282000245e-06, "loss": 0.3273, "step": 25 }, { "epoch": 0.5909090909090909, "grad_norm": 1.34887345898166, "learning_rate": 9.748527114724111e-06, "loss": 0.3059, "step": 26 }, { "epoch": 0.6136363636363636, "grad_norm": 1.1465526284754688, "learning_rate": 9.725362698758425e-06, "loss": 0.254, "step": 27 }, { "epoch": 0.6363636363636364, "grad_norm": 1.1328089241339367, "learning_rate": 9.701207889156989e-06, "loss": 0.2727, "step": 28 }, { "epoch": 0.6590909090909091, "grad_norm": 1.4785097164649903, "learning_rate": 9.676067748551232e-06, "loss": 0.314, "step": 29 }, { "epoch": 0.6818181818181818, "grad_norm": 1.2861366584159655, "learning_rate": 9.64994754608912e-06, "loss": 0.3216, "step": 30 }, { "epoch": 0.7045454545454546, "grad_norm": 1.254630631559985, "learning_rate": 9.622852756330797e-06, "loss": 0.2671, "step": 31 }, { "epoch": 0.7272727272727273, "grad_norm": 1.4601173539398735, "learning_rate": 9.594789058101154e-06, "loss": 0.283, "step": 32 }, { "epoch": 0.75, "grad_norm": 0.9800010703607837, "learning_rate": 9.565762333299616e-06, "loss": 0.2176, "step": 33 }, { "epoch": 0.7727272727272727, "grad_norm": 1.585779573547555, "learning_rate": 9.535778665667334e-06, "loss": 0.3186, "step": 34 }, { "epoch": 0.7954545454545454, "grad_norm": 1.3270309012768746, "learning_rate": 9.504844339512096e-06, "loss": 0.334, "step": 35 }, { "epoch": 0.8181818181818182, "grad_norm": 1.2326173009325117, "learning_rate": 9.472965838391187e-06, "loss": 0.2808, "step": 36 }, { "epoch": 0.8409090909090909, "grad_norm": 1.1558051437795536, "learning_rate": 9.44014984375249e-06, "loss": 0.2117, "step": 37 }, { "epoch": 0.8636363636363636, "grad_norm": 1.0782911468120715, "learning_rate": 9.406403233534134e-06, "loss": 0.2824, "step": 38 }, { "epoch": 0.8863636363636364, "grad_norm": 1.5406724902243696, "learning_rate": 9.371733080722911e-06, "loss": 0.2335, "step": 39 }, { "epoch": 0.9090909090909091, "grad_norm": 1.2044242055409695, "learning_rate": 9.33614665187187e-06, "loss": 0.2499, "step": 40 }, { "epoch": 0.9318181818181818, "grad_norm": 1.2616965501514557, "learning_rate": 9.299651405577286e-06, "loss": 0.2438, "step": 41 }, { "epoch": 0.9545454545454546, "grad_norm": 1.1136761921157818, "learning_rate": 9.262254990915427e-06, "loss": 0.2785, "step": 42 }, { "epoch": 0.9772727272727273, "grad_norm": 0.9966948364040108, "learning_rate": 9.223965245839367e-06, "loss": 0.2597, "step": 43 }, { "epoch": 1.0, "grad_norm": 1.3791645613025802, "learning_rate": 9.184790195536217e-06, "loss": 0.2679, "step": 44 }, { "epoch": 1.0227272727272727, "grad_norm": 1.179410268749222, "learning_rate": 9.144738050745129e-06, "loss": 0.181, "step": 45 }, { "epoch": 1.0454545454545454, "grad_norm": 1.1815878438627367, "learning_rate": 9.103817206036383e-06, "loss": 0.1863, "step": 46 }, { "epoch": 1.0681818181818181, "grad_norm": 0.8101413228519797, "learning_rate": 9.062036238051978e-06, "loss": 0.1843, "step": 47 }, { "epoch": 1.0909090909090908, "grad_norm": 0.9532028129997955, "learning_rate": 9.019403903708036e-06, "loss": 0.1732, "step": 48 }, { "epoch": 1.1136363636363635, "grad_norm": 0.992565251308887, "learning_rate": 8.975929138359423e-06, "loss": 0.2059, "step": 49 }, { "epoch": 1.1363636363636362, "grad_norm": 0.9117404975458566, "learning_rate": 8.931621053926998e-06, "loss": 0.2237, "step": 50 }, { "epoch": 1.1590909090909092, "grad_norm": 0.8003178422788053, "learning_rate": 8.886488936987817e-06, "loss": 0.1334, "step": 51 }, { "epoch": 1.1818181818181819, "grad_norm": 1.2363455579654716, "learning_rate": 8.840542246828763e-06, "loss": 0.2168, "step": 52 }, { "epoch": 1.2045454545454546, "grad_norm": 1.2347708733857203, "learning_rate": 8.793790613463956e-06, "loss": 0.175, "step": 53 }, { "epoch": 1.2272727272727273, "grad_norm": 1.2303702228676998, "learning_rate": 8.746243835616392e-06, "loss": 0.1787, "step": 54 }, { "epoch": 1.25, "grad_norm": 1.2497191463046406, "learning_rate": 8.697911878664222e-06, "loss": 0.1739, "step": 55 }, { "epoch": 1.2727272727272727, "grad_norm": 1.3951861645180035, "learning_rate": 8.648804872552092e-06, "loss": 0.1847, "step": 56 }, { "epoch": 1.2954545454545454, "grad_norm": 1.245991460551998, "learning_rate": 8.598933109667995e-06, "loss": 0.1351, "step": 57 }, { "epoch": 1.3181818181818181, "grad_norm": 1.32907622391414, "learning_rate": 8.548307042686093e-06, "loss": 0.1546, "step": 58 }, { "epoch": 1.3409090909090908, "grad_norm": 1.4968562879002865, "learning_rate": 8.496937282375912e-06, "loss": 0.2356, "step": 59 }, { "epoch": 1.3636363636363638, "grad_norm": 0.9737096273924404, "learning_rate": 8.444834595378434e-06, "loss": 0.1335, "step": 60 }, { "epoch": 1.3863636363636362, "grad_norm": 1.3589415450025601, "learning_rate": 8.3920099019495e-06, "loss": 0.1363, "step": 61 }, { "epoch": 1.4090909090909092, "grad_norm": 0.8664968714166548, "learning_rate": 8.33847427367102e-06, "loss": 0.1056, "step": 62 }, { "epoch": 1.4318181818181819, "grad_norm": 1.0430422759251574, "learning_rate": 8.284238931130476e-06, "loss": 0.1827, "step": 63 }, { "epoch": 1.4545454545454546, "grad_norm": 1.0086301864952136, "learning_rate": 8.229315241569177e-06, "loss": 0.1398, "step": 64 }, { "epoch": 1.4772727272727273, "grad_norm": 0.9279203416268156, "learning_rate": 8.173714716499801e-06, "loss": 0.157, "step": 65 }, { "epoch": 1.5, "grad_norm": 1.1440300758673703, "learning_rate": 8.117449009293668e-06, "loss": 0.1685, "step": 66 }, { "epoch": 1.5227272727272727, "grad_norm": 1.1751439805537514, "learning_rate": 8.060529912738316e-06, "loss": 0.1572, "step": 67 }, { "epoch": 1.5454545454545454, "grad_norm": 1.0578357890566388, "learning_rate": 8.002969356565822e-06, "loss": 0.1598, "step": 68 }, { "epoch": 1.5681818181818183, "grad_norm": 1.3193928833299897, "learning_rate": 7.94477940495245e-06, "loss": 0.1854, "step": 69 }, { "epoch": 1.5909090909090908, "grad_norm": 1.2978218132135766, "learning_rate": 7.885972253990104e-06, "loss": 0.1743, "step": 70 }, { "epoch": 1.6136363636363638, "grad_norm": 1.0105258814245202, "learning_rate": 7.826560229130132e-06, "loss": 0.1987, "step": 71 }, { "epoch": 1.6363636363636362, "grad_norm": 1.004604979048799, "learning_rate": 7.766555782600023e-06, "loss": 0.1795, "step": 72 }, { "epoch": 1.6590909090909092, "grad_norm": 1.1179470989774414, "learning_rate": 7.70597149079354e-06, "loss": 0.1815, "step": 73 }, { "epoch": 1.6818181818181817, "grad_norm": 1.166448144503895, "learning_rate": 7.644820051634813e-06, "loss": 0.1617, "step": 74 }, { "epoch": 1.7045454545454546, "grad_norm": 0.9473819093100403, "learning_rate": 7.5831142819169664e-06, "loss": 0.1282, "step": 75 }, { "epoch": 1.7272727272727273, "grad_norm": 1.3707253122758942, "learning_rate": 7.520867114615844e-06, "loss": 0.1843, "step": 76 }, { "epoch": 1.75, "grad_norm": 0.8753193774986169, "learning_rate": 7.458091596179359e-06, "loss": 0.1205, "step": 77 }, { "epoch": 1.7727272727272727, "grad_norm": 0.7554237843642733, "learning_rate": 7.394800883793087e-06, "loss": 0.0983, "step": 78 }, { "epoch": 1.7954545454545454, "grad_norm": 1.1973348363016902, "learning_rate": 7.331008242622637e-06, "loss": 0.1848, "step": 79 }, { "epoch": 1.8181818181818183, "grad_norm": 1.1860354581221395, "learning_rate": 7.266727043033386e-06, "loss": 0.1527, "step": 80 }, { "epoch": 1.8409090909090908, "grad_norm": 1.4128677081083227, "learning_rate": 7.201970757788172e-06, "loss": 0.1602, "step": 81 }, { "epoch": 1.8636363636363638, "grad_norm": 1.296872182127621, "learning_rate": 7.136752959223527e-06, "loss": 0.2184, "step": 82 }, { "epoch": 1.8863636363636362, "grad_norm": 1.4836645345593107, "learning_rate": 7.071087316405037e-06, "loss": 0.2896, "step": 83 }, { "epoch": 1.9090909090909092, "grad_norm": 1.1976778490053432, "learning_rate": 7.00498759226242e-06, "loss": 0.1659, "step": 84 }, { "epoch": 1.9318181818181817, "grad_norm": 0.9975231348395474, "learning_rate": 6.938467640704953e-06, "loss": 0.1535, "step": 85 }, { "epoch": 1.9545454545454546, "grad_norm": 0.9648415033602733, "learning_rate": 6.871541403717808e-06, "loss": 0.1753, "step": 86 }, { "epoch": 1.9772727272727273, "grad_norm": 1.2019101492420445, "learning_rate": 6.8042229084399325e-06, "loss": 0.1562, "step": 87 }, { "epoch": 2.0, "grad_norm": 0.9443355724586839, "learning_rate": 6.736526264224101e-06, "loss": 0.1196, "step": 88 }, { "epoch": 2.022727272727273, "grad_norm": 1.000147993017032, "learning_rate": 6.668465659679714e-06, "loss": 0.1105, "step": 89 }, { "epoch": 2.0454545454545454, "grad_norm": 0.9077637361309605, "learning_rate": 6.600055359698984e-06, "loss": 0.1359, "step": 90 }, { "epoch": 2.0681818181818183, "grad_norm": 0.8899974700309011, "learning_rate": 6.531309702467159e-06, "loss": 0.1051, "step": 91 }, { "epoch": 2.090909090909091, "grad_norm": 0.7405665057218146, "learning_rate": 6.462243096457352e-06, "loss": 0.0949, "step": 92 }, { "epoch": 2.1136363636363638, "grad_norm": 0.9232739478624769, "learning_rate": 6.392870017410665e-06, "loss": 0.0869, "step": 93 }, { "epoch": 2.1363636363636362, "grad_norm": 0.907192376363368, "learning_rate": 6.323205005302199e-06, "loss": 0.085, "step": 94 }, { "epoch": 2.159090909090909, "grad_norm": 0.9510815353153362, "learning_rate": 6.2532626612936035e-06, "loss": 0.1041, "step": 95 }, { "epoch": 2.1818181818181817, "grad_norm": 1.0694010726357495, "learning_rate": 6.18305764467281e-06, "loss": 0.0933, "step": 96 }, { "epoch": 2.2045454545454546, "grad_norm": 0.9096286210344772, "learning_rate": 6.112604669781572e-06, "loss": 0.0672, "step": 97 }, { "epoch": 2.227272727272727, "grad_norm": 1.433917507625707, "learning_rate": 6.041918502931473e-06, "loss": 0.0879, "step": 98 }, { "epoch": 2.25, "grad_norm": 0.9531885718869322, "learning_rate": 5.971013959309038e-06, "loss": 0.0596, "step": 99 }, { "epoch": 2.2727272727272725, "grad_norm": 1.1923953651383123, "learning_rate": 5.8999058998706046e-06, "loss": 0.0788, "step": 100 }, { "epoch": 2.2954545454545454, "grad_norm": 0.9554826618737247, "learning_rate": 5.828609228227603e-06, "loss": 0.073, "step": 101 }, { "epoch": 2.3181818181818183, "grad_norm": 1.0620618620218882, "learning_rate": 5.757138887522884e-06, "loss": 0.0852, "step": 102 }, { "epoch": 2.340909090909091, "grad_norm": 1.1580458870328374, "learning_rate": 5.685509857298781e-06, "loss": 0.1011, "step": 103 }, { "epoch": 2.3636363636363638, "grad_norm": 1.5883148285084483, "learning_rate": 5.613737150357528e-06, "loss": 0.0791, "step": 104 }, { "epoch": 2.3863636363636362, "grad_norm": 1.2367508056402248, "learning_rate": 5.541835809614704e-06, "loss": 0.0654, "step": 105 }, { "epoch": 2.409090909090909, "grad_norm": 2.2791052878714653, "learning_rate": 5.469820904946383e-06, "loss": 0.087, "step": 106 }, { "epoch": 2.4318181818181817, "grad_norm": 1.2009177571989036, "learning_rate": 5.397707530030621e-06, "loss": 0.0754, "step": 107 }, { "epoch": 2.4545454545454546, "grad_norm": 1.24186865246545, "learning_rate": 5.325510799183953e-06, "loss": 0.0676, "step": 108 }, { "epoch": 2.4772727272727275, "grad_norm": 1.3626254215524685, "learning_rate": 5.253245844193564e-06, "loss": 0.0897, "step": 109 }, { "epoch": 2.5, "grad_norm": 1.2267940513161908, "learning_rate": 5.180927811145818e-06, "loss": 0.081, "step": 110 }, { "epoch": 2.5227272727272725, "grad_norm": 1.0280554800159314, "learning_rate": 5.108571857251754e-06, "loss": 0.0998, "step": 111 }, { "epoch": 2.5454545454545454, "grad_norm": 1.198523585670272, "learning_rate": 5.036193147670286e-06, "loss": 0.0943, "step": 112 }, { "epoch": 2.5681818181818183, "grad_norm": 1.0299128746931727, "learning_rate": 4.963806852329715e-06, "loss": 0.0867, "step": 113 }, { "epoch": 2.590909090909091, "grad_norm": 1.0781930889668705, "learning_rate": 4.891428142748247e-06, "loss": 0.0935, "step": 114 }, { "epoch": 2.6136363636363638, "grad_norm": 1.3870007299043179, "learning_rate": 4.819072188854183e-06, "loss": 0.1038, "step": 115 }, { "epoch": 2.6363636363636362, "grad_norm": 1.2763920896506822, "learning_rate": 4.746754155806437e-06, "loss": 0.1066, "step": 116 }, { "epoch": 2.659090909090909, "grad_norm": 1.1553721597437976, "learning_rate": 4.674489200816051e-06, "loss": 0.0727, "step": 117 }, { "epoch": 2.6818181818181817, "grad_norm": 1.2201292764615486, "learning_rate": 4.602292469969381e-06, "loss": 0.1029, "step": 118 }, { "epoch": 2.7045454545454546, "grad_norm": 1.0182353763504708, "learning_rate": 4.5301790950536175e-06, "loss": 0.081, "step": 119 }, { "epoch": 2.7272727272727275, "grad_norm": 0.8638534963225072, "learning_rate": 4.458164190385297e-06, "loss": 0.0743, "step": 120 }, { "epoch": 2.75, "grad_norm": 1.179392830465865, "learning_rate": 4.386262849642474e-06, "loss": 0.1008, "step": 121 }, { "epoch": 2.7727272727272725, "grad_norm": 0.8679825281753464, "learning_rate": 4.31449014270122e-06, "loss": 0.0493, "step": 122 }, { "epoch": 2.7954545454545454, "grad_norm": 1.1253980579124658, "learning_rate": 4.2428611124771184e-06, "loss": 0.0848, "step": 123 }, { "epoch": 2.8181818181818183, "grad_norm": 1.0356630565413245, "learning_rate": 4.171390771772399e-06, "loss": 0.068, "step": 124 }, { "epoch": 2.840909090909091, "grad_norm": 1.3477554175880992, "learning_rate": 4.100094100129396e-06, "loss": 0.1043, "step": 125 }, { "epoch": 2.8636363636363638, "grad_norm": 0.97866201549117, "learning_rate": 4.028986040690963e-06, "loss": 0.0847, "step": 126 }, { "epoch": 2.8863636363636362, "grad_norm": 1.1349756610536412, "learning_rate": 3.958081497068528e-06, "loss": 0.0792, "step": 127 }, { "epoch": 2.909090909090909, "grad_norm": 0.9146117017482877, "learning_rate": 3.887395330218429e-06, "loss": 0.0611, "step": 128 }, { "epoch": 2.9318181818181817, "grad_norm": 1.0807764635079844, "learning_rate": 3.816942355327191e-06, "loss": 0.0904, "step": 129 }, { "epoch": 2.9545454545454546, "grad_norm": 1.1271897510030453, "learning_rate": 3.7467373387063973e-06, "loss": 0.0769, "step": 130 }, { "epoch": 2.9772727272727275, "grad_norm": 1.079134591854598, "learning_rate": 3.6767949946978026e-06, "loss": 0.0936, "step": 131 }, { "epoch": 3.0, "grad_norm": 1.0766678958231195, "learning_rate": 3.607129982589337e-06, "loss": 0.0836, "step": 132 }, { "epoch": 3.022727272727273, "grad_norm": 0.6602194945121888, "learning_rate": 3.5377569035426494e-06, "loss": 0.0432, "step": 133 }, { "epoch": 3.0454545454545454, "grad_norm": 0.9207788970106073, "learning_rate": 3.468690297532843e-06, "loss": 0.0614, "step": 134 }, { "epoch": 3.0681818181818183, "grad_norm": 0.7792864367119448, "learning_rate": 3.3999446403010156e-06, "loss": 0.044, "step": 135 }, { "epoch": 3.090909090909091, "grad_norm": 0.7824193339459661, "learning_rate": 3.331534340320287e-06, "loss": 0.0299, "step": 136 }, { "epoch": 3.1136363636363638, "grad_norm": 1.035803750265937, "learning_rate": 3.2634737357758994e-06, "loss": 0.0481, "step": 137 }, { "epoch": 3.1363636363636362, "grad_norm": 0.7826621695998633, "learning_rate": 3.1957770915600696e-06, "loss": 0.0388, "step": 138 }, { "epoch": 3.159090909090909, "grad_norm": 0.6958054164558033, "learning_rate": 3.1284585962821957e-06, "loss": 0.0351, "step": 139 }, { "epoch": 3.1818181818181817, "grad_norm": 0.9734916299688532, "learning_rate": 3.0615323592950495e-06, "loss": 0.0458, "step": 140 }, { "epoch": 3.2045454545454546, "grad_norm": 0.9750452432170936, "learning_rate": 2.995012407737581e-06, "loss": 0.044, "step": 141 }, { "epoch": 3.227272727272727, "grad_norm": 1.1459789012585446, "learning_rate": 2.9289126835949657e-06, "loss": 0.0663, "step": 142 }, { "epoch": 3.25, "grad_norm": 1.1249770693101788, "learning_rate": 2.8632470407764746e-06, "loss": 0.0431, "step": 143 }, { "epoch": 3.2727272727272725, "grad_norm": 0.9864994227169254, "learning_rate": 2.7980292422118282e-06, "loss": 0.0606, "step": 144 }, { "epoch": 3.2954545454545454, "grad_norm": 1.0092348933533415, "learning_rate": 2.733272956966615e-06, "loss": 0.0538, "step": 145 }, { "epoch": 3.3181818181818183, "grad_norm": 1.39535927779365, "learning_rate": 2.6689917573773615e-06, "loss": 0.0531, "step": 146 }, { "epoch": 3.340909090909091, "grad_norm": 0.9511300286527435, "learning_rate": 2.605199116206912e-06, "loss": 0.0382, "step": 147 }, { "epoch": 3.3636363636363638, "grad_norm": 1.2541730166057663, "learning_rate": 2.5419084038206422e-06, "loss": 0.0419, "step": 148 }, { "epoch": 3.3863636363636362, "grad_norm": 1.0095638491761618, "learning_rate": 2.4791328853841577e-06, "loss": 0.0434, "step": 149 }, { "epoch": 3.409090909090909, "grad_norm": 0.8262732933318356, "learning_rate": 2.416885718083035e-06, "loss": 0.0322, "step": 150 }, { "epoch": 3.4318181818181817, "grad_norm": 0.8137374498919325, "learning_rate": 2.3551799483651894e-06, "loss": 0.0308, "step": 151 }, { "epoch": 3.4545454545454546, "grad_norm": 0.8629006626767369, "learning_rate": 2.294028509206461e-06, "loss": 0.0459, "step": 152 }, { "epoch": 3.4772727272727275, "grad_norm": 0.75511924638048, "learning_rate": 2.2334442173999794e-06, "loss": 0.0304, "step": 153 }, { "epoch": 3.5, "grad_norm": 0.765294235454733, "learning_rate": 2.17343977086987e-06, "loss": 0.0436, "step": 154 }, { "epoch": 3.5227272727272725, "grad_norm": 0.8507628894917487, "learning_rate": 2.114027746009897e-06, "loss": 0.0277, "step": 155 }, { "epoch": 3.5454545454545454, "grad_norm": 1.1371497063801275, "learning_rate": 2.055220595047551e-06, "loss": 0.0463, "step": 156 }, { "epoch": 3.5681818181818183, "grad_norm": 1.0468638172133997, "learning_rate": 1.9970306434341806e-06, "loss": 0.0354, "step": 157 }, { "epoch": 3.590909090909091, "grad_norm": 0.8300360380363072, "learning_rate": 1.9394700872616856e-06, "loss": 0.0377, "step": 158 }, { "epoch": 3.6136363636363638, "grad_norm": 1.2431358466370912, "learning_rate": 1.8825509907063328e-06, "loss": 0.0407, "step": 159 }, { "epoch": 3.6363636363636362, "grad_norm": 1.0116784265871086, "learning_rate": 1.826285283500201e-06, "loss": 0.0506, "step": 160 }, { "epoch": 3.659090909090909, "grad_norm": 0.9351206425735656, "learning_rate": 1.770684758430824e-06, "loss": 0.0383, "step": 161 }, { "epoch": 3.6818181818181817, "grad_norm": 0.7608293362619839, "learning_rate": 1.7157610688695248e-06, "loss": 0.0251, "step": 162 }, { "epoch": 3.7045454545454546, "grad_norm": 1.0486186821729864, "learning_rate": 1.6615257263289809e-06, "loss": 0.0354, "step": 163 }, { "epoch": 3.7272727272727275, "grad_norm": 0.8511206701235292, "learning_rate": 1.607990098050501e-06, "loss": 0.0375, "step": 164 }, { "epoch": 3.75, "grad_norm": 0.9639257216156043, "learning_rate": 1.555165404621567e-06, "loss": 0.0406, "step": 165 }, { "epoch": 3.7727272727272725, "grad_norm": 0.9665035321570787, "learning_rate": 1.5030627176240903e-06, "loss": 0.0386, "step": 166 }, { "epoch": 3.7954545454545454, "grad_norm": 0.8935090470273086, "learning_rate": 1.45169295731391e-06, "loss": 0.0351, "step": 167 }, { "epoch": 3.8181818181818183, "grad_norm": 1.1337183807356483, "learning_rate": 1.4010668903320068e-06, "loss": 0.0267, "step": 168 }, { "epoch": 3.840909090909091, "grad_norm": 1.0191173211814406, "learning_rate": 1.3511951274479096e-06, "loss": 0.03, "step": 169 }, { "epoch": 3.8636363636363638, "grad_norm": 0.8969739545223467, "learning_rate": 1.3020881213357783e-06, "loss": 0.0433, "step": 170 }, { "epoch": 3.8863636363636362, "grad_norm": 0.8374168117834856, "learning_rate": 1.2537561643836087e-06, "loss": 0.0272, "step": 171 }, { "epoch": 3.909090909090909, "grad_norm": 1.0969039495955495, "learning_rate": 1.2062093865360458e-06, "loss": 0.052, "step": 172 }, { "epoch": 3.9318181818181817, "grad_norm": 0.7906472210562279, "learning_rate": 1.1594577531712392e-06, "loss": 0.0492, "step": 173 }, { "epoch": 3.9545454545454546, "grad_norm": 0.8214507384737139, "learning_rate": 1.1135110630121837e-06, "loss": 0.036, "step": 174 }, { "epoch": 3.9772727272727275, "grad_norm": 0.8916597776872088, "learning_rate": 1.0683789460730037e-06, "loss": 0.0358, "step": 175 }, { "epoch": 4.0, "grad_norm": 0.9910806202679314, "learning_rate": 1.0240708616405788e-06, "loss": 0.0471, "step": 176 }, { "epoch": 4.0227272727272725, "grad_norm": 0.8017333339630908, "learning_rate": 9.80596096291967e-07, "loss": 0.0421, "step": 177 }, { "epoch": 4.045454545454546, "grad_norm": 0.6088709030309449, "learning_rate": 9.379637619480236e-07, "loss": 0.0197, "step": 178 }, { "epoch": 4.068181818181818, "grad_norm": 0.5056506758915681, "learning_rate": 8.961827939636198e-07, "loss": 0.0161, "step": 179 }, { "epoch": 4.090909090909091, "grad_norm": 0.7756562125854171, "learning_rate": 8.552619492548736e-07, "loss": 0.0268, "step": 180 }, { "epoch": 4.113636363636363, "grad_norm": 0.5669855251712181, "learning_rate": 8.15209804463783e-07, "loss": 0.0202, "step": 181 }, { "epoch": 4.136363636363637, "grad_norm": 0.6473780755422711, "learning_rate": 7.760347541606339e-07, "loss": 0.0307, "step": 182 }, { "epoch": 4.159090909090909, "grad_norm": 0.5361573992028992, "learning_rate": 7.377450090845733e-07, "loss": 0.0215, "step": 183 }, { "epoch": 4.181818181818182, "grad_norm": 0.6191327927742057, "learning_rate": 7.003485944227162e-07, "loss": 0.0297, "step": 184 }, { "epoch": 4.204545454545454, "grad_norm": 0.7740351242770065, "learning_rate": 6.638533481281323e-07, "loss": 0.0277, "step": 185 }, { "epoch": 4.2272727272727275, "grad_norm": 0.8036694531204502, "learning_rate": 6.282669192770896e-07, "loss": 0.0337, "step": 186 }, { "epoch": 4.25, "grad_norm": 0.5569549110731221, "learning_rate": 5.935967664658682e-07, "loss": 0.023, "step": 187 }, { "epoch": 4.2727272727272725, "grad_norm": 0.43268894985216455, "learning_rate": 5.598501562475111e-07, "loss": 0.015, "step": 188 }, { "epoch": 4.295454545454546, "grad_norm": 0.5798546765255864, "learning_rate": 5.270341616088153e-07, "loss": 0.0161, "step": 189 }, { "epoch": 4.318181818181818, "grad_norm": 0.667306809133945, "learning_rate": 4.951556604879049e-07, "loss": 0.0207, "step": 190 }, { "epoch": 4.340909090909091, "grad_norm": 0.5559259388326303, "learning_rate": 4.6422133433266513e-07, "loss": 0.0154, "step": 191 }, { "epoch": 4.363636363636363, "grad_norm": 0.48805637239261085, "learning_rate": 4.342376667003845e-07, "loss": 0.0133, "step": 192 }, { "epoch": 4.386363636363637, "grad_norm": 0.633760717360346, "learning_rate": 4.05210941898847e-07, "loss": 0.0166, "step": 193 }, { "epoch": 4.409090909090909, "grad_norm": 0.4941494223196582, "learning_rate": 3.771472436692053e-07, "loss": 0.0136, "step": 194 }, { "epoch": 4.431818181818182, "grad_norm": 0.694276868602463, "learning_rate": 3.500524539108807e-07, "loss": 0.0218, "step": 195 }, { "epoch": 4.454545454545454, "grad_norm": 0.588819670846474, "learning_rate": 3.239322514487686e-07, "loss": 0.0176, "step": 196 }, { "epoch": 4.4772727272727275, "grad_norm": 0.5265383444966927, "learning_rate": 2.9879211084301194e-07, "loss": 0.0151, "step": 197 }, { "epoch": 4.5, "grad_norm": 0.44878162692299606, "learning_rate": 2.7463730124157706e-07, "loss": 0.014, "step": 198 }, { "epoch": 4.5227272727272725, "grad_norm": 0.6569401749445752, "learning_rate": 2.5147288527588964e-07, "loss": 0.0164, "step": 199 }, { "epoch": 4.545454545454545, "grad_norm": 1.2770158126939868, "learning_rate": 2.2930371799975593e-07, "loss": 0.0218, "step": 200 }, { "epoch": 4.568181818181818, "grad_norm": 0.5855843139833002, "learning_rate": 2.0813444587178156e-07, "loss": 0.0168, "step": 201 }, { "epoch": 4.590909090909091, "grad_norm": 0.6376332689792783, "learning_rate": 1.8796950578151785e-07, "loss": 0.0152, "step": 202 }, { "epoch": 4.613636363636363, "grad_norm": 0.6329781274553996, "learning_rate": 1.6881312411953288e-07, "loss": 0.0111, "step": 203 }, { "epoch": 4.636363636363637, "grad_norm": 0.9765159855301974, "learning_rate": 1.5066931589159118e-07, "loss": 0.0278, "step": 204 }, { "epoch": 4.659090909090909, "grad_norm": 0.6973740006107351, "learning_rate": 1.3354188387715017e-07, "loss": 0.022, "step": 205 }, { "epoch": 4.681818181818182, "grad_norm": 0.5499873946022198, "learning_rate": 1.174344178323289e-07, "loss": 0.0155, "step": 206 }, { "epoch": 4.704545454545455, "grad_norm": 0.7453329709429668, "learning_rate": 1.0235029373752758e-07, "loss": 0.0274, "step": 207 }, { "epoch": 4.7272727272727275, "grad_norm": 0.4712400999164701, "learning_rate": 8.829267308985535e-08, "loss": 0.0128, "step": 208 }, { "epoch": 4.75, "grad_norm": 0.791960861976085, "learning_rate": 7.526450224050407e-08, "loss": 0.0206, "step": 209 }, { "epoch": 4.7727272727272725, "grad_norm": 0.8100989026594698, "learning_rate": 6.326851177722304e-08, "loss": 0.026, "step": 210 }, { "epoch": 4.795454545454545, "grad_norm": 0.5698845908406204, "learning_rate": 5.230721595201049e-08, "loss": 0.0115, "step": 211 }, { "epoch": 4.818181818181818, "grad_norm": 0.7463531903553184, "learning_rate": 4.2382912154150244e-08, "loss": 0.0178, "step": 212 }, { "epoch": 4.840909090909091, "grad_norm": 0.7813627200692648, "learning_rate": 3.3497680428697943e-08, "loss": 0.0163, "step": 213 }, { "epoch": 4.863636363636363, "grad_norm": 0.66982841564412, "learning_rate": 2.5653383040524228e-08, "loss": 0.0186, "step": 214 }, { "epoch": 4.886363636363637, "grad_norm": 0.4637052800834321, "learning_rate": 1.8851664083999742e-08, "loss": 0.013, "step": 215 }, { "epoch": 4.909090909090909, "grad_norm": 0.43304208586446546, "learning_rate": 1.3093949138406892e-08, "loss": 0.0081, "step": 216 }, { "epoch": 4.931818181818182, "grad_norm": 0.8973836308599885, "learning_rate": 8.381444969151608e-09, "loss": 0.0276, "step": 217 }, { "epoch": 4.954545454545455, "grad_norm": 0.6757448220546857, "learning_rate": 4.7151392748379095e-09, "loss": 0.0185, "step": 218 }, { "epoch": 4.9772727272727275, "grad_norm": 0.6378204095313933, "learning_rate": 2.0958004802529297e-09, "loss": 0.0239, "step": 219 }, { "epoch": 5.0, "grad_norm": 0.7520281673571331, "learning_rate": 5.239775753129728e-10, "loss": 0.0294, "step": 220 } ], "logging_steps": 1, "max_steps": 220, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 75, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 14632660697088.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }