{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6501650165016502, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033003300330033004, "grad_norm": 10.81499361768409, "learning_rate": 0.0, "loss": 1.2079360485076904, "step": 1 }, { "epoch": 0.006600660066006601, "grad_norm": 10.226770877445293, "learning_rate": 4.395604395604396e-07, "loss": 1.123347520828247, "step": 2 }, { "epoch": 0.009900990099009901, "grad_norm": 11.292644267807786, "learning_rate": 8.791208791208792e-07, "loss": 1.261695384979248, "step": 3 }, { "epoch": 0.013201320132013201, "grad_norm": 10.504638106263508, "learning_rate": 1.3186813186813187e-06, "loss": 1.1276888847351074, "step": 4 }, { "epoch": 0.0165016501650165, "grad_norm": 10.822100601159539, "learning_rate": 1.7582417582417585e-06, "loss": 1.2254480123519897, "step": 5 }, { "epoch": 0.019801980198019802, "grad_norm": 9.905516433474448, "learning_rate": 2.197802197802198e-06, "loss": 1.1809396743774414, "step": 6 }, { "epoch": 0.0231023102310231, "grad_norm": 9.323364829402967, "learning_rate": 2.6373626373626375e-06, "loss": 1.2000095844268799, "step": 7 }, { "epoch": 0.026402640264026403, "grad_norm": 6.706098746162178, "learning_rate": 3.0769230769230774e-06, "loss": 1.0248074531555176, "step": 8 }, { "epoch": 0.0297029702970297, "grad_norm": 5.761138380327878, "learning_rate": 3.516483516483517e-06, "loss": 1.0840561389923096, "step": 9 }, { "epoch": 0.033003300330033, "grad_norm": 2.7364343552329315, "learning_rate": 3.9560439560439565e-06, "loss": 0.955639123916626, "step": 10 }, { "epoch": 0.036303630363036306, "grad_norm": 2.113810438625661, "learning_rate": 4.395604395604396e-06, "loss": 0.9281604290008545, "step": 11 }, { "epoch": 0.039603960396039604, "grad_norm": 1.849238684536393, "learning_rate": 4.8351648351648355e-06, "loss": 0.9079018831253052, "step": 12 }, { "epoch": 0.0429042904290429, "grad_norm": 1.6747171029255208, "learning_rate": 5.274725274725275e-06, "loss": 0.9039217233657837, "step": 13 }, { "epoch": 0.0462046204620462, "grad_norm": 2.0121666555693416, "learning_rate": 5.7142857142857145e-06, "loss": 0.8910936117172241, "step": 14 }, { "epoch": 0.04950495049504951, "grad_norm": 2.0600124028897526, "learning_rate": 6.153846153846155e-06, "loss": 0.895532488822937, "step": 15 }, { "epoch": 0.052805280528052806, "grad_norm": 2.0613449368510044, "learning_rate": 6.5934065934065935e-06, "loss": 0.8889240622520447, "step": 16 }, { "epoch": 0.056105610561056105, "grad_norm": 1.785450637059245, "learning_rate": 7.032967032967034e-06, "loss": 0.8499570488929749, "step": 17 }, { "epoch": 0.0594059405940594, "grad_norm": 1.5894161631201256, "learning_rate": 7.472527472527473e-06, "loss": 0.839992105960846, "step": 18 }, { "epoch": 0.0627062706270627, "grad_norm": 1.1904834264503976, "learning_rate": 7.912087912087913e-06, "loss": 0.7718420028686523, "step": 19 }, { "epoch": 0.066006600660066, "grad_norm": 1.0397335564670163, "learning_rate": 8.351648351648353e-06, "loss": 0.7865867614746094, "step": 20 }, { "epoch": 0.06930693069306931, "grad_norm": 0.8314739102256958, "learning_rate": 8.791208791208792e-06, "loss": 0.7982739806175232, "step": 21 }, { "epoch": 0.07260726072607261, "grad_norm": 0.6542597896181986, "learning_rate": 9.230769230769232e-06, "loss": 0.7846421599388123, "step": 22 }, { "epoch": 0.07590759075907591, "grad_norm": 0.6269389928815381, "learning_rate": 9.670329670329671e-06, "loss": 0.7005743980407715, "step": 23 }, { "epoch": 0.07920792079207921, "grad_norm": 0.6603922634859757, "learning_rate": 1.010989010989011e-05, "loss": 0.7084314227104187, "step": 24 }, { "epoch": 0.08250825082508251, "grad_norm": 0.6856248928818359, "learning_rate": 1.054945054945055e-05, "loss": 0.7310304641723633, "step": 25 }, { "epoch": 0.0858085808580858, "grad_norm": 0.5728331825854258, "learning_rate": 1.098901098901099e-05, "loss": 0.7056888341903687, "step": 26 }, { "epoch": 0.0891089108910891, "grad_norm": 0.47956485465857923, "learning_rate": 1.1428571428571429e-05, "loss": 0.6987950205802917, "step": 27 }, { "epoch": 0.0924092409240924, "grad_norm": 0.47407141179043555, "learning_rate": 1.186813186813187e-05, "loss": 0.7319807410240173, "step": 28 }, { "epoch": 0.09570957095709572, "grad_norm": 0.4856924244101555, "learning_rate": 1.230769230769231e-05, "loss": 0.6983063220977783, "step": 29 }, { "epoch": 0.09900990099009901, "grad_norm": 0.49122925908544063, "learning_rate": 1.2747252747252747e-05, "loss": 0.70492023229599, "step": 30 }, { "epoch": 0.10231023102310231, "grad_norm": 0.4556788168903923, "learning_rate": 1.3186813186813187e-05, "loss": 0.7376629114151001, "step": 31 }, { "epoch": 0.10561056105610561, "grad_norm": 0.4272838300827657, "learning_rate": 1.3626373626373627e-05, "loss": 0.6623936295509338, "step": 32 }, { "epoch": 0.10891089108910891, "grad_norm": 0.40886227927218277, "learning_rate": 1.4065934065934068e-05, "loss": 0.7136330604553223, "step": 33 }, { "epoch": 0.11221122112211221, "grad_norm": 0.37821179606418975, "learning_rate": 1.4505494505494506e-05, "loss": 0.7113747596740723, "step": 34 }, { "epoch": 0.11551155115511551, "grad_norm": 0.4538557716923258, "learning_rate": 1.4945054945054947e-05, "loss": 0.8252867460250854, "step": 35 }, { "epoch": 0.1188118811881188, "grad_norm": 0.3875808052898815, "learning_rate": 1.5384615384615387e-05, "loss": 0.7406599521636963, "step": 36 }, { "epoch": 0.12211221122112212, "grad_norm": 0.3503240143986989, "learning_rate": 1.5824175824175826e-05, "loss": 0.6572297811508179, "step": 37 }, { "epoch": 0.1254125412541254, "grad_norm": 0.3779655372487014, "learning_rate": 1.6263736263736265e-05, "loss": 0.7520949840545654, "step": 38 }, { "epoch": 0.12871287128712872, "grad_norm": 0.36968690038350466, "learning_rate": 1.6703296703296707e-05, "loss": 0.6861323118209839, "step": 39 }, { "epoch": 0.132013201320132, "grad_norm": 0.3724328241107235, "learning_rate": 1.7142857142857142e-05, "loss": 0.6818518042564392, "step": 40 }, { "epoch": 0.1353135313531353, "grad_norm": 0.35542054984937593, "learning_rate": 1.7582417582417584e-05, "loss": 0.6663186550140381, "step": 41 }, { "epoch": 0.13861386138613863, "grad_norm": 0.3441266617586836, "learning_rate": 1.8021978021978023e-05, "loss": 0.6492191553115845, "step": 42 }, { "epoch": 0.1419141914191419, "grad_norm": 0.3478448092762331, "learning_rate": 1.8461538461538465e-05, "loss": 0.6444741487503052, "step": 43 }, { "epoch": 0.14521452145214522, "grad_norm": 0.34951148057960574, "learning_rate": 1.8901098901098903e-05, "loss": 0.6476814150810242, "step": 44 }, { "epoch": 0.1485148514851485, "grad_norm": 0.3356672452160599, "learning_rate": 1.9340659340659342e-05, "loss": 0.6660827994346619, "step": 45 }, { "epoch": 0.15181518151815182, "grad_norm": 0.30809956365723695, "learning_rate": 1.9780219780219784e-05, "loss": 0.6924091577529907, "step": 46 }, { "epoch": 0.1551155115511551, "grad_norm": 0.9030699054312887, "learning_rate": 2.021978021978022e-05, "loss": 0.6899605989456177, "step": 47 }, { "epoch": 0.15841584158415842, "grad_norm": 0.35784060194946976, "learning_rate": 2.0659340659340665e-05, "loss": 0.7242028713226318, "step": 48 }, { "epoch": 0.1617161716171617, "grad_norm": 0.3093966721093651, "learning_rate": 2.10989010989011e-05, "loss": 0.6203902959823608, "step": 49 }, { "epoch": 0.16501650165016502, "grad_norm": 0.4242705872636108, "learning_rate": 2.153846153846154e-05, "loss": 0.6420010328292847, "step": 50 }, { "epoch": 0.16831683168316833, "grad_norm": 0.35079960590346965, "learning_rate": 2.197802197802198e-05, "loss": 0.7517598867416382, "step": 51 }, { "epoch": 0.1716171617161716, "grad_norm": 0.3078803790362521, "learning_rate": 2.241758241758242e-05, "loss": 0.6568161249160767, "step": 52 }, { "epoch": 0.17491749174917492, "grad_norm": 0.34666662805484005, "learning_rate": 2.2857142857142858e-05, "loss": 0.7348504662513733, "step": 53 }, { "epoch": 0.1782178217821782, "grad_norm": 0.302791415801781, "learning_rate": 2.32967032967033e-05, "loss": 0.6164949536323547, "step": 54 }, { "epoch": 0.18151815181518152, "grad_norm": 0.33732756727763136, "learning_rate": 2.373626373626374e-05, "loss": 0.6505363583564758, "step": 55 }, { "epoch": 0.1848184818481848, "grad_norm": 0.34780152362496847, "learning_rate": 2.4175824175824177e-05, "loss": 0.7562520503997803, "step": 56 }, { "epoch": 0.18811881188118812, "grad_norm": 0.3310895358869482, "learning_rate": 2.461538461538462e-05, "loss": 0.6943148374557495, "step": 57 }, { "epoch": 0.19141914191419143, "grad_norm": 0.3367877938063833, "learning_rate": 2.5054945054945058e-05, "loss": 0.6571655869483948, "step": 58 }, { "epoch": 0.19471947194719472, "grad_norm": 0.32103256018771714, "learning_rate": 2.5494505494505493e-05, "loss": 0.7229321002960205, "step": 59 }, { "epoch": 0.19801980198019803, "grad_norm": 0.30468399230672144, "learning_rate": 2.593406593406594e-05, "loss": 0.6307672262191772, "step": 60 }, { "epoch": 0.20132013201320131, "grad_norm": 0.3282635121595526, "learning_rate": 2.6373626373626374e-05, "loss": 0.6336506009101868, "step": 61 }, { "epoch": 0.20462046204620463, "grad_norm": 0.3280360563022675, "learning_rate": 2.6813186813186813e-05, "loss": 0.6492213010787964, "step": 62 }, { "epoch": 0.2079207920792079, "grad_norm": 0.3292430577817229, "learning_rate": 2.7252747252747255e-05, "loss": 0.6763280034065247, "step": 63 }, { "epoch": 0.21122112211221122, "grad_norm": 0.47832355846700536, "learning_rate": 2.7692307692307694e-05, "loss": 0.7322396039962769, "step": 64 }, { "epoch": 0.2145214521452145, "grad_norm": 0.31915340164178446, "learning_rate": 2.8131868131868136e-05, "loss": 0.7080870270729065, "step": 65 }, { "epoch": 0.21782178217821782, "grad_norm": 0.3227571040968621, "learning_rate": 2.8571428571428574e-05, "loss": 0.6054466962814331, "step": 66 }, { "epoch": 0.22112211221122113, "grad_norm": 0.33375713186655664, "learning_rate": 2.9010989010989013e-05, "loss": 0.6782290935516357, "step": 67 }, { "epoch": 0.22442244224422442, "grad_norm": 0.3437770801965916, "learning_rate": 2.9450549450549455e-05, "loss": 0.6804753541946411, "step": 68 }, { "epoch": 0.22772277227722773, "grad_norm": 0.3228427319313703, "learning_rate": 2.9890109890109894e-05, "loss": 0.6493992805480957, "step": 69 }, { "epoch": 0.23102310231023102, "grad_norm": 0.3540211756840673, "learning_rate": 3.0329670329670332e-05, "loss": 0.6263789534568787, "step": 70 }, { "epoch": 0.23432343234323433, "grad_norm": 0.34989089824503405, "learning_rate": 3.0769230769230774e-05, "loss": 0.6960322856903076, "step": 71 }, { "epoch": 0.2376237623762376, "grad_norm": 0.33624443163866324, "learning_rate": 3.120879120879121e-05, "loss": 0.6146604418754578, "step": 72 }, { "epoch": 0.24092409240924093, "grad_norm": 0.39618402867027047, "learning_rate": 3.164835164835165e-05, "loss": 0.6361377239227295, "step": 73 }, { "epoch": 0.24422442244224424, "grad_norm": 0.361603087273114, "learning_rate": 3.2087912087912094e-05, "loss": 0.636134147644043, "step": 74 }, { "epoch": 0.24752475247524752, "grad_norm": 0.37985663132790304, "learning_rate": 3.252747252747253e-05, "loss": 0.5936564803123474, "step": 75 }, { "epoch": 0.2508250825082508, "grad_norm": 0.35883234873646996, "learning_rate": 3.296703296703297e-05, "loss": 0.6001103520393372, "step": 76 }, { "epoch": 0.25412541254125415, "grad_norm": 0.35227803701073973, "learning_rate": 3.340659340659341e-05, "loss": 0.6254594326019287, "step": 77 }, { "epoch": 0.25742574257425743, "grad_norm": 0.3563257650896171, "learning_rate": 3.384615384615385e-05, "loss": 0.6457959413528442, "step": 78 }, { "epoch": 0.2607260726072607, "grad_norm": 0.37234316340556584, "learning_rate": 3.4285714285714284e-05, "loss": 0.6186954975128174, "step": 79 }, { "epoch": 0.264026402640264, "grad_norm": 0.35352748449766547, "learning_rate": 3.4725274725274726e-05, "loss": 0.6175529956817627, "step": 80 }, { "epoch": 0.26732673267326734, "grad_norm": 0.35441369709658355, "learning_rate": 3.516483516483517e-05, "loss": 0.6694468259811401, "step": 81 }, { "epoch": 0.2706270627062706, "grad_norm": 0.39955400784840756, "learning_rate": 3.56043956043956e-05, "loss": 0.627490222454071, "step": 82 }, { "epoch": 0.2739273927392739, "grad_norm": 0.38314031523497477, "learning_rate": 3.6043956043956045e-05, "loss": 0.6410495638847351, "step": 83 }, { "epoch": 0.27722772277227725, "grad_norm": 0.36926215386141575, "learning_rate": 3.648351648351649e-05, "loss": 0.6305102109909058, "step": 84 }, { "epoch": 0.28052805280528054, "grad_norm": 0.38364118080284076, "learning_rate": 3.692307692307693e-05, "loss": 0.6558895111083984, "step": 85 }, { "epoch": 0.2838283828382838, "grad_norm": 0.3370292682974053, "learning_rate": 3.7362637362637365e-05, "loss": 0.6029388308525085, "step": 86 }, { "epoch": 0.2871287128712871, "grad_norm": 0.39541874871701704, "learning_rate": 3.7802197802197807e-05, "loss": 0.6551017761230469, "step": 87 }, { "epoch": 0.29042904290429045, "grad_norm": 0.3629036550044273, "learning_rate": 3.824175824175825e-05, "loss": 0.6588809490203857, "step": 88 }, { "epoch": 0.29372937293729373, "grad_norm": 0.37786447228212183, "learning_rate": 3.8681318681318684e-05, "loss": 0.614648699760437, "step": 89 }, { "epoch": 0.297029702970297, "grad_norm": 0.42911861803278684, "learning_rate": 3.9120879120879126e-05, "loss": 0.7034356594085693, "step": 90 }, { "epoch": 0.30033003300330036, "grad_norm": 0.3707184094312094, "learning_rate": 3.956043956043957e-05, "loss": 0.6908263564109802, "step": 91 }, { "epoch": 0.30363036303630364, "grad_norm": 0.38262186656216063, "learning_rate": 4e-05, "loss": 0.6882215738296509, "step": 92 }, { "epoch": 0.3069306930693069, "grad_norm": 0.3709464296309744, "learning_rate": 3.999985249980169e-05, "loss": 0.6377270221710205, "step": 93 }, { "epoch": 0.3102310231023102, "grad_norm": 0.3412837406106036, "learning_rate": 3.999941000138238e-05, "loss": 0.6735270619392395, "step": 94 }, { "epoch": 0.31353135313531355, "grad_norm": 0.40165192879996064, "learning_rate": 3.999867251126893e-05, "loss": 0.6934541463851929, "step": 95 }, { "epoch": 0.31683168316831684, "grad_norm": 0.34707128601816045, "learning_rate": 3.9997640040339335e-05, "loss": 0.6367039084434509, "step": 96 }, { "epoch": 0.3201320132013201, "grad_norm": 0.4268828113970776, "learning_rate": 3.999631260382257e-05, "loss": 0.6274522542953491, "step": 97 }, { "epoch": 0.3234323432343234, "grad_norm": 0.454428833020686, "learning_rate": 3.999469022129834e-05, "loss": 0.5874066352844238, "step": 98 }, { "epoch": 0.32673267326732675, "grad_norm": 0.4200675840489775, "learning_rate": 3.9992772916696824e-05, "loss": 0.6175942420959473, "step": 99 }, { "epoch": 0.33003300330033003, "grad_norm": 0.3796321080056305, "learning_rate": 3.99905607182983e-05, "loss": 0.5625832080841064, "step": 100 }, { "epoch": 0.3333333333333333, "grad_norm": 0.39108856096759403, "learning_rate": 3.998805365873274e-05, "loss": 0.6153020262718201, "step": 101 }, { "epoch": 0.33663366336633666, "grad_norm": 0.3873560194436071, "learning_rate": 3.998525177497932e-05, "loss": 0.5585426092147827, "step": 102 }, { "epoch": 0.33993399339933994, "grad_norm": 0.4084712106325698, "learning_rate": 3.998215510836589e-05, "loss": 0.6586359739303589, "step": 103 }, { "epoch": 0.3432343234323432, "grad_norm": 0.4383246876899704, "learning_rate": 3.997876370456833e-05, "loss": 0.62096107006073, "step": 104 }, { "epoch": 0.3465346534653465, "grad_norm": 0.4026893562706946, "learning_rate": 3.997507761360993e-05, "loss": 0.6059336066246033, "step": 105 }, { "epoch": 0.34983498349834985, "grad_norm": 0.46586240044914223, "learning_rate": 3.997109688986059e-05, "loss": 0.617970883846283, "step": 106 }, { "epoch": 0.35313531353135313, "grad_norm": 0.44949199032710474, "learning_rate": 3.9966821592036066e-05, "loss": 0.6453397274017334, "step": 107 }, { "epoch": 0.3564356435643564, "grad_norm": 0.4794978158156406, "learning_rate": 3.996225178319709e-05, "loss": 0.6371763348579407, "step": 108 }, { "epoch": 0.35973597359735976, "grad_norm": 0.4463512391721941, "learning_rate": 3.9957387530748435e-05, "loss": 0.5971124172210693, "step": 109 }, { "epoch": 0.36303630363036304, "grad_norm": 0.368079413354641, "learning_rate": 3.995222890643792e-05, "loss": 0.5679532289505005, "step": 110 }, { "epoch": 0.36633663366336633, "grad_norm": 0.43733705586285254, "learning_rate": 3.9946775986355346e-05, "loss": 0.5988069772720337, "step": 111 }, { "epoch": 0.3696369636963696, "grad_norm": 0.38235582844960775, "learning_rate": 3.994102885093141e-05, "loss": 0.6352983713150024, "step": 112 }, { "epoch": 0.37293729372937295, "grad_norm": 0.389837871286893, "learning_rate": 3.993498758493646e-05, "loss": 0.58957839012146, "step": 113 }, { "epoch": 0.37623762376237624, "grad_norm": 0.40399856168911097, "learning_rate": 3.992865227747929e-05, "loss": 0.6396822929382324, "step": 114 }, { "epoch": 0.3795379537953795, "grad_norm": 0.38891668976227123, "learning_rate": 3.992202302200582e-05, "loss": 0.6314754486083984, "step": 115 }, { "epoch": 0.38283828382838286, "grad_norm": 0.4087528543828922, "learning_rate": 3.991509991629769e-05, "loss": 0.673650860786438, "step": 116 }, { "epoch": 0.38613861386138615, "grad_norm": 0.36330054292020786, "learning_rate": 3.990788306247085e-05, "loss": 0.5813701152801514, "step": 117 }, { "epoch": 0.38943894389438943, "grad_norm": 0.4247110332678589, "learning_rate": 3.990037256697404e-05, "loss": 0.6419334411621094, "step": 118 }, { "epoch": 0.3927392739273927, "grad_norm": 0.4244126002071751, "learning_rate": 3.989256854058721e-05, "loss": 0.6319208145141602, "step": 119 }, { "epoch": 0.39603960396039606, "grad_norm": 0.3651632933942853, "learning_rate": 3.988447109841991e-05, "loss": 0.5989845991134644, "step": 120 }, { "epoch": 0.39933993399339934, "grad_norm": 0.393158353074077, "learning_rate": 3.987608035990957e-05, "loss": 0.5853303670883179, "step": 121 }, { "epoch": 0.40264026402640263, "grad_norm": 0.35965233332276103, "learning_rate": 3.986739644881975e-05, "loss": 0.6115257143974304, "step": 122 }, { "epoch": 0.40594059405940597, "grad_norm": 0.4252711474203845, "learning_rate": 3.985841949323831e-05, "loss": 0.6440504789352417, "step": 123 }, { "epoch": 0.40924092409240925, "grad_norm": 0.5578797297271848, "learning_rate": 3.984914962557553e-05, "loss": 0.5765030384063721, "step": 124 }, { "epoch": 0.41254125412541254, "grad_norm": 0.4362455029468141, "learning_rate": 3.983958698256214e-05, "loss": 0.6387556791305542, "step": 125 }, { "epoch": 0.4158415841584158, "grad_norm": 0.39274811063076087, "learning_rate": 3.98297317052473e-05, "loss": 0.6263147592544556, "step": 126 }, { "epoch": 0.41914191419141916, "grad_norm": 0.42682589637163704, "learning_rate": 3.981958393899656e-05, "loss": 0.6091845035552979, "step": 127 }, { "epoch": 0.42244224422442245, "grad_norm": 0.4033131171538041, "learning_rate": 3.980914383348967e-05, "loss": 0.6458015441894531, "step": 128 }, { "epoch": 0.42574257425742573, "grad_norm": 0.3881606915462862, "learning_rate": 3.9798411542718395e-05, "loss": 0.6115552186965942, "step": 129 }, { "epoch": 0.429042904290429, "grad_norm": 0.38910317938225847, "learning_rate": 3.978738722498423e-05, "loss": 0.6427993774414062, "step": 130 }, { "epoch": 0.43234323432343236, "grad_norm": 0.36836380096259913, "learning_rate": 3.977607104289609e-05, "loss": 0.6121467351913452, "step": 131 }, { "epoch": 0.43564356435643564, "grad_norm": 0.3743062201629088, "learning_rate": 3.9764463163367875e-05, "loss": 0.5951442718505859, "step": 132 }, { "epoch": 0.4389438943894389, "grad_norm": 0.3699746655092952, "learning_rate": 3.9752563757616045e-05, "loss": 0.6639472842216492, "step": 133 }, { "epoch": 0.44224422442244227, "grad_norm": 0.37398919831188604, "learning_rate": 3.974037300115706e-05, "loss": 0.6084764003753662, "step": 134 }, { "epoch": 0.44554455445544555, "grad_norm": 0.37043195153646374, "learning_rate": 3.972789107380484e-05, "loss": 0.6211085915565491, "step": 135 }, { "epoch": 0.44884488448844884, "grad_norm": 0.3509837417375981, "learning_rate": 3.9715118159668046e-05, "loss": 0.6098147034645081, "step": 136 }, { "epoch": 0.4521452145214521, "grad_norm": 0.3350785925775803, "learning_rate": 3.970205444714742e-05, "loss": 0.6155884861946106, "step": 137 }, { "epoch": 0.45544554455445546, "grad_norm": 0.38529379761335925, "learning_rate": 3.9688700128932975e-05, "loss": 0.5984665155410767, "step": 138 }, { "epoch": 0.45874587458745875, "grad_norm": 0.45130397769476205, "learning_rate": 3.967505540200117e-05, "loss": 0.6656880378723145, "step": 139 }, { "epoch": 0.46204620462046203, "grad_norm": 0.3277874952439621, "learning_rate": 3.966112046761201e-05, "loss": 0.6607398390769958, "step": 140 }, { "epoch": 0.46534653465346537, "grad_norm": 2.6727599644732267, "learning_rate": 3.9646895531306046e-05, "loss": 0.6578342914581299, "step": 141 }, { "epoch": 0.46864686468646866, "grad_norm": 0.47429126269764676, "learning_rate": 3.963238080290136e-05, "loss": 0.6103699803352356, "step": 142 }, { "epoch": 0.47194719471947194, "grad_norm": 0.32652590291724093, "learning_rate": 3.96175764964905e-05, "loss": 0.5484676957130432, "step": 143 }, { "epoch": 0.4752475247524752, "grad_norm": 0.4531372955951849, "learning_rate": 3.960248283043727e-05, "loss": 0.578776478767395, "step": 144 }, { "epoch": 0.47854785478547857, "grad_norm": 0.3685580706465372, "learning_rate": 3.958710002737355e-05, "loss": 0.6184446811676025, "step": 145 }, { "epoch": 0.48184818481848185, "grad_norm": 0.3584005630962511, "learning_rate": 3.9571428314195984e-05, "loss": 0.6307916045188904, "step": 146 }, { "epoch": 0.48514851485148514, "grad_norm": 0.4049679254542765, "learning_rate": 3.955546792206265e-05, "loss": 0.6064697504043579, "step": 147 }, { "epoch": 0.4884488448844885, "grad_norm": 0.3846258995775384, "learning_rate": 3.953921908638966e-05, "loss": 0.6055655479431152, "step": 148 }, { "epoch": 0.49174917491749176, "grad_norm": 0.3643318343315678, "learning_rate": 3.952268204684765e-05, "loss": 0.5856431126594543, "step": 149 }, { "epoch": 0.49504950495049505, "grad_norm": 0.3854715521866927, "learning_rate": 3.950585704735829e-05, "loss": 0.6634635925292969, "step": 150 }, { "epoch": 0.49834983498349833, "grad_norm": 0.34338835592304534, "learning_rate": 3.948874433609065e-05, "loss": 0.5880753397941589, "step": 151 }, { "epoch": 0.5016501650165016, "grad_norm": 0.3481018111538647, "learning_rate": 3.947134416545757e-05, "loss": 0.5594221949577332, "step": 152 }, { "epoch": 0.504950495049505, "grad_norm": 0.6570220882473125, "learning_rate": 3.94536567921119e-05, "loss": 0.664652407169342, "step": 153 }, { "epoch": 0.5082508250825083, "grad_norm": 0.340048306266198, "learning_rate": 3.9435682476942755e-05, "loss": 0.6002815961837769, "step": 154 }, { "epoch": 0.5115511551155115, "grad_norm": 0.3488682381523364, "learning_rate": 3.941742148507163e-05, "loss": 0.5905177593231201, "step": 155 }, { "epoch": 0.5148514851485149, "grad_norm": 0.33062666453941425, "learning_rate": 3.939887408584853e-05, "loss": 0.5636795163154602, "step": 156 }, { "epoch": 0.5181518151815182, "grad_norm": 0.35862086331061066, "learning_rate": 3.938004055284796e-05, "loss": 0.5639582276344299, "step": 157 }, { "epoch": 0.5214521452145214, "grad_norm": 0.31769111173717246, "learning_rate": 3.9360921163864895e-05, "loss": 0.6515591144561768, "step": 158 }, { "epoch": 0.5247524752475248, "grad_norm": 0.38401455820073427, "learning_rate": 3.934151620091071e-05, "loss": 0.5721683502197266, "step": 159 }, { "epoch": 0.528052805280528, "grad_norm": 0.3284331200684813, "learning_rate": 3.9321825950209e-05, "loss": 0.5801802277565002, "step": 160 }, { "epoch": 0.5313531353135313, "grad_norm": 0.3493998878359796, "learning_rate": 3.9301850702191344e-05, "loss": 0.603084921836853, "step": 161 }, { "epoch": 0.5346534653465347, "grad_norm": 0.32233519110844616, "learning_rate": 3.928159075149304e-05, "loss": 0.6376925110816956, "step": 162 }, { "epoch": 0.5379537953795379, "grad_norm": 0.35833134197704153, "learning_rate": 3.926104639694877e-05, "loss": 0.5764102935791016, "step": 163 }, { "epoch": 0.5412541254125413, "grad_norm": 0.3523567199445224, "learning_rate": 3.924021794158818e-05, "loss": 0.6102188229560852, "step": 164 }, { "epoch": 0.5445544554455446, "grad_norm": 0.36694222553878597, "learning_rate": 3.921910569263139e-05, "loss": 0.5833287835121155, "step": 165 }, { "epoch": 0.5478547854785478, "grad_norm": 0.37179813198977807, "learning_rate": 3.919770996148448e-05, "loss": 0.5891385078430176, "step": 166 }, { "epoch": 0.5511551155115512, "grad_norm": 0.3507301680001106, "learning_rate": 3.917603106373493e-05, "loss": 0.5838547348976135, "step": 167 }, { "epoch": 0.5544554455445545, "grad_norm": 0.3134001311174479, "learning_rate": 3.9154069319146904e-05, "loss": 0.5727800726890564, "step": 168 }, { "epoch": 0.5577557755775577, "grad_norm": 0.33531781904204605, "learning_rate": 3.913182505165656e-05, "loss": 0.6102641224861145, "step": 169 }, { "epoch": 0.5610561056105611, "grad_norm": 0.35178976522027133, "learning_rate": 3.91092985893673e-05, "loss": 0.5718260407447815, "step": 170 }, { "epoch": 0.5643564356435643, "grad_norm": 0.47006108726602863, "learning_rate": 3.908649026454488e-05, "loss": 0.6308504939079285, "step": 171 }, { "epoch": 0.5676567656765676, "grad_norm": 0.3687514240026255, "learning_rate": 3.906340041361255e-05, "loss": 0.6089432835578918, "step": 172 }, { "epoch": 0.570957095709571, "grad_norm": 0.3586674884704593, "learning_rate": 3.904002937714606e-05, "loss": 0.6583501696586609, "step": 173 }, { "epoch": 0.5742574257425742, "grad_norm": 0.3399808047240735, "learning_rate": 3.9016377499868666e-05, "loss": 0.6108609437942505, "step": 174 }, { "epoch": 0.5775577557755776, "grad_norm": 0.3840880337988826, "learning_rate": 3.899244513064603e-05, "loss": 0.63509202003479, "step": 175 }, { "epoch": 0.5808580858085809, "grad_norm": 0.3725541644477348, "learning_rate": 3.896823262248107e-05, "loss": 0.5759241580963135, "step": 176 }, { "epoch": 0.5841584158415841, "grad_norm": 0.30755721985114126, "learning_rate": 3.8943740332508754e-05, "loss": 0.6148169040679932, "step": 177 }, { "epoch": 0.5874587458745875, "grad_norm": 0.3916756097057637, "learning_rate": 3.891896862199086e-05, "loss": 0.5266364216804504, "step": 178 }, { "epoch": 0.5907590759075908, "grad_norm": 0.3417854779376455, "learning_rate": 3.88939178563106e-05, "loss": 0.5626640319824219, "step": 179 }, { "epoch": 0.594059405940594, "grad_norm": 0.33526488525207704, "learning_rate": 3.886858840496727e-05, "loss": 0.6063880920410156, "step": 180 }, { "epoch": 0.5973597359735974, "grad_norm": 0.37344333250119977, "learning_rate": 3.884298064157077e-05, "loss": 0.5979235768318176, "step": 181 }, { "epoch": 0.6006600660066007, "grad_norm": 0.3835133271197793, "learning_rate": 3.881709494383612e-05, "loss": 0.6628611087799072, "step": 182 }, { "epoch": 0.6039603960396039, "grad_norm": 0.4344526004756121, "learning_rate": 3.879093169357789e-05, "loss": 0.6215270757675171, "step": 183 }, { "epoch": 0.6072607260726073, "grad_norm": 0.3644174435488244, "learning_rate": 3.876449127670452e-05, "loss": 0.6148592233657837, "step": 184 }, { "epoch": 0.6105610561056105, "grad_norm": 0.3619226265536735, "learning_rate": 3.87377740832127e-05, "loss": 0.6254778504371643, "step": 185 }, { "epoch": 0.6138613861386139, "grad_norm": 0.3492162593840536, "learning_rate": 3.871078050718155e-05, "loss": 0.6025378704071045, "step": 186 }, { "epoch": 0.6171617161716172, "grad_norm": 0.3866924759539626, "learning_rate": 3.8683510946766866e-05, "loss": 0.5887518525123596, "step": 187 }, { "epoch": 0.6204620462046204, "grad_norm": 0.3357229513721586, "learning_rate": 3.865596580419519e-05, "loss": 0.6180317401885986, "step": 188 }, { "epoch": 0.6237623762376238, "grad_norm": 0.3594949077768003, "learning_rate": 3.8628145485757925e-05, "loss": 0.5970651507377625, "step": 189 }, { "epoch": 0.6270627062706271, "grad_norm": 0.3496234009951303, "learning_rate": 3.860005040180533e-05, "loss": 0.6027296781539917, "step": 190 }, { "epoch": 0.6303630363036303, "grad_norm": 0.3830042583584045, "learning_rate": 3.857168096674044e-05, "loss": 0.6326305270195007, "step": 191 }, { "epoch": 0.6336633663366337, "grad_norm": 0.333508477943962, "learning_rate": 3.854303759901299e-05, "loss": 0.6508482694625854, "step": 192 }, { "epoch": 0.636963696369637, "grad_norm": 0.352327105927571, "learning_rate": 3.851412072111322e-05, "loss": 0.6088548302650452, "step": 193 }, { "epoch": 0.6402640264026402, "grad_norm": 0.36196379228138037, "learning_rate": 3.8484930759565645e-05, "loss": 0.5975607633590698, "step": 194 }, { "epoch": 0.6435643564356436, "grad_norm": 0.3231664855297077, "learning_rate": 3.845546814492279e-05, "loss": 0.5467930436134338, "step": 195 }, { "epoch": 0.6468646864686468, "grad_norm": 0.35556526722817444, "learning_rate": 3.8425733311758795e-05, "loss": 0.583969235420227, "step": 196 }, { "epoch": 0.6501650165016502, "grad_norm": 0.331073543443887, "learning_rate": 3.8395726698663045e-05, "loss": 0.6007376909255981, "step": 197 }, { "epoch": 0.6534653465346535, "grad_norm": 0.34786293006180385, "learning_rate": 3.836544874823368e-05, "loss": 0.5971908569335938, "step": 198 }, { "epoch": 0.6567656765676567, "grad_norm": 0.3128647628132879, "learning_rate": 3.8334899907071064e-05, "loss": 0.592069685459137, "step": 199 }, { "epoch": 0.6600660066006601, "grad_norm": 0.3308125796746202, "learning_rate": 3.830408062577121e-05, "loss": 0.6188071966171265, "step": 200 }, { "epoch": 0.6633663366336634, "grad_norm": 0.34889077565364124, "learning_rate": 3.827299135891913e-05, "loss": 0.5976923704147339, "step": 201 }, { "epoch": 0.6666666666666666, "grad_norm": 0.33443153994631497, "learning_rate": 3.8241632565082124e-05, "loss": 0.6120954155921936, "step": 202 }, { "epoch": 0.66996699669967, "grad_norm": 0.3573334503206899, "learning_rate": 3.821000470680303e-05, "loss": 0.6661979556083679, "step": 203 }, { "epoch": 0.6732673267326733, "grad_norm": 0.34662331225184934, "learning_rate": 3.8178108250593384e-05, "loss": 0.5853559970855713, "step": 204 }, { "epoch": 0.6765676567656765, "grad_norm": 0.33823171869993424, "learning_rate": 3.814594366692654e-05, "loss": 0.6648768186569214, "step": 205 }, { "epoch": 0.6798679867986799, "grad_norm": 0.4178878629038068, "learning_rate": 3.8113511430230745e-05, "loss": 0.5893838405609131, "step": 206 }, { "epoch": 0.6831683168316832, "grad_norm": 0.36858896529016355, "learning_rate": 3.808081201888214e-05, "loss": 0.6177140474319458, "step": 207 }, { "epoch": 0.6864686468646864, "grad_norm": 0.38061402245158527, "learning_rate": 3.8047845915197695e-05, "loss": 0.5793695449829102, "step": 208 }, { "epoch": 0.6897689768976898, "grad_norm": 0.3591315376932048, "learning_rate": 3.8014613605428084e-05, "loss": 0.5571605563163757, "step": 209 }, { "epoch": 0.693069306930693, "grad_norm": 0.33319862057164595, "learning_rate": 3.798111557975053e-05, "loss": 0.5945760011672974, "step": 210 }, { "epoch": 0.6963696369636964, "grad_norm": 0.3495679574237745, "learning_rate": 3.7947352332261586e-05, "loss": 0.600873589515686, "step": 211 }, { "epoch": 0.6996699669966997, "grad_norm": 0.37390147639764304, "learning_rate": 3.791332436096983e-05, "loss": 0.6234852075576782, "step": 212 }, { "epoch": 0.7029702970297029, "grad_norm": 0.3571653694610809, "learning_rate": 3.7879032167788494e-05, "loss": 0.6129578948020935, "step": 213 }, { "epoch": 0.7062706270627063, "grad_norm": 0.48971881906384135, "learning_rate": 3.784447625852812e-05, "loss": 0.6204475164413452, "step": 214 }, { "epoch": 0.7095709570957096, "grad_norm": 0.3610294548812676, "learning_rate": 3.780965714288905e-05, "loss": 0.6734122037887573, "step": 215 }, { "epoch": 0.7128712871287128, "grad_norm": 0.35396639697907356, "learning_rate": 3.777457533445393e-05, "loss": 0.5678560137748718, "step": 216 }, { "epoch": 0.7161716171617162, "grad_norm": 0.3232076597831296, "learning_rate": 3.7739231350680135e-05, "loss": 0.5784683227539062, "step": 217 }, { "epoch": 0.7194719471947195, "grad_norm": 0.3540897506756201, "learning_rate": 3.7703625712892125e-05, "loss": 0.6060354113578796, "step": 218 }, { "epoch": 0.7227722772277227, "grad_norm": 0.35008278157890194, "learning_rate": 3.766775894627376e-05, "loss": 0.6248741745948792, "step": 219 }, { "epoch": 0.7260726072607261, "grad_norm": 0.32018676747331787, "learning_rate": 3.7631631579860553e-05, "loss": 0.6014479398727417, "step": 220 }, { "epoch": 0.7293729372937293, "grad_norm": 0.32068744744726313, "learning_rate": 3.759524414653189e-05, "loss": 0.6283233761787415, "step": 221 }, { "epoch": 0.7326732673267327, "grad_norm": 0.3047460979670785, "learning_rate": 3.755859718300313e-05, "loss": 0.5710185766220093, "step": 222 }, { "epoch": 0.735973597359736, "grad_norm": 0.34698489216212486, "learning_rate": 3.75216912298177e-05, "loss": 0.6007407903671265, "step": 223 }, { "epoch": 0.7392739273927392, "grad_norm": 0.4952362221345831, "learning_rate": 3.748452683133916e-05, "loss": 0.6852575540542603, "step": 224 }, { "epoch": 0.7425742574257426, "grad_norm": 0.32106680253004655, "learning_rate": 3.7447104535743115e-05, "loss": 0.6270833611488342, "step": 225 }, { "epoch": 0.7458745874587459, "grad_norm": 0.30214814189665545, "learning_rate": 3.740942489500916e-05, "loss": 0.5925471782684326, "step": 226 }, { "epoch": 0.7491749174917491, "grad_norm": 0.3171932777170319, "learning_rate": 3.737148846491275e-05, "loss": 0.573570728302002, "step": 227 }, { "epoch": 0.7524752475247525, "grad_norm": 0.31480815810804524, "learning_rate": 3.7333295805016986e-05, "loss": 0.6088368892669678, "step": 228 }, { "epoch": 0.7557755775577558, "grad_norm": 0.3103068539492526, "learning_rate": 3.729484747866435e-05, "loss": 0.5496470332145691, "step": 229 }, { "epoch": 0.759075907590759, "grad_norm": 0.3007603199811456, "learning_rate": 3.725614405296843e-05, "loss": 0.6008220314979553, "step": 230 }, { "epoch": 0.7623762376237624, "grad_norm": 0.3007492168191884, "learning_rate": 3.721718609880551e-05, "loss": 0.5982120037078857, "step": 231 }, { "epoch": 0.7656765676567657, "grad_norm": 0.3010002181490163, "learning_rate": 3.717797419080618e-05, "loss": 0.6404559016227722, "step": 232 }, { "epoch": 0.768976897689769, "grad_norm": 0.35604106645956024, "learning_rate": 3.713850890734689e-05, "loss": 0.5875239372253418, "step": 233 }, { "epoch": 0.7722772277227723, "grad_norm": 0.33191901009333297, "learning_rate": 3.709879083054133e-05, "loss": 0.5962772369384766, "step": 234 }, { "epoch": 0.7755775577557755, "grad_norm": 0.29418628627284477, "learning_rate": 3.705882054623192e-05, "loss": 0.5764110684394836, "step": 235 }, { "epoch": 0.7788778877887789, "grad_norm": 0.30409612807603364, "learning_rate": 3.7018598643981165e-05, "loss": 0.5635858178138733, "step": 236 }, { "epoch": 0.7821782178217822, "grad_norm": 0.3039645238556037, "learning_rate": 3.69781257170629e-05, "loss": 0.5880881547927856, "step": 237 }, { "epoch": 0.7854785478547854, "grad_norm": 0.30606246597511416, "learning_rate": 3.6937402362453606e-05, "loss": 0.5644733905792236, "step": 238 }, { "epoch": 0.7887788778877888, "grad_norm": 0.328325214152846, "learning_rate": 3.689642918082358e-05, "loss": 0.6431151032447815, "step": 239 }, { "epoch": 0.7920792079207921, "grad_norm": 0.2863869456911102, "learning_rate": 3.6855206776528055e-05, "loss": 0.5848085880279541, "step": 240 }, { "epoch": 0.7953795379537953, "grad_norm": 0.3169795193025283, "learning_rate": 3.681373575759831e-05, "loss": 0.590021550655365, "step": 241 }, { "epoch": 0.7986798679867987, "grad_norm": 0.3630216059086489, "learning_rate": 3.67720167357327e-05, "loss": 0.6217919588088989, "step": 242 }, { "epoch": 0.801980198019802, "grad_norm": 0.2999270957223198, "learning_rate": 3.673005032628763e-05, "loss": 0.6075180172920227, "step": 243 }, { "epoch": 0.8052805280528053, "grad_norm": 0.35145967135780704, "learning_rate": 3.668783714826846e-05, "loss": 0.6078404188156128, "step": 244 }, { "epoch": 0.8085808580858086, "grad_norm": 0.32650805345047657, "learning_rate": 3.664537782432042e-05, "loss": 0.6297526955604553, "step": 245 }, { "epoch": 0.8118811881188119, "grad_norm": 0.32461322862254094, "learning_rate": 3.660267298071936e-05, "loss": 0.5684514045715332, "step": 246 }, { "epoch": 0.8151815181518152, "grad_norm": 0.32171296221654416, "learning_rate": 3.655972324736259e-05, "loss": 0.6192148327827454, "step": 247 }, { "epoch": 0.8184818481848185, "grad_norm": 0.3322336621503604, "learning_rate": 3.6516529257759506e-05, "loss": 0.5900243520736694, "step": 248 }, { "epoch": 0.8217821782178217, "grad_norm": 0.35183312055445004, "learning_rate": 3.6473091649022337e-05, "loss": 0.5941751599311829, "step": 249 }, { "epoch": 0.8250825082508251, "grad_norm": 0.31255833045908565, "learning_rate": 3.6429411061856645e-05, "loss": 0.5744310021400452, "step": 250 }, { "epoch": 0.8283828382838284, "grad_norm": 0.3266269251233177, "learning_rate": 3.6385488140551985e-05, "loss": 0.5985124707221985, "step": 251 }, { "epoch": 0.8316831683168316, "grad_norm": 0.30426711611593643, "learning_rate": 3.6341323532972294e-05, "loss": 0.581912636756897, "step": 252 }, { "epoch": 0.834983498349835, "grad_norm": 0.3297819735063718, "learning_rate": 3.629691789054643e-05, "loss": 0.586786150932312, "step": 253 }, { "epoch": 0.8382838283828383, "grad_norm": 0.3074133078124695, "learning_rate": 3.625227186825848e-05, "loss": 0.6312603950500488, "step": 254 }, { "epoch": 0.8415841584158416, "grad_norm": 0.33007753969064285, "learning_rate": 3.620738612463818e-05, "loss": 0.5886626243591309, "step": 255 }, { "epoch": 0.8448844884488449, "grad_norm": 0.31334340596765187, "learning_rate": 3.6162261321751114e-05, "loss": 0.5892266035079956, "step": 256 }, { "epoch": 0.8481848184818482, "grad_norm": 0.31784442826893616, "learning_rate": 3.6116898125189045e-05, "loss": 0.5472115278244019, "step": 257 }, { "epoch": 0.8514851485148515, "grad_norm": 0.3456330158902343, "learning_rate": 3.6071297204059995e-05, "loss": 0.5981796383857727, "step": 258 }, { "epoch": 0.8547854785478548, "grad_norm": 0.3377124553034101, "learning_rate": 3.6025459230978475e-05, "loss": 0.6708342432975769, "step": 259 }, { "epoch": 0.858085808580858, "grad_norm": 0.3081391395426973, "learning_rate": 3.597938488205549e-05, "loss": 0.6306079626083374, "step": 260 }, { "epoch": 0.8613861386138614, "grad_norm": 0.3398583824115319, "learning_rate": 3.59330748368886e-05, "loss": 0.6098329424858093, "step": 261 }, { "epoch": 0.8646864686468647, "grad_norm": 0.32878067719138626, "learning_rate": 3.588652977855189e-05, "loss": 0.5617724061012268, "step": 262 }, { "epoch": 0.8679867986798679, "grad_norm": 0.34962664282188816, "learning_rate": 3.58397503935859e-05, "loss": 0.5780894756317139, "step": 263 }, { "epoch": 0.8712871287128713, "grad_norm": 0.32665214019362204, "learning_rate": 3.5792737371987477e-05, "loss": 0.578921377658844, "step": 264 }, { "epoch": 0.8745874587458746, "grad_norm": 0.36673188949709323, "learning_rate": 3.574549140719962e-05, "loss": 0.614944577217102, "step": 265 }, { "epoch": 0.8778877887788779, "grad_norm": 0.3248666143164946, "learning_rate": 3.569801319610125e-05, "loss": 0.6269869208335876, "step": 266 }, { "epoch": 0.8811881188118812, "grad_norm": 0.3338123662452596, "learning_rate": 3.565030343899693e-05, "loss": 0.6045581102371216, "step": 267 }, { "epoch": 0.8844884488448845, "grad_norm": 0.31011600887091817, "learning_rate": 3.5602362839606514e-05, "loss": 0.5872907638549805, "step": 268 }, { "epoch": 0.8877887788778878, "grad_norm": 0.31857062779594814, "learning_rate": 3.55541921050548e-05, "loss": 0.6283375024795532, "step": 269 }, { "epoch": 0.8910891089108911, "grad_norm": 0.32445751859048455, "learning_rate": 3.5505791945861076e-05, "loss": 0.5747002363204956, "step": 270 }, { "epoch": 0.8943894389438944, "grad_norm": 0.2923309334474062, "learning_rate": 3.545716307592864e-05, "loss": 0.6205827593803406, "step": 271 }, { "epoch": 0.8976897689768977, "grad_norm": 0.43972579907455317, "learning_rate": 3.54083062125343e-05, "loss": 0.5987251400947571, "step": 272 }, { "epoch": 0.900990099009901, "grad_norm": 0.33194286352506225, "learning_rate": 3.535922207631776e-05, "loss": 0.6275356411933899, "step": 273 }, { "epoch": 0.9042904290429042, "grad_norm": 0.3408278730793354, "learning_rate": 3.5309911391270996e-05, "loss": 0.6097655892372131, "step": 274 }, { "epoch": 0.9075907590759076, "grad_norm": 0.3441995699777348, "learning_rate": 3.52603748847276e-05, "loss": 0.544170618057251, "step": 275 }, { "epoch": 0.9108910891089109, "grad_norm": 0.3034867763949278, "learning_rate": 3.521061328735202e-05, "loss": 0.5723366141319275, "step": 276 }, { "epoch": 0.9141914191419142, "grad_norm": 0.3091145609625042, "learning_rate": 3.516062733312879e-05, "loss": 0.5801889896392822, "step": 277 }, { "epoch": 0.9174917491749175, "grad_norm": 0.3532845546992122, "learning_rate": 3.511041775935175e-05, "loss": 0.5942766666412354, "step": 278 }, { "epoch": 0.9207920792079208, "grad_norm": 0.3192035342587887, "learning_rate": 3.50599853066131e-05, "loss": 0.5604017972946167, "step": 279 }, { "epoch": 0.9240924092409241, "grad_norm": 0.4475571406552253, "learning_rate": 3.500933071879251e-05, "loss": 0.6151460409164429, "step": 280 }, { "epoch": 0.9273927392739274, "grad_norm": 0.30946498453996385, "learning_rate": 3.495845474304616e-05, "loss": 0.5854936838150024, "step": 281 }, { "epoch": 0.9306930693069307, "grad_norm": 0.3188531409769719, "learning_rate": 3.490735812979572e-05, "loss": 0.5586672425270081, "step": 282 }, { "epoch": 0.933993399339934, "grad_norm": 0.3250546549981712, "learning_rate": 3.485604163271721e-05, "loss": 0.578475832939148, "step": 283 }, { "epoch": 0.9372937293729373, "grad_norm": 0.45030229248281484, "learning_rate": 3.4804506008730015e-05, "loss": 0.5236382484436035, "step": 284 }, { "epoch": 0.9405940594059405, "grad_norm": 0.31677157675280776, "learning_rate": 3.475275201798559e-05, "loss": 0.5964822769165039, "step": 285 }, { "epoch": 0.9438943894389439, "grad_norm": 0.3221519247617692, "learning_rate": 3.4700780423856334e-05, "loss": 0.5551598072052002, "step": 286 }, { "epoch": 0.9471947194719472, "grad_norm": 0.31322506983838, "learning_rate": 3.464859199292429e-05, "loss": 0.6095103621482849, "step": 287 }, { "epoch": 0.9504950495049505, "grad_norm": 0.33333701342858213, "learning_rate": 3.4596187494969846e-05, "loss": 0.5893416404724121, "step": 288 }, { "epoch": 0.9537953795379538, "grad_norm": 0.31167002926986764, "learning_rate": 3.454356770296039e-05, "loss": 0.5992231965065002, "step": 289 }, { "epoch": 0.9570957095709571, "grad_norm": 0.3407826991036566, "learning_rate": 3.4490733393038895e-05, "loss": 0.6071972250938416, "step": 290 }, { "epoch": 0.9603960396039604, "grad_norm": 0.321397588262469, "learning_rate": 3.443768534451248e-05, "loss": 0.5836942195892334, "step": 291 }, { "epoch": 0.9636963696369637, "grad_norm": 0.3596023570145339, "learning_rate": 3.4384424339840916e-05, "loss": 0.5707553625106812, "step": 292 }, { "epoch": 0.966996699669967, "grad_norm": 0.326365753033755, "learning_rate": 3.4330951164625075e-05, "loss": 0.5883970260620117, "step": 293 }, { "epoch": 0.9702970297029703, "grad_norm": 0.3276030981345682, "learning_rate": 3.427726660759535e-05, "loss": 0.6281589269638062, "step": 294 }, { "epoch": 0.9735973597359736, "grad_norm": 0.3559560269123216, "learning_rate": 3.422337146060003e-05, "loss": 0.6641702651977539, "step": 295 }, { "epoch": 0.976897689768977, "grad_norm": 0.34661891319338206, "learning_rate": 3.4169266518593596e-05, "loss": 0.6398966312408447, "step": 296 }, { "epoch": 0.9801980198019802, "grad_norm": 0.3392015122860613, "learning_rate": 3.411495257962501e-05, "loss": 0.6376276016235352, "step": 297 }, { "epoch": 0.9834983498349835, "grad_norm": 0.3454832175281825, "learning_rate": 3.406043044482596e-05, "loss": 0.648975133895874, "step": 298 }, { "epoch": 0.9867986798679867, "grad_norm": 0.3284679145456545, "learning_rate": 3.4005700918399016e-05, "loss": 0.6201390624046326, "step": 299 }, { "epoch": 0.9900990099009901, "grad_norm": 0.33000362479964457, "learning_rate": 3.395076480760576e-05, "loss": 0.6103875637054443, "step": 300 }, { "epoch": 0.9933993399339934, "grad_norm": 0.31707924192462417, "learning_rate": 3.3895622922754936e-05, "loss": 0.5486876368522644, "step": 301 }, { "epoch": 0.9966996699669967, "grad_norm": 0.3094164003933957, "learning_rate": 3.384027607719043e-05, "loss": 0.5980846285820007, "step": 302 }, { "epoch": 1.0, "grad_norm": 0.33089398879681, "learning_rate": 3.378472508727931e-05, "loss": 0.5986801385879517, "step": 303 }, { "epoch": 1.0033003300330032, "grad_norm": 0.4690060258405477, "learning_rate": 3.372897077239979e-05, "loss": 0.5586727857589722, "step": 304 }, { "epoch": 1.0066006600660067, "grad_norm": 0.34686786747213394, "learning_rate": 3.36730139549291e-05, "loss": 0.5393255949020386, "step": 305 }, { "epoch": 1.00990099009901, "grad_norm": 0.4023568892604613, "learning_rate": 3.361685546023143e-05, "loss": 0.5377227067947388, "step": 306 }, { "epoch": 1.0132013201320131, "grad_norm": 0.39915820884177944, "learning_rate": 3.356049611664568e-05, "loss": 0.5223784446716309, "step": 307 }, { "epoch": 1.0165016501650166, "grad_norm": 0.3654265250846575, "learning_rate": 3.350393675547328e-05, "loss": 0.5502469539642334, "step": 308 }, { "epoch": 1.0198019801980198, "grad_norm": 0.42079557297663883, "learning_rate": 3.3447178210965936e-05, "loss": 0.5626603960990906, "step": 309 }, { "epoch": 1.023102310231023, "grad_norm": 0.3684084639129366, "learning_rate": 3.3390221320313303e-05, "loss": 0.48262274265289307, "step": 310 }, { "epoch": 1.0264026402640265, "grad_norm": 0.39908786063309193, "learning_rate": 3.333306692363065e-05, "loss": 0.5850967168807983, "step": 311 }, { "epoch": 1.0297029702970297, "grad_norm": 0.44262876970078274, "learning_rate": 3.3275715863946466e-05, "loss": 0.5444281697273254, "step": 312 }, { "epoch": 1.033003300330033, "grad_norm": 0.35239079669120155, "learning_rate": 3.3218168987190004e-05, "loss": 0.5329654216766357, "step": 313 }, { "epoch": 1.0363036303630364, "grad_norm": 0.38499730860339404, "learning_rate": 3.316042714217885e-05, "loss": 0.5276832580566406, "step": 314 }, { "epoch": 1.0396039603960396, "grad_norm": 0.3928937531164494, "learning_rate": 3.310249118060636e-05, "loss": 0.5344791412353516, "step": 315 }, { "epoch": 1.0429042904290429, "grad_norm": 0.3466589226743573, "learning_rate": 3.304436195702911e-05, "loss": 0.5479785203933716, "step": 316 }, { "epoch": 1.046204620462046, "grad_norm": 0.370325309360066, "learning_rate": 3.298604032885431e-05, "loss": 0.5223082900047302, "step": 317 }, { "epoch": 1.0495049504950495, "grad_norm": 0.4271803134046634, "learning_rate": 3.292752715632713e-05, "loss": 0.5667799711227417, "step": 318 }, { "epoch": 1.0528052805280528, "grad_norm": 0.33752277032768196, "learning_rate": 3.2868823302518016e-05, "loss": 0.5194317698478699, "step": 319 }, { "epoch": 1.056105610561056, "grad_norm": 0.35801795115870316, "learning_rate": 3.2809929633309985e-05, "loss": 0.4911007285118103, "step": 320 }, { "epoch": 1.0594059405940595, "grad_norm": 0.33819516112787196, "learning_rate": 3.2750847017385826e-05, "loss": 0.5269002914428711, "step": 321 }, { "epoch": 1.0627062706270627, "grad_norm": 0.3280280196094967, "learning_rate": 3.269157632621529e-05, "loss": 0.5124789476394653, "step": 322 }, { "epoch": 1.066006600660066, "grad_norm": 0.3841029677303286, "learning_rate": 3.263211843404225e-05, "loss": 0.5483890771865845, "step": 323 }, { "epoch": 1.0693069306930694, "grad_norm": 0.348752311292252, "learning_rate": 3.25724742178718e-05, "loss": 0.5582579374313354, "step": 324 }, { "epoch": 1.0726072607260726, "grad_norm": 0.3672218653955236, "learning_rate": 3.2512644557457304e-05, "loss": 0.5662975907325745, "step": 325 }, { "epoch": 1.0759075907590758, "grad_norm": 0.339133227284404, "learning_rate": 3.2452630335287445e-05, "loss": 0.5502511858940125, "step": 326 }, { "epoch": 1.0792079207920793, "grad_norm": 0.3607463939055526, "learning_rate": 3.239243243657318e-05, "loss": 0.5614978075027466, "step": 327 }, { "epoch": 1.0825082508250825, "grad_norm": 0.3354690532522152, "learning_rate": 3.233205174923472e-05, "loss": 0.4828110635280609, "step": 328 }, { "epoch": 1.0858085808580857, "grad_norm": 0.3296040603044689, "learning_rate": 3.22714891638884e-05, "loss": 0.5437847971916199, "step": 329 }, { "epoch": 1.0891089108910892, "grad_norm": 0.3295415767468974, "learning_rate": 3.221074557383355e-05, "loss": 0.6240063309669495, "step": 330 }, { "epoch": 1.0924092409240924, "grad_norm": 0.3032628226796708, "learning_rate": 3.2149821875039325e-05, "loss": 0.5435442328453064, "step": 331 }, { "epoch": 1.0957095709570956, "grad_norm": 0.30875440813945676, "learning_rate": 3.20887189661315e-05, "loss": 0.5240401029586792, "step": 332 }, { "epoch": 1.099009900990099, "grad_norm": 0.3043121620505056, "learning_rate": 3.202743774837919e-05, "loss": 0.5227692127227783, "step": 333 }, { "epoch": 1.1023102310231023, "grad_norm": 0.3439754692795775, "learning_rate": 3.196597912568157e-05, "loss": 0.5607417821884155, "step": 334 }, { "epoch": 1.1056105610561056, "grad_norm": 0.29691798670137787, "learning_rate": 3.1904344004554536e-05, "loss": 0.5607600808143616, "step": 335 }, { "epoch": 1.108910891089109, "grad_norm": 0.32493088910689055, "learning_rate": 3.184253329411737e-05, "loss": 0.47135430574417114, "step": 336 }, { "epoch": 1.1122112211221122, "grad_norm": 0.3202945703052858, "learning_rate": 3.178054790607924e-05, "loss": 0.5708764791488647, "step": 337 }, { "epoch": 1.1155115511551155, "grad_norm": 0.3164605548495645, "learning_rate": 3.1718388754725883e-05, "loss": 0.5522497296333313, "step": 338 }, { "epoch": 1.118811881188119, "grad_norm": 0.3449586600316318, "learning_rate": 3.1656056756906e-05, "loss": 0.5556532144546509, "step": 339 }, { "epoch": 1.1221122112211221, "grad_norm": 0.3130025484639745, "learning_rate": 3.1593552832017795e-05, "loss": 0.5727676153182983, "step": 340 }, { "epoch": 1.1254125412541254, "grad_norm": 0.3195703179740936, "learning_rate": 3.153087790199541e-05, "loss": 0.5131651759147644, "step": 341 }, { "epoch": 1.1287128712871288, "grad_norm": 0.3191177264656739, "learning_rate": 3.146803289129528e-05, "loss": 0.5143063068389893, "step": 342 }, { "epoch": 1.132013201320132, "grad_norm": 0.33398757419035885, "learning_rate": 3.1405018726882595e-05, "loss": 0.509161114692688, "step": 343 }, { "epoch": 1.1353135313531353, "grad_norm": 0.33058725446313514, "learning_rate": 3.13418363382175e-05, "loss": 0.5213526487350464, "step": 344 }, { "epoch": 1.1386138613861387, "grad_norm": 0.3226863318187914, "learning_rate": 3.127848665724149e-05, "loss": 0.5465434789657593, "step": 345 }, { "epoch": 1.141914191419142, "grad_norm": 0.6179658385179007, "learning_rate": 3.1214970618363626e-05, "loss": 0.5342190265655518, "step": 346 }, { "epoch": 1.1452145214521452, "grad_norm": 0.47777163001134637, "learning_rate": 3.115128915844672e-05, "loss": 0.541754424571991, "step": 347 }, { "epoch": 1.1485148514851484, "grad_norm": 0.33931974771490697, "learning_rate": 3.10874432167936e-05, "loss": 0.5318331122398376, "step": 348 }, { "epoch": 1.1518151815181519, "grad_norm": 0.32111740987941506, "learning_rate": 3.1023433735133134e-05, "loss": 0.4972509741783142, "step": 349 }, { "epoch": 1.155115511551155, "grad_norm": 0.30074948382432587, "learning_rate": 3.095926165760647e-05, "loss": 0.5417294502258301, "step": 350 }, { "epoch": 1.1584158415841583, "grad_norm": 0.3410522798436207, "learning_rate": 3.089492793075302e-05, "loss": 0.554945707321167, "step": 351 }, { "epoch": 1.1617161716171618, "grad_norm": 0.3254774061643724, "learning_rate": 3.083043350349653e-05, "loss": 0.5204564929008484, "step": 352 }, { "epoch": 1.165016501650165, "grad_norm": 0.3088402728006412, "learning_rate": 3.076577932713108e-05, "loss": 0.4856947064399719, "step": 353 }, { "epoch": 1.1683168316831682, "grad_norm": 0.2896918095760776, "learning_rate": 3.0700966355307055e-05, "loss": 0.5269368886947632, "step": 354 }, { "epoch": 1.1716171617161717, "grad_norm": 0.32747543865706225, "learning_rate": 3.063599554401708e-05, "loss": 0.5811939239501953, "step": 355 }, { "epoch": 1.174917491749175, "grad_norm": 0.29324577597304957, "learning_rate": 3.057086785158189e-05, "loss": 0.5636904239654541, "step": 356 }, { "epoch": 1.1782178217821782, "grad_norm": 0.31779620334412045, "learning_rate": 3.050558423863626e-05, "loss": 0.546089768409729, "step": 357 }, { "epoch": 1.1815181518151816, "grad_norm": 0.3093045991582328, "learning_rate": 3.0440145668114774e-05, "loss": 0.5239901542663574, "step": 358 }, { "epoch": 1.1848184818481848, "grad_norm": 0.31848934088179354, "learning_rate": 3.0374553105237637e-05, "loss": 0.5833466053009033, "step": 359 }, { "epoch": 1.188118811881188, "grad_norm": 0.33803859097620154, "learning_rate": 3.0308807517496456e-05, "loss": 0.5060774087905884, "step": 360 }, { "epoch": 1.1914191419141915, "grad_norm": 0.31145081064149094, "learning_rate": 3.0242909874639953e-05, "loss": 0.5164307355880737, "step": 361 }, { "epoch": 1.1947194719471947, "grad_norm": 0.29765085452905116, "learning_rate": 3.0176861148659672e-05, "loss": 0.49949395656585693, "step": 362 }, { "epoch": 1.198019801980198, "grad_norm": 0.3296486034239661, "learning_rate": 3.0110662313775623e-05, "loss": 0.5581181049346924, "step": 363 }, { "epoch": 1.2013201320132012, "grad_norm": 0.3116631729941006, "learning_rate": 3.0044314346421938e-05, "loss": 0.5657376646995544, "step": 364 }, { "epoch": 1.2046204620462047, "grad_norm": 0.33012695180790946, "learning_rate": 2.9977818225232443e-05, "loss": 0.5269935131072998, "step": 365 }, { "epoch": 1.2079207920792079, "grad_norm": 0.31869984664933465, "learning_rate": 2.991117493102626e-05, "loss": 0.5385931730270386, "step": 366 }, { "epoch": 1.2112211221122111, "grad_norm": 0.30491226427581125, "learning_rate": 2.984438544679329e-05, "loss": 0.5615143179893494, "step": 367 }, { "epoch": 1.2145214521452146, "grad_norm": 0.32195999076013593, "learning_rate": 2.9777450757679754e-05, "loss": 0.5175333023071289, "step": 368 }, { "epoch": 1.2178217821782178, "grad_norm": 0.30930257180361886, "learning_rate": 2.971037185097364e-05, "loss": 0.565494179725647, "step": 369 }, { "epoch": 1.221122112211221, "grad_norm": 0.34237830645177886, "learning_rate": 2.9643149716090146e-05, "loss": 0.5519120693206787, "step": 370 }, { "epoch": 1.2244224422442245, "grad_norm": 0.30959351563618437, "learning_rate": 2.9575785344557114e-05, "loss": 0.49374374747276306, "step": 371 }, { "epoch": 1.2277227722772277, "grad_norm": 0.31310768619122714, "learning_rate": 2.950827973000034e-05, "loss": 0.5608875751495361, "step": 372 }, { "epoch": 1.231023102310231, "grad_norm": 0.31986895424613543, "learning_rate": 2.944063386812899e-05, "loss": 0.5866271257400513, "step": 373 }, { "epoch": 1.2343234323432344, "grad_norm": 0.3359900469491975, "learning_rate": 2.9372848756720867e-05, "loss": 0.5342913269996643, "step": 374 }, { "epoch": 1.2376237623762376, "grad_norm": 0.2956484140793021, "learning_rate": 2.9304925395607696e-05, "loss": 0.5539537668228149, "step": 375 }, { "epoch": 1.2409240924092408, "grad_norm": 0.3239136306261367, "learning_rate": 2.9236864786660423e-05, "loss": 0.5614147186279297, "step": 376 }, { "epoch": 1.2442244224422443, "grad_norm": 0.3311932744032855, "learning_rate": 2.9168667933774356e-05, "loss": 0.46689367294311523, "step": 377 }, { "epoch": 1.2475247524752475, "grad_norm": 0.3291299090174619, "learning_rate": 2.910033584285444e-05, "loss": 0.5383083820343018, "step": 378 }, { "epoch": 1.2508250825082508, "grad_norm": 0.3013900588246958, "learning_rate": 2.903186952180037e-05, "loss": 0.5349752902984619, "step": 379 }, { "epoch": 1.2541254125412542, "grad_norm": 0.3219145450840317, "learning_rate": 2.8963269980491743e-05, "loss": 0.5792303681373596, "step": 380 }, { "epoch": 1.2574257425742574, "grad_norm": 0.2840550960191948, "learning_rate": 2.8894538230773147e-05, "loss": 0.524924099445343, "step": 381 }, { "epoch": 1.2607260726072607, "grad_norm": 0.3172399675943548, "learning_rate": 2.882567528643925e-05, "loss": 0.5137406587600708, "step": 382 }, { "epoch": 1.2640264026402641, "grad_norm": 0.2893676822687234, "learning_rate": 2.8756682163219857e-05, "loss": 0.5196574926376343, "step": 383 }, { "epoch": 1.2673267326732673, "grad_norm": 0.31363904787626334, "learning_rate": 2.8687559878764903e-05, "loss": 0.585644006729126, "step": 384 }, { "epoch": 1.2706270627062706, "grad_norm": 0.3310272877884813, "learning_rate": 2.8618309452629445e-05, "loss": 0.5973786115646362, "step": 385 }, { "epoch": 1.273927392739274, "grad_norm": 0.3201222210217655, "learning_rate": 2.854893190625865e-05, "loss": 0.5909825563430786, "step": 386 }, { "epoch": 1.2772277227722773, "grad_norm": 0.3507731714316878, "learning_rate": 2.84794282629727e-05, "loss": 0.5903690457344055, "step": 387 }, { "epoch": 1.2805280528052805, "grad_norm": 0.31011243056320775, "learning_rate": 2.840979954795171e-05, "loss": 0.5316457152366638, "step": 388 }, { "epoch": 1.283828382838284, "grad_norm": 0.32950464198309637, "learning_rate": 2.8340046788220613e-05, "loss": 0.5080389976501465, "step": 389 }, { "epoch": 1.2871287128712872, "grad_norm": 0.37769184930606736, "learning_rate": 2.8270171012633994e-05, "loss": 0.6137889623641968, "step": 390 }, { "epoch": 1.2904290429042904, "grad_norm": 0.34430823745531935, "learning_rate": 2.8200173251860928e-05, "loss": 0.5433805584907532, "step": 391 }, { "epoch": 1.2937293729372938, "grad_norm": 0.356563736773021, "learning_rate": 2.8130054538369775e-05, "loss": 0.4965590834617615, "step": 392 }, { "epoch": 1.297029702970297, "grad_norm": 0.29380923244218154, "learning_rate": 2.805981590641295e-05, "loss": 0.5361340045928955, "step": 393 }, { "epoch": 1.3003300330033003, "grad_norm": 0.31403525376793245, "learning_rate": 2.7989458392011678e-05, "loss": 0.47011327743530273, "step": 394 }, { "epoch": 1.3036303630363038, "grad_norm": 0.30710914438533876, "learning_rate": 2.7918983032940666e-05, "loss": 0.5893687605857849, "step": 395 }, { "epoch": 1.306930693069307, "grad_norm": 0.3126943781985397, "learning_rate": 2.7848390868712886e-05, "loss": 0.5219327211380005, "step": 396 }, { "epoch": 1.3102310231023102, "grad_norm": 0.35585146532127665, "learning_rate": 2.7777682940564142e-05, "loss": 0.5652155876159668, "step": 397 }, { "epoch": 1.3135313531353137, "grad_norm": 0.41906023992763497, "learning_rate": 2.7706860291437784e-05, "loss": 0.5361950397491455, "step": 398 }, { "epoch": 1.316831683168317, "grad_norm": 0.29071400108766793, "learning_rate": 2.763592396596929e-05, "loss": 0.5355206727981567, "step": 399 }, { "epoch": 1.3201320132013201, "grad_norm": 0.298123677847084, "learning_rate": 2.756487501047086e-05, "loss": 0.5082858800888062, "step": 400 }, { "epoch": 1.3234323432343233, "grad_norm": 0.3144050740212562, "learning_rate": 2.7493714472916013e-05, "loss": 0.5282934904098511, "step": 401 }, { "epoch": 1.3267326732673268, "grad_norm": 0.29396121691648713, "learning_rate": 2.7422443402924074e-05, "loss": 0.5502887964248657, "step": 402 }, { "epoch": 1.33003300330033, "grad_norm": 0.2854429234726643, "learning_rate": 2.7351062851744747e-05, "loss": 0.5374204516410828, "step": 403 }, { "epoch": 1.3333333333333333, "grad_norm": 0.30308752538818784, "learning_rate": 2.7279573872242574e-05, "loss": 0.5602293014526367, "step": 404 }, { "epoch": 1.3366336633663367, "grad_norm": 0.30975657746221447, "learning_rate": 2.7207977518881418e-05, "loss": 0.5321286916732788, "step": 405 }, { "epoch": 1.33993399339934, "grad_norm": 0.28965457921713383, "learning_rate": 2.713627484770892e-05, "loss": 0.5523560047149658, "step": 406 }, { "epoch": 1.3432343234323432, "grad_norm": 0.30598816879566076, "learning_rate": 2.706446691634089e-05, "loss": 0.47019705176353455, "step": 407 }, { "epoch": 1.3465346534653464, "grad_norm": 0.2977261513860205, "learning_rate": 2.6992554783945748e-05, "loss": 0.540359616279602, "step": 408 }, { "epoch": 1.3498349834983498, "grad_norm": 0.2845048826043699, "learning_rate": 2.6920539511228874e-05, "loss": 0.561464786529541, "step": 409 }, { "epoch": 1.353135313531353, "grad_norm": 0.2939741197740927, "learning_rate": 2.6848422160416956e-05, "loss": 0.5429259538650513, "step": 410 }, { "epoch": 1.3564356435643563, "grad_norm": 0.2968609589915083, "learning_rate": 2.677620379524237e-05, "loss": 0.5452640652656555, "step": 411 }, { "epoch": 1.3597359735973598, "grad_norm": 0.28949363661635646, "learning_rate": 2.670388548092741e-05, "loss": 0.49627864360809326, "step": 412 }, { "epoch": 1.363036303630363, "grad_norm": 0.328169978832012, "learning_rate": 2.663146828416867e-05, "loss": 0.5331633687019348, "step": 413 }, { "epoch": 1.3663366336633662, "grad_norm": 0.2926434963884909, "learning_rate": 2.6558953273121216e-05, "loss": 0.5447151064872742, "step": 414 }, { "epoch": 1.3696369636963697, "grad_norm": 0.2863360845432002, "learning_rate": 2.648634151738292e-05, "loss": 0.5467007160186768, "step": 415 }, { "epoch": 1.372937293729373, "grad_norm": 0.33044933855099695, "learning_rate": 2.6413634087978602e-05, "loss": 0.5804279446601868, "step": 416 }, { "epoch": 1.3762376237623761, "grad_norm": 0.29168904019746145, "learning_rate": 2.63408320573443e-05, "loss": 0.5323517322540283, "step": 417 }, { "epoch": 1.3795379537953796, "grad_norm": 0.3046417110987717, "learning_rate": 2.6267936499311402e-05, "loss": 0.5452409982681274, "step": 418 }, { "epoch": 1.3828382838283828, "grad_norm": 0.2878853361033164, "learning_rate": 2.619494848909084e-05, "loss": 0.4622665047645569, "step": 419 }, { "epoch": 1.386138613861386, "grad_norm": 0.3129938954769346, "learning_rate": 2.6121869103257206e-05, "loss": 0.531772255897522, "step": 420 }, { "epoch": 1.3894389438943895, "grad_norm": 0.3044320552061303, "learning_rate": 2.6048699419732897e-05, "loss": 0.519554853439331, "step": 421 }, { "epoch": 1.3927392739273927, "grad_norm": 0.32616258357306027, "learning_rate": 2.5975440517772187e-05, "loss": 0.545585572719574, "step": 422 }, { "epoch": 1.396039603960396, "grad_norm": 0.297995845019565, "learning_rate": 2.5902093477945345e-05, "loss": 0.5641547441482544, "step": 423 }, { "epoch": 1.3993399339933994, "grad_norm": 0.28406971495281874, "learning_rate": 2.5828659382122655e-05, "loss": 0.5578028559684753, "step": 424 }, { "epoch": 1.4026402640264026, "grad_norm": 0.35618435421860006, "learning_rate": 2.5755139313458484e-05, "loss": 0.5931404232978821, "step": 425 }, { "epoch": 1.4059405940594059, "grad_norm": 0.3227282264542969, "learning_rate": 2.5681534356375314e-05, "loss": 0.5486891865730286, "step": 426 }, { "epoch": 1.4092409240924093, "grad_norm": 0.31220449886262164, "learning_rate": 2.5607845596547706e-05, "loss": 0.5007671117782593, "step": 427 }, { "epoch": 1.4125412541254125, "grad_norm": 0.2970377848116104, "learning_rate": 2.5534074120886346e-05, "loss": 0.5044519901275635, "step": 428 }, { "epoch": 1.4158415841584158, "grad_norm": 0.30667327850480125, "learning_rate": 2.5460221017521952e-05, "loss": 0.5227789878845215, "step": 429 }, { "epoch": 1.4191419141914192, "grad_norm": 0.2902458759439887, "learning_rate": 2.538628737578926e-05, "loss": 0.5530189871788025, "step": 430 }, { "epoch": 1.4224422442244224, "grad_norm": 0.3114416510328153, "learning_rate": 2.5312274286210966e-05, "loss": 0.508142352104187, "step": 431 }, { "epoch": 1.4257425742574257, "grad_norm": 0.30284970816559353, "learning_rate": 2.523818284048159e-05, "loss": 0.5497263669967651, "step": 432 }, { "epoch": 1.4290429042904291, "grad_norm": 0.3619418905679721, "learning_rate": 2.5164014131451443e-05, "loss": 0.5477034449577332, "step": 433 }, { "epoch": 1.4323432343234324, "grad_norm": 0.28668741491270383, "learning_rate": 2.508976925311045e-05, "loss": 0.5091728568077087, "step": 434 }, { "epoch": 1.4356435643564356, "grad_norm": 0.2922234358135184, "learning_rate": 2.501544930057203e-05, "loss": 0.5022713541984558, "step": 435 }, { "epoch": 1.438943894389439, "grad_norm": 0.29994035273286174, "learning_rate": 2.494105537005697e-05, "loss": 0.5401599407196045, "step": 436 }, { "epoch": 1.4422442244224423, "grad_norm": 0.27863085551634303, "learning_rate": 2.4866588558877208e-05, "loss": 0.5632063150405884, "step": 437 }, { "epoch": 1.4455445544554455, "grad_norm": 0.2968792338733857, "learning_rate": 2.479204996541969e-05, "loss": 0.552355170249939, "step": 438 }, { "epoch": 1.448844884488449, "grad_norm": 0.3222205976590156, "learning_rate": 2.4717440689130154e-05, "loss": 0.5604996681213379, "step": 439 }, { "epoch": 1.4521452145214522, "grad_norm": 0.2781451863798608, "learning_rate": 2.4642761830496893e-05, "loss": 0.4961245656013489, "step": 440 }, { "epoch": 1.4554455445544554, "grad_norm": 0.3327533816855903, "learning_rate": 2.4568014491034565e-05, "loss": 0.5403590202331543, "step": 441 }, { "epoch": 1.4587458745874589, "grad_norm": 0.2944499869326328, "learning_rate": 2.4493199773267902e-05, "loss": 0.4753378629684448, "step": 442 }, { "epoch": 1.462046204620462, "grad_norm": 0.30936599048377306, "learning_rate": 2.4418318780715477e-05, "loss": 0.5125438570976257, "step": 443 }, { "epoch": 1.4653465346534653, "grad_norm": 0.3047486735791836, "learning_rate": 2.434337261787342e-05, "loss": 0.5670269727706909, "step": 444 }, { "epoch": 1.4686468646864688, "grad_norm": 0.3348418102837006, "learning_rate": 2.426836239019911e-05, "loss": 0.5538198947906494, "step": 445 }, { "epoch": 1.471947194719472, "grad_norm": 0.2790312641462961, "learning_rate": 2.4193289204094893e-05, "loss": 0.5012328028678894, "step": 446 }, { "epoch": 1.4752475247524752, "grad_norm": 0.30485310749783334, "learning_rate": 2.4118154166891762e-05, "loss": 0.538119912147522, "step": 447 }, { "epoch": 1.4785478547854787, "grad_norm": 0.32398781026753815, "learning_rate": 2.4042958386833003e-05, "loss": 0.5252339839935303, "step": 448 }, { "epoch": 1.481848184818482, "grad_norm": 0.326928536480608, "learning_rate": 2.3967702973057853e-05, "loss": 0.5367081761360168, "step": 449 }, { "epoch": 1.4851485148514851, "grad_norm": 0.3044938562463835, "learning_rate": 2.3892389035585167e-05, "loss": 0.5091884136199951, "step": 450 }, { "epoch": 1.4884488448844886, "grad_norm": 0.2897824690201277, "learning_rate": 2.3817017685297016e-05, "loss": 0.5079891681671143, "step": 451 }, { "epoch": 1.4917491749174918, "grad_norm": 0.2966882318097961, "learning_rate": 2.3741590033922313e-05, "loss": 0.511939287185669, "step": 452 }, { "epoch": 1.495049504950495, "grad_norm": 0.28797637565211376, "learning_rate": 2.3666107194020404e-05, "loss": 0.5070478916168213, "step": 453 }, { "epoch": 1.4983498349834983, "grad_norm": 0.29050652670321586, "learning_rate": 2.3590570278964682e-05, "loss": 0.547492504119873, "step": 454 }, { "epoch": 1.5016501650165015, "grad_norm": 0.311874965448668, "learning_rate": 2.3514980402926132e-05, "loss": 0.5386558771133423, "step": 455 }, { "epoch": 1.504950495049505, "grad_norm": 0.26980126113979913, "learning_rate": 2.3439338680856943e-05, "loss": 0.48668172955513, "step": 456 }, { "epoch": 1.5082508250825084, "grad_norm": 0.31689121328788056, "learning_rate": 2.3363646228474002e-05, "loss": 0.5497942566871643, "step": 457 }, { "epoch": 1.5115511551155114, "grad_norm": 0.3648919358675907, "learning_rate": 2.328790416224248e-05, "loss": 0.5267748832702637, "step": 458 }, { "epoch": 1.5148514851485149, "grad_norm": 0.3191029117024018, "learning_rate": 2.3212113599359368e-05, "loss": 0.5578982830047607, "step": 459 }, { "epoch": 1.5181518151815183, "grad_norm": 0.30610891906133464, "learning_rate": 2.3136275657736956e-05, "loss": 0.5136545896530151, "step": 460 }, { "epoch": 1.5214521452145213, "grad_norm": 0.28466532575384307, "learning_rate": 2.3060391455986403e-05, "loss": 0.5718669891357422, "step": 461 }, { "epoch": 1.5247524752475248, "grad_norm": 0.3064265170567389, "learning_rate": 2.2984462113401184e-05, "loss": 0.5427108407020569, "step": 462 }, { "epoch": 1.528052805280528, "grad_norm": 0.28495826208338726, "learning_rate": 2.2908488749940596e-05, "loss": 0.5293564200401306, "step": 463 }, { "epoch": 1.5313531353135312, "grad_norm": 0.3073240786964915, "learning_rate": 2.2832472486213275e-05, "loss": 0.550743579864502, "step": 464 }, { "epoch": 1.5346534653465347, "grad_norm": 0.30789089349395116, "learning_rate": 2.2756414443460602e-05, "loss": 0.5957387685775757, "step": 465 }, { "epoch": 1.537953795379538, "grad_norm": 0.2840660845057486, "learning_rate": 2.2680315743540234e-05, "loss": 0.4994407892227173, "step": 466 }, { "epoch": 1.5412541254125411, "grad_norm": 0.2912314912557071, "learning_rate": 2.260417750890949e-05, "loss": 0.5120857954025269, "step": 467 }, { "epoch": 1.5445544554455446, "grad_norm": 0.3024618438133355, "learning_rate": 2.2528000862608845e-05, "loss": 0.5727359056472778, "step": 468 }, { "epoch": 1.5478547854785478, "grad_norm": 0.30379584493476613, "learning_rate": 2.2451786928245344e-05, "loss": 0.584964394569397, "step": 469 }, { "epoch": 1.551155115511551, "grad_norm": 0.2782374360382863, "learning_rate": 2.237553682997603e-05, "loss": 0.5507112741470337, "step": 470 }, { "epoch": 1.5544554455445545, "grad_norm": 0.26333814455393634, "learning_rate": 2.2299251692491364e-05, "loss": 0.49136701226234436, "step": 471 }, { "epoch": 1.5577557755775577, "grad_norm": 0.31673569076077385, "learning_rate": 2.2222932640998635e-05, "loss": 0.5374805927276611, "step": 472 }, { "epoch": 1.561056105610561, "grad_norm": 0.29370656251116817, "learning_rate": 2.2146580801205362e-05, "loss": 0.523996114730835, "step": 473 }, { "epoch": 1.5643564356435644, "grad_norm": 0.27277397989040114, "learning_rate": 2.207019729930271e-05, "loss": 0.48198428750038147, "step": 474 }, { "epoch": 1.5676567656765676, "grad_norm": 0.2861287068823064, "learning_rate": 2.199378326194883e-05, "loss": 0.5148699879646301, "step": 475 }, { "epoch": 1.5709570957095709, "grad_norm": 0.2981231032466442, "learning_rate": 2.1917339816252303e-05, "loss": 0.5297671556472778, "step": 476 }, { "epoch": 1.5742574257425743, "grad_norm": 0.2775943923870632, "learning_rate": 2.1840868089755465e-05, "loss": 0.5082278847694397, "step": 477 }, { "epoch": 1.5775577557755776, "grad_norm": 0.2988631140370514, "learning_rate": 2.176436921041779e-05, "loss": 0.4755392372608185, "step": 478 }, { "epoch": 1.5808580858085808, "grad_norm": 0.28707182004966697, "learning_rate": 2.1687844306599275e-05, "loss": 0.5249454975128174, "step": 479 }, { "epoch": 1.5841584158415842, "grad_norm": 0.3023499942723386, "learning_rate": 2.161129450704376e-05, "loss": 0.5626166462898254, "step": 480 }, { "epoch": 1.5874587458745875, "grad_norm": 0.28182475866947054, "learning_rate": 2.1534720940862318e-05, "loss": 0.5590533018112183, "step": 481 }, { "epoch": 1.5907590759075907, "grad_norm": 0.2724331542693392, "learning_rate": 2.1458124737516557e-05, "loss": 0.5146170854568481, "step": 482 }, { "epoch": 1.5940594059405941, "grad_norm": 0.28834268248771533, "learning_rate": 2.1381507026802007e-05, "loss": 0.5633066296577454, "step": 483 }, { "epoch": 1.5973597359735974, "grad_norm": 0.29376551657635425, "learning_rate": 2.130486893883141e-05, "loss": 0.5273865461349487, "step": 484 }, { "epoch": 1.6006600660066006, "grad_norm": 0.277893471974935, "learning_rate": 2.1228211604018088e-05, "loss": 0.5040723085403442, "step": 485 }, { "epoch": 1.603960396039604, "grad_norm": 0.2901419412347278, "learning_rate": 2.1151536153059254e-05, "loss": 0.5254411697387695, "step": 486 }, { "epoch": 1.6072607260726073, "grad_norm": 0.29340041503520936, "learning_rate": 2.1074843716919323e-05, "loss": 0.5789728760719299, "step": 487 }, { "epoch": 1.6105610561056105, "grad_norm": 0.2858502686555999, "learning_rate": 2.0998135426813245e-05, "loss": 0.5521235466003418, "step": 488 }, { "epoch": 1.613861386138614, "grad_norm": 0.2770947277408911, "learning_rate": 2.092141241418984e-05, "loss": 0.4702959954738617, "step": 489 }, { "epoch": 1.6171617161716172, "grad_norm": 0.29713285242144816, "learning_rate": 2.0844675810715046e-05, "loss": 0.4960707128047943, "step": 490 }, { "epoch": 1.6204620462046204, "grad_norm": 0.2800759957297699, "learning_rate": 2.076792674825529e-05, "loss": 0.5334826111793518, "step": 491 }, { "epoch": 1.6237623762376239, "grad_norm": 0.4465546145157964, "learning_rate": 2.0691166358860775e-05, "loss": 0.5604894161224365, "step": 492 }, { "epoch": 1.627062706270627, "grad_norm": 0.2895889767199155, "learning_rate": 2.061439577474875e-05, "loss": 0.5565654635429382, "step": 493 }, { "epoch": 1.6303630363036303, "grad_norm": 0.2663082120203026, "learning_rate": 2.0537616128286875e-05, "loss": 0.541640043258667, "step": 494 }, { "epoch": 1.6336633663366338, "grad_norm": 0.27975047407467746, "learning_rate": 2.0460828551976436e-05, "loss": 0.5247132182121277, "step": 495 }, { "epoch": 1.636963696369637, "grad_norm": 0.30554958978585, "learning_rate": 2.0384034178435727e-05, "loss": 0.533937394618988, "step": 496 }, { "epoch": 1.6402640264026402, "grad_norm": 0.29094539458240765, "learning_rate": 2.0307234140383264e-05, "loss": 0.5857927799224854, "step": 497 }, { "epoch": 1.6435643564356437, "grad_norm": 0.2718482098386275, "learning_rate": 2.0230429570621134e-05, "loss": 0.5191807746887207, "step": 498 }, { "epoch": 1.6468646864686467, "grad_norm": 0.28523897670587156, "learning_rate": 2.0153621602018276e-05, "loss": 0.5255881547927856, "step": 499 }, { "epoch": 1.6501650165016502, "grad_norm": 0.27057309315143646, "learning_rate": 2.0076811367493736e-05, "loss": 0.5134017467498779, "step": 500 } ], "logging_steps": 1, "max_steps": 909, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 699790582349824.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }