{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 909, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033003300330033004, "grad_norm": 10.81499361768409, "learning_rate": 0.0, "loss": 1.2079360485076904, "step": 1 }, { "epoch": 0.006600660066006601, "grad_norm": 10.226770877445293, "learning_rate": 4.395604395604396e-07, "loss": 1.123347520828247, "step": 2 }, { "epoch": 0.009900990099009901, "grad_norm": 11.292644267807786, "learning_rate": 8.791208791208792e-07, "loss": 1.261695384979248, "step": 3 }, { "epoch": 0.013201320132013201, "grad_norm": 10.504638106263508, "learning_rate": 1.3186813186813187e-06, "loss": 1.1276888847351074, "step": 4 }, { "epoch": 0.0165016501650165, "grad_norm": 10.822100601159539, "learning_rate": 1.7582417582417585e-06, "loss": 1.2254480123519897, "step": 5 }, { "epoch": 0.019801980198019802, "grad_norm": 9.905516433474448, "learning_rate": 2.197802197802198e-06, "loss": 1.1809396743774414, "step": 6 }, { "epoch": 0.0231023102310231, "grad_norm": 9.323364829402967, "learning_rate": 2.6373626373626375e-06, "loss": 1.2000095844268799, "step": 7 }, { "epoch": 0.026402640264026403, "grad_norm": 6.706098746162178, "learning_rate": 3.0769230769230774e-06, "loss": 1.0248074531555176, "step": 8 }, { "epoch": 0.0297029702970297, "grad_norm": 5.761138380327878, "learning_rate": 3.516483516483517e-06, "loss": 1.0840561389923096, "step": 9 }, { "epoch": 0.033003300330033, "grad_norm": 2.7364343552329315, "learning_rate": 3.9560439560439565e-06, "loss": 0.955639123916626, "step": 10 }, { "epoch": 0.036303630363036306, "grad_norm": 2.113810438625661, "learning_rate": 4.395604395604396e-06, "loss": 0.9281604290008545, "step": 11 }, { "epoch": 0.039603960396039604, "grad_norm": 1.849238684536393, "learning_rate": 4.8351648351648355e-06, "loss": 0.9079018831253052, "step": 12 }, { "epoch": 0.0429042904290429, "grad_norm": 1.6747171029255208, "learning_rate": 5.274725274725275e-06, "loss": 0.9039217233657837, "step": 13 }, { "epoch": 0.0462046204620462, "grad_norm": 2.0121666555693416, "learning_rate": 5.7142857142857145e-06, "loss": 0.8910936117172241, "step": 14 }, { "epoch": 0.04950495049504951, "grad_norm": 2.0600124028897526, "learning_rate": 6.153846153846155e-06, "loss": 0.895532488822937, "step": 15 }, { "epoch": 0.052805280528052806, "grad_norm": 2.0613449368510044, "learning_rate": 6.5934065934065935e-06, "loss": 0.8889240622520447, "step": 16 }, { "epoch": 0.056105610561056105, "grad_norm": 1.785450637059245, "learning_rate": 7.032967032967034e-06, "loss": 0.8499570488929749, "step": 17 }, { "epoch": 0.0594059405940594, "grad_norm": 1.5894161631201256, "learning_rate": 7.472527472527473e-06, "loss": 0.839992105960846, "step": 18 }, { "epoch": 0.0627062706270627, "grad_norm": 1.1904834264503976, "learning_rate": 7.912087912087913e-06, "loss": 0.7718420028686523, "step": 19 }, { "epoch": 0.066006600660066, "grad_norm": 1.0397335564670163, "learning_rate": 8.351648351648353e-06, "loss": 0.7865867614746094, "step": 20 }, { "epoch": 0.06930693069306931, "grad_norm": 0.8314739102256958, "learning_rate": 8.791208791208792e-06, "loss": 0.7982739806175232, "step": 21 }, { "epoch": 0.07260726072607261, "grad_norm": 0.6542597896181986, "learning_rate": 9.230769230769232e-06, "loss": 0.7846421599388123, "step": 22 }, { "epoch": 0.07590759075907591, "grad_norm": 0.6269389928815381, "learning_rate": 9.670329670329671e-06, "loss": 0.7005743980407715, "step": 23 }, { "epoch": 0.07920792079207921, "grad_norm": 0.6603922634859757, "learning_rate": 1.010989010989011e-05, "loss": 0.7084314227104187, "step": 24 }, { "epoch": 0.08250825082508251, "grad_norm": 0.6856248928818359, "learning_rate": 1.054945054945055e-05, "loss": 0.7310304641723633, "step": 25 }, { "epoch": 0.0858085808580858, "grad_norm": 0.5728331825854258, "learning_rate": 1.098901098901099e-05, "loss": 0.7056888341903687, "step": 26 }, { "epoch": 0.0891089108910891, "grad_norm": 0.47956485465857923, "learning_rate": 1.1428571428571429e-05, "loss": 0.6987950205802917, "step": 27 }, { "epoch": 0.0924092409240924, "grad_norm": 0.47407141179043555, "learning_rate": 1.186813186813187e-05, "loss": 0.7319807410240173, "step": 28 }, { "epoch": 0.09570957095709572, "grad_norm": 0.4856924244101555, "learning_rate": 1.230769230769231e-05, "loss": 0.6983063220977783, "step": 29 }, { "epoch": 0.09900990099009901, "grad_norm": 0.49122925908544063, "learning_rate": 1.2747252747252747e-05, "loss": 0.70492023229599, "step": 30 }, { "epoch": 0.10231023102310231, "grad_norm": 0.4556788168903923, "learning_rate": 1.3186813186813187e-05, "loss": 0.7376629114151001, "step": 31 }, { "epoch": 0.10561056105610561, "grad_norm": 0.4272838300827657, "learning_rate": 1.3626373626373627e-05, "loss": 0.6623936295509338, "step": 32 }, { "epoch": 0.10891089108910891, "grad_norm": 0.40886227927218277, "learning_rate": 1.4065934065934068e-05, "loss": 0.7136330604553223, "step": 33 }, { "epoch": 0.11221122112211221, "grad_norm": 0.37821179606418975, "learning_rate": 1.4505494505494506e-05, "loss": 0.7113747596740723, "step": 34 }, { "epoch": 0.11551155115511551, "grad_norm": 0.4538557716923258, "learning_rate": 1.4945054945054947e-05, "loss": 0.8252867460250854, "step": 35 }, { "epoch": 0.1188118811881188, "grad_norm": 0.3875808052898815, "learning_rate": 1.5384615384615387e-05, "loss": 0.7406599521636963, "step": 36 }, { "epoch": 0.12211221122112212, "grad_norm": 0.3503240143986989, "learning_rate": 1.5824175824175826e-05, "loss": 0.6572297811508179, "step": 37 }, { "epoch": 0.1254125412541254, "grad_norm": 0.3779655372487014, "learning_rate": 1.6263736263736265e-05, "loss": 0.7520949840545654, "step": 38 }, { "epoch": 0.12871287128712872, "grad_norm": 0.36968690038350466, "learning_rate": 1.6703296703296707e-05, "loss": 0.6861323118209839, "step": 39 }, { "epoch": 0.132013201320132, "grad_norm": 0.3724328241107235, "learning_rate": 1.7142857142857142e-05, "loss": 0.6818518042564392, "step": 40 }, { "epoch": 0.1353135313531353, "grad_norm": 0.35542054984937593, "learning_rate": 1.7582417582417584e-05, "loss": 0.6663186550140381, "step": 41 }, { "epoch": 0.13861386138613863, "grad_norm": 0.3441266617586836, "learning_rate": 1.8021978021978023e-05, "loss": 0.6492191553115845, "step": 42 }, { "epoch": 0.1419141914191419, "grad_norm": 0.3478448092762331, "learning_rate": 1.8461538461538465e-05, "loss": 0.6444741487503052, "step": 43 }, { "epoch": 0.14521452145214522, "grad_norm": 0.34951148057960574, "learning_rate": 1.8901098901098903e-05, "loss": 0.6476814150810242, "step": 44 }, { "epoch": 0.1485148514851485, "grad_norm": 0.3356672452160599, "learning_rate": 1.9340659340659342e-05, "loss": 0.6660827994346619, "step": 45 }, { "epoch": 0.15181518151815182, "grad_norm": 0.30809956365723695, "learning_rate": 1.9780219780219784e-05, "loss": 0.6924091577529907, "step": 46 }, { "epoch": 0.1551155115511551, "grad_norm": 0.9030699054312887, "learning_rate": 2.021978021978022e-05, "loss": 0.6899605989456177, "step": 47 }, { "epoch": 0.15841584158415842, "grad_norm": 0.35784060194946976, "learning_rate": 2.0659340659340665e-05, "loss": 0.7242028713226318, "step": 48 }, { "epoch": 0.1617161716171617, "grad_norm": 0.3093966721093651, "learning_rate": 2.10989010989011e-05, "loss": 0.6203902959823608, "step": 49 }, { "epoch": 0.16501650165016502, "grad_norm": 0.4242705872636108, "learning_rate": 2.153846153846154e-05, "loss": 0.6420010328292847, "step": 50 }, { "epoch": 0.16831683168316833, "grad_norm": 0.35079960590346965, "learning_rate": 2.197802197802198e-05, "loss": 0.7517598867416382, "step": 51 }, { "epoch": 0.1716171617161716, "grad_norm": 0.3078803790362521, "learning_rate": 2.241758241758242e-05, "loss": 0.6568161249160767, "step": 52 }, { "epoch": 0.17491749174917492, "grad_norm": 0.34666662805484005, "learning_rate": 2.2857142857142858e-05, "loss": 0.7348504662513733, "step": 53 }, { "epoch": 0.1782178217821782, "grad_norm": 0.302791415801781, "learning_rate": 2.32967032967033e-05, "loss": 0.6164949536323547, "step": 54 }, { "epoch": 0.18151815181518152, "grad_norm": 0.33732756727763136, "learning_rate": 2.373626373626374e-05, "loss": 0.6505363583564758, "step": 55 }, { "epoch": 0.1848184818481848, "grad_norm": 0.34780152362496847, "learning_rate": 2.4175824175824177e-05, "loss": 0.7562520503997803, "step": 56 }, { "epoch": 0.18811881188118812, "grad_norm": 0.3310895358869482, "learning_rate": 2.461538461538462e-05, "loss": 0.6943148374557495, "step": 57 }, { "epoch": 0.19141914191419143, "grad_norm": 0.3367877938063833, "learning_rate": 2.5054945054945058e-05, "loss": 0.6571655869483948, "step": 58 }, { "epoch": 0.19471947194719472, "grad_norm": 0.32103256018771714, "learning_rate": 2.5494505494505493e-05, "loss": 0.7229321002960205, "step": 59 }, { "epoch": 0.19801980198019803, "grad_norm": 0.30468399230672144, "learning_rate": 2.593406593406594e-05, "loss": 0.6307672262191772, "step": 60 }, { "epoch": 0.20132013201320131, "grad_norm": 0.3282635121595526, "learning_rate": 2.6373626373626374e-05, "loss": 0.6336506009101868, "step": 61 }, { "epoch": 0.20462046204620463, "grad_norm": 0.3280360563022675, "learning_rate": 2.6813186813186813e-05, "loss": 0.6492213010787964, "step": 62 }, { "epoch": 0.2079207920792079, "grad_norm": 0.3292430577817229, "learning_rate": 2.7252747252747255e-05, "loss": 0.6763280034065247, "step": 63 }, { "epoch": 0.21122112211221122, "grad_norm": 0.47832355846700536, "learning_rate": 2.7692307692307694e-05, "loss": 0.7322396039962769, "step": 64 }, { "epoch": 0.2145214521452145, "grad_norm": 0.31915340164178446, "learning_rate": 2.8131868131868136e-05, "loss": 0.7080870270729065, "step": 65 }, { "epoch": 0.21782178217821782, "grad_norm": 0.3227571040968621, "learning_rate": 2.8571428571428574e-05, "loss": 0.6054466962814331, "step": 66 }, { "epoch": 0.22112211221122113, "grad_norm": 0.33375713186655664, "learning_rate": 2.9010989010989013e-05, "loss": 0.6782290935516357, "step": 67 }, { "epoch": 0.22442244224422442, "grad_norm": 0.3437770801965916, "learning_rate": 2.9450549450549455e-05, "loss": 0.6804753541946411, "step": 68 }, { "epoch": 0.22772277227722773, "grad_norm": 0.3228427319313703, "learning_rate": 2.9890109890109894e-05, "loss": 0.6493992805480957, "step": 69 }, { "epoch": 0.23102310231023102, "grad_norm": 0.3540211756840673, "learning_rate": 3.0329670329670332e-05, "loss": 0.6263789534568787, "step": 70 }, { "epoch": 0.23432343234323433, "grad_norm": 0.34989089824503405, "learning_rate": 3.0769230769230774e-05, "loss": 0.6960322856903076, "step": 71 }, { "epoch": 0.2376237623762376, "grad_norm": 0.33624443163866324, "learning_rate": 3.120879120879121e-05, "loss": 0.6146604418754578, "step": 72 }, { "epoch": 0.24092409240924093, "grad_norm": 0.39618402867027047, "learning_rate": 3.164835164835165e-05, "loss": 0.6361377239227295, "step": 73 }, { "epoch": 0.24422442244224424, "grad_norm": 0.361603087273114, "learning_rate": 3.2087912087912094e-05, "loss": 0.636134147644043, "step": 74 }, { "epoch": 0.24752475247524752, "grad_norm": 0.37985663132790304, "learning_rate": 3.252747252747253e-05, "loss": 0.5936564803123474, "step": 75 }, { "epoch": 0.2508250825082508, "grad_norm": 0.35883234873646996, "learning_rate": 3.296703296703297e-05, "loss": 0.6001103520393372, "step": 76 }, { "epoch": 0.25412541254125415, "grad_norm": 0.35227803701073973, "learning_rate": 3.340659340659341e-05, "loss": 0.6254594326019287, "step": 77 }, { "epoch": 0.25742574257425743, "grad_norm": 0.3563257650896171, "learning_rate": 3.384615384615385e-05, "loss": 0.6457959413528442, "step": 78 }, { "epoch": 0.2607260726072607, "grad_norm": 0.37234316340556584, "learning_rate": 3.4285714285714284e-05, "loss": 0.6186954975128174, "step": 79 }, { "epoch": 0.264026402640264, "grad_norm": 0.35352748449766547, "learning_rate": 3.4725274725274726e-05, "loss": 0.6175529956817627, "step": 80 }, { "epoch": 0.26732673267326734, "grad_norm": 0.35441369709658355, "learning_rate": 3.516483516483517e-05, "loss": 0.6694468259811401, "step": 81 }, { "epoch": 0.2706270627062706, "grad_norm": 0.39955400784840756, "learning_rate": 3.56043956043956e-05, "loss": 0.627490222454071, "step": 82 }, { "epoch": 0.2739273927392739, "grad_norm": 0.38314031523497477, "learning_rate": 3.6043956043956045e-05, "loss": 0.6410495638847351, "step": 83 }, { "epoch": 0.27722772277227725, "grad_norm": 0.36926215386141575, "learning_rate": 3.648351648351649e-05, "loss": 0.6305102109909058, "step": 84 }, { "epoch": 0.28052805280528054, "grad_norm": 0.38364118080284076, "learning_rate": 3.692307692307693e-05, "loss": 0.6558895111083984, "step": 85 }, { "epoch": 0.2838283828382838, "grad_norm": 0.3370292682974053, "learning_rate": 3.7362637362637365e-05, "loss": 0.6029388308525085, "step": 86 }, { "epoch": 0.2871287128712871, "grad_norm": 0.39541874871701704, "learning_rate": 3.7802197802197807e-05, "loss": 0.6551017761230469, "step": 87 }, { "epoch": 0.29042904290429045, "grad_norm": 0.3629036550044273, "learning_rate": 3.824175824175825e-05, "loss": 0.6588809490203857, "step": 88 }, { "epoch": 0.29372937293729373, "grad_norm": 0.37786447228212183, "learning_rate": 3.8681318681318684e-05, "loss": 0.614648699760437, "step": 89 }, { "epoch": 0.297029702970297, "grad_norm": 0.42911861803278684, "learning_rate": 3.9120879120879126e-05, "loss": 0.7034356594085693, "step": 90 }, { "epoch": 0.30033003300330036, "grad_norm": 0.3707184094312094, "learning_rate": 3.956043956043957e-05, "loss": 0.6908263564109802, "step": 91 }, { "epoch": 0.30363036303630364, "grad_norm": 0.38262186656216063, "learning_rate": 4e-05, "loss": 0.6882215738296509, "step": 92 }, { "epoch": 0.3069306930693069, "grad_norm": 0.3709464296309744, "learning_rate": 3.999985249980169e-05, "loss": 0.6377270221710205, "step": 93 }, { "epoch": 0.3102310231023102, "grad_norm": 0.3412837406106036, "learning_rate": 3.999941000138238e-05, "loss": 0.6735270619392395, "step": 94 }, { "epoch": 0.31353135313531355, "grad_norm": 0.40165192879996064, "learning_rate": 3.999867251126893e-05, "loss": 0.6934541463851929, "step": 95 }, { "epoch": 0.31683168316831684, "grad_norm": 0.34707128601816045, "learning_rate": 3.9997640040339335e-05, "loss": 0.6367039084434509, "step": 96 }, { "epoch": 0.3201320132013201, "grad_norm": 0.4268828113970776, "learning_rate": 3.999631260382257e-05, "loss": 0.6274522542953491, "step": 97 }, { "epoch": 0.3234323432343234, "grad_norm": 0.454428833020686, "learning_rate": 3.999469022129834e-05, "loss": 0.5874066352844238, "step": 98 }, { "epoch": 0.32673267326732675, "grad_norm": 0.4200675840489775, "learning_rate": 3.9992772916696824e-05, "loss": 0.6175942420959473, "step": 99 }, { "epoch": 0.33003300330033003, "grad_norm": 0.3796321080056305, "learning_rate": 3.99905607182983e-05, "loss": 0.5625832080841064, "step": 100 }, { "epoch": 0.3333333333333333, "grad_norm": 0.39108856096759403, "learning_rate": 3.998805365873274e-05, "loss": 0.6153020262718201, "step": 101 }, { "epoch": 0.33663366336633666, "grad_norm": 0.3873560194436071, "learning_rate": 3.998525177497932e-05, "loss": 0.5585426092147827, "step": 102 }, { "epoch": 0.33993399339933994, "grad_norm": 0.4084712106325698, "learning_rate": 3.998215510836589e-05, "loss": 0.6586359739303589, "step": 103 }, { "epoch": 0.3432343234323432, "grad_norm": 0.4383246876899704, "learning_rate": 3.997876370456833e-05, "loss": 0.62096107006073, "step": 104 }, { "epoch": 0.3465346534653465, "grad_norm": 0.4026893562706946, "learning_rate": 3.997507761360993e-05, "loss": 0.6059336066246033, "step": 105 }, { "epoch": 0.34983498349834985, "grad_norm": 0.46586240044914223, "learning_rate": 3.997109688986059e-05, "loss": 0.617970883846283, "step": 106 }, { "epoch": 0.35313531353135313, "grad_norm": 0.44949199032710474, "learning_rate": 3.9966821592036066e-05, "loss": 0.6453397274017334, "step": 107 }, { "epoch": 0.3564356435643564, "grad_norm": 0.4794978158156406, "learning_rate": 3.996225178319709e-05, "loss": 0.6371763348579407, "step": 108 }, { "epoch": 0.35973597359735976, "grad_norm": 0.4463512391721941, "learning_rate": 3.9957387530748435e-05, "loss": 0.5971124172210693, "step": 109 }, { "epoch": 0.36303630363036304, "grad_norm": 0.368079413354641, "learning_rate": 3.995222890643792e-05, "loss": 0.5679532289505005, "step": 110 }, { "epoch": 0.36633663366336633, "grad_norm": 0.43733705586285254, "learning_rate": 3.9946775986355346e-05, "loss": 0.5988069772720337, "step": 111 }, { "epoch": 0.3696369636963696, "grad_norm": 0.38235582844960775, "learning_rate": 3.994102885093141e-05, "loss": 0.6352983713150024, "step": 112 }, { "epoch": 0.37293729372937295, "grad_norm": 0.389837871286893, "learning_rate": 3.993498758493646e-05, "loss": 0.58957839012146, "step": 113 }, { "epoch": 0.37623762376237624, "grad_norm": 0.40399856168911097, "learning_rate": 3.992865227747929e-05, "loss": 0.6396822929382324, "step": 114 }, { "epoch": 0.3795379537953795, "grad_norm": 0.38891668976227123, "learning_rate": 3.992202302200582e-05, "loss": 0.6314754486083984, "step": 115 }, { "epoch": 0.38283828382838286, "grad_norm": 0.4087528543828922, "learning_rate": 3.991509991629769e-05, "loss": 0.673650860786438, "step": 116 }, { "epoch": 0.38613861386138615, "grad_norm": 0.36330054292020786, "learning_rate": 3.990788306247085e-05, "loss": 0.5813701152801514, "step": 117 }, { "epoch": 0.38943894389438943, "grad_norm": 0.4247110332678589, "learning_rate": 3.990037256697404e-05, "loss": 0.6419334411621094, "step": 118 }, { "epoch": 0.3927392739273927, "grad_norm": 0.4244126002071751, "learning_rate": 3.989256854058721e-05, "loss": 0.6319208145141602, "step": 119 }, { "epoch": 0.39603960396039606, "grad_norm": 0.3651632933942853, "learning_rate": 3.988447109841991e-05, "loss": 0.5989845991134644, "step": 120 }, { "epoch": 0.39933993399339934, "grad_norm": 0.393158353074077, "learning_rate": 3.987608035990957e-05, "loss": 0.5853303670883179, "step": 121 }, { "epoch": 0.40264026402640263, "grad_norm": 0.35965233332276103, "learning_rate": 3.986739644881975e-05, "loss": 0.6115257143974304, "step": 122 }, { "epoch": 0.40594059405940597, "grad_norm": 0.4252711474203845, "learning_rate": 3.985841949323831e-05, "loss": 0.6440504789352417, "step": 123 }, { "epoch": 0.40924092409240925, "grad_norm": 0.5578797297271848, "learning_rate": 3.984914962557553e-05, "loss": 0.5765030384063721, "step": 124 }, { "epoch": 0.41254125412541254, "grad_norm": 0.4362455029468141, "learning_rate": 3.983958698256214e-05, "loss": 0.6387556791305542, "step": 125 }, { "epoch": 0.4158415841584158, "grad_norm": 0.39274811063076087, "learning_rate": 3.98297317052473e-05, "loss": 0.6263147592544556, "step": 126 }, { "epoch": 0.41914191419141916, "grad_norm": 0.42682589637163704, "learning_rate": 3.981958393899656e-05, "loss": 0.6091845035552979, "step": 127 }, { "epoch": 0.42244224422442245, "grad_norm": 0.4033131171538041, "learning_rate": 3.980914383348967e-05, "loss": 0.6458015441894531, "step": 128 }, { "epoch": 0.42574257425742573, "grad_norm": 0.3881606915462862, "learning_rate": 3.9798411542718395e-05, "loss": 0.6115552186965942, "step": 129 }, { "epoch": 0.429042904290429, "grad_norm": 0.38910317938225847, "learning_rate": 3.978738722498423e-05, "loss": 0.6427993774414062, "step": 130 }, { "epoch": 0.43234323432343236, "grad_norm": 0.36836380096259913, "learning_rate": 3.977607104289609e-05, "loss": 0.6121467351913452, "step": 131 }, { "epoch": 0.43564356435643564, "grad_norm": 0.3743062201629088, "learning_rate": 3.9764463163367875e-05, "loss": 0.5951442718505859, "step": 132 }, { "epoch": 0.4389438943894389, "grad_norm": 0.3699746655092952, "learning_rate": 3.9752563757616045e-05, "loss": 0.6639472842216492, "step": 133 }, { "epoch": 0.44224422442244227, "grad_norm": 0.37398919831188604, "learning_rate": 3.974037300115706e-05, "loss": 0.6084764003753662, "step": 134 }, { "epoch": 0.44554455445544555, "grad_norm": 0.37043195153646374, "learning_rate": 3.972789107380484e-05, "loss": 0.6211085915565491, "step": 135 }, { "epoch": 0.44884488448844884, "grad_norm": 0.3509837417375981, "learning_rate": 3.9715118159668046e-05, "loss": 0.6098147034645081, "step": 136 }, { "epoch": 0.4521452145214521, "grad_norm": 0.3350785925775803, "learning_rate": 3.970205444714742e-05, "loss": 0.6155884861946106, "step": 137 }, { "epoch": 0.45544554455445546, "grad_norm": 0.38529379761335925, "learning_rate": 3.9688700128932975e-05, "loss": 0.5984665155410767, "step": 138 }, { "epoch": 0.45874587458745875, "grad_norm": 0.45130397769476205, "learning_rate": 3.967505540200117e-05, "loss": 0.6656880378723145, "step": 139 }, { "epoch": 0.46204620462046203, "grad_norm": 0.3277874952439621, "learning_rate": 3.966112046761201e-05, "loss": 0.6607398390769958, "step": 140 }, { "epoch": 0.46534653465346537, "grad_norm": 2.6727599644732267, "learning_rate": 3.9646895531306046e-05, "loss": 0.6578342914581299, "step": 141 }, { "epoch": 0.46864686468646866, "grad_norm": 0.47429126269764676, "learning_rate": 3.963238080290136e-05, "loss": 0.6103699803352356, "step": 142 }, { "epoch": 0.47194719471947194, "grad_norm": 0.32652590291724093, "learning_rate": 3.96175764964905e-05, "loss": 0.5484676957130432, "step": 143 }, { "epoch": 0.4752475247524752, "grad_norm": 0.4531372955951849, "learning_rate": 3.960248283043727e-05, "loss": 0.578776478767395, "step": 144 }, { "epoch": 0.47854785478547857, "grad_norm": 0.3685580706465372, "learning_rate": 3.958710002737355e-05, "loss": 0.6184446811676025, "step": 145 }, { "epoch": 0.48184818481848185, "grad_norm": 0.3584005630962511, "learning_rate": 3.9571428314195984e-05, "loss": 0.6307916045188904, "step": 146 }, { "epoch": 0.48514851485148514, "grad_norm": 0.4049679254542765, "learning_rate": 3.955546792206265e-05, "loss": 0.6064697504043579, "step": 147 }, { "epoch": 0.4884488448844885, "grad_norm": 0.3846258995775384, "learning_rate": 3.953921908638966e-05, "loss": 0.6055655479431152, "step": 148 }, { "epoch": 0.49174917491749176, "grad_norm": 0.3643318343315678, "learning_rate": 3.952268204684765e-05, "loss": 0.5856431126594543, "step": 149 }, { "epoch": 0.49504950495049505, "grad_norm": 0.3854715521866927, "learning_rate": 3.950585704735829e-05, "loss": 0.6634635925292969, "step": 150 }, { "epoch": 0.49834983498349833, "grad_norm": 0.34338835592304534, "learning_rate": 3.948874433609065e-05, "loss": 0.5880753397941589, "step": 151 }, { "epoch": 0.5016501650165016, "grad_norm": 0.3481018111538647, "learning_rate": 3.947134416545757e-05, "loss": 0.5594221949577332, "step": 152 }, { "epoch": 0.504950495049505, "grad_norm": 0.6570220882473125, "learning_rate": 3.94536567921119e-05, "loss": 0.664652407169342, "step": 153 }, { "epoch": 0.5082508250825083, "grad_norm": 0.340048306266198, "learning_rate": 3.9435682476942755e-05, "loss": 0.6002815961837769, "step": 154 }, { "epoch": 0.5115511551155115, "grad_norm": 0.3488682381523364, "learning_rate": 3.941742148507163e-05, "loss": 0.5905177593231201, "step": 155 }, { "epoch": 0.5148514851485149, "grad_norm": 0.33062666453941425, "learning_rate": 3.939887408584853e-05, "loss": 0.5636795163154602, "step": 156 }, { "epoch": 0.5181518151815182, "grad_norm": 0.35862086331061066, "learning_rate": 3.938004055284796e-05, "loss": 0.5639582276344299, "step": 157 }, { "epoch": 0.5214521452145214, "grad_norm": 0.31769111173717246, "learning_rate": 3.9360921163864895e-05, "loss": 0.6515591144561768, "step": 158 }, { "epoch": 0.5247524752475248, "grad_norm": 0.38401455820073427, "learning_rate": 3.934151620091071e-05, "loss": 0.5721683502197266, "step": 159 }, { "epoch": 0.528052805280528, "grad_norm": 0.3284331200684813, "learning_rate": 3.9321825950209e-05, "loss": 0.5801802277565002, "step": 160 }, { "epoch": 0.5313531353135313, "grad_norm": 0.3493998878359796, "learning_rate": 3.9301850702191344e-05, "loss": 0.603084921836853, "step": 161 }, { "epoch": 0.5346534653465347, "grad_norm": 0.32233519110844616, "learning_rate": 3.928159075149304e-05, "loss": 0.6376925110816956, "step": 162 }, { "epoch": 0.5379537953795379, "grad_norm": 0.35833134197704153, "learning_rate": 3.926104639694877e-05, "loss": 0.5764102935791016, "step": 163 }, { "epoch": 0.5412541254125413, "grad_norm": 0.3523567199445224, "learning_rate": 3.924021794158818e-05, "loss": 0.6102188229560852, "step": 164 }, { "epoch": 0.5445544554455446, "grad_norm": 0.36694222553878597, "learning_rate": 3.921910569263139e-05, "loss": 0.5833287835121155, "step": 165 }, { "epoch": 0.5478547854785478, "grad_norm": 0.37179813198977807, "learning_rate": 3.919770996148448e-05, "loss": 0.5891385078430176, "step": 166 }, { "epoch": 0.5511551155115512, "grad_norm": 0.3507301680001106, "learning_rate": 3.917603106373493e-05, "loss": 0.5838547348976135, "step": 167 }, { "epoch": 0.5544554455445545, "grad_norm": 0.3134001311174479, "learning_rate": 3.9154069319146904e-05, "loss": 0.5727800726890564, "step": 168 }, { "epoch": 0.5577557755775577, "grad_norm": 0.33531781904204605, "learning_rate": 3.913182505165656e-05, "loss": 0.6102641224861145, "step": 169 }, { "epoch": 0.5610561056105611, "grad_norm": 0.35178976522027133, "learning_rate": 3.91092985893673e-05, "loss": 0.5718260407447815, "step": 170 }, { "epoch": 0.5643564356435643, "grad_norm": 0.47006108726602863, "learning_rate": 3.908649026454488e-05, "loss": 0.6308504939079285, "step": 171 }, { "epoch": 0.5676567656765676, "grad_norm": 0.3687514240026255, "learning_rate": 3.906340041361255e-05, "loss": 0.6089432835578918, "step": 172 }, { "epoch": 0.570957095709571, "grad_norm": 0.3586674884704593, "learning_rate": 3.904002937714606e-05, "loss": 0.6583501696586609, "step": 173 }, { "epoch": 0.5742574257425742, "grad_norm": 0.3399808047240735, "learning_rate": 3.9016377499868666e-05, "loss": 0.6108609437942505, "step": 174 }, { "epoch": 0.5775577557755776, "grad_norm": 0.3840880337988826, "learning_rate": 3.899244513064603e-05, "loss": 0.63509202003479, "step": 175 }, { "epoch": 0.5808580858085809, "grad_norm": 0.3725541644477348, "learning_rate": 3.896823262248107e-05, "loss": 0.5759241580963135, "step": 176 }, { "epoch": 0.5841584158415841, "grad_norm": 0.30755721985114126, "learning_rate": 3.8943740332508754e-05, "loss": 0.6148169040679932, "step": 177 }, { "epoch": 0.5874587458745875, "grad_norm": 0.3916756097057637, "learning_rate": 3.891896862199086e-05, "loss": 0.5266364216804504, "step": 178 }, { "epoch": 0.5907590759075908, "grad_norm": 0.3417854779376455, "learning_rate": 3.88939178563106e-05, "loss": 0.5626640319824219, "step": 179 }, { "epoch": 0.594059405940594, "grad_norm": 0.33526488525207704, "learning_rate": 3.886858840496727e-05, "loss": 0.6063880920410156, "step": 180 }, { "epoch": 0.5973597359735974, "grad_norm": 0.37344333250119977, "learning_rate": 3.884298064157077e-05, "loss": 0.5979235768318176, "step": 181 }, { "epoch": 0.6006600660066007, "grad_norm": 0.3835133271197793, "learning_rate": 3.881709494383612e-05, "loss": 0.6628611087799072, "step": 182 }, { "epoch": 0.6039603960396039, "grad_norm": 0.4344526004756121, "learning_rate": 3.879093169357789e-05, "loss": 0.6215270757675171, "step": 183 }, { "epoch": 0.6072607260726073, "grad_norm": 0.3644174435488244, "learning_rate": 3.876449127670452e-05, "loss": 0.6148592233657837, "step": 184 }, { "epoch": 0.6105610561056105, "grad_norm": 0.3619226265536735, "learning_rate": 3.87377740832127e-05, "loss": 0.6254778504371643, "step": 185 }, { "epoch": 0.6138613861386139, "grad_norm": 0.3492162593840536, "learning_rate": 3.871078050718155e-05, "loss": 0.6025378704071045, "step": 186 }, { "epoch": 0.6171617161716172, "grad_norm": 0.3866924759539626, "learning_rate": 3.8683510946766866e-05, "loss": 0.5887518525123596, "step": 187 }, { "epoch": 0.6204620462046204, "grad_norm": 0.3357229513721586, "learning_rate": 3.865596580419519e-05, "loss": 0.6180317401885986, "step": 188 }, { "epoch": 0.6237623762376238, "grad_norm": 0.3594949077768003, "learning_rate": 3.8628145485757925e-05, "loss": 0.5970651507377625, "step": 189 }, { "epoch": 0.6270627062706271, "grad_norm": 0.3496234009951303, "learning_rate": 3.860005040180533e-05, "loss": 0.6027296781539917, "step": 190 }, { "epoch": 0.6303630363036303, "grad_norm": 0.3830042583584045, "learning_rate": 3.857168096674044e-05, "loss": 0.6326305270195007, "step": 191 }, { "epoch": 0.6336633663366337, "grad_norm": 0.333508477943962, "learning_rate": 3.854303759901299e-05, "loss": 0.6508482694625854, "step": 192 }, { "epoch": 0.636963696369637, "grad_norm": 0.352327105927571, "learning_rate": 3.851412072111322e-05, "loss": 0.6088548302650452, "step": 193 }, { "epoch": 0.6402640264026402, "grad_norm": 0.36196379228138037, "learning_rate": 3.8484930759565645e-05, "loss": 0.5975607633590698, "step": 194 }, { "epoch": 0.6435643564356436, "grad_norm": 0.3231664855297077, "learning_rate": 3.845546814492279e-05, "loss": 0.5467930436134338, "step": 195 }, { "epoch": 0.6468646864686468, "grad_norm": 0.35556526722817444, "learning_rate": 3.8425733311758795e-05, "loss": 0.583969235420227, "step": 196 }, { "epoch": 0.6501650165016502, "grad_norm": 0.331073543443887, "learning_rate": 3.8395726698663045e-05, "loss": 0.6007376909255981, "step": 197 }, { "epoch": 0.6534653465346535, "grad_norm": 0.34786293006180385, "learning_rate": 3.836544874823368e-05, "loss": 0.5971908569335938, "step": 198 }, { "epoch": 0.6567656765676567, "grad_norm": 0.3128647628132879, "learning_rate": 3.8334899907071064e-05, "loss": 0.592069685459137, "step": 199 }, { "epoch": 0.6600660066006601, "grad_norm": 0.3308125796746202, "learning_rate": 3.830408062577121e-05, "loss": 0.6188071966171265, "step": 200 }, { "epoch": 0.6633663366336634, "grad_norm": 0.34889077565364124, "learning_rate": 3.827299135891913e-05, "loss": 0.5976923704147339, "step": 201 }, { "epoch": 0.6666666666666666, "grad_norm": 0.33443153994631497, "learning_rate": 3.8241632565082124e-05, "loss": 0.6120954155921936, "step": 202 }, { "epoch": 0.66996699669967, "grad_norm": 0.3573334503206899, "learning_rate": 3.821000470680303e-05, "loss": 0.6661979556083679, "step": 203 }, { "epoch": 0.6732673267326733, "grad_norm": 0.34662331225184934, "learning_rate": 3.8178108250593384e-05, "loss": 0.5853559970855713, "step": 204 }, { "epoch": 0.6765676567656765, "grad_norm": 0.33823171869993424, "learning_rate": 3.814594366692654e-05, "loss": 0.6648768186569214, "step": 205 }, { "epoch": 0.6798679867986799, "grad_norm": 0.4178878629038068, "learning_rate": 3.8113511430230745e-05, "loss": 0.5893838405609131, "step": 206 }, { "epoch": 0.6831683168316832, "grad_norm": 0.36858896529016355, "learning_rate": 3.808081201888214e-05, "loss": 0.6177140474319458, "step": 207 }, { "epoch": 0.6864686468646864, "grad_norm": 0.38061402245158527, "learning_rate": 3.8047845915197695e-05, "loss": 0.5793695449829102, "step": 208 }, { "epoch": 0.6897689768976898, "grad_norm": 0.3591315376932048, "learning_rate": 3.8014613605428084e-05, "loss": 0.5571605563163757, "step": 209 }, { "epoch": 0.693069306930693, "grad_norm": 0.33319862057164595, "learning_rate": 3.798111557975053e-05, "loss": 0.5945760011672974, "step": 210 }, { "epoch": 0.6963696369636964, "grad_norm": 0.3495679574237745, "learning_rate": 3.7947352332261586e-05, "loss": 0.600873589515686, "step": 211 }, { "epoch": 0.6996699669966997, "grad_norm": 0.37390147639764304, "learning_rate": 3.791332436096983e-05, "loss": 0.6234852075576782, "step": 212 }, { "epoch": 0.7029702970297029, "grad_norm": 0.3571653694610809, "learning_rate": 3.7879032167788494e-05, "loss": 0.6129578948020935, "step": 213 }, { "epoch": 0.7062706270627063, "grad_norm": 0.48971881906384135, "learning_rate": 3.784447625852812e-05, "loss": 0.6204475164413452, "step": 214 }, { "epoch": 0.7095709570957096, "grad_norm": 0.3610294548812676, "learning_rate": 3.780965714288905e-05, "loss": 0.6734122037887573, "step": 215 }, { "epoch": 0.7128712871287128, "grad_norm": 0.35396639697907356, "learning_rate": 3.777457533445393e-05, "loss": 0.5678560137748718, "step": 216 }, { "epoch": 0.7161716171617162, "grad_norm": 0.3232076597831296, "learning_rate": 3.7739231350680135e-05, "loss": 0.5784683227539062, "step": 217 }, { "epoch": 0.7194719471947195, "grad_norm": 0.3540897506756201, "learning_rate": 3.7703625712892125e-05, "loss": 0.6060354113578796, "step": 218 }, { "epoch": 0.7227722772277227, "grad_norm": 0.35008278157890194, "learning_rate": 3.766775894627376e-05, "loss": 0.6248741745948792, "step": 219 }, { "epoch": 0.7260726072607261, "grad_norm": 0.32018676747331787, "learning_rate": 3.7631631579860553e-05, "loss": 0.6014479398727417, "step": 220 }, { "epoch": 0.7293729372937293, "grad_norm": 0.32068744744726313, "learning_rate": 3.759524414653189e-05, "loss": 0.6283233761787415, "step": 221 }, { "epoch": 0.7326732673267327, "grad_norm": 0.3047460979670785, "learning_rate": 3.755859718300313e-05, "loss": 0.5710185766220093, "step": 222 }, { "epoch": 0.735973597359736, "grad_norm": 0.34698489216212486, "learning_rate": 3.75216912298177e-05, "loss": 0.6007407903671265, "step": 223 }, { "epoch": 0.7392739273927392, "grad_norm": 0.4952362221345831, "learning_rate": 3.748452683133916e-05, "loss": 0.6852575540542603, "step": 224 }, { "epoch": 0.7425742574257426, "grad_norm": 0.32106680253004655, "learning_rate": 3.7447104535743115e-05, "loss": 0.6270833611488342, "step": 225 }, { "epoch": 0.7458745874587459, "grad_norm": 0.30214814189665545, "learning_rate": 3.740942489500916e-05, "loss": 0.5925471782684326, "step": 226 }, { "epoch": 0.7491749174917491, "grad_norm": 0.3171932777170319, "learning_rate": 3.737148846491275e-05, "loss": 0.573570728302002, "step": 227 }, { "epoch": 0.7524752475247525, "grad_norm": 0.31480815810804524, "learning_rate": 3.7333295805016986e-05, "loss": 0.6088368892669678, "step": 228 }, { "epoch": 0.7557755775577558, "grad_norm": 0.3103068539492526, "learning_rate": 3.729484747866435e-05, "loss": 0.5496470332145691, "step": 229 }, { "epoch": 0.759075907590759, "grad_norm": 0.3007603199811456, "learning_rate": 3.725614405296843e-05, "loss": 0.6008220314979553, "step": 230 }, { "epoch": 0.7623762376237624, "grad_norm": 0.3007492168191884, "learning_rate": 3.721718609880551e-05, "loss": 0.5982120037078857, "step": 231 }, { "epoch": 0.7656765676567657, "grad_norm": 0.3010002181490163, "learning_rate": 3.717797419080618e-05, "loss": 0.6404559016227722, "step": 232 }, { "epoch": 0.768976897689769, "grad_norm": 0.35604106645956024, "learning_rate": 3.713850890734689e-05, "loss": 0.5875239372253418, "step": 233 }, { "epoch": 0.7722772277227723, "grad_norm": 0.33191901009333297, "learning_rate": 3.709879083054133e-05, "loss": 0.5962772369384766, "step": 234 }, { "epoch": 0.7755775577557755, "grad_norm": 0.29418628627284477, "learning_rate": 3.705882054623192e-05, "loss": 0.5764110684394836, "step": 235 }, { "epoch": 0.7788778877887789, "grad_norm": 0.30409612807603364, "learning_rate": 3.7018598643981165e-05, "loss": 0.5635858178138733, "step": 236 }, { "epoch": 0.7821782178217822, "grad_norm": 0.3039645238556037, "learning_rate": 3.69781257170629e-05, "loss": 0.5880881547927856, "step": 237 }, { "epoch": 0.7854785478547854, "grad_norm": 0.30606246597511416, "learning_rate": 3.6937402362453606e-05, "loss": 0.5644733905792236, "step": 238 }, { "epoch": 0.7887788778877888, "grad_norm": 0.328325214152846, "learning_rate": 3.689642918082358e-05, "loss": 0.6431151032447815, "step": 239 }, { "epoch": 0.7920792079207921, "grad_norm": 0.2863869456911102, "learning_rate": 3.6855206776528055e-05, "loss": 0.5848085880279541, "step": 240 }, { "epoch": 0.7953795379537953, "grad_norm": 0.3169795193025283, "learning_rate": 3.681373575759831e-05, "loss": 0.590021550655365, "step": 241 }, { "epoch": 0.7986798679867987, "grad_norm": 0.3630216059086489, "learning_rate": 3.67720167357327e-05, "loss": 0.6217919588088989, "step": 242 }, { "epoch": 0.801980198019802, "grad_norm": 0.2999270957223198, "learning_rate": 3.673005032628763e-05, "loss": 0.6075180172920227, "step": 243 }, { "epoch": 0.8052805280528053, "grad_norm": 0.35145967135780704, "learning_rate": 3.668783714826846e-05, "loss": 0.6078404188156128, "step": 244 }, { "epoch": 0.8085808580858086, "grad_norm": 0.32650805345047657, "learning_rate": 3.664537782432042e-05, "loss": 0.6297526955604553, "step": 245 }, { "epoch": 0.8118811881188119, "grad_norm": 0.32461322862254094, "learning_rate": 3.660267298071936e-05, "loss": 0.5684514045715332, "step": 246 }, { "epoch": 0.8151815181518152, "grad_norm": 0.32171296221654416, "learning_rate": 3.655972324736259e-05, "loss": 0.6192148327827454, "step": 247 }, { "epoch": 0.8184818481848185, "grad_norm": 0.3322336621503604, "learning_rate": 3.6516529257759506e-05, "loss": 0.5900243520736694, "step": 248 }, { "epoch": 0.8217821782178217, "grad_norm": 0.35183312055445004, "learning_rate": 3.6473091649022337e-05, "loss": 0.5941751599311829, "step": 249 }, { "epoch": 0.8250825082508251, "grad_norm": 0.31255833045908565, "learning_rate": 3.6429411061856645e-05, "loss": 0.5744310021400452, "step": 250 }, { "epoch": 0.8283828382838284, "grad_norm": 0.3266269251233177, "learning_rate": 3.6385488140551985e-05, "loss": 0.5985124707221985, "step": 251 }, { "epoch": 0.8316831683168316, "grad_norm": 0.30426711611593643, "learning_rate": 3.6341323532972294e-05, "loss": 0.581912636756897, "step": 252 }, { "epoch": 0.834983498349835, "grad_norm": 0.3297819735063718, "learning_rate": 3.629691789054643e-05, "loss": 0.586786150932312, "step": 253 }, { "epoch": 0.8382838283828383, "grad_norm": 0.3074133078124695, "learning_rate": 3.625227186825848e-05, "loss": 0.6312603950500488, "step": 254 }, { "epoch": 0.8415841584158416, "grad_norm": 0.33007753969064285, "learning_rate": 3.620738612463818e-05, "loss": 0.5886626243591309, "step": 255 }, { "epoch": 0.8448844884488449, "grad_norm": 0.31334340596765187, "learning_rate": 3.6162261321751114e-05, "loss": 0.5892266035079956, "step": 256 }, { "epoch": 0.8481848184818482, "grad_norm": 0.31784442826893616, "learning_rate": 3.6116898125189045e-05, "loss": 0.5472115278244019, "step": 257 }, { "epoch": 0.8514851485148515, "grad_norm": 0.3456330158902343, "learning_rate": 3.6071297204059995e-05, "loss": 0.5981796383857727, "step": 258 }, { "epoch": 0.8547854785478548, "grad_norm": 0.3377124553034101, "learning_rate": 3.6025459230978475e-05, "loss": 0.6708342432975769, "step": 259 }, { "epoch": 0.858085808580858, "grad_norm": 0.3081391395426973, "learning_rate": 3.597938488205549e-05, "loss": 0.6306079626083374, "step": 260 }, { "epoch": 0.8613861386138614, "grad_norm": 0.3398583824115319, "learning_rate": 3.59330748368886e-05, "loss": 0.6098329424858093, "step": 261 }, { "epoch": 0.8646864686468647, "grad_norm": 0.32878067719138626, "learning_rate": 3.588652977855189e-05, "loss": 0.5617724061012268, "step": 262 }, { "epoch": 0.8679867986798679, "grad_norm": 0.34962664282188816, "learning_rate": 3.58397503935859e-05, "loss": 0.5780894756317139, "step": 263 }, { "epoch": 0.8712871287128713, "grad_norm": 0.32665214019362204, "learning_rate": 3.5792737371987477e-05, "loss": 0.578921377658844, "step": 264 }, { "epoch": 0.8745874587458746, "grad_norm": 0.36673188949709323, "learning_rate": 3.574549140719962e-05, "loss": 0.614944577217102, "step": 265 }, { "epoch": 0.8778877887788779, "grad_norm": 0.3248666143164946, "learning_rate": 3.569801319610125e-05, "loss": 0.6269869208335876, "step": 266 }, { "epoch": 0.8811881188118812, "grad_norm": 0.3338123662452596, "learning_rate": 3.565030343899693e-05, "loss": 0.6045581102371216, "step": 267 }, { "epoch": 0.8844884488448845, "grad_norm": 0.31011600887091817, "learning_rate": 3.5602362839606514e-05, "loss": 0.5872907638549805, "step": 268 }, { "epoch": 0.8877887788778878, "grad_norm": 0.31857062779594814, "learning_rate": 3.55541921050548e-05, "loss": 0.6283375024795532, "step": 269 }, { "epoch": 0.8910891089108911, "grad_norm": 0.32445751859048455, "learning_rate": 3.5505791945861076e-05, "loss": 0.5747002363204956, "step": 270 }, { "epoch": 0.8943894389438944, "grad_norm": 0.2923309334474062, "learning_rate": 3.545716307592864e-05, "loss": 0.6205827593803406, "step": 271 }, { "epoch": 0.8976897689768977, "grad_norm": 0.43972579907455317, "learning_rate": 3.54083062125343e-05, "loss": 0.5987251400947571, "step": 272 }, { "epoch": 0.900990099009901, "grad_norm": 0.33194286352506225, "learning_rate": 3.535922207631776e-05, "loss": 0.6275356411933899, "step": 273 }, { "epoch": 0.9042904290429042, "grad_norm": 0.3408278730793354, "learning_rate": 3.5309911391270996e-05, "loss": 0.6097655892372131, "step": 274 }, { "epoch": 0.9075907590759076, "grad_norm": 0.3441995699777348, "learning_rate": 3.52603748847276e-05, "loss": 0.544170618057251, "step": 275 }, { "epoch": 0.9108910891089109, "grad_norm": 0.3034867763949278, "learning_rate": 3.521061328735202e-05, "loss": 0.5723366141319275, "step": 276 }, { "epoch": 0.9141914191419142, "grad_norm": 0.3091145609625042, "learning_rate": 3.516062733312879e-05, "loss": 0.5801889896392822, "step": 277 }, { "epoch": 0.9174917491749175, "grad_norm": 0.3532845546992122, "learning_rate": 3.511041775935175e-05, "loss": 0.5942766666412354, "step": 278 }, { "epoch": 0.9207920792079208, "grad_norm": 0.3192035342587887, "learning_rate": 3.50599853066131e-05, "loss": 0.5604017972946167, "step": 279 }, { "epoch": 0.9240924092409241, "grad_norm": 0.4475571406552253, "learning_rate": 3.500933071879251e-05, "loss": 0.6151460409164429, "step": 280 }, { "epoch": 0.9273927392739274, "grad_norm": 0.30946498453996385, "learning_rate": 3.495845474304616e-05, "loss": 0.5854936838150024, "step": 281 }, { "epoch": 0.9306930693069307, "grad_norm": 0.3188531409769719, "learning_rate": 3.490735812979572e-05, "loss": 0.5586672425270081, "step": 282 }, { "epoch": 0.933993399339934, "grad_norm": 0.3250546549981712, "learning_rate": 3.485604163271721e-05, "loss": 0.578475832939148, "step": 283 }, { "epoch": 0.9372937293729373, "grad_norm": 0.45030229248281484, "learning_rate": 3.4804506008730015e-05, "loss": 0.5236382484436035, "step": 284 }, { "epoch": 0.9405940594059405, "grad_norm": 0.31677157675280776, "learning_rate": 3.475275201798559e-05, "loss": 0.5964822769165039, "step": 285 }, { "epoch": 0.9438943894389439, "grad_norm": 0.3221519247617692, "learning_rate": 3.4700780423856334e-05, "loss": 0.5551598072052002, "step": 286 }, { "epoch": 0.9471947194719472, "grad_norm": 0.31322506983838, "learning_rate": 3.464859199292429e-05, "loss": 0.6095103621482849, "step": 287 }, { "epoch": 0.9504950495049505, "grad_norm": 0.33333701342858213, "learning_rate": 3.4596187494969846e-05, "loss": 0.5893416404724121, "step": 288 }, { "epoch": 0.9537953795379538, "grad_norm": 0.31167002926986764, "learning_rate": 3.454356770296039e-05, "loss": 0.5992231965065002, "step": 289 }, { "epoch": 0.9570957095709571, "grad_norm": 0.3407826991036566, "learning_rate": 3.4490733393038895e-05, "loss": 0.6071972250938416, "step": 290 }, { "epoch": 0.9603960396039604, "grad_norm": 0.321397588262469, "learning_rate": 3.443768534451248e-05, "loss": 0.5836942195892334, "step": 291 }, { "epoch": 0.9636963696369637, "grad_norm": 0.3596023570145339, "learning_rate": 3.4384424339840916e-05, "loss": 0.5707553625106812, "step": 292 }, { "epoch": 0.966996699669967, "grad_norm": 0.326365753033755, "learning_rate": 3.4330951164625075e-05, "loss": 0.5883970260620117, "step": 293 }, { "epoch": 0.9702970297029703, "grad_norm": 0.3276030981345682, "learning_rate": 3.427726660759535e-05, "loss": 0.6281589269638062, "step": 294 }, { "epoch": 0.9735973597359736, "grad_norm": 0.3559560269123216, "learning_rate": 3.422337146060003e-05, "loss": 0.6641702651977539, "step": 295 }, { "epoch": 0.976897689768977, "grad_norm": 0.34661891319338206, "learning_rate": 3.4169266518593596e-05, "loss": 0.6398966312408447, "step": 296 }, { "epoch": 0.9801980198019802, "grad_norm": 0.3392015122860613, "learning_rate": 3.411495257962501e-05, "loss": 0.6376276016235352, "step": 297 }, { "epoch": 0.9834983498349835, "grad_norm": 0.3454832175281825, "learning_rate": 3.406043044482596e-05, "loss": 0.648975133895874, "step": 298 }, { "epoch": 0.9867986798679867, "grad_norm": 0.3284679145456545, "learning_rate": 3.4005700918399016e-05, "loss": 0.6201390624046326, "step": 299 }, { "epoch": 0.9900990099009901, "grad_norm": 0.33000362479964457, "learning_rate": 3.395076480760576e-05, "loss": 0.6103875637054443, "step": 300 }, { "epoch": 0.9933993399339934, "grad_norm": 0.31707924192462417, "learning_rate": 3.3895622922754936e-05, "loss": 0.5486876368522644, "step": 301 }, { "epoch": 0.9966996699669967, "grad_norm": 0.3094164003933957, "learning_rate": 3.384027607719043e-05, "loss": 0.5980846285820007, "step": 302 }, { "epoch": 1.0, "grad_norm": 0.33089398879681, "learning_rate": 3.378472508727931e-05, "loss": 0.5986801385879517, "step": 303 }, { "epoch": 1.0033003300330032, "grad_norm": 0.4690060258405477, "learning_rate": 3.372897077239979e-05, "loss": 0.5586727857589722, "step": 304 }, { "epoch": 1.0066006600660067, "grad_norm": 0.34686786747213394, "learning_rate": 3.36730139549291e-05, "loss": 0.5393255949020386, "step": 305 }, { "epoch": 1.00990099009901, "grad_norm": 0.4023568892604613, "learning_rate": 3.361685546023143e-05, "loss": 0.5377227067947388, "step": 306 }, { "epoch": 1.0132013201320131, "grad_norm": 0.39915820884177944, "learning_rate": 3.356049611664568e-05, "loss": 0.5223784446716309, "step": 307 }, { "epoch": 1.0165016501650166, "grad_norm": 0.3654265250846575, "learning_rate": 3.350393675547328e-05, "loss": 0.5502469539642334, "step": 308 }, { "epoch": 1.0198019801980198, "grad_norm": 0.42079557297663883, "learning_rate": 3.3447178210965936e-05, "loss": 0.5626603960990906, "step": 309 }, { "epoch": 1.023102310231023, "grad_norm": 0.3684084639129366, "learning_rate": 3.3390221320313303e-05, "loss": 0.48262274265289307, "step": 310 }, { "epoch": 1.0264026402640265, "grad_norm": 0.39908786063309193, "learning_rate": 3.333306692363065e-05, "loss": 0.5850967168807983, "step": 311 }, { "epoch": 1.0297029702970297, "grad_norm": 0.44262876970078274, "learning_rate": 3.3275715863946466e-05, "loss": 0.5444281697273254, "step": 312 }, { "epoch": 1.033003300330033, "grad_norm": 0.35239079669120155, "learning_rate": 3.3218168987190004e-05, "loss": 0.5329654216766357, "step": 313 }, { "epoch": 1.0363036303630364, "grad_norm": 0.38499730860339404, "learning_rate": 3.316042714217885e-05, "loss": 0.5276832580566406, "step": 314 }, { "epoch": 1.0396039603960396, "grad_norm": 0.3928937531164494, "learning_rate": 3.310249118060636e-05, "loss": 0.5344791412353516, "step": 315 }, { "epoch": 1.0429042904290429, "grad_norm": 0.3466589226743573, "learning_rate": 3.304436195702911e-05, "loss": 0.5479785203933716, "step": 316 }, { "epoch": 1.046204620462046, "grad_norm": 0.370325309360066, "learning_rate": 3.298604032885431e-05, "loss": 0.5223082900047302, "step": 317 }, { "epoch": 1.0495049504950495, "grad_norm": 0.4271803134046634, "learning_rate": 3.292752715632713e-05, "loss": 0.5667799711227417, "step": 318 }, { "epoch": 1.0528052805280528, "grad_norm": 0.33752277032768196, "learning_rate": 3.2868823302518016e-05, "loss": 0.5194317698478699, "step": 319 }, { "epoch": 1.056105610561056, "grad_norm": 0.35801795115870316, "learning_rate": 3.2809929633309985e-05, "loss": 0.4911007285118103, "step": 320 }, { "epoch": 1.0594059405940595, "grad_norm": 0.33819516112787196, "learning_rate": 3.2750847017385826e-05, "loss": 0.5269002914428711, "step": 321 }, { "epoch": 1.0627062706270627, "grad_norm": 0.3280280196094967, "learning_rate": 3.269157632621529e-05, "loss": 0.5124789476394653, "step": 322 }, { "epoch": 1.066006600660066, "grad_norm": 0.3841029677303286, "learning_rate": 3.263211843404225e-05, "loss": 0.5483890771865845, "step": 323 }, { "epoch": 1.0693069306930694, "grad_norm": 0.348752311292252, "learning_rate": 3.25724742178718e-05, "loss": 0.5582579374313354, "step": 324 }, { "epoch": 1.0726072607260726, "grad_norm": 0.3672218653955236, "learning_rate": 3.2512644557457304e-05, "loss": 0.5662975907325745, "step": 325 }, { "epoch": 1.0759075907590758, "grad_norm": 0.339133227284404, "learning_rate": 3.2452630335287445e-05, "loss": 0.5502511858940125, "step": 326 }, { "epoch": 1.0792079207920793, "grad_norm": 0.3607463939055526, "learning_rate": 3.239243243657318e-05, "loss": 0.5614978075027466, "step": 327 }, { "epoch": 1.0825082508250825, "grad_norm": 0.3354690532522152, "learning_rate": 3.233205174923472e-05, "loss": 0.4828110635280609, "step": 328 }, { "epoch": 1.0858085808580857, "grad_norm": 0.3296040603044689, "learning_rate": 3.22714891638884e-05, "loss": 0.5437847971916199, "step": 329 }, { "epoch": 1.0891089108910892, "grad_norm": 0.3295415767468974, "learning_rate": 3.221074557383355e-05, "loss": 0.6240063309669495, "step": 330 }, { "epoch": 1.0924092409240924, "grad_norm": 0.3032628226796708, "learning_rate": 3.2149821875039325e-05, "loss": 0.5435442328453064, "step": 331 }, { "epoch": 1.0957095709570956, "grad_norm": 0.30875440813945676, "learning_rate": 3.20887189661315e-05, "loss": 0.5240401029586792, "step": 332 }, { "epoch": 1.099009900990099, "grad_norm": 0.3043121620505056, "learning_rate": 3.202743774837919e-05, "loss": 0.5227692127227783, "step": 333 }, { "epoch": 1.1023102310231023, "grad_norm": 0.3439754692795775, "learning_rate": 3.196597912568157e-05, "loss": 0.5607417821884155, "step": 334 }, { "epoch": 1.1056105610561056, "grad_norm": 0.29691798670137787, "learning_rate": 3.1904344004554536e-05, "loss": 0.5607600808143616, "step": 335 }, { "epoch": 1.108910891089109, "grad_norm": 0.32493088910689055, "learning_rate": 3.184253329411737e-05, "loss": 0.47135430574417114, "step": 336 }, { "epoch": 1.1122112211221122, "grad_norm": 0.3202945703052858, "learning_rate": 3.178054790607924e-05, "loss": 0.5708764791488647, "step": 337 }, { "epoch": 1.1155115511551155, "grad_norm": 0.3164605548495645, "learning_rate": 3.1718388754725883e-05, "loss": 0.5522497296333313, "step": 338 }, { "epoch": 1.118811881188119, "grad_norm": 0.3449586600316318, "learning_rate": 3.1656056756906e-05, "loss": 0.5556532144546509, "step": 339 }, { "epoch": 1.1221122112211221, "grad_norm": 0.3130025484639745, "learning_rate": 3.1593552832017795e-05, "loss": 0.5727676153182983, "step": 340 }, { "epoch": 1.1254125412541254, "grad_norm": 0.3195703179740936, "learning_rate": 3.153087790199541e-05, "loss": 0.5131651759147644, "step": 341 }, { "epoch": 1.1287128712871288, "grad_norm": 0.3191177264656739, "learning_rate": 3.146803289129528e-05, "loss": 0.5143063068389893, "step": 342 }, { "epoch": 1.132013201320132, "grad_norm": 0.33398757419035885, "learning_rate": 3.1405018726882595e-05, "loss": 0.509161114692688, "step": 343 }, { "epoch": 1.1353135313531353, "grad_norm": 0.33058725446313514, "learning_rate": 3.13418363382175e-05, "loss": 0.5213526487350464, "step": 344 }, { "epoch": 1.1386138613861387, "grad_norm": 0.3226863318187914, "learning_rate": 3.127848665724149e-05, "loss": 0.5465434789657593, "step": 345 }, { "epoch": 1.141914191419142, "grad_norm": 0.6179658385179007, "learning_rate": 3.1214970618363626e-05, "loss": 0.5342190265655518, "step": 346 }, { "epoch": 1.1452145214521452, "grad_norm": 0.47777163001134637, "learning_rate": 3.115128915844672e-05, "loss": 0.541754424571991, "step": 347 }, { "epoch": 1.1485148514851484, "grad_norm": 0.33931974771490697, "learning_rate": 3.10874432167936e-05, "loss": 0.5318331122398376, "step": 348 }, { "epoch": 1.1518151815181519, "grad_norm": 0.32111740987941506, "learning_rate": 3.1023433735133134e-05, "loss": 0.4972509741783142, "step": 349 }, { "epoch": 1.155115511551155, "grad_norm": 0.30074948382432587, "learning_rate": 3.095926165760647e-05, "loss": 0.5417294502258301, "step": 350 }, { "epoch": 1.1584158415841583, "grad_norm": 0.3410522798436207, "learning_rate": 3.089492793075302e-05, "loss": 0.554945707321167, "step": 351 }, { "epoch": 1.1617161716171618, "grad_norm": 0.3254774061643724, "learning_rate": 3.083043350349653e-05, "loss": 0.5204564929008484, "step": 352 }, { "epoch": 1.165016501650165, "grad_norm": 0.3088402728006412, "learning_rate": 3.076577932713108e-05, "loss": 0.4856947064399719, "step": 353 }, { "epoch": 1.1683168316831682, "grad_norm": 0.2896918095760776, "learning_rate": 3.0700966355307055e-05, "loss": 0.5269368886947632, "step": 354 }, { "epoch": 1.1716171617161717, "grad_norm": 0.32747543865706225, "learning_rate": 3.063599554401708e-05, "loss": 0.5811939239501953, "step": 355 }, { "epoch": 1.174917491749175, "grad_norm": 0.29324577597304957, "learning_rate": 3.057086785158189e-05, "loss": 0.5636904239654541, "step": 356 }, { "epoch": 1.1782178217821782, "grad_norm": 0.31779620334412045, "learning_rate": 3.050558423863626e-05, "loss": 0.546089768409729, "step": 357 }, { "epoch": 1.1815181518151816, "grad_norm": 0.3093045991582328, "learning_rate": 3.0440145668114774e-05, "loss": 0.5239901542663574, "step": 358 }, { "epoch": 1.1848184818481848, "grad_norm": 0.31848934088179354, "learning_rate": 3.0374553105237637e-05, "loss": 0.5833466053009033, "step": 359 }, { "epoch": 1.188118811881188, "grad_norm": 0.33803859097620154, "learning_rate": 3.0308807517496456e-05, "loss": 0.5060774087905884, "step": 360 }, { "epoch": 1.1914191419141915, "grad_norm": 0.31145081064149094, "learning_rate": 3.0242909874639953e-05, "loss": 0.5164307355880737, "step": 361 }, { "epoch": 1.1947194719471947, "grad_norm": 0.29765085452905116, "learning_rate": 3.0176861148659672e-05, "loss": 0.49949395656585693, "step": 362 }, { "epoch": 1.198019801980198, "grad_norm": 0.3296486034239661, "learning_rate": 3.0110662313775623e-05, "loss": 0.5581181049346924, "step": 363 }, { "epoch": 1.2013201320132012, "grad_norm": 0.3116631729941006, "learning_rate": 3.0044314346421938e-05, "loss": 0.5657376646995544, "step": 364 }, { "epoch": 1.2046204620462047, "grad_norm": 0.33012695180790946, "learning_rate": 2.9977818225232443e-05, "loss": 0.5269935131072998, "step": 365 }, { "epoch": 1.2079207920792079, "grad_norm": 0.31869984664933465, "learning_rate": 2.991117493102626e-05, "loss": 0.5385931730270386, "step": 366 }, { "epoch": 1.2112211221122111, "grad_norm": 0.30491226427581125, "learning_rate": 2.984438544679329e-05, "loss": 0.5615143179893494, "step": 367 }, { "epoch": 1.2145214521452146, "grad_norm": 0.32195999076013593, "learning_rate": 2.9777450757679754e-05, "loss": 0.5175333023071289, "step": 368 }, { "epoch": 1.2178217821782178, "grad_norm": 0.30930257180361886, "learning_rate": 2.971037185097364e-05, "loss": 0.565494179725647, "step": 369 }, { "epoch": 1.221122112211221, "grad_norm": 0.34237830645177886, "learning_rate": 2.9643149716090146e-05, "loss": 0.5519120693206787, "step": 370 }, { "epoch": 1.2244224422442245, "grad_norm": 0.30959351563618437, "learning_rate": 2.9575785344557114e-05, "loss": 0.49374374747276306, "step": 371 }, { "epoch": 1.2277227722772277, "grad_norm": 0.31310768619122714, "learning_rate": 2.950827973000034e-05, "loss": 0.5608875751495361, "step": 372 }, { "epoch": 1.231023102310231, "grad_norm": 0.31986895424613543, "learning_rate": 2.944063386812899e-05, "loss": 0.5866271257400513, "step": 373 }, { "epoch": 1.2343234323432344, "grad_norm": 0.3359900469491975, "learning_rate": 2.9372848756720867e-05, "loss": 0.5342913269996643, "step": 374 }, { "epoch": 1.2376237623762376, "grad_norm": 0.2956484140793021, "learning_rate": 2.9304925395607696e-05, "loss": 0.5539537668228149, "step": 375 }, { "epoch": 1.2409240924092408, "grad_norm": 0.3239136306261367, "learning_rate": 2.9236864786660423e-05, "loss": 0.5614147186279297, "step": 376 }, { "epoch": 1.2442244224422443, "grad_norm": 0.3311932744032855, "learning_rate": 2.9168667933774356e-05, "loss": 0.46689367294311523, "step": 377 }, { "epoch": 1.2475247524752475, "grad_norm": 0.3291299090174619, "learning_rate": 2.910033584285444e-05, "loss": 0.5383083820343018, "step": 378 }, { "epoch": 1.2508250825082508, "grad_norm": 0.3013900588246958, "learning_rate": 2.903186952180037e-05, "loss": 0.5349752902984619, "step": 379 }, { "epoch": 1.2541254125412542, "grad_norm": 0.3219145450840317, "learning_rate": 2.8963269980491743e-05, "loss": 0.5792303681373596, "step": 380 }, { "epoch": 1.2574257425742574, "grad_norm": 0.2840550960191948, "learning_rate": 2.8894538230773147e-05, "loss": 0.524924099445343, "step": 381 }, { "epoch": 1.2607260726072607, "grad_norm": 0.3172399675943548, "learning_rate": 2.882567528643925e-05, "loss": 0.5137406587600708, "step": 382 }, { "epoch": 1.2640264026402641, "grad_norm": 0.2893676822687234, "learning_rate": 2.8756682163219857e-05, "loss": 0.5196574926376343, "step": 383 }, { "epoch": 1.2673267326732673, "grad_norm": 0.31363904787626334, "learning_rate": 2.8687559878764903e-05, "loss": 0.585644006729126, "step": 384 }, { "epoch": 1.2706270627062706, "grad_norm": 0.3310272877884813, "learning_rate": 2.8618309452629445e-05, "loss": 0.5973786115646362, "step": 385 }, { "epoch": 1.273927392739274, "grad_norm": 0.3201222210217655, "learning_rate": 2.854893190625865e-05, "loss": 0.5909825563430786, "step": 386 }, { "epoch": 1.2772277227722773, "grad_norm": 0.3507731714316878, "learning_rate": 2.84794282629727e-05, "loss": 0.5903690457344055, "step": 387 }, { "epoch": 1.2805280528052805, "grad_norm": 0.31011243056320775, "learning_rate": 2.840979954795171e-05, "loss": 0.5316457152366638, "step": 388 }, { "epoch": 1.283828382838284, "grad_norm": 0.32950464198309637, "learning_rate": 2.8340046788220613e-05, "loss": 0.5080389976501465, "step": 389 }, { "epoch": 1.2871287128712872, "grad_norm": 0.37769184930606736, "learning_rate": 2.8270171012633994e-05, "loss": 0.6137889623641968, "step": 390 }, { "epoch": 1.2904290429042904, "grad_norm": 0.34430823745531935, "learning_rate": 2.8200173251860928e-05, "loss": 0.5433805584907532, "step": 391 }, { "epoch": 1.2937293729372938, "grad_norm": 0.356563736773021, "learning_rate": 2.8130054538369775e-05, "loss": 0.4965590834617615, "step": 392 }, { "epoch": 1.297029702970297, "grad_norm": 0.29380923244218154, "learning_rate": 2.805981590641295e-05, "loss": 0.5361340045928955, "step": 393 }, { "epoch": 1.3003300330033003, "grad_norm": 0.31403525376793245, "learning_rate": 2.7989458392011678e-05, "loss": 0.47011327743530273, "step": 394 }, { "epoch": 1.3036303630363038, "grad_norm": 0.30710914438533876, "learning_rate": 2.7918983032940666e-05, "loss": 0.5893687605857849, "step": 395 }, { "epoch": 1.306930693069307, "grad_norm": 0.3126943781985397, "learning_rate": 2.7848390868712886e-05, "loss": 0.5219327211380005, "step": 396 }, { "epoch": 1.3102310231023102, "grad_norm": 0.35585146532127665, "learning_rate": 2.7777682940564142e-05, "loss": 0.5652155876159668, "step": 397 }, { "epoch": 1.3135313531353137, "grad_norm": 0.41906023992763497, "learning_rate": 2.7706860291437784e-05, "loss": 0.5361950397491455, "step": 398 }, { "epoch": 1.316831683168317, "grad_norm": 0.29071400108766793, "learning_rate": 2.763592396596929e-05, "loss": 0.5355206727981567, "step": 399 }, { "epoch": 1.3201320132013201, "grad_norm": 0.298123677847084, "learning_rate": 2.756487501047086e-05, "loss": 0.5082858800888062, "step": 400 }, { "epoch": 1.3234323432343233, "grad_norm": 0.3144050740212562, "learning_rate": 2.7493714472916013e-05, "loss": 0.5282934904098511, "step": 401 }, { "epoch": 1.3267326732673268, "grad_norm": 0.29396121691648713, "learning_rate": 2.7422443402924074e-05, "loss": 0.5502887964248657, "step": 402 }, { "epoch": 1.33003300330033, "grad_norm": 0.2854429234726643, "learning_rate": 2.7351062851744747e-05, "loss": 0.5374204516410828, "step": 403 }, { "epoch": 1.3333333333333333, "grad_norm": 0.30308752538818784, "learning_rate": 2.7279573872242574e-05, "loss": 0.5602293014526367, "step": 404 }, { "epoch": 1.3366336633663367, "grad_norm": 0.30975657746221447, "learning_rate": 2.7207977518881418e-05, "loss": 0.5321286916732788, "step": 405 }, { "epoch": 1.33993399339934, "grad_norm": 0.28965457921713383, "learning_rate": 2.713627484770892e-05, "loss": 0.5523560047149658, "step": 406 }, { "epoch": 1.3432343234323432, "grad_norm": 0.30598816879566076, "learning_rate": 2.706446691634089e-05, "loss": 0.47019705176353455, "step": 407 }, { "epoch": 1.3465346534653464, "grad_norm": 0.2977261513860205, "learning_rate": 2.6992554783945748e-05, "loss": 0.540359616279602, "step": 408 }, { "epoch": 1.3498349834983498, "grad_norm": 0.2845048826043699, "learning_rate": 2.6920539511228874e-05, "loss": 0.561464786529541, "step": 409 }, { "epoch": 1.353135313531353, "grad_norm": 0.2939741197740927, "learning_rate": 2.6848422160416956e-05, "loss": 0.5429259538650513, "step": 410 }, { "epoch": 1.3564356435643563, "grad_norm": 0.2968609589915083, "learning_rate": 2.677620379524237e-05, "loss": 0.5452640652656555, "step": 411 }, { "epoch": 1.3597359735973598, "grad_norm": 0.28949363661635646, "learning_rate": 2.670388548092741e-05, "loss": 0.49627864360809326, "step": 412 }, { "epoch": 1.363036303630363, "grad_norm": 0.328169978832012, "learning_rate": 2.663146828416867e-05, "loss": 0.5331633687019348, "step": 413 }, { "epoch": 1.3663366336633662, "grad_norm": 0.2926434963884909, "learning_rate": 2.6558953273121216e-05, "loss": 0.5447151064872742, "step": 414 }, { "epoch": 1.3696369636963697, "grad_norm": 0.2863360845432002, "learning_rate": 2.648634151738292e-05, "loss": 0.5467007160186768, "step": 415 }, { "epoch": 1.372937293729373, "grad_norm": 0.33044933855099695, "learning_rate": 2.6413634087978602e-05, "loss": 0.5804279446601868, "step": 416 }, { "epoch": 1.3762376237623761, "grad_norm": 0.29168904019746145, "learning_rate": 2.63408320573443e-05, "loss": 0.5323517322540283, "step": 417 }, { "epoch": 1.3795379537953796, "grad_norm": 0.3046417110987717, "learning_rate": 2.6267936499311402e-05, "loss": 0.5452409982681274, "step": 418 }, { "epoch": 1.3828382838283828, "grad_norm": 0.2878853361033164, "learning_rate": 2.619494848909084e-05, "loss": 0.4622665047645569, "step": 419 }, { "epoch": 1.386138613861386, "grad_norm": 0.3129938954769346, "learning_rate": 2.6121869103257206e-05, "loss": 0.531772255897522, "step": 420 }, { "epoch": 1.3894389438943895, "grad_norm": 0.3044320552061303, "learning_rate": 2.6048699419732897e-05, "loss": 0.519554853439331, "step": 421 }, { "epoch": 1.3927392739273927, "grad_norm": 0.32616258357306027, "learning_rate": 2.5975440517772187e-05, "loss": 0.545585572719574, "step": 422 }, { "epoch": 1.396039603960396, "grad_norm": 0.297995845019565, "learning_rate": 2.5902093477945345e-05, "loss": 0.5641547441482544, "step": 423 }, { "epoch": 1.3993399339933994, "grad_norm": 0.28406971495281874, "learning_rate": 2.5828659382122655e-05, "loss": 0.5578028559684753, "step": 424 }, { "epoch": 1.4026402640264026, "grad_norm": 0.35618435421860006, "learning_rate": 2.5755139313458484e-05, "loss": 0.5931404232978821, "step": 425 }, { "epoch": 1.4059405940594059, "grad_norm": 0.3227282264542969, "learning_rate": 2.5681534356375314e-05, "loss": 0.5486891865730286, "step": 426 }, { "epoch": 1.4092409240924093, "grad_norm": 0.31220449886262164, "learning_rate": 2.5607845596547706e-05, "loss": 0.5007671117782593, "step": 427 }, { "epoch": 1.4125412541254125, "grad_norm": 0.2970377848116104, "learning_rate": 2.5534074120886346e-05, "loss": 0.5044519901275635, "step": 428 }, { "epoch": 1.4158415841584158, "grad_norm": 0.30667327850480125, "learning_rate": 2.5460221017521952e-05, "loss": 0.5227789878845215, "step": 429 }, { "epoch": 1.4191419141914192, "grad_norm": 0.2902458759439887, "learning_rate": 2.538628737578926e-05, "loss": 0.5530189871788025, "step": 430 }, { "epoch": 1.4224422442244224, "grad_norm": 0.3114416510328153, "learning_rate": 2.5312274286210966e-05, "loss": 0.508142352104187, "step": 431 }, { "epoch": 1.4257425742574257, "grad_norm": 0.30284970816559353, "learning_rate": 2.523818284048159e-05, "loss": 0.5497263669967651, "step": 432 }, { "epoch": 1.4290429042904291, "grad_norm": 0.3619418905679721, "learning_rate": 2.5164014131451443e-05, "loss": 0.5477034449577332, "step": 433 }, { "epoch": 1.4323432343234324, "grad_norm": 0.28668741491270383, "learning_rate": 2.508976925311045e-05, "loss": 0.5091728568077087, "step": 434 }, { "epoch": 1.4356435643564356, "grad_norm": 0.2922234358135184, "learning_rate": 2.501544930057203e-05, "loss": 0.5022713541984558, "step": 435 }, { "epoch": 1.438943894389439, "grad_norm": 0.29994035273286174, "learning_rate": 2.494105537005697e-05, "loss": 0.5401599407196045, "step": 436 }, { "epoch": 1.4422442244224423, "grad_norm": 0.27863085551634303, "learning_rate": 2.4866588558877208e-05, "loss": 0.5632063150405884, "step": 437 }, { "epoch": 1.4455445544554455, "grad_norm": 0.2968792338733857, "learning_rate": 2.479204996541969e-05, "loss": 0.552355170249939, "step": 438 }, { "epoch": 1.448844884488449, "grad_norm": 0.3222205976590156, "learning_rate": 2.4717440689130154e-05, "loss": 0.5604996681213379, "step": 439 }, { "epoch": 1.4521452145214522, "grad_norm": 0.2781451863798608, "learning_rate": 2.4642761830496893e-05, "loss": 0.4961245656013489, "step": 440 }, { "epoch": 1.4554455445544554, "grad_norm": 0.3327533816855903, "learning_rate": 2.4568014491034565e-05, "loss": 0.5403590202331543, "step": 441 }, { "epoch": 1.4587458745874589, "grad_norm": 0.2944499869326328, "learning_rate": 2.4493199773267902e-05, "loss": 0.4753378629684448, "step": 442 }, { "epoch": 1.462046204620462, "grad_norm": 0.30936599048377306, "learning_rate": 2.4418318780715477e-05, "loss": 0.5125438570976257, "step": 443 }, { "epoch": 1.4653465346534653, "grad_norm": 0.3047486735791836, "learning_rate": 2.434337261787342e-05, "loss": 0.5670269727706909, "step": 444 }, { "epoch": 1.4686468646864688, "grad_norm": 0.3348418102837006, "learning_rate": 2.426836239019911e-05, "loss": 0.5538198947906494, "step": 445 }, { "epoch": 1.471947194719472, "grad_norm": 0.2790312641462961, "learning_rate": 2.4193289204094893e-05, "loss": 0.5012328028678894, "step": 446 }, { "epoch": 1.4752475247524752, "grad_norm": 0.30485310749783334, "learning_rate": 2.4118154166891762e-05, "loss": 0.538119912147522, "step": 447 }, { "epoch": 1.4785478547854787, "grad_norm": 0.32398781026753815, "learning_rate": 2.4042958386833003e-05, "loss": 0.5252339839935303, "step": 448 }, { "epoch": 1.481848184818482, "grad_norm": 0.326928536480608, "learning_rate": 2.3967702973057853e-05, "loss": 0.5367081761360168, "step": 449 }, { "epoch": 1.4851485148514851, "grad_norm": 0.3044938562463835, "learning_rate": 2.3892389035585167e-05, "loss": 0.5091884136199951, "step": 450 }, { "epoch": 1.4884488448844886, "grad_norm": 0.2897824690201277, "learning_rate": 2.3817017685297016e-05, "loss": 0.5079891681671143, "step": 451 }, { "epoch": 1.4917491749174918, "grad_norm": 0.2966882318097961, "learning_rate": 2.3741590033922313e-05, "loss": 0.511939287185669, "step": 452 }, { "epoch": 1.495049504950495, "grad_norm": 0.28797637565211376, "learning_rate": 2.3666107194020404e-05, "loss": 0.5070478916168213, "step": 453 }, { "epoch": 1.4983498349834983, "grad_norm": 0.29050652670321586, "learning_rate": 2.3590570278964682e-05, "loss": 0.547492504119873, "step": 454 }, { "epoch": 1.5016501650165015, "grad_norm": 0.311874965448668, "learning_rate": 2.3514980402926132e-05, "loss": 0.5386558771133423, "step": 455 }, { "epoch": 1.504950495049505, "grad_norm": 0.26980126113979913, "learning_rate": 2.3439338680856943e-05, "loss": 0.48668172955513, "step": 456 }, { "epoch": 1.5082508250825084, "grad_norm": 0.31689121328788056, "learning_rate": 2.3363646228474002e-05, "loss": 0.5497942566871643, "step": 457 }, { "epoch": 1.5115511551155114, "grad_norm": 0.3648919358675907, "learning_rate": 2.328790416224248e-05, "loss": 0.5267748832702637, "step": 458 }, { "epoch": 1.5148514851485149, "grad_norm": 0.3191029117024018, "learning_rate": 2.3212113599359368e-05, "loss": 0.5578982830047607, "step": 459 }, { "epoch": 1.5181518151815183, "grad_norm": 0.30610891906133464, "learning_rate": 2.3136275657736956e-05, "loss": 0.5136545896530151, "step": 460 }, { "epoch": 1.5214521452145213, "grad_norm": 0.28466532575384307, "learning_rate": 2.3060391455986403e-05, "loss": 0.5718669891357422, "step": 461 }, { "epoch": 1.5247524752475248, "grad_norm": 0.3064265170567389, "learning_rate": 2.2984462113401184e-05, "loss": 0.5427108407020569, "step": 462 }, { "epoch": 1.528052805280528, "grad_norm": 0.28495826208338726, "learning_rate": 2.2908488749940596e-05, "loss": 0.5293564200401306, "step": 463 }, { "epoch": 1.5313531353135312, "grad_norm": 0.3073240786964915, "learning_rate": 2.2832472486213275e-05, "loss": 0.550743579864502, "step": 464 }, { "epoch": 1.5346534653465347, "grad_norm": 0.30789089349395116, "learning_rate": 2.2756414443460602e-05, "loss": 0.5957387685775757, "step": 465 }, { "epoch": 1.537953795379538, "grad_norm": 0.2840660845057486, "learning_rate": 2.2680315743540234e-05, "loss": 0.4994407892227173, "step": 466 }, { "epoch": 1.5412541254125411, "grad_norm": 0.2912314912557071, "learning_rate": 2.260417750890949e-05, "loss": 0.5120857954025269, "step": 467 }, { "epoch": 1.5445544554455446, "grad_norm": 0.3024618438133355, "learning_rate": 2.2528000862608845e-05, "loss": 0.5727359056472778, "step": 468 }, { "epoch": 1.5478547854785478, "grad_norm": 0.30379584493476613, "learning_rate": 2.2451786928245344e-05, "loss": 0.584964394569397, "step": 469 }, { "epoch": 1.551155115511551, "grad_norm": 0.2782374360382863, "learning_rate": 2.237553682997603e-05, "loss": 0.5507112741470337, "step": 470 }, { "epoch": 1.5544554455445545, "grad_norm": 0.26333814455393634, "learning_rate": 2.2299251692491364e-05, "loss": 0.49136701226234436, "step": 471 }, { "epoch": 1.5577557755775577, "grad_norm": 0.31673569076077385, "learning_rate": 2.2222932640998635e-05, "loss": 0.5374805927276611, "step": 472 }, { "epoch": 1.561056105610561, "grad_norm": 0.29370656251116817, "learning_rate": 2.2146580801205362e-05, "loss": 0.523996114730835, "step": 473 }, { "epoch": 1.5643564356435644, "grad_norm": 0.27277397989040114, "learning_rate": 2.207019729930271e-05, "loss": 0.48198428750038147, "step": 474 }, { "epoch": 1.5676567656765676, "grad_norm": 0.2861287068823064, "learning_rate": 2.199378326194883e-05, "loss": 0.5148699879646301, "step": 475 }, { "epoch": 1.5709570957095709, "grad_norm": 0.2981231032466442, "learning_rate": 2.1917339816252303e-05, "loss": 0.5297671556472778, "step": 476 }, { "epoch": 1.5742574257425743, "grad_norm": 0.2775943923870632, "learning_rate": 2.1840868089755465e-05, "loss": 0.5082278847694397, "step": 477 }, { "epoch": 1.5775577557755776, "grad_norm": 0.2988631140370514, "learning_rate": 2.176436921041779e-05, "loss": 0.4755392372608185, "step": 478 }, { "epoch": 1.5808580858085808, "grad_norm": 0.28707182004966697, "learning_rate": 2.1687844306599275e-05, "loss": 0.5249454975128174, "step": 479 }, { "epoch": 1.5841584158415842, "grad_norm": 0.3023499942723386, "learning_rate": 2.161129450704376e-05, "loss": 0.5626166462898254, "step": 480 }, { "epoch": 1.5874587458745875, "grad_norm": 0.28182475866947054, "learning_rate": 2.1534720940862318e-05, "loss": 0.5590533018112183, "step": 481 }, { "epoch": 1.5907590759075907, "grad_norm": 0.2724331542693392, "learning_rate": 2.1458124737516557e-05, "loss": 0.5146170854568481, "step": 482 }, { "epoch": 1.5940594059405941, "grad_norm": 0.28834268248771533, "learning_rate": 2.1381507026802007e-05, "loss": 0.5633066296577454, "step": 483 }, { "epoch": 1.5973597359735974, "grad_norm": 0.29376551657635425, "learning_rate": 2.130486893883141e-05, "loss": 0.5273865461349487, "step": 484 }, { "epoch": 1.6006600660066006, "grad_norm": 0.277893471974935, "learning_rate": 2.1228211604018088e-05, "loss": 0.5040723085403442, "step": 485 }, { "epoch": 1.603960396039604, "grad_norm": 0.2901419412347278, "learning_rate": 2.1151536153059254e-05, "loss": 0.5254411697387695, "step": 486 }, { "epoch": 1.6072607260726073, "grad_norm": 0.29340041503520936, "learning_rate": 2.1074843716919323e-05, "loss": 0.5789728760719299, "step": 487 }, { "epoch": 1.6105610561056105, "grad_norm": 0.2858502686555999, "learning_rate": 2.0998135426813245e-05, "loss": 0.5521235466003418, "step": 488 }, { "epoch": 1.613861386138614, "grad_norm": 0.2770947277408911, "learning_rate": 2.092141241418984e-05, "loss": 0.4702959954738617, "step": 489 }, { "epoch": 1.6171617161716172, "grad_norm": 0.29713285242144816, "learning_rate": 2.0844675810715046e-05, "loss": 0.4960707128047943, "step": 490 }, { "epoch": 1.6204620462046204, "grad_norm": 0.2800759957297699, "learning_rate": 2.076792674825529e-05, "loss": 0.5334826111793518, "step": 491 }, { "epoch": 1.6237623762376239, "grad_norm": 0.4465546145157964, "learning_rate": 2.0691166358860775e-05, "loss": 0.5604894161224365, "step": 492 }, { "epoch": 1.627062706270627, "grad_norm": 0.2895889767199155, "learning_rate": 2.061439577474875e-05, "loss": 0.5565654635429382, "step": 493 }, { "epoch": 1.6303630363036303, "grad_norm": 0.2663082120203026, "learning_rate": 2.0537616128286875e-05, "loss": 0.541640043258667, "step": 494 }, { "epoch": 1.6336633663366338, "grad_norm": 0.27975047407467746, "learning_rate": 2.0460828551976436e-05, "loss": 0.5247132182121277, "step": 495 }, { "epoch": 1.636963696369637, "grad_norm": 0.30554958978585, "learning_rate": 2.0384034178435727e-05, "loss": 0.533937394618988, "step": 496 }, { "epoch": 1.6402640264026402, "grad_norm": 0.29094539458240765, "learning_rate": 2.0307234140383264e-05, "loss": 0.5857927799224854, "step": 497 }, { "epoch": 1.6435643564356437, "grad_norm": 0.2718482098386275, "learning_rate": 2.0230429570621134e-05, "loss": 0.5191807746887207, "step": 498 }, { "epoch": 1.6468646864686467, "grad_norm": 0.28523897670587156, "learning_rate": 2.0153621602018276e-05, "loss": 0.5255881547927856, "step": 499 }, { "epoch": 1.6501650165016502, "grad_norm": 0.27057309315143646, "learning_rate": 2.0076811367493736e-05, "loss": 0.5134017467498779, "step": 500 }, { "epoch": 1.6534653465346536, "grad_norm": 0.2603322919481828, "learning_rate": 2e-05, "loss": 0.4548872113227844, "step": 501 }, { "epoch": 1.6567656765676566, "grad_norm": 0.2841830282558966, "learning_rate": 1.9923188632506268e-05, "loss": 0.4879235625267029, "step": 502 }, { "epoch": 1.66006600660066, "grad_norm": 0.2718072353452213, "learning_rate": 1.9846378397981737e-05, "loss": 0.5488070249557495, "step": 503 }, { "epoch": 1.6633663366336635, "grad_norm": 0.26980717544426264, "learning_rate": 1.976957042937887e-05, "loss": 0.474858820438385, "step": 504 }, { "epoch": 1.6666666666666665, "grad_norm": 0.2729038695715346, "learning_rate": 1.969276585961674e-05, "loss": 0.573983907699585, "step": 505 }, { "epoch": 1.66996699669967, "grad_norm": 0.2754435399081945, "learning_rate": 1.9615965821564284e-05, "loss": 0.5299487709999084, "step": 506 }, { "epoch": 1.6732673267326734, "grad_norm": 0.28078214205826996, "learning_rate": 1.9539171448023568e-05, "loss": 0.580963134765625, "step": 507 }, { "epoch": 1.6765676567656764, "grad_norm": 0.28056872169008745, "learning_rate": 1.946238387171313e-05, "loss": 0.5240850448608398, "step": 508 }, { "epoch": 1.6798679867986799, "grad_norm": 0.27579932032687055, "learning_rate": 1.9385604225251245e-05, "loss": 0.5397930145263672, "step": 509 }, { "epoch": 1.6831683168316833, "grad_norm": 0.2649239844230271, "learning_rate": 1.9308833641139235e-05, "loss": 0.4949077367782593, "step": 510 }, { "epoch": 1.6864686468646863, "grad_norm": 0.26821293718742795, "learning_rate": 1.9232073251744715e-05, "loss": 0.4906027913093567, "step": 511 }, { "epoch": 1.6897689768976898, "grad_norm": 0.30180544906142204, "learning_rate": 1.9155324189284957e-05, "loss": 0.562363862991333, "step": 512 }, { "epoch": 1.693069306930693, "grad_norm": 0.26560887539548794, "learning_rate": 1.9078587585810167e-05, "loss": 0.5347090363502502, "step": 513 }, { "epoch": 1.6963696369636962, "grad_norm": 0.28206984650870465, "learning_rate": 1.900186457318676e-05, "loss": 0.5554836988449097, "step": 514 }, { "epoch": 1.6996699669966997, "grad_norm": 0.2667791650009087, "learning_rate": 1.8925156283080684e-05, "loss": 0.5179104208946228, "step": 515 }, { "epoch": 1.702970297029703, "grad_norm": 0.2759730227945326, "learning_rate": 1.8848463846940756e-05, "loss": 0.552240252494812, "step": 516 }, { "epoch": 1.7062706270627062, "grad_norm": 0.34634391778922186, "learning_rate": 1.8771788395981915e-05, "loss": 0.534430980682373, "step": 517 }, { "epoch": 1.7095709570957096, "grad_norm": 0.26711110641337843, "learning_rate": 1.8695131061168598e-05, "loss": 0.5601803064346313, "step": 518 }, { "epoch": 1.7128712871287128, "grad_norm": 0.3479876576460715, "learning_rate": 1.8618492973198e-05, "loss": 0.5119711756706238, "step": 519 }, { "epoch": 1.716171617161716, "grad_norm": 0.32608510378908223, "learning_rate": 1.8541875262483446e-05, "loss": 0.5632577538490295, "step": 520 }, { "epoch": 1.7194719471947195, "grad_norm": 0.2744236737297373, "learning_rate": 1.8465279059137686e-05, "loss": 0.5499478578567505, "step": 521 }, { "epoch": 1.7227722772277227, "grad_norm": 0.2835433030263243, "learning_rate": 1.8388705492956244e-05, "loss": 0.5176683664321899, "step": 522 }, { "epoch": 1.726072607260726, "grad_norm": 0.30494439216544983, "learning_rate": 1.8312155693400735e-05, "loss": 0.49528205394744873, "step": 523 }, { "epoch": 1.7293729372937294, "grad_norm": 0.26710805184601655, "learning_rate": 1.8235630789582213e-05, "loss": 0.5684216022491455, "step": 524 }, { "epoch": 1.7326732673267327, "grad_norm": 0.3852411183060649, "learning_rate": 1.815913191024454e-05, "loss": 0.5375942587852478, "step": 525 }, { "epoch": 1.7359735973597359, "grad_norm": 0.33262500157086355, "learning_rate": 1.8082660183747704e-05, "loss": 0.5541956424713135, "step": 526 }, { "epoch": 1.7392739273927393, "grad_norm": 0.28642691265671333, "learning_rate": 1.8006216738051175e-05, "loss": 0.5304872393608093, "step": 527 }, { "epoch": 1.7425742574257426, "grad_norm": 0.2734388390360432, "learning_rate": 1.7929802700697297e-05, "loss": 0.48648735880851746, "step": 528 }, { "epoch": 1.7458745874587458, "grad_norm": 0.28617564742207474, "learning_rate": 1.7853419198794638e-05, "loss": 0.49221059679985046, "step": 529 }, { "epoch": 1.7491749174917492, "grad_norm": 0.2790947673251484, "learning_rate": 1.7777067359001375e-05, "loss": 0.5652948021888733, "step": 530 }, { "epoch": 1.7524752475247525, "grad_norm": 0.2853703561489374, "learning_rate": 1.7700748307508643e-05, "loss": 0.5187686681747437, "step": 531 }, { "epoch": 1.7557755775577557, "grad_norm": 0.2853976224574607, "learning_rate": 1.7624463170023974e-05, "loss": 0.5013114809989929, "step": 532 }, { "epoch": 1.7590759075907592, "grad_norm": 0.2619757068753479, "learning_rate": 1.7548213071754663e-05, "loss": 0.47477245330810547, "step": 533 }, { "epoch": 1.7623762376237624, "grad_norm": 0.29220608585061886, "learning_rate": 1.7471999137391162e-05, "loss": 0.5600515007972717, "step": 534 }, { "epoch": 1.7656765676567656, "grad_norm": 0.2680464906367101, "learning_rate": 1.7395822491090513e-05, "loss": 0.5017521381378174, "step": 535 }, { "epoch": 1.768976897689769, "grad_norm": 0.3164936697237469, "learning_rate": 1.7319684256459773e-05, "loss": 0.48718830943107605, "step": 536 }, { "epoch": 1.7722772277227723, "grad_norm": 0.26576630911317906, "learning_rate": 1.72435855565394e-05, "loss": 0.5348131060600281, "step": 537 }, { "epoch": 1.7755775577557755, "grad_norm": 0.3785718389935733, "learning_rate": 1.716752751378673e-05, "loss": 0.5132070183753967, "step": 538 }, { "epoch": 1.778877887788779, "grad_norm": 0.2912227396538846, "learning_rate": 1.7091511250059407e-05, "loss": 0.5194598436355591, "step": 539 }, { "epoch": 1.7821782178217822, "grad_norm": 0.25340183641995817, "learning_rate": 1.701553788659883e-05, "loss": 0.4950656294822693, "step": 540 }, { "epoch": 1.7854785478547854, "grad_norm": 0.32993048381725726, "learning_rate": 1.6939608544013603e-05, "loss": 0.5465744137763977, "step": 541 }, { "epoch": 1.7887788778877889, "grad_norm": 0.33326548174687204, "learning_rate": 1.6863724342263047e-05, "loss": 0.5328625440597534, "step": 542 }, { "epoch": 1.7920792079207921, "grad_norm": 0.2747817812302539, "learning_rate": 1.6787886400640645e-05, "loss": 0.483689546585083, "step": 543 }, { "epoch": 1.7953795379537953, "grad_norm": 0.2619017709081145, "learning_rate": 1.6712095837757525e-05, "loss": 0.5225390195846558, "step": 544 }, { "epoch": 1.7986798679867988, "grad_norm": 0.2718453161830156, "learning_rate": 1.6636353771526005e-05, "loss": 0.5168595314025879, "step": 545 }, { "epoch": 1.801980198019802, "grad_norm": 0.2915579523683445, "learning_rate": 1.6560661319143064e-05, "loss": 0.5257725119590759, "step": 546 }, { "epoch": 1.8052805280528053, "grad_norm": 0.2767711815305055, "learning_rate": 1.648501959707387e-05, "loss": 0.5023485422134399, "step": 547 }, { "epoch": 1.8085808580858087, "grad_norm": 0.267570701584644, "learning_rate": 1.6409429721035324e-05, "loss": 0.48897239565849304, "step": 548 }, { "epoch": 1.811881188118812, "grad_norm": 0.28714006005114934, "learning_rate": 1.63338928059796e-05, "loss": 0.5318676829338074, "step": 549 }, { "epoch": 1.8151815181518152, "grad_norm": 0.2802563301473015, "learning_rate": 1.6258409966077693e-05, "loss": 0.4996787905693054, "step": 550 }, { "epoch": 1.8184818481848186, "grad_norm": 0.28354713397276166, "learning_rate": 1.6182982314702987e-05, "loss": 0.4833434820175171, "step": 551 }, { "epoch": 1.8217821782178216, "grad_norm": 0.2904168234412241, "learning_rate": 1.6107610964414836e-05, "loss": 0.5050291419029236, "step": 552 }, { "epoch": 1.825082508250825, "grad_norm": 0.2859100119195952, "learning_rate": 1.6032297026942154e-05, "loss": 0.5423529148101807, "step": 553 }, { "epoch": 1.8283828382838285, "grad_norm": 0.2700093369793658, "learning_rate": 1.5957041613167007e-05, "loss": 0.5670536756515503, "step": 554 }, { "epoch": 1.8316831683168315, "grad_norm": 0.2784484594925466, "learning_rate": 1.5881845833108245e-05, "loss": 0.5148528814315796, "step": 555 }, { "epoch": 1.834983498349835, "grad_norm": 0.2795083034807244, "learning_rate": 1.5806710795905113e-05, "loss": 0.5441350340843201, "step": 556 }, { "epoch": 1.8382838283828384, "grad_norm": 0.27706485047893287, "learning_rate": 1.5731637609800897e-05, "loss": 0.5338016748428345, "step": 557 }, { "epoch": 1.8415841584158414, "grad_norm": 0.281671337152691, "learning_rate": 1.5656627382126587e-05, "loss": 0.522803783416748, "step": 558 }, { "epoch": 1.844884488448845, "grad_norm": 0.2867314215651197, "learning_rate": 1.5581681219284523e-05, "loss": 0.5079183578491211, "step": 559 }, { "epoch": 1.8481848184818483, "grad_norm": 0.2880604655799914, "learning_rate": 1.5506800226732104e-05, "loss": 0.5360547304153442, "step": 560 }, { "epoch": 1.8514851485148514, "grad_norm": 0.276328956502413, "learning_rate": 1.5431985508965438e-05, "loss": 0.5137909650802612, "step": 561 }, { "epoch": 1.8547854785478548, "grad_norm": 0.26198432963654783, "learning_rate": 1.5357238169503107e-05, "loss": 0.513020396232605, "step": 562 }, { "epoch": 1.858085808580858, "grad_norm": 0.3155751914603546, "learning_rate": 1.5282559310869856e-05, "loss": 0.5015939474105835, "step": 563 }, { "epoch": 1.8613861386138613, "grad_norm": 0.2654102353913447, "learning_rate": 1.5207950034580317e-05, "loss": 0.5012743473052979, "step": 564 }, { "epoch": 1.8646864686468647, "grad_norm": 0.27309132142690246, "learning_rate": 1.5133411441122799e-05, "loss": 0.48864254355430603, "step": 565 }, { "epoch": 1.867986798679868, "grad_norm": 0.6058665885379618, "learning_rate": 1.5058944629943044e-05, "loss": 0.437102347612381, "step": 566 }, { "epoch": 1.8712871287128712, "grad_norm": 0.2718164602566872, "learning_rate": 1.4984550699427978e-05, "loss": 0.5518525838851929, "step": 567 }, { "epoch": 1.8745874587458746, "grad_norm": 0.2832474093938169, "learning_rate": 1.4910230746889559e-05, "loss": 0.5618141889572144, "step": 568 }, { "epoch": 1.8778877887788779, "grad_norm": 0.2790138686096534, "learning_rate": 1.4835985868548557e-05, "loss": 0.4990406632423401, "step": 569 }, { "epoch": 1.881188118811881, "grad_norm": 0.26198363334655667, "learning_rate": 1.4761817159518415e-05, "loss": 0.5004926919937134, "step": 570 }, { "epoch": 1.8844884488448845, "grad_norm": 0.28233065536105734, "learning_rate": 1.4687725713789042e-05, "loss": 0.5166051983833313, "step": 571 }, { "epoch": 1.8877887788778878, "grad_norm": 0.2654807250852616, "learning_rate": 1.461371262421074e-05, "loss": 0.5510391592979431, "step": 572 }, { "epoch": 1.891089108910891, "grad_norm": 0.2766439695892797, "learning_rate": 1.4539778982478061e-05, "loss": 0.5305938720703125, "step": 573 }, { "epoch": 1.8943894389438944, "grad_norm": 0.35617765802983586, "learning_rate": 1.4465925879113663e-05, "loss": 0.562718391418457, "step": 574 }, { "epoch": 1.8976897689768977, "grad_norm": 0.26373250902859363, "learning_rate": 1.4392154403452294e-05, "loss": 0.541257381439209, "step": 575 }, { "epoch": 1.900990099009901, "grad_norm": 0.2584596806712207, "learning_rate": 1.4318465643624696e-05, "loss": 0.556663990020752, "step": 576 }, { "epoch": 1.9042904290429044, "grad_norm": 0.2655751613308258, "learning_rate": 1.4244860686541522e-05, "loss": 0.5691581964492798, "step": 577 }, { "epoch": 1.9075907590759076, "grad_norm": 0.3146864569567829, "learning_rate": 1.4171340617877349e-05, "loss": 0.513170063495636, "step": 578 }, { "epoch": 1.9108910891089108, "grad_norm": 0.288458498752148, "learning_rate": 1.4097906522054656e-05, "loss": 0.5679588317871094, "step": 579 }, { "epoch": 1.9141914191419143, "grad_norm": 0.2858005511149637, "learning_rate": 1.4024559482227818e-05, "loss": 0.513796329498291, "step": 580 }, { "epoch": 1.9174917491749175, "grad_norm": 0.25543101337641916, "learning_rate": 1.3951300580267108e-05, "loss": 0.4618416428565979, "step": 581 }, { "epoch": 1.9207920792079207, "grad_norm": 0.2670194314216259, "learning_rate": 1.3878130896742796e-05, "loss": 0.5491312742233276, "step": 582 }, { "epoch": 1.9240924092409242, "grad_norm": 0.24204031552297342, "learning_rate": 1.3805051510909164e-05, "loss": 0.5524745583534241, "step": 583 }, { "epoch": 1.9273927392739274, "grad_norm": 0.25091865473771396, "learning_rate": 1.3732063500688604e-05, "loss": 0.5232075452804565, "step": 584 }, { "epoch": 1.9306930693069306, "grad_norm": 0.26059464209400784, "learning_rate": 1.3659167942655702e-05, "loss": 0.5257346034049988, "step": 585 }, { "epoch": 1.933993399339934, "grad_norm": 0.2814401591736557, "learning_rate": 1.35863659120214e-05, "loss": 0.5196455717086792, "step": 586 }, { "epoch": 1.9372937293729373, "grad_norm": 0.2624714306516865, "learning_rate": 1.3513658482617085e-05, "loss": 0.5122568011283875, "step": 587 }, { "epoch": 1.9405940594059405, "grad_norm": 0.2644911414307543, "learning_rate": 1.3441046726878786e-05, "loss": 0.5236790180206299, "step": 588 }, { "epoch": 1.943894389438944, "grad_norm": 0.2699458396883844, "learning_rate": 1.3368531715831337e-05, "loss": 0.5508555173873901, "step": 589 }, { "epoch": 1.9471947194719472, "grad_norm": 0.26005129022694123, "learning_rate": 1.3296114519072594e-05, "loss": 0.4742932617664337, "step": 590 }, { "epoch": 1.9504950495049505, "grad_norm": 0.2530711129220065, "learning_rate": 1.3223796204757638e-05, "loss": 0.5406354665756226, "step": 591 }, { "epoch": 1.953795379537954, "grad_norm": 0.26847075280504556, "learning_rate": 1.3151577839583043e-05, "loss": 0.508262038230896, "step": 592 }, { "epoch": 1.9570957095709571, "grad_norm": 0.2601716190776577, "learning_rate": 1.3079460488771136e-05, "loss": 0.5260204672813416, "step": 593 }, { "epoch": 1.9603960396039604, "grad_norm": 0.2597900374740898, "learning_rate": 1.3007445216054257e-05, "loss": 0.522408127784729, "step": 594 }, { "epoch": 1.9636963696369638, "grad_norm": 0.23858694591096777, "learning_rate": 1.2935533083659114e-05, "loss": 0.4849371910095215, "step": 595 }, { "epoch": 1.966996699669967, "grad_norm": 0.26399518807159883, "learning_rate": 1.2863725152291091e-05, "loss": 0.5319019556045532, "step": 596 }, { "epoch": 1.9702970297029703, "grad_norm": 0.2797422170192374, "learning_rate": 1.2792022481118587e-05, "loss": 0.5562412738800049, "step": 597 }, { "epoch": 1.9735973597359737, "grad_norm": 0.2537907416959109, "learning_rate": 1.2720426127757431e-05, "loss": 0.49608999490737915, "step": 598 }, { "epoch": 1.976897689768977, "grad_norm": 0.2521690484869479, "learning_rate": 1.2648937148255253e-05, "loss": 0.5082768201828003, "step": 599 }, { "epoch": 1.9801980198019802, "grad_norm": 0.2572245668654862, "learning_rate": 1.2577556597075933e-05, "loss": 0.5706614255905151, "step": 600 }, { "epoch": 1.9834983498349836, "grad_norm": 0.2697883750179181, "learning_rate": 1.2506285527083991e-05, "loss": 0.5366507768630981, "step": 601 }, { "epoch": 1.9867986798679866, "grad_norm": 0.26402819852563175, "learning_rate": 1.2435124989529139e-05, "loss": 0.5462816953659058, "step": 602 }, { "epoch": 1.99009900990099, "grad_norm": 0.246894878071046, "learning_rate": 1.236407603403072e-05, "loss": 0.5050650238990784, "step": 603 }, { "epoch": 1.9933993399339935, "grad_norm": 0.477370357077484, "learning_rate": 1.2293139708562221e-05, "loss": 0.4915675222873688, "step": 604 }, { "epoch": 1.9966996699669965, "grad_norm": 0.2657795870076786, "learning_rate": 1.2222317059435863e-05, "loss": 0.5807889103889465, "step": 605 }, { "epoch": 2.0, "grad_norm": 0.2770967943671612, "learning_rate": 1.2151609131287124e-05, "loss": 0.49173152446746826, "step": 606 }, { "epoch": 2.0033003300330035, "grad_norm": 0.7014931959992592, "learning_rate": 1.2081016967059336e-05, "loss": 0.4426806569099426, "step": 607 }, { "epoch": 2.0066006600660065, "grad_norm": 0.3040348249510974, "learning_rate": 1.201054160798833e-05, "loss": 0.45669305324554443, "step": 608 }, { "epoch": 2.00990099009901, "grad_norm": 0.31030490189011145, "learning_rate": 1.1940184093587047e-05, "loss": 0.4638911783695221, "step": 609 }, { "epoch": 2.0132013201320134, "grad_norm": 0.36234285165121427, "learning_rate": 1.186994546163023e-05, "loss": 0.4541138708591461, "step": 610 }, { "epoch": 2.0165016501650164, "grad_norm": 0.38564024677228226, "learning_rate": 1.1799826748139079e-05, "loss": 0.49081191420555115, "step": 611 }, { "epoch": 2.01980198019802, "grad_norm": 0.3266656962672454, "learning_rate": 1.1729828987366009e-05, "loss": 0.4794033169746399, "step": 612 }, { "epoch": 2.0231023102310233, "grad_norm": 0.291304204290645, "learning_rate": 1.165995321177939e-05, "loss": 0.4142993688583374, "step": 613 }, { "epoch": 2.0264026402640263, "grad_norm": 0.33294658416576944, "learning_rate": 1.159020045204829e-05, "loss": 0.47322210669517517, "step": 614 }, { "epoch": 2.0297029702970297, "grad_norm": 0.3539618583487969, "learning_rate": 1.15205717370273e-05, "loss": 0.4899124503135681, "step": 615 }, { "epoch": 2.033003300330033, "grad_norm": 0.2952110750729378, "learning_rate": 1.1451068093741355e-05, "loss": 0.4857853055000305, "step": 616 }, { "epoch": 2.036303630363036, "grad_norm": 0.28290377247578213, "learning_rate": 1.1381690547370559e-05, "loss": 0.4790021479129791, "step": 617 }, { "epoch": 2.0396039603960396, "grad_norm": 0.2902876717109542, "learning_rate": 1.13124401212351e-05, "loss": 0.4519282281398773, "step": 618 }, { "epoch": 2.042904290429043, "grad_norm": 0.32584221310071065, "learning_rate": 1.1243317836780138e-05, "loss": 0.4738570749759674, "step": 619 }, { "epoch": 2.046204620462046, "grad_norm": 0.3093985088780693, "learning_rate": 1.1174324713560751e-05, "loss": 0.5111795663833618, "step": 620 }, { "epoch": 2.0495049504950495, "grad_norm": 0.2707360386310654, "learning_rate": 1.1105461769226858e-05, "loss": 0.4750926196575165, "step": 621 }, { "epoch": 2.052805280528053, "grad_norm": 0.3107814822051771, "learning_rate": 1.1036730019508259e-05, "loss": 0.4580341577529907, "step": 622 }, { "epoch": 2.056105610561056, "grad_norm": 0.28803288143665157, "learning_rate": 1.0968130478199635e-05, "loss": 0.43322116136550903, "step": 623 }, { "epoch": 2.0594059405940595, "grad_norm": 0.2810686637672446, "learning_rate": 1.0899664157145562e-05, "loss": 0.5015532374382019, "step": 624 }, { "epoch": 2.062706270627063, "grad_norm": 0.28464578766110366, "learning_rate": 1.0831332066225645e-05, "loss": 0.4508541226387024, "step": 625 }, { "epoch": 2.066006600660066, "grad_norm": 0.2904901154874499, "learning_rate": 1.0763135213339589e-05, "loss": 0.49554720520973206, "step": 626 }, { "epoch": 2.0693069306930694, "grad_norm": 0.27820378239401394, "learning_rate": 1.0695074604392305e-05, "loss": 0.4523652493953705, "step": 627 }, { "epoch": 2.072607260726073, "grad_norm": 0.2794675014886217, "learning_rate": 1.0627151243279136e-05, "loss": 0.44413498044013977, "step": 628 }, { "epoch": 2.075907590759076, "grad_norm": 0.30159300158430347, "learning_rate": 1.055936613187101e-05, "loss": 0.4645534157752991, "step": 629 }, { "epoch": 2.0792079207920793, "grad_norm": 0.26698861915138783, "learning_rate": 1.0491720269999663e-05, "loss": 0.44823265075683594, "step": 630 }, { "epoch": 2.0825082508250823, "grad_norm": 0.2813791646704669, "learning_rate": 1.0424214655442891e-05, "loss": 0.45181727409362793, "step": 631 }, { "epoch": 2.0858085808580857, "grad_norm": 0.28721240697359884, "learning_rate": 1.0356850283909852e-05, "loss": 0.5371145009994507, "step": 632 }, { "epoch": 2.089108910891089, "grad_norm": 0.26030729348418064, "learning_rate": 1.0289628149026369e-05, "loss": 0.4564274847507477, "step": 633 }, { "epoch": 2.092409240924092, "grad_norm": 0.3008427259435641, "learning_rate": 1.0222549242320254e-05, "loss": 0.4490276873111725, "step": 634 }, { "epoch": 2.0957095709570956, "grad_norm": 0.27241405218961473, "learning_rate": 1.0155614553206715e-05, "loss": 0.4663650095462799, "step": 635 }, { "epoch": 2.099009900990099, "grad_norm": 0.2814271376941218, "learning_rate": 1.0088825068973746e-05, "loss": 0.46265488862991333, "step": 636 }, { "epoch": 2.102310231023102, "grad_norm": 0.27083223857822414, "learning_rate": 1.002218177476756e-05, "loss": 0.45717963576316833, "step": 637 }, { "epoch": 2.1056105610561056, "grad_norm": 0.27321625989679976, "learning_rate": 9.955685653578068e-06, "loss": 0.47119495272636414, "step": 638 }, { "epoch": 2.108910891089109, "grad_norm": 0.2756031623165562, "learning_rate": 9.88933768622439e-06, "loss": 0.46565738320350647, "step": 639 }, { "epoch": 2.112211221122112, "grad_norm": 0.26745369116167694, "learning_rate": 9.823138851340337e-06, "loss": 0.45610398054122925, "step": 640 }, { "epoch": 2.1155115511551155, "grad_norm": 0.2722722292829376, "learning_rate": 9.75709012536005e-06, "loss": 0.4907280206680298, "step": 641 }, { "epoch": 2.118811881188119, "grad_norm": 0.3111977337695957, "learning_rate": 9.691192482503546e-06, "loss": 0.500091552734375, "step": 642 }, { "epoch": 2.122112211221122, "grad_norm": 0.2648612882642695, "learning_rate": 9.625446894762371e-06, "loss": 0.4330231547355652, "step": 643 }, { "epoch": 2.1254125412541254, "grad_norm": 0.2809597353379975, "learning_rate": 9.559854331885233e-06, "loss": 0.4750261902809143, "step": 644 }, { "epoch": 2.128712871287129, "grad_norm": 0.28201431758911444, "learning_rate": 9.49441576136374e-06, "loss": 0.4567373991012573, "step": 645 }, { "epoch": 2.132013201320132, "grad_norm": 0.2901654659031683, "learning_rate": 9.429132148418116e-06, "loss": 0.4601932168006897, "step": 646 }, { "epoch": 2.1353135313531353, "grad_norm": 0.2792782648133288, "learning_rate": 9.364004455982931e-06, "loss": 0.4909035265445709, "step": 647 }, { "epoch": 2.1386138613861387, "grad_norm": 0.2531215125004539, "learning_rate": 9.299033644692948e-06, "loss": 0.4443170428276062, "step": 648 }, { "epoch": 2.1419141914191417, "grad_norm": 0.2676386529649011, "learning_rate": 9.234220672868928e-06, "loss": 0.46534985303878784, "step": 649 }, { "epoch": 2.145214521452145, "grad_norm": 0.2667778492620529, "learning_rate": 9.169566496503476e-06, "loss": 0.4351472854614258, "step": 650 }, { "epoch": 2.1485148514851486, "grad_norm": 0.26819623679400084, "learning_rate": 9.105072069246983e-06, "loss": 0.41445475816726685, "step": 651 }, { "epoch": 2.1518151815181517, "grad_norm": 0.2627848025641513, "learning_rate": 9.040738342393532e-06, "loss": 0.475847989320755, "step": 652 }, { "epoch": 2.155115511551155, "grad_norm": 0.26883146792086515, "learning_rate": 8.976566264866876e-06, "loss": 0.48487618565559387, "step": 653 }, { "epoch": 2.1584158415841586, "grad_norm": 0.2373773636564882, "learning_rate": 8.912556783206414e-06, "loss": 0.4661785364151001, "step": 654 }, { "epoch": 2.1617161716171616, "grad_norm": 0.25939800378632233, "learning_rate": 8.84871084155328e-06, "loss": 0.48009538650512695, "step": 655 }, { "epoch": 2.165016501650165, "grad_norm": 0.26858346089342566, "learning_rate": 8.785029381636387e-06, "loss": 0.45644935965538025, "step": 656 }, { "epoch": 2.1683168316831685, "grad_norm": 0.25509808532967904, "learning_rate": 8.721513342758516e-06, "loss": 0.4896699786186218, "step": 657 }, { "epoch": 2.1716171617161715, "grad_norm": 0.2678040151014407, "learning_rate": 8.658163661782507e-06, "loss": 0.4286258816719055, "step": 658 }, { "epoch": 2.174917491749175, "grad_norm": 0.25541690613787077, "learning_rate": 8.59498127311742e-06, "loss": 0.42029869556427, "step": 659 }, { "epoch": 2.1782178217821784, "grad_norm": 0.2748486648157056, "learning_rate": 8.531967108704722e-06, "loss": 0.48522356152534485, "step": 660 }, { "epoch": 2.1815181518151814, "grad_norm": 0.37918495336042346, "learning_rate": 8.4691220980046e-06, "loss": 0.461814284324646, "step": 661 }, { "epoch": 2.184818481848185, "grad_norm": 0.2581277433441387, "learning_rate": 8.406447167982205e-06, "loss": 0.49913299083709717, "step": 662 }, { "epoch": 2.1881188118811883, "grad_norm": 0.2804949954645611, "learning_rate": 8.343943243094008e-06, "loss": 0.4936009645462036, "step": 663 }, { "epoch": 2.1914191419141913, "grad_norm": 0.2621319196989517, "learning_rate": 8.281611245274123e-06, "loss": 0.44817712903022766, "step": 664 }, { "epoch": 2.1947194719471947, "grad_norm": 0.26441078845804705, "learning_rate": 8.219452093920763e-06, "loss": 0.482817143201828, "step": 665 }, { "epoch": 2.198019801980198, "grad_norm": 0.25954690482303255, "learning_rate": 8.157466705882645e-06, "loss": 0.4643383026123047, "step": 666 }, { "epoch": 2.201320132013201, "grad_norm": 0.26531559844936237, "learning_rate": 8.095655995445472e-06, "loss": 0.4797602593898773, "step": 667 }, { "epoch": 2.2046204620462047, "grad_norm": 0.26505896756203806, "learning_rate": 8.03402087431844e-06, "loss": 0.44109994173049927, "step": 668 }, { "epoch": 2.207920792079208, "grad_norm": 0.24679836702691405, "learning_rate": 7.972562251620817e-06, "loss": 0.46359869837760925, "step": 669 }, { "epoch": 2.211221122112211, "grad_norm": 0.23925371744802634, "learning_rate": 7.9112810338685e-06, "loss": 0.4576035141944885, "step": 670 }, { "epoch": 2.2145214521452146, "grad_norm": 0.2854541383231889, "learning_rate": 7.850178124960678e-06, "loss": 0.40902045369148254, "step": 671 }, { "epoch": 2.217821782178218, "grad_norm": 0.2726752140080075, "learning_rate": 7.789254426166454e-06, "loss": 0.45797932147979736, "step": 672 }, { "epoch": 2.221122112211221, "grad_norm": 0.2463208855251595, "learning_rate": 7.728510836111602e-06, "loss": 0.43204474449157715, "step": 673 }, { "epoch": 2.2244224422442245, "grad_norm": 0.2632084235311744, "learning_rate": 7.667948250765278e-06, "loss": 0.46007901430130005, "step": 674 }, { "epoch": 2.227722772277228, "grad_norm": 0.2508043419515415, "learning_rate": 7.607567563426823e-06, "loss": 0.46342402696609497, "step": 675 }, { "epoch": 2.231023102310231, "grad_norm": 0.25728063807342477, "learning_rate": 7.5473696647125605e-06, "loss": 0.48953354358673096, "step": 676 }, { "epoch": 2.2343234323432344, "grad_norm": 0.2667124077929822, "learning_rate": 7.487355442542696e-06, "loss": 0.5022163391113281, "step": 677 }, { "epoch": 2.237623762376238, "grad_norm": 0.2666199657154719, "learning_rate": 7.4275257821281995e-06, "loss": 0.5144001245498657, "step": 678 }, { "epoch": 2.240924092409241, "grad_norm": 0.2598091753134079, "learning_rate": 7.3678815659577505e-06, "loss": 0.489937961101532, "step": 679 }, { "epoch": 2.2442244224422443, "grad_norm": 0.25000738365352393, "learning_rate": 7.3084236737847125e-06, "loss": 0.48842746019363403, "step": 680 }, { "epoch": 2.2475247524752477, "grad_norm": 0.2672754249714767, "learning_rate": 7.249152982614176e-06, "loss": 0.5024458765983582, "step": 681 }, { "epoch": 2.2508250825082508, "grad_norm": 0.25558161311007577, "learning_rate": 7.190070366690014e-06, "loss": 0.46162086725234985, "step": 682 }, { "epoch": 2.254125412541254, "grad_norm": 0.24807827286497117, "learning_rate": 7.13117669748199e-06, "loss": 0.44991785287857056, "step": 683 }, { "epoch": 2.2574257425742577, "grad_norm": 0.24635539567650763, "learning_rate": 7.072472843672877e-06, "loss": 0.43738633394241333, "step": 684 }, { "epoch": 2.2607260726072607, "grad_norm": 0.25605350464823584, "learning_rate": 7.013959671145691e-06, "loss": 0.46122169494628906, "step": 685 }, { "epoch": 2.264026402640264, "grad_norm": 0.24205320356251103, "learning_rate": 6.955638042970896e-06, "loss": 0.4504377841949463, "step": 686 }, { "epoch": 2.2673267326732676, "grad_norm": 0.2570116198268661, "learning_rate": 6.897508819393645e-06, "loss": 0.4620972275733948, "step": 687 }, { "epoch": 2.2706270627062706, "grad_norm": 0.2629731642768507, "learning_rate": 6.8395728578211525e-06, "loss": 0.5271490216255188, "step": 688 }, { "epoch": 2.273927392739274, "grad_norm": 1.9898738742816064, "learning_rate": 6.781831012810001e-06, "loss": 0.4448450803756714, "step": 689 }, { "epoch": 2.2772277227722775, "grad_norm": 0.3213733503923664, "learning_rate": 6.72428413605354e-06, "loss": 0.4602925181388855, "step": 690 }, { "epoch": 2.2805280528052805, "grad_norm": 0.26788259096559774, "learning_rate": 6.6669330763693485e-06, "loss": 0.4722862243652344, "step": 691 }, { "epoch": 2.283828382838284, "grad_norm": 0.25272077157298134, "learning_rate": 6.609778679686694e-06, "loss": 0.47454553842544556, "step": 692 }, { "epoch": 2.287128712871287, "grad_norm": 0.24015565864939845, "learning_rate": 6.552821789034067e-06, "loss": 0.4750802516937256, "step": 693 }, { "epoch": 2.2904290429042904, "grad_norm": 0.2559036200154721, "learning_rate": 6.496063244526723e-06, "loss": 0.4640570282936096, "step": 694 }, { "epoch": 2.293729372937294, "grad_norm": 0.25061879602537984, "learning_rate": 6.439503883354323e-06, "loss": 0.47181540727615356, "step": 695 }, { "epoch": 2.297029702970297, "grad_norm": 0.24588968301020392, "learning_rate": 6.3831445397685755e-06, "loss": 0.4335097372531891, "step": 696 }, { "epoch": 2.3003300330033003, "grad_norm": 0.26057507812572134, "learning_rate": 6.3269860450709016e-06, "loss": 0.5158364772796631, "step": 697 }, { "epoch": 2.3036303630363038, "grad_norm": 0.24767301357183136, "learning_rate": 6.271029227600216e-06, "loss": 0.497075617313385, "step": 698 }, { "epoch": 2.3069306930693068, "grad_norm": 0.2612680212099097, "learning_rate": 6.215274912720697e-06, "loss": 0.4946526288986206, "step": 699 }, { "epoch": 2.31023102310231, "grad_norm": 0.25694731286364175, "learning_rate": 6.159723922809577e-06, "loss": 0.4632418155670166, "step": 700 }, { "epoch": 2.3135313531353137, "grad_norm": 0.26826842519558464, "learning_rate": 6.10437707724507e-06, "loss": 0.4936927258968353, "step": 701 }, { "epoch": 2.3168316831683167, "grad_norm": 0.3039451981089408, "learning_rate": 6.049235192394242e-06, "loss": 0.4373137056827545, "step": 702 }, { "epoch": 2.32013201320132, "grad_norm": 0.2502753739217944, "learning_rate": 5.994299081600996e-06, "loss": 0.49224400520324707, "step": 703 }, { "epoch": 2.3234323432343236, "grad_norm": 0.25232784831466315, "learning_rate": 5.939569555174045e-06, "loss": 0.453000545501709, "step": 704 }, { "epoch": 2.3267326732673266, "grad_norm": 0.2443845287083898, "learning_rate": 5.885047420374992e-06, "loss": 0.4201410114765167, "step": 705 }, { "epoch": 2.33003300330033, "grad_norm": 0.2757856931959748, "learning_rate": 5.830733481406415e-06, "loss": 0.4817071557044983, "step": 706 }, { "epoch": 2.3333333333333335, "grad_norm": 0.23548633980687703, "learning_rate": 5.776628539399975e-06, "loss": 0.42609190940856934, "step": 707 }, { "epoch": 2.3366336633663365, "grad_norm": 0.2484780532867763, "learning_rate": 5.722733392404652e-06, "loss": 0.46225881576538086, "step": 708 }, { "epoch": 2.33993399339934, "grad_norm": 0.28677279656296756, "learning_rate": 5.669048835374933e-06, "loss": 0.49061962962150574, "step": 709 }, { "epoch": 2.3432343234323434, "grad_norm": 0.25600200089074804, "learning_rate": 5.615575660159089e-06, "loss": 0.4506024122238159, "step": 710 }, { "epoch": 2.3465346534653464, "grad_norm": 0.23921559671813297, "learning_rate": 5.562314655487522e-06, "loss": 0.4433022141456604, "step": 711 }, { "epoch": 2.34983498349835, "grad_norm": 0.26708565402858225, "learning_rate": 5.5092666069611055e-06, "loss": 0.45988917350769043, "step": 712 }, { "epoch": 2.3531353135313533, "grad_norm": 0.2294068192725238, "learning_rate": 5.4564322970396154e-06, "loss": 0.44675180315971375, "step": 713 }, { "epoch": 2.3564356435643563, "grad_norm": 0.2431380886271115, "learning_rate": 5.403812505030157e-06, "loss": 0.46991807222366333, "step": 714 }, { "epoch": 2.3597359735973598, "grad_norm": 0.2412850801003648, "learning_rate": 5.351408007075714e-06, "loss": 0.49208664894104004, "step": 715 }, { "epoch": 2.363036303630363, "grad_norm": 2.760535806072788, "learning_rate": 5.299219576143673e-06, "loss": 0.48280128836631775, "step": 716 }, { "epoch": 2.366336633663366, "grad_norm": 0.24609236023763137, "learning_rate": 5.247247982014414e-06, "loss": 0.4491961896419525, "step": 717 }, { "epoch": 2.3696369636963697, "grad_norm": 0.24672380739006747, "learning_rate": 5.195493991269991e-06, "loss": 0.4943190813064575, "step": 718 }, { "epoch": 2.372937293729373, "grad_norm": 0.27378763646010795, "learning_rate": 5.143958367282795e-06, "loss": 0.4586840867996216, "step": 719 }, { "epoch": 2.376237623762376, "grad_norm": 0.2422334792581867, "learning_rate": 5.0926418702042914e-06, "loss": 0.46227943897247314, "step": 720 }, { "epoch": 2.3795379537953796, "grad_norm": 0.23796137337817433, "learning_rate": 5.041545256953839e-06, "loss": 0.45386868715286255, "step": 721 }, { "epoch": 2.382838283828383, "grad_norm": 0.24415832537414764, "learning_rate": 4.990669281207492e-06, "loss": 0.5026980042457581, "step": 722 }, { "epoch": 2.386138613861386, "grad_norm": 0.247792875546048, "learning_rate": 4.940014693386909e-06, "loss": 0.4834757447242737, "step": 723 }, { "epoch": 2.3894389438943895, "grad_norm": 0.43027345510854853, "learning_rate": 4.889582240648254e-06, "loss": 0.44382545351982117, "step": 724 }, { "epoch": 2.3927392739273925, "grad_norm": 0.2519737312346543, "learning_rate": 4.839372666871212e-06, "loss": 0.45313894748687744, "step": 725 }, { "epoch": 2.396039603960396, "grad_norm": 0.23932824454201898, "learning_rate": 4.789386712647994e-06, "loss": 0.4597586393356323, "step": 726 }, { "epoch": 2.3993399339933994, "grad_norm": 0.23075224453442636, "learning_rate": 4.739625115272408e-06, "loss": 0.4427994191646576, "step": 727 }, { "epoch": 2.4026402640264024, "grad_norm": 0.24450312969705348, "learning_rate": 4.690088608729007e-06, "loss": 0.4459637403488159, "step": 728 }, { "epoch": 2.405940594059406, "grad_norm": 0.2516039358654293, "learning_rate": 4.640777923682247e-06, "loss": 0.5043150186538696, "step": 729 }, { "epoch": 2.4092409240924093, "grad_norm": 0.26743057517217783, "learning_rate": 4.5916937874657055e-06, "loss": 0.4942860007286072, "step": 730 }, { "epoch": 2.4125412541254123, "grad_norm": 0.25489023032736696, "learning_rate": 4.5428369240713655e-06, "loss": 0.4572402834892273, "step": 731 }, { "epoch": 2.4158415841584158, "grad_norm": 0.24954926782274506, "learning_rate": 4.494208054138934e-06, "loss": 0.44927412271499634, "step": 732 }, { "epoch": 2.419141914191419, "grad_norm": 0.24684795220524788, "learning_rate": 4.445807894945211e-06, "loss": 0.461928129196167, "step": 733 }, { "epoch": 2.4224422442244222, "grad_norm": 0.2375757440633774, "learning_rate": 4.397637160393493e-06, "loss": 0.46279191970825195, "step": 734 }, { "epoch": 2.4257425742574257, "grad_norm": 0.24407488686385456, "learning_rate": 4.349696561003076e-06, "loss": 0.48653045296669006, "step": 735 }, { "epoch": 2.429042904290429, "grad_norm": 0.2443771510662661, "learning_rate": 4.301986803898752e-06, "loss": 0.4587661027908325, "step": 736 }, { "epoch": 2.432343234323432, "grad_norm": 0.25142970699984885, "learning_rate": 4.2545085928003906e-06, "loss": 0.4946083426475525, "step": 737 }, { "epoch": 2.4356435643564356, "grad_norm": 0.2446760243354809, "learning_rate": 4.207262628012534e-06, "loss": 0.4614926278591156, "step": 738 }, { "epoch": 2.438943894389439, "grad_norm": 0.24323846273380414, "learning_rate": 4.160249606414109e-06, "loss": 0.46377992630004883, "step": 739 }, { "epoch": 2.442244224422442, "grad_norm": 0.2554844227936452, "learning_rate": 4.1134702214481126e-06, "loss": 0.4217844009399414, "step": 740 }, { "epoch": 2.4455445544554455, "grad_norm": 0.40365970056175393, "learning_rate": 4.066925163111406e-06, "loss": 0.4616321325302124, "step": 741 }, { "epoch": 2.448844884488449, "grad_norm": 0.23727547629912737, "learning_rate": 4.020615117944515e-06, "loss": 0.48755043745040894, "step": 742 }, { "epoch": 2.452145214521452, "grad_norm": 0.2636488971277773, "learning_rate": 3.974540769021529e-06, "loss": 0.47338151931762695, "step": 743 }, { "epoch": 2.4554455445544554, "grad_norm": 0.26687939105998304, "learning_rate": 3.928702795940007e-06, "loss": 0.47220849990844727, "step": 744 }, { "epoch": 2.458745874587459, "grad_norm": 0.23440870124340746, "learning_rate": 3.883101874810966e-06, "loss": 0.4117845296859741, "step": 745 }, { "epoch": 2.462046204620462, "grad_norm": 0.2389531188545627, "learning_rate": 3.8377386782488875e-06, "loss": 0.44338276982307434, "step": 746 }, { "epoch": 2.4653465346534653, "grad_norm": 0.28253943840492757, "learning_rate": 3.7926138753618257e-06, "loss": 0.470272958278656, "step": 747 }, { "epoch": 2.4686468646864688, "grad_norm": 0.2533414456878978, "learning_rate": 3.747728131741517e-06, "loss": 0.4825139045715332, "step": 748 }, { "epoch": 2.4719471947194718, "grad_norm": 0.22813621303002277, "learning_rate": 3.703082109453575e-06, "loss": 0.43612140417099, "step": 749 }, { "epoch": 2.4752475247524752, "grad_norm": 0.22709733679425215, "learning_rate": 3.6586764670277065e-06, "loss": 0.4573146402835846, "step": 750 }, { "epoch": 2.4785478547854787, "grad_norm": 0.24807030489347143, "learning_rate": 3.61451185944802e-06, "loss": 0.4419093430042267, "step": 751 }, { "epoch": 2.4818481848184817, "grad_norm": 0.23735191741997233, "learning_rate": 3.570588938143353e-06, "loss": 0.440906822681427, "step": 752 }, { "epoch": 2.485148514851485, "grad_norm": 0.24792760735437452, "learning_rate": 3.5269083509776735e-06, "loss": 0.432383269071579, "step": 753 }, { "epoch": 2.4884488448844886, "grad_norm": 0.24788857238042053, "learning_rate": 3.4834707422404957e-06, "loss": 0.4615401029586792, "step": 754 }, { "epoch": 2.4917491749174916, "grad_norm": 0.29288725170403773, "learning_rate": 3.440276752637417e-06, "loss": 0.43933019042015076, "step": 755 }, { "epoch": 2.495049504950495, "grad_norm": 0.24422605775888084, "learning_rate": 3.3973270192806427e-06, "loss": 0.4651945233345032, "step": 756 }, { "epoch": 2.4983498349834985, "grad_norm": 0.3408455968625333, "learning_rate": 3.3546221756795874e-06, "loss": 0.4423069953918457, "step": 757 }, { "epoch": 2.5016501650165015, "grad_norm": 0.32517130275625505, "learning_rate": 3.3121628517315373e-06, "loss": 0.4905679225921631, "step": 758 }, { "epoch": 2.504950495049505, "grad_norm": 0.24015956320352147, "learning_rate": 3.2699496737123758e-06, "loss": 0.46989548206329346, "step": 759 }, { "epoch": 2.5082508250825084, "grad_norm": 0.24393784259324253, "learning_rate": 3.2279832642673025e-06, "loss": 0.5168344378471375, "step": 760 }, { "epoch": 2.5115511551155114, "grad_norm": 0.2446798962745333, "learning_rate": 3.186264242401693e-06, "loss": 0.46055924892425537, "step": 761 }, { "epoch": 2.514851485148515, "grad_norm": 0.2561165095643357, "learning_rate": 3.144793223471949e-06, "loss": 0.5135318040847778, "step": 762 }, { "epoch": 2.5181518151815183, "grad_norm": 1.1234233736547772, "learning_rate": 3.1035708191764246e-06, "loss": 0.5026534199714661, "step": 763 }, { "epoch": 2.5214521452145213, "grad_norm": 0.23866674349332329, "learning_rate": 3.0625976375463938e-06, "loss": 0.43348389863967896, "step": 764 }, { "epoch": 2.5247524752475248, "grad_norm": 0.2295043927466033, "learning_rate": 3.021874282937103e-06, "loss": 0.4620594382286072, "step": 765 }, { "epoch": 2.5280528052805282, "grad_norm": 0.25250691113798673, "learning_rate": 2.9814013560188425e-06, "loss": 0.4646865725517273, "step": 766 }, { "epoch": 2.5313531353135312, "grad_norm": 0.2396511266141401, "learning_rate": 2.9411794537680795e-06, "loss": 0.46846333146095276, "step": 767 }, { "epoch": 2.5346534653465347, "grad_norm": 0.24818691561244743, "learning_rate": 2.901209169458672e-06, "loss": 0.487953782081604, "step": 768 }, { "epoch": 2.537953795379538, "grad_norm": 0.24296952409375147, "learning_rate": 2.861491092653115e-06, "loss": 0.4543481469154358, "step": 769 }, { "epoch": 2.541254125412541, "grad_norm": 0.24368208278529027, "learning_rate": 2.822025809193818e-06, "loss": 0.4961584806442261, "step": 770 }, { "epoch": 2.5445544554455446, "grad_norm": 0.2377375055697493, "learning_rate": 2.7828139011944967e-06, "loss": 0.44123750925064087, "step": 771 }, { "epoch": 2.547854785478548, "grad_norm": 0.2301227484744363, "learning_rate": 2.743855947031575e-06, "loss": 0.43014320731163025, "step": 772 }, { "epoch": 2.551155115511551, "grad_norm": 0.2250422650499226, "learning_rate": 2.7051525213356546e-06, "loss": 0.4774499535560608, "step": 773 }, { "epoch": 2.5544554455445545, "grad_norm": 0.23823454905644054, "learning_rate": 2.6667041949830186e-06, "loss": 0.44963133335113525, "step": 774 }, { "epoch": 2.557755775577558, "grad_norm": 0.2554981481850554, "learning_rate": 2.6285115350872524e-06, "loss": 0.4840245842933655, "step": 775 }, { "epoch": 2.561056105610561, "grad_norm": 0.2589754738757413, "learning_rate": 2.5905751049908466e-06, "loss": 0.5490096807479858, "step": 776 }, { "epoch": 2.5643564356435644, "grad_norm": 0.30754095371590884, "learning_rate": 2.5528954642568947e-06, "loss": 0.4965711832046509, "step": 777 }, { "epoch": 2.567656765676568, "grad_norm": 0.2261872478084121, "learning_rate": 2.5154731686608424e-06, "loss": 0.4518459439277649, "step": 778 }, { "epoch": 2.570957095709571, "grad_norm": 0.24374764034742216, "learning_rate": 2.4783087701823026e-06, "loss": 0.5022287964820862, "step": 779 }, { "epoch": 2.5742574257425743, "grad_norm": 0.2531412256958666, "learning_rate": 2.441402816996876e-06, "loss": 0.47195330262184143, "step": 780 }, { "epoch": 2.5775577557755778, "grad_norm": 0.25588546327446415, "learning_rate": 2.4047558534681124e-06, "loss": 0.5155715346336365, "step": 781 }, { "epoch": 2.580858085808581, "grad_norm": 0.26863032492519423, "learning_rate": 2.3683684201394507e-06, "loss": 0.46963661909103394, "step": 782 }, { "epoch": 2.5841584158415842, "grad_norm": 0.2303264290466175, "learning_rate": 2.3322410537262495e-06, "loss": 0.4279938340187073, "step": 783 }, { "epoch": 2.5874587458745877, "grad_norm": 0.24160002325917174, "learning_rate": 2.296374287107883e-06, "loss": 0.47818487882614136, "step": 784 }, { "epoch": 2.5907590759075907, "grad_norm": 0.23493031875502465, "learning_rate": 2.260768649319869e-06, "loss": 0.4445609152317047, "step": 785 }, { "epoch": 2.594059405940594, "grad_norm": 0.2545526596288379, "learning_rate": 2.2254246655460765e-06, "loss": 0.4838835895061493, "step": 786 }, { "epoch": 2.5973597359735976, "grad_norm": 0.24631479441885146, "learning_rate": 2.1903428571109566e-06, "loss": 0.4454101324081421, "step": 787 }, { "epoch": 2.6006600660066006, "grad_norm": 0.2399303225290425, "learning_rate": 2.1555237414718854e-06, "loss": 0.46468472480773926, "step": 788 }, { "epoch": 2.603960396039604, "grad_norm": 0.24533578787784271, "learning_rate": 2.1209678322115133e-06, "loss": 0.508684515953064, "step": 789 }, { "epoch": 2.6072607260726075, "grad_norm": 0.23699012050293838, "learning_rate": 2.0866756390301778e-06, "loss": 0.46998751163482666, "step": 790 }, { "epoch": 2.6105610561056105, "grad_norm": 0.22442653448303418, "learning_rate": 2.0526476677384123e-06, "loss": 0.41589513421058655, "step": 791 }, { "epoch": 2.613861386138614, "grad_norm": 0.23870429201603713, "learning_rate": 2.018884420249474e-06, "loss": 0.4948643445968628, "step": 792 }, { "epoch": 2.6171617161716174, "grad_norm": 0.23103305184303033, "learning_rate": 1.9853863945719243e-06, "loss": 0.4494874179363251, "step": 793 }, { "epoch": 2.6204620462046204, "grad_norm": 0.23980252076908543, "learning_rate": 1.9521540848023113e-06, "loss": 0.42173343896865845, "step": 794 }, { "epoch": 2.623762376237624, "grad_norm": 0.24254851053091633, "learning_rate": 1.9191879811178605e-06, "loss": 0.4319555461406708, "step": 795 }, { "epoch": 2.6270627062706273, "grad_norm": 0.21769714480169441, "learning_rate": 1.8864885697692582e-06, "loss": 0.40467706322669983, "step": 796 }, { "epoch": 2.6303630363036303, "grad_norm": 0.23815188307796767, "learning_rate": 1.8540563330734662e-06, "loss": 0.5141273736953735, "step": 797 }, { "epoch": 2.633663366336634, "grad_norm": 0.23237959155910853, "learning_rate": 1.8218917494066212e-06, "loss": 0.44990289211273193, "step": 798 }, { "epoch": 2.6369636963696372, "grad_norm": 0.2393948822814923, "learning_rate": 1.7899952931969756e-06, "loss": 0.4878673553466797, "step": 799 }, { "epoch": 2.6402640264026402, "grad_norm": 0.22595932266177446, "learning_rate": 1.7583674349178803e-06, "loss": 0.46406376361846924, "step": 800 }, { "epoch": 2.6435643564356437, "grad_norm": 0.22163499847677615, "learning_rate": 1.7270086410808762e-06, "loss": 0.44470641016960144, "step": 801 }, { "epoch": 2.6468646864686467, "grad_norm": 0.23461158504190754, "learning_rate": 1.695919374228796e-06, "loss": 0.5306479930877686, "step": 802 }, { "epoch": 2.65016501650165, "grad_norm": 0.23844670077139818, "learning_rate": 1.6651000929289462e-06, "loss": 0.4570600390434265, "step": 803 }, { "epoch": 2.6534653465346536, "grad_norm": 0.24202990025785212, "learning_rate": 1.6345512517663275e-06, "loss": 0.48561781644821167, "step": 804 }, { "epoch": 2.6567656765676566, "grad_norm": 0.23785932147050265, "learning_rate": 1.6042733013369604e-06, "loss": 0.4666748642921448, "step": 805 }, { "epoch": 2.66006600660066, "grad_norm": 0.2420529385568233, "learning_rate": 1.5742666882412106e-06, "loss": 0.4761434495449066, "step": 806 }, { "epoch": 2.6633663366336635, "grad_norm": 0.23716960917200494, "learning_rate": 1.5445318550772204e-06, "loss": 0.4475252628326416, "step": 807 }, { "epoch": 2.6666666666666665, "grad_norm": 0.2477540352529907, "learning_rate": 1.5150692404343637e-06, "loss": 0.5299564599990845, "step": 808 }, { "epoch": 2.66996699669967, "grad_norm": 0.23933028255710986, "learning_rate": 1.4858792788867904e-06, "loss": 0.518581748008728, "step": 809 }, { "epoch": 2.6732673267326734, "grad_norm": 0.2332077440459636, "learning_rate": 1.4569624009870165e-06, "loss": 0.5162506103515625, "step": 810 }, { "epoch": 2.6765676567656764, "grad_norm": 0.23396257763770162, "learning_rate": 1.4283190332595665e-06, "loss": 0.4762595593929291, "step": 811 }, { "epoch": 2.67986798679868, "grad_norm": 0.24891326451914347, "learning_rate": 1.3999495981946764e-06, "loss": 0.44347697496414185, "step": 812 }, { "epoch": 2.6831683168316833, "grad_norm": 0.22951918904681498, "learning_rate": 1.3718545142420768e-06, "loss": 0.4344146251678467, "step": 813 }, { "epoch": 2.6864686468646863, "grad_norm": 0.23863686607461265, "learning_rate": 1.344034195804813e-06, "loss": 0.4936307668685913, "step": 814 }, { "epoch": 2.68976897689769, "grad_norm": 0.23758007083024585, "learning_rate": 1.3164890532331386e-06, "loss": 0.43635520339012146, "step": 815 }, { "epoch": 2.693069306930693, "grad_norm": 0.24550816708533926, "learning_rate": 1.2892194928184499e-06, "loss": 0.48006054759025574, "step": 816 }, { "epoch": 2.6963696369636962, "grad_norm": 0.22610358677951214, "learning_rate": 1.2622259167873008e-06, "loss": 0.4296647906303406, "step": 817 }, { "epoch": 2.6996699669966997, "grad_norm": 0.3871947383123805, "learning_rate": 1.2355087232954754e-06, "loss": 0.47840994596481323, "step": 818 }, { "epoch": 2.7029702970297027, "grad_norm": 0.21432181977841594, "learning_rate": 1.209068306422112e-06, "loss": 0.41459953784942627, "step": 819 }, { "epoch": 2.706270627062706, "grad_norm": 0.24313471794627498, "learning_rate": 1.1829050561638766e-06, "loss": 0.4278629422187805, "step": 820 }, { "epoch": 2.7095709570957096, "grad_norm": 0.24379358416226346, "learning_rate": 1.1570193584292323e-06, "loss": 0.44538602232933044, "step": 821 }, { "epoch": 2.7128712871287126, "grad_norm": 0.23094639733408046, "learning_rate": 1.1314115950327365e-06, "loss": 0.4757949709892273, "step": 822 }, { "epoch": 2.716171617161716, "grad_norm": 0.22182336808333136, "learning_rate": 1.106082143689402e-06, "loss": 0.49131542444229126, "step": 823 }, { "epoch": 2.7194719471947195, "grad_norm": 0.2534124798335607, "learning_rate": 1.0810313780091408e-06, "loss": 0.4917967915534973, "step": 824 }, { "epoch": 2.7227722772277225, "grad_norm": 0.23670068032674005, "learning_rate": 1.056259667491244e-06, "loss": 0.4949303865432739, "step": 825 }, { "epoch": 2.726072607260726, "grad_norm": 0.23770304320813665, "learning_rate": 1.0317673775189374e-06, "loss": 0.4287925958633423, "step": 826 }, { "epoch": 2.7293729372937294, "grad_norm": 0.2425418928573913, "learning_rate": 1.007554869353975e-06, "loss": 0.5059949159622192, "step": 827 }, { "epoch": 2.7326732673267324, "grad_norm": 0.25049371554006, "learning_rate": 9.83622500131336e-07, "loss": 0.47914958000183105, "step": 828 }, { "epoch": 2.735973597359736, "grad_norm": 0.24168515794090734, "learning_rate": 9.599706228539452e-07, "loss": 0.5237720608711243, "step": 829 }, { "epoch": 2.7392739273927393, "grad_norm": 0.23836969767457952, "learning_rate": 9.365995863874566e-07, "loss": 0.4628916382789612, "step": 830 }, { "epoch": 2.7425742574257423, "grad_norm": 0.22835633263617844, "learning_rate": 9.135097354551203e-07, "loss": 0.49988898634910583, "step": 831 }, { "epoch": 2.745874587458746, "grad_norm": 0.2229937423966958, "learning_rate": 8.907014106327039e-07, "loss": 0.4631851315498352, "step": 832 }, { "epoch": 2.7491749174917492, "grad_norm": 0.24485133529173167, "learning_rate": 8.681749483434387e-07, "loss": 0.47001713514328003, "step": 833 }, { "epoch": 2.7524752475247523, "grad_norm": 0.23400965677751775, "learning_rate": 8.459306808530999e-07, "loss": 0.4437292218208313, "step": 834 }, { "epoch": 2.7557755775577557, "grad_norm": 0.26632452732629835, "learning_rate": 8.239689362650694e-07, "loss": 0.5006406903266907, "step": 835 }, { "epoch": 2.759075907590759, "grad_norm": 0.23471614589516374, "learning_rate": 8.022900385155185e-07, "loss": 0.45732003450393677, "step": 836 }, { "epoch": 2.762376237623762, "grad_norm": 0.47225644675751677, "learning_rate": 7.808943073686159e-07, "loss": 0.5012909173965454, "step": 837 }, { "epoch": 2.7656765676567656, "grad_norm": 0.25510766784506034, "learning_rate": 7.597820584118221e-07, "loss": 0.5104090571403503, "step": 838 }, { "epoch": 2.768976897689769, "grad_norm": 0.22536004830501363, "learning_rate": 7.38953603051229e-07, "loss": 0.44415900111198425, "step": 839 }, { "epoch": 2.772277227722772, "grad_norm": 0.23868123290562657, "learning_rate": 7.184092485069638e-07, "loss": 0.46958473324775696, "step": 840 }, { "epoch": 2.7755775577557755, "grad_norm": 0.22685199851447227, "learning_rate": 6.981492978086634e-07, "loss": 0.4305083155632019, "step": 841 }, { "epoch": 2.778877887788779, "grad_norm": 0.2363937135429503, "learning_rate": 6.78174049791005e-07, "loss": 0.4812752604484558, "step": 842 }, { "epoch": 2.782178217821782, "grad_norm": 0.23536493344498524, "learning_rate": 6.584837990892889e-07, "loss": 0.522142231464386, "step": 843 }, { "epoch": 2.7854785478547854, "grad_norm": 0.2629089101886439, "learning_rate": 6.390788361351053e-07, "loss": 0.4789726138114929, "step": 844 }, { "epoch": 2.788778877887789, "grad_norm": 0.221963892758326, "learning_rate": 6.199594471520453e-07, "loss": 0.44507476687431335, "step": 845 }, { "epoch": 2.792079207920792, "grad_norm": 0.23452674626378717, "learning_rate": 6.011259141514747e-07, "loss": 0.47613948583602905, "step": 846 }, { "epoch": 2.7953795379537953, "grad_norm": 0.22167932095355114, "learning_rate": 5.825785149283758e-07, "loss": 0.44828763604164124, "step": 847 }, { "epoch": 2.798679867986799, "grad_norm": 0.3027768768548174, "learning_rate": 5.64317523057254e-07, "loss": 0.4695909321308136, "step": 848 }, { "epoch": 2.801980198019802, "grad_norm": 0.2349539472452322, "learning_rate": 5.463432078881093e-07, "loss": 0.48341453075408936, "step": 849 }, { "epoch": 2.8052805280528053, "grad_norm": 0.21333400051209225, "learning_rate": 5.286558345424397e-07, "loss": 0.47008436918258667, "step": 850 }, { "epoch": 2.8085808580858087, "grad_norm": 0.2369125413431687, "learning_rate": 5.112556639093536e-07, "loss": 0.5081039071083069, "step": 851 }, { "epoch": 2.8118811881188117, "grad_norm": 0.23230496066562498, "learning_rate": 4.941429526417163e-07, "loss": 0.49790090322494507, "step": 852 }, { "epoch": 2.815181518151815, "grad_norm": 0.2314377157636827, "learning_rate": 4.773179531523542e-07, "loss": 0.476767897605896, "step": 853 }, { "epoch": 2.8184818481848186, "grad_norm": 0.234974793768271, "learning_rate": 4.6078091361034585e-07, "loss": 0.5067446231842041, "step": 854 }, { "epoch": 2.8217821782178216, "grad_norm": 0.2229121342330284, "learning_rate": 4.4453207793735185e-07, "loss": 0.45703452825546265, "step": 855 }, { "epoch": 2.825082508250825, "grad_norm": 0.25006675020075053, "learning_rate": 4.285716858040223e-07, "loss": 0.4193270206451416, "step": 856 }, { "epoch": 2.8283828382838285, "grad_norm": 0.2214334357956483, "learning_rate": 4.128999726264549e-07, "loss": 0.4367069602012634, "step": 857 }, { "epoch": 2.8316831683168315, "grad_norm": 0.23745672544685706, "learning_rate": 3.9751716956273113e-07, "loss": 0.46601590514183044, "step": 858 }, { "epoch": 2.834983498349835, "grad_norm": 0.23728948504727357, "learning_rate": 3.824235035095036e-07, "loss": 0.4801405072212219, "step": 859 }, { "epoch": 2.8382838283828384, "grad_norm": 0.2305722834125333, "learning_rate": 3.676191970986409e-07, "loss": 0.4729960262775421, "step": 860 }, { "epoch": 2.8415841584158414, "grad_norm": 0.2565962552578653, "learning_rate": 3.531044686939611e-07, "loss": 0.453819215297699, "step": 861 }, { "epoch": 2.844884488448845, "grad_norm": 0.2345568934684747, "learning_rate": 3.388795323879923e-07, "loss": 0.4655516743659973, "step": 862 }, { "epoch": 2.8481848184818483, "grad_norm": 0.2602122051468819, "learning_rate": 3.249445979988286e-07, "loss": 0.4915505647659302, "step": 863 }, { "epoch": 2.8514851485148514, "grad_norm": 0.227534967530927, "learning_rate": 3.112998710670279e-07, "loss": 0.46072205901145935, "step": 864 }, { "epoch": 2.854785478547855, "grad_norm": 0.2372527927247435, "learning_rate": 2.979455528525854e-07, "loss": 0.47496911883354187, "step": 865 }, { "epoch": 2.8580858085808583, "grad_norm": 0.2396587074165527, "learning_rate": 2.8488184033195867e-07, "loss": 0.4863288402557373, "step": 866 }, { "epoch": 2.8613861386138613, "grad_norm": 0.23166629272471134, "learning_rate": 2.721089261951626e-07, "loss": 0.4543803930282593, "step": 867 }, { "epoch": 2.8646864686468647, "grad_norm": 0.2431611152190322, "learning_rate": 2.5962699884293894e-07, "loss": 0.4589266777038574, "step": 868 }, { "epoch": 2.867986798679868, "grad_norm": 0.2225895431580723, "learning_rate": 2.474362423839627e-07, "loss": 0.45603302121162415, "step": 869 }, { "epoch": 2.871287128712871, "grad_norm": 0.2221408751585563, "learning_rate": 2.3553683663213088e-07, "loss": 0.4547184109687805, "step": 870 }, { "epoch": 2.8745874587458746, "grad_norm": 0.24123343867414457, "learning_rate": 2.2392895710391604e-07, "loss": 0.4900602102279663, "step": 871 }, { "epoch": 2.877887788778878, "grad_norm": 0.2412441535157341, "learning_rate": 2.126127750157725e-07, "loss": 0.48706525564193726, "step": 872 }, { "epoch": 2.881188118811881, "grad_norm": 0.24173675884162568, "learning_rate": 2.0158845728160958e-07, "loss": 0.4726618230342865, "step": 873 }, { "epoch": 2.8844884488448845, "grad_norm": 0.25907893004745514, "learning_rate": 1.9085616651033147e-07, "loss": 0.45884019136428833, "step": 874 }, { "epoch": 2.887788778877888, "grad_norm": 0.2641670850826395, "learning_rate": 1.804160610034411e-07, "loss": 0.4787840247154236, "step": 875 }, { "epoch": 2.891089108910891, "grad_norm": 0.24253910042279672, "learning_rate": 1.702682947527001e-07, "loss": 0.4758448004722595, "step": 876 }, { "epoch": 2.8943894389438944, "grad_norm": 0.2279011748861112, "learning_rate": 1.6041301743786596e-07, "loss": 0.47089093923568726, "step": 877 }, { "epoch": 2.897689768976898, "grad_norm": 0.29849498701163135, "learning_rate": 1.5085037442446937e-07, "loss": 0.46921056509017944, "step": 878 }, { "epoch": 2.900990099009901, "grad_norm": 0.2344970489799305, "learning_rate": 1.415805067616871e-07, "loss": 0.5218731164932251, "step": 879 }, { "epoch": 2.9042904290429044, "grad_norm": 0.2254215991599414, "learning_rate": 1.3260355118025036e-07, "loss": 0.43099671602249146, "step": 880 }, { "epoch": 2.907590759075908, "grad_norm": 0.23874830724823604, "learning_rate": 1.2391964009043078e-07, "loss": 0.48290592432022095, "step": 881 }, { "epoch": 2.910891089108911, "grad_norm": 0.23943766068140404, "learning_rate": 1.1552890158009311e-07, "loss": 0.4634360074996948, "step": 882 }, { "epoch": 2.9141914191419143, "grad_norm": 0.2453653346062948, "learning_rate": 1.0743145941279453e-07, "loss": 0.5041622519493103, "step": 883 }, { "epoch": 2.9174917491749177, "grad_norm": 0.21518547033713775, "learning_rate": 9.962743302596612e-08, "loss": 0.480410099029541, "step": 884 }, { "epoch": 2.9207920792079207, "grad_norm": 0.24487326504708118, "learning_rate": 9.211693752915419e-08, "loss": 0.49919891357421875, "step": 885 }, { "epoch": 2.924092409240924, "grad_norm": 0.23373083594094138, "learning_rate": 8.490008370231506e-08, "loss": 0.508806586265564, "step": 886 }, { "epoch": 2.9273927392739276, "grad_norm": 0.23076843849897602, "learning_rate": 7.797697799418525e-08, "loss": 0.4233350157737732, "step": 887 }, { "epoch": 2.9306930693069306, "grad_norm": 0.2406032429252954, "learning_rate": 7.134772252071154e-08, "loss": 0.4577901363372803, "step": 888 }, { "epoch": 2.933993399339934, "grad_norm": 0.22213331512067527, "learning_rate": 6.501241506354561e-08, "loss": 0.4028077721595764, "step": 889 }, { "epoch": 2.9372937293729375, "grad_norm": 0.23681508976522572, "learning_rate": 5.897114906859402e-08, "loss": 0.48321446776390076, "step": 890 }, { "epoch": 2.9405940594059405, "grad_norm": 0.27558742916404966, "learning_rate": 5.322401364465491e-08, "loss": 0.48732608556747437, "step": 891 }, { "epoch": 2.943894389438944, "grad_norm": 0.22725537704850798, "learning_rate": 4.777109356208565e-08, "loss": 0.46879494190216064, "step": 892 }, { "epoch": 2.9471947194719474, "grad_norm": 0.23495776431163154, "learning_rate": 4.261246925156837e-08, "loss": 0.4858628511428833, "step": 893 }, { "epoch": 2.9504950495049505, "grad_norm": 0.22802725333151694, "learning_rate": 3.7748216802913077e-08, "loss": 0.48119616508483887, "step": 894 }, { "epoch": 2.953795379537954, "grad_norm": 0.22512889420077337, "learning_rate": 3.3178407963938564e-08, "loss": 0.4994167983531952, "step": 895 }, { "epoch": 2.9570957095709574, "grad_norm": 0.23739211802797258, "learning_rate": 2.8903110139417712e-08, "loss": 0.46394845843315125, "step": 896 }, { "epoch": 2.9603960396039604, "grad_norm": 0.2476698533912655, "learning_rate": 2.4922386390076047e-08, "loss": 0.42504560947418213, "step": 897 }, { "epoch": 2.963696369636964, "grad_norm": 0.24523827629331452, "learning_rate": 2.1236295431670275e-08, "loss": 0.4186960756778717, "step": 898 }, { "epoch": 2.9669966996699673, "grad_norm": 0.22738870735932892, "learning_rate": 1.7844891634113402e-08, "loss": 0.4529160261154175, "step": 899 }, { "epoch": 2.9702970297029703, "grad_norm": 0.23734524364327658, "learning_rate": 1.4748225020679851e-08, "loss": 0.44012153148651123, "step": 900 }, { "epoch": 2.9735973597359737, "grad_norm": 0.23103066951863727, "learning_rate": 1.1946341267263794e-08, "loss": 0.4775368571281433, "step": 901 }, { "epoch": 2.976897689768977, "grad_norm": 0.22618868632744704, "learning_rate": 9.439281701704162e-09, "loss": 0.4465276002883911, "step": 902 }, { "epoch": 2.98019801980198, "grad_norm": 0.24271367480309458, "learning_rate": 7.227083303180671e-09, "loss": 0.4674132168292999, "step": 903 }, { "epoch": 2.9834983498349836, "grad_norm": 0.23142674174926925, "learning_rate": 5.30977870166316e-09, "loss": 0.4751841127872467, "step": 904 }, { "epoch": 2.9867986798679866, "grad_norm": 0.24061959007170008, "learning_rate": 3.687396177434188e-09, "loss": 0.4587743580341339, "step": 905 }, { "epoch": 2.99009900990099, "grad_norm": 0.22301171950064888, "learning_rate": 2.359959660667155e-09, "loss": 0.4815826416015625, "step": 906 }, { "epoch": 2.9933993399339935, "grad_norm": 0.22240717058445192, "learning_rate": 1.3274887310732454e-09, "loss": 0.45863479375839233, "step": 907 }, { "epoch": 2.9966996699669965, "grad_norm": 0.23321307876392341, "learning_rate": 5.899986176260974e-10, "loss": 0.4888804256916046, "step": 908 }, { "epoch": 3.0, "grad_norm": 0.2343821134475686, "learning_rate": 1.475001983131108e-10, "loss": 0.46804267168045044, "step": 909 }, { "epoch": 3.0, "step": 909, "total_flos": 1274755977576448.0, "train_loss": 0.5495063810720958, "train_runtime": 34913.8374, "train_samples_per_second": 3.33, "train_steps_per_second": 0.026 } ], "logging_steps": 1, "max_steps": 909, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1274755977576448.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }