{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5735, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017438311971401169, "grad_norm": 75.95468738643471, "learning_rate": 5.202312138728324e-06, "loss": 3.0208, "step": 10 }, { "epoch": 0.0034876623942802338, "grad_norm": 30.03360815962505, "learning_rate": 1.0982658959537573e-05, "loss": 1.9958, "step": 20 }, { "epoch": 0.0052314935914203504, "grad_norm": 17.34695165918347, "learning_rate": 1.676300578034682e-05, "loss": 1.0962, "step": 30 }, { "epoch": 0.0069753247885604676, "grad_norm": 12.491792934654203, "learning_rate": 2.254335260115607e-05, "loss": 0.6126, "step": 40 }, { "epoch": 0.008719155985700585, "grad_norm": 1.9114286148516435, "learning_rate": 2.832369942196532e-05, "loss": 0.4425, "step": 50 }, { "epoch": 0.010462987182840701, "grad_norm": 1.428534192558495, "learning_rate": 3.410404624277457e-05, "loss": 0.3402, "step": 60 }, { "epoch": 0.012206818379980817, "grad_norm": 2.416511852214603, "learning_rate": 3.988439306358382e-05, "loss": 0.2859, "step": 70 }, { "epoch": 0.013950649577120935, "grad_norm": 1.1798287792036775, "learning_rate": 4.566473988439307e-05, "loss": 0.2436, "step": 80 }, { "epoch": 0.01569448077426105, "grad_norm": 0.9422829664821649, "learning_rate": 5.1445086705202317e-05, "loss": 0.2153, "step": 90 }, { "epoch": 0.01743831197140117, "grad_norm": 0.6784111235814376, "learning_rate": 5.722543352601156e-05, "loss": 0.1949, "step": 100 }, { "epoch": 0.019182143168541284, "grad_norm": 0.6258877189910531, "learning_rate": 6.300578034682081e-05, "loss": 0.1748, "step": 110 }, { "epoch": 0.020925974365681402, "grad_norm": 1.0353787867694664, "learning_rate": 6.878612716763007e-05, "loss": 0.1579, "step": 120 }, { "epoch": 0.02266980556282152, "grad_norm": 0.4027613120700805, "learning_rate": 7.456647398843931e-05, "loss": 0.15, "step": 130 }, { "epoch": 0.024413636759961634, "grad_norm": 0.31222620383041055, "learning_rate": 8.034682080924855e-05, "loss": 0.1412, "step": 140 }, { "epoch": 0.026157467957101752, "grad_norm": 0.4650470906801264, "learning_rate": 8.612716763005781e-05, "loss": 0.1351, "step": 150 }, { "epoch": 0.02790129915424187, "grad_norm": 0.3734885043246775, "learning_rate": 9.190751445086706e-05, "loss": 0.1351, "step": 160 }, { "epoch": 0.029645130351381985, "grad_norm": 0.38052529337243646, "learning_rate": 9.768786127167631e-05, "loss": 0.1289, "step": 170 }, { "epoch": 0.0313889615485221, "grad_norm": 0.24922983883737135, "learning_rate": 9.999971286914107e-05, "loss": 0.1278, "step": 180 }, { "epoch": 0.03313279274566222, "grad_norm": 0.2337743034285069, "learning_rate": 9.999795819250125e-05, "loss": 0.12, "step": 190 }, { "epoch": 0.03487662394280234, "grad_norm": 0.1925200524883156, "learning_rate": 9.99946084122777e-05, "loss": 0.1187, "step": 200 }, { "epoch": 0.03662045513994246, "grad_norm": 0.25639013442385866, "learning_rate": 9.998966363533971e-05, "loss": 0.1155, "step": 210 }, { "epoch": 0.03836428633708257, "grad_norm": 0.5969296827345937, "learning_rate": 9.998312401944236e-05, "loss": 0.1123, "step": 220 }, { "epoch": 0.040108117534222686, "grad_norm": 0.19274626117019628, "learning_rate": 9.997498977322146e-05, "loss": 0.113, "step": 230 }, { "epoch": 0.041851948731362804, "grad_norm": 0.29247520821204726, "learning_rate": 9.996526115618692e-05, "loss": 0.1117, "step": 240 }, { "epoch": 0.04359577992850292, "grad_norm": 0.29663865702918896, "learning_rate": 9.995393847871446e-05, "loss": 0.1084, "step": 250 }, { "epoch": 0.04533961112564304, "grad_norm": 0.7694784504132202, "learning_rate": 9.994102210203567e-05, "loss": 0.1075, "step": 260 }, { "epoch": 0.04708344232278316, "grad_norm": 0.6628546556003793, "learning_rate": 9.992651243822657e-05, "loss": 0.1107, "step": 270 }, { "epoch": 0.04882727351992327, "grad_norm": 0.19910200146659307, "learning_rate": 9.991040995019441e-05, "loss": 0.1042, "step": 280 }, { "epoch": 0.050571104717063387, "grad_norm": 0.482892752541729, "learning_rate": 9.989271515166288e-05, "loss": 0.1067, "step": 290 }, { "epoch": 0.052314935914203504, "grad_norm": 0.3316707002189607, "learning_rate": 9.987342860715575e-05, "loss": 0.1051, "step": 300 }, { "epoch": 0.05405876711134362, "grad_norm": 0.1686835282656122, "learning_rate": 9.985255093197889e-05, "loss": 0.104, "step": 310 }, { "epoch": 0.05580259830848374, "grad_norm": 0.20484377424861627, "learning_rate": 9.983008279220061e-05, "loss": 0.1003, "step": 320 }, { "epoch": 0.05754642950562386, "grad_norm": 0.17506212895289214, "learning_rate": 9.980602490463036e-05, "loss": 0.1037, "step": 330 }, { "epoch": 0.05929026070276397, "grad_norm": 0.21847685544030518, "learning_rate": 9.978037803679594e-05, "loss": 0.1051, "step": 340 }, { "epoch": 0.06103409189990409, "grad_norm": 0.1545992773211455, "learning_rate": 9.975314300691897e-05, "loss": 0.1008, "step": 350 }, { "epoch": 0.0627779230970442, "grad_norm": 0.18093862679980835, "learning_rate": 9.972432068388884e-05, "loss": 0.1013, "step": 360 }, { "epoch": 0.06452175429418432, "grad_norm": 0.16690423874276675, "learning_rate": 9.969391198723489e-05, "loss": 0.1015, "step": 370 }, { "epoch": 0.06626558549132444, "grad_norm": 0.1592587474414291, "learning_rate": 9.966191788709716e-05, "loss": 0.0997, "step": 380 }, { "epoch": 0.06800941668846455, "grad_norm": 0.4135662463271595, "learning_rate": 9.96283394041954e-05, "loss": 0.0976, "step": 390 }, { "epoch": 0.06975324788560468, "grad_norm": 0.2936141046578689, "learning_rate": 9.959317760979653e-05, "loss": 0.098, "step": 400 }, { "epoch": 0.07149707908274479, "grad_norm": 0.16272012987508996, "learning_rate": 9.955643362568047e-05, "loss": 0.0975, "step": 410 }, { "epoch": 0.07324091027988491, "grad_norm": 0.2718060027024407, "learning_rate": 9.951810862410426e-05, "loss": 0.1025, "step": 420 }, { "epoch": 0.07498474147702502, "grad_norm": 0.48320652277711706, "learning_rate": 9.947820382776483e-05, "loss": 0.0998, "step": 430 }, { "epoch": 0.07672857267416514, "grad_norm": 0.34047211822697626, "learning_rate": 9.943672050975978e-05, "loss": 0.0973, "step": 440 }, { "epoch": 0.07847240387130526, "grad_norm": 0.12654950824029546, "learning_rate": 9.9393659993547e-05, "loss": 0.0952, "step": 450 }, { "epoch": 0.08021623506844537, "grad_norm": 0.19206792789150806, "learning_rate": 9.934902365290221e-05, "loss": 0.0916, "step": 460 }, { "epoch": 0.0819600662655855, "grad_norm": 0.3174049728598392, "learning_rate": 9.930281291187533e-05, "loss": 0.0926, "step": 470 }, { "epoch": 0.08370389746272561, "grad_norm": 0.21583614516207328, "learning_rate": 9.925502924474494e-05, "loss": 0.0968, "step": 480 }, { "epoch": 0.08544772865986572, "grad_norm": 0.20077795953974328, "learning_rate": 9.920567417597127e-05, "loss": 0.0947, "step": 490 }, { "epoch": 0.08719155985700584, "grad_norm": 0.5320546316014596, "learning_rate": 9.915474928014754e-05, "loss": 0.0922, "step": 500 }, { "epoch": 0.08893539105414595, "grad_norm": 0.25808125396162024, "learning_rate": 9.910225618194979e-05, "loss": 0.0938, "step": 510 }, { "epoch": 0.09067922225128608, "grad_norm": 0.16298921826803214, "learning_rate": 9.9048196556085e-05, "loss": 0.0934, "step": 520 }, { "epoch": 0.09242305344842619, "grad_norm": 0.23375968225532243, "learning_rate": 9.89925721272376e-05, "loss": 0.0932, "step": 530 }, { "epoch": 0.09416688464556631, "grad_norm": 0.16382604539487483, "learning_rate": 9.893538467001465e-05, "loss": 0.0898, "step": 540 }, { "epoch": 0.09591071584270643, "grad_norm": 0.10016462312465411, "learning_rate": 9.887663600888897e-05, "loss": 0.0942, "step": 550 }, { "epoch": 0.09765454703984654, "grad_norm": 0.14233007452976884, "learning_rate": 9.881632801814112e-05, "loss": 0.0907, "step": 560 }, { "epoch": 0.09939837823698666, "grad_norm": 0.17513502703887412, "learning_rate": 9.875446262179948e-05, "loss": 0.0911, "step": 570 }, { "epoch": 0.10114220943412677, "grad_norm": 0.14030487648990883, "learning_rate": 9.869104179357898e-05, "loss": 0.0934, "step": 580 }, { "epoch": 0.1028860406312669, "grad_norm": 0.12085926283675864, "learning_rate": 9.862606755681805e-05, "loss": 0.0884, "step": 590 }, { "epoch": 0.10462987182840701, "grad_norm": 0.23368729653021336, "learning_rate": 9.855954198441411e-05, "loss": 0.0911, "step": 600 }, { "epoch": 0.10637370302554713, "grad_norm": 0.16684004596578658, "learning_rate": 9.849146719875737e-05, "loss": 0.0922, "step": 610 }, { "epoch": 0.10811753422268724, "grad_norm": 0.1412977238838307, "learning_rate": 9.842184537166325e-05, "loss": 0.0901, "step": 620 }, { "epoch": 0.10986136541982736, "grad_norm": 0.15930483520725827, "learning_rate": 9.835067872430298e-05, "loss": 0.091, "step": 630 }, { "epoch": 0.11160519661696748, "grad_norm": 0.15145556668891252, "learning_rate": 9.827796952713271e-05, "loss": 0.0896, "step": 640 }, { "epoch": 0.11334902781410759, "grad_norm": 0.11359857757495083, "learning_rate": 9.820372009982122e-05, "loss": 0.0909, "step": 650 }, { "epoch": 0.11509285901124772, "grad_norm": 0.10805407788602606, "learning_rate": 9.81279328111758e-05, "loss": 0.0897, "step": 660 }, { "epoch": 0.11683669020838783, "grad_norm": 0.1379641185520003, "learning_rate": 9.805061007906669e-05, "loss": 0.0901, "step": 670 }, { "epoch": 0.11858052140552794, "grad_norm": 0.17169121682691532, "learning_rate": 9.797175437034997e-05, "loss": 0.0898, "step": 680 }, { "epoch": 0.12032435260266806, "grad_norm": 0.14025167310704112, "learning_rate": 9.789136820078883e-05, "loss": 0.0888, "step": 690 }, { "epoch": 0.12206818379980817, "grad_norm": 0.1521428482995378, "learning_rate": 9.780945413497337e-05, "loss": 0.0883, "step": 700 }, { "epoch": 0.1238120149969483, "grad_norm": 0.1564174281355041, "learning_rate": 9.77260147862387e-05, "loss": 0.0892, "step": 710 }, { "epoch": 0.1255558461940884, "grad_norm": 0.13358902120457766, "learning_rate": 9.76410528165816e-05, "loss": 0.086, "step": 720 }, { "epoch": 0.12729967739122852, "grad_norm": 0.21507213503538028, "learning_rate": 9.755457093657561e-05, "loss": 0.0852, "step": 730 }, { "epoch": 0.12904350858836863, "grad_norm": 0.10784925841529905, "learning_rate": 9.746657190528454e-05, "loss": 0.0853, "step": 740 }, { "epoch": 0.13078733978550877, "grad_norm": 0.1252377711727728, "learning_rate": 9.737705853017441e-05, "loss": 0.0885, "step": 750 }, { "epoch": 0.13253117098264888, "grad_norm": 0.13791730839422697, "learning_rate": 9.728603366702399e-05, "loss": 0.0863, "step": 760 }, { "epoch": 0.134275002179789, "grad_norm": 0.20745890428434363, "learning_rate": 9.719350021983356e-05, "loss": 0.0853, "step": 770 }, { "epoch": 0.1360188333769291, "grad_norm": 0.11532866798616691, "learning_rate": 9.709946114073232e-05, "loss": 0.0891, "step": 780 }, { "epoch": 0.13776266457406924, "grad_norm": 0.09902310686484139, "learning_rate": 9.700391942988423e-05, "loss": 0.0874, "step": 790 }, { "epoch": 0.13950649577120935, "grad_norm": 0.12236583263546631, "learning_rate": 9.690687813539229e-05, "loss": 0.0873, "step": 800 }, { "epoch": 0.14125032696834947, "grad_norm": 0.28381242657810296, "learning_rate": 9.680834035320127e-05, "loss": 0.0847, "step": 810 }, { "epoch": 0.14299415816548958, "grad_norm": 0.17407339426371563, "learning_rate": 9.670830922699889e-05, "loss": 0.0865, "step": 820 }, { "epoch": 0.1447379893626297, "grad_norm": 0.17139082411532464, "learning_rate": 9.660678794811568e-05, "loss": 0.0873, "step": 830 }, { "epoch": 0.14648182055976983, "grad_norm": 0.18180876897987616, "learning_rate": 9.650377975542297e-05, "loss": 0.0856, "step": 840 }, { "epoch": 0.14822565175690994, "grad_norm": 0.14881131848289347, "learning_rate": 9.639928793522976e-05, "loss": 0.0857, "step": 850 }, { "epoch": 0.14996948295405005, "grad_norm": 0.19484994491316202, "learning_rate": 9.629331582117766e-05, "loss": 0.0874, "step": 860 }, { "epoch": 0.15171331415119016, "grad_norm": 0.13056276669158606, "learning_rate": 9.618586679413477e-05, "loss": 0.0863, "step": 870 }, { "epoch": 0.15345714534833027, "grad_norm": 0.11769246064182205, "learning_rate": 9.607694428208759e-05, "loss": 0.0833, "step": 880 }, { "epoch": 0.1552009765454704, "grad_norm": 0.14628105454358595, "learning_rate": 9.596655176003184e-05, "loss": 0.084, "step": 890 }, { "epoch": 0.15694480774261052, "grad_norm": 0.10138944434580117, "learning_rate": 9.585469274986147e-05, "loss": 0.0837, "step": 900 }, { "epoch": 0.15868863893975063, "grad_norm": 0.2727810370648225, "learning_rate": 9.57413708202564e-05, "loss": 0.083, "step": 910 }, { "epoch": 0.16043247013689074, "grad_norm": 0.12791732468432543, "learning_rate": 9.562658958656855e-05, "loss": 0.0838, "step": 920 }, { "epoch": 0.16217630133403085, "grad_norm": 0.08919744277549542, "learning_rate": 9.551035271070664e-05, "loss": 0.0858, "step": 930 }, { "epoch": 0.163920132531171, "grad_norm": 0.10948146823445754, "learning_rate": 9.539266390101921e-05, "loss": 0.0844, "step": 940 }, { "epoch": 0.1656639637283111, "grad_norm": 0.14190891350092028, "learning_rate": 9.527352691217648e-05, "loss": 0.0821, "step": 950 }, { "epoch": 0.16740779492545121, "grad_norm": 0.13665202223597164, "learning_rate": 9.515294554505039e-05, "loss": 0.0831, "step": 960 }, { "epoch": 0.16915162612259133, "grad_norm": 0.2559230604815077, "learning_rate": 9.503092364659343e-05, "loss": 0.0851, "step": 970 }, { "epoch": 0.17089545731973144, "grad_norm": 0.12867152006889135, "learning_rate": 9.490746510971595e-05, "loss": 0.084, "step": 980 }, { "epoch": 0.17263928851687158, "grad_norm": 0.16931334823903008, "learning_rate": 9.47825738731619e-05, "loss": 0.0842, "step": 990 }, { "epoch": 0.1743831197140117, "grad_norm": 0.14129496751949874, "learning_rate": 9.465625392138313e-05, "loss": 0.0813, "step": 1000 }, { "epoch": 0.1761269509111518, "grad_norm": 0.11143855280545686, "learning_rate": 9.452850928441239e-05, "loss": 0.0811, "step": 1010 }, { "epoch": 0.1778707821082919, "grad_norm": 0.1051133376541276, "learning_rate": 9.439934403773468e-05, "loss": 0.0829, "step": 1020 }, { "epoch": 0.17961461330543205, "grad_norm": 0.0936072144785802, "learning_rate": 9.42687623021572e-05, "loss": 0.0835, "step": 1030 }, { "epoch": 0.18135844450257216, "grad_norm": 0.08594669519246889, "learning_rate": 9.413676824367799e-05, "loss": 0.084, "step": 1040 }, { "epoch": 0.18310227569971227, "grad_norm": 0.155895314210675, "learning_rate": 9.400336607335293e-05, "loss": 0.0814, "step": 1050 }, { "epoch": 0.18484610689685238, "grad_norm": 0.09064911002854469, "learning_rate": 9.38685600471614e-05, "loss": 0.0816, "step": 1060 }, { "epoch": 0.1865899380939925, "grad_norm": 0.11656040445018741, "learning_rate": 9.373235446587056e-05, "loss": 0.0832, "step": 1070 }, { "epoch": 0.18833376929113263, "grad_norm": 0.08144570375176247, "learning_rate": 9.359475367489806e-05, "loss": 0.0812, "step": 1080 }, { "epoch": 0.19007760048827274, "grad_norm": 0.14753379134013667, "learning_rate": 9.345576206417345e-05, "loss": 0.0825, "step": 1090 }, { "epoch": 0.19182143168541285, "grad_norm": 0.08941481925466258, "learning_rate": 9.331538406799816e-05, "loss": 0.0849, "step": 1100 }, { "epoch": 0.19356526288255296, "grad_norm": 0.10342356855139785, "learning_rate": 9.317362416490396e-05, "loss": 0.0812, "step": 1110 }, { "epoch": 0.19530909407969307, "grad_norm": 0.09908246193961452, "learning_rate": 9.303048687751015e-05, "loss": 0.0827, "step": 1120 }, { "epoch": 0.1970529252768332, "grad_norm": 0.08548417434334415, "learning_rate": 9.288597677237918e-05, "loss": 0.0795, "step": 1130 }, { "epoch": 0.19879675647397332, "grad_norm": 0.19698254462320883, "learning_rate": 9.274009845987106e-05, "loss": 0.0784, "step": 1140 }, { "epoch": 0.20054058767111343, "grad_norm": 0.19931397597516964, "learning_rate": 9.259285659399624e-05, "loss": 0.0791, "step": 1150 }, { "epoch": 0.20228441886825355, "grad_norm": 0.09541689313639541, "learning_rate": 9.244425587226708e-05, "loss": 0.0811, "step": 1160 }, { "epoch": 0.20402825006539366, "grad_norm": 0.13754375259689228, "learning_rate": 9.229430103554809e-05, "loss": 0.0834, "step": 1170 }, { "epoch": 0.2057720812625338, "grad_norm": 0.0901357227026893, "learning_rate": 9.214299686790453e-05, "loss": 0.0811, "step": 1180 }, { "epoch": 0.2075159124596739, "grad_norm": 0.10558399227810011, "learning_rate": 9.199034819644996e-05, "loss": 0.0805, "step": 1190 }, { "epoch": 0.20925974365681402, "grad_norm": 0.15527795571225775, "learning_rate": 9.18363598911921e-05, "loss": 0.0788, "step": 1200 }, { "epoch": 0.21100357485395413, "grad_norm": 0.07370244400658628, "learning_rate": 9.168103686487754e-05, "loss": 0.0809, "step": 1210 }, { "epoch": 0.21274740605109427, "grad_norm": 0.1080328076461725, "learning_rate": 9.152438407283492e-05, "loss": 0.0794, "step": 1220 }, { "epoch": 0.21449123724823438, "grad_norm": 0.12193088880477042, "learning_rate": 9.136640651281694e-05, "loss": 0.078, "step": 1230 }, { "epoch": 0.2162350684453745, "grad_norm": 0.1435896505685483, "learning_rate": 9.120710922484088e-05, "loss": 0.0801, "step": 1240 }, { "epoch": 0.2179788996425146, "grad_norm": 0.15854421340115443, "learning_rate": 9.104649729102774e-05, "loss": 0.0767, "step": 1250 }, { "epoch": 0.2197227308396547, "grad_norm": 0.10152239382236611, "learning_rate": 9.088457583544021e-05, "loss": 0.0793, "step": 1260 }, { "epoch": 0.22146656203679485, "grad_norm": 0.09311072879845227, "learning_rate": 9.072135002391911e-05, "loss": 0.0808, "step": 1270 }, { "epoch": 0.22321039323393496, "grad_norm": 0.08102045640572195, "learning_rate": 9.055682506391867e-05, "loss": 0.0789, "step": 1280 }, { "epoch": 0.22495422443107507, "grad_norm": 0.11429382945365393, "learning_rate": 9.039100620434025e-05, "loss": 0.0809, "step": 1290 }, { "epoch": 0.22669805562821518, "grad_norm": 0.13272380745973458, "learning_rate": 9.022389873536504e-05, "loss": 0.0808, "step": 1300 }, { "epoch": 0.2284418868253553, "grad_norm": 0.1541801400684174, "learning_rate": 9.005550798828522e-05, "loss": 0.0781, "step": 1310 }, { "epoch": 0.23018571802249543, "grad_norm": 0.21341903886796226, "learning_rate": 8.988583933533383e-05, "loss": 0.0806, "step": 1320 }, { "epoch": 0.23192954921963554, "grad_norm": 0.1072377577262835, "learning_rate": 8.971489818951348e-05, "loss": 0.0815, "step": 1330 }, { "epoch": 0.23367338041677566, "grad_norm": 0.2539226009683929, "learning_rate": 8.954269000442353e-05, "loss": 0.0822, "step": 1340 }, { "epoch": 0.23541721161391577, "grad_norm": 0.0964733141512959, "learning_rate": 8.936922027408618e-05, "loss": 0.0802, "step": 1350 }, { "epoch": 0.23716104281105588, "grad_norm": 0.15344703114198005, "learning_rate": 8.919449453277125e-05, "loss": 0.0781, "step": 1360 }, { "epoch": 0.23890487400819602, "grad_norm": 0.13637749965223914, "learning_rate": 8.901851835481946e-05, "loss": 0.0768, "step": 1370 }, { "epoch": 0.24064870520533613, "grad_norm": 0.1439271667236502, "learning_rate": 8.884129735446471e-05, "loss": 0.0782, "step": 1380 }, { "epoch": 0.24239253640247624, "grad_norm": 0.10228448993969186, "learning_rate": 8.866283718565497e-05, "loss": 0.0802, "step": 1390 }, { "epoch": 0.24413636759961635, "grad_norm": 0.08357219560541651, "learning_rate": 8.848314354187184e-05, "loss": 0.0805, "step": 1400 }, { "epoch": 0.24588019879675646, "grad_norm": 0.12297612486088587, "learning_rate": 8.83022221559489e-05, "loss": 0.0807, "step": 1410 }, { "epoch": 0.2476240299938966, "grad_norm": 0.09213229043723821, "learning_rate": 8.81200787998889e-05, "loss": 0.0796, "step": 1420 }, { "epoch": 0.2493678611910367, "grad_norm": 0.23221554917761744, "learning_rate": 8.793671928467953e-05, "loss": 0.0795, "step": 1430 }, { "epoch": 0.2511116923881768, "grad_norm": 0.15255422092338392, "learning_rate": 8.775214946010806e-05, "loss": 0.0791, "step": 1440 }, { "epoch": 0.25285552358531693, "grad_norm": 0.13430089153547967, "learning_rate": 8.756637521457472e-05, "loss": 0.0796, "step": 1450 }, { "epoch": 0.25459935478245704, "grad_norm": 0.10858403401784206, "learning_rate": 8.737940247490488e-05, "loss": 0.0776, "step": 1460 }, { "epoch": 0.25634318597959715, "grad_norm": 0.1087205013394363, "learning_rate": 8.71912372061598e-05, "loss": 0.0764, "step": 1470 }, { "epoch": 0.25808701717673727, "grad_norm": 0.1801860214762826, "learning_rate": 8.700188541144658e-05, "loss": 0.0764, "step": 1480 }, { "epoch": 0.25983084837387743, "grad_norm": 0.1089107768906771, "learning_rate": 8.68113531317264e-05, "loss": 0.0801, "step": 1490 }, { "epoch": 0.26157467957101754, "grad_norm": 0.10862169809528116, "learning_rate": 8.661964644562193e-05, "loss": 0.0796, "step": 1500 }, { "epoch": 0.26331851076815765, "grad_norm": 0.08182932886020478, "learning_rate": 8.64267714692234e-05, "loss": 0.0784, "step": 1510 }, { "epoch": 0.26506234196529777, "grad_norm": 0.10745086100269909, "learning_rate": 8.623273435589338e-05, "loss": 0.0771, "step": 1520 }, { "epoch": 0.2668061731624379, "grad_norm": 0.0835190988442007, "learning_rate": 8.603754129607055e-05, "loss": 0.0785, "step": 1530 }, { "epoch": 0.268550004359578, "grad_norm": 0.15373146227023327, "learning_rate": 8.584119851707224e-05, "loss": 0.0787, "step": 1540 }, { "epoch": 0.2702938355567181, "grad_norm": 0.16851363372769732, "learning_rate": 8.564371228289562e-05, "loss": 0.0782, "step": 1550 }, { "epoch": 0.2720376667538582, "grad_norm": 0.17341701223323236, "learning_rate": 8.5445088894018e-05, "loss": 0.0763, "step": 1560 }, { "epoch": 0.2737814979509983, "grad_norm": 0.15736312308266082, "learning_rate": 8.524533468719568e-05, "loss": 0.0778, "step": 1570 }, { "epoch": 0.2755253291481385, "grad_norm": 0.10603350184378946, "learning_rate": 8.504445603526201e-05, "loss": 0.0769, "step": 1580 }, { "epoch": 0.2772691603452786, "grad_norm": 0.10468271020039895, "learning_rate": 8.484245934692379e-05, "loss": 0.0764, "step": 1590 }, { "epoch": 0.2790129915424187, "grad_norm": 0.1148841481305709, "learning_rate": 8.463935106655704e-05, "loss": 0.0773, "step": 1600 }, { "epoch": 0.2807568227395588, "grad_norm": 0.0821350828260079, "learning_rate": 8.443513767400127e-05, "loss": 0.0755, "step": 1610 }, { "epoch": 0.28250065393669893, "grad_norm": 0.08498785215350724, "learning_rate": 8.422982568435281e-05, "loss": 0.0771, "step": 1620 }, { "epoch": 0.28424448513383904, "grad_norm": 0.08457375312974288, "learning_rate": 8.4023421647757e-05, "loss": 0.0766, "step": 1630 }, { "epoch": 0.28598831633097915, "grad_norm": 0.31067232704883735, "learning_rate": 8.381593214919905e-05, "loss": 0.0795, "step": 1640 }, { "epoch": 0.28773214752811926, "grad_norm": 0.08612116427881779, "learning_rate": 8.360736380829419e-05, "loss": 0.0785, "step": 1650 }, { "epoch": 0.2894759787252594, "grad_norm": 0.07915038596639898, "learning_rate": 8.339772327907628e-05, "loss": 0.076, "step": 1660 }, { "epoch": 0.2912198099223995, "grad_norm": 0.21458699675313742, "learning_rate": 8.318701724978564e-05, "loss": 0.0776, "step": 1670 }, { "epoch": 0.29296364111953965, "grad_norm": 0.14040279428045313, "learning_rate": 8.29752524426556e-05, "loss": 0.0767, "step": 1680 }, { "epoch": 0.29470747231667976, "grad_norm": 0.08754112074945274, "learning_rate": 8.276243561369814e-05, "loss": 0.0758, "step": 1690 }, { "epoch": 0.2964513035138199, "grad_norm": 0.10893709018139172, "learning_rate": 8.254857355248824e-05, "loss": 0.0747, "step": 1700 }, { "epoch": 0.29819513471096, "grad_norm": 0.06433336098088924, "learning_rate": 8.233367308194734e-05, "loss": 0.0774, "step": 1710 }, { "epoch": 0.2999389659081001, "grad_norm": 0.09370553245193815, "learning_rate": 8.21177410581256e-05, "loss": 0.0745, "step": 1720 }, { "epoch": 0.3016827971052402, "grad_norm": 0.1113308601153905, "learning_rate": 8.190078436998326e-05, "loss": 0.0742, "step": 1730 }, { "epoch": 0.3034266283023803, "grad_norm": 0.0845496702841715, "learning_rate": 8.168280993917077e-05, "loss": 0.0765, "step": 1740 }, { "epoch": 0.30517045949952043, "grad_norm": 0.07782628428869619, "learning_rate": 8.146382471980803e-05, "loss": 0.0764, "step": 1750 }, { "epoch": 0.30691429069666054, "grad_norm": 0.07272801486673855, "learning_rate": 8.124383569826253e-05, "loss": 0.0743, "step": 1760 }, { "epoch": 0.3086581218938007, "grad_norm": 0.08576203017796658, "learning_rate": 8.102284989292638e-05, "loss": 0.077, "step": 1770 }, { "epoch": 0.3104019530909408, "grad_norm": 0.06773514930909028, "learning_rate": 8.080087435399249e-05, "loss": 0.0739, "step": 1780 }, { "epoch": 0.31214578428808093, "grad_norm": 0.083064606175779, "learning_rate": 8.057791616322959e-05, "loss": 0.0755, "step": 1790 }, { "epoch": 0.31388961548522104, "grad_norm": 0.12916703368799123, "learning_rate": 8.035398243375636e-05, "loss": 0.0752, "step": 1800 }, { "epoch": 0.31563344668236115, "grad_norm": 0.1381342843732955, "learning_rate": 8.012908030981441e-05, "loss": 0.0784, "step": 1810 }, { "epoch": 0.31737727787950126, "grad_norm": 0.08149140898903617, "learning_rate": 7.990321696654043e-05, "loss": 0.0746, "step": 1820 }, { "epoch": 0.3191211090766414, "grad_norm": 0.09800398683738969, "learning_rate": 7.967639960973726e-05, "loss": 0.0768, "step": 1830 }, { "epoch": 0.3208649402737815, "grad_norm": 0.0720534859927381, "learning_rate": 7.944863547564396e-05, "loss": 0.0722, "step": 1840 }, { "epoch": 0.3226087714709216, "grad_norm": 0.07931259123296684, "learning_rate": 7.921993183070498e-05, "loss": 0.0733, "step": 1850 }, { "epoch": 0.3243526026680617, "grad_norm": 0.13837309451231886, "learning_rate": 7.899029597133835e-05, "loss": 0.0759, "step": 1860 }, { "epoch": 0.3260964338652019, "grad_norm": 0.08485683188152221, "learning_rate": 7.875973522370293e-05, "loss": 0.0754, "step": 1870 }, { "epoch": 0.327840265062342, "grad_norm": 0.0772062101697051, "learning_rate": 7.852825694346456e-05, "loss": 0.0765, "step": 1880 }, { "epoch": 0.3295840962594821, "grad_norm": 0.089799828330404, "learning_rate": 7.82958685155615e-05, "loss": 0.0743, "step": 1890 }, { "epoch": 0.3313279274566222, "grad_norm": 0.10045562894572037, "learning_rate": 7.806257735396878e-05, "loss": 0.0754, "step": 1900 }, { "epoch": 0.3330717586537623, "grad_norm": 0.06500413003152958, "learning_rate": 7.782839090146173e-05, "loss": 0.073, "step": 1910 }, { "epoch": 0.33481558985090243, "grad_norm": 0.1259139302957018, "learning_rate": 7.759331662937841e-05, "loss": 0.0769, "step": 1920 }, { "epoch": 0.33655942104804254, "grad_norm": 0.11229045594501955, "learning_rate": 7.735736203738138e-05, "loss": 0.0737, "step": 1930 }, { "epoch": 0.33830325224518265, "grad_norm": 0.11020678621947962, "learning_rate": 7.71205346532183e-05, "loss": 0.0792, "step": 1940 }, { "epoch": 0.34004708344232276, "grad_norm": 0.07389223584844883, "learning_rate": 7.688284203248196e-05, "loss": 0.0754, "step": 1950 }, { "epoch": 0.3417909146394629, "grad_norm": 0.07106281995095357, "learning_rate": 7.664429175836903e-05, "loss": 0.0759, "step": 1960 }, { "epoch": 0.34353474583660304, "grad_norm": 0.09883942936733096, "learning_rate": 7.64048914414382e-05, "loss": 0.0763, "step": 1970 }, { "epoch": 0.34527857703374315, "grad_norm": 0.12548493474348643, "learning_rate": 7.616464871936749e-05, "loss": 0.0759, "step": 1980 }, { "epoch": 0.34702240823088326, "grad_norm": 0.10524371345048106, "learning_rate": 7.592357125671039e-05, "loss": 0.0734, "step": 1990 }, { "epoch": 0.3487662394280234, "grad_norm": 0.0989300552899626, "learning_rate": 7.56816667446515e-05, "loss": 0.0765, "step": 2000 }, { "epoch": 0.3505100706251635, "grad_norm": 0.06808863715144056, "learning_rate": 7.543894290076103e-05, "loss": 0.0747, "step": 2010 }, { "epoch": 0.3522539018223036, "grad_norm": 0.1241267483898317, "learning_rate": 7.519540746874868e-05, "loss": 0.0752, "step": 2020 }, { "epoch": 0.3539977330194437, "grad_norm": 0.12046241979127437, "learning_rate": 7.495106821821655e-05, "loss": 0.0753, "step": 2030 }, { "epoch": 0.3557415642165838, "grad_norm": 0.09115653788000529, "learning_rate": 7.470593294441124e-05, "loss": 0.0744, "step": 2040 }, { "epoch": 0.3574853954137239, "grad_norm": 0.13456888345338583, "learning_rate": 7.44600094679752e-05, "loss": 0.0738, "step": 2050 }, { "epoch": 0.3592292266108641, "grad_norm": 0.06806889038779979, "learning_rate": 7.421330563469716e-05, "loss": 0.0739, "step": 2060 }, { "epoch": 0.3609730578080042, "grad_norm": 0.11182927221959478, "learning_rate": 7.396582931526193e-05, "loss": 0.0735, "step": 2070 }, { "epoch": 0.3627168890051443, "grad_norm": 0.06913605337271556, "learning_rate": 7.37175884049992e-05, "loss": 0.0723, "step": 2080 }, { "epoch": 0.3644607202022844, "grad_norm": 0.06420773618946907, "learning_rate": 7.346859082363171e-05, "loss": 0.0745, "step": 2090 }, { "epoch": 0.36620455139942454, "grad_norm": 0.09270103993797899, "learning_rate": 7.321884451502252e-05, "loss": 0.072, "step": 2100 }, { "epoch": 0.36794838259656465, "grad_norm": 0.23156282710055895, "learning_rate": 7.296835744692163e-05, "loss": 0.0719, "step": 2110 }, { "epoch": 0.36969221379370476, "grad_norm": 0.07236089370303052, "learning_rate": 7.271713761071181e-05, "loss": 0.0727, "step": 2120 }, { "epoch": 0.37143604499084487, "grad_norm": 0.0896827167833697, "learning_rate": 7.246519302115355e-05, "loss": 0.0721, "step": 2130 }, { "epoch": 0.373179876187985, "grad_norm": 0.07998918062338511, "learning_rate": 7.221253171612944e-05, "loss": 0.0748, "step": 2140 }, { "epoch": 0.3749237073851251, "grad_norm": 0.08554261487613933, "learning_rate": 7.195916175638772e-05, "loss": 0.0733, "step": 2150 }, { "epoch": 0.37666753858226526, "grad_norm": 0.11129041716978767, "learning_rate": 7.170509122528512e-05, "loss": 0.075, "step": 2160 }, { "epoch": 0.37841136977940537, "grad_norm": 0.10613761388491819, "learning_rate": 7.14503282285289e-05, "loss": 0.0757, "step": 2170 }, { "epoch": 0.3801552009765455, "grad_norm": 0.11019197584016041, "learning_rate": 7.119488089391835e-05, "loss": 0.0744, "step": 2180 }, { "epoch": 0.3818990321736856, "grad_norm": 0.06347022683878861, "learning_rate": 7.093875737108549e-05, "loss": 0.0719, "step": 2190 }, { "epoch": 0.3836428633708257, "grad_norm": 0.21254740042731002, "learning_rate": 7.068196583123496e-05, "loss": 0.0748, "step": 2200 }, { "epoch": 0.3853866945679658, "grad_norm": 0.07496306455978158, "learning_rate": 7.042451446688342e-05, "loss": 0.0746, "step": 2210 }, { "epoch": 0.3871305257651059, "grad_norm": 0.3232649711858286, "learning_rate": 7.016641149159815e-05, "loss": 0.0717, "step": 2220 }, { "epoch": 0.38887435696224604, "grad_norm": 0.11723985352277601, "learning_rate": 6.990766513973503e-05, "loss": 0.0723, "step": 2230 }, { "epoch": 0.39061818815938615, "grad_norm": 0.06703767679829058, "learning_rate": 6.964828366617583e-05, "loss": 0.0723, "step": 2240 }, { "epoch": 0.3923620193565263, "grad_norm": 0.06452655503833808, "learning_rate": 6.938827534606483e-05, "loss": 0.0726, "step": 2250 }, { "epoch": 0.3941058505536664, "grad_norm": 0.11182644422545931, "learning_rate": 6.912764847454485e-05, "loss": 0.0738, "step": 2260 }, { "epoch": 0.39584968175080654, "grad_norm": 0.0860141601894516, "learning_rate": 6.886641136649255e-05, "loss": 0.073, "step": 2270 }, { "epoch": 0.39759351294794665, "grad_norm": 0.0797656680775507, "learning_rate": 6.860457235625322e-05, "loss": 0.0737, "step": 2280 }, { "epoch": 0.39933734414508676, "grad_norm": 0.10052131057107375, "learning_rate": 6.834213979737487e-05, "loss": 0.0733, "step": 2290 }, { "epoch": 0.40108117534222687, "grad_norm": 0.10115391466030552, "learning_rate": 6.807912206234168e-05, "loss": 0.0763, "step": 2300 }, { "epoch": 0.402825006539367, "grad_norm": 0.06133679525617129, "learning_rate": 6.7815527542307e-05, "loss": 0.0742, "step": 2310 }, { "epoch": 0.4045688377365071, "grad_norm": 0.09619739127280517, "learning_rate": 6.755136464682545e-05, "loss": 0.0725, "step": 2320 }, { "epoch": 0.4063126689336472, "grad_norm": 0.09091646569455752, "learning_rate": 6.728664180358487e-05, "loss": 0.0721, "step": 2330 }, { "epoch": 0.4080565001307873, "grad_norm": 0.07656697054608402, "learning_rate": 6.702136745813721e-05, "loss": 0.074, "step": 2340 }, { "epoch": 0.4098003313279275, "grad_norm": 0.07204279463021213, "learning_rate": 6.67555500736293e-05, "loss": 0.073, "step": 2350 }, { "epoch": 0.4115441625250676, "grad_norm": 0.09813297810926205, "learning_rate": 6.648919813053266e-05, "loss": 0.0718, "step": 2360 }, { "epoch": 0.4132879937222077, "grad_norm": 0.09532683884750826, "learning_rate": 6.62223201263731e-05, "loss": 0.0722, "step": 2370 }, { "epoch": 0.4150318249193478, "grad_norm": 0.14719992956290695, "learning_rate": 6.595492457545953e-05, "loss": 0.0721, "step": 2380 }, { "epoch": 0.4167756561164879, "grad_norm": 0.10011053762401607, "learning_rate": 6.568702000861234e-05, "loss": 0.0735, "step": 2390 }, { "epoch": 0.41851948731362804, "grad_norm": 0.08517512639021824, "learning_rate": 6.541861497289126e-05, "loss": 0.0731, "step": 2400 }, { "epoch": 0.42026331851076815, "grad_norm": 0.1059116573619961, "learning_rate": 6.514971803132264e-05, "loss": 0.0727, "step": 2410 }, { "epoch": 0.42200714970790826, "grad_norm": 0.07521574766408227, "learning_rate": 6.488033776262631e-05, "loss": 0.0746, "step": 2420 }, { "epoch": 0.42375098090504837, "grad_norm": 0.07789931436910726, "learning_rate": 6.461048276094189e-05, "loss": 0.0705, "step": 2430 }, { "epoch": 0.42549481210218854, "grad_norm": 0.07966418822617252, "learning_rate": 6.434016163555452e-05, "loss": 0.0731, "step": 2440 }, { "epoch": 0.42723864329932865, "grad_norm": 0.06805896335096397, "learning_rate": 6.406938301062032e-05, "loss": 0.0726, "step": 2450 }, { "epoch": 0.42898247449646876, "grad_norm": 0.09447753229565763, "learning_rate": 6.379815552489112e-05, "loss": 0.0706, "step": 2460 }, { "epoch": 0.43072630569360887, "grad_norm": 0.08285112104375023, "learning_rate": 6.352648783143904e-05, "loss": 0.0717, "step": 2470 }, { "epoch": 0.432470136890749, "grad_norm": 0.13098455739480758, "learning_rate": 6.325438859738016e-05, "loss": 0.0729, "step": 2480 }, { "epoch": 0.4342139680878891, "grad_norm": 0.08270950207818945, "learning_rate": 6.298186650359832e-05, "loss": 0.0707, "step": 2490 }, { "epoch": 0.4359577992850292, "grad_norm": 0.07129608509123005, "learning_rate": 6.270893024446788e-05, "loss": 0.0722, "step": 2500 }, { "epoch": 0.4377016304821693, "grad_norm": 0.10000820899680192, "learning_rate": 6.243558852757653e-05, "loss": 0.0705, "step": 2510 }, { "epoch": 0.4394454616793094, "grad_norm": 0.10373969721399491, "learning_rate": 6.216185007344744e-05, "loss": 0.0733, "step": 2520 }, { "epoch": 0.44118929287644953, "grad_norm": 0.10034693367567056, "learning_rate": 6.188772361526104e-05, "loss": 0.0735, "step": 2530 }, { "epoch": 0.4429331240735897, "grad_norm": 0.07190792039286192, "learning_rate": 6.161321789857635e-05, "loss": 0.0732, "step": 2540 }, { "epoch": 0.4446769552707298, "grad_norm": 0.06736312666924092, "learning_rate": 6.133834168105206e-05, "loss": 0.0718, "step": 2550 }, { "epoch": 0.4464207864678699, "grad_norm": 0.08694623472328794, "learning_rate": 6.106310373216706e-05, "loss": 0.0715, "step": 2560 }, { "epoch": 0.44816461766501003, "grad_norm": 0.06751618527562933, "learning_rate": 6.078751283294075e-05, "loss": 0.0727, "step": 2570 }, { "epoch": 0.44990844886215015, "grad_norm": 0.07166164012759571, "learning_rate": 6.051157777565274e-05, "loss": 0.0719, "step": 2580 }, { "epoch": 0.45165228005929026, "grad_norm": 0.07205363465786559, "learning_rate": 6.023530736356252e-05, "loss": 0.0711, "step": 2590 }, { "epoch": 0.45339611125643037, "grad_norm": 0.06726674291216705, "learning_rate": 5.9958710410628515e-05, "loss": 0.0731, "step": 2600 }, { "epoch": 0.4551399424535705, "grad_norm": 0.1362051473345596, "learning_rate": 5.96817957412269e-05, "loss": 0.0692, "step": 2610 }, { "epoch": 0.4568837736507106, "grad_norm": 0.07049177639671635, "learning_rate": 5.940457218987003e-05, "loss": 0.0703, "step": 2620 }, { "epoch": 0.4586276048478507, "grad_norm": 0.09112289372540164, "learning_rate": 5.912704860092473e-05, "loss": 0.0714, "step": 2630 }, { "epoch": 0.46037143604499087, "grad_norm": 0.11655945219337893, "learning_rate": 5.884923382832996e-05, "loss": 0.0723, "step": 2640 }, { "epoch": 0.462115267242131, "grad_norm": 0.0768073584235417, "learning_rate": 5.8571136735314456e-05, "loss": 0.0712, "step": 2650 }, { "epoch": 0.4638590984392711, "grad_norm": 0.10315448855852374, "learning_rate": 5.829276619411392e-05, "loss": 0.0693, "step": 2660 }, { "epoch": 0.4656029296364112, "grad_norm": 0.06863388845145796, "learning_rate": 5.801413108568797e-05, "loss": 0.0723, "step": 2670 }, { "epoch": 0.4673467608335513, "grad_norm": 0.07431300580022854, "learning_rate": 5.773524029943682e-05, "loss": 0.0723, "step": 2680 }, { "epoch": 0.4690905920306914, "grad_norm": 0.0678468394020785, "learning_rate": 5.745610273291766e-05, "loss": 0.0697, "step": 2690 }, { "epoch": 0.47083442322783153, "grad_norm": 0.051461083413721374, "learning_rate": 5.7176727291560814e-05, "loss": 0.0692, "step": 2700 }, { "epoch": 0.47257825442497164, "grad_norm": 0.3637961443238326, "learning_rate": 5.689712288838561e-05, "loss": 0.0708, "step": 2710 }, { "epoch": 0.47432208562211176, "grad_norm": 0.12899584659018307, "learning_rate": 5.661729844371601e-05, "loss": 0.0717, "step": 2720 }, { "epoch": 0.4760659168192519, "grad_norm": 0.0970116230063514, "learning_rate": 5.633726288489609e-05, "loss": 0.0702, "step": 2730 }, { "epoch": 0.47780974801639203, "grad_norm": 0.06925056867854305, "learning_rate": 5.6057025146005126e-05, "loss": 0.0694, "step": 2740 }, { "epoch": 0.47955357921353214, "grad_norm": 0.09518941391005381, "learning_rate": 5.577659416757267e-05, "loss": 0.0693, "step": 2750 }, { "epoch": 0.48129741041067226, "grad_norm": 0.0688787651664979, "learning_rate": 5.5495978896293244e-05, "loss": 0.0685, "step": 2760 }, { "epoch": 0.48304124160781237, "grad_norm": 0.09890275204254635, "learning_rate": 5.521518828474091e-05, "loss": 0.0712, "step": 2770 }, { "epoch": 0.4847850728049525, "grad_norm": 0.07029397820447766, "learning_rate": 5.4934231291083724e-05, "loss": 0.0705, "step": 2780 }, { "epoch": 0.4865289040020926, "grad_norm": 0.114703628881282, "learning_rate": 5.465311687879785e-05, "loss": 0.0728, "step": 2790 }, { "epoch": 0.4882727351992327, "grad_norm": 0.06900079674714125, "learning_rate": 5.4371854016381686e-05, "loss": 0.0693, "step": 2800 }, { "epoch": 0.4900165663963728, "grad_norm": 0.07064873333381244, "learning_rate": 5.409045167706962e-05, "loss": 0.069, "step": 2810 }, { "epoch": 0.4917603975935129, "grad_norm": 0.09125097079525542, "learning_rate": 5.380891883854591e-05, "loss": 0.0719, "step": 2820 }, { "epoch": 0.4935042287906531, "grad_norm": 0.053404142344694504, "learning_rate": 5.352726448265808e-05, "loss": 0.0716, "step": 2830 }, { "epoch": 0.4952480599877932, "grad_norm": 0.07962910381863947, "learning_rate": 5.3245497595130575e-05, "loss": 0.0702, "step": 2840 }, { "epoch": 0.4969918911849333, "grad_norm": 0.08362471604956066, "learning_rate": 5.296362716527788e-05, "loss": 0.0699, "step": 2850 }, { "epoch": 0.4987357223820734, "grad_norm": 0.06307457307002638, "learning_rate": 5.268166218571792e-05, "loss": 0.0687, "step": 2860 }, { "epoch": 0.5004795535792136, "grad_norm": 0.11296321834314839, "learning_rate": 5.239961165208499e-05, "loss": 0.069, "step": 2870 }, { "epoch": 0.5022233847763536, "grad_norm": 0.1053843511995927, "learning_rate": 5.211748456274291e-05, "loss": 0.0691, "step": 2880 }, { "epoch": 0.5039672159734938, "grad_norm": 0.1109187673591365, "learning_rate": 5.183528991849784e-05, "loss": 0.0704, "step": 2890 }, { "epoch": 0.5057110471706339, "grad_norm": 0.055240959366235146, "learning_rate": 5.155303672231123e-05, "loss": 0.0689, "step": 2900 }, { "epoch": 0.507454878367774, "grad_norm": 0.062304367575237746, "learning_rate": 5.127073397901248e-05, "loss": 0.0715, "step": 2910 }, { "epoch": 0.5091987095649141, "grad_norm": 0.09104354683280115, "learning_rate": 5.09883906950117e-05, "loss": 0.0715, "step": 2920 }, { "epoch": 0.5109425407620543, "grad_norm": 0.09335898355116179, "learning_rate": 5.070601587801246e-05, "loss": 0.0721, "step": 2930 }, { "epoch": 0.5126863719591943, "grad_norm": 0.1212833028716344, "learning_rate": 5.042361853672428e-05, "loss": 0.0688, "step": 2940 }, { "epoch": 0.5144302031563345, "grad_norm": 0.1267863802852585, "learning_rate": 5.0141207680575265e-05, "loss": 0.0692, "step": 2950 }, { "epoch": 0.5161740343534745, "grad_norm": 0.13749347686727084, "learning_rate": 4.985879231942474e-05, "loss": 0.069, "step": 2960 }, { "epoch": 0.5179178655506147, "grad_norm": 0.08673837874814297, "learning_rate": 4.957638146327574e-05, "loss": 0.0694, "step": 2970 }, { "epoch": 0.5196616967477549, "grad_norm": 0.09636753443359779, "learning_rate": 4.929398412198755e-05, "loss": 0.0703, "step": 2980 }, { "epoch": 0.5214055279448949, "grad_norm": 0.06507262712865346, "learning_rate": 4.9011609304988295e-05, "loss": 0.0683, "step": 2990 }, { "epoch": 0.5231493591420351, "grad_norm": 0.06619223766291939, "learning_rate": 4.8729266020987553e-05, "loss": 0.0677, "step": 3000 }, { "epoch": 0.5248931903391751, "grad_norm": 0.05287355674779802, "learning_rate": 4.844696327768878e-05, "loss": 0.0698, "step": 3010 }, { "epoch": 0.5266370215363153, "grad_norm": 0.10505577067699386, "learning_rate": 4.8164710081502165e-05, "loss": 0.0687, "step": 3020 }, { "epoch": 0.5283808527334554, "grad_norm": 0.056820349432236315, "learning_rate": 4.788251543725711e-05, "loss": 0.0699, "step": 3030 }, { "epoch": 0.5301246839305955, "grad_norm": 0.1260634689291685, "learning_rate": 4.760038834791503e-05, "loss": 0.0673, "step": 3040 }, { "epoch": 0.5318685151277356, "grad_norm": 0.0968264946104606, "learning_rate": 4.7318337814282085e-05, "loss": 0.0674, "step": 3050 }, { "epoch": 0.5336123463248758, "grad_norm": 0.085559283784732, "learning_rate": 4.703637283472213e-05, "loss": 0.0714, "step": 3060 }, { "epoch": 0.5353561775220159, "grad_norm": 0.09957416211753474, "learning_rate": 4.675450240486943e-05, "loss": 0.0706, "step": 3070 }, { "epoch": 0.537100008719156, "grad_norm": 0.06932794550036621, "learning_rate": 4.647273551734192e-05, "loss": 0.0706, "step": 3080 }, { "epoch": 0.5388438399162961, "grad_norm": 0.0959577561976307, "learning_rate": 4.619108116145411e-05, "loss": 0.0686, "step": 3090 }, { "epoch": 0.5405876711134362, "grad_norm": 0.11053090195002527, "learning_rate": 4.5909548322930386e-05, "loss": 0.0685, "step": 3100 }, { "epoch": 0.5423315023105764, "grad_norm": 0.1385181499715123, "learning_rate": 4.562814598361834e-05, "loss": 0.069, "step": 3110 }, { "epoch": 0.5440753335077164, "grad_norm": 0.0860474878686007, "learning_rate": 4.534688312120215e-05, "loss": 0.0719, "step": 3120 }, { "epoch": 0.5458191647048566, "grad_norm": 0.06367906769350236, "learning_rate": 4.506576870891628e-05, "loss": 0.0667, "step": 3130 }, { "epoch": 0.5475629959019966, "grad_norm": 0.056417791244654396, "learning_rate": 4.478481171525909e-05, "loss": 0.0696, "step": 3140 }, { "epoch": 0.5493068270991368, "grad_norm": 0.07510052439666184, "learning_rate": 4.450402110370677e-05, "loss": 0.07, "step": 3150 }, { "epoch": 0.551050658296277, "grad_norm": 0.07820484635095121, "learning_rate": 4.422340583242733e-05, "loss": 0.071, "step": 3160 }, { "epoch": 0.552794489493417, "grad_norm": 0.1104677720885283, "learning_rate": 4.3942974853994885e-05, "loss": 0.0688, "step": 3170 }, { "epoch": 0.5545383206905572, "grad_norm": 0.09272002167916459, "learning_rate": 4.366273711510392e-05, "loss": 0.0681, "step": 3180 }, { "epoch": 0.5562821518876973, "grad_norm": 0.1036837406322013, "learning_rate": 4.3382701556284006e-05, "loss": 0.0677, "step": 3190 }, { "epoch": 0.5580259830848374, "grad_norm": 0.07390053294467375, "learning_rate": 4.3102877111614406e-05, "loss": 0.0661, "step": 3200 }, { "epoch": 0.5597698142819775, "grad_norm": 0.10154107392434804, "learning_rate": 4.282327270843919e-05, "loss": 0.0678, "step": 3210 }, { "epoch": 0.5615136454791176, "grad_norm": 0.05726446240521516, "learning_rate": 4.2543897267082346e-05, "loss": 0.0704, "step": 3220 }, { "epoch": 0.5632574766762577, "grad_norm": 0.07584421387499146, "learning_rate": 4.226475970056319e-05, "loss": 0.0682, "step": 3230 }, { "epoch": 0.5650013078733979, "grad_norm": 0.12385055066323168, "learning_rate": 4.1985868914312035e-05, "loss": 0.0671, "step": 3240 }, { "epoch": 0.5667451390705379, "grad_norm": 0.057812117646785455, "learning_rate": 4.1707233805886096e-05, "loss": 0.0673, "step": 3250 }, { "epoch": 0.5684889702676781, "grad_norm": 0.05830094610213767, "learning_rate": 4.1428863264685556e-05, "loss": 0.0681, "step": 3260 }, { "epoch": 0.5702328014648183, "grad_norm": 0.08814953133237691, "learning_rate": 4.1150766171670044e-05, "loss": 0.0677, "step": 3270 }, { "epoch": 0.5719766326619583, "grad_norm": 0.08651430164476416, "learning_rate": 4.087295139907528e-05, "loss": 0.0677, "step": 3280 }, { "epoch": 0.5737204638590985, "grad_norm": 0.23832741853388886, "learning_rate": 4.059542781012998e-05, "loss": 0.0678, "step": 3290 }, { "epoch": 0.5754642950562385, "grad_norm": 0.05885238799413647, "learning_rate": 4.0318204258773126e-05, "loss": 0.0684, "step": 3300 }, { "epoch": 0.5772081262533787, "grad_norm": 0.07791148857337749, "learning_rate": 4.00412895893715e-05, "loss": 0.0687, "step": 3310 }, { "epoch": 0.5789519574505188, "grad_norm": 0.06786486116480168, "learning_rate": 3.9764692636437484e-05, "loss": 0.0679, "step": 3320 }, { "epoch": 0.5806957886476589, "grad_norm": 0.05788405804070718, "learning_rate": 3.948842222434728e-05, "loss": 0.0697, "step": 3330 }, { "epoch": 0.582439619844799, "grad_norm": 0.06948825516042008, "learning_rate": 3.921248716705927e-05, "loss": 0.067, "step": 3340 }, { "epoch": 0.5841834510419391, "grad_norm": 0.09718375497294167, "learning_rate": 3.8936896267832935e-05, "loss": 0.0671, "step": 3350 }, { "epoch": 0.5859272822390793, "grad_norm": 0.32522050061514907, "learning_rate": 3.866165831894796e-05, "loss": 0.0688, "step": 3360 }, { "epoch": 0.5876711134362194, "grad_norm": 0.09966939851122777, "learning_rate": 3.8386782101423665e-05, "loss": 0.0688, "step": 3370 }, { "epoch": 0.5894149446333595, "grad_norm": 0.1063520167218931, "learning_rate": 3.811227638473897e-05, "loss": 0.0681, "step": 3380 }, { "epoch": 0.5911587758304996, "grad_norm": 0.07790458741475327, "learning_rate": 3.783814992655256e-05, "loss": 0.0694, "step": 3390 }, { "epoch": 0.5929026070276397, "grad_norm": 0.07766471726971445, "learning_rate": 3.7564411472423464e-05, "loss": 0.0696, "step": 3400 }, { "epoch": 0.5946464382247798, "grad_norm": 0.060585652583308065, "learning_rate": 3.729106975553214e-05, "loss": 0.0659, "step": 3410 }, { "epoch": 0.59639026942192, "grad_norm": 0.1309180860317793, "learning_rate": 3.701813349640169e-05, "loss": 0.0662, "step": 3420 }, { "epoch": 0.59813410061906, "grad_norm": 0.13278490095026282, "learning_rate": 3.674561140261983e-05, "loss": 0.0675, "step": 3430 }, { "epoch": 0.5998779318162002, "grad_norm": 0.08819239845829921, "learning_rate": 3.647351216856099e-05, "loss": 0.0661, "step": 3440 }, { "epoch": 0.6016217630133404, "grad_norm": 0.0694256875185968, "learning_rate": 3.620184447510888e-05, "loss": 0.0691, "step": 3450 }, { "epoch": 0.6033655942104804, "grad_norm": 0.05045493054224779, "learning_rate": 3.5930616989379695e-05, "loss": 0.0654, "step": 3460 }, { "epoch": 0.6051094254076206, "grad_norm": 0.05387040717024746, "learning_rate": 3.5659838364445505e-05, "loss": 0.0672, "step": 3470 }, { "epoch": 0.6068532566047606, "grad_norm": 0.05687136811369198, "learning_rate": 3.5389517239058126e-05, "loss": 0.0665, "step": 3480 }, { "epoch": 0.6085970878019008, "grad_norm": 0.17637553404307893, "learning_rate": 3.511966223737368e-05, "loss": 0.0676, "step": 3490 }, { "epoch": 0.6103409189990409, "grad_norm": 0.09417795829049438, "learning_rate": 3.485028196867738e-05, "loss": 0.0656, "step": 3500 }, { "epoch": 0.612084750196181, "grad_norm": 0.09675688928564037, "learning_rate": 3.458138502710876e-05, "loss": 0.0662, "step": 3510 }, { "epoch": 0.6138285813933211, "grad_norm": 0.06982987968601855, "learning_rate": 3.431297999138768e-05, "loss": 0.0669, "step": 3520 }, { "epoch": 0.6155724125904612, "grad_norm": 0.060806762419563676, "learning_rate": 3.4045075424540484e-05, "loss": 0.0665, "step": 3530 }, { "epoch": 0.6173162437876014, "grad_norm": 0.057810486903244925, "learning_rate": 3.37776798736269e-05, "loss": 0.0687, "step": 3540 }, { "epoch": 0.6190600749847415, "grad_norm": 0.09052003071252049, "learning_rate": 3.3510801869467354e-05, "loss": 0.0687, "step": 3550 }, { "epoch": 0.6208039061818816, "grad_norm": 0.05915583777230159, "learning_rate": 3.324444992637071e-05, "loss": 0.0667, "step": 3560 }, { "epoch": 0.6225477373790217, "grad_norm": 0.056903572931067674, "learning_rate": 3.297863254186279e-05, "loss": 0.0661, "step": 3570 }, { "epoch": 0.6242915685761619, "grad_norm": 0.07457729627076666, "learning_rate": 3.2713358196415146e-05, "loss": 0.067, "step": 3580 }, { "epoch": 0.6260353997733019, "grad_norm": 0.0688133988676, "learning_rate": 3.244863535317455e-05, "loss": 0.0664, "step": 3590 }, { "epoch": 0.6277792309704421, "grad_norm": 0.08316153717072015, "learning_rate": 3.2184472457693006e-05, "loss": 0.0663, "step": 3600 }, { "epoch": 0.6295230621675821, "grad_norm": 0.07001307487762984, "learning_rate": 3.192087793765832e-05, "loss": 0.0647, "step": 3610 }, { "epoch": 0.6312668933647223, "grad_norm": 0.15620567579347566, "learning_rate": 3.1657860202625146e-05, "loss": 0.0701, "step": 3620 }, { "epoch": 0.6330107245618624, "grad_norm": 0.10373195476410263, "learning_rate": 3.1395427643746796e-05, "loss": 0.0668, "step": 3630 }, { "epoch": 0.6347545557590025, "grad_norm": 0.0734675356947609, "learning_rate": 3.113358863350747e-05, "loss": 0.0681, "step": 3640 }, { "epoch": 0.6364983869561427, "grad_norm": 0.07535325822125089, "learning_rate": 3.0872351525455166e-05, "loss": 0.0652, "step": 3650 }, { "epoch": 0.6382422181532827, "grad_norm": 0.15252639717160144, "learning_rate": 3.061172465393518e-05, "loss": 0.0656, "step": 3660 }, { "epoch": 0.6399860493504229, "grad_norm": 0.052833579208426094, "learning_rate": 3.035171633382419e-05, "loss": 0.0658, "step": 3670 }, { "epoch": 0.641729880547563, "grad_norm": 0.14292022401062196, "learning_rate": 3.009233486026497e-05, "loss": 0.0664, "step": 3680 }, { "epoch": 0.6434737117447031, "grad_norm": 0.10589992815491318, "learning_rate": 2.9833588508401866e-05, "loss": 0.0657, "step": 3690 }, { "epoch": 0.6452175429418432, "grad_norm": 0.04779313436976461, "learning_rate": 2.9575485533116598e-05, "loss": 0.0653, "step": 3700 }, { "epoch": 0.6469613741389834, "grad_norm": 0.05400461509544075, "learning_rate": 2.9318034168765046e-05, "loss": 0.0667, "step": 3710 }, { "epoch": 0.6487052053361234, "grad_norm": 0.059519586489227526, "learning_rate": 2.906124262891451e-05, "loss": 0.0672, "step": 3720 }, { "epoch": 0.6504490365332636, "grad_norm": 0.0648463847094917, "learning_rate": 2.880511910608164e-05, "loss": 0.0679, "step": 3730 }, { "epoch": 0.6521928677304037, "grad_norm": 0.1183811255273542, "learning_rate": 2.8549671771471133e-05, "loss": 0.0648, "step": 3740 }, { "epoch": 0.6539366989275438, "grad_norm": 0.05476695117570433, "learning_rate": 2.829490877471491e-05, "loss": 0.0642, "step": 3750 }, { "epoch": 0.655680530124684, "grad_norm": 0.06820911786107267, "learning_rate": 2.8040838243612288e-05, "loss": 0.0665, "step": 3760 }, { "epoch": 0.657424361321824, "grad_norm": 0.10776643187307983, "learning_rate": 2.7787468283870577e-05, "loss": 0.0663, "step": 3770 }, { "epoch": 0.6591681925189642, "grad_norm": 0.11186248154441321, "learning_rate": 2.7534806978846465e-05, "loss": 0.0667, "step": 3780 }, { "epoch": 0.6609120237161042, "grad_norm": 0.11155104114763678, "learning_rate": 2.7282862389288206e-05, "loss": 0.0648, "step": 3790 }, { "epoch": 0.6626558549132444, "grad_norm": 0.08353799399740781, "learning_rate": 2.7031642553078374e-05, "loss": 0.0663, "step": 3800 }, { "epoch": 0.6643996861103845, "grad_norm": 0.1252138214848371, "learning_rate": 2.6781155484977493e-05, "loss": 0.0659, "step": 3810 }, { "epoch": 0.6661435173075246, "grad_norm": 0.11677307876369836, "learning_rate": 2.6531409176368295e-05, "loss": 0.0652, "step": 3820 }, { "epoch": 0.6678873485046648, "grad_norm": 0.13081371005512618, "learning_rate": 2.6282411595000812e-05, "loss": 0.0663, "step": 3830 }, { "epoch": 0.6696311797018049, "grad_norm": 0.09064110239185938, "learning_rate": 2.6034170684738064e-05, "loss": 0.0657, "step": 3840 }, { "epoch": 0.671375010898945, "grad_norm": 0.10497326491021886, "learning_rate": 2.5786694365302856e-05, "loss": 0.0662, "step": 3850 }, { "epoch": 0.6731188420960851, "grad_norm": 0.1894881130574952, "learning_rate": 2.5539990532024825e-05, "loss": 0.0663, "step": 3860 }, { "epoch": 0.6748626732932252, "grad_norm": 0.11340703925871584, "learning_rate": 2.5294067055588765e-05, "loss": 0.0647, "step": 3870 }, { "epoch": 0.6766065044903653, "grad_norm": 0.0674795553477859, "learning_rate": 2.5048931781783456e-05, "loss": 0.0663, "step": 3880 }, { "epoch": 0.6783503356875055, "grad_norm": 0.06649329429370544, "learning_rate": 2.480459253125132e-05, "loss": 0.0654, "step": 3890 }, { "epoch": 0.6800941668846455, "grad_norm": 0.098929499241832, "learning_rate": 2.456105709923897e-05, "loss": 0.0651, "step": 3900 }, { "epoch": 0.6818379980817857, "grad_norm": 0.06386861210237169, "learning_rate": 2.4318333255348525e-05, "loss": 0.0676, "step": 3910 }, { "epoch": 0.6835818292789257, "grad_norm": 0.08662939273115117, "learning_rate": 2.4076428743289608e-05, "loss": 0.0647, "step": 3920 }, { "epoch": 0.6853256604760659, "grad_norm": 0.1405260835479582, "learning_rate": 2.3835351280632513e-05, "loss": 0.0681, "step": 3930 }, { "epoch": 0.6870694916732061, "grad_norm": 0.07786293851164922, "learning_rate": 2.3595108558561812e-05, "loss": 0.0661, "step": 3940 }, { "epoch": 0.6888133228703461, "grad_norm": 0.08244123534587308, "learning_rate": 2.3355708241630998e-05, "loss": 0.0644, "step": 3950 }, { "epoch": 0.6905571540674863, "grad_norm": 0.08600292121296654, "learning_rate": 2.311715796751805e-05, "loss": 0.0696, "step": 3960 }, { "epoch": 0.6923009852646264, "grad_norm": 0.06809070228006706, "learning_rate": 2.2879465346781703e-05, "loss": 0.0667, "step": 3970 }, { "epoch": 0.6940448164617665, "grad_norm": 0.10596061042299985, "learning_rate": 2.264263796261864e-05, "loss": 0.0662, "step": 3980 }, { "epoch": 0.6957886476589066, "grad_norm": 0.05600677052268759, "learning_rate": 2.2406683370621618e-05, "loss": 0.065, "step": 3990 }, { "epoch": 0.6975324788560467, "grad_norm": 0.12350042680040363, "learning_rate": 2.2171609098538278e-05, "loss": 0.0674, "step": 4000 }, { "epoch": 0.6992763100531868, "grad_norm": 0.08609523107160226, "learning_rate": 2.1937422646031214e-05, "loss": 0.0672, "step": 4010 }, { "epoch": 0.701020141250327, "grad_norm": 0.06523963542097193, "learning_rate": 2.170413148443852e-05, "loss": 0.0645, "step": 4020 }, { "epoch": 0.7027639724474671, "grad_norm": 0.09613588330972808, "learning_rate": 2.1471743056535455e-05, "loss": 0.0655, "step": 4030 }, { "epoch": 0.7045078036446072, "grad_norm": 0.08438802353157454, "learning_rate": 2.124026477629706e-05, "loss": 0.0648, "step": 4040 }, { "epoch": 0.7062516348417474, "grad_norm": 0.0838100106180724, "learning_rate": 2.100970402866164e-05, "loss": 0.0649, "step": 4050 }, { "epoch": 0.7079954660388874, "grad_norm": 0.048878932271227936, "learning_rate": 2.0780068169295032e-05, "loss": 0.0646, "step": 4060 }, { "epoch": 0.7097392972360276, "grad_norm": 0.05490806540035838, "learning_rate": 2.0551364524356054e-05, "loss": 0.0637, "step": 4070 }, { "epoch": 0.7114831284331676, "grad_norm": 0.0928466829346082, "learning_rate": 2.0323600390262742e-05, "loss": 0.0631, "step": 4080 }, { "epoch": 0.7132269596303078, "grad_norm": 0.07478264466938572, "learning_rate": 2.0096783033459564e-05, "loss": 0.0653, "step": 4090 }, { "epoch": 0.7149707908274479, "grad_norm": 0.062316603777551216, "learning_rate": 1.987091969018561e-05, "loss": 0.0648, "step": 4100 }, { "epoch": 0.716714622024588, "grad_norm": 0.09437152013364276, "learning_rate": 1.9646017566243658e-05, "loss": 0.0661, "step": 4110 }, { "epoch": 0.7184584532217282, "grad_norm": 0.05322246323194081, "learning_rate": 1.9422083836770406e-05, "loss": 0.0667, "step": 4120 }, { "epoch": 0.7202022844188682, "grad_norm": 0.07479897442184201, "learning_rate": 1.919912564600753e-05, "loss": 0.0648, "step": 4130 }, { "epoch": 0.7219461156160084, "grad_norm": 0.05492038679380218, "learning_rate": 1.8977150107073633e-05, "loss": 0.0642, "step": 4140 }, { "epoch": 0.7236899468131485, "grad_norm": 0.08536624329932077, "learning_rate": 1.8756164301737476e-05, "loss": 0.0626, "step": 4150 }, { "epoch": 0.7254337780102886, "grad_norm": 0.06859524210266144, "learning_rate": 1.853617528019197e-05, "loss": 0.0653, "step": 4160 }, { "epoch": 0.7271776092074287, "grad_norm": 0.12160361687651873, "learning_rate": 1.831719006082924e-05, "loss": 0.0653, "step": 4170 }, { "epoch": 0.7289214404045689, "grad_norm": 0.18282902871378182, "learning_rate": 1.809921563001676e-05, "loss": 0.0667, "step": 4180 }, { "epoch": 0.7306652716017089, "grad_norm": 0.0856597489019827, "learning_rate": 1.7882258941874432e-05, "loss": 0.0663, "step": 4190 }, { "epoch": 0.7324091027988491, "grad_norm": 0.10649135403842602, "learning_rate": 1.7666326918052667e-05, "loss": 0.0642, "step": 4200 }, { "epoch": 0.7341529339959892, "grad_norm": 0.12225388455758668, "learning_rate": 1.745142644751177e-05, "loss": 0.0657, "step": 4210 }, { "epoch": 0.7358967651931293, "grad_norm": 0.11567104795291488, "learning_rate": 1.7237564386301868e-05, "loss": 0.0661, "step": 4220 }, { "epoch": 0.7376405963902695, "grad_norm": 0.07264396441233593, "learning_rate": 1.702474755734441e-05, "loss": 0.0657, "step": 4230 }, { "epoch": 0.7393844275874095, "grad_norm": 0.07898587680558024, "learning_rate": 1.6812982750214385e-05, "loss": 0.0648, "step": 4240 }, { "epoch": 0.7411282587845497, "grad_norm": 0.05769295801958345, "learning_rate": 1.660227672092373e-05, "loss": 0.0646, "step": 4250 }, { "epoch": 0.7428720899816897, "grad_norm": 0.09139721064795, "learning_rate": 1.6392636191705817e-05, "loss": 0.0648, "step": 4260 }, { "epoch": 0.7446159211788299, "grad_norm": 0.04328001632629656, "learning_rate": 1.618406785080095e-05, "loss": 0.0642, "step": 4270 }, { "epoch": 0.74635975237597, "grad_norm": 0.10300148190920724, "learning_rate": 1.5976578352243017e-05, "loss": 0.065, "step": 4280 }, { "epoch": 0.7481035835731101, "grad_norm": 0.05439102528085213, "learning_rate": 1.5770174315647186e-05, "loss": 0.0641, "step": 4290 }, { "epoch": 0.7498474147702502, "grad_norm": 0.07283525225952207, "learning_rate": 1.5564862325998753e-05, "loss": 0.0663, "step": 4300 }, { "epoch": 0.7515912459673904, "grad_norm": 0.08943729068392149, "learning_rate": 1.5360648933442977e-05, "loss": 0.0641, "step": 4310 }, { "epoch": 0.7533350771645305, "grad_norm": 0.06719452778739451, "learning_rate": 1.5157540653076219e-05, "loss": 0.0642, "step": 4320 }, { "epoch": 0.7550789083616706, "grad_norm": 0.05251648332079966, "learning_rate": 1.4955543964738e-05, "loss": 0.066, "step": 4330 }, { "epoch": 0.7568227395588107, "grad_norm": 0.05274636216164364, "learning_rate": 1.4754665312804311e-05, "loss": 0.0641, "step": 4340 }, { "epoch": 0.7585665707559508, "grad_norm": 0.11011665466316892, "learning_rate": 1.4554911105982021e-05, "loss": 0.0639, "step": 4350 }, { "epoch": 0.760310401953091, "grad_norm": 0.10498230203392513, "learning_rate": 1.4356287717104383e-05, "loss": 0.064, "step": 4360 }, { "epoch": 0.762054233150231, "grad_norm": 0.09524120006808019, "learning_rate": 1.4158801482927764e-05, "loss": 0.065, "step": 4370 }, { "epoch": 0.7637980643473712, "grad_norm": 0.09453574859798163, "learning_rate": 1.3962458703929459e-05, "loss": 0.0643, "step": 4380 }, { "epoch": 0.7655418955445112, "grad_norm": 0.056312588367211, "learning_rate": 1.376726564410663e-05, "loss": 0.0638, "step": 4390 }, { "epoch": 0.7672857267416514, "grad_norm": 0.05345047180852192, "learning_rate": 1.3573228530776605e-05, "loss": 0.0641, "step": 4400 }, { "epoch": 0.7690295579387916, "grad_norm": 0.058868492999594034, "learning_rate": 1.3380353554378073e-05, "loss": 0.0652, "step": 4410 }, { "epoch": 0.7707733891359316, "grad_norm": 0.05698080816187847, "learning_rate": 1.3188646868273613e-05, "loss": 0.0633, "step": 4420 }, { "epoch": 0.7725172203330718, "grad_norm": 0.10673862985608311, "learning_rate": 1.2998114588553429e-05, "loss": 0.0625, "step": 4430 }, { "epoch": 0.7742610515302119, "grad_norm": 0.08436740252106527, "learning_rate": 1.2808762793840201e-05, "loss": 0.0649, "step": 4440 }, { "epoch": 0.776004882727352, "grad_norm": 0.06436259428184785, "learning_rate": 1.2620597525095136e-05, "loss": 0.0646, "step": 4450 }, { "epoch": 0.7777487139244921, "grad_norm": 0.06691706834113023, "learning_rate": 1.2433624785425291e-05, "loss": 0.0654, "step": 4460 }, { "epoch": 0.7794925451216322, "grad_norm": 0.10712410910395756, "learning_rate": 1.2247850539891948e-05, "loss": 0.0641, "step": 4470 }, { "epoch": 0.7812363763187723, "grad_norm": 0.08342722973198667, "learning_rate": 1.206328071532048e-05, "loss": 0.0637, "step": 4480 }, { "epoch": 0.7829802075159125, "grad_norm": 0.056140033330775385, "learning_rate": 1.187992120011111e-05, "loss": 0.0636, "step": 4490 }, { "epoch": 0.7847240387130526, "grad_norm": 0.1470353453729844, "learning_rate": 1.1697777844051105e-05, "loss": 0.0623, "step": 4500 }, { "epoch": 0.7864678699101927, "grad_norm": 0.06174798787453687, "learning_rate": 1.1516856458128167e-05, "loss": 0.0637, "step": 4510 }, { "epoch": 0.7882117011073329, "grad_norm": 0.10563261980312372, "learning_rate": 1.133716281434502e-05, "loss": 0.0618, "step": 4520 }, { "epoch": 0.7899555323044729, "grad_norm": 0.059155903025558475, "learning_rate": 1.1158702645535286e-05, "loss": 0.0654, "step": 4530 }, { "epoch": 0.7916993635016131, "grad_norm": 0.2236752898985674, "learning_rate": 1.0981481645180564e-05, "loss": 0.0655, "step": 4540 }, { "epoch": 0.7934431946987531, "grad_norm": 0.05542585716628748, "learning_rate": 1.080550546722876e-05, "loss": 0.0632, "step": 4550 }, { "epoch": 0.7951870258958933, "grad_norm": 0.12157637986614808, "learning_rate": 1.063077972591382e-05, "loss": 0.0619, "step": 4560 }, { "epoch": 0.7969308570930334, "grad_norm": 0.06387726990724843, "learning_rate": 1.0457309995576497e-05, "loss": 0.063, "step": 4570 }, { "epoch": 0.7986746882901735, "grad_norm": 0.08939875101367574, "learning_rate": 1.0285101810486535e-05, "loss": 0.0632, "step": 4580 }, { "epoch": 0.8004185194873136, "grad_norm": 0.06361929804120545, "learning_rate": 1.0114160664666155e-05, "loss": 0.0638, "step": 4590 }, { "epoch": 0.8021623506844537, "grad_norm": 0.08559047394309637, "learning_rate": 9.94449201171479e-06, "loss": 0.0623, "step": 4600 }, { "epoch": 0.8039061818815939, "grad_norm": 0.1318141758332588, "learning_rate": 9.776101264634969e-06, "loss": 0.0635, "step": 4610 }, { "epoch": 0.805650013078734, "grad_norm": 0.12389032549293967, "learning_rate": 9.608993795659765e-06, "loss": 0.064, "step": 4620 }, { "epoch": 0.8073938442758741, "grad_norm": 0.0823315848080731, "learning_rate": 9.443174936081345e-06, "loss": 0.0638, "step": 4630 }, { "epoch": 0.8091376754730142, "grad_norm": 0.051973482541530225, "learning_rate": 9.278649976080889e-06, "loss": 0.0649, "step": 4640 }, { "epoch": 0.8108815066701544, "grad_norm": 0.059530345330728256, "learning_rate": 9.11542416455981e-06, "loss": 0.0656, "step": 4650 }, { "epoch": 0.8126253378672944, "grad_norm": 0.10552903129002746, "learning_rate": 8.953502708972278e-06, "loss": 0.0638, "step": 4660 }, { "epoch": 0.8143691690644346, "grad_norm": 0.07221277969124819, "learning_rate": 8.792890775159125e-06, "loss": 0.0638, "step": 4670 }, { "epoch": 0.8161130002615746, "grad_norm": 0.0772500242885849, "learning_rate": 8.633593487183067e-06, "loss": 0.0663, "step": 4680 }, { "epoch": 0.8178568314587148, "grad_norm": 0.06591420894663506, "learning_rate": 8.475615927165093e-06, "loss": 0.0623, "step": 4690 }, { "epoch": 0.819600662655855, "grad_norm": 0.06507913039563108, "learning_rate": 8.31896313512247e-06, "loss": 0.0634, "step": 4700 }, { "epoch": 0.821344493852995, "grad_norm": 0.07878320811810173, "learning_rate": 8.163640108807896e-06, "loss": 0.0644, "step": 4710 }, { "epoch": 0.8230883250501352, "grad_norm": 0.04897281046480619, "learning_rate": 8.009651803550045e-06, "loss": 0.0624, "step": 4720 }, { "epoch": 0.8248321562472752, "grad_norm": 0.0952496494325907, "learning_rate": 7.85700313209548e-06, "loss": 0.065, "step": 4730 }, { "epoch": 0.8265759874444154, "grad_norm": 0.05733587546669899, "learning_rate": 7.70569896445194e-06, "loss": 0.0656, "step": 4740 }, { "epoch": 0.8283198186415555, "grad_norm": 0.09396840285731554, "learning_rate": 7.555744127732922e-06, "loss": 0.065, "step": 4750 }, { "epoch": 0.8300636498386956, "grad_norm": 0.10474980239912153, "learning_rate": 7.40714340600378e-06, "loss": 0.0631, "step": 4760 }, { "epoch": 0.8318074810358357, "grad_norm": 0.05498607201861292, "learning_rate": 7.2599015401289496e-06, "loss": 0.0635, "step": 4770 }, { "epoch": 0.8335513122329758, "grad_norm": 0.055760501809272785, "learning_rate": 7.114023227620831e-06, "loss": 0.0642, "step": 4780 }, { "epoch": 0.835295143430116, "grad_norm": 0.05306162693666742, "learning_rate": 6.969513122489863e-06, "loss": 0.0597, "step": 4790 }, { "epoch": 0.8370389746272561, "grad_norm": 0.1088676487625842, "learning_rate": 6.826375835096038e-06, "loss": 0.0644, "step": 4800 }, { "epoch": 0.8387828058243962, "grad_norm": 0.07413608503611739, "learning_rate": 6.6846159320018475e-06, "loss": 0.0623, "step": 4810 }, { "epoch": 0.8405266370215363, "grad_norm": 0.128062087657044, "learning_rate": 6.5442379358265585e-06, "loss": 0.0629, "step": 4820 }, { "epoch": 0.8422704682186765, "grad_norm": 0.0711990370216347, "learning_rate": 6.405246325101954e-06, "loss": 0.064, "step": 4830 }, { "epoch": 0.8440142994158165, "grad_norm": 0.05618385822056352, "learning_rate": 6.267645534129446e-06, "loss": 0.0625, "step": 4840 }, { "epoch": 0.8457581306129567, "grad_norm": 0.059419896247745525, "learning_rate": 6.131439952838608e-06, "loss": 0.0639, "step": 4850 }, { "epoch": 0.8475019618100967, "grad_norm": 0.051860827184772876, "learning_rate": 5.996633926647083e-06, "loss": 0.0658, "step": 4860 }, { "epoch": 0.8492457930072369, "grad_norm": 0.08959041805389602, "learning_rate": 5.863231756322019e-06, "loss": 0.0641, "step": 4870 }, { "epoch": 0.8509896242043771, "grad_norm": 0.08513940911176207, "learning_rate": 5.7312376978428115e-06, "loss": 0.0645, "step": 4880 }, { "epoch": 0.8527334554015171, "grad_norm": 0.0548611114427665, "learning_rate": 5.600655962265345e-06, "loss": 0.0634, "step": 4890 }, { "epoch": 0.8544772865986573, "grad_norm": 0.05781000046532928, "learning_rate": 5.4714907155876184e-06, "loss": 0.0633, "step": 4900 }, { "epoch": 0.8562211177957973, "grad_norm": 0.08211213473524111, "learning_rate": 5.3437460786168795e-06, "loss": 0.0645, "step": 4910 }, { "epoch": 0.8579649489929375, "grad_norm": 0.05678818154491414, "learning_rate": 5.21742612683811e-06, "loss": 0.064, "step": 4920 }, { "epoch": 0.8597087801900776, "grad_norm": 0.0407657392322624, "learning_rate": 5.092534890284056e-06, "loss": 0.064, "step": 4930 }, { "epoch": 0.8614526113872177, "grad_norm": 0.058259700076195416, "learning_rate": 4.969076353406571e-06, "loss": 0.0633, "step": 4940 }, { "epoch": 0.8631964425843578, "grad_norm": 0.0451262739997244, "learning_rate": 4.847054454949618e-06, "loss": 0.0672, "step": 4950 }, { "epoch": 0.864940273781498, "grad_norm": 0.057460078382529416, "learning_rate": 4.726473087823524e-06, "loss": 0.063, "step": 4960 }, { "epoch": 0.866684104978638, "grad_norm": 0.06326571394215523, "learning_rate": 4.6073360989807805e-06, "loss": 0.0626, "step": 4970 }, { "epoch": 0.8684279361757782, "grad_norm": 0.06116487077481144, "learning_rate": 4.489647289293369e-06, "loss": 0.0627, "step": 4980 }, { "epoch": 0.8701717673729183, "grad_norm": 0.065179252399081, "learning_rate": 4.3734104134314505e-06, "loss": 0.0641, "step": 4990 }, { "epoch": 0.8719155985700584, "grad_norm": 0.05066888191707678, "learning_rate": 4.258629179743611e-06, "loss": 0.0637, "step": 5000 }, { "epoch": 0.8736594297671986, "grad_norm": 0.0490167056123082, "learning_rate": 4.145307250138541e-06, "loss": 0.0632, "step": 5010 }, { "epoch": 0.8754032609643386, "grad_norm": 0.06459688078405998, "learning_rate": 4.033448239968168e-06, "loss": 0.0632, "step": 5020 }, { "epoch": 0.8771470921614788, "grad_norm": 0.04132712459317927, "learning_rate": 3.92305571791241e-06, "loss": 0.0639, "step": 5030 }, { "epoch": 0.8788909233586188, "grad_norm": 0.05973904076165523, "learning_rate": 3.8141332058652447e-06, "loss": 0.0625, "step": 5040 }, { "epoch": 0.880634754555759, "grad_norm": 0.1688894978424013, "learning_rate": 3.7066841788223394e-06, "loss": 0.063, "step": 5050 }, { "epoch": 0.8823785857528991, "grad_norm": 0.06014827411694666, "learning_rate": 3.6007120647702564e-06, "loss": 0.0627, "step": 5060 }, { "epoch": 0.8841224169500392, "grad_norm": 0.0690347027695397, "learning_rate": 3.4962202445770254e-06, "loss": 0.0631, "step": 5070 }, { "epoch": 0.8858662481471794, "grad_norm": 0.06536837705797424, "learning_rate": 3.3932120518843314e-06, "loss": 0.0627, "step": 5080 }, { "epoch": 0.8876100793443195, "grad_norm": 0.06603810637670604, "learning_rate": 3.291690773001116e-06, "loss": 0.0632, "step": 5090 }, { "epoch": 0.8893539105414596, "grad_norm": 0.07530260933577586, "learning_rate": 3.191659646798739e-06, "loss": 0.0616, "step": 5100 }, { "epoch": 0.8910977417385997, "grad_norm": 0.0842389348120118, "learning_rate": 3.0931218646077065e-06, "loss": 0.0613, "step": 5110 }, { "epoch": 0.8928415729357398, "grad_norm": 0.052177775217524563, "learning_rate": 2.996080570115778e-06, "loss": 0.0605, "step": 5120 }, { "epoch": 0.8945854041328799, "grad_norm": 0.07153236769063971, "learning_rate": 2.9005388592676985e-06, "loss": 0.064, "step": 5130 }, { "epoch": 0.8963292353300201, "grad_norm": 0.057729253108185115, "learning_rate": 2.806499780166455e-06, "loss": 0.0649, "step": 5140 }, { "epoch": 0.8980730665271601, "grad_norm": 0.05652295229188241, "learning_rate": 2.71396633297602e-06, "loss": 0.0637, "step": 5150 }, { "epoch": 0.8998168977243003, "grad_norm": 0.05132200378789997, "learning_rate": 2.6229414698255906e-06, "loss": 0.0649, "step": 5160 }, { "epoch": 0.9015607289214405, "grad_norm": 0.06529704901316756, "learning_rate": 2.5334280947154733e-06, "loss": 0.0625, "step": 5170 }, { "epoch": 0.9033045601185805, "grad_norm": 0.08839424803043143, "learning_rate": 2.4454290634243927e-06, "loss": 0.0639, "step": 5180 }, { "epoch": 0.9050483913157207, "grad_norm": 0.07271889072883368, "learning_rate": 2.3589471834183976e-06, "loss": 0.0636, "step": 5190 }, { "epoch": 0.9067922225128607, "grad_norm": 0.06780310617809011, "learning_rate": 2.273985213761298e-06, "loss": 0.0624, "step": 5200 }, { "epoch": 0.9085360537100009, "grad_norm": 0.05091168643328729, "learning_rate": 2.1905458650266276e-06, "loss": 0.0629, "step": 5210 }, { "epoch": 0.910279884907141, "grad_norm": 0.09149792256181204, "learning_rate": 2.108631799211158e-06, "loss": 0.063, "step": 5220 }, { "epoch": 0.9120237161042811, "grad_norm": 0.07438156642001797, "learning_rate": 2.0282456296500386e-06, "loss": 0.0638, "step": 5230 }, { "epoch": 0.9137675473014212, "grad_norm": 0.07104948596015835, "learning_rate": 1.9493899209333145e-06, "loss": 0.0635, "step": 5240 }, { "epoch": 0.9155113784985613, "grad_norm": 0.07176357436199812, "learning_rate": 1.8720671888242059e-06, "loss": 0.0649, "step": 5250 }, { "epoch": 0.9172552096957014, "grad_norm": 0.05532043790257546, "learning_rate": 1.7962799001787822e-06, "loss": 0.0625, "step": 5260 }, { "epoch": 0.9189990408928416, "grad_norm": 0.06036596788049541, "learning_rate": 1.7220304728672976e-06, "loss": 0.0621, "step": 5270 }, { "epoch": 0.9207428720899817, "grad_norm": 0.05175396289068676, "learning_rate": 1.6493212756970355e-06, "loss": 0.0638, "step": 5280 }, { "epoch": 0.9224867032871218, "grad_norm": 0.04494875714954003, "learning_rate": 1.5781546283367531e-06, "loss": 0.0618, "step": 5290 }, { "epoch": 0.924230534484262, "grad_norm": 0.07299420434011313, "learning_rate": 1.5085328012426291e-06, "loss": 0.0637, "step": 5300 }, { "epoch": 0.925974365681402, "grad_norm": 0.08713836743706539, "learning_rate": 1.4404580155859103e-06, "loss": 0.0627, "step": 5310 }, { "epoch": 0.9277181968785422, "grad_norm": 0.0648825275044856, "learning_rate": 1.3739324431819579e-06, "loss": 0.0617, "step": 5320 }, { "epoch": 0.9294620280756822, "grad_norm": 0.0603878346389834, "learning_rate": 1.3089582064210293e-06, "loss": 0.062, "step": 5330 }, { "epoch": 0.9312058592728224, "grad_norm": 0.06718431446540517, "learning_rate": 1.2455373782005342e-06, "loss": 0.0641, "step": 5340 }, { "epoch": 0.9329496904699625, "grad_norm": 0.08168020687674889, "learning_rate": 1.183671981858897e-06, "loss": 0.064, "step": 5350 }, { "epoch": 0.9346935216671026, "grad_norm": 0.05000874414008422, "learning_rate": 1.1233639911110317e-06, "loss": 0.064, "step": 5360 }, { "epoch": 0.9364373528642428, "grad_norm": 0.16836322984135893, "learning_rate": 1.0646153299853523e-06, "loss": 0.0635, "step": 5370 }, { "epoch": 0.9381811840613828, "grad_norm": 0.09538127932642303, "learning_rate": 1.0074278727623953e-06, "loss": 0.0626, "step": 5380 }, { "epoch": 0.939925015258523, "grad_norm": 0.052191949959066446, "learning_rate": 9.51803443915017e-07, "loss": 0.0611, "step": 5390 }, { "epoch": 0.9416688464556631, "grad_norm": 0.061928758941747805, "learning_rate": 8.977438180502118e-07, "loss": 0.0628, "step": 5400 }, { "epoch": 0.9434126776528032, "grad_norm": 0.09241870053906379, "learning_rate": 8.452507198524584e-07, "loss": 0.0645, "step": 5410 }, { "epoch": 0.9451565088499433, "grad_norm": 0.07538730319047023, "learning_rate": 7.943258240287354e-07, "loss": 0.0634, "step": 5420 }, { "epoch": 0.9469003400470835, "grad_norm": 0.06605596093399979, "learning_rate": 7.449707552550533e-07, "loss": 0.0648, "step": 5430 }, { "epoch": 0.9486441712442235, "grad_norm": 0.05499334424638046, "learning_rate": 6.971870881246678e-07, "loss": 0.0619, "step": 5440 }, { "epoch": 0.9503880024413637, "grad_norm": 0.09947976745780569, "learning_rate": 6.509763470977926e-07, "loss": 0.0632, "step": 5450 }, { "epoch": 0.9521318336385038, "grad_norm": 0.049014883398517665, "learning_rate": 6.063400064530155e-07, "loss": 0.0628, "step": 5460 }, { "epoch": 0.9538756648356439, "grad_norm": 0.06291101959215979, "learning_rate": 5.632794902402206e-07, "loss": 0.0612, "step": 5470 }, { "epoch": 0.9556194960327841, "grad_norm": 0.09331479294747827, "learning_rate": 5.217961722351894e-07, "loss": 0.0601, "step": 5480 }, { "epoch": 0.9573633272299241, "grad_norm": 0.057764257528601204, "learning_rate": 4.818913758957377e-07, "loss": 0.0608, "step": 5490 }, { "epoch": 0.9591071584270643, "grad_norm": 0.08288004474062129, "learning_rate": 4.4356637431953727e-07, "loss": 0.0621, "step": 5500 }, { "epoch": 0.9608509896242043, "grad_norm": 0.07391577549421642, "learning_rate": 4.068223902034651e-07, "loss": 0.064, "step": 5510 }, { "epoch": 0.9625948208213445, "grad_norm": 0.06724508233844487, "learning_rate": 3.716605958046071e-07, "loss": 0.0628, "step": 5520 }, { "epoch": 0.9643386520184846, "grad_norm": 0.05990916915396748, "learning_rate": 3.380821129028489e-07, "loss": 0.0615, "step": 5530 }, { "epoch": 0.9660824832156247, "grad_norm": 0.053834596475272396, "learning_rate": 3.0608801276511554e-07, "loss": 0.0641, "step": 5540 }, { "epoch": 0.9678263144127649, "grad_norm": 0.06287658543763808, "learning_rate": 2.7567931611116037e-07, "loss": 0.0613, "step": 5550 }, { "epoch": 0.969570145609905, "grad_norm": 0.07416621242751696, "learning_rate": 2.468569930810238e-07, "loss": 0.0633, "step": 5560 }, { "epoch": 0.9713139768070451, "grad_norm": 0.04757779630445172, "learning_rate": 2.1962196320406414e-07, "loss": 0.0611, "step": 5570 }, { "epoch": 0.9730578080041852, "grad_norm": 0.0748135009816523, "learning_rate": 1.9397509536964175e-07, "loss": 0.0608, "step": 5580 }, { "epoch": 0.9748016392013253, "grad_norm": 0.048298795572652944, "learning_rate": 1.699172077993916e-07, "loss": 0.0632, "step": 5590 }, { "epoch": 0.9765454703984654, "grad_norm": 0.053997077150683304, "learning_rate": 1.4744906802110493e-07, "loss": 0.0608, "step": 5600 }, { "epoch": 0.9782893015956056, "grad_norm": 0.04600674397243153, "learning_rate": 1.2657139284425467e-07, "loss": 0.0646, "step": 5610 }, { "epoch": 0.9800331327927456, "grad_norm": 0.0551671569065242, "learning_rate": 1.0728484833713582e-07, "loss": 0.0633, "step": 5620 }, { "epoch": 0.9817769639898858, "grad_norm": 0.06031021677931391, "learning_rate": 8.959004980559904e-08, "loss": 0.0647, "step": 5630 }, { "epoch": 0.9835207951870258, "grad_norm": 0.07156946663140798, "learning_rate": 7.34875617734332e-08, "loss": 0.0651, "step": 5640 }, { "epoch": 0.985264626384166, "grad_norm": 0.0442263642401745, "learning_rate": 5.897789796433517e-08, "loss": 0.0627, "step": 5650 }, { "epoch": 0.9870084575813062, "grad_norm": 0.11967681340422083, "learning_rate": 4.6061521285550856e-08, "loss": 0.0659, "step": 5660 }, { "epoch": 0.9887522887784462, "grad_norm": 0.06948791634563219, "learning_rate": 3.47388438130758e-08, "loss": 0.0654, "step": 5670 }, { "epoch": 0.9904961199755864, "grad_norm": 0.06338361823849195, "learning_rate": 2.5010226778537925e-08, "loss": 0.0627, "step": 5680 }, { "epoch": 0.9922399511727265, "grad_norm": 0.05046440916449036, "learning_rate": 1.687598055764017e-08, "loss": 0.0649, "step": 5690 }, { "epoch": 0.9939837823698666, "grad_norm": 0.0621800160395451, "learning_rate": 1.0336364660290532e-08, "loss": 0.0632, "step": 5700 }, { "epoch": 0.9957276135670067, "grad_norm": 0.06262117432768159, "learning_rate": 5.391587722303193e-09, "loss": 0.0646, "step": 5710 }, { "epoch": 0.9974714447641468, "grad_norm": 0.10303152613717605, "learning_rate": 2.0418074987538227e-09, "loss": 0.0619, "step": 5720 }, { "epoch": 0.9992152759612869, "grad_norm": 0.09188013889582053, "learning_rate": 2.871308589280641e-10, "loss": 0.0629, "step": 5730 }, { "epoch": 1.0, "step": 5735, "total_flos": 4951250244206592.0, "train_loss": 0.08733252992605062, "train_runtime": 92184.4704, "train_samples_per_second": 1.991, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 5735, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4951250244206592.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }