{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999327369341494, "eval_steps": 500, "global_step": 7433, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006726306585054147, "grad_norm": 29.793027079973278, "learning_rate": 2.016129032258064e-06, "loss": 3.0338, "step": 5 }, { "epoch": 0.0013452613170108294, "grad_norm": 29.256907968213007, "learning_rate": 4.032258064516128e-06, "loss": 3.0206, "step": 10 }, { "epoch": 0.002017891975516244, "grad_norm": 15.261520394111779, "learning_rate": 6.0483870967741925e-06, "loss": 2.7838, "step": 15 }, { "epoch": 0.002690522634021659, "grad_norm": 9.515391114280666, "learning_rate": 8.064516129032257e-06, "loss": 2.5087, "step": 20 }, { "epoch": 0.0033631532925270735, "grad_norm": 6.1699236436111615, "learning_rate": 1.0080645161290321e-05, "loss": 2.1504, "step": 25 }, { "epoch": 0.004035783951032488, "grad_norm": 3.33300114769491, "learning_rate": 1.2096774193548385e-05, "loss": 1.7603, "step": 30 }, { "epoch": 0.004708414609537903, "grad_norm": 2.380295712226883, "learning_rate": 1.4112903225806451e-05, "loss": 1.6398, "step": 35 }, { "epoch": 0.005381045268043318, "grad_norm": 1.2705897972397782, "learning_rate": 1.6129032258064513e-05, "loss": 1.4737, "step": 40 }, { "epoch": 0.006053675926548732, "grad_norm": 1.0972481996311119, "learning_rate": 1.814516129032258e-05, "loss": 1.3779, "step": 45 }, { "epoch": 0.006726306585054147, "grad_norm": 0.8863957157122089, "learning_rate": 2.0161290322580642e-05, "loss": 1.2916, "step": 50 }, { "epoch": 0.007398937243559562, "grad_norm": 0.7177830723122377, "learning_rate": 2.2177419354838708e-05, "loss": 1.2852, "step": 55 }, { "epoch": 0.008071567902064977, "grad_norm": 0.5895990622701546, "learning_rate": 2.419354838709677e-05, "loss": 1.2467, "step": 60 }, { "epoch": 0.00874419856057039, "grad_norm": 0.6482412600733727, "learning_rate": 2.620967741935484e-05, "loss": 1.2336, "step": 65 }, { "epoch": 0.009416829219075806, "grad_norm": 0.45626769818341484, "learning_rate": 2.8225806451612902e-05, "loss": 1.1711, "step": 70 }, { "epoch": 0.01008945987758122, "grad_norm": 0.4598615730787672, "learning_rate": 3.0241935483870964e-05, "loss": 1.1733, "step": 75 }, { "epoch": 0.010762090536086635, "grad_norm": 0.4275974805901917, "learning_rate": 3.225806451612903e-05, "loss": 1.1692, "step": 80 }, { "epoch": 0.01143472119459205, "grad_norm": 0.2961863467328374, "learning_rate": 3.427419354838709e-05, "loss": 1.1361, "step": 85 }, { "epoch": 0.012107351853097465, "grad_norm": 0.3011832758903724, "learning_rate": 3.629032258064516e-05, "loss": 1.1497, "step": 90 }, { "epoch": 0.012779982511602879, "grad_norm": 0.3437144549576709, "learning_rate": 3.8306451612903224e-05, "loss": 1.115, "step": 95 }, { "epoch": 0.013452613170108294, "grad_norm": 0.38338441022677405, "learning_rate": 4.0322580645161284e-05, "loss": 1.134, "step": 100 }, { "epoch": 0.014125243828613708, "grad_norm": 0.3458095713498744, "learning_rate": 4.2338709677419356e-05, "loss": 1.0928, "step": 105 }, { "epoch": 0.014797874487119124, "grad_norm": 0.213297032520373, "learning_rate": 4.4354838709677415e-05, "loss": 1.143, "step": 110 }, { "epoch": 0.015470505145624537, "grad_norm": 0.15456137343431237, "learning_rate": 4.637096774193548e-05, "loss": 1.11, "step": 115 }, { "epoch": 0.016143135804129953, "grad_norm": 0.17049538367849387, "learning_rate": 4.838709677419354e-05, "loss": 1.11, "step": 120 }, { "epoch": 0.016815766462635367, "grad_norm": 0.16056803823178314, "learning_rate": 5.0403225806451606e-05, "loss": 1.0855, "step": 125 }, { "epoch": 0.01748839712114078, "grad_norm": 0.16118382096312336, "learning_rate": 5.241935483870968e-05, "loss": 1.0714, "step": 130 }, { "epoch": 0.018161027779646195, "grad_norm": 0.1132867285571746, "learning_rate": 5.443548387096774e-05, "loss": 1.0775, "step": 135 }, { "epoch": 0.018833658438151612, "grad_norm": 0.11714635540888474, "learning_rate": 5.6451612903225804e-05, "loss": 1.1172, "step": 140 }, { "epoch": 0.019506289096657026, "grad_norm": 0.14223271120783545, "learning_rate": 5.846774193548386e-05, "loss": 1.025, "step": 145 }, { "epoch": 0.02017891975516244, "grad_norm": 0.11556328672795584, "learning_rate": 6.048387096774193e-05, "loss": 1.0493, "step": 150 }, { "epoch": 0.020851550413667853, "grad_norm": 0.10980588262602826, "learning_rate": 6.25e-05, "loss": 1.0856, "step": 155 }, { "epoch": 0.02152418107217327, "grad_norm": 0.11923628645400541, "learning_rate": 6.451612903225805e-05, "loss": 1.088, "step": 160 }, { "epoch": 0.022196811730678685, "grad_norm": 0.1302600360663136, "learning_rate": 6.653225806451612e-05, "loss": 1.074, "step": 165 }, { "epoch": 0.0228694423891841, "grad_norm": 0.10841395305583203, "learning_rate": 6.854838709677419e-05, "loss": 1.0825, "step": 170 }, { "epoch": 0.023542073047689512, "grad_norm": 0.10691292450010598, "learning_rate": 7.056451612903225e-05, "loss": 1.0185, "step": 175 }, { "epoch": 0.02421470370619493, "grad_norm": 0.10288434679024688, "learning_rate": 7.258064516129032e-05, "loss": 1.0621, "step": 180 }, { "epoch": 0.024887334364700343, "grad_norm": 0.09551744684673671, "learning_rate": 7.459677419354838e-05, "loss": 1.0671, "step": 185 }, { "epoch": 0.025559965023205757, "grad_norm": 0.1023530781591793, "learning_rate": 7.661290322580645e-05, "loss": 1.0542, "step": 190 }, { "epoch": 0.02623259568171117, "grad_norm": 0.10650285325092443, "learning_rate": 7.86290322580645e-05, "loss": 1.0716, "step": 195 }, { "epoch": 0.02690522634021659, "grad_norm": 0.09426817941239474, "learning_rate": 8.064516129032257e-05, "loss": 1.004, "step": 200 }, { "epoch": 0.027577856998722002, "grad_norm": 0.09989428312797478, "learning_rate": 8.266129032258063e-05, "loss": 1.0739, "step": 205 }, { "epoch": 0.028250487657227416, "grad_norm": 0.10006362944609468, "learning_rate": 8.467741935483871e-05, "loss": 1.0905, "step": 210 }, { "epoch": 0.02892311831573283, "grad_norm": 0.10481768262243495, "learning_rate": 8.669354838709678e-05, "loss": 1.0209, "step": 215 }, { "epoch": 0.029595748974238247, "grad_norm": 0.09265084528231703, "learning_rate": 8.870967741935483e-05, "loss": 1.0309, "step": 220 }, { "epoch": 0.03026837963274366, "grad_norm": 0.08616737627661364, "learning_rate": 9.07258064516129e-05, "loss": 1.0233, "step": 225 }, { "epoch": 0.030941010291249075, "grad_norm": 0.09416043226383441, "learning_rate": 9.274193548387096e-05, "loss": 1.0251, "step": 230 }, { "epoch": 0.03161364094975449, "grad_norm": 0.10034605553663441, "learning_rate": 9.475806451612903e-05, "loss": 1.0013, "step": 235 }, { "epoch": 0.032286271608259906, "grad_norm": 0.0959718573339108, "learning_rate": 9.677419354838708e-05, "loss": 1.0215, "step": 240 }, { "epoch": 0.032958902266765316, "grad_norm": 0.10210970867228082, "learning_rate": 9.879032258064515e-05, "loss": 1.0518, "step": 245 }, { "epoch": 0.033631532925270734, "grad_norm": 0.09862230935521245, "learning_rate": 0.00010080645161290321, "loss": 1.0957, "step": 250 }, { "epoch": 0.03430416358377615, "grad_norm": 0.10083093939252007, "learning_rate": 0.00010282258064516128, "loss": 1.0558, "step": 255 }, { "epoch": 0.03497679424228156, "grad_norm": 0.09415129429211958, "learning_rate": 0.00010483870967741936, "loss": 1.0521, "step": 260 }, { "epoch": 0.03564942490078698, "grad_norm": 0.11114031366094976, "learning_rate": 0.00010685483870967741, "loss": 1.0847, "step": 265 }, { "epoch": 0.03632205555929239, "grad_norm": 0.09713797329341636, "learning_rate": 0.00010887096774193548, "loss": 1.0728, "step": 270 }, { "epoch": 0.036994686217797806, "grad_norm": 0.09788369320757713, "learning_rate": 0.00011088709677419354, "loss": 1.0346, "step": 275 }, { "epoch": 0.037667316876303224, "grad_norm": 0.1006296151589254, "learning_rate": 0.00011290322580645161, "loss": 1.0475, "step": 280 }, { "epoch": 0.038339947534808634, "grad_norm": 0.09942300500167914, "learning_rate": 0.00011491935483870966, "loss": 1.0172, "step": 285 }, { "epoch": 0.03901257819331405, "grad_norm": 0.0934491829625178, "learning_rate": 0.00011693548387096773, "loss": 1.0506, "step": 290 }, { "epoch": 0.03968520885181947, "grad_norm": 0.09551251502862168, "learning_rate": 0.00011895161290322579, "loss": 1.0173, "step": 295 }, { "epoch": 0.04035783951032488, "grad_norm": 0.09171601005317345, "learning_rate": 0.00012096774193548386, "loss": 1.0274, "step": 300 }, { "epoch": 0.041030470168830296, "grad_norm": 0.09273487710305259, "learning_rate": 0.00012298387096774192, "loss": 0.995, "step": 305 }, { "epoch": 0.04170310082733571, "grad_norm": 0.09862330057801429, "learning_rate": 0.000125, "loss": 1.0348, "step": 310 }, { "epoch": 0.042375731485841124, "grad_norm": 0.09362404636564037, "learning_rate": 0.00012701612903225805, "loss": 1.0167, "step": 315 }, { "epoch": 0.04304836214434654, "grad_norm": 0.09329273833910302, "learning_rate": 0.0001290322580645161, "loss": 1.0835, "step": 320 }, { "epoch": 0.04372099280285195, "grad_norm": 0.09298449638448188, "learning_rate": 0.0001310483870967742, "loss": 1.0176, "step": 325 }, { "epoch": 0.04439362346135737, "grad_norm": 0.09347384249973195, "learning_rate": 0.00013306451612903224, "loss": 0.9619, "step": 330 }, { "epoch": 0.045066254119862786, "grad_norm": 0.13463693712447972, "learning_rate": 0.00013508064516129032, "loss": 0.9868, "step": 335 }, { "epoch": 0.0457388847783682, "grad_norm": 0.09815201596450862, "learning_rate": 0.00013709677419354837, "loss": 1.0736, "step": 340 }, { "epoch": 0.046411515436873614, "grad_norm": 0.09232625821163153, "learning_rate": 0.00013911290322580642, "loss": 1.0299, "step": 345 }, { "epoch": 0.047084146095379024, "grad_norm": 0.09070277309304493, "learning_rate": 0.0001411290322580645, "loss": 1.0336, "step": 350 }, { "epoch": 0.04775677675388444, "grad_norm": 0.10932321230103388, "learning_rate": 0.00014314516129032258, "loss": 1.0073, "step": 355 }, { "epoch": 0.04842940741238986, "grad_norm": 0.09657601874101734, "learning_rate": 0.00014516129032258063, "loss": 1.0251, "step": 360 }, { "epoch": 0.04910203807089527, "grad_norm": 0.09877927764481637, "learning_rate": 0.0001471774193548387, "loss": 1.0507, "step": 365 }, { "epoch": 0.04977466872940069, "grad_norm": 0.09274278341834487, "learning_rate": 0.00014919354838709677, "loss": 1.0534, "step": 370 }, { "epoch": 0.050447299387906104, "grad_norm": 0.09198545780463178, "learning_rate": 0.00015120967741935482, "loss": 1.0185, "step": 375 }, { "epoch": 0.051119930046411514, "grad_norm": 0.08891910715711743, "learning_rate": 0.0001532258064516129, "loss": 1.0039, "step": 380 }, { "epoch": 0.05179256070491693, "grad_norm": 0.09078315297270034, "learning_rate": 0.00015524193548387095, "loss": 1.0444, "step": 385 }, { "epoch": 0.05246519136342234, "grad_norm": 0.08977914921659898, "learning_rate": 0.000157258064516129, "loss": 1.0976, "step": 390 }, { "epoch": 0.05313782202192776, "grad_norm": 0.09433437227793554, "learning_rate": 0.00015927419354838708, "loss": 1.0502, "step": 395 }, { "epoch": 0.05381045268043318, "grad_norm": 0.08762290810061227, "learning_rate": 0.00016129032258064513, "loss": 1.0349, "step": 400 }, { "epoch": 0.05448308333893859, "grad_norm": 0.0933428344700274, "learning_rate": 0.00016330645161290319, "loss": 1.0371, "step": 405 }, { "epoch": 0.055155713997444004, "grad_norm": 0.09147179314981382, "learning_rate": 0.00016532258064516127, "loss": 1.0665, "step": 410 }, { "epoch": 0.055828344655949415, "grad_norm": 0.09293037669639792, "learning_rate": 0.00016733870967741932, "loss": 1.0305, "step": 415 }, { "epoch": 0.05650097531445483, "grad_norm": 0.09036496873154756, "learning_rate": 0.00016935483870967742, "loss": 1.0543, "step": 420 }, { "epoch": 0.05717360597296025, "grad_norm": 0.09167654419704568, "learning_rate": 0.00017137096774193548, "loss": 1.0301, "step": 425 }, { "epoch": 0.05784623663146566, "grad_norm": 0.09209240875308733, "learning_rate": 0.00017338709677419356, "loss": 1.0729, "step": 430 }, { "epoch": 0.05851886728997108, "grad_norm": 0.135579302744148, "learning_rate": 0.0001754032258064516, "loss": 1.0798, "step": 435 }, { "epoch": 0.059191497948476494, "grad_norm": 0.09653916343025865, "learning_rate": 0.00017741935483870966, "loss": 1.0937, "step": 440 }, { "epoch": 0.059864128606981905, "grad_norm": 0.09958629951032674, "learning_rate": 0.00017943548387096774, "loss": 1.078, "step": 445 }, { "epoch": 0.06053675926548732, "grad_norm": 0.08629494311396081, "learning_rate": 0.0001814516129032258, "loss": 1.0798, "step": 450 }, { "epoch": 0.06120938992399273, "grad_norm": 0.09869583205407412, "learning_rate": 0.00018346774193548385, "loss": 1.0374, "step": 455 }, { "epoch": 0.06188202058249815, "grad_norm": 0.09053280646063235, "learning_rate": 0.00018548387096774192, "loss": 1.0505, "step": 460 }, { "epoch": 0.06255465124100357, "grad_norm": 0.08150427237605223, "learning_rate": 0.00018749999999999998, "loss": 1.0606, "step": 465 }, { "epoch": 0.06322728189950898, "grad_norm": 0.0901461935028475, "learning_rate": 0.00018951612903225806, "loss": 1.042, "step": 470 }, { "epoch": 0.06389991255801439, "grad_norm": 0.08190865253332126, "learning_rate": 0.0001915322580645161, "loss": 1.0209, "step": 475 }, { "epoch": 0.06457254321651981, "grad_norm": 0.08607633580363463, "learning_rate": 0.00019354838709677416, "loss": 0.9894, "step": 480 }, { "epoch": 0.06524517387502522, "grad_norm": 0.09241895549323972, "learning_rate": 0.00019556451612903224, "loss": 1.0602, "step": 485 }, { "epoch": 0.06591780453353063, "grad_norm": 0.08596935339404507, "learning_rate": 0.0001975806451612903, "loss": 1.0719, "step": 490 }, { "epoch": 0.06659043519203606, "grad_norm": 0.08488359435021268, "learning_rate": 0.00019959677419354837, "loss": 1.0252, "step": 495 }, { "epoch": 0.06726306585054147, "grad_norm": 0.09196056231065085, "learning_rate": 0.00020161290322580642, "loss": 1.0454, "step": 500 }, { "epoch": 0.06793569650904688, "grad_norm": 0.11419199766934089, "learning_rate": 0.00020362903225806448, "loss": 1.049, "step": 505 }, { "epoch": 0.0686083271675523, "grad_norm": 0.08906626167489842, "learning_rate": 0.00020564516129032256, "loss": 1.0105, "step": 510 }, { "epoch": 0.06928095782605771, "grad_norm": 0.08517561915403676, "learning_rate": 0.00020766129032258064, "loss": 1.0855, "step": 515 }, { "epoch": 0.06995358848456312, "grad_norm": 0.0903550456898868, "learning_rate": 0.00020967741935483871, "loss": 1.0575, "step": 520 }, { "epoch": 0.07062621914306855, "grad_norm": 0.08278762418068097, "learning_rate": 0.00021169354838709677, "loss": 1.0535, "step": 525 }, { "epoch": 0.07129884980157396, "grad_norm": 0.08481194574291347, "learning_rate": 0.00021370967741935482, "loss": 1.0293, "step": 530 }, { "epoch": 0.07197148046007937, "grad_norm": 0.08984307347575393, "learning_rate": 0.0002157258064516129, "loss": 1.0219, "step": 535 }, { "epoch": 0.07264411111858478, "grad_norm": 0.09076249423258674, "learning_rate": 0.00021774193548387095, "loss": 1.0587, "step": 540 }, { "epoch": 0.0733167417770902, "grad_norm": 0.1002173498829235, "learning_rate": 0.00021975806451612903, "loss": 1.037, "step": 545 }, { "epoch": 0.07398937243559561, "grad_norm": 0.08872604532857477, "learning_rate": 0.00022177419354838708, "loss": 1.0333, "step": 550 }, { "epoch": 0.07466200309410102, "grad_norm": 0.08375526502846077, "learning_rate": 0.00022379032258064514, "loss": 1.059, "step": 555 }, { "epoch": 0.07533463375260645, "grad_norm": 0.09978318424394747, "learning_rate": 0.00022580645161290321, "loss": 1.0845, "step": 560 }, { "epoch": 0.07600726441111186, "grad_norm": 0.07782302685062692, "learning_rate": 0.00022782258064516127, "loss": 0.9958, "step": 565 }, { "epoch": 0.07667989506961727, "grad_norm": 0.0856006706991171, "learning_rate": 0.00022983870967741932, "loss": 1.0523, "step": 570 }, { "epoch": 0.07735252572812269, "grad_norm": 0.08571466890563395, "learning_rate": 0.0002318548387096774, "loss": 1.1096, "step": 575 }, { "epoch": 0.0780251563866281, "grad_norm": 0.08709457854528387, "learning_rate": 0.00023387096774193545, "loss": 1.0607, "step": 580 }, { "epoch": 0.07869778704513351, "grad_norm": 0.09018186960178792, "learning_rate": 0.00023588709677419353, "loss": 1.0724, "step": 585 }, { "epoch": 0.07937041770363894, "grad_norm": 0.08199549738270526, "learning_rate": 0.00023790322580645158, "loss": 1.0743, "step": 590 }, { "epoch": 0.08004304836214435, "grad_norm": 0.080357031829569, "learning_rate": 0.00023991935483870964, "loss": 1.0353, "step": 595 }, { "epoch": 0.08071567902064976, "grad_norm": 0.07898024579852181, "learning_rate": 0.00024193548387096771, "loss": 1.0322, "step": 600 }, { "epoch": 0.08138830967915518, "grad_norm": 0.08594691467762515, "learning_rate": 0.0002439516129032258, "loss": 1.026, "step": 605 }, { "epoch": 0.08206094033766059, "grad_norm": 0.07960732979223949, "learning_rate": 0.00024596774193548385, "loss": 1.0701, "step": 610 }, { "epoch": 0.082733570996166, "grad_norm": 0.07748692647646714, "learning_rate": 0.00024798387096774195, "loss": 1.0279, "step": 615 }, { "epoch": 0.08340620165467141, "grad_norm": 0.09702102839233316, "learning_rate": 0.00025, "loss": 1.0221, "step": 620 }, { "epoch": 0.08407883231317684, "grad_norm": 0.08104610825395912, "learning_rate": 0.00025201612903225806, "loss": 1.0486, "step": 625 }, { "epoch": 0.08475146297168225, "grad_norm": 0.08063025008619103, "learning_rate": 0.0002540322580645161, "loss": 1.0459, "step": 630 }, { "epoch": 0.08542409363018766, "grad_norm": 0.08087778063873288, "learning_rate": 0.00025604838709677416, "loss": 1.0908, "step": 635 }, { "epoch": 0.08609672428869308, "grad_norm": 0.07888634968036065, "learning_rate": 0.0002580645161290322, "loss": 1.0408, "step": 640 }, { "epoch": 0.0867693549471985, "grad_norm": 0.08297792160081463, "learning_rate": 0.0002600806451612903, "loss": 1.1192, "step": 645 }, { "epoch": 0.0874419856057039, "grad_norm": 0.08286481962331686, "learning_rate": 0.0002620967741935484, "loss": 1.0719, "step": 650 }, { "epoch": 0.08811461626420933, "grad_norm": 0.07572488265547365, "learning_rate": 0.0002641129032258064, "loss": 1.0561, "step": 655 }, { "epoch": 0.08878724692271474, "grad_norm": 0.08337241862121214, "learning_rate": 0.0002661290322580645, "loss": 1.1061, "step": 660 }, { "epoch": 0.08945987758122015, "grad_norm": 0.07959536841791225, "learning_rate": 0.00026814516129032253, "loss": 1.0959, "step": 665 }, { "epoch": 0.09013250823972557, "grad_norm": 0.07879200275084125, "learning_rate": 0.00027016129032258064, "loss": 1.0907, "step": 670 }, { "epoch": 0.09080513889823098, "grad_norm": 0.08437472314091585, "learning_rate": 0.0002721774193548387, "loss": 1.0902, "step": 675 }, { "epoch": 0.0914777695567364, "grad_norm": 0.07936113375420903, "learning_rate": 0.00027419354838709674, "loss": 1.0857, "step": 680 }, { "epoch": 0.0921504002152418, "grad_norm": 0.07404043199519923, "learning_rate": 0.0002762096774193548, "loss": 1.022, "step": 685 }, { "epoch": 0.09282303087374723, "grad_norm": 0.07966312939128797, "learning_rate": 0.00027822580645161285, "loss": 1.0562, "step": 690 }, { "epoch": 0.09349566153225264, "grad_norm": 0.07988901185413153, "learning_rate": 0.00028024193548387095, "loss": 1.035, "step": 695 }, { "epoch": 0.09416829219075805, "grad_norm": 0.0753613113842339, "learning_rate": 0.000282258064516129, "loss": 1.0645, "step": 700 }, { "epoch": 0.09484092284926347, "grad_norm": 0.07626937908013373, "learning_rate": 0.0002842741935483871, "loss": 1.0377, "step": 705 }, { "epoch": 0.09551355350776888, "grad_norm": 0.09235167533834955, "learning_rate": 0.00028629032258064516, "loss": 1.0979, "step": 710 }, { "epoch": 0.0961861841662743, "grad_norm": 0.08055166929289474, "learning_rate": 0.0002883064516129032, "loss": 1.0852, "step": 715 }, { "epoch": 0.09685881482477972, "grad_norm": 0.08429213138249105, "learning_rate": 0.00029032258064516127, "loss": 1.0447, "step": 720 }, { "epoch": 0.09753144548328513, "grad_norm": 0.08437607034953815, "learning_rate": 0.0002923387096774193, "loss": 1.0558, "step": 725 }, { "epoch": 0.09820407614179054, "grad_norm": 0.07492047494320839, "learning_rate": 0.0002943548387096774, "loss": 1.0291, "step": 730 }, { "epoch": 0.09887670680029596, "grad_norm": 0.0787931854781473, "learning_rate": 0.0002963709677419355, "loss": 1.0582, "step": 735 }, { "epoch": 0.09954933745880137, "grad_norm": 0.07645123746511935, "learning_rate": 0.00029838709677419353, "loss": 1.0174, "step": 740 }, { "epoch": 0.10022196811730678, "grad_norm": 0.07409129763787864, "learning_rate": 0.00029999998345607283, "loss": 1.079, "step": 745 }, { "epoch": 0.10089459877581221, "grad_norm": 0.08041650749931936, "learning_rate": 0.00029999940441900595, "loss": 1.0439, "step": 750 }, { "epoch": 0.10156722943431762, "grad_norm": 0.08027960443856536, "learning_rate": 0.0002999979981892312, "loss": 1.047, "step": 755 }, { "epoch": 0.10223986009282303, "grad_norm": 0.07906827754778097, "learning_rate": 0.0002999957647745034, "loss": 1.0961, "step": 760 }, { "epoch": 0.10291249075132844, "grad_norm": 0.071215050684659, "learning_rate": 0.00029999270418713906, "loss": 1.0405, "step": 765 }, { "epoch": 0.10358512140983386, "grad_norm": 0.0829151410610896, "learning_rate": 0.00029998881644401624, "loss": 1.0923, "step": 770 }, { "epoch": 0.10425775206833927, "grad_norm": 0.07164078011185154, "learning_rate": 0.00029998410156657437, "loss": 1.0274, "step": 775 }, { "epoch": 0.10493038272684468, "grad_norm": 0.07588880429132575, "learning_rate": 0.0002999785595808143, "loss": 1.0823, "step": 780 }, { "epoch": 0.10560301338535011, "grad_norm": 0.079369424589866, "learning_rate": 0.0002999721905172982, "loss": 1.0376, "step": 785 }, { "epoch": 0.10627564404385552, "grad_norm": 0.07662095383626488, "learning_rate": 0.0002999649944111491, "loss": 1.0539, "step": 790 }, { "epoch": 0.10694827470236093, "grad_norm": 0.07486801172848384, "learning_rate": 0.0002999569713020509, "loss": 1.0783, "step": 795 }, { "epoch": 0.10762090536086635, "grad_norm": 0.0766666428812244, "learning_rate": 0.00029994812123424814, "loss": 1.0635, "step": 800 }, { "epoch": 0.10829353601937176, "grad_norm": 0.0754261513974334, "learning_rate": 0.00029993844425654586, "loss": 1.0546, "step": 805 }, { "epoch": 0.10896616667787717, "grad_norm": 0.07071826590853397, "learning_rate": 0.000299927940422309, "loss": 1.1055, "step": 810 }, { "epoch": 0.1096387973363826, "grad_norm": 0.07693148525549479, "learning_rate": 0.0002999166097894625, "loss": 1.0925, "step": 815 }, { "epoch": 0.11031142799488801, "grad_norm": 0.0723848682308289, "learning_rate": 0.0002999044524204907, "loss": 1.0711, "step": 820 }, { "epoch": 0.11098405865339342, "grad_norm": 0.07549989385866508, "learning_rate": 0.0002998914683824371, "loss": 1.0759, "step": 825 }, { "epoch": 0.11165668931189883, "grad_norm": 0.08028495278710619, "learning_rate": 0.00029987765774690397, "loss": 1.0693, "step": 830 }, { "epoch": 0.11232931997040425, "grad_norm": 0.06771708980583116, "learning_rate": 0.00029986302059005206, "loss": 1.0791, "step": 835 }, { "epoch": 0.11300195062890966, "grad_norm": 0.07284561060447658, "learning_rate": 0.00029984755699259994, "loss": 1.0383, "step": 840 }, { "epoch": 0.11367458128741507, "grad_norm": 0.07793144682196305, "learning_rate": 0.00029983126703982387, "loss": 1.0343, "step": 845 }, { "epoch": 0.1143472119459205, "grad_norm": 0.0773418539344277, "learning_rate": 0.000299814150821557, "loss": 1.0565, "step": 850 }, { "epoch": 0.11501984260442591, "grad_norm": 0.07164210767629635, "learning_rate": 0.00029979620843218917, "loss": 1.0236, "step": 855 }, { "epoch": 0.11569247326293132, "grad_norm": 0.07393499209078849, "learning_rate": 0.0002997774399706661, "loss": 1.0753, "step": 860 }, { "epoch": 0.11636510392143674, "grad_norm": 0.07288026009996475, "learning_rate": 0.0002997578455404892, "loss": 1.0438, "step": 865 }, { "epoch": 0.11703773457994215, "grad_norm": 0.0954759009693307, "learning_rate": 0.0002997374252497147, "loss": 1.0868, "step": 870 }, { "epoch": 0.11771036523844756, "grad_norm": 0.07327833047011302, "learning_rate": 0.00029971617921095305, "loss": 1.0715, "step": 875 }, { "epoch": 0.11838299589695299, "grad_norm": 0.07131786035588028, "learning_rate": 0.0002996941075413686, "loss": 1.0803, "step": 880 }, { "epoch": 0.1190556265554584, "grad_norm": 0.08956736884401889, "learning_rate": 0.0002996712103626786, "loss": 1.0627, "step": 885 }, { "epoch": 0.11972825721396381, "grad_norm": 0.07934523762735987, "learning_rate": 0.0002996474878011529, "loss": 1.0543, "step": 890 }, { "epoch": 0.12040088787246923, "grad_norm": 0.09002705521503765, "learning_rate": 0.00029962293998761263, "loss": 1.0609, "step": 895 }, { "epoch": 0.12107351853097464, "grad_norm": 0.06925053859969088, "learning_rate": 0.0002995975670574303, "loss": 1.0624, "step": 900 }, { "epoch": 0.12174614918948005, "grad_norm": 0.07512999142720386, "learning_rate": 0.00029957136915052845, "loss": 1.0619, "step": 905 }, { "epoch": 0.12241877984798546, "grad_norm": 0.0715274327128957, "learning_rate": 0.0002995443464113791, "loss": 1.0458, "step": 910 }, { "epoch": 0.12309141050649089, "grad_norm": 0.07036352659794959, "learning_rate": 0.000299516498989003, "loss": 1.0875, "step": 915 }, { "epoch": 0.1237640411649963, "grad_norm": 0.07413577353443873, "learning_rate": 0.0002994878270369685, "loss": 1.0649, "step": 920 }, { "epoch": 0.12443667182350171, "grad_norm": 0.07516574985840074, "learning_rate": 0.0002994583307133913, "loss": 1.0298, "step": 925 }, { "epoch": 0.12510930248200713, "grad_norm": 0.07127442173766656, "learning_rate": 0.00029942801018093283, "loss": 1.0719, "step": 930 }, { "epoch": 0.12578193314051253, "grad_norm": 0.07197516123092482, "learning_rate": 0.0002993968656068, "loss": 1.0403, "step": 935 }, { "epoch": 0.12645456379901795, "grad_norm": 0.07093702068308229, "learning_rate": 0.0002993648971627438, "loss": 1.0304, "step": 940 }, { "epoch": 0.12712719445752338, "grad_norm": 0.0718158231945994, "learning_rate": 0.00029933210502505893, "loss": 1.0646, "step": 945 }, { "epoch": 0.12779982511602878, "grad_norm": 0.07700946884853352, "learning_rate": 0.00029929848937458196, "loss": 1.051, "step": 950 }, { "epoch": 0.1284724557745342, "grad_norm": 0.37155780391202864, "learning_rate": 0.0002992640503966913, "loss": 1.0834, "step": 955 }, { "epoch": 0.12914508643303962, "grad_norm": 0.20536638900723664, "learning_rate": 0.0002992287882813053, "loss": 1.1027, "step": 960 }, { "epoch": 0.12981771709154502, "grad_norm": 0.08841263116800245, "learning_rate": 0.00029919270322288215, "loss": 1.0939, "step": 965 }, { "epoch": 0.13049034775005044, "grad_norm": 0.6429288189128298, "learning_rate": 0.00029915579542041763, "loss": 1.0454, "step": 970 }, { "epoch": 0.13116297840855587, "grad_norm": 0.07893043114765416, "learning_rate": 0.00029911806507744513, "loss": 1.0522, "step": 975 }, { "epoch": 0.13183560906706127, "grad_norm": 0.08700899117062741, "learning_rate": 0.0002990795124020339, "loss": 1.0802, "step": 980 }, { "epoch": 0.1325082397255667, "grad_norm": 0.40838005866687616, "learning_rate": 0.0002990401376067881, "loss": 1.5534, "step": 985 }, { "epoch": 0.13318087038407211, "grad_norm": 0.10787306069567282, "learning_rate": 0.00029899994090884545, "loss": 1.1244, "step": 990 }, { "epoch": 0.1338535010425775, "grad_norm": 4.0271804629424635, "learning_rate": 0.0002989589225298763, "loss": 1.0778, "step": 995 }, { "epoch": 0.13452613170108293, "grad_norm": 0.1798230372096084, "learning_rate": 0.00029891708269608235, "loss": 1.0863, "step": 1000 }, { "epoch": 0.13519876235958836, "grad_norm": 0.0918137691588807, "learning_rate": 0.00029887442163819503, "loss": 1.1321, "step": 1005 }, { "epoch": 0.13587139301809376, "grad_norm": 0.2945178807632444, "learning_rate": 0.0002988309395914749, "loss": 1.115, "step": 1010 }, { "epoch": 0.13654402367659918, "grad_norm": 0.5933305026235249, "learning_rate": 0.00029878663679570964, "loss": 1.0957, "step": 1015 }, { "epoch": 0.1372166543351046, "grad_norm": 0.17535925174542233, "learning_rate": 0.00029874151349521336, "loss": 1.0946, "step": 1020 }, { "epoch": 0.13788928499361, "grad_norm": 0.11591544009308001, "learning_rate": 0.0002986955699388247, "loss": 1.1157, "step": 1025 }, { "epoch": 0.13856191565211542, "grad_norm": 0.0927251025295262, "learning_rate": 0.0002986488063799058, "loss": 1.091, "step": 1030 }, { "epoch": 0.13923454631062085, "grad_norm": 0.07932226343469499, "learning_rate": 0.000298601223076341, "loss": 1.0534, "step": 1035 }, { "epoch": 0.13990717696912625, "grad_norm": 0.0807602717471893, "learning_rate": 0.000298552820290535, "loss": 1.0309, "step": 1040 }, { "epoch": 0.14057980762763167, "grad_norm": 0.09433877161816717, "learning_rate": 0.00029850359828941176, "loss": 1.0476, "step": 1045 }, { "epoch": 0.1412524382861371, "grad_norm": 0.7121539088215401, "learning_rate": 0.0002984535573444129, "loss": 1.0698, "step": 1050 }, { "epoch": 0.1419250689446425, "grad_norm": 0.08321713836368394, "learning_rate": 0.00029840269773149614, "loss": 1.0677, "step": 1055 }, { "epoch": 0.14259769960314791, "grad_norm": 0.09079845364417001, "learning_rate": 0.00029835101973113397, "loss": 1.058, "step": 1060 }, { "epoch": 0.14327033026165334, "grad_norm": 0.0707607351743187, "learning_rate": 0.000298298523628312, "loss": 1.0503, "step": 1065 }, { "epoch": 0.14394296092015874, "grad_norm": 0.14627547315644027, "learning_rate": 0.0002982452097125273, "loss": 1.045, "step": 1070 }, { "epoch": 0.14461559157866416, "grad_norm": 0.07536850562776565, "learning_rate": 0.0002981910782777869, "loss": 1.0376, "step": 1075 }, { "epoch": 0.14528822223716956, "grad_norm": 0.06861636061492025, "learning_rate": 0.0002981361296226063, "loss": 1.0692, "step": 1080 }, { "epoch": 0.14596085289567498, "grad_norm": 0.0709747455258586, "learning_rate": 0.0002980803640500073, "loss": 1.0789, "step": 1085 }, { "epoch": 0.1466334835541804, "grad_norm": 0.07359845843812414, "learning_rate": 0.0002980237818675172, "loss": 1.0958, "step": 1090 }, { "epoch": 0.1473061142126858, "grad_norm": 0.06901465022817592, "learning_rate": 0.00029796638338716624, "loss": 1.024, "step": 1095 }, { "epoch": 0.14797874487119123, "grad_norm": 0.06673500731770182, "learning_rate": 0.00029790816892548644, "loss": 1.0705, "step": 1100 }, { "epoch": 0.14865137552969665, "grad_norm": 0.07469404586580403, "learning_rate": 0.00029784913880350947, "loss": 1.0511, "step": 1105 }, { "epoch": 0.14932400618820205, "grad_norm": 0.07010042708592439, "learning_rate": 0.00029778929334676535, "loss": 1.0391, "step": 1110 }, { "epoch": 0.14999663684670747, "grad_norm": 0.0686148252448679, "learning_rate": 0.0002977286328852802, "loss": 1.0535, "step": 1115 }, { "epoch": 0.1506692675052129, "grad_norm": 0.07242929663095796, "learning_rate": 0.00029766715775357447, "loss": 0.9996, "step": 1120 }, { "epoch": 0.1513418981637183, "grad_norm": 0.07237289997413665, "learning_rate": 0.00029760486829066157, "loss": 1.0542, "step": 1125 }, { "epoch": 0.15201452882222372, "grad_norm": 0.06706892168069356, "learning_rate": 0.00029754176484004537, "loss": 1.0491, "step": 1130 }, { "epoch": 0.15268715948072914, "grad_norm": 0.070480319021014, "learning_rate": 0.00029747784774971866, "loss": 1.0674, "step": 1135 }, { "epoch": 0.15335979013923454, "grad_norm": 0.06989655990759643, "learning_rate": 0.00029741311737216126, "loss": 1.0303, "step": 1140 }, { "epoch": 0.15403242079773996, "grad_norm": 0.06540017208658754, "learning_rate": 0.0002973475740643378, "loss": 1.0451, "step": 1145 }, { "epoch": 0.15470505145624538, "grad_norm": 0.06689774497179028, "learning_rate": 0.0002972812181876961, "loss": 1.0409, "step": 1150 }, { "epoch": 0.15537768211475078, "grad_norm": 0.06655596147731474, "learning_rate": 0.00029721405010816487, "loss": 1.0401, "step": 1155 }, { "epoch": 0.1560503127732562, "grad_norm": 0.06680398472117724, "learning_rate": 0.00029714607019615193, "loss": 1.0009, "step": 1160 }, { "epoch": 0.15672294343176163, "grad_norm": 0.06987909994212581, "learning_rate": 0.000297077278826542, "loss": 1.0814, "step": 1165 }, { "epoch": 0.15739557409026703, "grad_norm": 0.0688888114655529, "learning_rate": 0.00029700767637869476, "loss": 1.0194, "step": 1170 }, { "epoch": 0.15806820474877245, "grad_norm": 0.07635128442413575, "learning_rate": 0.0002969372632364426, "loss": 1.027, "step": 1175 }, { "epoch": 0.15874083540727787, "grad_norm": 0.06901775101746678, "learning_rate": 0.0002968660397880886, "loss": 0.9823, "step": 1180 }, { "epoch": 0.15941346606578327, "grad_norm": 0.06666991934971492, "learning_rate": 0.0002967940064264045, "loss": 1.0149, "step": 1185 }, { "epoch": 0.1600860967242887, "grad_norm": 0.06881649593440277, "learning_rate": 0.00029672116354862837, "loss": 1.0462, "step": 1190 }, { "epoch": 0.16075872738279412, "grad_norm": 0.07225944057318469, "learning_rate": 0.0002966475115564624, "loss": 1.0607, "step": 1195 }, { "epoch": 0.16143135804129952, "grad_norm": 0.07486073617413597, "learning_rate": 0.0002965730508560709, "loss": 1.0394, "step": 1200 }, { "epoch": 0.16210398869980494, "grad_norm": 0.07577029538496009, "learning_rate": 0.0002964977818580777, "loss": 1.0071, "step": 1205 }, { "epoch": 0.16277661935831036, "grad_norm": 0.06698327059024388, "learning_rate": 0.0002964217049775642, "loss": 1.0587, "step": 1210 }, { "epoch": 0.16344925001681576, "grad_norm": 0.06454076686592887, "learning_rate": 0.00029634482063406725, "loss": 1.0545, "step": 1215 }, { "epoch": 0.16412188067532119, "grad_norm": 0.06876206027401735, "learning_rate": 0.00029626712925157604, "loss": 0.9889, "step": 1220 }, { "epoch": 0.16479451133382658, "grad_norm": 0.07048318796162506, "learning_rate": 0.0002961886312585307, "loss": 1.0433, "step": 1225 }, { "epoch": 0.165467141992332, "grad_norm": 0.06525242272303788, "learning_rate": 0.0002961093270878194, "loss": 1.0634, "step": 1230 }, { "epoch": 0.16613977265083743, "grad_norm": 0.06852185934917507, "learning_rate": 0.0002960292171767761, "loss": 1.0391, "step": 1235 }, { "epoch": 0.16681240330934283, "grad_norm": 0.06860996659742673, "learning_rate": 0.0002959483019671781, "loss": 1.0555, "step": 1240 }, { "epoch": 0.16748503396784825, "grad_norm": 0.08453865171661228, "learning_rate": 0.0002958665819052436, "loss": 1.0307, "step": 1245 }, { "epoch": 0.16815766462635368, "grad_norm": 0.07555088886637626, "learning_rate": 0.00029578405744162936, "loss": 1.0488, "step": 1250 }, { "epoch": 0.16883029528485907, "grad_norm": 0.06406919323423478, "learning_rate": 0.0002957007290314281, "loss": 1.0251, "step": 1255 }, { "epoch": 0.1695029259433645, "grad_norm": 0.06549772066839298, "learning_rate": 0.00029561659713416596, "loss": 1.0424, "step": 1260 }, { "epoch": 0.17017555660186992, "grad_norm": 0.06632082776805458, "learning_rate": 0.00029553166221380004, "loss": 1.0585, "step": 1265 }, { "epoch": 0.17084818726037532, "grad_norm": 0.06748134536180718, "learning_rate": 0.00029544592473871597, "loss": 1.0221, "step": 1270 }, { "epoch": 0.17152081791888074, "grad_norm": 0.06825285055234158, "learning_rate": 0.0002953593851817249, "loss": 1.0438, "step": 1275 }, { "epoch": 0.17219344857738617, "grad_norm": 0.06203129710524907, "learning_rate": 0.00029527204402006143, "loss": 1.0114, "step": 1280 }, { "epoch": 0.17286607923589156, "grad_norm": 0.0722647514508147, "learning_rate": 0.0002951839017353806, "loss": 0.9961, "step": 1285 }, { "epoch": 0.173538709894397, "grad_norm": 0.06839703114950606, "learning_rate": 0.0002950949588137553, "loss": 1.068, "step": 1290 }, { "epoch": 0.1742113405529024, "grad_norm": 0.06504863220824372, "learning_rate": 0.00029500521574567386, "loss": 1.0338, "step": 1295 }, { "epoch": 0.1748839712114078, "grad_norm": 0.06912815184187844, "learning_rate": 0.00029491467302603694, "loss": 1.075, "step": 1300 }, { "epoch": 0.17555660186991323, "grad_norm": 0.06795709704635283, "learning_rate": 0.0002948233311541549, "loss": 1.0522, "step": 1305 }, { "epoch": 0.17622923252841866, "grad_norm": 0.07980869749638134, "learning_rate": 0.00029473119063374545, "loss": 1.0135, "step": 1310 }, { "epoch": 0.17690186318692405, "grad_norm": 0.0631244748578968, "learning_rate": 0.00029463825197293027, "loss": 1.0729, "step": 1315 }, { "epoch": 0.17757449384542948, "grad_norm": 0.06703926037394037, "learning_rate": 0.0002945445156842327, "loss": 1.0336, "step": 1320 }, { "epoch": 0.1782471245039349, "grad_norm": 0.06522291398196874, "learning_rate": 0.0002944499822845746, "loss": 1.0136, "step": 1325 }, { "epoch": 0.1789197551624403, "grad_norm": 0.0685652259526568, "learning_rate": 0.00029435465229527355, "loss": 1.0532, "step": 1330 }, { "epoch": 0.17959238582094572, "grad_norm": 0.06121295547267051, "learning_rate": 0.0002942585262420402, "loss": 1.0215, "step": 1335 }, { "epoch": 0.18026501647945115, "grad_norm": 0.06860811463441442, "learning_rate": 0.00029416160465497516, "loss": 1.0204, "step": 1340 }, { "epoch": 0.18093764713795654, "grad_norm": 0.06445083042599647, "learning_rate": 0.000294063888068566, "loss": 1.0451, "step": 1345 }, { "epoch": 0.18161027779646197, "grad_norm": 0.06632986737605773, "learning_rate": 0.0002939653770216845, "loss": 0.9969, "step": 1350 }, { "epoch": 0.1822829084549674, "grad_norm": 0.0682696802304684, "learning_rate": 0.00029386607205758374, "loss": 1.0251, "step": 1355 }, { "epoch": 0.1829555391134728, "grad_norm": 0.06474417456320194, "learning_rate": 0.00029376597372389473, "loss": 0.988, "step": 1360 }, { "epoch": 0.1836281697719782, "grad_norm": 0.06702069019563696, "learning_rate": 0.00029366508257262373, "loss": 1.0201, "step": 1365 }, { "epoch": 0.1843008004304836, "grad_norm": 0.07355768046137742, "learning_rate": 0.00029356339916014916, "loss": 1.0264, "step": 1370 }, { "epoch": 0.18497343108898903, "grad_norm": 0.06503361777179631, "learning_rate": 0.00029346092404721846, "loss": 1.0451, "step": 1375 }, { "epoch": 0.18564606174749446, "grad_norm": 0.06906367933360327, "learning_rate": 0.0002933576577989449, "loss": 1.0196, "step": 1380 }, { "epoch": 0.18631869240599985, "grad_norm": 0.07061676537966292, "learning_rate": 0.0002932536009848047, "loss": 1.0301, "step": 1385 }, { "epoch": 0.18699132306450528, "grad_norm": 0.069484640759308, "learning_rate": 0.00029314875417863373, "loss": 1.0168, "step": 1390 }, { "epoch": 0.1876639537230107, "grad_norm": 0.07052322125572345, "learning_rate": 0.0002930431179586244, "loss": 1.0015, "step": 1395 }, { "epoch": 0.1883365843815161, "grad_norm": 0.0658786556485662, "learning_rate": 0.0002929366929073224, "loss": 1.0006, "step": 1400 }, { "epoch": 0.18900921504002152, "grad_norm": 0.06732232716188208, "learning_rate": 0.00029282947961162357, "loss": 1.0667, "step": 1405 }, { "epoch": 0.18968184569852695, "grad_norm": 0.06984081400541851, "learning_rate": 0.00029272147866277057, "loss": 1.0306, "step": 1410 }, { "epoch": 0.19035447635703234, "grad_norm": 0.06366943481097173, "learning_rate": 0.00029261269065634986, "loss": 1.0238, "step": 1415 }, { "epoch": 0.19102710701553777, "grad_norm": 0.0612748756213378, "learning_rate": 0.00029250311619228805, "loss": 1.0256, "step": 1420 }, { "epoch": 0.1916997376740432, "grad_norm": 0.06771279040627849, "learning_rate": 0.0002923927558748489, "loss": 1.0281, "step": 1425 }, { "epoch": 0.1923723683325486, "grad_norm": 0.06617871674134063, "learning_rate": 0.0002922816103126298, "loss": 1.0184, "step": 1430 }, { "epoch": 0.193044998991054, "grad_norm": 0.0638741626749275, "learning_rate": 0.0002921696801185585, "loss": 1.049, "step": 1435 }, { "epoch": 0.19371762964955944, "grad_norm": 0.07508489340820632, "learning_rate": 0.0002920569659098898, "loss": 1.0629, "step": 1440 }, { "epoch": 0.19439026030806483, "grad_norm": 0.06868936170957156, "learning_rate": 0.0002919434683082018, "loss": 1.0554, "step": 1445 }, { "epoch": 0.19506289096657026, "grad_norm": 0.06950639681678793, "learning_rate": 0.0002918291879393931, "loss": 1.0253, "step": 1450 }, { "epoch": 0.19573552162507568, "grad_norm": 0.06524073101638589, "learning_rate": 0.0002917141254336787, "loss": 1.0272, "step": 1455 }, { "epoch": 0.19640815228358108, "grad_norm": 0.06648087678063275, "learning_rate": 0.00029159828142558694, "loss": 0.9573, "step": 1460 }, { "epoch": 0.1970807829420865, "grad_norm": 0.06488313142830643, "learning_rate": 0.0002914816565539557, "loss": 1.0286, "step": 1465 }, { "epoch": 0.19775341360059193, "grad_norm": 0.07163591621516463, "learning_rate": 0.00029136425146192925, "loss": 0.987, "step": 1470 }, { "epoch": 0.19842604425909732, "grad_norm": 0.0736966578106896, "learning_rate": 0.00029124606679695436, "loss": 1.0321, "step": 1475 }, { "epoch": 0.19909867491760275, "grad_norm": 0.06765832425648322, "learning_rate": 0.00029112710321077697, "loss": 1.0299, "step": 1480 }, { "epoch": 0.19977130557610817, "grad_norm": 0.06835624737932443, "learning_rate": 0.00029100736135943833, "loss": 1.014, "step": 1485 }, { "epoch": 0.20044393623461357, "grad_norm": 0.06535609384863643, "learning_rate": 0.0002908868419032717, "loss": 0.9961, "step": 1490 }, { "epoch": 0.201116566893119, "grad_norm": 0.06399387489597937, "learning_rate": 0.0002907655455068985, "loss": 1.0251, "step": 1495 }, { "epoch": 0.20178919755162442, "grad_norm": 0.06910761394744257, "learning_rate": 0.0002906434728392247, "loss": 1.0211, "step": 1500 }, { "epoch": 0.2024618282101298, "grad_norm": 0.06771869394836479, "learning_rate": 0.00029052062457343697, "loss": 1.0672, "step": 1505 }, { "epoch": 0.20313445886863524, "grad_norm": 0.06512325002739387, "learning_rate": 0.0002903970013869994, "loss": 0.9796, "step": 1510 }, { "epoch": 0.20380708952714063, "grad_norm": 0.06907395495907703, "learning_rate": 0.0002902726039616493, "loss": 1.0199, "step": 1515 }, { "epoch": 0.20447972018564606, "grad_norm": 0.07845817619134507, "learning_rate": 0.0002901474329833937, "loss": 1.036, "step": 1520 }, { "epoch": 0.20515235084415148, "grad_norm": 0.06464732031438017, "learning_rate": 0.00029002148914250553, "loss": 1.0068, "step": 1525 }, { "epoch": 0.20582498150265688, "grad_norm": 0.06627647931212392, "learning_rate": 0.00028989477313351957, "loss": 0.9713, "step": 1530 }, { "epoch": 0.2064976121611623, "grad_norm": 0.06628903796474898, "learning_rate": 0.00028976728565522915, "loss": 1.0242, "step": 1535 }, { "epoch": 0.20717024281966773, "grad_norm": 0.07099236238385558, "learning_rate": 0.00028963902741068175, "loss": 1.0052, "step": 1540 }, { "epoch": 0.20784287347817312, "grad_norm": 0.06704768653098384, "learning_rate": 0.0002895099991071754, "loss": 1.0003, "step": 1545 }, { "epoch": 0.20851550413667855, "grad_norm": 0.06714193079186036, "learning_rate": 0.00028938020145625467, "loss": 1.0034, "step": 1550 }, { "epoch": 0.20918813479518397, "grad_norm": 0.07669111095324174, "learning_rate": 0.00028924963517370703, "loss": 1.0353, "step": 1555 }, { "epoch": 0.20986076545368937, "grad_norm": 0.06543093269181684, "learning_rate": 0.0002891183009795584, "loss": 1.0281, "step": 1560 }, { "epoch": 0.2105333961121948, "grad_norm": 0.06620556706376186, "learning_rate": 0.0002889861995980696, "loss": 0.9855, "step": 1565 }, { "epoch": 0.21120602677070022, "grad_norm": 0.06987041404693957, "learning_rate": 0.0002888533317577322, "loss": 1.0137, "step": 1570 }, { "epoch": 0.2118786574292056, "grad_norm": 0.06420643002717302, "learning_rate": 0.00028871969819126446, "loss": 0.9949, "step": 1575 }, { "epoch": 0.21255128808771104, "grad_norm": 0.06880594384530382, "learning_rate": 0.00028858529963560745, "loss": 1.0173, "step": 1580 }, { "epoch": 0.21322391874621646, "grad_norm": 0.0747143183769185, "learning_rate": 0.00028845013683192073, "loss": 1.0239, "step": 1585 }, { "epoch": 0.21389654940472186, "grad_norm": 0.06767835738030381, "learning_rate": 0.00028831421052557854, "loss": 1.0345, "step": 1590 }, { "epoch": 0.21456918006322728, "grad_norm": 0.06245317417171576, "learning_rate": 0.0002881775214661656, "loss": 1.046, "step": 1595 }, { "epoch": 0.2152418107217327, "grad_norm": 0.06851847320305292, "learning_rate": 0.0002880400704074727, "loss": 1.0237, "step": 1600 }, { "epoch": 0.2159144413802381, "grad_norm": 0.07060354312063585, "learning_rate": 0.00028790185810749307, "loss": 1.0544, "step": 1605 }, { "epoch": 0.21658707203874353, "grad_norm": 0.06780678570608949, "learning_rate": 0.0002877628853284177, "loss": 1.0454, "step": 1610 }, { "epoch": 0.21725970269724895, "grad_norm": 0.06415115958536001, "learning_rate": 0.00028762315283663146, "loss": 1.0533, "step": 1615 }, { "epoch": 0.21793233335575435, "grad_norm": 0.06810949401736772, "learning_rate": 0.0002874826614027087, "loss": 1.0502, "step": 1620 }, { "epoch": 0.21860496401425977, "grad_norm": 0.06495370571278798, "learning_rate": 0.0002873414118014092, "loss": 0.9889, "step": 1625 }, { "epoch": 0.2192775946727652, "grad_norm": 0.06650525610263389, "learning_rate": 0.0002871994048116735, "loss": 0.9953, "step": 1630 }, { "epoch": 0.2199502253312706, "grad_norm": 0.9281345660239769, "learning_rate": 0.0002870566412166192, "loss": 0.9753, "step": 1635 }, { "epoch": 0.22062285598977602, "grad_norm": 0.07874999810678028, "learning_rate": 0.000286913121803536, "loss": 1.0589, "step": 1640 }, { "epoch": 0.22129548664828144, "grad_norm": 0.07137195124766428, "learning_rate": 0.00028676884736388166, "loss": 1.0262, "step": 1645 }, { "epoch": 0.22196811730678684, "grad_norm": 0.07407503743741842, "learning_rate": 0.0002866238186932781, "loss": 1.062, "step": 1650 }, { "epoch": 0.22264074796529226, "grad_norm": 0.07384616098410135, "learning_rate": 0.0002864780365915059, "loss": 1.0539, "step": 1655 }, { "epoch": 0.22331337862379766, "grad_norm": 0.07446398508616892, "learning_rate": 0.0002863315018625011, "loss": 0.9937, "step": 1660 }, { "epoch": 0.22398600928230308, "grad_norm": 0.0667414804323104, "learning_rate": 0.0002861842153143499, "loss": 0.9915, "step": 1665 }, { "epoch": 0.2246586399408085, "grad_norm": 0.07501767013141597, "learning_rate": 0.0002860361777592845, "loss": 1.032, "step": 1670 }, { "epoch": 0.2253312705993139, "grad_norm": 0.07397962895213549, "learning_rate": 0.0002858873900136787, "loss": 1.023, "step": 1675 }, { "epoch": 0.22600390125781933, "grad_norm": 0.06986271250274151, "learning_rate": 0.0002857378528980435, "loss": 1.0069, "step": 1680 }, { "epoch": 0.22667653191632475, "grad_norm": 0.06758485727346657, "learning_rate": 0.0002855875672370222, "loss": 1.0125, "step": 1685 }, { "epoch": 0.22734916257483015, "grad_norm": 0.06966621104334436, "learning_rate": 0.00028543653385938603, "loss": 1.019, "step": 1690 }, { "epoch": 0.22802179323333557, "grad_norm": 0.07355147933025005, "learning_rate": 0.00028528475359802975, "loss": 1.0433, "step": 1695 }, { "epoch": 0.228694423891841, "grad_norm": 0.06265452252311694, "learning_rate": 0.0002851322272899668, "loss": 1.0184, "step": 1700 }, { "epoch": 0.2293670545503464, "grad_norm": 0.061438088188945056, "learning_rate": 0.0002849789557763249, "loss": 1.0061, "step": 1705 }, { "epoch": 0.23003968520885182, "grad_norm": 0.06445178157108626, "learning_rate": 0.00028482493990234127, "loss": 0.9983, "step": 1710 }, { "epoch": 0.23071231586735724, "grad_norm": 0.06827810304381737, "learning_rate": 0.000284670180517358, "loss": 1.0497, "step": 1715 }, { "epoch": 0.23138494652586264, "grad_norm": 0.06271824849288654, "learning_rate": 0.0002845146784748173, "loss": 1.0258, "step": 1720 }, { "epoch": 0.23205757718436806, "grad_norm": 0.06209741624879291, "learning_rate": 0.00028435843463225707, "loss": 1.0015, "step": 1725 }, { "epoch": 0.2327302078428735, "grad_norm": 0.06712008833902464, "learning_rate": 0.0002842014498513057, "loss": 1.0297, "step": 1730 }, { "epoch": 0.23340283850137888, "grad_norm": 0.07663574411261381, "learning_rate": 0.00028404372499767793, "loss": 1.0363, "step": 1735 }, { "epoch": 0.2340754691598843, "grad_norm": 0.07360436071600215, "learning_rate": 0.00028388526094116933, "loss": 1.0156, "step": 1740 }, { "epoch": 0.23474809981838973, "grad_norm": 0.06983865341864652, "learning_rate": 0.0002837260585556523, "loss": 1.0211, "step": 1745 }, { "epoch": 0.23542073047689513, "grad_norm": 0.06827222444417715, "learning_rate": 0.0002835661187190705, "loss": 1.0142, "step": 1750 }, { "epoch": 0.23609336113540055, "grad_norm": 0.057534870796608614, "learning_rate": 0.00028340544231343466, "loss": 0.9828, "step": 1755 }, { "epoch": 0.23676599179390598, "grad_norm": 0.062030787408893256, "learning_rate": 0.0002832440302248173, "loss": 1.0166, "step": 1760 }, { "epoch": 0.23743862245241137, "grad_norm": 0.06976421841182824, "learning_rate": 0.0002830818833433479, "loss": 1.0838, "step": 1765 }, { "epoch": 0.2381112531109168, "grad_norm": 0.06119115702007277, "learning_rate": 0.0002829190025632082, "loss": 1.0102, "step": 1770 }, { "epoch": 0.23878388376942222, "grad_norm": 0.06903918932060034, "learning_rate": 0.000282755388782627, "loss": 1.0068, "step": 1775 }, { "epoch": 0.23945651442792762, "grad_norm": 0.07303208628213309, "learning_rate": 0.0002825910429038755, "loss": 1.0236, "step": 1780 }, { "epoch": 0.24012914508643304, "grad_norm": 0.06557501175714182, "learning_rate": 0.00028242596583326194, "loss": 1.0096, "step": 1785 }, { "epoch": 0.24080177574493847, "grad_norm": 0.07024712893766441, "learning_rate": 0.00028226015848112693, "loss": 1.024, "step": 1790 }, { "epoch": 0.24147440640344386, "grad_norm": 0.061836689683176946, "learning_rate": 0.00028209362176183833, "loss": 1.0118, "step": 1795 }, { "epoch": 0.2421470370619493, "grad_norm": 0.06288178365841733, "learning_rate": 0.00028192635659378623, "loss": 0.9944, "step": 1800 }, { "epoch": 0.24281966772045468, "grad_norm": 0.0653498569458799, "learning_rate": 0.0002817583638993778, "loss": 0.9821, "step": 1805 }, { "epoch": 0.2434922983789601, "grad_norm": 0.06501875393694694, "learning_rate": 0.0002815896446050322, "loss": 0.979, "step": 1810 }, { "epoch": 0.24416492903746553, "grad_norm": 0.06860382803225752, "learning_rate": 0.0002814201996411757, "loss": 1.0661, "step": 1815 }, { "epoch": 0.24483755969597093, "grad_norm": 0.06882483957372922, "learning_rate": 0.0002812500299422362, "loss": 0.9763, "step": 1820 }, { "epoch": 0.24551019035447635, "grad_norm": 0.06819349685039716, "learning_rate": 0.0002810791364466383, "loss": 0.9679, "step": 1825 }, { "epoch": 0.24618282101298178, "grad_norm": 0.06755766217610122, "learning_rate": 0.0002809075200967981, "loss": 1.0079, "step": 1830 }, { "epoch": 0.24685545167148717, "grad_norm": 0.06646837866500027, "learning_rate": 0.000280735181839118, "loss": 0.9782, "step": 1835 }, { "epoch": 0.2475280823299926, "grad_norm": 0.06901054481657276, "learning_rate": 0.00028056212262398143, "loss": 0.9849, "step": 1840 }, { "epoch": 0.24820071298849802, "grad_norm": 0.06470902262220722, "learning_rate": 0.0002803883434057477, "loss": 1.0168, "step": 1845 }, { "epoch": 0.24887334364700342, "grad_norm": 0.06359268644716512, "learning_rate": 0.00028021384514274655, "loss": 1.016, "step": 1850 }, { "epoch": 0.24954597430550884, "grad_norm": 0.06467268670269033, "learning_rate": 0.0002800386287972731, "loss": 1.0076, "step": 1855 }, { "epoch": 0.25021860496401427, "grad_norm": 0.0632393566846638, "learning_rate": 0.0002798626953355825, "loss": 0.9818, "step": 1860 }, { "epoch": 0.2508912356225197, "grad_norm": 0.06540791283180712, "learning_rate": 0.0002796860457278843, "loss": 1.0428, "step": 1865 }, { "epoch": 0.25156386628102506, "grad_norm": 0.06094576123930132, "learning_rate": 0.0002795086809483376, "loss": 0.944, "step": 1870 }, { "epoch": 0.2522364969395305, "grad_norm": 0.06411498342379629, "learning_rate": 0.0002793306019750452, "loss": 1.0099, "step": 1875 }, { "epoch": 0.2529091275980359, "grad_norm": 0.06860575565519289, "learning_rate": 0.00027915180979004855, "loss": 0.9889, "step": 1880 }, { "epoch": 0.25358175825654133, "grad_norm": 0.07330880194764075, "learning_rate": 0.00027897230537932225, "loss": 1.037, "step": 1885 }, { "epoch": 0.25425438891504676, "grad_norm": 0.06400885002099273, "learning_rate": 0.0002787920897327684, "loss": 1.0211, "step": 1890 }, { "epoch": 0.2549270195735522, "grad_norm": 0.06447537245876075, "learning_rate": 0.0002786111638442115, "loss": 0.9675, "step": 1895 }, { "epoch": 0.25559965023205755, "grad_norm": 0.06438920821051117, "learning_rate": 0.00027842952871139255, "loss": 1.0014, "step": 1900 }, { "epoch": 0.256272280890563, "grad_norm": 0.06337500273733042, "learning_rate": 0.000278247185335964, "loss": 0.9479, "step": 1905 }, { "epoch": 0.2569449115490684, "grad_norm": 0.06693550599643627, "learning_rate": 0.0002780641347234839, "loss": 0.9948, "step": 1910 }, { "epoch": 0.2576175422075738, "grad_norm": 0.06189864322039309, "learning_rate": 0.0002778803778834105, "loss": 0.9892, "step": 1915 }, { "epoch": 0.25829017286607925, "grad_norm": 0.06327884986591356, "learning_rate": 0.00027769591582909654, "loss": 0.9805, "step": 1920 }, { "epoch": 0.2589628035245847, "grad_norm": 0.06719861601589218, "learning_rate": 0.0002775107495777839, "loss": 0.9874, "step": 1925 }, { "epoch": 0.25963543418309004, "grad_norm": 0.0802158924797212, "learning_rate": 0.00027732488015059777, "loss": 1.0113, "step": 1930 }, { "epoch": 0.26030806484159547, "grad_norm": 0.06727053907609576, "learning_rate": 0.00027713830857254107, "loss": 0.9994, "step": 1935 }, { "epoch": 0.2609806955001009, "grad_norm": 0.06258712788033971, "learning_rate": 0.0002769510358724889, "loss": 1.0189, "step": 1940 }, { "epoch": 0.2616533261586063, "grad_norm": 0.06977543060346805, "learning_rate": 0.00027676306308318285, "loss": 1.0124, "step": 1945 }, { "epoch": 0.26232595681711174, "grad_norm": 0.07132869843251438, "learning_rate": 0.00027657439124122504, "loss": 1.0114, "step": 1950 }, { "epoch": 0.26299858747561716, "grad_norm": 0.06589196655607017, "learning_rate": 0.00027638502138707286, "loss": 0.9763, "step": 1955 }, { "epoch": 0.26367121813412253, "grad_norm": 0.0628837134210337, "learning_rate": 0.0002761949545650328, "loss": 1.022, "step": 1960 }, { "epoch": 0.26434384879262796, "grad_norm": 0.06500486647928407, "learning_rate": 0.00027600419182325503, "loss": 0.9963, "step": 1965 }, { "epoch": 0.2650164794511334, "grad_norm": 0.06352366459995422, "learning_rate": 0.0002758127342137273, "loss": 1.0398, "step": 1970 }, { "epoch": 0.2656891101096388, "grad_norm": 0.06734294053419544, "learning_rate": 0.00027562058279226943, "loss": 0.991, "step": 1975 }, { "epoch": 0.26636174076814423, "grad_norm": 0.05908395334931715, "learning_rate": 0.00027542773861852736, "loss": 0.9446, "step": 1980 }, { "epoch": 0.26703437142664965, "grad_norm": 0.06394425508709521, "learning_rate": 0.0002752342027559672, "loss": 0.9973, "step": 1985 }, { "epoch": 0.267707002085155, "grad_norm": 0.061889140536584404, "learning_rate": 0.0002750399762718696, "loss": 0.9833, "step": 1990 }, { "epoch": 0.26837963274366045, "grad_norm": 0.06101925340274876, "learning_rate": 0.0002748450602373237, "loss": 0.9978, "step": 1995 }, { "epoch": 0.26905226340216587, "grad_norm": 0.060092029180786675, "learning_rate": 0.00027464945572722117, "loss": 0.9893, "step": 2000 }, { "epoch": 0.2697248940606713, "grad_norm": 0.06441845729424575, "learning_rate": 0.0002744531638202506, "loss": 1.027, "step": 2005 }, { "epoch": 0.2703975247191767, "grad_norm": 0.07685133272762304, "learning_rate": 0.00027425618559889103, "loss": 1.0008, "step": 2010 }, { "epoch": 0.2710701553776821, "grad_norm": 0.07296855729895588, "learning_rate": 0.0002740585221494065, "loss": 1.0134, "step": 2015 }, { "epoch": 0.2717427860361875, "grad_norm": 0.06305663285637599, "learning_rate": 0.00027386017456183977, "loss": 0.9751, "step": 2020 }, { "epoch": 0.27241541669469294, "grad_norm": 0.06665022031648576, "learning_rate": 0.00027366114393000634, "loss": 1.0051, "step": 2025 }, { "epoch": 0.27308804735319836, "grad_norm": 0.06831910827083325, "learning_rate": 0.00027346143135148845, "loss": 0.9834, "step": 2030 }, { "epoch": 0.2737606780117038, "grad_norm": 0.0790830090918589, "learning_rate": 0.0002732610379276292, "loss": 1.0133, "step": 2035 }, { "epoch": 0.2744333086702092, "grad_norm": 0.061991084813353935, "learning_rate": 0.0002730599647635261, "loss": 1.0148, "step": 2040 }, { "epoch": 0.2751059393287146, "grad_norm": 0.06401059184328592, "learning_rate": 0.0002728582129680251, "loss": 0.9944, "step": 2045 }, { "epoch": 0.27577856998722, "grad_norm": 0.06976597880172361, "learning_rate": 0.00027265578365371496, "loss": 0.9849, "step": 2050 }, { "epoch": 0.2764512006457254, "grad_norm": 0.058809321177156884, "learning_rate": 0.0002724526779369204, "loss": 0.963, "step": 2055 }, { "epoch": 0.27712383130423085, "grad_norm": 0.0653799780787328, "learning_rate": 0.00027224889693769615, "loss": 1.0104, "step": 2060 }, { "epoch": 0.2777964619627363, "grad_norm": 0.06621560992168146, "learning_rate": 0.0002720444417798212, "loss": 1.0291, "step": 2065 }, { "epoch": 0.2784690926212417, "grad_norm": 0.07406952786150485, "learning_rate": 0.0002718393135907922, "loss": 0.9909, "step": 2070 }, { "epoch": 0.27914172327974707, "grad_norm": 0.06411491189711067, "learning_rate": 0.00027163351350181704, "loss": 0.9848, "step": 2075 }, { "epoch": 0.2798143539382525, "grad_norm": 0.06793287903099283, "learning_rate": 0.0002714270426478093, "loss": 0.9862, "step": 2080 }, { "epoch": 0.2804869845967579, "grad_norm": 0.06071468451699804, "learning_rate": 0.00027121990216738133, "loss": 1.0121, "step": 2085 }, { "epoch": 0.28115961525526334, "grad_norm": 0.06439036739512706, "learning_rate": 0.00027101209320283824, "loss": 0.9859, "step": 2090 }, { "epoch": 0.28183224591376876, "grad_norm": 0.05944129800789135, "learning_rate": 0.00027080361690017175, "loss": 0.985, "step": 2095 }, { "epoch": 0.2825048765722742, "grad_norm": 0.061243133513179636, "learning_rate": 0.0002705944744090536, "loss": 0.9901, "step": 2100 }, { "epoch": 0.28317750723077956, "grad_norm": 0.06746937900880097, "learning_rate": 0.0002703846668828292, "loss": 1.0113, "step": 2105 }, { "epoch": 0.283850137889285, "grad_norm": 0.05961664999737001, "learning_rate": 0.00027017419547851167, "loss": 1.0125, "step": 2110 }, { "epoch": 0.2845227685477904, "grad_norm": 0.06395608178278261, "learning_rate": 0.000269963061356775, "loss": 1.0041, "step": 2115 }, { "epoch": 0.28519539920629583, "grad_norm": 0.0662373002194877, "learning_rate": 0.0002697512656819477, "loss": 0.9957, "step": 2120 }, { "epoch": 0.28586802986480125, "grad_norm": 0.06121788566565536, "learning_rate": 0.0002695388096220068, "loss": 1.0034, "step": 2125 }, { "epoch": 0.2865406605233067, "grad_norm": 0.12099988704064925, "learning_rate": 0.00026932569434857104, "loss": 0.9866, "step": 2130 }, { "epoch": 0.28721329118181205, "grad_norm": 0.0821229785976475, "learning_rate": 0.00026911192103689426, "loss": 1.0447, "step": 2135 }, { "epoch": 0.28788592184031747, "grad_norm": 0.0709363771493444, "learning_rate": 0.00026889749086585934, "loss": 1.0257, "step": 2140 }, { "epoch": 0.2885585524988229, "grad_norm": 0.07815479270365608, "learning_rate": 0.00026868240501797154, "loss": 1.0101, "step": 2145 }, { "epoch": 0.2892311831573283, "grad_norm": 0.06846619759519718, "learning_rate": 0.00026846666467935184, "loss": 0.9875, "step": 2150 }, { "epoch": 0.28990381381583374, "grad_norm": 0.06393262884579075, "learning_rate": 0.00026825027103973047, "loss": 0.9773, "step": 2155 }, { "epoch": 0.2905764444743391, "grad_norm": 0.06516709589071833, "learning_rate": 0.00026803322529244056, "loss": 1.0289, "step": 2160 }, { "epoch": 0.29124907513284454, "grad_norm": 0.06385243590151152, "learning_rate": 0.0002678155286344111, "loss": 0.9987, "step": 2165 }, { "epoch": 0.29192170579134996, "grad_norm": 0.06202180097824563, "learning_rate": 0.00026759718226616094, "loss": 1.0024, "step": 2170 }, { "epoch": 0.2925943364498554, "grad_norm": 0.06175463083001653, "learning_rate": 0.00026737818739179156, "loss": 0.966, "step": 2175 }, { "epoch": 0.2932669671083608, "grad_norm": 0.0657893166600458, "learning_rate": 0.00026715854521898094, "loss": 0.9561, "step": 2180 }, { "epoch": 0.29393959776686623, "grad_norm": 0.06518043702884813, "learning_rate": 0.0002669382569589765, "loss": 1.0009, "step": 2185 }, { "epoch": 0.2946122284253716, "grad_norm": 0.060097592389166545, "learning_rate": 0.00026671732382658873, "loss": 0.958, "step": 2190 }, { "epoch": 0.295284859083877, "grad_norm": 0.06436827662279306, "learning_rate": 0.0002664957470401842, "loss": 1.0057, "step": 2195 }, { "epoch": 0.29595748974238245, "grad_norm": 0.06429485213345078, "learning_rate": 0.0002662735278216793, "loss": 1.0076, "step": 2200 }, { "epoch": 0.2966301204008879, "grad_norm": 0.06111361270816539, "learning_rate": 0.0002660506673965329, "loss": 1.0002, "step": 2205 }, { "epoch": 0.2973027510593933, "grad_norm": 0.06077882997745099, "learning_rate": 0.00026582716699373996, "loss": 0.9766, "step": 2210 }, { "epoch": 0.2979753817178987, "grad_norm": 0.06458898622394665, "learning_rate": 0.0002656030278458248, "loss": 0.9988, "step": 2215 }, { "epoch": 0.2986480123764041, "grad_norm": 0.06430844883272462, "learning_rate": 0.0002653782511888341, "loss": 0.9967, "step": 2220 }, { "epoch": 0.2993206430349095, "grad_norm": 0.06247648417092113, "learning_rate": 0.0002651528382623302, "loss": 0.9935, "step": 2225 }, { "epoch": 0.29999327369341494, "grad_norm": 0.06232942636976054, "learning_rate": 0.0002649267903093842, "loss": 0.9849, "step": 2230 }, { "epoch": 0.30066590435192037, "grad_norm": 0.0625250684904268, "learning_rate": 0.0002647001085765692, "loss": 0.9912, "step": 2235 }, { "epoch": 0.3013385350104258, "grad_norm": 0.05955858192998459, "learning_rate": 0.0002644727943139534, "loss": 0.9351, "step": 2240 }, { "epoch": 0.3020111656689312, "grad_norm": 0.06612187680155705, "learning_rate": 0.000264244848775093, "loss": 0.9876, "step": 2245 }, { "epoch": 0.3026837963274366, "grad_norm": 0.06173310312205808, "learning_rate": 0.00026401627321702556, "loss": 0.9625, "step": 2250 }, { "epoch": 0.303356426985942, "grad_norm": 0.06917659803618215, "learning_rate": 0.00026378706890026307, "loss": 1.0174, "step": 2255 }, { "epoch": 0.30402905764444743, "grad_norm": 0.06556532003559523, "learning_rate": 0.00026355723708878484, "loss": 0.961, "step": 2260 }, { "epoch": 0.30470168830295286, "grad_norm": 0.06287181767310183, "learning_rate": 0.00026332677905003047, "loss": 0.9994, "step": 2265 }, { "epoch": 0.3053743189614583, "grad_norm": 0.06342147196329556, "learning_rate": 0.00026309569605489306, "loss": 0.9949, "step": 2270 }, { "epoch": 0.3060469496199637, "grad_norm": 0.06730056184618043, "learning_rate": 0.00026286398937771225, "loss": 0.9572, "step": 2275 }, { "epoch": 0.3067195802784691, "grad_norm": 0.0602458232782407, "learning_rate": 0.00026263166029626676, "loss": 1.0175, "step": 2280 }, { "epoch": 0.3073922109369745, "grad_norm": 0.0578357423452784, "learning_rate": 0.000262398710091768, "loss": 0.9709, "step": 2285 }, { "epoch": 0.3080648415954799, "grad_norm": 0.060253136424880616, "learning_rate": 0.00026216514004885237, "loss": 0.981, "step": 2290 }, { "epoch": 0.30873747225398535, "grad_norm": 0.06306283740453837, "learning_rate": 0.00026193095145557455, "loss": 0.9884, "step": 2295 }, { "epoch": 0.30941010291249077, "grad_norm": 0.06721710931703574, "learning_rate": 0.0002616961456034004, "loss": 0.9555, "step": 2300 }, { "epoch": 0.31008273357099614, "grad_norm": 0.06139546272042707, "learning_rate": 0.0002614607237871996, "loss": 0.9812, "step": 2305 }, { "epoch": 0.31075536422950156, "grad_norm": 0.06423824892716638, "learning_rate": 0.00026122468730523866, "loss": 0.9589, "step": 2310 }, { "epoch": 0.311427994888007, "grad_norm": 0.07227792403797496, "learning_rate": 0.0002609880374591738, "loss": 0.9587, "step": 2315 }, { "epoch": 0.3121006255465124, "grad_norm": 0.06874247357671834, "learning_rate": 0.0002607507755540438, "loss": 0.9866, "step": 2320 }, { "epoch": 0.31277325620501784, "grad_norm": 0.06658784787531048, "learning_rate": 0.0002605129028982626, "loss": 0.9238, "step": 2325 }, { "epoch": 0.31344588686352326, "grad_norm": 0.06167580860405572, "learning_rate": 0.0002602744208036122, "loss": 1.0139, "step": 2330 }, { "epoch": 0.31411851752202863, "grad_norm": 0.059141851611564725, "learning_rate": 0.00026003533058523555, "loss": 1.0216, "step": 2335 }, { "epoch": 0.31479114818053405, "grad_norm": 0.059896676536393356, "learning_rate": 0.00025979563356162905, "loss": 0.9906, "step": 2340 }, { "epoch": 0.3154637788390395, "grad_norm": 0.06304581093570669, "learning_rate": 0.0002595553310546356, "loss": 1.0064, "step": 2345 }, { "epoch": 0.3161364094975449, "grad_norm": 0.0654071698153169, "learning_rate": 0.00025931442438943686, "loss": 1.0005, "step": 2350 }, { "epoch": 0.3168090401560503, "grad_norm": 0.0850720234118959, "learning_rate": 0.00025907291489454646, "loss": 0.9737, "step": 2355 }, { "epoch": 0.31748167081455575, "grad_norm": 0.05842749891613876, "learning_rate": 0.0002588308039018023, "loss": 1.0032, "step": 2360 }, { "epoch": 0.3181543014730611, "grad_norm": 0.06233792615496135, "learning_rate": 0.00025858809274635923, "loss": 0.9542, "step": 2365 }, { "epoch": 0.31882693213156654, "grad_norm": 0.05905164630122034, "learning_rate": 0.000258344782766682, "loss": 0.9624, "step": 2370 }, { "epoch": 0.31949956279007197, "grad_norm": 0.0700392215816944, "learning_rate": 0.0002581008753045375, "loss": 0.9903, "step": 2375 }, { "epoch": 0.3201721934485774, "grad_norm": 0.06257351219582429, "learning_rate": 0.00025785637170498753, "loss": 0.9664, "step": 2380 }, { "epoch": 0.3208448241070828, "grad_norm": 0.06323178744828126, "learning_rate": 0.0002576112733163815, "loss": 0.9698, "step": 2385 }, { "epoch": 0.32151745476558824, "grad_norm": 0.06424742383987511, "learning_rate": 0.00025736558149034867, "loss": 0.9562, "step": 2390 }, { "epoch": 0.3221900854240936, "grad_norm": 0.06672423896058365, "learning_rate": 0.00025711929758179107, "loss": 1.0332, "step": 2395 }, { "epoch": 0.32286271608259903, "grad_norm": 0.0670913173713535, "learning_rate": 0.00025687242294887574, "loss": 0.9857, "step": 2400 }, { "epoch": 0.32353534674110446, "grad_norm": 0.06892018721458289, "learning_rate": 0.0002566249589530274, "loss": 0.9551, "step": 2405 }, { "epoch": 0.3242079773996099, "grad_norm": 0.06009175251349125, "learning_rate": 0.00025637690695892094, "loss": 1.0093, "step": 2410 }, { "epoch": 0.3248806080581153, "grad_norm": 0.08176920992681376, "learning_rate": 0.0002561282683344737, "loss": 1.0043, "step": 2415 }, { "epoch": 0.32555323871662073, "grad_norm": 0.07145448290324558, "learning_rate": 0.00025587904445083823, "loss": 1.0117, "step": 2420 }, { "epoch": 0.3262258693751261, "grad_norm": 0.07543609879058265, "learning_rate": 0.00025562923668239455, "loss": 1.0457, "step": 2425 }, { "epoch": 0.3268985000336315, "grad_norm": 0.06829416483656771, "learning_rate": 0.0002553788464067425, "loss": 0.9526, "step": 2430 }, { "epoch": 0.32757113069213695, "grad_norm": 0.06395861434359994, "learning_rate": 0.00025512787500469426, "loss": 0.9813, "step": 2435 }, { "epoch": 0.32824376135064237, "grad_norm": 0.0611865196811849, "learning_rate": 0.00025487632386026686, "loss": 0.974, "step": 2440 }, { "epoch": 0.3289163920091478, "grad_norm": 0.06491018496652956, "learning_rate": 0.0002546241943606742, "loss": 0.9992, "step": 2445 }, { "epoch": 0.32958902266765316, "grad_norm": 0.06292898906856961, "learning_rate": 0.00025437148789631984, "loss": 1.0456, "step": 2450 }, { "epoch": 0.3302616533261586, "grad_norm": 0.06167564316715784, "learning_rate": 0.0002541182058607887, "loss": 0.9629, "step": 2455 }, { "epoch": 0.330934283984664, "grad_norm": 0.06391041486032593, "learning_rate": 0.00025386434965084015, "loss": 1.0035, "step": 2460 }, { "epoch": 0.33160691464316944, "grad_norm": 0.06787356716894612, "learning_rate": 0.00025360992066639985, "loss": 1.0041, "step": 2465 }, { "epoch": 0.33227954530167486, "grad_norm": 0.06086575812752024, "learning_rate": 0.0002533549203105519, "loss": 0.9999, "step": 2470 }, { "epoch": 0.3329521759601803, "grad_norm": 0.06057902002599989, "learning_rate": 0.00025309934998953156, "loss": 0.9367, "step": 2475 }, { "epoch": 0.33362480661868565, "grad_norm": 0.0593042754107753, "learning_rate": 0.0002528432111127171, "loss": 0.9398, "step": 2480 }, { "epoch": 0.3342974372771911, "grad_norm": 0.059538222557524834, "learning_rate": 0.0002525865050926222, "loss": 1.0005, "step": 2485 }, { "epoch": 0.3349700679356965, "grad_norm": 0.06011503757302019, "learning_rate": 0.00025232923334488804, "loss": 1.0205, "step": 2490 }, { "epoch": 0.3356426985942019, "grad_norm": 0.06122430156326717, "learning_rate": 0.0002520713972882758, "loss": 1.0145, "step": 2495 }, { "epoch": 0.33631532925270735, "grad_norm": 0.06805159146871266, "learning_rate": 0.00025181299834465854, "loss": 1.001, "step": 2500 }, { "epoch": 0.3369879599112128, "grad_norm": 0.057493415342664264, "learning_rate": 0.00025155403793901323, "loss": 0.9765, "step": 2505 }, { "epoch": 0.33766059056971814, "grad_norm": 0.06235990972901437, "learning_rate": 0.0002512945174994134, "loss": 0.9828, "step": 2510 }, { "epoch": 0.33833322122822357, "grad_norm": 0.06022070223153401, "learning_rate": 0.00025103443845702077, "loss": 1.0284, "step": 2515 }, { "epoch": 0.339005851886729, "grad_norm": 0.06135291469414431, "learning_rate": 0.0002507738022460776, "loss": 0.9375, "step": 2520 }, { "epoch": 0.3396784825452344, "grad_norm": 0.0643989781631382, "learning_rate": 0.0002505126103038989, "loss": 0.9855, "step": 2525 }, { "epoch": 0.34035111320373984, "grad_norm": 0.06378266187767027, "learning_rate": 0.000250250864070864, "loss": 0.9545, "step": 2530 }, { "epoch": 0.34102374386224527, "grad_norm": 0.0673159597514603, "learning_rate": 0.0002499885649904092, "loss": 1.0326, "step": 2535 }, { "epoch": 0.34169637452075063, "grad_norm": 0.06067930335234511, "learning_rate": 0.0002497257145090195, "loss": 0.9806, "step": 2540 }, { "epoch": 0.34236900517925606, "grad_norm": 0.059146219957598896, "learning_rate": 0.0002494623140762207, "loss": 0.9508, "step": 2545 }, { "epoch": 0.3430416358377615, "grad_norm": 0.06451800724463314, "learning_rate": 0.00024919836514457127, "loss": 0.9896, "step": 2550 }, { "epoch": 0.3437142664962669, "grad_norm": 0.06081402928938052, "learning_rate": 0.0002489338691696546, "loss": 0.9383, "step": 2555 }, { "epoch": 0.34438689715477233, "grad_norm": 0.06009484154209872, "learning_rate": 0.00024866882761007076, "loss": 0.9952, "step": 2560 }, { "epoch": 0.34505952781327776, "grad_norm": 0.05973548410026698, "learning_rate": 0.00024840324192742846, "loss": 0.9796, "step": 2565 }, { "epoch": 0.3457321584717831, "grad_norm": 0.06553152032293165, "learning_rate": 0.00024813711358633717, "loss": 0.9923, "step": 2570 }, { "epoch": 0.34640478913028855, "grad_norm": 0.06471026587362512, "learning_rate": 0.00024787044405439885, "loss": 1.0067, "step": 2575 }, { "epoch": 0.347077419788794, "grad_norm": 0.05757825937888166, "learning_rate": 0.0002476032348021999, "loss": 0.9595, "step": 2580 }, { "epoch": 0.3477500504472994, "grad_norm": 0.05837414362073145, "learning_rate": 0.0002473354873033033, "loss": 0.998, "step": 2585 }, { "epoch": 0.3484226811058048, "grad_norm": 0.061813392737187556, "learning_rate": 0.00024706720303423993, "loss": 0.967, "step": 2590 }, { "epoch": 0.3490953117643102, "grad_norm": 0.0634130054981169, "learning_rate": 0.000246798383474501, "loss": 0.9489, "step": 2595 }, { "epoch": 0.3497679424228156, "grad_norm": 0.06617696567513282, "learning_rate": 0.0002465290301065296, "loss": 0.9629, "step": 2600 }, { "epoch": 0.35044057308132104, "grad_norm": 0.060723179362491476, "learning_rate": 0.00024625914441571265, "loss": 0.9565, "step": 2605 }, { "epoch": 0.35111320373982646, "grad_norm": 0.0612745323214066, "learning_rate": 0.0002459887278903724, "loss": 0.9811, "step": 2610 }, { "epoch": 0.3517858343983319, "grad_norm": 0.0655966405158917, "learning_rate": 0.00024571778202175877, "loss": 0.9754, "step": 2615 }, { "epoch": 0.3524584650568373, "grad_norm": 0.06727151630166943, "learning_rate": 0.0002454463083040405, "loss": 0.9293, "step": 2620 }, { "epoch": 0.3531310957153427, "grad_norm": 0.06178442765486125, "learning_rate": 0.00024517430823429764, "loss": 1.0013, "step": 2625 }, { "epoch": 0.3538037263738481, "grad_norm": 0.06683142790810723, "learning_rate": 0.00024490178331251246, "loss": 0.9749, "step": 2630 }, { "epoch": 0.35447635703235353, "grad_norm": 0.06054586886009189, "learning_rate": 0.0002446287350415618, "loss": 0.9478, "step": 2635 }, { "epoch": 0.35514898769085895, "grad_norm": 0.05950647200965788, "learning_rate": 0.0002443551649272086, "loss": 0.9934, "step": 2640 }, { "epoch": 0.3558216183493644, "grad_norm": 0.0625836813067742, "learning_rate": 0.00024408107447809353, "loss": 0.9213, "step": 2645 }, { "epoch": 0.3564942490078698, "grad_norm": 0.05991503892499088, "learning_rate": 0.00024380646520572675, "loss": 0.9831, "step": 2650 }, { "epoch": 0.35716687966637517, "grad_norm": 0.05370777920413785, "learning_rate": 0.0002435313386244795, "loss": 0.9479, "step": 2655 }, { "epoch": 0.3578395103248806, "grad_norm": 0.06270981502346962, "learning_rate": 0.00024325569625157587, "loss": 0.9965, "step": 2660 }, { "epoch": 0.358512140983386, "grad_norm": 0.0613724918822416, "learning_rate": 0.00024297953960708416, "loss": 0.9595, "step": 2665 }, { "epoch": 0.35918477164189144, "grad_norm": 0.06038112429006151, "learning_rate": 0.00024270287021390898, "loss": 0.9411, "step": 2670 }, { "epoch": 0.35985740230039687, "grad_norm": 0.06111430494806048, "learning_rate": 0.00024242568959778236, "loss": 0.9649, "step": 2675 }, { "epoch": 0.3605300329589023, "grad_norm": 0.05995428682403505, "learning_rate": 0.0002421479992872556, "loss": 1.0257, "step": 2680 }, { "epoch": 0.36120266361740766, "grad_norm": 0.06396379465235519, "learning_rate": 0.0002418698008136908, "loss": 0.9543, "step": 2685 }, { "epoch": 0.3618752942759131, "grad_norm": 0.06323410022200342, "learning_rate": 0.00024159109571125236, "loss": 0.9651, "step": 2690 }, { "epoch": 0.3625479249344185, "grad_norm": 0.06167970738868046, "learning_rate": 0.00024131188551689852, "loss": 0.979, "step": 2695 }, { "epoch": 0.36322055559292393, "grad_norm": 0.06687815516918413, "learning_rate": 0.0002410321717703731, "loss": 0.9907, "step": 2700 }, { "epoch": 0.36389318625142936, "grad_norm": 0.05564819242650189, "learning_rate": 0.00024075195601419659, "loss": 0.9618, "step": 2705 }, { "epoch": 0.3645658169099348, "grad_norm": 0.05748296768430323, "learning_rate": 0.00024047123979365804, "loss": 0.9354, "step": 2710 }, { "epoch": 0.36523844756844015, "grad_norm": 0.0631764311365665, "learning_rate": 0.0002401900246568063, "loss": 0.9555, "step": 2715 }, { "epoch": 0.3659110782269456, "grad_norm": 0.05880985537442802, "learning_rate": 0.0002399083121544416, "loss": 0.9891, "step": 2720 }, { "epoch": 0.366583708885451, "grad_norm": 0.061391215929407515, "learning_rate": 0.00023962610384010706, "loss": 0.9924, "step": 2725 }, { "epoch": 0.3672563395439564, "grad_norm": 0.06316691200369454, "learning_rate": 0.0002393434012700798, "loss": 0.9731, "step": 2730 }, { "epoch": 0.36792897020246185, "grad_norm": 0.06400332296272057, "learning_rate": 0.00023906020600336273, "loss": 0.9717, "step": 2735 }, { "epoch": 0.3686016008609672, "grad_norm": 0.06081035933701875, "learning_rate": 0.0002387765196016758, "loss": 0.9974, "step": 2740 }, { "epoch": 0.36927423151947264, "grad_norm": 0.05692689703123599, "learning_rate": 0.0002384923436294474, "loss": 0.9584, "step": 2745 }, { "epoch": 0.36994686217797806, "grad_norm": 0.05920016044645724, "learning_rate": 0.00023820767965380567, "loss": 0.9977, "step": 2750 }, { "epoch": 0.3706194928364835, "grad_norm": 0.06108926211201016, "learning_rate": 0.0002379225292445699, "loss": 0.955, "step": 2755 }, { "epoch": 0.3712921234949889, "grad_norm": 0.060480343091332565, "learning_rate": 0.00023763689397424202, "loss": 0.9811, "step": 2760 }, { "epoch": 0.37196475415349434, "grad_norm": 0.06059292270073528, "learning_rate": 0.00023735077541799766, "loss": 0.9839, "step": 2765 }, { "epoch": 0.3726373848119997, "grad_norm": 0.0609211390213739, "learning_rate": 0.00023706417515367763, "loss": 1.0154, "step": 2770 }, { "epoch": 0.37331001547050513, "grad_norm": 0.058067385580532836, "learning_rate": 0.00023677709476177915, "loss": 0.9757, "step": 2775 }, { "epoch": 0.37398264612901055, "grad_norm": 0.07045648483439977, "learning_rate": 0.00023648953582544732, "loss": 1.0288, "step": 2780 }, { "epoch": 0.374655276787516, "grad_norm": 0.06288843858803263, "learning_rate": 0.00023620149993046612, "loss": 0.99, "step": 2785 }, { "epoch": 0.3753279074460214, "grad_norm": 0.058383628447802906, "learning_rate": 0.00023591298866524973, "loss": 0.9501, "step": 2790 }, { "epoch": 0.3760005381045268, "grad_norm": 0.06067878041524621, "learning_rate": 0.00023562400362083394, "loss": 1.0125, "step": 2795 }, { "epoch": 0.3766731687630322, "grad_norm": 0.05983019573461252, "learning_rate": 0.00023533454639086722, "loss": 0.9508, "step": 2800 }, { "epoch": 0.3773457994215376, "grad_norm": 0.06531517886917629, "learning_rate": 0.00023504461857160202, "loss": 1.0286, "step": 2805 }, { "epoch": 0.37801843008004304, "grad_norm": 0.05526918386931232, "learning_rate": 0.0002347542217618858, "loss": 0.9792, "step": 2810 }, { "epoch": 0.37869106073854847, "grad_norm": 0.06536819488016454, "learning_rate": 0.00023446335756315237, "loss": 0.9562, "step": 2815 }, { "epoch": 0.3793636913970539, "grad_norm": 0.05798352559617903, "learning_rate": 0.0002341720275794132, "loss": 0.9227, "step": 2820 }, { "epoch": 0.3800363220555593, "grad_norm": 0.065861968577201, "learning_rate": 0.00023388023341724815, "loss": 0.9576, "step": 2825 }, { "epoch": 0.3807089527140647, "grad_norm": 0.07654494776838282, "learning_rate": 0.00023358797668579704, "loss": 0.9635, "step": 2830 }, { "epoch": 0.3813815833725701, "grad_norm": 0.05681789783018427, "learning_rate": 0.00023329525899675043, "loss": 0.9992, "step": 2835 }, { "epoch": 0.38205421403107553, "grad_norm": 0.061087863865119885, "learning_rate": 0.00023300208196434105, "loss": 0.9922, "step": 2840 }, { "epoch": 0.38272684468958096, "grad_norm": 0.06490394468959859, "learning_rate": 0.00023270844720533468, "loss": 0.9434, "step": 2845 }, { "epoch": 0.3833994753480864, "grad_norm": 0.06590103407762701, "learning_rate": 0.0002324143563390212, "loss": 0.9662, "step": 2850 }, { "epoch": 0.3840721060065918, "grad_norm": 0.061039736014859715, "learning_rate": 0.00023211981098720592, "loss": 0.9156, "step": 2855 }, { "epoch": 0.3847447366650972, "grad_norm": 0.06153018865866922, "learning_rate": 0.00023182481277420048, "loss": 1.0125, "step": 2860 }, { "epoch": 0.3854173673236026, "grad_norm": 0.05678690696784472, "learning_rate": 0.00023152936332681363, "loss": 0.9962, "step": 2865 }, { "epoch": 0.386089997982108, "grad_norm": 0.05884783737931574, "learning_rate": 0.0002312334642743428, "loss": 0.9485, "step": 2870 }, { "epoch": 0.38676262864061345, "grad_norm": 0.06427259534288689, "learning_rate": 0.00023093711724856477, "loss": 0.9418, "step": 2875 }, { "epoch": 0.3874352592991189, "grad_norm": 0.06429957247409737, "learning_rate": 0.0002306403238837266, "loss": 0.9483, "step": 2880 }, { "epoch": 0.38810788995762424, "grad_norm": 0.062021118991297704, "learning_rate": 0.00023034308581653686, "loss": 0.9411, "step": 2885 }, { "epoch": 0.38878052061612967, "grad_norm": 0.06410320324942427, "learning_rate": 0.0002300454046861565, "loss": 0.9566, "step": 2890 }, { "epoch": 0.3894531512746351, "grad_norm": 0.06153174109726368, "learning_rate": 0.00022974728213418977, "loss": 0.9353, "step": 2895 }, { "epoch": 0.3901257819331405, "grad_norm": 0.06383339692853464, "learning_rate": 0.00022944871980467514, "loss": 0.9259, "step": 2900 }, { "epoch": 0.39079841259164594, "grad_norm": 0.06314785752506272, "learning_rate": 0.0002291497193440764, "loss": 0.9627, "step": 2905 }, { "epoch": 0.39147104325015136, "grad_norm": 0.060212140876222744, "learning_rate": 0.00022885028240127351, "loss": 1.0133, "step": 2910 }, { "epoch": 0.39214367390865673, "grad_norm": 0.06021180941945565, "learning_rate": 0.0002285504106275533, "loss": 0.9555, "step": 2915 }, { "epoch": 0.39281630456716216, "grad_norm": 0.05937678999742469, "learning_rate": 0.00022825010567660065, "loss": 0.9888, "step": 2920 }, { "epoch": 0.3934889352256676, "grad_norm": 0.06756328132509674, "learning_rate": 0.00022794936920448927, "loss": 0.9709, "step": 2925 }, { "epoch": 0.394161565884173, "grad_norm": 0.05770914377377636, "learning_rate": 0.0002276482028696725, "loss": 0.9196, "step": 2930 }, { "epoch": 0.39483419654267843, "grad_norm": 0.06259513664165894, "learning_rate": 0.00022734660833297426, "loss": 0.9551, "step": 2935 }, { "epoch": 0.39550682720118385, "grad_norm": 0.06495754463275584, "learning_rate": 0.00022704458725757975, "loss": 0.9212, "step": 2940 }, { "epoch": 0.3961794578596892, "grad_norm": 0.057088746499664306, "learning_rate": 0.0002267421413090266, "loss": 0.9333, "step": 2945 }, { "epoch": 0.39685208851819465, "grad_norm": 0.06493801528794958, "learning_rate": 0.0002264392721551952, "loss": 0.9824, "step": 2950 }, { "epoch": 0.39752471917670007, "grad_norm": 0.06280692484375891, "learning_rate": 0.00022613598146629992, "loss": 0.96, "step": 2955 }, { "epoch": 0.3981973498352055, "grad_norm": 0.0686225385390284, "learning_rate": 0.00022583227091487975, "loss": 0.9956, "step": 2960 }, { "epoch": 0.3988699804937109, "grad_norm": 0.06267398055791588, "learning_rate": 0.00022552814217578898, "loss": 0.9574, "step": 2965 }, { "epoch": 0.39954261115221634, "grad_norm": 0.05951574707871612, "learning_rate": 0.00022522359692618815, "loss": 0.9197, "step": 2970 }, { "epoch": 0.4002152418107217, "grad_norm": 0.06398428054392459, "learning_rate": 0.00022491863684553462, "loss": 1.0082, "step": 2975 }, { "epoch": 0.40088787246922714, "grad_norm": 0.0700256713940724, "learning_rate": 0.0002246132636155734, "loss": 0.9425, "step": 2980 }, { "epoch": 0.40156050312773256, "grad_norm": 0.06169910527354521, "learning_rate": 0.000224307478920328, "loss": 0.9251, "step": 2985 }, { "epoch": 0.402233133786238, "grad_norm": 0.05819021747648658, "learning_rate": 0.00022400128444609085, "loss": 1.0028, "step": 2990 }, { "epoch": 0.4029057644447434, "grad_norm": 0.060335882535438916, "learning_rate": 0.00022369468188141424, "loss": 0.955, "step": 2995 }, { "epoch": 0.40357839510324883, "grad_norm": 0.061545586889936825, "learning_rate": 0.00022338767291710091, "loss": 0.9493, "step": 3000 }, { "epoch": 0.4042510257617542, "grad_norm": 0.05466365110860337, "learning_rate": 0.0002230802592461948, "loss": 0.9248, "step": 3005 }, { "epoch": 0.4049236564202596, "grad_norm": 0.057841302052886304, "learning_rate": 0.00022277244256397157, "loss": 0.9261, "step": 3010 }, { "epoch": 0.40559628707876505, "grad_norm": 0.06424399123898437, "learning_rate": 0.00022246422456792948, "loss": 0.9429, "step": 3015 }, { "epoch": 0.4062689177372705, "grad_norm": 0.0552749410063343, "learning_rate": 0.00022215560695777967, "loss": 0.9515, "step": 3020 }, { "epoch": 0.4069415483957759, "grad_norm": 0.06265094401317216, "learning_rate": 0.00022184659143543724, "loss": 0.9341, "step": 3025 }, { "epoch": 0.40761417905428127, "grad_norm": 0.05546283825158401, "learning_rate": 0.00022153717970501148, "loss": 1.016, "step": 3030 }, { "epoch": 0.4082868097127867, "grad_norm": 0.06269519935456656, "learning_rate": 0.00022122737347279677, "loss": 0.9844, "step": 3035 }, { "epoch": 0.4089594403712921, "grad_norm": 0.06296574489581559, "learning_rate": 0.00022091717444726281, "loss": 0.9714, "step": 3040 }, { "epoch": 0.40963207102979754, "grad_norm": 0.09543369741779884, "learning_rate": 0.0002206065843390456, "loss": 0.9487, "step": 3045 }, { "epoch": 0.41030470168830296, "grad_norm": 0.0636053950815713, "learning_rate": 0.0002202956048609378, "loss": 0.9656, "step": 3050 }, { "epoch": 0.4109773323468084, "grad_norm": 0.05681442271993179, "learning_rate": 0.0002199842377278792, "loss": 0.9436, "step": 3055 }, { "epoch": 0.41164996300531376, "grad_norm": 0.05973400372173602, "learning_rate": 0.00021967248465694746, "loss": 0.9305, "step": 3060 }, { "epoch": 0.4123225936638192, "grad_norm": 0.06157069248280991, "learning_rate": 0.0002193603473673485, "loss": 0.8928, "step": 3065 }, { "epoch": 0.4129952243223246, "grad_norm": 0.05798487431085614, "learning_rate": 0.00021904782758040708, "loss": 0.9809, "step": 3070 }, { "epoch": 0.41366785498083003, "grad_norm": 0.06072960578084768, "learning_rate": 0.00021873492701955736, "loss": 0.9573, "step": 3075 }, { "epoch": 0.41434048563933545, "grad_norm": 0.05940975893380244, "learning_rate": 0.0002184216474103332, "loss": 0.9243, "step": 3080 }, { "epoch": 0.4150131162978409, "grad_norm": 0.054497510448308945, "learning_rate": 0.00021810799048035885, "loss": 0.9216, "step": 3085 }, { "epoch": 0.41568574695634625, "grad_norm": 0.08550065849959732, "learning_rate": 0.00021779395795933944, "loss": 0.9765, "step": 3090 }, { "epoch": 0.41635837761485167, "grad_norm": 0.059486110560569884, "learning_rate": 0.0002174795515790512, "loss": 0.9201, "step": 3095 }, { "epoch": 0.4170310082733571, "grad_norm": 0.05891019294903501, "learning_rate": 0.00021716477307333204, "loss": 0.9368, "step": 3100 }, { "epoch": 0.4177036389318625, "grad_norm": 0.06073346780088524, "learning_rate": 0.00021684962417807218, "loss": 0.9416, "step": 3105 }, { "epoch": 0.41837626959036794, "grad_norm": 0.06057976675059688, "learning_rate": 0.0002165341066312043, "loss": 0.9726, "step": 3110 }, { "epoch": 0.41904890024887337, "grad_norm": 0.06057164967765705, "learning_rate": 0.00021621822217269404, "loss": 0.9431, "step": 3115 }, { "epoch": 0.41972153090737874, "grad_norm": 0.058279122248688205, "learning_rate": 0.00021590197254453043, "loss": 0.908, "step": 3120 }, { "epoch": 0.42039416156588416, "grad_norm": 0.0568996491174471, "learning_rate": 0.00021558535949071632, "loss": 0.9896, "step": 3125 }, { "epoch": 0.4210667922243896, "grad_norm": 0.059337189805589746, "learning_rate": 0.00021526838475725875, "loss": 0.9769, "step": 3130 }, { "epoch": 0.421739422882895, "grad_norm": 0.05438214173444285, "learning_rate": 0.00021495105009215924, "loss": 0.9555, "step": 3135 }, { "epoch": 0.42241205354140043, "grad_norm": 0.06703375158787411, "learning_rate": 0.00021463335724540415, "loss": 0.9732, "step": 3140 }, { "epoch": 0.42308468419990586, "grad_norm": 0.062204374471896154, "learning_rate": 0.00021431530796895516, "loss": 0.9394, "step": 3145 }, { "epoch": 0.4237573148584112, "grad_norm": 0.05834404775925495, "learning_rate": 0.00021399690401673958, "loss": 0.9172, "step": 3150 }, { "epoch": 0.42442994551691665, "grad_norm": 0.05859617011162029, "learning_rate": 0.0002136781471446405, "loss": 0.915, "step": 3155 }, { "epoch": 0.4251025761754221, "grad_norm": 0.06096509769736175, "learning_rate": 0.0002133590391104873, "loss": 1.0003, "step": 3160 }, { "epoch": 0.4257752068339275, "grad_norm": 0.0598832189207119, "learning_rate": 0.00021303958167404594, "loss": 0.9468, "step": 3165 }, { "epoch": 0.4264478374924329, "grad_norm": 0.08446921453288947, "learning_rate": 0.00021271977659700916, "loss": 0.9523, "step": 3170 }, { "epoch": 0.4271204681509383, "grad_norm": 0.06424965869527009, "learning_rate": 0.00021239962564298674, "loss": 0.978, "step": 3175 }, { "epoch": 0.4277930988094437, "grad_norm": 0.05617639752428042, "learning_rate": 0.00021207913057749603, "loss": 0.9014, "step": 3180 }, { "epoch": 0.42846572946794914, "grad_norm": 0.059893940523906626, "learning_rate": 0.00021175829316795182, "loss": 0.9579, "step": 3185 }, { "epoch": 0.42913836012645457, "grad_norm": 0.06181936765038198, "learning_rate": 0.00021143711518365694, "loss": 0.9371, "step": 3190 }, { "epoch": 0.42981099078496, "grad_norm": 0.06379488040498672, "learning_rate": 0.00021111559839579236, "loss": 0.9499, "step": 3195 }, { "epoch": 0.4304836214434654, "grad_norm": 0.06857337809733563, "learning_rate": 0.00021079374457740735, "loss": 0.9653, "step": 3200 }, { "epoch": 0.4311562521019708, "grad_norm": 0.05352630790450643, "learning_rate": 0.0002104715555034099, "loss": 0.9247, "step": 3205 }, { "epoch": 0.4318288827604762, "grad_norm": 0.06406514593002467, "learning_rate": 0.0002101490329505567, "loss": 0.9402, "step": 3210 }, { "epoch": 0.43250151341898163, "grad_norm": 0.066879962816942, "learning_rate": 0.00020982617869744354, "loss": 0.9366, "step": 3215 }, { "epoch": 0.43317414407748706, "grad_norm": 0.05966424809748197, "learning_rate": 0.00020950299452449534, "loss": 1.0069, "step": 3220 }, { "epoch": 0.4338467747359925, "grad_norm": 0.06577677126311732, "learning_rate": 0.0002091794822139565, "loss": 0.9677, "step": 3225 }, { "epoch": 0.4345194053944979, "grad_norm": 0.06266553082204902, "learning_rate": 0.00020885564354988084, "loss": 0.9236, "step": 3230 }, { "epoch": 0.4351920360530033, "grad_norm": 0.05941360402735012, "learning_rate": 0.0002085314803181221, "loss": 0.9181, "step": 3235 }, { "epoch": 0.4358646667115087, "grad_norm": 0.06190268094887974, "learning_rate": 0.00020820699430632375, "loss": 0.9551, "step": 3240 }, { "epoch": 0.4365372973700141, "grad_norm": 0.06056805532726624, "learning_rate": 0.00020788218730390933, "loss": 0.8976, "step": 3245 }, { "epoch": 0.43720992802851955, "grad_norm": 0.06124181758108306, "learning_rate": 0.00020755706110207246, "loss": 0.8889, "step": 3250 }, { "epoch": 0.43788255868702497, "grad_norm": 0.06113599870511463, "learning_rate": 0.0002072316174937671, "loss": 0.9618, "step": 3255 }, { "epoch": 0.4385551893455304, "grad_norm": 0.06416322533929952, "learning_rate": 0.0002069058582736976, "loss": 1.0092, "step": 3260 }, { "epoch": 0.43922782000403576, "grad_norm": 0.05978927844116775, "learning_rate": 0.00020657978523830876, "loss": 0.9675, "step": 3265 }, { "epoch": 0.4399004506625412, "grad_norm": 0.062194118475185425, "learning_rate": 0.00020625340018577592, "loss": 0.9499, "step": 3270 }, { "epoch": 0.4405730813210466, "grad_norm": 0.05696297738645936, "learning_rate": 0.00020592670491599522, "loss": 0.9493, "step": 3275 }, { "epoch": 0.44124571197955204, "grad_norm": 0.05793758859996811, "learning_rate": 0.00020559970123057339, "loss": 0.935, "step": 3280 }, { "epoch": 0.44191834263805746, "grad_norm": 0.05837746992606914, "learning_rate": 0.000205272390932818, "loss": 0.8965, "step": 3285 }, { "epoch": 0.4425909732965629, "grad_norm": 0.058159737597925955, "learning_rate": 0.0002049447758277275, "loss": 0.9659, "step": 3290 }, { "epoch": 0.44326360395506825, "grad_norm": 0.05827695355235013, "learning_rate": 0.0002046168577219813, "loss": 0.9865, "step": 3295 }, { "epoch": 0.4439362346135737, "grad_norm": 0.05878679479381314, "learning_rate": 0.00020428863842392961, "loss": 0.9387, "step": 3300 }, { "epoch": 0.4446088652720791, "grad_norm": 0.05644539332794492, "learning_rate": 0.0002039601197435837, "loss": 0.9081, "step": 3305 }, { "epoch": 0.4452814959305845, "grad_norm": 0.06743915523773947, "learning_rate": 0.00020363130349260585, "loss": 0.9593, "step": 3310 }, { "epoch": 0.44595412658908995, "grad_norm": 0.05757563715919714, "learning_rate": 0.00020330219148429927, "loss": 0.8999, "step": 3315 }, { "epoch": 0.4466267572475953, "grad_norm": 0.05965160618003564, "learning_rate": 0.00020297278553359812, "loss": 0.9624, "step": 3320 }, { "epoch": 0.44729938790610074, "grad_norm": 0.06549241702388957, "learning_rate": 0.0002026430874570577, "loss": 0.9429, "step": 3325 }, { "epoch": 0.44797201856460617, "grad_norm": 0.06099252478931159, "learning_rate": 0.0002023130990728442, "loss": 0.9091, "step": 3330 }, { "epoch": 0.4486446492231116, "grad_norm": 0.06214228049542274, "learning_rate": 0.0002019828222007247, "loss": 0.9672, "step": 3335 }, { "epoch": 0.449317279881617, "grad_norm": 0.06099156836145796, "learning_rate": 0.0002016522586620572, "loss": 0.9447, "step": 3340 }, { "epoch": 0.44998991054012244, "grad_norm": 0.05582309593148562, "learning_rate": 0.0002013214102797807, "loss": 0.892, "step": 3345 }, { "epoch": 0.4506625411986278, "grad_norm": 0.06172945054743475, "learning_rate": 0.0002009902788784049, "loss": 0.9256, "step": 3350 }, { "epoch": 0.45133517185713323, "grad_norm": 0.0703038249721671, "learning_rate": 0.00020065886628400012, "loss": 0.9613, "step": 3355 }, { "epoch": 0.45200780251563866, "grad_norm": 0.05656485171471553, "learning_rate": 0.0002003271743241876, "loss": 0.994, "step": 3360 }, { "epoch": 0.4526804331741441, "grad_norm": 0.06050217278714377, "learning_rate": 0.00019999520482812905, "loss": 0.9483, "step": 3365 }, { "epoch": 0.4533530638326495, "grad_norm": 0.05935441754881465, "learning_rate": 0.00019966295962651676, "loss": 0.9528, "step": 3370 }, { "epoch": 0.45402569449115493, "grad_norm": 0.06499672588065591, "learning_rate": 0.0001993304405515633, "loss": 0.9232, "step": 3375 }, { "epoch": 0.4546983251496603, "grad_norm": 0.06326505765450459, "learning_rate": 0.00019899764943699167, "loss": 0.924, "step": 3380 }, { "epoch": 0.4553709558081657, "grad_norm": 0.06670527494401302, "learning_rate": 0.00019866458811802513, "loss": 0.9689, "step": 3385 }, { "epoch": 0.45604358646667115, "grad_norm": 0.05756123229797278, "learning_rate": 0.00019833125843137685, "loss": 0.9486, "step": 3390 }, { "epoch": 0.45671621712517657, "grad_norm": 0.057811629402424236, "learning_rate": 0.00019799766221524002, "loss": 0.9232, "step": 3395 }, { "epoch": 0.457388847783682, "grad_norm": 0.0574321256942919, "learning_rate": 0.00019766380130927772, "loss": 0.899, "step": 3400 }, { "epoch": 0.4580614784421874, "grad_norm": 0.05998915305723974, "learning_rate": 0.00019732967755461264, "loss": 0.9645, "step": 3405 }, { "epoch": 0.4587341091006928, "grad_norm": 0.053707341836902525, "learning_rate": 0.00019699529279381688, "loss": 0.9359, "step": 3410 }, { "epoch": 0.4594067397591982, "grad_norm": 0.062177399130418344, "learning_rate": 0.0001966606488709022, "loss": 0.9692, "step": 3415 }, { "epoch": 0.46007937041770364, "grad_norm": 0.06399892597430598, "learning_rate": 0.00019632574763130914, "loss": 0.9866, "step": 3420 }, { "epoch": 0.46075200107620906, "grad_norm": 0.057829963118091195, "learning_rate": 0.0001959905909218976, "loss": 0.9061, "step": 3425 }, { "epoch": 0.4614246317347145, "grad_norm": 0.05680345513459258, "learning_rate": 0.00019565518059093607, "loss": 0.8951, "step": 3430 }, { "epoch": 0.4620972623932199, "grad_norm": 0.057098569579280174, "learning_rate": 0.00019531951848809177, "loss": 0.9231, "step": 3435 }, { "epoch": 0.4627698930517253, "grad_norm": 0.057724603331952985, "learning_rate": 0.0001949836064644204, "loss": 0.8883, "step": 3440 }, { "epoch": 0.4634425237102307, "grad_norm": 0.07527210764389476, "learning_rate": 0.0001946474463723558, "loss": 0.9226, "step": 3445 }, { "epoch": 0.4641151543687361, "grad_norm": 0.06129756698317641, "learning_rate": 0.00019431104006569977, "loss": 0.9479, "step": 3450 }, { "epoch": 0.46478778502724155, "grad_norm": 0.05622430370986196, "learning_rate": 0.00019397438939961208, "loss": 0.9239, "step": 3455 }, { "epoch": 0.465460415685747, "grad_norm": 0.062249854993252765, "learning_rate": 0.00019363749623059985, "loss": 0.9662, "step": 3460 }, { "epoch": 0.46613304634425234, "grad_norm": 0.05761845543442503, "learning_rate": 0.00019330036241650768, "loss": 0.9294, "step": 3465 }, { "epoch": 0.46680567700275777, "grad_norm": 0.10323423472788318, "learning_rate": 0.0001929629898165071, "loss": 0.9805, "step": 3470 }, { "epoch": 0.4674783076612632, "grad_norm": 0.06118143312492509, "learning_rate": 0.00019262538029108663, "loss": 0.9449, "step": 3475 }, { "epoch": 0.4681509383197686, "grad_norm": 0.06057337443243259, "learning_rate": 0.00019228753570204113, "loss": 0.8763, "step": 3480 }, { "epoch": 0.46882356897827404, "grad_norm": 0.06086236617653455, "learning_rate": 0.00019194945791246192, "loss": 0.8968, "step": 3485 }, { "epoch": 0.46949619963677947, "grad_norm": 0.059555741757573326, "learning_rate": 0.00019161114878672635, "loss": 0.9706, "step": 3490 }, { "epoch": 0.47016883029528483, "grad_norm": 0.058668845924034846, "learning_rate": 0.0001912726101904873, "loss": 0.921, "step": 3495 }, { "epoch": 0.47084146095379026, "grad_norm": 0.058784578173225625, "learning_rate": 0.0001909338439906633, "loss": 0.9497, "step": 3500 }, { "epoch": 0.4715140916122957, "grad_norm": 0.05866558280501008, "learning_rate": 0.00019059485205542802, "loss": 0.9753, "step": 3505 }, { "epoch": 0.4721867222708011, "grad_norm": 0.05491405268934744, "learning_rate": 0.0001902556362541999, "loss": 0.943, "step": 3510 }, { "epoch": 0.47285935292930653, "grad_norm": 0.059976784247428534, "learning_rate": 0.0001899161984576319, "loss": 0.9678, "step": 3515 }, { "epoch": 0.47353198358781196, "grad_norm": 0.058020248226877215, "learning_rate": 0.00018957654053760128, "loss": 0.9183, "step": 3520 }, { "epoch": 0.4742046142463173, "grad_norm": 0.06000157115028833, "learning_rate": 0.00018923666436719918, "loss": 0.9166, "step": 3525 }, { "epoch": 0.47487724490482275, "grad_norm": 0.05711578525715549, "learning_rate": 0.0001888965718207204, "loss": 0.8866, "step": 3530 }, { "epoch": 0.4755498755633282, "grad_norm": 0.05726576470343737, "learning_rate": 0.0001885562647736527, "loss": 0.9193, "step": 3535 }, { "epoch": 0.4762225062218336, "grad_norm": 0.056688406578091884, "learning_rate": 0.00018821574510266702, "loss": 0.9215, "step": 3540 }, { "epoch": 0.476895136880339, "grad_norm": 0.05845990276691751, "learning_rate": 0.0001878750146856067, "loss": 0.9221, "step": 3545 }, { "epoch": 0.47756776753884445, "grad_norm": 0.0635662486938678, "learning_rate": 0.00018753407540147743, "loss": 0.8874, "step": 3550 }, { "epoch": 0.4782403981973498, "grad_norm": 0.056415473310414646, "learning_rate": 0.00018719292913043644, "loss": 0.9473, "step": 3555 }, { "epoch": 0.47891302885585524, "grad_norm": 0.05832846701805165, "learning_rate": 0.0001868515777537827, "loss": 1.0012, "step": 3560 }, { "epoch": 0.47958565951436066, "grad_norm": 0.05842768032767335, "learning_rate": 0.00018651002315394607, "loss": 0.9591, "step": 3565 }, { "epoch": 0.4802582901728661, "grad_norm": 0.05949799045924495, "learning_rate": 0.0001861682672144773, "loss": 0.9184, "step": 3570 }, { "epoch": 0.4809309208313715, "grad_norm": 0.05882141302826352, "learning_rate": 0.0001858263118200372, "loss": 0.8963, "step": 3575 }, { "epoch": 0.48160355148987694, "grad_norm": 0.05878082425498393, "learning_rate": 0.0001854841588563868, "loss": 0.9397, "step": 3580 }, { "epoch": 0.4822761821483823, "grad_norm": 0.06226290012513929, "learning_rate": 0.00018514181021037638, "loss": 0.916, "step": 3585 }, { "epoch": 0.48294881280688773, "grad_norm": 0.056647741034773165, "learning_rate": 0.0001847992677699355, "loss": 0.9266, "step": 3590 }, { "epoch": 0.48362144346539315, "grad_norm": 0.05797267421634493, "learning_rate": 0.0001844565334240624, "loss": 0.9084, "step": 3595 }, { "epoch": 0.4842940741238986, "grad_norm": 0.05974632387924615, "learning_rate": 0.00018411360906281363, "loss": 0.9442, "step": 3600 }, { "epoch": 0.484966704782404, "grad_norm": 0.05888007684789391, "learning_rate": 0.00018377049657729348, "loss": 0.9507, "step": 3605 }, { "epoch": 0.48563933544090937, "grad_norm": 0.06275261115925915, "learning_rate": 0.00018342719785964382, "loss": 0.9249, "step": 3610 }, { "epoch": 0.4863119660994148, "grad_norm": 0.05780095763360672, "learning_rate": 0.00018308371480303348, "loss": 0.9342, "step": 3615 }, { "epoch": 0.4869845967579202, "grad_norm": 0.059559111340805826, "learning_rate": 0.00018274004930164786, "loss": 0.9419, "step": 3620 }, { "epoch": 0.48765722741642564, "grad_norm": 0.0719883639093611, "learning_rate": 0.00018239620325067842, "loss": 0.9123, "step": 3625 }, { "epoch": 0.48832985807493107, "grad_norm": 0.05652059169463742, "learning_rate": 0.00018205217854631232, "loss": 0.8824, "step": 3630 }, { "epoch": 0.4890024887334365, "grad_norm": 0.05780157013574382, "learning_rate": 0.00018170797708572204, "loss": 0.9345, "step": 3635 }, { "epoch": 0.48967511939194186, "grad_norm": 0.05728822335624413, "learning_rate": 0.00018136360076705463, "loss": 0.8981, "step": 3640 }, { "epoch": 0.4903477500504473, "grad_norm": 0.06019083234834377, "learning_rate": 0.0001810190514894214, "loss": 0.889, "step": 3645 }, { "epoch": 0.4910203807089527, "grad_norm": 0.05914213146336189, "learning_rate": 0.00018067433115288774, "loss": 0.9162, "step": 3650 }, { "epoch": 0.49169301136745813, "grad_norm": 0.05671214044848169, "learning_rate": 0.0001803294416584621, "loss": 0.9193, "step": 3655 }, { "epoch": 0.49236564202596356, "grad_norm": 0.05880394037256467, "learning_rate": 0.00017998438490808588, "loss": 0.9218, "step": 3660 }, { "epoch": 0.493038272684469, "grad_norm": 0.052818215027820234, "learning_rate": 0.00017963916280462275, "loss": 0.8766, "step": 3665 }, { "epoch": 0.49371090334297435, "grad_norm": 0.055708114418970923, "learning_rate": 0.00017929377725184843, "loss": 0.9543, "step": 3670 }, { "epoch": 0.4943835340014798, "grad_norm": 0.06351901627480289, "learning_rate": 0.0001789482301544398, "loss": 0.9742, "step": 3675 }, { "epoch": 0.4950561646599852, "grad_norm": 0.05461654119012564, "learning_rate": 0.00017860252341796474, "loss": 0.8991, "step": 3680 }, { "epoch": 0.4957287953184906, "grad_norm": 0.056349511534200064, "learning_rate": 0.0001782566589488714, "loss": 0.9212, "step": 3685 }, { "epoch": 0.49640142597699605, "grad_norm": 0.05485006315481513, "learning_rate": 0.0001779106386544778, "loss": 0.9113, "step": 3690 }, { "epoch": 0.49707405663550147, "grad_norm": 0.05974053898813881, "learning_rate": 0.00017756446444296129, "loss": 0.9165, "step": 3695 }, { "epoch": 0.49774668729400684, "grad_norm": 0.05854153642359085, "learning_rate": 0.000177218138223348, "loss": 0.8972, "step": 3700 }, { "epoch": 0.49841931795251226, "grad_norm": 0.06770077007123941, "learning_rate": 0.00017687166190550233, "loss": 0.8938, "step": 3705 }, { "epoch": 0.4990919486110177, "grad_norm": 0.06254808010672673, "learning_rate": 0.00017652503740011642, "loss": 0.9793, "step": 3710 }, { "epoch": 0.4997645792695231, "grad_norm": 0.06010377767315021, "learning_rate": 0.00017617826661869967, "loss": 0.9486, "step": 3715 }, { "epoch": 0.5004372099280285, "grad_norm": 0.05446285095195939, "learning_rate": 0.000175831351473568, "loss": 0.8987, "step": 3720 }, { "epoch": 0.501109840586534, "grad_norm": 0.05784448976997943, "learning_rate": 0.00017548429387783358, "loss": 0.9282, "step": 3725 }, { "epoch": 0.5017824712450394, "grad_norm": 0.054802892079251174, "learning_rate": 0.00017513709574539408, "loss": 0.899, "step": 3730 }, { "epoch": 0.5024551019035448, "grad_norm": 0.061133507811646316, "learning_rate": 0.00017478975899092218, "loss": 0.9002, "step": 3735 }, { "epoch": 0.5031277325620501, "grad_norm": 0.057132215529536755, "learning_rate": 0.00017444228552985504, "loss": 0.9633, "step": 3740 }, { "epoch": 0.5038003632205555, "grad_norm": 0.05560545701980792, "learning_rate": 0.00017409467727838368, "loss": 0.924, "step": 3745 }, { "epoch": 0.504472993879061, "grad_norm": 0.054372307623308136, "learning_rate": 0.00017374693615344243, "loss": 0.9158, "step": 3750 }, { "epoch": 0.5051456245375664, "grad_norm": 0.06565578023323308, "learning_rate": 0.00017339906407269845, "loss": 0.8696, "step": 3755 }, { "epoch": 0.5058182551960718, "grad_norm": 0.06099191037998514, "learning_rate": 0.00017305106295454096, "loss": 0.9452, "step": 3760 }, { "epoch": 0.5064908858545772, "grad_norm": 0.056830514265882634, "learning_rate": 0.0001727029347180708, "loss": 0.8809, "step": 3765 }, { "epoch": 0.5071635165130827, "grad_norm": 0.056703302827126156, "learning_rate": 0.00017235468128308994, "loss": 0.9237, "step": 3770 }, { "epoch": 0.5078361471715881, "grad_norm": 0.07203208057997011, "learning_rate": 0.00017200630457009066, "loss": 0.9571, "step": 3775 }, { "epoch": 0.5085087778300935, "grad_norm": 0.057478081242506765, "learning_rate": 0.00017165780650024503, "loss": 0.8937, "step": 3780 }, { "epoch": 0.5091814084885989, "grad_norm": 0.053243619421471995, "learning_rate": 0.00017130918899539447, "loss": 0.8609, "step": 3785 }, { "epoch": 0.5098540391471044, "grad_norm": 0.06516652067823515, "learning_rate": 0.00017096045397803903, "loss": 0.8968, "step": 3790 }, { "epoch": 0.5105266698056098, "grad_norm": 0.05203291222802004, "learning_rate": 0.00017061160337132673, "loss": 0.9189, "step": 3795 }, { "epoch": 0.5111993004641151, "grad_norm": 0.06306295515600813, "learning_rate": 0.00017026263909904307, "loss": 0.9705, "step": 3800 }, { "epoch": 0.5118719311226205, "grad_norm": 0.05615502764015949, "learning_rate": 0.0001699135630856004, "loss": 0.8814, "step": 3805 }, { "epoch": 0.512544561781126, "grad_norm": 0.0560452705368292, "learning_rate": 0.00016956437725602715, "loss": 0.9215, "step": 3810 }, { "epoch": 0.5132171924396314, "grad_norm": 0.05568518863652634, "learning_rate": 0.0001692150835359576, "loss": 0.9557, "step": 3815 }, { "epoch": 0.5138898230981368, "grad_norm": 0.06054613780853645, "learning_rate": 0.00016886568385162073, "loss": 0.9322, "step": 3820 }, { "epoch": 0.5145624537566422, "grad_norm": 0.052409857268644666, "learning_rate": 0.00016851618012983, "loss": 0.9368, "step": 3825 }, { "epoch": 0.5152350844151476, "grad_norm": 0.05444283087401524, "learning_rate": 0.00016816657429797262, "loss": 0.9647, "step": 3830 }, { "epoch": 0.5159077150736531, "grad_norm": 0.06452949924040549, "learning_rate": 0.00016781686828399897, "loss": 0.9436, "step": 3835 }, { "epoch": 0.5165803457321585, "grad_norm": 0.05645688357294275, "learning_rate": 0.00016746706401641167, "loss": 0.8871, "step": 3840 }, { "epoch": 0.5172529763906639, "grad_norm": 0.05810377319641876, "learning_rate": 0.00016711716342425538, "loss": 0.9476, "step": 3845 }, { "epoch": 0.5179256070491693, "grad_norm": 0.05461149675976878, "learning_rate": 0.00016676716843710583, "loss": 0.8799, "step": 3850 }, { "epoch": 0.5185982377076748, "grad_norm": 0.0564223024642207, "learning_rate": 0.00016641708098505943, "loss": 0.9193, "step": 3855 }, { "epoch": 0.5192708683661801, "grad_norm": 0.06066172101050625, "learning_rate": 0.00016606690299872238, "loss": 0.9004, "step": 3860 }, { "epoch": 0.5199434990246855, "grad_norm": 0.05611140658907044, "learning_rate": 0.00016571663640920013, "loss": 0.9337, "step": 3865 }, { "epoch": 0.5206161296831909, "grad_norm": 0.057689031864237836, "learning_rate": 0.00016536628314808697, "loss": 0.9112, "step": 3870 }, { "epoch": 0.5212887603416964, "grad_norm": 0.055731587959152395, "learning_rate": 0.0001650158451474549, "loss": 0.898, "step": 3875 }, { "epoch": 0.5219613910002018, "grad_norm": 0.06201233139255153, "learning_rate": 0.0001646653243398433, "loss": 0.9145, "step": 3880 }, { "epoch": 0.5226340216587072, "grad_norm": 0.06107927805244479, "learning_rate": 0.00016431472265824814, "loss": 0.9161, "step": 3885 }, { "epoch": 0.5233066523172126, "grad_norm": 0.0661283923149189, "learning_rate": 0.00016396404203611166, "loss": 0.9115, "step": 3890 }, { "epoch": 0.523979282975718, "grad_norm": 0.0604974766358996, "learning_rate": 0.00016361328440731113, "loss": 0.9455, "step": 3895 }, { "epoch": 0.5246519136342235, "grad_norm": 0.058113713123233206, "learning_rate": 0.00016326245170614854, "loss": 0.8781, "step": 3900 }, { "epoch": 0.5253245442927289, "grad_norm": 0.06015481165291708, "learning_rate": 0.00016291154586733998, "loss": 0.8848, "step": 3905 }, { "epoch": 0.5259971749512343, "grad_norm": 0.0559671255985385, "learning_rate": 0.00016256056882600476, "loss": 0.9371, "step": 3910 }, { "epoch": 0.5266698056097396, "grad_norm": 0.056837421939094905, "learning_rate": 0.00016220952251765492, "loss": 0.85, "step": 3915 }, { "epoch": 0.5273424362682451, "grad_norm": 0.05168059406474343, "learning_rate": 0.00016185840887818445, "loss": 0.9226, "step": 3920 }, { "epoch": 0.5280150669267505, "grad_norm": 0.05857810372921838, "learning_rate": 0.00016150722984385865, "loss": 0.9395, "step": 3925 }, { "epoch": 0.5286876975852559, "grad_norm": 0.05722621782286285, "learning_rate": 0.00016115598735130343, "loss": 0.9425, "step": 3930 }, { "epoch": 0.5293603282437613, "grad_norm": 0.059103865719039965, "learning_rate": 0.00016080468333749478, "loss": 0.8452, "step": 3935 }, { "epoch": 0.5300329589022668, "grad_norm": 0.05555055554042676, "learning_rate": 0.00016045331973974766, "loss": 0.9027, "step": 3940 }, { "epoch": 0.5307055895607722, "grad_norm": 0.052513345803769935, "learning_rate": 0.00016010189849570595, "loss": 0.8805, "step": 3945 }, { "epoch": 0.5313782202192776, "grad_norm": 0.06037608421554789, "learning_rate": 0.00015975042154333125, "loss": 0.937, "step": 3950 }, { "epoch": 0.532050850877783, "grad_norm": 0.056481515391140207, "learning_rate": 0.0001593988908208924, "loss": 0.943, "step": 3955 }, { "epoch": 0.5327234815362885, "grad_norm": 0.06184729512471909, "learning_rate": 0.00015904730826695474, "loss": 0.9505, "step": 3960 }, { "epoch": 0.5333961121947939, "grad_norm": 0.06127002318323847, "learning_rate": 0.00015869567582036946, "loss": 0.8742, "step": 3965 }, { "epoch": 0.5340687428532993, "grad_norm": 0.06185433303343996, "learning_rate": 0.00015834399542026298, "loss": 0.9058, "step": 3970 }, { "epoch": 0.5347413735118046, "grad_norm": 0.06027891444038201, "learning_rate": 0.00015799226900602598, "loss": 0.8915, "step": 3975 }, { "epoch": 0.53541400417031, "grad_norm": 0.057040976959674576, "learning_rate": 0.00015764049851730306, "loss": 0.9281, "step": 3980 }, { "epoch": 0.5360866348288155, "grad_norm": 0.07354905892891744, "learning_rate": 0.00015728868589398178, "loss": 0.8798, "step": 3985 }, { "epoch": 0.5367592654873209, "grad_norm": 0.053310299991943276, "learning_rate": 0.0001569368330761821, "loss": 0.8899, "step": 3990 }, { "epoch": 0.5374318961458263, "grad_norm": 0.0628036632961106, "learning_rate": 0.0001565849420042456, "loss": 0.9191, "step": 3995 }, { "epoch": 0.5381045268043317, "grad_norm": 0.05421995436525058, "learning_rate": 0.00015623301461872488, "loss": 0.9471, "step": 4000 }, { "epoch": 0.5387771574628372, "grad_norm": 0.052719478039766475, "learning_rate": 0.00015588105286037276, "loss": 0.8718, "step": 4005 }, { "epoch": 0.5394497881213426, "grad_norm": 0.0544131184510201, "learning_rate": 0.00015552905867013156, "loss": 0.9153, "step": 4010 }, { "epoch": 0.540122418779848, "grad_norm": 0.055134853031770285, "learning_rate": 0.00015517703398912255, "loss": 0.8935, "step": 4015 }, { "epoch": 0.5407950494383534, "grad_norm": 0.06099844314567582, "learning_rate": 0.00015482498075863513, "loss": 0.8387, "step": 4020 }, { "epoch": 0.5414676800968589, "grad_norm": 0.06712533260751871, "learning_rate": 0.00015447290092011602, "loss": 0.939, "step": 4025 }, { "epoch": 0.5421403107553642, "grad_norm": 0.05720276550328766, "learning_rate": 0.00015412079641515878, "loss": 0.8824, "step": 4030 }, { "epoch": 0.5428129414138696, "grad_norm": 0.06011803041726182, "learning_rate": 0.00015376866918549308, "loss": 0.9513, "step": 4035 }, { "epoch": 0.543485572072375, "grad_norm": 0.059287881649759205, "learning_rate": 0.00015341652117297372, "loss": 0.9054, "step": 4040 }, { "epoch": 0.5441582027308804, "grad_norm": 0.05299022648963555, "learning_rate": 0.0001530643543195702, "loss": 0.9335, "step": 4045 }, { "epoch": 0.5448308333893859, "grad_norm": 0.054676871019681765, "learning_rate": 0.00015271217056735592, "loss": 0.8307, "step": 4050 }, { "epoch": 0.5455034640478913, "grad_norm": 0.05936475254102487, "learning_rate": 0.00015235997185849754, "loss": 0.9142, "step": 4055 }, { "epoch": 0.5461760947063967, "grad_norm": 0.05743243525838816, "learning_rate": 0.00015200776013524404, "loss": 0.8874, "step": 4060 }, { "epoch": 0.5468487253649021, "grad_norm": 0.056156997238234564, "learning_rate": 0.0001516555373399162, "loss": 0.867, "step": 4065 }, { "epoch": 0.5475213560234076, "grad_norm": 0.054258410598457395, "learning_rate": 0.0001513033054148961, "loss": 0.898, "step": 4070 }, { "epoch": 0.548193986681913, "grad_norm": 0.053619280218852455, "learning_rate": 0.00015095106630261593, "loss": 0.9184, "step": 4075 }, { "epoch": 0.5488666173404184, "grad_norm": 0.05479069536605181, "learning_rate": 0.0001505988219455475, "loss": 0.848, "step": 4080 }, { "epoch": 0.5495392479989238, "grad_norm": 0.04972042649918088, "learning_rate": 0.00015024657428619156, "loss": 0.9102, "step": 4085 }, { "epoch": 0.5502118786574292, "grad_norm": 0.0537833564245492, "learning_rate": 0.00014989432526706735, "loss": 0.8817, "step": 4090 }, { "epoch": 0.5508845093159346, "grad_norm": 0.05831550073111491, "learning_rate": 0.00014954207683070116, "loss": 0.8746, "step": 4095 }, { "epoch": 0.55155713997444, "grad_norm": 0.05715884422720536, "learning_rate": 0.00014918983091961638, "loss": 0.8898, "step": 4100 }, { "epoch": 0.5522297706329454, "grad_norm": 0.05902152296123058, "learning_rate": 0.0001488375894763224, "loss": 0.9082, "step": 4105 }, { "epoch": 0.5529024012914509, "grad_norm": 0.058375481662671105, "learning_rate": 0.0001484853544433039, "loss": 0.8988, "step": 4110 }, { "epoch": 0.5535750319499563, "grad_norm": 0.05339178022779875, "learning_rate": 0.0001481331277630103, "loss": 0.8585, "step": 4115 }, { "epoch": 0.5542476626084617, "grad_norm": 0.05713255377942084, "learning_rate": 0.00014778091137784493, "loss": 0.8841, "step": 4120 }, { "epoch": 0.5549202932669671, "grad_norm": 0.05830955458693526, "learning_rate": 0.00014742870723015433, "loss": 0.828, "step": 4125 }, { "epoch": 0.5555929239254725, "grad_norm": 0.06080721903708676, "learning_rate": 0.0001470765172622176, "loss": 0.8978, "step": 4130 }, { "epoch": 0.556265554583978, "grad_norm": 0.05958603835036692, "learning_rate": 0.00014672434341623549, "loss": 0.9092, "step": 4135 }, { "epoch": 0.5569381852424834, "grad_norm": 0.05825095975769997, "learning_rate": 0.00014637218763432003, "loss": 0.8763, "step": 4140 }, { "epoch": 0.5576108159009888, "grad_norm": 0.0587468065652045, "learning_rate": 0.00014602005185848364, "loss": 0.8766, "step": 4145 }, { "epoch": 0.5582834465594941, "grad_norm": 0.056558891133557256, "learning_rate": 0.00014566793803062823, "loss": 0.9658, "step": 4150 }, { "epoch": 0.5589560772179996, "grad_norm": 0.05207206688489681, "learning_rate": 0.0001453158480925348, "loss": 0.9154, "step": 4155 }, { "epoch": 0.559628707876505, "grad_norm": 0.06268612193351983, "learning_rate": 0.00014496378398585262, "loss": 0.8945, "step": 4160 }, { "epoch": 0.5603013385350104, "grad_norm": 0.055810120355651985, "learning_rate": 0.00014461174765208843, "loss": 0.8514, "step": 4165 }, { "epoch": 0.5609739691935158, "grad_norm": 0.06012035951220586, "learning_rate": 0.00014425974103259592, "loss": 0.9309, "step": 4170 }, { "epoch": 0.5616465998520213, "grad_norm": 0.05205807171138911, "learning_rate": 0.00014390776606856481, "loss": 0.8314, "step": 4175 }, { "epoch": 0.5623192305105267, "grad_norm": 0.0542122704361648, "learning_rate": 0.00014355582470101033, "loss": 0.9146, "step": 4180 }, { "epoch": 0.5629918611690321, "grad_norm": 0.06119537195488446, "learning_rate": 0.00014320391887076244, "loss": 0.9141, "step": 4185 }, { "epoch": 0.5636644918275375, "grad_norm": 0.052147743194251195, "learning_rate": 0.00014285205051845499, "loss": 0.8538, "step": 4190 }, { "epoch": 0.564337122486043, "grad_norm": 0.058745044900606766, "learning_rate": 0.0001425002215845153, "loss": 0.9293, "step": 4195 }, { "epoch": 0.5650097531445484, "grad_norm": 0.059321637563402155, "learning_rate": 0.00014214843400915325, "loss": 0.9303, "step": 4200 }, { "epoch": 0.5656823838030537, "grad_norm": 0.05701045172370044, "learning_rate": 0.00014179668973235068, "loss": 0.8771, "step": 4205 }, { "epoch": 0.5663550144615591, "grad_norm": 0.06196077189437652, "learning_rate": 0.00014144499069385064, "loss": 0.8745, "step": 4210 }, { "epoch": 0.5670276451200645, "grad_norm": 0.0574145705590886, "learning_rate": 0.00014109333883314667, "loss": 0.9236, "step": 4215 }, { "epoch": 0.56770027577857, "grad_norm": 0.06338155467612702, "learning_rate": 0.00014074173608947214, "loss": 0.9423, "step": 4220 }, { "epoch": 0.5683729064370754, "grad_norm": 0.06248233624602792, "learning_rate": 0.0001403901844017897, "loss": 0.841, "step": 4225 }, { "epoch": 0.5690455370955808, "grad_norm": 0.0694864531170763, "learning_rate": 0.00014003868570878022, "loss": 0.9356, "step": 4230 }, { "epoch": 0.5697181677540862, "grad_norm": 0.06348150388439985, "learning_rate": 0.00013968724194883252, "loss": 0.9308, "step": 4235 }, { "epoch": 0.5703907984125917, "grad_norm": 0.061117572773617, "learning_rate": 0.00013933585506003228, "loss": 0.8817, "step": 4240 }, { "epoch": 0.5710634290710971, "grad_norm": 0.05989230491533799, "learning_rate": 0.00013898452698015177, "loss": 0.9248, "step": 4245 }, { "epoch": 0.5717360597296025, "grad_norm": 0.05943495368109778, "learning_rate": 0.00013863325964663884, "loss": 0.917, "step": 4250 }, { "epoch": 0.5724086903881079, "grad_norm": 0.0582336713656894, "learning_rate": 0.00013828205499660632, "loss": 0.8604, "step": 4255 }, { "epoch": 0.5730813210466134, "grad_norm": 0.05411543005433317, "learning_rate": 0.0001379309149668214, "loss": 0.8695, "step": 4260 }, { "epoch": 0.5737539517051187, "grad_norm": 0.0581837735189759, "learning_rate": 0.00013757984149369504, "loss": 0.8793, "step": 4265 }, { "epoch": 0.5744265823636241, "grad_norm": 0.05585377116595472, "learning_rate": 0.0001372288365132709, "loss": 0.8832, "step": 4270 }, { "epoch": 0.5750992130221295, "grad_norm": 0.05516798560782244, "learning_rate": 0.00013687790196121517, "loss": 0.8876, "step": 4275 }, { "epoch": 0.5757718436806349, "grad_norm": 0.05770652015506851, "learning_rate": 0.0001365270397728054, "loss": 0.9036, "step": 4280 }, { "epoch": 0.5764444743391404, "grad_norm": 0.05511634842230322, "learning_rate": 0.00013617625188292034, "loss": 0.8413, "step": 4285 }, { "epoch": 0.5771171049976458, "grad_norm": 0.0581917308559524, "learning_rate": 0.00013582554022602896, "loss": 0.879, "step": 4290 }, { "epoch": 0.5777897356561512, "grad_norm": 0.057635907441933136, "learning_rate": 0.00013547490673617964, "loss": 0.8731, "step": 4295 }, { "epoch": 0.5784623663146566, "grad_norm": 0.054839688661184126, "learning_rate": 0.00013512435334698988, "loss": 0.9481, "step": 4300 }, { "epoch": 0.5791349969731621, "grad_norm": 0.05745991388972774, "learning_rate": 0.00013477388199163544, "loss": 0.8818, "step": 4305 }, { "epoch": 0.5798076276316675, "grad_norm": 0.061297124958904645, "learning_rate": 0.00013442349460283964, "loss": 0.9059, "step": 4310 }, { "epoch": 0.5804802582901729, "grad_norm": 0.06461719548175024, "learning_rate": 0.00013407319311286277, "loss": 0.8842, "step": 4315 }, { "epoch": 0.5811528889486782, "grad_norm": 0.05003410246581846, "learning_rate": 0.00013372297945349137, "loss": 0.9352, "step": 4320 }, { "epoch": 0.5818255196071836, "grad_norm": 0.05244921652553115, "learning_rate": 0.00013337285555602773, "loss": 0.921, "step": 4325 }, { "epoch": 0.5824981502656891, "grad_norm": 0.05766957348830907, "learning_rate": 0.00013302282335127914, "loss": 0.8749, "step": 4330 }, { "epoch": 0.5831707809241945, "grad_norm": 0.061301170245086384, "learning_rate": 0.00013267288476954704, "loss": 0.8933, "step": 4335 }, { "epoch": 0.5838434115826999, "grad_norm": 0.05932960924176723, "learning_rate": 0.00013232304174061674, "loss": 0.9374, "step": 4340 }, { "epoch": 0.5845160422412053, "grad_norm": 0.06319852513692248, "learning_rate": 0.00013197329619374677, "loss": 0.8601, "step": 4345 }, { "epoch": 0.5851886728997108, "grad_norm": 0.055916865171505896, "learning_rate": 0.00013162365005765764, "loss": 0.9082, "step": 4350 }, { "epoch": 0.5858613035582162, "grad_norm": 0.0545360860790789, "learning_rate": 0.00013127410526052208, "loss": 0.9341, "step": 4355 }, { "epoch": 0.5865339342167216, "grad_norm": 0.054118565941251126, "learning_rate": 0.00013092466372995366, "loss": 0.8528, "step": 4360 }, { "epoch": 0.587206564875227, "grad_norm": 0.05653106870225469, "learning_rate": 0.00013057532739299668, "loss": 0.8948, "step": 4365 }, { "epoch": 0.5878791955337325, "grad_norm": 0.05401286588096964, "learning_rate": 0.0001302260981761153, "loss": 0.9151, "step": 4370 }, { "epoch": 0.5885518261922379, "grad_norm": 0.06148934703285774, "learning_rate": 0.0001298769780051828, "loss": 0.9045, "step": 4375 }, { "epoch": 0.5892244568507432, "grad_norm": 0.06093837315082601, "learning_rate": 0.00012952796880547128, "loss": 0.8844, "step": 4380 }, { "epoch": 0.5898970875092486, "grad_norm": 0.05669194121807215, "learning_rate": 0.0001291790725016409, "loss": 0.92, "step": 4385 }, { "epoch": 0.590569718167754, "grad_norm": 0.06158002592468912, "learning_rate": 0.000128830291017729, "loss": 0.876, "step": 4390 }, { "epoch": 0.5912423488262595, "grad_norm": 0.05740935206115831, "learning_rate": 0.00012848162627714, "loss": 0.829, "step": 4395 }, { "epoch": 0.5919149794847649, "grad_norm": 0.05772541420273194, "learning_rate": 0.00012813308020263428, "loss": 0.8735, "step": 4400 }, { "epoch": 0.5925876101432703, "grad_norm": 0.06350978931255861, "learning_rate": 0.00012778465471631806, "loss": 0.9195, "step": 4405 }, { "epoch": 0.5932602408017758, "grad_norm": 0.051441424187463246, "learning_rate": 0.00012743635173963246, "loss": 0.8725, "step": 4410 }, { "epoch": 0.5939328714602812, "grad_norm": 0.05739880159869317, "learning_rate": 0.0001270881731933429, "loss": 0.9279, "step": 4415 }, { "epoch": 0.5946055021187866, "grad_norm": 0.057953361597903576, "learning_rate": 0.00012674012099752872, "loss": 0.9029, "step": 4420 }, { "epoch": 0.595278132777292, "grad_norm": 0.0587792183282691, "learning_rate": 0.00012639219707157254, "loss": 0.9057, "step": 4425 }, { "epoch": 0.5959507634357974, "grad_norm": 0.06134378487549523, "learning_rate": 0.00012604440333414946, "loss": 0.887, "step": 4430 }, { "epoch": 0.5966233940943029, "grad_norm": 0.06366789691162225, "learning_rate": 0.0001256967417032168, "loss": 0.9016, "step": 4435 }, { "epoch": 0.5972960247528082, "grad_norm": 0.054919981047600736, "learning_rate": 0.00012534921409600318, "loss": 0.8743, "step": 4440 }, { "epoch": 0.5979686554113136, "grad_norm": 0.16672557636164484, "learning_rate": 0.00012500182242899827, "loss": 0.9003, "step": 4445 }, { "epoch": 0.598641286069819, "grad_norm": 0.05503535479792428, "learning_rate": 0.00012465456861794204, "loss": 0.8848, "step": 4450 }, { "epoch": 0.5993139167283245, "grad_norm": 0.054776610640831065, "learning_rate": 0.0001243074545778142, "loss": 0.9217, "step": 4455 }, { "epoch": 0.5999865473868299, "grad_norm": 0.05883313327981152, "learning_rate": 0.00012396048222282374, "loss": 0.8789, "step": 4460 }, { "epoch": 0.6006591780453353, "grad_norm": 0.053191200550714766, "learning_rate": 0.0001236136534663983, "loss": 0.9423, "step": 4465 }, { "epoch": 0.6013318087038407, "grad_norm": 0.05662708709001015, "learning_rate": 0.0001232669702211735, "loss": 0.8702, "step": 4470 }, { "epoch": 0.6020044393623462, "grad_norm": 0.05705030841856459, "learning_rate": 0.00012292043439898274, "loss": 0.9329, "step": 4475 }, { "epoch": 0.6026770700208516, "grad_norm": 0.06547197669722017, "learning_rate": 0.00012257404791084616, "loss": 0.9337, "step": 4480 }, { "epoch": 0.603349700679357, "grad_norm": 0.05619282666130086, "learning_rate": 0.00012222781266696056, "loss": 0.9097, "step": 4485 }, { "epoch": 0.6040223313378624, "grad_norm": 0.058899280210475524, "learning_rate": 0.00012188173057668881, "loss": 0.8789, "step": 4490 }, { "epoch": 0.6046949619963677, "grad_norm": 0.05736846197147452, "learning_rate": 0.00012153580354854885, "loss": 0.9097, "step": 4495 }, { "epoch": 0.6053675926548732, "grad_norm": 0.05425110212071724, "learning_rate": 0.0001211900334902037, "loss": 0.8751, "step": 4500 }, { "epoch": 0.6060402233133786, "grad_norm": 0.05485136697764707, "learning_rate": 0.00012084442230845087, "loss": 0.8551, "step": 4505 }, { "epoch": 0.606712853971884, "grad_norm": 0.06282812420498524, "learning_rate": 0.00012049897190921143, "loss": 0.9172, "step": 4510 }, { "epoch": 0.6073854846303894, "grad_norm": 0.058713453811793, "learning_rate": 0.00012015368419752009, "loss": 0.9139, "step": 4515 }, { "epoch": 0.6080581152888949, "grad_norm": 0.05588704652885199, "learning_rate": 0.00011980856107751414, "loss": 0.8895, "step": 4520 }, { "epoch": 0.6087307459474003, "grad_norm": 0.05606004982577002, "learning_rate": 0.0001194636044524234, "loss": 0.8542, "step": 4525 }, { "epoch": 0.6094033766059057, "grad_norm": 0.06075755945838316, "learning_rate": 0.00011911881622455947, "loss": 0.9091, "step": 4530 }, { "epoch": 0.6100760072644111, "grad_norm": 0.05777711265416582, "learning_rate": 0.0001187741982953052, "loss": 0.8863, "step": 4535 }, { "epoch": 0.6107486379229166, "grad_norm": 0.052534492649992555, "learning_rate": 0.00011842975256510439, "loss": 0.9213, "step": 4540 }, { "epoch": 0.611421268581422, "grad_norm": 0.060979276869632684, "learning_rate": 0.0001180854809334514, "loss": 0.9091, "step": 4545 }, { "epoch": 0.6120938992399274, "grad_norm": 0.05636369366484177, "learning_rate": 0.0001177413852988801, "loss": 0.9109, "step": 4550 }, { "epoch": 0.6127665298984327, "grad_norm": 0.06375696295920616, "learning_rate": 0.00011739746755895416, "loss": 0.9199, "step": 4555 }, { "epoch": 0.6134391605569381, "grad_norm": 0.06022822131153983, "learning_rate": 0.00011705372961025602, "loss": 0.9181, "step": 4560 }, { "epoch": 0.6141117912154436, "grad_norm": 0.05818657066276916, "learning_rate": 0.00011671017334837674, "loss": 0.9297, "step": 4565 }, { "epoch": 0.614784421873949, "grad_norm": 0.05707183156732193, "learning_rate": 0.0001163668006679054, "loss": 0.9006, "step": 4570 }, { "epoch": 0.6154570525324544, "grad_norm": 0.057762504480045336, "learning_rate": 0.00011602361346241869, "loss": 0.9213, "step": 4575 }, { "epoch": 0.6161296831909598, "grad_norm": 0.05622121629708367, "learning_rate": 0.00011568061362447048, "loss": 0.8442, "step": 4580 }, { "epoch": 0.6168023138494653, "grad_norm": 0.06842696169362386, "learning_rate": 0.00011533780304558146, "loss": 0.91, "step": 4585 }, { "epoch": 0.6174749445079707, "grad_norm": 0.0538174023674992, "learning_rate": 0.0001149951836162284, "loss": 0.8732, "step": 4590 }, { "epoch": 0.6181475751664761, "grad_norm": 0.0559745986424301, "learning_rate": 0.0001146527572258342, "loss": 0.8556, "step": 4595 }, { "epoch": 0.6188202058249815, "grad_norm": 0.05889316352261125, "learning_rate": 0.00011431052576275704, "loss": 0.8636, "step": 4600 }, { "epoch": 0.619492836483487, "grad_norm": 0.0644317221183652, "learning_rate": 0.00011396849111428026, "loss": 0.8829, "step": 4605 }, { "epoch": 0.6201654671419923, "grad_norm": 0.05378896021016429, "learning_rate": 0.00011362665516660181, "loss": 0.8692, "step": 4610 }, { "epoch": 0.6208380978004977, "grad_norm": 0.06101966621474693, "learning_rate": 0.00011328501980482382, "loss": 0.9022, "step": 4615 }, { "epoch": 0.6215107284590031, "grad_norm": 0.05748333047789456, "learning_rate": 0.00011294358691294232, "loss": 0.865, "step": 4620 }, { "epoch": 0.6221833591175085, "grad_norm": 0.059256917830513484, "learning_rate": 0.00011260235837383684, "loss": 0.8582, "step": 4625 }, { "epoch": 0.622855989776014, "grad_norm": 0.05943237627383429, "learning_rate": 0.00011226133606925981, "loss": 0.9069, "step": 4630 }, { "epoch": 0.6235286204345194, "grad_norm": 0.054627263562646904, "learning_rate": 0.00011192052187982654, "loss": 0.9239, "step": 4635 }, { "epoch": 0.6242012510930248, "grad_norm": 0.06107879674633574, "learning_rate": 0.00011157991768500451, "loss": 0.8936, "step": 4640 }, { "epoch": 0.6248738817515302, "grad_norm": 0.058746911687480065, "learning_rate": 0.00011123952536310321, "loss": 0.8503, "step": 4645 }, { "epoch": 0.6255465124100357, "grad_norm": 0.05687652600714167, "learning_rate": 0.00011089934679126383, "loss": 0.8842, "step": 4650 }, { "epoch": 0.6262191430685411, "grad_norm": 0.05737409928479921, "learning_rate": 0.00011055938384544861, "loss": 0.8413, "step": 4655 }, { "epoch": 0.6268917737270465, "grad_norm": 0.05281557762068941, "learning_rate": 0.00011021963840043082, "loss": 0.8714, "step": 4660 }, { "epoch": 0.6275644043855519, "grad_norm": 0.05653456815014127, "learning_rate": 0.00010988011232978433, "loss": 0.8631, "step": 4665 }, { "epoch": 0.6282370350440573, "grad_norm": 0.05820443994353081, "learning_rate": 0.00010954080750587308, "loss": 0.8458, "step": 4670 }, { "epoch": 0.6289096657025627, "grad_norm": 0.058053700502948814, "learning_rate": 0.00010920172579984113, "loss": 0.8708, "step": 4675 }, { "epoch": 0.6295822963610681, "grad_norm": 0.052586921573302296, "learning_rate": 0.00010886286908160178, "loss": 0.837, "step": 4680 }, { "epoch": 0.6302549270195735, "grad_norm": 0.05836180559612674, "learning_rate": 0.00010852423921982804, "loss": 0.8881, "step": 4685 }, { "epoch": 0.630927557678079, "grad_norm": 0.05330666907721731, "learning_rate": 0.00010818583808194165, "loss": 0.8655, "step": 4690 }, { "epoch": 0.6316001883365844, "grad_norm": 0.05931093065925741, "learning_rate": 0.00010784766753410292, "loss": 0.867, "step": 4695 }, { "epoch": 0.6322728189950898, "grad_norm": 0.059390965217927305, "learning_rate": 0.00010750972944120074, "loss": 0.9129, "step": 4700 }, { "epoch": 0.6329454496535952, "grad_norm": 0.057009911395215766, "learning_rate": 0.00010717202566684205, "loss": 0.8754, "step": 4705 }, { "epoch": 0.6336180803121007, "grad_norm": 0.05813916757602384, "learning_rate": 0.00010683455807334149, "loss": 0.9253, "step": 4710 }, { "epoch": 0.6342907109706061, "grad_norm": 0.05554690018381647, "learning_rate": 0.0001064973285217114, "loss": 0.9079, "step": 4715 }, { "epoch": 0.6349633416291115, "grad_norm": 0.05823337802695429, "learning_rate": 0.0001061603388716513, "loss": 0.8678, "step": 4720 }, { "epoch": 0.6356359722876169, "grad_norm": 0.060183801311714434, "learning_rate": 0.00010582359098153779, "loss": 0.9334, "step": 4725 }, { "epoch": 0.6363086029461222, "grad_norm": 0.06158375211136442, "learning_rate": 0.00010548708670841432, "loss": 0.8998, "step": 4730 }, { "epoch": 0.6369812336046277, "grad_norm": 0.05841236712079292, "learning_rate": 0.00010515082790798064, "loss": 0.8454, "step": 4735 }, { "epoch": 0.6376538642631331, "grad_norm": 0.058285702230372556, "learning_rate": 0.000104814816434583, "loss": 0.9201, "step": 4740 }, { "epoch": 0.6383264949216385, "grad_norm": 0.05752261109466009, "learning_rate": 0.00010447905414120385, "loss": 0.9009, "step": 4745 }, { "epoch": 0.6389991255801439, "grad_norm": 0.054666275354256486, "learning_rate": 0.00010414354287945116, "loss": 0.8836, "step": 4750 }, { "epoch": 0.6396717562386494, "grad_norm": 0.054230923288243436, "learning_rate": 0.00010380828449954886, "loss": 0.9051, "step": 4755 }, { "epoch": 0.6403443868971548, "grad_norm": 0.05811940329323424, "learning_rate": 0.0001034732808503261, "loss": 0.8842, "step": 4760 }, { "epoch": 0.6410170175556602, "grad_norm": 0.06251179179779437, "learning_rate": 0.00010313853377920744, "loss": 0.8882, "step": 4765 }, { "epoch": 0.6416896482141656, "grad_norm": 0.052634147322929405, "learning_rate": 0.0001028040451322025, "loss": 0.9015, "step": 4770 }, { "epoch": 0.642362278872671, "grad_norm": 0.059163841667578786, "learning_rate": 0.00010246981675389563, "loss": 0.8905, "step": 4775 }, { "epoch": 0.6430349095311765, "grad_norm": 0.055204378998180585, "learning_rate": 0.00010213585048743608, "loss": 0.8629, "step": 4780 }, { "epoch": 0.6437075401896818, "grad_norm": 0.06100319448728361, "learning_rate": 0.00010180214817452759, "loss": 0.922, "step": 4785 }, { "epoch": 0.6443801708481872, "grad_norm": 0.05767459460028311, "learning_rate": 0.00010146871165541816, "loss": 0.8928, "step": 4790 }, { "epoch": 0.6450528015066926, "grad_norm": 0.05607341654418674, "learning_rate": 0.0001011355427688902, "loss": 0.8736, "step": 4795 }, { "epoch": 0.6457254321651981, "grad_norm": 0.0535834405716432, "learning_rate": 0.00010080264335225016, "loss": 0.9161, "step": 4800 }, { "epoch": 0.6463980628237035, "grad_norm": 0.056346526158196436, "learning_rate": 0.00010047001524131844, "loss": 0.883, "step": 4805 }, { "epoch": 0.6470706934822089, "grad_norm": 0.05586293404952587, "learning_rate": 0.00010013766027041936, "loss": 0.8463, "step": 4810 }, { "epoch": 0.6477433241407143, "grad_norm": 0.05723029408458351, "learning_rate": 9.980558027237084e-05, "loss": 0.8411, "step": 4815 }, { "epoch": 0.6484159547992198, "grad_norm": 0.058448218690204695, "learning_rate": 9.947377707847463e-05, "loss": 0.8481, "step": 4820 }, { "epoch": 0.6490885854577252, "grad_norm": 0.07509436198212023, "learning_rate": 9.914225251850568e-05, "loss": 0.8862, "step": 4825 }, { "epoch": 0.6497612161162306, "grad_norm": 0.05253604166156805, "learning_rate": 9.881100842070275e-05, "loss": 0.8384, "step": 4830 }, { "epoch": 0.650433846774736, "grad_norm": 0.054848894274745104, "learning_rate": 9.848004661175775e-05, "loss": 0.8584, "step": 4835 }, { "epoch": 0.6511064774332415, "grad_norm": 0.06695121957621178, "learning_rate": 9.814936891680581e-05, "loss": 0.8816, "step": 4840 }, { "epoch": 0.6517791080917468, "grad_norm": 0.05596682397619204, "learning_rate": 9.78189771594154e-05, "loss": 0.9007, "step": 4845 }, { "epoch": 0.6524517387502522, "grad_norm": 0.05517815446096068, "learning_rate": 9.748887316157814e-05, "loss": 0.8934, "step": 4850 }, { "epoch": 0.6531243694087576, "grad_norm": 0.06497381529538195, "learning_rate": 9.715905874369865e-05, "loss": 0.915, "step": 4855 }, { "epoch": 0.653797000067263, "grad_norm": 0.056024264255754005, "learning_rate": 9.682953572458477e-05, "loss": 0.8571, "step": 4860 }, { "epoch": 0.6544696307257685, "grad_norm": 0.08419868114514564, "learning_rate": 9.650030592143723e-05, "loss": 0.9008, "step": 4865 }, { "epoch": 0.6551422613842739, "grad_norm": 0.055564563924577495, "learning_rate": 9.61713711498399e-05, "loss": 0.8797, "step": 4870 }, { "epoch": 0.6558148920427793, "grad_norm": 0.062082572684302575, "learning_rate": 9.58427332237497e-05, "loss": 0.8628, "step": 4875 }, { "epoch": 0.6564875227012847, "grad_norm": 0.060099200321734096, "learning_rate": 9.551439395548624e-05, "loss": 0.8484, "step": 4880 }, { "epoch": 0.6571601533597902, "grad_norm": 0.058009190055296006, "learning_rate": 9.518635515572253e-05, "loss": 0.8501, "step": 4885 }, { "epoch": 0.6578327840182956, "grad_norm": 0.05797176811552356, "learning_rate": 9.48586186334745e-05, "loss": 0.8473, "step": 4890 }, { "epoch": 0.658505414676801, "grad_norm": 0.06380953263521327, "learning_rate": 9.453118619609089e-05, "loss": 0.8727, "step": 4895 }, { "epoch": 0.6591780453353063, "grad_norm": 0.05688225891322997, "learning_rate": 9.420405964924383e-05, "loss": 0.8673, "step": 4900 }, { "epoch": 0.6598506759938118, "grad_norm": 0.05613539116592728, "learning_rate": 9.387724079691836e-05, "loss": 0.8699, "step": 4905 }, { "epoch": 0.6605233066523172, "grad_norm": 0.060851757847964476, "learning_rate": 9.355073144140283e-05, "loss": 0.8902, "step": 4910 }, { "epoch": 0.6611959373108226, "grad_norm": 0.06414740795910594, "learning_rate": 9.322453338327879e-05, "loss": 0.8337, "step": 4915 }, { "epoch": 0.661868567969328, "grad_norm": 0.059468629399861576, "learning_rate": 9.289864842141101e-05, "loss": 0.9023, "step": 4920 }, { "epoch": 0.6625411986278334, "grad_norm": 0.05758556350462922, "learning_rate": 9.257307835293778e-05, "loss": 0.8525, "step": 4925 }, { "epoch": 0.6632138292863389, "grad_norm": 0.06171673443618236, "learning_rate": 9.224782497326085e-05, "loss": 0.9572, "step": 4930 }, { "epoch": 0.6638864599448443, "grad_norm": 0.062334143766405774, "learning_rate": 9.192289007603538e-05, "loss": 0.8436, "step": 4935 }, { "epoch": 0.6645590906033497, "grad_norm": 0.05685720623051231, "learning_rate": 9.159827545316043e-05, "loss": 0.8374, "step": 4940 }, { "epoch": 0.6652317212618551, "grad_norm": 0.05733734226480475, "learning_rate": 9.127398289476871e-05, "loss": 0.8669, "step": 4945 }, { "epoch": 0.6659043519203606, "grad_norm": 0.06246178021253094, "learning_rate": 9.095001418921694e-05, "loss": 0.8825, "step": 4950 }, { "epoch": 0.666576982578866, "grad_norm": 0.05680246697220381, "learning_rate": 9.062637112307591e-05, "loss": 0.8424, "step": 4955 }, { "epoch": 0.6672496132373713, "grad_norm": 0.05684976523742173, "learning_rate": 9.030305548112056e-05, "loss": 0.8337, "step": 4960 }, { "epoch": 0.6679222438958767, "grad_norm": 0.05869191623510943, "learning_rate": 8.998006904632027e-05, "loss": 0.8393, "step": 4965 }, { "epoch": 0.6685948745543822, "grad_norm": 0.05499638998233168, "learning_rate": 8.965741359982895e-05, "loss": 0.8856, "step": 4970 }, { "epoch": 0.6692675052128876, "grad_norm": 0.05407722242744602, "learning_rate": 8.933509092097516e-05, "loss": 0.8438, "step": 4975 }, { "epoch": 0.669940135871393, "grad_norm": 0.05632782452701334, "learning_rate": 8.901310278725254e-05, "loss": 0.8714, "step": 4980 }, { "epoch": 0.6706127665298984, "grad_norm": 0.052171899433051214, "learning_rate": 8.869145097430955e-05, "loss": 0.8876, "step": 4985 }, { "epoch": 0.6712853971884039, "grad_norm": 0.06663532562647967, "learning_rate": 8.837013725594021e-05, "loss": 0.8599, "step": 4990 }, { "epoch": 0.6719580278469093, "grad_norm": 0.05861717662104597, "learning_rate": 8.804916340407401e-05, "loss": 0.8277, "step": 4995 }, { "epoch": 0.6726306585054147, "grad_norm": 0.05879425667066216, "learning_rate": 8.772853118876615e-05, "loss": 0.8587, "step": 5000 }, { "epoch": 0.6733032891639201, "grad_norm": 0.061444791492560505, "learning_rate": 8.740824237818783e-05, "loss": 0.8554, "step": 5005 }, { "epoch": 0.6739759198224256, "grad_norm": 0.05819734558194491, "learning_rate": 8.708829873861664e-05, "loss": 0.8355, "step": 5010 }, { "epoch": 0.674648550480931, "grad_norm": 0.0520509008119525, "learning_rate": 8.676870203442635e-05, "loss": 0.8569, "step": 5015 }, { "epoch": 0.6753211811394363, "grad_norm": 0.05681589132682784, "learning_rate": 8.64494540280779e-05, "loss": 0.8725, "step": 5020 }, { "epoch": 0.6759938117979417, "grad_norm": 0.056879680943254775, "learning_rate": 8.613055648010899e-05, "loss": 0.8493, "step": 5025 }, { "epoch": 0.6766664424564471, "grad_norm": 0.059375704514106294, "learning_rate": 8.581201114912477e-05, "loss": 0.8605, "step": 5030 }, { "epoch": 0.6773390731149526, "grad_norm": 0.05894030965563083, "learning_rate": 8.549381979178815e-05, "loss": 0.8568, "step": 5035 }, { "epoch": 0.678011703773458, "grad_norm": 0.054221362443161204, "learning_rate": 8.517598416280985e-05, "loss": 0.8593, "step": 5040 }, { "epoch": 0.6786843344319634, "grad_norm": 0.051825037887154625, "learning_rate": 8.485850601493885e-05, "loss": 0.8618, "step": 5045 }, { "epoch": 0.6793569650904688, "grad_norm": 0.05777524128307275, "learning_rate": 8.4541387098953e-05, "loss": 0.8643, "step": 5050 }, { "epoch": 0.6800295957489743, "grad_norm": 0.05914658196865087, "learning_rate": 8.422462916364875e-05, "loss": 0.8348, "step": 5055 }, { "epoch": 0.6807022264074797, "grad_norm": 0.055461026646056615, "learning_rate": 8.390823395583218e-05, "loss": 0.8929, "step": 5060 }, { "epoch": 0.6813748570659851, "grad_norm": 0.05492126944791927, "learning_rate": 8.35922032203089e-05, "loss": 0.8758, "step": 5065 }, { "epoch": 0.6820474877244905, "grad_norm": 0.060505610514154255, "learning_rate": 8.327653869987462e-05, "loss": 0.8925, "step": 5070 }, { "epoch": 0.6827201183829958, "grad_norm": 0.05811965799556169, "learning_rate": 8.296124213530556e-05, "loss": 0.8233, "step": 5075 }, { "epoch": 0.6833927490415013, "grad_norm": 0.05678640199443679, "learning_rate": 8.264631526534875e-05, "loss": 0.8505, "step": 5080 }, { "epoch": 0.6840653797000067, "grad_norm": 0.05374131818929565, "learning_rate": 8.233175982671241e-05, "loss": 0.87, "step": 5085 }, { "epoch": 0.6847380103585121, "grad_norm": 0.06337938406070896, "learning_rate": 8.201757755405663e-05, "loss": 0.8285, "step": 5090 }, { "epoch": 0.6854106410170175, "grad_norm": 0.05307559383586948, "learning_rate": 8.170377017998347e-05, "loss": 0.8377, "step": 5095 }, { "epoch": 0.686083271675523, "grad_norm": 0.06681792778316958, "learning_rate": 8.139033943502764e-05, "loss": 0.9141, "step": 5100 }, { "epoch": 0.6867559023340284, "grad_norm": 0.060052245002273755, "learning_rate": 8.107728704764678e-05, "loss": 0.8644, "step": 5105 }, { "epoch": 0.6874285329925338, "grad_norm": 0.057388420287701074, "learning_rate": 8.076461474421212e-05, "loss": 0.9123, "step": 5110 }, { "epoch": 0.6881011636510392, "grad_norm": 0.05383822565809219, "learning_rate": 8.045232424899889e-05, "loss": 0.8784, "step": 5115 }, { "epoch": 0.6887737943095447, "grad_norm": 0.05473684772662266, "learning_rate": 8.014041728417671e-05, "loss": 0.9026, "step": 5120 }, { "epoch": 0.6894464249680501, "grad_norm": 0.05301092470468256, "learning_rate": 7.982889556980006e-05, "loss": 0.8791, "step": 5125 }, { "epoch": 0.6901190556265555, "grad_norm": 0.056506205793787935, "learning_rate": 7.951776082379924e-05, "loss": 0.8834, "step": 5130 }, { "epoch": 0.6907916862850608, "grad_norm": 0.059817759285931964, "learning_rate": 7.920701476197025e-05, "loss": 0.862, "step": 5135 }, { "epoch": 0.6914643169435662, "grad_norm": 0.055806318389620795, "learning_rate": 7.889665909796574e-05, "loss": 0.8829, "step": 5140 }, { "epoch": 0.6921369476020717, "grad_norm": 0.0625441497627379, "learning_rate": 7.858669554328537e-05, "loss": 0.8643, "step": 5145 }, { "epoch": 0.6928095782605771, "grad_norm": 0.05220170307922808, "learning_rate": 7.827712580726669e-05, "loss": 0.8349, "step": 5150 }, { "epoch": 0.6934822089190825, "grad_norm": 0.05474943327503433, "learning_rate": 7.796795159707525e-05, "loss": 0.863, "step": 5155 }, { "epoch": 0.694154839577588, "grad_norm": 0.05222518858286006, "learning_rate": 7.765917461769553e-05, "loss": 0.8209, "step": 5160 }, { "epoch": 0.6948274702360934, "grad_norm": 0.052445023606749054, "learning_rate": 7.735079657192132e-05, "loss": 0.8674, "step": 5165 }, { "epoch": 0.6955001008945988, "grad_norm": 0.05143832891654637, "learning_rate": 7.704281916034664e-05, "loss": 0.8327, "step": 5170 }, { "epoch": 0.6961727315531042, "grad_norm": 0.05618205697005048, "learning_rate": 7.673524408135593e-05, "loss": 0.8802, "step": 5175 }, { "epoch": 0.6968453622116096, "grad_norm": 0.05484727707900561, "learning_rate": 7.642807303111504e-05, "loss": 0.8014, "step": 5180 }, { "epoch": 0.6975179928701151, "grad_norm": 0.05398969664658753, "learning_rate": 7.612130770356167e-05, "loss": 0.8828, "step": 5185 }, { "epoch": 0.6981906235286204, "grad_norm": 0.05740973161611498, "learning_rate": 7.581494979039625e-05, "loss": 0.8603, "step": 5190 }, { "epoch": 0.6988632541871258, "grad_norm": 0.060089661288700226, "learning_rate": 7.550900098107229e-05, "loss": 0.9219, "step": 5195 }, { "epoch": 0.6995358848456312, "grad_norm": 0.05315516282253319, "learning_rate": 7.520346296278729e-05, "loss": 0.8561, "step": 5200 }, { "epoch": 0.7002085155041367, "grad_norm": 0.06405746424013456, "learning_rate": 7.48983374204735e-05, "loss": 0.8765, "step": 5205 }, { "epoch": 0.7008811461626421, "grad_norm": 0.06621206104407008, "learning_rate": 7.459362603678839e-05, "loss": 0.8813, "step": 5210 }, { "epoch": 0.7015537768211475, "grad_norm": 0.05691919117892703, "learning_rate": 7.428933049210552e-05, "loss": 0.8651, "step": 5215 }, { "epoch": 0.7022264074796529, "grad_norm": 0.05275469160566453, "learning_rate": 7.398545246450524e-05, "loss": 0.8916, "step": 5220 }, { "epoch": 0.7028990381381583, "grad_norm": 0.05944483436680081, "learning_rate": 7.368199362976542e-05, "loss": 0.8627, "step": 5225 }, { "epoch": 0.7035716687966638, "grad_norm": 0.05490184754535832, "learning_rate": 7.337895566135241e-05, "loss": 0.831, "step": 5230 }, { "epoch": 0.7042442994551692, "grad_norm": 0.053453524101461904, "learning_rate": 7.307634023041139e-05, "loss": 0.8457, "step": 5235 }, { "epoch": 0.7049169301136746, "grad_norm": 0.056089856365205054, "learning_rate": 7.277414900575749e-05, "loss": 0.8812, "step": 5240 }, { "epoch": 0.70558956077218, "grad_norm": 0.054477356366257466, "learning_rate": 7.247238365386659e-05, "loss": 0.8843, "step": 5245 }, { "epoch": 0.7062621914306854, "grad_norm": 0.05635284790716959, "learning_rate": 7.217104583886593e-05, "loss": 0.8279, "step": 5250 }, { "epoch": 0.7069348220891908, "grad_norm": 0.05640773573113742, "learning_rate": 7.187013722252498e-05, "loss": 0.837, "step": 5255 }, { "epoch": 0.7076074527476962, "grad_norm": 0.05680088370065849, "learning_rate": 7.15696594642466e-05, "loss": 0.8603, "step": 5260 }, { "epoch": 0.7082800834062016, "grad_norm": 0.057411411632092874, "learning_rate": 7.126961422105722e-05, "loss": 0.8384, "step": 5265 }, { "epoch": 0.7089527140647071, "grad_norm": 0.05306386883430388, "learning_rate": 7.097000314759847e-05, "loss": 0.8616, "step": 5270 }, { "epoch": 0.7096253447232125, "grad_norm": 0.0545156816122668, "learning_rate": 7.067082789611752e-05, "loss": 0.9009, "step": 5275 }, { "epoch": 0.7102979753817179, "grad_norm": 0.05614266230435318, "learning_rate": 7.037209011645806e-05, "loss": 0.8161, "step": 5280 }, { "epoch": 0.7109706060402233, "grad_norm": 0.05836083731921901, "learning_rate": 7.007379145605155e-05, "loss": 0.8508, "step": 5285 }, { "epoch": 0.7116432366987288, "grad_norm": 0.05889834886106434, "learning_rate": 6.977593355990762e-05, "loss": 0.9123, "step": 5290 }, { "epoch": 0.7123158673572342, "grad_norm": 0.0599010873399523, "learning_rate": 6.947851807060526e-05, "loss": 0.8549, "step": 5295 }, { "epoch": 0.7129884980157396, "grad_norm": 0.05703117987548505, "learning_rate": 6.918154662828397e-05, "loss": 0.8999, "step": 5300 }, { "epoch": 0.713661128674245, "grad_norm": 0.05747829883745667, "learning_rate": 6.888502087063412e-05, "loss": 0.87, "step": 5305 }, { "epoch": 0.7143337593327503, "grad_norm": 0.053424673994670414, "learning_rate": 6.858894243288863e-05, "loss": 0.8663, "step": 5310 }, { "epoch": 0.7150063899912558, "grad_norm": 0.06336136647095073, "learning_rate": 6.829331294781356e-05, "loss": 0.8631, "step": 5315 }, { "epoch": 0.7156790206497612, "grad_norm": 0.058971758715485705, "learning_rate": 6.799813404569887e-05, "loss": 0.8914, "step": 5320 }, { "epoch": 0.7163516513082666, "grad_norm": 0.05792271524665821, "learning_rate": 6.770340735435007e-05, "loss": 0.8691, "step": 5325 }, { "epoch": 0.717024281966772, "grad_norm": 0.054830731560030654, "learning_rate": 6.740913449907874e-05, "loss": 0.8648, "step": 5330 }, { "epoch": 0.7176969126252775, "grad_norm": 0.060764115904758366, "learning_rate": 6.711531710269361e-05, "loss": 0.8335, "step": 5335 }, { "epoch": 0.7183695432837829, "grad_norm": 0.05910840524389887, "learning_rate": 6.682195678549198e-05, "loss": 0.839, "step": 5340 }, { "epoch": 0.7190421739422883, "grad_norm": 0.05091175020470114, "learning_rate": 6.652905516525032e-05, "loss": 0.8276, "step": 5345 }, { "epoch": 0.7197148046007937, "grad_norm": 0.06631137750724338, "learning_rate": 6.623661385721553e-05, "loss": 0.8601, "step": 5350 }, { "epoch": 0.7203874352592992, "grad_norm": 0.056251177055676505, "learning_rate": 6.594463447409631e-05, "loss": 0.8637, "step": 5355 }, { "epoch": 0.7210600659178046, "grad_norm": 0.060321727455837935, "learning_rate": 6.56531186260536e-05, "loss": 0.8488, "step": 5360 }, { "epoch": 0.7217326965763099, "grad_norm": 0.06356804542613306, "learning_rate": 6.536206792069246e-05, "loss": 0.8258, "step": 5365 }, { "epoch": 0.7224053272348153, "grad_norm": 0.05927222864540253, "learning_rate": 6.507148396305285e-05, "loss": 0.8802, "step": 5370 }, { "epoch": 0.7230779578933207, "grad_norm": 0.060242024985454945, "learning_rate": 6.478136835560043e-05, "loss": 0.8515, "step": 5375 }, { "epoch": 0.7237505885518262, "grad_norm": 0.0572408416108945, "learning_rate": 6.44917226982185e-05, "loss": 0.8853, "step": 5380 }, { "epoch": 0.7244232192103316, "grad_norm": 0.055779812319483826, "learning_rate": 6.420254858819853e-05, "loss": 0.8551, "step": 5385 }, { "epoch": 0.725095849868837, "grad_norm": 0.05372587428464241, "learning_rate": 6.391384762023155e-05, "loss": 0.8959, "step": 5390 }, { "epoch": 0.7257684805273424, "grad_norm": 0.056265169274321215, "learning_rate": 6.362562138639957e-05, "loss": 0.8711, "step": 5395 }, { "epoch": 0.7264411111858479, "grad_norm": 0.0643302407697523, "learning_rate": 6.333787147616641e-05, "loss": 0.851, "step": 5400 }, { "epoch": 0.7271137418443533, "grad_norm": 0.06009641755807671, "learning_rate": 6.305059947636921e-05, "loss": 0.9158, "step": 5405 }, { "epoch": 0.7277863725028587, "grad_norm": 0.057868353247351734, "learning_rate": 6.276380697120974e-05, "loss": 0.8374, "step": 5410 }, { "epoch": 0.7284590031613641, "grad_norm": 0.051991130536517494, "learning_rate": 6.24774955422452e-05, "loss": 0.8388, "step": 5415 }, { "epoch": 0.7291316338198696, "grad_norm": 0.05501086613762105, "learning_rate": 6.21916667683802e-05, "loss": 0.8288, "step": 5420 }, { "epoch": 0.7298042644783749, "grad_norm": 0.05651661862407232, "learning_rate": 6.190632222585747e-05, "loss": 0.8497, "step": 5425 }, { "epoch": 0.7304768951368803, "grad_norm": 0.05910681682465812, "learning_rate": 6.162146348824935e-05, "loss": 0.8451, "step": 5430 }, { "epoch": 0.7311495257953857, "grad_norm": 0.05209037147565984, "learning_rate": 6.133709212644934e-05, "loss": 0.8186, "step": 5435 }, { "epoch": 0.7318221564538911, "grad_norm": 0.061761656354733095, "learning_rate": 6.105320970866307e-05, "loss": 0.8711, "step": 5440 }, { "epoch": 0.7324947871123966, "grad_norm": 0.053209654344044566, "learning_rate": 6.076981780039982e-05, "loss": 0.8419, "step": 5445 }, { "epoch": 0.733167417770902, "grad_norm": 0.06048025162698968, "learning_rate": 6.048691796446396e-05, "loss": 0.8751, "step": 5450 }, { "epoch": 0.7338400484294074, "grad_norm": 0.05364619597271036, "learning_rate": 6.0204511760946156e-05, "loss": 0.8346, "step": 5455 }, { "epoch": 0.7345126790879128, "grad_norm": 0.05362076203885372, "learning_rate": 5.992260074721506e-05, "loss": 0.8472, "step": 5460 }, { "epoch": 0.7351853097464183, "grad_norm": 0.05416794573637957, "learning_rate": 5.964118647790836e-05, "loss": 0.8705, "step": 5465 }, { "epoch": 0.7358579404049237, "grad_norm": 0.057308449167418285, "learning_rate": 5.936027050492436e-05, "loss": 0.8262, "step": 5470 }, { "epoch": 0.7365305710634291, "grad_norm": 0.05537632404183107, "learning_rate": 5.907985437741361e-05, "loss": 0.8502, "step": 5475 }, { "epoch": 0.7372032017219344, "grad_norm": 0.054499705173430356, "learning_rate": 5.879993964177006e-05, "loss": 0.8962, "step": 5480 }, { "epoch": 0.7378758323804399, "grad_norm": 0.058847549524604685, "learning_rate": 5.8520527841622674e-05, "loss": 0.8351, "step": 5485 }, { "epoch": 0.7385484630389453, "grad_norm": 0.05696132922969094, "learning_rate": 5.824162051782689e-05, "loss": 0.912, "step": 5490 }, { "epoch": 0.7392210936974507, "grad_norm": 0.054340906074491255, "learning_rate": 5.7963219208456244e-05, "loss": 0.8386, "step": 5495 }, { "epoch": 0.7398937243559561, "grad_norm": 0.05690271575321237, "learning_rate": 5.7685325448793715e-05, "loss": 0.8477, "step": 5500 }, { "epoch": 0.7405663550144616, "grad_norm": 0.05291001329815415, "learning_rate": 5.7407940771323305e-05, "loss": 0.838, "step": 5505 }, { "epoch": 0.741238985672967, "grad_norm": 0.05659296345689606, "learning_rate": 5.71310667057216e-05, "loss": 0.8696, "step": 5510 }, { "epoch": 0.7419116163314724, "grad_norm": 0.05997774398356355, "learning_rate": 5.685470477884947e-05, "loss": 0.8528, "step": 5515 }, { "epoch": 0.7425842469899778, "grad_norm": 0.058931635556607694, "learning_rate": 5.6578856514743393e-05, "loss": 0.8782, "step": 5520 }, { "epoch": 0.7432568776484832, "grad_norm": 0.06128548736268873, "learning_rate": 5.6303523434607235e-05, "loss": 0.8376, "step": 5525 }, { "epoch": 0.7439295083069887, "grad_norm": 0.05660298846428016, "learning_rate": 5.602870705680373e-05, "loss": 0.849, "step": 5530 }, { "epoch": 0.7446021389654941, "grad_norm": 0.059111841539666116, "learning_rate": 5.575440889684638e-05, "loss": 0.8678, "step": 5535 }, { "epoch": 0.7452747696239994, "grad_norm": 0.07854228394149744, "learning_rate": 5.5480630467390694e-05, "loss": 0.8451, "step": 5540 }, { "epoch": 0.7459474002825048, "grad_norm": 0.06125700566348329, "learning_rate": 5.520737327822609e-05, "loss": 0.8112, "step": 5545 }, { "epoch": 0.7466200309410103, "grad_norm": 0.06171779829091447, "learning_rate": 5.4934638836267705e-05, "loss": 0.8246, "step": 5550 }, { "epoch": 0.7472926615995157, "grad_norm": 0.05866009770905458, "learning_rate": 5.4662428645547726e-05, "loss": 0.8519, "step": 5555 }, { "epoch": 0.7479652922580211, "grad_norm": 0.05666771692923985, "learning_rate": 5.439074420720734e-05, "loss": 0.8493, "step": 5560 }, { "epoch": 0.7486379229165265, "grad_norm": 0.05432519494767404, "learning_rate": 5.4119587019488426e-05, "loss": 0.8884, "step": 5565 }, { "epoch": 0.749310553575032, "grad_norm": 0.054351604297057335, "learning_rate": 5.384895857772516e-05, "loss": 0.8333, "step": 5570 }, { "epoch": 0.7499831842335374, "grad_norm": 0.059333956268234095, "learning_rate": 5.357886037433607e-05, "loss": 0.8283, "step": 5575 }, { "epoch": 0.7506558148920428, "grad_norm": 0.05336078672412873, "learning_rate": 5.330929389881545e-05, "loss": 0.8876, "step": 5580 }, { "epoch": 0.7513284455505482, "grad_norm": 0.0537530672664337, "learning_rate": 5.304026063772532e-05, "loss": 0.8082, "step": 5585 }, { "epoch": 0.7520010762090537, "grad_norm": 0.06417130967521156, "learning_rate": 5.2771762074687324e-05, "loss": 0.8607, "step": 5590 }, { "epoch": 0.7526737068675591, "grad_norm": 0.0592025759553816, "learning_rate": 5.250379969037433e-05, "loss": 0.8574, "step": 5595 }, { "epoch": 0.7533463375260644, "grad_norm": 0.0529486580965492, "learning_rate": 5.2236374962502345e-05, "loss": 0.8191, "step": 5600 }, { "epoch": 0.7540189681845698, "grad_norm": 0.054655553837085674, "learning_rate": 5.196948936582263e-05, "loss": 0.8464, "step": 5605 }, { "epoch": 0.7546915988430752, "grad_norm": 0.06556056478392819, "learning_rate": 5.1703144372112934e-05, "loss": 0.8156, "step": 5610 }, { "epoch": 0.7553642295015807, "grad_norm": 0.061584993331036585, "learning_rate": 5.143734145017016e-05, "loss": 0.8543, "step": 5615 }, { "epoch": 0.7560368601600861, "grad_norm": 0.055055753002702476, "learning_rate": 5.1172082065801655e-05, "loss": 0.8324, "step": 5620 }, { "epoch": 0.7567094908185915, "grad_norm": 0.055015531712228206, "learning_rate": 5.0907367681817405e-05, "loss": 0.8223, "step": 5625 }, { "epoch": 0.7573821214770969, "grad_norm": 0.05608683516437471, "learning_rate": 5.064319975802199e-05, "loss": 0.8603, "step": 5630 }, { "epoch": 0.7580547521356024, "grad_norm": 0.055961769828895516, "learning_rate": 5.0379579751206345e-05, "loss": 0.8853, "step": 5635 }, { "epoch": 0.7587273827941078, "grad_norm": 0.05225964844822088, "learning_rate": 5.011650911513988e-05, "loss": 0.9061, "step": 5640 }, { "epoch": 0.7594000134526132, "grad_norm": 0.05236358496294752, "learning_rate": 4.9853989300562524e-05, "loss": 0.8287, "step": 5645 }, { "epoch": 0.7600726441111186, "grad_norm": 0.048850458923294954, "learning_rate": 4.9592021755176384e-05, "loss": 0.813, "step": 5650 }, { "epoch": 0.760745274769624, "grad_norm": 0.0553497239014274, "learning_rate": 4.933060792363824e-05, "loss": 0.8105, "step": 5655 }, { "epoch": 0.7614179054281294, "grad_norm": 0.05655304760380461, "learning_rate": 4.906974924755133e-05, "loss": 0.8515, "step": 5660 }, { "epoch": 0.7620905360866348, "grad_norm": 0.05777857275432363, "learning_rate": 4.880944716545717e-05, "loss": 0.8456, "step": 5665 }, { "epoch": 0.7627631667451402, "grad_norm": 0.05635569506280685, "learning_rate": 4.854970311282812e-05, "loss": 0.8522, "step": 5670 }, { "epoch": 0.7634357974036456, "grad_norm": 0.05834534565421247, "learning_rate": 4.82905185220591e-05, "loss": 0.8472, "step": 5675 }, { "epoch": 0.7641084280621511, "grad_norm": 0.061697683149533326, "learning_rate": 4.8031894822459736e-05, "loss": 0.8145, "step": 5680 }, { "epoch": 0.7647810587206565, "grad_norm": 0.05812977896957012, "learning_rate": 4.777383344024672e-05, "loss": 0.9117, "step": 5685 }, { "epoch": 0.7654536893791619, "grad_norm": 0.06166103572138631, "learning_rate": 4.751633579853561e-05, "loss": 0.8913, "step": 5690 }, { "epoch": 0.7661263200376673, "grad_norm": 0.06136708791864048, "learning_rate": 4.7259403317333126e-05, "loss": 0.8399, "step": 5695 }, { "epoch": 0.7667989506961728, "grad_norm": 0.05596607951538446, "learning_rate": 4.7003037413529574e-05, "loss": 0.8296, "step": 5700 }, { "epoch": 0.7674715813546782, "grad_norm": 0.05708602863383148, "learning_rate": 4.674723950089038e-05, "loss": 0.8412, "step": 5705 }, { "epoch": 0.7681442120131836, "grad_norm": 0.05552110409211543, "learning_rate": 4.649201099004904e-05, "loss": 0.8032, "step": 5710 }, { "epoch": 0.7688168426716889, "grad_norm": 0.059392941246110734, "learning_rate": 4.6237353288498985e-05, "loss": 0.8135, "step": 5715 }, { "epoch": 0.7694894733301944, "grad_norm": 0.05703942924248708, "learning_rate": 4.598326780058557e-05, "loss": 0.8675, "step": 5720 }, { "epoch": 0.7701621039886998, "grad_norm": 0.05451025292158218, "learning_rate": 4.572975592749893e-05, "loss": 0.8543, "step": 5725 }, { "epoch": 0.7708347346472052, "grad_norm": 0.05389981922242321, "learning_rate": 4.54768190672657e-05, "loss": 0.8691, "step": 5730 }, { "epoch": 0.7715073653057106, "grad_norm": 0.05844537688650812, "learning_rate": 4.522445861474154e-05, "loss": 0.8783, "step": 5735 }, { "epoch": 0.772179995964216, "grad_norm": 0.05419960076549575, "learning_rate": 4.4972675961603595e-05, "loss": 0.8363, "step": 5740 }, { "epoch": 0.7728526266227215, "grad_norm": 0.05644761781377002, "learning_rate": 4.4721472496342495e-05, "loss": 0.8797, "step": 5745 }, { "epoch": 0.7735252572812269, "grad_norm": 0.055204582352275294, "learning_rate": 4.447084960425484e-05, "loss": 0.8793, "step": 5750 }, { "epoch": 0.7741978879397323, "grad_norm": 0.05219581202676311, "learning_rate": 4.422080866743582e-05, "loss": 0.8228, "step": 5755 }, { "epoch": 0.7748705185982377, "grad_norm": 0.05674250597271216, "learning_rate": 4.3971351064770946e-05, "loss": 0.887, "step": 5760 }, { "epoch": 0.7755431492567432, "grad_norm": 0.056527448986813174, "learning_rate": 4.372247817192922e-05, "loss": 0.9162, "step": 5765 }, { "epoch": 0.7762157799152485, "grad_norm": 0.05450427810949781, "learning_rate": 4.347419136135504e-05, "loss": 0.8248, "step": 5770 }, { "epoch": 0.7768884105737539, "grad_norm": 0.054433127989288496, "learning_rate": 4.322649200226067e-05, "loss": 0.863, "step": 5775 }, { "epoch": 0.7775610412322593, "grad_norm": 0.056552172499089434, "learning_rate": 4.297938146061903e-05, "loss": 0.8532, "step": 5780 }, { "epoch": 0.7782336718907648, "grad_norm": 0.05481351515370127, "learning_rate": 4.2732861099155695e-05, "loss": 0.8644, "step": 5785 }, { "epoch": 0.7789063025492702, "grad_norm": 0.0521662997394286, "learning_rate": 4.248693227734166e-05, "loss": 0.8873, "step": 5790 }, { "epoch": 0.7795789332077756, "grad_norm": 0.05705457216343529, "learning_rate": 4.2241596351385955e-05, "loss": 0.8622, "step": 5795 }, { "epoch": 0.780251563866281, "grad_norm": 0.052719950954204266, "learning_rate": 4.1996854674227816e-05, "loss": 0.8826, "step": 5800 }, { "epoch": 0.7809241945247865, "grad_norm": 0.05724964717366726, "learning_rate": 4.1752708595529444e-05, "loss": 0.8377, "step": 5805 }, { "epoch": 0.7815968251832919, "grad_norm": 0.057199861980904605, "learning_rate": 4.15091594616686e-05, "loss": 0.8544, "step": 5810 }, { "epoch": 0.7822694558417973, "grad_norm": 0.06592489371919232, "learning_rate": 4.1266208615730994e-05, "loss": 0.8349, "step": 5815 }, { "epoch": 0.7829420865003027, "grad_norm": 0.05890644319372499, "learning_rate": 4.102385739750317e-05, "loss": 0.7997, "step": 5820 }, { "epoch": 0.7836147171588081, "grad_norm": 0.05862081496562069, "learning_rate": 4.07821071434648e-05, "loss": 0.8307, "step": 5825 }, { "epoch": 0.7842873478173135, "grad_norm": 0.0547581105650986, "learning_rate": 4.054095918678143e-05, "loss": 0.8585, "step": 5830 }, { "epoch": 0.7849599784758189, "grad_norm": 0.05812187289880372, "learning_rate": 4.03004148572973e-05, "loss": 0.7999, "step": 5835 }, { "epoch": 0.7856326091343243, "grad_norm": 0.06039957695819741, "learning_rate": 4.006047548152777e-05, "loss": 0.8523, "step": 5840 }, { "epoch": 0.7863052397928297, "grad_norm": 0.056273134561604085, "learning_rate": 3.9821142382652066e-05, "loss": 0.8957, "step": 5845 }, { "epoch": 0.7869778704513352, "grad_norm": 0.059358267937848326, "learning_rate": 3.9582416880506076e-05, "loss": 0.8684, "step": 5850 }, { "epoch": 0.7876505011098406, "grad_norm": 0.056033369088731466, "learning_rate": 3.934430029157494e-05, "loss": 0.8324, "step": 5855 }, { "epoch": 0.788323131768346, "grad_norm": 0.05454574945735745, "learning_rate": 3.910679392898601e-05, "loss": 0.8172, "step": 5860 }, { "epoch": 0.7889957624268514, "grad_norm": 0.05520965397535946, "learning_rate": 3.886989910250131e-05, "loss": 0.8256, "step": 5865 }, { "epoch": 0.7896683930853569, "grad_norm": 0.05354408451523844, "learning_rate": 3.863361711851047e-05, "loss": 0.8403, "step": 5870 }, { "epoch": 0.7903410237438623, "grad_norm": 0.06049601814389344, "learning_rate": 3.839794928002363e-05, "loss": 0.8738, "step": 5875 }, { "epoch": 0.7910136544023677, "grad_norm": 0.054589926246424225, "learning_rate": 3.8162896886664056e-05, "loss": 0.862, "step": 5880 }, { "epoch": 0.7916862850608731, "grad_norm": 0.07172856741569986, "learning_rate": 3.7928461234661053e-05, "loss": 0.9012, "step": 5885 }, { "epoch": 0.7923589157193784, "grad_norm": 0.05802045149036376, "learning_rate": 3.769464361684277e-05, "loss": 0.8757, "step": 5890 }, { "epoch": 0.7930315463778839, "grad_norm": 0.05608922145460016, "learning_rate": 3.746144532262931e-05, "loss": 0.8566, "step": 5895 }, { "epoch": 0.7937041770363893, "grad_norm": 0.05737451720854343, "learning_rate": 3.7228867638025225e-05, "loss": 0.8482, "step": 5900 }, { "epoch": 0.7943768076948947, "grad_norm": 0.06243507484604905, "learning_rate": 3.699691184561271e-05, "loss": 0.8555, "step": 5905 }, { "epoch": 0.7950494383534001, "grad_norm": 0.05625355913625404, "learning_rate": 3.6765579224544404e-05, "loss": 0.8327, "step": 5910 }, { "epoch": 0.7957220690119056, "grad_norm": 0.05399521331082984, "learning_rate": 3.6534871050536515e-05, "loss": 0.8364, "step": 5915 }, { "epoch": 0.796394699670411, "grad_norm": 0.0595413280030016, "learning_rate": 3.63047885958615e-05, "loss": 0.8524, "step": 5920 }, { "epoch": 0.7970673303289164, "grad_norm": 0.060737734887294295, "learning_rate": 3.607533312934127e-05, "loss": 0.8332, "step": 5925 }, { "epoch": 0.7977399609874218, "grad_norm": 0.058576781066161154, "learning_rate": 3.584650591634006e-05, "loss": 0.8713, "step": 5930 }, { "epoch": 0.7984125916459273, "grad_norm": 0.057089755190360525, "learning_rate": 3.561830821875764e-05, "loss": 0.8647, "step": 5935 }, { "epoch": 0.7990852223044327, "grad_norm": 0.054430708296939447, "learning_rate": 3.5390741295022096e-05, "loss": 0.8696, "step": 5940 }, { "epoch": 0.799757852962938, "grad_norm": 0.055218219076360806, "learning_rate": 3.516380640008306e-05, "loss": 0.8831, "step": 5945 }, { "epoch": 0.8004304836214434, "grad_norm": 0.05691195406760957, "learning_rate": 3.4937504785404836e-05, "loss": 0.7638, "step": 5950 }, { "epoch": 0.8011031142799488, "grad_norm": 0.057263267865896667, "learning_rate": 3.471183769895931e-05, "loss": 0.8816, "step": 5955 }, { "epoch": 0.8017757449384543, "grad_norm": 0.06494993390956837, "learning_rate": 3.448680638521922e-05, "loss": 0.8197, "step": 5960 }, { "epoch": 0.8024483755969597, "grad_norm": 0.05553648524493593, "learning_rate": 3.426241208515122e-05, "loss": 0.8664, "step": 5965 }, { "epoch": 0.8031210062554651, "grad_norm": 0.05889635161842296, "learning_rate": 3.4038656036209055e-05, "loss": 0.8223, "step": 5970 }, { "epoch": 0.8037936369139705, "grad_norm": 0.05031588773521702, "learning_rate": 3.3815539472326864e-05, "loss": 0.823, "step": 5975 }, { "epoch": 0.804466267572476, "grad_norm": 0.05726274410373388, "learning_rate": 3.35930636239121e-05, "loss": 0.8331, "step": 5980 }, { "epoch": 0.8051388982309814, "grad_norm": 0.05569140298750744, "learning_rate": 3.3371229717838924e-05, "loss": 0.8566, "step": 5985 }, { "epoch": 0.8058115288894868, "grad_norm": 0.05460163820200784, "learning_rate": 3.315003897744157e-05, "loss": 0.8306, "step": 5990 }, { "epoch": 0.8064841595479922, "grad_norm": 0.05846879661123691, "learning_rate": 3.292949262250725e-05, "loss": 0.7746, "step": 5995 }, { "epoch": 0.8071567902064977, "grad_norm": 0.05494495492901229, "learning_rate": 3.270959186926966e-05, "loss": 0.7972, "step": 6000 }, { "epoch": 0.807829420865003, "grad_norm": 0.05998506596591228, "learning_rate": 3.249033793040244e-05, "loss": 0.8058, "step": 6005 }, { "epoch": 0.8085020515235084, "grad_norm": 0.06763589968444475, "learning_rate": 3.227173201501195e-05, "loss": 0.8724, "step": 6010 }, { "epoch": 0.8091746821820138, "grad_norm": 0.05529737312453018, "learning_rate": 3.20537753286312e-05, "loss": 0.8504, "step": 6015 }, { "epoch": 0.8098473128405193, "grad_norm": 0.058924905982349764, "learning_rate": 3.183646907321282e-05, "loss": 0.8331, "step": 6020 }, { "epoch": 0.8105199434990247, "grad_norm": 0.054076196022565776, "learning_rate": 3.161981444712251e-05, "loss": 0.8423, "step": 6025 }, { "epoch": 0.8111925741575301, "grad_norm": 0.055090039911914114, "learning_rate": 3.140381264513263e-05, "loss": 0.8456, "step": 6030 }, { "epoch": 0.8118652048160355, "grad_norm": 0.05194250967465533, "learning_rate": 3.118846485841528e-05, "loss": 0.8186, "step": 6035 }, { "epoch": 0.812537835474541, "grad_norm": 0.055299499667346826, "learning_rate": 3.097377227453592e-05, "loss": 0.8192, "step": 6040 }, { "epoch": 0.8132104661330464, "grad_norm": 0.05702324547815417, "learning_rate": 3.075973607744703e-05, "loss": 0.7962, "step": 6045 }, { "epoch": 0.8138830967915518, "grad_norm": 0.053357064367373114, "learning_rate": 3.054635744748095e-05, "loss": 0.8157, "step": 6050 }, { "epoch": 0.8145557274500572, "grad_norm": 0.0540305307848929, "learning_rate": 3.0333637561344094e-05, "loss": 0.8531, "step": 6055 }, { "epoch": 0.8152283581085625, "grad_norm": 0.06722887878787541, "learning_rate": 3.0121577592110142e-05, "loss": 0.8157, "step": 6060 }, { "epoch": 0.815900988767068, "grad_norm": 0.05613468827920807, "learning_rate": 2.9910178709213294e-05, "loss": 0.8236, "step": 6065 }, { "epoch": 0.8165736194255734, "grad_norm": 0.05757688307873639, "learning_rate": 2.9699442078442404e-05, "loss": 0.8626, "step": 6070 }, { "epoch": 0.8172462500840788, "grad_norm": 0.05190644378889452, "learning_rate": 2.948936886193407e-05, "loss": 0.8699, "step": 6075 }, { "epoch": 0.8179188807425842, "grad_norm": 0.06058364911518402, "learning_rate": 2.927996021816641e-05, "loss": 0.8555, "step": 6080 }, { "epoch": 0.8185915114010897, "grad_norm": 0.05267663660873433, "learning_rate": 2.9071217301952748e-05, "loss": 0.8515, "step": 6085 }, { "epoch": 0.8192641420595951, "grad_norm": 0.05663551603220171, "learning_rate": 2.8863141264435118e-05, "loss": 0.8551, "step": 6090 }, { "epoch": 0.8199367727181005, "grad_norm": 0.06164283048494597, "learning_rate": 2.865573325307786e-05, "loss": 0.8508, "step": 6095 }, { "epoch": 0.8206094033766059, "grad_norm": 0.05686398651550054, "learning_rate": 2.8448994411661646e-05, "loss": 0.8058, "step": 6100 }, { "epoch": 0.8212820340351114, "grad_norm": 0.05699264568964854, "learning_rate": 2.8242925880276546e-05, "loss": 0.8845, "step": 6105 }, { "epoch": 0.8219546646936168, "grad_norm": 0.047913505171422455, "learning_rate": 2.803752879531647e-05, "loss": 0.8271, "step": 6110 }, { "epoch": 0.8226272953521222, "grad_norm": 0.057259062175757665, "learning_rate": 2.7832804289472317e-05, "loss": 0.8799, "step": 6115 }, { "epoch": 0.8232999260106275, "grad_norm": 0.05428004232970161, "learning_rate": 2.7628753491726018e-05, "loss": 0.8545, "step": 6120 }, { "epoch": 0.8239725566691329, "grad_norm": 0.05485478370375546, "learning_rate": 2.7425377527344296e-05, "loss": 0.8166, "step": 6125 }, { "epoch": 0.8246451873276384, "grad_norm": 0.05280322376501419, "learning_rate": 2.7222677517872366e-05, "loss": 0.831, "step": 6130 }, { "epoch": 0.8253178179861438, "grad_norm": 0.056970755995371276, "learning_rate": 2.7020654581127737e-05, "loss": 0.8742, "step": 6135 }, { "epoch": 0.8259904486446492, "grad_norm": 0.055352006316194045, "learning_rate": 2.681930983119425e-05, "loss": 0.8013, "step": 6140 }, { "epoch": 0.8266630793031546, "grad_norm": 0.05083832213731214, "learning_rate": 2.6618644378415676e-05, "loss": 0.8601, "step": 6145 }, { "epoch": 0.8273357099616601, "grad_norm": 0.05455909665181691, "learning_rate": 2.6418659329389723e-05, "loss": 0.8594, "step": 6150 }, { "epoch": 0.8280083406201655, "grad_norm": 0.056936005310754684, "learning_rate": 2.6219355786961925e-05, "loss": 0.8448, "step": 6155 }, { "epoch": 0.8286809712786709, "grad_norm": 0.05372150118403549, "learning_rate": 2.6020734850219556e-05, "loss": 0.8402, "step": 6160 }, { "epoch": 0.8293536019371763, "grad_norm": 0.06450196137290737, "learning_rate": 2.5822797614485606e-05, "loss": 0.8961, "step": 6165 }, { "epoch": 0.8300262325956818, "grad_norm": 0.05987145868487474, "learning_rate": 2.5625545171312634e-05, "loss": 0.8573, "step": 6170 }, { "epoch": 0.8306988632541872, "grad_norm": 0.05609755266031922, "learning_rate": 2.5428978608476834e-05, "loss": 0.8479, "step": 6175 }, { "epoch": 0.8313714939126925, "grad_norm": 0.05986487697713568, "learning_rate": 2.523309900997206e-05, "loss": 0.8871, "step": 6180 }, { "epoch": 0.8320441245711979, "grad_norm": 0.05605815696390431, "learning_rate": 2.5037907456003757e-05, "loss": 0.838, "step": 6185 }, { "epoch": 0.8327167552297033, "grad_norm": 0.05605263224140276, "learning_rate": 2.4843405022983064e-05, "loss": 0.8451, "step": 6190 }, { "epoch": 0.8333893858882088, "grad_norm": 0.051626633695471935, "learning_rate": 2.4649592783520828e-05, "loss": 0.8205, "step": 6195 }, { "epoch": 0.8340620165467142, "grad_norm": 0.06010243950226696, "learning_rate": 2.445647180642184e-05, "loss": 0.852, "step": 6200 }, { "epoch": 0.8347346472052196, "grad_norm": 0.05600829537038441, "learning_rate": 2.426404315667873e-05, "loss": 0.8239, "step": 6205 }, { "epoch": 0.835407277863725, "grad_norm": 0.05559431056965923, "learning_rate": 2.40723078954662e-05, "loss": 0.8413, "step": 6210 }, { "epoch": 0.8360799085222305, "grad_norm": 0.052606571948545705, "learning_rate": 2.3881267080135145e-05, "loss": 0.8483, "step": 6215 }, { "epoch": 0.8367525391807359, "grad_norm": 0.057151174334509676, "learning_rate": 2.3690921764206967e-05, "loss": 0.848, "step": 6220 }, { "epoch": 0.8374251698392413, "grad_norm": 0.05485937311783543, "learning_rate": 2.3501272997367494e-05, "loss": 0.8115, "step": 6225 }, { "epoch": 0.8380978004977467, "grad_norm": 0.057679697757406556, "learning_rate": 2.3312321825461405e-05, "loss": 0.8605, "step": 6230 }, { "epoch": 0.838770431156252, "grad_norm": 0.06015687395002475, "learning_rate": 2.312406929048634e-05, "loss": 0.7852, "step": 6235 }, { "epoch": 0.8394430618147575, "grad_norm": 0.05621629174433919, "learning_rate": 2.2936516430587322e-05, "loss": 0.8688, "step": 6240 }, { "epoch": 0.8401156924732629, "grad_norm": 0.05515376481880061, "learning_rate": 2.27496642800508e-05, "loss": 0.8153, "step": 6245 }, { "epoch": 0.8407883231317683, "grad_norm": 0.07267256435070797, "learning_rate": 2.2563513869299127e-05, "loss": 0.8182, "step": 6250 }, { "epoch": 0.8414609537902737, "grad_norm": 0.0545501771483703, "learning_rate": 2.2378066224884746e-05, "loss": 0.8485, "step": 6255 }, { "epoch": 0.8421335844487792, "grad_norm": 0.05542398846337032, "learning_rate": 2.2193322369484713e-05, "loss": 0.8103, "step": 6260 }, { "epoch": 0.8428062151072846, "grad_norm": 0.05775699421005842, "learning_rate": 2.2009283321894844e-05, "loss": 0.8355, "step": 6265 }, { "epoch": 0.84347884576579, "grad_norm": 0.061116771629393135, "learning_rate": 2.1825950097024213e-05, "loss": 0.8278, "step": 6270 }, { "epoch": 0.8441514764242954, "grad_norm": 0.0540819660202808, "learning_rate": 2.1643323705889526e-05, "loss": 0.8094, "step": 6275 }, { "epoch": 0.8448241070828009, "grad_norm": 0.05737711537280462, "learning_rate": 2.146140515560965e-05, "loss": 0.8307, "step": 6280 }, { "epoch": 0.8454967377413063, "grad_norm": 0.05656829425350953, "learning_rate": 2.1280195449399835e-05, "loss": 0.8335, "step": 6285 }, { "epoch": 0.8461693683998117, "grad_norm": 0.05677701600352013, "learning_rate": 2.1099695586566345e-05, "loss": 0.8519, "step": 6290 }, { "epoch": 0.846841999058317, "grad_norm": 0.059042745055333475, "learning_rate": 2.0919906562500986e-05, "loss": 0.8341, "step": 6295 }, { "epoch": 0.8475146297168225, "grad_norm": 0.05840627836007293, "learning_rate": 2.0740829368675484e-05, "loss": 0.8285, "step": 6300 }, { "epoch": 0.8481872603753279, "grad_norm": 0.057683231041837084, "learning_rate": 2.0562464992636057e-05, "loss": 0.8644, "step": 6305 }, { "epoch": 0.8488598910338333, "grad_norm": 0.06940718187443026, "learning_rate": 2.0384814417998037e-05, "loss": 0.8138, "step": 6310 }, { "epoch": 0.8495325216923387, "grad_norm": 0.05532173250390559, "learning_rate": 2.0207878624440356e-05, "loss": 0.823, "step": 6315 }, { "epoch": 0.8502051523508442, "grad_norm": 0.05656647116535209, "learning_rate": 2.0031658587700256e-05, "loss": 0.8324, "step": 6320 }, { "epoch": 0.8508777830093496, "grad_norm": 0.05855962244394195, "learning_rate": 1.985615527956777e-05, "loss": 0.8304, "step": 6325 }, { "epoch": 0.851550413667855, "grad_norm": 0.053095341157430594, "learning_rate": 1.968136966788041e-05, "loss": 0.8146, "step": 6330 }, { "epoch": 0.8522230443263604, "grad_norm": 0.05744344754191223, "learning_rate": 1.9507302716517948e-05, "loss": 0.8335, "step": 6335 }, { "epoch": 0.8528956749848658, "grad_norm": 0.0658666167608361, "learning_rate": 1.933395538539695e-05, "loss": 0.8632, "step": 6340 }, { "epoch": 0.8535683056433713, "grad_norm": 0.054351818029514914, "learning_rate": 1.9161328630465466e-05, "loss": 0.8447, "step": 6345 }, { "epoch": 0.8542409363018766, "grad_norm": 0.054387469255191084, "learning_rate": 1.8989423403698018e-05, "loss": 0.8252, "step": 6350 }, { "epoch": 0.854913566960382, "grad_norm": 0.054990218347926514, "learning_rate": 1.88182406530899e-05, "loss": 0.7834, "step": 6355 }, { "epoch": 0.8555861976188874, "grad_norm": 0.05337737093629034, "learning_rate": 1.8647781322652433e-05, "loss": 0.8214, "step": 6360 }, { "epoch": 0.8562588282773929, "grad_norm": 0.055254501098107256, "learning_rate": 1.8478046352407428e-05, "loss": 0.8073, "step": 6365 }, { "epoch": 0.8569314589358983, "grad_norm": 0.05450744542884276, "learning_rate": 1.830903667838209e-05, "loss": 0.8043, "step": 6370 }, { "epoch": 0.8576040895944037, "grad_norm": 0.05911858369289859, "learning_rate": 1.8140753232604005e-05, "loss": 0.8255, "step": 6375 }, { "epoch": 0.8582767202529091, "grad_norm": 0.059893021233977536, "learning_rate": 1.7973196943095718e-05, "loss": 0.8285, "step": 6380 }, { "epoch": 0.8589493509114146, "grad_norm": 0.055273372638828, "learning_rate": 1.7806368733869846e-05, "loss": 0.8306, "step": 6385 }, { "epoch": 0.85962198156992, "grad_norm": 0.05735158659763661, "learning_rate": 1.7640269524923966e-05, "loss": 0.8784, "step": 6390 }, { "epoch": 0.8602946122284254, "grad_norm": 0.05800089219667076, "learning_rate": 1.747490023223529e-05, "loss": 0.8554, "step": 6395 }, { "epoch": 0.8609672428869308, "grad_norm": 0.058282597154273114, "learning_rate": 1.7310261767755996e-05, "loss": 0.8028, "step": 6400 }, { "epoch": 0.8616398735454363, "grad_norm": 0.055664911895973175, "learning_rate": 1.7146355039407987e-05, "loss": 0.8483, "step": 6405 }, { "epoch": 0.8623125042039416, "grad_norm": 0.058430396050760815, "learning_rate": 1.6983180951077733e-05, "loss": 0.8632, "step": 6410 }, { "epoch": 0.862985134862447, "grad_norm": 0.056113876010569066, "learning_rate": 1.6820740402611683e-05, "loss": 0.8384, "step": 6415 }, { "epoch": 0.8636577655209524, "grad_norm": 0.05606910031852883, "learning_rate": 1.665903428981093e-05, "loss": 0.869, "step": 6420 }, { "epoch": 0.8643303961794578, "grad_norm": 0.05660103519452574, "learning_rate": 1.6498063504426436e-05, "loss": 0.875, "step": 6425 }, { "epoch": 0.8650030268379633, "grad_norm": 0.05564907615214855, "learning_rate": 1.6337828934154214e-05, "loss": 0.8796, "step": 6430 }, { "epoch": 0.8656756574964687, "grad_norm": 0.056719253319279316, "learning_rate": 1.6178331462630147e-05, "loss": 0.8531, "step": 6435 }, { "epoch": 0.8663482881549741, "grad_norm": 0.054424041588265346, "learning_rate": 1.6019571969425365e-05, "loss": 0.8062, "step": 6440 }, { "epoch": 0.8670209188134795, "grad_norm": 0.05628980367650593, "learning_rate": 1.586155133004141e-05, "loss": 0.8569, "step": 6445 }, { "epoch": 0.867693549471985, "grad_norm": 0.058910883963393904, "learning_rate": 1.5704270415905062e-05, "loss": 0.8056, "step": 6450 }, { "epoch": 0.8683661801304904, "grad_norm": 0.05506769991703955, "learning_rate": 1.5547730094364013e-05, "loss": 0.7787, "step": 6455 }, { "epoch": 0.8690388107889958, "grad_norm": 0.05284393894902228, "learning_rate": 1.5391931228681825e-05, "loss": 0.8619, "step": 6460 }, { "epoch": 0.8697114414475012, "grad_norm": 0.057806312978608075, "learning_rate": 1.5236874678033046e-05, "loss": 0.7836, "step": 6465 }, { "epoch": 0.8703840721060065, "grad_norm": 0.05827049124793768, "learning_rate": 1.508256129749878e-05, "loss": 0.8171, "step": 6470 }, { "epoch": 0.871056702764512, "grad_norm": 0.056373755145065804, "learning_rate": 1.4928991938061763e-05, "loss": 0.816, "step": 6475 }, { "epoch": 0.8717293334230174, "grad_norm": 0.05053172144818605, "learning_rate": 1.4776167446601661e-05, "loss": 0.7998, "step": 6480 }, { "epoch": 0.8724019640815228, "grad_norm": 0.05913567736346084, "learning_rate": 1.462408866589061e-05, "loss": 0.8285, "step": 6485 }, { "epoch": 0.8730745947400282, "grad_norm": 0.053859356228762825, "learning_rate": 1.4472756434588285e-05, "loss": 0.8464, "step": 6490 }, { "epoch": 0.8737472253985337, "grad_norm": 0.05635235866353657, "learning_rate": 1.432217158723742e-05, "loss": 0.8142, "step": 6495 }, { "epoch": 0.8744198560570391, "grad_norm": 0.05531847109049426, "learning_rate": 1.4172334954259312e-05, "loss": 0.8459, "step": 6500 }, { "epoch": 0.8750924867155445, "grad_norm": 0.05609083991899534, "learning_rate": 1.4023247361948947e-05, "loss": 0.8028, "step": 6505 }, { "epoch": 0.8757651173740499, "grad_norm": 0.052740932814354304, "learning_rate": 1.3874909632470794e-05, "loss": 0.8435, "step": 6510 }, { "epoch": 0.8764377480325554, "grad_norm": 0.05414838892939342, "learning_rate": 1.3727322583853978e-05, "loss": 0.8329, "step": 6515 }, { "epoch": 0.8771103786910608, "grad_norm": 0.05193151913599139, "learning_rate": 1.358048702998794e-05, "loss": 0.8514, "step": 6520 }, { "epoch": 0.8777830093495661, "grad_norm": 0.054388643704421494, "learning_rate": 1.343440378061792e-05, "loss": 0.8261, "step": 6525 }, { "epoch": 0.8784556400080715, "grad_norm": 0.05483708117307544, "learning_rate": 1.3289073641340404e-05, "loss": 0.8181, "step": 6530 }, { "epoch": 0.879128270666577, "grad_norm": 0.05704308117845801, "learning_rate": 1.3144497413598786e-05, "loss": 0.8303, "step": 6535 }, { "epoch": 0.8798009013250824, "grad_norm": 0.0641318422436249, "learning_rate": 1.3000675894678958e-05, "loss": 0.8557, "step": 6540 }, { "epoch": 0.8804735319835878, "grad_norm": 0.06452744180453934, "learning_rate": 1.2857609877704766e-05, "loss": 0.8754, "step": 6545 }, { "epoch": 0.8811461626420932, "grad_norm": 0.05486306222600904, "learning_rate": 1.2715300151633795e-05, "loss": 0.8418, "step": 6550 }, { "epoch": 0.8818187933005986, "grad_norm": 0.05682623509055596, "learning_rate": 1.2573747501252929e-05, "loss": 0.789, "step": 6555 }, { "epoch": 0.8824914239591041, "grad_norm": 0.05793181026249577, "learning_rate": 1.2432952707174077e-05, "loss": 0.8419, "step": 6560 }, { "epoch": 0.8831640546176095, "grad_norm": 0.05489821037443936, "learning_rate": 1.2292916545829857e-05, "loss": 0.799, "step": 6565 }, { "epoch": 0.8838366852761149, "grad_norm": 0.055009781968734066, "learning_rate": 1.2153639789469266e-05, "loss": 0.8353, "step": 6570 }, { "epoch": 0.8845093159346203, "grad_norm": 0.055488426737696446, "learning_rate": 1.2015123206153438e-05, "loss": 0.7957, "step": 6575 }, { "epoch": 0.8851819465931258, "grad_norm": 0.056704982958688414, "learning_rate": 1.1877367559751505e-05, "loss": 0.8774, "step": 6580 }, { "epoch": 0.8858545772516311, "grad_norm": 0.056620034329013004, "learning_rate": 1.1740373609936243e-05, "loss": 0.7935, "step": 6585 }, { "epoch": 0.8865272079101365, "grad_norm": 0.05453856210095586, "learning_rate": 1.1604142112179954e-05, "loss": 0.8338, "step": 6590 }, { "epoch": 0.8871998385686419, "grad_norm": 0.056687966623131726, "learning_rate": 1.1468673817750268e-05, "loss": 0.8078, "step": 6595 }, { "epoch": 0.8878724692271474, "grad_norm": 0.05698668416033602, "learning_rate": 1.13339694737061e-05, "loss": 0.8264, "step": 6600 }, { "epoch": 0.8885450998856528, "grad_norm": 0.06045248837509163, "learning_rate": 1.1200029822893403e-05, "loss": 0.8371, "step": 6605 }, { "epoch": 0.8892177305441582, "grad_norm": 0.05550968614064584, "learning_rate": 1.1066855603941116e-05, "loss": 0.8438, "step": 6610 }, { "epoch": 0.8898903612026636, "grad_norm": 0.0544441054388003, "learning_rate": 1.0934447551257075e-05, "loss": 0.7952, "step": 6615 }, { "epoch": 0.890562991861169, "grad_norm": 0.059624142666569964, "learning_rate": 1.0802806395024077e-05, "loss": 0.8208, "step": 6620 }, { "epoch": 0.8912356225196745, "grad_norm": 0.16363131712067192, "learning_rate": 1.0671932861195653e-05, "loss": 0.8434, "step": 6625 }, { "epoch": 0.8919082531781799, "grad_norm": 0.06206283538299498, "learning_rate": 1.0541827671492254e-05, "loss": 0.8057, "step": 6630 }, { "epoch": 0.8925808838366853, "grad_norm": 0.05836709662569859, "learning_rate": 1.04124915433971e-05, "loss": 0.8422, "step": 6635 }, { "epoch": 0.8932535144951906, "grad_norm": 0.061930616062657796, "learning_rate": 1.028392519015246e-05, "loss": 0.8281, "step": 6640 }, { "epoch": 0.8939261451536961, "grad_norm": 0.058062089585763935, "learning_rate": 1.0156129320755407e-05, "loss": 0.8381, "step": 6645 }, { "epoch": 0.8945987758122015, "grad_norm": 0.05726651358791121, "learning_rate": 1.0029104639954155e-05, "loss": 0.803, "step": 6650 }, { "epoch": 0.8952714064707069, "grad_norm": 0.05470645605167178, "learning_rate": 9.90285184824413e-06, "loss": 0.8297, "step": 6655 }, { "epoch": 0.8959440371292123, "grad_norm": 0.05675315145907022, "learning_rate": 9.777371641864001e-06, "loss": 0.8213, "step": 6660 }, { "epoch": 0.8966166677877178, "grad_norm": 0.058185966792208735, "learning_rate": 9.652664712791908e-06, "loss": 0.8086, "step": 6665 }, { "epoch": 0.8972892984462232, "grad_norm": 0.05818770614969196, "learning_rate": 9.528731748741669e-06, "loss": 0.8059, "step": 6670 }, { "epoch": 0.8979619291047286, "grad_norm": 0.0598088005724284, "learning_rate": 9.405573433158935e-06, "loss": 0.7718, "step": 6675 }, { "epoch": 0.898634559763234, "grad_norm": 0.05579621293351216, "learning_rate": 9.283190445217543e-06, "loss": 0.8388, "step": 6680 }, { "epoch": 0.8993071904217395, "grad_norm": 0.057509096259254926, "learning_rate": 9.161583459815541e-06, "loss": 0.8184, "step": 6685 }, { "epoch": 0.8999798210802449, "grad_norm": 0.05567137387737026, "learning_rate": 9.040753147571694e-06, "loss": 0.825, "step": 6690 }, { "epoch": 0.9006524517387503, "grad_norm": 0.054220374114114143, "learning_rate": 8.920700174821704e-06, "loss": 0.8298, "step": 6695 }, { "epoch": 0.9013250823972556, "grad_norm": 0.05624449640927385, "learning_rate": 8.801425203614403e-06, "loss": 0.8378, "step": 6700 }, { "epoch": 0.901997713055761, "grad_norm": 0.05358240234964744, "learning_rate": 8.68292889170839e-06, "loss": 0.862, "step": 6705 }, { "epoch": 0.9026703437142665, "grad_norm": 0.05764638889180579, "learning_rate": 8.565211892568147e-06, "loss": 0.7911, "step": 6710 }, { "epoch": 0.9033429743727719, "grad_norm": 0.06469252177667065, "learning_rate": 8.448274855360493e-06, "loss": 0.8209, "step": 6715 }, { "epoch": 0.9040156050312773, "grad_norm": 0.05723782270004928, "learning_rate": 8.33211842495114e-06, "loss": 0.8373, "step": 6720 }, { "epoch": 0.9046882356897827, "grad_norm": 0.05407480477711123, "learning_rate": 8.216743241900958e-06, "loss": 0.825, "step": 6725 }, { "epoch": 0.9053608663482882, "grad_norm": 0.05634412862285222, "learning_rate": 8.102149942462516e-06, "loss": 0.8393, "step": 6730 }, { "epoch": 0.9060334970067936, "grad_norm": 0.053895311446623226, "learning_rate": 7.988339158576678e-06, "loss": 0.8404, "step": 6735 }, { "epoch": 0.906706127665299, "grad_norm": 0.05503723217321855, "learning_rate": 7.875311517868848e-06, "loss": 0.8447, "step": 6740 }, { "epoch": 0.9073787583238044, "grad_norm": 0.05593172816905371, "learning_rate": 7.763067643645798e-06, "loss": 0.8082, "step": 6745 }, { "epoch": 0.9080513889823099, "grad_norm": 0.061482767030254945, "learning_rate": 7.651608154892124e-06, "loss": 0.8367, "step": 6750 }, { "epoch": 0.9087240196408153, "grad_norm": 0.05746238917932355, "learning_rate": 7.540933666266719e-06, "loss": 0.8386, "step": 6755 }, { "epoch": 0.9093966502993206, "grad_norm": 0.05576392822962793, "learning_rate": 7.431044788099583e-06, "loss": 0.8405, "step": 6760 }, { "epoch": 0.910069280957826, "grad_norm": 0.05616250631192827, "learning_rate": 7.321942126388286e-06, "loss": 0.8322, "step": 6765 }, { "epoch": 0.9107419116163314, "grad_norm": 0.058014672803636855, "learning_rate": 7.213626282794699e-06, "loss": 0.8366, "step": 6770 }, { "epoch": 0.9114145422748369, "grad_norm": 0.05772216240114444, "learning_rate": 7.106097854641779e-06, "loss": 0.8541, "step": 6775 }, { "epoch": 0.9120871729333423, "grad_norm": 0.05757569186911316, "learning_rate": 6.999357434910025e-06, "loss": 0.856, "step": 6780 }, { "epoch": 0.9127598035918477, "grad_norm": 0.05727236113439516, "learning_rate": 6.893405612234426e-06, "loss": 0.7808, "step": 6785 }, { "epoch": 0.9134324342503531, "grad_norm": 0.05536651126384854, "learning_rate": 6.788242970901187e-06, "loss": 0.8185, "step": 6790 }, { "epoch": 0.9141050649088586, "grad_norm": 0.05593060161696056, "learning_rate": 6.68387009084434e-06, "loss": 0.8486, "step": 6795 }, { "epoch": 0.914777695567364, "grad_norm": 0.053834765840133, "learning_rate": 6.580287547642771e-06, "loss": 0.8146, "step": 6800 }, { "epoch": 0.9154503262258694, "grad_norm": 0.05870029877629974, "learning_rate": 6.47749591251695e-06, "loss": 0.8048, "step": 6805 }, { "epoch": 0.9161229568843748, "grad_norm": 0.05721706302420605, "learning_rate": 6.3754957523256715e-06, "loss": 0.8188, "step": 6810 }, { "epoch": 0.9167955875428802, "grad_norm": 0.055140825623670245, "learning_rate": 6.274287629563119e-06, "loss": 0.7844, "step": 6815 }, { "epoch": 0.9174682182013856, "grad_norm": 0.051653691682714734, "learning_rate": 6.173872102355654e-06, "loss": 0.805, "step": 6820 }, { "epoch": 0.918140848859891, "grad_norm": 0.05876727045069556, "learning_rate": 6.074249724458735e-06, "loss": 0.8785, "step": 6825 }, { "epoch": 0.9188134795183964, "grad_norm": 0.054597347339118626, "learning_rate": 5.975421045253953e-06, "loss": 0.8406, "step": 6830 }, { "epoch": 0.9194861101769018, "grad_norm": 0.0523982140672833, "learning_rate": 5.877386609745832e-06, "loss": 0.8333, "step": 6835 }, { "epoch": 0.9201587408354073, "grad_norm": 0.0538127071729553, "learning_rate": 5.780146958559017e-06, "loss": 0.8413, "step": 6840 }, { "epoch": 0.9208313714939127, "grad_norm": 0.05806274114630895, "learning_rate": 5.683702627935194e-06, "loss": 0.8516, "step": 6845 }, { "epoch": 0.9215040021524181, "grad_norm": 0.05360779533579333, "learning_rate": 5.58805414973007e-06, "loss": 0.8544, "step": 6850 }, { "epoch": 0.9221766328109235, "grad_norm": 0.055601450532106245, "learning_rate": 5.4932020514106e-06, "loss": 0.8552, "step": 6855 }, { "epoch": 0.922849263469429, "grad_norm": 0.0521116662169681, "learning_rate": 5.3991468560519666e-06, "loss": 0.8574, "step": 6860 }, { "epoch": 0.9235218941279344, "grad_norm": 0.052641462098686344, "learning_rate": 5.305889082334652e-06, "loss": 0.86, "step": 6865 }, { "epoch": 0.9241945247864398, "grad_norm": 0.0593752389096462, "learning_rate": 5.213429244541756e-06, "loss": 0.84, "step": 6870 }, { "epoch": 0.9248671554449451, "grad_norm": 0.05770582668727739, "learning_rate": 5.121767852555963e-06, "loss": 0.8699, "step": 6875 }, { "epoch": 0.9255397861034506, "grad_norm": 0.05826386563520872, "learning_rate": 5.030905411856851e-06, "loss": 0.8269, "step": 6880 }, { "epoch": 0.926212416761956, "grad_norm": 0.05588686227140018, "learning_rate": 4.940842423518082e-06, "loss": 0.8338, "step": 6885 }, { "epoch": 0.9268850474204614, "grad_norm": 0.059415826857863695, "learning_rate": 4.851579384204602e-06, "loss": 0.8057, "step": 6890 }, { "epoch": 0.9275576780789668, "grad_norm": 0.05721555947001062, "learning_rate": 4.763116786169929e-06, "loss": 0.8111, "step": 6895 }, { "epoch": 0.9282303087374723, "grad_norm": 0.06965978230318905, "learning_rate": 4.675455117253452e-06, "loss": 0.8653, "step": 6900 }, { "epoch": 0.9289029393959777, "grad_norm": 0.05263142191055551, "learning_rate": 4.58859486087767e-06, "loss": 0.7804, "step": 6905 }, { "epoch": 0.9295755700544831, "grad_norm": 0.054579198109746645, "learning_rate": 4.502536496045672e-06, "loss": 0.8243, "step": 6910 }, { "epoch": 0.9302482007129885, "grad_norm": 0.05758805432002544, "learning_rate": 4.417280497338349e-06, "loss": 0.8117, "step": 6915 }, { "epoch": 0.930920831371494, "grad_norm": 0.05901453910245508, "learning_rate": 4.3328273349117985e-06, "loss": 0.7957, "step": 6920 }, { "epoch": 0.9315934620299994, "grad_norm": 0.05998448102095731, "learning_rate": 4.249177474494858e-06, "loss": 0.8475, "step": 6925 }, { "epoch": 0.9322660926885047, "grad_norm": 0.05962514390284862, "learning_rate": 4.166331377386361e-06, "loss": 0.8389, "step": 6930 }, { "epoch": 0.9329387233470101, "grad_norm": 0.05075854568401546, "learning_rate": 4.084289500452686e-06, "loss": 0.8383, "step": 6935 }, { "epoch": 0.9336113540055155, "grad_norm": 0.056229295540856775, "learning_rate": 4.003052296125275e-06, "loss": 0.8432, "step": 6940 }, { "epoch": 0.934283984664021, "grad_norm": 0.05707159182418883, "learning_rate": 3.922620212398053e-06, "loss": 0.7909, "step": 6945 }, { "epoch": 0.9349566153225264, "grad_norm": 0.056476160275041105, "learning_rate": 3.842993692824997e-06, "loss": 0.7794, "step": 6950 }, { "epoch": 0.9356292459810318, "grad_norm": 0.0575531338498515, "learning_rate": 3.7641731765176875e-06, "loss": 0.8389, "step": 6955 }, { "epoch": 0.9363018766395372, "grad_norm": 0.06345716898715516, "learning_rate": 3.6861590981428936e-06, "loss": 0.8821, "step": 6960 }, { "epoch": 0.9369745072980427, "grad_norm": 0.05436220392225888, "learning_rate": 3.6089518879201918e-06, "loss": 0.8533, "step": 6965 }, { "epoch": 0.9376471379565481, "grad_norm": 0.05591349911377824, "learning_rate": 3.5325519716195184e-06, "loss": 0.8549, "step": 6970 }, { "epoch": 0.9383197686150535, "grad_norm": 0.058008597775419855, "learning_rate": 3.4569597705589368e-06, "loss": 0.8246, "step": 6975 }, { "epoch": 0.9389923992735589, "grad_norm": 0.05621372976671557, "learning_rate": 3.3821757016021746e-06, "loss": 0.7854, "step": 6980 }, { "epoch": 0.9396650299320644, "grad_norm": 0.05591057307278753, "learning_rate": 3.3082001771564724e-06, "loss": 0.7782, "step": 6985 }, { "epoch": 0.9403376605905697, "grad_norm": 0.05748855166025494, "learning_rate": 3.235033605170223e-06, "loss": 0.8704, "step": 6990 }, { "epoch": 0.9410102912490751, "grad_norm": 0.061305295309165904, "learning_rate": 3.162676389130686e-06, "loss": 0.8486, "step": 6995 }, { "epoch": 0.9416829219075805, "grad_norm": 0.05261450189267661, "learning_rate": 3.091128928061909e-06, "loss": 0.8399, "step": 7000 }, { "epoch": 0.9423555525660859, "grad_norm": 0.05918033527471021, "learning_rate": 3.020391616522344e-06, "loss": 0.9212, "step": 7005 }, { "epoch": 0.9430281832245914, "grad_norm": 0.056455106699066976, "learning_rate": 2.950464844602818e-06, "loss": 0.8073, "step": 7010 }, { "epoch": 0.9437008138830968, "grad_norm": 0.055853500334889054, "learning_rate": 2.881348997924282e-06, "loss": 0.8696, "step": 7015 }, { "epoch": 0.9443734445416022, "grad_norm": 0.061415441559629934, "learning_rate": 2.8130444576357323e-06, "loss": 0.8452, "step": 7020 }, { "epoch": 0.9450460752001076, "grad_norm": 0.06526320016491569, "learning_rate": 2.7455516004121436e-06, "loss": 0.7945, "step": 7025 }, { "epoch": 0.9457187058586131, "grad_norm": 0.05844190506558609, "learning_rate": 2.6788707984523207e-06, "loss": 0.9219, "step": 7030 }, { "epoch": 0.9463913365171185, "grad_norm": 0.055208528689147815, "learning_rate": 2.6130024194768675e-06, "loss": 0.8225, "step": 7035 }, { "epoch": 0.9470639671756239, "grad_norm": 0.05708207902042239, "learning_rate": 2.5479468267261715e-06, "loss": 0.8278, "step": 7040 }, { "epoch": 0.9477365978341293, "grad_norm": 0.05520554814739056, "learning_rate": 2.4837043789584233e-06, "loss": 0.8559, "step": 7045 }, { "epoch": 0.9484092284926346, "grad_norm": 0.05526467988369699, "learning_rate": 2.420275430447566e-06, "loss": 0.822, "step": 7050 }, { "epoch": 0.9490818591511401, "grad_norm": 0.04965099416809, "learning_rate": 2.357660330981448e-06, "loss": 0.7798, "step": 7055 }, { "epoch": 0.9497544898096455, "grad_norm": 0.053339771340983035, "learning_rate": 2.2958594258597583e-06, "loss": 0.7858, "step": 7060 }, { "epoch": 0.9504271204681509, "grad_norm": 0.06424545288393499, "learning_rate": 2.2348730558922613e-06, "loss": 0.8881, "step": 7065 }, { "epoch": 0.9510997511266563, "grad_norm": 0.05476638703664176, "learning_rate": 2.174701557396846e-06, "loss": 0.822, "step": 7070 }, { "epoch": 0.9517723817851618, "grad_norm": 0.05866256833078084, "learning_rate": 2.1153452621976153e-06, "loss": 0.8408, "step": 7075 }, { "epoch": 0.9524450124436672, "grad_norm": 0.05569889642062564, "learning_rate": 2.056804497623199e-06, "loss": 0.81, "step": 7080 }, { "epoch": 0.9531176431021726, "grad_norm": 0.057235768610266455, "learning_rate": 1.999079586504826e-06, "loss": 0.8469, "step": 7085 }, { "epoch": 0.953790273760678, "grad_norm": 0.05737113793545201, "learning_rate": 1.94217084717459e-06, "loss": 0.8727, "step": 7090 }, { "epoch": 0.9544629044191835, "grad_norm": 0.05474868616850645, "learning_rate": 1.8860785934637357e-06, "loss": 0.7925, "step": 7095 }, { "epoch": 0.9551355350776889, "grad_norm": 0.0595122324063722, "learning_rate": 1.8308031347007923e-06, "loss": 0.8016, "step": 7100 }, { "epoch": 0.9558081657361942, "grad_norm": 0.055035263805029065, "learning_rate": 1.7763447757100425e-06, "loss": 0.8676, "step": 7105 }, { "epoch": 0.9564807963946996, "grad_norm": 0.05590246554884426, "learning_rate": 1.7227038168097395e-06, "loss": 0.873, "step": 7110 }, { "epoch": 0.957153427053205, "grad_norm": 0.05119675154618015, "learning_rate": 1.66988055381041e-06, "loss": 0.8029, "step": 7115 }, { "epoch": 0.9578260577117105, "grad_norm": 0.05234276265076411, "learning_rate": 1.6178752780133864e-06, "loss": 0.8318, "step": 7120 }, { "epoch": 0.9584986883702159, "grad_norm": 0.05323151931453215, "learning_rate": 1.5666882762090272e-06, "loss": 0.8052, "step": 7125 }, { "epoch": 0.9591713190287213, "grad_norm": 0.05694376211991154, "learning_rate": 1.5163198306751834e-06, "loss": 0.8961, "step": 7130 }, { "epoch": 0.9598439496872267, "grad_norm": 0.05719101441637811, "learning_rate": 1.4667702191757502e-06, "loss": 0.8327, "step": 7135 }, { "epoch": 0.9605165803457322, "grad_norm": 0.05715755903971462, "learning_rate": 1.4180397149589352e-06, "loss": 0.8363, "step": 7140 }, { "epoch": 0.9611892110042376, "grad_norm": 0.056060246764845556, "learning_rate": 1.3701285867559586e-06, "loss": 0.8023, "step": 7145 }, { "epoch": 0.961861841662743, "grad_norm": 0.05502048962304145, "learning_rate": 1.3230370987794558e-06, "loss": 0.8221, "step": 7150 }, { "epoch": 0.9625344723212484, "grad_norm": 0.05675516018646223, "learning_rate": 1.2767655107219942e-06, "loss": 0.79, "step": 7155 }, { "epoch": 0.9632071029797539, "grad_norm": 0.06612529899480675, "learning_rate": 1.2313140777547414e-06, "loss": 0.8517, "step": 7160 }, { "epoch": 0.9638797336382592, "grad_norm": 0.05363591253086607, "learning_rate": 1.1866830505259828e-06, "loss": 0.8356, "step": 7165 }, { "epoch": 0.9645523642967646, "grad_norm": 0.05922830366383967, "learning_rate": 1.1428726751597561e-06, "loss": 0.8425, "step": 7170 }, { "epoch": 0.96522499495527, "grad_norm": 0.05594096747574841, "learning_rate": 1.0998831932545194e-06, "loss": 0.8429, "step": 7175 }, { "epoch": 0.9658976256137755, "grad_norm": 0.05487500145092262, "learning_rate": 1.0577148418817848e-06, "loss": 0.776, "step": 7180 }, { "epoch": 0.9665702562722809, "grad_norm": 0.05596838295843446, "learning_rate": 1.0163678535848041e-06, "loss": 0.8421, "step": 7185 }, { "epoch": 0.9672428869307863, "grad_norm": 0.05441153542171681, "learning_rate": 9.75842456377368e-07, "loss": 0.8411, "step": 7190 }, { "epoch": 0.9679155175892917, "grad_norm": 0.05366297165683922, "learning_rate": 9.361388737424258e-07, "loss": 0.824, "step": 7195 }, { "epoch": 0.9685881482477972, "grad_norm": 0.05545163132286662, "learning_rate": 8.972573246309345e-07, "loss": 0.8556, "step": 7200 }, { "epoch": 0.9692607789063026, "grad_norm": 0.064116228334137, "learning_rate": 8.591980234606777e-07, "loss": 0.7905, "step": 7205 }, { "epoch": 0.969933409564808, "grad_norm": 0.05312151035526173, "learning_rate": 8.219611801149495e-07, "loss": 0.8268, "step": 7210 }, { "epoch": 0.9706060402233134, "grad_norm": 0.058194773519087764, "learning_rate": 7.855469999415719e-07, "loss": 0.7896, "step": 7215 }, { "epoch": 0.9712786708818187, "grad_norm": 0.05863705966020045, "learning_rate": 7.49955683751613e-07, "loss": 0.8392, "step": 7220 }, { "epoch": 0.9719513015403242, "grad_norm": 0.05436753504886398, "learning_rate": 7.151874278183701e-07, "loss": 0.8249, "step": 7225 }, { "epoch": 0.9726239321988296, "grad_norm": 0.05899966125168415, "learning_rate": 6.812424238762714e-07, "loss": 0.8125, "step": 7230 }, { "epoch": 0.973296562857335, "grad_norm": 0.05519895444874772, "learning_rate": 6.481208591197773e-07, "loss": 0.8619, "step": 7235 }, { "epoch": 0.9739691935158404, "grad_norm": 0.05253771806298885, "learning_rate": 6.158229162023798e-07, "loss": 0.8025, "step": 7240 }, { "epoch": 0.9746418241743459, "grad_norm": 0.0536469386569897, "learning_rate": 5.843487732356545e-07, "loss": 0.8377, "step": 7245 }, { "epoch": 0.9753144548328513, "grad_norm": 0.0531695013404097, "learning_rate": 5.536986037881108e-07, "loss": 0.8034, "step": 7250 }, { "epoch": 0.9759870854913567, "grad_norm": 0.06873102864152862, "learning_rate": 5.238725768844265e-07, "loss": 0.8184, "step": 7255 }, { "epoch": 0.9766597161498621, "grad_norm": 0.05324596611103951, "learning_rate": 4.94870857004398e-07, "loss": 0.8976, "step": 7260 }, { "epoch": 0.9773323468083676, "grad_norm": 0.057897135420761366, "learning_rate": 4.666936040820746e-07, "loss": 0.813, "step": 7265 }, { "epoch": 0.978004977466873, "grad_norm": 0.0570970584749799, "learning_rate": 4.3934097350484254e-07, "loss": 0.8261, "step": 7270 }, { "epoch": 0.9786776081253784, "grad_norm": 0.05240797601588687, "learning_rate": 4.128131161126258e-07, "loss": 0.7956, "step": 7275 }, { "epoch": 0.9793502387838837, "grad_norm": 0.057826394271235156, "learning_rate": 3.8711017819698655e-07, "loss": 0.8371, "step": 7280 }, { "epoch": 0.9800228694423891, "grad_norm": 0.06015396248293072, "learning_rate": 3.6223230150040914e-07, "loss": 0.8334, "step": 7285 }, { "epoch": 0.9806955001008946, "grad_norm": 0.06055468276551505, "learning_rate": 3.3817962321540104e-07, "loss": 0.7761, "step": 7290 }, { "epoch": 0.9813681307594, "grad_norm": 0.05544436920380559, "learning_rate": 3.149522759838263e-07, "loss": 0.8265, "step": 7295 }, { "epoch": 0.9820407614179054, "grad_norm": 0.06544097337785165, "learning_rate": 2.9255038789613993e-07, "loss": 0.8964, "step": 7300 }, { "epoch": 0.9827133920764108, "grad_norm": 0.0518140128621406, "learning_rate": 2.709740824906881e-07, "loss": 0.8337, "step": 7305 }, { "epoch": 0.9833860227349163, "grad_norm": 0.05706722221557735, "learning_rate": 2.502234787530255e-07, "loss": 0.7966, "step": 7310 }, { "epoch": 0.9840586533934217, "grad_norm": 0.056752158307538485, "learning_rate": 2.3029869111528265e-07, "loss": 0.8182, "step": 7315 }, { "epoch": 0.9847312840519271, "grad_norm": 0.05552321850596303, "learning_rate": 2.111998294554662e-07, "loss": 0.8017, "step": 7320 }, { "epoch": 0.9854039147104325, "grad_norm": 0.059757292747136675, "learning_rate": 1.929269990969262e-07, "loss": 0.8744, "step": 7325 }, { "epoch": 0.986076545368938, "grad_norm": 0.056043523845769314, "learning_rate": 1.754803008077399e-07, "loss": 0.838, "step": 7330 }, { "epoch": 0.9867491760274433, "grad_norm": 0.053170486374633105, "learning_rate": 1.588598308001787e-07, "loss": 0.8019, "step": 7335 }, { "epoch": 0.9874218066859487, "grad_norm": 0.057704518479353664, "learning_rate": 1.430656807301256e-07, "loss": 0.805, "step": 7340 }, { "epoch": 0.9880944373444541, "grad_norm": 0.05413218505886977, "learning_rate": 1.2809793769665844e-07, "loss": 0.8237, "step": 7345 }, { "epoch": 0.9887670680029595, "grad_norm": 0.057060373317335183, "learning_rate": 1.1395668424148408e-07, "loss": 0.8282, "step": 7350 }, { "epoch": 0.989439698661465, "grad_norm": 0.0596101384204014, "learning_rate": 1.0064199834852182e-07, "loss": 0.8736, "step": 7355 }, { "epoch": 0.9901123293199704, "grad_norm": 0.05133772973983179, "learning_rate": 8.815395344347054e-08, "loss": 0.7716, "step": 7360 }, { "epoch": 0.9907849599784758, "grad_norm": 0.057739414768695714, "learning_rate": 7.649261839340893e-08, "loss": 0.8556, "step": 7365 }, { "epoch": 0.9914575906369812, "grad_norm": 0.05337934101490364, "learning_rate": 6.56580575063792e-08, "loss": 0.7968, "step": 7370 }, { "epoch": 0.9921302212954867, "grad_norm": 0.06044145462857247, "learning_rate": 5.565033053108736e-08, "loss": 0.8404, "step": 7375 }, { "epoch": 0.9928028519539921, "grad_norm": 0.055178278745134306, "learning_rate": 4.6469492656570074e-08, "loss": 0.8422, "step": 7380 }, { "epoch": 0.9934754826124975, "grad_norm": 0.0556030772132154, "learning_rate": 3.811559451182833e-08, "loss": 0.8267, "step": 7385 }, { "epoch": 0.9941481132710029, "grad_norm": 0.0588301847035514, "learning_rate": 3.0588682165594294e-08, "loss": 0.8654, "step": 7390 }, { "epoch": 0.9948207439295083, "grad_norm": 0.05942396313597498, "learning_rate": 2.3888797126081494e-08, "loss": 0.7917, "step": 7395 }, { "epoch": 0.9954933745880137, "grad_norm": 0.05141832056196206, "learning_rate": 1.8015976340751693e-08, "loss": 0.7945, "step": 7400 }, { "epoch": 0.9961660052465191, "grad_norm": 0.06354419407035226, "learning_rate": 1.2970252196098373e-08, "loss": 0.8445, "step": 7405 }, { "epoch": 0.9968386359050245, "grad_norm": 0.05794706209934556, "learning_rate": 8.751652517463569e-09, "loss": 0.7889, "step": 7410 }, { "epoch": 0.99751126656353, "grad_norm": 0.05731964589174625, "learning_rate": 5.360200568904627e-09, "loss": 0.819, "step": 7415 }, { "epoch": 0.9981838972220354, "grad_norm": 0.052763476093352656, "learning_rate": 2.795915053077635e-09, "loss": 0.8117, "step": 7420 }, { "epoch": 0.9988565278805408, "grad_norm": 0.058025404251231066, "learning_rate": 1.0588101110708958e-09, "loss": 0.7894, "step": 7425 }, { "epoch": 0.9995291585390462, "grad_norm": 0.053894275277534875, "learning_rate": 1.488953224049183e-10, "loss": 0.8043, "step": 7430 }, { "epoch": 0.9999327369341494, "eval_loss": 0.8264116644859314, "eval_runtime": 195.7739, "eval_samples_per_second": 255.795, "eval_steps_per_second": 7.994, "step": 7433 }, { "epoch": 0.9999327369341494, "step": 7433, "total_flos": 416264403386368.0, "train_loss": 0.9400932153108766, "train_runtime": 15499.1561, "train_samples_per_second": 61.389, "train_steps_per_second": 0.48 } ], "logging_steps": 5, "max_steps": 7433, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 416264403386368.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }