{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998731447418495, "eval_steps": 500, "global_step": 3941, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002537105163009007, "grad_norm": 4.738877773284912, "learning_rate": 2.278481012658228e-07, "loss": 1.6626, "step": 10 }, { "epoch": 0.005074210326018014, "grad_norm": 4.201930046081543, "learning_rate": 4.810126582278482e-07, "loss": 1.6508, "step": 20 }, { "epoch": 0.00761131548902702, "grad_norm": 3.1523282527923584, "learning_rate": 7.341772151898735e-07, "loss": 1.6588, "step": 30 }, { "epoch": 0.010148420652036028, "grad_norm": 2.514768123626709, "learning_rate": 9.873417721518988e-07, "loss": 1.6091, "step": 40 }, { "epoch": 0.012685525815045033, "grad_norm": 1.9985918998718262, "learning_rate": 1.240506329113924e-06, "loss": 1.5742, "step": 50 }, { "epoch": 0.01522263097805404, "grad_norm": 1.5602017641067505, "learning_rate": 1.4936708860759495e-06, "loss": 1.5291, "step": 60 }, { "epoch": 0.01775973614106305, "grad_norm": 1.5359352827072144, "learning_rate": 1.7468354430379747e-06, "loss": 1.5007, "step": 70 }, { "epoch": 0.020296841304072055, "grad_norm": 1.5024731159210205, "learning_rate": 2.0000000000000003e-06, "loss": 1.4645, "step": 80 }, { "epoch": 0.022833946467081062, "grad_norm": 1.4507105350494385, "learning_rate": 2.2531645569620258e-06, "loss": 1.4484, "step": 90 }, { "epoch": 0.025371051630090066, "grad_norm": 1.4768809080123901, "learning_rate": 2.5063291139240508e-06, "loss": 1.4516, "step": 100 }, { "epoch": 0.027908156793099072, "grad_norm": 1.3820849657058716, "learning_rate": 2.7594936708860766e-06, "loss": 1.4131, "step": 110 }, { "epoch": 0.03044526195610808, "grad_norm": 1.4740626811981201, "learning_rate": 3.0126582278481016e-06, "loss": 1.4007, "step": 120 }, { "epoch": 0.03298236711911709, "grad_norm": 1.3979785442352295, "learning_rate": 3.265822784810127e-06, "loss": 1.3964, "step": 130 }, { "epoch": 0.0355194722821261, "grad_norm": 1.3529947996139526, "learning_rate": 3.518987341772152e-06, "loss": 1.3681, "step": 140 }, { "epoch": 0.038056577445135104, "grad_norm": 1.5106327533721924, "learning_rate": 3.7721518987341775e-06, "loss": 1.3842, "step": 150 }, { "epoch": 0.04059368260814411, "grad_norm": 1.469246506690979, "learning_rate": 4.025316455696203e-06, "loss": 1.3481, "step": 160 }, { "epoch": 0.04313078777115312, "grad_norm": 1.4268747568130493, "learning_rate": 4.278481012658228e-06, "loss": 1.3738, "step": 170 }, { "epoch": 0.045667892934162124, "grad_norm": 1.364691138267517, "learning_rate": 4.531645569620253e-06, "loss": 1.3568, "step": 180 }, { "epoch": 0.04820499809717113, "grad_norm": 1.5261390209197998, "learning_rate": 4.784810126582279e-06, "loss": 1.347, "step": 190 }, { "epoch": 0.05074210326018013, "grad_norm": 1.4499975442886353, "learning_rate": 5.037974683544305e-06, "loss": 1.367, "step": 200 }, { "epoch": 0.05327920842318914, "grad_norm": 1.506212592124939, "learning_rate": 5.29113924050633e-06, "loss": 1.3344, "step": 210 }, { "epoch": 0.055816313586198145, "grad_norm": 1.4976462125778198, "learning_rate": 5.544303797468355e-06, "loss": 1.3454, "step": 220 }, { "epoch": 0.05835341874920715, "grad_norm": 1.45829439163208, "learning_rate": 5.79746835443038e-06, "loss": 1.3444, "step": 230 }, { "epoch": 0.06089052391221616, "grad_norm": 1.502082347869873, "learning_rate": 6.050632911392406e-06, "loss": 1.3264, "step": 240 }, { "epoch": 0.06342762907522517, "grad_norm": 1.525439739227295, "learning_rate": 6.303797468354431e-06, "loss": 1.3259, "step": 250 }, { "epoch": 0.06596473423823418, "grad_norm": 1.4980591535568237, "learning_rate": 6.5569620253164564e-06, "loss": 1.3338, "step": 260 }, { "epoch": 0.06850183940124319, "grad_norm": 1.4034385681152344, "learning_rate": 6.810126582278481e-06, "loss": 1.3293, "step": 270 }, { "epoch": 0.0710389445642522, "grad_norm": 1.3902912139892578, "learning_rate": 7.0632911392405065e-06, "loss": 1.3339, "step": 280 }, { "epoch": 0.0735760497272612, "grad_norm": 1.546105146408081, "learning_rate": 7.316455696202533e-06, "loss": 1.3232, "step": 290 }, { "epoch": 0.07611315489027021, "grad_norm": 1.396514654159546, "learning_rate": 7.569620253164558e-06, "loss": 1.3185, "step": 300 }, { "epoch": 0.07865026005327921, "grad_norm": 1.400497555732727, "learning_rate": 7.822784810126582e-06, "loss": 1.3231, "step": 310 }, { "epoch": 0.08118736521628822, "grad_norm": 1.4383889436721802, "learning_rate": 8.075949367088608e-06, "loss": 1.2916, "step": 320 }, { "epoch": 0.08372447037929723, "grad_norm": 1.383995532989502, "learning_rate": 8.329113924050633e-06, "loss": 1.3045, "step": 330 }, { "epoch": 0.08626157554230623, "grad_norm": 1.544356346130371, "learning_rate": 8.582278481012659e-06, "loss": 1.313, "step": 340 }, { "epoch": 0.08879868070531524, "grad_norm": 1.5004216432571411, "learning_rate": 8.835443037974685e-06, "loss": 1.3128, "step": 350 }, { "epoch": 0.09133578586832425, "grad_norm": 1.5500295162200928, "learning_rate": 9.08860759493671e-06, "loss": 1.2921, "step": 360 }, { "epoch": 0.09387289103133326, "grad_norm": 1.3721230030059814, "learning_rate": 9.341772151898735e-06, "loss": 1.2989, "step": 370 }, { "epoch": 0.09640999619434226, "grad_norm": 1.5480691194534302, "learning_rate": 9.59493670886076e-06, "loss": 1.307, "step": 380 }, { "epoch": 0.09894710135735126, "grad_norm": 1.4069089889526367, "learning_rate": 9.848101265822785e-06, "loss": 1.2914, "step": 390 }, { "epoch": 0.10148420652036026, "grad_norm": 1.5373787879943848, "learning_rate": 9.999968603457859e-06, "loss": 1.302, "step": 400 }, { "epoch": 0.10402131168336927, "grad_norm": 1.411720871925354, "learning_rate": 9.999615396887012e-06, "loss": 1.2963, "step": 410 }, { "epoch": 0.10655841684637828, "grad_norm": 1.4174768924713135, "learning_rate": 9.998869765883566e-06, "loss": 1.2673, "step": 420 }, { "epoch": 0.10909552200938728, "grad_norm": 1.492872714996338, "learning_rate": 9.997731768972785e-06, "loss": 1.3116, "step": 430 }, { "epoch": 0.11163262717239629, "grad_norm": 1.487380027770996, "learning_rate": 9.996201495477102e-06, "loss": 1.2903, "step": 440 }, { "epoch": 0.1141697323354053, "grad_norm": 1.4736219644546509, "learning_rate": 9.994279065509094e-06, "loss": 1.295, "step": 450 }, { "epoch": 0.1167068374984143, "grad_norm": 1.4441871643066406, "learning_rate": 9.991964629962067e-06, "loss": 1.293, "step": 460 }, { "epoch": 0.11924394266142331, "grad_norm": 1.4621552228927612, "learning_rate": 9.989258370498208e-06, "loss": 1.2746, "step": 470 }, { "epoch": 0.12178104782443232, "grad_norm": 1.4845929145812988, "learning_rate": 9.986160499534318e-06, "loss": 1.3113, "step": 480 }, { "epoch": 0.12431815298744132, "grad_norm": 1.4002912044525146, "learning_rate": 9.982671260225156e-06, "loss": 1.2872, "step": 490 }, { "epoch": 0.12685525815045035, "grad_norm": 1.380839228630066, "learning_rate": 9.97879092644434e-06, "loss": 1.2743, "step": 500 }, { "epoch": 0.12939236331345935, "grad_norm": 1.5403680801391602, "learning_rate": 9.974519802762853e-06, "loss": 1.2862, "step": 510 }, { "epoch": 0.13192946847646836, "grad_norm": 1.4719021320343018, "learning_rate": 9.969858224425138e-06, "loss": 1.2733, "step": 520 }, { "epoch": 0.13446657363947737, "grad_norm": 1.5172195434570312, "learning_rate": 9.96480655732279e-06, "loss": 1.2961, "step": 530 }, { "epoch": 0.13700367880248637, "grad_norm": 1.3394590616226196, "learning_rate": 9.959365197965824e-06, "loss": 1.2742, "step": 540 }, { "epoch": 0.13954078396549538, "grad_norm": 1.4464937448501587, "learning_rate": 9.953534573451568e-06, "loss": 1.268, "step": 550 }, { "epoch": 0.1420778891285044, "grad_norm": 1.4036401510238647, "learning_rate": 9.947315141431126e-06, "loss": 1.2636, "step": 560 }, { "epoch": 0.1446149942915134, "grad_norm": 1.3706105947494507, "learning_rate": 9.940707390073465e-06, "loss": 1.2728, "step": 570 }, { "epoch": 0.1471520994545224, "grad_norm": 1.322920560836792, "learning_rate": 9.933711838027096e-06, "loss": 1.2585, "step": 580 }, { "epoch": 0.1496892046175314, "grad_norm": 1.4396326541900635, "learning_rate": 9.926329034379361e-06, "loss": 1.2752, "step": 590 }, { "epoch": 0.15222630978054041, "grad_norm": 1.389569878578186, "learning_rate": 9.918559558613344e-06, "loss": 1.284, "step": 600 }, { "epoch": 0.15476341494354942, "grad_norm": 1.5533502101898193, "learning_rate": 9.910404020562377e-06, "loss": 1.2732, "step": 610 }, { "epoch": 0.15730052010655843, "grad_norm": 1.3960537910461426, "learning_rate": 9.901863060362176e-06, "loss": 1.2756, "step": 620 }, { "epoch": 0.15983762526956743, "grad_norm": 1.3757154941558838, "learning_rate": 9.8929373484006e-06, "loss": 1.2502, "step": 630 }, { "epoch": 0.16237473043257644, "grad_norm": 1.467374324798584, "learning_rate": 9.883627585265032e-06, "loss": 1.2528, "step": 640 }, { "epoch": 0.16491183559558545, "grad_norm": 1.4819854497909546, "learning_rate": 9.873934501687381e-06, "loss": 1.2602, "step": 650 }, { "epoch": 0.16744894075859446, "grad_norm": 1.3049192428588867, "learning_rate": 9.863858858486736e-06, "loss": 1.2522, "step": 660 }, { "epoch": 0.16998604592160346, "grad_norm": 1.367477536201477, "learning_rate": 9.853401446509641e-06, "loss": 1.2655, "step": 670 }, { "epoch": 0.17252315108461247, "grad_norm": 1.394960880279541, "learning_rate": 9.842563086568024e-06, "loss": 1.273, "step": 680 }, { "epoch": 0.17506025624762148, "grad_norm": 1.375187873840332, "learning_rate": 9.831344629374778e-06, "loss": 1.2805, "step": 690 }, { "epoch": 0.17759736141063048, "grad_norm": 1.4008384943008423, "learning_rate": 9.81974695547697e-06, "loss": 1.2516, "step": 700 }, { "epoch": 0.1801344665736395, "grad_norm": 1.398503065109253, "learning_rate": 9.807770975186743e-06, "loss": 1.2578, "step": 710 }, { "epoch": 0.1826715717366485, "grad_norm": 1.3208105564117432, "learning_rate": 9.795417628509857e-06, "loss": 1.2591, "step": 720 }, { "epoch": 0.1852086768996575, "grad_norm": 1.3660458326339722, "learning_rate": 9.78268788507191e-06, "loss": 1.2741, "step": 730 }, { "epoch": 0.1877457820626665, "grad_norm": 1.3826677799224854, "learning_rate": 9.769582744042224e-06, "loss": 1.2588, "step": 740 }, { "epoch": 0.19028288722567552, "grad_norm": 1.572285771369934, "learning_rate": 9.756103234055432e-06, "loss": 1.2609, "step": 750 }, { "epoch": 0.19281999238868452, "grad_norm": 1.4226349592208862, "learning_rate": 9.742250413130728e-06, "loss": 1.2472, "step": 760 }, { "epoch": 0.19535709755169353, "grad_norm": 1.3697686195373535, "learning_rate": 9.728025368588829e-06, "loss": 1.2492, "step": 770 }, { "epoch": 0.1978942027147025, "grad_norm": 1.4662617444992065, "learning_rate": 9.713429216966624e-06, "loss": 1.2438, "step": 780 }, { "epoch": 0.20043130787771152, "grad_norm": 1.425715446472168, "learning_rate": 9.698463103929542e-06, "loss": 1.2668, "step": 790 }, { "epoch": 0.20296841304072052, "grad_norm": 1.380704402923584, "learning_rate": 9.68312820418163e-06, "loss": 1.2439, "step": 800 }, { "epoch": 0.20550551820372953, "grad_norm": 1.4010827541351318, "learning_rate": 9.667425721373333e-06, "loss": 1.2606, "step": 810 }, { "epoch": 0.20804262336673854, "grad_norm": 1.3978463411331177, "learning_rate": 9.651356888007041e-06, "loss": 1.2584, "step": 820 }, { "epoch": 0.21057972852974755, "grad_norm": 1.4867024421691895, "learning_rate": 9.634922965340334e-06, "loss": 1.243, "step": 830 }, { "epoch": 0.21311683369275655, "grad_norm": 1.3792587518692017, "learning_rate": 9.618125243286989e-06, "loss": 1.2341, "step": 840 }, { "epoch": 0.21565393885576556, "grad_norm": 1.3928625583648682, "learning_rate": 9.60096504031573e-06, "loss": 1.2482, "step": 850 }, { "epoch": 0.21819104401877457, "grad_norm": 1.4658421277999878, "learning_rate": 9.58344370334675e-06, "loss": 1.2737, "step": 860 }, { "epoch": 0.22072814918178357, "grad_norm": 1.35547935962677, "learning_rate": 9.565562607645974e-06, "loss": 1.2433, "step": 870 }, { "epoch": 0.22326525434479258, "grad_norm": 1.3512929677963257, "learning_rate": 9.547323156717133e-06, "loss": 1.2451, "step": 880 }, { "epoch": 0.2258023595078016, "grad_norm": 1.371601939201355, "learning_rate": 9.52872678219158e-06, "loss": 1.2412, "step": 890 }, { "epoch": 0.2283394646708106, "grad_norm": 1.4480026960372925, "learning_rate": 9.50977494371594e-06, "loss": 1.2487, "step": 900 }, { "epoch": 0.2308765698338196, "grad_norm": 1.3161512613296509, "learning_rate": 9.490469128837525e-06, "loss": 1.2355, "step": 910 }, { "epoch": 0.2334136749968286, "grad_norm": 1.4747735261917114, "learning_rate": 9.470810852887586e-06, "loss": 1.2314, "step": 920 }, { "epoch": 0.23595078015983761, "grad_norm": 1.3615195751190186, "learning_rate": 9.450801658862371e-06, "loss": 1.249, "step": 930 }, { "epoch": 0.23848788532284662, "grad_norm": 1.3905844688415527, "learning_rate": 9.430443117302006e-06, "loss": 1.2357, "step": 940 }, { "epoch": 0.24102499048585563, "grad_norm": 1.4403789043426514, "learning_rate": 9.409736826167233e-06, "loss": 1.2482, "step": 950 }, { "epoch": 0.24356209564886463, "grad_norm": 1.5014008283615112, "learning_rate": 9.388684410713977e-06, "loss": 1.2437, "step": 960 }, { "epoch": 0.24609920081187364, "grad_norm": 1.383191466331482, "learning_rate": 9.367287523365782e-06, "loss": 1.237, "step": 970 }, { "epoch": 0.24863630597488265, "grad_norm": 1.4577914476394653, "learning_rate": 9.345547843584108e-06, "loss": 1.2366, "step": 980 }, { "epoch": 0.25117341113789166, "grad_norm": 1.3893483877182007, "learning_rate": 9.323467077736513e-06, "loss": 1.2432, "step": 990 }, { "epoch": 0.2537105163009007, "grad_norm": 1.331141710281372, "learning_rate": 9.301046958962707e-06, "loss": 1.2438, "step": 1000 }, { "epoch": 0.25624762146390967, "grad_norm": 1.5160998106002808, "learning_rate": 9.278289247038537e-06, "loss": 1.2404, "step": 1010 }, { "epoch": 0.2587847266269187, "grad_norm": 1.2425806522369385, "learning_rate": 9.255195728237837e-06, "loss": 1.2361, "step": 1020 }, { "epoch": 0.2613218317899277, "grad_norm": 1.325697660446167, "learning_rate": 9.231768215192243e-06, "loss": 1.2344, "step": 1030 }, { "epoch": 0.2638589369529367, "grad_norm": 1.4259663820266724, "learning_rate": 9.2080085467489e-06, "loss": 1.258, "step": 1040 }, { "epoch": 0.2663960421159457, "grad_norm": 1.4262241125106812, "learning_rate": 9.183918587826142e-06, "loss": 1.2518, "step": 1050 }, { "epoch": 0.26893314727895473, "grad_norm": 1.4553781747817993, "learning_rate": 9.159500229267103e-06, "loss": 1.2387, "step": 1060 }, { "epoch": 0.2714702524419637, "grad_norm": 1.3143069744110107, "learning_rate": 9.134755387691315e-06, "loss": 1.2474, "step": 1070 }, { "epoch": 0.27400735760497275, "grad_norm": 1.3108036518096924, "learning_rate": 9.109686005344258e-06, "loss": 1.2362, "step": 1080 }, { "epoch": 0.2765444627679817, "grad_norm": 1.4027985334396362, "learning_rate": 9.084294049944919e-06, "loss": 1.2344, "step": 1090 }, { "epoch": 0.27908156793099076, "grad_norm": 1.3254168033599854, "learning_rate": 9.05858151453134e-06, "loss": 1.2294, "step": 1100 }, { "epoch": 0.28161867309399974, "grad_norm": 1.3602770566940308, "learning_rate": 9.032550417304189e-06, "loss": 1.2408, "step": 1110 }, { "epoch": 0.2841557782570088, "grad_norm": 1.3525891304016113, "learning_rate": 9.006202801468342e-06, "loss": 1.2436, "step": 1120 }, { "epoch": 0.28669288342001775, "grad_norm": 1.3206712007522583, "learning_rate": 8.979540735072512e-06, "loss": 1.2103, "step": 1130 }, { "epoch": 0.2892299885830268, "grad_norm": 1.3212580680847168, "learning_rate": 8.952566310846931e-06, "loss": 1.2184, "step": 1140 }, { "epoch": 0.29176709374603577, "grad_norm": 1.4156090021133423, "learning_rate": 8.925281646039078e-06, "loss": 1.2323, "step": 1150 }, { "epoch": 0.2943041989090448, "grad_norm": 1.3827259540557861, "learning_rate": 8.897688882247515e-06, "loss": 1.2226, "step": 1160 }, { "epoch": 0.2968413040720538, "grad_norm": 1.375603199005127, "learning_rate": 8.869790185253766e-06, "loss": 1.2241, "step": 1170 }, { "epoch": 0.2993784092350628, "grad_norm": 1.4006825685501099, "learning_rate": 8.841587744852339e-06, "loss": 1.2405, "step": 1180 }, { "epoch": 0.3019155143980718, "grad_norm": 1.3298841714859009, "learning_rate": 8.813083774678841e-06, "loss": 1.2296, "step": 1190 }, { "epoch": 0.30445261956108083, "grad_norm": 1.3826628923416138, "learning_rate": 8.784280512036235e-06, "loss": 1.2272, "step": 1200 }, { "epoch": 0.3069897247240898, "grad_norm": 1.3651957511901855, "learning_rate": 8.755180217719218e-06, "loss": 1.225, "step": 1210 }, { "epoch": 0.30952682988709884, "grad_norm": 1.4205344915390015, "learning_rate": 8.72578517583679e-06, "loss": 1.2351, "step": 1220 }, { "epoch": 0.3120639350501078, "grad_norm": 1.350210189819336, "learning_rate": 8.696097693632944e-06, "loss": 1.2146, "step": 1230 }, { "epoch": 0.31460104021311686, "grad_norm": 1.391353726387024, "learning_rate": 8.666120101305596e-06, "loss": 1.2444, "step": 1240 }, { "epoch": 0.31713814537612584, "grad_norm": 1.4733567237854004, "learning_rate": 8.635854751823666e-06, "loss": 1.2427, "step": 1250 }, { "epoch": 0.31967525053913487, "grad_norm": 1.3797138929367065, "learning_rate": 8.60530402074241e-06, "loss": 1.2236, "step": 1260 }, { "epoch": 0.32221235570214385, "grad_norm": 1.2921204566955566, "learning_rate": 8.574470306016936e-06, "loss": 1.2375, "step": 1270 }, { "epoch": 0.3247494608651529, "grad_norm": 1.3980249166488647, "learning_rate": 8.543356027814009e-06, "loss": 1.2176, "step": 1280 }, { "epoch": 0.32728656602816186, "grad_norm": 1.3124829530715942, "learning_rate": 8.511963628322076e-06, "loss": 1.2289, "step": 1290 }, { "epoch": 0.3298236711911709, "grad_norm": 1.3660489320755005, "learning_rate": 8.480295571559581e-06, "loss": 1.2222, "step": 1300 }, { "epoch": 0.3323607763541799, "grad_norm": 1.3039889335632324, "learning_rate": 8.448354343181568e-06, "loss": 1.23, "step": 1310 }, { "epoch": 0.3348978815171889, "grad_norm": 1.2941962480545044, "learning_rate": 8.416142450284565e-06, "loss": 1.234, "step": 1320 }, { "epoch": 0.3374349866801979, "grad_norm": 1.3144862651824951, "learning_rate": 8.383662421209813e-06, "loss": 1.2291, "step": 1330 }, { "epoch": 0.3399720918432069, "grad_norm": 1.4170352220535278, "learning_rate": 8.350916805344812e-06, "loss": 1.2501, "step": 1340 }, { "epoch": 0.3425091970062159, "grad_norm": 1.2927628755569458, "learning_rate": 8.317908172923207e-06, "loss": 1.2057, "step": 1350 }, { "epoch": 0.34504630216922494, "grad_norm": 1.3344128131866455, "learning_rate": 8.28463911482306e-06, "loss": 1.2244, "step": 1360 }, { "epoch": 0.3475834073322339, "grad_norm": 1.3674781322479248, "learning_rate": 8.251112242363488e-06, "loss": 1.241, "step": 1370 }, { "epoch": 0.35012051249524295, "grad_norm": 1.3904838562011719, "learning_rate": 8.217330187099689e-06, "loss": 1.2063, "step": 1380 }, { "epoch": 0.35265761765825193, "grad_norm": 1.3588337898254395, "learning_rate": 8.183295600616399e-06, "loss": 1.2127, "step": 1390 }, { "epoch": 0.35519472282126097, "grad_norm": 1.4805495738983154, "learning_rate": 8.149011154319763e-06, "loss": 1.224, "step": 1400 }, { "epoch": 0.35773182798426995, "grad_norm": 1.3868420124053955, "learning_rate": 8.114479539227653e-06, "loss": 1.2399, "step": 1410 }, { "epoch": 0.360268933147279, "grad_norm": 1.3122705221176147, "learning_rate": 8.079703465758447e-06, "loss": 1.216, "step": 1420 }, { "epoch": 0.36280603831028796, "grad_norm": 1.3743106126785278, "learning_rate": 8.044685663518289e-06, "loss": 1.2258, "step": 1430 }, { "epoch": 0.365343143473297, "grad_norm": 1.274947166442871, "learning_rate": 8.009428881086836e-06, "loss": 1.2159, "step": 1440 }, { "epoch": 0.367880248636306, "grad_norm": 1.3252506256103516, "learning_rate": 7.97393588580152e-06, "loss": 1.2097, "step": 1450 }, { "epoch": 0.370417353799315, "grad_norm": 1.349714756011963, "learning_rate": 7.93820946354034e-06, "loss": 1.2118, "step": 1460 }, { "epoch": 0.372954458962324, "grad_norm": 1.353014349937439, "learning_rate": 7.902252418503198e-06, "loss": 1.2293, "step": 1470 }, { "epoch": 0.375491564125333, "grad_norm": 1.3702679872512817, "learning_rate": 7.86606757299178e-06, "loss": 1.2096, "step": 1480 }, { "epoch": 0.378028669288342, "grad_norm": 1.4039068222045898, "learning_rate": 7.829657767188052e-06, "loss": 1.2264, "step": 1490 }, { "epoch": 0.38056577445135104, "grad_norm": 1.3884947299957275, "learning_rate": 7.793025858931317e-06, "loss": 1.2283, "step": 1500 }, { "epoch": 0.38310287961436, "grad_norm": 1.306620717048645, "learning_rate": 7.756174723493908e-06, "loss": 1.2325, "step": 1510 }, { "epoch": 0.38563998477736905, "grad_norm": 1.3600883483886719, "learning_rate": 7.719107253355494e-06, "loss": 1.2324, "step": 1520 }, { "epoch": 0.38817708994037803, "grad_norm": 1.2543104887008667, "learning_rate": 7.68182635797606e-06, "loss": 1.1939, "step": 1530 }, { "epoch": 0.39071419510338706, "grad_norm": 1.2916842699050903, "learning_rate": 7.644334963567542e-06, "loss": 1.2105, "step": 1540 }, { "epoch": 0.39325130026639604, "grad_norm": 1.3755340576171875, "learning_rate": 7.606636012864126e-06, "loss": 1.226, "step": 1550 }, { "epoch": 0.395788405429405, "grad_norm": 1.3301304578781128, "learning_rate": 7.568732464891293e-06, "loss": 1.2194, "step": 1560 }, { "epoch": 0.39832551059241406, "grad_norm": 1.4263029098510742, "learning_rate": 7.530627294733549e-06, "loss": 1.2152, "step": 1570 }, { "epoch": 0.40086261575542304, "grad_norm": 1.3354136943817139, "learning_rate": 7.492323493300912e-06, "loss": 1.2028, "step": 1580 }, { "epoch": 0.40339972091843207, "grad_norm": 1.3600828647613525, "learning_rate": 7.453824067094152e-06, "loss": 1.2132, "step": 1590 }, { "epoch": 0.40593682608144105, "grad_norm": 1.3342976570129395, "learning_rate": 7.4151320379688105e-06, "loss": 1.2235, "step": 1600 }, { "epoch": 0.4084739312444501, "grad_norm": 1.3055970668792725, "learning_rate": 7.376250442898006e-06, "loss": 1.2121, "step": 1610 }, { "epoch": 0.41101103640745906, "grad_norm": 1.2475143671035767, "learning_rate": 7.33718233373407e-06, "loss": 1.213, "step": 1620 }, { "epoch": 0.4135481415704681, "grad_norm": 1.3556421995162964, "learning_rate": 7.297930776968989e-06, "loss": 1.2219, "step": 1630 }, { "epoch": 0.4160852467334771, "grad_norm": 1.4067039489746094, "learning_rate": 7.258498853493729e-06, "loss": 1.2248, "step": 1640 }, { "epoch": 0.4186223518964861, "grad_norm": 1.3992860317230225, "learning_rate": 7.2188896583563984e-06, "loss": 1.2041, "step": 1650 }, { "epoch": 0.4211594570594951, "grad_norm": 1.267992615699768, "learning_rate": 7.179106300519329e-06, "loss": 1.232, "step": 1660 }, { "epoch": 0.4236965622225041, "grad_norm": 1.3517013788223267, "learning_rate": 7.13915190261504e-06, "loss": 1.2012, "step": 1670 }, { "epoch": 0.4262336673855131, "grad_norm": 1.3722771406173706, "learning_rate": 7.099029600701144e-06, "loss": 1.2013, "step": 1680 }, { "epoch": 0.42877077254852214, "grad_norm": 1.3306978940963745, "learning_rate": 7.0587425440141955e-06, "loss": 1.2057, "step": 1690 }, { "epoch": 0.4313078777115311, "grad_norm": 1.2793058156967163, "learning_rate": 7.0182938947225025e-06, "loss": 1.2094, "step": 1700 }, { "epoch": 0.43384498287454015, "grad_norm": 1.2533907890319824, "learning_rate": 6.977686827677926e-06, "loss": 1.22, "step": 1710 }, { "epoch": 0.43638208803754913, "grad_norm": 1.3793444633483887, "learning_rate": 6.936924530166682e-06, "loss": 1.2301, "step": 1720 }, { "epoch": 0.43891919320055817, "grad_norm": 1.2866994142532349, "learning_rate": 6.896010201659173e-06, "loss": 1.2108, "step": 1730 }, { "epoch": 0.44145629836356715, "grad_norm": 1.3020964860916138, "learning_rate": 6.854947053558849e-06, "loss": 1.2133, "step": 1740 }, { "epoch": 0.4439934035265762, "grad_norm": 1.266717791557312, "learning_rate": 6.8137383089501526e-06, "loss": 1.2067, "step": 1750 }, { "epoch": 0.44653050868958516, "grad_norm": 1.3352110385894775, "learning_rate": 6.772387202345528e-06, "loss": 1.2128, "step": 1760 }, { "epoch": 0.4490676138525942, "grad_norm": 1.2845940589904785, "learning_rate": 6.730896979431543e-06, "loss": 1.2168, "step": 1770 }, { "epoch": 0.4516047190156032, "grad_norm": 1.3710917234420776, "learning_rate": 6.689270896814139e-06, "loss": 1.2091, "step": 1780 }, { "epoch": 0.4541418241786122, "grad_norm": 1.351404070854187, "learning_rate": 6.647512221763005e-06, "loss": 1.2047, "step": 1790 }, { "epoch": 0.4566789293416212, "grad_norm": 1.3901602029800415, "learning_rate": 6.6056242319551315e-06, "loss": 1.2074, "step": 1800 }, { "epoch": 0.4592160345046302, "grad_norm": 1.3269678354263306, "learning_rate": 6.563610215217551e-06, "loss": 1.2012, "step": 1810 }, { "epoch": 0.4617531396676392, "grad_norm": 1.4395819902420044, "learning_rate": 6.5214734692692594e-06, "loss": 1.2121, "step": 1820 }, { "epoch": 0.46429024483064824, "grad_norm": 1.3422843217849731, "learning_rate": 6.479217301462386e-06, "loss": 1.2072, "step": 1830 }, { "epoch": 0.4668273499936572, "grad_norm": 1.389143466949463, "learning_rate": 6.43684502852259e-06, "loss": 1.2005, "step": 1840 }, { "epoch": 0.46936445515666625, "grad_norm": 1.2481592893600464, "learning_rate": 6.394359976288729e-06, "loss": 1.2026, "step": 1850 }, { "epoch": 0.47190156031967523, "grad_norm": 1.2201452255249023, "learning_rate": 6.3517654794518156e-06, "loss": 1.2086, "step": 1860 }, { "epoch": 0.47443866548268426, "grad_norm": 1.2975594997406006, "learning_rate": 6.309064881293265e-06, "loss": 1.2118, "step": 1870 }, { "epoch": 0.47697577064569324, "grad_norm": 1.3062618970870972, "learning_rate": 6.266261533422487e-06, "loss": 1.2117, "step": 1880 }, { "epoch": 0.4795128758087023, "grad_norm": 1.356292963027954, "learning_rate": 6.223358795513812e-06, "loss": 1.2045, "step": 1890 }, { "epoch": 0.48204998097171126, "grad_norm": 1.259065866470337, "learning_rate": 6.18036003504278e-06, "loss": 1.1995, "step": 1900 }, { "epoch": 0.4845870861347203, "grad_norm": 1.3219462633132935, "learning_rate": 6.1372686270218385e-06, "loss": 1.1936, "step": 1910 }, { "epoch": 0.48712419129772927, "grad_norm": 1.2755314111709595, "learning_rate": 6.094087953735423e-06, "loss": 1.2122, "step": 1920 }, { "epoch": 0.4896612964607383, "grad_norm": 1.2812877893447876, "learning_rate": 6.050821404474483e-06, "loss": 1.1939, "step": 1930 }, { "epoch": 0.4921984016237473, "grad_norm": 1.308876633644104, "learning_rate": 6.00747237527045e-06, "loss": 1.2163, "step": 1940 }, { "epoch": 0.4947355067867563, "grad_norm": 1.3260457515716553, "learning_rate": 5.964044268628688e-06, "loss": 1.2022, "step": 1950 }, { "epoch": 0.4972726119497653, "grad_norm": 1.3721497058868408, "learning_rate": 5.920540493261415e-06, "loss": 1.2128, "step": 1960 }, { "epoch": 0.49980971711277433, "grad_norm": 1.4000312089920044, "learning_rate": 5.8769644638201635e-06, "loss": 1.2014, "step": 1970 }, { "epoch": 0.5023468222757833, "grad_norm": 1.31642746925354, "learning_rate": 5.8333196006277536e-06, "loss": 1.1962, "step": 1980 }, { "epoch": 0.5048839274387923, "grad_norm": 1.224902629852295, "learning_rate": 5.789609329409826e-06, "loss": 1.2015, "step": 1990 }, { "epoch": 0.5074210326018014, "grad_norm": 1.3817135095596313, "learning_rate": 5.7458370810259635e-06, "loss": 1.1935, "step": 2000 }, { "epoch": 0.5099581377648104, "grad_norm": 1.268218755722046, "learning_rate": 5.702006291200389e-06, "loss": 1.1894, "step": 2010 }, { "epoch": 0.5124952429278193, "grad_norm": 1.4027165174484253, "learning_rate": 5.6581204002523e-06, "loss": 1.1883, "step": 2020 }, { "epoch": 0.5150323480908283, "grad_norm": 1.2554658651351929, "learning_rate": 5.614182852825835e-06, "loss": 1.1995, "step": 2030 }, { "epoch": 0.5175694532538374, "grad_norm": 1.4268817901611328, "learning_rate": 5.570197097619688e-06, "loss": 1.2145, "step": 2040 }, { "epoch": 0.5201065584168464, "grad_norm": 1.2476046085357666, "learning_rate": 5.526166587116436e-06, "loss": 1.1952, "step": 2050 }, { "epoch": 0.5226436635798554, "grad_norm": 1.3912453651428223, "learning_rate": 5.4820947773115374e-06, "loss": 1.2126, "step": 2060 }, { "epoch": 0.5251807687428643, "grad_norm": 1.3362759351730347, "learning_rate": 5.437985127442065e-06, "loss": 1.1981, "step": 2070 }, { "epoch": 0.5277178739058734, "grad_norm": 1.243674874305725, "learning_rate": 5.393841099715205e-06, "loss": 1.1944, "step": 2080 }, { "epoch": 0.5302549790688824, "grad_norm": 1.3977802991867065, "learning_rate": 5.349666159036482e-06, "loss": 1.1924, "step": 2090 }, { "epoch": 0.5327920842318914, "grad_norm": 1.330768346786499, "learning_rate": 5.305463772737812e-06, "loss": 1.1907, "step": 2100 }, { "epoch": 0.5353291893949004, "grad_norm": 1.3184082508087158, "learning_rate": 5.261237410305344e-06, "loss": 1.1979, "step": 2110 }, { "epoch": 0.5378662945579095, "grad_norm": 1.3445682525634766, "learning_rate": 5.2169905431071356e-06, "loss": 1.2007, "step": 2120 }, { "epoch": 0.5404033997209184, "grad_norm": 1.2852333784103394, "learning_rate": 5.172726644120678e-06, "loss": 1.187, "step": 2130 }, { "epoch": 0.5429405048839274, "grad_norm": 1.33376944065094, "learning_rate": 5.128449187660309e-06, "loss": 1.1913, "step": 2140 }, { "epoch": 0.5454776100469364, "grad_norm": 1.3172907829284668, "learning_rate": 5.084161649104502e-06, "loss": 1.1996, "step": 2150 }, { "epoch": 0.5480147152099455, "grad_norm": 1.3981866836547852, "learning_rate": 5.039867504623084e-06, "loss": 1.1792, "step": 2160 }, { "epoch": 0.5505518203729545, "grad_norm": 1.2657582759857178, "learning_rate": 4.995570230904386e-06, "loss": 1.1744, "step": 2170 }, { "epoch": 0.5530889255359634, "grad_norm": 1.2834333181381226, "learning_rate": 4.951273304882358e-06, "loss": 1.1934, "step": 2180 }, { "epoch": 0.5556260306989724, "grad_norm": 1.2566810846328735, "learning_rate": 4.906980203463659e-06, "loss": 1.1934, "step": 2190 }, { "epoch": 0.5581631358619815, "grad_norm": 1.310981035232544, "learning_rate": 4.862694403254747e-06, "loss": 1.1952, "step": 2200 }, { "epoch": 0.5607002410249905, "grad_norm": 1.2907445430755615, "learning_rate": 4.818419380289009e-06, "loss": 1.213, "step": 2210 }, { "epoch": 0.5632373461879995, "grad_norm": 1.3687422275543213, "learning_rate": 4.774158609753908e-06, "loss": 1.1969, "step": 2220 }, { "epoch": 0.5657744513510085, "grad_norm": 1.2513952255249023, "learning_rate": 4.729915565718223e-06, "loss": 1.1855, "step": 2230 }, { "epoch": 0.5683115565140175, "grad_norm": 1.3049182891845703, "learning_rate": 4.685693720859369e-06, "loss": 1.1888, "step": 2240 }, { "epoch": 0.5708486616770265, "grad_norm": 1.2568814754486084, "learning_rate": 4.641496546190813e-06, "loss": 1.1858, "step": 2250 }, { "epoch": 0.5733857668400355, "grad_norm": 1.2858439683914185, "learning_rate": 4.597327510789635e-06, "loss": 1.18, "step": 2260 }, { "epoch": 0.5759228720030445, "grad_norm": 1.3341573476791382, "learning_rate": 4.553190081524242e-06, "loss": 1.1904, "step": 2270 }, { "epoch": 0.5784599771660536, "grad_norm": 1.2678639888763428, "learning_rate": 4.5090877227822424e-06, "loss": 1.1908, "step": 2280 }, { "epoch": 0.5809970823290626, "grad_norm": 1.3060572147369385, "learning_rate": 4.46502389619853e-06, "loss": 1.202, "step": 2290 }, { "epoch": 0.5835341874920715, "grad_norm": 1.2779840230941772, "learning_rate": 4.421002060383569e-06, "loss": 1.1926, "step": 2300 }, { "epoch": 0.5860712926550805, "grad_norm": 1.3469980955123901, "learning_rate": 4.3770256706519375e-06, "loss": 1.1777, "step": 2310 }, { "epoch": 0.5886083978180896, "grad_norm": 1.2928153276443481, "learning_rate": 4.3330981787511006e-06, "loss": 1.1779, "step": 2320 }, { "epoch": 0.5911455029810986, "grad_norm": 1.424657940864563, "learning_rate": 4.289223032590491e-06, "loss": 1.2134, "step": 2330 }, { "epoch": 0.5936826081441076, "grad_norm": 1.3630969524383545, "learning_rate": 4.245403675970877e-06, "loss": 1.1821, "step": 2340 }, { "epoch": 0.5962197133071165, "grad_norm": 1.3015477657318115, "learning_rate": 4.201643548314051e-06, "loss": 1.1874, "step": 2350 }, { "epoch": 0.5987568184701256, "grad_norm": 1.2937594652175903, "learning_rate": 4.157946084392871e-06, "loss": 1.2015, "step": 2360 }, { "epoch": 0.6012939236331346, "grad_norm": 1.1649646759033203, "learning_rate": 4.114314714061659e-06, "loss": 1.1787, "step": 2370 }, { "epoch": 0.6038310287961436, "grad_norm": 1.2585970163345337, "learning_rate": 4.0707528619869976e-06, "loss": 1.1739, "step": 2380 }, { "epoch": 0.6063681339591526, "grad_norm": 1.3575518131256104, "learning_rate": 4.027263947378907e-06, "loss": 1.1744, "step": 2390 }, { "epoch": 0.6089052391221617, "grad_norm": 1.3373316526412964, "learning_rate": 3.9838513837224814e-06, "loss": 1.189, "step": 2400 }, { "epoch": 0.6114423442851706, "grad_norm": 1.2326818704605103, "learning_rate": 3.940518578509963e-06, "loss": 1.1842, "step": 2410 }, { "epoch": 0.6139794494481796, "grad_norm": 1.34890878200531, "learning_rate": 3.8972689329732725e-06, "loss": 1.1954, "step": 2420 }, { "epoch": 0.6165165546111886, "grad_norm": 1.2541084289550781, "learning_rate": 3.854105841817056e-06, "loss": 1.1771, "step": 2430 }, { "epoch": 0.6190536597741977, "grad_norm": 1.254408836364746, "learning_rate": 3.811032692952227e-06, "loss": 1.1655, "step": 2440 }, { "epoch": 0.6215907649372067, "grad_norm": 1.2309529781341553, "learning_rate": 3.7680528672300404e-06, "loss": 1.1909, "step": 2450 }, { "epoch": 0.6241278701002156, "grad_norm": 1.3115286827087402, "learning_rate": 3.7251697381767373e-06, "loss": 1.192, "step": 2460 }, { "epoch": 0.6266649752632246, "grad_norm": 1.23496413230896, "learning_rate": 3.6823866717287437e-06, "loss": 1.1905, "step": 2470 }, { "epoch": 0.6292020804262337, "grad_norm": 1.3248686790466309, "learning_rate": 3.6397070259684793e-06, "loss": 1.1864, "step": 2480 }, { "epoch": 0.6317391855892427, "grad_norm": 1.256982445716858, "learning_rate": 3.5971341508607814e-06, "loss": 1.169, "step": 2490 }, { "epoch": 0.6342762907522517, "grad_norm": 1.30142343044281, "learning_rate": 3.5546713879899563e-06, "loss": 1.1699, "step": 2500 }, { "epoch": 0.6368133959152606, "grad_norm": 1.3655248880386353, "learning_rate": 3.512322070297503e-06, "loss": 1.1719, "step": 2510 }, { "epoch": 0.6393505010782697, "grad_norm": 1.288802146911621, "learning_rate": 3.4700895218205026e-06, "loss": 1.1869, "step": 2520 }, { "epoch": 0.6418876062412787, "grad_norm": 1.245276689529419, "learning_rate": 3.4279770574307096e-06, "loss": 1.1882, "step": 2530 }, { "epoch": 0.6444247114042877, "grad_norm": 1.222338080406189, "learning_rate": 3.385987982574372e-06, "loss": 1.1746, "step": 2540 }, { "epoch": 0.6469618165672967, "grad_norm": 1.2364776134490967, "learning_rate": 3.3441255930127752e-06, "loss": 1.1912, "step": 2550 }, { "epoch": 0.6494989217303058, "grad_norm": 1.270403504371643, "learning_rate": 3.3023931745635606e-06, "loss": 1.1805, "step": 2560 }, { "epoch": 0.6520360268933147, "grad_norm": 1.3970060348510742, "learning_rate": 3.2607940028428154e-06, "loss": 1.1913, "step": 2570 }, { "epoch": 0.6545731320563237, "grad_norm": 1.3008099794387817, "learning_rate": 3.2193313430079737e-06, "loss": 1.1978, "step": 2580 }, { "epoch": 0.6571102372193327, "grad_norm": 1.3095245361328125, "learning_rate": 3.178008449501517e-06, "loss": 1.1744, "step": 2590 }, { "epoch": 0.6596473423823418, "grad_norm": 1.1827950477600098, "learning_rate": 3.1368285657955464e-06, "loss": 1.1779, "step": 2600 }, { "epoch": 0.6621844475453508, "grad_norm": 1.2912861108779907, "learning_rate": 3.0957949241371845e-06, "loss": 1.197, "step": 2610 }, { "epoch": 0.6647215527083598, "grad_norm": 1.2797883749008179, "learning_rate": 3.0549107452948866e-06, "loss": 1.1945, "step": 2620 }, { "epoch": 0.6672586578713687, "grad_norm": 1.3769925832748413, "learning_rate": 3.014179238305629e-06, "loss": 1.1819, "step": 2630 }, { "epoch": 0.6697957630343778, "grad_norm": 1.204916000366211, "learning_rate": 2.9736036002230332e-06, "loss": 1.1646, "step": 2640 }, { "epoch": 0.6723328681973868, "grad_norm": 1.216291904449463, "learning_rate": 2.933187015866431e-06, "loss": 1.1929, "step": 2650 }, { "epoch": 0.6748699733603958, "grad_norm": 1.273775339126587, "learning_rate": 2.892932657570878e-06, "loss": 1.1775, "step": 2660 }, { "epoch": 0.6774070785234048, "grad_norm": 1.2574785947799683, "learning_rate": 2.8528436849381518e-06, "loss": 1.2057, "step": 2670 }, { "epoch": 0.6799441836864138, "grad_norm": 1.2418912649154663, "learning_rate": 2.8129232445887623e-06, "loss": 1.1858, "step": 2680 }, { "epoch": 0.6824812888494228, "grad_norm": 1.3044190406799316, "learning_rate": 2.773174469914964e-06, "loss": 1.1867, "step": 2690 }, { "epoch": 0.6850183940124318, "grad_norm": 1.3013222217559814, "learning_rate": 2.7336004808348094e-06, "loss": 1.1737, "step": 2700 }, { "epoch": 0.6875554991754408, "grad_norm": 1.2504340410232544, "learning_rate": 2.6942043835472725e-06, "loss": 1.1827, "step": 2710 }, { "epoch": 0.6900926043384499, "grad_norm": 1.292549729347229, "learning_rate": 2.654989270288435e-06, "loss": 1.1844, "step": 2720 }, { "epoch": 0.6926297095014589, "grad_norm": 1.2378226518630981, "learning_rate": 2.615958219088776e-06, "loss": 1.1827, "step": 2730 }, { "epoch": 0.6951668146644678, "grad_norm": 1.1776105165481567, "learning_rate": 2.577114293531571e-06, "loss": 1.1764, "step": 2740 }, { "epoch": 0.6977039198274768, "grad_norm": 1.2368700504302979, "learning_rate": 2.538460542512435e-06, "loss": 1.1788, "step": 2750 }, { "epoch": 0.7002410249904859, "grad_norm": 1.3077671527862549, "learning_rate": 2.5000000000000015e-06, "loss": 1.1786, "step": 2760 }, { "epoch": 0.7027781301534949, "grad_norm": 1.2264658212661743, "learning_rate": 2.461735684797794e-06, "loss": 1.1891, "step": 2770 }, { "epoch": 0.7053152353165039, "grad_norm": 1.411934494972229, "learning_rate": 2.4236706003072733e-06, "loss": 1.2021, "step": 2780 }, { "epoch": 0.7078523404795128, "grad_norm": 1.3481911420822144, "learning_rate": 2.385807734292097e-06, "loss": 1.1687, "step": 2790 }, { "epoch": 0.7103894456425219, "grad_norm": 1.2622945308685303, "learning_rate": 2.3481500586436067e-06, "loss": 1.1906, "step": 2800 }, { "epoch": 0.7129265508055309, "grad_norm": 1.3974981307983398, "learning_rate": 2.3107005291475653e-06, "loss": 1.1894, "step": 2810 }, { "epoch": 0.7154636559685399, "grad_norm": 1.2239971160888672, "learning_rate": 2.273462085252146e-06, "loss": 1.1596, "step": 2820 }, { "epoch": 0.7180007611315489, "grad_norm": 1.3323861360549927, "learning_rate": 2.236437649837223e-06, "loss": 1.2045, "step": 2830 }, { "epoch": 0.720537866294558, "grad_norm": 1.2548524141311646, "learning_rate": 2.1996301289849474e-06, "loss": 1.1791, "step": 2840 }, { "epoch": 0.7230749714575669, "grad_norm": 1.3251192569732666, "learning_rate": 2.1630424117516436e-06, "loss": 1.174, "step": 2850 }, { "epoch": 0.7256120766205759, "grad_norm": 1.2302052974700928, "learning_rate": 2.126677369941047e-06, "loss": 1.1498, "step": 2860 }, { "epoch": 0.7281491817835849, "grad_norm": 1.2613223791122437, "learning_rate": 2.0905378578788947e-06, "loss": 1.1799, "step": 2870 }, { "epoch": 0.730686286946594, "grad_norm": 1.2105897665023804, "learning_rate": 2.0546267121888863e-06, "loss": 1.169, "step": 2880 }, { "epoch": 0.733223392109603, "grad_norm": 1.3086020946502686, "learning_rate": 2.0189467515700283e-06, "loss": 1.166, "step": 2890 }, { "epoch": 0.735760497272612, "grad_norm": 1.2625070810317993, "learning_rate": 1.9835007765754035e-06, "loss": 1.1944, "step": 2900 }, { "epoch": 0.7382976024356209, "grad_norm": 1.3192371129989624, "learning_rate": 1.9482915693923442e-06, "loss": 1.1712, "step": 2910 }, { "epoch": 0.74083470759863, "grad_norm": 1.2234618663787842, "learning_rate": 1.913321893624059e-06, "loss": 1.1753, "step": 2920 }, { "epoch": 0.743371812761639, "grad_norm": 1.3323945999145508, "learning_rate": 1.878594494072713e-06, "loss": 1.1681, "step": 2930 }, { "epoch": 0.745908917924648, "grad_norm": 1.2558120489120483, "learning_rate": 1.8441120965239912e-06, "loss": 1.1796, "step": 2940 }, { "epoch": 0.748446023087657, "grad_norm": 1.2716906070709229, "learning_rate": 1.8098774075331383e-06, "loss": 1.1894, "step": 2950 }, { "epoch": 0.750983128250666, "grad_norm": 1.2781611680984497, "learning_rate": 1.7758931142125308e-06, "loss": 1.1855, "step": 2960 }, { "epoch": 0.753520233413675, "grad_norm": 1.3427870273590088, "learning_rate": 1.7421618840207576e-06, "loss": 1.183, "step": 2970 }, { "epoch": 0.756057338576684, "grad_norm": 1.2347464561462402, "learning_rate": 1.7086863645532425e-06, "loss": 1.1615, "step": 2980 }, { "epoch": 0.758594443739693, "grad_norm": 1.2659286260604858, "learning_rate": 1.6754691833344472e-06, "loss": 1.1926, "step": 2990 }, { "epoch": 0.7611315489027021, "grad_norm": 1.2952594757080078, "learning_rate": 1.642512947611622e-06, "loss": 1.1988, "step": 3000 }, { "epoch": 0.763668654065711, "grad_norm": 1.2655588388442993, "learning_rate": 1.6098202441501599e-06, "loss": 1.1691, "step": 3010 }, { "epoch": 0.76620575922872, "grad_norm": 1.2471306324005127, "learning_rate": 1.5773936390305678e-06, "loss": 1.1572, "step": 3020 }, { "epoch": 0.768742864391729, "grad_norm": 1.2647178173065186, "learning_rate": 1.5452356774470468e-06, "loss": 1.1733, "step": 3030 }, { "epoch": 0.7712799695547381, "grad_norm": 1.2472219467163086, "learning_rate": 1.5133488835077204e-06, "loss": 1.1772, "step": 3040 }, { "epoch": 0.7738170747177471, "grad_norm": 1.2521233558654785, "learning_rate": 1.4817357600365061e-06, "loss": 1.172, "step": 3050 }, { "epoch": 0.7763541798807561, "grad_norm": 1.3133138418197632, "learning_rate": 1.4503987883766857e-06, "loss": 1.1784, "step": 3060 }, { "epoch": 0.778891285043765, "grad_norm": 1.2727947235107422, "learning_rate": 1.4193404281961172e-06, "loss": 1.1817, "step": 3070 }, { "epoch": 0.7814283902067741, "grad_norm": 1.2805311679840088, "learning_rate": 1.3885631172941932e-06, "loss": 1.1841, "step": 3080 }, { "epoch": 0.7839654953697831, "grad_norm": 1.2885172367095947, "learning_rate": 1.3580692714104887e-06, "loss": 1.162, "step": 3090 }, { "epoch": 0.7865026005327921, "grad_norm": 1.3356949090957642, "learning_rate": 1.3278612840351468e-06, "loss": 1.1879, "step": 3100 }, { "epoch": 0.7890397056958011, "grad_norm": 1.2000919580459595, "learning_rate": 1.2979415262210089e-06, "loss": 1.1772, "step": 3110 }, { "epoch": 0.79157681085881, "grad_norm": 1.2145313024520874, "learning_rate": 1.2683123463975144e-06, "loss": 1.1662, "step": 3120 }, { "epoch": 0.7941139160218191, "grad_norm": 1.2365864515304565, "learning_rate": 1.2389760701863717e-06, "loss": 1.1916, "step": 3130 }, { "epoch": 0.7966510211848281, "grad_norm": 1.2178484201431274, "learning_rate": 1.2099350002190063e-06, "loss": 1.1686, "step": 3140 }, { "epoch": 0.7991881263478371, "grad_norm": 1.2906898260116577, "learning_rate": 1.1811914159558374e-06, "loss": 1.1979, "step": 3150 }, { "epoch": 0.8017252315108461, "grad_norm": 1.2861703634262085, "learning_rate": 1.1527475735073574e-06, "loss": 1.1937, "step": 3160 }, { "epoch": 0.8042623366738552, "grad_norm": 1.2466925382614136, "learning_rate": 1.1246057054570414e-06, "loss": 1.1632, "step": 3170 }, { "epoch": 0.8067994418368641, "grad_norm": 1.1871761083602905, "learning_rate": 1.0967680206861198e-06, "loss": 1.1587, "step": 3180 }, { "epoch": 0.8093365469998731, "grad_norm": 1.2665520906448364, "learning_rate": 1.069236704200195e-06, "loss": 1.1679, "step": 3190 }, { "epoch": 0.8118736521628821, "grad_norm": 1.197919487953186, "learning_rate": 1.0420139169577393e-06, "loss": 1.1652, "step": 3200 }, { "epoch": 0.8144107573258912, "grad_norm": 1.2373968362808228, "learning_rate": 1.01510179570048e-06, "loss": 1.178, "step": 3210 }, { "epoch": 0.8169478624889002, "grad_norm": 1.2911186218261719, "learning_rate": 9.88502452785685e-07, "loss": 1.1735, "step": 3220 }, { "epoch": 0.8194849676519091, "grad_norm": 1.2003546953201294, "learning_rate": 9.62217976020357e-07, "loss": 1.1836, "step": 3230 }, { "epoch": 0.8220220728149181, "grad_norm": 1.3649158477783203, "learning_rate": 9.362504284973683e-07, "loss": 1.1651, "step": 3240 }, { "epoch": 0.8245591779779272, "grad_norm": 1.3181599378585815, "learning_rate": 9.1060184843352e-07, "loss": 1.1735, "step": 3250 }, { "epoch": 0.8270962831409362, "grad_norm": 1.2371008396148682, "learning_rate": 8.852742490095628e-07, "loss": 1.1629, "step": 3260 }, { "epoch": 0.8296333883039452, "grad_norm": 1.181353211402893, "learning_rate": 8.602696182121812e-07, "loss": 1.1722, "step": 3270 }, { "epoch": 0.8321704934669542, "grad_norm": 1.314288854598999, "learning_rate": 8.35589918677952e-07, "loss": 1.1713, "step": 3280 }, { "epoch": 0.8347075986299632, "grad_norm": 1.2208564281463623, "learning_rate": 8.112370875393e-07, "loss": 1.1801, "step": 3290 }, { "epoch": 0.8372447037929722, "grad_norm": 1.2573906183242798, "learning_rate": 7.872130362724422e-07, "loss": 1.154, "step": 3300 }, { "epoch": 0.8397818089559812, "grad_norm": 1.286879301071167, "learning_rate": 7.635196505473652e-07, "loss": 1.1759, "step": 3310 }, { "epoch": 0.8423189141189902, "grad_norm": 1.2773866653442383, "learning_rate": 7.401587900798091e-07, "loss": 1.1746, "step": 3320 }, { "epoch": 0.8448560192819993, "grad_norm": 1.3070263862609863, "learning_rate": 7.171322884852988e-07, "loss": 1.1866, "step": 3330 }, { "epoch": 0.8473931244450083, "grad_norm": 1.2348638772964478, "learning_rate": 6.944419531352236e-07, "loss": 1.1816, "step": 3340 }, { "epoch": 0.8499302296080172, "grad_norm": 1.1883289813995361, "learning_rate": 6.720895650149744e-07, "loss": 1.1795, "step": 3350 }, { "epoch": 0.8524673347710262, "grad_norm": 1.245474934577942, "learning_rate": 6.500768785841482e-07, "loss": 1.1733, "step": 3360 }, { "epoch": 0.8550044399340353, "grad_norm": 1.277593731880188, "learning_rate": 6.284056216388451e-07, "loss": 1.1731, "step": 3370 }, { "epoch": 0.8575415450970443, "grad_norm": 1.2596774101257324, "learning_rate": 6.070774951760505e-07, "loss": 1.171, "step": 3380 }, { "epoch": 0.8600786502600533, "grad_norm": 1.266883134841919, "learning_rate": 5.860941732601166e-07, "loss": 1.1668, "step": 3390 }, { "epoch": 0.8626157554230622, "grad_norm": 1.2608847618103027, "learning_rate": 5.654573028913735e-07, "loss": 1.1704, "step": 3400 }, { "epoch": 0.8651528605860713, "grad_norm": 1.311318278312683, "learning_rate": 5.451685038768473e-07, "loss": 1.1899, "step": 3410 }, { "epoch": 0.8676899657490803, "grad_norm": 1.2083625793457031, "learning_rate": 5.252293687031196e-07, "loss": 1.1636, "step": 3420 }, { "epoch": 0.8702270709120893, "grad_norm": 1.256255865097046, "learning_rate": 5.05641462411336e-07, "loss": 1.1651, "step": 3430 }, { "epoch": 0.8727641760750983, "grad_norm": 1.261788010597229, "learning_rate": 4.864063224743626e-07, "loss": 1.1677, "step": 3440 }, { "epoch": 0.8753012812381074, "grad_norm": 1.1607638597488403, "learning_rate": 4.6752545867610963e-07, "loss": 1.1722, "step": 3450 }, { "epoch": 0.8778383864011163, "grad_norm": 1.1922293901443481, "learning_rate": 4.4900035299302036e-07, "loss": 1.1675, "step": 3460 }, { "epoch": 0.8803754915641253, "grad_norm": 1.2432760000228882, "learning_rate": 4.308324594777635e-07, "loss": 1.1689, "step": 3470 }, { "epoch": 0.8829125967271343, "grad_norm": 1.304062008857727, "learning_rate": 4.130232041450866e-07, "loss": 1.1684, "step": 3480 }, { "epoch": 0.8854497018901434, "grad_norm": 1.1947314739227295, "learning_rate": 3.9557398485989884e-07, "loss": 1.1652, "step": 3490 }, { "epoch": 0.8879868070531524, "grad_norm": 1.2069730758666992, "learning_rate": 3.784861712275467e-07, "loss": 1.1608, "step": 3500 }, { "epoch": 0.8905239122161613, "grad_norm": 1.2010389566421509, "learning_rate": 3.61761104486314e-07, "loss": 1.1731, "step": 3510 }, { "epoch": 0.8930610173791703, "grad_norm": 1.2449175119400024, "learning_rate": 3.454000974021432e-07, "loss": 1.1829, "step": 3520 }, { "epoch": 0.8955981225421794, "grad_norm": 1.244471788406372, "learning_rate": 3.294044341655983e-07, "loss": 1.1629, "step": 3530 }, { "epoch": 0.8981352277051884, "grad_norm": 1.2766717672348022, "learning_rate": 3.1377537029107174e-07, "loss": 1.1567, "step": 3540 }, { "epoch": 0.9006723328681974, "grad_norm": 1.2156423330307007, "learning_rate": 2.985141325182267e-07, "loss": 1.177, "step": 3550 }, { "epoch": 0.9032094380312063, "grad_norm": 1.2810441255569458, "learning_rate": 2.836219187157202e-07, "loss": 1.1757, "step": 3560 }, { "epoch": 0.9057465431942154, "grad_norm": 1.312305212020874, "learning_rate": 2.69099897787175e-07, "loss": 1.1654, "step": 3570 }, { "epoch": 0.9082836483572244, "grad_norm": 1.1842001676559448, "learning_rate": 2.5494920957943314e-07, "loss": 1.1597, "step": 3580 }, { "epoch": 0.9108207535202334, "grad_norm": 1.239871859550476, "learning_rate": 2.411709647930882e-07, "loss": 1.1762, "step": 3590 }, { "epoch": 0.9133578586832424, "grad_norm": 1.197737216949463, "learning_rate": 2.2776624489530664e-07, "loss": 1.1699, "step": 3600 }, { "epoch": 0.9158949638462515, "grad_norm": 1.2735172510147095, "learning_rate": 2.1473610203494032e-07, "loss": 1.1742, "step": 3610 }, { "epoch": 0.9184320690092604, "grad_norm": 1.2625423669815063, "learning_rate": 2.0208155895994343e-07, "loss": 1.1609, "step": 3620 }, { "epoch": 0.9209691741722694, "grad_norm": 1.2696080207824707, "learning_rate": 1.8980360893709582e-07, "loss": 1.1742, "step": 3630 }, { "epoch": 0.9235062793352784, "grad_norm": 1.2440428733825684, "learning_rate": 1.7790321567404011e-07, "loss": 1.1747, "step": 3640 }, { "epoch": 0.9260433844982875, "grad_norm": 1.2741636037826538, "learning_rate": 1.6638131324364094e-07, "loss": 1.171, "step": 3650 }, { "epoch": 0.9285804896612965, "grad_norm": 1.2880579233169556, "learning_rate": 1.55238806010668e-07, "loss": 1.1443, "step": 3660 }, { "epoch": 0.9311175948243055, "grad_norm": 1.297038197517395, "learning_rate": 1.444765685608096e-07, "loss": 1.1733, "step": 3670 }, { "epoch": 0.9336546999873144, "grad_norm": 1.208609938621521, "learning_rate": 1.340954456320287e-07, "loss": 1.1741, "step": 3680 }, { "epoch": 0.9361918051503235, "grad_norm": 1.2683435678482056, "learning_rate": 1.2409625204825802e-07, "loss": 1.174, "step": 3690 }, { "epoch": 0.9387289103133325, "grad_norm": 1.1996833086013794, "learning_rate": 1.1447977265544141e-07, "loss": 1.1777, "step": 3700 }, { "epoch": 0.9412660154763415, "grad_norm": 1.2534565925598145, "learning_rate": 1.052467622599329e-07, "loss": 1.155, "step": 3710 }, { "epoch": 0.9438031206393505, "grad_norm": 1.2183892726898193, "learning_rate": 9.639794556925041e-08, "loss": 1.1655, "step": 3720 }, { "epoch": 0.9463402258023595, "grad_norm": 1.2513470649719238, "learning_rate": 8.793401713519333e-08, "loss": 1.1727, "step": 3730 }, { "epoch": 0.9488773309653685, "grad_norm": 1.3019185066223145, "learning_rate": 7.985564129932566e-08, "loss": 1.175, "step": 3740 }, { "epoch": 0.9514144361283775, "grad_norm": 1.2735040187835693, "learning_rate": 7.216345214083264e-08, "loss": 1.1796, "step": 3750 }, { "epoch": 0.9539515412913865, "grad_norm": 1.2108937501907349, "learning_rate": 6.485805342674901e-08, "loss": 1.1478, "step": 3760 }, { "epoch": 0.9564886464543956, "grad_norm": 1.3137885332107544, "learning_rate": 5.7940018564570654e-08, "loss": 1.1777, "step": 3770 }, { "epoch": 0.9590257516174046, "grad_norm": 1.2065484523773193, "learning_rate": 5.1409890557246876e-08, "loss": 1.1749, "step": 3780 }, { "epoch": 0.9615628567804135, "grad_norm": 1.3035470247268677, "learning_rate": 4.526818196055938e-08, "loss": 1.1795, "step": 3790 }, { "epoch": 0.9640999619434225, "grad_norm": 1.2624789476394653, "learning_rate": 3.951537484289114e-08, "loss": 1.1865, "step": 3800 }, { "epoch": 0.9666370671064316, "grad_norm": 1.2832140922546387, "learning_rate": 3.4151920747390044e-08, "loss": 1.1623, "step": 3810 }, { "epoch": 0.9691741722694406, "grad_norm": 1.206804633140564, "learning_rate": 2.9178240656523305e-08, "loss": 1.1698, "step": 3820 }, { "epoch": 0.9717112774324496, "grad_norm": 1.237695574760437, "learning_rate": 2.4594724959037253e-08, "loss": 1.1826, "step": 3830 }, { "epoch": 0.9742483825954585, "grad_norm": 1.2522958517074585, "learning_rate": 2.0401733419315727e-08, "loss": 1.1679, "step": 3840 }, { "epoch": 0.9767854877584676, "grad_norm": 1.209662914276123, "learning_rate": 1.659959514913767e-08, "loss": 1.1842, "step": 3850 }, { "epoch": 0.9793225929214766, "grad_norm": 1.3333046436309814, "learning_rate": 1.3188608581851114e-08, "loss": 1.1629, "step": 3860 }, { "epoch": 0.9818596980844856, "grad_norm": 1.2529042959213257, "learning_rate": 1.016904144894304e-08, "loss": 1.1779, "step": 3870 }, { "epoch": 0.9843968032474946, "grad_norm": 1.2353804111480713, "learning_rate": 7.541130759027848e-09, "loss": 1.1728, "step": 3880 }, { "epoch": 0.9869339084105037, "grad_norm": 1.1999486684799194, "learning_rate": 5.305082779244464e-09, "loss": 1.1607, "step": 3890 }, { "epoch": 0.9894710135735126, "grad_norm": 1.1895602941513062, "learning_rate": 3.4610730190648423e-09, "loss": 1.1884, "step": 3900 }, { "epoch": 0.9920081187365216, "grad_norm": 1.22995924949646, "learning_rate": 2.0092462165194337e-09, "loss": 1.1906, "step": 3910 }, { "epoch": 0.9945452238995306, "grad_norm": 1.1689248085021973, "learning_rate": 9.497163268351595e-10, "loss": 1.1499, "step": 3920 }, { "epoch": 0.9970823290625397, "grad_norm": 1.3178656101226807, "learning_rate": 2.825665134920108e-10, "loss": 1.173, "step": 3930 }, { "epoch": 0.9996194342255487, "grad_norm": 1.2528467178344727, "learning_rate": 7.849141696048002e-12, "loss": 1.1774, "step": 3940 }, { "epoch": 0.9998731447418495, "step": 3941, "total_flos": 1.0333279192736596e+19, "train_loss": 1.22370115647731, "train_runtime": 28099.9428, "train_samples_per_second": 17.953, "train_steps_per_second": 0.14 } ], "logging_steps": 10, "max_steps": 3941, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0333279192736596e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }