{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3002, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033316674995835416, "grad_norm": 0.8176751732826233, "learning_rate": 1.8e-05, "loss": 2.2936, "step": 10 }, { "epoch": 0.006663334999167083, "grad_norm": 0.5975170731544495, "learning_rate": 3.8e-05, "loss": 2.187, "step": 20 }, { "epoch": 0.009995002498750625, "grad_norm": 0.5774023532867432, "learning_rate": 5.8e-05, "loss": 2.0133, "step": 30 }, { "epoch": 0.013326669998334166, "grad_norm": 0.5514141917228699, "learning_rate": 7.800000000000001e-05, "loss": 1.9357, "step": 40 }, { "epoch": 0.01665833749791771, "grad_norm": 0.599364161491394, "learning_rate": 9.8e-05, "loss": 1.8211, "step": 50 }, { "epoch": 0.01999000499750125, "grad_norm": 0.8537599444389343, "learning_rate": 0.000118, "loss": 1.9177, "step": 60 }, { "epoch": 0.02332167249708479, "grad_norm": 0.7055738568305969, "learning_rate": 0.000138, "loss": 1.8151, "step": 70 }, { "epoch": 0.026653339996668333, "grad_norm": 0.7180108428001404, "learning_rate": 0.00015800000000000002, "loss": 1.7354, "step": 80 }, { "epoch": 0.029985007496251874, "grad_norm": 0.9212970733642578, "learning_rate": 0.00017800000000000002, "loss": 1.6311, "step": 90 }, { "epoch": 0.03331667499583542, "grad_norm": 0.8818132281303406, "learning_rate": 0.00019800000000000002, "loss": 1.6173, "step": 100 }, { "epoch": 0.036648342495418956, "grad_norm": 0.7504797577857971, "learning_rate": 0.00019937973811164715, "loss": 1.698, "step": 110 }, { "epoch": 0.0399800099950025, "grad_norm": 0.7597770094871521, "learning_rate": 0.00019869055823569954, "loss": 1.5597, "step": 120 }, { "epoch": 0.04331167749458604, "grad_norm": 0.7281157374382019, "learning_rate": 0.0001980013783597519, "loss": 1.5941, "step": 130 }, { "epoch": 0.04664334499416958, "grad_norm": 0.8265408277511597, "learning_rate": 0.0001973121984838043, "loss": 1.3635, "step": 140 }, { "epoch": 0.04997501249375312, "grad_norm": 0.8942359089851379, "learning_rate": 0.00019662301860785665, "loss": 1.2472, "step": 150 }, { "epoch": 0.053306679993336666, "grad_norm": 0.6996039152145386, "learning_rate": 0.00019593383873190904, "loss": 1.5311, "step": 160 }, { "epoch": 0.0566383474929202, "grad_norm": 0.7954139709472656, "learning_rate": 0.00019524465885596142, "loss": 1.4262, "step": 170 }, { "epoch": 0.05997001499250375, "grad_norm": 0.8379319310188293, "learning_rate": 0.00019455547898001378, "loss": 1.4121, "step": 180 }, { "epoch": 0.06330168249208729, "grad_norm": 0.7172244191169739, "learning_rate": 0.00019386629910406617, "loss": 1.379, "step": 190 }, { "epoch": 0.06663334999167084, "grad_norm": 0.6560420393943787, "learning_rate": 0.00019317711922811853, "loss": 1.3696, "step": 200 }, { "epoch": 0.06996501749125437, "grad_norm": 0.6783866286277771, "learning_rate": 0.00019248793935217092, "loss": 1.3017, "step": 210 }, { "epoch": 0.07329668499083791, "grad_norm": 0.6463010311126709, "learning_rate": 0.0001917987594762233, "loss": 1.278, "step": 220 }, { "epoch": 0.07662835249042145, "grad_norm": 0.5589348673820496, "learning_rate": 0.0001911095796002757, "loss": 1.348, "step": 230 }, { "epoch": 0.079960019990005, "grad_norm": 0.7754651308059692, "learning_rate": 0.00019042039972432806, "loss": 1.2947, "step": 240 }, { "epoch": 0.08329168748958854, "grad_norm": 0.8420850038528442, "learning_rate": 0.00018973121984838042, "loss": 1.3621, "step": 250 }, { "epoch": 0.08662335498917208, "grad_norm": 0.7481808662414551, "learning_rate": 0.0001890420399724328, "loss": 1.2055, "step": 260 }, { "epoch": 0.08995502248875563, "grad_norm": 0.6213952898979187, "learning_rate": 0.0001883528600964852, "loss": 1.1484, "step": 270 }, { "epoch": 0.09328668998833917, "grad_norm": 0.767515242099762, "learning_rate": 0.00018766368022053758, "loss": 1.2137, "step": 280 }, { "epoch": 0.0966183574879227, "grad_norm": 0.5464005470275879, "learning_rate": 0.00018697450034458994, "loss": 1.0461, "step": 290 }, { "epoch": 0.09995002498750624, "grad_norm": 0.6953014135360718, "learning_rate": 0.00018628532046864233, "loss": 1.0792, "step": 300 }, { "epoch": 0.1032816924870898, "grad_norm": 0.6513417363166809, "learning_rate": 0.0001855961405926947, "loss": 1.3351, "step": 310 }, { "epoch": 0.10661335998667333, "grad_norm": 0.754256546497345, "learning_rate": 0.0001849069607167471, "loss": 1.1128, "step": 320 }, { "epoch": 0.10994502748625687, "grad_norm": 0.6703091859817505, "learning_rate": 0.00018421778084079946, "loss": 1.0618, "step": 330 }, { "epoch": 0.1132766949858404, "grad_norm": 0.5775123834609985, "learning_rate": 0.00018352860096485183, "loss": 1.0024, "step": 340 }, { "epoch": 0.11660836248542396, "grad_norm": 0.6067807078361511, "learning_rate": 0.0001828394210889042, "loss": 1.161, "step": 350 }, { "epoch": 0.1199400299850075, "grad_norm": 0.7527532577514648, "learning_rate": 0.00018215024121295657, "loss": 1.0168, "step": 360 }, { "epoch": 0.12327169748459103, "grad_norm": 0.7340474128723145, "learning_rate": 0.000181461061337009, "loss": 1.1902, "step": 370 }, { "epoch": 0.12660336498417457, "grad_norm": 0.6416488289833069, "learning_rate": 0.00018077188146106135, "loss": 1.1745, "step": 380 }, { "epoch": 0.12993503248375812, "grad_norm": 0.7203887701034546, "learning_rate": 0.00018008270158511374, "loss": 1.1377, "step": 390 }, { "epoch": 0.13326669998334167, "grad_norm": 0.7556202411651611, "learning_rate": 0.0001793935217091661, "loss": 1.0205, "step": 400 }, { "epoch": 0.1365983674829252, "grad_norm": 0.655582845211029, "learning_rate": 0.00017870434183321846, "loss": 1.1007, "step": 410 }, { "epoch": 0.13993003498250875, "grad_norm": 0.4769359230995178, "learning_rate": 0.00017801516195727087, "loss": 1.0824, "step": 420 }, { "epoch": 0.14326170248209227, "grad_norm": 0.7861637473106384, "learning_rate": 0.00017732598208132323, "loss": 1.086, "step": 430 }, { "epoch": 0.14659336998167583, "grad_norm": 0.4841909110546112, "learning_rate": 0.00017663680220537562, "loss": 1.1964, "step": 440 }, { "epoch": 0.14992503748125938, "grad_norm": 0.6953870058059692, "learning_rate": 0.00017594762232942798, "loss": 0.9949, "step": 450 }, { "epoch": 0.1532567049808429, "grad_norm": 0.7086942791938782, "learning_rate": 0.00017525844245348037, "loss": 1.0006, "step": 460 }, { "epoch": 0.15658837248042645, "grad_norm": 0.5370402336120605, "learning_rate": 0.00017456926257753273, "loss": 0.9224, "step": 470 }, { "epoch": 0.15992003998001, "grad_norm": 0.5842561721801758, "learning_rate": 0.00017388008270158512, "loss": 1.049, "step": 480 }, { "epoch": 0.16325170747959353, "grad_norm": 0.7351865768432617, "learning_rate": 0.0001731909028256375, "loss": 1.0031, "step": 490 }, { "epoch": 0.16658337497917708, "grad_norm": 0.5930982232093811, "learning_rate": 0.00017250172294968987, "loss": 1.1909, "step": 500 }, { "epoch": 0.16991504247876063, "grad_norm": 0.6230600476264954, "learning_rate": 0.00017181254307374225, "loss": 0.9673, "step": 510 }, { "epoch": 0.17324670997834415, "grad_norm": 0.5371518731117249, "learning_rate": 0.00017112336319779462, "loss": 1.089, "step": 520 }, { "epoch": 0.1765783774779277, "grad_norm": 0.5983089208602905, "learning_rate": 0.00017043418332184703, "loss": 1.0183, "step": 530 }, { "epoch": 0.17991004497751126, "grad_norm": 0.8884322643280029, "learning_rate": 0.0001697450034458994, "loss": 1.037, "step": 540 }, { "epoch": 0.18324171247709478, "grad_norm": 0.6050639748573303, "learning_rate": 0.00016905582356995175, "loss": 1.0937, "step": 550 }, { "epoch": 0.18657337997667833, "grad_norm": 0.5100018382072449, "learning_rate": 0.00016836664369400414, "loss": 0.9618, "step": 560 }, { "epoch": 0.18990504747626186, "grad_norm": 0.5553308725357056, "learning_rate": 0.0001676774638180565, "loss": 1.1016, "step": 570 }, { "epoch": 0.1932367149758454, "grad_norm": 0.7883793115615845, "learning_rate": 0.00016698828394210891, "loss": 1.0927, "step": 580 }, { "epoch": 0.19656838247542896, "grad_norm": 0.7052305340766907, "learning_rate": 0.00016629910406616128, "loss": 0.939, "step": 590 }, { "epoch": 0.19990004997501248, "grad_norm": 0.6732206344604492, "learning_rate": 0.00016560992419021366, "loss": 1.0596, "step": 600 }, { "epoch": 0.20323171747459604, "grad_norm": 0.7498496174812317, "learning_rate": 0.00016492074431426602, "loss": 0.9612, "step": 610 }, { "epoch": 0.2065633849741796, "grad_norm": 0.7365151047706604, "learning_rate": 0.0001642315644383184, "loss": 1.0503, "step": 620 }, { "epoch": 0.2098950524737631, "grad_norm": 0.5978183746337891, "learning_rate": 0.0001635423845623708, "loss": 1.07, "step": 630 }, { "epoch": 0.21322671997334666, "grad_norm": 0.7099848389625549, "learning_rate": 0.00016285320468642316, "loss": 0.9538, "step": 640 }, { "epoch": 0.2165583874729302, "grad_norm": 0.6647136807441711, "learning_rate": 0.00016216402481047555, "loss": 0.9772, "step": 650 }, { "epoch": 0.21989005497251374, "grad_norm": 0.6482895612716675, "learning_rate": 0.0001614748449345279, "loss": 1.0033, "step": 660 }, { "epoch": 0.2232217224720973, "grad_norm": 0.583042323589325, "learning_rate": 0.0001607856650585803, "loss": 0.8133, "step": 670 }, { "epoch": 0.2265533899716808, "grad_norm": 0.6488454341888428, "learning_rate": 0.00016009648518263268, "loss": 0.9405, "step": 680 }, { "epoch": 0.22988505747126436, "grad_norm": 0.5041667819023132, "learning_rate": 0.00015940730530668507, "loss": 1.0427, "step": 690 }, { "epoch": 0.23321672497084792, "grad_norm": 0.6406366229057312, "learning_rate": 0.00015871812543073743, "loss": 0.9604, "step": 700 }, { "epoch": 0.23654839247043144, "grad_norm": 0.6386472582817078, "learning_rate": 0.0001580289455547898, "loss": 0.8826, "step": 710 }, { "epoch": 0.239880059970015, "grad_norm": 0.7378501892089844, "learning_rate": 0.00015733976567884218, "loss": 0.9141, "step": 720 }, { "epoch": 0.24321172746959854, "grad_norm": 0.6911444067955017, "learning_rate": 0.00015665058580289457, "loss": 0.8735, "step": 730 }, { "epoch": 0.24654339496918207, "grad_norm": 0.7621609568595886, "learning_rate": 0.00015596140592694696, "loss": 0.8972, "step": 740 }, { "epoch": 0.24987506246876562, "grad_norm": 0.5592761039733887, "learning_rate": 0.00015527222605099932, "loss": 0.8612, "step": 750 }, { "epoch": 0.25320672996834914, "grad_norm": 0.4869195520877838, "learning_rate": 0.0001545830461750517, "loss": 0.8663, "step": 760 }, { "epoch": 0.2565383974679327, "grad_norm": 0.8638468980789185, "learning_rate": 0.00015389386629910407, "loss": 0.9393, "step": 770 }, { "epoch": 0.25987006496751625, "grad_norm": 0.5576454997062683, "learning_rate": 0.00015320468642315645, "loss": 0.8925, "step": 780 }, { "epoch": 0.26320173246709977, "grad_norm": 0.6767865419387817, "learning_rate": 0.00015251550654720884, "loss": 0.8692, "step": 790 }, { "epoch": 0.26653339996668335, "grad_norm": 0.8351936340332031, "learning_rate": 0.0001518263266712612, "loss": 0.9813, "step": 800 }, { "epoch": 0.2698650674662669, "grad_norm": 0.6378228068351746, "learning_rate": 0.0001511371467953136, "loss": 0.7901, "step": 810 }, { "epoch": 0.2731967349658504, "grad_norm": 0.6529081463813782, "learning_rate": 0.00015044796691936595, "loss": 0.9269, "step": 820 }, { "epoch": 0.276528402465434, "grad_norm": 0.7450738549232483, "learning_rate": 0.00014975878704341834, "loss": 0.9723, "step": 830 }, { "epoch": 0.2798600699650175, "grad_norm": 0.7390174865722656, "learning_rate": 0.00014906960716747073, "loss": 0.8644, "step": 840 }, { "epoch": 0.283191737464601, "grad_norm": 0.6609140634536743, "learning_rate": 0.0001483804272915231, "loss": 0.9261, "step": 850 }, { "epoch": 0.28652340496418455, "grad_norm": 0.8164989352226257, "learning_rate": 0.00014769124741557547, "loss": 0.913, "step": 860 }, { "epoch": 0.2898550724637681, "grad_norm": 0.7193732261657715, "learning_rate": 0.00014700206753962783, "loss": 0.8622, "step": 870 }, { "epoch": 0.29318673996335165, "grad_norm": 0.5452458262443542, "learning_rate": 0.00014631288766368022, "loss": 0.7792, "step": 880 }, { "epoch": 0.2965184074629352, "grad_norm": 0.7681081295013428, "learning_rate": 0.0001456237077877326, "loss": 0.965, "step": 890 }, { "epoch": 0.29985007496251875, "grad_norm": 0.6345311999320984, "learning_rate": 0.000144934527911785, "loss": 0.7115, "step": 900 }, { "epoch": 0.3031817424621023, "grad_norm": 0.6799845695495605, "learning_rate": 0.00014424534803583736, "loss": 0.9525, "step": 910 }, { "epoch": 0.3065134099616858, "grad_norm": 0.6358153223991394, "learning_rate": 0.00014355616815988975, "loss": 0.9661, "step": 920 }, { "epoch": 0.3098450774612694, "grad_norm": 0.8221323490142822, "learning_rate": 0.0001428669882839421, "loss": 1.0507, "step": 930 }, { "epoch": 0.3131767449608529, "grad_norm": 0.8563844561576843, "learning_rate": 0.0001421778084079945, "loss": 0.8268, "step": 940 }, { "epoch": 0.31650841246043643, "grad_norm": 0.6171509027481079, "learning_rate": 0.00014148862853204688, "loss": 0.8938, "step": 950 }, { "epoch": 0.31984007996002, "grad_norm": 0.6679477095603943, "learning_rate": 0.00014079944865609924, "loss": 0.9604, "step": 960 }, { "epoch": 0.32317174745960353, "grad_norm": 0.7955806851387024, "learning_rate": 0.00014011026878015163, "loss": 0.9084, "step": 970 }, { "epoch": 0.32650341495918705, "grad_norm": 0.6949059367179871, "learning_rate": 0.000139421088904204, "loss": 0.8687, "step": 980 }, { "epoch": 0.32983508245877063, "grad_norm": 0.6657271385192871, "learning_rate": 0.00013873190902825638, "loss": 0.9217, "step": 990 }, { "epoch": 0.33316674995835416, "grad_norm": 0.8809479475021362, "learning_rate": 0.00013804272915230877, "loss": 0.7875, "step": 1000 }, { "epoch": 0.3364984174579377, "grad_norm": 0.47438332438468933, "learning_rate": 0.00013735354927636113, "loss": 0.8885, "step": 1010 }, { "epoch": 0.33983008495752126, "grad_norm": 0.7127712368965149, "learning_rate": 0.00013666436940041352, "loss": 0.8514, "step": 1020 }, { "epoch": 0.3431617524571048, "grad_norm": 0.7310017347335815, "learning_rate": 0.00013597518952446588, "loss": 0.8137, "step": 1030 }, { "epoch": 0.3464934199566883, "grad_norm": 0.7233092188835144, "learning_rate": 0.00013528600964851826, "loss": 0.7911, "step": 1040 }, { "epoch": 0.3498250874562719, "grad_norm": 0.7451456785202026, "learning_rate": 0.00013459682977257065, "loss": 0.9467, "step": 1050 }, { "epoch": 0.3531567549558554, "grad_norm": 0.796917736530304, "learning_rate": 0.00013390764989662304, "loss": 0.9069, "step": 1060 }, { "epoch": 0.35648842245543894, "grad_norm": 0.571403443813324, "learning_rate": 0.0001332184700206754, "loss": 0.8283, "step": 1070 }, { "epoch": 0.3598200899550225, "grad_norm": 0.6184263825416565, "learning_rate": 0.0001325292901447278, "loss": 0.875, "step": 1080 }, { "epoch": 0.36315175745460604, "grad_norm": 0.7700721025466919, "learning_rate": 0.00013184011026878015, "loss": 0.8443, "step": 1090 }, { "epoch": 0.36648342495418956, "grad_norm": 0.8920392394065857, "learning_rate": 0.00013115093039283254, "loss": 0.8516, "step": 1100 }, { "epoch": 0.3698150924537731, "grad_norm": 0.6632056832313538, "learning_rate": 0.00013046175051688492, "loss": 0.7407, "step": 1110 }, { "epoch": 0.37314675995335667, "grad_norm": 0.677737832069397, "learning_rate": 0.00012977257064093728, "loss": 0.9112, "step": 1120 }, { "epoch": 0.3764784274529402, "grad_norm": 0.7659761309623718, "learning_rate": 0.00012908339076498967, "loss": 0.812, "step": 1130 }, { "epoch": 0.3798100949525237, "grad_norm": 0.6237064003944397, "learning_rate": 0.00012839421088904203, "loss": 0.8422, "step": 1140 }, { "epoch": 0.3831417624521073, "grad_norm": 0.8118287920951843, "learning_rate": 0.00012770503101309442, "loss": 0.8597, "step": 1150 }, { "epoch": 0.3864734299516908, "grad_norm": 0.7423121333122253, "learning_rate": 0.0001270158511371468, "loss": 0.7809, "step": 1160 }, { "epoch": 0.38980509745127434, "grad_norm": 0.7867801785469055, "learning_rate": 0.00012632667126119917, "loss": 1.0343, "step": 1170 }, { "epoch": 0.3931367649508579, "grad_norm": 0.7463882565498352, "learning_rate": 0.00012563749138525156, "loss": 0.8384, "step": 1180 }, { "epoch": 0.39646843245044144, "grad_norm": 0.68085777759552, "learning_rate": 0.00012494831150930392, "loss": 0.9429, "step": 1190 }, { "epoch": 0.39980009995002497, "grad_norm": 0.741705060005188, "learning_rate": 0.0001242591316333563, "loss": 0.9062, "step": 1200 }, { "epoch": 0.40313176744960855, "grad_norm": 0.6207161545753479, "learning_rate": 0.0001235699517574087, "loss": 0.8259, "step": 1210 }, { "epoch": 0.40646343494919207, "grad_norm": 0.6957824230194092, "learning_rate": 0.00012288077188146108, "loss": 0.9225, "step": 1220 }, { "epoch": 0.4097951024487756, "grad_norm": 0.7893931865692139, "learning_rate": 0.00012219159200551344, "loss": 0.7725, "step": 1230 }, { "epoch": 0.4131267699483592, "grad_norm": 0.7295857071876526, "learning_rate": 0.00012150241212956582, "loss": 0.7769, "step": 1240 }, { "epoch": 0.4164584374479427, "grad_norm": 0.8230463862419128, "learning_rate": 0.0001208132322536182, "loss": 0.8399, "step": 1250 }, { "epoch": 0.4197901049475262, "grad_norm": 0.8607476949691772, "learning_rate": 0.00012012405237767056, "loss": 0.6551, "step": 1260 }, { "epoch": 0.4231217724471098, "grad_norm": 0.7293261885643005, "learning_rate": 0.00011943487250172297, "loss": 0.7303, "step": 1270 }, { "epoch": 0.4264534399466933, "grad_norm": 0.5877302289009094, "learning_rate": 0.00011874569262577533, "loss": 0.7893, "step": 1280 }, { "epoch": 0.42978510744627685, "grad_norm": 0.5563659071922302, "learning_rate": 0.00011805651274982771, "loss": 0.7517, "step": 1290 }, { "epoch": 0.4331167749458604, "grad_norm": 0.6885454654693604, "learning_rate": 0.00011736733287388009, "loss": 0.6623, "step": 1300 }, { "epoch": 0.43644844244544395, "grad_norm": 0.8266370892524719, "learning_rate": 0.00011667815299793245, "loss": 0.8886, "step": 1310 }, { "epoch": 0.4397801099450275, "grad_norm": 0.5514026880264282, "learning_rate": 0.00011598897312198485, "loss": 0.6652, "step": 1320 }, { "epoch": 0.44311177744461105, "grad_norm": 0.7804675102233887, "learning_rate": 0.00011529979324603721, "loss": 0.911, "step": 1330 }, { "epoch": 0.4464434449441946, "grad_norm": 0.7426096200942993, "learning_rate": 0.0001146106133700896, "loss": 0.7434, "step": 1340 }, { "epoch": 0.4497751124437781, "grad_norm": 0.6535798907279968, "learning_rate": 0.00011392143349414197, "loss": 0.8465, "step": 1350 }, { "epoch": 0.4531067799433616, "grad_norm": 0.665757417678833, "learning_rate": 0.00011323225361819436, "loss": 0.8312, "step": 1360 }, { "epoch": 0.4564384474429452, "grad_norm": 0.64393550157547, "learning_rate": 0.00011254307374224673, "loss": 0.6107, "step": 1370 }, { "epoch": 0.45977011494252873, "grad_norm": 0.6122268438339233, "learning_rate": 0.00011185389386629912, "loss": 0.8532, "step": 1380 }, { "epoch": 0.46310178244211225, "grad_norm": 0.6691811680793762, "learning_rate": 0.00011116471399035148, "loss": 0.7827, "step": 1390 }, { "epoch": 0.46643344994169583, "grad_norm": 0.611470639705658, "learning_rate": 0.00011047553411440386, "loss": 0.7598, "step": 1400 }, { "epoch": 0.46976511744127936, "grad_norm": 0.8762800097465515, "learning_rate": 0.00010978635423845624, "loss": 0.8613, "step": 1410 }, { "epoch": 0.4730967849408629, "grad_norm": 0.710159182548523, "learning_rate": 0.00010909717436250862, "loss": 0.7768, "step": 1420 }, { "epoch": 0.47642845244044646, "grad_norm": 0.9778875708580017, "learning_rate": 0.00010840799448656101, "loss": 0.8116, "step": 1430 }, { "epoch": 0.47976011994003, "grad_norm": 0.6341977119445801, "learning_rate": 0.00010771881461061337, "loss": 0.7059, "step": 1440 }, { "epoch": 0.4830917874396135, "grad_norm": 0.7075402736663818, "learning_rate": 0.00010702963473466577, "loss": 0.9244, "step": 1450 }, { "epoch": 0.4864234549391971, "grad_norm": 0.6109429001808167, "learning_rate": 0.00010634045485871813, "loss": 0.802, "step": 1460 }, { "epoch": 0.4897551224387806, "grad_norm": 0.7478988170623779, "learning_rate": 0.0001056512749827705, "loss": 0.8911, "step": 1470 }, { "epoch": 0.49308678993836413, "grad_norm": 0.6688179969787598, "learning_rate": 0.00010496209510682289, "loss": 0.7508, "step": 1480 }, { "epoch": 0.4964184574379477, "grad_norm": 0.7863402962684631, "learning_rate": 0.00010427291523087525, "loss": 0.8439, "step": 1490 }, { "epoch": 0.49975012493753124, "grad_norm": 0.8334706425666809, "learning_rate": 0.00010358373535492765, "loss": 0.7596, "step": 1500 }, { "epoch": 0.5030817924371148, "grad_norm": 0.8061437010765076, "learning_rate": 0.00010289455547898001, "loss": 0.7742, "step": 1510 }, { "epoch": 0.5064134599366983, "grad_norm": 0.7398769855499268, "learning_rate": 0.0001022053756030324, "loss": 0.7446, "step": 1520 }, { "epoch": 0.5097451274362819, "grad_norm": 0.5730561017990112, "learning_rate": 0.00010151619572708478, "loss": 0.784, "step": 1530 }, { "epoch": 0.5130767949358654, "grad_norm": 0.6701236963272095, "learning_rate": 0.00010082701585113714, "loss": 0.8104, "step": 1540 }, { "epoch": 0.5164084624354489, "grad_norm": 0.681547999382019, "learning_rate": 0.00010013783597518952, "loss": 0.6719, "step": 1550 }, { "epoch": 0.5197401299350325, "grad_norm": 0.6569002270698547, "learning_rate": 9.944865609924191e-05, "loss": 0.7091, "step": 1560 }, { "epoch": 0.5230717974346161, "grad_norm": 0.8231265544891357, "learning_rate": 9.875947622329429e-05, "loss": 0.8079, "step": 1570 }, { "epoch": 0.5264034649341995, "grad_norm": 0.8120758533477783, "learning_rate": 9.807029634734666e-05, "loss": 0.8125, "step": 1580 }, { "epoch": 0.5297351324337831, "grad_norm": 0.6406270861625671, "learning_rate": 9.738111647139903e-05, "loss": 0.6543, "step": 1590 }, { "epoch": 0.5330667999333667, "grad_norm": 0.8023959398269653, "learning_rate": 9.669193659545141e-05, "loss": 0.8343, "step": 1600 }, { "epoch": 0.5363984674329502, "grad_norm": 0.7827622294425964, "learning_rate": 9.60027567195038e-05, "loss": 0.7749, "step": 1610 }, { "epoch": 0.5397301349325337, "grad_norm": 0.5446188449859619, "learning_rate": 9.531357684355617e-05, "loss": 0.6603, "step": 1620 }, { "epoch": 0.5430618024321173, "grad_norm": 0.7404822707176208, "learning_rate": 9.462439696760856e-05, "loss": 0.8172, "step": 1630 }, { "epoch": 0.5463934699317008, "grad_norm": 0.7257384061813354, "learning_rate": 9.393521709166093e-05, "loss": 0.864, "step": 1640 }, { "epoch": 0.5497251374312844, "grad_norm": 0.8640374541282654, "learning_rate": 9.324603721571331e-05, "loss": 0.6451, "step": 1650 }, { "epoch": 0.553056804930868, "grad_norm": 0.6205821633338928, "learning_rate": 9.255685733976568e-05, "loss": 0.8729, "step": 1660 }, { "epoch": 0.5563884724304514, "grad_norm": 0.7128989696502686, "learning_rate": 9.186767746381806e-05, "loss": 0.7218, "step": 1670 }, { "epoch": 0.559720139930035, "grad_norm": 0.6116006970405579, "learning_rate": 9.117849758787044e-05, "loss": 0.7591, "step": 1680 }, { "epoch": 0.5630518074296186, "grad_norm": 0.8077837228775024, "learning_rate": 9.048931771192282e-05, "loss": 0.7267, "step": 1690 }, { "epoch": 0.566383474929202, "grad_norm": 0.8824722766876221, "learning_rate": 8.980013783597519e-05, "loss": 0.6545, "step": 1700 }, { "epoch": 0.5697151424287856, "grad_norm": 0.9038705229759216, "learning_rate": 8.911095796002758e-05, "loss": 0.7376, "step": 1710 }, { "epoch": 0.5730468099283691, "grad_norm": 0.7288265228271484, "learning_rate": 8.842177808407995e-05, "loss": 0.8062, "step": 1720 }, { "epoch": 0.5763784774279527, "grad_norm": 0.6127156019210815, "learning_rate": 8.773259820813233e-05, "loss": 0.7298, "step": 1730 }, { "epoch": 0.5797101449275363, "grad_norm": 0.7607082724571228, "learning_rate": 8.70434183321847e-05, "loss": 0.7417, "step": 1740 }, { "epoch": 0.5830418124271197, "grad_norm": 0.8536520004272461, "learning_rate": 8.635423845623708e-05, "loss": 0.6495, "step": 1750 }, { "epoch": 0.5863734799267033, "grad_norm": 0.5629620552062988, "learning_rate": 8.566505858028946e-05, "loss": 0.5853, "step": 1760 }, { "epoch": 0.5897051474262869, "grad_norm": 0.8041568398475647, "learning_rate": 8.497587870434184e-05, "loss": 0.7068, "step": 1770 }, { "epoch": 0.5930368149258703, "grad_norm": 0.9616042375564575, "learning_rate": 8.428669882839421e-05, "loss": 0.7051, "step": 1780 }, { "epoch": 0.5963684824254539, "grad_norm": 0.6616283655166626, "learning_rate": 8.35975189524466e-05, "loss": 0.7698, "step": 1790 }, { "epoch": 0.5997001499250375, "grad_norm": 0.7523969411849976, "learning_rate": 8.290833907649897e-05, "loss": 0.69, "step": 1800 }, { "epoch": 0.603031817424621, "grad_norm": 0.5366020202636719, "learning_rate": 8.221915920055135e-05, "loss": 0.6432, "step": 1810 }, { "epoch": 0.6063634849242046, "grad_norm": 0.7098552584648132, "learning_rate": 8.152997932460372e-05, "loss": 0.6767, "step": 1820 }, { "epoch": 0.6096951524237881, "grad_norm": 0.5750883221626282, "learning_rate": 8.08407994486561e-05, "loss": 0.6853, "step": 1830 }, { "epoch": 0.6130268199233716, "grad_norm": 0.6619908213615417, "learning_rate": 8.015161957270849e-05, "loss": 0.6461, "step": 1840 }, { "epoch": 0.6163584874229552, "grad_norm": 0.6529950499534607, "learning_rate": 7.946243969676086e-05, "loss": 0.563, "step": 1850 }, { "epoch": 0.6196901549225388, "grad_norm": 0.6484772562980652, "learning_rate": 7.877325982081323e-05, "loss": 0.8557, "step": 1860 }, { "epoch": 0.6230218224221222, "grad_norm": 0.7002941370010376, "learning_rate": 7.808407994486562e-05, "loss": 0.6468, "step": 1870 }, { "epoch": 0.6263534899217058, "grad_norm": 0.6880629658699036, "learning_rate": 7.739490006891798e-05, "loss": 0.8108, "step": 1880 }, { "epoch": 0.6296851574212894, "grad_norm": 0.7958945035934448, "learning_rate": 7.670572019297037e-05, "loss": 0.6569, "step": 1890 }, { "epoch": 0.6330168249208729, "grad_norm": 0.6312280297279358, "learning_rate": 7.601654031702274e-05, "loss": 0.7134, "step": 1900 }, { "epoch": 0.6363484924204564, "grad_norm": 0.5090949535369873, "learning_rate": 7.532736044107512e-05, "loss": 0.709, "step": 1910 }, { "epoch": 0.63968015992004, "grad_norm": 0.8009600043296814, "learning_rate": 7.46381805651275e-05, "loss": 0.7303, "step": 1920 }, { "epoch": 0.6430118274196235, "grad_norm": 0.6147052049636841, "learning_rate": 7.394900068917988e-05, "loss": 0.7837, "step": 1930 }, { "epoch": 0.6463434949192071, "grad_norm": 1.0245405435562134, "learning_rate": 7.325982081323225e-05, "loss": 0.6561, "step": 1940 }, { "epoch": 0.6496751624187906, "grad_norm": 0.7784261107444763, "learning_rate": 7.257064093728464e-05, "loss": 0.7258, "step": 1950 }, { "epoch": 0.6530068299183741, "grad_norm": 0.8354228138923645, "learning_rate": 7.1881461061337e-05, "loss": 0.6183, "step": 1960 }, { "epoch": 0.6563384974179577, "grad_norm": 0.6210038661956787, "learning_rate": 7.119228118538939e-05, "loss": 0.5344, "step": 1970 }, { "epoch": 0.6596701649175413, "grad_norm": 0.7484562397003174, "learning_rate": 7.050310130944176e-05, "loss": 0.645, "step": 1980 }, { "epoch": 0.6630018324171247, "grad_norm": 0.4157319962978363, "learning_rate": 6.981392143349414e-05, "loss": 0.5934, "step": 1990 }, { "epoch": 0.6663334999167083, "grad_norm": 0.8641183376312256, "learning_rate": 6.912474155754653e-05, "loss": 0.6435, "step": 2000 }, { "epoch": 0.6696651674162919, "grad_norm": 0.6255794167518616, "learning_rate": 6.84355616815989e-05, "loss": 0.5546, "step": 2010 }, { "epoch": 0.6729968349158754, "grad_norm": 0.6411312222480774, "learning_rate": 6.774638180565129e-05, "loss": 0.7123, "step": 2020 }, { "epoch": 0.6763285024154589, "grad_norm": 0.5700286626815796, "learning_rate": 6.705720192970366e-05, "loss": 0.7319, "step": 2030 }, { "epoch": 0.6796601699150425, "grad_norm": 0.898933470249176, "learning_rate": 6.636802205375602e-05, "loss": 0.885, "step": 2040 }, { "epoch": 0.682991837414626, "grad_norm": 0.8384907245635986, "learning_rate": 6.567884217780841e-05, "loss": 0.6653, "step": 2050 }, { "epoch": 0.6863235049142096, "grad_norm": 0.5363606214523315, "learning_rate": 6.498966230186079e-05, "loss": 0.6261, "step": 2060 }, { "epoch": 0.6896551724137931, "grad_norm": 0.8651995062828064, "learning_rate": 6.430048242591317e-05, "loss": 0.6276, "step": 2070 }, { "epoch": 0.6929868399133766, "grad_norm": 1.0924036502838135, "learning_rate": 6.361130254996555e-05, "loss": 0.5616, "step": 2080 }, { "epoch": 0.6963185074129602, "grad_norm": 0.8714765906333923, "learning_rate": 6.292212267401792e-05, "loss": 0.6231, "step": 2090 }, { "epoch": 0.6996501749125438, "grad_norm": 0.9532473683357239, "learning_rate": 6.223294279807031e-05, "loss": 0.7391, "step": 2100 }, { "epoch": 0.7029818424121272, "grad_norm": 0.5686549544334412, "learning_rate": 6.154376292212267e-05, "loss": 0.7223, "step": 2110 }, { "epoch": 0.7063135099117108, "grad_norm": 0.883965015411377, "learning_rate": 6.085458304617505e-05, "loss": 0.6979, "step": 2120 }, { "epoch": 0.7096451774112944, "grad_norm": 0.8032324314117432, "learning_rate": 6.016540317022743e-05, "loss": 0.6049, "step": 2130 }, { "epoch": 0.7129768449108779, "grad_norm": 0.6529830098152161, "learning_rate": 5.9476223294279806e-05, "loss": 0.6186, "step": 2140 }, { "epoch": 0.7163085124104615, "grad_norm": 0.7644656896591187, "learning_rate": 5.878704341833219e-05, "loss": 0.6225, "step": 2150 }, { "epoch": 0.719640179910045, "grad_norm": 0.7979158759117126, "learning_rate": 5.809786354238457e-05, "loss": 0.6094, "step": 2160 }, { "epoch": 0.7229718474096285, "grad_norm": 0.9250969886779785, "learning_rate": 5.740868366643695e-05, "loss": 0.5802, "step": 2170 }, { "epoch": 0.7263035149092121, "grad_norm": 0.8039131760597229, "learning_rate": 5.671950379048932e-05, "loss": 0.7577, "step": 2180 }, { "epoch": 0.7296351824087957, "grad_norm": 0.7784701585769653, "learning_rate": 5.603032391454169e-05, "loss": 0.8382, "step": 2190 }, { "epoch": 0.7329668499083791, "grad_norm": 0.6515526175498962, "learning_rate": 5.534114403859407e-05, "loss": 0.6808, "step": 2200 }, { "epoch": 0.7362985174079627, "grad_norm": 1.0747514963150024, "learning_rate": 5.465196416264645e-05, "loss": 0.6112, "step": 2210 }, { "epoch": 0.7396301849075462, "grad_norm": 1.031267762184143, "learning_rate": 5.3962784286698834e-05, "loss": 0.6316, "step": 2220 }, { "epoch": 0.7429618524071298, "grad_norm": 0.8532452583312988, "learning_rate": 5.327360441075121e-05, "loss": 0.5855, "step": 2230 }, { "epoch": 0.7462935199067133, "grad_norm": 0.7305378317832947, "learning_rate": 5.258442453480359e-05, "loss": 0.5746, "step": 2240 }, { "epoch": 0.7496251874062968, "grad_norm": 0.7505248188972473, "learning_rate": 5.189524465885597e-05, "loss": 0.6912, "step": 2250 }, { "epoch": 0.7529568549058804, "grad_norm": 0.8554951548576355, "learning_rate": 5.120606478290834e-05, "loss": 0.5665, "step": 2260 }, { "epoch": 0.756288522405464, "grad_norm": 0.9799861311912537, "learning_rate": 5.051688490696072e-05, "loss": 0.7554, "step": 2270 }, { "epoch": 0.7596201899050474, "grad_norm": 0.6496158242225647, "learning_rate": 4.982770503101309e-05, "loss": 0.4964, "step": 2280 }, { "epoch": 0.762951857404631, "grad_norm": 0.7765501141548157, "learning_rate": 4.9138525155065474e-05, "loss": 0.6501, "step": 2290 }, { "epoch": 0.7662835249042146, "grad_norm": 0.926641047000885, "learning_rate": 4.8449345279117855e-05, "loss": 0.6377, "step": 2300 }, { "epoch": 0.769615192403798, "grad_norm": 0.4838825464248657, "learning_rate": 4.776016540317023e-05, "loss": 0.6421, "step": 2310 }, { "epoch": 0.7729468599033816, "grad_norm": 1.0005497932434082, "learning_rate": 4.70709855272226e-05, "loss": 0.663, "step": 2320 }, { "epoch": 0.7762785274029652, "grad_norm": 0.8331218957901001, "learning_rate": 4.6381805651274984e-05, "loss": 0.6195, "step": 2330 }, { "epoch": 0.7796101949025487, "grad_norm": 0.6971142888069153, "learning_rate": 4.5692625775327365e-05, "loss": 0.5667, "step": 2340 }, { "epoch": 0.7829418624021323, "grad_norm": 0.7409766316413879, "learning_rate": 4.5003445899379746e-05, "loss": 0.5932, "step": 2350 }, { "epoch": 0.7862735299017158, "grad_norm": 0.8003771305084229, "learning_rate": 4.431426602343211e-05, "loss": 0.6524, "step": 2360 }, { "epoch": 0.7896051974012993, "grad_norm": 0.8950629234313965, "learning_rate": 4.3625086147484494e-05, "loss": 0.6569, "step": 2370 }, { "epoch": 0.7929368649008829, "grad_norm": 0.6981242895126343, "learning_rate": 4.2935906271536875e-05, "loss": 0.6612, "step": 2380 }, { "epoch": 0.7962685324004665, "grad_norm": 0.7851802706718445, "learning_rate": 4.2246726395589256e-05, "loss": 0.6551, "step": 2390 }, { "epoch": 0.7996001999000499, "grad_norm": 0.7606090903282166, "learning_rate": 4.1557546519641624e-05, "loss": 0.6059, "step": 2400 }, { "epoch": 0.8029318673996335, "grad_norm": 0.8207200169563293, "learning_rate": 4.0868366643694005e-05, "loss": 0.6677, "step": 2410 }, { "epoch": 0.8062635348992171, "grad_norm": 0.9407020211219788, "learning_rate": 4.0179186767746386e-05, "loss": 0.552, "step": 2420 }, { "epoch": 0.8095952023988006, "grad_norm": 0.6682471632957458, "learning_rate": 3.949000689179876e-05, "loss": 0.5273, "step": 2430 }, { "epoch": 0.8129268698983841, "grad_norm": 0.9708258509635925, "learning_rate": 3.880082701585114e-05, "loss": 0.638, "step": 2440 }, { "epoch": 0.8162585373979677, "grad_norm": 0.5546590685844421, "learning_rate": 3.8111647139903515e-05, "loss": 0.6464, "step": 2450 }, { "epoch": 0.8195902048975512, "grad_norm": 0.7525760531425476, "learning_rate": 3.7422467263955896e-05, "loss": 0.383, "step": 2460 }, { "epoch": 0.8229218723971348, "grad_norm": 0.8465914726257324, "learning_rate": 3.673328738800827e-05, "loss": 0.664, "step": 2470 }, { "epoch": 0.8262535398967183, "grad_norm": 0.9415400624275208, "learning_rate": 3.604410751206065e-05, "loss": 0.5489, "step": 2480 }, { "epoch": 0.8295852073963018, "grad_norm": 0.699641764163971, "learning_rate": 3.5354927636113026e-05, "loss": 0.5779, "step": 2490 }, { "epoch": 0.8329168748958854, "grad_norm": 0.6599105596542358, "learning_rate": 3.4665747760165406e-05, "loss": 0.5486, "step": 2500 }, { "epoch": 0.836248542395469, "grad_norm": 0.8070369362831116, "learning_rate": 3.397656788421778e-05, "loss": 0.7153, "step": 2510 }, { "epoch": 0.8395802098950524, "grad_norm": 0.7151026129722595, "learning_rate": 3.328738800827016e-05, "loss": 0.5442, "step": 2520 }, { "epoch": 0.842911877394636, "grad_norm": 1.014334797859192, "learning_rate": 3.2598208132322536e-05, "loss": 0.6155, "step": 2530 }, { "epoch": 0.8462435448942196, "grad_norm": 0.7710210084915161, "learning_rate": 3.190902825637492e-05, "loss": 0.6111, "step": 2540 }, { "epoch": 0.8495752123938031, "grad_norm": 0.7453182935714722, "learning_rate": 3.121984838042729e-05, "loss": 0.5853, "step": 2550 }, { "epoch": 0.8529068798933866, "grad_norm": 0.9670674800872803, "learning_rate": 3.053066850447967e-05, "loss": 0.5902, "step": 2560 }, { "epoch": 0.8562385473929702, "grad_norm": 0.9570378661155701, "learning_rate": 2.984148862853205e-05, "loss": 0.6738, "step": 2570 }, { "epoch": 0.8595702148925537, "grad_norm": 0.6435806751251221, "learning_rate": 2.9152308752584427e-05, "loss": 0.6568, "step": 2580 }, { "epoch": 0.8629018823921373, "grad_norm": 0.8892498016357422, "learning_rate": 2.84631288766368e-05, "loss": 0.6305, "step": 2590 }, { "epoch": 0.8662335498917209, "grad_norm": 0.9092233777046204, "learning_rate": 2.7773949000689182e-05, "loss": 0.6807, "step": 2600 }, { "epoch": 0.8695652173913043, "grad_norm": 0.6867402195930481, "learning_rate": 2.708476912474156e-05, "loss": 0.6785, "step": 2610 }, { "epoch": 0.8728968848908879, "grad_norm": 1.011870265007019, "learning_rate": 2.6395589248793934e-05, "loss": 0.773, "step": 2620 }, { "epoch": 0.8762285523904715, "grad_norm": 0.5611357688903809, "learning_rate": 2.5706409372846312e-05, "loss": 0.5763, "step": 2630 }, { "epoch": 0.879560219890055, "grad_norm": 0.7307304739952087, "learning_rate": 2.5017229496898693e-05, "loss": 0.6522, "step": 2640 }, { "epoch": 0.8828918873896385, "grad_norm": 0.8726571202278137, "learning_rate": 2.4328049620951067e-05, "loss": 0.67, "step": 2650 }, { "epoch": 0.8862235548892221, "grad_norm": 0.7599055171012878, "learning_rate": 2.3638869745003448e-05, "loss": 0.5266, "step": 2660 }, { "epoch": 0.8895552223888056, "grad_norm": 0.9391018748283386, "learning_rate": 2.2949689869055822e-05, "loss": 0.5366, "step": 2670 }, { "epoch": 0.8928868898883892, "grad_norm": 0.801648736000061, "learning_rate": 2.2260509993108203e-05, "loss": 0.5702, "step": 2680 }, { "epoch": 0.8962185573879726, "grad_norm": 0.7381575107574463, "learning_rate": 2.157133011716058e-05, "loss": 0.5519, "step": 2690 }, { "epoch": 0.8995502248875562, "grad_norm": 0.7763687968254089, "learning_rate": 2.088215024121296e-05, "loss": 0.6345, "step": 2700 }, { "epoch": 0.9028818923871398, "grad_norm": 0.6713552474975586, "learning_rate": 2.0192970365265336e-05, "loss": 0.4188, "step": 2710 }, { "epoch": 0.9062135598867233, "grad_norm": 0.4726286232471466, "learning_rate": 1.9503790489317714e-05, "loss": 0.7027, "step": 2720 }, { "epoch": 0.9095452273863068, "grad_norm": 0.559560239315033, "learning_rate": 1.881461061337009e-05, "loss": 0.541, "step": 2730 }, { "epoch": 0.9128768948858904, "grad_norm": 0.8475058078765869, "learning_rate": 1.812543073742247e-05, "loss": 0.6384, "step": 2740 }, { "epoch": 0.9162085623854739, "grad_norm": 0.8099564909934998, "learning_rate": 1.7436250861474846e-05, "loss": 0.6216, "step": 2750 }, { "epoch": 0.9195402298850575, "grad_norm": 0.7477239966392517, "learning_rate": 1.6747070985527224e-05, "loss": 0.568, "step": 2760 }, { "epoch": 0.922871897384641, "grad_norm": 0.7421704530715942, "learning_rate": 1.60578911095796e-05, "loss": 0.5097, "step": 2770 }, { "epoch": 0.9262035648842245, "grad_norm": 1.0993244647979736, "learning_rate": 1.536871123363198e-05, "loss": 0.6455, "step": 2780 }, { "epoch": 0.9295352323838081, "grad_norm": 0.9197335839271545, "learning_rate": 1.4679531357684357e-05, "loss": 0.5122, "step": 2790 }, { "epoch": 0.9328668998833917, "grad_norm": 0.7237057089805603, "learning_rate": 1.3990351481736733e-05, "loss": 0.6608, "step": 2800 }, { "epoch": 0.9361985673829751, "grad_norm": 1.0517019033432007, "learning_rate": 1.3301171605789112e-05, "loss": 0.6283, "step": 2810 }, { "epoch": 0.9395302348825587, "grad_norm": 0.8161411285400391, "learning_rate": 1.2611991729841488e-05, "loss": 0.5655, "step": 2820 }, { "epoch": 0.9428619023821423, "grad_norm": 0.8740524053573608, "learning_rate": 1.1922811853893867e-05, "loss": 0.4393, "step": 2830 }, { "epoch": 0.9461935698817258, "grad_norm": 0.5465930700302124, "learning_rate": 1.1233631977946245e-05, "loss": 0.5458, "step": 2840 }, { "epoch": 0.9495252373813093, "grad_norm": 1.004461646080017, "learning_rate": 1.0544452101998622e-05, "loss": 0.6996, "step": 2850 }, { "epoch": 0.9528569048808929, "grad_norm": 0.9351420998573303, "learning_rate": 9.855272226051e-06, "loss": 0.6294, "step": 2860 }, { "epoch": 0.9561885723804764, "grad_norm": 0.9448681473731995, "learning_rate": 9.166092350103378e-06, "loss": 0.6851, "step": 2870 }, { "epoch": 0.95952023988006, "grad_norm": 0.9818257689476013, "learning_rate": 8.476912474155755e-06, "loss": 0.6429, "step": 2880 }, { "epoch": 0.9628519073796435, "grad_norm": 0.6631109714508057, "learning_rate": 7.787732598208133e-06, "loss": 0.5943, "step": 2890 }, { "epoch": 0.966183574879227, "grad_norm": 0.8403130769729614, "learning_rate": 7.0985527222605096e-06, "loss": 0.4675, "step": 2900 }, { "epoch": 0.9695152423788106, "grad_norm": 0.7020695805549622, "learning_rate": 6.409372846312887e-06, "loss": 0.4861, "step": 2910 }, { "epoch": 0.9728469098783942, "grad_norm": 1.1053721904754639, "learning_rate": 5.720192970365266e-06, "loss": 0.5639, "step": 2920 }, { "epoch": 0.9761785773779776, "grad_norm": 0.9987778663635254, "learning_rate": 5.031013094417643e-06, "loss": 0.5934, "step": 2930 }, { "epoch": 0.9795102448775612, "grad_norm": 0.9289517998695374, "learning_rate": 4.341833218470021e-06, "loss": 0.5889, "step": 2940 }, { "epoch": 0.9828419123771448, "grad_norm": 0.7212059497833252, "learning_rate": 3.6526533425223984e-06, "loss": 0.6013, "step": 2950 }, { "epoch": 0.9861735798767283, "grad_norm": 0.8436290621757507, "learning_rate": 2.9634734665747764e-06, "loss": 0.6237, "step": 2960 }, { "epoch": 0.9895052473763118, "grad_norm": 0.9585169553756714, "learning_rate": 2.2742935906271536e-06, "loss": 0.6193, "step": 2970 }, { "epoch": 0.9928369148758954, "grad_norm": 0.7987054586410522, "learning_rate": 1.5851137146795314e-06, "loss": 0.6871, "step": 2980 }, { "epoch": 0.9961685823754789, "grad_norm": 0.7917064428329468, "learning_rate": 8.959338387319091e-07, "loss": 0.6777, "step": 2990 }, { "epoch": 0.9995002498750625, "grad_norm": 0.8279238343238831, "learning_rate": 2.067539627842867e-07, "loss": 0.6662, "step": 3000 } ], "logging_steps": 10, "max_steps": 3002, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7125049258477568e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }