Files
mistral-nemo-lp-ai/checkpoint-3002/trainer_state.json

2135 lines
52 KiB
JSON
Raw Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3002,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0033316674995835416,
"grad_norm": 0.8176751732826233,
"learning_rate": 1.8e-05,
"loss": 2.2936,
"step": 10
},
{
"epoch": 0.006663334999167083,
"grad_norm": 0.5975170731544495,
"learning_rate": 3.8e-05,
"loss": 2.187,
"step": 20
},
{
"epoch": 0.009995002498750625,
"grad_norm": 0.5774023532867432,
"learning_rate": 5.8e-05,
"loss": 2.0133,
"step": 30
},
{
"epoch": 0.013326669998334166,
"grad_norm": 0.5514141917228699,
"learning_rate": 7.800000000000001e-05,
"loss": 1.9357,
"step": 40
},
{
"epoch": 0.01665833749791771,
"grad_norm": 0.599364161491394,
"learning_rate": 9.8e-05,
"loss": 1.8211,
"step": 50
},
{
"epoch": 0.01999000499750125,
"grad_norm": 0.8537599444389343,
"learning_rate": 0.000118,
"loss": 1.9177,
"step": 60
},
{
"epoch": 0.02332167249708479,
"grad_norm": 0.7055738568305969,
"learning_rate": 0.000138,
"loss": 1.8151,
"step": 70
},
{
"epoch": 0.026653339996668333,
"grad_norm": 0.7180108428001404,
"learning_rate": 0.00015800000000000002,
"loss": 1.7354,
"step": 80
},
{
"epoch": 0.029985007496251874,
"grad_norm": 0.9212970733642578,
"learning_rate": 0.00017800000000000002,
"loss": 1.6311,
"step": 90
},
{
"epoch": 0.03331667499583542,
"grad_norm": 0.8818132281303406,
"learning_rate": 0.00019800000000000002,
"loss": 1.6173,
"step": 100
},
{
"epoch": 0.036648342495418956,
"grad_norm": 0.7504797577857971,
"learning_rate": 0.00019937973811164715,
"loss": 1.698,
"step": 110
},
{
"epoch": 0.0399800099950025,
"grad_norm": 0.7597770094871521,
"learning_rate": 0.00019869055823569954,
"loss": 1.5597,
"step": 120
},
{
"epoch": 0.04331167749458604,
"grad_norm": 0.7281157374382019,
"learning_rate": 0.0001980013783597519,
"loss": 1.5941,
"step": 130
},
{
"epoch": 0.04664334499416958,
"grad_norm": 0.8265408277511597,
"learning_rate": 0.0001973121984838043,
"loss": 1.3635,
"step": 140
},
{
"epoch": 0.04997501249375312,
"grad_norm": 0.8942359089851379,
"learning_rate": 0.00019662301860785665,
"loss": 1.2472,
"step": 150
},
{
"epoch": 0.053306679993336666,
"grad_norm": 0.6996039152145386,
"learning_rate": 0.00019593383873190904,
"loss": 1.5311,
"step": 160
},
{
"epoch": 0.0566383474929202,
"grad_norm": 0.7954139709472656,
"learning_rate": 0.00019524465885596142,
"loss": 1.4262,
"step": 170
},
{
"epoch": 0.05997001499250375,
"grad_norm": 0.8379319310188293,
"learning_rate": 0.00019455547898001378,
"loss": 1.4121,
"step": 180
},
{
"epoch": 0.06330168249208729,
"grad_norm": 0.7172244191169739,
"learning_rate": 0.00019386629910406617,
"loss": 1.379,
"step": 190
},
{
"epoch": 0.06663334999167084,
"grad_norm": 0.6560420393943787,
"learning_rate": 0.00019317711922811853,
"loss": 1.3696,
"step": 200
},
{
"epoch": 0.06996501749125437,
"grad_norm": 0.6783866286277771,
"learning_rate": 0.00019248793935217092,
"loss": 1.3017,
"step": 210
},
{
"epoch": 0.07329668499083791,
"grad_norm": 0.6463010311126709,
"learning_rate": 0.0001917987594762233,
"loss": 1.278,
"step": 220
},
{
"epoch": 0.07662835249042145,
"grad_norm": 0.5589348673820496,
"learning_rate": 0.0001911095796002757,
"loss": 1.348,
"step": 230
},
{
"epoch": 0.079960019990005,
"grad_norm": 0.7754651308059692,
"learning_rate": 0.00019042039972432806,
"loss": 1.2947,
"step": 240
},
{
"epoch": 0.08329168748958854,
"grad_norm": 0.8420850038528442,
"learning_rate": 0.00018973121984838042,
"loss": 1.3621,
"step": 250
},
{
"epoch": 0.08662335498917208,
"grad_norm": 0.7481808662414551,
"learning_rate": 0.0001890420399724328,
"loss": 1.2055,
"step": 260
},
{
"epoch": 0.08995502248875563,
"grad_norm": 0.6213952898979187,
"learning_rate": 0.0001883528600964852,
"loss": 1.1484,
"step": 270
},
{
"epoch": 0.09328668998833917,
"grad_norm": 0.767515242099762,
"learning_rate": 0.00018766368022053758,
"loss": 1.2137,
"step": 280
},
{
"epoch": 0.0966183574879227,
"grad_norm": 0.5464005470275879,
"learning_rate": 0.00018697450034458994,
"loss": 1.0461,
"step": 290
},
{
"epoch": 0.09995002498750624,
"grad_norm": 0.6953014135360718,
"learning_rate": 0.00018628532046864233,
"loss": 1.0792,
"step": 300
},
{
"epoch": 0.1032816924870898,
"grad_norm": 0.6513417363166809,
"learning_rate": 0.0001855961405926947,
"loss": 1.3351,
"step": 310
},
{
"epoch": 0.10661335998667333,
"grad_norm": 0.754256546497345,
"learning_rate": 0.0001849069607167471,
"loss": 1.1128,
"step": 320
},
{
"epoch": 0.10994502748625687,
"grad_norm": 0.6703091859817505,
"learning_rate": 0.00018421778084079946,
"loss": 1.0618,
"step": 330
},
{
"epoch": 0.1132766949858404,
"grad_norm": 0.5775123834609985,
"learning_rate": 0.00018352860096485183,
"loss": 1.0024,
"step": 340
},
{
"epoch": 0.11660836248542396,
"grad_norm": 0.6067807078361511,
"learning_rate": 0.0001828394210889042,
"loss": 1.161,
"step": 350
},
{
"epoch": 0.1199400299850075,
"grad_norm": 0.7527532577514648,
"learning_rate": 0.00018215024121295657,
"loss": 1.0168,
"step": 360
},
{
"epoch": 0.12327169748459103,
"grad_norm": 0.7340474128723145,
"learning_rate": 0.000181461061337009,
"loss": 1.1902,
"step": 370
},
{
"epoch": 0.12660336498417457,
"grad_norm": 0.6416488289833069,
"learning_rate": 0.00018077188146106135,
"loss": 1.1745,
"step": 380
},
{
"epoch": 0.12993503248375812,
"grad_norm": 0.7203887701034546,
"learning_rate": 0.00018008270158511374,
"loss": 1.1377,
"step": 390
},
{
"epoch": 0.13326669998334167,
"grad_norm": 0.7556202411651611,
"learning_rate": 0.0001793935217091661,
"loss": 1.0205,
"step": 400
},
{
"epoch": 0.1365983674829252,
"grad_norm": 0.655582845211029,
"learning_rate": 0.00017870434183321846,
"loss": 1.1007,
"step": 410
},
{
"epoch": 0.13993003498250875,
"grad_norm": 0.4769359230995178,
"learning_rate": 0.00017801516195727087,
"loss": 1.0824,
"step": 420
},
{
"epoch": 0.14326170248209227,
"grad_norm": 0.7861637473106384,
"learning_rate": 0.00017732598208132323,
"loss": 1.086,
"step": 430
},
{
"epoch": 0.14659336998167583,
"grad_norm": 0.4841909110546112,
"learning_rate": 0.00017663680220537562,
"loss": 1.1964,
"step": 440
},
{
"epoch": 0.14992503748125938,
"grad_norm": 0.6953870058059692,
"learning_rate": 0.00017594762232942798,
"loss": 0.9949,
"step": 450
},
{
"epoch": 0.1532567049808429,
"grad_norm": 0.7086942791938782,
"learning_rate": 0.00017525844245348037,
"loss": 1.0006,
"step": 460
},
{
"epoch": 0.15658837248042645,
"grad_norm": 0.5370402336120605,
"learning_rate": 0.00017456926257753273,
"loss": 0.9224,
"step": 470
},
{
"epoch": 0.15992003998001,
"grad_norm": 0.5842561721801758,
"learning_rate": 0.00017388008270158512,
"loss": 1.049,
"step": 480
},
{
"epoch": 0.16325170747959353,
"grad_norm": 0.7351865768432617,
"learning_rate": 0.0001731909028256375,
"loss": 1.0031,
"step": 490
},
{
"epoch": 0.16658337497917708,
"grad_norm": 0.5930982232093811,
"learning_rate": 0.00017250172294968987,
"loss": 1.1909,
"step": 500
},
{
"epoch": 0.16991504247876063,
"grad_norm": 0.6230600476264954,
"learning_rate": 0.00017181254307374225,
"loss": 0.9673,
"step": 510
},
{
"epoch": 0.17324670997834415,
"grad_norm": 0.5371518731117249,
"learning_rate": 0.00017112336319779462,
"loss": 1.089,
"step": 520
},
{
"epoch": 0.1765783774779277,
"grad_norm": 0.5983089208602905,
"learning_rate": 0.00017043418332184703,
"loss": 1.0183,
"step": 530
},
{
"epoch": 0.17991004497751126,
"grad_norm": 0.8884322643280029,
"learning_rate": 0.0001697450034458994,
"loss": 1.037,
"step": 540
},
{
"epoch": 0.18324171247709478,
"grad_norm": 0.6050639748573303,
"learning_rate": 0.00016905582356995175,
"loss": 1.0937,
"step": 550
},
{
"epoch": 0.18657337997667833,
"grad_norm": 0.5100018382072449,
"learning_rate": 0.00016836664369400414,
"loss": 0.9618,
"step": 560
},
{
"epoch": 0.18990504747626186,
"grad_norm": 0.5553308725357056,
"learning_rate": 0.0001676774638180565,
"loss": 1.1016,
"step": 570
},
{
"epoch": 0.1932367149758454,
"grad_norm": 0.7883793115615845,
"learning_rate": 0.00016698828394210891,
"loss": 1.0927,
"step": 580
},
{
"epoch": 0.19656838247542896,
"grad_norm": 0.7052305340766907,
"learning_rate": 0.00016629910406616128,
"loss": 0.939,
"step": 590
},
{
"epoch": 0.19990004997501248,
"grad_norm": 0.6732206344604492,
"learning_rate": 0.00016560992419021366,
"loss": 1.0596,
"step": 600
},
{
"epoch": 0.20323171747459604,
"grad_norm": 0.7498496174812317,
"learning_rate": 0.00016492074431426602,
"loss": 0.9612,
"step": 610
},
{
"epoch": 0.2065633849741796,
"grad_norm": 0.7365151047706604,
"learning_rate": 0.0001642315644383184,
"loss": 1.0503,
"step": 620
},
{
"epoch": 0.2098950524737631,
"grad_norm": 0.5978183746337891,
"learning_rate": 0.0001635423845623708,
"loss": 1.07,
"step": 630
},
{
"epoch": 0.21322671997334666,
"grad_norm": 0.7099848389625549,
"learning_rate": 0.00016285320468642316,
"loss": 0.9538,
"step": 640
},
{
"epoch": 0.2165583874729302,
"grad_norm": 0.6647136807441711,
"learning_rate": 0.00016216402481047555,
"loss": 0.9772,
"step": 650
},
{
"epoch": 0.21989005497251374,
"grad_norm": 0.6482895612716675,
"learning_rate": 0.0001614748449345279,
"loss": 1.0033,
"step": 660
},
{
"epoch": 0.2232217224720973,
"grad_norm": 0.583042323589325,
"learning_rate": 0.0001607856650585803,
"loss": 0.8133,
"step": 670
},
{
"epoch": 0.2265533899716808,
"grad_norm": 0.6488454341888428,
"learning_rate": 0.00016009648518263268,
"loss": 0.9405,
"step": 680
},
{
"epoch": 0.22988505747126436,
"grad_norm": 0.5041667819023132,
"learning_rate": 0.00015940730530668507,
"loss": 1.0427,
"step": 690
},
{
"epoch": 0.23321672497084792,
"grad_norm": 0.6406366229057312,
"learning_rate": 0.00015871812543073743,
"loss": 0.9604,
"step": 700
},
{
"epoch": 0.23654839247043144,
"grad_norm": 0.6386472582817078,
"learning_rate": 0.0001580289455547898,
"loss": 0.8826,
"step": 710
},
{
"epoch": 0.239880059970015,
"grad_norm": 0.7378501892089844,
"learning_rate": 0.00015733976567884218,
"loss": 0.9141,
"step": 720
},
{
"epoch": 0.24321172746959854,
"grad_norm": 0.6911444067955017,
"learning_rate": 0.00015665058580289457,
"loss": 0.8735,
"step": 730
},
{
"epoch": 0.24654339496918207,
"grad_norm": 0.7621609568595886,
"learning_rate": 0.00015596140592694696,
"loss": 0.8972,
"step": 740
},
{
"epoch": 0.24987506246876562,
"grad_norm": 0.5592761039733887,
"learning_rate": 0.00015527222605099932,
"loss": 0.8612,
"step": 750
},
{
"epoch": 0.25320672996834914,
"grad_norm": 0.4869195520877838,
"learning_rate": 0.0001545830461750517,
"loss": 0.8663,
"step": 760
},
{
"epoch": 0.2565383974679327,
"grad_norm": 0.8638468980789185,
"learning_rate": 0.00015389386629910407,
"loss": 0.9393,
"step": 770
},
{
"epoch": 0.25987006496751625,
"grad_norm": 0.5576454997062683,
"learning_rate": 0.00015320468642315645,
"loss": 0.8925,
"step": 780
},
{
"epoch": 0.26320173246709977,
"grad_norm": 0.6767865419387817,
"learning_rate": 0.00015251550654720884,
"loss": 0.8692,
"step": 790
},
{
"epoch": 0.26653339996668335,
"grad_norm": 0.8351936340332031,
"learning_rate": 0.0001518263266712612,
"loss": 0.9813,
"step": 800
},
{
"epoch": 0.2698650674662669,
"grad_norm": 0.6378228068351746,
"learning_rate": 0.0001511371467953136,
"loss": 0.7901,
"step": 810
},
{
"epoch": 0.2731967349658504,
"grad_norm": 0.6529081463813782,
"learning_rate": 0.00015044796691936595,
"loss": 0.9269,
"step": 820
},
{
"epoch": 0.276528402465434,
"grad_norm": 0.7450738549232483,
"learning_rate": 0.00014975878704341834,
"loss": 0.9723,
"step": 830
},
{
"epoch": 0.2798600699650175,
"grad_norm": 0.7390174865722656,
"learning_rate": 0.00014906960716747073,
"loss": 0.8644,
"step": 840
},
{
"epoch": 0.283191737464601,
"grad_norm": 0.6609140634536743,
"learning_rate": 0.0001483804272915231,
"loss": 0.9261,
"step": 850
},
{
"epoch": 0.28652340496418455,
"grad_norm": 0.8164989352226257,
"learning_rate": 0.00014769124741557547,
"loss": 0.913,
"step": 860
},
{
"epoch": 0.2898550724637681,
"grad_norm": 0.7193732261657715,
"learning_rate": 0.00014700206753962783,
"loss": 0.8622,
"step": 870
},
{
"epoch": 0.29318673996335165,
"grad_norm": 0.5452458262443542,
"learning_rate": 0.00014631288766368022,
"loss": 0.7792,
"step": 880
},
{
"epoch": 0.2965184074629352,
"grad_norm": 0.7681081295013428,
"learning_rate": 0.0001456237077877326,
"loss": 0.965,
"step": 890
},
{
"epoch": 0.29985007496251875,
"grad_norm": 0.6345311999320984,
"learning_rate": 0.000144934527911785,
"loss": 0.7115,
"step": 900
},
{
"epoch": 0.3031817424621023,
"grad_norm": 0.6799845695495605,
"learning_rate": 0.00014424534803583736,
"loss": 0.9525,
"step": 910
},
{
"epoch": 0.3065134099616858,
"grad_norm": 0.6358153223991394,
"learning_rate": 0.00014355616815988975,
"loss": 0.9661,
"step": 920
},
{
"epoch": 0.3098450774612694,
"grad_norm": 0.8221323490142822,
"learning_rate": 0.0001428669882839421,
"loss": 1.0507,
"step": 930
},
{
"epoch": 0.3131767449608529,
"grad_norm": 0.8563844561576843,
"learning_rate": 0.0001421778084079945,
"loss": 0.8268,
"step": 940
},
{
"epoch": 0.31650841246043643,
"grad_norm": 0.6171509027481079,
"learning_rate": 0.00014148862853204688,
"loss": 0.8938,
"step": 950
},
{
"epoch": 0.31984007996002,
"grad_norm": 0.6679477095603943,
"learning_rate": 0.00014079944865609924,
"loss": 0.9604,
"step": 960
},
{
"epoch": 0.32317174745960353,
"grad_norm": 0.7955806851387024,
"learning_rate": 0.00014011026878015163,
"loss": 0.9084,
"step": 970
},
{
"epoch": 0.32650341495918705,
"grad_norm": 0.6949059367179871,
"learning_rate": 0.000139421088904204,
"loss": 0.8687,
"step": 980
},
{
"epoch": 0.32983508245877063,
"grad_norm": 0.6657271385192871,
"learning_rate": 0.00013873190902825638,
"loss": 0.9217,
"step": 990
},
{
"epoch": 0.33316674995835416,
"grad_norm": 0.8809479475021362,
"learning_rate": 0.00013804272915230877,
"loss": 0.7875,
"step": 1000
},
{
"epoch": 0.3364984174579377,
"grad_norm": 0.47438332438468933,
"learning_rate": 0.00013735354927636113,
"loss": 0.8885,
"step": 1010
},
{
"epoch": 0.33983008495752126,
"grad_norm": 0.7127712368965149,
"learning_rate": 0.00013666436940041352,
"loss": 0.8514,
"step": 1020
},
{
"epoch": 0.3431617524571048,
"grad_norm": 0.7310017347335815,
"learning_rate": 0.00013597518952446588,
"loss": 0.8137,
"step": 1030
},
{
"epoch": 0.3464934199566883,
"grad_norm": 0.7233092188835144,
"learning_rate": 0.00013528600964851826,
"loss": 0.7911,
"step": 1040
},
{
"epoch": 0.3498250874562719,
"grad_norm": 0.7451456785202026,
"learning_rate": 0.00013459682977257065,
"loss": 0.9467,
"step": 1050
},
{
"epoch": 0.3531567549558554,
"grad_norm": 0.796917736530304,
"learning_rate": 0.00013390764989662304,
"loss": 0.9069,
"step": 1060
},
{
"epoch": 0.35648842245543894,
"grad_norm": 0.571403443813324,
"learning_rate": 0.0001332184700206754,
"loss": 0.8283,
"step": 1070
},
{
"epoch": 0.3598200899550225,
"grad_norm": 0.6184263825416565,
"learning_rate": 0.0001325292901447278,
"loss": 0.875,
"step": 1080
},
{
"epoch": 0.36315175745460604,
"grad_norm": 0.7700721025466919,
"learning_rate": 0.00013184011026878015,
"loss": 0.8443,
"step": 1090
},
{
"epoch": 0.36648342495418956,
"grad_norm": 0.8920392394065857,
"learning_rate": 0.00013115093039283254,
"loss": 0.8516,
"step": 1100
},
{
"epoch": 0.3698150924537731,
"grad_norm": 0.6632056832313538,
"learning_rate": 0.00013046175051688492,
"loss": 0.7407,
"step": 1110
},
{
"epoch": 0.37314675995335667,
"grad_norm": 0.677737832069397,
"learning_rate": 0.00012977257064093728,
"loss": 0.9112,
"step": 1120
},
{
"epoch": 0.3764784274529402,
"grad_norm": 0.7659761309623718,
"learning_rate": 0.00012908339076498967,
"loss": 0.812,
"step": 1130
},
{
"epoch": 0.3798100949525237,
"grad_norm": 0.6237064003944397,
"learning_rate": 0.00012839421088904203,
"loss": 0.8422,
"step": 1140
},
{
"epoch": 0.3831417624521073,
"grad_norm": 0.8118287920951843,
"learning_rate": 0.00012770503101309442,
"loss": 0.8597,
"step": 1150
},
{
"epoch": 0.3864734299516908,
"grad_norm": 0.7423121333122253,
"learning_rate": 0.0001270158511371468,
"loss": 0.7809,
"step": 1160
},
{
"epoch": 0.38980509745127434,
"grad_norm": 0.7867801785469055,
"learning_rate": 0.00012632667126119917,
"loss": 1.0343,
"step": 1170
},
{
"epoch": 0.3931367649508579,
"grad_norm": 0.7463882565498352,
"learning_rate": 0.00012563749138525156,
"loss": 0.8384,
"step": 1180
},
{
"epoch": 0.39646843245044144,
"grad_norm": 0.68085777759552,
"learning_rate": 0.00012494831150930392,
"loss": 0.9429,
"step": 1190
},
{
"epoch": 0.39980009995002497,
"grad_norm": 0.741705060005188,
"learning_rate": 0.0001242591316333563,
"loss": 0.9062,
"step": 1200
},
{
"epoch": 0.40313176744960855,
"grad_norm": 0.6207161545753479,
"learning_rate": 0.0001235699517574087,
"loss": 0.8259,
"step": 1210
},
{
"epoch": 0.40646343494919207,
"grad_norm": 0.6957824230194092,
"learning_rate": 0.00012288077188146108,
"loss": 0.9225,
"step": 1220
},
{
"epoch": 0.4097951024487756,
"grad_norm": 0.7893931865692139,
"learning_rate": 0.00012219159200551344,
"loss": 0.7725,
"step": 1230
},
{
"epoch": 0.4131267699483592,
"grad_norm": 0.7295857071876526,
"learning_rate": 0.00012150241212956582,
"loss": 0.7769,
"step": 1240
},
{
"epoch": 0.4164584374479427,
"grad_norm": 0.8230463862419128,
"learning_rate": 0.0001208132322536182,
"loss": 0.8399,
"step": 1250
},
{
"epoch": 0.4197901049475262,
"grad_norm": 0.8607476949691772,
"learning_rate": 0.00012012405237767056,
"loss": 0.6551,
"step": 1260
},
{
"epoch": 0.4231217724471098,
"grad_norm": 0.7293261885643005,
"learning_rate": 0.00011943487250172297,
"loss": 0.7303,
"step": 1270
},
{
"epoch": 0.4264534399466933,
"grad_norm": 0.5877302289009094,
"learning_rate": 0.00011874569262577533,
"loss": 0.7893,
"step": 1280
},
{
"epoch": 0.42978510744627685,
"grad_norm": 0.5563659071922302,
"learning_rate": 0.00011805651274982771,
"loss": 0.7517,
"step": 1290
},
{
"epoch": 0.4331167749458604,
"grad_norm": 0.6885454654693604,
"learning_rate": 0.00011736733287388009,
"loss": 0.6623,
"step": 1300
},
{
"epoch": 0.43644844244544395,
"grad_norm": 0.8266370892524719,
"learning_rate": 0.00011667815299793245,
"loss": 0.8886,
"step": 1310
},
{
"epoch": 0.4397801099450275,
"grad_norm": 0.5514026880264282,
"learning_rate": 0.00011598897312198485,
"loss": 0.6652,
"step": 1320
},
{
"epoch": 0.44311177744461105,
"grad_norm": 0.7804675102233887,
"learning_rate": 0.00011529979324603721,
"loss": 0.911,
"step": 1330
},
{
"epoch": 0.4464434449441946,
"grad_norm": 0.7426096200942993,
"learning_rate": 0.0001146106133700896,
"loss": 0.7434,
"step": 1340
},
{
"epoch": 0.4497751124437781,
"grad_norm": 0.6535798907279968,
"learning_rate": 0.00011392143349414197,
"loss": 0.8465,
"step": 1350
},
{
"epoch": 0.4531067799433616,
"grad_norm": 0.665757417678833,
"learning_rate": 0.00011323225361819436,
"loss": 0.8312,
"step": 1360
},
{
"epoch": 0.4564384474429452,
"grad_norm": 0.64393550157547,
"learning_rate": 0.00011254307374224673,
"loss": 0.6107,
"step": 1370
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.6122268438339233,
"learning_rate": 0.00011185389386629912,
"loss": 0.8532,
"step": 1380
},
{
"epoch": 0.46310178244211225,
"grad_norm": 0.6691811680793762,
"learning_rate": 0.00011116471399035148,
"loss": 0.7827,
"step": 1390
},
{
"epoch": 0.46643344994169583,
"grad_norm": 0.611470639705658,
"learning_rate": 0.00011047553411440386,
"loss": 0.7598,
"step": 1400
},
{
"epoch": 0.46976511744127936,
"grad_norm": 0.8762800097465515,
"learning_rate": 0.00010978635423845624,
"loss": 0.8613,
"step": 1410
},
{
"epoch": 0.4730967849408629,
"grad_norm": 0.710159182548523,
"learning_rate": 0.00010909717436250862,
"loss": 0.7768,
"step": 1420
},
{
"epoch": 0.47642845244044646,
"grad_norm": 0.9778875708580017,
"learning_rate": 0.00010840799448656101,
"loss": 0.8116,
"step": 1430
},
{
"epoch": 0.47976011994003,
"grad_norm": 0.6341977119445801,
"learning_rate": 0.00010771881461061337,
"loss": 0.7059,
"step": 1440
},
{
"epoch": 0.4830917874396135,
"grad_norm": 0.7075402736663818,
"learning_rate": 0.00010702963473466577,
"loss": 0.9244,
"step": 1450
},
{
"epoch": 0.4864234549391971,
"grad_norm": 0.6109429001808167,
"learning_rate": 0.00010634045485871813,
"loss": 0.802,
"step": 1460
},
{
"epoch": 0.4897551224387806,
"grad_norm": 0.7478988170623779,
"learning_rate": 0.0001056512749827705,
"loss": 0.8911,
"step": 1470
},
{
"epoch": 0.49308678993836413,
"grad_norm": 0.6688179969787598,
"learning_rate": 0.00010496209510682289,
"loss": 0.7508,
"step": 1480
},
{
"epoch": 0.4964184574379477,
"grad_norm": 0.7863402962684631,
"learning_rate": 0.00010427291523087525,
"loss": 0.8439,
"step": 1490
},
{
"epoch": 0.49975012493753124,
"grad_norm": 0.8334706425666809,
"learning_rate": 0.00010358373535492765,
"loss": 0.7596,
"step": 1500
},
{
"epoch": 0.5030817924371148,
"grad_norm": 0.8061437010765076,
"learning_rate": 0.00010289455547898001,
"loss": 0.7742,
"step": 1510
},
{
"epoch": 0.5064134599366983,
"grad_norm": 0.7398769855499268,
"learning_rate": 0.0001022053756030324,
"loss": 0.7446,
"step": 1520
},
{
"epoch": 0.5097451274362819,
"grad_norm": 0.5730561017990112,
"learning_rate": 0.00010151619572708478,
"loss": 0.784,
"step": 1530
},
{
"epoch": 0.5130767949358654,
"grad_norm": 0.6701236963272095,
"learning_rate": 0.00010082701585113714,
"loss": 0.8104,
"step": 1540
},
{
"epoch": 0.5164084624354489,
"grad_norm": 0.681547999382019,
"learning_rate": 0.00010013783597518952,
"loss": 0.6719,
"step": 1550
},
{
"epoch": 0.5197401299350325,
"grad_norm": 0.6569002270698547,
"learning_rate": 9.944865609924191e-05,
"loss": 0.7091,
"step": 1560
},
{
"epoch": 0.5230717974346161,
"grad_norm": 0.8231265544891357,
"learning_rate": 9.875947622329429e-05,
"loss": 0.8079,
"step": 1570
},
{
"epoch": 0.5264034649341995,
"grad_norm": 0.8120758533477783,
"learning_rate": 9.807029634734666e-05,
"loss": 0.8125,
"step": 1580
},
{
"epoch": 0.5297351324337831,
"grad_norm": 0.6406270861625671,
"learning_rate": 9.738111647139903e-05,
"loss": 0.6543,
"step": 1590
},
{
"epoch": 0.5330667999333667,
"grad_norm": 0.8023959398269653,
"learning_rate": 9.669193659545141e-05,
"loss": 0.8343,
"step": 1600
},
{
"epoch": 0.5363984674329502,
"grad_norm": 0.7827622294425964,
"learning_rate": 9.60027567195038e-05,
"loss": 0.7749,
"step": 1610
},
{
"epoch": 0.5397301349325337,
"grad_norm": 0.5446188449859619,
"learning_rate": 9.531357684355617e-05,
"loss": 0.6603,
"step": 1620
},
{
"epoch": 0.5430618024321173,
"grad_norm": 0.7404822707176208,
"learning_rate": 9.462439696760856e-05,
"loss": 0.8172,
"step": 1630
},
{
"epoch": 0.5463934699317008,
"grad_norm": 0.7257384061813354,
"learning_rate": 9.393521709166093e-05,
"loss": 0.864,
"step": 1640
},
{
"epoch": 0.5497251374312844,
"grad_norm": 0.8640374541282654,
"learning_rate": 9.324603721571331e-05,
"loss": 0.6451,
"step": 1650
},
{
"epoch": 0.553056804930868,
"grad_norm": 0.6205821633338928,
"learning_rate": 9.255685733976568e-05,
"loss": 0.8729,
"step": 1660
},
{
"epoch": 0.5563884724304514,
"grad_norm": 0.7128989696502686,
"learning_rate": 9.186767746381806e-05,
"loss": 0.7218,
"step": 1670
},
{
"epoch": 0.559720139930035,
"grad_norm": 0.6116006970405579,
"learning_rate": 9.117849758787044e-05,
"loss": 0.7591,
"step": 1680
},
{
"epoch": 0.5630518074296186,
"grad_norm": 0.8077837228775024,
"learning_rate": 9.048931771192282e-05,
"loss": 0.7267,
"step": 1690
},
{
"epoch": 0.566383474929202,
"grad_norm": 0.8824722766876221,
"learning_rate": 8.980013783597519e-05,
"loss": 0.6545,
"step": 1700
},
{
"epoch": 0.5697151424287856,
"grad_norm": 0.9038705229759216,
"learning_rate": 8.911095796002758e-05,
"loss": 0.7376,
"step": 1710
},
{
"epoch": 0.5730468099283691,
"grad_norm": 0.7288265228271484,
"learning_rate": 8.842177808407995e-05,
"loss": 0.8062,
"step": 1720
},
{
"epoch": 0.5763784774279527,
"grad_norm": 0.6127156019210815,
"learning_rate": 8.773259820813233e-05,
"loss": 0.7298,
"step": 1730
},
{
"epoch": 0.5797101449275363,
"grad_norm": 0.7607082724571228,
"learning_rate": 8.70434183321847e-05,
"loss": 0.7417,
"step": 1740
},
{
"epoch": 0.5830418124271197,
"grad_norm": 0.8536520004272461,
"learning_rate": 8.635423845623708e-05,
"loss": 0.6495,
"step": 1750
},
{
"epoch": 0.5863734799267033,
"grad_norm": 0.5629620552062988,
"learning_rate": 8.566505858028946e-05,
"loss": 0.5853,
"step": 1760
},
{
"epoch": 0.5897051474262869,
"grad_norm": 0.8041568398475647,
"learning_rate": 8.497587870434184e-05,
"loss": 0.7068,
"step": 1770
},
{
"epoch": 0.5930368149258703,
"grad_norm": 0.9616042375564575,
"learning_rate": 8.428669882839421e-05,
"loss": 0.7051,
"step": 1780
},
{
"epoch": 0.5963684824254539,
"grad_norm": 0.6616283655166626,
"learning_rate": 8.35975189524466e-05,
"loss": 0.7698,
"step": 1790
},
{
"epoch": 0.5997001499250375,
"grad_norm": 0.7523969411849976,
"learning_rate": 8.290833907649897e-05,
"loss": 0.69,
"step": 1800
},
{
"epoch": 0.603031817424621,
"grad_norm": 0.5366020202636719,
"learning_rate": 8.221915920055135e-05,
"loss": 0.6432,
"step": 1810
},
{
"epoch": 0.6063634849242046,
"grad_norm": 0.7098552584648132,
"learning_rate": 8.152997932460372e-05,
"loss": 0.6767,
"step": 1820
},
{
"epoch": 0.6096951524237881,
"grad_norm": 0.5750883221626282,
"learning_rate": 8.08407994486561e-05,
"loss": 0.6853,
"step": 1830
},
{
"epoch": 0.6130268199233716,
"grad_norm": 0.6619908213615417,
"learning_rate": 8.015161957270849e-05,
"loss": 0.6461,
"step": 1840
},
{
"epoch": 0.6163584874229552,
"grad_norm": 0.6529950499534607,
"learning_rate": 7.946243969676086e-05,
"loss": 0.563,
"step": 1850
},
{
"epoch": 0.6196901549225388,
"grad_norm": 0.6484772562980652,
"learning_rate": 7.877325982081323e-05,
"loss": 0.8557,
"step": 1860
},
{
"epoch": 0.6230218224221222,
"grad_norm": 0.7002941370010376,
"learning_rate": 7.808407994486562e-05,
"loss": 0.6468,
"step": 1870
},
{
"epoch": 0.6263534899217058,
"grad_norm": 0.6880629658699036,
"learning_rate": 7.739490006891798e-05,
"loss": 0.8108,
"step": 1880
},
{
"epoch": 0.6296851574212894,
"grad_norm": 0.7958945035934448,
"learning_rate": 7.670572019297037e-05,
"loss": 0.6569,
"step": 1890
},
{
"epoch": 0.6330168249208729,
"grad_norm": 0.6312280297279358,
"learning_rate": 7.601654031702274e-05,
"loss": 0.7134,
"step": 1900
},
{
"epoch": 0.6363484924204564,
"grad_norm": 0.5090949535369873,
"learning_rate": 7.532736044107512e-05,
"loss": 0.709,
"step": 1910
},
{
"epoch": 0.63968015992004,
"grad_norm": 0.8009600043296814,
"learning_rate": 7.46381805651275e-05,
"loss": 0.7303,
"step": 1920
},
{
"epoch": 0.6430118274196235,
"grad_norm": 0.6147052049636841,
"learning_rate": 7.394900068917988e-05,
"loss": 0.7837,
"step": 1930
},
{
"epoch": 0.6463434949192071,
"grad_norm": 1.0245405435562134,
"learning_rate": 7.325982081323225e-05,
"loss": 0.6561,
"step": 1940
},
{
"epoch": 0.6496751624187906,
"grad_norm": 0.7784261107444763,
"learning_rate": 7.257064093728464e-05,
"loss": 0.7258,
"step": 1950
},
{
"epoch": 0.6530068299183741,
"grad_norm": 0.8354228138923645,
"learning_rate": 7.1881461061337e-05,
"loss": 0.6183,
"step": 1960
},
{
"epoch": 0.6563384974179577,
"grad_norm": 0.6210038661956787,
"learning_rate": 7.119228118538939e-05,
"loss": 0.5344,
"step": 1970
},
{
"epoch": 0.6596701649175413,
"grad_norm": 0.7484562397003174,
"learning_rate": 7.050310130944176e-05,
"loss": 0.645,
"step": 1980
},
{
"epoch": 0.6630018324171247,
"grad_norm": 0.4157319962978363,
"learning_rate": 6.981392143349414e-05,
"loss": 0.5934,
"step": 1990
},
{
"epoch": 0.6663334999167083,
"grad_norm": 0.8641183376312256,
"learning_rate": 6.912474155754653e-05,
"loss": 0.6435,
"step": 2000
},
{
"epoch": 0.6696651674162919,
"grad_norm": 0.6255794167518616,
"learning_rate": 6.84355616815989e-05,
"loss": 0.5546,
"step": 2010
},
{
"epoch": 0.6729968349158754,
"grad_norm": 0.6411312222480774,
"learning_rate": 6.774638180565129e-05,
"loss": 0.7123,
"step": 2020
},
{
"epoch": 0.6763285024154589,
"grad_norm": 0.5700286626815796,
"learning_rate": 6.705720192970366e-05,
"loss": 0.7319,
"step": 2030
},
{
"epoch": 0.6796601699150425,
"grad_norm": 0.898933470249176,
"learning_rate": 6.636802205375602e-05,
"loss": 0.885,
"step": 2040
},
{
"epoch": 0.682991837414626,
"grad_norm": 0.8384907245635986,
"learning_rate": 6.567884217780841e-05,
"loss": 0.6653,
"step": 2050
},
{
"epoch": 0.6863235049142096,
"grad_norm": 0.5363606214523315,
"learning_rate": 6.498966230186079e-05,
"loss": 0.6261,
"step": 2060
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.8651995062828064,
"learning_rate": 6.430048242591317e-05,
"loss": 0.6276,
"step": 2070
},
{
"epoch": 0.6929868399133766,
"grad_norm": 1.0924036502838135,
"learning_rate": 6.361130254996555e-05,
"loss": 0.5616,
"step": 2080
},
{
"epoch": 0.6963185074129602,
"grad_norm": 0.8714765906333923,
"learning_rate": 6.292212267401792e-05,
"loss": 0.6231,
"step": 2090
},
{
"epoch": 0.6996501749125438,
"grad_norm": 0.9532473683357239,
"learning_rate": 6.223294279807031e-05,
"loss": 0.7391,
"step": 2100
},
{
"epoch": 0.7029818424121272,
"grad_norm": 0.5686549544334412,
"learning_rate": 6.154376292212267e-05,
"loss": 0.7223,
"step": 2110
},
{
"epoch": 0.7063135099117108,
"grad_norm": 0.883965015411377,
"learning_rate": 6.085458304617505e-05,
"loss": 0.6979,
"step": 2120
},
{
"epoch": 0.7096451774112944,
"grad_norm": 0.8032324314117432,
"learning_rate": 6.016540317022743e-05,
"loss": 0.6049,
"step": 2130
},
{
"epoch": 0.7129768449108779,
"grad_norm": 0.6529830098152161,
"learning_rate": 5.9476223294279806e-05,
"loss": 0.6186,
"step": 2140
},
{
"epoch": 0.7163085124104615,
"grad_norm": 0.7644656896591187,
"learning_rate": 5.878704341833219e-05,
"loss": 0.6225,
"step": 2150
},
{
"epoch": 0.719640179910045,
"grad_norm": 0.7979158759117126,
"learning_rate": 5.809786354238457e-05,
"loss": 0.6094,
"step": 2160
},
{
"epoch": 0.7229718474096285,
"grad_norm": 0.9250969886779785,
"learning_rate": 5.740868366643695e-05,
"loss": 0.5802,
"step": 2170
},
{
"epoch": 0.7263035149092121,
"grad_norm": 0.8039131760597229,
"learning_rate": 5.671950379048932e-05,
"loss": 0.7577,
"step": 2180
},
{
"epoch": 0.7296351824087957,
"grad_norm": 0.7784701585769653,
"learning_rate": 5.603032391454169e-05,
"loss": 0.8382,
"step": 2190
},
{
"epoch": 0.7329668499083791,
"grad_norm": 0.6515526175498962,
"learning_rate": 5.534114403859407e-05,
"loss": 0.6808,
"step": 2200
},
{
"epoch": 0.7362985174079627,
"grad_norm": 1.0747514963150024,
"learning_rate": 5.465196416264645e-05,
"loss": 0.6112,
"step": 2210
},
{
"epoch": 0.7396301849075462,
"grad_norm": 1.031267762184143,
"learning_rate": 5.3962784286698834e-05,
"loss": 0.6316,
"step": 2220
},
{
"epoch": 0.7429618524071298,
"grad_norm": 0.8532452583312988,
"learning_rate": 5.327360441075121e-05,
"loss": 0.5855,
"step": 2230
},
{
"epoch": 0.7462935199067133,
"grad_norm": 0.7305378317832947,
"learning_rate": 5.258442453480359e-05,
"loss": 0.5746,
"step": 2240
},
{
"epoch": 0.7496251874062968,
"grad_norm": 0.7505248188972473,
"learning_rate": 5.189524465885597e-05,
"loss": 0.6912,
"step": 2250
},
{
"epoch": 0.7529568549058804,
"grad_norm": 0.8554951548576355,
"learning_rate": 5.120606478290834e-05,
"loss": 0.5665,
"step": 2260
},
{
"epoch": 0.756288522405464,
"grad_norm": 0.9799861311912537,
"learning_rate": 5.051688490696072e-05,
"loss": 0.7554,
"step": 2270
},
{
"epoch": 0.7596201899050474,
"grad_norm": 0.6496158242225647,
"learning_rate": 4.982770503101309e-05,
"loss": 0.4964,
"step": 2280
},
{
"epoch": 0.762951857404631,
"grad_norm": 0.7765501141548157,
"learning_rate": 4.9138525155065474e-05,
"loss": 0.6501,
"step": 2290
},
{
"epoch": 0.7662835249042146,
"grad_norm": 0.926641047000885,
"learning_rate": 4.8449345279117855e-05,
"loss": 0.6377,
"step": 2300
},
{
"epoch": 0.769615192403798,
"grad_norm": 0.4838825464248657,
"learning_rate": 4.776016540317023e-05,
"loss": 0.6421,
"step": 2310
},
{
"epoch": 0.7729468599033816,
"grad_norm": 1.0005497932434082,
"learning_rate": 4.70709855272226e-05,
"loss": 0.663,
"step": 2320
},
{
"epoch": 0.7762785274029652,
"grad_norm": 0.8331218957901001,
"learning_rate": 4.6381805651274984e-05,
"loss": 0.6195,
"step": 2330
},
{
"epoch": 0.7796101949025487,
"grad_norm": 0.6971142888069153,
"learning_rate": 4.5692625775327365e-05,
"loss": 0.5667,
"step": 2340
},
{
"epoch": 0.7829418624021323,
"grad_norm": 0.7409766316413879,
"learning_rate": 4.5003445899379746e-05,
"loss": 0.5932,
"step": 2350
},
{
"epoch": 0.7862735299017158,
"grad_norm": 0.8003771305084229,
"learning_rate": 4.431426602343211e-05,
"loss": 0.6524,
"step": 2360
},
{
"epoch": 0.7896051974012993,
"grad_norm": 0.8950629234313965,
"learning_rate": 4.3625086147484494e-05,
"loss": 0.6569,
"step": 2370
},
{
"epoch": 0.7929368649008829,
"grad_norm": 0.6981242895126343,
"learning_rate": 4.2935906271536875e-05,
"loss": 0.6612,
"step": 2380
},
{
"epoch": 0.7962685324004665,
"grad_norm": 0.7851802706718445,
"learning_rate": 4.2246726395589256e-05,
"loss": 0.6551,
"step": 2390
},
{
"epoch": 0.7996001999000499,
"grad_norm": 0.7606090903282166,
"learning_rate": 4.1557546519641624e-05,
"loss": 0.6059,
"step": 2400
},
{
"epoch": 0.8029318673996335,
"grad_norm": 0.8207200169563293,
"learning_rate": 4.0868366643694005e-05,
"loss": 0.6677,
"step": 2410
},
{
"epoch": 0.8062635348992171,
"grad_norm": 0.9407020211219788,
"learning_rate": 4.0179186767746386e-05,
"loss": 0.552,
"step": 2420
},
{
"epoch": 0.8095952023988006,
"grad_norm": 0.6682471632957458,
"learning_rate": 3.949000689179876e-05,
"loss": 0.5273,
"step": 2430
},
{
"epoch": 0.8129268698983841,
"grad_norm": 0.9708258509635925,
"learning_rate": 3.880082701585114e-05,
"loss": 0.638,
"step": 2440
},
{
"epoch": 0.8162585373979677,
"grad_norm": 0.5546590685844421,
"learning_rate": 3.8111647139903515e-05,
"loss": 0.6464,
"step": 2450
},
{
"epoch": 0.8195902048975512,
"grad_norm": 0.7525760531425476,
"learning_rate": 3.7422467263955896e-05,
"loss": 0.383,
"step": 2460
},
{
"epoch": 0.8229218723971348,
"grad_norm": 0.8465914726257324,
"learning_rate": 3.673328738800827e-05,
"loss": 0.664,
"step": 2470
},
{
"epoch": 0.8262535398967183,
"grad_norm": 0.9415400624275208,
"learning_rate": 3.604410751206065e-05,
"loss": 0.5489,
"step": 2480
},
{
"epoch": 0.8295852073963018,
"grad_norm": 0.699641764163971,
"learning_rate": 3.5354927636113026e-05,
"loss": 0.5779,
"step": 2490
},
{
"epoch": 0.8329168748958854,
"grad_norm": 0.6599105596542358,
"learning_rate": 3.4665747760165406e-05,
"loss": 0.5486,
"step": 2500
},
{
"epoch": 0.836248542395469,
"grad_norm": 0.8070369362831116,
"learning_rate": 3.397656788421778e-05,
"loss": 0.7153,
"step": 2510
},
{
"epoch": 0.8395802098950524,
"grad_norm": 0.7151026129722595,
"learning_rate": 3.328738800827016e-05,
"loss": 0.5442,
"step": 2520
},
{
"epoch": 0.842911877394636,
"grad_norm": 1.014334797859192,
"learning_rate": 3.2598208132322536e-05,
"loss": 0.6155,
"step": 2530
},
{
"epoch": 0.8462435448942196,
"grad_norm": 0.7710210084915161,
"learning_rate": 3.190902825637492e-05,
"loss": 0.6111,
"step": 2540
},
{
"epoch": 0.8495752123938031,
"grad_norm": 0.7453182935714722,
"learning_rate": 3.121984838042729e-05,
"loss": 0.5853,
"step": 2550
},
{
"epoch": 0.8529068798933866,
"grad_norm": 0.9670674800872803,
"learning_rate": 3.053066850447967e-05,
"loss": 0.5902,
"step": 2560
},
{
"epoch": 0.8562385473929702,
"grad_norm": 0.9570378661155701,
"learning_rate": 2.984148862853205e-05,
"loss": 0.6738,
"step": 2570
},
{
"epoch": 0.8595702148925537,
"grad_norm": 0.6435806751251221,
"learning_rate": 2.9152308752584427e-05,
"loss": 0.6568,
"step": 2580
},
{
"epoch": 0.8629018823921373,
"grad_norm": 0.8892498016357422,
"learning_rate": 2.84631288766368e-05,
"loss": 0.6305,
"step": 2590
},
{
"epoch": 0.8662335498917209,
"grad_norm": 0.9092233777046204,
"learning_rate": 2.7773949000689182e-05,
"loss": 0.6807,
"step": 2600
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.6867402195930481,
"learning_rate": 2.708476912474156e-05,
"loss": 0.6785,
"step": 2610
},
{
"epoch": 0.8728968848908879,
"grad_norm": 1.011870265007019,
"learning_rate": 2.6395589248793934e-05,
"loss": 0.773,
"step": 2620
},
{
"epoch": 0.8762285523904715,
"grad_norm": 0.5611357688903809,
"learning_rate": 2.5706409372846312e-05,
"loss": 0.5763,
"step": 2630
},
{
"epoch": 0.879560219890055,
"grad_norm": 0.7307304739952087,
"learning_rate": 2.5017229496898693e-05,
"loss": 0.6522,
"step": 2640
},
{
"epoch": 0.8828918873896385,
"grad_norm": 0.8726571202278137,
"learning_rate": 2.4328049620951067e-05,
"loss": 0.67,
"step": 2650
},
{
"epoch": 0.8862235548892221,
"grad_norm": 0.7599055171012878,
"learning_rate": 2.3638869745003448e-05,
"loss": 0.5266,
"step": 2660
},
{
"epoch": 0.8895552223888056,
"grad_norm": 0.9391018748283386,
"learning_rate": 2.2949689869055822e-05,
"loss": 0.5366,
"step": 2670
},
{
"epoch": 0.8928868898883892,
"grad_norm": 0.801648736000061,
"learning_rate": 2.2260509993108203e-05,
"loss": 0.5702,
"step": 2680
},
{
"epoch": 0.8962185573879726,
"grad_norm": 0.7381575107574463,
"learning_rate": 2.157133011716058e-05,
"loss": 0.5519,
"step": 2690
},
{
"epoch": 0.8995502248875562,
"grad_norm": 0.7763687968254089,
"learning_rate": 2.088215024121296e-05,
"loss": 0.6345,
"step": 2700
},
{
"epoch": 0.9028818923871398,
"grad_norm": 0.6713552474975586,
"learning_rate": 2.0192970365265336e-05,
"loss": 0.4188,
"step": 2710
},
{
"epoch": 0.9062135598867233,
"grad_norm": 0.4726286232471466,
"learning_rate": 1.9503790489317714e-05,
"loss": 0.7027,
"step": 2720
},
{
"epoch": 0.9095452273863068,
"grad_norm": 0.559560239315033,
"learning_rate": 1.881461061337009e-05,
"loss": 0.541,
"step": 2730
},
{
"epoch": 0.9128768948858904,
"grad_norm": 0.8475058078765869,
"learning_rate": 1.812543073742247e-05,
"loss": 0.6384,
"step": 2740
},
{
"epoch": 0.9162085623854739,
"grad_norm": 0.8099564909934998,
"learning_rate": 1.7436250861474846e-05,
"loss": 0.6216,
"step": 2750
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.7477239966392517,
"learning_rate": 1.6747070985527224e-05,
"loss": 0.568,
"step": 2760
},
{
"epoch": 0.922871897384641,
"grad_norm": 0.7421704530715942,
"learning_rate": 1.60578911095796e-05,
"loss": 0.5097,
"step": 2770
},
{
"epoch": 0.9262035648842245,
"grad_norm": 1.0993244647979736,
"learning_rate": 1.536871123363198e-05,
"loss": 0.6455,
"step": 2780
},
{
"epoch": 0.9295352323838081,
"grad_norm": 0.9197335839271545,
"learning_rate": 1.4679531357684357e-05,
"loss": 0.5122,
"step": 2790
},
{
"epoch": 0.9328668998833917,
"grad_norm": 0.7237057089805603,
"learning_rate": 1.3990351481736733e-05,
"loss": 0.6608,
"step": 2800
},
{
"epoch": 0.9361985673829751,
"grad_norm": 1.0517019033432007,
"learning_rate": 1.3301171605789112e-05,
"loss": 0.6283,
"step": 2810
},
{
"epoch": 0.9395302348825587,
"grad_norm": 0.8161411285400391,
"learning_rate": 1.2611991729841488e-05,
"loss": 0.5655,
"step": 2820
},
{
"epoch": 0.9428619023821423,
"grad_norm": 0.8740524053573608,
"learning_rate": 1.1922811853893867e-05,
"loss": 0.4393,
"step": 2830
},
{
"epoch": 0.9461935698817258,
"grad_norm": 0.5465930700302124,
"learning_rate": 1.1233631977946245e-05,
"loss": 0.5458,
"step": 2840
},
{
"epoch": 0.9495252373813093,
"grad_norm": 1.004461646080017,
"learning_rate": 1.0544452101998622e-05,
"loss": 0.6996,
"step": 2850
},
{
"epoch": 0.9528569048808929,
"grad_norm": 0.9351420998573303,
"learning_rate": 9.855272226051e-06,
"loss": 0.6294,
"step": 2860
},
{
"epoch": 0.9561885723804764,
"grad_norm": 0.9448681473731995,
"learning_rate": 9.166092350103378e-06,
"loss": 0.6851,
"step": 2870
},
{
"epoch": 0.95952023988006,
"grad_norm": 0.9818257689476013,
"learning_rate": 8.476912474155755e-06,
"loss": 0.6429,
"step": 2880
},
{
"epoch": 0.9628519073796435,
"grad_norm": 0.6631109714508057,
"learning_rate": 7.787732598208133e-06,
"loss": 0.5943,
"step": 2890
},
{
"epoch": 0.966183574879227,
"grad_norm": 0.8403130769729614,
"learning_rate": 7.0985527222605096e-06,
"loss": 0.4675,
"step": 2900
},
{
"epoch": 0.9695152423788106,
"grad_norm": 0.7020695805549622,
"learning_rate": 6.409372846312887e-06,
"loss": 0.4861,
"step": 2910
},
{
"epoch": 0.9728469098783942,
"grad_norm": 1.1053721904754639,
"learning_rate": 5.720192970365266e-06,
"loss": 0.5639,
"step": 2920
},
{
"epoch": 0.9761785773779776,
"grad_norm": 0.9987778663635254,
"learning_rate": 5.031013094417643e-06,
"loss": 0.5934,
"step": 2930
},
{
"epoch": 0.9795102448775612,
"grad_norm": 0.9289517998695374,
"learning_rate": 4.341833218470021e-06,
"loss": 0.5889,
"step": 2940
},
{
"epoch": 0.9828419123771448,
"grad_norm": 0.7212059497833252,
"learning_rate": 3.6526533425223984e-06,
"loss": 0.6013,
"step": 2950
},
{
"epoch": 0.9861735798767283,
"grad_norm": 0.8436290621757507,
"learning_rate": 2.9634734665747764e-06,
"loss": 0.6237,
"step": 2960
},
{
"epoch": 0.9895052473763118,
"grad_norm": 0.9585169553756714,
"learning_rate": 2.2742935906271536e-06,
"loss": 0.6193,
"step": 2970
},
{
"epoch": 0.9928369148758954,
"grad_norm": 0.7987054586410522,
"learning_rate": 1.5851137146795314e-06,
"loss": 0.6871,
"step": 2980
},
{
"epoch": 0.9961685823754789,
"grad_norm": 0.7917064428329468,
"learning_rate": 8.959338387319091e-07,
"loss": 0.6777,
"step": 2990
},
{
"epoch": 0.9995002498750625,
"grad_norm": 0.8279238343238831,
"learning_rate": 2.067539627842867e-07,
"loss": 0.6662,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 3002,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7125049258477568e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}