Files

1743 lines
39 KiB
JSON
Raw Permalink Normal View History

[
{
"loss": 2.2046,
"grad_norm": 6.064353942871094,
"learning_rate": 1.5254237288135596e-05,
"epoch": 0.01694915254237288,
"step": 10
},
{
"loss": 1.7911,
"grad_norm": 2.307243824005127,
"learning_rate": 3.2203389830508473e-05,
"epoch": 0.03389830508474576,
"step": 20
},
{
"loss": 1.6879,
"grad_norm": 1.7352138757705688,
"learning_rate": 4.915254237288136e-05,
"epoch": 0.05084745762711865,
"step": 30
},
{
"loss": 1.6432,
"grad_norm": 1.2811769247055054,
"learning_rate": 6.610169491525424e-05,
"epoch": 0.06779661016949153,
"step": 40
},
{
"loss": 1.6039,
"grad_norm": 1.2322453260421753,
"learning_rate": 8.305084745762712e-05,
"epoch": 0.0847457627118644,
"step": 50
},
{
"loss": 1.6104,
"grad_norm": 1.3415359258651733,
"learning_rate": 0.0001,
"epoch": 0.1016949152542373,
"step": 60
},
{
"loss": 1.5699,
"grad_norm": 1.2617424726486206,
"learning_rate": 0.00011694915254237289,
"epoch": 0.11864406779661017,
"step": 70
},
{
"loss": 1.5413,
"grad_norm": 1.3335905075073242,
"learning_rate": 0.00013389830508474577,
"epoch": 0.13559322033898305,
"step": 80
},
{
"loss": 1.4973,
"grad_norm": 1.3121675252914429,
"learning_rate": 0.00015084745762711864,
"epoch": 0.15254237288135594,
"step": 90
},
{
"loss": 1.5,
"grad_norm": 1.1816385984420776,
"learning_rate": 0.00016779661016949154,
"epoch": 0.1694915254237288,
"step": 100
},
{
"loss": 1.4957,
"grad_norm": 1.1212005615234375,
"learning_rate": 0.00018474576271186442,
"epoch": 0.1864406779661017,
"step": 110
},
{
"loss": 1.487,
"grad_norm": 1.2974339723587036,
"learning_rate": 0.00019999990182555336,
"epoch": 0.2033898305084746,
"step": 120
},
{
"loss": 1.4686,
"grad_norm": 1.0796626806259155,
"learning_rate": 0.00019998812112519715,
"epoch": 0.22033898305084745,
"step": 130
},
{
"loss": 1.4282,
"grad_norm": 1.0553802251815796,
"learning_rate": 0.00019995670818593407,
"epoch": 0.23728813559322035,
"step": 140
},
{
"loss": 1.4437,
"grad_norm": 1.0136256217956543,
"learning_rate": 0.00019990566917556007,
"epoch": 0.2542372881355932,
"step": 150
},
{
"loss": 1.4265,
"grad_norm": 0.9413333535194397,
"learning_rate": 0.00019983501411536606,
"epoch": 0.2711864406779661,
"step": 160
},
{
"loss": 1.427,
"grad_norm": 0.9628981947898865,
"learning_rate": 0.00019974475687817018,
"epoch": 0.288135593220339,
"step": 170
},
{
"loss": 1.3973,
"grad_norm": 0.9879967570304871,
"learning_rate": 0.00019963491518559394,
"epoch": 0.3050847457627119,
"step": 180
},
{
"loss": 1.3989,
"grad_norm": 0.8920490145683289,
"learning_rate": 0.00019950551060458283,
"epoch": 0.3220338983050847,
"step": 190
},
{
"loss": 1.4089,
"grad_norm": 0.8335606455802917,
"learning_rate": 0.00019935656854317148,
"epoch": 0.3389830508474576,
"step": 200
},
{
"loss": 1.4301,
"grad_norm": 0.8877090215682983,
"learning_rate": 0.00019918811824549512,
"epoch": 0.3559322033898305,
"step": 210
},
{
"loss": 1.4483,
"grad_norm": 0.8929612040519714,
"learning_rate": 0.0001990001927860475,
"epoch": 0.3728813559322034,
"step": 220
},
{
"loss": 1.3764,
"grad_norm": 0.9452272057533264,
"learning_rate": 0.0001987928290631869,
"epoch": 0.3898305084745763,
"step": 230
},
{
"eval_loss": 1.3814609050750732,
"eval_runtime": 24.8496,
"eval_samples_per_second": 39.96,
"eval_steps_per_second": 10.02,
"epoch": 0.3983050847457627,
"step": 235
},
{
"loss": 1.4278,
"grad_norm": 0.9019148945808411,
"learning_rate": 0.00019856606779189128,
"epoch": 0.4067796610169492,
"step": 240
},
{
"loss": 1.357,
"grad_norm": 0.89701908826828,
"learning_rate": 0.00019831995349576408,
"epoch": 0.423728813559322,
"step": 250
},
{
"loss": 1.3846,
"grad_norm": 0.8912389278411865,
"learning_rate": 0.00019805453449829217,
"epoch": 0.4406779661016949,
"step": 260
},
{
"loss": 1.421,
"grad_norm": 0.8808926939964294,
"learning_rate": 0.0001977698629133578,
"epoch": 0.4576271186440678,
"step": 270
},
{
"loss": 1.3986,
"grad_norm": 0.86872798204422,
"learning_rate": 0.00019746599463500616,
"epoch": 0.4745762711864407,
"step": 280
},
{
"loss": 1.3493,
"grad_norm": 0.8555623292922974,
"learning_rate": 0.00019714298932647098,
"epoch": 0.4915254237288136,
"step": 290
},
{
"loss": 1.3591,
"grad_norm": 0.8310695290565491,
"learning_rate": 0.00019680091040845981,
"epoch": 0.5084745762711864,
"step": 300
},
{
"loss": 1.3719,
"grad_norm": 0.8844895958900452,
"learning_rate": 0.00019643982504670158,
"epoch": 0.5254237288135594,
"step": 310
},
{
"loss": 1.3581,
"grad_norm": 0.8830392360687256,
"learning_rate": 0.00019605980413875897,
"epoch": 0.5423728813559322,
"step": 320
},
{
"loss": 1.3904,
"grad_norm": 0.9279443025588989,
"learning_rate": 0.00019566092230010807,
"epoch": 0.559322033898305,
"step": 330
},
{
"loss": 1.3836,
"grad_norm": 0.9330219030380249,
"learning_rate": 0.0001952432578494877,
"epoch": 0.576271186440678,
"step": 340
},
{
"loss": 1.3549,
"grad_norm": 0.9056932330131531,
"learning_rate": 0.00019480689279352217,
"epoch": 0.5932203389830508,
"step": 350
},
{
"loss": 1.3738,
"grad_norm": 0.8885744214057922,
"learning_rate": 0.0001943519128106194,
"epoch": 0.6101694915254238,
"step": 360
},
{
"loss": 1.3528,
"grad_norm": 0.8452779650688171,
"learning_rate": 0.00019387840723414837,
"epoch": 0.6271186440677966,
"step": 370
},
{
"loss": 1.349,
"grad_norm": 0.8285683989524841,
"learning_rate": 0.000193386469034899,
"epoch": 0.6440677966101694,
"step": 380
},
{
"loss": 1.3703,
"grad_norm": 0.8421545624732971,
"learning_rate": 0.00019287619480282765,
"epoch": 0.6610169491525424,
"step": 390
},
{
"loss": 1.3553,
"grad_norm": 0.859512984752655,
"learning_rate": 0.0001923476847280921,
"epoch": 0.6779661016949152,
"step": 400
},
{
"loss": 1.3314,
"grad_norm": 0.8403517007827759,
"learning_rate": 0.0001918010425813796,
"epoch": 0.6949152542372882,
"step": 410
},
{
"loss": 1.3263,
"grad_norm": 0.8627921938896179,
"learning_rate": 0.00019123637569353218,
"epoch": 0.711864406779661,
"step": 420
},
{
"loss": 1.3824,
"grad_norm": 0.847070574760437,
"learning_rate": 0.00019065379493447227,
"epoch": 0.7288135593220338,
"step": 430
},
{
"loss": 1.3063,
"grad_norm": 0.9412711262702942,
"learning_rate": 0.00019005341469143427,
"epoch": 0.7457627118644068,
"step": 440
},
{
"loss": 1.3312,
"grad_norm": 0.8335198163986206,
"learning_rate": 0.00018943535284650492,
"epoch": 0.7627118644067796,
"step": 450
},
{
"loss": 1.3276,
"grad_norm": 0.8702097535133362,
"learning_rate": 0.0001887997307534777,
"epoch": 0.7796610169491526,
"step": 460
},
{
"loss": 1.3201,
"grad_norm": 0.9178433418273926,
"learning_rate": 0.0001881466732140254,
"epoch": 0.7966101694915254,
"step": 470
},
{
"eval_loss": 1.3105889558792114,
"eval_runtime": 7.9984,
"eval_samples_per_second": 124.15,
"eval_steps_per_second": 31.131,
"epoch": 0.7966101694915254,
"step": 470
},
{
"loss": 1.3173,
"grad_norm": 0.8480702042579651,
"learning_rate": 0.00018747630845319612,
"epoch": 0.8135593220338984,
"step": 480
},
{
"loss": 1.3154,
"grad_norm": 0.922341525554657,
"learning_rate": 0.00018678876809423667,
"epoch": 0.8305084745762712,
"step": 490
},
{
"loss": 1.337,
"grad_norm": 0.8491071462631226,
"learning_rate": 0.00018608418713274874,
"epoch": 0.847457627118644,
"step": 500
},
{
"loss": 1.3255,
"grad_norm": 0.7910286784172058,
"learning_rate": 0.00018536270391018346,
"epoch": 0.864406779661017,
"step": 510
},
{
"loss": 1.3064,
"grad_norm": 0.8801857233047485,
"learning_rate": 0.00018462446008667843,
"epoch": 0.8813559322033898,
"step": 520
},
{
"loss": 1.2972,
"grad_norm": 0.8282411694526672,
"learning_rate": 0.00018386960061324325,
"epoch": 0.8983050847457628,
"step": 530
},
{
"loss": 1.3023,
"grad_norm": 0.8699902296066284,
"learning_rate": 0.00018309827370329928,
"epoch": 0.9152542372881356,
"step": 540
},
{
"loss": 1.3272,
"grad_norm": 0.8380703926086426,
"learning_rate": 0.0001823106308035784,
"epoch": 0.9322033898305084,
"step": 550
},
{
"loss": 1.2897,
"grad_norm": 0.8781632781028748,
"learning_rate": 0.00018150682656438715,
"epoch": 0.9491525423728814,
"step": 560
},
{
"loss": 1.2885,
"grad_norm": 0.7954381704330444,
"learning_rate": 0.00018068701880924178,
"epoch": 0.9661016949152542,
"step": 570
},
{
"loss": 1.2882,
"grad_norm": 0.8385308980941772,
"learning_rate": 0.00017985136850388024,
"epoch": 0.9830508474576272,
"step": 580
},
{
"loss": 1.3128,
"grad_norm": 2.6521894931793213,
"learning_rate": 0.00017900003972465736,
"epoch": 1.0,
"step": 590
},
{
"loss": 1.2167,
"grad_norm": 0.9211756587028503,
"learning_rate": 0.000178133199626329,
"epoch": 1.0169491525423728,
"step": 600
},
{
"loss": 1.2556,
"grad_norm": 0.835600733757019,
"learning_rate": 0.00017725101840923216,
"epoch": 1.0338983050847457,
"step": 610
},
{
"loss": 1.2466,
"grad_norm": 0.8683267831802368,
"learning_rate": 0.00017635366928586663,
"epoch": 1.0508474576271187,
"step": 620
},
{
"loss": 1.2525,
"grad_norm": 0.9456862211227417,
"learning_rate": 0.00017544132844688563,
"epoch": 1.0677966101694916,
"step": 630
},
{
"loss": 1.2045,
"grad_norm": 0.9511478543281555,
"learning_rate": 0.00017451417502650145,
"epoch": 1.0847457627118644,
"step": 640
},
{
"loss": 1.2505,
"grad_norm": 0.8709162473678589,
"learning_rate": 0.00017357239106731317,
"epoch": 1.1016949152542372,
"step": 650
},
{
"loss": 1.2588,
"grad_norm": 0.8981189727783203,
"learning_rate": 0.00017261616148456357,
"epoch": 1.11864406779661,
"step": 660
},
{
"loss": 1.2353,
"grad_norm": 0.8719836473464966,
"learning_rate": 0.00017164567402983152,
"epoch": 1.1355932203389831,
"step": 670
},
{
"loss": 1.2081,
"grad_norm": 0.8911289572715759,
"learning_rate": 0.0001706611192541681,
"epoch": 1.152542372881356,
"step": 680
},
{
"loss": 1.2441,
"grad_norm": 0.8637029528617859,
"learning_rate": 0.0001696626904706824,
"epoch": 1.1694915254237288,
"step": 690
},
{
"loss": 1.2492,
"grad_norm": 0.8860388994216919,
"learning_rate": 0.00016865058371658557,
"epoch": 1.1864406779661016,
"step": 700
},
{
"eval_loss": 1.2774409055709839,
"eval_runtime": 7.9016,
"eval_samples_per_second": 125.67,
"eval_steps_per_second": 31.513,
"epoch": 1.194915254237288,
"step": 705
},
{
"loss": 1.2269,
"grad_norm": 0.8600966930389404,
"learning_rate": 0.00016762499771469957,
"epoch": 1.2033898305084745,
"step": 710
},
{
"loss": 1.2562,
"grad_norm": 0.8858769536018372,
"learning_rate": 0.0001665861338344389,
"epoch": 1.2203389830508475,
"step": 720
},
{
"loss": 1.242,
"grad_norm": 0.8999311327934265,
"learning_rate": 0.0001655341960522726,
"epoch": 1.2372881355932204,
"step": 730
},
{
"loss": 1.213,
"grad_norm": 0.914777398109436,
"learning_rate": 0.00016446939091167422,
"epoch": 1.2542372881355932,
"step": 740
},
{
"loss": 1.2392,
"grad_norm": 1.0026013851165771,
"learning_rate": 0.00016339192748256802,
"epoch": 1.271186440677966,
"step": 750
},
{
"loss": 1.2372,
"grad_norm": 0.9266188740730286,
"learning_rate": 0.0001623020173202789,
"epoch": 1.288135593220339,
"step": 760
},
{
"loss": 1.2391,
"grad_norm": 0.8796271681785583,
"learning_rate": 0.00016119987442399456,
"epoch": 1.305084745762712,
"step": 770
},
{
"loss": 1.2327,
"grad_norm": 0.9744959473609924,
"learning_rate": 0.00016008571519474742,
"epoch": 1.3220338983050848,
"step": 780
},
{
"loss": 1.2461,
"grad_norm": 0.9354102611541748,
"learning_rate": 0.0001589597583929255,
"epoch": 1.3389830508474576,
"step": 790
},
{
"loss": 1.233,
"grad_norm": 0.8850792050361633,
"learning_rate": 0.0001578222250953195,
"epoch": 1.3559322033898304,
"step": 800
},
{
"loss": 1.2353,
"grad_norm": 0.9097703695297241,
"learning_rate": 0.00015667333865171558,
"epoch": 1.3728813559322033,
"step": 810
},
{
"loss": 1.2464,
"grad_norm": 0.9092051386833191,
"learning_rate": 0.00015551332464104126,
"epoch": 1.3898305084745763,
"step": 820
},
{
"loss": 1.2245,
"grad_norm": 0.9042219519615173,
"learning_rate": 0.0001543424108270743,
"epoch": 1.4067796610169492,
"step": 830
},
{
"loss": 1.2424,
"grad_norm": 0.851340115070343,
"learning_rate": 0.00015316082711372205,
"epoch": 1.423728813559322,
"step": 840
},
{
"loss": 1.2214,
"grad_norm": 0.8824617266654968,
"learning_rate": 0.00015196880549988082,
"epoch": 1.4406779661016949,
"step": 850
},
{
"loss": 1.2169,
"grad_norm": 0.918869137763977,
"learning_rate": 0.000150766580033884,
"epoch": 1.457627118644068,
"step": 860
},
{
"loss": 1.2096,
"grad_norm": 0.9275985360145569,
"learning_rate": 0.00014955438676754755,
"epoch": 1.4745762711864407,
"step": 870
},
{
"loss": 1.1924,
"grad_norm": 0.9207339882850647,
"learning_rate": 0.00014833246370982237,
"epoch": 1.4915254237288136,
"step": 880
},
{
"loss": 1.2259,
"grad_norm": 0.9242911338806152,
"learning_rate": 0.00014710105078006205,
"epoch": 1.5084745762711864,
"step": 890
},
{
"loss": 1.193,
"grad_norm": 0.8754140734672546,
"learning_rate": 0.000145860389760916,
"epoch": 1.5254237288135593,
"step": 900
},
{
"loss": 1.2542,
"grad_norm": 0.8747695684432983,
"learning_rate": 0.00014461072425085627,
"epoch": 1.542372881355932,
"step": 910
},
{
"loss": 1.2102,
"grad_norm": 0.9027137160301208,
"learning_rate": 0.00014335229961634808,
"epoch": 1.559322033898305,
"step": 920
},
{
"loss": 1.2016,
"grad_norm": 0.9713094830513,
"learning_rate": 0.00014208536294367326,
"epoch": 1.576271186440678,
"step": 930
},
{
"loss": 1.203,
"grad_norm": 0.9080752730369568,
"learning_rate": 0.00014081016299041576,
"epoch": 1.5932203389830508,
"step": 940
},
{
"eval_loss": 1.2493342161178589,
"eval_runtime": 8.029,
"eval_samples_per_second": 123.676,
"eval_steps_per_second": 31.012,
"epoch": 1.5932203389830508,
"step": 940
},
{
"loss": 1.1835,
"grad_norm": 0.8457156419754028,
"learning_rate": 0.0001395269501366193,
"epoch": 1.6101694915254239,
"step": 950
},
{
"loss": 1.2195,
"grad_norm": 0.9038828015327454,
"learning_rate": 0.0001382359763356262,
"epoch": 1.6271186440677967,
"step": 960
},
{
"loss": 1.2371,
"grad_norm": 0.8834562301635742,
"learning_rate": 0.00013693749506460756,
"epoch": 1.6440677966101696,
"step": 970
},
{
"loss": 1.2204,
"grad_norm": 0.9082645773887634,
"learning_rate": 0.00013563176127479403,
"epoch": 1.6610169491525424,
"step": 980
},
{
"loss": 1.2343,
"grad_norm": 0.8829030990600586,
"learning_rate": 0.00013431903134141713,
"epoch": 1.6779661016949152,
"step": 990
},
{
"loss": 1.2114,
"grad_norm": 0.9125088453292847,
"learning_rate": 0.00013299956301337132,
"epoch": 1.694915254237288,
"step": 1000
},
{
"loss": 1.2313,
"grad_norm": 0.898687481880188,
"learning_rate": 0.00013167361536260585,
"epoch": 1.711864406779661,
"step": 1010
},
{
"loss": 1.2132,
"grad_norm": 0.8782442808151245,
"learning_rate": 0.0001303414487332573,
"epoch": 1.7288135593220337,
"step": 1020
},
{
"loss": 1.2084,
"grad_norm": 0.8889365196228027,
"learning_rate": 0.00012900332469053193,
"epoch": 1.7457627118644068,
"step": 1030
},
{
"loss": 1.2034,
"grad_norm": 0.9221587777137756,
"learning_rate": 0.0001276595059693487,
"epoch": 1.7627118644067796,
"step": 1040
},
{
"loss": 1.2107,
"grad_norm": 0.9170616865158081,
"learning_rate": 0.00012631025642275212,
"epoch": 1.7796610169491527,
"step": 1050
},
{
"loss": 1.2134,
"grad_norm": 0.9416205883026123,
"learning_rate": 0.00012495584097010616,
"epoch": 1.7966101694915255,
"step": 1060
},
{
"loss": 1.1813,
"grad_norm": 0.9464443325996399,
"learning_rate": 0.0001235965255450781,
"epoch": 1.8135593220338984,
"step": 1070
},
{
"loss": 1.2182,
"grad_norm": 0.8776165246963501,
"learning_rate": 0.00012223257704342395,
"epoch": 1.8305084745762712,
"step": 1080
},
{
"loss": 1.2025,
"grad_norm": 0.9294790029525757,
"learning_rate": 0.0001208642632705844,
"epoch": 1.847457627118644,
"step": 1090
},
{
"loss": 1.1823,
"grad_norm": 0.9223579168319702,
"learning_rate": 0.00011949185288910236,
"epoch": 1.8644067796610169,
"step": 1100
},
{
"loss": 1.2013,
"grad_norm": 0.8838573694229126,
"learning_rate": 0.00011811561536587244,
"epoch": 1.8813559322033897,
"step": 1110
},
{
"loss": 1.2051,
"grad_norm": 0.9253844618797302,
"learning_rate": 0.00011673582091923192,
"epoch": 1.8983050847457628,
"step": 1120
},
{
"loss": 1.1961,
"grad_norm": 0.9401239156723022,
"learning_rate": 0.00011535274046590492,
"epoch": 1.9152542372881356,
"step": 1130
},
{
"loss": 1.1937,
"grad_norm": 0.9650959968566895,
"learning_rate": 0.00011396664556780878,
"epoch": 1.9322033898305084,
"step": 1140
},
{
"loss": 1.2192,
"grad_norm": 0.9026353359222412,
"learning_rate": 0.00011257780837873417,
"epoch": 1.9491525423728815,
"step": 1150
},
{
"loss": 1.2247,
"grad_norm": 0.8826860189437866,
"learning_rate": 0.00011118650159090887,
"epoch": 1.9661016949152543,
"step": 1160
},
{
"loss": 1.1997,
"grad_norm": 0.8998943567276001,
"learning_rate": 0.00010979299838145574,
"epoch": 1.9830508474576272,
"step": 1170
},
{
"eval_loss": 1.2287755012512207,
"eval_runtime": 7.9784,
"eval_samples_per_second": 124.461,
"eval_steps_per_second": 31.209,
"epoch": 1.9915254237288136,
"step": 1175
},
{
"loss": 1.1501,
"grad_norm": 2.670851469039917,
"learning_rate": 0.00010839757235875563,
"epoch": 2.0,
"step": 1180
},
{
"loss": 1.1174,
"grad_norm": 0.9717442989349365,
"learning_rate": 0.00010700049750872557,
"epoch": 2.016949152542373,
"step": 1190
},
{
"loss": 1.1334,
"grad_norm": 0.9300686120986938,
"learning_rate": 0.00010560204814102266,
"epoch": 2.0338983050847457,
"step": 1200
},
{
"loss": 1.1035,
"grad_norm": 0.9468068480491638,
"learning_rate": 0.00010420249883518476,
"epoch": 2.0508474576271185,
"step": 1210
},
{
"loss": 1.1228,
"grad_norm": 1.0294302701950073,
"learning_rate": 0.00010280212438671784,
"epoch": 2.0677966101694913,
"step": 1220
},
{
"loss": 1.13,
"grad_norm": 0.9107387661933899,
"learning_rate": 0.00010140119975314102,
"epoch": 2.084745762711864,
"step": 1230
},
{
"loss": 1.1378,
"grad_norm": 0.9056633114814758,
"learning_rate": 0.0001,
"epoch": 2.1016949152542375,
"step": 1240
},
{
"loss": 1.1515,
"grad_norm": 1.0032331943511963,
"learning_rate": 9.8598800246859e-05,
"epoch": 2.1186440677966103,
"step": 1250
},
{
"loss": 1.1338,
"grad_norm": 0.951280951499939,
"learning_rate": 9.719787561328217e-05,
"epoch": 2.135593220338983,
"step": 1260
},
{
"loss": 1.1398,
"grad_norm": 0.9472972750663757,
"learning_rate": 9.579750116481526e-05,
"epoch": 2.152542372881356,
"step": 1270
},
{
"loss": 1.1041,
"grad_norm": 0.9140039682388306,
"learning_rate": 9.439795185897736e-05,
"epoch": 2.169491525423729,
"step": 1280
},
{
"loss": 1.142,
"grad_norm": 0.9762586355209351,
"learning_rate": 9.29995024912745e-05,
"epoch": 2.1864406779661016,
"step": 1290
},
{
"loss": 1.1473,
"grad_norm": 0.9565635323524475,
"learning_rate": 9.160242764124439e-05,
"epoch": 2.2033898305084745,
"step": 1300
},
{
"loss": 1.1507,
"grad_norm": 0.970757246017456,
"learning_rate": 9.020700161854429e-05,
"epoch": 2.2203389830508473,
"step": 1310
},
{
"loss": 1.1514,
"grad_norm": 1.0047391653060913,
"learning_rate": 8.881349840909116e-05,
"epoch": 2.23728813559322,
"step": 1320
},
{
"loss": 1.1218,
"grad_norm": 0.9501739740371704,
"learning_rate": 8.742219162126587e-05,
"epoch": 2.2542372881355934,
"step": 1330
},
{
"loss": 1.1025,
"grad_norm": 0.9464960098266602,
"learning_rate": 8.603335443219125e-05,
"epoch": 2.2711864406779663,
"step": 1340
},
{
"loss": 1.1101,
"grad_norm": 0.973585844039917,
"learning_rate": 8.464725953409509e-05,
"epoch": 2.288135593220339,
"step": 1350
},
{
"loss": 1.1109,
"grad_norm": 0.9535447955131531,
"learning_rate": 8.326417908076811e-05,
"epoch": 2.305084745762712,
"step": 1360
},
{
"loss": 1.1513,
"grad_norm": 0.9385268688201904,
"learning_rate": 8.188438463412761e-05,
"epoch": 2.3220338983050848,
"step": 1370
},
{
"loss": 1.111,
"grad_norm": 0.9565103054046631,
"learning_rate": 8.050814711089764e-05,
"epoch": 2.3389830508474576,
"step": 1380
},
{
"loss": 1.1122,
"grad_norm": 0.934618353843689,
"learning_rate": 7.913573672941563e-05,
"epoch": 2.3559322033898304,
"step": 1390
},
{
"loss": 1.1029,
"grad_norm": 0.9700987935066223,
"learning_rate": 7.776742295657608e-05,
"epoch": 2.3728813559322033,
"step": 1400
},
{
"loss": 1.1159,
"grad_norm": 0.9688590168952942,
"learning_rate": 7.640347445492192e-05,
"epoch": 2.389830508474576,
"step": 1410
},
{
"eval_loss": 1.222853660583496,
"eval_runtime": 7.9699,
"eval_samples_per_second": 124.594,
"eval_steps_per_second": 31.243,
"epoch": 2.389830508474576,
"step": 1410
},
{
"loss": 1.0749,
"grad_norm": 0.9457218050956726,
"learning_rate": 7.504415902989386e-05,
"epoch": 2.406779661016949,
"step": 1420
},
{
"loss": 1.1272,
"grad_norm": 0.9391158223152161,
"learning_rate": 7.368974357724789e-05,
"epoch": 2.423728813559322,
"step": 1430
},
{
"loss": 1.1083,
"grad_norm": 0.9238609671592712,
"learning_rate": 7.234049403065132e-05,
"epoch": 2.440677966101695,
"step": 1440
},
{
"loss": 1.1058,
"grad_norm": 0.9600583910942078,
"learning_rate": 7.099667530946806e-05,
"epoch": 2.457627118644068,
"step": 1450
},
{
"loss": 1.1281,
"grad_norm": 0.9885203838348389,
"learning_rate": 6.96585512667427e-05,
"epoch": 2.4745762711864407,
"step": 1460
},
{
"loss": 1.1248,
"grad_norm": 0.9941011071205139,
"learning_rate": 6.832638463739418e-05,
"epoch": 2.4915254237288136,
"step": 1470
},
{
"loss": 1.1343,
"grad_norm": 0.9795677065849304,
"learning_rate": 6.700043698662873e-05,
"epoch": 2.5084745762711864,
"step": 1480
},
{
"loss": 1.118,
"grad_norm": 0.948656439781189,
"learning_rate": 6.568096865858289e-05,
"epoch": 2.5254237288135593,
"step": 1490
},
{
"loss": 1.139,
"grad_norm": 0.9555015563964844,
"learning_rate": 6.4368238725206e-05,
"epoch": 2.542372881355932,
"step": 1500
},
{
"loss": 1.1327,
"grad_norm": 0.9538373351097107,
"learning_rate": 6.306250493539246e-05,
"epoch": 2.559322033898305,
"step": 1510
},
{
"loss": 1.1152,
"grad_norm": 0.9733698964118958,
"learning_rate": 6.176402366437382e-05,
"epoch": 2.576271186440678,
"step": 1520
},
{
"loss": 1.1426,
"grad_norm": 0.9399760961532593,
"learning_rate": 6.047304986338071e-05,
"epoch": 2.593220338983051,
"step": 1530
},
{
"loss": 1.1103,
"grad_norm": 1.0162689685821533,
"learning_rate": 5.918983700958425e-05,
"epoch": 2.610169491525424,
"step": 1540
},
{
"loss": 1.1153,
"grad_norm": 0.9900780916213989,
"learning_rate": 5.791463705632676e-05,
"epoch": 2.6271186440677967,
"step": 1550
},
{
"loss": 1.0927,
"grad_norm": 0.9424988031387329,
"learning_rate": 5.664770038365195e-05,
"epoch": 2.6440677966101696,
"step": 1560
},
{
"loss": 1.113,
"grad_norm": 0.9367265701293945,
"learning_rate": 5.538927574914376e-05,
"epoch": 2.6610169491525424,
"step": 1570
},
{
"loss": 1.1372,
"grad_norm": 0.9402151107788086,
"learning_rate": 5.413961023908401e-05,
"epoch": 2.6779661016949152,
"step": 1580
},
{
"loss": 1.1264,
"grad_norm": 1.0578895807266235,
"learning_rate": 5.2898949219937976e-05,
"epoch": 2.694915254237288,
"step": 1590
},
{
"loss": 1.1081,
"grad_norm": 0.9745665192604065,
"learning_rate": 5.166753629017764e-05,
"epoch": 2.711864406779661,
"step": 1600
},
{
"loss": 1.1313,
"grad_norm": 0.9517439603805542,
"learning_rate": 5.044561323245245e-05,
"epoch": 2.7288135593220337,
"step": 1610
},
{
"loss": 1.1339,
"grad_norm": 0.9738871455192566,
"learning_rate": 4.9233419966116036e-05,
"epoch": 2.7457627118644066,
"step": 1620
},
{
"loss": 1.1421,
"grad_norm": 0.9917513132095337,
"learning_rate": 4.803119450011919e-05,
"epoch": 2.7627118644067794,
"step": 1630
},
{
"loss": 1.1158,
"grad_norm": 0.9498072266578674,
"learning_rate": 4.683917288627795e-05,
"epoch": 2.7796610169491527,
"step": 1640
},
{
"eval_loss": 1.2087918519973755,
"eval_runtime": 7.9858,
"eval_samples_per_second": 124.345,
"eval_steps_per_second": 31.18,
"epoch": 2.788135593220339,
"step": 1645
},
{
"loss": 1.0934,
"grad_norm": 0.9327065348625183,
"learning_rate": 4.56575891729257e-05,
"epoch": 2.7966101694915255,
"step": 1650
},
{
"loss": 1.1269,
"grad_norm": 1.0235567092895508,
"learning_rate": 4.448667535895876e-05,
"epoch": 2.8135593220338984,
"step": 1660
},
{
"loss": 1.0731,
"grad_norm": 0.9344584941864014,
"learning_rate": 4.332666134828444e-05,
"epoch": 2.830508474576271,
"step": 1670
},
{
"loss": 1.1295,
"grad_norm": 1.0067960023880005,
"learning_rate": 4.2177774904680475e-05,
"epoch": 2.847457627118644,
"step": 1680
},
{
"loss": 1.061,
"grad_norm": 0.960763692855835,
"learning_rate": 4.1040241607074516e-05,
"epoch": 2.864406779661017,
"step": 1690
},
{
"loss": 1.093,
"grad_norm": 0.9818686842918396,
"learning_rate": 3.991428480525261e-05,
"epoch": 2.8813559322033897,
"step": 1700
},
{
"loss": 1.1469,
"grad_norm": 0.9942337274551392,
"learning_rate": 3.880012557600547e-05,
"epoch": 2.898305084745763,
"step": 1710
},
{
"loss": 1.1369,
"grad_norm": 1.022985577583313,
"learning_rate": 3.769798267972109e-05,
"epoch": 2.915254237288136,
"step": 1720
},
{
"loss": 1.1009,
"grad_norm": 0.9834439754486084,
"learning_rate": 3.6608072517432013e-05,
"epoch": 2.9322033898305087,
"step": 1730
},
{
"loss": 1.1134,
"grad_norm": 1.0133659839630127,
"learning_rate": 3.553060908832583e-05,
"epoch": 2.9491525423728815,
"step": 1740
},
{
"loss": 1.0923,
"grad_norm": 0.9789726734161377,
"learning_rate": 3.4465803947727424e-05,
"epoch": 2.9661016949152543,
"step": 1750
},
{
"loss": 1.1277,
"grad_norm": 0.9691158533096313,
"learning_rate": 3.341386616556109e-05,
"epoch": 2.983050847457627,
"step": 1760
},
{
"loss": 1.1257,
"grad_norm": 3.6234147548675537,
"learning_rate": 3.237500228530045e-05,
"epoch": 3.0,
"step": 1770
},
{
"loss": 1.0769,
"grad_norm": 1.0010051727294922,
"learning_rate": 3.1349416283414465e-05,
"epoch": 3.016949152542373,
"step": 1780
},
{
"loss": 1.0647,
"grad_norm": 0.9725643396377563,
"learning_rate": 3.0337309529317604e-05,
"epoch": 3.0338983050847457,
"step": 1790
},
{
"loss": 1.0718,
"grad_norm": 1.0454438924789429,
"learning_rate": 2.933888074583193e-05,
"epoch": 3.0508474576271185,
"step": 1800
},
{
"loss": 1.0739,
"grad_norm": 0.9589056372642517,
"learning_rate": 2.8354325970168484e-05,
"epoch": 3.0677966101694913,
"step": 1810
},
{
"loss": 1.0625,
"grad_norm": 0.9968867301940918,
"learning_rate": 2.7383838515436476e-05,
"epoch": 3.084745762711864,
"step": 1820
},
{
"loss": 1.0468,
"grad_norm": 1.0534526109695435,
"learning_rate": 2.6427608932686843e-05,
"epoch": 3.1016949152542375,
"step": 1830
},
{
"loss": 1.0377,
"grad_norm": 1.00564706325531,
"learning_rate": 2.5485824973498583e-05,
"epoch": 3.1186440677966103,
"step": 1840
},
{
"loss": 1.0291,
"grad_norm": 0.9239162802696228,
"learning_rate": 2.4558671553114378e-05,
"epoch": 3.135593220338983,
"step": 1850
},
{
"loss": 1.0185,
"grad_norm": 1.0018540620803833,
"learning_rate": 2.3646330714133393e-05,
"epoch": 3.152542372881356,
"step": 1860
},
{
"loss": 1.0535,
"grad_norm": 0.9955319762229919,
"learning_rate": 2.274898159076785e-05,
"epoch": 3.169491525423729,
"step": 1870
},
{
"loss": 1.0702,
"grad_norm": 1.0363454818725586,
"learning_rate": 2.1866800373671026e-05,
"epoch": 3.1864406779661016,
"step": 1880
},
{
"eval_loss": 1.2089406251907349,
"eval_runtime": 7.9175,
"eval_samples_per_second": 125.419,
"eval_steps_per_second": 31.449,
"epoch": 3.1864406779661016,
"step": 1880
},
{
"loss": 1.0637,
"grad_norm": 0.9806342124938965,
"learning_rate": 2.09999602753427e-05,
"epoch": 3.2033898305084745,
"step": 1890
},
{
"loss": 1.0281,
"grad_norm": 0.9875054359436035,
"learning_rate": 2.0148631496119784e-05,
"epoch": 3.2203389830508473,
"step": 1900
},
{
"loss": 1.0812,
"grad_norm": 0.9922574162483215,
"learning_rate": 1.9312981190758228e-05,
"epoch": 3.23728813559322,
"step": 1910
},
{
"loss": 1.0527,
"grad_norm": 0.9941257834434509,
"learning_rate": 1.8493173435612843e-05,
"epoch": 3.2542372881355934,
"step": 1920
},
{
"loss": 1.052,
"grad_norm": 1.0203768014907837,
"learning_rate": 1.7689369196421613e-05,
"epoch": 3.2711864406779663,
"step": 1930
},
{
"loss": 1.057,
"grad_norm": 1.0592260360717773,
"learning_rate": 1.6901726296700736e-05,
"epoch": 3.288135593220339,
"step": 1940
},
{
"loss": 1.0604,
"grad_norm": 0.9808682799339294,
"learning_rate": 1.6130399386756766e-05,
"epoch": 3.305084745762712,
"step": 1950
},
{
"loss": 1.0808,
"grad_norm": 0.9782573580741882,
"learning_rate": 1.5375539913321602e-05,
"epoch": 3.3220338983050848,
"step": 1960
},
{
"loss": 1.0548,
"grad_norm": 1.0036771297454834,
"learning_rate": 1.4637296089816543e-05,
"epoch": 3.3389830508474576,
"step": 1970
},
{
"loss": 1.0535,
"grad_norm": 1.0585639476776123,
"learning_rate": 1.3915812867251266e-05,
"epoch": 3.3559322033898304,
"step": 1980
},
{
"loss": 1.0957,
"grad_norm": 1.0012156963348389,
"learning_rate": 1.3211231905763355e-05,
"epoch": 3.3728813559322033,
"step": 1990
},
{
"loss": 1.0576,
"grad_norm": 0.9416826367378235,
"learning_rate": 1.2523691546803873e-05,
"epoch": 3.389830508474576,
"step": 2000
},
{
"loss": 1.0505,
"grad_norm": 0.9698029160499573,
"learning_rate": 1.1853326785974628e-05,
"epoch": 3.406779661016949,
"step": 2010
},
{
"loss": 1.0376,
"grad_norm": 0.9895077347755432,
"learning_rate": 1.1200269246522343e-05,
"epoch": 3.423728813559322,
"step": 2020
},
{
"loss": 1.0493,
"grad_norm": 0.9916106462478638,
"learning_rate": 1.0564647153495088e-05,
"epoch": 3.440677966101695,
"step": 2030
},
{
"loss": 1.1029,
"grad_norm": 1.044202208518982,
"learning_rate": 9.946585308565747e-06,
"epoch": 3.457627118644068,
"step": 2040
},
{
"loss": 1.0658,
"grad_norm": 1.0428624153137207,
"learning_rate": 9.346205065527769e-06,
"epoch": 3.4745762711864407,
"step": 2050
},
{
"loss": 1.0633,
"grad_norm": 1.0308985710144043,
"learning_rate": 8.763624306467844e-06,
"epoch": 3.4915254237288136,
"step": 2060
},
{
"loss": 1.0406,
"grad_norm": 1.0223889350891113,
"learning_rate": 8.198957418620401e-06,
"epoch": 3.5084745762711864,
"step": 2070
},
{
"loss": 1.0064,
"grad_norm": 1.0316060781478882,
"learning_rate": 7.652315271907929e-06,
"epoch": 3.5254237288135593,
"step": 2080
},
{
"loss": 1.0495,
"grad_norm": 0.9965396523475647,
"learning_rate": 7.1238051971723504e-06,
"epoch": 3.542372881355932,
"step": 2090
},
{
"loss": 1.0434,
"grad_norm": 0.9939844012260437,
"learning_rate": 6.613530965101e-06,
"epoch": 3.559322033898305,
"step": 2100
},
{
"loss": 1.0728,
"grad_norm": 0.9969953298568726,
"learning_rate": 6.121592765851647e-06,
"epoch": 3.576271186440678,
"step": 2110
},
{
"eval_loss": 1.206390380859375,
"eval_runtime": 7.9452,
"eval_samples_per_second": 124.982,
"eval_steps_per_second": 31.34,
"epoch": 3.584745762711864,
"step": 2115
},
{
"loss": 1.027,
"grad_norm": 1.0296121835708618,
"learning_rate": 5.648087189380613e-06,
"epoch": 3.593220338983051,
"step": 2120
},
{
"loss": 1.0543,
"grad_norm": 0.9752156734466553,
"learning_rate": 5.193107206477821e-06,
"epoch": 3.610169491525424,
"step": 2130
},
{
"loss": 1.0772,
"grad_norm": 1.014767050743103,
"learning_rate": 4.756742150512305e-06,
"epoch": 3.6271186440677967,
"step": 2140
},
{
"loss": 1.0694,
"grad_norm": 0.9678983688354492,
"learning_rate": 4.339077699891969e-06,
"epoch": 3.6440677966101696,
"step": 2150
},
{
"loss": 1.0381,
"grad_norm": 0.9780063033103943,
"learning_rate": 3.940195861241036e-06,
"epoch": 3.6610169491525424,
"step": 2160
},
{
"loss": 1.0478,
"grad_norm": 1.011366367340088,
"learning_rate": 3.560174953298434e-06,
"epoch": 3.6779661016949152,
"step": 2170
},
{
"loss": 1.057,
"grad_norm": 0.9899730086326599,
"learning_rate": 3.199089591540194e-06,
"epoch": 3.694915254237288,
"step": 2180
},
{
"loss": 1.0506,
"grad_norm": 1.040940761566162,
"learning_rate": 2.857010673529015e-06,
"epoch": 3.711864406779661,
"step": 2190
},
{
"loss": 1.0845,
"grad_norm": 0.9783785343170166,
"learning_rate": 2.5340053649938523e-06,
"epoch": 3.7288135593220337,
"step": 2200
},
{
"loss": 1.058,
"grad_norm": 1.0626617670059204,
"learning_rate": 2.2301370866422256e-06,
"epoch": 3.7457627118644066,
"step": 2210
},
{
"loss": 1.0401,
"grad_norm": 1.0348941087722778,
"learning_rate": 1.9454655017078438e-06,
"epoch": 3.7627118644067794,
"step": 2220
},
{
"loss": 1.0433,
"grad_norm": 0.9590414762496948,
"learning_rate": 1.6800465042359325e-06,
"epoch": 3.7796610169491527,
"step": 2230
},
{
"loss": 1.0658,
"grad_norm": 1.0622143745422363,
"learning_rate": 1.4339322081087236e-06,
"epoch": 3.7966101694915255,
"step": 2240
},
{
"loss": 1.0412,
"grad_norm": 1.0225697755813599,
"learning_rate": 1.2071709368131068e-06,
"epoch": 3.8135593220338984,
"step": 2250
},
{
"loss": 1.0589,
"grad_norm": 1.0060772895812988,
"learning_rate": 9.998072139525084e-07,
"epoch": 3.830508474576271,
"step": 2260
},
{
"loss": 1.0806,
"grad_norm": 0.9955238103866577,
"learning_rate": 8.118817545048952e-07,
"epoch": 3.847457627118644,
"step": 2270
},
{
"loss": 1.0534,
"grad_norm": 0.995814323425293,
"learning_rate": 6.434314568285249e-07,
"epoch": 3.864406779661017,
"step": 2280
},
{
"loss": 1.0518,
"grad_norm": 1.02264404296875,
"learning_rate": 4.944893954171859e-07,
"epoch": 3.8813559322033897,
"step": 2290
},
{
"loss": 1.0461,
"grad_norm": 1.0243033170700073,
"learning_rate": 3.6508481440604703e-07,
"epoch": 3.898305084745763,
"step": 2300
},
{
"loss": 1.0516,
"grad_norm": 1.0173128843307495,
"learning_rate": 2.55243121829829e-07,
"epoch": 3.915254237288136,
"step": 2310
},
{
"loss": 1.0589,
"grad_norm": 1.0421204566955566,
"learning_rate": 1.6498588463392806e-07,
"epoch": 3.9322033898305087,
"step": 2320
},
{
"loss": 1.0548,
"grad_norm": 1.024686574935913,
"learning_rate": 9.433082443991437e-08,
"epoch": 3.9491525423728815,
"step": 2330
},
{
"loss": 1.0619,
"grad_norm": 0.9935389161109924,
"learning_rate": 4.329181406593774e-08,
"epoch": 3.9661016949152543,
"step": 2340
},
{
"loss": 1.0571,
"grad_norm": 0.9899278879165649,
"learning_rate": 1.1878874802873086e-08,
"epoch": 3.983050847457627,
"step": 2350
},
{
"eval_loss": 1.2054944038391113,
"eval_runtime": 7.9797,
"eval_samples_per_second": 124.441,
"eval_steps_per_second": 31.204,
"epoch": 3.983050847457627,
"step": 2350
},
{
"loss": 1.1068,
"grad_norm": 3.4382264614105225,
"learning_rate": 9.817444666104792e-11,
"epoch": 4.0,
"step": 2360
},
{
"train_runtime": 470.094,
"train_samples_per_second": 160.402,
"train_steps_per_second": 5.02,
"total_flos": 1.617317761968e+16,
"train_loss": 1.2040837437419567,
"epoch": 4.0,
"step": 2360
}
]