[
  {
    "loss": 2.2046,
    "grad_norm": 6.064353942871094,
    "learning_rate": 1.5254237288135596e-05,
    "epoch": 0.01694915254237288,
    "step": 10
  },
|
|
{
|
|
"loss": 1.7911,
|
|
"grad_norm": 2.307243824005127,
|
|
"learning_rate": 3.2203389830508473e-05,
|
|
"epoch": 0.03389830508474576,
|
|
"step": 20
|
|
},
|
|
{
|
|
"loss": 1.6879,
|
|
"grad_norm": 1.7352138757705688,
|
|
"learning_rate": 4.915254237288136e-05,
|
|
"epoch": 0.05084745762711865,
|
|
"step": 30
|
|
},
|
|
{
|
|
"loss": 1.6432,
|
|
"grad_norm": 1.2811769247055054,
|
|
"learning_rate": 6.610169491525424e-05,
|
|
"epoch": 0.06779661016949153,
|
|
"step": 40
|
|
},
|
|
{
|
|
"loss": 1.6039,
|
|
"grad_norm": 1.2322453260421753,
|
|
"learning_rate": 8.305084745762712e-05,
|
|
"epoch": 0.0847457627118644,
|
|
"step": 50
|
|
},
|
|
{
|
|
"loss": 1.6104,
|
|
"grad_norm": 1.3415359258651733,
|
|
"learning_rate": 0.0001,
|
|
"epoch": 0.1016949152542373,
|
|
"step": 60
|
|
},
|
|
{
|
|
"loss": 1.5699,
|
|
"grad_norm": 1.2617424726486206,
|
|
"learning_rate": 0.00011694915254237289,
|
|
"epoch": 0.11864406779661017,
|
|
"step": 70
|
|
},
|
|
{
|
|
"loss": 1.5413,
|
|
"grad_norm": 1.3335905075073242,
|
|
"learning_rate": 0.00013389830508474577,
|
|
"epoch": 0.13559322033898305,
|
|
"step": 80
|
|
},
|
|
{
|
|
"loss": 1.4973,
|
|
"grad_norm": 1.3121675252914429,
|
|
"learning_rate": 0.00015084745762711864,
|
|
"epoch": 0.15254237288135594,
|
|
"step": 90
|
|
},
|
|
{
|
|
"loss": 1.5,
|
|
"grad_norm": 1.1816385984420776,
|
|
"learning_rate": 0.00016779661016949154,
|
|
"epoch": 0.1694915254237288,
|
|
"step": 100
|
|
},
|
|
{
|
|
"loss": 1.4957,
|
|
"grad_norm": 1.1212005615234375,
|
|
"learning_rate": 0.00018474576271186442,
|
|
"epoch": 0.1864406779661017,
|
|
"step": 110
|
|
},
|
|
{
|
|
"loss": 1.487,
|
|
"grad_norm": 1.2974339723587036,
|
|
"learning_rate": 0.00019999990182555336,
|
|
"epoch": 0.2033898305084746,
|
|
"step": 120
|
|
},
|
|
{
|
|
"loss": 1.4686,
|
|
"grad_norm": 1.0796626806259155,
|
|
"learning_rate": 0.00019998812112519715,
|
|
"epoch": 0.22033898305084745,
|
|
"step": 130
|
|
},
|
|
{
|
|
"loss": 1.4282,
|
|
"grad_norm": 1.0553802251815796,
|
|
"learning_rate": 0.00019995670818593407,
|
|
"epoch": 0.23728813559322035,
|
|
"step": 140
|
|
},
|
|
{
|
|
"loss": 1.4437,
|
|
"grad_norm": 1.0136256217956543,
|
|
"learning_rate": 0.00019990566917556007,
|
|
"epoch": 0.2542372881355932,
|
|
"step": 150
|
|
},
|
|
{
|
|
"loss": 1.4265,
|
|
"grad_norm": 0.9413333535194397,
|
|
"learning_rate": 0.00019983501411536606,
|
|
"epoch": 0.2711864406779661,
|
|
"step": 160
|
|
},
|
|
{
|
|
"loss": 1.427,
|
|
"grad_norm": 0.9628981947898865,
|
|
"learning_rate": 0.00019974475687817018,
|
|
"epoch": 0.288135593220339,
|
|
"step": 170
|
|
},
|
|
{
|
|
"loss": 1.3973,
|
|
"grad_norm": 0.9879967570304871,
|
|
"learning_rate": 0.00019963491518559394,
|
|
"epoch": 0.3050847457627119,
|
|
"step": 180
|
|
},
|
|
{
|
|
"loss": 1.3989,
|
|
"grad_norm": 0.8920490145683289,
|
|
"learning_rate": 0.00019950551060458283,
|
|
"epoch": 0.3220338983050847,
|
|
"step": 190
|
|
},
|
|
{
|
|
"loss": 1.4089,
|
|
"grad_norm": 0.8335606455802917,
|
|
"learning_rate": 0.00019935656854317148,
|
|
"epoch": 0.3389830508474576,
|
|
"step": 200
|
|
},
|
|
{
|
|
"loss": 1.4301,
|
|
"grad_norm": 0.8877090215682983,
|
|
"learning_rate": 0.00019918811824549512,
|
|
"epoch": 0.3559322033898305,
|
|
"step": 210
|
|
},
|
|
{
|
|
"loss": 1.4483,
|
|
"grad_norm": 0.8929612040519714,
|
|
"learning_rate": 0.0001990001927860475,
|
|
"epoch": 0.3728813559322034,
|
|
"step": 220
|
|
},
|
|
{
|
|
"loss": 1.3764,
|
|
"grad_norm": 0.9452272057533264,
|
|
"learning_rate": 0.0001987928290631869,
|
|
"epoch": 0.3898305084745763,
|
|
"step": 230
|
|
},
|
|
{
|
|
"eval_loss": 1.3814609050750732,
|
|
"eval_runtime": 24.8496,
|
|
"eval_samples_per_second": 39.96,
|
|
"eval_steps_per_second": 10.02,
|
|
"epoch": 0.3983050847457627,
|
|
"step": 235
|
|
},
|
|
{
|
|
"loss": 1.4278,
|
|
"grad_norm": 0.9019148945808411,
|
|
"learning_rate": 0.00019856606779189128,
|
|
"epoch": 0.4067796610169492,
|
|
"step": 240
|
|
},
|
|
{
|
|
"loss": 1.357,
|
|
"grad_norm": 0.89701908826828,
|
|
"learning_rate": 0.00019831995349576408,
|
|
"epoch": 0.423728813559322,
|
|
"step": 250
|
|
},
|
|
{
|
|
"loss": 1.3846,
|
|
"grad_norm": 0.8912389278411865,
|
|
"learning_rate": 0.00019805453449829217,
|
|
"epoch": 0.4406779661016949,
|
|
"step": 260
|
|
},
|
|
{
|
|
"loss": 1.421,
|
|
"grad_norm": 0.8808926939964294,
|
|
"learning_rate": 0.0001977698629133578,
|
|
"epoch": 0.4576271186440678,
|
|
"step": 270
|
|
},
|
|
{
|
|
"loss": 1.3986,
|
|
"grad_norm": 0.86872798204422,
|
|
"learning_rate": 0.00019746599463500616,
|
|
"epoch": 0.4745762711864407,
|
|
"step": 280
|
|
},
|
|
{
|
|
"loss": 1.3493,
|
|
"grad_norm": 0.8555623292922974,
|
|
"learning_rate": 0.00019714298932647098,
|
|
"epoch": 0.4915254237288136,
|
|
"step": 290
|
|
},
|
|
{
|
|
"loss": 1.3591,
|
|
"grad_norm": 0.8310695290565491,
|
|
"learning_rate": 0.00019680091040845981,
|
|
"epoch": 0.5084745762711864,
|
|
"step": 300
|
|
},
|
|
{
|
|
"loss": 1.3719,
|
|
"grad_norm": 0.8844895958900452,
|
|
"learning_rate": 0.00019643982504670158,
|
|
"epoch": 0.5254237288135594,
|
|
"step": 310
|
|
},
|
|
{
|
|
"loss": 1.3581,
|
|
"grad_norm": 0.8830392360687256,
|
|
"learning_rate": 0.00019605980413875897,
|
|
"epoch": 0.5423728813559322,
|
|
"step": 320
|
|
},
|
|
{
|
|
"loss": 1.3904,
|
|
"grad_norm": 0.9279443025588989,
|
|
"learning_rate": 0.00019566092230010807,
|
|
"epoch": 0.559322033898305,
|
|
"step": 330
|
|
},
|
|
{
|
|
"loss": 1.3836,
|
|
"grad_norm": 0.9330219030380249,
|
|
"learning_rate": 0.0001952432578494877,
|
|
"epoch": 0.576271186440678,
|
|
"step": 340
|
|
},
|
|
{
|
|
"loss": 1.3549,
|
|
"grad_norm": 0.9056932330131531,
|
|
"learning_rate": 0.00019480689279352217,
|
|
"epoch": 0.5932203389830508,
|
|
"step": 350
|
|
},
|
|
{
|
|
"loss": 1.3738,
|
|
"grad_norm": 0.8885744214057922,
|
|
"learning_rate": 0.0001943519128106194,
|
|
"epoch": 0.6101694915254238,
|
|
"step": 360
|
|
},
|
|
{
|
|
"loss": 1.3528,
|
|
"grad_norm": 0.8452779650688171,
|
|
"learning_rate": 0.00019387840723414837,
|
|
"epoch": 0.6271186440677966,
|
|
"step": 370
|
|
},
|
|
{
|
|
"loss": 1.349,
|
|
"grad_norm": 0.8285683989524841,
|
|
"learning_rate": 0.000193386469034899,
|
|
"epoch": 0.6440677966101694,
|
|
"step": 380
|
|
},
|
|
{
|
|
"loss": 1.3703,
|
|
"grad_norm": 0.8421545624732971,
|
|
"learning_rate": 0.00019287619480282765,
|
|
"epoch": 0.6610169491525424,
|
|
"step": 390
|
|
},
|
|
{
|
|
"loss": 1.3553,
|
|
"grad_norm": 0.859512984752655,
|
|
"learning_rate": 0.0001923476847280921,
|
|
"epoch": 0.6779661016949152,
|
|
"step": 400
|
|
},
|
|
{
|
|
"loss": 1.3314,
|
|
"grad_norm": 0.8403517007827759,
|
|
"learning_rate": 0.0001918010425813796,
|
|
"epoch": 0.6949152542372882,
|
|
"step": 410
|
|
},
|
|
{
|
|
"loss": 1.3263,
|
|
"grad_norm": 0.8627921938896179,
|
|
"learning_rate": 0.00019123637569353218,
|
|
"epoch": 0.711864406779661,
|
|
"step": 420
|
|
},
|
|
{
|
|
"loss": 1.3824,
|
|
"grad_norm": 0.847070574760437,
|
|
"learning_rate": 0.00019065379493447227,
|
|
"epoch": 0.7288135593220338,
|
|
"step": 430
|
|
},
|
|
{
|
|
"loss": 1.3063,
|
|
"grad_norm": 0.9412711262702942,
|
|
"learning_rate": 0.00019005341469143427,
|
|
"epoch": 0.7457627118644068,
|
|
"step": 440
|
|
},
|
|
{
|
|
"loss": 1.3312,
|
|
"grad_norm": 0.8335198163986206,
|
|
"learning_rate": 0.00018943535284650492,
|
|
"epoch": 0.7627118644067796,
|
|
"step": 450
|
|
},
|
|
{
|
|
"loss": 1.3276,
|
|
"grad_norm": 0.8702097535133362,
|
|
"learning_rate": 0.0001887997307534777,
|
|
"epoch": 0.7796610169491526,
|
|
"step": 460
|
|
},
|
|
{
|
|
"loss": 1.3201,
|
|
"grad_norm": 0.9178433418273926,
|
|
"learning_rate": 0.0001881466732140254,
|
|
"epoch": 0.7966101694915254,
|
|
"step": 470
|
|
},
|
|
{
|
|
"eval_loss": 1.3105889558792114,
|
|
"eval_runtime": 7.9984,
|
|
"eval_samples_per_second": 124.15,
|
|
"eval_steps_per_second": 31.131,
|
|
"epoch": 0.7966101694915254,
|
|
"step": 470
|
|
},
|
|
{
|
|
"loss": 1.3173,
|
|
"grad_norm": 0.8480702042579651,
|
|
"learning_rate": 0.00018747630845319612,
|
|
"epoch": 0.8135593220338984,
|
|
"step": 480
|
|
},
|
|
{
|
|
"loss": 1.3154,
|
|
"grad_norm": 0.922341525554657,
|
|
"learning_rate": 0.00018678876809423667,
|
|
"epoch": 0.8305084745762712,
|
|
"step": 490
|
|
},
|
|
{
|
|
"loss": 1.337,
|
|
"grad_norm": 0.8491071462631226,
|
|
"learning_rate": 0.00018608418713274874,
|
|
"epoch": 0.847457627118644,
|
|
"step": 500
|
|
},
|
|
{
|
|
"loss": 1.3255,
|
|
"grad_norm": 0.7910286784172058,
|
|
"learning_rate": 0.00018536270391018346,
|
|
"epoch": 0.864406779661017,
|
|
"step": 510
|
|
},
|
|
{
|
|
"loss": 1.3064,
|
|
"grad_norm": 0.8801857233047485,
|
|
"learning_rate": 0.00018462446008667843,
|
|
"epoch": 0.8813559322033898,
|
|
"step": 520
|
|
},
|
|
{
|
|
"loss": 1.2972,
|
|
"grad_norm": 0.8282411694526672,
|
|
"learning_rate": 0.00018386960061324325,
|
|
"epoch": 0.8983050847457628,
|
|
"step": 530
|
|
},
|
|
{
|
|
"loss": 1.3023,
|
|
"grad_norm": 0.8699902296066284,
|
|
"learning_rate": 0.00018309827370329928,
|
|
"epoch": 0.9152542372881356,
|
|
"step": 540
|
|
},
|
|
{
|
|
"loss": 1.3272,
|
|
"grad_norm": 0.8380703926086426,
|
|
"learning_rate": 0.0001823106308035784,
|
|
"epoch": 0.9322033898305084,
|
|
"step": 550
|
|
},
|
|
{
|
|
"loss": 1.2897,
|
|
"grad_norm": 0.8781632781028748,
|
|
"learning_rate": 0.00018150682656438715,
|
|
"epoch": 0.9491525423728814,
|
|
"step": 560
|
|
},
|
|
{
|
|
"loss": 1.2885,
|
|
"grad_norm": 0.7954381704330444,
|
|
"learning_rate": 0.00018068701880924178,
|
|
"epoch": 0.9661016949152542,
|
|
"step": 570
|
|
},
|
|
{
|
|
"loss": 1.2882,
|
|
"grad_norm": 0.8385308980941772,
|
|
"learning_rate": 0.00017985136850388024,
|
|
"epoch": 0.9830508474576272,
|
|
"step": 580
|
|
},
|
|
{
|
|
"loss": 1.3128,
|
|
"grad_norm": 2.6521894931793213,
|
|
"learning_rate": 0.00017900003972465736,
|
|
"epoch": 1.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"loss": 1.2167,
|
|
"grad_norm": 0.9211756587028503,
|
|
"learning_rate": 0.000178133199626329,
|
|
"epoch": 1.0169491525423728,
|
|
"step": 600
|
|
},
|
|
{
|
|
"loss": 1.2556,
|
|
"grad_norm": 0.835600733757019,
|
|
"learning_rate": 0.00017725101840923216,
|
|
"epoch": 1.0338983050847457,
|
|
"step": 610
|
|
},
|
|
{
|
|
"loss": 1.2466,
|
|
"grad_norm": 0.8683267831802368,
|
|
"learning_rate": 0.00017635366928586663,
|
|
"epoch": 1.0508474576271187,
|
|
"step": 620
|
|
},
|
|
{
|
|
"loss": 1.2525,
|
|
"grad_norm": 0.9456862211227417,
|
|
"learning_rate": 0.00017544132844688563,
|
|
"epoch": 1.0677966101694916,
|
|
"step": 630
|
|
},
|
|
{
|
|
"loss": 1.2045,
|
|
"grad_norm": 0.9511478543281555,
|
|
"learning_rate": 0.00017451417502650145,
|
|
"epoch": 1.0847457627118644,
|
|
"step": 640
|
|
},
|
|
{
|
|
"loss": 1.2505,
|
|
"grad_norm": 0.8709162473678589,
|
|
"learning_rate": 0.00017357239106731317,
|
|
"epoch": 1.1016949152542372,
|
|
"step": 650
|
|
},
|
|
{
|
|
"loss": 1.2588,
|
|
"grad_norm": 0.8981189727783203,
|
|
"learning_rate": 0.00017261616148456357,
|
|
"epoch": 1.11864406779661,
|
|
"step": 660
|
|
},
|
|
{
|
|
"loss": 1.2353,
|
|
"grad_norm": 0.8719836473464966,
|
|
"learning_rate": 0.00017164567402983152,
|
|
"epoch": 1.1355932203389831,
|
|
"step": 670
|
|
},
|
|
{
|
|
"loss": 1.2081,
|
|
"grad_norm": 0.8911289572715759,
|
|
"learning_rate": 0.0001706611192541681,
|
|
"epoch": 1.152542372881356,
|
|
"step": 680
|
|
},
|
|
{
|
|
"loss": 1.2441,
|
|
"grad_norm": 0.8637029528617859,
|
|
"learning_rate": 0.0001696626904706824,
|
|
"epoch": 1.1694915254237288,
|
|
"step": 690
|
|
},
|
|
{
|
|
"loss": 1.2492,
|
|
"grad_norm": 0.8860388994216919,
|
|
"learning_rate": 0.00016865058371658557,
|
|
"epoch": 1.1864406779661016,
|
|
"step": 700
|
|
},
|
|
{
|
|
"eval_loss": 1.2774409055709839,
|
|
"eval_runtime": 7.9016,
|
|
"eval_samples_per_second": 125.67,
|
|
"eval_steps_per_second": 31.513,
|
|
"epoch": 1.194915254237288,
|
|
"step": 705
|
|
},
|
|
{
|
|
"loss": 1.2269,
|
|
"grad_norm": 0.8600966930389404,
|
|
"learning_rate": 0.00016762499771469957,
|
|
"epoch": 1.2033898305084745,
|
|
"step": 710
|
|
},
|
|
{
|
|
"loss": 1.2562,
|
|
"grad_norm": 0.8858769536018372,
|
|
"learning_rate": 0.0001665861338344389,
|
|
"epoch": 1.2203389830508475,
|
|
"step": 720
|
|
},
|
|
{
|
|
"loss": 1.242,
|
|
"grad_norm": 0.8999311327934265,
|
|
"learning_rate": 0.0001655341960522726,
|
|
"epoch": 1.2372881355932204,
|
|
"step": 730
|
|
},
|
|
{
|
|
"loss": 1.213,
|
|
"grad_norm": 0.914777398109436,
|
|
"learning_rate": 0.00016446939091167422,
|
|
"epoch": 1.2542372881355932,
|
|
"step": 740
|
|
},
|
|
{
|
|
"loss": 1.2392,
|
|
"grad_norm": 1.0026013851165771,
|
|
"learning_rate": 0.00016339192748256802,
|
|
"epoch": 1.271186440677966,
|
|
"step": 750
|
|
},
|
|
{
|
|
"loss": 1.2372,
|
|
"grad_norm": 0.9266188740730286,
|
|
"learning_rate": 0.0001623020173202789,
|
|
"epoch": 1.288135593220339,
|
|
"step": 760
|
|
},
|
|
{
|
|
"loss": 1.2391,
|
|
"grad_norm": 0.8796271681785583,
|
|
"learning_rate": 0.00016119987442399456,
|
|
"epoch": 1.305084745762712,
|
|
"step": 770
|
|
},
|
|
{
|
|
"loss": 1.2327,
|
|
"grad_norm": 0.9744959473609924,
|
|
"learning_rate": 0.00016008571519474742,
|
|
"epoch": 1.3220338983050848,
|
|
"step": 780
|
|
},
|
|
{
|
|
"loss": 1.2461,
|
|
"grad_norm": 0.9354102611541748,
|
|
"learning_rate": 0.0001589597583929255,
|
|
"epoch": 1.3389830508474576,
|
|
"step": 790
|
|
},
|
|
{
|
|
"loss": 1.233,
|
|
"grad_norm": 0.8850792050361633,
|
|
"learning_rate": 0.0001578222250953195,
|
|
"epoch": 1.3559322033898304,
|
|
"step": 800
|
|
},
|
|
{
|
|
"loss": 1.2353,
|
|
"grad_norm": 0.9097703695297241,
|
|
"learning_rate": 0.00015667333865171558,
|
|
"epoch": 1.3728813559322033,
|
|
"step": 810
|
|
},
|
|
{
|
|
"loss": 1.2464,
|
|
"grad_norm": 0.9092051386833191,
|
|
"learning_rate": 0.00015551332464104126,
|
|
"epoch": 1.3898305084745763,
|
|
"step": 820
|
|
},
|
|
{
|
|
"loss": 1.2245,
|
|
"grad_norm": 0.9042219519615173,
|
|
"learning_rate": 0.0001543424108270743,
|
|
"epoch": 1.4067796610169492,
|
|
"step": 830
|
|
},
|
|
{
|
|
"loss": 1.2424,
|
|
"grad_norm": 0.851340115070343,
|
|
"learning_rate": 0.00015316082711372205,
|
|
"epoch": 1.423728813559322,
|
|
"step": 840
|
|
},
|
|
{
|
|
"loss": 1.2214,
|
|
"grad_norm": 0.8824617266654968,
|
|
"learning_rate": 0.00015196880549988082,
|
|
"epoch": 1.4406779661016949,
|
|
"step": 850
|
|
},
|
|
{
|
|
"loss": 1.2169,
|
|
"grad_norm": 0.918869137763977,
|
|
"learning_rate": 0.000150766580033884,
|
|
"epoch": 1.457627118644068,
|
|
"step": 860
|
|
},
|
|
{
|
|
"loss": 1.2096,
|
|
"grad_norm": 0.9275985360145569,
|
|
"learning_rate": 0.00014955438676754755,
|
|
"epoch": 1.4745762711864407,
|
|
"step": 870
|
|
},
|
|
{
|
|
"loss": 1.1924,
|
|
"grad_norm": 0.9207339882850647,
|
|
"learning_rate": 0.00014833246370982237,
|
|
"epoch": 1.4915254237288136,
|
|
"step": 880
|
|
},
|
|
{
|
|
"loss": 1.2259,
|
|
"grad_norm": 0.9242911338806152,
|
|
"learning_rate": 0.00014710105078006205,
|
|
"epoch": 1.5084745762711864,
|
|
"step": 890
|
|
},
|
|
{
|
|
"loss": 1.193,
|
|
"grad_norm": 0.8754140734672546,
|
|
"learning_rate": 0.000145860389760916,
|
|
"epoch": 1.5254237288135593,
|
|
"step": 900
|
|
},
|
|
{
|
|
"loss": 1.2542,
|
|
"grad_norm": 0.8747695684432983,
|
|
"learning_rate": 0.00014461072425085627,
|
|
"epoch": 1.542372881355932,
|
|
"step": 910
|
|
},
|
|
{
|
|
"loss": 1.2102,
|
|
"grad_norm": 0.9027137160301208,
|
|
"learning_rate": 0.00014335229961634808,
|
|
"epoch": 1.559322033898305,
|
|
"step": 920
|
|
},
|
|
{
|
|
"loss": 1.2016,
|
|
"grad_norm": 0.9713094830513,
|
|
"learning_rate": 0.00014208536294367326,
|
|
"epoch": 1.576271186440678,
|
|
"step": 930
|
|
},
|
|
{
|
|
"loss": 1.203,
|
|
"grad_norm": 0.9080752730369568,
|
|
"learning_rate": 0.00014081016299041576,
|
|
"epoch": 1.5932203389830508,
|
|
"step": 940
|
|
},
|
|
{
|
|
"eval_loss": 1.2493342161178589,
|
|
"eval_runtime": 8.029,
|
|
"eval_samples_per_second": 123.676,
|
|
"eval_steps_per_second": 31.012,
|
|
"epoch": 1.5932203389830508,
|
|
"step": 940
|
|
},
|
|
{
|
|
"loss": 1.1835,
|
|
"grad_norm": 0.8457156419754028,
|
|
"learning_rate": 0.0001395269501366193,
|
|
"epoch": 1.6101694915254239,
|
|
"step": 950
|
|
},
|
|
{
|
|
"loss": 1.2195,
|
|
"grad_norm": 0.9038828015327454,
|
|
"learning_rate": 0.0001382359763356262,
|
|
"epoch": 1.6271186440677967,
|
|
"step": 960
|
|
},
|
|
{
|
|
"loss": 1.2371,
|
|
"grad_norm": 0.8834562301635742,
|
|
"learning_rate": 0.00013693749506460756,
|
|
"epoch": 1.6440677966101696,
|
|
"step": 970
|
|
},
|
|
{
|
|
"loss": 1.2204,
|
|
"grad_norm": 0.9082645773887634,
|
|
"learning_rate": 0.00013563176127479403,
|
|
"epoch": 1.6610169491525424,
|
|
"step": 980
|
|
},
|
|
{
|
|
"loss": 1.2343,
|
|
"grad_norm": 0.8829030990600586,
|
|
"learning_rate": 0.00013431903134141713,
|
|
"epoch": 1.6779661016949152,
|
|
"step": 990
|
|
},
|
|
{
|
|
"loss": 1.2114,
|
|
"grad_norm": 0.9125088453292847,
|
|
"learning_rate": 0.00013299956301337132,
|
|
"epoch": 1.694915254237288,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"loss": 1.2313,
|
|
"grad_norm": 0.898687481880188,
|
|
"learning_rate": 0.00013167361536260585,
|
|
"epoch": 1.711864406779661,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"loss": 1.2132,
|
|
"grad_norm": 0.8782442808151245,
|
|
"learning_rate": 0.0001303414487332573,
|
|
"epoch": 1.7288135593220337,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"loss": 1.2084,
|
|
"grad_norm": 0.8889365196228027,
|
|
"learning_rate": 0.00012900332469053193,
|
|
"epoch": 1.7457627118644068,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"loss": 1.2034,
|
|
"grad_norm": 0.9221587777137756,
|
|
"learning_rate": 0.0001276595059693487,
|
|
"epoch": 1.7627118644067796,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"loss": 1.2107,
|
|
"grad_norm": 0.9170616865158081,
|
|
"learning_rate": 0.00012631025642275212,
|
|
"epoch": 1.7796610169491527,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"loss": 1.2134,
|
|
"grad_norm": 0.9416205883026123,
|
|
"learning_rate": 0.00012495584097010616,
|
|
"epoch": 1.7966101694915255,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"loss": 1.1813,
|
|
"grad_norm": 0.9464443325996399,
|
|
"learning_rate": 0.0001235965255450781,
|
|
"epoch": 1.8135593220338984,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"loss": 1.2182,
|
|
"grad_norm": 0.8776165246963501,
|
|
"learning_rate": 0.00012223257704342395,
|
|
"epoch": 1.8305084745762712,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"loss": 1.2025,
|
|
"grad_norm": 0.9294790029525757,
|
|
"learning_rate": 0.0001208642632705844,
|
|
"epoch": 1.847457627118644,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"loss": 1.1823,
|
|
"grad_norm": 0.9223579168319702,
|
|
"learning_rate": 0.00011949185288910236,
|
|
"epoch": 1.8644067796610169,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"loss": 1.2013,
|
|
"grad_norm": 0.8838573694229126,
|
|
"learning_rate": 0.00011811561536587244,
|
|
"epoch": 1.8813559322033897,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"loss": 1.2051,
|
|
"grad_norm": 0.9253844618797302,
|
|
"learning_rate": 0.00011673582091923192,
|
|
"epoch": 1.8983050847457628,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"loss": 1.1961,
|
|
"grad_norm": 0.9401239156723022,
|
|
"learning_rate": 0.00011535274046590492,
|
|
"epoch": 1.9152542372881356,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"loss": 1.1937,
|
|
"grad_norm": 0.9650959968566895,
|
|
"learning_rate": 0.00011396664556780878,
|
|
"epoch": 1.9322033898305084,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"loss": 1.2192,
|
|
"grad_norm": 0.9026353359222412,
|
|
"learning_rate": 0.00011257780837873417,
|
|
"epoch": 1.9491525423728815,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"loss": 1.2247,
|
|
"grad_norm": 0.8826860189437866,
|
|
"learning_rate": 0.00011118650159090887,
|
|
"epoch": 1.9661016949152543,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"loss": 1.1997,
|
|
"grad_norm": 0.8998943567276001,
|
|
"learning_rate": 0.00010979299838145574,
|
|
"epoch": 1.9830508474576272,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"eval_loss": 1.2287755012512207,
|
|
"eval_runtime": 7.9784,
|
|
"eval_samples_per_second": 124.461,
|
|
"eval_steps_per_second": 31.209,
|
|
"epoch": 1.9915254237288136,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"loss": 1.1501,
|
|
"grad_norm": 2.670851469039917,
|
|
"learning_rate": 0.00010839757235875563,
|
|
"epoch": 2.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"loss": 1.1174,
|
|
"grad_norm": 0.9717442989349365,
|
|
"learning_rate": 0.00010700049750872557,
|
|
"epoch": 2.016949152542373,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"loss": 1.1334,
|
|
"grad_norm": 0.9300686120986938,
|
|
"learning_rate": 0.00010560204814102266,
|
|
"epoch": 2.0338983050847457,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"loss": 1.1035,
|
|
"grad_norm": 0.9468068480491638,
|
|
"learning_rate": 0.00010420249883518476,
|
|
"epoch": 2.0508474576271185,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"loss": 1.1228,
|
|
"grad_norm": 1.0294302701950073,
|
|
"learning_rate": 0.00010280212438671784,
|
|
"epoch": 2.0677966101694913,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"loss": 1.13,
|
|
"grad_norm": 0.9107387661933899,
|
|
"learning_rate": 0.00010140119975314102,
|
|
"epoch": 2.084745762711864,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"loss": 1.1378,
|
|
"grad_norm": 0.9056633114814758,
|
|
"learning_rate": 0.0001,
|
|
"epoch": 2.1016949152542375,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"loss": 1.1515,
|
|
"grad_norm": 1.0032331943511963,
|
|
"learning_rate": 9.8598800246859e-05,
|
|
"epoch": 2.1186440677966103,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"loss": 1.1338,
|
|
"grad_norm": 0.951280951499939,
|
|
"learning_rate": 9.719787561328217e-05,
|
|
"epoch": 2.135593220338983,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"loss": 1.1398,
|
|
"grad_norm": 0.9472972750663757,
|
|
"learning_rate": 9.579750116481526e-05,
|
|
"epoch": 2.152542372881356,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"loss": 1.1041,
|
|
"grad_norm": 0.9140039682388306,
|
|
"learning_rate": 9.439795185897736e-05,
|
|
"epoch": 2.169491525423729,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"loss": 1.142,
|
|
"grad_norm": 0.9762586355209351,
|
|
"learning_rate": 9.29995024912745e-05,
|
|
"epoch": 2.1864406779661016,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"loss": 1.1473,
|
|
"grad_norm": 0.9565635323524475,
|
|
"learning_rate": 9.160242764124439e-05,
|
|
"epoch": 2.2033898305084745,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"loss": 1.1507,
|
|
"grad_norm": 0.970757246017456,
|
|
"learning_rate": 9.020700161854429e-05,
|
|
"epoch": 2.2203389830508473,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"loss": 1.1514,
|
|
"grad_norm": 1.0047391653060913,
|
|
"learning_rate": 8.881349840909116e-05,
|
|
"epoch": 2.23728813559322,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"loss": 1.1218,
|
|
"grad_norm": 0.9501739740371704,
|
|
"learning_rate": 8.742219162126587e-05,
|
|
"epoch": 2.2542372881355934,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"loss": 1.1025,
|
|
"grad_norm": 0.9464960098266602,
|
|
"learning_rate": 8.603335443219125e-05,
|
|
"epoch": 2.2711864406779663,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"loss": 1.1101,
|
|
"grad_norm": 0.973585844039917,
|
|
"learning_rate": 8.464725953409509e-05,
|
|
"epoch": 2.288135593220339,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"loss": 1.1109,
|
|
"grad_norm": 0.9535447955131531,
|
|
"learning_rate": 8.326417908076811e-05,
|
|
"epoch": 2.305084745762712,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"loss": 1.1513,
|
|
"grad_norm": 0.9385268688201904,
|
|
"learning_rate": 8.188438463412761e-05,
|
|
"epoch": 2.3220338983050848,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"loss": 1.111,
|
|
"grad_norm": 0.9565103054046631,
|
|
"learning_rate": 8.050814711089764e-05,
|
|
"epoch": 2.3389830508474576,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"loss": 1.1122,
|
|
"grad_norm": 0.934618353843689,
|
|
"learning_rate": 7.913573672941563e-05,
|
|
"epoch": 2.3559322033898304,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"loss": 1.1029,
|
|
"grad_norm": 0.9700987935066223,
|
|
"learning_rate": 7.776742295657608e-05,
|
|
"epoch": 2.3728813559322033,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"loss": 1.1159,
|
|
"grad_norm": 0.9688590168952942,
|
|
"learning_rate": 7.640347445492192e-05,
|
|
"epoch": 2.389830508474576,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"eval_loss": 1.222853660583496,
|
|
"eval_runtime": 7.9699,
|
|
"eval_samples_per_second": 124.594,
|
|
"eval_steps_per_second": 31.243,
|
|
"epoch": 2.389830508474576,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"loss": 1.0749,
|
|
"grad_norm": 0.9457218050956726,
|
|
"learning_rate": 7.504415902989386e-05,
|
|
"epoch": 2.406779661016949,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"loss": 1.1272,
|
|
"grad_norm": 0.9391158223152161,
|
|
"learning_rate": 7.368974357724789e-05,
|
|
"epoch": 2.423728813559322,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"loss": 1.1083,
|
|
"grad_norm": 0.9238609671592712,
|
|
"learning_rate": 7.234049403065132e-05,
|
|
"epoch": 2.440677966101695,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"loss": 1.1058,
|
|
"grad_norm": 0.9600583910942078,
|
|
"learning_rate": 7.099667530946806e-05,
|
|
"epoch": 2.457627118644068,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"loss": 1.1281,
|
|
"grad_norm": 0.9885203838348389,
|
|
"learning_rate": 6.96585512667427e-05,
|
|
"epoch": 2.4745762711864407,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"loss": 1.1248,
|
|
"grad_norm": 0.9941011071205139,
|
|
"learning_rate": 6.832638463739418e-05,
|
|
"epoch": 2.4915254237288136,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"loss": 1.1343,
|
|
"grad_norm": 0.9795677065849304,
|
|
"learning_rate": 6.700043698662873e-05,
|
|
"epoch": 2.5084745762711864,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"loss": 1.118,
|
|
"grad_norm": 0.948656439781189,
|
|
"learning_rate": 6.568096865858289e-05,
|
|
"epoch": 2.5254237288135593,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"loss": 1.139,
|
|
"grad_norm": 0.9555015563964844,
|
|
"learning_rate": 6.4368238725206e-05,
|
|
"epoch": 2.542372881355932,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"loss": 1.1327,
|
|
"grad_norm": 0.9538373351097107,
|
|
"learning_rate": 6.306250493539246e-05,
|
|
"epoch": 2.559322033898305,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"loss": 1.1152,
|
|
"grad_norm": 0.9733698964118958,
|
|
"learning_rate": 6.176402366437382e-05,
|
|
"epoch": 2.576271186440678,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"loss": 1.1426,
|
|
"grad_norm": 0.9399760961532593,
|
|
"learning_rate": 6.047304986338071e-05,
|
|
"epoch": 2.593220338983051,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"loss": 1.1103,
|
|
"grad_norm": 1.0162689685821533,
|
|
"learning_rate": 5.918983700958425e-05,
|
|
"epoch": 2.610169491525424,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"loss": 1.1153,
|
|
"grad_norm": 0.9900780916213989,
|
|
"learning_rate": 5.791463705632676e-05,
|
|
"epoch": 2.6271186440677967,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"loss": 1.0927,
|
|
"grad_norm": 0.9424988031387329,
|
|
"learning_rate": 5.664770038365195e-05,
|
|
"epoch": 2.6440677966101696,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"loss": 1.113,
|
|
"grad_norm": 0.9367265701293945,
|
|
"learning_rate": 5.538927574914376e-05,
|
|
"epoch": 2.6610169491525424,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"loss": 1.1372,
|
|
"grad_norm": 0.9402151107788086,
|
|
"learning_rate": 5.413961023908401e-05,
|
|
"epoch": 2.6779661016949152,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"loss": 1.1264,
|
|
"grad_norm": 1.0578895807266235,
|
|
"learning_rate": 5.2898949219937976e-05,
|
|
"epoch": 2.694915254237288,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"loss": 1.1081,
|
|
"grad_norm": 0.9745665192604065,
|
|
"learning_rate": 5.166753629017764e-05,
|
|
"epoch": 2.711864406779661,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"loss": 1.1313,
|
|
"grad_norm": 0.9517439603805542,
|
|
"learning_rate": 5.044561323245245e-05,
|
|
"epoch": 2.7288135593220337,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"loss": 1.1339,
|
|
"grad_norm": 0.9738871455192566,
|
|
"learning_rate": 4.9233419966116036e-05,
|
|
"epoch": 2.7457627118644066,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"loss": 1.1421,
|
|
"grad_norm": 0.9917513132095337,
|
|
"learning_rate": 4.803119450011919e-05,
|
|
"epoch": 2.7627118644067794,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"loss": 1.1158,
|
|
"grad_norm": 0.9498072266578674,
|
|
"learning_rate": 4.683917288627795e-05,
|
|
"epoch": 2.7796610169491527,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"eval_loss": 1.2087918519973755,
|
|
"eval_runtime": 7.9858,
|
|
"eval_samples_per_second": 124.345,
|
|
"eval_steps_per_second": 31.18,
|
|
"epoch": 2.788135593220339,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"loss": 1.0934,
|
|
"grad_norm": 0.9327065348625183,
|
|
"learning_rate": 4.56575891729257e-05,
|
|
"epoch": 2.7966101694915255,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"loss": 1.1269,
|
|
"grad_norm": 1.0235567092895508,
|
|
"learning_rate": 4.448667535895876e-05,
|
|
"epoch": 2.8135593220338984,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"loss": 1.0731,
|
|
"grad_norm": 0.9344584941864014,
|
|
"learning_rate": 4.332666134828444e-05,
|
|
"epoch": 2.830508474576271,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"loss": 1.1295,
|
|
"grad_norm": 1.0067960023880005,
|
|
"learning_rate": 4.2177774904680475e-05,
|
|
"epoch": 2.847457627118644,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"loss": 1.061,
|
|
"grad_norm": 0.960763692855835,
|
|
"learning_rate": 4.1040241607074516e-05,
|
|
"epoch": 2.864406779661017,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"loss": 1.093,
|
|
"grad_norm": 0.9818686842918396,
|
|
"learning_rate": 3.991428480525261e-05,
|
|
"epoch": 2.8813559322033897,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"loss": 1.1469,
|
|
"grad_norm": 0.9942337274551392,
|
|
"learning_rate": 3.880012557600547e-05,
|
|
"epoch": 2.898305084745763,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"loss": 1.1369,
|
|
"grad_norm": 1.022985577583313,
|
|
"learning_rate": 3.769798267972109e-05,
|
|
"epoch": 2.915254237288136,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"loss": 1.1009,
|
|
"grad_norm": 0.9834439754486084,
|
|
"learning_rate": 3.6608072517432013e-05,
|
|
"epoch": 2.9322033898305087,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"loss": 1.1134,
|
|
"grad_norm": 1.0133659839630127,
|
|
"learning_rate": 3.553060908832583e-05,
|
|
"epoch": 2.9491525423728815,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"loss": 1.0923,
|
|
"grad_norm": 0.9789726734161377,
|
|
"learning_rate": 3.4465803947727424e-05,
|
|
"epoch": 2.9661016949152543,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"loss": 1.1277,
|
|
"grad_norm": 0.9691158533096313,
|
|
"learning_rate": 3.341386616556109e-05,
|
|
"epoch": 2.983050847457627,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"loss": 1.1257,
|
|
"grad_norm": 3.6234147548675537,
|
|
"learning_rate": 3.237500228530045e-05,
|
|
"epoch": 3.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"loss": 1.0769,
|
|
"grad_norm": 1.0010051727294922,
|
|
"learning_rate": 3.1349416283414465e-05,
|
|
"epoch": 3.016949152542373,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"loss": 1.0647,
|
|
"grad_norm": 0.9725643396377563,
|
|
"learning_rate": 3.0337309529317604e-05,
|
|
"epoch": 3.0338983050847457,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"loss": 1.0718,
|
|
"grad_norm": 1.0454438924789429,
|
|
"learning_rate": 2.933888074583193e-05,
|
|
"epoch": 3.0508474576271185,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"loss": 1.0739,
|
|
"grad_norm": 0.9589056372642517,
|
|
"learning_rate": 2.8354325970168484e-05,
|
|
"epoch": 3.0677966101694913,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"loss": 1.0625,
|
|
"grad_norm": 0.9968867301940918,
|
|
"learning_rate": 2.7383838515436476e-05,
|
|
"epoch": 3.084745762711864,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"loss": 1.0468,
|
|
"grad_norm": 1.0534526109695435,
|
|
"learning_rate": 2.6427608932686843e-05,
|
|
"epoch": 3.1016949152542375,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"loss": 1.0377,
|
|
"grad_norm": 1.00564706325531,
|
|
"learning_rate": 2.5485824973498583e-05,
|
|
"epoch": 3.1186440677966103,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"loss": 1.0291,
|
|
"grad_norm": 0.9239162802696228,
|
|
"learning_rate": 2.4558671553114378e-05,
|
|
"epoch": 3.135593220338983,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"loss": 1.0185,
|
|
"grad_norm": 1.0018540620803833,
|
|
"learning_rate": 2.3646330714133393e-05,
|
|
"epoch": 3.152542372881356,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"loss": 1.0535,
|
|
"grad_norm": 0.9955319762229919,
|
|
"learning_rate": 2.274898159076785e-05,
|
|
"epoch": 3.169491525423729,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"loss": 1.0702,
|
|
"grad_norm": 1.0363454818725586,
|
|
"learning_rate": 2.1866800373671026e-05,
|
|
"epoch": 3.1864406779661016,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"eval_loss": 1.2089406251907349,
|
|
"eval_runtime": 7.9175,
|
|
"eval_samples_per_second": 125.419,
|
|
"eval_steps_per_second": 31.449,
|
|
"epoch": 3.1864406779661016,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"loss": 1.0637,
|
|
"grad_norm": 0.9806342124938965,
|
|
"learning_rate": 2.09999602753427e-05,
|
|
"epoch": 3.2033898305084745,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"loss": 1.0281,
|
|
"grad_norm": 0.9875054359436035,
|
|
"learning_rate": 2.0148631496119784e-05,
|
|
"epoch": 3.2203389830508473,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"loss": 1.0812,
|
|
"grad_norm": 0.9922574162483215,
|
|
"learning_rate": 1.9312981190758228e-05,
|
|
"epoch": 3.23728813559322,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"loss": 1.0527,
|
|
"grad_norm": 0.9941257834434509,
|
|
"learning_rate": 1.8493173435612843e-05,
|
|
"epoch": 3.2542372881355934,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"loss": 1.052,
|
|
"grad_norm": 1.0203768014907837,
|
|
"learning_rate": 1.7689369196421613e-05,
|
|
"epoch": 3.2711864406779663,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"loss": 1.057,
|
|
"grad_norm": 1.0592260360717773,
|
|
"learning_rate": 1.6901726296700736e-05,
|
|
"epoch": 3.288135593220339,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"loss": 1.0604,
|
|
"grad_norm": 0.9808682799339294,
|
|
"learning_rate": 1.6130399386756766e-05,
|
|
"epoch": 3.305084745762712,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"loss": 1.0808,
|
|
"grad_norm": 0.9782573580741882,
|
|
"learning_rate": 1.5375539913321602e-05,
|
|
"epoch": 3.3220338983050848,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"loss": 1.0548,
|
|
"grad_norm": 1.0036771297454834,
|
|
"learning_rate": 1.4637296089816543e-05,
|
|
"epoch": 3.3389830508474576,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"loss": 1.0535,
|
|
"grad_norm": 1.0585639476776123,
|
|
"learning_rate": 1.3915812867251266e-05,
|
|
"epoch": 3.3559322033898304,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"loss": 1.0957,
|
|
"grad_norm": 1.0012156963348389,
|
|
"learning_rate": 1.3211231905763355e-05,
|
|
"epoch": 3.3728813559322033,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"loss": 1.0576,
|
|
"grad_norm": 0.9416826367378235,
|
|
"learning_rate": 1.2523691546803873e-05,
|
|
"epoch": 3.389830508474576,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"loss": 1.0505,
|
|
"grad_norm": 0.9698029160499573,
|
|
"learning_rate": 1.1853326785974628e-05,
|
|
"epoch": 3.406779661016949,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"loss": 1.0376,
|
|
"grad_norm": 0.9895077347755432,
|
|
"learning_rate": 1.1200269246522343e-05,
|
|
"epoch": 3.423728813559322,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"loss": 1.0493,
|
|
"grad_norm": 0.9916106462478638,
|
|
"learning_rate": 1.0564647153495088e-05,
|
|
"epoch": 3.440677966101695,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"loss": 1.1029,
|
|
"grad_norm": 1.044202208518982,
|
|
"learning_rate": 9.946585308565747e-06,
|
|
"epoch": 3.457627118644068,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"loss": 1.0658,
|
|
"grad_norm": 1.0428624153137207,
|
|
"learning_rate": 9.346205065527769e-06,
|
|
"epoch": 3.4745762711864407,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"loss": 1.0633,
|
|
"grad_norm": 1.0308985710144043,
|
|
"learning_rate": 8.763624306467844e-06,
|
|
"epoch": 3.4915254237288136,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"loss": 1.0406,
|
|
"grad_norm": 1.0223889350891113,
|
|
"learning_rate": 8.198957418620401e-06,
|
|
"epoch": 3.5084745762711864,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"loss": 1.0064,
|
|
"grad_norm": 1.0316060781478882,
|
|
"learning_rate": 7.652315271907929e-06,
|
|
"epoch": 3.5254237288135593,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"loss": 1.0495,
|
|
"grad_norm": 0.9965396523475647,
|
|
"learning_rate": 7.1238051971723504e-06,
|
|
"epoch": 3.542372881355932,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"loss": 1.0434,
|
|
"grad_norm": 0.9939844012260437,
|
|
"learning_rate": 6.613530965101e-06,
|
|
"epoch": 3.559322033898305,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"loss": 1.0728,
|
|
"grad_norm": 0.9969953298568726,
|
|
"learning_rate": 6.121592765851647e-06,
|
|
"epoch": 3.576271186440678,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"eval_loss": 1.206390380859375,
|
|
"eval_runtime": 7.9452,
|
|
"eval_samples_per_second": 124.982,
|
|
"eval_steps_per_second": 31.34,
|
|
"epoch": 3.584745762711864,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"loss": 1.027,
|
|
"grad_norm": 1.0296121835708618,
|
|
"learning_rate": 5.648087189380613e-06,
|
|
"epoch": 3.593220338983051,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"loss": 1.0543,
|
|
"grad_norm": 0.9752156734466553,
|
|
"learning_rate": 5.193107206477821e-06,
|
|
"epoch": 3.610169491525424,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"loss": 1.0772,
|
|
"grad_norm": 1.014767050743103,
|
|
"learning_rate": 4.756742150512305e-06,
|
|
"epoch": 3.6271186440677967,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"loss": 1.0694,
|
|
"grad_norm": 0.9678983688354492,
|
|
"learning_rate": 4.339077699891969e-06,
|
|
"epoch": 3.6440677966101696,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"loss": 1.0381,
|
|
"grad_norm": 0.9780063033103943,
|
|
"learning_rate": 3.940195861241036e-06,
|
|
"epoch": 3.6610169491525424,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"loss": 1.0478,
|
|
"grad_norm": 1.011366367340088,
|
|
"learning_rate": 3.560174953298434e-06,
|
|
"epoch": 3.6779661016949152,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"loss": 1.057,
|
|
"grad_norm": 0.9899730086326599,
|
|
"learning_rate": 3.199089591540194e-06,
|
|
"epoch": 3.694915254237288,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"loss": 1.0506,
|
|
"grad_norm": 1.040940761566162,
|
|
"learning_rate": 2.857010673529015e-06,
|
|
"epoch": 3.711864406779661,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"loss": 1.0845,
|
|
"grad_norm": 0.9783785343170166,
|
|
"learning_rate": 2.5340053649938523e-06,
|
|
"epoch": 3.7288135593220337,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"loss": 1.058,
|
|
"grad_norm": 1.0626617670059204,
|
|
"learning_rate": 2.2301370866422256e-06,
|
|
"epoch": 3.7457627118644066,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"loss": 1.0401,
|
|
"grad_norm": 1.0348941087722778,
|
|
"learning_rate": 1.9454655017078438e-06,
|
|
"epoch": 3.7627118644067794,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"loss": 1.0433,
|
|
"grad_norm": 0.9590414762496948,
|
|
"learning_rate": 1.6800465042359325e-06,
|
|
"epoch": 3.7796610169491527,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"loss": 1.0658,
|
|
"grad_norm": 1.0622143745422363,
|
|
"learning_rate": 1.4339322081087236e-06,
|
|
"epoch": 3.7966101694915255,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"loss": 1.0412,
|
|
"grad_norm": 1.0225697755813599,
|
|
"learning_rate": 1.2071709368131068e-06,
|
|
"epoch": 3.8135593220338984,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"loss": 1.0589,
|
|
"grad_norm": 1.0060772895812988,
|
|
"learning_rate": 9.998072139525084e-07,
|
|
"epoch": 3.830508474576271,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"loss": 1.0806,
|
|
"grad_norm": 0.9955238103866577,
|
|
"learning_rate": 8.118817545048952e-07,
|
|
"epoch": 3.847457627118644,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"loss": 1.0534,
|
|
"grad_norm": 0.995814323425293,
|
|
"learning_rate": 6.434314568285249e-07,
|
|
"epoch": 3.864406779661017,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"loss": 1.0518,
|
|
"grad_norm": 1.02264404296875,
|
|
"learning_rate": 4.944893954171859e-07,
|
|
"epoch": 3.8813559322033897,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"loss": 1.0461,
|
|
"grad_norm": 1.0243033170700073,
|
|
"learning_rate": 3.6508481440604703e-07,
|
|
"epoch": 3.898305084745763,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"loss": 1.0516,
|
|
"grad_norm": 1.0173128843307495,
|
|
"learning_rate": 2.55243121829829e-07,
|
|
"epoch": 3.915254237288136,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"loss": 1.0589,
|
|
"grad_norm": 1.0421204566955566,
|
|
"learning_rate": 1.6498588463392806e-07,
|
|
"epoch": 3.9322033898305087,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"loss": 1.0548,
|
|
"grad_norm": 1.024686574935913,
|
|
"learning_rate": 9.433082443991437e-08,
|
|
"epoch": 3.9491525423728815,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"loss": 1.0619,
|
|
"grad_norm": 0.9935389161109924,
|
|
"learning_rate": 4.329181406593774e-08,
|
|
"epoch": 3.9661016949152543,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"loss": 1.0571,
|
|
"grad_norm": 0.9899278879165649,
|
|
"learning_rate": 1.1878874802873086e-08,
|
|
"epoch": 3.983050847457627,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"eval_loss": 1.2054944038391113,
|
|
"eval_runtime": 7.9797,
|
|
"eval_samples_per_second": 124.441,
|
|
"eval_steps_per_second": 31.204,
|
|
"epoch": 3.983050847457627,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"loss": 1.1068,
|
|
"grad_norm": 3.4382264614105225,
|
|
"learning_rate": 9.817444666104792e-11,
|
|
"epoch": 4.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"train_runtime": 470.094,
|
|
"train_samples_per_second": 160.402,
|
|
"train_steps_per_second": 5.02,
|
|
"total_flos": 1.617317761968e+16,
|
|
"train_loss": 1.2040837437419567,
|
|
"epoch": 4.0,
|
|
"step": 2360
|
|
}
|
|
] |