[ { "loss": 2.2046, "grad_norm": 6.064353942871094, "learning_rate": 1.5254237288135596e-05, "epoch": 0.01694915254237288, "step": 10 }, { "loss": 1.7911, "grad_norm": 2.307243824005127, "learning_rate": 3.2203389830508473e-05, "epoch": 0.03389830508474576, "step": 20 }, { "loss": 1.6879, "grad_norm": 1.7352138757705688, "learning_rate": 4.915254237288136e-05, "epoch": 0.05084745762711865, "step": 30 }, { "loss": 1.6432, "grad_norm": 1.2811769247055054, "learning_rate": 6.610169491525424e-05, "epoch": 0.06779661016949153, "step": 40 }, { "loss": 1.6039, "grad_norm": 1.2322453260421753, "learning_rate": 8.305084745762712e-05, "epoch": 0.0847457627118644, "step": 50 }, { "loss": 1.6104, "grad_norm": 1.3415359258651733, "learning_rate": 0.0001, "epoch": 0.1016949152542373, "step": 60 }, { "loss": 1.5699, "grad_norm": 1.2617424726486206, "learning_rate": 0.00011694915254237289, "epoch": 0.11864406779661017, "step": 70 }, { "loss": 1.5413, "grad_norm": 1.3335905075073242, "learning_rate": 0.00013389830508474577, "epoch": 0.13559322033898305, "step": 80 }, { "loss": 1.4973, "grad_norm": 1.3121675252914429, "learning_rate": 0.00015084745762711864, "epoch": 0.15254237288135594, "step": 90 }, { "loss": 1.5, "grad_norm": 1.1816385984420776, "learning_rate": 0.00016779661016949154, "epoch": 0.1694915254237288, "step": 100 }, { "loss": 1.4957, "grad_norm": 1.1212005615234375, "learning_rate": 0.00018474576271186442, "epoch": 0.1864406779661017, "step": 110 }, { "loss": 1.487, "grad_norm": 1.2974339723587036, "learning_rate": 0.00019999990182555336, "epoch": 0.2033898305084746, "step": 120 }, { "loss": 1.4686, "grad_norm": 1.0796626806259155, "learning_rate": 0.00019998812112519715, "epoch": 0.22033898305084745, "step": 130 }, { "loss": 1.4282, "grad_norm": 1.0553802251815796, "learning_rate": 0.00019995670818593407, "epoch": 0.23728813559322035, "step": 140 }, { "loss": 1.4437, "grad_norm": 1.0136256217956543, "learning_rate": 0.00019990566917556007, "epoch": 0.2542372881355932, "step": 150 }, { "loss": 1.4265, "grad_norm": 0.9413333535194397, "learning_rate": 0.00019983501411536606, "epoch": 0.2711864406779661, "step": 160 }, { "loss": 1.427, "grad_norm": 0.9628981947898865, "learning_rate": 0.00019974475687817018, "epoch": 0.288135593220339, "step": 170 }, { "loss": 1.3973, "grad_norm": 0.9879967570304871, "learning_rate": 0.00019963491518559394, "epoch": 0.3050847457627119, "step": 180 }, { "loss": 1.3989, "grad_norm": 0.8920490145683289, "learning_rate": 0.00019950551060458283, "epoch": 0.3220338983050847, "step": 190 }, { "loss": 1.4089, "grad_norm": 0.8335606455802917, "learning_rate": 0.00019935656854317148, "epoch": 0.3389830508474576, "step": 200 }, { "loss": 1.4301, "grad_norm": 0.8877090215682983, "learning_rate": 0.00019918811824549512, "epoch": 0.3559322033898305, "step": 210 }, { "loss": 1.4483, "grad_norm": 0.8929612040519714, "learning_rate": 0.0001990001927860475, "epoch": 0.3728813559322034, "step": 220 }, { "loss": 1.3764, "grad_norm": 0.9452272057533264, "learning_rate": 0.0001987928290631869, "epoch": 0.3898305084745763, "step": 230 }, { "eval_loss": 1.3814609050750732, "eval_runtime": 24.8496, "eval_samples_per_second": 39.96, "eval_steps_per_second": 10.02, "epoch": 0.3983050847457627, "step": 235 }, { "loss": 1.4278, "grad_norm": 0.9019148945808411, "learning_rate": 0.00019856606779189128, "epoch": 0.4067796610169492, "step": 240 }, { "loss": 1.357, "grad_norm": 0.89701908826828, "learning_rate": 0.00019831995349576408, "epoch": 0.423728813559322, "step": 250 }, { "loss": 1.3846, "grad_norm": 0.8912389278411865, "learning_rate": 0.00019805453449829217, "epoch": 0.4406779661016949, "step": 260 }, { "loss": 1.421, "grad_norm": 0.8808926939964294, "learning_rate": 0.0001977698629133578, "epoch": 0.4576271186440678, "step": 270 }, { "loss": 1.3986, "grad_norm": 0.86872798204422, "learning_rate": 0.00019746599463500616, "epoch": 0.4745762711864407, "step": 280 }, { "loss": 1.3493, "grad_norm": 0.8555623292922974, "learning_rate": 0.00019714298932647098, "epoch": 0.4915254237288136, "step": 290 }, { "loss": 1.3591, "grad_norm": 0.8310695290565491, "learning_rate": 0.00019680091040845981, "epoch": 0.5084745762711864, "step": 300 }, { "loss": 1.3719, "grad_norm": 0.8844895958900452, "learning_rate": 0.00019643982504670158, "epoch": 0.5254237288135594, "step": 310 }, { "loss": 1.3581, "grad_norm": 0.8830392360687256, "learning_rate": 0.00019605980413875897, "epoch": 0.5423728813559322, "step": 320 }, { "loss": 1.3904, "grad_norm": 0.9279443025588989, "learning_rate": 0.00019566092230010807, "epoch": 0.559322033898305, "step": 330 }, { "loss": 1.3836, "grad_norm": 0.9330219030380249, "learning_rate": 0.0001952432578494877, "epoch": 0.576271186440678, "step": 340 }, { "loss": 1.3549, "grad_norm": 0.9056932330131531, "learning_rate": 0.00019480689279352217, "epoch": 0.5932203389830508, "step": 350 }, { "loss": 1.3738, "grad_norm": 0.8885744214057922, "learning_rate": 0.0001943519128106194, "epoch": 0.6101694915254238, "step": 360 }, { "loss": 1.3528, "grad_norm": 0.8452779650688171, "learning_rate": 0.00019387840723414837, "epoch": 0.6271186440677966, "step": 370 }, { "loss": 1.349, "grad_norm": 0.8285683989524841, "learning_rate": 0.000193386469034899, "epoch": 0.6440677966101694, "step": 380 }, { "loss": 1.3703, "grad_norm": 0.8421545624732971, "learning_rate": 0.00019287619480282765, "epoch": 0.6610169491525424, "step": 390 }, { "loss": 1.3553, "grad_norm": 0.859512984752655, "learning_rate": 0.0001923476847280921, "epoch": 0.6779661016949152, "step": 400 }, { "loss": 1.3314, "grad_norm": 0.8403517007827759, "learning_rate": 0.0001918010425813796, "epoch": 0.6949152542372882, "step": 410 }, { "loss": 1.3263, "grad_norm": 0.8627921938896179, "learning_rate": 0.00019123637569353218, "epoch": 0.711864406779661, "step": 420 }, { "loss": 1.3824, "grad_norm": 0.847070574760437, "learning_rate": 0.00019065379493447227, "epoch": 0.7288135593220338, "step": 430 }, { "loss": 1.3063, "grad_norm": 0.9412711262702942, "learning_rate": 0.00019005341469143427, "epoch": 0.7457627118644068, "step": 440 }, { "loss": 1.3312, "grad_norm": 0.8335198163986206, "learning_rate": 0.00018943535284650492, "epoch": 0.7627118644067796, "step": 450 }, { "loss": 1.3276, "grad_norm": 0.8702097535133362, "learning_rate": 0.0001887997307534777, "epoch": 0.7796610169491526, "step": 460 }, { "loss": 1.3201, "grad_norm": 0.9178433418273926, "learning_rate": 0.0001881466732140254, "epoch": 0.7966101694915254, "step": 470 }, { "eval_loss": 1.3105889558792114, "eval_runtime": 7.9984, "eval_samples_per_second": 124.15, "eval_steps_per_second": 31.131, "epoch": 0.7966101694915254, "step": 470 }, { "loss": 1.3173, "grad_norm": 0.8480702042579651, "learning_rate": 0.00018747630845319612, "epoch": 0.8135593220338984, "step": 480 }, { "loss": 1.3154, "grad_norm": 0.922341525554657, "learning_rate": 0.00018678876809423667, "epoch": 0.8305084745762712, "step": 490 }, { "loss": 1.337, "grad_norm": 0.8491071462631226, "learning_rate": 0.00018608418713274874, "epoch": 0.847457627118644, "step": 500 }, { "loss": 1.3255, "grad_norm": 0.7910286784172058, "learning_rate": 0.00018536270391018346, "epoch": 0.864406779661017, "step": 510 }, { "loss": 1.3064, "grad_norm": 0.8801857233047485, "learning_rate": 0.00018462446008667843, "epoch": 0.8813559322033898, "step": 520 }, { "loss": 1.2972, "grad_norm": 0.8282411694526672, "learning_rate": 0.00018386960061324325, "epoch": 0.8983050847457628, "step": 530 }, { "loss": 1.3023, "grad_norm": 0.8699902296066284, "learning_rate": 0.00018309827370329928, "epoch": 0.9152542372881356, "step": 540 }, { "loss": 1.3272, "grad_norm": 0.8380703926086426, "learning_rate": 0.0001823106308035784, "epoch": 0.9322033898305084, "step": 550 }, { "loss": 1.2897, "grad_norm": 0.8781632781028748, "learning_rate": 0.00018150682656438715, "epoch": 0.9491525423728814, "step": 560 }, { "loss": 1.2885, "grad_norm": 0.7954381704330444, "learning_rate": 0.00018068701880924178, "epoch": 0.9661016949152542, "step": 570 }, { "loss": 1.2882, "grad_norm": 0.8385308980941772, "learning_rate": 0.00017985136850388024, "epoch": 0.9830508474576272, "step": 580 }, { "loss": 1.3128, "grad_norm": 2.6521894931793213, "learning_rate": 0.00017900003972465736, "epoch": 1.0, "step": 590 }, { "loss": 1.2167, "grad_norm": 0.9211756587028503, "learning_rate": 0.000178133199626329, "epoch": 1.0169491525423728, "step": 600 }, { "loss": 1.2556, "grad_norm": 0.835600733757019, "learning_rate": 0.00017725101840923216, "epoch": 1.0338983050847457, "step": 610 }, { "loss": 1.2466, "grad_norm": 0.8683267831802368, "learning_rate": 0.00017635366928586663, "epoch": 1.0508474576271187, "step": 620 }, { "loss": 1.2525, "grad_norm": 0.9456862211227417, "learning_rate": 0.00017544132844688563, "epoch": 1.0677966101694916, "step": 630 }, { "loss": 1.2045, "grad_norm": 0.9511478543281555, "learning_rate": 0.00017451417502650145, "epoch": 1.0847457627118644, "step": 640 }, { "loss": 1.2505, "grad_norm": 0.8709162473678589, "learning_rate": 0.00017357239106731317, "epoch": 1.1016949152542372, "step": 650 }, { "loss": 1.2588, "grad_norm": 0.8981189727783203, "learning_rate": 0.00017261616148456357, "epoch": 1.11864406779661, "step": 660 }, { "loss": 1.2353, "grad_norm": 0.8719836473464966, "learning_rate": 0.00017164567402983152, "epoch": 1.1355932203389831, "step": 670 }, { "loss": 1.2081, "grad_norm": 0.8911289572715759, "learning_rate": 0.0001706611192541681, "epoch": 1.152542372881356, "step": 680 }, { "loss": 1.2441, "grad_norm": 0.8637029528617859, "learning_rate": 0.0001696626904706824, "epoch": 1.1694915254237288, "step": 690 }, { "loss": 1.2492, "grad_norm": 0.8860388994216919, "learning_rate": 0.00016865058371658557, "epoch": 1.1864406779661016, "step": 700 }, { "eval_loss": 1.2774409055709839, "eval_runtime": 7.9016, "eval_samples_per_second": 125.67, "eval_steps_per_second": 31.513, "epoch": 1.194915254237288, "step": 705 }, { "loss": 1.2269, "grad_norm": 0.8600966930389404, "learning_rate": 0.00016762499771469957, "epoch": 1.2033898305084745, "step": 710 }, { "loss": 1.2562, "grad_norm": 0.8858769536018372, "learning_rate": 0.0001665861338344389, "epoch": 1.2203389830508475, "step": 720 }, { "loss": 1.242, "grad_norm": 0.8999311327934265, "learning_rate": 0.0001655341960522726, "epoch": 1.2372881355932204, "step": 730 }, { "loss": 1.213, "grad_norm": 0.914777398109436, "learning_rate": 0.00016446939091167422, "epoch": 1.2542372881355932, "step": 740 }, { "loss": 1.2392, "grad_norm": 1.0026013851165771, "learning_rate": 0.00016339192748256802, "epoch": 1.271186440677966, "step": 750 }, { "loss": 1.2372, "grad_norm": 0.9266188740730286, "learning_rate": 0.0001623020173202789, "epoch": 1.288135593220339, "step": 760 }, { "loss": 1.2391, "grad_norm": 0.8796271681785583, "learning_rate": 0.00016119987442399456, "epoch": 1.305084745762712, "step": 770 }, { "loss": 1.2327, "grad_norm": 0.9744959473609924, "learning_rate": 0.00016008571519474742, "epoch": 1.3220338983050848, "step": 780 }, { "loss": 1.2461, "grad_norm": 0.9354102611541748, "learning_rate": 0.0001589597583929255, "epoch": 1.3389830508474576, "step": 790 }, { "loss": 1.233, "grad_norm": 0.8850792050361633, "learning_rate": 0.0001578222250953195, "epoch": 1.3559322033898304, "step": 800 }, { "loss": 1.2353, "grad_norm": 0.9097703695297241, "learning_rate": 0.00015667333865171558, "epoch": 1.3728813559322033, "step": 810 }, { "loss": 1.2464, "grad_norm": 0.9092051386833191, "learning_rate": 0.00015551332464104126, "epoch": 1.3898305084745763, "step": 820 }, { "loss": 1.2245, "grad_norm": 0.9042219519615173, "learning_rate": 0.0001543424108270743, "epoch": 1.4067796610169492, "step": 830 }, { "loss": 1.2424, "grad_norm": 0.851340115070343, "learning_rate": 0.00015316082711372205, "epoch": 1.423728813559322, "step": 840 }, { "loss": 1.2214, "grad_norm": 0.8824617266654968, "learning_rate": 0.00015196880549988082, "epoch": 1.4406779661016949, "step": 850 }, { "loss": 1.2169, "grad_norm": 0.918869137763977, "learning_rate": 0.000150766580033884, "epoch": 1.457627118644068, "step": 860 }, { "loss": 1.2096, "grad_norm": 0.9275985360145569, "learning_rate": 0.00014955438676754755, "epoch": 1.4745762711864407, "step": 870 }, { "loss": 1.1924, "grad_norm": 0.9207339882850647, "learning_rate": 0.00014833246370982237, "epoch": 1.4915254237288136, "step": 880 }, { "loss": 1.2259, "grad_norm": 0.9242911338806152, "learning_rate": 0.00014710105078006205, "epoch": 1.5084745762711864, "step": 890 }, { "loss": 1.193, "grad_norm": 0.8754140734672546, "learning_rate": 0.000145860389760916, "epoch": 1.5254237288135593, "step": 900 }, { "loss": 1.2542, "grad_norm": 0.8747695684432983, "learning_rate": 0.00014461072425085627, "epoch": 1.542372881355932, "step": 910 }, { "loss": 1.2102, "grad_norm": 0.9027137160301208, "learning_rate": 0.00014335229961634808, "epoch": 1.559322033898305, "step": 920 }, { "loss": 1.2016, "grad_norm": 0.9713094830513, "learning_rate": 0.00014208536294367326, "epoch": 1.576271186440678, "step": 930 }, { "loss": 1.203, "grad_norm": 0.9080752730369568, "learning_rate": 0.00014081016299041576, "epoch": 1.5932203389830508, "step": 940 }, { "eval_loss": 1.2493342161178589, "eval_runtime": 8.029, "eval_samples_per_second": 123.676, "eval_steps_per_second": 31.012, "epoch": 1.5932203389830508, "step": 940 }, { "loss": 1.1835, "grad_norm": 0.8457156419754028, "learning_rate": 0.0001395269501366193, "epoch": 1.6101694915254239, "step": 950 }, { "loss": 1.2195, "grad_norm": 0.9038828015327454, "learning_rate": 0.0001382359763356262, "epoch": 1.6271186440677967, "step": 960 }, { "loss": 1.2371, "grad_norm": 0.8834562301635742, "learning_rate": 0.00013693749506460756, "epoch": 1.6440677966101696, "step": 970 }, { "loss": 1.2204, "grad_norm": 0.9082645773887634, "learning_rate": 0.00013563176127479403, "epoch": 1.6610169491525424, "step": 980 }, { "loss": 1.2343, "grad_norm": 0.8829030990600586, "learning_rate": 0.00013431903134141713, "epoch": 1.6779661016949152, "step": 990 }, { "loss": 1.2114, "grad_norm": 0.9125088453292847, "learning_rate": 0.00013299956301337132, "epoch": 1.694915254237288, "step": 1000 }, { "loss": 1.2313, "grad_norm": 0.898687481880188, "learning_rate": 0.00013167361536260585, "epoch": 1.711864406779661, "step": 1010 }, { "loss": 1.2132, "grad_norm": 0.8782442808151245, "learning_rate": 0.0001303414487332573, "epoch": 1.7288135593220337, "step": 1020 }, { "loss": 1.2084, "grad_norm": 0.8889365196228027, "learning_rate": 0.00012900332469053193, "epoch": 1.7457627118644068, "step": 1030 }, { "loss": 1.2034, "grad_norm": 0.9221587777137756, "learning_rate": 0.0001276595059693487, "epoch": 1.7627118644067796, "step": 1040 }, { "loss": 1.2107, "grad_norm": 0.9170616865158081, "learning_rate": 0.00012631025642275212, "epoch": 1.7796610169491527, "step": 1050 }, { "loss": 1.2134, "grad_norm": 0.9416205883026123, "learning_rate": 0.00012495584097010616, "epoch": 1.7966101694915255, "step": 1060 }, { "loss": 1.1813, "grad_norm": 0.9464443325996399, "learning_rate": 0.0001235965255450781, "epoch": 1.8135593220338984, "step": 1070 }, { "loss": 1.2182, "grad_norm": 0.8776165246963501, "learning_rate": 0.00012223257704342395, "epoch": 1.8305084745762712, "step": 1080 }, { "loss": 1.2025, "grad_norm": 0.9294790029525757, "learning_rate": 0.0001208642632705844, "epoch": 1.847457627118644, "step": 1090 }, { "loss": 1.1823, "grad_norm": 0.9223579168319702, "learning_rate": 0.00011949185288910236, "epoch": 1.8644067796610169, "step": 1100 }, { "loss": 1.2013, "grad_norm": 0.8838573694229126, "learning_rate": 0.00011811561536587244, "epoch": 1.8813559322033897, "step": 1110 }, { "loss": 1.2051, "grad_norm": 0.9253844618797302, "learning_rate": 0.00011673582091923192, "epoch": 1.8983050847457628, "step": 1120 }, { "loss": 1.1961, "grad_norm": 0.9401239156723022, "learning_rate": 0.00011535274046590492, "epoch": 1.9152542372881356, "step": 1130 }, { "loss": 1.1937, "grad_norm": 0.9650959968566895, "learning_rate": 0.00011396664556780878, "epoch": 1.9322033898305084, "step": 1140 }, { "loss": 1.2192, "grad_norm": 0.9026353359222412, "learning_rate": 0.00011257780837873417, "epoch": 1.9491525423728815, "step": 1150 }, { "loss": 1.2247, "grad_norm": 0.8826860189437866, "learning_rate": 0.00011118650159090887, "epoch": 1.9661016949152543, "step": 1160 }, { "loss": 1.1997, "grad_norm": 0.8998943567276001, "learning_rate": 0.00010979299838145574, "epoch": 1.9830508474576272, "step": 1170 }, { "eval_loss": 1.2287755012512207, "eval_runtime": 7.9784, "eval_samples_per_second": 124.461, "eval_steps_per_second": 31.209, "epoch": 1.9915254237288136, "step": 1175 }, { "loss": 1.1501, "grad_norm": 2.670851469039917, "learning_rate": 0.00010839757235875563, "epoch": 2.0, "step": 1180 }, { "loss": 1.1174, "grad_norm": 0.9717442989349365, "learning_rate": 0.00010700049750872557, "epoch": 2.016949152542373, "step": 1190 }, { "loss": 1.1334, "grad_norm": 0.9300686120986938, "learning_rate": 0.00010560204814102266, "epoch": 2.0338983050847457, "step": 1200 }, { "loss": 1.1035, "grad_norm": 0.9468068480491638, "learning_rate": 0.00010420249883518476, "epoch": 2.0508474576271185, "step": 1210 }, { "loss": 1.1228, "grad_norm": 1.0294302701950073, "learning_rate": 0.00010280212438671784, "epoch": 2.0677966101694913, "step": 1220 }, { "loss": 1.13, "grad_norm": 0.9107387661933899, "learning_rate": 0.00010140119975314102, "epoch": 2.084745762711864, "step": 1230 }, { "loss": 1.1378, "grad_norm": 0.9056633114814758, "learning_rate": 0.0001, "epoch": 2.1016949152542375, "step": 1240 }, { "loss": 1.1515, "grad_norm": 1.0032331943511963, "learning_rate": 9.8598800246859e-05, "epoch": 2.1186440677966103, "step": 1250 }, { "loss": 1.1338, "grad_norm": 0.951280951499939, "learning_rate": 9.719787561328217e-05, "epoch": 2.135593220338983, "step": 1260 }, { "loss": 1.1398, "grad_norm": 0.9472972750663757, "learning_rate": 9.579750116481526e-05, "epoch": 2.152542372881356, "step": 1270 }, { "loss": 1.1041, "grad_norm": 0.9140039682388306, "learning_rate": 9.439795185897736e-05, "epoch": 2.169491525423729, "step": 1280 }, { "loss": 1.142, "grad_norm": 0.9762586355209351, "learning_rate": 9.29995024912745e-05, "epoch": 2.1864406779661016, "step": 1290 }, { "loss": 1.1473, "grad_norm": 0.9565635323524475, "learning_rate": 9.160242764124439e-05, "epoch": 2.2033898305084745, "step": 1300 }, { "loss": 1.1507, "grad_norm": 0.970757246017456, "learning_rate": 9.020700161854429e-05, "epoch": 2.2203389830508473, "step": 1310 }, { "loss": 1.1514, "grad_norm": 1.0047391653060913, "learning_rate": 8.881349840909116e-05, "epoch": 2.23728813559322, "step": 1320 }, { "loss": 1.1218, "grad_norm": 0.9501739740371704, "learning_rate": 8.742219162126587e-05, "epoch": 2.2542372881355934, "step": 1330 }, { "loss": 1.1025, "grad_norm": 0.9464960098266602, "learning_rate": 8.603335443219125e-05, "epoch": 2.2711864406779663, "step": 1340 }, { "loss": 1.1101, "grad_norm": 0.973585844039917, "learning_rate": 8.464725953409509e-05, "epoch": 2.288135593220339, "step": 1350 }, { "loss": 1.1109, "grad_norm": 0.9535447955131531, "learning_rate": 8.326417908076811e-05, "epoch": 2.305084745762712, "step": 1360 }, { "loss": 1.1513, "grad_norm": 0.9385268688201904, "learning_rate": 8.188438463412761e-05, "epoch": 2.3220338983050848, "step": 1370 }, { "loss": 1.111, "grad_norm": 0.9565103054046631, "learning_rate": 8.050814711089764e-05, "epoch": 2.3389830508474576, "step": 1380 }, { "loss": 1.1122, "grad_norm": 0.934618353843689, "learning_rate": 7.913573672941563e-05, "epoch": 2.3559322033898304, "step": 1390 }, { "loss": 1.1029, "grad_norm": 0.9700987935066223, "learning_rate": 7.776742295657608e-05, "epoch": 2.3728813559322033, "step": 1400 }, { "loss": 1.1159, "grad_norm": 0.9688590168952942, "learning_rate": 7.640347445492192e-05, "epoch": 2.389830508474576, "step": 1410 }, { "eval_loss": 1.222853660583496, "eval_runtime": 7.9699, "eval_samples_per_second": 124.594, "eval_steps_per_second": 31.243, "epoch": 2.389830508474576, "step": 1410 }, { "loss": 1.0749, "grad_norm": 0.9457218050956726, "learning_rate": 7.504415902989386e-05, "epoch": 2.406779661016949, "step": 1420 }, { "loss": 1.1272, "grad_norm": 0.9391158223152161, "learning_rate": 7.368974357724789e-05, "epoch": 2.423728813559322, "step": 1430 }, { "loss": 1.1083, "grad_norm": 0.9238609671592712, "learning_rate": 7.234049403065132e-05, "epoch": 2.440677966101695, "step": 1440 }, { "loss": 1.1058, "grad_norm": 0.9600583910942078, "learning_rate": 7.099667530946806e-05, "epoch": 2.457627118644068, "step": 1450 }, { "loss": 1.1281, "grad_norm": 0.9885203838348389, "learning_rate": 6.96585512667427e-05, "epoch": 2.4745762711864407, "step": 1460 }, { "loss": 1.1248, "grad_norm": 0.9941011071205139, "learning_rate": 6.832638463739418e-05, "epoch": 2.4915254237288136, "step": 1470 }, { "loss": 1.1343, "grad_norm": 0.9795677065849304, "learning_rate": 6.700043698662873e-05, "epoch": 2.5084745762711864, "step": 1480 }, { "loss": 1.118, "grad_norm": 0.948656439781189, "learning_rate": 6.568096865858289e-05, "epoch": 2.5254237288135593, "step": 1490 }, { "loss": 1.139, "grad_norm": 0.9555015563964844, "learning_rate": 6.4368238725206e-05, "epoch": 2.542372881355932, "step": 1500 }, { "loss": 1.1327, "grad_norm": 0.9538373351097107, "learning_rate": 6.306250493539246e-05, "epoch": 2.559322033898305, "step": 1510 }, { "loss": 1.1152, "grad_norm": 0.9733698964118958, "learning_rate": 6.176402366437382e-05, "epoch": 2.576271186440678, "step": 1520 }, { "loss": 1.1426, "grad_norm": 0.9399760961532593, "learning_rate": 6.047304986338071e-05, "epoch": 2.593220338983051, "step": 1530 }, { "loss": 1.1103, "grad_norm": 1.0162689685821533, "learning_rate": 5.918983700958425e-05, "epoch": 2.610169491525424, "step": 1540 }, { "loss": 1.1153, "grad_norm": 0.9900780916213989, "learning_rate": 5.791463705632676e-05, "epoch": 2.6271186440677967, "step": 1550 }, { "loss": 1.0927, "grad_norm": 0.9424988031387329, "learning_rate": 5.664770038365195e-05, "epoch": 2.6440677966101696, "step": 1560 }, { "loss": 1.113, "grad_norm": 0.9367265701293945, "learning_rate": 5.538927574914376e-05, "epoch": 2.6610169491525424, "step": 1570 }, { "loss": 1.1372, "grad_norm": 0.9402151107788086, "learning_rate": 5.413961023908401e-05, "epoch": 2.6779661016949152, "step": 1580 }, { "loss": 1.1264, "grad_norm": 1.0578895807266235, "learning_rate": 5.2898949219937976e-05, "epoch": 2.694915254237288, "step": 1590 }, { "loss": 1.1081, "grad_norm": 0.9745665192604065, "learning_rate": 5.166753629017764e-05, "epoch": 2.711864406779661, "step": 1600 }, { "loss": 1.1313, "grad_norm": 0.9517439603805542, "learning_rate": 5.044561323245245e-05, "epoch": 2.7288135593220337, "step": 1610 }, { "loss": 1.1339, "grad_norm": 0.9738871455192566, "learning_rate": 4.9233419966116036e-05, "epoch": 2.7457627118644066, "step": 1620 }, { "loss": 1.1421, "grad_norm": 0.9917513132095337, "learning_rate": 4.803119450011919e-05, "epoch": 2.7627118644067794, "step": 1630 }, { "loss": 1.1158, "grad_norm": 0.9498072266578674, "learning_rate": 4.683917288627795e-05, "epoch": 2.7796610169491527, "step": 1640 }, { "eval_loss": 1.2087918519973755, "eval_runtime": 7.9858, "eval_samples_per_second": 124.345, "eval_steps_per_second": 31.18, "epoch": 2.788135593220339, "step": 1645 }, { "loss": 1.0934, "grad_norm": 0.9327065348625183, "learning_rate": 4.56575891729257e-05, "epoch": 2.7966101694915255, "step": 1650 }, { "loss": 1.1269, "grad_norm": 1.0235567092895508, "learning_rate": 4.448667535895876e-05, "epoch": 2.8135593220338984, "step": 1660 }, { "loss": 1.0731, "grad_norm": 0.9344584941864014, "learning_rate": 4.332666134828444e-05, "epoch": 2.830508474576271, "step": 1670 }, { "loss": 1.1295, "grad_norm": 1.0067960023880005, "learning_rate": 4.2177774904680475e-05, "epoch": 2.847457627118644, "step": 1680 }, { "loss": 1.061, "grad_norm": 0.960763692855835, "learning_rate": 4.1040241607074516e-05, "epoch": 2.864406779661017, "step": 1690 }, { "loss": 1.093, "grad_norm": 0.9818686842918396, "learning_rate": 3.991428480525261e-05, "epoch": 2.8813559322033897, "step": 1700 }, { "loss": 1.1469, "grad_norm": 0.9942337274551392, "learning_rate": 3.880012557600547e-05, "epoch": 2.898305084745763, "step": 1710 }, { "loss": 1.1369, "grad_norm": 1.022985577583313, "learning_rate": 3.769798267972109e-05, "epoch": 2.915254237288136, "step": 1720 }, { "loss": 1.1009, "grad_norm": 0.9834439754486084, "learning_rate": 3.6608072517432013e-05, "epoch": 2.9322033898305087, "step": 1730 }, { "loss": 1.1134, "grad_norm": 1.0133659839630127, "learning_rate": 3.553060908832583e-05, "epoch": 2.9491525423728815, "step": 1740 }, { "loss": 1.0923, "grad_norm": 0.9789726734161377, "learning_rate": 3.4465803947727424e-05, "epoch": 2.9661016949152543, "step": 1750 }, { "loss": 1.1277, "grad_norm": 0.9691158533096313, "learning_rate": 3.341386616556109e-05, "epoch": 2.983050847457627, "step": 1760 }, { "loss": 1.1257, "grad_norm": 3.6234147548675537, "learning_rate": 3.237500228530045e-05, "epoch": 3.0, "step": 1770 }, { "loss": 1.0769, "grad_norm": 1.0010051727294922, "learning_rate": 3.1349416283414465e-05, "epoch": 3.016949152542373, "step": 1780 }, { "loss": 1.0647, "grad_norm": 0.9725643396377563, "learning_rate": 3.0337309529317604e-05, "epoch": 3.0338983050847457, "step": 1790 }, { "loss": 1.0718, "grad_norm": 1.0454438924789429, "learning_rate": 2.933888074583193e-05, "epoch": 3.0508474576271185, "step": 1800 }, { "loss": 1.0739, "grad_norm": 0.9589056372642517, "learning_rate": 2.8354325970168484e-05, "epoch": 3.0677966101694913, "step": 1810 }, { "loss": 1.0625, "grad_norm": 0.9968867301940918, "learning_rate": 2.7383838515436476e-05, "epoch": 3.084745762711864, "step": 1820 }, { "loss": 1.0468, "grad_norm": 1.0534526109695435, "learning_rate": 2.6427608932686843e-05, "epoch": 3.1016949152542375, "step": 1830 }, { "loss": 1.0377, "grad_norm": 1.00564706325531, "learning_rate": 2.5485824973498583e-05, "epoch": 3.1186440677966103, "step": 1840 }, { "loss": 1.0291, "grad_norm": 0.9239162802696228, "learning_rate": 2.4558671553114378e-05, "epoch": 3.135593220338983, "step": 1850 }, { "loss": 1.0185, "grad_norm": 1.0018540620803833, "learning_rate": 2.3646330714133393e-05, "epoch": 3.152542372881356, "step": 1860 }, { "loss": 1.0535, "grad_norm": 0.9955319762229919, "learning_rate": 2.274898159076785e-05, "epoch": 3.169491525423729, "step": 1870 }, { "loss": 1.0702, "grad_norm": 1.0363454818725586, "learning_rate": 2.1866800373671026e-05, "epoch": 3.1864406779661016, "step": 1880 }, { "eval_loss": 1.2089406251907349, "eval_runtime": 7.9175, "eval_samples_per_second": 125.419, "eval_steps_per_second": 31.449, "epoch": 3.1864406779661016, "step": 1880 }, { "loss": 1.0637, "grad_norm": 0.9806342124938965, "learning_rate": 2.09999602753427e-05, "epoch": 3.2033898305084745, "step": 1890 }, { "loss": 1.0281, "grad_norm": 0.9875054359436035, "learning_rate": 2.0148631496119784e-05, "epoch": 3.2203389830508473, "step": 1900 }, { "loss": 1.0812, "grad_norm": 0.9922574162483215, "learning_rate": 1.9312981190758228e-05, "epoch": 3.23728813559322, "step": 1910 }, { "loss": 1.0527, "grad_norm": 0.9941257834434509, "learning_rate": 1.8493173435612843e-05, "epoch": 3.2542372881355934, "step": 1920 }, { "loss": 1.052, "grad_norm": 1.0203768014907837, "learning_rate": 1.7689369196421613e-05, "epoch": 3.2711864406779663, "step": 1930 }, { "loss": 1.057, "grad_norm": 1.0592260360717773, "learning_rate": 1.6901726296700736e-05, "epoch": 3.288135593220339, "step": 1940 }, { "loss": 1.0604, "grad_norm": 0.9808682799339294, "learning_rate": 1.6130399386756766e-05, "epoch": 3.305084745762712, "step": 1950 }, { "loss": 1.0808, "grad_norm": 0.9782573580741882, "learning_rate": 1.5375539913321602e-05, "epoch": 3.3220338983050848, "step": 1960 }, { "loss": 1.0548, "grad_norm": 1.0036771297454834, "learning_rate": 1.4637296089816543e-05, "epoch": 3.3389830508474576, "step": 1970 }, { "loss": 1.0535, "grad_norm": 1.0585639476776123, "learning_rate": 1.3915812867251266e-05, "epoch": 3.3559322033898304, "step": 1980 }, { "loss": 1.0957, "grad_norm": 1.0012156963348389, "learning_rate": 1.3211231905763355e-05, "epoch": 3.3728813559322033, "step": 1990 }, { "loss": 1.0576, "grad_norm": 0.9416826367378235, "learning_rate": 1.2523691546803873e-05, "epoch": 3.389830508474576, "step": 2000 }, { "loss": 1.0505, "grad_norm": 0.9698029160499573, "learning_rate": 1.1853326785974628e-05, "epoch": 3.406779661016949, "step": 2010 }, { "loss": 1.0376, "grad_norm": 0.9895077347755432, "learning_rate": 1.1200269246522343e-05, "epoch": 3.423728813559322, "step": 2020 }, { "loss": 1.0493, "grad_norm": 0.9916106462478638, "learning_rate": 1.0564647153495088e-05, "epoch": 3.440677966101695, "step": 2030 }, { "loss": 1.1029, "grad_norm": 1.044202208518982, "learning_rate": 9.946585308565747e-06, "epoch": 3.457627118644068, "step": 2040 }, { "loss": 1.0658, "grad_norm": 1.0428624153137207, "learning_rate": 9.346205065527769e-06, "epoch": 3.4745762711864407, "step": 2050 }, { "loss": 1.0633, "grad_norm": 1.0308985710144043, "learning_rate": 8.763624306467844e-06, "epoch": 3.4915254237288136, "step": 2060 }, { "loss": 1.0406, "grad_norm": 1.0223889350891113, "learning_rate": 8.198957418620401e-06, "epoch": 3.5084745762711864, "step": 2070 }, { "loss": 1.0064, "grad_norm": 1.0316060781478882, "learning_rate": 7.652315271907929e-06, "epoch": 3.5254237288135593, "step": 2080 }, { "loss": 1.0495, "grad_norm": 0.9965396523475647, "learning_rate": 7.1238051971723504e-06, "epoch": 3.542372881355932, "step": 2090 }, { "loss": 1.0434, "grad_norm": 0.9939844012260437, "learning_rate": 6.613530965101e-06, "epoch": 3.559322033898305, "step": 2100 }, { "loss": 1.0728, "grad_norm": 0.9969953298568726, "learning_rate": 6.121592765851647e-06, "epoch": 3.576271186440678, "step": 2110 }, { "eval_loss": 1.206390380859375, "eval_runtime": 7.9452, "eval_samples_per_second": 124.982, "eval_steps_per_second": 31.34, "epoch": 3.584745762711864, "step": 2115 }, { "loss": 1.027, "grad_norm": 1.0296121835708618, "learning_rate": 5.648087189380613e-06, "epoch": 3.593220338983051, "step": 2120 }, { "loss": 1.0543, "grad_norm": 0.9752156734466553, "learning_rate": 5.193107206477821e-06, "epoch": 3.610169491525424, "step": 2130 }, { "loss": 1.0772, "grad_norm": 1.014767050743103, "learning_rate": 4.756742150512305e-06, "epoch": 3.6271186440677967, "step": 2140 }, { "loss": 1.0694, "grad_norm": 0.9678983688354492, "learning_rate": 4.339077699891969e-06, "epoch": 3.6440677966101696, "step": 2150 }, { "loss": 1.0381, "grad_norm": 0.9780063033103943, "learning_rate": 3.940195861241036e-06, "epoch": 3.6610169491525424, "step": 2160 }, { "loss": 1.0478, "grad_norm": 1.011366367340088, "learning_rate": 3.560174953298434e-06, "epoch": 3.6779661016949152, "step": 2170 }, { "loss": 1.057, "grad_norm": 0.9899730086326599, "learning_rate": 3.199089591540194e-06, "epoch": 3.694915254237288, "step": 2180 }, { "loss": 1.0506, "grad_norm": 1.040940761566162, "learning_rate": 2.857010673529015e-06, "epoch": 3.711864406779661, "step": 2190 }, { "loss": 1.0845, "grad_norm": 0.9783785343170166, "learning_rate": 2.5340053649938523e-06, "epoch": 3.7288135593220337, "step": 2200 }, { "loss": 1.058, "grad_norm": 1.0626617670059204, "learning_rate": 2.2301370866422256e-06, "epoch": 3.7457627118644066, "step": 2210 }, { "loss": 1.0401, "grad_norm": 1.0348941087722778, "learning_rate": 1.9454655017078438e-06, "epoch": 3.7627118644067794, "step": 2220 }, { "loss": 1.0433, "grad_norm": 0.9590414762496948, "learning_rate": 1.6800465042359325e-06, "epoch": 3.7796610169491527, "step": 2230 }, { "loss": 1.0658, "grad_norm": 1.0622143745422363, "learning_rate": 1.4339322081087236e-06, "epoch": 3.7966101694915255, "step": 2240 }, { "loss": 1.0412, "grad_norm": 1.0225697755813599, "learning_rate": 1.2071709368131068e-06, "epoch": 3.8135593220338984, "step": 2250 }, { "loss": 1.0589, "grad_norm": 1.0060772895812988, "learning_rate": 9.998072139525084e-07, "epoch": 3.830508474576271, "step": 2260 }, { "loss": 1.0806, "grad_norm": 0.9955238103866577, "learning_rate": 8.118817545048952e-07, "epoch": 3.847457627118644, "step": 2270 }, { "loss": 1.0534, "grad_norm": 0.995814323425293, "learning_rate": 6.434314568285249e-07, "epoch": 3.864406779661017, "step": 2280 }, { "loss": 1.0518, "grad_norm": 1.02264404296875, "learning_rate": 4.944893954171859e-07, "epoch": 3.8813559322033897, "step": 2290 }, { "loss": 1.0461, "grad_norm": 1.0243033170700073, "learning_rate": 3.6508481440604703e-07, "epoch": 3.898305084745763, "step": 2300 }, { "loss": 1.0516, "grad_norm": 1.0173128843307495, "learning_rate": 2.55243121829829e-07, "epoch": 3.915254237288136, "step": 2310 }, { "loss": 1.0589, "grad_norm": 1.0421204566955566, "learning_rate": 1.6498588463392806e-07, "epoch": 3.9322033898305087, "step": 2320 }, { "loss": 1.0548, "grad_norm": 1.024686574935913, "learning_rate": 9.433082443991437e-08, "epoch": 3.9491525423728815, "step": 2330 }, { "loss": 1.0619, "grad_norm": 0.9935389161109924, "learning_rate": 4.329181406593774e-08, "epoch": 3.9661016949152543, "step": 2340 }, { "loss": 1.0571, "grad_norm": 0.9899278879165649, "learning_rate": 1.1878874802873086e-08, "epoch": 3.983050847457627, "step": 2350 }, { "eval_loss": 1.2054944038391113, "eval_runtime": 7.9797, "eval_samples_per_second": 124.441, "eval_steps_per_second": 31.204, "epoch": 3.983050847457627, "step": 2350 }, { "loss": 1.1068, "grad_norm": 3.4382264614105225, "learning_rate": 9.817444666104792e-11, "epoch": 4.0, "step": 2360 }, { "train_runtime": 470.094, "train_samples_per_second": 160.402, "train_steps_per_second": 5.02, "total_flos": 1.617317761968e+16, "train_loss": 1.2040837437419567, "epoch": 4.0, "step": 2360 } ]