Files
deepseek-prover-v2-cpt-sft-…/trainer_state.json
ModelHub XC 505a950b10 初始化项目,由ModelHub XC社区提供模型
Model: formalmathatepfl/deepseek-prover-v2-cpt-sft-feedback-1e
Source: Original Platform
2026-05-30 18:32:16 +08:00

8603 lines
211 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 12091,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008270614506657845,
"grad_norm": 14.711603164672852,
"learning_rate": 1.487603305785124e-07,
"loss": 2.7309,
"step": 10
},
{
"epoch": 0.001654122901331569,
"grad_norm": 14.97842025756836,
"learning_rate": 3.1404958677685957e-07,
"loss": 2.7435,
"step": 20
},
{
"epoch": 0.0024811843519973536,
"grad_norm": 12.011658668518066,
"learning_rate": 4.793388429752067e-07,
"loss": 2.7037,
"step": 30
},
{
"epoch": 0.003308245802663138,
"grad_norm": 11.382739067077637,
"learning_rate": 6.446280991735538e-07,
"loss": 2.5787,
"step": 40
},
{
"epoch": 0.0041353072533289225,
"grad_norm": 7.4454193115234375,
"learning_rate": 8.099173553719009e-07,
"loss": 2.2764,
"step": 50
},
{
"epoch": 0.004962368703994707,
"grad_norm": 5.262652397155762,
"learning_rate": 9.75206611570248e-07,
"loss": 1.9743,
"step": 60
},
{
"epoch": 0.005789430154660491,
"grad_norm": 3.370814323425293,
"learning_rate": 1.140495867768595e-06,
"loss": 1.5944,
"step": 70
},
{
"epoch": 0.006616491605326276,
"grad_norm": 3.199523687362671,
"learning_rate": 1.3057851239669423e-06,
"loss": 1.3237,
"step": 80
},
{
"epoch": 0.00744355305599206,
"grad_norm": 2.578493118286133,
"learning_rate": 1.4710743801652894e-06,
"loss": 1.0191,
"step": 90
},
{
"epoch": 0.008270614506657845,
"grad_norm": 2.9655439853668213,
"learning_rate": 1.6363636363636365e-06,
"loss": 0.7199,
"step": 100
},
{
"epoch": 0.00909767595732363,
"grad_norm": 5.320194244384766,
"learning_rate": 1.8016528925619835e-06,
"loss": 0.5692,
"step": 110
},
{
"epoch": 0.009924737407989414,
"grad_norm": 1.893227458000183,
"learning_rate": 1.966942148760331e-06,
"loss": 0.4681,
"step": 120
},
{
"epoch": 0.010751798858655199,
"grad_norm": 1.1032103300094604,
"learning_rate": 2.132231404958678e-06,
"loss": 0.3882,
"step": 130
},
{
"epoch": 0.011578860309320982,
"grad_norm": 1.873787760734558,
"learning_rate": 2.297520661157025e-06,
"loss": 0.3182,
"step": 140
},
{
"epoch": 0.012405921759986767,
"grad_norm": 3.0224239826202393,
"learning_rate": 2.462809917355372e-06,
"loss": 0.2801,
"step": 150
},
{
"epoch": 0.013232983210652551,
"grad_norm": 1.4292343854904175,
"learning_rate": 2.628099173553719e-06,
"loss": 0.2074,
"step": 160
},
{
"epoch": 0.014060044661318336,
"grad_norm": 5.8895368576049805,
"learning_rate": 2.7933884297520662e-06,
"loss": 0.2002,
"step": 170
},
{
"epoch": 0.01488710611198412,
"grad_norm": 1.3134126663208008,
"learning_rate": 2.9586776859504133e-06,
"loss": 0.1672,
"step": 180
},
{
"epoch": 0.015714167562649903,
"grad_norm": 0.5309329628944397,
"learning_rate": 3.123966942148761e-06,
"loss": 0.1528,
"step": 190
},
{
"epoch": 0.01654122901331569,
"grad_norm": 0.7183464169502258,
"learning_rate": 3.289256198347108e-06,
"loss": 0.1324,
"step": 200
},
{
"epoch": 0.017368290463981473,
"grad_norm": 0.7033889293670654,
"learning_rate": 3.454545454545455e-06,
"loss": 0.1214,
"step": 210
},
{
"epoch": 0.01819535191464726,
"grad_norm": 0.39757004380226135,
"learning_rate": 3.619834710743802e-06,
"loss": 0.1116,
"step": 220
},
{
"epoch": 0.019022413365313042,
"grad_norm": 1.0405199527740479,
"learning_rate": 3.785123966942149e-06,
"loss": 0.1011,
"step": 230
},
{
"epoch": 0.01984947481597883,
"grad_norm": 1.6865506172180176,
"learning_rate": 3.950413223140496e-06,
"loss": 0.1023,
"step": 240
},
{
"epoch": 0.02067653626664461,
"grad_norm": 0.746986985206604,
"learning_rate": 4.115702479338843e-06,
"loss": 0.092,
"step": 250
},
{
"epoch": 0.021503597717310398,
"grad_norm": 0.482876718044281,
"learning_rate": 4.28099173553719e-06,
"loss": 0.0803,
"step": 260
},
{
"epoch": 0.02233065916797618,
"grad_norm": 0.4853907525539398,
"learning_rate": 4.4462809917355374e-06,
"loss": 0.0782,
"step": 270
},
{
"epoch": 0.023157720618641964,
"grad_norm": 0.2537175714969635,
"learning_rate": 4.6115702479338845e-06,
"loss": 0.0776,
"step": 280
},
{
"epoch": 0.02398478206930775,
"grad_norm": 0.7867515683174133,
"learning_rate": 4.776859504132232e-06,
"loss": 0.073,
"step": 290
},
{
"epoch": 0.024811843519973533,
"grad_norm": 0.43127694725990295,
"learning_rate": 4.942148760330579e-06,
"loss": 0.0734,
"step": 300
},
{
"epoch": 0.02563890497063932,
"grad_norm": 0.20157092809677124,
"learning_rate": 5.107438016528926e-06,
"loss": 0.0737,
"step": 310
},
{
"epoch": 0.026465966421305102,
"grad_norm": 0.5229440927505493,
"learning_rate": 5.272727272727273e-06,
"loss": 0.0692,
"step": 320
},
{
"epoch": 0.02729302787197089,
"grad_norm": 0.7331608533859253,
"learning_rate": 5.438016528925621e-06,
"loss": 0.0688,
"step": 330
},
{
"epoch": 0.028120089322636672,
"grad_norm": 0.2658148407936096,
"learning_rate": 5.603305785123967e-06,
"loss": 0.0603,
"step": 340
},
{
"epoch": 0.028947150773302455,
"grad_norm": 0.3650042414665222,
"learning_rate": 5.768595041322315e-06,
"loss": 0.0593,
"step": 350
},
{
"epoch": 0.02977421222396824,
"grad_norm": 0.2189350426197052,
"learning_rate": 5.933884297520661e-06,
"loss": 0.0561,
"step": 360
},
{
"epoch": 0.030601273674634024,
"grad_norm": 0.2192901372909546,
"learning_rate": 6.099173553719009e-06,
"loss": 0.0624,
"step": 370
},
{
"epoch": 0.03142833512529981,
"grad_norm": 0.2812904715538025,
"learning_rate": 6.264462809917355e-06,
"loss": 0.0556,
"step": 380
},
{
"epoch": 0.03225539657596559,
"grad_norm": 0.6402881145477295,
"learning_rate": 6.429752066115703e-06,
"loss": 0.0572,
"step": 390
},
{
"epoch": 0.03308245802663138,
"grad_norm": 0.17161770164966583,
"learning_rate": 6.5950413223140495e-06,
"loss": 0.0537,
"step": 400
},
{
"epoch": 0.033909519477297166,
"grad_norm": 0.3633708357810974,
"learning_rate": 6.760330578512397e-06,
"loss": 0.0504,
"step": 410
},
{
"epoch": 0.034736580927962946,
"grad_norm": 0.3227091133594513,
"learning_rate": 6.925619834710744e-06,
"loss": 0.0527,
"step": 420
},
{
"epoch": 0.03556364237862873,
"grad_norm": 0.1883084774017334,
"learning_rate": 7.0909090909090916e-06,
"loss": 0.0496,
"step": 430
},
{
"epoch": 0.03639070382929452,
"grad_norm": 0.404940664768219,
"learning_rate": 7.256198347107438e-06,
"loss": 0.0515,
"step": 440
},
{
"epoch": 0.0372177652799603,
"grad_norm": 0.2766735553741455,
"learning_rate": 7.421487603305786e-06,
"loss": 0.0482,
"step": 450
},
{
"epoch": 0.038044826730626084,
"grad_norm": 0.14233002066612244,
"learning_rate": 7.586776859504133e-06,
"loss": 0.0495,
"step": 460
},
{
"epoch": 0.03887188818129187,
"grad_norm": 0.17358863353729248,
"learning_rate": 7.75206611570248e-06,
"loss": 0.0464,
"step": 470
},
{
"epoch": 0.03969894963195766,
"grad_norm": 0.24469003081321716,
"learning_rate": 7.917355371900827e-06,
"loss": 0.0479,
"step": 480
},
{
"epoch": 0.04052601108262344,
"grad_norm": 0.20702078938484192,
"learning_rate": 8.082644628099174e-06,
"loss": 0.042,
"step": 490
},
{
"epoch": 0.04135307253328922,
"grad_norm": 0.38820740580558777,
"learning_rate": 8.247933884297521e-06,
"loss": 0.0486,
"step": 500
},
{
"epoch": 0.04218013398395501,
"grad_norm": 0.17128099501132965,
"learning_rate": 8.413223140495868e-06,
"loss": 0.0432,
"step": 510
},
{
"epoch": 0.043007195434620796,
"grad_norm": 0.15014755725860596,
"learning_rate": 8.578512396694215e-06,
"loss": 0.0471,
"step": 520
},
{
"epoch": 0.043834256885286575,
"grad_norm": 0.31599992513656616,
"learning_rate": 8.743801652892562e-06,
"loss": 0.0431,
"step": 530
},
{
"epoch": 0.04466131833595236,
"grad_norm": 0.2722884714603424,
"learning_rate": 8.90909090909091e-06,
"loss": 0.0422,
"step": 540
},
{
"epoch": 0.04548837978661815,
"grad_norm": 0.2727777361869812,
"learning_rate": 9.074380165289256e-06,
"loss": 0.0411,
"step": 550
},
{
"epoch": 0.04631544123728393,
"grad_norm": 0.11177966743707657,
"learning_rate": 9.239669421487604e-06,
"loss": 0.0422,
"step": 560
},
{
"epoch": 0.047142502687949714,
"grad_norm": 0.603720486164093,
"learning_rate": 9.40495867768595e-06,
"loss": 0.0434,
"step": 570
},
{
"epoch": 0.0479695641386155,
"grad_norm": 0.13153497874736786,
"learning_rate": 9.570247933884298e-06,
"loss": 0.0392,
"step": 580
},
{
"epoch": 0.04879662558928129,
"grad_norm": 0.11294803768396378,
"learning_rate": 9.735537190082645e-06,
"loss": 0.0421,
"step": 590
},
{
"epoch": 0.049623687039947066,
"grad_norm": 0.08017970621585846,
"learning_rate": 9.900826446280992e-06,
"loss": 0.0395,
"step": 600
},
{
"epoch": 0.05045074849061285,
"grad_norm": 0.10552022606134415,
"learning_rate": 9.999997007583302e-06,
"loss": 0.0409,
"step": 610
},
{
"epoch": 0.05127780994127864,
"grad_norm": 0.09761521220207214,
"learning_rate": 9.999963342936584e-06,
"loss": 0.041,
"step": 620
},
{
"epoch": 0.05210487139194442,
"grad_norm": 0.19305112957954407,
"learning_rate": 9.999892273374958e-06,
"loss": 0.0387,
"step": 630
},
{
"epoch": 0.052931932842610205,
"grad_norm": 0.09766830503940582,
"learning_rate": 9.999783799430103e-06,
"loss": 0.0407,
"step": 640
},
{
"epoch": 0.05375899429327599,
"grad_norm": 0.14489105343818665,
"learning_rate": 9.999637921913512e-06,
"loss": 0.0389,
"step": 650
},
{
"epoch": 0.05458605574394178,
"grad_norm": 0.32930856943130493,
"learning_rate": 9.999454641916505e-06,
"loss": 0.038,
"step": 660
},
{
"epoch": 0.05541311719460756,
"grad_norm": 0.14028801023960114,
"learning_rate": 9.9992339608102e-06,
"loss": 0.0389,
"step": 670
},
{
"epoch": 0.056240178645273343,
"grad_norm": 0.18841058015823364,
"learning_rate": 9.998975880245528e-06,
"loss": 0.0377,
"step": 680
},
{
"epoch": 0.05706724009593913,
"grad_norm": 0.22034569084644318,
"learning_rate": 9.998680402153193e-06,
"loss": 0.0375,
"step": 690
},
{
"epoch": 0.05789430154660491,
"grad_norm": 0.09396768361330032,
"learning_rate": 9.998347528743684e-06,
"loss": 0.0373,
"step": 700
},
{
"epoch": 0.058721362997270696,
"grad_norm": 0.21181143820285797,
"learning_rate": 9.997977262507234e-06,
"loss": 0.0368,
"step": 710
},
{
"epoch": 0.05954842444793648,
"grad_norm": 0.09176570922136307,
"learning_rate": 9.997569606213822e-06,
"loss": 0.0402,
"step": 720
},
{
"epoch": 0.06037548589860227,
"grad_norm": 0.1015128493309021,
"learning_rate": 9.997124562913138e-06,
"loss": 0.037,
"step": 730
},
{
"epoch": 0.06120254734926805,
"grad_norm": 0.10508076846599579,
"learning_rate": 9.996642135934571e-06,
"loss": 0.0359,
"step": 740
},
{
"epoch": 0.062029608799933834,
"grad_norm": 0.1171686202287674,
"learning_rate": 9.996122328887173e-06,
"loss": 0.0355,
"step": 750
},
{
"epoch": 0.06285667025059961,
"grad_norm": 0.0857829749584198,
"learning_rate": 9.99556514565964e-06,
"loss": 0.0373,
"step": 760
},
{
"epoch": 0.0636837317012654,
"grad_norm": 0.10609547048807144,
"learning_rate": 9.994970590420284e-06,
"loss": 0.0358,
"step": 770
},
{
"epoch": 0.06451079315193119,
"grad_norm": 0.08259180933237076,
"learning_rate": 9.994338667616989e-06,
"loss": 0.0357,
"step": 780
},
{
"epoch": 0.06533785460259697,
"grad_norm": 0.09571733325719833,
"learning_rate": 9.9936693819772e-06,
"loss": 0.0384,
"step": 790
},
{
"epoch": 0.06616491605326276,
"grad_norm": 0.15978851914405823,
"learning_rate": 9.992962738507862e-06,
"loss": 0.0365,
"step": 800
},
{
"epoch": 0.06699197750392855,
"grad_norm": 0.14394982159137726,
"learning_rate": 9.992218742495409e-06,
"loss": 0.0371,
"step": 810
},
{
"epoch": 0.06781903895459433,
"grad_norm": 0.09818094223737717,
"learning_rate": 9.991437399505697e-06,
"loss": 0.0375,
"step": 820
},
{
"epoch": 0.0686461004052601,
"grad_norm": 0.23322197794914246,
"learning_rate": 9.990618715383985e-06,
"loss": 0.0349,
"step": 830
},
{
"epoch": 0.06947316185592589,
"grad_norm": 0.09629665315151215,
"learning_rate": 9.98976269625488e-06,
"loss": 0.0349,
"step": 840
},
{
"epoch": 0.07030022330659168,
"grad_norm": 0.1651093065738678,
"learning_rate": 9.988869348522293e-06,
"loss": 0.035,
"step": 850
},
{
"epoch": 0.07112728475725746,
"grad_norm": 0.12863469123840332,
"learning_rate": 9.98793867886939e-06,
"loss": 0.0364,
"step": 860
},
{
"epoch": 0.07195434620792325,
"grad_norm": 0.3346972167491913,
"learning_rate": 9.98697069425855e-06,
"loss": 0.0337,
"step": 870
},
{
"epoch": 0.07278140765858904,
"grad_norm": 0.146303191781044,
"learning_rate": 9.9859654019313e-06,
"loss": 0.0363,
"step": 880
},
{
"epoch": 0.07360846910925482,
"grad_norm": 0.14840497076511383,
"learning_rate": 9.984922809408272e-06,
"loss": 0.0349,
"step": 890
},
{
"epoch": 0.0744355305599206,
"grad_norm": 0.10886628180742264,
"learning_rate": 9.983842924489137e-06,
"loss": 0.0344,
"step": 900
},
{
"epoch": 0.07526259201058638,
"grad_norm": 0.09753160178661346,
"learning_rate": 9.982725755252557e-06,
"loss": 0.0327,
"step": 910
},
{
"epoch": 0.07608965346125217,
"grad_norm": 0.09709116816520691,
"learning_rate": 9.981571310056116e-06,
"loss": 0.0361,
"step": 920
},
{
"epoch": 0.07691671491191795,
"grad_norm": 0.08936941623687744,
"learning_rate": 9.980379597536263e-06,
"loss": 0.039,
"step": 930
},
{
"epoch": 0.07774377636258374,
"grad_norm": 0.07184750586748123,
"learning_rate": 9.979150626608246e-06,
"loss": 0.034,
"step": 940
},
{
"epoch": 0.07857083781324953,
"grad_norm": 0.07059776037931442,
"learning_rate": 9.97788440646604e-06,
"loss": 0.0314,
"step": 950
},
{
"epoch": 0.07939789926391531,
"grad_norm": 0.07418540120124817,
"learning_rate": 9.976580946582289e-06,
"loss": 0.0338,
"step": 960
},
{
"epoch": 0.08022496071458109,
"grad_norm": 0.14634068310260773,
"learning_rate": 9.975240256708222e-06,
"loss": 0.0344,
"step": 970
},
{
"epoch": 0.08105202216524687,
"grad_norm": 0.10202177613973618,
"learning_rate": 9.973862346873594e-06,
"loss": 0.0312,
"step": 980
},
{
"epoch": 0.08187908361591266,
"grad_norm": 0.08847320824861526,
"learning_rate": 9.9724472273866e-06,
"loss": 0.0335,
"step": 990
},
{
"epoch": 0.08270614506657845,
"grad_norm": 0.1381935477256775,
"learning_rate": 9.9709949088338e-06,
"loss": 0.0399,
"step": 1000
},
{
"epoch": 0.08270614506657845,
"eval_loss": 0.0343376062810421,
"eval_runtime": 1220.1317,
"eval_samples_per_second": 4.917,
"eval_steps_per_second": 0.307,
"step": 1000
},
{
"epoch": 0.08353320651724423,
"grad_norm": 0.15219560265541077,
"learning_rate": 9.969505402080044e-06,
"loss": 0.0337,
"step": 1010
},
{
"epoch": 0.08436026796791002,
"grad_norm": 0.20263217389583588,
"learning_rate": 9.967978718268391e-06,
"loss": 0.0315,
"step": 1020
},
{
"epoch": 0.0851873294185758,
"grad_norm": 0.10303157567977905,
"learning_rate": 9.966414868820022e-06,
"loss": 0.0354,
"step": 1030
},
{
"epoch": 0.08601439086924159,
"grad_norm": 0.10471872240304947,
"learning_rate": 9.964813865434149e-06,
"loss": 0.035,
"step": 1040
},
{
"epoch": 0.08684145231990736,
"grad_norm": 0.08253839612007141,
"learning_rate": 9.963175720087941e-06,
"loss": 0.0317,
"step": 1050
},
{
"epoch": 0.08766851377057315,
"grad_norm": 0.08755608648061752,
"learning_rate": 9.961500445036428e-06,
"loss": 0.0314,
"step": 1060
},
{
"epoch": 0.08849557522123894,
"grad_norm": 0.15729674696922302,
"learning_rate": 9.9597880528124e-06,
"loss": 0.0371,
"step": 1070
},
{
"epoch": 0.08932263667190472,
"grad_norm": 0.14116688072681427,
"learning_rate": 9.958038556226332e-06,
"loss": 0.0317,
"step": 1080
},
{
"epoch": 0.09014969812257051,
"grad_norm": 0.1923297643661499,
"learning_rate": 9.956251968366276e-06,
"loss": 0.035,
"step": 1090
},
{
"epoch": 0.0909767595732363,
"grad_norm": 0.07124887406826019,
"learning_rate": 9.954428302597759e-06,
"loss": 0.0308,
"step": 1100
},
{
"epoch": 0.09180382102390208,
"grad_norm": 0.07786933332681656,
"learning_rate": 9.952567572563696e-06,
"loss": 0.0304,
"step": 1110
},
{
"epoch": 0.09263088247456785,
"grad_norm": 0.15352018177509308,
"learning_rate": 9.950669792184279e-06,
"loss": 0.0332,
"step": 1120
},
{
"epoch": 0.09345794392523364,
"grad_norm": 0.0767594501376152,
"learning_rate": 9.948734975656874e-06,
"loss": 0.032,
"step": 1130
},
{
"epoch": 0.09428500537589943,
"grad_norm": 0.06958391517400742,
"learning_rate": 9.946763137455915e-06,
"loss": 0.032,
"step": 1140
},
{
"epoch": 0.09511206682656521,
"grad_norm": 0.09188707917928696,
"learning_rate": 9.944754292332802e-06,
"loss": 0.0318,
"step": 1150
},
{
"epoch": 0.095939128277231,
"grad_norm": 0.07205051183700562,
"learning_rate": 9.942708455315779e-06,
"loss": 0.03,
"step": 1160
},
{
"epoch": 0.09676618972789679,
"grad_norm": 0.07053325325250626,
"learning_rate": 9.94062564170983e-06,
"loss": 0.0314,
"step": 1170
},
{
"epoch": 0.09759325117856257,
"grad_norm": 0.09830465912818909,
"learning_rate": 9.938505867096563e-06,
"loss": 0.031,
"step": 1180
},
{
"epoch": 0.09842031262922835,
"grad_norm": 0.08823514729738235,
"learning_rate": 9.93634914733409e-06,
"loss": 0.0308,
"step": 1190
},
{
"epoch": 0.09924737407989413,
"grad_norm": 0.16881339251995087,
"learning_rate": 9.934155498556919e-06,
"loss": 0.0319,
"step": 1200
},
{
"epoch": 0.10007443553055992,
"grad_norm": 0.07580441236495972,
"learning_rate": 9.931924937175813e-06,
"loss": 0.0304,
"step": 1210
},
{
"epoch": 0.1009014969812257,
"grad_norm": 0.12182483077049255,
"learning_rate": 9.929657479877688e-06,
"loss": 0.03,
"step": 1220
},
{
"epoch": 0.10172855843189149,
"grad_norm": 0.1227443590760231,
"learning_rate": 9.92735314362548e-06,
"loss": 0.0297,
"step": 1230
},
{
"epoch": 0.10255561988255728,
"grad_norm": 0.1861189603805542,
"learning_rate": 9.925011945658012e-06,
"loss": 0.0298,
"step": 1240
},
{
"epoch": 0.10338268133322306,
"grad_norm": 0.07195472717285156,
"learning_rate": 9.922633903489878e-06,
"loss": 0.0348,
"step": 1250
},
{
"epoch": 0.10420974278388884,
"grad_norm": 0.08541199564933777,
"learning_rate": 9.9202190349113e-06,
"loss": 0.0329,
"step": 1260
},
{
"epoch": 0.10503680423455462,
"grad_norm": 0.07149334251880646,
"learning_rate": 9.917767357988e-06,
"loss": 0.0295,
"step": 1270
},
{
"epoch": 0.10586386568522041,
"grad_norm": 0.07702941447496414,
"learning_rate": 9.915278891061069e-06,
"loss": 0.0317,
"step": 1280
},
{
"epoch": 0.1066909271358862,
"grad_norm": 0.09982211887836456,
"learning_rate": 9.912753652746819e-06,
"loss": 0.0296,
"step": 1290
},
{
"epoch": 0.10751798858655198,
"grad_norm": 0.15653453767299652,
"learning_rate": 9.910191661936654e-06,
"loss": 0.0312,
"step": 1300
},
{
"epoch": 0.10834505003721777,
"grad_norm": 0.09917636215686798,
"learning_rate": 9.907592937796927e-06,
"loss": 0.0304,
"step": 1310
},
{
"epoch": 0.10917211148788356,
"grad_norm": 0.07035510987043381,
"learning_rate": 9.904957499768787e-06,
"loss": 0.0314,
"step": 1320
},
{
"epoch": 0.10999917293854933,
"grad_norm": 0.07983675599098206,
"learning_rate": 9.902285367568049e-06,
"loss": 0.0301,
"step": 1330
},
{
"epoch": 0.11082623438921511,
"grad_norm": 0.06464583426713943,
"learning_rate": 9.899576561185034e-06,
"loss": 0.0305,
"step": 1340
},
{
"epoch": 0.1116532958398809,
"grad_norm": 0.1578930765390396,
"learning_rate": 9.896831100884424e-06,
"loss": 0.0303,
"step": 1350
},
{
"epoch": 0.11248035729054669,
"grad_norm": 0.06734715402126312,
"learning_rate": 9.894049007205112e-06,
"loss": 0.0281,
"step": 1360
},
{
"epoch": 0.11330741874121247,
"grad_norm": 0.09131414443254471,
"learning_rate": 9.891230300960049e-06,
"loss": 0.0302,
"step": 1370
},
{
"epoch": 0.11413448019187826,
"grad_norm": 0.06612879037857056,
"learning_rate": 9.888375003236078e-06,
"loss": 0.032,
"step": 1380
},
{
"epoch": 0.11496154164254405,
"grad_norm": 0.07150176167488098,
"learning_rate": 9.885483135393792e-06,
"loss": 0.031,
"step": 1390
},
{
"epoch": 0.11578860309320982,
"grad_norm": 0.06837069243192673,
"learning_rate": 9.882554719067363e-06,
"loss": 0.0292,
"step": 1400
},
{
"epoch": 0.1166156645438756,
"grad_norm": 0.09854337573051453,
"learning_rate": 9.879589776164387e-06,
"loss": 0.0302,
"step": 1410
},
{
"epoch": 0.11744272599454139,
"grad_norm": 0.06270977109670639,
"learning_rate": 9.87658832886571e-06,
"loss": 0.0285,
"step": 1420
},
{
"epoch": 0.11826978744520718,
"grad_norm": 0.09647868573665619,
"learning_rate": 9.873550399625275e-06,
"loss": 0.0283,
"step": 1430
},
{
"epoch": 0.11909684889587296,
"grad_norm": 0.06852090358734131,
"learning_rate": 9.870476011169948e-06,
"loss": 0.0299,
"step": 1440
},
{
"epoch": 0.11992391034653875,
"grad_norm": 0.06666602194309235,
"learning_rate": 9.867365186499337e-06,
"loss": 0.0338,
"step": 1450
},
{
"epoch": 0.12075097179720454,
"grad_norm": 0.07159853726625443,
"learning_rate": 9.864217948885648e-06,
"loss": 0.0281,
"step": 1460
},
{
"epoch": 0.12157803324787032,
"grad_norm": 0.1366601437330246,
"learning_rate": 9.861034321873481e-06,
"loss": 0.0309,
"step": 1470
},
{
"epoch": 0.1224050946985361,
"grad_norm": 0.08372735232114792,
"learning_rate": 9.85781432927967e-06,
"loss": 0.0308,
"step": 1480
},
{
"epoch": 0.12323215614920188,
"grad_norm": 0.10882294178009033,
"learning_rate": 9.854557995193102e-06,
"loss": 0.0289,
"step": 1490
},
{
"epoch": 0.12405921759986767,
"grad_norm": 0.07682844996452332,
"learning_rate": 9.851265343974534e-06,
"loss": 0.031,
"step": 1500
},
{
"epoch": 0.12488627905053346,
"grad_norm": 0.12793898582458496,
"learning_rate": 9.847936400256415e-06,
"loss": 0.0291,
"step": 1510
},
{
"epoch": 0.12571334050119923,
"grad_norm": 0.07250412553548813,
"learning_rate": 9.844571188942701e-06,
"loss": 0.029,
"step": 1520
},
{
"epoch": 0.12654040195186503,
"grad_norm": 0.06386396288871765,
"learning_rate": 9.841169735208662e-06,
"loss": 0.0307,
"step": 1530
},
{
"epoch": 0.1273674634025308,
"grad_norm": 0.05723176896572113,
"learning_rate": 9.837732064500705e-06,
"loss": 0.0286,
"step": 1540
},
{
"epoch": 0.1281945248531966,
"grad_norm": 0.07307655364274979,
"learning_rate": 9.834258202536173e-06,
"loss": 0.0304,
"step": 1550
},
{
"epoch": 0.12902158630386237,
"grad_norm": 0.06836479902267456,
"learning_rate": 9.830748175303157e-06,
"loss": 0.0286,
"step": 1560
},
{
"epoch": 0.12984864775452817,
"grad_norm": 0.0659850612282753,
"learning_rate": 9.827202009060307e-06,
"loss": 0.0271,
"step": 1570
},
{
"epoch": 0.13067570920519395,
"grad_norm": 0.06328194588422775,
"learning_rate": 9.823619730336624e-06,
"loss": 0.028,
"step": 1580
},
{
"epoch": 0.13150277065585972,
"grad_norm": 0.05650272220373154,
"learning_rate": 9.820001365931273e-06,
"loss": 0.0279,
"step": 1590
},
{
"epoch": 0.13232983210652552,
"grad_norm": 0.10159070044755936,
"learning_rate": 9.816346942913376e-06,
"loss": 0.029,
"step": 1600
},
{
"epoch": 0.1331568935571913,
"grad_norm": 0.4464583098888397,
"learning_rate": 9.812656488621804e-06,
"loss": 0.0298,
"step": 1610
},
{
"epoch": 0.1339839550078571,
"grad_norm": 0.06621966511011124,
"learning_rate": 9.808930030664989e-06,
"loss": 0.0303,
"step": 1620
},
{
"epoch": 0.13481101645852286,
"grad_norm": 0.07061782479286194,
"learning_rate": 9.805167596920707e-06,
"loss": 0.0283,
"step": 1630
},
{
"epoch": 0.13563807790918866,
"grad_norm": 0.06339192390441895,
"learning_rate": 9.80136921553586e-06,
"loss": 0.0274,
"step": 1640
},
{
"epoch": 0.13646513935985444,
"grad_norm": 0.09378033131361008,
"learning_rate": 9.797534914926289e-06,
"loss": 0.028,
"step": 1650
},
{
"epoch": 0.1372922008105202,
"grad_norm": 0.11731720715761185,
"learning_rate": 9.793664723776539e-06,
"loss": 0.0289,
"step": 1660
},
{
"epoch": 0.138119262261186,
"grad_norm": 0.07038633525371552,
"learning_rate": 9.789758671039658e-06,
"loss": 0.0279,
"step": 1670
},
{
"epoch": 0.13894632371185178,
"grad_norm": 0.08343333005905151,
"learning_rate": 9.785816785936973e-06,
"loss": 0.0278,
"step": 1680
},
{
"epoch": 0.13977338516251758,
"grad_norm": 0.08339129388332367,
"learning_rate": 9.781839097957875e-06,
"loss": 0.0302,
"step": 1690
},
{
"epoch": 0.14060044661318336,
"grad_norm": 0.15373483300209045,
"learning_rate": 9.777825636859599e-06,
"loss": 0.0293,
"step": 1700
},
{
"epoch": 0.14142750806384916,
"grad_norm": 0.07383430004119873,
"learning_rate": 9.773776432667e-06,
"loss": 0.0295,
"step": 1710
},
{
"epoch": 0.14225456951451493,
"grad_norm": 0.07228893786668777,
"learning_rate": 9.769691515672328e-06,
"loss": 0.0276,
"step": 1720
},
{
"epoch": 0.1430816309651807,
"grad_norm": 0.09278323501348495,
"learning_rate": 9.765570916434998e-06,
"loss": 0.0289,
"step": 1730
},
{
"epoch": 0.1439086924158465,
"grad_norm": 0.09062926471233368,
"learning_rate": 9.761414665781374e-06,
"loss": 0.028,
"step": 1740
},
{
"epoch": 0.14473575386651227,
"grad_norm": 0.06459420919418335,
"learning_rate": 9.757222794804522e-06,
"loss": 0.0279,
"step": 1750
},
{
"epoch": 0.14556281531717807,
"grad_norm": 0.06160286068916321,
"learning_rate": 9.752995334863985e-06,
"loss": 0.028,
"step": 1760
},
{
"epoch": 0.14638987676784385,
"grad_norm": 0.07651007920503616,
"learning_rate": 9.748732317585557e-06,
"loss": 0.0295,
"step": 1770
},
{
"epoch": 0.14721693821850965,
"grad_norm": 0.08549734950065613,
"learning_rate": 9.744433774861024e-06,
"loss": 0.028,
"step": 1780
},
{
"epoch": 0.14804399966917542,
"grad_norm": 0.06986084580421448,
"learning_rate": 9.74009973884795e-06,
"loss": 0.029,
"step": 1790
},
{
"epoch": 0.1488710611198412,
"grad_norm": 0.0717945545911789,
"learning_rate": 9.735730241969425e-06,
"loss": 0.0287,
"step": 1800
},
{
"epoch": 0.149698122570507,
"grad_norm": 0.055828921496868134,
"learning_rate": 9.731325316913816e-06,
"loss": 0.0279,
"step": 1810
},
{
"epoch": 0.15052518402117276,
"grad_norm": 0.09485841542482376,
"learning_rate": 9.726884996634535e-06,
"loss": 0.0288,
"step": 1820
},
{
"epoch": 0.15135224547183856,
"grad_norm": 0.09525461494922638,
"learning_rate": 9.72240931434979e-06,
"loss": 0.0266,
"step": 1830
},
{
"epoch": 0.15217930692250434,
"grad_norm": 0.058921415358781815,
"learning_rate": 9.717898303542324e-06,
"loss": 0.0278,
"step": 1840
},
{
"epoch": 0.15300636837317014,
"grad_norm": 0.08154033869504929,
"learning_rate": 9.713351997959184e-06,
"loss": 0.0348,
"step": 1850
},
{
"epoch": 0.1538334298238359,
"grad_norm": 0.059776682406663895,
"learning_rate": 9.70877043161145e-06,
"loss": 0.0275,
"step": 1860
},
{
"epoch": 0.15466049127450168,
"grad_norm": 0.089345782995224,
"learning_rate": 9.704153638773996e-06,
"loss": 0.0253,
"step": 1870
},
{
"epoch": 0.15548755272516748,
"grad_norm": 0.0736837238073349,
"learning_rate": 9.699501653985223e-06,
"loss": 0.0263,
"step": 1880
},
{
"epoch": 0.15631461417583326,
"grad_norm": 0.09357444941997528,
"learning_rate": 9.694814512046805e-06,
"loss": 0.0278,
"step": 1890
},
{
"epoch": 0.15714167562649906,
"grad_norm": 0.06266128271818161,
"learning_rate": 9.690092248023428e-06,
"loss": 0.0277,
"step": 1900
},
{
"epoch": 0.15796873707716483,
"grad_norm": 0.081548310816288,
"learning_rate": 9.68533489724253e-06,
"loss": 0.0307,
"step": 1910
},
{
"epoch": 0.15879579852783063,
"grad_norm": 0.05208978429436684,
"learning_rate": 9.680542495294027e-06,
"loss": 0.0277,
"step": 1920
},
{
"epoch": 0.1596228599784964,
"grad_norm": 0.055970244109630585,
"learning_rate": 9.675715078030063e-06,
"loss": 0.027,
"step": 1930
},
{
"epoch": 0.16044992142916217,
"grad_norm": 0.05580395460128784,
"learning_rate": 9.67085268156473e-06,
"loss": 0.0277,
"step": 1940
},
{
"epoch": 0.16127698287982797,
"grad_norm": 0.058690398931503296,
"learning_rate": 9.665955342273799e-06,
"loss": 0.0274,
"step": 1950
},
{
"epoch": 0.16210404433049375,
"grad_norm": 0.06293683499097824,
"learning_rate": 9.661023096794449e-06,
"loss": 0.0267,
"step": 1960
},
{
"epoch": 0.16293110578115955,
"grad_norm": 0.08121343702077866,
"learning_rate": 9.656055982024995e-06,
"loss": 0.0279,
"step": 1970
},
{
"epoch": 0.16375816723182532,
"grad_norm": 0.07257858663797379,
"learning_rate": 9.651054035124614e-06,
"loss": 0.0264,
"step": 1980
},
{
"epoch": 0.16458522868249112,
"grad_norm": 0.06639426201581955,
"learning_rate": 9.646017293513056e-06,
"loss": 0.0265,
"step": 1990
},
{
"epoch": 0.1654122901331569,
"grad_norm": 0.06022842600941658,
"learning_rate": 9.640945794870377e-06,
"loss": 0.0261,
"step": 2000
},
{
"epoch": 0.1654122901331569,
"eval_loss": 0.028430579230189323,
"eval_runtime": 1220.0038,
"eval_samples_per_second": 4.917,
"eval_steps_per_second": 0.307,
"step": 2000
},
{
"epoch": 0.16623935158382266,
"grad_norm": 0.05734001100063324,
"learning_rate": 9.63583957713665e-06,
"loss": 0.0277,
"step": 2010
},
{
"epoch": 0.16706641303448846,
"grad_norm": 0.08106731623411179,
"learning_rate": 9.630698678511684e-06,
"loss": 0.0266,
"step": 2020
},
{
"epoch": 0.16789347448515424,
"grad_norm": 0.056222159415483475,
"learning_rate": 9.625523137454736e-06,
"loss": 0.0261,
"step": 2030
},
{
"epoch": 0.16872053593582004,
"grad_norm": 0.06166260689496994,
"learning_rate": 9.620312992684223e-06,
"loss": 0.0265,
"step": 2040
},
{
"epoch": 0.1695475973864858,
"grad_norm": 0.08784622699022293,
"learning_rate": 9.615068283177434e-06,
"loss": 0.0281,
"step": 2050
},
{
"epoch": 0.1703746588371516,
"grad_norm": 0.0706792026758194,
"learning_rate": 9.609789048170243e-06,
"loss": 0.029,
"step": 2060
},
{
"epoch": 0.17120172028781738,
"grad_norm": 0.05976736173033714,
"learning_rate": 9.604475327156804e-06,
"loss": 0.0254,
"step": 2070
},
{
"epoch": 0.17202878173848318,
"grad_norm": 0.05874831974506378,
"learning_rate": 9.599127159889266e-06,
"loss": 0.0279,
"step": 2080
},
{
"epoch": 0.17285584318914896,
"grad_norm": 0.06354232132434845,
"learning_rate": 9.593744586377472e-06,
"loss": 0.0266,
"step": 2090
},
{
"epoch": 0.17368290463981473,
"grad_norm": 0.06033729389309883,
"learning_rate": 9.588327646888655e-06,
"loss": 0.0266,
"step": 2100
},
{
"epoch": 0.17450996609048053,
"grad_norm": 0.18101929128170013,
"learning_rate": 9.582876381947145e-06,
"loss": 0.0266,
"step": 2110
},
{
"epoch": 0.1753370275411463,
"grad_norm": 0.26323285698890686,
"learning_rate": 9.577390832334064e-06,
"loss": 0.0265,
"step": 2120
},
{
"epoch": 0.1761640889918121,
"grad_norm": 0.05492362007498741,
"learning_rate": 9.571871039087013e-06,
"loss": 0.0266,
"step": 2130
},
{
"epoch": 0.17699115044247787,
"grad_norm": 0.05727216601371765,
"learning_rate": 9.566317043499773e-06,
"loss": 0.0263,
"step": 2140
},
{
"epoch": 0.17781821189314367,
"grad_norm": 0.14531953632831573,
"learning_rate": 9.560728887122e-06,
"loss": 0.0286,
"step": 2150
},
{
"epoch": 0.17864527334380945,
"grad_norm": 0.06639876216650009,
"learning_rate": 9.5551066117589e-06,
"loss": 0.0262,
"step": 2160
},
{
"epoch": 0.17947233479447522,
"grad_norm": 0.06139986589550972,
"learning_rate": 9.549450259470927e-06,
"loss": 0.0272,
"step": 2170
},
{
"epoch": 0.18029939624514102,
"grad_norm": 0.07039148360490799,
"learning_rate": 9.543759872573469e-06,
"loss": 0.0282,
"step": 2180
},
{
"epoch": 0.1811264576958068,
"grad_norm": 0.08487813919782639,
"learning_rate": 9.538035493636524e-06,
"loss": 0.0284,
"step": 2190
},
{
"epoch": 0.1819535191464726,
"grad_norm": 0.07776181399822235,
"learning_rate": 9.532277165484387e-06,
"loss": 0.0279,
"step": 2200
},
{
"epoch": 0.18278058059713836,
"grad_norm": 0.061026498675346375,
"learning_rate": 9.52648493119533e-06,
"loss": 0.0256,
"step": 2210
},
{
"epoch": 0.18360764204780416,
"grad_norm": 0.061437539756298065,
"learning_rate": 9.520658834101275e-06,
"loss": 0.027,
"step": 2220
},
{
"epoch": 0.18443470349846994,
"grad_norm": 0.06019297242164612,
"learning_rate": 9.514798917787477e-06,
"loss": 0.0305,
"step": 2230
},
{
"epoch": 0.1852617649491357,
"grad_norm": 0.08646666258573532,
"learning_rate": 9.50890522609219e-06,
"loss": 0.0263,
"step": 2240
},
{
"epoch": 0.1860888263998015,
"grad_norm": 0.1908756047487259,
"learning_rate": 9.502977803106346e-06,
"loss": 0.0259,
"step": 2250
},
{
"epoch": 0.18691588785046728,
"grad_norm": 0.24025577306747437,
"learning_rate": 9.497016693173218e-06,
"loss": 0.0294,
"step": 2260
},
{
"epoch": 0.18774294930113308,
"grad_norm": 0.07112468034029007,
"learning_rate": 9.491021940888096e-06,
"loss": 0.0266,
"step": 2270
},
{
"epoch": 0.18857001075179886,
"grad_norm": 0.08155805617570877,
"learning_rate": 9.484993591097952e-06,
"loss": 0.0258,
"step": 2280
},
{
"epoch": 0.18939707220246466,
"grad_norm": 0.05596913397312164,
"learning_rate": 9.478931688901095e-06,
"loss": 0.0264,
"step": 2290
},
{
"epoch": 0.19022413365313043,
"grad_norm": 0.059164054691791534,
"learning_rate": 9.472836279646844e-06,
"loss": 0.0272,
"step": 2300
},
{
"epoch": 0.1910511951037962,
"grad_norm": 0.06571198254823685,
"learning_rate": 9.466707408935189e-06,
"loss": 0.0272,
"step": 2310
},
{
"epoch": 0.191878256554462,
"grad_norm": 0.07002273201942444,
"learning_rate": 9.460545122616442e-06,
"loss": 0.0275,
"step": 2320
},
{
"epoch": 0.19270531800512777,
"grad_norm": 0.06005439907312393,
"learning_rate": 9.4543494667909e-06,
"loss": 0.028,
"step": 2330
},
{
"epoch": 0.19353237945579357,
"grad_norm": 0.11123955994844437,
"learning_rate": 9.4481204878085e-06,
"loss": 0.0268,
"step": 2340
},
{
"epoch": 0.19435944090645935,
"grad_norm": 0.0618051253259182,
"learning_rate": 9.441858232268467e-06,
"loss": 0.0259,
"step": 2350
},
{
"epoch": 0.19518650235712515,
"grad_norm": 0.06244316324591637,
"learning_rate": 9.435562747018976e-06,
"loss": 0.0262,
"step": 2360
},
{
"epoch": 0.19601356380779092,
"grad_norm": 0.08520273864269257,
"learning_rate": 9.429234079156787e-06,
"loss": 0.0267,
"step": 2370
},
{
"epoch": 0.1968406252584567,
"grad_norm": 0.06388260424137115,
"learning_rate": 9.422872276026902e-06,
"loss": 0.0263,
"step": 2380
},
{
"epoch": 0.1976676867091225,
"grad_norm": 0.06510653346776962,
"learning_rate": 9.416477385222213e-06,
"loss": 0.0281,
"step": 2390
},
{
"epoch": 0.19849474815978826,
"grad_norm": 0.12499203532934189,
"learning_rate": 9.41004945458314e-06,
"loss": 0.0268,
"step": 2400
},
{
"epoch": 0.19932180961045406,
"grad_norm": 0.06236669421195984,
"learning_rate": 9.403588532197277e-06,
"loss": 0.0262,
"step": 2410
},
{
"epoch": 0.20014887106111984,
"grad_norm": 0.06980706751346588,
"learning_rate": 9.397094666399025e-06,
"loss": 0.0264,
"step": 2420
},
{
"epoch": 0.20097593251178564,
"grad_norm": 0.05483941361308098,
"learning_rate": 9.390567905769242e-06,
"loss": 0.025,
"step": 2430
},
{
"epoch": 0.2018029939624514,
"grad_norm": 0.08971832692623138,
"learning_rate": 9.384008299134871e-06,
"loss": 0.0243,
"step": 2440
},
{
"epoch": 0.20263005541311718,
"grad_norm": 0.07181546092033386,
"learning_rate": 9.377415895568578e-06,
"loss": 0.0257,
"step": 2450
},
{
"epoch": 0.20345711686378298,
"grad_norm": 0.06366802752017975,
"learning_rate": 9.370790744388381e-06,
"loss": 0.026,
"step": 2460
},
{
"epoch": 0.20428417831444876,
"grad_norm": 0.065264031291008,
"learning_rate": 9.36413289515729e-06,
"loss": 0.0274,
"step": 2470
},
{
"epoch": 0.20511123976511456,
"grad_norm": 0.09247761219739914,
"learning_rate": 9.357442397682924e-06,
"loss": 0.0251,
"step": 2480
},
{
"epoch": 0.20593830121578033,
"grad_norm": 0.04937649890780449,
"learning_rate": 9.350719302017148e-06,
"loss": 0.0277,
"step": 2490
},
{
"epoch": 0.20676536266644613,
"grad_norm": 0.06501265615224838,
"learning_rate": 9.343963658455698e-06,
"loss": 0.0266,
"step": 2500
},
{
"epoch": 0.2075924241171119,
"grad_norm": 0.07810965925455093,
"learning_rate": 9.337175517537796e-06,
"loss": 0.0302,
"step": 2510
},
{
"epoch": 0.20841948556777767,
"grad_norm": 0.056350335478782654,
"learning_rate": 9.330354930045782e-06,
"loss": 0.0275,
"step": 2520
},
{
"epoch": 0.20924654701844347,
"grad_norm": 0.070098377764225,
"learning_rate": 9.323501947004727e-06,
"loss": 0.0268,
"step": 2530
},
{
"epoch": 0.21007360846910925,
"grad_norm": 0.07072274386882782,
"learning_rate": 9.316616619682059e-06,
"loss": 0.0256,
"step": 2540
},
{
"epoch": 0.21090066991977505,
"grad_norm": 0.05314943194389343,
"learning_rate": 9.309698999587174e-06,
"loss": 0.0256,
"step": 2550
},
{
"epoch": 0.21172773137044082,
"grad_norm": 0.05929897353053093,
"learning_rate": 9.302749138471046e-06,
"loss": 0.0274,
"step": 2560
},
{
"epoch": 0.21255479282110662,
"grad_norm": 0.07132343202829361,
"learning_rate": 9.295767088325848e-06,
"loss": 0.0256,
"step": 2570
},
{
"epoch": 0.2133818542717724,
"grad_norm": 0.05408504605293274,
"learning_rate": 9.288752901384563e-06,
"loss": 0.0323,
"step": 2580
},
{
"epoch": 0.21420891572243816,
"grad_norm": 0.0529806949198246,
"learning_rate": 9.281706630120592e-06,
"loss": 0.0252,
"step": 2590
},
{
"epoch": 0.21503597717310396,
"grad_norm": 0.09713909775018692,
"learning_rate": 9.274628327247353e-06,
"loss": 0.0249,
"step": 2600
},
{
"epoch": 0.21586303862376974,
"grad_norm": 0.07577594369649887,
"learning_rate": 9.267518045717897e-06,
"loss": 0.0283,
"step": 2610
},
{
"epoch": 0.21669010007443554,
"grad_norm": 0.05945679545402527,
"learning_rate": 9.260375838724511e-06,
"loss": 0.0263,
"step": 2620
},
{
"epoch": 0.2175171615251013,
"grad_norm": 0.06303580105304718,
"learning_rate": 9.253201759698317e-06,
"loss": 0.0297,
"step": 2630
},
{
"epoch": 0.2183442229757671,
"grad_norm": 0.06167830526828766,
"learning_rate": 9.245995862308867e-06,
"loss": 0.0275,
"step": 2640
},
{
"epoch": 0.21917128442643288,
"grad_norm": 0.05566466599702835,
"learning_rate": 9.238758200463756e-06,
"loss": 0.0279,
"step": 2650
},
{
"epoch": 0.21999834587709866,
"grad_norm": 0.06275132298469543,
"learning_rate": 9.231488828308205e-06,
"loss": 0.0248,
"step": 2660
},
{
"epoch": 0.22082540732776446,
"grad_norm": 0.07304584234952927,
"learning_rate": 9.224187800224661e-06,
"loss": 0.0273,
"step": 2670
},
{
"epoch": 0.22165246877843023,
"grad_norm": 0.05730755627155304,
"learning_rate": 9.216855170832393e-06,
"loss": 0.0271,
"step": 2680
},
{
"epoch": 0.22247953022909603,
"grad_norm": 0.05435599759221077,
"learning_rate": 9.209490994987079e-06,
"loss": 0.0248,
"step": 2690
},
{
"epoch": 0.2233065916797618,
"grad_norm": 0.05061393231153488,
"learning_rate": 9.202095327780394e-06,
"loss": 0.0258,
"step": 2700
},
{
"epoch": 0.2241336531304276,
"grad_norm": 0.05590864270925522,
"learning_rate": 9.194668224539608e-06,
"loss": 0.0256,
"step": 2710
},
{
"epoch": 0.22496071458109337,
"grad_norm": 0.04720637574791908,
"learning_rate": 9.187209740827159e-06,
"loss": 0.0243,
"step": 2720
},
{
"epoch": 0.22578777603175915,
"grad_norm": 0.055518608540296555,
"learning_rate": 9.179719932440245e-06,
"loss": 0.026,
"step": 2730
},
{
"epoch": 0.22661483748242495,
"grad_norm": 0.060342345386743546,
"learning_rate": 9.172198855410408e-06,
"loss": 0.0254,
"step": 2740
},
{
"epoch": 0.22744189893309072,
"grad_norm": 0.06279141455888748,
"learning_rate": 9.164646566003109e-06,
"loss": 0.0262,
"step": 2750
},
{
"epoch": 0.22826896038375652,
"grad_norm": 0.050773248076438904,
"learning_rate": 9.15706312071731e-06,
"loss": 0.0271,
"step": 2760
},
{
"epoch": 0.2290960218344223,
"grad_norm": 0.052737098187208176,
"learning_rate": 9.149448576285055e-06,
"loss": 0.0259,
"step": 2770
},
{
"epoch": 0.2299230832850881,
"grad_norm": 0.055731095373630524,
"learning_rate": 9.141802989671036e-06,
"loss": 0.0255,
"step": 2780
},
{
"epoch": 0.23075014473575386,
"grad_norm": 0.05351400747895241,
"learning_rate": 9.134126418072175e-06,
"loss": 0.0255,
"step": 2790
},
{
"epoch": 0.23157720618641964,
"grad_norm": 0.05681459978222847,
"learning_rate": 9.126418918917197e-06,
"loss": 0.0268,
"step": 2800
},
{
"epoch": 0.23240426763708544,
"grad_norm": 0.05028412118554115,
"learning_rate": 9.118680549866193e-06,
"loss": 0.0239,
"step": 2810
},
{
"epoch": 0.2332313290877512,
"grad_norm": 0.05494118854403496,
"learning_rate": 9.110911368810193e-06,
"loss": 0.0239,
"step": 2820
},
{
"epoch": 0.234058390538417,
"grad_norm": 0.04639596492052078,
"learning_rate": 9.10311143387074e-06,
"loss": 0.0246,
"step": 2830
},
{
"epoch": 0.23488545198908278,
"grad_norm": 0.06322944909334183,
"learning_rate": 9.095280803399437e-06,
"loss": 0.0245,
"step": 2840
},
{
"epoch": 0.23571251343974858,
"grad_norm": 0.08805207163095474,
"learning_rate": 9.08741953597753e-06,
"loss": 0.0247,
"step": 2850
},
{
"epoch": 0.23653957489041436,
"grad_norm": 0.058313675224781036,
"learning_rate": 9.079527690415455e-06,
"loss": 0.0258,
"step": 2860
},
{
"epoch": 0.23736663634108016,
"grad_norm": 0.057351235300302505,
"learning_rate": 9.07160532575241e-06,
"loss": 0.0254,
"step": 2870
},
{
"epoch": 0.23819369779174593,
"grad_norm": 0.06278271973133087,
"learning_rate": 9.063652501255904e-06,
"loss": 0.0247,
"step": 2880
},
{
"epoch": 0.2390207592424117,
"grad_norm": 0.05352174490690231,
"learning_rate": 9.055669276421315e-06,
"loss": 0.026,
"step": 2890
},
{
"epoch": 0.2398478206930775,
"grad_norm": 0.05026556923985481,
"learning_rate": 9.047655710971455e-06,
"loss": 0.0266,
"step": 2900
},
{
"epoch": 0.24067488214374327,
"grad_norm": 0.0480768196284771,
"learning_rate": 9.039611864856105e-06,
"loss": 0.0247,
"step": 2910
},
{
"epoch": 0.24150194359440907,
"grad_norm": 0.07413890212774277,
"learning_rate": 9.031537798251589e-06,
"loss": 0.0284,
"step": 2920
},
{
"epoch": 0.24232900504507485,
"grad_norm": 0.07313451170921326,
"learning_rate": 9.023433571560297e-06,
"loss": 0.0256,
"step": 2930
},
{
"epoch": 0.24315606649574065,
"grad_norm": 0.05278393253684044,
"learning_rate": 9.015299245410258e-06,
"loss": 0.0249,
"step": 2940
},
{
"epoch": 0.24398312794640642,
"grad_norm": 0.08677669614553452,
"learning_rate": 9.007134880654677e-06,
"loss": 0.026,
"step": 2950
},
{
"epoch": 0.2448101893970722,
"grad_norm": 0.060126129537820816,
"learning_rate": 8.998940538371472e-06,
"loss": 0.0259,
"step": 2960
},
{
"epoch": 0.245637250847738,
"grad_norm": 0.05079201981425285,
"learning_rate": 8.99071627986283e-06,
"loss": 0.0243,
"step": 2970
},
{
"epoch": 0.24646431229840376,
"grad_norm": 0.053754109889268875,
"learning_rate": 8.982462166654737e-06,
"loss": 0.0257,
"step": 2980
},
{
"epoch": 0.24729137374906957,
"grad_norm": 0.05371469631791115,
"learning_rate": 8.974178260496529e-06,
"loss": 0.0253,
"step": 2990
},
{
"epoch": 0.24811843519973534,
"grad_norm": 0.060160018503665924,
"learning_rate": 8.965864623360418e-06,
"loss": 0.0283,
"step": 3000
},
{
"epoch": 0.24811843519973534,
"eval_loss": 0.026417342945933342,
"eval_runtime": 1220.3014,
"eval_samples_per_second": 4.916,
"eval_steps_per_second": 0.307,
"step": 3000
},
{
"epoch": 0.24894549665040114,
"grad_norm": 0.06683066487312317,
"learning_rate": 8.957521317441043e-06,
"loss": 0.0245,
"step": 3010
},
{
"epoch": 0.2497725581010669,
"grad_norm": 0.045557327568531036,
"learning_rate": 8.949148405154986e-06,
"loss": 0.0251,
"step": 3020
},
{
"epoch": 0.2505996195517327,
"grad_norm": 0.05416623502969742,
"learning_rate": 8.940745949140323e-06,
"loss": 0.0247,
"step": 3030
},
{
"epoch": 0.25142668100239846,
"grad_norm": 0.17342466115951538,
"learning_rate": 8.932314012256147e-06,
"loss": 0.0249,
"step": 3040
},
{
"epoch": 0.25225374245306426,
"grad_norm": 0.06348035484552383,
"learning_rate": 8.923852657582092e-06,
"loss": 0.0258,
"step": 3050
},
{
"epoch": 0.25308080390373006,
"grad_norm": 0.05559645593166351,
"learning_rate": 8.915361948417878e-06,
"loss": 0.0361,
"step": 3060
},
{
"epoch": 0.25390786535439586,
"grad_norm": 0.050857000052928925,
"learning_rate": 8.906841948282818e-06,
"loss": 0.0257,
"step": 3070
},
{
"epoch": 0.2547349268050616,
"grad_norm": 0.04826486483216286,
"learning_rate": 8.898292720915354e-06,
"loss": 0.0257,
"step": 3080
},
{
"epoch": 0.2555619882557274,
"grad_norm": 0.06656019389629364,
"learning_rate": 8.889714330272584e-06,
"loss": 0.0261,
"step": 3090
},
{
"epoch": 0.2563890497063932,
"grad_norm": 0.06416959315538406,
"learning_rate": 8.881106840529769e-06,
"loss": 0.0252,
"step": 3100
},
{
"epoch": 0.25721611115705895,
"grad_norm": 0.04848102107644081,
"learning_rate": 8.872470316079866e-06,
"loss": 0.024,
"step": 3110
},
{
"epoch": 0.25804317260772475,
"grad_norm": 0.06827887147665024,
"learning_rate": 8.863804821533043e-06,
"loss": 0.0236,
"step": 3120
},
{
"epoch": 0.25887023405839055,
"grad_norm": 0.0632987692952156,
"learning_rate": 8.855110421716191e-06,
"loss": 0.0261,
"step": 3130
},
{
"epoch": 0.25969729550905635,
"grad_norm": 0.05443909019231796,
"learning_rate": 8.846387181672443e-06,
"loss": 0.0245,
"step": 3140
},
{
"epoch": 0.2605243569597221,
"grad_norm": 0.050953421741724014,
"learning_rate": 8.837635166660689e-06,
"loss": 0.0258,
"step": 3150
},
{
"epoch": 0.2613514184103879,
"grad_norm": 0.04987896978855133,
"learning_rate": 8.828854442155087e-06,
"loss": 0.0259,
"step": 3160
},
{
"epoch": 0.2621784798610537,
"grad_norm": 0.05325448885560036,
"learning_rate": 8.820045073844563e-06,
"loss": 0.0263,
"step": 3170
},
{
"epoch": 0.26300554131171944,
"grad_norm": 0.06813682615756989,
"learning_rate": 8.81120712763234e-06,
"loss": 0.024,
"step": 3180
},
{
"epoch": 0.26383260276238524,
"grad_norm": 0.053441476076841354,
"learning_rate": 8.802340669635423e-06,
"loss": 0.0255,
"step": 3190
},
{
"epoch": 0.26465966421305104,
"grad_norm": 0.061251021921634674,
"learning_rate": 8.793445766184126e-06,
"loss": 0.0329,
"step": 3200
},
{
"epoch": 0.26548672566371684,
"grad_norm": 0.06079159677028656,
"learning_rate": 8.784522483821554e-06,
"loss": 0.0271,
"step": 3210
},
{
"epoch": 0.2663137871143826,
"grad_norm": 0.04815410450100899,
"learning_rate": 8.77557088930312e-06,
"loss": 0.0256,
"step": 3220
},
{
"epoch": 0.2671408485650484,
"grad_norm": 0.058222122490406036,
"learning_rate": 8.766591049596043e-06,
"loss": 0.0239,
"step": 3230
},
{
"epoch": 0.2679679100157142,
"grad_norm": 0.06425308436155319,
"learning_rate": 8.75758303187884e-06,
"loss": 0.0248,
"step": 3240
},
{
"epoch": 0.26879497146637993,
"grad_norm": 0.05385325476527214,
"learning_rate": 8.748546903540838e-06,
"loss": 0.0249,
"step": 3250
},
{
"epoch": 0.26962203291704573,
"grad_norm": 0.04803679138422012,
"learning_rate": 8.739482732181648e-06,
"loss": 0.0313,
"step": 3260
},
{
"epoch": 0.27044909436771153,
"grad_norm": 0.05667194724082947,
"learning_rate": 8.730390585610685e-06,
"loss": 0.025,
"step": 3270
},
{
"epoch": 0.27127615581837733,
"grad_norm": 0.04525600001215935,
"learning_rate": 8.72127053184664e-06,
"loss": 0.0254,
"step": 3280
},
{
"epoch": 0.2721032172690431,
"grad_norm": 0.07599420845508575,
"learning_rate": 8.712122639116975e-06,
"loss": 0.0243,
"step": 3290
},
{
"epoch": 0.2729302787197089,
"grad_norm": 0.052151359617710114,
"learning_rate": 8.70294697585743e-06,
"loss": 0.0234,
"step": 3300
},
{
"epoch": 0.2737573401703747,
"grad_norm": 0.05731287971138954,
"learning_rate": 8.693743610711482e-06,
"loss": 0.0248,
"step": 3310
},
{
"epoch": 0.2745844016210404,
"grad_norm": 0.04920828342437744,
"learning_rate": 8.684512612529857e-06,
"loss": 0.0245,
"step": 3320
},
{
"epoch": 0.2754114630717062,
"grad_norm": 0.05730625241994858,
"learning_rate": 8.67525405037e-06,
"loss": 0.0264,
"step": 3330
},
{
"epoch": 0.276238524522372,
"grad_norm": 0.04498128592967987,
"learning_rate": 8.665967993495568e-06,
"loss": 0.0244,
"step": 3340
},
{
"epoch": 0.2770655859730378,
"grad_norm": 0.0674099400639534,
"learning_rate": 8.656654511375902e-06,
"loss": 0.0285,
"step": 3350
},
{
"epoch": 0.27789264742370356,
"grad_norm": 0.06094598397612572,
"learning_rate": 8.64731367368551e-06,
"loss": 0.0258,
"step": 3360
},
{
"epoch": 0.27871970887436937,
"grad_norm": 0.07126502692699432,
"learning_rate": 8.637945550303557e-06,
"loss": 0.0279,
"step": 3370
},
{
"epoch": 0.27954677032503517,
"grad_norm": 0.08413068950176239,
"learning_rate": 8.628550211313328e-06,
"loss": 0.0441,
"step": 3380
},
{
"epoch": 0.2803738317757009,
"grad_norm": 0.04862065240740776,
"learning_rate": 8.619127727001708e-06,
"loss": 0.0238,
"step": 3390
},
{
"epoch": 0.2812008932263667,
"grad_norm": 0.0653972402215004,
"learning_rate": 8.60967816785866e-06,
"loss": 0.0245,
"step": 3400
},
{
"epoch": 0.2820279546770325,
"grad_norm": 0.05237039551138878,
"learning_rate": 8.60020160457669e-06,
"loss": 0.0255,
"step": 3410
},
{
"epoch": 0.2828550161276983,
"grad_norm": 0.06689222902059555,
"learning_rate": 8.59069810805033e-06,
"loss": 0.0286,
"step": 3420
},
{
"epoch": 0.28368207757836406,
"grad_norm": 0.06750566512346268,
"learning_rate": 8.581167749375596e-06,
"loss": 0.0373,
"step": 3430
},
{
"epoch": 0.28450913902902986,
"grad_norm": 0.04513133317232132,
"learning_rate": 8.571610599849462e-06,
"loss": 0.0266,
"step": 3440
},
{
"epoch": 0.28533620047969566,
"grad_norm": 0.05559685453772545,
"learning_rate": 8.562026730969325e-06,
"loss": 0.0253,
"step": 3450
},
{
"epoch": 0.2861632619303614,
"grad_norm": 0.04561685398221016,
"learning_rate": 8.552416214432469e-06,
"loss": 0.0259,
"step": 3460
},
{
"epoch": 0.2869903233810272,
"grad_norm": 0.054727304726839066,
"learning_rate": 8.542779122135532e-06,
"loss": 0.0254,
"step": 3470
},
{
"epoch": 0.287817384831693,
"grad_norm": 0.05550670251250267,
"learning_rate": 8.533115526173969e-06,
"loss": 0.025,
"step": 3480
},
{
"epoch": 0.2886444462823588,
"grad_norm": 0.04571954905986786,
"learning_rate": 8.523425498841505e-06,
"loss": 0.0272,
"step": 3490
},
{
"epoch": 0.28947150773302455,
"grad_norm": 0.07001665234565735,
"learning_rate": 8.513709112629599e-06,
"loss": 0.0245,
"step": 3500
},
{
"epoch": 0.29029856918369035,
"grad_norm": 0.05153432488441467,
"learning_rate": 8.503966440226908e-06,
"loss": 0.0424,
"step": 3510
},
{
"epoch": 0.29112563063435615,
"grad_norm": 0.05176723748445511,
"learning_rate": 8.494197554518729e-06,
"loss": 0.0245,
"step": 3520
},
{
"epoch": 0.2919526920850219,
"grad_norm": 0.07877220213413239,
"learning_rate": 8.484402528586469e-06,
"loss": 0.0241,
"step": 3530
},
{
"epoch": 0.2927797535356877,
"grad_norm": 0.0443316325545311,
"learning_rate": 8.474581435707085e-06,
"loss": 0.0245,
"step": 3540
},
{
"epoch": 0.2936068149863535,
"grad_norm": 0.05324044078588486,
"learning_rate": 8.464734349352544e-06,
"loss": 0.024,
"step": 3550
},
{
"epoch": 0.2944338764370193,
"grad_norm": 0.0497773103415966,
"learning_rate": 8.454861343189274e-06,
"loss": 0.0236,
"step": 3560
},
{
"epoch": 0.29526093788768504,
"grad_norm": 0.04881919547915459,
"learning_rate": 8.444962491077604e-06,
"loss": 0.0236,
"step": 3570
},
{
"epoch": 0.29608799933835084,
"grad_norm": 0.054020971059799194,
"learning_rate": 8.435037867071225e-06,
"loss": 0.0264,
"step": 3580
},
{
"epoch": 0.29691506078901664,
"grad_norm": 0.04821145534515381,
"learning_rate": 8.425087545416622e-06,
"loss": 0.0235,
"step": 3590
},
{
"epoch": 0.2977421222396824,
"grad_norm": 0.04773546755313873,
"learning_rate": 8.41511160055253e-06,
"loss": 0.0406,
"step": 3600
},
{
"epoch": 0.2985691836903482,
"grad_norm": 0.06340964883565903,
"learning_rate": 8.405110107109365e-06,
"loss": 0.0252,
"step": 3610
},
{
"epoch": 0.299396245141014,
"grad_norm": 0.0523238480091095,
"learning_rate": 8.395083139908684e-06,
"loss": 0.0245,
"step": 3620
},
{
"epoch": 0.3002233065916798,
"grad_norm": 0.04797879606485367,
"learning_rate": 8.385030773962605e-06,
"loss": 0.0257,
"step": 3630
},
{
"epoch": 0.30105036804234553,
"grad_norm": 0.05554933100938797,
"learning_rate": 8.37495308447326e-06,
"loss": 0.0233,
"step": 3640
},
{
"epoch": 0.30187742949301133,
"grad_norm": 0.08046616613864899,
"learning_rate": 8.364850146832218e-06,
"loss": 0.0237,
"step": 3650
},
{
"epoch": 0.30270449094367713,
"grad_norm": 0.04799005016684532,
"learning_rate": 8.354722036619947e-06,
"loss": 0.0244,
"step": 3660
},
{
"epoch": 0.3035315523943429,
"grad_norm": 0.05324197933077812,
"learning_rate": 8.344568829605216e-06,
"loss": 0.0232,
"step": 3670
},
{
"epoch": 0.3043586138450087,
"grad_norm": 0.04944256320595741,
"learning_rate": 8.334390601744556e-06,
"loss": 0.0255,
"step": 3680
},
{
"epoch": 0.3051856752956745,
"grad_norm": 0.0510077141225338,
"learning_rate": 8.324187429181669e-06,
"loss": 0.0252,
"step": 3690
},
{
"epoch": 0.3060127367463403,
"grad_norm": 0.045672621577978134,
"learning_rate": 8.313959388246882e-06,
"loss": 0.0257,
"step": 3700
},
{
"epoch": 0.306839798197006,
"grad_norm": 0.04965253919363022,
"learning_rate": 8.303706555456547e-06,
"loss": 0.0291,
"step": 3710
},
{
"epoch": 0.3076668596476718,
"grad_norm": 0.043674346059560776,
"learning_rate": 8.293429007512503e-06,
"loss": 0.0253,
"step": 3720
},
{
"epoch": 0.3084939210983376,
"grad_norm": 0.04634533450007439,
"learning_rate": 8.283126821301468e-06,
"loss": 0.0236,
"step": 3730
},
{
"epoch": 0.30932098254900336,
"grad_norm": 0.06959991902112961,
"learning_rate": 8.272800073894492e-06,
"loss": 0.0245,
"step": 3740
},
{
"epoch": 0.31014804399966917,
"grad_norm": 0.04980204254388809,
"learning_rate": 8.26244884254636e-06,
"loss": 0.0237,
"step": 3750
},
{
"epoch": 0.31097510545033497,
"grad_norm": 0.052351828664541245,
"learning_rate": 8.252073204695025e-06,
"loss": 0.0257,
"step": 3760
},
{
"epoch": 0.31180216690100077,
"grad_norm": 0.04672665148973465,
"learning_rate": 8.241673237961027e-06,
"loss": 0.0238,
"step": 3770
},
{
"epoch": 0.3126292283516665,
"grad_norm": 0.041996221989393234,
"learning_rate": 8.231249020146913e-06,
"loss": 0.024,
"step": 3780
},
{
"epoch": 0.3134562898023323,
"grad_norm": 0.05913085490465164,
"learning_rate": 8.220800629236647e-06,
"loss": 0.0244,
"step": 3790
},
{
"epoch": 0.3142833512529981,
"grad_norm": 0.04715942218899727,
"learning_rate": 8.21032814339504e-06,
"loss": 0.0239,
"step": 3800
},
{
"epoch": 0.31511041270366386,
"grad_norm": 0.04261414706707001,
"learning_rate": 8.19983164096715e-06,
"loss": 0.0231,
"step": 3810
},
{
"epoch": 0.31593747415432966,
"grad_norm": 0.05027526617050171,
"learning_rate": 8.189311200477713e-06,
"loss": 0.0245,
"step": 3820
},
{
"epoch": 0.31676453560499546,
"grad_norm": 0.19037795066833496,
"learning_rate": 8.17876690063054e-06,
"loss": 0.0242,
"step": 3830
},
{
"epoch": 0.31759159705566126,
"grad_norm": 0.09254226088523865,
"learning_rate": 8.168198820307938e-06,
"loss": 0.0234,
"step": 3840
},
{
"epoch": 0.318418658506327,
"grad_norm": 0.04657592624425888,
"learning_rate": 8.157607038570117e-06,
"loss": 0.0241,
"step": 3850
},
{
"epoch": 0.3192457199569928,
"grad_norm": 0.06853280961513519,
"learning_rate": 8.146991634654595e-06,
"loss": 0.0261,
"step": 3860
},
{
"epoch": 0.3200727814076586,
"grad_norm": 0.05595746263861656,
"learning_rate": 8.136352687975609e-06,
"loss": 0.0242,
"step": 3870
},
{
"epoch": 0.32089984285832435,
"grad_norm": 0.04363076388835907,
"learning_rate": 8.125690278123524e-06,
"loss": 0.0235,
"step": 3880
},
{
"epoch": 0.32172690430899015,
"grad_norm": 0.06170443445444107,
"learning_rate": 8.115004484864231e-06,
"loss": 0.0233,
"step": 3890
},
{
"epoch": 0.32255396575965595,
"grad_norm": 0.04467644914984703,
"learning_rate": 8.104295388138553e-06,
"loss": 0.0245,
"step": 3900
},
{
"epoch": 0.32338102721032175,
"grad_norm": 0.06176682561635971,
"learning_rate": 8.093563068061649e-06,
"loss": 0.0232,
"step": 3910
},
{
"epoch": 0.3242080886609875,
"grad_norm": 0.047685880213975906,
"learning_rate": 8.082807604922409e-06,
"loss": 0.0248,
"step": 3920
},
{
"epoch": 0.3250351501116533,
"grad_norm": 0.05187467485666275,
"learning_rate": 8.072029079182862e-06,
"loss": 0.0245,
"step": 3930
},
{
"epoch": 0.3258622115623191,
"grad_norm": 0.04737105965614319,
"learning_rate": 8.061227571477565e-06,
"loss": 0.0268,
"step": 3940
},
{
"epoch": 0.32668927301298484,
"grad_norm": 0.04560704901814461,
"learning_rate": 8.050403162613007e-06,
"loss": 0.024,
"step": 3950
},
{
"epoch": 0.32751633446365064,
"grad_norm": 0.057890139520168304,
"learning_rate": 8.039555933567e-06,
"loss": 0.0267,
"step": 3960
},
{
"epoch": 0.32834339591431644,
"grad_norm": 0.04416472092270851,
"learning_rate": 8.028685965488074e-06,
"loss": 0.0241,
"step": 3970
},
{
"epoch": 0.32917045736498224,
"grad_norm": 0.04871301352977753,
"learning_rate": 8.017793339694873e-06,
"loss": 0.0237,
"step": 3980
},
{
"epoch": 0.329997518815648,
"grad_norm": 0.05144352838397026,
"learning_rate": 8.00687813767554e-06,
"loss": 0.0236,
"step": 3990
},
{
"epoch": 0.3308245802663138,
"grad_norm": 0.06144755333662033,
"learning_rate": 7.995940441087117e-06,
"loss": 0.0228,
"step": 4000
},
{
"epoch": 0.3308245802663138,
"eval_loss": 0.025024140253663063,
"eval_runtime": 1220.32,
"eval_samples_per_second": 4.916,
"eval_steps_per_second": 0.307,
"step": 4000
},
{
"epoch": 0.3316516417169796,
"grad_norm": 0.07986024022102356,
"learning_rate": 7.984980331754924e-06,
"loss": 0.0249,
"step": 4010
},
{
"epoch": 0.33247870316764533,
"grad_norm": 0.04930829629302025,
"learning_rate": 7.973997891671953e-06,
"loss": 0.024,
"step": 4020
},
{
"epoch": 0.33330576461831113,
"grad_norm": 0.07743251323699951,
"learning_rate": 7.962993202998257e-06,
"loss": 0.0234,
"step": 4030
},
{
"epoch": 0.33413282606897693,
"grad_norm": 0.05702010914683342,
"learning_rate": 7.951966348060325e-06,
"loss": 0.025,
"step": 4040
},
{
"epoch": 0.33495988751964273,
"grad_norm": 0.042675841599702835,
"learning_rate": 7.940917409350476e-06,
"loss": 0.0245,
"step": 4050
},
{
"epoch": 0.3357869489703085,
"grad_norm": 0.04492352157831192,
"learning_rate": 7.929846469526242e-06,
"loss": 0.025,
"step": 4060
},
{
"epoch": 0.3366140104209743,
"grad_norm": 0.07774407416582108,
"learning_rate": 7.91875361140974e-06,
"loss": 0.0226,
"step": 4070
},
{
"epoch": 0.3374410718716401,
"grad_norm": 0.06625732779502869,
"learning_rate": 7.90763891798706e-06,
"loss": 0.0235,
"step": 4080
},
{
"epoch": 0.3382681333223059,
"grad_norm": 0.048172276467084885,
"learning_rate": 7.896502472407644e-06,
"loss": 0.0236,
"step": 4090
},
{
"epoch": 0.3390951947729716,
"grad_norm": 0.05588380619883537,
"learning_rate": 7.885344357983665e-06,
"loss": 0.0365,
"step": 4100
},
{
"epoch": 0.3399222562236374,
"grad_norm": 0.04697740450501442,
"learning_rate": 7.874164658189398e-06,
"loss": 0.0261,
"step": 4110
},
{
"epoch": 0.3407493176743032,
"grad_norm": 0.14661569893360138,
"learning_rate": 7.8629634566606e-06,
"loss": 0.0422,
"step": 4120
},
{
"epoch": 0.34157637912496897,
"grad_norm": 0.050860997289419174,
"learning_rate": 7.851740837193883e-06,
"loss": 0.0253,
"step": 4130
},
{
"epoch": 0.34240344057563477,
"grad_norm": 0.06831306964159012,
"learning_rate": 7.840496883746089e-06,
"loss": 0.0236,
"step": 4140
},
{
"epoch": 0.34323050202630057,
"grad_norm": 0.07154014706611633,
"learning_rate": 7.829231680433658e-06,
"loss": 0.0241,
"step": 4150
},
{
"epoch": 0.34405756347696637,
"grad_norm": 0.060069840401411057,
"learning_rate": 7.817945311532001e-06,
"loss": 0.0233,
"step": 4160
},
{
"epoch": 0.3448846249276321,
"grad_norm": 0.06343766301870346,
"learning_rate": 7.806637861474873e-06,
"loss": 0.029,
"step": 4170
},
{
"epoch": 0.3457116863782979,
"grad_norm": 0.046083442866802216,
"learning_rate": 7.795309414853735e-06,
"loss": 0.0233,
"step": 4180
},
{
"epoch": 0.3465387478289637,
"grad_norm": 0.04395199194550514,
"learning_rate": 7.783960056417123e-06,
"loss": 0.024,
"step": 4190
},
{
"epoch": 0.34736580927962946,
"grad_norm": 0.04960530623793602,
"learning_rate": 7.77258987107002e-06,
"loss": 0.0252,
"step": 4200
},
{
"epoch": 0.34819287073029526,
"grad_norm": 0.053416695445775986,
"learning_rate": 7.76119894387321e-06,
"loss": 0.0233,
"step": 4210
},
{
"epoch": 0.34901993218096106,
"grad_norm": 0.06489969789981842,
"learning_rate": 7.749787360042651e-06,
"loss": 0.0225,
"step": 4220
},
{
"epoch": 0.34984699363162686,
"grad_norm": 0.054353874176740646,
"learning_rate": 7.738355204948833e-06,
"loss": 0.025,
"step": 4230
},
{
"epoch": 0.3506740550822926,
"grad_norm": 0.05458907410502434,
"learning_rate": 7.726902564116141e-06,
"loss": 0.0234,
"step": 4240
},
{
"epoch": 0.3515011165329584,
"grad_norm": 0.04842905327677727,
"learning_rate": 7.715429523222214e-06,
"loss": 0.0221,
"step": 4250
},
{
"epoch": 0.3523281779836242,
"grad_norm": 0.0519806407392025,
"learning_rate": 7.703936168097306e-06,
"loss": 0.0239,
"step": 4260
},
{
"epoch": 0.35315523943428995,
"grad_norm": 0.05236365273594856,
"learning_rate": 7.692422584723641e-06,
"loss": 0.0235,
"step": 4270
},
{
"epoch": 0.35398230088495575,
"grad_norm": 0.04914037883281708,
"learning_rate": 7.68088885923477e-06,
"loss": 0.0235,
"step": 4280
},
{
"epoch": 0.35480936233562155,
"grad_norm": 0.05043815076351166,
"learning_rate": 7.669335077914932e-06,
"loss": 0.0241,
"step": 4290
},
{
"epoch": 0.35563642378628735,
"grad_norm": 0.04599103704094887,
"learning_rate": 7.657761327198404e-06,
"loss": 0.0242,
"step": 4300
},
{
"epoch": 0.3564634852369531,
"grad_norm": 0.04246712476015091,
"learning_rate": 7.646167693668846e-06,
"loss": 0.0241,
"step": 4310
},
{
"epoch": 0.3572905466876189,
"grad_norm": 0.04617106169462204,
"learning_rate": 7.634554264058676e-06,
"loss": 0.0235,
"step": 4320
},
{
"epoch": 0.3581176081382847,
"grad_norm": 0.046657975763082504,
"learning_rate": 7.6229211252483956e-06,
"loss": 0.0233,
"step": 4330
},
{
"epoch": 0.35894466958895044,
"grad_norm": 0.047864075750112534,
"learning_rate": 7.611268364265958e-06,
"loss": 0.0241,
"step": 4340
},
{
"epoch": 0.35977173103961624,
"grad_norm": 0.054371584206819534,
"learning_rate": 7.599596068286111e-06,
"loss": 0.0238,
"step": 4350
},
{
"epoch": 0.36059879249028204,
"grad_norm": 0.04631248489022255,
"learning_rate": 7.58790432462974e-06,
"loss": 0.0268,
"step": 4360
},
{
"epoch": 0.36142585394094784,
"grad_norm": 0.06476343423128128,
"learning_rate": 7.576193220763221e-06,
"loss": 0.0246,
"step": 4370
},
{
"epoch": 0.3622529153916136,
"grad_norm": 0.057965509593486786,
"learning_rate": 7.564462844297766e-06,
"loss": 0.0233,
"step": 4380
},
{
"epoch": 0.3630799768422794,
"grad_norm": 0.05117254704236984,
"learning_rate": 7.552713282988765e-06,
"loss": 0.024,
"step": 4390
},
{
"epoch": 0.3639070382929452,
"grad_norm": 0.0481458455324173,
"learning_rate": 7.540944624735132e-06,
"loss": 0.0233,
"step": 4400
},
{
"epoch": 0.36473409974361093,
"grad_norm": 0.0458373986184597,
"learning_rate": 7.529156957578641e-06,
"loss": 0.0228,
"step": 4410
},
{
"epoch": 0.36556116119427673,
"grad_norm": 0.043816305696964264,
"learning_rate": 7.517350369703279e-06,
"loss": 0.0234,
"step": 4420
},
{
"epoch": 0.36638822264494253,
"grad_norm": 0.050691138952970505,
"learning_rate": 7.505524949434575e-06,
"loss": 0.0219,
"step": 4430
},
{
"epoch": 0.36721528409560833,
"grad_norm": 0.0413176566362381,
"learning_rate": 7.493680785238948e-06,
"loss": 0.0231,
"step": 4440
},
{
"epoch": 0.3680423455462741,
"grad_norm": 0.04249545931816101,
"learning_rate": 7.481817965723035e-06,
"loss": 0.0226,
"step": 4450
},
{
"epoch": 0.3688694069969399,
"grad_norm": 0.05581935495138168,
"learning_rate": 7.4699365796330395e-06,
"loss": 0.0265,
"step": 4460
},
{
"epoch": 0.3696964684476057,
"grad_norm": 0.0569755993783474,
"learning_rate": 7.458036715854059e-06,
"loss": 0.0232,
"step": 4470
},
{
"epoch": 0.3705235298982714,
"grad_norm": 0.05333729833364487,
"learning_rate": 7.4461184634094256e-06,
"loss": 0.0242,
"step": 4480
},
{
"epoch": 0.3713505913489372,
"grad_norm": 0.05248766019940376,
"learning_rate": 7.434181911460036e-06,
"loss": 0.0307,
"step": 4490
},
{
"epoch": 0.372177652799603,
"grad_norm": 0.043839454650878906,
"learning_rate": 7.4222271493036875e-06,
"loss": 0.0241,
"step": 4500
},
{
"epoch": 0.3730047142502688,
"grad_norm": 0.05857829377055168,
"learning_rate": 7.41025426637441e-06,
"loss": 0.0223,
"step": 4510
},
{
"epoch": 0.37383177570093457,
"grad_norm": 0.041583914309740067,
"learning_rate": 7.398263352241788e-06,
"loss": 0.0225,
"step": 4520
},
{
"epoch": 0.37465883715160037,
"grad_norm": 0.043787844479084015,
"learning_rate": 7.386254496610309e-06,
"loss": 0.0215,
"step": 4530
},
{
"epoch": 0.37548589860226617,
"grad_norm": 0.04298454895615578,
"learning_rate": 7.374227789318673e-06,
"loss": 0.0229,
"step": 4540
},
{
"epoch": 0.3763129600529319,
"grad_norm": 0.05074107274413109,
"learning_rate": 7.362183320339133e-06,
"loss": 0.023,
"step": 4550
},
{
"epoch": 0.3771400215035977,
"grad_norm": 0.06284487992525101,
"learning_rate": 7.350121179776819e-06,
"loss": 0.0231,
"step": 4560
},
{
"epoch": 0.3779670829542635,
"grad_norm": 0.053102780133485794,
"learning_rate": 7.33804145786906e-06,
"loss": 0.0255,
"step": 4570
},
{
"epoch": 0.3787941444049293,
"grad_norm": 0.04331573098897934,
"learning_rate": 7.325944244984711e-06,
"loss": 0.0228,
"step": 4580
},
{
"epoch": 0.37962120585559506,
"grad_norm": 0.051730215549468994,
"learning_rate": 7.31382963162348e-06,
"loss": 0.0216,
"step": 4590
},
{
"epoch": 0.38044826730626086,
"grad_norm": 0.03934797644615173,
"learning_rate": 7.301697708415248e-06,
"loss": 0.0242,
"step": 4600
},
{
"epoch": 0.38127532875692666,
"grad_norm": 0.04784635826945305,
"learning_rate": 7.289548566119391e-06,
"loss": 0.0221,
"step": 4610
},
{
"epoch": 0.3821023902075924,
"grad_norm": 0.1260228306055069,
"learning_rate": 7.277382295624104e-06,
"loss": 0.0282,
"step": 4620
},
{
"epoch": 0.3829294516582582,
"grad_norm": 0.06200871989130974,
"learning_rate": 7.265198987945714e-06,
"loss": 0.0261,
"step": 4630
},
{
"epoch": 0.383756513108924,
"grad_norm": 0.061095982789993286,
"learning_rate": 7.252998734228007e-06,
"loss": 0.0245,
"step": 4640
},
{
"epoch": 0.3845835745595898,
"grad_norm": 0.053159236907958984,
"learning_rate": 7.240781625741545e-06,
"loss": 0.0233,
"step": 4650
},
{
"epoch": 0.38541063601025555,
"grad_norm": 0.0482206866145134,
"learning_rate": 7.228547753882976e-06,
"loss": 0.0261,
"step": 4660
},
{
"epoch": 0.38623769746092135,
"grad_norm": 0.05078030377626419,
"learning_rate": 7.216297210174361e-06,
"loss": 0.0244,
"step": 4670
},
{
"epoch": 0.38706475891158715,
"grad_norm": 0.044170767068862915,
"learning_rate": 7.204030086262478e-06,
"loss": 0.0238,
"step": 4680
},
{
"epoch": 0.3878918203622529,
"grad_norm": 0.04695448279380798,
"learning_rate": 7.191746473918148e-06,
"loss": 0.0223,
"step": 4690
},
{
"epoch": 0.3887188818129187,
"grad_norm": 0.052788231521844864,
"learning_rate": 7.179446465035535e-06,
"loss": 0.0249,
"step": 4700
},
{
"epoch": 0.3895459432635845,
"grad_norm": 0.05831609293818474,
"learning_rate": 7.167130151631475e-06,
"loss": 0.0244,
"step": 4710
},
{
"epoch": 0.3903730047142503,
"grad_norm": 0.05152612552046776,
"learning_rate": 7.154797625844773e-06,
"loss": 0.0224,
"step": 4720
},
{
"epoch": 0.39120006616491604,
"grad_norm": 0.047528255730867386,
"learning_rate": 7.142448979935521e-06,
"loss": 0.0236,
"step": 4730
},
{
"epoch": 0.39202712761558184,
"grad_norm": 0.051114026457071304,
"learning_rate": 7.130084306284406e-06,
"loss": 0.0235,
"step": 4740
},
{
"epoch": 0.39285418906624764,
"grad_norm": 0.04298287630081177,
"learning_rate": 7.11770369739202e-06,
"loss": 0.0224,
"step": 4750
},
{
"epoch": 0.3936812505169134,
"grad_norm": 0.05048811435699463,
"learning_rate": 7.105307245878166e-06,
"loss": 0.0238,
"step": 4760
},
{
"epoch": 0.3945083119675792,
"grad_norm": 0.04245224595069885,
"learning_rate": 7.092895044481165e-06,
"loss": 0.0235,
"step": 4770
},
{
"epoch": 0.395335373418245,
"grad_norm": 0.05021793767809868,
"learning_rate": 7.080467186057168e-06,
"loss": 0.0228,
"step": 4780
},
{
"epoch": 0.3961624348689108,
"grad_norm": 0.04611439257860184,
"learning_rate": 7.068023763579453e-06,
"loss": 0.0304,
"step": 4790
},
{
"epoch": 0.39698949631957653,
"grad_norm": 0.050482239574193954,
"learning_rate": 7.055564870137733e-06,
"loss": 0.0241,
"step": 4800
},
{
"epoch": 0.39781655777024233,
"grad_norm": 0.050899263471364975,
"learning_rate": 7.043090598937463e-06,
"loss": 0.0246,
"step": 4810
},
{
"epoch": 0.39864361922090813,
"grad_norm": 0.05052196979522705,
"learning_rate": 7.030601043299138e-06,
"loss": 0.0238,
"step": 4820
},
{
"epoch": 0.3994706806715739,
"grad_norm": 0.04977920651435852,
"learning_rate": 7.018096296657595e-06,
"loss": 0.0234,
"step": 4830
},
{
"epoch": 0.4002977421222397,
"grad_norm": 0.0429433137178421,
"learning_rate": 7.005576452561314e-06,
"loss": 0.0249,
"step": 4840
},
{
"epoch": 0.4011248035729055,
"grad_norm": 0.04633225128054619,
"learning_rate": 6.993041604671727e-06,
"loss": 0.0221,
"step": 4850
},
{
"epoch": 0.4019518650235713,
"grad_norm": 0.044517192989587784,
"learning_rate": 6.980491846762503e-06,
"loss": 0.023,
"step": 4860
},
{
"epoch": 0.402778926474237,
"grad_norm": 0.04668491706252098,
"learning_rate": 6.967927272718855e-06,
"loss": 0.023,
"step": 4870
},
{
"epoch": 0.4036059879249028,
"grad_norm": 0.13357175886631012,
"learning_rate": 6.955347976536841e-06,
"loss": 0.0218,
"step": 4880
},
{
"epoch": 0.4044330493755686,
"grad_norm": 0.04721111059188843,
"learning_rate": 6.942754052322645e-06,
"loss": 0.0222,
"step": 4890
},
{
"epoch": 0.40526011082623437,
"grad_norm": 0.07329542189836502,
"learning_rate": 6.9301455942918934e-06,
"loss": 0.0219,
"step": 4900
},
{
"epoch": 0.40608717227690017,
"grad_norm": 0.04098494350910187,
"learning_rate": 6.9175226967689395e-06,
"loss": 0.0224,
"step": 4910
},
{
"epoch": 0.40691423372756597,
"grad_norm": 0.0693870559334755,
"learning_rate": 6.904885454186155e-06,
"loss": 0.0239,
"step": 4920
},
{
"epoch": 0.40774129517823177,
"grad_norm": 0.04788215458393097,
"learning_rate": 6.89223396108323e-06,
"loss": 0.0278,
"step": 4930
},
{
"epoch": 0.4085683566288975,
"grad_norm": 0.041839174926280975,
"learning_rate": 6.879568312106462e-06,
"loss": 0.0215,
"step": 4940
},
{
"epoch": 0.4093954180795633,
"grad_norm": 0.04695757478475571,
"learning_rate": 6.866888602008053e-06,
"loss": 0.0235,
"step": 4950
},
{
"epoch": 0.4102224795302291,
"grad_norm": 0.05025403946638107,
"learning_rate": 6.854194925645392e-06,
"loss": 0.023,
"step": 4960
},
{
"epoch": 0.41104954098089486,
"grad_norm": 0.05418792739510536,
"learning_rate": 6.841487377980353e-06,
"loss": 0.0247,
"step": 4970
},
{
"epoch": 0.41187660243156066,
"grad_norm": 0.05611952021718025,
"learning_rate": 6.82876605407858e-06,
"loss": 0.023,
"step": 4980
},
{
"epoch": 0.41270366388222646,
"grad_norm": 0.04246920347213745,
"learning_rate": 6.816031049108777e-06,
"loss": 0.024,
"step": 4990
},
{
"epoch": 0.41353072533289226,
"grad_norm": 0.044995930045843124,
"learning_rate": 6.803282458342e-06,
"loss": 0.0215,
"step": 5000
},
{
"epoch": 0.41353072533289226,
"eval_loss": 0.024215074256062508,
"eval_runtime": 1219.5377,
"eval_samples_per_second": 4.919,
"eval_steps_per_second": 0.307,
"step": 5000
},
{
"epoch": 0.414357786783558,
"grad_norm": 0.05199587345123291,
"learning_rate": 6.790520377150939e-06,
"loss": 0.0233,
"step": 5010
},
{
"epoch": 0.4151848482342238,
"grad_norm": 0.04450158774852753,
"learning_rate": 6.777744901009204e-06,
"loss": 0.023,
"step": 5020
},
{
"epoch": 0.4160119096848896,
"grad_norm": 0.0536041297018528,
"learning_rate": 6.764956125490616e-06,
"loss": 0.022,
"step": 5030
},
{
"epoch": 0.41683897113555535,
"grad_norm": 0.04742833226919174,
"learning_rate": 6.752154146268491e-06,
"loss": 0.0267,
"step": 5040
},
{
"epoch": 0.41766603258622115,
"grad_norm": 0.05334756523370743,
"learning_rate": 6.739339059114916e-06,
"loss": 0.0232,
"step": 5050
},
{
"epoch": 0.41849309403688695,
"grad_norm": 0.0501900352537632,
"learning_rate": 6.726510959900046e-06,
"loss": 0.0248,
"step": 5060
},
{
"epoch": 0.41932015548755275,
"grad_norm": 0.04328269138932228,
"learning_rate": 6.713669944591375e-06,
"loss": 0.0229,
"step": 5070
},
{
"epoch": 0.4201472169382185,
"grad_norm": 0.04845112934708595,
"learning_rate": 6.700816109253023e-06,
"loss": 0.0242,
"step": 5080
},
{
"epoch": 0.4209742783888843,
"grad_norm": 0.051792118698358536,
"learning_rate": 6.6879495500450184e-06,
"loss": 0.0224,
"step": 5090
},
{
"epoch": 0.4218013398395501,
"grad_norm": 0.03820064663887024,
"learning_rate": 6.675070363222581e-06,
"loss": 0.0225,
"step": 5100
},
{
"epoch": 0.42262840129021584,
"grad_norm": 0.04609294980764389,
"learning_rate": 6.662178645135392e-06,
"loss": 0.0222,
"step": 5110
},
{
"epoch": 0.42345546274088164,
"grad_norm": 0.043115533888339996,
"learning_rate": 6.649274492226882e-06,
"loss": 0.0229,
"step": 5120
},
{
"epoch": 0.42428252419154744,
"grad_norm": 0.04883312061429024,
"learning_rate": 6.636358001033508e-06,
"loss": 0.0228,
"step": 5130
},
{
"epoch": 0.42510958564221324,
"grad_norm": 0.062484171241521835,
"learning_rate": 6.623429268184027e-06,
"loss": 0.0237,
"step": 5140
},
{
"epoch": 0.425936647092879,
"grad_norm": 0.0440596267580986,
"learning_rate": 6.6104883903987815e-06,
"loss": 0.0264,
"step": 5150
},
{
"epoch": 0.4267637085435448,
"grad_norm": 0.04892463609576225,
"learning_rate": 6.5975354644889665e-06,
"loss": 0.0217,
"step": 5160
},
{
"epoch": 0.4275907699942106,
"grad_norm": 0.04017140343785286,
"learning_rate": 6.5845705873559094e-06,
"loss": 0.0225,
"step": 5170
},
{
"epoch": 0.42841783144487633,
"grad_norm": 0.04880579188466072,
"learning_rate": 6.571593855990348e-06,
"loss": 0.023,
"step": 5180
},
{
"epoch": 0.42924489289554213,
"grad_norm": 0.06134543567895889,
"learning_rate": 6.5586053674717e-06,
"loss": 0.0227,
"step": 5190
},
{
"epoch": 0.43007195434620793,
"grad_norm": 0.03942278400063515,
"learning_rate": 6.545605218967341e-06,
"loss": 0.0222,
"step": 5200
},
{
"epoch": 0.43089901579687373,
"grad_norm": 0.04633478447794914,
"learning_rate": 6.5325935077318705e-06,
"loss": 0.0226,
"step": 5210
},
{
"epoch": 0.4317260772475395,
"grad_norm": 0.06766749918460846,
"learning_rate": 6.519570331106395e-06,
"loss": 0.0226,
"step": 5220
},
{
"epoch": 0.4325531386982053,
"grad_norm": 0.04740046337246895,
"learning_rate": 6.506535786517789e-06,
"loss": 0.0261,
"step": 5230
},
{
"epoch": 0.4333802001488711,
"grad_norm": 0.05168073996901512,
"learning_rate": 6.493489971477977e-06,
"loss": 0.0242,
"step": 5240
},
{
"epoch": 0.4342072615995368,
"grad_norm": 0.05117257684469223,
"learning_rate": 6.480432983583194e-06,
"loss": 0.0276,
"step": 5250
},
{
"epoch": 0.4350343230502026,
"grad_norm": 0.05560829117894173,
"learning_rate": 6.467364920513257e-06,
"loss": 0.0235,
"step": 5260
},
{
"epoch": 0.4358613845008684,
"grad_norm": 0.04257509857416153,
"learning_rate": 6.454285880030844e-06,
"loss": 0.022,
"step": 5270
},
{
"epoch": 0.4366884459515342,
"grad_norm": 0.047841571271419525,
"learning_rate": 6.441195959980749e-06,
"loss": 0.0235,
"step": 5280
},
{
"epoch": 0.43751550740219997,
"grad_norm": 0.04220358282327652,
"learning_rate": 6.428095258289162e-06,
"loss": 0.0227,
"step": 5290
},
{
"epoch": 0.43834256885286577,
"grad_norm": 0.04904833808541298,
"learning_rate": 6.414983872962924e-06,
"loss": 0.023,
"step": 5300
},
{
"epoch": 0.43916963030353157,
"grad_norm": 0.041855499148368835,
"learning_rate": 6.401861902088809e-06,
"loss": 0.0247,
"step": 5310
},
{
"epoch": 0.4399966917541973,
"grad_norm": 0.046882931143045425,
"learning_rate": 6.388729443832774e-06,
"loss": 0.0218,
"step": 5320
},
{
"epoch": 0.4408237532048631,
"grad_norm": 0.06054188311100006,
"learning_rate": 6.375586596439237e-06,
"loss": 0.0239,
"step": 5330
},
{
"epoch": 0.4416508146555289,
"grad_norm": 0.04277319461107254,
"learning_rate": 6.362433458230337e-06,
"loss": 0.0232,
"step": 5340
},
{
"epoch": 0.4424778761061947,
"grad_norm": 0.050606515258550644,
"learning_rate": 6.349270127605198e-06,
"loss": 0.0224,
"step": 5350
},
{
"epoch": 0.44330493755686046,
"grad_norm": 0.050200313329696655,
"learning_rate": 6.336096703039196e-06,
"loss": 0.0225,
"step": 5360
},
{
"epoch": 0.44413199900752626,
"grad_norm": 0.0431785061955452,
"learning_rate": 6.322913283083214e-06,
"loss": 0.0223,
"step": 5370
},
{
"epoch": 0.44495906045819206,
"grad_norm": 0.04577941447496414,
"learning_rate": 6.309719966362922e-06,
"loss": 0.0219,
"step": 5380
},
{
"epoch": 0.4457861219088578,
"grad_norm": 0.04745447263121605,
"learning_rate": 6.296516851578016e-06,
"loss": 0.0239,
"step": 5390
},
{
"epoch": 0.4466131833595236,
"grad_norm": 0.0505000539124012,
"learning_rate": 6.283304037501501e-06,
"loss": 0.0238,
"step": 5400
},
{
"epoch": 0.4474402448101894,
"grad_norm": 0.0681275799870491,
"learning_rate": 6.270081622978934e-06,
"loss": 0.0238,
"step": 5410
},
{
"epoch": 0.4482673062608552,
"grad_norm": 0.05186863988637924,
"learning_rate": 6.256849706927703e-06,
"loss": 0.0225,
"step": 5420
},
{
"epoch": 0.44909436771152095,
"grad_norm": 0.04716340824961662,
"learning_rate": 6.2436083883362706e-06,
"loss": 0.022,
"step": 5430
},
{
"epoch": 0.44992142916218675,
"grad_norm": 0.042241595685482025,
"learning_rate": 6.230357766263442e-06,
"loss": 0.0216,
"step": 5440
},
{
"epoch": 0.45074849061285255,
"grad_norm": 0.04572228342294693,
"learning_rate": 6.217097939837623e-06,
"loss": 0.0219,
"step": 5450
},
{
"epoch": 0.4515755520635183,
"grad_norm": 0.05299137532711029,
"learning_rate": 6.203829008256075e-06,
"loss": 0.0222,
"step": 5460
},
{
"epoch": 0.4524026135141841,
"grad_norm": 0.04044192656874657,
"learning_rate": 6.190551070784179e-06,
"loss": 0.0233,
"step": 5470
},
{
"epoch": 0.4532296749648499,
"grad_norm": 0.04427442327141762,
"learning_rate": 6.177264226754685e-06,
"loss": 0.0239,
"step": 5480
},
{
"epoch": 0.4540567364155157,
"grad_norm": 0.0423441156744957,
"learning_rate": 6.163968575566979e-06,
"loss": 0.0243,
"step": 5490
},
{
"epoch": 0.45488379786618144,
"grad_norm": 0.052600838243961334,
"learning_rate": 6.150664216686329e-06,
"loss": 0.0231,
"step": 5500
},
{
"epoch": 0.45571085931684724,
"grad_norm": 0.04956282302737236,
"learning_rate": 6.137351249643147e-06,
"loss": 0.0238,
"step": 5510
},
{
"epoch": 0.45653792076751304,
"grad_norm": 0.037822380661964417,
"learning_rate": 6.124029774032242e-06,
"loss": 0.0224,
"step": 5520
},
{
"epoch": 0.4573649822181788,
"grad_norm": 0.04192859306931496,
"learning_rate": 6.110699889512077e-06,
"loss": 0.0273,
"step": 5530
},
{
"epoch": 0.4581920436688446,
"grad_norm": 0.04586039483547211,
"learning_rate": 6.0973616958040265e-06,
"loss": 0.0223,
"step": 5540
},
{
"epoch": 0.4590191051195104,
"grad_norm": 0.049864090979099274,
"learning_rate": 6.084015292691617e-06,
"loss": 0.0237,
"step": 5550
},
{
"epoch": 0.4598461665701762,
"grad_norm": 0.061950068920850754,
"learning_rate": 6.070660780019797e-06,
"loss": 0.0228,
"step": 5560
},
{
"epoch": 0.46067322802084193,
"grad_norm": 0.04114188626408577,
"learning_rate": 6.057298257694182e-06,
"loss": 0.0233,
"step": 5570
},
{
"epoch": 0.46150028947150773,
"grad_norm": 0.048220761120319366,
"learning_rate": 6.043927825680305e-06,
"loss": 0.0285,
"step": 5580
},
{
"epoch": 0.46232735092217353,
"grad_norm": 0.047901052981615067,
"learning_rate": 6.030549584002876e-06,
"loss": 0.0247,
"step": 5590
},
{
"epoch": 0.4631544123728393,
"grad_norm": 0.04301442205905914,
"learning_rate": 6.017163632745025e-06,
"loss": 0.0222,
"step": 5600
},
{
"epoch": 0.4639814738235051,
"grad_norm": 0.059639185667037964,
"learning_rate": 6.003770072047559e-06,
"loss": 0.0224,
"step": 5610
},
{
"epoch": 0.4648085352741709,
"grad_norm": 0.05088592320680618,
"learning_rate": 5.990369002108215e-06,
"loss": 0.0255,
"step": 5620
},
{
"epoch": 0.4656355967248367,
"grad_norm": 0.04898575693368912,
"learning_rate": 5.976960523180904e-06,
"loss": 0.0221,
"step": 5630
},
{
"epoch": 0.4664626581755024,
"grad_norm": 0.04929777607321739,
"learning_rate": 5.963544735574961e-06,
"loss": 0.023,
"step": 5640
},
{
"epoch": 0.4672897196261682,
"grad_norm": 0.04379523918032646,
"learning_rate": 5.9501217396544034e-06,
"loss": 0.023,
"step": 5650
},
{
"epoch": 0.468116781076834,
"grad_norm": 0.049279894679784775,
"learning_rate": 5.93669163583717e-06,
"loss": 0.0232,
"step": 5660
},
{
"epoch": 0.46894384252749977,
"grad_norm": 0.044354990124702454,
"learning_rate": 5.923254524594376e-06,
"loss": 0.0229,
"step": 5670
},
{
"epoch": 0.46977090397816557,
"grad_norm": 0.05658494308590889,
"learning_rate": 5.9098105064495606e-06,
"loss": 0.0221,
"step": 5680
},
{
"epoch": 0.47059796542883137,
"grad_norm": 0.041339486837387085,
"learning_rate": 5.896359681977928e-06,
"loss": 0.0226,
"step": 5690
},
{
"epoch": 0.47142502687949717,
"grad_norm": 0.052800171077251434,
"learning_rate": 5.8829021518056095e-06,
"loss": 0.0237,
"step": 5700
},
{
"epoch": 0.4722520883301629,
"grad_norm": 0.04378625750541687,
"learning_rate": 5.869438016608893e-06,
"loss": 0.0241,
"step": 5710
},
{
"epoch": 0.4730791497808287,
"grad_norm": 0.08634616434574127,
"learning_rate": 5.855967377113487e-06,
"loss": 0.0263,
"step": 5720
},
{
"epoch": 0.4739062112314945,
"grad_norm": 0.0738649070262909,
"learning_rate": 5.842490334093752e-06,
"loss": 0.0231,
"step": 5730
},
{
"epoch": 0.4747332726821603,
"grad_norm": 0.04509838670492172,
"learning_rate": 5.829006988371959e-06,
"loss": 0.0231,
"step": 5740
},
{
"epoch": 0.47556033413282606,
"grad_norm": 0.044409893453121185,
"learning_rate": 5.815517440817526e-06,
"loss": 0.0222,
"step": 5750
},
{
"epoch": 0.47638739558349186,
"grad_norm": 0.04454704746603966,
"learning_rate": 5.8020217923462696e-06,
"loss": 0.022,
"step": 5760
},
{
"epoch": 0.47721445703415766,
"grad_norm": 0.04391258582472801,
"learning_rate": 5.788520143919647e-06,
"loss": 0.0223,
"step": 5770
},
{
"epoch": 0.4780415184848234,
"grad_norm": 0.039742667227983475,
"learning_rate": 5.775012596543999e-06,
"loss": 0.0236,
"step": 5780
},
{
"epoch": 0.4788685799354892,
"grad_norm": 0.04627054184675217,
"learning_rate": 5.761499251269798e-06,
"loss": 0.0225,
"step": 5790
},
{
"epoch": 0.479695641386155,
"grad_norm": 0.03860992565751076,
"learning_rate": 5.7479802091908945e-06,
"loss": 0.0268,
"step": 5800
},
{
"epoch": 0.4805227028368208,
"grad_norm": 0.04734113812446594,
"learning_rate": 5.734455571443751e-06,
"loss": 0.0233,
"step": 5810
},
{
"epoch": 0.48134976428748655,
"grad_norm": 0.07089436799287796,
"learning_rate": 5.720925439206695e-06,
"loss": 0.0267,
"step": 5820
},
{
"epoch": 0.48217682573815235,
"grad_norm": 0.04937206953763962,
"learning_rate": 5.707389913699157e-06,
"loss": 0.0225,
"step": 5830
},
{
"epoch": 0.48300388718881815,
"grad_norm": 0.04481448978185654,
"learning_rate": 5.693849096180917e-06,
"loss": 0.0221,
"step": 5840
},
{
"epoch": 0.4838309486394839,
"grad_norm": 0.051826462149620056,
"learning_rate": 5.680303087951339e-06,
"loss": 0.0237,
"step": 5850
},
{
"epoch": 0.4846580100901497,
"grad_norm": 0.13001324236392975,
"learning_rate": 5.666751990348627e-06,
"loss": 0.0223,
"step": 5860
},
{
"epoch": 0.4854850715408155,
"grad_norm": 0.04917273670434952,
"learning_rate": 5.653195904749054e-06,
"loss": 0.0219,
"step": 5870
},
{
"epoch": 0.4863121329914813,
"grad_norm": 0.04470530524849892,
"learning_rate": 5.639634932566208e-06,
"loss": 0.0307,
"step": 5880
},
{
"epoch": 0.48713919444214704,
"grad_norm": 0.04076725244522095,
"learning_rate": 5.626069175250236e-06,
"loss": 0.0223,
"step": 5890
},
{
"epoch": 0.48796625589281284,
"grad_norm": 0.050211817026138306,
"learning_rate": 5.61249873428708e-06,
"loss": 0.0227,
"step": 5900
},
{
"epoch": 0.48879331734347864,
"grad_norm": 0.03654312714934349,
"learning_rate": 5.5989237111977255e-06,
"loss": 0.0216,
"step": 5910
},
{
"epoch": 0.4896203787941444,
"grad_norm": 0.050298597663640976,
"learning_rate": 5.58534420753743e-06,
"loss": 0.0217,
"step": 5920
},
{
"epoch": 0.4904474402448102,
"grad_norm": 0.04905930534005165,
"learning_rate": 5.571760324894977e-06,
"loss": 0.0227,
"step": 5930
},
{
"epoch": 0.491274501695476,
"grad_norm": 0.045814525336027145,
"learning_rate": 5.558172164891903e-06,
"loss": 0.0225,
"step": 5940
},
{
"epoch": 0.4921015631461418,
"grad_norm": 0.06343957781791687,
"learning_rate": 5.544579829181751e-06,
"loss": 0.023,
"step": 5950
},
{
"epoch": 0.49292862459680753,
"grad_norm": 0.042192984372377396,
"learning_rate": 5.530983419449296e-06,
"loss": 0.021,
"step": 5960
},
{
"epoch": 0.49375568604747333,
"grad_norm": 0.04143495857715607,
"learning_rate": 5.517383037409794e-06,
"loss": 0.0253,
"step": 5970
},
{
"epoch": 0.49458274749813913,
"grad_norm": 0.04273596778512001,
"learning_rate": 5.503778784808218e-06,
"loss": 0.0226,
"step": 5980
},
{
"epoch": 0.4954098089488049,
"grad_norm": 0.047943755984306335,
"learning_rate": 5.490170763418496e-06,
"loss": 0.022,
"step": 5990
},
{
"epoch": 0.4962368703994707,
"grad_norm": 0.045045025646686554,
"learning_rate": 5.476559075042751e-06,
"loss": 0.0216,
"step": 6000
},
{
"epoch": 0.4962368703994707,
"eval_loss": 0.02347772754728794,
"eval_runtime": 1220.4355,
"eval_samples_per_second": 4.915,
"eval_steps_per_second": 0.307,
"step": 6000
},
{
"epoch": 0.4970639318501365,
"grad_norm": 0.04491131007671356,
"learning_rate": 5.4629438215105375e-06,
"loss": 0.0228,
"step": 6010
},
{
"epoch": 0.4978909933008023,
"grad_norm": 0.053035978227853775,
"learning_rate": 5.449325104678085e-06,
"loss": 0.0233,
"step": 6020
},
{
"epoch": 0.498718054751468,
"grad_norm": 0.04346757382154465,
"learning_rate": 5.4357030264275256e-06,
"loss": 0.0218,
"step": 6030
},
{
"epoch": 0.4995451162021338,
"grad_norm": 0.03982304036617279,
"learning_rate": 5.422077688666145e-06,
"loss": 0.0216,
"step": 6040
},
{
"epoch": 0.5003721776527996,
"grad_norm": 0.0594533309340477,
"learning_rate": 5.4084491933256086e-06,
"loss": 0.0228,
"step": 6050
},
{
"epoch": 0.5011992391034654,
"grad_norm": 0.03943202272057533,
"learning_rate": 5.394817642361206e-06,
"loss": 0.0231,
"step": 6060
},
{
"epoch": 0.5020263005541312,
"grad_norm": 0.03965817019343376,
"learning_rate": 5.381183137751087e-06,
"loss": 0.0234,
"step": 6070
},
{
"epoch": 0.5028533620047969,
"grad_norm": 0.05061696469783783,
"learning_rate": 5.367545781495495e-06,
"loss": 0.0252,
"step": 6080
},
{
"epoch": 0.5036804234554627,
"grad_norm": 0.0856064036488533,
"learning_rate": 5.353905675616008e-06,
"loss": 0.0228,
"step": 6090
},
{
"epoch": 0.5045074849061285,
"grad_norm": 0.05830984562635422,
"learning_rate": 5.340262922154773e-06,
"loss": 0.0239,
"step": 6100
},
{
"epoch": 0.5053345463567943,
"grad_norm": 0.042031850665807724,
"learning_rate": 5.326617623173747e-06,
"loss": 0.0218,
"step": 6110
},
{
"epoch": 0.5061616078074601,
"grad_norm": 0.04255002364516258,
"learning_rate": 5.312969880753928e-06,
"loss": 0.0257,
"step": 6120
},
{
"epoch": 0.5069886692581259,
"grad_norm": 0.046407558023929596,
"learning_rate": 5.299319796994591e-06,
"loss": 0.0214,
"step": 6130
},
{
"epoch": 0.5078157307087917,
"grad_norm": 0.044977955520153046,
"learning_rate": 5.285667474012529e-06,
"loss": 0.0243,
"step": 6140
},
{
"epoch": 0.5086427921594574,
"grad_norm": 0.041169311851263046,
"learning_rate": 5.272013013941289e-06,
"loss": 0.0221,
"step": 6150
},
{
"epoch": 0.5094698536101232,
"grad_norm": 0.04349064826965332,
"learning_rate": 5.258356518930403e-06,
"loss": 0.0222,
"step": 6160
},
{
"epoch": 0.510296915060789,
"grad_norm": 0.051616426557302475,
"learning_rate": 5.244698091144624e-06,
"loss": 0.0226,
"step": 6170
},
{
"epoch": 0.5111239765114548,
"grad_norm": 0.04476653039455414,
"learning_rate": 5.2310378327631695e-06,
"loss": 0.0225,
"step": 6180
},
{
"epoch": 0.5119510379621206,
"grad_norm": 0.04472777247428894,
"learning_rate": 5.21737584597895e-06,
"loss": 0.0231,
"step": 6190
},
{
"epoch": 0.5127780994127864,
"grad_norm": 0.05034750699996948,
"learning_rate": 5.203712232997801e-06,
"loss": 0.0215,
"step": 6200
},
{
"epoch": 0.5136051608634522,
"grad_norm": 0.04265570640563965,
"learning_rate": 5.190047096037734e-06,
"loss": 0.0246,
"step": 6210
},
{
"epoch": 0.5144322223141179,
"grad_norm": 0.0414557047188282,
"learning_rate": 5.176380537328149e-06,
"loss": 0.0224,
"step": 6220
},
{
"epoch": 0.5152592837647837,
"grad_norm": 0.047177575528621674,
"learning_rate": 5.1627126591090945e-06,
"loss": 0.0248,
"step": 6230
},
{
"epoch": 0.5160863452154495,
"grad_norm": 0.03995126485824585,
"learning_rate": 5.149043563630481e-06,
"loss": 0.0222,
"step": 6240
},
{
"epoch": 0.5169134066661153,
"grad_norm": 0.038500089198350906,
"learning_rate": 5.135373353151333e-06,
"loss": 0.0226,
"step": 6250
},
{
"epoch": 0.5177404681167811,
"grad_norm": 0.04477696493268013,
"learning_rate": 5.1217021299390055e-06,
"loss": 0.0252,
"step": 6260
},
{
"epoch": 0.5185675295674469,
"grad_norm": 0.04252477362751961,
"learning_rate": 5.108029996268442e-06,
"loss": 0.0208,
"step": 6270
},
{
"epoch": 0.5193945910181127,
"grad_norm": 0.04710827022790909,
"learning_rate": 5.09435705442139e-06,
"loss": 0.0208,
"step": 6280
},
{
"epoch": 0.5202216524687784,
"grad_norm": 0.04434856027364731,
"learning_rate": 5.080683406685644e-06,
"loss": 0.0223,
"step": 6290
},
{
"epoch": 0.5210487139194442,
"grad_norm": 0.04365675151348114,
"learning_rate": 5.067009155354281e-06,
"loss": 0.0219,
"step": 6300
},
{
"epoch": 0.52187577537011,
"grad_norm": 0.04527043551206589,
"learning_rate": 5.053334402724891e-06,
"loss": 0.0216,
"step": 6310
},
{
"epoch": 0.5227028368207758,
"grad_norm": 0.04446522891521454,
"learning_rate": 5.039659251098818e-06,
"loss": 0.0325,
"step": 6320
},
{
"epoch": 0.5235298982714416,
"grad_norm": 0.03923187032341957,
"learning_rate": 5.025983802780387e-06,
"loss": 0.0225,
"step": 6330
},
{
"epoch": 0.5243569597221074,
"grad_norm": 0.0494740828871727,
"learning_rate": 5.012308160076143e-06,
"loss": 0.0236,
"step": 6340
},
{
"epoch": 0.5251840211727732,
"grad_norm": 0.048305340111255646,
"learning_rate": 4.998632425294089e-06,
"loss": 0.0219,
"step": 6350
},
{
"epoch": 0.5260110826234389,
"grad_norm": 0.05675299093127251,
"learning_rate": 4.984956700742914e-06,
"loss": 0.023,
"step": 6360
},
{
"epoch": 0.5268381440741047,
"grad_norm": 0.05156668648123741,
"learning_rate": 4.9712810887312285e-06,
"loss": 0.021,
"step": 6370
},
{
"epoch": 0.5276652055247705,
"grad_norm": 0.0496770441532135,
"learning_rate": 4.957605691566806e-06,
"loss": 0.0226,
"step": 6380
},
{
"epoch": 0.5284922669754363,
"grad_norm": 0.044166844338178635,
"learning_rate": 4.943930611555807e-06,
"loss": 0.0285,
"step": 6390
},
{
"epoch": 0.5293193284261021,
"grad_norm": 0.0438714399933815,
"learning_rate": 4.930255951002023e-06,
"loss": 0.0235,
"step": 6400
},
{
"epoch": 0.5301463898767679,
"grad_norm": 0.049872253090143204,
"learning_rate": 4.91658181220611e-06,
"loss": 0.0213,
"step": 6410
},
{
"epoch": 0.5309734513274337,
"grad_norm": 0.05873720347881317,
"learning_rate": 4.902908297464815e-06,
"loss": 0.0214,
"step": 6420
},
{
"epoch": 0.5318005127780994,
"grad_norm": 0.04734335094690323,
"learning_rate": 4.8892355090702195e-06,
"loss": 0.0219,
"step": 6430
},
{
"epoch": 0.5326275742287652,
"grad_norm": 0.04171719029545784,
"learning_rate": 4.875563549308971e-06,
"loss": 0.0217,
"step": 6440
},
{
"epoch": 0.533454635679431,
"grad_norm": 0.04020686820149422,
"learning_rate": 4.861892520461514e-06,
"loss": 0.0229,
"step": 6450
},
{
"epoch": 0.5342816971300968,
"grad_norm": 0.04311240091919899,
"learning_rate": 4.848222524801341e-06,
"loss": 0.0232,
"step": 6460
},
{
"epoch": 0.5351087585807626,
"grad_norm": 0.05833645164966583,
"learning_rate": 4.834553664594197e-06,
"loss": 0.022,
"step": 6470
},
{
"epoch": 0.5359358200314284,
"grad_norm": 0.0407719612121582,
"learning_rate": 4.820886042097349e-06,
"loss": 0.0233,
"step": 6480
},
{
"epoch": 0.5367628814820942,
"grad_norm": 0.03404640033841133,
"learning_rate": 4.807219759558794e-06,
"loss": 0.0222,
"step": 6490
},
{
"epoch": 0.5375899429327599,
"grad_norm": 0.04761282354593277,
"learning_rate": 4.7935549192165116e-06,
"loss": 0.0224,
"step": 6500
},
{
"epoch": 0.5384170043834257,
"grad_norm": 0.04644225910305977,
"learning_rate": 4.779891623297688e-06,
"loss": 0.0231,
"step": 6510
},
{
"epoch": 0.5392440658340915,
"grad_norm": 0.04503655433654785,
"learning_rate": 4.7662299740179544e-06,
"loss": 0.0226,
"step": 6520
},
{
"epoch": 0.5400711272847573,
"grad_norm": 0.04182233288884163,
"learning_rate": 4.752570073580632e-06,
"loss": 0.0207,
"step": 6530
},
{
"epoch": 0.5408981887354231,
"grad_norm": 0.04567556828260422,
"learning_rate": 4.738912024175945e-06,
"loss": 0.0218,
"step": 6540
},
{
"epoch": 0.5417252501860889,
"grad_norm": 0.04384360834956169,
"learning_rate": 4.725255927980283e-06,
"loss": 0.0214,
"step": 6550
},
{
"epoch": 0.5425523116367547,
"grad_norm": 0.0403946228325367,
"learning_rate": 4.711601887155417e-06,
"loss": 0.0264,
"step": 6560
},
{
"epoch": 0.5433793730874203,
"grad_norm": 0.038516897708177567,
"learning_rate": 4.6979500038477425e-06,
"loss": 0.0221,
"step": 6570
},
{
"epoch": 0.5442064345380861,
"grad_norm": 0.0414847806096077,
"learning_rate": 4.684300380187516e-06,
"loss": 0.0204,
"step": 6580
},
{
"epoch": 0.545033495988752,
"grad_norm": 0.04257076978683472,
"learning_rate": 4.670653118288085e-06,
"loss": 0.0211,
"step": 6590
},
{
"epoch": 0.5458605574394177,
"grad_norm": 0.04257350042462349,
"learning_rate": 4.657008320245136e-06,
"loss": 0.0218,
"step": 6600
},
{
"epoch": 0.5466876188900835,
"grad_norm": 0.04577566310763359,
"learning_rate": 4.643366088135918e-06,
"loss": 0.0221,
"step": 6610
},
{
"epoch": 0.5475146803407493,
"grad_norm": 0.11741481721401215,
"learning_rate": 4.629726524018486e-06,
"loss": 0.0222,
"step": 6620
},
{
"epoch": 0.5483417417914151,
"grad_norm": 0.04335429146885872,
"learning_rate": 4.616089729930932e-06,
"loss": 0.0252,
"step": 6630
},
{
"epoch": 0.5491688032420808,
"grad_norm": 0.04533402994275093,
"learning_rate": 4.602455807890634e-06,
"loss": 0.0218,
"step": 6640
},
{
"epoch": 0.5499958646927466,
"grad_norm": 0.042610831558704376,
"learning_rate": 4.588824859893473e-06,
"loss": 0.022,
"step": 6650
},
{
"epoch": 0.5508229261434124,
"grad_norm": 0.03981228917837143,
"learning_rate": 4.57519698791309e-06,
"loss": 0.0227,
"step": 6660
},
{
"epoch": 0.5516499875940782,
"grad_norm": 0.0377313606441021,
"learning_rate": 4.561572293900109e-06,
"loss": 0.0226,
"step": 6670
},
{
"epoch": 0.552477049044744,
"grad_norm": 0.08314741402864456,
"learning_rate": 4.547950879781382e-06,
"loss": 0.0229,
"step": 6680
},
{
"epoch": 0.5533041104954098,
"grad_norm": 0.04389451816678047,
"learning_rate": 4.534332847459225e-06,
"loss": 0.0212,
"step": 6690
},
{
"epoch": 0.5541311719460756,
"grad_norm": 0.04181825742125511,
"learning_rate": 4.520718298810649e-06,
"loss": 0.0203,
"step": 6700
},
{
"epoch": 0.5549582333967413,
"grad_norm": 0.042209409177303314,
"learning_rate": 4.507107335686611e-06,
"loss": 0.0234,
"step": 6710
},
{
"epoch": 0.5557852948474071,
"grad_norm": 0.03632921725511551,
"learning_rate": 4.49350005991124e-06,
"loss": 0.0213,
"step": 6720
},
{
"epoch": 0.5566123562980729,
"grad_norm": 0.03909287229180336,
"learning_rate": 4.47989657328108e-06,
"loss": 0.0259,
"step": 6730
},
{
"epoch": 0.5574394177487387,
"grad_norm": 0.04961128160357475,
"learning_rate": 4.466296977564331e-06,
"loss": 0.0229,
"step": 6740
},
{
"epoch": 0.5582664791994045,
"grad_norm": 0.04496648535132408,
"learning_rate": 4.452701374500079e-06,
"loss": 0.0207,
"step": 6750
},
{
"epoch": 0.5590935406500703,
"grad_norm": 0.045161984860897064,
"learning_rate": 4.43910986579755e-06,
"loss": 0.0233,
"step": 6760
},
{
"epoch": 0.5599206021007361,
"grad_norm": 0.047101061791181564,
"learning_rate": 4.42552255313533e-06,
"loss": 0.0327,
"step": 6770
},
{
"epoch": 0.5607476635514018,
"grad_norm": 0.044754352420568466,
"learning_rate": 4.411939538160621e-06,
"loss": 0.0221,
"step": 6780
},
{
"epoch": 0.5615747250020676,
"grad_norm": 0.04385341331362724,
"learning_rate": 4.398360922488474e-06,
"loss": 0.0266,
"step": 6790
},
{
"epoch": 0.5624017864527334,
"grad_norm": 0.05165982246398926,
"learning_rate": 4.384786807701024e-06,
"loss": 0.0218,
"step": 6800
},
{
"epoch": 0.5632288479033992,
"grad_norm": 0.03928116336464882,
"learning_rate": 4.371217295346738e-06,
"loss": 0.022,
"step": 6810
},
{
"epoch": 0.564055909354065,
"grad_norm": 0.038528576493263245,
"learning_rate": 4.357652486939649e-06,
"loss": 0.0218,
"step": 6820
},
{
"epoch": 0.5648829708047308,
"grad_norm": 0.04096828028559685,
"learning_rate": 4.3440924839586045e-06,
"loss": 0.0221,
"step": 6830
},
{
"epoch": 0.5657100322553966,
"grad_norm": 0.04172588139772415,
"learning_rate": 4.3305373878465e-06,
"loss": 0.0214,
"step": 6840
},
{
"epoch": 0.5665370937060623,
"grad_norm": 0.04250342398881912,
"learning_rate": 4.316987300009521e-06,
"loss": 0.0216,
"step": 6850
},
{
"epoch": 0.5673641551567281,
"grad_norm": 0.04389472305774689,
"learning_rate": 4.303442321816388e-06,
"loss": 0.0225,
"step": 6860
},
{
"epoch": 0.5681912166073939,
"grad_norm": 0.04604129120707512,
"learning_rate": 4.2899025545975935e-06,
"loss": 0.025,
"step": 6870
},
{
"epoch": 0.5690182780580597,
"grad_norm": 0.04432059824466705,
"learning_rate": 4.276368099644649e-06,
"loss": 0.0223,
"step": 6880
},
{
"epoch": 0.5698453395087255,
"grad_norm": 0.04254218190908432,
"learning_rate": 4.262839058209325e-06,
"loss": 0.0254,
"step": 6890
},
{
"epoch": 0.5706724009593913,
"grad_norm": 0.04665306955575943,
"learning_rate": 4.249315531502892e-06,
"loss": 0.0233,
"step": 6900
},
{
"epoch": 0.5714994624100571,
"grad_norm": 0.06424245983362198,
"learning_rate": 4.235797620695365e-06,
"loss": 0.0223,
"step": 6910
},
{
"epoch": 0.5723265238607228,
"grad_norm": 0.04606041684746742,
"learning_rate": 4.222285426914744e-06,
"loss": 0.0226,
"step": 6920
},
{
"epoch": 0.5731535853113886,
"grad_norm": 0.055455636233091354,
"learning_rate": 4.208779051246264e-06,
"loss": 0.0217,
"step": 6930
},
{
"epoch": 0.5739806467620544,
"grad_norm": 0.05722310021519661,
"learning_rate": 4.1952785947316335e-06,
"loss": 0.0287,
"step": 6940
},
{
"epoch": 0.5748077082127202,
"grad_norm": 0.047114696353673935,
"learning_rate": 4.181784158368274e-06,
"loss": 0.0213,
"step": 6950
},
{
"epoch": 0.575634769663386,
"grad_norm": 0.041593633592128754,
"learning_rate": 4.1682958431085784e-06,
"loss": 0.0226,
"step": 6960
},
{
"epoch": 0.5764618311140518,
"grad_norm": 0.044355396181344986,
"learning_rate": 4.1548137498591415e-06,
"loss": 0.0214,
"step": 6970
},
{
"epoch": 0.5772888925647176,
"grad_norm": 0.043452925980091095,
"learning_rate": 4.141337979480014e-06,
"loss": 0.022,
"step": 6980
},
{
"epoch": 0.5781159540153833,
"grad_norm": 0.04600623995065689,
"learning_rate": 4.127868632783943e-06,
"loss": 0.0219,
"step": 6990
},
{
"epoch": 0.5789430154660491,
"grad_norm": 0.045817919075489044,
"learning_rate": 4.114405810535619e-06,
"loss": 0.0228,
"step": 7000
},
{
"epoch": 0.5789430154660491,
"eval_loss": 0.022890722379088402,
"eval_runtime": 1220.8476,
"eval_samples_per_second": 4.914,
"eval_steps_per_second": 0.307,
"step": 7000
},
{
"epoch": 0.5797700769167149,
"grad_norm": 0.04360632598400116,
"learning_rate": 4.100949613450929e-06,
"loss": 0.0232,
"step": 7010
},
{
"epoch": 0.5805971383673807,
"grad_norm": 0.11677900701761246,
"learning_rate": 4.087500142196188e-06,
"loss": 0.0239,
"step": 7020
},
{
"epoch": 0.5814241998180465,
"grad_norm": 0.03949005529284477,
"learning_rate": 4.074057497387402e-06,
"loss": 0.0215,
"step": 7030
},
{
"epoch": 0.5822512612687123,
"grad_norm": 0.04393787682056427,
"learning_rate": 4.060621779589505e-06,
"loss": 0.0224,
"step": 7040
},
{
"epoch": 0.5830783227193781,
"grad_norm": 0.05478642135858536,
"learning_rate": 4.047193089315608e-06,
"loss": 0.0217,
"step": 7050
},
{
"epoch": 0.5839053841700438,
"grad_norm": 0.05870141461491585,
"learning_rate": 4.033771527026252e-06,
"loss": 0.0218,
"step": 7060
},
{
"epoch": 0.5847324456207096,
"grad_norm": 0.04158046096563339,
"learning_rate": 4.020357193128655e-06,
"loss": 0.021,
"step": 7070
},
{
"epoch": 0.5855595070713754,
"grad_norm": 0.05177818983793259,
"learning_rate": 4.006950187975951e-06,
"loss": 0.0202,
"step": 7080
},
{
"epoch": 0.5863865685220412,
"grad_norm": 0.04415697604417801,
"learning_rate": 3.993550611866458e-06,
"loss": 0.0222,
"step": 7090
},
{
"epoch": 0.587213629972707,
"grad_norm": 0.06037106364965439,
"learning_rate": 3.980158565042908e-06,
"loss": 0.022,
"step": 7100
},
{
"epoch": 0.5880406914233728,
"grad_norm": 0.03998905047774315,
"learning_rate": 3.96677414769171e-06,
"loss": 0.0227,
"step": 7110
},
{
"epoch": 0.5888677528740386,
"grad_norm": 0.035650502890348434,
"learning_rate": 3.9533974599422e-06,
"loss": 0.0218,
"step": 7120
},
{
"epoch": 0.5896948143247043,
"grad_norm": 0.04198850691318512,
"learning_rate": 3.940028601865881e-06,
"loss": 0.0229,
"step": 7130
},
{
"epoch": 0.5905218757753701,
"grad_norm": 0.041859325021505356,
"learning_rate": 3.9266676734756894e-06,
"loss": 0.0217,
"step": 7140
},
{
"epoch": 0.5913489372260359,
"grad_norm": 0.04040461406111717,
"learning_rate": 3.913314774725234e-06,
"loss": 0.0212,
"step": 7150
},
{
"epoch": 0.5921759986767017,
"grad_norm": 0.04604990780353546,
"learning_rate": 3.899970005508053e-06,
"loss": 0.022,
"step": 7160
},
{
"epoch": 0.5930030601273675,
"grad_norm": 0.04515118896961212,
"learning_rate": 3.8866334656568765e-06,
"loss": 0.022,
"step": 7170
},
{
"epoch": 0.5938301215780333,
"grad_norm": 0.04524078220129013,
"learning_rate": 3.8733052549428566e-06,
"loss": 0.0215,
"step": 7180
},
{
"epoch": 0.5946571830286991,
"grad_norm": 0.04891633987426758,
"learning_rate": 3.859985473074847e-06,
"loss": 0.0226,
"step": 7190
},
{
"epoch": 0.5954842444793648,
"grad_norm": 0.042289573699235916,
"learning_rate": 3.846674219698635e-06,
"loss": 0.0213,
"step": 7200
},
{
"epoch": 0.5963113059300306,
"grad_norm": 0.04168631508946419,
"learning_rate": 3.833371594396214e-06,
"loss": 0.0228,
"step": 7210
},
{
"epoch": 0.5971383673806964,
"grad_norm": 0.04345110431313515,
"learning_rate": 3.820077696685027e-06,
"loss": 0.0213,
"step": 7220
},
{
"epoch": 0.5979654288313622,
"grad_norm": 0.04696614667773247,
"learning_rate": 3.8067926260172234e-06,
"loss": 0.0226,
"step": 7230
},
{
"epoch": 0.598792490282028,
"grad_norm": 0.041647132486104965,
"learning_rate": 3.793516481778924e-06,
"loss": 0.022,
"step": 7240
},
{
"epoch": 0.5996195517326938,
"grad_norm": 0.04166780784726143,
"learning_rate": 3.780249363289459e-06,
"loss": 0.0253,
"step": 7250
},
{
"epoch": 0.6004466131833596,
"grad_norm": 0.04384204372763634,
"learning_rate": 3.766991369800649e-06,
"loss": 0.0219,
"step": 7260
},
{
"epoch": 0.6012736746340253,
"grad_norm": 0.03765762969851494,
"learning_rate": 3.7537426004960446e-06,
"loss": 0.0207,
"step": 7270
},
{
"epoch": 0.6021007360846911,
"grad_norm": 0.04585011675953865,
"learning_rate": 3.7405031544901884e-06,
"loss": 0.0209,
"step": 7280
},
{
"epoch": 0.6029277975353569,
"grad_norm": 0.04622683674097061,
"learning_rate": 3.7272731308278777e-06,
"loss": 0.0225,
"step": 7290
},
{
"epoch": 0.6037548589860227,
"grad_norm": 0.061212386935949326,
"learning_rate": 3.714052628483417e-06,
"loss": 0.0202,
"step": 7300
},
{
"epoch": 0.6045819204366885,
"grad_norm": 0.0689668282866478,
"learning_rate": 3.700841746359889e-06,
"loss": 0.0222,
"step": 7310
},
{
"epoch": 0.6054089818873543,
"grad_norm": 0.041381001472473145,
"learning_rate": 3.6876405832884016e-06,
"loss": 0.0214,
"step": 7320
},
{
"epoch": 0.6062360433380201,
"grad_norm": 0.0484529472887516,
"learning_rate": 3.6744492380273533e-06,
"loss": 0.0219,
"step": 7330
},
{
"epoch": 0.6070631047886857,
"grad_norm": 0.04558572545647621,
"learning_rate": 3.661267809261698e-06,
"loss": 0.0212,
"step": 7340
},
{
"epoch": 0.6078901662393515,
"grad_norm": 0.03745023533701897,
"learning_rate": 3.648096395602202e-06,
"loss": 0.0231,
"step": 7350
},
{
"epoch": 0.6087172276900173,
"grad_norm": 0.04229872673749924,
"learning_rate": 3.6349350955847094e-06,
"loss": 0.0215,
"step": 7360
},
{
"epoch": 0.6095442891406831,
"grad_norm": 0.06078009679913521,
"learning_rate": 3.6217840076694066e-06,
"loss": 0.0233,
"step": 7370
},
{
"epoch": 0.610371350591349,
"grad_norm": 0.04391666501760483,
"learning_rate": 3.6086432302400754e-06,
"loss": 0.0218,
"step": 7380
},
{
"epoch": 0.6111984120420147,
"grad_norm": 0.04776912182569504,
"learning_rate": 3.5955128616033717e-06,
"loss": 0.0238,
"step": 7390
},
{
"epoch": 0.6120254734926805,
"grad_norm": 0.04561059549450874,
"learning_rate": 3.582392999988078e-06,
"loss": 0.0229,
"step": 7400
},
{
"epoch": 0.6128525349433462,
"grad_norm": 0.043533895164728165,
"learning_rate": 3.569283743544375e-06,
"loss": 0.022,
"step": 7410
},
{
"epoch": 0.613679596394012,
"grad_norm": 0.03526020050048828,
"learning_rate": 3.55618519034311e-06,
"loss": 0.0214,
"step": 7420
},
{
"epoch": 0.6145066578446778,
"grad_norm": 0.03638261556625366,
"learning_rate": 3.5430974383750503e-06,
"loss": 0.0208,
"step": 7430
},
{
"epoch": 0.6153337192953436,
"grad_norm": 0.04244010150432587,
"learning_rate": 3.530020585550166e-06,
"loss": 0.0224,
"step": 7440
},
{
"epoch": 0.6161607807460094,
"grad_norm": 0.03991573676466942,
"learning_rate": 3.5169547296968874e-06,
"loss": 0.0218,
"step": 7450
},
{
"epoch": 0.6169878421966752,
"grad_norm": 0.03916684165596962,
"learning_rate": 3.5038999685613752e-06,
"loss": 0.0212,
"step": 7460
},
{
"epoch": 0.617814903647341,
"grad_norm": 0.037909045815467834,
"learning_rate": 3.4908563998067945e-06,
"loss": 0.0222,
"step": 7470
},
{
"epoch": 0.6186419650980067,
"grad_norm": 0.048196956515312195,
"learning_rate": 3.4778241210125718e-06,
"loss": 0.021,
"step": 7480
},
{
"epoch": 0.6194690265486725,
"grad_norm": 0.04458421468734741,
"learning_rate": 3.4648032296736805e-06,
"loss": 0.0236,
"step": 7490
},
{
"epoch": 0.6202960879993383,
"grad_norm": 0.039592791348695755,
"learning_rate": 3.4517938231999026e-06,
"loss": 0.0228,
"step": 7500
},
{
"epoch": 0.6211231494500041,
"grad_norm": 0.048372309654951096,
"learning_rate": 3.4387959989150977e-06,
"loss": 0.0215,
"step": 7510
},
{
"epoch": 0.6219502109006699,
"grad_norm": 0.041564539074897766,
"learning_rate": 3.425809854056482e-06,
"loss": 0.0219,
"step": 7520
},
{
"epoch": 0.6227772723513357,
"grad_norm": 0.043838802725076675,
"learning_rate": 3.4128354857738942e-06,
"loss": 0.0208,
"step": 7530
},
{
"epoch": 0.6236043338020015,
"grad_norm": 0.04145396873354912,
"learning_rate": 3.3998729911290775e-06,
"loss": 0.0212,
"step": 7540
},
{
"epoch": 0.6244313952526672,
"grad_norm": 0.04556450992822647,
"learning_rate": 3.386922467094944e-06,
"loss": 0.023,
"step": 7550
},
{
"epoch": 0.625258456703333,
"grad_norm": 0.04290676862001419,
"learning_rate": 3.3739840105548528e-06,
"loss": 0.021,
"step": 7560
},
{
"epoch": 0.6260855181539988,
"grad_norm": 0.042239073663949966,
"learning_rate": 3.3610577183018877e-06,
"loss": 0.0225,
"step": 7570
},
{
"epoch": 0.6269125796046646,
"grad_norm": 0.04751691594719887,
"learning_rate": 3.348143687038128e-06,
"loss": 0.0215,
"step": 7580
},
{
"epoch": 0.6277396410553304,
"grad_norm": 0.04237852618098259,
"learning_rate": 3.3352420133739304e-06,
"loss": 0.0218,
"step": 7590
},
{
"epoch": 0.6285667025059962,
"grad_norm": 0.03740919381380081,
"learning_rate": 3.3223527938272076e-06,
"loss": 0.0213,
"step": 7600
},
{
"epoch": 0.629393763956662,
"grad_norm": 0.036210279911756516,
"learning_rate": 3.3094761248226948e-06,
"loss": 0.0255,
"step": 7610
},
{
"epoch": 0.6302208254073277,
"grad_norm": 0.04506264254450798,
"learning_rate": 3.296612102691241e-06,
"loss": 0.0224,
"step": 7620
},
{
"epoch": 0.6310478868579935,
"grad_norm": 0.04092979431152344,
"learning_rate": 3.283760823669082e-06,
"loss": 0.0206,
"step": 7630
},
{
"epoch": 0.6318749483086593,
"grad_norm": 0.04056790471076965,
"learning_rate": 3.270922383897121e-06,
"loss": 0.0213,
"step": 7640
},
{
"epoch": 0.6327020097593251,
"grad_norm": 0.03952750191092491,
"learning_rate": 3.258096879420216e-06,
"loss": 0.021,
"step": 7650
},
{
"epoch": 0.6335290712099909,
"grad_norm": 0.04810957983136177,
"learning_rate": 3.245284406186446e-06,
"loss": 0.0226,
"step": 7660
},
{
"epoch": 0.6343561326606567,
"grad_norm": 0.038928598165512085,
"learning_rate": 3.232485060046412e-06,
"loss": 0.0231,
"step": 7670
},
{
"epoch": 0.6351831941113225,
"grad_norm": 0.03903147578239441,
"learning_rate": 3.2196989367525035e-06,
"loss": 0.0255,
"step": 7680
},
{
"epoch": 0.6360102555619882,
"grad_norm": 0.04532884061336517,
"learning_rate": 3.2069261319581922e-06,
"loss": 0.02,
"step": 7690
},
{
"epoch": 0.636837317012654,
"grad_norm": 0.0435151644051075,
"learning_rate": 3.19416674121732e-06,
"loss": 0.022,
"step": 7700
},
{
"epoch": 0.6376643784633198,
"grad_norm": 0.04332192242145538,
"learning_rate": 3.1814208599833634e-06,
"loss": 0.0273,
"step": 7710
},
{
"epoch": 0.6384914399139856,
"grad_norm": 0.0369616374373436,
"learning_rate": 3.168688583608748e-06,
"loss": 0.0214,
"step": 7720
},
{
"epoch": 0.6393185013646514,
"grad_norm": 0.07783352583646774,
"learning_rate": 3.1559700073441123e-06,
"loss": 0.0213,
"step": 7730
},
{
"epoch": 0.6401455628153172,
"grad_norm": 0.0504750981926918,
"learning_rate": 3.1432652263376073e-06,
"loss": 0.0202,
"step": 7740
},
{
"epoch": 0.640972624265983,
"grad_norm": 0.0557858943939209,
"learning_rate": 3.130574335634181e-06,
"loss": 0.0222,
"step": 7750
},
{
"epoch": 0.6417996857166487,
"grad_norm": 0.0438205786049366,
"learning_rate": 3.117897430174863e-06,
"loss": 0.0211,
"step": 7760
},
{
"epoch": 0.6426267471673145,
"grad_norm": 0.04007831588387489,
"learning_rate": 3.1052346047960696e-06,
"loss": 0.0223,
"step": 7770
},
{
"epoch": 0.6434538086179803,
"grad_norm": 0.04356636852025986,
"learning_rate": 3.0925859542288695e-06,
"loss": 0.021,
"step": 7780
},
{
"epoch": 0.6442808700686461,
"grad_norm": 0.044068679213523865,
"learning_rate": 3.0799515730982987e-06,
"loss": 0.0239,
"step": 7790
},
{
"epoch": 0.6451079315193119,
"grad_norm": 0.058787260204553604,
"learning_rate": 3.0673315559226426e-06,
"loss": 0.0223,
"step": 7800
},
{
"epoch": 0.6459349929699777,
"grad_norm": 0.04351416230201721,
"learning_rate": 3.054725997112724e-06,
"loss": 0.0227,
"step": 7810
},
{
"epoch": 0.6467620544206435,
"grad_norm": 0.0457034632563591,
"learning_rate": 3.042134990971205e-06,
"loss": 0.021,
"step": 7820
},
{
"epoch": 0.6475891158713092,
"grad_norm": 0.04021298885345459,
"learning_rate": 3.0295586316918816e-06,
"loss": 0.0205,
"step": 7830
},
{
"epoch": 0.648416177321975,
"grad_norm": 0.045050378888845444,
"learning_rate": 3.0169970133589714e-06,
"loss": 0.0217,
"step": 7840
},
{
"epoch": 0.6492432387726408,
"grad_norm": 0.036717429757118225,
"learning_rate": 3.004450229946418e-06,
"loss": 0.0218,
"step": 7850
},
{
"epoch": 0.6500703002233066,
"grad_norm": 0.05614123493432999,
"learning_rate": 2.99191837531718e-06,
"loss": 0.0234,
"step": 7860
},
{
"epoch": 0.6508973616739724,
"grad_norm": 0.037934400141239166,
"learning_rate": 2.9794015432225363e-06,
"loss": 0.022,
"step": 7870
},
{
"epoch": 0.6517244231246382,
"grad_norm": 0.04340437054634094,
"learning_rate": 2.966899827301386e-06,
"loss": 0.0286,
"step": 7880
},
{
"epoch": 0.652551484575304,
"grad_norm": 0.04128657281398773,
"learning_rate": 2.9544133210795317e-06,
"loss": 0.0217,
"step": 7890
},
{
"epoch": 0.6533785460259697,
"grad_norm": 0.04219742491841316,
"learning_rate": 2.9419421179690044e-06,
"loss": 0.0207,
"step": 7900
},
{
"epoch": 0.6542056074766355,
"grad_norm": 0.04193083569407463,
"learning_rate": 2.929486311267343e-06,
"loss": 0.0218,
"step": 7910
},
{
"epoch": 0.6550326689273013,
"grad_norm": 0.03400260955095291,
"learning_rate": 2.9170459941569094e-06,
"loss": 0.0215,
"step": 7920
},
{
"epoch": 0.6558597303779671,
"grad_norm": 0.04170495644211769,
"learning_rate": 2.904621259704188e-06,
"loss": 0.0219,
"step": 7930
},
{
"epoch": 0.6566867918286329,
"grad_norm": 0.04302512854337692,
"learning_rate": 2.892212200859086e-06,
"loss": 0.0244,
"step": 7940
},
{
"epoch": 0.6575138532792987,
"grad_norm": 0.043327417224645615,
"learning_rate": 2.8798189104542436e-06,
"loss": 0.022,
"step": 7950
},
{
"epoch": 0.6583409147299645,
"grad_norm": 0.05167660862207413,
"learning_rate": 2.8674414812043317e-06,
"loss": 0.0205,
"step": 7960
},
{
"epoch": 0.6591679761806302,
"grad_norm": 0.061974212527275085,
"learning_rate": 2.855080005705367e-06,
"loss": 0.0243,
"step": 7970
},
{
"epoch": 0.659995037631296,
"grad_norm": 0.04321138933300972,
"learning_rate": 2.842734576434021e-06,
"loss": 0.0212,
"step": 7980
},
{
"epoch": 0.6608220990819618,
"grad_norm": 0.05327922850847244,
"learning_rate": 2.8304052857469107e-06,
"loss": 0.021,
"step": 7990
},
{
"epoch": 0.6616491605326276,
"grad_norm": 0.04471385106444359,
"learning_rate": 2.8180922258799286e-06,
"loss": 0.0214,
"step": 8000
},
{
"epoch": 0.6616491605326276,
"eval_loss": 0.022467145696282387,
"eval_runtime": 1221.4961,
"eval_samples_per_second": 4.911,
"eval_steps_per_second": 0.307,
"step": 8000
},
{
"epoch": 0.6624762219832934,
"grad_norm": 0.045148443430662155,
"learning_rate": 2.8057954889475415e-06,
"loss": 0.0216,
"step": 8010
},
{
"epoch": 0.6633032834339592,
"grad_norm": 0.04102947190403938,
"learning_rate": 2.7935151669421033e-06,
"loss": 0.0208,
"step": 8020
},
{
"epoch": 0.664130344884625,
"grad_norm": 0.0464673787355423,
"learning_rate": 2.7812513517331695e-06,
"loss": 0.0206,
"step": 8030
},
{
"epoch": 0.6649574063352907,
"grad_norm": 0.04477581009268761,
"learning_rate": 2.7690041350667995e-06,
"loss": 0.0215,
"step": 8040
},
{
"epoch": 0.6657844677859565,
"grad_norm": 0.043795693665742874,
"learning_rate": 2.7567736085648935e-06,
"loss": 0.0219,
"step": 8050
},
{
"epoch": 0.6666115292366223,
"grad_norm": 0.04666496440768242,
"learning_rate": 2.7445598637244746e-06,
"loss": 0.021,
"step": 8060
},
{
"epoch": 0.6674385906872881,
"grad_norm": 0.039747051894664764,
"learning_rate": 2.7323629919170334e-06,
"loss": 0.0219,
"step": 8070
},
{
"epoch": 0.6682656521379539,
"grad_norm": 0.037099067121744156,
"learning_rate": 2.72018308438783e-06,
"loss": 0.02,
"step": 8080
},
{
"epoch": 0.6690927135886197,
"grad_norm": 0.0401119664311409,
"learning_rate": 2.7080202322552126e-06,
"loss": 0.0214,
"step": 8090
},
{
"epoch": 0.6699197750392855,
"grad_norm": 0.0409838892519474,
"learning_rate": 2.6958745265099397e-06,
"loss": 0.0205,
"step": 8100
},
{
"epoch": 0.6707468364899511,
"grad_norm": 0.035290639847517014,
"learning_rate": 2.683746058014489e-06,
"loss": 0.0209,
"step": 8110
},
{
"epoch": 0.671573897940617,
"grad_norm": 0.03809922933578491,
"learning_rate": 2.6716349175023997e-06,
"loss": 0.022,
"step": 8120
},
{
"epoch": 0.6724009593912827,
"grad_norm": 0.044197600334882736,
"learning_rate": 2.659541195577571e-06,
"loss": 0.02,
"step": 8130
},
{
"epoch": 0.6732280208419485,
"grad_norm": 0.041063982993364334,
"learning_rate": 2.6474649827135913e-06,
"loss": 0.0203,
"step": 8140
},
{
"epoch": 0.6740550822926143,
"grad_norm": 0.039071984589099884,
"learning_rate": 2.635406369253066e-06,
"loss": 0.0216,
"step": 8150
},
{
"epoch": 0.6748821437432801,
"grad_norm": 0.038477640599012375,
"learning_rate": 2.6233654454069397e-06,
"loss": 0.0217,
"step": 8160
},
{
"epoch": 0.675709205193946,
"grad_norm": 0.05265484377741814,
"learning_rate": 2.6113423012538184e-06,
"loss": 0.0223,
"step": 8170
},
{
"epoch": 0.6765362666446118,
"grad_norm": 0.04026918113231659,
"learning_rate": 2.5993370267392998e-06,
"loss": 0.0212,
"step": 8180
},
{
"epoch": 0.6773633280952774,
"grad_norm": 0.040949251502752304,
"learning_rate": 2.5873497116752955e-06,
"loss": 0.0218,
"step": 8190
},
{
"epoch": 0.6781903895459432,
"grad_norm": 0.04553502798080444,
"learning_rate": 2.575380445739363e-06,
"loss": 0.0224,
"step": 8200
},
{
"epoch": 0.679017450996609,
"grad_norm": 0.040991537272930145,
"learning_rate": 2.5634293184740337e-06,
"loss": 0.0207,
"step": 8210
},
{
"epoch": 0.6798445124472748,
"grad_norm": 0.04071825370192528,
"learning_rate": 2.551496419286143e-06,
"loss": 0.0215,
"step": 8220
},
{
"epoch": 0.6806715738979406,
"grad_norm": 0.04148703068494797,
"learning_rate": 2.5395818374461626e-06,
"loss": 0.0215,
"step": 8230
},
{
"epoch": 0.6814986353486064,
"grad_norm": 0.04831210896372795,
"learning_rate": 2.5276856620875267e-06,
"loss": 0.0204,
"step": 8240
},
{
"epoch": 0.6823256967992722,
"grad_norm": 0.05425499007105827,
"learning_rate": 2.5158079822059726e-06,
"loss": 0.0214,
"step": 8250
},
{
"epoch": 0.6831527582499379,
"grad_norm": 0.042809970676898956,
"learning_rate": 2.503948886658879e-06,
"loss": 0.0204,
"step": 8260
},
{
"epoch": 0.6839798197006037,
"grad_norm": 0.039092812687158585,
"learning_rate": 2.492108464164582e-06,
"loss": 0.0209,
"step": 8270
},
{
"epoch": 0.6848068811512695,
"grad_norm": 0.0440199077129364,
"learning_rate": 2.4802868033017325e-06,
"loss": 0.0205,
"step": 8280
},
{
"epoch": 0.6856339426019353,
"grad_norm": 0.04389241337776184,
"learning_rate": 2.4684839925086222e-06,
"loss": 0.0218,
"step": 8290
},
{
"epoch": 0.6864610040526011,
"grad_norm": 0.03971746936440468,
"learning_rate": 2.4567001200825257e-06,
"loss": 0.0211,
"step": 8300
},
{
"epoch": 0.6872880655032669,
"grad_norm": 0.03864897042512894,
"learning_rate": 2.44493527417904e-06,
"loss": 0.0224,
"step": 8310
},
{
"epoch": 0.6881151269539327,
"grad_norm": 0.04412490129470825,
"learning_rate": 2.4331895428114167e-06,
"loss": 0.0206,
"step": 8320
},
{
"epoch": 0.6889421884045984,
"grad_norm": 0.045335933566093445,
"learning_rate": 2.4214630138499235e-06,
"loss": 0.0203,
"step": 8330
},
{
"epoch": 0.6897692498552642,
"grad_norm": 0.040548257529735565,
"learning_rate": 2.4097557750211627e-06,
"loss": 0.0208,
"step": 8340
},
{
"epoch": 0.69059631130593,
"grad_norm": 0.043131329119205475,
"learning_rate": 2.3980679139074314e-06,
"loss": 0.021,
"step": 8350
},
{
"epoch": 0.6914233727565958,
"grad_norm": 0.039993565529584885,
"learning_rate": 2.3863995179460612e-06,
"loss": 0.0222,
"step": 8360
},
{
"epoch": 0.6922504342072616,
"grad_norm": 0.037337690591812134,
"learning_rate": 2.374750674428764e-06,
"loss": 0.0218,
"step": 8370
},
{
"epoch": 0.6930774956579274,
"grad_norm": 0.042838480323553085,
"learning_rate": 2.3631214705009806e-06,
"loss": 0.0208,
"step": 8380
},
{
"epoch": 0.6939045571085932,
"grad_norm": 0.036257416009902954,
"learning_rate": 2.3515119931612196e-06,
"loss": 0.02,
"step": 8390
},
{
"epoch": 0.6947316185592589,
"grad_norm": 0.042761024087667465,
"learning_rate": 2.339922329260426e-06,
"loss": 0.0223,
"step": 8400
},
{
"epoch": 0.6955586800099247,
"grad_norm": 0.04689721390604973,
"learning_rate": 2.328352565501314e-06,
"loss": 0.0235,
"step": 8410
},
{
"epoch": 0.6963857414605905,
"grad_norm": 0.04648851230740547,
"learning_rate": 2.316802788437719e-06,
"loss": 0.0217,
"step": 8420
},
{
"epoch": 0.6972128029112563,
"grad_norm": 0.04260076582431793,
"learning_rate": 2.3052730844739636e-06,
"loss": 0.0216,
"step": 8430
},
{
"epoch": 0.6980398643619221,
"grad_norm": 0.042848605662584305,
"learning_rate": 2.293763539864199e-06,
"loss": 0.0214,
"step": 8440
},
{
"epoch": 0.6988669258125879,
"grad_norm": 0.039934489876031876,
"learning_rate": 2.2822742407117625e-06,
"loss": 0.0202,
"step": 8450
},
{
"epoch": 0.6996939872632537,
"grad_norm": 0.03947708010673523,
"learning_rate": 2.270805272968537e-06,
"loss": 0.0207,
"step": 8460
},
{
"epoch": 0.7005210487139194,
"grad_norm": 0.03535833582282066,
"learning_rate": 2.2593567224343037e-06,
"loss": 0.0225,
"step": 8470
},
{
"epoch": 0.7013481101645852,
"grad_norm": 0.04926292970776558,
"learning_rate": 2.2479286747561037e-06,
"loss": 0.0221,
"step": 8480
},
{
"epoch": 0.702175171615251,
"grad_norm": 0.03978796303272247,
"learning_rate": 2.2365212154275908e-06,
"loss": 0.0226,
"step": 8490
},
{
"epoch": 0.7030022330659168,
"grad_norm": 0.04777059331536293,
"learning_rate": 2.2251344297883996e-06,
"loss": 0.0204,
"step": 8500
},
{
"epoch": 0.7038292945165826,
"grad_norm": 0.04967991262674332,
"learning_rate": 2.2137684030235095e-06,
"loss": 0.0203,
"step": 8510
},
{
"epoch": 0.7046563559672484,
"grad_norm": 0.04070328548550606,
"learning_rate": 2.202423220162591e-06,
"loss": 0.0214,
"step": 8520
},
{
"epoch": 0.7054834174179142,
"grad_norm": 0.036942508071660995,
"learning_rate": 2.191098966079389e-06,
"loss": 0.0205,
"step": 8530
},
{
"epoch": 0.7063104788685799,
"grad_norm": 0.042975060641765594,
"learning_rate": 2.1797957254910757e-06,
"loss": 0.0218,
"step": 8540
},
{
"epoch": 0.7071375403192457,
"grad_norm": 0.044698718935251236,
"learning_rate": 2.168513582957622e-06,
"loss": 0.0225,
"step": 8550
},
{
"epoch": 0.7079646017699115,
"grad_norm": 0.0593951940536499,
"learning_rate": 2.1572526228811645e-06,
"loss": 0.0205,
"step": 8560
},
{
"epoch": 0.7087916632205773,
"grad_norm": 0.042812854051589966,
"learning_rate": 2.1460129295053666e-06,
"loss": 0.0215,
"step": 8570
},
{
"epoch": 0.7096187246712431,
"grad_norm": 0.05073460936546326,
"learning_rate": 2.134794586914806e-06,
"loss": 0.0234,
"step": 8580
},
{
"epoch": 0.7104457861219089,
"grad_norm": 0.03609664365649223,
"learning_rate": 2.123597679034324e-06,
"loss": 0.02,
"step": 8590
},
{
"epoch": 0.7112728475725747,
"grad_norm": 0.040147822350263596,
"learning_rate": 2.112422289628412e-06,
"loss": 0.0205,
"step": 8600
},
{
"epoch": 0.7120999090232404,
"grad_norm": 0.039646077901124954,
"learning_rate": 2.101268502300582e-06,
"loss": 0.0213,
"step": 8610
},
{
"epoch": 0.7129269704739062,
"grad_norm": 0.04966466873884201,
"learning_rate": 2.090136400492739e-06,
"loss": 0.0244,
"step": 8620
},
{
"epoch": 0.713754031924572,
"grad_norm": 0.04764994978904724,
"learning_rate": 2.0790260674845563e-06,
"loss": 0.0202,
"step": 8630
},
{
"epoch": 0.7145810933752378,
"grad_norm": 0.04711426794528961,
"learning_rate": 2.0679375863928576e-06,
"loss": 0.0214,
"step": 8640
},
{
"epoch": 0.7154081548259036,
"grad_norm": 0.04078923165798187,
"learning_rate": 2.056871040170988e-06,
"loss": 0.0199,
"step": 8650
},
{
"epoch": 0.7162352162765694,
"grad_norm": 0.039174020290374756,
"learning_rate": 2.0458265116082002e-06,
"loss": 0.021,
"step": 8660
},
{
"epoch": 0.7170622777272352,
"grad_norm": 0.04337885230779648,
"learning_rate": 2.034804083329027e-06,
"loss": 0.0208,
"step": 8670
},
{
"epoch": 0.7178893391779009,
"grad_norm": 0.04172796383500099,
"learning_rate": 2.0238038377926715e-06,
"loss": 0.0218,
"step": 8680
},
{
"epoch": 0.7187164006285667,
"grad_norm": 0.043501630425453186,
"learning_rate": 2.012825857292392e-06,
"loss": 0.0232,
"step": 8690
},
{
"epoch": 0.7195434620792325,
"grad_norm": 0.04335128515958786,
"learning_rate": 2.00187022395487e-06,
"loss": 0.0215,
"step": 8700
},
{
"epoch": 0.7203705235298983,
"grad_norm": 0.04587217792868614,
"learning_rate": 1.9909370197396148e-06,
"loss": 0.0246,
"step": 8710
},
{
"epoch": 0.7211975849805641,
"grad_norm": 0.037936147302389145,
"learning_rate": 1.9800263264383405e-06,
"loss": 0.0206,
"step": 8720
},
{
"epoch": 0.7220246464312299,
"grad_norm": 0.0373714417219162,
"learning_rate": 1.969138225674358e-06,
"loss": 0.0213,
"step": 8730
},
{
"epoch": 0.7228517078818957,
"grad_norm": 0.04090265929698944,
"learning_rate": 1.9582727989019607e-06,
"loss": 0.021,
"step": 8740
},
{
"epoch": 0.7236787693325614,
"grad_norm": 0.033642202615737915,
"learning_rate": 1.9474301274058125e-06,
"loss": 0.0198,
"step": 8750
},
{
"epoch": 0.7245058307832272,
"grad_norm": 0.0450110137462616,
"learning_rate": 1.9366102923003578e-06,
"loss": 0.0202,
"step": 8760
},
{
"epoch": 0.725332892233893,
"grad_norm": 0.03714507818222046,
"learning_rate": 1.9258133745291845e-06,
"loss": 0.0211,
"step": 8770
},
{
"epoch": 0.7261599536845588,
"grad_norm": 0.04287153109908104,
"learning_rate": 1.9150394548644463e-06,
"loss": 0.02,
"step": 8780
},
{
"epoch": 0.7269870151352246,
"grad_norm": 0.041864458471536636,
"learning_rate": 1.9042886139062427e-06,
"loss": 0.0218,
"step": 8790
},
{
"epoch": 0.7278140765858904,
"grad_norm": 0.11404255032539368,
"learning_rate": 1.893560932082023e-06,
"loss": 0.0224,
"step": 8800
},
{
"epoch": 0.7286411380365562,
"grad_norm": 0.04448498412966728,
"learning_rate": 1.8828564896459795e-06,
"loss": 0.0217,
"step": 8810
},
{
"epoch": 0.7294681994872219,
"grad_norm": 0.038884907960891724,
"learning_rate": 1.872175366678451e-06,
"loss": 0.0206,
"step": 8820
},
{
"epoch": 0.7302952609378877,
"grad_norm": 0.041435256600379944,
"learning_rate": 1.8615176430853231e-06,
"loss": 0.0211,
"step": 8830
},
{
"epoch": 0.7311223223885535,
"grad_norm": 0.04282752797007561,
"learning_rate": 1.8508833985974306e-06,
"loss": 0.0209,
"step": 8840
},
{
"epoch": 0.7319493838392193,
"grad_norm": 0.043493740260601044,
"learning_rate": 1.8402727127699537e-06,
"loss": 0.02,
"step": 8850
},
{
"epoch": 0.7327764452898851,
"grad_norm": 0.036238010972738266,
"learning_rate": 1.8296856649818418e-06,
"loss": 0.0211,
"step": 8860
},
{
"epoch": 0.7336035067405509,
"grad_norm": 0.04608851671218872,
"learning_rate": 1.8191223344351932e-06,
"loss": 0.0222,
"step": 8870
},
{
"epoch": 0.7344305681912167,
"grad_norm": 0.04390549659729004,
"learning_rate": 1.8085828001546869e-06,
"loss": 0.0207,
"step": 8880
},
{
"epoch": 0.7352576296418823,
"grad_norm": 0.04080136865377426,
"learning_rate": 1.798067140986976e-06,
"loss": 0.0215,
"step": 8890
},
{
"epoch": 0.7360846910925481,
"grad_norm": 0.05361476168036461,
"learning_rate": 1.7875754356001052e-06,
"loss": 0.0215,
"step": 8900
},
{
"epoch": 0.736911752543214,
"grad_norm": 0.041193023324012756,
"learning_rate": 1.7771077624829213e-06,
"loss": 0.0226,
"step": 8910
},
{
"epoch": 0.7377388139938797,
"grad_norm": 0.05157098174095154,
"learning_rate": 1.7666641999444777e-06,
"loss": 0.0213,
"step": 8920
},
{
"epoch": 0.7385658754445455,
"grad_norm": 0.038595810532569885,
"learning_rate": 1.7562448261134658e-06,
"loss": 0.0204,
"step": 8930
},
{
"epoch": 0.7393929368952114,
"grad_norm": 0.12353651970624924,
"learning_rate": 1.7458497189376145e-06,
"loss": 0.0208,
"step": 8940
},
{
"epoch": 0.7402199983458772,
"grad_norm": 0.04080955684185028,
"learning_rate": 1.735478956183112e-06,
"loss": 0.0203,
"step": 8950
},
{
"epoch": 0.7410470597965428,
"grad_norm": 0.0376775749027729,
"learning_rate": 1.725132615434027e-06,
"loss": 0.0214,
"step": 8960
},
{
"epoch": 0.7418741212472086,
"grad_norm": 0.04121479019522667,
"learning_rate": 1.7148107740917269e-06,
"loss": 0.0222,
"step": 8970
},
{
"epoch": 0.7427011826978744,
"grad_norm": 0.03592400997877121,
"learning_rate": 1.7045135093742976e-06,
"loss": 0.0207,
"step": 8980
},
{
"epoch": 0.7435282441485402,
"grad_norm": 0.03217403218150139,
"learning_rate": 1.6942408983159648e-06,
"loss": 0.0208,
"step": 8990
},
{
"epoch": 0.744355305599206,
"grad_norm": 0.038189876824617386,
"learning_rate": 1.6839930177665208e-06,
"loss": 0.0232,
"step": 9000
},
{
"epoch": 0.744355305599206,
"eval_loss": 0.02212439477443695,
"eval_runtime": 1220.8964,
"eval_samples_per_second": 4.914,
"eval_steps_per_second": 0.307,
"step": 9000
},
{
"epoch": 0.7451823670498718,
"grad_norm": 0.04287660866975784,
"learning_rate": 1.6737699443907486e-06,
"loss": 0.0203,
"step": 9010
},
{
"epoch": 0.7460094285005376,
"grad_norm": 0.03629004582762718,
"learning_rate": 1.663571754667847e-06,
"loss": 0.0209,
"step": 9020
},
{
"epoch": 0.7468364899512033,
"grad_norm": 0.042142104357481,
"learning_rate": 1.6533985248908551e-06,
"loss": 0.0203,
"step": 9030
},
{
"epoch": 0.7476635514018691,
"grad_norm": 0.042162686586380005,
"learning_rate": 1.6432503311660963e-06,
"loss": 0.0195,
"step": 9040
},
{
"epoch": 0.7484906128525349,
"grad_norm": 0.042325787246227264,
"learning_rate": 1.6331272494125865e-06,
"loss": 0.025,
"step": 9050
},
{
"epoch": 0.7493176743032007,
"grad_norm": 0.03958788514137268,
"learning_rate": 1.6230293553614851e-06,
"loss": 0.0208,
"step": 9060
},
{
"epoch": 0.7501447357538665,
"grad_norm": 0.04631664603948593,
"learning_rate": 1.612956724555519e-06,
"loss": 0.0222,
"step": 9070
},
{
"epoch": 0.7509717972045323,
"grad_norm": 0.03541667386889458,
"learning_rate": 1.6029094323484207e-06,
"loss": 0.0188,
"step": 9080
},
{
"epoch": 0.7517988586551981,
"grad_norm": 0.04303780198097229,
"learning_rate": 1.5928875539043649e-06,
"loss": 0.0218,
"step": 9090
},
{
"epoch": 0.7526259201058638,
"grad_norm": 0.045209407806396484,
"learning_rate": 1.5828911641973981e-06,
"loss": 0.0216,
"step": 9100
},
{
"epoch": 0.7534529815565296,
"grad_norm": 0.0393962636590004,
"learning_rate": 1.5729203380108955e-06,
"loss": 0.0201,
"step": 9110
},
{
"epoch": 0.7542800430071954,
"grad_norm": 0.04141068086028099,
"learning_rate": 1.5629751499369839e-06,
"loss": 0.0221,
"step": 9120
},
{
"epoch": 0.7551071044578612,
"grad_norm": 0.04183319956064224,
"learning_rate": 1.553055674375989e-06,
"loss": 0.0207,
"step": 9130
},
{
"epoch": 0.755934165908527,
"grad_norm": 0.04659945145249367,
"learning_rate": 1.5431619855358842e-06,
"loss": 0.0228,
"step": 9140
},
{
"epoch": 0.7567612273591928,
"grad_norm": 0.04036922752857208,
"learning_rate": 1.5332941574317294e-06,
"loss": 0.0218,
"step": 9150
},
{
"epoch": 0.7575882888098586,
"grad_norm": 0.04024342820048332,
"learning_rate": 1.5234522638851213e-06,
"loss": 0.0213,
"step": 9160
},
{
"epoch": 0.7584153502605243,
"grad_norm": 0.04086223989725113,
"learning_rate": 1.5136363785236362e-06,
"loss": 0.0206,
"step": 9170
},
{
"epoch": 0.7592424117111901,
"grad_norm": 0.045924026519060135,
"learning_rate": 1.503846574780285e-06,
"loss": 0.0212,
"step": 9180
},
{
"epoch": 0.7600694731618559,
"grad_norm": 0.0389275960624218,
"learning_rate": 1.4940829258929606e-06,
"loss": 0.0217,
"step": 9190
},
{
"epoch": 0.7608965346125217,
"grad_norm": 0.042410727590322495,
"learning_rate": 1.4843455049038869e-06,
"loss": 0.0206,
"step": 9200
},
{
"epoch": 0.7617235960631875,
"grad_norm": 0.04143417999148369,
"learning_rate": 1.4746343846590783e-06,
"loss": 0.0218,
"step": 9210
},
{
"epoch": 0.7625506575138533,
"grad_norm": 0.04118340089917183,
"learning_rate": 1.4649496378077983e-06,
"loss": 0.0203,
"step": 9220
},
{
"epoch": 0.7633777189645191,
"grad_norm": 0.04239552468061447,
"learning_rate": 1.455291336801999e-06,
"loss": 0.0222,
"step": 9230
},
{
"epoch": 0.7642047804151848,
"grad_norm": 0.041403092443943024,
"learning_rate": 1.4456595538957974e-06,
"loss": 0.0211,
"step": 9240
},
{
"epoch": 0.7650318418658506,
"grad_norm": 0.12348439544439316,
"learning_rate": 1.436054361144925e-06,
"loss": 0.0215,
"step": 9250
},
{
"epoch": 0.7658589033165164,
"grad_norm": 0.04165393486618996,
"learning_rate": 1.4264758304061938e-06,
"loss": 0.0202,
"step": 9260
},
{
"epoch": 0.7666859647671822,
"grad_norm": 0.044469356536865234,
"learning_rate": 1.4169240333369543e-06,
"loss": 0.0207,
"step": 9270
},
{
"epoch": 0.767513026217848,
"grad_norm": 0.04392145201563835,
"learning_rate": 1.4073990413945582e-06,
"loss": 0.0208,
"step": 9280
},
{
"epoch": 0.7683400876685138,
"grad_norm": 0.043122172355651855,
"learning_rate": 1.3979009258358367e-06,
"loss": 0.021,
"step": 9290
},
{
"epoch": 0.7691671491191796,
"grad_norm": 0.0898752361536026,
"learning_rate": 1.3884297577165462e-06,
"loss": 0.0212,
"step": 9300
},
{
"epoch": 0.7699942105698453,
"grad_norm": 0.04254557564854622,
"learning_rate": 1.378985607890856e-06,
"loss": 0.0219,
"step": 9310
},
{
"epoch": 0.7708212720205111,
"grad_norm": 0.05117588862776756,
"learning_rate": 1.3695685470108078e-06,
"loss": 0.0219,
"step": 9320
},
{
"epoch": 0.7716483334711769,
"grad_norm": 0.04056469351053238,
"learning_rate": 1.3601786455257905e-06,
"loss": 0.0207,
"step": 9330
},
{
"epoch": 0.7724753949218427,
"grad_norm": 0.05269391089677811,
"learning_rate": 1.3508159736820132e-06,
"loss": 0.0217,
"step": 9340
},
{
"epoch": 0.7733024563725085,
"grad_norm": 0.036445554345846176,
"learning_rate": 1.341480601521974e-06,
"loss": 0.0211,
"step": 9350
},
{
"epoch": 0.7741295178231743,
"grad_norm": 0.04814046248793602,
"learning_rate": 1.33217259888395e-06,
"loss": 0.0212,
"step": 9360
},
{
"epoch": 0.7749565792738401,
"grad_norm": 0.038837458938360214,
"learning_rate": 1.3228920354014607e-06,
"loss": 0.0209,
"step": 9370
},
{
"epoch": 0.7757836407245058,
"grad_norm": 0.0410507507622242,
"learning_rate": 1.31363898050275e-06,
"loss": 0.0205,
"step": 9380
},
{
"epoch": 0.7766107021751716,
"grad_norm": 0.03645321726799011,
"learning_rate": 1.3044135034102711e-06,
"loss": 0.0207,
"step": 9390
},
{
"epoch": 0.7774377636258374,
"grad_norm": 0.040732916444540024,
"learning_rate": 1.2952156731401716e-06,
"loss": 0.0202,
"step": 9400
},
{
"epoch": 0.7782648250765032,
"grad_norm": 0.043882377445697784,
"learning_rate": 1.2860455585017634e-06,
"loss": 0.0204,
"step": 9410
},
{
"epoch": 0.779091886527169,
"grad_norm": 0.038812581449747086,
"learning_rate": 1.2769032280970222e-06,
"loss": 0.0209,
"step": 9420
},
{
"epoch": 0.7799189479778348,
"grad_norm": 0.049354683607816696,
"learning_rate": 1.2677887503200681e-06,
"loss": 0.0197,
"step": 9430
},
{
"epoch": 0.7807460094285006,
"grad_norm": 0.030933791771531105,
"learning_rate": 1.258702193356654e-06,
"loss": 0.0223,
"step": 9440
},
{
"epoch": 0.7815730708791663,
"grad_norm": 0.03828246891498566,
"learning_rate": 1.2496436251836563e-06,
"loss": 0.0231,
"step": 9450
},
{
"epoch": 0.7824001323298321,
"grad_norm": 0.04567508026957512,
"learning_rate": 1.2406131135685656e-06,
"loss": 0.0217,
"step": 9460
},
{
"epoch": 0.7832271937804979,
"grad_norm": 0.04124726355075836,
"learning_rate": 1.231610726068983e-06,
"loss": 0.0207,
"step": 9470
},
{
"epoch": 0.7840542552311637,
"grad_norm": 0.03909214958548546,
"learning_rate": 1.2226365300321063e-06,
"loss": 0.021,
"step": 9480
},
{
"epoch": 0.7848813166818295,
"grad_norm": 0.03634734824299812,
"learning_rate": 1.2136905925942367e-06,
"loss": 0.0214,
"step": 9490
},
{
"epoch": 0.7857083781324953,
"grad_norm": 0.04165159910917282,
"learning_rate": 1.2047729806802739e-06,
"loss": 0.0205,
"step": 9500
},
{
"epoch": 0.7865354395831611,
"grad_norm": 0.042302560061216354,
"learning_rate": 1.195883761003206e-06,
"loss": 0.0216,
"step": 9510
},
{
"epoch": 0.7873625010338268,
"grad_norm": 0.05284997075796127,
"learning_rate": 1.187023000063623e-06,
"loss": 0.0205,
"step": 9520
},
{
"epoch": 0.7881895624844926,
"grad_norm": 0.037878263741731644,
"learning_rate": 1.1781907641492129e-06,
"loss": 0.0224,
"step": 9530
},
{
"epoch": 0.7890166239351584,
"grad_norm": 0.04263276234269142,
"learning_rate": 1.169387119334266e-06,
"loss": 0.0222,
"step": 9540
},
{
"epoch": 0.7898436853858242,
"grad_norm": 0.042502984404563904,
"learning_rate": 1.1606121314791846e-06,
"loss": 0.0216,
"step": 9550
},
{
"epoch": 0.79067074683649,
"grad_norm": 0.041224028915166855,
"learning_rate": 1.1518658662299798e-06,
"loss": 0.0232,
"step": 9560
},
{
"epoch": 0.7914978082871558,
"grad_norm": 0.04057363048195839,
"learning_rate": 1.1431483890177991e-06,
"loss": 0.0209,
"step": 9570
},
{
"epoch": 0.7923248697378216,
"grad_norm": 0.038158901035785675,
"learning_rate": 1.1344597650584139e-06,
"loss": 0.0212,
"step": 9580
},
{
"epoch": 0.7931519311884873,
"grad_norm": 0.038800351321697235,
"learning_rate": 1.1258000593517516e-06,
"loss": 0.0201,
"step": 9590
},
{
"epoch": 0.7939789926391531,
"grad_norm": 0.044678255915641785,
"learning_rate": 1.1171693366813967e-06,
"loss": 0.0209,
"step": 9600
},
{
"epoch": 0.7948060540898189,
"grad_norm": 0.038671303540468216,
"learning_rate": 1.1085676616141133e-06,
"loss": 0.021,
"step": 9610
},
{
"epoch": 0.7956331155404847,
"grad_norm": 0.03674842417240143,
"learning_rate": 1.0999950984993584e-06,
"loss": 0.0221,
"step": 9620
},
{
"epoch": 0.7964601769911505,
"grad_norm": 0.04083636775612831,
"learning_rate": 1.0914517114687973e-06,
"loss": 0.0285,
"step": 9630
},
{
"epoch": 0.7972872384418163,
"grad_norm": 0.03498871251940727,
"learning_rate": 1.0829375644358352e-06,
"loss": 0.0197,
"step": 9640
},
{
"epoch": 0.7981142998924821,
"grad_norm": 0.04270506650209427,
"learning_rate": 1.074452721095129e-06,
"loss": 0.0199,
"step": 9650
},
{
"epoch": 0.7989413613431477,
"grad_norm": 0.04146299883723259,
"learning_rate": 1.065997244922109e-06,
"loss": 0.0209,
"step": 9660
},
{
"epoch": 0.7997684227938135,
"grad_norm": 0.0386267714202404,
"learning_rate": 1.057571199172514e-06,
"loss": 0.0204,
"step": 9670
},
{
"epoch": 0.8005954842444793,
"grad_norm": 0.03857827186584473,
"learning_rate": 1.0491746468819114e-06,
"loss": 0.0216,
"step": 9680
},
{
"epoch": 0.8014225456951451,
"grad_norm": 0.03981781378388405,
"learning_rate": 1.040807650865226e-06,
"loss": 0.0207,
"step": 9690
},
{
"epoch": 0.802249607145811,
"grad_norm": 0.03532428294420242,
"learning_rate": 1.0324702737162717e-06,
"loss": 0.0207,
"step": 9700
},
{
"epoch": 0.8030766685964768,
"grad_norm": 0.059968218207359314,
"learning_rate": 1.0241625778072823e-06,
"loss": 0.0216,
"step": 9710
},
{
"epoch": 0.8039037300471426,
"grad_norm": 0.042843446135520935,
"learning_rate": 1.0158846252884464e-06,
"loss": 0.0196,
"step": 9720
},
{
"epoch": 0.8047307914978082,
"grad_norm": 0.04270855337381363,
"learning_rate": 1.007636478087437e-06,
"loss": 0.0247,
"step": 9730
},
{
"epoch": 0.805557852948474,
"grad_norm": 0.037198904901742935,
"learning_rate": 9.994181979089563e-07,
"loss": 0.0249,
"step": 9740
},
{
"epoch": 0.8063849143991398,
"grad_norm": 0.04327964037656784,
"learning_rate": 9.912298462342724e-07,
"loss": 0.0214,
"step": 9750
},
{
"epoch": 0.8072119758498056,
"grad_norm": 0.0518951341509819,
"learning_rate": 9.8307148432075e-07,
"loss": 0.0313,
"step": 9760
},
{
"epoch": 0.8080390373004714,
"grad_norm": 0.04297772794961929,
"learning_rate": 9.749431732014047e-07,
"loss": 0.0201,
"step": 9770
},
{
"epoch": 0.8088660987511372,
"grad_norm": 0.04605748876929283,
"learning_rate": 9.668449736844392e-07,
"loss": 0.0229,
"step": 9780
},
{
"epoch": 0.809693160201803,
"grad_norm": 0.09627640247344971,
"learning_rate": 9.587769463527908e-07,
"loss": 0.0231,
"step": 9790
},
{
"epoch": 0.8105202216524687,
"grad_norm": 0.05746271833777428,
"learning_rate": 9.507391515636783e-07,
"loss": 0.0201,
"step": 9800
},
{
"epoch": 0.8113472831031345,
"grad_norm": 0.038759179413318634,
"learning_rate": 9.427316494481447e-07,
"loss": 0.0201,
"step": 9810
},
{
"epoch": 0.8121743445538003,
"grad_norm": 0.04053572565317154,
"learning_rate": 9.347544999106195e-07,
"loss": 0.0213,
"step": 9820
},
{
"epoch": 0.8130014060044661,
"grad_norm": 0.03577113896608353,
"learning_rate": 9.26807762628461e-07,
"loss": 0.0212,
"step": 9830
},
{
"epoch": 0.8138284674551319,
"grad_norm": 0.04762836545705795,
"learning_rate": 9.188914970515089e-07,
"loss": 0.0229,
"step": 9840
},
{
"epoch": 0.8146555289057977,
"grad_norm": 0.03575866296887398,
"learning_rate": 9.110057624016461e-07,
"loss": 0.0213,
"step": 9850
},
{
"epoch": 0.8154825903564635,
"grad_norm": 0.04291502758860588,
"learning_rate": 9.03150617672352e-07,
"loss": 0.0211,
"step": 9860
},
{
"epoch": 0.8163096518071292,
"grad_norm": 0.05030713975429535,
"learning_rate": 8.953261216282616e-07,
"loss": 0.0195,
"step": 9870
},
{
"epoch": 0.817136713257795,
"grad_norm": 0.03975163400173187,
"learning_rate": 8.875323328047258e-07,
"loss": 0.0199,
"step": 9880
},
{
"epoch": 0.8179637747084608,
"grad_norm": 0.049601927399635315,
"learning_rate": 8.797693095073733e-07,
"loss": 0.0213,
"step": 9890
},
{
"epoch": 0.8187908361591266,
"grad_norm": 0.04565184563398361,
"learning_rate": 8.72037109811677e-07,
"loss": 0.0213,
"step": 9900
},
{
"epoch": 0.8196178976097924,
"grad_norm": 0.04753655195236206,
"learning_rate": 8.643357915625122e-07,
"loss": 0.0217,
"step": 9910
},
{
"epoch": 0.8204449590604582,
"grad_norm": 0.0501379668712616,
"learning_rate": 8.566654123737322e-07,
"loss": 0.0215,
"step": 9920
},
{
"epoch": 0.821272020511124,
"grad_norm": 0.037206389009952545,
"learning_rate": 8.490260296277375e-07,
"loss": 0.0284,
"step": 9930
},
{
"epoch": 0.8220990819617897,
"grad_norm": 0.040716107934713364,
"learning_rate": 8.414177004750357e-07,
"loss": 0.0219,
"step": 9940
},
{
"epoch": 0.8229261434124555,
"grad_norm": 0.05191744118928909,
"learning_rate": 8.338404818338264e-07,
"loss": 0.0219,
"step": 9950
},
{
"epoch": 0.8237532048631213,
"grad_norm": 0.04306924715638161,
"learning_rate": 8.262944303895687e-07,
"loss": 0.0199,
"step": 9960
},
{
"epoch": 0.8245802663137871,
"grad_norm": 0.050016891211271286,
"learning_rate": 8.187796025945588e-07,
"loss": 0.0207,
"step": 9970
},
{
"epoch": 0.8254073277644529,
"grad_norm": 0.036976251751184464,
"learning_rate": 8.112960546675091e-07,
"loss": 0.021,
"step": 9980
},
{
"epoch": 0.8262343892151187,
"grad_norm": 0.03663003817200661,
"learning_rate": 8.038438425931216e-07,
"loss": 0.0204,
"step": 9990
},
{
"epoch": 0.8270614506657845,
"grad_norm": 0.0474594421684742,
"learning_rate": 7.964230221216806e-07,
"loss": 0.0205,
"step": 10000
},
{
"epoch": 0.8270614506657845,
"eval_loss": 0.021883510053157806,
"eval_runtime": 1220.4916,
"eval_samples_per_second": 4.915,
"eval_steps_per_second": 0.307,
"step": 10000
},
{
"epoch": 0.8278885121164502,
"grad_norm": 0.03651763126254082,
"learning_rate": 7.890336487686218e-07,
"loss": 0.0205,
"step": 10010
},
{
"epoch": 0.828715573567116,
"grad_norm": 0.038936734199523926,
"learning_rate": 7.816757778141281e-07,
"loss": 0.0224,
"step": 10020
},
{
"epoch": 0.8295426350177818,
"grad_norm": 0.05199515074491501,
"learning_rate": 7.743494643027094e-07,
"loss": 0.021,
"step": 10030
},
{
"epoch": 0.8303696964684476,
"grad_norm": 0.03698007017374039,
"learning_rate": 7.670547630427954e-07,
"loss": 0.0202,
"step": 10040
},
{
"epoch": 0.8311967579191134,
"grad_norm": 0.043272070586681366,
"learning_rate": 7.597917286063233e-07,
"loss": 0.021,
"step": 10050
},
{
"epoch": 0.8320238193697792,
"grad_norm": 0.04484783858060837,
"learning_rate": 7.525604153283239e-07,
"loss": 0.0211,
"step": 10060
},
{
"epoch": 0.832850880820445,
"grad_norm": 0.03705143555998802,
"learning_rate": 7.453608773065296e-07,
"loss": 0.0203,
"step": 10070
},
{
"epoch": 0.8336779422711107,
"grad_norm": 0.039405085146427155,
"learning_rate": 7.381931684009569e-07,
"loss": 0.0212,
"step": 10080
},
{
"epoch": 0.8345050037217765,
"grad_norm": 0.044893745332956314,
"learning_rate": 7.310573422335044e-07,
"loss": 0.0203,
"step": 10090
},
{
"epoch": 0.8353320651724423,
"grad_norm": 0.03983564302325249,
"learning_rate": 7.23953452187559e-07,
"loss": 0.0216,
"step": 10100
},
{
"epoch": 0.8361591266231081,
"grad_norm": 0.044955916702747345,
"learning_rate": 7.16881551407591e-07,
"loss": 0.0331,
"step": 10110
},
{
"epoch": 0.8369861880737739,
"grad_norm": 0.035963404923677444,
"learning_rate": 7.098416927987578e-07,
"loss": 0.0198,
"step": 10120
},
{
"epoch": 0.8378132495244397,
"grad_norm": 0.052064284682273865,
"learning_rate": 7.028339290265068e-07,
"loss": 0.0219,
"step": 10130
},
{
"epoch": 0.8386403109751055,
"grad_norm": 0.04226592183113098,
"learning_rate": 6.958583125161855e-07,
"loss": 0.0208,
"step": 10140
},
{
"epoch": 0.8394673724257712,
"grad_norm": 0.040528856217861176,
"learning_rate": 6.889148954526448e-07,
"loss": 0.0201,
"step": 10150
},
{
"epoch": 0.840294433876437,
"grad_norm": 0.04366208612918854,
"learning_rate": 6.820037297798476e-07,
"loss": 0.0217,
"step": 10160
},
{
"epoch": 0.8411214953271028,
"grad_norm": 0.03569814935326576,
"learning_rate": 6.75124867200489e-07,
"loss": 0.0203,
"step": 10170
},
{
"epoch": 0.8419485567777686,
"grad_norm": 0.03940269351005554,
"learning_rate": 6.682783591755998e-07,
"loss": 0.0277,
"step": 10180
},
{
"epoch": 0.8427756182284344,
"grad_norm": 0.04888477176427841,
"learning_rate": 6.614642569241642e-07,
"loss": 0.0201,
"step": 10190
},
{
"epoch": 0.8436026796791002,
"grad_norm": 0.037041421979665756,
"learning_rate": 6.546826114227378e-07,
"loss": 0.0215,
"step": 10200
},
{
"epoch": 0.844429741129766,
"grad_norm": 0.054255735129117966,
"learning_rate": 6.479334734050713e-07,
"loss": 0.0204,
"step": 10210
},
{
"epoch": 0.8452568025804317,
"grad_norm": 0.04159407690167427,
"learning_rate": 6.41216893361718e-07,
"loss": 0.0199,
"step": 10220
},
{
"epoch": 0.8460838640310975,
"grad_norm": 0.042949602007865906,
"learning_rate": 6.345329215396678e-07,
"loss": 0.0217,
"step": 10230
},
{
"epoch": 0.8469109254817633,
"grad_norm": 0.04219160974025726,
"learning_rate": 6.278816079419675e-07,
"loss": 0.0214,
"step": 10240
},
{
"epoch": 0.8477379869324291,
"grad_norm": 0.0364801287651062,
"learning_rate": 6.212630023273452e-07,
"loss": 0.0224,
"step": 10250
},
{
"epoch": 0.8485650483830949,
"grad_norm": 0.03598921000957489,
"learning_rate": 6.146771542098418e-07,
"loss": 0.0203,
"step": 10260
},
{
"epoch": 0.8493921098337607,
"grad_norm": 0.04587122052907944,
"learning_rate": 6.08124112858432e-07,
"loss": 0.0203,
"step": 10270
},
{
"epoch": 0.8502191712844265,
"grad_norm": 0.05026087537407875,
"learning_rate": 6.0160392729667e-07,
"loss": 0.0212,
"step": 10280
},
{
"epoch": 0.8510462327350922,
"grad_norm": 0.03648602217435837,
"learning_rate": 5.951166463023089e-07,
"loss": 0.0209,
"step": 10290
},
{
"epoch": 0.851873294185758,
"grad_norm": 0.03705860301852226,
"learning_rate": 5.886623184069434e-07,
"loss": 0.0206,
"step": 10300
},
{
"epoch": 0.8527003556364238,
"grad_norm": 0.03770057111978531,
"learning_rate": 5.822409918956445e-07,
"loss": 0.0207,
"step": 10310
},
{
"epoch": 0.8535274170870896,
"grad_norm": 0.038956765085458755,
"learning_rate": 5.758527148065989e-07,
"loss": 0.0248,
"step": 10320
},
{
"epoch": 0.8543544785377554,
"grad_norm": 0.04223814234137535,
"learning_rate": 5.694975349307503e-07,
"loss": 0.0211,
"step": 10330
},
{
"epoch": 0.8551815399884212,
"grad_norm": 0.04067877680063248,
"learning_rate": 5.631754998114369e-07,
"loss": 0.021,
"step": 10340
},
{
"epoch": 0.856008601439087,
"grad_norm": 0.03736858442425728,
"learning_rate": 5.568866567440451e-07,
"loss": 0.0209,
"step": 10350
},
{
"epoch": 0.8568356628897527,
"grad_norm": 0.03979681432247162,
"learning_rate": 5.506310527756481e-07,
"loss": 0.0206,
"step": 10360
},
{
"epoch": 0.8576627243404185,
"grad_norm": 0.04270453378558159,
"learning_rate": 5.444087347046534e-07,
"loss": 0.0222,
"step": 10370
},
{
"epoch": 0.8584897857910843,
"grad_norm": 0.03582329303026199,
"learning_rate": 5.382197490804597e-07,
"loss": 0.0193,
"step": 10380
},
{
"epoch": 0.8593168472417501,
"grad_norm": 0.040274590253829956,
"learning_rate": 5.32064142203102e-07,
"loss": 0.0215,
"step": 10390
},
{
"epoch": 0.8601439086924159,
"grad_norm": 0.035478100180625916,
"learning_rate": 5.259419601229076e-07,
"loss": 0.0193,
"step": 10400
},
{
"epoch": 0.8609709701430817,
"grad_norm": 0.05305038392543793,
"learning_rate": 5.198532486401536e-07,
"loss": 0.0208,
"step": 10410
},
{
"epoch": 0.8617980315937475,
"grad_norm": 0.03564458340406418,
"learning_rate": 5.137980533047204e-07,
"loss": 0.0208,
"step": 10420
},
{
"epoch": 0.8626250930444131,
"grad_norm": 0.03643946349620819,
"learning_rate": 5.077764194157536e-07,
"loss": 0.0201,
"step": 10430
},
{
"epoch": 0.863452154495079,
"grad_norm": 0.03864193707704544,
"learning_rate": 5.017883920213229e-07,
"loss": 0.0208,
"step": 10440
},
{
"epoch": 0.8642792159457447,
"grad_norm": 0.04269906133413315,
"learning_rate": 4.95834015918088e-07,
"loss": 0.0205,
"step": 10450
},
{
"epoch": 0.8651062773964106,
"grad_norm": 0.04071857035160065,
"learning_rate": 4.899133356509639e-07,
"loss": 0.0218,
"step": 10460
},
{
"epoch": 0.8659333388470764,
"grad_norm": 0.05047852545976639,
"learning_rate": 4.840263955127811e-07,
"loss": 0.02,
"step": 10470
},
{
"epoch": 0.8667604002977422,
"grad_norm": 0.03635663166642189,
"learning_rate": 4.78173239543962e-07,
"loss": 0.0206,
"step": 10480
},
{
"epoch": 0.867587461748408,
"grad_norm": 0.05464612692594528,
"learning_rate": 4.72353911532189e-07,
"loss": 0.0201,
"step": 10490
},
{
"epoch": 0.8684145231990736,
"grad_norm": 0.04334511607885361,
"learning_rate": 4.665684550120736e-07,
"loss": 0.0213,
"step": 10500
},
{
"epoch": 0.8692415846497394,
"grad_norm": 0.036809250712394714,
"learning_rate": 4.608169132648371e-07,
"loss": 0.0205,
"step": 10510
},
{
"epoch": 0.8700686461004052,
"grad_norm": 0.04075481742620468,
"learning_rate": 4.5509932931797727e-07,
"loss": 0.0202,
"step": 10520
},
{
"epoch": 0.870895707551071,
"grad_norm": 0.041940901428461075,
"learning_rate": 4.4941574594495994e-07,
"loss": 0.0201,
"step": 10530
},
{
"epoch": 0.8717227690017368,
"grad_norm": 0.040375709533691406,
"learning_rate": 4.437662056648845e-07,
"loss": 0.0219,
"step": 10540
},
{
"epoch": 0.8725498304524026,
"grad_norm": 0.0415097214281559,
"learning_rate": 4.3815075074217615e-07,
"loss": 0.0204,
"step": 10550
},
{
"epoch": 0.8733768919030684,
"grad_norm": 0.0475936122238636,
"learning_rate": 4.325694231862665e-07,
"loss": 0.0217,
"step": 10560
},
{
"epoch": 0.8742039533537341,
"grad_norm": 0.04286682605743408,
"learning_rate": 4.2702226475127675e-07,
"loss": 0.0214,
"step": 10570
},
{
"epoch": 0.8750310148043999,
"grad_norm": 0.043143562972545624,
"learning_rate": 4.2150931693570986e-07,
"loss": 0.0209,
"step": 10580
},
{
"epoch": 0.8758580762550657,
"grad_norm": 0.03890874236822128,
"learning_rate": 4.1603062098213685e-07,
"loss": 0.0207,
"step": 10590
},
{
"epoch": 0.8766851377057315,
"grad_norm": 0.03903120383620262,
"learning_rate": 4.1058621787688934e-07,
"loss": 0.0378,
"step": 10600
},
{
"epoch": 0.8775121991563973,
"grad_norm": 0.03754309564828873,
"learning_rate": 4.051761483497541e-07,
"loss": 0.036,
"step": 10610
},
{
"epoch": 0.8783392606070631,
"grad_norm": 0.04438405483961105,
"learning_rate": 3.998004528736632e-07,
"loss": 0.0213,
"step": 10620
},
{
"epoch": 0.8791663220577289,
"grad_norm": 0.037207264453172684,
"learning_rate": 3.9445917166439915e-07,
"loss": 0.0198,
"step": 10630
},
{
"epoch": 0.8799933835083946,
"grad_norm": 0.04333435744047165,
"learning_rate": 3.8915234468029027e-07,
"loss": 0.0202,
"step": 10640
},
{
"epoch": 0.8808204449590604,
"grad_norm": 0.034282222390174866,
"learning_rate": 3.838800116219082e-07,
"loss": 0.0205,
"step": 10650
},
{
"epoch": 0.8816475064097262,
"grad_norm": 0.04391239210963249,
"learning_rate": 3.786422119317762e-07,
"loss": 0.0197,
"step": 10660
},
{
"epoch": 0.882474567860392,
"grad_norm": 0.037362392991781235,
"learning_rate": 3.7343898479407227e-07,
"loss": 0.0204,
"step": 10670
},
{
"epoch": 0.8833016293110578,
"grad_norm": 0.03807242214679718,
"learning_rate": 3.682703691343353e-07,
"loss": 0.0209,
"step": 10680
},
{
"epoch": 0.8841286907617236,
"grad_norm": 0.0376625694334507,
"learning_rate": 3.6313640361917535e-07,
"loss": 0.0203,
"step": 10690
},
{
"epoch": 0.8849557522123894,
"grad_norm": 0.039894696325063705,
"learning_rate": 3.580371266559801e-07,
"loss": 0.0203,
"step": 10700
},
{
"epoch": 0.8857828136630551,
"grad_norm": 0.03698920086026192,
"learning_rate": 3.529725763926367e-07,
"loss": 0.025,
"step": 10710
},
{
"epoch": 0.8866098751137209,
"grad_norm": 0.04414455220103264,
"learning_rate": 3.4794279071723503e-07,
"loss": 0.0212,
"step": 10720
},
{
"epoch": 0.8874369365643867,
"grad_norm": 0.041200559586286545,
"learning_rate": 3.4294780725779296e-07,
"loss": 0.022,
"step": 10730
},
{
"epoch": 0.8882639980150525,
"grad_norm": 0.03711444512009621,
"learning_rate": 3.379876633819701e-07,
"loss": 0.0214,
"step": 10740
},
{
"epoch": 0.8890910594657183,
"grad_norm": 0.046689391136169434,
"learning_rate": 3.3306239619679106e-07,
"loss": 0.0207,
"step": 10750
},
{
"epoch": 0.8899181209163841,
"grad_norm": 0.04057691618800163,
"learning_rate": 3.281720425483653e-07,
"loss": 0.0206,
"step": 10760
},
{
"epoch": 0.8907451823670499,
"grad_norm": 0.04174448922276497,
"learning_rate": 3.2331663902161416e-07,
"loss": 0.0212,
"step": 10770
},
{
"epoch": 0.8915722438177156,
"grad_norm": 0.03700239583849907,
"learning_rate": 3.184962219399945e-07,
"loss": 0.0206,
"step": 10780
},
{
"epoch": 0.8923993052683814,
"grad_norm": 0.03949680179357529,
"learning_rate": 3.137108273652301e-07,
"loss": 0.0235,
"step": 10790
},
{
"epoch": 0.8932263667190472,
"grad_norm": 0.03353721275925636,
"learning_rate": 3.0896049109703616e-07,
"loss": 0.0199,
"step": 10800
},
{
"epoch": 0.894053428169713,
"grad_norm": 0.040740326046943665,
"learning_rate": 3.0424524867286085e-07,
"loss": 0.02,
"step": 10810
},
{
"epoch": 0.8948804896203788,
"grad_norm": 0.04401690140366554,
"learning_rate": 2.9956513536760934e-07,
"loss": 0.0307,
"step": 10820
},
{
"epoch": 0.8957075510710446,
"grad_norm": 0.033138833940029144,
"learning_rate": 2.9492018619338703e-07,
"loss": 0.0206,
"step": 10830
},
{
"epoch": 0.8965346125217104,
"grad_norm": 0.03981183469295502,
"learning_rate": 2.9031043589923426e-07,
"loss": 0.0211,
"step": 10840
},
{
"epoch": 0.8973616739723761,
"grad_norm": 0.036721404641866684,
"learning_rate": 2.857359189708669e-07,
"loss": 0.0215,
"step": 10850
},
{
"epoch": 0.8981887354230419,
"grad_norm": 0.049144960939884186,
"learning_rate": 2.8119666963042025e-07,
"loss": 0.0218,
"step": 10860
},
{
"epoch": 0.8990157968737077,
"grad_norm": 0.0382852703332901,
"learning_rate": 2.766927218361887e-07,
"loss": 0.0209,
"step": 10870
},
{
"epoch": 0.8998428583243735,
"grad_norm": 0.04550480842590332,
"learning_rate": 2.722241092823774e-07,
"loss": 0.0208,
"step": 10880
},
{
"epoch": 0.9006699197750393,
"grad_norm": 0.04339880868792534,
"learning_rate": 2.677908653988465e-07,
"loss": 0.0211,
"step": 10890
},
{
"epoch": 0.9014969812257051,
"grad_norm": 0.036550212651491165,
"learning_rate": 2.6339302335085914e-07,
"loss": 0.0197,
"step": 10900
},
{
"epoch": 0.9023240426763709,
"grad_norm": 0.034907784312963486,
"learning_rate": 2.5903061603883897e-07,
"loss": 0.0207,
"step": 10910
},
{
"epoch": 0.9031511041270366,
"grad_norm": 0.043843794614076614,
"learning_rate": 2.5470367609812084e-07,
"loss": 0.0207,
"step": 10920
},
{
"epoch": 0.9039781655777024,
"grad_norm": 0.03882720693945885,
"learning_rate": 2.504122358987049e-07,
"loss": 0.0206,
"step": 10930
},
{
"epoch": 0.9048052270283682,
"grad_norm": 0.04570434242486954,
"learning_rate": 2.461563275450185e-07,
"loss": 0.0203,
"step": 10940
},
{
"epoch": 0.905632288479034,
"grad_norm": 0.03797876834869385,
"learning_rate": 2.4193598287567287e-07,
"loss": 0.0203,
"step": 10950
},
{
"epoch": 0.9064593499296998,
"grad_norm": 0.04128405451774597,
"learning_rate": 2.3775123346322593e-07,
"loss": 0.0213,
"step": 10960
},
{
"epoch": 0.9072864113803656,
"grad_norm": 0.036211997270584106,
"learning_rate": 2.3360211061394743e-07,
"loss": 0.0209,
"step": 10970
},
{
"epoch": 0.9081134728310314,
"grad_norm": 0.0425226129591465,
"learning_rate": 2.2948864536757985e-07,
"loss": 0.0208,
"step": 10980
},
{
"epoch": 0.9089405342816971,
"grad_norm": 0.042280830442905426,
"learning_rate": 2.2541086849711514e-07,
"loss": 0.021,
"step": 10990
},
{
"epoch": 0.9097675957323629,
"grad_norm": 0.03988664597272873,
"learning_rate": 2.213688105085543e-07,
"loss": 0.0214,
"step": 11000
},
{
"epoch": 0.9097675957323629,
"eval_loss": 0.021760277450084686,
"eval_runtime": 1221.1845,
"eval_samples_per_second": 4.912,
"eval_steps_per_second": 0.307,
"step": 11000
},
{
"epoch": 0.9105946571830287,
"grad_norm": 0.038166627287864685,
"learning_rate": 2.1736250164068662e-07,
"loss": 0.0211,
"step": 11010
},
{
"epoch": 0.9114217186336945,
"grad_norm": 0.03990177437663078,
"learning_rate": 2.1339197186486027e-07,
"loss": 0.0199,
"step": 11020
},
{
"epoch": 0.9122487800843603,
"grad_norm": 0.03698251396417618,
"learning_rate": 2.0945725088475921e-07,
"loss": 0.0213,
"step": 11030
},
{
"epoch": 0.9130758415350261,
"grad_norm": 0.03812938556075096,
"learning_rate": 2.0555836813618003e-07,
"loss": 0.0214,
"step": 11040
},
{
"epoch": 0.9139029029856919,
"grad_norm": 0.03743589296936989,
"learning_rate": 2.0169535278680984e-07,
"loss": 0.0204,
"step": 11050
},
{
"epoch": 0.9147299644363576,
"grad_norm": 0.040262360125780106,
"learning_rate": 1.978682337360155e-07,
"loss": 0.0205,
"step": 11060
},
{
"epoch": 0.9155570258870234,
"grad_norm": 0.03923022374510765,
"learning_rate": 1.940770396146191e-07,
"loss": 0.0189,
"step": 11070
},
{
"epoch": 0.9163840873376892,
"grad_norm": 0.038444485515356064,
"learning_rate": 1.903217987846856e-07,
"loss": 0.0219,
"step": 11080
},
{
"epoch": 0.917211148788355,
"grad_norm": 0.04547708109021187,
"learning_rate": 1.866025393393145e-07,
"loss": 0.0206,
"step": 11090
},
{
"epoch": 0.9180382102390208,
"grad_norm": 0.03776419907808304,
"learning_rate": 1.8291928910242618e-07,
"loss": 0.0194,
"step": 11100
},
{
"epoch": 0.9188652716896866,
"grad_norm": 0.037485282868146896,
"learning_rate": 1.792720756285554e-07,
"loss": 0.0206,
"step": 11110
},
{
"epoch": 0.9196923331403524,
"grad_norm": 0.0491410493850708,
"learning_rate": 1.7566092620264374e-07,
"loss": 0.0208,
"step": 11120
},
{
"epoch": 0.9205193945910181,
"grad_norm": 0.04069705307483673,
"learning_rate": 1.720858678398374e-07,
"loss": 0.0211,
"step": 11130
},
{
"epoch": 0.9213464560416839,
"grad_norm": 0.04729039594531059,
"learning_rate": 1.6854692728528298e-07,
"loss": 0.0211,
"step": 11140
},
{
"epoch": 0.9221735174923497,
"grad_norm": 0.041814010590314865,
"learning_rate": 1.650441310139278e-07,
"loss": 0.0201,
"step": 11150
},
{
"epoch": 0.9230005789430155,
"grad_norm": 0.04224241524934769,
"learning_rate": 1.615775052303231e-07,
"loss": 0.0205,
"step": 11160
},
{
"epoch": 0.9238276403936813,
"grad_norm": 0.13403134047985077,
"learning_rate": 1.5814707586842948e-07,
"loss": 0.021,
"step": 11170
},
{
"epoch": 0.9246547018443471,
"grad_norm": 0.03628065064549446,
"learning_rate": 1.5475286859141736e-07,
"loss": 0.0208,
"step": 11180
},
{
"epoch": 0.9254817632950129,
"grad_norm": 0.03844155743718147,
"learning_rate": 1.5139490879147955e-07,
"loss": 0.0206,
"step": 11190
},
{
"epoch": 0.9263088247456785,
"grad_norm": 0.03782174736261368,
"learning_rate": 1.4807322158964021e-07,
"loss": 0.0218,
"step": 11200
},
{
"epoch": 0.9271358861963443,
"grad_norm": 0.04245174303650856,
"learning_rate": 1.4478783183556834e-07,
"loss": 0.0204,
"step": 11210
},
{
"epoch": 0.9279629476470102,
"grad_norm": 0.038376543670892715,
"learning_rate": 1.4153876410738787e-07,
"loss": 0.0209,
"step": 11220
},
{
"epoch": 0.928790009097676,
"grad_norm": 0.03631012141704559,
"learning_rate": 1.3832604271149742e-07,
"loss": 0.0202,
"step": 11230
},
{
"epoch": 0.9296170705483418,
"grad_norm": 0.03546414151787758,
"learning_rate": 1.35149691682388e-07,
"loss": 0.0211,
"step": 11240
},
{
"epoch": 0.9304441319990076,
"grad_norm": 0.03860907629132271,
"learning_rate": 1.320097347824606e-07,
"loss": 0.0203,
"step": 11250
},
{
"epoch": 0.9312711934496734,
"grad_norm": 0.04019659012556076,
"learning_rate": 1.2890619550185225e-07,
"loss": 0.0224,
"step": 11260
},
{
"epoch": 0.932098254900339,
"grad_norm": 0.03943018242716789,
"learning_rate": 1.2583909705825792e-07,
"loss": 0.0199,
"step": 11270
},
{
"epoch": 0.9329253163510048,
"grad_norm": 0.044663459062576294,
"learning_rate": 1.228084623967568e-07,
"loss": 0.0218,
"step": 11280
},
{
"epoch": 0.9337523778016706,
"grad_norm": 0.041751962155103683,
"learning_rate": 1.1981431418964185e-07,
"loss": 0.0241,
"step": 11290
},
{
"epoch": 0.9345794392523364,
"grad_norm": 0.03652814030647278,
"learning_rate": 1.1685667483624763e-07,
"loss": 0.0202,
"step": 11300
},
{
"epoch": 0.9354065007030022,
"grad_norm": 0.040455400943756104,
"learning_rate": 1.139355664627878e-07,
"loss": 0.022,
"step": 11310
},
{
"epoch": 0.936233562153668,
"grad_norm": 0.03818695247173309,
"learning_rate": 1.1105101092218462e-07,
"loss": 0.0207,
"step": 11320
},
{
"epoch": 0.9370606236043338,
"grad_norm": 0.04204050451517105,
"learning_rate": 1.0820302979390574e-07,
"loss": 0.0213,
"step": 11330
},
{
"epoch": 0.9378876850549995,
"grad_norm": 0.04151742160320282,
"learning_rate": 1.0539164438380655e-07,
"loss": 0.0204,
"step": 11340
},
{
"epoch": 0.9387147465056653,
"grad_norm": 0.038962677121162415,
"learning_rate": 1.0261687572396762e-07,
"loss": 0.0209,
"step": 11350
},
{
"epoch": 0.9395418079563311,
"grad_norm": 0.045446451753377914,
"learning_rate": 9.987874457253799e-08,
"loss": 0.0207,
"step": 11360
},
{
"epoch": 0.9403688694069969,
"grad_norm": 0.038088973611593246,
"learning_rate": 9.717727141358046e-08,
"loss": 0.0214,
"step": 11370
},
{
"epoch": 0.9411959308576627,
"grad_norm": 0.04547916352748871,
"learning_rate": 9.45124764569183e-08,
"loss": 0.0245,
"step": 11380
},
{
"epoch": 0.9420229923083285,
"grad_norm": 0.038108475506305695,
"learning_rate": 9.188437963798314e-08,
"loss": 0.0262,
"step": 11390
},
{
"epoch": 0.9428500537589943,
"grad_norm": 0.041740551590919495,
"learning_rate": 8.929300061766677e-08,
"loss": 0.0208,
"step": 11400
},
{
"epoch": 0.94367711520966,
"grad_norm": 0.04090382158756256,
"learning_rate": 8.673835878217351e-08,
"loss": 0.0219,
"step": 11410
},
{
"epoch": 0.9445041766603258,
"grad_norm": 0.03528539836406708,
"learning_rate": 8.42204732428764e-08,
"loss": 0.02,
"step": 11420
},
{
"epoch": 0.9453312381109916,
"grad_norm": 0.03898858278989792,
"learning_rate": 8.173936283617068e-08,
"loss": 0.0198,
"step": 11430
},
{
"epoch": 0.9461582995616574,
"grad_norm": 0.03353780135512352,
"learning_rate": 7.929504612333827e-08,
"loss": 0.0204,
"step": 11440
},
{
"epoch": 0.9469853610123232,
"grad_norm": 0.03581018000841141,
"learning_rate": 7.688754139040522e-08,
"loss": 0.02,
"step": 11450
},
{
"epoch": 0.947812422462989,
"grad_norm": 0.0409320667386055,
"learning_rate": 7.451686664800505e-08,
"loss": 0.0201,
"step": 11460
},
{
"epoch": 0.9486394839136548,
"grad_norm": 0.04594825208187103,
"learning_rate": 7.218303963124507e-08,
"loss": 0.0206,
"step": 11470
},
{
"epoch": 0.9494665453643206,
"grad_norm": 0.036496005952358246,
"learning_rate": 6.988607779957357e-08,
"loss": 0.0216,
"step": 11480
},
{
"epoch": 0.9502936068149863,
"grad_norm": 0.043786004185676575,
"learning_rate": 6.762599833664896e-08,
"loss": 0.0225,
"step": 11490
},
{
"epoch": 0.9511206682656521,
"grad_norm": 0.04733569920063019,
"learning_rate": 6.540281815021198e-08,
"loss": 0.0202,
"step": 11500
},
{
"epoch": 0.9519477297163179,
"grad_norm": 0.033522963523864746,
"learning_rate": 6.321655387195591e-08,
"loss": 0.0211,
"step": 11510
},
{
"epoch": 0.9527747911669837,
"grad_norm": 0.04227704182267189,
"learning_rate": 6.106722185740821e-08,
"loss": 0.02,
"step": 11520
},
{
"epoch": 0.9536018526176495,
"grad_norm": 0.04079505801200867,
"learning_rate": 5.8954838185801834e-08,
"loss": 0.0205,
"step": 11530
},
{
"epoch": 0.9544289140683153,
"grad_norm": 0.0362294465303421,
"learning_rate": 5.6879418659959716e-08,
"loss": 0.0215,
"step": 11540
},
{
"epoch": 0.9552559755189811,
"grad_norm": 0.039718855172395706,
"learning_rate": 5.4840978806173786e-08,
"loss": 0.0199,
"step": 11550
},
{
"epoch": 0.9560830369696468,
"grad_norm": 0.038249652832746506,
"learning_rate": 5.283953387408891e-08,
"loss": 0.02,
"step": 11560
},
{
"epoch": 0.9569100984203126,
"grad_norm": 0.04237562417984009,
"learning_rate": 5.087509883659136e-08,
"loss": 0.0206,
"step": 11570
},
{
"epoch": 0.9577371598709784,
"grad_norm": 0.042977023869752884,
"learning_rate": 4.8947688389693325e-08,
"loss": 0.0213,
"step": 11580
},
{
"epoch": 0.9585642213216442,
"grad_norm": 0.03681021183729172,
"learning_rate": 4.705731695242521e-08,
"loss": 0.0197,
"step": 11590
},
{
"epoch": 0.95939128277231,
"grad_norm": 0.05688457190990448,
"learning_rate": 4.520399866672798e-08,
"loss": 0.0203,
"step": 11600
},
{
"epoch": 0.9602183442229758,
"grad_norm": 0.05174125358462334,
"learning_rate": 4.338774739734541e-08,
"loss": 0.0204,
"step": 11610
},
{
"epoch": 0.9610454056736416,
"grad_norm": 0.041839614510536194,
"learning_rate": 4.160857673172147e-08,
"loss": 0.021,
"step": 11620
},
{
"epoch": 0.9618724671243073,
"grad_norm": 0.04174220189452171,
"learning_rate": 3.986649997989922e-08,
"loss": 0.0222,
"step": 11630
},
{
"epoch": 0.9626995285749731,
"grad_norm": 0.04051094874739647,
"learning_rate": 3.816153017442148e-08,
"loss": 0.0211,
"step": 11640
},
{
"epoch": 0.9635265900256389,
"grad_norm": 0.04232405498623848,
"learning_rate": 3.649368007023202e-08,
"loss": 0.0206,
"step": 11650
},
{
"epoch": 0.9643536514763047,
"grad_norm": 0.03878673538565636,
"learning_rate": 3.486296214457952e-08,
"loss": 0.0213,
"step": 11660
},
{
"epoch": 0.9651807129269705,
"grad_norm": 0.04240000247955322,
"learning_rate": 3.326938859692708e-08,
"loss": 0.0209,
"step": 11670
},
{
"epoch": 0.9660077743776363,
"grad_norm": 0.04394629970192909,
"learning_rate": 3.171297134885842e-08,
"loss": 0.0215,
"step": 11680
},
{
"epoch": 0.9668348358283021,
"grad_norm": 0.03907477483153343,
"learning_rate": 3.019372204399018e-08,
"loss": 0.0202,
"step": 11690
},
{
"epoch": 0.9676618972789678,
"grad_norm": 0.03773649409413338,
"learning_rate": 2.8711652047884176e-08,
"loss": 0.0203,
"step": 11700
},
{
"epoch": 0.9684889587296336,
"grad_norm": 0.041162021458148956,
"learning_rate": 2.7266772447961387e-08,
"loss": 0.0204,
"step": 11710
},
{
"epoch": 0.9693160201802994,
"grad_norm": 0.03880157694220543,
"learning_rate": 2.585909405342091e-08,
"loss": 0.021,
"step": 11720
},
{
"epoch": 0.9701430816309652,
"grad_norm": 0.04054776951670647,
"learning_rate": 2.4488627395157783e-08,
"loss": 0.0203,
"step": 11730
},
{
"epoch": 0.970970143081631,
"grad_norm": 0.03724834322929382,
"learning_rate": 2.315538272568585e-08,
"loss": 0.0206,
"step": 11740
},
{
"epoch": 0.9717972045322968,
"grad_norm": 0.04481399431824684,
"learning_rate": 2.1859370019058913e-08,
"loss": 0.0245,
"step": 11750
},
{
"epoch": 0.9726242659829626,
"grad_norm": 0.04241091012954712,
"learning_rate": 2.0600598970795804e-08,
"loss": 0.02,
"step": 11760
},
{
"epoch": 0.9734513274336283,
"grad_norm": 0.04424299672245979,
"learning_rate": 1.9379078997810995e-08,
"loss": 0.0245,
"step": 11770
},
{
"epoch": 0.9742783888842941,
"grad_norm": 0.038491178303956985,
"learning_rate": 1.8194819238341877e-08,
"loss": 0.0201,
"step": 11780
},
{
"epoch": 0.9751054503349599,
"grad_norm": 0.04305344447493553,
"learning_rate": 1.7047828551880475e-08,
"loss": 0.0204,
"step": 11790
},
{
"epoch": 0.9759325117856257,
"grad_norm": 0.03870062530040741,
"learning_rate": 1.59381155191074e-08,
"loss": 0.0204,
"step": 11800
},
{
"epoch": 0.9767595732362915,
"grad_norm": 0.040076758712530136,
"learning_rate": 1.4865688441828008e-08,
"loss": 0.0216,
"step": 11810
},
{
"epoch": 0.9775866346869573,
"grad_norm": 0.0319267176091671,
"learning_rate": 1.3830555342909113e-08,
"loss": 0.0229,
"step": 11820
},
{
"epoch": 0.9784136961376231,
"grad_norm": 0.03797907382249832,
"learning_rate": 1.283272396622126e-08,
"loss": 0.0194,
"step": 11830
},
{
"epoch": 0.9792407575882888,
"grad_norm": 0.03610234335064888,
"learning_rate": 1.1872201776578219e-08,
"loss": 0.0213,
"step": 11840
},
{
"epoch": 0.9800678190389546,
"grad_norm": 0.048893995583057404,
"learning_rate": 1.0948995959683683e-08,
"loss": 0.022,
"step": 11850
},
{
"epoch": 0.9808948804896204,
"grad_norm": 0.1131172701716423,
"learning_rate": 1.0063113422074667e-08,
"loss": 0.0202,
"step": 11860
},
{
"epoch": 0.9817219419402862,
"grad_norm": 0.038660723716020584,
"learning_rate": 9.21456079107208e-09,
"loss": 0.0205,
"step": 11870
},
{
"epoch": 0.982549003390952,
"grad_norm": 0.03787451982498169,
"learning_rate": 8.40334441473023e-09,
"loss": 0.0219,
"step": 11880
},
{
"epoch": 0.9833760648416178,
"grad_norm": 0.0439179353415966,
"learning_rate": 7.629470361789071e-09,
"loss": 0.0206,
"step": 11890
},
{
"epoch": 0.9842031262922836,
"grad_norm": 0.04450133442878723,
"learning_rate": 6.892944421630354e-09,
"loss": 0.0236,
"step": 11900
},
{
"epoch": 0.9850301877429493,
"grad_norm": 0.04056097939610481,
"learning_rate": 6.193772104232665e-09,
"loss": 0.0218,
"step": 11910
},
{
"epoch": 0.9858572491936151,
"grad_norm": 0.04353098198771477,
"learning_rate": 5.531958640129787e-09,
"loss": 0.0228,
"step": 11920
},
{
"epoch": 0.9866843106442809,
"grad_norm": 0.03741453215479851,
"learning_rate": 4.90750898037351e-09,
"loss": 0.0226,
"step": 11930
},
{
"epoch": 0.9875113720949467,
"grad_norm": 0.04343261569738388,
"learning_rate": 4.32042779649533e-09,
"loss": 0.0197,
"step": 11940
},
{
"epoch": 0.9883384335456125,
"grad_norm": 0.04530951753258705,
"learning_rate": 3.7707194804725846e-09,
"loss": 0.0196,
"step": 11950
},
{
"epoch": 0.9891654949962783,
"grad_norm": 0.05223441496491432,
"learning_rate": 3.2583881446929256e-09,
"loss": 0.0206,
"step": 11960
},
{
"epoch": 0.9899925564469441,
"grad_norm": 0.03598857298493385,
"learning_rate": 2.783437621926566e-09,
"loss": 0.0223,
"step": 11970
},
{
"epoch": 0.9908196178976098,
"grad_norm": 0.044676005840301514,
"learning_rate": 2.345871465296856e-09,
"loss": 0.0203,
"step": 11980
},
{
"epoch": 0.9916466793482756,
"grad_norm": 0.035769447684288025,
"learning_rate": 1.945692948253086e-09,
"loss": 0.0254,
"step": 11990
},
{
"epoch": 0.9924737407989414,
"grad_norm": 0.050190046429634094,
"learning_rate": 1.5829050645449484e-09,
"loss": 0.0241,
"step": 12000
},
{
"epoch": 0.9924737407989414,
"eval_loss": 0.021732060238718987,
"eval_runtime": 1221.3996,
"eval_samples_per_second": 4.912,
"eval_steps_per_second": 0.307,
"step": 12000
},
{
"epoch": 0.9933008022496072,
"grad_norm": 0.038614947348833084,
"learning_rate": 1.2575105282025545e-09,
"loss": 0.0251,
"step": 12010
},
{
"epoch": 0.994127863700273,
"grad_norm": 0.03808825463056564,
"learning_rate": 9.695117735147863e-10,
"loss": 0.0197,
"step": 12020
},
{
"epoch": 0.9949549251509388,
"grad_norm": 0.037998467683792114,
"learning_rate": 7.189109550115314e-10,
"loss": 0.0211,
"step": 12030
},
{
"epoch": 0.9957819866016046,
"grad_norm": 0.03407185524702072,
"learning_rate": 5.057099474470306e-10,
"loss": 0.0192,
"step": 12040
},
{
"epoch": 0.9966090480522702,
"grad_norm": 0.03972679004073143,
"learning_rate": 3.299103457854447e-10,
"loss": 0.0231,
"step": 12050
},
{
"epoch": 0.997436109502936,
"grad_norm": 0.0530787818133831,
"learning_rate": 1.9151346519086233e-10,
"loss": 0.0203,
"step": 12060
},
{
"epoch": 0.9982631709536018,
"grad_norm": 0.03939739987254143,
"learning_rate": 9.052034101508789e-11,
"loss": 0.021,
"step": 12070
},
{
"epoch": 0.9990902324042676,
"grad_norm": 0.039279334247112274,
"learning_rate": 2.693172879209005e-11,
"loss": 0.0211,
"step": 12080
},
{
"epoch": 0.9999172938549334,
"grad_norm": 0.04228970408439636,
"learning_rate": 7.481042302304175e-13,
"loss": 0.0213,
"step": 12090
},
{
"epoch": 1.0,
"step": 12091,
"total_flos": 2.46882716505537e+20,
"train_loss": 0.04353406456638118,
"train_runtime": 186963.3453,
"train_samples_per_second": 1.035,
"train_steps_per_second": 0.065
}
],
"logging_steps": 10,
"max_steps": 12091,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.46882716505537e+20,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}